From 7807028ccd0a36038aa7b996daacd79d4c9c49cb Mon Sep 17 00:00:00 2001 From: Tim Chase Date: Sun, 10 May 2015 18:34:05 -0500 Subject: [PATCH 01/11] Revert "arc_evict, arc_evict_ghost: reduce stack usage using kmem_zalloc" This reverts commit 16fcdea36340c658b4557fd34a74915fd618f7a6 in preparation for the illumos 5497 "lock contention on arcs_mtx" patch which eliminates "marker" within the ARC code. Signed-off-by: Tim Chase Signed-off-by: Brian Behlendorf --- module/zfs/arc.c | 32 ++++++++++++-------------------- 1 file changed, 12 insertions(+), 20 deletions(-) diff --git a/module/zfs/arc.c b/module/zfs/arc.c index 1699ea7e7e7a..9882295b9cf8 100644 --- a/module/zfs/arc.c +++ b/module/zfs/arc.c @@ -1825,15 +1825,13 @@ arc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle, kmutex_t *hash_lock; boolean_t have_lock; void *stolen = NULL; - arc_buf_hdr_t *marker; + arc_buf_hdr_t marker = {{{ 0 }}}; int count = 0; ASSERT(state == arc_mru || state == arc_mfu); evicted_state = (state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost; - marker = kmem_zalloc(sizeof (arc_buf_hdr_t), KM_SLEEP); - top: mutex_enter(&state->arcs_mtx); mutex_enter(&evicted_state->arcs_mtx); @@ -1868,14 +1866,14 @@ arc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle, * the hot code path, so don't sleep. 
*/ if (!recycle && count++ > arc_evict_iterations) { - list_insert_after(list, ab, marker); + list_insert_after(list, ab, &marker); mutex_exit(&evicted_state->arcs_mtx); mutex_exit(&state->arcs_mtx); kpreempt(KPREEMPT_SYNC); mutex_enter(&state->arcs_mtx); mutex_enter(&evicted_state->arcs_mtx); - ab_prev = list_prev(list, marker); - list_remove(list, marker); + ab_prev = list_prev(list, &marker); + list_remove(list, &marker); count = 0; continue; } @@ -1959,8 +1957,6 @@ arc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle, goto top; } - kmem_free(marker, sizeof (arc_buf_hdr_t)); - if (bytes_evicted < bytes) dprintf("only evicted %lld bytes from %x\n", (longlong_t)bytes_evicted, state->arcs_state); @@ -1990,7 +1986,7 @@ arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes, arc_buf_contents_t type) { arc_buf_hdr_t *ab, *ab_prev; - arc_buf_hdr_t *marker; + arc_buf_hdr_t marker; list_t *list = &state->arcs_list[type]; kmutex_t *hash_lock; uint64_t bytes_deleted = 0; @@ -1998,9 +1994,7 @@ arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes, int count = 0; ASSERT(GHOST_STATE(state)); - - marker = kmem_zalloc(sizeof (arc_buf_hdr_t), KM_SLEEP); - + bzero(&marker, sizeof (marker)); top: mutex_enter(&state->arcs_mtx); for (ab = list_tail(list); ab; ab = ab_prev) { @@ -2026,12 +2020,12 @@ arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes, * before reacquiring the lock. */ if (count++ > arc_evict_iterations) { - list_insert_after(list, ab, marker); + list_insert_after(list, ab, &marker); mutex_exit(&state->arcs_mtx); kpreempt(KPREEMPT_SYNC); mutex_enter(&state->arcs_mtx); - ab_prev = list_prev(list, marker); - list_remove(list, marker); + ab_prev = list_prev(list, &marker); + list_remove(list, &marker); count = 0; continue; } @@ -2063,13 +2057,13 @@ arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes, * hash lock to become available. Once its * available, restart from where we left off. 
*/ - list_insert_after(list, ab, marker); + list_insert_after(list, ab, &marker); mutex_exit(&state->arcs_mtx); mutex_enter(hash_lock); mutex_exit(hash_lock); mutex_enter(&state->arcs_mtx); - ab_prev = list_prev(list, marker); - list_remove(list, marker); + ab_prev = list_prev(list, &marker); + list_remove(list, &marker); } else { bufs_skipped += 1; } @@ -2082,8 +2076,6 @@ arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes, goto top; } - kmem_free(marker, sizeof (arc_buf_hdr_t)); - if (bufs_skipped) { ARCSTAT_INCR(arcstat_mutex_miss, bufs_skipped); ASSERT(bytes >= 0); From f6b3b1f5d68a98b71ef5759b83eec15cd0e7a89f Mon Sep 17 00:00:00 2001 From: Tim Chase Date: Sun, 15 Feb 2015 22:28:32 -0600 Subject: [PATCH 02/11] Revert "fix l2arc compression buffers leak" This reverts commit 037763e44e0f6d7284e9328db988a89fdc975a4e in preparation for the illumos 5497 "lock contention on arcs_mtx" patch which includes a fix for this very problem. ZoL had picked up a subset of the illumos 5497 patch to deal with the l2arc compression buffer leak. 
Signed-off-by: Tim Chase Signed-off-by: Brian Behlendorf --- module/zfs/arc.c | 65 ++++++++---------------------------------------- 1 file changed, 10 insertions(+), 55 deletions(-) diff --git a/module/zfs/arc.c b/module/zfs/arc.c index 9882295b9cf8..e129e454561f 100644 --- a/module/zfs/arc.c +++ b/module/zfs/arc.c @@ -309,7 +309,6 @@ typedef struct arc_stats { kstat_named_t arcstat_l2_evict_lock_retry; kstat_named_t arcstat_l2_evict_reading; kstat_named_t arcstat_l2_free_on_write; - kstat_named_t arcstat_l2_cdata_free_on_write; kstat_named_t arcstat_l2_abort_lowmem; kstat_named_t arcstat_l2_cksum_bad; kstat_named_t arcstat_l2_io_error; @@ -398,7 +397,6 @@ static arc_stats_t arc_stats = { { "l2_evict_lock_retry", KSTAT_DATA_UINT64 }, { "l2_evict_reading", KSTAT_DATA_UINT64 }, { "l2_free_on_write", KSTAT_DATA_UINT64 }, - { "l2_cdata_free_on_write", KSTAT_DATA_UINT64 }, { "l2_abort_lowmem", KSTAT_DATA_UINT64 }, { "l2_cksum_bad", KSTAT_DATA_UINT64 }, { "l2_io_error", KSTAT_DATA_UINT64 }, @@ -1474,21 +1472,6 @@ arc_buf_add_ref(arc_buf_t *buf, void* tag) data, metadata, hits); } -static void -arc_buf_free_on_write(void *data, size_t size, - void (*free_func)(void *, size_t)) -{ - l2arc_data_free_t *df; - - df = kmem_alloc(sizeof (l2arc_data_free_t), KM_SLEEP); - df->l2df_data = data; - df->l2df_size = size; - df->l2df_func = free_func; - mutex_enter(&l2arc_free_on_write_mtx); - list_insert_head(l2arc_free_on_write, df); - mutex_exit(&l2arc_free_on_write_mtx); -} - /* * Free the arc data buffer. If it is an l2arc write in progress, * the buffer is placed on l2arc_free_on_write to be freed later. 
@@ -1499,7 +1482,14 @@ arc_buf_data_free(arc_buf_t *buf, void (*free_func)(void *, size_t)) arc_buf_hdr_t *hdr = buf->b_hdr; if (HDR_L2_WRITING(hdr)) { - arc_buf_free_on_write(buf->b_data, hdr->b_size, free_func); + l2arc_data_free_t *df; + df = kmem_alloc(sizeof (l2arc_data_free_t), KM_SLEEP); + df->l2df_data = buf->b_data; + df->l2df_size = hdr->b_size; + df->l2df_func = free_func; + mutex_enter(&l2arc_free_on_write_mtx); + list_insert_head(l2arc_free_on_write, df); + mutex_exit(&l2arc_free_on_write_mtx); ARCSTAT_BUMP(arcstat_l2_free_on_write); } else { free_func(buf->b_data, hdr->b_size); @@ -1510,23 +1500,6 @@ arc_buf_data_free(arc_buf_t *buf, void (*free_func)(void *, size_t)) * Free up buf->b_data and if 'remove' is set, then pull the * arc_buf_t off of the the arc_buf_hdr_t's list and free it. */ -static void -arc_buf_l2_cdata_free(arc_buf_hdr_t *hdr) -{ - l2arc_buf_hdr_t *l2hdr = hdr->b_l2hdr; - - ASSERT(MUTEX_HELD(&l2arc_buflist_mtx)); - - if (l2hdr->b_tmp_cdata == NULL) - return; - - ASSERT(HDR_L2_WRITING(hdr)); - arc_buf_free_on_write(l2hdr->b_tmp_cdata, hdr->b_size, - zio_data_buf_free); - ARCSTAT_BUMP(arcstat_l2_cdata_free_on_write); - l2hdr->b_tmp_cdata = NULL; -} - static void arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t remove) { @@ -1622,7 +1595,6 @@ arc_hdr_destroy(arc_buf_hdr_t *hdr) if (l2hdr != NULL) { list_remove(l2hdr->b_dev->l2ad_buflist, hdr); - arc_buf_l2_cdata_free(hdr); ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size); ARCSTAT_INCR(arcstat_l2_asize, -l2hdr->b_asize); vdev_space_update(l2hdr->b_dev->l2ad_vdev, @@ -3673,7 +3645,6 @@ arc_release(arc_buf_t *buf, void *tag) l2hdr = hdr->b_l2hdr; if (l2hdr) { mutex_enter(&l2arc_buflist_mtx); - arc_buf_l2_cdata_free(hdr); hdr->b_l2hdr = NULL; list_remove(l2hdr->b_dev->l2ad_buflist, hdr); } @@ -4915,11 +4886,6 @@ l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all) ARCSTAT_INCR(arcstat_l2_asize, -abl2->b_asize); bytes_evicted += abl2->b_asize; ab->b_l2hdr = NULL; - /* - * 
We are destroying l2hdr, so ensure that - * its compressed buffer, if any, is not leaked. - */ - ASSERT(abl2->b_tmp_cdata == NULL); kmem_cache_free(l2arc_hdr_cache, abl2); arc_space_return(L2HDR_SIZE, ARC_SPACE_L2HDRS); ARCSTAT_INCR(arcstat_l2_size, -ab->b_size); @@ -5154,14 +5120,6 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz, buf_data = l2hdr->b_tmp_cdata; buf_sz = l2hdr->b_asize; - /* - * If the data has not been compressed, then clear b_tmp_cdata - * to make sure that it points only to a temporary compression - * buffer. - */ - if (!L2ARC_IS_VALID_COMPRESS(l2hdr->b_compress)) - l2hdr->b_tmp_cdata = NULL; - /* Compression may have squashed the buffer to zero length. */ if (buf_sz != 0) { uint64_t buf_p_sz; @@ -5352,18 +5310,15 @@ l2arc_release_cdata_buf(arc_buf_hdr_t *ab) { l2arc_buf_hdr_t *l2hdr = ab->b_l2hdr; - ASSERT(L2ARC_IS_VALID_COMPRESS(l2hdr->b_compress)); - if (l2hdr->b_compress != ZIO_COMPRESS_EMPTY) { + if (l2hdr->b_compress == ZIO_COMPRESS_LZ4) { /* * If the data was compressed, then we've allocated a * temporary buffer for it, so now we need to release it. */ ASSERT(l2hdr->b_tmp_cdata != NULL); zio_data_buf_free(l2hdr->b_tmp_cdata, ab->b_size); - l2hdr->b_tmp_cdata = NULL; - } else { - ASSERT(l2hdr->b_tmp_cdata == NULL); } + l2hdr->b_tmp_cdata = NULL; } /* From 97639d0a528fea141574c65241be686e9f8d5c72 Mon Sep 17 00:00:00 2001 From: Tim Chase Date: Fri, 23 Jan 2015 08:08:14 -0600 Subject: [PATCH 03/11] Revert "Allow arc_evict_ghost() to only evict meta data" Illumos 5497 "lock contention on arcs_mtx" reworks eviction and obviates the need for this. 
Signed-off-by: Tim Chase Signed-off-by: Brian Behlendorf --- module/zfs/arc.c | 26 +++++++++++--------------- 1 file changed, 11 insertions(+), 15 deletions(-) diff --git a/module/zfs/arc.c b/module/zfs/arc.c index e129e454561f..68180b0937b1 100644 --- a/module/zfs/arc.c +++ b/module/zfs/arc.c @@ -498,8 +498,7 @@ static arc_buf_hdr_t arc_eviction_hdr; static void arc_get_data_buf(arc_buf_t *buf); static void arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock); static int arc_evict_needed(arc_buf_contents_t type); -static void arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes, - arc_buf_contents_t type); +static void arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes); static void arc_buf_watch(arc_buf_t *buf); static boolean_t l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *ab); @@ -1945,7 +1944,6 @@ arc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle, * that evicting from the ghost list in this hot code path, leave * this chore to the arc_reclaim_thread(). */ - return (stolen); } @@ -1954,12 +1952,11 @@ arc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle, * bytes. Destroy the buffers that are removed. 
*/ static void -arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes, - arc_buf_contents_t type) +arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes) { arc_buf_hdr_t *ab, *ab_prev; arc_buf_hdr_t marker; - list_t *list = &state->arcs_list[type]; + list_t *list = &state->arcs_list[ARC_BUFC_DATA]; kmutex_t *hash_lock; uint64_t bytes_deleted = 0; uint64_t bufs_skipped = 0; @@ -2094,7 +2091,7 @@ arc_adjust(void) if (adjustment > 0 && arc_mru_ghost->arcs_size > 0) { delta = MIN(arc_mru_ghost->arcs_size, adjustment); - arc_evict_ghost(arc_mru_ghost, 0, delta, ARC_BUFC_DATA); + arc_evict_ghost(arc_mru_ghost, 0, delta); } adjustment = @@ -2102,7 +2099,7 @@ arc_adjust(void) if (adjustment > 0 && arc_mfu_ghost->arcs_size > 0) { delta = MIN(arc_mfu_ghost->arcs_size, adjustment); - arc_evict_ghost(arc_mfu_ghost, 0, delta, ARC_BUFC_DATA); + arc_evict_ghost(arc_mfu_ghost, 0, delta); } } @@ -2231,15 +2228,14 @@ arc_adjust_meta(void) if (adjustmnt > 0 && arc_mru_ghost->arcs_lsize[type] > 0) { delta = MIN(adjustmnt, - arc_mru_ghost->arcs_lsize[type]); - arc_evict_ghost(arc_mru_ghost, 0, delta, type); - adjustmnt -= delta; + arc_mru_ghost->arcs_lsize[ARC_BUFC_METADATA]); + arc_evict_ghost(arc_mru_ghost, 0, delta); } if (adjustmnt > 0 && arc_mfu_ghost->arcs_lsize[type] > 0) { delta = MIN(adjustmnt, - arc_mfu_ghost->arcs_lsize[type]); - arc_evict_ghost(arc_mfu_ghost, 0, delta, type); + arc_mfu_ghost->arcs_lsize[ARC_BUFC_METADATA]); + arc_evict_ghost(arc_mfu_ghost, 0, delta); } /* @@ -2301,8 +2297,8 @@ arc_flush(spa_t *spa) break; } - arc_evict_ghost(arc_mru_ghost, guid, -1, ARC_BUFC_DATA); - arc_evict_ghost(arc_mfu_ghost, guid, -1, ARC_BUFC_DATA); + arc_evict_ghost(arc_mru_ghost, guid, -1); + arc_evict_ghost(arc_mfu_ghost, guid, -1); mutex_enter(&arc_reclaim_thr_lock); arc_do_user_evicts(); From ad4af89561eb16e45df37f3ce242679042718e0e Mon Sep 17 00:00:00 2001 From: Tim Chase Date: Sat, 24 Jan 2015 12:40:59 -0600 Subject: [PATCH 04/11] Partially revert "Add ddt, 
ddt_entry, and l2arc_hdr caches" This reverts only the l2arc_hdr part of commit ecf3d9b8e63e5659269e15db527380c65780f71a in preparation for the illumos 5497 "lock contention on arcs_mtx" patch which does the same thing but uses the newer two-level ARC structure following the Illumos 5408 "managing ZFS cache devices requires lots of RAM" patch. Signed-off-by: Tim Chase Signed-off-by: Brian Behlendorf --- module/zfs/arc.c | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/module/zfs/arc.c b/module/zfs/arc.c index 68180b0937b1..1941044a0c75 100644 --- a/module/zfs/arc.c +++ b/module/zfs/arc.c @@ -797,7 +797,6 @@ buf_hash_remove(arc_buf_hdr_t *buf) */ static kmem_cache_t *hdr_cache; static kmem_cache_t *buf_cache; -static kmem_cache_t *l2arc_hdr_cache; static void buf_fini(void) @@ -819,7 +818,6 @@ buf_fini(void) mutex_destroy(&buf_hash_table.ht_locks[i].ht_lock); kmem_cache_destroy(hdr_cache); kmem_cache_destroy(buf_cache); - kmem_cache_destroy(l2arc_hdr_cache); } /* @@ -921,8 +919,6 @@ buf_init(void) 0, hdr_cons, hdr_dest, NULL, NULL, NULL, 0); buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t), 0, buf_cons, buf_dest, NULL, NULL, NULL, 0); - l2arc_hdr_cache = kmem_cache_create("l2arc_buf_hdr_t", L2HDR_SIZE, - 0, NULL, NULL, NULL, NULL, NULL, 0); for (i = 0; i < 256; i++) for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--) @@ -1598,7 +1594,7 @@ arc_hdr_destroy(arc_buf_hdr_t *hdr) ARCSTAT_INCR(arcstat_l2_asize, -l2hdr->b_asize); vdev_space_update(l2hdr->b_dev->l2ad_vdev, -l2hdr->b_asize, 0, 0); - kmem_cache_free(l2arc_hdr_cache, l2hdr); + kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t)); arc_space_return(L2HDR_SIZE, ARC_SPACE_L2HDRS); if (hdr->b_state == arc_l2c_only) l2arc_hdr_stat_remove(); @@ -3738,7 +3734,7 @@ arc_release(arc_buf_t *buf, void *tag) ARCSTAT_INCR(arcstat_l2_asize, -l2hdr->b_asize); vdev_space_update(l2hdr->b_dev->l2ad_vdev, -l2hdr->b_asize, 0, 0); - kmem_cache_free(l2arc_hdr_cache, l2hdr); + 
kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t)); arc_space_return(L2HDR_SIZE, ARC_SPACE_L2HDRS); ARCSTAT_INCR(arcstat_l2_size, -buf_size); mutex_exit(&l2arc_buflist_mtx); @@ -4623,7 +4619,7 @@ l2arc_write_done(zio_t *zio) ARCSTAT_INCR(arcstat_l2_asize, -abl2->b_asize); bytes_dropped += abl2->b_asize; ab->b_l2hdr = NULL; - kmem_cache_free(l2arc_hdr_cache, abl2); + kmem_free(abl2, sizeof (l2arc_buf_hdr_t)); arc_space_return(L2HDR_SIZE, ARC_SPACE_L2HDRS); ARCSTAT_INCR(arcstat_l2_size, -ab->b_size); } @@ -4882,7 +4878,7 @@ l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all) ARCSTAT_INCR(arcstat_l2_asize, -abl2->b_asize); bytes_evicted += abl2->b_asize; ab->b_l2hdr = NULL; - kmem_cache_free(l2arc_hdr_cache, abl2); + kmem_free(abl2, sizeof (l2arc_buf_hdr_t)); arc_space_return(L2HDR_SIZE, ARC_SPACE_L2HDRS); ARCSTAT_INCR(arcstat_l2_size, -ab->b_size); } @@ -5028,9 +5024,9 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz, /* * Create and add a new L2ARC header. */ - l2hdr = kmem_cache_alloc(l2arc_hdr_cache, KM_SLEEP); + l2hdr = kmem_zalloc(sizeof (l2arc_buf_hdr_t), + KM_PUSHPAGE); l2hdr->b_dev = dev; - l2hdr->b_daddr = 0; arc_space_consume(L2HDR_SIZE, ARC_SPACE_L2HDRS); ab->b_flags |= ARC_L2_WRITING; From 2a4324141f4a0811ba29dfef123fe5dad2ca1b03 Mon Sep 17 00:00:00 2001 From: George Wilson Date: Sat, 6 Dec 2014 09:24:32 -0800 Subject: [PATCH 05/11] Illumos 5369 - arc flags should be an enum 5369 arc flags should be an enum 5370 consistent arc_buf_hdr_t naming scheme Reviewed by: Matthew Ahrens Reviewed by: Alex Reece Reviewed by: Sebastien Roy Reviewed by: Richard Elling Approved by: Richard Lowe Porting notes: ZoL has moved some ARC definitions into arc_impl.h. 
Signed-off-by: Brian Behlendorf Ported by: Tim Chase --- cmd/zdb/zdb.c | 2 +- include/sys/arc.h | 41 ++- include/sys/arc_impl.h | 2 +- module/zfs/arc.c | 689 +++++++++++++++++++------------------- module/zfs/dbuf.c | 11 +- module/zfs/dmu_diff.c | 2 +- module/zfs/dmu_objset.c | 6 +- module/zfs/dmu_send.c | 6 +- module/zfs/dmu_traverse.c | 12 +- module/zfs/dsl_scan.c | 8 +- module/zfs/spa_stats.c | 2 +- module/zfs/zil.c | 4 +- module/zfs/zio.c | 2 +- 13 files changed, 398 insertions(+), 389 deletions(-) diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c index 6e75d0c21641..8572dae2e922 100644 --- a/cmd/zdb/zdb.c +++ b/cmd/zdb/zdb.c @@ -1250,7 +1250,7 @@ visit_indirect(spa_t *spa, const dnode_phys_t *dnp, print_indirect(bp, zb, dnp); if (BP_GET_LEVEL(bp) > 0 && !BP_IS_HOLE(bp)) { - uint32_t flags = ARC_WAIT; + arc_flags_t flags = ARC_FLAG_WAIT; int i; blkptr_t *cbp; int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT; diff --git a/include/sys/arc.h b/include/sys/arc.h index 215c75b6dfa3..25e2f035253d 100644 --- a/include/sys/arc.h +++ b/include/sys/arc.h @@ -57,6 +57,36 @@ struct arc_prune { refcount_t p_refcnt; }; +typedef enum arc_flags +{ + /* + * Public flags that can be passed into the ARC by external consumers. + */ + ARC_FLAG_NONE = 1 << 0, /* No flags set */ + ARC_FLAG_WAIT = 1 << 1, /* perform sync I/O */ + ARC_FLAG_NOWAIT = 1 << 2, /* perform async I/O */ + ARC_FLAG_PREFETCH = 1 << 3, /* I/O is a prefetch */ + ARC_FLAG_CACHED = 1 << 4, /* I/O was in cache */ + ARC_FLAG_L2CACHE = 1 << 5, /* cache in L2ARC */ + ARC_FLAG_L2COMPRESS = 1 << 6, /* compress in L2ARC */ + + /* + * Private ARC flags. These flags are private ARC only flags that + * will show up in b_flags in the arc_buf_hdr_t. These flags should + * only be set by ARC code. 
+ */ + ARC_FLAG_IN_HASH_TABLE = 1 << 7, /* buffer is hashed */ + ARC_FLAG_IO_IN_PROGRESS = 1 << 8, /* I/O in progress */ + ARC_FLAG_IO_ERROR = 1 << 9, /* I/O failed for buf */ + ARC_FLAG_FREED_IN_READ = 1 << 10, /* freed during read */ + ARC_FLAG_BUF_AVAILABLE = 1 << 11, /* block not in use */ + ARC_FLAG_INDIRECT = 1 << 12, /* indirect block */ + ARC_FLAG_FREE_IN_PROGRESS = 1 << 13, /* about to be freed */ + ARC_FLAG_L2_WRITING = 1 << 14, /* write in progress */ + ARC_FLAG_L2_EVICTED = 1 << 15, /* evicted during I/O */ + ARC_FLAG_L2_WRITE_HEAD = 1 << 16, /* head of write list */ +} arc_flags_t; + struct arc_buf { arc_buf_hdr_t *b_hdr; arc_buf_t *b_next; @@ -71,15 +101,6 @@ typedef enum arc_buf_contents { ARC_BUFC_METADATA, /* buffer contains metadata */ ARC_BUFC_NUMTYPES } arc_buf_contents_t; -/* - * These are the flags we pass into calls to the arc - */ -#define ARC_WAIT (1 << 1) /* perform I/O synchronously */ -#define ARC_NOWAIT (1 << 2) /* perform I/O asynchronously */ -#define ARC_PREFETCH (1 << 3) /* I/O is a prefetch */ -#define ARC_CACHED (1 << 4) /* I/O was already in cache */ -#define ARC_L2CACHE (1 << 5) /* cache in L2ARC */ -#define ARC_L2COMPRESS (1 << 6) /* compress in L2ARC */ /* * The following breakdows of arc_size exist for kstat only. 
@@ -146,7 +167,7 @@ int arc_referenced(arc_buf_t *buf); int arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done, void *private, zio_priority_t priority, int flags, - uint32_t *arc_flags, const zbookmark_phys_t *zb); + arc_flags_t *arc_flags, const zbookmark_phys_t *zb); zio_t *arc_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc, boolean_t l2arc_compress, const zio_prop_t *zp, arc_done_func_t *ready, arc_done_func_t *physdone, diff --git a/include/sys/arc_impl.h b/include/sys/arc_impl.h index e7068ea188e3..1f8351a6784b 100644 --- a/include/sys/arc_impl.h +++ b/include/sys/arc_impl.h @@ -107,7 +107,7 @@ struct arc_buf_hdr { arc_buf_hdr_t *b_hash_next; arc_buf_t *b_buf; - uint32_t b_flags; + arc_flags_t b_flags; uint32_t b_datacnt; arc_callback_t *b_acb; diff --git a/module/zfs/arc.c b/module/zfs/arc.c index 1941044a0c75..7b34e6825814 100644 --- a/module/zfs/arc.c +++ b/module/zfs/arc.c @@ -495,50 +495,26 @@ static kmutex_t arc_prune_mtx; static arc_buf_t *arc_eviction_list; static kmutex_t arc_eviction_mtx; static arc_buf_hdr_t arc_eviction_hdr; -static void arc_get_data_buf(arc_buf_t *buf); -static void arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock); -static int arc_evict_needed(arc_buf_contents_t type); -static void arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes); -static void arc_buf_watch(arc_buf_t *buf); - -static boolean_t l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *ab); #define GHOST_STATE(state) \ ((state) == arc_mru_ghost || (state) == arc_mfu_ghost || \ (state) == arc_l2c_only) -/* - * Private ARC flags. These flags are private ARC only flags that will show up - * in b_flags in the arc_hdr_buf_t. Some flags are publicly declared, and can - * be passed in as arc_flags in things like arc_read. However, these flags - * should never be passed and should only be set by ARC code. When adding new - * public flags, make sure not to smash the private ones. 
- */ - -#define ARC_IN_HASH_TABLE (1 << 9) /* this buffer is hashed */ -#define ARC_IO_IN_PROGRESS (1 << 10) /* I/O in progress for buf */ -#define ARC_IO_ERROR (1 << 11) /* I/O failed for buf */ -#define ARC_FREED_IN_READ (1 << 12) /* buf freed while in read */ -#define ARC_BUF_AVAILABLE (1 << 13) /* block not in active use */ -#define ARC_INDIRECT (1 << 14) /* this is an indirect block */ -#define ARC_FREE_IN_PROGRESS (1 << 15) /* hdr about to be freed */ -#define ARC_L2_WRITING (1 << 16) /* L2ARC write in progress */ -#define ARC_L2_EVICTED (1 << 17) /* evicted during I/O */ -#define ARC_L2_WRITE_HEAD (1 << 18) /* head of write list */ - -#define HDR_IN_HASH_TABLE(hdr) ((hdr)->b_flags & ARC_IN_HASH_TABLE) -#define HDR_IO_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_IO_IN_PROGRESS) -#define HDR_IO_ERROR(hdr) ((hdr)->b_flags & ARC_IO_ERROR) -#define HDR_PREFETCH(hdr) ((hdr)->b_flags & ARC_PREFETCH) -#define HDR_FREED_IN_READ(hdr) ((hdr)->b_flags & ARC_FREED_IN_READ) -#define HDR_BUF_AVAILABLE(hdr) ((hdr)->b_flags & ARC_BUF_AVAILABLE) -#define HDR_FREE_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_FREE_IN_PROGRESS) -#define HDR_L2CACHE(hdr) ((hdr)->b_flags & ARC_L2CACHE) -#define HDR_L2_READING(hdr) ((hdr)->b_flags & ARC_IO_IN_PROGRESS && \ - (hdr)->b_l2hdr != NULL) -#define HDR_L2_WRITING(hdr) ((hdr)->b_flags & ARC_L2_WRITING) -#define HDR_L2_EVICTED(hdr) ((hdr)->b_flags & ARC_L2_EVICTED) -#define HDR_L2_WRITE_HEAD(hdr) ((hdr)->b_flags & ARC_L2_WRITE_HEAD) +#define HDR_IN_HASH_TABLE(hdr) ((hdr)->b_flags & ARC_FLAG_IN_HASH_TABLE) +#define HDR_IO_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS) +#define HDR_IO_ERROR(hdr) ((hdr)->b_flags & ARC_FLAG_IO_ERROR) +#define HDR_PREFETCH(hdr) ((hdr)->b_flags & ARC_FLAG_PREFETCH) +#define HDR_FREED_IN_READ(hdr) ((hdr)->b_flags & ARC_FLAG_FREED_IN_READ) +#define HDR_BUF_AVAILABLE(hdr) ((hdr)->b_flags & ARC_FLAG_BUF_AVAILABLE) +#define HDR_FREE_IN_PROGRESS(hdr) \ + ((hdr)->b_flags & ARC_FLAG_FREE_IN_PROGRESS) +#define 
HDR_L2CACHE(hdr) ((hdr)->b_flags & ARC_FLAG_L2CACHE) +#define HDR_L2_READING(hdr) \ + ((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS && \ + (hdr)->b_l2hdr != NULL) +#define HDR_L2_WRITING(hdr) ((hdr)->b_flags & ARC_FLAG_L2_WRITING) +#define HDR_L2_EVICTED(hdr) ((hdr)->b_flags & ARC_FLAG_L2_EVICTED) +#define HDR_L2_WRITE_HEAD(hdr) ((hdr)->b_flags & ARC_FLAG_L2_WRITE_HEAD) /* * Other sizes @@ -655,14 +631,20 @@ static kmutex_t l2arc_feed_thr_lock; static kcondvar_t l2arc_feed_thr_cv; static uint8_t l2arc_thread_exit; -static void l2arc_read_done(zio_t *zio); +static void arc_get_data_buf(arc_buf_t *); +static void arc_access(arc_buf_hdr_t *, kmutex_t *); +static int arc_evict_needed(arc_buf_contents_t); +static void arc_evict_ghost(arc_state_t *, uint64_t, int64_t); +static void arc_buf_watch(arc_buf_t *); + +static boolean_t l2arc_write_eligible(uint64_t, arc_buf_hdr_t *); +static void l2arc_read_done(zio_t *); static void l2arc_hdr_stat_add(void); static void l2arc_hdr_stat_remove(void); -static boolean_t l2arc_compress_buf(l2arc_buf_hdr_t *l2hdr); -static void l2arc_decompress_zio(zio_t *zio, arc_buf_hdr_t *hdr, - enum zio_compress c); -static void l2arc_release_cdata_buf(arc_buf_hdr_t *ab); +static boolean_t l2arc_compress_buf(l2arc_buf_hdr_t *); +static void l2arc_decompress_zio(zio_t *, arc_buf_hdr_t *, enum zio_compress); +static void l2arc_release_cdata_buf(arc_buf_hdr_t *); static uint64_t buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth) @@ -707,14 +689,14 @@ buf_hash_find(uint64_t spa, const blkptr_t *bp, kmutex_t **lockp) uint64_t birth = BP_PHYSICAL_BIRTH(bp); uint64_t idx = BUF_HASH_INDEX(spa, dva, birth); kmutex_t *hash_lock = BUF_HASH_LOCK(idx); - arc_buf_hdr_t *buf; + arc_buf_hdr_t *hdr; mutex_enter(hash_lock); - for (buf = buf_hash_table.ht_table[idx]; buf != NULL; - buf = buf->b_hash_next) { - if (BUF_EQUAL(spa, dva, birth, buf)) { + for (hdr = buf_hash_table.ht_table[idx]; hdr != NULL; + hdr = hdr->b_hash_next) { + if (BUF_EQUAL(spa, dva, birth, 
hdr)) { *lockp = hash_lock; - return (buf); + return (hdr); } } mutex_exit(hash_lock); @@ -729,27 +711,27 @@ buf_hash_find(uint64_t spa, const blkptr_t *bp, kmutex_t **lockp) * Otherwise returns NULL. */ static arc_buf_hdr_t * -buf_hash_insert(arc_buf_hdr_t *buf, kmutex_t **lockp) +buf_hash_insert(arc_buf_hdr_t *hdr, kmutex_t **lockp) { - uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth); + uint64_t idx = BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth); kmutex_t *hash_lock = BUF_HASH_LOCK(idx); - arc_buf_hdr_t *fbuf; + arc_buf_hdr_t *fhdr; uint32_t i; - ASSERT(!DVA_IS_EMPTY(&buf->b_dva)); - ASSERT(buf->b_birth != 0); - ASSERT(!HDR_IN_HASH_TABLE(buf)); + ASSERT(!DVA_IS_EMPTY(&hdr->b_dva)); + ASSERT(hdr->b_birth != 0); + ASSERT(!HDR_IN_HASH_TABLE(hdr)); *lockp = hash_lock; mutex_enter(hash_lock); - for (fbuf = buf_hash_table.ht_table[idx], i = 0; fbuf != NULL; - fbuf = fbuf->b_hash_next, i++) { - if (BUF_EQUAL(buf->b_spa, &buf->b_dva, buf->b_birth, fbuf)) - return (fbuf); + for (fhdr = buf_hash_table.ht_table[idx], i = 0; fhdr != NULL; + fhdr = fhdr->b_hash_next, i++) { + if (BUF_EQUAL(hdr->b_spa, &hdr->b_dva, hdr->b_birth, fhdr)) + return (fhdr); } - buf->b_hash_next = buf_hash_table.ht_table[idx]; - buf_hash_table.ht_table[idx] = buf; - buf->b_flags |= ARC_IN_HASH_TABLE; + hdr->b_hash_next = buf_hash_table.ht_table[idx]; + buf_hash_table.ht_table[idx] = hdr; + hdr->b_flags |= ARC_FLAG_IN_HASH_TABLE; /* collect some hash table performance data */ if (i > 0) { @@ -767,22 +749,22 @@ buf_hash_insert(arc_buf_hdr_t *buf, kmutex_t **lockp) } static void -buf_hash_remove(arc_buf_hdr_t *buf) +buf_hash_remove(arc_buf_hdr_t *hdr) { - arc_buf_hdr_t *fbuf, **bufp; - uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth); + arc_buf_hdr_t *fhdr, **hdrp; + uint64_t idx = BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth); ASSERT(MUTEX_HELD(BUF_HASH_LOCK(idx))); - ASSERT(HDR_IN_HASH_TABLE(buf)); + ASSERT(HDR_IN_HASH_TABLE(hdr)); - bufp = 
&buf_hash_table.ht_table[idx]; - while ((fbuf = *bufp) != buf) { - ASSERT(fbuf != NULL); - bufp = &fbuf->b_hash_next; + hdrp = &buf_hash_table.ht_table[idx]; + while ((fhdr = *hdrp) != hdr) { + ASSERT(fhdr != NULL); + hdrp = &fhdr->b_hash_next; } - *bufp = buf->b_hash_next; - buf->b_hash_next = NULL; - buf->b_flags &= ~ARC_IN_HASH_TABLE; + *hdrp = hdr->b_hash_next; + hdr->b_hash_next = NULL; + hdr->b_flags &= ~ARC_FLAG_IN_HASH_TABLE; /* collect some hash table performance data */ ARCSTAT_BUMPDOWN(arcstat_hash_elements); @@ -828,14 +810,14 @@ buf_fini(void) static int hdr_cons(void *vbuf, void *unused, int kmflag) { - arc_buf_hdr_t *buf = vbuf; - - bzero(buf, sizeof (arc_buf_hdr_t)); - refcount_create(&buf->b_refcnt); - cv_init(&buf->b_cv, NULL, CV_DEFAULT, NULL); - mutex_init(&buf->b_freeze_lock, NULL, MUTEX_DEFAULT, NULL); - list_link_init(&buf->b_arc_node); - list_link_init(&buf->b_l2node); + arc_buf_hdr_t *hdr = vbuf; + + bzero(hdr, sizeof (arc_buf_hdr_t)); + refcount_create(&hdr->b_refcnt); + cv_init(&hdr->b_cv, NULL, CV_DEFAULT, NULL); + mutex_init(&hdr->b_freeze_lock, NULL, MUTEX_DEFAULT, NULL); + list_link_init(&hdr->b_arc_node); + list_link_init(&hdr->b_l2node); arc_space_consume(sizeof (arc_buf_hdr_t), ARC_SPACE_HDRS); return (0); @@ -862,12 +844,12 @@ buf_cons(void *vbuf, void *unused, int kmflag) static void hdr_dest(void *vbuf, void *unused) { - arc_buf_hdr_t *buf = vbuf; + arc_buf_hdr_t *hdr = vbuf; - ASSERT(BUF_EMPTY(buf)); - refcount_destroy(&buf->b_refcnt); - cv_destroy(&buf->b_cv); - mutex_destroy(&buf->b_freeze_lock); + ASSERT(BUF_EMPTY(hdr)); + refcount_destroy(&hdr->b_refcnt); + cv_destroy(&hdr->b_cv); + mutex_destroy(&hdr->b_freeze_lock); arc_space_return(sizeof (arc_buf_hdr_t), ARC_SPACE_HDRS); } @@ -942,7 +924,7 @@ arc_cksum_verify(arc_buf_t *buf) mutex_enter(&buf->b_hdr->b_freeze_lock); if (buf->b_hdr->b_freeze_cksum == NULL || - (buf->b_hdr->b_flags & ARC_IO_ERROR)) { + (buf->b_hdr->b_flags & ARC_FLAG_IO_ERROR)) { 
mutex_exit(&buf->b_hdr->b_freeze_lock); return; } @@ -1021,7 +1003,7 @@ arc_buf_thaw(arc_buf_t *buf) if (zfs_flags & ZFS_DEBUG_MODIFY) { if (buf->b_hdr->b_state != arc_anon) panic("modifying non-anon buffer!"); - if (buf->b_hdr->b_flags & ARC_IO_IN_PROGRESS) + if (buf->b_hdr->b_flags & ARC_FLAG_IO_IN_PROGRESS) panic("modifying buffer while i/o in progress!"); arc_cksum_verify(buf); } @@ -1056,54 +1038,54 @@ arc_buf_freeze(arc_buf_t *buf) } static void -add_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag) +add_reference(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, void *tag) { ASSERT(MUTEX_HELD(hash_lock)); - if ((refcount_add(&ab->b_refcnt, tag) == 1) && - (ab->b_state != arc_anon)) { - uint64_t delta = ab->b_size * ab->b_datacnt; - list_t *list = &ab->b_state->arcs_list[ab->b_type]; - uint64_t *size = &ab->b_state->arcs_lsize[ab->b_type]; - - ASSERT(!MUTEX_HELD(&ab->b_state->arcs_mtx)); - mutex_enter(&ab->b_state->arcs_mtx); - ASSERT(list_link_active(&ab->b_arc_node)); - list_remove(list, ab); - if (GHOST_STATE(ab->b_state)) { - ASSERT0(ab->b_datacnt); - ASSERT3P(ab->b_buf, ==, NULL); - delta = ab->b_size; + if ((refcount_add(&hdr->b_refcnt, tag) == 1) && + (hdr->b_state != arc_anon)) { + uint64_t delta = hdr->b_size * hdr->b_datacnt; + list_t *list = &hdr->b_state->arcs_list[hdr->b_type]; + uint64_t *size = &hdr->b_state->arcs_lsize[hdr->b_type]; + + ASSERT(!MUTEX_HELD(&hdr->b_state->arcs_mtx)); + mutex_enter(&hdr->b_state->arcs_mtx); + ASSERT(list_link_active(&hdr->b_arc_node)); + list_remove(list, hdr); + if (GHOST_STATE(hdr->b_state)) { + ASSERT0(hdr->b_datacnt); + ASSERT3P(hdr->b_buf, ==, NULL); + delta = hdr->b_size; } ASSERT(delta > 0); ASSERT3U(*size, >=, delta); atomic_add_64(size, -delta); - mutex_exit(&ab->b_state->arcs_mtx); + mutex_exit(&hdr->b_state->arcs_mtx); /* remove the prefetch flag if we get a reference */ - if (ab->b_flags & ARC_PREFETCH) - ab->b_flags &= ~ARC_PREFETCH; + if (hdr->b_flags & ARC_FLAG_PREFETCH) + hdr->b_flags &= 
~ARC_FLAG_PREFETCH; } } static int -remove_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag) +remove_reference(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, void *tag) { int cnt; - arc_state_t *state = ab->b_state; + arc_state_t *state = hdr->b_state; ASSERT(state == arc_anon || MUTEX_HELD(hash_lock)); ASSERT(!GHOST_STATE(state)); - if (((cnt = refcount_remove(&ab->b_refcnt, tag)) == 0) && + if (((cnt = refcount_remove(&hdr->b_refcnt, tag)) == 0) && (state != arc_anon)) { - uint64_t *size = &state->arcs_lsize[ab->b_type]; + uint64_t *size = &state->arcs_lsize[hdr->b_type]; ASSERT(!MUTEX_HELD(&state->arcs_mtx)); mutex_enter(&state->arcs_mtx); - ASSERT(!list_link_active(&ab->b_arc_node)); - list_insert_head(&state->arcs_list[ab->b_type], ab); - ASSERT(ab->b_datacnt > 0); - atomic_add_64(size, ab->b_size * ab->b_datacnt); + ASSERT(!list_link_active(&hdr->b_arc_node)); + list_insert_head(&state->arcs_list[hdr->b_type], hdr); + ASSERT(hdr->b_datacnt > 0); + atomic_add_64(size, hdr->b_size * hdr->b_datacnt); mutex_exit(&state->arcs_mtx); } return (cnt); @@ -1162,19 +1144,20 @@ arc_buf_info(arc_buf_t *ab, arc_buf_info_t *abi, int state_index) * for the buffer must be held by the caller. 
*/ static void -arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab, kmutex_t *hash_lock) +arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr, + kmutex_t *hash_lock) { - arc_state_t *old_state = ab->b_state; - int64_t refcnt = refcount_count(&ab->b_refcnt); + arc_state_t *old_state = hdr->b_state; + int64_t refcnt = refcount_count(&hdr->b_refcnt); uint64_t from_delta, to_delta; ASSERT(MUTEX_HELD(hash_lock)); ASSERT3P(new_state, !=, old_state); - ASSERT(refcnt == 0 || ab->b_datacnt > 0); - ASSERT(ab->b_datacnt == 0 || !GHOST_STATE(new_state)); - ASSERT(ab->b_datacnt <= 1 || old_state != arc_anon); + ASSERT(refcnt == 0 || hdr->b_datacnt > 0); + ASSERT(hdr->b_datacnt == 0 || !GHOST_STATE(new_state)); + ASSERT(hdr->b_datacnt <= 1 || old_state != arc_anon); - from_delta = to_delta = ab->b_datacnt * ab->b_size; + from_delta = to_delta = hdr->b_datacnt * hdr->b_size; /* * If this buffer is evictable, transfer it from the @@ -1183,22 +1166,22 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab, kmutex_t *hash_lock) if (refcnt == 0) { if (old_state != arc_anon) { int use_mutex = !MUTEX_HELD(&old_state->arcs_mtx); - uint64_t *size = &old_state->arcs_lsize[ab->b_type]; + uint64_t *size = &old_state->arcs_lsize[hdr->b_type]; if (use_mutex) mutex_enter(&old_state->arcs_mtx); - ASSERT(list_link_active(&ab->b_arc_node)); - list_remove(&old_state->arcs_list[ab->b_type], ab); + ASSERT(list_link_active(&hdr->b_arc_node)); + list_remove(&old_state->arcs_list[hdr->b_type], hdr); /* * If prefetching out of the ghost cache, * we will have a non-zero datacnt. 
*/ - if (GHOST_STATE(old_state) && ab->b_datacnt == 0) { + if (GHOST_STATE(old_state) && hdr->b_datacnt == 0) { /* ghost elements have a ghost size */ - ASSERT(ab->b_buf == NULL); - from_delta = ab->b_size; + ASSERT(hdr->b_buf == NULL); + from_delta = hdr->b_size; } ASSERT3U(*size, >=, from_delta); atomic_add_64(size, -from_delta); @@ -1208,18 +1191,19 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab, kmutex_t *hash_lock) } if (new_state != arc_anon) { int use_mutex = !MUTEX_HELD(&new_state->arcs_mtx); - uint64_t *size = &new_state->arcs_lsize[ab->b_type]; + uint64_t *size = &new_state->arcs_lsize[hdr->b_type]; if (use_mutex) mutex_enter(&new_state->arcs_mtx); - list_insert_head(&new_state->arcs_list[ab->b_type], ab); + list_insert_head(&new_state->arcs_list[hdr->b_type], + hdr); /* ghost elements have a ghost size */ if (GHOST_STATE(new_state)) { - ASSERT(ab->b_datacnt == 0); - ASSERT(ab->b_buf == NULL); - to_delta = ab->b_size; + ASSERT(hdr->b_datacnt == 0); + ASSERT(hdr->b_buf == NULL); + to_delta = hdr->b_size; } atomic_add_64(size, to_delta); @@ -1228,9 +1212,9 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab, kmutex_t *hash_lock) } } - ASSERT(!BUF_EMPTY(ab)); - if (new_state == arc_anon && HDR_IN_HASH_TABLE(ab)) - buf_hash_remove(ab); + ASSERT(!BUF_EMPTY(hdr)); + if (new_state == arc_anon && HDR_IN_HASH_TABLE(hdr)) + buf_hash_remove(hdr); /* adjust state sizes */ if (to_delta) @@ -1239,7 +1223,7 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab, kmutex_t *hash_lock) ASSERT3U(old_state->arcs_size, >=, from_delta); atomic_add_64(&old_state->arcs_size, -from_delta); } - ab->b_state = new_state; + hdr->b_state = new_state; /* adjust l2arc hdr stats */ if (new_state == arc_l2c_only) @@ -1462,7 +1446,7 @@ arc_buf_add_ref(arc_buf_t *buf, void* tag) arc_access(hdr, hash_lock); mutex_exit(hash_lock); ARCSTAT_BUMP(arcstat_hits); - ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH), + ARCSTAT_CONDSTAT(!(hdr->b_flags & 
ARC_FLAG_PREFETCH), demand, prefetch, hdr->b_type != ARC_BUFC_METADATA, data, metadata, hits); } @@ -1660,7 +1644,7 @@ arc_buf_free(arc_buf_t *buf, void *tag) } else { ASSERT(buf == hdr->b_buf); ASSERT(buf->b_efunc == NULL); - hdr->b_flags |= ARC_BUF_AVAILABLE; + hdr->b_flags |= ARC_FLAG_BUF_AVAILABLE; } mutex_exit(hash_lock); } else if (HDR_IO_IN_PROGRESS(hdr)) { @@ -1712,7 +1696,7 @@ arc_buf_remove_ref(arc_buf_t *buf, void* tag) } else if (no_callback) { ASSERT(hdr->b_buf == buf && buf->b_next == NULL); ASSERT(buf->b_efunc == NULL); - hdr->b_flags |= ARC_BUF_AVAILABLE; + hdr->b_flags |= ARC_FLAG_BUF_AVAILABLE; } ASSERT(no_callback || hdr->b_datacnt > 1 || refcount_is_zero(&hdr->b_refcnt)); @@ -1787,7 +1771,7 @@ arc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle, { arc_state_t *evicted_state; uint64_t bytes_evicted = 0, skipped = 0, missed = 0; - arc_buf_hdr_t *ab, *ab_prev = NULL; + arc_buf_hdr_t *hdr, *hdr_prev = NULL; list_t *list = &state->arcs_list[type]; kmutex_t *hash_lock; boolean_t have_lock; @@ -1803,24 +1787,24 @@ arc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle, mutex_enter(&state->arcs_mtx); mutex_enter(&evicted_state->arcs_mtx); - for (ab = list_tail(list); ab; ab = ab_prev) { - ab_prev = list_prev(list, ab); + for (hdr = list_tail(list); hdr; hdr = hdr_prev) { + hdr_prev = list_prev(list, hdr); /* prefetch buffers have a minimum lifespan */ - if (HDR_IO_IN_PROGRESS(ab) || - (spa && ab->b_spa != spa) || - (ab->b_flags & (ARC_PREFETCH|ARC_INDIRECT) && - ddi_get_lbolt() - ab->b_arc_access < + if (HDR_IO_IN_PROGRESS(hdr) || + (spa && hdr->b_spa != spa) || + (hdr->b_flags & (ARC_FLAG_PREFETCH | ARC_FLAG_INDIRECT) && + ddi_get_lbolt() - hdr->b_arc_access < zfs_arc_min_prefetch_lifespan)) { skipped++; continue; } /* "lookahead" for better eviction candidate */ - if (recycle && ab->b_size != bytes && - ab_prev && ab_prev->b_size == bytes) + if (recycle && hdr->b_size != bytes && + hdr_prev && 
hdr_prev->b_size == bytes) continue; /* ignore markers */ - if (ab->b_spa == 0) + if (hdr->b_spa == 0) continue; /* @@ -1833,34 +1817,34 @@ arc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle, * the hot code path, so don't sleep. */ if (!recycle && count++ > arc_evict_iterations) { - list_insert_after(list, ab, &marker); + list_insert_after(list, hdr, &marker); mutex_exit(&evicted_state->arcs_mtx); mutex_exit(&state->arcs_mtx); kpreempt(KPREEMPT_SYNC); mutex_enter(&state->arcs_mtx); mutex_enter(&evicted_state->arcs_mtx); - ab_prev = list_prev(list, &marker); + hdr_prev = list_prev(list, &marker); list_remove(list, &marker); count = 0; continue; } - hash_lock = HDR_LOCK(ab); + hash_lock = HDR_LOCK(hdr); have_lock = MUTEX_HELD(hash_lock); if (have_lock || mutex_tryenter(hash_lock)) { - ASSERT0(refcount_count(&ab->b_refcnt)); - ASSERT(ab->b_datacnt > 0); - while (ab->b_buf) { - arc_buf_t *buf = ab->b_buf; + ASSERT0(refcount_count(&hdr->b_refcnt)); + ASSERT(hdr->b_datacnt > 0); + while (hdr->b_buf) { + arc_buf_t *buf = hdr->b_buf; if (!mutex_tryenter(&buf->b_evict_lock)) { missed += 1; break; } if (buf->b_data) { - bytes_evicted += ab->b_size; - if (recycle && ab->b_type == type && - ab->b_size == bytes && - !HDR_L2_WRITING(ab)) { + bytes_evicted += hdr->b_size; + if (recycle && hdr->b_type == type && + hdr->b_size == bytes && + !HDR_L2_WRITING(hdr)) { stolen = buf->b_data; recycle = FALSE; } @@ -1869,7 +1853,7 @@ arc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle, mutex_enter(&arc_eviction_mtx); arc_buf_destroy(buf, buf->b_data == stolen, FALSE); - ab->b_buf = buf->b_next; + hdr->b_buf = buf->b_next; buf->b_hdr = &arc_eviction_hdr; buf->b_next = arc_eviction_list; arc_eviction_list = buf; @@ -1882,26 +1866,26 @@ arc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle, } } - if (ab->b_l2hdr) { + if (hdr->b_l2hdr) { ARCSTAT_INCR(arcstat_evict_l2_cached, - ab->b_size); + hdr->b_size); } else { - if 
(l2arc_write_eligible(ab->b_spa, ab)) { + if (l2arc_write_eligible(hdr->b_spa, hdr)) { ARCSTAT_INCR(arcstat_evict_l2_eligible, - ab->b_size); + hdr->b_size); } else { ARCSTAT_INCR( arcstat_evict_l2_ineligible, - ab->b_size); + hdr->b_size); } } - if (ab->b_datacnt == 0) { - arc_change_state(evicted_state, ab, hash_lock); - ASSERT(HDR_IN_HASH_TABLE(ab)); - ab->b_flags |= ARC_IN_HASH_TABLE; - ab->b_flags &= ~ARC_BUF_AVAILABLE; - DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, ab); + if (hdr->b_datacnt == 0) { + arc_change_state(evicted_state, hdr, hash_lock); + ASSERT(HDR_IN_HASH_TABLE(hdr)); + hdr->b_flags |= ARC_FLAG_IN_HASH_TABLE; + hdr->b_flags &= ~ARC_FLAG_BUF_AVAILABLE; + DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, hdr); } if (!have_lock) mutex_exit(hash_lock); @@ -1950,7 +1934,7 @@ arc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle, static void arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes) { - arc_buf_hdr_t *ab, *ab_prev; + arc_buf_hdr_t *hdr, *hdr_prev; arc_buf_hdr_t marker; list_t *list = &state->arcs_list[ARC_BUFC_DATA]; kmutex_t *hash_lock; @@ -1962,18 +1946,18 @@ arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes) bzero(&marker, sizeof (marker)); top: mutex_enter(&state->arcs_mtx); - for (ab = list_tail(list); ab; ab = ab_prev) { - ab_prev = list_prev(list, ab); - if (ab->b_type > ARC_BUFC_NUMTYPES) - panic("invalid ab=%p", (void *)ab); - if (spa && ab->b_spa != spa) + for (hdr = list_tail(list); hdr; hdr = hdr_prev) { + hdr_prev = list_prev(list, hdr); + if (hdr->b_type > ARC_BUFC_NUMTYPES) + panic("invalid hdr=%p", (void *)hdr); + if (spa && hdr->b_spa != spa) continue; /* ignore markers */ - if (ab->b_spa == 0) + if (hdr->b_spa == 0) continue; - hash_lock = HDR_LOCK(ab); + hash_lock = HDR_LOCK(hdr); /* caller may be trying to modify this buffer, skip it */ if (MUTEX_HELD(hash_lock)) continue; @@ -1985,35 +1969,35 @@ arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes) * before reacquiring 
the lock. */ if (count++ > arc_evict_iterations) { - list_insert_after(list, ab, &marker); + list_insert_after(list, hdr, &marker); mutex_exit(&state->arcs_mtx); kpreempt(KPREEMPT_SYNC); mutex_enter(&state->arcs_mtx); - ab_prev = list_prev(list, &marker); + hdr_prev = list_prev(list, &marker); list_remove(list, &marker); count = 0; continue; } if (mutex_tryenter(hash_lock)) { - ASSERT(!HDR_IO_IN_PROGRESS(ab)); - ASSERT(ab->b_buf == NULL); + ASSERT(!HDR_IO_IN_PROGRESS(hdr)); + ASSERT(hdr->b_buf == NULL); ARCSTAT_BUMP(arcstat_deleted); - bytes_deleted += ab->b_size; + bytes_deleted += hdr->b_size; - if (ab->b_l2hdr != NULL) { + if (hdr->b_l2hdr != NULL) { /* * This buffer is cached on the 2nd Level ARC; * don't destroy the header. */ - arc_change_state(arc_l2c_only, ab, hash_lock); + arc_change_state(arc_l2c_only, hdr, hash_lock); mutex_exit(hash_lock); } else { - arc_change_state(arc_anon, ab, hash_lock); + arc_change_state(arc_anon, hdr, hash_lock); mutex_exit(hash_lock); - arc_hdr_destroy(ab); + arc_hdr_destroy(hdr); } - DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, ab); + DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, hdr); if (bytes >= 0 && bytes_deleted >= bytes) break; } else if (bytes < 0) { @@ -2022,12 +2006,12 @@ arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes) * hash lock to become available. Once its * available, restart from where we left off. */ - list_insert_after(list, ab, &marker); + list_insert_after(list, hdr, &marker); mutex_exit(&state->arcs_mtx); mutex_enter(hash_lock); mutex_exit(hash_lock); mutex_enter(&state->arcs_mtx); - ab_prev = list_prev(list, &marker); + hdr_prev = list_prev(list, &marker); list_remove(list, &marker); } else { bufs_skipped += 1; @@ -2734,7 +2718,8 @@ arc_get_data_buf(arc_buf_t *buf) * will end up on the mru list; so steal space from there. */ if (state == arc_mfu_ghost) - state = buf->b_hdr->b_flags & ARC_PREFETCH ? arc_mru : arc_mfu; + state = buf->b_hdr->b_flags & ARC_FLAG_PREFETCH ? 
+ arc_mru : arc_mfu; else if (state == arc_mru_ghost) state = arc_mru; @@ -2821,25 +2806,25 @@ arc_get_data_buf(arc_buf_t *buf) * NOTE: the hash lock is dropped in this function. */ static void -arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock) +arc_access(arc_buf_hdr_t *hdr, kmutex_t *hash_lock) { clock_t now; ASSERT(MUTEX_HELD(hash_lock)); - if (buf->b_state == arc_anon) { + if (hdr->b_state == arc_anon) { /* * This buffer is not in the cache, and does not * appear in our "ghost" list. Add the new buffer * to the MRU state. */ - ASSERT(buf->b_arc_access == 0); - buf->b_arc_access = ddi_get_lbolt(); - DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, buf); - arc_change_state(arc_mru, buf, hash_lock); + ASSERT(hdr->b_arc_access == 0); + hdr->b_arc_access = ddi_get_lbolt(); + DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr); + arc_change_state(arc_mru, hdr, hash_lock); - } else if (buf->b_state == arc_mru) { + } else if (hdr->b_state == arc_mru) { now = ddi_get_lbolt(); /* @@ -2850,15 +2835,15 @@ arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock) * - move the buffer to the head of the list if this is * another prefetch (to make it less likely to be evicted). */ - if ((buf->b_flags & ARC_PREFETCH) != 0) { - if (refcount_count(&buf->b_refcnt) == 0) { - ASSERT(list_link_active(&buf->b_arc_node)); + if ((hdr->b_flags & ARC_FLAG_PREFETCH) != 0) { + if (refcount_count(&hdr->b_refcnt) == 0) { + ASSERT(list_link_active(&hdr->b_arc_node)); } else { - buf->b_flags &= ~ARC_PREFETCH; - atomic_inc_32(&buf->b_mru_hits); + hdr->b_flags &= ~ARC_FLAG_PREFETCH; + atomic_inc_32(&hdr->b_mru_hits); ARCSTAT_BUMP(arcstat_mru_hits); } - buf->b_arc_access = now; + hdr->b_arc_access = now; return; } @@ -2867,19 +2852,19 @@ arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock) * but it is still in the cache. Move it to the MFU * state. 
*/ - if (ddi_time_after(now, buf->b_arc_access + ARC_MINTIME)) { + if (ddi_time_after(now, hdr->b_arc_access + ARC_MINTIME)) { /* * More than 125ms have passed since we * instantiated this buffer. Move it to the * most frequently used state. */ - buf->b_arc_access = now; - DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf); - arc_change_state(arc_mfu, buf, hash_lock); + hdr->b_arc_access = now; + DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr); + arc_change_state(arc_mfu, hdr, hash_lock); } - atomic_inc_32(&buf->b_mru_hits); + atomic_inc_32(&hdr->b_mru_hits); ARCSTAT_BUMP(arcstat_mru_hits); - } else if (buf->b_state == arc_mru_ghost) { + } else if (hdr->b_state == arc_mru_ghost) { arc_state_t *new_state; /* * This buffer has been "accessed" recently, but @@ -2887,22 +2872,22 @@ arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock) * MFU state. */ - if (buf->b_flags & ARC_PREFETCH) { + if (hdr->b_flags & ARC_FLAG_PREFETCH) { new_state = arc_mru; - if (refcount_count(&buf->b_refcnt) > 0) - buf->b_flags &= ~ARC_PREFETCH; - DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, buf); + if (refcount_count(&hdr->b_refcnt) > 0) + hdr->b_flags &= ~ARC_FLAG_PREFETCH; + DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr); } else { new_state = arc_mfu; - DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf); + DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr); } - buf->b_arc_access = ddi_get_lbolt(); - arc_change_state(new_state, buf, hash_lock); + hdr->b_arc_access = ddi_get_lbolt(); + arc_change_state(new_state, hdr, hash_lock); - atomic_inc_32(&buf->b_mru_ghost_hits); + atomic_inc_32(&hdr->b_mru_ghost_hits); ARCSTAT_BUMP(arcstat_mru_ghost_hits); - } else if (buf->b_state == arc_mfu) { + } else if (hdr->b_state == arc_mfu) { /* * This buffer has been accessed more than once and is * still in the cache. Keep it in the MFU state. 
@@ -2912,14 +2897,14 @@ arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock) * If it was a prefetch, we will explicitly move it to * the head of the list now. */ - if ((buf->b_flags & ARC_PREFETCH) != 0) { - ASSERT(refcount_count(&buf->b_refcnt) == 0); - ASSERT(list_link_active(&buf->b_arc_node)); + if ((hdr->b_flags & ARC_FLAG_PREFETCH) != 0) { + ASSERT(refcount_count(&hdr->b_refcnt) == 0); + ASSERT(list_link_active(&hdr->b_arc_node)); } - atomic_inc_32(&buf->b_mfu_hits); + atomic_inc_32(&hdr->b_mfu_hits); ARCSTAT_BUMP(arcstat_mfu_hits); - buf->b_arc_access = ddi_get_lbolt(); - } else if (buf->b_state == arc_mfu_ghost) { + hdr->b_arc_access = ddi_get_lbolt(); + } else if (hdr->b_state == arc_mfu_ghost) { arc_state_t *new_state = arc_mfu; /* * This buffer has been accessed more than once but has @@ -2927,31 +2912,31 @@ arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock) * MFU state. */ - if (buf->b_flags & ARC_PREFETCH) { + if (hdr->b_flags & ARC_FLAG_PREFETCH) { /* * This is a prefetch access... * move this block back to the MRU state. */ - ASSERT0(refcount_count(&buf->b_refcnt)); + ASSERT0(refcount_count(&hdr->b_refcnt)); new_state = arc_mru; } - buf->b_arc_access = ddi_get_lbolt(); - DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf); - arc_change_state(new_state, buf, hash_lock); + hdr->b_arc_access = ddi_get_lbolt(); + DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr); + arc_change_state(new_state, hdr, hash_lock); - atomic_inc_32(&buf->b_mfu_ghost_hits); + atomic_inc_32(&hdr->b_mfu_ghost_hits); ARCSTAT_BUMP(arcstat_mfu_ghost_hits); - } else if (buf->b_state == arc_l2c_only) { + } else if (hdr->b_state == arc_l2c_only) { /* * This buffer is on the 2nd Level ARC. 
*/ - buf->b_arc_access = ddi_get_lbolt(); - DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf); - arc_change_state(arc_mfu, buf, hash_lock); + hdr->b_arc_access = ddi_get_lbolt(); + DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr); + arc_change_state(arc_mfu, hdr, hash_lock); } else { - cmn_err(CE_PANIC, "invalid arc state 0x%p", buf->b_state); + cmn_err(CE_PANIC, "invalid arc state 0x%p", hdr->b_state); } } @@ -3019,9 +3004,9 @@ arc_read_done(zio_t *zio) (found == hdr && HDR_L2_READING(hdr))); } - hdr->b_flags &= ~ARC_L2_EVICTED; - if (l2arc_noprefetch && (hdr->b_flags & ARC_PREFETCH)) - hdr->b_flags &= ~ARC_L2CACHE; + hdr->b_flags &= ~ARC_FLAG_L2_EVICTED; + if (l2arc_noprefetch && (hdr->b_flags & ARC_FLAG_PREFETCH)) + hdr->b_flags &= ~ARC_FLAG_L2CACHE; /* byteswap if necessary */ callback_list = hdr->b_acb; @@ -3061,18 +3046,18 @@ arc_read_done(zio_t *zio) } } hdr->b_acb = NULL; - hdr->b_flags &= ~ARC_IO_IN_PROGRESS; + hdr->b_flags &= ~ARC_FLAG_IO_IN_PROGRESS; ASSERT(!HDR_BUF_AVAILABLE(hdr)); if (abuf == buf) { ASSERT(buf->b_efunc == NULL); ASSERT(hdr->b_datacnt == 1); - hdr->b_flags |= ARC_BUF_AVAILABLE; + hdr->b_flags |= ARC_FLAG_BUF_AVAILABLE; } ASSERT(refcount_is_zero(&hdr->b_refcnt) || callback_list != NULL); if (zio->io_error != 0) { - hdr->b_flags |= ARC_IO_ERROR; + hdr->b_flags |= ARC_FLAG_IO_ERROR; if (hdr->b_state != arc_anon) arc_change_state(arc_anon, hdr, hash_lock); if (HDR_IN_HASH_TABLE(hdr)) @@ -3138,8 +3123,8 @@ arc_read_done(zio_t *zio) */ int arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done, - void *private, zio_priority_t priority, int zio_flags, uint32_t *arc_flags, - const zbookmark_phys_t *zb) + void *private, zio_priority_t priority, int zio_flags, + arc_flags_t *arc_flags, const zbookmark_phys_t *zb) { arc_buf_hdr_t *hdr = NULL; arc_buf_t *buf = NULL; @@ -3162,16 +3147,16 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done, if (hdr != NULL && hdr->b_datacnt > 0) { - *arc_flags |= 
ARC_CACHED; + *arc_flags |= ARC_FLAG_CACHED; if (HDR_IO_IN_PROGRESS(hdr)) { - if (*arc_flags & ARC_WAIT) { + if (*arc_flags & ARC_FLAG_WAIT) { cv_wait(&hdr->b_cv, hash_lock); mutex_exit(hash_lock); goto top; } - ASSERT(*arc_flags & ARC_NOWAIT); + ASSERT(*arc_flags & ARC_FLAG_NOWAIT); if (done) { arc_callback_t *acb = NULL; @@ -3209,24 +3194,24 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done, ASSERT(buf->b_data); if (HDR_BUF_AVAILABLE(hdr)) { ASSERT(buf->b_efunc == NULL); - hdr->b_flags &= ~ARC_BUF_AVAILABLE; + hdr->b_flags &= ~ARC_FLAG_BUF_AVAILABLE; } else { buf = arc_buf_clone(buf); } - } else if (*arc_flags & ARC_PREFETCH && + } else if (*arc_flags & ARC_FLAG_PREFETCH && refcount_count(&hdr->b_refcnt) == 0) { - hdr->b_flags |= ARC_PREFETCH; + hdr->b_flags |= ARC_FLAG_PREFETCH; } DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr); arc_access(hdr, hash_lock); - if (*arc_flags & ARC_L2CACHE) - hdr->b_flags |= ARC_L2CACHE; - if (*arc_flags & ARC_L2COMPRESS) - hdr->b_flags |= ARC_L2COMPRESS; + if (*arc_flags & ARC_FLAG_L2CACHE) + hdr->b_flags |= ARC_FLAG_L2CACHE; + if (*arc_flags & ARC_FLAG_L2COMPRESS) + hdr->b_flags |= ARC_FLAG_L2COMPRESS; mutex_exit(hash_lock); ARCSTAT_BUMP(arcstat_hits); - ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH), + ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_FLAG_PREFETCH), demand, prefetch, hdr->b_type != ARC_BUFC_METADATA, data, metadata, hits); @@ -3276,18 +3261,19 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done, (void) arc_buf_remove_ref(buf, private); goto top; /* restart the IO request */ } + /* if this is a prefetch, we don't have a reference */ - if (*arc_flags & ARC_PREFETCH) { + if (*arc_flags & ARC_FLAG_PREFETCH) { (void) remove_reference(hdr, hash_lock, private); - hdr->b_flags |= ARC_PREFETCH; + hdr->b_flags |= ARC_FLAG_PREFETCH; } - if (*arc_flags & ARC_L2CACHE) - hdr->b_flags |= ARC_L2CACHE; - if (*arc_flags & ARC_L2COMPRESS) - hdr->b_flags |= ARC_L2COMPRESS; + if (*arc_flags 
& ARC_FLAG_L2CACHE) + hdr->b_flags |= ARC_FLAG_L2CACHE; + if (*arc_flags & ARC_FLAG_L2COMPRESS) + hdr->b_flags |= ARC_FLAG_L2COMPRESS; if (BP_GET_LEVEL(bp) > 0) - hdr->b_flags |= ARC_INDIRECT; + hdr->b_flags |= ARC_FLAG_INDIRECT; } else { /* this block is in the ghost cache */ ASSERT(GHOST_STATE(hdr->b_state)); @@ -3296,14 +3282,14 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done, ASSERT(hdr->b_buf == NULL); /* if this is a prefetch, we don't have a reference */ - if (*arc_flags & ARC_PREFETCH) - hdr->b_flags |= ARC_PREFETCH; + if (*arc_flags & ARC_FLAG_PREFETCH) + hdr->b_flags |= ARC_FLAG_PREFETCH; else add_reference(hdr, hash_lock, private); - if (*arc_flags & ARC_L2CACHE) - hdr->b_flags |= ARC_L2CACHE; - if (*arc_flags & ARC_L2COMPRESS) - hdr->b_flags |= ARC_L2COMPRESS; + if (*arc_flags & ARC_FLAG_L2CACHE) + hdr->b_flags |= ARC_FLAG_L2CACHE; + if (*arc_flags & ARC_FLAG_L2COMPRESS) + hdr->b_flags |= ARC_FLAG_L2COMPRESS; buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE); buf->b_hdr = hdr; buf->b_data = NULL; @@ -3325,7 +3311,7 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done, ASSERT(hdr->b_acb == NULL); hdr->b_acb = acb; - hdr->b_flags |= ARC_IO_IN_PROGRESS; + hdr->b_flags |= ARC_FLAG_IO_IN_PROGRESS; if (hdr->b_l2hdr != NULL && (vd = hdr->b_l2hdr->b_dev->l2ad_vdev) != NULL) { @@ -3352,7 +3338,7 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done, DTRACE_PROBE4(arc__miss, arc_buf_hdr_t *, hdr, blkptr_t *, bp, uint64_t, size, zbookmark_phys_t *, zb); ARCSTAT_BUMP(arcstat_misses); - ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH), + ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_FLAG_PREFETCH), demand, prefetch, hdr->b_type != ARC_BUFC_METADATA, data, metadata, misses); @@ -3415,12 +3401,12 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done, zio_t *, rzio); ARCSTAT_INCR(arcstat_l2_read_bytes, b_asize); - if (*arc_flags & ARC_NOWAIT) { + if (*arc_flags & ARC_FLAG_NOWAIT) 
{ zio_nowait(rzio); goto out; } - ASSERT(*arc_flags & ARC_WAIT); + ASSERT(*arc_flags & ARC_FLAG_WAIT); if (zio_wait(rzio) == 0) goto out; @@ -3446,12 +3432,12 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done, rzio = zio_read(pio, spa, bp, buf->b_data, size, arc_read_done, buf, priority, zio_flags, zb); - if (*arc_flags & ARC_WAIT) { + if (*arc_flags & ARC_FLAG_WAIT) { rc = zio_wait(rzio); goto out; } - ASSERT(*arc_flags & ARC_NOWAIT); + ASSERT(*arc_flags & ARC_FLAG_NOWAIT); zio_nowait(rzio); } @@ -3522,7 +3508,7 @@ arc_freed(spa_t *spa, const blkptr_t *bp) if (HDR_BUF_AVAILABLE(hdr)) { arc_buf_t *buf = hdr->b_buf; add_reference(hdr, hash_lock, FTAG); - hdr->b_flags &= ~ARC_BUF_AVAILABLE; + hdr->b_flags &= ~ARC_FLAG_BUF_AVAILABLE; mutex_exit(hash_lock); arc_release(buf, FTAG); @@ -3589,7 +3575,7 @@ arc_clear_callback(arc_buf_t *buf) arc_buf_destroy(buf, FALSE, TRUE); } else { ASSERT(buf == hdr->b_buf); - hdr->b_flags |= ARC_BUF_AVAILABLE; + hdr->b_flags |= ARC_FLAG_BUF_AVAILABLE; mutex_exit(&buf->b_evict_lock); } @@ -3700,7 +3686,7 @@ arc_release(arc_buf_t *buf, void *tag) nhdr->b_mfu_hits = 0; nhdr->b_mfu_ghost_hits = 0; nhdr->b_l2_hits = 0; - nhdr->b_flags = flags & ARC_L2_WRITING; + nhdr->b_flags = flags & ARC_FLAG_L2_WRITING; nhdr->b_l2hdr = NULL; nhdr->b_datacnt = 1; nhdr->b_freeze_cksum = NULL; @@ -3790,7 +3776,7 @@ arc_write_ready(zio_t *zio) mutex_exit(&hdr->b_freeze_lock); } arc_cksum_compute(buf, B_FALSE); - hdr->b_flags |= ARC_IO_IN_PROGRESS; + hdr->b_flags |= ARC_FLAG_IO_IN_PROGRESS; } /* @@ -3871,13 +3857,13 @@ arc_write_done(zio_t *zio) ASSERT(BP_GET_LEVEL(zio->io_bp) == 0); } } - hdr->b_flags &= ~ARC_IO_IN_PROGRESS; + hdr->b_flags &= ~ARC_FLAG_IO_IN_PROGRESS; /* if it's not anon, we are doing a scrub */ if (!exists && hdr->b_state == arc_anon) arc_access(hdr, hash_lock); mutex_exit(hash_lock); } else { - hdr->b_flags &= ~ARC_IO_IN_PROGRESS; + hdr->b_flags &= ~ARC_FLAG_IO_IN_PROGRESS; } 
ASSERT(!refcount_is_zero(&hdr->b_refcnt)); @@ -3900,12 +3886,12 @@ arc_write(zio_t *pio, spa_t *spa, uint64_t txg, ASSERT(ready != NULL); ASSERT(done != NULL); ASSERT(!HDR_IO_ERROR(hdr)); - ASSERT((hdr->b_flags & ARC_IO_IN_PROGRESS) == 0); + ASSERT((hdr->b_flags & ARC_FLAG_IO_IN_PROGRESS) == 0); ASSERT(hdr->b_acb == NULL); if (l2arc) - hdr->b_flags |= ARC_L2CACHE; + hdr->b_flags |= ARC_FLAG_L2CACHE; if (l2arc_compress) - hdr->b_flags |= ARC_L2COMPRESS; + hdr->b_flags |= ARC_FLAG_L2COMPRESS; callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP); callback->awcb_ready = ready; callback->awcb_physdone = physdone; @@ -4395,7 +4381,7 @@ arc_fini(void) */ static boolean_t -l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *ab) +l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *hdr) { /* * A buffer is *not* eligible for the L2ARC if it: @@ -4404,8 +4390,8 @@ l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *ab) * 3. has an I/O in progress (it may be an incomplete read). * 4. is flagged not eligible (zfs property). */ - if (ab->b_spa != spa_guid || ab->b_l2hdr != NULL || - HDR_IO_IN_PROGRESS(ab) || !HDR_L2CACHE(ab)) + if (hdr->b_spa != spa_guid || hdr->b_l2hdr != NULL || + HDR_IO_IN_PROGRESS(hdr) || !HDR_L2CACHE(hdr)) return (B_FALSE); return (B_TRUE); @@ -4566,7 +4552,7 @@ l2arc_write_done(zio_t *zio) l2arc_write_callback_t *cb; l2arc_dev_t *dev; list_t *buflist; - arc_buf_hdr_t *head, *ab, *ab_prev; + arc_buf_hdr_t *head, *hdr, *hdr_prev; l2arc_buf_hdr_t *abl2; kmutex_t *hash_lock; int64_t bytes_dropped = 0; @@ -4590,17 +4576,17 @@ l2arc_write_done(zio_t *zio) /* * All writes completed, or an error was hit. */ - for (ab = list_prev(buflist, head); ab; ab = ab_prev) { - ab_prev = list_prev(buflist, ab); - abl2 = ab->b_l2hdr; + for (hdr = list_prev(buflist, head); hdr; hdr = hdr_prev) { + hdr_prev = list_prev(buflist, hdr); + abl2 = hdr->b_l2hdr; /* * Release the temporary compressed buffer as soon as possible. 
*/ if (abl2->b_compress != ZIO_COMPRESS_OFF) - l2arc_release_cdata_buf(ab); + l2arc_release_cdata_buf(hdr); - hash_lock = HDR_LOCK(ab); + hash_lock = HDR_LOCK(hdr); if (!mutex_tryenter(hash_lock)) { /* * This buffer misses out. It may be in a stage @@ -4615,19 +4601,19 @@ l2arc_write_done(zio_t *zio) /* * Error - drop L2ARC entry. */ - list_remove(buflist, ab); + list_remove(buflist, hdr); ARCSTAT_INCR(arcstat_l2_asize, -abl2->b_asize); bytes_dropped += abl2->b_asize; - ab->b_l2hdr = NULL; + hdr->b_l2hdr = NULL; kmem_free(abl2, sizeof (l2arc_buf_hdr_t)); arc_space_return(L2HDR_SIZE, ARC_SPACE_L2HDRS); - ARCSTAT_INCR(arcstat_l2_size, -ab->b_size); + ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size); } /* * Allow ARC to begin reads to this L2ARC entry. */ - ab->b_flags &= ~ARC_L2_WRITING; + hdr->b_flags &= ~ARC_FLAG_L2_WRITING; mutex_exit(hash_lock); } @@ -4774,7 +4760,7 @@ l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all) { list_t *buflist; l2arc_buf_hdr_t *abl2; - arc_buf_hdr_t *ab, *ab_prev; + arc_buf_hdr_t *hdr, *hdr_prev; kmutex_t *hash_lock; uint64_t taddr; int64_t bytes_evicted = 0; @@ -4806,10 +4792,10 @@ l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all) top: mutex_enter(&l2arc_buflist_mtx); - for (ab = list_tail(buflist); ab; ab = ab_prev) { - ab_prev = list_prev(buflist, ab); + for (hdr = list_tail(buflist); hdr; hdr = hdr_prev) { + hdr_prev = list_prev(buflist, hdr); - hash_lock = HDR_LOCK(ab); + hash_lock = HDR_LOCK(hdr); if (!mutex_tryenter(hash_lock)) { /* * Missed the hash lock. Retry. @@ -4821,19 +4807,19 @@ l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all) goto top; } - if (HDR_L2_WRITE_HEAD(ab)) { + if (HDR_L2_WRITE_HEAD(hdr)) { /* * We hit a write head node. Leave it for * l2arc_write_done(). 
*/ - list_remove(buflist, ab); + list_remove(buflist, hdr); mutex_exit(hash_lock); continue; } - if (!all && ab->b_l2hdr != NULL && - (ab->b_l2hdr->b_daddr > taddr || - ab->b_l2hdr->b_daddr < dev->l2ad_hand)) { + if (!all && hdr->b_l2hdr != NULL && + (hdr->b_l2hdr->b_daddr > taddr || + hdr->b_l2hdr->b_daddr < dev->l2ad_hand)) { /* * We've evicted to the target address, * or the end of the device. @@ -4842,7 +4828,7 @@ l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all) break; } - if (HDR_FREE_IN_PROGRESS(ab)) { + if (HDR_FREE_IN_PROGRESS(hdr)) { /* * Already on the path to destruction. */ @@ -4850,45 +4836,45 @@ l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all) continue; } - if (ab->b_state == arc_l2c_only) { - ASSERT(!HDR_L2_READING(ab)); + if (hdr->b_state == arc_l2c_only) { + ASSERT(!HDR_L2_READING(hdr)); /* * This doesn't exist in the ARC. Destroy. * arc_hdr_destroy() will call list_remove() * and decrement arcstat_l2_size. */ - arc_change_state(arc_anon, ab, hash_lock); - arc_hdr_destroy(ab); + arc_change_state(arc_anon, hdr, hash_lock); + arc_hdr_destroy(hdr); } else { /* * Invalidate issued or about to be issued * reads, since we may be about to write * over this location. */ - if (HDR_L2_READING(ab)) { + if (HDR_L2_READING(hdr)) { ARCSTAT_BUMP(arcstat_l2_evict_reading); - ab->b_flags |= ARC_L2_EVICTED; + hdr->b_flags |= ARC_FLAG_L2_EVICTED; } /* * Tell ARC this no longer exists in L2ARC. */ - if (ab->b_l2hdr != NULL) { - abl2 = ab->b_l2hdr; + if (hdr->b_l2hdr != NULL) { + abl2 = hdr->b_l2hdr; ARCSTAT_INCR(arcstat_l2_asize, -abl2->b_asize); bytes_evicted += abl2->b_asize; - ab->b_l2hdr = NULL; + hdr->b_l2hdr = NULL; kmem_free(abl2, sizeof (l2arc_buf_hdr_t)); arc_space_return(L2HDR_SIZE, ARC_SPACE_L2HDRS); - ARCSTAT_INCR(arcstat_l2_size, -ab->b_size); + ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size); } - list_remove(buflist, ab); + list_remove(buflist, hdr); /* * This may have been leftover after a * failed write. 
*/ - ab->b_flags &= ~ARC_L2_WRITING; + hdr->b_flags &= ~ARC_FLAG_L2_WRITING; } mutex_exit(hash_lock); } @@ -4901,7 +4887,7 @@ l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all) /* * Find and write ARC buffers to the L2ARC device. * - * An ARC_L2_WRITING flag is set so that the L2ARC buffers are not valid + * An ARC_FLAG_L2_WRITING flag is set so that the L2ARC buffers are not valid * for reading until they have completed writing. * The headroom_boost is an in-out parameter used to maintain headroom boost * state between calls to this function. @@ -4913,7 +4899,7 @@ static uint64_t l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz, boolean_t *headroom_boost) { - arc_buf_hdr_t *ab, *ab_prev, *head; + arc_buf_hdr_t *hdr, *hdr_prev, *head; list_t *list; uint64_t write_asize, write_psize, write_sz, headroom, buf_compress_minsz; @@ -4935,7 +4921,7 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz, write_sz = write_asize = write_psize = 0; full = B_FALSE; head = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE); - head->b_flags |= ARC_L2_WRITE_HEAD; + head->b_flags |= ARC_FLAG_L2_WRITE_HEAD; /* * We will want to try to compress buffers that are at least 2x the @@ -4959,25 +4945,25 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz, * head of the ARC lists rather than the tail. 
*/ if (arc_warm == B_FALSE) - ab = list_head(list); + hdr = list_head(list); else - ab = list_tail(list); + hdr = list_tail(list); headroom = target_sz * l2arc_headroom; if (do_headroom_boost) headroom = (headroom * l2arc_headroom_boost) / 100; - for (; ab; ab = ab_prev) { + for (; hdr; hdr = hdr_prev) { l2arc_buf_hdr_t *l2hdr; kmutex_t *hash_lock; uint64_t buf_sz; if (arc_warm == B_FALSE) - ab_prev = list_next(list, ab); + hdr_prev = list_next(list, hdr); else - ab_prev = list_prev(list, ab); + hdr_prev = list_prev(list, hdr); - hash_lock = HDR_LOCK(ab); + hash_lock = HDR_LOCK(hdr); if (!mutex_tryenter(hash_lock)) { /* * Skip this buffer rather than waiting. @@ -4985,7 +4971,7 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz, continue; } - passed_sz += ab->b_size; + passed_sz += hdr->b_size; if (passed_sz > headroom) { /* * Searched too far. @@ -4994,12 +4980,12 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz, break; } - if (!l2arc_write_eligible(guid, ab)) { + if (!l2arc_write_eligible(guid, hdr)) { mutex_exit(hash_lock); continue; } - if ((write_sz + ab->b_size) > target_sz) { + if ((write_sz + hdr->b_size) > target_sz) { full = B_TRUE; mutex_exit(hash_lock); break; @@ -5029,32 +5015,32 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz, l2hdr->b_dev = dev; arc_space_consume(L2HDR_SIZE, ARC_SPACE_L2HDRS); - ab->b_flags |= ARC_L2_WRITING; + hdr->b_flags |= ARC_FLAG_L2_WRITING; /* * Temporarily stash the data buffer in b_tmp_cdata. * The subsequent write step will pick it up from - * there. This is because can't access ab->b_buf + * there. 
This is because can't access hdr->b_buf * without holding the hash_lock, which we in turn * can't access without holding the ARC list locks * (which we want to avoid during compression/writing) */ l2hdr->b_compress = ZIO_COMPRESS_OFF; - l2hdr->b_asize = ab->b_size; - l2hdr->b_tmp_cdata = ab->b_buf->b_data; + l2hdr->b_asize = hdr->b_size; + l2hdr->b_tmp_cdata = hdr->b_buf->b_data; l2hdr->b_hits = 0; - buf_sz = ab->b_size; - ab->b_l2hdr = l2hdr; + buf_sz = hdr->b_size; + hdr->b_l2hdr = l2hdr; - list_insert_head(dev->l2ad_buflist, ab); + list_insert_head(dev->l2ad_buflist, hdr); /* * Compute and store the buffer cksum before * writing. On debug the cksum is verified first. */ - arc_cksum_verify(ab->b_buf); - arc_cksum_compute(ab->b_buf, B_TRUE); + arc_cksum_verify(hdr->b_buf); + arc_cksum_compute(hdr->b_buf, B_TRUE); mutex_exit(hash_lock); @@ -5080,21 +5066,22 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz, * and work backwards, retracing the course of the buffer selector * loop above. */ - for (ab = list_prev(dev->l2ad_buflist, head); ab; - ab = list_prev(dev->l2ad_buflist, ab)) { + for (hdr = list_prev(dev->l2ad_buflist, head); hdr; + hdr = list_prev(dev->l2ad_buflist, hdr)) { l2arc_buf_hdr_t *l2hdr; uint64_t buf_sz; /* * We shouldn't need to lock the buffer here, since we flagged - * it as ARC_L2_WRITING in the previous step, but we must take - * care to only access its L2 cache parameters. In particular, - * ab->b_buf may be invalid by now due to ARC eviction. + * it as ARC_FLAG_L2_WRITING in the previous step, but we must + * take care to only access its L2 cache parameters. In + * particular, hdr->b_buf may be invalid by now due to + * ARC eviction. 
*/ - l2hdr = ab->b_l2hdr; + l2hdr = hdr->b_l2hdr; l2hdr->b_daddr = dev->l2ad_hand; - if (!l2arc_nocompress && (ab->b_flags & ARC_L2COMPRESS) && + if (!l2arc_nocompress && (hdr->b_flags & ARC_FLAG_L2COMPRESS) && l2hdr->b_asize >= buf_compress_minsz) { if (l2arc_compress_buf(l2hdr)) { /* @@ -5298,9 +5285,9 @@ l2arc_decompress_zio(zio_t *zio, arc_buf_hdr_t *hdr, enum zio_compress c) * done, we can dispose of it. */ static void -l2arc_release_cdata_buf(arc_buf_hdr_t *ab) +l2arc_release_cdata_buf(arc_buf_hdr_t *hdr) { - l2arc_buf_hdr_t *l2hdr = ab->b_l2hdr; + l2arc_buf_hdr_t *l2hdr = hdr->b_l2hdr; if (l2hdr->b_compress == ZIO_COMPRESS_LZ4) { /* @@ -5308,7 +5295,7 @@ l2arc_release_cdata_buf(arc_buf_hdr_t *ab) * temporary buffer for it, so now we need to release it. */ ASSERT(l2hdr->b_tmp_cdata != NULL); - zio_data_buf_free(l2hdr->b_tmp_cdata, ab->b_size); + zio_data_buf_free(l2hdr->b_tmp_cdata, hdr->b_size); } l2hdr->b_tmp_cdata = NULL; } diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c index 48e0e347a0d3..4e0f857c5ca7 100644 --- a/module/zfs/dbuf.c +++ b/module/zfs/dbuf.c @@ -653,7 +653,7 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags) { dnode_t *dn; zbookmark_phys_t zb; - uint32_t aflags = ARC_NOWAIT; + uint32_t aflags = ARC_FLAG_NOWAIT; int err; DB_DNODE_ENTER(db); @@ -707,9 +707,9 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags) mutex_exit(&db->db_mtx); if (DBUF_IS_L2CACHEABLE(db)) - aflags |= ARC_L2CACHE; + aflags |= ARC_FLAG_L2CACHE; if (DBUF_IS_L2COMPRESSIBLE(db)) - aflags |= ARC_L2COMPRESS; + aflags |= ARC_FLAG_L2COMPRESS; SET_BOOKMARK(&zb, db->db_objset->os_dsl_dataset ? db->db_objset->os_dsl_dataset->ds_object : DMU_META_OBJSET, @@ -721,7 +721,7 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags) dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ, (*flags & DB_RF_CANFAIL) ? 
ZIO_FLAG_CANFAIL : ZIO_FLAG_MUSTSUCCEED, &aflags, &zb); - if (aflags & ARC_CACHED) + if (aflags & ARC_FLAG_CACHED) *flags |= DB_RF_CACHED; return (SET_ERROR(err)); @@ -2028,7 +2028,8 @@ dbuf_prefetch(dnode_t *dn, uint64_t blkid, zio_priority_t prio) if (dbuf_findbp(dn, 0, blkid, TRUE, &db, &bp, NULL) == 0) { if (bp && !BP_IS_HOLE(bp) && !BP_IS_EMBEDDED(bp)) { dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset; - uint32_t aflags = ARC_NOWAIT | ARC_PREFETCH; + arc_flags_t aflags = + ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH; zbookmark_phys_t zb; SET_BOOKMARK(&zb, ds ? ds->ds_object : DMU_META_OBJSET, diff --git a/module/zfs/dmu_diff.c b/module/zfs/dmu_diff.c index 32e451a77773..91415d0d2dcb 100644 --- a/module/zfs/dmu_diff.c +++ b/module/zfs/dmu_diff.c @@ -129,7 +129,7 @@ diff_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, } else if (zb->zb_level == 0) { dnode_phys_t *blk; arc_buf_t *abuf; - uint32_t aflags = ARC_WAIT; + arc_flags_t aflags = ARC_FLAG_WAIT; int blksz = BP_GET_LSIZE(bp); int i; diff --git a/module/zfs/dmu_objset.c b/module/zfs/dmu_objset.c index 6ef6adbd9e93..bc1aa12867a2 100644 --- a/module/zfs/dmu_objset.c +++ b/module/zfs/dmu_objset.c @@ -306,15 +306,15 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, os->os_spa = spa; os->os_rootbp = bp; if (!BP_IS_HOLE(os->os_rootbp)) { - uint32_t aflags = ARC_WAIT; + arc_flags_t aflags = ARC_FLAG_WAIT; zbookmark_phys_t zb; SET_BOOKMARK(&zb, ds ? 
ds->ds_object : DMU_META_OBJSET, ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID); if (DMU_OS_IS_L2CACHEABLE(os)) - aflags |= ARC_L2CACHE; + aflags |= ARC_FLAG_L2CACHE; if (DMU_OS_IS_L2COMPRESSIBLE(os)) - aflags |= ARC_L2COMPRESS; + aflags |= ARC_FLAG_L2COMPRESS; dprintf_bp(os->os_rootbp, "reading %s", ""); err = arc_read(NULL, spa, os->os_rootbp, diff --git a/module/zfs/dmu_send.c b/module/zfs/dmu_send.c index dddc8094fc5a..8fa6797c4cdb 100644 --- a/module/zfs/dmu_send.c +++ b/module/zfs/dmu_send.c @@ -486,7 +486,7 @@ backup_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, dnode_phys_t *blk; int i; int blksz = BP_GET_LSIZE(bp); - uint32_t aflags = ARC_WAIT; + arc_flags_t aflags = ARC_FLAG_WAIT; arc_buf_t *abuf; if (arc_read(NULL, spa, bp, arc_getbuf_func, &abuf, @@ -504,7 +504,7 @@ backup_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, } (void) arc_buf_remove_ref(abuf, &abuf); } else if (type == DMU_OT_SA) { - uint32_t aflags = ARC_WAIT; + arc_flags_t aflags = ARC_FLAG_WAIT; arc_buf_t *abuf; int blksz = BP_GET_LSIZE(bp); @@ -521,8 +521,8 @@ backup_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, err = dump_write_embedded(dsp, zb->zb_object, zb->zb_blkid * blksz, blksz, bp); } else { /* it's a level-0 block of a regular object */ - uint32_t aflags = ARC_WAIT; uint64_t offset; + arc_flags_t aflags = ARC_FLAG_WAIT; arc_buf_t *abuf; int blksz = BP_GET_LSIZE(bp); diff --git a/module/zfs/dmu_traverse.c b/module/zfs/dmu_traverse.c index 8673132a0eeb..6c69a2339dee 100644 --- a/module/zfs/dmu_traverse.c +++ b/module/zfs/dmu_traverse.c @@ -177,7 +177,7 @@ static void traverse_prefetch_metadata(traverse_data_t *td, const blkptr_t *bp, const zbookmark_phys_t *zb) { - uint32_t flags = ARC_NOWAIT | ARC_PREFETCH; + arc_flags_t flags = ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH; if (!(td->td_flags & TRAVERSE_PREFETCH_METADATA)) return; @@ -273,7 +273,7 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp, } if (BP_GET_LEVEL(bp) > 0) { - uint32_t flags = ARC_WAIT; + 
uint32_t flags = ARC_FLAG_WAIT; int32_t i; int32_t epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT; zbookmark_phys_t *czb; @@ -307,7 +307,7 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp, kmem_free(czb, sizeof (zbookmark_phys_t)); } else if (BP_GET_TYPE(bp) == DMU_OT_DNODE) { - uint32_t flags = ARC_WAIT; + uint32_t flags = ARC_FLAG_WAIT; int32_t i; int32_t epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT; dnode_phys_t *cdnp; @@ -331,7 +331,7 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp, break; } } else if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) { - uint32_t flags = ARC_WAIT; + arc_flags_t flags = ARC_FLAG_WAIT; objset_phys_t *osp; dnode_phys_t *mdnp, *gdnp, *udnp; @@ -448,7 +448,7 @@ traverse_prefetcher(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg) { prefetch_data_t *pfd = arg; - uint32_t aflags = ARC_NOWAIT | ARC_PREFETCH; + arc_flags_t aflags = ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH; ASSERT(pfd->pd_bytes_fetched >= 0); if (pfd->pd_cancel) @@ -545,7 +545,7 @@ traverse_impl(spa_t *spa, dsl_dataset_t *ds, uint64_t objset, blkptr_t *rootbp, /* See comment on ZIL traversal in dsl_scan_visitds. 
*/ if (ds != NULL && !ds->ds_is_snapshot && !BP_IS_HOLE(rootbp)) { - uint32_t flags = ARC_WAIT; + uint32_t flags = ARC_FLAG_WAIT; objset_phys_t *osp; arc_buf_t *buf; diff --git a/module/zfs/dsl_scan.c b/module/zfs/dsl_scan.c index 6b9b04a0ee19..0489359710b8 100644 --- a/module/zfs/dsl_scan.c +++ b/module/zfs/dsl_scan.c @@ -590,7 +590,7 @@ dsl_scan_prefetch(dsl_scan_t *scn, arc_buf_t *buf, blkptr_t *bp, uint64_t objset, uint64_t object, uint64_t blkid) { zbookmark_phys_t czb; - uint32_t flags = ARC_NOWAIT | ARC_PREFETCH; + arc_flags_t flags = ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH; if (zfs_no_scrub_prefetch) return; @@ -655,7 +655,7 @@ dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype, int err; if (BP_GET_LEVEL(bp) > 0) { - uint32_t flags = ARC_WAIT; + arc_flags_t flags = ARC_FLAG_WAIT; int i; blkptr_t *cbp; int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT; @@ -682,7 +682,7 @@ dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype, } (void) arc_buf_remove_ref(buf, &buf); } else if (BP_GET_TYPE(bp) == DMU_OT_DNODE) { - uint32_t flags = ARC_WAIT; + arc_flags_t flags = ARC_FLAG_WAIT; dnode_phys_t *cdnp; int i, j; int epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT; @@ -708,7 +708,7 @@ dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype, (void) arc_buf_remove_ref(buf, &buf); } else if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) { - uint32_t flags = ARC_WAIT; + arc_flags_t flags = ARC_FLAG_WAIT; objset_phys_t *osp; arc_buf_t *buf; diff --git a/module/zfs/spa_stats.c b/module/zfs/spa_stats.c index 3e39dba2c2e6..2b8559b5d276 100644 --- a/module/zfs/spa_stats.c +++ b/module/zfs/spa_stats.c @@ -200,7 +200,7 @@ spa_read_history_add(spa_t *spa, const zbookmark_phys_t *zb, uint32_t aflags) if (zfs_read_history == 0 && ssh->size == 0) return; - if (zfs_read_history_hits == 0 && (aflags & ARC_CACHED)) + if (zfs_read_history_hits == 0 && (aflags & ARC_FLAG_CACHED)) return; srh = kmem_zalloc(sizeof (spa_read_history_t), KM_SLEEP); 
diff --git a/module/zfs/zil.c b/module/zfs/zil.c index ff4d2cec0a2b..6a3885816cb0 100644 --- a/module/zfs/zil.c +++ b/module/zfs/zil.c @@ -204,7 +204,7 @@ zil_read_log_block(zilog_t *zilog, const blkptr_t *bp, blkptr_t *nbp, void *dst, char **end) { enum zio_flag zio_flags = ZIO_FLAG_CANFAIL; - uint32_t aflags = ARC_WAIT; + arc_flags_t aflags = ARC_FLAG_WAIT; arc_buf_t *abuf = NULL; zbookmark_phys_t zb; int error; @@ -280,7 +280,7 @@ zil_read_log_data(zilog_t *zilog, const lr_write_t *lr, void *wbuf) { enum zio_flag zio_flags = ZIO_FLAG_CANFAIL; const blkptr_t *bp = &lr->lr_blkptr; - uint32_t aflags = ARC_WAIT; + arc_flags_t aflags = ARC_FLAG_WAIT; arc_buf_t *abuf = NULL; zbookmark_phys_t zb; int error; diff --git a/module/zfs/zio.c b/module/zfs/zio.c index 1e5be8bfc838..c378742eda0a 100644 --- a/module/zfs/zio.c +++ b/module/zfs/zio.c @@ -2241,7 +2241,7 @@ zio_ddt_collision(zio_t *zio, ddt_t *ddt, ddt_entry_t *dde) if (ddp->ddp_phys_birth != 0) { arc_buf_t *abuf = NULL; - uint32_t aflags = ARC_WAIT; + arc_flags_t aflags = ARC_FLAG_WAIT; blkptr_t blk = *zio->io_bp; int error; From b9541d6b7d765883f8a5fe7c1bde74df5c256ff6 Mon Sep 17 00:00:00 2001 From: Chris Williamson Date: Mon, 29 Dec 2014 19:12:23 -0800 Subject: [PATCH 06/11] Illumos 5408 - managing ZFS cache devices requires lots of RAM 5408 managing ZFS cache devices requires lots of RAM Reviewed by: Christopher Siden Reviewed by: George Wilson Reviewed by: Matthew Ahrens Reviewed by: Don Brady Reviewed by: Josef 'Jeff' Sipek Approved by: Garrett D'Amore Porting notes: Due to the restructuring of the ARC-related structures, this patch conflicts with at least the following existing ZoL commits: 6e1d7276c94cbd7c2e19f9232f6ba4bafa62dbe0 Fix inaccurate arcstat_l2_hdr_size calculations The L2HDR_SIZE constant no longer exists and has been somewhat equivalently replaced by HDR_L2ONLY_SIZE.
e0b0ca983d6897bcddf05af2c0e5d01ff66f90db Add visibility in to cached dbufs The new layering of l{1,2}arc_buf_hdr_t within the arc_buf_hdr struct requires additional structure member names to be used when referencing the inner items. Also, the presence of L1 or L2 inner member is indicated by flags using the new HDR_HAS_L{1,2}HDR macros. Ported by: Tim Chase Signed-off-by: Brian Behlendorf --- cmd/ztest/ztest.c | 9 +- include/sys/arc.h | 27 +- include/sys/arc_impl.h | 95 ++- include/sys/trace_arc.h | 54 +- module/zfs/arc.c | 1395 +++++++++++++++++++++++---------------- 5 files changed, 948 insertions(+), 632 deletions(-) diff --git a/cmd/ztest/ztest.c b/cmd/ztest/ztest.c index f1a8ff61d7ee..642cab5f2f33 100644 --- a/cmd/ztest/ztest.c +++ b/cmd/ztest/ztest.c @@ -4042,7 +4042,7 @@ ztest_dmu_read_write_zcopy(ztest_ds_t *zd, uint64_t id) * assign an arcbuf to a dbuf. */ for (j = 0; j < s; j++) { - if (i != 5) { + if (i != 5 || chunksize < (SPA_MINBLOCKSIZE * 2)) { bigbuf_arcbufs[j] = dmu_request_arcbuf(bonus_db, chunksize); } else { @@ -4066,7 +4066,8 @@ ztest_dmu_read_write_zcopy(ztest_ds_t *zd, uint64_t id) umem_free(packbuf, packsize); umem_free(bigbuf, bigsize); for (j = 0; j < s; j++) { - if (i != 5) { + if (i != 5 || + chunksize < (SPA_MINBLOCKSIZE * 2)) { dmu_return_arcbuf(bigbuf_arcbufs[j]); } else { dmu_return_arcbuf( @@ -4111,7 +4112,7 @@ ztest_dmu_read_write_zcopy(ztest_ds_t *zd, uint64_t id) } for (off = bigoff, j = 0; j < s; j++, off += chunksize) { dmu_buf_t *dbt; - if (i != 5) { + if (i != 5 || chunksize < (SPA_MINBLOCKSIZE * 2)) { bcopy((caddr_t)bigbuf + (off - bigoff), bigbuf_arcbufs[j]->b_data, chunksize); } else { @@ -4128,7 +4129,7 @@ ztest_dmu_read_write_zcopy(ztest_ds_t *zd, uint64_t id) VERIFY(dmu_buf_hold(os, bigobj, off, FTAG, &dbt, DMU_READ_NO_PREFETCH) == 0); } - if (i != 5) { + if (i != 5 || chunksize < (SPA_MINBLOCKSIZE * 2)) { dmu_assign_arcbuf(bonus_db, off, bigbuf_arcbufs[j], tx); } else { diff --git a/include/sys/arc.h 
b/include/sys/arc.h index 25e2f035253d..903f0b413167 100644 --- a/include/sys/arc.h +++ b/include/sys/arc.h @@ -81,10 +81,29 @@ typedef enum arc_flags ARC_FLAG_FREED_IN_READ = 1 << 10, /* freed during read */ ARC_FLAG_BUF_AVAILABLE = 1 << 11, /* block not in use */ ARC_FLAG_INDIRECT = 1 << 12, /* indirect block */ - ARC_FLAG_FREE_IN_PROGRESS = 1 << 13, /* about to be freed */ - ARC_FLAG_L2_WRITING = 1 << 14, /* write in progress */ - ARC_FLAG_L2_EVICTED = 1 << 15, /* evicted during I/O */ - ARC_FLAG_L2_WRITE_HEAD = 1 << 16, /* head of write list */ + ARC_FLAG_L2_WRITING = 1 << 13, /* write in progress */ + ARC_FLAG_L2_EVICTED = 1 << 14, /* evicted during I/O */ + ARC_FLAG_L2_WRITE_HEAD = 1 << 15, /* head of write list */ + /* indicates that the buffer contains metadata (otherwise, data) */ + ARC_FLAG_BUFC_METADATA = 1 << 16, + + /* Flags specifying whether optional hdr struct fields are defined */ + ARC_FLAG_HAS_L1HDR = 1 << 17, + ARC_FLAG_HAS_L2HDR = 1 << 18, + + /* + * The arc buffer's compression mode is stored in the top 7 bits of the + * flags field, so these dummy flags are included so that MDB can + * interpret the enum properly. 
+ */ + ARC_FLAG_COMPRESS_0 = 1 << 24, + ARC_FLAG_COMPRESS_1 = 1 << 25, + ARC_FLAG_COMPRESS_2 = 1 << 26, + ARC_FLAG_COMPRESS_3 = 1 << 27, + ARC_FLAG_COMPRESS_4 = 1 << 28, + ARC_FLAG_COMPRESS_5 = 1 << 29, + ARC_FLAG_COMPRESS_6 = 1 << 30 + } arc_flags_t; struct arc_buf { diff --git a/include/sys/arc_impl.h b/include/sys/arc_impl.h index 1f8351a6784b..556cc258330d 100644 --- a/include/sys/arc_impl.h +++ b/include/sys/arc_impl.h @@ -74,8 +74,6 @@ typedef struct arc_state { arc_state_type_t arcs_state; } arc_state_t; -typedef struct l2arc_buf_hdr l2arc_buf_hdr_t; - typedef struct arc_callback arc_callback_t; struct arc_callback { @@ -96,27 +94,45 @@ struct arc_write_callback { arc_buf_t *awcb_buf; }; -struct arc_buf_hdr { - /* protected by hash lock */ - dva_t b_dva; - uint64_t b_birth; - uint64_t b_cksum0; - +/* + * ARC buffers are separated into multiple structs as a memory saving measure: + * - Common fields struct, always defined, and embedded within it: + * - L2-only fields, always allocated but undefined when not in L2ARC + * - L1-only fields, only allocated when in L1ARC + * + * Buffer in L1 Buffer only in L2 + * +------------------------+ +------------------------+ + * | arc_buf_hdr_t | | arc_buf_hdr_t | + * | | | | + * | | | | + * | | | | + * +------------------------+ +------------------------+ + * | l2arc_buf_hdr_t | | l2arc_buf_hdr_t | + * | (undefined if L1-only) | | | + * +------------------------+ +------------------------+ + * | l1arc_buf_hdr_t | + * | | + * | | + * | | + * | | + * +------------------------+ + * + * Because it's possible for the L2ARC to become extremely large, we can wind + * up eating a lot of memory in L2ARC buffer headers, so the size of a header + * is minimized by only allocating the fields necessary for an L1-cached buffer + * when a header is actually in the L1 cache. The sub-headers (l1arc_buf_hdr and + * l2arc_buf_hdr) are embedded rather than allocated separately to save a couple + * words in pointers. 
arc_hdr_realloc() is used to switch a header between + * these two allocation states. + */ +typedef struct l1arc_buf_hdr { kmutex_t b_freeze_lock; - zio_cksum_t *b_freeze_cksum; - arc_buf_hdr_t *b_hash_next; arc_buf_t *b_buf; - arc_flags_t b_flags; uint32_t b_datacnt; - - arc_callback_t *b_acb; + /* for waiting on writes to complete */ kcondvar_t b_cv; - /* immutable */ - arc_buf_contents_t b_type; - uint64_t b_size; - uint64_t b_spa; /* protected by arc state mutex */ arc_state_t *b_state; @@ -133,9 +149,10 @@ struct arc_buf_hdr { /* self protecting */ refcount_t b_refcnt; - l2arc_buf_hdr_t *b_l2hdr; - list_node_t b_l2node; -}; + arc_callback_t *b_acb; + /* temporary buffer holder for in-flight compressed data */ + void *b_tmp_cdata; +} l1arc_buf_hdr_t; typedef struct l2arc_dev { vdev_t *l2ad_vdev; /* vdev */ @@ -146,15 +163,51 @@ typedef struct l2arc_dev { uint64_t l2ad_evict; /* last addr eviction reached */ boolean_t l2ad_first; /* first sweep through */ boolean_t l2ad_writing; /* currently writing */ - list_t *l2ad_buflist; /* buffer list */ + kmutex_t l2ad_mtx; /* lock for buffer list */ + list_t l2ad_buflist; /* buffer list */ list_node_t l2ad_node; /* device list node */ } l2arc_dev_t; +typedef struct l2arc_buf_hdr { + /* protected by arc_buf_hdr mutex */ + l2arc_dev_t *b_dev; /* L2ARC device */ + uint64_t b_daddr; /* disk address, offset byte */ + /* real alloc'd buffer size depending on b_compress applied */ + uint32_t b_hits; + int32_t b_asize; + + list_node_t b_l2node; +} l2arc_buf_hdr_t; + typedef struct l2arc_write_callback { l2arc_dev_t *l2wcb_dev; /* device info */ arc_buf_hdr_t *l2wcb_head; /* head of write buflist */ } l2arc_write_callback_t; +struct arc_buf_hdr { + /* protected by hash lock */ + dva_t b_dva; + uint64_t b_birth; + /* + * Even though this checksum is only set/verified when a buffer is in + * the L1 cache, it needs to be in the set of common fields because it + * must be preserved from the time before a buffer is written out to + * 
L2ARC until after it is read back in. + */ + zio_cksum_t *b_freeze_cksum; + + arc_buf_hdr_t *b_hash_next; + arc_flags_t b_flags; + + /* immutable */ + int32_t b_size; + uint64_t b_spa; + + /* L2ARC fields. Undefined when not in L2ARC. */ + l2arc_buf_hdr_t b_l2hdr; + /* L1ARC fields. Undefined when in l2arc_only state */ + l1arc_buf_hdr_t b_l1hdr; +}; #ifdef __cplusplus } #endif diff --git a/include/sys/trace_arc.h b/include/sys/trace_arc.h index 8b885eff73a7..b9df228eae33 100644 --- a/include/sys/trace_arc.h +++ b/include/sys/trace_arc.h @@ -45,7 +45,6 @@ DECLARE_EVENT_CLASS(zfs_arc_buf_hdr_class, TP_STRUCT__entry( __array(uint64_t, hdr_dva_word, 2) __field(uint64_t, hdr_birth) - __field(uint64_t, hdr_cksum0) __field(uint32_t, hdr_flags) __field(uint32_t, hdr_datacnt) __field(arc_buf_contents_t, hdr_type) @@ -64,27 +63,25 @@ DECLARE_EVENT_CLASS(zfs_arc_buf_hdr_class, __entry->hdr_dva_word[0] = ab->b_dva.dva_word[0]; __entry->hdr_dva_word[1] = ab->b_dva.dva_word[1]; __entry->hdr_birth = ab->b_birth; - __entry->hdr_cksum0 = ab->b_cksum0; __entry->hdr_flags = ab->b_flags; - __entry->hdr_datacnt = ab->b_datacnt; - __entry->hdr_type = ab->b_type; + __entry->hdr_datacnt = ab->b_l1hdr.b_datacnt; __entry->hdr_size = ab->b_size; __entry->hdr_spa = ab->b_spa; - __entry->hdr_state_type = ab->b_state->arcs_state; - __entry->hdr_access = ab->b_arc_access; - __entry->hdr_mru_hits = ab->b_mru_hits; - __entry->hdr_mru_ghost_hits = ab->b_mru_ghost_hits; - __entry->hdr_mfu_hits = ab->b_mfu_hits; - __entry->hdr_mfu_ghost_hits = ab->b_mfu_ghost_hits; - __entry->hdr_l2_hits = ab->b_l2_hits; - __entry->hdr_refcount = ab->b_refcnt.rc_count; + __entry->hdr_state_type = ab->b_l1hdr.b_state->arcs_state; + __entry->hdr_access = ab->b_l1hdr.b_arc_access; + __entry->hdr_mru_hits = ab->b_l1hdr.b_mru_hits; + __entry->hdr_mru_ghost_hits = ab->b_l1hdr.b_mru_ghost_hits; + __entry->hdr_mfu_hits = ab->b_l1hdr.b_mfu_hits; + __entry->hdr_mfu_ghost_hits = ab->b_l1hdr.b_mfu_ghost_hits; + 
__entry->hdr_l2_hits = ab->b_l1hdr.b_l2_hits; + __entry->hdr_refcount = ab->b_l1hdr.b_refcnt.rc_count; ), - TP_printk("hdr { dva 0x%llx:0x%llx birth %llu cksum0 0x%llx " + TP_printk("hdr { dva 0x%llx:0x%llx birth %llu " "flags 0x%x datacnt %u type %u size %llu spa %llu " "state_type %u access %lu mru_hits %u mru_ghost_hits %u " "mfu_hits %u mfu_ghost_hits %u l2_hits %u refcount %lli }", __entry->hdr_dva_word[0], __entry->hdr_dva_word[1], - __entry->hdr_birth, __entry->hdr_cksum0, __entry->hdr_flags, + __entry->hdr_birth, __entry->hdr_flags, __entry->hdr_datacnt, __entry->hdr_type, __entry->hdr_size, __entry->hdr_spa, __entry->hdr_state_type, __entry->hdr_access, __entry->hdr_mru_hits, @@ -261,7 +258,6 @@ DECLARE_EVENT_CLASS(zfs_arc_miss_class, TP_STRUCT__entry( __array(uint64_t, hdr_dva_word, 2) __field(uint64_t, hdr_birth) - __field(uint64_t, hdr_cksum0) __field(uint32_t, hdr_flags) __field(uint32_t, hdr_datacnt) __field(arc_buf_contents_t, hdr_type) @@ -292,20 +288,18 @@ DECLARE_EVENT_CLASS(zfs_arc_miss_class, __entry->hdr_dva_word[0] = hdr->b_dva.dva_word[0]; __entry->hdr_dva_word[1] = hdr->b_dva.dva_word[1]; __entry->hdr_birth = hdr->b_birth; - __entry->hdr_cksum0 = hdr->b_cksum0; __entry->hdr_flags = hdr->b_flags; - __entry->hdr_datacnt = hdr->b_datacnt; - __entry->hdr_type = hdr->b_type; + __entry->hdr_datacnt = hdr->b_l1hdr.b_datacnt; __entry->hdr_size = hdr->b_size; __entry->hdr_spa = hdr->b_spa; - __entry->hdr_state_type = hdr->b_state->arcs_state; - __entry->hdr_access = hdr->b_arc_access; - __entry->hdr_mru_hits = hdr->b_mru_hits; - __entry->hdr_mru_ghost_hits = hdr->b_mru_ghost_hits; - __entry->hdr_mfu_hits = hdr->b_mfu_hits; - __entry->hdr_mfu_ghost_hits = hdr->b_mfu_ghost_hits; - __entry->hdr_l2_hits = hdr->b_l2_hits; - __entry->hdr_refcount = hdr->b_refcnt.rc_count; + __entry->hdr_state_type = hdr->b_l1hdr.b_state->arcs_state; + __entry->hdr_access = hdr->b_l1hdr.b_arc_access; + __entry->hdr_mru_hits = hdr->b_l1hdr.b_mru_hits; + 
__entry->hdr_mru_ghost_hits = hdr->b_l1hdr.b_mru_ghost_hits; + __entry->hdr_mfu_hits = hdr->b_l1hdr.b_mfu_hits; + __entry->hdr_mfu_ghost_hits = hdr->b_l1hdr.b_mfu_ghost_hits; + __entry->hdr_l2_hits = hdr->b_l1hdr.b_l2_hits; + __entry->hdr_refcount = hdr->b_l1hdr.b_refcnt.rc_count; __entry->bp_dva0[0] = bp->blk_dva[0].dva_word[0]; __entry->bp_dva0[1] = bp->blk_dva[0].dva_word[1]; @@ -325,8 +319,8 @@ DECLARE_EVENT_CLASS(zfs_arc_miss_class, __entry->zb_level = zb->zb_level; __entry->zb_blkid = zb->zb_blkid; ), - TP_printk("hdr { dva 0x%llx:0x%llx birth %llu cksum0 0x%llx " - "flags 0x%x datacnt %u type %u size %llu spa %llu state_type %u " + TP_printk("hdr { dva 0x%llx:0x%llx birth %llu " + "flags 0x%x datacnt %u size %llu spa %llu state_type %u " "access %lu mru_hits %u mru_ghost_hits %u mfu_hits %u " "mfu_ghost_hits %u l2_hits %u refcount %lli } " "bp { dva0 0x%llx:0x%llx dva1 0x%llx:0x%llx dva2 " @@ -334,8 +328,8 @@ DECLARE_EVENT_CLASS(zfs_arc_miss_class, "lsize %llu } zb { objset %llu object %llu level %lli " "blkid %llu }", __entry->hdr_dva_word[0], __entry->hdr_dva_word[1], - __entry->hdr_birth, __entry->hdr_cksum0, __entry->hdr_flags, - __entry->hdr_datacnt, __entry->hdr_type, __entry->hdr_size, + __entry->hdr_birth, __entry->hdr_flags, + __entry->hdr_datacnt, __entry->hdr_size, __entry->hdr_spa, __entry->hdr_state_type, __entry->hdr_access, __entry->hdr_mru_hits, __entry->hdr_mru_ghost_hits, __entry->hdr_mfu_hits, __entry->hdr_mfu_ghost_hits, diff --git a/module/zfs/arc.c b/module/zfs/arc.c index 7b34e6825814..e69889ab5810 100644 --- a/module/zfs/arc.c +++ b/module/zfs/arc.c @@ -118,7 +118,7 @@ * Note that the majority of the performance stats are manipulated * with atomic operations. 
* - * The L2ARC uses the l2arc_buflist_mtx global mutex for the following: + * The L2ARC uses the l2ad_mtx on each vdev for the following: * * - L2ARC buflist creation * - L2ARC buflist eviction @@ -308,6 +308,7 @@ typedef struct arc_stats { kstat_named_t arcstat_l2_writes_hdr_miss; kstat_named_t arcstat_l2_evict_lock_retry; kstat_named_t arcstat_l2_evict_reading; + kstat_named_t arcstat_l2_evict_l1cached; kstat_named_t arcstat_l2_free_on_write; kstat_named_t arcstat_l2_abort_lowmem; kstat_named_t arcstat_l2_cksum_bad; @@ -396,6 +397,7 @@ static arc_stats_t arc_stats = { { "l2_writes_hdr_miss", KSTAT_DATA_UINT64 }, { "l2_evict_lock_retry", KSTAT_DATA_UINT64 }, { "l2_evict_reading", KSTAT_DATA_UINT64 }, + { "l2_evict_l1cached", KSTAT_DATA_UINT64 }, { "l2_free_on_write", KSTAT_DATA_UINT64 }, { "l2_abort_lowmem", KSTAT_DATA_UINT64 }, { "l2_cksum_bad", KSTAT_DATA_UINT64 }, @@ -506,22 +508,38 @@ static arc_buf_hdr_t arc_eviction_hdr; #define HDR_PREFETCH(hdr) ((hdr)->b_flags & ARC_FLAG_PREFETCH) #define HDR_FREED_IN_READ(hdr) ((hdr)->b_flags & ARC_FLAG_FREED_IN_READ) #define HDR_BUF_AVAILABLE(hdr) ((hdr)->b_flags & ARC_FLAG_BUF_AVAILABLE) -#define HDR_FREE_IN_PROGRESS(hdr) \ - ((hdr)->b_flags & ARC_FLAG_FREE_IN_PROGRESS) + #define HDR_L2CACHE(hdr) ((hdr)->b_flags & ARC_FLAG_L2CACHE) +#define HDR_L2COMPRESS(hdr) ((hdr)->b_flags & ARC_FLAG_L2COMPRESS) #define HDR_L2_READING(hdr) \ - ((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS && \ - (hdr)->b_l2hdr != NULL) + (((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS) && \ + ((hdr)->b_flags & ARC_FLAG_HAS_L2HDR)) #define HDR_L2_WRITING(hdr) ((hdr)->b_flags & ARC_FLAG_L2_WRITING) #define HDR_L2_EVICTED(hdr) ((hdr)->b_flags & ARC_FLAG_L2_EVICTED) #define HDR_L2_WRITE_HEAD(hdr) ((hdr)->b_flags & ARC_FLAG_L2_WRITE_HEAD) +#define HDR_ISTYPE_METADATA(hdr) \ + ((hdr)->b_flags & ARC_FLAG_BUFC_METADATA) +#define HDR_ISTYPE_DATA(hdr) (!HDR_ISTYPE_METADATA(hdr)) + +#define HDR_HAS_L1HDR(hdr) ((hdr)->b_flags & ARC_FLAG_HAS_L1HDR) +#define 
HDR_HAS_L2HDR(hdr) ((hdr)->b_flags & ARC_FLAG_HAS_L2HDR) + +/* For storing compression mode in b_flags */ +#define HDR_COMPRESS_OFFSET 24 +#define HDR_COMPRESS_NBITS 7 + +#define HDR_GET_COMPRESS(hdr) ((enum zio_compress)BF32_GET(hdr->b_flags, \ + HDR_COMPRESS_OFFSET, HDR_COMPRESS_NBITS)) +#define HDR_SET_COMPRESS(hdr, cmp) BF32_SET(hdr->b_flags, \ + HDR_COMPRESS_OFFSET, HDR_COMPRESS_NBITS, (cmp)) + /* * Other sizes */ -#define HDR_SIZE ((int64_t)sizeof (arc_buf_hdr_t)) -#define L2HDR_SIZE ((int64_t)sizeof (l2arc_buf_hdr_t)) +#define HDR_FULL_SIZE ((int64_t)sizeof (arc_buf_hdr_t)) +#define HDR_L2ONLY_SIZE ((int64_t)offsetof(arc_buf_hdr_t, b_l1hdr)) /* * Hash table routines @@ -591,7 +609,6 @@ static list_t L2ARC_dev_list; /* device list */ static list_t *l2arc_dev_list; /* device list pointer */ static kmutex_t l2arc_dev_mtx; /* device list mutex */ static l2arc_dev_t *l2arc_dev_last; /* last device used */ -static kmutex_t l2arc_buflist_mtx; /* mutex for all buflists */ static list_t L2ARC_free_on_write; /* free after write buf list */ static list_t *l2arc_free_on_write; /* free after write list ptr */ static kmutex_t l2arc_free_on_write_mtx; /* mutex for list */ @@ -606,19 +623,6 @@ typedef struct l2arc_read_callback { enum zio_compress l2rcb_compress; /* applied compress */ } l2arc_read_callback_t; -struct l2arc_buf_hdr { - /* protected by arc_buf_hdr mutex */ - l2arc_dev_t *b_dev; /* L2ARC device */ - uint64_t b_daddr; /* disk address, offset byte */ - /* compression applied to buffer data */ - enum zio_compress b_compress; - /* real alloc'd buffer size depending on b_compress applied */ - uint32_t b_hits; - uint64_t b_asize; - /* temporary buffer holder for in-flight compressed data */ - void *b_tmp_cdata; -}; - typedef struct l2arc_data_free { /* protected by l2arc_free_on_write_mtx */ void *l2df_data; @@ -637,12 +641,13 @@ static int arc_evict_needed(arc_buf_contents_t); static void arc_evict_ghost(arc_state_t *, uint64_t, int64_t); static void 
arc_buf_watch(arc_buf_t *); +static arc_buf_contents_t arc_buf_type(arc_buf_hdr_t *); +static uint32_t arc_bufc_to_flags(arc_buf_contents_t); + static boolean_t l2arc_write_eligible(uint64_t, arc_buf_hdr_t *); static void l2arc_read_done(zio_t *); -static void l2arc_hdr_stat_add(void); -static void l2arc_hdr_stat_remove(void); -static boolean_t l2arc_compress_buf(l2arc_buf_hdr_t *); +static boolean_t l2arc_compress_buf(arc_buf_hdr_t *); static void l2arc_decompress_zio(zio_t *, arc_buf_hdr_t *, enum zio_compress); static void l2arc_release_cdata_buf(arc_buf_hdr_t *); @@ -665,8 +670,7 @@ buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth) #define BUF_EMPTY(buf) \ ((buf)->b_dva.dva_word[0] == 0 && \ - (buf)->b_dva.dva_word[1] == 0 && \ - (buf)->b_cksum0 == 0) + (buf)->b_dva.dva_word[1] == 0) #define BUF_EQUAL(spa, dva, birth, buf) \ ((buf)->b_dva.dva_word[0] == (dva)->dva_word[0]) && \ @@ -679,7 +683,6 @@ buf_discard_identity(arc_buf_hdr_t *hdr) hdr->b_dva.dva_word[0] = 0; hdr->b_dva.dva_word[1] = 0; hdr->b_birth = 0; - hdr->b_cksum0 = 0; } static arc_buf_hdr_t * @@ -709,6 +712,7 @@ buf_hash_find(uint64_t spa, const blkptr_t *bp, kmutex_t **lockp) * equal to elem in the hash table, then the already existing element * will be returned and the new element will not be inserted. * Otherwise returns NULL. + * If lockp == NULL, the caller is assumed to already hold the hash lock. 
*/ static arc_buf_hdr_t * buf_hash_insert(arc_buf_hdr_t *hdr, kmutex_t **lockp) @@ -721,8 +725,14 @@ buf_hash_insert(arc_buf_hdr_t *hdr, kmutex_t **lockp) ASSERT(!DVA_IS_EMPTY(&hdr->b_dva)); ASSERT(hdr->b_birth != 0); ASSERT(!HDR_IN_HASH_TABLE(hdr)); - *lockp = hash_lock; - mutex_enter(hash_lock); + + if (lockp != NULL) { + *lockp = hash_lock; + mutex_enter(hash_lock); + } else { + ASSERT(MUTEX_HELD(hash_lock)); + } + for (fhdr = buf_hash_table.ht_table[idx], i = 0; fhdr != NULL; fhdr = fhdr->b_hash_next, i++) { if (BUF_EQUAL(hdr->b_spa, &hdr->b_dva, hdr->b_birth, fhdr)) @@ -777,7 +787,8 @@ buf_hash_remove(arc_buf_hdr_t *hdr) /* * Global data structures and functions for the buf kmem cache. */ -static kmem_cache_t *hdr_cache; +static kmem_cache_t *hdr_full_cache; +static kmem_cache_t *hdr_l2only_cache; static kmem_cache_t *buf_cache; static void @@ -798,7 +809,8 @@ buf_fini(void) #endif for (i = 0; i < BUF_LOCKS; i++) mutex_destroy(&buf_hash_table.ht_locks[i].ht_lock); - kmem_cache_destroy(hdr_cache); + kmem_cache_destroy(hdr_full_cache); + kmem_cache_destroy(hdr_l2only_cache); kmem_cache_destroy(buf_cache); } @@ -808,17 +820,29 @@ buf_fini(void) */ /* ARGSUSED */ static int -hdr_cons(void *vbuf, void *unused, int kmflag) +hdr_full_cons(void *vbuf, void *unused, int kmflag) +{ + arc_buf_hdr_t *hdr = vbuf; + + bzero(hdr, HDR_FULL_SIZE); + cv_init(&hdr->b_l1hdr.b_cv, NULL, CV_DEFAULT, NULL); + refcount_create(&hdr->b_l1hdr.b_refcnt); + mutex_init(&hdr->b_l1hdr.b_freeze_lock, NULL, MUTEX_DEFAULT, NULL); + list_link_init(&hdr->b_l1hdr.b_arc_node); + list_link_init(&hdr->b_l2hdr.b_l2node); + arc_space_consume(HDR_FULL_SIZE, ARC_SPACE_HDRS); + + return (0); +} + +/* ARGSUSED */ +static int +hdr_l2only_cons(void *vbuf, void *unused, int kmflag) { arc_buf_hdr_t *hdr = vbuf; - bzero(hdr, sizeof (arc_buf_hdr_t)); - refcount_create(&hdr->b_refcnt); - cv_init(&hdr->b_cv, NULL, CV_DEFAULT, NULL); - mutex_init(&hdr->b_freeze_lock, NULL, MUTEX_DEFAULT, NULL); - 
list_link_init(&hdr->b_arc_node); - list_link_init(&hdr->b_l2node); - arc_space_consume(sizeof (arc_buf_hdr_t), ARC_SPACE_HDRS); + bzero(hdr, HDR_L2ONLY_SIZE); + arc_space_consume(HDR_L2ONLY_SIZE, ARC_SPACE_L2HDRS); return (0); } @@ -842,15 +866,25 @@ buf_cons(void *vbuf, void *unused, int kmflag) */ /* ARGSUSED */ static void -hdr_dest(void *vbuf, void *unused) +hdr_full_dest(void *vbuf, void *unused) { arc_buf_hdr_t *hdr = vbuf; ASSERT(BUF_EMPTY(hdr)); - refcount_destroy(&hdr->b_refcnt); - cv_destroy(&hdr->b_cv); - mutex_destroy(&hdr->b_freeze_lock); - arc_space_return(sizeof (arc_buf_hdr_t), ARC_SPACE_HDRS); + cv_destroy(&hdr->b_l1hdr.b_cv); + refcount_destroy(&hdr->b_l1hdr.b_refcnt); + mutex_destroy(&hdr->b_l1hdr.b_freeze_lock); + arc_space_return(HDR_FULL_SIZE, ARC_SPACE_HDRS); +} + +/* ARGSUSED */ +static void +hdr_l2only_dest(void *vbuf, void *unused) +{ + ASSERTV(arc_buf_hdr_t *hdr = vbuf); + + ASSERT(BUF_EMPTY(hdr)); + arc_space_return(HDR_L2ONLY_SIZE, ARC_SPACE_L2HDRS); } /* ARGSUSED */ @@ -897,8 +931,11 @@ buf_init(void) goto retry; } - hdr_cache = kmem_cache_create("arc_buf_hdr_t", sizeof (arc_buf_hdr_t), - 0, hdr_cons, hdr_dest, NULL, NULL, NULL, 0); + hdr_full_cache = kmem_cache_create("arc_buf_hdr_t_full", HDR_FULL_SIZE, + 0, hdr_full_cons, hdr_full_dest, NULL, NULL, NULL, 0); + hdr_l2only_cache = kmem_cache_create("arc_buf_hdr_t_l2only", + HDR_L2ONLY_SIZE, 0, hdr_l2only_cons, hdr_l2only_dest, NULL, + NULL, NULL, 0); buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t), 0, buf_cons, buf_dest, NULL, NULL, NULL, 0); @@ -912,6 +949,81 @@ buf_init(void) } } +/* + * Transition between the two allocation states for the arc_buf_hdr struct. + * The arc_buf_hdr struct can be allocated with (hdr_full_cache) or without + * (hdr_l2only_cache) the fields necessary for the L1 cache - the smaller + * version is used when a cache buffer is only in the L2ARC in order to reduce + * memory usage. 
+ */ +static arc_buf_hdr_t * +arc_hdr_realloc(arc_buf_hdr_t *hdr, kmem_cache_t *old, kmem_cache_t *new) +{ + arc_buf_hdr_t *nhdr; + l2arc_dev_t *dev; + + ASSERT(HDR_HAS_L2HDR(hdr)); + ASSERT((old == hdr_full_cache && new == hdr_l2only_cache) || + (old == hdr_l2only_cache && new == hdr_full_cache)); + + dev = hdr->b_l2hdr.b_dev; + nhdr = kmem_cache_alloc(new, KM_PUSHPAGE); + + ASSERT(MUTEX_HELD(HDR_LOCK(hdr))); + buf_hash_remove(hdr); + + bcopy(hdr, nhdr, HDR_L2ONLY_SIZE); + if (new == hdr_full_cache) { + nhdr->b_flags |= ARC_FLAG_HAS_L1HDR; + /* + * arc_access and arc_change_state need to be aware that a + * header has just come out of L2ARC, so we set its state to + * l2c_only even though it's about to change. + */ + nhdr->b_l1hdr.b_state = arc_l2c_only; + } else { + ASSERT(hdr->b_l1hdr.b_buf == NULL); + ASSERT0(hdr->b_l1hdr.b_datacnt); + ASSERT(!list_link_active(&hdr->b_l1hdr.b_arc_node)); + /* + * We might be removing the L1hdr of a buffer which was just + * written out to L2ARC. If such a buffer is compressed then we + * need to free its b_tmp_cdata before destroying the header. + */ + if (hdr->b_l1hdr.b_tmp_cdata != NULL && + HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF) + l2arc_release_cdata_buf(hdr); + nhdr->b_flags &= ~ARC_FLAG_HAS_L1HDR; + } + /* + * The header has been reallocated so we need to re-insert it into any + * lists it was on. + */ + (void) buf_hash_insert(nhdr, NULL); + + ASSERT(list_link_active(&hdr->b_l2hdr.b_l2node)); + + mutex_enter(&dev->l2ad_mtx); + + /* + * We must place the realloc'ed header back into the list at + * the same spot. Otherwise, if it's placed earlier in the list, + * l2arc_write_buffers() could find it during the function's + * write phase, and try to write it out to the l2arc. 
+ */ + list_insert_after(&dev->l2ad_buflist, hdr, nhdr); + list_remove(&dev->l2ad_buflist, hdr); + + mutex_exit(&dev->l2ad_mtx); + + buf_discard_identity(hdr); + hdr->b_freeze_cksum = NULL; + kmem_cache_free(old, hdr); + + return (nhdr); +} + + #define ARC_MINTIME (hz>>4) /* 62 ms */ static void @@ -922,16 +1034,15 @@ arc_cksum_verify(arc_buf_t *buf) if (!(zfs_flags & ZFS_DEBUG_MODIFY)) return; - mutex_enter(&buf->b_hdr->b_freeze_lock); - if (buf->b_hdr->b_freeze_cksum == NULL || - (buf->b_hdr->b_flags & ARC_FLAG_IO_ERROR)) { - mutex_exit(&buf->b_hdr->b_freeze_lock); + mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock); + if (buf->b_hdr->b_freeze_cksum == NULL || HDR_IO_ERROR(buf->b_hdr)) { + mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock); return; } fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc); if (!ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc)) panic("buffer modified while frozen!"); - mutex_exit(&buf->b_hdr->b_freeze_lock); + mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock); } static int @@ -940,10 +1051,10 @@ arc_cksum_equal(arc_buf_t *buf) zio_cksum_t zc; int equal; - mutex_enter(&buf->b_hdr->b_freeze_lock); + mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock); fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc); equal = ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc); - mutex_exit(&buf->b_hdr->b_freeze_lock); + mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock); return (equal); } @@ -954,16 +1065,16 @@ arc_cksum_compute(arc_buf_t *buf, boolean_t force) if (!force && !(zfs_flags & ZFS_DEBUG_MODIFY)) return; - mutex_enter(&buf->b_hdr->b_freeze_lock); + mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock); if (buf->b_hdr->b_freeze_cksum != NULL) { - mutex_exit(&buf->b_hdr->b_freeze_lock); + mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock); return; } buf->b_hdr->b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t), KM_SLEEP); fletcher_2_native(buf->b_data, buf->b_hdr->b_size, buf->b_hdr->b_freeze_cksum); - mutex_exit(&buf->b_hdr->b_freeze_lock); + 
mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock); arc_buf_watch(buf); } @@ -997,24 +1108,50 @@ arc_buf_watch(arc_buf_t *buf) #endif } +static arc_buf_contents_t +arc_buf_type(arc_buf_hdr_t *hdr) +{ + if (HDR_ISTYPE_METADATA(hdr)) { + return (ARC_BUFC_METADATA); + } else { + return (ARC_BUFC_DATA); + } +} + +static uint32_t +arc_bufc_to_flags(arc_buf_contents_t type) +{ + switch (type) { + case ARC_BUFC_DATA: + /* metadata field is 0 if buffer contains normal data */ + return (0); + case ARC_BUFC_METADATA: + return (ARC_FLAG_BUFC_METADATA); + default: + break; + } + panic("undefined ARC buffer type!"); + return ((uint32_t)-1); +} + void arc_buf_thaw(arc_buf_t *buf) { if (zfs_flags & ZFS_DEBUG_MODIFY) { - if (buf->b_hdr->b_state != arc_anon) + if (buf->b_hdr->b_l1hdr.b_state != arc_anon) panic("modifying non-anon buffer!"); - if (buf->b_hdr->b_flags & ARC_FLAG_IO_IN_PROGRESS) + if (HDR_IO_IN_PROGRESS(buf->b_hdr)) panic("modifying buffer while i/o in progress!"); arc_cksum_verify(buf); } - mutex_enter(&buf->b_hdr->b_freeze_lock); + mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock); if (buf->b_hdr->b_freeze_cksum != NULL) { kmem_free(buf->b_hdr->b_freeze_cksum, sizeof (zio_cksum_t)); buf->b_hdr->b_freeze_cksum = NULL; } - mutex_exit(&buf->b_hdr->b_freeze_lock); + mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock); arc_buf_unwatch(buf); } @@ -1031,7 +1168,7 @@ arc_buf_freeze(arc_buf_t *buf) mutex_enter(hash_lock); ASSERT(buf->b_hdr->b_freeze_cksum != NULL || - buf->b_hdr->b_state == arc_anon); + buf->b_hdr->b_l1hdr.b_state == arc_anon); arc_cksum_compute(buf, B_FALSE); mutex_exit(hash_lock); @@ -1040,30 +1177,37 @@ arc_buf_freeze(arc_buf_t *buf) static void add_reference(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, void *tag) { + arc_state_t *state; + + ASSERT(HDR_HAS_L1HDR(hdr)); ASSERT(MUTEX_HELD(hash_lock)); - if ((refcount_add(&hdr->b_refcnt, tag) == 1) && - (hdr->b_state != arc_anon)) { - uint64_t delta = hdr->b_size * hdr->b_datacnt; - list_t *list = 
&hdr->b_state->arcs_list[hdr->b_type]; - uint64_t *size = &hdr->b_state->arcs_lsize[hdr->b_type]; - - ASSERT(!MUTEX_HELD(&hdr->b_state->arcs_mtx)); - mutex_enter(&hdr->b_state->arcs_mtx); - ASSERT(list_link_active(&hdr->b_arc_node)); - list_remove(list, hdr); - if (GHOST_STATE(hdr->b_state)) { - ASSERT0(hdr->b_datacnt); - ASSERT3P(hdr->b_buf, ==, NULL); - delta = hdr->b_size; + state = hdr->b_l1hdr.b_state; + + if ((refcount_add(&hdr->b_l1hdr.b_refcnt, tag) == 1) && + (state != arc_anon)) { + /* We don't use the L2-only state list. */ + if (state != arc_l2c_only) { + uint64_t delta = hdr->b_size * hdr->b_l1hdr.b_datacnt; + list_t *list = &state->arcs_list[arc_buf_type(hdr)]; + uint64_t *size = &state->arcs_lsize[arc_buf_type(hdr)]; + + ASSERT(!MUTEX_HELD(&state->arcs_mtx)); + mutex_enter(&state->arcs_mtx); + ASSERT(list_link_active(&hdr->b_l1hdr.b_arc_node)); + list_remove(list, hdr); + if (GHOST_STATE(state)) { + ASSERT0(hdr->b_l1hdr.b_datacnt); + ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); + delta = hdr->b_size; + } + ASSERT(delta > 0); + ASSERT3U(*size, >=, delta); + atomic_add_64(size, -delta); + mutex_exit(&state->arcs_mtx); } - ASSERT(delta > 0); - ASSERT3U(*size, >=, delta); - atomic_add_64(size, -delta); - mutex_exit(&hdr->b_state->arcs_mtx); /* remove the prefetch flag if we get a reference */ - if (hdr->b_flags & ARC_FLAG_PREFETCH) - hdr->b_flags &= ~ARC_FLAG_PREFETCH; + hdr->b_flags &= ~ARC_FLAG_PREFETCH; } } @@ -1071,21 +1215,27 @@ static int remove_reference(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, void *tag) { int cnt; - arc_state_t *state = hdr->b_state; + arc_state_t *state = hdr->b_l1hdr.b_state; + ASSERT(HDR_HAS_L1HDR(hdr)); ASSERT(state == arc_anon || MUTEX_HELD(hash_lock)); ASSERT(!GHOST_STATE(state)); - if (((cnt = refcount_remove(&hdr->b_refcnt, tag)) == 0) && + /* + * arc_l2c_only counts as a ghost state so we don't need to explicitly + * check to prevent usage of the arc_l2c_only list. 
+ */ + if (((cnt = refcount_remove(&hdr->b_l1hdr.b_refcnt, tag)) == 0) && (state != arc_anon)) { - uint64_t *size = &state->arcs_lsize[hdr->b_type]; + uint64_t *size = &state->arcs_lsize[arc_buf_type(hdr)]; ASSERT(!MUTEX_HELD(&state->arcs_mtx)); mutex_enter(&state->arcs_mtx); - ASSERT(!list_link_active(&hdr->b_arc_node)); - list_insert_head(&state->arcs_list[hdr->b_type], hdr); - ASSERT(hdr->b_datacnt > 0); - atomic_add_64(size, hdr->b_size * hdr->b_datacnt); + ASSERT(!list_link_active(&hdr->b_l1hdr.b_arc_node)); + list_insert_head(&state->arcs_list[arc_buf_type(hdr)], hdr); + ASSERT(hdr->b_l1hdr.b_datacnt > 0); + atomic_add_64(size, hdr->b_size * + hdr->b_l1hdr.b_datacnt); mutex_exit(&state->arcs_mtx); } return (cnt); @@ -1102,31 +1252,45 @@ void arc_buf_info(arc_buf_t *ab, arc_buf_info_t *abi, int state_index) { arc_buf_hdr_t *hdr = ab->b_hdr; - arc_state_t *state = hdr->b_state; + l1arc_buf_hdr_t *l1hdr = NULL; + l2arc_buf_hdr_t *l2hdr = NULL; + arc_state_t *state = NULL; + + if (HDR_HAS_L1HDR(hdr)) { + l1hdr = &hdr->b_l1hdr; + state = l1hdr->b_state; + } + if (HDR_HAS_L2HDR(hdr)) + l2hdr = &hdr->b_l2hdr; memset(abi, 0, sizeof (arc_buf_info_t)); abi->abi_flags = hdr->b_flags; - abi->abi_datacnt = hdr->b_datacnt; + + if (l1hdr) { + abi->abi_datacnt = l1hdr->b_datacnt; + abi->abi_access = l1hdr->b_arc_access; + abi->abi_mru_hits = l1hdr->b_mru_hits; + abi->abi_mru_ghost_hits = l1hdr->b_mru_ghost_hits; + abi->abi_mfu_hits = l1hdr->b_mfu_hits; + abi->abi_mfu_ghost_hits = l1hdr->b_mfu_ghost_hits; + abi->abi_holds = refcount_count(&l1hdr->b_refcnt); + } + + if (l2hdr) { + abi->abi_l2arc_dattr = l2hdr->b_daddr; + abi->abi_l2arc_asize = l2hdr->b_asize; + abi->abi_l2arc_compress = HDR_GET_COMPRESS(hdr); + abi->abi_l2arc_hits = l2hdr->b_hits; + } + abi->abi_state_type = state ? 
state->arcs_state : ARC_STATE_ANON; - abi->abi_state_contents = hdr->b_type; + abi->abi_state_contents = arc_buf_type(hdr); abi->abi_state_index = -1; abi->abi_size = hdr->b_size; - abi->abi_access = hdr->b_arc_access; - abi->abi_mru_hits = hdr->b_mru_hits; - abi->abi_mru_ghost_hits = hdr->b_mru_ghost_hits; - abi->abi_mfu_hits = hdr->b_mfu_hits; - abi->abi_mfu_ghost_hits = hdr->b_mfu_ghost_hits; - abi->abi_holds = refcount_count(&hdr->b_refcnt); - - if (hdr->b_l2hdr) { - abi->abi_l2arc_dattr = hdr->b_l2hdr->b_daddr; - abi->abi_l2arc_asize = hdr->b_l2hdr->b_asize; - abi->abi_l2arc_compress = hdr->b_l2hdr->b_compress; - abi->abi_l2arc_hits = hdr->b_l2hdr->b_hits; - } - if (state && state_index && list_link_active(&hdr->b_arc_node)) { - list_t *list = &state->arcs_list[hdr->b_type]; + if (l1hdr && state && state_index && + list_link_active(&l1hdr->b_arc_node)) { + list_t *list = &state->arcs_list[arc_buf_type(hdr)]; arc_buf_hdr_t *h; mutex_enter(&state->arcs_mtx); @@ -1147,40 +1311,60 @@ static void arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr, kmutex_t *hash_lock) { - arc_state_t *old_state = hdr->b_state; - int64_t refcnt = refcount_count(&hdr->b_refcnt); + arc_state_t *old_state; + int64_t refcnt; + uint32_t datacnt; uint64_t from_delta, to_delta; + arc_buf_contents_t buftype = arc_buf_type(hdr); + + /* + * We almost always have an L1 hdr here, since we call arc_hdr_realloc() + * in arc_read() when bringing a buffer out of the L2ARC. However, the + * L1 hdr doesn't always exist when we change state to arc_anon before + * destroying a header, in which case reallocating to add the L1 hdr is + * pointless. 
+ */ + if (HDR_HAS_L1HDR(hdr)) { + old_state = hdr->b_l1hdr.b_state; + refcnt = refcount_count(&hdr->b_l1hdr.b_refcnt); + datacnt = hdr->b_l1hdr.b_datacnt; + } else { + old_state = arc_l2c_only; + refcnt = 0; + datacnt = 0; + } ASSERT(MUTEX_HELD(hash_lock)); ASSERT3P(new_state, !=, old_state); - ASSERT(refcnt == 0 || hdr->b_datacnt > 0); - ASSERT(hdr->b_datacnt == 0 || !GHOST_STATE(new_state)); - ASSERT(hdr->b_datacnt <= 1 || old_state != arc_anon); + ASSERT(refcnt == 0 || datacnt > 0); + ASSERT(!GHOST_STATE(new_state) || datacnt == 0); + ASSERT(old_state != arc_anon || datacnt <= 1); - from_delta = to_delta = hdr->b_datacnt * hdr->b_size; + from_delta = to_delta = datacnt * hdr->b_size; /* * If this buffer is evictable, transfer it from the * old state list to the new state list. */ if (refcnt == 0) { - if (old_state != arc_anon) { + if (old_state != arc_anon && old_state != arc_l2c_only) { int use_mutex = !MUTEX_HELD(&old_state->arcs_mtx); - uint64_t *size = &old_state->arcs_lsize[hdr->b_type]; + uint64_t *size = &old_state->arcs_lsize[buftype]; if (use_mutex) mutex_enter(&old_state->arcs_mtx); - ASSERT(list_link_active(&hdr->b_arc_node)); - list_remove(&old_state->arcs_list[hdr->b_type], hdr); + ASSERT(HDR_HAS_L1HDR(hdr)); + ASSERT(list_link_active(&hdr->b_l1hdr.b_arc_node)); + list_remove(&old_state->arcs_list[buftype], hdr); /* * If prefetching out of the ghost cache, * we will have a non-zero datacnt. 
*/ - if (GHOST_STATE(old_state) && hdr->b_datacnt == 0) { + if (GHOST_STATE(old_state) && datacnt == 0) { /* ghost elements have a ghost size */ - ASSERT(hdr->b_buf == NULL); + ASSERT(hdr->b_l1hdr.b_buf == NULL); from_delta = hdr->b_size; } ASSERT3U(*size, >=, from_delta); @@ -1189,20 +1373,26 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr, if (use_mutex) mutex_exit(&old_state->arcs_mtx); } - if (new_state != arc_anon) { + if (new_state != arc_anon && new_state != arc_l2c_only) { int use_mutex = !MUTEX_HELD(&new_state->arcs_mtx); - uint64_t *size = &new_state->arcs_lsize[hdr->b_type]; + uint64_t *size = &new_state->arcs_lsize[buftype]; + /* + * An L1 header always exists here, since if we're + * moving to some L1-cached state (i.e. not l2c_only or + * anonymous), we realloc the header to add an L1hdr + * beforehand. + */ + ASSERT(HDR_HAS_L1HDR(hdr)); if (use_mutex) mutex_enter(&new_state->arcs_mtx); - list_insert_head(&new_state->arcs_list[hdr->b_type], - hdr); + list_insert_head(&new_state->arcs_list[buftype], hdr); /* ghost elements have a ghost size */ if (GHOST_STATE(new_state)) { - ASSERT(hdr->b_datacnt == 0); - ASSERT(hdr->b_buf == NULL); + ASSERT0(datacnt); + ASSERT(hdr->b_l1hdr.b_buf == NULL); to_delta = hdr->b_size; } atomic_add_64(size, to_delta); @@ -1216,20 +1406,22 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr, if (new_state == arc_anon && HDR_IN_HASH_TABLE(hdr)) buf_hash_remove(hdr); - /* adjust state sizes */ - if (to_delta) + /* adjust state sizes (ignore arc_l2c_only) */ + if (to_delta && new_state != arc_l2c_only) atomic_add_64(&new_state->arcs_size, to_delta); - if (from_delta) { + if (from_delta && old_state != arc_l2c_only) { ASSERT3U(old_state->arcs_size, >=, from_delta); atomic_add_64(&old_state->arcs_size, -from_delta); } - hdr->b_state = new_state; + if (HDR_HAS_L1HDR(hdr)) + hdr->b_l1hdr.b_state = new_state; - /* adjust l2arc hdr stats */ - if (new_state == arc_l2c_only) - l2arc_hdr_stat_add(); - else if 
(old_state == arc_l2c_only) - l2arc_hdr_stat_remove(); + /* + * L2 headers should never be on the L2 state list since they don't + * have L1 headers allocated. + */ + ASSERT(list_is_empty(&arc_l2c_only->arcs_list[ARC_BUFC_DATA]) && + list_is_empty(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA])); } void @@ -1307,30 +1499,36 @@ arc_buf_alloc(spa_t *spa, uint64_t size, void *tag, arc_buf_contents_t type) arc_buf_t *buf; VERIFY3U(size, <=, spa_maxblocksize(spa)); - hdr = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE); + hdr = kmem_cache_alloc(hdr_full_cache, KM_PUSHPAGE); ASSERT(BUF_EMPTY(hdr)); + ASSERT3P(hdr->b_freeze_cksum, ==, NULL); hdr->b_size = size; - hdr->b_type = type; hdr->b_spa = spa_load_guid(spa); - hdr->b_state = arc_anon; - hdr->b_arc_access = 0; - hdr->b_mru_hits = 0; - hdr->b_mru_ghost_hits = 0; - hdr->b_mfu_hits = 0; - hdr->b_mfu_ghost_hits = 0; - hdr->b_l2_hits = 0; + hdr->b_l1hdr.b_mru_hits = 0; + hdr->b_l1hdr.b_mru_ghost_hits = 0; + hdr->b_l1hdr.b_mfu_hits = 0; + hdr->b_l1hdr.b_mfu_ghost_hits = 0; + hdr->b_l1hdr.b_l2_hits = 0; + buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE); buf->b_hdr = hdr; buf->b_data = NULL; buf->b_efunc = NULL; buf->b_private = NULL; buf->b_next = NULL; - hdr->b_buf = buf; + + hdr->b_flags = arc_bufc_to_flags(type); + hdr->b_flags |= ARC_FLAG_HAS_L1HDR; + + hdr->b_l1hdr.b_buf = buf; + hdr->b_l1hdr.b_state = arc_anon; + hdr->b_l1hdr.b_arc_access = 0; + hdr->b_l1hdr.b_datacnt = 1; + arc_get_data_buf(buf); - hdr->b_datacnt = 1; - hdr->b_flags = 0; - ASSERT(refcount_is_zero(&hdr->b_refcnt)); - (void) refcount_add(&hdr->b_refcnt, tag); + + ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); + (void) refcount_add(&hdr->b_l1hdr.b_refcnt, tag); return (buf); } @@ -1363,8 +1561,9 @@ arc_return_buf(arc_buf_t *buf, void *tag) arc_buf_hdr_t *hdr = buf->b_hdr; ASSERT(buf->b_data != NULL); - (void) refcount_add(&hdr->b_refcnt, tag); - (void) refcount_remove(&hdr->b_refcnt, arc_onloan_tag); + ASSERT(HDR_HAS_L1HDR(hdr)); + (void) 
refcount_add(&hdr->b_l1hdr.b_refcnt, tag); + (void) refcount_remove(&hdr->b_l1hdr.b_refcnt, arc_onloan_tag); atomic_add_64(&arc_loaned_bytes, -hdr->b_size); } @@ -1373,12 +1572,12 @@ arc_return_buf(arc_buf_t *buf, void *tag) void arc_loan_inuse_buf(arc_buf_t *buf, void *tag) { - arc_buf_hdr_t *hdr; + arc_buf_hdr_t *hdr = buf->b_hdr; ASSERT(buf->b_data != NULL); - hdr = buf->b_hdr; - (void) refcount_add(&hdr->b_refcnt, arc_onloan_tag); - (void) refcount_remove(&hdr->b_refcnt, tag); + ASSERT(HDR_HAS_L1HDR(hdr)); + (void) refcount_add(&hdr->b_l1hdr.b_refcnt, arc_onloan_tag); + (void) refcount_remove(&hdr->b_l1hdr.b_refcnt, tag); buf->b_efunc = NULL; buf->b_private = NULL; @@ -1392,15 +1591,16 @@ arc_buf_clone(arc_buf_t *from) arc_buf_hdr_t *hdr = from->b_hdr; uint64_t size = hdr->b_size; - ASSERT(hdr->b_state != arc_anon); + ASSERT(HDR_HAS_L1HDR(hdr)); + ASSERT(hdr->b_l1hdr.b_state != arc_anon); buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE); buf->b_hdr = hdr; buf->b_data = NULL; buf->b_efunc = NULL; buf->b_private = NULL; - buf->b_next = hdr->b_buf; - hdr->b_buf = buf; + buf->b_next = hdr->b_l1hdr.b_buf; + hdr->b_l1hdr.b_buf = buf; arc_get_data_buf(buf); bcopy(from->b_data, buf->b_data, size); @@ -1410,11 +1610,11 @@ arc_buf_clone(arc_buf_t *from) * then track the size and number of duplicates. These stats will be * updated as duplicate buffers are created and destroyed. 
*/ - if (hdr->b_type == ARC_BUFC_DATA) { + if (HDR_ISTYPE_DATA(hdr)) { ARCSTAT_BUMP(arcstat_duplicate_buffers); ARCSTAT_INCR(arcstat_duplicate_buffers_size, size); } - hdr->b_datacnt += 1; + hdr->b_l1hdr.b_datacnt += 1; return (buf); } @@ -1437,17 +1637,20 @@ arc_buf_add_ref(arc_buf_t *buf, void* tag) hash_lock = HDR_LOCK(buf->b_hdr); mutex_enter(hash_lock); hdr = buf->b_hdr; + ASSERT(HDR_HAS_L1HDR(hdr)); ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); mutex_exit(&buf->b_evict_lock); - ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu); + ASSERT(hdr->b_l1hdr.b_state == arc_mru || + hdr->b_l1hdr.b_state == arc_mfu); + add_reference(hdr, hash_lock, tag); DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr); arc_access(hdr, hash_lock); mutex_exit(hash_lock); ARCSTAT_BUMP(arcstat_hits); - ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_FLAG_PREFETCH), - demand, prefetch, hdr->b_type != ARC_BUFC_METADATA, + ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr), + demand, prefetch, !HDR_ISTYPE_METADATA(hdr), data, metadata, hits); } @@ -1485,10 +1688,10 @@ arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t remove) arc_buf_t **bufp; /* free up data associated with the buf */ - if (buf->b_data) { - arc_state_t *state = buf->b_hdr->b_state; + if (buf->b_data != NULL) { + arc_state_t *state = buf->b_hdr->b_l1hdr.b_state; uint64_t size = buf->b_hdr->b_size; - arc_buf_contents_t type = buf->b_hdr->b_type; + arc_buf_contents_t type = arc_buf_type(buf->b_hdr); arc_cksum_verify(buf); arc_buf_unwatch(buf); @@ -1503,11 +1706,12 @@ arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t remove) arc_space_return(size, ARC_SPACE_DATA); } } - if (list_link_active(&buf->b_hdr->b_arc_node)) { + if (list_link_active(&buf->b_hdr->b_l1hdr.b_arc_node)) { uint64_t *cnt = &state->arcs_lsize[type]; - ASSERT(refcount_is_zero(&buf->b_hdr->b_refcnt)); - ASSERT(state != arc_anon); + ASSERT(refcount_is_zero( + &buf->b_hdr->b_l1hdr.b_refcnt)); + ASSERT(state != arc_anon && state != arc_l2c_only); ASSERT3U(*cnt, >=, 
size); atomic_add_64(cnt, -size); @@ -1520,13 +1724,13 @@ arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t remove) * If we're destroying a duplicate buffer make sure * that the appropriate statistics are updated. */ - if (buf->b_hdr->b_datacnt > 1 && - buf->b_hdr->b_type == ARC_BUFC_DATA) { + if (buf->b_hdr->b_l1hdr.b_datacnt > 1 && + HDR_ISTYPE_DATA(buf->b_hdr)) { ARCSTAT_BUMPDOWN(arcstat_duplicate_buffers); ARCSTAT_INCR(arcstat_duplicate_buffers_size, -size); } - ASSERT(buf->b_hdr->b_datacnt > 0); - buf->b_hdr->b_datacnt -= 1; + ASSERT(buf->b_hdr->b_l1hdr.b_datacnt > 0); + buf->b_hdr->b_l1hdr.b_datacnt -= 1; } /* only remove the buf if requested */ @@ -1534,7 +1738,8 @@ arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t remove) return; /* remove the buf from the hdr list */ - for (bufp = &buf->b_hdr->b_buf; *bufp != buf; bufp = &(*bufp)->b_next) + for (bufp = &buf->b_hdr->b_l1hdr.b_buf; *bufp != buf; + bufp = &(*bufp)->b_next) continue; *bufp = buf->b_next; buf->b_next = NULL; @@ -1549,84 +1754,82 @@ arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t remove) static void arc_hdr_destroy(arc_buf_hdr_t *hdr) { - l2arc_buf_hdr_t *l2hdr = hdr->b_l2hdr; - - ASSERT(refcount_is_zero(&hdr->b_refcnt)); - ASSERT3P(hdr->b_state, ==, arc_anon); + if (HDR_HAS_L1HDR(hdr)) { + ASSERT(hdr->b_l1hdr.b_buf == NULL || + hdr->b_l1hdr.b_datacnt > 0); + ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); + ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon); + } ASSERT(!HDR_IO_IN_PROGRESS(hdr)); + ASSERT(!HDR_IN_HASH_TABLE(hdr)); + + if (HDR_HAS_L2HDR(hdr)) { + l2arc_buf_hdr_t *l2hdr = &hdr->b_l2hdr; + boolean_t buflist_held = MUTEX_HELD(&l2hdr->b_dev->l2ad_mtx); - if (l2hdr != NULL) { - boolean_t buflist_held = MUTEX_HELD(&l2arc_buflist_mtx); - /* - * To prevent arc_free() and l2arc_evict() from - * attempting to free the same buffer at the same time, - * a FREE_IN_PROGRESS flag is given to arc_free() to - * give it priority. 
l2arc_evict() can't destroy this - * header while we are waiting on l2arc_buflist_mtx. - * - * The hdr may be removed from l2ad_buflist before we - * grab l2arc_buflist_mtx, so b_l2hdr is rechecked. - */ if (!buflist_held) { - mutex_enter(&l2arc_buflist_mtx); - l2hdr = hdr->b_l2hdr; + mutex_enter(&l2hdr->b_dev->l2ad_mtx); + l2hdr = &hdr->b_l2hdr; } - if (l2hdr != NULL) { - list_remove(l2hdr->b_dev->l2ad_buflist, hdr); - ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size); - ARCSTAT_INCR(arcstat_l2_asize, -l2hdr->b_asize); - vdev_space_update(l2hdr->b_dev->l2ad_vdev, - -l2hdr->b_asize, 0, 0); - kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t)); - arc_space_return(L2HDR_SIZE, ARC_SPACE_L2HDRS); - if (hdr->b_state == arc_l2c_only) - l2arc_hdr_stat_remove(); - hdr->b_l2hdr = NULL; - } + list_remove(&l2hdr->b_dev->l2ad_buflist, hdr); + + arc_space_return(HDR_L2ONLY_SIZE, ARC_SPACE_L2HDRS); + ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size); + ARCSTAT_INCR(arcstat_l2_asize, -l2hdr->b_asize); if (!buflist_held) - mutex_exit(&l2arc_buflist_mtx); + mutex_exit(&l2hdr->b_dev->l2ad_mtx); + + hdr->b_flags &= ~ARC_FLAG_HAS_L2HDR; } - if (!BUF_EMPTY(hdr)) { - ASSERT(!HDR_IN_HASH_TABLE(hdr)); + if (!BUF_EMPTY(hdr)) buf_discard_identity(hdr); - } - while (hdr->b_buf) { - arc_buf_t *buf = hdr->b_buf; - - if (buf->b_efunc) { - mutex_enter(&arc_eviction_mtx); - mutex_enter(&buf->b_evict_lock); - ASSERT(buf->b_hdr != NULL); - arc_buf_destroy(hdr->b_buf, FALSE, FALSE); - hdr->b_buf = buf->b_next; - buf->b_hdr = &arc_eviction_hdr; - buf->b_next = arc_eviction_list; - arc_eviction_list = buf; - mutex_exit(&buf->b_evict_lock); - mutex_exit(&arc_eviction_mtx); - } else { - arc_buf_destroy(hdr->b_buf, FALSE, TRUE); - } - } + if (hdr->b_freeze_cksum != NULL) { kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t)); hdr->b_freeze_cksum = NULL; } - ASSERT(!list_link_active(&hdr->b_arc_node)); + if (HDR_HAS_L1HDR(hdr)) { + while (hdr->b_l1hdr.b_buf) { + arc_buf_t *buf = hdr->b_l1hdr.b_buf; + + if (buf->b_efunc != 
NULL) { + mutex_enter(&arc_eviction_mtx); + mutex_enter(&buf->b_evict_lock); + ASSERT(buf->b_hdr != NULL); + arc_buf_destroy(hdr->b_l1hdr.b_buf, FALSE, + FALSE); + hdr->b_l1hdr.b_buf = buf->b_next; + buf->b_hdr = &arc_eviction_hdr; + buf->b_next = arc_eviction_list; + arc_eviction_list = buf; + mutex_exit(&buf->b_evict_lock); + mutex_exit(&arc_eviction_mtx); + } else { + arc_buf_destroy(hdr->b_l1hdr.b_buf, FALSE, + TRUE); + } + } + } + ASSERT3P(hdr->b_hash_next, ==, NULL); - ASSERT3P(hdr->b_acb, ==, NULL); - kmem_cache_free(hdr_cache, hdr); + if (HDR_HAS_L1HDR(hdr)) { + ASSERT(!list_link_active(&hdr->b_l1hdr.b_arc_node)); + ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL); + kmem_cache_free(hdr_full_cache, hdr); + } else { + kmem_cache_free(hdr_l2only_cache, hdr); + } } void arc_buf_free(arc_buf_t *buf, void *tag) { arc_buf_hdr_t *hdr = buf->b_hdr; - int hashed = hdr->b_state != arc_anon; + int hashed = hdr->b_l1hdr.b_state != arc_anon; ASSERT(buf->b_efunc == NULL); ASSERT(buf->b_data != NULL); @@ -1639,10 +1842,10 @@ arc_buf_free(arc_buf_t *buf, void *tag) ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); (void) remove_reference(hdr, hash_lock, tag); - if (hdr->b_datacnt > 1) { + if (hdr->b_l1hdr.b_datacnt > 1) { arc_buf_destroy(buf, FALSE, TRUE); } else { - ASSERT(buf == hdr->b_buf); + ASSERT(buf == hdr->b_l1hdr.b_buf); ASSERT(buf->b_efunc == NULL); hdr->b_flags |= ARC_FLAG_BUF_AVAILABLE; } @@ -1656,7 +1859,7 @@ arc_buf_free(arc_buf_t *buf, void *tag) */ mutex_enter(&arc_eviction_mtx); (void) remove_reference(hdr, NULL, tag); - ASSERT(refcount_is_zero(&hdr->b_refcnt)); + ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); destroy_hdr = !HDR_IO_IN_PROGRESS(hdr); mutex_exit(&arc_eviction_mtx); if (destroy_hdr) @@ -1676,8 +1879,8 @@ arc_buf_remove_ref(arc_buf_t *buf, void* tag) kmutex_t *hash_lock = NULL; boolean_t no_callback = (buf->b_efunc == NULL); - if (hdr->b_state == arc_anon) { - ASSERT(hdr->b_datacnt == 1); + if (hdr->b_l1hdr.b_state == arc_anon) { + 
ASSERT(hdr->b_l1hdr.b_datacnt == 1); arc_buf_free(buf, tag); return (no_callback); } @@ -1685,21 +1888,22 @@ arc_buf_remove_ref(arc_buf_t *buf, void* tag) hash_lock = HDR_LOCK(hdr); mutex_enter(hash_lock); hdr = buf->b_hdr; + ASSERT(hdr->b_l1hdr.b_datacnt > 0); ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); - ASSERT(hdr->b_state != arc_anon); + ASSERT(hdr->b_l1hdr.b_state != arc_anon); ASSERT(buf->b_data != NULL); (void) remove_reference(hdr, hash_lock, tag); - if (hdr->b_datacnt > 1) { + if (hdr->b_l1hdr.b_datacnt > 1) { if (no_callback) arc_buf_destroy(buf, FALSE, TRUE); } else if (no_callback) { - ASSERT(hdr->b_buf == buf && buf->b_next == NULL); + ASSERT(hdr->b_l1hdr.b_buf == buf && buf->b_next == NULL); ASSERT(buf->b_efunc == NULL); hdr->b_flags |= ARC_FLAG_BUF_AVAILABLE; } - ASSERT(no_callback || hdr->b_datacnt > 1 || - refcount_is_zero(&hdr->b_refcnt)); + ASSERT(no_callback || hdr->b_l1hdr.b_datacnt > 1 || + refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); mutex_exit(hash_lock); return (no_callback); } @@ -1745,7 +1949,7 @@ arc_buf_eviction_needed(arc_buf_t *buf) return (B_TRUE); } - if (hdr->b_datacnt > 1 && hdr->b_type == ARC_BUFC_DATA) + if (hdr->b_l1hdr.b_datacnt > 1 && HDR_ISTYPE_DATA(hdr)) evict_needed = B_TRUE; mutex_exit(&buf->b_evict_lock); @@ -1784,16 +1988,27 @@ arc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle, evicted_state = (state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost; top: - mutex_enter(&state->arcs_mtx); + /* + * The ghost list lock must be acquired first in order to prevent + * a 3 party deadlock: + * + * - arc_evict_ghost acquires arc_*_ghost->arcs_mtx, followed by + * l2ad_mtx in arc_hdr_realloc + * - l2arc_write_buffers acquires l2ad_mtx, followed by arc_*->arcs_mtx + * - arc_evict acquires arc_*->arcs_mtx, followed by + * arc_*_ghost->arcs_mtx and forms a deadlock cycle. + * + * This situation is avoided by acquiring the ghost list lock first. 
+ */ mutex_enter(&evicted_state->arcs_mtx); + mutex_enter(&state->arcs_mtx); for (hdr = list_tail(list); hdr; hdr = hdr_prev) { hdr_prev = list_prev(list, hdr); /* prefetch buffers have a minimum lifespan */ if (HDR_IO_IN_PROGRESS(hdr) || - (spa && hdr->b_spa != spa) || - (hdr->b_flags & (ARC_FLAG_PREFETCH | ARC_FLAG_INDIRECT) && - ddi_get_lbolt() - hdr->b_arc_access < + ((hdr->b_flags & (ARC_FLAG_PREFETCH | ARC_FLAG_INDIRECT)) && + ddi_get_lbolt() - hdr->b_l1hdr.b_arc_access < zfs_arc_min_prefetch_lifespan)) { skipped++; continue; @@ -1818,11 +2033,11 @@ arc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle, */ if (!recycle && count++ > arc_evict_iterations) { list_insert_after(list, hdr, &marker); - mutex_exit(&evicted_state->arcs_mtx); mutex_exit(&state->arcs_mtx); + mutex_exit(&evicted_state->arcs_mtx); kpreempt(KPREEMPT_SYNC); - mutex_enter(&state->arcs_mtx); mutex_enter(&evicted_state->arcs_mtx); + mutex_enter(&state->arcs_mtx); hdr_prev = list_prev(list, &marker); list_remove(list, &marker); count = 0; @@ -1832,28 +2047,29 @@ arc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle, hash_lock = HDR_LOCK(hdr); have_lock = MUTEX_HELD(hash_lock); if (have_lock || mutex_tryenter(hash_lock)) { - ASSERT0(refcount_count(&hdr->b_refcnt)); - ASSERT(hdr->b_datacnt > 0); - while (hdr->b_buf) { - arc_buf_t *buf = hdr->b_buf; + ASSERT0(refcount_count(&hdr->b_l1hdr.b_refcnt)); + ASSERT3U(hdr->b_l1hdr.b_datacnt, >, 0); + while (hdr->b_l1hdr.b_buf) { + arc_buf_t *buf = hdr->b_l1hdr.b_buf; if (!mutex_tryenter(&buf->b_evict_lock)) { missed += 1; break; } - if (buf->b_data) { + if (buf->b_data != NULL) { bytes_evicted += hdr->b_size; - if (recycle && hdr->b_type == type && + if (recycle && + arc_buf_type(hdr) == type && hdr->b_size == bytes && !HDR_L2_WRITING(hdr)) { stolen = buf->b_data; recycle = FALSE; } } - if (buf->b_efunc) { + if (buf->b_efunc != NULL) { mutex_enter(&arc_eviction_mtx); arc_buf_destroy(buf, buf->b_data == stolen, 
FALSE); - hdr->b_buf = buf->b_next; + hdr->b_l1hdr.b_buf = buf->b_next; buf->b_hdr = &arc_eviction_hdr; buf->b_next = arc_eviction_list; arc_eviction_list = buf; @@ -1866,7 +2082,7 @@ arc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle, } } - if (hdr->b_l2hdr) { + if (HDR_HAS_L2HDR(hdr)) { ARCSTAT_INCR(arcstat_evict_l2_cached, hdr->b_size); } else { @@ -1880,7 +2096,7 @@ arc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle, } } - if (hdr->b_datacnt == 0) { + if (hdr->b_l1hdr.b_datacnt == 0) { arc_change_state(evicted_state, hdr, hash_lock); ASSERT(HDR_IN_HASH_TABLE(hdr)); hdr->b_flags |= ARC_FLAG_IN_HASH_TABLE; @@ -1896,8 +2112,8 @@ arc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle, } } - mutex_exit(&evicted_state->arcs_mtx); mutex_exit(&state->arcs_mtx); + mutex_exit(&evicted_state->arcs_mtx); if (list == &state->arcs_list[ARC_BUFC_DATA] && (bytes < 0 || bytes_evicted < bytes)) { @@ -1948,7 +2164,7 @@ arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes) mutex_enter(&state->arcs_mtx); for (hdr = list_tail(list); hdr; hdr = hdr_prev) { hdr_prev = list_prev(list, hdr); - if (hdr->b_type > ARC_BUFC_NUMTYPES) + if (arc_buf_type(hdr) >= ARC_BUFC_NUMTYPES) panic("invalid hdr=%p", (void *)hdr); if (spa && hdr->b_spa != spa) continue; @@ -1980,16 +2196,23 @@ arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes) } if (mutex_tryenter(hash_lock)) { ASSERT(!HDR_IO_IN_PROGRESS(hdr)); - ASSERT(hdr->b_buf == NULL); + ASSERT(!HDR_HAS_L1HDR(hdr) || + hdr->b_l1hdr.b_buf == NULL); ARCSTAT_BUMP(arcstat_deleted); bytes_deleted += hdr->b_size; - if (hdr->b_l2hdr != NULL) { + if (HDR_HAS_L2HDR(hdr)) { /* * This buffer is cached on the 2nd Level ARC; * don't destroy the header. */ arc_change_state(arc_l2c_only, hdr, hash_lock); + /* + * dropping from L1+L2 cached to L2-only, + * realloc to remove the L1 header. 
+ */ + hdr = arc_hdr_realloc(hdr, hdr_full_cache, + hdr_l2only_cache); mutex_exit(hash_lock); } else { arc_change_state(arc_anon, hdr, hash_lock); @@ -2253,27 +2476,27 @@ arc_flush(spa_t *spa) { uint64_t guid = 0; - if (spa) + if (spa != NULL) guid = spa_load_guid(spa); while (list_head(&arc_mru->arcs_list[ARC_BUFC_DATA])) { (void) arc_evict(arc_mru, guid, -1, FALSE, ARC_BUFC_DATA); - if (spa) + if (spa != NULL) break; } while (list_head(&arc_mru->arcs_list[ARC_BUFC_METADATA])) { (void) arc_evict(arc_mru, guid, -1, FALSE, ARC_BUFC_METADATA); - if (spa) + if (spa != NULL) break; } while (list_head(&arc_mfu->arcs_list[ARC_BUFC_DATA])) { (void) arc_evict(arc_mfu, guid, -1, FALSE, ARC_BUFC_DATA); - if (spa) + if (spa != NULL) break; } while (list_head(&arc_mfu->arcs_list[ARC_BUFC_METADATA])) { (void) arc_evict(arc_mfu, guid, -1, FALSE, ARC_BUFC_METADATA); - if (spa) + if (spa != NULL) break; } @@ -2345,8 +2568,8 @@ arc_kmem_reap_now(arc_reclaim_strategy_t strat, uint64_t bytes) } } - kmem_cache_reap_now(buf_cache); - kmem_cache_reap_now(hdr_cache); + kmem_cache_reap_now(hdr_full_cache); + kmem_cache_reap_now(hdr_l2only_cache); } /* @@ -2689,9 +2912,9 @@ arc_evict_needed(arc_buf_contents_t type) static void arc_get_data_buf(arc_buf_t *buf) { - arc_state_t *state = buf->b_hdr->b_state; + arc_state_t *state = buf->b_hdr->b_l1hdr.b_state; uint64_t size = buf->b_hdr->b_size; - arc_buf_contents_t type = buf->b_hdr->b_type; + arc_buf_contents_t type = arc_buf_type(buf->b_hdr); arc_buf_contents_t evict = ARC_BUFC_DATA; boolean_t recycle = TRUE; @@ -2718,8 +2941,7 @@ arc_get_data_buf(arc_buf_t *buf) * will end up on the mru list; so steal space from there. */ if (state == arc_mfu_ghost) - state = buf->b_hdr->b_flags & ARC_FLAG_PREFETCH ? - arc_mru : arc_mfu; + state = HDR_PREFETCH(buf->b_hdr) ? arc_mru : arc_mfu; else if (state == arc_mru_ghost) state = arc_mru; @@ -2782,20 +3004,21 @@ arc_get_data_buf(arc_buf_t *buf) * Update the state size. 
Note that ghost states have a * "ghost size" and so don't need to be updated. */ - if (!GHOST_STATE(buf->b_hdr->b_state)) { + if (!GHOST_STATE(buf->b_hdr->b_l1hdr.b_state)) { arc_buf_hdr_t *hdr = buf->b_hdr; - atomic_add_64(&hdr->b_state->arcs_size, size); - if (list_link_active(&hdr->b_arc_node)) { - ASSERT(refcount_is_zero(&hdr->b_refcnt)); - atomic_add_64(&hdr->b_state->arcs_lsize[type], size); + atomic_add_64(&hdr->b_l1hdr.b_state->arcs_size, size); + if (list_link_active(&hdr->b_l1hdr.b_arc_node)) { + ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); + atomic_add_64(&hdr->b_l1hdr.b_state->arcs_lsize[type], + size); } /* * If we are growing the cache, and we are adding anonymous * data, and we have outgrown arc_p, update arc_p */ if (!zfs_arc_p_aggressive_disable && - arc_size < arc_c && hdr->b_state == arc_anon && + arc_size < arc_c && hdr->b_l1hdr.b_state == arc_anon && arc_anon->arcs_size + arc_mru->arcs_size > arc_p) arc_p = MIN(arc_c, arc_p + size); } @@ -2811,20 +3034,21 @@ arc_access(arc_buf_hdr_t *hdr, kmutex_t *hash_lock) clock_t now; ASSERT(MUTEX_HELD(hash_lock)); + ASSERT(HDR_HAS_L1HDR(hdr)); - if (hdr->b_state == arc_anon) { + if (hdr->b_l1hdr.b_state == arc_anon) { /* * This buffer is not in the cache, and does not * appear in our "ghost" list. Add the new buffer * to the MRU state. */ - ASSERT(hdr->b_arc_access == 0); - hdr->b_arc_access = ddi_get_lbolt(); + ASSERT0(hdr->b_l1hdr.b_arc_access); + hdr->b_l1hdr.b_arc_access = ddi_get_lbolt(); DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr); arc_change_state(arc_mru, hdr, hash_lock); - } else if (hdr->b_state == arc_mru) { + } else if (hdr->b_l1hdr.b_state == arc_mru) { now = ddi_get_lbolt(); /* @@ -2835,15 +3059,16 @@ arc_access(arc_buf_hdr_t *hdr, kmutex_t *hash_lock) * - move the buffer to the head of the list if this is * another prefetch (to make it less likely to be evicted). 
*/ - if ((hdr->b_flags & ARC_FLAG_PREFETCH) != 0) { - if (refcount_count(&hdr->b_refcnt) == 0) { - ASSERT(list_link_active(&hdr->b_arc_node)); + if (HDR_PREFETCH(hdr)) { + if (refcount_count(&hdr->b_l1hdr.b_refcnt) == 0) { + ASSERT(list_link_active( + &hdr->b_l1hdr.b_arc_node)); } else { hdr->b_flags &= ~ARC_FLAG_PREFETCH; - atomic_inc_32(&hdr->b_mru_hits); + atomic_inc_32(&hdr->b_l1hdr.b_mru_hits); ARCSTAT_BUMP(arcstat_mru_hits); } - hdr->b_arc_access = now; + hdr->b_l1hdr.b_arc_access = now; return; } @@ -2852,19 +3077,20 @@ arc_access(arc_buf_hdr_t *hdr, kmutex_t *hash_lock) * but it is still in the cache. Move it to the MFU * state. */ - if (ddi_time_after(now, hdr->b_arc_access + ARC_MINTIME)) { + if (ddi_time_after(now, hdr->b_l1hdr.b_arc_access + + ARC_MINTIME)) { /* * More than 125ms have passed since we * instantiated this buffer. Move it to the * most frequently used state. */ - hdr->b_arc_access = now; + hdr->b_l1hdr.b_arc_access = now; DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr); arc_change_state(arc_mfu, hdr, hash_lock); } - atomic_inc_32(&hdr->b_mru_hits); + atomic_inc_32(&hdr->b_l1hdr.b_mru_hits); ARCSTAT_BUMP(arcstat_mru_hits); - } else if (hdr->b_state == arc_mru_ghost) { + } else if (hdr->b_l1hdr.b_state == arc_mru_ghost) { arc_state_t *new_state; /* * This buffer has been "accessed" recently, but @@ -2872,9 +3098,9 @@ arc_access(arc_buf_hdr_t *hdr, kmutex_t *hash_lock) * MFU state. 
*/ - if (hdr->b_flags & ARC_FLAG_PREFETCH) { + if (HDR_PREFETCH(hdr)) { new_state = arc_mru; - if (refcount_count(&hdr->b_refcnt) > 0) + if (refcount_count(&hdr->b_l1hdr.b_refcnt) > 0) hdr->b_flags &= ~ARC_FLAG_PREFETCH; DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr); } else { @@ -2882,12 +3108,12 @@ arc_access(arc_buf_hdr_t *hdr, kmutex_t *hash_lock) DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr); } - hdr->b_arc_access = ddi_get_lbolt(); + hdr->b_l1hdr.b_arc_access = ddi_get_lbolt(); arc_change_state(new_state, hdr, hash_lock); - atomic_inc_32(&hdr->b_mru_ghost_hits); + atomic_inc_32(&hdr->b_l1hdr.b_mru_ghost_hits); ARCSTAT_BUMP(arcstat_mru_ghost_hits); - } else if (hdr->b_state == arc_mfu) { + } else if (hdr->b_l1hdr.b_state == arc_mfu) { /* * This buffer has been accessed more than once and is * still in the cache. Keep it in the MFU state. @@ -2897,14 +3123,14 @@ arc_access(arc_buf_hdr_t *hdr, kmutex_t *hash_lock) * If it was a prefetch, we will explicitly move it to * the head of the list now. */ - if ((hdr->b_flags & ARC_FLAG_PREFETCH) != 0) { - ASSERT(refcount_count(&hdr->b_refcnt) == 0); - ASSERT(list_link_active(&hdr->b_arc_node)); + if ((HDR_PREFETCH(hdr)) != 0) { + ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); + ASSERT(list_link_active(&hdr->b_l1hdr.b_arc_node)); } - atomic_inc_32(&hdr->b_mfu_hits); + atomic_inc_32(&hdr->b_l1hdr.b_mfu_hits); ARCSTAT_BUMP(arcstat_mfu_hits); - hdr->b_arc_access = ddi_get_lbolt(); - } else if (hdr->b_state == arc_mfu_ghost) { + hdr->b_l1hdr.b_arc_access = ddi_get_lbolt(); + } else if (hdr->b_l1hdr.b_state == arc_mfu_ghost) { arc_state_t *new_state = arc_mfu; /* * This buffer has been accessed more than once but has @@ -2912,31 +3138,32 @@ arc_access(arc_buf_hdr_t *hdr, kmutex_t *hash_lock) * MFU state. */ - if (hdr->b_flags & ARC_FLAG_PREFETCH) { + if (HDR_PREFETCH(hdr)) { /* * This is a prefetch access... * move this block back to the MRU state. 
*/ - ASSERT0(refcount_count(&hdr->b_refcnt)); + ASSERT0(refcount_count(&hdr->b_l1hdr.b_refcnt)); new_state = arc_mru; } - hdr->b_arc_access = ddi_get_lbolt(); + hdr->b_l1hdr.b_arc_access = ddi_get_lbolt(); DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr); arc_change_state(new_state, hdr, hash_lock); - atomic_inc_32(&hdr->b_mfu_ghost_hits); + atomic_inc_32(&hdr->b_l1hdr.b_mfu_ghost_hits); ARCSTAT_BUMP(arcstat_mfu_ghost_hits); - } else if (hdr->b_state == arc_l2c_only) { + } else if (hdr->b_l1hdr.b_state == arc_l2c_only) { /* * This buffer is on the 2nd Level ARC. */ - hdr->b_arc_access = ddi_get_lbolt(); + hdr->b_l1hdr.b_arc_access = ddi_get_lbolt(); DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr); arc_change_state(arc_mfu, hdr, hash_lock); } else { - cmn_err(CE_PANIC, "invalid arc state 0x%p", hdr->b_state); + cmn_err(CE_PANIC, "invalid arc state 0x%p", + hdr->b_l1hdr.b_state); } } @@ -3005,11 +3232,11 @@ arc_read_done(zio_t *zio) } hdr->b_flags &= ~ARC_FLAG_L2_EVICTED; - if (l2arc_noprefetch && (hdr->b_flags & ARC_FLAG_PREFETCH)) + if (l2arc_noprefetch && HDR_PREFETCH(hdr)) hdr->b_flags &= ~ARC_FLAG_L2CACHE; /* byteswap if necessary */ - callback_list = hdr->b_acb; + callback_list = hdr->b_l1hdr.b_acb; ASSERT(callback_list != NULL); if (BP_SHOULD_BYTESWAP(zio->io_bp) && zio->io_error == 0) { dmu_object_byteswap_t bswap = @@ -3023,7 +3250,8 @@ arc_read_done(zio_t *zio) arc_cksum_compute(buf, B_FALSE); arc_buf_watch(buf); - if (hash_lock && zio->io_error == 0 && hdr->b_state == arc_anon) { + if (hash_lock && zio->io_error == 0 && + hdr->b_l1hdr.b_state == arc_anon) { /* * Only call arc_access on anonymous buffers. 
This is because * if we've issued an I/O for an evicted buffer, we've already @@ -3045,24 +3273,25 @@ arc_read_done(zio_t *zio) abuf = NULL; } } - hdr->b_acb = NULL; + hdr->b_l1hdr.b_acb = NULL; hdr->b_flags &= ~ARC_FLAG_IO_IN_PROGRESS; ASSERT(!HDR_BUF_AVAILABLE(hdr)); if (abuf == buf) { ASSERT(buf->b_efunc == NULL); - ASSERT(hdr->b_datacnt == 1); + ASSERT(hdr->b_l1hdr.b_datacnt == 1); hdr->b_flags |= ARC_FLAG_BUF_AVAILABLE; } - ASSERT(refcount_is_zero(&hdr->b_refcnt) || callback_list != NULL); + ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt) || + callback_list != NULL); if (zio->io_error != 0) { hdr->b_flags |= ARC_FLAG_IO_ERROR; - if (hdr->b_state != arc_anon) + if (hdr->b_l1hdr.b_state != arc_anon) arc_change_state(arc_anon, hdr, hash_lock); if (HDR_IN_HASH_TABLE(hdr)) buf_hash_remove(hdr); - freeable = refcount_is_zero(&hdr->b_refcnt); + freeable = refcount_is_zero(&hdr->b_l1hdr.b_refcnt); } /* @@ -3070,9 +3299,9 @@ arc_read_done(zio_t *zio) * that the hdr (and hence the cv) might be freed before we get to * the cv_broadcast(). */ - cv_broadcast(&hdr->b_cv); + cv_broadcast(&hdr->b_l1hdr.b_cv); - if (hash_lock) { + if (hash_lock != NULL) { mutex_exit(hash_lock); } else { /* @@ -3081,8 +3310,8 @@ arc_read_done(zio_t *zio) * moved to the anonymous state (so that it won't show up * in the cache). 
*/ - ASSERT3P(hdr->b_state, ==, arc_anon); - freeable = refcount_is_zero(&hdr->b_refcnt); + ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon); + freeable = refcount_is_zero(&hdr->b_l1hdr.b_refcnt); } /* execute each callback and free its structure */ @@ -3145,14 +3374,14 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done, hdr = buf_hash_find(guid, bp, &hash_lock); } - if (hdr != NULL && hdr->b_datacnt > 0) { + if (hdr != NULL && HDR_HAS_L1HDR(hdr) && hdr->b_l1hdr.b_datacnt > 0) { *arc_flags |= ARC_FLAG_CACHED; if (HDR_IO_IN_PROGRESS(hdr)) { if (*arc_flags & ARC_FLAG_WAIT) { - cv_wait(&hdr->b_cv, hash_lock); + cv_wait(&hdr->b_l1hdr.b_cv, hash_lock); mutex_exit(hash_lock); goto top; } @@ -3170,8 +3399,8 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done, spa, NULL, NULL, NULL, zio_flags); ASSERT(acb->acb_done != NULL); - acb->acb_next = hdr->b_acb; - hdr->b_acb = acb; + acb->acb_next = hdr->b_l1hdr.b_acb; + hdr->b_l1hdr.b_acb = acb; add_reference(hdr, hash_lock, private); mutex_exit(hash_lock); goto out; @@ -3180,7 +3409,8 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done, goto out; } - ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu); + ASSERT(hdr->b_l1hdr.b_state == arc_mru || + hdr->b_l1hdr.b_state == arc_mfu); if (done) { add_reference(hdr, hash_lock, private); @@ -3189,7 +3419,7 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done, * copy of the data so that we will be guaranteed * that arc_release() will always succeed. 
*/ - buf = hdr->b_buf; + buf = hdr->b_l1hdr.b_buf; ASSERT(buf); ASSERT(buf->b_data); if (HDR_BUF_AVAILABLE(hdr)) { @@ -3200,7 +3430,7 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done, } } else if (*arc_flags & ARC_FLAG_PREFETCH && - refcount_count(&hdr->b_refcnt) == 0) { + refcount_count(&hdr->b_l1hdr.b_refcnt) == 0) { hdr->b_flags |= ARC_FLAG_PREFETCH; } DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr); @@ -3211,8 +3441,8 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done, hdr->b_flags |= ARC_FLAG_L2COMPRESS; mutex_exit(hash_lock); ARCSTAT_BUMP(arcstat_hits); - ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_FLAG_PREFETCH), - demand, prefetch, hdr->b_type != ARC_BUFC_METADATA, + ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr), + demand, prefetch, !HDR_ISTYPE_METADATA(hdr), data, metadata, hits); if (done) @@ -3224,7 +3454,7 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done, uint64_t addr = 0; boolean_t devw = B_FALSE; enum zio_compress b_compress = ZIO_COMPRESS_OFF; - uint64_t b_asize = 0; + int32_t b_asize = 0; /* * Gracefully handle a damaged logical block size as a @@ -3251,7 +3481,6 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done, if (!BP_IS_EMBEDDED(bp)) { hdr->b_dva = *BP_IDENTITY(bp); hdr->b_birth = BP_PHYSICAL_BIRTH(bp); - hdr->b_cksum0 = bp->blk_cksum.zc_word[0]; exists = buf_hash_insert(hdr, &hash_lock); } if (exists != NULL) { @@ -3275,11 +3504,20 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done, if (BP_GET_LEVEL(bp) > 0) hdr->b_flags |= ARC_FLAG_INDIRECT; } else { - /* this block is in the ghost cache */ - ASSERT(GHOST_STATE(hdr->b_state)); + /* + * This block is in the ghost cache. If it was L2-only + * (and thus didn't have an L1 hdr), we realloc the + * header to add an L1 hdr. 
+ */ + if (!HDR_HAS_L1HDR(hdr)) { + hdr = arc_hdr_realloc(hdr, hdr_l2only_cache, + hdr_full_cache); + } + + ASSERT(GHOST_STATE(hdr->b_l1hdr.b_state)); ASSERT(!HDR_IO_IN_PROGRESS(hdr)); - ASSERT0(refcount_count(&hdr->b_refcnt)); - ASSERT(hdr->b_buf == NULL); + ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); + ASSERT(hdr->b_l1hdr.b_buf == NULL); /* if this is a prefetch, we don't have a reference */ if (*arc_flags & ARC_FLAG_PREFETCH) @@ -3296,29 +3534,29 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done, buf->b_efunc = NULL; buf->b_private = NULL; buf->b_next = NULL; - hdr->b_buf = buf; - ASSERT(hdr->b_datacnt == 0); - hdr->b_datacnt = 1; + hdr->b_l1hdr.b_buf = buf; + ASSERT0(hdr->b_l1hdr.b_datacnt); + hdr->b_l1hdr.b_datacnt = 1; arc_get_data_buf(buf); arc_access(hdr, hash_lock); } - ASSERT(!GHOST_STATE(hdr->b_state)); + ASSERT(!GHOST_STATE(hdr->b_l1hdr.b_state)); acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP); acb->acb_done = done; acb->acb_private = private; - ASSERT(hdr->b_acb == NULL); - hdr->b_acb = acb; + ASSERT(hdr->b_l1hdr.b_acb == NULL); + hdr->b_l1hdr.b_acb = acb; hdr->b_flags |= ARC_FLAG_IO_IN_PROGRESS; - if (hdr->b_l2hdr != NULL && - (vd = hdr->b_l2hdr->b_dev->l2ad_vdev) != NULL) { - devw = hdr->b_l2hdr->b_dev->l2ad_writing; - addr = hdr->b_l2hdr->b_daddr; - b_compress = hdr->b_l2hdr->b_compress; - b_asize = hdr->b_l2hdr->b_asize; + if (HDR_HAS_L2HDR(hdr) && + (vd = hdr->b_l2hdr.b_dev->l2ad_vdev) != NULL) { + devw = hdr->b_l2hdr.b_dev->l2ad_writing; + addr = hdr->b_l2hdr.b_daddr; + b_compress = HDR_GET_COMPRESS(hdr); + b_asize = hdr->b_l2hdr.b_asize; /* * Lock out device removal. 
*/ @@ -3338,8 +3576,8 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done, DTRACE_PROBE4(arc__miss, arc_buf_hdr_t *, hdr, blkptr_t *, bp, uint64_t, size, zbookmark_phys_t *, zb); ARCSTAT_BUMP(arcstat_misses); - ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_FLAG_PREFETCH), - demand, prefetch, hdr->b_type != ARC_BUFC_METADATA, + ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr), + demand, prefetch, !HDR_ISTYPE_METADATA(hdr), data, metadata, misses); if (vd != NULL && l2arc_ndev != 0 && !(l2arc_norw && devw)) { @@ -3352,14 +3590,14 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done, * also have invalidated the vdev. * 5. This isn't prefetch and l2arc_noprefetch is set. */ - if (hdr->b_l2hdr != NULL && + if (HDR_HAS_L2HDR(hdr) && !HDR_L2_WRITING(hdr) && !HDR_L2_EVICTED(hdr) && !(l2arc_noprefetch && HDR_PREFETCH(hdr))) { l2arc_read_callback_t *cb; DTRACE_PROBE1(l2arc__hit, arc_buf_hdr_t *, hdr); ARCSTAT_BUMP(arcstat_l2_hits); - atomic_inc_32(&hdr->b_l2hdr->b_hits); + atomic_inc_32(&hdr->b_l2hdr.b_hits); cb = kmem_zalloc(sizeof (l2arc_read_callback_t), KM_SLEEP); @@ -3481,8 +3719,9 @@ void arc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *private) { ASSERT(buf->b_hdr != NULL); - ASSERT(buf->b_hdr->b_state != arc_anon); - ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt) || func == NULL); + ASSERT(buf->b_hdr->b_l1hdr.b_state != arc_anon); + ASSERT(!refcount_is_zero(&buf->b_hdr->b_l1hdr.b_refcnt) || + func == NULL); ASSERT(buf->b_efunc == NULL); ASSERT(!HDR_BUF_AVAILABLE(buf->b_hdr)); @@ -3506,7 +3745,7 @@ arc_freed(spa_t *spa, const blkptr_t *bp) if (hdr == NULL) return; if (HDR_BUF_AVAILABLE(hdr)) { - arc_buf_t *buf = hdr->b_buf; + arc_buf_t *buf = hdr->b_l1hdr.b_buf; add_reference(hdr, hash_lock, FTAG); hdr->b_flags &= ~ARC_FLAG_BUF_AVAILABLE; mutex_exit(hash_lock); @@ -3564,17 +3803,19 @@ arc_clear_callback(arc_buf_t *buf) hdr = buf->b_hdr; ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); - ASSERT3U(refcount_count(&hdr->b_refcnt), <, 
hdr->b_datacnt); - ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu); + ASSERT3U(refcount_count(&hdr->b_l1hdr.b_refcnt), <, + hdr->b_l1hdr.b_datacnt); + ASSERT(hdr->b_l1hdr.b_state == arc_mru || + hdr->b_l1hdr.b_state == arc_mfu); buf->b_efunc = NULL; buf->b_private = NULL; - if (hdr->b_datacnt > 1) { + if (hdr->b_l1hdr.b_datacnt > 1) { mutex_exit(&buf->b_evict_lock); arc_buf_destroy(buf, FALSE, TRUE); } else { - ASSERT(buf == hdr->b_buf); + ASSERT(buf == hdr->b_l1hdr.b_buf); hdr->b_flags |= ARC_FLAG_BUF_AVAILABLE; mutex_exit(&buf->b_evict_lock); } @@ -3593,10 +3834,9 @@ arc_clear_callback(arc_buf_t *buf) void arc_release(arc_buf_t *buf, void *tag) { - arc_buf_hdr_t *hdr; - kmutex_t *hash_lock = NULL; - l2arc_buf_hdr_t *l2hdr; - uint64_t buf_size = 0; + kmutex_t *hash_lock; + arc_state_t *state; + arc_buf_hdr_t *hdr = buf->b_hdr; /* * It would be nice to assert that if it's DMU metadata (level > @@ -3605,56 +3845,89 @@ arc_release(arc_buf_t *buf, void *tag) */ mutex_enter(&buf->b_evict_lock); - hdr = buf->b_hdr; - /* this buffer is not on any list */ - ASSERT(refcount_count(&hdr->b_refcnt) > 0); + /* + * We don't grab the hash lock prior to this check, because if + * the buffer's header is in the arc_anon state, it won't be + * linked into the hash table. 
+ */ + if (hdr->b_l1hdr.b_state == arc_anon) { + mutex_exit(&buf->b_evict_lock); + ASSERT(!HDR_IO_IN_PROGRESS(hdr)); + ASSERT(!HDR_IN_HASH_TABLE(hdr)); + ASSERT(!HDR_HAS_L2HDR(hdr)); + ASSERT(BUF_EMPTY(hdr)); - if (hdr->b_state == arc_anon) { - /* this buffer is already released */ - ASSERT(buf->b_efunc == NULL); - } else { - hash_lock = HDR_LOCK(hdr); - mutex_enter(hash_lock); - hdr = buf->b_hdr; - ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); + ASSERT3U(hdr->b_l1hdr.b_datacnt, ==, 1); + ASSERT3S(refcount_count(&hdr->b_l1hdr.b_refcnt), ==, 1); + ASSERT(!list_link_active(&hdr->b_l1hdr.b_arc_node)); + + ASSERT3P(buf->b_efunc, ==, NULL); + ASSERT3P(buf->b_private, ==, NULL); + + hdr->b_l1hdr.b_arc_access = 0; + arc_buf_thaw(buf); + + return; } - l2hdr = hdr->b_l2hdr; - if (l2hdr) { - mutex_enter(&l2arc_buflist_mtx); - hdr->b_l2hdr = NULL; - list_remove(l2hdr->b_dev->l2ad_buflist, hdr); + hash_lock = HDR_LOCK(hdr); + mutex_enter(hash_lock); + + /* + * This assignment is only valid as long as the hash_lock is + * held, we must be careful not to reference state or the + * b_state field after dropping the lock. + */ + state = hdr->b_l1hdr.b_state; + ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); + ASSERT3P(state, !=, arc_anon); + + /* this buffer is not on any list */ + ASSERT(refcount_count(&hdr->b_l1hdr.b_refcnt) > 0); + + if (HDR_HAS_L2HDR(hdr)) { + ARCSTAT_INCR(arcstat_l2_asize, -hdr->b_l2hdr.b_asize); + ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size); + + mutex_enter(&hdr->b_l2hdr.b_dev->l2ad_mtx); + list_remove(&hdr->b_l2hdr.b_dev->l2ad_buflist, hdr); + mutex_exit(&hdr->b_l2hdr.b_dev->l2ad_mtx); + + hdr->b_flags &= ~ARC_FLAG_HAS_L2HDR; } - buf_size = hdr->b_size; /* * Do we have more than one buf? 
*/ - if (hdr->b_datacnt > 1) { + if (hdr->b_l1hdr.b_datacnt > 1) { arc_buf_hdr_t *nhdr; arc_buf_t **bufp; uint64_t blksz = hdr->b_size; uint64_t spa = hdr->b_spa; - arc_buf_contents_t type = hdr->b_type; + arc_buf_contents_t type = arc_buf_type(hdr); uint32_t flags = hdr->b_flags; - ASSERT(hdr->b_buf != buf || buf->b_next != NULL); + ASSERT(hdr->b_l1hdr.b_buf != buf || buf->b_next != NULL); /* * Pull the data off of this hdr and attach it to * a new anonymous hdr. */ (void) remove_reference(hdr, hash_lock, tag); - bufp = &hdr->b_buf; + bufp = &hdr->b_l1hdr.b_buf; while (*bufp != buf) bufp = &(*bufp)->b_next; *bufp = buf->b_next; buf->b_next = NULL; - ASSERT3U(hdr->b_state->arcs_size, >=, hdr->b_size); - atomic_add_64(&hdr->b_state->arcs_size, -hdr->b_size); - if (refcount_is_zero(&hdr->b_refcnt)) { - uint64_t *size = &hdr->b_state->arcs_lsize[hdr->b_type]; + ASSERT3P(state, !=, arc_l2c_only); + ASSERT3U(state->arcs_size, >=, hdr->b_size); + atomic_add_64(&state->arcs_size, -hdr->b_size); + if (refcount_is_zero(&hdr->b_l1hdr.b_refcnt)) { + uint64_t *size; + + ASSERT3P(state, !=, arc_l2c_only); + size = &state->arcs_lsize[type]; ASSERT3U(*size, >=, hdr->b_size); atomic_add_64(size, -hdr->b_size); } @@ -3663,68 +3936,60 @@ arc_release(arc_buf_t *buf, void *tag) * We're releasing a duplicate user data buffer, update * our statistics accordingly. 
*/ - if (hdr->b_type == ARC_BUFC_DATA) { + if (HDR_ISTYPE_DATA(hdr)) { ARCSTAT_BUMPDOWN(arcstat_duplicate_buffers); ARCSTAT_INCR(arcstat_duplicate_buffers_size, -hdr->b_size); } - hdr->b_datacnt -= 1; + hdr->b_l1hdr.b_datacnt -= 1; arc_cksum_verify(buf); arc_buf_unwatch(buf); mutex_exit(hash_lock); - nhdr = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE); + nhdr = kmem_cache_alloc(hdr_full_cache, KM_PUSHPAGE); nhdr->b_size = blksz; nhdr->b_spa = spa; - nhdr->b_type = type; - nhdr->b_buf = buf; - nhdr->b_state = arc_anon; - nhdr->b_arc_access = 0; - nhdr->b_mru_hits = 0; - nhdr->b_mru_ghost_hits = 0; - nhdr->b_mfu_hits = 0; - nhdr->b_mfu_ghost_hits = 0; - nhdr->b_l2_hits = 0; + + nhdr->b_l1hdr.b_mru_hits = 0; + nhdr->b_l1hdr.b_mru_ghost_hits = 0; + nhdr->b_l1hdr.b_mfu_hits = 0; + nhdr->b_l1hdr.b_mfu_ghost_hits = 0; + nhdr->b_l1hdr.b_l2_hits = 0; nhdr->b_flags = flags & ARC_FLAG_L2_WRITING; - nhdr->b_l2hdr = NULL; - nhdr->b_datacnt = 1; + nhdr->b_flags |= arc_bufc_to_flags(type); + nhdr->b_flags |= ARC_FLAG_HAS_L1HDR; + + nhdr->b_l1hdr.b_buf = buf; + nhdr->b_l1hdr.b_datacnt = 1; + nhdr->b_l1hdr.b_state = arc_anon; + nhdr->b_l1hdr.b_arc_access = 0; nhdr->b_freeze_cksum = NULL; - (void) refcount_add(&nhdr->b_refcnt, tag); + + (void) refcount_add(&nhdr->b_l1hdr.b_refcnt, tag); buf->b_hdr = nhdr; mutex_exit(&buf->b_evict_lock); atomic_add_64(&arc_anon->arcs_size, blksz); } else { mutex_exit(&buf->b_evict_lock); - ASSERT(refcount_count(&hdr->b_refcnt) == 1); - ASSERT(!list_link_active(&hdr->b_arc_node)); + ASSERT(refcount_count(&hdr->b_l1hdr.b_refcnt) == 1); + /* protected by hash lock */ + ASSERT(!list_link_active(&hdr->b_l1hdr.b_arc_node)); ASSERT(!HDR_IO_IN_PROGRESS(hdr)); - if (hdr->b_state != arc_anon) - arc_change_state(arc_anon, hdr, hash_lock); - hdr->b_arc_access = 0; - hdr->b_mru_hits = 0; - hdr->b_mru_ghost_hits = 0; - hdr->b_mfu_hits = 0; - hdr->b_mfu_ghost_hits = 0; - hdr->b_l2_hits = 0; - if (hash_lock) - mutex_exit(hash_lock); + hdr->b_l1hdr.b_mru_hits = 0; + 
hdr->b_l1hdr.b_mru_ghost_hits = 0; + hdr->b_l1hdr.b_mfu_hits = 0; + hdr->b_l1hdr.b_mfu_ghost_hits = 0; + hdr->b_l1hdr.b_l2_hits = 0; + arc_change_state(arc_anon, hdr, hash_lock); + hdr->b_l1hdr.b_arc_access = 0; + mutex_exit(hash_lock); buf_discard_identity(hdr); arc_buf_thaw(buf); } buf->b_efunc = NULL; buf->b_private = NULL; - - if (l2hdr) { - ARCSTAT_INCR(arcstat_l2_asize, -l2hdr->b_asize); - vdev_space_update(l2hdr->b_dev->l2ad_vdev, - -l2hdr->b_asize, 0, 0); - kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t)); - arc_space_return(L2HDR_SIZE, ARC_SPACE_L2HDRS); - ARCSTAT_INCR(arcstat_l2_size, -buf_size); - mutex_exit(&l2arc_buflist_mtx); - } } int @@ -3733,7 +3998,8 @@ arc_released(arc_buf_t *buf) int released; mutex_enter(&buf->b_evict_lock); - released = (buf->b_data != NULL && buf->b_hdr->b_state == arc_anon); + released = (buf->b_data != NULL && + buf->b_hdr->b_l1hdr.b_state == arc_anon); mutex_exit(&buf->b_evict_lock); return (released); } @@ -3745,7 +4011,7 @@ arc_referenced(arc_buf_t *buf) int referenced; mutex_enter(&buf->b_evict_lock); - referenced = (refcount_count(&buf->b_hdr->b_refcnt)); + referenced = (refcount_count(&buf->b_hdr->b_l1hdr.b_refcnt)); mutex_exit(&buf->b_evict_lock); return (referenced); } @@ -3758,7 +4024,9 @@ arc_write_ready(zio_t *zio) arc_buf_t *buf = callback->awcb_buf; arc_buf_hdr_t *hdr = buf->b_hdr; - ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt)); + ASSERT(HDR_HAS_L1HDR(hdr)); + ASSERT(!refcount_is_zero(&buf->b_hdr->b_l1hdr.b_refcnt)); + ASSERT(hdr->b_l1hdr.b_datacnt > 0); callback->awcb_ready(zio, buf, callback->awcb_private); /* @@ -3768,12 +4036,12 @@ arc_write_ready(zio_t *zio) * accounting for any re-write attempt. 
*/ if (HDR_IO_IN_PROGRESS(hdr)) { - mutex_enter(&hdr->b_freeze_lock); + mutex_enter(&hdr->b_l1hdr.b_freeze_lock); if (hdr->b_freeze_cksum != NULL) { kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t)); hdr->b_freeze_cksum = NULL; } - mutex_exit(&hdr->b_freeze_lock); + mutex_exit(&hdr->b_l1hdr.b_freeze_lock); } arc_cksum_compute(buf, B_FALSE); hdr->b_flags |= ARC_FLAG_IO_IN_PROGRESS; @@ -3798,7 +4066,7 @@ arc_write_done(zio_t *zio) arc_buf_t *buf = callback->awcb_buf; arc_buf_hdr_t *hdr = buf->b_hdr; - ASSERT(hdr->b_acb == NULL); + ASSERT(hdr->b_l1hdr.b_acb == NULL); if (zio->io_error == 0) { if (BP_IS_HOLE(zio->io_bp) || BP_IS_EMBEDDED(zio->io_bp)) { @@ -3806,7 +4074,6 @@ arc_write_done(zio_t *zio) } else { hdr->b_dva = *BP_IDENTITY(zio->io_bp); hdr->b_birth = BP_PHYSICAL_BIRTH(zio->io_bp); - hdr->b_cksum0 = zio->io_bp->blk_cksum.zc_word[0]; } } else { ASSERT(BUF_EMPTY(hdr)); @@ -3827,7 +4094,7 @@ arc_write_done(zio_t *zio) arc_cksum_verify(buf); exists = buf_hash_insert(hdr, &hash_lock); - if (exists) { + if (exists != NULL) { /* * This can only happen if we overwrite for * sync-to-convergence, because we remove @@ -3837,7 +4104,8 @@ arc_write_done(zio_t *zio) if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp)) panic("bad overwrite, hdr=%p exists=%p", (void *)hdr, (void *)exists); - ASSERT(refcount_is_zero(&exists->b_refcnt)); + ASSERT(refcount_is_zero( + &exists->b_l1hdr.b_refcnt)); arc_change_state(arc_anon, exists, hash_lock); mutex_exit(hash_lock); arc_hdr_destroy(exists); @@ -3851,22 +4119,22 @@ arc_write_done(zio_t *zio) (void *)hdr, (void *)exists); } else { /* Dedup */ - ASSERT(hdr->b_datacnt == 1); - ASSERT(hdr->b_state == arc_anon); + ASSERT(hdr->b_l1hdr.b_datacnt == 1); + ASSERT(hdr->b_l1hdr.b_state == arc_anon); ASSERT(BP_GET_DEDUP(zio->io_bp)); ASSERT(BP_GET_LEVEL(zio->io_bp) == 0); } } hdr->b_flags &= ~ARC_FLAG_IO_IN_PROGRESS; /* if it's not anon, we are doing a scrub */ - if (!exists && hdr->b_state == arc_anon) + if (exists == NULL && 
hdr->b_l1hdr.b_state == arc_anon) arc_access(hdr, hash_lock); mutex_exit(hash_lock); } else { hdr->b_flags &= ~ARC_FLAG_IO_IN_PROGRESS; } - ASSERT(!refcount_is_zero(&hdr->b_refcnt)); + ASSERT(!refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); callback->awcb_done(zio, buf, callback->awcb_private); kmem_free(callback, sizeof (arc_write_callback_t)); @@ -3886,8 +4154,9 @@ arc_write(zio_t *pio, spa_t *spa, uint64_t txg, ASSERT(ready != NULL); ASSERT(done != NULL); ASSERT(!HDR_IO_ERROR(hdr)); - ASSERT((hdr->b_flags & ARC_FLAG_IO_IN_PROGRESS) == 0); - ASSERT(hdr->b_acb == NULL); + ASSERT(!HDR_IO_IN_PROGRESS(hdr)); + ASSERT(hdr->b_l1hdr.b_acb == NULL); + ASSERT(hdr->b_l1hdr.b_datacnt > 0); if (l2arc) hdr->b_flags |= ARC_FLAG_L2CACHE; if (l2arc_compress) @@ -4102,25 +4371,35 @@ arc_init(void) mutex_init(&arc_l2c_only->arcs_mtx, NULL, MUTEX_DEFAULT, NULL); list_create(&arc_mru->arcs_list[ARC_BUFC_METADATA], - sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); + sizeof (arc_buf_hdr_t), + offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node)); list_create(&arc_mru->arcs_list[ARC_BUFC_DATA], - sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); + sizeof (arc_buf_hdr_t), + offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node)); list_create(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA], - sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); + sizeof (arc_buf_hdr_t), + offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node)); list_create(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA], - sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); + sizeof (arc_buf_hdr_t), + offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node)); list_create(&arc_mfu->arcs_list[ARC_BUFC_METADATA], - sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); + sizeof (arc_buf_hdr_t), + offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node)); list_create(&arc_mfu->arcs_list[ARC_BUFC_DATA], - sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); + sizeof (arc_buf_hdr_t), + offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node)); 
list_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA], - sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); + sizeof (arc_buf_hdr_t), + offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node)); list_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA], - sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); + sizeof (arc_buf_hdr_t), + offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node)); list_create(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA], - sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); + sizeof (arc_buf_hdr_t), + offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node)); list_create(&arc_l2c_only->arcs_list[ARC_BUFC_DATA], - sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); + sizeof (arc_buf_hdr_t), + offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node)); arc_anon->arcs_state = ARC_STATE_ANON; arc_mru->arcs_state = ARC_STATE_MRU; @@ -4231,7 +4510,7 @@ arc_fini(void) buf_fini(); - ASSERT(arc_loaned_bytes == 0); + ASSERT0(arc_loaned_bytes); } /* @@ -4390,7 +4669,7 @@ l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *hdr) * 3. has an I/O in progress (it may be an incomplete read). * 4. is flagged not eligible (zfs property). */ - if (hdr->b_spa != spa_guid || hdr->b_l2hdr != NULL || + if (hdr->b_spa != spa_guid || HDR_HAS_L2HDR(hdr) || HDR_IO_IN_PROGRESS(hdr) || !HDR_L2CACHE(hdr)) return (B_FALSE); @@ -4443,20 +4722,6 @@ l2arc_write_interval(clock_t began, uint64_t wanted, uint64_t wrote) return (next); } -static void -l2arc_hdr_stat_add(void) -{ - ARCSTAT_INCR(arcstat_l2_hdr_size, HDR_SIZE); - ARCSTAT_INCR(arcstat_hdr_size, -HDR_SIZE); -} - -static void -l2arc_hdr_stat_remove(void) -{ - ARCSTAT_INCR(arcstat_l2_hdr_size, -HDR_SIZE); - ARCSTAT_INCR(arcstat_hdr_size, HDR_SIZE); -} - /* * Cycle through L2ARC devices. This is how L2ARC load balances. * If a device is returned, this also returns holding the spa config lock. 
@@ -4553,7 +4818,6 @@ l2arc_write_done(zio_t *zio) l2arc_dev_t *dev; list_t *buflist; arc_buf_hdr_t *head, *hdr, *hdr_prev; - l2arc_buf_hdr_t *abl2; kmutex_t *hash_lock; int64_t bytes_dropped = 0; @@ -4563,7 +4827,7 @@ l2arc_write_done(zio_t *zio) ASSERT(dev != NULL); head = cb->l2wcb_head; ASSERT(head != NULL); - buflist = dev->l2ad_buflist; + buflist = &dev->l2ad_buflist; ASSERT(buflist != NULL); DTRACE_PROBE2(l2arc__iodone, zio_t *, zio, l2arc_write_callback_t *, cb); @@ -4571,42 +4835,43 @@ l2arc_write_done(zio_t *zio) if (zio->io_error != 0) ARCSTAT_BUMP(arcstat_l2_writes_error); - mutex_enter(&l2arc_buflist_mtx); + mutex_enter(&dev->l2ad_mtx); /* * All writes completed, or an error was hit. */ for (hdr = list_prev(buflist, head); hdr; hdr = hdr_prev) { hdr_prev = list_prev(buflist, hdr); - abl2 = hdr->b_l2hdr; - - /* - * Release the temporary compressed buffer as soon as possible. - */ - if (abl2->b_compress != ZIO_COMPRESS_OFF) - l2arc_release_cdata_buf(hdr); hash_lock = HDR_LOCK(hdr); if (!mutex_tryenter(hash_lock)) { /* * This buffer misses out. It may be in a stage - * of eviction. Its ARC_L2_WRITING flag will be + * of eviction. Its ARC_FLAG_L2_WRITING flag will be * left set, denying reads to this buffer. */ ARCSTAT_BUMP(arcstat_l2_writes_hdr_miss); continue; } + /* + * It's possible that this buffer got evicted from the L1 cache + * before we grabbed the vdev + hash locks, in which case + * arc_hdr_realloc freed b_tmp_cdata for us if it was allocated. + * Only free the buffer if we still have an L1 hdr. + */ + if (HDR_HAS_L1HDR(hdr) && hdr->b_l1hdr.b_tmp_cdata != NULL && + HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF) + l2arc_release_cdata_buf(hdr); + if (zio->io_error != 0) { /* * Error - drop L2ARC entry. 
*/ list_remove(buflist, hdr); - ARCSTAT_INCR(arcstat_l2_asize, -abl2->b_asize); - bytes_dropped += abl2->b_asize; - hdr->b_l2hdr = NULL; - kmem_free(abl2, sizeof (l2arc_buf_hdr_t)); - arc_space_return(L2HDR_SIZE, ARC_SPACE_L2HDRS); + hdr->b_flags &= ~ARC_FLAG_HAS_L2HDR; + + ARCSTAT_INCR(arcstat_l2_asize, -hdr->b_l2hdr.b_asize); ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size); } @@ -4620,8 +4885,9 @@ l2arc_write_done(zio_t *zio) atomic_inc_64(&l2arc_writes_done); list_remove(buflist, head); - kmem_cache_free(hdr_cache, head); - mutex_exit(&l2arc_buflist_mtx); + ASSERT(!HDR_HAS_L1HDR(head)); + kmem_cache_free(hdr_l2only_cache, head); + mutex_exit(&dev->l2ad_mtx); vdev_space_update(dev->l2ad_vdev, -bytes_dropped, 0, 0); @@ -4759,16 +5025,12 @@ static void l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all) { list_t *buflist; - l2arc_buf_hdr_t *abl2; arc_buf_hdr_t *hdr, *hdr_prev; kmutex_t *hash_lock; uint64_t taddr; int64_t bytes_evicted = 0; - buflist = dev->l2ad_buflist; - - if (buflist == NULL) - return; + buflist = &dev->l2ad_buflist; if (!all && dev->l2ad_first) { /* @@ -4791,7 +5053,7 @@ l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all) uint64_t, taddr, boolean_t, all); top: - mutex_enter(&l2arc_buflist_mtx); + mutex_enter(&dev->l2ad_mtx); for (hdr = list_tail(buflist); hdr; hdr = hdr_prev) { hdr_prev = list_prev(buflist, hdr); @@ -4801,7 +5063,7 @@ l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all) * Missed the hash lock. Retry. 
*/ ARCSTAT_BUMP(arcstat_l2_evict_lock_retry); - mutex_exit(&l2arc_buflist_mtx); + mutex_exit(&dev->l2ad_mtx); mutex_enter(hash_lock); mutex_exit(hash_lock); goto top; @@ -4817,9 +5079,9 @@ l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all) continue; } - if (!all && hdr->b_l2hdr != NULL && - (hdr->b_l2hdr->b_daddr > taddr || - hdr->b_l2hdr->b_daddr < dev->l2ad_hand)) { + if (!all && HDR_HAS_L2HDR(hdr) && + (hdr->b_l2hdr.b_daddr > taddr || + hdr->b_l2hdr.b_daddr < dev->l2ad_hand)) { /* * We've evicted to the target address, * or the end of the device. @@ -4828,15 +5090,8 @@ l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all) break; } - if (HDR_FREE_IN_PROGRESS(hdr)) { - /* - * Already on the path to destruction. - */ - mutex_exit(hash_lock); - continue; - } - - if (hdr->b_state == arc_l2c_only) { + ASSERT(HDR_HAS_L2HDR(hdr)); + if (!HDR_HAS_L1HDR(hdr)) { ASSERT(!HDR_L2_READING(hdr)); /* * This doesn't exist in the ARC. Destroy. @@ -4846,6 +5101,8 @@ l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all) arc_change_state(arc_anon, hdr, hash_lock); arc_hdr_destroy(hdr); } else { + ASSERT(hdr->b_l1hdr.b_state != arc_l2c_only); + ARCSTAT_BUMP(arcstat_l2_evict_l1cached); /* * Invalidate issued or about to be issued * reads, since we may be about to write @@ -4859,26 +5116,18 @@ l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all) /* * Tell ARC this no longer exists in L2ARC. */ - if (hdr->b_l2hdr != NULL) { - abl2 = hdr->b_l2hdr; - ARCSTAT_INCR(arcstat_l2_asize, -abl2->b_asize); - bytes_evicted += abl2->b_asize; - hdr->b_l2hdr = NULL; - kmem_free(abl2, sizeof (l2arc_buf_hdr_t)); - arc_space_return(L2HDR_SIZE, ARC_SPACE_L2HDRS); - ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size); - } + /* Tell ARC this no longer exists in L2ARC. 
*/ + ARCSTAT_INCR(arcstat_l2_asize, -hdr->b_l2hdr.b_asize); + ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size); + hdr->b_flags &= ~ARC_FLAG_HAS_L2HDR; list_remove(buflist, hdr); - /* - * This may have been leftover after a - * failed write. - */ + /* This may have been leftover after a failed write. */ hdr->b_flags &= ~ARC_FLAG_L2_WRITING; } mutex_exit(hash_lock); } - mutex_exit(&l2arc_buflist_mtx); + mutex_exit(&dev->l2ad_mtx); vdev_space_update(dev->l2ad_vdev, -bytes_evicted, 0, 0); dev->l2ad_evict = taddr; @@ -4920,8 +5169,9 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz, pio = NULL; write_sz = write_asize = write_psize = 0; full = B_FALSE; - head = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE); + head = kmem_cache_alloc(hdr_l2only_cache, KM_PUSHPAGE); head->b_flags |= ARC_FLAG_L2_WRITE_HEAD; + head->b_flags |= ARC_FLAG_HAS_L2HDR; /* * We will want to try to compress buffers that are at least 2x the @@ -4932,7 +5182,7 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz, /* * Copy buffers for L2ARC writing. */ - mutex_enter(&l2arc_buflist_mtx); + mutex_enter(&dev->l2ad_mtx); for (try = 0; try <= 3; try++) { uint64_t passed_sz = 0; @@ -4954,7 +5204,6 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz, headroom = (headroom * l2arc_headroom_boost) / 100; for (; hdr; hdr = hdr_prev) { - l2arc_buf_hdr_t *l2hdr; kmutex_t *hash_lock; uint64_t buf_sz; @@ -4997,7 +5246,7 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz, * l2arc_write_done() can find where the * write buffers begin without searching. */ - list_insert_head(dev->l2ad_buflist, head); + list_insert_head(&dev->l2ad_buflist, head); cb = kmem_alloc(sizeof (l2arc_write_callback_t), KM_SLEEP); @@ -5010,37 +5259,33 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz, /* * Create and add a new L2ARC header. 
*/ - l2hdr = kmem_zalloc(sizeof (l2arc_buf_hdr_t), - KM_PUSHPAGE); - l2hdr->b_dev = dev; - arc_space_consume(L2HDR_SIZE, ARC_SPACE_L2HDRS); - + hdr->b_l2hdr.b_dev = dev; + arc_space_consume(HDR_L2ONLY_SIZE, ARC_SPACE_L2HDRS); hdr->b_flags |= ARC_FLAG_L2_WRITING; - /* * Temporarily stash the data buffer in b_tmp_cdata. * The subsequent write step will pick it up from - * there. This is because can't access hdr->b_buf + * there. This is because can't access b_l1hdr.b_buf * without holding the hash_lock, which we in turn * can't access without holding the ARC list locks * (which we want to avoid during compression/writing) */ - l2hdr->b_compress = ZIO_COMPRESS_OFF; - l2hdr->b_asize = hdr->b_size; - l2hdr->b_tmp_cdata = hdr->b_buf->b_data; - l2hdr->b_hits = 0; + HDR_SET_COMPRESS(hdr, ZIO_COMPRESS_OFF); + hdr->b_l2hdr.b_asize = hdr->b_size; + hdr->b_l2hdr.b_hits = 0; + hdr->b_l1hdr.b_tmp_cdata = hdr->b_l1hdr.b_buf->b_data; buf_sz = hdr->b_size; - hdr->b_l2hdr = l2hdr; + hdr->b_flags |= ARC_FLAG_HAS_L2HDR; - list_insert_head(dev->l2ad_buflist, hdr); + list_insert_head(&dev->l2ad_buflist, hdr); /* * Compute and store the buffer cksum before * writing. On debug the cksum is verified first. */ - arc_cksum_verify(hdr->b_buf); - arc_cksum_compute(hdr->b_buf, B_TRUE); + arc_cksum_verify(hdr->b_l1hdr.b_buf); + arc_cksum_compute(hdr->b_l1hdr.b_buf, B_TRUE); mutex_exit(hash_lock); @@ -5056,8 +5301,9 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz, /* No buffers selected for writing? */ if (pio == NULL) { ASSERT0(write_sz); - mutex_exit(&l2arc_buflist_mtx); - kmem_cache_free(hdr_cache, head); + mutex_exit(&dev->l2ad_mtx); + ASSERT(!HDR_HAS_L1HDR(head)); + kmem_cache_free(hdr_l2only_cache, head); return (0); } @@ -5066,24 +5312,22 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz, * and work backwards, retracing the course of the buffer selector * loop above. 
*/ - for (hdr = list_prev(dev->l2ad_buflist, head); hdr; - hdr = list_prev(dev->l2ad_buflist, hdr)) { - l2arc_buf_hdr_t *l2hdr; + for (hdr = list_prev(&dev->l2ad_buflist, head); hdr; + hdr = list_prev(&dev->l2ad_buflist, hdr)) { uint64_t buf_sz; /* * We shouldn't need to lock the buffer here, since we flagged * it as ARC_FLAG_L2_WRITING in the previous step, but we must * take care to only access its L2 cache parameters. In - * particular, hdr->b_buf may be invalid by now due to + * particular, hdr->l1hdr.b_buf may be invalid by now due to * ARC eviction. */ - l2hdr = hdr->b_l2hdr; - l2hdr->b_daddr = dev->l2ad_hand; + hdr->b_l2hdr.b_daddr = dev->l2ad_hand; - if (!l2arc_nocompress && (hdr->b_flags & ARC_FLAG_L2COMPRESS) && - l2hdr->b_asize >= buf_compress_minsz) { - if (l2arc_compress_buf(l2hdr)) { + if ((!l2arc_nocompress && HDR_L2COMPRESS(hdr)) && + hdr->b_l2hdr.b_asize >= buf_compress_minsz) { + if (l2arc_compress_buf(hdr)) { /* * If compression succeeded, enable headroom * boost on the next scan cycle. @@ -5096,8 +5340,8 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz, * Pick up the buffer data we had previously stashed away * (and now potentially also compressed). */ - buf_data = l2hdr->b_tmp_cdata; - buf_sz = l2hdr->b_asize; + buf_data = hdr->b_l1hdr.b_tmp_cdata; + buf_sz = hdr->b_l2hdr.b_asize; /* Compression may have squashed the buffer to zero length. */ if (buf_sz != 0) { @@ -5122,7 +5366,7 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz, } } - mutex_exit(&l2arc_buflist_mtx); + mutex_exit(&dev->l2ad_mtx); ASSERT3U(write_asize, <=, target_sz); ARCSTAT_BUMP(arcstat_l2_writes_sent); @@ -5150,7 +5394,7 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz, /* * Compresses an L2ARC buffer. - * The data to be compressed must be prefilled in l2hdr->b_tmp_cdata and its + * The data to be compressed must be prefilled in l1hdr.b_tmp_cdata and its * size in l2hdr->b_asize. 
This routine tries to compress the data and * depending on the compression result there are three possible outcomes: * *) The buffer was incompressible. The original l2hdr contents were left @@ -5168,17 +5412,24 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz, * buffer was incompressible). */ static boolean_t -l2arc_compress_buf(l2arc_buf_hdr_t *l2hdr) +l2arc_compress_buf(arc_buf_hdr_t *hdr) { void *cdata; size_t csize, len, rounded; + l2arc_buf_hdr_t *l2hdr; - ASSERT(l2hdr->b_compress == ZIO_COMPRESS_OFF); - ASSERT(l2hdr->b_tmp_cdata != NULL); + ASSERT(HDR_HAS_L2HDR(hdr)); + + l2hdr = &hdr->b_l2hdr; + + ASSERT(HDR_HAS_L1HDR(hdr)); + ASSERT(HDR_GET_COMPRESS(hdr) == ZIO_COMPRESS_OFF); + ASSERT(hdr->b_l1hdr.b_tmp_cdata != NULL); len = l2hdr->b_asize; cdata = zio_data_buf_alloc(len); - csize = zio_compress_data(ZIO_COMPRESS_LZ4, l2hdr->b_tmp_cdata, + ASSERT3P(cdata, !=, NULL); + csize = zio_compress_data(ZIO_COMPRESS_LZ4, hdr->b_l1hdr.b_tmp_cdata, cdata, l2hdr->b_asize); rounded = P2ROUNDUP(csize, (size_t)SPA_MINBLOCKSIZE); @@ -5190,9 +5441,9 @@ l2arc_compress_buf(l2arc_buf_hdr_t *l2hdr) if (csize == 0) { /* zero block, indicate that there's nothing to write */ zio_data_buf_free(cdata, len); - l2hdr->b_compress = ZIO_COMPRESS_EMPTY; + HDR_SET_COMPRESS(hdr, ZIO_COMPRESS_EMPTY); l2hdr->b_asize = 0; - l2hdr->b_tmp_cdata = NULL; + hdr->b_l1hdr.b_tmp_cdata = NULL; ARCSTAT_BUMP(arcstat_l2_compress_zeros); return (B_TRUE); } else if (csize > 0 && csize < len) { @@ -5200,9 +5451,9 @@ l2arc_compress_buf(l2arc_buf_hdr_t *l2hdr) * Compression succeeded, we'll keep the cdata around for * writing and release it afterwards. 
*/ - l2hdr->b_compress = ZIO_COMPRESS_LZ4; + HDR_SET_COMPRESS(hdr, ZIO_COMPRESS_LZ4); l2hdr->b_asize = csize; - l2hdr->b_tmp_cdata = cdata; + hdr->b_l1hdr.b_tmp_cdata = cdata; ARCSTAT_BUMP(arcstat_l2_compress_successes); return (B_TRUE); } else { @@ -5250,9 +5501,9 @@ l2arc_decompress_zio(zio_t *zio, arc_buf_hdr_t *hdr, enum zio_compress c) * need to fill its io_data after we're done restoring the * buffer's contents. */ - ASSERT(hdr->b_buf != NULL); - bzero(hdr->b_buf->b_data, hdr->b_size); - zio->io_data = zio->io_orig_data = hdr->b_buf->b_data; + ASSERT(hdr->b_l1hdr.b_buf != NULL); + bzero(hdr->b_l1hdr.b_buf->b_data, hdr->b_size); + zio->io_data = zio->io_orig_data = hdr->b_l1hdr.b_buf->b_data; } else { ASSERT(zio->io_data != NULL); /* @@ -5287,17 +5538,17 @@ l2arc_decompress_zio(zio_t *zio, arc_buf_hdr_t *hdr, enum zio_compress c) static void l2arc_release_cdata_buf(arc_buf_hdr_t *hdr) { - l2arc_buf_hdr_t *l2hdr = hdr->b_l2hdr; - - if (l2hdr->b_compress == ZIO_COMPRESS_LZ4) { + ASSERT(HDR_HAS_L1HDR(hdr)); + if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_EMPTY) { /* * If the data was compressed, then we've allocated a * temporary buffer for it, so now we need to release it. */ - ASSERT(l2hdr->b_tmp_cdata != NULL); - zio_data_buf_free(l2hdr->b_tmp_cdata, hdr->b_size); + ASSERT(hdr->b_l1hdr.b_tmp_cdata != NULL); + zio_data_buf_free(hdr->b_l1hdr.b_tmp_cdata, + hdr->b_size); } - l2hdr->b_tmp_cdata = NULL; + hdr->b_l1hdr.b_tmp_cdata = NULL; } /* @@ -5442,13 +5693,13 @@ l2arc_add_vdev(spa_t *spa, vdev_t *vd) adddev->l2ad_writing = B_FALSE; list_link_init(&adddev->l2ad_node); + mutex_init(&adddev->l2ad_mtx, NULL, MUTEX_DEFAULT, NULL); /* * This is a list of all ARC buffers that are still valid on the * device. 
*/ - adddev->l2ad_buflist = kmem_zalloc(sizeof (list_t), KM_SLEEP); - list_create(adddev->l2ad_buflist, sizeof (arc_buf_hdr_t), - offsetof(arc_buf_hdr_t, b_l2node)); + list_create(&adddev->l2ad_buflist, sizeof (arc_buf_hdr_t), + offsetof(arc_buf_hdr_t, b_l2hdr.b_l2node)); vdev_space_update(vd, 0, 0, adddev->l2ad_end - adddev->l2ad_hand); @@ -5494,8 +5745,8 @@ l2arc_remove_vdev(vdev_t *vd) * Clear all buflists and ARC references. L2ARC device flush. */ l2arc_evict(remdev, 0, B_TRUE); - list_destroy(remdev->l2ad_buflist); - kmem_free(remdev->l2ad_buflist, sizeof (list_t)); + list_destroy(&remdev->l2ad_buflist); + mutex_destroy(&remdev->l2ad_mtx); kmem_free(remdev, sizeof (l2arc_dev_t)); } @@ -5510,7 +5761,6 @@ l2arc_init(void) mutex_init(&l2arc_feed_thr_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&l2arc_feed_thr_cv, NULL, CV_DEFAULT, NULL); mutex_init(&l2arc_dev_mtx, NULL, MUTEX_DEFAULT, NULL); - mutex_init(&l2arc_buflist_mtx, NULL, MUTEX_DEFAULT, NULL); mutex_init(&l2arc_free_on_write_mtx, NULL, MUTEX_DEFAULT, NULL); l2arc_dev_list = &L2ARC_dev_list; @@ -5535,7 +5785,6 @@ l2arc_fini(void) mutex_destroy(&l2arc_feed_thr_lock); cv_destroy(&l2arc_feed_thr_cv); mutex_destroy(&l2arc_dev_mtx); - mutex_destroy(&l2arc_buflist_mtx); mutex_destroy(&l2arc_free_on_write_mtx); list_destroy(l2arc_dev_list); From ca0bf58d65f77e944b9905571df9a2eae647aeca Mon Sep 17 00:00:00 2001 From: Prakash Surya Date: Mon, 12 Jan 2015 19:52:19 -0800 Subject: [PATCH 07/11] Illumos 5497 - lock contention on arcs_mtx Reviewed by: George Wilson Reviewed by: Matthew Ahrens Reviewed by: Richard Elling Approved by: Dan McDonald Porting notes and other significant code changes: The illumos 5368 patch (ARC should cache more metadata), which was never picked up by ZoL, is mostly reverted by this patch. Since ZoL relies on the kernel asynchronously calling the shrinker to actually reap memory, the shrinker wakes up arc_reclaim_waiters_cv every time it runs. 
The arc_adapt_thread() function no longer calls arc_do_user_evicts() since the newly-added arc_user_evicts_thread() calls it periodically. Notable ZoL commits which conflicted with this patch or whose effects are either duplicated or un-done by this patch: 302f753 - Integrate ARC more tightly with Linux 39e055c - Adjust arc_p based on "bytes" in arc_shrink f521ce1 - Allow "arc_p" to drop to zero or grow to "arc_c" 77765b5 - Remove "arc_meta_used" from arc_adjust calculation 94520ca - Prune metadata from ghost lists in arc_adjust_meta Trace support for multilist_insert() and multilist_remove() has been added and produces the following output: fio-12498 [077] .... 112936.448324: zfs_multilist__insert: ml { offset 240 numsublists 80 sublistidx 63 } fio-12498 [077] .... 112936.448347: zfs_multilist__remove: ml { offset 240 numsublists 80 sublistidx 29 } The following arcstats have been removed: recycle_miss - Used by arcstat.py and arc_summary.py, both of which have been updated appropriately. l2_writes_hdr_miss The following arcstats have been added: evict_not_enough - Number of times arc_evict_state() was unable to evict enough buffers to reach its target amount. evict_l2_skip - Number of times arc_evict_hdr() skipped eviction because it was being written to the l2arc. l2_writes_lock_retry - Replaces l2_writes_hdr_miss. Number of times l2arc_write_done() failed to acquire hash_lock (and re-tries). arc_meta_min - Shows the value of the zfs_arc_meta_min module parameter (see below). The "index" column of the "dbuf" kstat has been removed since it doesn't have a direct analog in the new multilist scheme. Additional multilist-related stats could be added in the future but would likely require extensions to the multilist API. The following module parameters have been added: zfs_arc_evict_batch_limit - Number of ARC headers to free per sub-list before moving on to the next sub-list. zfs_arc_meta_min - Enforce a floor on the amount of metadata in the ARC.
zfs_arc_num_sublists_per_state - Number of multilist sub-lists per ARC state. zfs_arc_overflow_shift - Controls amount by which the ARC must exceed the target size to be considered "overflowing". Ported-by: Tim Chase Signed-off-by: Brian Behlendorf #include +/* + * Used by arc_flush() to inform arc_evict_state() that it should evict + * all available buffers from the arc state being passed in. + */ +#define ARC_EVICT_ALL -1ULL + typedef struct arc_buf_hdr arc_buf_hdr_t; typedef struct arc_buf arc_buf_t; typedef struct arc_prune arc_prune_t; @@ -146,7 +152,6 @@ typedef enum arc_state_type { typedef struct arc_buf_info { arc_state_type_t abi_state_type; arc_buf_contents_t abi_state_contents; - uint64_t abi_state_index; uint32_t abi_flags; uint32_t abi_datacnt; uint64_t abi_size; @@ -200,7 +205,7 @@ void arc_freed(spa_t *spa, const blkptr_t *bp); void arc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *private); boolean_t arc_clear_callback(arc_buf_t *buf); -void arc_flush(spa_t *spa); +void arc_flush(spa_t *spa, boolean_t retry); void arc_tempreserve_clear(uint64_t reserve); int arc_tempreserve_space(uint64_t reserve, uint64_t txg); diff --git a/include/sys/arc_impl.h b/include/sys/arc_impl.h index 556cc258330d..54f5e9f4094a 100644 --- a/include/sys/arc_impl.h +++ b/include/sys/arc_impl.h @@ -67,10 +67,22 @@ extern "C" { */ typedef struct arc_state { - list_t arcs_list[ARC_BUFC_NUMTYPES]; /* list of evictable buffers */ - uint64_t arcs_lsize[ARC_BUFC_NUMTYPES]; /* amount of evictable data */ - uint64_t arcs_size; /* total amount of data in this state */ - kmutex_t arcs_mtx; + /* + * list of evictable buffers + */ + multilist_t arcs_list[ARC_BUFC_NUMTYPES]; + /* + * total amount of evictable data in this state + */ + uint64_t arcs_lsize[ARC_BUFC_NUMTYPES]; + /* + * total amount of data in this state; this includes: evictable, + * non-evictable, ARC_BUFC_DATA, and ARC_BUFC_METADATA. 
+ */ + uint64_t arcs_size; + /* + * supports the "dbufs" kstat + */ arc_state_type_t arcs_state; } arc_state_t; @@ -136,7 +148,7 @@ typedef struct l1arc_buf_hdr { /* protected by arc state mutex */ arc_state_t *b_state; - list_node_t b_arc_node; + multilist_node_t b_arc_node; /* updated atomically */ clock_t b_arc_access; diff --git a/include/sys/multilist.h b/include/sys/multilist.h new file mode 100644 index 000000000000..98d707dd71ef --- /dev/null +++ b/include/sys/multilist.h @@ -0,0 +1,105 @@ +/* + * CDDL HEADER START + * + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2013, 2014 by Delphix. All rights reserved. + */ + +#ifndef _SYS_MULTILIST_H +#define _SYS_MULTILIST_H + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +typedef list_node_t multilist_node_t; +typedef struct multilist multilist_t; +typedef struct multilist_sublist multilist_sublist_t; +typedef unsigned int multilist_sublist_index_func_t(multilist_t *, void *); + +struct multilist_sublist { + /* + * The mutex used internally to implement thread safe insertions + * and removals to this individual sublist. It can also be locked + * by a consumer using multilist_sublist_{lock,unlock}, which is + * useful if a consumer needs to traverse the list in a thread + * safe manner. + */ + kmutex_t mls_lock; + /* + * The actual list object containing all objects in this sublist. + */ + list_t mls_list; + /* + * Pad to cache line, in an effort to try and prevent cache line + * contention. 
+ */ +} ____cacheline_aligned; + +struct multilist { + /* + * This is used to get to the multilist_node_t structure given + * the void *object contained on the list. + */ + size_t ml_offset; + /* + * The number of sublists used internally by this multilist. + */ + uint64_t ml_num_sublists; + /* + * The array of pointers to the actual sublists. + */ + multilist_sublist_t *ml_sublists; + /* + * Pointer to function which determines the sublist to use + * when inserting and removing objects from this multilist. + * Please see the comment above multilist_create for details. + */ + multilist_sublist_index_func_t *ml_index_func; +}; + +void multilist_destroy(multilist_t *); +void multilist_create(multilist_t *, size_t, size_t, unsigned int, + multilist_sublist_index_func_t *); + +void multilist_insert(multilist_t *, void *); +void multilist_remove(multilist_t *, void *); +int multilist_is_empty(multilist_t *); + +unsigned int multilist_get_num_sublists(multilist_t *); +unsigned int multilist_get_random_index(multilist_t *); + +multilist_sublist_t *multilist_sublist_lock(multilist_t *, unsigned int); +void multilist_sublist_unlock(multilist_sublist_t *); + +void multilist_sublist_insert_head(multilist_sublist_t *, void *); +void multilist_sublist_insert_tail(multilist_sublist_t *, void *); +void multilist_sublist_move_forward(multilist_sublist_t *mls, void *obj); +void multilist_sublist_remove(multilist_sublist_t *, void *); + +void *multilist_sublist_head(multilist_sublist_t *); +void *multilist_sublist_tail(multilist_sublist_t *); +void *multilist_sublist_next(multilist_sublist_t *, void *); +void *multilist_sublist_prev(multilist_sublist_t *, void *); + +void multilist_link_init(multilist_node_t *); +int multilist_link_active(multilist_node_t *); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_MULTILIST_H */ diff --git a/include/sys/trace_multilist.h b/include/sys/trace_multilist.h new file mode 100644 index 000000000000..11d2f2701ac4 --- /dev/null +++ 
b/include/sys/trace_multilist.h @@ -0,0 +1,76 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +#if defined(_KERNEL) && defined(HAVE_DECLARE_EVENT_CLASS) + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM zfs + +#if !defined(_TRACE_MULTILIST_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_MULTILIST_H + +#include +#include + +/* + * Generic support for three argument tracepoints of the form: + * + * DTRACE_PROBE3(..., + * multilist_t *, ..., + * unsigned int, ..., + * void *, ...); + */ + +DECLARE_EVENT_CLASS(zfs_multilist_insert_remove_class, + TP_PROTO(multilist_t *ml, unsigned sublist_idx, void *obj), + TP_ARGS(ml, sublist_idx, obj), + TP_STRUCT__entry( + __field(size_t, ml_offset) + __field(uint64_t, ml_num_sublists) + + __field(unsigned int, sublist_idx) + ), + TP_fast_assign( + __entry->ml_offset = ml->ml_offset; + __entry->ml_num_sublists = ml->ml_num_sublists; + + __entry->sublist_idx = sublist_idx; + ), + TP_printk("ml { offset %ld numsublists %llu sublistidx %u } ", + __entry->ml_offset, __entry->ml_num_sublists, __entry->sublist_idx) +); + +#define DEFINE_MULTILIST_INSERT_REMOVE_EVENT(name) \ +DEFINE_EVENT(zfs_multilist_insert_remove_class, name, \ + TP_PROTO(multilist_t 
*ml, unsigned int sublist_idx, void *obj), \ + TP_ARGS(ml, sublist_idx, obj)) +DEFINE_MULTILIST_INSERT_REMOVE_EVENT(zfs_multilist__insert); +DEFINE_MULTILIST_INSERT_REMOVE_EVENT(zfs_multilist__remove); + +#endif /* _TRACE_MULTILIST_H */ + +#undef TRACE_INCLUDE_PATH +#undef TRACE_INCLUDE_FILE +#define TRACE_INCLUDE_PATH sys +#define TRACE_INCLUDE_FILE trace_multilist +#include + +#endif /* _KERNEL && HAVE_DECLARE_EVENT_CLASS */ diff --git a/include/sys/zfs_context.h b/include/sys/zfs_context.h index 3dc54f1d7d90..761b1d57a116 100644 --- a/include/sys/zfs_context.h +++ b/include/sys/zfs_context.h @@ -609,6 +609,7 @@ extern void delay(clock_t ticks); } while (0); #define max_ncpus 64 +#define num_online_cpus() (sysconf(_SC_NPROCESSORS_ONLN)) #define minclsyspri 60 #define maxclsyspri 99 diff --git a/lib/libzpool/Makefile.am b/lib/libzpool/Makefile.am index 85bc0510a81d..e25591300ef7 100644 --- a/lib/libzpool/Makefile.am +++ b/lib/libzpool/Makefile.am @@ -55,6 +55,7 @@ libzpool_la_SOURCES = \ $(top_srcdir)/module/zfs/lzjb.c \ $(top_srcdir)/module/zfs/lz4.c \ $(top_srcdir)/module/zfs/metaslab.c \ + $(top_srcdir)/module/zfs/multilist.c \ $(top_srcdir)/module/zfs/range_tree.c \ $(top_srcdir)/module/zfs/refcount.c \ $(top_srcdir)/module/zfs/rrwlock.c \ diff --git a/man/man5/zfs-module-parameters.5 b/man/man5/zfs-module-parameters.5 index 359e9f72f35e..250adc9efa2b 100644 --- a/man/man5/zfs-module-parameters.5 +++ b/man/man5/zfs-module-parameters.5 @@ -347,6 +347,19 @@ increased to reduce the memory footprint. Default value: \fB8192\fR. .RE +.sp +.ne 2 +.na +\fBzfs_arc_evict_batch_limit\fR (int) +.ad +.RS 12n +Number of ARC headers to evict per sub-list before proceeding to another sub-list. +This batch-style operation prevents entire sub-lists from being evicted at once +but comes at a cost of additional unlocking and locking. +.sp +Default value: \fB10\fR. +.RE + .sp .ne 2 .na @@ -395,6 +408,19 @@ for meta data. Default value: \fB0\fR.
.RE +.sp +.ne 2 +.na +\fBzfs_arc_meta_min\fR (ulong) +.ad +.RS 12n +The minimum allowed size in bytes that meta data buffers may consume in +the ARC. This value defaults to 0 which disables a floor on the amount +of the ARC devoted to meta data. +.sp +Default value: \fB0\fR. +.RE + .sp .ne 2 .na @@ -447,6 +473,40 @@ Min life of prefetch block Default value: \fB100\fR. .RE +.sp +.ne 2 +.na +\fBzfs_arc_num_sublists_per_state\fR (int) +.ad +.RS 12n +To allow more fine-grained locking, each ARC state contains a series +of lists for both data and meta data objects. Locking is performed at +the level of these "sub-lists". This parameter controls the number of +sub-lists per ARC state. +.sp +Default value: 1 or the number of online CPUs, whichever is greater +.RE + +.sp +.ne 2 +.na +\fBzfs_arc_overflow_shift\fR (int) +.ad +.RS 12n +The ARC size is considered to be overflowing if it exceeds the current +ARC target size (arc_c) by a threshold determined by this parameter. +The threshold is calculated as a fraction of arc_c using the formula +"arc_c >> \fBzfs_arc_overflow_shift\fR". + +The default value of 8 causes the ARC to be considered to be overflowing +if it exceeds the target size by 1/256th (0.3%) of the target size. + +When the ARC is overflowing, new buffer allocations are stalled until +the reclaim thread catches up and the overflow condition no longer exists. +.sp +Default value: \fB8\fR.
+.RE + .sp .ne 2 .na diff --git a/module/zfs/Makefile.in b/module/zfs/Makefile.in index 954841f33137..e5753ae81118 100644 --- a/module/zfs/Makefile.in +++ b/module/zfs/Makefile.in @@ -37,6 +37,7 @@ $(MODULE)-objs += @top_srcdir@/module/zfs/gzip.o $(MODULE)-objs += @top_srcdir@/module/zfs/lzjb.o $(MODULE)-objs += @top_srcdir@/module/zfs/lz4.o $(MODULE)-objs += @top_srcdir@/module/zfs/metaslab.o +$(MODULE)-objs += @top_srcdir@/module/zfs/multilist.o $(MODULE)-objs += @top_srcdir@/module/zfs/range_tree.o $(MODULE)-objs += @top_srcdir@/module/zfs/refcount.o $(MODULE)-objs += @top_srcdir@/module/zfs/rrwlock.o diff --git a/module/zfs/arc.c b/module/zfs/arc.c index e69889ab5810..67ef87daf137 100644 --- a/module/zfs/arc.c +++ b/module/zfs/arc.c @@ -135,6 +135,7 @@ #include #include #include +#include #ifdef _KERNEL #include #include @@ -154,9 +155,14 @@ boolean_t arc_watch = B_FALSE; #endif -static kmutex_t arc_reclaim_thr_lock; -static kcondvar_t arc_reclaim_thr_cv; /* used to signal reclaim thr */ -static uint8_t arc_thread_exit; +static kmutex_t arc_reclaim_lock; +static kcondvar_t arc_reclaim_thread_cv; +static boolean_t arc_reclaim_thread_exit; +static kcondvar_t arc_reclaim_waiters_cv; + +static kmutex_t arc_user_evicts_lock; +static kcondvar_t arc_user_evicts_cv; +static boolean_t arc_user_evicts_thread_exit; /* number of objects to prune from caches when arc_meta_limit is reached */ int zfs_arc_meta_prune = 10000; @@ -167,14 +173,27 @@ typedef enum arc_reclaim_strategy { } arc_reclaim_strategy_t; /* - * The number of iterations through arc_evict_*() before we - * drop & reacquire the lock. + * The number of headers to evict in arc_evict_state_impl() before + * dropping the sublist lock and evicting from another sublist. A lower + * value means we're more likely to evict the "correct" header (i.e. the + * oldest header in the arc state), but comes with higher overhead + * (i.e. more invocations of arc_evict_state_impl()). 
+ */ +int zfs_arc_evict_batch_limit = 10; + +/* + * The number of sublists used for each of the arc state lists. If this + * is not set to a suitable value by the user, it will be configured to + * the number of CPUs on the system in arc_init(). */ -int arc_evict_iterations = 100; +int zfs_arc_num_sublists_per_state = 0; /* number of seconds before growing cache again */ int zfs_arc_grow_retry = 5; +/* shift of arc_c for calculating overflow limit in arc_get_data_buf */ +int zfs_arc_overflow_shift = 8; + /* disable anon data aggressively growing arc_p */ int zfs_arc_p_aggressive_disable = 1; @@ -199,6 +218,12 @@ int zfs_disable_dup_eviction = 0; /* average block used to size buf_hash_table */ int zfs_arc_average_blocksize = 8 * 1024; /* 8KB */ +/* + * minimum lifespan of a prefetch block in clock ticks + * (initialized in arc_init()) + */ +static int arc_min_prefetch_lifespan; + /* * If this percent of memory is free, don't throttle. */ @@ -220,6 +245,7 @@ static boolean_t arc_warm; unsigned long zfs_arc_max = 0; unsigned long zfs_arc_min = 0; unsigned long zfs_arc_meta_limit = 0; +unsigned long zfs_arc_meta_min = 0; /* * Limit the number of restarts in arc_adjust_meta() @@ -250,7 +276,6 @@ typedef struct arc_stats { kstat_named_t arcstat_mfu_hits; kstat_named_t arcstat_mfu_ghost_hits; kstat_named_t arcstat_deleted; - kstat_named_t arcstat_recycle_miss; /* * Number of buffers that could not be evicted because the hash lock * was held by another thread. The lock may not necessarily be held @@ -264,9 +289,15 @@ typedef struct arc_stats { * not from the spa we're trying to evict from. */ kstat_named_t arcstat_evict_skip; + /* + * Number of times arc_evict_state() was unable to evict enough + * buffers to reach its target amount. 
+ */ + kstat_named_t arcstat_evict_not_enough; kstat_named_t arcstat_evict_l2_cached; kstat_named_t arcstat_evict_l2_eligible; kstat_named_t arcstat_evict_l2_ineligible; + kstat_named_t arcstat_evict_l2_skip; kstat_named_t arcstat_hash_elements; kstat_named_t arcstat_hash_elements_max; kstat_named_t arcstat_hash_collisions; @@ -305,11 +336,12 @@ typedef struct arc_stats { kstat_named_t arcstat_l2_writes_sent; kstat_named_t arcstat_l2_writes_done; kstat_named_t arcstat_l2_writes_error; - kstat_named_t arcstat_l2_writes_hdr_miss; + kstat_named_t arcstat_l2_writes_lock_retry; kstat_named_t arcstat_l2_evict_lock_retry; kstat_named_t arcstat_l2_evict_reading; kstat_named_t arcstat_l2_evict_l1cached; kstat_named_t arcstat_l2_free_on_write; + kstat_named_t arcstat_l2_cdata_free_on_write; kstat_named_t arcstat_l2_abort_lowmem; kstat_named_t arcstat_l2_cksum_bad; kstat_named_t arcstat_l2_io_error; @@ -332,6 +364,7 @@ typedef struct arc_stats { kstat_named_t arcstat_meta_used; kstat_named_t arcstat_meta_limit; kstat_named_t arcstat_meta_max; + kstat_named_t arcstat_meta_min; } arc_stats_t; static arc_stats_t arc_stats = { @@ -350,12 +383,13 @@ static arc_stats_t arc_stats = { { "mfu_hits", KSTAT_DATA_UINT64 }, { "mfu_ghost_hits", KSTAT_DATA_UINT64 }, { "deleted", KSTAT_DATA_UINT64 }, - { "recycle_miss", KSTAT_DATA_UINT64 }, { "mutex_miss", KSTAT_DATA_UINT64 }, { "evict_skip", KSTAT_DATA_UINT64 }, + { "evict_not_enough", KSTAT_DATA_UINT64 }, { "evict_l2_cached", KSTAT_DATA_UINT64 }, { "evict_l2_eligible", KSTAT_DATA_UINT64 }, { "evict_l2_ineligible", KSTAT_DATA_UINT64 }, + { "evict_l2_skip", KSTAT_DATA_UINT64 }, { "hash_elements", KSTAT_DATA_UINT64 }, { "hash_elements_max", KSTAT_DATA_UINT64 }, { "hash_collisions", KSTAT_DATA_UINT64 }, @@ -394,11 +428,12 @@ static arc_stats_t arc_stats = { { "l2_writes_sent", KSTAT_DATA_UINT64 }, { "l2_writes_done", KSTAT_DATA_UINT64 }, { "l2_writes_error", KSTAT_DATA_UINT64 }, - { "l2_writes_hdr_miss", KSTAT_DATA_UINT64 }, + { 
"l2_writes_lock_retry", KSTAT_DATA_UINT64 }, { "l2_evict_lock_retry", KSTAT_DATA_UINT64 }, { "l2_evict_reading", KSTAT_DATA_UINT64 }, { "l2_evict_l1cached", KSTAT_DATA_UINT64 }, { "l2_free_on_write", KSTAT_DATA_UINT64 }, + { "l2_cdata_free_on_write", KSTAT_DATA_UINT64 }, { "l2_abort_lowmem", KSTAT_DATA_UINT64 }, { "l2_cksum_bad", KSTAT_DATA_UINT64 }, { "l2_io_error", KSTAT_DATA_UINT64 }, @@ -421,6 +456,7 @@ static arc_stats_t arc_stats = { { "arc_meta_used", KSTAT_DATA_UINT64 }, { "arc_meta_limit", KSTAT_DATA_UINT64 }, { "arc_meta_max", KSTAT_DATA_UINT64 }, + { "arc_meta_min", KSTAT_DATA_UINT64 }, }; #define ARCSTAT(stat) (arc_stats.stat.value.ui64) @@ -486,6 +522,7 @@ static arc_state_t *arc_l2c_only; #define arc_tempreserve ARCSTAT(arcstat_tempreserve) #define arc_loaned_bytes ARCSTAT(arcstat_loaned_bytes) #define arc_meta_limit ARCSTAT(arcstat_meta_limit) /* max size for metadata */ +#define arc_meta_min ARCSTAT(arcstat_meta_min) /* min size for metadata */ #define arc_meta_used ARCSTAT(arcstat_meta_used) /* size of metadata */ #define arc_meta_max ARCSTAT(arcstat_meta_max) /* max size of metadata */ @@ -495,7 +532,6 @@ static arc_state_t *arc_l2c_only; static list_t arc_prune_list; static kmutex_t arc_prune_mtx; static arc_buf_t *arc_eviction_list; -static kmutex_t arc_eviction_mtx; static arc_buf_hdr_t arc_eviction_hdr; #define GHOST_STATE(state) \ @@ -637,8 +673,7 @@ static uint8_t l2arc_thread_exit; static void arc_get_data_buf(arc_buf_t *); static void arc_access(arc_buf_hdr_t *, kmutex_t *); -static int arc_evict_needed(arc_buf_contents_t); -static void arc_evict_ghost(arc_state_t *, uint64_t, int64_t); +static boolean_t arc_is_overflowing(void); static void arc_buf_watch(arc_buf_t *); static arc_buf_contents_t arc_buf_type(arc_buf_hdr_t *); @@ -830,6 +865,7 @@ hdr_full_cons(void *vbuf, void *unused, int kmflag) mutex_init(&hdr->b_l1hdr.b_freeze_lock, NULL, MUTEX_DEFAULT, NULL); list_link_init(&hdr->b_l1hdr.b_arc_node); 
list_link_init(&hdr->b_l2hdr.b_l2node); + multilist_link_init(&hdr->b_l1hdr.b_arc_node); arc_space_consume(HDR_FULL_SIZE, ARC_SPACE_HDRS); return (0); @@ -874,6 +910,7 @@ hdr_full_dest(void *vbuf, void *unused) cv_destroy(&hdr->b_l1hdr.b_cv); refcount_destroy(&hdr->b_l1hdr.b_refcnt); mutex_destroy(&hdr->b_l1hdr.b_freeze_lock); + ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node)); arc_space_return(HDR_FULL_SIZE, ARC_SPACE_HDRS); } @@ -981,18 +1018,31 @@ arc_hdr_realloc(arc_buf_hdr_t *hdr, kmem_cache_t *old, kmem_cache_t *new) * l2c_only even though it's about to change. */ nhdr->b_l1hdr.b_state = arc_l2c_only; + + /* Verify previous threads set to NULL before freeing */ + ASSERT3P(nhdr->b_l1hdr.b_tmp_cdata, ==, NULL); } else { ASSERT(hdr->b_l1hdr.b_buf == NULL); ASSERT0(hdr->b_l1hdr.b_datacnt); - ASSERT(!list_link_active(&hdr->b_l1hdr.b_arc_node)); + + /* + * If we've reached here, we must have been called from + * arc_evict_hdr(), as such we should have already been + * removed from any ghost list we were previously on + * (which protects us from racing with arc_evict_state), + * thus no locking is needed during this check. + */ + ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node)); + /* - * We might be removing the L1hdr of a buffer which was just - * written out to L2ARC. If such a buffer is compressed then we - * need to free its b_tmp_cdata before destroying the header. + * A buffer must not be moved into the arc_l2c_only + * state if it's not finished being written out to the + * l2arc device. Otherwise, the b_l1hdr.b_tmp_cdata field + * might try to be accessed, even though it was removed.
*/ - if (hdr->b_l1hdr.b_tmp_cdata != NULL && - HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF) - l2arc_release_cdata_buf(hdr); + VERIFY(!HDR_L2_WRITING(hdr)); + VERIFY3P(hdr->b_l1hdr.b_tmp_cdata, ==, NULL); + nhdr->b_flags &= ~ARC_FLAG_HAS_L1HDR; } /* @@ -1188,14 +1238,13 @@ add_reference(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, void *tag) (state != arc_anon)) { /* We don't use the L2-only state list. */ if (state != arc_l2c_only) { + arc_buf_contents_t type = arc_buf_type(hdr); uint64_t delta = hdr->b_size * hdr->b_l1hdr.b_datacnt; - list_t *list = &state->arcs_list[arc_buf_type(hdr)]; - uint64_t *size = &state->arcs_lsize[arc_buf_type(hdr)]; + multilist_t *list = &state->arcs_list[type]; + uint64_t *size = &state->arcs_lsize[type]; + + multilist_remove(list, hdr); - ASSERT(!MUTEX_HELD(&state->arcs_mtx)); - mutex_enter(&state->arcs_mtx); - ASSERT(list_link_active(&hdr->b_l1hdr.b_arc_node)); - list_remove(list, hdr); if (GHOST_STATE(state)) { ASSERT0(hdr->b_l1hdr.b_datacnt); ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); @@ -1204,7 +1253,6 @@ add_reference(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, void *tag) ASSERT(delta > 0); ASSERT3U(*size, >=, delta); atomic_add_64(size, -delta); - mutex_exit(&state->arcs_mtx); } /* remove the prefetch flag if we get a reference */ hdr->b_flags &= ~ARC_FLAG_PREFETCH; @@ -1227,16 +1275,15 @@ remove_reference(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, void *tag) */ if (((cnt = refcount_remove(&hdr->b_l1hdr.b_refcnt, tag)) == 0) && (state != arc_anon)) { - uint64_t *size = &state->arcs_lsize[arc_buf_type(hdr)]; + arc_buf_contents_t type = arc_buf_type(hdr); + multilist_t *list = &state->arcs_list[type]; + uint64_t *size = &state->arcs_lsize[type]; + + multilist_insert(list, hdr); - ASSERT(!MUTEX_HELD(&state->arcs_mtx)); - mutex_enter(&state->arcs_mtx); - ASSERT(!list_link_active(&hdr->b_l1hdr.b_arc_node)); - list_insert_head(&state->arcs_list[arc_buf_type(hdr)], hdr); ASSERT(hdr->b_l1hdr.b_datacnt > 0); atomic_add_64(size, hdr->b_size * 
hdr->b_l1hdr.b_datacnt); - mutex_exit(&state->arcs_mtx); } return (cnt); } @@ -1285,26 +1332,11 @@ arc_buf_info(arc_buf_t *ab, arc_buf_info_t *abi, int state_index) abi->abi_state_type = state ? state->arcs_state : ARC_STATE_ANON; abi->abi_state_contents = arc_buf_type(hdr); - abi->abi_state_index = -1; abi->abi_size = hdr->b_size; - - if (l1hdr && state && state_index && - list_link_active(&l1hdr->b_arc_node)) { - list_t *list = &state->arcs_list[arc_buf_type(hdr)]; - arc_buf_hdr_t *h; - - mutex_enter(&state->arcs_mtx); - for (h = list_head(list); h != NULL; h = list_next(list, h)) { - abi->abi_state_index++; - if (h == hdr) - break; - } - mutex_exit(&state->arcs_mtx); - } } /* - * Move the supplied buffer to the indicated state. The mutex + * Move the supplied buffer to the indicated state. The hash lock * for the buffer must be held by the caller. */ static void @@ -1348,15 +1380,10 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr, */ if (refcnt == 0) { if (old_state != arc_anon && old_state != arc_l2c_only) { - int use_mutex = !MUTEX_HELD(&old_state->arcs_mtx); uint64_t *size = &old_state->arcs_lsize[buftype]; - if (use_mutex) - mutex_enter(&old_state->arcs_mtx); - ASSERT(HDR_HAS_L1HDR(hdr)); - ASSERT(list_link_active(&hdr->b_l1hdr.b_arc_node)); - list_remove(&old_state->arcs_list[buftype], hdr); + multilist_remove(&old_state->arcs_list[buftype], hdr); /* * If prefetching out of the ghost cache, @@ -1369,12 +1396,8 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr, } ASSERT3U(*size, >=, from_delta); atomic_add_64(size, -from_delta); - - if (use_mutex) - mutex_exit(&old_state->arcs_mtx); } if (new_state != arc_anon && new_state != arc_l2c_only) { - int use_mutex = !MUTEX_HELD(&new_state->arcs_mtx); uint64_t *size = &new_state->arcs_lsize[buftype]; /* @@ -1384,10 +1407,7 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr, * beforehand. 
*/ ASSERT(HDR_HAS_L1HDR(hdr)); - if (use_mutex) - mutex_enter(&new_state->arcs_mtx); - - list_insert_head(&new_state->arcs_list[buftype], hdr); + multilist_insert(&new_state->arcs_list[buftype], hdr); /* ghost elements have a ghost size */ if (GHOST_STATE(new_state)) { @@ -1396,9 +1416,6 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr, to_delta = hdr->b_size; } atomic_add_64(size, to_delta); - - if (use_mutex) - mutex_exit(&new_state->arcs_mtx); } } @@ -1420,8 +1437,8 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr, * L2 headers should never be on the L2 state list since they don't * have L1 headers allocated. */ - ASSERT(list_is_empty(&arc_l2c_only->arcs_list[ARC_BUFC_DATA]) && - list_is_empty(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA])); + ASSERT(multilist_is_empty(&arc_l2c_only->arcs_list[ARC_BUFC_DATA]) && + multilist_is_empty(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA])); } void @@ -1524,6 +1541,7 @@ arc_buf_alloc(spa_t *spa, uint64_t size, void *tag, arc_buf_contents_t type) hdr->b_l1hdr.b_state = arc_anon; hdr->b_l1hdr.b_arc_access = 0; hdr->b_l1hdr.b_datacnt = 1; + hdr->b_l1hdr.b_tmp_cdata = NULL; arc_get_data_buf(buf); @@ -1654,6 +1672,21 @@ arc_buf_add_ref(arc_buf_t *buf, void* tag) data, metadata, hits); } +static void +arc_buf_free_on_write(void *data, size_t size, + void (*free_func)(void *, size_t)) +{ + l2arc_data_free_t *df; + + df = kmem_alloc(sizeof (*df), KM_SLEEP); + df->l2df_data = data; + df->l2df_size = size; + df->l2df_func = free_func; + mutex_enter(&l2arc_free_on_write_mtx); + list_insert_head(l2arc_free_on_write, df); + mutex_exit(&l2arc_free_on_write_mtx); +} + /* * Free the arc data buffer. If it is an l2arc write in progress, * the buffer is placed on l2arc_free_on_write to be freed later. 
@@ -1664,26 +1697,74 @@ arc_buf_data_free(arc_buf_t *buf, void (*free_func)(void *, size_t)) arc_buf_hdr_t *hdr = buf->b_hdr; if (HDR_L2_WRITING(hdr)) { - l2arc_data_free_t *df; - df = kmem_alloc(sizeof (l2arc_data_free_t), KM_SLEEP); - df->l2df_data = buf->b_data; - df->l2df_size = hdr->b_size; - df->l2df_func = free_func; - mutex_enter(&l2arc_free_on_write_mtx); - list_insert_head(l2arc_free_on_write, df); - mutex_exit(&l2arc_free_on_write_mtx); + arc_buf_free_on_write(buf->b_data, hdr->b_size, free_func); ARCSTAT_BUMP(arcstat_l2_free_on_write); } else { free_func(buf->b_data, hdr->b_size); } } +static void +arc_buf_l2_cdata_free(arc_buf_hdr_t *hdr) +{ + ASSERT(HDR_HAS_L2HDR(hdr)); + ASSERT(MUTEX_HELD(&hdr->b_l2hdr.b_dev->l2ad_mtx)); + + /* + * The b_tmp_cdata field is linked off of the b_l1hdr, so if + * that doesn't exist, the header is in the arc_l2c_only state, + * and there isn't anything to free (it's already been freed). + */ + if (!HDR_HAS_L1HDR(hdr)) + return; + + /* + * The header isn't being written to the l2arc device, thus it + * shouldn't have a b_tmp_cdata to free. + */ + if (!HDR_L2_WRITING(hdr)) { + ASSERT3P(hdr->b_l1hdr.b_tmp_cdata, ==, NULL); + return; + } + + /* + * The header does not have compression enabled. This can be due + * to the buffer not being compressible, or because we're + * freeing the buffer before the second phase of + * l2arc_write_buffers() has started (which does the compression + * step). In either case, b_tmp_cdata does not point to a + * separately compressed buffer, so there's nothing to free (it + * points to the same buffer as the arc_buf_t's b_data field). + */ + if (HDR_GET_COMPRESS(hdr) == ZIO_COMPRESS_OFF) { + hdr->b_l1hdr.b_tmp_cdata = NULL; + return; + } + + /* + * There's nothing to free since the buffer was all zeros and + * compressed to a zero-length buffer.
+ */ + if (HDR_GET_COMPRESS(hdr) == ZIO_COMPRESS_EMPTY) { + ASSERT3P(hdr->b_l1hdr.b_tmp_cdata, ==, NULL); + return; + } + + ASSERT(L2ARC_IS_VALID_COMPRESS(HDR_GET_COMPRESS(hdr))); + + arc_buf_free_on_write(hdr->b_l1hdr.b_tmp_cdata, + hdr->b_size, zio_data_buf_free); + + ARCSTAT_BUMP(arcstat_l2_cdata_free_on_write); + hdr->b_l1hdr.b_tmp_cdata = NULL; +} + /* * Free up buf->b_data and if 'remove' is set, then pull the * arc_buf_t off of the the arc_buf_hdr_t's list and free it. */ static void -arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t remove) +arc_buf_destroy(arc_buf_t *buf, boolean_t remove) { arc_buf_t **bufp; @@ -1696,17 +1777,17 @@ arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t remove) arc_cksum_verify(buf); arc_buf_unwatch(buf); - if (!recycle) { - if (type == ARC_BUFC_METADATA) { - arc_buf_data_free(buf, zio_buf_free); - arc_space_return(size, ARC_SPACE_META); - } else { - ASSERT(type == ARC_BUFC_DATA); - arc_buf_data_free(buf, zio_data_buf_free); - arc_space_return(size, ARC_SPACE_DATA); - } + if (type == ARC_BUFC_METADATA) { + arc_buf_data_free(buf, zio_buf_free); + arc_space_return(size, ARC_SPACE_META); + } else { + ASSERT(type == ARC_BUFC_DATA); + arc_buf_data_free(buf, zio_data_buf_free); + arc_space_return(size, ARC_SPACE_DATA); } - if (list_link_active(&buf->b_hdr->b_l1hdr.b_arc_node)) { + + /* protected by hash lock, if in the hash table */ + if (multilist_link_active(&buf->b_hdr->b_l1hdr.b_arc_node)) { uint64_t *cnt = &state->arcs_lsize[type]; ASSERT(refcount_is_zero( @@ -1774,6 +1855,12 @@ arc_hdr_destroy(arc_buf_hdr_t *hdr) list_remove(&l2hdr->b_dev->l2ad_buflist, hdr); + /* + * We don't want to leak the b_tmp_cdata buffer that was + * allocated in l2arc_write_buffers() + */ + arc_buf_l2_cdata_free(hdr); + arc_space_return(HDR_L2ONLY_SIZE, ARC_SPACE_L2HDRS); ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size); ARCSTAT_INCR(arcstat_l2_asize, -l2hdr->b_asize); @@ -1797,27 +1884,26 @@ arc_hdr_destroy(arc_buf_hdr_t *hdr) 
arc_buf_t *buf = hdr->b_l1hdr.b_buf; if (buf->b_efunc != NULL) { - mutex_enter(&arc_eviction_mtx); + mutex_enter(&arc_user_evicts_lock); mutex_enter(&buf->b_evict_lock); ASSERT(buf->b_hdr != NULL); - arc_buf_destroy(hdr->b_l1hdr.b_buf, FALSE, - FALSE); + arc_buf_destroy(hdr->b_l1hdr.b_buf, FALSE); hdr->b_l1hdr.b_buf = buf->b_next; buf->b_hdr = &arc_eviction_hdr; buf->b_next = arc_eviction_list; arc_eviction_list = buf; mutex_exit(&buf->b_evict_lock); - mutex_exit(&arc_eviction_mtx); + cv_signal(&arc_user_evicts_cv); + mutex_exit(&arc_user_evicts_lock); } else { - arc_buf_destroy(hdr->b_l1hdr.b_buf, FALSE, - TRUE); + arc_buf_destroy(hdr->b_l1hdr.b_buf, TRUE); } } } ASSERT3P(hdr->b_hash_next, ==, NULL); if (HDR_HAS_L1HDR(hdr)) { - ASSERT(!list_link_active(&hdr->b_l1hdr.b_arc_node)); + ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node)); ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL); kmem_cache_free(hdr_full_cache, hdr); } else { @@ -1843,7 +1929,7 @@ arc_buf_free(arc_buf_t *buf, void *tag) (void) remove_reference(hdr, hash_lock, tag); if (hdr->b_l1hdr.b_datacnt > 1) { - arc_buf_destroy(buf, FALSE, TRUE); + arc_buf_destroy(buf, TRUE); } else { ASSERT(buf == hdr->b_l1hdr.b_buf); ASSERT(buf->b_efunc == NULL); @@ -1857,16 +1943,16 @@ arc_buf_free(arc_buf_t *buf, void *tag) * this buffer unless the write completes before we finish * decrementing the reference count. 
*/ - mutex_enter(&arc_eviction_mtx); + mutex_enter(&arc_user_evicts_lock); (void) remove_reference(hdr, NULL, tag); ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); destroy_hdr = !HDR_IO_IN_PROGRESS(hdr); - mutex_exit(&arc_eviction_mtx); + mutex_exit(&arc_user_evicts_lock); if (destroy_hdr) arc_hdr_destroy(hdr); } else { if (remove_reference(hdr, NULL, tag) > 0) - arc_buf_destroy(buf, FALSE, TRUE); + arc_buf_destroy(buf, TRUE); else arc_hdr_destroy(hdr); } @@ -1896,7 +1982,7 @@ arc_buf_remove_ref(arc_buf_t *buf, void* tag) (void) remove_reference(hdr, hash_lock, tag); if (hdr->b_l1hdr.b_datacnt > 1) { if (no_callback) - arc_buf_destroy(buf, FALSE, TRUE); + arc_buf_destroy(buf, TRUE); } else if (no_callback) { ASSERT(hdr->b_l1hdr.b_buf == buf && buf->b_next == NULL); ASSERT(buf->b_efunc == NULL); @@ -1957,353 +2043,390 @@ arc_buf_eviction_needed(arc_buf_t *buf) } /* - * Evict buffers from list until we've removed the specified number of - * bytes. Move the removed buffers to the appropriate evict state. - * If the recycle flag is set, then attempt to "recycle" a buffer: - * - look for a buffer to evict that is `bytes' long. - * - return the data block from this buffer rather than freeing it. - * This flag is used by callers that are trying to make space for a - * new buffer in a full arc cache. + * Evict the arc_buf_hdr that is provided as a parameter. The resultant + * state of the header is dependent on its state prior to entering this + * function. The following transitions are possible: * - * This function makes a "best effort". It skips over any buffers - * it can't get a hash_lock on, and so may not catch all candidates. - * It may also return without evicting as much space as requested. 
+ * - arc_mru -> arc_mru_ghost + * - arc_mfu -> arc_mfu_ghost + * - arc_mru_ghost -> arc_l2c_only + * - arc_mru_ghost -> deleted + * - arc_mfu_ghost -> arc_l2c_only + * - arc_mfu_ghost -> deleted */ -static void * -arc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle, - arc_buf_contents_t type) +static int64_t +arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock) { - arc_state_t *evicted_state; - uint64_t bytes_evicted = 0, skipped = 0, missed = 0; - arc_buf_hdr_t *hdr, *hdr_prev = NULL; - list_t *list = &state->arcs_list[type]; - kmutex_t *hash_lock; - boolean_t have_lock; - void *stolen = NULL; - arc_buf_hdr_t marker = {{{ 0 }}}; - int count = 0; - - ASSERT(state == arc_mru || state == arc_mfu); - - evicted_state = (state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost; + arc_state_t *evicted_state, *state; + int64_t bytes_evicted = 0; -top: - /* - * The ghost list lock must be acquired first in order to prevent - * a 3 party deadlock: - * - * - arc_evict_ghost acquires arc_*_ghost->arcs_mtx, followed by - * l2ad_mtx in arc_hdr_realloc - * - l2arc_write_buffers acquires l2ad_mtx, followed by arc_*->arcs_mtx - * - arc_evict acquires arc_*_ghost->arcs_mtx, followed by - * arc_*_ghost->arcs_mtx and forms a deadlock cycle. - * - * This situation is avoided by acquiring the ghost list lock first. 
- */ - mutex_enter(&evicted_state->arcs_mtx); - mutex_enter(&state->arcs_mtx); - - for (hdr = list_tail(list); hdr; hdr = hdr_prev) { - hdr_prev = list_prev(list, hdr); - /* prefetch buffers have a minimum lifespan */ - if (HDR_IO_IN_PROGRESS(hdr) || - ((hdr->b_flags & (ARC_FLAG_PREFETCH | ARC_FLAG_INDIRECT)) && - ddi_get_lbolt() - hdr->b_l1hdr.b_arc_access < - zfs_arc_min_prefetch_lifespan)) { - skipped++; - continue; - } - /* "lookahead" for better eviction candidate */ - if (recycle && hdr->b_size != bytes && - hdr_prev && hdr_prev->b_size == bytes) - continue; + ASSERT(MUTEX_HELD(hash_lock)); + ASSERT(HDR_HAS_L1HDR(hdr)); - /* ignore markers */ - if (hdr->b_spa == 0) - continue; + state = hdr->b_l1hdr.b_state; + if (GHOST_STATE(state)) { + ASSERT(!HDR_IO_IN_PROGRESS(hdr)); + ASSERT(hdr->b_l1hdr.b_buf == NULL); /* - * It may take a long time to evict all the bufs requested. - * To avoid blocking all arc activity, periodically drop - * the arcs_mtx and give other threads a chance to run - * before reacquiring the lock. - * - * If we are looking for a buffer to recycle, we are in - * the hot code path, so don't sleep. + * l2arc_write_buffers() relies on a header's L1 portion + * (i.e. its b_tmp_cdata field) during its write phase. + * Thus, we cannot push a header onto the arc_l2c_only + * state (removing its L1 piece) until the header is + * done being written to the l2arc. 
*/ - if (!recycle && count++ > arc_evict_iterations) { - list_insert_after(list, hdr, &marker); - mutex_exit(&state->arcs_mtx); - mutex_exit(&evicted_state->arcs_mtx); - kpreempt(KPREEMPT_SYNC); - mutex_enter(&evicted_state->arcs_mtx); - mutex_enter(&state->arcs_mtx); - hdr_prev = list_prev(list, &marker); - list_remove(list, &marker); - count = 0; - continue; + if (HDR_HAS_L2HDR(hdr) && HDR_L2_WRITING(hdr)) { + ARCSTAT_BUMP(arcstat_evict_l2_skip); + return (bytes_evicted); } - hash_lock = HDR_LOCK(hdr); - have_lock = MUTEX_HELD(hash_lock); - if (have_lock || mutex_tryenter(hash_lock)) { - ASSERT0(refcount_count(&hdr->b_l1hdr.b_refcnt)); - ASSERT3U(hdr->b_l1hdr.b_datacnt, >, 0); - while (hdr->b_l1hdr.b_buf) { - arc_buf_t *buf = hdr->b_l1hdr.b_buf; - if (!mutex_tryenter(&buf->b_evict_lock)) { - missed += 1; - break; - } - if (buf->b_data != NULL) { - bytes_evicted += hdr->b_size; - if (recycle && - arc_buf_type(hdr) == type && - hdr->b_size == bytes && - !HDR_L2_WRITING(hdr)) { - stolen = buf->b_data; - recycle = FALSE; - } - } - if (buf->b_efunc != NULL) { - mutex_enter(&arc_eviction_mtx); - arc_buf_destroy(buf, - buf->b_data == stolen, FALSE); - hdr->b_l1hdr.b_buf = buf->b_next; - buf->b_hdr = &arc_eviction_hdr; - buf->b_next = arc_eviction_list; - arc_eviction_list = buf; - mutex_exit(&arc_eviction_mtx); - mutex_exit(&buf->b_evict_lock); - } else { - mutex_exit(&buf->b_evict_lock); - arc_buf_destroy(buf, - buf->b_data == stolen, TRUE); - } - } + ARCSTAT_BUMP(arcstat_deleted); + bytes_evicted += hdr->b_size; - if (HDR_HAS_L2HDR(hdr)) { - ARCSTAT_INCR(arcstat_evict_l2_cached, - hdr->b_size); - } else { - if (l2arc_write_eligible(hdr->b_spa, hdr)) { - ARCSTAT_INCR(arcstat_evict_l2_eligible, - hdr->b_size); - } else { - ARCSTAT_INCR( - arcstat_evict_l2_ineligible, - hdr->b_size); - } - } + DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, hdr); - if (hdr->b_l1hdr.b_datacnt == 0) { - arc_change_state(evicted_state, hdr, hash_lock); - ASSERT(HDR_IN_HASH_TABLE(hdr)); - 
hdr->b_flags |= ARC_FLAG_IN_HASH_TABLE; - hdr->b_flags &= ~ARC_FLAG_BUF_AVAILABLE; - DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, hdr); - } - if (!have_lock) - mutex_exit(hash_lock); - if (bytes >= 0 && bytes_evicted >= bytes) - break; + if (HDR_HAS_L2HDR(hdr)) { + /* + * This buffer is cached on the 2nd Level ARC; + * don't destroy the header. + */ + arc_change_state(arc_l2c_only, hdr, hash_lock); + /* + * dropping from L1+L2 cached to L2-only, + * realloc to remove the L1 header. + */ + hdr = arc_hdr_realloc(hdr, hdr_full_cache, + hdr_l2only_cache); } else { - missed += 1; + arc_change_state(arc_anon, hdr, hash_lock); + arc_hdr_destroy(hdr); } + return (bytes_evicted); } - mutex_exit(&state->arcs_mtx); - mutex_exit(&evicted_state->arcs_mtx); + ASSERT(state == arc_mru || state == arc_mfu); + evicted_state = (state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost; - if (list == &state->arcs_list[ARC_BUFC_DATA] && - (bytes < 0 || bytes_evicted < bytes)) { - /* Prevent second pass from recycling metadata into data */ - recycle = FALSE; - type = ARC_BUFC_METADATA; - list = &state->arcs_list[type]; - goto top; + /* prefetch buffers have a minimum lifespan */ + if (HDR_IO_IN_PROGRESS(hdr) || + ((hdr->b_flags & (ARC_FLAG_PREFETCH | ARC_FLAG_INDIRECT)) && + ddi_get_lbolt() - hdr->b_l1hdr.b_arc_access < + arc_min_prefetch_lifespan)) { + ARCSTAT_BUMP(arcstat_evict_skip); + return (bytes_evicted); } - if (bytes_evicted < bytes) - dprintf("only evicted %lld bytes from %x\n", - (longlong_t)bytes_evicted, state->arcs_state); + ASSERT0(refcount_count(&hdr->b_l1hdr.b_refcnt)); + ASSERT3U(hdr->b_l1hdr.b_datacnt, >, 0); + while (hdr->b_l1hdr.b_buf) { + arc_buf_t *buf = hdr->b_l1hdr.b_buf; + if (!mutex_tryenter(&buf->b_evict_lock)) { + ARCSTAT_BUMP(arcstat_mutex_miss); + break; + } + if (buf->b_data != NULL) + bytes_evicted += hdr->b_size; + if (buf->b_efunc != NULL) { + mutex_enter(&arc_user_evicts_lock); + arc_buf_destroy(buf, FALSE); + hdr->b_l1hdr.b_buf = buf->b_next; + buf->b_hdr = 
&arc_eviction_hdr; + buf->b_next = arc_eviction_list; + arc_eviction_list = buf; + cv_signal(&arc_user_evicts_cv); + mutex_exit(&arc_user_evicts_lock); + mutex_exit(&buf->b_evict_lock); + } else { + mutex_exit(&buf->b_evict_lock); + arc_buf_destroy(buf, TRUE); + } + } - if (skipped) - ARCSTAT_INCR(arcstat_evict_skip, skipped); + if (HDR_HAS_L2HDR(hdr)) { + ARCSTAT_INCR(arcstat_evict_l2_cached, hdr->b_size); + } else { + if (l2arc_write_eligible(hdr->b_spa, hdr)) + ARCSTAT_INCR(arcstat_evict_l2_eligible, hdr->b_size); + else + ARCSTAT_INCR(arcstat_evict_l2_ineligible, hdr->b_size); + } - if (missed) - ARCSTAT_INCR(arcstat_mutex_miss, missed); + if (hdr->b_l1hdr.b_datacnt == 0) { + arc_change_state(evicted_state, hdr, hash_lock); + ASSERT(HDR_IN_HASH_TABLE(hdr)); + hdr->b_flags |= ARC_FLAG_IN_HASH_TABLE; + hdr->b_flags &= ~ARC_FLAG_BUF_AVAILABLE; + DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, hdr); + } - /* - * Note: we have just evicted some data into the ghost state, - * potentially putting the ghost size over the desired size. Rather - * that evicting from the ghost list in this hot code path, leave - * this chore to the arc_reclaim_thread(). - */ - return (stolen); + return (bytes_evicted); } -/* - * Remove buffers from list until we've removed the specified number of - * bytes. Destroy the buffers that are removed. 
- */ -static void -arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes) +static uint64_t +arc_evict_state_impl(multilist_t *ml, int idx, arc_buf_hdr_t *marker, + uint64_t spa, int64_t bytes) { - arc_buf_hdr_t *hdr, *hdr_prev; - arc_buf_hdr_t marker; - list_t *list = &state->arcs_list[ARC_BUFC_DATA]; + multilist_sublist_t *mls; + uint64_t bytes_evicted = 0; + arc_buf_hdr_t *hdr; kmutex_t *hash_lock; - uint64_t bytes_deleted = 0; - uint64_t bufs_skipped = 0; - int count = 0; + int evict_count = 0; - ASSERT(GHOST_STATE(state)); - bzero(&marker, sizeof (marker)); -top: - mutex_enter(&state->arcs_mtx); - for (hdr = list_tail(list); hdr; hdr = hdr_prev) { - hdr_prev = list_prev(list, hdr); - if (arc_buf_type(hdr) >= ARC_BUFC_NUMTYPES) - panic("invalid hdr=%p", (void *)hdr); - if (spa && hdr->b_spa != spa) - continue; + ASSERT3P(marker, !=, NULL); + ASSERTV(if (bytes < 0) ASSERT(bytes == ARC_EVICT_ALL)); + + mls = multilist_sublist_lock(ml, idx); - /* ignore markers */ + for (hdr = multilist_sublist_prev(mls, marker); hdr != NULL; + hdr = multilist_sublist_prev(mls, marker)) { + if ((bytes != ARC_EVICT_ALL && bytes_evicted >= bytes) || + (evict_count >= zfs_arc_evict_batch_limit)) + break; + + /* + * To keep our iteration location, move the marker + * forward. Since we're not holding hdr's hash lock, we + * must be very careful and not remove 'hdr' from the + * sublist. Otherwise, other consumers might mistake the + * 'hdr' as not being on a sublist when they call the + * multilist_link_active() function (they all rely on + * the hash lock protecting concurrent insertions and + * removals). multilist_sublist_move_forward() was + * specifically implemented to ensure this is the case + * (only 'marker' will be removed and re-inserted). + */ + multilist_sublist_move_forward(mls, marker); + + /* + * The only case where the b_spa field should ever be + * zero, is the marker headers inserted by + * arc_evict_state(). 
It's possible for multiple threads + * to be calling arc_evict_state() concurrently (e.g. + * dsl_pool_close() and zio_inject_fault()), so we must + * skip any markers we see from these other threads. + */ if (hdr->b_spa == 0) continue; - hash_lock = HDR_LOCK(hdr); - /* caller may be trying to modify this buffer, skip it */ - if (MUTEX_HELD(hash_lock)) + /* we're only interested in evicting buffers of a certain spa */ + if (spa != 0 && hdr->b_spa != spa) { + ARCSTAT_BUMP(arcstat_evict_skip); continue; + } + + hash_lock = HDR_LOCK(hdr); /* - * It may take a long time to evict all the bufs requested. - * To avoid blocking all arc activity, periodically drop - * the arcs_mtx and give other threads a chance to run - * before reacquiring the lock. + * We aren't calling this function from any code path + * that would already be holding a hash lock, so we're + * asserting on this assumption to be defensive in case + * this ever changes. Without this check, it would be + * possible to incorrectly increment arcstat_mutex_miss + * below (e.g. if the code changed such that we called + * this function with a hash lock held). */ - if (count++ > arc_evict_iterations) { - list_insert_after(list, hdr, &marker); - mutex_exit(&state->arcs_mtx); - kpreempt(KPREEMPT_SYNC); - mutex_enter(&state->arcs_mtx); - hdr_prev = list_prev(list, &marker); - list_remove(list, &marker); - count = 0; - continue; - } + ASSERT(!MUTEX_HELD(hash_lock)); + if (mutex_tryenter(hash_lock)) { - ASSERT(!HDR_IO_IN_PROGRESS(hdr)); - ASSERT(!HDR_HAS_L1HDR(hdr) || - hdr->b_l1hdr.b_buf == NULL); - ARCSTAT_BUMP(arcstat_deleted); - bytes_deleted += hdr->b_size; + uint64_t evicted = arc_evict_hdr(hdr, hash_lock); + mutex_exit(hash_lock); - if (HDR_HAS_L2HDR(hdr)) { - /* - * This buffer is cached on the 2nd Level ARC; - * don't destroy the header. - */ - arc_change_state(arc_l2c_only, hdr, hash_lock); - /* - * dropping from L1+L2 cached to L2-only, - * realloc to remove the L1 header. 
- */ - hdr = arc_hdr_realloc(hdr, hdr_full_cache, - hdr_l2only_cache); - mutex_exit(hash_lock); - } else { - arc_change_state(arc_anon, hdr, hash_lock); - mutex_exit(hash_lock); - arc_hdr_destroy(hdr); - } + bytes_evicted += evicted; - DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, hdr); - if (bytes >= 0 && bytes_deleted >= bytes) - break; - } else if (bytes < 0) { /* - * Insert a list marker and then wait for the - * hash lock to become available. Once its - * available, restart from where we left off. + * If evicted is zero, arc_evict_hdr() must have + * decided to skip this header, don't increment + * evict_count in this case. */ - list_insert_after(list, hdr, &marker); - mutex_exit(&state->arcs_mtx); - mutex_enter(hash_lock); - mutex_exit(hash_lock); - mutex_enter(&state->arcs_mtx); - hdr_prev = list_prev(list, &marker); - list_remove(list, &marker); + if (evicted != 0) + evict_count++; + + /* + * If arc_size isn't overflowing, signal any + * threads that might happen to be waiting. + * + * For each header evicted, we wake up a single + * thread. If we used cv_broadcast, we could + * wake up "too many" threads causing arc_size + * to significantly overflow arc_c; since + * arc_get_data_buf() doesn't check for overflow + * when it's woken up (it doesn't because it's + * possible for the ARC to be overflowing while + * full of un-evictable buffers, and the + * function should proceed in this case). + * + * If threads are left sleeping, due to not + * using cv_broadcast, they will be woken up + * just before arc_reclaim_thread() sleeps. 
+ */ + mutex_enter(&arc_reclaim_lock); + if (!arc_is_overflowing()) + cv_signal(&arc_reclaim_waiters_cv); + mutex_exit(&arc_reclaim_lock); } else { - bufs_skipped += 1; + ARCSTAT_BUMP(arcstat_mutex_miss); } } - mutex_exit(&state->arcs_mtx); - if (list == &state->arcs_list[ARC_BUFC_DATA] && - (bytes < 0 || bytes_deleted < bytes)) { - list = &state->arcs_list[ARC_BUFC_METADATA]; - goto top; - } - - if (bufs_skipped) { - ARCSTAT_INCR(arcstat_mutex_miss, bufs_skipped); - ASSERT(bytes >= 0); - } + multilist_sublist_unlock(mls); - if (bytes_deleted < bytes) - dprintf("only deleted %lld bytes from %p\n", - (longlong_t)bytes_deleted, state); + return (bytes_evicted); } -static void -arc_adjust(void) +/* + * Evict buffers from the given arc state, until we've removed the + * specified number of bytes. Move the removed buffers to the + * appropriate evict state. + * + * This function makes a "best effort". It skips over any buffers + * it can't get a hash_lock on, and so, may not catch all candidates. + * It may also return without evicting as much space as requested. + * + * If bytes is specified using the special value ARC_EVICT_ALL, this + * will evict all available (i.e. unlocked and evictable) buffers from + * the given arc state; which is used by arc_flush(). + */ +static uint64_t +arc_evict_state(arc_state_t *state, uint64_t spa, int64_t bytes, + arc_buf_contents_t type) { - int64_t adjustment, delta; + uint64_t total_evicted = 0; + multilist_t *ml = &state->arcs_list[type]; + int num_sublists; + arc_buf_hdr_t **markers; + int i; + + ASSERTV(if (bytes < 0) ASSERT(bytes == ARC_EVICT_ALL)); + + num_sublists = multilist_get_num_sublists(ml); /* - * Adjust MRU size + * If we've tried to evict from each sublist, made some + * progress, but still have not hit the target number of bytes + * to evict, we want to keep trying. The markers allow us to + * pick up where we left off for each individual sublist, rather + * than starting from the tail each time. 
*/ + markers = kmem_zalloc(sizeof (*markers) * num_sublists, KM_SLEEP); + for (i = 0; i < num_sublists; i++) { + multilist_sublist_t *mls; - adjustment = MIN((int64_t)(arc_size - arc_c), - (int64_t)(arc_anon->arcs_size + arc_mru->arcs_size - arc_p)); + markers[i] = kmem_cache_alloc(hdr_full_cache, KM_SLEEP); + + /* + * A b_spa of 0 is used to indicate that this header is + * a marker. This fact is used in arc_adjust_type() and + * arc_evict_state_impl(). + */ + markers[i]->b_spa = 0; - if (adjustment > 0 && arc_mru->arcs_size > 0) { - delta = MIN(arc_mru->arcs_size, adjustment); - (void) arc_evict(arc_mru, 0, delta, FALSE, ARC_BUFC_DATA); + mls = multilist_sublist_lock(ml, i); + multilist_sublist_insert_tail(mls, markers[i]); + multilist_sublist_unlock(mls); } /* - * Adjust MFU size + * While we haven't hit our target number of bytes to evict, or + * we're evicting all available buffers. */ + while (total_evicted < bytes || bytes == ARC_EVICT_ALL) { + /* + * Start eviction using a randomly selected sublist, + * this is to try and evenly balance eviction across all + * sublists. Always starting at the same sublist + * (e.g. index 0) would cause evictions to favor certain + * sublists over others. 
+ */ + int sublist_idx = multilist_get_random_index(ml); + uint64_t scan_evicted = 0; - adjustment = arc_size - arc_c; + for (i = 0; i < num_sublists; i++) { + uint64_t bytes_remaining; + uint64_t bytes_evicted; - if (adjustment > 0 && arc_mfu->arcs_size > 0) { - delta = MIN(arc_mfu->arcs_size, adjustment); - (void) arc_evict(arc_mfu, 0, delta, FALSE, ARC_BUFC_DATA); - } + if (bytes == ARC_EVICT_ALL) + bytes_remaining = ARC_EVICT_ALL; + else if (total_evicted < bytes) + bytes_remaining = bytes - total_evicted; + else + break; - /* - * Adjust ghost lists - */ + bytes_evicted = arc_evict_state_impl(ml, sublist_idx, + markers[sublist_idx], spa, bytes_remaining); + + scan_evicted += bytes_evicted; + total_evicted += bytes_evicted; + + /* we've reached the end, wrap to the beginning */ + if (++sublist_idx >= num_sublists) + sublist_idx = 0; + } + + /* + * If we didn't evict anything during this scan, we have + * no reason to believe we'll evict more during another + * scan, so break the loop. + */ + if (scan_evicted == 0) { + /* This isn't possible, let's make that obvious */ + ASSERT3S(bytes, !=, 0); - adjustment = arc_mru->arcs_size + arc_mru_ghost->arcs_size - arc_c; + /* + * When bytes is ARC_EVICT_ALL, the only way to + * break the loop is when scan_evicted is zero. + * In that case, we actually have evicted enough, + * so we don't want to increment the kstat. 
+ */ + if (bytes != ARC_EVICT_ALL) { + ASSERT3S(total_evicted, <, bytes); + ARCSTAT_BUMP(arcstat_evict_not_enough); + } - if (adjustment > 0 && arc_mru_ghost->arcs_size > 0) { - delta = MIN(arc_mru_ghost->arcs_size, adjustment); - arc_evict_ghost(arc_mru_ghost, 0, delta); + break; + } } - adjustment = - arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size - arc_c; + for (i = 0; i < num_sublists; i++) { + multilist_sublist_t *mls = multilist_sublist_lock(ml, i); + multilist_sublist_remove(mls, markers[i]); + multilist_sublist_unlock(mls); - if (adjustment > 0 && arc_mfu_ghost->arcs_size > 0) { - delta = MIN(arc_mfu_ghost->arcs_size, adjustment); - arc_evict_ghost(arc_mfu_ghost, 0, delta); + kmem_cache_free(hdr_full_cache, markers[i]); } + kmem_free(markers, sizeof (*markers) * num_sublists); + + return (total_evicted); +} + +/* + * Flush all "evictable" data of the given type from the arc state + * specified. This will not evict any "active" buffers (i.e. referenced). + * + * When 'retry' is set to FALSE, the function will make a single pass + * over the state and evict any buffers that it can. Since it doesn't + * continually retry the eviction, it might end up leaving some buffers + * in the ARC due to lock misses. + * + * When 'retry' is set to TRUE, the function will continually retry the + * eviction until *all* evictable buffers have been removed from the + * state. As a result, if concurrent insertions into the state are + * allowed (e.g. if the ARC isn't shutting down), this function might + * wind up in an infinite loop, continually trying to evict buffers. 
+ */ +static uint64_t +arc_flush_state(arc_state_t *state, uint64_t spa, arc_buf_contents_t type, + boolean_t retry) +{ + uint64_t evicted = 0; + + while (state->arcs_lsize[type] != 0) { + evicted += arc_evict_state(state, spa, ARC_EVICT_ALL, type); + + if (!retry) + break; + } + + return (evicted); } /* @@ -2348,17 +2471,356 @@ arc_do_user_prune(int64_t adjustment) mutex_exit(&arc_prune_mtx); } +/* + * Evict the specified number of bytes from the state specified, + * restricting eviction to the spa and type given. This function + * prevents us from trying to evict more from a state's list than + * is "evictable", and to skip evicting altogether when passed a + * negative value for "bytes". In contrast, arc_evict_state() will + * evict everything it can, when passed a negative value for "bytes". + */ +static uint64_t +arc_adjust_impl(arc_state_t *state, uint64_t spa, int64_t bytes, + arc_buf_contents_t type) +{ + int64_t delta; + + if (bytes > 0 && state->arcs_lsize[type] > 0) { + delta = MIN(state->arcs_lsize[type], bytes); + return (arc_evict_state(state, spa, delta, type)); + } + + return (0); +} + +/* + * The goal of this function is to evict enough meta data buffers from the + * ARC in order to enforce the arc_meta_limit. Achieving this is slightly + * more complicated than it appears because it is common for data buffers + * to have holds on meta data buffers. In addition, dnode meta data buffers + * will be held by the dnodes in the block preventing them from being freed. + * This means we can't simply traverse the ARC and expect to always find + * enough unheld meta data buffer to release. + * + * Therefore, this function has been updated to make alternating passes + * over the ARC releasing data buffers and then newly unheld meta data + * buffers. This ensures forward progress is maintained and arc_meta_used + * will decrease. 
Normally this is sufficient, but if required the ARC + * will call the registered prune callbacks causing dentry and inodes to + * be dropped from the VFS cache. This will make dnode meta data buffers + * available for reclaim. + */ +static uint64_t +arc_adjust_meta(void) +{ + int64_t adjustmnt, delta, prune = 0; + uint64_t total_evicted = 0; + arc_buf_contents_t type = ARC_BUFC_DATA; + unsigned long restarts = zfs_arc_meta_adjust_restarts; + +restart: + /* + * This slightly differs than the way we evict from the mru in + * arc_adjust because we don't have a "target" value (i.e. no + * "meta" arc_p). As a result, I think we can completely + * cannibalize the metadata in the MRU before we evict the + * metadata from the MFU. I think we probably need to implement a + * "metadata arc_p" value to do this properly. + */ + adjustmnt = arc_meta_used - arc_meta_limit; + + if (adjustmnt > 0 && arc_mru->arcs_lsize[type] > 0) { + delta = MIN(arc_mru->arcs_lsize[type], adjustmnt); + total_evicted += arc_adjust_impl(arc_mru, 0, delta, type); + adjustmnt -= delta; + } + + /* + * We can't afford to recalculate adjustmnt here. If we do, + * new metadata buffers can sneak into the MRU or ANON lists, + * thus penalize the MFU metadata. Although the fudge factor is + * small, it has been empirically shown to be significant for + * certain workloads (e.g. creating many empty directories). As + * such, we use the original calculation for adjustmnt, and + * simply decrement the amount of data evicted from the MRU. 
+ */ + + if (adjustmnt > 0 && arc_mfu->arcs_lsize[type] > 0) { + delta = MIN(arc_mfu->arcs_lsize[type], adjustmnt); + total_evicted += arc_adjust_impl(arc_mfu, 0, delta, type); + } + + adjustmnt = arc_meta_used - arc_meta_limit; + + if (adjustmnt > 0 && arc_mru_ghost->arcs_lsize[type] > 0) { + delta = MIN(adjustmnt, + arc_mru_ghost->arcs_lsize[type]); + total_evicted += arc_adjust_impl(arc_mru_ghost, 0, delta, type); + adjustmnt -= delta; + } + + if (adjustmnt > 0 && arc_mfu_ghost->arcs_lsize[type] > 0) { + delta = MIN(adjustmnt, + arc_mfu_ghost->arcs_lsize[type]); + total_evicted += arc_adjust_impl(arc_mfu_ghost, 0, delta, type); + } + + /* + * If after attempting to make the requested adjustment to the ARC + * the meta limit is still being exceeded then request that the + * higher layers drop some cached objects which have holds on ARC + * meta buffers. Requests to the upper layers will be made with + * increasingly large scan sizes until the ARC is below the limit. + */ + if (arc_meta_used > arc_meta_limit) { + if (type == ARC_BUFC_DATA) { + type = ARC_BUFC_METADATA; + } else { + type = ARC_BUFC_DATA; + + if (zfs_arc_meta_prune) { + prune += zfs_arc_meta_prune; + arc_do_user_prune(prune); + } + } + + if (restarts > 0) { + restarts--; + goto restart; + } + } + return (total_evicted); +} + +/* + * Return the type of the oldest buffer in the given arc state + * + * This function will select a random sublist of type ARC_BUFC_DATA and + * a random sublist of type ARC_BUFC_METADATA. The tail of each sublist + * is compared, and the type which contains the "older" buffer will be + * returned. 
+ */ +static arc_buf_contents_t +arc_adjust_type(arc_state_t *state) +{ + multilist_t *data_ml = &state->arcs_list[ARC_BUFC_DATA]; + multilist_t *meta_ml = &state->arcs_list[ARC_BUFC_METADATA]; + int data_idx = multilist_get_random_index(data_ml); + int meta_idx = multilist_get_random_index(meta_ml); + multilist_sublist_t *data_mls; + multilist_sublist_t *meta_mls; + arc_buf_contents_t type; + arc_buf_hdr_t *data_hdr; + arc_buf_hdr_t *meta_hdr; + + /* + * We keep the sublist lock until we're finished, to prevent + * the headers from being destroyed via arc_evict_state(). + */ + data_mls = multilist_sublist_lock(data_ml, data_idx); + meta_mls = multilist_sublist_lock(meta_ml, meta_idx); + + /* + * These two loops are to ensure we skip any markers that + * might be at the tail of the lists due to arc_evict_state(). + */ + + for (data_hdr = multilist_sublist_tail(data_mls); data_hdr != NULL; + data_hdr = multilist_sublist_prev(data_mls, data_hdr)) { + if (data_hdr->b_spa != 0) + break; + } + + for (meta_hdr = multilist_sublist_tail(meta_mls); meta_hdr != NULL; + meta_hdr = multilist_sublist_prev(meta_mls, meta_hdr)) { + if (meta_hdr->b_spa != 0) + break; + } + + if (data_hdr == NULL && meta_hdr == NULL) { + type = ARC_BUFC_DATA; + } else if (data_hdr == NULL) { + ASSERT3P(meta_hdr, !=, NULL); + type = ARC_BUFC_METADATA; + } else if (meta_hdr == NULL) { + ASSERT3P(data_hdr, !=, NULL); + type = ARC_BUFC_DATA; + } else { + ASSERT3P(data_hdr, !=, NULL); + ASSERT3P(meta_hdr, !=, NULL); + + /* The headers can't be on the sublist without an L1 header */ + ASSERT(HDR_HAS_L1HDR(data_hdr)); + ASSERT(HDR_HAS_L1HDR(meta_hdr)); + + if (data_hdr->b_l1hdr.b_arc_access < + meta_hdr->b_l1hdr.b_arc_access) { + type = ARC_BUFC_DATA; + } else { + type = ARC_BUFC_METADATA; + } + } + + multilist_sublist_unlock(meta_mls); + multilist_sublist_unlock(data_mls); + + return (type); +} + +/* + * Evict buffers from the cache, such that arc_size is capped by arc_c. 
+ */ +static uint64_t +arc_adjust(void) +{ + uint64_t total_evicted = 0; + uint64_t bytes; + int64_t target; + + /* + * If we're over arc_meta_limit, we want to correct that before + * potentially evicting data buffers below. + */ + total_evicted += arc_adjust_meta(); + + /* + * Adjust MRU size + * + * If we're over the target cache size, we want to evict enough + * from the list to get back to our target size. We don't want + * to evict too much from the MRU, such that it drops below + * arc_p. So, if we're over our target cache size more than + * the MRU is over arc_p, we'll evict enough to get back to + * arc_p here, and then evict more from the MFU below. + */ + target = MIN((int64_t)(arc_size - arc_c), + (int64_t)(arc_anon->arcs_size + arc_mru->arcs_size + arc_meta_used - + arc_p)); + + /* + * If we're below arc_meta_min, always prefer to evict data. + * Otherwise, try to satisfy the requested number of bytes to + * evict from the type which contains older buffers; in an + * effort to keep newer buffers in the cache regardless of their + * type. If we cannot satisfy the number of bytes from this + * type, spill over into the next type. + */ + if (arc_adjust_type(arc_mru) == ARC_BUFC_METADATA && + arc_meta_used > arc_meta_min) { + bytes = arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_METADATA); + total_evicted += bytes; + + /* + * If we couldn't evict our target number of bytes from + * metadata, we try to get the rest from data. + */ + target -= bytes; + + total_evicted += + arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_DATA); + } else { + bytes = arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_DATA); + total_evicted += bytes; + + /* + * If we couldn't evict our target number of bytes from + * data, we try to get the rest from metadata. 
+ */ + target -= bytes; + + total_evicted += + arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_METADATA); + } + + /* + * Adjust MFU size + * + * Now that we've tried to evict enough from the MRU to get its + * size back to arc_p, if we're still above the target cache + * size, we evict the rest from the MFU. + */ + target = arc_size - arc_c; + + if (arc_adjust_type(arc_mru) == ARC_BUFC_METADATA && + arc_meta_used > arc_meta_min) { + bytes = arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_METADATA); + total_evicted += bytes; + + /* + * If we couldn't evict our target number of bytes from + * metadata, we try to get the rest from data. + */ + target -= bytes; + + total_evicted += + arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_DATA); + } else { + bytes = arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_DATA); + total_evicted += bytes; + + /* + * If we couldn't evict our target number of bytes from + * data, we try to get the rest from metadata. + */ + target -= bytes; + + total_evicted += + arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_METADATA); + } + + /* + * Adjust ghost lists + * + * In addition to the above, the ARC also defines target values + * for the ghost lists. The sum of the mru list and mru ghost + * list should never exceed the target size of the cache, and + * the sum of the mru list, mfu list, mru ghost list, and mfu + * ghost list should never exceed twice the target size of the + * cache. The following logic enforces these limits on the ghost + * caches, and evicts from them as needed. 
+ */ + target = arc_mru->arcs_size + arc_mru_ghost->arcs_size - arc_c; + + bytes = arc_adjust_impl(arc_mru_ghost, 0, target, ARC_BUFC_DATA); + total_evicted += bytes; + + target -= bytes; + + total_evicted += + arc_adjust_impl(arc_mru_ghost, 0, target, ARC_BUFC_METADATA); + + /* + * We assume the sum of the mru list and mfu list is less than + * or equal to arc_c (we enforced this above), which means we + * can use the simpler of the two equations below: + * + * mru + mfu + mru ghost + mfu ghost <= 2 * arc_c + * mru ghost + mfu ghost <= arc_c + */ + target = arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size - arc_c; + + bytes = arc_adjust_impl(arc_mfu_ghost, 0, target, ARC_BUFC_DATA); + total_evicted += bytes; + + target -= bytes; + + total_evicted += + arc_adjust_impl(arc_mfu_ghost, 0, target, ARC_BUFC_METADATA); + + return (total_evicted); +} + static void arc_do_user_evicts(void) { - mutex_enter(&arc_eviction_mtx); + mutex_enter(&arc_user_evicts_lock); while (arc_eviction_list != NULL) { arc_buf_t *buf = arc_eviction_list; arc_eviction_list = buf->b_next; mutex_enter(&buf->b_evict_lock); buf->b_hdr = NULL; mutex_exit(&buf->b_evict_lock); - mutex_exit(&arc_eviction_mtx); + mutex_exit(&arc_user_evicts_lock); if (buf->b_efunc != NULL) VERIFY0(buf->b_efunc(buf->b_private)); @@ -2366,146 +2828,39 @@ arc_do_user_evicts(void) buf->b_efunc = NULL; buf->b_private = NULL; kmem_cache_free(buf_cache, buf); - mutex_enter(&arc_eviction_mtx); + mutex_enter(&arc_user_evicts_lock); } - mutex_exit(&arc_eviction_mtx); + mutex_exit(&arc_user_evicts_lock); } -/* - * The goal of this function is to evict enough meta data buffers from the - * ARC in order to enforce the arc_meta_limit. Achieving this is slightly - * more complicated than it appears because it is common for data buffers - * to have holds on meta data buffers. In addition, dnode meta data buffers - * will be held by the dnodes in the block preventing them from being freed. 
- * This means we can't simply traverse the ARC and expect to always find - * enough unheld meta data buffer to release. - * - * Therefore, this function has been updated to make alternating passes - * over the ARC releasing data buffers and then newly unheld meta data - * buffers. This ensures forward progress is maintained and arc_meta_used - * will decrease. Normally this is sufficient, but if required the ARC - * will call the registered prune callbacks causing dentry and inodes to - * be dropped from the VFS cache. This will make dnode meta data buffers - * available for reclaim. - */ -static void -arc_adjust_meta(void) +void +arc_flush(spa_t *spa, boolean_t retry) { - int64_t adjustmnt, delta, prune = 0; - arc_buf_contents_t type = ARC_BUFC_DATA; - unsigned long restarts = zfs_arc_meta_adjust_restarts; - -restart: - /* - * This slightly differs than the way we evict from the mru in - * arc_adjust because we don't have a "target" value (i.e. no - * "meta" arc_p). As a result, I think we can completely - * cannibalize the metadata in the MRU before we evict the - * metadata from the MFU. I think we probably need to implement a - * "metadata arc_p" value to do this properly. - */ - adjustmnt = arc_meta_used - arc_meta_limit; - - if (adjustmnt > 0 && arc_mru->arcs_lsize[type] > 0) { - delta = MIN(arc_mru->arcs_lsize[type], adjustmnt); - arc_evict(arc_mru, 0, delta, FALSE, type); - adjustmnt -= delta; - } - - /* - * We can't afford to recalculate adjustmnt here. If we do, - * new metadata buffers can sneak into the MRU or ANON lists, - * thus penalize the MFU metadata. Although the fudge factor is - * small, it has been empirically shown to be significant for - * certain workloads (e.g. creating many empty directories). As - * such, we use the original calculation for adjustmnt, and - * simply decrement the amount of data evicted from the MRU. 
- */ - - if (adjustmnt > 0 && arc_mfu->arcs_lsize[type] > 0) { - delta = MIN(arc_mfu->arcs_lsize[type], adjustmnt); - arc_evict(arc_mfu, 0, delta, FALSE, type); - } - - adjustmnt = arc_meta_used - arc_meta_limit; - - if (adjustmnt > 0 && arc_mru_ghost->arcs_lsize[type] > 0) { - delta = MIN(adjustmnt, - arc_mru_ghost->arcs_lsize[ARC_BUFC_METADATA]); - arc_evict_ghost(arc_mru_ghost, 0, delta); - } - - if (adjustmnt > 0 && arc_mfu_ghost->arcs_lsize[type] > 0) { - delta = MIN(adjustmnt, - arc_mfu_ghost->arcs_lsize[ARC_BUFC_METADATA]); - arc_evict_ghost(arc_mfu_ghost, 0, delta); - } + uint64_t guid = 0; /* - * If after attempting to make the requested adjustment to the ARC - * the meta limit is still being exceeded then request that the - * higher layers drop some cached objects which have holds on ARC - * meta buffers. Requests to the upper layers will be made with - * increasingly large scan sizes until the ARC is below the limit. + * If retry is TRUE, a spa must not be specified since we have + * no good way to determine if all of a spa's buffers have been + * evicted from an arc state. */ - if (arc_meta_used > arc_meta_limit) { - if (type == ARC_BUFC_DATA) { - type = ARC_BUFC_METADATA; - } else { - type = ARC_BUFC_DATA; - - if (zfs_arc_meta_prune) { - prune += zfs_arc_meta_prune; - arc_do_user_prune(prune); - } - } - - if (restarts > 0) { - restarts--; - goto restart; - } - } -} - -/* - * Flush all *evictable* data from the cache for the given spa. - * NOTE: this will not touch "active" (i.e. referenced) data. 
- */ -void -arc_flush(spa_t *spa) -{ - uint64_t guid = 0; + ASSERT(!retry || spa == 0); if (spa != NULL) guid = spa_load_guid(spa); - while (list_head(&arc_mru->arcs_list[ARC_BUFC_DATA])) { - (void) arc_evict(arc_mru, guid, -1, FALSE, ARC_BUFC_DATA); - if (spa != NULL) - break; - } - while (list_head(&arc_mru->arcs_list[ARC_BUFC_METADATA])) { - (void) arc_evict(arc_mru, guid, -1, FALSE, ARC_BUFC_METADATA); - if (spa != NULL) - break; - } - while (list_head(&arc_mfu->arcs_list[ARC_BUFC_DATA])) { - (void) arc_evict(arc_mfu, guid, -1, FALSE, ARC_BUFC_DATA); - if (spa != NULL) - break; - } - while (list_head(&arc_mfu->arcs_list[ARC_BUFC_METADATA])) { - (void) arc_evict(arc_mfu, guid, -1, FALSE, ARC_BUFC_METADATA); - if (spa != NULL) - break; - } + (void) arc_flush_state(arc_mru, guid, ARC_BUFC_DATA, retry); + (void) arc_flush_state(arc_mru, guid, ARC_BUFC_METADATA, retry); + + (void) arc_flush_state(arc_mfu, guid, ARC_BUFC_DATA, retry); + (void) arc_flush_state(arc_mfu, guid, ARC_BUFC_METADATA, retry); + + (void) arc_flush_state(arc_mru_ghost, guid, ARC_BUFC_DATA, retry); + (void) arc_flush_state(arc_mru_ghost, guid, ARC_BUFC_METADATA, retry); - arc_evict_ghost(arc_mru_ghost, guid, -1); - arc_evict_ghost(arc_mfu_ghost, guid, -1); + (void) arc_flush_state(arc_mfu_ghost, guid, ARC_BUFC_DATA, retry); + (void) arc_flush_state(arc_mfu_ghost, guid, ARC_BUFC_METADATA, retry); - mutex_enter(&arc_reclaim_thr_lock); arc_do_user_evicts(); - mutex_exit(&arc_reclaim_thr_lock); ASSERT(spa || arc_eviction_list == NULL); } @@ -2538,7 +2893,7 @@ arc_shrink(uint64_t bytes) } if (arc_size > arc_c) - arc_adjust(); + (void) arc_adjust(); } static void @@ -2568,6 +2923,7 @@ arc_kmem_reap_now(arc_reclaim_strategy_t strat, uint64_t bytes) } } + kmem_cache_reap_now(buf_cache); kmem_cache_reap_now(hdr_full_cache); kmem_cache_reap_now(hdr_l2only_cache); } @@ -2578,21 +2934,41 @@ arc_kmem_reap_now(arc_reclaim_strategy_t strat, uint64_t bytes) * reclamation has been entirely delegated to the 
arc_shrinker_func() * which is registered with the VM. To reflect this change in behavior * the arc_reclaim thread has been renamed to arc_adapt. + * + * The following comment from arc_reclaim_thread() in illumos is still + * applicable: + * + * Threads can block in arc_get_data_buf() waiting for this thread to evict + * enough data and signal them to proceed. When this happens, the threads in + * arc_get_data_buf() are sleeping while holding the hash lock for their + * particular arc header. Thus, we must be careful to never sleep on a + * hash lock in this thread. This is to prevent the following deadlock: + * + * - Thread A sleeps on CV in arc_get_data_buf() holding hash lock "L", + * waiting for the reclaim thread to signal it. + * + * - arc_reclaim_thread() tries to acquire hash lock "L" using mutex_enter, + * fails, and goes to sleep forever. + * + * This possible deadlock is avoided by always acquiring a hash lock + * using mutex_tryenter() from arc_reclaim_thread(). */ static void arc_adapt_thread(void) { callb_cpr_t cpr; fstrans_cookie_t cookie; + uint64_t arc_evicted; - CALLB_CPR_INIT(&cpr, &arc_reclaim_thr_lock, callb_generic_cpr, FTAG); + CALLB_CPR_INIT(&cpr, &arc_reclaim_lock, callb_generic_cpr, FTAG); cookie = spl_fstrans_mark(); - mutex_enter(&arc_reclaim_thr_lock); - while (arc_thread_exit == 0) { + mutex_enter(&arc_reclaim_lock); + while (arc_reclaim_thread_exit == 0) { #ifndef _KERNEL arc_reclaim_strategy_t last_reclaim = ARC_RECLAIM_CONS; + mutex_exit(&arc_reclaim_lock); if (spa_get_random(100) == 0) { if (arc_no_grow) { @@ -2614,6 +2990,8 @@ arc_adapt_thread(void) arc_kmem_reap_now(last_reclaim, 0); arc_warm = B_TRUE; } +#else /* _KERNEL */ + mutex_exit(&arc_reclaim_lock); #endif /* !_KERNEL */ /* No recent memory pressure allow the ARC to grow. 
*/ @@ -2621,18 +2999,23 @@ arc_adapt_thread(void) ddi_time_after_eq(ddi_get_lbolt(), arc_grow_time)) arc_no_grow = FALSE; - arc_adjust_meta(); + arc_evicted = arc_adjust(); - arc_adjust(); + /* + * We're either no longer overflowing, or we + * can't evict anything more, so we should wake + * up any threads before we go to sleep. + */ + if (arc_size <= arc_c || arc_evicted == 0) + cv_broadcast(&arc_reclaim_waiters_cv); - if (arc_eviction_list != NULL) - arc_do_user_evicts(); + mutex_enter(&arc_reclaim_lock); /* block until needed, or one second, whichever is shorter */ CALLB_CPR_SAFE_BEGIN(&cpr); - (void) cv_timedwait_interruptible(&arc_reclaim_thr_cv, - &arc_reclaim_thr_lock, (ddi_get_lbolt() + hz)); - CALLB_CPR_SAFE_END(&cpr, &arc_reclaim_thr_lock); + (void) cv_timedwait_interruptible(&arc_reclaim_thread_cv, + &arc_reclaim_lock, (ddi_get_lbolt() + hz)); + CALLB_CPR_SAFE_END(&cpr, &arc_reclaim_lock); /* Allow the module options to be changed */ @@ -2650,14 +3033,59 @@ arc_adapt_thread(void) zfs_arc_meta_limit <= arc_c_max && zfs_arc_meta_limit != arc_meta_limit) arc_meta_limit = zfs_arc_meta_limit; + } + arc_reclaim_thread_exit = 0; + cv_broadcast(&arc_reclaim_thread_cv); + CALLB_CPR_EXIT(&cpr); /* drops arc_reclaim_lock */ + spl_fstrans_unmark(cookie); + thread_exit(); +} + +static void +arc_user_evicts_thread(void) +{ + callb_cpr_t cpr; + fstrans_cookie_t cookie; + CALLB_CPR_INIT(&cpr, &arc_user_evicts_lock, callb_generic_cpr, FTAG); + cookie = spl_fstrans_mark(); + mutex_enter(&arc_user_evicts_lock); + while (!arc_user_evicts_thread_exit) { + mutex_exit(&arc_user_evicts_lock); + + arc_do_user_evicts(); + + /* + * This is necessary in order for the mdb ::arc dcmd to + * show up to date information. Since the ::arc command + * does not call the kstat's update function, without + * this call, the command may show stale stats for the + * anon, mru, mru_ghost, mfu, and mfu_ghost lists. 
Even + * with this change, the data might be up to 1 second + * out of date; but that should suffice. The arc_state_t + * structures can be queried directly if more accurate + * information is needed. + */ + if (arc_ksp != NULL) + arc_ksp->ks_update(arc_ksp, KSTAT_READ); + + mutex_enter(&arc_user_evicts_lock); + + /* + * Block until signaled, or after one second (we need to + * call the arc's kstat update function regularly). + */ + CALLB_CPR_SAFE_BEGIN(&cpr); + (void) cv_timedwait_interruptible(&arc_user_evicts_cv, + &arc_user_evicts_lock, ddi_get_lbolt() + hz); + CALLB_CPR_SAFE_END(&cpr, &arc_user_evicts_lock); } - arc_thread_exit = 0; - cv_broadcast(&arc_reclaim_thr_cv); - CALLB_CPR_EXIT(&cpr); /* drops arc_reclaim_thr_lock */ + arc_user_evicts_thread_exit = FALSE; + cv_broadcast(&arc_user_evicts_cv); + CALLB_CPR_EXIT(&cpr); /* drops arc_user_evicts_lock */ spl_fstrans_unmark(cookie); thread_exit(); } @@ -2759,9 +3187,11 @@ __arc_shrinker_func(struct shrinker *shrink, struct shrink_control *sc) return (SHRINK_STOP); /* Reclaim in progress */ - if (mutex_tryenter(&arc_reclaim_thr_lock) == 0) + if (mutex_tryenter(&arc_reclaim_lock) == 0) return (SHRINK_STOP); + mutex_exit(&arc_reclaim_lock); + /* * Evict the requested number of pages by shrinking arc_c the * requested amount. If there is nothing left to evict just @@ -2780,6 +3210,11 @@ __arc_shrinker_func(struct shrinker *shrink, struct shrink_control *sc) pages = SHRINK_STOP; } + /* + * We've reaped what we can, wake up threads. + */ + cv_broadcast(&arc_reclaim_waiters_cv); + /* * When direct reclaim is observed it usually indicates a rapid * increase in memory pressure. 
This occurs because the kswapd @@ -2795,8 +3230,6 @@ __arc_shrinker_func(struct shrinker *shrink, struct shrink_control *sc) ARCSTAT_BUMP(arcstat_memory_direct_count); } - mutex_exit(&arc_reclaim_thr_lock); - return (pages); } SPL_SHRINKER_CALLBACK_WRAPPER(arc_shrinker_func); @@ -2871,43 +3304,25 @@ arc_adapt(int bytes, arc_state_t *state) } /* - * Check if the cache has reached its limits and eviction is required - * prior to insert. + * Check if arc_size has grown past our upper threshold, determined by + * zfs_arc_overflow_shift. */ -static int -arc_evict_needed(arc_buf_contents_t type) +static boolean_t +arc_is_overflowing(void) { - if (type == ARC_BUFC_METADATA && arc_meta_used >= arc_meta_limit) - return (1); - - if (arc_no_grow) - return (1); + /* Always allow at least one block of overflow */ + uint64_t overflow = MAX(SPA_MAXBLOCKSIZE, + arc_c >> zfs_arc_overflow_shift); - return (arc_size > arc_c); + return (arc_size >= arc_c + overflow); } /* - * The buffer, supplied as the first argument, needs a data block. - * So, if we are at cache max, determine which cache should be victimized. - * We have the following cases: - * - * 1. Insert for MRU, p > sizeof(arc_anon + arc_mru) -> - * In this situation if we're out of space, but the resident size of the MFU is - * under the limit, victimize the MFU cache to satisfy this insertion request. - * - * 2. Insert for MRU, p <= sizeof(arc_anon + arc_mru) -> - * Here, we've used up all of the available space for the MRU, so we need to - * evict from our own cache instead. Evict from the set of resident MRU - * entries. - * - * 3. Insert for MFU (c - p) > sizeof(arc_mfu) -> - * c minus p represents the MFU space in the cache, since p is the size of the - * cache that is dedicated to the MRU. In this situation there's still space on - * the MFU side, so the MRU side needs to be victimized. - * - * 4. Insert for MFU (c - p) < sizeof(arc_mfu) -> - * MFU's resident set is consuming more space than it has been allotted. 
In - * this situation, we must victimize our own cache, the MFU, for this insertion. + * The buffer, supplied as the first argument, needs a data block. If we + * are hitting the hard limit for the cache size, we must sleep, waiting + * for the eviction thread to catch up. If we're past the target size + * but below the hard limit, we'll only signal the reclaim thread and + * continue on. */ static void arc_get_data_buf(arc_buf_t *buf) @@ -2915,91 +3330,54 @@ arc_get_data_buf(arc_buf_t *buf) arc_state_t *state = buf->b_hdr->b_l1hdr.b_state; uint64_t size = buf->b_hdr->b_size; arc_buf_contents_t type = arc_buf_type(buf->b_hdr); - arc_buf_contents_t evict = ARC_BUFC_DATA; - boolean_t recycle = TRUE; arc_adapt(size, state); /* - * We have not yet reached cache maximum size, - * just allocate a new buffer. + * If arc_size is currently overflowing, and has grown past our + * upper limit, we must be adding data faster than the evict + * thread can evict. Thus, to ensure we don't compound the + * problem by adding more data and forcing arc_size to grow even + * further past its target size, we halt and wait for the + * eviction thread to catch up. + * + * It's also possible that the reclaim thread is unable to evict + * enough buffers to get arc_size below the overflow limit (e.g. + * due to buffers being un-evictable, or hash lock collisions). + * In this case, we want to proceed regardless if we're + * overflowing; thus we don't use a while loop here. */ - if (!arc_evict_needed(type)) { - if (type == ARC_BUFC_METADATA) { - buf->b_data = zio_buf_alloc(size); - arc_space_consume(size, ARC_SPACE_META); - } else { - ASSERT(type == ARC_BUFC_DATA); - buf->b_data = zio_data_buf_alloc(size); - arc_space_consume(size, ARC_SPACE_DATA); + if (arc_is_overflowing()) { + mutex_enter(&arc_reclaim_lock); + + /* + * Now that we've acquired the lock, we may no longer be + * over the overflow limit, let's check. + * + * We're ignoring the case of spurious wake ups. 
If that + * were to happen, it'd let this thread consume an ARC + * buffer before it should have (i.e. before we're under + * the overflow limit and were signalled by the reclaim + * thread). As long as that is a rare occurrence, it + * shouldn't cause any harm. + */ + if (arc_is_overflowing()) { + cv_signal(&arc_reclaim_thread_cv); + cv_wait(&arc_reclaim_waiters_cv, &arc_reclaim_lock); } - goto out; - } - /* - * If we are prefetching from the mfu ghost list, this buffer - * will end up on the mru list; so steal space from there. - */ - if (state == arc_mfu_ghost) - state = HDR_PREFETCH(buf->b_hdr) ? arc_mru : arc_mfu; - else if (state == arc_mru_ghost) - state = arc_mru; - - if (state == arc_mru || state == arc_anon) { - uint64_t mru_used = arc_anon->arcs_size + arc_mru->arcs_size; - state = (arc_mfu->arcs_lsize[type] >= size && - arc_p > mru_used) ? arc_mfu : arc_mru; - } else { - /* MFU cases */ - uint64_t mfu_space = arc_c - arc_p; - state = (arc_mru->arcs_lsize[type] >= size && - mfu_space > arc_mfu->arcs_size) ? arc_mru : arc_mfu; + mutex_exit(&arc_reclaim_lock); } - /* - * Evict data buffers prior to metadata buffers, unless we're - * over the metadata limit and adding a metadata buffer. - */ if (type == ARC_BUFC_METADATA) { - if (arc_meta_used >= arc_meta_limit) - evict = ARC_BUFC_METADATA; - else - /* - * In this case, we're evicting data while - * adding metadata. Thus, to prevent recycling a - * data buffer into a metadata buffer, recycling - * is disabled in the following arc_evict call. 
- */ - recycle = FALSE; + buf->b_data = zio_buf_alloc(size); + arc_space_consume(size, ARC_SPACE_META); + } else { + ASSERT(type == ARC_BUFC_DATA); + buf->b_data = zio_data_buf_alloc(size); + arc_space_consume(size, ARC_SPACE_DATA); } - if ((buf->b_data = arc_evict(state, 0, size, recycle, evict)) == NULL) { - if (type == ARC_BUFC_METADATA) { - buf->b_data = zio_buf_alloc(size); - arc_space_consume(size, ARC_SPACE_META); - - /* - * If we are unable to recycle an existing meta buffer - * signal the reclaim thread. It will notify users - * via the prune callback to drop references. The - * prune callback in run in the context of the reclaim - * thread to avoid deadlocking on the hash_lock. - * Of course, only do this when recycle is true. - */ - if (recycle) - cv_signal(&arc_reclaim_thr_cv); - } else { - ASSERT(type == ARC_BUFC_DATA); - buf->b_data = zio_data_buf_alloc(size); - arc_space_consume(size, ARC_SPACE_DATA); - } - - /* Only bump this if we tried to recycle and failed */ - if (recycle) - ARCSTAT_BUMP(arcstat_recycle_miss); - } - ASSERT(buf->b_data != NULL); -out: /* * Update the state size. Note that ghost states have a * "ghost size" and so don't need to be updated. @@ -3008,7 +3386,17 @@ arc_get_data_buf(arc_buf_t *buf) arc_buf_hdr_t *hdr = buf->b_hdr; atomic_add_64(&hdr->b_l1hdr.b_state->arcs_size, size); - if (list_link_active(&hdr->b_l1hdr.b_arc_node)) { + + /* + * If this is reached via arc_read, the link is + * protected by the hash lock. If reached via + * arc_buf_alloc, the header should not be accessed by + * any other thread. And, if reached via arc_read_done, + * the hash lock will protect it if it's found in the + * hash table; otherwise no other thread should be + * trying to [add|remove]_reference it. 
+ */ + if (multilist_link_active(&hdr->b_l1hdr.b_arc_node)) { ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); atomic_add_64(&hdr->b_l1hdr.b_state->arcs_lsize[type], size); @@ -3017,8 +3405,7 @@ arc_get_data_buf(arc_buf_t *buf) * If we are growing the cache, and we are adding anonymous * data, and we have outgrown arc_p, update arc_p */ - if (!zfs_arc_p_aggressive_disable && - arc_size < arc_c && hdr->b_l1hdr.b_state == arc_anon && + if (arc_size < arc_c && hdr->b_l1hdr.b_state == arc_anon && arc_anon->arcs_size + arc_mru->arcs_size > arc_p) arc_p = MIN(arc_c, arc_p + size); } @@ -3061,7 +3448,8 @@ arc_access(arc_buf_hdr_t *hdr, kmutex_t *hash_lock) */ if (HDR_PREFETCH(hdr)) { if (refcount_count(&hdr->b_l1hdr.b_refcnt) == 0) { - ASSERT(list_link_active( + /* link protected by hash lock */ + ASSERT(multilist_link_active( &hdr->b_l1hdr.b_arc_node)); } else { hdr->b_flags &= ~ARC_FLAG_PREFETCH; @@ -3125,7 +3513,8 @@ arc_access(arc_buf_hdr_t *hdr, kmutex_t *hash_lock) */ if ((HDR_PREFETCH(hdr)) != 0) { ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); - ASSERT(list_link_active(&hdr->b_l1hdr.b_arc_node)); + /* link protected by hash_lock */ + ASSERT(multilist_link_active(&hdr->b_l1hdr.b_arc_node)); } atomic_inc_32(&hdr->b_l1hdr.b_mfu_hits); ARCSTAT_BUMP(arcstat_mfu_hits); @@ -3517,7 +3906,7 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done, ASSERT(GHOST_STATE(hdr->b_l1hdr.b_state)); ASSERT(!HDR_IO_IN_PROGRESS(hdr)); ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); - ASSERT(hdr->b_l1hdr.b_buf == NULL); + ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); /* if this is a prefetch, we don't have a reference */ if (*arc_flags & ARC_FLAG_PREFETCH) @@ -3813,7 +4202,7 @@ arc_clear_callback(arc_buf_t *buf) if (hdr->b_l1hdr.b_datacnt > 1) { mutex_exit(&buf->b_evict_lock); - arc_buf_destroy(buf, FALSE, TRUE); + arc_buf_destroy(buf, TRUE); } else { ASSERT(buf == hdr->b_l1hdr.b_buf); hdr->b_flags |= ARC_FLAG_BUF_AVAILABLE; @@ -3839,13 +4228,15 @@ 
arc_release(arc_buf_t *buf, void *tag) arc_buf_hdr_t *hdr = buf->b_hdr; /* - * It would be nice to assert that if it's DMU metadata (level > + * It would be nice to assert that if its DMU metadata (level > * 0 || it's the dnode file), then it must be syncing context. * But we don't know that information at this level. */ mutex_enter(&buf->b_evict_lock); + ASSERT(HDR_HAS_L1HDR(hdr)); + /* * We don't grab the hash lock prior to this check, because if * the buffer's header is in the arc_anon state, it won't be @@ -3892,6 +4283,13 @@ arc_release(arc_buf_t *buf, void *tag) mutex_enter(&hdr->b_l2hdr.b_dev->l2ad_mtx); list_remove(&hdr->b_l2hdr.b_dev->l2ad_buflist, hdr); + + /* + * We don't want to leak the b_tmp_cdata buffer that was + * allocated in l2arc_write_buffers() + */ + arc_buf_l2_cdata_free(hdr); + mutex_exit(&hdr->b_l2hdr.b_dev->l2ad_mtx); hdr->b_flags &= ~ARC_FLAG_HAS_L2HDR; @@ -3964,6 +4362,7 @@ arc_release(arc_buf_t *buf, void *tag) nhdr->b_l1hdr.b_datacnt = 1; nhdr->b_l1hdr.b_state = arc_anon; nhdr->b_l1hdr.b_arc_access = 0; + nhdr->b_l1hdr.b_tmp_cdata = NULL; nhdr->b_freeze_cksum = NULL; (void) refcount_add(&nhdr->b_l1hdr.b_refcnt, tag); @@ -3973,8 +4372,8 @@ arc_release(arc_buf_t *buf, void *tag) } else { mutex_exit(&buf->b_evict_lock); ASSERT(refcount_count(&hdr->b_l1hdr.b_refcnt) == 1); - /* protected by hash lock */ - ASSERT(!list_link_active(&hdr->b_l1hdr.b_arc_node)); + /* protected by hash lock, or hdr is on arc_anon */ + ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node)); ASSERT(!HDR_IO_IN_PROGRESS(hdr)); hdr->b_l1hdr.b_mru_hits = 0; hdr->b_l1hdr.b_mru_ghost_hits = 0; @@ -4297,11 +4696,50 @@ arc_kstat_update(kstat_t *ksp, int rw) return (0); } +/* + * This function *must* return indices evenly distributed between all + * sublists of the multilist. 
This is needed due to how the ARC eviction + * code is laid out; arc_evict_state() assumes ARC buffers are evenly + * distributed between all sublists and uses this assumption when + * deciding which sublist to evict from and how much to evict from it. + */ +unsigned int +arc_state_multilist_index_func(multilist_t *ml, void *obj) +{ + arc_buf_hdr_t *hdr = obj; + + /* + * We rely on b_dva to generate evenly distributed index + * numbers using buf_hash below. So, as an added precaution, + * let's make sure we never add empty buffers to the arc lists. + */ + ASSERT(!BUF_EMPTY(hdr)); + + /* + * The assumption here, is the hash value for a given + * arc_buf_hdr_t will remain constant throughout its lifetime + * (i.e. its b_spa, b_dva, and b_birth fields don't change). + * Thus, we don't need to store the header's sublist index + * on insertion, as this index can be recalculated on removal. + * + * Also, the low order bits of the hash value are thought to be + * distributed evenly. Otherwise, in the case that the multilist + * has a power of two number of sublists, each sublists' usage + * would not be evenly distributed. 
+ */ + return (buf_hash(hdr->b_spa, &hdr->b_dva, hdr->b_birth) % + multilist_get_num_sublists(ml)); +} + void arc_init(void) { - mutex_init(&arc_reclaim_thr_lock, NULL, MUTEX_DEFAULT, NULL); - cv_init(&arc_reclaim_thr_cv, NULL, CV_DEFAULT, NULL); + mutex_init(&arc_reclaim_lock, NULL, MUTEX_DEFAULT, NULL); + cv_init(&arc_reclaim_thread_cv, NULL, CV_DEFAULT, NULL); + cv_init(&arc_reclaim_waiters_cv, NULL, CV_DEFAULT, NULL); + + mutex_init(&arc_user_evicts_lock, NULL, MUTEX_DEFAULT, NULL); + cv_init(&arc_user_evicts_cv, NULL, CV_DEFAULT, NULL); /* Convert seconds to clock ticks */ zfs_arc_min_prefetch_lifespan = 1 * hz; @@ -4349,6 +4787,9 @@ arc_init(void) if (zfs_arc_meta_limit > 0 && zfs_arc_meta_limit <= arc_c_max) arc_meta_limit = zfs_arc_meta_limit; + if (zfs_arc_num_sublists_per_state < 1) + zfs_arc_num_sublists_per_state = num_online_cpus(); + /* if kmem_flags are set, lets try to use less memory */ if (kmem_debugging()) arc_c = arc_c / 2; @@ -4363,43 +4804,46 @@ arc_init(void) arc_l2c_only = &ARC_l2c_only; arc_size = 0; - mutex_init(&arc_anon->arcs_mtx, NULL, MUTEX_DEFAULT, NULL); - mutex_init(&arc_mru->arcs_mtx, NULL, MUTEX_DEFAULT, NULL); - mutex_init(&arc_mru_ghost->arcs_mtx, NULL, MUTEX_DEFAULT, NULL); - mutex_init(&arc_mfu->arcs_mtx, NULL, MUTEX_DEFAULT, NULL); - mutex_init(&arc_mfu_ghost->arcs_mtx, NULL, MUTEX_DEFAULT, NULL); - mutex_init(&arc_l2c_only->arcs_mtx, NULL, MUTEX_DEFAULT, NULL); - - list_create(&arc_mru->arcs_list[ARC_BUFC_METADATA], + multilist_create(&arc_mru->arcs_list[ARC_BUFC_METADATA], sizeof (arc_buf_hdr_t), - offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node)); - list_create(&arc_mru->arcs_list[ARC_BUFC_DATA], + offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), + zfs_arc_num_sublists_per_state, arc_state_multilist_index_func); + multilist_create(&arc_mru->arcs_list[ARC_BUFC_DATA], sizeof (arc_buf_hdr_t), - offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node)); - list_create(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA], + offsetof(arc_buf_hdr_t, 
b_l1hdr.b_arc_node), + zfs_arc_num_sublists_per_state, arc_state_multilist_index_func); + multilist_create(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA], sizeof (arc_buf_hdr_t), - offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node)); - list_create(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA], + offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), + zfs_arc_num_sublists_per_state, arc_state_multilist_index_func); + multilist_create(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA], sizeof (arc_buf_hdr_t), - offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node)); - list_create(&arc_mfu->arcs_list[ARC_BUFC_METADATA], + offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), + zfs_arc_num_sublists_per_state, arc_state_multilist_index_func); + multilist_create(&arc_mfu->arcs_list[ARC_BUFC_METADATA], sizeof (arc_buf_hdr_t), - offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node)); - list_create(&arc_mfu->arcs_list[ARC_BUFC_DATA], + offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), + zfs_arc_num_sublists_per_state, arc_state_multilist_index_func); + multilist_create(&arc_mfu->arcs_list[ARC_BUFC_DATA], sizeof (arc_buf_hdr_t), - offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node)); - list_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA], + offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), + zfs_arc_num_sublists_per_state, arc_state_multilist_index_func); + multilist_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA], sizeof (arc_buf_hdr_t), - offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node)); - list_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA], + offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), + zfs_arc_num_sublists_per_state, arc_state_multilist_index_func); + multilist_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA], sizeof (arc_buf_hdr_t), - offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node)); - list_create(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA], + offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), + zfs_arc_num_sublists_per_state, arc_state_multilist_index_func); + multilist_create(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA], sizeof (arc_buf_hdr_t), - 
offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node)); - list_create(&arc_l2c_only->arcs_list[ARC_BUFC_DATA], + offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), + zfs_arc_num_sublists_per_state, arc_state_multilist_index_func); + multilist_create(&arc_l2c_only->arcs_list[ARC_BUFC_DATA], sizeof (arc_buf_hdr_t), - offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node)); + offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), + zfs_arc_num_sublists_per_state, arc_state_multilist_index_func); arc_anon->arcs_state = ARC_STATE_ANON; arc_mru->arcs_state = ARC_STATE_MRU; @@ -4410,12 +4854,12 @@ arc_init(void) buf_init(); - arc_thread_exit = 0; + arc_reclaim_thread_exit = FALSE; + arc_user_evicts_thread_exit = FALSE; list_create(&arc_prune_list, sizeof (arc_prune_t), offsetof(arc_prune_t, p_node)); arc_eviction_list = NULL; mutex_init(&arc_prune_mtx, NULL, MUTEX_DEFAULT, NULL); - mutex_init(&arc_eviction_mtx, NULL, MUTEX_DEFAULT, NULL); bzero(&arc_eviction_hdr, sizeof (arc_buf_hdr_t)); arc_ksp = kstat_create("zfs", 0, "arcstats", "misc", KSTAT_TYPE_NAMED, @@ -4430,6 +4874,9 @@ arc_init(void) (void) thread_create(NULL, 0, arc_adapt_thread, NULL, 0, &p0, TS_RUN, minclsyspri); + (void) thread_create(NULL, 0, arc_user_evicts_thread, NULL, 0, &p0, + TS_RUN, minclsyspri); + arc_dead = FALSE; arc_warm = B_FALSE; @@ -4458,17 +4905,36 @@ arc_fini(void) { arc_prune_t *p; - mutex_enter(&arc_reclaim_thr_lock); #ifdef _KERNEL spl_unregister_shrinker(&arc_shrinker); #endif /* _KERNEL */ - arc_thread_exit = 1; - while (arc_thread_exit != 0) - cv_wait(&arc_reclaim_thr_cv, &arc_reclaim_thr_lock); - mutex_exit(&arc_reclaim_thr_lock); + mutex_enter(&arc_reclaim_lock); + arc_reclaim_thread_exit = TRUE; + /* + * The reclaim thread will set arc_reclaim_thread_exit back to + * FALSE when it is finished exiting; we're waiting for that. 
+ */ + while (arc_reclaim_thread_exit) { + cv_signal(&arc_reclaim_thread_cv); + cv_wait(&arc_reclaim_thread_cv, &arc_reclaim_lock); + } + mutex_exit(&arc_reclaim_lock); + + mutex_enter(&arc_user_evicts_lock); + arc_user_evicts_thread_exit = TRUE; + /* + * The user evicts thread will set arc_user_evicts_thread_exit + * to FALSE when it is finished exiting; we're waiting for that. + */ + while (arc_user_evicts_thread_exit) { + cv_signal(&arc_user_evicts_cv); + cv_wait(&arc_user_evicts_cv, &arc_user_evicts_lock); + } + mutex_exit(&arc_user_evicts_lock); - arc_flush(NULL); + /* Use TRUE to ensure *all* buffers are evicted */ + arc_flush(NULL, TRUE); arc_dead = TRUE; @@ -4488,25 +4954,23 @@ arc_fini(void) list_destroy(&arc_prune_list); mutex_destroy(&arc_prune_mtx); - mutex_destroy(&arc_eviction_mtx); - mutex_destroy(&arc_reclaim_thr_lock); - cv_destroy(&arc_reclaim_thr_cv); - - list_destroy(&arc_mru->arcs_list[ARC_BUFC_METADATA]); - list_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA]); - list_destroy(&arc_mfu->arcs_list[ARC_BUFC_METADATA]); - list_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA]); - list_destroy(&arc_mru->arcs_list[ARC_BUFC_DATA]); - list_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA]); - list_destroy(&arc_mfu->arcs_list[ARC_BUFC_DATA]); - list_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA]); - - mutex_destroy(&arc_anon->arcs_mtx); - mutex_destroy(&arc_mru->arcs_mtx); - mutex_destroy(&arc_mru_ghost->arcs_mtx); - mutex_destroy(&arc_mfu->arcs_mtx); - mutex_destroy(&arc_mfu_ghost->arcs_mtx); - mutex_destroy(&arc_l2c_only->arcs_mtx); + mutex_destroy(&arc_reclaim_lock); + cv_destroy(&arc_reclaim_thread_cv); + cv_destroy(&arc_reclaim_waiters_cv); + + mutex_destroy(&arc_user_evicts_lock); + cv_destroy(&arc_user_evicts_cv); + + multilist_destroy(&arc_mru->arcs_list[ARC_BUFC_METADATA]); + multilist_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA]); + multilist_destroy(&arc_mfu->arcs_list[ARC_BUFC_METADATA]); + 
multilist_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA]); + multilist_destroy(&arc_mru->arcs_list[ARC_BUFC_DATA]); + multilist_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA]); + multilist_destroy(&arc_mfu->arcs_list[ARC_BUFC_DATA]); + multilist_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA]); + multilist_destroy(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA]); + multilist_destroy(&arc_l2c_only->arcs_list[ARC_BUFC_DATA]); buf_fini(); @@ -4835,34 +5299,62 @@ l2arc_write_done(zio_t *zio) if (zio->io_error != 0) ARCSTAT_BUMP(arcstat_l2_writes_error); - mutex_enter(&dev->l2ad_mtx); - /* * All writes completed, or an error was hit. */ +top: + mutex_enter(&dev->l2ad_mtx); for (hdr = list_prev(buflist, head); hdr; hdr = hdr_prev) { hdr_prev = list_prev(buflist, hdr); hash_lock = HDR_LOCK(hdr); + + /* + * We cannot use mutex_enter or else we can deadlock + * with l2arc_write_buffers (due to swapping the order + * the hash lock and l2ad_mtx are taken). + */ if (!mutex_tryenter(hash_lock)) { /* - * This buffer misses out. It may be in a stage - * of eviction. Its ARC_FLAG_L2_WRITING flag will be - * left set, denying reads to this buffer. + * Missed the hash lock. We must retry so we + * don't leave the ARC_FLAG_L2_WRITING bit set. */ - ARCSTAT_BUMP(arcstat_l2_writes_hdr_miss); - continue; + ARCSTAT_BUMP(arcstat_l2_writes_lock_retry); + + /* + * We don't want to rescan the headers we've + * already marked as having been written out, so + * we reinsert the head node so we can pick up + * where we left off. + */ + list_remove(buflist, head); + list_insert_after(buflist, hdr, head); + + mutex_exit(&dev->l2ad_mtx); + + /* + * We wait for the hash lock to become available + * to try and prevent busy waiting, and increase + * the chance we'll be able to acquire the lock + * the next time around. 
+ */ + mutex_enter(hash_lock); + mutex_exit(hash_lock); + goto top; } /* - * It's possible that this buffer got evicted from the L1 cache - * before we grabbed the vdev + hash locks, in which case - * arc_hdr_realloc freed b_tmp_cdata for us if it was allocated. - * Only free the buffer if we still have an L1 hdr. + * We could not have been moved into the arc_l2c_only + * state while in-flight due to our ARC_FLAG_L2_WRITING + * bit being set. Let's just ensure that's being enforced. + */ + ASSERT(HDR_HAS_L1HDR(hdr)); + + /* + * We may have allocated a buffer for L2ARC compression, + * we must release it to avoid leaking this data. */ - if (HDR_HAS_L1HDR(hdr) && hdr->b_l1hdr.b_tmp_cdata != NULL && - HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF) - l2arc_release_cdata_buf(hdr); + l2arc_release_cdata_buf(hdr); if (zio->io_error != 0) { /* @@ -4876,7 +5368,8 @@ l2arc_write_done(zio_t *zio) } /* - * Allow ARC to begin reads to this L2ARC entry. + * Allow ARC to begin reads and ghost list evictions to + * this L2ARC entry. */ hdr->b_flags &= ~ARC_FLAG_L2_WRITING; @@ -4984,35 +5477,37 @@ l2arc_read_done(zio_t *zio) * the data lists. This function returns a locked list, and also returns * the lock pointer. 
*/ -static list_t * -l2arc_list_locked(int list_num, kmutex_t **lock) +static multilist_sublist_t * +l2arc_sublist_lock(int list_num) { - list_t *list = NULL; + multilist_t *ml = NULL; + unsigned int idx; ASSERT(list_num >= 0 && list_num <= 3); switch (list_num) { case 0: - list = &arc_mfu->arcs_list[ARC_BUFC_METADATA]; - *lock = &arc_mfu->arcs_mtx; + ml = &arc_mfu->arcs_list[ARC_BUFC_METADATA]; break; case 1: - list = &arc_mru->arcs_list[ARC_BUFC_METADATA]; - *lock = &arc_mru->arcs_mtx; + ml = &arc_mru->arcs_list[ARC_BUFC_METADATA]; break; case 2: - list = &arc_mfu->arcs_list[ARC_BUFC_DATA]; - *lock = &arc_mfu->arcs_mtx; + ml = &arc_mfu->arcs_list[ARC_BUFC_DATA]; break; case 3: - list = &arc_mru->arcs_list[ARC_BUFC_DATA]; - *lock = &arc_mru->arcs_mtx; + ml = &arc_mru->arcs_list[ARC_BUFC_DATA]; break; } - ASSERT(!(MUTEX_HELD(*lock))); - mutex_enter(*lock); - return (list); + /* + * Return a randomly-selected sublist. This is acceptable + * because the caller feeds only a little bit of data for each + * call (8MB). Subsequent calls will result in different + * sublists being selected. + */ + idx = multilist_get_random_index(ml); + return (multilist_sublist_lock(ml, idx)); } /* @@ -5058,6 +5553,12 @@ l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all) hdr_prev = list_prev(buflist, hdr); hash_lock = HDR_LOCK(hdr); + + /* + * We cannot use mutex_enter or else we can deadlock + * with l2arc_write_buffers (due to swapping the order + * the hash lock and l2ad_mtx are taken). + */ if (!mutex_tryenter(hash_lock)) { /* * Missed the hash lock. Retry. @@ -5122,8 +5623,9 @@ l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all) hdr->b_flags &= ~ARC_FLAG_HAS_L2HDR; list_remove(buflist, hdr); - /* This may have been leftover after a failed write. 
*/ - hdr->b_flags &= ~ARC_FLAG_L2_WRITING; + /* Ensure this header has finished being written */ + ASSERT(!HDR_L2_WRITING(hdr)); + ASSERT3P(hdr->b_l1hdr.b_tmp_cdata, ==, NULL); } mutex_exit(hash_lock); } @@ -5149,11 +5651,9 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz, boolean_t *headroom_boost) { arc_buf_hdr_t *hdr, *hdr_prev, *head; - list_t *list; uint64_t write_asize, write_psize, write_sz, headroom, buf_compress_minsz; void *buf_data; - kmutex_t *list_lock = NULL; boolean_t full; l2arc_write_callback_t *cb; zio_t *pio, *wzio; @@ -5182,12 +5682,10 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz, /* * Copy buffers for L2ARC writing. */ - mutex_enter(&dev->l2ad_mtx); for (try = 0; try <= 3; try++) { + multilist_sublist_t *mls = l2arc_sublist_lock(try); uint64_t passed_sz = 0; - list = l2arc_list_locked(try, &list_lock); - /* * L2ARC fast warmup. * @@ -5195,9 +5693,9 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz, * head of the ARC lists rather than the tail. */ if (arc_warm == B_FALSE) - hdr = list_head(list); + hdr = multilist_sublist_head(mls); else - hdr = list_tail(list); + hdr = multilist_sublist_tail(mls); headroom = target_sz * l2arc_headroom; if (do_headroom_boost) @@ -5208,9 +5706,9 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz, uint64_t buf_sz; if (arc_warm == B_FALSE) - hdr_prev = list_next(list, hdr); + hdr_prev = multilist_sublist_next(mls, hdr); else - hdr_prev = list_prev(list, hdr); + hdr_prev = multilist_sublist_prev(mls, hdr); hash_lock = HDR_LOCK(hdr); if (!mutex_tryenter(hash_lock)) { @@ -5246,7 +5744,9 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz, * l2arc_write_done() can find where the * write buffers begin without searching. 
*/ + mutex_enter(&dev->l2ad_mtx); list_insert_head(&dev->l2ad_buflist, head); + mutex_exit(&dev->l2ad_mtx); cb = kmem_alloc(sizeof (l2arc_write_callback_t), KM_SLEEP); @@ -5278,7 +5778,9 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz, buf_sz = hdr->b_size; hdr->b_flags |= ARC_FLAG_HAS_L2HDR; + mutex_enter(&dev->l2ad_mtx); list_insert_head(&dev->l2ad_buflist, hdr); + mutex_exit(&dev->l2ad_mtx); /* * Compute and store the buffer cksum before @@ -5292,7 +5794,7 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz, write_sz += buf_sz; } - mutex_exit(list_lock); + multilist_sublist_unlock(mls); if (full == B_TRUE) break; @@ -5301,12 +5803,13 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz, /* No buffers selected for writing? */ if (pio == NULL) { ASSERT0(write_sz); - mutex_exit(&dev->l2ad_mtx); ASSERT(!HDR_HAS_L1HDR(head)); kmem_cache_free(hdr_l2only_cache, head); return (0); } + mutex_enter(&dev->l2ad_mtx); + /* * Now start writing the buffers. We're starting at the write head * and work backwards, retracing the course of the buffer selector @@ -5316,6 +5819,14 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz, hdr = list_prev(&dev->l2ad_buflist, hdr)) { uint64_t buf_sz; + /* + * We rely on the L1 portion of the header below, so + * it's invalid for this header to have been evicted out + * of the ghost cache, prior to being written out. The + * ARC_FLAG_L2_WRITING bit ensures this won't happen. 
+ */ + ASSERT(HDR_HAS_L1HDR(hdr)); + /* * We shouldn't need to lock the buffer here, since we flagged * it as ARC_FLAG_L2_WRITING in the previous step, but we must @@ -5538,8 +6049,26 @@ l2arc_decompress_zio(zio_t *zio, arc_buf_hdr_t *hdr, enum zio_compress c) static void l2arc_release_cdata_buf(arc_buf_hdr_t *hdr) { + enum zio_compress comp = HDR_GET_COMPRESS(hdr); + ASSERT(HDR_HAS_L1HDR(hdr)); - if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_EMPTY) { + ASSERT(comp == ZIO_COMPRESS_OFF || L2ARC_IS_VALID_COMPRESS(comp)); + + if (comp == ZIO_COMPRESS_OFF) { + /* + * In this case, b_tmp_cdata points to the same buffer + * as the arc_buf_t's b_data field. We don't want to + * free it, since the arc_buf_t will handle that. + */ + hdr->b_l1hdr.b_tmp_cdata = NULL; + } else if (comp == ZIO_COMPRESS_EMPTY) { + /* + * In this case, b_tmp_cdata was compressed to an empty + * buffer, thus there's nothing to free and b_tmp_cdata + * should have been set to NULL in l2arc_write_buffers(). + */ + ASSERT3P(hdr->b_l1hdr.b_tmp_cdata, ==, NULL); + } else { /* * If the data was compressed, then we've allocated a * temporary buffer for it, so now we need to release it. 
@@ -5547,8 +6076,9 @@ l2arc_release_cdata_buf(arc_buf_hdr_t *hdr) ASSERT(hdr->b_l1hdr.b_tmp_cdata != NULL); zio_data_buf_free(hdr->b_l1hdr.b_tmp_cdata, hdr->b_size); + hdr->b_l1hdr.b_tmp_cdata = NULL; } - hdr->b_l1hdr.b_tmp_cdata = NULL; + } /* @@ -5834,6 +6364,9 @@ MODULE_PARM_DESC(zfs_arc_max, "Max arc size"); module_param(zfs_arc_meta_limit, ulong, 0644); MODULE_PARM_DESC(zfs_arc_meta_limit, "Meta limit for arc size"); +module_param(zfs_arc_meta_min, ulong, 0644); +MODULE_PARM_DESC(zfs_arc_meta_min, "Min arc metadata"); + module_param(zfs_arc_meta_prune, int, 0644); MODULE_PARM_DESC(zfs_arc_meta_prune, "Meta objects to scan for prune"); @@ -5865,6 +6398,10 @@ MODULE_PARM_DESC(zfs_arc_memory_throttle_disable, "disable memory throttle"); module_param(zfs_arc_min_prefetch_lifespan, int, 0644); MODULE_PARM_DESC(zfs_arc_min_prefetch_lifespan, "Min life of prefetch block"); +module_param(zfs_arc_num_sublists_per_state, int, 0644); +MODULE_PARM_DESC(zfs_arc_num_sublists_per_state, + "Number of sublists used in each of the ARC state lists"); + module_param(l2arc_write_max, ulong, 0644); MODULE_PARM_DESC(l2arc_write_max, "Max write bytes per interval"); diff --git a/module/zfs/dbuf_stats.c b/module/zfs/dbuf_stats.c index 5e7eaf1acf33..afdf828ed542 100644 --- a/module/zfs/dbuf_stats.c +++ b/module/zfs/dbuf_stats.c @@ -48,12 +48,12 @@ dbuf_stats_hash_table_headers(char *buf, size_t size) (void) snprintf(buf, size, "%-88s | %-124s | %s\n" "%-16s %-8s %-8s %-8s %-8s %-8s %-8s %-5s %-5s %5s | " - "%-5s %-5s %-6s %-8s %-6s %-8s %-12s " + "%-5s %-5s %-8s %-6s %-8s %-12s " "%-6s %-6s %-6s %-6s %-6s %-8s %-8s %-8s %-5s | " "%-6s %-6s %-8s %-8s %-6s %-6s %-5s %-8s %-8s\n", "dbuf", "arcbuf", "dnode", "pool", "objset", "object", "level", "blkid", "offset", "dbsize", "meta", "state", "dbholds", "list", - "atype", "index", "flags", "count", "asize", "access", + "atype", "flags", "count", "asize", "access", "mru", "gmru", "mfu", "gmfu", "l2", "l2_dattr", "l2_asize", "l2_comp", 
"aholds", "dtype", "btype", "data_bs", "meta_bs", "bsize", "lvls", "dholds", "blocks", "dsize"); @@ -77,7 +77,7 @@ __dbuf_stats_hash_table_data(char *buf, size_t size, dmu_buf_impl_t *db) nwritten = snprintf(buf, size, "%-16s %-8llu %-8lld %-8lld %-8lld %-8llu %-8llu %-5d %-5d %-5lu | " - "%-5d %-5d %-6lld 0x%-6x %-6lu %-8llu %-12llu " + "%-5d %-5d 0x%-6x %-6lu %-8llu %-12llu " "%-6lu %-6lu %-6lu %-6lu %-6lu %-8llu %-8llu %-8d %-5lu | " "%-6d %-6d %-8lu %-8lu %-6llu %-6lu %-5lu %-8llu %-8llu\n", /* dmu_buf_impl_t */ @@ -94,7 +94,6 @@ __dbuf_stats_hash_table_data(char *buf, size_t size, dmu_buf_impl_t *db) /* arc_buf_info_t */ abi.abi_state_type, abi.abi_state_contents, - (longlong_t)abi.abi_state_index, abi.abi_flags, (ulong_t)abi.abi_datacnt, (u_longlong_t)abi.abi_size, diff --git a/module/zfs/dsl_pool.c b/module/zfs/dsl_pool.c index 7d84d8bf4e81..108fc5299f41 100644 --- a/module/zfs/dsl_pool.c +++ b/module/zfs/dsl_pool.c @@ -317,7 +317,14 @@ dsl_pool_close(dsl_pool_t *dp) txg_list_destroy(&dp->dp_sync_tasks); txg_list_destroy(&dp->dp_dirty_dirs); - arc_flush(dp->dp_spa); + /* + * We can't set retry to TRUE since we're explicitly specifying + * a spa to flush. This is good enough; any missed buffers for + * this spa won't cause trouble, and they'll eventually fall + * out of the ARC just like any other unused buffer. + */ + arc_flush(dp->dp_spa, FALSE); + txg_fini(dp); dsl_scan_fini(dp); dmu_buf_user_evict_wait(); diff --git a/module/zfs/multilist.c b/module/zfs/multilist.c new file mode 100644 index 000000000000..e4446ded2208 --- /dev/null +++ b/module/zfs/multilist.c @@ -0,0 +1,375 @@ +/* + * CDDL HEADER START + * + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. 
A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2013, 2014 by Delphix. All rights reserved. + */ + +#include +#include +#include + +/* needed for spa_get_random() */ +#include + +/* + * Given the object contained on the list, return a pointer to the + * object's multilist_node_t structure it contains. + */ +#ifdef DEBUG +static multilist_node_t * +multilist_d2l(multilist_t *ml, void *obj) +{ + return ((multilist_node_t *)((char *)obj + ml->ml_offset)); +} +#endif + +/* + * Initialize a new multilist using the parameters specified. + * + * - 'size' denotes the size of the structure containing the + * multilist_node_t. + * - 'offset' denotes the byte offset of the multilist_node_t within + * the structure that contains it. + * - 'num' specifies the number of internal sublists to create. + * - 'index_func' is used to determine which sublist to insert into + * when the multilist_insert() function is called; as well as which + * sublist to remove from when multilist_remove() is called. The + * requirements this function must meet, are the following: + * + * - It must always return the same value when called on the same + * object (to ensure the object is removed from the list it was + * inserted into). + * + * - It must return a value in the range [0, number of sublists). + * The multilist_get_num_sublists() function may be used to + * determine the number of sublists in the multilist. + * + * Also, in order to reduce internal contention between the sublists + * during insertion and removal, this function should choose evenly + * between all available sublists when inserting. This isn't a hard + * requirement, but a general rule of thumb in order to garner the + * best multi-threaded performance out of the data structure.
+ */ +void +multilist_create(multilist_t *ml, size_t size, size_t offset, unsigned int num, + multilist_sublist_index_func_t *index_func) +{ + int i; + + ASSERT3P(ml, !=, NULL); + ASSERT3U(size, >, 0); + ASSERT3U(size, >=, offset + sizeof (multilist_node_t)); + ASSERT3U(num, >, 0); + ASSERT3P(index_func, !=, NULL); + + ml->ml_offset = offset; + ml->ml_num_sublists = num; + ml->ml_index_func = index_func; + + ml->ml_sublists = kmem_zalloc(sizeof (multilist_sublist_t) * + ml->ml_num_sublists, KM_SLEEP); + + ASSERT3P(ml->ml_sublists, !=, NULL); + + for (i = 0; i < ml->ml_num_sublists; i++) { + multilist_sublist_t *mls = &ml->ml_sublists[i]; + mutex_init(&mls->mls_lock, NULL, MUTEX_DEFAULT, NULL); + list_create(&mls->mls_list, size, offset); + } +} + +/* + * Destroy the given multilist object, and free up any memory it holds. + */ +void +multilist_destroy(multilist_t *ml) +{ + int i; + + ASSERT(multilist_is_empty(ml)); + + for (i = 0; i < ml->ml_num_sublists; i++) { + multilist_sublist_t *mls = &ml->ml_sublists[i]; + + ASSERT(list_is_empty(&mls->mls_list)); + + list_destroy(&mls->mls_list); + mutex_destroy(&mls->mls_lock); + } + + ASSERT3P(ml->ml_sublists, !=, NULL); + kmem_free(ml->ml_sublists, + sizeof (multilist_sublist_t) * ml->ml_num_sublists); + + ml->ml_num_sublists = 0; + ml->ml_offset = 0; +} + +/* + * Insert the given object into the multilist. + * + * This function will insert the object specified into the sublist + * determined using the function given at multilist creation time. + * + * The sublist locks are automatically acquired if not already held, to + * ensure consistency when inserting and removing from multiple threads. 
+ */ +void +multilist_insert(multilist_t *ml, void *obj) +{ + unsigned int sublist_idx = ml->ml_index_func(ml, obj); + multilist_sublist_t *mls; + boolean_t need_lock; + + DTRACE_PROBE3(multilist__insert, multilist_t *, ml, + unsigned int, sublist_idx, void *, obj); + + ASSERT3U(sublist_idx, <, ml->ml_num_sublists); + + mls = &ml->ml_sublists[sublist_idx]; + + /* + * Note: Callers may already hold the sublist lock by calling + * multilist_sublist_lock(). Here we rely on MUTEX_HELD() + * returning TRUE if and only if the current thread holds the + * lock. While it's a little ugly to make the lock recursive in + * this way, it works and allows the calling code to be much + * simpler -- otherwise it would have to pass around a flag + * indicating that it already has the lock. + */ + need_lock = !MUTEX_HELD(&mls->mls_lock); + + if (need_lock) + mutex_enter(&mls->mls_lock); + + ASSERT(!multilist_link_active(multilist_d2l(ml, obj))); + + multilist_sublist_insert_head(mls, obj); + + if (need_lock) + mutex_exit(&mls->mls_lock); +} + +/* + * Remove the given object from the multilist. + * + * This function will remove the object specified from the sublist + * determined using the function given at multilist creation time. + * + * The necessary sublist locks are automatically acquired, to ensure + * consistency when inserting and removing from multiple threads. + */ +void +multilist_remove(multilist_t *ml, void *obj) +{ + unsigned int sublist_idx = ml->ml_index_func(ml, obj); + multilist_sublist_t *mls; + boolean_t need_lock; + + DTRACE_PROBE3(multilist__remove, multilist_t *, ml, + unsigned int, sublist_idx, void *, obj); + + ASSERT3U(sublist_idx, <, ml->ml_num_sublists); + + mls = &ml->ml_sublists[sublist_idx]; + /* See comment in multilist_insert(). 
*/ + need_lock = !MUTEX_HELD(&mls->mls_lock); + + if (need_lock) + mutex_enter(&mls->mls_lock); + + ASSERT(multilist_link_active(multilist_d2l(ml, obj))); + + multilist_sublist_remove(mls, obj); + + if (need_lock) + mutex_exit(&mls->mls_lock); +} + +/* + * Check to see if this multilist object is empty. + * + * This will return TRUE if it finds all of the sublists of this + * multilist to be empty, and FALSE otherwise. Each sublist lock will be + * automatically acquired as necessary. + * + * If concurrent insertions and removals are occurring, the semantics + * of this function become a little fuzzy. Instead of locking all + * sublists for the entire call time of the function, each sublist is + * only locked as it is individually checked for emptiness. Thus, it's + * possible for this function to return TRUE with non-empty sublists at + * the time the function returns. This would be due to another thread + * inserting into a given sublist, after that specific sublist was check + * and deemed empty, but before all sublists have been checked. + */ +int +multilist_is_empty(multilist_t *ml) +{ + int i; + + for (i = 0; i < ml->ml_num_sublists; i++) { + multilist_sublist_t *mls = &ml->ml_sublists[i]; + /* See comment in multilist_insert(). 
*/ + boolean_t need_lock = !MUTEX_HELD(&mls->mls_lock); + + if (need_lock) + mutex_enter(&mls->mls_lock); + + if (!list_is_empty(&mls->mls_list)) { + if (need_lock) + mutex_exit(&mls->mls_lock); + + return (FALSE); + } + + if (need_lock) + mutex_exit(&mls->mls_lock); + } + + return (TRUE); +} + +/* Return the number of sublists composing this multilist */ +unsigned int +multilist_get_num_sublists(multilist_t *ml) +{ + return (ml->ml_num_sublists); +} + +/* Return a randomly selected, valid sublist index for this multilist */ +unsigned int +multilist_get_random_index(multilist_t *ml) +{ + return (spa_get_random(ml->ml_num_sublists)); +} + +/* Lock and return the sublist specified at the given index */ +multilist_sublist_t * +multilist_sublist_lock(multilist_t *ml, unsigned int sublist_idx) +{ + multilist_sublist_t *mls; + + ASSERT3U(sublist_idx, <, ml->ml_num_sublists); + mls = &ml->ml_sublists[sublist_idx]; + mutex_enter(&mls->mls_lock); + + return (mls); +} + +void +multilist_sublist_unlock(multilist_sublist_t *mls) +{ + mutex_exit(&mls->mls_lock); +} + +/* + * We're allowing any object to be inserted into this specific sublist, + * but this can lead to trouble if multilist_remove() is called to + * remove this object. Specifically, if calling ml_index_func on this + * object returns an index for sublist different than what is passed as + * a parameter here, any call to multilist_remove() with this newly + * inserted object is undefined! 
(the call to multilist_remove() will + * remove the object from a list that it isn't contained in) + */ +void +multilist_sublist_insert_head(multilist_sublist_t *mls, void *obj) +{ + ASSERT(MUTEX_HELD(&mls->mls_lock)); + list_insert_head(&mls->mls_list, obj); +} + +/* please see comment above multilist_sublist_insert_head */ +void +multilist_sublist_insert_tail(multilist_sublist_t *mls, void *obj) +{ + ASSERT(MUTEX_HELD(&mls->mls_lock)); + list_insert_tail(&mls->mls_list, obj); +} + +/* + * Move the object one element forward in the list. + * + * This function will move the given object forward in the list (towards + * the head) by one object. So, in essence, it will swap its position in + * the list with its "prev" pointer. If the given object is already at the + * head of the list, it cannot be moved forward any more than it already + * is, so no action is taken. + * + * NOTE: This function **must not** remove any object from the list other + * than the object given as the parameter. This is relied upon in + * arc_evict_state_impl(). 
+ */ +void +multilist_sublist_move_forward(multilist_sublist_t *mls, void *obj) +{ + void *prev = list_prev(&mls->mls_list, obj); + + ASSERT(MUTEX_HELD(&mls->mls_lock)); + ASSERT(!list_is_empty(&mls->mls_list)); + + /* 'obj' must be at the head of the list, nothing to do */ + if (prev == NULL) + return; + + list_remove(&mls->mls_list, obj); + list_insert_before(&mls->mls_list, prev, obj); +} + +void +multilist_sublist_remove(multilist_sublist_t *mls, void *obj) +{ + ASSERT(MUTEX_HELD(&mls->mls_lock)); + list_remove(&mls->mls_list, obj); +} + +void * +multilist_sublist_head(multilist_sublist_t *mls) +{ + ASSERT(MUTEX_HELD(&mls->mls_lock)); + return (list_head(&mls->mls_list)); +} + +void * +multilist_sublist_tail(multilist_sublist_t *mls) +{ + ASSERT(MUTEX_HELD(&mls->mls_lock)); + return (list_tail(&mls->mls_list)); +} + +void * +multilist_sublist_next(multilist_sublist_t *mls, void *obj) +{ + ASSERT(MUTEX_HELD(&mls->mls_lock)); + return (list_next(&mls->mls_list, obj)); +} + +void * +multilist_sublist_prev(multilist_sublist_t *mls, void *obj) +{ + ASSERT(MUTEX_HELD(&mls->mls_lock)); + return (list_prev(&mls->mls_list, obj)); +} + +void +multilist_link_init(multilist_node_t *link) +{ + list_link_init(link); +} + +int +multilist_link_active(multilist_node_t *link) +{ + return (list_link_active(link)); +} diff --git a/module/zfs/trace.c b/module/zfs/trace.c index 470cf18bff30..0c9990e8547b 100644 --- a/module/zfs/trace.c +++ b/module/zfs/trace.c @@ -23,6 +23,7 @@ * (and only one) C file, so this dummy file exists for that purpose. 
*/ +#include #include #include #include @@ -31,6 +32,7 @@ #include #include #include +#include #include #include #include @@ -42,6 +44,7 @@ #include #include #include +#include #include #include #include diff --git a/module/zfs/zio_inject.c b/module/zfs/zio_inject.c index 5afb23c595ae..40b507a0b6d8 100644 --- a/module/zfs/zio_inject.c +++ b/module/zfs/zio_inject.c @@ -439,7 +439,11 @@ zio_inject_fault(char *name, int flags, int *id, zinject_record_t *record) * fault injection isn't a performance critical path. */ if (flags & ZINJECT_FLUSH_ARC) - arc_flush(NULL); + /* + * We must use FALSE to ensure arc_flush returns, since + * we're not preventing concurrent ARC insertions. + */ + arc_flush(NULL, FALSE); return (0); } From 4f34bd9792bad1affe5b93aeef406fd7dc2df0f8 Mon Sep 17 00:00:00 2001 From: Brian Behlendorf Date: Thu, 4 Jun 2015 16:25:37 -0700 Subject: [PATCH 08/11] Add taskq_wait_outstanding() function SPL commit behlendorf/spl@9cef1b5 adds the taskq_wait_outstanding() interface. See the commit log for the full justification for this addition. This patch adds the required user space counterpart. 
Signed-off-by: Brian Behlendorf Signed-off-by: Tim Chase --- include/sys/zfs_context.h | 1 + lib/libzpool/taskq.c | 6 ++++++ 2 files changed, 7 insertions(+) diff --git a/include/sys/zfs_context.h b/include/sys/zfs_context.h index 761b1d57a116..8b9a5f46fbf0 100644 --- a/include/sys/zfs_context.h +++ b/include/sys/zfs_context.h @@ -468,6 +468,7 @@ extern void taskq_init_ent(taskq_ent_t *); extern void taskq_destroy(taskq_t *); extern void taskq_wait(taskq_t *); extern void taskq_wait_id(taskq_t *, taskqid_t); +extern void taskq_wait_outstanding(taskq_t *, taskqid_t); extern int taskq_member(taskq_t *, kthread_t *); extern int taskq_cancel_id(taskq_t *, taskqid_t); extern void system_taskq_init(void); diff --git a/lib/libzpool/taskq.c b/lib/libzpool/taskq.c index d63bc28e2379..c6fa2fff72b6 100644 --- a/lib/libzpool/taskq.c +++ b/lib/libzpool/taskq.c @@ -220,6 +220,12 @@ taskq_wait_id(taskq_t *tq, taskqid_t id) taskq_wait(tq); } +void +taskq_wait_outstanding(taskq_t *tq, taskqid_t id) +{ + taskq_wait(tq); +} + static void taskq_thread(void *arg) { From c5528b9ba622421a213e128704de4090fa0db773 Mon Sep 17 00:00:00 2001 From: Brian Behlendorf Date: Fri, 5 Jun 2015 11:26:35 -0700 Subject: [PATCH 09/11] Use taskq_wait_outstanding() function Replace taskq_wait() with taskq_wait_outstanding(). This way callers will only block until previously submitted tasks have been completed. This was the previous behavior of taskq_wait() prior to the introduction of taskq_wait_outstanding() so this isn't really a functionality change for these callers.
Signed-off-by: Tim Chase Signed-off-by: Brian Behlendorf --- module/zfs/metaslab.c | 4 ++-- module/zfs/txg.c | 2 +- module/zfs/zfs_vfsops.c | 6 +++--- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/module/zfs/metaslab.c b/module/zfs/metaslab.c index 164b6b87edd4..7ff1a4f5af7e 100644 --- a/module/zfs/metaslab.c +++ b/module/zfs/metaslab.c @@ -556,7 +556,7 @@ metaslab_group_passivate(metaslab_group_t *mg) return; } - taskq_wait(mg->mg_taskq); + taskq_wait_outstanding(mg->mg_taskq, 0); metaslab_group_alloc_update(mg); mgprev = mg->mg_prev; @@ -1596,7 +1596,7 @@ metaslab_group_preload(metaslab_group_t *mg) int m = 0; if (spa_shutting_down(spa) || !metaslab_preload_enabled) { - taskq_wait(mg->mg_taskq); + taskq_wait_outstanding(mg->mg_taskq, 0); return; } diff --git a/module/zfs/txg.c b/module/zfs/txg.c index 2977bf9f3404..c542b0a75248 100644 --- a/module/zfs/txg.c +++ b/module/zfs/txg.c @@ -471,7 +471,7 @@ txg_wait_callbacks(dsl_pool_t *dp) tx_state_t *tx = &dp->dp_tx; if (tx->tx_commit_cb_taskq != NULL) - taskq_wait(tx->tx_commit_cb_taskq); + taskq_wait_outstanding(tx->tx_commit_cb_taskq, 0); } static void diff --git a/module/zfs/zfs_vfsops.c b/module/zfs/zfs_vfsops.c index e86b21aeed61..2b532a33359a 100644 --- a/module/zfs/zfs_vfsops.c +++ b/module/zfs/zfs_vfsops.c @@ -1152,8 +1152,8 @@ zfs_sb_teardown(zfs_sb_t *zsb, boolean_t unmounting) */ int round = 0; while (zsb->z_nr_znodes > 0) { - taskq_wait(dsl_pool_iput_taskq(dmu_objset_pool( - zsb->z_os))); + taskq_wait_outstanding(dsl_pool_iput_taskq( + dmu_objset_pool(zsb->z_os)), 0); if (++round > 1 && !unmounting) break; } @@ -1740,7 +1740,7 @@ zfs_init(void) void zfs_fini(void) { - taskq_wait(system_taskq); + taskq_wait_outstanding(system_taskq, 0); unregister_filesystem(&zpl_fs_type); zfs_znode_fini(); zfsctl_fini(); From f6046738365571bd647f804958dfdff8a32fbde4 Mon Sep 17 00:00:00 2001 From: Brian Behlendorf Date: Sat, 30 May 2015 09:57:53 -0500 Subject: [PATCH 10/11] Make arc_prune() asynchronous As 
described in the comment above arc_adapt_thread() it is critical that the arc_adapt_thread() function never sleep while holding a hash lock. This behavior was possible in the Linux implementation because the arc_prune() logic was implemented to be synchronous. Under illumos the analogous dnlc_reduce_cache() function is asynchronous. To address this the arc_do_user_prune() function has been reworked into two new functions as follows: * arc_prune_async() is an asynchronous implementation which dispatches the prune callback to be run by the dedicated arc_prune taskq. This makes it suitable to use in the context of the arc_adapt_thread(). * arc_prune() is a synchronous implementation which depends on the arc_prune_async() implementation but blocks until the outstanding callbacks complete. This is used in arc_kmem_reap_now() where it is safe, and expected, that memory will be freed. This patch additionally adds the zfs_arc_meta_strategy module option which allows the meta reclaim strategy to be configured. It defaults to a balanced strategy which has been proved to work well under Linux but the illumos meta-only strategy can be enabled.
Signed-off-by: Tim Chase Signed-off-by: Brian Behlendorf --- include/sys/arc.h | 6 ++ module/zfs/arc.c | 151 ++++++++++++++++++++++++++++++++++------------ 2 files changed, 118 insertions(+), 39 deletions(-) diff --git a/include/sys/arc.h b/include/sys/arc.h index 38f9f27fea61..0961d4b4d2cb 100644 --- a/include/sys/arc.h +++ b/include/sys/arc.h @@ -59,10 +59,16 @@ arc_done_func_t arc_getbuf_func; struct arc_prune { arc_prune_func_t *p_pfunc; void *p_private; + uint64_t p_adjust; list_node_t p_node; refcount_t p_refcnt; }; +typedef enum arc_strategy { + ARC_STRATEGY_META_ONLY = 0, /* Evict only meta data buffers */ + ARC_STRATEGY_META_BALANCED = 1, /* Evict data buffers if needed */ +} arc_strategy_t; + typedef enum arc_flags { /* diff --git a/module/zfs/arc.c b/module/zfs/arc.c index 67ef87daf137..561c2312455a 100644 --- a/module/zfs/arc.c +++ b/module/zfs/arc.c @@ -167,6 +167,9 @@ static boolean_t arc_user_evicts_thread_exit; /* number of objects to prune from caches when arc_meta_limit is reached */ int zfs_arc_meta_prune = 10000; +/* The preferred strategy to employ when arc_meta_limit is reached */ +int zfs_arc_meta_strategy = ARC_STRATEGY_META_BALANCED; + typedef enum arc_reclaim_strategy { ARC_RECLAIM_AGGR, /* Aggressive reclaim strategy */ ARC_RECLAIM_CONS /* Conservative reclaim strategy */ @@ -531,6 +534,7 @@ static arc_state_t *arc_l2c_only; static list_t arc_prune_list; static kmutex_t arc_prune_mtx; +static taskq_t *arc_prune_taskq; static arc_buf_t *arc_eviction_list; static arc_buf_hdr_t arc_eviction_hdr; @@ -2430,47 +2434,64 @@ arc_flush_state(arc_state_t *state, uint64_t spa, arc_buf_contents_t type, } /* - * Request that arc user drop references so that N bytes can be released - * from the cache. This provides a mechanism to ensure the arc can honor - * the arc_meta_limit and reclaim buffers which are pinned in the cache - * by higher layers. (i.e. 
the zpl) + * Helper function for arc_prune() it is responsible for safely handling + * the execution of a registered arc_prune_func_t. */ static void -arc_do_user_prune(int64_t adjustment) +arc_prune_task(void *ptr) { - arc_prune_func_t *func; - void *private; - arc_prune_t *cp, *np; + arc_prune_t *ap = (arc_prune_t *)ptr; + arc_prune_func_t *func = ap->p_pfunc; - mutex_enter(&arc_prune_mtx); + if (func != NULL) + func(ap->p_adjust, ap->p_private); - cp = list_head(&arc_prune_list); - while (cp != NULL) { - func = cp->p_pfunc; - private = cp->p_private; - np = list_next(&arc_prune_list, cp); - refcount_add(&cp->p_refcnt, func); - mutex_exit(&arc_prune_mtx); + /* Callback unregistered concurrently with execution */ + if (refcount_remove(&ap->p_refcnt, func) == 0) { + ASSERT(!list_link_active(&ap->p_node)); + refcount_destroy(&ap->p_refcnt); + kmem_free(ap, sizeof (*ap)); + } +} - if (func != NULL) - func(adjustment, private); +/* + * Notify registered consumers they must drop holds on a portion of the ARC + * buffered they reference. This provides a mechanism to ensure the ARC can + * honor the arc_meta_limit and reclaim otherwise pinned ARC buffers. This + * is analogous to dnlc_reduce_cache() but more generic. + * + * This operation is performed asyncronously so it may be safely called + * in the context of the arc_adapt_thread(). A reference is taken here + * for each registered arc_prune_t and the arc_prune_task() is responsible + * for releasing it once the registered arc_prune_func_t has completed. 
+ */ +static void +arc_prune_async(int64_t adjust) +{ + arc_prune_t *ap; - mutex_enter(&arc_prune_mtx); + mutex_enter(&arc_prune_mtx); + for (ap = list_head(&arc_prune_list); ap != NULL; + ap = list_next(&arc_prune_list, ap)) { - /* User removed prune callback concurrently with execution */ - if (refcount_remove(&cp->p_refcnt, func) == 0) { - ASSERT(!list_link_active(&cp->p_node)); - refcount_destroy(&cp->p_refcnt); - kmem_free(cp, sizeof (*cp)); - } + if (refcount_count(&ap->p_refcnt) >= 2) + continue; - cp = np; + refcount_add(&ap->p_refcnt, ap->p_pfunc); + ap->p_adjust = adjust; + taskq_dispatch(arc_prune_taskq, arc_prune_task, ap, TQ_SLEEP); + ARCSTAT_BUMP(arcstat_prune); } - - ARCSTAT_BUMP(arcstat_prune); mutex_exit(&arc_prune_mtx); } +static void +arc_prune(int64_t adjust) +{ + arc_prune_async(adjust); + taskq_wait_outstanding(arc_prune_taskq, 0); +} + /* * Evict the specified number of bytes from the state specified, * restricting eviction to the spa and type given. This function @@ -2511,7 +2532,7 @@ arc_adjust_impl(arc_state_t *state, uint64_t spa, int64_t bytes, * available for reclaim. */ static uint64_t -arc_adjust_meta(void) +arc_adjust_meta_balanced(void) { int64_t adjustmnt, delta, prune = 0; uint64_t total_evicted = 0; @@ -2580,7 +2601,7 @@ arc_adjust_meta(void) if (zfs_arc_meta_prune) { prune += zfs_arc_meta_prune; - arc_do_user_prune(prune); + arc_prune_async(prune); } } @@ -2592,6 +2613,50 @@ arc_adjust_meta(void) return (total_evicted); } +/* + * Evict metadata buffers from the cache, such that arc_meta_used is + * capped by the arc_meta_limit tunable. + */ +static uint64_t +arc_adjust_meta_only(void) +{ + uint64_t total_evicted = 0; + int64_t target; + + /* + * If we're over the meta limit, we want to evict enough + * metadata to get back under the meta limit. We don't want to + * evict so much that we drop the MRU below arc_p, though. 
If + * we're over the meta limit more than we're over arc_p, we + * evict some from the MRU here, and some from the MFU below. + */ + target = MIN((int64_t)(arc_meta_used - arc_meta_limit), + (int64_t)(arc_anon->arcs_size + arc_mru->arcs_size - arc_p)); + + total_evicted += arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_METADATA); + + /* + * Similar to the above, we want to evict enough bytes to get us + * below the meta limit, but not so much as to drop us below the + * space alloted to the MFU (which is defined as arc_c - arc_p). + */ + target = MIN((int64_t)(arc_meta_used - arc_meta_limit), + (int64_t)(arc_mfu->arcs_size - (arc_c - arc_p))); + + total_evicted += arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_METADATA); + + return (total_evicted); +} + +static uint64_t +arc_adjust_meta(void) +{ + if (zfs_arc_meta_strategy == ARC_STRATEGY_META_ONLY) + return (arc_adjust_meta_only()); + else + return (arc_adjust_meta_balanced()); +} + /* * Return the type of the oldest buffer in the given arc state * @@ -2905,6 +2970,14 @@ arc_kmem_reap_now(arc_reclaim_strategy_t strat, uint64_t bytes) extern kmem_cache_t *zio_buf_cache[]; extern kmem_cache_t *zio_data_buf_cache[]; + if ((arc_meta_used >= arc_meta_limit) && zfs_arc_meta_prune) { + /* + * We are exceeding our meta-data cache limit. + * Prune some entries to release holds on meta-data. + */ + arc_prune(zfs_arc_meta_prune); + } + /* * An aggressive reclamation will shrink the cache size as well as * reap free buffers from the arc kmem caches. @@ -2929,15 +3002,6 @@ arc_kmem_reap_now(arc_reclaim_strategy_t strat, uint64_t bytes) } /* - * Unlike other ZFS implementations this thread is only responsible for - * adapting the target ARC size on Linux. The responsibility for memory - * reclamation has been entirely delegated to the arc_shrinker_func() - * which is registered with the VM. To reflect this change in behavior - * the arc_reclaim thread has been renamed to arc_adapt. 
- * - * The following comment from arc_reclaim_thread() in illumos is still - * applicable: - * * Threads can block in arc_get_data_buf() waiting for this thread to evict * enough data and signal them to proceed. When this happens, the threads in * arc_get_data_buf() are sleeping while holding the hash lock for their @@ -4862,6 +4926,9 @@ arc_init(void) mutex_init(&arc_prune_mtx, NULL, MUTEX_DEFAULT, NULL); bzero(&arc_eviction_hdr, sizeof (arc_buf_hdr_t)); + arc_prune_taskq = taskq_create("arc_prune", max_ncpus, minclsyspri, + max_ncpus, INT_MAX, TASKQ_PREPOPULATE); + arc_ksp = kstat_create("zfs", 0, "arcstats", "misc", KSTAT_TYPE_NAMED, sizeof (arc_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL); @@ -4943,6 +5010,9 @@ arc_fini(void) arc_ksp = NULL; } + taskq_wait(arc_prune_taskq); + taskq_destroy(arc_prune_taskq); + mutex_enter(&arc_prune_mtx); while ((p = list_head(&arc_prune_list)) != NULL) { list_remove(&arc_prune_list, p); @@ -6374,6 +6444,9 @@ module_param(zfs_arc_meta_adjust_restarts, ulong, 0644); MODULE_PARM_DESC(zfs_arc_meta_adjust_restarts, "Limit number of restarts in arc_adjust_meta"); +module_param(zfs_arc_meta_strategy, int, 0644); +MODULE_PARM_DESC(zfs_arc_meta_strategy, "Meta reclaim strategy"); + module_param(zfs_arc_grow_retry, int, 0644); MODULE_PARM_DESC(zfs_arc_grow_retry, "Seconds before growing arc size"); From 121b3cae742a0670d902a51bc61d49dc4a3e4445 Mon Sep 17 00:00:00 2001 From: Tim Chase Date: Thu, 4 Jun 2015 08:06:27 -0500 Subject: [PATCH 11/11] Increase arc_c_min to allow safe operation of arc_adapt() ZoL had lowered the minimum ARC size to 4MiB to better accommodate tiny systems such as the raspberry pi, however, as of addition of large block support, the arc_adapt() function depends on arc_c being >= 32MiB (2 * SPA_MAXBLOCKSIZE). This patch raises the minimum ARC size to 32MiB and adds a VERIFY test to arc_adapt() for future-proofing. 
Signed-off-by: Tim Chase Signed-off-by: Brian Behlendorf --- module/zfs/arc.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/module/zfs/arc.c b/module/zfs/arc.c index 561c2312455a..805e7b59f5f7 100644 --- a/module/zfs/arc.c +++ b/module/zfs/arc.c @@ -3088,8 +3088,8 @@ arc_adapt_thread(void) zfs_arc_max != arc_c_max) arc_c_max = zfs_arc_max; - if (zfs_arc_min > 0 && - zfs_arc_min < arc_c_max && + if (zfs_arc_min >= 2ULL << SPA_MAXBLOCKSHIFT && + zfs_arc_min <= arc_c_max && zfs_arc_min != arc_c_min) arc_c_min = zfs_arc_min; @@ -3355,7 +3355,8 @@ arc_adapt(int bytes, arc_state_t *state) * If we're within (2 * maxblocksize) bytes of the target * cache size, increment the target cache size */ - if (arc_size > arc_c - (2ULL << SPA_MAXBLOCKSHIFT)) { + VERIFY3U(arc_c, >=, 2ULL << SPA_MAXBLOCKSHIFT); + if (arc_size >= arc_c - (2ULL << SPA_MAXBLOCKSHIFT)) { atomic_add_64(&arc_c, (int64_t)bytes); if (arc_c > arc_c_max) arc_c = arc_c_max; @@ -4826,8 +4827,8 @@ arc_init(void) spl_register_shrinker(&arc_shrinker); #endif - /* set min cache to zero */ - arc_c_min = 4<<20; + /* set min cache to allow safe operation of arc_adapt() */ + arc_c_min = 2ULL << SPA_MAXBLOCKSHIFT; /* set max to 1/2 of all memory */ arc_c_max = arc_c * 4; @@ -4837,7 +4838,8 @@ arc_init(void) */ if (zfs_arc_max > 64<<20 && zfs_arc_max < physmem * PAGESIZE) arc_c_max = zfs_arc_max; - if (zfs_arc_min > 0 && zfs_arc_min <= arc_c_max) + if (zfs_arc_min >= 2ULL << SPA_MAXBLOCKSHIFT && + zfs_arc_min <= arc_c_max) arc_c_min = zfs_arc_min; arc_c = arc_c_max;