Skip to content

Commit

Permalink
L2ARC: Relax locking during write
Browse files Browse the repository at this point in the history
The previous code held the ARC state sublist lock throughout the entire
L2ARC write process, which includes a number of allocations and even
issuing ZIOs.  If the code blocked in any of those places, it could also
block ARC eviction, which could trigger OOM handling or even a deadlock
if the system is low on memory or memory is too fragmented.

Fix it by dropping the lock as soon as we see a block eligible
for L2ARC writing, and picking it up again later using a previously
inserted marker.  While there, also reduce the scope of the hash lock,
moving ZIO allocation and other operations that do not require header
access out of it.  All operations that require header access move under
the hash lock, since the L2_WRITING flag does not prevent header eviction;
it only prevents the transition to the arc_l2c_only state with an L1 header.

To be able to manipulate the sublist lock and marker as needed, add a few
more multilist functions and modify one.

Reviewed-by: Brian Behlendorf <[email protected]>
Signed-off-by:	Alexander Motin <[email protected]>
Sponsored by:	iXsystems, Inc.
Closes #16040
  • Loading branch information
amotin authored and behlendorf committed Apr 19, 2024
1 parent f4ce02a commit 575872c
Show file tree
Hide file tree
Showing 6 changed files with 131 additions and 99 deletions.
5 changes: 4 additions & 1 deletion include/sys/multilist.h
Original file line number Diff line number Diff line change
Expand Up @@ -82,12 +82,15 @@ int multilist_is_empty(multilist_t *);
unsigned int multilist_get_num_sublists(multilist_t *);
unsigned int multilist_get_random_index(multilist_t *);

multilist_sublist_t *multilist_sublist_lock(multilist_t *, unsigned int);
void multilist_sublist_lock(multilist_sublist_t *);
multilist_sublist_t *multilist_sublist_lock_idx(multilist_t *, unsigned int);
multilist_sublist_t *multilist_sublist_lock_obj(multilist_t *, void *);
void multilist_sublist_unlock(multilist_sublist_t *);

void multilist_sublist_insert_head(multilist_sublist_t *, void *);
void multilist_sublist_insert_tail(multilist_sublist_t *, void *);
void multilist_sublist_insert_after(multilist_sublist_t *, void *, void *);
void multilist_sublist_insert_before(multilist_sublist_t *, void *, void *);
void multilist_sublist_move_forward(multilist_sublist_t *mls, void *obj);
void multilist_sublist_remove(multilist_sublist_t *, void *);
int multilist_sublist_is_empty(multilist_sublist_t *);
Expand Down
179 changes: 93 additions & 86 deletions module/zfs/arc.c
Original file line number Diff line number Diff line change
Expand Up @@ -3872,7 +3872,7 @@ arc_evict_state_impl(multilist_t *ml, int idx, arc_buf_hdr_t *marker,

ASSERT3P(marker, !=, NULL);

mls = multilist_sublist_lock(ml, idx);
mls = multilist_sublist_lock_idx(ml, idx);

for (hdr = multilist_sublist_prev(mls, marker); likely(hdr != NULL);
hdr = multilist_sublist_prev(mls, marker)) {
Expand Down Expand Up @@ -3984,6 +3984,26 @@ arc_evict_state_impl(multilist_t *ml, int idx, arc_buf_hdr_t *marker,
return (bytes_evicted);
}

/*
 * Allocate a single buffer header to be used as a list position marker.
 *
 * The header comes from hdr_full_cache (KM_SLEEP allocation) and is tagged
 * as a marker by setting its b_spa to 0; arc_evict_state_impl() relies on
 * that value to tell markers apart from real headers.  Release the marker
 * with arc_state_free_marker().
 */
static arc_buf_hdr_t *
arc_state_alloc_marker(void)
{
	arc_buf_hdr_t *hdr;

	hdr = kmem_cache_alloc(hdr_full_cache, KM_SLEEP);

	/* b_spa == 0 identifies this header as a marker. */
	hdr->b_spa = 0;

	return (hdr);
}

/*
 * Release a marker header previously obtained from
 * arc_state_alloc_marker(), returning it to hdr_full_cache.
 */
static void
arc_state_free_marker(arc_buf_hdr_t *marker)
{
kmem_cache_free(hdr_full_cache, marker);
}

/*
* Allocate an array of buffer headers used as placeholders during arc state
* eviction.
Expand All @@ -3994,24 +4014,16 @@ arc_state_alloc_markers(int count)
arc_buf_hdr_t **markers;

markers = kmem_zalloc(sizeof (*markers) * count, KM_SLEEP);
for (int i = 0; i < count; i++) {
markers[i] = kmem_cache_alloc(hdr_full_cache, KM_SLEEP);

/*
* A b_spa of 0 is used to indicate that this header is
* a marker. This fact is used in arc_evict_state_impl().
*/
markers[i]->b_spa = 0;

}
for (int i = 0; i < count; i++)
markers[i] = arc_state_alloc_marker();
return (markers);
}

static void
arc_state_free_markers(arc_buf_hdr_t **markers, int count)
{
for (int i = 0; i < count; i++)
kmem_cache_free(hdr_full_cache, markers[i]);
arc_state_free_marker(markers[i]);
kmem_free(markers, sizeof (*markers) * count);
}

Expand Down Expand Up @@ -4055,7 +4067,7 @@ arc_evict_state(arc_state_t *state, arc_buf_contents_t type, uint64_t spa,
for (int i = 0; i < num_sublists; i++) {
multilist_sublist_t *mls;

mls = multilist_sublist_lock(ml, i);
mls = multilist_sublist_lock_idx(ml, i);
multilist_sublist_insert_tail(mls, markers[i]);
multilist_sublist_unlock(mls);
}
Expand Down Expand Up @@ -4120,7 +4132,7 @@ arc_evict_state(arc_state_t *state, arc_buf_contents_t type, uint64_t spa,
}

for (int i = 0; i < num_sublists; i++) {
multilist_sublist_t *mls = multilist_sublist_lock(ml, i);
multilist_sublist_t *mls = multilist_sublist_lock_idx(ml, i);
multilist_sublist_remove(mls, markers[i]);
multilist_sublist_unlock(mls);
}
Expand Down Expand Up @@ -8628,7 +8640,7 @@ l2arc_sublist_lock(int list_num)
* sublists being selected.
*/
idx = multilist_get_random_index(ml);
return (multilist_sublist_lock(ml, idx));
return (multilist_sublist_lock_idx(ml, idx));
}

/*
Expand Down Expand Up @@ -9040,9 +9052,9 @@ l2arc_blk_fetch_done(zio_t *zio)
static uint64_t
l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
{
arc_buf_hdr_t *hdr, *hdr_prev, *head;
uint64_t write_asize, write_psize, write_lsize, headroom;
boolean_t full;
arc_buf_hdr_t *hdr, *head, *marker;
uint64_t write_asize, write_psize, headroom;
boolean_t full, from_head = !arc_warm;
l2arc_write_callback_t *cb = NULL;
zio_t *pio, *wzio;
uint64_t guid = spa_load_guid(spa);
Expand All @@ -9051,10 +9063,11 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
ASSERT3P(dev->l2ad_vdev, !=, NULL);

pio = NULL;
write_lsize = write_asize = write_psize = 0;
write_asize = write_psize = 0;
full = B_FALSE;
head = kmem_cache_alloc(hdr_l2only_cache, KM_PUSHPAGE);
arc_hdr_set_flags(head, ARC_FLAG_L2_WRITE_HEAD | ARC_FLAG_HAS_L2HDR);
marker = arc_state_alloc_marker();

/*
* Copy buffers for L2ARC writing.
Expand All @@ -9069,40 +9082,34 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
continue;
}

multilist_sublist_t *mls = l2arc_sublist_lock(pass);
uint64_t passed_sz = 0;

VERIFY3P(mls, !=, NULL);
headroom = target_sz * l2arc_headroom;
if (zfs_compressed_arc_enabled)
headroom = (headroom * l2arc_headroom_boost) / 100;

/*
* L2ARC fast warmup.
*
* Until the ARC is warm and starts to evict, read from the
* head of the ARC lists rather than the tail.
*/
if (arc_warm == B_FALSE)
multilist_sublist_t *mls = l2arc_sublist_lock(pass);
ASSERT3P(mls, !=, NULL);
if (from_head)
hdr = multilist_sublist_head(mls);
else
hdr = multilist_sublist_tail(mls);

headroom = target_sz * l2arc_headroom;
if (zfs_compressed_arc_enabled)
headroom = (headroom * l2arc_headroom_boost) / 100;

for (; hdr; hdr = hdr_prev) {
while (hdr != NULL) {
kmutex_t *hash_lock;
abd_t *to_write = NULL;

if (arc_warm == B_FALSE)
hdr_prev = multilist_sublist_next(mls, hdr);
else
hdr_prev = multilist_sublist_prev(mls, hdr);

hash_lock = HDR_LOCK(hdr);
if (!mutex_tryenter(hash_lock)) {
/*
* Skip this buffer rather than waiting.
*/
skip:
/* Skip this buffer rather than waiting. */
if (from_head)
hdr = multilist_sublist_next(mls, hdr);
else
hdr = multilist_sublist_prev(mls, hdr);
continue;
}

Expand All @@ -9117,11 +9124,10 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)

if (!l2arc_write_eligible(guid, hdr)) {
mutex_exit(hash_lock);
continue;
goto skip;
}

ASSERT(HDR_HAS_L1HDR(hdr));

ASSERT3U(HDR_GET_PSIZE(hdr), >, 0);
ASSERT3U(arc_hdr_size(hdr), >, 0);
ASSERT(hdr->b_l1hdr.b_pabd != NULL ||
Expand All @@ -9143,12 +9149,18 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
}

/*
* We rely on the L1 portion of the header below, so
* it's invalid for this header to have been evicted out
* of the ghost cache, prior to being written out. The
* ARC_FLAG_L2_WRITING bit ensures this won't happen.
* We should not sleep with sublist lock held or it
* may block ARC eviction. Insert a marker to save
* the position and drop the lock.
*/
arc_hdr_set_flags(hdr, ARC_FLAG_L2_WRITING);
if (from_head) {
multilist_sublist_insert_after(mls, hdr,
marker);
} else {
multilist_sublist_insert_before(mls, hdr,
marker);
}
multilist_sublist_unlock(mls);

/*
* If this header has b_rabd, we can use this since it
Expand Down Expand Up @@ -9179,87 +9191,80 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
&to_write);
if (ret != 0) {
arc_hdr_clear_flags(hdr,
ARC_FLAG_L2_WRITING);
ARC_FLAG_L2CACHE);
mutex_exit(hash_lock);
continue;
goto next;
}

l2arc_free_abd_on_write(to_write, asize, type);
}

hdr->b_l2hdr.b_dev = dev;
hdr->b_l2hdr.b_daddr = dev->l2ad_hand;
hdr->b_l2hdr.b_hits = 0;
hdr->b_l2hdr.b_arcs_state =
hdr->b_l1hdr.b_state->arcs_state;
mutex_enter(&dev->l2ad_mtx);
if (pio == NULL) {
/*
* Insert a dummy header on the buflist so
* l2arc_write_done() can find where the
* write buffers begin without searching.
*/
mutex_enter(&dev->l2ad_mtx);
list_insert_head(&dev->l2ad_buflist, head);
mutex_exit(&dev->l2ad_mtx);
}
list_insert_head(&dev->l2ad_buflist, hdr);
mutex_exit(&dev->l2ad_mtx);
arc_hdr_set_flags(hdr, ARC_FLAG_HAS_L2HDR |
ARC_FLAG_L2_WRITING);

(void) zfs_refcount_add_many(&dev->l2ad_alloc,
arc_hdr_size(hdr), hdr);
l2arc_hdr_arcstats_increment(hdr);

boolean_t commit = l2arc_log_blk_insert(dev, hdr);
mutex_exit(hash_lock);

if (pio == NULL) {
cb = kmem_alloc(
sizeof (l2arc_write_callback_t), KM_SLEEP);
cb->l2wcb_dev = dev;
cb->l2wcb_head = head;
/*
* Create a list to save allocated abd buffers
* for l2arc_log_blk_commit().
*/
list_create(&cb->l2wcb_abd_list,
sizeof (l2arc_lb_abd_buf_t),
offsetof(l2arc_lb_abd_buf_t, node));
pio = zio_root(spa, l2arc_write_done, cb,
ZIO_FLAG_CANFAIL);
}

hdr->b_l2hdr.b_dev = dev;
hdr->b_l2hdr.b_hits = 0;

hdr->b_l2hdr.b_daddr = dev->l2ad_hand;
hdr->b_l2hdr.b_arcs_state =
hdr->b_l1hdr.b_state->arcs_state;
arc_hdr_set_flags(hdr, ARC_FLAG_HAS_L2HDR);

mutex_enter(&dev->l2ad_mtx);
list_insert_head(&dev->l2ad_buflist, hdr);
mutex_exit(&dev->l2ad_mtx);

(void) zfs_refcount_add_many(&dev->l2ad_alloc,
arc_hdr_size(hdr), hdr);

wzio = zio_write_phys(pio, dev->l2ad_vdev,
hdr->b_l2hdr.b_daddr, asize, to_write,
dev->l2ad_hand, asize, to_write,
ZIO_CHECKSUM_OFF, NULL, hdr,
ZIO_PRIORITY_ASYNC_WRITE,
ZIO_FLAG_CANFAIL, B_FALSE);

write_lsize += HDR_GET_LSIZE(hdr);
DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev,
zio_t *, wzio);
zio_nowait(wzio);

write_psize += psize;
write_asize += asize;
dev->l2ad_hand += asize;
l2arc_hdr_arcstats_increment(hdr);
vdev_space_update(dev->l2ad_vdev, asize, 0, 0);

mutex_exit(hash_lock);

/*
* Append buf info to current log and commit if full.
* arcstat_l2_{size,asize} kstats are updated
* internally.
*/
if (l2arc_log_blk_insert(dev, hdr)) {
/*
* l2ad_hand will be adjusted in
* l2arc_log_blk_commit().
*/
if (commit) {
/* l2ad_hand will be adjusted inside. */
write_asize +=
l2arc_log_blk_commit(dev, pio, cb);
}

zio_nowait(wzio);
next:
multilist_sublist_lock(mls);
if (from_head)
hdr = multilist_sublist_next(mls, marker);
else
hdr = multilist_sublist_prev(mls, marker);
multilist_sublist_remove(mls, marker);
}

multilist_sublist_unlock(mls);
Expand All @@ -9268,9 +9273,11 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
break;
}

arc_state_free_marker(marker);

/* No buffers selected for writing? */
if (pio == NULL) {
ASSERT0(write_lsize);
ASSERT0(write_psize);
ASSERT(!HDR_HAS_L1HDR(head));
kmem_cache_free(hdr_l2only_cache, head);

Expand Down Expand Up @@ -10598,7 +10605,7 @@ l2arc_log_blk_insert(l2arc_dev_t *dev, const arc_buf_hdr_t *hdr)
L2BLK_SET_TYPE((le)->le_prop, hdr->b_type);
L2BLK_SET_PROTECTED((le)->le_prop, !!(HDR_PROTECTED(hdr)));
L2BLK_SET_PREFETCH((le)->le_prop, !!(HDR_PREFETCH(hdr)));
L2BLK_SET_STATE((le)->le_prop, hdr->b_l1hdr.b_state->arcs_state);
L2BLK_SET_STATE((le)->le_prop, hdr->b_l2hdr.b_arcs_state);

dev->l2ad_log_blk_payload_asize += vdev_psize_to_asize(dev->l2ad_vdev,
HDR_GET_PSIZE(hdr));
Expand Down
2 changes: 1 addition & 1 deletion module/zfs/dbuf.c
Original file line number Diff line number Diff line change
Expand Up @@ -754,7 +754,7 @@ static void
dbuf_evict_one(void)
{
int idx = multilist_get_random_index(&dbuf_caches[DB_DBUF_CACHE].cache);
multilist_sublist_t *mls = multilist_sublist_lock(
multilist_sublist_t *mls = multilist_sublist_lock_idx(
&dbuf_caches[DB_DBUF_CACHE].cache, idx);

ASSERT(!MUTEX_HELD(&dbuf_evict_lock));
Expand Down
Loading

0 comments on commit 575872c

Please sign in to comment.