From 3ec34e55271d433e3c2dbb861a886361e006ca0a Mon Sep 17 00:00:00 2001 From: Brad Lewis Date: Wed, 15 Mar 2017 16:41:52 -0700 Subject: [PATCH] OpenZFS 9284 - arc_reclaim_thread has 2 jobs Following the fix for 9018 (Replace kmem_cache_reap_now() with kmem_cache_reap_soon), the arc_reclaim_thread() no longer blocks while reaping. However, the code is still confusing and error-prone, because this thread has two responsibilities. We should instead separate this into two threads each with their own responsibility: 1. keep `arc_size` under `arc_c`, by calling `arc_adjust()`, which improves `arc_is_overflowing()` 2. keep enough free memory in the system, by calling `arc_kmem_reap_now()` plus `arc_shrink()`, which improves `arc_available_memory()`. Furthermore, we can use the zthr infrastructure to separate the "should we do something" from "do it" parts of the logic, and normalize the start up / shut down of the threads. Authored by: Brad Lewis Reviewed by: Matt Ahrens Reviewed by: Serapheim Dimitropoulos Reviewed by: Pavel Zakharov Reviewed by: Dan Kimmel Reviewed by: Paul Dagnelie Reviewed by: Dan McDonald Reviewed by: Tim Kordas Reviewed by: Tim Chase Reviewed by: Brian Behlendorf Ported-by: Brad Lewis Signed-off-by: Brad Lewis OpenZFS-issue: https://www.illumos.org/issues/9284 OpenZFS-commit: https://github.com/openzfs/openzfs/commit/de753e34f9 Closes #8165 --- include/spl/sys/kmem.h | 2 + include/sys/zfs_context.h | 1 + include/sys/zthr.h | 4 + lib/libzpool/kernel.c | 6 + module/spl/spl-kmem-cache.c | 12 ++ module/zfs/arc.c | 404 +++++++++++++++++++++--------------- module/zfs/zthr.c | 28 ++- 7 files changed, 285 insertions(+), 172 deletions(-) diff --git a/include/spl/sys/kmem.h b/include/spl/sys/kmem.h index d6b428551fe1..72d3a776530c 100644 --- a/include/spl/sys/kmem.h +++ b/include/spl/sys/kmem.h @@ -163,6 +163,7 @@ extern unsigned int spl_kmem_alloc_max; #define kmem_alloc(sz, fl) spl_kmem_alloc((sz), (fl), __func__, __LINE__) #define kmem_zalloc(sz, fl) 
spl_kmem_zalloc((sz), (fl), __func__, __LINE__) #define kmem_free(ptr, sz) spl_kmem_free((ptr), (sz)) +#define kmem_cache_reap_active spl_kmem_cache_reap_active extern void *spl_kmem_alloc(size_t sz, int fl, const char *func, int line); extern void *spl_kmem_zalloc(size_t sz, int fl, const char *func, int line); @@ -181,5 +182,6 @@ extern void spl_kmem_free_track(const void *buf, size_t size); extern int spl_kmem_init(void); extern void spl_kmem_fini(void); +extern int spl_kmem_cache_reap_active(void); #endif /* _SPL_KMEM_H */ diff --git a/include/sys/zfs_context.h b/include/sys/zfs_context.h index 3637cc617697..11a32bb3117a 100644 --- a/include/sys/zfs_context.h +++ b/include/sys/zfs_context.h @@ -773,6 +773,7 @@ typedef int fstrans_cookie_t; extern fstrans_cookie_t spl_fstrans_mark(void); extern void spl_fstrans_unmark(fstrans_cookie_t); extern int __spl_pf_fstrans_check(void); +extern int kmem_cache_reap_active(void); #define ____cacheline_aligned diff --git a/include/sys/zthr.h b/include/sys/zthr.h index 62da2eea811c..ce6033ecb6b7 100644 --- a/include/sys/zthr.h +++ b/include/sys/zthr.h @@ -29,6 +29,7 @@ struct zthr { kmutex_t zthr_lock; kcondvar_t zthr_cv; boolean_t zthr_cancel; + hrtime_t zthr_wait_time; zthr_checkfunc_t *zthr_checkfunc; zthr_func_t *zthr_func; @@ -38,6 +39,9 @@ struct zthr { extern zthr_t *zthr_create(zthr_checkfunc_t checkfunc, zthr_func_t *func, void *arg); +extern zthr_t *zthr_create_timer(zthr_checkfunc_t *checkfunc, + zthr_func_t *func, void *arg, hrtime_t nano_wait); + extern void zthr_exit(zthr_t *t, int rc); extern void zthr_destroy(zthr_t *t); diff --git a/lib/libzpool/kernel.c b/lib/libzpool/kernel.c index f5eafb917254..926f4f4f40b4 100644 --- a/lib/libzpool/kernel.c +++ b/lib/libzpool/kernel.c @@ -1276,6 +1276,12 @@ __spl_pf_fstrans_check(void) return (0); } +int +kmem_cache_reap_active(void) +{ + return (0); +} + void *zvol_tag = "zvol_tag"; void diff --git a/module/spl/spl-kmem-cache.c b/module/spl/spl-kmem-cache.c index 
5492c6a4600c..620f03ddf388 100644 --- a/module/spl/spl-kmem-cache.c +++ b/module/spl/spl-kmem-cache.c @@ -1732,6 +1732,18 @@ spl_kmem_cache_reap_now(spl_kmem_cache_t *skc, int count) } EXPORT_SYMBOL(spl_kmem_cache_reap_now); +/* + * This is stubbed out for code consistency with other platforms. There + * is existing logic to prevent concurrent reaping so while this is ugly + * it should do no harm. + */ +int +spl_kmem_cache_reap_active() +{ + return (0); +} +EXPORT_SYMBOL(spl_kmem_cache_reap_active); + /* * Reap all free slabs from all registered caches. */ diff --git a/module/zfs/arc.c b/module/zfs/arc.c index 96557054cc60..9e0ffd06d503 100644 --- a/module/zfs/arc.c +++ b/module/zfs/arc.c @@ -20,10 +20,10 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, Joyent, Inc. All rights reserved. + * Copyright (c) 2018, Joyent, Inc. * Copyright (c) 2011, 2018 by Delphix. All rights reserved. * Copyright (c) 2014 by Saso Kiselkov. All rights reserved. - * Copyright 2015 Nexenta Systems, Inc. All rights reserved. + * Copyright 2017 Nexenta Systems, Inc. All rights reserved. */ /* @@ -299,7 +299,7 @@ #endif #include #include -#include +#include #include #include #include @@ -311,10 +311,22 @@ boolean_t arc_watch = B_FALSE; #endif -static kmutex_t arc_reclaim_lock; -static kcondvar_t arc_reclaim_thread_cv; -static boolean_t arc_reclaim_thread_exit; -static kcondvar_t arc_reclaim_waiters_cv; +/* + * This thread's job is to keep enough free memory in the system, by + * calling arc_kmem_reap_soon() plus arc_reduce_target_size(), which improves + * arc_available_memory(). + */ +static zthr_t *arc_reap_zthr; + +/* + * This thread's job is to keep arc_size under arc_c, by calling + * arc_adjust(), which improves arc_is_overflowing(). 
+ */ +static zthr_t *arc_adjust_zthr; + +static kmutex_t arc_adjust_lock; +static kcondvar_t arc_adjust_waiters_cv; +static boolean_t arc_adjust_needed = B_FALSE; /* * The number of headers to evict in arc_evict_state_impl() before @@ -326,20 +338,25 @@ static kcondvar_t arc_reclaim_waiters_cv; int zfs_arc_evict_batch_limit = 10; /* number of seconds before growing cache again */ -static int arc_grow_retry = 5; +static int arc_grow_retry = 5; + +/* + * Minimum time between calls to arc_kmem_reap_soon(). + */ +int arc_kmem_cache_reap_retry_ms = 1000; /* shift of arc_c for calculating overflow limit in arc_get_data_impl */ -int zfs_arc_overflow_shift = 8; +int zfs_arc_overflow_shift = 8; /* shift of arc_c for calculating both min and max arc_p */ -static int arc_p_min_shift = 4; +int arc_p_min_shift = 4; /* log2(fraction of arc to reclaim) */ -static int arc_shrink_shift = 7; +static int arc_shrink_shift = 7; /* percent of pagecache to reclaim arc to */ #ifdef _KERNEL -static uint_t zfs_arc_pc_percent = 0; +static uint_t zfs_arc_pc_percent = 0; #endif /* @@ -366,7 +383,10 @@ static int arc_min_prescient_prefetch_ms; */ int arc_lotsfree_percent = 10; -static int arc_dead; +/* + * hdr_recl() uses this to determine if the arc is up and running. + */ +static boolean_t arc_initialized; /* * The arc has filled available memory and has now warmed up. @@ -906,6 +926,7 @@ aggsum_t astat_bonus_size; aggsum_t astat_hdr_size; aggsum_t astat_l2_hdr_size; +static hrtime_t arc_growtime; static list_t arc_prune_list; static kmutex_t arc_prune_mtx; static taskq_t *arc_prune_taskq; @@ -1380,8 +1401,8 @@ hdr_recl(void *unused) * umem calls the reclaim func when we destroy the buf cache, * which is after we do arc_fini(). */ - if (!arc_dead) - cv_signal(&arc_reclaim_thread_cv); + if (arc_initialized) + zthr_wakeup(arc_reap_zthr); } static void @@ -4097,13 +4118,14 @@ arc_evict_state_impl(multilist_t *ml, int idx, arc_buf_hdr_t *marker, * function should proceed in this case). 
* * If threads are left sleeping, due to not - * using cv_broadcast, they will be woken up - * just before arc_reclaim_thread() sleeps. + * using cv_broadcast here, they will be woken + * up via cv_broadcast in arc_adjust_cb() just + * before arc_adjust_zthr sleeps. */ - mutex_enter(&arc_reclaim_lock); + mutex_enter(&arc_adjust_lock); if (!arc_is_overflowing()) - cv_signal(&arc_reclaim_waiters_cv); - mutex_exit(&arc_reclaim_lock); + cv_signal(&arc_adjust_waiters_cv); + mutex_exit(&arc_adjust_lock); } else { ARCSTAT_BUMP(arcstat_mutex_miss); } @@ -4763,8 +4785,8 @@ arc_flush(spa_t *spa, boolean_t retry) (void) arc_flush_state(arc_mfu_ghost, guid, ARC_BUFC_METADATA, retry); } -void -arc_shrink(int64_t to_free) +static void +arc_reduce_target_size(int64_t to_free) { uint64_t asize = aggsum_value(&arc_size); uint64_t c = arc_c; @@ -4782,10 +4804,14 @@ arc_shrink(int64_t to_free) arc_c = arc_c_min; } - if (asize > arc_c) - (void) arc_adjust(); + if (asize > arc_c) { + /* See comment in arc_adjust_cb_check() on why lock+flag */ + mutex_enter(&arc_adjust_lock); + arc_adjust_needed = B_TRUE; + mutex_exit(&arc_adjust_lock); + zthr_wakeup(arc_adjust_zthr); + } } - /* * Return maximum amount of memory that we could possibly use. Reduced * to half of all memory in user space which is primarily used for testing. @@ -4989,7 +5015,7 @@ arc_reclaim_needed(void) } static void -arc_kmem_reap_now(void) +arc_kmem_reap_soon(void) { size_t i; kmem_cache_t *prev_cache = NULL; @@ -5044,135 +5070,169 @@ arc_kmem_reap_now(void) } } +/* ARGSUSED */ +static boolean_t +arc_adjust_cb_check(void *arg, zthr_t *zthr) +{ + /* + * This is necessary in order to keep the kstat information + * up to date for tools that display kstat data such as the + * mdb ::arc dcmd and the Linux crash utility. These tools + * typically do not call kstat's update function, but simply + * dump out stats from the most recent update. 
Without + * this call, these commands may show stale stats for the + * anon, mru, mru_ghost, mfu, and mfu_ghost lists. Even + * with this change, the data might still be out of date, + * since the arc_adjust_zthr is created without a timer and + * only runs this check when woken; but that should suffice. The + * arc_state_t structures can be queried directly if more + * accurate information is needed. + */ + if (arc_ksp != NULL) + arc_ksp->ks_update(arc_ksp, KSTAT_READ); + + /* + * We have to rely on arc_get_data_impl() to tell us when to adjust, + * rather than checking if we are overflowing here, so that we are + * sure to not leave arc_get_data_impl() waiting on + * arc_adjust_waiters_cv. If we have become "not overflowing" since + * arc_get_data_impl() checked, we need to wake it up. We could + * broadcast the CV here, but arc_get_data_impl() may have not yet + * gone to sleep. We would need to use a mutex to ensure that this + * function doesn't broadcast until arc_get_data_impl() has gone to + * sleep (e.g. the arc_adjust_lock). However, the lock ordering of + * such a lock would necessarily be incorrect with respect to the + * zthr_lock, which is held before this function is called, and is + * held by arc_get_data_impl() when it calls zthr_wakeup(). + */ + return (arc_adjust_needed); +} + /* - * Threads can block in arc_get_data_impl() waiting for this thread to evict - * enough data and signal them to proceed. When this happens, the threads in - * arc_get_data_impl() are sleeping while holding the hash lock for their - * particular arc header. Thus, we must be careful to never sleep on a - * hash lock in this thread. This is to prevent the following deadlock: - * - * - Thread A sleeps on CV in arc_get_data_impl() holding hash lock "L", - * waiting for the reclaim thread to signal it. - * - * - arc_reclaim_thread() tries to acquire hash lock "L" using mutex_enter, - * fails, and goes to sleep forever.
- * - * This possible deadlock is avoided by always acquiring a hash lock - * using mutex_tryenter() from arc_reclaim_thread(). + * Keep arc_size under arc_c by running arc_adjust which evicts data + * from the ARC. */ /* ARGSUSED */ -static void -arc_reclaim_thread(void *unused) +static int +arc_adjust_cb(void *arg, zthr_t *zthr) { - fstrans_cookie_t cookie = spl_fstrans_mark(); - hrtime_t growtime = 0; - callb_cpr_t cpr; - - CALLB_CPR_INIT(&cpr, &arc_reclaim_lock, callb_generic_cpr, FTAG); + uint64_t evicted = 0; + fstrans_cookie_t cookie = spl_fstrans_mark(); - mutex_enter(&arc_reclaim_lock); - while (!arc_reclaim_thread_exit) { - uint64_t evicted = 0; - uint64_t need_free = arc_need_free; - arc_tuning_update(); + /* Evict from cache */ + evicted = arc_adjust(); + /* + * If evicted is zero, we couldn't evict anything + * via arc_adjust(). This could be due to hash lock + * collisions, but more likely due to the majority of + * arc buffers being unevictable. Therefore, even if + * arc_size is above arc_c, another pass is unlikely to + * be helpful and could potentially cause us to enter an + * infinite loop. Additionally, zthr_iscancelled() is + * checked here so that if the arc is shutting down, the + * broadcast will wake any remaining arc adjust waiters. + */ + mutex_enter(&arc_adjust_lock); + arc_adjust_needed = !zthr_iscancelled(arc_adjust_zthr) && + evicted > 0 && aggsum_compare(&arc_size, arc_c) > 0; + if (!arc_adjust_needed) { /* - * This is necessary in order for the mdb ::arc dcmd to - * show up to date information. Since the ::arc command - * does not call the kstat's update function, without - * this call, the command may show stale stats for the - * anon, mru, mru_ghost, mfu, and mfu_ghost lists. Even - * with this change, the data might be up to 1 second - * out of date; but that should suffice. The arc_state_t - * structures can be queried directly if more accurate - * information is needed. 
+ * We're either no longer overflowing, or we + * can't evict anything more, so we should wake + * arc_get_data_impl() sooner. */ -#ifndef __linux__ - if (arc_ksp != NULL) - arc_ksp->ks_update(arc_ksp, KSTAT_READ); -#endif - mutex_exit(&arc_reclaim_lock); + cv_broadcast(&arc_adjust_waiters_cv); + arc_need_free = 0; + } + mutex_exit(&arc_adjust_lock); + spl_fstrans_unmark(cookie); + + return (0); +} +/* ARGSUSED */ +static boolean_t +arc_reap_cb_check(void *arg, zthr_t *zthr) +{ + int64_t free_memory = arc_available_memory(); + + /* + * If a kmem reap is already active, don't schedule more. We must + * check for this because kmem_cache_reap_soon() won't actually + * block on the cache being reaped (this is to prevent callers from + * becoming implicitly blocked by a system-wide kmem reap -- which, + * on a system with many, many full magazines, can take minutes). + */ + if (!kmem_cache_reap_active() && free_memory < 0) { + + arc_no_grow = B_TRUE; + arc_warm = B_TRUE; /* - * We call arc_adjust() before (possibly) calling - * arc_kmem_reap_now(), so that we can wake up - * arc_get_data_buf() sooner. + * Wait at least arc_grow_retry (default 5) seconds + * before considering growing. */ - evicted = arc_adjust(); + arc_growtime = gethrtime() + SEC2NSEC(arc_grow_retry); + return (B_TRUE); + } else if (free_memory < arc_c >> arc_no_grow_shift) { + arc_no_grow = B_TRUE; + } else if (gethrtime() >= arc_growtime) { + arc_no_grow = B_FALSE; + } - int64_t free_memory = arc_available_memory(); - if (free_memory < 0) { + return (B_FALSE); +} - arc_no_grow = B_TRUE; - arc_warm = B_TRUE; +/* + * Keep enough free memory in the system by reaping the ARC's kmem + * caches. To cause more slabs to be reapable, we may reduce the + * target size of the cache (arc_c), causing the arc_adjust_cb() + * to free more buffers.
+ */ +/* ARGSUSED */ +static int +arc_reap_cb(void *arg, zthr_t *zthr) +{ + int64_t free_memory; + fstrans_cookie_t cookie = spl_fstrans_mark(); - /* - * Wait at least zfs_grow_retry (default 5) seconds - * before considering growing. - */ - growtime = gethrtime() + SEC2NSEC(arc_grow_retry); + /* + * Kick off asynchronous kmem_reap()'s of all our caches. + */ + arc_kmem_reap_soon(); - arc_kmem_reap_now(); + /* + * Wait at least arc_kmem_cache_reap_retry_ms between + * arc_kmem_reap_soon() calls. Without this check it is possible to + * end up in a situation where we spend lots of time reaping + * caches, while we're near arc_c_min. Waiting here also gives the + * subsequent free memory check a chance of finding that the + * asynchronous reap has already freed enough memory, and we don't + * need to call arc_reduce_target_size(). + */ + delay((hz * arc_kmem_cache_reap_retry_ms + 999) / 1000); - /* - * If we are still low on memory, shrink the ARC - * so that we have arc_shrink_min free space. - */ - free_memory = arc_available_memory(); + /* + * Reduce the target size as needed to maintain the amount of free + * memory in the system at a fraction of the arc_size (1/128th by + * default). If oversubscribed (free_memory < 0) then reduce the + * target arc_size by the deficit amount plus the fractional + * amount. If free memory is positive but less than the fractional + * amount, reduce by what is needed to hit the fractional amount.
+ */ + free_memory = arc_available_memory(); - int64_t to_free = - (arc_c >> arc_shrink_shift) - free_memory; - if (to_free > 0) { + int64_t to_free = + (arc_c >> arc_shrink_shift) - free_memory; + if (to_free > 0) { #ifdef _KERNEL - to_free = MAX(to_free, need_free); + to_free = MAX(to_free, arc_need_free); #endif - arc_shrink(to_free); - } - } else if (free_memory < arc_c >> arc_no_grow_shift) { - arc_no_grow = B_TRUE; - } else if (gethrtime() >= growtime) { - arc_no_grow = B_FALSE; - } - - mutex_enter(&arc_reclaim_lock); - - /* - * If evicted is zero, we couldn't evict anything via - * arc_adjust(). This could be due to hash lock - * collisions, but more likely due to the majority of - * arc buffers being unevictable. Therefore, even if - * arc_size is above arc_c, another pass is unlikely to - * be helpful and could potentially cause us to enter an - * infinite loop. - */ - if (aggsum_compare(&arc_size, arc_c) <= 0|| evicted == 0) { - /* - * We're either no longer overflowing, or we - * can't evict anything more, so we should wake - * up any threads before we go to sleep and remove - * the bytes we were working on from arc_need_free - * since nothing more will be done here. 
- */ - cv_broadcast(&arc_reclaim_waiters_cv); - ARCSTAT_INCR(arcstat_need_free, -need_free); - - /* - * Block until signaled, or after one second (we - * might need to perform arc_kmem_reap_now() - * even if we aren't being signalled) - */ - CALLB_CPR_SAFE_BEGIN(&cpr); - (void) cv_timedwait_sig_hires(&arc_reclaim_thread_cv, - &arc_reclaim_lock, SEC2NSEC(1), MSEC2NSEC(1), 0); - CALLB_CPR_SAFE_END(&cpr, &arc_reclaim_lock); - } + arc_reduce_target_size(to_free); } - - arc_reclaim_thread_exit = B_FALSE; - cv_broadcast(&arc_reclaim_thread_cv); - CALLB_CPR_EXIT(&cpr); /* drops arc_reclaim_lock */ spl_fstrans_unmark(cookie); - thread_exit(); + + return (0); } #ifdef _KERNEL @@ -5276,21 +5336,21 @@ __arc_shrinker_func(struct shrinker *shrink, struct shrink_control *sc) return (SHRINK_STOP); /* Reclaim in progress */ - if (mutex_tryenter(&arc_reclaim_lock) == 0) { + if (mutex_tryenter(&arc_adjust_lock) == 0) { ARCSTAT_INCR(arcstat_need_free, ptob(sc->nr_to_scan)); return (0); } - mutex_exit(&arc_reclaim_lock); + mutex_exit(&arc_adjust_lock); /* * Evict the requested number of pages by shrinking arc_c the * requested amount. */ if (pages > 0) { - arc_shrink(ptob(sc->nr_to_scan)); + arc_reduce_target_size(ptob(sc->nr_to_scan)); if (current_is_kswapd()) - arc_kmem_reap_now(); + arc_kmem_reap_soon(); #ifdef HAVE_SPLIT_SHRINKER_CALLBACK pages = MAX((int64_t)pages - (int64_t)btop(arc_evictable_memory()), 0); @@ -5300,7 +5360,7 @@ __arc_shrinker_func(struct shrinker *shrink, struct shrink_control *sc) /* * We've shrunk what we can, wake up threads. 
*/ - cv_broadcast(&arc_reclaim_waiters_cv); + cv_broadcast(&arc_adjust_waiters_cv); } else pages = SHRINK_STOP; @@ -5315,7 +5375,7 @@ __arc_shrinker_func(struct shrinker *shrink, struct shrink_control *sc) ARCSTAT_BUMP(arcstat_memory_indirect_count); } else { arc_no_grow = B_TRUE; - arc_kmem_reap_now(); + arc_kmem_reap_soon(); ARCSTAT_BUMP(arcstat_memory_direct_count); } @@ -5369,8 +5429,11 @@ arc_adapt(int bytes, arc_state_t *state) } ASSERT((int64_t)arc_p >= 0); + /* + * Wake reap thread if we do not have any available memory + */ if (arc_reclaim_needed()) { - cv_signal(&arc_reclaim_thread_cv); + zthr_wakeup(arc_reap_zthr); return; } @@ -5478,7 +5541,7 @@ arc_get_data_impl(arc_buf_hdr_t *hdr, uint64_t size, void *tag) * overflowing; thus we don't use a while loop here. */ if (arc_is_overflowing()) { - mutex_enter(&arc_reclaim_lock); + mutex_enter(&arc_adjust_lock); /* * Now that we've acquired the lock, we may no longer be @@ -5492,11 +5555,12 @@ arc_get_data_impl(arc_buf_hdr_t *hdr, uint64_t size, void *tag) * shouldn't cause any harm. 
*/ if (arc_is_overflowing()) { - cv_signal(&arc_reclaim_thread_cv); - cv_wait(&arc_reclaim_waiters_cv, &arc_reclaim_lock); + arc_adjust_needed = B_TRUE; + zthr_wakeup(arc_adjust_zthr); + (void) cv_wait(&arc_adjust_waiters_cv, + &arc_adjust_lock); } - - mutex_exit(&arc_reclaim_lock); + mutex_exit(&arc_adjust_lock); } VERIFY3U(hdr->b_type, ==, type); @@ -7687,10 +7751,8 @@ void arc_init(void) { uint64_t percent, allmem = arc_all_memory(); - - mutex_init(&arc_reclaim_lock, NULL, MUTEX_DEFAULT, NULL); - cv_init(&arc_reclaim_thread_cv, NULL, CV_DEFAULT, NULL); - cv_init(&arc_reclaim_waiters_cv, NULL, CV_DEFAULT, NULL); + mutex_init(&arc_adjust_lock, NULL, MUTEX_DEFAULT, NULL); + cv_init(&arc_adjust_waiters_cv, NULL, CV_DEFAULT, NULL); arc_min_prefetch_ms = 1000; arc_min_prescient_prefetch_ms = 6000; @@ -7750,6 +7812,13 @@ arc_init(void) arc_c = arc_c_min; arc_state_init(); + + /* + * The arc must be "uninitialized", so that hdr_recl() (which is + * registered by buf_init()) will not access arc_reap_zthr before + * it is created. 
+ */ + ASSERT(!arc_initialized); buf_init(); list_create(&arc_prune_list, sizeof (arc_prune_t), @@ -7759,8 +7828,6 @@ arc_init(void) arc_prune_taskq = taskq_create("arc_prune", max_ncpus, defclsyspri, max_ncpus, INT_MAX, TASKQ_PREPOPULATE | TASKQ_DYNAMIC); - arc_reclaim_thread_exit = B_FALSE; - arc_ksp = kstat_create("zfs", 0, "arcstats", "misc", KSTAT_TYPE_NAMED, sizeof (arc_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL); @@ -7770,10 +7837,12 @@ arc_init(void) kstat_install(arc_ksp); } - (void) thread_create(NULL, 0, arc_reclaim_thread, NULL, 0, &p0, - TS_RUN, defclsyspri); + arc_adjust_zthr = zthr_create(arc_adjust_cb_check, + arc_adjust_cb, NULL); + arc_reap_zthr = zthr_create_timer(arc_reap_cb_check, + arc_reap_cb, NULL, SEC2NSEC(1)); - arc_dead = B_FALSE; + arc_initialized = B_TRUE; arc_warm = B_FALSE; /* @@ -7805,22 +7874,10 @@ arc_fini(void) spl_unregister_shrinker(&arc_shrinker); #endif /* _KERNEL */ - mutex_enter(&arc_reclaim_lock); - arc_reclaim_thread_exit = B_TRUE; - /* - * The reclaim thread will set arc_reclaim_thread_exit back to - * B_FALSE when it is finished exiting; we're waiting for that. 
- */ - while (arc_reclaim_thread_exit) { - cv_signal(&arc_reclaim_thread_cv); - cv_wait(&arc_reclaim_thread_cv, &arc_reclaim_lock); - } - mutex_exit(&arc_reclaim_lock); - /* Use B_TRUE to ensure *all* buffers are evicted */ arc_flush(NULL, B_TRUE); - arc_dead = B_TRUE; + arc_initialized = B_FALSE; if (arc_ksp != NULL) { kstat_delete(arc_ksp); @@ -7841,9 +7898,14 @@ arc_fini(void) list_destroy(&arc_prune_list); mutex_destroy(&arc_prune_mtx); - mutex_destroy(&arc_reclaim_lock); - cv_destroy(&arc_reclaim_thread_cv); - cv_destroy(&arc_reclaim_waiters_cv); + (void) zthr_cancel(arc_adjust_zthr); + zthr_destroy(arc_adjust_zthr); + + (void) zthr_cancel(arc_reap_zthr); + zthr_destroy(arc_reap_zthr); + + mutex_destroy(&arc_adjust_lock); + cv_destroy(&arc_adjust_waiters_cv); /* * buf_fini() must proceed arc_state_fini() because buf_fin() may diff --git a/module/zfs/zthr.c b/module/zfs/zthr.c index 1c4a8e02cbbe..c5b11dafc5fd 100644 --- a/module/zfs/zthr.c +++ b/module/zfs/zthr.c @@ -47,6 +47,10 @@ * 3] When the zthr is done, it changes the indicator to stopped, allowing * a new cycle to start. * + * Besides being awakened by other threads, a zthr can be configured + * during creation to wakeup on its own after a specified interval + * [see zthr_create_timer()]. + * * == ZTHR creation * * Every zthr needs three inputs to start running: @@ -74,6 +78,9 @@ * * To start a zthr: * zthr_t *zthr_pointer = zthr_create(checkfunc, func, args); + * or + * zthr_t *zthr_pointer = zthr_create_timer(checkfunc, func, + * args, max_sleep); * * After that you should be able to wakeup, cancel, and resume the * zthr from another thread using zthr_pointer. 
@@ -189,7 +196,13 @@ zthr_procedure(void *arg) mutex_enter(&t->zthr_lock); } else { /* go to sleep */ - cv_wait_sig(&t->zthr_cv, &t->zthr_lock); + if (t->zthr_wait_time == 0) { + cv_wait_sig(&t->zthr_cv, &t->zthr_lock); + } else { + (void) cv_timedwait_sig_hires(&t->zthr_cv, + &t->zthr_lock, t->zthr_wait_time, + MSEC2NSEC(1), 0); + } } } mutex_exit(&t->zthr_lock); @@ -199,6 +212,18 @@ zthr_procedure(void *arg) zthr_t * zthr_create(zthr_checkfunc_t *checkfunc, zthr_func_t *func, void *arg) +{ + return (zthr_create_timer(checkfunc, func, arg, (hrtime_t)0)); +} + +/* + * Create a zthr with a specified maximum sleep time. If the time + * in sleeping state exceeds max_sleep, a wakeup (do the check and + * start working if required) will be triggered. + */ +zthr_t * +zthr_create_timer(zthr_checkfunc_t *checkfunc, zthr_func_t *func, + void *arg, hrtime_t max_sleep) { zthr_t *t = kmem_zalloc(sizeof (*t), KM_SLEEP); mutex_init(&t->zthr_lock, NULL, MUTEX_DEFAULT, NULL); @@ -208,6 +233,7 @@ zthr_create(zthr_checkfunc_t *checkfunc, zthr_func_t *func, void *arg) t->zthr_checkfunc = checkfunc; t->zthr_func = func; t->zthr_arg = arg; + t->zthr_wait_time = max_sleep; t->zthr_thread = thread_create(NULL, 0, zthr_procedure, t, 0, &p0, TS_RUN, minclsyspri);