diff --git a/usr/src/uts/common/fs/zfs/arc.c b/usr/src/uts/common/fs/zfs/arc.c index 435096becc75..656b4212febd 100644 --- a/usr/src/uts/common/fs/zfs/arc.c +++ b/usr/src/uts/common/fs/zfs/arc.c @@ -149,13 +149,7 @@ static kmutex_t arc_reclaim_thr_lock; static kcondvar_t arc_reclaim_thr_cv; /* used to signal reclaim thr */ static uint8_t arc_thread_exit; -#define ARC_REDUCE_DNLC_PERCENT 3 -uint_t arc_reduce_dnlc_percent = ARC_REDUCE_DNLC_PERCENT; - -typedef enum arc_reclaim_strategy { - ARC_RECLAIM_AGGR, /* Aggressive reclaim strategy */ - ARC_RECLAIM_CONS /* Conservative reclaim strategy */ -} arc_reclaim_strategy_t; +uint_t arc_reduce_dnlc_percent = 3; /* * The number of iterations through arc_evict_*() before we @@ -170,7 +164,19 @@ static int arc_grow_retry = 60; static int arc_p_min_shift = 4; /* log2(fraction of arc to reclaim) */ -static int arc_shrink_shift = 5; +static int arc_shrink_shift = 7; + +/* + * log2(fraction of ARC which must be free to allow growing). + * I.e. If there is less than arc_c >> arc_no_grow_shift free memory, + * when reading a new block into the ARC, we will evict an equal-sized block + * from the ARC. + * + * This must be less than arc_shrink_shift, so that when we shrink the ARC, + * we will still not allow it to grow. + */ +int arc_no_grow_shift = 5; + /* * minimum lifespan of a prefetch block in clock ticks @@ -2194,16 +2200,10 @@ arc_flush(spa_t *spa) } void -arc_shrink(void) +arc_shrink(int64_t to_free) { if (arc_c > arc_c_min) { - uint64_t to_free; -#ifdef _KERNEL - to_free = MAX(arc_c >> arc_shrink_shift, ptob(needfree)); -#else - to_free = arc_c >> arc_shrink_shift; -#endif if (arc_c > arc_c_min + to_free) atomic_add_64(&arc_c, -to_free); else @@ -2222,25 +2222,49 @@ arc_shrink(void) arc_adjust(); } +typedef enum free_memory_reason_t { + FMR_UNKNOWN, + FMR_NEEDFREE, + FMR_LOTSFREE, + FMR_SWAPFS_MINFREE, + FMR_PAGES_PP_MAXIMUM, + FMR_HEAP_ARENA, + FMR_ZIO_ARENA, +} free_memory_reason_t; + +int64_t last_free_memory; +free_memory_reason_t last_free_reason; + /* - * Determine if the system is under memory pressure and is asking - * to reclaim memory. A return value of 1 indicates that the system - * is under memory pressure and that the arc should adjust accordingly. + * Additional reserve of pages for pp_reserve. */ -static int -arc_reclaim_needed(void) -{ - uint64_t extra; +int64_t arc_pages_pp_reserve = 64; -#ifdef _KERNEL +/* + * Additional reserve of pages for swapfs. + */ +int64_t arc_swapfs_reserve = 64; - if (needfree) - return (1); +/* + * Return the amount of memory that can be consumed before reclaim will be + * needed. Positive if there is sufficient free memory, negative indicates + * the amount of memory that needs to be freed up. + */ +static int64_t +arc_available_memory(void) +{ + int64_t lowest = INT64_MAX; + int64_t n; + free_memory_reason_t r = FMR_UNKNOWN; - /* - * take 'desfree' extra pages, so we reclaim sooner, rather than later - */ - extra = desfree; +#ifdef _KERNEL + if (needfree > 0) { + n = PAGESIZE * (-needfree); + if (n < lowest) { + lowest = n; + r = FMR_NEEDFREE; + } + } /* * check that we're out of range of the pageout scanner. It starts to @@ -2249,8 +2273,11 @@ arc_reclaim_needed(void) * number of needed free pages. We add extra pages here to make sure * the scanner doesn't start up while we're freeing memory. */ - if (freemem < lotsfree + needfree + extra) - return (1); + n = PAGESIZE * (freemem - lotsfree - needfree - desfree); + if (n < lowest) { + lowest = n; + r = FMR_LOTSFREE; + } /* * check to make sure that swapfs has enough space so that anon @@ -2259,8 +2286,13 @@ arc_reclaim_needed(void) * swap pages. We also add a bit of extra here just to prevent * circumstances from getting really dire. */ - if (availrmem < swapfs_minfree + swapfs_reserve + extra) - return (1); + n = PAGESIZE * (availrmem - swapfs_minfree - swapfs_reserve - + desfree - arc_swapfs_reserve); + if (n < lowest) { + lowest = n; + r = FMR_SWAPFS_MINFREE; + } + /* * Check that we have enough availrmem that memory locking (e.g., via @@ -2269,8 +2301,12 @@ arc_reclaim_needed(void) * drops below pages_pp_maximum, page locking mechanisms such as * page_pp_lock() will fail.) */ - if (availrmem <= pages_pp_maximum) - return (1); + n = PAGESIZE * (availrmem - pages_pp_maximum - + arc_pages_pp_reserve); + if (n < lowest) { + lowest = n; + r = FMR_PAGES_PP_MAXIMUM; + } #if defined(__i386) /* @@ -2284,9 +2320,12 @@ arc_reclaim_needed(void) * heap is allocated. (Or, in the calculation, if less than 1/4th is * free) */ - if (vmem_size(heap_arena, VMEM_FREE) < - (vmem_size(heap_arena, VMEM_FREE | VMEM_ALLOC) >> 2)) - return (1); + n = vmem_size(heap_arena, VMEM_FREE) - + (vmem_size(heap_arena, VMEM_FREE | VMEM_ALLOC) >> 2); + if (n < lowest) { + lowest = n; + r = FMR_HEAP_ARENA; + } #endif /* @@ -2298,19 +2337,40 @@ arc_reclaim_needed(void) * to aggressively evict memory from the arc in order to avoid * memory fragmentation issues. */ - if (zio_arena != NULL && - vmem_size(zio_arena, VMEM_FREE) < - (vmem_size(zio_arena, VMEM_ALLOC) >> 4)) - return (1); + if (zio_arena != NULL) { + n = vmem_size(zio_arena, VMEM_FREE) - + (vmem_size(zio_arena, VMEM_ALLOC) >> 4); + if (n < lowest) { + lowest = n; + r = FMR_ZIO_ARENA; + } + } #else + /* Every 100 calls, free a small amount */ if (spa_get_random(100) == 0) - return (1); + lowest = -1024; #endif - return (0); + + last_free_memory = lowest; + last_free_reason = r; + + return (lowest); +} + + +/* + * Determine if the system is under memory pressure and is asking + * to reclaim memory. A return value of TRUE indicates that the system + * is under memory pressure and that the arc should adjust accordingly. + */ +static boolean_t +arc_reclaim_needed(void) +{ + return (arc_available_memory() < 0); } static void -arc_kmem_reap_now(arc_reclaim_strategy_t strat) +arc_kmem_reap_now(void) { size_t i; kmem_cache_t *prev_cache = NULL; @@ -2335,13 +2395,6 @@ arc_kmem_reap_now(arc_reclaim_strategy_t strat) #endif #endif - /* - * An aggressive reclamation will shrink the cache size as well as - * reap free buffers from the arc kmem caches. - */ - if (strat == ARC_RECLAIM_AGGR) - arc_shrink(); - for (i = 0; i < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; i++) { if (zio_buf_cache[i] != prev_cache) { prev_cache = zio_buf_cache[i]; @@ -2356,47 +2409,57 @@ arc_kmem_reap_now(arc_reclaim_strategy_t strat) kmem_cache_reap_now(hdr_cache); kmem_cache_reap_now(range_seg_cache); - /* - * Ask the vmem areana to reclaim unused memory from its - * quantum caches. - */ - if (zio_arena != NULL && strat == ARC_RECLAIM_AGGR) + if (zio_arena != NULL) { + /* + * Ask the vmem arena to reclaim unused memory from its + * quantum caches. + */ vmem_qcache_reap(zio_arena); + } } static void arc_reclaim_thread(void) { clock_t growtime = 0; - arc_reclaim_strategy_t last_reclaim = ARC_RECLAIM_CONS; callb_cpr_t cpr; CALLB_CPR_INIT(&cpr, &arc_reclaim_thr_lock, callb_generic_cpr, FTAG); mutex_enter(&arc_reclaim_thr_lock); while (arc_thread_exit == 0) { - if (arc_reclaim_needed()) { + int64_t free_memory = arc_available_memory(); + if (free_memory < 0) { - if (arc_no_grow) { - if (last_reclaim == ARC_RECLAIM_CONS) { - last_reclaim = ARC_RECLAIM_AGGR; - } else { - last_reclaim = ARC_RECLAIM_CONS; - } - } else { - arc_no_grow = TRUE; - last_reclaim = ARC_RECLAIM_AGGR; - membar_producer(); - } + arc_no_grow = B_TRUE; + arc_warm = B_TRUE; - /* reset the growth delay for every reclaim */ + /* + * Wait at least zfs_grow_retry (default 60) seconds + * before considering growing. + */ growtime = ddi_get_lbolt() + (arc_grow_retry * hz); - arc_kmem_reap_now(last_reclaim); - arc_warm = B_TRUE; + arc_kmem_reap_now(); - } else if (arc_no_grow && ddi_get_lbolt() >= growtime) { - arc_no_grow = FALSE; + /* + * If we are still low on memory, shrink the ARC + * so that we have arc_shrink_min free space. + */ + free_memory = arc_available_memory(); + + int64_t to_free = + (arc_c >> arc_shrink_shift) - free_memory; + if (to_free > 0) { +#ifdef _KERNEL + to_free = MAX(to_free, ptob(needfree)); +#endif + arc_shrink(to_free); + } + } else if (free_memory < arc_c >> arc_no_grow_shift) { + arc_no_grow = B_TRUE; + } else if (ddi_get_lbolt() >= growtime) { + arc_no_grow = B_FALSE; } arc_adjust(); @@ -3739,6 +3802,15 @@ arc_tempreserve_space(uint64_t reserve, uint64_t txg) void arc_init(void) { + /* + * allmem is "all memory that we could possibly use". + */ +#ifdef _KERNEL + uint64_t allmem = ptob(physmem - swapfs_minfree); +#else + uint64_t allmem = (physmem * PAGESIZE) / 2; +#endif + mutex_init(&arc_reclaim_thr_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&arc_reclaim_thr_cv, NULL, CV_DEFAULT, NULL); @@ -3746,7 +3818,7 @@ arc_init(void) arc_min_prefetch_lifespan = 1 * hz; /* Start out with 1/8 of all memory */ - arc_c = physmem * PAGESIZE / 8; + arc_c = allmem / 8; #ifdef _KERNEL /* @@ -3758,21 +3830,21 @@ arc_init(void) #endif /* set min cache to 1/32 of all memory, or 64MB, whichever is more */ - arc_c_min = MAX(arc_c / 4, 64<<20); + arc_c_min = MAX(allmem / 32, 64 << 20); /* set max to 3/4 of all memory, or all but 1GB, whichever is more */ - if (arc_c * 8 >= 1<<30) - arc_c_max = (arc_c * 8) - (1<<30); + if (allmem >= 1 << 30) + arc_c_max = allmem - (1 << 30); else arc_c_max = arc_c_min; - arc_c_max = MAX(arc_c * 6, arc_c_max); + arc_c_max = MAX(allmem * 3 / 4, arc_c_max); /* * Allow the tunables to override our calculations if they are * reasonable (ie. over 64MB) */ - if (zfs_arc_max > 64<<20 && zfs_arc_max < physmem * PAGESIZE) + if (zfs_arc_max > 64 << 20 && zfs_arc_max < allmem) arc_c_max = zfs_arc_max; - if (zfs_arc_min > 64<<20 && zfs_arc_min <= arc_c_max) + if (zfs_arc_min > 64 << 20 && zfs_arc_min <= arc_c_max) arc_c_min = zfs_arc_min; arc_c = arc_c_max; @@ -3800,6 +3872,12 @@ arc_init(void) if (zfs_arc_shrink_shift > 0) arc_shrink_shift = zfs_arc_shrink_shift; + /* + * Ensure that arc_no_grow_shift is less than arc_shrink_shift. + */ + if (arc_no_grow_shift >= arc_shrink_shift) + arc_no_grow_shift = arc_shrink_shift - 1; + if (zfs_arc_p_min_shift > 0) arc_p_min_shift = zfs_arc_p_min_shift;