Skip to content

Commit

Permalink
Cap metaslab memory usage
Browse files Browse the repository at this point in the history
On systems with large amounts of storage and high fragmentation, a huge 
amount of space can be used by storing metaslab range trees. Since 
metaslabs are only unloaded during a txg sync, and only if they have 
been inactive for 8 txgs, it is possible to get into a state where all 
of the system's memory is consumed by range trees and metaslabs, and 
txgs cannot sync. While ZFS knows how to evict ARC data when needed, 
it has no such mechanism for range tree data. This can result in boot 
hangs for some system configurations.

First, we add the ability to unload metaslabs outside of syncing 
context. Second, we store a multilist of all loaded metaslabs, sorted 
by their selection txg, so we can quickly identify the oldest 
metaslabs.  We use a multilist to reduce lock contention during heavy 
write workloads. Finally, we add logic that will unload a metaslab 
when we're loading a new metaslab, if we're using more than a certain 
fraction of the available memory on range trees.

Reviewed-by: Matt Ahrens <[email protected]>
Reviewed-by: George Wilson <[email protected]>
Reviewed-by: Sebastien Roy <[email protected]>
Reviewed-by: Serapheim Dimitropoulos <[email protected]>
Reviewed-by: Brian Behlendorf <[email protected]>
Signed-off-by: Paul Dagnelie <[email protected]>
Closes #9128
  • Loading branch information
pcd1193182 authored and behlendorf committed Aug 16, 2019
1 parent 9323aad commit f09fda5
Show file tree
Hide file tree
Showing 11 changed files with 289 additions and 58 deletions.
1 change: 1 addition & 0 deletions include/sys/arc.h
Original file line number Diff line number Diff line change
Expand Up @@ -291,6 +291,7 @@ void arc_flush(spa_t *spa, boolean_t retry);
void arc_tempreserve_clear(uint64_t reserve);
int arc_tempreserve_space(spa_t *spa, uint64_t reserve, uint64_t txg);

uint64_t arc_all_memory(void);
uint64_t arc_target_bytes(void);
void arc_init(void);
void arc_fini(void);
Expand Down
6 changes: 3 additions & 3 deletions include/sys/metaslab.h
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,6 @@ int metaslab_sort_by_flushed(const void *, const void *);
uint64_t metaslab_unflushed_changes_memused(metaslab_t *);

int metaslab_load(metaslab_t *);
void metaslab_potentially_unload(metaslab_t *, uint64_t);
void metaslab_unload(metaslab_t *);
boolean_t metaslab_flush(metaslab_t *, dmu_tx_t *);

Expand Down Expand Up @@ -110,7 +109,7 @@ uint64_t metaslab_class_expandable_space(metaslab_class_t *);
boolean_t metaslab_class_throttle_reserve(metaslab_class_t *, int, int,
zio_t *, int);
void metaslab_class_throttle_unreserve(metaslab_class_t *, int, int, zio_t *);

void metaslab_class_evict_old(metaslab_class_t *, uint64_t);
uint64_t metaslab_class_get_alloc(metaslab_class_t *);
uint64_t metaslab_class_get_space(metaslab_class_t *);
uint64_t metaslab_class_get_dspace(metaslab_class_t *);
Expand All @@ -133,7 +132,8 @@ void metaslab_group_alloc_decrement(spa_t *, uint64_t, void *, int, int,
void metaslab_group_alloc_verify(spa_t *, const blkptr_t *, void *, int);
void metaslab_recalculate_weight_and_sort(metaslab_t *);
void metaslab_disable(metaslab_t *);
void metaslab_enable(metaslab_t *, boolean_t);
void metaslab_enable(metaslab_t *, boolean_t, boolean_t);
void metaslab_set_selected_txg(metaslab_t *, uint64_t);

extern int metaslab_debug_load;

Expand Down
12 changes: 12 additions & 0 deletions include/sys/metaslab_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@
#include <sys/vdev.h>
#include <sys/txg.h>
#include <sys/avl.h>
#include <sys/multilist.h>

#ifdef __cplusplus
extern "C" {
Expand Down Expand Up @@ -194,6 +195,12 @@ struct metaslab_class {
uint64_t mc_space; /* total space (alloc + free) */
uint64_t mc_dspace; /* total deflated space */
uint64_t mc_histogram[RANGE_TREE_HISTOGRAM_SIZE];

/*
* List of all loaded metaslabs in the class, sorted in order of most
* recent use.
*/
multilist_t *mc_metaslab_txg_list;
};

/*
Expand Down Expand Up @@ -378,6 +385,7 @@ struct metaslab {
range_tree_t *ms_allocating[TXG_SIZE];
range_tree_t *ms_allocatable;
uint64_t ms_allocated_this_txg;
uint64_t ms_allocating_total;

/*
* The following range trees are accessed only from syncing context.
Expand Down Expand Up @@ -508,6 +516,10 @@ struct metaslab {
avl_node_t ms_group_node; /* node in metaslab group tree */
txg_node_t ms_txg_node; /* per-txg dirty metaslab links */
avl_node_t ms_spa_txg_node; /* node in spa_metaslabs_by_txg */
/*
* Node in metaslab class's selected txg list
*/
multilist_node_t ms_class_txg_node;

/*
* Allocs and frees that are committed to the vdev log spacemap but
Expand Down
15 changes: 15 additions & 0 deletions man/man5/zfs-module-parameters.5
Original file line number Diff line number Diff line change
Expand Up @@ -386,6 +386,21 @@ considering only the histogram instead.
Default value: \fB3600 seconds\fR (one hour)
.RE

.sp
.ne 2
.na
\fBzfs_metaslab_mem_limit\fR (int)
.ad
.RS 12n
When we are loading a new metaslab, we check the amount of memory being used
to store metaslab range trees. If it is over a threshold, we attempt to unload
the least recently used metaslab to prevent the system from clogging all of
its memory with range trees. This tunable sets the percentage of total system
memory that is the threshold.
.sp
Default value: \fB75 percent\fR
.RE

.sp
.ne 2
.na
Expand Down
3 changes: 1 addition & 2 deletions module/zfs/arc.c
Original file line number Diff line number Diff line change
Expand Up @@ -1110,7 +1110,6 @@ static boolean_t arc_is_overflowing(void);
static void arc_buf_watch(arc_buf_t *);
static void arc_tuning_update(void);
static void arc_prune_async(int64_t);
static uint64_t arc_all_memory(void);

static arc_buf_contents_t arc_buf_type(arc_buf_hdr_t *);
static uint32_t arc_bufc_to_flags(arc_buf_contents_t);
Expand Down Expand Up @@ -4828,7 +4827,7 @@ arc_reduce_target_size(int64_t to_free)
* Return maximum amount of memory that we could possibly use. Reduced
* to half of all memory in user space which is primarily used for testing.
*/
static uint64_t
uint64_t
arc_all_memory(void)
{
#ifdef _KERNEL
Expand Down
Loading

0 comments on commit f09fda5

Please sign in to comment.