Skip to content

Commit

Permalink
mm: keep page cache radix tree nodes in check
Browse files Browse the repository at this point in the history
Previously, page cache radix tree nodes were freed after reclaim emptied
out their page pointers.  But now reclaim stores shadow entries in their
place, which are only reclaimed when the inodes themselves are
reclaimed.  This is problematic for bigger files that are still in use
after they have a significant amount of their cache reclaimed, without
any of those pages actually refaulting.  The shadow entries will just
sit there and waste memory.  In the worst case, the shadow entries will
accumulate until the machine runs out of memory.

To get this under control, the VM will track radix tree nodes
exclusively containing shadow entries on a per-NUMA node list.  Per-NUMA
rather than global because we expect the radix tree nodes themselves to
be allocated node-locally and we want to reduce cross-node references of
otherwise independent cache workloads.  A simple shrinker will then
reclaim these nodes on memory pressure.

A few things need to be stored in the radix tree node to implement the
shadow node LRU and allow tree deletions coming from the list:

1. There is no index available that would describe the reverse path
   from the node up to the tree root, which is needed to perform a
   deletion.  To solve this, encode in each node its offset inside the
   parent.  This can be stored in the unused upper bits of the same
   member that stores the node's height at no extra space cost.

2. The number of shadow entries needs to be counted in addition to the
   regular entries, to quickly detect when the node is ready to go to
   the shadow node LRU list.  The current entry count is an unsigned
   int but the maximum number of entries is 64, so a shadow counter
   can easily be stored in the unused upper bits.

3. Tree modification needs tree lock and tree root, which are located
   in the address space, so store an address_space backpointer in the
   node.  The parent pointer of the node is in a union with the 2-word
   rcu_head, so the backpointer comes at no extra cost as well.

4. The node needs to be linked to an LRU list, which requires a list
   head inside the node.  This does increase the size of the node, but
   it does not change the number of objects that fit into a slab page.

[[email protected]: export the right function]
Signed-off-by: Johannes Weiner <[email protected]>
Reviewed-by: Rik van Riel <[email protected]>
Reviewed-by: Minchan Kim <[email protected]>
Cc: Andrea Arcangeli <[email protected]>
Cc: Bob Liu <[email protected]>
Cc: Christoph Hellwig <[email protected]>
Cc: Dave Chinner <[email protected]>
Cc: Greg Thelen <[email protected]>
Cc: Hugh Dickins <[email protected]>
Cc: Jan Kara <[email protected]>
Cc: KOSAKI Motohiro <[email protected]>
Cc: Luigi Semenzato <[email protected]>
Cc: Mel Gorman <[email protected]>
Cc: Metin Doslu <[email protected]>
Cc: Michel Lespinasse <[email protected]>
Cc: Ozgun Erdogan <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: Roman Gushchin <[email protected]>
Cc: Ryan Mallon <[email protected]>
Cc: Tejun Heo <[email protected]>
Cc: Vlastimil Babka <[email protected]>
Signed-off-by: Andrew Morton <[email protected]>
Signed-off-by: Linus Torvalds <[email protected]>
  • Loading branch information
hnaz authored and torvalds committed Apr 3, 2014
1 parent 139e561 commit 449dd69
Show file tree
Hide file tree
Showing 10 changed files with 359 additions and 43 deletions.
8 changes: 7 additions & 1 deletion include/linux/list_lru.h
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@
/* list_lru_walk_cb has to always return one of those */
enum lru_status {
LRU_REMOVED, /* item removed from list */
LRU_REMOVED_RETRY, /* item removed, but lock has been
dropped and reacquired */
LRU_ROTATE, /* item referenced, give another pass */
LRU_SKIP, /* item cannot be locked, skip */
LRU_RETRY, /* item not freeable. May drop the lock
Expand All @@ -32,7 +34,11 @@ struct list_lru {
};

void list_lru_destroy(struct list_lru *lru);
int list_lru_init(struct list_lru *lru);
int list_lru_init_key(struct list_lru *lru, struct lock_class_key *key);
/**
 * list_lru_init: initialize an lru list without a lock class key
 * @lru: the lru list to initialize
 *
 * Convenience wrapper that forwards to list_lru_init_key() with a NULL
 * @key.  Returns whatever list_lru_init_key() returns.
 */
static inline int list_lru_init(struct list_lru *lru)
{
return list_lru_init_key(lru, NULL);
}

/**
* list_lru_add: add an element to the lru list's tail
Expand Down
1 change: 1 addition & 0 deletions include/linux/mmzone.h
Original file line number Diff line number Diff line change
Expand Up @@ -144,6 +144,7 @@ enum zone_stat_item {
#endif
WORKINGSET_REFAULT,
WORKINGSET_ACTIVATE,
WORKINGSET_NODERECLAIM,
NR_ANON_TRANSPARENT_HUGEPAGES,
NR_FREE_CMA_PAGES,
NR_VM_ZONE_STAT_ITEMS };
Expand Down
32 changes: 24 additions & 8 deletions include/linux/radix-tree.h
Original file line number Diff line number Diff line change
Expand Up @@ -72,21 +72,37 @@ static inline int radix_tree_is_indirect_ptr(void *ptr)
#define RADIX_TREE_TAG_LONGS \
((RADIX_TREE_MAP_SIZE + BITS_PER_LONG - 1) / BITS_PER_LONG)

/* Number of bits in a tree index, which is an unsigned long */
#define RADIX_TREE_INDEX_BITS (8 /* CHAR_BIT */ * sizeof(unsigned long))
/* RADIX_TREE_INDEX_BITS divided by RADIX_TREE_MAP_SHIFT, rounded up */
#define RADIX_TREE_MAX_PATH (DIV_ROUND_UP(RADIX_TREE_INDEX_BITS, \
RADIX_TREE_MAP_SHIFT))

/*
 * Height component in node->path.
 *
 * node->path packs two values: the node's height occupies the low
 * RADIX_TREE_HEIGHT_SHIFT bits (extract with RADIX_TREE_HEIGHT_MASK),
 * and the node's offset inside its parent is stored shifted left by
 * RADIX_TREE_HEIGHT_SHIFT.
 */
#define RADIX_TREE_HEIGHT_SHIFT (RADIX_TREE_MAX_PATH + 1)
#define RADIX_TREE_HEIGHT_MASK ((1UL << RADIX_TREE_HEIGHT_SHIFT) - 1)

/*
 * Internally used bits of node->count.
 *
 * The low RADIX_TREE_COUNT_SHIFT bits (selected by
 * RADIX_TREE_COUNT_MASK) are the part of node->count reserved for
 * internal use; the bits above them are available to the tree user.
 */
#define RADIX_TREE_COUNT_SHIFT (RADIX_TREE_MAP_SHIFT + 1)
#define RADIX_TREE_COUNT_MASK ((1UL << RADIX_TREE_COUNT_SHIFT) - 1)

struct radix_tree_node {
unsigned int height; /* Height from the bottom */
unsigned int path; /* Offset in parent & height from the bottom */
unsigned int count;
union {
struct radix_tree_node *parent; /* Used when ascending tree */
struct rcu_head rcu_head; /* Used when freeing node */
struct {
/* Used when ascending tree */
struct radix_tree_node *parent;
/* For tree user */
void *private_data;
};
/* Used when freeing node */
struct rcu_head rcu_head;
};
/* For tree user */
struct list_head private_list;
void __rcu *slots[RADIX_TREE_MAP_SIZE];
unsigned long tags[RADIX_TREE_MAX_TAGS][RADIX_TREE_TAG_LONGS];
};

#define RADIX_TREE_INDEX_BITS (8 /* CHAR_BIT */ * sizeof(unsigned long))
#define RADIX_TREE_MAX_PATH (DIV_ROUND_UP(RADIX_TREE_INDEX_BITS, \
RADIX_TREE_MAP_SHIFT))

/* root tags are stored in gfp_mask, shifted by __GFP_BITS_SHIFT */
struct radix_tree_root {
unsigned int height;
Expand Down Expand Up @@ -251,7 +267,7 @@ void *__radix_tree_lookup(struct radix_tree_root *root, unsigned long index,
struct radix_tree_node **nodep, void ***slotp);
void *radix_tree_lookup(struct radix_tree_root *, unsigned long);
void **radix_tree_lookup_slot(struct radix_tree_root *, unsigned long);
bool __radix_tree_delete_node(struct radix_tree_root *root, unsigned long index,
bool __radix_tree_delete_node(struct radix_tree_root *root,
struct radix_tree_node *node);
void *radix_tree_delete_item(struct radix_tree_root *, unsigned long, void *);
void *radix_tree_delete(struct radix_tree_root *, unsigned long);
Expand Down
31 changes: 31 additions & 0 deletions include/linux/swap.h
Original file line number Diff line number Diff line change
Expand Up @@ -264,6 +264,37 @@ struct swap_list_t {
void *workingset_eviction(struct address_space *mapping, struct page *page);
bool workingset_refault(void *shadow);
void workingset_activation(struct page *page);
extern struct list_lru workingset_shadow_nodes;

/*
 * Return the page counter of @node: the value held in the low
 * RADIX_TREE_COUNT_SHIFT bits of node->count, with the bits above
 * them masked off.
 */
static inline unsigned int workingset_node_pages(struct radix_tree_node *node)
{
return node->count & RADIX_TREE_COUNT_MASK;
}

/*
 * Add one to the page counter kept in the low bits of node->count.
 *
 * This is a plain increment of the whole word with no range check: a
 * carry out of the low RADIX_TREE_COUNT_SHIFT bits would spill into
 * the shadow counter stored above them.
 */
static inline void workingset_node_pages_inc(struct radix_tree_node *node)
{
node->count++;
}

/*
 * Subtract one from the page counter kept in the low bits of
 * node->count.
 *
 * This is a plain decrement of the whole word with no range check:
 * decrementing while the low RADIX_TREE_COUNT_SHIFT bits are zero
 * would borrow from the shadow counter stored above them.
 */
static inline void workingset_node_pages_dec(struct radix_tree_node *node)
{
node->count--;
}

/*
 * Return the shadow entry counter of @node: the bits of node->count
 * above the low RADIX_TREE_COUNT_SHIFT bits, shifted down.
 */
static inline unsigned int workingset_node_shadows(struct radix_tree_node *node)
{
return node->count >> RADIX_TREE_COUNT_SHIFT;
}

/*
 * Add one to the shadow entry counter of @node.
 *
 * The counter lives above bit RADIX_TREE_COUNT_SHIFT of node->count,
 * so one unit is 1U << RADIX_TREE_COUNT_SHIFT; the low bits holding
 * the page counter are not modified.
 */
static inline void workingset_node_shadows_inc(struct radix_tree_node *node)
{
node->count += 1U << RADIX_TREE_COUNT_SHIFT;
}

/*
 * Subtract one from the shadow entry counter of @node.
 *
 * The counter lives above bit RADIX_TREE_COUNT_SHIFT of node->count,
 * so one unit is 1U << RADIX_TREE_COUNT_SHIFT; the low bits holding
 * the page counter are not modified.  There is no check that the
 * shadow counter is non-zero before subtracting.
 */
static inline void workingset_node_shadows_dec(struct radix_tree_node *node)
{
node->count -= 1U << RADIX_TREE_COUNT_SHIFT;
}

/* linux/mm/page_alloc.c */
extern unsigned long totalram_pages;
Expand Down
36 changes: 22 additions & 14 deletions lib/radix-tree.c
Original file line number Diff line number Diff line change
Expand Up @@ -342,7 +342,8 @@ static int radix_tree_extend(struct radix_tree_root *root, unsigned long index)

/* Increase the height. */
newheight = root->height+1;
node->height = newheight;
BUG_ON(newheight & ~RADIX_TREE_HEIGHT_MASK);
node->path = newheight;
node->count = 1;
node->parent = NULL;
slot = root->rnode;
Expand Down Expand Up @@ -400,11 +401,12 @@ int __radix_tree_create(struct radix_tree_root *root, unsigned long index,
/* Have to add a child node. */
if (!(slot = radix_tree_node_alloc(root)))
return -ENOMEM;
slot->height = height;
slot->path = height;
slot->parent = node;
if (node) {
rcu_assign_pointer(node->slots[offset], slot);
node->count++;
slot->path |= offset << RADIX_TREE_HEIGHT_SHIFT;
} else
rcu_assign_pointer(root->rnode, ptr_to_indirect(slot));
}
Expand Down Expand Up @@ -498,7 +500,7 @@ void *__radix_tree_lookup(struct radix_tree_root *root, unsigned long index,
}
node = indirect_to_ptr(node);

height = node->height;
height = node->path & RADIX_TREE_HEIGHT_MASK;
if (index > radix_tree_maxindex(height))
return NULL;

Expand Down Expand Up @@ -704,7 +706,7 @@ int radix_tree_tag_get(struct radix_tree_root *root,
return (index == 0);
node = indirect_to_ptr(node);

height = node->height;
height = node->path & RADIX_TREE_HEIGHT_MASK;
if (index > radix_tree_maxindex(height))
return 0;

Expand Down Expand Up @@ -741,7 +743,7 @@ void **radix_tree_next_chunk(struct radix_tree_root *root,
{
unsigned shift, tag = flags & RADIX_TREE_ITER_TAG_MASK;
struct radix_tree_node *rnode, *node;
unsigned long index, offset;
unsigned long index, offset, height;

if ((flags & RADIX_TREE_ITER_TAGGED) && !root_tag_get(root, tag))
return NULL;
Expand Down Expand Up @@ -772,7 +774,8 @@ void **radix_tree_next_chunk(struct radix_tree_root *root,
return NULL;

restart:
shift = (rnode->height - 1) * RADIX_TREE_MAP_SHIFT;
height = rnode->path & RADIX_TREE_HEIGHT_MASK;
shift = (height - 1) * RADIX_TREE_MAP_SHIFT;
offset = index >> shift;

/* Index outside of the tree */
Expand Down Expand Up @@ -1142,7 +1145,7 @@ static unsigned long __locate(struct radix_tree_node *slot, void *item,
unsigned int shift, height;
unsigned long i;

height = slot->height;
height = slot->path & RADIX_TREE_HEIGHT_MASK;
shift = (height-1) * RADIX_TREE_MAP_SHIFT;

for ( ; height > 1; height--) {
Expand Down Expand Up @@ -1205,7 +1208,8 @@ unsigned long radix_tree_locate_item(struct radix_tree_root *root, void *item)
}

node = indirect_to_ptr(node);
max_index = radix_tree_maxindex(node->height);
max_index = radix_tree_maxindex(node->path &
RADIX_TREE_HEIGHT_MASK);
if (cur_index > max_index) {
rcu_read_unlock();
break;
Expand Down Expand Up @@ -1301,7 +1305,7 @@ static inline void radix_tree_shrink(struct radix_tree_root *root)
*
* Returns %true if @node was freed, %false otherwise.
*/
bool __radix_tree_delete_node(struct radix_tree_root *root, unsigned long index,
bool __radix_tree_delete_node(struct radix_tree_root *root,
struct radix_tree_node *node)
{
bool deleted = false;
Expand All @@ -1320,9 +1324,10 @@ bool __radix_tree_delete_node(struct radix_tree_root *root, unsigned long index,

parent = node->parent;
if (parent) {
index >>= RADIX_TREE_MAP_SHIFT;
unsigned int offset;

parent->slots[index & RADIX_TREE_MAP_MASK] = NULL;
offset = node->path >> RADIX_TREE_HEIGHT_SHIFT;
parent->slots[offset] = NULL;
parent->count--;
} else {
root_tag_clear_all(root);
Expand Down Expand Up @@ -1386,7 +1391,7 @@ void *radix_tree_delete_item(struct radix_tree_root *root,
node->slots[offset] = NULL;
node->count--;

__radix_tree_delete_node(root, index, node);
__radix_tree_delete_node(root, node);

return entry;
}
Expand Down Expand Up @@ -1419,9 +1424,12 @@ int radix_tree_tagged(struct radix_tree_root *root, unsigned int tag)
EXPORT_SYMBOL(radix_tree_tagged);

static void
radix_tree_node_ctor(void *node)
radix_tree_node_ctor(void *arg)
{
memset(node, 0, sizeof(struct radix_tree_node));
struct radix_tree_node *node = arg;

memset(node, 0, sizeof(*node));
INIT_LIST_HEAD(&node->private_list);
}

static __init unsigned long __maxindex(unsigned int height)
Expand Down
90 changes: 74 additions & 16 deletions mm/filemap.c
Original file line number Diff line number Diff line change
Expand Up @@ -110,11 +110,17 @@
static void page_cache_tree_delete(struct address_space *mapping,
struct page *page, void *shadow)
{
if (shadow) {
void **slot;
struct radix_tree_node *node;
unsigned long index;
unsigned int offset;
unsigned int tag;
void **slot;

slot = radix_tree_lookup_slot(&mapping->page_tree, page->index);
radix_tree_replace_slot(slot, shadow);
VM_BUG_ON(!PageLocked(page));

__radix_tree_lookup(&mapping->page_tree, page->index, &node, &slot);

if (shadow) {
mapping->nrshadows++;
/*
* Make sure the nrshadows update is committed before
Expand All @@ -123,9 +129,45 @@ static void page_cache_tree_delete(struct address_space *mapping,
* same time and miss a shadow entry.
*/
smp_wmb();
} else
radix_tree_delete(&mapping->page_tree, page->index);
}
mapping->nrpages--;

if (!node) {
/* Clear direct pointer tags in root node */
mapping->page_tree.gfp_mask &= __GFP_BITS_MASK;
radix_tree_replace_slot(slot, shadow);
return;
}

/* Clear tree tags for the removed page */
index = page->index;
offset = index & RADIX_TREE_MAP_MASK;
for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) {
if (test_bit(offset, node->tags[tag]))
radix_tree_tag_clear(&mapping->page_tree, index, tag);
}

/* Delete page, swap shadow entry */
radix_tree_replace_slot(slot, shadow);
workingset_node_pages_dec(node);
if (shadow)
workingset_node_shadows_inc(node);
else
if (__radix_tree_delete_node(&mapping->page_tree, node))
return;

/*
* Track node that only contains shadow entries.
*
* Avoid acquiring the list_lru lock if already tracked. The
* list_empty() test is safe as node->private_list is
* protected by mapping->tree_lock.
*/
if (!workingset_node_pages(node) &&
list_empty(&node->private_list)) {
node->private_data = mapping;
list_lru_add(&workingset_shadow_nodes, &node->private_list);
}
}

/*
Expand Down Expand Up @@ -471,27 +513,43 @@ EXPORT_SYMBOL_GPL(replace_page_cache_page);
static int page_cache_tree_insert(struct address_space *mapping,
struct page *page, void **shadowp)
{
struct radix_tree_node *node;
void **slot;
int error;

slot = radix_tree_lookup_slot(&mapping->page_tree, page->index);
if (slot) {
error = __radix_tree_create(&mapping->page_tree, page->index,
&node, &slot);
if (error)
return error;
if (*slot) {
void *p;

p = radix_tree_deref_slot_protected(slot, &mapping->tree_lock);
if (!radix_tree_exceptional_entry(p))
return -EEXIST;
radix_tree_replace_slot(slot, page);
mapping->nrshadows--;
mapping->nrpages++;
if (shadowp)
*shadowp = p;
return 0;
mapping->nrshadows--;
if (node)
workingset_node_shadows_dec(node);
}
error = radix_tree_insert(&mapping->page_tree, page->index, page);
if (!error)
mapping->nrpages++;
return error;
radix_tree_replace_slot(slot, page);
mapping->nrpages++;
if (node) {
workingset_node_pages_inc(node);
/*
* Don't track node that contains actual pages.
*
* Avoid acquiring the list_lru lock if already
* untracked. The list_empty() test is safe as
* node->private_list is protected by
* mapping->tree_lock.
*/
if (!list_empty(&node->private_list))
list_lru_del(&workingset_shadow_nodes,
&node->private_list);
}
return 0;
}

static int __add_to_page_cache_locked(struct page *page,
Expand Down
Loading

0 comments on commit 449dd69

Please sign in to comment.