Skip to content

Commit

Permalink
slab: remove synchronous synchronize_sched() from memcg cache deactiv…
Browse files Browse the repository at this point in the history
…ation path

With kmem cgroup support enabled, kmem_caches can be created and
destroyed frequently and a great number of near empty kmem_caches can
accumulate if there are a lot of transient cgroups and the system is not
under memory pressure.  When memory reclaim starts under such
conditions, it can lead to consecutive deactivation and destruction of
many kmem_caches, easily hundreds of thousands on moderately large
systems, exposing scalability issues in the current slab management
code.  This is one of the patches to address the issue.

slub uses synchronize_sched() to deactivate a memcg cache.
synchronize_sched() is an expensive and slow operation and doesn't scale
when a huge number of caches are destroyed back-to-back.  While there
used to be a simple batching mechanism, the batching was too restricted
to be helpful.

This patch implements slab_deactivate_memcg_cache_rcu_sched(), which slub
can use to schedule a sched RCU callback instead of calling
synchronize_sched() synchronously while holding cgroup_mutex.  While
this adds online cpus, mems and slab_mutex operations, taking these
locks back-to-back from the same kworker, which is what happens when
there are many caches to deactivate, isn't expensive at all, and
this gets rid of the scalability problem completely.

Link: http://lkml.kernel.org/r/[email protected]
Signed-off-by: Tejun Heo <[email protected]>
Reported-by: Jay Vana <[email protected]>
Acked-by: Vladimir Davydov <[email protected]>
Cc: Christoph Lameter <[email protected]>
Cc: Pekka Enberg <[email protected]>
Cc: David Rientjes <[email protected]>
Cc: Joonsoo Kim <[email protected]>
Signed-off-by: Andrew Morton <[email protected]>
Signed-off-by: Linus Torvalds <[email protected]>
  • Loading branch information
htejun authored and torvalds committed Feb 23, 2017
1 parent c9fc586 commit 01fb58b
Show file tree
Hide file tree
Showing 4 changed files with 76 additions and 4 deletions.
6 changes: 6 additions & 0 deletions include/linux/slab.h
Original file line number Diff line number Diff line change
Expand Up @@ -582,6 +582,12 @@ struct memcg_cache_params {
struct mem_cgroup *memcg;
struct list_head children_node;
struct list_head kmem_caches_node;

void (*deact_fn)(struct kmem_cache *);
union {
struct rcu_head deact_rcu_head;
struct work_struct deact_work;
};
};
};
};
Expand Down
2 changes: 2 additions & 0 deletions mm/slab.h
Original file line number Diff line number Diff line change
Expand Up @@ -307,6 +307,8 @@ static __always_inline void memcg_uncharge_slab(struct page *page, int order,

extern void slab_init_memcg_params(struct kmem_cache *);
extern void memcg_link_cache(struct kmem_cache *s);
/* Run @deact_fn on @s after a sched RCU grace period (mm/slab_common.c). */
extern void slab_deactivate_memcg_cache_rcu_sched(struct kmem_cache *s,
void (*deact_fn)(struct kmem_cache *));

#else /* CONFIG_MEMCG && !CONFIG_SLOB */

Expand Down
60 changes: 60 additions & 0 deletions mm/slab_common.c
Original file line number Diff line number Diff line change
Expand Up @@ -627,6 +627,66 @@ void memcg_create_kmem_cache(struct mem_cgroup *memcg,
put_online_cpus();
}

/*
 * Work item handler that carries out the deferred deactivation of a memcg
 * cache.  It is queued by kmemcg_deactivate_rcufn().
 *
 * The cache's ->deact_fn is invoked with online cpus, online mems and
 * slab_mutex held.  Afterwards the css reference taken by
 * slab_deactivate_memcg_cache_rcu_sched() is dropped.
 */
static void kmemcg_deactivate_workfn(struct work_struct *work)
{
	struct kmem_cache *cachep;

	cachep = container_of(work, struct kmem_cache, memcg_params.deact_work);

	/* taken in this order, released in the reverse order below */
	get_online_cpus();
	get_online_mems();
	mutex_lock(&slab_mutex);

	cachep->memcg_params.deact_fn(cachep);

	mutex_unlock(&slab_mutex);
	put_online_mems();
	put_online_cpus();

	/*
	 * Deactivation is finished; release the memcg reference that
	 * slab_deactivate_memcg_cache_rcu_sched() took.
	 */
	css_put(&cachep->memcg_params.memcg->css);
}

/*
 * Sched RCU callback installed by slab_deactivate_memcg_cache_rcu_sched().
 * Recovers the kmem_cache from its embedded RCU head and queues
 * kmemcg_deactivate_workfn() to carry out the actual deactivation.
 */
static void kmemcg_deactivate_rcufn(struct rcu_head *head)
{
struct kmem_cache *s = container_of(head, struct kmem_cache,
memcg_params.deact_rcu_head);

/*
* We need to grab blocking locks. Bounce to ->deact_work. The
* work item shares the space with the RCU head and can't be
* initialized earlier.
*/
INIT_WORK(&s->memcg_params.deact_work, kmemcg_deactivate_workfn);
schedule_work(&s->memcg_params.deact_work);
}

/**
 * slab_deactivate_memcg_cache_rcu_sched - defer cache deactivation until
 *					    a sched RCU grace period has passed
 * @s: memcg kmem_cache to deactivate; must not be a root cache
 * @deact_fn: callback that performs the deactivation
 *
 * Arrange for @deact_fn to be called on @s once a sched RCU grace period
 * has elapsed.  The callback runs with online cpus, online mems and
 * slab_mutex held.  A reference on the owning memcg is held until
 * @deact_fn has returned so that @s is not destroyed in the meantime.
 *
 * Warns once and does nothing if @s is a root cache or if a deactivation
 * function has already been set on @s.
 *
 * Intended to be called from __kmemcg_cache_deactivate().
 */
void slab_deactivate_memcg_cache_rcu_sched(struct kmem_cache *s,
					   void (*deact_fn)(struct kmem_cache *))
{
	/* only per-memcg caches can be deactivated */
	if (WARN_ON_ONCE(is_root_cache(s)))
		return;

	/* a deactivation function is already set on this cache */
	if (WARN_ON_ONCE(s->memcg_params.deact_fn))
		return;

	/* hold the memcg so that @s stays around until the callback is done */
	css_get(&s->memcg_params.memcg->css);

	s->memcg_params.deact_fn = deact_fn;
	call_rcu_sched(&s->memcg_params.deact_rcu_head,
		       kmemcg_deactivate_rcufn);
}

void memcg_deactivate_kmem_caches(struct mem_cgroup *memcg)
{
int idx;
Expand Down
12 changes: 8 additions & 4 deletions mm/slub.c
Original file line number Diff line number Diff line change
Expand Up @@ -3957,6 +3957,12 @@ int __kmem_cache_shrink(struct kmem_cache *s)
}

#ifdef CONFIG_MEMCG
/*
 * Deactivation callback that __kmemcg_cache_deactivate() hands to
 * slab_deactivate_memcg_cache_rcu_sched(); shrinks @s via
 * __kmem_cache_shrink().
 */
static void kmemcg_cache_deact_after_rcu(struct kmem_cache *s)
{
/* called with all the locks held after a sched RCU grace period */
__kmem_cache_shrink(s);
}

void __kmemcg_cache_deactivate(struct kmem_cache *s)
{
/*
Expand All @@ -3968,11 +3974,9 @@ void __kmemcg_cache_deactivate(struct kmem_cache *s)

/*
* s->cpu_partial is checked locklessly (see put_cpu_partial), so
* we have to make sure the change is visible.
* we have to make sure the change is visible before shrinking.
*/
synchronize_sched();

__kmem_cache_shrink(s);
slab_deactivate_memcg_cache_rcu_sched(s, kmemcg_cache_deact_after_rcu);
}
#endif

Expand Down

0 comments on commit 01fb58b

Please sign in to comment.