Skip to content
This repository has been archived by the owner on Feb 26, 2020. It is now read-only.

Commit

Permalink
Merge branch 'kmem-cache-optimization'
Browse files Browse the repository at this point in the history
This branch contains kmem cache optimizations designed to resolve
the lockups reported in openzfs/zfs#922.  The lockups were
largely the result of spin lock contention in the slab under low
memory conditions.  Fundamentally, these changes are all designed
to minimize that contention through a variety of methods.

  * Improved vmem cached deadlock detection
  * Track emergency objects in rbtree
  * Optimize spl_kmem_cache_free()
  * Never spin in kmem_cache_alloc()

Signed-off-by: Brian Behlendorf <[email protected]>
openzfs/zfs#922
  • Loading branch information
behlendorf committed Nov 8, 2012
2 parents 65c2fc5 + dc1b302 commit 366346c
Show file tree
Hide file tree
Showing 3 changed files with 131 additions and 50 deletions.
8 changes: 6 additions & 2 deletions include/sys/kmem.h
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
#include <linux/spinlock.h>
#include <linux/rwsem.h>
#include <linux/hash.h>
#include <linux/rbtree.h>
#include <linux/ctype.h>
#include <asm/atomic.h>
#include <sys/types.h>
Expand Down Expand Up @@ -340,6 +341,7 @@ enum {
KMC_BIT_VMEM = 6, /* Use vmem cache */
KMC_BIT_OFFSLAB = 7, /* Objects not on slab */
KMC_BIT_NOEMERGENCY = 8, /* Disable emergency objects */
KMC_BIT_DEADLOCKED = 14, /* Deadlock detected */
KMC_BIT_GROWING = 15, /* Growing in progress */
KMC_BIT_REAPING = 16, /* Reaping in progress */
KMC_BIT_DESTROY = 17, /* Destroy in progress */
Expand All @@ -366,6 +368,7 @@ typedef enum kmem_cbrc {
#define KMC_VMEM (1 << KMC_BIT_VMEM)
#define KMC_OFFSLAB (1 << KMC_BIT_OFFSLAB)
#define KMC_NOEMERGENCY (1 << KMC_BIT_NOEMERGENCY)
#define KMC_DEADLOCKED (1 << KMC_BIT_DEADLOCKED)
#define KMC_GROWING (1 << KMC_BIT_GROWING)
#define KMC_REAPING (1 << KMC_BIT_REAPING)
#define KMC_DESTROY (1 << KMC_BIT_DESTROY)
Expand Down Expand Up @@ -433,8 +436,8 @@ typedef struct spl_kmem_alloc {
} spl_kmem_alloc_t;

typedef struct spl_kmem_emergency {
struct rb_node ske_node; /* Emergency tree linkage */
void *ske_obj; /* Buffer address */
struct list_head ske_list; /* Emergency list linkage */
} spl_kmem_emergency_t;

typedef struct spl_kmem_cache {
Expand All @@ -461,7 +464,7 @@ typedef struct spl_kmem_cache {
struct list_head skc_list; /* List of caches linkage */
struct list_head skc_complete_list;/* Completely alloc'ed */
struct list_head skc_partial_list; /* Partially alloc'ed */
struct list_head skc_emergency_list; /* Min sized objects */
struct rb_root skc_emergency_tree; /* Min sized objects */
spinlock_t skc_lock; /* Cache lock */
wait_queue_head_t skc_waitq; /* Allocation waiters */
uint64_t skc_slab_fail; /* Slab alloc failures */
Expand All @@ -473,6 +476,7 @@ typedef struct spl_kmem_cache {
uint64_t skc_obj_total; /* Obj total current */
uint64_t skc_obj_alloc; /* Obj alloc current */
uint64_t skc_obj_max; /* Obj max historic */
uint64_t skc_obj_deadlock; /* Obj emergency deadlocks */
uint64_t skc_obj_emergency; /* Obj emergency current */
uint64_t skc_obj_emergency_max; /* Obj emergency max */
} spl_kmem_cache_t;
Expand Down
164 changes: 119 additions & 45 deletions module/spl/spl-kmem.c
Original file line number Diff line number Diff line change
Expand Up @@ -1116,8 +1116,54 @@ spl_slab_reclaim(spl_kmem_cache_t *skc, int count, int flag)
SEXIT;
}

/*
 * Look up the tracking entry for an emergency object.
 *
 * The tree rooted at 'root' is keyed on the numeric value of each entry's
 * ske_obj pointer; smaller addresses are found to the left, larger ones to
 * the right.  Returns the entry whose ske_obj equals 'obj', or NULL when
 * no such entry is present in the tree.
 */
static spl_kmem_emergency_t *
spl_emergency_search(struct rb_root *root, void *obj)
{
	const unsigned long key = (unsigned long)obj;
	struct rb_node *cur = root->rb_node;

	while (cur != NULL) {
		spl_kmem_emergency_t *entry =
		    container_of(cur, spl_kmem_emergency_t, ske_node);
		unsigned long entry_key = (unsigned long)entry->ske_obj;

		if (key == entry_key)
			return entry;

		/* Descend toward the side of the tree that can hold 'key'. */
		cur = (key < entry_key) ? cur->rb_left : cur->rb_right;
	}

	return NULL;
}

/*
 * Add an emergency object tracking entry to the tree rooted at 'root'.
 *
 * Entries are ordered by the numeric value of their ske_obj pointer.
 * Returns 1 once 'ske' has been linked and the tree rebalanced, or 0 if
 * an entry with the same ske_obj address is already present, in which
 * case the tree is left untouched.
 */
static int
spl_emergency_insert(struct rb_root *root, spl_kmem_emergency_t *ske)
{
	const unsigned long key = (unsigned long)ske->ske_obj;
	struct rb_node **link = &root->rb_node;
	struct rb_node *parent = NULL;

	/* Walk down to the empty slot where the new node belongs. */
	while (*link != NULL) {
		spl_kmem_emergency_t *entry =
		    container_of(*link, spl_kmem_emergency_t, ske_node);
		unsigned long entry_key = (unsigned long)entry->ske_obj;

		if (key == entry_key)
			return 0;

		parent = *link;
		if (key < entry_key)
			link = &parent->rb_left;
		else
			link = &parent->rb_right;
	}

	/* Attach the node at the located slot, then recolor/rebalance. */
	rb_link_node(&ske->ske_node, parent, link);
	rb_insert_color(&ske->ske_node, root);

	return 1;
}

/*
* Allocate a single emergency object for use by the caller.
* Allocate a single emergency object and track it in a red black tree.
*/
static int
spl_emergency_alloc(spl_kmem_cache_t *skc, int flags, void **obj)
Expand All @@ -1143,48 +1189,49 @@ spl_emergency_alloc(spl_kmem_cache_t *skc, int flags, void **obj)
SRETURN(-ENOMEM);
}

if (skc->skc_ctor)
skc->skc_ctor(ske->ske_obj, skc->skc_private, flags);

spin_lock(&skc->skc_lock);
skc->skc_obj_total++;
skc->skc_obj_emergency++;
if (skc->skc_obj_emergency > skc->skc_obj_emergency_max)
skc->skc_obj_emergency_max = skc->skc_obj_emergency;

list_add(&ske->ske_list, &skc->skc_emergency_list);
empty = spl_emergency_insert(&skc->skc_emergency_tree, ske);
if (likely(empty)) {
skc->skc_obj_total++;
skc->skc_obj_emergency++;
if (skc->skc_obj_emergency > skc->skc_obj_emergency_max)
skc->skc_obj_emergency_max = skc->skc_obj_emergency;
}
spin_unlock(&skc->skc_lock);

if (unlikely(!empty)) {
kfree(ske->ske_obj);
kfree(ske);
SRETURN(-EINVAL);
}

if (skc->skc_ctor)
skc->skc_ctor(ske->ske_obj, skc->skc_private, flags);

*obj = ske->ske_obj;

SRETURN(0);
}

/*
* Free the passed object if it is an emergency object or a normal slab
* object. Currently this is done by walking what should be a short list of
* emergency objects. If this proves to be too inefficient we can replace
* the simple list with a hash.
* Locate the passed object in the red black tree and free it.
*/
static int
spl_emergency_free(spl_kmem_cache_t *skc, void *obj)
{
spl_kmem_emergency_t *m, *n, *ske = NULL;
spl_kmem_emergency_t *ske;
SENTRY;

spin_lock(&skc->skc_lock);
list_for_each_entry_safe(m, n, &skc->skc_emergency_list, ske_list) {
if (m->ske_obj == obj) {
list_del(&m->ske_list);
skc->skc_obj_emergency--;
skc->skc_obj_total--;
ske = m;
break;
}
ske = spl_emergency_search(&skc->skc_emergency_tree, obj);
if (likely(ske)) {
rb_erase(&ske->ske_node, &skc->skc_emergency_tree);
skc->skc_obj_emergency--;
skc->skc_obj_total--;
}
spin_unlock(&skc->skc_lock);

if (ske == NULL)
if (unlikely(ske == NULL))
SRETURN(-ENOENT);

if (skc->skc_dtor)
Expand Down Expand Up @@ -1483,7 +1530,7 @@ spl_kmem_cache_create(char *name, size_t size, size_t align,
INIT_LIST_HEAD(&skc->skc_list);
INIT_LIST_HEAD(&skc->skc_complete_list);
INIT_LIST_HEAD(&skc->skc_partial_list);
INIT_LIST_HEAD(&skc->skc_emergency_list);
skc->skc_emergency_tree = RB_ROOT;
spin_lock_init(&skc->skc_lock);
init_waitqueue_head(&skc->skc_waitq);
skc->skc_slab_fail = 0;
Expand All @@ -1495,6 +1542,7 @@ spl_kmem_cache_create(char *name, size_t size, size_t align,
skc->skc_obj_total = 0;
skc->skc_obj_alloc = 0;
skc->skc_obj_max = 0;
skc->skc_obj_deadlock = 0;
skc->skc_obj_emergency = 0;
skc->skc_obj_emergency_max = 0;

Expand Down Expand Up @@ -1589,7 +1637,6 @@ spl_kmem_cache_destroy(spl_kmem_cache_t *skc)
ASSERT3U(skc->skc_obj_total, ==, 0);
ASSERT3U(skc->skc_obj_emergency, ==, 0);
ASSERT(list_empty(&skc->skc_complete_list));
ASSERT(list_empty(&skc->skc_emergency_list));

kmem_free(skc->skc_name, skc->skc_name_size);
spin_unlock(&skc->skc_lock);
Expand Down Expand Up @@ -1662,6 +1709,7 @@ spl_cache_grow_work(void *data)

atomic_dec(&skc->skc_ref);
clear_bit(KMC_BIT_GROWING, &skc->skc_flags);
clear_bit(KMC_BIT_DEADLOCKED, &skc->skc_flags);
wake_up_all(&skc->skc_waitq);
spin_unlock(&skc->skc_lock);

Expand All @@ -1677,26 +1725,35 @@ spl_cache_grow_wait(spl_kmem_cache_t *skc)
return !test_bit(KMC_BIT_GROWING, &skc->skc_flags);
}

/*
 * Action callback passed to wait_on_bit() by spl_cache_grow() while it
 * waits for KMC_BIT_REAPING to clear.  It gives up the CPU via schedule()
 * and always returns 0.  The 'word' argument is not used.
 */
static int
spl_cache_reclaim_wait(void *word)
{
	schedule();

	return 0;
}

/*
* No available objects on any slabs, create a new slab.
*/
static int
spl_cache_grow(spl_kmem_cache_t *skc, int flags, void **obj)
{
int remaining, rc = 0;
int remaining, rc;
SENTRY;

ASSERT(skc->skc_magic == SKC_MAGIC);
might_sleep();
*obj = NULL;

/*
* Before allocating a new slab check if the slab is being reaped.
* If it is there is a good chance we can wait until it finishes
* and then use one of the newly freed but not aged-out slabs.
* Before allocating a new slab wait for any reaping to complete and
* then return so the local magazine can be rechecked for new objects.
*/
if (test_bit(KMC_BIT_REAPING, &skc->skc_flags))
SRETURN(-EAGAIN);
if (test_bit(KMC_BIT_REAPING, &skc->skc_flags)) {
rc = wait_on_bit(&skc->skc_flags, KMC_BIT_REAPING,
spl_cache_reclaim_wait, TASK_UNINTERRUPTIBLE);
SRETURN(rc ? rc : -EAGAIN);
}

/*
* This is handled by dispatching a work request to the global work
Expand All @@ -1722,17 +1779,30 @@ spl_cache_grow(spl_kmem_cache_t *skc, int flags, void **obj)
}

/*
* Allow a single timer tick before falling back to synchronously
* allocating the minimum amount of memory required by the caller.
* The goal here is to only detect the rare case where a virtual slab
* allocation has deadlocked. We must be careful to minimize the use
* of emergency objects which are more expensive to track. Therefore,
* we set a very long timeout for the asynchronous allocation and if
* the timeout is reached the cache is flagged as deadlocked. From
* this point only new emergency objects will be allocated until the
* asynchronous allocation completes and clears the deadlocked flag.
*/
remaining = wait_event_timeout(skc->skc_waitq,
spl_cache_grow_wait(skc), 1);
if (test_bit(KMC_BIT_DEADLOCKED, &skc->skc_flags)) {
rc = spl_emergency_alloc(skc, flags, obj);
} else {
remaining = wait_event_timeout(skc->skc_waitq,
spl_cache_grow_wait(skc), HZ);

if (remaining == 0) {
if (test_bit(KMC_BIT_NOEMERGENCY, &skc->skc_flags))
rc = -ENOMEM;
else
rc = spl_emergency_alloc(skc, flags, obj);
if (!remaining && test_bit(KMC_BIT_VMEM, &skc->skc_flags)) {
spin_lock(&skc->skc_lock);
if (test_bit(KMC_BIT_GROWING, &skc->skc_flags)) {
set_bit(KMC_BIT_DEADLOCKED, &skc->skc_flags);
skc->skc_obj_deadlock++;
}
spin_unlock(&skc->skc_lock);
}

rc = -ENOMEM;
}

SRETURN(rc);
Expand Down Expand Up @@ -1962,11 +2032,12 @@ spl_kmem_cache_free(spl_kmem_cache_t *skc, void *obj)
atomic_inc(&skc->skc_ref);

/*
* Emergency objects are never part of the virtual address space
* so if we get a virtual address we can optimize this check out.
* Only virtual slabs may have emergency objects and these objects
* are guaranteed to have physical addresses. They must be removed
* from the tree of emergency objects and then freed.
*/
if (!kmem_virt(obj) && !spl_emergency_free(skc, obj))
SGOTO(out, 0);
if ((skc->skc_flags & KMC_VMEM) && !kmem_virt(obj))
SGOTO(out, spl_emergency_free(skc, obj));

local_irq_save(flags);

Expand Down Expand Up @@ -2094,6 +2165,9 @@ spl_kmem_cache_reap_now(spl_kmem_cache_t *skc, int count)
/* Reclaim from the cache, ignoring its age and delay. */
spl_slab_reclaim(skc, count, 1);
clear_bit(KMC_BIT_REAPING, &skc->skc_flags);
smp_mb__after_clear_bit();
wake_up_bit(&skc->skc_flags, KMC_BIT_REAPING);

atomic_dec(&skc->skc_ref);

SEXIT;
Expand Down
9 changes: 6 additions & 3 deletions module/spl/spl-proc.c
Original file line number Diff line number Diff line change
Expand Up @@ -625,12 +625,14 @@ slab_seq_show_headers(struct seq_file *f)
"--------------------- cache ----------"
"--------------------------------------------- "
"----- slab ------ "
"---- object -----------------\n");
"---- object ----- "
"--- emergency ---\n");
seq_printf(f,
"name "
" flags size alloc slabsize objsize "
"total alloc max "
"total alloc max emerg max\n");
"total alloc max "
"dlock alloc max\n");
}

static int
Expand All @@ -643,7 +645,7 @@ slab_seq_show(struct seq_file *f, void *p)
spin_lock(&skc->skc_lock);
seq_printf(f, "%-36s ", skc->skc_name);
seq_printf(f, "0x%05lx %9lu %9lu %8u %8u "
"%5lu %5lu %5lu %5lu %5lu %5lu %5lu %5lu\n",
"%5lu %5lu %5lu %5lu %5lu %5lu %5lu %5lu %5lu\n",
(long unsigned)skc->skc_flags,
(long unsigned)(skc->skc_slab_size * skc->skc_slab_total),
(long unsigned)(skc->skc_obj_size * skc->skc_obj_alloc),
Expand All @@ -655,6 +657,7 @@ slab_seq_show(struct seq_file *f, void *p)
(long unsigned)skc->skc_obj_total,
(long unsigned)skc->skc_obj_alloc,
(long unsigned)skc->skc_obj_max,
(long unsigned)skc->skc_obj_deadlock,
(long unsigned)skc->skc_obj_emergency,
(long unsigned)skc->skc_obj_emergency_max);

Expand Down

0 comments on commit 366346c

Please sign in to comment.