This repository has been archived by the owner on Feb 26, 2020. It is now read-only.

Emergency slab objects #155

Closed
wants to merge 1 commit
17 changes: 17 additions & 0 deletions include/sys/kmem.h
@@ -291,6 +291,7 @@ enum {
KMC_BIT_KMEM = 5, /* Use kmem cache */
KMC_BIT_VMEM = 6, /* Use vmem cache */
KMC_BIT_OFFSLAB = 7, /* Objects not on slab */
KMC_BIT_GROWING = 15, /* Growing in progress */
KMC_BIT_REAPING = 16, /* Reaping in progress */
KMC_BIT_DESTROY = 17, /* Destroy in progress */
KMC_BIT_TOTAL = 18, /* Proc handler helper bit */
@@ -315,6 +316,7 @@ typedef enum kmem_cbrc {
#define KMC_KMEM (1 << KMC_BIT_KMEM)
#define KMC_VMEM (1 << KMC_BIT_VMEM)
#define KMC_OFFSLAB (1 << KMC_BIT_OFFSLAB)
#define KMC_GROWING (1 << KMC_BIT_GROWING)
#define KMC_REAPING (1 << KMC_BIT_REAPING)
#define KMC_DESTROY (1 << KMC_BIT_DESTROY)
#define KMC_TOTAL (1 << KMC_BIT_TOTAL)
@@ -373,6 +375,17 @@ typedef struct spl_kmem_slab {
uint32_t sks_ref; /* Ref count used objects */
} spl_kmem_slab_t;

typedef struct spl_kmem_alloc {
struct spl_kmem_cache *ska_cache; /* Owned by cache */
int ska_flags; /* Allocation flags */
struct delayed_work ska_work; /* Allocation work */
} spl_kmem_alloc_t;

typedef struct spl_kmem_emergency {
void *ske_obj; /* Buffer address */
struct list_head ske_list; /* Emergency list linkage */
} spl_kmem_emergency_t;

typedef struct spl_kmem_cache {
uint32_t skc_magic; /* Sanity magic */
uint32_t skc_name_size; /* Name length */
@@ -397,7 +410,9 @@ typedef struct spl_kmem_cache {
struct list_head skc_list; /* List of caches linkage */
struct list_head skc_complete_list;/* Completely alloc'ed */
struct list_head skc_partial_list; /* Partially alloc'ed */
struct list_head skc_emergency_list; /* Min sized objects */
spinlock_t skc_lock; /* Cache lock */
wait_queue_head_t skc_waitq; /* Allocation waiters */
uint64_t skc_slab_fail; /* Slab alloc failures */
uint64_t skc_slab_create;/* Slab creates */
uint64_t skc_slab_destroy;/* Slab destroys */
@@ -407,6 +422,8 @@ typedef struct spl_kmem_cache {
uint64_t skc_obj_total; /* Obj total current */
uint64_t skc_obj_alloc; /* Obj alloc current */
uint64_t skc_obj_max; /* Obj max historic */
uint64_t skc_obj_emergency; /* Obj emergency current */
uint64_t skc_obj_emergency_max; /* Obj emergency max */
} spl_kmem_cache_t;
#define kmem_cache_t spl_kmem_cache_t
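
The header additions above boil down to two pieces of bookkeeping: emergency objects are tracked on skc_emergency_list and counted in skc_obj_emergency / skc_obj_emergency_max, while KMC_BIT_GROWING, spl_kmem_alloc_t, and skc_waitq coordinate the asynchronous slab growth implemented in spl-kmem.c below. The following standalone sketch is not part of the patch; it is a simplified userspace model of just the emergency-object accounting. The names emergency_obj, cache_model, emergency_alloc(), and emergency_free() are invented stand-ins, and the kernel list, spinlock, and constructor/destructor handling are deliberately omitted.

#include <stdio.h>
#include <stdlib.h>

/* Stand-in for spl_kmem_emergency_t, minus the kernel list_head. */
struct emergency_obj {
	void *obj;                          /* buffer handed to the caller */
	struct emergency_obj *next;         /* emergency list linkage */
};

/* Stand-in for the new spl_kmem_cache_t fields. */
struct cache_model {
	struct emergency_obj *emergency_list;   /* skc_emergency_list */
	unsigned long obj_total;                /* skc_obj_total */
	unsigned long obj_emergency;            /* skc_obj_emergency */
	unsigned long obj_emergency_max;        /* skc_obj_emergency_max */
	size_t obj_size;                        /* skc_obj_size */
};

/* Mirrors spl_emergency_alloc(): allocate off-slab and track the object. */
static void *emergency_alloc(struct cache_model *c)
{
	struct emergency_obj *ske = malloc(sizeof(*ske));

	if (ske == NULL)
		return NULL;

	ske->obj = malloc(c->obj_size);
	if (ske->obj == NULL) {
		free(ske);
		return NULL;
	}

	ske->next = c->emergency_list;
	c->emergency_list = ske;
	c->obj_total++;
	if (++c->obj_emergency > c->obj_emergency_max)
		c->obj_emergency_max = c->obj_emergency;

	return ske->obj;
}

/* Mirrors spl_emergency_free(): walk the list, untrack, and free. */
static int emergency_free(struct cache_model *c, void *obj)
{
	struct emergency_obj **p;

	for (p = &c->emergency_list; *p != NULL; p = &(*p)->next) {
		if ((*p)->obj == obj) {
			struct emergency_obj *ske = *p;

			*p = ske->next;
			c->obj_emergency--;
			c->obj_total--;
			free(ske->obj);
			free(ske);
			return 0;
		}
	}

	return -1;                          /* not an emergency object */
}

int main(void)
{
	struct cache_model c = { .obj_size = 64 };
	void *obj = emergency_alloc(&c);

	printf("emergency=%lu max=%lu\n", c.obj_emergency, c.obj_emergency_max);
	emergency_free(&c, obj);
	printf("emergency=%lu total=%lu\n", c.obj_emergency, c.obj_total);

	return 0;
}

In the patch itself the list and counters are manipulated under skc_lock, and the skc_ctor / skc_dtor hooks are invoked on the buffer outside that lock.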

234 changes: 194 additions & 40 deletions module/spl/spl-kmem.c
@@ -1143,6 +1143,86 @@ spl_slab_reclaim(spl_kmem_cache_t *skc, int count, int flag)
SEXIT;
}

/*
* Allocate a single emergency object for use by the caller.
*/
static int
spl_emergency_alloc(spl_kmem_cache_t *skc, int flags, void **obj)
{
spl_kmem_emergency_t *ske;
int empty;
SENTRY;

/* Last chance: use a partial slab if one now exists */
spin_lock(&skc->skc_lock);
empty = list_empty(&skc->skc_partial_list);
spin_unlock(&skc->skc_lock);
if (!empty)
SRETURN(-EEXIST);

ske = kmalloc(sizeof(*ske), GFP_NOIO | __GFP_HIGH);
if (ske == NULL)
SRETURN(-ENOMEM);

ske->ske_obj = kmalloc(skc->skc_obj_size, GFP_NOIO | __GFP_HIGH);
if (ske->ske_obj == NULL) {
kfree(ske);
SRETURN(-ENOMEM);
}

if (skc->skc_ctor)
skc->skc_ctor(ske->ske_obj, skc->skc_private, flags);

spin_lock(&skc->skc_lock);
skc->skc_obj_total++;
skc->skc_obj_emergency++;
if (skc->skc_obj_emergency > skc->skc_obj_emergency_max)
skc->skc_obj_emergency_max = skc->skc_obj_emergency;

list_add(&ske->ske_list, &skc->skc_emergency_list);
spin_unlock(&skc->skc_lock);

*obj = ske->ske_obj;

SRETURN(0);
}

/*
* Free the passed object if it is an emergency object, otherwise return
* -ENOENT so the caller can free it as a normal slab object. Currently
* this is done by walking what should be a short list of emergency
* objects. If this proves to be too inefficient we can replace the
* simple list with a hash.
*/
static int
spl_emergency_free(spl_kmem_cache_t *skc, void *obj)
{
spl_kmem_emergency_t *m, *n, *ske = NULL;
SENTRY;

spin_lock(&skc->skc_lock);
list_for_each_entry_safe(m, n, &skc->skc_emergency_list, ske_list) {
if (m->ske_obj == obj) {
list_del(&m->ske_list);
skc->skc_obj_emergency--;
skc->skc_obj_total--;
ske = m;
break;
}
}
spin_unlock(&skc->skc_lock);

if (ske == NULL)
SRETURN(-ENOENT);

if (skc->skc_dtor)
skc->skc_dtor(ske->ske_obj, skc->skc_private);

kfree(ske->ske_obj);
kfree(ske);

SRETURN(0);
}

/*
* Called regularly on all caches to age objects out of the magazines
* which have not been accessed in skc->skc_delay seconds. This prevents
@@ -1430,7 +1510,9 @@ spl_kmem_cache_create(char *name, size_t size, size_t align,
INIT_LIST_HEAD(&skc->skc_list);
INIT_LIST_HEAD(&skc->skc_complete_list);
INIT_LIST_HEAD(&skc->skc_partial_list);
INIT_LIST_HEAD(&skc->skc_emergency_list);
spin_lock_init(&skc->skc_lock);
init_waitqueue_head(&skc->skc_waitq);
skc->skc_slab_fail = 0;
skc->skc_slab_create = 0;
skc->skc_slab_destroy = 0;
skc->skc_obj_total = 0;
skc->skc_obj_alloc = 0;
skc->skc_obj_max = 0;
skc->skc_obj_emergency = 0;
skc->skc_obj_emergency_max = 0;

if (align) {
VERIFY(ISP2(align));
@@ -1530,7 +1614,9 @@ spl_kmem_cache_destroy(spl_kmem_cache_t *skc)
ASSERT3U(skc->skc_obj_alloc, ==, 0);
ASSERT3U(skc->skc_slab_total, ==, 0);
ASSERT3U(skc->skc_obj_total, ==, 0);
ASSERT3U(skc->skc_obj_emergency, ==, 0);
ASSERT(list_empty(&skc->skc_complete_list));
ASSERT(list_empty(&skc->skc_emergency_list));

kmem_free(skc->skc_name, skc->skc_name_size);
spin_unlock(&skc->skc_lock);
@@ -1581,59 +1667,113 @@ spl_cache_obj(spl_kmem_cache_t *skc, spl_kmem_slab_t *sks)
}

/*
* No available objects on any slabs, create a new slab. Since this
* is an expensive operation we do it without holding the spin lock and
* only briefly acquire it when we link in the fully allocated and
* constructed slab.
* Generic slab allocation function to be run by the global work queues.
* It is responsible for allocating a new slab, linking it into the list
* of partial slabs, and then waking any waiters.
*/
static spl_kmem_slab_t *
spl_cache_grow(spl_kmem_cache_t *skc, int flags)
static void
spl_cache_grow_work(void *data)
{
spl_kmem_alloc_t *ska =
spl_get_work_data(data, spl_kmem_alloc_t, ska_work.work);
spl_kmem_cache_t *skc = ska->ska_cache;
spl_kmem_slab_t *sks;

sks = spl_slab_alloc(skc, ska->ska_flags | __GFP_NORETRY | KM_NODEBUG);
spin_lock(&skc->skc_lock);
if (sks) {
skc->skc_slab_total++;
skc->skc_obj_total += sks->sks_objs;
list_add_tail(&sks->sks_list, &skc->skc_partial_list);
}

atomic_dec(&skc->skc_ref);
clear_bit(KMC_BIT_GROWING, &skc->skc_flags);
wake_up_all(&skc->skc_waitq);
spin_unlock(&skc->skc_lock);

kfree(ska);
}

/*
* Returns non-zero when a new slab should be available.
*/
static int
spl_cache_grow_wait(spl_kmem_cache_t *skc)
{
return !test_bit(KMC_BIT_GROWING, &skc->skc_flags);
}

/*
* No available objects on any slabs, create a new slab.
*/
static int
spl_cache_grow(spl_kmem_cache_t *skc, int flags, void **obj)
{
int remaining, rc = 0;
SENTRY;

ASSERT(skc->skc_magic == SKC_MAGIC);
local_irq_enable();
might_sleep();
*obj = NULL;

/*
* Before allocating a new slab, check if the cache is being reaped.
* If it is, there is a good chance we can wait until it finishes
* and then use one of the newly freed but not aged-out slabs.
*/
if (test_bit(KMC_BIT_REAPING, &skc->skc_flags)) {
schedule();
SGOTO(out, sks= NULL);
}
if (test_bit(KMC_BIT_REAPING, &skc->skc_flags))
SRETURN(-EAGAIN);

/* Allocate a new slab for the cache */
sks = spl_slab_alloc(skc, flags | __GFP_NORETRY | KM_NODEBUG);
if (sks == NULL)
SGOTO(out, sks = NULL);
/*
* Slab growth is handled by dispatching a work request to the global
* work queue. This allows us to asynchronously allocate a new slab
* while retaining the ability to safely fall back to smaller synchronous
* allocations to ensure forward progress is always maintained.
*/
if (test_and_set_bit(KMC_BIT_GROWING, &skc->skc_flags) == 0) {
spl_kmem_alloc_t *ska;

/* Link the new empty slab in to the end of skc_partial_list. */
spin_lock(&skc->skc_lock);
skc->skc_slab_total++;
skc->skc_obj_total += sks->sks_objs;
list_add_tail(&sks->sks_list, &skc->skc_partial_list);
spin_unlock(&skc->skc_lock);
out:
local_irq_disable();
ska = kmalloc(sizeof(*ska), GFP_NOIO | __GFP_HIGH);
if (ska == NULL) {
clear_bit(KMC_BIT_GROWING, &skc->skc_flags);
wake_up_all(&skc->skc_waitq);
SRETURN(-ENOMEM);
}

SRETURN(sks);
atomic_inc(&skc->skc_ref);
ska->ska_cache = skc;
ska->ska_flags = flags;
spl_init_delayed_work(&ska->ska_work, spl_cache_grow_work, ska);
schedule_delayed_work(&ska->ska_work, 0);
}

/*
* Allowing 1/10 of a second before falling back to synchronously
* allocating the minimum amount of memory required by the caller
* is the safest way to avoid potential deadlocks.
*/
remaining = wait_event_timeout(skc->skc_waitq,
spl_cache_grow_wait(skc), HZ / 10);
if (remaining == 0)
rc = spl_emergency_alloc(skc, flags, obj);

SRETURN(rc);
}

/*
* Refill a per-cpu magazine with objects from the slabs for this
* cache. Ideally the magazine can be repopulated using existing
* objects which have been released, however if we are unable to
* locate enough free objects new slabs of objects will be created.
* Refill a per-cpu magazine with objects from the slabs for this cache.
* Ideally the magazine can be repopulated using existing objects which have
* been released; however, if we are unable to locate enough free objects,
* new slabs of objects will be created. On success NULL is returned, otherwise
* the address of a single emergency object is returned for use by the caller.
*/
static int
static void *
spl_cache_refill(spl_kmem_cache_t *skc, spl_kmem_magazine_t *skm, int flags)
{
spl_kmem_slab_t *sks;
int rc = 0, refill;
int count = 0, rc, refill;
void *obj = NULL;
SENTRY;

ASSERT(skc->skc_magic == SKC_MAGIC);
@@ -1647,8 +1787,15 @@ spl_cache_refill(spl_kmem_cache_t *skc, spl_kmem_magazine_t *skm, int flags)
if (list_empty(&skc->skc_partial_list)) {
spin_unlock(&skc->skc_lock);

sks = spl_cache_grow(skc, flags);
if (!sks)
local_irq_enable();
rc = spl_cache_grow(skc, flags, &obj);
local_irq_disable();

/* Emergency object for immediate use by caller */
if (rc == 0 && obj != NULL)
SRETURN(obj);

if (rc)
SGOTO(out, rc);

/* Rescheduled to different CPU skm is not local */

/* Consume as many objects as needed to refill the requested
* cache. We must also be careful not to overfill it. */
while (sks->sks_ref < sks->sks_objs && refill-- > 0 && ++rc) {
while (sks->sks_ref < sks->sks_objs && refill-- > 0 && ++count) {
ASSERT(skm->skm_avail < skm->skm_size);
ASSERT(rc < skm->skm_size);
ASSERT(count < skm->skm_size);
skm->skm_objs[skm->skm_avail++]=spl_cache_obj(skc,sks);
}

@@ -1688,8 +1835,7 @@ spl_cache_refill(spl_kmem_cache_t *skc, spl_kmem_magazine_t *skm, int flags)

spin_unlock(&skc->skc_lock);
out:
/* Returns the number of entries added to cache */
SRETURN(rc);
SRETURN(NULL);
}

/*
Expand Down Expand Up @@ -1804,10 +1950,9 @@ spl_kmem_cache_alloc(spl_kmem_cache_t *skc, int flags)
obj = skm->skm_objs[--skm->skm_avail];
skm->skm_age = jiffies;
} else {
/* Per-CPU cache empty, directly allocate from
* the slab and refill the per-CPU cache. */
(void)spl_cache_refill(skc, skm, flags);
SGOTO(restart, obj = NULL);
obj = spl_cache_refill(skc, skm, flags);
if (obj == NULL)
SGOTO(restart, obj = NULL);
}

local_irq_restore(irq_flags);
@@ -1838,6 +1983,14 @@ spl_kmem_cache_free(spl_kmem_cache_t *skc, void *obj)
ASSERT(skc->skc_magic == SKC_MAGIC);
ASSERT(!test_bit(KMC_BIT_DESTROY, &skc->skc_flags));
atomic_inc(&skc->skc_ref);

/*
* Emergency objects are never part of the virtual address space,
* so if we get a virtual address we can optimize this check out.
*/
if (!kmem_virt(obj) && !spl_emergency_free(skc, obj))
SGOTO(out, 0);

local_irq_save(flags);

/* Safe to update per-cpu structure without lock, but
skm->skm_objs[skm->skm_avail++] = obj;

local_irq_restore(flags);
out:
atomic_dec(&skc->skc_ref);

SEXIT;
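
Taken together, the allocation path in this patch is: try the per-cpu magazine, refill it from the partial slab list, and if no partial slab exists dispatch an asynchronous grow request, wait up to 1/10 of a second on skc_waitq for KMC_BIT_GROWING to clear, and only then fall back to a kmalloc'd emergency object. The sketch below is not from the patch; it models that timeout-plus-fallback handshake in userspace with pthreads. grow_work(), cache_grow(), and the 100 ms deadline are illustrative stand-ins for spl_cache_grow_work(), spl_cache_grow(), and the HZ / 10 wait.

#include <errno.h>
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <unistd.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;  /* models skc_lock */
static pthread_cond_t waitq = PTHREAD_COND_INITIALIZER;   /* models skc_waitq */
static bool growing;                                      /* models KMC_BIT_GROWING */
static void *new_slab;                                    /* models the freshly grown slab */

/* Models spl_cache_grow_work(): slow allocation, clear the flag, wake waiters. */
static void *grow_work(void *arg)
{
	(void)arg;
	usleep(200 * 1000);                 /* pretend the slab allocation is slow */
	pthread_mutex_lock(&lock);
	new_slab = malloc(4096);
	growing = false;
	pthread_cond_broadcast(&waitq);
	pthread_mutex_unlock(&lock);
	return NULL;
}

/* Models spl_cache_grow(): dispatch the worker, wait 100 ms, else fall back. */
static void *cache_grow(void)
{
	pthread_t tid;
	struct timespec deadline;
	void *obj;

	pthread_mutex_lock(&lock);
	if (!growing) {                     /* models test_and_set_bit() on skc_flags */
		growing = true;
		pthread_create(&tid, NULL, grow_work, NULL);
		pthread_detach(tid);
	}

	clock_gettime(CLOCK_REALTIME, &deadline);
	deadline.tv_nsec += 100 * 1000 * 1000;      /* 1/10 of a second */
	if (deadline.tv_nsec >= 1000000000L) {
		deadline.tv_sec++;
		deadline.tv_nsec -= 1000000000L;
	}

	while (growing) {                   /* models wait_event_timeout() */
		if (pthread_cond_timedwait(&waitq, &lock, &deadline) == ETIMEDOUT)
			break;
	}

	if (growing) {
		obj = malloc(64);           /* stand-in for spl_emergency_alloc() */
		printf("timed out: handing out an emergency object\n");
	} else {
		obj = new_slab;             /* a slab object would be carved from here */
		printf("grow finished: handing out a slab object\n");
	}
	pthread_mutex_unlock(&lock);
	return obj;
}

int main(void)
{
	void *obj = cache_grow();

	free(obj);
	usleep(300 * 1000);                 /* let the detached worker finish */
	pthread_mutex_lock(&lock);
	free(new_slab);                     /* discard the late slab in this toy model */
	pthread_mutex_unlock(&lock);
	return 0;
}

In the real patch the flag manipulation uses test_and_set_bit() / clear_bit() on skc_flags and the wait is wait_event_timeout(skc->skc_waitq, spl_cache_grow_wait(skc), HZ / 10), but the control flow is the same.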