This repository has been archived by the owner on Feb 26, 2020. It is now read-only.

Emergency slab objects #155

Closed
wants to merge 1 commit
17 changes: 17 additions & 0 deletions include/sys/kmem.h
@@ -291,6 +291,7 @@ enum {
KMC_BIT_KMEM = 5, /* Use kmem cache */
KMC_BIT_VMEM = 6, /* Use vmem cache */
KMC_BIT_OFFSLAB = 7, /* Objects not on slab */
KMC_BIT_GROWING = 15, /* Growing in progress */
KMC_BIT_REAPING = 16, /* Reaping in progress */
KMC_BIT_DESTROY = 17, /* Destroy in progress */
KMC_BIT_TOTAL = 18, /* Proc handler helper bit */
@@ -315,6 +316,7 @@ typedef enum kmem_cbrc {
#define KMC_KMEM (1 << KMC_BIT_KMEM)
#define KMC_VMEM (1 << KMC_BIT_VMEM)
#define KMC_OFFSLAB (1 << KMC_BIT_OFFSLAB)
#define KMC_GROWING (1 << KMC_BIT_GROWING)
#define KMC_REAPING (1 << KMC_BIT_REAPING)
#define KMC_DESTROY (1 << KMC_BIT_DESTROY)
#define KMC_TOTAL (1 << KMC_BIT_TOTAL)
@@ -373,6 +375,17 @@ typedef struct spl_kmem_slab {
uint32_t sks_ref; /* Ref count used objects */
} spl_kmem_slab_t;

typedef struct spl_kmem_alloc {
struct spl_kmem_cache *ska_cache; /* Owned by cache */
int ska_flags; /* Allocation flags */
struct delayed_work ska_work; /* Allocation work */
} spl_kmem_alloc_t;

typedef struct spl_kmem_emergency {
void *ske_obj; /* Buffer address */
struct list_head ske_list; /* Emergency list linkage */
} spl_kmem_emergency_t;

typedef struct spl_kmem_cache {
uint32_t skc_magic; /* Sanity magic */
uint32_t skc_name_size; /* Name length */
@@ -397,7 +410,9 @@ typedef struct spl_kmem_cache {
struct list_head skc_list; /* List of caches linkage */
struct list_head skc_complete_list;/* Completely alloc'ed */
struct list_head skc_partial_list; /* Partially alloc'ed */
struct list_head skc_emergency_list; /* Min sized objects */
spinlock_t skc_lock; /* Cache lock */
wait_queue_head_t skc_waitq; /* Allocation waiters */
uint64_t skc_slab_fail; /* Slab alloc failures */
uint64_t skc_slab_create;/* Slab creates */
uint64_t skc_slab_destroy;/* Slab destroys */
@@ -407,6 +422,8 @@ typedef struct spl_kmem_cache {
uint64_t skc_obj_total; /* Obj total current */
uint64_t skc_obj_alloc; /* Obj alloc current */
uint64_t skc_obj_max; /* Obj max historic */
uint64_t skc_obj_emergency; /* Obj emergency current */
uint64_t skc_obj_emergency_max; /* Obj emergency max */
} spl_kmem_cache_t;
#define kmem_cache_t spl_kmem_cache_t
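
The header additions above boil down to two pieces of bookkeeping: emergency objects are tracked on skc_emergency_list and counted in skc_obj_emergency / skc_obj_emergency_max, while KMC_BIT_GROWING, spl_kmem_alloc_t, and skc_waitq coordinate the asynchronous slab growth implemented in spl-kmem.c below. The following standalone sketch is not part of the patch; it is a simplified userspace model of just the emergency-object accounting. The names emergency_obj, cache_model, emergency_alloc(), and emergency_free() are invented stand-ins, and the kernel list, spinlock, and constructor/destructor handling are deliberately omitted.

#include <stdio.h>
#include <stdlib.h>

/* Stand-in for spl_kmem_emergency_t, minus the kernel list_head. */
struct emergency_obj {
	void *obj;                          /* buffer handed to the caller */
	struct emergency_obj *next;         /* emergency list linkage */
};

/* Stand-in for the new spl_kmem_cache_t fields. */
struct cache_model {
	struct emergency_obj *emergency_list;   /* skc_emergency_list */
	unsigned long obj_total;                /* skc_obj_total */
	unsigned long obj_emergency;            /* skc_obj_emergency */
	unsigned long obj_emergency_max;        /* skc_obj_emergency_max */
	size_t obj_size;                        /* skc_obj_size */
};

/* Mirrors spl_emergency_alloc(): allocate off-slab and track the object. */
static void *emergency_alloc(struct cache_model *c)
{
	struct emergency_obj *ske = malloc(sizeof(*ske));

	if (ske == NULL)
		return NULL;

	ske->obj = malloc(c->obj_size);
	if (ske->obj == NULL) {
		free(ske);
		return NULL;
	}

	ske->next = c->emergency_list;
	c->emergency_list = ske;
	c->obj_total++;
	if (++c->obj_emergency > c->obj_emergency_max)
		c->obj_emergency_max = c->obj_emergency;

	return ske->obj;
}

/* Mirrors spl_emergency_free(): walk the list, untrack, and free. */
static int emergency_free(struct cache_model *c, void *obj)
{
	struct emergency_obj **p;

	for (p = &c->emergency_list; *p != NULL; p = &(*p)->next) {
		if ((*p)->obj == obj) {
			struct emergency_obj *ske = *p;

			*p = ske->next;
			c->obj_emergency--;
			c->obj_total--;
			free(ske->obj);
			free(ske);
			return 0;
		}
	}

	return -1;                          /* not an emergency object */
}

int main(void)
{
	struct cache_model c = { .obj_size = 64 };
	void *obj = emergency_alloc(&c);

	printf("emergency=%lu max=%lu\n", c.obj_emergency, c.obj_emergency_max);
	emergency_free(&c, obj);
	printf("emergency=%lu total=%lu\n", c.obj_emergency, c.obj_total);

	return 0;
}

In the patch itself the list and counters are manipulated under skc_lock, and the skc_ctor / skc_dtor hooks are invoked on the buffer outside that lock.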

234 changes: 194 additions & 40 deletions module/spl/spl-kmem.c
@@ -1143,6 +1143,86 @@ spl_slab_reclaim(spl_kmem_cache_t *skc, int count, int flag)
SEXIT;
}

/*
* Allocate a single emergency object for use by the caller.
*/
static int
spl_emergency_alloc(spl_kmem_cache_t *skc, int flags, void **obj)
{
spl_kmem_emergency_t *ske;
int empty;
SENTRY;

/* Last chance: use a partial slab if one now exists */
spin_lock(&skc->skc_lock);
empty = list_empty(&skc->skc_partial_list);
spin_unlock(&skc->skc_lock);
if (!empty)
SRETURN(-EEXIST);

ske = kmalloc(sizeof(*ske), GFP_NOIO | __GFP_HIGH);
if (ske == NULL)
SRETURN(-ENOMEM);

ske->ske_obj = kmalloc(skc->skc_obj_size, GFP_NOIO | __GFP_HIGH);
if (ske->ske_obj == NULL) {
kfree(ske);
SRETURN(-ENOMEM);
}

if (skc->skc_ctor)
skc->skc_ctor(ske->ske_obj, skc->skc_private, flags);

spin_lock(&skc->skc_lock);
skc->skc_obj_total++;
skc->skc_obj_emergency++;
if (skc->skc_obj_emergency > skc->skc_obj_emergency_max)
skc->skc_obj_emergency_max = skc->skc_obj_emergency;

list_add(&ske->ske_list, &skc->skc_emergency_list);
spin_unlock(&skc->skc_lock);

*obj = ske->ske_obj;

SRETURN(0);
}

/*
* Free the passed object if it is an emergency object, otherwise return
* -ENOENT so the caller can free it as a normal slab object. Currently
* this is done by walking what should be a short list of emergency
* objects. If this proves to be too inefficient we can replace the
* simple list with a hash.
*/
static int
spl_emergency_free(spl_kmem_cache_t *skc, void *obj)
{
spl_kmem_emergency_t *m, *n, *ske = NULL;
SENTRY;

spin_lock(&skc->skc_lock);
list_for_each_entry_safe(m, n, &skc->skc_emergency_list, ske_list) {
if (m->ske_obj == obj) {
list_del(&m->ske_list);
skc->skc_obj_emergency--;
skc->skc_obj_total--;
ske = m;
break;
}
}
spin_unlock(&skc->skc_lock);

if (ske == NULL)
SRETURN(-ENOENT);

if (skc->skc_dtor)
skc->skc_dtor(ske->ske_obj, skc->skc_private);

kfree(ske->ske_obj);
kfree(ske);

SRETURN(0);
}

/*
* Called regularly on all caches to age objects out of the magazines
* which have not been accessed in skc->skc_delay seconds. This prevents
@@ -1430,7 +1510,9 @@ spl_kmem_cache_create(char *name, size_t size, size_t align,
INIT_LIST_HEAD(&skc->skc_list);
INIT_LIST_HEAD(&skc->skc_complete_list);
INIT_LIST_HEAD(&skc->skc_partial_list);
INIT_LIST_HEAD(&skc->skc_emergency_list);
spin_lock_init(&skc->skc_lock);
init_waitqueue_head(&skc->skc_waitq);
skc->skc_slab_fail = 0;
skc->skc_slab_create = 0;
skc->skc_slab_destroy = 0;
skc->skc_obj_total = 0;
skc->skc_obj_alloc = 0;
skc->skc_obj_max = 0;
skc->skc_obj_emergency = 0;
skc->skc_obj_emergency_max = 0;

if (align) {
VERIFY(ISP2(align));
@@ -1530,7 +1614,9 @@ spl_kmem_cache_destroy(spl_kmem_cache_t *skc)
ASSERT3U(skc->skc_obj_alloc, ==, 0);
ASSERT3U(skc->skc_slab_total, ==, 0);
ASSERT3U(skc->skc_obj_total, ==, 0);
ASSERT3U(skc->skc_obj_emergency, ==, 0);
ASSERT(list_empty(&skc->skc_complete_list));
ASSERT(list_empty(&skc->skc_emergency_list));

kmem_free(skc->skc_name, skc->skc_name_size);
spin_unlock(&skc->skc_lock);
@@ -1581,59 +1667,113 @@ spl_cache_obj(spl_kmem_cache_t *skc, spl_kmem_slab_t *sks)
}

/*
* No available objects on any slabs, create a new slab. Since this
* is an expensive operation we do it without holding the spin lock and
* only briefly acquire it when we link in the fully allocated and
* constructed slab.
* Generic slab allocation function to be run by the global work queues.
* It is responsible for allocating a new slab, linking it into the list
* of partial slabs, and then waking any waiters.
*/
static spl_kmem_slab_t *
spl_cache_grow(spl_kmem_cache_t *skc, int flags)
static void
spl_cache_grow_work(void *data)
{
spl_kmem_alloc_t *ska =
spl_get_work_data(data, spl_kmem_alloc_t, ska_work.work);
spl_kmem_cache_t *skc = ska->ska_cache;
spl_kmem_slab_t *sks;

sks = spl_slab_alloc(skc, ska->ska_flags | __GFP_NORETRY | KM_NODEBUG);
spin_lock(&skc->skc_lock);
if (sks) {
skc->skc_slab_total++;
skc->skc_obj_total += sks->sks_objs;
list_add_tail(&sks->sks_list, &skc->skc_partial_list);
}

atomic_dec(&skc->skc_ref);
clear_bit(KMC_BIT_GROWING, &skc->skc_flags);
wake_up_all(&skc->skc_waitq);
spin_unlock(&skc->skc_lock);

kfree(ska);
}

/*
* Returns non-zero when a new slab should be available.
*/
static int
spl_cache_grow_wait(spl_kmem_cache_t *skc)
{
return !test_bit(KMC_BIT_GROWING, &skc->skc_flags);
}

/*
* No available objects on any slabs, create a new slab.
*/
static int
spl_cache_grow(spl_kmem_cache_t *skc, int flags, void **obj)
{
int remaining, rc = 0;
SENTRY;

ASSERT(skc->skc_magic == SKC_MAGIC);
local_irq_enable();
might_sleep();
*obj = NULL;

/*
* Before allocating a new slab, check if the cache is being reaped.
* If it is, there is a good chance we can wait until it finishes
* and then use one of the newly freed but not aged-out slabs.
*/
if (test_bit(KMC_BIT_REAPING, &skc->skc_flags)) {
schedule();
SGOTO(out, sks= NULL);
}
if (test_bit(KMC_BIT_REAPING, &skc->skc_flags))
SRETURN(-EAGAIN);

/* Allocate a new slab for the cache */
sks = spl_slab_alloc(skc, flags | __GFP_NORETRY | KM_NODEBUG);
if (sks == NULL)
SGOTO(out, sks = NULL);
/*
* Slab growth is handled by dispatching a work request to the global
* work queue. This allows us to asynchronously allocate a new slab
* while retaining the ability to safely fall back to smaller synchronous
* allocations to ensure forward progress is always maintained.
*/
if (test_and_set_bit(KMC_BIT_GROWING, &skc->skc_flags) == 0) {
spl_kmem_alloc_t *ska;

/* Link the new empty slab in to the end of skc_partial_list. */
spin_lock(&skc->skc_lock);
skc->skc_slab_total++;
skc->skc_obj_total += sks->sks_objs;
list_add_tail(&sks->sks_list, &skc->skc_partial_list);
spin_unlock(&skc->skc_lock);
out:
local_irq_disable();
ska = kmalloc(sizeof(*ska), GFP_NOIO | __GFP_HIGH);
if (ska == NULL) {
clear_bit(KMC_BIT_GROWING, &skc->skc_flags);
wake_up_all(&skc->skc_waitq);
SRETURN(-ENOMEM);
}

SRETURN(sks);
atomic_inc(&skc->skc_ref);
ska->ska_cache = skc;
ska->ska_flags = flags;
spl_init_delayed_work(&ska->ska_work, spl_cache_grow_work, ska);
schedule_delayed_work(&ska->ska_work, 0);
}

/*
* Allowing 1/10 of a second before falling back to synchronously
* allocating the minimum amount of memory required by the caller
* is the safest way to avoid potential deadlocks.
*/
remaining = wait_event_timeout(skc->skc_waitq,
spl_cache_grow_wait(skc), HZ / 10);
if (remaining == 0)
rc = spl_emergency_alloc(skc, flags, obj);

SRETURN(rc);
}

/*
* Refill a per-cpu magazine with objects from the slabs for this
* cache. Ideally the magazine can be repopulated using existing
* objects which have been released, however if we are unable to
* locate enough free objects new slabs of objects will be created.
* Refill a per-cpu magazine with objects from the slabs for this cache.
* Ideally the magazine can be repopulated using existing objects which have
* been released; however, if we are unable to locate enough free objects,
* new slabs of objects will be created. On success NULL is returned, otherwise
* the address of a single emergency object is returned for use by the caller.
*/
static int
static void *
spl_cache_refill(spl_kmem_cache_t *skc, spl_kmem_magazine_t *skm, int flags)
{
spl_kmem_slab_t *sks;
int rc = 0, refill;
int count = 0, rc, refill;
void *obj = NULL;
SENTRY;

ASSERT(skc->skc_magic == SKC_MAGIC);
@@ -1647,8 +1787,15 @@ spl_cache_refill(spl_kmem_cache_t *skc, spl_kmem_magazine_t *skm, int flags)
if (list_empty(&skc->skc_partial_list)) {
spin_unlock(&skc->skc_lock);

sks = spl_cache_grow(skc, flags);
if (!sks)
local_irq_enable();
rc = spl_cache_grow(skc, flags, &obj);
local_irq_disable();

/* Emergency object for immediate use by caller */
if (rc == 0 && obj != NULL)
SRETURN(obj);

if (rc)
SGOTO(out, rc);

/* Rescheduled to different CPU skm is not local */

/* Consume as many objects as needed to refill the requested
* cache. We must also be careful not to overfill it. */
while (sks->sks_ref < sks->sks_objs && refill-- > 0 && ++rc) {
while (sks->sks_ref < sks->sks_objs && refill-- > 0 && ++count) {
ASSERT(skm->skm_avail < skm->skm_size);
ASSERT(rc < skm->skm_size);
ASSERT(count < skm->skm_size);
skm->skm_objs[skm->skm_avail++]=spl_cache_obj(skc,sks);
}

@@ -1688,8 +1835,7 @@ spl_cache_refill(spl_kmem_cache_t *skc, spl_kmem_magazine_t *skm, int flags)

spin_unlock(&skc->skc_lock);
out:
/* Returns the number of entries added to cache */
SRETURN(rc);
SRETURN(NULL);
}

/*
Expand Down Expand Up @@ -1804,10 +1950,9 @@ spl_kmem_cache_alloc(spl_kmem_cache_t *skc, int flags)
obj = skm->skm_objs[--skm->skm_avail];
skm->skm_age = jiffies;
} else {
/* Per-CPU cache empty, directly allocate from
* the slab and refill the per-CPU cache. */
(void)spl_cache_refill(skc, skm, flags);
SGOTO(restart, obj = NULL);
obj = spl_cache_refill(skc, skm, flags);
if (obj == NULL)
SGOTO(restart, obj = NULL);
}

local_irq_restore(irq_flags);
@@ -1838,6 +1983,14 @@ spl_kmem_cache_free(spl_kmem_cache_t *skc, void *obj)
ASSERT(skc->skc_magic == SKC_MAGIC);
ASSERT(!test_bit(KMC_BIT_DESTROY, &skc->skc_flags));
atomic_inc(&skc->skc_ref);

/*
* Emergency objects are never part of the virtual address space,
* so if we get a virtual address we can optimize this check out.
*/
if (!kmem_virt(obj) && !spl_emergency_free(skc, obj))
SGOTO(out, 0);

local_irq_save(flags);

/* Safe to update per-cpu structure without lock, but
skm->skm_objs[skm->skm_avail++] = obj;

local_irq_restore(flags);
out:
atomic_dec(&skc->skc_ref);

SEXIT;
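
Taken together, the allocation path in this patch is: try the per-cpu magazine, refill it from the partial slab list, and if no partial slab exists dispatch an asynchronous grow request, wait up to 1/10 of a second on skc_waitq for KMC_BIT_GROWING to clear, and only then fall back to a kmalloc'd emergency object. The sketch below is not from the patch; it models that timeout-plus-fallback handshake in userspace with pthreads. grow_work(), cache_grow(), and the 100 ms deadline are illustrative stand-ins for spl_cache_grow_work(), spl_cache_grow(), and the HZ / 10 wait.

#include <errno.h>
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <unistd.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;  /* models skc_lock */
static pthread_cond_t waitq = PTHREAD_COND_INITIALIZER;   /* models skc_waitq */
static bool growing;                                      /* models KMC_BIT_GROWING */
static void *new_slab;                                    /* models the freshly grown slab */

/* Models spl_cache_grow_work(): slow allocation, clear the flag, wake waiters. */
static void *grow_work(void *arg)
{
	(void)arg;
	usleep(200 * 1000);                 /* pretend the slab allocation is slow */
	pthread_mutex_lock(&lock);
	new_slab = malloc(4096);
	growing = false;
	pthread_cond_broadcast(&waitq);
	pthread_mutex_unlock(&lock);
	return NULL;
}

/* Models spl_cache_grow(): dispatch the worker, wait 100 ms, else fall back. */
static void *cache_grow(void)
{
	pthread_t tid;
	struct timespec deadline;
	void *obj;

	pthread_mutex_lock(&lock);
	if (!growing) {                     /* models test_and_set_bit() on skc_flags */
		growing = true;
		pthread_create(&tid, NULL, grow_work, NULL);
		pthread_detach(tid);
	}

	clock_gettime(CLOCK_REALTIME, &deadline);
	deadline.tv_nsec += 100 * 1000 * 1000;      /* 1/10 of a second */
	if (deadline.tv_nsec >= 1000000000L) {
		deadline.tv_sec++;
		deadline.tv_nsec -= 1000000000L;
	}

	while (growing) {                   /* models wait_event_timeout() */
		if (pthread_cond_timedwait(&waitq, &lock, &deadline) == ETIMEDOUT)
			break;
	}

	if (growing) {
		obj = malloc(64);           /* stand-in for spl_emergency_alloc() */
		printf("timed out: handing out an emergency object\n");
	} else {
		obj = new_slab;             /* a slab object would be carved from here */
		printf("grow finished: handing out a slab object\n");
	}
	pthread_mutex_unlock(&lock);
	return obj;
}

int main(void)
{
	void *obj = cache_grow();

	free(obj);
	usleep(300 * 1000);                 /* let the detached worker finish */
	pthread_mutex_lock(&lock);
	free(new_slab);                     /* discard the late slab in this toy model */
	pthread_mutex_unlock(&lock);
	return 0;
}

In the real patch the flag manipulation uses test_and_set_bit() / clear_bit() on skc_flags and the wait is wait_event_timeout(skc->skc_waitq, spl_cache_grow_wait(skc), HZ / 10), but the control flow is the same.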