diff --git a/include/linux/proc_compat.h b/include/linux/proc_compat.h index 5bbe85081df8..2c57f39d2ec5 100644 --- a/include/linux/proc_compat.h +++ b/include/linux/proc_compat.h @@ -22,8 +22,8 @@ * with the SPL. If not, see . \*****************************************************************************/ -#ifndef _SPL_PROC_H -#define _SPL_PROC_H +#ifndef _SPL_PROC_COMPAT_H +#define _SPL_PROC_COMPAT_H #include @@ -32,4 +32,4 @@ extern struct proc_dir_entry *proc_spl_kstat; int spl_proc_init(void); void spl_proc_fini(void); -#endif /* SPL_PROC_H */ +#endif /* SPL_PROC_COMPAT_H */ diff --git a/include/sys/Makefile.am b/include/sys/Makefile.am index 2d21c5728718..f9e883fd41ab 100644 --- a/include/sys/Makefile.am +++ b/include/sys/Makefile.am @@ -44,6 +44,7 @@ KERNEL_H = \ $(top_srcdir)/include/sys/isa_defs.h \ $(top_srcdir)/include/sys/kidmap.h \ $(top_srcdir)/include/sys/kmem.h \ + $(top_srcdir)/include/sys/kmem_cache.h \ $(top_srcdir)/include/sys/kobj.h \ $(top_srcdir)/include/sys/kstat.h \ $(top_srcdir)/include/sys/list.h \ @@ -94,6 +95,7 @@ KERNEL_H = \ $(top_srcdir)/include/sys/varargs.h \ $(top_srcdir)/include/sys/vfs.h \ $(top_srcdir)/include/sys/vfs_opreg.h \ + $(top_srcdir)/include/sys/vmem.h \ $(top_srcdir)/include/sys/vmsystm.h \ $(top_srcdir)/include/sys/vnode.h \ $(top_srcdir)/include/sys/zmod.h \ diff --git a/include/sys/kmem.h b/include/sys/kmem.h index 936e49d6dfec..8d5e729373fa 100644 --- a/include/sys/kmem.h +++ b/include/sys/kmem.h @@ -1,4 +1,4 @@ -/*****************************************************************************\ +/* * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. * Copyright (C) 2007 The Regents of the University of California. * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). @@ -20,517 +20,131 @@ * * You should have received a copy of the GNU General Public License along * with the SPL. If not, see . -\*****************************************************************************/ + */ #ifndef _SPL_KMEM_H #define _SPL_KMEM_H -#include +#include #include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include +#include + +extern int kmem_debugging(void); +extern char *kmem_vasprintf(const char *fmt, va_list ap); +extern char *kmem_asprintf(const char *fmt, ...); +extern char *strdup(const char *str); +extern void strfree(char *str); /* * Memory allocation interfaces */ -#define KM_SLEEP GFP_KERNEL /* Can sleep, never fails */ -#define KM_NOSLEEP GFP_ATOMIC /* Can not sleep, may fail */ -#define KM_PUSHPAGE (GFP_NOIO | __GFP_HIGH) /* Use reserved memory */ -#define KM_NODEBUG __GFP_NOWARN /* Suppress warnings */ -#define KM_FLAGS __GFP_BITS_MASK -#define KM_VMFLAGS GFP_LEVEL_MASK +#define KM_SLEEP 0x0000 /* can block for memory; success guaranteed */ +#define KM_NOSLEEP 0x0001 /* cannot block for memory; may fail */ +#define KM_PUSHPAGE 0x0004 /* can block for memory; may use reserve */ +#define KM_ZERO 0x1000 /* zero the allocation */ +#define KM_VMEM 0x2000 /* caller is vmem_* wrapper */ -/* - * Used internally, the kernel does not need to support this flag - */ -#ifndef __GFP_ZERO -# define __GFP_ZERO 0x8000 -#endif +#define KM_PUBLIC_MASK (KM_SLEEP | KM_NOSLEEP | KM_PUSHPAGE) /* - * PF_NOFS is a per-process debug flag which is set in current->flags to - * detect when a process is performing an unsafe allocation. All tasks - * with PF_NOFS set must strictly use KM_PUSHPAGE for allocations because - * if they enter direct reclaim and initiate I/O the may deadlock. - * - * When debugging is disabled, any incorrect usage will be detected and - * a call stack with warning will be printed to the console. The flags - * will then be automatically corrected to allow for safe execution. If - * debugging is enabled this will be treated as a fatal condition. - * - * To avoid any risk of conflicting with the existing PF_ flags. The - * PF_NOFS bit shadows the rarely used PF_MUTEX_TESTER bit. Only when - * CONFIG_RT_MUTEX_TESTER is not set, and we know this bit is unused, - * will the PF_NOFS bit be valid. Happily, most existing distributions - * ship a kernel with CONFIG_RT_MUTEX_TESTER disabled. + * Convert a KM_* flags mask to its Linux GFP_* counterpart. The conversion + * function is context aware which means that KM_SLEEP allocations can be + * safely used in syncing contexts which have set PF_FSTRANS. */ -#if !defined(CONFIG_RT_MUTEX_TESTER) && defined(PF_MUTEX_TESTER) -#define PF_NOFS PF_MUTEX_TESTER - -static inline void -sanitize_flags(struct task_struct *p, gfp_t *flags) +static inline gfp_t +kmem_flags_convert(int flags) { - if (unlikely((p->flags & PF_NOFS) && (*flags & (__GFP_IO|__GFP_FS)))) { -#ifdef NDEBUG - printk(KERN_WARNING "Fixing allocation for task %s (%d) " - "which used GFP flags 0x%x with PF_NOFS set\n", - p->comm, p->pid, *flags); - spl_dumpstack(); - *flags &= ~(__GFP_IO|__GFP_FS); -#else - PANIC("FATAL allocation for task %s (%d) which used GFP " - "flags 0x%x with PF_NOFS set\n", p->comm, p->pid, *flags); -#endif /* NDEBUG */ + gfp_t lflags = __GFP_NOWARN | __GFP_COMP; + + if (flags & KM_NOSLEEP) { + lflags |= GFP_ATOMIC | __GFP_NORETRY; + } else { + lflags |= GFP_KERNEL; + if ((current->flags & PF_FSTRANS)) + lflags &= ~(__GFP_IO|__GFP_FS); } -} -#else -#define PF_NOFS 0x00000000 -#define sanitize_flags(p, fl) ((void)0) -#endif /* !defined(CONFIG_RT_MUTEX_TESTER) && defined(PF_MUTEX_TESTER) */ -/* - * __GFP_NOFAIL looks like it will be removed from the kernel perhaps as - * early as 2.6.32. To avoid this issue when it occurs in upstream kernels - * we retry the allocation here as long as it is not __GFP_WAIT (GFP_ATOMIC). - * I would prefer the caller handle the failure case cleanly but we are - * trying to emulate Solaris and those are not the Solaris semantics. - */ -static inline void * -kmalloc_nofail(size_t size, gfp_t flags) -{ - void *ptr; - - sanitize_flags(current, &flags); + if (flags & KM_PUSHPAGE) + lflags |= __GFP_HIGH; - do { - ptr = kmalloc(size, flags); - } while (ptr == NULL && (flags & __GFP_WAIT)); + if (flags & KM_ZERO) + lflags |= __GFP_ZERO; - return ptr; + return (lflags); } -static inline void * -kzalloc_nofail(size_t size, gfp_t flags) -{ - void *ptr; - - sanitize_flags(current, &flags); - - do { - ptr = kzalloc(size, flags); - } while (ptr == NULL && (flags & __GFP_WAIT)); +typedef struct { + struct task_struct *fstrans_thread; + unsigned int saved_flags; +} fstrans_cookie_t; - return ptr; -} - -static inline void * -kmalloc_node_nofail(size_t size, gfp_t flags, int node) +static inline fstrans_cookie_t +spl_fstrans_mark(void) { - void *ptr; - - sanitize_flags(current, &flags); + fstrans_cookie_t cookie; - do { - ptr = kmalloc_node(size, flags, node); - } while (ptr == NULL && (flags & __GFP_WAIT)); + cookie.fstrans_thread = current; + cookie.saved_flags = current->flags & PF_FSTRANS; + current->flags |= PF_FSTRANS; - return ptr; + return (cookie); } -static inline void * -vmalloc_nofail(size_t size, gfp_t flags) +static inline void +spl_fstrans_unmark(fstrans_cookie_t cookie) { - void *ptr; - - sanitize_flags(current, &flags); + ASSERT3P(cookie.fstrans_thread, ==, current); + ASSERT(current->flags & PF_FSTRANS); - /* - * Retry failed __vmalloc() allocations once every second. The - * rational for the delay is that the likely failure modes are: - * - * 1) The system has completely exhausted memory, in which case - * delaying 1 second for the memory reclaim to run is reasonable - * to avoid thrashing the system. - * 2) The system has memory but has exhausted the small virtual - * address space available on 32-bit systems. Retrying the - * allocation immediately will only result in spinning on the - * virtual address space lock. It is better delay a second and - * hope that another process will free some of the address space. - * But the bottom line is there is not much we can actually do - * since we can never safely return a failure and honor the - * Solaris semantics. - */ - while (1) { - ptr = __vmalloc(size, flags | __GFP_HIGHMEM, PAGE_KERNEL); - if (unlikely((ptr == NULL) && (flags & __GFP_WAIT))) { - set_current_state(TASK_INTERRUPTIBLE); - schedule_timeout(HZ); - } else { - break; - } - } - - return ptr; + current->flags &= ~(PF_FSTRANS); + current->flags |= cookie.saved_flags; } -static inline void * -vzalloc_nofail(size_t size, gfp_t flags) +static inline int +spl_fstrans_check(void) { - void *ptr; - - ptr = vmalloc_nofail(size, flags); - if (ptr) - memset(ptr, 0, (size)); - - return ptr; + return (current->flags & PF_FSTRANS); } -#ifdef DEBUG_KMEM - -/* - * Memory accounting functions to be used only when DEBUG_KMEM is set. - */ -# ifdef HAVE_ATOMIC64_T - -# define kmem_alloc_used_add(size) atomic64_add(size, &kmem_alloc_used) -# define kmem_alloc_used_sub(size) atomic64_sub(size, &kmem_alloc_used) -# define kmem_alloc_used_read() atomic64_read(&kmem_alloc_used) -# define kmem_alloc_used_set(size) atomic64_set(&kmem_alloc_used, size) -# define vmem_alloc_used_add(size) atomic64_add(size, &vmem_alloc_used) -# define vmem_alloc_used_sub(size) atomic64_sub(size, &vmem_alloc_used) -# define vmem_alloc_used_read() atomic64_read(&vmem_alloc_used) -# define vmem_alloc_used_set(size) atomic64_set(&vmem_alloc_used, size) - +#ifdef HAVE_ATOMIC64_T +#define kmem_alloc_used_add(size) atomic64_add(size, &kmem_alloc_used) +#define kmem_alloc_used_sub(size) atomic64_sub(size, &kmem_alloc_used) +#define kmem_alloc_used_read() atomic64_read(&kmem_alloc_used) +#define kmem_alloc_used_set(size) atomic64_set(&kmem_alloc_used, size) extern atomic64_t kmem_alloc_used; extern unsigned long long kmem_alloc_max; -extern atomic64_t vmem_alloc_used; -extern unsigned long long vmem_alloc_max; - -# else /* HAVE_ATOMIC64_T */ - -# define kmem_alloc_used_add(size) atomic_add(size, &kmem_alloc_used) -# define kmem_alloc_used_sub(size) atomic_sub(size, &kmem_alloc_used) -# define kmem_alloc_used_read() atomic_read(&kmem_alloc_used) -# define kmem_alloc_used_set(size) atomic_set(&kmem_alloc_used, size) -# define vmem_alloc_used_add(size) atomic_add(size, &vmem_alloc_used) -# define vmem_alloc_used_sub(size) atomic_sub(size, &vmem_alloc_used) -# define vmem_alloc_used_read() atomic_read(&vmem_alloc_used) -# define vmem_alloc_used_set(size) atomic_set(&vmem_alloc_used, size) - +#else /* HAVE_ATOMIC64_T */ +#define kmem_alloc_used_add(size) atomic_add(size, &kmem_alloc_used) +#define kmem_alloc_used_sub(size) atomic_sub(size, &kmem_alloc_used) +#define kmem_alloc_used_read() atomic_read(&kmem_alloc_used) +#define kmem_alloc_used_set(size) atomic_set(&kmem_alloc_used, size) extern atomic_t kmem_alloc_used; extern unsigned long long kmem_alloc_max; -extern atomic_t vmem_alloc_used; -extern unsigned long long vmem_alloc_max; - -# endif /* HAVE_ATOMIC64_T */ - -# ifdef DEBUG_KMEM_TRACKING -/* - * DEBUG_KMEM && DEBUG_KMEM_TRACKING - * - * The maximum level of memory debugging. All memory will be accounted - * for and each allocation will be explicitly tracked. Any allocation - * which is leaked will be reported on module unload and the exact location - * where that memory was allocation will be reported. This level of memory - * tracking will have a significant impact on performance and should only - * be enabled for debugging. This feature may be enabled by passing - * --enable-debug-kmem-tracking to configure. - */ -# define kmem_alloc(sz, fl) kmem_alloc_track((sz), (fl), \ - __FUNCTION__, __LINE__, 0, 0) -# define kmem_zalloc(sz, fl) kmem_alloc_track((sz), (fl)|__GFP_ZERO,\ - __FUNCTION__, __LINE__, 0, 0) -# define kmem_alloc_node(sz, fl, nd) kmem_alloc_track((sz), (fl), \ - __FUNCTION__, __LINE__, 1, nd) -# define kmem_free(ptr, sz) kmem_free_track((ptr), (sz)) +#endif /* HAVE_ATOMIC64_T */ -# define vmem_alloc(sz, fl) vmem_alloc_track((sz), (fl), \ - __FUNCTION__, __LINE__) -# define vmem_zalloc(sz, fl) vmem_alloc_track((sz), (fl)|__GFP_ZERO,\ - __FUNCTION__, __LINE__) -# define vmem_free(ptr, sz) vmem_free_track((ptr), (sz)) +extern unsigned int spl_kmem_alloc_warn; +extern unsigned int spl_kmem_alloc_max; -extern void *kmem_alloc_track(size_t, int, const char *, int, int, int); -extern void kmem_free_track(const void *, size_t); -extern void *vmem_alloc_track(size_t, int, const char *, int); -extern void vmem_free_track(const void *, size_t); +#define kmem_alloc(sz, fl) spl_kmem_alloc((sz), (fl), __func__, __LINE__) +#define kmem_zalloc(sz, fl) spl_kmem_zalloc((sz), (fl), __func__, __LINE__) +#define kmem_free(ptr, sz) spl_kmem_free((ptr), (sz)) -# else /* DEBUG_KMEM_TRACKING */ -/* - * DEBUG_KMEM && !DEBUG_KMEM_TRACKING - * - * The default build will set DEBUG_KEM. This provides basic memory - * accounting with little to no impact on performance. When the module - * is unloaded in any memory was leaked the total number of leaked bytes - * will be reported on the console. To disable this basic accounting - * pass the --disable-debug-kmem option to configure. - */ -# define kmem_alloc(sz, fl) kmem_alloc_debug((sz), (fl), \ - __FUNCTION__, __LINE__, 0, 0) -# define kmem_zalloc(sz, fl) kmem_alloc_debug((sz), (fl)|__GFP_ZERO,\ - __FUNCTION__, __LINE__, 0, 0) -# define kmem_alloc_node(sz, fl, nd) kmem_alloc_debug((sz), (fl), \ - __FUNCTION__, __LINE__, 1, nd) -# define kmem_free(ptr, sz) kmem_free_debug((ptr), (sz)) - -# define vmem_alloc(sz, fl) vmem_alloc_debug((sz), (fl), \ - __FUNCTION__, __LINE__) -# define vmem_zalloc(sz, fl) vmem_alloc_debug((sz), (fl)|__GFP_ZERO,\ - __FUNCTION__, __LINE__) -# define vmem_free(ptr, sz) vmem_free_debug((ptr), (sz)) - -extern void *kmem_alloc_debug(size_t, int, const char *, int, int, int); -extern void kmem_free_debug(const void *, size_t); -extern void *vmem_alloc_debug(size_t, int, const char *, int); -extern void vmem_free_debug(const void *, size_t); - -# endif /* DEBUG_KMEM_TRACKING */ -#else /* DEBUG_KMEM */ -/* - * !DEBUG_KMEM && !DEBUG_KMEM_TRACKING - * - * All debugging is disabled. There will be no overhead even for - * minimal memory accounting. To enable basic accounting pass the - * --enable-debug-kmem option to configure. - */ -# define kmem_alloc(sz, fl) kmalloc_nofail((sz), (fl)) -# define kmem_zalloc(sz, fl) kzalloc_nofail((sz), (fl)) -# define kmem_alloc_node(sz, fl, nd) kmalloc_node_nofail((sz), (fl), (nd)) -# define kmem_free(ptr, sz) ((void)(sz), kfree(ptr)) - -# define vmem_alloc(sz, fl) vmalloc_nofail((sz), (fl)) -# define vmem_zalloc(sz, fl) vzalloc_nofail((sz), (fl)) -# define vmem_free(ptr, sz) ((void)(sz), vfree(ptr)) - -#endif /* DEBUG_KMEM */ - -extern int kmem_debugging(void); -extern char *kmem_vasprintf(const char *fmt, va_list ap); -extern char *kmem_asprintf(const char *fmt, ...); -extern char *strdup(const char *str); -extern void strfree(char *str); - - -/* - * Slab allocation interfaces. The SPL slab differs from the standard - * Linux SLAB or SLUB primarily in that each cache may be backed by slabs - * allocated from the physical or virtal memory address space. The virtual - * slabs allow for good behavior when allocation large objects of identical - * size. This slab implementation also supports both constructors and - * destructions which the Linux slab does not. - */ -enum { - KMC_BIT_NOTOUCH = 0, /* Don't update ages */ - KMC_BIT_NODEBUG = 1, /* Default behavior */ - KMC_BIT_NOMAGAZINE = 2, /* XXX: Unsupported */ - KMC_BIT_NOHASH = 3, /* XXX: Unsupported */ - KMC_BIT_QCACHE = 4, /* XXX: Unsupported */ - KMC_BIT_KMEM = 5, /* Use kmem cache */ - KMC_BIT_VMEM = 6, /* Use vmem cache */ - KMC_BIT_SLAB = 7, /* Use Linux slab cache */ - KMC_BIT_OFFSLAB = 8, /* Objects not on slab */ - KMC_BIT_NOEMERGENCY = 9, /* Disable emergency objects */ - KMC_BIT_DEADLOCKED = 14, /* Deadlock detected */ - KMC_BIT_GROWING = 15, /* Growing in progress */ - KMC_BIT_REAPING = 16, /* Reaping in progress */ - KMC_BIT_DESTROY = 17, /* Destroy in progress */ - KMC_BIT_TOTAL = 18, /* Proc handler helper bit */ - KMC_BIT_ALLOC = 19, /* Proc handler helper bit */ - KMC_BIT_MAX = 20, /* Proc handler helper bit */ -}; - -/* kmem move callback return values */ -typedef enum kmem_cbrc { - KMEM_CBRC_YES = 0, /* Object moved */ - KMEM_CBRC_NO = 1, /* Object not moved */ - KMEM_CBRC_LATER = 2, /* Object not moved, try again later */ - KMEM_CBRC_DONT_NEED = 3, /* Neither object is needed */ - KMEM_CBRC_DONT_KNOW = 4, /* Object unknown */ -} kmem_cbrc_t; - -#define KMC_NOTOUCH (1 << KMC_BIT_NOTOUCH) -#define KMC_NODEBUG (1 << KMC_BIT_NODEBUG) -#define KMC_NOMAGAZINE (1 << KMC_BIT_NOMAGAZINE) -#define KMC_NOHASH (1 << KMC_BIT_NOHASH) -#define KMC_QCACHE (1 << KMC_BIT_QCACHE) -#define KMC_KMEM (1 << KMC_BIT_KMEM) -#define KMC_VMEM (1 << KMC_BIT_VMEM) -#define KMC_SLAB (1 << KMC_BIT_SLAB) -#define KMC_OFFSLAB (1 << KMC_BIT_OFFSLAB) -#define KMC_NOEMERGENCY (1 << KMC_BIT_NOEMERGENCY) -#define KMC_DEADLOCKED (1 << KMC_BIT_DEADLOCKED) -#define KMC_GROWING (1 << KMC_BIT_GROWING) -#define KMC_REAPING (1 << KMC_BIT_REAPING) -#define KMC_DESTROY (1 << KMC_BIT_DESTROY) -#define KMC_TOTAL (1 << KMC_BIT_TOTAL) -#define KMC_ALLOC (1 << KMC_BIT_ALLOC) -#define KMC_MAX (1 << KMC_BIT_MAX) - -#define KMC_REAP_CHUNK INT_MAX -#define KMC_DEFAULT_SEEKS 1 - -#define KMC_EXPIRE_AGE 0x1 /* Due to age */ -#define KMC_EXPIRE_MEM 0x2 /* Due to low memory */ - -#define KMC_RECLAIM_ONCE 0x1 /* Force a single shrinker pass */ - -extern unsigned int spl_kmem_cache_expire; -extern struct list_head spl_kmem_cache_list; -extern struct rw_semaphore spl_kmem_cache_sem; - -#define SKM_MAGIC 0x2e2e2e2e -#define SKO_MAGIC 0x20202020 -#define SKS_MAGIC 0x22222222 -#define SKC_MAGIC 0x2c2c2c2c - -#define SPL_KMEM_CACHE_DELAY 15 /* Minimum slab release age */ -#define SPL_KMEM_CACHE_REAP 0 /* Default reap everything */ -#define SPL_KMEM_CACHE_OBJ_PER_SLAB 16 /* Target objects per slab */ -#define SPL_KMEM_CACHE_OBJ_PER_SLAB_MIN 1 /* Minimum objects per slab */ -#define SPL_KMEM_CACHE_ALIGN 8 /* Default object alignment */ - -#define POINTER_IS_VALID(p) 0 /* Unimplemented */ -#define POINTER_INVALIDATE(pp) /* Unimplemented */ - -typedef int (*spl_kmem_ctor_t)(void *, void *, int); -typedef void (*spl_kmem_dtor_t)(void *, void *); -typedef void (*spl_kmem_reclaim_t)(void *); - -typedef struct spl_kmem_magazine { - uint32_t skm_magic; /* Sanity magic */ - uint32_t skm_avail; /* Available objects */ - uint32_t skm_size; /* Magazine size */ - uint32_t skm_refill; /* Batch refill size */ - struct spl_kmem_cache *skm_cache; /* Owned by cache */ - unsigned long skm_age; /* Last cache access */ - unsigned int skm_cpu; /* Owned by cpu */ - void *skm_objs[0]; /* Object pointers */ -} spl_kmem_magazine_t; - -typedef struct spl_kmem_obj { - uint32_t sko_magic; /* Sanity magic */ - void *sko_addr; /* Buffer address */ - struct spl_kmem_slab *sko_slab; /* Owned by slab */ - struct list_head sko_list; /* Free object list linkage */ -} spl_kmem_obj_t; - -typedef struct spl_kmem_slab { - uint32_t sks_magic; /* Sanity magic */ - uint32_t sks_objs; /* Objects per slab */ - struct spl_kmem_cache *sks_cache; /* Owned by cache */ - struct list_head sks_list; /* Slab list linkage */ - struct list_head sks_free_list; /* Free object list */ - unsigned long sks_age; /* Last modify jiffie */ - uint32_t sks_ref; /* Ref count used objects */ -} spl_kmem_slab_t; - -typedef struct spl_kmem_alloc { - struct spl_kmem_cache *ska_cache; /* Owned by cache */ - int ska_flags; /* Allocation flags */ - taskq_ent_t ska_tqe; /* Task queue entry */ -} spl_kmem_alloc_t; - -typedef struct spl_kmem_emergency { - struct rb_node ske_node; /* Emergency tree linkage */ - void *ske_obj; /* Buffer address */ -} spl_kmem_emergency_t; - -typedef struct spl_kmem_cache { - uint32_t skc_magic; /* Sanity magic */ - uint32_t skc_name_size; /* Name length */ - char *skc_name; /* Name string */ - spl_kmem_magazine_t *skc_mag[NR_CPUS]; /* Per-CPU warm cache */ - uint32_t skc_mag_size; /* Magazine size */ - uint32_t skc_mag_refill; /* Magazine refill count */ - spl_kmem_ctor_t skc_ctor; /* Constructor */ - spl_kmem_dtor_t skc_dtor; /* Destructor */ - spl_kmem_reclaim_t skc_reclaim; /* Reclaimator */ - void *skc_private; /* Private data */ - void *skc_vmp; /* Unused */ - struct kmem_cache *skc_linux_cache; /* Linux slab cache if used */ - unsigned long skc_flags; /* Flags */ - uint32_t skc_obj_size; /* Object size */ - uint32_t skc_obj_align; /* Object alignment */ - uint32_t skc_slab_objs; /* Objects per slab */ - uint32_t skc_slab_size; /* Slab size */ - uint32_t skc_delay; /* Slab reclaim interval */ - uint32_t skc_reap; /* Slab reclaim count */ - atomic_t skc_ref; /* Ref count callers */ - taskqid_t skc_taskqid; /* Slab reclaim task */ - struct list_head skc_list; /* List of caches linkage */ - struct list_head skc_complete_list;/* Completely alloc'ed */ - struct list_head skc_partial_list; /* Partially alloc'ed */ - struct rb_root skc_emergency_tree; /* Min sized objects */ - spinlock_t skc_lock; /* Cache lock */ - wait_queue_head_t skc_waitq; /* Allocation waiters */ - uint64_t skc_slab_fail; /* Slab alloc failures */ - uint64_t skc_slab_create;/* Slab creates */ - uint64_t skc_slab_destroy;/* Slab destroys */ - uint64_t skc_slab_total; /* Slab total current */ - uint64_t skc_slab_alloc; /* Slab alloc current */ - uint64_t skc_slab_max; /* Slab max historic */ - uint64_t skc_obj_total; /* Obj total current */ - uint64_t skc_obj_alloc; /* Obj alloc current */ - uint64_t skc_obj_max; /* Obj max historic */ - uint64_t skc_obj_deadlock; /* Obj emergency deadlocks */ - uint64_t skc_obj_emergency; /* Obj emergency current */ - uint64_t skc_obj_emergency_max; /* Obj emergency max */ -} spl_kmem_cache_t; -#define kmem_cache_t spl_kmem_cache_t - -extern spl_kmem_cache_t *spl_kmem_cache_create(char *name, size_t size, - size_t align, spl_kmem_ctor_t ctor, spl_kmem_dtor_t dtor, - spl_kmem_reclaim_t reclaim, void *priv, void *vmp, int flags); -extern void spl_kmem_cache_set_move(spl_kmem_cache_t *, - kmem_cbrc_t (*)(void *, void *, size_t, void *)); -extern void spl_kmem_cache_destroy(spl_kmem_cache_t *skc); -extern void *spl_kmem_cache_alloc(spl_kmem_cache_t *skc, int flags); -extern void spl_kmem_cache_free(spl_kmem_cache_t *skc, void *obj); -extern void spl_kmem_cache_reap_now(spl_kmem_cache_t *skc, int count); -extern void spl_kmem_reap(void); - -int spl_kmem_init(void); -void spl_kmem_fini(void); - -#define kmem_cache_create(name,size,align,ctor,dtor,rclm,priv,vmp,flags) \ - spl_kmem_cache_create(name,size,align,ctor,dtor,rclm,priv,vmp,flags) -#define kmem_cache_set_move(skc, move) spl_kmem_cache_set_move(skc, move) -#define kmem_cache_destroy(skc) spl_kmem_cache_destroy(skc) -#define kmem_cache_alloc(skc, flags) spl_kmem_cache_alloc(skc, flags) -#define kmem_cache_free(skc, obj) spl_kmem_cache_free(skc, obj) -#define kmem_cache_reap_now(skc) \ - spl_kmem_cache_reap_now(skc, skc->skc_reap) -#define kmem_reap() spl_kmem_reap() -#define kmem_virt(ptr) (((ptr) >= (void *)VMALLOC_START) && \ - ((ptr) < (void *)VMALLOC_END)) +extern void *spl_kmem_alloc(size_t sz, int fl, const char *func, int line); +extern void *spl_kmem_zalloc(size_t sz, int fl, const char *func, int line); +extern void spl_kmem_free(const void *ptr, size_t sz); /* - * Allow custom slab allocation flags to be set for KMC_SLAB based caches. - * One use for this function is to ensure the __GFP_COMP flag is part of - * the default allocation mask which ensures higher order allocations are - * properly refcounted. This flag was added to the default ->allocflags - * as of Linux 3.11. + * The following functions are only available for internal use. */ -static inline void -kmem_cache_set_allocflags(spl_kmem_cache_t *skc, gfp_t flags) -{ - if (skc->skc_linux_cache == NULL) - return; - -#if defined(HAVE_KMEM_CACHE_ALLOCFLAGS) - skc->skc_linux_cache->allocflags |= flags; -#elif defined(HAVE_KMEM_CACHE_GFPFLAGS) - skc->skc_linux_cache->gfpflags |= flags; -#endif -} +extern void *spl_kmem_alloc_impl(size_t size, int flags, int node); +extern void *spl_kmem_alloc_debug(size_t size, int flags, int node); +extern void *spl_kmem_alloc_track(size_t size, int flags, + const char *func, int line, int node); +extern void spl_kmem_free_impl(const void *buf, size_t size); +extern void spl_kmem_free_debug(const void *buf, size_t size); +extern void spl_kmem_free_track(const void *buf, size_t size); + +extern int spl_kmem_init(void); +extern void spl_kmem_fini(void); #endif /* _SPL_KMEM_H */ diff --git a/include/sys/kmem_cache.h b/include/sys/kmem_cache.h new file mode 100644 index 000000000000..75a0a55b7d1f --- /dev/null +++ b/include/sys/kmem_cache.h @@ -0,0 +1,240 @@ +/* + * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. + * Copyright (C) 2007 The Regents of the University of California. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * Written by Brian Behlendorf . + * UCRL-CODE-235197 + * + * This file is part of the SPL, Solaris Porting Layer. + * For details, see . + * + * The SPL is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * The SPL is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License along + * with the SPL. If not, see . + */ + +#ifndef _SPL_KMEM_CACHE_H +#define _SPL_KMEM_CACHE_H + +#include + +/* + * Slab allocation interfaces. The SPL slab differs from the standard + * Linux SLAB or SLUB primarily in that each cache may be backed by slabs + * allocated from the physical or virtal memory address space. The virtual + * slabs allow for good behavior when allocation large objects of identical + * size. This slab implementation also supports both constructors and + * destructors which the Linux slab does not. + */ +enum { + KMC_BIT_NOTOUCH = 0, /* Don't update ages */ + KMC_BIT_NODEBUG = 1, /* Default behavior */ + KMC_BIT_NOMAGAZINE = 2, /* XXX: Unsupported */ + KMC_BIT_NOHASH = 3, /* XXX: Unsupported */ + KMC_BIT_QCACHE = 4, /* XXX: Unsupported */ + KMC_BIT_KMEM = 5, /* Use kmem cache */ + KMC_BIT_VMEM = 6, /* Use vmem cache */ + KMC_BIT_SLAB = 7, /* Use Linux slab cache */ + KMC_BIT_OFFSLAB = 8, /* Objects not on slab */ + KMC_BIT_NOEMERGENCY = 9, /* Disable emergency objects */ + KMC_BIT_DEADLOCKED = 14, /* Deadlock detected */ + KMC_BIT_GROWING = 15, /* Growing in progress */ + KMC_BIT_REAPING = 16, /* Reaping in progress */ + KMC_BIT_DESTROY = 17, /* Destroy in progress */ + KMC_BIT_TOTAL = 18, /* Proc handler helper bit */ + KMC_BIT_ALLOC = 19, /* Proc handler helper bit */ + KMC_BIT_MAX = 20, /* Proc handler helper bit */ +}; + +/* kmem move callback return values */ +typedef enum kmem_cbrc { + KMEM_CBRC_YES = 0, /* Object moved */ + KMEM_CBRC_NO = 1, /* Object not moved */ + KMEM_CBRC_LATER = 2, /* Object not moved, try again later */ + KMEM_CBRC_DONT_NEED = 3, /* Neither object is needed */ + KMEM_CBRC_DONT_KNOW = 4, /* Object unknown */ +} kmem_cbrc_t; + +#define KMC_NOTOUCH (1 << KMC_BIT_NOTOUCH) +#define KMC_NODEBUG (1 << KMC_BIT_NODEBUG) +#define KMC_NOMAGAZINE (1 << KMC_BIT_NOMAGAZINE) +#define KMC_NOHASH (1 << KMC_BIT_NOHASH) +#define KMC_QCACHE (1 << KMC_BIT_QCACHE) +#define KMC_KMEM (1 << KMC_BIT_KMEM) +#define KMC_VMEM (1 << KMC_BIT_VMEM) +#define KMC_SLAB (1 << KMC_BIT_SLAB) +#define KMC_OFFSLAB (1 << KMC_BIT_OFFSLAB) +#define KMC_NOEMERGENCY (1 << KMC_BIT_NOEMERGENCY) +#define KMC_DEADLOCKED (1 << KMC_BIT_DEADLOCKED) +#define KMC_GROWING (1 << KMC_BIT_GROWING) +#define KMC_REAPING (1 << KMC_BIT_REAPING) +#define KMC_DESTROY (1 << KMC_BIT_DESTROY) +#define KMC_TOTAL (1 << KMC_BIT_TOTAL) +#define KMC_ALLOC (1 << KMC_BIT_ALLOC) +#define KMC_MAX (1 << KMC_BIT_MAX) + +#define KMC_REAP_CHUNK INT_MAX +#define KMC_DEFAULT_SEEKS 1 + +#define KMC_EXPIRE_AGE 0x1 /* Due to age */ +#define KMC_EXPIRE_MEM 0x2 /* Due to low memory */ + +#define KMC_RECLAIM_ONCE 0x1 /* Force a single shrinker pass */ + +extern unsigned int spl_kmem_cache_expire; +extern struct list_head spl_kmem_cache_list; +extern struct rw_semaphore spl_kmem_cache_sem; + +#define SKM_MAGIC 0x2e2e2e2e +#define SKO_MAGIC 0x20202020 +#define SKS_MAGIC 0x22222222 +#define SKC_MAGIC 0x2c2c2c2c + +#define SPL_KMEM_CACHE_DELAY 15 /* Minimum slab release age */ +#define SPL_KMEM_CACHE_REAP 0 /* Default reap everything */ +#define SPL_KMEM_CACHE_OBJ_PER_SLAB 8 /* Target objects per slab */ +#define SPL_KMEM_CACHE_OBJ_PER_SLAB_MIN 1 /* Minimum objects per slab */ +#define SPL_KMEM_CACHE_ALIGN 8 /* Default object alignment */ +#ifdef _LP64 +#define SPL_KMEM_CACHE_MAX_SIZE 32 /* Max slab size in MB */ +#else +#define SPL_KMEM_CACHE_MAX_SIZE 4 /* Max slab size in MB */ +#endif + +#define SPL_MAX_ORDER (MAX_ORDER - 3) +#define SPL_MAX_ORDER_NR_PAGES (1 << (SPL_MAX_ORDER - 1)) + +#ifdef CONFIG_SLUB +#define SPL_MAX_KMEM_CACHE_ORDER PAGE_ALLOC_COSTLY_ORDER +#define SPL_MAX_KMEM_ORDER_NR_PAGES (1 << (SPL_MAX_KMEM_CACHE_ORDER - 1)) +#else +#define SPL_MAX_KMEM_ORDER_NR_PAGES (KMALLOC_MAX_SIZE >> PAGE_SHIFT) +#endif + +#define POINTER_IS_VALID(p) 0 /* Unimplemented */ +#define POINTER_INVALIDATE(pp) /* Unimplemented */ + +typedef int (*spl_kmem_ctor_t)(void *, void *, int); +typedef void (*spl_kmem_dtor_t)(void *, void *); +typedef void (*spl_kmem_reclaim_t)(void *); + +typedef struct spl_kmem_magazine { + uint32_t skm_magic; /* Sanity magic */ + uint32_t skm_avail; /* Available objects */ + uint32_t skm_size; /* Magazine size */ + uint32_t skm_refill; /* Batch refill size */ + struct spl_kmem_cache *skm_cache; /* Owned by cache */ + unsigned long skm_age; /* Last cache access */ + unsigned int skm_cpu; /* Owned by cpu */ + void *skm_objs[0]; /* Object pointers */ +} spl_kmem_magazine_t; + +typedef struct spl_kmem_obj { + uint32_t sko_magic; /* Sanity magic */ + void *sko_addr; /* Buffer address */ + struct spl_kmem_slab *sko_slab; /* Owned by slab */ + struct list_head sko_list; /* Free object list linkage */ +} spl_kmem_obj_t; + +typedef struct spl_kmem_slab { + uint32_t sks_magic; /* Sanity magic */ + uint32_t sks_objs; /* Objects per slab */ + struct spl_kmem_cache *sks_cache; /* Owned by cache */ + struct list_head sks_list; /* Slab list linkage */ + struct list_head sks_free_list; /* Free object list */ + unsigned long sks_age; /* Last modify jiffie */ + uint32_t sks_ref; /* Ref count used objects */ +} spl_kmem_slab_t; + +typedef struct spl_kmem_alloc { + struct spl_kmem_cache *ska_cache; /* Owned by cache */ + int ska_flags; /* Allocation flags */ + taskq_ent_t ska_tqe; /* Task queue entry */ +} spl_kmem_alloc_t; + +typedef struct spl_kmem_emergency { + struct rb_node ske_node; /* Emergency tree linkage */ + unsigned long ske_obj; /* Buffer address */ +} spl_kmem_emergency_t; + +typedef struct spl_kmem_cache { + uint32_t skc_magic; /* Sanity magic */ + uint32_t skc_name_size; /* Name length */ + char *skc_name; /* Name string */ + spl_kmem_magazine_t *skc_mag[NR_CPUS]; /* Per-CPU warm cache */ + uint32_t skc_mag_size; /* Magazine size */ + uint32_t skc_mag_refill; /* Magazine refill count */ + spl_kmem_ctor_t skc_ctor; /* Constructor */ + spl_kmem_dtor_t skc_dtor; /* Destructor */ + spl_kmem_reclaim_t skc_reclaim; /* Reclaimator */ + void *skc_private; /* Private data */ + void *skc_vmp; /* Unused */ + struct kmem_cache *skc_linux_cache; /* Linux slab cache if used */ + unsigned long skc_flags; /* Flags */ + uint32_t skc_obj_size; /* Object size */ + uint32_t skc_obj_align; /* Object alignment */ + uint32_t skc_slab_objs; /* Objects per slab */ + uint32_t skc_slab_size; /* Slab size */ + uint32_t skc_delay; /* Slab reclaim interval */ + uint32_t skc_reap; /* Slab reclaim count */ + atomic_t skc_ref; /* Ref count callers */ + taskqid_t skc_taskqid; /* Slab reclaim task */ + struct list_head skc_list; /* List of caches linkage */ + struct list_head skc_complete_list; /* Completely alloc'ed */ + struct list_head skc_partial_list; /* Partially alloc'ed */ + struct rb_root skc_emergency_tree; /* Min sized objects */ + spinlock_t skc_lock; /* Cache lock */ + wait_queue_head_t skc_waitq; /* Allocation waiters */ + uint64_t skc_slab_fail; /* Slab alloc failures */ + uint64_t skc_slab_create; /* Slab creates */ + uint64_t skc_slab_destroy; /* Slab destroys */ + uint64_t skc_slab_total; /* Slab total current */ + uint64_t skc_slab_alloc; /* Slab alloc current */ + uint64_t skc_slab_max; /* Slab max historic */ + uint64_t skc_obj_total; /* Obj total current */ + uint64_t skc_obj_alloc; /* Obj alloc current */ + uint64_t skc_obj_max; /* Obj max historic */ + uint64_t skc_obj_deadlock; /* Obj emergency deadlocks */ + uint64_t skc_obj_emergency; /* Obj emergency current */ + uint64_t skc_obj_emergency_max; /* Obj emergency max */ +} spl_kmem_cache_t; +#define kmem_cache_t spl_kmem_cache_t + +extern spl_kmem_cache_t *spl_kmem_cache_create(char *name, size_t size, + size_t align, spl_kmem_ctor_t ctor, spl_kmem_dtor_t dtor, + spl_kmem_reclaim_t reclaim, void *priv, void *vmp, int flags); +extern void spl_kmem_cache_set_move(spl_kmem_cache_t *, + kmem_cbrc_t (*)(void *, void *, size_t, void *)); +extern void spl_kmem_cache_destroy(spl_kmem_cache_t *skc); +extern void *spl_kmem_cache_alloc(spl_kmem_cache_t *skc, int flags); +extern void spl_kmem_cache_free(spl_kmem_cache_t *skc, void *obj); +extern void spl_kmem_cache_set_allocflags(spl_kmem_cache_t *skc, gfp_t flags); +extern void spl_kmem_cache_reap_now(spl_kmem_cache_t *skc, int count); +extern void spl_kmem_reap(void); + +#define kmem_cache_create(name, size, align, ctor, dtor, rclm, priv, vmp, fl) \ + spl_kmem_cache_create(name, size, align, ctor, dtor, rclm, priv, vmp, fl) +#define kmem_cache_set_move(skc, move) spl_kmem_cache_set_move(skc, move) +#define kmem_cache_destroy(skc) spl_kmem_cache_destroy(skc) +#define kmem_cache_alloc(skc, flags) spl_kmem_cache_alloc(skc, flags) +#define kmem_cache_free(skc, obj) spl_kmem_cache_free(skc, obj) +#define kmem_cache_reap_now(skc) \ + spl_kmem_cache_reap_now(skc, skc->skc_reap) +#define kmem_reap() spl_kmem_reap() + +/* + * The following functions are only available for internal use. + */ +extern int spl_kmem_cache_init(void); +extern void spl_kmem_cache_fini(void); + +#endif /* _SPL_KMEM_CACHE_H */ diff --git a/include/sys/types.h b/include/sys/types.h index 3b3a42edebc0..ec0455cdcf66 100644 --- a/include/sys/types.h +++ b/include/sys/types.h @@ -48,7 +48,6 @@ typedef long long longlong_t; typedef long long offset_t; typedef struct task_struct kthread_t; typedef struct task_struct proc_t; -typedef struct vmem { } vmem_t; typedef short pri_t; typedef struct timespec timestruc_t; /* definition per SVr4 */ typedef struct timespec timespec_t; diff --git a/include/sys/vmem.h b/include/sys/vmem.h new file mode 100644 index 000000000000..8aadc9d03b30 --- /dev/null +++ b/include/sys/vmem.h @@ -0,0 +1,109 @@ +/* + * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. + * Copyright (C) 2007 The Regents of the University of California. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * Written by Brian Behlendorf . + * UCRL-CODE-235197 + * + * This file is part of the SPL, Solaris Porting Layer. + * For details, see . + * + * The SPL is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * The SPL is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License along + * with the SPL. If not, see . + */ + +#ifndef _SPL_VMEM_H +#define _SPL_VMEM_H + +#include +#include +#include + +typedef struct vmem { } vmem_t; + +extern vmem_t *heap_arena; +extern vmem_t *zio_alloc_arena; +extern vmem_t *zio_arena; + +extern size_t vmem_size(vmem_t *vmp, int typemask); +extern void *spl_vmalloc(unsigned long size, gfp_t lflags, pgprot_t prot); + +/* + * Memory allocation interfaces + */ +#define VMEM_ALLOC 0x01 +#define VMEM_FREE 0x02 + +#ifndef VMALLOC_TOTAL +#define VMALLOC_TOTAL (VMALLOC_END - VMALLOC_START) +#endif + +/* + * vmem_* is an interface to a low level arena-based memory allocator on + * Illumos that is used to allocate virtual address space. The kmem SLAB + * allocator allocates slabs from it. Then the generic allocation functions + * kmem_{alloc,zalloc,free}() are layered on top of SLAB allocators. + * + * On Linux, the primary means of doing allocations is via kmalloc(), which + * is similarly layered on top of something called the buddy allocator. The + * buddy allocator is not available to kernel modules, it uses physical + * memory addresses rather than virtual memory addresses and is prone to + * fragmentation. + * + * Linux sets aside a relatively small address space for in-kernel virtual + * memory from which allocations can be done using vmalloc(). It might seem + * like a good idea to use vmalloc() to implement something similar to + * Illumos' allocator. However, this has the following problems: + * + * 1. Page directory table allocations are hard coded to use GFP_KERNEL. + * Consequently, any KM_PUSHPAGE or KM_NOSLEEP allocations done using + * vmalloc() will not have proper semantics. + * + * 2. Address space exhaustion is a real issue on 32-bit platforms where + * only a few 100MB are available. The kernel will handle it by spinning + * when it runs out of address space. + * + * 3. All vmalloc() allocations and frees are protected by a single global + * lock which serializes all allocations. + * + * 4. Accessing /proc/meminfo and /proc/vmallocinfo will iterate the entire + * list. The former will sum the allocations while the latter will print + * them to user space in a way that user space can keep the lock held + * indefinitely. When the total number of mapped allocations is large + * (several 100,000) a large amount of time will be spent waiting on locks. + * + * 5. Linux has a wait_on_bit() locking primitive that assumes physical + * memory is used, it simply does not work on virtual memory. Certain + * Linux structures (e.g. the superblock) use them and might be embedded + * into a structure from Illumos. This makes using Linux virtual memory + * unsafe in certain situations. + * + * It follows that we cannot obtain identical semantics to those on Illumos. + * Consequently, we implement the kmem_{alloc,zalloc,free}() functions in + * such a way that they can be used as drop-in replacements for small vmem_* + * allocations (8MB in size or smaller) and map vmem_{alloc,zalloc,free}() + * to them. + */ + +#define vmem_alloc(sz, fl) spl_vmem_alloc((sz), (fl), __func__, __LINE__) +#define vmem_zalloc(sz, fl) spl_vmem_zalloc((sz), (fl), __func__, __LINE__) +#define vmem_free(ptr, sz) spl_vmem_free((ptr), (sz)) + +extern void *spl_vmem_alloc(size_t sz, int fl, const char *func, int line); +extern void *spl_vmem_zalloc(size_t sz, int fl, const char *func, int line); +extern void spl_vmem_free(const void *ptr, size_t sz); + +int spl_vmem_init(void); +void spl_vmem_fini(void); + +#endif /* _SPL_VMEM_H */ diff --git a/include/sys/vmsystm.h b/include/sys/vmsystm.h index c6b86866b0a0..2fa169523ddf 100644 --- a/include/sys/vmsystm.h +++ b/include/sys/vmsystm.h @@ -37,19 +37,6 @@ #define physmem totalram_pages #define freemem nr_free_pages() -extern vmem_t *heap_arena; /* primary kernel heap arena */ -extern vmem_t *zio_alloc_arena; /* arena for zio caches */ -extern vmem_t *zio_arena; /* arena for allocating zio memory */ - -extern size_t vmem_size(vmem_t *vmp, int typemask); - -#define VMEM_ALLOC 0x01 -#define VMEM_FREE 0x02 - -#ifndef VMALLOC_TOTAL -#define VMALLOC_TOTAL (VMALLOC_END - VMALLOC_START) -#endif - #define xcopyin(from, to, size) copy_from_user(to, from, size) #define xcopyout(from, to, size) copy_to_user(to, from, size) diff --git a/man/man5/spl-module-parameters.5 b/man/man5/spl-module-parameters.5 index 33e10b53c8e5..3e7e877fbbbc 100644 --- a/man/man5/spl-module-parameters.5 +++ b/man/man5/spl-module-parameters.5 @@ -17,67 +17,197 @@ Description of the different parameters to the SPL module. .sp .ne 2 .na -\fBspl_debug_subsys\fR (ulong) +\fBspl_kmem_cache_expire\fR (uint) .ad .RS 12n -Subsystem debugging level mask. +Cache expiration is part of default Illumos cache behavior. The idea is +that objects in magazines which have not been recently accessed should be +returned to the slabs periodically. This is known as cache aging and +when enabled objects will be typically returned after 15 seconds. +.sp +On the other hand Linux slabs are designed to never move objects back to +the slabs unless there is memory pressure. This is possible because under +Linux the cache will be notified when memory is low and objects can be +released. +.sp +By default only the Linux method is enabled. It has been shown to improve +responsiveness on low memory systems and not negatively impact the performance +of systems with more memory. This policy may be changed by setting the +\fBspl_kmem_cache_expire\fR bit mask as follows, both policies may be enabled +concurrently. .sp -Default value: \fB~0\fR. +0x01 - Aging (Illumos), 0x02 - Low memory (Linux) +.sp +Default value: \fB0x02\fR .RE .sp .ne 2 .na -\fBspl_debug_mask\fR (ulong) +\fBspl_kmem_cache_reclaim\fR (uint) .ad .RS 12n -Debugging level mask. +When this is set it prevents Linux from being able to rapidly reclaim all the +memory held by the kmem caches. This may be useful in circumstances where +it's preferable that Linux reclaim memory from some other subsystem first. +Setting this will increase the likelihood out of memory events on a memory +constrained system. .sp -Default value: \fB8 | 10 | 4 | 20\fR (SD_ERROR | SD_EMERG | SD_WARNING | SD_CONSOLE). +Default value: \fB0\fR .RE .sp .ne 2 .na -\fBspl_debug_printk\fR (ulong) +\fBspl_kmem_cache_obj_per_slab\fR (uint) .ad .RS 12n -Console printk level mask. +The preferred number of objects per slab in the cache. In general, a larger +value will increase the caches memory footprint while decreasing the time +required to perform an allocation. Conversely, a smaller value will minimize +the footprint and improve cache reclaim time but individual allocations may +take longer. .sp -Default value: \fB8 | 10 | 4 | 20\fR (SD_ERROR | SD_EMERG | SD_WARNING | SD_CONSOLE). +Default value: \fB8\fR .RE .sp .ne 2 .na -\fBspl_debug_mb\fR (int) +\fBspl_kmem_cache_obj_per_slab_min\fR (uint) .ad .RS 12n -Total debug buffer size. +The minimum number of objects allowed per slab. Normally slabs will contain +\fBspl_kmem_cache_obj_per_slab\fR objects but for caches that contain very +large objects it's desirable to only have a few, or even just one, object per +slab. .sp -Default value: \fB-1\fR. +Default value: \fB1\fR .RE .sp .ne 2 .na -\fBspl_debug_panic_on_bug\fR (int) +\fBspl_kmem_cache_max_size\fR (uint) .ad .RS 12n -Panic on BUG +The maximum size of a kmem cache slab in MiB. This effectively limits +the maximum cache object size to \fBspl_kmem_cache_max_size\fR / +\fBspl_kmem_cache_obj_per_slab\fR. Caches may not be created with +object sized larger than this limit. .sp -Use \fB1\fR for yes and \fB0\fR for no (default). +Default value: \fB32 (64-bit) or 4 (32-bit)\fR .RE .sp .ne 2 .na -\fBspl_kmem_cache_expire\fR (uint) +\fBspl_kmem_cache_slab_limit\fR (uint) +.ad +.RS 12n +For small objects the Linux slab allocator should be used to make the most +efficient use of the memory. However, large objects are not supported by +the Linux slab and therefore the SPL implementation is preferred. This +value is used to determine the cutoff between a small and large object. +.sp +Objects of \fBspl_kmem_cache_slab_limit\fR or smaller will be allocated +using the Linux slab allocator, large objects use the SPL allocator. A +cutoff of 16K was determined to be optimal for architectures using 4K pages. +.sp +Default value: \fB16,384\fR +.RE + +.sp +.ne 2 +.na +\fBspl_kmem_cache_kmem_limit\fR (uint) +.ad +.RS 12n +Depending on the size of a cache object it may be backed by kmalloc()'d +or vmalloc()'d memory. This is because the size of the required allocation +greatly impacts the best way to allocate the memory. +.sp +When objects are small and only a small number of memory pages need to be +allocated, ideally just one, then kmalloc() is very efficient. However, +when allocating multiple pages with kmalloc() it gets increasingly expensive +because the pages must be physically contiguous. +.sp +For this reason we shift to vmalloc() for slabs of large objects which +which removes the need for contiguous pages. We cannot use vmalloc() in +all cases because there is significant locking overhead involved. This +function takes a single global lock over the entire virtual address range +which serializes all allocations. Using slightly different allocation +functions for small and large objects allows us to handle a wide range of +object sizes. +.sh +The \fBspl_kmem_cache_kmem_limit\fR value is used to determine this cutoff +size. One quarter the PAGE_SIZE is used as the default value because +\fBspl_kmem_cache_obj_per_slab\fR defaults to 16. This means that at +most we will need to allocate four contiguous pages. +.sp +Default value: \fBPAGE_SIZE/4\fR +.RE + +.sp +.ne 2 +.na +\fBspl_kmem_alloc_warn\fR (uint) .ad .RS 12n -By age (0x1) or low memory (0x2) +As a general rule kmem_alloc() allocations should be small, preferably +just a few pages since they must by physically contiguous. Therefore, a +rate limited warning will be printed to the console for any kmem_alloc() +which exceeds a reasonable threshold. +.sp +The default warning threshold is set to eight pages but capped at 32K to +accommodate systems using large pages. This value was selected to be small +enough to ensure the largest allocations are quickly noticed and fixed. +But large enough to avoid logging any warnings when a allocation size is +larger than optimal but not a serious concern. Since this value is tunable, +developers are encouraged to set it lower when testing so any new largish +allocations are quickly caught. These warnings may be disabled by setting +the threshold to zero. +.sp +Default value: \fB32,768\fR +.RE + .sp -Default value: \fB0\fR. +.ne 2 +.na +\fBspl_kmem_alloc_max\fR (uint) +.ad +.RS 12n +Large kmem_alloc() allocations will fail if they exceed KMALLOC_MAX_SIZE. +Allocations which are marginally smaller than this limit may succeed but +should still be avoided due to the expense of locating a contiguous range +of free pages. Therefore, a maximum kmem size with reasonable safely +margin of 4x is set. Kmem_alloc() allocations larger than this maximum +will quickly fail. Vmem_alloc() allocations less than or equal to this +value will use kmalloc(), but shift to vmalloc() when exceeding this value. +.sp +Default value: \fBKMALLOC_MAX_SIZE/4\fR +.RE + +.sp +.ne 2 +.na +\fBspl_kmem_cache_magazine_size\fR (uint) +.ad +.RS 12n +Cache magazines are an optimization designed to minimize the cost of +allocating memory. They do this by keeping a per-cpu cache of recently +freed objects, which can then be reallocated without taking a lock. This +can improve performance on highly contended caches. However, because +objects in magazines will prevent otherwise empty slabs from being +immediately released this may not be ideal for low memory machines. +.sp +For this reason \fBspl_kmem_cache_magazine_size\fR can be used to set a +maximum magazine size. When this value is set to 0 the magazine size will +be automatically determined based on the object size. Otherwise magazines +will be limited to 2-256 objects per magazine (i.e per cpu). Magazines +may never be entirely disabled in this implementation. +.sp +Default value: \fB0\fR .RE .sp @@ -86,9 +216,12 @@ Default value: \fB0\fR. \fBspl_hostid\fR (ulong) .ad .RS 12n -The system hostid. +The system hostid, when set this can be used to uniquely identify a system. +By default this value is set to zero which indicates the hostid is disabled. +It can be explicitly enabled by placing a unique non-zero value in +\fB/etc/hostid/\fR. .sp -Default value: \fB0xFFFFFFFF\fR (an invalid hostid!) +Default value: \fB0\fR .RE .sp @@ -97,9 +230,10 @@ Default value: \fB0xFFFFFFFF\fR (an invalid hostid!) \fBspl_hostid_path\fR (charp) .ad .RS 12n -The system hostid file +The expected path to locate the system hostid when specified. This value +may be overridden for non-standard configurations. .sp -Default value: \fB/etc/hostid\fR. +Default value: \fB/etc/hostid\fR .RE .sp @@ -108,7 +242,10 @@ Default value: \fB/etc/hostid\fR. \fBspl_taskq_thread_bind\fR (int) .ad .RS 12n -Bind taskq thread to CPU +Bind taskq threads to specific CPUs. When enabled all taskq threads will +be distributed evenly over the available CPUs. By default, this behavior +is disabled to allow the Linux scheduler the maximum flexibility to determine +where a thread should run. .sp -Default value: \fB0\fR. +Default value: \fB0\fR .RE diff --git a/module/spl/Makefile.in b/module/spl/Makefile.in index 9f67ed6465c9..d1742448deb8 100644 --- a/module/spl/Makefile.in +++ b/module/spl/Makefile.in @@ -8,6 +8,8 @@ obj-$(CONFIG_SPL) := $(MODULE).o $(MODULE)-objs += @top_srcdir@/module/spl/spl-proc.o $(MODULE)-objs += @top_srcdir@/module/spl/spl-kmem.o +$(MODULE)-objs += @top_srcdir@/module/spl/spl-kmem-cache.o +$(MODULE)-objs += @top_srcdir@/module/spl/spl-vmem.o $(MODULE)-objs += @top_srcdir@/module/spl/spl-thread.o $(MODULE)-objs += @top_srcdir@/module/spl/spl-taskq.o $(MODULE)-objs += @top_srcdir@/module/spl/spl-rwlock.o diff --git a/module/spl/spl-condvar.c b/module/spl/spl-condvar.c index 2a0052f56988..cebb8f2b10e8 100644 --- a/module/spl/spl-condvar.c +++ b/module/spl/spl-condvar.c @@ -25,6 +25,7 @@ \*****************************************************************************/ #include +#include void __cv_init(kcondvar_t *cvp, char *name, kcv_type_t type, void *arg) diff --git a/module/spl/spl-generic.c b/module/spl/spl-generic.c index 803f03a85938..b706ccecd3ee 100644 --- a/module/spl/spl-generic.c +++ b/module/spl/spl-generic.c @@ -29,6 +29,8 @@ #include #include #include +#include +#include #include #include #include @@ -38,6 +40,7 @@ #include #include #include +#include #include #include #include @@ -479,12 +482,46 @@ zone_get_hostid(void *zone) } EXPORT_SYMBOL(zone_get_hostid); +static int +spl_kvmem_init(void) +{ + int rc = 0; + + rc = spl_kmem_init(); + if (rc) + goto out1; + + rc = spl_vmem_init(); + if (rc) + goto out2; + + rc = spl_kmem_cache_init(); + if (rc) + goto out3; + + return (rc); +out3: + spl_vmem_fini(); +out2: + spl_kmem_fini(); +out1: + return (rc); +} + +static void +spl_kvmem_fini(void) +{ + spl_kmem_cache_fini(); + spl_vmem_fini(); + spl_kmem_fini(); +} + static int __init spl_init(void) { int rc = 0; - if ((rc = spl_kmem_init())) + if ((rc = spl_kvmem_init())) goto out1; if ((rc = spl_mutex_init())) @@ -530,7 +567,7 @@ __init spl_init(void) out3: spl_mutex_fini(); out2: - spl_kmem_fini(); + spl_kvmem_fini(); out1: printk(KERN_NOTICE "SPL: Failed to Load Solaris Porting Layer " "v%s-%s%s, rc = %d\n", SPL_META_VERSION, SPL_META_RELEASE, @@ -552,7 +589,7 @@ spl_fini(void) spl_taskq_fini(); spl_rw_fini(); spl_mutex_fini(); - spl_kmem_fini(); + spl_kvmem_fini(); } /* Called when a dependent module is loaded */ diff --git a/module/spl/spl-kmem-cache.c b/module/spl/spl-kmem-cache.c new file mode 100644 index 000000000000..6fcc7c4e1b61 --- /dev/null +++ b/module/spl/spl-kmem-cache.c @@ -0,0 +1,1733 @@ +/* + * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. + * Copyright (C) 2007 The Regents of the University of California. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * Written by Brian Behlendorf . + * UCRL-CODE-235197 + * + * This file is part of the SPL, Solaris Porting Layer. + * For details, see . + * + * The SPL is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * The SPL is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License along + * with the SPL. If not, see . + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * Within the scope of spl-kmem.c file the kmem_cache_* definitions + * are removed to allow access to the real Linux slab allocator. + */ +#undef kmem_cache_destroy +#undef kmem_cache_create +#undef kmem_cache_alloc +#undef kmem_cache_free + + +/* + * Linux 3.16 replaced smp_mb__{before,after}_{atomic,clear}_{dec,inc,bit}() + * with smp_mb__{before,after}_atomic() because they were redundant. This is + * only used inside our SLAB allocator, so we implement an internal wrapper + * here to give us smp_mb__{before,after}_atomic() on older kernels. + */ +#ifndef smp_mb__before_atomic +#define smp_mb__before_atomic(x) smp_mb__before_clear_bit(x) +#endif + +#ifndef smp_mb__after_atomic +#define smp_mb__after_atomic(x) smp_mb__after_clear_bit(x) +#endif + +/* + * Cache expiration was implemented because it was part of the default Solaris + * kmem_cache behavior. The idea is that per-cpu objects which haven't been + * accessed in several seconds should be returned to the cache. On the other + * hand Linux slabs never move objects back to the slabs unless there is + * memory pressure on the system. By default the Linux method is enabled + * because it has been shown to improve responsiveness on low memory systems. + * This policy may be changed by setting KMC_EXPIRE_AGE or KMC_EXPIRE_MEM. + */ +unsigned int spl_kmem_cache_expire = KMC_EXPIRE_MEM; +EXPORT_SYMBOL(spl_kmem_cache_expire); +module_param(spl_kmem_cache_expire, uint, 0644); +MODULE_PARM_DESC(spl_kmem_cache_expire, "By age (0x1) or low memory (0x2)"); + +/* + * Cache magazines are an optimization designed to minimize the cost of + * allocating memory. They do this by keeping a per-cpu cache of recently + * freed objects, which can then be reallocated without taking a lock. This + * can improve performance on highly contended caches. However, because + * objects in magazines will prevent otherwise empty slabs from being + * immediately released this may not be ideal for low memory machines. + * + * For this reason spl_kmem_cache_magazine_size can be used to set a maximum + * magazine size. When this value is set to 0 the magazine size will be + * automatically determined based on the object size. Otherwise magazines + * will be limited to 2-256 objects per magazine (i.e per cpu). Magazines + * may never be entirely disabled in this implementation. + */ +unsigned int spl_kmem_cache_magazine_size = 0; +module_param(spl_kmem_cache_magazine_size, uint, 0444); +MODULE_PARM_DESC(spl_kmem_cache_magazine_size, + "Default magazine size (2-256), set automatically (0)\n"); + +/* + * The default behavior is to report the number of objects remaining in the + * cache. This allows the Linux VM to repeatedly reclaim objects from the + * cache when memory is low satisfy other memory allocations. Alternately, + * setting this value to KMC_RECLAIM_ONCE limits how aggressively the cache + * is reclaimed. This may increase the likelihood of out of memory events. + */ +unsigned int spl_kmem_cache_reclaim = 0 /* KMC_RECLAIM_ONCE */; +module_param(spl_kmem_cache_reclaim, uint, 0644); +MODULE_PARM_DESC(spl_kmem_cache_reclaim, "Single reclaim pass (0x1)"); + +unsigned int spl_kmem_cache_obj_per_slab = SPL_KMEM_CACHE_OBJ_PER_SLAB; +module_param(spl_kmem_cache_obj_per_slab, uint, 0644); +MODULE_PARM_DESC(spl_kmem_cache_obj_per_slab, "Number of objects per slab"); + +unsigned int spl_kmem_cache_obj_per_slab_min = SPL_KMEM_CACHE_OBJ_PER_SLAB_MIN; +module_param(spl_kmem_cache_obj_per_slab_min, uint, 0644); +MODULE_PARM_DESC(spl_kmem_cache_obj_per_slab_min, + "Minimal number of objects per slab"); + +unsigned int spl_kmem_cache_max_size = SPL_KMEM_CACHE_MAX_SIZE; +module_param(spl_kmem_cache_max_size, uint, 0644); +MODULE_PARM_DESC(spl_kmem_cache_max_size, "Maximum size of slab in MB"); + +/* + * For small objects the Linux slab allocator should be used to make the most + * efficient use of the memory. However, large objects are not supported by + * the Linux slab and therefore the SPL implementation is preferred. A cutoff + * of 16K was determined to be optimal for architectures using 4K pages. + */ +#if PAGE_SIZE == 4096 +unsigned int spl_kmem_cache_slab_limit = 16384; +#else +unsigned int spl_kmem_cache_slab_limit = 0; +#endif +module_param(spl_kmem_cache_slab_limit, uint, 0644); +MODULE_PARM_DESC(spl_kmem_cache_slab_limit, + "Objects less than N bytes use the Linux slab"); + +/* + * This value defaults to a threshold designed to avoid allocations which + * have been deemed costly by the kernel. + */ +unsigned int spl_kmem_cache_kmem_limit = + ((1 << (PAGE_ALLOC_COSTLY_ORDER - 1)) * PAGE_SIZE) / + SPL_KMEM_CACHE_OBJ_PER_SLAB; +module_param(spl_kmem_cache_kmem_limit, uint, 0644); +MODULE_PARM_DESC(spl_kmem_cache_kmem_limit, + "Objects less than N bytes use the kmalloc"); + +/* + * The number of threads available to allocate new slabs for caches. This + * should not need to be tuned but it is available for performance analysis. + */ +unsigned int spl_kmem_cache_kmem_threads = 4; +module_param(spl_kmem_cache_kmem_threads, uint, 0444); +MODULE_PARM_DESC(spl_kmem_cache_kmem_threads, + "Number of spl_kmem_cache threads"); + +/* + * Slab allocation interfaces + * + * While the Linux slab implementation was inspired by the Solaris + * implementation I cannot use it to emulate the Solaris APIs. I + * require two features which are not provided by the Linux slab. + * + * 1) Constructors AND destructors. Recent versions of the Linux + * kernel have removed support for destructors. This is a deal + * breaker for the SPL which contains particularly expensive + * initializers for mutex's, condition variables, etc. We also + * require a minimal level of cleanup for these data types unlike + * many Linux data types which do need to be explicitly destroyed. + * + * 2) Virtual address space backed slab. Callers of the Solaris slab + * expect it to work well for both small are very large allocations. + * Because of memory fragmentation the Linux slab which is backed + * by kmalloc'ed memory performs very badly when confronted with + * large numbers of large allocations. Basing the slab on the + * virtual address space removes the need for contiguous pages + * and greatly improve performance for large allocations. + * + * For these reasons, the SPL has its own slab implementation with + * the needed features. It is not as highly optimized as either the + * Solaris or Linux slabs, but it should get me most of what is + * needed until it can be optimized or obsoleted by another approach. + * + * One serious concern I do have about this method is the relatively + * small virtual address space on 32bit arches. This will seriously + * constrain the size of the slab caches and their performance. + */ + +struct list_head spl_kmem_cache_list; /* List of caches */ +struct rw_semaphore spl_kmem_cache_sem; /* Cache list lock */ +taskq_t *spl_kmem_cache_taskq; /* Task queue for ageing / reclaim */ + +static void spl_cache_shrink(spl_kmem_cache_t *skc, void *obj); + +SPL_SHRINKER_CALLBACK_FWD_DECLARE(spl_kmem_cache_generic_shrinker); +SPL_SHRINKER_DECLARE(spl_kmem_cache_shrinker, + spl_kmem_cache_generic_shrinker, KMC_DEFAULT_SEEKS); + +static void * +kv_alloc(spl_kmem_cache_t *skc, int size, int flags) +{ + gfp_t lflags = kmem_flags_convert(flags); + void *ptr; + + if (skc->skc_flags & KMC_KMEM) { + ASSERT(ISP2(size)); + ptr = (void *)__get_free_pages(lflags, get_order(size)); + } else { + ptr = spl_vmalloc(size, lflags | __GFP_HIGHMEM, PAGE_KERNEL); + } + + /* Resulting allocated memory will be page aligned */ + ASSERT(IS_P2ALIGNED(ptr, PAGE_SIZE)); + + return (ptr); +} + +static void +kv_free(spl_kmem_cache_t *skc, void *ptr, int size) +{ + ASSERT(IS_P2ALIGNED(ptr, PAGE_SIZE)); + + /* + * The Linux direct reclaim path uses this out of band value to + * determine if forward progress is being made. Normally this is + * incremented by kmem_freepages() which is part of the various + * Linux slab implementations. However, since we are using none + * of that infrastructure we are responsible for incrementing it. + */ + if (current->reclaim_state) + current->reclaim_state->reclaimed_slab += size >> PAGE_SHIFT; + + if (skc->skc_flags & KMC_KMEM) { + ASSERT(ISP2(size)); + free_pages((unsigned long)ptr, get_order(size)); + } else { + vfree(ptr); + } +} + +/* + * Required space for each aligned sks. + */ +static inline uint32_t +spl_sks_size(spl_kmem_cache_t *skc) +{ + return (P2ROUNDUP_TYPED(sizeof (spl_kmem_slab_t), + skc->skc_obj_align, uint32_t)); +} + +/* + * Required space for each aligned object. + */ +static inline uint32_t +spl_obj_size(spl_kmem_cache_t *skc) +{ + uint32_t align = skc->skc_obj_align; + + return (P2ROUNDUP_TYPED(skc->skc_obj_size, align, uint32_t) + + P2ROUNDUP_TYPED(sizeof (spl_kmem_obj_t), align, uint32_t)); +} + +/* + * Lookup the spl_kmem_object_t for an object given that object. + */ +static inline spl_kmem_obj_t * +spl_sko_from_obj(spl_kmem_cache_t *skc, void *obj) +{ + return (obj + P2ROUNDUP_TYPED(skc->skc_obj_size, + skc->skc_obj_align, uint32_t)); +} + +/* + * Required space for each offslab object taking in to account alignment + * restrictions and the power-of-two requirement of kv_alloc(). + */ +static inline uint32_t +spl_offslab_size(spl_kmem_cache_t *skc) +{ + return (1UL << (fls64(spl_obj_size(skc)) + 1)); +} + +/* + * It's important that we pack the spl_kmem_obj_t structure and the + * actual objects in to one large address space to minimize the number + * of calls to the allocator. It is far better to do a few large + * allocations and then subdivide it ourselves. Now which allocator + * we use requires balancing a few trade offs. + * + * For small objects we use kmem_alloc() because as long as you are + * only requesting a small number of pages (ideally just one) its cheap. + * However, when you start requesting multiple pages with kmem_alloc() + * it gets increasingly expensive since it requires contiguous pages. + * For this reason we shift to vmem_alloc() for slabs of large objects + * which removes the need for contiguous pages. We do not use + * vmem_alloc() in all cases because there is significant locking + * overhead in __get_vm_area_node(). This function takes a single + * global lock when acquiring an available virtual address range which + * serializes all vmem_alloc()'s for all slab caches. Using slightly + * different allocation functions for small and large objects should + * give us the best of both worlds. + * + * KMC_ONSLAB KMC_OFFSLAB + * + * +------------------------+ +-----------------+ + * | spl_kmem_slab_t --+-+ | | spl_kmem_slab_t |---+-+ + * | skc_obj_size <-+ | | +-----------------+ | | + * | spl_kmem_obj_t | | | | + * | skc_obj_size <---+ | +-----------------+ | | + * | spl_kmem_obj_t | | | skc_obj_size | <-+ | + * | ... v | | spl_kmem_obj_t | | + * +------------------------+ +-----------------+ v + */ +static spl_kmem_slab_t * +spl_slab_alloc(spl_kmem_cache_t *skc, int flags) +{ + spl_kmem_slab_t *sks; + spl_kmem_obj_t *sko, *n; + void *base, *obj; + uint32_t obj_size, offslab_size = 0; + int i, rc = 0; + + base = kv_alloc(skc, skc->skc_slab_size, flags); + if (base == NULL) + return (NULL); + + sks = (spl_kmem_slab_t *)base; + sks->sks_magic = SKS_MAGIC; + sks->sks_objs = skc->skc_slab_objs; + sks->sks_age = jiffies; + sks->sks_cache = skc; + INIT_LIST_HEAD(&sks->sks_list); + INIT_LIST_HEAD(&sks->sks_free_list); + sks->sks_ref = 0; + obj_size = spl_obj_size(skc); + + if (skc->skc_flags & KMC_OFFSLAB) + offslab_size = spl_offslab_size(skc); + + for (i = 0; i < sks->sks_objs; i++) { + if (skc->skc_flags & KMC_OFFSLAB) { + obj = kv_alloc(skc, offslab_size, flags); + if (!obj) { + rc = -ENOMEM; + goto out; + } + } else { + obj = base + spl_sks_size(skc) + (i * obj_size); + } + + ASSERT(IS_P2ALIGNED(obj, skc->skc_obj_align)); + sko = spl_sko_from_obj(skc, obj); + sko->sko_addr = obj; + sko->sko_magic = SKO_MAGIC; + sko->sko_slab = sks; + INIT_LIST_HEAD(&sko->sko_list); + list_add_tail(&sko->sko_list, &sks->sks_free_list); + } + +out: + if (rc) { + if (skc->skc_flags & KMC_OFFSLAB) + list_for_each_entry_safe(sko, + n, &sks->sks_free_list, sko_list) + kv_free(skc, sko->sko_addr, offslab_size); + + kv_free(skc, base, skc->skc_slab_size); + sks = NULL; + } + + return (sks); +} + +/* + * Remove a slab from complete or partial list, it must be called with + * the 'skc->skc_lock' held but the actual free must be performed + * outside the lock to prevent deadlocking on vmem addresses. + */ +static void +spl_slab_free(spl_kmem_slab_t *sks, + struct list_head *sks_list, struct list_head *sko_list) +{ + spl_kmem_cache_t *skc; + + ASSERT(sks->sks_magic == SKS_MAGIC); + ASSERT(sks->sks_ref == 0); + + skc = sks->sks_cache; + ASSERT(skc->skc_magic == SKC_MAGIC); + ASSERT(spin_is_locked(&skc->skc_lock)); + + /* + * Update slab/objects counters in the cache, then remove the + * slab from the skc->skc_partial_list. Finally add the slab + * and all its objects in to the private work lists where the + * destructors will be called and the memory freed to the system. + */ + skc->skc_obj_total -= sks->sks_objs; + skc->skc_slab_total--; + list_del(&sks->sks_list); + list_add(&sks->sks_list, sks_list); + list_splice_init(&sks->sks_free_list, sko_list); +} + +/* + * Reclaim empty slabs at the end of the partial list. + */ +static void +spl_slab_reclaim(spl_kmem_cache_t *skc) +{ + spl_kmem_slab_t *sks, *m; + spl_kmem_obj_t *sko, *n; + LIST_HEAD(sks_list); + LIST_HEAD(sko_list); + uint32_t size = 0; + + /* + * Empty slabs and objects must be moved to a private list so they + * can be safely freed outside the spin lock. All empty slabs are + * at the end of skc->skc_partial_list, therefore once a non-empty + * slab is found we can stop scanning. + */ + spin_lock(&skc->skc_lock); + list_for_each_entry_safe_reverse(sks, m, + &skc->skc_partial_list, sks_list) { + + if (sks->sks_ref > 0) + break; + + spl_slab_free(sks, &sks_list, &sko_list); + } + spin_unlock(&skc->skc_lock); + + /* + * The following two loops ensure all the object destructors are + * run, any offslab objects are freed, and the slabs themselves + * are freed. This is all done outside the skc->skc_lock since + * this allows the destructor to sleep, and allows us to perform + * a conditional reschedule when a freeing a large number of + * objects and slabs back to the system. + */ + if (skc->skc_flags & KMC_OFFSLAB) + size = spl_offslab_size(skc); + + list_for_each_entry_safe(sko, n, &sko_list, sko_list) { + ASSERT(sko->sko_magic == SKO_MAGIC); + + if (skc->skc_flags & KMC_OFFSLAB) + kv_free(skc, sko->sko_addr, size); + } + + list_for_each_entry_safe(sks, m, &sks_list, sks_list) { + ASSERT(sks->sks_magic == SKS_MAGIC); + kv_free(skc, sks, skc->skc_slab_size); + } +} + +static spl_kmem_emergency_t * +spl_emergency_search(struct rb_root *root, void *obj) +{ + struct rb_node *node = root->rb_node; + spl_kmem_emergency_t *ske; + unsigned long address = (unsigned long)obj; + + while (node) { + ske = container_of(node, spl_kmem_emergency_t, ske_node); + + if (address < ske->ske_obj) + node = node->rb_left; + else if (address > ske->ske_obj) + node = node->rb_right; + else + return (ske); + } + + return (NULL); +} + +static int +spl_emergency_insert(struct rb_root *root, spl_kmem_emergency_t *ske) +{ + struct rb_node **new = &(root->rb_node), *parent = NULL; + spl_kmem_emergency_t *ske_tmp; + unsigned long address = ske->ske_obj; + + while (*new) { + ske_tmp = container_of(*new, spl_kmem_emergency_t, ske_node); + + parent = *new; + if (address < ske_tmp->ske_obj) + new = &((*new)->rb_left); + else if (address > ske_tmp->ske_obj) + new = &((*new)->rb_right); + else + return (0); + } + + rb_link_node(&ske->ske_node, parent, new); + rb_insert_color(&ske->ske_node, root); + + return (1); +} + +/* + * Allocate a single emergency object and track it in a red black tree. + */ +static int +spl_emergency_alloc(spl_kmem_cache_t *skc, int flags, void **obj) +{ + gfp_t lflags = kmem_flags_convert(flags); + spl_kmem_emergency_t *ske; + int order = get_order(skc->skc_obj_size); + int empty; + + /* Last chance use a partial slab if one now exists */ + spin_lock(&skc->skc_lock); + empty = list_empty(&skc->skc_partial_list); + spin_unlock(&skc->skc_lock); + if (!empty) + return (-EEXIST); + + ske = kmalloc(sizeof (*ske), lflags); + if (ske == NULL) + return (-ENOMEM); + + ske->ske_obj = __get_free_pages(lflags, order); + if (ske->ske_obj == 0) { + kfree(ske); + return (-ENOMEM); + } + + spin_lock(&skc->skc_lock); + empty = spl_emergency_insert(&skc->skc_emergency_tree, ske); + if (likely(empty)) { + skc->skc_obj_total++; + skc->skc_obj_emergency++; + if (skc->skc_obj_emergency > skc->skc_obj_emergency_max) + skc->skc_obj_emergency_max = skc->skc_obj_emergency; + } + spin_unlock(&skc->skc_lock); + + if (unlikely(!empty)) { + free_pages(ske->ske_obj, order); + kfree(ske); + return (-EINVAL); + } + + *obj = (void *)ske->ske_obj; + + return (0); +} + +/* + * Locate the passed object in the red black tree and free it. + */ +static int +spl_emergency_free(spl_kmem_cache_t *skc, void *obj) +{ + spl_kmem_emergency_t *ske; + int order = get_order(skc->skc_obj_size); + + spin_lock(&skc->skc_lock); + ske = spl_emergency_search(&skc->skc_emergency_tree, obj); + if (ske) { + rb_erase(&ske->ske_node, &skc->skc_emergency_tree); + skc->skc_obj_emergency--; + skc->skc_obj_total--; + } + spin_unlock(&skc->skc_lock); + + if (ske == NULL) + return (-ENOENT); + + free_pages(ske->ske_obj, order); + kfree(ske); + + return (0); +} + +/* + * Release objects from the per-cpu magazine back to their slab. The flush + * argument contains the max number of entries to remove from the magazine. + */ +static void +__spl_cache_flush(spl_kmem_cache_t *skc, spl_kmem_magazine_t *skm, int flush) +{ + int i, count = MIN(flush, skm->skm_avail); + + ASSERT(skc->skc_magic == SKC_MAGIC); + ASSERT(skm->skm_magic == SKM_MAGIC); + ASSERT(spin_is_locked(&skc->skc_lock)); + + for (i = 0; i < count; i++) + spl_cache_shrink(skc, skm->skm_objs[i]); + + skm->skm_avail -= count; + memmove(skm->skm_objs, &(skm->skm_objs[count]), + sizeof (void *) * skm->skm_avail); +} + +static void +spl_cache_flush(spl_kmem_cache_t *skc, spl_kmem_magazine_t *skm, int flush) +{ + spin_lock(&skc->skc_lock); + __spl_cache_flush(skc, skm, flush); + spin_unlock(&skc->skc_lock); +} + +static void +spl_magazine_age(void *data) +{ + spl_kmem_cache_t *skc = (spl_kmem_cache_t *)data; + spl_kmem_magazine_t *skm = skc->skc_mag[smp_processor_id()]; + + ASSERT(skm->skm_magic == SKM_MAGIC); + ASSERT(skm->skm_cpu == smp_processor_id()); + ASSERT(irqs_disabled()); + + /* There are no available objects or they are too young to age out */ + if ((skm->skm_avail == 0) || + time_before(jiffies, skm->skm_age + skc->skc_delay * HZ)) + return; + + /* + * Because we're executing in interrupt context we may have + * interrupted the holder of this lock. To avoid a potential + * deadlock return if the lock is contended. + */ + if (!spin_trylock(&skc->skc_lock)) + return; + + __spl_cache_flush(skc, skm, skm->skm_refill); + spin_unlock(&skc->skc_lock); +} + +/* + * Called regularly to keep a downward pressure on the cache. + * + * Objects older than skc->skc_delay seconds in the per-cpu magazines will + * be returned to the caches. This is done to prevent idle magazines from + * holding memory which could be better used elsewhere. The delay is + * present to prevent thrashing the magazine. + * + * The newly released objects may result in empty partial slabs. Those + * slabs should be released to the system. Otherwise moving the objects + * out of the magazines is just wasted work. + */ +static void +spl_cache_age(void *data) +{ + spl_kmem_cache_t *skc = (spl_kmem_cache_t *)data; + taskqid_t id = 0; + + ASSERT(skc->skc_magic == SKC_MAGIC); + + /* Dynamically disabled at run time */ + if (!(spl_kmem_cache_expire & KMC_EXPIRE_AGE)) + return; + + atomic_inc(&skc->skc_ref); + + if (!(skc->skc_flags & KMC_NOMAGAZINE)) + on_each_cpu(spl_magazine_age, skc, 1); + + spl_slab_reclaim(skc); + + while (!test_bit(KMC_BIT_DESTROY, &skc->skc_flags) && !id) { + id = taskq_dispatch_delay( + spl_kmem_cache_taskq, spl_cache_age, skc, TQ_SLEEP, + ddi_get_lbolt() + skc->skc_delay / 3 * HZ); + + /* Destroy issued after dispatch immediately cancel it */ + if (test_bit(KMC_BIT_DESTROY, &skc->skc_flags) && id) + taskq_cancel_id(spl_kmem_cache_taskq, id); + } + + spin_lock(&skc->skc_lock); + skc->skc_taskqid = id; + spin_unlock(&skc->skc_lock); + + atomic_dec(&skc->skc_ref); +} + +/* + * Size a slab based on the size of each aligned object plus spl_kmem_obj_t. + * When on-slab we want to target spl_kmem_cache_obj_per_slab. However, + * for very small objects we may end up with more than this so as not + * to waste space in the minimal allocation of a single page. Also for + * very large objects we may use as few as spl_kmem_cache_obj_per_slab_min, + * lower than this and we will fail. + */ +static int +spl_slab_size(spl_kmem_cache_t *skc, uint32_t *objs, uint32_t *size) +{ + uint32_t sks_size, obj_size, max_size, tgt_size, tgt_objs; + + if (skc->skc_flags & KMC_OFFSLAB) { + tgt_objs = spl_kmem_cache_obj_per_slab; + tgt_size = P2ROUNDUP(sizeof (spl_kmem_slab_t), PAGE_SIZE); + + if ((skc->skc_flags & KMC_KMEM) && + (spl_obj_size(skc) > (SPL_MAX_ORDER_NR_PAGES * PAGE_SIZE))) + return (-ENOSPC); + } else { + sks_size = spl_sks_size(skc); + obj_size = spl_obj_size(skc); + max_size = (spl_kmem_cache_max_size * 1024 * 1024); + tgt_size = (spl_kmem_cache_obj_per_slab * obj_size + sks_size); + + /* + * KMC_KMEM slabs are allocated by __get_free_pages() which + * rounds up to the nearest order. Knowing this the size + * should be rounded up to the next power of two with a hard + * maximum defined by the maximum allowed allocation order. + */ + if (skc->skc_flags & KMC_KMEM) { + max_size = SPL_MAX_ORDER_NR_PAGES * PAGE_SIZE; + tgt_size = MIN(max_size, + PAGE_SIZE * (1 << MAX(get_order(tgt_size) - 1, 1))); + } + + if (tgt_size <= max_size) { + tgt_objs = (tgt_size - sks_size) / obj_size; + } else { + tgt_objs = (max_size - sks_size) / obj_size; + tgt_size = (tgt_objs * obj_size) + sks_size; + } + } + + if (tgt_objs == 0) + return (-ENOSPC); + + *objs = tgt_objs; + *size = tgt_size; + + return (0); +} + +/* + * Make a guess at reasonable per-cpu magazine size based on the size of + * each object and the cost of caching N of them in each magazine. Long + * term this should really adapt based on an observed usage heuristic. + */ +static int +spl_magazine_size(spl_kmem_cache_t *skc) +{ + uint32_t obj_size = spl_obj_size(skc); + int size; + + if (spl_kmem_cache_magazine_size > 0) + return (MAX(MIN(spl_kmem_cache_magazine_size, 256), 2)); + + /* Per-magazine sizes below assume a 4Kib page size */ + if (obj_size > (PAGE_SIZE * 256)) + size = 4; /* Minimum 4Mib per-magazine */ + else if (obj_size > (PAGE_SIZE * 32)) + size = 16; /* Minimum 2Mib per-magazine */ + else if (obj_size > (PAGE_SIZE)) + size = 64; /* Minimum 256Kib per-magazine */ + else if (obj_size > (PAGE_SIZE / 4)) + size = 128; /* Minimum 128Kib per-magazine */ + else + size = 256; + + return (size); +} + +/* + * Allocate a per-cpu magazine to associate with a specific core. + */ +static spl_kmem_magazine_t * +spl_magazine_alloc(spl_kmem_cache_t *skc, int cpu) +{ + spl_kmem_magazine_t *skm; + int size = sizeof (spl_kmem_magazine_t) + + sizeof (void *) * skc->skc_mag_size; + + skm = kmalloc_node(size, GFP_KERNEL, cpu_to_node(cpu)); + if (skm) { + skm->skm_magic = SKM_MAGIC; + skm->skm_avail = 0; + skm->skm_size = skc->skc_mag_size; + skm->skm_refill = skc->skc_mag_refill; + skm->skm_cache = skc; + skm->skm_age = jiffies; + skm->skm_cpu = cpu; + } + + return (skm); +} + +/* + * Free a per-cpu magazine associated with a specific core. + */ +static void +spl_magazine_free(spl_kmem_magazine_t *skm) +{ + ASSERT(skm->skm_magic == SKM_MAGIC); + ASSERT(skm->skm_avail == 0); + kfree(skm); +} + +/* + * Create all pre-cpu magazines of reasonable sizes. + */ +static int +spl_magazine_create(spl_kmem_cache_t *skc) +{ + int i; + + if (skc->skc_flags & KMC_NOMAGAZINE) + return (0); + + skc->skc_mag_size = spl_magazine_size(skc); + skc->skc_mag_refill = (skc->skc_mag_size + 1) / 2; + + for_each_online_cpu(i) { + skc->skc_mag[i] = spl_magazine_alloc(skc, i); + if (!skc->skc_mag[i]) { + for (i--; i >= 0; i--) + spl_magazine_free(skc->skc_mag[i]); + + return (-ENOMEM); + } + } + + return (0); +} + +/* + * Destroy all pre-cpu magazines. + */ +static void +spl_magazine_destroy(spl_kmem_cache_t *skc) +{ + spl_kmem_magazine_t *skm; + int i; + + if (skc->skc_flags & KMC_NOMAGAZINE) + return; + + for_each_online_cpu(i) { + skm = skc->skc_mag[i]; + spl_cache_flush(skc, skm, skm->skm_avail); + spl_magazine_free(skm); + } +} + +/* + * Create a object cache based on the following arguments: + * name cache name + * size cache object size + * align cache object alignment + * ctor cache object constructor + * dtor cache object destructor + * reclaim cache object reclaim + * priv cache private data for ctor/dtor/reclaim + * vmp unused must be NULL + * flags + * KMC_NOTOUCH Disable cache object aging (unsupported) + * KMC_NODEBUG Disable debugging (unsupported) + * KMC_NOHASH Disable hashing (unsupported) + * KMC_QCACHE Disable qcache (unsupported) + * KMC_NOMAGAZINE Enabled for kmem/vmem, Disabled for Linux slab + * KMC_KMEM Force kmem backed cache + * KMC_VMEM Force vmem backed cache + * KMC_SLAB Force Linux slab backed cache + * KMC_OFFSLAB Locate objects off the slab + */ +spl_kmem_cache_t * +spl_kmem_cache_create(char *name, size_t size, size_t align, + spl_kmem_ctor_t ctor, spl_kmem_dtor_t dtor, spl_kmem_reclaim_t reclaim, + void *priv, void *vmp, int flags) +{ + gfp_t lflags = kmem_flags_convert(KM_SLEEP); + spl_kmem_cache_t *skc; + int rc; + + /* + * Unsupported flags + */ + ASSERT0(flags & KMC_NOMAGAZINE); + ASSERT0(flags & KMC_NOHASH); + ASSERT0(flags & KMC_QCACHE); + ASSERT(vmp == NULL); + + might_sleep(); + + /* + * Allocate memory for a new cache and initialize it. Unfortunately, + * this usually ends up being a large allocation of ~32k because + * we need to allocate enough memory for the worst case number of + * cpus in the magazine, skc_mag[NR_CPUS]. + */ + skc = kzalloc(sizeof (*skc), lflags); + if (skc == NULL) + return (NULL); + + skc->skc_magic = SKC_MAGIC; + skc->skc_name_size = strlen(name) + 1; + skc->skc_name = (char *)kmalloc(skc->skc_name_size, lflags); + if (skc->skc_name == NULL) { + kfree(skc); + return (NULL); + } + strncpy(skc->skc_name, name, skc->skc_name_size); + + skc->skc_ctor = ctor; + skc->skc_dtor = dtor; + skc->skc_reclaim = reclaim; + skc->skc_private = priv; + skc->skc_vmp = vmp; + skc->skc_linux_cache = NULL; + skc->skc_flags = flags; + skc->skc_obj_size = size; + skc->skc_obj_align = SPL_KMEM_CACHE_ALIGN; + skc->skc_delay = SPL_KMEM_CACHE_DELAY; + skc->skc_reap = SPL_KMEM_CACHE_REAP; + atomic_set(&skc->skc_ref, 0); + + INIT_LIST_HEAD(&skc->skc_list); + INIT_LIST_HEAD(&skc->skc_complete_list); + INIT_LIST_HEAD(&skc->skc_partial_list); + skc->skc_emergency_tree = RB_ROOT; + spin_lock_init(&skc->skc_lock); + init_waitqueue_head(&skc->skc_waitq); + skc->skc_slab_fail = 0; + skc->skc_slab_create = 0; + skc->skc_slab_destroy = 0; + skc->skc_slab_total = 0; + skc->skc_slab_alloc = 0; + skc->skc_slab_max = 0; + skc->skc_obj_total = 0; + skc->skc_obj_alloc = 0; + skc->skc_obj_max = 0; + skc->skc_obj_deadlock = 0; + skc->skc_obj_emergency = 0; + skc->skc_obj_emergency_max = 0; + + /* + * Verify the requested alignment restriction is sane. + */ + if (align) { + VERIFY(ISP2(align)); + VERIFY3U(align, >=, SPL_KMEM_CACHE_ALIGN); + VERIFY3U(align, <=, PAGE_SIZE); + skc->skc_obj_align = align; + } + + /* + * When no specific type of slab is requested (kmem, vmem, or + * linuxslab) then select a cache type based on the object size + * and default tunables. + */ + if (!(skc->skc_flags & (KMC_KMEM | KMC_VMEM | KMC_SLAB))) { + + /* + * Objects smaller than spl_kmem_cache_slab_limit can + * use the Linux slab for better space-efficiency. By + * default this functionality is disabled until its + * performance characteristics are fully understood. + */ + if (spl_kmem_cache_slab_limit && + size <= (size_t)spl_kmem_cache_slab_limit) + skc->skc_flags |= KMC_SLAB; + + /* + * Small objects, less than spl_kmem_cache_kmem_limit per + * object should use kmem because their slabs are small. + */ + else if (spl_obj_size(skc) <= spl_kmem_cache_kmem_limit) + skc->skc_flags |= KMC_KMEM; + + /* + * All other objects are considered large and are placed + * on vmem backed slabs. + */ + else + skc->skc_flags |= KMC_VMEM; + } + + /* + * Given the type of slab allocate the required resources. + */ + if (skc->skc_flags & (KMC_KMEM | KMC_VMEM)) { + rc = spl_slab_size(skc, + &skc->skc_slab_objs, &skc->skc_slab_size); + if (rc) + goto out; + + rc = spl_magazine_create(skc); + if (rc) + goto out; + } else { + if (size > (SPL_MAX_KMEM_ORDER_NR_PAGES * PAGE_SIZE)) { + rc = EINVAL; + goto out; + } + + skc->skc_linux_cache = kmem_cache_create( + skc->skc_name, size, align, 0, NULL); + if (skc->skc_linux_cache == NULL) { + rc = ENOMEM; + goto out; + } + +#if defined(HAVE_KMEM_CACHE_ALLOCFLAGS) + skc->skc_linux_cache->allocflags |= __GFP_COMP; +#elif defined(HAVE_KMEM_CACHE_GFPFLAGS) + skc->skc_linux_cache->gfpflags |= __GFP_COMP; +#endif + skc->skc_flags |= KMC_NOMAGAZINE; + } + + if (spl_kmem_cache_expire & KMC_EXPIRE_AGE) + skc->skc_taskqid = taskq_dispatch_delay(spl_kmem_cache_taskq, + spl_cache_age, skc, TQ_SLEEP, + ddi_get_lbolt() + skc->skc_delay / 3 * HZ); + + down_write(&spl_kmem_cache_sem); + list_add_tail(&skc->skc_list, &spl_kmem_cache_list); + up_write(&spl_kmem_cache_sem); + + return (skc); +out: + kfree(skc->skc_name); + kfree(skc); + return (NULL); +} +EXPORT_SYMBOL(spl_kmem_cache_create); + +/* + * Register a move callback for cache defragmentation. + * XXX: Unimplemented but harmless to stub out for now. + */ +void +spl_kmem_cache_set_move(spl_kmem_cache_t *skc, + kmem_cbrc_t (move)(void *, void *, size_t, void *)) +{ + ASSERT(move != NULL); +} +EXPORT_SYMBOL(spl_kmem_cache_set_move); + +/* + * Destroy a cache and all objects associated with the cache. + */ +void +spl_kmem_cache_destroy(spl_kmem_cache_t *skc) +{ + DECLARE_WAIT_QUEUE_HEAD(wq); + taskqid_t id; + + ASSERT(skc->skc_magic == SKC_MAGIC); + ASSERT(skc->skc_flags & (KMC_KMEM | KMC_VMEM | KMC_SLAB)); + + down_write(&spl_kmem_cache_sem); + list_del_init(&skc->skc_list); + up_write(&spl_kmem_cache_sem); + + /* Cancel any and wait for any pending delayed tasks */ + VERIFY(!test_and_set_bit(KMC_BIT_DESTROY, &skc->skc_flags)); + + spin_lock(&skc->skc_lock); + id = skc->skc_taskqid; + spin_unlock(&skc->skc_lock); + + taskq_cancel_id(spl_kmem_cache_taskq, id); + + /* + * Wait until all current callers complete, this is mainly + * to catch the case where a low memory situation triggers a + * cache reaping action which races with this destroy. + */ + wait_event(wq, atomic_read(&skc->skc_ref) == 0); + + if (skc->skc_flags & (KMC_KMEM | KMC_VMEM)) { + spl_magazine_destroy(skc); + spl_slab_reclaim(skc); + } else { + ASSERT(skc->skc_flags & KMC_SLAB); + kmem_cache_destroy(skc->skc_linux_cache); + } + + spin_lock(&skc->skc_lock); + + /* + * Validate there are no objects in use and free all the + * spl_kmem_slab_t, spl_kmem_obj_t, and object buffers. + */ + ASSERT3U(skc->skc_slab_alloc, ==, 0); + ASSERT3U(skc->skc_obj_alloc, ==, 0); + ASSERT3U(skc->skc_slab_total, ==, 0); + ASSERT3U(skc->skc_obj_total, ==, 0); + ASSERT3U(skc->skc_obj_emergency, ==, 0); + ASSERT(list_empty(&skc->skc_complete_list)); + + spin_unlock(&skc->skc_lock); + + kfree(skc->skc_name); + kfree(skc); +} +EXPORT_SYMBOL(spl_kmem_cache_destroy); + +/* + * Allocate an object from a slab attached to the cache. This is used to + * repopulate the per-cpu magazine caches in batches when they run low. + */ +static void * +spl_cache_obj(spl_kmem_cache_t *skc, spl_kmem_slab_t *sks) +{ + spl_kmem_obj_t *sko; + + ASSERT(skc->skc_magic == SKC_MAGIC); + ASSERT(sks->sks_magic == SKS_MAGIC); + ASSERT(spin_is_locked(&skc->skc_lock)); + + sko = list_entry(sks->sks_free_list.next, spl_kmem_obj_t, sko_list); + ASSERT(sko->sko_magic == SKO_MAGIC); + ASSERT(sko->sko_addr != NULL); + + /* Remove from sks_free_list */ + list_del_init(&sko->sko_list); + + sks->sks_age = jiffies; + sks->sks_ref++; + skc->skc_obj_alloc++; + + /* Track max obj usage statistics */ + if (skc->skc_obj_alloc > skc->skc_obj_max) + skc->skc_obj_max = skc->skc_obj_alloc; + + /* Track max slab usage statistics */ + if (sks->sks_ref == 1) { + skc->skc_slab_alloc++; + + if (skc->skc_slab_alloc > skc->skc_slab_max) + skc->skc_slab_max = skc->skc_slab_alloc; + } + + return (sko->sko_addr); +} + +/* + * Generic slab allocation function to run by the global work queues. + * It is responsible for allocating a new slab, linking it in to the list + * of partial slabs, and then waking any waiters. + */ +static void +spl_cache_grow_work(void *data) +{ + spl_kmem_alloc_t *ska = (spl_kmem_alloc_t *)data; + spl_kmem_cache_t *skc = ska->ska_cache; + spl_kmem_slab_t *sks; + +#if defined(PF_MEMALLOC_NOIO) + unsigned noio_flag = memalloc_noio_save(); + sks = spl_slab_alloc(skc, ska->ska_flags); + memalloc_noio_restore(noio_flag); +#else + fstrans_cookie_t cookie = spl_fstrans_mark(); + sks = spl_slab_alloc(skc, ska->ska_flags); + spl_fstrans_unmark(cookie); +#endif + spin_lock(&skc->skc_lock); + if (sks) { + skc->skc_slab_total++; + skc->skc_obj_total += sks->sks_objs; + list_add_tail(&sks->sks_list, &skc->skc_partial_list); + } + + atomic_dec(&skc->skc_ref); + smp_mb__before_atomic(); + clear_bit(KMC_BIT_GROWING, &skc->skc_flags); + clear_bit(KMC_BIT_DEADLOCKED, &skc->skc_flags); + smp_mb__after_atomic(); + wake_up_all(&skc->skc_waitq); + spin_unlock(&skc->skc_lock); + + kfree(ska); +} + +/* + * Returns non-zero when a new slab should be available. + */ +static int +spl_cache_grow_wait(spl_kmem_cache_t *skc) +{ + return (!test_bit(KMC_BIT_GROWING, &skc->skc_flags)); +} + +/* + * No available objects on any slabs, create a new slab. Note that this + * functionality is disabled for KMC_SLAB caches which are backed by the + * Linux slab. + */ +static int +spl_cache_grow(spl_kmem_cache_t *skc, int flags, void **obj) +{ + int remaining, rc = 0; + + ASSERT0(flags & ~KM_PUBLIC_MASK); + ASSERT(skc->skc_magic == SKC_MAGIC); + ASSERT((skc->skc_flags & KMC_SLAB) == 0); + might_sleep(); + *obj = NULL; + + /* + * Before allocating a new slab wait for any reaping to complete and + * then return so the local magazine can be rechecked for new objects. + */ + if (test_bit(KMC_BIT_REAPING, &skc->skc_flags)) { + rc = spl_wait_on_bit(&skc->skc_flags, KMC_BIT_REAPING, + TASK_UNINTERRUPTIBLE); + return (rc ? rc : -EAGAIN); + } + + /* + * This is handled by dispatching a work request to the global work + * queue. This allows us to asynchronously allocate a new slab while + * retaining the ability to safely fall back to a smaller synchronous + * allocations to ensure forward progress is always maintained. + */ + if (test_and_set_bit(KMC_BIT_GROWING, &skc->skc_flags) == 0) { + spl_kmem_alloc_t *ska; + + ska = kmalloc(sizeof (*ska), kmem_flags_convert(flags)); + if (ska == NULL) { + clear_bit_unlock(KMC_BIT_GROWING, &skc->skc_flags); + smp_mb__after_atomic(); + wake_up_all(&skc->skc_waitq); + return (-ENOMEM); + } + + atomic_inc(&skc->skc_ref); + ska->ska_cache = skc; + ska->ska_flags = flags; + taskq_init_ent(&ska->ska_tqe); + taskq_dispatch_ent(spl_kmem_cache_taskq, + spl_cache_grow_work, ska, 0, &ska->ska_tqe); + } + + /* + * The goal here is to only detect the rare case where a virtual slab + * allocation has deadlocked. We must be careful to minimize the use + * of emergency objects which are more expensive to track. Therefore, + * we set a very long timeout for the asynchronous allocation and if + * the timeout is reached the cache is flagged as deadlocked. From + * this point only new emergency objects will be allocated until the + * asynchronous allocation completes and clears the deadlocked flag. + */ + if (test_bit(KMC_BIT_DEADLOCKED, &skc->skc_flags)) { + rc = spl_emergency_alloc(skc, flags, obj); + } else { + remaining = wait_event_timeout(skc->skc_waitq, + spl_cache_grow_wait(skc), HZ / 10); + + if (!remaining) { + spin_lock(&skc->skc_lock); + if (test_bit(KMC_BIT_GROWING, &skc->skc_flags)) { + set_bit(KMC_BIT_DEADLOCKED, &skc->skc_flags); + skc->skc_obj_deadlock++; + } + spin_unlock(&skc->skc_lock); + } + + rc = -ENOMEM; + } + + return (rc); +} + +/* + * Refill a per-cpu magazine with objects from the slabs for this cache. + * Ideally the magazine can be repopulated using existing objects which have + * been released, however if we are unable to locate enough free objects new + * slabs of objects will be created. On success NULL is returned, otherwise + * the address of a single emergency object is returned for use by the caller. + */ +static void * +spl_cache_refill(spl_kmem_cache_t *skc, spl_kmem_magazine_t *skm, int flags) +{ + spl_kmem_slab_t *sks; + int count = 0, rc, refill; + void *obj = NULL; + + ASSERT(skc->skc_magic == SKC_MAGIC); + ASSERT(skm->skm_magic == SKM_MAGIC); + + refill = MIN(skm->skm_refill, skm->skm_size - skm->skm_avail); + spin_lock(&skc->skc_lock); + + while (refill > 0) { + /* No slabs available we may need to grow the cache */ + if (list_empty(&skc->skc_partial_list)) { + spin_unlock(&skc->skc_lock); + + local_irq_enable(); + rc = spl_cache_grow(skc, flags, &obj); + local_irq_disable(); + + /* Emergency object for immediate use by caller */ + if (rc == 0 && obj != NULL) + return (obj); + + if (rc) + goto out; + + /* Rescheduled to different CPU skm is not local */ + if (skm != skc->skc_mag[smp_processor_id()]) + goto out; + + /* + * Potentially rescheduled to the same CPU but + * allocations may have occurred from this CPU while + * we were sleeping so recalculate max refill. + */ + refill = MIN(refill, skm->skm_size - skm->skm_avail); + + spin_lock(&skc->skc_lock); + continue; + } + + /* Grab the next available slab */ + sks = list_entry((&skc->skc_partial_list)->next, + spl_kmem_slab_t, sks_list); + ASSERT(sks->sks_magic == SKS_MAGIC); + ASSERT(sks->sks_ref < sks->sks_objs); + ASSERT(!list_empty(&sks->sks_free_list)); + + /* + * Consume as many objects as needed to refill the requested + * cache. We must also be careful not to overfill it. + */ + while (sks->sks_ref < sks->sks_objs && refill-- > 0 && + ++count) { + ASSERT(skm->skm_avail < skm->skm_size); + ASSERT(count < skm->skm_size); + skm->skm_objs[skm->skm_avail++] = + spl_cache_obj(skc, sks); + } + + /* Move slab to skc_complete_list when full */ + if (sks->sks_ref == sks->sks_objs) { + list_del(&sks->sks_list); + list_add(&sks->sks_list, &skc->skc_complete_list); + } + } + + spin_unlock(&skc->skc_lock); +out: + return (NULL); +} + +/* + * Release an object back to the slab from which it came. + */ +static void +spl_cache_shrink(spl_kmem_cache_t *skc, void *obj) +{ + spl_kmem_slab_t *sks = NULL; + spl_kmem_obj_t *sko = NULL; + + ASSERT(skc->skc_magic == SKC_MAGIC); + ASSERT(spin_is_locked(&skc->skc_lock)); + + sko = spl_sko_from_obj(skc, obj); + ASSERT(sko->sko_magic == SKO_MAGIC); + sks = sko->sko_slab; + ASSERT(sks->sks_magic == SKS_MAGIC); + ASSERT(sks->sks_cache == skc); + list_add(&sko->sko_list, &sks->sks_free_list); + + sks->sks_age = jiffies; + sks->sks_ref--; + skc->skc_obj_alloc--; + + /* + * Move slab to skc_partial_list when no longer full. Slabs + * are added to the head to keep the partial list is quasi-full + * sorted order. Fuller at the head, emptier at the tail. + */ + if (sks->sks_ref == (sks->sks_objs - 1)) { + list_del(&sks->sks_list); + list_add(&sks->sks_list, &skc->skc_partial_list); + } + + /* + * Move empty slabs to the end of the partial list so + * they can be easily found and freed during reclamation. + */ + if (sks->sks_ref == 0) { + list_del(&sks->sks_list); + list_add_tail(&sks->sks_list, &skc->skc_partial_list); + skc->skc_slab_alloc--; + } +} + +/* + * Allocate an object from the per-cpu magazine, or if the magazine + * is empty directly allocate from a slab and repopulate the magazine. + */ +void * +spl_kmem_cache_alloc(spl_kmem_cache_t *skc, int flags) +{ + spl_kmem_magazine_t *skm; + void *obj = NULL; + + ASSERT0(flags & ~KM_PUBLIC_MASK); + ASSERT(skc->skc_magic == SKC_MAGIC); + ASSERT(!test_bit(KMC_BIT_DESTROY, &skc->skc_flags)); + + atomic_inc(&skc->skc_ref); + + /* + * Allocate directly from a Linux slab. All optimizations are left + * to the underlying cache we only need to guarantee that KM_SLEEP + * callers will never fail. + */ + if (skc->skc_flags & KMC_SLAB) { + struct kmem_cache *slc = skc->skc_linux_cache; + do { + obj = kmem_cache_alloc(slc, kmem_flags_convert(flags)); + } while ((obj == NULL) && !(flags & KM_NOSLEEP)); + + goto ret; + } + + local_irq_disable(); + +restart: + /* + * Safe to update per-cpu structure without lock, but + * in the restart case we must be careful to reacquire + * the local magazine since this may have changed + * when we need to grow the cache. + */ + skm = skc->skc_mag[smp_processor_id()]; + ASSERT(skm->skm_magic == SKM_MAGIC); + + if (likely(skm->skm_avail)) { + /* Object available in CPU cache, use it */ + obj = skm->skm_objs[--skm->skm_avail]; + skm->skm_age = jiffies; + } else { + obj = spl_cache_refill(skc, skm, flags); + if ((obj == NULL) && !(flags & KM_NOSLEEP)) + goto restart; + + local_irq_enable(); + goto ret; + } + + local_irq_enable(); + ASSERT(obj); + ASSERT(IS_P2ALIGNED(obj, skc->skc_obj_align)); + +ret: + /* Pre-emptively migrate object to CPU L1 cache */ + if (obj) { + if (obj && skc->skc_ctor) + skc->skc_ctor(obj, skc->skc_private, flags); + else + prefetchw(obj); + } + + atomic_dec(&skc->skc_ref); + + return (obj); +} +EXPORT_SYMBOL(spl_kmem_cache_alloc); + +/* + * Free an object back to the local per-cpu magazine, there is no + * guarantee that this is the same magazine the object was originally + * allocated from. We may need to flush entire from the magazine + * back to the slabs to make space. + */ +void +spl_kmem_cache_free(spl_kmem_cache_t *skc, void *obj) +{ + spl_kmem_magazine_t *skm; + unsigned long flags; + int do_reclaim = 0; + int do_emergency = 0; + + ASSERT(skc->skc_magic == SKC_MAGIC); + ASSERT(!test_bit(KMC_BIT_DESTROY, &skc->skc_flags)); + atomic_inc(&skc->skc_ref); + + /* + * Run the destructor + */ + if (skc->skc_dtor) + skc->skc_dtor(obj, skc->skc_private); + + /* + * Free the object from the Linux underlying Linux slab. + */ + if (skc->skc_flags & KMC_SLAB) { + kmem_cache_free(skc->skc_linux_cache, obj); + goto out; + } + + /* + * While a cache has outstanding emergency objects all freed objects + * must be checked. However, since emergency objects will never use + * a virtual address these objects can be safely excluded as an + * optimization. + */ + if (!is_vmalloc_addr(obj)) { + spin_lock(&skc->skc_lock); + do_emergency = (skc->skc_obj_emergency > 0); + spin_unlock(&skc->skc_lock); + + if (do_emergency && (spl_emergency_free(skc, obj) == 0)) + goto out; + } + + local_irq_save(flags); + + /* + * Safe to update per-cpu structure without lock, but + * no remote memory allocation tracking is being performed + * it is entirely possible to allocate an object from one + * CPU cache and return it to another. + */ + skm = skc->skc_mag[smp_processor_id()]; + ASSERT(skm->skm_magic == SKM_MAGIC); + + /* + * Per-CPU cache full, flush it to make space for this object, + * this may result in an empty slab which can be reclaimed once + * interrupts are re-enabled. + */ + if (unlikely(skm->skm_avail >= skm->skm_size)) { + spl_cache_flush(skc, skm, skm->skm_refill); + do_reclaim = 1; + } + + /* Available space in cache, use it */ + skm->skm_objs[skm->skm_avail++] = obj; + + local_irq_restore(flags); + + if (do_reclaim) + spl_slab_reclaim(skc); +out: + atomic_dec(&skc->skc_ref); +} +EXPORT_SYMBOL(spl_kmem_cache_free); + +/* + * The generic shrinker function for all caches. Under Linux a shrinker + * may not be tightly coupled with a slab cache. In fact Linux always + * systematically tries calling all registered shrinker callbacks which + * report that they contain unused objects. Because of this we only + * register one shrinker function in the shim layer for all slab caches. + * We always attempt to shrink all caches when this generic shrinker + * is called. + * + * If sc->nr_to_scan is zero, the caller is requesting a query of the + * number of objects which can potentially be freed. If it is nonzero, + * the request is to free that many objects. + * + * Linux kernels >= 3.12 have the count_objects and scan_objects callbacks + * in struct shrinker and also require the shrinker to return the number + * of objects freed. + * + * Older kernels require the shrinker to return the number of freeable + * objects following the freeing of nr_to_free. + * + * Linux semantics differ from those under Solaris, which are to + * free all available objects which may (and probably will) be more + * objects than the requested nr_to_scan. + */ +static spl_shrinker_t +__spl_kmem_cache_generic_shrinker(struct shrinker *shrink, + struct shrink_control *sc) +{ + spl_kmem_cache_t *skc; + int alloc = 0; + + down_read(&spl_kmem_cache_sem); + list_for_each_entry(skc, &spl_kmem_cache_list, skc_list) { + if (sc->nr_to_scan) { +#ifdef HAVE_SPLIT_SHRINKER_CALLBACK + uint64_t oldalloc = skc->skc_obj_alloc; + spl_kmem_cache_reap_now(skc, + MAX(sc->nr_to_scan>>fls64(skc->skc_slab_objs), 1)); + if (oldalloc > skc->skc_obj_alloc) + alloc += oldalloc - skc->skc_obj_alloc; +#else + spl_kmem_cache_reap_now(skc, + MAX(sc->nr_to_scan>>fls64(skc->skc_slab_objs), 1)); + alloc += skc->skc_obj_alloc; +#endif /* HAVE_SPLIT_SHRINKER_CALLBACK */ + } else { + /* Request to query number of freeable objects */ + alloc += skc->skc_obj_alloc; + } + } + up_read(&spl_kmem_cache_sem); + + /* + * When KMC_RECLAIM_ONCE is set allow only a single reclaim pass. + * This functionality only exists to work around a rare issue where + * shrink_slabs() is repeatedly invoked by many cores causing the + * system to thrash. + */ + if ((spl_kmem_cache_reclaim & KMC_RECLAIM_ONCE) && sc->nr_to_scan) + return (SHRINK_STOP); + + return (MAX(alloc, 0)); +} + +SPL_SHRINKER_CALLBACK_WRAPPER(spl_kmem_cache_generic_shrinker); + +/* + * Call the registered reclaim function for a cache. Depending on how + * many and which objects are released it may simply repopulate the + * local magazine which will then need to age-out. Objects which cannot + * fit in the magazine we will be released back to their slabs which will + * also need to age out before being release. This is all just best + * effort and we do not want to thrash creating and destroying slabs. + */ +void +spl_kmem_cache_reap_now(spl_kmem_cache_t *skc, int count) +{ + ASSERT(skc->skc_magic == SKC_MAGIC); + ASSERT(!test_bit(KMC_BIT_DESTROY, &skc->skc_flags)); + + atomic_inc(&skc->skc_ref); + + /* + * Execute the registered reclaim callback if it exists. The + * per-cpu caches will be drained when is set KMC_EXPIRE_MEM. + */ + if (skc->skc_flags & KMC_SLAB) { + if (skc->skc_reclaim) + skc->skc_reclaim(skc->skc_private); + + if (spl_kmem_cache_expire & KMC_EXPIRE_MEM) + kmem_cache_shrink(skc->skc_linux_cache); + + goto out; + } + + /* + * Prevent concurrent cache reaping when contended. + */ + if (test_and_set_bit(KMC_BIT_REAPING, &skc->skc_flags)) + goto out; + + /* + * When a reclaim function is available it may be invoked repeatedly + * until at least a single slab can be freed. This ensures that we + * do free memory back to the system. This helps minimize the chance + * of an OOM event when the bulk of memory is used by the slab. + * + * When free slabs are already available the reclaim callback will be + * skipped. Additionally, if no forward progress is detected despite + * a reclaim function the cache will be skipped to avoid deadlock. + * + * Longer term this would be the correct place to add the code which + * repacks the slabs in order minimize fragmentation. + */ + if (skc->skc_reclaim) { + uint64_t objects = UINT64_MAX; + int do_reclaim; + + do { + spin_lock(&skc->skc_lock); + do_reclaim = + (skc->skc_slab_total > 0) && + ((skc->skc_slab_total-skc->skc_slab_alloc) == 0) && + (skc->skc_obj_alloc < objects); + + objects = skc->skc_obj_alloc; + spin_unlock(&skc->skc_lock); + + if (do_reclaim) + skc->skc_reclaim(skc->skc_private); + + } while (do_reclaim); + } + + /* Reclaim from the magazine and free all now empty slabs. */ + if (spl_kmem_cache_expire & KMC_EXPIRE_MEM) { + spl_kmem_magazine_t *skm; + unsigned long irq_flags; + + local_irq_save(irq_flags); + skm = skc->skc_mag[smp_processor_id()]; + spl_cache_flush(skc, skm, skm->skm_avail); + local_irq_restore(irq_flags); + } + + spl_slab_reclaim(skc); + clear_bit_unlock(KMC_BIT_REAPING, &skc->skc_flags); + smp_mb__after_atomic(); + wake_up_bit(&skc->skc_flags, KMC_BIT_REAPING); +out: + atomic_dec(&skc->skc_ref); +} +EXPORT_SYMBOL(spl_kmem_cache_reap_now); + +/* + * Reap all free slabs from all registered caches. + */ +void +spl_kmem_reap(void) +{ + struct shrink_control sc; + + sc.nr_to_scan = KMC_REAP_CHUNK; + sc.gfp_mask = GFP_KERNEL; + + (void) __spl_kmem_cache_generic_shrinker(NULL, &sc); +} +EXPORT_SYMBOL(spl_kmem_reap); + +int +spl_kmem_cache_init(void) +{ + init_rwsem(&spl_kmem_cache_sem); + INIT_LIST_HEAD(&spl_kmem_cache_list); + spl_kmem_cache_taskq = taskq_create("spl_kmem_cache", + spl_kmem_cache_kmem_threads, maxclsyspri, 1, 32, TASKQ_PREPOPULATE); + spl_register_shrinker(&spl_kmem_cache_shrinker); + + return (0); +} + +void +spl_kmem_cache_fini(void) +{ + spl_unregister_shrinker(&spl_kmem_cache_shrinker); + taskq_destroy(spl_kmem_cache_taskq); +} diff --git a/module/spl/spl-kmem.c b/module/spl/spl-kmem.c index 502f5365b67e..914f0fbf7d2f 100644 --- a/module/spl/spl-kmem.c +++ b/module/spl/spl-kmem.c @@ -1,4 +1,4 @@ -/*****************************************************************************\ +/* * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. * Copyright (C) 2007 The Regents of the University of California. * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). @@ -20,106 +20,55 @@ * * You should have received a copy of the GNU General Public License along * with the SPL. If not, see . - ***************************************************************************** - * Solaris Porting Layer (SPL) Kmem Implementation. -\*****************************************************************************/ - -#include -#include -#include - -/* - * Within the scope of spl-kmem.c file the kmem_cache_* definitions - * are removed to allow access to the real Linux slab allocator. - */ -#undef kmem_cache_destroy -#undef kmem_cache_create -#undef kmem_cache_alloc -#undef kmem_cache_free - - -/* - * Cache expiration was implemented because it was part of the default Solaris - * kmem_cache behavior. The idea is that per-cpu objects which haven't been - * accessed in several seconds should be returned to the cache. On the other - * hand Linux slabs never move objects back to the slabs unless there is - * memory pressure on the system. By default the Linux method is enabled - * because it has been shown to improve responsiveness on low memory systems. - * This policy may be changed by setting KMC_EXPIRE_AGE or KMC_EXPIRE_MEM. - */ -unsigned int spl_kmem_cache_expire = KMC_EXPIRE_MEM; -EXPORT_SYMBOL(spl_kmem_cache_expire); -module_param(spl_kmem_cache_expire, uint, 0644); -MODULE_PARM_DESC(spl_kmem_cache_expire, "By age (0x1) or low memory (0x2)"); - -/* - * The default behavior is to report the number of objects remaining in the - * cache. This allows the Linux VM to repeatedly reclaim objects from the - * cache when memory is low satisfy other memory allocations. Alternately, - * setting this value to KMC_RECLAIM_ONCE limits how aggressively the cache - * is reclaimed. This may increase the likelihood of out of memory events. */ -unsigned int spl_kmem_cache_reclaim = 0 /* KMC_RECLAIM_ONCE */; -module_param(spl_kmem_cache_reclaim, uint, 0644); -MODULE_PARM_DESC(spl_kmem_cache_reclaim, "Single reclaim pass (0x1)"); -unsigned int spl_kmem_cache_obj_per_slab = SPL_KMEM_CACHE_OBJ_PER_SLAB; -module_param(spl_kmem_cache_obj_per_slab, uint, 0644); -MODULE_PARM_DESC(spl_kmem_cache_obj_per_slab, "Number of objects per slab"); - -unsigned int spl_kmem_cache_obj_per_slab_min = SPL_KMEM_CACHE_OBJ_PER_SLAB_MIN; -module_param(spl_kmem_cache_obj_per_slab_min, uint, 0644); -MODULE_PARM_DESC(spl_kmem_cache_obj_per_slab_min, - "Minimal number of objects per slab"); - -unsigned int spl_kmem_cache_max_size = 32; -module_param(spl_kmem_cache_max_size, uint, 0644); -MODULE_PARM_DESC(spl_kmem_cache_max_size, "Maximum size of slab in MB"); +#include +#include +#include +#include +#include +#include /* - * For small objects the Linux slab allocator should be used to make the most - * efficient use of the memory. However, large objects are not supported by - * the Linux slab and therefore the SPL implementation is preferred. A cutoff - * of 16K was determined to be optimal for architectures using 4K pages. - */ -#if PAGE_SIZE == 4096 -unsigned int spl_kmem_cache_slab_limit = 16384; -#else -unsigned int spl_kmem_cache_slab_limit = 0; -#endif -module_param(spl_kmem_cache_slab_limit, uint, 0644); -MODULE_PARM_DESC(spl_kmem_cache_slab_limit, - "Objects less than N bytes use the Linux slab"); - -unsigned int spl_kmem_cache_kmem_limit = (PAGE_SIZE / 4); -module_param(spl_kmem_cache_kmem_limit, uint, 0644); -MODULE_PARM_DESC(spl_kmem_cache_kmem_limit, - "Objects less than N bytes use the kmalloc"); - -vmem_t *heap_arena = NULL; -EXPORT_SYMBOL(heap_arena); - -vmem_t *zio_alloc_arena = NULL; -EXPORT_SYMBOL(zio_alloc_arena); - -vmem_t *zio_arena = NULL; -EXPORT_SYMBOL(zio_arena); - -size_t -vmem_size(vmem_t *vmp, int typemask) -{ - ASSERT3P(vmp, ==, NULL); - ASSERT3S(typemask & VMEM_ALLOC, ==, VMEM_ALLOC); - ASSERT3S(typemask & VMEM_FREE, ==, VMEM_FREE); - - return (VMALLOC_TOTAL); -} -EXPORT_SYMBOL(vmem_size); + * As a general rule kmem_alloc() allocations should be small, preferably + * just a few pages since they must by physically contiguous. Therefore, a + * rate limited warning will be printed to the console for any kmem_alloc() + * which exceeds a reasonable threshold. + * + * The default warning threshold is set to eight pages but capped at 32K to + * accommodate systems using large pages. This value was selected to be small + * enough to ensure the largest allocations are quickly noticed and fixed. + * But large enough to avoid logging any warnings when a allocation size is + * larger than optimal but not a serious concern. Since this value is tunable, + * developers are encouraged to set it lower when testing so any new largish + * allocations are quickly caught. These warnings may be disabled by setting + * the threshold to zero. + */ +unsigned int spl_kmem_alloc_warn = MAX(8 * PAGE_SIZE, 32 * 1024); +module_param(spl_kmem_alloc_warn, uint, 0644); +MODULE_PARM_DESC(spl_kmem_alloc_warn, + "Warning threshold in bytes for a kmem_alloc()"); +EXPORT_SYMBOL(spl_kmem_alloc_warn); + +/* + * Large kmem_alloc() allocations will fail if they exceed KMALLOC_MAX_SIZE. + * Allocations which are marginally smaller than this limit may succeed but + * should still be avoided due to the expense of locating a contiguous range + * of free pages. Therefore, a maximum kmem size with reasonable safely + * margin of 4x is set. Kmem_alloc() allocations larger than this maximum + * will quickly fail. Vmem_alloc() allocations less than or equal to this + * value will use kmalloc(), but shift to vmalloc() when exceeding this value. + */ +unsigned int spl_kmem_alloc_max = (KMALLOC_MAX_SIZE >> 2); +module_param(spl_kmem_alloc_max, uint, 0644); +MODULE_PARM_DESC(spl_kmem_alloc_max, + "Maximum size in bytes for a kmem_alloc()"); +EXPORT_SYMBOL(spl_kmem_alloc_max); int kmem_debugging(void) { - return 0; + return (0); } EXPORT_SYMBOL(kmem_debugging); @@ -135,7 +84,7 @@ kmem_vasprintf(const char *fmt, va_list ap) va_end(aq); } while (ptr == NULL); - return ptr; + return (ptr); } EXPORT_SYMBOL(kmem_vasprintf); @@ -151,7 +100,7 @@ kmem_asprintf(const char *fmt, ...) va_end(ap); } while (ptr == NULL); - return ptr; + return (ptr); } EXPORT_SYMBOL(kmem_asprintf); @@ -162,17 +111,17 @@ __strdup(const char *str, int flags) int n; n = strlen(str); - ptr = kmalloc_nofail(n + 1, flags); + ptr = kmalloc(n + 1, kmem_flags_convert(flags)); if (ptr) memcpy(ptr, str, n + 1); - return ptr; + return (ptr); } char * strdup(const char *str) { - return __strdup(str, KM_SLEEP); + return (__strdup(str, KM_SLEEP)); } EXPORT_SYMBOL(strdup); @@ -184,32 +133,140 @@ strfree(char *str) EXPORT_SYMBOL(strfree); /* - * Memory allocation interfaces and debugging for basic kmem_* - * and vmem_* style memory allocation. When DEBUG_KMEM is enabled - * the SPL will keep track of the total memory allocated, and - * report any memory leaked when the module is unloaded. + * Limit the number of large allocation stack traces dumped to not more than + * 5 every 60 seconds to prevent denial-of-service attacks from debug code. + */ +DEFINE_RATELIMIT_STATE(kmem_alloc_ratelimit_state, 60 * HZ, 5); + +/* + * General purpose unified implementation of kmem_alloc(). It is an + * amalgamation of Linux and Illumos allocator design. It should never be + * exported to ensure that code using kmem_alloc()/kmem_zalloc() remains + * relatively portable. Consumers may only access this function through + * wrappers that enforce the common flags to ensure portability. + */ +inline void * +spl_kmem_alloc_impl(size_t size, int flags, int node) +{ + gfp_t lflags = kmem_flags_convert(flags); + void *ptr; + + /* + * Log abnormally large allocations and rate limit the console output. + * Allocations larger than spl_kmem_alloc_warn should be performed + * through the vmem_alloc()/vmem_zalloc() interfaces. + */ + if ((spl_kmem_alloc_warn > 0) && (size > spl_kmem_alloc_warn) && + !(flags & KM_VMEM) && __ratelimit(&kmem_alloc_ratelimit_state)) { + printk(KERN_WARNING + "Large kmem_alloc(%lu, 0x%x), please file an issue at:\n" + "https://github.com/zfsonlinux/zfs/issues/new\n", + (unsigned long)size, flags); + dump_stack(); + } + + /* + * Use a loop because kmalloc_node() can fail when GFP_KERNEL is used + * unlike kmem_alloc() with KM_SLEEP on Illumos. + */ + do { + /* + * Calling kmalloc_node() when the size >= spl_kmem_alloc_max + * is unsafe. This must fail for all for kmem_alloc() and + * kmem_zalloc() callers. + * + * For vmem_alloc() and vmem_zalloc() callers it is permissible + * to use __vmalloc(). However, in general use of __vmalloc() + * is strongly discouraged because a global lock must be + * acquired. Contention on this lock can significantly + * impact performance so frequently manipulating the virtual + * address space is strongly discouraged. + */ + if (unlikely(size > spl_kmem_alloc_max)) { + if (flags & KM_VMEM) { + ptr = spl_vmalloc(size, lflags, PAGE_KERNEL); + } else { + return (NULL); + } + } else { + ptr = kmalloc_node(size, lflags, node); + } + + if (likely(ptr) || (flags & KM_NOSLEEP)) + return (ptr); + + if (unlikely(__ratelimit(&kmem_alloc_ratelimit_state))) { + printk(KERN_WARNING + "Possible memory allocation deadlock: " + "size=%lu lflags=0x%x", + (unsigned long)size, lflags); + dump_stack(); + } + + /* + * Use cond_resched() instead of congestion_wait() to avoid + * deadlocking systems where there are no block devices. + */ + cond_resched(); + } while (1); + + return (NULL); +} + +inline void +spl_kmem_free_impl(const void *buf, size_t size) +{ + if (is_vmalloc_addr(buf)) + vfree(buf); + else + kfree(buf); +} + +/* + * Memory allocation and accounting for kmem_* * style allocations. When + * DEBUG_KMEM is enabled the total memory allocated will be tracked and + * any memory leaked will be reported during module unload. + * + * ./configure --enable-debug-kmem */ #ifdef DEBUG_KMEM /* Shim layer memory accounting */ -# ifdef HAVE_ATOMIC64_T +#ifdef HAVE_ATOMIC64_T atomic64_t kmem_alloc_used = ATOMIC64_INIT(0); unsigned long long kmem_alloc_max = 0; -atomic64_t vmem_alloc_used = ATOMIC64_INIT(0); -unsigned long long vmem_alloc_max = 0; -# else /* HAVE_ATOMIC64_T */ +#else /* HAVE_ATOMIC64_T */ atomic_t kmem_alloc_used = ATOMIC_INIT(0); unsigned long long kmem_alloc_max = 0; -atomic_t vmem_alloc_used = ATOMIC_INIT(0); -unsigned long long vmem_alloc_max = 0; -# endif /* HAVE_ATOMIC64_T */ +#endif /* HAVE_ATOMIC64_T */ EXPORT_SYMBOL(kmem_alloc_used); EXPORT_SYMBOL(kmem_alloc_max); -EXPORT_SYMBOL(vmem_alloc_used); -EXPORT_SYMBOL(vmem_alloc_max); -/* When DEBUG_KMEM_TRACKING is enabled not only will total bytes be tracked +inline void * +spl_kmem_alloc_debug(size_t size, int flags, int node) +{ + void *ptr; + + ptr = spl_kmem_alloc_impl(size, flags, node); + if (ptr) { + kmem_alloc_used_add(size); + if (unlikely(kmem_alloc_used_read() > kmem_alloc_max)) + kmem_alloc_max = kmem_alloc_used_read(); + } + + return (ptr); +} + +inline void +spl_kmem_free_debug(const void *ptr, size_t size) +{ + kmem_alloc_used_sub(size); + spl_kmem_free_impl(ptr, size); +} + +/* + * When DEBUG_KMEM_TRACKING is enabled not only will total bytes be tracked * but also the location of every alloc and free. When the SPL module is * unloaded a list of all leaked addresses and where they were allocated * will be dumped to the console. Enabling this feature has a significant @@ -219,42 +276,33 @@ EXPORT_SYMBOL(vmem_alloc_max); * contended particularly on xfree(). If we want to run with this detailed * debugging enabled for anything other than debugging we need to minimize * the contention by moving to a lock per xmem_table entry model. + * + * ./configure --enable-debug-kmem-tracking */ -# ifdef DEBUG_KMEM_TRACKING +#ifdef DEBUG_KMEM_TRACKING -# define KMEM_HASH_BITS 10 -# define KMEM_TABLE_SIZE (1 << KMEM_HASH_BITS) +#include +#include -# define VMEM_HASH_BITS 10 -# define VMEM_TABLE_SIZE (1 << VMEM_HASH_BITS) +#define KMEM_HASH_BITS 10 +#define KMEM_TABLE_SIZE (1 << KMEM_HASH_BITS) typedef struct kmem_debug { - struct hlist_node kd_hlist; /* Hash node linkage */ - struct list_head kd_list; /* List of all allocations */ - void *kd_addr; /* Allocation pointer */ - size_t kd_size; /* Allocation size */ - const char *kd_func; /* Allocation function */ - int kd_line; /* Allocation line */ + struct hlist_node kd_hlist; /* Hash node linkage */ + struct list_head kd_list; /* List of all allocations */ + void *kd_addr; /* Allocation pointer */ + size_t kd_size; /* Allocation size */ + const char *kd_func; /* Allocation function */ + int kd_line; /* Allocation line */ } kmem_debug_t; -spinlock_t kmem_lock; -struct hlist_head kmem_table[KMEM_TABLE_SIZE]; -struct list_head kmem_list; - -spinlock_t vmem_lock; -struct hlist_head vmem_table[VMEM_TABLE_SIZE]; -struct list_head vmem_list; - -EXPORT_SYMBOL(kmem_lock); -EXPORT_SYMBOL(kmem_table); -EXPORT_SYMBOL(kmem_list); - -EXPORT_SYMBOL(vmem_lock); -EXPORT_SYMBOL(vmem_table); -EXPORT_SYMBOL(vmem_list); +static spinlock_t kmem_lock; +static struct hlist_head kmem_table[KMEM_TABLE_SIZE]; +static struct list_head kmem_list; static kmem_debug_t * -kmem_del_init(spinlock_t *lock, struct hlist_head *table, int bits, const void *addr) +kmem_del_init(spinlock_t *lock, struct hlist_head *table, + int bits, const void *addr) { struct hlist_head *head; struct hlist_node *node; @@ -270,7 +318,7 @@ kmem_del_init(spinlock_t *lock, struct hlist_head *table, int bits, const void * hlist_del_init(&p->kd_hlist); list_del_init(&p->kd_list); spin_unlock_irqrestore(lock, flags); - return p; + return (p); } } @@ -279,1965 +327,224 @@ kmem_del_init(spinlock_t *lock, struct hlist_head *table, int bits, const void * return (NULL); } -void * -kmem_alloc_track(size_t size, int flags, const char *func, int line, - int node_alloc, int node) +inline void * +spl_kmem_alloc_track(size_t size, int flags, + const char *func, int line, int node) { void *ptr = NULL; kmem_debug_t *dptr; unsigned long irq_flags; - /* Function may be called with KM_NOSLEEP so failure is possible */ - dptr = (kmem_debug_t *) kmalloc_nofail(sizeof(kmem_debug_t), - flags & ~__GFP_ZERO); - - if (unlikely(dptr == NULL)) { - printk(KERN_WARNING "debug kmem_alloc(%ld, 0x%x) at %s:%d " - "failed (%lld/%llu)\n", sizeof(kmem_debug_t), flags, - func, line, kmem_alloc_used_read(), kmem_alloc_max); - } else { - /* - * Marked unlikely because we should never be doing this, - * we tolerate to up 2 pages but a single page is best. - */ - if (unlikely((size > PAGE_SIZE*2) && !(flags & KM_NODEBUG))) { - printk(KERN_WARNING "large kmem_alloc(%llu, 0x%x) " - "at %s:%d failed (%lld/%llu)\n", - (unsigned long long)size, flags, func, line, - kmem_alloc_used_read(), kmem_alloc_max); - spl_dumpstack(); - } - - /* - * We use __strdup() below because the string pointed to by - * __FUNCTION__ might not be available by the time we want - * to print it since the module might have been unloaded. - * This can only fail in the KM_NOSLEEP case. - */ - dptr->kd_func = __strdup(func, flags & ~__GFP_ZERO); - if (unlikely(dptr->kd_func == NULL)) { - kfree(dptr); - printk(KERN_WARNING "debug __strdup() at %s:%d " - "failed (%lld/%llu)\n", func, line, - kmem_alloc_used_read(), kmem_alloc_max); - goto out; - } + dptr = kmalloc(sizeof (kmem_debug_t), kmem_flags_convert(flags)); + if (dptr == NULL) + return (NULL); - /* Use the correct allocator */ - if (node_alloc) { - ASSERT(!(flags & __GFP_ZERO)); - ptr = kmalloc_node_nofail(size, flags, node); - } else if (flags & __GFP_ZERO) { - ptr = kzalloc_nofail(size, flags & ~__GFP_ZERO); - } else { - ptr = kmalloc_nofail(size, flags); - } + dptr->kd_func = __strdup(func, flags); + if (dptr->kd_func == NULL) { + kfree(dptr); + return (NULL); + } - if (unlikely(ptr == NULL)) { - kfree(dptr->kd_func); - kfree(dptr); - printk(KERN_WARNING "kmem_alloc(%llu, 0x%x) " - "at %s:%d failed (%lld/%llu)\n", - (unsigned long long) size, flags, func, line, - kmem_alloc_used_read(), kmem_alloc_max); - goto out; - } + ptr = spl_kmem_alloc_debug(size, flags, node); + if (ptr == NULL) { + kfree(dptr->kd_func); + kfree(dptr); + return (NULL); + } - kmem_alloc_used_add(size); - if (unlikely(kmem_alloc_used_read() > kmem_alloc_max)) - kmem_alloc_max = kmem_alloc_used_read(); + INIT_HLIST_NODE(&dptr->kd_hlist); + INIT_LIST_HEAD(&dptr->kd_list); - INIT_HLIST_NODE(&dptr->kd_hlist); - INIT_LIST_HEAD(&dptr->kd_list); + dptr->kd_addr = ptr; + dptr->kd_size = size; + dptr->kd_line = line; - dptr->kd_addr = ptr; - dptr->kd_size = size; - dptr->kd_line = line; + spin_lock_irqsave(&kmem_lock, irq_flags); + hlist_add_head(&dptr->kd_hlist, + &kmem_table[hash_ptr(ptr, KMEM_HASH_BITS)]); + list_add_tail(&dptr->kd_list, &kmem_list); + spin_unlock_irqrestore(&kmem_lock, irq_flags); - spin_lock_irqsave(&kmem_lock, irq_flags); - hlist_add_head(&dptr->kd_hlist, - &kmem_table[hash_ptr(ptr, KMEM_HASH_BITS)]); - list_add_tail(&dptr->kd_list, &kmem_list); - spin_unlock_irqrestore(&kmem_lock, irq_flags); - } -out: return (ptr); } -EXPORT_SYMBOL(kmem_alloc_track); -void -kmem_free_track(const void *ptr, size_t size) +inline void +spl_kmem_free_track(const void *ptr, size_t size) { kmem_debug_t *dptr; - ASSERTF(ptr || size > 0, "ptr: %p, size: %llu", ptr, - (unsigned long long) size); - /* Must exist in hash due to kmem_alloc() */ dptr = kmem_del_init(&kmem_lock, kmem_table, KMEM_HASH_BITS, ptr); - ASSERT(dptr); + ASSERT3P(dptr, !=, NULL); + ASSERT3S(dptr->kd_size, ==, size); - /* Size must match */ - ASSERTF(dptr->kd_size == size, "kd_size (%llu) != size (%llu), " - "kd_func = %s, kd_line = %d\n", (unsigned long long) dptr->kd_size, - (unsigned long long) size, dptr->kd_func, dptr->kd_line); - - kmem_alloc_used_sub(size); kfree(dptr->kd_func); - - memset((void *)dptr, 0x5a, sizeof(kmem_debug_t)); kfree(dptr); - memset((void *)ptr, 0x5a, size); - kfree(ptr); + spl_kmem_free_debug(ptr, size); } -EXPORT_SYMBOL(kmem_free_track); +#endif /* DEBUG_KMEM_TRACKING */ +#endif /* DEBUG_KMEM */ +/* + * Public kmem_alloc(), kmem_zalloc() and kmem_free() interfaces. + */ void * -vmem_alloc_track(size_t size, int flags, const char *func, int line) -{ - void *ptr = NULL; - kmem_debug_t *dptr; - unsigned long irq_flags; - - ASSERT(flags & KM_SLEEP); - - /* Function may be called with KM_NOSLEEP so failure is possible */ - dptr = (kmem_debug_t *) kmalloc_nofail(sizeof(kmem_debug_t), - flags & ~__GFP_ZERO); - if (unlikely(dptr == NULL)) { - printk(KERN_WARNING "debug vmem_alloc(%ld, 0x%x) " - "at %s:%d failed (%lld/%llu)\n", - sizeof(kmem_debug_t), flags, func, line, - vmem_alloc_used_read(), vmem_alloc_max); - } else { - /* - * We use __strdup() below because the string pointed to by - * __FUNCTION__ might not be available by the time we want - * to print it, since the module might have been unloaded. - * This can never fail because we have already asserted - * that flags is KM_SLEEP. - */ - dptr->kd_func = __strdup(func, flags & ~__GFP_ZERO); - if (unlikely(dptr->kd_func == NULL)) { - kfree(dptr); - printk(KERN_WARNING "debug __strdup() at %s:%d " - "failed (%lld/%llu)\n", func, line, - vmem_alloc_used_read(), vmem_alloc_max); - goto out; - } - - /* Use the correct allocator */ - if (flags & __GFP_ZERO) { - ptr = vzalloc_nofail(size, flags & ~__GFP_ZERO); - } else { - ptr = vmalloc_nofail(size, flags); - } - - if (unlikely(ptr == NULL)) { - kfree(dptr->kd_func); - kfree(dptr); - printk(KERN_WARNING "vmem_alloc (%llu, 0x%x) " - "at %s:%d failed (%lld/%llu)\n", - (unsigned long long) size, flags, func, line, - vmem_alloc_used_read(), vmem_alloc_max); - goto out; - } - - vmem_alloc_used_add(size); - if (unlikely(vmem_alloc_used_read() > vmem_alloc_max)) - vmem_alloc_max = vmem_alloc_used_read(); - - INIT_HLIST_NODE(&dptr->kd_hlist); - INIT_LIST_HEAD(&dptr->kd_list); - - dptr->kd_addr = ptr; - dptr->kd_size = size; - dptr->kd_line = line; - - spin_lock_irqsave(&vmem_lock, irq_flags); - hlist_add_head(&dptr->kd_hlist, - &vmem_table[hash_ptr(ptr, VMEM_HASH_BITS)]); - list_add_tail(&dptr->kd_list, &vmem_list); - spin_unlock_irqrestore(&vmem_lock, irq_flags); - } -out: - return (ptr); -} -EXPORT_SYMBOL(vmem_alloc_track); - -void -vmem_free_track(const void *ptr, size_t size) +spl_kmem_alloc(size_t size, int flags, const char *func, int line) { - kmem_debug_t *dptr; - - ASSERTF(ptr || size > 0, "ptr: %p, size: %llu", ptr, - (unsigned long long) size); - - /* Must exist in hash due to vmem_alloc() */ - dptr = kmem_del_init(&vmem_lock, vmem_table, VMEM_HASH_BITS, ptr); - ASSERT(dptr); - - /* Size must match */ - ASSERTF(dptr->kd_size == size, "kd_size (%llu) != size (%llu), " - "kd_func = %s, kd_line = %d\n", (unsigned long long) dptr->kd_size, - (unsigned long long) size, dptr->kd_func, dptr->kd_line); - - vmem_alloc_used_sub(size); - kfree(dptr->kd_func); + ASSERT0(flags & ~KM_PUBLIC_MASK); - memset((void *)dptr, 0x5a, sizeof(kmem_debug_t)); - kfree(dptr); - - memset((void *)ptr, 0x5a, size); - vfree(ptr); +#if !defined(DEBUG_KMEM) + return (spl_kmem_alloc_impl(size, flags, NUMA_NO_NODE)); +#elif !defined(DEBUG_KMEM_TRACKING) + return (spl_kmem_alloc_debug(size, flags, NUMA_NO_NODE)); +#else + return (spl_kmem_alloc_track(size, flags, func, line, NUMA_NO_NODE)); +#endif } -EXPORT_SYMBOL(vmem_free_track); - -# else /* DEBUG_KMEM_TRACKING */ +EXPORT_SYMBOL(spl_kmem_alloc); void * -kmem_alloc_debug(size_t size, int flags, const char *func, int line, - int node_alloc, int node) +spl_kmem_zalloc(size_t size, int flags, const char *func, int line) { - void *ptr; - - /* - * Marked unlikely because we should never be doing this, - * we tolerate to up 2 pages but a single page is best. - */ - if (unlikely((size > PAGE_SIZE * 2) && !(flags & KM_NODEBUG))) { - printk(KERN_WARNING - "large kmem_alloc(%llu, 0x%x) at %s:%d (%lld/%llu)\n", - (unsigned long long)size, flags, func, line, - (unsigned long long)kmem_alloc_used_read(), kmem_alloc_max); - spl_dumpstack(); - } - - /* Use the correct allocator */ - if (node_alloc) { - ASSERT(!(flags & __GFP_ZERO)); - ptr = kmalloc_node_nofail(size, flags, node); - } else if (flags & __GFP_ZERO) { - ptr = kzalloc_nofail(size, flags & (~__GFP_ZERO)); - } else { - ptr = kmalloc_nofail(size, flags); - } + ASSERT0(flags & ~KM_PUBLIC_MASK); - if (unlikely(ptr == NULL)) { - printk(KERN_WARNING - "kmem_alloc(%llu, 0x%x) at %s:%d failed (%lld/%llu)\n", - (unsigned long long)size, flags, func, line, - (unsigned long long)kmem_alloc_used_read(), kmem_alloc_max); - } else { - kmem_alloc_used_add(size); - if (unlikely(kmem_alloc_used_read() > kmem_alloc_max)) - kmem_alloc_max = kmem_alloc_used_read(); - } + flags |= KM_ZERO; - return (ptr); +#if !defined(DEBUG_KMEM) + return (spl_kmem_alloc_impl(size, flags, NUMA_NO_NODE)); +#elif !defined(DEBUG_KMEM_TRACKING) + return (spl_kmem_alloc_debug(size, flags, NUMA_NO_NODE)); +#else + return (spl_kmem_alloc_track(size, flags, func, line, NUMA_NO_NODE)); +#endif } -EXPORT_SYMBOL(kmem_alloc_debug); +EXPORT_SYMBOL(spl_kmem_zalloc); void -kmem_free_debug(const void *ptr, size_t size) +spl_kmem_free(const void *buf, size_t size) { - ASSERT(ptr || size > 0); - kmem_alloc_used_sub(size); - kfree(ptr); +#if !defined(DEBUG_KMEM) + return (spl_kmem_free_impl(buf, size)); +#elif !defined(DEBUG_KMEM_TRACKING) + return (spl_kmem_free_debug(buf, size)); +#else + return (spl_kmem_free_track(buf, size)); +#endif } -EXPORT_SYMBOL(kmem_free_debug); +EXPORT_SYMBOL(spl_kmem_free); -void * -vmem_alloc_debug(size_t size, int flags, const char *func, int line) +#if defined(DEBUG_KMEM) && defined(DEBUG_KMEM_TRACKING) +static char * +spl_sprintf_addr(kmem_debug_t *kd, char *str, int len, int min) { - void *ptr; + int size = ((len - 1) < kd->kd_size) ? (len - 1) : kd->kd_size; + int i, flag = 1; - ASSERT(flags & KM_SLEEP); + ASSERT(str != NULL && len >= 17); + memset(str, 0, len); - /* Use the correct allocator */ - if (flags & __GFP_ZERO) { - ptr = vzalloc_nofail(size, flags & (~__GFP_ZERO)); - } else { - ptr = vmalloc_nofail(size, flags); - } + /* + * Check for a fully printable string, and while we are at + * it place the printable characters in the passed buffer. + */ + for (i = 0; i < size; i++) { + str[i] = ((char *)(kd->kd_addr))[i]; + if (isprint(str[i])) { + continue; + } else { + /* + * Minimum number of printable characters found + * to make it worthwhile to print this as ascii. + */ + if (i > min) + break; - if (unlikely(ptr == NULL)) { - printk(KERN_WARNING - "vmem_alloc(%llu, 0x%x) at %s:%d failed (%lld/%llu)\n", - (unsigned long long)size, flags, func, line, - (unsigned long long)vmem_alloc_used_read(), vmem_alloc_max); - } else { - vmem_alloc_used_add(size); - if (unlikely(vmem_alloc_used_read() > vmem_alloc_max)) - vmem_alloc_max = vmem_alloc_used_read(); + flag = 0; + break; + } } - return (ptr); -} -EXPORT_SYMBOL(vmem_alloc_debug); + if (!flag) { + sprintf(str, "%02x%02x%02x%02x%02x%02x%02x%02x", + *((uint8_t *)kd->kd_addr), + *((uint8_t *)kd->kd_addr + 2), + *((uint8_t *)kd->kd_addr + 4), + *((uint8_t *)kd->kd_addr + 6), + *((uint8_t *)kd->kd_addr + 8), + *((uint8_t *)kd->kd_addr + 10), + *((uint8_t *)kd->kd_addr + 12), + *((uint8_t *)kd->kd_addr + 14)); + } -void -vmem_free_debug(const void *ptr, size_t size) -{ - ASSERT(ptr || size > 0); - vmem_alloc_used_sub(size); - vfree(ptr); + return (str); } -EXPORT_SYMBOL(vmem_free_debug); - -# endif /* DEBUG_KMEM_TRACKING */ -#endif /* DEBUG_KMEM */ - -/* - * Slab allocation interfaces - * - * While the Linux slab implementation was inspired by the Solaris - * implementation I cannot use it to emulate the Solaris APIs. I - * require two features which are not provided by the Linux slab. - * - * 1) Constructors AND destructors. Recent versions of the Linux - * kernel have removed support for destructors. This is a deal - * breaker for the SPL which contains particularly expensive - * initializers for mutex's, condition variables, etc. We also - * require a minimal level of cleanup for these data types unlike - * many Linux data type which do need to be explicitly destroyed. - * - * 2) Virtual address space backed slab. Callers of the Solaris slab - * expect it to work well for both small are very large allocations. - * Because of memory fragmentation the Linux slab which is backed - * by kmalloc'ed memory performs very badly when confronted with - * large numbers of large allocations. Basing the slab on the - * virtual address space removes the need for contiguous pages - * and greatly improve performance for large allocations. - * - * For these reasons, the SPL has its own slab implementation with - * the needed features. It is not as highly optimized as either the - * Solaris or Linux slabs, but it should get me most of what is - * needed until it can be optimized or obsoleted by another approach. - * - * One serious concern I do have about this method is the relatively - * small virtual address space on 32bit arches. This will seriously - * constrain the size of the slab caches and their performance. - * - * XXX: Improve the partial slab list by carefully maintaining a - * strict ordering of fullest to emptiest slabs based on - * the slab reference count. This guarantees the when freeing - * slabs back to the system we need only linearly traverse the - * last N slabs in the list to discover all the freeable slabs. - * - * XXX: NUMA awareness for optionally allocating memory close to a - * particular core. This can be advantageous if you know the slab - * object will be short lived and primarily accessed from one core. - * - * XXX: Slab coloring may also yield performance improvements and would - * be desirable to implement. - */ - -struct list_head spl_kmem_cache_list; /* List of caches */ -struct rw_semaphore spl_kmem_cache_sem; /* Cache list lock */ -taskq_t *spl_kmem_cache_taskq; /* Task queue for ageing / reclaim */ -static void spl_cache_shrink(spl_kmem_cache_t *skc, void *obj); - -SPL_SHRINKER_CALLBACK_FWD_DECLARE(spl_kmem_cache_generic_shrinker); -SPL_SHRINKER_DECLARE(spl_kmem_cache_shrinker, - spl_kmem_cache_generic_shrinker, KMC_DEFAULT_SEEKS); - -static void * -kv_alloc(spl_kmem_cache_t *skc, int size, int flags) +static int +spl_kmem_init_tracking(struct list_head *list, spinlock_t *lock, int size) { - void *ptr; - - ASSERT(ISP2(size)); + int i; - if (skc->skc_flags & KMC_KMEM) - ptr = (void *)__get_free_pages(flags | __GFP_COMP, - get_order(size)); - else - ptr = __vmalloc(size, flags | __GFP_HIGHMEM, PAGE_KERNEL); + spin_lock_init(lock); + INIT_LIST_HEAD(list); - /* Resulting allocated memory will be page aligned */ - ASSERT(IS_P2ALIGNED(ptr, PAGE_SIZE)); + for (i = 0; i < size; i++) + INIT_HLIST_HEAD(&kmem_table[i]); - return ptr; + return (0); } static void -kv_free(spl_kmem_cache_t *skc, void *ptr, int size) +spl_kmem_fini_tracking(struct list_head *list, spinlock_t *lock) { - ASSERT(IS_P2ALIGNED(ptr, PAGE_SIZE)); - ASSERT(ISP2(size)); + unsigned long flags; + kmem_debug_t *kd; + char str[17]; - /* - * The Linux direct reclaim path uses this out of band value to - * determine if forward progress is being made. Normally this is - * incremented by kmem_freepages() which is part of the various - * Linux slab implementations. However, since we are using none - * of that infrastructure we are responsible for incrementing it. - */ - if (current->reclaim_state) - current->reclaim_state->reclaimed_slab += size >> PAGE_SHIFT; + spin_lock_irqsave(lock, flags); + if (!list_empty(list)) + printk(KERN_WARNING "%-16s %-5s %-16s %s:%s\n", "address", + "size", "data", "func", "line"); - if (skc->skc_flags & KMC_KMEM) - free_pages((unsigned long)ptr, get_order(size)); - else - vfree(ptr); -} + list_for_each_entry(kd, list, kd_list) + printk(KERN_WARNING "%p %-5d %-16s %s:%d\n", kd->kd_addr, + (int)kd->kd_size, spl_sprintf_addr(kd, str, 17, 8), + kd->kd_func, kd->kd_line); -/* - * Required space for each aligned sks. - */ -static inline uint32_t -spl_sks_size(spl_kmem_cache_t *skc) -{ - return P2ROUNDUP_TYPED(sizeof(spl_kmem_slab_t), - skc->skc_obj_align, uint32_t); + spin_unlock_irqrestore(lock, flags); } +#endif /* DEBUG_KMEM && DEBUG_KMEM_TRACKING */ -/* - * Required space for each aligned object. - */ -static inline uint32_t -spl_obj_size(spl_kmem_cache_t *skc) +int +spl_kmem_init(void) { - uint32_t align = skc->skc_obj_align; +#ifdef DEBUG_KMEM + kmem_alloc_used_set(0); - return P2ROUNDUP_TYPED(skc->skc_obj_size, align, uint32_t) + - P2ROUNDUP_TYPED(sizeof(spl_kmem_obj_t), align, uint32_t); -} +#ifdef DEBUG_KMEM_TRACKING + spl_kmem_init_tracking(&kmem_list, &kmem_lock, KMEM_TABLE_SIZE); +#endif /* DEBUG_KMEM_TRACKING */ +#endif /* DEBUG_KMEM */ -/* - * Lookup the spl_kmem_object_t for an object given that object. - */ -static inline spl_kmem_obj_t * -spl_sko_from_obj(spl_kmem_cache_t *skc, void *obj) -{ - return obj + P2ROUNDUP_TYPED(skc->skc_obj_size, - skc->skc_obj_align, uint32_t); -} - -/* - * Required space for each offslab object taking in to account alignment - * restrictions and the power-of-two requirement of kv_alloc(). - */ -static inline uint32_t -spl_offslab_size(spl_kmem_cache_t *skc) -{ - return 1UL << (fls64(spl_obj_size(skc)) + 1); -} - -/* - * It's important that we pack the spl_kmem_obj_t structure and the - * actual objects in to one large address space to minimize the number - * of calls to the allocator. It is far better to do a few large - * allocations and then subdivide it ourselves. Now which allocator - * we use requires balancing a few trade offs. - * - * For small objects we use kmem_alloc() because as long as you are - * only requesting a small number of pages (ideally just one) its cheap. - * However, when you start requesting multiple pages with kmem_alloc() - * it gets increasingly expensive since it requires contiguous pages. - * For this reason we shift to vmem_alloc() for slabs of large objects - * which removes the need for contiguous pages. We do not use - * vmem_alloc() in all cases because there is significant locking - * overhead in __get_vm_area_node(). This function takes a single - * global lock when acquiring an available virtual address range which - * serializes all vmem_alloc()'s for all slab caches. Using slightly - * different allocation functions for small and large objects should - * give us the best of both worlds. - * - * KMC_ONSLAB KMC_OFFSLAB - * - * +------------------------+ +-----------------+ - * | spl_kmem_slab_t --+-+ | | spl_kmem_slab_t |---+-+ - * | skc_obj_size <-+ | | +-----------------+ | | - * | spl_kmem_obj_t | | | | - * | skc_obj_size <---+ | +-----------------+ | | - * | spl_kmem_obj_t | | | skc_obj_size | <-+ | - * | ... v | | spl_kmem_obj_t | | - * +------------------------+ +-----------------+ v - */ -static spl_kmem_slab_t * -spl_slab_alloc(spl_kmem_cache_t *skc, int flags) -{ - spl_kmem_slab_t *sks; - spl_kmem_obj_t *sko, *n; - void *base, *obj; - uint32_t obj_size, offslab_size = 0; - int i, rc = 0; - - base = kv_alloc(skc, skc->skc_slab_size, flags); - if (base == NULL) - return (NULL); - - sks = (spl_kmem_slab_t *)base; - sks->sks_magic = SKS_MAGIC; - sks->sks_objs = skc->skc_slab_objs; - sks->sks_age = jiffies; - sks->sks_cache = skc; - INIT_LIST_HEAD(&sks->sks_list); - INIT_LIST_HEAD(&sks->sks_free_list); - sks->sks_ref = 0; - obj_size = spl_obj_size(skc); - - if (skc->skc_flags & KMC_OFFSLAB) - offslab_size = spl_offslab_size(skc); - - for (i = 0; i < sks->sks_objs; i++) { - if (skc->skc_flags & KMC_OFFSLAB) { - obj = kv_alloc(skc, offslab_size, flags); - if (!obj) { - rc = -ENOMEM; - goto out; - } - } else { - obj = base + spl_sks_size(skc) + (i * obj_size); - } - - ASSERT(IS_P2ALIGNED(obj, skc->skc_obj_align)); - sko = spl_sko_from_obj(skc, obj); - sko->sko_addr = obj; - sko->sko_magic = SKO_MAGIC; - sko->sko_slab = sks; - INIT_LIST_HEAD(&sko->sko_list); - list_add_tail(&sko->sko_list, &sks->sks_free_list); - } - -out: - if (rc) { - if (skc->skc_flags & KMC_OFFSLAB) - list_for_each_entry_safe(sko, n, &sks->sks_free_list, - sko_list) - kv_free(skc, sko->sko_addr, offslab_size); - - kv_free(skc, base, skc->skc_slab_size); - sks = NULL; - } - - return (sks); -} - -/* - * Remove a slab from complete or partial list, it must be called with - * the 'skc->skc_lock' held but the actual free must be performed - * outside the lock to prevent deadlocking on vmem addresses. - */ -static void -spl_slab_free(spl_kmem_slab_t *sks, - struct list_head *sks_list, struct list_head *sko_list) -{ - spl_kmem_cache_t *skc; - - ASSERT(sks->sks_magic == SKS_MAGIC); - ASSERT(sks->sks_ref == 0); - - skc = sks->sks_cache; - ASSERT(skc->skc_magic == SKC_MAGIC); - ASSERT(spin_is_locked(&skc->skc_lock)); - - /* - * Update slab/objects counters in the cache, then remove the - * slab from the skc->skc_partial_list. Finally add the slab - * and all its objects in to the private work lists where the - * destructors will be called and the memory freed to the system. - */ - skc->skc_obj_total -= sks->sks_objs; - skc->skc_slab_total--; - list_del(&sks->sks_list); - list_add(&sks->sks_list, sks_list); - list_splice_init(&sks->sks_free_list, sko_list); -} - -/* - * Traverses all the partial slabs attached to a cache and free those - * which which are currently empty, and have not been touched for - * skc_delay seconds to avoid thrashing. The count argument is - * passed to optionally cap the number of slabs reclaimed, a count - * of zero means try and reclaim everything. When flag is set we - * always free an available slab regardless of age. - */ -static void -spl_slab_reclaim(spl_kmem_cache_t *skc, int count, int flag) -{ - spl_kmem_slab_t *sks, *m; - spl_kmem_obj_t *sko, *n; - LIST_HEAD(sks_list); - LIST_HEAD(sko_list); - uint32_t size = 0; - int i = 0; - - /* - * Move empty slabs and objects which have not been touched in - * skc_delay seconds on to private lists to be freed outside - * the spin lock. This delay time is important to avoid thrashing - * however when flag is set the delay will not be used. - */ - spin_lock(&skc->skc_lock); - list_for_each_entry_safe_reverse(sks,m,&skc->skc_partial_list,sks_list){ - /* - * All empty slabs are at the end of skc->skc_partial_list, - * therefore once a non-empty slab is found we can stop - * scanning. Additionally, stop when reaching the target - * reclaim 'count' if a non-zero threshold is given. - */ - if ((sks->sks_ref > 0) || (count && i >= count)) - break; - - if (time_after(jiffies,sks->sks_age+skc->skc_delay*HZ)||flag) { - spl_slab_free(sks, &sks_list, &sko_list); - i++; - } - } - spin_unlock(&skc->skc_lock); - - /* - * The following two loops ensure all the object destructors are - * run, any offslab objects are freed, and the slabs themselves - * are freed. This is all done outside the skc->skc_lock since - * this allows the destructor to sleep, and allows us to perform - * a conditional reschedule when a freeing a large number of - * objects and slabs back to the system. - */ - if (skc->skc_flags & KMC_OFFSLAB) - size = spl_offslab_size(skc); - - list_for_each_entry_safe(sko, n, &sko_list, sko_list) { - ASSERT(sko->sko_magic == SKO_MAGIC); - - if (skc->skc_flags & KMC_OFFSLAB) - kv_free(skc, sko->sko_addr, size); - } - - list_for_each_entry_safe(sks, m, &sks_list, sks_list) { - ASSERT(sks->sks_magic == SKS_MAGIC); - kv_free(skc, sks, skc->skc_slab_size); - } -} - -static spl_kmem_emergency_t * -spl_emergency_search(struct rb_root *root, void *obj) -{ - struct rb_node *node = root->rb_node; - spl_kmem_emergency_t *ske; - unsigned long address = (unsigned long)obj; - - while (node) { - ske = container_of(node, spl_kmem_emergency_t, ske_node); - - if (address < (unsigned long)ske->ske_obj) - node = node->rb_left; - else if (address > (unsigned long)ske->ske_obj) - node = node->rb_right; - else - return ske; - } - - return NULL; -} - -static int -spl_emergency_insert(struct rb_root *root, spl_kmem_emergency_t *ske) -{ - struct rb_node **new = &(root->rb_node), *parent = NULL; - spl_kmem_emergency_t *ske_tmp; - unsigned long address = (unsigned long)ske->ske_obj; - - while (*new) { - ske_tmp = container_of(*new, spl_kmem_emergency_t, ske_node); - - parent = *new; - if (address < (unsigned long)ske_tmp->ske_obj) - new = &((*new)->rb_left); - else if (address > (unsigned long)ske_tmp->ske_obj) - new = &((*new)->rb_right); - else - return 0; - } - - rb_link_node(&ske->ske_node, parent, new); - rb_insert_color(&ske->ske_node, root); - - return 1; -} - -/* - * Allocate a single emergency object and track it in a red black tree. - */ -static int -spl_emergency_alloc(spl_kmem_cache_t *skc, int flags, void **obj) -{ - spl_kmem_emergency_t *ske; - int empty; - - /* Last chance use a partial slab if one now exists */ - spin_lock(&skc->skc_lock); - empty = list_empty(&skc->skc_partial_list); - spin_unlock(&skc->skc_lock); - if (!empty) - return (-EEXIST); - - ske = kmalloc(sizeof(*ske), flags); - if (ske == NULL) - return (-ENOMEM); - - ske->ske_obj = kmalloc(skc->skc_obj_size, flags); - if (ske->ske_obj == NULL) { - kfree(ske); - return (-ENOMEM); - } - - spin_lock(&skc->skc_lock); - empty = spl_emergency_insert(&skc->skc_emergency_tree, ske); - if (likely(empty)) { - skc->skc_obj_total++; - skc->skc_obj_emergency++; - if (skc->skc_obj_emergency > skc->skc_obj_emergency_max) - skc->skc_obj_emergency_max = skc->skc_obj_emergency; - } - spin_unlock(&skc->skc_lock); - - if (unlikely(!empty)) { - kfree(ske->ske_obj); - kfree(ske); - return (-EINVAL); - } - - *obj = ske->ske_obj; - - return (0); -} - -/* - * Locate the passed object in the red black tree and free it. - */ -static int -spl_emergency_free(spl_kmem_cache_t *skc, void *obj) -{ - spl_kmem_emergency_t *ske; - - spin_lock(&skc->skc_lock); - ske = spl_emergency_search(&skc->skc_emergency_tree, obj); - if (likely(ske)) { - rb_erase(&ske->ske_node, &skc->skc_emergency_tree); - skc->skc_obj_emergency--; - skc->skc_obj_total--; - } - spin_unlock(&skc->skc_lock); - - if (unlikely(ske == NULL)) - return (-ENOENT); - - kfree(ske->ske_obj); - kfree(ske); - - return (0); -} - -/* - * Release objects from the per-cpu magazine back to their slab. The flush - * argument contains the max number of entries to remove from the magazine. - */ -static void -__spl_cache_flush(spl_kmem_cache_t *skc, spl_kmem_magazine_t *skm, int flush) -{ - int i, count = MIN(flush, skm->skm_avail); - - ASSERT(skc->skc_magic == SKC_MAGIC); - ASSERT(skm->skm_magic == SKM_MAGIC); - ASSERT(spin_is_locked(&skc->skc_lock)); - - for (i = 0; i < count; i++) - spl_cache_shrink(skc, skm->skm_objs[i]); - - skm->skm_avail -= count; - memmove(skm->skm_objs, &(skm->skm_objs[count]), - sizeof(void *) * skm->skm_avail); -} - -static void -spl_cache_flush(spl_kmem_cache_t *skc, spl_kmem_magazine_t *skm, int flush) -{ - spin_lock(&skc->skc_lock); - __spl_cache_flush(skc, skm, flush); - spin_unlock(&skc->skc_lock); -} - -static void -spl_magazine_age(void *data) -{ - spl_kmem_cache_t *skc = (spl_kmem_cache_t *)data; - spl_kmem_magazine_t *skm = skc->skc_mag[smp_processor_id()]; - - ASSERT(skm->skm_magic == SKM_MAGIC); - ASSERT(skm->skm_cpu == smp_processor_id()); - ASSERT(irqs_disabled()); - - /* There are no available objects or they are too young to age out */ - if ((skm->skm_avail == 0) || - time_before(jiffies, skm->skm_age + skc->skc_delay * HZ)) - return; - - /* - * Because we're executing in interrupt context we may have - * interrupted the holder of this lock. To avoid a potential - * deadlock return if the lock is contended. - */ - if (!spin_trylock(&skc->skc_lock)) - return; - - __spl_cache_flush(skc, skm, skm->skm_refill); - spin_unlock(&skc->skc_lock); -} - -/* - * Called regularly to keep a downward pressure on the cache. - * - * Objects older than skc->skc_delay seconds in the per-cpu magazines will - * be returned to the caches. This is done to prevent idle magazines from - * holding memory which could be better used elsewhere. The delay is - * present to prevent thrashing the magazine. - * - * The newly released objects may result in empty partial slabs. Those - * slabs should be released to the system. Otherwise moving the objects - * out of the magazines is just wasted work. - */ -static void -spl_cache_age(void *data) -{ - spl_kmem_cache_t *skc = (spl_kmem_cache_t *)data; - taskqid_t id = 0; - - ASSERT(skc->skc_magic == SKC_MAGIC); - - /* Dynamically disabled at run time */ - if (!(spl_kmem_cache_expire & KMC_EXPIRE_AGE)) - return; - - atomic_inc(&skc->skc_ref); - - if (!(skc->skc_flags & KMC_NOMAGAZINE)) - on_each_cpu(spl_magazine_age, skc, 1); - - spl_slab_reclaim(skc, skc->skc_reap, 0); - - while (!test_bit(KMC_BIT_DESTROY, &skc->skc_flags) && !id) { - id = taskq_dispatch_delay( - spl_kmem_cache_taskq, spl_cache_age, skc, TQ_SLEEP, - ddi_get_lbolt() + skc->skc_delay / 3 * HZ); - - /* Destroy issued after dispatch immediately cancel it */ - if (test_bit(KMC_BIT_DESTROY, &skc->skc_flags) && id) - taskq_cancel_id(spl_kmem_cache_taskq, id); - } - - spin_lock(&skc->skc_lock); - skc->skc_taskqid = id; - spin_unlock(&skc->skc_lock); - - atomic_dec(&skc->skc_ref); -} - -/* - * Size a slab based on the size of each aligned object plus spl_kmem_obj_t. - * When on-slab we want to target spl_kmem_cache_obj_per_slab. However, - * for very small objects we may end up with more than this so as not - * to waste space in the minimal allocation of a single page. Also for - * very large objects we may use as few as spl_kmem_cache_obj_per_slab_min, - * lower than this and we will fail. - */ -static int -spl_slab_size(spl_kmem_cache_t *skc, uint32_t *objs, uint32_t *size) -{ - uint32_t sks_size, obj_size, max_size; - - if (skc->skc_flags & KMC_OFFSLAB) { - *objs = spl_kmem_cache_obj_per_slab; - *size = P2ROUNDUP(sizeof(spl_kmem_slab_t), PAGE_SIZE); - return (0); - } else { - sks_size = spl_sks_size(skc); - obj_size = spl_obj_size(skc); - - if (skc->skc_flags & KMC_KMEM) - max_size = ((uint32_t)1 << (MAX_ORDER-3)) * PAGE_SIZE; - else - max_size = (spl_kmem_cache_max_size * 1024 * 1024); - - /* Power of two sized slab */ - for (*size = PAGE_SIZE; *size <= max_size; *size *= 2) { - *objs = (*size - sks_size) / obj_size; - if (*objs >= spl_kmem_cache_obj_per_slab) - return (0); - } - - /* - * Unable to satisfy target objects per slab, fall back to - * allocating a maximally sized slab and assuming it can - * contain the minimum objects count use it. If not fail. - */ - *size = max_size; - *objs = (*size - sks_size) / obj_size; - if (*objs >= (spl_kmem_cache_obj_per_slab_min)) - return (0); - } - - return (-ENOSPC); -} - -/* - * Make a guess at reasonable per-cpu magazine size based on the size of - * each object and the cost of caching N of them in each magazine. Long - * term this should really adapt based on an observed usage heuristic. - */ -static int -spl_magazine_size(spl_kmem_cache_t *skc) -{ - uint32_t obj_size = spl_obj_size(skc); - int size; - - /* Per-magazine sizes below assume a 4Kib page size */ - if (obj_size > (PAGE_SIZE * 256)) - size = 4; /* Minimum 4Mib per-magazine */ - else if (obj_size > (PAGE_SIZE * 32)) - size = 16; /* Minimum 2Mib per-magazine */ - else if (obj_size > (PAGE_SIZE)) - size = 64; /* Minimum 256Kib per-magazine */ - else if (obj_size > (PAGE_SIZE / 4)) - size = 128; /* Minimum 128Kib per-magazine */ - else - size = 256; - - return (size); -} - -/* - * Allocate a per-cpu magazine to associate with a specific core. - */ -static spl_kmem_magazine_t * -spl_magazine_alloc(spl_kmem_cache_t *skc, int cpu) -{ - spl_kmem_magazine_t *skm; - int size = sizeof(spl_kmem_magazine_t) + - sizeof(void *) * skc->skc_mag_size; - - skm = kmem_alloc_node(size, KM_SLEEP, cpu_to_node(cpu)); - if (skm) { - skm->skm_magic = SKM_MAGIC; - skm->skm_avail = 0; - skm->skm_size = skc->skc_mag_size; - skm->skm_refill = skc->skc_mag_refill; - skm->skm_cache = skc; - skm->skm_age = jiffies; - skm->skm_cpu = cpu; - } - - return (skm); -} - -/* - * Free a per-cpu magazine associated with a specific core. - */ -static void -spl_magazine_free(spl_kmem_magazine_t *skm) -{ - int size = sizeof(spl_kmem_magazine_t) + - sizeof(void *) * skm->skm_size; - - ASSERT(skm->skm_magic == SKM_MAGIC); - ASSERT(skm->skm_avail == 0); - - kmem_free(skm, size); -} - -/* - * Create all pre-cpu magazines of reasonable sizes. - */ -static int -spl_magazine_create(spl_kmem_cache_t *skc) -{ - int i; - - if (skc->skc_flags & KMC_NOMAGAZINE) - return (0); - - skc->skc_mag_size = spl_magazine_size(skc); - skc->skc_mag_refill = (skc->skc_mag_size + 1) / 2; - - for_each_online_cpu(i) { - skc->skc_mag[i] = spl_magazine_alloc(skc, i); - if (!skc->skc_mag[i]) { - for (i--; i >= 0; i--) - spl_magazine_free(skc->skc_mag[i]); - - return (-ENOMEM); - } - } - - return (0); -} - -/* - * Destroy all pre-cpu magazines. - */ -static void -spl_magazine_destroy(spl_kmem_cache_t *skc) -{ - spl_kmem_magazine_t *skm; - int i; - - if (skc->skc_flags & KMC_NOMAGAZINE) - return; - - for_each_online_cpu(i) { - skm = skc->skc_mag[i]; - spl_cache_flush(skc, skm, skm->skm_avail); - spl_magazine_free(skm); - } -} - -/* - * Create a object cache based on the following arguments: - * name cache name - * size cache object size - * align cache object alignment - * ctor cache object constructor - * dtor cache object destructor - * reclaim cache object reclaim - * priv cache private data for ctor/dtor/reclaim - * vmp unused must be NULL - * flags - * KMC_NOTOUCH Disable cache object aging (unsupported) - * KMC_NODEBUG Disable debugging (unsupported) - * KMC_NOHASH Disable hashing (unsupported) - * KMC_QCACHE Disable qcache (unsupported) - * KMC_NOMAGAZINE Enabled for kmem/vmem, Disabled for Linux slab - * KMC_KMEM Force kmem backed cache - * KMC_VMEM Force vmem backed cache - * KMC_SLAB Force Linux slab backed cache - * KMC_OFFSLAB Locate objects off the slab - */ -spl_kmem_cache_t * -spl_kmem_cache_create(char *name, size_t size, size_t align, - spl_kmem_ctor_t ctor, - spl_kmem_dtor_t dtor, - spl_kmem_reclaim_t reclaim, - void *priv, void *vmp, int flags) -{ - spl_kmem_cache_t *skc; - int rc; - - /* - * Unsupported flags - */ - ASSERT0(flags & KMC_NOMAGAZINE); - ASSERT0(flags & KMC_NOHASH); - ASSERT0(flags & KMC_QCACHE); - ASSERT(vmp == NULL); - - might_sleep(); - - /* - * Allocate memory for a new cache an initialize it. Unfortunately, - * this usually ends up being a large allocation of ~32k because - * we need to allocate enough memory for the worst case number of - * cpus in the magazine, skc_mag[NR_CPUS]. Because of this we - * explicitly pass KM_NODEBUG to suppress the kmem warning - */ - skc = kmem_zalloc(sizeof(*skc), KM_SLEEP| KM_NODEBUG); - if (skc == NULL) - return (NULL); - - skc->skc_magic = SKC_MAGIC; - skc->skc_name_size = strlen(name) + 1; - skc->skc_name = (char *)kmem_alloc(skc->skc_name_size, KM_SLEEP); - if (skc->skc_name == NULL) { - kmem_free(skc, sizeof(*skc)); - return (NULL); - } - strncpy(skc->skc_name, name, skc->skc_name_size); - - skc->skc_ctor = ctor; - skc->skc_dtor = dtor; - skc->skc_reclaim = reclaim; - skc->skc_private = priv; - skc->skc_vmp = vmp; - skc->skc_linux_cache = NULL; - skc->skc_flags = flags; - skc->skc_obj_size = size; - skc->skc_obj_align = SPL_KMEM_CACHE_ALIGN; - skc->skc_delay = SPL_KMEM_CACHE_DELAY; - skc->skc_reap = SPL_KMEM_CACHE_REAP; - atomic_set(&skc->skc_ref, 0); - - INIT_LIST_HEAD(&skc->skc_list); - INIT_LIST_HEAD(&skc->skc_complete_list); - INIT_LIST_HEAD(&skc->skc_partial_list); - skc->skc_emergency_tree = RB_ROOT; - spin_lock_init(&skc->skc_lock); - init_waitqueue_head(&skc->skc_waitq); - skc->skc_slab_fail = 0; - skc->skc_slab_create = 0; - skc->skc_slab_destroy = 0; - skc->skc_slab_total = 0; - skc->skc_slab_alloc = 0; - skc->skc_slab_max = 0; - skc->skc_obj_total = 0; - skc->skc_obj_alloc = 0; - skc->skc_obj_max = 0; - skc->skc_obj_deadlock = 0; - skc->skc_obj_emergency = 0; - skc->skc_obj_emergency_max = 0; - - /* - * Verify the requested alignment restriction is sane. - */ - if (align) { - VERIFY(ISP2(align)); - VERIFY3U(align, >=, SPL_KMEM_CACHE_ALIGN); - VERIFY3U(align, <=, PAGE_SIZE); - skc->skc_obj_align = align; - } - - /* - * When no specific type of slab is requested (kmem, vmem, or - * linuxslab) then select a cache type based on the object size - * and default tunables. - */ - if (!(skc->skc_flags & (KMC_KMEM | KMC_VMEM | KMC_SLAB))) { - - /* - * Objects smaller than spl_kmem_cache_slab_limit can - * use the Linux slab for better space-efficiency. By - * default this functionality is disabled until its - * performance characters are fully understood. - */ - if (spl_kmem_cache_slab_limit && - size <= (size_t)spl_kmem_cache_slab_limit) - skc->skc_flags |= KMC_SLAB; - - /* - * Small objects, less than spl_kmem_cache_kmem_limit per - * object should use kmem because their slabs are small. - */ - else if (spl_obj_size(skc) <= spl_kmem_cache_kmem_limit) - skc->skc_flags |= KMC_KMEM; - - /* - * All other objects are considered large and are placed - * on vmem backed slabs. - */ - else - skc->skc_flags |= KMC_VMEM; - } - - /* - * Given the type of slab allocate the required resources. - */ - if (skc->skc_flags & (KMC_KMEM | KMC_VMEM)) { - rc = spl_slab_size(skc, - &skc->skc_slab_objs, &skc->skc_slab_size); - if (rc) - goto out; - - rc = spl_magazine_create(skc); - if (rc) - goto out; - } else { - skc->skc_linux_cache = kmem_cache_create( - skc->skc_name, size, align, 0, NULL); - if (skc->skc_linux_cache == NULL) { - rc = ENOMEM; - goto out; - } - - kmem_cache_set_allocflags(skc, __GFP_COMP); - skc->skc_flags |= KMC_NOMAGAZINE; - } - - if (spl_kmem_cache_expire & KMC_EXPIRE_AGE) - skc->skc_taskqid = taskq_dispatch_delay(spl_kmem_cache_taskq, - spl_cache_age, skc, TQ_SLEEP, - ddi_get_lbolt() + skc->skc_delay / 3 * HZ); - - down_write(&spl_kmem_cache_sem); - list_add_tail(&skc->skc_list, &spl_kmem_cache_list); - up_write(&spl_kmem_cache_sem); - - return (skc); -out: - kmem_free(skc->skc_name, skc->skc_name_size); - kmem_free(skc, sizeof(*skc)); - return (NULL); -} -EXPORT_SYMBOL(spl_kmem_cache_create); - -/* - * Register a move callback to for cache defragmentation. - * XXX: Unimplemented but harmless to stub out for now. - */ -void -spl_kmem_cache_set_move(spl_kmem_cache_t *skc, - kmem_cbrc_t (move)(void *, void *, size_t, void *)) -{ - ASSERT(move != NULL); -} -EXPORT_SYMBOL(spl_kmem_cache_set_move); - -/* - * Destroy a cache and all objects associated with the cache. - */ -void -spl_kmem_cache_destroy(spl_kmem_cache_t *skc) -{ - DECLARE_WAIT_QUEUE_HEAD(wq); - taskqid_t id; - - ASSERT(skc->skc_magic == SKC_MAGIC); - ASSERT(skc->skc_flags & (KMC_KMEM | KMC_VMEM | KMC_SLAB)); - - down_write(&spl_kmem_cache_sem); - list_del_init(&skc->skc_list); - up_write(&spl_kmem_cache_sem); - - /* Cancel any and wait for any pending delayed tasks */ - VERIFY(!test_and_set_bit(KMC_BIT_DESTROY, &skc->skc_flags)); - - spin_lock(&skc->skc_lock); - id = skc->skc_taskqid; - spin_unlock(&skc->skc_lock); - - taskq_cancel_id(spl_kmem_cache_taskq, id); - - /* Wait until all current callers complete, this is mainly - * to catch the case where a low memory situation triggers a - * cache reaping action which races with this destroy. */ - wait_event(wq, atomic_read(&skc->skc_ref) == 0); - - if (skc->skc_flags & (KMC_KMEM | KMC_VMEM)) { - spl_magazine_destroy(skc); - spl_slab_reclaim(skc, 0, 1); - } else { - ASSERT(skc->skc_flags & KMC_SLAB); - kmem_cache_destroy(skc->skc_linux_cache); - } - - spin_lock(&skc->skc_lock); - - /* Validate there are no objects in use and free all the - * spl_kmem_slab_t, spl_kmem_obj_t, and object buffers. */ - ASSERT3U(skc->skc_slab_alloc, ==, 0); - ASSERT3U(skc->skc_obj_alloc, ==, 0); - ASSERT3U(skc->skc_slab_total, ==, 0); - ASSERT3U(skc->skc_obj_total, ==, 0); - ASSERT3U(skc->skc_obj_emergency, ==, 0); - ASSERT(list_empty(&skc->skc_complete_list)); - - kmem_free(skc->skc_name, skc->skc_name_size); - spin_unlock(&skc->skc_lock); - - kmem_free(skc, sizeof(*skc)); -} -EXPORT_SYMBOL(spl_kmem_cache_destroy); - -/* - * Allocate an object from a slab attached to the cache. This is used to - * repopulate the per-cpu magazine caches in batches when they run low. - */ -static void * -spl_cache_obj(spl_kmem_cache_t *skc, spl_kmem_slab_t *sks) -{ - spl_kmem_obj_t *sko; - - ASSERT(skc->skc_magic == SKC_MAGIC); - ASSERT(sks->sks_magic == SKS_MAGIC); - ASSERT(spin_is_locked(&skc->skc_lock)); - - sko = list_entry(sks->sks_free_list.next, spl_kmem_obj_t, sko_list); - ASSERT(sko->sko_magic == SKO_MAGIC); - ASSERT(sko->sko_addr != NULL); - - /* Remove from sks_free_list */ - list_del_init(&sko->sko_list); - - sks->sks_age = jiffies; - sks->sks_ref++; - skc->skc_obj_alloc++; - - /* Track max obj usage statistics */ - if (skc->skc_obj_alloc > skc->skc_obj_max) - skc->skc_obj_max = skc->skc_obj_alloc; - - /* Track max slab usage statistics */ - if (sks->sks_ref == 1) { - skc->skc_slab_alloc++; - - if (skc->skc_slab_alloc > skc->skc_slab_max) - skc->skc_slab_max = skc->skc_slab_alloc; - } - - return sko->sko_addr; -} - -/* - * Generic slab allocation function to run by the global work queues. - * It is responsible for allocating a new slab, linking it in to the list - * of partial slabs, and then waking any waiters. - */ -static void -spl_cache_grow_work(void *data) -{ - spl_kmem_alloc_t *ska = (spl_kmem_alloc_t *)data; - spl_kmem_cache_t *skc = ska->ska_cache; - spl_kmem_slab_t *sks; - - sks = spl_slab_alloc(skc, ska->ska_flags | __GFP_NORETRY | KM_NODEBUG); - spin_lock(&skc->skc_lock); - if (sks) { - skc->skc_slab_total++; - skc->skc_obj_total += sks->sks_objs; - list_add_tail(&sks->sks_list, &skc->skc_partial_list); - } - - atomic_dec(&skc->skc_ref); - clear_bit(KMC_BIT_GROWING, &skc->skc_flags); - clear_bit(KMC_BIT_DEADLOCKED, &skc->skc_flags); - wake_up_all(&skc->skc_waitq); - spin_unlock(&skc->skc_lock); - - kfree(ska); -} - -/* - * Returns non-zero when a new slab should be available. - */ -static int -spl_cache_grow_wait(spl_kmem_cache_t *skc) -{ - return !test_bit(KMC_BIT_GROWING, &skc->skc_flags); -} - -/* - * No available objects on any slabs, create a new slab. Note that this - * functionality is disabled for KMC_SLAB caches which are backed by the - * Linux slab. - */ -static int -spl_cache_grow(spl_kmem_cache_t *skc, int flags, void **obj) -{ - int remaining, rc; - - ASSERT(skc->skc_magic == SKC_MAGIC); - ASSERT((skc->skc_flags & KMC_SLAB) == 0); - might_sleep(); - *obj = NULL; - - /* - * Before allocating a new slab wait for any reaping to complete and - * then return so the local magazine can be rechecked for new objects. - */ - if (test_bit(KMC_BIT_REAPING, &skc->skc_flags)) { - rc = spl_wait_on_bit(&skc->skc_flags, KMC_BIT_REAPING, - TASK_UNINTERRUPTIBLE); - return (rc ? rc : -EAGAIN); - } - - /* - * This is handled by dispatching a work request to the global work - * queue. This allows us to asynchronously allocate a new slab while - * retaining the ability to safely fall back to a smaller synchronous - * allocations to ensure forward progress is always maintained. - */ - if (test_and_set_bit(KMC_BIT_GROWING, &skc->skc_flags) == 0) { - spl_kmem_alloc_t *ska; - - ska = kmalloc(sizeof(*ska), flags); - if (ska == NULL) { - clear_bit(KMC_BIT_GROWING, &skc->skc_flags); - wake_up_all(&skc->skc_waitq); - return (-ENOMEM); - } - - atomic_inc(&skc->skc_ref); - ska->ska_cache = skc; - ska->ska_flags = flags & ~__GFP_FS; - taskq_init_ent(&ska->ska_tqe); - taskq_dispatch_ent(spl_kmem_cache_taskq, - spl_cache_grow_work, ska, 0, &ska->ska_tqe); - } - - /* - * The goal here is to only detect the rare case where a virtual slab - * allocation has deadlocked. We must be careful to minimize the use - * of emergency objects which are more expensive to track. Therefore, - * we set a very long timeout for the asynchronous allocation and if - * the timeout is reached the cache is flagged as deadlocked. From - * this point only new emergency objects will be allocated until the - * asynchronous allocation completes and clears the deadlocked flag. - */ - if (test_bit(KMC_BIT_DEADLOCKED, &skc->skc_flags)) { - rc = spl_emergency_alloc(skc, flags, obj); - } else { - remaining = wait_event_timeout(skc->skc_waitq, - spl_cache_grow_wait(skc), HZ); - - if (!remaining && test_bit(KMC_BIT_VMEM, &skc->skc_flags)) { - spin_lock(&skc->skc_lock); - if (test_bit(KMC_BIT_GROWING, &skc->skc_flags)) { - set_bit(KMC_BIT_DEADLOCKED, &skc->skc_flags); - skc->skc_obj_deadlock++; - } - spin_unlock(&skc->skc_lock); - } - - rc = -ENOMEM; - } - - return (rc); -} - -/* - * Refill a per-cpu magazine with objects from the slabs for this cache. - * Ideally the magazine can be repopulated using existing objects which have - * been released, however if we are unable to locate enough free objects new - * slabs of objects will be created. On success NULL is returned, otherwise - * the address of a single emergency object is returned for use by the caller. - */ -static void * -spl_cache_refill(spl_kmem_cache_t *skc, spl_kmem_magazine_t *skm, int flags) -{ - spl_kmem_slab_t *sks; - int count = 0, rc, refill; - void *obj = NULL; - - ASSERT(skc->skc_magic == SKC_MAGIC); - ASSERT(skm->skm_magic == SKM_MAGIC); - - refill = MIN(skm->skm_refill, skm->skm_size - skm->skm_avail); - spin_lock(&skc->skc_lock); - - while (refill > 0) { - /* No slabs available we may need to grow the cache */ - if (list_empty(&skc->skc_partial_list)) { - spin_unlock(&skc->skc_lock); - - local_irq_enable(); - rc = spl_cache_grow(skc, flags, &obj); - local_irq_disable(); - - /* Emergency object for immediate use by caller */ - if (rc == 0 && obj != NULL) - return (obj); - - if (rc) - goto out; - - /* Rescheduled to different CPU skm is not local */ - if (skm != skc->skc_mag[smp_processor_id()]) - goto out; - - /* Potentially rescheduled to the same CPU but - * allocations may have occurred from this CPU while - * we were sleeping so recalculate max refill. */ - refill = MIN(refill, skm->skm_size - skm->skm_avail); - - spin_lock(&skc->skc_lock); - continue; - } - - /* Grab the next available slab */ - sks = list_entry((&skc->skc_partial_list)->next, - spl_kmem_slab_t, sks_list); - ASSERT(sks->sks_magic == SKS_MAGIC); - ASSERT(sks->sks_ref < sks->sks_objs); - ASSERT(!list_empty(&sks->sks_free_list)); - - /* Consume as many objects as needed to refill the requested - * cache. We must also be careful not to overfill it. */ - while (sks->sks_ref < sks->sks_objs && refill-- > 0 && ++count) { - ASSERT(skm->skm_avail < skm->skm_size); - ASSERT(count < skm->skm_size); - skm->skm_objs[skm->skm_avail++]=spl_cache_obj(skc,sks); - } - - /* Move slab to skc_complete_list when full */ - if (sks->sks_ref == sks->sks_objs) { - list_del(&sks->sks_list); - list_add(&sks->sks_list, &skc->skc_complete_list); - } - } - - spin_unlock(&skc->skc_lock); -out: - return (NULL); -} - -/* - * Release an object back to the slab from which it came. - */ -static void -spl_cache_shrink(spl_kmem_cache_t *skc, void *obj) -{ - spl_kmem_slab_t *sks = NULL; - spl_kmem_obj_t *sko = NULL; - - ASSERT(skc->skc_magic == SKC_MAGIC); - ASSERT(spin_is_locked(&skc->skc_lock)); - - sko = spl_sko_from_obj(skc, obj); - ASSERT(sko->sko_magic == SKO_MAGIC); - sks = sko->sko_slab; - ASSERT(sks->sks_magic == SKS_MAGIC); - ASSERT(sks->sks_cache == skc); - list_add(&sko->sko_list, &sks->sks_free_list); - - sks->sks_age = jiffies; - sks->sks_ref--; - skc->skc_obj_alloc--; - - /* Move slab to skc_partial_list when no longer full. Slabs - * are added to the head to keep the partial list is quasi-full - * sorted order. Fuller at the head, emptier at the tail. */ - if (sks->sks_ref == (sks->sks_objs - 1)) { - list_del(&sks->sks_list); - list_add(&sks->sks_list, &skc->skc_partial_list); - } - - /* Move empty slabs to the end of the partial list so - * they can be easily found and freed during reclamation. */ - if (sks->sks_ref == 0) { - list_del(&sks->sks_list); - list_add_tail(&sks->sks_list, &skc->skc_partial_list); - skc->skc_slab_alloc--; - } -} - -/* - * Allocate an object from the per-cpu magazine, or if the magazine - * is empty directly allocate from a slab and repopulate the magazine. - */ -void * -spl_kmem_cache_alloc(spl_kmem_cache_t *skc, int flags) -{ - spl_kmem_magazine_t *skm; - void *obj = NULL; - - ASSERT(skc->skc_magic == SKC_MAGIC); - ASSERT(!test_bit(KMC_BIT_DESTROY, &skc->skc_flags)); - ASSERT(flags & KM_SLEEP); - - atomic_inc(&skc->skc_ref); - - /* - * Allocate directly from a Linux slab. All optimizations are left - * to the underlying cache we only need to guarantee that KM_SLEEP - * callers will never fail. - */ - if (skc->skc_flags & KMC_SLAB) { - struct kmem_cache *slc = skc->skc_linux_cache; - - do { - obj = kmem_cache_alloc(slc, flags | __GFP_COMP); - } while ((obj == NULL) && !(flags & KM_NOSLEEP)); - - goto ret; - } - - local_irq_disable(); - -restart: - /* Safe to update per-cpu structure without lock, but - * in the restart case we must be careful to reacquire - * the local magazine since this may have changed - * when we need to grow the cache. */ - skm = skc->skc_mag[smp_processor_id()]; - ASSERT(skm->skm_magic == SKM_MAGIC); - - if (likely(skm->skm_avail)) { - /* Object available in CPU cache, use it */ - obj = skm->skm_objs[--skm->skm_avail]; - skm->skm_age = jiffies; - } else { - obj = spl_cache_refill(skc, skm, flags); - if (obj == NULL) - goto restart; - } - - local_irq_enable(); - ASSERT(obj); - ASSERT(IS_P2ALIGNED(obj, skc->skc_obj_align)); - -ret: - /* Pre-emptively migrate object to CPU L1 cache */ - if (obj) { - if (obj && skc->skc_ctor) - skc->skc_ctor(obj, skc->skc_private, flags); - else - prefetchw(obj); - } - - atomic_dec(&skc->skc_ref); - - return (obj); -} - -EXPORT_SYMBOL(spl_kmem_cache_alloc); - -/* - * Free an object back to the local per-cpu magazine, there is no - * guarantee that this is the same magazine the object was originally - * allocated from. We may need to flush entire from the magazine - * back to the slabs to make space. - */ -void -spl_kmem_cache_free(spl_kmem_cache_t *skc, void *obj) -{ - spl_kmem_magazine_t *skm; - unsigned long flags; - - ASSERT(skc->skc_magic == SKC_MAGIC); - ASSERT(!test_bit(KMC_BIT_DESTROY, &skc->skc_flags)); - atomic_inc(&skc->skc_ref); - - /* - * Run the destructor - */ - if (skc->skc_dtor) - skc->skc_dtor(obj, skc->skc_private); - - /* - * Free the object from the Linux underlying Linux slab. - */ - if (skc->skc_flags & KMC_SLAB) { - kmem_cache_free(skc->skc_linux_cache, obj); - goto out; - } - - /* - * Only virtual slabs may have emergency objects and these objects - * are guaranteed to have physical addresses. They must be removed - * from the tree of emergency objects and the freed. - */ - if ((skc->skc_flags & KMC_VMEM) && !kmem_virt(obj)) { - spl_emergency_free(skc, obj); - goto out; - } - - local_irq_save(flags); - - /* Safe to update per-cpu structure without lock, but - * no remote memory allocation tracking is being performed - * it is entirely possible to allocate an object from one - * CPU cache and return it to another. */ - skm = skc->skc_mag[smp_processor_id()]; - ASSERT(skm->skm_magic == SKM_MAGIC); - - /* Per-CPU cache full, flush it to make space */ - if (unlikely(skm->skm_avail >= skm->skm_size)) - spl_cache_flush(skc, skm, skm->skm_refill); - - /* Available space in cache, use it */ - skm->skm_objs[skm->skm_avail++] = obj; - - local_irq_restore(flags); -out: - atomic_dec(&skc->skc_ref); -} -EXPORT_SYMBOL(spl_kmem_cache_free); - -/* - * The generic shrinker function for all caches. Under Linux a shrinker - * may not be tightly coupled with a slab cache. In fact Linux always - * systematically tries calling all registered shrinker callbacks which - * report that they contain unused objects. Because of this we only - * register one shrinker function in the shim layer for all slab caches. - * We always attempt to shrink all caches when this generic shrinker - * is called. - * - * If sc->nr_to_scan is zero, the caller is requesting a query of the - * number of objects which can potentially be freed. If it is nonzero, - * the request is to free that many objects. - * - * Linux kernels >= 3.12 have the count_objects and scan_objects callbacks - * in struct shrinker and also require the shrinker to return the number - * of objects freed. - * - * Older kernels require the shrinker to return the number of freeable - * objects following the freeing of nr_to_free. - * - * Linux semantics differ from those under Solaris, which are to - * free all available objects which may (and probably will) be more - * objects than the requested nr_to_scan. - */ -static spl_shrinker_t -__spl_kmem_cache_generic_shrinker(struct shrinker *shrink, - struct shrink_control *sc) -{ - spl_kmem_cache_t *skc; - int alloc = 0; - - down_read(&spl_kmem_cache_sem); - list_for_each_entry(skc, &spl_kmem_cache_list, skc_list) { - if (sc->nr_to_scan) { -#ifdef HAVE_SPLIT_SHRINKER_CALLBACK - uint64_t oldalloc = skc->skc_obj_alloc; - spl_kmem_cache_reap_now(skc, - MAX(sc->nr_to_scan >> fls64(skc->skc_slab_objs), 1)); - if (oldalloc > skc->skc_obj_alloc) - alloc += oldalloc - skc->skc_obj_alloc; -#else - spl_kmem_cache_reap_now(skc, - MAX(sc->nr_to_scan >> fls64(skc->skc_slab_objs), 1)); - alloc += skc->skc_obj_alloc; -#endif /* HAVE_SPLIT_SHRINKER_CALLBACK */ - } else { - /* Request to query number of freeable objects */ - alloc += skc->skc_obj_alloc; - } - } - up_read(&spl_kmem_cache_sem); - - /* - * When KMC_RECLAIM_ONCE is set allow only a single reclaim pass. - * This functionality only exists to work around a rare issue where - * shrink_slabs() is repeatedly invoked by many cores causing the - * system to thrash. - */ - if ((spl_kmem_cache_reclaim & KMC_RECLAIM_ONCE) && sc->nr_to_scan) - return (SHRINK_STOP); - - return (MAX(alloc, 0)); -} - -SPL_SHRINKER_CALLBACK_WRAPPER(spl_kmem_cache_generic_shrinker); - -/* - * Call the registered reclaim function for a cache. Depending on how - * many and which objects are released it may simply repopulate the - * local magazine which will then need to age-out. Objects which cannot - * fit in the magazine we will be released back to their slabs which will - * also need to age out before being release. This is all just best - * effort and we do not want to thrash creating and destroying slabs. - */ -void -spl_kmem_cache_reap_now(spl_kmem_cache_t *skc, int count) -{ - ASSERT(skc->skc_magic == SKC_MAGIC); - ASSERT(!test_bit(KMC_BIT_DESTROY, &skc->skc_flags)); - - atomic_inc(&skc->skc_ref); - - /* - * Execute the registered reclaim callback if it exists. The - * per-cpu caches will be drained when is set KMC_EXPIRE_MEM. - */ - if (skc->skc_flags & KMC_SLAB) { - if (skc->skc_reclaim) - skc->skc_reclaim(skc->skc_private); - - if (spl_kmem_cache_expire & KMC_EXPIRE_MEM) - kmem_cache_shrink(skc->skc_linux_cache); - - goto out; - } - - /* - * Prevent concurrent cache reaping when contended. - */ - if (test_and_set_bit(KMC_BIT_REAPING, &skc->skc_flags)) - goto out; - - /* - * When a reclaim function is available it may be invoked repeatedly - * until at least a single slab can be freed. This ensures that we - * do free memory back to the system. This helps minimize the chance - * of an OOM event when the bulk of memory is used by the slab. - * - * When free slabs are already available the reclaim callback will be - * skipped. Additionally, if no forward progress is detected despite - * a reclaim function the cache will be skipped to avoid deadlock. - * - * Longer term this would be the correct place to add the code which - * repacks the slabs in order minimize fragmentation. - */ - if (skc->skc_reclaim) { - uint64_t objects = UINT64_MAX; - int do_reclaim; - - do { - spin_lock(&skc->skc_lock); - do_reclaim = - (skc->skc_slab_total > 0) && - ((skc->skc_slab_total - skc->skc_slab_alloc) == 0) && - (skc->skc_obj_alloc < objects); - - objects = skc->skc_obj_alloc; - spin_unlock(&skc->skc_lock); - - if (do_reclaim) - skc->skc_reclaim(skc->skc_private); - - } while (do_reclaim); - } - - /* Reclaim from the magazine then the slabs ignoring age and delay. */ - if (spl_kmem_cache_expire & KMC_EXPIRE_MEM) { - spl_kmem_magazine_t *skm; - unsigned long irq_flags; - - local_irq_save(irq_flags); - skm = skc->skc_mag[smp_processor_id()]; - spl_cache_flush(skc, skm, skm->skm_avail); - local_irq_restore(irq_flags); - } - - spl_slab_reclaim(skc, count, 1); - clear_bit(KMC_BIT_REAPING, &skc->skc_flags); - smp_wmb(); - wake_up_bit(&skc->skc_flags, KMC_BIT_REAPING); -out: - atomic_dec(&skc->skc_ref); -} -EXPORT_SYMBOL(spl_kmem_cache_reap_now); - -/* - * Reap all free slabs from all registered caches. - */ -void -spl_kmem_reap(void) -{ - struct shrink_control sc; - - sc.nr_to_scan = KMC_REAP_CHUNK; - sc.gfp_mask = GFP_KERNEL; - - (void) __spl_kmem_cache_generic_shrinker(NULL, &sc); -} -EXPORT_SYMBOL(spl_kmem_reap); - -#if defined(DEBUG_KMEM) && defined(DEBUG_KMEM_TRACKING) -static char * -spl_sprintf_addr(kmem_debug_t *kd, char *str, int len, int min) -{ - int size = ((len - 1) < kd->kd_size) ? (len - 1) : kd->kd_size; - int i, flag = 1; - - ASSERT(str != NULL && len >= 17); - memset(str, 0, len); - - /* Check for a fully printable string, and while we are at - * it place the printable characters in the passed buffer. */ - for (i = 0; i < size; i++) { - str[i] = ((char *)(kd->kd_addr))[i]; - if (isprint(str[i])) { - continue; - } else { - /* Minimum number of printable characters found - * to make it worthwhile to print this as ascii. */ - if (i > min) - break; - - flag = 0; - break; - } - } - - if (!flag) { - sprintf(str, "%02x%02x%02x%02x%02x%02x%02x%02x", - *((uint8_t *)kd->kd_addr), - *((uint8_t *)kd->kd_addr + 2), - *((uint8_t *)kd->kd_addr + 4), - *((uint8_t *)kd->kd_addr + 6), - *((uint8_t *)kd->kd_addr + 8), - *((uint8_t *)kd->kd_addr + 10), - *((uint8_t *)kd->kd_addr + 12), - *((uint8_t *)kd->kd_addr + 14)); - } - - return str; -} - -static int -spl_kmem_init_tracking(struct list_head *list, spinlock_t *lock, int size) -{ - int i; - - spin_lock_init(lock); - INIT_LIST_HEAD(list); - - for (i = 0; i < size; i++) - INIT_HLIST_HEAD(&kmem_table[i]); - - return (0); -} - -static void -spl_kmem_fini_tracking(struct list_head *list, spinlock_t *lock) -{ - unsigned long flags; - kmem_debug_t *kd; - char str[17]; - - spin_lock_irqsave(lock, flags); - if (!list_empty(list)) - printk(KERN_WARNING "%-16s %-5s %-16s %s:%s\n", "address", - "size", "data", "func", "line"); - - list_for_each_entry(kd, list, kd_list) - printk(KERN_WARNING "%p %-5d %-16s %s:%d\n", kd->kd_addr, - (int)kd->kd_size, spl_sprintf_addr(kd, str, 17, 8), - kd->kd_func, kd->kd_line); - - spin_unlock_irqrestore(lock, flags); -} -#else /* DEBUG_KMEM && DEBUG_KMEM_TRACKING */ -#define spl_kmem_init_tracking(list, lock, size) -#define spl_kmem_fini_tracking(list, lock) -#endif /* DEBUG_KMEM && DEBUG_KMEM_TRACKING */ - -int -spl_kmem_init(void) -{ - int rc = 0; - -#ifdef DEBUG_KMEM - kmem_alloc_used_set(0); - vmem_alloc_used_set(0); - - spl_kmem_init_tracking(&kmem_list, &kmem_lock, KMEM_TABLE_SIZE); - spl_kmem_init_tracking(&vmem_list, &vmem_lock, VMEM_TABLE_SIZE); -#endif - - init_rwsem(&spl_kmem_cache_sem); - INIT_LIST_HEAD(&spl_kmem_cache_list); - spl_kmem_cache_taskq = taskq_create("spl_kmem_cache", - 1, maxclsyspri, 1, 32, TASKQ_PREPOPULATE); - - spl_register_shrinker(&spl_kmem_cache_shrinker); - - return (rc); + return (0); } void spl_kmem_fini(void) { - spl_unregister_shrinker(&spl_kmem_cache_shrinker); - taskq_destroy(spl_kmem_cache_taskq); - #ifdef DEBUG_KMEM - /* Display all unreclaimed memory addresses, including the + /* + * Display all unreclaimed memory addresses, including the * allocation size and the first few bytes of what's located * at that address to aid in debugging. Performance is not - * a serious concern here since it is module unload time. */ + * a serious concern here since it is module unload time. + */ if (kmem_alloc_used_read() != 0) printk(KERN_WARNING "kmem leaked %ld/%llu bytes\n", - kmem_alloc_used_read(), kmem_alloc_max); - - if (vmem_alloc_used_read() != 0) - printk(KERN_WARNING "vmem leaked %ld/%llu bytes\n", - vmem_alloc_used_read(), vmem_alloc_max); + (unsigned long)kmem_alloc_used_read(), kmem_alloc_max); +#ifdef DEBUG_KMEM_TRACKING spl_kmem_fini_tracking(&kmem_list, &kmem_lock); - spl_kmem_fini_tracking(&vmem_list, &vmem_lock); +#endif /* DEBUG_KMEM_TRACKING */ #endif /* DEBUG_KMEM */ } diff --git a/module/spl/spl-kstat.c b/module/spl/spl-kstat.c index cb27ed3d376d..e8917a3ea80c 100644 --- a/module/spl/spl-kstat.c +++ b/module/spl/spl-kstat.c @@ -26,6 +26,7 @@ #include #include +#include #ifndef HAVE_PDE_DATA #define PDE_DATA(x) (PDE(x)->data) diff --git a/module/spl/spl-proc.c b/module/spl/spl-proc.c index 137af7188a03..a434ef54fd34 100644 --- a/module/spl/spl-proc.c +++ b/module/spl/spl-proc.c @@ -26,9 +26,14 @@ #include #include +#include +#include +#include +#include #include #include #include +#include #include #if defined(CONSTIFY_PLUGIN) && LINUX_VERSION_CODE >= KERNEL_VERSION(3,8,0) @@ -348,26 +353,6 @@ static struct ctl_table spl_kmem_table[] = { .mode = 0444, .proc_handler = &proc_doulongvec_minmax, }, - { - .procname = "vmem_used", - .data = &vmem_alloc_used, -# ifdef HAVE_ATOMIC64_T - .maxlen = sizeof(atomic64_t), -# else - .maxlen = sizeof(atomic_t), -# endif /* HAVE_ATOMIC64_T */ - .mode = 0444, - .proc_handler = &proc_domemused, - }, - { - .procname = "vmem_max", - .data = &vmem_alloc_max, - .maxlen = sizeof(unsigned long), - .extra1 = &table_min, - .extra2 = &table_max, - .mode = 0444, - .proc_handler = &proc_doulongvec_minmax, - }, { .procname = "slab_kmem_total", .data = (void *)(KMC_KMEM | KMC_TOTAL), diff --git a/module/spl/spl-tsd.c b/module/spl/spl-tsd.c index c9d532f4e7a4..9a0987527b27 100644 --- a/module/spl/spl-tsd.c +++ b/module/spl/spl-tsd.c @@ -61,6 +61,7 @@ #include #include #include +#include typedef struct tsd_hash_bin { spinlock_t hb_lock; @@ -336,8 +337,7 @@ tsd_hash_table_init(uint_t bits) if (table == NULL) return (NULL); - table->ht_bins = kmem_zalloc(sizeof(tsd_hash_bin_t) * size, - KM_SLEEP | KM_NODEBUG); + table->ht_bins = kmem_zalloc(sizeof(tsd_hash_bin_t) * size, KM_SLEEP); if (table->ht_bins == NULL) { kmem_free(table, sizeof(tsd_hash_table_t)); return (NULL); diff --git a/module/spl/spl-vmem.c b/module/spl/spl-vmem.c new file mode 100644 index 000000000000..bca27f263d91 --- /dev/null +++ b/module/spl/spl-vmem.c @@ -0,0 +1,134 @@ +/* + * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. + * Copyright (C) 2007 The Regents of the University of California. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * Written by Brian Behlendorf . + * UCRL-CODE-235197 + * + * This file is part of the SPL, Solaris Porting Layer. + * For details, see . + * + * The SPL is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * The SPL is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License along + * with the SPL. If not, see . + */ + +#include +#include +#include +#include + +vmem_t *heap_arena = NULL; +EXPORT_SYMBOL(heap_arena); + +vmem_t *zio_alloc_arena = NULL; +EXPORT_SYMBOL(zio_alloc_arena); + +vmem_t *zio_arena = NULL; +EXPORT_SYMBOL(zio_arena); + +size_t +vmem_size(vmem_t *vmp, int typemask) +{ + ASSERT3P(vmp, ==, NULL); + ASSERT3S(typemask & VMEM_ALLOC, ==, VMEM_ALLOC); + ASSERT3S(typemask & VMEM_FREE, ==, VMEM_FREE); + + return (VMALLOC_TOTAL); +} +EXPORT_SYMBOL(vmem_size); + +/* + * Public vmem_alloc(), vmem_zalloc() and vmem_free() interfaces. + */ +void * +spl_vmem_alloc(size_t size, int flags, const char *func, int line) +{ + ASSERT0(flags & ~KM_PUBLIC_MASK); + + flags |= KM_VMEM; + +#if !defined(DEBUG_KMEM) + return (spl_kmem_alloc_impl(size, flags, NUMA_NO_NODE)); +#elif !defined(DEBUG_KMEM_TRACKING) + return (spl_kmem_alloc_debug(size, flags, NUMA_NO_NODE)); +#else + return (spl_kmem_alloc_track(size, flags, func, line, NUMA_NO_NODE)); +#endif +} +EXPORT_SYMBOL(spl_vmem_alloc); + +void * +spl_vmem_zalloc(size_t size, int flags, const char *func, int line) +{ + ASSERT0(flags & ~KM_PUBLIC_MASK); + + flags |= (KM_VMEM | KM_ZERO); + +#if !defined(DEBUG_KMEM) + return (spl_kmem_alloc_impl(size, flags, NUMA_NO_NODE)); +#elif !defined(DEBUG_KMEM_TRACKING) + return (spl_kmem_alloc_debug(size, flags, NUMA_NO_NODE)); +#else + return (spl_kmem_alloc_track(size, flags, func, line, NUMA_NO_NODE)); +#endif +} +EXPORT_SYMBOL(spl_vmem_zalloc); + +void +spl_vmem_free(const void *buf, size_t size) +{ +#if !defined(DEBUG_KMEM) + return (spl_kmem_free_impl(buf, size)); +#elif !defined(DEBUG_KMEM_TRACKING) + return (spl_kmem_free_debug(buf, size)); +#else + return (spl_kmem_free_track(buf, size)); +#endif +} +EXPORT_SYMBOL(spl_vmem_free); + +/* + * Public vmalloc() interface designed to be safe to be called during I/O. + */ +void * +spl_vmalloc(unsigned long size, gfp_t lflags, pgprot_t prot) +{ +#if defined(PF_MEMALLOC_NOIO) + void *ptr; + unsigned noio_flag = 0; + + if (spl_fstrans_check()) + noio_flag = memalloc_noio_save(); + + ptr = __vmalloc(size, lflags, prot); + + if (spl_fstrans_check()) + memalloc_noio_restore(noio_flag); + + return (ptr); +#else + return (__vmalloc(size, lflags, prot)); +#endif +} +EXPORT_SYMBOL(spl_vmalloc); + +int +spl_vmem_init(void) +{ + return (0); +} + +void +spl_vmem_fini(void) +{ +} diff --git a/module/spl/spl-vnode.c b/module/spl/spl-vnode.c index e5db0ec2ccb8..97eb4ef731ed 100644 --- a/module/spl/spl-vnode.c +++ b/module/spl/spl-vnode.c @@ -26,6 +26,7 @@ #include #include +#include #include #include diff --git a/module/spl/spl-zlib.c b/module/spl/spl-zlib.c index 2967b03ceaa1..77c2a1ddef3b 100644 --- a/module/spl/spl-zlib.c +++ b/module/spl/spl-zlib.c @@ -54,6 +54,7 @@ #include +#include #include #include diff --git a/module/splat/splat-condvar.c b/module/splat/splat-condvar.c index 3ee2ffc9e7a7..ed633acdaa8d 100644 --- a/module/splat/splat-condvar.c +++ b/module/splat/splat-condvar.c @@ -24,8 +24,9 @@ * Solaris Porting LAyer Tests (SPLAT) Condition Variable Tests. \*****************************************************************************/ -#include #include +#include +#include #include "splat-internal.h" #define SPLAT_CONDVAR_NAME "condvar" diff --git a/module/splat/splat-internal.h b/module/splat/splat-internal.h index eff8a9e74f98..832132696d88 100644 --- a/module/splat/splat-internal.h +++ b/module/splat/splat-internal.h @@ -27,6 +27,7 @@ #include "splat-ctl.h" #include +#include #define SPLAT_SUBSYSTEM_INIT(type) \ ({ splat_subsystem_t *_sub_; \ diff --git a/module/splat/splat-kmem.c b/module/splat/splat-kmem.c index cf47ce65afec..cd0000bae671 100644 --- a/module/splat/splat-kmem.c +++ b/module/splat/splat-kmem.c @@ -25,7 +25,11 @@ \*****************************************************************************/ #include +#include +#include +#include #include +#include #include "splat-internal.h" #define SPLAT_KMEM_NAME "kmem" @@ -92,11 +96,11 @@ splat_kmem_test1(struct file *file, void *arg) int size = PAGE_SIZE; int i, count, rc = 0; - while ((!rc) && (size <= (PAGE_SIZE * 32))) { + while ((!rc) && (size <= spl_kmem_alloc_warn)) { count = 0; for (i = 0; i < SPLAT_KMEM_ALLOC_COUNT; i++) { - ptr[i] = kmem_alloc(size, KM_SLEEP | KM_NODEBUG); + ptr[i] = kmem_alloc(size, KM_SLEEP); if (ptr[i]) count++; } @@ -124,11 +128,11 @@ splat_kmem_test2(struct file *file, void *arg) int size = PAGE_SIZE; int i, j, count, rc = 0; - while ((!rc) && (size <= (PAGE_SIZE * 32))) { + while ((!rc) && (size <= spl_kmem_alloc_warn)) { count = 0; for (i = 0; i < SPLAT_KMEM_ALLOC_COUNT; i++) { - ptr[i] = kmem_zalloc(size, KM_SLEEP | KM_NODEBUG); + ptr[i] = kmem_zalloc(size, KM_SLEEP); if (ptr[i]) count++; } @@ -168,7 +172,11 @@ splat_kmem_test3(struct file *file, void *arg) int size = PAGE_SIZE; int i, count, rc = 0; - while ((!rc) && (size <= (PAGE_SIZE * 1024))) { + /* + * Test up to 4x the maximum kmem_alloc() size to ensure both + * the kmem_alloc() and vmem_alloc() call paths are used. + */ + while ((!rc) && (size <= (4 * spl_kmem_alloc_max))) { count = 0; for (i = 0; i < SPLAT_VMEM_ALLOC_COUNT; i++) { @@ -200,7 +208,11 @@ splat_kmem_test4(struct file *file, void *arg) int size = PAGE_SIZE; int i, j, count, rc = 0; - while ((!rc) && (size <= (PAGE_SIZE * 1024))) { + /* + * Test up to 4x the maximum kmem_zalloc() size to ensure both + * the kmem_zalloc() and vmem_zalloc() call paths are used. + */ + while ((!rc) && (size <= (4 * spl_kmem_alloc_max))) { count = 0; for (i = 0; i < SPLAT_VMEM_ALLOC_COUNT; i++) { @@ -572,87 +584,124 @@ splat_kmem_cache_test_thread(void *arg) static int splat_kmem_cache_test(struct file *file, void *arg, char *name, - int size, int align, int flags) + int size, int align, int flags) { - kmem_cache_priv_t *kcp; - kmem_cache_data_t *kcd = NULL; - int rc = 0, max; + kmem_cache_priv_t *kcp = NULL; + kmem_cache_data_t **kcd = NULL; + int i, rc = 0, objs = 0; + + splat_vprint(file, name, + "Testing size=%d, align=%d, flags=0x%04x\n", + size, align, flags); kcp = splat_kmem_cache_test_kcp_alloc(file, name, size, align, 0); if (!kcp) { splat_vprint(file, name, "Unable to create '%s'\n", "kcp"); - return -ENOMEM; + return (-ENOMEM); } - kcp->kcp_cache = - kmem_cache_create(SPLAT_KMEM_CACHE_NAME, - kcp->kcp_size, kcp->kcp_align, - splat_kmem_cache_test_constructor, - splat_kmem_cache_test_destructor, - NULL, kcp, NULL, flags); - if (!kcp->kcp_cache) { - splat_vprint(file, name, - "Unable to create '%s'\n", - SPLAT_KMEM_CACHE_NAME); + kcp->kcp_cache = kmem_cache_create(SPLAT_KMEM_CACHE_NAME, + kcp->kcp_size, kcp->kcp_align, + splat_kmem_cache_test_constructor, + splat_kmem_cache_test_destructor, + NULL, kcp, NULL, flags); + if (kcp->kcp_cache == NULL) { + splat_vprint(file, name, "Unable to create " + "name='%s', size=%d, align=%d, flags=0x%x\n", + SPLAT_KMEM_CACHE_NAME, size, align, flags); rc = -ENOMEM; goto out_free; } - kcd = kmem_cache_alloc(kcp->kcp_cache, KM_SLEEP); - if (!kcd) { - splat_vprint(file, name, - "Unable to allocate from '%s'\n", - SPLAT_KMEM_CACHE_NAME); - rc = -EINVAL; + /* + * Allocate several slabs worth of objects to verify functionality. + * However, on 32-bit systems with limited address space constrain + * it to a single slab for the purposes of this test. + */ +#ifdef _LP64 + objs = SPL_KMEM_CACHE_OBJ_PER_SLAB * 4; +#else + objs = 1; +#endif + kcd = kmem_zalloc(sizeof (kmem_cache_data_t *) * objs, KM_SLEEP); + if (kcd == NULL) { + splat_vprint(file, name, "Unable to allocate pointers " + "for %d objects\n", objs); + rc = -ENOMEM; goto out_free; } - if (!kcd->kcd_flag) { - splat_vprint(file, name, - "Failed to run contructor for '%s'\n", - SPLAT_KMEM_CACHE_NAME); - rc = -EINVAL; - goto out_free; - } + for (i = 0; i < objs; i++) { + kcd[i] = kmem_cache_alloc(kcp->kcp_cache, KM_SLEEP); + if (kcd[i] == NULL) { + splat_vprint(file, name, "Unable to allocate " + "from '%s'\n", SPLAT_KMEM_CACHE_NAME); + rc = -EINVAL; + goto out_free; + } - if (kcd->kcd_magic != kcp->kcp_magic) { - splat_vprint(file, name, - "Failed to pass private data to constructor " - "for '%s'\n", SPLAT_KMEM_CACHE_NAME); - rc = -EINVAL; - goto out_free; + if (!kcd[i]->kcd_flag) { + splat_vprint(file, name, "Failed to run constructor " + "for '%s'\n", SPLAT_KMEM_CACHE_NAME); + rc = -EINVAL; + goto out_free; + } + + if (kcd[i]->kcd_magic != kcp->kcp_magic) { + splat_vprint(file, name, + "Failed to pass private data to constructor " + "for '%s'\n", SPLAT_KMEM_CACHE_NAME); + rc = -EINVAL; + goto out_free; + } } - max = kcp->kcp_count; - kmem_cache_free(kcp->kcp_cache, kcd); + for (i = 0; i < objs; i++) { + kmem_cache_free(kcp->kcp_cache, kcd[i]); + + /* Destructors are run for every kmem_cache_free() */ + if (kcd[i]->kcd_flag) { + splat_vprint(file, name, + "Failed to run destructor for '%s'\n", + SPLAT_KMEM_CACHE_NAME); + rc = -EINVAL; + goto out_free; + } + } - /* Destroy the entire cache which will force destructors to - * run and we can verify one was called for every object */ - kmem_cache_destroy(kcp->kcp_cache); if (kcp->kcp_count) { splat_vprint(file, name, - "Failed to run destructor on all slab objects " - "for '%s'\n", SPLAT_KMEM_CACHE_NAME); + "Failed to run destructor on all slab objects for '%s'\n", + SPLAT_KMEM_CACHE_NAME); rc = -EINVAL; } + kmem_free(kcd, sizeof (kmem_cache_data_t *) * objs); + kmem_cache_destroy(kcp->kcp_cache); + splat_kmem_cache_test_kcp_free(kcp); splat_vprint(file, name, - "Successfully ran ctors/dtors for %d elements in '%s'\n", - max, SPLAT_KMEM_CACHE_NAME); + "Success ran alloc'd/free'd %d objects of size %d\n", + objs, size); - return rc; + return (rc); out_free: - if (kcd) - kmem_cache_free(kcp->kcp_cache, kcd); + if (kcd) { + for (i = 0; i < objs; i++) { + if (kcd[i] != NULL) + kmem_cache_free(kcp->kcp_cache, kcd[i]); + } + + kmem_free(kcd, sizeof (kmem_cache_data_t *) * objs); + } if (kcp->kcp_cache) kmem_cache_destroy(kcp->kcp_cache); splat_kmem_cache_test_kcp_free(kcp); - return rc; + return (rc); } static int @@ -746,35 +795,49 @@ static int splat_kmem_test5(struct file *file, void *arg) { char *name = SPLAT_KMEM_TEST5_NAME; - int rc; - - /* On slab (default + kmem + vmem) */ - rc = splat_kmem_cache_test(file, arg, name, 128, 0, 0); - if (rc) - return rc; + int i, rc = 0; - rc = splat_kmem_cache_test(file, arg, name, 128, 0, KMC_KMEM); - if (rc) - return rc; + /* Randomly pick small object sizes and alignments. */ + for (i = 0; i < 100; i++) { + int size, align, flags = 0; + uint32_t rnd; + + /* Evenly distribute tests over all value cache types */ + get_random_bytes((void *)&rnd, sizeof (uint32_t)); + switch (rnd & 0x03) { + default: + case 0x00: + flags = 0; + break; + case 0x01: + flags = KMC_KMEM; + break; + case 0x02: + flags = KMC_VMEM; + break; + case 0x03: + flags = KMC_SLAB; + break; + } - rc = splat_kmem_cache_test(file, arg, name, 128, 0, KMC_VMEM); - if (rc) - return rc; + /* The following flags are set with a 1/10 chance */ + flags |= ((((rnd >> 8) % 10) == 0) ? KMC_OFFSLAB : 0); + flags |= ((((rnd >> 16) % 10) == 0) ? KMC_NOEMERGENCY : 0); - /* Off slab (default + kmem + vmem) */ - rc = splat_kmem_cache_test(file, arg, name, 128, 0, KMC_OFFSLAB); - if (rc) - return rc; + /* 32b - PAGE_SIZE */ + get_random_bytes((void *)&rnd, sizeof (uint32_t)); + size = MAX(rnd % (PAGE_SIZE + 1), 32); - rc = splat_kmem_cache_test(file, arg, name, 128, 0, - KMC_KMEM | KMC_OFFSLAB); - if (rc) - return rc; + /* 2^N where (3 <= N <= PAGE_SHIFT) */ + get_random_bytes((void *)&rnd, sizeof (uint32_t)); + align = (1 << MAX(3, rnd % (PAGE_SHIFT + 1))); - rc = splat_kmem_cache_test(file, arg, name, 128, 0, - KMC_VMEM | KMC_OFFSLAB); + rc = splat_kmem_cache_test(file, arg, name, size, align, flags); + if (rc) + return (rc); + } - return rc; + return (rc); } /* @@ -784,44 +847,53 @@ static int splat_kmem_test6(struct file *file, void *arg) { char *name = SPLAT_KMEM_TEST6_NAME; - int rc; - - /* On slab (default + kmem + vmem) */ - rc = splat_kmem_cache_test(file, arg, name, 256*1024, 0, 0); - if (rc) - return rc; - - rc = splat_kmem_cache_test(file, arg, name, 64*1024, 0, KMC_KMEM); - if (rc) - return rc; - - rc = splat_kmem_cache_test(file, arg, name, 1024*1024, 0, KMC_VMEM); - if (rc) - return rc; - - rc = splat_kmem_cache_test(file, arg, name, 16*1024*1024, 0, KMC_VMEM); - if (rc) - return rc; + int i, max_size, rc = 0; + + /* Randomly pick large object sizes and alignments. */ + for (i = 0; i < 100; i++) { + int size, align, flags = 0; + uint32_t rnd; + + /* Evenly distribute tests over all value cache types */ + get_random_bytes((void *)&rnd, sizeof (uint32_t)); + switch (rnd & 0x03) { + default: + case 0x00: + flags = 0; + max_size = (SPL_KMEM_CACHE_MAX_SIZE * 1024 * 1024) / 2; + break; + case 0x01: + flags = KMC_KMEM; + max_size = (SPL_MAX_ORDER_NR_PAGES - 2) * PAGE_SIZE; + break; + case 0x02: + flags = KMC_VMEM; + max_size = (SPL_KMEM_CACHE_MAX_SIZE * 1024 * 1024) / 2; + break; + case 0x03: + flags = KMC_SLAB; + max_size = SPL_MAX_KMEM_ORDER_NR_PAGES * PAGE_SIZE; + break; + } - /* Off slab (default + kmem + vmem) */ - rc = splat_kmem_cache_test(file, arg, name, 256*1024, 0, KMC_OFFSLAB); - if (rc) - return rc; + /* The following flags are set with a 1/10 chance */ + flags |= ((((rnd >> 8) % 10) == 0) ? KMC_OFFSLAB : 0); + flags |= ((((rnd >> 16) % 10) == 0) ? KMC_NOEMERGENCY : 0); - rc = splat_kmem_cache_test(file, arg, name, 64*1024, 0, - KMC_KMEM | KMC_OFFSLAB); - if (rc) - return rc; + /* PAGE_SIZE - max_size */ + get_random_bytes((void *)&rnd, sizeof (uint32_t)); + size = MAX(rnd % (max_size + 1), PAGE_SIZE), - rc = splat_kmem_cache_test(file, arg, name, 1024*1024, 0, - KMC_VMEM | KMC_OFFSLAB); - if (rc) - return rc; + /* 2^N where (3 <= N <= PAGE_SHIFT) */ + get_random_bytes((void *)&rnd, sizeof (uint32_t)); + align = (1 << MAX(3, rnd % (PAGE_SHIFT + 1))); - rc = splat_kmem_cache_test(file, arg, name, 16*1024*1024, 0, - KMC_VMEM | KMC_OFFSLAB); + rc = splat_kmem_cache_test(file, arg, name, size, align, flags); + if (rc) + return (rc); + } - return rc; + return (rc); } /* @@ -831,14 +903,20 @@ static int splat_kmem_test7(struct file *file, void *arg) { char *name = SPLAT_KMEM_TEST7_NAME; + int max_size = (SPL_KMEM_CACHE_MAX_SIZE * 1024 * 1024) / 2; int i, rc; for (i = SPL_KMEM_CACHE_ALIGN; i <= PAGE_SIZE; i *= 2) { - rc = splat_kmem_cache_test(file, arg, name, 157, i, 0); + uint32_t size; + + get_random_bytes((void *)&size, sizeof (uint32_t)); + size = MAX(size % (max_size + 1), 32); + + rc = splat_kmem_cache_test(file, arg, name, size, i, 0); if (rc) return rc; - rc = splat_kmem_cache_test(file, arg, name, 157, i, + rc = splat_kmem_cache_test(file, arg, name, size, i, KMC_OFFSLAB); if (rc) return rc; diff --git a/module/splat/splat-taskq.c b/module/splat/splat-taskq.c index d8406f159268..8229fed390f9 100644 --- a/module/splat/splat-taskq.c +++ b/module/splat/splat-taskq.c @@ -25,8 +25,10 @@ \*****************************************************************************/ #include +#include #include #include +#include #include #include "splat-internal.h" diff --git a/module/splat/splat-zlib.c b/module/splat/splat-zlib.c index c614c5e6ca1c..eaa48369db90 100644 --- a/module/splat/splat-zlib.c +++ b/module/splat/splat-zlib.c @@ -27,6 +27,7 @@ #include #include #include +#include #include "splat-internal.h" #define SPLAT_ZLIB_NAME "zlib"