    /*
     * linux/mm/slab.c
     * Written by Mark Hemment, 1996/97.
     * (markhe@nextd.demon.co.uk)
     *
     * kmem_cache_destroy() + some cleanup - 1999 Andrea Arcangeli
     *
     * Major cleanup, different bufctl logic, per-cpu arrays
     *	(c) 2000 Manfred Spraul
     *
     * Cleanup, make the head arrays unconditional, preparation for NUMA
     * 	(c) 2002 Manfred Spraul
     *
     * An implementation of the Slab Allocator as described in outline in;
     *	UNIX Internals: The New Frontiers by Uresh Vahalia
     *	Pub: Prentice Hall	ISBN 0-13-101908-2
     * or with a little more detail in;
     *	The Slab Allocator: An Object-Caching Kernel Memory Allocator
     *	Jeff Bonwick (Sun Microsystems).
     *	Presented at: USENIX Summer 1994 Technical Conference
     *
     * The memory is organized in caches, one cache for each object type.
     * (e.g. inode_cache, dentry_cache, buffer_head, vm_area_struct)
 * Each cache consists of many slabs (they are small (usually one
 * page long) and always contiguous), and each slab contains multiple
 * initialized objects.
     *
 * This means that your constructor is used only for newly allocated
 * slabs and you must pass objects with the same initializations to
 * kmem_cache_free.
     *
     * Each cache can only support one memory type (GFP_DMA, GFP_HIGHMEM,
 * normal). If you need a special memory type, then you must create a new
     * cache for that memory type.
     *
     * In order to reduce fragmentation, the slabs are sorted in 3 groups:
     *   full slabs with 0 free objects
     *   partial slabs
     *   empty slabs with no allocated objects
     *
     * If partial slabs exist, then new allocations come from these slabs,
     * otherwise from empty slabs or new slabs are allocated.
     *
     * kmem_cache_destroy() CAN CRASH if you try to allocate from the cache
     * during kmem_cache_destroy(). The caller must prevent concurrent allocs.
     *
     * Each cache has a short per-cpu head array, most allocs
     * and frees go into that array, and if that array overflows, then 1/2
     * of the entries in the array are given back into the global cache.
     * The head array is strictly LIFO and should improve the cache hit rates.
     * On SMP, it additionally reduces the spinlock operations.
     *
    
    Andrew Morton's avatar
    Andrew Morton committed
     * The c_cpuarray may not be read with enabled local interrupts -
    
    Linus Torvalds's avatar
    Linus Torvalds committed
     * it's changed with a smp_call_function().
     *
     * SMP synchronization:
 *  constructors and destructors are called without any locking.
 *  Several members in struct kmem_cache and struct slab never change, they
 *	are accessed without any locking.
     *  The per-cpu arrays are never accessed from the wrong cpu, no locking,
     *  	and local interrupts are disabled so slab code is preempt-safe.
     *  The non-constant members are protected with a per-cache irq spinlock.
     *
     * Many thanks to Mark Hemment, who wrote another per-cpu slab patch
     * in 2000 - many ideas in the current implementation are derived from
     * his patch.
     *
     * Further notes from the original documentation:
     *
 * 11 April '97.  Started multi-threading - markhe
 *	The global cache-chain is protected by the mutex 'cache_chain_mutex'.
 *	The mutex is only needed when accessing/extending the cache-chain, which
     *	can never happen inside an interrupt (kmem_cache_create(),
     *	kmem_cache_shrink() and kmem_cache_reap()).
     *
     *	At present, each engine can be growing a cache.  This should be blocked.
 *
     * 15 March 2005. NUMA slab allocator.
     *	Shai Fultheim <shai@scalex86.org>.
     *	Shobhit Dayal <shobhit@calsoftinc.com>
     *	Alok N Kataria <alokk@calsoftinc.com>
     *	Christoph Lameter <christoph@lameter.com>
     *
     *	Modified the slab allocator to be node aware on NUMA systems.
     *	Each node has its own list of partial, free and full slabs.
 *	All object allocations for a node occur from node specific slab lists.
 */
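
/*
 * Illustrative usage sketch of the cache API described above (not part of
 * this file; "foo", struct foo and foo_ctor are hypothetical names, and the
 * exact prototypes live in <linux/slab.h>):
 *
 *	cachep = kmem_cache_create("foo", sizeof(struct foo), 0,
 *				   SLAB_HWCACHE_ALIGN, foo_ctor);
 *	objp = kmem_cache_alloc(cachep, GFP_KERNEL);
 *	...
 *	kmem_cache_free(cachep, objp);
 *	kmem_cache_destroy(cachep);
 *
 * As noted above, objects handed back with kmem_cache_free() must carry the
 * same initial state the constructor established.
 */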
    
    #include	<linux/slab.h>
    #include	<linux/mm.h>
    
#include	<linux/poison.h>
#include	<linux/swap.h>
#include	<linux/cache.h>
#include	<linux/interrupt.h>
#include	<linux/init.h>
#include	<linux/compiler.h>
#include	<linux/cpuset.h>
#include	<linux/seq_file.h>
#include	<linux/notifier.h>
#include	<linux/kallsyms.h>
#include	<linux/cpu.h>
#include	<linux/sysctl.h>
#include	<linux/module.h>
#include	<linux/rcupdate.h>
#include	<linux/string.h>
#include	<linux/uaccess.h>
#include	<linux/nodemask.h>
#include	<linux/mempolicy.h>
#include	<linux/mutex.h>
#include	<linux/fault-inject.h>
#include	<linux/rtmutex.h>
#include	<linux/reciprocal_div.h>

#include	<asm/cacheflush.h>
#include	<asm/tlbflush.h>
#include	<asm/page.h>
    
/*
 * DEBUG	- 1 for kmem_cache_create() to honour SLAB_RED_ZONE & SLAB_POISON.
 *		  0 for faster, smaller code (especially in the critical paths).
     *
     * STATS	- 1 to collect stats for /proc/slabinfo.
     *		  0 for faster, smaller code (especially in the critical paths).
     *
     * FORCED_DEBUG	- 1 enables SLAB_RED_ZONE and SLAB_POISON (if possible)
     */
    
    #ifdef CONFIG_DEBUG_SLAB
    #define	DEBUG		1
    #define	STATS		1
    #define	FORCED_DEBUG	1
    #else
    #define	DEBUG		0
    #define	STATS		0
    #define	FORCED_DEBUG	0
    #endif
    
    /* Shouldn't this be in a header file somewhere? */
    #define	BYTES_PER_WORD		sizeof(void *)
    
    #ifndef cache_line_size
    #define cache_line_size()	L1_CACHE_BYTES
    #endif
    
    #ifndef ARCH_KMALLOC_MINALIGN
    /*
     * Enforce a minimum alignment for the kmalloc caches.
     * Usually, the kmalloc caches are cache_line_size() aligned, except when
     * DEBUG and FORCED_DEBUG are enabled, then they are BYTES_PER_WORD aligned.
 * Some archs want to perform DMA into kmalloc caches and need a guaranteed
 * alignment larger than the alignment of a 64-bit integer.
 * ARCH_KMALLOC_MINALIGN allows that.
 * Note that increasing this value may disable some debug features.
 */
#define ARCH_KMALLOC_MINALIGN __alignof__(unsigned long long)
#endif
    
    #ifndef ARCH_SLAB_MINALIGN
    /*
     * Enforce a minimum alignment for all caches.
     * Intended for archs that get misalignment faults even for BYTES_PER_WORD
     * aligned buffers. Includes ARCH_KMALLOC_MINALIGN.
     * If possible: Do not enable this flag for CONFIG_DEBUG_SLAB, it disables
     * some debug features.
     */
    #define ARCH_SLAB_MINALIGN 0
    #endif
    
    #ifndef ARCH_KMALLOC_FLAGS
    #define ARCH_KMALLOC_FLAGS SLAB_HWCACHE_ALIGN
    #endif
    
    /* Legal flag mask for kmem_cache_create(). */
#if DEBUG
# define CREATE_MASK	(SLAB_RED_ZONE | \
			 SLAB_POISON | SLAB_HWCACHE_ALIGN | \
			 SLAB_CACHE_DMA | \
			 SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \
			 SLAB_DESTROY_BY_RCU | SLAB_MEM_SPREAD)
#else
# define CREATE_MASK	(SLAB_HWCACHE_ALIGN | \
			 SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \
			 SLAB_DESTROY_BY_RCU | SLAB_MEM_SPREAD)
#endif
    
    /*
     * kmem_bufctl_t:
     *
 * Bufctls are used for linking objs within a slab, via linked offsets.
     *
     * This implementation relies on "struct page" for locating the cache &
     * slab an object belongs to.
     * This allows the bufctl structure to be small (one int), but limits
     * the number of objects a slab (not a cache) can contain when off-slab
     * bufctls are used. The limit is the size of the largest general cache
     * that does not use off-slab slabs.
 * For 32bit archs with 4 kB pages, this is 56.
     * This is not serious, as it is only for large objects, when it is unwise
     * to have too many per slab.
     * Note: This limit can be raised by introducing a general cache whose size
     * is less than 512 (PAGE_SIZE<<3), but greater than 256.
     */
    
    
typedef unsigned int kmem_bufctl_t;
#define BUFCTL_END	(((kmem_bufctl_t)(~0U))-0)
#define BUFCTL_FREE	(((kmem_bufctl_t)(~0U))-1)
#define	BUFCTL_ACTIVE	(((kmem_bufctl_t)(~0U))-2)
#define	SLAB_LIMIT	(((kmem_bufctl_t)(~0U))-3)

/*
 * struct slab
 *
 * Manages the objs in a slab. Placed either at the beginning of mem allocated
 * for a slab, or allocated from a general cache.
 * Slabs are chained into three lists: fully used, partial, fully free slabs.
 */
struct slab {
	struct list_head list;
	unsigned long colouroff;
	void *s_mem;		/* including colour offset */
	unsigned int inuse;	/* num of objs active in slab */
	kmem_bufctl_t free;
	unsigned short nodeid;
};
    
    /*
     * struct slab_rcu
     *
     * slab_destroy on a SLAB_DESTROY_BY_RCU cache uses this structure to
     * arrange for kmem_freepages to be called via RCU.  This is useful if
     * we need to approach a kernel structure obliquely, from its address
     * obtained without the usual locking.  We can lock the structure to
     * stabilize it and check it's still at the given address, only if we
     * can be sure that the memory has not been meanwhile reused for some
     * other kind of object (which our subsystem's lock might corrupt).
     *
     * rcu_read_lock before reading the address, then rcu_read_unlock after
     * taking the spinlock within the structure expected at that address.
     *
     * We assume struct slab_rcu can overlay struct slab when destroying.
     */
struct slab_rcu {
	struct rcu_head head;
	struct kmem_cache *cachep;
	void *addr;
};
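
/*
 * Illustrative sketch of the lockless-lookup pattern described above
 * (lookup_obj(), obj->lock and obj_still_at_addr() are hypothetical names,
 * not APIs of this file):
 *
 *	rcu_read_lock();
 *	obj = lookup_obj(addr);		// address obtained without locking
 *	spin_lock(&obj->lock);		// RCU keeps the memory from being
 *	rcu_read_unlock();		// reused as another kind of object
 *	if (obj_still_at_addr(obj, addr))
 *		...use obj...
 *	spin_unlock(&obj->lock);
 */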
    
    /*
     * struct array_cache
     *
     * Purpose:
     * - LIFO ordering, to hand out cache-warm objects from _alloc
     * - reduce the number of linked list operations
     * - reduce spinlock operations
     *
     * The limit is stored in the per-cpu structure to reduce the data cache
     * footprint.
     *
     */
    struct array_cache {
    	unsigned int avail;
    	unsigned int limit;
    	unsigned int batchcount;
	unsigned int touched;
	spinlock_t lock;
	void *entry[0];	/*
			 * Must have this definition in here for the proper
			 * alignment of array_cache. Also simplifies accessing
			 * the entries.
			 * [0] is for gcc 2.95. It should really be [].
			 */
};
    
    
/*
 * bootstrap: The caches do not work without cpuarrays anymore, but the
 * cpuarrays are allocated from the generic caches...
 */
#define BOOT_CPUCACHE_ENTRIES	1
struct arraycache_init {
	struct array_cache cache;
	void *entries[BOOT_CPUCACHE_ENTRIES];
};

/*
 * The slab lists for all objects.
 */
struct kmem_list3 {
	struct list_head slabs_partial;	/* partial list first, better asm code */
	struct list_head slabs_full;
	struct list_head slabs_free;
	unsigned long free_objects;
	unsigned int free_limit;
	unsigned int colour_next;	/* Per-node cache coloring */
	spinlock_t list_lock;
	struct array_cache *shared;	/* shared per node */
	struct array_cache **alien;	/* on other nodes */
	unsigned long next_reap;	/* updated without locking */
	int free_touched;		/* updated without locking */
};
    
    
    /*
     * Need this for bootstrapping a per node allocator.
     */
    #define NUM_INIT_LISTS (2 * MAX_NUMNODES + 1)
    struct kmem_list3 __initdata initkmem_list3[NUM_INIT_LISTS];
    #define	CACHE_CACHE 0
    #define	SIZE_AC 1
    #define	SIZE_L3 (1 + MAX_NUMNODES)
    
    
    static int drain_freelist(struct kmem_cache *cache,
    			struct kmem_list3 *l3, int tofree);
    static void free_block(struct kmem_cache *cachep, void **objpp, int len,
    			int node);
    
    static int enable_cpucache(struct kmem_cache *cachep);
    
    static void cache_reap(struct work_struct *unused);
    
/*
 * This function must be completely optimized away if a constant is passed to
 * it.  Mostly the same as what is in linux/slab.h except it returns an index.
 */
static __always_inline int index_of(const size_t size)
{
	extern void __bad_size(void);

	if (__builtin_constant_p(size)) {
		int i = 0;

#define CACHE(x) \
	if (size <=x) \
		return i; \
	else \
		i++;
#include "linux/kmalloc_sizes.h"
#undef CACHE
		__bad_size();
	} else
		__bad_size();
	return 0;
}

static int slab_early_init = 1;
    
    
    #define INDEX_AC index_of(sizeof(struct arraycache_init))
    #define INDEX_L3 index_of(sizeof(struct kmem_list3))
    
    
    
    static void kmem_list3_init(struct kmem_list3 *parent)
    
    {
    	INIT_LIST_HEAD(&parent->slabs_full);
    	INIT_LIST_HEAD(&parent->slabs_partial);
    	INIT_LIST_HEAD(&parent->slabs_free);
    	parent->shared = NULL;
    	parent->alien = NULL;
    
    	spin_lock_init(&parent->list_lock);
    	parent->free_objects = 0;
    	parent->free_touched = 0;
    }
    
    
#define MAKE_LIST(cachep, listp, slab, nodeid)				\
	do {								\
		INIT_LIST_HEAD(listp);					\
		list_splice(&(cachep->nodelists[nodeid]->slab), listp);	\
	} while (0)

#define	MAKE_ALL_LISTS(cachep, ptr, nodeid)				\
	do {								\
	MAKE_LIST((cachep), (&(ptr)->slabs_full), slabs_full, nodeid);	\
	MAKE_LIST((cachep), (&(ptr)->slabs_partial), slabs_partial, nodeid); \
	MAKE_LIST((cachep), (&(ptr)->slabs_free), slabs_free, nodeid);	\
	} while (0)
    
/*
 * struct kmem_cache
 *
 * manages a cache.
 */

struct kmem_cache {
/* 1) per-cpu data, touched during every alloc/free */
	struct array_cache *array[NR_CPUS];
    
/* 2) Cache tunables. Protected by cache_chain_mutex */
	unsigned int batchcount;
	unsigned int limit;
	unsigned int shared;

	unsigned int buffer_size;
	u32 reciprocal_buffer_size;
/* 3) touched by every alloc & free from the backend */

	unsigned int flags;		/* constant flags */
	unsigned int num;		/* # of objs per slab */

/* 4) cache_grow/shrink */
	/* order of pgs per slab (2^n) */
	unsigned int gfporder;

	/* force GFP flags, e.g. GFP_DMA */
	gfp_t gfpflags;

	size_t colour;			/* cache colouring range */
	unsigned int colour_off;	/* colour offset */
	struct kmem_cache *slabp_cache;
	unsigned int slab_size;
	unsigned int dflags;		/* dynamic flags */

	/* constructor func */
	void (*ctor) (void *, struct kmem_cache *, unsigned long);

/* 5) cache creation/removal */
	const char *name;
	struct list_head next;

/* 6) statistics */
#if STATS
	unsigned long num_active;
	unsigned long num_allocations;
	unsigned long high_mark;
	unsigned long grown;
	unsigned long reaped;
	unsigned long errors;
	unsigned long max_freeable;
	unsigned long node_allocs;
	unsigned long node_frees;
	unsigned long node_overflow;
	atomic_t allochit;
	atomic_t allocmiss;
	atomic_t freehit;
	atomic_t freemiss;
#endif
#if DEBUG
	/*
	 * If debugging is enabled, then the allocator can add additional
	 * fields and/or padding to every object. buffer_size contains the total
	 * object size including these internal fields, the following two
	 * variables contain the offset to the user object and its size.
	 */
	int obj_offset;
	int obj_size;
#endif
	/*
	 * We put nodelists[] at the end of kmem_cache, because we want to size
	 * this array to nr_node_ids slots instead of MAX_NUMNODES
	 * (see kmem_cache_init())
	 * We still use [MAX_NUMNODES] and not [1] or [0] because cache_cache
	 * is statically defined, so we reserve the max number of nodes.
	 */
	struct kmem_list3 *nodelists[MAX_NUMNODES];
	/*
	 * Do not add fields after nodelists[]
	 */
};
    
    #define CFLGS_OFF_SLAB		(0x80000000UL)
    #define	OFF_SLAB(x)	((x)->flags & CFLGS_OFF_SLAB)
    
    #define BATCHREFILL_LIMIT	16
    
/*
 * Optimization question: fewer reaps means less probability for unnecessary
 * cpucache drain/refill cycles.
 *
 * OTOH the cpuarrays can contain lots of objects,
 * which could lock up otherwise freeable slabs.
 */
    #define REAPTIMEOUT_CPUC	(2*HZ)
    #define REAPTIMEOUT_LIST3	(4*HZ)
    
#if STATS
#define	STATS_INC_ACTIVE(x)	((x)->num_active++)
#define	STATS_DEC_ACTIVE(x)	((x)->num_active--)
#define	STATS_INC_ALLOCED(x)	((x)->num_allocations++)
#define	STATS_INC_GROWN(x)	((x)->grown++)
#define	STATS_ADD_REAPED(x,y)	((x)->reaped += (y))
#define	STATS_SET_HIGH(x)						\
	do {								\
		if ((x)->num_active > (x)->high_mark)			\
			(x)->high_mark = (x)->num_active;		\
	} while (0)
#define	STATS_INC_ERR(x)	((x)->errors++)
#define	STATS_INC_NODEALLOCS(x)	((x)->node_allocs++)
#define	STATS_INC_NODEFREES(x)	((x)->node_frees++)
#define STATS_INC_ACOVERFLOW(x)   ((x)->node_overflow++)
#define	STATS_SET_FREEABLE(x, i)					\
	do {								\
		if ((x)->max_freeable < i)				\
			(x)->max_freeable = i;				\
	} while (0)
#define STATS_INC_ALLOCHIT(x)	atomic_inc(&(x)->allochit)
#define STATS_INC_ALLOCMISS(x)	atomic_inc(&(x)->allocmiss)
#define STATS_INC_FREEHIT(x)	atomic_inc(&(x)->freehit)
#define STATS_INC_FREEMISS(x)	atomic_inc(&(x)->freemiss)
#else
#define	STATS_INC_ACTIVE(x)	do { } while (0)
#define	STATS_DEC_ACTIVE(x)	do { } while (0)
#define	STATS_INC_ALLOCED(x)	do { } while (0)
#define	STATS_INC_GROWN(x)	do { } while (0)
#define	STATS_ADD_REAPED(x,y)	do { } while (0)
#define	STATS_SET_HIGH(x)	do { } while (0)
#define	STATS_INC_ERR(x)	do { } while (0)
#define	STATS_INC_NODEALLOCS(x)	do { } while (0)
#define	STATS_INC_NODEFREES(x)	do { } while (0)
#define STATS_INC_ACOVERFLOW(x)   do { } while (0)
#define	STATS_SET_FREEABLE(x, i) do { } while (0)
#define STATS_INC_ALLOCHIT(x)	do { } while (0)
#define STATS_INC_ALLOCMISS(x)	do { } while (0)
#define STATS_INC_FREEHIT(x)	do { } while (0)
#define STATS_INC_FREEMISS(x)	do { } while (0)
#endif
    
#if DEBUG

/*
 * memory layout of objects:
 * 0		: objp
 * 0 .. cachep->obj_offset - BYTES_PER_WORD - 1: padding. This ensures that
 * 		the end of an object is aligned with the end of the real
 * 		allocation. Catches writes behind the end of the allocation.
 * cachep->obj_offset - BYTES_PER_WORD .. cachep->obj_offset - 1:
 * 		redzone word.
 * cachep->obj_offset: The real object.
 * cachep->buffer_size - 2* BYTES_PER_WORD: redzone word [BYTES_PER_WORD long]
 * cachep->buffer_size - 1* BYTES_PER_WORD: last caller address
 *					[BYTES_PER_WORD long]
 */
static int obj_offset(struct kmem_cache *cachep)
{
	return cachep->obj_offset;
}

static int obj_size(struct kmem_cache *cachep)
{
	return cachep->obj_size;
}

static unsigned long long *dbg_redzone1(struct kmem_cache *cachep, void *objp)
{
	BUG_ON(!(cachep->flags & SLAB_RED_ZONE));
	return (unsigned long long*) (objp + obj_offset(cachep) -
				      sizeof(unsigned long long));
}

static unsigned long long *dbg_redzone2(struct kmem_cache *cachep, void *objp)
{
	BUG_ON(!(cachep->flags & SLAB_RED_ZONE));
	if (cachep->flags & SLAB_STORE_USER)
		return (unsigned long long *)(objp + cachep->buffer_size -
					      sizeof(unsigned long long) -
					      BYTES_PER_WORD);
	return (unsigned long long *) (objp + cachep->buffer_size -
				       sizeof(unsigned long long));
}

static void **dbg_userword(struct kmem_cache *cachep, void *objp)
{
	BUG_ON(!(cachep->flags & SLAB_STORE_USER));
	return (void **)(objp + cachep->buffer_size - BYTES_PER_WORD);
}

#else

#define obj_offset(x)			0
#define obj_size(cachep)		(cachep->buffer_size)
#define dbg_redzone1(cachep, objp)	({BUG(); (unsigned long long *)NULL;})
#define dbg_redzone2(cachep, objp)	({BUG(); (unsigned long long *)NULL;})
#define dbg_userword(cachep, objp)	({BUG(); (void **)NULL;})

#endif
    
    /*
     * Do not go above this order unless 0 objects fit into the slab.
     */
    #define	BREAK_GFP_ORDER_HI	1
    #define	BREAK_GFP_ORDER_LO	0
    static int slab_break_gfp_order = BREAK_GFP_ORDER_LO;
    
    
/*
 * Functions for storing/retrieving the cachep and/or slab from the page
 * allocator.  These are used to find the slab an obj belongs to.  With kfree(),
 * these are used to find the cache which an obj belongs to.
 */
    
    static inline void page_set_cache(struct page *page, struct kmem_cache *cache)
    {
    	page->lru.next = (struct list_head *)cache;
    }
    
    static inline struct kmem_cache *page_get_cache(struct page *page)
    {
    
    	page = compound_head(page);
    
    	BUG_ON(!PageSlab(page));
    
    	return (struct kmem_cache *)page->lru.next;
    }
    
    static inline void page_set_slab(struct page *page, struct slab *slab)
    {
    	page->lru.prev = (struct list_head *)slab;
    }
    
    static inline struct slab *page_get_slab(struct page *page)
    {
    
    	BUG_ON(!PageSlab(page));
    
    	return (struct slab *)page->lru.prev;
    }
    
    
    
    static inline struct kmem_cache *virt_to_cache(const void *obj)
    {
    
    	struct page *page = virt_to_head_page(obj);
    
    	return page_get_cache(page);
    }
    
    static inline struct slab *virt_to_slab(const void *obj)
    {
    
    	struct page *page = virt_to_head_page(obj);
    
    	return page_get_slab(page);
    }
    
    
    static inline void *index_to_obj(struct kmem_cache *cache, struct slab *slab,
    				 unsigned int idx)
    {
    	return slab->s_mem + cache->buffer_size * idx;
    }
    
    
    /*
     * We want to avoid an expensive divide : (offset / cache->buffer_size)
     *   Using the fact that buffer_size is a constant for a particular cache,
     *   we can replace (offset / cache->buffer_size) by
     *   reciprocal_divide(offset, cache->reciprocal_buffer_size)
     */
static inline unsigned int obj_to_index(const struct kmem_cache *cache,
					const struct slab *slab, void *obj)
{
	u32 offset = (obj - slab->s_mem);
	return reciprocal_divide(offset, cache->reciprocal_buffer_size);
}
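
/*
 * Illustrative note (not part of the allocator): obj_to_index() is meant to
 * behave like the plain divide (obj - slab->s_mem) / cache->buffer_size,
 * e.g. with a hypothetical 256-byte buffer_size an object at s_mem + 1024
 * maps to index 4; reciprocal_divide() gets the same result with a multiply
 * and shift by the precomputed cache->reciprocal_buffer_size.
 */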
    
/*
 * These are the default caches for kmalloc. Custom caches can have other sizes.
 */
    struct cache_sizes malloc_sizes[] = {
    #define CACHE(x) { .cs_size = (x) },
    #include <linux/kmalloc_sizes.h>
    	CACHE(ULONG_MAX)
    #undef CACHE
    };
    EXPORT_SYMBOL(malloc_sizes);
    
    /* Must match cache_sizes above. Out of line to keep cache footprint low. */
    struct cache_names {
    	char *name;
    	char *name_dma;
    };
    
    static struct cache_names __initdata cache_names[] = {
    #define CACHE(x) { .name = "size-" #x, .name_dma = "size-" #x "(DMA)" },
#include <linux/kmalloc_sizes.h>
#undef CACHE
    };
    
static struct arraycache_init initarray_cache __initdata =
    { {0, BOOT_CPUCACHE_ENTRIES, 1, 0} };
static struct arraycache_init initarray_generic =
    { {0, BOOT_CPUCACHE_ENTRIES, 1, 0} };

/* internal cache of cache description objs */
static struct kmem_cache cache_cache = {
	.batchcount = 1,
	.limit = BOOT_CPUCACHE_ENTRIES,
	.shared = 1,
	.buffer_size = sizeof(struct kmem_cache),
	.name = "kmem_cache",
};
    
    
    #define BAD_ALIEN_MAGIC 0x01020304ul
    
    
#ifdef CONFIG_LOCKDEP

/*
 * Slab sometimes uses the kmalloc slabs to store the slab headers
 * for other slabs "off slab".
 * The locking for this is tricky in that it nests within the locks
 * of all other slabs in a few places; to deal with this special
 * locking we put on-slab caches into a separate lock-class.
 *
 * We set lock class for alien array caches which are up during init.
 * The lock annotation will be lost if all cpus of a node go down and
 * then come back up during hotplug
 */
static struct lock_class_key on_slab_l3_key;
static struct lock_class_key on_slab_alc_key;

static inline void init_lock_keys(void)
{
	int q;
	struct cache_sizes *s = malloc_sizes;

	while (s->cs_size != ULONG_MAX) {
		for_each_node(q) {
			struct array_cache **alc;
			int r;
			struct kmem_list3 *l3 = s->cs_cachep->nodelists[q];
			if (!l3 || OFF_SLAB(s->cs_cachep))
				continue;
			lockdep_set_class(&l3->list_lock, &on_slab_l3_key);
			alc = l3->alien;
			/*
			 * FIXME: This check for BAD_ALIEN_MAGIC
			 * should go away when common slab code is taught to
			 * work even without alien caches.
			 * Currently, non NUMA code returns BAD_ALIEN_MAGIC
			 * for alloc_alien_cache,
			 */
			if (!alc || (unsigned long)alc == BAD_ALIEN_MAGIC)
				continue;
			for_each_node(r) {
				if (alc[r])
					lockdep_set_class(&alc[r]->lock,
					     &on_slab_alc_key);
			}
		}
		s++;
	}
}
#else
static inline void init_lock_keys(void)
{
}
#endif

/*
 * 1. Guard access to the cache-chain.
 * 2. Protect sanity of cpu_online_map against cpu hotplug events
 */
static DEFINE_MUTEX(cache_chain_mutex);
    static struct list_head cache_chain;
    
    /*
     * chicken and egg problem: delay the per-cpu array allocation
     * until the general caches are up.
     */
static enum {
	NONE,
	PARTIAL_AC,
	PARTIAL_L3,
	FULL
} g_cpucache_up;
    
    
    /*
     * used by boot code to determine if it can use slab based allocator
     */
    int slab_is_available(void)
    {
    	return g_cpucache_up == FULL;
    }
    
    
    static DEFINE_PER_CPU(struct delayed_work, reap_work);
    
    
    
static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep)
{
    	return cachep->array[smp_processor_id()];
    }
    
    
static inline struct kmem_cache *__find_general_cachep(size_t size,
							gfp_t gfpflags)
{
	struct cache_sizes *csizep = malloc_sizes;

#if DEBUG
	/* This happens if someone tries to call
	 * kmem_cache_create(), or __kmalloc(), before
	 * the generic caches are initialized.
	 */
	BUG_ON(malloc_sizes[INDEX_AC].cs_cachep == NULL);
#endif
    	while (size > csizep->cs_size)
    		csizep++;
    
	/*
	 * Really subtle: The last entry with cs->cs_size==ULONG_MAX
	 * has cs_{dma,}cachep==NULL. Thus no special case
	 * for large kmalloc calls required.
	 */
	if (unlikely(gfpflags & GFP_DMA))
		return csizep->cs_dmacachep;
	return csizep->cs_cachep;
    }
    
    
    static struct kmem_cache *kmem_find_general_cachep(size_t size, gfp_t gfpflags)
    
    {
    	return __find_general_cachep(size, gfpflags);
    }
    
    
static size_t slab_mgmt_size(size_t nr_objs, size_t align)
{
	return ALIGN(sizeof(struct slab)+nr_objs*sizeof(kmem_bufctl_t), align);
}
    
    
/*
 * Calculate the number of objects and left-over bytes for a given buffer size.
 */
static void cache_estimate(unsigned long gfporder, size_t buffer_size,
			   size_t align, int flags, size_t *left_over,
			   unsigned int *num)
{
	int nr_objs;
	size_t mgmt_size;
	size_t slab_size = PAGE_SIZE << gfporder;
    
    
    	/*
    	 * The slab management structure can be either off the slab or
    	 * on it. For the latter case, the memory allocated for a
    	 * slab is used for:
    	 *
    	 * - The struct slab
    	 * - One kmem_bufctl_t for each object
    	 * - Padding to respect alignment of @align
    	 * - @buffer_size bytes for each object
    	 *
    	 * If the slab management structure is off the slab, then the
    	 * alignment will already be calculated into the size. Because
    	 * the slabs are all pages aligned, the objects will be at the
    	 * correct alignment when allocated.
    	 */
    	if (flags & CFLGS_OFF_SLAB) {
    		mgmt_size = 0;
    		nr_objs = slab_size / buffer_size;
    
    		if (nr_objs > SLAB_LIMIT)
    			nr_objs = SLAB_LIMIT;
    	} else {
    		/*
    		 * Ignore padding for the initial guess. The padding
    		 * is at most @align-1 bytes, and @buffer_size is at
    		 * least @align. In the worst case, this result will
    		 * be one greater than the number of objects that fit
    		 * into the memory allocation when taking the padding
    		 * into account.
    		 */
    		nr_objs = (slab_size - sizeof(struct slab)) /
    			  (buffer_size + sizeof(kmem_bufctl_t));
    
    		/*
    		 * This calculated number will be either the right
    		 * amount, or one greater than what we want.
    		 */
    		if (slab_mgmt_size(nr_objs, align) + nr_objs*buffer_size
    		       > slab_size)
    			nr_objs--;
    
    		if (nr_objs > SLAB_LIMIT)
    			nr_objs = SLAB_LIMIT;
    
    		mgmt_size = slab_mgmt_size(nr_objs, align);
    	}
    	*num = nr_objs;
	*left_over = slab_size - nr_objs*buffer_size - mgmt_size;
}
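
/*
 * Worked example (illustrative only; the figures assume a 32-bit build with
 * sizeof(struct slab) == 28 and sizeof(kmem_bufctl_t) == 4): for
 * gfporder == 0 (slab_size == 4096), buffer_size == 256, align == 32 and
 * on-slab management, the initial guess is (4096 - 28) / (256 + 4) == 15
 * objects; slab_mgmt_size(15, 32) == ALIGN(28 + 15*4, 32) == 96 and
 * 96 + 15*256 == 3936 <= 4096, so *num == 15 and
 * *left_over == 4096 - 3840 - 96 == 160.
 */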
    
    #define slab_error(cachep, msg) __slab_error(__FUNCTION__, cachep, msg)
    
    
static void __slab_error(const char *function, struct kmem_cache *cachep,
			char *msg)
{
	printk(KERN_ERR "slab error in %s(): cache `%s': %s\n",
	       function, cachep->name, msg);
    	dump_stack();
    }
    
    
    /*
     * By default on NUMA we use alien caches to stage the freeing of
     * objects allocated from other nodes. This causes massive memory
     * inefficiencies when using fake NUMA setup to split memory into a
     * large number of small nodes, so it can be disabled on the command
 * line.
 */
    
    static int use_alien_caches __read_mostly = 1;
    static int __init noaliencache_setup(char *s)
    {
    	use_alien_caches = 0;
    	return 1;
    }
    __setup("noaliencache", noaliencache_setup);
    
    
    #ifdef CONFIG_NUMA
    /*
     * Special reaping functions for NUMA systems called from cache_reap().
     * These take care of doing round robin flushing of alien caches (containing
     * objects freed on different nodes from which they were allocated) and the
     * flushing of remote pcps by calling drain_node_pages.
     */
    static DEFINE_PER_CPU(unsigned long, reap_node);
    
    static void init_reap_node(int cpu)
    {
    	int node;
    
	node = next_node(cpu_to_node(cpu), node_online_map);
	if (node == MAX_NUMNODES)
		node = first_node(node_online_map);

	per_cpu(reap_node, cpu) = node;
}
    
    static void next_reap_node(void)
    {
    	int node = __get_cpu_var(reap_node);
    
    	node = next_node(node, node_online_map);
    	if (unlikely(node >= MAX_NUMNODES))
    		node = first_node(node_online_map);
    	__get_cpu_var(reap_node) = node;
    }
    
    #else
    #define init_reap_node(cpu) do { } while (0)
    #define next_reap_node(void) do { } while (0)
    #endif
    
    
    /*
     * Initiate the reap timer running on the target CPU.  We run at around 1 to 2Hz
     * via the workqueue/eventd.
     * Add the CPU number into the expiration time to minimize the possibility of
     * the CPUs getting into lockstep and contending for the global cache chain
     * lock.
     */
    static void __devinit start_cpu_timer(int cpu)
{
	struct delayed_work *reap_work = &per_cpu(reap_work, cpu);

	/*
	 * When this gets called from do_initcalls via cpucache_init(),
	 * init_workqueues() has already run, so keventd will be setup
	 * at that time.
	 */
	if (keventd_up() && reap_work->work.func == NULL) {
		INIT_DELAYED_WORK(reap_work, cache_reap);
		schedule_delayed_work_on(cpu, reap_work,
					__round_jiffies_relative(HZ, cpu));
	}
}

    
    					    int batchcount)
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    {
    
    	int memsize = sizeof(void *) * entries + sizeof(struct array_cache);
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    	struct array_cache *nc = NULL;
    
    
    	nc = kmalloc_node(memsize, GFP_KERNEL, node);
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    	if (nc) {
    		nc->avail = 0;
    		nc->limit = entries;
    		nc->batchcount = batchcount;
    		nc->touched = 0;
    
    		spin_lock_init(&nc->lock);
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    	}
    	return nc;
    }
    
    
    /*
     * Transfer objects in one arraycache to another.
     * Locking must be handled by the caller.
     *
     * Return the number of entries transferred.
     */
    static int transfer_objects(struct array_cache *to,
    		struct array_cache *from, unsigned int max)
    {
    	/* Figure out how many entries to transfer */
    	int nr = min(min(from->avail, max), to->limit - to->avail);
    
    	if (!nr)
    		return 0;
    
    	memcpy(to->entry + to->avail, from->entry + from->avail -nr,
    			sizeof(void *) *nr);
    
    	from->avail -= nr;
    	to->avail += nr;
    	to->touched = 1;
    	return nr;
    }
    
    
    #ifndef CONFIG_NUMA
    
    #define drain_alien_cache(cachep, alien) do { } while (0)
    #define reap_alien(cachep, l3) do { } while (0)
    
    static inline struct array_cache **alloc_alien_cache(int node, int limit)
    {
    	return (struct array_cache **)BAD_ALIEN_MAGIC;
    }
    
    static inline void free_alien_cache(struct array_cache **ac_ptr)
    {