    /*
     * linux/mm/slab.c
     * Written by Mark Hemment, 1996/97.
     * (markhe@nextd.demon.co.uk)
     *
     * kmem_cache_destroy() + some cleanup - 1999 Andrea Arcangeli
     *
     * Major cleanup, different bufctl logic, per-cpu arrays
     *	(c) 2000 Manfred Spraul
     *
     * Cleanup, make the head arrays unconditional, preparation for NUMA
     * 	(c) 2002 Manfred Spraul
     *
     * An implementation of the Slab Allocator as described in outline in;
     *	UNIX Internals: The New Frontiers by Uresh Vahalia
     *	Pub: Prentice Hall	ISBN 0-13-101908-2
     * or with a little more detail in;
     *	The Slab Allocator: An Object-Caching Kernel Memory Allocator
     *	Jeff Bonwick (Sun Microsystems).
     *	Presented at: USENIX Summer 1994 Technical Conference
     *
     * The memory is organized in caches, one cache for each object type.
     * (e.g. inode_cache, dentry_cache, buffer_head, vm_area_struct)
 * Each cache consists of many slabs (they are small (usually one
 * page long) and always contiguous), and each slab contains multiple
 * initialized objects.
     *
 * This means that your constructor is used only for newly allocated
 * slabs and you must pass objects with the same initializations to
 * kmem_cache_free.
     *
     * Each cache can only support one memory type (GFP_DMA, GFP_HIGHMEM,
 * normal). If you need a special memory type, then you must create a new
 * cache for that memory type.
     *
     * In order to reduce fragmentation, the slabs are sorted in 3 groups:
     *   full slabs with 0 free objects
     *   partial slabs
     *   empty slabs with no allocated objects
     *
     * If partial slabs exist, then new allocations come from these slabs,
     * otherwise from empty slabs or new slabs are allocated.
     *
     * kmem_cache_destroy() CAN CRASH if you try to allocate from the cache
     * during kmem_cache_destroy(). The caller must prevent concurrent allocs.
     *
     * Each cache has a short per-cpu head array, most allocs
     * and frees go into that array, and if that array overflows, then 1/2
     * of the entries in the array are given back into the global cache.
     * The head array is strictly LIFO and should improve the cache hit rates.
     * On SMP, it additionally reduces the spinlock operations.
 *
 * The c_cpuarray may not be read with local interrupts enabled -
 * it's changed with a smp_call_function().
     *
     * SMP synchronization:
 *  constructors and destructors are called without any locking.
 *  Several members in struct kmem_cache and struct slab never change, they
 *	are accessed without any locking.
     *  The per-cpu arrays are never accessed from the wrong cpu, no locking,
     *  	and local interrupts are disabled so slab code is preempt-safe.
     *  The non-constant members are protected with a per-cache irq spinlock.
     *
     * Many thanks to Mark Hemment, who wrote another per-cpu slab patch
     * in 2000 - many ideas in the current implementation are derived from
     * his patch.
     *
     * Further notes from the original documentation:
     *
 * 11 April '97.  Started multi-threading - markhe
 *	The global cache-chain is protected by the mutex 'slab_mutex'.
 *	The mutex is only needed when accessing/extending the cache-chain, which
     *	can never happen inside an interrupt (kmem_cache_create(),
     *	kmem_cache_shrink() and kmem_cache_reap()).
     *
     *	At present, each engine can be growing a cache.  This should be blocked.
     *
    
     * 15 March 2005. NUMA slab allocator.
     *	Shai Fultheim <shai@scalex86.org>.
     *	Shobhit Dayal <shobhit@calsoftinc.com>
     *	Alok N Kataria <alokk@calsoftinc.com>
     *	Christoph Lameter <christoph@lameter.com>
     *
     *	Modified the slab allocator to be node aware on NUMA systems.
     *	Each node has its own list of partial, free and full slabs.
     *	All object allocations for a node occur from node specific slab lists.
    
     */
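
/*
 * Illustrative sketch (not part of the original file): typical use of the
 * interface described above, assuming a caller-defined struct my_obj and
 * constructor my_obj_ctor().  Objects returned to kmem_cache_free() must
 * still carry the constructor-time initialization, as noted above.
 *
 *	static struct kmem_cache *my_cachep;
 *
 *	my_cachep = kmem_cache_create("my_obj_cache", sizeof(struct my_obj),
 *				      0, SLAB_HWCACHE_ALIGN, my_obj_ctor);
 *	objp = kmem_cache_alloc(my_cachep, GFP_KERNEL);
 *	...
 *	kmem_cache_free(my_cachep, objp);
 *	kmem_cache_destroy(my_cachep);
 */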
    
    #include	<linux/slab.h>
    
#include	<linux/mm.h>
#include	<linux/poison.h>
#include	<linux/swap.h>
#include	<linux/cache.h>
#include	<linux/interrupt.h>
#include	<linux/init.h>
#include	<linux/compiler.h>
#include	<linux/cpuset.h>
#include	<linux/proc_fs.h>
#include	<linux/seq_file.h>
#include	<linux/notifier.h>
#include	<linux/kallsyms.h>
#include	<linux/cpu.h>
#include	<linux/sysctl.h>
#include	<linux/module.h>
#include	<linux/rcupdate.h>
#include	<linux/string.h>
#include	<linux/uaccess.h>
#include	<linux/nodemask.h>
#include	<linux/kmemleak.h>
#include	<linux/mempolicy.h>
#include	<linux/mutex.h>
#include	<linux/fault-inject.h>
#include	<linux/rtmutex.h>
#include	<linux/reciprocal_div.h>
#include	<linux/debugobjects.h>
#include	<linux/kmemcheck.h>
#include	<linux/memory.h>
#include	<linux/prefetch.h>

#include	<net/sock.h>

#include	<asm/cacheflush.h>
#include	<asm/tlbflush.h>
#include	<asm/page.h>
/*
 * DEBUG	- 1 for kmem_cache_create() to honour; SLAB_RED_ZONE & SLAB_POISON.
     *		  0 for faster, smaller code (especially in the critical paths).
     *
     * STATS	- 1 to collect stats for /proc/slabinfo.
     *		  0 for faster, smaller code (especially in the critical paths).
     *
     * FORCED_DEBUG	- 1 enables SLAB_RED_ZONE and SLAB_POISON (if possible)
     */
    
    #ifdef CONFIG_DEBUG_SLAB
    #define	DEBUG		1
    #define	STATS		1
    #define	FORCED_DEBUG	1
    #else
    #define	DEBUG		0
    #define	STATS		0
    #define	FORCED_DEBUG	0
    #endif
    
    /* Shouldn't this be in a header file somewhere? */
    #define	BYTES_PER_WORD		sizeof(void *)
    
    #define	REDZONE_ALIGN		max(BYTES_PER_WORD, __alignof__(unsigned long long))
    
    #ifndef ARCH_KMALLOC_FLAGS
    #define ARCH_KMALLOC_FLAGS SLAB_HWCACHE_ALIGN
    #endif
    
    
    /*
     * true if a page was allocated from pfmemalloc reserves for network-based
     * swap
     */
    static bool pfmemalloc_active __read_mostly;
    
    
    /* Legal flag mask for kmem_cache_create(). */
#if DEBUG
# define CREATE_MASK	(SLAB_RED_ZONE | \
			 SLAB_POISON | SLAB_HWCACHE_ALIGN | \
			 SLAB_CACHE_DMA | \
			 SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \
			 SLAB_DESTROY_BY_RCU | SLAB_MEM_SPREAD | \
			 SLAB_DEBUG_OBJECTS | SLAB_NOLEAKTRACE | SLAB_NOTRACK)
#else
# define CREATE_MASK	(SLAB_HWCACHE_ALIGN | \
			 SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \
			 SLAB_DESTROY_BY_RCU | SLAB_MEM_SPREAD | \
			 SLAB_DEBUG_OBJECTS | SLAB_NOLEAKTRACE | SLAB_NOTRACK)
#endif
    
    /*
     * kmem_bufctl_t:
     *
 * Bufctl's are used for linking objs within a slab, using linked offsets.
 *
 * This implementation relies on "struct page" for locating the cache &
 * slab an object belongs to.
 * This allows the bufctl structure to be small (one int), but limits
 * the number of objects a slab (not a cache) can contain when off-slab
 * bufctls are used. The limit is the size of the largest general cache
 * that does not use off-slab slabs.
 * For 32bit archs with 4 kB pages, this is 56.
 * This is not serious, as it is only for large objects, when it is unwise
 * to have too many per slab.
 * Note: This limit can be raised by introducing a general cache whose size
 * is less than 512 (PAGE_SIZE<<3), but greater than 256.
     */
    
    
    typedef unsigned int kmem_bufctl_t;
    
    #define BUFCTL_END	(((kmem_bufctl_t)(~0U))-0)
    #define BUFCTL_FREE	(((kmem_bufctl_t)(~0U))-1)
    
    #define	BUFCTL_ACTIVE	(((kmem_bufctl_t)(~0U))-2)
    #define	SLAB_LIMIT	(((kmem_bufctl_t)(~0U))-3)
    
    
    /*
     * struct slab_rcu
     *
     * slab_destroy on a SLAB_DESTROY_BY_RCU cache uses this structure to
     * arrange for kmem_freepages to be called via RCU.  This is useful if
     * we need to approach a kernel structure obliquely, from its address
     * obtained without the usual locking.  We can lock the structure to
     * stabilize it and check it's still at the given address, only if we
     * can be sure that the memory has not been meanwhile reused for some
     * other kind of object (which our subsystem's lock might corrupt).
     *
     * rcu_read_lock before reading the address, then rcu_read_unlock after
     * taking the spinlock within the structure expected at that address.
     */
    struct slab_rcu {
    
	struct rcu_head head;
	struct kmem_cache *cachep;
	void *addr;
    };
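
/*
 * Illustrative sketch (not from the original file) of the lookup pattern the
 * comment above describes, for a hypothetical object type struct foo kept in
 * a SLAB_DESTROY_BY_RCU cache:
 *
 *	rcu_read_lock();
 *	foo = lookup_without_lock();	// memory cannot be reused for another
 *					// kind of object while we hold the
 *					// RCU read lock
 *	spin_lock(&foo->lock);
 *	if (foo_still_matches_key(foo))	// re-validate identity under the lock
 *		use(foo);
 *	spin_unlock(&foo->lock);
 *	rcu_read_unlock();
 */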
    
    
    /*
     * struct slab
     *
     * Manages the objs in a slab. Placed either at the beginning of mem allocated
 * for a slab, or allocated from a general cache.
 * Slabs are chained into three lists: fully used, partial, fully free slabs.
     */
    struct slab {
    	union {
    		struct {
    			struct list_head list;
    			unsigned long colouroff;
    			void *s_mem;		/* including colour offset */
    			unsigned int inuse;	/* num of objs active in slab */
    			kmem_bufctl_t free;
    			unsigned short nodeid;
    		};
    		struct slab_rcu __slab_cover_slab_rcu;
    	};
    };
    
    
    /*
     * struct array_cache
     *
     * Purpose:
     * - LIFO ordering, to hand out cache-warm objects from _alloc
     * - reduce the number of linked list operations
     * - reduce spinlock operations
     *
     * The limit is stored in the per-cpu structure to reduce the data cache
     * footprint.
     *
     */
    struct array_cache {
    	unsigned int avail;
    	unsigned int limit;
    	unsigned int batchcount;
    	unsigned int touched;
    
	spinlock_t lock;
	void *entry[];	/*
			 * Must have this definition in here for the proper
			 * alignment of array_cache. Also simplifies accessing
			 * the entries.
			 *
			 * Entries should not be directly dereferenced as
			 * entries belonging to slabs marked pfmemalloc will
			 * have the lower bits set SLAB_OBJ_PFMEMALLOC
			 */
};
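
/*
 * Illustrative sketch (not from the original file): the flexible entry[]
 * array above is used as a LIFO stack of object pointers, roughly:
 *
 *	free:  ac->entry[ac->avail++] = objp;
 *	alloc: objp = ac->entry[--ac->avail];
 *
 * so the most recently freed (cache-warm) object is handed out first.
 */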
    
    
    #define SLAB_OBJ_PFMEMALLOC	1
    static inline bool is_obj_pfmemalloc(void *objp)
    {
    	return (unsigned long)objp & SLAB_OBJ_PFMEMALLOC;
    }
    
    static inline void set_obj_pfmemalloc(void **objp)
    {
    	*objp = (void *)((unsigned long)*objp | SLAB_OBJ_PFMEMALLOC);
    	return;
    }
    
    static inline void clear_obj_pfmemalloc(void **objp)
    {
    	*objp = (void *)((unsigned long)*objp & ~SLAB_OBJ_PFMEMALLOC);
    }
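
/*
 * Illustrative sketch (not from the original file): the helpers above tag a
 * pointer by setting its lowest bit, which is otherwise always zero for
 * word-aligned slab objects:
 *
 *	set_obj_pfmemalloc(&objp);	// objp | 1
 *	is_obj_pfmemalloc(objp);	// true
 *	clear_obj_pfmemalloc(&objp);	// objp & ~1, original pointer again
 */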
    
    
/*
 * bootstrap: The caches do not work without cpuarrays anymore, but the
 * cpuarrays are allocated from the generic caches...
 */
#define BOOT_CPUCACHE_ENTRIES	1
struct arraycache_init {
	struct array_cache cache;
	void *entries[BOOT_CPUCACHE_ENTRIES];
};

/*
 * The slab lists for all objects.
 */
    struct kmem_list3 {
    
	struct list_head slabs_partial;	/* partial list first, better asm code */
	struct list_head slabs_full;
	struct list_head slabs_free;
	unsigned long free_objects;
	unsigned int free_limit;
	unsigned int colour_next;	/* Per-node cache coloring */
	spinlock_t list_lock;
	struct array_cache *shared;	/* shared per node */
	struct array_cache **alien;	/* on other nodes */
	unsigned long next_reap;	/* updated without locking */
	int free_touched;		/* updated without locking */
    };
    
    
    /*
     * Need this for bootstrapping a per node allocator.
     */
    
    #define NUM_INIT_LISTS (3 * MAX_NUMNODES)
    
    static struct kmem_list3 __initdata initkmem_list3[NUM_INIT_LISTS];
    
    #define	CACHE_CACHE 0
    
    #define	SIZE_AC MAX_NUMNODES
    #define	SIZE_L3 (2 * MAX_NUMNODES)
    
    static int drain_freelist(struct kmem_cache *cache,
    			struct kmem_list3 *l3, int tofree);
    static void free_block(struct kmem_cache *cachep, void **objpp, int len,
    			int node);
    
    static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp);
    
    static void cache_reap(struct work_struct *unused);
    
/*
 * This function must be completely optimized away if a constant is passed to
 * it.  Mostly the same as what is in linux/slab.h except it returns an index.
 */
static __always_inline int index_of(const size_t size)
{
	extern void __bad_size(void);

	if (__builtin_constant_p(size)) {
		int i = 0;

#define CACHE(x) \
	if (size <=x) \
		return i; \
	else \
		i++;
#include <linux/kmalloc_sizes.h>
#undef CACHE
		__bad_size();
	} else
		__bad_size();
	return 0;
}

static int slab_early_init = 1;

#define INDEX_AC index_of(sizeof(struct arraycache_init))
#define INDEX_L3 index_of(sizeof(struct kmem_list3))
    
    
static void kmem_list3_init(struct kmem_list3 *parent)
{
	INIT_LIST_HEAD(&parent->slabs_full);
	INIT_LIST_HEAD(&parent->slabs_partial);
	INIT_LIST_HEAD(&parent->slabs_free);
	parent->shared = NULL;
	parent->alien = NULL;
	parent->colour_next = 0;
	spin_lock_init(&parent->list_lock);
	parent->free_objects = 0;
	parent->free_touched = 0;
}
    
    
#define MAKE_LIST(cachep, listp, slab, nodeid)				\
	do {								\
		INIT_LIST_HEAD(listp);					\
		list_splice(&(cachep->nodelists[nodeid]->slab), listp);	\
	} while (0)

#define	MAKE_ALL_LISTS(cachep, ptr, nodeid)				\
	do {								\
	MAKE_LIST((cachep), (&(ptr)->slabs_full), slabs_full, nodeid);	\
	MAKE_LIST((cachep), (&(ptr)->slabs_partial), slabs_partial, nodeid); \
	MAKE_LIST((cachep), (&(ptr)->slabs_free), slabs_free, nodeid);	\
	} while (0)
    
    #define CFLGS_OFF_SLAB		(0x80000000UL)
    #define	OFF_SLAB(x)	((x)->flags & CFLGS_OFF_SLAB)
    
    #define BATCHREFILL_LIMIT	16
    
/*
 * Optimization question: fewer reaps means less probability for unnecessary
 * cpucache drain/refill cycles.
 *
 * OTOH the cpuarrays can contain lots of objects,
     * which could lock up otherwise freeable slabs.
     */
    #define REAPTIMEOUT_CPUC	(2*HZ)
    #define REAPTIMEOUT_LIST3	(4*HZ)
    
    #if STATS
    #define	STATS_INC_ACTIVE(x)	((x)->num_active++)
    #define	STATS_DEC_ACTIVE(x)	((x)->num_active--)
    #define	STATS_INC_ALLOCED(x)	((x)->num_allocations++)
    #define	STATS_INC_GROWN(x)	((x)->grown++)
    
#define	STATS_ADD_REAPED(x,y)	((x)->reaped += (y))
#define	STATS_SET_HIGH(x)						\
	do {								\
		if ((x)->num_active > (x)->high_mark)			\
			(x)->high_mark = (x)->num_active;		\
	} while (0)
#define	STATS_INC_ERR(x)	((x)->errors++)
#define	STATS_INC_NODEALLOCS(x)	((x)->node_allocs++)
#define	STATS_INC_NODEFREES(x)	((x)->node_frees++)
#define STATS_INC_ACOVERFLOW(x)   ((x)->node_overflow++)
#define	STATS_SET_FREEABLE(x, i)					\
	do {								\
		if ((x)->max_freeable < i)				\
			(x)->max_freeable = i;				\
	} while (0)
    #define STATS_INC_ALLOCHIT(x)	atomic_inc(&(x)->allochit)
    #define STATS_INC_ALLOCMISS(x)	atomic_inc(&(x)->allocmiss)
    #define STATS_INC_FREEHIT(x)	atomic_inc(&(x)->freehit)
    #define STATS_INC_FREEMISS(x)	atomic_inc(&(x)->freemiss)
    #else
    #define	STATS_INC_ACTIVE(x)	do { } while (0)
    #define	STATS_DEC_ACTIVE(x)	do { } while (0)
    #define	STATS_INC_ALLOCED(x)	do { } while (0)
    #define	STATS_INC_GROWN(x)	do { } while (0)
    
#define	STATS_ADD_REAPED(x,y)	do { (void)(y); } while (0)
#define	STATS_SET_HIGH(x)	do { } while (0)
#define	STATS_INC_ERR(x)	do { } while (0)
#define	STATS_INC_NODEALLOCS(x)	do { } while (0)
#define	STATS_INC_NODEFREES(x)	do { } while (0)
#define STATS_INC_ACOVERFLOW(x)   do { } while (0)
#define	STATS_SET_FREEABLE(x, i) do { } while (0)
    #define STATS_INC_ALLOCHIT(x)	do { } while (0)
    #define STATS_INC_ALLOCMISS(x)	do { } while (0)
    #define STATS_INC_FREEHIT(x)	do { } while (0)
    #define STATS_INC_FREEMISS(x)	do { } while (0)
    #endif
    
    #if DEBUG
    
    
/*
 * memory layout of objects:
 * 0		: objp
 * 0 .. cachep->obj_offset - BYTES_PER_WORD - 1: padding. This ensures that
 * 		the end of an object is aligned with the end of the real
 * 		allocation. Catches writes behind the end of the allocation.
 * cachep->obj_offset - BYTES_PER_WORD .. cachep->obj_offset - 1:
 * 		redzone word.
 * cachep->obj_offset: The real object.
 * cachep->size - 2* BYTES_PER_WORD: redzone word [BYTES_PER_WORD long]
 * cachep->size - 1* BYTES_PER_WORD: last caller address
 *					[BYTES_PER_WORD long]
 */
static int obj_offset(struct kmem_cache *cachep)
{
	return cachep->obj_offset;
}

static unsigned long long *dbg_redzone1(struct kmem_cache *cachep, void *objp)
{
	BUG_ON(!(cachep->flags & SLAB_RED_ZONE));
	return (unsigned long long*) (objp + obj_offset(cachep) -
				      sizeof(unsigned long long));
}

static unsigned long long *dbg_redzone2(struct kmem_cache *cachep, void *objp)
{
	BUG_ON(!(cachep->flags & SLAB_RED_ZONE));
	if (cachep->flags & SLAB_STORE_USER)
		return (unsigned long long *)(objp + cachep->size -
					      sizeof(unsigned long long) -
					      REDZONE_ALIGN);
	return (unsigned long long *) (objp + cachep->size -
				       sizeof(unsigned long long));
}

static void **dbg_userword(struct kmem_cache *cachep, void *objp)
{
	BUG_ON(!(cachep->flags & SLAB_STORE_USER));
	return (void **)(objp + cachep->size - BYTES_PER_WORD);
}

#else

#define obj_offset(x)			0
#define dbg_redzone1(cachep, objp)	({BUG(); (unsigned long long *)NULL;})
#define dbg_redzone2(cachep, objp)	({BUG(); (unsigned long long *)NULL;})
#define dbg_userword(cachep, objp)	({BUG(); (void **)NULL;})

#endif

#ifdef CONFIG_TRACING
size_t slab_buffer_size(struct kmem_cache *cachep)
{
	return cachep->size;
}
EXPORT_SYMBOL(slab_buffer_size);
#endif
    
    
/*
 * Do not go above this order unless 0 objects fit into the slab or
 * overridden on the command line.
 */
#define	SLAB_MAX_ORDER_HI	1
#define	SLAB_MAX_ORDER_LO	0
static int slab_max_order = SLAB_MAX_ORDER_LO;
static bool slab_max_order_set __initdata;
    
    
static inline struct kmem_cache *page_get_cache(struct page *page)
{
	page = compound_head(page);
	BUG_ON(!PageSlab(page));
	return page->slab_cache;
}

static inline struct kmem_cache *virt_to_cache(const void *obj)
{
	struct page *page = virt_to_head_page(obj);
	return page->slab_cache;
}

static inline struct slab *virt_to_slab(const void *obj)
{
	struct page *page = virt_to_head_page(obj);

	VM_BUG_ON(!PageSlab(page));
	return page->slab_page;
}

static inline void *index_to_obj(struct kmem_cache *cache, struct slab *slab,
				 unsigned int idx)
{
	return slab->s_mem + cache->size * idx;
}

/*
 * We want to avoid an expensive divide : (offset / cache->size)
 *   Using the fact that size is a constant for a particular cache,
 *   we can replace (offset / cache->size) by
 *   reciprocal_divide(offset, cache->reciprocal_buffer_size)
 */
static inline unsigned int obj_to_index(const struct kmem_cache *cache,
					const struct slab *slab, void *obj)
{
	u32 offset = (obj - slab->s_mem);
	return reciprocal_divide(offset, cache->reciprocal_buffer_size);
}
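
/*
 * Worked example (illustrative, not from the original file): the reciprocal
 * trick above turns the divide into a precomputed multiply-and-shift.  For a
 * hypothetical cache with size 192, reciprocal_buffer_size is derived from
 * reciprocal_value(192) at cache setup, and an object starting 384 bytes
 * into the slab yields
 *
 *	obj_to_index() == reciprocal_divide(384, ...) == 384 / 192 == 2
 */
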
    /*
     * These are the default caches for kmalloc. Custom caches can have other sizes.
     */
    
    struct cache_sizes malloc_sizes[] = {
    #define CACHE(x) { .cs_size = (x) },
    #include <linux/kmalloc_sizes.h>
    	CACHE(ULONG_MAX)
    #undef CACHE
    };
    EXPORT_SYMBOL(malloc_sizes);
    
    /* Must match cache_sizes above. Out of line to keep cache footprint low. */
    struct cache_names {
    	char *name;
    	char *name_dma;
    };
    
    static struct cache_names __initdata cache_names[] = {
    #define CACHE(x) { .name = "size-" #x, .name_dma = "size-" #x "(DMA)" },
    #include <linux/kmalloc_sizes.h>
    
	{NULL,}
    #undef CACHE
    };
    
static struct arraycache_init initarray_cache __initdata =
    { {0, BOOT_CPUCACHE_ENTRIES, 1, 0} };

static struct arraycache_init initarray_generic =
    { {0, BOOT_CPUCACHE_ENTRIES, 1, 0} };
    
    /* internal cache of cache description objs */
    
    static struct kmem_list3 *cache_cache_nodelists[MAX_NUMNODES];
    
static struct kmem_cache cache_cache = {
	.nodelists = cache_cache_nodelists,
	.batchcount = 1,
	.limit = BOOT_CPUCACHE_ENTRIES,
	.shared = 1,
	.size = sizeof(struct kmem_cache),
	.name = "kmem_cache",
    };
    
    
    #define BAD_ALIEN_MAGIC 0x01020304ul
    
    
    #ifdef CONFIG_LOCKDEP
    
    /*
     * Slab sometimes uses the kmalloc slabs to store the slab headers
     * for other slabs "off slab".
     * The locking for this is tricky in that it nests within the locks
     * of all other slabs in a few places; to deal with this special
 * locking we put on-slab caches into a separate lock-class.
 *
 * We set lock class for alien array caches which are up during init.
 * The lock annotation will be lost if all cpus of a node go down and
 * then come back up during hotplug.
 */
static struct lock_class_key on_slab_l3_key;
static struct lock_class_key on_slab_alc_key;
    
    static struct lock_class_key debugobj_l3_key;
    static struct lock_class_key debugobj_alc_key;
    
    static void slab_set_lock_classes(struct kmem_cache *cachep,
    		struct lock_class_key *l3_key, struct lock_class_key *alc_key,
    		int q)
    {
    	struct array_cache **alc;
    	struct kmem_list3 *l3;
    	int r;
    
    	l3 = cachep->nodelists[q];
    	if (!l3)
    		return;
    
    	lockdep_set_class(&l3->list_lock, l3_key);
    	alc = l3->alien;
    	/*
    	 * FIXME: This check for BAD_ALIEN_MAGIC
    	 * should go away when common slab code is taught to
    	 * work even without alien caches.
    	 * Currently, non NUMA code returns BAD_ALIEN_MAGIC
    	 * for alloc_alien_cache,
    	 */
    	if (!alc || (unsigned long)alc == BAD_ALIEN_MAGIC)
    		return;
    	for_each_node(r) {
    		if (alc[r])
    			lockdep_set_class(&alc[r]->lock, alc_key);
    	}
    }
    
    static void slab_set_debugobj_lock_classes_node(struct kmem_cache *cachep, int node)
    {
    	slab_set_lock_classes(cachep, &debugobj_l3_key, &debugobj_alc_key, node);
    }
    
    static void slab_set_debugobj_lock_classes(struct kmem_cache *cachep)
    {
    	int node;
    
    	for_each_online_node(node)
    		slab_set_debugobj_lock_classes_node(cachep, node);
    }
    
    
static void init_node_lock_keys(int q)
{
	struct cache_sizes *s = malloc_sizes;

	if (slab_state < UP)
		return;

	for (s = malloc_sizes; s->cs_size != ULONG_MAX; s++) {
		struct kmem_list3 *l3;

		l3 = s->cs_cachep->nodelists[q];
		if (!l3 || OFF_SLAB(s->cs_cachep))
			continue;

		slab_set_lock_classes(s->cs_cachep, &on_slab_l3_key,
				&on_slab_alc_key, q);
	}
}

static inline void init_lock_keys(void)
{
	int node;

	for_each_node(node)
		init_node_lock_keys(node);
}
#else
static void init_node_lock_keys(int q)
{
}

static inline void init_lock_keys(void)
{
}

static void slab_set_debugobj_lock_classes_node(struct kmem_cache *cachep, int node)
{
}

static void slab_set_debugobj_lock_classes(struct kmem_cache *cachep)
{
}
#endif

    static DEFINE_PER_CPU(struct delayed_work, slab_reap_work);
    

static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep)
    {
    	return cachep->array[smp_processor_id()];
    }
    
    
static inline struct kmem_cache *__find_general_cachep(size_t size,
							gfp_t gfpflags)
{
	struct cache_sizes *csizep = malloc_sizes;

#if DEBUG
	/* This happens if someone tries to call
	 * kmem_cache_create(), or __kmalloc(), before
	 * the generic caches are initialized.
	 */
	BUG_ON(malloc_sizes[INDEX_AC].cs_cachep == NULL);
#endif
	if (!size)
		return ZERO_SIZE_PTR;

	while (size > csizep->cs_size)
		csizep++;

	/*
	 * Really subtle: The last entry with cs->cs_size==ULONG_MAX
	 * has cs_{dma,}cachep==NULL. Thus no special case
	 * for large kmalloc calls required.
	 */
	if (unlikely(gfpflags & GFP_DMA))
		return csizep->cs_dmacachep;
	return csizep->cs_cachep;
}

static struct kmem_cache *kmem_find_general_cachep(size_t size, gfp_t gfpflags)
{
	return __find_general_cachep(size, gfpflags);
}
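
/*
 * Illustrative example (not from the original file): a request such as
 * kmalloc(100, GFP_KERNEL) ends up walking malloc_sizes[] until the first
 * entry with cs_size >= 100 (e.g. the "size-128" cache on a common config)
 * and allocates from that cs_cachep; with GFP_DMA set, the matching
 * cs_dmacachep ("size-128(DMA)") is used instead.
 */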
    
    
static size_t slab_mgmt_size(size_t nr_objs, size_t align)
{
	return ALIGN(sizeof(struct slab)+nr_objs*sizeof(kmem_bufctl_t), align);
}
    
    
    /*
     * Calculate the number of objects and left-over bytes for a given buffer size.
     */
    
    static void cache_estimate(unsigned long gfporder, size_t buffer_size,
    			   size_t align, int flags, size_t *left_over,
    			   unsigned int *num)
    {
    	int nr_objs;
    	size_t mgmt_size;
    	size_t slab_size = PAGE_SIZE << gfporder;
    
    	/*
    	 * The slab management structure can be either off the slab or
    	 * on it. For the latter case, the memory allocated for a
    	 * slab is used for:
    	 *
    	 * - The struct slab
    	 * - One kmem_bufctl_t for each object
    	 * - Padding to respect alignment of @align
    	 * - @buffer_size bytes for each object
    	 *
    	 * If the slab management structure is off the slab, then the
    	 * alignment will already be calculated into the size. Because
    	 * the slabs are all pages aligned, the objects will be at the
    	 * correct alignment when allocated.
    	 */
    	if (flags & CFLGS_OFF_SLAB) {
    		mgmt_size = 0;
    		nr_objs = slab_size / buffer_size;
    
    		if (nr_objs > SLAB_LIMIT)
    			nr_objs = SLAB_LIMIT;
    	} else {
    		/*
    		 * Ignore padding for the initial guess. The padding
    		 * is at most @align-1 bytes, and @buffer_size is at
    		 * least @align. In the worst case, this result will
    		 * be one greater than the number of objects that fit
    		 * into the memory allocation when taking the padding
    		 * into account.
    		 */
    		nr_objs = (slab_size - sizeof(struct slab)) /
    			  (buffer_size + sizeof(kmem_bufctl_t));
    
    		/*
    		 * This calculated number will be either the right
    		 * amount, or one greater than what we want.
    		 */
    		if (slab_mgmt_size(nr_objs, align) + nr_objs*buffer_size
    		       > slab_size)
    			nr_objs--;
    
    		if (nr_objs > SLAB_LIMIT)
    			nr_objs = SLAB_LIMIT;
    
    		mgmt_size = slab_mgmt_size(nr_objs, align);
    	}
    	*num = nr_objs;
	*left_over = slab_size - nr_objs*buffer_size - mgmt_size;
}

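/*
 * Worked example (illustrative, not from the original file): for an on-slab
 * cache with gfporder 0 (slab_size = 4096) and buffer_size 256, the initial
 * guess is (4096 - sizeof(struct slab)) / (256 + sizeof(kmem_bufctl_t)),
 * i.e. roughly 15 objects; if slab_mgmt_size(nr_objs, align) plus the object
 * space still exceeds 4096 the guess is decremented once, and whatever is
 * left after objects and management becomes *left_over (used later for
 * cache colouring).
 */
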
    #define slab_error(cachep, msg) __slab_error(__func__, cachep, msg)
    
static void __slab_error(const char *function, struct kmem_cache *cachep,
			char *msg)
{
	printk(KERN_ERR "slab error in %s(): cache `%s': %s\n",
	       function, cachep->name, msg);
    	dump_stack();
    }
    
    
    /*
     * By default on NUMA we use alien caches to stage the freeing of
     * objects allocated from other nodes. This causes massive memory
     * inefficiencies when using fake NUMA setup to split memory into a
     * large number of small nodes, so it can be disabled on the command
     * line
      */
    
    static int use_alien_caches __read_mostly = 1;
    static int __init noaliencache_setup(char *s)
    {
    	use_alien_caches = 0;
    	return 1;
    }
    __setup("noaliencache", noaliencache_setup);
    
    
    static int __init slab_max_order_setup(char *str)
    {
    	get_option(&str, &slab_max_order);
    	slab_max_order = slab_max_order < 0 ? 0 :
    				min(slab_max_order, MAX_ORDER - 1);
    	slab_max_order_set = true;
    
    	return 1;
    }
    __setup("slab_max_order=", slab_max_order_setup);
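
/*
 * Example (illustrative, not from the original file): booting with
 * "slab_max_order=2" on the kernel command line allows caches to use up to
 * order-2 (16 kB with 4 kB pages) slabs; negative values are clamped to 0
 * and anything above MAX_ORDER - 1 is capped by the code above.
 */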
    
    
    #ifdef CONFIG_NUMA
    /*
     * Special reaping functions for NUMA systems called from cache_reap().
     * These take care of doing round robin flushing of alien caches (containing
     * objects freed on different nodes from which they were allocated) and the
     * flushing of remote pcps by calling drain_node_pages.
     */
    
static DEFINE_PER_CPU(unsigned long, slab_reap_node);

static void init_reap_node(int cpu)
{
	int node;

	node = next_node(cpu_to_mem(cpu), node_online_map);
	if (node == MAX_NUMNODES)
		node = first_node(node_online_map);

	per_cpu(slab_reap_node, cpu) = node;
}

static void next_reap_node(void)
{
	int node = __this_cpu_read(slab_reap_node);

	node = next_node(node, node_online_map);
	if (unlikely(node >= MAX_NUMNODES))
		node = first_node(node_online_map);
	__this_cpu_write(slab_reap_node, node);
}
    
    #else
    #define init_reap_node(cpu) do { } while (0)
    #define next_reap_node(void) do { } while (0)
    #endif
    
    
    /*
     * Initiate the reap timer running on the target CPU.  We run at around 1 to 2Hz
     * via the workqueue/eventd.
     * Add the CPU number into the expiration time to minimize the possibility of
     * the CPUs getting into lockstep and contending for the global cache chain
     * lock.
     */
    
static void __cpuinit start_cpu_timer(int cpu)
{
	struct delayed_work *reap_work = &per_cpu(slab_reap_work, cpu);

	/*
	 * When this gets called from do_initcalls via cpucache_init(),
	 * init_workqueues() has already run, so keventd will be setup
	 * at that time.
	 */
	if (keventd_up() && reap_work->work.func == NULL) {
		INIT_DELAYED_WORK_DEFERRABLE(reap_work, cache_reap);
		schedule_delayed_work_on(cpu, reap_work,
					__round_jiffies_relative(HZ, cpu));
	}
}

static struct array_cache *alloc_arraycache(int node, int entries,
					    int batchcount, gfp_t gfp)
{
	int memsize = sizeof(void *) * entries + sizeof(struct array_cache);
	struct array_cache *nc = NULL;

	nc = kmalloc_node(memsize, gfp, node);
    
    	/*
	 * The array_cache structures contain pointers to free objects.
	 * However, when such objects are allocated or transferred to another
	 * cache the pointers are not cleared and they could be counted as
	 * valid references during a kmemleak scan. Therefore, kmemleak must
	 * not scan such objects.
	 */
	kmemleak_no_scan(nc);
    	if (nc) {
    		nc->avail = 0;
    		nc->limit = entries;
    		nc->batchcount = batchcount;
    		nc->touched = 0;
    
    		spin_lock_init(&nc->lock);
    
    	}
    	return nc;
    }
    
    
    static inline bool is_slab_pfmemalloc(struct slab *slabp)
    {
    	struct page *page = virt_to_page(slabp->s_mem);
    
    	return PageSlabPfmemalloc(page);
    }
    
    /* Clears pfmemalloc_active if no slabs have pfmalloc set */
    static void recheck_pfmemalloc_active(struct kmem_cache *cachep,
    						struct array_cache *ac)
    {
    	struct kmem_list3 *l3 = cachep->nodelists[numa_mem_id()];
    	struct slab *slabp;
    	unsigned long flags;
    
    	if (!pfmemalloc_active)
    		return;
    
    	spin_lock_irqsave(&l3->list_lock, flags);
    	list_for_each_entry(slabp, &l3->slabs_full, list)
    		if (is_slab_pfmemalloc(slabp))
    			goto out;
    
    	list_for_each_entry(slabp, &l3->slabs_partial, list)
    		if (is_slab_pfmemalloc(slabp))
    			goto out;
    
    	list_for_each_entry(slabp, &l3->slabs_free, list)
    		if (is_slab_pfmemalloc(slabp))
    			goto out;
    
    	pfmemalloc_active = false;
    out:
    	spin_unlock_irqrestore(&l3->list_lock, flags);
    }
    
    
static void *__ac_get_obj(struct kmem_cache *cachep, struct array_cache *ac,
						gfp_t flags, bool force_refill)
    {
    	int i;
    	void *objp = ac->entry[--ac->avail];
    
    	/* Ensure the caller is allowed to use objects from PFMEMALLOC slab */
    	if (unlikely(is_obj_pfmemalloc(objp))) {
    		struct kmem_list3 *l3;
    
    		if (gfp_pfmemalloc_allowed(flags)) {
    			clear_obj_pfmemalloc(&objp);
    			return objp;
    		}
    
    		/* The caller cannot use PFMEMALLOC objects, find another one */
    		for (i = 1; i < ac->avail; i++) {
    			/* If a !PFMEMALLOC object is found, swap them */
    			if (!is_obj_pfmemalloc(ac->entry[i])) {
    				objp = ac->entry[i];
    				ac->entry[i] = ac->entry[ac->avail];
    				ac->entry[ac->avail] = objp;
    				return objp;
    			}
    		}
    
    		/*
    		 * If there are empty slabs on the slabs_free list and we are
    		 * being forced to refill the cache, mark this one !pfmemalloc.
    		 */
    		l3 = cachep->nodelists[numa_mem_id()];