Skip to content
Snippets Groups Projects
slab.c 97.4 KiB
Newer Older
			l3->free_limit -= cachep->batchcount;
			if (nc)
				free_block(cachep, nc->entry, nc->avail, node);

			if (!cpus_empty(mask)) {
				spin_unlock(&l3->list_lock);
				goto unlock_cache;
			}

			if (l3->shared) {
				free_block(cachep, l3->shared->entry,
					   l3->shared->avail, node);
				kfree(l3->shared);
				l3->shared = NULL;
			}
			if (l3->alien) {
				drain_alien_cache(cachep, l3);
				free_alien_cache(l3->alien);
				l3->alien = NULL;
			}

			/* free slabs belonging to this node */
			if (__node_shrink(cachep, node)) {
				cachep->nodelists[node] = NULL;
				spin_unlock(&l3->list_lock);
				kfree(l3);
			} else {
				spin_unlock(&l3->list_lock);
			}
		      unlock_cache:
Linus Torvalds's avatar
Linus Torvalds committed
			spin_unlock_irq(&cachep->spinlock);
			kfree(nc);
		}
		mutex_unlock(&cache_chain_mutex);
Linus Torvalds's avatar
Linus Torvalds committed
		break;
#endif
	}
	return NOTIFY_OK;
	mutex_unlock(&cache_chain_mutex);
Linus Torvalds's avatar
Linus Torvalds committed
	return NOTIFY_BAD;
}

static struct notifier_block cpucache_notifier = { &cpuup_callback, NULL, 0 };

/*
 * init_list - swap a static bootstrap kmem_list3 for kmalloc'ed memory
 * @cachep: cache whose per-node list structure is being replaced
 * @list:   the kmem_list3 currently installed for @nodeid
 * @nodeid: node whose list structure is replaced
 *
 * BUGs if @list is not what is currently installed for @nodeid, or if
 * the replacement structure cannot be allocated.
 */
static void init_list(kmem_cache_t *cachep, struct kmem_list3 *list, int nodeid)
{
	struct kmem_list3 *ptr;

	BUG_ON(cachep->nodelists[nodeid] != list);
	ptr = kmalloc_node(sizeof(struct kmem_list3), GFP_KERNEL, nodeid);
	BUG_ON(!ptr);

	local_irq_disable();
	memcpy(ptr, list, sizeof(struct kmem_list3));
	/*
	 * Do not assume that a spinlock can be set up by a byte copy:
	 * give the new structure a freshly initialised list_lock.
	 */
	spin_lock_init(&ptr->list_lock);

	/* presumably re-anchors the copied list heads on @ptr -- confirm */
	MAKE_ALL_LISTS(cachep, ptr, nodeid);
	cachep->nodelists[nodeid] = ptr;
	local_irq_enable();
}

Linus Torvalds's avatar
Linus Torvalds committed
/* Initialisation.
 * Called after the gfp() functions have been enabled, and before smp_init().
 */
void __init kmem_cache_init(void)
{
	size_t left_over;
	struct cache_sizes *sizes;
	struct cache_names *names;
	int i;

	for (i = 0; i < NUM_INIT_LISTS; i++) {
		kmem_list3_init(&initkmem_list3[i]);
		if (i < MAX_NUMNODES)
			cache_cache.nodelists[i] = NULL;
	}

	/*
	 * Fragmentation resistance on low memory - only use bigger
	 * page orders on machines with more than 32MB of memory.
	 */
	if (num_physpages > (32 << 20) >> PAGE_SHIFT)
		slab_break_gfp_order = BREAK_GFP_ORDER_HI;

	/* Bootstrap is tricky, because several objects are allocated
	 * from caches that do not exist yet:
	 * 1) initialize the cache_cache cache: it contains the kmem_cache_t
	 *    structures of all caches, except cache_cache itself: cache_cache
	 *    is statically allocated.
	 *    Initially an __init data area is used for the head array and the
	 *    kmem_list3 structures, it's replaced with a kmalloc allocated
	 *    array at the end of the bootstrap.
	 * 2) Create the first kmalloc cache.
	 *    The kmem_cache_t for the new cache is allocated normally.
	 *    An __init data area is used for the head array.
	 * 3) Create the remaining kmalloc caches, with minimally sized
	 *    head arrays.
	 * 4) Replace the __init data head arrays for cache_cache and the first
	 *    kmalloc cache with kmalloc allocated arrays.
	 * 5) Replace the __init data for kmem_list3 for cache_cache and
	 *    the other caches with kmalloc allocated memory.
	 * 6) Resize the head arrays of the kmalloc caches to their final sizes.
	 */

	/* 1) create the cache_cache */
	INIT_LIST_HEAD(&cache_chain);
	list_add(&cache_cache.next, &cache_chain);
	cache_cache.colour_off = cache_line_size();
	cache_cache.array[smp_processor_id()] = &initarray_cache.cache;
	cache_cache.nodelists[numa_node_id()] = &initkmem_list3[CACHE_CACHE];

	cache_cache.buffer_size = ALIGN(cache_cache.buffer_size, cache_line_size());

	cache_estimate(0, cache_cache.buffer_size, cache_line_size(), 0,
		       &left_over, &cache_cache.num);
	if (!cache_cache.num)
		BUG();

	cache_cache.colour = left_over / cache_cache.colour_off;
	cache_cache.colour_next = 0;
	cache_cache.slab_size = ALIGN(cache_cache.num * sizeof(kmem_bufctl_t) +
				      sizeof(struct slab), cache_line_size());

	/* 2+3) create the kmalloc caches */
	sizes = malloc_sizes;
	names = cache_names;

	/* Initialize the caches that provide memory for the array cache
	 * and the kmem_list3 structures first.
	 * Without this, further allocations will bug
	 */

	sizes[INDEX_AC].cs_cachep = kmem_cache_create(names[INDEX_AC].name,
						      sizes[INDEX_AC].cs_size,
						      ARCH_KMALLOC_MINALIGN,
						      (ARCH_KMALLOC_FLAGS |
						       SLAB_PANIC), NULL, NULL);

	if (INDEX_AC != INDEX_L3)
		sizes[INDEX_L3].cs_cachep =
		    kmem_cache_create(names[INDEX_L3].name,
				      sizes[INDEX_L3].cs_size,
				      ARCH_KMALLOC_MINALIGN,
				      (ARCH_KMALLOC_FLAGS | SLAB_PANIC), NULL,
				      NULL);

	while (sizes->cs_size != ULONG_MAX) {
		/*
		 * For performance, all the general caches are L1 aligned.
		 * This should be particularly beneficial on SMP boxes, as it
		 * eliminates "false sharing".
		 * Note for systems short on memory removing the alignment will
		 * allow tighter packing of the smaller caches.
		 */
		/* INDEX_AC / INDEX_L3 entries were already created above */
		if (!sizes->cs_cachep)
			sizes->cs_cachep = kmem_cache_create(names->name,
							     sizes->cs_size,
							     ARCH_KMALLOC_MINALIGN,
							     (ARCH_KMALLOC_FLAGS
							      | SLAB_PANIC),
							     NULL, NULL);

		/* Inc off-slab bufctl limit until the ceiling is hit. */
		if (!(OFF_SLAB(sizes->cs_cachep))) {
			offslab_limit = sizes->cs_size - sizeof(struct slab);
			offslab_limit /= sizeof(kmem_bufctl_t);
		}

		sizes->cs_dmacachep = kmem_cache_create(names->name_dma,
							sizes->cs_size,
							ARCH_KMALLOC_MINALIGN,
							(ARCH_KMALLOC_FLAGS |
							 SLAB_CACHE_DMA |
							 SLAB_PANIC), NULL,
							NULL);

		sizes++;
		names++;
	}
	/* 4) Replace the bootstrap head arrays */
	{
		void *ptr;

		ptr = kmalloc(sizeof(struct arraycache_init), GFP_KERNEL);
		/* same policy as init_list(): no way to continue without it */
		BUG_ON(!ptr);

		local_irq_disable();
		BUG_ON(cpu_cache_get(&cache_cache) != &initarray_cache.cache);
		memcpy(ptr, cpu_cache_get(&cache_cache),
		       sizeof(struct arraycache_init));
		cache_cache.array[smp_processor_id()] = ptr;
		local_irq_enable();

		ptr = kmalloc(sizeof(struct arraycache_init), GFP_KERNEL);
		BUG_ON(!ptr);

		local_irq_disable();
		BUG_ON(cpu_cache_get(malloc_sizes[INDEX_AC].cs_cachep)
		       != &initarray_generic.cache);
		memcpy(ptr, cpu_cache_get(malloc_sizes[INDEX_AC].cs_cachep),
		       sizeof(struct arraycache_init));
		/*
		 * NOTE(review): the right-hand side of this assignment was
		 * missing; "ptr" mirrors the cache_cache replacement above.
		 */
		malloc_sizes[INDEX_AC].cs_cachep->array[smp_processor_id()] =
		    ptr;
		local_irq_enable();
	}
	/* 5) Replace the bootstrap kmem_list3's */
	{
		int node;
		/* Replace the static kmem_list3 structures for the boot cpu */
		init_list(&cache_cache, &initkmem_list3[CACHE_CACHE],
			  numa_node_id());

		for_each_online_node(node) {
			init_list(malloc_sizes[INDEX_AC].cs_cachep,
				  &initkmem_list3[SIZE_AC + node], node);

			if (INDEX_AC != INDEX_L3) {
				init_list(malloc_sizes[INDEX_L3].cs_cachep,
					  &initkmem_list3[SIZE_L3 + node],
					  node);
			}
		}
	}

	/* 6) resize the head arrays to their final sizes */
	{
		kmem_cache_t *cachep;
		mutex_lock(&cache_chain_mutex);
		list_for_each_entry(cachep, &cache_chain, next)
		    enable_cpucache(cachep);
		mutex_unlock(&cache_chain_mutex);
	}

	/* Done! */
	g_cpucache_up = FULL;

	/* Register a cpu startup notifier callback
	 * that initializes cpu_cache_get for all new cpus
	 */
	register_cpu_notifier(&cpucache_notifier);

	/* The reap timers are started later, with a module init call:
	 * That part of the kernel is not yet operational.
	 */
}

/*
 * cpucache_init - start the per-cpu timers for every online CPU
 *
 * Run as an initcall. Always returns 0.
 */
static int __init cpucache_init(void)
{
	int cpu;

	/*
	 * Register the timers that return unneeded
	 * pages to gfp.
	 */
	for_each_online_cpu(cpu)
		start_cpu_timer(cpu);

	return 0;
}

__initcall(cpucache_init);

/*
 * Interface to system's page allocator. No need to hold the cache-lock.
 *
 * If we requested dmaable memory, we will get it. Even if we
 * did not request dmaable memory, we might get it, but that
 * would be relatively rare and ignorable.
 *
 * Allocates 2^gfporder pages on @nodeid with @flags ORed with the cache's
 * own gfpflags, accounts them as slab pages and marks each one PageSlab.
 * Returns page_address() of the first page, or NULL if the allocation
 * failed.
 */
static void *kmem_getpages(kmem_cache_t *cachep, gfp_t flags, int nodeid)
{
	struct page *page;
	void *addr;
	int i;

	flags |= cachep->gfpflags;
	page = alloc_pages_node(nodeid, flags, cachep->gfporder);
	if (!page)
		return NULL;
	addr = page_address(page);

	i = (1 << cachep->gfporder);
	if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
		atomic_add(i, &slab_reclaim_pages);
	add_page_state(nr_slab, i);
	while (i--) {
		SetPageSlab(page);
		page++;
	}
	return addr;
}

/*
 * Interface to system's page release.
 *
 * Clears PageSlab on each of the 2^gfporder pages starting at @addr (BUGs
 * if a page was not marked), reverses the accounting done by
 * kmem_getpages() and frees the pages.
 */
static void kmem_freepages(kmem_cache_t *cachep, void *addr)
{
	unsigned long i = (1 << cachep->gfporder);
	struct page *page = virt_to_page(addr);
	const unsigned long nr_freed = i;

	while (i--) {
		if (!TestClearPageSlab(page))
			BUG();
		page++;
	}
	sub_page_state(nr_slab, nr_freed);
	/* credit the freed pages to the current task's reclaim state, if any */
	if (current->reclaim_state)
		current->reclaim_state->reclaimed_slab += nr_freed;
	free_pages((unsigned long)addr, cachep->gfporder);
	if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
		atomic_sub(1 << cachep->gfporder, &slab_reclaim_pages);
}

static void kmem_rcu_free(struct rcu_head *head)
{
	struct slab_rcu *slab_rcu = (struct slab_rcu *)head;
Linus Torvalds's avatar
Linus Torvalds committed
	kmem_cache_t *cachep = slab_rcu->cachep;

	kmem_freepages(cachep, slab_rcu->addr);
	if (OFF_SLAB(cachep))
		kmem_cache_free(cachep->slabp_cache, slab_rcu);
}

#if DEBUG

#ifdef CONFIG_DEBUG_PAGEALLOC
/*
 * store_stackinfo - record who touched an object inside the object itself
 * @cachep: cache the object belongs to
 * @addr:   start of the object buffer (obj_offset() is applied here)
 * @caller: address recorded as the caller
 *
 * Layout written: 0x12345678, @caller, the current CPU id, then kernel
 * text addresses found on the stack while room remains, and finally
 * 0x87654321. Objects smaller than five words are left untouched.
 */
static void store_stackinfo(kmem_cache_t *cachep, unsigned long *addr,
			    unsigned long caller)
{
	int size = obj_size(cachep);

	addr = (unsigned long *)&((char *)addr)[obj_offset(cachep)];

	if (size < 5 * sizeof(unsigned long))
		return;

	*addr++ = 0x12345678;
	*addr++ = caller;
	*addr++ = smp_processor_id();
	size -= 3 * sizeof(unsigned long);
	{
		/* scan upwards from our own argument slot */
		unsigned long *sptr = &caller;
		unsigned long svalue;

		while (!kstack_end(sptr)) {
			svalue = *sptr++;
			if (kernel_text_address(svalue)) {
				*addr++ = svalue;
				size -= sizeof(unsigned long);
				/* keep one word free for the end marker */
				if (size <= sizeof(unsigned long))
					break;
			}
		}

	}
	*addr++ = 0x87654321;
}
#endif

/*
 * poison_obj - fill an object's payload with a poison byte
 * @cachep: cache the object belongs to
 * @addr:   start of the object buffer (obj_offset() is applied here)
 * @val:    byte value to fill with
 *
 * The last payload byte is always set to POISON_END.
 */
static void poison_obj(kmem_cache_t *cachep, void *addr, unsigned char val)
{
	int size = obj_size(cachep);
	addr = &((char *)addr)[obj_offset(cachep)];

	memset(addr, val, size);
	*(unsigned char *)(addr + size - 1) = POISON_END;
}

/*
 * dump_line - print one hexdump line at KERN_ERR level
 * @data:   base of the buffer being dumped
 * @offset: offset of the first byte to print; also printed as the label
 * @limit:  number of bytes to print
 */
static void dump_line(char *data, int offset, int limit)
{
	int i;
	printk(KERN_ERR "%03x:", offset);
	for (i = 0; i < limit; i++) {
		printk(" %02x", (unsigned char)data[offset + i]);
	}
	printk("\n");
}
#endif

#if DEBUG

/*
 * print_objinfo - print debug metadata and a partial hexdump of an object
 * @cachep: cache the object belongs to
 * @objp:   start of the object buffer
 * @lines:  maximum number of 16-byte hexdump lines to print (0 = none)
 *
 * Prints the red zone words and the last-user word when the cache has
 * SLAB_RED_ZONE / SLAB_STORE_USER set.
 */
static void print_objinfo(kmem_cache_t *cachep, void *objp, int lines)
{
	int i, size;
	char *realobj;

	if (cachep->flags & SLAB_RED_ZONE) {
		printk(KERN_ERR "Redzone: 0x%lx/0x%lx.\n",
		       *dbg_redzone1(cachep, objp),
		       *dbg_redzone2(cachep, objp));
	}

	if (cachep->flags & SLAB_STORE_USER) {
		printk(KERN_ERR "Last user: [<%p>]",
		       *dbg_userword(cachep, objp));
		print_symbol("(%s)",
			     (unsigned long)*dbg_userword(cachep, objp));
		printk("\n");
	}
	realobj = (char *)objp + obj_offset(cachep);
	size = obj_size(cachep);
	for (i = 0; i < size && lines; i += 16, lines--) {
		int limit;
		limit = 16;
		/* the final line may be shorter than 16 bytes */
		if (i + limit > size)
			limit = size - i;
		dump_line(realobj, i, limit);
	}
}

/*
 * check_poison_obj - verify that a free object still holds the poison pattern
 * @cachep: cache the object belongs to
 * @objp:   start of the object buffer
 *
 * Every payload byte must be POISON_FREE, except the last which must be
 * POISON_END. Each 16-byte line containing a mismatch is hexdumped (the
 * scan stops after six lines), and when anything was reported the
 * neighbouring objects in the same slab are printed as well.
 */
static void check_poison_obj(kmem_cache_t *cachep, void *objp)
{
	char *realobj;
	int size, i;
	int lines = 0;

	realobj = (char *)objp + obj_offset(cachep);
	size = obj_size(cachep);

	for (i = 0; i < size; i++) {
		char exp = POISON_FREE;
		if (i == size - 1)
			exp = POISON_END;
		if (realobj[i] != exp) {
			int limit;
			/* Mismatch ! */
			/* Print header */
			if (lines == 0) {
				printk(KERN_ERR
				       "Slab corruption: start=%p, len=%d\n",
				       realobj, size);
				print_objinfo(cachep, objp, 0);
			}
			/* Hexdump the affected line */
			i = (i / 16) * 16;
			limit = 16;
			if (i + limit > size)
				limit = size - i;
			dump_line(realobj, i, limit);
			/*
			 * Move to the last byte of the dumped line; the
			 * loop's i++ then lands on the first byte of the
			 * next line.  (Adding 16 here skipped that byte.)
			 */
			i += 15;
			lines++;
			/* Stop once more than 5 lines have been dumped */
			if (lines > 5)
				break;
		}
	}
	if (lines != 0) {
		/* Print some data about the neighboring objects, if they
		 * exist:
		 */
		struct slab *slabp = virt_to_slab(objp);
		int objnr;

		objnr = (unsigned)(objp - slabp->s_mem) / cachep->buffer_size;
		if (objnr) {
			objp = slabp->s_mem + (objnr - 1) * cachep->buffer_size;
			realobj = (char *)objp + obj_offset(cachep);
			printk(KERN_ERR "Prev obj: start=%p, len=%d\n",
			       realobj, size);
			print_objinfo(cachep, objp, 2);
		}
		if (objnr + 1 < cachep->num) {
			objp = slabp->s_mem + (objnr + 1) * cachep->buffer_size;
			realobj = (char *)objp + obj_offset(cachep);
			printk(KERN_ERR "Next obj: start=%p, len=%d\n",
			       realobj, size);
			print_objinfo(cachep, objp, 2);
		}
	}
}
#endif

#if DEBUG
/**
 * slab_destroy_objs - call the registered destructor for each object in
 *      a slab that is to be destroyed.
 * @cachep: cache the slab belongs to
 * @slabp: slab whose objects are being torn down
 *
 * Debug variant: additionally verifies poisoning and red zones of every
 * object.  The destructor is skipped for SLAB_POISON caches.
 */
static void slab_destroy_objs(kmem_cache_t *cachep, struct slab *slabp)
{
	int i;
	for (i = 0; i < cachep->num; i++) {
		void *objp = slabp->s_mem + cachep->buffer_size * i;

		if (cachep->flags & SLAB_POISON) {
#ifdef CONFIG_DEBUG_PAGEALLOC
			/*
			 * NOTE(review): the final argument was missing;
			 * 1 (map the pages back in) is assumed -- confirm.
			 */
			if ((cachep->buffer_size % PAGE_SIZE) == 0
			    && OFF_SLAB(cachep))
				kernel_map_pages(virt_to_page(objp),
						 cachep->buffer_size / PAGE_SIZE,
						 1);
			else
				check_poison_obj(cachep, objp);
#else
			check_poison_obj(cachep, objp);
#endif
		}
		if (cachep->flags & SLAB_RED_ZONE) {
			if (*dbg_redzone1(cachep, objp) != RED_INACTIVE)
				slab_error(cachep, "start of a freed object "
					   "was overwritten");
			if (*dbg_redzone2(cachep, objp) != RED_INACTIVE)
				slab_error(cachep, "end of a freed object "
					   "was overwritten");
		}
		if (cachep->dtor && !(cachep->flags & SLAB_POISON))
			(cachep->dtor) (objp + obj_offset(cachep), cachep, 0);
	}
}
#else
/*
 * slab_destroy_objs - run the cache's destructor, if it has one, on every
 * object of @slabp.
 */
static void slab_destroy_objs(kmem_cache_t *cachep, struct slab *slabp)
{
	if (cachep->dtor) {
		int i;
		for (i = 0; i < cachep->num; i++) {
			void *objp = slabp->s_mem + cachep->buffer_size * i;
			(cachep->dtor) (objp, cachep, 0);
		}
	}
}
#endif

/**
 * Destroy all the objs in a slab, and release the mem back to the system.
 * Before calling the slab must have been unlinked from the cache.
 * The cache-lock is not held/needed.
 *
 * For SLAB_DESTROY_BY_RCU caches the pages are released from an RCU
 * callback (kmem_rcu_free) instead of immediately.
 */
static void slab_destroy(kmem_cache_t *cachep, struct slab *slabp)
{
	/* start of the slab's pages: first object minus the colour offset */
	void *addr = slabp->s_mem - slabp->colouroff;

	slab_destroy_objs(cachep, slabp);
	if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU)) {
		struct slab_rcu *slab_rcu;

		/* the slab descriptor's storage is reused as the RCU record */
		slab_rcu = (struct slab_rcu *)slabp;
		slab_rcu->cachep = cachep;
		slab_rcu->addr = addr;
		call_rcu(&slab_rcu->head, kmem_rcu_free);
	} else {
		kmem_freepages(cachep, addr);
		if (OFF_SLAB(cachep))
			kmem_cache_free(cachep->slabp_cache, slabp);
	}
}

/*
 * For setting up all the kmem_list3s for cache whose buffer_size is same
 * as size of kmem_list3.
 *
 * Points every online node's list at the static initkmem_list3 entry
 * @index + node and gives it an initial next_reap time.
 */
static void set_up_list3s(kmem_cache_t *cachep, int index)
{
	int node;

	for_each_online_node(node) {
		cachep->nodelists[node] = &initkmem_list3[index + node];
		/* the cache address staggers the reap time between caches */
		cachep->nodelists[node]->next_reap = jiffies +
		    REAPTIMEOUT_LIST3 +
		    ((unsigned long)cachep) % REAPTIMEOUT_LIST3;
	}
}

/**
 * calculate_slab_order - calculate size (page order) of slabs and the number
 *                        of objects per slab.
 * @cachep: cache being set up; ->gfporder and ->num are updated
 * @size: object size
 * @align: object alignment
 * @flags: cache flags (CFLGS_OFF_SLAB is examined here)
 *
 * This could be made much more intelligent.  For now, try to avoid using
 * high order pages for slabs.  When the gfp() functions are more friendly
 * towards high-order requests, this should be changed.
 *
 * Returns the left-over bytes per slab for the chosen order.  cachep->num
 * is 0 on return if no order up to MAX_GFP_ORDER was accepted.
 */
static inline size_t calculate_slab_order(kmem_cache_t *cachep, size_t size,
					  size_t align, gfp_t flags)
{
	size_t left_over = 0;

	for (;; cachep->gfporder++) {
		unsigned int num;
		size_t remainder;

		if (cachep->gfporder > MAX_GFP_ORDER) {
			cachep->num = 0;
			break;
		}

		cache_estimate(cachep->gfporder, size, align, flags,
			       &remainder, &num);
		if (!num)
			continue;
		/*
		 * More than offslab_limit objects will cause problems.
		 * Test the estimate for *this* order, not the value stored
		 * by an earlier iteration.
		 */
		if ((flags & CFLGS_OFF_SLAB) && num > offslab_limit)
			break;

		cachep->num = num;
		left_over = remainder;

		/*
		 * Large number of objects is good, but very large slabs are
		 * currently bad for the gfp()s.
		 */
		if (cachep->gfporder >= slab_break_gfp_order)
			break;

		if ((left_over * 8) <= (PAGE_SIZE << cachep->gfporder))
			/* Acceptable internal fragmentation */
			break;
	}
	return left_over;
}

Linus Torvalds's avatar
Linus Torvalds committed
/**
 * kmem_cache_create - Create a cache.
 * @name: A string which is used in /proc/slabinfo to identify this cache.
 * @size: The size of objects to be created in this cache.
 * @align: The required alignment for the objects.
 * @flags: SLAB flags
 * @ctor: A constructor for the objects.
 * @dtor: A destructor for the objects.
 *
 * Returns a ptr to the cache on success, NULL on failure.
 * Cannot be called within an interrupt, but can be interrupted.
 * The @ctor is run when new pages are allocated by the cache
 * and the @dtor is run before the pages are handed back.
 *
 * @name must be valid until the cache is destroyed. This implies that
 * the module calling this has to destroy the cache before getting
 * unloaded.
 *
 * The flags are
 *
 * %SLAB_POISON - Poison the slab with a known test pattern (a5a5a5a5)
 * to catch references to uninitialised memory.
 *
 * %SLAB_RED_ZONE - Insert `Red' zones around the allocated memory to check
 * for buffer overruns.
 *
 * %SLAB_NO_REAP - Don't automatically reap this cache when we're under
 * memory pressure.
 *
 * %SLAB_HWCACHE_ALIGN - Align the objects in this cache to a hardware
 * cacheline.  This can be beneficial if you're counting cycles as closely
 * as davem.
 */
kmem_cache_t *
kmem_cache_create (const char *name, size_t size, size_t align,
	unsigned long flags, void (*ctor)(void*, kmem_cache_t *, unsigned long),
	void (*dtor)(void*, kmem_cache_t *, unsigned long))
{
	size_t left_over, slab_size, ralign;
	kmem_cache_t *cachep = NULL;
	struct list_head *p;

	/*
	 * Sanity checks... these are all serious usage bugs.
	 */
	if ((!name) ||
	    in_interrupt() ||
	    (size < BYTES_PER_WORD) ||
	    (size > (1 << MAX_OBJ_ORDER) * PAGE_SIZE) || (dtor && !ctor)) {
		printk(KERN_ERR "%s: Early error in slab %s\n",
		       __FUNCTION__, name);
		BUG();
	}

	mutex_lock(&cache_chain_mutex);

	list_for_each(p, &cache_chain) {
		kmem_cache_t *pc = list_entry(p, kmem_cache_t, next);
		mm_segment_t old_fs = get_fs();
		char tmp;
		int res;

		/*
		 * This happens when the module gets unloaded and doesn't
		 * destroy its slab cache and no-one else reuses the vmalloc
		 * area of the module.  Print a warning.
		 */
		set_fs(KERNEL_DS);
		res = __get_user(tmp, pc->name);
		set_fs(old_fs);
		if (res) {
			/*
			 * NOTE(review): argument and "continue" were missing
			 * and are reconstructed; the unreadable name must
			 * not reach strcmp() below.
			 */
			printk("SLAB: cache with size %d has lost its name\n",
			       pc->buffer_size);
			continue;
		}

		if (!strcmp(pc->name, name)) {
			printk("kmem_cache_create: duplicate cache %s\n", name);
			dump_stack();
			goto oops;
		}
	}

#if DEBUG
	WARN_ON(strchr(name, ' '));	/* It confuses parsers */
	if ((flags & SLAB_DEBUG_INITIAL) && !ctor) {
		/* No constructor, but initial state check requested */
		printk(KERN_ERR "%s: No con, but init state check "
		       "requested - %s\n", __FUNCTION__, name);
		flags &= ~SLAB_DEBUG_INITIAL;
	}
#if FORCED_DEBUG
	/*
	 * Enable redzoning and last user accounting, except for caches with
	 * large objects, if the increased size would increase the object size
	 * above the next power of two: caches with object sizes just above a
	 * power of two have a significant amount of internal fragmentation.
	 */
	if ((size < 4096
	     || fls(size - 1) == fls(size - 1 + 3 * BYTES_PER_WORD)))
		flags |= SLAB_RED_ZONE | SLAB_STORE_USER;
	if (!(flags & SLAB_DESTROY_BY_RCU))
		flags |= SLAB_POISON;
#endif
	if (flags & SLAB_DESTROY_BY_RCU)
		BUG_ON(flags & SLAB_POISON);
#endif
	if (flags & SLAB_DESTROY_BY_RCU)
		BUG_ON(dtor);

	/*
	 * Always checks flags, a caller might be expecting debug
	 * support which isn't available.
	 */
	if (flags & ~CREATE_MASK)
		BUG();

	/* Check that size is in terms of words.  This is needed to avoid
	 * unaligned accesses for some archs when redzoning is used, and makes
	 * sure any on-slab bufctl's are also correctly aligned.
	 */
	if (size & (BYTES_PER_WORD - 1)) {
		size += (BYTES_PER_WORD - 1);
		size &= ~(BYTES_PER_WORD - 1);
	}

	/* calculate out the final buffer alignment: */
	/* 1) arch recommendation: can be overridden for debug */
	if (flags & SLAB_HWCACHE_ALIGN) {
		/* Default alignment: as specified by the arch code.
		 * Except if an object is really small, then squeeze multiple
		 * objects into one cacheline.
		 */
		ralign = cache_line_size();
		while (size <= ralign / 2)
			ralign /= 2;
	} else {
		ralign = BYTES_PER_WORD;
	}
	/* 2) arch mandated alignment: disables debug if necessary */
	if (ralign < ARCH_SLAB_MINALIGN) {
		ralign = ARCH_SLAB_MINALIGN;
		if (ralign > BYTES_PER_WORD)
			flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER);
	}
	/* 3) caller mandated alignment: disables debug if necessary */
	if (ralign < align) {
		ralign = align;
		if (ralign > BYTES_PER_WORD)
			flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER);
	}
	/* 4) Store it. Note that the debug code below can reduce
	 *    the alignment to BYTES_PER_WORD.
	 */
	align = ralign;

	/* Get cache's description obj. */
	cachep = (kmem_cache_t *) kmem_cache_alloc(&cache_cache, SLAB_KERNEL);
	if (!cachep)
		goto oops;	/* NOTE(review): branch body reconstructed */
	memset(cachep, 0, sizeof(kmem_cache_t));

#if DEBUG
	/*
	 * NOTE(review): reconstructed.  obj_size is read a few lines below
	 * and by the poison/dump helpers, but nothing assigned it after the
	 * memset() above.
	 */
	cachep->obj_size = size;

	if (flags & SLAB_RED_ZONE) {
		/* redzoning only works with word aligned caches */
		align = BYTES_PER_WORD;

		/* add space for red zone words */
		cachep->obj_offset += BYTES_PER_WORD;
		size += 2 * BYTES_PER_WORD;
	}
	if (flags & SLAB_STORE_USER) {
		/* user store requires word alignment and
		 * one word storage behind the end of the real
		 * object.
		 */
		align = BYTES_PER_WORD;
		size += BYTES_PER_WORD;
	}
#if FORCED_DEBUG && defined(CONFIG_DEBUG_PAGEALLOC)
	if (size >= malloc_sizes[INDEX_L3 + 1].cs_size
	    && cachep->obj_size > cache_line_size() && size < PAGE_SIZE) {
		cachep->obj_offset += PAGE_SIZE - size;
		size = PAGE_SIZE;
	}
#endif
#endif

	/* Determine if the slab management is 'on' or 'off' slab. */
	if (size >= (PAGE_SIZE >> 3))
		/*
		 * Size is large, assume best to place the slab management obj
		 * off-slab (should allow better packing of objs).
		 */
		flags |= CFLGS_OFF_SLAB;

	size = ALIGN(size, align);

	if ((flags & SLAB_RECLAIM_ACCOUNT) && size <= PAGE_SIZE) {
		/*
		 * A VFS-reclaimable slab tends to have most allocations
		 * as GFP_NOFS and we really don't want to have to be allocating
		 * higher-order pages when we are unable to shrink dcache.
		 */
		cachep->gfporder = 0;
		cache_estimate(cachep->gfporder, size, align, flags,
			       &left_over, &cachep->num);
	} else
		left_over = calculate_slab_order(cachep, size, align, flags);

	if (!cachep->num) {
		printk("kmem_cache_create: couldn't create cache %s.\n", name);
		kmem_cache_free(&cache_cache, cachep);
		cachep = NULL;
		/* NOTE(review): reconstructed; cachep is NULL from here on */
		goto oops;
	}
	slab_size = ALIGN(cachep->num * sizeof(kmem_bufctl_t)
			  + sizeof(struct slab), align);

	/*
	 * If the slab has been placed off-slab, and we have enough space then
	 * move it on-slab. This is at the expense of any extra colouring.
	 */
	if (flags & CFLGS_OFF_SLAB && left_over >= slab_size) {
		flags &= ~CFLGS_OFF_SLAB;
		left_over -= slab_size;
	}

	if (flags & CFLGS_OFF_SLAB) {
		/* really off slab. No need for manual alignment */
		slab_size =
		    cachep->num * sizeof(kmem_bufctl_t) + sizeof(struct slab);
	}

	cachep->colour_off = cache_line_size();
	/* Offset must be a multiple of the alignment. */
	if (cachep->colour_off < align)
		cachep->colour_off = align;
	cachep->colour = left_over / cachep->colour_off;
	cachep->slab_size = slab_size;
	cachep->flags = flags;
	cachep->gfpflags = 0;
	if (flags & SLAB_CACHE_DMA)
		cachep->gfpflags |= GFP_DMA;
	spin_lock_init(&cachep->spinlock);
	cachep->buffer_size = size;

	if (flags & CFLGS_OFF_SLAB)
		cachep->slabp_cache = kmem_find_general_cachep(slab_size, 0u);
	cachep->ctor = ctor;
	cachep->dtor = dtor;
	cachep->name = name;

	/* Don't let CPUs to come and go */
	lock_cpu_hotplug();

	if (g_cpucache_up == FULL) {
		enable_cpucache(cachep);
	} else {
		if (g_cpucache_up == NONE) {
			/* Note: the first kmem_cache_create must create
			 * the cache that's used by kmalloc(24), otherwise
			 * the creation of further caches will BUG().
			 */
			cachep->array[smp_processor_id()] =
			    &initarray_generic.cache;

			/* If the cache that's used by
			 * kmalloc(sizeof(kmem_list3)) is the first cache,
			 * then we need to set up all its list3s, otherwise
			 * the creation of further caches will BUG().
			 */
			set_up_list3s(cachep, SIZE_AC);
			if (INDEX_AC == INDEX_L3)
				g_cpucache_up = PARTIAL_L3;
			else
				g_cpucache_up = PARTIAL_AC;
		} else {
			cachep->array[smp_processor_id()] =
			    kmalloc(sizeof(struct arraycache_init), GFP_KERNEL);

			if (g_cpucache_up == PARTIAL_AC) {
				set_up_list3s(cachep, SIZE_L3);
				g_cpucache_up = PARTIAL_L3;
			} else {
				int node;
				for_each_online_node(node) {

					cachep->nodelists[node] =
					    kmalloc_node(sizeof
							 (struct kmem_list3),
							 GFP_KERNEL, node);
					BUG_ON(!cachep->nodelists[node]);
					kmem_list3_init(cachep->
							nodelists[node]);
				}
			}
		}
		cachep->nodelists[numa_node_id()]->next_reap =
		    jiffies + REAPTIMEOUT_LIST3 +
		    ((unsigned long)cachep) % REAPTIMEOUT_LIST3;

		BUG_ON(!cpu_cache_get(cachep));
		cpu_cache_get(cachep)->avail = 0;
		cpu_cache_get(cachep)->limit = BOOT_CPUCACHE_ENTRIES;
		cpu_cache_get(cachep)->batchcount = 1;
		cpu_cache_get(cachep)->touched = 0;
		cachep->batchcount = 1;
		cachep->limit = BOOT_CPUCACHE_ENTRIES;
	}

	/* cache setup completed, link it into the list */
	list_add(&cachep->next, &cache_chain);
	unlock_cpu_hotplug();
	/*
	 * NOTE(review): label reconstructed.  Every "goto oops" above runs
	 * before lock_cpu_hotplug(), so only the chain mutex is held here.
	 */
oops:
	if (!cachep && (flags & SLAB_PANIC))
		panic("kmem_cache_create(): failed to create slab `%s'\n",
		      name);
	mutex_unlock(&cache_chain_mutex);
	return cachep;
}
EXPORT_SYMBOL(kmem_cache_create);

#if DEBUG
/* BUG unless local interrupts are disabled. */
static void check_irq_off(void)
{
	BUG_ON(!irqs_disabled());
}

/* BUG unless local interrupts are enabled. */
static void check_irq_on(void)
{
	BUG_ON(irqs_disabled());
}

/*
 * On SMP: check that interrupts are off and that the list_lock of the
 * current node's kmem_list3 is held.  Does nothing on UP.
 */
static void check_spinlock_acquired(kmem_cache_t *cachep)
{
#ifdef CONFIG_SMP
	check_irq_off();
	assert_spin_locked(&cachep->nodelists[numa_node_id()]->list_lock);
#endif
}

/* Same as check_spinlock_acquired(), for an explicitly given @node. */
static void check_spinlock_acquired_node(kmem_cache_t *cachep, int node)
{
#ifdef CONFIG_SMP
	check_irq_off();
	assert_spin_locked(&cachep->nodelists[node]->list_lock);
#endif
}

#else
/* Non-debug builds: the checks expand to nothing. */
#define check_irq_off()	do { } while(0)
#define check_irq_on()	do { } while(0)
#define check_spinlock_acquired(x) do { } while(0)
#define check_spinlock_acquired_node(x, y) do { } while(0)
#endif

/*
 * Waits for all CPUs to execute func().
 *
 * @func is first run on the calling CPU with interrupts disabled, then
 * handed to smp_call_function(); a non-zero return from that is a BUG.
 * Must be entered with interrupts enabled (see check_irq_on()).
 */
static void smp_call_function_all_cpus(void (*func)(void *arg), void *arg)
{
	check_irq_on();
	preempt_disable();

	local_irq_disable();
	func(arg);
	local_irq_enable();

	if (smp_call_function(func, arg, 1, 1))
		BUG();

	preempt_enable();
}

static void drain_array_locked(kmem_cache_t *cachep, struct array_cache *ac,
				int force, int node);
Linus Torvalds's avatar
Linus Torvalds committed

static void do_drain(void *arg)
{
	kmem_cache_t *cachep = (kmem_cache_t *) arg;
Linus Torvalds's avatar
Linus Torvalds committed
	struct array_cache *ac;
Linus Torvalds's avatar
Linus Torvalds committed

	check_irq_off();
	ac = cpu_cache_get(cachep);
	spin_lock(&cachep->nodelists[node]->list_lock);
	free_block(cachep, ac->entry, ac->avail, node);
	spin_unlock(&cachep->nodelists[node]->list_lock);
Linus Torvalds's avatar
Linus Torvalds committed
	ac->avail = 0;
}