    /*
     *  linux/mm/page_alloc.c
     *
     *  Manages the free list, the system allocates free pages here.
     *  Note that kmalloc() lives in slab.c
     *
     *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
     *  Swap reorganised 29.12.95, Stephen Tweedie
     *  Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
     *  Reshaped it to be a zoned allocator, Ingo Molnar, Red Hat, 1999
     *  Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999
     *  Zone balancing, Kanoj Sarcar, SGI, Jan 2000
     *  Per cpu hot/cold page lists, bulk allocation, Martin J. Bligh, Sept 2002
     *          (lots of bits borrowed from Ingo Molnar & Andrew Morton)
     */
    
    #include <linux/stddef.h>
    #include <linux/mm.h>
    #include <linux/swap.h>
    #include <linux/interrupt.h>
    #include <linux/pagemap.h>
    #include <linux/bootmem.h>
    #include <linux/compiler.h>
    
    #include <linux/kernel.h>
    
    #include <linux/module.h>
    #include <linux/suspend.h>
    #include <linux/pagevec.h>
    #include <linux/blkdev.h>
    #include <linux/slab.h>
    #include <linux/notifier.h>
    #include <linux/topology.h>
    #include <linux/sysctl.h>
    #include <linux/cpu.h>
    #include <linux/cpuset.h>
    
    #include <linux/memory_hotplug.h>
    
    #include <linux/nodemask.h>
    #include <linux/vmalloc.h>
    
    
    #include <asm/tlbflush.h>
    
    #include <asm/div64.h>
    
    #include "internal.h"
    
    /*
     * MCD - HACK: Find somewhere to initialize this EARLY, or make this
     * initializer cleaner
     */
    
    nodemask_t node_online_map __read_mostly = { { [0] = 1UL } };
    
    EXPORT_SYMBOL(node_online_map);
    
    nodemask_t node_possible_map __read_mostly = NODE_MASK_ALL;
    
    EXPORT_SYMBOL(node_possible_map);
    
    unsigned long totalram_pages __read_mostly;
    unsigned long totalhigh_pages __read_mostly;
    
    unsigned long totalreserve_pages __read_mostly;
    
    long nr_swap_pages;
    
    int percpu_pagelist_fraction;
    
    
    
    static void __free_pages_ok(struct page *page, unsigned int order);
    
    /*
     * results with 256, 32 in the lowmem_reserve sysctl:
     *	1G machine -> (16M dma, 800M-16M normal, 1G-800M high)
     *	1G machine -> (16M dma, 784M normal, 224M high)
     *	NORMAL allocation will leave 784M/256 of ram reserved in the ZONE_DMA
     *	HIGHMEM allocation will leave 224M/32 of ram reserved in ZONE_NORMAL
 *	HIGHMEM allocation will leave (224M+784M)/256 of ram reserved in ZONE_DMA
 *
 * TBD: should special case ZONE_DMA32 machines here - in those we normally
 * don't need any ZONE_NORMAL reservation
     */
    
    int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = { 256, 256, 32 };
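
/*
 * Worked example (illustration only, using the default ratios above): a
 * __GFP_HIGHMEM allocation on the 1G machine must leave 224M/32 = 7M of
 * ZONE_NORMAL free, while a ZONE_NORMAL (e.g. GFP_KERNEL) allocation must
 * leave 784M/256 ~= 3M of ZONE_DMA free.  Raising an entry in
 * /proc/sys/vm/lowmem_reserve_ratio shrinks the corresponding reserve.
 */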
    
    
    EXPORT_SYMBOL(totalram_pages);
    
    /*
     * Used by page_zone() to look up the address of the struct zone whose
     * id is encoded in the upper bits of page->flags
     */
    
    struct zone *zone_table[1 << ZONETABLE_SHIFT] __read_mostly;
    
    EXPORT_SYMBOL(zone_table);
    
    
    static char *zone_names[MAX_NR_ZONES] = { "DMA", "DMA32", "Normal", "HighMem" };
    
    int min_free_kbytes = 1024;
    
    
    unsigned long __meminitdata nr_kernel_pages;
    unsigned long __meminitdata nr_all_pages;
    
    
    
    #ifdef CONFIG_DEBUG_VM
    
    static int page_outside_zone_boundaries(struct zone *zone, struct page *page)
    
{
    
    	int ret = 0;
    	unsigned seq;
    	unsigned long pfn = page_to_pfn(page);
    
    	do {
    		seq = zone_span_seqbegin(zone);
    		if (pfn >= zone->zone_start_pfn + zone->spanned_pages)
    			ret = 1;
    		else if (pfn < zone->zone_start_pfn)
    			ret = 1;
    	} while (zone_span_seqretry(zone, seq));
    
    	return ret;
    
    }
    
    static int page_is_consistent(struct zone *zone, struct page *page)
    {
    
#ifdef CONFIG_HOLES_IN_ZONE
	if (!pfn_valid(page_to_pfn(page)))
		return 0;
#endif
	if (zone != page_zone(page))
		return 0;
    
    	return 1;
    }
    /*
     * Temporary debugging check for pages not lying within a given zone.
     */
    static int bad_range(struct zone *zone, struct page *page)
    {
    	if (page_outside_zone_boundaries(zone, page))
    
    		return 1;
    
    	if (!page_is_consistent(zone, page))
    		return 1;
    
    
    	return 0;
    }
    
    
    #else
    static inline int bad_range(struct zone *zone, struct page *page)
    {
    	return 0;
    }
    #endif
    
    
    static void bad_page(struct page *page)
    
    {
    
    	printk(KERN_EMERG "Bad page state in process '%s'\n"
    
    		KERN_EMERG "page:%p flags:0x%0*lx mapping:%p mapcount:%d count:%d\n"
    		KERN_EMERG "Trying to fix it up, but a reboot is needed\n"
    		KERN_EMERG "Backtrace:\n",
    
    		current->comm, page, (int)(2*sizeof(unsigned long)),
    		(unsigned long)page->flags, page->mapping,
    		page_mapcount(page), page_count(page));
    
    	dump_stack();
    
    	page->flags &= ~(1 << PG_lru	|
    			1 << PG_private |
    
			1 << PG_locked	|
			1 << PG_active	|
			1 << PG_dirty	|
			1 << PG_reclaim |
			1 << PG_slab    |
			1 << PG_swapcache |
			1 << PG_writeback |
			1 << PG_buddy );
    	set_page_count(page, 0);
    	reset_page_mapcount(page);
    	page->mapping = NULL;
    
    	add_taint(TAINT_BAD_PAGE);
    
    }
    
    /*
     * Higher-order pages are called "compound pages".  They are structured thusly:
     *
     * The first PAGE_SIZE page is called the "head page".
     *
     * The remaining PAGE_SIZE pages are called "tail pages".
     *
     * All pages have PG_compound set.  All pages have their ->private pointing at
     * the head page (even the head page has this).
     *
    
     * The first tail page's ->lru.next holds the address of the compound page's
     * put_page() function.  Its ->lru.prev holds the order of allocation.
     * This usage means that zero-order pages may not be compound.
    
     */
    
    
    static void free_compound_page(struct page *page)
    {
    	__free_pages_ok(page, (unsigned long)page[1].lru.prev);
    }
    
    
    static void prep_compound_page(struct page *page, unsigned long order)
    {
    	int i;
    	int nr_pages = 1 << order;
    
    
    	page[1].lru.next = (void *)free_compound_page;	/* set dtor */
    
    	page[1].lru.prev = (void *)order;
    
    	for (i = 0; i < nr_pages; i++) {
    		struct page *p = page + i;
    
    
    		__SetPageCompound(p);
    
    		set_page_private(p, (unsigned long)page);
    
    	}
    }
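
/*
 * Layout sketch (illustration only): after prep_compound_page() on an
 * order-2 block, page[0] is the head page, page[1..3] are tail pages, all
 * four have PG_compound set and ->private pointing at page[0]; page[1] in
 * addition carries the destructor in ->lru.next and the order (2) in
 * ->lru.prev, which is what free_compound_page() reads back.
 */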
    
    static void destroy_compound_page(struct page *page, unsigned long order)
    {
    	int i;
    	int nr_pages = 1 << order;
    
    
    	if (unlikely((unsigned long)page[1].lru.prev != order))
    
    		bad_page(page);
    
    
    	for (i = 0; i < nr_pages; i++) {
    		struct page *p = page + i;
    
    
    		if (unlikely(!PageCompound(p) |
    				(page_private(p) != (unsigned long)page)))
    			bad_page(page);
    
		__ClearPageCompound(p);
	}
}

    static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags)
    {
    	int i;
    
    	BUG_ON((gfp_flags & (__GFP_WAIT | __GFP_HIGHMEM)) == __GFP_HIGHMEM);
    
    	/*
    	 * clear_highpage() will use KM_USER0, so it's a bug to use __GFP_ZERO
    	 * and __GFP_HIGHMEM from hard or soft interrupt context.
    	 */
    	BUG_ON((gfp_flags & __GFP_HIGHMEM) && in_interrupt());
    
    	for (i = 0; i < (1 << order); i++)
    		clear_highpage(page + i);
    }
    
    
    /*
     * function for dealing with page's order in buddy system.
     * zone->lock is already acquired when we use these.
     * So, we don't need atomic page->flags operations here.
     */
    
    static inline unsigned long page_order(struct page *page)
    {
    
	return page_private(page);
}

static inline void set_page_order(struct page *page, int order)
{
	set_page_private(page, order);
	__SetPageBuddy(page);
}

static inline void rmv_page_order(struct page *page)
{
	__ClearPageBuddy(page);
	set_page_private(page, 0);
    }
    
    /*
     * Locate the struct page for both the matching buddy in our
     * pair (buddy1) and the combined O(n+1) page they form (page).
     *
     * 1) Any buddy B1 will have an order O twin B2 which satisfies
     * the following equation:
     *     B2 = B1 ^ (1 << O)
 * For example, if the starting buddy (B1) is #8, its order
     * 1 buddy is #10:
     *     B2 = 8 ^ (1 << 1) = 8 ^ 2 = 10
     *
     * 2) Any buddy B will have an order O+1 parent P which
     * satisfies the following equation:
     *     P = B & ~(1 << O)
     *
    
     * Assumption: *_mem_map is contiguous at least up to MAX_ORDER
    
     */
    static inline struct page *
    __page_find_buddy(struct page *page, unsigned long page_idx, unsigned int order)
    {
    	unsigned long buddy_idx = page_idx ^ (1 << order);
    
    	return page + (buddy_idx - page_idx);
    }
    
    static inline unsigned long
    __find_combined_index(unsigned long page_idx, unsigned int order)
    {
    	return (page_idx & ~(1 << order));
    }
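
/*
 * Worked example (illustration only): for page_idx 8 at order 1,
 * __page_find_buddy() locates the buddy at 8 ^ (1 << 1) = 10, and
 * __find_combined_index() gives 8 & ~(1 << 1) = 8, i.e. the merged
 * order-2 block starts at index 8.
 */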
    
    /*
 * This function checks whether a page is free && is the buddy
 * we can coalesce a page and its buddy if
 * (a) the buddy is not in a hole &&
 * (b) the buddy is in the buddy system &&
 * (c) a page and its buddy have the same order &&
 * (d) a page and its buddy are in the same zone.
 *
 * For recording whether a page is in the buddy system, we use PG_buddy.
 * Setting, clearing, and testing PG_buddy is serialized by zone->lock.
 *
 * For recording page's order, we use page_private(page).
     */
    
    static inline int page_is_buddy(struct page *page, struct page *buddy,
    								int order)
    
{
    
    #ifdef CONFIG_HOLES_IN_ZONE
    
    	if (!pfn_valid(page_to_pfn(buddy)))
    
    		return 0;
    #endif
    
    
    	if (page_zone_id(page) != page_zone_id(buddy))
    		return 0;
    
    	if (PageBuddy(buddy) && page_order(buddy) == order) {
		BUG_ON(page_count(buddy) != 0);
		return 1;
	}
	return 0;
    }
    
    /*
     * Freeing function for a buddy system allocator.
     *
     * The concept of a buddy system is to maintain direct-mapped table
     * (containing bit values) for memory blocks of various "orders".
     * The bottom level table contains the map for the smallest allocatable
     * units of memory (here, pages), and each level above it describes
     * pairs of units from the levels below, hence, "buddies".
     * At a high level, all that happens here is marking the table entry
     * at the bottom level available, and propagating the changes upward
     * as necessary, plus some accounting needed to play nicely with other
     * parts of the VM system.
 * At each level, we keep a list of pages, which are heads of contiguous
 * free pages of length of (1 << order) and marked with PG_buddy. Page's
 * order is recorded in page_private(page) field.
     * So when we are allocating or freeing one, we can derive the state of the
     * other.  That is, if we allocate a small block, and both were   
     * free, the remainder of the region must be split into blocks.   
     * If a block is freed, and its buddy is also free, then this
     * triggers coalescing into a block of larger size.            
     *
     * -- wli
     */
    
    
    static inline void __free_one_page(struct page *page,
    
    		struct zone *zone, unsigned int order)
    {
    	unsigned long page_idx;
    	int order_size = 1 << order;
    
    
    	if (unlikely(PageCompound(page)))
    
    		destroy_compound_page(page, order);
    
    	page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1);
    
    	BUG_ON(page_idx & (order_size - 1));
    	BUG_ON(bad_range(zone, page));
    
    	zone->free_pages += order_size;
    	while (order < MAX_ORDER-1) {
    		unsigned long combined_idx;
    		struct free_area *area;
    		struct page *buddy;
    
    		buddy = __page_find_buddy(page, page_idx, order);
    
    		if (!page_is_buddy(page, buddy, order))
    
    			break;		/* Move the buddy up one level. */
    
    		list_del(&buddy->lru);
    		area = zone->free_area + order;
    		area->nr_free--;
    		rmv_page_order(buddy);
    
    		combined_idx = __find_combined_index(page_idx, order);
    
    		page = page + (combined_idx - page_idx);
    		page_idx = combined_idx;
    		order++;
    	}
    	set_page_order(page, order);
    	list_add(&page->lru, &zone->free_area[order].free_list);
    	zone->free_area[order].nr_free++;
    }
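
/*
 * Example walk-through (illustration only): freeing an order-0 page at
 * index 12 whose buddy at 13 is free merges them into an order-1 block at
 * 12; if the order-1 buddy at 14 is free as well, the loop merges again
 * into an order-2 block at 12 before it is put on free_area[2].free_list.
 */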
    
    
    static inline int free_pages_check(struct page *page)
    
    {
    
    	if (unlikely(page_mapcount(page) |
    		(page->mapping != NULL)  |
    		(page_count(page) != 0)  |
    
    		(page->flags & (
    			1 << PG_lru	|
    			1 << PG_private |
    			1 << PG_locked	|
    			1 << PG_active	|
    			1 << PG_reclaim	|
    			1 << PG_slab	|
    			1 << PG_swapcache |
    
			1 << PG_writeback |
			1 << PG_reserved |
			1 << PG_buddy ))))
		bad_page(page);
    	if (PageDirty(page))
    
    		__ClearPageDirty(page);
    
    	/*
    	 * For now, we report if PG_reserved was found set, but do not
    	 * clear it, and do not free the page.  But we shall soon need
    	 * to do more, for when the ZERO_PAGE count wraps negative.
    	 */
    	return PageReserved(page);
    
    }
    
    /*
     * Frees a list of pages. 
     * Assumes all pages on list are in same zone, and of same order.
    
     * count is the number of pages to free.
    
     *
     * If the zone was previously in an "all pages pinned" state then look to
     * see if this freeing clears that state.
     *
     * And clear the zone's pages_scanned counter, to hold off the "all pages are
     * pinned" detection logic.
     */
    
    static void free_pages_bulk(struct zone *zone, int count,
    					struct list_head *list, int order)
    
    {
    
    	spin_lock(&zone->lock);
    
    	zone->all_unreclaimable = 0;
    	zone->pages_scanned = 0;
    
    	while (count--) {
    		struct page *page;
    
    		BUG_ON(list_empty(list));
    
    		page = list_entry(list->prev, struct page, lru);
    
    		/* have to delete it as __free_one_page list manipulates */
    
    		list_del(&page->lru);
    
    		__free_one_page(page, zone, order);
    
    	}
    
	spin_unlock(&zone->lock);
}

static void free_one_page(struct zone *zone, struct page *page, int order)
{
    	LIST_HEAD(list);
    
    	list_add(&page->lru, &list);
    	free_pages_bulk(zone, 1, &list, order);
    }
    
    static void __free_pages_ok(struct page *page, unsigned int order)
    {
    	unsigned long flags;
    
    	int i;
    
    	int reserved = 0;
    
    
    	arch_free_page(page, order);
    
    	if (!PageHighMem(page))
    
    		debug_check_no_locks_freed(page_address(page),
    					   PAGE_SIZE<<order);
    
    
    	for (i = 0 ; i < (1 << order) ; ++i)
    
    		reserved += free_pages_check(page + i);
    
    	if (reserved)
    		return;
    
    
    	kernel_map_pages(page, 1 << order, 0);
    
    	local_irq_save(flags);
    
    	__count_vm_events(PGFREE, 1 << order);
    
    	free_one_page(page_zone(page), page, order);
    
	local_irq_restore(flags);
}

    /*
     * permit the bootmem allocator to evade page validation on high-order frees
     */
    void fastcall __init __free_pages_bootmem(struct page *page, unsigned int order)
    {
    	if (order == 0) {
    		__ClearPageReserved(page);
    		set_page_count(page, 0);
    
    		set_page_refcounted(page);
    
		__free_page(page);
	} else {
		int loop;

		prefetchw(page);
    
    		for (loop = 0; loop < BITS_PER_LONG; loop++) {
    			struct page *p = &page[loop];
    
    
    			if (loop + 1 < BITS_PER_LONG)
    				prefetchw(p + 1);
    
    			__ClearPageReserved(p);
    			set_page_count(p, 0);
    		}
    
    
    		set_page_refcounted(page);
    
		__free_pages(page, order);
	}
}
    
    /*
     * The order of subdivision here is critical for the IO subsystem.
     * Please do not alter this order without good reasons and regression
     * testing. Specifically, as large blocks of memory are subdivided,
     * the order in which smaller blocks are delivered depends on the order
     * they're subdivided in this function. This is the primary factor
     * influencing the order in which pages are delivered to the IO
     * subsystem according to empirical testing, and this is also justified
     * by considering the behavior of a buddy system containing a single
     * large block of memory acted on by a series of small allocations.
     * This behavior is a critical factor in sglist merging's success.
     *
     * -- wli
     */
    
    static inline void expand(struct zone *zone, struct page *page,
    
     	int low, int high, struct free_area *area)
    {
    	unsigned long size = 1 << high;
    
    	while (high > low) {
    		area--;
    		high--;
    		size >>= 1;
    		BUG_ON(bad_range(zone, &page[size]));
    		list_add(&page[size].lru, &area->free_list);
    		area->nr_free++;
    		set_page_order(&page[size], high);
    	}
    }
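
/*
 * Example (illustration only): serving an order-0 request from an order-3
 * block calls expand() with low = 0, high = 3; the loop queues the upper
 * half as an order-2 block, then an order-1 and an order-0 block, and the
 * first page of the original block is handed back to the caller.
 */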
    
    /*
     * This page is about to be returned from the page allocator
     */
    
    static int prep_new_page(struct page *page, int order, gfp_t gfp_flags)
    
    {
    
    	if (unlikely(page_mapcount(page) |
    		(page->mapping != NULL)  |
    		(page_count(page) != 0)  |
    
    		(page->flags & (
    			1 << PG_lru	|
    
    			1 << PG_private	|
    			1 << PG_locked	|
    			1 << PG_active	|
    			1 << PG_dirty	|
    			1 << PG_reclaim	|
    
    			1 << PG_slab    |
    
    			1 << PG_swapcache |
    
			1 << PG_writeback |
			1 << PG_reserved |
			1 << PG_buddy ))))
		bad_page(page);
    
    
    	/*
    	 * For now, we report if PG_reserved was found set, but do not
    	 * clear it, and do not allocate the page: as a safety net.
    	 */
    	if (PageReserved(page))
    		return 1;
    
    
    	page->flags &= ~(1 << PG_uptodate | 1 << PG_error |
    			1 << PG_referenced | 1 << PG_arch_1 |
    			1 << PG_checked | 1 << PG_mappedtodisk);
    
    	set_page_private(page, 0);
    
    	set_page_refcounted(page);
    
    	kernel_map_pages(page, 1 << order, 1);
    
    
    	if (gfp_flags & __GFP_ZERO)
    		prep_zero_page(page, order, gfp_flags);
    
    	if (order && (gfp_flags & __GFP_COMP))
		prep_compound_page(page, order);

	return 0;
    }
    
    /* 
     * Do the hard work of removing an element from the buddy allocator.
     * Call me with the zone->lock already held.
     */
    static struct page *__rmqueue(struct zone *zone, unsigned int order)
    {
    	struct free_area * area;
    	unsigned int current_order;
    	struct page *page;
    
    	for (current_order = order; current_order < MAX_ORDER; ++current_order) {
    		area = zone->free_area + current_order;
    		if (list_empty(&area->free_list))
    			continue;
    
    		page = list_entry(area->free_list.next, struct page, lru);
    		list_del(&page->lru);
    		rmv_page_order(page);
    		area->nr_free--;
    		zone->free_pages -= 1UL << order;
    
    		expand(zone, page, order, current_order, area);
    		return page;
    
    	}
    
    	return NULL;
    }
    
    /* 
     * Obtain a specified number of elements from the buddy allocator, all under
     * a single hold of the lock, for efficiency.  Add them to the supplied list.
     * Returns the number of new pages which were placed at *list.
     */
    static int rmqueue_bulk(struct zone *zone, unsigned int order, 
    			unsigned long count, struct list_head *list)
    {
    	int i;
    	
    
    	spin_lock(&zone->lock);
    
    	for (i = 0; i < count; ++i) {
    
    		struct page *page = __rmqueue(zone, order);
    		if (unlikely(page == NULL))
    
    			break;
    		list_add_tail(&page->lru, list);
    	}
    
    	spin_unlock(&zone->lock);
    
	return i;
}

#ifdef CONFIG_NUMA
/*
     * Called from the slab reaper to drain pagesets on a particular node that
     * belong to the currently executing processor.
    
     * Note that this function must be called with the thread pinned to
 * a single processor.
 */
void drain_node_pages(int nodeid)
{
	int i, z;
	unsigned long flags;

    	for (z = 0; z < MAX_NR_ZONES; z++) {
    		struct zone *zone = NODE_DATA(nodeid)->node_zones + z;
    
    		struct per_cpu_pageset *pset;
    
    
    		pset = zone_pcp(zone, smp_processor_id());
    
    		for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) {
    			struct per_cpu_pages *pcp;
    
    			pcp = &pset->pcp[i];
    
    			if (pcp->count) {
    				local_irq_save(flags);
    				free_pages_bulk(zone, pcp->count, &pcp->list, 0);
    				pcp->count = 0;
    				local_irq_restore(flags);
			}
		}
	}
}
#endif

    #if defined(CONFIG_PM) || defined(CONFIG_HOTPLUG_CPU)
    static void __drain_pages(unsigned int cpu)
    {
    
    	unsigned long flags;
    
    	struct zone *zone;
    	int i;
    
    	for_each_zone(zone) {
    		struct per_cpu_pageset *pset;
    
    
    		pset = zone_pcp(zone, cpu);
    
    		for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) {
    			struct per_cpu_pages *pcp;
    
    			pcp = &pset->pcp[i];
    
    			local_irq_save(flags);
    
    			free_pages_bulk(zone, pcp->count, &pcp->list, 0);
    			pcp->count = 0;
    
    			local_irq_restore(flags);
    
    		}
    	}
    }
    #endif /* CONFIG_PM || CONFIG_HOTPLUG_CPU */
    
    #ifdef CONFIG_PM
    
    void mark_free_pages(struct zone *zone)
    {
    	unsigned long zone_pfn, flags;
    	int order;
    	struct list_head *curr;
    
    	if (!zone->spanned_pages)
    		return;
    
    	spin_lock_irqsave(&zone->lock, flags);
    	for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn)
    		ClearPageNosaveFree(pfn_to_page(zone_pfn + zone->zone_start_pfn));
    
    	for (order = MAX_ORDER - 1; order >= 0; --order)
    		list_for_each(curr, &zone->free_area[order].free_list) {
    			unsigned long start_pfn, i;
    
    			start_pfn = page_to_pfn(list_entry(curr, struct page, lru));
    
    			for (i=0; i < (1<<order); i++)
    				SetPageNosaveFree(pfn_to_page(start_pfn+i));
    	}
    	spin_unlock_irqrestore(&zone->lock, flags);
    }
    
    /*
     * Spill all of this CPU's per-cpu pages back into the buddy allocator.
     */
    void drain_local_pages(void)
    {
    	unsigned long flags;
    
    	local_irq_save(flags);	
    	__drain_pages(smp_processor_id());
    	local_irq_restore(flags);	
    }
    #endif /* CONFIG_PM */
    
    /*
     * Free a 0-order page
     */
    static void fastcall free_hot_cold_page(struct page *page, int cold)
    {
    	struct zone *zone = page_zone(page);
    	struct per_cpu_pages *pcp;
    	unsigned long flags;
    
    	arch_free_page(page, 0);
    
    	if (PageAnon(page))
    		page->mapping = NULL;
    
    	if (free_pages_check(page))
    
    		return;
    
    	kernel_map_pages(page, 1, 0);
    
    
    	pcp = &zone_pcp(zone, get_cpu())->pcp[cold];
    
    	local_irq_save(flags);
    
    	__count_vm_event(PGFREE);
    
    	list_add(&page->lru, &pcp->list);
    	pcp->count++;
    
    	if (pcp->count >= pcp->high) {
    		free_pages_bulk(zone, pcp->batch, &pcp->list, 0);
    		pcp->count -= pcp->batch;
    	}
    
    	local_irq_restore(flags);
    	put_cpu();
    }
    
    void fastcall free_hot_page(struct page *page)
    {
    	free_hot_cold_page(page, 0);
    }
    	
    void fastcall free_cold_page(struct page *page)
    {
    	free_hot_cold_page(page, 1);
    }
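
/*
 * Usage sketch (illustration only): pages whose contents are likely still
 * in the freeing CPU's cache should go back via free_hot_page(), while
 * pages released from I/O completion or reclaim, whose data is not
 * expected to be cache-resident, are better freed with free_cold_page()
 * so that __GFP_COLD allocations can reuse them.
 */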
    
    
    /*
     * split_page takes a non-compound higher-order page, and splits it into
     * n (1<<order) sub-pages: page[0..n]
     * Each sub-page must be freed individually.
     *
     * Note: this is probably too low level an operation for use in drivers.
     * Please consult with lkml before using this in your driver.
     */
    void split_page(struct page *page, unsigned int order)
    {
    	int i;
    
    	BUG_ON(PageCompound(page));
    	BUG_ON(!page_count(page));
    
    	for (i = 1; i < (1 << order); i++)
		set_page_refcounted(page + i);
}
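
/*
 * Usage sketch (illustration only, not from this file): a caller that
 * needs physically contiguous memory but wants to free it page by page
 * might do:
 *
 *	struct page *page = alloc_pages(GFP_KERNEL, 2);
 *	if (page)
 *		split_page(page, 2);
 *
 * after which each of the four sub-pages has its own reference count and
 * can be released individually with __free_page().
 */
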
    /*
     * Really, prep_compound_page() should be called from __rmqueue_bulk().  But
     * we cheat by calling it from here, in the order > 0 path.  Saves a branch
     * or two.
     */
    
    static struct page *buffered_rmqueue(struct zonelist *zonelist,
    			struct zone *zone, int order, gfp_t gfp_flags)
    
{
	unsigned long flags;
	struct page *page;
	int cold = !!(gfp_flags & __GFP_COLD);
	int cpu;

again:
	cpu  = get_cpu();
    
    	if (likely(order == 0)) {
    
    		struct per_cpu_pages *pcp;
    
    
    		pcp = &zone_pcp(zone, cpu)->pcp[cold];
    
    		local_irq_save(flags);
    
    		if (!pcp->count) {
    
    			pcp->count += rmqueue_bulk(zone, 0,
    						pcp->batch, &pcp->list);
    
    			if (unlikely(!pcp->count))
    				goto failed;
    
    		}
    
    		page = list_entry(pcp->list.next, struct page, lru);
    		list_del(&page->lru);
    		pcp->count--;
    
    	} else {
    
    		spin_lock_irqsave(&zone->lock, flags);
    		page = __rmqueue(zone, order);
    
    		spin_unlock(&zone->lock);
    		if (!page)
			goto failed;
	}

    	__count_zone_vm_events(PGALLOC, zone, 1 << order);
    
    	zone_statistics(zonelist, zone);
    
    	local_irq_restore(flags);
    	put_cpu();
    
    
    
    	BUG_ON(bad_range(zone, page));
    
    	if (prep_new_page(page, order, gfp_flags))
    
    		goto again;
    
    	return page;
    
    
    failed:
    	local_irq_restore(flags);
    	put_cpu();
	return NULL;
}

    #define ALLOC_NO_WATERMARKS	0x01 /* don't check watermarks at all */
    
    #define ALLOC_WMARK_MIN		0x02 /* use pages_min watermark */
    #define ALLOC_WMARK_LOW		0x04 /* use pages_low watermark */
    #define ALLOC_WMARK_HIGH	0x08 /* use pages_high watermark */
    #define ALLOC_HARDER		0x10 /* try to alloc harder */
    #define ALLOC_HIGH		0x20 /* __GFP_HIGH set */
    #define ALLOC_CPUSET		0x40 /* check for correct cpuset */
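
/*
 * Example (illustration only): a GFP_ATOMIC caller ends up with
 * ALLOC_WMARK_MIN | ALLOC_HARDER | ALLOC_HIGH (it cannot wait and has
 * __GFP_HIGH set), whereas a GFP_KERNEL caller first tries
 * ALLOC_WMARK_LOW | ALLOC_CPUSET and then falls back to
 * ALLOC_WMARK_MIN | ALLOC_CPUSET before entering direct reclaim.
 */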
    
    /*
     * Return 1 if free pages are above 'mark'. This takes into account the order
     * of the allocation.
     */
    int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
    
    		      int classzone_idx, int alloc_flags)
    
    {
	/* free_pages may go negative - that's OK */
    	long min = mark, free_pages = z->free_pages - (1 << order) + 1;
    	int o;
    
    
    	if (alloc_flags & ALLOC_HIGH)
    
    		min -= min / 2;
    
    	if (alloc_flags & ALLOC_HARDER)
    
    		min -= min / 4;
    
    	if (free_pages <= min + z->lowmem_reserve[classzone_idx])
    		return 0;
    	for (o = 0; o < order; o++) {
    		/* At the next order, this order's pages become unavailable */
    		free_pages -= z->free_area[o].nr_free << o;
    
    		/* Require fewer higher order pages to be free */
    		min >>= 1;
    
    		if (free_pages <= min)
    			return 0;
    	}
    	return 1;
    }
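
/*
 * Worked example (illustration only): with mark = 128 pages and an order-2
 * request, the zone needs more than 128 free pages in total, more than 64
 * of them in blocks of order >= 1, and more than 32 of them in blocks of
 * order >= 2 (ignoring lowmem_reserve and the ALLOC_* adjustments above).
 */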
    
    
    /*
 * get_page_from_freelist goes through the zonelist trying to allocate
     * a page.
     */
    static struct page *
    get_page_from_freelist(gfp_t gfp_mask, unsigned int order,
		struct zonelist *zonelist, int alloc_flags)
{
    	struct zone **z = zonelist->zones;
    	struct page *page = NULL;
    	int classzone_idx = zone_idx(*z);
    
    	/*
    	 * Go through the zonelist once, looking for a zone with enough free.
    	 * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
    	 */
    	do {
    		if ((alloc_flags & ALLOC_CPUSET) &&
    				!cpuset_zone_allowed(*z, gfp_mask))
    			continue;
    
    		if (!(alloc_flags & ALLOC_NO_WATERMARKS)) {
    
    			unsigned long mark;
    			if (alloc_flags & ALLOC_WMARK_MIN)
    				mark = (*z)->pages_min;
    			else if (alloc_flags & ALLOC_WMARK_LOW)
    				mark = (*z)->pages_low;
    			else
    				mark = (*z)->pages_high;
    			if (!zone_watermark_ok(*z, order, mark,
    
    				    classzone_idx, alloc_flags))
    
    				if (!zone_reclaim_mode ||
    				    !zone_reclaim(*z, gfp_mask, order))
					continue;
		}

    		page = buffered_rmqueue(zonelist, *z, order, gfp_mask);
    
    		if (page) {
    			break;
    		}
    	} while (*(++z) != NULL);
	return page;
}

    /*
     * This is the 'heart' of the zoned buddy allocator.
     */
    struct page * fastcall
    
    __alloc_pages(gfp_t gfp_mask, unsigned int order,
    
    		struct zonelist *zonelist)
    {
    
    	const gfp_t wait = gfp_mask & __GFP_WAIT;
    
    	struct zone **z;
    
    	struct page *page;
    	struct reclaim_state reclaim_state;
    	struct task_struct *p = current;
    	int do_retry;
    
    	int alloc_flags;
    
    	int did_some_progress;
    
    	might_sleep_if(wait);
    
    
restart:
	z = zonelist->zones;  /* the list of zones suitable for gfp_mask */
    
    
    	if (unlikely(*z == NULL)) {
    
    		/* Should this ever happen?? */
    		return NULL;
    	}
    
    	page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, order,
    
    				zonelist, ALLOC_WMARK_LOW|ALLOC_CPUSET);
    
    	if (page)
    		goto got_pg;
    
	do {
		wakeup_kswapd(*z, order);
	} while (*(++z));

	/*
    	 * OK, we're below the kswapd watermark and have kicked background
    	 * reclaim. Now things get more complex, so set up alloc_flags according
    	 * to how we want to proceed.
    	 *
    	 * The caller may dip into page reserves a bit more if the caller
    	 * cannot run direct reclaim, or if the caller has realtime scheduling
    
    	 * policy or is asking for __GFP_HIGH memory.  GFP_ATOMIC requests will
	 * set both ALLOC_HARDER (!wait) and ALLOC_HIGH (__GFP_HIGH).
	 */
	alloc_flags = ALLOC_WMARK_MIN;
    
    	if ((unlikely(rt_task(p)) && !in_interrupt()) || !wait)
    		alloc_flags |= ALLOC_HARDER;
    	if (gfp_mask & __GFP_HIGH)
    		alloc_flags |= ALLOC_HIGH;
    
    	if (wait)
    		alloc_flags |= ALLOC_CPUSET;
    
    
    	/*
    	 * Go through the zonelist again. Let __GFP_HIGH and allocations
    
    	 * coming from realtime tasks go deeper into reserves.
    
    	 *
    	 * This is the last chance, in general, before the goto nopage.
    	 * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc.
    
    	 * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
    
    	 */
    
    	page = get_page_from_freelist(gfp_mask, order, zonelist, alloc_flags);
    	if (page)
    		goto got_pg;
    
    
    	/* This allocation should allow future memory freeing. */
    
    
    	if (((p->flags & PF_MEMALLOC) || unlikely(test_thread_flag(TIF_MEMDIE)))
    			&& !in_interrupt()) {
    		if (!(gfp_mask & __GFP_NOMEMALLOC)) {
    
    nofail_alloc:
    
    			/* go through the zonelist yet again, ignoring mins */
    
    			page = get_page_from_freelist(gfp_mask, order,
    
    				zonelist, ALLOC_NO_WATERMARKS);
    
    			if (page)
    				goto got_pg;
    
    			if (gfp_mask & __GFP_NOFAIL) {
    				blk_congestion_wait(WRITE, HZ/50);
    				goto nofail_alloc;
    			}
    
    		}
    		goto nopage;
    	}
    
    	/* Atomic allocations - we can't balance anything */
    	if (!wait)
    		goto nopage;
    
    rebalance:
    	cond_resched();
    
    	/* We now go into synchronous reclaim */
    
    	cpuset_memory_pressure_bump();
    
    	p->flags |= PF_MEMALLOC;
    	reclaim_state.reclaimed_slab = 0;