    	while (nr_pageblocks--) {
    		set_pageblock_migratetype(pageblock_page, migratetype);
    		pageblock_page += pageblock_nr_pages;
    	}
    }
    
    
/* Remove an element from the buddy allocator from the fallback list */
static inline struct page *
__rmqueue_fallback(struct zone *zone, int order, int start_migratetype)
{
    	struct free_area * area;
    	int current_order;
    	struct page *page;
    	int migratetype, i;
    
	/* Find the largest possible block of pages in the other list */
	for (current_order = MAX_ORDER-1; current_order >= order;
						--current_order) {
		for (i = 0;; i++) {
			migratetype = fallbacks[start_migratetype][i];

    
			/* MIGRATE_RESERVE handled later if necessary */
			if (migratetype == MIGRATE_RESERVE)
				break;

			area = &(zone->free_area[current_order]);
    			if (list_empty(&area->free_list[migratetype]))
    				continue;
    
    			page = list_entry(area->free_list[migratetype].next,
    					struct page, lru);
    			area->nr_free--;
    
			/*
			 * If breaking a large block of pages, move all free
			 * pages to the preferred allocation list. If falling
			 * back for a reclaimable kernel allocation, be more
			 * aggressive about taking ownership of free pages.
			 *
			 * On the other hand, never change the migration
			 * type of MIGRATE_CMA pageblocks nor move CMA
			 * pages to different free lists. We don't
			 * want unmovable pages to be allocated from
			 * MIGRATE_CMA areas.
			 */
    			if (!is_migrate_cma(migratetype) &&
    			    (unlikely(current_order >= pageblock_order / 2) ||
    			     start_migratetype == MIGRATE_RECLAIMABLE ||
    			     page_group_by_mobility_disabled)) {
    				int pages;
    
    				pages = move_freepages_block(zone, page,
    								start_migratetype);
    
    				/* Claim the whole block if over half of it is free */
    
    				if (pages >= (1 << (pageblock_order-1)) ||
    						page_group_by_mobility_disabled)
    
    					set_pageblock_migratetype(page,
    								start_migratetype);
    
    
				migratetype = start_migratetype;
			}

    			/* Remove the page from the freelists */
    			list_del(&page->lru);
    			rmv_page_order(page);
    
    
    			/* Take ownership for orders >= pageblock_order */
    
			if (current_order >= pageblock_order &&
			    !is_migrate_cma(migratetype))
				change_pageblock_range(page, current_order,
							start_migratetype);

			expand(zone, page, order, current_order, area,
    			       is_migrate_cma(migratetype)
    			     ? migratetype : start_migratetype);
    
    
			trace_mm_page_alloc_extfrag(page, order, current_order,
				start_migratetype, migratetype);

			return page;
		}
	}

	return NULL;
}
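
/*
 * Illustrative sketch (editorial addition, not part of page_alloc.c): a
 * userspace mock of the fallback walk above. __rmqueue_fallback() keeps
 * trying migratetypes from a per-type fallback table until it either finds
 * free pages or hits the MIGRATE_RESERVE sentinel. The enum values and table
 * contents below are simplified stand-ins, not the kernel's "fallbacks" array.
 */
#if 0	/* example only, never compiled into the kernel */
#include <stdio.h>

enum mock_migratetype { UNMOVABLE, RECLAIMABLE, MOVABLE, RESERVE, NR_TYPES };

static const int mock_fallbacks[NR_TYPES][NR_TYPES] = {
	[UNMOVABLE]   = { RECLAIMABLE, MOVABLE, RESERVE },
	[RECLAIMABLE] = { UNMOVABLE, MOVABLE, RESERVE },
	[MOVABLE]     = { RECLAIMABLE, UNMOVABLE, RESERVE },
	[RESERVE]     = { RESERVE },	/* never used as a starting type */
};

/* Return the first fallback type with free pages, or -1 if only RESERVE is left. */
static int pick_fallback(int start, const int free_count[NR_TYPES])
{
	for (int i = 0; ; i++) {
		int mt = mock_fallbacks[start][i];

		if (mt == RESERVE)	/* handled later by the caller */
			return -1;
		if (free_count[mt] > 0)
			return mt;
	}
}

int main(void)
{
	int free_count[NR_TYPES] = { [UNMOVABLE] = 0, [RECLAIMABLE] = 0, [MOVABLE] = 7 };

	printf("fallback for UNMOVABLE -> %d (expect MOVABLE=%d)\n",
	       pick_fallback(UNMOVABLE, free_count), MOVABLE);
	return 0;
}
#endif
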
/*
 * Do the hard work of removing an element from the buddy allocator.
     * Call me with the zone->lock already held.
     */
    
    static struct page *__rmqueue(struct zone *zone, unsigned int order,
						int migratetype)
{
    	struct page *page;
    
    
retry_reserve:
	page = __rmqueue_smallest(zone, order, migratetype);
    
    	if (unlikely(!page) && migratetype != MIGRATE_RESERVE) {
    
    		page = __rmqueue_fallback(zone, order, migratetype);
    
    		/*
    		 * Use MIGRATE_RESERVE rather than fail an allocation. goto
    		 * is used because __rmqueue_smallest is an inline function
    		 * and we want just one call site
    		 */
    		if (!page) {
    			migratetype = MIGRATE_RESERVE;
    			goto retry_reserve;
    		}
    	}
    
    
	trace_mm_page_alloc_zone_locked(page, order, migratetype);
	return page;
}

/*
 * Obtain a specified number of elements from the buddy allocator, all under
 * a single hold of the lock, for efficiency.  Add them to the supplied list.
 * Returns the number of new pages which were placed at *list.
 */
static int rmqueue_bulk(struct zone *zone, unsigned int order,
			unsigned long count, struct list_head *list,
			int migratetype, int cold)
{
    	int mt = migratetype, i;
    
	spin_lock(&zone->lock);
	for (i = 0; i < count; ++i) {
    
    		struct page *page = __rmqueue(zone, order, migratetype);
    
		if (unlikely(page == NULL))
			break;
    
    
    		/*
    		 * Split buddy pages returned by expand() are received here
		 * in physical page order. The page is added to the caller's
		 * list and the list head then moves forward. From the caller's
		 * perspective, the linked list is ordered by page number in
    		 * some conditions. This is useful for IO devices that can
    		 * merge IO requests if the physical pages are ordered
    		 * properly.
    		 */
    
    		if (likely(cold == 0))
    			list_add(&page->lru, list);
    		else
    			list_add_tail(&page->lru, list);
    
		if (IS_ENABLED(CONFIG_CMA)) {
			mt = get_pageblock_migratetype(page);
			if (!is_migrate_cma(mt) && !is_migrate_isolate(mt))
				mt = migratetype;
		}
		set_freepage_migratetype(page, mt);
		list = &page->lru;
    
    		if (is_migrate_cma(mt))
    			__mod_zone_page_state(zone, NR_FREE_CMA_PAGES,
    					      -(1 << order));
    
    	}
    
    	__mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order));
    
    	spin_unlock(&zone->lock);
    
	return i;
}

/*
 * Called from the vmstat counter updater to drain pagesets of this
 * currently executing processor on remote nodes after they have
 * expired.
 *
 * Note that this function must be called with the thread pinned to
 * a single processor.
 */
void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
{
	unsigned long flags;
	int to_drain;
	unsigned long batch;

	local_irq_save(flags);
    
    	batch = ACCESS_ONCE(pcp->batch);
    	if (pcp->count >= batch)
		to_drain = batch;
	else
		to_drain = pcp->count;
    
    	if (to_drain > 0) {
    		free_pcppages_bulk(zone, to_drain, pcp);
    		pcp->count -= to_drain;
    	}
    
	local_irq_restore(flags);
}

/*
     * Drain pages of the indicated processor.
     *
     * The processor must either be the current processor and the
     * thread pinned to the current processor or a processor that
     * is not online.
     */
static void drain_pages(unsigned int cpu)
{
	unsigned long flags;
	struct zone *zone;
    
    
	for_each_populated_zone(zone) {
		struct per_cpu_pageset *pset;
		struct per_cpu_pages *pcp;

		local_irq_save(flags);
		pset = per_cpu_ptr(zone->pageset, cpu);

		pcp = &pset->pcp;
		if (pcp->count) {
    			free_pcppages_bulk(zone, pcp->count, pcp);
    			pcp->count = 0;
    		}
    
		local_irq_restore(flags);
	}
}

/*
     * Spill all of this CPU's per-cpu pages back into the buddy allocator.
     */
    void drain_local_pages(void *arg)
    {
    	drain_pages(smp_processor_id());
    }
    
    /*
    
     * Spill all the per-cpu pages from all CPUs back into the buddy allocator.
     *
     * Note that this code is protected against sending an IPI to an offline
     * CPU but does not guarantee sending an IPI to newly hotplugged CPUs:
     * on_each_cpu_mask() blocks hotplug and won't talk to offlined CPUs but
     * nothing keeps CPUs from showing up after we populated the cpumask and
     * before the call to on_each_cpu_mask().
    
     */
    void drain_all_pages(void)
    {
    
    	int cpu;
    	struct per_cpu_pageset *pcp;
    	struct zone *zone;
    
    	/*
    	 * Allocate in the BSS so we wont require allocation in
    	 * direct reclaim path for CONFIG_CPUMASK_OFFSTACK=y
    	 */
    	static cpumask_t cpus_with_pcps;
    
    	/*
    	 * We don't care about racing with CPU hotplug event
    	 * as offline notification will cause the notified
    	 * cpu to drain that CPU pcps and on_each_cpu_mask
    	 * disables preemption as part of its processing
    	 */
    	for_each_online_cpu(cpu) {
    		bool has_pcps = false;
    		for_each_populated_zone(zone) {
    			pcp = per_cpu_ptr(zone->pageset, cpu);
    			if (pcp->pcp.count) {
    				has_pcps = true;
    				break;
    			}
    		}
    		if (has_pcps)
    			cpumask_set_cpu(cpu, &cpus_with_pcps);
    		else
    			cpumask_clear_cpu(cpu, &cpus_with_pcps);
    	}
	on_each_cpu_mask(&cpus_with_pcps, drain_local_pages, NULL, 1);
}
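
/*
 * Illustrative sketch (editorial addition, not part of page_alloc.c):
 * drain_all_pages() above first computes the set of CPUs that actually have
 * per-cpu pages queued and only then sends work to that set, instead of
 * interrupting every online CPU. The userspace analogue below uses a plain
 * bitmask over hypothetical workers; all names are made up for the example.
 */
#if 0	/* example only, never compiled into the kernel */
#include <stdio.h>

#define NR_WORKERS 8

static int pending[NR_WORKERS] = { 0, 3, 0, 0, 7, 0, 1, 0 };

static void drain_worker(int w)
{
	printf("draining worker %d (%d items)\n", w, pending[w]);
	pending[w] = 0;
}

int main(void)
{
	unsigned int mask = 0;

	/* Pass 1: record which workers have queued items. */
	for (int w = 0; w < NR_WORKERS; w++)
		if (pending[w])
			mask |= 1u << w;

	/* Pass 2: only notify the workers that actually need draining. */
	for (int w = 0; w < NR_WORKERS; w++)
		if (mask & (1u << w))
			drain_worker(w);

	return 0;
}
#endif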
    
#ifdef CONFIG_HIBERNATION

void mark_free_pages(struct zone *zone)
    {
    
	unsigned long pfn, max_zone_pfn;
	unsigned long flags;
	unsigned int order, t;
	struct list_head *curr;
    
    	if (!zone->spanned_pages)
    		return;
    
    	spin_lock_irqsave(&zone->lock, flags);
    
    	max_zone_pfn = zone_end_pfn(zone);
    
    	for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
    		if (pfn_valid(pfn)) {
    			struct page *page = pfn_to_page(pfn);
    
    
			if (!swsusp_page_is_forbidden(page))
				swsusp_unset_page_free(page);
		}

	for_each_migratetype_order(order, t) {
    		list_for_each(curr, &zone->free_area[order].free_list[t]) {
    
			unsigned long i;

			pfn = page_to_pfn(list_entry(curr, struct page, lru));
			for (i = 0; i < (1UL << order); i++)
				swsusp_set_page_free(pfn_to_page(pfn + i));
		}
	}
	spin_unlock_irqrestore(&zone->lock, flags);
    }
    
    #endif /* CONFIG_PM */
    
/*
 * Free a 0-order page
 * cold == 1 ? free a cold page : free a hot page
 */
void free_hot_cold_page(struct page *page, int cold)
{
    	struct zone *zone = page_zone(page);
    	struct per_cpu_pages *pcp;
	unsigned long flags;
	int migratetype;

	if (!free_pages_prepare(page, 0))
		return;

	migratetype = get_pageblock_migratetype(page);
	set_freepage_migratetype(page, migratetype);
	local_irq_save(flags);
    
    	__count_vm_event(PGFREE);
    
    	/*
    	 * We only track unmovable, reclaimable and movable on pcp lists.
    	 * Free ISOLATE pages back to the allocator because they are being
    	 * offlined but treat RESERVE as movable pages so we can get those
    	 * areas back if necessary. Otherwise, we may have to free
    	 * excessively into the page allocator
    	 */
    	if (migratetype >= MIGRATE_PCPTYPES) {
    
    		if (unlikely(is_migrate_isolate(migratetype))) {
    
    			free_one_page(zone, page, 0, migratetype);
    			goto out;
    		}
    		migratetype = MIGRATE_MOVABLE;
    	}
    
    
	pcp = &this_cpu_ptr(zone->pageset)->pcp;
	if (cold)
		list_add_tail(&page->lru, &pcp->lists[migratetype]);
	else
		list_add(&page->lru, &pcp->lists[migratetype]);
	pcp->count++;
	if (pcp->count >= pcp->high) {
		unsigned long batch = ACCESS_ONCE(pcp->batch);
		free_pcppages_bulk(zone, batch, pcp);
		pcp->count -= batch;
	}

out:
	local_irq_restore(flags);
    }
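
/*
 * Illustrative sketch (editorial addition, not part of page_alloc.c): the
 * per-cpu list policy used above. Hot frees go to the head, cold frees to
 * the tail, and once the list grows past "high" a whole "batch" is trimmed
 * back to the global pool at once. The structure and numbers below are made
 * up for the example; only the policy mirrors free_hot_cold_page().
 */
#if 0	/* example only, never compiled into the kernel */
#include <stdio.h>

#define PCP_CAP 64

struct mock_pcp {
	int pages[PCP_CAP];	/* pages[0] is the "hot" head, pages[count-1] the cold tail */
	int count;
	int high;		/* trim threshold */
	int batch;		/* how many pages to hand back to the global pool at once */
};

static void mock_free_page(struct mock_pcp *pcp, int page, int cold)
{
	if (cold) {
		pcp->pages[pcp->count++] = page;		/* tail: reused last */
	} else {
		for (int i = pcp->count; i > 0; i--)		/* head: reused first */
			pcp->pages[i] = pcp->pages[i - 1];
		pcp->pages[0] = page;
		pcp->count++;
	}

	if (pcp->count >= pcp->high) {
		printf("trimming %d pages back to the global pool\n", pcp->batch);
		pcp->count -= pcp->batch;			/* drop the coldest entries (simplified) */
	}
}

int main(void)
{
	struct mock_pcp pcp = { .count = 0, .high = 6, .batch = 3 };

	for (int page = 100; page < 108; page++)
		mock_free_page(&pcp, page, page & 1);		/* alternate hot/cold frees */
	printf("pages left on the per-cpu list: %d\n", pcp.count);
	return 0;
}
#endif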
    
    
    /*
     * Free a list of 0-order pages
     */
    void free_hot_cold_page_list(struct list_head *list, int cold)
    {
    	struct page *page, *next;
    
    	list_for_each_entry_safe(page, next, list, lru) {
    
    		trace_mm_page_free_batched(page, cold);
    
    		free_hot_cold_page(page, cold);
    	}
    }
    
    
    /*
     * split_page takes a non-compound higher-order page, and splits it into
     * n (1<<order) sub-pages: page[0..n]
     * Each sub-page must be freed individually.
     *
     * Note: this is probably too low level an operation for use in drivers.
     * Please consult with lkml before using this in your driver.
     */
    void split_page(struct page *page, unsigned int order)
    {
    	int i;
    
    
    	VM_BUG_ON(PageCompound(page));
    	VM_BUG_ON(!page_count(page));
    
    
    #ifdef CONFIG_KMEMCHECK
    	/*
    	 * Split shadow pages too, because free(page[0]) would
    	 * otherwise free the whole shadow.
    	 */
    	if (kmemcheck_page_is_tracked(page))
    		split_page(virt_to_page(page[0].shadow), order);
    #endif
    
    
	for (i = 1; i < (1 << order); i++)
		set_page_refcounted(page + i);
}
EXPORT_SYMBOL_GPL(split_page);
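
/*
 * Illustrative sketch (editorial addition, not part of page_alloc.c): the
 * order arithmetic behind split_page(). An order-N page covers 1 << N
 * contiguous page frames; after the split every frame is an independent
 * order-0 page with its own reference count and must be freed on its own.
 * The frame numbers below are invented for the example.
 */
#if 0	/* example only, never compiled into the kernel */
#include <stdio.h>

int main(void)
{
	unsigned int order = 3;
	unsigned long base_pfn = 4096;		/* hypothetical first page frame number */
	unsigned long nr = 1UL << order;	/* 8 sub-pages for order 3 */

	printf("splitting an order-%u page into %lu order-0 pages:\n", order, nr);
	for (unsigned long i = 0; i < nr; i++)
		printf("  sub-page %lu -> pfn %lu\n", i, base_pfn + i);
	return 0;
}
#endif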
    
static int __isolate_free_page(struct page *page, unsigned int order)
{
	unsigned long watermark;
	struct zone *zone;
	int mt;

	BUG_ON(!PageBuddy(page));

	zone = page_zone(page);
	mt = get_pageblock_migratetype(page);

	if (!is_migrate_isolate(mt)) {
    
    		/* Obey watermarks as if the page was being allocated */
    		watermark = low_wmark_pages(zone) + (1 << order);
    		if (!zone_watermark_ok(zone, 0, watermark, 0, 0))
    			return 0;
    
    
		__mod_zone_freepage_state(zone, -(1UL << order), mt);
	}

	/* Remove page from free list */
    	list_del(&page->lru);
    	zone->free_area[order].nr_free--;
    	rmv_page_order(page);
    
    	/* Set the pageblock if the isolated page is at least a pageblock */
    
    	if (order >= pageblock_order - 1) {
    		struct page *endpage = page + (1 << order) - 1;
    
    		for (; page < endpage; page += pageblock_nr_pages) {
    			int mt = get_pageblock_migratetype(page);
    
    			if (!is_migrate_isolate(mt) && !is_migrate_cma(mt))
    
    				set_pageblock_migratetype(page,
    							  MIGRATE_MOVABLE);
		}
	}

	return 1UL << order;
}
    
    /*
     * Similar to split_page except the page is already free. As this is only
     * being used for migration, the migratetype of the block also changes.
     * As this is called with interrupts disabled, the caller is responsible
     * for calling arch_alloc_page() and kernel_map_page() after interrupts
     * are enabled.
     *
     * Note: this is probably too low level an operation for use in drivers.
     * Please consult with lkml before using this in your driver.
     */
    int split_free_page(struct page *page)
    {
    	unsigned int order;
    	int nr_pages;
    
    	order = page_order(page);
    
    
    	nr_pages = __isolate_free_page(page, order);
    
    	if (!nr_pages)
    		return 0;
    
    	/* Split into individual pages */
    	set_page_refcounted(page);
    	split_page(page, order);
	return nr_pages;
}

    /*
     * Really, prep_compound_page() should be called from __rmqueue_bulk().  But
     * we cheat by calling it from here, in the order > 0 path.  Saves a branch
     * or two.
     */
    
    static inline
struct page *buffered_rmqueue(struct zone *preferred_zone,
			struct zone *zone, int order, gfp_t gfp_flags,
			int migratetype)
{
	unsigned long flags;
	struct page *page;
	int cold = !!(gfp_flags & __GFP_COLD);

again:
	if (likely(order == 0)) {
		struct per_cpu_pages *pcp;
		struct list_head *list;

		local_irq_save(flags);
		pcp = &this_cpu_ptr(zone->pageset)->pcp;
		list = &pcp->lists[migratetype];
		if (list_empty(list)) {
			pcp->count += rmqueue_bulk(zone, 0,
					pcp->batch, list,
					migratetype, cold);
			if (unlikely(list_empty(list)))
				goto failed;
		}

		if (cold)
    			page = list_entry(list->prev, struct page, lru);
    		else
    			page = list_entry(list->next, struct page, lru);
    
    
    		list_del(&page->lru);
    		pcp->count--;
    
    	} else {
    
    		if (unlikely(gfp_flags & __GFP_NOFAIL)) {
    			/*
    			 * __GFP_NOFAIL is not to be used in new code.
    			 *
    			 * All __GFP_NOFAIL callers should be fixed so that they
    			 * properly detect and handle allocation failures.
    			 *
			 * We most definitely don't want callers attempting to
			 * allocate greater than order-1 page units with
			 * __GFP_NOFAIL.
			 */
			WARN_ON_ONCE(order > 1);
		}
		spin_lock_irqsave(&zone->lock, flags);
    
    		page = __rmqueue(zone, order, migratetype);
    
    		spin_unlock(&zone->lock);
    		if (!page)
    			goto failed;
    
		__mod_zone_freepage_state(zone, -(1 << order),
					  get_pageblock_migratetype(page));
	}

	__count_zone_vm_events(PGALLOC, zone, 1 << order);
    
    	zone_statistics(preferred_zone, zone, gfp_flags);
    
	local_irq_restore(flags);

	VM_BUG_ON(bad_range(zone, page));
	if (prep_new_page(page, order, gfp_flags))
		goto again;
	return page;

failed:
	local_irq_restore(flags);
	return NULL;
}
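
/*
 * Illustrative sketch (editorial addition, not part of page_alloc.c): the
 * order-0 fast path above. When the per-cpu list is empty it is refilled
 * with a whole batch under a single zone->lock hold, and later allocations
 * are served from that list without the lock (head for hot requests, tail
 * for cold ones). The types and numbers below are invented for the example
 * and only mirror the refill-then-pop structure.
 */
#if 0	/* example only, never compiled into the kernel */
#include <stdio.h>

#define CAP 32

struct mock_list {
	int pages[CAP];
	int count;
};

static int next_global_page = 1000;	/* stand-in for the zone's free lists */

/* Stand-in for rmqueue_bulk(): grab "batch" pages under one (imaginary) lock hold. */
static int mock_rmqueue_bulk(struct mock_list *list, int batch)
{
	for (int i = 0; i < batch; i++)
		list->pages[list->count++] = next_global_page++;
	return batch;
}

static int mock_alloc_order0(struct mock_list *list, int cold, int batch)
{
	if (list->count == 0 && mock_rmqueue_bulk(list, batch) == 0)
		return -1;				/* allocation failed */
	if (cold)
		return list->pages[--list->count];	/* take from the tail */

	int page = list->pages[0];			/* take from the head */
	for (int i = 1; i < list->count; i++)
		list->pages[i - 1] = list->pages[i];
	list->count--;
	return page;
}

int main(void)
{
	struct mock_list pcp = { .count = 0 };

	printf("hot alloc:  %d\n", mock_alloc_order0(&pcp, 0, 8));	/* refills, then pops head */
	printf("cold alloc: %d\n", mock_alloc_order0(&pcp, 1, 8));	/* pops tail, no refill */
	printf("left on per-cpu list: %d\n", pcp.count);
	return 0;
}
#endif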
    
    #ifdef CONFIG_FAIL_PAGE_ALLOC
    
    
static struct {
	struct fault_attr attr;

	u32 ignore_gfp_highmem;
	u32 ignore_gfp_wait;
	u32 min_order;
} fail_page_alloc = {
	.attr = FAULT_ATTR_INITIALIZER,
	.ignore_gfp_wait = 1,
	.ignore_gfp_highmem = 1,
	.min_order = 1,
};
    
    static int __init setup_fail_page_alloc(char *str)
    {
    	return setup_fault_attr(&fail_page_alloc.attr, str);
    }
    __setup("fail_page_alloc=", setup_fail_page_alloc);
    
    
static bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
{
	if (order < fail_page_alloc.min_order)
		return false;
	if (gfp_mask & __GFP_NOFAIL)
		return false;
	if (fail_page_alloc.ignore_gfp_highmem && (gfp_mask & __GFP_HIGHMEM))
		return false;
	if (fail_page_alloc.ignore_gfp_wait && (gfp_mask & __GFP_WAIT))
		return false;

	return should_fail(&fail_page_alloc.attr, 1 << order);
    }
    
    #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
    
    static int __init fail_page_alloc_debugfs(void)
    {
    
	umode_t mode = S_IFREG | S_IRUSR | S_IWUSR;
	struct dentry *dir;

    	dir = fault_create_debugfs_attr("fail_page_alloc", NULL,
    					&fail_page_alloc.attr);
    	if (IS_ERR(dir))
    		return PTR_ERR(dir);
    
    	if (!debugfs_create_bool("ignore-gfp-wait", mode, dir,
    				&fail_page_alloc.ignore_gfp_wait))
    		goto fail;
    	if (!debugfs_create_bool("ignore-gfp-highmem", mode, dir,
    				&fail_page_alloc.ignore_gfp_highmem))
    		goto fail;
    	if (!debugfs_create_u32("min-order", mode, dir,
    				&fail_page_alloc.min_order))
    		goto fail;
    
    	return 0;
fail:
	debugfs_remove_recursive(dir);

	return -ENOMEM;
    }
    
    late_initcall(fail_page_alloc_debugfs);
    
    #endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */
    
    #else /* CONFIG_FAIL_PAGE_ALLOC */
    
    
static inline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
{
	return false;
    }
    
    #endif /* CONFIG_FAIL_PAGE_ALLOC */
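
/*
 * Illustrative sketch (editorial addition, not part of page_alloc.c): the
 * gating logic of should_fail_alloc_page() above. Requests below a minimum
 * order or explicitly marked "must not fail" are exempt; everything else is
 * handed to a fault-injection predicate, mocked here as "fail every Nth
 * call". The flag bit and interval are invented for the example.
 */
#if 0	/* example only, never compiled into the kernel */
#include <stdio.h>
#include <stdbool.h>

#define MOCK_GFP_NOFAIL	0x1u

static unsigned int calls, interval = 4, min_order = 1;

static bool mock_should_fail(void)
{
	return ++calls % interval == 0;		/* fail every 4th eligible request */
}

static bool mock_should_fail_alloc(unsigned int gfp, unsigned int order)
{
	if (order < min_order)
		return false;			/* too small to bother failing */
	if (gfp & MOCK_GFP_NOFAIL)
		return false;			/* caller cannot handle failure */
	return mock_should_fail();
}

int main(void)
{
	for (int i = 0; i < 8; i++)
		printf("order-1 alloc %d: %s\n", i,
		       mock_should_fail_alloc(0, 1) ? "injected failure" : "ok");
	return 0;
}
#endif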
    
    
/*
 * Return true if free pages are above 'mark'. This takes into account the order
 * of the allocation.
 */
    
    static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark,
		      int classzone_idx, int alloc_flags, long free_pages)
{
	/* free_pages may go negative - that's OK */
	long min = mark;
	long lowmem_reserve = z->lowmem_reserve[classzone_idx];
	int o;
	long free_cma = 0;
    
    	free_pages -= (1 << order) - 1;
    
	if (alloc_flags & ALLOC_HIGH)
		min -= min / 2;
	if (alloc_flags & ALLOC_HARDER)
		min -= min / 4;
    
    #ifdef CONFIG_CMA
    	/* If allocation can't use CMA areas don't use free CMA pages */
	if (!(alloc_flags & ALLOC_CMA))
		free_cma = zone_page_state(z, NR_FREE_CMA_PAGES);
#endif

	if (free_pages - free_cma <= min + lowmem_reserve)
		return false;
	for (o = 0; o < order; o++) {
    		/* At the next order, this order's pages become unavailable */
    		free_pages -= z->free_area[o].nr_free << o;
    
    		/* Require fewer higher order pages to be free */
    		min >>= 1;
    
		if (free_pages <= min)
			return false;
    	}
    
    	return true;
    }
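
/*
 * Illustrative sketch (editorial addition, not part of page_alloc.c): the
 * arithmetic of __zone_watermark_ok() above with made-up numbers. ALLOC_HIGH
 * halves the required minimum, ALLOC_HARDER shaves off another quarter, and
 * for each order below the requested one that order's pages are discounted
 * while the requirement itself is halved. Zone state is mocked with a plain
 * array of per-order free block counts; the CMA discount is omitted.
 */
#if 0	/* example only, never compiled into the kernel */
#include <stdio.h>
#include <stdbool.h>

#define MOCK_ALLOC_HIGH		0x1
#define MOCK_ALLOC_HARDER	0x2
#define MOCK_MAX_ORDER		11

static bool mock_watermark_ok(int order, long mark, long lowmem_reserve,
			      int alloc_flags, long free_pages,
			      const long nr_free[MOCK_MAX_ORDER])
{
	long min = mark;

	free_pages -= (1 << order) - 1;
	if (alloc_flags & MOCK_ALLOC_HIGH)
		min -= min / 2;
	if (alloc_flags & MOCK_ALLOC_HARDER)
		min -= min / 4;

	if (free_pages <= min + lowmem_reserve)
		return false;
	for (int o = 0; o < order; o++) {
		free_pages -= nr_free[o] << o;	/* pages of lower orders don't help */
		min >>= 1;
		if (free_pages <= min)
			return false;
	}
	return true;
}

int main(void)
{
	/* 300 free order-0 pages, 50 order-1 blocks, 10 order-2 blocks, 2 order-3 blocks */
	long nr_free[MOCK_MAX_ORDER] = { 300, 50, 10, 2 };
	long free_pages = 300 + 50 * 2 + 10 * 4 + 2 * 8;	/* 456 */

	printf("order-2, mark=128, atomic:  %d\n",
	       mock_watermark_ok(2, 128, 0, MOCK_ALLOC_HIGH | MOCK_ALLOC_HARDER,
				 free_pages, nr_free));
	printf("order-3, mark=128, regular: %d\n",
	       mock_watermark_ok(3, 128, 0, 0, free_pages, nr_free));
	return 0;
}
#endif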
    
    bool zone_watermark_ok(struct zone *z, int order, unsigned long mark,
    		      int classzone_idx, int alloc_flags)
    {
    	return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
    					zone_page_state(z, NR_FREE_PAGES));
    }
    
    bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark,
    		      int classzone_idx, int alloc_flags)
    {
    	long free_pages = zone_page_state(z, NR_FREE_PAGES);
    
    	if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark)
    		free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES);
    
    	return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
								free_pages);
}

#ifdef CONFIG_NUMA
    /*
     * zlc_setup - Setup for "zonelist cache".  Uses cached zone data to
     * skip over zones that are not allowed by the cpuset, or that have
     * been recently (in last second) found to be nearly full.  See further
     * comments in mmzone.h.  Reduces cache footprint of zonelist scans
    
     * that have to skip over a lot of full or unallowed zones.
    
     *
     * If the zonelist cache is present in the passed in zonelist, then
     * returns a pointer to the allowed node mask (either the current
    
     * tasks mems_allowed, or node_states[N_MEMORY].)
    
     *
     * If the zonelist cache is not available for this zonelist, does
     * nothing and returns NULL.
     *
     * If the fullzones BITMAP in the zonelist cache is stale (more than
     * a second since last zap'd) then we zap it out (clear its bits.)
     *
     * We hold off even calling zlc_setup, until after we've checked the
     * first zone in the zonelist, on the theory that most allocations will
     * be satisfied from that first zone, so best to examine that zone as
     * quickly as we can.
     */
    static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags)
    {
    	struct zonelist_cache *zlc;	/* cached zonelist speedup info */
    	nodemask_t *allowednodes;	/* zonelist_cache approximation */
    
    	zlc = zonelist->zlcache_ptr;
    	if (!zlc)
    		return NULL;
    
    
    	if (time_after(jiffies, zlc->last_full_zap + HZ)) {
    
    		bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST);
    		zlc->last_full_zap = jiffies;
    	}
    
    	allowednodes = !in_interrupt() && (alloc_flags & ALLOC_CPUSET) ?
					&cpuset_current_mems_allowed :
					&node_states[N_MEMORY];
	return allowednodes;
    }
    
    /*
     * Given 'z' scanning a zonelist, run a couple of quick checks to see
     * if it is worth looking at further for free memory:
     *  1) Check that the zone isn't thought to be full (doesn't have its
     *     bit set in the zonelist_cache fullzones BITMAP).
     *  2) Check that the zones node (obtained from the zonelist_cache
     *     z_to_n[] mapping) is allowed in the passed in allowednodes mask.
     * Return true (non-zero) if zone is worth looking at further, or
     * else return false (zero) if it is not.
     *
     * This check -ignores- the distinction between various watermarks,
     * such as GFP_HIGH, GFP_ATOMIC, PF_MEMALLOC, ...  If a zone is
     * found to be full for any variation of these watermarks, it will
     * be considered full for up to one second by all requests, unless
     * we are so low on memory on all allowed nodes that we are forced
     * into the second scan of the zonelist.
     *
     * In the second scan we ignore this zonelist cache and exactly
     * apply the watermarks to all zones, even it is slower to do so.
     * We are low on memory in the second scan, and should leave no stone
     * unturned looking for a free page.
     */
    
static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zoneref *z,
						nodemask_t *allowednodes)
    {
    	struct zonelist_cache *zlc;	/* cached zonelist speedup info */
    	int i;				/* index of *z in zonelist zones */
    	int n;				/* node that zone *z is on */
    
    	zlc = zonelist->zlcache_ptr;
    	if (!zlc)
    		return 1;
    
    
	i = z - zonelist->_zonerefs;
	n = zlc->z_to_n[i];
    
    	/* This zone is worth trying if it is allowed but not full */
    	return node_isset(n, *allowednodes) && !test_bit(i, zlc->fullzones);
    }
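
/*
 * Illustrative sketch (editorial addition, not part of page_alloc.c): the
 * zonelist-cache idea used by zlc_setup()/zlc_zone_worth_trying() above. A
 * small bitmap remembers which zones looked full recently; entries older
 * than one "second" are zapped wholesale, and a zone is worth scanning only
 * if it is allowed and not marked full. Time and zone identities are mocked
 * with plain integers for the example.
 */
#if 0	/* example only, never compiled into the kernel */
#include <stdio.h>
#include <stdbool.h>
#include <string.h>

#define MOCK_NR_ZONES 8

struct mock_zlc {
	unsigned char fullzones[MOCK_NR_ZONES];	/* 1 = recently found full */
	long last_full_zap;			/* mock timestamp of the last zap */
};

static void mock_zlc_setup(struct mock_zlc *zlc, long now)
{
	if (now - zlc->last_full_zap > 1) {	/* stale: forget all "full" marks */
		memset(zlc->fullzones, 0, sizeof(zlc->fullzones));
		zlc->last_full_zap = now;
	}
}

static bool mock_zone_worth_trying(const struct mock_zlc *zlc, int zone,
				   bool node_allowed)
{
	return node_allowed && !zlc->fullzones[zone];
}

int main(void)
{
	struct mock_zlc zlc = { .last_full_zap = 0 };

	zlc.fullzones[2] = 1;			/* zone 2 failed a moment ago */
	mock_zlc_setup(&zlc, 1);		/* not stale yet: keep the mark */
	printf("zone 2 worth trying at t=1: %d\n", mock_zone_worth_trying(&zlc, 2, true));

	mock_zlc_setup(&zlc, 3);		/* stale: marks are zapped */
	printf("zone 2 worth trying at t=3: %d\n", mock_zone_worth_trying(&zlc, 2, true));
	return 0;
}
#endif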
    
    /*
     * Given 'z' scanning a zonelist, set the corresponding bit in
     * zlc->fullzones, so that subsequent attempts to allocate a page
     * from that zone don't waste time re-examining it.
     */
    
static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z)
{
    	struct zonelist_cache *zlc;	/* cached zonelist speedup info */
    	int i;				/* index of *z in zonelist zones */
    
    	zlc = zonelist->zlcache_ptr;
    	if (!zlc)
		return;

	i = z - zonelist->_zonerefs;

	set_bit(i, zlc->fullzones);
}

/*
     * clear all zones full, called after direct reclaim makes progress so that
     * a zone that was recently full is not skipped over for up to a second
     */
    static void zlc_clear_zones_full(struct zonelist *zonelist)
    {
    	struct zonelist_cache *zlc;	/* cached zonelist speedup info */
    
    	zlc = zonelist->zlcache_ptr;
    	if (!zlc)
    		return;
    
    	bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST);
    }
    
    
    static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
    {
    	return node_isset(local_zone->node, zone->zone_pgdat->reclaim_nodes);
    }
    
    static void __paginginit init_zone_allows_reclaim(int nid)
    {
    	int i;
    
	for_each_online_node(i)
		if (node_distance(nid, i) <= RECLAIM_DISTANCE)
			node_set(i, NODE_DATA(nid)->reclaim_nodes);
		else
			zone_reclaim_mode = 1;
}

    #else	/* CONFIG_NUMA */
    
    static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags)
    {
    	return NULL;
    }
    
    
static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zoneref *z,
				nodemask_t *allowednodes)
{
	return 1;
}

static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z)
{
}
    
    static void zlc_clear_zones_full(struct zonelist *zonelist)
    {
    }
    
    
    static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
    {
    	return true;
    }
    
    static inline void init_zone_allows_reclaim(int nid)
    {
    }
    
#endif	/* CONFIG_NUMA */

/*
 * get_page_from_freelist goes through the zonelist trying to allocate
 * a page.
     */
static struct page *
get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order,
		struct zonelist *zonelist, int high_zoneidx, int alloc_flags,
		struct zone *preferred_zone, int migratetype)
{
	struct zoneref *z;
	struct page *page = NULL;
	int classzone_idx;
	struct zone *zone;
    	nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */
    	int zlc_active = 0;		/* set if using zonelist_cache */
    	int did_zlc_setup = 0;		/* just call zlc_setup() one time */
    
	classzone_idx = zone_idx(preferred_zone);
zonelist_scan:
	/*
	 * Scan zonelist, looking for a zone with enough free.
    
    	 * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
    	 */
    
    	for_each_zone_zonelist_nodemask(zone, z, zonelist,
    						high_zoneidx, nodemask) {
    
		if (IS_ENABLED(CONFIG_NUMA) && zlc_active &&
			!zlc_zone_worth_trying(zonelist, z, allowednodes))
				continue;
		if ((alloc_flags & ALLOC_CPUSET) &&
			!cpuset_zone_allowed_softwall(zone, gfp_mask))
				continue;
    		/*
    		 * When allocating a page cache page for writing, we
    		 * want to get it from a zone that is within its dirty
    		 * limit, such that no single zone holds more than its
    		 * proportional share of globally allowed dirty pages.
    		 * The dirty limits take into account the zone's
    		 * lowmem reserves and high watermark so that kswapd
    		 * should be able to balance it without having to
    		 * write pages from its LRU list.
    		 *
    		 * This may look like it could increase pressure on
    		 * lower zones by failing allocations in higher zones
    		 * before they are full.  But the pages that do spill
    		 * over are limited as the lower zones are protected
    		 * by this very same mechanism.  It should not become
    		 * a practical burden to them.
    		 *
    		 * XXX: For now, allow allocations to potentially
    		 * exceed the per-zone dirty limit in the slowpath
    		 * (ALLOC_WMARK_LOW unset) before going into reclaim,
    		 * which is important when on a NUMA setup the allowed
    		 * zones are together not big enough to reach the
    		 * global limit.  The proper fix for these situations
    		 * will require awareness of zones in the
    		 * dirty-throttling and the flusher threads.
    		 */
    		if ((alloc_flags & ALLOC_WMARK_LOW) &&
    		    (gfp_mask & __GFP_WRITE) && !zone_dirty_ok(zone))
    			goto this_zone_full;
    
    		BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK);
    
    		if (!(alloc_flags & ALLOC_NO_WATERMARKS)) {
    
			unsigned long mark;
			int ret;

			mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK];
    
    			if (zone_watermark_ok(zone, order, mark,
    				    classzone_idx, alloc_flags))
    				goto try_this_zone;
    
    
    			if (IS_ENABLED(CONFIG_NUMA) &&
    					!did_zlc_setup && nr_online_nodes > 1) {
    
    				/*
    				 * we do zlc_setup if there are multiple nodes
    				 * and before considering the first zone allowed
    				 * by the cpuset.
    				 */
    				allowednodes = zlc_setup(zonelist, alloc_flags);
    				zlc_active = 1;
    				did_zlc_setup = 1;
    			}
    
    
			if (zone_reclaim_mode == 0 ||
			    !zone_allows_reclaim(preferred_zone, zone))
				goto this_zone_full;

    			/*
    			 * As we may have just activated ZLC, check if the first
    			 * eligible zone has failed zone_reclaim recently.
    			 */
    
    			if (IS_ENABLED(CONFIG_NUMA) && zlc_active &&
    
    				!zlc_zone_worth_trying(zonelist, z, allowednodes))
    				continue;
    
    
    			ret = zone_reclaim(zone, gfp_mask, order);
    			switch (ret) {
    			case ZONE_RECLAIM_NOSCAN:
				/* did not scan */
				continue;
			case ZONE_RECLAIM_FULL:
				/* scanned but unreclaimable */
				continue;
			default:
				/* did we reclaim enough */
				if (zone_watermark_ok(zone, order, mark,
						classzone_idx, alloc_flags))
					goto try_this_zone;
    
    				/*
    				 * Failed to reclaim enough to meet watermark.
    				 * Only mark the zone full if checking the min
    				 * watermark or if we failed to reclaim just
    				 * 1<<order pages or else the page allocator
    				 * fastpath will prematurely mark zones full
    				 * when the watermark is between the low and
    				 * min watermarks.
    				 */
    				if (((alloc_flags & ALLOC_WMARK_MASK) == ALLOC_WMARK_MIN) ||
    				    ret == ZONE_RECLAIM_SOME)
    
					goto this_zone_full;

				continue;
			}
		}

try_this_zone:
		page = buffered_rmqueue(preferred_zone, zone, order,
						gfp_mask, migratetype);
		if (page)
			break;
this_zone_full:
		if (IS_ENABLED(CONFIG_NUMA))
			zlc_mark_zone_full(zonelist, z);
	}

    	if (unlikely(IS_ENABLED(CONFIG_NUMA) && page == NULL && zlc_active)) {
    
    		/* Disable zlc cache for second zonelist scan */
    		zlc_active = 0;
    		goto zonelist_scan;
    	}
    
    
    	if (page)
    		/*
    		 * page->pfmemalloc is set when ALLOC_NO_WATERMARKS was
    		 * necessary to allocate the page. The expectation is
    		 * that the caller is taking steps that will free more
    		 * memory. The caller should avoid the page being used
    		 * for !PFMEMALLOC purposes.
    		 */
    		page->pfmemalloc = !!(alloc_flags & ALLOC_NO_WATERMARKS);
    
    
	return page;
}

/*
     * Large machines with many possible nodes should not always dump per-node
     * meminfo in irq context.
     */
    static inline bool should_suppress_show_mem(void)
    {