Newer
Older

Mel Gorman
committed
while (nr_pageblocks--) {
set_pageblock_migratetype(pageblock_page, migratetype);
pageblock_page += pageblock_nr_pages;
}
}
/* Remove an element from the buddy allocator from the fallback list */
static inline struct page *
__rmqueue_fallback(struct zone *zone, int order, int start_migratetype)
{
struct free_area * area;
int current_order;
struct page *page;
int migratetype, i;
/* Find the largest possible block of pages in the other list */
for (current_order = MAX_ORDER-1; current_order >= order;
--current_order) {
for (i = 0;; i++) {
migratetype = fallbacks[start_migratetype][i];

Mel Gorman
committed
/* MIGRATE_RESERVE handled later if necessary */
if (migratetype == MIGRATE_RESERVE)
area = &(zone->free_area[current_order]);
if (list_empty(&area->free_list[migratetype]))
continue;
page = list_entry(area->free_list[migratetype].next,
struct page, lru);
area->nr_free--;
/*
* If breaking a large block of pages, move all free
* pages to the preferred allocation list. If falling
* back for a reclaimable kernel allocation, be more
* aggressive about taking ownership of free pages
*
* On the other hand, never change migration
* type of MIGRATE_CMA pageblocks nor move CMA
* pages on different free lists. We don't
* want unmovable pages to be allocated from
* MIGRATE_CMA areas.
if (!is_migrate_cma(migratetype) &&
(unlikely(current_order >= pageblock_order / 2) ||
start_migratetype == MIGRATE_RECLAIMABLE ||
page_group_by_mobility_disabled)) {
int pages;
pages = move_freepages_block(zone, page,
start_migratetype);
/* Claim the whole block if over half of it is free */

Mel Gorman
committed
if (pages >= (1 << (pageblock_order-1)) ||
page_group_by_mobility_disabled)
set_pageblock_migratetype(page,
start_migratetype);
migratetype = start_migratetype;
/* Remove the page from the freelists */
list_del(&page->lru);
rmv_page_order(page);

Mel Gorman
committed
/* Take ownership for orders >= pageblock_order */
if (current_order >= pageblock_order &&
!is_migrate_cma(migratetype))

Mel Gorman
committed
change_pageblock_range(page, current_order,
start_migratetype);
expand(zone, page, order, current_order, area,
is_migrate_cma(migratetype)
? migratetype : start_migratetype);

Mel Gorman
committed
trace_mm_page_alloc_extfrag(page, order, current_order,
start_migratetype, migratetype);
return page;
}
}

Mel Gorman
committed
/*
* Do the hard work of removing an element from the buddy allocator.
* Call me with the zone->lock already held.
*/
static struct page *__rmqueue(struct zone *zone, unsigned int order,
int migratetype)

Mel Gorman
committed
page = __rmqueue_smallest(zone, order, migratetype);
if (unlikely(!page) && migratetype != MIGRATE_RESERVE) {

Mel Gorman
committed
page = __rmqueue_fallback(zone, order, migratetype);
/*
* Use MIGRATE_RESERVE rather than fail an allocation. goto
* is used because __rmqueue_smallest is an inline function
* and we want just one call site
*/
if (!page) {
migratetype = MIGRATE_RESERVE;
goto retry_reserve;
}
}

Mel Gorman
committed
trace_mm_page_alloc_zone_locked(page, order, migratetype);
* Obtain a specified number of elements from the buddy allocator, all under
* a single hold of the lock, for efficiency. Add them to the supplied list.
* Returns the number of new pages which were placed at *list.
*/
static int rmqueue_bulk(struct zone *zone, unsigned int order,
unsigned long count, struct list_head *list,
int migratetype, int cold)
int mt = migratetype, i;
struct page *page = __rmqueue(zone, order, migratetype);
/*
* Split buddy pages returned by expand() are received here
* in physical page order. The page is added to the callers and
* list and the list head then moves forward. From the callers
* perspective, the linked list is ordered by page number in
* some conditions. This is useful for IO devices that can
* merge IO requests if the physical pages are ordered
* properly.
*/
if (likely(cold == 0))
list_add(&page->lru, list);
else
list_add_tail(&page->lru, list);
if (IS_ENABLED(CONFIG_CMA)) {
mt = get_pageblock_migratetype(page);
if (!is_migrate_cma(mt) && !is_migrate_isolate(mt))
mt = migratetype;
}
set_freepage_migratetype(page, mt);
if (is_migrate_cma(mt))
__mod_zone_page_state(zone, NR_FREE_CMA_PAGES,
-(1 << order));
__mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order));

Christoph Lameter
committed
/*
* Called from the vmstat counter updater to drain pagesets of this
* currently executing processor on remote nodes after they have
* expired.
*
* Note that this function must be called with the thread pinned to
* a single processor.

Christoph Lameter
committed
*/
void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
{
unsigned long flags;
unsigned long batch;
local_irq_save(flags);
batch = ACCESS_ONCE(pcp->batch);
if (pcp->count >= batch)
to_drain = batch;
else
to_drain = pcp->count;

KOSAKI Motohiro
committed
if (to_drain > 0) {
free_pcppages_bulk(zone, to_drain, pcp);
pcp->count -= to_drain;
}
local_irq_restore(flags);
/*
* Drain pages of the indicated processor.
*
* The processor must either be the current processor and the
* thread pinned to the current processor or a processor that
* is not online.
*/
static void drain_pages(unsigned int cpu)
for_each_populated_zone(zone) {
struct per_cpu_pages *pcp;
local_irq_save(flags);
pset = per_cpu_ptr(zone->pageset, cpu);
pcp = &pset->pcp;

David Rientjes
committed
if (pcp->count) {
free_pcppages_bulk(zone, pcp->count, pcp);
pcp->count = 0;
}
local_irq_restore(flags);
/*
* Spill all of this CPU's per-cpu pages back into the buddy allocator.
*/
void drain_local_pages(void *arg)
{
drain_pages(smp_processor_id());
}
/*
* Spill all the per-cpu pages from all CPUs back into the buddy allocator.
*
* Note that this code is protected against sending an IPI to an offline
* CPU but does not guarantee sending an IPI to newly hotplugged CPUs:
* on_each_cpu_mask() blocks hotplug and won't talk to offlined CPUs but
* nothing keeps CPUs from showing up after we populated the cpumask and
* before the call to on_each_cpu_mask().
*/
void drain_all_pages(void)
{
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
int cpu;
struct per_cpu_pageset *pcp;
struct zone *zone;
/*
* Allocate in the BSS so we wont require allocation in
* direct reclaim path for CONFIG_CPUMASK_OFFSTACK=y
*/
static cpumask_t cpus_with_pcps;
/*
* We don't care about racing with CPU hotplug event
* as offline notification will cause the notified
* cpu to drain that CPU pcps and on_each_cpu_mask
* disables preemption as part of its processing
*/
for_each_online_cpu(cpu) {
bool has_pcps = false;
for_each_populated_zone(zone) {
pcp = per_cpu_ptr(zone->pageset, cpu);
if (pcp->pcp.count) {
has_pcps = true;
break;
}
}
if (has_pcps)
cpumask_set_cpu(cpu, &cpus_with_pcps);
else
cpumask_clear_cpu(cpu, &cpus_with_pcps);
}
on_each_cpu_mask(&cpus_with_pcps, drain_local_pages, NULL, 1);
#ifdef CONFIG_HIBERNATION
unsigned long pfn, max_zone_pfn;
unsigned long flags;
struct list_head *curr;
if (!zone->spanned_pages)
return;
spin_lock_irqsave(&zone->lock, flags);
max_zone_pfn = zone_end_pfn(zone);
for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
if (pfn_valid(pfn)) {
struct page *page = pfn_to_page(pfn);
if (!swsusp_page_is_forbidden(page))
swsusp_unset_page_free(page);
for_each_migratetype_order(order, t) {
list_for_each(curr, &zone->free_area[order].free_list[t]) {
pfn = page_to_pfn(list_entry(curr, struct page, lru));
for (i = 0; i < (1UL << order); i++)
swsusp_set_page_free(pfn_to_page(pfn + i));
#endif /* CONFIG_PM */
void free_hot_cold_page(struct page *page, int cold)
{
struct zone *zone = page_zone(page);
struct per_cpu_pages *pcp;
unsigned long flags;
int migratetype;
migratetype = get_pageblock_migratetype(page);
set_freepage_migratetype(page, migratetype);
/*
* We only track unmovable, reclaimable and movable on pcp lists.
* Free ISOLATE pages back to the allocator because they are being
* offlined but treat RESERVE as movable pages so we can get those
* areas back if necessary. Otherwise, we may have to free
* excessively into the page allocator
*/
if (migratetype >= MIGRATE_PCPTYPES) {
if (unlikely(is_migrate_isolate(migratetype))) {
free_one_page(zone, page, 0, migratetype);
goto out;
}
migratetype = MIGRATE_MOVABLE;
}
pcp = &this_cpu_ptr(zone->pageset)->pcp;
list_add_tail(&page->lru, &pcp->lists[migratetype]);
list_add(&page->lru, &pcp->lists[migratetype]);
unsigned long batch = ACCESS_ONCE(pcp->batch);
free_pcppages_bulk(zone, batch, pcp);
pcp->count -= batch;
out:
/*
* Free a list of 0-order pages
*/
void free_hot_cold_page_list(struct list_head *list, int cold)
{
struct page *page, *next;
list_for_each_entry_safe(page, next, list, lru) {
trace_mm_page_free_batched(page, cold);
free_hot_cold_page(page, cold);
}
}
/*
* split_page takes a non-compound higher-order page, and splits it into
* n (1<<order) sub-pages: page[0..n]
* Each sub-page must be freed individually.
*
* Note: this is probably too low level an operation for use in drivers.
* Please consult with lkml before using this in your driver.
*/
void split_page(struct page *page, unsigned int order)
{
int i;
VM_BUG_ON(PageCompound(page));
VM_BUG_ON(!page_count(page));
#ifdef CONFIG_KMEMCHECK
/*
* Split shadow pages too, because free(page[0]) would
* otherwise free the whole shadow.
*/
if (kmemcheck_page_is_tracked(page))
split_page(virt_to_page(page[0].shadow), order);
#endif
for (i = 1; i < (1 << order); i++)
set_page_refcounted(page + i);
static int __isolate_free_page(struct page *page, unsigned int order)
{
unsigned long watermark;
struct zone *zone;
BUG_ON(!PageBuddy(page));
zone = page_zone(page);

Marek Szyprowski
committed
mt = get_pageblock_migratetype(page);
if (!is_migrate_isolate(mt)) {

Marek Szyprowski
committed
/* Obey watermarks as if the page was being allocated */
watermark = low_wmark_pages(zone) + (1 << order);
if (!zone_watermark_ok(zone, 0, watermark, 0, 0))
return 0;
__mod_zone_freepage_state(zone, -(1UL << order), mt);

Marek Szyprowski
committed
}
/* Remove page from free list */
list_del(&page->lru);
zone->free_area[order].nr_free--;
rmv_page_order(page);
/* Set the pageblock if the isolated page is at least a pageblock */
if (order >= pageblock_order - 1) {
struct page *endpage = page + (1 << order) - 1;
for (; page < endpage; page += pageblock_nr_pages) {
int mt = get_pageblock_migratetype(page);
if (!is_migrate_isolate(mt) && !is_migrate_cma(mt))
set_pageblock_migratetype(page,
MIGRATE_MOVABLE);
}
return 1UL << order;

Mel Gorman
committed
}
/*
* Similar to split_page except the page is already free. As this is only
* being used for migration, the migratetype of the block also changes.
* As this is called with interrupts disabled, the caller is responsible
* for calling arch_alloc_page() and kernel_map_page() after interrupts
* are enabled.
*
* Note: this is probably too low level an operation for use in drivers.
* Please consult with lkml before using this in your driver.
*/
int split_free_page(struct page *page)
{
unsigned int order;
int nr_pages;
order = page_order(page);
nr_pages = __isolate_free_page(page, order);

Mel Gorman
committed
if (!nr_pages)
return 0;
/* Split into individual pages */
set_page_refcounted(page);
split_page(page, order);
return nr_pages;
/*
* Really, prep_compound_page() should be called from __rmqueue_bulk(). But
* we cheat by calling it from here, in the order > 0 path. Saves a branch
* or two.
*/
static inline
struct page *buffered_rmqueue(struct zone *preferred_zone,
struct zone *zone, int order, gfp_t gfp_flags,
int migratetype)
struct list_head *list;
pcp = &this_cpu_ptr(zone->pageset)->pcp;
list = &pcp->lists[migratetype];
if (list_empty(list)) {
pcp->count += rmqueue_bulk(zone, 0,
pcp->batch, list,
migratetype, cold);
if (unlikely(list_empty(list)))
if (cold)
page = list_entry(list->prev, struct page, lru);
else
page = list_entry(list->next, struct page, lru);
list_del(&page->lru);
pcp->count--;
if (unlikely(gfp_flags & __GFP_NOFAIL)) {
/*
* __GFP_NOFAIL is not to be used in new code.
*
* All __GFP_NOFAIL callers should be fixed so that they
* properly detect and handle allocation failures.
*
* We most definitely don't want callers attempting to
* allocate greater than order-1 page units with
* __GFP_NOFAIL.
*/
WARN_ON_ONCE(order > 1);
page = __rmqueue(zone, order, migratetype);
spin_unlock(&zone->lock);
if (!page)
goto failed;
__mod_zone_freepage_state(zone, -(1 << order),
get_pageblock_migratetype(page));
__count_zone_vm_events(PGALLOC, zone, 1 << order);
zone_statistics(preferred_zone, zone, gfp_flags);
if (prep_new_page(page, order, gfp_flags))
failed:
local_irq_restore(flags);
return NULL;
#ifdef CONFIG_FAIL_PAGE_ALLOC
struct fault_attr attr;
u32 ignore_gfp_highmem;
u32 ignore_gfp_wait;
u32 min_order;
} fail_page_alloc = {
.attr = FAULT_ATTR_INITIALIZER,
.ignore_gfp_wait = 1,
.ignore_gfp_highmem = 1,
.min_order = 1,
};
static int __init setup_fail_page_alloc(char *str)
{
return setup_fault_attr(&fail_page_alloc.attr, str);
}
__setup("fail_page_alloc=", setup_fail_page_alloc);
static bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
if (order < fail_page_alloc.min_order)
if (gfp_mask & __GFP_NOFAIL)
if (fail_page_alloc.ignore_gfp_highmem && (gfp_mask & __GFP_HIGHMEM))
if (fail_page_alloc.ignore_gfp_wait && (gfp_mask & __GFP_WAIT))
return should_fail(&fail_page_alloc.attr, 1 << order);
}
#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
static int __init fail_page_alloc_debugfs(void)
{
struct dentry *dir;
dir = fault_create_debugfs_attr("fail_page_alloc", NULL,
&fail_page_alloc.attr);
if (IS_ERR(dir))
return PTR_ERR(dir);
if (!debugfs_create_bool("ignore-gfp-wait", mode, dir,
&fail_page_alloc.ignore_gfp_wait))
goto fail;
if (!debugfs_create_bool("ignore-gfp-highmem", mode, dir,
&fail_page_alloc.ignore_gfp_highmem))
goto fail;
if (!debugfs_create_u32("min-order", mode, dir,
&fail_page_alloc.min_order))
goto fail;
return 0;
fail:
debugfs_remove_recursive(dir);
}
late_initcall(fail_page_alloc_debugfs);
#endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */
#else /* CONFIG_FAIL_PAGE_ALLOC */
static inline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
}
#endif /* CONFIG_FAIL_PAGE_ALLOC */
* Return true if free pages are above 'mark'. This takes into account the order
static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark,
int classzone_idx, int alloc_flags, long free_pages)
long lowmem_reserve = z->lowmem_reserve[classzone_idx];
long free_cma = 0;
free_pages -= (1 << order) - 1;
#ifdef CONFIG_CMA
/* If allocation can't use CMA areas don't use free CMA pages */
if (!(alloc_flags & ALLOC_CMA))
free_cma = zone_page_state(z, NR_FREE_CMA_PAGES);
if (free_pages - free_cma <= min + lowmem_reserve)
return false;
for (o = 0; o < order; o++) {
/* At the next order, this order's pages become unavailable */
free_pages -= z->free_area[o].nr_free << o;
/* Require fewer higher order pages to be free */
min >>= 1;
if (free_pages <= min)
return false;
return true;
}
bool zone_watermark_ok(struct zone *z, int order, unsigned long mark,
int classzone_idx, int alloc_flags)
{
return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
zone_page_state(z, NR_FREE_PAGES));
}
bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark,
int classzone_idx, int alloc_flags)
{
long free_pages = zone_page_state(z, NR_FREE_PAGES);
if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark)
free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES);
return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
free_pages);
#ifdef CONFIG_NUMA
/*
* zlc_setup - Setup for "zonelist cache". Uses cached zone data to
* skip over zones that are not allowed by the cpuset, or that have
* been recently (in last second) found to be nearly full. See further
* comments in mmzone.h. Reduces cache footprint of zonelist scans
* that have to skip over a lot of full or unallowed zones.
*
* If the zonelist cache is present in the passed in zonelist, then
* returns a pointer to the allowed node mask (either the current

Lai Jiangshan
committed
* tasks mems_allowed, or node_states[N_MEMORY].)
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
*
* If the zonelist cache is not available for this zonelist, does
* nothing and returns NULL.
*
* If the fullzones BITMAP in the zonelist cache is stale (more than
* a second since last zap'd) then we zap it out (clear its bits.)
*
* We hold off even calling zlc_setup, until after we've checked the
* first zone in the zonelist, on the theory that most allocations will
* be satisfied from that first zone, so best to examine that zone as
* quickly as we can.
*/
static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags)
{
struct zonelist_cache *zlc; /* cached zonelist speedup info */
nodemask_t *allowednodes; /* zonelist_cache approximation */
zlc = zonelist->zlcache_ptr;
if (!zlc)
return NULL;
if (time_after(jiffies, zlc->last_full_zap + HZ)) {
bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST);
zlc->last_full_zap = jiffies;
}
allowednodes = !in_interrupt() && (alloc_flags & ALLOC_CPUSET) ?
&cpuset_current_mems_allowed :

Lai Jiangshan
committed
&node_states[N_MEMORY];
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
return allowednodes;
}
/*
* Given 'z' scanning a zonelist, run a couple of quick checks to see
* if it is worth looking at further for free memory:
* 1) Check that the zone isn't thought to be full (doesn't have its
* bit set in the zonelist_cache fullzones BITMAP).
* 2) Check that the zones node (obtained from the zonelist_cache
* z_to_n[] mapping) is allowed in the passed in allowednodes mask.
* Return true (non-zero) if zone is worth looking at further, or
* else return false (zero) if it is not.
*
* This check -ignores- the distinction between various watermarks,
* such as GFP_HIGH, GFP_ATOMIC, PF_MEMALLOC, ... If a zone is
* found to be full for any variation of these watermarks, it will
* be considered full for up to one second by all requests, unless
* we are so low on memory on all allowed nodes that we are forced
* into the second scan of the zonelist.
*
* In the second scan we ignore this zonelist cache and exactly
* apply the watermarks to all zones, even it is slower to do so.
* We are low on memory in the second scan, and should leave no stone
* unturned looking for a free page.
*/
static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zoneref *z,
nodemask_t *allowednodes)
{
struct zonelist_cache *zlc; /* cached zonelist speedup info */
int i; /* index of *z in zonelist zones */
int n; /* node that zone *z is on */
zlc = zonelist->zlcache_ptr;
if (!zlc)
return 1;
i = z - zonelist->_zonerefs;
n = zlc->z_to_n[i];
/* This zone is worth trying if it is allowed but not full */
return node_isset(n, *allowednodes) && !test_bit(i, zlc->fullzones);
}
/*
* Given 'z' scanning a zonelist, set the corresponding bit in
* zlc->fullzones, so that subsequent attempts to allocate a page
* from that zone don't waste time re-examining it.
*/
static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z)
{
struct zonelist_cache *zlc; /* cached zonelist speedup info */
int i; /* index of *z in zonelist zones */
zlc = zonelist->zlcache_ptr;
if (!zlc)
return;
i = z - zonelist->_zonerefs;
set_bit(i, zlc->fullzones);
}
/*
* clear all zones full, called after direct reclaim makes progress so that
* a zone that was recently full is not skipped over for up to a second
*/
static void zlc_clear_zones_full(struct zonelist *zonelist)
{
struct zonelist_cache *zlc; /* cached zonelist speedup info */
zlc = zonelist->zlcache_ptr;
if (!zlc)
return;
bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST);
}
static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
{
return node_isset(local_zone->node, zone->zone_pgdat->reclaim_nodes);
}
static void __paginginit init_zone_allows_reclaim(int nid)
{
int i;
for_each_online_node(i)

David Rientjes
committed
if (node_distance(nid, i) <= RECLAIM_DISTANCE)
node_set(i, NODE_DATA(nid)->reclaim_nodes);

David Rientjes
committed
else
zone_reclaim_mode = 1;
}
#else /* CONFIG_NUMA */
static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags)
{
return NULL;
}
static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zoneref *z,
nodemask_t *allowednodes)
{
return 1;
}
static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z)
static void zlc_clear_zones_full(struct zonelist *zonelist)
{
}
static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
{
return true;
}
static inline void init_zone_allows_reclaim(int nid)
{
}
#endif /* CONFIG_NUMA */
* get_page_from_freelist goes through the zonelist trying to allocate
* a page.
*/
static struct page *
get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order,
struct zonelist *zonelist, int high_zoneidx, int alloc_flags,
struct zone *preferred_zone, int migratetype)
struct zoneref *z;
struct zone *zone;
nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */
int zlc_active = 0; /* set if using zonelist_cache */
int did_zlc_setup = 0; /* just call zlc_setup() one time */
classzone_idx = zone_idx(preferred_zone);
* Scan zonelist, looking for a zone with enough free.
* See also cpuset_zone_allowed() comment in kernel/cpuset.c.
*/
for_each_zone_zonelist_nodemask(zone, z, zonelist,
high_zoneidx, nodemask) {
if (IS_ENABLED(CONFIG_NUMA) && zlc_active &&
!zlc_zone_worth_trying(zonelist, z, allowednodes))
continue;
!cpuset_zone_allowed_softwall(zone, gfp_mask))
continue;
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
/*
* When allocating a page cache page for writing, we
* want to get it from a zone that is within its dirty
* limit, such that no single zone holds more than its
* proportional share of globally allowed dirty pages.
* The dirty limits take into account the zone's
* lowmem reserves and high watermark so that kswapd
* should be able to balance it without having to
* write pages from its LRU list.
*
* This may look like it could increase pressure on
* lower zones by failing allocations in higher zones
* before they are full. But the pages that do spill
* over are limited as the lower zones are protected
* by this very same mechanism. It should not become
* a practical burden to them.
*
* XXX: For now, allow allocations to potentially
* exceed the per-zone dirty limit in the slowpath
* (ALLOC_WMARK_LOW unset) before going into reclaim,
* which is important when on a NUMA setup the allowed
* zones are together not big enough to reach the
* global limit. The proper fix for these situations
* will require awareness of zones in the
* dirty-throttling and the flusher threads.
*/
if ((alloc_flags & ALLOC_WMARK_LOW) &&
(gfp_mask & __GFP_WRITE) && !zone_dirty_ok(zone))
goto this_zone_full;
BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK);
if (!(alloc_flags & ALLOC_NO_WATERMARKS)) {
int ret;
mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK];
if (zone_watermark_ok(zone, order, mark,
classzone_idx, alloc_flags))
goto try_this_zone;
if (IS_ENABLED(CONFIG_NUMA) &&
!did_zlc_setup && nr_online_nodes > 1) {
/*
* we do zlc_setup if there are multiple nodes
* and before considering the first zone allowed
* by the cpuset.
*/
allowednodes = zlc_setup(zonelist, alloc_flags);
zlc_active = 1;
did_zlc_setup = 1;
}
if (zone_reclaim_mode == 0 ||
!zone_allows_reclaim(preferred_zone, zone))
goto this_zone_full;
/*
* As we may have just activated ZLC, check if the first
* eligible zone has failed zone_reclaim recently.
*/
if (IS_ENABLED(CONFIG_NUMA) && zlc_active &&
!zlc_zone_worth_trying(zonelist, z, allowednodes))
continue;
ret = zone_reclaim(zone, gfp_mask, order);
switch (ret) {
case ZONE_RECLAIM_NOSCAN:
/* did not scan */
continue;
case ZONE_RECLAIM_FULL:
/* scanned but unreclaimable */
continue;
default:
/* did we reclaim enough */
if (zone_watermark_ok(zone, order, mark,
classzone_idx, alloc_flags))
goto try_this_zone;
/*
* Failed to reclaim enough to meet watermark.
* Only mark the zone full if checking the min
* watermark or if we failed to reclaim just
* 1<<order pages or else the page allocator
* fastpath will prematurely mark zones full
* when the watermark is between the low and
* min watermarks.
*/
if (((alloc_flags & ALLOC_WMARK_MASK) == ALLOC_WMARK_MIN) ||
ret == ZONE_RECLAIM_SOME)
continue;
try_this_zone:
page = buffered_rmqueue(preferred_zone, zone, order,
gfp_mask, migratetype);
if (IS_ENABLED(CONFIG_NUMA))
zlc_mark_zone_full(zonelist, z);
if (unlikely(IS_ENABLED(CONFIG_NUMA) && page == NULL && zlc_active)) {
/* Disable zlc cache for second zonelist scan */
zlc_active = 0;
goto zonelist_scan;
}
if (page)
/*
* page->pfmemalloc is set when ALLOC_NO_WATERMARKS was
* necessary to allocate the page. The expectation is
* that the caller is taking steps that will free more
* memory. The caller should avoid the page being used
* for !PFMEMALLOC purposes.
*/
page->pfmemalloc = !!(alloc_flags & ALLOC_NO_WATERMARKS);

David Rientjes
committed
/*
* Large machines with many possible nodes should not always dump per-node
* meminfo in irq context.
*/
static inline bool should_suppress_show_mem(void)
{