break;
default:
/* Look ma, no brain */
BUG();
}
/*
* This is a basic per-zone page freer. Used by both kswapd and direct reclaim.
*/
static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
{
unsigned long nr[NR_LRU_LISTS];
unsigned long targets[NR_LRU_LISTS];
unsigned long nr_to_scan;
enum lru_list lru;
unsigned long nr_reclaimed = 0;
unsigned long nr_to_reclaim = sc->nr_to_reclaim;
struct blk_plug plug;
bool scan_adjusted = false;
get_scan_count(lruvec, sc, nr);
/* Record the original scan target for proportional adjustments later */
memcpy(targets, nr, sizeof(nr));
blk_start_plug(&plug);
while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
nr[LRU_INACTIVE_FILE]) {
unsigned long nr_anon, nr_file, percentage;
unsigned long nr_scanned;
for_each_evictable_lru(lru) {
if (nr[lru]) {
nr_to_scan = min(nr[lru], SWAP_CLUSTER_MAX);
nr[lru] -= nr_to_scan;
nr_reclaimed += shrink_list(lru, nr_to_scan,
lruvec, sc);
}
}
if (nr_reclaimed < nr_to_reclaim || scan_adjusted)
continue;
/*
* For global direct reclaim, reclaim only the number of pages
* requested. Less care is taken to scan proportionally as it
* is more important to minimise direct reclaim stall latency
* than it is to properly age the LRU lists.
*/
if (global_reclaim(sc) && !current_is_kswapd())
break;
/*
* For kswapd and memcg, reclaim at least the number of pages
* requested. Ensure that the anon and file LRUs shrink
* proportionally to what was requested by get_scan_count(). We
* stop reclaiming one LRU and reduce the amount of scanning
* proportional to the original scan target.
*/
nr_file = nr[LRU_INACTIVE_FILE] + nr[LRU_ACTIVE_FILE];
nr_anon = nr[LRU_INACTIVE_ANON] + nr[LRU_ACTIVE_ANON];
if (nr_file > nr_anon) {
unsigned long scan_target = targets[LRU_INACTIVE_ANON] +
targets[LRU_ACTIVE_ANON] + 1;
lru = LRU_BASE;
percentage = nr_anon * 100 / scan_target;
} else {
unsigned long scan_target = targets[LRU_INACTIVE_FILE] +
targets[LRU_ACTIVE_FILE] + 1;
lru = LRU_FILE;
percentage = nr_file * 100 / scan_target;
}
/* Stop scanning the smaller of the LRU */
nr[lru] = 0;
nr[lru + LRU_ACTIVE] = 0;
/*
* Recalculate the other LRU scan count based on its original
* scan target and the percentage scanning already complete
*/
lru = (lru == LRU_FILE) ? LRU_BASE : LRU_FILE;
nr_scanned = targets[lru] - nr[lru];
nr[lru] = targets[lru] * (100 - percentage) / 100;
nr[lru] -= min(nr[lru], nr_scanned);
lru += LRU_ACTIVE;
nr_scanned = targets[lru] - nr[lru];
nr[lru] = targets[lru] * (100 - percentage) / 100;
nr[lru] -= min(nr[lru], nr_scanned);
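/*
* Illustrative example with hypothetical numbers: suppose get_scan_count()
* targeted 99 anon pages (scan_target = 99 + 1 = 100) and 200 inactive
* file pages. If 30 anon pages remain when the anon LRU is stopped,
* percentage = 30, so the inactive file budget becomes
* 200 * (100 - 30) / 100 = 140. With 80 file pages already scanned,
* nr[lru] drops to 140 - 80 = 60, so file scanning also stops at roughly
* the same 70% point at which anon scanning stopped.
*/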
scan_adjusted = true;
}
blk_finish_plug(&plug);
sc->nr_reclaimed += nr_reclaimed;
/*
* Even if we did not try to evict anon pages at all, we want to
* rebalance the anon lru active/inactive ratio.
*/
if (inactive_anon_is_low(lruvec))
shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
sc, LRU_ACTIVE_ANON);
throttle_vm_writeout(sc->gfp_mask);
}
/* Use reclaim/compaction for costly allocs or under memory pressure */
static bool in_reclaim_compaction(struct scan_control *sc)
{
if (IS_ENABLED(CONFIG_COMPACTION) && sc->order &&
(sc->order > PAGE_ALLOC_COSTLY_ORDER ||
sc->priority < DEF_PRIORITY - 2))
return true;
return false;
}
/*
* Reclaim/compaction is used for high-order allocation requests. It reclaims
* order-0 pages before compacting the zone. should_continue_reclaim() returns
* true if more pages should be reclaimed such that when the page allocator
* calls try_to_compact_zone() that it will have enough free pages to succeed.
* It will give up earlier than that if there is difficulty reclaiming pages.
*/
static inline bool should_continue_reclaim(struct zone *zone,
unsigned long nr_reclaimed,
unsigned long nr_scanned,
struct scan_control *sc)
{
unsigned long pages_for_compaction;
unsigned long inactive_lru_pages;
/* If not in reclaim/compaction mode, stop */
if (!in_reclaim_compaction(sc))
return false;
/* Consider stopping depending on scan and reclaim activity */
if (sc->gfp_mask & __GFP_REPEAT) {
/*
* For __GFP_REPEAT allocations, stop reclaiming if the
* full LRU list has been scanned and we are still failing
* to reclaim pages. This full LRU scan is potentially
* expensive but a __GFP_REPEAT caller really wants to succeed
*/
if (!nr_reclaimed && !nr_scanned)
return false;
} else {
/*
* For non-__GFP_REPEAT allocations which can presumably
* fail without consequence, stop if we failed to reclaim
* any pages from the last SWAP_CLUSTER_MAX number of
* pages that were scanned. This will return to the
* caller faster at the risk that reclaim/compaction and
* the resulting allocation attempt fail.
*/
if (!nr_reclaimed)
return false;
}
/*
* If we have not reclaimed enough pages for compaction and the
* inactive lists are large enough, continue reclaiming
*/
pages_for_compaction = (2UL << sc->order);
inactive_lru_pages = zone_page_state(zone, NR_INACTIVE_FILE);
inactive_lru_pages += zone_page_state(zone, NR_INACTIVE_ANON);
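/*
* Example with assumed values: for an order-9 (THP-sized) request,
* pages_for_compaction = 2UL << 9 = 1024 pages, i.e. 4MB with 4K pages.
* Reclaim continues while fewer than 1024 pages have been reclaimed and
* the inactive lists still hold more than 1024 pages.
*/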
if (sc->nr_reclaimed < pages_for_compaction &&
inactive_lru_pages > pages_for_compaction)
return true;
/* If compaction would go ahead or the allocation would succeed, stop */
switch (compaction_suitable(zone, sc->order)) {
case COMPACT_PARTIAL:
case COMPACT_CONTINUE:
return false;
default:
return true;
}
}
static void shrink_zone(struct zone *zone, struct scan_control *sc)
{
unsigned long nr_reclaimed, nr_scanned;
do {
struct mem_cgroup *root = sc->target_mem_cgroup;
struct mem_cgroup_reclaim_cookie reclaim = {
.zone = zone,
.priority = sc->priority,
};
struct mem_cgroup *memcg;
nr_reclaimed = sc->nr_reclaimed;
nr_scanned = sc->nr_scanned;
memcg = mem_cgroup_iter(root, NULL, &reclaim);
do {
struct lruvec *lruvec;
lruvec = mem_cgroup_zone_lruvec(zone, memcg);
shrink_lruvec(lruvec, sc);
/*
* Direct reclaim and kswapd have to scan all memory
* cgroups to fulfill the overall scan target for the
* zone.
*
* Limit reclaim, on the other hand, only cares about
* nr_to_reclaim pages to be reclaimed and it will
* retry with decreasing priority if one round over the
* whole hierarchy is not sufficient.
*/
if (!global_reclaim(sc) &&
sc->nr_reclaimed >= sc->nr_to_reclaim) {
mem_cgroup_iter_break(root, memcg);
break;
}
memcg = mem_cgroup_iter(root, memcg, &reclaim);
} while (memcg);
vmpressure(sc->gfp_mask, sc->target_mem_cgroup,
sc->nr_scanned - nr_scanned,
sc->nr_reclaimed - nr_reclaimed);
} while (should_continue_reclaim(zone, sc->nr_reclaimed - nr_reclaimed,
sc->nr_scanned - nr_scanned, sc));
}
/* Returns true if compaction should go ahead for a high-order request */
static inline bool compaction_ready(struct zone *zone, struct scan_control *sc)
{
unsigned long balance_gap, watermark;
bool watermark_ok;
/* Do not consider compaction for orders reclaim is meant to satisfy */
if (sc->order <= PAGE_ALLOC_COSTLY_ORDER)
return false;
/*
* Compaction takes time to run and there are potentially other
* callers using the pages just freed. Continue reclaiming until
* there is a buffer of free pages available to give compaction
* a reasonable chance of completing and allocating the page
*/
balance_gap = min(low_wmark_pages(zone),
(zone->managed_pages + KSWAPD_ZONE_BALANCE_GAP_RATIO-1) /
KSWAPD_ZONE_BALANCE_GAP_RATIO);
watermark = high_wmark_pages(zone) + balance_gap + (2UL << sc->order);
watermark_ok = zone_watermark_ok_safe(zone, 0, watermark, 0, 0);
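/*
* Worked example for a hypothetical 1GB zone (262144 4K pages): with the
* ~1% balance gap ratio, the gap term comes to roughly 2622 pages and
* balance_gap is the smaller of that and low_wmark_pages(zone). An
* order-9 request adds 2UL << 9 = 1024 pages, so reclaim continues until
* the zone has high_wmark + balance_gap + 1024 pages free.
*/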
/*
* If compaction is deferred, reclaim up to a point where
* compaction will have a chance of success when re-enabled
*/
if (compaction_deferred(zone, sc->order))
return watermark_ok;
/* If compaction is not ready to start, keep reclaiming */
if (!compaction_suitable(zone, sc->order))
return false;
return watermark_ok;
}
/*
* This is the direct reclaim path, for page-allocating processes. We only
* try to reclaim pages from zones which will satisfy the caller's allocation
* request.
*
* We reclaim from a zone even if that zone is over high_wmark_pages(zone).
* Because:
* a) The caller may be trying to free *extra* pages to satisfy a higher-order
* allocation or
* b) The target zone may be at high_wmark_pages(zone) but the lower zones
* must go *over* high_wmark_pages(zone) to satisfy the `incremental min'
* zone defense algorithm.
*
* If a zone is deemed to be full of pinned pages then just give it a light
* scan then give up on it.
*
* This function returns true if a zone is being reclaimed for a costly
* high-order allocation and compaction is ready to begin. This indicates to
* the caller that it should consider retrying the allocation instead of
* further reclaim.
*/
static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
{
struct zoneref *z;
struct zone *zone;
unsigned long nr_soft_reclaimed;
unsigned long nr_soft_scanned;
bool aborted_reclaim = false;
/*
* If the number of buffer_heads in the machine exceeds the maximum
* allowed level, force direct reclaim to scan the highmem zone as
* highmem pages could be pinning lowmem pages storing buffer_heads
*/
if (buffer_heads_over_limit)
sc->gfp_mask |= __GFP_HIGHMEM;
for_each_zone_zonelist_nodemask(zone, z, zonelist,
gfp_zone(sc->gfp_mask), sc->nodemask) {
if (!populated_zone(zone))
continue;
/*
* Take care that memory controller reclaim has only a small
* influence on the global LRU.
*/
if (global_reclaim(sc)) {
if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
continue;
if (sc->priority != DEF_PRIORITY &&
!zone_reclaimable(zone))
continue; /* Let kswapd poll it */
if (IS_ENABLED(CONFIG_COMPACTION)) {
/*
* If we already have plenty of memory free for
* compaction in this zone, don't free any more.
* Even though compaction is invoked for any
* non-zero order, only frequent costly order
* reclamation is disruptive enough to become a
* noticeable problem, like transparent huge
* page allocations.
*/
if (compaction_ready(zone, sc)) {
aborted_reclaim = true;
continue;
}
}
/*
* This steals pages from memory cgroups over softlimit
* and returns the number of reclaimed pages and
* scanned pages. This works for global memory pressure
* and balancing, not for a memcg's limit.
*/
nr_soft_scanned = 0;
nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone,
sc->order, sc->gfp_mask,
&nr_soft_scanned);
sc->nr_reclaimed += nr_soft_reclaimed;
sc->nr_scanned += nr_soft_scanned;
/* need some check to avoid more shrink_zone() calls */
}

shrink_zone(zone, sc);
}

return aborted_reclaim;
}
/* All zones in zonelist are unreclaimable? */
static bool all_unreclaimable(struct zonelist *zonelist,
struct scan_control *sc)
{
struct zoneref *z;
struct zone *zone;
for_each_zone_zonelist_nodemask(zone, z, zonelist,
gfp_zone(sc->gfp_mask), sc->nodemask) {
if (!populated_zone(zone))
continue;
if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
continue;
if (zone_reclaimable(zone))
return false;
}
return true;
}
/*
* This is the main entry point to direct page reclaim.
*
* If a full scan of the inactive list fails to free enough memory then we
* are "out of memory" and something needs to be killed.
*
* If the caller is !__GFP_FS then the probability of a failure is reasonably
* high - the zone may be full of dirty or under-writeback pages, which this
* caller can't do much about. We kick the writeback threads and take explicit
* naps in the hope that some of these pages can be written. But if the
* allocating task holds filesystem locks which prevent writeout this might not
* work, and the allocation attempt will fail.
*
* returns: 0, if no pages reclaimed
* else, the number of pages reclaimed
*/
static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
struct scan_control *sc,
struct shrink_control *shrink)
{
unsigned long total_scanned = 0;
struct reclaim_state *reclaim_state = current->reclaim_state;
struct zoneref *z;
struct zone *zone;
unsigned long writeback_threshold;
bool aborted_reclaim;

delayacct_freepages_start();

if (global_reclaim(sc))
count_vm_event(ALLOCSTALL);

do {
vmpressure_prio(sc->gfp_mask, sc->target_mem_cgroup,
sc->priority);
sc->nr_scanned = 0;
aborted_reclaim = shrink_zones(zonelist, sc);
/*
* Don't shrink slabs when reclaiming memory from over limit
* cgroups but do shrink slab at least once when aborting
* reclaim for compaction to avoid unevenly scanning file/anon
* LRU pages over slab pages.
*/
if (global_reclaim(sc)) {
unsigned long lru_pages = 0;
for_each_zone_zonelist(zone, z, zonelist,
gfp_zone(sc->gfp_mask)) {
if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
continue;
lru_pages += zone_reclaimable_pages(zone);
node_set(zone_to_nid(zone),
shrink->nodes_to_scan);
}
shrink_slab(shrink, sc->nr_scanned, lru_pages);
if (reclaim_state) {
sc->nr_reclaimed += reclaim_state->reclaimed_slab;
reclaim_state->reclaimed_slab = 0;
}
}

total_scanned += sc->nr_scanned;
if (sc->nr_reclaimed >= sc->nr_to_reclaim)
break;
/*
* If we're getting trouble reclaiming, start doing
* writepage even in laptop mode.
*/
if (sc->priority < DEF_PRIORITY - 2)
sc->may_writepage = 1;
/*
* Try to write back as many pages as we just scanned. This
* tends to cause slow streaming writers to write data to the
* disk smoothly, at the dirtying rate, which is nice. But
* that's undesirable in laptop mode, where we *want* lumpy
* writeout. So in laptop mode, write out the whole world.
*/
writeback_threshold = sc->nr_to_reclaim + sc->nr_to_reclaim / 2;
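/*
* Illustrative numbers: with the default nr_to_reclaim of SWAP_CLUSTER_MAX
* (32 pages), writeback_threshold is 32 + 16 = 48, so the flusher threads
* are only woken once more than 48 pages have been scanned in total.
*/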
if (total_scanned > writeback_threshold) {
wakeup_flusher_threads(laptop_mode ? 0 : total_scanned,
WB_REASON_TRY_TO_FREE_PAGES);
sc->may_writepage = 1;
}
} while (--sc->priority >= 0 && !aborted_reclaim);
delayacct_freepages_end();
if (sc->nr_reclaimed)
return sc->nr_reclaimed;
/*
* As hibernation is going on, kswapd is freezed so that it can't mark
* the zone into all_unreclaimable. Thus bypassing all_unreclaimable
* check.
*/
if (oom_killer_disabled)
return 0;
/* Aborted reclaim to try compaction? don't OOM, then */
if (aborted_reclaim)
return 1;
/* top priority shrink_zones still had more to do? don't OOM, then */
if (global_reclaim(sc) && !all_unreclaimable(zonelist, sc))
return 1;
return 0;
}
static bool pfmemalloc_watermark_ok(pg_data_t *pgdat)
{
struct zone *zone;
unsigned long pfmemalloc_reserve = 0;
unsigned long free_pages = 0;
int i;
bool wmark_ok;
for (i = 0; i <= ZONE_NORMAL; i++) {
zone = &pgdat->node_zones[i];
pfmemalloc_reserve += min_wmark_pages(zone);
free_pages += zone_page_state(zone, NR_FREE_PAGES);
}
wmark_ok = free_pages > pfmemalloc_reserve / 2;
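/*
* Example with assumed values: if ZONE_DMA and ZONE_NORMAL have min
* watermarks of 128 and 1024 pages, pfmemalloc_reserve = 1152 and the
* node counts as ok only while more than 1152 / 2 = 576 pages remain
* free across those zones.
*/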
/* kswapd must be awake if processes are being throttled */
if (!wmark_ok && waitqueue_active(&pgdat->kswapd_wait)) {
pgdat->classzone_idx = min(pgdat->classzone_idx,
(enum zone_type)ZONE_NORMAL);
wake_up_interruptible(&pgdat->kswapd_wait);
}
return wmark_ok;
}
/*
* Throttle direct reclaimers if backing storage is backed by the network
* and the PFMEMALLOC reserve for the preferred node is getting dangerously
* depleted. kswapd will continue to make progress and wake the processes
* when the low watermark is reached.
*
* Returns true if a fatal signal was delivered during throttling. If this
* happens, the page allocator should not consider triggering the OOM killer.
*/
static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist,
nodemask_t *nodemask)
{
struct zone *zone;
int high_zoneidx = gfp_zone(gfp_mask);
pg_data_t *pgdat;
/*
* Kernel threads should not be throttled as they may be indirectly
* responsible for cleaning pages necessary for reclaim to make forward
* progress. kjournald for example may enter direct reclaim while
committing a transaction where throttling it could force other
* processes to block on log_wait_commit().
*/
if (current->flags & PF_KTHREAD)
goto out;
/*
* If a fatal signal is pending, this process should not throttle.
* It should return quickly so it can exit and free its memory
*/
if (fatal_signal_pending(current))
goto out;
/* Check if the pfmemalloc reserves are ok */
first_zones_zonelist(zonelist, high_zoneidx, NULL, &zone);
pgdat = zone->zone_pgdat;
if (pfmemalloc_watermark_ok(pgdat))
goto out;
/* Account for the throttling */
count_vm_event(PGSCAN_DIRECT_THROTTLE);
/*
* If the caller cannot enter the filesystem, it's possible that it
* is due to the caller holding an FS lock or performing a journal
* transaction in the case of a filesystem like ext[3|4]. In this case,
* it is not safe to block on pfmemalloc_wait as kswapd could be
* blocked waiting on the same lock. Instead, throttle for up to a
* second before continuing.
*/
if (!(gfp_mask & __GFP_FS)) {
wait_event_interruptible_timeout(pgdat->pfmemalloc_wait,
pfmemalloc_watermark_ok(pgdat), HZ);
goto check_pending;
}
/* Throttle until kswapd wakes the process */
wait_event_killable(zone->zone_pgdat->pfmemalloc_wait,
pfmemalloc_watermark_ok(pgdat));
check_pending:
if (fatal_signal_pending(current))
return true;
out:
return false;
}
unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
gfp_t gfp_mask, nodemask_t *nodemask)
{
unsigned long nr_reclaimed;
struct scan_control sc = {
.gfp_mask = (gfp_mask = memalloc_noio_flags(gfp_mask)),
.may_writepage = !laptop_mode,
.nr_to_reclaim = SWAP_CLUSTER_MAX,
.may_unmap = 1,
.may_swap = 1,
.order = order,
.priority = DEF_PRIORITY,
.target_mem_cgroup = NULL,
.nodemask = nodemask,
};
struct shrink_control shrink = {
.gfp_mask = sc.gfp_mask,
};
/*
* Do not enter reclaim if fatal signal was delivered while throttled.
* 1 is returned so that the page allocator does not OOM kill at this
* point.
*/
if (throttle_direct_reclaim(gfp_mask, zonelist, nodemask))
return 1;
trace_mm_vmscan_direct_reclaim_begin(order,
sc.may_writepage,
gfp_mask);
nr_reclaimed = do_try_to_free_pages(zonelist, &sc, &shrink);
trace_mm_vmscan_direct_reclaim_end(nr_reclaimed);
return nr_reclaimed;
}
unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg,
gfp_t gfp_mask, bool noswap,
struct zone *zone,
unsigned long *nr_scanned)
{
struct scan_control sc = {
.nr_scanned = 0,
.nr_to_reclaim = SWAP_CLUSTER_MAX,
.may_writepage = !laptop_mode,
.may_unmap = 1,
.may_swap = !noswap,
.order = 0,
.priority = 0,
.target_mem_cgroup = memcg,
};
struct lruvec *lruvec = mem_cgroup_zone_lruvec(zone, memcg);
sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
(GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK);
trace_mm_vmscan_memcg_softlimit_reclaim_begin(sc.order,
sc.may_writepage,
sc.gfp_mask);
/*
* NOTE: Although we can get the priority field, using it
* here is not a good idea, since it limits the pages we can scan.
* If we don't reclaim here, the shrink_zone from balance_pgdat
* will pick up pages from other mem cgroups as well. We hack
* the priority and make it zero.
*/
shrink_lruvec(lruvec, &sc);
trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed);
*nr_scanned = sc.nr_scanned;
return sc.nr_reclaimed;
}
unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
gfp_t gfp_mask,
bool noswap)
{
struct zonelist *zonelist;
unsigned long nr_reclaimed;
int nid;
struct scan_control sc = {
.may_writepage = !laptop_mode,
.nr_to_reclaim = SWAP_CLUSTER_MAX,
.priority = DEF_PRIORITY,
.target_mem_cgroup = memcg,
.nodemask = NULL, /* we don't care the placement */
.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
(GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK),
};
struct shrink_control shrink = {
.gfp_mask = sc.gfp_mask,
};
/*
* Unlike direct reclaim via alloc_pages(), memcg's reclaim doesn't
* care where the pages come from. So the node where we start the
* scan does not need to be the current node.
*/
nid = mem_cgroup_select_victim_node(memcg);
zonelist = NODE_DATA(nid)->node_zonelists;
trace_mm_vmscan_memcg_reclaim_begin(0,
sc.may_writepage,
sc.gfp_mask);
nr_reclaimed = do_try_to_free_pages(zonelist, &sc, &shrink);
trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed);
return nr_reclaimed;
}
static void age_active_anon(struct zone *zone, struct scan_control *sc)
{
struct mem_cgroup *memcg;
if (!total_swap_pages)
return;
memcg = mem_cgroup_iter(NULL, NULL, NULL);
do {
struct lruvec *lruvec = mem_cgroup_zone_lruvec(zone, memcg);
if (inactive_anon_is_low(lruvec))
shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
sc, LRU_ACTIVE_ANON);
memcg = mem_cgroup_iter(NULL, memcg, NULL);
} while (memcg);
}
static bool zone_balanced(struct zone *zone, int order,
unsigned long balance_gap, int classzone_idx)
{
if (!zone_watermark_ok_safe(zone, order, high_wmark_pages(zone) +
balance_gap, classzone_idx, 0))
return false;
if (IS_ENABLED(CONFIG_COMPACTION) && order &&
!compaction_suitable(zone, order))
return false;
return true;
}
/*
* pgdat_balanced() is used when checking if a node is balanced.
*
* For order-0, all zones must be balanced!
*
* For high-order allocations only zones that meet watermarks and are in a
* zone allowed by the callers classzone_idx are added to balanced_pages. The
* total of balanced pages must be at least 25% of the zones allowed by
* classzone_idx for the node to be considered balanced. Forcing all zones to
* be balanced for high orders can cause excessive reclaim when there are
* imbalanced zones.
* The choice of 25% is due to
* o a 16M DMA zone that is balanced will not balance a zone on any
* reasonable sized machine
* o On all other machines, the top zone must be at least a reasonable
* percentage of the middle zones. For example, on 32-bit x86, highmem
* would need to be at least 256M for it to balance a whole node.
* Similarly, on x86-64 the Normal zone would need to be at least 1G
* to balance a node on its own. These seemed like reasonable ratios.
*/
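/*
* Illustrative check with hypothetical numbers: if the zones up to
* classzone_idx manage 4GB of pages, a high-order wakeup treats the node
* as balanced once zones holding at least 1GB worth of managed pages
* (managed_pages >> 2) individually meet their watermarks.
*/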
static bool pgdat_balanced(pg_data_t *pgdat, int order, int classzone_idx)
{
unsigned long managed_pages = 0;
unsigned long balanced_pages = 0;
int i;
/* Check the watermark levels */
for (i = 0; i <= classzone_idx; i++) {
struct zone *zone = pgdat->node_zones + i;
if (!populated_zone(zone))
continue;
managed_pages += zone->managed_pages;
/*
* A special case here:
*
* balance_pgdat() skips over all_unreclaimable after
* DEF_PRIORITY. Effectively, it considers them balanced so
* they must be considered balanced here as well!
*/
if (!zone_reclaimable(zone)) {
balanced_pages += zone->managed_pages;
continue;
}
if (zone_balanced(zone, order, 0, i))
balanced_pages += zone->managed_pages;
else if (!order)
return false;
}
if (order)
return balanced_pages >= (managed_pages >> 2);
else
return true;
}
/*
* Prepare kswapd for sleeping. This verifies that there are no processes
* waiting in throttle_direct_reclaim() and that watermarks have been met.
*
* Returns true if kswapd is ready to sleep
*/
static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining,
int classzone_idx)
{
/* If a direct reclaimer woke kswapd within HZ/10, it's premature */
if (remaining)
return false;
/*
* There is a potential race between when kswapd checks its watermarks
* and a process gets throttled. There is also a potential race if
* processes get throttled, kswapd wakes, a large process exits, thereby
* balancing the zones, which causes kswapd to miss a wakeup. If kswapd
* is going to sleep, no process should be sleeping on pfmemalloc_wait
* so wake them now if necessary. If necessary, processes will wake
* kswapd and get throttled again
*/
if (waitqueue_active(&pgdat->pfmemalloc_wait)) {
wake_up(&pgdat->pfmemalloc_wait);
return false;
}
return pgdat_balanced(pgdat, order, classzone_idx);
}
/*
* kswapd shrinks the zone by the number of pages required to reach
* the high watermark.
*
* Returns true if kswapd scanned at least the requested number of pages to
* reclaim or if the lack of progress was due to pages under writeback.
* This is used to determine if the scanning priority needs to be raised.
*/
static bool kswapd_shrink_zone(struct zone *zone,
int classzone_idx,
struct scan_control *sc,
unsigned long lru_pages,
unsigned long *nr_attempted)
{
int testorder = sc->order;
unsigned long balance_gap;
struct reclaim_state *reclaim_state = current->reclaim_state;
struct shrink_control shrink = {
.gfp_mask = sc->gfp_mask,
};
bool lowmem_pressure;
/* Reclaim above the high watermark. */
sc->nr_to_reclaim = max(SWAP_CLUSTER_MAX, high_wmark_pages(zone));
/*
* Kswapd reclaims only single pages with compaction enabled. Trying
* too hard to reclaim until contiguous free pages have become
* available can hurt performance by evicting too much useful data
* from memory. Do not reclaim more than needed for compaction.
*/
if (IS_ENABLED(CONFIG_COMPACTION) && sc->order &&
compaction_suitable(zone, sc->order) !=
COMPACT_SKIPPED)
testorder = 0;
/*
* We put equal pressure on every zone, unless one zone has way too
* many pages free already. The "too many pages" is defined as the
* high wmark plus a "gap" where the gap is either the low
* watermark or 1% of the zone, whichever is smaller.
*/
balance_gap = min(low_wmark_pages(zone),
(zone->managed_pages + KSWAPD_ZONE_BALANCE_GAP_RATIO-1) /
KSWAPD_ZONE_BALANCE_GAP_RATIO);
/*
* If there is no low memory pressure or the zone is balanced then no
* reclaim is necessary
*/
lowmem_pressure = (buffer_heads_over_limit && is_highmem(zone));
if (!lowmem_pressure && zone_balanced(zone, testorder,
balance_gap, classzone_idx))
return true;
shrink_zone(zone, sc);
nodes_clear(shrink.nodes_to_scan);
node_set(zone_to_nid(zone), shrink.nodes_to_scan);
reclaim_state->reclaimed_slab = 0;
shrink_slab(&shrink, sc->nr_scanned, lru_pages);
sc->nr_reclaimed += reclaim_state->reclaimed_slab;
/* Account for the number of pages attempted to reclaim */
*nr_attempted += sc->nr_to_reclaim;
zone_clear_flag(zone, ZONE_WRITEBACK);
/*
* If a zone reaches its high watermark, consider it to be no longer
* congested. It's possible there are dirty pages backed by congested
* BDIs but as pressure is relieved, speculatively avoid congestion
* waits.
*/
if (zone_reclaimable(zone) &&
zone_balanced(zone, testorder, 0, classzone_idx)) {
zone_clear_flag(zone, ZONE_CONGESTED);
zone_clear_flag(zone, ZONE_TAIL_LRU_DIRTY);
}
return sc->nr_scanned >= sc->nr_to_reclaim;
}
/*
* For kswapd, balance_pgdat() will work across all this node's zones until
* they are all at high_wmark_pages(zone).
* Returns the final order kswapd was reclaiming at
*
* There is special handling here for zones which are full of pinned pages.
* This can happen if the pages are all mlocked, or if they are all used by
* device drivers (say, ZONE_DMA). Or if they are all in use by hugetlb.
* What we do is to detect the case where all pages in the zone have been
* scanned twice and there has been zero successful reclaim. Mark the zone as
* dead and from now on, only perform a short scan. Basically we're polling
* the zone for when the problem goes away.
*
* kswapd scans the zones in the highmem->normal->dma direction. It skips
* zones which have free_pages > high_wmark_pages(zone), but once a zone is
* found to have free_pages <= high_wmark_pages(zone), we scan that zone and the
* lower zones regardless of the number of free pages in the lower zones. This
* interoperates with the page allocator fallback scheme to ensure that aging
* of pages is balanced across the zones.
*/
static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
int *classzone_idx)
{
int i;
int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */
unsigned long nr_soft_reclaimed;
unsigned long nr_soft_scanned;
struct scan_control sc = {
.gfp_mask = GFP_KERNEL,
.target_mem_cgroup = NULL,
};
unsigned long nr_attempted = 0;
bool pgdat_needs_compaction = (order > 0);
/*
* Scan in the highmem->dma direction for the highest
* zone which needs scanning
*/
for (i = pgdat->nr_zones - 1; i >= 0; i--) {
struct zone *zone = pgdat->node_zones + i;
if (!populated_zone(zone))
continue;
if (sc.priority != DEF_PRIORITY &&
!zone_reclaimable(zone))
continue;
/*
* Do some background aging of the anon list, to give
* pages a chance to be referenced before reclaiming.
*/
age_active_anon(zone, &sc);
/*
* If the number of buffer_heads in the machine
* exceeds the maximum allowed level and this node
* has a highmem zone, force kswapd to reclaim from
* it to relieve lowmem pressure.
*/
if (buffer_heads_over_limit && is_highmem_idx(i)) {
end_zone = i;
break;
}
if (!zone_balanced(zone, order, 0, 0)) {