Newer
Older
switch (compaction_suitable(zone, sc->order)) {
case COMPACT_PARTIAL:
case COMPACT_CONTINUE:
return false;
default:
return true;
}
}
static void shrink_zone(struct zone *zone, struct scan_control *sc)
unsigned long nr_reclaimed, nr_scanned;
do {
struct mem_cgroup *root = sc->target_mem_cgroup;
struct mem_cgroup_reclaim_cookie reclaim = {
.zone = zone,
.priority = sc->priority,
};
struct mem_cgroup *memcg;
nr_reclaimed = sc->nr_reclaimed;
nr_scanned = sc->nr_scanned;
memcg = mem_cgroup_iter(root, NULL, &reclaim);
do {
struct lruvec *lruvec;
lruvec = mem_cgroup_zone_lruvec(zone, memcg);
shrink_lruvec(lruvec, sc);

Johannes Weiner
committed
* Direct reclaim and kswapd have to scan all memory
* cgroups to fulfill the overall scan target for the
*
* Limit reclaim, on the other hand, only cares about
* nr_to_reclaim pages to be reclaimed and it will
* retry with decreasing priority if one round over the
* whole hierarchy is not sufficient.
if (!global_reclaim(sc) &&
sc->nr_reclaimed >= sc->nr_to_reclaim) {
mem_cgroup_iter_break(root, memcg);
break;
}
memcg = mem_cgroup_iter(root, memcg, &reclaim);
} while (memcg);
vmpressure(sc->gfp_mask, sc->target_mem_cgroup,
sc->nr_scanned - nr_scanned,
sc->nr_reclaimed - nr_reclaimed);
} while (should_continue_reclaim(zone, sc->nr_reclaimed - nr_reclaimed,
sc->nr_scanned - nr_scanned, sc));

Johannes Weiner
committed
}

Mel Gorman
committed
/*
 * Returns true if compaction should go ahead for a high-order request.
 *
 * Orders at or below PAGE_ALLOC_COSTLY_ORDER always return false. Otherwise
 * the result is the order-0 watermark check against
 * high watermark + balance gap + (2UL << sc->order), except that a zone for
 * which compaction is neither deferred nor suitable returns false.
 */
static inline bool compaction_ready(struct zone *zone, struct scan_control *sc)
{
	unsigned long balance_gap, watermark;
	bool watermark_ok;

	/* Do not consider compaction for orders reclaim is meant to satisfy */
	if (sc->order <= PAGE_ALLOC_COSTLY_ORDER)
		return false;

	/*
	 * Compaction takes time to run and there are potentially other
	 * callers using the pages just freed. Continue reclaiming until
	 * there is a buffer of free pages available to give compaction
	 * a reasonable chance of completing and allocating the page
	 */
	balance_gap = min(low_wmark_pages(zone),
		(zone->managed_pages + KSWAPD_ZONE_BALANCE_GAP_RATIO-1) /
			KSWAPD_ZONE_BALANCE_GAP_RATIO);
	watermark = high_wmark_pages(zone) + balance_gap + (2UL << sc->order);
	watermark_ok = zone_watermark_ok_safe(zone, 0, watermark, 0, 0);

	/*
	 * If compaction is deferred, reclaim up to a point where
	 * compaction will have a chance of success when re-enabled
	 */
	if (compaction_deferred(zone, sc->order))
		return watermark_ok;

	/* If compaction is not ready to start, keep reclaiming */
	if (!compaction_suitable(zone, sc->order))
		return false;

	return watermark_ok;
}
/*
* This is the direct reclaim path, for page-allocating processes. We only
* try to reclaim pages from zones which will satisfy the caller's allocation
* request.
*
* We reclaim from a zone even if that zone is over high_wmark_pages(zone).
* Because:
* a) The caller may be trying to free *extra* pages to satisfy a higher-order
* allocation or
* b) The target zone may be at high_wmark_pages(zone) but the lower zones
* must go *over* high_wmark_pages(zone) to satisfy the `incremental min'
* zone defense algorithm.
*
* If a zone is deemed to be full of pinned pages then just give it a light
* scan then give up on it.
*
* This function returns true if a zone is being reclaimed for a costly
* high-order allocation and compaction is ready to begin. This indicates to
* the caller that it should consider retrying the allocation instead of
* further reclaim.
*/
static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
struct zoneref *z;
unsigned long nr_soft_reclaimed;
unsigned long nr_soft_scanned;

Mel Gorman
committed
bool aborted_reclaim = false;

KAMEZAWA Hiroyuki
committed

Mel Gorman
committed
/*
* If the number of buffer_heads in the machine exceeds the maximum
* allowed level, force direct reclaim to scan the highmem zone as
* highmem pages could be pinning lowmem pages storing buffer_heads
*/
if (buffer_heads_over_limit)
sc->gfp_mask |= __GFP_HIGHMEM;
for_each_zone_zonelist_nodemask(zone, z, zonelist,
gfp_zone(sc->gfp_mask), sc->nodemask) {

KAMEZAWA Hiroyuki
committed
/*
* Take care memory controller reclaiming has small influence
* to global LRU.
*/
if (global_reclaim(sc)) {

KAMEZAWA Hiroyuki
committed
if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
continue;
if (zone->all_unreclaimable &&
sc->priority != DEF_PRIORITY)

KAMEZAWA Hiroyuki
committed
continue; /* Let kswapd poll it */
if (IS_ENABLED(CONFIG_COMPACTION)) {
* If we already have plenty of memory free for
* compaction in this zone, don't free any more.
* Even though compaction is invoked for any
* non-zero order, only frequent costly order
* reclamation is disruptive enough to become a
* noticeable problem, like transparent huge
* page allocations.

Mel Gorman
committed
if (compaction_ready(zone, sc)) {

Mel Gorman
committed
aborted_reclaim = true;
/*
* This steals pages from memory cgroups over softlimit
* and returns the number of reclaimed pages and
* scanned pages. This works for global memory pressure
* and balancing, not for a memcg's limit.
*/
nr_soft_scanned = 0;
nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone,
sc->order, sc->gfp_mask,
&nr_soft_scanned);
sc->nr_reclaimed += nr_soft_reclaimed;
sc->nr_scanned += nr_soft_scanned;
/* need some check for avoid more shrink_zone() */

KAMEZAWA Hiroyuki
committed
}
shrink_zone(zone, sc);

Mel Gorman
committed
return aborted_reclaim;
}
/*
 * A zone counts as reclaimable while six times its reclaimable page count
 * still exceeds the number of pages already scanned in it.
 */
static bool zone_reclaimable(struct zone *zone)
{
	return zone_reclaimable_pages(zone) * 6 > zone->pages_scanned;
}
/*
 * All zones in zonelist are unreclaimable?
 *
 * Walks the zones selected by sc->gfp_mask and sc->nodemask, ignoring zones
 * that are unpopulated or not allowed by the cpuset hardwall check. Returns
 * false as soon as one remaining zone is not flagged all_unreclaimable, and
 * true if no such zone is found.
 */
static bool all_unreclaimable(struct zonelist *zonelist,
		struct scan_control *sc)
{
	struct zoneref *z;
	struct zone *zone;

	for_each_zone_zonelist_nodemask(zone, z, zonelist,
			gfp_zone(sc->gfp_mask), sc->nodemask) {
		if (!populated_zone(zone))
			continue;
		if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
			continue;
		if (!zone->all_unreclaimable)
			return false;
	}

	return true;
}
/*
* This is the main entry point to direct page reclaim.
*
* If a full scan of the inactive list fails to free enough memory then we
* are "out of memory" and something needs to be killed.
*
* If the caller is !__GFP_FS then the probability of a failure is reasonably
* high - the zone may be full of dirty or under-writeback pages, which this
* caller can't do much about. We kick the writeback threads and take explicit
* naps in the hope that some of these pages can be written. But if the
* allocating task holds filesystem locks which prevent writeout this might not
* work, and the allocation attempt will fail.
*
* returns: 0, if no pages reclaimed
* else, the number of pages reclaimed
*/
static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
struct scan_control *sc,
struct shrink_control *shrink)
struct reclaim_state *reclaim_state = current->reclaim_state;
struct zoneref *z;
unsigned long writeback_threshold;

Mel Gorman
committed
bool aborted_reclaim;
delayacct_freepages_start();
if (global_reclaim(sc))

KAMEZAWA Hiroyuki
committed
count_vm_event(ALLOCSTALL);
vmpressure_prio(sc->gfp_mask, sc->target_mem_cgroup,
sc->priority);
aborted_reclaim = shrink_zones(zonelist, sc);
/*
* Don't shrink slabs when reclaiming memory from
* over limit cgroups
*/
if (global_reclaim(sc)) {
unsigned long lru_pages = 0;
for_each_zone_zonelist(zone, z, zonelist,
gfp_zone(sc->gfp_mask)) {
if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
continue;
lru_pages += zone_reclaimable_pages(zone);
}
shrink_slab(shrink, sc->nr_scanned, lru_pages);

KAMEZAWA Hiroyuki
committed
if (reclaim_state) {
sc->nr_reclaimed += reclaim_state->reclaimed_slab;

KAMEZAWA Hiroyuki
committed
reclaim_state->reclaimed_slab = 0;
}
total_scanned += sc->nr_scanned;

KOSAKI Motohiro
committed
if (sc->nr_reclaimed >= sc->nr_to_reclaim)
/*
* If we're getting trouble reclaiming, start doing
* writepage even in laptop mode.
*/
if (sc->priority < DEF_PRIORITY - 2)
sc->may_writepage = 1;
/*
* Try to write back as many pages as we just scanned. This
* tends to cause slow streaming writers to write data to the
* disk smoothly, at the dirtying rate, which is nice. But
* that's undesirable in laptop mode, where we *want* lumpy
* writeout. So in laptop mode, write out the whole world.
*/
writeback_threshold = sc->nr_to_reclaim + sc->nr_to_reclaim / 2;
if (total_scanned > writeback_threshold) {
wakeup_flusher_threads(laptop_mode ? 0 : total_scanned,
WB_REASON_TRY_TO_FREE_PAGES);
sc->may_writepage = 1;
}
/* Take a nap, wait for some writeback to complete */
if (!sc->hibernation_mode && sc->nr_scanned &&
sc->priority < DEF_PRIORITY - 2) {

Mel Gorman
committed
struct zone *preferred_zone;
first_zones_zonelist(zonelist, gfp_zone(sc->gfp_mask),
&cpuset_current_mems_allowed,
&preferred_zone);

Mel Gorman
committed
wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/10);
}
} while (--sc->priority >= 0);

KOSAKI Motohiro
committed
delayacct_freepages_end();

KOSAKI Motohiro
committed
if (sc->nr_reclaimed)
return sc->nr_reclaimed;
/*
* As hibernation is going on, kswapd is frozen so that it can't mark
* the zone into all_unreclaimable. Thus bypassing all_unreclaimable
* check.
*/
if (oom_killer_disabled)
return 0;

Mel Gorman
committed
/* Aborted reclaim to try compaction? don't OOM, then */
if (aborted_reclaim)
return 1;

KOSAKI Motohiro
committed
/* top priority shrink_zones still had more to do? don't OOM, then */
if (global_reclaim(sc) && !all_unreclaimable(zonelist, sc))

KOSAKI Motohiro
committed
return 1;
return 0;
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
/*
 * Check whether the node still has enough free pages relative to its
 * min-watermark reserve.
 *
 * Sums min_wmark_pages() and NR_FREE_PAGES over the node's zones up to and
 * including ZONE_NORMAL and reports true when the free pages exceed half of
 * the summed reserve. When the check fails and a waiter is queued on
 * kswapd_wait, kswapd is woken with classzone_idx capped at ZONE_NORMAL.
 */
static bool pfmemalloc_watermark_ok(pg_data_t *pgdat)
{
	struct zone *zone;
	unsigned long pfmemalloc_reserve = 0;
	unsigned long free_pages = 0;
	int i;
	bool wmark_ok;

	for (i = 0; i <= ZONE_NORMAL; i++) {
		zone = &pgdat->node_zones[i];
		pfmemalloc_reserve += min_wmark_pages(zone);
		free_pages += zone_page_state(zone, NR_FREE_PAGES);
	}

	/*
	 * A summed reserve of zero means there is nothing to protect. Without
	 * this guard, a node that also reports zero free pages would fail the
	 * "0 > 0" comparison below on every call and keep callers throttled.
	 */
	if (!pfmemalloc_reserve)
		return true;

	wmark_ok = free_pages > pfmemalloc_reserve / 2;

	/* kswapd must be awake if processes are being throttled */
	if (!wmark_ok && waitqueue_active(&pgdat->kswapd_wait)) {
		pgdat->classzone_idx = min(pgdat->classzone_idx,
						(enum zone_type)ZONE_NORMAL);
		wake_up_interruptible(&pgdat->kswapd_wait);
	}

	return wmark_ok;
}
/*
 * Throttle direct reclaimers if backing storage is backed by the network
 * and the PFMEMALLOC reserve for the preferred node is getting dangerously
 * depleted. kswapd will continue to make progress and wake the processes
 * when the low watermark is reached.
 *
 * @nodemask is accepted but not consulted here: the preferred zone is looked
 * up with a NULL nodemask.
 *
 * Returns true if a fatal signal was delivered during throttling. If this
 * happens, the page allocator should not consider triggering the OOM killer.
 */
static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist,
					nodemask_t *nodemask)
{
	int high_zoneidx = gfp_zone(gfp_mask);
	struct zone *zone;
	pg_data_t *pgdat;

	/*
	 * Kernel threads should not be throttled as they may be indirectly
	 * responsible for cleaning pages necessary for reclaim to make forward
	 * progress. kjournald for example may enter direct reclaim while
	 * committing a transaction, where throttling it could force other
	 * processes to block on log_wait_commit().
	 */
	if (current->flags & PF_KTHREAD)
		return false;

	/*
	 * A task with a fatal signal pending should not throttle; it should
	 * return quickly so it can exit and free its memory.
	 */
	if (fatal_signal_pending(current))
		return false;

	/* Nothing to do while the pfmemalloc reserves are still ok */
	first_zones_zonelist(zonelist, high_zoneidx, NULL, &zone);
	pgdat = zone->zone_pgdat;
	if (pfmemalloc_watermark_ok(pgdat))
		return false;

	/* Account for the throttling */
	count_vm_event(PGSCAN_DIRECT_THROTTLE);

	if (gfp_mask & __GFP_FS) {
		/* Throttle until kswapd wakes the process */
		wait_event_killable(pgdat->pfmemalloc_wait,
			pfmemalloc_watermark_ok(pgdat));
	} else {
		/*
		 * If the caller cannot enter the filesystem, it's possible
		 * that it is due to the caller holding an FS lock or
		 * performing a journal transaction in the case of a filesystem
		 * like ext[3|4]. In this case, it is not safe to block on
		 * pfmemalloc_wait as kswapd could be blocked waiting on the
		 * same lock. Instead, throttle for up to a second before
		 * continuing.
		 */
		wait_event_interruptible_timeout(pgdat->pfmemalloc_wait,
			pfmemalloc_watermark_ok(pgdat), HZ);
	}

	/* Tell the caller whether the wait ended with a fatal signal pending */
	return fatal_signal_pending(current);
}
unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
gfp_t gfp_mask, nodemask_t *nodemask)

Mel Gorman
committed
unsigned long nr_reclaimed;
struct scan_control sc = {
.gfp_mask = (gfp_mask = memalloc_noio_flags(gfp_mask)),
.may_writepage = !laptop_mode,
.nr_to_reclaim = SWAP_CLUSTER_MAX,
.priority = DEF_PRIORITY,

Johannes Weiner
committed
.target_mem_cgroup = NULL,
struct shrink_control shrink = {
.gfp_mask = sc.gfp_mask,
};
/*
* Do not enter reclaim if fatal signal was delivered while throttled.
* 1 is returned so that the page allocator does not OOM kill at this
* point.
*/
if (throttle_direct_reclaim(gfp_mask, zonelist, nodemask))
return 1;

Mel Gorman
committed
trace_mm_vmscan_direct_reclaim_begin(order,
sc.may_writepage,
gfp_mask);
nr_reclaimed = do_try_to_free_pages(zonelist, &sc, &shrink);

Mel Gorman
committed
trace_mm_vmscan_direct_reclaim_end(nr_reclaimed);
return nr_reclaimed;
/*
 * Shrink the lruvec that @memcg has in @zone with a single shrink_lruvec()
 * call at priority 0.
 *
 * Swapping is allowed unless @noswap is set. The number of pages scanned is
 * stored in *@nr_scanned and the number of pages reclaimed is returned.
 */
unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg,
						gfp_t gfp_mask, bool noswap,
						struct zone *zone,
						unsigned long *nr_scanned)
{
	struct scan_control sc = {
		.nr_scanned = 0,
		.nr_to_reclaim = SWAP_CLUSTER_MAX,
		.may_writepage = !laptop_mode,
		.may_unmap = 1,
		.may_swap = !noswap,
		.order = 0,
		.priority = 0,
		.target_mem_cgroup = memcg,
	};
	struct lruvec *lruvec = mem_cgroup_zone_lruvec(zone, memcg);

	/* Keep the caller's reclaim bits; take the rest from GFP_HIGHUSER_MOVABLE. */
	sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
			(GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK);

	trace_mm_vmscan_memcg_softlimit_reclaim_begin(sc.order,
						      sc.may_writepage,
						      sc.gfp_mask);

	/*
	 * NOTE: Although we can get the priority field, using it
	 * here is not a good idea, since it limits the pages we can scan.
	 * If we don't reclaim here, the shrink_zone from balance_pgdat
	 * will pick up pages from other mem cgroups as well. We hack
	 * the priority and make it zero.
	 */
	shrink_lruvec(lruvec, &sc);

	trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed);

	*nr_scanned = sc.nr_scanned;
	return sc.nr_reclaimed;
}
unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
struct zonelist *zonelist;
unsigned long nr_reclaimed;
struct scan_control sc = {
.may_writepage = !laptop_mode,
.nr_to_reclaim = SWAP_CLUSTER_MAX,
.priority = DEF_PRIORITY,
.target_mem_cgroup = memcg,
.nodemask = NULL, /* we don't care the placement */
.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
(GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK),
};
struct shrink_control shrink = {
.gfp_mask = sc.gfp_mask,
/*
* Unlike direct reclaim via alloc_pages(), memcg's reclaim doesn't
* take care of from where we get pages. So the node where we start the
* scan does not need to be the current node.
*/
nid = mem_cgroup_select_victim_node(memcg);
zonelist = NODE_DATA(nid)->node_zonelists;
trace_mm_vmscan_memcg_reclaim_begin(0,
sc.may_writepage,
sc.gfp_mask);
nr_reclaimed = do_try_to_free_pages(zonelist, &sc, &shrink);
trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed);
return nr_reclaimed;
static void age_active_anon(struct zone *zone, struct scan_control *sc)

Johannes Weiner
committed
{
struct mem_cgroup *memcg;

Johannes Weiner
committed
if (!total_swap_pages)
return;
memcg = mem_cgroup_iter(NULL, NULL, NULL);
do {
struct lruvec *lruvec = mem_cgroup_zone_lruvec(zone, memcg);
if (inactive_anon_is_low(lruvec))
shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
sc, LRU_ACTIVE_ANON);
memcg = mem_cgroup_iter(NULL, memcg, NULL);
} while (memcg);

Johannes Weiner
committed
}
/*
 * Decide whether @zone is balanced for an allocation of @order.
 *
 * The zone must pass the safe watermark check against its high watermark
 * plus @balance_gap. For non-zero orders on CONFIG_COMPACTION builds,
 * compaction_suitable() must additionally return a non-zero result.
 */
static bool zone_balanced(struct zone *zone, int order,
			  unsigned long balance_gap, int classzone_idx)
{
	bool wmark_met = zone_watermark_ok_safe(zone, order,
					high_wmark_pages(zone) + balance_gap,
					classzone_idx, 0);

	if (!wmark_met)
		return false;

	/* The compaction check only applies to high-order requests. */
	if (!IS_ENABLED(CONFIG_COMPACTION) || !order)
		return true;

	return compaction_suitable(zone, order) ? true : false;
}

Mel Gorman
committed
/*
 * pgdat_balanced() is used when checking if a node is balanced.
 *
 * For order-0, all zones must be balanced!
 *
 * For high-order allocations only zones that meet watermarks and are in a
 * zone allowed by the callers classzone_idx are added to balanced_pages. The
 * total of balanced pages must be at least 25% of the zones allowed by
 * classzone_idx for the node to be considered balanced. Forcing all zones to
 * be balanced for high orders can cause excessive reclaim when there are
 * imbalanced zones.
 *
 * The choice of 25% is due to
 *   o a 16M DMA zone that is balanced will not balance a zone on any
 *     reasonable sized machine
 *   o On all other machines, the top zone must be at least a reasonable
 *     percentage of the middle zones. For example, on 32-bit x86, highmem
 *     would need to be at least 256M for it to balance a whole node.
 *     Similarly, on x86-64 the Normal zone would need to be at least 1G
 *     to balance a node on its own. These seemed like reasonable ratios.
 */
static bool pgdat_balanced(pg_data_t *pgdat, int order, int classzone_idx)
{
	unsigned long managed_pages = 0;
	unsigned long balanced_pages = 0;
	int i;

	/* Check the watermark levels */
	for (i = 0; i <= classzone_idx; i++) {
		struct zone *zone = pgdat->node_zones + i;

		if (!populated_zone(zone))
			continue;

		managed_pages += zone->managed_pages;

		/*
		 * A special case here:
		 *
		 * balance_pgdat() skips over all_unreclaimable after
		 * DEF_PRIORITY. Effectively, it considers them balanced so
		 * they must be considered balanced here as well!
		 */
		if (zone->all_unreclaimable) {
			balanced_pages += zone->managed_pages;
			continue;
		}

		if (zone_balanced(zone, order, 0, i))
			balanced_pages += zone->managed_pages;
		else if (!order)
			return false;
	}

	/* High orders only need a quarter of the managed pages balanced. */
	if (order)
		return balanced_pages >= (managed_pages >> 2);
	else
		return true;
}
/*
 * Prepare kswapd for sleeping. This verifies that there are no processes
 * waiting in throttle_direct_reclaim() and that watermarks have been met.
 *
 * Returns true if kswapd is ready to sleep
 */
static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining,
					int classzone_idx)
{
	/* If a direct reclaimer woke kswapd within HZ/10, it's premature */
	if (remaining)
		return false;

	/*
	 * There is a potential race between when kswapd checks its watermarks
	 * and a process gets throttled. There is also a potential race if
	 * processes get throttled, kswapd wakes, a large process exits thereby
	 * balancing the zones that causes kswapd to miss a wakeup. If kswapd
	 * is going to sleep, no process should be sleeping on pfmemalloc_wait
	 * so wake them now if necessary. If necessary, processes will wake
	 * kswapd and get throttled again
	 */
	if (waitqueue_active(&pgdat->pfmemalloc_wait)) {
		wake_up(&pgdat->pfmemalloc_wait);
		return false;
	}

	return pgdat_balanced(pgdat, order, classzone_idx);
}
/*
* kswapd shrinks the zone by the number of pages required to reach
* the high watermark.
*
* Returns true if kswapd scanned at least the requested number of pages to
* reclaim. This is used to determine if the scanning priority needs to be
* raised.
static bool kswapd_shrink_zone(struct zone *zone,
struct scan_control *sc,
unsigned long lru_pages,
unsigned long *nr_attempted)
{
unsigned long nr_slab;
struct reclaim_state *reclaim_state = current->reclaim_state;
struct shrink_control shrink = {
.gfp_mask = sc->gfp_mask,
};
/* Reclaim above the high watermark. */
sc->nr_to_reclaim = max(SWAP_CLUSTER_MAX, high_wmark_pages(zone));
shrink_zone(zone, sc);
reclaim_state->reclaimed_slab = 0;
nr_slab = shrink_slab(&shrink, sc->nr_scanned, lru_pages);
sc->nr_reclaimed += reclaim_state->reclaimed_slab;
/* Account for the number of pages attempted to reclaim */
*nr_attempted += sc->nr_to_reclaim;
if (nr_slab == 0 && !zone_reclaimable(zone))
zone->all_unreclaimable = 1;
return sc->nr_scanned >= sc->nr_to_reclaim;
}
/*
* For kswapd, balance_pgdat() will work across all this node's zones until
* they are all at high_wmark_pages(zone).
*
* Returns the final order kswapd was reclaiming at
*
* There is special handling here for zones which are full of pinned pages.
* This can happen if the pages are all mlocked, or if they are all used by
* device drivers (say, ZONE_DMA). Or if they are all in use by hugetlb.
* What we do is to detect the case where all pages in the zone have been
* scanned twice and there has been zero successful reclaim. Mark the zone as
* dead and from now on, only perform a short scan. Basically we're polling
* the zone for when the problem goes away.
*
* kswapd scans the zones in the highmem->normal->dma direction. It skips
* zones which have free_pages > high_wmark_pages(zone), but once a zone is
* found to have free_pages <= high_wmark_pages(zone), we scan that zone and the
* lower zones regardless of the number of free pages in the lower zones. This
* interoperates with the page allocator fallback scheme to ensure that aging
* of pages is balanced across the zones.
*/
static unsigned long balance_pgdat(pg_data_t *pgdat, int order,

Mel Gorman
committed
int *classzone_idx)
int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */
unsigned long nr_soft_reclaimed;
unsigned long nr_soft_scanned;
struct scan_control sc = {
.gfp_mask = GFP_KERNEL,

Johannes Weiner
committed
.target_mem_cgroup = NULL,
unsigned long nr_attempted = 0;
bool pgdat_needs_compaction = (order > 0);
/*
* Scan in the highmem->dma direction for the highest
* zone which needs scanning
*/
for (i = pgdat->nr_zones - 1; i >= 0; i--) {
struct zone *zone = pgdat->node_zones + i;
if (!populated_zone(zone))
continue;
if (zone->all_unreclaimable &&
sc.priority != DEF_PRIORITY)
/*
* Do some background aging of the anon list, to give
* pages a chance to be referenced before reclaiming.
*/
age_active_anon(zone, &sc);

Mel Gorman
committed
/*
* If the number of buffer_heads in the machine
* exceeds the maximum allowed level and this node
* has a highmem zone, force kswapd to reclaim from
* it to relieve lowmem pressure.
*/
if (buffer_heads_over_limit && is_highmem_idx(i)) {
end_zone = i;
break;
}
if (!zone_balanced(zone, order, 0, 0)) {

Mel Gorman
committed
/*
* If balanced, clear the dirty and congested
* flags
*/
zone_clear_flag(zone, ZONE_CONGESTED);

Mel Gorman
committed
zone_clear_flag(zone, ZONE_TAIL_LRU_DIRTY);
for (i = 0; i <= end_zone; i++) {
struct zone *zone = pgdat->node_zones + i;
if (!populated_zone(zone))
continue;
lru_pages += zone_reclaimable_pages(zone);
/*
* If any zone is currently balanced then kswapd will
* not call compaction as it is expected that the
* necessary pages are already available.
*/
if (pgdat_needs_compaction &&
zone_watermark_ok(zone, order,
low_wmark_pages(zone),
*classzone_idx, 0))
pgdat_needs_compaction = false;
}
/*
* Now scan the zone in the dma->highmem direction, stopping
* at the last zone which needs scanning.
*
* We do this because the page allocator works in the opposite
* direction. This prevents the page allocator from allocating
* pages behind kswapd's direction of progress, which would
* cause too much scanning of the lower zones.
*/
for (i = 0; i <= end_zone; i++) {
struct zone *zone = pgdat->node_zones + i;
int testorder;

Mel Gorman
committed
unsigned long balance_gap;
if (zone->all_unreclaimable &&
sc.priority != DEF_PRIORITY)
nr_soft_scanned = 0;
/*
* Call soft limit reclaim before calling shrink_zone.
*/
nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone,
order, sc.gfp_mask,
&nr_soft_scanned);
sc.nr_reclaimed += nr_soft_reclaimed;

Mel Gorman
committed
* We put equal pressure on every zone, unless
* one zone has way too many pages free
* already. The "too many pages" is defined
* as the high wmark plus a "gap" where the
* gap is either the low watermark or 1%
* of the zone, whichever is smaller.

Mel Gorman
committed
balance_gap = min(low_wmark_pages(zone),
(zone->managed_pages +

Mel Gorman
committed
KSWAPD_ZONE_BALANCE_GAP_RATIO-1) /
KSWAPD_ZONE_BALANCE_GAP_RATIO);
/*
* Kswapd reclaims only single pages with compaction
* enabled. Trying too hard to reclaim until contiguous
* free pages have become available can hurt performance
* by evicting too much useful data from memory.
* Do not reclaim more than needed for compaction.
*/
testorder = order;
if (IS_ENABLED(CONFIG_COMPACTION) && order &&
compaction_suitable(zone, order) !=
COMPACT_SKIPPED)
testorder = 0;

Mel Gorman
committed
if ((buffer_heads_over_limit && is_highmem_idx(i)) ||
!zone_balanced(zone, testorder,
balance_gap, end_zone)) {
/*
* There should be no need to raise the
* scanning priority if enough pages are
* already being scanned that high
* watermark would be met at 100% efficiency.
*/
if (kswapd_shrink_zone(zone, &sc, lru_pages,
&nr_attempted))

Mel Gorman
committed
* If we're getting trouble reclaiming, start doing
* writepage even in laptop mode.
if (sc.priority < DEF_PRIORITY - 2)

KOSAKI Motohiro
committed

Mel Gorman
committed
if (zone->all_unreclaimable) {
if (end_zone && end_zone == i)
end_zone--;

Mel Gorman
committed
continue;

Mel Gorman
committed
}

Mel Gorman
committed
if (zone_balanced(zone, testorder, 0, end_zone))

Mel Gorman
committed
/*
* If a zone reaches its high watermark,
* consider it to be no longer congested. It's
* possible there are dirty pages backed by
* congested BDIs but as pressure is relieved,
* speculatively avoid congestion waits

Mel Gorman
committed
* or writing pages from kswapd context.

Mel Gorman
committed
*/
zone_clear_flag(zone, ZONE_CONGESTED);

Mel Gorman
committed
zone_clear_flag(zone, ZONE_TAIL_LRU_DIRTY);
/*
* If the low watermark is met there is no need for processes
* to be throttled on pfmemalloc_wait as they should now be
* able to safely make forward progress. Wake them
*/
if (waitqueue_active(&pgdat->pfmemalloc_wait) &&
pfmemalloc_watermark_ok(pgdat))
wake_up(&pgdat->pfmemalloc_wait);
* Fragmentation may mean that the system cannot be rebalanced
* for high-order allocations in all zones. If twice the
* allocation size has been reclaimed and the zones are still
* not balanced then recheck the watermarks at order-0 to
* prevent kswapd reclaiming excessively. Assume that a
* process requested a high-order can direct reclaim/compact.
if (order && sc.nr_reclaimed >= 2UL << order)
order = sc.order = 0;
/* Check if kswapd should be suspending */
if (try_to_freeze() || kthread_should_stop())
break;
/*
* Compact if necessary and kswapd is reclaiming at least the
* high watermark number of pages as requested
*/
if (pgdat_needs_compaction && sc.nr_reclaimed > nr_attempted)
compact_pgdat(pgdat, order);
* Raise priority if scanning rate is too low or there was no
* progress in reclaiming pages
if (raise_priority || !sc.nr_reclaimed)
sc.priority--;
} while (sc.priority >= 1 &&
!pgdat_balanced(pgdat, order, *classzone_idx));

Mel Gorman
committed
/*
* Return the order we were reclaiming at so prepare_kswapd_sleep()

Mel Gorman
committed
* makes a decision on the order we were last reclaiming at. However,
* if another caller entered the allocator slow path while kswapd
* was awake, order will remain at the higher level
*/

Mel Gorman
committed
*classzone_idx = end_zone;

Mel Gorman
committed
return order;

Mel Gorman
committed
static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx)
{
long remaining = 0;
DEFINE_WAIT(wait);
if (freezing(current) || kthread_should_stop())
return;
prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
/* Try to sleep for a short interval */
if (prepare_kswapd_sleep(pgdat, order, remaining, classzone_idx)) {
remaining = schedule_timeout(HZ/10);
finish_wait(&pgdat->kswapd_wait, &wait);
prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
}
/*
* After a short sleep, check if it was a premature sleep. If not, then
* go fully to sleep until explicitly woken up.
*/
if (prepare_kswapd_sleep(pgdat, order, remaining, classzone_idx)) {
trace_mm_vmscan_kswapd_sleep(pgdat->node_id);
/*
* vmstat counters are not perfectly accurate and the estimated
* value for counters such as NR_FREE_PAGES can deviate from the
* true value by nr_online_cpus * threshold. To avoid the zone
* watermarks being breached while under pressure, we reduce the
* per-cpu vmstat threshold while kswapd is awake and restore
* them before going back to sleep.
*/
set_pgdat_percpu_threshold(pgdat, calculate_normal_threshold);
/*
* Compaction records what page blocks it recently failed to