/*
* This is the direct reclaim path, for page-allocating processes. We only
* try to reclaim pages from zones which will satisfy the caller's allocation
* request.
*
* We reclaim from a zone even if that zone is over high_wmark_pages(zone).
* Because:
* a) The caller may be trying to free *extra* pages to satisfy a higher-order
* allocation or
* b) The target zone may be at high_wmark_pages(zone) but the lower zones
* must go *over* high_wmark_pages(zone) to satisfy the `incremental min'
* zone defense algorithm.
*
* If a zone is deemed to be full of pinned pages then just give it a light
* scan then give up on it.
*
* This function returns true if a zone is being reclaimed for a costly
* high-order allocation and compaction is ready to begin. This indicates to
* the caller that it should consider retrying the allocation instead of
* further reclaim.
*/
static bool shrink_zones(int priority, struct zonelist *zonelist,
struct scan_control *sc)
{
struct zoneref *z;
struct zone *zone;
unsigned long nr_soft_reclaimed;
unsigned long nr_soft_scanned;

bool aborted_reclaim = false;

/*
* If the number of buffer_heads in the machine exceeds the maximum
* allowed level, force direct reclaim to scan the highmem zone as
* highmem pages could be pinning lowmem pages storing buffer_heads
*/
if (buffer_heads_over_limit)
sc->gfp_mask |= __GFP_HIGHMEM;
for_each_zone_zonelist_nodemask(zone, z, zonelist,
gfp_zone(sc->gfp_mask), sc->nodemask) {

/*
* Take care that memory controller reclaiming has only a small
* influence on the global LRU.
*/
if (global_reclaim(sc)) {

if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
continue;
if (zone->all_unreclaimable && priority != DEF_PRIORITY)

continue; /* Let kswapd poll it */
if (COMPACTION_BUILD) {
/*
* If we already have plenty of memory free for
* compaction in this zone, don't free any more.
* Even though compaction is invoked for any
* non-zero order, only frequent costly order
* reclamation is disruptive enough to become a
* noticeable problem, like transparent huge
* page allocations.
*/
if (compaction_ready(zone, sc)) {

aborted_reclaim = true;
continue;
}
}
/*
* This steals pages from memory cgroups over softlimit
* and returns the number of reclaimed pages and
* scanned pages. This works for global memory pressure
* and balancing, not for a memcg's limit.
*/
nr_soft_scanned = 0;
nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone,
sc->order, sc->gfp_mask,
&nr_soft_scanned);
sc->nr_reclaimed += nr_soft_reclaimed;
sc->nr_scanned += nr_soft_scanned;
/* need some check to avoid more shrink_zone() calls */

}
shrink_zone(priority, zone, sc);
}

return aborted_reclaim;
}
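/*
* As a rough rule of thumb, a zone is treated as reclaimable below as long
* as the pages scanned so far stay under six times its reclaimable pages;
* past that point further scanning is assumed to be unproductive.
*/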
static bool zone_reclaimable(struct zone *zone)
{
return zone->pages_scanned < zone_reclaimable_pages(zone) * 6;
}
/* All zones in zonelist are unreclaimable? */
static bool all_unreclaimable(struct zonelist *zonelist,
struct scan_control *sc)
{
struct zoneref *z;
struct zone *zone;
for_each_zone_zonelist_nodemask(zone, z, zonelist,
gfp_zone(sc->gfp_mask), sc->nodemask) {
if (!populated_zone(zone))
continue;
if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
continue;
if (!zone->all_unreclaimable)
return false;
}
return true;
}
/*
* This is the main entry point to direct page reclaim.
*
* If a full scan of the inactive list fails to free enough memory then we
* are "out of memory" and something needs to be killed.
*
* If the caller is !__GFP_FS then the probability of a failure is reasonably
* high - the zone may be full of dirty or under-writeback pages, which this
* caller can't do much about. We kick the writeback threads and take explicit
* naps in the hope that some of these pages can be written. But if the
* allocating task holds filesystem locks which prevent writeout this might not
* work, and the allocation attempt will fail.
*
* returns: 0, if no pages reclaimed
* else, the number of pages reclaimed
*/
static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
struct scan_control *sc,
struct shrink_control *shrink)
{
int priority;
unsigned long total_scanned = 0;
struct reclaim_state *reclaim_state = current->reclaim_state;
struct zoneref *z;
struct zone *zone;
unsigned long writeback_threshold;

bool aborted_reclaim;
delayacct_freepages_start();
if (global_reclaim(sc))

count_vm_event(ALLOCSTALL);
for (priority = DEF_PRIORITY; priority >= 0; priority--) {

aborted_reclaim = shrink_zones(priority, zonelist, sc);
/*
* Don't shrink slabs when reclaiming memory from
* over limit cgroups
*/
if (global_reclaim(sc)) {
unsigned long lru_pages = 0;
for_each_zone_zonelist(zone, z, zonelist,
gfp_zone(sc->gfp_mask)) {
if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
continue;
lru_pages += zone_reclaimable_pages(zone);
}
shrink_slab(shrink, sc->nr_scanned, lru_pages);

if (reclaim_state) {
sc->nr_reclaimed += reclaim_state->reclaimed_slab;

reclaim_state->reclaimed_slab = 0;
}
}
total_scanned += sc->nr_scanned;

if (sc->nr_reclaimed >= sc->nr_to_reclaim)
goto out;
/*
* Try to write back as many pages as we just scanned. This
* tends to cause slow streaming writers to write data to the
* disk smoothly, at the dirtying rate, which is nice. But
* that's undesirable in laptop mode, where we *want* lumpy
* writeout. So in laptop mode, write out the whole world.
*/
writeback_threshold = sc->nr_to_reclaim + sc->nr_to_reclaim / 2;
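/*
* As an illustration, in the common direct reclaim case nr_to_reclaim is
* SWAP_CLUSTER_MAX (32), so the threshold above works out to 48 pages
* scanned before the flusher threads are woken.
*/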
if (total_scanned > writeback_threshold) {
wakeup_flusher_threads(laptop_mode ? 0 : total_scanned,
WB_REASON_TRY_TO_FREE_PAGES);
sc->may_writepage = 1;
}
/* Take a nap, wait for some writeback to complete */
if (!sc->hibernation_mode && sc->nr_scanned &&

priority < DEF_PRIORITY - 2) {
struct zone *preferred_zone;
first_zones_zonelist(zonelist, gfp_zone(sc->gfp_mask),
&cpuset_current_mems_allowed,
&preferred_zone);

wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/10);
}
}

out:

delayacct_freepages_end();

if (sc->nr_reclaimed)
return sc->nr_reclaimed;
/*
* As hibernation is going on, kswapd is frozen so that it can't mark
* the zone as all_unreclaimable. Thus we bypass the all_unreclaimable
* check.
*/
if (oom_killer_disabled)
return 0;

/* Aborted reclaim to try compaction? don't OOM, then */
if (aborted_reclaim)
return 1;

/* top priority shrink_zones still had more to do? don't OOM, then */
if (global_reclaim(sc) && !all_unreclaimable(zonelist, sc))

return 1;
return 0;
}
unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
gfp_t gfp_mask, nodemask_t *nodemask)
{

unsigned long nr_reclaimed;
struct scan_control sc = {
.gfp_mask = gfp_mask,
.may_writepage = !laptop_mode,
.nr_to_reclaim = SWAP_CLUSTER_MAX,

.target_mem_cgroup = NULL,
};
struct shrink_control shrink = {
.gfp_mask = sc.gfp_mask,
};

trace_mm_vmscan_direct_reclaim_begin(order,
sc.may_writepage,
gfp_mask);
nr_reclaimed = do_try_to_free_pages(zonelist, &sc, &shrink);

trace_mm_vmscan_direct_reclaim_end(nr_reclaimed);
return nr_reclaimed;
}
#ifdef CONFIG_CGROUP_MEM_RES_CTLR
unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg,
gfp_t gfp_mask, bool noswap,
struct zone *zone,
unsigned long *nr_scanned)
{
struct scan_control sc = {
.nr_scanned = 0,
.nr_to_reclaim = SWAP_CLUSTER_MAX,
.may_writepage = !laptop_mode,
.may_unmap = 1,
.may_swap = !noswap,
.order = 0,
.target_mem_cgroup = memcg,
};
struct mem_cgroup_zone mz = {
.mem_cgroup = memcg,
.zone = zone,
};
sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
(GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK);
trace_mm_vmscan_memcg_softlimit_reclaim_begin(0,
sc.may_writepage,
sc.gfp_mask);
/*
* NOTE: Although we can get the priority field, using it
* here is not a good idea, since it limits the pages we can scan.
* If we don't reclaim here, the shrink_zone from balance_pgdat
* will pick up pages from other mem cgroups as well. We hack
* the priority and make it zero.
*/
shrink_mem_cgroup_zone(0, &mz, &sc);
trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed);
*nr_scanned = sc.nr_scanned;
return sc.nr_reclaimed;
}
unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
gfp_t gfp_mask,
bool noswap)
{
struct zonelist *zonelist;
int nid;
unsigned long nr_reclaimed;
struct scan_control sc = {
.may_writepage = !laptop_mode,
.nr_to_reclaim = SWAP_CLUSTER_MAX,
.target_mem_cgroup = memcg,
.nodemask = NULL, /* we don't care about placement */
.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
(GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK),
};
struct shrink_control shrink = {
.gfp_mask = sc.gfp_mask,
};
/*
* Unlike direct reclaim via alloc_pages(), memcg's reclaim doesn't
* care from which node we get pages. So the node where we start the
* scan does not need to be the current node.
*/
nid = mem_cgroup_select_victim_node(memcg);
zonelist = NODE_DATA(nid)->node_zonelists;
trace_mm_vmscan_memcg_reclaim_begin(0,
sc.may_writepage,
sc.gfp_mask);
nr_reclaimed = do_try_to_free_pages(zonelist, &sc, &shrink);
trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed);
return nr_reclaimed;
}
#endif

static void age_active_anon(struct zone *zone, struct scan_control *sc,
int priority)
{
struct mem_cgroup *memcg;

if (!total_swap_pages)
return;
memcg = mem_cgroup_iter(NULL, NULL, NULL);
do {
struct mem_cgroup_zone mz = {
.mem_cgroup = memcg,
.zone = zone,
};
if (inactive_anon_is_low(&mz))
shrink_active_list(SWAP_CLUSTER_MAX, &mz,
sc, priority, 0);
memcg = mem_cgroup_iter(NULL, memcg, NULL);
} while (memcg);

}

/*
* pgdat_balanced is used when checking if a node is balanced for high-order
* allocations. Only zones that meet watermarks and are in a zone allowed
* by the callers classzone_idx are added to balanced_pages. The total of
* balanced pages must be at least 25% of the zones allowed by classzone_idx
* for the node to be considered balanced. Forcing all zones to be balanced
* for high orders can cause excessive reclaim when there are imbalanced zones.
* The choice of 25% is due to
* o a 16M DMA zone that is balanced will not balance a zone on any
* reasonable sized machine
* o On all other machines, the top zone must be at least a reasonable
* percentage of the middle zones. For example, on 32-bit x86, highmem
* would need to be at least 256M for it to balance a whole node.
* Similarly, on x86-64 the Normal zone would need to be at least 1G
* to balance a node on its own. These seemed like reasonable ratios.
*/
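/*
* Worked example of the 25% rule: with classzone_idx spanning a 16M DMA
* zone and a 784M Normal zone (800M in total), at least 200M worth of
* those zones must be balanced before the node as a whole is treated as
* balanced for a high-order allocation.
*/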
static bool pgdat_balanced(pg_data_t *pgdat, unsigned long balanced_pages,
int classzone_idx)
{
unsigned long present_pages = 0;
int i;
for (i = 0; i <= classzone_idx; i++)
present_pages += pgdat->node_zones[i].present_pages;
/* A special case here: if the zone has no pages, we consider it balanced */
return balanced_pages >= (present_pages >> 2);

}

/* is kswapd sleeping prematurely? */

static bool sleeping_prematurely(pg_data_t *pgdat, int order, long remaining,
int classzone_idx)

{

int i;

unsigned long balanced = 0;
bool all_zones_ok = true;

/* If a direct reclaimer woke kswapd within HZ/10, it's premature */
if (remaining)

return true;

/* Check the watermark levels */
for (i = 0; i <= classzone_idx; i++) {

struct zone *zone = pgdat->node_zones + i;
if (!populated_zone(zone))
continue;

/*
* balance_pgdat() skips over all_unreclaimable after
* DEF_PRIORITY. Effectively, it considers them balanced so
* they must be considered balanced here as well if kswapd
* is to sleep
*/
if (zone->all_unreclaimable) {
balanced += zone->present_pages;

continue;

}

if (!zone_watermark_ok_safe(zone, order, high_wmark_pages(zone),
i, 0))

all_zones_ok = false;
else
balanced += zone->present_pages;

}

/*
* For high-order requests, the balanced zones must contain at least
* 25% of the nodes pages for kswapd to sleep. For order-0, all zones
* must be balanced
*/
if (order)
return !pgdat_balanced(pgdat, balanced, classzone_idx);

else
return !all_zones_ok;

}
/*
* For kswapd, balance_pgdat() will work across all this node's zones until
* they are all at high_wmark_pages(zone).

* Returns the final order kswapd was reclaiming at
*
* There is special handling here for zones which are full of pinned pages.
* This can happen if the pages are all mlocked, or if they are all used by
* device drivers (say, ZONE_DMA). Or if they are all in use by hugetlb.
* What we do is to detect the case where all pages in the zone have been
* scanned twice and there has been zero successful reclaim. Mark the zone as
* dead and from now on, only perform a short scan. Basically we're polling
* the zone for when the problem goes away.
*
* kswapd scans the zones in the highmem->normal->dma direction. It skips
* zones which have free_pages > high_wmark_pages(zone), but once a zone is
* found to have free_pages <= high_wmark_pages(zone), we scan that zone and the
* lower zones regardless of the number of free pages in the lower zones. This
* interoperates with the page allocator fallback scheme to ensure that aging
* of pages is balanced across the zones.
*/
static unsigned long balance_pgdat(pg_data_t *pgdat, int order,

int *classzone_idx)

{
int all_zones_ok;
unsigned long balanced;
int i;
int priority;
unsigned long total_scanned;
int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */
struct reclaim_state *reclaim_state = current->reclaim_state;
unsigned long nr_soft_reclaimed;
unsigned long nr_soft_scanned;
struct scan_control sc = {
.gfp_mask = GFP_KERNEL,
/*
* kswapd doesn't want to be bailed out while reclaiming, because
* we want to put equal scanning pressure on each zone.
*/
.nr_to_reclaim = ULONG_MAX,

.target_mem_cgroup = NULL,
.order = order,
};
struct shrink_control shrink = {
.gfp_mask = sc.gfp_mask,
};
loop_again:
total_scanned = 0;
sc.nr_reclaimed = 0;
for (priority = DEF_PRIORITY; priority >= 0; priority--) {
unsigned long lru_pages = 0;

int has_under_min_watermark_zone = 0;

all_zones_ok = 1;
balanced = 0;
/*
* Scan in the highmem->dma direction for the highest
* zone which needs scanning
*/
for (i = pgdat->nr_zones - 1; i >= 0; i--) {
struct zone *zone = pgdat->node_zones + i;
if (!populated_zone(zone))
continue;
if (zone->all_unreclaimable && priority != DEF_PRIORITY)
continue;
/*
* Do some background aging of the anon list, to give
* pages a chance to be referenced before reclaiming.
*/

age_active_anon(zone, &sc, priority);

/*
* If the number of buffer_heads in the machine
* exceeds the maximum allowed level and this node
* has a highmem zone, force kswapd to reclaim from
* it to relieve lowmem pressure.
*/
if (buffer_heads_over_limit && is_highmem_idx(i)) {
end_zone = i;
break;
}
if (!zone_watermark_ok_safe(zone, order,
high_wmark_pages(zone), 0, 0)) {
end_zone = i;
break;
} else {
/* If balanced, clear the congested flag */
zone_clear_flag(zone, ZONE_CONGESTED);
}
}
for (i = 0; i <= end_zone; i++) {
struct zone *zone = pgdat->node_zones + i;
lru_pages += zone_reclaimable_pages(zone);
}
/*
* Now scan the zone in the dma->highmem direction, stopping
* at the last zone which needs scanning.
*
* We do this because the page allocator works in the opposite
* direction. This prevents the page allocator from allocating
* pages behind kswapd's direction of progress, which would
* cause too much scanning of the lower zones.
*/
for (i = 0; i <= end_zone; i++) {
struct zone *zone = pgdat->node_zones + i;
int nr_slab, testorder;

unsigned long balance_gap;
if (zone->all_unreclaimable && priority != DEF_PRIORITY)
continue;
nr_soft_scanned = 0;
/*
* Call soft limit reclaim before calling shrink_zone.
*/
nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone,
order, sc.gfp_mask,
&nr_soft_scanned);
sc.nr_reclaimed += nr_soft_reclaimed;
total_scanned += nr_soft_scanned;

/*
* We put equal pressure on every zone, unless
* one zone has way too many pages free
* already. The "too many pages" is defined
* as the high wmark plus a "gap" where the
* gap is either the low watermark or 1%
* of the zone, whichever is smaller.
*/
balance_gap = min(low_wmark_pages(zone),
(zone->present_pages +
KSWAPD_ZONE_BALANCE_GAP_RATIO-1) /
KSWAPD_ZONE_BALANCE_GAP_RATIO);
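/*
* As a rough illustration, on a 1G zone (262144 4K pages) whose low
* watermark exceeds 1% of the zone, the gap comes to about 2622 pages
* (~10M), so reclaim continues until free pages exceed the high
* watermark by roughly that amount.
*/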
/*
* Kswapd reclaims only single pages with compaction
* enabled. Trying too hard to reclaim until contiguous
* free pages have become available can hurt performance
* by evicting too much useful data from memory.
* Do not reclaim more than needed for compaction.
*/
testorder = order;
if (COMPACTION_BUILD && order &&
compaction_suitable(zone, order) !=
COMPACT_SKIPPED)
testorder = 0;

if ((buffer_heads_over_limit && is_highmem_idx(i)) ||
!zone_watermark_ok_safe(zone, testorder,

high_wmark_pages(zone) + balance_gap,

end_zone, 0)) {
shrink_zone(priority, zone, &sc);

reclaim_state->reclaimed_slab = 0;
nr_slab = shrink_slab(&shrink, sc.nr_scanned, lru_pages);
sc.nr_reclaimed += reclaim_state->reclaimed_slab;
total_scanned += sc.nr_scanned;
if (nr_slab == 0 && !zone_reclaimable(zone))
zone->all_unreclaimable = 1;
}
/*
* If we've done a decent amount of scanning and
* the reclaim ratio is low, start doing writepage
* even in laptop mode
*/
if (total_scanned > SWAP_CLUSTER_MAX * 2 &&
total_scanned > sc.nr_reclaimed + sc.nr_reclaimed / 2)
sc.may_writepage = 1;

if (zone->all_unreclaimable) {
if (end_zone && end_zone == i)
end_zone--;

continue;

}

if (!zone_watermark_ok_safe(zone, testorder,
high_wmark_pages(zone), end_zone, 0)) {
all_zones_ok = 0;
/*
* We are still under min water mark. This
* means that we have a GFP_ATOMIC allocation
* failure risk. Hurry up!
*/
if (!zone_watermark_ok_safe(zone, order,
min_wmark_pages(zone), end_zone, 0))
has_under_min_watermark_zone = 1;

} else {
/*
* If a zone reaches its high watermark,
* consider it to be no longer congested. It's
* possible there are dirty pages backed by
* congested BDIs but as pressure is relieved,
* speculatively avoid congestion waits
*/
zone_clear_flag(zone, ZONE_CONGESTED);

if (i <= *classzone_idx)

balanced += zone->present_pages;
}
}

if (all_zones_ok || (order && pgdat_balanced(pgdat, balanced, *classzone_idx)))
break; /* kswapd: all done */
/*
* OK, kswapd is getting into trouble. Take a nap, then take
* another pass across the zones.
*/

if (total_scanned && (priority < DEF_PRIORITY - 2)) {
if (has_under_min_watermark_zone)
count_vm_event(KSWAPD_SKIP_CONGESTION_WAIT);
else
congestion_wait(BLK_RW_ASYNC, HZ/10);
}
/*
* We do this so kswapd doesn't build up large priorities for
* example when it is freeing in parallel with allocators. It
* matches the direct reclaim path behaviour in terms of impact
* on zone->*_priority.
*/
if (sc.nr_reclaimed >= SWAP_CLUSTER_MAX)
break;
}
/*
* order-0: All zones must meet high watermark for a balanced node

* high-order: Balanced zones must make up at least 25% of the node
* for the node to be balanced
*/

if (!(all_zones_ok || (order && pgdat_balanced(pgdat, balanced, *classzone_idx)))) {
try_to_freeze();
/*
* Fragmentation may mean that the system cannot be
* rebalanced for high-order allocations in all zones.
* At this point, if nr_reclaimed < SWAP_CLUSTER_MAX,
* it means the zones have been fully scanned and are still
* not balanced. For high-order allocations, there is
* little point trying all over again as kswapd may
* infinite loop.
*
* Instead, recheck all watermarks at order-0 as they
* are the most important. If watermarks are ok, kswapd will go
* back to sleep. High-order users can still perform direct
* reclaim if they wish.
*/
if (sc.nr_reclaimed < SWAP_CLUSTER_MAX)
order = sc.order = 0;

goto loop_again;
}
/*
* If kswapd was reclaiming at a higher order, it has the option of
* sleeping without all zones being balanced. Before it does, it must
* ensure that the watermarks for order-0 on *all* zones are met and
* that the congestion flags are cleared. The congestion flag must
* be cleared as kswapd is the only mechanism that clears the flag
* and it is potentially going to sleep here.
*/
if (order) {
int zones_need_compaction = 1;
for (i = 0; i <= end_zone; i++) {
struct zone *zone = pgdat->node_zones + i;
if (!populated_zone(zone))
continue;
if (zone->all_unreclaimable && priority != DEF_PRIORITY)
continue;
/* Would compaction fail due to lack of free memory? */
if (COMPACTION_BUILD &&
compaction_suitable(zone, order) == COMPACT_SKIPPED)
goto loop_again;
/* Confirm the zone is balanced for order-0 */
if (!zone_watermark_ok(zone, 0,
high_wmark_pages(zone), 0, 0)) {
order = sc.order = 0;
goto loop_again;
}
/* Check if the memory needs to be defragmented. */
if (zone_watermark_ok(zone, order,
low_wmark_pages(zone), *classzone_idx, 0))
zones_need_compaction = 0;
/* If balanced, clear the congested flag */
zone_clear_flag(zone, ZONE_CONGESTED);
}
if (zones_need_compaction)
compact_pgdat(pgdat, order);
}

/*
* Return the order we were reclaiming at so sleeping_prematurely()
* makes a decision on the order we were last reclaiming at. However,
* if another caller entered the allocator slow path while kswapd
* was awake, order will remain at the higher level
*/

*classzone_idx = end_zone;

return order;
}

static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx)
{
long remaining = 0;
DEFINE_WAIT(wait);
if (freezing(current) || kthread_should_stop())
return;
prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
/* Try to sleep for a short interval */

if (!sleeping_prematurely(pgdat, order, remaining, classzone_idx)) {
remaining = schedule_timeout(HZ/10);
finish_wait(&pgdat->kswapd_wait, &wait);
prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
}
/*
* After a short sleep, check if it was a premature sleep. If not, then
* go fully to sleep until explicitly woken up.
*/

if (!sleeping_prematurely(pgdat, order, remaining, classzone_idx)) {
trace_mm_vmscan_kswapd_sleep(pgdat->node_id);
/*
* vmstat counters are not perfectly accurate and the estimated
* value for counters such as NR_FREE_PAGES can deviate from the
* true value by nr_online_cpus * threshold. To avoid the zone
* watermarks being breached while under pressure, we reduce the
* per-cpu vmstat threshold while kswapd is awake and restore
* them before going back to sleep.
*/
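/*
* As an illustration with assumed numbers: 16 online CPUs and a per-cpu
* drift threshold of 125 pages would let the NR_FREE_PAGES estimate be
* off by up to 2000 pages, roughly 8M with 4K pages.
*/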
set_pgdat_percpu_threshold(pgdat, calculate_normal_threshold);
schedule();
set_pgdat_percpu_threshold(pgdat, calculate_pressure_threshold);
} else {
if (remaining)
count_vm_event(KSWAPD_LOW_WMARK_HIT_QUICKLY);
else
count_vm_event(KSWAPD_HIGH_WMARK_HIT_QUICKLY);
}
finish_wait(&pgdat->kswapd_wait, &wait);
}
/*
* The background pageout daemon, started as a kernel thread
*
* This basically trickles out pages so that we have _some_
* free memory available even if there is no other activity
* that frees anything up. This is needed for things like routing
* etc, where we otherwise might have all activity going on in
* asynchronous contexts that cannot page things out.
*
* If there are applications that are active memory-allocators
* (most normal use), this basically shouldn't matter.
*/
static int kswapd(void *p)
{

unsigned long order, new_order;
unsigned balanced_order;

int classzone_idx, new_classzone_idx;
int balanced_classzone_idx;
pg_data_t *pgdat = (pg_data_t*)p;
struct task_struct *tsk = current;
struct reclaim_state reclaim_state = {
.reclaimed_slab = 0,
};
const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
lockdep_set_current_reclaim_state(GFP_KERNEL);
set_cpus_allowed_ptr(tsk, cpumask);
current->reclaim_state = &reclaim_state;
/*
* Tell the memory management that we're a "memory allocator",
* and that if we need more memory we should get access to it
* regardless (see "__alloc_pages()"). "kswapd" should
* never get caught in the normal page freeing logic.
*
* (Kswapd normally doesn't need memory anyway, but sometimes
* you need a small amount of memory in order to be able to
* page out something else, and this flag essentially protects
* us from recursively trying to free more memory as we're
* trying to free the first piece of memory in the first place).
*/
tsk->flags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD;
set_freezable();

order = new_order = 0;
balanced_order = 0;

classzone_idx = new_classzone_idx = pgdat->nr_zones - 1;
balanced_classzone_idx = classzone_idx;

for ( ; ; ) {
int ret;

/*
* If the last balance_pgdat was unsuccessful it's unlikely a
* new request of a similar or harder type will succeed soon
* so consider going to sleep on the basis we reclaimed at
*/
if (balanced_classzone_idx >= new_classzone_idx &&
balanced_order == new_order) {

new_order = pgdat->kswapd_max_order;
new_classzone_idx = pgdat->classzone_idx;
pgdat->kswapd_max_order = 0;
pgdat->classzone_idx = pgdat->nr_zones - 1;
}
if (order < new_order || classzone_idx > new_classzone_idx) {
/*
* Don't sleep if someone wants a larger 'order'
* allocation or has tighter zone constraints
*/
order = new_order;
classzone_idx = new_classzone_idx;
} else {
kswapd_try_to_sleep(pgdat, balanced_order,
balanced_classzone_idx);
order = pgdat->kswapd_max_order;
classzone_idx = pgdat->classzone_idx;
new_order = order;
new_classzone_idx = classzone_idx;
pgdat->kswapd_max_order = 0;

pgdat->classzone_idx = pgdat->nr_zones - 1;
}

ret = try_to_freeze();
if (kthread_should_stop())
break;
/*
* We can speed up thawing tasks if we don't call balance_pgdat
* after returning from the refrigerator
*/

if (!ret) {
trace_mm_vmscan_kswapd_wake(pgdat->node_id, order);
balanced_classzone_idx = classzone_idx;
balanced_order = balance_pgdat(pgdat, order,
&balanced_classzone_idx);

}
}
return 0;
}
/*
* A zone is low on free memory, so wake its kswapd task to service it.
*/
void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx)
{
pg_data_t *pgdat;

if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
return;
pgdat = zone->zone_pgdat;
if (pgdat->kswapd_max_order < order) {
pgdat->kswapd_max_order = order;
pgdat->classzone_idx = min(pgdat->classzone_idx, classzone_idx);
}
if (!waitqueue_active(&pgdat->kswapd_wait))
return;
if (zone_watermark_ok_safe(zone, order, low_wmark_pages(zone), 0, 0))
return;
trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, zone_idx(zone), order);
wake_up_interruptible(&pgdat->kswapd_wait);
}
/*
* The reclaimable count would be mostly accurate.
* The less reclaimable pages may be
* - mlocked pages, which will be moved to unevictable list when encountered
* - mapped pages, which may require several travels to be reclaimed
* - dirty pages, which are not "instantly" reclaimable
*/
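/*
* In particular, on a swapless system (nr_swap_pages == 0) only the file
* LRU pages are counted below, since anon pages cannot be reclaimed
* without swap.
*/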
unsigned long global_reclaimable_pages(void)
{
int nr;
nr = global_page_state(NR_ACTIVE_FILE) +
global_page_state(NR_INACTIVE_FILE);
if (nr_swap_pages > 0)
nr += global_page_state(NR_ACTIVE_ANON) +
global_page_state(NR_INACTIVE_ANON);
return nr;
}
unsigned long zone_reclaimable_pages(struct zone *zone)
{
int nr;
nr = zone_page_state(zone, NR_ACTIVE_FILE) +
zone_page_state(zone, NR_INACTIVE_FILE);
if (nr_swap_pages > 0)
nr += zone_page_state(zone, NR_ACTIVE_ANON) +
zone_page_state(zone, NR_INACTIVE_ANON);
return nr;
}
#ifdef CONFIG_HIBERNATION
/*
* Try to free `nr_to_reclaim' of memory, system-wide, and return the number of
* freed pages.
*
* Rather than trying to age LRUs the aim is to preserve the overall
* LRU order by reclaiming preferentially
* inactive > active > active referenced > active mapped
*/
unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
{
struct reclaim_state reclaim_state;
struct scan_control sc = {
.gfp_mask = GFP_HIGHUSER_MOVABLE,
.may_swap = 1,
.may_unmap = 1,
.nr_to_reclaim = nr_to_reclaim,
.hibernation_mode = 1,
.order = 0,
};
struct shrink_control shrink = {
.gfp_mask = sc.gfp_mask,
};