Newer
Older
static unsigned long isolate_pages_global(unsigned long nr,
struct list_head *dst,
unsigned long *scanned, int order,
int mode, struct zone *z,
lru += LRU_ACTIVE;
if (file)
lru += LRU_FILE;
return isolate_lru_pages(nr, &z->lru[lru].list, dst, scanned, order,
/*
* clear_active_flags() is a helper for shrink_active_list(), clearing
* any active bits from the pages in the list.
*/
static unsigned long clear_active_flags(struct list_head *page_list,
unsigned int *count)
list_for_each_entry(page, page_list, lru) {
/**
* isolate_lru_page - tries to isolate a page from its LRU list
* @page: page to isolate from its LRU list
*
* Isolates a @page from an LRU list, clears PageLRU and adjusts the
* vmstat statistic corresponding to whatever LRU list the page was on.
*
* Returns 0 if the page was removed from an LRU list.
* Returns -EBUSY if the page was not on an LRU list.
*
* The returned page will have PageLRU() cleared. If it was found on
* the active list, it will have PageActive set. If it was found on
* the unevictable list, it will have the PageUnevictable bit set. That flag
* may need to be cleared by the caller before letting the page go.
*
* The vmstat statistic corresponding to the list on which the page was
* found will be decremented.
*
* Restrictions:
* (1) Must be called with an elevated refcount on the page. This is a
* fundamentnal difference from isolate_lru_pages (which is called
* without a stable reference).
* (2) the lru_lock must not be held.
* (3) interrupts must be enabled.
*/
int isolate_lru_page(struct page *page)
{
int ret = -EBUSY;
if (PageLRU(page)) {
struct zone *zone = page_zone(page);
spin_lock_irq(&zone->lru_lock);
if (PageLRU(page) && get_page_unless_zero(page)) {
ret = 0;
ClearPageLRU(page);
del_page_from_lru_list(zone, page, lru);
}
spin_unlock_irq(&zone->lru_lock);
}
return ret;
}
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
/*
* Are there way too many processes in the direct reclaim path already?
*/
static int too_many_isolated(struct zone *zone, int file,
struct scan_control *sc)
{
unsigned long inactive, isolated;
if (current_is_kswapd())
return 0;
if (!scanning_global_lru(sc))
return 0;
if (file) {
inactive = zone_page_state(zone, NR_INACTIVE_FILE);
isolated = zone_page_state(zone, NR_ISOLATED_FILE);
} else {
inactive = zone_page_state(zone, NR_INACTIVE_ANON);
isolated = zone_page_state(zone, NR_ISOLATED_ANON);
}
return isolated > inactive;
}
* shrink_inactive_list() is a helper for shrink_zone(). It returns the number
* of reclaimed pages
static unsigned long shrink_inactive_list(unsigned long max_scan,
struct zone *zone, struct scan_control *sc,
int priority, int file)
{
LIST_HEAD(page_list);
struct pagevec pvec;
struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc);
while (unlikely(too_many_isolated(zone, file, sc))) {
congestion_wait(BLK_RW_ASYNC, HZ/10);
/* We are about to die and free our memory. Return now. */
if (fatal_signal_pending(current))
return SWAP_CLUSTER_MAX;
}
pagevec_init(&pvec, 1);
lru_add_drain();
spin_lock_irq(&zone->lru_lock);
unsigned long nr_taken;
unsigned long nr_scan;
unsigned long nr_freed;
unsigned int count[NR_LRU_LISTS] = { 0, };
int mode = sc->lumpy_reclaim_mode ? ISOLATE_BOTH : ISOLATE_INACTIVE;
unsigned long nr_anon;
unsigned long nr_file;
if (scanning_global_lru(sc)) {
nr_taken = isolate_pages_global(SWAP_CLUSTER_MAX,
&page_list, &nr_scan,
sc->order, mode,
zone, 0, file);
zone->pages_scanned += nr_scan;
if (current_is_kswapd())
__count_zone_vm_events(PGSCAN_KSWAPD, zone,
nr_scan);
else
__count_zone_vm_events(PGSCAN_DIRECT, zone,
nr_scan);
} else {
nr_taken = mem_cgroup_isolate_pages(SWAP_CLUSTER_MAX,
&page_list, &nr_scan,
sc->order, mode,
zone, sc->mem_cgroup,
0, file);
/*
* mem_cgroup_isolate_pages() keeps track of
* scanned pages on its own.
*/
}
if (nr_taken == 0)
goto done;
nr_active = clear_active_flags(&page_list, count);

Andy Whitcroft
committed
__count_vm_events(PGDEACTIVATE, nr_active);
__mod_zone_page_state(zone, NR_ACTIVE_FILE,
-count[LRU_ACTIVE_FILE]);
__mod_zone_page_state(zone, NR_INACTIVE_FILE,
-count[LRU_INACTIVE_FILE]);
__mod_zone_page_state(zone, NR_ACTIVE_ANON,
-count[LRU_ACTIVE_ANON]);
__mod_zone_page_state(zone, NR_INACTIVE_ANON,
-count[LRU_INACTIVE_ANON]);
nr_anon = count[LRU_ACTIVE_ANON] + count[LRU_INACTIVE_ANON];
nr_file = count[LRU_ACTIVE_FILE] + count[LRU_INACTIVE_FILE];
__mod_zone_page_state(zone, NR_ISOLATED_ANON, nr_anon);
__mod_zone_page_state(zone, NR_ISOLATED_FILE, nr_file);
reclaim_stat->recent_scanned[0] += nr_anon;
reclaim_stat->recent_scanned[1] += nr_file;

Andy Whitcroft
committed
nr_freed = shrink_page_list(&page_list, sc, PAGEOUT_IO_ASYNC);
/*
* If we are direct reclaiming for contiguous pages and we do
* not reclaim everything in the list, try again and wait
* for IO to complete. This will stall high-order allocations
* but that should be acceptable to the caller
*/
if (nr_freed < nr_taken && !current_is_kswapd() &&
sc->lumpy_reclaim_mode) {
congestion_wait(BLK_RW_ASYNC, HZ/10);

Andy Whitcroft
committed
/*
* The attempt at page out may have made some
* of the pages active, mark them inactive again.
*/
nr_active = clear_active_flags(&page_list, count);

Andy Whitcroft
committed
count_vm_events(PGDEACTIVATE, nr_active);
nr_freed += shrink_page_list(&page_list, sc,
PAGEOUT_IO_SYNC);
}
if (current_is_kswapd())
__count_vm_events(KSWAPD_STEAL, nr_freed);
__count_zone_vm_events(PGSTEAL, zone, nr_freed);
/*
* Put back any unfreeable pages.
*/
while (!list_empty(&page_list)) {
if (unlikely(!page_evictable(page, NULL))) {
spin_unlock_irq(&zone->lru_lock);
putback_lru_page(page);
spin_lock_irq(&zone->lru_lock);
continue;
}
SetPageLRU(page);
lru = page_lru(page);
add_page_to_lru_list(zone, page, lru);
reclaim_stat->recent_rotated[file]++;
if (!pagevec_add(&pvec, page)) {
spin_unlock_irq(&zone->lru_lock);
__pagevec_release(&pvec);
spin_lock_irq(&zone->lru_lock);
}
}
__mod_zone_page_state(zone, NR_ISOLATED_ANON, -nr_anon);
__mod_zone_page_state(zone, NR_ISOLATED_FILE, -nr_file);
spin_unlock_irq(&zone->lru_lock);
/*
* We are about to scan this zone at a certain priority level. If that priority
* level is smaller (ie: more urgent) than the previous priority, then note
* that priority level within the zone. This is done so that when the next
* process comes in to scan this zone, it will immediately start out at this
* priority level rather than having to build up its own scanning priority.
* Here, this priority affects only the reclaim-mapped threshold.
*/
static inline void note_zone_scanning_priority(struct zone *zone, int priority)
{
if (priority < zone->prev_priority)
zone->prev_priority = priority;
}
/*
* This moves pages from the active list to the inactive list.
*
* We move them the other way if the page is referenced by one or more
* processes, from rmap.
*
* If the pages are mostly unmapped, the processing is fast and it is
* appropriate to hold zone->lru_lock across the whole operation. But if
* the pages are mapped, the processing is slow (page_referenced()) so we
* should drop zone->lru_lock around each page. It's impossible to balance
* this, so instead we remove the pages from the LRU while processing them.
* It is safe to rely on PG_active against the non-LRU pages in here because
* nobody will play with that bit on a non-LRU page.
*
* The downside is that we have to touch page->_count against each page.
* But we had to alter page->flags anyway.
*/

KAMEZAWA Hiroyuki
committed
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
static void move_active_pages_to_lru(struct zone *zone,
struct list_head *list,
enum lru_list lru)
{
unsigned long pgmoved = 0;
struct pagevec pvec;
struct page *page;
pagevec_init(&pvec, 1);
while (!list_empty(list)) {
page = lru_to_page(list);
VM_BUG_ON(PageLRU(page));
SetPageLRU(page);
list_move(&page->lru, &zone->lru[lru].list);
mem_cgroup_add_lru_list(page, lru);
pgmoved++;
if (!pagevec_add(&pvec, page) || list_empty(list)) {
spin_unlock_irq(&zone->lru_lock);
if (buffer_heads_over_limit)
pagevec_strip(&pvec);
__pagevec_release(&pvec);
spin_lock_irq(&zone->lru_lock);
}
}
__mod_zone_page_state(zone, NR_LRU_BASE + lru, pgmoved);
if (!is_active_lru(lru))
__count_vm_events(PGDEACTIVATE, pgmoved);
}

KAMEZAWA Hiroyuki
committed
static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
struct scan_control *sc, int priority, int file)
unsigned long nr_taken;
LIST_HEAD(l_hold); /* The pages which were snipped off */
LIST_HEAD(l_active);
LIST_HEAD(l_inactive);
struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc);
unsigned long nr_rotated = 0;
lru_add_drain();
spin_lock_irq(&zone->lru_lock);
nr_taken = isolate_pages_global(nr_pages, &l_hold,
&pgscanned, sc->order,
ISOLATE_ACTIVE, zone,
1, file);

KAMEZAWA Hiroyuki
committed
zone->pages_scanned += pgscanned;
} else {
nr_taken = mem_cgroup_isolate_pages(nr_pages, &l_hold,
&pgscanned, sc->order,
ISOLATE_ACTIVE, zone,
sc->mem_cgroup, 1, file);
/*
* mem_cgroup_isolate_pages() keeps track of
* scanned pages on its own.
*/
reclaim_stat->recent_scanned[file] += nr_taken;

KAMEZAWA Hiroyuki
committed
__count_zone_vm_events(PGREFILL, zone, pgscanned);
__mod_zone_page_state(zone, NR_ACTIVE_FILE, -nr_taken);
__mod_zone_page_state(zone, NR_ACTIVE_ANON, -nr_taken);
__mod_zone_page_state(zone, NR_ISOLATED_ANON + file, nr_taken);
spin_unlock_irq(&zone->lru_lock);
while (!list_empty(&l_hold)) {
cond_resched();
page = lru_to_page(&l_hold);
list_del(&page->lru);
if (unlikely(!page_evictable(page, NULL))) {
putback_lru_page(page);
continue;
}
if (page_referenced(page, 0, sc->mem_cgroup, &vm_flags)) {
/*
* Identify referenced, file-backed active pages and
* give them one more trip around the active list. So
* that executable code get better chances to stay in
* memory under moderate memory pressure. Anon pages
* are not likely to be evicted by use-once streaming
* IO, plus JVM can create lots of anon VM_EXEC pages,
* so we ignore them here.
*/
if ((vm_flags & VM_EXEC) && page_is_file_cache(page)) {
list_add(&page->lru, &l_active);
continue;
}
}

KOSAKI Motohiro
committed
ClearPageActive(page); /* we are de-activating */
* Move pages back to the lru list.
spin_lock_irq(&zone->lru_lock);
* Count referenced pages from currently used mappings as rotated,
* even though only some of them are actually re-activated. This
* helps balance scan pressure between file and anonymous pages in
* get_scan_ratio.
reclaim_stat->recent_rotated[file] += nr_rotated;
move_active_pages_to_lru(zone, &l_active,
LRU_ACTIVE + file * LRU_FILE);
move_active_pages_to_lru(zone, &l_inactive,
LRU_BASE + file * LRU_FILE);
__mod_zone_page_state(zone, NR_ISOLATED_ANON + file, -nr_taken);
spin_unlock_irq(&zone->lru_lock);
static int inactive_anon_is_low_global(struct zone *zone)
{
unsigned long active, inactive;
active = zone_page_state(zone, NR_ACTIVE_ANON);
inactive = zone_page_state(zone, NR_INACTIVE_ANON);
if (inactive * zone->inactive_ratio < active)
return 1;
return 0;
}
/**
* inactive_anon_is_low - check if anonymous pages need to be deactivated
* @zone: zone to check
* @sc: scan control of this context
*
* Returns true if the zone does not have enough inactive anon pages,
* meaning some active anon pages need to be deactivated.
*/
static int inactive_anon_is_low(struct zone *zone, struct scan_control *sc)
{
int low;
low = inactive_anon_is_low_global(zone);
else
low = mem_cgroup_inactive_anon_is_low(sc->mem_cgroup);
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
static int inactive_file_is_low_global(struct zone *zone)
{
unsigned long active, inactive;
active = zone_page_state(zone, NR_ACTIVE_FILE);
inactive = zone_page_state(zone, NR_INACTIVE_FILE);
return (active > inactive);
}
/**
* inactive_file_is_low - check if file pages need to be deactivated
* @zone: zone to check
* @sc: scan control of this context
*
* When the system is doing streaming IO, memory pressure here
* ensures that active file pages get deactivated, until more
* than half of the file pages are on the inactive list.
*
* Once we get to that situation, protect the system's working
* set from being evicted by disabling active file page aging.
*
* This uses a different ratio than the anonymous pages, because
* the page cache uses a use-once replacement algorithm.
*/
static int inactive_file_is_low(struct zone *zone, struct scan_control *sc)
{
int low;
if (scanning_global_lru(sc))
low = inactive_file_is_low_global(zone);
else
low = mem_cgroup_inactive_file_is_low(sc->mem_cgroup);
return low;
}
static int inactive_list_is_low(struct zone *zone, struct scan_control *sc,
int file)
{
if (file)
return inactive_file_is_low(zone, sc);
else
return inactive_anon_is_low(zone, sc);
}
static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan,
struct zone *zone, struct scan_control *sc, int priority)
{
int file = is_file_lru(lru);
if (is_active_lru(lru)) {
if (inactive_list_is_low(zone, sc, file))
shrink_active_list(nr_to_scan, zone, sc, priority, file);
return 0;
}
return shrink_inactive_list(nr_to_scan, zone, sc, priority, file);
/*
* Smallish @nr_to_scan's are deposited in @nr_saved_scan,
* until we collected @swap_cluster_max pages to scan.
*/
static unsigned long nr_scan_try_batch(unsigned long nr_to_scan,
unsigned long *nr_saved_scan)
{
unsigned long nr;
*nr_saved_scan += nr_to_scan;
nr = *nr_saved_scan;
if (nr >= SWAP_CLUSTER_MAX)
*nr_saved_scan = 0;
else
nr = 0;
return nr;
}
/*
* Determine how aggressively the anon and file LRU lists should be
* scanned. The relative value of each set of LRU lists is determined
* by looking at the fraction of the pages scanned we did rotate back
* onto the active list instead of evict.
*
* nr[0] = anon pages to scan; nr[1] = file pages to scan
static void get_scan_count(struct zone *zone, struct scan_control *sc,
unsigned long *nr, int priority)
{
unsigned long anon, file, free;
unsigned long anon_prio, file_prio;
unsigned long ap, fp;
struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc);
u64 fraction[2], denominator;
enum lru_list l;
int noswap = 0;
/* If we have no swap space, do not bother scanning anon pages. */
if (!sc->may_swap || (nr_swap_pages <= 0)) {
noswap = 1;
fraction[0] = 0;
fraction[1] = 1;
denominator = 1;
goto out;
}
anon = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_ANON) +
zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON);
file = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_FILE) +
zone_nr_lru_pages(zone, sc, LRU_INACTIVE_FILE);
free = zone_page_state(zone, NR_FREE_PAGES);
/* If we have very few page cache pages,
force-scan anon pages. */
if (unlikely(file + free <= high_wmark_pages(zone))) {
fraction[0] = 1;
fraction[1] = 0;
denominator = 1;
goto out;
}
/*
* OK, so we have swap space and a fair amount of page cache
* pages. We use the recently rotated / recently scanned
* ratios to determine how valuable each cache is.
*
* Because workloads change over time (and to avoid overflow)
* we keep these statistics as a floating average, which ends
* up weighing recent references more than old ones.
*
* anon in [0], file in [1]
*/
if (unlikely(reclaim_stat->recent_scanned[0] > anon / 4)) {
spin_lock_irq(&zone->lru_lock);
reclaim_stat->recent_scanned[0] /= 2;
reclaim_stat->recent_rotated[0] /= 2;
spin_unlock_irq(&zone->lru_lock);
}
if (unlikely(reclaim_stat->recent_scanned[1] > file / 4)) {
spin_lock_irq(&zone->lru_lock);
reclaim_stat->recent_scanned[1] /= 2;
reclaim_stat->recent_rotated[1] /= 2;
spin_unlock_irq(&zone->lru_lock);
}
/*
* With swappiness at 100, anonymous and file have the same priority.
* This scanning priority is essentially the inverse of IO cost.
*/
anon_prio = sc->swappiness;
file_prio = 200 - sc->swappiness;
/*
* The amount of pressure on anon vs file pages is inversely
* proportional to the fraction of recently scanned pages on
* each list that were recently referenced and in active use.
ap = (anon_prio + 1) * (reclaim_stat->recent_scanned[0] + 1);
ap /= reclaim_stat->recent_rotated[0] + 1;
fp = (file_prio + 1) * (reclaim_stat->recent_scanned[1] + 1);
fp /= reclaim_stat->recent_rotated[1] + 1;
fraction[0] = ap;
fraction[1] = fp;
denominator = ap + fp + 1;
out:
for_each_evictable_lru(l) {
int file = is_file_lru(l);
unsigned long scan;
scan = zone_nr_lru_pages(zone, sc, l);
if (priority || noswap) {
scan >>= priority;
scan = div64_u64(scan * fraction[file], denominator);
}
nr[l] = nr_scan_try_batch(scan,
&reclaim_stat->nr_saved_scan[l]);
}
static void set_lumpy_reclaim_mode(int priority, struct scan_control *sc)
{
/*
* If we need a large contiguous chunk of memory, or have
* trouble getting a small set of contiguous pages, we
* will reclaim both active and inactive pages.
*/
if (sc->order > PAGE_ALLOC_COSTLY_ORDER)
sc->lumpy_reclaim_mode = 1;
else if (sc->order && priority < DEF_PRIORITY - 2)
sc->lumpy_reclaim_mode = 1;
else
sc->lumpy_reclaim_mode = 0;
}
/*
* This is a basic per-zone page freer. Used by both kswapd and direct reclaim.
*/
static void shrink_zone(int priority, struct zone *zone,
unsigned long nr[NR_LRU_LISTS];
unsigned long nr_to_scan;
unsigned long nr_reclaimed = sc->nr_reclaimed;
unsigned long nr_to_reclaim = sc->nr_to_reclaim;
get_scan_count(zone, sc, nr, priority);
set_lumpy_reclaim_mode(priority, sc);
while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
nr[LRU_INACTIVE_FILE]) {
nr_to_scan = min_t(unsigned long,
nr[l], SWAP_CLUSTER_MAX);
nr[l] -= nr_to_scan;
nr_reclaimed += shrink_list(l, nr_to_scan,
zone, sc, priority);
/*
* On large memory systems, scan >> priority can become
* really large. This is fine for the starting priority;
* we want to put equal scanning pressure on each zone.
* However, if the VM has a harder time of freeing pages,
* with multiple processes reclaiming pages, the total
* freeing target can get unreasonably large.
*/

KOSAKI Motohiro
committed
if (nr_reclaimed >= nr_to_reclaim && priority < DEF_PRIORITY)
sc->nr_reclaimed = nr_reclaimed;
/*
* Even if we did not try to evict anon pages at all, we want to
* rebalance the anon lru active/inactive ratio.
*/
if (inactive_anon_is_low(zone, sc) && nr_swap_pages > 0)
shrink_active_list(SWAP_CLUSTER_MAX, zone, sc, priority, 0);

Andrew Morton
committed
throttle_vm_writeout(sc->gfp_mask);
}
/*
* This is the direct reclaim path, for page-allocating processes. We only
* try to reclaim pages from zones which will satisfy the caller's allocation
* request.
*
* We reclaim from a zone even if that zone is over high_wmark_pages(zone).
* Because:
* a) The caller may be trying to free *extra* pages to satisfy a higher-order
* allocation or
* b) The target zone may be at high_wmark_pages(zone) but the lower zones
* must go *over* high_wmark_pages(zone) to satisfy the `incremental min'
* zone defense algorithm.
*
* If a zone is deemed to be full of pinned pages then just give it a light
* scan then give up on it.
*/

KOSAKI Motohiro
committed
static bool shrink_zones(int priority, struct zonelist *zonelist,
enum zone_type high_zoneidx = gfp_zone(sc->gfp_mask);
struct zoneref *z;

KOSAKI Motohiro
committed
bool all_unreclaimable = true;

KAMEZAWA Hiroyuki
committed
for_each_zone_zonelist_nodemask(zone, z, zonelist, high_zoneidx,
sc->nodemask) {

KAMEZAWA Hiroyuki
committed
/*
* Take care memory controller reclaiming has small influence
* to global LRU.
*/

KAMEZAWA Hiroyuki
committed
if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
continue;
note_zone_scanning_priority(zone, priority);
if (zone->all_unreclaimable && priority != DEF_PRIORITY)

KAMEZAWA Hiroyuki
committed
continue; /* Let kswapd poll it */
} else {
/*
* Ignore cpuset limitation here. We just want to reduce
* # of used pages by us regardless of memory shortage.
*/
mem_cgroup_note_reclaim_priority(sc->mem_cgroup,
priority);
}
shrink_zone(priority, zone, sc);

KOSAKI Motohiro
committed
all_unreclaimable = false;

KOSAKI Motohiro
committed
return all_unreclaimable;
/*
* This is the main entry point to direct page reclaim.
*
* If a full scan of the inactive list fails to free enough memory then we
* are "out of memory" and something needs to be killed.
*
* If the caller is !__GFP_FS then the probability of a failure is reasonably
* high - the zone may be full of dirty or under-writeback pages, which this
* caller can't do much about. We kick the writeback threads and take explicit
* naps in the hope that some of these pages can be written. But if the
* allocating task holds filesystem locks which prevent writeout this might not
* work, and the allocation attempt will fail.
*
* returns: 0, if no pages reclaimed
* else, the number of pages reclaimed
static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
struct scan_control *sc)

KOSAKI Motohiro
committed
bool all_unreclaimable;
struct reclaim_state *reclaim_state = current->reclaim_state;
struct zoneref *z;
enum zone_type high_zoneidx = gfp_zone(sc->gfp_mask);
unsigned long writeback_threshold;
get_mems_allowed();
delayacct_freepages_start();

KAMEZAWA Hiroyuki
committed
count_vm_event(ALLOCSTALL);
for (priority = DEF_PRIORITY; priority >= 0; priority--) {
if (!priority)
disable_swap_token();

KOSAKI Motohiro
committed
all_unreclaimable = shrink_zones(priority, zonelist, sc);
/*
* Don't shrink slabs when reclaiming memory from
* over limit cgroups
*/
unsigned long lru_pages = 0;
for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
continue;
lru_pages += zone_reclaimable_pages(zone);
}
shrink_slab(sc->nr_scanned, sc->gfp_mask, lru_pages);

KAMEZAWA Hiroyuki
committed
if (reclaim_state) {
sc->nr_reclaimed += reclaim_state->reclaimed_slab;

KAMEZAWA Hiroyuki
committed
reclaim_state->reclaimed_slab = 0;
}
total_scanned += sc->nr_scanned;

KOSAKI Motohiro
committed
if (sc->nr_reclaimed >= sc->nr_to_reclaim)
goto out;
/*
* Try to write back as many pages as we just scanned. This
* tends to cause slow streaming writers to write data to the
* disk smoothly, at the dirtying rate, which is nice. But
* that's undesirable in laptop mode, where we *want* lumpy
* writeout. So in laptop mode, write out the whole world.
*/
writeback_threshold = sc->nr_to_reclaim + sc->nr_to_reclaim / 2;
if (total_scanned > writeback_threshold) {
wakeup_flusher_threads(laptop_mode ? 0 : total_scanned);
sc->may_writepage = 1;
}
/* Take a nap, wait for some writeback to complete */
if (!sc->hibernation_mode && sc->nr_scanned &&
priority < DEF_PRIORITY - 2)
congestion_wait(BLK_RW_ASYNC, HZ/10);

KOSAKI Motohiro
committed
/*
* Now that we've scanned all the zones at this priority level, note
* that level within the zone so that the next thread which performs
* scanning of this zone will immediately start out at this priority
* level. This affects only the decision whether or not to bring
* mapped pages onto the inactive list.
*/
if (priority < 0)
priority = 0;
for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {

KAMEZAWA Hiroyuki
committed
if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
continue;
zone->prev_priority = priority;
}
} else
mem_cgroup_record_reclaim_priority(sc->mem_cgroup, priority);
delayacct_freepages_end();
put_mems_allowed();

KOSAKI Motohiro
committed
if (sc->nr_reclaimed)
return sc->nr_reclaimed;
/* top priority shrink_zones still had more to do? don't OOM, then */
if (scanning_global_lru(sc) && !all_unreclaimable)
return 1;
return 0;
unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
gfp_t gfp_mask, nodemask_t *nodemask)
{
struct scan_control sc = {
.gfp_mask = gfp_mask,
.may_writepage = !laptop_mode,
.nr_to_reclaim = SWAP_CLUSTER_MAX,
.swappiness = vm_swappiness,
.order = order,
.mem_cgroup = NULL,
return do_try_to_free_pages(zonelist, &sc);
#ifdef CONFIG_CGROUP_MEM_RES_CTLR
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
gfp_t gfp_mask, bool noswap,
unsigned int swappiness,
struct zone *zone, int nid)
{
struct scan_control sc = {
.may_writepage = !laptop_mode,
.may_unmap = 1,
.may_swap = !noswap,
.swappiness = swappiness,
.order = 0,
.mem_cgroup = mem,
};
nodemask_t nm = nodemask_of_node(nid);
sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
(GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK);
sc.nodemask = &nm;
sc.nr_reclaimed = 0;
sc.nr_scanned = 0;
/*
* NOTE: Although we can get the priority field, using it
* here is not a good idea, since it limits the pages we can scan.
* if we don't reclaim here, the shrink_zone from balance_pgdat
* will pick up pages from other mem cgroup's as well. We hack
* the priority and make it zero.
*/
shrink_zone(0, zone, &sc);
return sc.nr_reclaimed;
}
unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
gfp_t gfp_mask,
bool noswap,
unsigned int swappiness)
struct zonelist *zonelist;
struct scan_control sc = {
.may_writepage = !laptop_mode,
.nr_to_reclaim = SWAP_CLUSTER_MAX,
.order = 0,
.mem_cgroup = mem_cont,
.nodemask = NULL, /* we don't care the placement */
sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
(GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK);
zonelist = NODE_DATA(numa_node_id())->node_zonelists;
return do_try_to_free_pages(zonelist, &sc);

Mel Gorman
committed
/* is kswapd sleeping prematurely? */

KOSAKI Motohiro
committed
static int sleeping_prematurely(pg_data_t *pgdat, int order, long remaining)

Mel Gorman
committed
{

KOSAKI Motohiro
committed
int i;

Mel Gorman
committed
/* If a direct reclaimer woke kswapd within HZ/10, it's premature */
if (remaining)
return 1;
/* If after HZ/10, a zone is below the high mark, it's premature */

KOSAKI Motohiro
committed
for (i = 0; i < pgdat->nr_zones; i++) {
struct zone *zone = pgdat->node_zones + i;
if (!populated_zone(zone))
continue;
if (zone->all_unreclaimable)

KOSAKI Motohiro
committed
continue;

Mel Gorman
committed
if (!zone_watermark_ok(zone, order, high_wmark_pages(zone),
0, 0))
return 1;

KOSAKI Motohiro
committed
}

Mel Gorman
committed
return 0;
}
/*
* For kswapd, balance_pgdat() will work across all this node's zones until
* they are all at high_wmark_pages(zone).
*
* Returns the number of pages which were actually freed.
*
* There is special handling here for zones which are full of pinned pages.
* This can happen if the pages are all mlocked, or if they are all used by
* device drivers (say, ZONE_DMA). Or if they are all in use by hugetlb.
* What we do is to detect the case where all pages in the zone have been
* scanned twice and there has been zero successful reclaim. Mark the zone as
* dead and from now on, only perform a short scan. Basically we're polling
* the zone for when the problem goes away.
*
* kswapd scans the zones in the highmem->normal->dma direction. It skips
* zones which have free_pages > high_wmark_pages(zone), but once a zone is