*
* ISOLATE_ASYNC_MIGRATE is used to indicate that it only wants pages
* that it is possible to migrate without blocking
*/
if (mode & (ISOLATE_CLEAN|ISOLATE_ASYNC_MIGRATE)) {
/* All the caller can do on PageWriteback is block */
if (PageWriteback(page))
return ret;
if (PageDirty(page)) {
struct address_space *mapping;
/* ISOLATE_CLEAN means only clean pages */
if (mode & ISOLATE_CLEAN)
return ret;
/*
* Only pages without mappings or that have a
* ->migratepage callback are possible to migrate
* without blocking
*/
mapping = page_mapping(page);
if (mapping && !mapping->a_ops->migratepage)
return ret;
}
}
if ((mode & ISOLATE_UNMAPPED) && page_mapped(page))
return ret;
if (likely(get_page_unless_zero(page))) {
/*
* Be careful not to clear PageLRU until after we're
* sure the page is not being freed elsewhere -- the
* page release code relies on it.
*/
ClearPageLRU(page);
ret = 0;
}
return ret;
}
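/*
 * Illustration (editor's note, not part of the original source): callers
 * treat a 0 return as "page claimed, PageLRU cleared" and -EBUSY as
 * "leave the page where it is and move on" -- see the switch over
 * __isolate_lru_page() in isolate_lru_pages() below.
 */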
/*
* zone->lru_lock is heavily contended. Some of the functions that
* shrink the lists perform better by taking out a batch of pages
* and working on them outside the LRU lock.
*
* For pagecache intensive workloads, this function is the hottest
* spot in the kernel (apart from copy_*_user functions).
*
* Appropriate locks must be held before calling this function.
*
* @nr_to_scan: The number of pages to look through on the list.
* @mz: The mem_cgroup_zone to pull pages from.
* @dst: The temporary list to put the isolated pages on.
* @nr_scanned: The number of pages that were scanned.
* @sc: The scan_control struct for this reclaim session
* @mode: One of the LRU isolation modes
* @active: True [1] if isolating active pages
* @file: True [1] if isolating file [!anon] pages
*
* returns how many pages were moved onto *@dst.
*/
static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
struct mem_cgroup_zone *mz, struct list_head *dst,
unsigned long *nr_scanned, struct scan_control *sc,
isolate_mode_t mode, int active, int file)
{
struct lruvec *lruvec;
struct list_head *src;
unsigned long nr_taken = 0;
unsigned long scan;
int lru = LRU_BASE;
lruvec = mem_cgroup_zone_lruvec(mz->zone, mz->mem_cgroup);
if (active)
lru += LRU_ACTIVE;
if (file)
lru += LRU_FILE;
src = &lruvec->lists[lru];
for (scan = 0; scan < nr_to_scan && !list_empty(src); scan++) {
struct page *page;

page = lru_to_page(src);
prefetchw_prev_lru_page(page, src, flags);
switch (__isolate_lru_page(page, mode, file)) {
case 0:
list_move(&page->lru, dst);
nr_taken += hpage_nr_pages(page);
break;
case -EBUSY:
/* else it is being freed elsewhere */
list_move(&page->lru, src);
continue;
default:
BUG();
}
}

*nr_scanned = scan;

trace_mm_vmscan_lru_isolate(sc->order,
nr_to_scan, scan,
nr_taken,
mode, file);

return nr_taken;
}

/**
* isolate_lru_page - tries to isolate a page from its LRU list
* @page: page to isolate from its LRU list
*
* Isolates a @page from an LRU list, clears PageLRU and adjusts the
* vmstat statistic corresponding to whatever LRU list the page was on.
*
* Returns 0 if the page was removed from an LRU list.
* Returns -EBUSY if the page was not on an LRU list.
*
* The returned page will have PageLRU() cleared. If it was found on
* the active list, it will have PageActive set. If it was found on
* the unevictable list, it will have the PageUnevictable bit set. That flag
* may need to be cleared by the caller before letting the page go.
*
* The vmstat statistic corresponding to the list on which the page was
* found will be decremented.
*
* Restrictions:
* (1) Must be called with an elevated refcount on the page. This is a
* fundamental difference from isolate_lru_pages (which is called
* without a stable reference).
* (2) the lru_lock must not be held.
* (3) interrupts must be enabled.
*/
int isolate_lru_page(struct page *page)
{
int ret = -EBUSY;
VM_BUG_ON(!page_count(page));
if (PageLRU(page)) {
struct zone *zone = page_zone(page);
spin_lock_irq(&zone->lru_lock);
if (PageLRU(page)) {
int lru = page_lru(page);

ret = 0;
get_page(page);
ClearPageLRU(page);

del_page_from_lru_list(zone, page, lru);
}
spin_unlock_irq(&zone->lru_lock);
}
return ret;
}
/*
* Are there way too many processes in the direct reclaim path already?
*/
static int too_many_isolated(struct zone *zone, int file,
struct scan_control *sc)
{
unsigned long inactive, isolated;
if (current_is_kswapd())
return 0;
if (!global_reclaim(sc))
return 0;
if (file) {
inactive = zone_page_state(zone, NR_INACTIVE_FILE);
isolated = zone_page_state(zone, NR_ISOLATED_FILE);
} else {
inactive = zone_page_state(zone, NR_INACTIVE_ANON);
isolated = zone_page_state(zone, NR_ISOLATED_ANON);
}
return isolated > inactive;
}
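/*
 * Illustration (editor's note, not part of the original source): if a
 * zone has 10000 inactive file pages and concurrent direct reclaimers
 * have already isolated more than 10000 file pages, too_many_isolated()
 * returns true and new direct reclaimers stall in the congestion_wait()
 * loop in shrink_inactive_list() until some pages are put back.
 */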
static noinline_for_stack void
putback_inactive_pages(struct mem_cgroup_zone *mz,
struct list_head *page_list)
{
struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(mz);
struct zone *zone = mz->zone;
LIST_HEAD(pages_to_free);
/*
* Put back any unfreeable pages.
*/
while (!list_empty(page_list)) {
struct page *page = lru_to_page(page_list);
int lru;
VM_BUG_ON(PageLRU(page));
list_del(&page->lru);
if (unlikely(!page_evictable(page, NULL))) {
spin_unlock_irq(&zone->lru_lock);
putback_lru_page(page);
spin_lock_irq(&zone->lru_lock);
continue;
}
SetPageLRU(page);
lru = page_lru(page);
add_page_to_lru_list(zone, page, lru);
if (is_active_lru(lru)) {
int file = is_file_lru(lru);
int numpages = hpage_nr_pages(page);
reclaim_stat->recent_rotated[file] += numpages;
}
if (put_page_testzero(page)) {
__ClearPageLRU(page);
__ClearPageActive(page);
del_page_from_lru_list(zone, page, lru);
if (unlikely(PageCompound(page))) {
spin_unlock_irq(&zone->lru_lock);
(*get_compound_page_dtor(page))(page);
spin_lock_irq(&zone->lru_lock);
} else
list_add(&page->lru, &pages_to_free);
}
}
/*
* To save our caller's stack, now use input list for pages to free.
*/
list_splice(&pages_to_free, page_list);
}

static noinline_for_stack void
update_isolated_counts(struct mem_cgroup_zone *mz,
struct list_head *page_list,
unsigned long *nr_anon,
unsigned long *nr_file)
{

struct zone *zone = mz->zone;

unsigned int count[NR_LRU_LISTS] = { 0, };
unsigned long nr_active = 0;
struct page *page;
int lru;
/*
* Count pages and clear active flags
*/
list_for_each_entry(page, page_list, lru) {
int numpages = hpage_nr_pages(page);
lru = page_lru_base_type(page);
if (PageActive(page)) {
lru += LRU_ACTIVE;
ClearPageActive(page);
nr_active += numpages;
}
count[lru] += numpages;
}

preempt_disable();
__count_vm_events(PGDEACTIVATE, nr_active);
__mod_zone_page_state(zone, NR_ACTIVE_FILE,
-count[LRU_ACTIVE_FILE]);
__mod_zone_page_state(zone, NR_INACTIVE_FILE,
-count[LRU_INACTIVE_FILE]);
__mod_zone_page_state(zone, NR_ACTIVE_ANON,
-count[LRU_ACTIVE_ANON]);
__mod_zone_page_state(zone, NR_INACTIVE_ANON,
-count[LRU_INACTIVE_ANON]);
*nr_anon = count[LRU_ACTIVE_ANON] + count[LRU_INACTIVE_ANON];
*nr_file = count[LRU_ACTIVE_FILE] + count[LRU_INACTIVE_FILE];
__mod_zone_page_state(zone, NR_ISOLATED_ANON, *nr_anon);
__mod_zone_page_state(zone, NR_ISOLATED_FILE, *nr_file);
preempt_enable();

}
/*
 * shrink_inactive_list() is a helper for shrink_zone(). It returns the number
 * of reclaimed pages
 */
static noinline_for_stack unsigned long
shrink_inactive_list(unsigned long nr_to_scan, struct mem_cgroup_zone *mz,
struct scan_control *sc, int priority, int file)
{
LIST_HEAD(page_list);
unsigned long nr_scanned;
unsigned long nr_reclaimed = 0;
unsigned long nr_taken;
unsigned long nr_anon;
unsigned long nr_file;

unsigned long nr_dirty = 0;
unsigned long nr_writeback = 0;
isolate_mode_t isolate_mode = ISOLATE_INACTIVE;

struct zone *zone = mz->zone;
struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(mz);
while (unlikely(too_many_isolated(zone, file, sc))) {
congestion_wait(BLK_RW_ASYNC, HZ/10);
/* We are about to die and free our memory. Return now. */
if (fatal_signal_pending(current))
return SWAP_CLUSTER_MAX;
}
if (!sc->may_unmap)
isolate_mode |= ISOLATE_UNMAPPED;
if (!sc->may_writepage)
isolate_mode |= ISOLATE_CLEAN;

spin_lock_irq(&zone->lru_lock);
nr_taken = isolate_lru_pages(nr_to_scan, mz, &page_list, &nr_scanned,
sc, isolate_mode, 0, file);
if (global_reclaim(sc)) {
zone->pages_scanned += nr_scanned;
if (current_is_kswapd())
__count_zone_vm_events(PGSCAN_KSWAPD, zone,
nr_scanned);
else
__count_zone_vm_events(PGSCAN_DIRECT, zone,
nr_scanned);
}
spin_unlock_irq(&zone->lru_lock);
if (nr_taken == 0)
return 0;

update_isolated_counts(mz, &page_list, &nr_anon, &nr_file);

nr_reclaimed = shrink_page_list(&page_list, mz, sc, priority,

&nr_dirty, &nr_writeback);

spin_lock_irq(&zone->lru_lock);
reclaim_stat->recent_scanned[0] += nr_anon;
reclaim_stat->recent_scanned[1] += nr_file;
if (global_reclaim(sc)) {
if (current_is_kswapd())
__count_zone_vm_events(PGSTEAL_KSWAPD, zone,
nr_reclaimed);
else
__count_zone_vm_events(PGSTEAL_DIRECT, zone,
nr_reclaimed);
}
putback_inactive_pages(mz, &page_list);
__mod_zone_page_state(zone, NR_ISOLATED_ANON, -nr_anon);
__mod_zone_page_state(zone, NR_ISOLATED_FILE, -nr_file);
spin_unlock_irq(&zone->lru_lock);
free_hot_cold_page_list(&page_list, 1);

/*
* If reclaim is isolating dirty pages under writeback, it implies
* that the long-lived page allocation rate is exceeding the page
* laundering rate. Either the global limits are not being effective
* at throttling processes due to the page distribution throughout
* zones or there is heavy usage of a slow backing device. The
* only option is to throttle from reclaim context which is not ideal
* as there is no guarantee the dirtying process is throttled in the
* same way balance_dirty_pages() manages.
*
* This scales the number of dirty pages that must be under writeback
* before throttling depending on priority. It is a simple backoff
* function that has the most effect in the range DEF_PRIORITY to
* DEF_PRIORITY-2, which is the range where reclaim is considered
* to be in trouble.
*
* DEF_PRIORITY 100% isolated pages must be PageWriteback to throttle
* DEF_PRIORITY-1 50% must be PageWriteback
* DEF_PRIORITY-2 25% must be PageWriteback, kswapd in trouble
* ...
* DEF_PRIORITY-6 For SWAP_CLUSTER_MAX isolated pages, throttle if any
* isolated page is PageWriteback
*/
if (nr_writeback && nr_writeback >= (nr_taken >> (DEF_PRIORITY-priority)))
wait_iff_congested(zone, BLK_RW_ASYNC, HZ/10);
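/*
 * Worked example (editor's illustration, assuming DEF_PRIORITY == 12):
 * at priority 12 the shift is 0, so all nr_taken pages must be under
 * writeback before throttling; at priority 10 the shift is 2, so a
 * quarter of them is enough -- matching the 100%/50%/25% table above.
 */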
trace_mm_vmscan_lru_shrink_inactive(zone->zone_pgdat->node_id,
zone_idx(zone),
nr_scanned, nr_reclaimed,
priority,
trace_shrink_flags(file));

return nr_reclaimed;
}
/*
* This moves pages from the active list to the inactive list.
*
* We move them the other way if the page is referenced by one or more
* processes, from rmap.
*
* If the pages are mostly unmapped, the processing is fast and it is
* appropriate to hold zone->lru_lock across the whole operation. But if
* the pages are mapped, the processing is slow (page_referenced()) so we
* should drop zone->lru_lock around each page. It's impossible to balance
* this, so instead we remove the pages from the LRU while processing them.
* It is safe to rely on PG_active against the non-LRU pages in here because
* nobody will play with that bit on a non-LRU page.
*
* The downside is that we have to touch page->_count against each page.
* But we had to alter page->flags anyway.
*/

static void move_active_pages_to_lru(struct zone *zone,
struct list_head *list,
struct list_head *pages_to_free,
enum lru_list lru)
{
unsigned long pgmoved = 0;
struct page *page;
while (!list_empty(list)) {
struct lruvec *lruvec;

page = lru_to_page(list);
VM_BUG_ON(PageLRU(page));
SetPageLRU(page);
lruvec = mem_cgroup_lru_add_list(zone, page, lru);
list_move(&page->lru, &lruvec->lists[lru]);
pgmoved += hpage_nr_pages(page);
if (put_page_testzero(page)) {
__ClearPageLRU(page);
__ClearPageActive(page);
del_page_from_lru_list(zone, page, lru);
if (unlikely(PageCompound(page))) {
spin_unlock_irq(&zone->lru_lock);
(*get_compound_page_dtor(page))(page);
spin_lock_irq(&zone->lru_lock);
} else
list_add(&page->lru, pages_to_free);
}
}
__mod_zone_page_state(zone, NR_LRU_BASE + lru, pgmoved);
if (!is_active_lru(lru))
__count_vm_events(PGDEACTIVATE, pgmoved);
}

static void shrink_active_list(unsigned long nr_to_scan,
struct mem_cgroup_zone *mz,
struct scan_control *sc,
int priority, int file)
{
unsigned long nr_scanned;
unsigned long vm_flags;
struct page *page;
unsigned long nr_taken;
LIST_HEAD(l_hold); /* The pages which were snipped off */
LIST_HEAD(l_active);
LIST_HEAD(l_inactive);

struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(mz);
unsigned long nr_rotated = 0;
isolate_mode_t isolate_mode = ISOLATE_ACTIVE;

struct zone *zone = mz->zone;
if (!sc->may_unmap)
isolate_mode |= ISOLATE_UNMAPPED;
if (!sc->may_writepage)
isolate_mode |= ISOLATE_CLEAN;

spin_lock_irq(&zone->lru_lock);
nr_taken = isolate_lru_pages(nr_to_scan, mz, &l_hold, &nr_scanned, sc,
isolate_mode, 1, file);
if (global_reclaim(sc))
zone->pages_scanned += nr_scanned;

reclaim_stat->recent_scanned[file] += nr_taken;

__count_zone_vm_events(PGREFILL, zone, nr_scanned);
if (file)
__mod_zone_page_state(zone, NR_ACTIVE_FILE, -nr_taken);
else
__mod_zone_page_state(zone, NR_ACTIVE_ANON, -nr_taken);
__mod_zone_page_state(zone, NR_ISOLATED_ANON + file, nr_taken);
spin_unlock_irq(&zone->lru_lock);
while (!list_empty(&l_hold)) {
cond_resched();
page = lru_to_page(&l_hold);
list_del(&page->lru);
if (unlikely(!page_evictable(page, NULL))) {
putback_lru_page(page);
continue;
}

if (unlikely(buffer_heads_over_limit)) {
if (page_has_private(page) && trylock_page(page)) {
if (page_has_private(page))
try_to_release_page(page, 0);
unlock_page(page);
}
}

if (page_referenced(page, 0, sc->target_mem_cgroup,
&vm_flags)) {
nr_rotated += hpage_nr_pages(page);
/*
* Identify referenced, file-backed active pages and
* give them one more trip around the active list. So
* that executable code get better chances to stay in
* memory under moderate memory pressure. Anon pages
* are not likely to be evicted by use-once streaming
* IO, plus JVM can create lots of anon VM_EXEC pages,
* so we ignore them here.
*/
if ((vm_flags & VM_EXEC) && page_is_file_cache(page)) {
list_add(&page->lru, &l_active);
continue;
}
}

ClearPageActive(page);	/* we are de-activating */
list_add(&page->lru, &l_inactive);
}

/*
 * Move pages back to the lru list.
 */
spin_lock_irq(&zone->lru_lock);
/*
 * Count referenced pages from currently used mappings as rotated,
 * even though only some of them are actually re-activated. This
 * helps balance scan pressure between file and anonymous pages in
 * get_scan_ratio.
 */
reclaim_stat->recent_rotated[file] += nr_rotated;
move_active_pages_to_lru(zone, &l_active, &l_hold,
LRU_ACTIVE + file * LRU_FILE);
move_active_pages_to_lru(zone, &l_inactive, &l_hold,
LRU_BASE + file * LRU_FILE);
__mod_zone_page_state(zone, NR_ISOLATED_ANON + file, -nr_taken);
spin_unlock_irq(&zone->lru_lock);
free_hot_cold_page_list(&l_hold, 1);
}

#ifdef CONFIG_SWAP
static int inactive_anon_is_low_global(struct zone *zone)
{
unsigned long active, inactive;
active = zone_page_state(zone, NR_ACTIVE_ANON);
inactive = zone_page_state(zone, NR_INACTIVE_ANON);
if (inactive * zone->inactive_ratio < active)
return 1;
return 0;
}
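/*
 * Illustration (editor's note, not part of the original source): with
 * inactive_ratio == 3, this reports "low" once active anon pages
 * outnumber inactive anon pages by more than 3:1, prompting
 * shrink_active_list() to move some of them to the inactive list.
 */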
/**
* inactive_anon_is_low - check if anonymous pages need to be deactivated
* @zone: zone to check
* @sc: scan control of this context
*
* Returns true if the zone does not have enough inactive anon pages,
* meaning some active anon pages need to be deactivated.
*/

static int inactive_anon_is_low(struct mem_cgroup_zone *mz)
{
/*
* If we don't have swap space, anonymous page deactivation
* is pointless.
*/
if (!total_swap_pages)
return 0;

if (!scanning_global_lru(mz))
return mem_cgroup_inactive_anon_is_low(mz->mem_cgroup,
mz->zone);
return inactive_anon_is_low_global(mz->zone);
}
#else
static inline int inactive_anon_is_low(struct mem_cgroup_zone *mz)
{
return 0;
}
#endif
static int inactive_file_is_low_global(struct zone *zone)
{
unsigned long active, inactive;
active = zone_page_state(zone, NR_ACTIVE_FILE);
inactive = zone_page_state(zone, NR_INACTIVE_FILE);
return (active > inactive);
}
/**
* inactive_file_is_low - check if file pages need to be deactivated

* @mz: memory cgroup and zone to check
*
* When the system is doing streaming IO, memory pressure here
* ensures that active file pages get deactivated, until more
* than half of the file pages are on the inactive list.
*
* Once we get to that situation, protect the system's working
* set from being evicted by disabling active file page aging.
*
* This uses a different ratio than the anonymous pages, because
* the page cache uses a use-once replacement algorithm.
*/

static int inactive_file_is_low(struct mem_cgroup_zone *mz)
{
if (!scanning_global_lru(mz))
return mem_cgroup_inactive_file_is_low(mz->mem_cgroup,
mz->zone);

return inactive_file_is_low_global(mz->zone);
}

static int inactive_list_is_low(struct mem_cgroup_zone *mz, int file)
{
if (file)

return inactive_file_is_low(mz);
else

return inactive_anon_is_low(mz);
}
static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan,

struct mem_cgroup_zone *mz,
struct scan_control *sc, int priority)
{
int file = is_file_lru(lru);
if (is_active_lru(lru)) {

if (inactive_list_is_low(mz, file))
shrink_active_list(nr_to_scan, mz, sc, priority, file);
return 0;
}

return shrink_inactive_list(nr_to_scan, mz, sc, priority, file);
}

static int vmscan_swappiness(struct mem_cgroup_zone *mz,
struct scan_control *sc)
{
if (global_reclaim(sc))
return vm_swappiness;

return mem_cgroup_swappiness(mz->mem_cgroup);
}
/*
* Determine how aggressively the anon and file LRU lists should be
* scanned. The relative value of each set of LRU lists is determined
* by looking at the fraction of the pages scanned we did rotate back
* onto the active list instead of evict.
*
* nr[0] = anon pages to scan; nr[1] = file pages to scan
*/
static void get_scan_count(struct mem_cgroup_zone *mz, struct scan_control *sc,
unsigned long *nr, int priority)
{
unsigned long anon, file, free;
unsigned long anon_prio, file_prio;
unsigned long ap, fp;
enum lru_list lru;
int noswap = 0;

struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(mz);
u64 fraction[2], denominator;
bool force_scan = false;
/*
* If the zone or memcg is small, nr[l] can be 0. This
* results in no scanning on this priority and a potential
* priority drop. Global direct reclaim can go to the next
* zone and tends to have no problems. Global kswapd is for
* zone balancing and it needs to scan a minimum amount. When
* reclaiming for a memcg, a priority drop can cause high
* latencies, so it's better to scan a minimum amount there as
* well.
*/
if (current_is_kswapd() && mz->zone->all_unreclaimable)
force_scan = true;
if (!global_reclaim(sc))
force_scan = true;
/* If we have no swap space, do not bother scanning anon pages. */
if (!sc->may_swap || (nr_swap_pages <= 0)) {
noswap = 1;
fraction[0] = 0;
fraction[1] = 1;
denominator = 1;
goto out;
}

anon = zone_nr_lru_pages(mz, LRU_ACTIVE_ANON) +
zone_nr_lru_pages(mz, LRU_INACTIVE_ANON);
file = zone_nr_lru_pages(mz, LRU_ACTIVE_FILE) +
zone_nr_lru_pages(mz, LRU_INACTIVE_FILE);
if (global_reclaim(sc)) {

free = zone_page_state(mz->zone, NR_FREE_PAGES);
/* If we have very few page cache pages,
force-scan anon pages. */

if (unlikely(file + free <= high_wmark_pages(mz->zone))) {
fraction[0] = 1;
fraction[1] = 0;
denominator = 1;
goto out;
}
}

/*
* With swappiness at 100, anonymous and file have the same priority.
* This scanning priority is essentially the inverse of IO cost.
*/

anon_prio = vmscan_swappiness(mz, sc);
file_prio = 200 - vmscan_swappiness(mz, sc);
/*
* OK, so we have swap space and a fair amount of page cache
* pages. We use the recently rotated / recently scanned
* ratios to determine how valuable each cache is.
*
* Because workloads change over time (and to avoid overflow)
* we keep these statistics as a floating average, which ends
* up weighing recent references more than old ones.
*
* anon in [0], file in [1]
*/

spin_lock_irq(&mz->zone->lru_lock);
if (unlikely(reclaim_stat->recent_scanned[0] > anon / 4)) {
reclaim_stat->recent_scanned[0] /= 2;
reclaim_stat->recent_rotated[0] /= 2;
}
if (unlikely(reclaim_stat->recent_scanned[1] > file / 4)) {
reclaim_stat->recent_scanned[1] /= 2;
reclaim_stat->recent_rotated[1] /= 2;
}

/*
 * The amount of pressure on anon vs file pages is inversely
 * proportional to the fraction of recently scanned pages on
 * each list that were recently referenced and in active use.
 */
ap = (anon_prio + 1) * (reclaim_stat->recent_scanned[0] + 1);
ap /= reclaim_stat->recent_rotated[0] + 1;
fp = (file_prio + 1) * (reclaim_stat->recent_scanned[1] + 1);
fp /= reclaim_stat->recent_rotated[1] + 1;

spin_unlock_irq(&mz->zone->lru_lock);
fraction[0] = ap;
fraction[1] = fp;
denominator = ap + fp + 1;
out:
for_each_evictable_lru(lru) {
int file = is_file_lru(lru);
unsigned long scan;

scan = zone_nr_lru_pages(mz, lru);
if (priority || noswap) {
scan >>= priority;
if (!scan && force_scan)
scan = SWAP_CLUSTER_MAX;
scan = div64_u64(scan * fraction[file], denominator);
}
nr[lru] = scan;
}
}
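/*
 * Worked example (editor's illustration, not part of the original
 * source): with vm_swappiness == 60, anon_prio is 60 and file_prio is
 * 140. If recent_scanned/recent_rotated were 1000/100 for anon and
 * 1000/500 for file, then ap ~= 61*1001/101 ~= 604 and
 * fp ~= 141*1001/501 ~= 281, so roughly two thirds of the scan target
 * computed above would be aimed at the anon lists.
 */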
/* Use reclaim/compaction for costly allocs or under memory pressure */
static bool in_reclaim_compaction(int priority, struct scan_control *sc)
{
if (COMPACTION_BUILD && sc->order &&
(sc->order > PAGE_ALLOC_COSTLY_ORDER ||
priority < DEF_PRIORITY - 2))
return true;
return false;
}
/*
* Reclaim/compaction is used for high-order allocation requests. It reclaims
* order-0 pages before compacting the zone. should_continue_reclaim() returns
* true if more pages should be reclaimed such that when the page allocator
* calls try_to_compact_zone() that it will have enough free pages to succeed.
* It will give up earlier than that if there is difficulty reclaiming pages.
*/

static inline bool should_continue_reclaim(struct mem_cgroup_zone *mz,
unsigned long nr_reclaimed,
unsigned long nr_scanned,
int priority,
struct scan_control *sc)
{
unsigned long pages_for_compaction;
unsigned long inactive_lru_pages;
/* If not in reclaim/compaction mode, stop */
if (!in_reclaim_compaction(priority, sc))
return false;

/* Consider stopping depending on scan and reclaim activity */
if (sc->gfp_mask & __GFP_REPEAT) {
/*
* For __GFP_REPEAT allocations, stop reclaiming if the
* full LRU list has been scanned and we are still failing
* to reclaim pages. This full LRU scan is potentially
* expensive but a __GFP_REPEAT caller really wants to succeed
*/
if (!nr_reclaimed && !nr_scanned)
return false;
} else {
/*
* For non-__GFP_REPEAT allocations which can presumably
* fail without consequence, stop if we failed to reclaim
* any pages from the last SWAP_CLUSTER_MAX number of
* pages that were scanned. This will return to the
* caller faster, at the risk that reclaim/compaction and the
* resulting allocation attempt fail
*/
if (!nr_reclaimed)
return false;
}
/*
* If we have not reclaimed enough pages for compaction and the
* inactive lists are large enough, continue reclaiming
*/
pages_for_compaction = (2UL << sc->order);
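/*
 * Illustration (editor's note, not part of the original source): for a
 * THP-sized request (order 9 with 4K pages), 2UL << 9 is 1024 pages, so
 * reclaim continues until roughly 4MB of order-0 pages is free for
 * compaction, provided the inactive lists are still larger than that.
 */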

inactive_lru_pages = zone_nr_lru_pages(mz, LRU_INACTIVE_FILE);
if (nr_swap_pages > 0)

inactive_lru_pages += zone_nr_lru_pages(mz, LRU_INACTIVE_ANON);
if (sc->nr_reclaimed < pages_for_compaction &&
inactive_lru_pages > pages_for_compaction)
return true;
/* If compaction would go ahead or the allocation would succeed, stop */

switch (compaction_suitable(mz->zone, sc->order)) {
case COMPACT_PARTIAL:
case COMPACT_CONTINUE:
return false;
default:
return true;
}
}
/*
* This is a basic per-zone page freer. Used by both kswapd and direct reclaim.
*/

static void shrink_mem_cgroup_zone(int priority, struct mem_cgroup_zone *mz,
struct scan_control *sc)
{
unsigned long nr[NR_LRU_LISTS];
unsigned long nr_to_scan;
enum lru_list lru;
unsigned long nr_reclaimed, nr_scanned;
unsigned long nr_to_reclaim = sc->nr_to_reclaim;
restart:
nr_reclaimed = 0;
nr_scanned = sc->nr_scanned;

get_scan_count(mz, sc, nr, priority);
while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
nr[LRU_INACTIVE_FILE]) {
for_each_evictable_lru(lru) {
if (nr[lru]) {
nr_to_scan = min_t(unsigned long,
nr[lru], SWAP_CLUSTER_MAX);
nr[lru] -= nr_to_scan;

nr_reclaimed += shrink_list(lru, nr_to_scan,
mz, sc, priority);
}
}
/*
* On large memory systems, scan >> priority can become
* really large. This is fine for the starting priority;
* we want to put equal scanning pressure on each zone.
* However, if the VM has a harder time of freeing pages,
* with multiple processes reclaiming pages, the total
* freeing target can get unreasonably large.
*/
if (nr_reclaimed >= nr_to_reclaim && priority < DEF_PRIORITY)
break;
}

sc->nr_reclaimed += nr_reclaimed;
/*
* Even if we did not try to evict anon pages at all, we want to
* rebalance the anon lru active/inactive ratio.
*/

if (inactive_anon_is_low(mz))
shrink_active_list(SWAP_CLUSTER_MAX, mz, sc, priority, 0);
/* reclaim/compaction might need reclaim to continue */

if (should_continue_reclaim(mz, nr_reclaimed,
sc->nr_scanned - nr_scanned,
priority, sc))
goto restart;

throttle_vm_writeout(sc->gfp_mask);
}

static void shrink_zone(int priority, struct zone *zone,
struct scan_control *sc)
{
struct mem_cgroup *root = sc->target_mem_cgroup;
struct mem_cgroup_reclaim_cookie reclaim = {

.zone = zone,
.priority = priority,

};
struct mem_cgroup *memcg;
memcg = mem_cgroup_iter(root, NULL, &reclaim);
do {
struct mem_cgroup_zone mz = {
.mem_cgroup = memcg,
.zone = zone,
};

shrink_mem_cgroup_zone(priority, &mz, sc);
/*
* Limit reclaim has historically picked one memcg and
* scanned it with decreasing priority levels until
* nr_to_reclaim had been reclaimed. This priority
* cycle is thus over after a single memcg.
*
* Direct reclaim and kswapd, on the other hand, have
* to scan all memory cgroups to fulfill the overall
* scan target for the zone.
*/
if (!global_reclaim(sc)) {
mem_cgroup_iter_break(root, memcg);
break;
}
memcg = mem_cgroup_iter(root, memcg, &reclaim);
} while (memcg);

}

/* Returns true if compaction should go ahead for a high-order request */
static inline bool compaction_ready(struct zone *zone, struct scan_control *sc)
{
unsigned long balance_gap, watermark;
bool watermark_ok;
/* Do not consider compaction for orders reclaim is meant to satisfy */
if (sc->order <= PAGE_ALLOC_COSTLY_ORDER)
return false;
/*
* Compaction takes time to run and there are potentially other
* callers using the pages just freed. Continue reclaiming until
* there is a buffer of free pages available to give compaction
* a reasonable chance of completing and allocating the page
*/
balance_gap = min(low_wmark_pages(zone),
(zone->present_pages + KSWAPD_ZONE_BALANCE_GAP_RATIO-1) /
KSWAPD_ZONE_BALANCE_GAP_RATIO);
watermark = high_wmark_pages(zone) + balance_gap + (2UL << sc->order);
watermark_ok = zone_watermark_ok_safe(zone, 0, watermark, 0, 0);
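/*
 * Illustration (editor's note, not part of the original source): for an
 * order-4 request the extra headroom is 2UL << 4 == 32 pages on top of
 * the high watermark plus balance_gap, so compaction is only considered
 * "ready" once the zone is comfortably above its normal high watermark.
 */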
/*
* If compaction is deferred, reclaim up to a point where
* compaction will have a chance of success when re-enabled
*/
if (compaction_deferred(zone, sc->order))

return watermark_ok;
/* If compaction is not ready to start, keep reclaiming */
if (!compaction_suitable(zone, sc->order))
return false;
return watermark_ok;
}
/*
* This is the direct reclaim path, for page-allocating processes. We only