Newer
Older
* if it is of the appropriate PageActive status. Pages which are being
* freed elsewhere are also ignored.
*
* page: page to consider
* mode: one of the LRU isolation modes defined above
*
* returns 0 on success, -ve errno on failure.
*/
int __isolate_lru_page(struct page *page, isolate_mode_t mode)
{
int ret = -EINVAL;
/* Only take pages on the LRU. */
if (!PageLRU(page))
return ret;
/* Compaction should not handle unevictable pages but CMA can do so */
if (PageUnevictable(page) && !(mode & ISOLATE_UNEVICTABLE))
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
/*
* To minimise LRU disruption, the caller can indicate that it only
* wants to isolate pages it will be able to operate on without
* blocking - clean pages for the most part.
*
* ISOLATE_CLEAN means that only clean pages should be isolated. This
* is used by reclaim when it is cannot write to backing storage
*
* ISOLATE_ASYNC_MIGRATE is used to indicate that it only wants to pages
* that it is possible to migrate without blocking
*/
if (mode & (ISOLATE_CLEAN|ISOLATE_ASYNC_MIGRATE)) {
/* All the caller can do on PageWriteback is block */
if (PageWriteback(page))
return ret;
if (PageDirty(page)) {
struct address_space *mapping;
/* ISOLATE_CLEAN means only clean pages */
if (mode & ISOLATE_CLEAN)
return ret;
/*
* Only pages without mappings or that have a
* ->migratepage callback are possible to migrate
* without blocking
*/
mapping = page_mapping(page);
if (mapping && !mapping->a_ops->migratepage)
return ret;
}
}
if ((mode & ISOLATE_UNMAPPED) && page_mapped(page))
return ret;
if (likely(get_page_unless_zero(page))) {
/*
* Be careful not to clear PageLRU until after we're
* sure the page is not being freed elsewhere -- the
* page release code relies on it.
*/
ClearPageLRU(page);
ret = 0;
}
return ret;
}
/*
* zone->lru_lock is heavily contended. Some of the functions that
* shrink the lists perform better by taking out a batch of pages
* and working on them outside the LRU lock.
*
* For pagecache intensive workloads, this function is the hottest
* spot in the kernel (apart from copy_*_user functions).
*
* Appropriate locks must be held before calling this function.
*
* @nr_to_scan: The number of pages to look through on the list.
* @lruvec: The LRU vector to pull pages from.
* @nr_scanned: The number of pages that were scanned.
* @sc: The scan_control struct for this reclaim session
* @lru: LRU list id for isolating
*
* returns how many pages were moved onto *@dst.
*/
static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
struct lruvec *lruvec, struct list_head *dst,
unsigned long *nr_scanned, struct scan_control *sc,
isolate_mode_t mode, enum lru_list lru)
struct list_head *src = &lruvec->lists[lru];
for (scan = 0; scan < nr_to_scan && !list_empty(src); scan++) {
page = lru_to_page(src);
prefetchw_prev_lru_page(page, src, flags);
switch (__isolate_lru_page(page, mode)) {
nr_pages = hpage_nr_pages(page);
mem_cgroup_update_lru_size(lruvec, lru, -nr_pages);
break;
case -EBUSY:
/* else it is being freed elsewhere */
list_move(&page->lru, src);
continue;
trace_mm_vmscan_lru_isolate(sc->order, nr_to_scan, scan,
nr_taken, mode, is_file_lru(lru));
/**
* isolate_lru_page - tries to isolate a page from its LRU list
* @page: page to isolate from its LRU list
*
* Isolates a @page from an LRU list, clears PageLRU and adjusts the
* vmstat statistic corresponding to whatever LRU list the page was on.
*
* Returns 0 if the page was removed from an LRU list.
* Returns -EBUSY if the page was not on an LRU list.
*
* The returned page will have PageLRU() cleared. If it was found on
* the active list, it will have PageActive set. If it was found on
* the unevictable list, it will have the PageUnevictable bit set. That flag
* may need to be cleared by the caller before letting the page go.
*
* The vmstat statistic corresponding to the list on which the page was
* found will be decremented.
*
* Restrictions:
* (1) Must be called with an elevated refcount on the page. This is a
* fundamentnal difference from isolate_lru_pages (which is called
* without a stable reference).
* (2) the lru_lock must not be held.
* (3) interrupts must be enabled.
*/
int isolate_lru_page(struct page *page)
{
int ret = -EBUSY;
VM_BUG_ON(!page_count(page));
if (PageLRU(page)) {
struct zone *zone = page_zone(page);
spin_lock_irq(&zone->lru_lock);
lruvec = mem_cgroup_page_lruvec(page, zone);
if (PageLRU(page)) {
get_page(page);
del_page_from_lru_list(page, lruvec, lru);
ret = 0;
}
spin_unlock_irq(&zone->lru_lock);
}
return ret;
}
/*
* A direct reclaimer may isolate SWAP_CLUSTER_MAX pages from the LRU list and
* then get resheduled. When there are massive number of tasks doing page
* allocation, such sleeping direct reclaimers may keep piling up on each CPU,
* the LRU list will go small and be scanned faster than necessary, leading to
* unnecessary swapping, thrashing and OOM.
*/
static int too_many_isolated(struct zone *zone, int file,
struct scan_control *sc)
{
unsigned long inactive, isolated;
if (current_is_kswapd())
return 0;
if (!global_reclaim(sc))
return 0;
if (file) {
inactive = zone_page_state(zone, NR_INACTIVE_FILE);
isolated = zone_page_state(zone, NR_ISOLATED_FILE);
} else {
inactive = zone_page_state(zone, NR_INACTIVE_ANON);
isolated = zone_page_state(zone, NR_ISOLATED_ANON);
}
/*
* GFP_NOIO/GFP_NOFS callers are allowed to isolate more pages, so they
* won't get blocked by normal direct-reclaimers, forming a circular
* deadlock.
*/
if ((sc->gfp_mask & GFP_IOFS) == GFP_IOFS)
inactive >>= 3;
return isolated > inactive;
}
static noinline_for_stack void
putback_inactive_pages(struct lruvec *lruvec, struct list_head *page_list)
struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
struct zone *zone = lruvec_zone(lruvec);
/*
* Put back any unfreeable pages.
*/
while (!list_empty(page_list)) {
struct page *page = lru_to_page(page_list);
int lru;
VM_BUG_ON(PageLRU(page));
list_del(&page->lru);
if (unlikely(!page_evictable(page))) {
spin_unlock_irq(&zone->lru_lock);
putback_lru_page(page);
spin_lock_irq(&zone->lru_lock);
continue;
}
lruvec = mem_cgroup_page_lruvec(page, zone);
SetPageLRU(page);
lru = page_lru(page);
add_page_to_lru_list(page, lruvec, lru);
if (is_active_lru(lru)) {
int file = is_file_lru(lru);
int numpages = hpage_nr_pages(page);
reclaim_stat->recent_rotated[file] += numpages;
if (put_page_testzero(page)) {
__ClearPageLRU(page);
__ClearPageActive(page);
del_page_from_lru_list(page, lruvec, lru);
if (unlikely(PageCompound(page))) {
spin_unlock_irq(&zone->lru_lock);
(*get_compound_page_dtor(page))(page);
spin_lock_irq(&zone->lru_lock);
} else
list_add(&page->lru, &pages_to_free);
}
}
/*
* To save our caller's stack, now use input list for pages to free.
*/
list_splice(&pages_to_free, page_list);
}
* shrink_inactive_list() is a helper for shrink_zone(). It returns the number
* of reclaimed pages
static noinline_for_stack unsigned long
shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
struct scan_control *sc, enum lru_list lru)

Mel Gorman
committed
unsigned long nr_dirty = 0;
unsigned long nr_writeback = 0;
isolate_mode_t isolate_mode = 0;
int file = is_file_lru(lru);
struct zone *zone = lruvec_zone(lruvec);
struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
while (unlikely(too_many_isolated(zone, file, sc))) {
congestion_wait(BLK_RW_ASYNC, HZ/10);
/* We are about to die and free our memory. Return now. */
if (fatal_signal_pending(current))
return SWAP_CLUSTER_MAX;
}
if (!sc->may_unmap)
isolate_mode |= ISOLATE_UNMAPPED;
if (!sc->may_writepage)
isolate_mode |= ISOLATE_CLEAN;
nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &page_list,
&nr_scanned, sc, isolate_mode, lru);
__mod_zone_page_state(zone, NR_LRU_BASE + lru, -nr_taken);
__mod_zone_page_state(zone, NR_ISOLATED_ANON + file, nr_taken);
if (global_reclaim(sc)) {
zone->pages_scanned += nr_scanned;
if (current_is_kswapd())
__count_zone_vm_events(PGSCAN_KSWAPD, zone, nr_scanned);
__count_zone_vm_events(PGSCAN_DIRECT, zone, nr_scanned);
spin_unlock_irq(&zone->lru_lock);
return 0;

Minchan Kim
committed
nr_reclaimed = shrink_page_list(&page_list, zone, sc, TTU_UNMAP,
&nr_dirty, &nr_writeback, false);

Andy Whitcroft
committed
spin_lock_irq(&zone->lru_lock);
reclaim_stat->recent_scanned[file] += nr_taken;
if (global_reclaim(sc)) {
if (current_is_kswapd())
__count_zone_vm_events(PGSTEAL_KSWAPD, zone,
nr_reclaimed);
else
__count_zone_vm_events(PGSTEAL_DIRECT, zone,
nr_reclaimed);
}
putback_inactive_pages(lruvec, &page_list);
__mod_zone_page_state(zone, NR_ISOLATED_ANON + file, -nr_taken);
spin_unlock_irq(&zone->lru_lock);
free_hot_cold_page_list(&page_list, 1);

Mel Gorman
committed
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
/*
* If reclaim is isolating dirty pages under writeback, it implies
* that the long-lived page allocation rate is exceeding the page
* laundering rate. Either the global limits are not being effective
* at throttling processes due to the page distribution throughout
* zones or there is heavy usage of a slow backing device. The
* only option is to throttle from reclaim context which is not ideal
* as there is no guarantee the dirtying process is throttled in the
* same way balance_dirty_pages() manages.
*
* This scales the number of dirty pages that must be under writeback
* before throttling depending on priority. It is a simple backoff
* function that has the most effect in the range DEF_PRIORITY to
* DEF_PRIORITY-2 which is the priority reclaim is considered to be
* in trouble and reclaim is considered to be in trouble.
*
* DEF_PRIORITY 100% isolated pages must be PageWriteback to throttle
* DEF_PRIORITY-1 50% must be PageWriteback
* DEF_PRIORITY-2 25% must be PageWriteback, kswapd in trouble
* ...
* DEF_PRIORITY-6 For SWAP_CLUSTER_MAX isolated pages, throttle if any
* isolated page is PageWriteback
*/
if (nr_writeback && nr_writeback >=
(nr_taken >> (DEF_PRIORITY - sc->priority)))

Mel Gorman
committed
wait_iff_congested(zone, BLK_RW_ASYNC, HZ/10);

Mel Gorman
committed
/*
* Similarly, if many dirty pages are encountered that are not
* currently being written then flag that kswapd should start
* writing back pages.
*/
if (global_reclaim(sc) && nr_dirty &&
nr_dirty >= (nr_taken >> (DEF_PRIORITY - sc->priority)))
zone_set_flag(zone, ZONE_TAIL_LRU_DIRTY);
trace_mm_vmscan_lru_shrink_inactive(zone->zone_pgdat->node_id,
zone_idx(zone),
nr_scanned, nr_reclaimed,
sc->priority,
}
/*
* This moves pages from the active list to the inactive list.
*
* We move them the other way if the page is referenced by one or more
* processes, from rmap.
*
* If the pages are mostly unmapped, the processing is fast and it is
* appropriate to hold zone->lru_lock across the whole operation. But if
* the pages are mapped, the processing is slow (page_referenced()) so we
* should drop zone->lru_lock around each page. It's impossible to balance
* this, so instead we remove the pages from the LRU while processing them.
* It is safe to rely on PG_active against the non-LRU pages in here because
* nobody will play with that bit on a non-LRU page.
*
* The downside is that we have to touch page->_count against each page.
* But we had to alter page->flags anyway.
*/

KAMEZAWA Hiroyuki
committed
static void move_active_pages_to_lru(struct lruvec *lruvec,
struct list_head *list,
enum lru_list lru)
{
struct zone *zone = lruvec_zone(lruvec);
unsigned long pgmoved = 0;
struct page *page;
while (!list_empty(list)) {
page = lru_to_page(list);
lruvec = mem_cgroup_page_lruvec(page, zone);
VM_BUG_ON(PageLRU(page));
SetPageLRU(page);
nr_pages = hpage_nr_pages(page);
mem_cgroup_update_lru_size(lruvec, lru, nr_pages);
list_move(&page->lru, &lruvec->lists[lru]);
if (put_page_testzero(page)) {
__ClearPageLRU(page);
__ClearPageActive(page);
del_page_from_lru_list(page, lruvec, lru);
if (unlikely(PageCompound(page))) {
spin_unlock_irq(&zone->lru_lock);
(*get_compound_page_dtor(page))(page);
spin_lock_irq(&zone->lru_lock);
} else
list_add(&page->lru, pages_to_free);
}
}
__mod_zone_page_state(zone, NR_LRU_BASE + lru, pgmoved);
if (!is_active_lru(lru))
__count_vm_events(PGDEACTIVATE, pgmoved);
}

KAMEZAWA Hiroyuki
committed
static void shrink_active_list(unsigned long nr_to_scan,
struct lruvec *lruvec,

Johannes Weiner
committed
struct scan_control *sc,
enum lru_list lru)
unsigned long nr_taken;
LIST_HEAD(l_hold); /* The pages which were snipped off */
LIST_HEAD(l_active);
LIST_HEAD(l_inactive);
struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
unsigned long nr_rotated = 0;
isolate_mode_t isolate_mode = 0;
int file = is_file_lru(lru);
struct zone *zone = lruvec_zone(lruvec);
if (!sc->may_unmap)
isolate_mode |= ISOLATE_UNMAPPED;
if (!sc->may_writepage)
isolate_mode |= ISOLATE_CLEAN;
nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &l_hold,
&nr_scanned, sc, isolate_mode, lru);
if (global_reclaim(sc))
reclaim_stat->recent_scanned[file] += nr_taken;

KAMEZAWA Hiroyuki
committed
__count_zone_vm_events(PGREFILL, zone, nr_scanned);
__mod_zone_page_state(zone, NR_LRU_BASE + lru, -nr_taken);
__mod_zone_page_state(zone, NR_ISOLATED_ANON + file, nr_taken);
spin_unlock_irq(&zone->lru_lock);
while (!list_empty(&l_hold)) {
cond_resched();
page = lru_to_page(&l_hold);
list_del(&page->lru);
if (unlikely(!page_evictable(page))) {
putback_lru_page(page);
continue;
}

Mel Gorman
committed
if (unlikely(buffer_heads_over_limit)) {
if (page_has_private(page) && trylock_page(page)) {
if (page_has_private(page))
try_to_release_page(page, 0);
unlock_page(page);
}
}

Johannes Weiner
committed
if (page_referenced(page, 0, sc->target_mem_cgroup,
&vm_flags)) {
nr_rotated += hpage_nr_pages(page);
/*
* Identify referenced, file-backed active pages and
* give them one more trip around the active list. So
* that executable code get better chances to stay in
* memory under moderate memory pressure. Anon pages
* are not likely to be evicted by use-once streaming
* IO, plus JVM can create lots of anon VM_EXEC pages,
* so we ignore them here.
*/
if ((vm_flags & VM_EXEC) && page_is_file_cache(page)) {
list_add(&page->lru, &l_active);
continue;
}
}

KOSAKI Motohiro
committed
ClearPageActive(page); /* we are de-activating */
* Move pages back to the lru list.
spin_lock_irq(&zone->lru_lock);
* Count referenced pages from currently used mappings as rotated,
* even though only some of them are actually re-activated. This
* helps balance scan pressure between file and anonymous pages in
* get_scan_ratio.
reclaim_stat->recent_rotated[file] += nr_rotated;
move_active_pages_to_lru(lruvec, &l_active, &l_hold, lru);
move_active_pages_to_lru(lruvec, &l_inactive, &l_hold, lru - LRU_ACTIVE);
__mod_zone_page_state(zone, NR_ISOLATED_ANON + file, -nr_taken);
spin_unlock_irq(&zone->lru_lock);
free_hot_cold_page_list(&l_hold, 1);
#ifdef CONFIG_SWAP
static int inactive_anon_is_low_global(struct zone *zone)
{
unsigned long active, inactive;
active = zone_page_state(zone, NR_ACTIVE_ANON);
inactive = zone_page_state(zone, NR_INACTIVE_ANON);
if (inactive * zone->inactive_ratio < active)
return 1;
return 0;
}
/**
* inactive_anon_is_low - check if anonymous pages need to be deactivated
* @lruvec: LRU vector to check
*
* Returns true if the zone does not have enough inactive anon pages,
* meaning some active anon pages need to be deactivated.
*/
static int inactive_anon_is_low(struct lruvec *lruvec)
/*
* If we don't have swap space, anonymous page deactivation
* is pointless.
*/
if (!total_swap_pages)
return 0;
if (!mem_cgroup_disabled())
return mem_cgroup_inactive_anon_is_low(lruvec);

Johannes Weiner
committed
return inactive_anon_is_low_global(lruvec_zone(lruvec));
static inline int inactive_anon_is_low(struct lruvec *lruvec)
{
return 0;
}
#endif
/**
* inactive_file_is_low - check if file pages need to be deactivated
* @lruvec: LRU vector to check
*
* When the system is doing streaming IO, memory pressure here
* ensures that active file pages get deactivated, until more
* than half of the file pages are on the inactive list.
*
* Once we get to that situation, protect the system's working
* set from being evicted by disabling active file page aging.
*
* This uses a different ratio than the anonymous pages, because
* the page cache uses a use-once replacement algorithm.
*/
static int inactive_file_is_low(struct lruvec *lruvec)
unsigned long inactive;
unsigned long active;
inactive = get_lru_size(lruvec, LRU_INACTIVE_FILE);
active = get_lru_size(lruvec, LRU_ACTIVE_FILE);
return active > inactive;
static int inactive_list_is_low(struct lruvec *lruvec, enum lru_list lru)
return inactive_file_is_low(lruvec);
else
return inactive_anon_is_low(lruvec);
}
static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan,
struct lruvec *lruvec, struct scan_control *sc)
if (is_active_lru(lru)) {
shrink_active_list(nr_to_scan, lruvec, sc, lru);
return 0;
}
return shrink_inactive_list(nr_to_scan, lruvec, sc, lru);
static int vmscan_swappiness(struct scan_control *sc)
{
if (global_reclaim(sc))
return vm_swappiness;
return mem_cgroup_swappiness(sc->target_mem_cgroup);
}
enum scan_balance {
SCAN_EQUAL,
SCAN_FRACT,
SCAN_ANON,
SCAN_FILE,
};
/*
* Determine how aggressively the anon and file LRU lists should be
* scanned. The relative value of each set of LRU lists is determined
* by looking at the fraction of the pages scanned we did rotate back
* onto the active list instead of evict.
*
* nr[0] = anon inactive pages to scan; nr[1] = anon active pages to scan
* nr[2] = file inactive pages to scan; nr[3] = file active pages to scan
static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
unsigned long *nr)
struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
u64 fraction[2];
u64 denominator = 0; /* gcc */
struct zone *zone = lruvec_zone(lruvec);
unsigned long anon_prio, file_prio;
enum scan_balance scan_balance;
unsigned long anon, file, free;
bool force_scan = false;
/*
* If the zone or memcg is small, nr[l] can be 0. This
* results in no scanning on this priority and a potential
* priority drop. Global direct reclaim can go to the next
* zone and tends to have no problems. Global kswapd is for
* zone balancing and it needs to scan a minimum amount. When
* reclaiming for a memcg, a priority drop can cause high
* latencies, so it's better to scan a minimum amount there as
* well.
*/
if (current_is_kswapd() && zone->all_unreclaimable)
force_scan = true;
if (!global_reclaim(sc))
force_scan = true;
/* If we have no swap space, do not bother scanning anon pages. */
if (!sc->may_swap || (get_nr_swap_pages() <= 0)) {
/*
* Global reclaim will swap to prevent OOM even with no
* swappiness, but memcg users want to use this knob to
* disable swapping for individual groups completely when
* using the memory controller's swap limit feature would be
* too expensive.
*/
if (!global_reclaim(sc) && !vmscan_swappiness(sc)) {
goto out;
}
/*
* Do not apply any pressure balancing cleverness when the
* system is close to OOM, scan both anon and file equally
* (unless the swappiness setting disagrees with swapping).
*/
if (!sc->priority && vmscan_swappiness(sc)) {
goto out;
}
anon = get_lru_size(lruvec, LRU_ACTIVE_ANON) +
get_lru_size(lruvec, LRU_INACTIVE_ANON);
file = get_lru_size(lruvec, LRU_ACTIVE_FILE) +
get_lru_size(lruvec, LRU_INACTIVE_FILE);
/*
* If it's foreseeable that reclaiming the file cache won't be
* enough to get the zone back into a desirable shape, we have
* to swap. Better start now and leave the - probably heavily
* thrashing - remaining file pages alone.
*/
if (global_reclaim(sc)) {
free = zone_page_state(zone, NR_FREE_PAGES);
if (unlikely(file + free <= high_wmark_pages(zone))) {
/*
* There is enough inactive page cache, do not reclaim
* anything from the anonymous working set right now.
*/
if (!inactive_file_is_low(lruvec)) {
goto out;
}
scan_balance = SCAN_FRACT;
/*
* With swappiness at 100, anonymous and file have the same priority.
* This scanning priority is essentially the inverse of IO cost.
*/
anon_prio = vmscan_swappiness(sc);
/*
* OK, so we have swap space and a fair amount of page cache
* pages. We use the recently rotated / recently scanned
* ratios to determine how valuable each cache is.
*
* Because workloads change over time (and to avoid overflow)
* we keep these statistics as a floating average, which ends
* up weighing recent references more than old ones.
*
* anon in [0], file in [1]
*/
spin_lock_irq(&zone->lru_lock);
if (unlikely(reclaim_stat->recent_scanned[0] > anon / 4)) {
reclaim_stat->recent_scanned[0] /= 2;
reclaim_stat->recent_rotated[0] /= 2;
if (unlikely(reclaim_stat->recent_scanned[1] > file / 4)) {
reclaim_stat->recent_scanned[1] /= 2;
reclaim_stat->recent_rotated[1] /= 2;
* The amount of pressure on anon vs file pages is inversely
* proportional to the fraction of recently scanned pages on
* each list that were recently referenced and in active use.
ap = anon_prio * (reclaim_stat->recent_scanned[0] + 1);
ap /= reclaim_stat->recent_rotated[0] + 1;
fp = file_prio * (reclaim_stat->recent_scanned[1] + 1);
fp /= reclaim_stat->recent_rotated[1] + 1;
spin_unlock_irq(&zone->lru_lock);
fraction[0] = ap;
fraction[1] = fp;
denominator = ap + fp + 1;
out:
for_each_evictable_lru(lru) {
int file = is_file_lru(lru);
unsigned long size;
size = get_lru_size(lruvec, lru);
scan = size >> sc->priority;
if (!scan && force_scan)
scan = min(size, SWAP_CLUSTER_MAX);
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
switch (scan_balance) {
case SCAN_EQUAL:
/* Scan lists relative to size */
break;
case SCAN_FRACT:
/*
* Scan types proportional to swappiness and
* their relative recent reclaim efficiency.
*/
scan = div64_u64(scan * fraction[file], denominator);
break;
case SCAN_FILE:
case SCAN_ANON:
/* Scan one type exclusively */
if ((scan_balance == SCAN_FILE) != file)
scan = 0;
break;
default:
/* Look ma, no brain */
BUG();
}
/*
* This is a basic per-zone page freer. Used by both kswapd and direct reclaim.
*/
static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
{
unsigned long nr[NR_LRU_LISTS];
unsigned long targets[NR_LRU_LISTS];
unsigned long nr_to_scan;
enum lru_list lru;
unsigned long nr_reclaimed = 0;
unsigned long nr_to_reclaim = sc->nr_to_reclaim;
struct blk_plug plug;
bool scan_adjusted = false;
get_scan_count(lruvec, sc, nr);
/* Record the original scan target for proportional adjustments later */
memcpy(targets, nr, sizeof(nr));
blk_start_plug(&plug);
while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
nr[LRU_INACTIVE_FILE]) {
unsigned long nr_anon, nr_file, percentage;
unsigned long nr_scanned;
for_each_evictable_lru(lru) {
if (nr[lru]) {
nr_to_scan = min(nr[lru], SWAP_CLUSTER_MAX);
nr[lru] -= nr_to_scan;
nr_reclaimed += shrink_list(lru, nr_to_scan,
lruvec, sc);
}
}
if (nr_reclaimed < nr_to_reclaim || scan_adjusted)
continue;
* For global direct reclaim, reclaim only the number of pages
* requested. Less care is taken to scan proportionally as it
* is more important to minimise direct reclaim stall latency
* than it is to properly age the LRU lists.
if (global_reclaim(sc) && !current_is_kswapd())
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
/*
* For kswapd and memcg, reclaim at least the number of pages
* requested. Ensure that the anon and file LRUs shrink
* proportionally what was requested by get_scan_count(). We
* stop reclaiming one LRU and reduce the amount scanning
* proportional to the original scan target.
*/
nr_file = nr[LRU_INACTIVE_FILE] + nr[LRU_ACTIVE_FILE];
nr_anon = nr[LRU_INACTIVE_ANON] + nr[LRU_ACTIVE_ANON];
if (nr_file > nr_anon) {
unsigned long scan_target = targets[LRU_INACTIVE_ANON] +
targets[LRU_ACTIVE_ANON] + 1;
lru = LRU_BASE;
percentage = nr_anon * 100 / scan_target;
} else {
unsigned long scan_target = targets[LRU_INACTIVE_FILE] +
targets[LRU_ACTIVE_FILE] + 1;
lru = LRU_FILE;
percentage = nr_file * 100 / scan_target;
}
/* Stop scanning the smaller of the LRU */
nr[lru] = 0;
nr[lru + LRU_ACTIVE] = 0;
/*
* Recalculate the other LRU scan count based on its original
* scan target and the percentage scanning already complete
*/
lru = (lru == LRU_FILE) ? LRU_BASE : LRU_FILE;
nr_scanned = targets[lru] - nr[lru];
nr[lru] = targets[lru] * (100 - percentage) / 100;
nr[lru] -= min(nr[lru], nr_scanned);
lru += LRU_ACTIVE;
nr_scanned = targets[lru] - nr[lru];
nr[lru] = targets[lru] * (100 - percentage) / 100;
nr[lru] -= min(nr[lru], nr_scanned);
scan_adjusted = true;
}
blk_finish_plug(&plug);
sc->nr_reclaimed += nr_reclaimed;
/*
* Even if we did not try to evict anon pages at all, we want to
* rebalance the anon lru active/inactive ratio.
*/
if (inactive_anon_is_low(lruvec))
shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
sc, LRU_ACTIVE_ANON);
throttle_vm_writeout(sc->gfp_mask);
}
/* Use reclaim/compaction for costly allocs or under memory pressure */
static bool in_reclaim_compaction(struct scan_control *sc)
if (IS_ENABLED(CONFIG_COMPACTION) && sc->order &&
sc->priority < DEF_PRIORITY - 2))
return true;
return false;
}
/*
* Reclaim/compaction is used for high-order allocation requests. It reclaims
* order-0 pages before compacting the zone. should_continue_reclaim() returns
* true if more pages should be reclaimed such that when the page allocator
* calls try_to_compact_zone() that it will have enough free pages to succeed.
* It will give up earlier than that if there is difficulty reclaiming pages.
*/
static inline bool should_continue_reclaim(struct zone *zone,
unsigned long nr_reclaimed,
unsigned long nr_scanned,
struct scan_control *sc)
{
unsigned long pages_for_compaction;
unsigned long inactive_lru_pages;
/* If not in reclaim/compaction mode, stop */
if (!in_reclaim_compaction(sc))
return false;

Mel Gorman
committed
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
/* Consider stopping depending on scan and reclaim activity */
if (sc->gfp_mask & __GFP_REPEAT) {
/*
* For __GFP_REPEAT allocations, stop reclaiming if the
* full LRU list has been scanned and we are still failing
* to reclaim pages. This full LRU scan is potentially
* expensive but a __GFP_REPEAT caller really wants to succeed
*/
if (!nr_reclaimed && !nr_scanned)
return false;
} else {
/*
* For non-__GFP_REPEAT allocations which can presumably
* fail without consequence, stop if we failed to reclaim
* any pages from the last SWAP_CLUSTER_MAX number of
* pages that were scanned. This will return to the
* caller faster at the risk reclaim/compaction and
* the resulting allocation attempt fails
*/
if (!nr_reclaimed)
return false;
}
/*
* If we have not reclaimed enough pages for compaction and the
* inactive lists are large enough, continue reclaiming
*/
pages_for_compaction = (2UL << sc->order);
inactive_lru_pages = zone_page_state(zone, NR_INACTIVE_FILE);
inactive_lru_pages += zone_page_state(zone, NR_INACTIVE_ANON);
if (sc->nr_reclaimed < pages_for_compaction &&
inactive_lru_pages > pages_for_compaction)
return true;
/* If compaction would go ahead or the allocation would succeed, stop */