Newer
Older
/*
* linux/mm/vmscan.c
*
* Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
*
* Swap reorganised 29.12.95, Stephen Tweedie.
* kswapd added: 7.1.96 sct
* Removed kswapd_ctl limits, and swap out as many pages as needed
* to bring the system back to freepages.high: 2.4.97, Rik van Riel.
* Zone aware kswapd started 02/00, Kanoj Sarcar (kanoj@sgi.com).
* Multiqueue VM started 5.8.00, Rik van Riel.
*/
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/gfp.h>
#include <linux/kernel_stat.h>
#include <linux/swap.h>
#include <linux/pagemap.h>
#include <linux/init.h>
#include <linux/highmem.h>
#include <linux/file.h>
#include <linux/writeback.h>
#include <linux/blkdev.h>
#include <linux/buffer_head.h> /* for try_to_release_page(),
buffer_heads_over_limit */
#include <linux/mm_inline.h>
#include <linux/backing-dev.h>
#include <linux/rmap.h>
#include <linux/topology.h>
#include <linux/cpu.h>
#include <linux/cpuset.h>
#include <linux/compaction.h>
#include <linux/notifier.h>
#include <linux/rwsem.h>
#include <linux/delay.h>
#include <linux/kthread.h>
#include <linux/freezer.h>
#include <linux/memcontrol.h>
#include <linux/delayacct.h>
#include <linux/oom.h>
#include <asm/tlbflush.h>
#include <asm/div64.h>
#include <linux/swapops.h>
#include <linux/balloon_compaction.h>

Mel Gorman
committed
#define CREATE_TRACE_POINTS
#include <trace/events/vmscan.h>
struct scan_control {
/* Incremented by the number of inactive pages that were scanned */
unsigned long nr_scanned;
/* Number of pages freed so far during a call to shrink_zones() */
unsigned long nr_reclaimed;
/* How many pages shrink_list() should reclaim */
unsigned long nr_to_reclaim;
unsigned long hibernation_mode;
/* Can mapped pages be reclaimed? */
int may_unmap;
/* Can pages be swapped as part of reclaim? */
int may_swap;
/* Scan (total_size >> priority) pages at once */
int priority;

Johannes Weiner
committed
/*
* The memory cgroup that hit its limit and as a result is the
* primary target of this reclaim invocation.
*/
struct mem_cgroup *target_mem_cgroup;
/*
* Nodemask of nodes allowed by the caller. If NULL, all nodes
* are scanned.
*/
nodemask_t *nodemask;
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
};
#define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru))
#ifdef ARCH_HAS_PREFETCH
#define prefetch_prev_lru_page(_page, _base, _field) \
do { \
if ((_page)->lru.prev != _base) { \
struct page *prev; \
\
prev = lru_to_page(&(_page->lru)); \
prefetch(&prev->_field); \
} \
} while (0)
#else
#define prefetch_prev_lru_page(_page, _base, _field) do { } while (0)
#endif
#ifdef ARCH_HAS_PREFETCHW
#define prefetchw_prev_lru_page(_page, _base, _field) \
do { \
if ((_page)->lru.prev != _base) { \
struct page *prev; \
\
prev = lru_to_page(&(_page->lru)); \
prefetchw(&prev->_field); \
} \
} while (0)
#else
#define prefetchw_prev_lru_page(_page, _base, _field) do { } while (0)
#endif
/*
* From 0 .. 100. Higher means more swappy.
*/
int vm_swappiness = 60;
unsigned long vm_total_pages; /* The total number of pages which the VM controls */
static LIST_HEAD(shrinker_list);
static DECLARE_RWSEM(shrinker_rwsem);
static bool global_reclaim(struct scan_control *sc)
{

Johannes Weiner
committed
return !sc->target_mem_cgroup;

KAMEZAWA Hiroyuki
committed
#else
static bool global_reclaim(struct scan_control *sc)
{
return true;
}

KAMEZAWA Hiroyuki
committed
#endif
unsigned long zone_reclaimable_pages(struct zone *zone)
{
int nr;
nr = zone_page_state(zone, NR_ACTIVE_FILE) +
zone_page_state(zone, NR_INACTIVE_FILE);
if (get_nr_swap_pages() > 0)
nr += zone_page_state(zone, NR_ACTIVE_ANON) +
zone_page_state(zone, NR_INACTIVE_ANON);
return nr;
}
bool zone_reclaimable(struct zone *zone)
{
return zone->pages_scanned < zone_reclaimable_pages(zone) * 6;
}
static unsigned long get_lru_size(struct lruvec *lruvec, enum lru_list lru)
if (!mem_cgroup_disabled())
return mem_cgroup_get_lru_size(lruvec, lru);
return zone_page_state(lruvec_zone(lruvec), NR_LRU_BASE + lru);
* Add a shrinker callback to be called from the vm.
int register_shrinker(struct shrinker *shrinker)
size_t size = sizeof(*shrinker->nr_deferred);
/*
* If we only have one possible node in the system anyway, save
* ourselves the trouble and disable NUMA aware behavior. This way we
* will save memory and some small loop time later.
*/
if (nr_node_ids == 1)
shrinker->flags &= ~SHRINKER_NUMA_AWARE;
if (shrinker->flags & SHRINKER_NUMA_AWARE)
size *= nr_node_ids;
shrinker->nr_deferred = kzalloc(size, GFP_KERNEL);
if (!shrinker->nr_deferred)
return -ENOMEM;
down_write(&shrinker_rwsem);
list_add_tail(&shrinker->list, &shrinker_list);
up_write(&shrinker_rwsem);
EXPORT_SYMBOL(register_shrinker);
void unregister_shrinker(struct shrinker *shrinker)
{
down_write(&shrinker_rwsem);
list_del(&shrinker->list);
up_write(&shrinker_rwsem);
kfree(shrinker->nr_deferred);
EXPORT_SYMBOL(unregister_shrinker);
static unsigned long
shrink_slab_node(struct shrink_control *shrinkctl, struct shrinker *shrinker,
unsigned long nr_pages_scanned, unsigned long lru_pages)
{
unsigned long freed = 0;
unsigned long long delta;
long total_scan;
long max_pass;
long nr;
long new_nr;
int nid = shrinkctl->nid;
long batch_size = shrinker->batch ? shrinker->batch
: SHRINK_BATCH;
max_pass = shrinker->count_objects(shrinker, shrinkctl);
if (max_pass == 0)
return 0;
/*
* copy the current shrinker scan count into a local variable
* and zero it so that other concurrent shrinker invocations
* don't also do this scanning work.
*/
nr = atomic_long_xchg(&shrinker->nr_deferred[nid], 0);
total_scan = nr;
delta = (4 * nr_pages_scanned) / shrinker->seeks;
delta *= max_pass;
do_div(delta, lru_pages + 1);
total_scan += delta;
if (total_scan < 0) {
printk(KERN_ERR
"shrink_slab: %pF negative objects to delete nr=%ld\n",
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
total_scan = max_pass;
}
/*
* We need to avoid excessive windup on filesystem shrinkers
* due to large numbers of GFP_NOFS allocations causing the
* shrinkers to return -1 all the time. This results in a large
* nr being built up so when a shrink that can do some work
* comes along it empties the entire cache due to nr >>>
* max_pass. This is bad for sustaining a working set in
* memory.
*
* Hence only allow the shrinker to scan the entire cache when
* a large delta change is calculated directly.
*/
if (delta < max_pass / 4)
total_scan = min(total_scan, max_pass / 2);
/*
* Avoid risking looping forever due to too large nr value:
* never try to free more than twice the estimate number of
* freeable entries.
*/
if (total_scan > max_pass * 2)
total_scan = max_pass * 2;
trace_mm_shrink_slab_start(shrinker, shrinkctl, nr,
nr_pages_scanned, lru_pages,
max_pass, delta, total_scan);
/*
* Normally, we should not scan less than batch_size objects in one
* pass to avoid too frequent shrinker calls, but if the slab has less
* than batch_size objects in total and we are really tight on memory,
* we will try to reclaim all available objects, otherwise we can end
* up failing allocations although there are plenty of reclaimable
* objects spread over several slabs with usage less than the
* batch_size.
*
* We detect the "tight on memory" situations by looking at the total
* number of objects we want to scan (total_scan). If it is greater
* than the total number of objects on slab (max_pass), we must be
* scanning at high prio and therefore should try to reclaim as much as
* possible.
*/
while (total_scan >= batch_size ||
total_scan >= max_pass) {
unsigned long nr_to_scan = min(batch_size, total_scan);
shrinkctl->nr_to_scan = nr_to_scan;
ret = shrinker->scan_objects(shrinker, shrinkctl);
if (ret == SHRINK_STOP)
break;
freed += ret;
count_vm_events(SLABS_SCANNED, nr_to_scan);
total_scan -= nr_to_scan;
cond_resched();
}
/*
* move the unused scan count back into the shrinker in a
* manner that handles concurrent updates. If we exhausted the
* scan, there is no need to do an update.
*/
if (total_scan > 0)
new_nr = atomic_long_add_return(total_scan,
&shrinker->nr_deferred[nid]);
else
new_nr = atomic_long_read(&shrinker->nr_deferred[nid]);
trace_mm_shrink_slab_end(shrinker, freed, nr, new_nr);
return freed;
/*
* Call the shrink functions to age shrinkable caches
*
* Here we assume it costs one seek to replace a lru page and that it also
* takes a seek to recreate a cache object. With this in mind we age equal
* percentages of the lru and ageable caches. This should balance the seeks
* generated by these structures.
*
* If the vm encountered mapped pages on the LRU it increase the pressure on
* slab to avoid swapping.
*
* We do weird things to avoid (scanned*seeks*entries) overflowing 32 bits.
*
* `lru_pages' represents the number of on-LRU pages in all the zones which
* are eligible for the caller's allocation attempt. It is used for balancing
* slab reclaim versus page reclaim.
*
* Returns the number of slab objects which we shrunk.
unsigned long shrink_slab(struct shrink_control *shrinkctl,
unsigned long nr_pages_scanned,
unsigned long lru_pages)
if (nr_pages_scanned == 0)
nr_pages_scanned = SWAP_CLUSTER_MAX;
if (!down_read_trylock(&shrinker_rwsem)) {
/*
* If we would return 0, our callers would understand that we
* have nothing else to shrink and give up trying. By returning
* 1 we keep it going and assume we'll be able to shrink next
* time.
*/
freed = 1;
goto out;
}
list_for_each_entry(shrinker, &shrinker_list, list) {
if (!(shrinker->flags & SHRINKER_NUMA_AWARE)) {
shrinkctl->nid = 0;
freed += shrink_slab_node(shrinkctl, shrinker,
nr_pages_scanned, lru_pages);
continue;
}
for_each_node_mask(shrinkctl->nid, shrinkctl->nodes_to_scan) {
if (node_online(shrinkctl->nid))
freed += shrink_slab_node(shrinkctl, shrinker,
nr_pages_scanned, lru_pages);
out:
cond_resched();
}
static inline int is_page_cache_freeable(struct page *page)
{
/*
* A freeable page cache page is referenced only by the caller
* that isolated the page, the page cache radix tree and
* optional buffer heads at page->private.
*/
return page_count(page) - page_has_private(page) == 2;

KOSAKI Motohiro
committed
static int may_write_to_queue(struct backing_dev_info *bdi,
struct scan_control *sc)
if (current->flags & PF_SWAPWRITE)
return 1;
if (!bdi_write_congested(bdi))
return 1;
if (bdi == current->backing_dev_info)
return 1;
return 0;
}
/*
* We detected a synchronous write error writing a page out. Probably
* -ENOSPC. We need to propagate that into the address_space for a subsequent
* fsync(), msync() or close().
*
* The tricky part is that after writepage we cannot touch the mapping: nothing
* prevents it from being freed up. But we have a ref on the page and once
* that page is locked, the mapping is pinned.
*
* We're allowed to run sleeping lock_page() here because we know the caller has
* __GFP_FS.
*/
static void handle_write_error(struct address_space *mapping,
struct page *page, int error)
{
if (page_mapping(page) == mapping)
mapping_set_error(mapping, error);
/* possible outcome of pageout() */
typedef enum {
/* failed to write page out, page is locked */
PAGE_KEEP,
/* move page to the active list, page is locked */
PAGE_ACTIVATE,
/* page has been sent to the disk successfully, page is unlocked */
PAGE_SUCCESS,
/* page is clean and locked */
PAGE_CLEAN,
} pageout_t;
* pageout is called by shrink_page_list() for each dirty page.
* Calls ->writepage().

Andy Whitcroft
committed
static pageout_t pageout(struct page *page, struct address_space *mapping,

KOSAKI Motohiro
committed
struct scan_control *sc)
{
/*
* If the page is dirty, only perform writeback if that write
* will be non-blocking. To prevent this allocation from being
* stalled by pagecache activity. But note that there may be
* stalls if we need to run get_block(). We could test
* PagePrivate for that.
*
* If this process is currently in __generic_file_aio_write() against
* this page's queue, we can perform writeback even if that
* will block.
*
* If the page is swapcache, write it back even if that would
* block, for some throttling. This happens by accident, because
* swap_backing_dev_info is bust: it doesn't reflect the
* congestion state of the swapdevs. Easy to fix, if needed.
*/
if (!is_page_cache_freeable(page))
return PAGE_KEEP;
if (!mapping) {
/*
* Some data journaling orphaned pages can have
* page->mapping == NULL while being dirty with clean buffers.
*/
if (page_has_private(page)) {
if (try_to_free_buffers(page)) {
ClearPageDirty(page);
printk("%s: orphaned page\n", __func__);
return PAGE_CLEAN;
}
}
return PAGE_KEEP;
}
if (mapping->a_ops->writepage == NULL)
return PAGE_ACTIVATE;

Mel Gorman
committed
if (!may_write_to_queue(mapping->backing_dev_info, sc))
return PAGE_KEEP;
if (clear_page_dirty_for_io(page)) {
int res;
struct writeback_control wbc = {
.sync_mode = WB_SYNC_NONE,
.nr_to_write = SWAP_CLUSTER_MAX,
.range_start = 0,
.range_end = LLONG_MAX,
.for_reclaim = 1,
};
SetPageReclaim(page);
res = mapping->a_ops->writepage(page, &wbc);
if (res < 0)
handle_write_error(mapping, page, res);
if (res == AOP_WRITEPAGE_ACTIVATE) {
ClearPageReclaim(page);
return PAGE_ACTIVATE;
}

Andy Whitcroft
committed
if (!PageWriteback(page)) {
/* synchronous write or broken a_ops? */
ClearPageReclaim(page);
}
trace_mm_vmscan_writepage(page, trace_reclaim_flags(page));
inc_zone_page_state(page, NR_VMSCAN_WRITE);
return PAGE_SUCCESS;
}
return PAGE_CLEAN;
}
* Same as remove_mapping, but if the page is removed from the mapping, it
* gets returned with a refcount of 0.
static int __remove_mapping(struct address_space *mapping, struct page *page)
BUG_ON(!PageLocked(page));
BUG_ON(mapping != page_mapping(page));
* The non racy check for a busy page.
*
* Must be careful with the order of the tests. When someone has
* a ref to the page, it may be possible that they dirty it then
* drop the reference. So if PageDirty is tested before page_count
* here, then the following race may occur:
*
* get_user_pages(&page);
* [user mapping goes away]
* write_to(page);
* !PageDirty(page) [good]
* SetPageDirty(page);
* put_page(page);
* !page_count(page) [good, discard it]
*
* [oops, our write_to data is lost]
*
* Reversing the order of the tests ensures such a situation cannot
* escape unnoticed. The smp_rmb is needed to ensure the page->flags
* load is not satisfied before that of page->_count.
*
* Note that if SetPageDirty is always performed via set_page_dirty,
* and thus under tree_lock, then this ordering is not required.
goto cannot_free;
/* note: atomic_cmpxchg in page_freeze_refs provides the smp_rmb */
if (unlikely(PageDirty(page))) {
page_unfreeze_refs(page, 2);
goto cannot_free;
if (PageSwapCache(page)) {
swp_entry_t swap = { .val = page_private(page) };
__delete_from_swap_cache(page);
swapcache_free(swap, page);
void (*freepage)(struct page *);
freepage = mapping->a_ops->freepage;
mem_cgroup_uncharge_cache_page(page);
if (freepage != NULL)
freepage(page);
}
return 1;
cannot_free:
return 0;
}
/*
* Attempt to detach a locked page from its ->mapping. If it is dirty or if
* someone else has a ref on the page, abort and return 0. If it was
* successfully detached, return 1. Assumes the caller has a single ref on
* this page.
*/
int remove_mapping(struct address_space *mapping, struct page *page)
{
if (__remove_mapping(mapping, page)) {
/*
* Unfreezing the refcount with 1 rather than 2 effectively
* drops the pagecache ref for us without requiring another
* atomic operation.
*/
page_unfreeze_refs(page, 1);
return 1;
}
return 0;
}
/**
* putback_lru_page - put previously isolated page onto appropriate LRU list
* @page: page to be put back to appropriate lru list
*
* Add previously isolated @page to appropriate LRU list.
* Page may still be unevictable for other reasons.
*
* lru_lock must not be held, interrupts must be enabled.
*/
void putback_lru_page(struct page *page)
{
bool is_unevictable;
int was_unevictable = PageUnevictable(page);
VM_BUG_ON_PAGE(PageLRU(page), page);
redo:
ClearPageUnevictable(page);
/*
* For evictable pages, we can use the cache.
* In event of a race, worst case is we end up with an
* unevictable page on [in]active list.
* We know how to handle that.
*/
is_unevictable = false;
lru_cache_add(page);
} else {
/*
* Put unevictable pages directly on zone's unevictable
* list.
*/
is_unevictable = true;
* When racing with an mlock or AS_UNEVICTABLE clearing
* (page is unlocked) make sure that if the other thread
* does not observe our setting of PG_lru and fails
* isolation/check_move_unevictable_pages,
* we see PG_mlocked/AS_UNEVICTABLE cleared below and move
* the page back to the evictable list.
*
* The other side is TestClearPageMlocked() or shmem_lock().
}
/*
* page's status can change while we move it among lru. If an evictable
* page is on unevictable list, it never be freed. To avoid that,
* check after we added it to the list, again.
*/
if (is_unevictable && page_evictable(page)) {
if (!isolate_lru_page(page)) {
put_page(page);
goto redo;
}
/* This means someone else dropped this page from LRU
* So, it will be freed or putback to LRU again. There is
* nothing to do here.
*/
}
if (was_unevictable && !is_unevictable)
count_vm_event(UNEVICTABLE_PGRESCUED);
else if (!was_unevictable && is_unevictable)
count_vm_event(UNEVICTABLE_PGCULLED);
put_page(page); /* drop ref from isolate */
}
enum page_references {
PAGEREF_RECLAIM,
PAGEREF_RECLAIM_CLEAN,
PAGEREF_ACTIVATE,
};
static enum page_references page_check_references(struct page *page,
struct scan_control *sc)
{
int referenced_ptes, referenced_page;

Johannes Weiner
committed
referenced_ptes = page_referenced(page, 1, sc->target_mem_cgroup,
&vm_flags);
referenced_page = TestClearPageReferenced(page);
/*
* Mlock lost the isolation race with us. Let try_to_unmap()
* move the page to the unevictable list.
*/
if (vm_flags & VM_LOCKED)
return PAGEREF_RECLAIM;
if (referenced_ptes) {
if (PageSwapBacked(page))
return PAGEREF_ACTIVATE;
/*
* All mapped pages start out with page table
* references from the instantiating fault, so we need
* to look twice if a mapped file page is used more
* than once.
*
* Mark it and spare it for another trip around the
* inactive list. Another page table reference will
* lead to its activation.
*
* Note: the mark is set for activated pages as well
* so that recently deactivated but used pages are
* quickly recovered.
*/
SetPageReferenced(page);
if (referenced_page || referenced_ptes > 1)
return PAGEREF_ACTIVATE;
/*
* Activate file-backed executable pages after first usage.
*/
if (vm_flags & VM_EXEC)
return PAGEREF_ACTIVATE;
return PAGEREF_KEEP;
}
/* Reclaim if clean, defer dirty pages to writeback */
if (referenced_page && !PageSwapBacked(page))
return PAGEREF_RECLAIM_CLEAN;
return PAGEREF_RECLAIM;

Mel Gorman
committed
/* Check if a page is dirty or under writeback */
static void page_check_dirty_writeback(struct page *page,
bool *dirty, bool *writeback)
{
struct address_space *mapping;

Mel Gorman
committed
/*
* Anonymous pages are not handled by flushers and must be written
* from reclaim context. Do not stall reclaim based on them
*/
if (!page_is_file_cache(page)) {
*dirty = false;
*writeback = false;
return;
}
/* By default assume that the page flags are accurate */
*dirty = PageDirty(page);
*writeback = PageWriteback(page);
/* Verify dirty/writeback state if the filesystem supports it */
if (!page_has_private(page))
return;
mapping = page_mapping(page);
if (mapping && mapping->a_ops->is_dirty_writeback)
mapping->a_ops->is_dirty_writeback(page, dirty, writeback);

Mel Gorman
committed
}
* shrink_page_list() returns the number of reclaimed pages
static unsigned long shrink_page_list(struct list_head *page_list,
struct zone *zone,

Mel Gorman
committed
struct scan_control *sc,

Minchan Kim
committed
enum ttu_flags ttu_flags,
unsigned long *ret_nr_dirty,

Mel Gorman
committed
unsigned long *ret_nr_unqueued_dirty,
unsigned long *ret_nr_congested,

Minchan Kim
committed
unsigned long *ret_nr_writeback,
unsigned long *ret_nr_immediate,

Minchan Kim
committed
bool force_reclaim)
LIST_HEAD(free_pages);

Mel Gorman
committed
unsigned long nr_unqueued_dirty = 0;

Mel Gorman
committed
unsigned long nr_dirty = 0;
unsigned long nr_congested = 0;

Mel Gorman
committed
unsigned long nr_writeback = 0;
unsigned long nr_immediate = 0;
mem_cgroup_uncharge_start();
while (!list_empty(page_list)) {
struct address_space *mapping;
struct page *page;
int may_enter_fs;

Minchan Kim
committed
enum page_references references = PAGEREF_RECLAIM_CLEAN;

Mel Gorman
committed
bool dirty, writeback;
cond_resched();
page = lru_to_page(page_list);
list_del(&page->lru);
VM_BUG_ON_PAGE(PageActive(page), page);
VM_BUG_ON_PAGE(page_zone(page) != zone, page);

Christoph Lameter
committed
if (unlikely(!page_evictable(page)))
if (!sc->may_unmap && page_mapped(page))

Christoph Lameter
committed
goto keep_locked;
/* Double the slab pressure for mapped and swapcache pages */
if (page_mapped(page) || PageSwapCache(page))
sc->nr_scanned++;

Andy Whitcroft
committed
may_enter_fs = (sc->gfp_mask & __GFP_FS) ||
(PageSwapCache(page) && (sc->gfp_mask & __GFP_IO));

Mel Gorman
committed
/*
* The number of dirty pages determines if a zone is marked
* reclaim_congested which affects wait_iff_congested. kswapd
* will stall and start writing pages if the tail of the LRU
* is all dirty unqueued pages.
*/
page_check_dirty_writeback(page, &dirty, &writeback);
if (dirty || writeback)
nr_dirty++;
if (dirty && !writeback)
nr_unqueued_dirty++;
/*
* Treat this page as congested if the underlying BDI is or if
* pages are cycling through the LRU so quickly that the
* pages marked for immediate reclaim are making it to the
* end of the LRU a second time.
*/

Mel Gorman
committed
mapping = page_mapping(page);
if ((mapping && bdi_write_congested(mapping->backing_dev_info)) ||
(writeback && PageReclaim(page)))

Mel Gorman
committed
nr_congested++;
/*
* If a page at the tail of the LRU is under writeback, there
* are three cases to consider.
*
* 1) If reclaim is encountering an excessive number of pages
* under writeback and this page is both under writeback and
* PageReclaim then it indicates that pages are being queued
* for IO but are being recycled through the LRU before the
* IO can complete. Waiting on the page itself risks an
* indefinite stall if it is impossible to writeback the
* page due to IO error or disconnected storage so instead
* note that the LRU is being scanned too quickly and the
* caller can stall after page list has been processed.
*
* 2) Global reclaim encounters a page, memcg encounters a
* page that is not marked for immediate reclaim or
* the caller does not have __GFP_IO. In this case mark
* the page for immediate reclaim and continue scanning.
*
* __GFP_IO is checked because a loop driver thread might
* enter reclaim, and deadlock if it waits on a page for
* which it is needed to do the write (loop masks off
* __GFP_IO|__GFP_FS for this reason); but more thought
* would probably show more reasons.
*
* Don't require __GFP_FS, since we're not going into the
* FS, just waiting on its writeback completion. Worryingly,
* ext4 gfs2 and xfs allocate pages with
* grab_cache_page_write_begin(,,AOP_FLAG_NOFS), so testing
* may_enter_fs here is liable to OOM on them.
*
* 3) memcg encounters a page that is not already marked
* PageReclaim. memcg does not have any dirty pages
* throttling so we could easily OOM just because too many
* pages are in writeback and there is nothing else to
* reclaim. Wait for the writeback to complete.
*/

Andy Whitcroft
committed
if (PageWriteback(page)) {
/* Case 1 above */
if (current_is_kswapd() &&
PageReclaim(page) &&
zone_is_reclaim_writeback(zone)) {
nr_immediate++;
goto keep_locked;
/* Case 2 above */
} else if (global_reclaim(sc) ||
!PageReclaim(page) || !(sc->gfp_mask & __GFP_IO)) {
/*
* This is slightly racy - end_page_writeback()
* might have just cleared PageReclaim, then
* setting PageReclaim here end up interpreted
* as PageReadahead - but that does not matter
* enough to care. What we do want is for this
* page to have PageReclaim set next time memcg
* reclaim reaches the tests above, so it will
* then wait_on_page_writeback() to avoid OOM;
* and it's also appropriate in global reclaim.
*/
SetPageReclaim(page);
/* Case 3 above */
} else {
wait_on_page_writeback(page);

Andy Whitcroft
committed
}

Minchan Kim
committed
if (!force_reclaim)
references = page_check_references(page, sc);
switch (references) {
case PAGEREF_ACTIVATE:
case PAGEREF_KEEP:
goto keep_locked;
case PAGEREF_RECLAIM:
case PAGEREF_RECLAIM_CLEAN:
; /* try to reclaim the page below */
}
/*
* Anonymous process memory has backing store?
* Try to allocate it some swap space here.
*/
if (PageAnon(page) && !PageSwapCache(page)) {
if (!(sc->gfp_mask & __GFP_IO))
goto keep_locked;
if (!add_to_swap(page, page_list))

Mel Gorman
committed
/* Adding to swap updated mapping */
mapping = page_mapping(page);
}
/*
* The page is mapped into the page tables of one or more
* processes. Try to unmap it here.
*/
if (page_mapped(page) && mapping) {

Minchan Kim
committed
switch (try_to_unmap(page, ttu_flags)) {
case SWAP_FAIL:
goto activate_locked;
case SWAP_AGAIN:
goto keep_locked;
case SWAP_MLOCK:
goto cull_mlocked;
case SWAP_SUCCESS:
; /* try to free the page below */
}
}
if (PageDirty(page)) {
/*
* Only kswapd can writeback filesystem pages to

Mel Gorman
committed
* avoid risk of stack overflow but only writeback
* if many dirty pages have been encountered.

Mel Gorman
committed
if (page_is_file_cache(page) &&
(!current_is_kswapd() ||

Mel Gorman
committed
!zone_is_reclaim_dirty(zone))) {

Mel Gorman
committed
/*
* Immediately reclaim when written back.
* Similar in principal to deactivate_page()
* except we already have the page isolated
* and know it's dirty
*/
inc_zone_page_state(page, NR_VMSCAN_IMMEDIATE);
SetPageReclaim(page);
goto keep_locked;
}
if (references == PAGEREF_RECLAIM_CLEAN)
if (!may_enter_fs)
if (!sc->may_writepage)
goto keep_locked;
/* Page is dirty, try to write it out here */