    /*
     *  linux/mm/vmscan.c
     *
     *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
     *
     *  Swap reorganised 29.12.95, Stephen Tweedie.
     *  kswapd added: 7.1.96  sct
     *  Removed kswapd_ctl limits, and swap out as many pages as needed
     *  to bring the system back to freepages.high: 2.4.97, Rik van Riel.
     *  Zone aware kswapd started 02/00, Kanoj Sarcar (kanoj@sgi.com).
     *  Multiqueue VM started 5.8.00, Rik van Riel.
     */
    
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/gfp.h>
#include <linux/kernel_stat.h>
#include <linux/swap.h>
#include <linux/pagemap.h>
#include <linux/init.h>
#include <linux/highmem.h>
#include <linux/vmpressure.h>
#include <linux/vmstat.h>
#include <linux/file.h>
#include <linux/writeback.h>
#include <linux/blkdev.h>
#include <linux/buffer_head.h>	/* for try_to_release_page(),
					buffer_heads_over_limit */
#include <linux/mm_inline.h>
#include <linux/backing-dev.h>
#include <linux/rmap.h>
#include <linux/topology.h>
#include <linux/cpu.h>
#include <linux/cpuset.h>
#include <linux/compaction.h>
#include <linux/notifier.h>
#include <linux/rwsem.h>
#include <linux/delay.h>
#include <linux/kthread.h>
#include <linux/freezer.h>
#include <linux/memcontrol.h>
#include <linux/delayacct.h>
#include <linux/sysctl.h>
#include <linux/oom.h>
#include <linux/prefetch.h>

#include <asm/tlbflush.h>
#include <asm/div64.h>

#include <linux/swapops.h>

#include "internal.h"

#define CREATE_TRACE_POINTS
#include <trace/events/vmscan.h>
    
    
    struct scan_control {
    	/* Incremented by the number of inactive pages that were scanned */
    	unsigned long nr_scanned;
    
    
    	/* Number of pages freed so far during a call to shrink_zones() */
    	unsigned long nr_reclaimed;
    
    
    	/* How many pages shrink_list() should reclaim */
    	unsigned long nr_to_reclaim;
    
    
    	unsigned long hibernation_mode;
    
    
	/* This context's GFP mask */
	gfp_t gfp_mask;

	int may_writepage;
    
    
    	/* Can mapped pages be reclaimed? */
    	int may_unmap;
    
    	/* Can pages be swapped as part of reclaim? */
    	int may_swap;
    
    
    	int order;
    
    	/* Scan (total_size >> priority) pages at once */
    	int priority;
    
    
    	/*
    	 * The memory cgroup that hit its limit and as a result is the
    	 * primary target of this reclaim invocation.
    	 */
    	struct mem_cgroup *target_mem_cgroup;
    
    	/*
    	 * Nodemask of nodes allowed by the caller. If NULL, all nodes
    	 * are scanned.
    	 */
    	nodemask_t	*nodemask;
    
    };
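
/*
 * Illustrative initializer (not from the original source; field values are
 * indicative only): a global reclaim caller such as try_to_free_pages()
 * fills this structure roughly as follows:
 *
 *	struct scan_control sc = {
 *		.gfp_mask	= gfp_mask,
 *		.may_writepage	= !laptop_mode,
 *		.nr_to_reclaim	= SWAP_CLUSTER_MAX,
 *		.may_unmap	= 1,
 *		.may_swap	= 1,
 *		.order		= order,
 *		.priority	= DEF_PRIORITY,
 *		.target_mem_cgroup = NULL,
 *		.nodemask	= nodemask,
 *	};
 */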
    
    #define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru))
    
    #ifdef ARCH_HAS_PREFETCH
    #define prefetch_prev_lru_page(_page, _base, _field)			\
    	do {								\
    		if ((_page)->lru.prev != _base) {			\
    			struct page *prev;				\
    									\
    			prev = lru_to_page(&(_page->lru));		\
    			prefetch(&prev->_field);			\
    		}							\
    	} while (0)
    #else
    #define prefetch_prev_lru_page(_page, _base, _field) do { } while (0)
    #endif
    
    #ifdef ARCH_HAS_PREFETCHW
    #define prefetchw_prev_lru_page(_page, _base, _field)			\
    	do {								\
    		if ((_page)->lru.prev != _base) {			\
    			struct page *prev;				\
    									\
    			prev = lru_to_page(&(_page->lru));		\
    			prefetchw(&prev->_field);			\
    		}							\
    	} while (0)
    #else
    #define prefetchw_prev_lru_page(_page, _base, _field) do { } while (0)
    #endif
    
    /*
     * From 0 .. 100.  Higher means more swappy.
     */
    int vm_swappiness = 60;
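/*
 * Illustrative usage (not from the original source): this value is exposed
 * as the vm.swappiness sysctl; e.g. "sysctl vm.swappiness=10" biases reclaim
 * toward dropping page cache rather than swapping out anonymous pages.
 */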
    
    unsigned long vm_total_pages;	/* The total number of pages which the VM controls */
    
    
    static LIST_HEAD(shrinker_list);
    static DECLARE_RWSEM(shrinker_rwsem);
    
    
#ifdef CONFIG_MEMCG
static bool global_reclaim(struct scan_control *sc)
{
	return !sc->target_mem_cgroup;
}
#else
static bool global_reclaim(struct scan_control *sc)
{
	return true;
}
#endif

static unsigned long get_lru_size(struct lruvec *lruvec, enum lru_list lru)
{
	if (!mem_cgroup_disabled())
		return mem_cgroup_get_lru_size(lruvec, lru);

	return zone_page_state(lruvec_zone(lruvec), NR_LRU_BASE + lru);
}

    /*
     * Add a shrinker callback to be called from the vm
     */
    
void register_shrinker(struct shrinker *shrinker)
{
	atomic_long_set(&shrinker->nr_in_batch, 0);
	down_write(&shrinker_rwsem);
	list_add_tail(&shrinker->list, &shrinker_list);
	up_write(&shrinker_rwsem);
}
    
    EXPORT_SYMBOL(register_shrinker);
    
    
    /*
     * Remove one
     */
    
void unregister_shrinker(struct shrinker *shrinker)
{
    	down_write(&shrinker_rwsem);
    	list_del(&shrinker->list);
    	up_write(&shrinker_rwsem);
    }
    
    EXPORT_SYMBOL(unregister_shrinker);
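
/*
 * Illustrative caller (hypothetical names, not from the original source):
 * a cache owner declares a shrinker with a ->shrink callback and a seek
 * cost, then pairs register/unregister around the cache lifetime:
 *
 *	static struct shrinker foo_shrinker = {
 *		.shrink	= foo_cache_shrink,
 *		.seeks	= DEFAULT_SEEKS,
 *	};
 *
 *	register_shrinker(&foo_shrinker);
 *	...
 *	unregister_shrinker(&foo_shrinker);
 */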
    
    
    
    static inline int do_shrinker_shrink(struct shrinker *shrinker,
    				     struct shrink_control *sc,
    				     unsigned long nr_to_scan)
    {
    	sc->nr_to_scan = nr_to_scan;
    	return (*shrinker->shrink)(shrinker, sc);
    }
    
    
    #define SHRINK_BATCH 128
    /*
     * Call the shrink functions to age shrinkable caches
     *
     * Here we assume it costs one seek to replace a lru page and that it also
     * takes a seek to recreate a cache object.  With this in mind we age equal
     * percentages of the lru and ageable caches.  This should balance the seeks
     * generated by these structures.
     *
    
    Simon Arlott's avatar
    Simon Arlott committed
     * If the vm encountered mapped pages on the LRU it increase the pressure on
    
    Linus Torvalds's avatar
    Linus Torvalds committed
     * slab to avoid swapping.
     *
     * We do weird things to avoid (scanned*seeks*entries) overflowing 32 bits.
     *
     * `lru_pages' represents the number of on-LRU pages in all the zones which
     * are eligible for the caller's allocation attempt.  It is used for balancing
     * slab reclaim versus page reclaim.
    
     *
     * Returns the number of slab objects which we shrunk.
    
    Linus Torvalds's avatar
    Linus Torvalds committed
     */
    
    unsigned long shrink_slab(struct shrink_control *shrink,
    
    			  unsigned long nr_pages_scanned,
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    {
    	struct shrinker *shrinker;
    
    	unsigned long ret = 0;
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    
    
    	if (nr_pages_scanned == 0)
    		nr_pages_scanned = SWAP_CLUSTER_MAX;
    
    	if (!down_read_trylock(&shrinker_rwsem)) {
    		/* Assume we'll be able to shrink next time */
    		ret = 1;
    		goto out;
    	}
    
    	list_for_each_entry(shrinker, &shrinker_list, list) {
    		unsigned long long delta;
    
    		long total_scan;
    		long max_pass;
    
    		int shrink_ret = 0;
    
    		long nr;
    		long new_nr;
    
    		long batch_size = shrinker->batch ? shrinker->batch
    						  : SHRINK_BATCH;
    
    		max_pass = do_shrinker_shrink(shrinker, shrink, 0);
    		if (max_pass <= 0)
    			continue;
    
    
    		/*
    		 * copy the current shrinker scan count into a local variable
    		 * and zero it so that other concurrent shrinker invocations
    		 * don't also do this scanning work.
    		 */
    
		nr = atomic_long_xchg(&shrinker->nr_in_batch, 0);

		total_scan = nr;
		delta = (4 * nr_pages_scanned) / shrinker->seeks;
		delta *= max_pass;
		do_div(delta, lru_pages + 1);
		total_scan += delta;
		if (total_scan < 0) {
			printk(KERN_ERR "shrink_slab: %pF negative objects to "
			       "delete nr=%ld\n",
			       shrinker->shrink, total_scan);
			total_scan = max_pass;
		}

    		/*
    		 * We need to avoid excessive windup on filesystem shrinkers
    		 * due to large numbers of GFP_NOFS allocations causing the
    		 * shrinkers to return -1 all the time. This results in a large
    		 * nr being built up so when a shrink that can do some work
    		 * comes along it empties the entire cache due to nr >>>
    		 * max_pass.  This is bad for sustaining a working set in
    		 * memory.
    		 *
    		 * Hence only allow the shrinker to scan the entire cache when
    		 * a large delta change is calculated directly.
    		 */
    		if (delta < max_pass / 4)
    			total_scan = min(total_scan, max_pass / 2);
    
    
    		/*
    		 * Avoid risking looping forever due to too large nr value:
    		 * never try to free more than twice the estimate number of
    		 * freeable entries.
    		 */
    
    		if (total_scan > max_pass * 2)
    			total_scan = max_pass * 2;
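
		/*
		 * Worked example with made-up numbers (not from the original
		 * source): nr_pages_scanned = 1024, seeks = DEFAULT_SEEKS (2),
		 * lru_pages = 100000, max_pass = 10000 freeable objects gives
		 * delta = (4 * 1024 / 2) * 10000 / 100001 ~= 204, i.e. the
		 * cache is asked to scan the same fraction of its objects as
		 * the fraction of LRU pages just scanned, multiplied by
		 * 4 / seeks.
		 */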
    
		trace_mm_shrink_slab_start(shrinker, shrink, nr,
					nr_pages_scanned, lru_pages,
					max_pass, delta, total_scan);
    
    
		while (total_scan >= batch_size) {
			int nr_before;

			nr_before = do_shrinker_shrink(shrinker, shrink, 0);
			shrink_ret = do_shrinker_shrink(shrinker, shrink,
							batch_size);
			if (shrink_ret == -1)
    				break;
    
    			if (shrink_ret < nr_before)
    				ret += nr_before - shrink_ret;
    
    			count_vm_events(SLABS_SCANNED, batch_size);
    			total_scan -= batch_size;
    
    			cond_resched();
    		}
    
    
    		/*
    		 * move the unused scan count back into the shrinker in a
    		 * manner that handles concurrent updates. If we exhausted the
    		 * scan, there is no need to do an update.
    		 */
    
    		if (total_scan > 0)
    			new_nr = atomic_long_add_return(total_scan,
    					&shrinker->nr_in_batch);
    		else
    			new_nr = atomic_long_read(&shrinker->nr_in_batch);
    
    
    		trace_mm_shrink_slab_end(shrinker, shrink_ret, nr, new_nr);
    
    	}
    	up_read(&shrinker_rwsem);
    
out:
	cond_resched();
	return ret;
    }
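
/*
 * Illustrative call site (simplified; not from the original source): the
 * reclaim paths pass a struct shrink_control carrying the allocation's GFP
 * mask together with the page scan counts, e.g.
 *
 *	struct shrink_control shrink = {
 *		.gfp_mask = sc->gfp_mask,
 *	};
 *	...
 *	shrink_slab(&shrink, sc->nr_scanned, lru_pages);
 */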
    
    static inline int is_page_cache_freeable(struct page *page)
    {
    
    	/*
    	 * A freeable page cache page is referenced only by the caller
    	 * that isolated the page, the page cache radix tree and
    	 * optional buffer heads at page->private.
    	 */
    
	return page_count(page) - page_has_private(page) == 2;
}

static int may_write_to_queue(struct backing_dev_info *bdi,
			      struct scan_control *sc)
{
	if (current->flags & PF_SWAPWRITE)
		return 1;
    	if (!bdi_write_congested(bdi))
    		return 1;
    	if (bdi == current->backing_dev_info)
    		return 1;
    	return 0;
    }
    
    /*
     * We detected a synchronous write error writing a page out.  Probably
     * -ENOSPC.  We need to propagate that into the address_space for a subsequent
     * fsync(), msync() or close().
     *
     * The tricky part is that after writepage we cannot touch the mapping: nothing
     * prevents it from being freed up.  But we have a ref on the page and once
     * that page is locked, the mapping is pinned.
     *
     * We're allowed to run sleeping lock_page() here because we know the caller has
     * __GFP_FS.
     */
    static void handle_write_error(struct address_space *mapping,
    				struct page *page, int error)
    {
    
    	lock_page(page);
    
    	if (page_mapping(page) == mapping)
    		mapping_set_error(mapping, error);
    
    	unlock_page(page);
    }
    
    
    /* possible outcome of pageout() */
    typedef enum {
    	/* failed to write page out, page is locked */
    	PAGE_KEEP,
    	/* move page to the active list, page is locked */
    	PAGE_ACTIVATE,
    	/* page has been sent to the disk successfully, page is unlocked */
    	PAGE_SUCCESS,
    	/* page is clean and locked */
    	PAGE_CLEAN,
    } pageout_t;
    
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    /*
    
 * pageout is called by shrink_page_list() for each dirty page.
 * Calls ->writepage().
 */
static pageout_t pageout(struct page *page, struct address_space *mapping,
			 struct scan_control *sc)
{
    	/*
    	 * If the page is dirty, only perform writeback if that write
    	 * will be non-blocking.  To prevent this allocation from being
    	 * stalled by pagecache activity.  But note that there may be
    	 * stalls if we need to run get_block().  We could test
    	 * PagePrivate for that.
    	 *
    
	 * If this process is currently in __generic_file_aio_write() against
	 * this page's queue, we can perform writeback even if that
    	 * will block.
    	 *
    	 * If the page is swapcache, write it back even if that would
    	 * block, for some throttling. This happens by accident, because
    	 * swap_backing_dev_info is bust: it doesn't reflect the
    	 * congestion state of the swapdevs.  Easy to fix, if needed.
    	 */
    	if (!is_page_cache_freeable(page))
    		return PAGE_KEEP;
    	if (!mapping) {
    		/*
    		 * Some data journaling orphaned pages can have
    		 * page->mapping == NULL while being dirty with clean buffers.
    		 */
    
		if (page_has_private(page)) {
			if (try_to_free_buffers(page)) {
    				ClearPageDirty(page);
    
    				printk("%s: orphaned page\n", __func__);
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    				return PAGE_CLEAN;
    			}
    		}
    		return PAGE_KEEP;
    	}
    	if (mapping->a_ops->writepage == NULL)
    		return PAGE_ACTIVATE;
    
	if (!may_write_to_queue(mapping->backing_dev_info, sc))
		return PAGE_KEEP;
    
    	if (clear_page_dirty_for_io(page)) {
    		int res;
    		struct writeback_control wbc = {
    			.sync_mode = WB_SYNC_NONE,
    			.nr_to_write = SWAP_CLUSTER_MAX,
    
			.range_start = 0,
			.range_end = LLONG_MAX,
			.for_reclaim = 1,
    		};
    
    		SetPageReclaim(page);
    		res = mapping->a_ops->writepage(page, &wbc);
    		if (res < 0)
    			handle_write_error(mapping, page, res);
    
		if (res == AOP_WRITEPAGE_ACTIVATE) {
			ClearPageReclaim(page);
    			return PAGE_ACTIVATE;
    		}
    
    		if (!PageWriteback(page)) {
    			/* synchronous write or broken a_ops? */
    			ClearPageReclaim(page);
    		}
    
    		trace_mm_vmscan_writepage(page, trace_reclaim_flags(page));
    
    		inc_zone_page_state(page, NR_VMSCAN_WRITE);
    
    		return PAGE_SUCCESS;
    	}
    
    	return PAGE_CLEAN;
    }
    
    
/*
 * Same as remove_mapping, but if the page is removed from the mapping, it
 * gets returned with a refcount of 0.
 */
static int __remove_mapping(struct address_space *mapping, struct page *page)
{
	BUG_ON(!PageLocked(page));
	BUG_ON(mapping != page_mapping(page));

	spin_lock_irq(&mapping->tree_lock);
	/*
	 * The non racy check for a busy page.
    	 *
    	 * Must be careful with the order of the tests. When someone has
    	 * a ref to the page, it may be possible that they dirty it then
    	 * drop the reference. So if PageDirty is tested before page_count
    	 * here, then the following race may occur:
    	 *
    	 * get_user_pages(&page);
    	 * [user mapping goes away]
    	 * write_to(page);
    	 *				!PageDirty(page)    [good]
    	 * SetPageDirty(page);
    	 * put_page(page);
    	 *				!page_count(page)   [good, discard it]
    	 *
    	 * [oops, our write_to data is lost]
    	 *
    	 * Reversing the order of the tests ensures such a situation cannot
    	 * escape unnoticed. The smp_rmb is needed to ensure the page->flags
    	 * load is not satisfied before that of page->_count.
    	 *
    	 * Note that if SetPageDirty is always performed via set_page_dirty,
	 * and thus under tree_lock, then this ordering is not required.
	 */
	if (!page_freeze_refs(page, 2))
		goto cannot_free;
	/* note: atomic_cmpxchg in page_freeze_refs provides the smp_rmb */
	if (unlikely(PageDirty(page))) {
		page_unfreeze_refs(page, 2);
		goto cannot_free;
	}

	if (PageSwapCache(page)) {
		swp_entry_t swap = { .val = page_private(page) };
		__delete_from_swap_cache(page);
		spin_unlock_irq(&mapping->tree_lock);
		swapcache_free(swap, page);
	} else {
		void (*freepage)(struct page *);
    
		freepage = mapping->a_ops->freepage;

		__delete_from_page_cache(page);
		spin_unlock_irq(&mapping->tree_lock);
		mem_cgroup_uncharge_cache_page(page);

		if (freepage != NULL)
			freepage(page);
	}

	return 1;

cannot_free:
	spin_unlock_irq(&mapping->tree_lock);
	return 0;
}
    
    /*
     * Attempt to detach a locked page from its ->mapping.  If it is dirty or if
     * someone else has a ref on the page, abort and return 0.  If it was
     * successfully detached, return 1.  Assumes the caller has a single ref on
     * this page.
     */
    int remove_mapping(struct address_space *mapping, struct page *page)
    {
    	if (__remove_mapping(mapping, page)) {
    		/*
    		 * Unfreezing the refcount with 1 rather than 2 effectively
    		 * drops the pagecache ref for us without requiring another
    		 * atomic operation.
    		 */
    		page_unfreeze_refs(page, 1);
    		return 1;
    	}
    	return 0;
    }
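
/*
 * Illustrative refcount walk-through (not from the original source): a
 * clean page cache page whose only users are the isolating caller and the
 * page cache itself has page_count() == 2.  page_freeze_refs(page, 2) in
 * __remove_mapping() succeeds only in that state; the mapping's reference
 * goes away with the radix tree entry, and unfreezing with 1 leaves just
 * the caller's reference, so the caller's final put frees the page.
 */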
    
    
    /**
     * putback_lru_page - put previously isolated page onto appropriate LRU list
     * @page: page to be put back to appropriate lru list
     *
     * Add previously isolated @page to appropriate LRU list.
     * Page may still be unevictable for other reasons.
     *
     * lru_lock must not be held, interrupts must be enabled.
     */
    void putback_lru_page(struct page *page)
    {
    	int lru;
    	int active = !!TestClearPageActive(page);
    
    	int was_unevictable = PageUnevictable(page);
    
    
    	VM_BUG_ON(PageLRU(page));
    
    redo:
    	ClearPageUnevictable(page);
    
    
    	if (page_evictable(page)) {
    
    		/*
    		 * For evictable pages, we can use the cache.
    		 * In event of a race, worst case is we end up with an
    		 * unevictable page on [in]active list.
    		 * We know how to handle that.
    		 */
    
    		lru = active + page_lru_base_type(page);
    
    		lru_cache_add_lru(page, lru);
    	} else {
    		/*
    		 * Put unevictable pages directly on zone's unevictable
    		 * list.
    		 */
    		lru = LRU_UNEVICTABLE;
		add_page_to_unevictable_list(page);
		/*
		 * When racing with an mlock or AS_UNEVICTABLE clearing
		 * (page is unlocked) make sure that if the other thread
		 * does not observe our setting of PG_lru and fails
		 * isolation/check_move_unevictable_pages,
		 * we see PG_mlocked/AS_UNEVICTABLE cleared below and move
		 * the page back to the evictable list.
		 *
		 * The other side is TestClearPageMlocked() or shmem_lock().
		 */
		smp_mb();
	}
    
    	/*
	 * The page's status can change while we move it among the LRU lists.
	 * If an evictable page ends up on the unevictable list, it will never
	 * be freed. To avoid that, check again after we have added it to the
	 * list.
    	 */
    
    	if (lru == LRU_UNEVICTABLE && page_evictable(page)) {
    
    		if (!isolate_lru_page(page)) {
    			put_page(page);
    			goto redo;
    		}
    		/* This means someone else dropped this page from LRU
    		 * So, it will be freed or putback to LRU again. There is
    		 * nothing to do here.
    		 */
    	}
    
    
    	if (was_unevictable && lru != LRU_UNEVICTABLE)
    		count_vm_event(UNEVICTABLE_PGRESCUED);
    	else if (!was_unevictable && lru == LRU_UNEVICTABLE)
    		count_vm_event(UNEVICTABLE_PGCULLED);
    
    
    	put_page(page);		/* drop ref from isolate */
    }
    
    
    enum page_references {
    	PAGEREF_RECLAIM,
	PAGEREF_RECLAIM_CLEAN,
	PAGEREF_KEEP,
	PAGEREF_ACTIVATE,
    };
    
    static enum page_references page_check_references(struct page *page,
    						  struct scan_control *sc)
    {
    
    	int referenced_ptes, referenced_page;
    
    	unsigned long vm_flags;
    
    
    	referenced_ptes = page_referenced(page, 1, sc->target_mem_cgroup,
    					  &vm_flags);
    
    	referenced_page = TestClearPageReferenced(page);
    
    
    	/*
    	 * Mlock lost the isolation race with us.  Let try_to_unmap()
    	 * move the page to the unevictable list.
    	 */
    	if (vm_flags & VM_LOCKED)
    		return PAGEREF_RECLAIM;
    
    
    	if (referenced_ptes) {
    
		if (PageSwapBacked(page))
			return PAGEREF_ACTIVATE;
    		/*
    		 * All mapped pages start out with page table
    		 * references from the instantiating fault, so we need
    		 * to look twice if a mapped file page is used more
    		 * than once.
    		 *
    		 * Mark it and spare it for another trip around the
    		 * inactive list.  Another page table reference will
    		 * lead to its activation.
    		 *
    		 * Note: the mark is set for activated pages as well
    		 * so that recently deactivated but used pages are
    		 * quickly recovered.
    		 */
    		SetPageReferenced(page);
    
    
		if (referenced_page || referenced_ptes > 1)
			return PAGEREF_ACTIVATE;
    
    
    		/*
    		 * Activate file-backed executable pages after first usage.
    		 */
		if (vm_flags & VM_EXEC)
			return PAGEREF_ACTIVATE;

		return PAGEREF_KEEP;
	}

	/* Reclaim if clean, defer dirty pages to writeback */
    
	if (referenced_page && !PageSwapBacked(page))
		return PAGEREF_RECLAIM_CLEAN;
    
	return PAGEREF_RECLAIM;
}

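/*
 * Illustrative summary of the logic above (not from the original source):
 * a page with no pte references is reclaimed, and treated as clean
 * (PAGEREF_RECLAIM_CLEAN) only if it carried the software referenced bit
 * and is not swap backed; a page with pte references is activated if it is
 * swap backed, referenced more than once, or executable, and otherwise gets
 * one more trip around the inactive list (PAGEREF_KEEP).
 */
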
/*
 * shrink_page_list() returns the number of reclaimed pages
 */
    
static unsigned long shrink_page_list(struct list_head *page_list,
				      struct zone *zone,
				      struct scan_control *sc,
				      enum ttu_flags ttu_flags,
				      unsigned long *ret_nr_unqueued_dirty,
				      unsigned long *ret_nr_writeback,
				      bool force_reclaim)
{
	LIST_HEAD(ret_pages);
	LIST_HEAD(free_pages);
	int pgactivate = 0;
	unsigned long nr_unqueued_dirty = 0;
	unsigned long nr_dirty = 0;
	unsigned long nr_congested = 0;
	unsigned long nr_reclaimed = 0;
	unsigned long nr_writeback = 0;

    	cond_resched();
    
    
    	mem_cgroup_uncharge_start();
    
    	while (!list_empty(page_list)) {
    		struct address_space *mapping;
    		struct page *page;
    		int may_enter_fs;
    
    		enum page_references references = PAGEREF_RECLAIM_CLEAN;
    
    
    		cond_resched();
    
    		page = lru_to_page(page_list);
    		list_del(&page->lru);
    
    
		if (!trylock_page(page))
			goto keep;
    
    
		VM_BUG_ON(PageActive(page));
		VM_BUG_ON(page_zone(page) != zone);
    
    		sc->nr_scanned++;
    
		if (unlikely(!page_evictable(page)))
			goto cull_mlocked;
    
		if (!sc->may_unmap && page_mapped(page))
			goto keep_locked;

    		/* Double the slab pressure for mapped and swapcache pages */
    		if (page_mapped(page) || PageSwapCache(page))
    			sc->nr_scanned++;
    
    
    		may_enter_fs = (sc->gfp_mask & __GFP_FS) ||
    			(PageSwapCache(page) && (sc->gfp_mask & __GFP_IO));
    
    		if (PageWriteback(page)) {
    
    			/*
    			 * memcg doesn't have any dirty pages throttling so we
			 * could easily OOM just because too many pages are in
			 * writeback and there is nothing else to reclaim.
			 *
			 * Check __GFP_IO, certainly because a loop driver
			 * thread might enter reclaim, and deadlock if it waits
    			 * on a page for which it is needed to do the write
    			 * (loop masks off __GFP_IO|__GFP_FS for this reason);
    			 * but more thought would probably show more reasons.
    
    			 *
    			 * Don't require __GFP_FS, since we're not going into
    			 * the FS, just waiting on its writeback completion.
    			 * Worryingly, ext4 gfs2 and xfs allocate pages with
    			 * grab_cache_page_write_begin(,,AOP_FLAG_NOFS), so
			 * testing may_enter_fs here is liable to OOM on them.
			 */
			if (global_reclaim(sc) ||
    			    !PageReclaim(page) || !(sc->gfp_mask & __GFP_IO)) {
    				/*
    				 * This is slightly racy - end_page_writeback()
    				 * might have just cleared PageReclaim, then
				 * setting PageReclaim here ends up interpreted
    				 * as PageReadahead - but that does not matter
    				 * enough to care.  What we do want is for this
    				 * page to have PageReclaim set next time memcg
    				 * reclaim reaches the tests above, so it will
    				 * then wait_on_page_writeback() to avoid OOM;
    				 * and it's also appropriate in global reclaim.
    				 */
				SetPageReclaim(page);
				nr_writeback++;
				goto keep_locked;
			}
			wait_on_page_writeback(page);
		}

    		if (!force_reclaim)
    			references = page_check_references(page, sc);
    
    
    		switch (references) {
		case PAGEREF_ACTIVATE:
			goto activate_locked;
    
    		case PAGEREF_KEEP:
    			goto keep_locked;
    
    		case PAGEREF_RECLAIM:
    		case PAGEREF_RECLAIM_CLEAN:
    			; /* try to reclaim the page below */
    		}
    
    
    		/*
    		 * Anonymous process memory has backing store?
    		 * Try to allocate it some swap space here.
    		 */
    
    		if (PageAnon(page) && !PageSwapCache(page)) {
    
    			if (!(sc->gfp_mask & __GFP_IO))
    				goto keep_locked;
    
			if (!add_to_swap(page, page_list))
				goto activate_locked;
    
			may_enter_fs = 1;
		}

    		mapping = page_mapping(page);
    
    		/*
    		 * The page is mapped into the page tables of one or more
    		 * processes. Try to unmap it here.
    		 */
    		if (page_mapped(page) && mapping) {
    
			switch (try_to_unmap(page, ttu_flags)) {
			case SWAP_FAIL:
    				goto activate_locked;
    			case SWAP_AGAIN:
    				goto keep_locked;
    
    			case SWAP_MLOCK:
    				goto cull_mlocked;
    
			case SWAP_SUCCESS:
    				; /* try to free the page below */
    			}
    		}
    
		if (PageDirty(page)) {
			nr_dirty++;

			if (!PageWriteback(page))
				nr_unqueued_dirty++;

			/*
			 * Only kswapd can writeback filesystem pages to
			 * avoid risk of stack overflow but only writeback
			 * if many dirty pages have been encountered.
			 */
			if (page_is_file_cache(page) &&
					(!current_is_kswapd() ||
					 !zone_is_reclaim_dirty(zone))) {
				/*
				 * Immediately reclaim when written back.
				 * Similar in principle to deactivate_page()
				 * except we already have the page isolated
				 * and know it's dirty
				 */
				inc_zone_page_state(page, NR_VMSCAN_IMMEDIATE);
				SetPageReclaim(page);

				goto keep_locked;
			}

			if (references == PAGEREF_RECLAIM_CLEAN)
				goto keep_locked;
			if (!may_enter_fs)
				goto keep_locked;
			if (!sc->may_writepage)
				goto keep_locked;

			/* Page is dirty, try to write it out here */
			switch (pageout(page, mapping, sc)) {
			case PAGE_KEEP:
				nr_congested++;
				goto keep_locked;
    			case PAGE_ACTIVATE:
    				goto activate_locked;
    			case PAGE_SUCCESS:
    
				if (PageWriteback(page))
					goto keep;
				if (PageDirty(page))
					goto keep;

				/*
				 * A synchronous write - probably a ramdisk.  Go
				 * ahead and try to reclaim the page.
				 */
				if (!trylock_page(page))
					goto keep;
    				if (PageDirty(page) || PageWriteback(page))
    					goto keep_locked;
    				mapping = page_mapping(page);
    			case PAGE_CLEAN:
    				; /* try to free the page below */
    			}
    		}
    
    		/*
    		 * If the page has buffers, try to free the buffer mappings
    		 * associated with this page. If we succeed we try to free
    		 * the page as well.
    		 *
    		 * We do this even if the page is PageDirty().
    		 * try_to_release_page() does not perform I/O, but it is
    		 * possible for a page to have PageDirty set, but it is actually
    		 * clean (all its buffers are clean).  This happens if the
		 * buffers were written out directly, with submit_bh(). ext3
		 * will do this, as well as the blockdev mapping.
		 * try_to_release_page() will discover that cleanness and will
    		 * drop the buffers and mark the page clean - it can be freed.
    		 *
    		 * Rarely, pages can have buffers and no ->mapping.  These are
    		 * the pages which were not successfully invalidated in
    		 * truncate_complete_page().  We try to drop those buffers here
    		 * and if that worked, and the page is no longer mapped into
    		 * process address space (page_count == 1) it can be freed.
    		 * Otherwise, leave the page on the LRU so it is swappable.
    		 */
    
		if (page_has_private(page)) {
			if (!try_to_release_page(page, sc->gfp_mask))
    				goto activate_locked;
    
    			if (!mapping && page_count(page) == 1) {
    				unlock_page(page);
    				if (put_page_testzero(page))
    					goto free_it;
    				else {
    					/*
    					 * rare race with speculative reference.
    					 * the speculative reference will free
    					 * this page shortly, so we may
    					 * increment nr_reclaimed here (and
    					 * leave it off the LRU).
    					 */
    					nr_reclaimed++;
    					continue;
    				}
			}
		}

		if (!mapping || !__remove_mapping(mapping, page))
			goto keep_locked;

    		/*
    		 * At this point, we have no other references and there is
    		 * no way to pick any more up (removed from LRU, removed
    		 * from pagecache). Can use non-atomic bitops now (and
    		 * we obviously don't have to worry about waking up a process
		 * waiting on the page lock, because there are no references.)
    		 */
		__clear_page_locked(page);
free_it:
		nr_reclaimed++;
    
    
    		/*
    		 * Is there need to periodically free_page_list? It would
    		 * appear not as the counts should be low
    		 */
    		list_add(&page->lru, &free_pages);
    
    		continue;
    
    
    cull_mlocked:
    
    		if (PageSwapCache(page))
    			try_to_free_swap(page);
    
    		unlock_page(page);
    		putback_lru_page(page);
    		continue;
    
    
    activate_locked:
    
    		/* Not a candidate for swapping, so reclaim swap space. */
		if (PageSwapCache(page) && vm_swap_full())
			try_to_free_swap(page);
    
    		VM_BUG_ON(PageActive(page));
    
    		SetPageActive(page);
    		pgactivate++;
    keep_locked:
    		unlock_page(page);
    keep:
    		list_add(&page->lru, &ret_pages);
    
    		VM_BUG_ON(PageLRU(page) || PageUnevictable(page));
    
    	}
    
    	/*
    	 * Tag a zone as congested if all the dirty pages encountered were
    	 * backed by a congested BDI. In this case, reclaimers should just
    	 * back off and wait for congestion to clear because further reclaim
    	 * will encounter the same problem
    	 */
    
	if (nr_dirty && nr_dirty == nr_congested && global_reclaim(sc))
		zone_set_flag(zone, ZONE_CONGESTED);
    
    	free_hot_cold_page_list(&free_pages, 1);
    
    	list_splice(&ret_pages, page_list);
    
    	count_vm_events(PGACTIVATE, pgactivate);
    
    	mem_cgroup_uncharge_end();
    
	*ret_nr_unqueued_dirty += nr_unqueued_dirty;
	*ret_nr_writeback += nr_writeback;
	return nr_reclaimed;
}

    unsigned long reclaim_clean_pages_from_list(struct zone *zone,
    					    struct list_head *page_list)
    {
    	struct scan_control sc = {
    		.gfp_mask = GFP_KERNEL,
    		.priority = DEF_PRIORITY,
    		.may_unmap = 1,
    	};
    	unsigned long ret, dummy1, dummy2;
    	struct page *page, *next;
    	LIST_HEAD(clean_pages);
    
    	list_for_each_entry_safe(page, next, page_list, lru) {
    		if (page_is_file_cache(page) && !PageDirty(page)) {
    			ClearPageActive(page);
    			list_move(&page->lru, &clean_pages);
    		}
    	}
    
    	ret = shrink_page_list(&clean_pages, zone, &sc,
    				TTU_UNMAP|TTU_IGNORE_ACCESS,
    				&dummy1, &dummy2, true);
    	list_splice(&clean_pages, page_list);
    	__mod_zone_page_state(zone, NR_ISOLATED_FILE, -ret);
    	return ret;
    }
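
/*
 * Illustrative caller (simplified; not from the original source): a path
 * that has isolated a private list of pages, such as the contiguous
 * allocation code, drops the clean file-backed ones up front, roughly:
 *
 *	nr_reclaimed = reclaim_clean_pages_from_list(cc->zone,
 *						     &cc->migratepages);
 */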
    
    
    /*
     * Attempt to remove the specified page from its LRU.  Only take this page