    /*
     *  linux/mm/vmscan.c
     *
     *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
     *
     *  Swap reorganised 29.12.95, Stephen Tweedie.
     *  kswapd added: 7.1.96  sct
     *  Removed kswapd_ctl limits, and swap out as many pages as needed
     *  to bring the system back to freepages.high: 2.4.97, Rik van Riel.
     *  Zone aware kswapd started 02/00, Kanoj Sarcar (kanoj@sgi.com).
     *  Multiqueue VM started 5.8.00, Rik van Riel.
     */
    
    #include <linux/mm.h>
    #include <linux/module.h>
    
    #include <linux/kernel_stat.h>
    #include <linux/swap.h>
    #include <linux/pagemap.h>
    #include <linux/init.h>
    #include <linux/highmem.h>
    
    #include <linux/vmstat.h>
    
    #include <linux/file.h>
    #include <linux/writeback.h>
    #include <linux/blkdev.h>
    #include <linux/buffer_head.h>	/* for try_to_release_page(),
    					buffer_heads_over_limit */
    #include <linux/mm_inline.h>
    #include <linux/pagevec.h>
    #include <linux/backing-dev.h>
    #include <linux/rmap.h>
    #include <linux/topology.h>
    #include <linux/cpu.h>
    #include <linux/cpuset.h>
    #include <linux/notifier.h>
    #include <linux/rwsem.h>
    
    #include <linux/delay.h>
    
    #include <linux/memcontrol.h>
    
    #include <linux/delayacct.h>
    
    #include <linux/sysctl.h>
    
    
    #include <asm/tlbflush.h>
    #include <asm/div64.h>
    
    #include <linux/swapops.h>
    
    
    #include "internal.h"
    
    
    struct scan_control {
    	/* Incremented by the number of inactive pages that were scanned */
    	unsigned long nr_scanned;
    
    
    	/* Number of pages freed so far during a call to shrink_zones() */
    	unsigned long nr_reclaimed;
    
    
    	/* How many pages shrink_list() should reclaim */
    	unsigned long nr_to_reclaim;
    
    
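	/* Set when reclaiming for hibernation via shrink_all_memory() */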
    	unsigned long hibernation_mode;
    
    
    	/* This context's GFP mask */
    
    	gfp_t gfp_mask;
    
    
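	/* Can dirty pages be written back as part of this reclaim? */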
    	int may_writepage;
    
    
    	/* Can mapped pages be reclaimed? */
    	int may_unmap;
    
    	/* Can pages be swapped as part of reclaim? */
    	int may_swap;
    
    
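	/* Swappiness (0..100) applied to this reclaim pass */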
    	int swappiness;
    
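	/* Allocation order that reclaim is being performed for */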
    	int order;
    
    	/*
	 * Intend to reclaim enough contiguous memory rather than just enough
	 * memory, i.e. the mode used for high-order allocations.
    	 */
    	bool lumpy_reclaim_mode;
    
    
    	/* Which cgroup do we reclaim from */
    	struct mem_cgroup *mem_cgroup;
    
    
    	/*
    	 * Nodemask of nodes allowed by the caller. If NULL, all nodes
    	 * are scanned.
    	 */
    	nodemask_t	*nodemask;
    
    };
    
    #define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru))
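/* lru_to_page() returns the page at the tail (->prev) of an LRU list head. */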
    
    #ifdef ARCH_HAS_PREFETCH
    #define prefetch_prev_lru_page(_page, _base, _field)			\
    	do {								\
    		if ((_page)->lru.prev != _base) {			\
    			struct page *prev;				\
    									\
    			prev = lru_to_page(&(_page->lru));		\
    			prefetch(&prev->_field);			\
    		}							\
    	} while (0)
    #else
    #define prefetch_prev_lru_page(_page, _base, _field) do { } while (0)
    #endif
    
    #ifdef ARCH_HAS_PREFETCHW
    #define prefetchw_prev_lru_page(_page, _base, _field)			\
    	do {								\
    		if ((_page)->lru.prev != _base) {			\
    			struct page *prev;				\
    									\
    			prev = lru_to_page(&(_page->lru));		\
    			prefetchw(&prev->_field);			\
    		}							\
    	} while (0)
    #else
    #define prefetchw_prev_lru_page(_page, _base, _field) do { } while (0)
    #endif
    
    /*
     * From 0 .. 100.  Higher means more swappy.
     */
    int vm_swappiness = 60;
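/* Tunable from userspace as /proc/sys/vm/swappiness. */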
    
    long vm_total_pages;	/* The total number of pages which the VM controls */
    
    
    static LIST_HEAD(shrinker_list);
    static DECLARE_RWSEM(shrinker_rwsem);
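/*
 * shrinker_rwsem protects shrinker_list: register/unregister take it for
 * write, shrink_slab() walks the list under it for read.
 */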
    
    
    #ifdef CONFIG_CGROUP_MEM_RES_CTLR
    
#define scanning_global_lru(sc)	(!(sc)->mem_cgroup)
#else
#define scanning_global_lru(sc)	(1)
#endif

    static struct zone_reclaim_stat *get_reclaim_stat(struct zone *zone,
    						  struct scan_control *sc)
    {
    
    	if (!scanning_global_lru(sc))
    
    		return mem_cgroup_get_reclaim_stat(sc->mem_cgroup, zone);
    
    
    	return &zone->reclaim_stat;
    }
    
    
    static unsigned long zone_nr_lru_pages(struct zone *zone,
				struct scan_control *sc, enum lru_list lru)
{
    	if (!scanning_global_lru(sc))
    
    		return mem_cgroup_zone_nr_pages(sc->mem_cgroup, zone, lru);
    
    
    	return zone_page_state(zone, NR_LRU_BASE + lru);
    }
    
    
    
    /*
     * Add a shrinker callback to be called from the vm
     */
    
    void register_shrinker(struct shrinker *shrinker)
    
    {
    
    	shrinker->nr = 0;
    	down_write(&shrinker_rwsem);
    	list_add_tail(&shrinker->list, &shrinker_list);
    	up_write(&shrinker_rwsem);
    
    }
    
    EXPORT_SYMBOL(register_shrinker);
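
/*
 * Illustrative sketch, not part of this file: a subsystem with a private
 * object cache might hook into the shrinker machinery roughly like this.
 * The names my_cache_count(), my_cache_free() and my_shrinker are made up;
 * only struct shrinker, DEFAULT_SEEKS, register_shrinker() and
 * unregister_shrinker() are real.  The callback is asked for the cache
 * size when nr_to_scan is 0, otherwise it should free up to nr_to_scan
 * objects and report how many remain (or -1 to abort):
 *
 *	static int my_cache_shrink(struct shrinker *s, int nr_to_scan,
 *				   gfp_t gfp_mask)
 *	{
 *		if (nr_to_scan)
 *			my_cache_free(nr_to_scan);
 *		return my_cache_count();
 *	}
 *
 *	static struct shrinker my_shrinker = {
 *		.shrink	= my_cache_shrink,
 *		.seeks	= DEFAULT_SEEKS,
 *	};
 *
 *	register_shrinker(&my_shrinker);	(e.g. at module init)
 *	...
 *	unregister_shrinker(&my_shrinker);	(before the cache is torn down)
 */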
    
    
    /*
     * Remove one
     */
    
    void unregister_shrinker(struct shrinker *shrinker)
    
    {
    	down_write(&shrinker_rwsem);
    	list_del(&shrinker->list);
    	up_write(&shrinker_rwsem);
    }
    
    EXPORT_SYMBOL(unregister_shrinker);
    
    
    #define SHRINK_BATCH 128
    /*
     * Call the shrink functions to age shrinkable caches
     *
     * Here we assume it costs one seek to replace a lru page and that it also
     * takes a seek to recreate a cache object.  With this in mind we age equal
     * percentages of the lru and ageable caches.  This should balance the seeks
     * generated by these structures.
     *
    
 * If the vm encountered mapped pages on the LRU it increases the pressure on
    
     * slab to avoid swapping.
     *
     * We do weird things to avoid (scanned*seeks*entries) overflowing 32 bits.
     *
     * `lru_pages' represents the number of on-LRU pages in all the zones which
     * are eligible for the caller's allocation attempt.  It is used for balancing
     * slab reclaim versus page reclaim.
    
     *
     * Returns the number of slab objects which we shrunk.
    
     */
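/*
 * Worked example with made-up numbers: for scanned = 128 pages,
 * seeks = DEFAULT_SEEKS (2), a cache reporting max_pass = 10000
 * freeable objects and lru_pages = 100000, the loop below computes
 * delta = (4 * 128 / 2) * 10000 / (100000 + 1), i.e. roughly 25
 * objects get queued for scanning against that shrinker on this pass.
 */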
    
    unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask,
    			unsigned long lru_pages)
    
    {
    	struct shrinker *shrinker;
    
    	unsigned long ret = 0;
    
    
    	if (scanned == 0)
    		scanned = SWAP_CLUSTER_MAX;
    
    	if (!down_read_trylock(&shrinker_rwsem))
    
    		return 1;	/* Assume we'll be able to shrink next time */
    
    
    	list_for_each_entry(shrinker, &shrinker_list, list) {
    		unsigned long long delta;
    		unsigned long total_scan;
    
    		unsigned long max_pass;
    
    
    
    		max_pass = (*shrinker->shrink)(shrinker, 0, gfp_mask);
    
    		delta = (4 * scanned) / shrinker->seeks;
    
		delta *= max_pass;
    		do_div(delta, lru_pages + 1);
		shrinker->nr += delta;
		if (shrinker->nr < 0) {
    			printk(KERN_ERR "shrink_slab: %pF negative objects to "
    			       "delete nr=%ld\n",
    			       shrinker->shrink, shrinker->nr);
    
    			shrinker->nr = max_pass;
    		}
    
    		/*
    		 * Avoid risking looping forever due to too large nr value:
    		 * never try to free more than twice the estimate number of
    		 * freeable entries.
    		 */
    		if (shrinker->nr > max_pass * 2)
    			shrinker->nr = max_pass * 2;
    
    
    		total_scan = shrinker->nr;
    		shrinker->nr = 0;
    
    		while (total_scan >= SHRINK_BATCH) {
    			long this_scan = SHRINK_BATCH;
    			int shrink_ret;
    
    			int nr_before;
    
    
    
    			nr_before = (*shrinker->shrink)(shrinker, 0, gfp_mask);
    			shrink_ret = (*shrinker->shrink)(shrinker, this_scan,
    								gfp_mask);
    
    			if (shrink_ret == -1)
    				break;
    
    			if (shrink_ret < nr_before)
    				ret += nr_before - shrink_ret;
    
    			count_vm_events(SLABS_SCANNED, this_scan);
    
    			total_scan -= this_scan;
    
    			cond_resched();
    		}
    
    		shrinker->nr += total_scan;
    	}
    	up_read(&shrinker_rwsem);
    
    	return ret;
    
    }
    
    static inline int is_page_cache_freeable(struct page *page)
    {
    
    	/*
    	 * A freeable page cache page is referenced only by the caller
    	 * that isolated the page, the page cache radix tree and
    	 * optional buffer heads at page->private.
    	 */
    
    	return page_count(page) - page_has_private(page) == 2;
    
    }
    
    static int may_write_to_queue(struct backing_dev_info *bdi)
    {
    
    	if (current->flags & PF_SWAPWRITE)
    
    		return 1;
    	if (!bdi_write_congested(bdi))
    		return 1;
    	if (bdi == current->backing_dev_info)
    		return 1;
    	return 0;
    }
    
    /*
     * We detected a synchronous write error writing a page out.  Probably
     * -ENOSPC.  We need to propagate that into the address_space for a subsequent
     * fsync(), msync() or close().
     *
     * The tricky part is that after writepage we cannot touch the mapping: nothing
     * prevents it from being freed up.  But we have a ref on the page and once
     * that page is locked, the mapping is pinned.
     *
     * We're allowed to run sleeping lock_page() here because we know the caller has
     * __GFP_FS.
     */
    static void handle_write_error(struct address_space *mapping,
    				struct page *page, int error)
    {
    
    	lock_page_nosync(page);
    
    	if (page_mapping(page) == mapping)
    		mapping_set_error(mapping, error);
    
    	unlock_page(page);
    }
    
    
    /* Request for sync pageout. */
    enum pageout_io {
    	PAGEOUT_IO_ASYNC,
    	PAGEOUT_IO_SYNC,
    };
    
    
    /* possible outcome of pageout() */
    typedef enum {
    	/* failed to write page out, page is locked */
    	PAGE_KEEP,
    	/* move page to the active list, page is locked */
    	PAGE_ACTIVATE,
    	/* page has been sent to the disk successfully, page is unlocked */
    	PAGE_SUCCESS,
    	/* page is clean and locked */
    	PAGE_CLEAN,
    } pageout_t;
    
    
    /*
    
     * pageout is called by shrink_page_list() for each dirty page.
     * Calls ->writepage().
    
     */
    
    static pageout_t pageout(struct page *page, struct address_space *mapping,
    						enum pageout_io sync_writeback)
    
    {
    	/*
    	 * If the page is dirty, only perform writeback if that write
	 * will be non-blocking, to prevent this allocation from being
	 * stalled by pagecache activity.  But note that there may be
    	 * stalls if we need to run get_block().  We could test
    	 * PagePrivate for that.
    	 *
    
    	 * If this process is currently in __generic_file_aio_write() against
    
    	 * this page's queue, we can perform writeback even if that
    	 * will block.
    	 *
    	 * If the page is swapcache, write it back even if that would
    	 * block, for some throttling. This happens by accident, because
    	 * swap_backing_dev_info is bust: it doesn't reflect the
    	 * congestion state of the swapdevs.  Easy to fix, if needed.
    	 */
    	if (!is_page_cache_freeable(page))
    		return PAGE_KEEP;
    	if (!mapping) {
    		/*
    		 * Some data journaling orphaned pages can have
    		 * page->mapping == NULL while being dirty with clean buffers.
    		 */
    
    		if (page_has_private(page)) {
    
    			if (try_to_free_buffers(page)) {
    				ClearPageDirty(page);
    
    				printk("%s: orphaned page\n", __func__);
    
    				return PAGE_CLEAN;
    			}
    		}
    		return PAGE_KEEP;
    	}
    	if (mapping->a_ops->writepage == NULL)
    		return PAGE_ACTIVATE;
    	if (!may_write_to_queue(mapping->backing_dev_info))
    		return PAGE_KEEP;
    
    	if (clear_page_dirty_for_io(page)) {
    		int res;
    		struct writeback_control wbc = {
    			.sync_mode = WB_SYNC_NONE,
    			.nr_to_write = SWAP_CLUSTER_MAX,
    
    			.range_start = 0,
    			.range_end = LLONG_MAX,
    
    			.nonblocking = 1,
    			.for_reclaim = 1,
    		};
    
    		SetPageReclaim(page);
    		res = mapping->a_ops->writepage(page, &wbc);
    		if (res < 0)
    			handle_write_error(mapping, page, res);
    
    		if (res == AOP_WRITEPAGE_ACTIVATE) {
    
    			ClearPageReclaim(page);
    			return PAGE_ACTIVATE;
    		}
    
    
    		/*
    		 * Wait on writeback if requested to. This happens when
    		 * direct reclaiming a large contiguous area and the
    		 * first attempt to free a range of pages fails.
    		 */
    		if (PageWriteback(page) && sync_writeback == PAGEOUT_IO_SYNC)
    			wait_on_page_writeback(page);
    
    
    		if (!PageWriteback(page)) {
    			/* synchronous write or broken a_ops? */
    			ClearPageReclaim(page);
    		}
    
    		inc_zone_page_state(page, NR_VMSCAN_WRITE);
    
    		return PAGE_SUCCESS;
    	}
    
    	return PAGE_CLEAN;
    }
    
    
/*
 * Same as remove_mapping, but if the page is removed from the mapping, it
 * gets returned with a refcount of 0.
 */
static int __remove_mapping(struct address_space *mapping, struct page *page)
{
    	BUG_ON(!PageLocked(page));
    	BUG_ON(mapping != page_mapping(page));
    
	spin_lock_irq(&mapping->tree_lock);
	/*
	 * The non racy check for a busy page.
    	 *
    	 * Must be careful with the order of the tests. When someone has
    	 * a ref to the page, it may be possible that they dirty it then
    	 * drop the reference. So if PageDirty is tested before page_count
    	 * here, then the following race may occur:
    	 *
    	 * get_user_pages(&page);
    	 * [user mapping goes away]
    	 * write_to(page);
    	 *				!PageDirty(page)    [good]
    	 * SetPageDirty(page);
    	 * put_page(page);
    	 *				!page_count(page)   [good, discard it]
    	 *
    	 * [oops, our write_to data is lost]
    	 *
    	 * Reversing the order of the tests ensures such a situation cannot
    	 * escape unnoticed. The smp_rmb is needed to ensure the page->flags
    	 * load is not satisfied before that of page->_count.
    	 *
    	 * Note that if SetPageDirty is always performed via set_page_dirty,
	 * and thus under tree_lock, then this ordering is not required.
	 */
	if (!page_freeze_refs(page, 2))
		goto cannot_free;
	/* note: atomic_cmpxchg in page_freeze_refs provides the smp_rmb */
	if (unlikely(PageDirty(page))) {
		page_unfreeze_refs(page, 2);
		goto cannot_free;
	}

    	if (PageSwapCache(page)) {
    		swp_entry_t swap = { .val = page_private(page) };
    		__delete_from_swap_cache(page);
    
    		spin_unlock_irq(&mapping->tree_lock);
    
    		swapcache_free(swap, page);
    
    	} else {
    		__remove_from_page_cache(page);
    
    		spin_unlock_irq(&mapping->tree_lock);
    
		mem_cgroup_uncharge_cache_page(page);
	}

	return 1;

cannot_free:
	spin_unlock_irq(&mapping->tree_lock);
	return 0;
}

    /*
     * Attempt to detach a locked page from its ->mapping.  If it is dirty or if
     * someone else has a ref on the page, abort and return 0.  If it was
     * successfully detached, return 1.  Assumes the caller has a single ref on
     * this page.
     */
    int remove_mapping(struct address_space *mapping, struct page *page)
    {
    	if (__remove_mapping(mapping, page)) {
    		/*
    		 * Unfreezing the refcount with 1 rather than 2 effectively
    		 * drops the pagecache ref for us without requiring another
    		 * atomic operation.
    		 */
    		page_unfreeze_refs(page, 1);
    		return 1;
    	}
    	return 0;
    }
    
    
    /**
     * putback_lru_page - put previously isolated page onto appropriate LRU list
     * @page: page to be put back to appropriate lru list
     *
     * Add previously isolated @page to appropriate LRU list.
     * Page may still be unevictable for other reasons.
     *
     * lru_lock must not be held, interrupts must be enabled.
     */
    void putback_lru_page(struct page *page)
    {
    	int lru;
    	int active = !!TestClearPageActive(page);
    
    	int was_unevictable = PageUnevictable(page);
    
    
    	VM_BUG_ON(PageLRU(page));
    
    redo:
    	ClearPageUnevictable(page);
    
    	if (page_evictable(page, NULL)) {
    		/*
    		 * For evictable pages, we can use the cache.
		 * In the event of a race, the worst case is we end up with an
    		 * unevictable page on [in]active list.
    		 * We know how to handle that.
    		 */
    
    		lru = active + page_lru_base_type(page);
    
    		lru_cache_add_lru(page, lru);
    	} else {
    		/*
    		 * Put unevictable pages directly on zone's unevictable
    		 * list.
    		 */
    		lru = LRU_UNEVICTABLE;
    		add_page_to_unevictable_list(page);
    
    		/*
    		 * When racing with an mlock clearing (page is
    		 * unlocked), make sure that if the other thread does
    		 * not observe our setting of PG_lru and fails
    		 * isolation, we see PG_mlocked cleared below and move
    		 * the page back to the evictable list.
    		 *
    		 * The other side is TestClearPageMlocked().
    		 */
    		smp_mb();
    
    	}
    
    	/*
    	 * page's status can change while we move it among lru. If an evictable
	 * page is on the unevictable list, it will never be freed. To avoid that,
    	 * check after we added it to the list, again.
    	 */
    	if (lru == LRU_UNEVICTABLE && page_evictable(page, NULL)) {
    		if (!isolate_lru_page(page)) {
    			put_page(page);
    			goto redo;
    		}
    		/* This means someone else dropped this page from LRU
		 * So, it will be freed or put back to the LRU again. There is
    		 * nothing to do here.
    		 */
    	}
    
    
    	if (was_unevictable && lru != LRU_UNEVICTABLE)
    		count_vm_event(UNEVICTABLE_PGRESCUED);
    	else if (!was_unevictable && lru == LRU_UNEVICTABLE)
    		count_vm_event(UNEVICTABLE_PGCULLED);
    
    
    	put_page(page);		/* drop ref from isolate */
    }
    
    
    enum page_references {
    	PAGEREF_RECLAIM,
	PAGEREF_RECLAIM_CLEAN,
	PAGEREF_KEEP,
	PAGEREF_ACTIVATE,
    };
    
    static enum page_references page_check_references(struct page *page,
    						  struct scan_control *sc)
    {
    
    	int referenced_ptes, referenced_page;
    
    	unsigned long vm_flags;
    
    
    	referenced_ptes = page_referenced(page, 1, sc->mem_cgroup, &vm_flags);
    	referenced_page = TestClearPageReferenced(page);
    
    
	/* Lumpy reclaim - ignore references */
	if (sc->lumpy_reclaim_mode)
		return PAGEREF_RECLAIM;
    
    	/*
    	 * Mlock lost the isolation race with us.  Let try_to_unmap()
    	 * move the page to the unevictable list.
    	 */
    	if (vm_flags & VM_LOCKED)
    		return PAGEREF_RECLAIM;
    
    
    	if (referenced_ptes) {
    		if (PageAnon(page))
    			return PAGEREF_ACTIVATE;
    		/*
    		 * All mapped pages start out with page table
    		 * references from the instantiating fault, so we need
    		 * to look twice if a mapped file page is used more
    		 * than once.
    		 *
    		 * Mark it and spare it for another trip around the
    		 * inactive list.  Another page table reference will
    		 * lead to its activation.
    		 *
    		 * Note: the mark is set for activated pages as well
    		 * so that recently deactivated but used pages are
    		 * quickly recovered.
    		 */
    		SetPageReferenced(page);
    
    		if (referenced_page)
    			return PAGEREF_ACTIVATE;
    
    		return PAGEREF_KEEP;
    	}
    
    
    	/* Reclaim if clean, defer dirty pages to writeback */
    
    	if (referenced_page)
    		return PAGEREF_RECLAIM_CLEAN;
    
	return PAGEREF_RECLAIM;
}

    /*
    
     * shrink_page_list() returns the number of reclaimed pages
    
     */
    
    static unsigned long shrink_page_list(struct list_head *page_list,
    
    					struct scan_control *sc,
    					enum pageout_io sync_writeback)
    
    {
    	LIST_HEAD(ret_pages);
    	struct pagevec freed_pvec;
    	int pgactivate = 0;
    
    	unsigned long nr_reclaimed = 0;
    
    
    	cond_resched();
    
    	pagevec_init(&freed_pvec, 1);
    	while (!list_empty(page_list)) {
    
    		enum page_references references;
    
    		struct address_space *mapping;
    		struct page *page;
    		int may_enter_fs;
    
    		cond_resched();
    
    		page = lru_to_page(page_list);
    		list_del(&page->lru);
    
    
    		if (!trylock_page(page))
    
    			goto keep;
    
    
    		VM_BUG_ON(PageActive(page));
    
    
    		sc->nr_scanned++;
    
    		if (unlikely(!page_evictable(page, NULL)))
    			goto cull_mlocked;
    
		if (!sc->may_unmap && page_mapped(page))
			goto keep_locked;

    		/* Double the slab pressure for mapped and swapcache pages */
    		if (page_mapped(page) || PageSwapCache(page))
    			sc->nr_scanned++;
    
    
    		may_enter_fs = (sc->gfp_mask & __GFP_FS) ||
    			(PageSwapCache(page) && (sc->gfp_mask & __GFP_IO));
    
    		if (PageWriteback(page)) {
    			/*
    			 * Synchronous reclaim is performed in two passes,
    			 * first an asynchronous pass over the list to
    			 * start parallel writeback, and a second synchronous
    			 * pass to wait for the IO to complete.  Wait here
    			 * for any page for which writeback has already
    			 * started.
    			 */
    			if (sync_writeback == PAGEOUT_IO_SYNC && may_enter_fs)
				wait_on_page_writeback(page);
			else
				goto keep_locked;
		}
    
    
    		references = page_check_references(page, sc);
    		switch (references) {
    		case PAGEREF_ACTIVATE:
    
    			goto activate_locked;
    
    		case PAGEREF_KEEP:
    			goto keep_locked;
    
    		case PAGEREF_RECLAIM:
    		case PAGEREF_RECLAIM_CLEAN:
    			; /* try to reclaim the page below */
    		}
    
    
    		/*
    		 * Anonymous process memory has backing store?
    		 * Try to allocate it some swap space here.
    		 */
    
    		if (PageAnon(page) && !PageSwapCache(page)) {
    
    			if (!(sc->gfp_mask & __GFP_IO))
    				goto keep_locked;
    
    			if (!add_to_swap(page))
    
    				goto activate_locked;
    
			may_enter_fs = 1;
		}

    
    		mapping = page_mapping(page);
    
    		/*
    		 * The page is mapped into the page tables of one or more
    		 * processes. Try to unmap it here.
    		 */
    		if (page_mapped(page) && mapping) {
    
    			switch (try_to_unmap(page, TTU_UNMAP)) {
    
    			case SWAP_FAIL:
    				goto activate_locked;
    			case SWAP_AGAIN:
    				goto keep_locked;
    
    			case SWAP_MLOCK:
    				goto cull_mlocked;
    
    			case SWAP_SUCCESS:
    				; /* try to free the page below */
    			}
    		}
    
    		if (PageDirty(page)) {
    
    			if (references == PAGEREF_RECLAIM_CLEAN)
    
				goto keep_locked;
			if (!may_enter_fs)
    				goto keep_locked;
    
    			if (!sc->may_writepage)
    
    				goto keep_locked;
    
    			/* Page is dirty, try to write it out here */
    
    			switch (pageout(page, mapping, sync_writeback)) {
    
    			case PAGE_KEEP:
    				goto keep_locked;
    			case PAGE_ACTIVATE:
    				goto activate_locked;
    			case PAGE_SUCCESS:
    
    				if (PageWriteback(page) || PageDirty(page))
    
    					goto keep;
    				/*
    				 * A synchronous write - probably a ramdisk.  Go
    				 * ahead and try to reclaim the page.
    				 */
    
    				if (!trylock_page(page))
    
    					goto keep;
    				if (PageDirty(page) || PageWriteback(page))
    					goto keep_locked;
    				mapping = page_mapping(page);
    			case PAGE_CLEAN:
    				; /* try to free the page below */
    			}
    		}
    
    		/*
    		 * If the page has buffers, try to free the buffer mappings
    		 * associated with this page. If we succeed we try to free
    		 * the page as well.
    		 *
    		 * We do this even if the page is PageDirty().
    		 * try_to_release_page() does not perform I/O, but it is
    		 * possible for a page to have PageDirty set, but it is actually
    		 * clean (all its buffers are clean).  This happens if the
    		 * buffers were written out directly, with submit_bh(). ext3
    
    		 * will do this, as well as the blockdev mapping.
    
    		 * try_to_release_page() will discover that cleanness and will
    		 * drop the buffers and mark the page clean - it can be freed.
    		 *
    		 * Rarely, pages can have buffers and no ->mapping.  These are
    		 * the pages which were not successfully invalidated in
    		 * truncate_complete_page().  We try to drop those buffers here
    		 * and if that worked, and the page is no longer mapped into
    		 * process address space (page_count == 1) it can be freed.
    		 * Otherwise, leave the page on the LRU so it is swappable.
    		 */
    
    		if (page_has_private(page)) {
    
    			if (!try_to_release_page(page, sc->gfp_mask))
    				goto activate_locked;
    
    			if (!mapping && page_count(page) == 1) {
    				unlock_page(page);
    				if (put_page_testzero(page))
    					goto free_it;
    				else {
    					/*
    					 * rare race with speculative reference.
    					 * the speculative reference will free
    					 * this page shortly, so we may
    					 * increment nr_reclaimed here (and
    					 * leave it off the LRU).
    					 */
    					nr_reclaimed++;
    					continue;
    				}
			}
		}

		if (!mapping || !__remove_mapping(mapping, page))
			goto keep_locked;

    		/*
    		 * At this point, we have no other references and there is
    		 * no way to pick any more up (removed from LRU, removed
    		 * from pagecache). Can use non-atomic bitops now (and
    		 * we obviously don't have to worry about waking up a process
    		 * waiting on the page lock, because there are no references.
    		 */
		__clear_page_locked(page);

free_it:
		nr_reclaimed++;
    
    		if (!pagevec_add(&freed_pvec, page)) {
    			__pagevec_free(&freed_pvec);
    			pagevec_reinit(&freed_pvec);
    		}
    
    		continue;
    
    
    cull_mlocked:
    
    		if (PageSwapCache(page))
    			try_to_free_swap(page);
    
    		unlock_page(page);
    		putback_lru_page(page);
    		continue;
    
    
    activate_locked:
    
    		/* Not a candidate for swapping, so reclaim swap space. */
    		if (PageSwapCache(page) && vm_swap_full())
    
    			try_to_free_swap(page);
    
    		VM_BUG_ON(PageActive(page));
    
    		SetPageActive(page);
    		pgactivate++;
    keep_locked:
    		unlock_page(page);
    keep:
    		list_add(&page->lru, &ret_pages);
    
    		VM_BUG_ON(PageLRU(page) || PageUnevictable(page));
    
    	}
    	list_splice(&ret_pages, page_list);
    	if (pagevec_count(&freed_pvec))
    
    		__pagevec_free(&freed_pvec);
    
    	count_vm_events(PGACTIVATE, pgactivate);
    
	return nr_reclaimed;
}

    /*
     * Attempt to remove the specified page from its LRU.  Only take this page
     * if it is of the appropriate PageActive status.  Pages which are being
     * freed elsewhere are also ignored.
     *
     * page:	page to consider
     * mode:	one of the LRU isolation modes defined above
     *
     * returns 0 on success, -ve errno on failure.
     */
    
    int __isolate_lru_page(struct page *page, int mode, int file)
    
    {
    	int ret = -EINVAL;
    
    	/* Only take pages on the LRU. */
    	if (!PageLRU(page))
    		return ret;
    
    	/*
    	 * When checking the active state, we need to be sure we are
	 * dealing with comparable boolean values.  Take the logical not
    	 * of each.
    	 */
    	if (mode != ISOLATE_BOTH && (!PageActive(page) != !mode))
    		return ret;
    
    
	if (mode != ISOLATE_BOTH && page_is_file_cache(page) != file)
		return ret;

    	/*
    	 * When this function is being called for lumpy reclaim, we
    	 * initially look into all LRU pages, active, inactive and
    	 * unevictable; only give shrink_page_list evictable pages.
    	 */
    	if (PageUnevictable(page))
    		return ret;
    
    
    	ret = -EBUSY;
    
    	if (likely(get_page_unless_zero(page))) {
    		/*
    		 * Be careful not to clear PageLRU until after we're
    		 * sure the page is not being freed elsewhere -- the
    		 * page release code relies on it.
    		 */
    		ClearPageLRU(page);
    		ret = 0;
    	}
    
    	return ret;
    }
    
    
    /*
     * zone->lru_lock is heavily contended.  Some of the functions that
     * shrink the lists perform better by taking out a batch of pages
     * and working on them outside the LRU lock.
     *
     * For pagecache intensive workloads, this function is the hottest
     * spot in the kernel (apart from copy_*_user functions).
     *
     * Appropriate locks must be held before calling this function.
     *
     * @nr_to_scan:	The number of pages to look through on the list.
     * @src:	The LRU list to pull pages off.
     * @dst:	The temp list to put pages on to.
     * @scanned:	The number of pages that were scanned.
    
     * @order:	The caller's attempted allocation order
     * @mode:	One of the LRU isolation modes
    
     * @file:	True [1] if isolating file [!anon] pages
    
     *
     * returns how many pages were moved onto *@dst.
     */
    
    static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
    		struct list_head *src, struct list_head *dst,
    
    		unsigned long *scanned, int order, int mode, int file)
    
    {
    
    	unsigned long nr_taken = 0;
    
    	unsigned long scan;
    
    
    
    	for (scan = 0; scan < nr_to_scan && !list_empty(src); scan++) {
    
    		struct page *page;
    		unsigned long pfn;
    		unsigned long end_pfn;
    		unsigned long page_pfn;
    		int zone_id;
    
    
    		page = lru_to_page(src);
    		prefetchw_prev_lru_page(page, src, flags);
    
    
    		VM_BUG_ON(!PageLRU(page));
    
    		switch (__isolate_lru_page(page, mode, file)) {
    
    		case 0:
    			list_move(&page->lru, dst);
    
    			mem_cgroup_del_lru(page);
    
    			break;
    
    		case -EBUSY:
    			/* else it is being freed elsewhere */
    			list_move(&page->lru, src);
    
    			mem_cgroup_rotate_lru_list(page, page_lru(page));
    
    			continue;
    
    		default:
    			BUG();
    		}
    
    		if (!order)
    			continue;
    
    		/*
    		 * Attempt to take all pages in the order aligned region
    		 * surrounding the tag page.  Only take those pages of
    		 * the same active state as that tag page.  We may safely
    		 * round the target page pfn down to the requested order
		 * as the mem_map is guaranteed valid out to MAX_ORDER,
    		 * where that page is in a different zone we will detect
    		 * it from its zone id and abort this block scan.
    		 */
    		zone_id = page_zone_id(page);
    		page_pfn = page_to_pfn(page);
    		pfn = page_pfn & ~((1 << order) - 1);
    		end_pfn = pfn + (1 << order);
    		for (; pfn < end_pfn; pfn++) {
    			struct page *cursor_page;
    
    			/* The target page is in the block, ignore it. */
    			if (unlikely(pfn == page_pfn))
    				continue;
    
    			/* Avoid holes within the zone. */
    			if (unlikely(!pfn_valid_within(pfn)))
    				break;
    
    			cursor_page = pfn_to_page(pfn);
    
    			/* Check that we have not crossed a zone boundary. */
    			if (unlikely(page_zone_id(cursor_page) != zone_id))
    				continue;
    
    
    			/*
    			 * If we don't have enough swap space, reclaiming of
			 * anon pages which don't already have a swap slot is
    			 * pointless.
    			 */
    			if (nr_swap_pages <= 0 && PageAnon(cursor_page) &&
    					!PageSwapCache(cursor_page))
    				continue;
    
    
    			if (__isolate_lru_page(cursor_page, mode, file) == 0) {
    
    				list_move(&cursor_page->lru, dst);
    
    				mem_cgroup_del_lru(cursor_page);
    
    				nr_taken++;
    				scan++;
    			}
    		}