    /*
     *  linux/mm/vmscan.c
     *
     *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
     *
     *  Swap reorganised 29.12.95, Stephen Tweedie.
     *  kswapd added: 7.1.96  sct
     *  Removed kswapd_ctl limits, and swap out as many pages as needed
     *  to bring the system back to freepages.high: 2.4.97, Rik van Riel.
     *  Zone aware kswapd started 02/00, Kanoj Sarcar (kanoj@sgi.com).
     *  Multiqueue VM started 5.8.00, Rik van Riel.
     */
    
    #include <linux/mm.h>
    #include <linux/module.h>
    #include <linux/slab.h>
    #include <linux/kernel_stat.h>
    #include <linux/swap.h>
    #include <linux/pagemap.h>
    #include <linux/init.h>
    #include <linux/highmem.h>
    
    #include <linux/vmstat.h>
    
    #include <linux/file.h>
    #include <linux/writeback.h>
    #include <linux/blkdev.h>
    #include <linux/buffer_head.h>	/* for try_to_release_page(),
    					buffer_heads_over_limit */
    #include <linux/mm_inline.h>
    #include <linux/pagevec.h>
    #include <linux/backing-dev.h>
    #include <linux/rmap.h>
    #include <linux/topology.h>
    #include <linux/cpu.h>
    #include <linux/cpuset.h>
    #include <linux/notifier.h>
    #include <linux/rwsem.h>
    
    #include <linux/delay.h>
    
    #include <linux/memcontrol.h>
    
    #include <linux/delayacct.h>
    
    
    #include <asm/tlbflush.h>
    #include <asm/div64.h>
    
    #include <linux/swapops.h>
    
    
    #include "internal.h"
    
    
    struct scan_control {
    	/* Incremented by the number of inactive pages that were scanned */
    	unsigned long nr_scanned;
    
    	/* This context's GFP mask */
    
    	gfp_t gfp_mask;
    
    
    	int may_writepage;
    
    
    	/* Can pages be swapped as part of reclaim? */
    	int may_swap;
    
    
    	/* This context's SWAP_CLUSTER_MAX. If freeing memory for
    	 * suspend, we effectively ignore SWAP_CLUSTER_MAX.
    	 * In this context, it doesn't matter that we scan the
    	 * whole list at once. */
	int swap_cluster_max;

	int swappiness;

    	int all_unreclaimable;
    
    
    	int order;
    
    
    	/* Which cgroup do we reclaim from */
    	struct mem_cgroup *mem_cgroup;
    
    	/* Pluggable isolate pages callback */
    	unsigned long (*isolate_pages)(unsigned long nr, struct list_head *dst,
    			unsigned long *scanned, int order, int mode,
    			struct zone *z, struct mem_cgroup *mem_cont,
    			int active);
    
    };
    
    #define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru))
    
    #ifdef ARCH_HAS_PREFETCH
    #define prefetch_prev_lru_page(_page, _base, _field)			\
    	do {								\
    		if ((_page)->lru.prev != _base) {			\
    			struct page *prev;				\
    									\
    			prev = lru_to_page(&(_page->lru));		\
    			prefetch(&prev->_field);			\
    		}							\
    	} while (0)
    #else
    #define prefetch_prev_lru_page(_page, _base, _field) do { } while (0)
    #endif
    
    #ifdef ARCH_HAS_PREFETCHW
    #define prefetchw_prev_lru_page(_page, _base, _field)			\
    	do {								\
    		if ((_page)->lru.prev != _base) {			\
    			struct page *prev;				\
    									\
    			prev = lru_to_page(&(_page->lru));		\
    			prefetchw(&prev->_field);			\
    		}							\
    	} while (0)
    #else
    #define prefetchw_prev_lru_page(_page, _base, _field) do { } while (0)
    #endif
    
    /*
     * From 0 .. 100.  Higher means more swappy.
     */
    int vm_swappiness = 60;
    
    long vm_total_pages;	/* The total number of pages which the VM controls */
    
    
    static LIST_HEAD(shrinker_list);
    static DECLARE_RWSEM(shrinker_rwsem);
    
    
    #ifdef CONFIG_CGROUP_MEM_RES_CTLR
    
    #define scan_global_lru(sc)	(!(sc)->mem_cgroup)
    #else
    #define scan_global_lru(sc)	(1)
    #endif
    
    
    /*
     * Add a shrinker callback to be called from the vm
     */
    
    void register_shrinker(struct shrinker *shrinker)
    
    {
    
    	shrinker->nr = 0;
    	down_write(&shrinker_rwsem);
    	list_add_tail(&shrinker->list, &shrinker_list);
    	up_write(&shrinker_rwsem);
    
    }
    
    EXPORT_SYMBOL(register_shrinker);
    
    
    /*
     * Remove one
     */
    
    void unregister_shrinker(struct shrinker *shrinker)
    
    {
    	down_write(&shrinker_rwsem);
    	list_del(&shrinker->list);
    	up_write(&shrinker_rwsem);
    }
    
    EXPORT_SYMBOL(unregister_shrinker);
    
    
    #define SHRINK_BATCH 128
    /*
     * Call the shrink functions to age shrinkable caches
     *
     * Here we assume it costs one seek to replace a lru page and that it also
     * takes a seek to recreate a cache object.  With this in mind we age equal
     * percentages of the lru and ageable caches.  This should balance the seeks
     * generated by these structures.
     *
    
 * If the vm encountered mapped pages on the LRU it increases the pressure on
    
     * slab to avoid swapping.
     *
     * We do weird things to avoid (scanned*seeks*entries) overflowing 32 bits.
     *
     * `lru_pages' represents the number of on-LRU pages in all the zones which
     * are eligible for the caller's allocation attempt.  It is used for balancing
     * slab reclaim versus page reclaim.
    
     *
     * Returns the number of slab objects which we shrunk.
    
     */
    
    unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask,
    			unsigned long lru_pages)
    
    {
    	struct shrinker *shrinker;
    
    	unsigned long ret = 0;
    
    
    	if (scanned == 0)
    		scanned = SWAP_CLUSTER_MAX;
    
    	if (!down_read_trylock(&shrinker_rwsem))
    
    		return 1;	/* Assume we'll be able to shrink next time */
    
    
    	list_for_each_entry(shrinker, &shrinker_list, list) {
    		unsigned long long delta;
    		unsigned long total_scan;
    
    		unsigned long max_pass = (*shrinker->shrink)(0, gfp_mask);
    
    
		delta = (4 * scanned) / shrinker->seeks;
		delta *= max_pass;
    		do_div(delta, lru_pages + 1);
    		shrinker->nr += delta;
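		/*
		 * Worked example with made-up numbers: if 1000 LRU pages
		 * were scanned out of lru_pages == 100000 (1%), a cache
		 * reporting max_pass == 50000 objects with seeks == 2 gets
		 * delta = (4 * 1000 / 2) * 50000 / 100001, i.e. roughly
		 * 1000 objects, or about 2% of the cache.
		 */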
    
    		if (shrinker->nr < 0) {
    			printk(KERN_ERR "%s: nr=%ld\n",
    
    					__func__, shrinker->nr);
    
    			shrinker->nr = max_pass;
    		}
    
    		/*
    		 * Avoid risking looping forever due to too large nr value:
		 * never try to free more than twice the estimated number of
    		 * freeable entries.
    		 */
    		if (shrinker->nr > max_pass * 2)
    			shrinker->nr = max_pass * 2;
    
    
    		total_scan = shrinker->nr;
    		shrinker->nr = 0;
    
    		while (total_scan >= SHRINK_BATCH) {
    			long this_scan = SHRINK_BATCH;
    			int shrink_ret;
    
    			int nr_before;
    
    
    
    			nr_before = (*shrinker->shrink)(0, gfp_mask);
    			shrink_ret = (*shrinker->shrink)(this_scan, gfp_mask);
    
    			if (shrink_ret == -1)
    				break;
    
    			if (shrink_ret < nr_before)
    				ret += nr_before - shrink_ret;
    
    			count_vm_events(SLABS_SCANNED, this_scan);
    
    			total_scan -= this_scan;
    
    			cond_resched();
    		}
    
    		shrinker->nr += total_scan;
    	}
    	up_read(&shrinker_rwsem);
    
    	return ret;
    
    }
    
    /* Called without lock on whether page is mapped, so answer is unstable */
    static inline int page_mapping_inuse(struct page *page)
    {
    	struct address_space *mapping;
    
    	/* Page is in somebody's page tables. */
    	if (page_mapped(page))
    		return 1;
    
    	/* Be more reluctant to reclaim swapcache than pagecache */
    	if (PageSwapCache(page))
    		return 1;
    
    	mapping = page_mapping(page);
    	if (!mapping)
    		return 0;
    
    	/* File is mmap'd by somebody? */
    	return mapping_mapped(mapping);
    }
    
    static inline int is_page_cache_freeable(struct page *page)
    {
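	/*
	 * The expected count of 2 is the page cache's reference plus the
	 * one held by the caller that isolated the page; PagePrivate()
	 * accounts for the extra reference taken by attached buffer_heads.
	 */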
    	return page_count(page) - !!PagePrivate(page) == 2;
    }
    
    static int may_write_to_queue(struct backing_dev_info *bdi)
    {
    
    	if (current->flags & PF_SWAPWRITE)
    
    		return 1;
    	if (!bdi_write_congested(bdi))
    		return 1;
    	if (bdi == current->backing_dev_info)
    		return 1;
    	return 0;
    }
    
    /*
     * We detected a synchronous write error writing a page out.  Probably
     * -ENOSPC.  We need to propagate that into the address_space for a subsequent
     * fsync(), msync() or close().
     *
     * The tricky part is that after writepage we cannot touch the mapping: nothing
     * prevents it from being freed up.  But we have a ref on the page and once
     * that page is locked, the mapping is pinned.
     *
     * We're allowed to run sleeping lock_page() here because we know the caller has
     * __GFP_FS.
     */
    static void handle_write_error(struct address_space *mapping,
    				struct page *page, int error)
    {
    	lock_page(page);
    
    	if (page_mapping(page) == mapping)
    		mapping_set_error(mapping, error);
    
    	unlock_page(page);
    }
    
    
    /* Request for sync pageout. */
    enum pageout_io {
    	PAGEOUT_IO_ASYNC,
    	PAGEOUT_IO_SYNC,
    };
    
    
    /* possible outcome of pageout() */
    typedef enum {
    	/* failed to write page out, page is locked */
    	PAGE_KEEP,
    	/* move page to the active list, page is locked */
    	PAGE_ACTIVATE,
    	/* page has been sent to the disk successfully, page is unlocked */
    	PAGE_SUCCESS,
    	/* page is clean and locked */
    	PAGE_CLEAN,
    } pageout_t;
    
    
    /*
    
     * pageout is called by shrink_page_list() for each dirty page.
     * Calls ->writepage().
    
     */
    
    static pageout_t pageout(struct page *page, struct address_space *mapping,
    						enum pageout_io sync_writeback)
    
    {
    	/*
    	 * If the page is dirty, only perform writeback if that write
	 * will be non-blocking, to prevent this allocation from being
    	 * stalled by pagecache activity.  But note that there may be
    	 * stalls if we need to run get_block().  We could test
    	 * PagePrivate for that.
    	 *
    	 * If this process is currently in generic_file_write() against
    	 * this page's queue, we can perform writeback even if that
    	 * will block.
    	 *
    	 * If the page is swapcache, write it back even if that would
    	 * block, for some throttling. This happens by accident, because
    	 * swap_backing_dev_info is bust: it doesn't reflect the
    	 * congestion state of the swapdevs.  Easy to fix, if needed.
    	 * See swapfile.c:page_queue_congested().
    	 */
    	if (!is_page_cache_freeable(page))
    		return PAGE_KEEP;
    	if (!mapping) {
    		/*
    		 * Some data journaling orphaned pages can have
    		 * page->mapping == NULL while being dirty with clean buffers.
    		 */
    
    		if (PagePrivate(page)) {
    
    			if (try_to_free_buffers(page)) {
    				ClearPageDirty(page);
    
    				printk("%s: orphaned page\n", __func__);
    
    				return PAGE_CLEAN;
    			}
    		}
    		return PAGE_KEEP;
    	}
    	if (mapping->a_ops->writepage == NULL)
    		return PAGE_ACTIVATE;
    	if (!may_write_to_queue(mapping->backing_dev_info))
    		return PAGE_KEEP;
    
    	if (clear_page_dirty_for_io(page)) {
    		int res;
    		struct writeback_control wbc = {
    			.sync_mode = WB_SYNC_NONE,
    			.nr_to_write = SWAP_CLUSTER_MAX,
    
    			.range_start = 0,
    			.range_end = LLONG_MAX,
    
    			.nonblocking = 1,
    			.for_reclaim = 1,
    		};
    
    		SetPageReclaim(page);
    		res = mapping->a_ops->writepage(page, &wbc);
    		if (res < 0)
    			handle_write_error(mapping, page, res);
    
    		if (res == AOP_WRITEPAGE_ACTIVATE) {
    
    			ClearPageReclaim(page);
    			return PAGE_ACTIVATE;
    		}
    
    
    		/*
    		 * Wait on writeback if requested to. This happens when
    		 * direct reclaiming a large contiguous area and the
    		 * first attempt to free a range of pages fails.
    		 */
    		if (PageWriteback(page) && sync_writeback == PAGEOUT_IO_SYNC)
    			wait_on_page_writeback(page);
    
    
    		if (!PageWriteback(page)) {
    			/* synchronous write or broken a_ops? */
    			ClearPageReclaim(page);
    		}
    
    		inc_zone_page_state(page, NR_VMSCAN_WRITE);
    
    		return PAGE_SUCCESS;
    	}
    
    	return PAGE_CLEAN;
    }
    
    
/*
 * Same as remove_mapping, but if the page is removed from the mapping, it
 * gets returned with a refcount of 0.
 */
static int __remove_mapping(struct address_space *mapping, struct page *page)
{
    	BUG_ON(!PageLocked(page));
    	BUG_ON(mapping != page_mapping(page));
    
    
    	write_lock_irq(&mapping->tree_lock);
    	/*
    
    	 * The non racy check for a busy page.
    	 *
    	 * Must be careful with the order of the tests. When someone has
    	 * a ref to the page, it may be possible that they dirty it then
    	 * drop the reference. So if PageDirty is tested before page_count
    	 * here, then the following race may occur:
    	 *
    	 * get_user_pages(&page);
    	 * [user mapping goes away]
    	 * write_to(page);
    	 *				!PageDirty(page)    [good]
    	 * SetPageDirty(page);
    	 * put_page(page);
    	 *				!page_count(page)   [good, discard it]
    	 *
    	 * [oops, our write_to data is lost]
    	 *
    	 * Reversing the order of the tests ensures such a situation cannot
    	 * escape unnoticed. The smp_rmb is needed to ensure the page->flags
    	 * load is not satisfied before that of page->_count.
    	 *
    	 * Note that if SetPageDirty is always performed via set_page_dirty,
	 * and thus under tree_lock, then this ordering is not required.
	 */
	if (!page_freeze_refs(page, 2))
		goto cannot_free;
    	/* note: atomic_cmpxchg in page_freeze_refs provides the smp_rmb */
    	if (unlikely(PageDirty(page))) {
		page_unfreeze_refs(page, 2);
		goto cannot_free;
	}

    	if (PageSwapCache(page)) {
    		swp_entry_t swap = { .val = page_private(page) };
    		__delete_from_swap_cache(page);
    		write_unlock_irq(&mapping->tree_lock);
    		swap_free(swap);
    
    	} else {
    		__remove_from_page_cache(page);
    		write_unlock_irq(&mapping->tree_lock);
    
    	}
    
    	return 1;
    
    cannot_free:
    	write_unlock_irq(&mapping->tree_lock);
    	return 0;
    }
    
    
    /*
     * Attempt to detach a locked page from its ->mapping.  If it is dirty or if
     * someone else has a ref on the page, abort and return 0.  If it was
     * successfully detached, return 1.  Assumes the caller has a single ref on
     * this page.
     */
    int remove_mapping(struct address_space *mapping, struct page *page)
    {
    	if (__remove_mapping(mapping, page)) {
    		/*
    		 * Unfreezing the refcount with 1 rather than 2 effectively
    		 * drops the pagecache ref for us without requiring another
    		 * atomic operation.
    		 */
    		page_unfreeze_refs(page, 1);
    		return 1;
    	}
    	return 0;
    }
    
    
    /*
    
     * shrink_page_list() returns the number of reclaimed pages
    
     */
    
    static unsigned long shrink_page_list(struct list_head *page_list,
    
    					struct scan_control *sc,
    					enum pageout_io sync_writeback)
    
    {
    	LIST_HEAD(ret_pages);
    	struct pagevec freed_pvec;
    	int pgactivate = 0;
    
    	unsigned long nr_reclaimed = 0;
    
    
    	cond_resched();
    
    	pagevec_init(&freed_pvec, 1);
    	while (!list_empty(page_list)) {
    		struct address_space *mapping;
    		struct page *page;
    		int may_enter_fs;
    		int referenced;
    
    		cond_resched();
    
    		page = lru_to_page(page_list);
    		list_del(&page->lru);
    
    		if (TestSetPageLocked(page))
    			goto keep;
    
    
    		VM_BUG_ON(PageActive(page));
    
    
    		sc->nr_scanned++;
    
    
    		if (!sc->may_swap && page_mapped(page))
    			goto keep_locked;
    
    
    		/* Double the slab pressure for mapped and swapcache pages */
    		if (page_mapped(page) || PageSwapCache(page))
    			sc->nr_scanned++;
    
    
    		may_enter_fs = (sc->gfp_mask & __GFP_FS) ||
    			(PageSwapCache(page) && (sc->gfp_mask & __GFP_IO));
    
    		if (PageWriteback(page)) {
    			/*
    			 * Synchronous reclaim is performed in two passes,
    			 * first an asynchronous pass over the list to
    			 * start parallel writeback, and a second synchronous
    			 * pass to wait for the IO to complete.  Wait here
    			 * for any page for which writeback has already
    			 * started.
    			 */
    			if (sync_writeback == PAGEOUT_IO_SYNC && may_enter_fs)
				wait_on_page_writeback(page);
			else
				goto keep_locked;
		}

    		referenced = page_referenced(page, 1, sc->mem_cgroup);
    
    		/* In active use or really unfreeable?  Activate it. */
    
    		if (sc->order <= PAGE_ALLOC_COSTLY_ORDER &&
    					referenced && page_mapping_inuse(page))
    
    			goto activate_locked;
    
    #ifdef CONFIG_SWAP
    		/*
    		 * Anonymous process memory has backing store?
    		 * Try to allocate it some swap space here.
    		 */
    
		if (PageAnon(page) && !PageSwapCache(page))
			if (!add_to_swap(page, GFP_ATOMIC))
    				goto activate_locked;
    #endif /* CONFIG_SWAP */
    
    		mapping = page_mapping(page);
    
    		/*
    		 * The page is mapped into the page tables of one or more
    		 * processes. Try to unmap it here.
    		 */
    		if (page_mapped(page) && mapping) {
    
    			switch (try_to_unmap(page, 0)) {
    
    			case SWAP_FAIL:
    				goto activate_locked;
    			case SWAP_AGAIN:
    				goto keep_locked;
    			case SWAP_SUCCESS:
    				; /* try to free the page below */
    			}
    		}
    
    		if (PageDirty(page)) {
    
    			if (sc->order <= PAGE_ALLOC_COSTLY_ORDER && referenced)
    
				goto keep_locked;
			if (!may_enter_fs)
    				goto keep_locked;
    
    			if (!sc->may_writepage)
    
    				goto keep_locked;
    
    			/* Page is dirty, try to write it out here */
    
    			switch (pageout(page, mapping, sync_writeback)) {
    
    			case PAGE_KEEP:
    				goto keep_locked;
    			case PAGE_ACTIVATE:
    				goto activate_locked;
    			case PAGE_SUCCESS:
    
    				if (PageWriteback(page) || PageDirty(page))
    
    					goto keep;
    				/*
    				 * A synchronous write - probably a ramdisk.  Go
    				 * ahead and try to reclaim the page.
    				 */
    				if (TestSetPageLocked(page))
    					goto keep;
    				if (PageDirty(page) || PageWriteback(page))
    					goto keep_locked;
    				mapping = page_mapping(page);
    			case PAGE_CLEAN:
    				; /* try to free the page below */
    			}
    		}
    
    		/*
    		 * If the page has buffers, try to free the buffer mappings
    		 * associated with this page. If we succeed we try to free
    		 * the page as well.
    		 *
    		 * We do this even if the page is PageDirty().
    		 * try_to_release_page() does not perform I/O, but it is
    		 * possible for a page to have PageDirty set, but it is actually
    		 * clean (all its buffers are clean).  This happens if the
    		 * buffers were written out directly, with submit_bh(). ext3
    		 * will do this, as well as the blockdev mapping. 
    		 * try_to_release_page() will discover that cleanness and will
    		 * drop the buffers and mark the page clean - it can be freed.
    		 *
    		 * Rarely, pages can have buffers and no ->mapping.  These are
    		 * the pages which were not successfully invalidated in
    		 * truncate_complete_page().  We try to drop those buffers here
    		 * and if that worked, and the page is no longer mapped into
    		 * process address space (page_count == 1) it can be freed.
    		 * Otherwise, leave the page on the LRU so it is swappable.
    		 */
    		if (PagePrivate(page)) {
    			if (!try_to_release_page(page, sc->gfp_mask))
    				goto activate_locked;
    
    			if (!mapping && page_count(page) == 1) {
    				unlock_page(page);
    				if (put_page_testzero(page))
    					goto free_it;
    				else {
    					/*
    					 * rare race with speculative reference.
    					 * the speculative reference will free
    					 * this page shortly, so we may
    					 * increment nr_reclaimed here (and
    					 * leave it off the LRU).
    					 */
    					nr_reclaimed++;
    					continue;
    				}
			}
		}

		if (!mapping || !__remove_mapping(mapping, page))
			goto keep_locked;

		unlock_page(page);
free_it:
		nr_reclaimed++;
    
    		if (!pagevec_add(&freed_pvec, page)) {
    			__pagevec_free(&freed_pvec);
    			pagevec_reinit(&freed_pvec);
    		}
    
    		continue;
    
    activate_locked:
    		SetPageActive(page);
    		pgactivate++;
    keep_locked:
    		unlock_page(page);
    keep:
    		list_add(&page->lru, &ret_pages);
    
    		VM_BUG_ON(PageLRU(page));
    
    	}
    	list_splice(&ret_pages, page_list);
    	if (pagevec_count(&freed_pvec))
    
    		__pagevec_free(&freed_pvec);
    
    	count_vm_events(PGACTIVATE, pgactivate);
    
	return nr_reclaimed;
}

    /* LRU Isolation modes. */
    #define ISOLATE_INACTIVE 0	/* Isolate inactive pages. */
    #define ISOLATE_ACTIVE 1	/* Isolate active pages. */
    #define ISOLATE_BOTH 2		/* Isolate both active and inactive pages. */
    
    /*
     * Attempt to remove the specified page from its LRU.  Only take this page
     * if it is of the appropriate PageActive status.  Pages which are being
     * freed elsewhere are also ignored.
     *
     * page:	page to consider
     * mode:	one of the LRU isolation modes defined above
     *
     * returns 0 on success, -ve errno on failure.
     */
    
    int __isolate_lru_page(struct page *page, int mode)
    
    {
    	int ret = -EINVAL;
    
    	/* Only take pages on the LRU. */
    	if (!PageLRU(page))
    		return ret;
    
    	/*
    	 * When checking the active state, we need to be sure we are
	 * dealing with comparable boolean values.  Take the logical not
    	 * of each.
    	 */
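	/*
	 * For example, with mode == ISOLATE_ACTIVE (1) only pages whose
	 * PageActive() test is true get past this check, and with
	 * ISOLATE_INACTIVE (0) only pages whose test is false do.
	 */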
    	if (mode != ISOLATE_BOTH && (!PageActive(page) != !mode))
    		return ret;
    
    	ret = -EBUSY;
    	if (likely(get_page_unless_zero(page))) {
    		/*
    		 * Be careful not to clear PageLRU until after we're
    		 * sure the page is not being freed elsewhere -- the
    		 * page release code relies on it.
    		 */
    		ClearPageLRU(page);
    		ret = 0;
    	}
    
    	return ret;
    }
    
    
    /*
     * zone->lru_lock is heavily contended.  Some of the functions that
     * shrink the lists perform better by taking out a batch of pages
     * and working on them outside the LRU lock.
     *
     * For pagecache intensive workloads, this function is the hottest
     * spot in the kernel (apart from copy_*_user functions).
     *
     * Appropriate locks must be held before calling this function.
     *
     * @nr_to_scan:	The number of pages to look through on the list.
     * @src:	The LRU list to pull pages off.
     * @dst:	The temp list to put pages on to.
     * @scanned:	The number of pages that were scanned.
    
     * @order:	The caller's attempted allocation order
     * @mode:	One of the LRU isolation modes
    
     *
     * returns how many pages were moved onto *@dst.
     */
    
    static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
    		struct list_head *src, struct list_head *dst,
    
    		unsigned long *scanned, int order, int mode)
    
    {
    
    	unsigned long nr_taken = 0;
    
    	unsigned long scan;
    
    
    
    	for (scan = 0; scan < nr_to_scan && !list_empty(src); scan++) {
    
    		struct page *page;
    		unsigned long pfn;
    		unsigned long end_pfn;
    		unsigned long page_pfn;
    		int zone_id;
    
    
    		page = lru_to_page(src);
    		prefetchw_prev_lru_page(page, src, flags);
    
    
    		VM_BUG_ON(!PageLRU(page));
    
    		switch (__isolate_lru_page(page, mode)) {
    		case 0:
			list_move(&page->lru, dst);
			nr_taken++;
    			break;
    
    		case -EBUSY:
    			/* else it is being freed elsewhere */
    			list_move(&page->lru, src);
    			continue;
    
    		default:
    			BUG();
    		}
    
    		if (!order)
    			continue;
    
    		/*
    		 * Attempt to take all pages in the order aligned region
    		 * surrounding the tag page.  Only take those pages of
    		 * the same active state as that tag page.  We may safely
    		 * round the target page pfn down to the requested order
		 * as the mem_map is guaranteed valid out to MAX_ORDER; if a
		 * page in the block is in a different zone we will detect
    		 * it from its zone id and abort this block scan.
    		 */
    		zone_id = page_zone_id(page);
    		page_pfn = page_to_pfn(page);
    		pfn = page_pfn & ~((1 << order) - 1);
    		end_pfn = pfn + (1 << order);
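		/*
		 * For instance, order == 3 and page_pfn == 0x12345 give
		 * pfn == 0x12340 and end_pfn == 0x12348, i.e. the 8-page
		 * order-aligned block containing the tag page.
		 */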
    		for (; pfn < end_pfn; pfn++) {
    			struct page *cursor_page;
    
    			/* The target page is in the block, ignore it. */
    			if (unlikely(pfn == page_pfn))
    				continue;
    
    			/* Avoid holes within the zone. */
    			if (unlikely(!pfn_valid_within(pfn)))
    				break;
    
    			cursor_page = pfn_to_page(pfn);
    			/* Check that we have not crossed a zone boundary. */
    			if (unlikely(page_zone_id(cursor_page) != zone_id))
    				continue;
    			switch (__isolate_lru_page(cursor_page, mode)) {
    			case 0:
    				list_move(&cursor_page->lru, dst);
    				nr_taken++;
    				scan++;
    				break;
    
    			case -EBUSY:
    				/* else it is being freed elsewhere */
    				list_move(&cursor_page->lru, src);
    			default:
    				break;
    			}
    		}
    
    	}
    
    	*scanned = scan;
    	return nr_taken;
    }
    
    
    static unsigned long isolate_pages_global(unsigned long nr,
    					struct list_head *dst,
    					unsigned long *scanned, int order,
    					int mode, struct zone *z,
    					struct mem_cgroup *mem_cont,
    					int active)
    {
    	if (active)
    		return isolate_lru_pages(nr, &z->active_list, dst,
    						scanned, order, mode);
    	else
    		return isolate_lru_pages(nr, &z->inactive_list, dst,
    						scanned, order, mode);
    }
    
    
    /*
 * clear_active_flags() is a helper for shrink_inactive_list(), clearing
     * any active bits from the pages in the list.
     */
    static unsigned long clear_active_flags(struct list_head *page_list)
    {
    	int nr_active = 0;
    	struct page *page;
    
    	list_for_each_entry(page, page_list, lru)
    		if (PageActive(page)) {
    			ClearPageActive(page);
    			nr_active++;
    		}
    
    	return nr_active;
    }
    
    
    /*
    
     * shrink_inactive_list() is a helper for shrink_zone().  It returns the number
     * of reclaimed pages
    
     */
    
    static unsigned long shrink_inactive_list(unsigned long max_scan,
    				struct zone *zone, struct scan_control *sc)
    
    {
    	LIST_HEAD(page_list);
    	struct pagevec pvec;
    
    	unsigned long nr_scanned = 0;
    
    	unsigned long nr_reclaimed = 0;
    
    
    	pagevec_init(&pvec, 1);
    
    	lru_add_drain();
	spin_lock_irq(&zone->lru_lock);
	do {
    		struct page *page;
    
    		unsigned long nr_taken;
    		unsigned long nr_scan;
    		unsigned long nr_freed;
    
    		unsigned long nr_active;
    
    
    
    		nr_taken = sc->isolate_pages(sc->swap_cluster_max,
    
    			     &page_list, &nr_scan, sc->order,
    			     (sc->order > PAGE_ALLOC_COSTLY_ORDER)?
    
    					     ISOLATE_BOTH : ISOLATE_INACTIVE,
    				zone, sc->mem_cgroup, 0);
    
		nr_active = clear_active_flags(&page_list);
		__count_vm_events(PGDEACTIVATE, nr_active);

    		__mod_zone_page_state(zone, NR_ACTIVE, -nr_active);
    		__mod_zone_page_state(zone, NR_INACTIVE,
    						-(nr_taken - nr_active));
    
    		if (scan_global_lru(sc))
    			zone->pages_scanned += nr_scan;
    
    		spin_unlock_irq(&zone->lru_lock);
    
    
    		nr_scanned += nr_scan;
    
    		nr_freed = shrink_page_list(&page_list, sc, PAGEOUT_IO_ASYNC);
    
    		/*
    		 * If we are direct reclaiming for contiguous pages and we do
    		 * not reclaim everything in the list, try again and wait
    		 * for IO to complete. This will stall high-order allocations
    		 * but that should be acceptable to the caller
    		 */
    		if (nr_freed < nr_taken && !current_is_kswapd() &&
    					sc->order > PAGE_ALLOC_COSTLY_ORDER) {
    			congestion_wait(WRITE, HZ/10);
    
    			/*
    			 * The attempt at page out may have made some
    			 * of the pages active, mark them inactive again.
    			 */
    			nr_active = clear_active_flags(&page_list);
    			count_vm_events(PGDEACTIVATE, nr_active);
    
    			nr_freed += shrink_page_list(&page_list, sc,
    							PAGEOUT_IO_SYNC);
    		}
    
    
    		nr_reclaimed += nr_freed;
    
    		local_irq_disable();
    		if (current_is_kswapd()) {
    
    			__count_zone_vm_events(PGSCAN_KSWAPD, zone, nr_scan);
			__count_vm_events(KSWAPD_STEAL, nr_freed);
		} else
    			__count_zone_vm_events(PGSCAN_DIRECT, zone, nr_scan);
    
		__count_zone_vm_events(PGSTEAL, zone, nr_freed);

		if (nr_taken == 0)
			goto done;

    		spin_lock(&zone->lru_lock);
    
    		/*
    		 * Put back any unfreeable pages.
    		 */
    		while (!list_empty(&page_list)) {
    			page = lru_to_page(&page_list);
    
    			VM_BUG_ON(PageLRU(page));
    
    			SetPageLRU(page);
    
    			list_del(&page->lru);
    			if (PageActive(page))
    				add_page_to_active_list(zone, page);
    			else
    				add_page_to_inactive_list(zone, page);
    			if (!pagevec_add(&pvec, page)) {
    				spin_unlock_irq(&zone->lru_lock);
    				__pagevec_release(&pvec);
    				spin_lock_irq(&zone->lru_lock);
    			}
    		}
    
      	} while (nr_scanned < max_scan);
    
    	spin_unlock(&zone->lru_lock);
    
done:
	local_irq_enable();
    	pagevec_release(&pvec);
    
	return nr_reclaimed;
}

    /*
     * We are about to scan this zone at a certain priority level.  If that priority
     * level is smaller (ie: more urgent) than the previous priority, then note
     * that priority level within the zone.  This is done so that when the next
     * process comes in to scan this zone, it will immediately start out at this
     * priority level rather than having to build up its own scanning priority.
     * Here, this priority affects only the reclaim-mapped threshold.
     */
    static inline void note_zone_scanning_priority(struct zone *zone, int priority)
    {
    	if (priority < zone->prev_priority)
    		zone->prev_priority = priority;
    }
    
    
    static inline int zone_is_near_oom(struct zone *zone)
    {
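	/*
	 * Treat the zone as nearing OOM once it has been scanned more than
	 * three times over its combined (active + inactive) LRU size.
	 */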
    
	return zone->pages_scanned >= (zone_page_state(zone, NR_ACTIVE)
				+ zone_page_state(zone, NR_INACTIVE))*3;
}

    /*
 * Determine whether we should try to reclaim mapped pages.
     * This is called only when sc->mem_cgroup is NULL.
     */
    static int calc_reclaim_mapped(struct scan_control *sc, struct zone *zone,
    				int priority)
    {
    	long mapped_ratio;
    	long distress;
    	long swap_tendency;
    	long imbalance;
    	int reclaim_mapped = 0;
    	int prev_priority;
    
    	if (scan_global_lru(sc) && zone_is_near_oom(zone))
    		return 1;
    	/*
    	 * `distress' is a measure of how much trouble we're having
    	 * reclaiming pages.  0 -> no problems.  100 -> great trouble.
    	 */
    	if (scan_global_lru(sc))
    		prev_priority = zone->prev_priority;
    	else
    		prev_priority = mem_cgroup_get_reclaim_priority(sc->mem_cgroup);
    
    	distress = 100 >> min(prev_priority, priority);
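	/*
	 * E.g. at the default starting priority of 12 (DEF_PRIORITY) this
	 * gives 100 >> 12 == 0, no distress; once reclaim has pushed the
	 * priority all the way down to 0 it gives 100 >> 0 == 100.
	 */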
    
    	/*
    	 * The point of this algorithm is to decide when to start
    	 * reclaiming mapped memory instead of just pagecache.  Work out
	 * how much memory is mapped.