    /*
     *  linux/mm/vmscan.c
     *
     *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
     *
     *  Swap reorganised 29.12.95, Stephen Tweedie.
     *  kswapd added: 7.1.96  sct
     *  Removed kswapd_ctl limits, and swap out as many pages as needed
     *  to bring the system back to freepages.high: 2.4.97, Rik van Riel.
     *  Zone aware kswapd started 02/00, Kanoj Sarcar (kanoj@sgi.com).
     *  Multiqueue VM started 5.8.00, Rik van Riel.
     */
    
    #include <linux/mm.h>
    #include <linux/module.h>
    #include <linux/slab.h>
    #include <linux/kernel_stat.h>
    #include <linux/swap.h>
    #include <linux/pagemap.h>
    #include <linux/init.h>
    #include <linux/highmem.h>
    
    #include <linux/vmstat.h>
    
    #include <linux/file.h>
    #include <linux/writeback.h>
    #include <linux/blkdev.h>
    #include <linux/buffer_head.h>	/* for try_to_release_page(),
    					buffer_heads_over_limit */
    #include <linux/mm_inline.h>
    #include <linux/pagevec.h>
    #include <linux/backing-dev.h>
    #include <linux/rmap.h>
    #include <linux/topology.h>
    #include <linux/cpu.h>
    #include <linux/cpuset.h>
    #include <linux/notifier.h>
    #include <linux/rwsem.h>
    
    #include <linux/delay.h>
    
    #include <asm/tlbflush.h>
    #include <asm/div64.h>
    
    #include <linux/swapops.h>
    
    
    #include "internal.h"
    
    
    struct scan_control {
    	/* Incremented by the number of inactive pages that were scanned */
    	unsigned long nr_scanned;
    
    	/* This context's GFP mask */
    	gfp_t gfp_mask;

    	int may_writepage;

    	/* Can pages be swapped as part of reclaim? */
    	int may_swap;

    	/* This context's SWAP_CLUSTER_MAX. If freeing memory for
    	 * suspend, we effectively ignore SWAP_CLUSTER_MAX.
    	 * In this context, it doesn't matter that we scan the
    	 * whole list at once. */
    	int swap_cluster_max;

    	/* Swappiness used for this reclaim context */
    	int swappiness;

    	int all_unreclaimable;
    };
    
    /*
     * The list of shrinker callbacks used by to apply pressure to
     * ageable caches.
     */
    struct shrinker {
    	shrinker_t		shrinker;
    	struct list_head	list;
    	int			seeks;	/* seeks to recreate an obj */
    	long			nr;	/* objs pending delete */
    };
    
    #define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru))
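    /*
     * Note: lru_to_page() picks the page at the *tail* of the list
     * (list->prev).  Pages enter the LRU at the head, so the tail is where
     * the oldest entries sit and where the scanners below start.
     */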
    
    #ifdef ARCH_HAS_PREFETCH
    #define prefetch_prev_lru_page(_page, _base, _field)			\
    	do {								\
    		if ((_page)->lru.prev != _base) {			\
    			struct page *prev;				\
    									\
    			prev = lru_to_page(&(_page->lru));		\
    			prefetch(&prev->_field);			\
    		}							\
    	} while (0)
    #else
    #define prefetch_prev_lru_page(_page, _base, _field) do { } while (0)
    #endif
    
    #ifdef ARCH_HAS_PREFETCHW
    #define prefetchw_prev_lru_page(_page, _base, _field)			\
    	do {								\
    		if ((_page)->lru.prev != _base) {			\
    			struct page *prev;				\
    									\
    			prev = lru_to_page(&(_page->lru));		\
    			prefetchw(&prev->_field);			\
    		}							\
    	} while (0)
    #else
    #define prefetchw_prev_lru_page(_page, _base, _field) do { } while (0)
    #endif
    
    /*
     * From 0 .. 100.  Higher means more swappy.
     */
    int vm_swappiness = 60;
    
    long vm_total_pages;	/* The total number of pages which the VM controls */
    
    static LIST_HEAD(shrinker_list);
    static DECLARE_RWSEM(shrinker_rwsem);
    
    /*
     * Add a shrinker callback to be called from the vm
     */
    struct shrinker *set_shrinker(int seeks, shrinker_t theshrinker)
    {
            struct shrinker *shrinker;
    
            shrinker = kmalloc(sizeof(*shrinker), GFP_KERNEL);
            if (shrinker) {
    	        shrinker->shrinker = theshrinker;
    	        shrinker->seeks = seeks;
    	        shrinker->nr = 0;
    	        down_write(&shrinker_rwsem);
    	        list_add_tail(&shrinker->list, &shrinker_list);
    	        up_write(&shrinker_rwsem);
    	}
    	return shrinker;
    }
    EXPORT_SYMBOL(set_shrinker);
    
    /*
     * Remove one
     */
    void remove_shrinker(struct shrinker *shrinker)
    {
    	down_write(&shrinker_rwsem);
    	list_del(&shrinker->list);
    	up_write(&shrinker_rwsem);
    	kfree(shrinker);
    }
    EXPORT_SYMBOL(remove_shrinker);
    
    #define SHRINK_BATCH 128
    /*
     * Call the shrink functions to age shrinkable caches
     *
     * Here we assume it costs one seek to replace a lru page and that it also
     * takes a seek to recreate a cache object.  With this in mind we age equal
     * percentages of the lru and ageable caches.  This should balance the seeks
     * generated by these structures.
     *
     * If the VM encountered mapped pages on the LRU, it increases the pressure on
     * slab to avoid swapping.
     *
     * We do weird things to avoid (scanned*seeks*entries) overflowing 32 bits.
     *
     * `lru_pages' represents the number of on-LRU pages in all the zones which
     * are eligible for the caller's allocation attempt.  It is used for balancing
     * slab reclaim versus page reclaim.
     *
     * Returns the number of slab objects which we shrunk.
     */
    
    unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask,
    			unsigned long lru_pages)
    {
    	struct shrinker *shrinker;
    
    	unsigned long ret = 0;
    
    	if (scanned == 0)
    		scanned = SWAP_CLUSTER_MAX;
    
    	if (!down_read_trylock(&shrinker_rwsem))
    		return 1;	/* Assume we'll be able to shrink next time */

    	list_for_each_entry(shrinker, &shrinker_list, list) {
    		unsigned long long delta;
    		unsigned long total_scan;
    
    		unsigned long max_pass = (*shrinker->shrinker)(0, gfp_mask);

    		delta = (4 * scanned) / shrinker->seeks;
    		delta *= max_pass;
    		do_div(delta, lru_pages + 1);
    		shrinker->nr += delta;
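    		/*
    		 * The computation above scales slab pressure in proportion to
    		 * LRU pressure: delta ~= (4 / seeks) * max_pass * scanned /
    		 * lru_pages.  Illustrative numbers only: with scanned = 1024,
    		 * lru_pages = 4096, seeks = 2 and a cache reporting
    		 * max_pass = 800 freeable objects, delta comes to roughly
    		 * (4 * 1024 / 2) * 800 / 4097 ~= 400 objects queued for
    		 * scanning below.
    		 */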
    
    		if (shrinker->nr < 0) {
    			printk(KERN_ERR "%s: nr=%ld\n",
    					__FUNCTION__, shrinker->nr);
    			shrinker->nr = max_pass;
    		}
    
    		/*
    		 * Avoid risking looping forever due to too large nr value:
    		 * never try to free more than twice the estimated number of
    		 * freeable entries.
    		 */
    		if (shrinker->nr > max_pass * 2)
    			shrinker->nr = max_pass * 2;
    
    		total_scan = shrinker->nr;
    		shrinker->nr = 0;
    
    		while (total_scan >= SHRINK_BATCH) {
    			long this_scan = SHRINK_BATCH;
    			int shrink_ret;
    			int nr_before;

    			nr_before = (*shrinker->shrinker)(0, gfp_mask);
    			shrink_ret = (*shrinker->shrinker)(this_scan, gfp_mask);
    			if (shrink_ret == -1)
    				break;
    
    			if (shrink_ret < nr_before)
    				ret += nr_before - shrink_ret;
    
    			count_vm_events(SLABS_SCANNED, this_scan);
    			total_scan -= this_scan;
    
    			cond_resched();
    		}
    
    		shrinker->nr += total_scan;
    	}
    	up_read(&shrinker_rwsem);
    
    	return ret;
    }
    
    /* Called without lock on whether page is mapped, so answer is unstable */
    static inline int page_mapping_inuse(struct page *page)
    {
    	struct address_space *mapping;
    
    	/* Page is in somebody's page tables. */
    	if (page_mapped(page))
    		return 1;
    
    	/* Be more reluctant to reclaim swapcache than pagecache */
    	if (PageSwapCache(page))
    		return 1;
    
    	mapping = page_mapping(page);
    	if (!mapping)
    		return 0;
    
    	/* File is mmap'd by somebody? */
    	return mapping_mapped(mapping);
    }
    
    static inline int is_page_cache_freeable(struct page *page)
    {
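    	/*
    	 * A freeable page cache page is pinned by exactly two references:
    	 * one from the page cache and one from our caller, which isolated
    	 * the page from the LRU.  Attached buffers (PagePrivate) hold one
    	 * more, which the expression below discounts.
    	 */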
    	return page_count(page) - !!PagePrivate(page) == 2;
    }
    
    static int may_write_to_queue(struct backing_dev_info *bdi)
    {
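    	/*
    	 * Roughly: PF_SWAPWRITE tasks (e.g. kswapd) may always queue
    	 * writes, anyone may write to an uncongested device, and a task
    	 * may keep writing to the backing device it is already writing
    	 * against (current->backing_dev_info).
    	 */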
    
    	if (current->flags & PF_SWAPWRITE)
    		return 1;
    	if (!bdi_write_congested(bdi))
    		return 1;
    	if (bdi == current->backing_dev_info)
    		return 1;
    	return 0;
    }
    
    /*
     * We detected a synchronous write error writing a page out.  Probably
     * -ENOSPC.  We need to propagate that into the address_space for a subsequent
     * fsync(), msync() or close().
     *
     * The tricky part is that after writepage we cannot touch the mapping: nothing
     * prevents it from being freed up.  But we have a ref on the page and once
     * that page is locked, the mapping is pinned.
     *
     * We're allowed to run sleeping lock_page() here because we know the caller has
     * __GFP_FS.
     */
    static void handle_write_error(struct address_space *mapping,
    				struct page *page, int error)
    {
    	lock_page(page);
    	if (page_mapping(page) == mapping) {
    		if (error == -ENOSPC)
    			set_bit(AS_ENOSPC, &mapping->flags);
    		else
    			set_bit(AS_EIO, &mapping->flags);
    	}
    	unlock_page(page);
    }
    
    
    /* possible outcome of pageout() */
    typedef enum {
    	/* failed to write page out, page is locked */
    	PAGE_KEEP,
    	/* move page to the active list, page is locked */
    	PAGE_ACTIVATE,
    	/* page has been sent to the disk successfully, page is unlocked */
    	PAGE_SUCCESS,
    	/* page is clean and locked */
    	PAGE_CLEAN,
    } pageout_t;
    
    
    /*
     * pageout is called by shrink_page_list() for each dirty page.
     * Calls ->writepage().
     */
    static pageout_t pageout(struct page *page, struct address_space *mapping)
    {
    	/*
    	 * If the page is dirty, only perform writeback if that write
    	 * will be non-blocking.  To prevent this allocation from being
    	 * stalled by pagecache activity.  But note that there may be
    	 * stalls if we need to run get_block().  We could test
    	 * PagePrivate for that.
    	 *
    	 * If this process is currently in generic_file_write() against
    	 * this page's queue, we can perform writeback even if that
    	 * will block.
    	 *
    	 * If the page is swapcache, write it back even if that would
    	 * block, for some throttling. This happens by accident, because
    	 * swap_backing_dev_info is bust: it doesn't reflect the
    	 * congestion state of the swapdevs.  Easy to fix, if needed.
    	 * See swapfile.c:page_queue_congested().
    	 */
    	if (!is_page_cache_freeable(page))
    		return PAGE_KEEP;
    	if (!mapping) {
    		/*
    		 * Some data journaling orphaned pages can have
    		 * page->mapping == NULL while being dirty with clean buffers.
    		 */
    
    		if (PagePrivate(page)) {
    			if (try_to_free_buffers(page)) {
    				ClearPageDirty(page);
    				printk("%s: orphaned page\n", __FUNCTION__);
    				return PAGE_CLEAN;
    			}
    		}
    		return PAGE_KEEP;
    	}
    	if (mapping->a_ops->writepage == NULL)
    		return PAGE_ACTIVATE;
    	if (!may_write_to_queue(mapping->backing_dev_info))
    		return PAGE_KEEP;
    
    	if (clear_page_dirty_for_io(page)) {
    		int res;
    		struct writeback_control wbc = {
    			.sync_mode = WB_SYNC_NONE,
    			.nr_to_write = SWAP_CLUSTER_MAX,
    			.range_start = 0,
    			.range_end = LLONG_MAX,
    			.nonblocking = 1,
    			.for_reclaim = 1,
    		};
    
    		SetPageReclaim(page);
    		res = mapping->a_ops->writepage(page, &wbc);
    		if (res < 0)
    			handle_write_error(mapping, page, res);
    
    		if (res == AOP_WRITEPAGE_ACTIVATE) {
    			ClearPageReclaim(page);
    			return PAGE_ACTIVATE;
    		}
    		if (!PageWriteback(page)) {
    			/* synchronous write or broken a_ops? */
    			ClearPageReclaim(page);
    		}
    
    		inc_zone_page_state(page, NR_VMSCAN_WRITE);
    		return PAGE_SUCCESS;
    	}
    
    	return PAGE_CLEAN;
    }
    
    
    /*
     * Attempt to detach a locked page from its ->mapping.  If it is dirty or if
     * someone else has a ref on the page, abort and return 0.  If it was
     * successfully detached, return 1.  Assumes the caller has a single ref on
     * this page.
     */
    
    int remove_mapping(struct address_space *mapping, struct page *page)
    {
    	BUG_ON(!PageLocked(page));
    	BUG_ON(mapping != page_mapping(page));

    	write_lock_irq(&mapping->tree_lock);
    	/*
    	 * The non-racy check for a busy page.
    	 *
    	 * Must be careful with the order of the tests. When someone has
    	 * a ref to the page, it may be possible that they dirty it then
    	 * drop the reference. So if PageDirty is tested before page_count
    	 * here, then the following race may occur:
    	 *
    	 * get_user_pages(&page);
    	 * [user mapping goes away]
    	 * write_to(page);
    	 *				!PageDirty(page)    [good]
    	 * SetPageDirty(page);
    	 * put_page(page);
    	 *				!page_count(page)   [good, discard it]
    	 *
    	 * [oops, our write_to data is lost]
    	 *
    	 * Reversing the order of the tests ensures such a situation cannot
    	 * escape unnoticed. The smp_rmb is needed to ensure the page->flags
    	 * load is not satisfied before that of page->_count.
    	 *
    	 * Note that if SetPageDirty is always performed via set_page_dirty,
    	 * and thus under tree_lock, then this ordering is not required.
    	 */
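    	/*
    	 * A count of exactly 2 means only the page cache and our caller's
    	 * single reference (see the comment above this function) remain;
    	 * anything higher means the page is still in use elsewhere.
    	 */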
    	if (unlikely(page_count(page) != 2))
    		goto cannot_free;
    	smp_rmb();
    	if (unlikely(PageDirty(page)))
    		goto cannot_free;
    
    	if (PageSwapCache(page)) {
    		swp_entry_t swap = { .val = page_private(page) };
    		__delete_from_swap_cache(page);
    		write_unlock_irq(&mapping->tree_lock);
    		swap_free(swap);
    		__put_page(page);	/* The pagecache ref */
    		return 1;
    	}
    
    	__remove_from_page_cache(page);
    	write_unlock_irq(&mapping->tree_lock);
    	__put_page(page);
    	return 1;
    
    cannot_free:
    	write_unlock_irq(&mapping->tree_lock);
    	return 0;
    }
    
    
    /*
     * shrink_page_list() returns the number of reclaimed pages
     */
    static unsigned long shrink_page_list(struct list_head *page_list,
    					struct scan_control *sc)
    {
    	LIST_HEAD(ret_pages);
    	struct pagevec freed_pvec;
    	int pgactivate = 0;
    
    	unsigned long nr_reclaimed = 0;

    	cond_resched();
    
    	pagevec_init(&freed_pvec, 1);
    	while (!list_empty(page_list)) {
    		struct address_space *mapping;
    		struct page *page;
    		int may_enter_fs;
    		int referenced;
    
    		cond_resched();
    
    		page = lru_to_page(page_list);
    		list_del(&page->lru);
    
    		if (TestSetPageLocked(page))
    			goto keep;

    		VM_BUG_ON(PageActive(page));

    		sc->nr_scanned++;

    		if (!sc->may_swap && page_mapped(page))
    			goto keep_locked;

    		/* Double the slab pressure for mapped and swapcache pages */
    		if (page_mapped(page) || PageSwapCache(page))
    			sc->nr_scanned++;
    
    		if (PageWriteback(page))
    			goto keep_locked;

    		referenced = page_referenced(page, 1);
    		/* In active use or really unfreeable?  Activate it. */
    		if (referenced && page_mapping_inuse(page))
    			goto activate_locked;
    
    #ifdef CONFIG_SWAP
    		/*
    		 * Anonymous process memory has backing store?
    		 * Try to allocate it some swap space here.
    		 */
    
    		if (PageAnon(page) && !PageSwapCache(page))
    			if (!add_to_swap(page, GFP_ATOMIC))
    				goto activate_locked;
    #endif /* CONFIG_SWAP */
    
    		mapping = page_mapping(page);
    		may_enter_fs = (sc->gfp_mask & __GFP_FS) ||
    			(PageSwapCache(page) && (sc->gfp_mask & __GFP_IO));
    
    		/*
    		 * The page is mapped into the page tables of one or more
    		 * processes. Try to unmap it here.
    		 */
    		if (page_mapped(page) && mapping) {
    			switch (try_to_unmap(page, 0)) {
    			case SWAP_FAIL:
    				goto activate_locked;
    			case SWAP_AGAIN:
    				goto keep_locked;
    			case SWAP_SUCCESS:
    				; /* try to free the page below */
    			}
    		}
    
    		if (PageDirty(page)) {
    			if (referenced)
    				goto keep_locked;
    			if (!may_enter_fs)
    				goto keep_locked;
    
    			if (!sc->may_writepage)
    				goto keep_locked;
    
    			/* Page is dirty, try to write it out here */
    			switch(pageout(page, mapping)) {
    			case PAGE_KEEP:
    				goto keep_locked;
    			case PAGE_ACTIVATE:
    				goto activate_locked;
    			case PAGE_SUCCESS:
    				if (PageWriteback(page) || PageDirty(page))
    					goto keep;
    				/*
    				 * A synchronous write - probably a ramdisk.  Go
    				 * ahead and try to reclaim the page.
    				 */
    				if (TestSetPageLocked(page))
    					goto keep;
    				if (PageDirty(page) || PageWriteback(page))
    					goto keep_locked;
    				mapping = page_mapping(page);
    			case PAGE_CLEAN:
    				; /* try to free the page below */
    			}
    		}
    
    		/*
    		 * If the page has buffers, try to free the buffer mappings
    		 * associated with this page. If we succeed we try to free
    		 * the page as well.
    		 *
    		 * We do this even if the page is PageDirty().
    		 * try_to_release_page() does not perform I/O, but it is
    		 * possible for a page to have PageDirty set, but it is actually
    		 * clean (all its buffers are clean).  This happens if the
    		 * buffers were written out directly, with submit_bh(). ext3
    		 * will do this, as well as the blockdev mapping. 
    		 * try_to_release_page() will discover that cleanness and will
    		 * drop the buffers and mark the page clean - it can be freed.
    		 *
    		 * Rarely, pages can have buffers and no ->mapping.  These are
    		 * the pages which were not successfully invalidated in
    		 * truncate_complete_page().  We try to drop those buffers here
    		 * and if that worked, and the page is no longer mapped into
    		 * process address space (page_count == 1) it can be freed.
    		 * Otherwise, leave the page on the LRU so it is swappable.
    		 */
    		if (PagePrivate(page)) {
    			if (!try_to_release_page(page, sc->gfp_mask))
    				goto activate_locked;
    			if (!mapping && page_count(page) == 1)
    				goto free_it;
    		}
    
    
    		if (!mapping || !remove_mapping(mapping, page))
    			goto keep_locked;

    free_it:
    		unlock_page(page);
    
    		nr_reclaimed++;
    		if (!pagevec_add(&freed_pvec, page))
    			__pagevec_release_nonlru(&freed_pvec);
    		continue;
    
    activate_locked:
    		SetPageActive(page);
    		pgactivate++;
    keep_locked:
    		unlock_page(page);
    keep:
    		list_add(&page->lru, &ret_pages);
    		VM_BUG_ON(PageLRU(page));
    	}
    	list_splice(&ret_pages, page_list);
    	if (pagevec_count(&freed_pvec))
    		__pagevec_release_nonlru(&freed_pvec);
    
    	count_vm_events(PGACTIVATE, pgactivate);
    
    	return nr_reclaimed;
    }
    
    /*
     * zone->lru_lock is heavily contended.  Some of the functions that
     * shrink the lists perform better by taking out a batch of pages
     * and working on them outside the LRU lock.
     *
     * For pagecache intensive workloads, this function is the hottest
     * spot in the kernel (apart from copy_*_user functions).
     *
     * Appropriate locks must be held before calling this function.
     *
     * @nr_to_scan:	The number of pages to look through on the list.
     * @src:	The LRU list to pull pages off.
     * @dst:	The temp list to put pages on to.
     * @scanned:	The number of pages that were scanned.
     *
     * returns how many pages were moved onto *@dst.
     */
    
    static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
    		struct list_head *src, struct list_head *dst,
    		unsigned long *scanned)
    {
    	unsigned long nr_taken = 0;
    	struct page *page;
    	unsigned long scan;

    	for (scan = 0; scan < nr_to_scan && !list_empty(src); scan++) {
    		struct list_head *target;

    		page = lru_to_page(src);
    		prefetchw_prev_lru_page(page, src, flags);

    		VM_BUG_ON(!PageLRU(page));
    		list_del(&page->lru);

    		target = src;
    		if (likely(get_page_unless_zero(page))) {
    			/*
    			 * Be careful not to clear PageLRU until after we're
    			 * sure the page is not being freed elsewhere -- the
    			 * page release code relies on it.
    			 */
    			ClearPageLRU(page);
    			target = dst;
    			nr_taken++;
    		} /* else it is being freed elsewhere */
    
    		list_add(&page->lru, target);
    	}
    
    	*scanned = scan;
    	return nr_taken;
    }
    
    /*
     * shrink_inactive_list() is a helper for shrink_zone().  It returns the number
     * of reclaimed pages
     */
    static unsigned long shrink_inactive_list(unsigned long max_scan,
    				struct zone *zone, struct scan_control *sc)
    {
    	LIST_HEAD(page_list);
    	struct pagevec pvec;
    	unsigned long nr_scanned = 0;
    	unsigned long nr_reclaimed = 0;
    
    	pagevec_init(&pvec, 1);
    
    	lru_add_drain();
    	spin_lock_irq(&zone->lru_lock);
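    	/*
    	 * Pages are pulled off the inactive list in batches of up to
    	 * sc->swap_cluster_max so that zone->lru_lock can be dropped while
    	 * shrink_page_list() does the real work, then retaken to put back
    	 * whatever could not be freed.
    	 */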
    
    	do {
    		struct page *page;
    		unsigned long nr_taken;
    		unsigned long nr_scan;
    		unsigned long nr_freed;

    		nr_taken = isolate_lru_pages(sc->swap_cluster_max,
    					     &zone->inactive_list,
    					     &page_list, &nr_scan);
    		zone->nr_inactive -= nr_taken;
    		zone->pages_scanned += nr_scan;
    		spin_unlock_irq(&zone->lru_lock);
    
    
    		nr_scanned += nr_scan;
    		nr_freed = shrink_page_list(&page_list, sc);
    		nr_reclaimed += nr_freed;
    		local_irq_disable();
    		if (current_is_kswapd()) {
    			__count_zone_vm_events(PGSCAN_KSWAPD, zone, nr_scan);
    			__count_vm_events(KSWAPD_STEAL, nr_freed);
    		} else
    			__count_zone_vm_events(PGSCAN_DIRECT, zone, nr_scan);
    		__count_zone_vm_events(PGSTEAL, zone, nr_freed);

    		if (nr_taken == 0)
    			goto done;

    		spin_lock(&zone->lru_lock);
    		/*
    		 * Put back any unfreeable pages.
    		 */
    		while (!list_empty(&page_list)) {
    			page = lru_to_page(&page_list);
    			VM_BUG_ON(PageLRU(page));
    			SetPageLRU(page);
    			list_del(&page->lru);
    			if (PageActive(page))
    				add_page_to_active_list(zone, page);
    			else
    				add_page_to_inactive_list(zone, page);
    			if (!pagevec_add(&pvec, page)) {
    				spin_unlock_irq(&zone->lru_lock);
    				__pagevec_release(&pvec);
    				spin_lock_irq(&zone->lru_lock);
    			}
    		}
    
    	} while (nr_scanned < max_scan);
    	spin_unlock(&zone->lru_lock);
    done:
    	local_irq_enable();
    	pagevec_release(&pvec);
    	return nr_reclaimed;
    }

    static inline int zone_is_near_oom(struct zone *zone)
    {
    	return zone->pages_scanned >= (zone->nr_active + zone->nr_inactive)*3;
    }
    
    
    /*
     * This moves pages from the active list to the inactive list.
     *
     * We move them the other way if the page is referenced by one or more
     * processes, from rmap.
     *
     * If the pages are mostly unmapped, the processing is fast and it is
     * appropriate to hold zone->lru_lock across the whole operation.  But if
     * the pages are mapped, the processing is slow (page_referenced()) so we
     * should drop zone->lru_lock around each page.  It's impossible to balance
     * this, so instead we remove the pages from the LRU while processing them.
     * It is safe to rely on PG_active against the non-LRU pages in here because
     * nobody will play with that bit on a non-LRU page.
     *
     * The downside is that we have to touch page->_count against each page.
     * But we had to alter page->flags anyway.
     */
    
    static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
    				struct scan_control *sc)
    {
    	unsigned long pgmoved;
    	int pgdeactivate = 0;
    	unsigned long pgscanned;
    	LIST_HEAD(l_hold);	/* The pages which were snipped off */
    	LIST_HEAD(l_inactive);	/* Pages to go onto the inactive_list */
    	LIST_HEAD(l_active);	/* Pages to go onto the active_list */
    	struct page *page;
    	struct pagevec pvec;
    	int reclaim_mapped = 0;

    	if (sc->may_swap) {
    		long mapped_ratio;
    		long distress;
    		long swap_tendency;

    		if (zone_is_near_oom(zone))
    			goto force_reclaim_mapped;

    		/*
    		 * `distress' is a measure of how much trouble we're having
    		 * reclaiming pages.  0 -> no problems.  100 -> great trouble.
    		 */
    		distress = 100 >> zone->prev_priority;
    
    		/*
    		 * The point of this algorithm is to decide when to start
    		 * reclaiming mapped memory instead of just pagecache.  Work
    		 * out how much memory is mapped.
    		 */
    
    		mapped_ratio = ((global_page_state(NR_FILE_MAPPED) +
    				global_page_state(NR_ANON_PAGES)) * 100) /
    					vm_total_pages;

    		/*
    		 * Now decide how much we really want to unmap some pages.  The
    		 * mapped ratio is downgraded - just because there's a lot of
    		 * mapped memory doesn't necessarily mean that page reclaim
    		 * isn't succeeding.
    		 *
    		 * The distress ratio is important - we don't want to start
    		 * going oom.
    		 *
    		 * A 100% value of vm_swappiness overrides this algorithm
    		 * altogether.
    		 */
    
    		swap_tendency = mapped_ratio / 2 + distress + sc->swappiness;
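    		/*
    		 * Illustrative numbers only: with prev_priority == 3 the
    		 * distress term is 100 >> 3 == 12; if 60% of memory is mapped
    		 * the mapped_ratio term contributes 30; with a swappiness of
    		 * 60 (the vm_swappiness default) that sums to 102, crossing
    		 * the threshold of 100 below and enabling reclaim of mapped
    		 * pages.
    		 */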
    
    
    		/*
    		 * Now use this metric to decide whether to start moving mapped
    		 * memory onto the inactive list.
    		 */
    		if (swap_tendency >= 100)
    force_reclaim_mapped:
    			reclaim_mapped = 1;
    	}

    	lru_add_drain();
    	spin_lock_irq(&zone->lru_lock);
    	pgmoved = isolate_lru_pages(nr_pages, &zone->active_list,
    				    &l_hold, &pgscanned);
    	zone->pages_scanned += pgscanned;
    	zone->nr_active -= pgmoved;
    	spin_unlock_irq(&zone->lru_lock);
    
    	while (!list_empty(&l_hold)) {
    		cond_resched();
    		page = lru_to_page(&l_hold);
    		list_del(&page->lru);
    		if (page_mapped(page)) {
    			if (!reclaim_mapped ||
    			    (total_swap_pages == 0 && PageAnon(page)) ||
    			    page_referenced(page, 0)) {
    				list_add(&page->lru, &l_active);
    				continue;
    			}
    		}
    		list_add(&page->lru, &l_inactive);
    	}
    
    	pagevec_init(&pvec, 1);
    	pgmoved = 0;
    	spin_lock_irq(&zone->lru_lock);
    	while (!list_empty(&l_inactive)) {
    		page = lru_to_page(&l_inactive);
    		prefetchw_prev_lru_page(page, &l_inactive, flags);
    
    		VM_BUG_ON(PageLRU(page));
    		SetPageLRU(page);
    		VM_BUG_ON(!PageActive(page));
    		ClearPageActive(page);

    		list_move(&page->lru, &zone->inactive_list);
    		pgmoved++;
    		if (!pagevec_add(&pvec, page)) {
    			zone->nr_inactive += pgmoved;
    			spin_unlock_irq(&zone->lru_lock);
    			pgdeactivate += pgmoved;
    			pgmoved = 0;
    			if (buffer_heads_over_limit)
    				pagevec_strip(&pvec);
    			__pagevec_release(&pvec);
    			spin_lock_irq(&zone->lru_lock);
    		}
    	}
    	zone->nr_inactive += pgmoved;
    	pgdeactivate += pgmoved;
    	if (buffer_heads_over_limit) {
    		spin_unlock_irq(&zone->lru_lock);
    		pagevec_strip(&pvec);
    		spin_lock_irq(&zone->lru_lock);
    	}
    
    	pgmoved = 0;
    	while (!list_empty(&l_active)) {
    		page = lru_to_page(&l_active);
    		prefetchw_prev_lru_page(page, &l_active, flags);
    
    		VM_BUG_ON(PageLRU(page));
    		SetPageLRU(page);
    		VM_BUG_ON(!PageActive(page));
    		list_move(&page->lru, &zone->active_list);
    		pgmoved++;
    		if (!pagevec_add(&pvec, page)) {
    			zone->nr_active += pgmoved;
    			pgmoved = 0;
    			spin_unlock_irq(&zone->lru_lock);
    			__pagevec_release(&pvec);
    			spin_lock_irq(&zone->lru_lock);
    		}
    	}
    	zone->nr_active += pgmoved;
    
    	__count_zone_vm_events(PGREFILL, zone, pgscanned);
    	__count_vm_events(PGDEACTIVATE, pgdeactivate);
    	spin_unlock_irq(&zone->lru_lock);
    
    	pagevec_release(&pvec);
    }
    
    /*
     * This is a basic per-zone page freer.  Used by both kswapd and direct reclaim.
     */
    
    static unsigned long shrink_zone(int priority, struct zone *zone,
    				struct scan_control *sc)
    {
    	unsigned long nr_active;
    	unsigned long nr_inactive;
    	unsigned long nr_to_scan;
    	unsigned long nr_reclaimed = 0;

    	atomic_inc(&zone->reclaim_in_progress);

    	/*
    	 * Add one to `nr_to_scan' just to make sure that the kernel will
    	 * slowly sift through the active list.
    	 */
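    	/*
    	 * The '>> priority' term queues only a fraction of the list per
    	 * pass: about 1/2^DEF_PRIORITY of the pages under light pressure,
    	 * doubling as priority drops towards zero.  The work accumulates in
    	 * nr_scan_active and is only carried out once it reaches
    	 * sc->swap_cluster_max.
    	 */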
    
    	zone->nr_scan_active += (zone->nr_active >> priority) + 1;
    	nr_active = zone->nr_scan_active;
    	if (nr_active >= sc->swap_cluster_max)
    		zone->nr_scan_active = 0;
    	else
    		nr_active = 0;
    
    
    	zone->nr_scan_inactive += (zone->nr_inactive >> priority) + 1;
    	nr_inactive = zone->nr_scan_inactive;
    	if (nr_inactive >= sc->swap_cluster_max)
    		zone->nr_scan_inactive = 0;
    	else
    		nr_inactive = 0;
    
    	while (nr_active || nr_inactive) {
    		if (nr_active) {
    			nr_to_scan = min(nr_active,
    					(unsigned long)sc->swap_cluster_max);
    			nr_active -= nr_to_scan;
    			shrink_active_list(nr_to_scan, zone, sc);
    		}

    		if (nr_inactive) {
    			nr_to_scan = min(nr_inactive,
    					(unsigned long)sc->swap_cluster_max);
    			nr_inactive -= nr_to_scan;
    			nr_reclaimed += shrink_inactive_list(nr_to_scan, zone,
    								sc);
    		}
    	}
    
    	throttle_vm_writeout();
    
    
    	atomic_dec(&zone->reclaim_in_progress);

    	return nr_reclaimed;
    }
    
    /*
     * This is the direct reclaim path, for page-allocating processes.  We only
     * try to reclaim pages from zones which will satisfy the caller's allocation
     * request.
     *
     * We reclaim from a zone even if that zone is over pages_high.  Because:
     * a) The caller may be trying to free *extra* pages to satisfy a higher-order
     *    allocation or
     * b) The zones may be over pages_high but they must go *over* pages_high to
     *    satisfy the `incremental min' zone defense algorithm.
     *
     * Returns the number of reclaimed pages.
     *
     * If a zone is deemed to be full of pinned pages then just give it a light
     * scan then give up on it.
     */
    
    static unsigned long shrink_zones(int priority, struct zone **zones,
    					struct scan_control *sc)
    {
    	unsigned long nr_reclaimed = 0;
    	int i;

    	sc->all_unreclaimable = 1;

    	for (i = 0; zones[i] != NULL; i++) {
    		struct zone *zone = zones[i];
    
    
    		if (!populated_zone(zone))
    			continue;

    		if (!cpuset_zone_allowed(zone, __GFP_HARDWALL))
    			continue;

    		zone->temp_priority = priority;
    		if (zone->prev_priority > priority)
    			zone->prev_priority = priority;

    		if (zone->all_unreclaimable && priority != DEF_PRIORITY)
    			continue;	/* Let kswapd poll it */

    		sc->all_unreclaimable = 0;

    		nr_reclaimed += shrink_zone(priority, zone, sc);
    	}
    	return nr_reclaimed;
    }
     
    /*
     * This is the main entry point to direct page reclaim.
     *
     * If a full scan of the inactive list fails to free enough memory then we
     * are "out of memory" and something needs to be killed.
     *
     * If the caller is !__GFP_FS then the probability of a failure is reasonably
     * high - the zone may be full of dirty or under-writeback pages, which this
     * caller can't do much about.  We kick pdflush and take explicit naps in the
     * hope that some of these pages can be written.  But if the allocating task
     * holds filesystem locks which prevent writeout this might not work, and the
     * allocation attempt will fail.