    	*scanned = scan;
    	return nr_taken;
    }
    
    
    static unsigned long isolate_pages_global(unsigned long nr,
    					struct list_head *dst,
    					unsigned long *scanned, int order,
					int mode, struct zone *z,
					int active, int file)
{
	int lru = LRU_BASE;

	if (active)
		lru += LRU_ACTIVE;
    	if (file)
    		lru += LRU_FILE;
	return isolate_lru_pages(nr, &z->lru[lru].list, dst, scanned, order,
								mode, file);
}

    /*
     * clear_active_flags() is a helper for shrink_active_list(), clearing
     * any active bits from the pages in the list.
     */
    
    static unsigned long clear_active_flags(struct list_head *page_list,
    					unsigned int *count)
    
    {
	int nr_active = 0;
	int lru;
    	struct page *page;
    
    
    	list_for_each_entry(page, page_list, lru) {
    
		lru = page_lru_base_type(page);
    		if (PageActive(page)) {
    
			lru += LRU_ACTIVE;
    			ClearPageActive(page);
    			nr_active++;
		}
		count[lru]++;
	}

    	return nr_active;
    }
    
    
    /**
     * isolate_lru_page - tries to isolate a page from its LRU list
     * @page: page to isolate from its LRU list
     *
     * Isolates a @page from an LRU list, clears PageLRU and adjusts the
     * vmstat statistic corresponding to whatever LRU list the page was on.
     *
     * Returns 0 if the page was removed from an LRU list.
     * Returns -EBUSY if the page was not on an LRU list.
     *
 * The returned page will have PageLRU() cleared.  If it was found on
 * the active list, it will have PageActive set.  If it was found on
 * the unevictable list, it will have the PageUnevictable bit set. That flag
 * may need to be cleared by the caller before letting the page go.
     *
     * The vmstat statistic corresponding to the list on which the page was
     * found will be decremented.
     *
     * Restrictions:
     * (1) Must be called with an elevated refcount on the page. This is a
 *     fundamental difference from isolate_lru_pages (which is called
     *     without a stable reference).
     * (2) the lru_lock must not be held.
     * (3) interrupts must be enabled.
     */
    int isolate_lru_page(struct page *page)
    {
    	int ret = -EBUSY;
    
    	if (PageLRU(page)) {
    		struct zone *zone = page_zone(page);
    
    		spin_lock_irq(&zone->lru_lock);
    		if (PageLRU(page) && get_page_unless_zero(page)) {
    
    			int lru = page_lru(page);
    
    			ret = 0;
    			ClearPageLRU(page);
    
    
    			del_page_from_lru_list(zone, page, lru);
    
    		}
    		spin_unlock_irq(&zone->lru_lock);
    	}
    	return ret;
    }
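
/*
 * Illustrative usage sketch (not code from this file): a typical caller
 * already holds its own reference (e.g. from find_get_page()) and then does
 * roughly:
 *
 *	if (isolate_lru_page(page) == 0) {
 *		... work on the page while it is off the LRU ...
 *		putback_lru_page(page);
 *	}
 *	put_page(page);
 *
 * putback_lru_page() re-adds the page to the LRU it belongs on and drops the
 * extra reference that isolate_lru_page() took via get_page_unless_zero();
 * the caller's own reference is dropped separately with put_page().
 */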
    
    
    /*
     * Are there way too many processes in the direct reclaim path already?
     */
    static int too_many_isolated(struct zone *zone, int file,
    		struct scan_control *sc)
    {
    	unsigned long inactive, isolated;
    
    	if (current_is_kswapd())
    		return 0;
    
    	if (!scanning_global_lru(sc))
    		return 0;
    
    	if (file) {
    		inactive = zone_page_state(zone, NR_INACTIVE_FILE);
    		isolated = zone_page_state(zone, NR_ISOLATED_FILE);
    	} else {
    		inactive = zone_page_state(zone, NR_INACTIVE_ANON);
    		isolated = zone_page_state(zone, NR_ISOLATED_ANON);
    	}
    
    	return isolated > inactive;
    }
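
/*
 * Illustration (made-up numbers): if NR_INACTIVE_FILE is 800 but concurrent
 * direct reclaimers have already pushed NR_ISOLATED_FILE to 1000, this
 * returns 1 and the while loop at the top of shrink_inactive_list() stalls
 * the caller in congestion_wait() until some of those pages are put back.
 * kswapd and memcg-limit reclaim are exempted by the early returns above.
 */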
    
    
/*
 * shrink_inactive_list() is a helper for shrink_zone().  It returns the number
 * of reclaimed pages
     */
    
    static unsigned long shrink_inactive_list(unsigned long max_scan,
    
    			struct zone *zone, struct scan_control *sc,
    			int priority, int file)
    
    {
    	LIST_HEAD(page_list);
    	struct pagevec pvec;
    
    	unsigned long nr_scanned = 0;
    
    	unsigned long nr_reclaimed = 0;
    
    	struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc);
    
    	while (unlikely(too_many_isolated(zone, file, sc))) {
    
    		congestion_wait(BLK_RW_ASYNC, HZ/10);
    
    
    		/* We are about to die and free our memory. Return now. */
    		if (fatal_signal_pending(current))
    			return SWAP_CLUSTER_MAX;
	}

    	pagevec_init(&pvec, 1);
    
    	lru_add_drain();
	spin_lock_irq(&zone->lru_lock);
	do {
    		struct page *page;
    
    		unsigned long nr_taken;
    		unsigned long nr_scan;
		unsigned long nr_freed;
    		unsigned long nr_active;
    
    		unsigned int count[NR_LRU_LISTS] = { 0, };
    
    		int mode = sc->lumpy_reclaim_mode ? ISOLATE_BOTH : ISOLATE_INACTIVE;
    
    		unsigned long nr_anon;
		unsigned long nr_file;

    		if (scanning_global_lru(sc)) {
    
    			nr_taken = isolate_pages_global(SWAP_CLUSTER_MAX,
    							&page_list, &nr_scan,
    							sc->order, mode,
    							zone, 0, file);
    
    			zone->pages_scanned += nr_scan;
    			if (current_is_kswapd())
    				__count_zone_vm_events(PGSCAN_KSWAPD, zone,
    						       nr_scan);
    			else
    				__count_zone_vm_events(PGSCAN_DIRECT, zone,
    						       nr_scan);
    
    		} else {
    			nr_taken = mem_cgroup_isolate_pages(SWAP_CLUSTER_MAX,
    							&page_list, &nr_scan,
    							sc->order, mode,
    							zone, sc->mem_cgroup,
    							0, file);
    			/*
    			 * mem_cgroup_isolate_pages() keeps track of
    			 * scanned pages on its own.
			 */
		}

		if (nr_taken == 0)
			goto done;

		nr_active = clear_active_flags(&page_list, count);
		__count_vm_events(PGDEACTIVATE, nr_active);

    		__mod_zone_page_state(zone, NR_ACTIVE_FILE,
    						-count[LRU_ACTIVE_FILE]);
    		__mod_zone_page_state(zone, NR_INACTIVE_FILE,
    						-count[LRU_INACTIVE_FILE]);
    		__mod_zone_page_state(zone, NR_ACTIVE_ANON,
    						-count[LRU_ACTIVE_ANON]);
    		__mod_zone_page_state(zone, NR_INACTIVE_ANON,
    						-count[LRU_INACTIVE_ANON]);
    
    
    		nr_anon = count[LRU_ACTIVE_ANON] + count[LRU_INACTIVE_ANON];
    		nr_file = count[LRU_ACTIVE_FILE] + count[LRU_INACTIVE_FILE];
    		__mod_zone_page_state(zone, NR_ISOLATED_ANON, nr_anon);
		__mod_zone_page_state(zone, NR_ISOLATED_FILE, nr_file);

    		reclaim_stat->recent_scanned[0] += nr_anon;
		reclaim_stat->recent_scanned[1] += nr_file;

    		spin_unlock_irq(&zone->lru_lock);
    
    
    		nr_scanned += nr_scan;
    
    		nr_freed = shrink_page_list(&page_list, sc, PAGEOUT_IO_ASYNC);
    
    		/*
    		 * If we are direct reclaiming for contiguous pages and we do
    		 * not reclaim everything in the list, try again and wait
    		 * for IO to complete. This will stall high-order allocations
    		 * but that should be acceptable to the caller
    		 */
		if (nr_freed < nr_taken && !current_is_kswapd() &&
		    sc->lumpy_reclaim_mode) {
    			congestion_wait(BLK_RW_ASYNC, HZ/10);
    
    
    			/*
    			 * The attempt at page out may have made some
    			 * of the pages active, mark them inactive again.
    			 */
    
    			nr_active = clear_active_flags(&page_list, count);
    
    			count_vm_events(PGDEACTIVATE, nr_active);
    
    			nr_freed += shrink_page_list(&page_list, sc,
    							PAGEOUT_IO_SYNC);
    		}
    
    
    		nr_reclaimed += nr_freed;
    
    		local_irq_disable();
    
    		if (current_is_kswapd())
    
    			__count_vm_events(KSWAPD_STEAL, nr_freed);
    
		__count_zone_vm_events(PGSTEAL, zone, nr_freed);

		spin_lock(&zone->lru_lock);
    		/*
    		 * Put back any unfreeable pages.
    		 */
		while (!list_empty(&page_list)) {
			int lru;
			page = lru_to_page(&page_list);
			VM_BUG_ON(PageLRU(page));
    			list_del(&page->lru);
    
    			if (unlikely(!page_evictable(page, NULL))) {
    				spin_unlock_irq(&zone->lru_lock);
    				putback_lru_page(page);
    				spin_lock_irq(&zone->lru_lock);
    				continue;
    			}
    			SetPageLRU(page);
    			lru = page_lru(page);
    			add_page_to_lru_list(zone, page, lru);
    
    			if (is_active_lru(lru)) {
    
    				int file = is_file_lru(lru);
    
				reclaim_stat->recent_rotated[file]++;
			}
    			if (!pagevec_add(&pvec, page)) {
    				spin_unlock_irq(&zone->lru_lock);
    				__pagevec_release(&pvec);
    				spin_lock_irq(&zone->lru_lock);
    			}
    		}
    
    		__mod_zone_page_state(zone, NR_ISOLATED_ANON, -nr_anon);
		__mod_zone_page_state(zone, NR_ISOLATED_FILE, -nr_file);

	} while (nr_scanned < max_scan);

    done:
    
	spin_unlock_irq(&zone->lru_lock);
    	pagevec_release(&pvec);
    
	return nr_reclaimed;
}

    /*
     * We are about to scan this zone at a certain priority level.  If that priority
     * level is smaller (ie: more urgent) than the previous priority, then note
     * that priority level within the zone.  This is done so that when the next
     * process comes in to scan this zone, it will immediately start out at this
     * priority level rather than having to build up its own scanning priority.
     * Here, this priority affects only the reclaim-mapped threshold.
     */
    static inline void note_zone_scanning_priority(struct zone *zone, int priority)
    {
    	if (priority < zone->prev_priority)
    		zone->prev_priority = priority;
}

    /*
     * This moves pages from the active list to the inactive list.
     *
     * We move them the other way if the page is referenced by one or more
     * processes, from rmap.
     *
     * If the pages are mostly unmapped, the processing is fast and it is
     * appropriate to hold zone->lru_lock across the whole operation.  But if
     * the pages are mapped, the processing is slow (page_referenced()) so we
     * should drop zone->lru_lock around each page.  It's impossible to balance
     * this, so instead we remove the pages from the LRU while processing them.
     * It is safe to rely on PG_active against the non-LRU pages in here because
     * nobody will play with that bit on a non-LRU page.
     *
     * The downside is that we have to touch page->_count against each page.
     * But we had to alter page->flags anyway.
     */
    
    static void move_active_pages_to_lru(struct zone *zone,
    				     struct list_head *list,
    				     enum lru_list lru)
    {
    	unsigned long pgmoved = 0;
    	struct pagevec pvec;
    	struct page *page;
    
    	pagevec_init(&pvec, 1);
    
    	while (!list_empty(list)) {
    		page = lru_to_page(list);
    
    		VM_BUG_ON(PageLRU(page));
    		SetPageLRU(page);
    
    		list_move(&page->lru, &zone->lru[lru].list);
    		mem_cgroup_add_lru_list(page, lru);
    		pgmoved++;
    
    		if (!pagevec_add(&pvec, page) || list_empty(list)) {
    			spin_unlock_irq(&zone->lru_lock);
    			if (buffer_heads_over_limit)
    				pagevec_strip(&pvec);
    			__pagevec_release(&pvec);
    			spin_lock_irq(&zone->lru_lock);
    		}
    	}
    	__mod_zone_page_state(zone, NR_LRU_BASE + lru, pgmoved);
    	if (!is_active_lru(lru))
    		__count_vm_events(PGDEACTIVATE, pgmoved);
    }
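
/*
 * Note on the batching above (illustrative): the pagevec holds a small batch
 * of pages (PAGEVEC_SIZE, which is 14 here), so zone->lru_lock is dropped
 * roughly every 14 pages while __pagevec_release() drops the isolation
 * references, rather than holding the lock across page freeing.
 */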
    
    static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
    
			struct scan_control *sc, int priority, int file)
    {
    
    	unsigned long nr_taken;
    
    	unsigned long pgscanned;
    
	unsigned long vm_flags;
	LIST_HEAD(l_hold);	/* The pages which were snipped off */
	LIST_HEAD(l_active);
	LIST_HEAD(l_inactive);
    	struct page *page;
    
    	struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc);
    
	unsigned long nr_rotated = 0;
    
    	lru_add_drain();
    	spin_lock_irq(&zone->lru_lock);
    
    	if (scanning_global_lru(sc)) {
    
    		nr_taken = isolate_pages_global(nr_pages, &l_hold,
    						&pgscanned, sc->order,
    						ISOLATE_ACTIVE, zone,
    						1, file);
    
    	} else {
    		nr_taken = mem_cgroup_isolate_pages(nr_pages, &l_hold,
    						&pgscanned, sc->order,
    						ISOLATE_ACTIVE, zone,
    						sc->mem_cgroup, 1, file);
    		/*
    		 * mem_cgroup_isolate_pages() keeps track of
    		 * scanned pages on its own.
		 */
	}

    	reclaim_stat->recent_scanned[file] += nr_taken;
    
	__count_zone_vm_events(PGREFILL, zone, pgscanned);
	if (file)
		__mod_zone_page_state(zone, NR_ACTIVE_FILE, -nr_taken);
	else
		__mod_zone_page_state(zone, NR_ACTIVE_ANON, -nr_taken);
	__mod_zone_page_state(zone, NR_ISOLATED_ANON + file, nr_taken);
    	spin_unlock_irq(&zone->lru_lock);
    
    	while (!list_empty(&l_hold)) {
    		cond_resched();
    		page = lru_to_page(&l_hold);
    		list_del(&page->lru);
    
    		if (unlikely(!page_evictable(page, NULL))) {
    			putback_lru_page(page);
    			continue;
    		}
    
    
		if (page_referenced(page, 0, sc->mem_cgroup, &vm_flags)) {
			nr_rotated++;
    			/*
    			 * Identify referenced, file-backed active pages and
    			 * give them one more trip around the active list. So
    			 * that executable code get better chances to stay in
    			 * memory under moderate memory pressure.  Anon pages
    			 * are not likely to be evicted by use-once streaming
    			 * IO, plus JVM can create lots of anon VM_EXEC pages,
    			 * so we ignore them here.
    			 */
    
    			if ((vm_flags & VM_EXEC) && page_is_file_cache(page)) {
    
    				list_add(&page->lru, &l_active);
    				continue;
    			}
    		}
    
		ClearPageActive(page);	/* we are de-activating */
    		list_add(&page->lru, &l_inactive);
	}

	/*
	 * Move pages back to the lru list.
	 */
	spin_lock_irq(&zone->lru_lock);
	/*
	 * Count referenced pages from currently used mappings as rotated,
	 * even though only some of them are actually re-activated.  This
	 * helps balance scan pressure between file and anonymous pages in
	 * get_scan_ratio.
	 */
    	reclaim_stat->recent_rotated[file] += nr_rotated;
    
    	move_active_pages_to_lru(zone, &l_active,
    						LRU_ACTIVE + file * LRU_FILE);
    	move_active_pages_to_lru(zone, &l_inactive,
    						LRU_BASE   + file * LRU_FILE);
    
    	__mod_zone_page_state(zone, NR_ISOLATED_ANON + file, -nr_taken);
    
	spin_unlock_irq(&zone->lru_lock);
}

    static int inactive_anon_is_low_global(struct zone *zone)
    
    {
    	unsigned long active, inactive;
    
    	active = zone_page_state(zone, NR_ACTIVE_ANON);
    	inactive = zone_page_state(zone, NR_INACTIVE_ANON);
    
    	if (inactive * zone->inactive_ratio < active)
    		return 1;
    
    	return 0;
    }
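
/*
 * Worked example (illustrative numbers): zone->inactive_ratio is sized at
 * boot from the zone size; suppose it is 3.  With 3000 active and 900
 * inactive anon pages, 900 * 3 = 2700 < 3000, so this returns 1 and
 * shrink_zone() will deactivate some anon pages even when it is not
 * otherwise scanning anon.
 */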
    
    
    /**
     * inactive_anon_is_low - check if anonymous pages need to be deactivated
     * @zone: zone to check
     * @sc:   scan control of this context
     *
     * Returns true if the zone does not have enough inactive anon pages,
     * meaning some active anon pages need to be deactivated.
     */
    static int inactive_anon_is_low(struct zone *zone, struct scan_control *sc)
    {
	int low;

	if (scanning_global_lru(sc))
		low = inactive_anon_is_low_global(zone);
	else
		low = mem_cgroup_inactive_anon_is_low(sc->mem_cgroup);
	return low;
}

    static int inactive_file_is_low_global(struct zone *zone)
    {
    	unsigned long active, inactive;
    
    	active = zone_page_state(zone, NR_ACTIVE_FILE);
    	inactive = zone_page_state(zone, NR_INACTIVE_FILE);
    
    	return (active > inactive);
    }
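
/*
 * Unlike the anon check above, file pages use a plain 1:1 comparison:
 * deactivation is considered necessary whenever the active file list is
 * larger than the inactive one, matching the use-once behaviour described
 * in the comment below.
 */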
    
    /**
     * inactive_file_is_low - check if file pages need to be deactivated
     * @zone: zone to check
     * @sc:   scan control of this context
     *
     * When the system is doing streaming IO, memory pressure here
     * ensures that active file pages get deactivated, until more
     * than half of the file pages are on the inactive list.
     *
     * Once we get to that situation, protect the system's working
     * set from being evicted by disabling active file page aging.
     *
     * This uses a different ratio than the anonymous pages, because
     * the page cache uses a use-once replacement algorithm.
     */
    static int inactive_file_is_low(struct zone *zone, struct scan_control *sc)
    {
    	int low;
    
    	if (scanning_global_lru(sc))
    		low = inactive_file_is_low_global(zone);
    	else
    		low = mem_cgroup_inactive_file_is_low(sc->mem_cgroup);
    	return low;
    }
    
    
    static int inactive_list_is_low(struct zone *zone, struct scan_control *sc,
    				int file)
    {
    	if (file)
    		return inactive_file_is_low(zone, sc);
    	else
    		return inactive_anon_is_low(zone, sc);
    }
    
    
static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan,
	struct zone *zone, struct scan_control *sc, int priority)
{
	int file = is_file_lru(lru);

    	if (is_active_lru(lru)) {
    		if (inactive_list_is_low(zone, sc, file))
			shrink_active_list(nr_to_scan, zone, sc, priority, file);
		return 0;
	}

	return shrink_inactive_list(nr_to_scan, zone, sc, priority, file);
}

    /*
     * Smallish @nr_to_scan's are deposited in @nr_saved_scan,
     * until we collected @swap_cluster_max pages to scan.
     */
    static unsigned long nr_scan_try_batch(unsigned long nr_to_scan,
    				       unsigned long *nr_saved_scan)
    {
    	unsigned long nr;
    
    	*nr_saved_scan += nr_to_scan;
    	nr = *nr_saved_scan;
    
    	if (nr >= SWAP_CLUSTER_MAX)
    		*nr_saved_scan = 0;
    	else
    		nr = 0;
    
    	return nr;
    }
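
/*
 * Worked example (SWAP_CLUSTER_MAX is 32): three calls with nr_to_scan of
 * 10, 10 and 15 against the same *nr_saved_scan return 0, 0 and then 35,
 * after which the saved count is reset to 0.  Small per-priority scan
 * targets are therefore accumulated until a full batch is worth doing.
 */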
    
    
    /*
     * Determine how aggressively the anon and file LRU lists should be
     * scanned.  The relative value of each set of LRU lists is determined
     * by looking at the fraction of the pages scanned we did rotate back
     * onto the active list instead of evict.
     *
    
 * nr[0] = anon pages to scan; nr[1] = file pages to scan
 */
    static void get_scan_count(struct zone *zone, struct scan_control *sc,
    					unsigned long *nr, int priority)
    
    {
    	unsigned long anon, file, free;
    	unsigned long anon_prio, file_prio;
    	unsigned long ap, fp;
    
    	struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc);
    
    	u64 fraction[2], denominator;
    	enum lru_list l;
    	int noswap = 0;
    
    	/* If we have no swap space, do not bother scanning anon pages. */
    	if (!sc->may_swap || (nr_swap_pages <= 0)) {
    		noswap = 1;
    		fraction[0] = 0;
    		fraction[1] = 1;
    		denominator = 1;
    		goto out;
    	}
    
    	anon  = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_ANON) +
    		zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON);
    	file  = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_FILE) +
    		zone_nr_lru_pages(zone, sc, LRU_INACTIVE_FILE);
    
    	if (scanning_global_lru(sc)) {
    
    		free  = zone_page_state(zone, NR_FREE_PAGES);
    		/* If we have very few page cache pages,
    		   force-scan anon pages. */
    
    		if (unlikely(file + free <= high_wmark_pages(zone))) {
    
    			fraction[0] = 1;
    			fraction[1] = 0;
    			denominator = 1;
			goto out;
		}
    	}
    
    	/*
    	 * OK, so we have swap space and a fair amount of page cache
    	 * pages.  We use the recently rotated / recently scanned
    	 * ratios to determine how valuable each cache is.
    	 *
    	 * Because workloads change over time (and to avoid overflow)
    	 * we keep these statistics as a floating average, which ends
    	 * up weighing recent references more than old ones.
    	 *
    	 * anon in [0], file in [1]
    	 */
    
    	if (unlikely(reclaim_stat->recent_scanned[0] > anon / 4)) {
    
    		spin_lock_irq(&zone->lru_lock);
    
    		reclaim_stat->recent_scanned[0] /= 2;
    		reclaim_stat->recent_rotated[0] /= 2;
    
    		spin_unlock_irq(&zone->lru_lock);
    	}
    
    
    	if (unlikely(reclaim_stat->recent_scanned[1] > file / 4)) {
    
    		spin_lock_irq(&zone->lru_lock);
    
    		reclaim_stat->recent_scanned[1] /= 2;
    		reclaim_stat->recent_rotated[1] /= 2;
    
    		spin_unlock_irq(&zone->lru_lock);
    	}
    
    	/*
    	 * With swappiness at 100, anonymous and file have the same priority.
    	 * This scanning priority is essentially the inverse of IO cost.
    	 */
    	anon_prio = sc->swappiness;
    	file_prio = 200 - sc->swappiness;
    
    	/*
    
    	 * The amount of pressure on anon vs file pages is inversely
    	 * proportional to the fraction of recently scanned pages on
	 * each list that were recently referenced and in active use.
	 */
    	ap = (anon_prio + 1) * (reclaim_stat->recent_scanned[0] + 1);
    	ap /= reclaim_stat->recent_rotated[0] + 1;
    
    	fp = (file_prio + 1) * (reclaim_stat->recent_scanned[1] + 1);
    	fp /= reclaim_stat->recent_rotated[1] + 1;
    
    	fraction[0] = ap;
    	fraction[1] = fp;
    	denominator = ap + fp + 1;
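
	/*
	 * Worked example (illustrative numbers), assuming swappiness == 60:
	 * anon_prio = 60, file_prio = 140.  With recent_scanned/recent_rotated
	 * of 100/50 for anon and 100/10 for file:
	 *   ap = 61 * 101 / 51  = 120
	 *   fp = 141 * 101 / 11 = 1294
	 * denominator = 1415, so roughly 91% of the scan pressure below goes
	 * to the file lists and 9% to anon, before the >> priority scaling.
	 */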
    out:
    	for_each_evictable_lru(l) {
    		int file = is_file_lru(l);
    		unsigned long scan;
    
    		scan = zone_nr_lru_pages(zone, sc, l);
    		if (priority || noswap) {
    			scan >>= priority;
    			scan = div64_u64(scan * fraction[file], denominator);
    		}
    		nr[l] = nr_scan_try_batch(scan,
    					  &reclaim_stat->nr_saved_scan[l]);
	}
}

    static void set_lumpy_reclaim_mode(int priority, struct scan_control *sc)
    {
    	/*
    	 * If we need a large contiguous chunk of memory, or have
    	 * trouble getting a small set of contiguous pages, we
    	 * will reclaim both active and inactive pages.
    	 */
    	if (sc->order > PAGE_ALLOC_COSTLY_ORDER)
    		sc->lumpy_reclaim_mode = 1;
    	else if (sc->order && priority < DEF_PRIORITY - 2)
    		sc->lumpy_reclaim_mode = 1;
    	else
    		sc->lumpy_reclaim_mode = 0;
    }
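
/*
 * Example: with PAGE_ALLOC_COSTLY_ORDER == 3 and DEF_PRIORITY == 12, an
 * order-4 request (16 contiguous pages) runs in lumpy reclaim mode from the
 * first pass, an order-2 request only once priority has dropped below 10
 * (i.e. after a few unsuccessful passes), and order-0 reclaim never does.
 */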
    
    
    /*
     * This is a basic per-zone page freer.  Used by both kswapd and direct reclaim.
     */
    
static void shrink_zone(int priority, struct zone *zone,
				struct scan_control *sc)
    {
    
	unsigned long nr[NR_LRU_LISTS];
	unsigned long nr_to_scan;
	unsigned long nr_reclaimed = sc->nr_reclaimed;
	unsigned long nr_to_reclaim = sc->nr_to_reclaim;
	enum lru_list l;

	get_scan_count(zone, sc, nr, priority);

    	while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
    					nr[LRU_INACTIVE_FILE]) {
    
		for_each_evictable_lru(l) {
			if (nr[l]) {
    				nr_to_scan = min_t(unsigned long,
						   nr[l], SWAP_CLUSTER_MAX);
				nr[l] -= nr_to_scan;

    				nr_reclaimed += shrink_list(l, nr_to_scan,
							    zone, sc, priority);
			}
    		}
    
    		/*
    		 * On large memory systems, scan >> priority can become
    		 * really large. This is fine for the starting priority;
    		 * we want to put equal scanning pressure on each zone.
    		 * However, if the VM has a harder time of freeing pages,
    		 * with multiple processes reclaiming pages, the total
    		 * freeing target can get unreasonably large.
    		 */
    
		if (nr_reclaimed >= nr_to_reclaim && priority < DEF_PRIORITY)
			break;
	}

    	sc->nr_reclaimed = nr_reclaimed;
    
    
    	/*
    	 * Even if we did not try to evict anon pages at all, we want to
    	 * rebalance the anon lru active/inactive ratio.
    	 */
    
	if (inactive_anon_is_low(zone, sc) && nr_swap_pages > 0)
		shrink_active_list(SWAP_CLUSTER_MAX, zone, sc, priority, 0);
    }
    
    /*
     * This is the direct reclaim path, for page-allocating processes.  We only
     * try to reclaim pages from zones which will satisfy the caller's allocation
     * request.
     *
    
 * We reclaim from a zone even if that zone is over high_wmark_pages(zone).
 * Because:
 * a) The caller may be trying to free *extra* pages to satisfy a higher-order
 *    allocation or
 * b) The target zone may be at high_wmark_pages(zone) but the lower zones
 *    must go *over* high_wmark_pages(zone) to satisfy the `incremental min'
 *    zone defense algorithm.
     *
     * If a zone is deemed to be full of pinned pages then just give it a light
     * scan then give up on it.
     */
    
static bool shrink_zones(int priority, struct zonelist *zonelist,
					struct scan_control *sc)
    {
    
	enum zone_type high_zoneidx = gfp_zone(sc->gfp_mask);
	struct zoneref *z;
	struct zone *zone;
	bool all_unreclaimable = true;

    	for_each_zone_zonelist_nodemask(zone, z, zonelist, high_zoneidx,
    					sc->nodemask) {
    
		if (!populated_zone(zone))
    			continue;
    
    		/*
    		 * Take care memory controller reclaiming has small influence
    		 * to global LRU.
    		 */
    
    		if (scanning_global_lru(sc)) {
    
    			if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
    				continue;
			note_zone_scanning_priority(zone, priority);

    			if (zone->all_unreclaimable && priority != DEF_PRIORITY)
    
    				continue;	/* Let kswapd poll it */
    		} else {
    			/*
    			 * Ignore cpuset limitation here. We just want to reduce
    			 * # of used pages by us regardless of memory shortage.
    			 */
    			mem_cgroup_note_reclaim_priority(sc->mem_cgroup,
    							priority);
    		}
    
		shrink_zone(priority, zone, sc);
		all_unreclaimable = false;
	}

	return all_unreclaimable;
    }
    
    /*
     * This is the main entry point to direct page reclaim.
     *
     * If a full scan of the inactive list fails to free enough memory then we
     * are "out of memory" and something needs to be killed.
     *
     * If the caller is !__GFP_FS then the probability of a failure is reasonably
 * high - the zone may be full of dirty or under-writeback pages, which this
 * caller can't do much about.  We kick the writeback threads and take explicit
 * naps in the hope that some of these pages can be written.  But if the
 * allocating task holds filesystem locks which prevent writeout this might not
 * work, and the allocation attempt will fail.
 *
 * returns:	0, if no pages reclaimed
 * 		else, the number of pages reclaimed
     */
    
static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
					struct scan_control *sc)
    {
	int priority;
	bool all_unreclaimable;
	unsigned long total_scanned = 0;
	struct reclaim_state *reclaim_state = current->reclaim_state;
	struct zoneref *z;
	struct zone *zone;
	enum zone_type high_zoneidx = gfp_zone(sc->gfp_mask);
	unsigned long writeback_threshold;
    
    
    	delayacct_freepages_start();
    
    
	if (scanning_global_lru(sc))
		count_vm_event(ALLOCSTALL);

    	for (priority = DEF_PRIORITY; priority >= 0; priority--) {
    
    		sc->nr_scanned = 0;
    
    		if (!priority)
    			disable_swap_token();
    
    		all_unreclaimable = shrink_zones(priority, zonelist, sc);
    
    		/*
    		 * Don't shrink slabs when reclaiming memory from
    		 * over limit cgroups
    		 */
    
    		if (scanning_global_lru(sc)) {
    
    			unsigned long lru_pages = 0;
    			for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
    				if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
    					continue;
    
    				lru_pages += zone_reclaimable_pages(zone);
    			}
    
    
			shrink_slab(sc->nr_scanned, sc->gfp_mask, lru_pages);
			if (reclaim_state) {
				sc->nr_reclaimed += reclaim_state->reclaimed_slab;
				reclaim_state->reclaimed_slab = 0;
			}
    		}
    
    		total_scanned += sc->nr_scanned;
    
		if (sc->nr_reclaimed >= sc->nr_to_reclaim)
    			goto out;
    
    		/*
    		 * Try to write back as many pages as we just scanned.  This
    		 * tends to cause slow streaming writers to write data to the
    		 * disk smoothly, at the dirtying rate, which is nice.   But
    		 * that's undesirable in laptop mode, where we *want* lumpy
    		 * writeout.  So in laptop mode, write out the whole world.
    		 */
    
    		writeback_threshold = sc->nr_to_reclaim + sc->nr_to_reclaim / 2;
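		/*
		 * e.g. with the usual direct reclaim target of
		 * SWAP_CLUSTER_MAX (32) pages, writeback_threshold is 48:
		 * once more than 48 pages have been scanned in total, the
		 * flusher threads are kicked and pageout is allowed.
		 */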
    		if (total_scanned > writeback_threshold) {
    
    			wakeup_flusher_threads(laptop_mode ? 0 : total_scanned);
    
			sc->may_writepage = 1;
    		}
    
    		/* Take a nap, wait for some writeback to complete */
    
    		if (!sc->hibernation_mode && sc->nr_scanned &&
    		    priority < DEF_PRIORITY - 2)
    
			congestion_wait(BLK_RW_ASYNC, HZ/10);
	}

    out:
    
    	/*
    	 * Now that we've scanned all the zones at this priority level, note
    	 * that level within the zone so that the next thread which performs
    	 * scanning of this zone will immediately start out at this priority
    	 * level.  This affects only the decision whether or not to bring
    	 * mapped pages onto the inactive list.
    	 */
    	if (priority < 0)
		priority = 0;

    	if (scanning_global_lru(sc)) {
    
    		for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
    
    
    			if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
    				continue;
    
    			zone->prev_priority = priority;
    		}
    	} else
		mem_cgroup_record_reclaim_priority(sc->mem_cgroup, priority);

    	delayacct_freepages_end();
    
    	if (sc->nr_reclaimed)
    		return sc->nr_reclaimed;
    
    	/* top priority shrink_zones still had more to do? don't OOM, then */
    	if (scanning_global_lru(sc) && !all_unreclaimable)
    		return 1;
    
	return 0;
}

    unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
    
    				gfp_t gfp_mask, nodemask_t *nodemask)
    
    {
    	struct scan_control sc = {
    		.gfp_mask = gfp_mask,
    		.may_writepage = !laptop_mode,
    
    		.nr_to_reclaim = SWAP_CLUSTER_MAX,
    
    		.may_unmap = 1,
    
    		.may_swap = 1,
    
    		.swappiness = vm_swappiness,
    		.order = order,
    		.mem_cgroup = NULL,
    
		.nodemask = nodemask,
	};

	return do_try_to_free_pages(zonelist, &sc);
}

    #ifdef CONFIG_CGROUP_MEM_RES_CTLR
    
    unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
    						gfp_t gfp_mask, bool noswap,
    						unsigned int swappiness,
    						struct zone *zone, int nid)
    {
    	struct scan_control sc = {
    		.may_writepage = !laptop_mode,
    		.may_unmap = 1,
    		.may_swap = !noswap,
    		.swappiness = swappiness,
    		.order = 0,
    		.mem_cgroup = mem,
    	};
    	nodemask_t nm  = nodemask_of_node(nid);
    
    	sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
    			(GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK);
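	/*
	 * Assumption on intent: GFP_RECLAIM_MASK covers the reclaim-behaviour
	 * bits (__GFP_IO, __GFP_FS, __GFP_WAIT, ...), so the expression above
	 * keeps the caller's reclaim constraints while taking the zone
	 * selection/movability bits from GFP_HIGHUSER_MOVABLE.
	 */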
    	sc.nodemask = &nm;
    	sc.nr_reclaimed = 0;
    	sc.nr_scanned = 0;
    	/*
    	 * NOTE: Although we can get the priority field, using it
    	 * here is not a good idea, since it limits the pages we can scan.
    	 * if we don't reclaim here, the shrink_zone from balance_pgdat
    	 * will pick up pages from other mem cgroup's as well. We hack
    	 * the priority and make it zero.
    	 */
    	shrink_zone(0, zone, &sc);
    	return sc.nr_reclaimed;
    }
    
    
unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
					   gfp_t gfp_mask,
					   bool noswap,
					   unsigned int swappiness)
{
    	struct zonelist *zonelist;
    
    	struct scan_control sc = {
    		.may_writepage = !laptop_mode,
    
    		.may_unmap = 1,
    
    		.may_swap = !noswap,
    
		.nr_to_reclaim = SWAP_CLUSTER_MAX,
    		.swappiness = swappiness,
    
    		.order = 0,
    		.mem_cgroup = mem_cont,
    
		.nodemask = NULL, /* we don't care the placement */
	};

    	sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
    			(GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK);
    	zonelist = NODE_DATA(numa_node_id())->node_zonelists;
	return do_try_to_free_pages(zonelist, &sc);
}

static int sleeping_prematurely(pg_data_t *pgdat, int order, long remaining)
{
	int i;

    	/* If a direct reclaimer woke kswapd within HZ/10, it's premature */
    	if (remaining)
    		return 1;
    
    	/* If after HZ/10, a zone is below the high mark, it's premature */
    
    	for (i = 0; i < pgdat->nr_zones; i++) {
    		struct zone *zone = pgdat->node_zones + i;
    
    		if (!populated_zone(zone))
    			continue;
    
    
		if (zone->all_unreclaimable)
			continue;

    		if (!zone_watermark_ok(zone, order, high_wmark_pages(zone),
    								0, 0))
			return 1;
	}

	return 0;
}

    /*
 * For kswapd, balance_pgdat() will work across all this node's zones until
 * they are all at high_wmark_pages(zone).
     *
     * Returns the number of pages which were actually freed.
     *
     * There is special handling here for zones which are full of pinned pages.
     * This can happen if the pages are all mlocked, or if they are all used by
     * device drivers (say, ZONE_DMA).  Or if they are all in use by hugetlb.
     * What we do is to detect the case where all pages in the zone have been
     * scanned twice and there has been zero successful reclaim.  Mark the zone as
     * dead and from now on, only perform a short scan.  Basically we're polling
     * the zone for when the problem goes away.
     *
 * kswapd scans the zones in the highmem->normal->dma direction.  It skips
 * zones which have free_pages > high_wmark_pages(zone), but once a zone is