vmscan.c
	 *
	 * ISOLATE_ASYNC_MIGRATE is used to indicate that it only wants pages
	 * that it is possible to migrate without blocking
    	 */
    	if (mode & (ISOLATE_CLEAN|ISOLATE_ASYNC_MIGRATE)) {
    		/* All the caller can do on PageWriteback is block */
    		if (PageWriteback(page))
    			return ret;
    
    		if (PageDirty(page)) {
    			struct address_space *mapping;
    
    			/* ISOLATE_CLEAN means only clean pages */
    			if (mode & ISOLATE_CLEAN)
    				return ret;
    
    			/*
    			 * Only pages without mappings or that have a
    			 * ->migratepage callback are possible to migrate
    			 * without blocking
    			 */
    			mapping = page_mapping(page);
    			if (mapping && !mapping->a_ops->migratepage)
    				return ret;
    		}
    	}
    
    	if ((mode & ISOLATE_UNMAPPED) && page_mapped(page))
    		return ret;
    
    
    	if (likely(get_page_unless_zero(page))) {
    		/*
    		 * Be careful not to clear PageLRU until after we're
    		 * sure the page is not being freed elsewhere -- the
    		 * page release code relies on it.
    		 */
    		ClearPageLRU(page);
    		ret = 0;
    	}
    
    	return ret;
    }
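
/*
 * Illustrative scenario (added to this annotated excerpt, not part of the
 * original file): an async compaction caller passes ISOLATE_ASYNC_MIGRATE,
 * so a dirty page whose mapping has no ->migratepage callback is skipped
 * by the checks above instead of risking a blocking writeout later.
 */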
    
    
    /*
     * zone->lru_lock is heavily contended.  Some of the functions that
     * shrink the lists perform better by taking out a batch of pages
     * and working on them outside the LRU lock.
     *
     * For pagecache intensive workloads, this function is the hottest
     * spot in the kernel (apart from copy_*_user functions).
     *
     * Appropriate locks must be held before calling this function.
     *
 * @nr_to_scan:	The number of pages to look through on the list.
 * @mz:		The mem_cgroup_zone to pull pages from.
 * @dst:	The temp list to put pages on to.
 * @nr_scanned:	The number of pages that were scanned.
 * @sc:		The scan_control struct for this reclaim session
 * @mode:	One of the LRU isolation modes
 * @active:	True [1] if isolating active pages
 * @file:	True [1] if isolating file [!anon] pages
     *
     * returns how many pages were moved onto *@dst.
     */
    
static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
		struct mem_cgroup_zone *mz, struct list_head *dst,
		unsigned long *nr_scanned, struct scan_control *sc,
		isolate_mode_t mode, int active, int file)
{
	struct lruvec *lruvec;
	struct list_head *src;
	unsigned long nr_taken = 0;
	unsigned long scan;
	int lru = LRU_BASE;
    
    	lruvec = mem_cgroup_zone_lruvec(mz->zone, mz->mem_cgroup);
    	if (active)
    		lru += LRU_ACTIVE;
    	if (file)
    		lru += LRU_FILE;
    	src = &lruvec->lists[lru];
    
    	for (scan = 0; scan < nr_to_scan && !list_empty(src); scan++) {
    
    		struct page *page;
    
    
    		page = lru_to_page(src);
    		prefetchw_prev_lru_page(page, src, flags);
    
    
    		VM_BUG_ON(!PageLRU(page));
    
    		switch (__isolate_lru_page(page, mode, file)) {
    
    		case 0:
    
    			mem_cgroup_lru_del(page);
    
    			list_move(&page->lru, dst);
    
    			nr_taken += hpage_nr_pages(page);
    
    			break;
    
    		case -EBUSY:
    			/* else it is being freed elsewhere */
    			list_move(&page->lru, src);
    			continue;
    
		default:
			BUG();
		}
	}

	*nr_scanned = scan;

	trace_mm_vmscan_lru_isolate(sc->order,
			nr_to_scan, scan,
			nr_taken,
			mode, file);
    
    	return nr_taken;
    }
    
    
    /**
     * isolate_lru_page - tries to isolate a page from its LRU list
     * @page: page to isolate from its LRU list
     *
     * Isolates a @page from an LRU list, clears PageLRU and adjusts the
     * vmstat statistic corresponding to whatever LRU list the page was on.
     *
     * Returns 0 if the page was removed from an LRU list.
     * Returns -EBUSY if the page was not on an LRU list.
     *
 * The returned page will have PageLRU() cleared.  If it was found on
 * the active list, it will have PageActive set.  If it was found on
 * the unevictable list, it will have the PageUnevictable bit set. That flag
 * may need to be cleared by the caller before letting the page go.
     *
     * The vmstat statistic corresponding to the list on which the page was
     * found will be decremented.
     *
     * Restrictions:
     * (1) Must be called with an elevated refcount on the page. This is a
 *     fundamental difference from isolate_lru_pages (which is called
     *     without a stable reference).
     * (2) the lru_lock must not be held.
     * (3) interrupts must be enabled.
     */
    int isolate_lru_page(struct page *page)
    {
    	int ret = -EBUSY;
    
    
    	if (PageLRU(page)) {
    		struct zone *zone = page_zone(page);
    
		spin_lock_irq(&zone->lru_lock);
		if (PageLRU(page)) {
			int lru = page_lru(page);
			ret = 0;
			get_page(page);
			ClearPageLRU(page);

			del_page_from_lru_list(zone, page, lru);
		}
    		spin_unlock_irq(&zone->lru_lock);
    	}
    	return ret;
    }
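
/*
 * Illustrative usage only (added to this annotated excerpt, not part of the
 * original file): a typical caller such as page migration already holds a
 * reference and does roughly:
 *
 *	get_page(page);
 *	if (!isolate_lru_page(page)) {
 *		... operate on the page while it is off the LRU ...
 *		putback_lru_page(page);
 *	}
 *	put_page(page);
 */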
    
    
    /*
     * Are there way too many processes in the direct reclaim path already?
     */
    static int too_many_isolated(struct zone *zone, int file,
    		struct scan_control *sc)
    {
    	unsigned long inactive, isolated;
    
	if (current_is_kswapd())
		return 0;

	if (!global_reclaim(sc))
		return 0;
    
    	if (file) {
    		inactive = zone_page_state(zone, NR_INACTIVE_FILE);
    		isolated = zone_page_state(zone, NR_ISOLATED_FILE);
    	} else {
    		inactive = zone_page_state(zone, NR_INACTIVE_ANON);
    		isolated = zone_page_state(zone, NR_ISOLATED_ANON);
    	}
    
    	return isolated > inactive;
    }
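
/*
 * Illustrative numbers (added to this annotated excerpt, not part of the
 * original file): with 4000 inactive file pages on the zone and 5000 file
 * pages already isolated by concurrent direct reclaimers, this returns 1,
 * and shrink_inactive_list() below backs off in congestion_wait() instead
 * of isolating even more pages.
 */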
    
    
static noinline_for_stack void
putback_inactive_pages(struct mem_cgroup_zone *mz,
		       struct list_head *page_list)
{
	struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(mz);
	struct zone *zone = mz->zone;
	LIST_HEAD(pages_to_free);

	/*
	 * Put back any unfreeable pages.
	 */
	while (!list_empty(page_list)) {
		struct page *page = lru_to_page(page_list);
		int lru;

		VM_BUG_ON(PageLRU(page));
		list_del(&page->lru);
		if (unlikely(!page_evictable(page, NULL))) {
			spin_unlock_irq(&zone->lru_lock);
			putback_lru_page(page);
			spin_lock_irq(&zone->lru_lock);
			continue;
		}
		SetPageLRU(page);
		lru = page_lru(page);
		add_page_to_lru_list(zone, page, lru);
		if (is_active_lru(lru)) {
			int file = is_file_lru(lru);
			int numpages = hpage_nr_pages(page);
			reclaim_stat->recent_rotated[file] += numpages;
		}
		if (put_page_testzero(page)) {
			__ClearPageLRU(page);
			__ClearPageActive(page);
			del_page_from_lru_list(zone, page, lru);

			if (unlikely(PageCompound(page))) {
				spin_unlock_irq(&zone->lru_lock);
				(*get_compound_page_dtor(page))(page);
				spin_lock_irq(&zone->lru_lock);
			} else
				list_add(&page->lru, &pages_to_free);
		}
	}

	/*
	 * To save our caller's stack, now use input list for pages to free.
	 */
	list_splice(&pages_to_free, page_list);
}
    
    static noinline_for_stack void
update_isolated_counts(struct mem_cgroup_zone *mz,
		       struct list_head *page_list,
		       unsigned long *nr_anon,
		       unsigned long *nr_file)
{
	struct zone *zone = mz->zone;
	unsigned int count[NR_LRU_LISTS] = { 0, };
    
    	unsigned long nr_active = 0;
    	struct page *page;
    	int lru;
    
    	/*
    	 * Count pages and clear active flags
    	 */
    	list_for_each_entry(page, page_list, lru) {
    		int numpages = hpage_nr_pages(page);
    		lru = page_lru_base_type(page);
    		if (PageActive(page)) {
    			lru += LRU_ACTIVE;
    			ClearPageActive(page);
    			nr_active += numpages;
    		}
    		count[lru] += numpages;
    	}
    
	preempt_disable();
	__count_vm_events(PGDEACTIVATE, nr_active);
    
    	__mod_zone_page_state(zone, NR_ACTIVE_FILE,
    			      -count[LRU_ACTIVE_FILE]);
    	__mod_zone_page_state(zone, NR_INACTIVE_FILE,
    			      -count[LRU_INACTIVE_FILE]);
    	__mod_zone_page_state(zone, NR_ACTIVE_ANON,
    			      -count[LRU_ACTIVE_ANON]);
    	__mod_zone_page_state(zone, NR_INACTIVE_ANON,
    			      -count[LRU_INACTIVE_ANON]);
    
    	*nr_anon = count[LRU_ACTIVE_ANON] + count[LRU_INACTIVE_ANON];
    	*nr_file = count[LRU_ACTIVE_FILE] + count[LRU_INACTIVE_FILE];
    
    
    	__mod_zone_page_state(zone, NR_ISOLATED_ANON, *nr_anon);
    	__mod_zone_page_state(zone, NR_ISOLATED_FILE, *nr_file);
	preempt_enable();
}
    
/*
 * shrink_inactive_list() is a helper for shrink_zone().  It returns the number
 * of reclaimed pages
 */
    
static noinline_for_stack unsigned long
shrink_inactive_list(unsigned long nr_to_scan, struct mem_cgroup_zone *mz,
		     struct scan_control *sc, int priority, int file)
{
    	LIST_HEAD(page_list);
    
    	unsigned long nr_scanned;
    
    	unsigned long nr_reclaimed = 0;
    
    	unsigned long nr_taken;
    	unsigned long nr_anon;
    	unsigned long nr_file;
    
    	unsigned long nr_dirty = 0;
    	unsigned long nr_writeback = 0;
    
    	isolate_mode_t isolate_mode = ISOLATE_INACTIVE;
    
	struct zone *zone = mz->zone;
	struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(mz);
    
    	while (unlikely(too_many_isolated(zone, file, sc))) {
    
    		congestion_wait(BLK_RW_ASYNC, HZ/10);
    
    
    		/* We are about to die and free our memory. Return now. */
    		if (fatal_signal_pending(current))
    			return SWAP_CLUSTER_MAX;
    	}
    
    
	lru_add_drain();

	if (!sc->may_unmap)
		isolate_mode |= ISOLATE_UNMAPPED;
	if (!sc->may_writepage)
		isolate_mode |= ISOLATE_CLEAN;
    
    	spin_lock_irq(&zone->lru_lock);
    
	nr_taken = isolate_lru_pages(nr_to_scan, mz, &page_list, &nr_scanned,
				     sc, isolate_mode, 0, file);

	if (global_reclaim(sc)) {
		zone->pages_scanned += nr_scanned;
    		if (current_is_kswapd())
    			__count_zone_vm_events(PGSCAN_KSWAPD, zone,
    					       nr_scanned);
    		else
    			__count_zone_vm_events(PGSCAN_DIRECT, zone,
    					       nr_scanned);
    	}
    
    	spin_unlock_irq(&zone->lru_lock);
    
    	update_isolated_counts(mz, &page_list, &nr_anon, &nr_file);
    
    
	nr_reclaimed = shrink_page_list(&page_list, mz, sc, priority,
					&nr_dirty, &nr_writeback);

	spin_lock_irq(&zone->lru_lock);
    
    
    	reclaim_stat->recent_scanned[0] += nr_anon;
    	reclaim_stat->recent_scanned[1] += nr_file;
    
    
    	if (global_reclaim(sc)) {
    		if (current_is_kswapd())
    			__count_zone_vm_events(PGSTEAL_KSWAPD, zone,
    					       nr_reclaimed);
    		else
    			__count_zone_vm_events(PGSTEAL_DIRECT, zone,
    					       nr_reclaimed);
    	}
    
    	putback_inactive_pages(mz, &page_list);
    
    	__mod_zone_page_state(zone, NR_ISOLATED_ANON, -nr_anon);
    	__mod_zone_page_state(zone, NR_ISOLATED_FILE, -nr_file);
    
    	spin_unlock_irq(&zone->lru_lock);
    
    	free_hot_cold_page_list(&page_list, 1);
    
    	/*
    	 * If reclaim is isolating dirty pages under writeback, it implies
    	 * that the long-lived page allocation rate is exceeding the page
    	 * laundering rate. Either the global limits are not being effective
    	 * at throttling processes due to the page distribution throughout
    	 * zones or there is heavy usage of a slow backing device. The
    	 * only option is to throttle from reclaim context which is not ideal
    	 * as there is no guarantee the dirtying process is throttled in the
    	 * same way balance_dirty_pages() manages.
    	 *
    	 * This scales the number of dirty pages that must be under writeback
    	 * before throttling depending on priority. It is a simple backoff
    	 * function that has the most effect in the range DEF_PRIORITY to
	 * DEF_PRIORITY-2, the range in which reclaim is considered to be
	 * in trouble.
    	 *
    	 * DEF_PRIORITY   100% isolated pages must be PageWriteback to throttle
    	 * DEF_PRIORITY-1  50% must be PageWriteback
    	 * DEF_PRIORITY-2  25% must be PageWriteback, kswapd in trouble
    	 * ...
    	 * DEF_PRIORITY-6 For SWAP_CLUSTER_MAX isolated pages, throttle if any
    	 *                     isolated page is PageWriteback
    	 */
    	if (nr_writeback && nr_writeback >= (nr_taken >> (DEF_PRIORITY-priority)))
    		wait_iff_congested(zone, BLK_RW_ASYNC, HZ/10);
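
	/*
	 * Worked example of the backoff above (added to this annotated
	 * excerpt, not part of the original file): with DEF_PRIORITY = 12
	 * and nr_taken = 32 (SWAP_CLUSTER_MAX), a scan at priority 10
	 * (DEF_PRIORITY-2) shifts nr_taken right by 2, so 8 or more
	 * PageWriteback pages (25%) are enough to trigger
	 * wait_iff_congested().
	 */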
    
    
    	trace_mm_vmscan_lru_shrink_inactive(zone->zone_pgdat->node_id,
    		zone_idx(zone),
    		nr_scanned, nr_reclaimed,
		priority,
		trace_shrink_flags(file));
    
    	return nr_reclaimed;
    
    }
    
    /*
     * This moves pages from the active list to the inactive list.
     *
     * We move them the other way if the page is referenced by one or more
     * processes, from rmap.
     *
     * If the pages are mostly unmapped, the processing is fast and it is
     * appropriate to hold zone->lru_lock across the whole operation.  But if
     * the pages are mapped, the processing is slow (page_referenced()) so we
     * should drop zone->lru_lock around each page.  It's impossible to balance
     * this, so instead we remove the pages from the LRU while processing them.
     * It is safe to rely on PG_active against the non-LRU pages in here because
     * nobody will play with that bit on a non-LRU page.
     *
     * The downside is that we have to touch page->_count against each page.
     * But we had to alter page->flags anyway.
     */
    
    static void move_active_pages_to_lru(struct zone *zone,
				     struct list_head *list,
				     struct list_head *pages_to_free,
				     enum lru_list lru)
    {
    	unsigned long pgmoved = 0;
    	struct page *page;
    
    	while (!list_empty(list)) {
    
    		struct lruvec *lruvec;
    
    
    		page = lru_to_page(list);
    
    		VM_BUG_ON(PageLRU(page));
    		SetPageLRU(page);
    
    
    		lruvec = mem_cgroup_lru_add_list(zone, page, lru);
    		list_move(&page->lru, &lruvec->lists[lru]);
    
    		pgmoved += hpage_nr_pages(page);
    
    		if (put_page_testzero(page)) {
    			__ClearPageLRU(page);
    			__ClearPageActive(page);
    			del_page_from_lru_list(zone, page, lru);
    
    			if (unlikely(PageCompound(page))) {
    				spin_unlock_irq(&zone->lru_lock);
    				(*get_compound_page_dtor(page))(page);
    				spin_lock_irq(&zone->lru_lock);
    			} else
    				list_add(&page->lru, pages_to_free);
    
    		}
    	}
    	__mod_zone_page_state(zone, NR_LRU_BASE + lru, pgmoved);
    	if (!is_active_lru(lru))
    		__count_vm_events(PGDEACTIVATE, pgmoved);
    }
    
static void shrink_active_list(unsigned long nr_to_scan,
			       struct mem_cgroup_zone *mz,
			       struct scan_control *sc,
			       int priority, int file)
    {
    
	unsigned long nr_taken;
	unsigned long nr_scanned;
	unsigned long vm_flags;
	LIST_HEAD(l_hold);	/* The pages which were snipped off */
	LIST_HEAD(l_active);
	LIST_HEAD(l_inactive);
	struct page *page;
	struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(mz);
	unsigned long nr_rotated = 0;
	isolate_mode_t isolate_mode = ISOLATE_ACTIVE;
	struct zone *zone = mz->zone;
    
	lru_add_drain();

	if (!sc->may_unmap)
		isolate_mode |= ISOLATE_UNMAPPED;
	if (!sc->may_writepage)
		isolate_mode |= ISOLATE_CLEAN;
    
    	spin_lock_irq(&zone->lru_lock);
    
	nr_taken = isolate_lru_pages(nr_to_scan, mz, &l_hold, &nr_scanned, sc,
				     isolate_mode, 1, file);
	if (global_reclaim(sc))
		zone->pages_scanned += nr_scanned;
    
    	reclaim_stat->recent_scanned[file] += nr_taken;
    
	__count_zone_vm_events(PGREFILL, zone, nr_scanned);
	if (file)
		__mod_zone_page_state(zone, NR_ACTIVE_FILE, -nr_taken);
	else
		__mod_zone_page_state(zone, NR_ACTIVE_ANON, -nr_taken);
	__mod_zone_page_state(zone, NR_ISOLATED_ANON + file, nr_taken);
    
    	spin_unlock_irq(&zone->lru_lock);
    
    	while (!list_empty(&l_hold)) {
    		cond_resched();
    		page = lru_to_page(&l_hold);
    		list_del(&page->lru);
    
    		if (unlikely(!page_evictable(page, NULL))) {
    			putback_lru_page(page);
    			continue;
    		}
    
    
    		if (unlikely(buffer_heads_over_limit)) {
    			if (page_has_private(page) && trylock_page(page)) {
    				if (page_has_private(page))
    					try_to_release_page(page, 0);
    				unlock_page(page);
    			}
    		}
    
    
    		if (page_referenced(page, 0, sc->target_mem_cgroup,
    				    &vm_flags)) {
    
    			nr_rotated += hpage_nr_pages(page);
    
    			/*
    			 * Identify referenced, file-backed active pages and
    			 * give them one more trip around the active list. So
    			 * that executable code get better chances to stay in
    			 * memory under moderate memory pressure.  Anon pages
    			 * are not likely to be evicted by use-once streaming
    			 * IO, plus JVM can create lots of anon VM_EXEC pages,
    			 * so we ignore them here.
    			 */
    
    			if ((vm_flags & VM_EXEC) && page_is_file_cache(page)) {
    
    				list_add(&page->lru, &l_active);
    				continue;
    			}
    		}
    
    		ClearPageActive(page);	/* we are de-activating */
    
    		list_add(&page->lru, &l_inactive);
    	}
    
    
	/*
	 * Move pages back to the lru list.
	 */
	spin_lock_irq(&zone->lru_lock);
	/*
	 * Count referenced pages from currently used mappings as rotated,
	 * even though only some of them are actually re-activated.  This
	 * helps balance scan pressure between file and anonymous pages in
	 * get_scan_ratio.
	 */
	reclaim_stat->recent_rotated[file] += nr_rotated;

	move_active_pages_to_lru(zone, &l_active, &l_hold,
						LRU_ACTIVE + file * LRU_FILE);
	move_active_pages_to_lru(zone, &l_inactive, &l_hold,
						LRU_BASE   + file * LRU_FILE);
	__mod_zone_page_state(zone, NR_ISOLATED_ANON + file, -nr_taken);
	spin_unlock_irq(&zone->lru_lock);

	free_hot_cold_page_list(&l_hold, 1);
}
    
#ifdef CONFIG_SWAP
static int inactive_anon_is_low_global(struct zone *zone)
    
    {
    	unsigned long active, inactive;
    
    	active = zone_page_state(zone, NR_ACTIVE_ANON);
    	inactive = zone_page_state(zone, NR_INACTIVE_ANON);
    
    	if (inactive * zone->inactive_ratio < active)
    		return 1;
    
    	return 0;
    }
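
/*
 * Illustrative numbers (added to this annotated excerpt, not part of the
 * original file): with zone->inactive_ratio == 3, a zone holding 300000
 * active and 90000 inactive anon pages returns 1 (90000 * 3 = 270000 <
 * 300000), so shrink_active_list() will be asked to deactivate anon pages.
 */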
    
    
    /**
     * inactive_anon_is_low - check if anonymous pages need to be deactivated
 * @mz:   mem_cgroup_zone to check
     *
     * Returns true if the zone does not have enough inactive anon pages,
     * meaning some active anon pages need to be deactivated.
     */
    
static int inactive_anon_is_low(struct mem_cgroup_zone *mz)
{
    	/*
    	 * If we don't have swap space, anonymous page deactivation
    	 * is pointless.
    	 */
    	if (!total_swap_pages)
    		return 0;
    
    
    	if (!scanning_global_lru(mz))
    		return mem_cgroup_inactive_anon_is_low(mz->mem_cgroup,
    						       mz->zone);
    
	return inactive_anon_is_low_global(mz->zone);
}
#else
static inline int inactive_anon_is_low(struct mem_cgroup_zone *mz)
{
	return 0;
}
#endif

    static int inactive_file_is_low_global(struct zone *zone)
    {
    	unsigned long active, inactive;
    
    	active = zone_page_state(zone, NR_ACTIVE_FILE);
    	inactive = zone_page_state(zone, NR_INACTIVE_FILE);
    
    	return (active > inactive);
    }
    
    /**
 * inactive_file_is_low - check if file pages need to be deactivated
 * @mz: memory cgroup and zone to check
 *
     * When the system is doing streaming IO, memory pressure here
     * ensures that active file pages get deactivated, until more
     * than half of the file pages are on the inactive list.
     *
     * Once we get to that situation, protect the system's working
     * set from being evicted by disabling active file page aging.
     *
     * This uses a different ratio than the anonymous pages, because
     * the page cache uses a use-once replacement algorithm.
     */
    
static int inactive_file_is_low(struct mem_cgroup_zone *mz)
{
    	if (!scanning_global_lru(mz))
    		return mem_cgroup_inactive_file_is_low(mz->mem_cgroup,
    						       mz->zone);
    
	return inactive_file_is_low_global(mz->zone);
}

static int inactive_list_is_low(struct mem_cgroup_zone *mz, int file)
{
	if (file)
		return inactive_file_is_low(mz);
	else
		return inactive_anon_is_low(mz);
}
    
static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan,
				 struct mem_cgroup_zone *mz,
				 struct scan_control *sc, int priority)
{
	int file = is_file_lru(lru);

	if (is_active_lru(lru)) {
		if (inactive_list_is_low(mz, file))
			shrink_active_list(nr_to_scan, mz, sc, priority, file);
		return 0;
	}

	return shrink_inactive_list(nr_to_scan, mz, sc, priority, file);
}
    
static int vmscan_swappiness(struct mem_cgroup_zone *mz,
			     struct scan_control *sc)
{
	if (global_reclaim(sc))
		return vm_swappiness;
	return mem_cgroup_swappiness(mz->mem_cgroup);
}

    /*
     * Determine how aggressively the anon and file LRU lists should be
     * scanned.  The relative value of each set of LRU lists is determined
     * by looking at the fraction of the pages scanned we did rotate back
     * onto the active list instead of evict.
     *
    
 * nr[0] = anon pages to scan; nr[1] = file pages to scan
 */
static void get_scan_count(struct mem_cgroup_zone *mz, struct scan_control *sc,
    			   unsigned long *nr, int priority)
    
    {
    	unsigned long anon, file, free;
    	unsigned long anon_prio, file_prio;
    	unsigned long ap, fp;
    
    	struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(mz);
    
	u64 fraction[2], denominator;
	enum lru_list lru;
	int noswap = 0;
	bool force_scan = false;
    
    	/*
    	 * If the zone or memcg is small, nr[l] can be 0.  This
    	 * results in no scanning on this priority and a potential
    	 * priority drop.  Global direct reclaim can go to the next
    	 * zone and tends to have no problems. Global kswapd is for
    	 * zone balancing and it needs to scan a minimum amount. When
    	 * reclaiming for a memcg, a priority drop can cause high
    	 * latencies, so it's better to scan a minimum amount there as
    	 * well.
    	 */
    
	if (current_is_kswapd() && mz->zone->all_unreclaimable)
		force_scan = true;
	if (!global_reclaim(sc))
		force_scan = true;

    	/* If we have no swap space, do not bother scanning anon pages. */
    	if (!sc->may_swap || (nr_swap_pages <= 0)) {
    		noswap = 1;
    		fraction[0] = 0;
    		fraction[1] = 1;
    		denominator = 1;
    		goto out;
    	}
    
    	anon  = zone_nr_lru_pages(mz, LRU_ACTIVE_ANON) +
    		zone_nr_lru_pages(mz, LRU_INACTIVE_ANON);
    	file  = zone_nr_lru_pages(mz, LRU_ACTIVE_FILE) +
    		zone_nr_lru_pages(mz, LRU_INACTIVE_FILE);
    
	if (global_reclaim(sc)) {
		free  = zone_page_state(mz->zone, NR_FREE_PAGES);
		/* If we have very few page cache pages,
		   force-scan anon pages. */
		if (unlikely(file + free <= high_wmark_pages(mz->zone))) {
			fraction[0] = 1;
			fraction[1] = 0;
			denominator = 1;
			goto out;
		}
	}

    	/*
    	 * With swappiness at 100, anonymous and file have the same priority.
    	 * This scanning priority is essentially the inverse of IO cost.
    	 */
    
    	anon_prio = vmscan_swappiness(mz, sc);
    	file_prio = 200 - vmscan_swappiness(mz, sc);
    
    	/*
    	 * OK, so we have swap space and a fair amount of page cache
    	 * pages.  We use the recently rotated / recently scanned
    	 * ratios to determine how valuable each cache is.
    	 *
    	 * Because workloads change over time (and to avoid overflow)
    	 * we keep these statistics as a floating average, which ends
    	 * up weighing recent references more than old ones.
    	 *
    	 * anon in [0], file in [1]
    	 */
    
	spin_lock_irq(&mz->zone->lru_lock);
	if (unlikely(reclaim_stat->recent_scanned[0] > anon / 4)) {
		reclaim_stat->recent_scanned[0] /= 2;
		reclaim_stat->recent_rotated[0] /= 2;
	}

	if (unlikely(reclaim_stat->recent_scanned[1] > file / 4)) {
		reclaim_stat->recent_scanned[1] /= 2;
		reclaim_stat->recent_rotated[1] /= 2;
	}

	/*
	 * The amount of pressure on anon vs file pages is inversely
	 * proportional to the fraction of recently scanned pages on
	 * each list that were recently referenced and in active use.
	 */
	ap = (anon_prio + 1) * (reclaim_stat->recent_scanned[0] + 1);
	ap /= reclaim_stat->recent_rotated[0] + 1;

	fp = (file_prio + 1) * (reclaim_stat->recent_scanned[1] + 1);
	fp /= reclaim_stat->recent_rotated[1] + 1;
	spin_unlock_irq(&mz->zone->lru_lock);

    	fraction[0] = ap;
    	fraction[1] = fp;
    	denominator = ap + fp + 1;
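
	/*
	 * Worked example (added to this annotated excerpt, not part of the
	 * original file): with the default swappiness of 60, anon_prio = 60
	 * and file_prio = 140.  If recent_scanned/recent_rotated are
	 * 1000/500 for anon and 1000/100 for file, then
	 * ap = 61 * 1001 / 501 = 121 and fp = 141 * 1001 / 101 = 1397,
	 * so roughly 92% of the scanning pressure goes to the file LRUs.
	 */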
    out:
    
    	for_each_evictable_lru(lru) {
    		int file = is_file_lru(lru);
    
    		unsigned long scan;
    
    		scan = zone_nr_lru_pages(mz, lru);
    
    		if (priority || noswap) {
    			scan >>= priority;
    
    			if (!scan && force_scan)
    				scan = SWAP_CLUSTER_MAX;
    
    			scan = div64_u64(scan * fraction[file], denominator);
    		}
    
		nr[lru] = scan;
	}
}

    /* Use reclaim/compaction for costly allocs or under memory pressure */
    static bool in_reclaim_compaction(int priority, struct scan_control *sc)
    {
    	if (COMPACTION_BUILD && sc->order &&
    			(sc->order > PAGE_ALLOC_COSTLY_ORDER ||
    			 priority < DEF_PRIORITY - 2))
    		return true;
    
    	return false;
    }
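
/*
 * Illustrative example (added to this annotated excerpt, not part of the
 * original file): PAGE_ALLOC_COSTLY_ORDER is 3, so an order-4 request uses
 * reclaim/compaction at any priority, while an order-2 request only does so
 * once priority has dropped below DEF_PRIORITY - 2.
 */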
    
    
/*
 * Reclaim/compaction is used for high-order allocation requests. It reclaims
 * order-0 pages before compacting the zone. should_continue_reclaim() returns
 * true if more pages should be reclaimed so that when the page allocator
 * calls try_to_compact_zone() it will have enough free pages to succeed.
 * It will give up earlier than that if there is difficulty reclaiming pages.
 */
static inline bool should_continue_reclaim(struct mem_cgroup_zone *mz,
					unsigned long nr_reclaimed,
					unsigned long nr_scanned,
					int priority,
					struct scan_control *sc)
    {
    	unsigned long pages_for_compaction;
    	unsigned long inactive_lru_pages;
    
    	/* If not in reclaim/compaction mode, stop */
    
	if (!in_reclaim_compaction(priority, sc))
		return false;

    	/* Consider stopping depending on scan and reclaim activity */
    	if (sc->gfp_mask & __GFP_REPEAT) {
    		/*
    		 * For __GFP_REPEAT allocations, stop reclaiming if the
    		 * full LRU list has been scanned and we are still failing
    		 * to reclaim pages. This full LRU scan is potentially
    		 * expensive but a __GFP_REPEAT caller really wants to succeed
    		 */
    		if (!nr_reclaimed && !nr_scanned)
    			return false;
    	} else {
    		/*
    		 * For non-__GFP_REPEAT allocations which can presumably
    		 * fail without consequence, stop if we failed to reclaim
    		 * any pages from the last SWAP_CLUSTER_MAX number of
    		 * pages that were scanned. This will return to the
		 * caller faster at the risk that reclaim/compaction and
		 * the resulting allocation attempt both fail.
    		 */
    		if (!nr_reclaimed)
    			return false;
    	}
    
    
    	/*
    	 * If we have not reclaimed enough pages for compaction and the
    	 * inactive lists are large enough, continue reclaiming
    	 */
    	pages_for_compaction = (2UL << sc->order);
    
	inactive_lru_pages = zone_nr_lru_pages(mz, LRU_INACTIVE_FILE);
	if (nr_swap_pages > 0)
		inactive_lru_pages += zone_nr_lru_pages(mz, LRU_INACTIVE_ANON);
    
    	if (sc->nr_reclaimed < pages_for_compaction &&
    			inactive_lru_pages > pages_for_compaction)
    		return true;
    
    	/* If compaction would go ahead or the allocation would succeed, stop */
    
    	switch (compaction_suitable(mz->zone, sc->order)) {
    
    	case COMPACT_PARTIAL:
    	case COMPACT_CONTINUE:
    		return false;
    	default:
    		return true;
    	}
    }
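
/*
 * Illustrative numbers (added to this annotated excerpt, not part of the
 * original file): for an order-9 (THP-sized) allocation,
 * pages_for_compaction is 2UL << 9 = 1024 pages, so reclaim keeps going at
 * least until about 4MB (with 4KB pages) has been reclaimed or the inactive
 * lists shrink below that; beyond that point it only continues if
 * compaction_suitable() still says the zone is not ready.
 */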
    
    
    /*
     * This is a basic per-zone page freer.  Used by both kswapd and direct reclaim.
     */
    
static void shrink_mem_cgroup_zone(int priority, struct mem_cgroup_zone *mz,
				   struct scan_control *sc)
{
    
	unsigned long nr[NR_LRU_LISTS];
	unsigned long nr_to_scan;
	enum lru_list lru;
	unsigned long nr_reclaimed, nr_scanned;
	unsigned long nr_to_reclaim = sc->nr_to_reclaim;
	struct blk_plug plug;

restart:
	nr_reclaimed = 0;
	nr_scanned = sc->nr_scanned;
	get_scan_count(mz, sc, nr, priority);
    
    	blk_start_plug(&plug);
    
    	while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
    					nr[LRU_INACTIVE_FILE]) {
    
    		for_each_evictable_lru(lru) {
    			if (nr[lru]) {
    
				nr_to_scan = min_t(unsigned long,
						   nr[lru], SWAP_CLUSTER_MAX);
				nr[lru] -= nr_to_scan;
    
				nr_reclaimed += shrink_list(lru, nr_to_scan,
							    mz, sc, priority);
			}
		}
    
    		/*
    		 * On large memory systems, scan >> priority can become
    		 * really large. This is fine for the starting priority;
    		 * we want to put equal scanning pressure on each zone.
    		 * However, if the VM has a harder time of freeing pages,
    		 * with multiple processes reclaiming pages, the total
    		 * freeing target can get unreasonably large.
    		 */
    
		if (nr_reclaimed >= nr_to_reclaim && priority < DEF_PRIORITY)
			break;
    	}
    
    	blk_finish_plug(&plug);
    
    	sc->nr_reclaimed += nr_reclaimed;
    
    	/*
    	 * Even if we did not try to evict anon pages at all, we want to
    	 * rebalance the anon lru active/inactive ratio.
    	 */
    
    	if (inactive_anon_is_low(mz))
    		shrink_active_list(SWAP_CLUSTER_MAX, mz, sc, priority, 0);
    
    	/* reclaim/compaction might need reclaim to continue */
    
	if (should_continue_reclaim(mz, nr_reclaimed,
					sc->nr_scanned - nr_scanned,
					priority, sc))
		goto restart;
}

    static void shrink_zone(int priority, struct zone *zone,
    			struct scan_control *sc)
    {
    
    	struct mem_cgroup *root = sc->target_mem_cgroup;
	struct mem_cgroup_reclaim_cookie reclaim = {
		.zone = zone,
		.priority = priority,
	};
	struct mem_cgroup *memcg;
    
    	memcg = mem_cgroup_iter(root, NULL, &reclaim);
    	do {
    		struct mem_cgroup_zone mz = {
    			.mem_cgroup = memcg,
    			.zone = zone,
    		};
    
    		shrink_mem_cgroup_zone(priority, &mz, sc);
    		/*
    		 * Limit reclaim has historically picked one memcg and
    		 * scanned it with decreasing priority levels until
    		 * nr_to_reclaim had been reclaimed.  This priority
    		 * cycle is thus over after a single memcg.
    
    		 *
    		 * Direct reclaim and kswapd, on the other hand, have
    		 * to scan all memory cgroups to fulfill the overall
    		 * scan target for the zone.
    
    		 */
    		if (!global_reclaim(sc)) {
    			mem_cgroup_iter_break(root, memcg);
    			break;
    		}
    		memcg = mem_cgroup_iter(root, memcg, &reclaim);
	} while (memcg);
}
    
    /* Returns true if compaction should go ahead for a high-order request */
    static inline bool compaction_ready(struct zone *zone, struct scan_control *sc)
    {
    	unsigned long balance_gap, watermark;
    	bool watermark_ok;
    
    	/* Do not consider compaction for orders reclaim is meant to satisfy */
    	if (sc->order <= PAGE_ALLOC_COSTLY_ORDER)
    		return false;
    
    	/*
    	 * Compaction takes time to run and there are potentially other
    	 * callers using the pages just freed. Continue reclaiming until
    	 * there is a buffer of free pages available to give compaction
    	 * a reasonable chance of completing and allocating the page
    	 */
    	balance_gap = min(low_wmark_pages(zone),
    		(zone->present_pages + KSWAPD_ZONE_BALANCE_GAP_RATIO-1) /
    			KSWAPD_ZONE_BALANCE_GAP_RATIO);
    	watermark = high_wmark_pages(zone) + balance_gap + (2UL << sc->order);
    	watermark_ok = zone_watermark_ok_safe(zone, 0, watermark, 0, 0);
    
    	/*
    	 * If compaction is deferred, reclaim up to a point where
    	 * compaction will have a chance of success when re-enabled
    	 */
    
	if (compaction_deferred(zone, sc->order))
		return watermark_ok;
    
    	/* If compaction is not ready to start, keep reclaiming */
    	if (!compaction_suitable(zone, sc->order))
    		return false;
    
    	return watermark_ok;
    }
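
/*
 * Illustrative numbers (added to this annotated excerpt, not part of the
 * original file): for an order-5 request on a zone with a 10240-page high
 * watermark and a 2048-page balance gap, reclaim is asked to continue until
 * roughly 10240 + 2048 + (2UL << 5) = 12352 free pages are available before
 * compaction is considered ready.
 */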
    
    
    /*
     * This is the direct reclaim path, for page-allocating processes.  We only