    /*
     * Attempt to remove the specified page from its LRU.  Only take this page
     * if it is of the appropriate PageActive status.  Pages which are being
     * freed elsewhere are also ignored.
     *
     * page:	page to consider
     * mode:	one of the LRU isolation modes defined above
     *
     * returns 0 on success, -ve errno on failure.
     */
    
    int __isolate_lru_page(struct page *page, isolate_mode_t mode)
    {
    	int ret = -EINVAL;
    
    	/* Only take pages on the LRU. */
    	if (!PageLRU(page))
    		return ret;

    	/* Compaction should not handle unevictable pages but CMA can do so */
    	if (PageUnevictable(page) && !(mode & ISOLATE_UNEVICTABLE))
    		return ret;

    	ret = -EBUSY;
    
    	/*
    	 * To minimise LRU disruption, the caller can indicate that it only
    	 * wants to isolate pages it will be able to operate on without
    	 * blocking - clean pages for the most part.
    	 *
    	 * ISOLATE_CLEAN means that only clean pages should be isolated. This
    	 * is used by reclaim when it cannot write to backing storage.
    	 *
    	 * ISOLATE_ASYNC_MIGRATE is used to indicate that it only wants pages
    	 * that can be migrated without blocking.
    	 */
    	if (mode & (ISOLATE_CLEAN|ISOLATE_ASYNC_MIGRATE)) {
    		/* All the caller can do on PageWriteback is block */
    		if (PageWriteback(page))
    			return ret;
    
    		if (PageDirty(page)) {
    			struct address_space *mapping;
    
    			/* ISOLATE_CLEAN means only clean pages */
    			if (mode & ISOLATE_CLEAN)
    				return ret;
    
    			/*
    			 * Only pages without mappings or that have a
    			 * ->migratepage callback are possible to migrate
    			 * without blocking
    			 */
    			mapping = page_mapping(page);
    			if (mapping && !mapping->a_ops->migratepage)
    				return ret;
    		}
    	}
    
    	if ((mode & ISOLATE_UNMAPPED) && page_mapped(page))
    		return ret;

    	if (likely(get_page_unless_zero(page))) {
    		/*
    		 * Be careful not to clear PageLRU until after we're
    		 * sure the page is not being freed elsewhere -- the
    		 * page release code relies on it.
    		 */
    		ClearPageLRU(page);
    		ret = 0;
    	}
    
    	return ret;
    }

    /*
     * zone->lru_lock is heavily contended.  Some of the functions that
     * shrink the lists perform better by taking out a batch of pages
     * and working on them outside the LRU lock.
     *
     * For pagecache intensive workloads, this function is the hottest
     * spot in the kernel (apart from copy_*_user functions).
     *
     * Appropriate locks must be held before calling this function.
     *
     * @nr_to_scan:	The number of pages to look through on the list.
     * @lruvec:	The LRU vector to pull pages from.
     * @dst:	The temp list to put pages on to.
     * @nr_scanned:	The number of pages that were scanned.
     * @sc:		The scan_control struct for this reclaim session
     * @mode:	One of the LRU isolation modes
     * @lru:	LRU list id for isolating
     *
     * returns how many pages were moved onto *@dst.
     */
    
    static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
    		struct lruvec *lruvec, struct list_head *dst,
    		unsigned long *nr_scanned, struct scan_control *sc,
    		isolate_mode_t mode, enum lru_list lru)
    {
    	struct list_head *src = &lruvec->lists[lru];
    	unsigned long nr_taken = 0;
    	unsigned long scan;

    	for (scan = 0; scan < nr_to_scan && !list_empty(src); scan++) {
    		struct page *page;
    		int nr_pages;

    		page = lru_to_page(src);
    		prefetchw_prev_lru_page(page, src, flags);

    		VM_BUG_ON(!PageLRU(page));

    		switch (__isolate_lru_page(page, mode)) {
    		case 0:
    
    			nr_pages = hpage_nr_pages(page);
    			mem_cgroup_update_lru_size(lruvec, lru, -nr_pages);
    			list_move(&page->lru, dst);
    			nr_taken += nr_pages;
    			break;

    		case -EBUSY:
    			/* else it is being freed elsewhere */
    			list_move(&page->lru, src);
    			continue;

    		default:
    			BUG();
    		}
    	}

    	*nr_scanned = scan;
    
    	trace_mm_vmscan_lru_isolate(sc->order, nr_to_scan, scan,
    				    nr_taken, mode, is_file_lru(lru));
    
    	return nr_taken;
    }
    
    
    /**
     * isolate_lru_page - tries to isolate a page from its LRU list
     * @page: page to isolate from its LRU list
     *
     * Isolates a @page from an LRU list, clears PageLRU and adjusts the
     * vmstat statistic corresponding to whatever LRU list the page was on.
     *
     * Returns 0 if the page was removed from an LRU list.
     * Returns -EBUSY if the page was not on an LRU list.
     *
     * The returned page will have PageLRU() cleared.  If it was found on
     * the active list, it will have PageActive set.  If it was found on
     * the unevictable list, it will have the PageUnevictable bit set. That flag
     * may need to be cleared by the caller before letting the page go.
     *
     * The vmstat statistic corresponding to the list on which the page was
     * found will be decremented.
     *
     * Restrictions:
     * (1) Must be called with an elevated refcount on the page. This is a
     *     fundamental difference from isolate_lru_pages (which is called
     *     without a stable reference).
     * (2) the lru_lock must not be held.
     * (3) interrupts must be enabled.
     */
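    /*
     * Illustrative usage sketch (editorial example, not from this file): a
     * caller that already holds a reference can pull a page off its LRU,
     * work on it, and hand it back:
     *
     *	get_page(page);
     *	if (!isolate_lru_page(page)) {
     *		... operate on the isolated page ...
     *		putback_lru_page(page);
     *	}
     *	put_page(page);
     *
     * putback_lru_page() drops the reference taken by the isolation itself.
     */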
    int isolate_lru_page(struct page *page)
    {
    	int ret = -EBUSY;

    	if (PageLRU(page)) {
    		struct zone *zone = page_zone(page);
    		struct lruvec *lruvec;

    		spin_lock_irq(&zone->lru_lock);
    		lruvec = mem_cgroup_page_lruvec(page, zone);
    		if (PageLRU(page)) {
    			int lru = page_lru(page);
    			get_page(page);
    			ClearPageLRU(page);
    			del_page_from_lru_list(page, lruvec, lru);
    			ret = 0;
    		}
    		spin_unlock_irq(&zone->lru_lock);
    	}
    	return ret;
    }
    
    
    /*
     * A direct reclaimer may isolate SWAP_CLUSTER_MAX pages from the LRU list
     * and then get rescheduled. When there are a massive number of tasks doing
     * page allocation, such sleeping direct reclaimers may keep piling up on
     * each CPU, the LRU list will go small and be scanned faster than
     * necessary, leading to unnecessary swapping, thrashing and OOM.
     */
    static int too_many_isolated(struct zone *zone, int file,
    		struct scan_control *sc)
    {
    	unsigned long inactive, isolated;
    
    	if (current_is_kswapd())
    		return 0;

    	if (!global_reclaim(sc))
    		return 0;

    	if (file) {
    		inactive = zone_page_state(zone, NR_INACTIVE_FILE);
    		isolated = zone_page_state(zone, NR_ISOLATED_FILE);
    	} else {
    		inactive = zone_page_state(zone, NR_INACTIVE_ANON);
    		isolated = zone_page_state(zone, NR_ISOLATED_ANON);
    	}
    
    
    	/*
    	 * GFP_NOIO/GFP_NOFS callers are allowed to isolate more pages, so they
    	 * won't get blocked by normal direct-reclaimers, forming a circular
    	 * deadlock.
    	 */
    	if ((sc->gfp_mask & GFP_IOFS) == GFP_IOFS)
    		inactive >>= 3;

    	return isolated > inactive;
    }

    static noinline_for_stack void
    putback_inactive_pages(struct lruvec *lruvec, struct list_head *page_list)
    {
    	struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
    	struct zone *zone = lruvec_zone(lruvec);
    
    	LIST_HEAD(pages_to_free);
    
    
    	/*
    	 * Put back any unfreeable pages.
    	 */
    	while (!list_empty(page_list)) {
    
    		struct page *page = lru_to_page(page_list);
    		int lru;

    		VM_BUG_ON(PageLRU(page));
    		list_del(&page->lru);
    
    		if (unlikely(!page_evictable(page))) {
    
    			spin_unlock_irq(&zone->lru_lock);
    			putback_lru_page(page);
    			spin_lock_irq(&zone->lru_lock);
    			continue;
    		}
    
    
    		lruvec = mem_cgroup_page_lruvec(page, zone);

    		SetPageLRU(page);
    		lru = page_lru(page);
    		add_page_to_lru_list(page, lruvec, lru);

    		if (is_active_lru(lru)) {
    			int file = is_file_lru(lru);
    			int numpages = hpage_nr_pages(page);
    			reclaim_stat->recent_rotated[file] += numpages;
    		}

    		if (put_page_testzero(page)) {
    			__ClearPageLRU(page);
    			__ClearPageActive(page);
    
    			del_page_from_lru_list(page, lruvec, lru);
    
    
    			if (unlikely(PageCompound(page))) {
    				spin_unlock_irq(&zone->lru_lock);
    				(*get_compound_page_dtor(page))(page);
    				spin_lock_irq(&zone->lru_lock);
    			} else
    				list_add(&page->lru, &pages_to_free);
    		}
    	}

    	/*
    	 * To save our caller's stack, now use input list for pages to free.
    	 */
    	list_splice(&pages_to_free, page_list);
    }

    /*
     * shrink_inactive_list() is a helper for shrink_zone().  It returns the number
     * of reclaimed pages
     */
    
    static noinline_for_stack unsigned long
    shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
    		     struct scan_control *sc, enum lru_list lru)
    {
    	LIST_HEAD(page_list);
    
    	unsigned long nr_scanned;
    
    	unsigned long nr_reclaimed = 0;
    
    	unsigned long nr_taken;
    
    	unsigned long nr_dirty = 0;
    	unsigned long nr_writeback = 0;
    
    	isolate_mode_t isolate_mode = 0;
    
    	int file = is_file_lru(lru);
    
    	struct zone *zone = lruvec_zone(lruvec);
    	struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
    
    	while (unlikely(too_many_isolated(zone, file, sc))) {
    
    		congestion_wait(BLK_RW_ASYNC, HZ/10);
    
    
    		/* We are about to die and free our memory. Return now. */
    		if (fatal_signal_pending(current))
    			return SWAP_CLUSTER_MAX;
    	}

    	lru_add_drain();

    	if (!sc->may_unmap)
    		isolate_mode |= ISOLATE_UNMAPPED;
    	if (!sc->may_writepage)
    		isolate_mode |= ISOLATE_CLEAN;

    	spin_lock_irq(&zone->lru_lock);
    
    	nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &page_list,
    				     &nr_scanned, sc, isolate_mode, lru);
    
    
    	__mod_zone_page_state(zone, NR_LRU_BASE + lru, -nr_taken);
    	__mod_zone_page_state(zone, NR_ISOLATED_ANON + file, nr_taken);
    
    
    	if (global_reclaim(sc)) {
    		zone->pages_scanned += nr_scanned;
    		if (current_is_kswapd())
    			__count_zone_vm_events(PGSCAN_KSWAPD, zone, nr_scanned);
    		else
    			__count_zone_vm_events(PGSCAN_DIRECT, zone, nr_scanned);
    	}
    	spin_unlock_irq(&zone->lru_lock);
    
    	nr_reclaimed = shrink_page_list(&page_list, zone, sc, TTU_UNMAP,
    					&nr_dirty, &nr_writeback, false);
    
    	spin_lock_irq(&zone->lru_lock);
    
    
    	reclaim_stat->recent_scanned[file] += nr_taken;
    
    	if (global_reclaim(sc)) {
    		if (current_is_kswapd())
    			__count_zone_vm_events(PGSTEAL_KSWAPD, zone,
    					       nr_reclaimed);
    		else
    			__count_zone_vm_events(PGSTEAL_DIRECT, zone,
    					       nr_reclaimed);
    	}
    
    	putback_inactive_pages(lruvec, &page_list);
    
    	__mod_zone_page_state(zone, NR_ISOLATED_ANON + file, -nr_taken);
    
    
    	spin_unlock_irq(&zone->lru_lock);
    
    	free_hot_cold_page_list(&page_list, 1);
    
    	/*
    	 * If reclaim is isolating dirty pages under writeback, it implies
    	 * that the long-lived page allocation rate is exceeding the page
    	 * laundering rate. Either the global limits are not being effective
    	 * at throttling processes due to the page distribution throughout
    	 * zones or there is heavy usage of a slow backing device. The
    	 * only option is to throttle from reclaim context which is not ideal
    	 * as there is no guarantee the dirtying process is throttled in the
    	 * same way balance_dirty_pages() manages.
    	 *
    	 * This scales the number of dirty pages that must be under writeback
    	 * before throttling depending on priority. It is a simple backoff
    	 * function that has the most effect in the range DEF_PRIORITY to
    	 * DEF_PRIORITY-2, which is the priority range at which reclaim
    	 * is considered to be in trouble.
    	 *
    	 * DEF_PRIORITY   100% isolated pages must be PageWriteback to throttle
    	 * DEF_PRIORITY-1  50% must be PageWriteback
    	 * DEF_PRIORITY-2  25% must be PageWriteback, kswapd in trouble
    	 * ...
    	 * DEF_PRIORITY-6 For SWAP_CLUSTER_MAX isolated pages, throttle if any
    	 *                     isolated page is PageWriteback
    	 */
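    	/*
    	 * For illustration: at priority DEF_PRIORITY-2 with nr_taken == 32,
    	 * the threshold below is 32 >> 2 == 8, i.e. reclaim throttles once a
    	 * quarter of the isolated pages are under writeback.
    	 */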
    
    	if (nr_writeback && nr_writeback >=
    			(nr_taken >> (DEF_PRIORITY - sc->priority)))
    
    		wait_iff_congested(zone, BLK_RW_ASYNC, HZ/10);
    
    
    	/*
    	 * Similarly, if many dirty pages are encountered that are not
    	 * currently being written then flag that kswapd should start
    	 * writing back pages.
    	 */
    	if (global_reclaim(sc) && nr_dirty &&
    			nr_dirty >= (nr_taken >> (DEF_PRIORITY - sc->priority)))
    		zone_set_flag(zone, ZONE_TAIL_LRU_DIRTY);
    
    
    	trace_mm_vmscan_lru_shrink_inactive(zone->zone_pgdat->node_id,
    		zone_idx(zone),
    		nr_scanned, nr_reclaimed,
    		trace_shrink_flags(file));
    
    	return nr_reclaimed;
    
    }
    
    /*
     * This moves pages from the active list to the inactive list.
     *
     * We move them the other way if the page is referenced by one or more
     * processes, from rmap.
     *
     * If the pages are mostly unmapped, the processing is fast and it is
     * appropriate to hold zone->lru_lock across the whole operation.  But if
     * the pages are mapped, the processing is slow (page_referenced()) so we
     * should drop zone->lru_lock around each page.  It's impossible to balance
     * this, so instead we remove the pages from the LRU while processing them.
     * It is safe to rely on PG_active against the non-LRU pages in here because
     * nobody will play with that bit on a non-LRU page.
     *
     * The downside is that we have to touch page->_count against each page.
     * But we had to alter page->flags anyway.
     */
    
    static void move_active_pages_to_lru(struct lruvec *lruvec,
    				     struct list_head *list,
    				     struct list_head *pages_to_free,
    				     enum lru_list lru)
    {
    	struct zone *zone = lruvec_zone(lruvec);
    	unsigned long pgmoved = 0;
    	struct page *page;
    	int nr_pages;
    
    
    	while (!list_empty(list)) {
    		page = lru_to_page(list);
    
    		lruvec = mem_cgroup_page_lruvec(page, zone);
    
    
    		VM_BUG_ON(PageLRU(page));
    		SetPageLRU(page);
    
    
    		nr_pages = hpage_nr_pages(page);
    		mem_cgroup_update_lru_size(lruvec, lru, nr_pages);
    
    		list_move(&page->lru, &lruvec->lists[lru]);
    
    		pgmoved += nr_pages;
    
    		if (put_page_testzero(page)) {
    			__ClearPageLRU(page);
    			__ClearPageActive(page);
    
    			del_page_from_lru_list(page, lruvec, lru);
    
    
    			if (unlikely(PageCompound(page))) {
    				spin_unlock_irq(&zone->lru_lock);
    				(*get_compound_page_dtor(page))(page);
    				spin_lock_irq(&zone->lru_lock);
    			} else
    				list_add(&page->lru, pages_to_free);
    
    		}
    	}
    	__mod_zone_page_state(zone, NR_LRU_BASE + lru, pgmoved);
    	if (!is_active_lru(lru))
    		__count_vm_events(PGDEACTIVATE, pgmoved);
    }
    
    static void shrink_active_list(unsigned long nr_to_scan,
    			       struct lruvec *lruvec,
    			       struct scan_control *sc,
    			       enum lru_list lru)
    {
    	unsigned long nr_taken;
    	unsigned long nr_scanned;
    	unsigned long vm_flags;
    	LIST_HEAD(l_hold);	/* The pages which were snipped off */
    	LIST_HEAD(l_active);
    	LIST_HEAD(l_inactive);
    	struct page *page;
    	struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
    	unsigned long nr_rotated = 0;
    	isolate_mode_t isolate_mode = 0;
    	int file = is_file_lru(lru);
    	struct zone *zone = lruvec_zone(lruvec);
    
    	lru_add_drain();
    
    	if (!sc->may_unmap)
    		isolate_mode |= ISOLATE_UNMAPPED;
    	if (!sc->may_writepage)
    		isolate_mode |= ISOLATE_CLEAN;

    	spin_lock_irq(&zone->lru_lock);
    
    	nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &l_hold,
    				     &nr_scanned, sc, isolate_mode, lru);
    
    	if (global_reclaim(sc))
    		zone->pages_scanned += nr_scanned;

    	reclaim_stat->recent_scanned[file] += nr_taken;

    	__count_zone_vm_events(PGREFILL, zone, nr_scanned);
    
    	__mod_zone_page_state(zone, NR_LRU_BASE + lru, -nr_taken);
    
    	__mod_zone_page_state(zone, NR_ISOLATED_ANON + file, nr_taken);
    
    	spin_unlock_irq(&zone->lru_lock);
    
    	while (!list_empty(&l_hold)) {
    		cond_resched();
    		page = lru_to_page(&l_hold);
    		list_del(&page->lru);
    
    		if (unlikely(!page_evictable(page))) {
    
    			putback_lru_page(page);
    			continue;
    		}
    
    
    		if (unlikely(buffer_heads_over_limit)) {
    			if (page_has_private(page) && trylock_page(page)) {
    				if (page_has_private(page))
    					try_to_release_page(page, 0);
    				unlock_page(page);
    			}
    		}
    
    
    		if (page_referenced(page, 0, sc->target_mem_cgroup,
    				    &vm_flags)) {
    
    			nr_rotated += hpage_nr_pages(page);
    
    			/*
    			 * Identify referenced, file-backed active pages and
    			 * give them one more trip around the active list, so
    			 * that executable code gets a better chance to stay in
    			 * memory under moderate memory pressure.  Anon pages
    			 * are not likely to be evicted by use-once streaming
    			 * IO, plus JVM can create lots of anon VM_EXEC pages,
    			 * so we ignore them here.
    			 */
    
    			if ((vm_flags & VM_EXEC) && page_is_file_cache(page)) {
    
    				list_add(&page->lru, &l_active);
    				continue;
    			}
    		}
    
    		ClearPageActive(page);	/* we are de-activating */
    
    		list_add(&page->lru, &l_inactive);
    	}
    
    
    	/*
    	 * Move pages back to the lru list.
    	 */
    	spin_lock_irq(&zone->lru_lock);
    	/*
    	 * Count referenced pages from currently used mappings as rotated,
    	 * even though only some of them are actually re-activated.  This
    	 * helps balance scan pressure between file and anonymous pages in
    	 * get_scan_ratio.
    	 */
    	reclaim_stat->recent_rotated[file] += nr_rotated;
    
    	move_active_pages_to_lru(lruvec, &l_active, &l_hold, lru);
    	move_active_pages_to_lru(lruvec, &l_inactive, &l_hold, lru - LRU_ACTIVE);
    
    	__mod_zone_page_state(zone, NR_ISOLATED_ANON + file, -nr_taken);
    
    	spin_unlock_irq(&zone->lru_lock);
    
    
    	free_hot_cold_page_list(&l_hold, 1);
    }

    #ifdef CONFIG_SWAP
    static int inactive_anon_is_low_global(struct zone *zone)
    
    {
    	unsigned long active, inactive;
    
    	active = zone_page_state(zone, NR_ACTIVE_ANON);
    	inactive = zone_page_state(zone, NR_INACTIVE_ANON);
    
    	if (inactive * zone->inactive_ratio < active)
    		return 1;
    
    	return 0;
    }
    
    
    /**
     * inactive_anon_is_low - check if anonymous pages need to be deactivated
    
     *
     * Returns true if the zone does not have enough inactive anon pages,
     * meaning some active anon pages need to be deactivated.
     */
    
    static int inactive_anon_is_low(struct lruvec *lruvec)
    {
    	/*
    	 * If we don't have swap space, anonymous page deactivation
    	 * is pointless.
    	 */
    	if (!total_swap_pages)
    		return 0;
    
    
    	if (!mem_cgroup_disabled())
    
    		return mem_cgroup_inactive_anon_is_low(lruvec);
    
    	return inactive_anon_is_low_global(lruvec_zone(lruvec));
    }
    #else
    static inline int inactive_anon_is_low(struct lruvec *lruvec)
    {
    	return 0;
    }
    #endif
    
    /**
     * inactive_file_is_low - check if file pages need to be deactivated
    
     *
     * When the system is doing streaming IO, memory pressure here
     * ensures that active file pages get deactivated, until more
     * than half of the file pages are on the inactive list.
     *
     * Once we get to that situation, protect the system's working
     * set from being evicted by disabling active file page aging.
     *
     * This uses a different ratio than the anonymous pages, because
     * the page cache uses a use-once replacement algorithm.
     */
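    /*
     * Editorial illustration: with 400 inactive and 600 active file pages,
     * active > inactive, so inactive_file_is_low() reports the inactive list
     * as too small and active file pages keep being deactivated.
     */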
    
    static int inactive_file_is_low(struct lruvec *lruvec)
    {
    	unsigned long inactive;
    	unsigned long active;

    	inactive = get_lru_size(lruvec, LRU_INACTIVE_FILE);
    	active = get_lru_size(lruvec, LRU_ACTIVE_FILE);

    	return active > inactive;
    }

    static int inactive_list_is_low(struct lruvec *lruvec, enum lru_list lru)
    {
    	if (is_file_lru(lru))
    		return inactive_file_is_low(lruvec);
    	else
    		return inactive_anon_is_low(lruvec);
    }
    
    static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan,
    				 struct lruvec *lruvec, struct scan_control *sc)
    {
    	if (is_active_lru(lru)) {
    		if (inactive_list_is_low(lruvec, lru))
    			shrink_active_list(nr_to_scan, lruvec, sc, lru);
    		return 0;
    	}

    	return shrink_inactive_list(nr_to_scan, lruvec, sc, lru);
    }
    
    static int vmscan_swappiness(struct scan_control *sc)
    {
    	return mem_cgroup_swappiness(sc->target_mem_cgroup);
    }

    enum scan_balance {
    	SCAN_EQUAL,
    	SCAN_FRACT,
    	SCAN_ANON,
    	SCAN_FILE,
    };
    
    
    /*
     * Determine how aggressively the anon and file LRU lists should be
     * scanned.  The relative value of each set of LRU lists is determined
     * by looking at the fraction of the pages scanned we did rotate back
     * onto the active list instead of evict.
     *
     * nr[0] = anon inactive pages to scan; nr[1] = anon active pages to scan
     * nr[2] = file inactive pages to scan; nr[3] = file active pages to scan
     */
    static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
    			   unsigned long *nr)
    {
    	struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
    	u64 fraction[2];
    	u64 denominator = 0;	/* gcc */
    	struct zone *zone = lruvec_zone(lruvec);
    
    	unsigned long anon_prio, file_prio;
    
    	enum scan_balance scan_balance;
    	unsigned long anon, file, free;
    	bool force_scan = false;
    
    	unsigned long ap, fp;
    
    	enum lru_list lru;
    
    	/*
    	 * If the zone or memcg is small, nr[l] can be 0.  This
    	 * results in no scanning on this priority and a potential
    	 * priority drop.  Global direct reclaim can go to the next
    	 * zone and tends to have no problems. Global kswapd is for
    	 * zone balancing and it needs to scan a minimum amount. When
    	 * reclaiming for a memcg, a priority drop can cause high
    	 * latencies, so it's better to scan a minimum amount there as
    	 * well.
    	 */
    
    	if (current_is_kswapd() && zone->all_unreclaimable)
    		force_scan = true;
    	if (!global_reclaim(sc))
    		force_scan = true;

    	/* If we have no swap space, do not bother scanning anon pages. */
    	if (!sc->may_swap || (get_nr_swap_pages() <= 0)) {
    		scan_balance = SCAN_FILE;
    		goto out;
    	}

    	/*
    	 * Global reclaim will swap to prevent OOM even with no
    	 * swappiness, but memcg users want to use this knob to
    	 * disable swapping for individual groups completely when
    	 * using the memory controller's swap limit feature would be
    	 * too expensive.
    	 */
    	if (!global_reclaim(sc) && !vmscan_swappiness(sc)) {
    
    		scan_balance = SCAN_FILE;
    
    		goto out;
    	}
    
    	/*
    	 * Do not apply any pressure balancing cleverness when the
    	 * system is close to OOM, scan both anon and file equally
    	 * (unless the swappiness setting disagrees with swapping).
    	 */
    	if (!sc->priority && vmscan_swappiness(sc)) {
    
    		scan_balance = SCAN_EQUAL;
    		goto out;
    	}

    	anon  = get_lru_size(lruvec, LRU_ACTIVE_ANON) +
    		get_lru_size(lruvec, LRU_INACTIVE_ANON);
    	file  = get_lru_size(lruvec, LRU_ACTIVE_FILE) +
    		get_lru_size(lruvec, LRU_INACTIVE_FILE);
    
    	/*
    	 * If it's foreseeable that reclaiming the file cache won't be
    	 * enough to get the zone back into a desirable shape, we have
    	 * to swap.  Better start now and leave the - probably heavily
    	 * thrashing - remaining file pages alone.
    	 */
    
    	if (global_reclaim(sc)) {
    		free = zone_page_state(zone, NR_FREE_PAGES);
    		if (unlikely(file + free <= high_wmark_pages(zone))) {
    			scan_balance = SCAN_ANON;
    			goto out;
    		}
    	}

    	/*
    	 * There is enough inactive page cache, do not reclaim
    	 * anything from the anonymous working set right now.
    	 */
    	if (!inactive_file_is_low(lruvec)) {
    
    		scan_balance = SCAN_FILE;
    		goto out;
    	}

    	scan_balance = SCAN_FRACT;
    
    
    	/*
    	 * With swappiness at 100, anonymous and file have the same priority.
    	 * This scanning priority is essentially the inverse of IO cost.
    	 */
    
    	anon_prio = vmscan_swappiness(sc);
    
    	file_prio = 200 - anon_prio;
    
    	/*
    	 * OK, so we have swap space and a fair amount of page cache
    	 * pages.  We use the recently rotated / recently scanned
    	 * ratios to determine how valuable each cache is.
    	 *
    	 * Because workloads change over time (and to avoid overflow)
    	 * we keep these statistics as a floating average, which ends
    	 * up weighing recent references more than old ones.
    	 *
    	 * anon in [0], file in [1]
    	 */
    
    	spin_lock_irq(&zone->lru_lock);
    
    	if (unlikely(reclaim_stat->recent_scanned[0] > anon / 4)) {
    		reclaim_stat->recent_scanned[0] /= 2;
    		reclaim_stat->recent_rotated[0] /= 2;
    	}

    	if (unlikely(reclaim_stat->recent_scanned[1] > file / 4)) {
    		reclaim_stat->recent_scanned[1] /= 2;
    		reclaim_stat->recent_rotated[1] /= 2;
    	}

    	/*
    	 * The amount of pressure on anon vs file pages is inversely
    	 * proportional to the fraction of recently scanned pages on
    	 * each list that were recently referenced and in active use.
    	 */
    	ap = anon_prio * (reclaim_stat->recent_scanned[0] + 1);
    
    	ap /= reclaim_stat->recent_rotated[0] + 1;
    
    	fp = file_prio * (reclaim_stat->recent_scanned[1] + 1);
    
    	fp /= reclaim_stat->recent_rotated[1] + 1;
    
    	spin_unlock_irq(&zone->lru_lock);
    
    	fraction[0] = ap;
    	fraction[1] = fp;
    	denominator = ap + fp + 1;
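    	/*
    	 * For illustration (hypothetical numbers): with swappiness 60,
    	 * anon_prio == 60 and file_prio == 140.  If recent_scanned/rotated
    	 * are 200/100 for anon and 800/100 for file, then ap ~= 119 and
    	 * fp ~= 1110, so roughly 10% of the scan pressure goes to the anon
    	 * lists and 90% to the file lists.
    	 */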
    out:
    
    	for_each_evictable_lru(lru) {
    		int file = is_file_lru(lru);
    		unsigned long size;
    		unsigned long scan;

    		size = get_lru_size(lruvec, lru);
    		scan = size >> sc->priority;

    		if (!scan && force_scan)
    			scan = min(size, SWAP_CLUSTER_MAX);
    
    
    		switch (scan_balance) {
    		case SCAN_EQUAL:
    			/* Scan lists relative to size */
    			break;
    		case SCAN_FRACT:
    			/*
    			 * Scan types proportional to swappiness and
    			 * their relative recent reclaim efficiency.
    			 */
    			scan = div64_u64(scan * fraction[file], denominator);
    			break;
    		case SCAN_FILE:
    		case SCAN_ANON:
    			/* Scan one type exclusively */
    			if ((scan_balance == SCAN_FILE) != file)
    				scan = 0;
    			break;
    		default:
    			/* Look ma, no brain */
    			BUG();
    		}
    
    		nr[lru] = scan;
    	}
    }

    /*
     * This is a basic per-zone page freer.  Used by both kswapd and direct reclaim.
     */
    static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
    {
    	unsigned long nr[NR_LRU_LISTS];
    
    	unsigned long targets[NR_LRU_LISTS];
    
    	unsigned long nr_to_scan;
    	enum lru_list lru;
    	unsigned long nr_reclaimed = 0;
    	unsigned long nr_to_reclaim = sc->nr_to_reclaim;
    	struct blk_plug plug;
    
    	bool scan_adjusted = false;
    
    	get_scan_count(lruvec, sc, nr);

    	/* Record the original scan target for proportional adjustments later */
    	memcpy(targets, nr, sizeof(nr));
    
    
    	blk_start_plug(&plug);
    	while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
    					nr[LRU_INACTIVE_FILE]) {
    
    		unsigned long nr_anon, nr_file, percentage;
    		unsigned long nr_scanned;
    
    
    		for_each_evictable_lru(lru) {
    			if (nr[lru]) {
    				nr_to_scan = min(nr[lru], SWAP_CLUSTER_MAX);
    				nr[lru] -= nr_to_scan;
    
    				nr_reclaimed += shrink_list(lru, nr_to_scan,
    							    lruvec, sc);
    			}
    		}
    
    
    		if (nr_reclaimed < nr_to_reclaim || scan_adjusted)
    			continue;
    
    
    		/*
    		 * For global direct reclaim, reclaim only the number of pages
    		 * requested. Less care is taken to scan proportionally as it
    		 * is more important to minimise direct reclaim stall latency
    		 * than it is to properly age the LRU lists.
    		 */
    		if (global_reclaim(sc) && !current_is_kswapd())
    			break;

    
    		/*
    		 * For kswapd and memcg, reclaim at least the number of pages
    		 * requested. Ensure that the anon and file LRUs shrink
    		 * proportionally to what was requested by get_scan_count(). We
    		 * stop reclaiming one LRU and reduce the amount scanning
    		 * proportional to the original scan target.
    		 */
    		nr_file = nr[LRU_INACTIVE_FILE] + nr[LRU_ACTIVE_FILE];
    		nr_anon = nr[LRU_INACTIVE_ANON] + nr[LRU_ACTIVE_ANON];
    
    		if (nr_file > nr_anon) {
    			unsigned long scan_target = targets[LRU_INACTIVE_ANON] +
    						targets[LRU_ACTIVE_ANON] + 1;
    			lru = LRU_BASE;
    			percentage = nr_anon * 100 / scan_target;
    		} else {
    			unsigned long scan_target = targets[LRU_INACTIVE_FILE] +
    						targets[LRU_ACTIVE_FILE] + 1;
    			lru = LRU_FILE;
    			percentage = nr_file * 100 / scan_target;
    		}
    
    		/* Stop scanning the smaller of the LRU */
    		nr[lru] = 0;
    		nr[lru + LRU_ACTIVE] = 0;
    
    		/*
    		 * Recalculate the other LRU scan count based on its original
    		 * scan target and the percentage scanning already complete
    		 */
    		lru = (lru == LRU_FILE) ? LRU_BASE : LRU_FILE;
    		nr_scanned = targets[lru] - nr[lru];
    		nr[lru] = targets[lru] * (100 - percentage) / 100;
    		nr[lru] -= min(nr[lru], nr_scanned);
    
    		lru += LRU_ACTIVE;
    		nr_scanned = targets[lru] - nr[lru];
    		nr[lru] = targets[lru] * (100 - percentage) / 100;
    		nr[lru] -= min(nr[lru], nr_scanned);
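    		/*
    		 * For illustration (hypothetical numbers): if the anon lists
    		 * had the smaller scan target and roughly a quarter of that
    		 * target is still unscanned when the reclaim goal is met,
    		 * anon scanning stops and each file scan target is rescaled
    		 * to roughly three quarters of its original value, minus
    		 * whatever has already been scanned from that list.
    		 */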
    
    		scan_adjusted = true;
    
    	}
    	blk_finish_plug(&plug);
    	sc->nr_reclaimed += nr_reclaimed;
    
    	/*
    	 * Even if we did not try to evict anon pages at all, we want to
    	 * rebalance the anon lru active/inactive ratio.
    	 */
    	if (inactive_anon_is_low(lruvec))
    		shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
    				   sc, LRU_ACTIVE_ANON);
    
    	throttle_vm_writeout(sc->gfp_mask);
    }
    
    
    /* Use reclaim/compaction for costly allocs or under memory pressure */
    
    static bool in_reclaim_compaction(struct scan_control *sc)
    {
    	if (IS_ENABLED(CONFIG_COMPACTION) && sc->order &&
    			(sc->order > PAGE_ALLOC_COSTLY_ORDER ||
    			 sc->priority < DEF_PRIORITY - 2))
    		return true;
    
    	return false;
    }
    
    
    /*
     * Reclaim/compaction is used for high-order allocation requests. It reclaims
     * order-0 pages before compacting the zone. should_continue_reclaim() returns
     * true if more pages should be reclaimed such that when the page allocator
     * calls try_to_compact_zone() it will have enough free pages to succeed.
     * It will give up earlier than that if there is difficulty reclaiming pages.
     */
    static inline bool should_continue_reclaim(struct zone *zone,
    
    					unsigned long nr_reclaimed,
    					unsigned long nr_scanned,
    					struct scan_control *sc)
    {
    	unsigned long pages_for_compaction;
    	unsigned long inactive_lru_pages;
    
    	/* If not in reclaim/compaction mode, stop */
    
    	if (!in_reclaim_compaction(sc))
    		return false;

    	/* Consider stopping depending on scan and reclaim activity */
    	if (sc->gfp_mask & __GFP_REPEAT) {
    		/*
    		 * For __GFP_REPEAT allocations, stop reclaiming if the
    		 * full LRU list has been scanned and we are still failing
    		 * to reclaim pages. This full LRU scan is potentially
    		 * expensive but a __GFP_REPEAT caller really wants to succeed
    		 */
    		if (!nr_reclaimed && !nr_scanned)
    			return false;
    	} else {
    		/*
    		 * For non-__GFP_REPEAT allocations which can presumably
    		 * fail without consequence, stop if we failed to reclaim
    		 * any pages from the last SWAP_CLUSTER_MAX number of
    		 * pages that were scanned. This will return to the
    		 * caller faster, at the risk that reclaim/compaction and
    		 * the resulting allocation attempt fail.
    		 */
    		if (!nr_reclaimed)
    			return false;
    	}
    
    
    	/*
    	 * If we have not reclaimed enough pages for compaction and the
    	 * inactive lists are large enough, continue reclaiming
    	 */
    	pages_for_compaction = (2UL << sc->order);
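    	/* For illustration: an order-3 request makes this 2UL << 3 == 16 pages. */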
    
    	inactive_lru_pages = zone_page_state(zone, NR_INACTIVE_FILE);
    
    	if (get_nr_swap_pages() > 0)
    
    		inactive_lru_pages += zone_page_state(zone, NR_INACTIVE_ANON);
    
    	if (sc->nr_reclaimed < pages_for_compaction &&
    			inactive_lru_pages > pages_for_compaction)
    		return true;
    
    	/* If compaction would go ahead or the allocation would succeed, stop */