    	*scanned = scan;
    	return nr_taken;
    }
    
    
    static unsigned long isolate_pages_global(unsigned long nr,
    					struct list_head *dst,
    					unsigned long *scanned, int order,
					int mode, struct zone *z,
					int active, int file)
{
	int lru = LRU_BASE;

	if (active)
		lru += LRU_ACTIVE;
    	if (file)
    		lru += LRU_FILE;
	return isolate_lru_pages(nr, &z->lru[lru].list, dst, scanned, order,
								mode, file);
}

    /*
     * clear_active_flags() is a helper for shrink_active_list(), clearing
     * any active bits from the pages in the list.
     */
    
    static unsigned long clear_active_flags(struct list_head *page_list,
    					unsigned int *count)
    
    {
	int nr_active = 0;
	int lru;
    	struct page *page;
    
    
    	list_for_each_entry(page, page_list, lru) {
    
		lru = page_lru_base_type(page);
    		if (PageActive(page)) {
    
			lru += LRU_ACTIVE;
    			ClearPageActive(page);
    			nr_active++;
		}
		count[lru]++;
	}

    	return nr_active;
    }
    
    
    /**
     * isolate_lru_page - tries to isolate a page from its LRU list
     * @page: page to isolate from its LRU list
     *
     * Isolates a @page from an LRU list, clears PageLRU and adjusts the
     * vmstat statistic corresponding to whatever LRU list the page was on.
     *
     * Returns 0 if the page was removed from an LRU list.
     * Returns -EBUSY if the page was not on an LRU list.
     *
 * The returned page will have PageLRU() cleared.  If it was found on
 * the active list, it will have PageActive set.  If it was found on
 * the unevictable list, it will have the PageUnevictable bit set. That flag
 * may need to be cleared by the caller before letting the page go.
     *
     * The vmstat statistic corresponding to the list on which the page was
     * found will be decremented.
     *
     * Restrictions:
     * (1) Must be called with an elevated refcount on the page. This is a
 *     fundamental difference from isolate_lru_pages (which is called
     *     without a stable reference).
     * (2) the lru_lock must not be held.
     * (3) interrupts must be enabled.
     */
    int isolate_lru_page(struct page *page)
    {
    	int ret = -EBUSY;
    
    	if (PageLRU(page)) {
    		struct zone *zone = page_zone(page);
    
    		spin_lock_irq(&zone->lru_lock);
    		if (PageLRU(page) && get_page_unless_zero(page)) {
    
    			int lru = page_lru(page);
    
    			ret = 0;
    			ClearPageLRU(page);
    
    
    			del_page_from_lru_list(zone, page, lru);
    
    		}
    		spin_unlock_irq(&zone->lru_lock);
    	}
    	return ret;
    }
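
/*
 * Illustrative usage sketch (not code from this file): a typical caller
 * already holds its own reference (e.g. from find_get_page()) and then does
 * roughly:
 *
 *	if (isolate_lru_page(page) == 0) {
 *		... work on the page while it is off the LRU ...
 *		putback_lru_page(page);
 *	}
 *	put_page(page);
 *
 * putback_lru_page() re-adds the page to the LRU it belongs on and drops the
 * extra reference that isolate_lru_page() took via get_page_unless_zero();
 * the caller's own reference is dropped separately with put_page().
 */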
    
    
    /*
     * Are there way too many processes in the direct reclaim path already?
     */
    static int too_many_isolated(struct zone *zone, int file,
    		struct scan_control *sc)
    {
    	unsigned long inactive, isolated;
    
    	if (current_is_kswapd())
    		return 0;
    
    	if (!scanning_global_lru(sc))
    		return 0;
    
    	if (file) {
    		inactive = zone_page_state(zone, NR_INACTIVE_FILE);
    		isolated = zone_page_state(zone, NR_ISOLATED_FILE);
    	} else {
    		inactive = zone_page_state(zone, NR_INACTIVE_ANON);
    		isolated = zone_page_state(zone, NR_ISOLATED_ANON);
    	}
    
    	return isolated > inactive;
    }
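
/*
 * Illustration (made-up numbers): if NR_INACTIVE_FILE is 800 but concurrent
 * direct reclaimers have already pushed NR_ISOLATED_FILE to 1000, this
 * returns 1 and the while loop at the top of shrink_inactive_list() stalls
 * the caller in congestion_wait() until some of those pages are put back.
 * kswapd and memcg-limit reclaim are exempted by the early returns above.
 */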
    
    
/*
 * shrink_inactive_list() is a helper for shrink_zone().  It returns the number
 * of reclaimed pages
     */
    
    static unsigned long shrink_inactive_list(unsigned long max_scan,
    
    			struct zone *zone, struct scan_control *sc,
    			int priority, int file)
    
    {
    	LIST_HEAD(page_list);
    	struct pagevec pvec;
    
    	unsigned long nr_scanned = 0;
    
    	unsigned long nr_reclaimed = 0;
    
    	struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc);
    
    	while (unlikely(too_many_isolated(zone, file, sc))) {
    
    		congestion_wait(BLK_RW_ASYNC, HZ/10);
    
    
    		/* We are about to die and free our memory. Return now. */
    		if (fatal_signal_pending(current))
    			return SWAP_CLUSTER_MAX;
	}

    	pagevec_init(&pvec, 1);
    
    	lru_add_drain();
	spin_lock_irq(&zone->lru_lock);
	do {
    		struct page *page;
    
    		unsigned long nr_taken;
    		unsigned long nr_scan;
		unsigned long nr_freed;
    		unsigned long nr_active;
    
    		unsigned int count[NR_LRU_LISTS] = { 0, };
    
    		int mode = sc->lumpy_reclaim_mode ? ISOLATE_BOTH : ISOLATE_INACTIVE;
    
    		unsigned long nr_anon;
		unsigned long nr_file;

    		if (scanning_global_lru(sc)) {
    
    			nr_taken = isolate_pages_global(SWAP_CLUSTER_MAX,
    							&page_list, &nr_scan,
    							sc->order, mode,
    							zone, 0, file);
    
    			zone->pages_scanned += nr_scan;
    			if (current_is_kswapd())
    				__count_zone_vm_events(PGSCAN_KSWAPD, zone,
    						       nr_scan);
    			else
    				__count_zone_vm_events(PGSCAN_DIRECT, zone,
    						       nr_scan);
    
    		} else {
    			nr_taken = mem_cgroup_isolate_pages(SWAP_CLUSTER_MAX,
    							&page_list, &nr_scan,
    							sc->order, mode,
    							zone, sc->mem_cgroup,
    							0, file);
    			/*
    			 * mem_cgroup_isolate_pages() keeps track of
    			 * scanned pages on its own.
			 */
		}

		if (nr_taken == 0)
			goto done;

		nr_active = clear_active_flags(&page_list, count);
		__count_vm_events(PGDEACTIVATE, nr_active);

    		__mod_zone_page_state(zone, NR_ACTIVE_FILE,
    						-count[LRU_ACTIVE_FILE]);
    		__mod_zone_page_state(zone, NR_INACTIVE_FILE,
    						-count[LRU_INACTIVE_FILE]);
    		__mod_zone_page_state(zone, NR_ACTIVE_ANON,
    						-count[LRU_ACTIVE_ANON]);
    		__mod_zone_page_state(zone, NR_INACTIVE_ANON,
    						-count[LRU_INACTIVE_ANON]);
    
    
    		nr_anon = count[LRU_ACTIVE_ANON] + count[LRU_INACTIVE_ANON];
    		nr_file = count[LRU_ACTIVE_FILE] + count[LRU_INACTIVE_FILE];
    		__mod_zone_page_state(zone, NR_ISOLATED_ANON, nr_anon);
		__mod_zone_page_state(zone, NR_ISOLATED_FILE, nr_file);

    		reclaim_stat->recent_scanned[0] += nr_anon;
		reclaim_stat->recent_scanned[1] += nr_file;

    		spin_unlock_irq(&zone->lru_lock);
    
    
    		nr_scanned += nr_scan;
    
    		nr_freed = shrink_page_list(&page_list, sc, PAGEOUT_IO_ASYNC);
    
    		/*
    		 * If we are direct reclaiming for contiguous pages and we do
    		 * not reclaim everything in the list, try again and wait
    		 * for IO to complete. This will stall high-order allocations
    		 * but that should be acceptable to the caller
    		 */
		if (nr_freed < nr_taken && !current_is_kswapd() &&
		    sc->lumpy_reclaim_mode) {
    			congestion_wait(BLK_RW_ASYNC, HZ/10);
    
    
    			/*
    			 * The attempt at page out may have made some
    			 * of the pages active, mark them inactive again.
    			 */
    
    			nr_active = clear_active_flags(&page_list, count);
    
    			count_vm_events(PGDEACTIVATE, nr_active);
    
    			nr_freed += shrink_page_list(&page_list, sc,
    							PAGEOUT_IO_SYNC);
    		}
    
    
    		nr_reclaimed += nr_freed;
    
    		local_irq_disable();
    
    		if (current_is_kswapd())
    
    			__count_vm_events(KSWAPD_STEAL, nr_freed);
    
		__count_zone_vm_events(PGSTEAL, zone, nr_freed);

		spin_lock(&zone->lru_lock);
    		/*
    		 * Put back any unfreeable pages.
    		 */
		while (!list_empty(&page_list)) {
			int lru;
			page = lru_to_page(&page_list);
			VM_BUG_ON(PageLRU(page));
    			list_del(&page->lru);
    
    			if (unlikely(!page_evictable(page, NULL))) {
    				spin_unlock_irq(&zone->lru_lock);
    				putback_lru_page(page);
    				spin_lock_irq(&zone->lru_lock);
    				continue;
    			}
    			SetPageLRU(page);
    			lru = page_lru(page);
    			add_page_to_lru_list(zone, page, lru);
    
    			if (is_active_lru(lru)) {
    
    				int file = is_file_lru(lru);
    
				reclaim_stat->recent_rotated[file]++;
			}
    			if (!pagevec_add(&pvec, page)) {
    				spin_unlock_irq(&zone->lru_lock);
    				__pagevec_release(&pvec);
    				spin_lock_irq(&zone->lru_lock);
    			}
    		}
    
    		__mod_zone_page_state(zone, NR_ISOLATED_ANON, -nr_anon);
		__mod_zone_page_state(zone, NR_ISOLATED_FILE, -nr_file);

	} while (nr_scanned < max_scan);

    done:
    
	spin_unlock_irq(&zone->lru_lock);
    	pagevec_release(&pvec);
    
	return nr_reclaimed;
}

    /*
     * We are about to scan this zone at a certain priority level.  If that priority
     * level is smaller (ie: more urgent) than the previous priority, then note
     * that priority level within the zone.  This is done so that when the next
     * process comes in to scan this zone, it will immediately start out at this
     * priority level rather than having to build up its own scanning priority.
     * Here, this priority affects only the reclaim-mapped threshold.
     */
    static inline void note_zone_scanning_priority(struct zone *zone, int priority)
    {
    	if (priority < zone->prev_priority)
    		zone->prev_priority = priority;
}

    /*
     * This moves pages from the active list to the inactive list.
     *
     * We move them the other way if the page is referenced by one or more
     * processes, from rmap.
     *
     * If the pages are mostly unmapped, the processing is fast and it is
     * appropriate to hold zone->lru_lock across the whole operation.  But if
     * the pages are mapped, the processing is slow (page_referenced()) so we
     * should drop zone->lru_lock around each page.  It's impossible to balance
     * this, so instead we remove the pages from the LRU while processing them.
     * It is safe to rely on PG_active against the non-LRU pages in here because
     * nobody will play with that bit on a non-LRU page.
     *
     * The downside is that we have to touch page->_count against each page.
     * But we had to alter page->flags anyway.
     */
    
    static void move_active_pages_to_lru(struct zone *zone,
    				     struct list_head *list,
    				     enum lru_list lru)
    {
    	unsigned long pgmoved = 0;
    	struct pagevec pvec;
    	struct page *page;
    
    	pagevec_init(&pvec, 1);
    
    	while (!list_empty(list)) {
    		page = lru_to_page(list);
    
    		VM_BUG_ON(PageLRU(page));
    		SetPageLRU(page);
    
    		list_move(&page->lru, &zone->lru[lru].list);
    		mem_cgroup_add_lru_list(page, lru);
    		pgmoved++;
    
    		if (!pagevec_add(&pvec, page) || list_empty(list)) {
    			spin_unlock_irq(&zone->lru_lock);
    			if (buffer_heads_over_limit)
    				pagevec_strip(&pvec);
    			__pagevec_release(&pvec);
    			spin_lock_irq(&zone->lru_lock);
    		}
    	}
    	__mod_zone_page_state(zone, NR_LRU_BASE + lru, pgmoved);
    	if (!is_active_lru(lru))
    		__count_vm_events(PGDEACTIVATE, pgmoved);
    }
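
/*
 * Note on the batching above (illustrative): the pagevec holds a small batch
 * of pages (PAGEVEC_SIZE, which is 14 here), so zone->lru_lock is dropped
 * roughly every 14 pages while __pagevec_release() drops the isolation
 * references, rather than holding the lock across page freeing.
 */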
    
    static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
    
			struct scan_control *sc, int priority, int file)
    {
    
    	unsigned long nr_taken;
    
    	unsigned long pgscanned;
    
	unsigned long vm_flags;
	LIST_HEAD(l_hold);	/* The pages which were snipped off */
	LIST_HEAD(l_active);
	LIST_HEAD(l_inactive);
    	struct page *page;
    
    	struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc);
    
	unsigned long nr_rotated = 0;
    
    	lru_add_drain();
    	spin_lock_irq(&zone->lru_lock);
    
    	if (scanning_global_lru(sc)) {
    
    		nr_taken = isolate_pages_global(nr_pages, &l_hold,
    						&pgscanned, sc->order,
    						ISOLATE_ACTIVE, zone,
    						1, file);
    
    	} else {
    		nr_taken = mem_cgroup_isolate_pages(nr_pages, &l_hold,
    						&pgscanned, sc->order,
    						ISOLATE_ACTIVE, zone,
    						sc->mem_cgroup, 1, file);
    		/*
    		 * mem_cgroup_isolate_pages() keeps track of
    		 * scanned pages on its own.
		 */
	}

    	reclaim_stat->recent_scanned[file] += nr_taken;
    
	__count_zone_vm_events(PGREFILL, zone, pgscanned);
	if (file)
		__mod_zone_page_state(zone, NR_ACTIVE_FILE, -nr_taken);
	else
		__mod_zone_page_state(zone, NR_ACTIVE_ANON, -nr_taken);
	__mod_zone_page_state(zone, NR_ISOLATED_ANON + file, nr_taken);
    	spin_unlock_irq(&zone->lru_lock);
    
    	while (!list_empty(&l_hold)) {
    		cond_resched();
    		page = lru_to_page(&l_hold);
    		list_del(&page->lru);
    
    		if (unlikely(!page_evictable(page, NULL))) {
    			putback_lru_page(page);
    			continue;
    		}
    
    
		if (page_referenced(page, 0, sc->mem_cgroup, &vm_flags)) {
			nr_rotated++;
    			/*
    			 * Identify referenced, file-backed active pages and
    			 * give them one more trip around the active list. So
    			 * that executable code get better chances to stay in
    			 * memory under moderate memory pressure.  Anon pages
    			 * are not likely to be evicted by use-once streaming
    			 * IO, plus JVM can create lots of anon VM_EXEC pages,
    			 * so we ignore them here.
    			 */
    
    			if ((vm_flags & VM_EXEC) && page_is_file_cache(page)) {
    
    				list_add(&page->lru, &l_active);
    				continue;
    			}
    		}
    
		ClearPageActive(page);	/* we are de-activating */
    		list_add(&page->lru, &l_inactive);
	}

	/*
	 * Move pages back to the lru list.
	 */
	spin_lock_irq(&zone->lru_lock);
	/*
	 * Count referenced pages from currently used mappings as rotated,
	 * even though only some of them are actually re-activated.  This
	 * helps balance scan pressure between file and anonymous pages in
	 * get_scan_ratio.
	 */
    	reclaim_stat->recent_rotated[file] += nr_rotated;
    
    	move_active_pages_to_lru(zone, &l_active,
    						LRU_ACTIVE + file * LRU_FILE);
    	move_active_pages_to_lru(zone, &l_inactive,
    						LRU_BASE   + file * LRU_FILE);
    
    	__mod_zone_page_state(zone, NR_ISOLATED_ANON + file, -nr_taken);
    
	spin_unlock_irq(&zone->lru_lock);
}

    static int inactive_anon_is_low_global(struct zone *zone)
    
    {
    	unsigned long active, inactive;
    
    	active = zone_page_state(zone, NR_ACTIVE_ANON);
    	inactive = zone_page_state(zone, NR_INACTIVE_ANON);
    
    	if (inactive * zone->inactive_ratio < active)
    		return 1;
    
    	return 0;
    }
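
/*
 * Worked example (illustrative numbers): zone->inactive_ratio is sized at
 * boot from the zone size; suppose it is 3.  With 3000 active and 900
 * inactive anon pages, 900 * 3 = 2700 < 3000, so this returns 1 and
 * shrink_zone() will deactivate some anon pages even when it is not
 * otherwise scanning anon.
 */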
    
    
    /**
     * inactive_anon_is_low - check if anonymous pages need to be deactivated
     * @zone: zone to check
     * @sc:   scan control of this context
     *
     * Returns true if the zone does not have enough inactive anon pages,
     * meaning some active anon pages need to be deactivated.
     */
    static int inactive_anon_is_low(struct zone *zone, struct scan_control *sc)
    {
	int low;

	if (scanning_global_lru(sc))
		low = inactive_anon_is_low_global(zone);
	else
		low = mem_cgroup_inactive_anon_is_low(sc->mem_cgroup);
	return low;
}

    static int inactive_file_is_low_global(struct zone *zone)
    {
    	unsigned long active, inactive;
    
    	active = zone_page_state(zone, NR_ACTIVE_FILE);
    	inactive = zone_page_state(zone, NR_INACTIVE_FILE);
    
    	return (active > inactive);
    }
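
/*
 * Unlike the anon check above, file pages use a plain 1:1 comparison:
 * deactivation is considered necessary whenever the active file list is
 * larger than the inactive one, matching the use-once behaviour described
 * in the comment below.
 */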
    
    /**
     * inactive_file_is_low - check if file pages need to be deactivated
     * @zone: zone to check
     * @sc:   scan control of this context
     *
     * When the system is doing streaming IO, memory pressure here
     * ensures that active file pages get deactivated, until more
     * than half of the file pages are on the inactive list.
     *
     * Once we get to that situation, protect the system's working
     * set from being evicted by disabling active file page aging.
     *
     * This uses a different ratio than the anonymous pages, because
     * the page cache uses a use-once replacement algorithm.
     */
    static int inactive_file_is_low(struct zone *zone, struct scan_control *sc)
    {
    	int low;
    
    	if (scanning_global_lru(sc))
    		low = inactive_file_is_low_global(zone);
    	else
    		low = mem_cgroup_inactive_file_is_low(sc->mem_cgroup);
    	return low;
    }
    
    
    static int inactive_list_is_low(struct zone *zone, struct scan_control *sc,
    				int file)
    {
    	if (file)
    		return inactive_file_is_low(zone, sc);
    	else
    		return inactive_anon_is_low(zone, sc);
    }
    
    
static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan,
	struct zone *zone, struct scan_control *sc, int priority)
{
	int file = is_file_lru(lru);

    	if (is_active_lru(lru)) {
    		if (inactive_list_is_low(zone, sc, file))
			shrink_active_list(nr_to_scan, zone, sc, priority, file);
		return 0;
	}

	return shrink_inactive_list(nr_to_scan, zone, sc, priority, file);
}

    /*
     * Smallish @nr_to_scan's are deposited in @nr_saved_scan,
     * until we collected @swap_cluster_max pages to scan.
     */
    static unsigned long nr_scan_try_batch(unsigned long nr_to_scan,
    				       unsigned long *nr_saved_scan)
    {
    	unsigned long nr;
    
    	*nr_saved_scan += nr_to_scan;
    	nr = *nr_saved_scan;
    
    	if (nr >= SWAP_CLUSTER_MAX)
    		*nr_saved_scan = 0;
    	else
    		nr = 0;
    
    	return nr;
    }
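
/*
 * Worked example (SWAP_CLUSTER_MAX is 32): three calls with nr_to_scan of
 * 10, 10 and 15 against the same *nr_saved_scan return 0, 0 and then 35,
 * after which the saved count is reset to 0.  Small per-priority scan
 * targets are therefore accumulated until a full batch is worth doing.
 */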
    
    
    /*
     * Determine how aggressively the anon and file LRU lists should be
     * scanned.  The relative value of each set of LRU lists is determined
     * by looking at the fraction of the pages scanned we did rotate back
     * onto the active list instead of evict.
     *
    
 * nr[0] = anon pages to scan; nr[1] = file pages to scan
 */
    static void get_scan_count(struct zone *zone, struct scan_control *sc,
    					unsigned long *nr, int priority)
    
    {
    	unsigned long anon, file, free;
    	unsigned long anon_prio, file_prio;
    	unsigned long ap, fp;
    
    	struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc);
    
    	u64 fraction[2], denominator;
    	enum lru_list l;
    	int noswap = 0;
    
    	/* If we have no swap space, do not bother scanning anon pages. */
    	if (!sc->may_swap || (nr_swap_pages <= 0)) {
    		noswap = 1;
    		fraction[0] = 0;
    		fraction[1] = 1;
    		denominator = 1;
    		goto out;
    	}
    
    	anon  = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_ANON) +
    		zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON);
    	file  = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_FILE) +
    		zone_nr_lru_pages(zone, sc, LRU_INACTIVE_FILE);
    
    	if (scanning_global_lru(sc)) {
    
    		free  = zone_page_state(zone, NR_FREE_PAGES);
    		/* If we have very few page cache pages,
    		   force-scan anon pages. */
    
    		if (unlikely(file + free <= high_wmark_pages(zone))) {
    
    			fraction[0] = 1;
    			fraction[1] = 0;
    			denominator = 1;
			goto out;
		}
    	}
    
    	/*
    	 * OK, so we have swap space and a fair amount of page cache
    	 * pages.  We use the recently rotated / recently scanned
    	 * ratios to determine how valuable each cache is.
    	 *
    	 * Because workloads change over time (and to avoid overflow)
    	 * we keep these statistics as a floating average, which ends
    	 * up weighing recent references more than old ones.
    	 *
    	 * anon in [0], file in [1]
    	 */
    
    	if (unlikely(reclaim_stat->recent_scanned[0] > anon / 4)) {
    
    		spin_lock_irq(&zone->lru_lock);
    
    		reclaim_stat->recent_scanned[0] /= 2;
    		reclaim_stat->recent_rotated[0] /= 2;
    
    		spin_unlock_irq(&zone->lru_lock);
    	}
    
    
    	if (unlikely(reclaim_stat->recent_scanned[1] > file / 4)) {
    
    		spin_lock_irq(&zone->lru_lock);
    
    		reclaim_stat->recent_scanned[1] /= 2;
    		reclaim_stat->recent_rotated[1] /= 2;
    
    		spin_unlock_irq(&zone->lru_lock);
    	}
    
    	/*
    	 * With swappiness at 100, anonymous and file have the same priority.
    	 * This scanning priority is essentially the inverse of IO cost.
    	 */
    	anon_prio = sc->swappiness;
    	file_prio = 200 - sc->swappiness;
    
    	/*
    
    	 * The amount of pressure on anon vs file pages is inversely
    	 * proportional to the fraction of recently scanned pages on
	 * each list that were recently referenced and in active use.
	 */
    	ap = (anon_prio + 1) * (reclaim_stat->recent_scanned[0] + 1);
    	ap /= reclaim_stat->recent_rotated[0] + 1;
    
    	fp = (file_prio + 1) * (reclaim_stat->recent_scanned[1] + 1);
    	fp /= reclaim_stat->recent_rotated[1] + 1;
    
    	fraction[0] = ap;
    	fraction[1] = fp;
    	denominator = ap + fp + 1;
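
	/*
	 * Worked example (illustrative numbers), assuming swappiness == 60:
	 * anon_prio = 60, file_prio = 140.  With recent_scanned/recent_rotated
	 * of 100/50 for anon and 100/10 for file:
	 *   ap = 61 * 101 / 51  = 120
	 *   fp = 141 * 101 / 11 = 1294
	 * denominator = 1415, so roughly 91% of the scan pressure below goes
	 * to the file lists and 9% to anon, before the >> priority scaling.
	 */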
    out:
    	for_each_evictable_lru(l) {
    		int file = is_file_lru(l);
    		unsigned long scan;
    
    		scan = zone_nr_lru_pages(zone, sc, l);
    		if (priority || noswap) {
    			scan >>= priority;
    			scan = div64_u64(scan * fraction[file], denominator);
    		}
    		nr[l] = nr_scan_try_batch(scan,
    					  &reclaim_stat->nr_saved_scan[l]);
	}
}

    static void set_lumpy_reclaim_mode(int priority, struct scan_control *sc)
    {
    	/*
    	 * If we need a large contiguous chunk of memory, or have
    	 * trouble getting a small set of contiguous pages, we
    	 * will reclaim both active and inactive pages.
    	 */
    	if (sc->order > PAGE_ALLOC_COSTLY_ORDER)
    		sc->lumpy_reclaim_mode = 1;
    	else if (sc->order && priority < DEF_PRIORITY - 2)
    		sc->lumpy_reclaim_mode = 1;
    	else
    		sc->lumpy_reclaim_mode = 0;
    }
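
/*
 * Example: with PAGE_ALLOC_COSTLY_ORDER == 3 and DEF_PRIORITY == 12, an
 * order-4 request (16 contiguous pages) runs in lumpy reclaim mode from the
 * first pass, an order-2 request only once priority has dropped below 10
 * (i.e. after a few unsuccessful passes), and order-0 reclaim never does.
 */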
    
    
    /*
     * This is a basic per-zone page freer.  Used by both kswapd and direct reclaim.
     */
    
static void shrink_zone(int priority, struct zone *zone,
				struct scan_control *sc)
    {
    
	unsigned long nr[NR_LRU_LISTS];
	unsigned long nr_to_scan;
	unsigned long nr_reclaimed = sc->nr_reclaimed;
	unsigned long nr_to_reclaim = sc->nr_to_reclaim;
	enum lru_list l;

	get_scan_count(zone, sc, nr, priority);

    	while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
    					nr[LRU_INACTIVE_FILE]) {
    
		for_each_evictable_lru(l) {
			if (nr[l]) {
    				nr_to_scan = min_t(unsigned long,
						   nr[l], SWAP_CLUSTER_MAX);
				nr[l] -= nr_to_scan;

    				nr_reclaimed += shrink_list(l, nr_to_scan,
							    zone, sc, priority);
			}
    		}
    
    		/*
    		 * On large memory systems, scan >> priority can become
    		 * really large. This is fine for the starting priority;
    		 * we want to put equal scanning pressure on each zone.
    		 * However, if the VM has a harder time of freeing pages,
    		 * with multiple processes reclaiming pages, the total
    		 * freeing target can get unreasonably large.
    		 */
    
		if (nr_reclaimed >= nr_to_reclaim && priority < DEF_PRIORITY)
			break;
	}

    	sc->nr_reclaimed = nr_reclaimed;
    
    
    	/*
    	 * Even if we did not try to evict anon pages at all, we want to
    	 * rebalance the anon lru active/inactive ratio.
    	 */
    
	if (inactive_anon_is_low(zone, sc) && nr_swap_pages > 0)
		shrink_active_list(SWAP_CLUSTER_MAX, zone, sc, priority, 0);
    }
    
    /*
     * This is the direct reclaim path, for page-allocating processes.  We only
     * try to reclaim pages from zones which will satisfy the caller's allocation
     * request.
     *
    
 * We reclaim from a zone even if that zone is over high_wmark_pages(zone).
 * Because:
 * a) The caller may be trying to free *extra* pages to satisfy a higher-order
 *    allocation or
 * b) The target zone may be at high_wmark_pages(zone) but the lower zones
 *    must go *over* high_wmark_pages(zone) to satisfy the `incremental min'
 *    zone defense algorithm.
     *
     * If a zone is deemed to be full of pinned pages then just give it a light
     * scan then give up on it.
     */
    
static bool shrink_zones(int priority, struct zonelist *zonelist,
					struct scan_control *sc)
    {
    
	enum zone_type high_zoneidx = gfp_zone(sc->gfp_mask);
	struct zoneref *z;
	struct zone *zone;
	bool all_unreclaimable = true;

    	for_each_zone_zonelist_nodemask(zone, z, zonelist, high_zoneidx,
    					sc->nodemask) {
    
		if (!populated_zone(zone))
    			continue;
    
    		/*
    		 * Take care memory controller reclaiming has small influence
    		 * to global LRU.
    		 */
    
    		if (scanning_global_lru(sc)) {
    
    			if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
    				continue;
			note_zone_scanning_priority(zone, priority);

    			if (zone->all_unreclaimable && priority != DEF_PRIORITY)
    
    				continue;	/* Let kswapd poll it */
    		} else {
    			/*
    			 * Ignore cpuset limitation here. We just want to reduce
    			 * # of used pages by us regardless of memory shortage.
    			 */
    			mem_cgroup_note_reclaim_priority(sc->mem_cgroup,
    							priority);
    		}
    
		shrink_zone(priority, zone, sc);
		all_unreclaimable = false;
	}

	return all_unreclaimable;
    }
    
    /*
     * This is the main entry point to direct page reclaim.
     *
     * If a full scan of the inactive list fails to free enough memory then we
     * are "out of memory" and something needs to be killed.
     *
     * If the caller is !__GFP_FS then the probability of a failure is reasonably
 * high - the zone may be full of dirty or under-writeback pages, which this
 * caller can't do much about.  We kick the writeback threads and take explicit
 * naps in the hope that some of these pages can be written.  But if the
 * allocating task holds filesystem locks which prevent writeout this might not
 * work, and the allocation attempt will fail.
 *
 * returns:	0, if no pages reclaimed
 * 		else, the number of pages reclaimed
     */
    
static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
					struct scan_control *sc)
    {
	int priority;
	bool all_unreclaimable;
	unsigned long total_scanned = 0;
	struct reclaim_state *reclaim_state = current->reclaim_state;
	struct zoneref *z;
	struct zone *zone;
	enum zone_type high_zoneidx = gfp_zone(sc->gfp_mask);
	unsigned long writeback_threshold;
    
    
    	delayacct_freepages_start();
    
    
	if (scanning_global_lru(sc))
		count_vm_event(ALLOCSTALL);

    	for (priority = DEF_PRIORITY; priority >= 0; priority--) {
    
    		sc->nr_scanned = 0;
    
    		if (!priority)
    			disable_swap_token();
    
    		all_unreclaimable = shrink_zones(priority, zonelist, sc);
    
    		/*
    		 * Don't shrink slabs when reclaiming memory from
    		 * over limit cgroups
    		 */
    
    		if (scanning_global_lru(sc)) {
    
    			unsigned long lru_pages = 0;
    			for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
    				if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
    					continue;
    
    				lru_pages += zone_reclaimable_pages(zone);
    			}
    
    
			shrink_slab(sc->nr_scanned, sc->gfp_mask, lru_pages);
			if (reclaim_state) {
				sc->nr_reclaimed += reclaim_state->reclaimed_slab;
				reclaim_state->reclaimed_slab = 0;
			}
    		}
    
    		total_scanned += sc->nr_scanned;
    
		if (sc->nr_reclaimed >= sc->nr_to_reclaim)
    			goto out;
    
    		/*
    		 * Try to write back as many pages as we just scanned.  This
    		 * tends to cause slow streaming writers to write data to the
    		 * disk smoothly, at the dirtying rate, which is nice.   But
    		 * that's undesirable in laptop mode, where we *want* lumpy
    		 * writeout.  So in laptop mode, write out the whole world.
    		 */
    
    		writeback_threshold = sc->nr_to_reclaim + sc->nr_to_reclaim / 2;
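		/*
		 * e.g. with the usual direct reclaim target of
		 * SWAP_CLUSTER_MAX (32) pages, writeback_threshold is 48:
		 * once more than 48 pages have been scanned in total, the
		 * flusher threads are kicked and pageout is allowed.
		 */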
    		if (total_scanned > writeback_threshold) {
    
    			wakeup_flusher_threads(laptop_mode ? 0 : total_scanned);
    
			sc->may_writepage = 1;
    		}
    
    		/* Take a nap, wait for some writeback to complete */
    
    		if (!sc->hibernation_mode && sc->nr_scanned &&
    		    priority < DEF_PRIORITY - 2)
    
			congestion_wait(BLK_RW_ASYNC, HZ/10);
	}

    out:
    
    	/*
    	 * Now that we've scanned all the zones at this priority level, note
    	 * that level within the zone so that the next thread which performs
    	 * scanning of this zone will immediately start out at this priority
    	 * level.  This affects only the decision whether or not to bring
    	 * mapped pages onto the inactive list.
    	 */
    	if (priority < 0)
		priority = 0;

    	if (scanning_global_lru(sc)) {
    
    		for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
    
    
    			if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
    				continue;
    
    			zone->prev_priority = priority;
    		}
    	} else
		mem_cgroup_record_reclaim_priority(sc->mem_cgroup, priority);

    	delayacct_freepages_end();
    
    	if (sc->nr_reclaimed)
    		return sc->nr_reclaimed;
    
    	/* top priority shrink_zones still had more to do? don't OOM, then */
    	if (scanning_global_lru(sc) && !all_unreclaimable)
    		return 1;
    
	return 0;
}

    unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
    
    				gfp_t gfp_mask, nodemask_t *nodemask)
    
    {
    	struct scan_control sc = {
    		.gfp_mask = gfp_mask,
    		.may_writepage = !laptop_mode,
    
    		.nr_to_reclaim = SWAP_CLUSTER_MAX,
    
    		.may_unmap = 1,
    
    		.may_swap = 1,
    
    		.swappiness = vm_swappiness,
    		.order = order,
    		.mem_cgroup = NULL,
    
		.nodemask = nodemask,
	};

	return do_try_to_free_pages(zonelist, &sc);
}

    #ifdef CONFIG_CGROUP_MEM_RES_CTLR
    
    unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
    						gfp_t gfp_mask, bool noswap,
    						unsigned int swappiness,
    						struct zone *zone, int nid)
    {
    	struct scan_control sc = {
    		.may_writepage = !laptop_mode,
    		.may_unmap = 1,
    		.may_swap = !noswap,
    		.swappiness = swappiness,
    		.order = 0,
    		.mem_cgroup = mem,
    	};
    	nodemask_t nm  = nodemask_of_node(nid);
    
    	sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
    			(GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK);
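	/*
	 * Assumption on intent: GFP_RECLAIM_MASK covers the reclaim-behaviour
	 * bits (__GFP_IO, __GFP_FS, __GFP_WAIT, ...), so the expression above
	 * keeps the caller's reclaim constraints while taking the zone
	 * selection/movability bits from GFP_HIGHUSER_MOVABLE.
	 */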
    	sc.nodemask = &nm;
    	sc.nr_reclaimed = 0;
    	sc.nr_scanned = 0;
    	/*
    	 * NOTE: Although we can get the priority field, using it
    	 * here is not a good idea, since it limits the pages we can scan.
    	 * if we don't reclaim here, the shrink_zone from balance_pgdat
    	 * will pick up pages from other mem cgroup's as well. We hack
    	 * the priority and make it zero.
    	 */
    	shrink_zone(0, zone, &sc);
    	return sc.nr_reclaimed;
    }
    
    
unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
					   gfp_t gfp_mask,
					   bool noswap,
					   unsigned int swappiness)
{
    	struct zonelist *zonelist;
    
    	struct scan_control sc = {
    		.may_writepage = !laptop_mode,
    
    		.may_unmap = 1,
    
    		.may_swap = !noswap,
    
		.nr_to_reclaim = SWAP_CLUSTER_MAX,
    		.swappiness = swappiness,
    
    		.order = 0,
    		.mem_cgroup = mem_cont,
    
		.nodemask = NULL, /* we don't care the placement */
	};

    	sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
    			(GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK);
    	zonelist = NODE_DATA(numa_node_id())->node_zonelists;
	return do_try_to_free_pages(zonelist, &sc);
}

static int sleeping_prematurely(pg_data_t *pgdat, int order, long remaining)
{
	int i;

    	/* If a direct reclaimer woke kswapd within HZ/10, it's premature */
    	if (remaining)
    		return 1;
    
    	/* If after HZ/10, a zone is below the high mark, it's premature */
    
    	for (i = 0; i < pgdat->nr_zones; i++) {
    		struct zone *zone = pgdat->node_zones + i;
    
    		if (!populated_zone(zone))
    			continue;
    
    
		if (zone->all_unreclaimable)
			continue;

    		if (!zone_watermark_ok(zone, order, high_wmark_pages(zone),
    								0, 0))
			return 1;
	}

	return 0;
}

    /*
 * For kswapd, balance_pgdat() will work across all this node's zones until
 * they are all at high_wmark_pages(zone).
     *
     * Returns the number of pages which were actually freed.
     *
     * There is special handling here for zones which are full of pinned pages.
     * This can happen if the pages are all mlocked, or if they are all used by
     * device drivers (say, ZONE_DMA).  Or if they are all in use by hugetlb.
     * What we do is to detect the case where all pages in the zone have been
     * scanned twice and there has been zero successful reclaim.  Mark the zone as
     * dead and from now on, only perform a short scan.  Basically we're polling
     * the zone for when the problem goes away.
     *
 * kswapd scans the zones in the highmem->normal->dma direction.  It skips
 * zones which have free_pages > high_wmark_pages(zone), but once a zone is