/*
 * This is the direct reclaim path, for page-allocating processes.  We only
     * try to reclaim pages from zones which will satisfy the caller's allocation
     * request.
 *
 * We reclaim from a zone even if that zone is over high_wmark_pages(zone).
 * Because:
 * a) The caller may be trying to free *extra* pages to satisfy a higher-order
 *    allocation or
 * b) The target zone may be at high_wmark_pages(zone) but the lower zones
 *    must go *over* high_wmark_pages(zone) to satisfy the `incremental min'
 *    zone defense algorithm.
 *
 * If a zone is deemed to be full of pinned pages then just give it a light
 * scan then give up on it.
 *
 * This function returns true if a zone is being reclaimed for a costly
 * high-order allocation and compaction is ready to begin. This indicates to
 * the caller that it should consider retrying the allocation instead of
 * further reclaim.
 */
static bool shrink_zones(int priority, struct zonelist *zonelist,
					struct scan_control *sc)
{
	struct zoneref *z;
	struct zone *zone;
	unsigned long nr_soft_reclaimed;
	unsigned long nr_soft_scanned;
	bool aborted_reclaim = false;
    
    	/*
    	 * If the number of buffer_heads in the machine exceeds the maximum
    	 * allowed level, force direct reclaim to scan the highmem zone as
    	 * highmem pages could be pinning lowmem pages storing buffer_heads
    	 */
    	if (buffer_heads_over_limit)
    		sc->gfp_mask |= __GFP_HIGHMEM;
    
    
    	for_each_zone_zonelist_nodemask(zone, z, zonelist,
    					gfp_zone(sc->gfp_mask), sc->nodemask) {
    
		if (!populated_zone(zone))
			continue;
		/*
		 * Take care that memory controller reclaiming has only a
		 * small influence on the global LRU.
		 */
		if (global_reclaim(sc)) {
			if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
				continue;
			if (zone->all_unreclaimable && priority != DEF_PRIORITY)
				continue;	/* Let kswapd poll it */
			if (COMPACTION_BUILD) {
				/*
				 * If we already have plenty of memory free for
				 * compaction in this zone, don't free any more.
				 * Even though compaction is invoked for any
				 * non-zero order, only frequent costly order
				 * reclamation is disruptive enough to become a
				 * noticeable problem, like transparent huge
				 * page allocations.
				 */
				if (compaction_ready(zone, sc)) {
					aborted_reclaim = true;
					continue;
				}
			}
    			/*
    			 * This steals pages from memory cgroups over softlimit
    			 * and returns the number of reclaimed pages and
    			 * scanned pages. This works for global memory pressure
    			 * and balancing, not for a memcg's limit.
    			 */
    			nr_soft_scanned = 0;
    			nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone,
    						sc->order, sc->gfp_mask,
    						&nr_soft_scanned);
    			sc->nr_reclaimed += nr_soft_reclaimed;
    			sc->nr_scanned += nr_soft_scanned;
			/* need some check to avoid more shrink_zone() */
		}

		shrink_zone(priority, zone, sc);
	}

	return aborted_reclaim;
    }
    
    static bool zone_reclaimable(struct zone *zone)
    {
    	return zone->pages_scanned < zone_reclaimable_pages(zone) * 6;
    }
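
/*
 * Note: the factor of six above means a zone stops counting as reclaimable
 * once roughly six times its reclaimable pages have been scanned since the
 * last time pages were freed back to it.
 */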
    
    
/* All zones in zonelist are unreclaimable? */
    static bool all_unreclaimable(struct zonelist *zonelist,
    		struct scan_control *sc)
    {
    	struct zoneref *z;
    	struct zone *zone;
    
    	for_each_zone_zonelist_nodemask(zone, z, zonelist,
    			gfp_zone(sc->gfp_mask), sc->nodemask) {
    		if (!populated_zone(zone))
    			continue;
    		if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
    			continue;
    
    		if (!zone->all_unreclaimable)
			return false;
	}

	return true;
    }
    
    /*
     * This is the main entry point to direct page reclaim.
     *
     * If a full scan of the inactive list fails to free enough memory then we
     * are "out of memory" and something needs to be killed.
     *
     * If the caller is !__GFP_FS then the probability of a failure is reasonably
 * high - the zone may be full of dirty or under-writeback pages, which this
 * caller can't do much about.  We kick the writeback threads and take explicit
 * naps in the hope that some of these pages can be written.  But if the
 * allocating task holds filesystem locks which prevent writeout this might not
 * work, and the allocation attempt will fail.
 *
 * returns:	0, if no pages reclaimed
 * 		else, the number of pages reclaimed
 */
    
static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
					struct scan_control *sc,
					struct shrink_control *shrink)
{
	int priority;
	unsigned long total_scanned = 0;
	struct reclaim_state *reclaim_state = current->reclaim_state;
	struct zoneref *z;
	struct zone *zone;
	unsigned long writeback_threshold;
	bool aborted_reclaim;

	delayacct_freepages_start();

	if (global_reclaim(sc))
		count_vm_event(ALLOCSTALL);
    	for (priority = DEF_PRIORITY; priority >= 0; priority--) {
    
		sc->nr_scanned = 0;
		aborted_reclaim = shrink_zones(priority, zonelist, sc);

		/*
		 * Don't shrink slabs when reclaiming memory from
		 * over limit cgroups
		 */
		if (global_reclaim(sc)) {
			unsigned long lru_pages = 0;
			for_each_zone_zonelist(zone, z, zonelist,
					gfp_zone(sc->gfp_mask)) {
				if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
					continue;

				lru_pages += zone_reclaimable_pages(zone);
			}

			shrink_slab(shrink, sc->nr_scanned, lru_pages);
			if (reclaim_state) {
				sc->nr_reclaimed += reclaim_state->reclaimed_slab;
				reclaim_state->reclaimed_slab = 0;
			}
		}
    
    		total_scanned += sc->nr_scanned;
    
		if (sc->nr_reclaimed >= sc->nr_to_reclaim)
			goto out;
    
    		/*
    		 * Try to write back as many pages as we just scanned.  This
    		 * tends to cause slow streaming writers to write data to the
    		 * disk smoothly, at the dirtying rate, which is nice.   But
    		 * that's undesirable in laptop mode, where we *want* lumpy
    		 * writeout.  So in laptop mode, write out the whole world.
    		 */
    
    		writeback_threshold = sc->nr_to_reclaim + sc->nr_to_reclaim / 2;
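		/*
		 * e.g. with nr_to_reclaim at its usual SWAP_CLUSTER_MAX (32),
		 * this threshold works out to 48 scanned pages.
		 */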
    		if (total_scanned > writeback_threshold) {
    
			wakeup_flusher_threads(laptop_mode ? 0 : total_scanned,
						WB_REASON_TRY_TO_FREE_PAGES);
			sc->may_writepage = 1;
    		}
    
    		/* Take a nap, wait for some writeback to complete */
    
		if (!sc->hibernation_mode && sc->nr_scanned &&
		    priority < DEF_PRIORITY - 2) {
			struct zone *preferred_zone;

			first_zones_zonelist(zonelist, gfp_zone(sc->gfp_mask),
						&cpuset_current_mems_allowed,
						&preferred_zone);
			wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/10);
		}
    
	}

out:
	delayacct_freepages_end();

	if (sc->nr_reclaimed)
		return sc->nr_reclaimed;

	/*
	 * As hibernation is going on, kswapd is frozen so it can't mark the
	 * zone as all_unreclaimable. Thus we bypass the all_unreclaimable
	 * check.
	 */
	if (oom_killer_disabled)
		return 0;

	/* Aborted reclaim to try compaction? don't OOM, then */
	if (aborted_reclaim)
		return 1;

	/* top priority shrink_zones still had more to do? don't OOM, then */
	if (global_reclaim(sc) && !all_unreclaimable(zonelist, sc))
		return 1;

	return 0;
}
unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
				gfp_t gfp_mask, nodemask_t *nodemask)
{
	unsigned long nr_reclaimed;
	struct scan_control sc = {
		.gfp_mask = gfp_mask,
		.may_writepage = !laptop_mode,
		.nr_to_reclaim = SWAP_CLUSTER_MAX,
		.may_unmap = 1,
		.may_swap = 1,
		.order = order,
		.nodemask = nodemask,
	};
	struct shrink_control shrink = {
		.gfp_mask = sc.gfp_mask,
	};
    
    	trace_mm_vmscan_direct_reclaim_begin(order,
    				sc.may_writepage,
    				gfp_mask);
    
    
	nr_reclaimed = do_try_to_free_pages(zonelist, &sc, &shrink);

	trace_mm_vmscan_direct_reclaim_end(nr_reclaimed);

	return nr_reclaimed;
}

    #ifdef CONFIG_CGROUP_MEM_RES_CTLR
    
    unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg,
    
    						gfp_t gfp_mask, bool noswap,
    
    						struct zone *zone,
    						unsigned long *nr_scanned)
    
{
	struct scan_control sc = {
		.nr_to_reclaim = SWAP_CLUSTER_MAX,
		.may_writepage = !laptop_mode,
		.may_unmap = 1,
		.may_swap = !noswap,
		.order = 0,
		.target_mem_cgroup = memcg,
	};
	struct mem_cgroup_zone mz = {
		.mem_cgroup = memcg,
		.zone = zone,
	};

	sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
			(GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK);
    
    
    	trace_mm_vmscan_memcg_softlimit_reclaim_begin(0,
    						      sc.may_writepage,
    						      sc.gfp_mask);
    
    
	/*
	 * NOTE: Although we can get the priority field, using it
	 * here is not a good idea, since it limits the pages we can scan.
	 * If we don't reclaim here, the shrink_zone from balance_pgdat
	 * will pick up pages from other mem cgroups as well. We hack
	 * the priority and make it zero.
	 */
	shrink_mem_cgroup_zone(0, &mz, &sc);
    
    
    	trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed);
    
    
	*nr_scanned = sc.nr_scanned;
	return sc.nr_reclaimed;
}

    unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
    
					   gfp_t gfp_mask,
					   bool noswap)
{
	struct zonelist *zonelist;
	unsigned long nr_reclaimed;
	int nid;
    	struct scan_control sc = {
    		.may_writepage = !laptop_mode,
    
    		.may_unmap = 1,
    
    		.may_swap = !noswap,
    
    		.nr_to_reclaim = SWAP_CLUSTER_MAX,
    
		.order = 0,
		.nodemask = NULL, /* we don't care about placement */
		.target_mem_cgroup = memcg,
    		.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
    				(GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK),
    	};
	struct shrink_control shrink = {
		.gfp_mask = sc.gfp_mask,
	};

    	/*
    	 * Unlike direct reclaim via alloc_pages(), memcg's reclaim doesn't
    	 * take care of from where we get pages. So the node where we start the
    	 * scan does not need to be the current node.
    	 */
    
	nid = mem_cgroup_select_victim_node(memcg);

	zonelist = NODE_DATA(nid)->node_zonelists;
    
    
    	trace_mm_vmscan_memcg_reclaim_begin(0,
    					    sc.may_writepage,
    					    sc.gfp_mask);
    
    
    	nr_reclaimed = do_try_to_free_pages(zonelist, &sc, &shrink);
    
    
    	trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed);
    
	return nr_reclaimed;
}
#endif /* CONFIG_CGROUP_MEM_RES_CTLR */

    static void age_active_anon(struct zone *zone, struct scan_control *sc,
    			    int priority)
{
	struct mem_cgroup *memcg;

	if (!total_swap_pages)
		return;
    
    	memcg = mem_cgroup_iter(NULL, NULL, NULL);
    	do {
    		struct mem_cgroup_zone mz = {
    			.mem_cgroup = memcg,
    			.zone = zone,
    		};
    
    		if (inactive_anon_is_low(&mz))
    			shrink_active_list(SWAP_CLUSTER_MAX, &mz,
    					   sc, priority, 0);
    
    		memcg = mem_cgroup_iter(NULL, memcg, NULL);
	} while (memcg);
}

    /*
     * pgdat_balanced is used when checking if a node is balanced for high-order
     * allocations. Only zones that meet watermarks and are in a zone allowed
     * by the callers classzone_idx are added to balanced_pages. The total of
     * balanced pages must be at least 25% of the zones allowed by classzone_idx
     * for the node to be considered balanced. Forcing all zones to be balanced
     * for high orders can cause excessive reclaim when there are imbalanced zones.
     * The choice of 25% is due to
     *   o a 16M DMA zone that is balanced will not balance a zone on any
     *     reasonable sized machine
 *     o On all other machines, the top zone must be at least a reasonable
 *     percentage of the middle zones. For example, on 32-bit x86, highmem
 *     would need to be at least 256M for it to balance a whole node.
 *     Similarly, on x86-64 the Normal zone would need to be at least 1G
 *     to balance a node on its own. These seemed like reasonable ratios.
 */
    static bool pgdat_balanced(pg_data_t *pgdat, unsigned long balanced_pages,
    						int classzone_idx)
    {
    	unsigned long present_pages = 0;
    	int i;
    
    	for (i = 0; i <= classzone_idx; i++)
    		present_pages += pgdat->node_zones[i].present_pages;
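	/*
	 * e.g. on a node with 4GB of present pages below classzone_idx, at
	 * least 1GB worth of those pages must sit in zones that meet their
	 * watermarks for the node to count as balanced.
	 */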
    
    
	/* A special case here: if the zone has no pages, we think it's balanced */
	return balanced_pages >= (present_pages >> 2);
}

    static bool sleeping_prematurely(pg_data_t *pgdat, int order, long remaining,
					int classzone_idx)
{
	int i;
	unsigned long balanced = 0;
	bool all_zones_ok = true;

	/* If a direct reclaimer woke kswapd within HZ/10, it's premature */
	if (remaining)
		return true;

	/* Check the watermark levels */
	for (i = 0; i <= classzone_idx; i++) {
		struct zone *zone = pgdat->node_zones + i;

		if (!populated_zone(zone))
			continue;

		/*
		 * balance_pgdat() skips over all_unreclaimable after
		 * DEF_PRIORITY. Effectively, it considers them balanced so
		 * they must be considered balanced here as well if kswapd
		 * is to sleep
		 */
		if (zone->all_unreclaimable) {
			balanced += zone->present_pages;
			continue;
		}

		if (!zone_watermark_ok_safe(zone, order, high_wmark_pages(zone),
							i, 0))
			all_zones_ok = false;
		else
			balanced += zone->present_pages;
	}

	/*
	 * For high-order requests, the balanced zones must contain at least
	 * 25% of the node's pages for kswapd to sleep. For order-0, all zones
	 * must be balanced
	 */
	if (order)
		return !pgdat_balanced(pgdat, balanced, classzone_idx);
	else
		return !all_zones_ok;
}

    /*
 * For kswapd, balance_pgdat() will work across all this node's zones until
 * they are all at high_wmark_pages(zone).
 *
 * Returns the final order kswapd was reclaiming at
 *
 * There is special handling here for zones which are full of pinned pages.
 * This can happen if the pages are all mlocked, or if they are all used by
 * device drivers (say, ZONE_DMA).  Or if they are all in use by hugetlb.
 * What we do is to detect the case where all pages in the zone have been
 * scanned twice and there has been zero successful reclaim.  Mark the zone as
 * dead and from now on, only perform a short scan.  Basically we're polling
 * the zone for when the problem goes away.
 *
 * kswapd scans the zones in the highmem->normal->dma direction.  It skips
 * zones which have free_pages > high_wmark_pages(zone), but once a zone is
 * found to have free_pages <= high_wmark_pages(zone), we scan that zone and the
 * lower zones regardless of the number of free pages in the lower zones. This
 * interoperates with the page allocator fallback scheme to ensure that aging
 * of pages is balanced across the zones.
 */
    
static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
							int *classzone_idx)
{
	int all_zones_ok;
	unsigned long balanced;
	int priority;
	int i;
	int end_zone = 0;	/* Inclusive.  0 = ZONE_DMA */
	unsigned long total_scanned;
	struct reclaim_state *reclaim_state = current->reclaim_state;
	unsigned long nr_soft_reclaimed;
	unsigned long nr_soft_scanned;
    	struct scan_control sc = {
    		.gfp_mask = GFP_KERNEL,
    
		.may_unmap = 1,
		.may_swap = 1,
		/*
		 * kswapd doesn't want to be bailed out while reclaiming,
		 * because we want to put equal scanning pressure on each zone.
		 */
		.nr_to_reclaim = ULONG_MAX,
		.order = order,
	};
	struct shrink_control shrink = {
		.gfp_mask = sc.gfp_mask,
	};
    
loop_again:
	total_scanned = 0;
	sc.nr_reclaimed = 0;
	sc.may_writepage = !laptop_mode;
	count_vm_event(PAGEOUTRUN);
    	for (priority = DEF_PRIORITY; priority >= 0; priority--) {
		unsigned long lru_pages = 0;
		int has_under_min_watermark_zone = 0;

		all_zones_ok = 1;
		balanced = 0;

    		/*
    		 * Scan in the highmem->dma direction for the highest
    		 * zone which needs scanning
    		 */
    		for (i = pgdat->nr_zones - 1; i >= 0; i--) {
    			struct zone *zone = pgdat->node_zones + i;
    
			if (!populated_zone(zone))
				continue;

			if (zone->all_unreclaimable && priority != DEF_PRIORITY)
				continue;

			/*
			 * Do some background aging of the anon list, to give
			 * pages a chance to be referenced before reclaiming.
			 */
			age_active_anon(zone, &sc, priority);

			/*
			 * If the number of buffer_heads in the machine
			 * exceeds the maximum allowed level and this node
			 * has a highmem zone, force kswapd to reclaim from
			 * it to relieve lowmem pressure.
			 */
			if (buffer_heads_over_limit && is_highmem_idx(i)) {
				end_zone = i;
				break;
			}

			if (!zone_watermark_ok_safe(zone, order,
					high_wmark_pages(zone), 0, 0)) {
				end_zone = i;
				break;
    			} else {
    				/* If balanced, clear the congested flag */
    				zone_clear_flag(zone, ZONE_CONGESTED);
    
    			}
    		}
    
    		if (i < 0)
    			goto out;
    
    
    		for (i = 0; i <= end_zone; i++) {
    			struct zone *zone = pgdat->node_zones + i;
    
    
    			lru_pages += zone_reclaimable_pages(zone);
    
    		}
    
    		/*
    		 * Now scan the zone in the dma->highmem direction, stopping
    		 * at the last zone which needs scanning.
    		 *
    		 * We do this because the page allocator works in the opposite
    		 * direction.  This prevents the page allocator from allocating
    		 * pages behind kswapd's direction of progress, which would
    		 * cause too much scanning of the lower zones.
    		 */
    		for (i = 0; i <= end_zone; i++) {
    			struct zone *zone = pgdat->node_zones + i;
    
			int nr_slab, testorder;
			unsigned long balance_gap;
    
			if (!populated_zone(zone))
				continue;

			if (zone->all_unreclaimable && priority != DEF_PRIORITY)
				continue;
    
    			sc.nr_scanned = 0;
    
			/*
			 * Call soft limit reclaim before calling shrink_zone.
			 */
			nr_soft_scanned = 0;
			nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone,
							order, sc.gfp_mask,
							&nr_soft_scanned);
			sc.nr_reclaimed += nr_soft_reclaimed;
			total_scanned += nr_soft_scanned;
    
			/*
			 * We put equal pressure on every zone, unless
			 * one zone has way too many pages free
			 * already. The "too many pages" is defined
			 * as the high wmark plus a "gap" where the
			 * gap is either the low watermark or 1%
			 * of the zone, whichever is smaller.
			 */
    			balance_gap = min(low_wmark_pages(zone),
    				(zone->present_pages +
    					KSWAPD_ZONE_BALANCE_GAP_RATIO-1) /
    				KSWAPD_ZONE_BALANCE_GAP_RATIO);
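			/*
			 * e.g. with KSWAPD_ZONE_BALANCE_GAP_RATIO at 100, a
			 * zone with 262144 present pages (1GB of 4KiB pages)
			 * gets a gap of roughly 2622 pages unless its low
			 * watermark is smaller still.
			 */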
    
    			/*
    			 * Kswapd reclaims only single pages with compaction
    			 * enabled. Trying too hard to reclaim until contiguous
    			 * free pages have become available can hurt performance
    			 * by evicting too much useful data from memory.
    			 * Do not reclaim more than needed for compaction.
    			 */
    			testorder = order;
    			if (COMPACTION_BUILD && order &&
    					compaction_suitable(zone, order) !=
    						COMPACT_SKIPPED)
    				testorder = 0;
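			/*
			 * With testorder forced to zero, the watermark check
			 * below only requires the zone to be balanced for
			 * order-0; assembling the contiguous block is left to
			 * compaction.
			 */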
    
    
			if ((buffer_heads_over_limit && is_highmem_idx(i)) ||
			    !zone_watermark_ok_safe(zone, testorder,
					high_wmark_pages(zone) + balance_gap,
					end_zone, 0)) {
				shrink_zone(priority, zone, &sc);
    
    				reclaim_state->reclaimed_slab = 0;
    				nr_slab = shrink_slab(&shrink, sc.nr_scanned, lru_pages);
    				sc.nr_reclaimed += reclaim_state->reclaimed_slab;
    				total_scanned += sc.nr_scanned;
    
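				/*
				 * If slab gave nothing back and the zone has
				 * already been scanned well past its
				 * reclaimable pages, treat it as dead so later
				 * passes only give it a light scan.
				 */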
    				if (nr_slab == 0 && !zone_reclaimable(zone))
    					zone->all_unreclaimable = 1;
    			}
    
    
    			/*
    			 * If we've done a decent amount of scanning and
    			 * the reclaim ratio is low, start doing writepage
    			 * even in laptop mode
    			 */
			if (total_scanned > SWAP_CLUSTER_MAX * 2 &&
			    total_scanned > sc.nr_reclaimed + sc.nr_reclaimed / 2)
				sc.may_writepage = 1;
    
			if (zone->all_unreclaimable) {
				if (end_zone && end_zone == i)
					end_zone--;
				continue;
			}

			if (!zone_watermark_ok_safe(zone, testorder,
					high_wmark_pages(zone), end_zone, 0)) {
    				all_zones_ok = 0;
    				/*
    				 * We are still under min water mark.  This
    				 * means that we have a GFP_ATOMIC allocation
    				 * failure risk. Hurry up!
    				 */
    
				if (!zone_watermark_ok_safe(zone, order,
					    min_wmark_pages(zone), end_zone, 0))
    					has_under_min_watermark_zone = 1;
    
    			} else {
				/*
				 * If a zone reaches its high watermark,
				 * consider it to be no longer congested. It's
				 * possible there are dirty pages backed by
				 * congested BDIs but as pressure is relieved,
				 * speculatively avoid congestion waits
				 */
				zone_clear_flag(zone, ZONE_CONGESTED);
				if (i <= *classzone_idx)
					balanced += zone->present_pages;
			}
		}
    
    		if (all_zones_ok || (order && pgdat_balanced(pgdat, balanced, *classzone_idx)))
    
    			break;		/* kswapd: all done */
    		/*
    		 * OK, kswapd is getting into trouble.  Take a nap, then take
    		 * another pass across the zones.
    		 */
    
    		if (total_scanned && (priority < DEF_PRIORITY - 2)) {
    			if (has_under_min_watermark_zone)
    				count_vm_event(KSWAPD_SKIP_CONGESTION_WAIT);
    			else
    				congestion_wait(BLK_RW_ASYNC, HZ/10);
    		}
    
    
    		/*
    		 * We do this so kswapd doesn't build up large priorities for
    		 * example when it is freeing in parallel with allocators. It
    		 * matches the direct reclaim path behaviour in terms of impact
    		 * on zone->*_priority.
    		 */
    
    		if (sc.nr_reclaimed >= SWAP_CLUSTER_MAX)
    
    			break;
    	}
    out:
    
    
    	/*
	 * order-0: All zones must meet high watermark for a balanced node
	 * high-order: Balanced zones must make up at least 25% of the node
	 *             for the node to be balanced
	 */
	if (!(all_zones_ok || (order && pgdat_balanced(pgdat, balanced, *classzone_idx)))) {
    		cond_resched();
    
    		/*
    		 * Fragmentation may mean that the system cannot be
    		 * rebalanced for high-order allocations in all zones.
    		 * At this point, if nr_reclaimed < SWAP_CLUSTER_MAX,
    		 * it means the zones have been fully scanned and are still
    		 * not balanced. For high-order allocations, there is
    		 * little point trying all over again as kswapd may
    		 * infinite loop.
    		 *
    		 * Instead, recheck all watermarks at order-0 as they
    		 * are the most important. If watermarks are ok, kswapd will go
    		 * back to sleep. High-order users can still perform direct
    		 * reclaim if they wish.
    		 */
    		if (sc.nr_reclaimed < SWAP_CLUSTER_MAX)
    			order = sc.order = 0;
    
    
    		goto loop_again;
    	}
    
    
    	/*
    	 * If kswapd was reclaiming at a higher order, it has the option of
    	 * sleeping without all zones being balanced. Before it does, it must
    	 * ensure that the watermarks for order-0 on *all* zones are met and
    	 * that the congestion flags are cleared. The congestion flag must
    	 * be cleared as kswapd is the only mechanism that clears the flag
    	 * and it is potentially going to sleep here.
    	 */
    	if (order) {
    
    		int zones_need_compaction = 1;
    
    
    		for (i = 0; i <= end_zone; i++) {
    			struct zone *zone = pgdat->node_zones + i;
    
    			if (!populated_zone(zone))
    				continue;
    
    			if (zone->all_unreclaimable && priority != DEF_PRIORITY)
    				continue;
    
    
    			/* Would compaction fail due to lack of free memory? */
    
			if (COMPACTION_BUILD &&
			    compaction_suitable(zone, order) == COMPACT_SKIPPED)
				goto loop_again;

    			/* Confirm the zone is balanced for order-0 */
    			if (!zone_watermark_ok(zone, 0,
    					high_wmark_pages(zone), 0, 0)) {
    				order = sc.order = 0;
    				goto loop_again;
    			}
    
    
    			/* Check if the memory needs to be defragmented. */
    			if (zone_watermark_ok(zone, order,
    				    low_wmark_pages(zone), *classzone_idx, 0))
    				zones_need_compaction = 0;
    
    
    			/* If balanced, clear the congested flag */
    			zone_clear_flag(zone, ZONE_CONGESTED);
    		}
    
    
		if (zones_need_compaction)
			compact_pgdat(pgdat, order);
	}

    	/*
    	 * Return the order we were reclaiming at so sleeping_prematurely()
    	 * makes a decision on the order we were last reclaiming at. However,
    	 * if another caller entered the allocator slow path while kswapd
    	 * was awake, order will remain at the higher level
	 */
	*classzone_idx = end_zone;
	return order;
}

    static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx)
    
    {
    	long remaining = 0;
    	DEFINE_WAIT(wait);
    
    	if (freezing(current) || kthread_should_stop())
    		return;
    
    	prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
    
    	/* Try to sleep for a short interval */
    
    	if (!sleeping_prematurely(pgdat, order, remaining, classzone_idx)) {
    
    		remaining = schedule_timeout(HZ/10);
    		finish_wait(&pgdat->kswapd_wait, &wait);
    		prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
    	}
    
    	/*
    	 * After a short sleep, check if it was a premature sleep. If not, then
    	 * go fully to sleep until explicitly woken up.
    	 */
    
    	if (!sleeping_prematurely(pgdat, order, remaining, classzone_idx)) {
    
    		trace_mm_vmscan_kswapd_sleep(pgdat->node_id);
    
    		/*
    		 * vmstat counters are not perfectly accurate and the estimated
    		 * value for counters such as NR_FREE_PAGES can deviate from the
    		 * true value by nr_online_cpus * threshold. To avoid the zone
    		 * watermarks being breached while under pressure, we reduce the
    		 * per-cpu vmstat threshold while kswapd is awake and restore
    		 * them before going back to sleep.
    		 */
    		set_pgdat_percpu_threshold(pgdat, calculate_normal_threshold);
    		schedule();
    		set_pgdat_percpu_threshold(pgdat, calculate_pressure_threshold);
    	} else {
    		if (remaining)
    			count_vm_event(KSWAPD_LOW_WMARK_HIT_QUICKLY);
    		else
    			count_vm_event(KSWAPD_HIGH_WMARK_HIT_QUICKLY);
    	}
    	finish_wait(&pgdat->kswapd_wait, &wait);
    }
    
    
    /*
 * The background pageout daemon, started as a kernel thread
 * from the init process.
     *
     * This basically trickles out pages so that we have _some_
     * free memory available even if there is no other activity
     * that frees anything up. This is needed for things like routing
     * etc, where we otherwise might have all activity going on in
     * asynchronous contexts that cannot page things out.
     *
     * If there are applications that are active memory-allocators
     * (most normal use), this basically shouldn't matter.
     */
    static int kswapd(void *p)
{
	unsigned long order, new_order;
	unsigned long balanced_order;
	int classzone_idx, new_classzone_idx;
	int balanced_classzone_idx;
	pg_data_t *pgdat = (pg_data_t*)p;
	struct task_struct *tsk = current;
    
    	struct reclaim_state reclaim_state = {
    		.reclaimed_slab = 0,
    	};
    
    	const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
    
    
    
    	lockdep_set_current_reclaim_state(GFP_KERNEL);
    
    
    	if (!cpumask_empty(cpumask))
    
		set_cpus_allowed_ptr(tsk, cpumask);
    	current->reclaim_state = &reclaim_state;
    
    	/*
    	 * Tell the memory management that we're a "memory allocator",
    	 * and that if we need more memory we should get access to it
    	 * regardless (see "__alloc_pages()"). "kswapd" should
    	 * never get caught in the normal page freeing logic.
    	 *
    	 * (Kswapd normally doesn't need memory anyway, but sometimes
    	 * you need a small amount of memory in order to be able to
    	 * page out something else, and this flag essentially protects
    	 * us from recursively trying to free more memory as we're
    	 * trying to free the first piece of memory in the first place).
    	 */
    
	tsk->flags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD;
	set_freezable();

	order = new_order = 0;
	balanced_order = 0;
	classzone_idx = new_classzone_idx = pgdat->nr_zones - 1;
	balanced_classzone_idx = classzone_idx;
    
	for ( ; ; ) {
		int ret;

    		/*
    		 * If the last balance_pgdat was unsuccessful it's unlikely a
    		 * new request of a similar or harder type will succeed soon
		 * so consider going to sleep on the basis of the order we
		 * reclaimed at.
		 */
    
    		if (balanced_classzone_idx >= new_classzone_idx &&
    					balanced_order == new_order) {
    
    			new_order = pgdat->kswapd_max_order;
    			new_classzone_idx = pgdat->classzone_idx;
    			pgdat->kswapd_max_order =  0;
    			pgdat->classzone_idx = pgdat->nr_zones - 1;
    		}
    
    
    		if (order < new_order || classzone_idx > new_classzone_idx) {
    
			/*
			 * Don't sleep if someone wants a larger 'order'
			 * allocation or has tighter zone constraints
			 */
			order = new_order;
			classzone_idx = new_classzone_idx;
    
		} else {
			kswapd_try_to_sleep(pgdat, balanced_order,
						balanced_classzone_idx);
			order = pgdat->kswapd_max_order;
			classzone_idx = pgdat->classzone_idx;
			new_order = order;
			new_classzone_idx = classzone_idx;
			pgdat->kswapd_max_order = 0;
			pgdat->classzone_idx = pgdat->nr_zones - 1;
		}

    
    		ret = try_to_freeze();
    		if (kthread_should_stop())
    			break;
    
    		/*
    		 * We can speed up thawing tasks if we don't call balance_pgdat
    		 * after returning from the refrigerator
    		 */
    
    		if (!ret) {
    			trace_mm_vmscan_kswapd_wake(pgdat->node_id, order);
    
			balanced_classzone_idx = classzone_idx;
			balanced_order = balance_pgdat(pgdat, order,
						&balanced_classzone_idx);
		}
    	}
    	return 0;
    }
    
    /*
     * A zone is low on free memory, so wake its kswapd task to service it.
     */
    
    void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx)
    
    {
    	pg_data_t *pgdat;
    
    
    	if (!populated_zone(zone))
    
		return;
    
    
    	if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
    
		return;
	pgdat = zone->zone_pgdat;
    	if (pgdat->kswapd_max_order < order) {
    
    		pgdat->kswapd_max_order = order;
    
    		pgdat->classzone_idx = min(pgdat->classzone_idx, classzone_idx);
    	}
    
    	if (!waitqueue_active(&pgdat->kswapd_wait))
    
		return;
    
    	if (zone_watermark_ok_safe(zone, order, low_wmark_pages(zone), 0, 0))
    		return;
    
    	trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, zone_idx(zone), order);
    
	wake_up_interruptible(&pgdat->kswapd_wait);
}

    /*
 * The reclaimable count is only approximate. Pages that are less readily
 * reclaimable include:
 * - mlocked pages, which will be moved to the unevictable list when encountered
 * - mapped pages, which may require several passes to be reclaimed
 * - dirty pages, which are not "instantly" reclaimable
     */
unsigned long global_reclaimable_pages(void)
{
	int nr;
    
    	nr = global_page_state(NR_ACTIVE_FILE) +
    	     global_page_state(NR_INACTIVE_FILE);
    
    	if (nr_swap_pages > 0)
    		nr += global_page_state(NR_ACTIVE_ANON) +
    		      global_page_state(NR_INACTIVE_ANON);
    
    	return nr;
    }
    
    unsigned long zone_reclaimable_pages(struct zone *zone)
    {
    	int nr;
    
    	nr = zone_page_state(zone, NR_ACTIVE_FILE) +
    	     zone_page_state(zone, NR_INACTIVE_FILE);
    
    	if (nr_swap_pages > 0)
    		nr += zone_page_state(zone, NR_ACTIVE_ANON) +
    		      zone_page_state(zone, NR_INACTIVE_ANON);
    
	return nr;
}

    #ifdef CONFIG_HIBERNATION
    
/*
 * Try to free `nr_to_reclaim' of memory, system-wide, and return the number of
 * freed pages.
 *
 * Rather than trying to age LRUs the aim is to preserve the overall
 * LRU order by reclaiming preferentially
 * inactive > active > active referenced > active mapped
 */
    
    unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
    
    {
    
	struct reclaim_state reclaim_state;
	struct scan_control sc = {
		.gfp_mask = GFP_HIGHUSER_MOVABLE,
		.may_swap = 1,
		.may_unmap = 1,
		.may_writepage = 1,
		.nr_to_reclaim = nr_to_reclaim,
		.hibernation_mode = 1,
		.order = 0,
	};
    
    	struct shrink_control shrink = {
    		.gfp_mask = sc.gfp_mask,
    	};