vmscan.c

	switch (compaction_suitable(zone, sc->order)) {
	case COMPACT_PARTIAL:
    	case COMPACT_CONTINUE:
    		return false;
    	default:
    		return true;
    	}
    }
    
    
    static void shrink_zone(struct zone *zone, struct scan_control *sc)
    
    {
    
    	unsigned long nr_reclaimed, nr_scanned;
    
    
    
    	do {
    		struct mem_cgroup *root = sc->target_mem_cgroup;
    		struct mem_cgroup_reclaim_cookie reclaim = {
    			.zone = zone,
    			.priority = sc->priority,
    		};
    		struct mem_cgroup *memcg;
    
    		nr_reclaimed = sc->nr_reclaimed;
    		nr_scanned = sc->nr_scanned;
    
    
    
    		memcg = mem_cgroup_iter(root, NULL, &reclaim);
    		do {
    			struct lruvec *lruvec;
    
    			lruvec = mem_cgroup_zone_lruvec(zone, memcg);
    
			shrink_lruvec(lruvec, sc);

			/*
			 * Direct reclaim and kswapd have to scan all memory
			 * cgroups to fulfill the overall scan target for the
			 * zone.
			 *
			 * Limit reclaim, on the other hand, only cares about
			 * nr_to_reclaim pages to be reclaimed and it will
			 * retry with decreasing priority if one round over the
			 * whole hierarchy is not sufficient.
			 */
    			if (!global_reclaim(sc) &&
    					sc->nr_reclaimed >= sc->nr_to_reclaim) {
    
    				mem_cgroup_iter_break(root, memcg);
    				break;
    			}
    			memcg = mem_cgroup_iter(root, memcg, &reclaim);
    		} while (memcg);
    
    
    		vmpressure(sc->gfp_mask, sc->target_mem_cgroup,
    			   sc->nr_scanned - nr_scanned,
    			   sc->nr_reclaimed - nr_reclaimed);
    
    
    	} while (should_continue_reclaim(zone, sc->nr_reclaimed - nr_reclaimed,
					 sc->nr_scanned - nr_scanned, sc));
}

    /* Returns true if compaction should go ahead for a high-order request */
    static inline bool compaction_ready(struct zone *zone, struct scan_control *sc)
    {
    	unsigned long balance_gap, watermark;
    	bool watermark_ok;
    
    	/* Do not consider compaction for orders reclaim is meant to satisfy */
    	if (sc->order <= PAGE_ALLOC_COSTLY_ORDER)
    		return false;
    
    	/*
    	 * Compaction takes time to run and there are potentially other
    	 * callers using the pages just freed. Continue reclaiming until
    	 * there is a buffer of free pages available to give compaction
    	 * a reasonable chance of completing and allocating the page
    	 */
	balance_gap = min(low_wmark_pages(zone),
		(zone->managed_pages + KSWAPD_ZONE_BALANCE_GAP_RATIO-1) /
			KSWAPD_ZONE_BALANCE_GAP_RATIO);
    	watermark = high_wmark_pages(zone) + balance_gap + (2UL << sc->order);
    	watermark_ok = zone_watermark_ok_safe(zone, 0, watermark, 0, 0);
    
    	/*
    	 * If compaction is deferred, reclaim up to a point where
    	 * compaction will have a chance of success when re-enabled
    	 */
    
	if (compaction_deferred(zone, sc->order))
		return watermark_ok;
    
    	/* If compaction is not ready to start, keep reclaiming */
    	if (!compaction_suitable(zone, sc->order))
    		return false;
    
    	return watermark_ok;
    }
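
/*
 * Illustrative sketch, not part of the original source: the reclaim buffer
 * compaction_ready() computes, rewritten as a standalone helper so the
 * arithmetic is easy to follow. The zone size and watermark values below are
 * assumed, and KSWAPD_ZONE_BALANCE_GAP_RATIO is taken to be 100 as in this
 * kernel's headers.
 */
static inline unsigned long example_compaction_buffer(unsigned long managed_pages,
						      unsigned long low_wmark,
						      unsigned long high_wmark,
						      int order)
{
	unsigned long ratio = 100;	/* KSWAPD_ZONE_BALANCE_GAP_RATIO */
	unsigned long gap = (managed_pages + ratio - 1) / ratio;

	if (low_wmark < gap)
		gap = low_wmark;	/* min(low wmark, ~1% of the zone) */

	/*
	 * e.g. managed_pages = 262144 (1GB of 4K pages), low_wmark = 1280,
	 * high_wmark = 1536, order = 9: 1536 + 1280 + 1024 = 3840 pages.
	 */
	return high_wmark + gap + (2UL << order);
}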
    
    
    /*
     * This is the direct reclaim path, for page-allocating processes.  We only
     * try to reclaim pages from zones which will satisfy the caller's allocation
 * request.
 *
 * We reclaim from a zone even if that zone is over high_wmark_pages(zone).
 * Because:
 * a) The caller may be trying to free *extra* pages to satisfy a higher-order
 *    allocation or
 * b) The target zone may be at high_wmark_pages(zone) but the lower zones
 *    must go *over* high_wmark_pages(zone) to satisfy the `incremental min'
 *    zone defense algorithm.
 *
 * If a zone is deemed to be full of pinned pages then just give it a light
 * scan then give up on it.
 *
 * This function returns true if a zone is being reclaimed for a costly
 * high-order allocation and compaction is ready to begin. This indicates to
 * the caller that it should consider retrying the allocation instead of
 * further reclaim.
 */
static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
{
    
	struct zoneref *z;
	struct zone *zone;
	unsigned long nr_soft_reclaimed;
	unsigned long nr_soft_scanned;
	bool aborted_reclaim = false;
    
    	/*
    	 * If the number of buffer_heads in the machine exceeds the maximum
    	 * allowed level, force direct reclaim to scan the highmem zone as
    	 * highmem pages could be pinning lowmem pages storing buffer_heads
    	 */
    	if (buffer_heads_over_limit)
    		sc->gfp_mask |= __GFP_HIGHMEM;
    
    
    	for_each_zone_zonelist_nodemask(zone, z, zonelist,
    					gfp_zone(sc->gfp_mask), sc->nodemask) {
    
		if (!populated_zone(zone))
			continue;
		/*
		 * Take care memory controller reclaiming has small influence
		 * to global LRU.
		 */
		if (global_reclaim(sc)) {
			if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
				continue;
			if (zone->all_unreclaimable &&
					sc->priority != DEF_PRIORITY)
				continue;	/* Let kswapd poll it */
			if (IS_ENABLED(CONFIG_COMPACTION)) {
				/*
				 * If we already have plenty of memory free for
				 * compaction in this zone, don't free any more.
				 * Even though compaction is invoked for any
				 * non-zero order, only frequent costly order
				 * reclamation is disruptive enough to become a
				 * noticeable problem, like transparent huge
				 * page allocations.
				 */
				if (compaction_ready(zone, sc)) {
					aborted_reclaim = true;
					continue;
				}
			}
    			/*
    			 * This steals pages from memory cgroups over softlimit
    			 * and returns the number of reclaimed pages and
    			 * scanned pages. This works for global memory pressure
    			 * and balancing, not for a memcg's limit.
    			 */
    			nr_soft_scanned = 0;
    			nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone,
    						sc->order, sc->gfp_mask,
    						&nr_soft_scanned);
    			sc->nr_reclaimed += nr_soft_reclaimed;
    			sc->nr_scanned += nr_soft_scanned;
			/* need some check for avoid more shrink_zone() */
		}
		shrink_zone(zone, sc);
	}

	return aborted_reclaim;
}
    
    static bool zone_reclaimable(struct zone *zone)
    {
    	return zone->pages_scanned < zone_reclaimable_pages(zone) * 6;
    }
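
/*
 * Illustrative sketch, not part of the original source: zone_reclaimable()
 * above treats a zone as still worth scanning until it has been scanned six
 * times over without the scan counter being reset by successful reclaim.
 * With an assumed 100000 reclaimable pages, up to 600000 scanned pages are
 * tolerated before the zone is written off as unreclaimable.
 */
static inline bool example_zone_written_off(unsigned long pages_scanned,
					    unsigned long reclaimable_pages)
{
	/* mirrors: !(zone->pages_scanned < zone_reclaimable_pages(zone) * 6) */
	return pages_scanned >= reclaimable_pages * 6;
}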
    
    
    /* All zones in zonelist are unreclaimable? */
    
    static bool all_unreclaimable(struct zonelist *zonelist,
    		struct scan_control *sc)
    {
    	struct zoneref *z;
    	struct zone *zone;
    
    	for_each_zone_zonelist_nodemask(zone, z, zonelist,
    			gfp_zone(sc->gfp_mask), sc->nodemask) {
    		if (!populated_zone(zone))
    			continue;
    		if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
    			continue;
    
    		if (!zone->all_unreclaimable)
			return false;
	}

	return true;
}
    
    /*
     * This is the main entry point to direct page reclaim.
     *
     * If a full scan of the inactive list fails to free enough memory then we
     * are "out of memory" and something needs to be killed.
     *
     * If the caller is !__GFP_FS then the probability of a failure is reasonably
 * high - the zone may be full of dirty or under-writeback pages, which this
 * caller can't do much about.  We kick the writeback threads and take explicit
 * naps in the hope that some of these pages can be written.  But if the
 * allocating task holds filesystem locks which prevent writeout this might not
 * work, and the allocation attempt will fail.
 *
 * returns:	0, if no pages reclaimed
 * 		else, the number of pages reclaimed
 */
    
static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
					struct scan_control *sc,
					struct shrink_control *shrink)
{
    
	unsigned long total_scanned = 0;
	struct reclaim_state *reclaim_state = current->reclaim_state;
	struct zoneref *z;
	struct zone *zone;
	unsigned long writeback_threshold;
	bool aborted_reclaim;
    
	delayacct_freepages_start();

	if (global_reclaim(sc))
		count_vm_event(ALLOCSTALL);

	do {
    		vmpressure_prio(sc->gfp_mask, sc->target_mem_cgroup,
    				sc->priority);
    
    		sc->nr_scanned = 0;
    
    		aborted_reclaim = shrink_zones(zonelist, sc);
    
    		/*
    		 * Don't shrink slabs when reclaiming memory from
    		 * over limit cgroups
		 */
		if (global_reclaim(sc)) {
			unsigned long lru_pages = 0;
    
    			for_each_zone_zonelist(zone, z, zonelist,
    					gfp_zone(sc->gfp_mask)) {
    
    				if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
    					continue;
    
    				lru_pages += zone_reclaimable_pages(zone);
    			}
    
    
			shrink_slab(shrink, sc->nr_scanned, lru_pages);
			if (reclaim_state) {
				sc->nr_reclaimed += reclaim_state->reclaimed_slab;
				reclaim_state->reclaimed_slab = 0;
			}
    		}
    
    		total_scanned += sc->nr_scanned;
    
		if (sc->nr_reclaimed >= sc->nr_to_reclaim)
			goto out;
    
    
    		/*
    		 * If we're getting trouble reclaiming, start doing
    		 * writepage even in laptop mode.
    		 */
    		if (sc->priority < DEF_PRIORITY - 2)
    			sc->may_writepage = 1;
    
    
    		/*
    		 * Try to write back as many pages as we just scanned.  This
    		 * tends to cause slow streaming writers to write data to the
    		 * disk smoothly, at the dirtying rate, which is nice.   But
    		 * that's undesirable in laptop mode, where we *want* lumpy
    		 * writeout.  So in laptop mode, write out the whole world.
    		 */
    
    		writeback_threshold = sc->nr_to_reclaim + sc->nr_to_reclaim / 2;
    		if (total_scanned > writeback_threshold) {
    
    			wakeup_flusher_threads(laptop_mode ? 0 : total_scanned,
    						WB_REASON_TRY_TO_FREE_PAGES);
    
    			sc->may_writepage = 1;
    
    		}
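
		/*
		 * Illustrative note, not part of the original source: with the
		 * assumed default nr_to_reclaim of SWAP_CLUSTER_MAX (32 pages),
		 * writeback_threshold above is 32 + 32/2 = 48, so the flusher
		 * threads are woken and writepage is enabled once more than 48
		 * pages have been scanned in total.
		 */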
    
    		/* Take a nap, wait for some writeback to complete */
    
		if (!sc->hibernation_mode && sc->nr_scanned &&
		    sc->priority < DEF_PRIORITY - 2) {
			struct zone *preferred_zone;

			first_zones_zonelist(zonelist, gfp_zone(sc->gfp_mask),
						&cpuset_current_mems_allowed,
						&preferred_zone);
			wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/10);
    		}
    
	} while (--sc->priority >= 0);

out:
	delayacct_freepages_end();

	if (sc->nr_reclaimed)
		return sc->nr_reclaimed;

	/*
	 * As hibernation is going on, kswapd is frozen so that it can't mark
	 * the zone as all_unreclaimable. Thus we bypass the all_unreclaimable
	 * check.
	 */
	if (oom_killer_disabled)
		return 0;

	/* Aborted reclaim to try compaction? don't OOM, then */
	if (aborted_reclaim)
		return 1;

	/* top priority shrink_zones still had more to do? don't OOM, then */
	if (global_reclaim(sc) && !all_unreclaimable(zonelist, sc))
		return 1;

	return 0;
}

    static bool pfmemalloc_watermark_ok(pg_data_t *pgdat)
    {
    	struct zone *zone;
    	unsigned long pfmemalloc_reserve = 0;
    	unsigned long free_pages = 0;
    	int i;
    	bool wmark_ok;
    
    	for (i = 0; i <= ZONE_NORMAL; i++) {
    		zone = &pgdat->node_zones[i];
    		pfmemalloc_reserve += min_wmark_pages(zone);
    		free_pages += zone_page_state(zone, NR_FREE_PAGES);
    	}
    
    	wmark_ok = free_pages > pfmemalloc_reserve / 2;
    
    	/* kswapd must be awake if processes are being throttled */
    	if (!wmark_ok && waitqueue_active(&pgdat->kswapd_wait)) {
    		pgdat->classzone_idx = min(pgdat->classzone_idx,
    						(enum zone_type)ZONE_NORMAL);
    		wake_up_interruptible(&pgdat->kswapd_wait);
    	}
    
    	return wmark_ok;
    }
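
/*
 * Illustrative sketch, not part of the original source: the throttling test
 * in pfmemalloc_watermark_ok() with assumed numbers. If the min watermarks of
 * the DMA, DMA32 and NORMAL zones are 128, 512 and 1024 pages, the reserve is
 * 1664 pages and direct reclaimers are throttled once the combined free pages
 * of those zones drop to 1664 / 2 = 832 or below.
 */
static inline bool example_pfmemalloc_throttled(unsigned long free_pages,
						unsigned long pfmemalloc_reserve)
{
	/* mirrors: wmark_ok = free_pages > pfmemalloc_reserve / 2 */
	return !(free_pages > pfmemalloc_reserve / 2);
}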
    
    /*
     * Throttle direct reclaimers if backing storage is backed by the network
     * and the PFMEMALLOC reserve for the preferred node is getting dangerously
 * depleted. kswapd will continue to make progress and wake the processes
 * when the low watermark is reached.
     *
     * Returns true if a fatal signal was delivered during throttling. If this
 * happens, the page allocator should not consider triggering the OOM killer.
 */
static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist,
					nodemask_t *nodemask)
    {
    	struct zone *zone;
    	int high_zoneidx = gfp_zone(gfp_mask);
    	pg_data_t *pgdat;
    
    	/*
    	 * Kernel threads should not be throttled as they may be indirectly
    	 * responsible for cleaning pages necessary for reclaim to make forward
    	 * progress. kjournald for example may enter direct reclaim while
    	 * committing a transaction where throttling it could forcing other
    	 * processes to block on log_wait_commit().
    	 */
	if (current->flags & PF_KTHREAD)
		goto out;
    
    	/*
    	 * If a fatal signal is pending, this process should not throttle.
    	 * It should return quickly so it can exit and free its memory
    	 */
    	if (fatal_signal_pending(current))
    		goto out;
    
    
    	/* Check if the pfmemalloc reserves are ok */
    	first_zones_zonelist(zonelist, high_zoneidx, NULL, &zone);
    	pgdat = zone->zone_pgdat;
	if (pfmemalloc_watermark_ok(pgdat))
		goto out;

    	/* Account for the throttling */
    	count_vm_event(PGSCAN_DIRECT_THROTTLE);
    
    
    	/*
    	 * If the caller cannot enter the filesystem, it's possible that it
    	 * is due to the caller holding an FS lock or performing a journal
    	 * transaction in the case of a filesystem like ext[3|4]. In this case,
    	 * it is not safe to block on pfmemalloc_wait as kswapd could be
    	 * blocked waiting on the same lock. Instead, throttle for up to a
    	 * second before continuing.
    	 */
    	if (!(gfp_mask & __GFP_FS)) {
    		wait_event_interruptible_timeout(pgdat->pfmemalloc_wait,
			pfmemalloc_watermark_ok(pgdat), HZ);

		goto check_pending;
	}
    
    	/* Throttle until kswapd wakes the process */
    	wait_event_killable(zone->zone_pgdat->pfmemalloc_wait,
    		pfmemalloc_watermark_ok(pgdat));
    
    
    check_pending:
    	if (fatal_signal_pending(current))
    		return true;
    
    out:
	return false;
}

unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
				gfp_t gfp_mask, nodemask_t *nodemask)
{
	unsigned long nr_reclaimed;
	struct scan_control sc = {
		.gfp_mask = (gfp_mask = memalloc_noio_flags(gfp_mask)),
		.may_writepage = !laptop_mode,
		.nr_to_reclaim = SWAP_CLUSTER_MAX,
		.may_unmap = 1,
		.may_swap = 1,
		.order = order,
		.priority = DEF_PRIORITY,
		.target_mem_cgroup = NULL,
		.nodemask = nodemask,
	};
	struct shrink_control shrink = {
		.gfp_mask = sc.gfp_mask,
	};

	/*
	 * Do not enter reclaim if fatal signal was delivered while throttled.
	 * 1 is returned so that the page allocator does not OOM kill at this
	 * point.
	 */
	if (throttle_direct_reclaim(gfp_mask, zonelist, nodemask))
		return 1;
    	trace_mm_vmscan_direct_reclaim_begin(order,
    				sc.may_writepage,
    				gfp_mask);
    
    
    	nr_reclaimed = do_try_to_free_pages(zonelist, &sc, &shrink);
    
    
    	trace_mm_vmscan_direct_reclaim_end(nr_reclaimed);
    
	return nr_reclaimed;
}

    #ifdef CONFIG_MEMCG
    
unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg,
						gfp_t gfp_mask, bool noswap,
						struct zone *zone,
						unsigned long *nr_scanned)
{
	struct scan_control sc = {
		.nr_scanned = 0,
		.nr_to_reclaim = SWAP_CLUSTER_MAX,
		.may_writepage = !laptop_mode,
		.may_unmap = 1,
		.may_swap = !noswap,
		.order = 0,
		.priority = 0,
		.target_mem_cgroup = memcg,
	};
	struct lruvec *lruvec = mem_cgroup_zone_lruvec(zone, memcg);
    
    	sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
    			(GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK);
    
    	trace_mm_vmscan_memcg_softlimit_reclaim_begin(sc.order,
    
    						      sc.may_writepage,
    						      sc.gfp_mask);
    
    
    	/*
    	 * NOTE: Although we can get the priority field, using it
    	 * here is not a good idea, since it limits the pages we can scan.
    	 * if we don't reclaim here, the shrink_zone from balance_pgdat
    	 * will pick up pages from other mem cgroup's as well. We hack
    	 * the priority and make it zero.
    	 */
    
    	shrink_lruvec(lruvec, &sc);
    
    
    	trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed);
    
    
	*nr_scanned = sc.nr_scanned;
	return sc.nr_reclaimed;
}

unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
					   gfp_t gfp_mask,
					   bool noswap)
{
	struct zonelist *zonelist;
	unsigned long nr_reclaimed;
	int nid;
	struct scan_control sc = {
    		.may_writepage = !laptop_mode,
    
    		.may_unmap = 1,
    
    		.may_swap = !noswap,
    
		.nr_to_reclaim = SWAP_CLUSTER_MAX,
		.order = 0,
		.priority = DEF_PRIORITY,
		.target_mem_cgroup = memcg,
		.nodemask = NULL, /* we don't care about the placement */
    
    		.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
    				(GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK),
    	};
    	struct shrink_control shrink = {
		.gfp_mask = sc.gfp_mask,
	};

    	/*
    	 * Unlike direct reclaim via alloc_pages(), memcg's reclaim doesn't
    	 * take care of from where we get pages. So the node where we start the
    	 * scan does not need to be the current node.
    	 */
    
    	nid = mem_cgroup_select_victim_node(memcg);
    
    
    	zonelist = NODE_DATA(nid)->node_zonelists;
    
    
    	trace_mm_vmscan_memcg_reclaim_begin(0,
    					    sc.may_writepage,
    					    sc.gfp_mask);
    
    
    	nr_reclaimed = do_try_to_free_pages(zonelist, &sc, &shrink);
    
    
    	trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed);
    
	return nr_reclaimed;
}
#endif

static void age_active_anon(struct zone *zone, struct scan_control *sc)
{
	struct mem_cgroup *memcg;

    	if (!total_swap_pages)
    		return;
    
    	memcg = mem_cgroup_iter(NULL, NULL, NULL);
    	do {
    
    		struct lruvec *lruvec = mem_cgroup_zone_lruvec(zone, memcg);
    
		if (inactive_anon_is_low(lruvec))
			shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
					   sc, LRU_ACTIVE_ANON);

    		memcg = mem_cgroup_iter(NULL, memcg, NULL);
	} while (memcg);
}

    static bool zone_balanced(struct zone *zone, int order,
    			  unsigned long balance_gap, int classzone_idx)
    {
    	if (!zone_watermark_ok_safe(zone, order, high_wmark_pages(zone) +
    				    balance_gap, classzone_idx, 0))
    		return false;
    
    
	if (IS_ENABLED(CONFIG_COMPACTION) && order &&
	    !compaction_suitable(zone, order))
		return false;

	return true;
}

/*
     * pgdat_balanced() is used when checking if a node is balanced.
     *
     * For order-0, all zones must be balanced!
     *
     * For high-order allocations only zones that meet watermarks and are in a
 * zone allowed by the caller's classzone_idx are added to balanced_pages. The
     * total of balanced pages must be at least 25% of the zones allowed by
     * classzone_idx for the node to be considered balanced. Forcing all zones to
     * be balanced for high orders can cause excessive reclaim when there are
 * imbalanced zones.
 *
     * The choice of 25% is due to
     *   o a 16M DMA zone that is balanced will not balance a zone on any
     *     reasonable sized machine
 *   o On all other machines, the top zone must be at least a reasonable
 *     percentage of the middle zones. For example, on 32-bit x86, highmem
 *     would need to be at least 256M for it to balance a whole node.
     *     Similarly, on x86-64 the Normal zone would need to be at least 1G
     *     to balance a node on its own. These seemed like reasonable ratios.
     */
    
static bool pgdat_balanced(pg_data_t *pgdat, int order, int classzone_idx)
{
	int i;
	unsigned long managed_pages = 0;
	unsigned long balanced_pages = 0;

	/* Check the watermark levels */
	for (i = 0; i <= classzone_idx; i++) {
		struct zone *zone = pgdat->node_zones + i;

		if (!populated_zone(zone))
			continue;

		managed_pages += zone->managed_pages;
    
    
    		/*
    		 * A special case here:
    		 *
    		 * balance_pgdat() skips over all_unreclaimable after
    		 * DEF_PRIORITY. Effectively, it considers them balanced so
    		 * they must be considered balanced here as well!
    		 */
    		if (zone->all_unreclaimable) {
    
    			balanced_pages += zone->managed_pages;
    
    			continue;
    		}
    
		if (zone_balanced(zone, order, 0, i))
			balanced_pages += zone->managed_pages;
		else if (!order)
			return false;
	}

	if (order)
		return balanced_pages >= (managed_pages >> 2);
	else
		return true;
}
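
/*
 * Illustrative sketch, not part of the original source: the 25% rule above
 * with assumed zone sizes. On a node with a 4096-page DMA zone, a
 * 221184-page DMA32 zone and a 786432-page Normal zone, managed_pages is
 * 1011712, so a high-order wakeup considers the node balanced once zones
 * holding at least 1011712 >> 2 = 252928 pages meet their watermarks: the
 * Normal zone alone is enough, while DMA plus DMA32 (225280 pages) is not.
 */
static inline bool example_high_order_balanced(unsigned long balanced_pages,
					       unsigned long managed_pages)
{
	/* mirrors: balanced_pages >= (managed_pages >> 2) */
	return balanced_pages >= (managed_pages >> 2);
}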
    
    /*
     * Prepare kswapd for sleeping. This verifies that there are no processes
     * waiting in throttle_direct_reclaim() and that watermarks have been met.
     *
     * Returns true if kswapd is ready to sleep
     */
static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining,
					int classzone_idx)
{
    	/* If a direct reclaimer woke kswapd within HZ/10, it's premature */
	if (remaining)
		return false;
    
    	/*
    	 * There is a potential race between when kswapd checks its watermarks
    	 * and a process gets throttled. There is also a potential race if
	 * processes get throttled, kswapd wakes, a large process exits thereby
    	 * balancing the zones that causes kswapd to miss a wakeup. If kswapd
    	 * is going to sleep, no process should be sleeping on pfmemalloc_wait
    	 * so wake them now if necessary. If necessary, processes will wake
    	 * kswapd and get throttled again
    	 */
    	if (waitqueue_active(&pgdat->pfmemalloc_wait)) {
    		wake_up(&pgdat->pfmemalloc_wait);
    		return false;
    	}
    
	return pgdat_balanced(pgdat, order, classzone_idx);
}

    /*
     * kswapd shrinks the zone by the number of pages required to reach
     * the high watermark.
    
     *
     * Returns true if kswapd scanned at least the requested number of pages to
     * reclaim. This is used to determine if the scanning priority needs to be
 * raised.
 */
static bool kswapd_shrink_zone(struct zone *zone,
			       struct scan_control *sc,
			       unsigned long lru_pages,
			       unsigned long *nr_attempted)
    
    {
    	unsigned long nr_slab;
    	struct reclaim_state *reclaim_state = current->reclaim_state;
    	struct shrink_control shrink = {
    		.gfp_mask = sc->gfp_mask,
    	};
    
    	/* Reclaim above the high watermark. */
    	sc->nr_to_reclaim = max(SWAP_CLUSTER_MAX, high_wmark_pages(zone));
    	shrink_zone(zone, sc);
    
    	reclaim_state->reclaimed_slab = 0;
    	nr_slab = shrink_slab(&shrink, sc->nr_scanned, lru_pages);
    	sc->nr_reclaimed += reclaim_state->reclaimed_slab;
    
    
    	/* Account for the number of pages attempted to reclaim */
    	*nr_attempted += sc->nr_to_reclaim;
    
    
    	if (nr_slab == 0 && !zone_reclaimable(zone))
    		zone->all_unreclaimable = 1;
    
    
	return sc->nr_scanned >= sc->nr_to_reclaim;
}

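/*
 * Illustrative note, not part of the original source: kswapd_shrink_zone()
 * above targets max(SWAP_CLUSTER_MAX, high_wmark_pages(zone)) pages per pass.
 * For an assumed zone with a high watermark of 1536 pages that is a 1536-page
 * target; only a zone whose high watermark is below SWAP_CLUSTER_MAX (32
 * pages) falls back to the 32-page minimum. The return value compares pages
 * scanned against that target so balance_pgdat() can decide whether to drop
 * to a harder scanning priority.
 */
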
    /*
 * For kswapd, balance_pgdat() will work across all this node's zones until
 * they are all at high_wmark_pages(zone).
 *
 * Returns the final order kswapd was reclaiming at
 *
     * There is special handling here for zones which are full of pinned pages.
     * This can happen if the pages are all mlocked, or if they are all used by
     * device drivers (say, ZONE_DMA).  Or if they are all in use by hugetlb.
     * What we do is to detect the case where all pages in the zone have been
     * scanned twice and there has been zero successful reclaim.  Mark the zone as
     * dead and from now on, only perform a short scan.  Basically we're polling
     * the zone for when the problem goes away.
     *
     * kswapd scans the zones in the highmem->normal->dma direction.  It skips
    
     * zones which have free_pages > high_wmark_pages(zone), but once a zone is
     * found to have free_pages <= high_wmark_pages(zone), we scan that zone and the
     * lower zones regardless of the number of free pages in the lower zones. This
     * interoperates with the page allocator fallback scheme to ensure that aging
 * of pages is balanced across the zones.
 */
static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
							int *classzone_idx)
{
    	int i;
    
    	int end_zone = 0;	/* Inclusive.  0 = ZONE_DMA */
    
    	unsigned long nr_soft_reclaimed;
    	unsigned long nr_soft_scanned;
    
    	struct scan_control sc = {
    		.gfp_mask = GFP_KERNEL,
    
    		.priority = DEF_PRIORITY,
    
    		.may_unmap = 1,
    
    		.may_swap = 1,
    
		.may_writepage = !laptop_mode,
		.order = order,
		.target_mem_cgroup = NULL,
	};
	count_vm_event(PAGEOUTRUN);

	do {
		unsigned long lru_pages = 0;
		unsigned long nr_attempted = 0;
		bool raise_priority = true;
		bool pgdat_needs_compaction = (order > 0);
    
    
		sc.nr_reclaimed = 0;

    		/*
    		 * Scan in the highmem->dma direction for the highest
    		 * zone which needs scanning
    		 */
    		for (i = pgdat->nr_zones - 1; i >= 0; i--) {
			struct zone *zone = pgdat->node_zones + i;

			if (!populated_zone(zone))
				continue;

			if (zone->all_unreclaimable &&
			    sc.priority != DEF_PRIORITY)
				continue;

    			/*
    			 * Do some background aging of the anon list, to give
    			 * pages a chance to be referenced before reclaiming.
    			 */
    
    			age_active_anon(zone, &sc);
    
    			/*
    			 * If the number of buffer_heads in the machine
    			 * exceeds the maximum allowed level and this node
    			 * has a highmem zone, force kswapd to reclaim from
    			 * it to relieve lowmem pressure.
    			 */
    			if (buffer_heads_over_limit && is_highmem_idx(i)) {
    				end_zone = i;
    				break;
    			}
    
    
			if (!zone_balanced(zone, order, 0, 0)) {
				end_zone = i;
				break;
			} else {
				/*
				 * If balanced, clear the dirty and congested
				 * flags
				 */
				zone_clear_flag(zone, ZONE_CONGESTED);
				zone_clear_flag(zone, ZONE_TAIL_LRU_DIRTY);
			}
		}

    		for (i = 0; i <= end_zone; i++) {
			struct zone *zone = pgdat->node_zones + i;

			if (!populated_zone(zone))
				continue;

			lru_pages += zone_reclaimable_pages(zone);
    
    
    			/*
    			 * If any zone is currently balanced then kswapd will
    			 * not call compaction as it is expected that the
    			 * necessary pages are already available.
    			 */
    			if (pgdat_needs_compaction &&
    					zone_watermark_ok(zone, order,
    						low_wmark_pages(zone),
    						*classzone_idx, 0))
    				pgdat_needs_compaction = false;
    
    		}
    
    		/*
    		 * Now scan the zone in the dma->highmem direction, stopping
    		 * at the last zone which needs scanning.
    		 *
    		 * We do this because the page allocator works in the opposite
    		 * direction.  This prevents the page allocator from allocating
    		 * pages behind kswapd's direction of progress, which would
    		 * cause too much scanning of the lower zones.
    		 */
    		for (i = 0; i <= end_zone; i++) {
			struct zone *zone = pgdat->node_zones + i;
			int testorder;
			unsigned long balance_gap;

			if (!populated_zone(zone))
				continue;

			if (zone->all_unreclaimable &&
			    sc.priority != DEF_PRIORITY)
				continue;
    
			sc.nr_scanned = 0;

			nr_soft_scanned = 0;
			/*
			 * Call soft limit reclaim before calling shrink_zone.
			 */
    			nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone,
    							order, sc.gfp_mask,
    							&nr_soft_scanned);
			sc.nr_reclaimed += nr_soft_reclaimed;

			/*
			 * We put equal pressure on every zone, unless
			 * one zone has way too many pages free
			 * already. The "too many pages" is defined
			 * as the high wmark plus a "gap" where the
			 * gap is either the low watermark or 1%
			 * of the zone, whichever is smaller.
			 */
			balance_gap = min(low_wmark_pages(zone),
				(zone->managed_pages +
					KSWAPD_ZONE_BALANCE_GAP_RATIO-1) /
				KSWAPD_ZONE_BALANCE_GAP_RATIO);
    
    			/*
    			 * Kswapd reclaims only single pages with compaction
    			 * enabled. Trying too hard to reclaim until contiguous
    			 * free pages have become available can hurt performance
    			 * by evicting too much useful data from memory.
    			 * Do not reclaim more than needed for compaction.
    			 */
    			testorder = order;
    
			if (IS_ENABLED(CONFIG_COMPACTION) && order &&
    					compaction_suitable(zone, order) !=
    						COMPACT_SKIPPED)
    				testorder = 0;
    
    
			if ((buffer_heads_over_limit && is_highmem_idx(i)) ||
			    !zone_balanced(zone, testorder,
					   balance_gap, end_zone)) {
    				/*
    				 * There should be no need to raise the
    				 * scanning priority if enough pages are
				 * already being scanned that the high
    				 * watermark would be met at 100% efficiency.
    				 */
    
				if (kswapd_shrink_zone(zone, &sc, lru_pages,
						       &nr_attempted))
    					raise_priority = false;
			}

			/*
			 * If we're getting trouble reclaiming, start doing
			 * writepage even in laptop mode.
			 */
    
			if (sc.priority < DEF_PRIORITY - 2)
				sc.may_writepage = 1;
    
    			if (zone->all_unreclaimable) {
    				if (end_zone && end_zone == i)
					end_zone--;
				continue;
			}

			if (zone_balanced(zone, testorder, 0, end_zone)) {
				/*
				 * If a zone reaches its high watermark,
				 * consider it to be no longer congested. It's
				 * possible there are dirty pages backed by
				 * congested BDIs but as pressure is relieved,
				 * speculatively avoid congestion waits
				 * or writing pages from kswapd context.
				 */
				zone_clear_flag(zone, ZONE_CONGESTED);
				zone_clear_flag(zone, ZONE_TAIL_LRU_DIRTY);
			}
		}
    
    
    		/*
    		 * If the low watermark is met there is no need for processes
		 * to be throttled on pfmemalloc_wait as they should now be
		 * able to safely make forward progress. Wake them
    		 */
    		if (waitqueue_active(&pgdat->pfmemalloc_wait) &&
    				pfmemalloc_watermark_ok(pgdat))
    			wake_up(&pgdat->pfmemalloc_wait);
    
    
		/*
		 * Fragmentation may mean that the system cannot be rebalanced
    		 * for high-order allocations in all zones. If twice the
    		 * allocation size has been reclaimed and the zones are still
    		 * not balanced then recheck the watermarks at order-0 to
    		 * prevent kswapd reclaiming excessively. Assume that a
		 * process that requested a high-order allocation can direct
		 * reclaim/compact.
		 */
    
    		if (order && sc.nr_reclaimed >= 2UL << order)
    			order = sc.order = 0;
    
    		/* Check if kswapd should be suspending */
    		if (try_to_freeze() || kthread_should_stop())
    			break;
    
    		/*
    		 * Compact if necessary and kswapd is reclaiming at least the
		 * high watermark number of pages as requested
    		 */
    		if (pgdat_needs_compaction && sc.nr_reclaimed > nr_attempted)
			compact_pgdat(pgdat, order);

		/*
		 * Raise priority if scanning rate is too low or there was no
		 * progress in reclaiming pages
		 */
    		if (raise_priority || !sc.nr_reclaimed)
    			sc.priority--;
    
	} while (sc.priority >= 1 &&
		 !pgdat_balanced(pgdat, order, *classzone_idx));

	/*
	 * Return the order we were reclaiming at so prepare_kswapd_sleep()
	 * makes a decision on the order we were last reclaiming at. However,
	 * if another caller entered the allocator slow path while kswapd
	 * was awake, order will remain at the higher level.
	 */
	*classzone_idx = end_zone;
	return order;
}

    static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx)
    
    {
    	long remaining = 0;
    	DEFINE_WAIT(wait);
    
    	if (freezing(current) || kthread_should_stop())
    		return;
    
    	prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
    
    	/* Try to sleep for a short interval */
    
    	if (prepare_kswapd_sleep(pgdat, order, remaining, classzone_idx)) {
    
    		remaining = schedule_timeout(HZ/10);
    		finish_wait(&pgdat->kswapd_wait, &wait);
    		prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
    	}
    
    	/*
    	 * After a short sleep, check if it was a premature sleep. If not, then
    	 * go fully to sleep until explicitly woken up.
    	 */
    
    	if (prepare_kswapd_sleep(pgdat, order, remaining, classzone_idx)) {
    
    		trace_mm_vmscan_kswapd_sleep(pgdat->node_id);
    
    		/*
    		 * vmstat counters are not perfectly accurate and the estimated
    		 * value for counters such as NR_FREE_PAGES can deviate from the
    		 * true value by nr_online_cpus * threshold. To avoid the zone
    		 * watermarks being breached while under pressure, we reduce the
    		 * per-cpu vmstat threshold while kswapd is awake and restore
    		 * them before going back to sleep.
    		 */
    		set_pgdat_percpu_threshold(pgdat, calculate_normal_threshold);
    
    		/*
    		 * Compaction records what page blocks it recently failed to