			break;
		default:
			/* Look ma, no brain */
			BUG();
		}

		nr[lru] = scan;
	}
}
    
    /*
     * This is a basic per-zone page freer.  Used by both kswapd and direct reclaim.
     */
    static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
    {
    	unsigned long nr[NR_LRU_LISTS];
    
    	unsigned long targets[NR_LRU_LISTS];
    
    	unsigned long nr_to_scan;
    	enum lru_list lru;
    	unsigned long nr_reclaimed = 0;
    	unsigned long nr_to_reclaim = sc->nr_to_reclaim;
    	struct blk_plug plug;
    
	bool scan_adjusted = false;

	get_scan_count(lruvec, sc, nr);

	/* Record the original scan target for proportional adjustments later */
	memcpy(targets, nr, sizeof(nr));
    
    
    	blk_start_plug(&plug);
    	while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
    					nr[LRU_INACTIVE_FILE]) {
    
    		unsigned long nr_anon, nr_file, percentage;
    		unsigned long nr_scanned;
    
    
    		for_each_evictable_lru(lru) {
    			if (nr[lru]) {
    				nr_to_scan = min(nr[lru], SWAP_CLUSTER_MAX);
    				nr[lru] -= nr_to_scan;
    
    				nr_reclaimed += shrink_list(lru, nr_to_scan,
    							    lruvec, sc);
    			}
    		}
    
    
		if (nr_reclaimed < nr_to_reclaim || scan_adjusted)
			continue;

		/*
		 * For global direct reclaim, reclaim only the number of pages
		 * requested. Less care is taken to scan proportionally as it
		 * is more important to minimise direct reclaim stall latency
		 * than it is to properly age the LRU lists.
		 */
		if (global_reclaim(sc) && !current_is_kswapd())
			break;
    
    		/*
    		 * For kswapd and memcg, reclaim at least the number of pages
		 * requested. Ensure that the anon and file LRUs are scanned
		 * proportionally to what was requested by get_scan_count(). We
		 * stop reclaiming one LRU and reduce the amount of scanning
		 * proportionally to the original scan target.
    		 */
    		nr_file = nr[LRU_INACTIVE_FILE] + nr[LRU_ACTIVE_FILE];
    		nr_anon = nr[LRU_INACTIVE_ANON] + nr[LRU_ACTIVE_ANON];
    
    		if (nr_file > nr_anon) {
    			unsigned long scan_target = targets[LRU_INACTIVE_ANON] +
    						targets[LRU_ACTIVE_ANON] + 1;
    			lru = LRU_BASE;
    			percentage = nr_anon * 100 / scan_target;
    		} else {
    			unsigned long scan_target = targets[LRU_INACTIVE_FILE] +
    						targets[LRU_ACTIVE_FILE] + 1;
    			lru = LRU_FILE;
    			percentage = nr_file * 100 / scan_target;
    		}
    
		/* Stop scanning the smaller of the two LRUs */
    		nr[lru] = 0;
    		nr[lru + LRU_ACTIVE] = 0;
    
    		/*
    		 * Recalculate the other LRU scan count based on its original
    		 * scan target and the percentage scanning already complete
    		 */
    		lru = (lru == LRU_FILE) ? LRU_BASE : LRU_FILE;
    		nr_scanned = targets[lru] - nr[lru];
    		nr[lru] = targets[lru] * (100 - percentage) / 100;
    		nr[lru] -= min(nr[lru], nr_scanned);
    
    		lru += LRU_ACTIVE;
    		nr_scanned = targets[lru] - nr[lru];
    		nr[lru] = targets[lru] * (100 - percentage) / 100;
    		nr[lru] -= min(nr[lru], nr_scanned);
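		/*
		 * Worked example with made-up numbers: if the anon targets
		 * summed to 200 pages and the file targets to 1000, and
		 * scanning stops with 120 anon pages still pending, then
		 * percentage = 120 * 100 / 201, roughly 59. Each file list
		 * then only scans until about 41% of its original target has
		 * been covered, matching the fraction of the anon target that
		 * was actually scanned before anon scanning was stopped.
		 */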
    
    		scan_adjusted = true;
    
    	}
    	blk_finish_plug(&plug);
    	sc->nr_reclaimed += nr_reclaimed;
    
    	/*
    	 * Even if we did not try to evict anon pages at all, we want to
    	 * rebalance the anon lru active/inactive ratio.
    	 */
    	if (inactive_anon_is_low(lruvec))
    		shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
    				   sc, LRU_ACTIVE_ANON);
    
    	throttle_vm_writeout(sc->gfp_mask);
    }
    
    
    /* Use reclaim/compaction for costly allocs or under memory pressure */
    
static bool in_reclaim_compaction(struct scan_control *sc)
{
	if (IS_ENABLED(CONFIG_COMPACTION) && sc->order &&
			(sc->order > PAGE_ALLOC_COSTLY_ORDER ||
			 sc->priority < DEF_PRIORITY - 2))
		return true;

	return false;
    }
    
    
/*
 * Reclaim/compaction is used for high-order allocation requests. It reclaims
 * order-0 pages before compacting the zone. should_continue_reclaim() returns
 * true if more pages should be reclaimed such that when the page allocator
 * calls try_to_compact_zone() it will have enough free pages to succeed.
 * It will give up earlier than that if there is difficulty reclaiming pages.
 */
    static inline bool should_continue_reclaim(struct zone *zone,
    
    					unsigned long nr_reclaimed,
    					unsigned long nr_scanned,
    					struct scan_control *sc)
    {
    	unsigned long pages_for_compaction;
    	unsigned long inactive_lru_pages;
    
	/* If not in reclaim/compaction mode, stop */
	if (!in_reclaim_compaction(sc))
		return false;

    	/* Consider stopping depending on scan and reclaim activity */
    	if (sc->gfp_mask & __GFP_REPEAT) {
    		/*
    		 * For __GFP_REPEAT allocations, stop reclaiming if the
    		 * full LRU list has been scanned and we are still failing
    		 * to reclaim pages. This full LRU scan is potentially
    		 * expensive but a __GFP_REPEAT caller really wants to succeed
    		 */
    		if (!nr_reclaimed && !nr_scanned)
    			return false;
    	} else {
    		/*
    		 * For non-__GFP_REPEAT allocations which can presumably
    		 * fail without consequence, stop if we failed to reclaim
    		 * any pages from the last SWAP_CLUSTER_MAX number of
		 * pages that were scanned. This will return to the
		 * caller faster at the risk that reclaim/compaction and
		 * the resulting allocation attempt fail.
    		 */
    		if (!nr_reclaimed)
    			return false;
    	}
    
    
    	/*
    	 * If we have not reclaimed enough pages for compaction and the
    	 * inactive lists are large enough, continue reclaiming
    	 */
    	pages_for_compaction = (2UL << sc->order);
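	/*
	 * e.g. an order-9 request (a 2MB THP with 4K pages) asks for
	 * 2 << 9 = 1024 free pages, twice the size of the allocation itself.
	 */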
    
    	inactive_lru_pages = zone_page_state(zone, NR_INACTIVE_FILE);
    
    	if (get_nr_swap_pages() > 0)
    
    		inactive_lru_pages += zone_page_state(zone, NR_INACTIVE_ANON);
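	/*
	 * Note that anon pages were only counted as reclaimable above when
	 * there is swap space left to move them to.
	 */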
    
    	if (sc->nr_reclaimed < pages_for_compaction &&
    			inactive_lru_pages > pages_for_compaction)
    		return true;
    
    	/* If compaction would go ahead or the allocation would succeed, stop */
    
    	switch (compaction_suitable(zone, sc->order)) {
    
    	case COMPACT_PARTIAL:
    	case COMPACT_CONTINUE:
    		return false;
    	default:
    		return true;
    	}
    }
    
    
static void shrink_zone(struct zone *zone, struct scan_control *sc)
{
	unsigned long nr_reclaimed, nr_scanned;

    	do {
    		struct mem_cgroup *root = sc->target_mem_cgroup;
    		struct mem_cgroup_reclaim_cookie reclaim = {
    			.zone = zone,
    			.priority = sc->priority,
		};
		struct mem_cgroup *memcg;

		nr_reclaimed = sc->nr_reclaimed;
    		nr_scanned = sc->nr_scanned;
    
    
    
    		memcg = mem_cgroup_iter(root, NULL, &reclaim);
		memcg = mem_cgroup_iter(root, NULL, &reclaim);
		do {
			struct lruvec *lruvec;

			lruvec = mem_cgroup_zone_lruvec(zone, memcg);
    
    			shrink_lruvec(lruvec, sc);
    
			/*
			 * Direct reclaim and kswapd have to scan all memory
			 * cgroups to fulfill the overall scan target for the
			 * zone.
			 *
			 * Limit reclaim, on the other hand, only cares about
			 * nr_to_reclaim pages to be reclaimed and it will
			 * retry with decreasing priority if one round over the
			 * whole hierarchy is not sufficient.
			 */
    			if (!global_reclaim(sc) &&
    					sc->nr_reclaimed >= sc->nr_to_reclaim) {
    
    				mem_cgroup_iter_break(root, memcg);
    				break;
    			}
    
    			memcg = mem_cgroup_iter(root, memcg, &reclaim);
    		} while (memcg);
    
    
    		vmpressure(sc->gfp_mask, sc->target_mem_cgroup,
    			   sc->nr_scanned - nr_scanned,
    			   sc->nr_reclaimed - nr_reclaimed);
    
    
	} while (should_continue_reclaim(zone, sc->nr_reclaimed - nr_reclaimed,
					 sc->nr_scanned - nr_scanned, sc));
}

    /* Returns true if compaction should go ahead for a high-order request */
    static inline bool compaction_ready(struct zone *zone, struct scan_control *sc)
    {
    	unsigned long balance_gap, watermark;
    	bool watermark_ok;
    
    	/* Do not consider compaction for orders reclaim is meant to satisfy */
    	if (sc->order <= PAGE_ALLOC_COSTLY_ORDER)
    		return false;
    
    	/*
    	 * Compaction takes time to run and there are potentially other
    	 * callers using the pages just freed. Continue reclaiming until
    	 * there is a buffer of free pages available to give compaction
    	 * a reasonable chance of completing and allocating the page
    	 */
	balance_gap = min(low_wmark_pages(zone),
		(zone->managed_pages + KSWAPD_ZONE_BALANCE_GAP_RATIO-1) /
			KSWAPD_ZONE_BALANCE_GAP_RATIO);
    	watermark = high_wmark_pages(zone) + balance_gap + (2UL << sc->order);
    	watermark_ok = zone_watermark_ok_safe(zone, 0, watermark, 0, 0);
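	/*
	 * i.e. free pages must cover the high watermark, the balance gap and
	 * twice the requested allocation before reclaim is considered to
	 * have done enough on compaction's behalf.
	 */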
    
    	/*
    	 * If compaction is deferred, reclaim up to a point where
    	 * compaction will have a chance of success when re-enabled
    	 */
    
    	if (compaction_deferred(zone, sc->order))
    
    		return watermark_ok;
    
    	/* If compaction is not ready to start, keep reclaiming */
    	if (!compaction_suitable(zone, sc->order))
    		return false;
    
    	return watermark_ok;
    }
    
    
/*
 * This is the direct reclaim path, for page-allocating processes.  We only
 * try to reclaim pages from zones which will satisfy the caller's allocation
 * request.
 *
 * We reclaim from a zone even if that zone is over high_wmark_pages(zone).
 * Because:
 * a) The caller may be trying to free *extra* pages to satisfy a higher-order
 *    allocation or
 * b) The target zone may be at high_wmark_pages(zone) but the lower zones
 *    must go *over* high_wmark_pages(zone) to satisfy the `incremental min'
 *    zone defense algorithm.
 *
 * If a zone is deemed to be full of pinned pages then just give it a light
 * scan then give up on it.
 *
 * This function returns true if a zone is being reclaimed for a costly
 * high-order allocation and compaction is ready to begin. This indicates to
 * the caller that it should consider retrying the allocation instead of
 * further reclaim.
 */
    
static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
{
	struct zoneref *z;
	struct zone *zone;
	unsigned long nr_soft_reclaimed;
	unsigned long nr_soft_scanned;
	bool aborted_reclaim = false;
    
    	/*
    	 * If the number of buffer_heads in the machine exceeds the maximum
    	 * allowed level, force direct reclaim to scan the highmem zone as
    	 * highmem pages could be pinning lowmem pages storing buffer_heads
    	 */
    	if (buffer_heads_over_limit)
    		sc->gfp_mask |= __GFP_HIGHMEM;
    
    
    	for_each_zone_zonelist_nodemask(zone, z, zonelist,
    					gfp_zone(sc->gfp_mask), sc->nodemask) {
    
		if (!populated_zone(zone))
			continue;
    
		/*
		 * Take care that memory controller reclaiming has only a
		 * small influence on the global LRU.
		 */
		if (global_reclaim(sc)) {
			if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
				continue;

			if (sc->priority != DEF_PRIORITY &&
			    !zone_reclaimable(zone))
				continue;	/* Let kswapd poll it */

			if (IS_ENABLED(CONFIG_COMPACTION)) {
				/*
				 * If we already have plenty of memory free for
				 * compaction in this zone, don't free any more.
				 * Even though compaction is invoked for any
				 * non-zero order, only frequent costly order
				 * reclamation is disruptive enough to become a
				 * noticeable problem, like transparent huge
				 * page allocations.
				 */
				if (compaction_ready(zone, sc)) {
					aborted_reclaim = true;
					continue;
				}
			}

    			/*
    			 * This steals pages from memory cgroups over softlimit
    			 * and returns the number of reclaimed pages and
    			 * scanned pages. This works for global memory pressure
    			 * and balancing, not for a memcg's limit.
    			 */
    			nr_soft_scanned = 0;
    			nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone,
    						sc->order, sc->gfp_mask,
    						&nr_soft_scanned);
    			sc->nr_reclaimed += nr_soft_reclaimed;
    			sc->nr_scanned += nr_soft_scanned;
    
			/* need some check to avoid more shrink_zone() */
		}

		shrink_zone(zone, sc);
	}

	return aborted_reclaim;
}

    /* All zones in zonelist are unreclaimable? */
    
    static bool all_unreclaimable(struct zonelist *zonelist,
    		struct scan_control *sc)
    {
    	struct zoneref *z;
    	struct zone *zone;
    
    	for_each_zone_zonelist_nodemask(zone, z, zonelist,
    			gfp_zone(sc->gfp_mask), sc->nodemask) {
    		if (!populated_zone(zone))
    			continue;
    		if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
    			continue;
    
		if (zone_reclaimable(zone))
			return false;
	}

	return true;
}
    
    /*
     * This is the main entry point to direct page reclaim.
     *
     * If a full scan of the inactive list fails to free enough memory then we
     * are "out of memory" and something needs to be killed.
     *
     * If the caller is !__GFP_FS then the probability of a failure is reasonably
     * high - the zone may be full of dirty or under-writeback pages, which this
    
     * caller can't do much about.  We kick the writeback threads and take explicit
     * naps in the hope that some of these pages can be written.  But if the
     * allocating task holds filesystem locks which prevent writeout this might not
     * work, and the allocation attempt will fail.
    
     *
     * returns:	0, if no pages reclaimed
     * 		else, the number of pages reclaimed
    
     */
    
static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
					struct scan_control *sc,
					struct shrink_control *shrink)
{
	unsigned long total_scanned = 0;
	struct reclaim_state *reclaim_state = current->reclaim_state;
	struct zoneref *z;
	struct zone *zone;
	unsigned long writeback_threshold;
	bool aborted_reclaim;
    
	delayacct_freepages_start();

	do {
		vmpressure_prio(sc->gfp_mask, sc->target_mem_cgroup,
    				sc->priority);
    
    		sc->nr_scanned = 0;
    
		aborted_reclaim = shrink_zones(zonelist, sc);

		/*
		 * Don't shrink slabs when reclaiming memory from over limit
		 * cgroups but do shrink slab at least once when aborting
		 * reclaim for compaction to avoid unevenly scanning file/anon
		 * LRU pages over slab pages.
		 */
		if (global_reclaim(sc)) {
			unsigned long lru_pages = 0;

			nodes_clear(shrink->nodes_to_scan);
			for_each_zone_zonelist(zone, z, zonelist,
					gfp_zone(sc->gfp_mask)) {
				if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
					continue;

				lru_pages += zone_reclaimable_pages(zone);
				node_set(zone_to_nid(zone),
					 shrink->nodes_to_scan);
			}

			shrink_slab(shrink, sc->nr_scanned, lru_pages);
			if (reclaim_state) {
				sc->nr_reclaimed += reclaim_state->reclaimed_slab;
				reclaim_state->reclaimed_slab = 0;
			}
		}
    
    		total_scanned += sc->nr_scanned;
    
		if (sc->nr_reclaimed >= sc->nr_to_reclaim)
			goto out;
    
    
    		/*
    		 * If we're getting trouble reclaiming, start doing
    		 * writepage even in laptop mode.
    		 */
    		if (sc->priority < DEF_PRIORITY - 2)
    			sc->may_writepage = 1;
    
    
    		/*
    		 * Try to write back as many pages as we just scanned.  This
    		 * tends to cause slow streaming writers to write data to the
    		 * disk smoothly, at the dirtying rate, which is nice.   But
    		 * that's undesirable in laptop mode, where we *want* lumpy
    		 * writeout.  So in laptop mode, write out the whole world.
    		 */
    
    		writeback_threshold = sc->nr_to_reclaim + sc->nr_to_reclaim / 2;
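		/*
		 * The threshold is 1.5x the reclaim target; with the usual
		 * direct reclaim target of SWAP_CLUSTER_MAX (32) pages the
		 * flusher threads are woken once 48 pages have been scanned
		 * without meeting the target.
		 */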
    		if (total_scanned > writeback_threshold) {
    
    			wakeup_flusher_threads(laptop_mode ? 0 : total_scanned,
    						WB_REASON_TRY_TO_FREE_PAGES);
    
    			sc->may_writepage = 1;
    
    		}
    
	} while (--sc->priority >= 0 && !aborted_reclaim);

out:
	delayacct_freepages_end();

	/*
	 * As hibernation is going on, kswapd is frozen so that it can't mark
	 * the zone as all_unreclaimable. Thus we bypass the all_unreclaimable
	 * check.
	 */
    	if (oom_killer_disabled)
    		return 0;
    
    
	/* Aborted reclaim to try compaction? don't OOM, then */
	if (aborted_reclaim)
		return 1;

	/* top priority shrink_zones still had more to do? don't OOM, then */
	if (global_reclaim(sc) && !all_unreclaimable(zonelist, sc))
		return 1;

	return 0;
}

    static bool pfmemalloc_watermark_ok(pg_data_t *pgdat)
    {
    	struct zone *zone;
    	unsigned long pfmemalloc_reserve = 0;
    	unsigned long free_pages = 0;
    	int i;
    	bool wmark_ok;
    
    	for (i = 0; i <= ZONE_NORMAL; i++) {
    		zone = &pgdat->node_zones[i];
    		pfmemalloc_reserve += min_wmark_pages(zone);
    		free_pages += zone_page_state(zone, NR_FREE_PAGES);
    	}
    
    	wmark_ok = free_pages > pfmemalloc_reserve / 2;
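	/*
	 * i.e. direct reclaimers are throttled once free pages in the lowmem
	 * zones (ZONE_DMA up to ZONE_NORMAL) fall below half of their
	 * combined min watermarks.
	 */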
    
    	/* kswapd must be awake if processes are being throttled */
    	if (!wmark_ok && waitqueue_active(&pgdat->kswapd_wait)) {
    		pgdat->classzone_idx = min(pgdat->classzone_idx,
    						(enum zone_type)ZONE_NORMAL);
    		wake_up_interruptible(&pgdat->kswapd_wait);
    	}
    
    	return wmark_ok;
    }
    
    /*
     * Throttle direct reclaimers if backing storage is backed by the network
     * and the PFMEMALLOC reserve for the preferred node is getting dangerously
 * depleted. kswapd will continue to make progress and wake the processes
 * when the low watermark is reached.
 *
 * Returns true if a fatal signal was delivered during throttling. If this
 * happens, the page allocator should not consider triggering the OOM killer.
 */
static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist,
					nodemask_t *nodemask)
    {
    	struct zone *zone;
    	int high_zoneidx = gfp_zone(gfp_mask);
    	pg_data_t *pgdat;
    
    	/*
    	 * Kernel threads should not be throttled as they may be indirectly
    	 * responsible for cleaning pages necessary for reclaim to make forward
    	 * progress. kjournald for example may enter direct reclaim while
	 * committing a transaction where throttling it could force other
    	 * processes to block on log_wait_commit().
    	 */
    	if (current->flags & PF_KTHREAD)
    
    		goto out;
    
    	/*
    	 * If a fatal signal is pending, this process should not throttle.
    	 * It should return quickly so it can exit and free its memory
    	 */
    	if (fatal_signal_pending(current))
    		goto out;
    
    
    	/* Check if the pfmemalloc reserves are ok */
    	first_zones_zonelist(zonelist, high_zoneidx, NULL, &zone);
    	pgdat = zone->zone_pgdat;
	if (pfmemalloc_watermark_ok(pgdat))
		goto out;

    	/* Account for the throttling */
    	count_vm_event(PGSCAN_DIRECT_THROTTLE);
    
    
    	/*
    	 * If the caller cannot enter the filesystem, it's possible that it
    	 * is due to the caller holding an FS lock or performing a journal
    	 * transaction in the case of a filesystem like ext[3|4]. In this case,
    	 * it is not safe to block on pfmemalloc_wait as kswapd could be
    	 * blocked waiting on the same lock. Instead, throttle for up to a
    	 * second before continuing.
    	 */
    	if (!(gfp_mask & __GFP_FS)) {
		wait_event_interruptible_timeout(pgdat->pfmemalloc_wait,
			pfmemalloc_watermark_ok(pgdat), HZ);

		goto check_pending;
	}
    
    	/* Throttle until kswapd wakes the process */
    	wait_event_killable(zone->zone_pgdat->pfmemalloc_wait,
    		pfmemalloc_watermark_ok(pgdat));
    
    
    check_pending:
    	if (fatal_signal_pending(current))
    		return true;
    
    out:
	return false;
}

unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
				gfp_t gfp_mask, nodemask_t *nodemask)
{
	unsigned long nr_reclaimed;
	struct scan_control sc = {
		.gfp_mask = (gfp_mask = memalloc_noio_flags(gfp_mask)),
		.may_writepage = !laptop_mode,
		.nr_to_reclaim = SWAP_CLUSTER_MAX,
		.may_unmap = 1,
		.may_swap = 1,
		.order = order,
		.priority = DEF_PRIORITY,
		.nodemask = nodemask,
	};
	struct shrink_control shrink = {
		.gfp_mask = sc.gfp_mask,
	};

	/*
	 * Do not enter reclaim if a fatal signal was delivered while
	 * throttled. 1 is returned so that the page allocator does not OOM
	 * kill at this point.
	 */
	if (throttle_direct_reclaim(gfp_mask, zonelist, nodemask))
		return 1;

    	trace_mm_vmscan_direct_reclaim_begin(order,
    				sc.may_writepage,
    				gfp_mask);
    
    
    	nr_reclaimed = do_try_to_free_pages(zonelist, &sc, &shrink);
    
    
    	trace_mm_vmscan_direct_reclaim_end(nr_reclaimed);
    
	return nr_reclaimed;
}

    #ifdef CONFIG_MEMCG
    
    unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg,
    
    						gfp_t gfp_mask, bool noswap,
    
    						struct zone *zone,
    						unsigned long *nr_scanned)
    
    {
    	struct scan_control sc = {
    
    		.nr_to_reclaim = SWAP_CLUSTER_MAX,
    
    		.may_writepage = !laptop_mode,
    		.may_unmap = 1,
    		.may_swap = !noswap,
		.order = 0,
		.priority = 0,
	};
	struct lruvec *lruvec = mem_cgroup_zone_lruvec(zone, memcg);
    
    	sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
    			(GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK);
    
    	trace_mm_vmscan_memcg_softlimit_reclaim_begin(sc.order,
    
    						      sc.may_writepage,
    						      sc.gfp_mask);
    
    
    	/*
    	 * NOTE: Although we can get the priority field, using it
    	 * here is not a good idea, since it limits the pages we can scan.
    	 * if we don't reclaim here, the shrink_zone from balance_pgdat
    	 * will pick up pages from other mem cgroup's as well. We hack
    	 * the priority and make it zero.
    	 */
    
    	shrink_lruvec(lruvec, &sc);
    
    
    	trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed);
    
    
	*nr_scanned = sc.nr_scanned;
	return sc.nr_reclaimed;
}

unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
					   gfp_t gfp_mask,
					   bool noswap)
{
	struct zonelist *zonelist;
	unsigned long nr_reclaimed;
	int nid;
	struct scan_control sc = {
    		.may_writepage = !laptop_mode,
    
    		.may_unmap = 1,
    
    		.may_swap = !noswap,
    
    		.nr_to_reclaim = SWAP_CLUSTER_MAX,
    
    		.nodemask = NULL, /* we don't care the placement */
    
    		.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
    				(GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK),
    	};
    	struct shrink_control shrink = {
		.gfp_mask = sc.gfp_mask,
	};

    	/*
    	 * Unlike direct reclaim via alloc_pages(), memcg's reclaim doesn't
    	 * take care of from where we get pages. So the node where we start the
    	 * scan does not need to be the current node.
    	 */
    
    	nid = mem_cgroup_select_victim_node(memcg);
    
    
    	zonelist = NODE_DATA(nid)->node_zonelists;
    
    
    	trace_mm_vmscan_memcg_reclaim_begin(0,
    					    sc.may_writepage,
    					    sc.gfp_mask);
    
    
    	nr_reclaimed = do_try_to_free_pages(zonelist, &sc, &shrink);
    
    
    	trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed);
    
	return nr_reclaimed;
}
#endif

static void age_active_anon(struct zone *zone, struct scan_control *sc)
{
	struct mem_cgroup *memcg;

	if (!total_swap_pages)
		return;

	memcg = mem_cgroup_iter(NULL, NULL, NULL);
	do {
		struct lruvec *lruvec = mem_cgroup_zone_lruvec(zone, memcg);

		if (inactive_anon_is_low(lruvec))
			shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
					   sc, LRU_ACTIVE_ANON);

		memcg = mem_cgroup_iter(NULL, memcg, NULL);
	} while (memcg);
}

    static bool zone_balanced(struct zone *zone, int order,
    			  unsigned long balance_gap, int classzone_idx)
    {
    	if (!zone_watermark_ok_safe(zone, order, high_wmark_pages(zone) +
    				    balance_gap, classzone_idx, 0))
    		return false;
    
    
	if (IS_ENABLED(CONFIG_COMPACTION) && order &&
	    !compaction_suitable(zone, order))
		return false;

	return true;
}

/*
     * pgdat_balanced() is used when checking if a node is balanced.
     *
     * For order-0, all zones must be balanced!
     *
     * For high-order allocations only zones that meet watermarks and are in a
     * zone allowed by the callers classzone_idx are added to balanced_pages. The
     * total of balanced pages must be at least 25% of the zones allowed by
     * classzone_idx for the node to be considered balanced. Forcing all zones to
     * be balanced for high orders can cause excessive reclaim when there are
 * imbalanced zones.
 *
 * The choice of 25% is due to
     *   o a 16M DMA zone that is balanced will not balance a zone on any
     *     reasonable sized machine
 *   o On all other machines, the top zone must be at least a reasonable
 *     percentage of the middle zones. For example, on 32-bit x86, highmem
 *     would need to be at least 256M for it to balance a whole node.
     *     Similarly, on x86-64 the Normal zone would need to be at least 1G
     *     to balance a node on its own. These seemed like reasonable ratios.
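 *
 * As a made-up illustration, on a node with 1G of lowmem and 3G of highmem,
 * zones holding at least 1G of managed pages (25% of the 4G total) must meet
 * their watermarks before a high-order wakeup is considered balanced.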
     */
    
static bool pgdat_balanced(pg_data_t *pgdat, int order, int classzone_idx)
{
	unsigned long managed_pages = 0;
	unsigned long balanced_pages = 0;
	int i;

    	/* Check the watermark levels */
    	for (i = 0; i <= classzone_idx; i++) {
    		struct zone *zone = pgdat->node_zones + i;
    
    		managed_pages += zone->managed_pages;
    
    
    		/*
    		 * A special case here:
    		 *
    		 * balance_pgdat() skips over all_unreclaimable after
    		 * DEF_PRIORITY. Effectively, it considers them balanced so
    		 * they must be considered balanced here as well!
    		 */
    
    		if (!zone_reclaimable(zone)) {
    
    			balanced_pages += zone->managed_pages;
    
    			continue;
    		}
    
		if (zone_balanced(zone, order, 0, i))
			balanced_pages += zone->managed_pages;
		else if (!order)
			return false;
	}

	if (order)
		return balanced_pages >= (managed_pages >> 2);
	else
		return true;
}

    /*
     * Prepare kswapd for sleeping. This verifies that there are no processes
     * waiting in throttle_direct_reclaim() and that watermarks have been met.
     *
     * Returns true if kswapd is ready to sleep
     */
static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining,
					int classzone_idx)
{
    	/* If a direct reclaimer woke kswapd within HZ/10, it's premature */
    	if (remaining)
    
    		return false;
    
    	/*
    	 * There is a potential race between when kswapd checks its watermarks
    	 * and a process gets throttled. There is also a potential race if
	 * processes get throttled, kswapd wakes, a large process exits thereby
    	 * balancing the zones that causes kswapd to miss a wakeup. If kswapd
    	 * is going to sleep, no process should be sleeping on pfmemalloc_wait
    	 * so wake them now if necessary. If necessary, processes will wake
    	 * kswapd and get throttled again
    	 */
    	if (waitqueue_active(&pgdat->pfmemalloc_wait)) {
    		wake_up(&pgdat->pfmemalloc_wait);
    		return false;
    	}
    
	return pgdat_balanced(pgdat, order, classzone_idx);
}

    /*
     * kswapd shrinks the zone by the number of pages required to reach
 * the high watermark.
 *
 * Returns true if kswapd scanned at least the requested number of pages to
 * reclaim or if the lack of progress was due to pages under writeback.
 * This is used to determine if the scanning priority needs to be raised.
 */
static bool kswapd_shrink_zone(struct zone *zone,
			       int classzone_idx,
			       struct scan_control *sc,
			       unsigned long lru_pages,
			       unsigned long *nr_attempted)
{
	int testorder = sc->order;
	unsigned long balance_gap;
	bool lowmem_pressure;
	struct reclaim_state *reclaim_state = current->reclaim_state;
    	struct shrink_control shrink = {
    		.gfp_mask = sc->gfp_mask,
    	};
    
    
    	/* Reclaim above the high watermark. */
    	sc->nr_to_reclaim = max(SWAP_CLUSTER_MAX, high_wmark_pages(zone));
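	/*
	 * Unlike direct reclaim, which targets SWAP_CLUSTER_MAX (32) pages
	 * per invocation, kswapd's target here is a full high watermark's
	 * worth of pages for the zone.
	 */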
    
    
    	/*
    	 * Kswapd reclaims only single pages with compaction enabled. Trying
    	 * too hard to reclaim until contiguous free pages have become
    	 * available can hurt performance by evicting too much useful data
    	 * from memory. Do not reclaim more than needed for compaction.
    	 */
    	if (IS_ENABLED(CONFIG_COMPACTION) && sc->order &&
    			compaction_suitable(zone, sc->order) !=
    				COMPACT_SKIPPED)
    		testorder = 0;
    
    	/*
    	 * We put equal pressure on every zone, unless one zone has way too
    	 * many pages free already. The "too many pages" is defined as the
    	 * high wmark plus a "gap" where the gap is either the low
    	 * watermark or 1% of the zone, whichever is smaller.
    	 */
    	balance_gap = min(low_wmark_pages(zone),
    		(zone->managed_pages + KSWAPD_ZONE_BALANCE_GAP_RATIO-1) /
    		KSWAPD_ZONE_BALANCE_GAP_RATIO);
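	/*
	 * e.g. for a 4GB zone with 4K pages (~1M managed pages), the 1% gap
	 * works out to roughly 10k pages (~40MB), unless the low watermark
	 * is smaller.
	 */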
    
    	/*
    	 * If there is no low memory pressure or the zone is balanced then no
    	 * reclaim is necessary
    	 */
    	lowmem_pressure = (buffer_heads_over_limit && is_highmem(zone));
    	if (!lowmem_pressure && zone_balanced(zone, testorder,
    						balance_gap, classzone_idx))
    		return true;
    
    
	shrink_zone(zone, sc);

	reclaim_state->reclaimed_slab = 0;
	nodes_clear(shrink.nodes_to_scan);
	node_set(zone_to_nid(zone), shrink.nodes_to_scan);

	shrink_slab(&shrink, sc->nr_scanned, lru_pages);
	sc->nr_reclaimed += reclaim_state->reclaimed_slab;
    
    
    	/* Account for the number of pages attempted to reclaim */
    	*nr_attempted += sc->nr_to_reclaim;
    
    
    	zone_clear_flag(zone, ZONE_WRITEBACK);
    
    
    	/*
    	 * If a zone reaches its high watermark, consider it to be no longer
    	 * congested. It's possible there are dirty pages backed by congested
    	 * BDIs but as pressure is relieved, speculatively avoid congestion
    	 * waits.
    	 */
    
    	if (zone_reclaimable(zone) &&
    
    	    zone_balanced(zone, testorder, 0, classzone_idx)) {
    		zone_clear_flag(zone, ZONE_CONGESTED);
    		zone_clear_flag(zone, ZONE_TAIL_LRU_DIRTY);
    	}
    
    
	return sc->nr_scanned >= sc->nr_to_reclaim;
}

    /*
     * For kswapd, balance_pgdat() will work across all this node's zones until
    
     * they are all at high_wmark_pages(zone).
    
     *
    
     * Returns the final order kswapd was reclaiming at
    
     *
     * There is special handling here for zones which are full of pinned pages.
     * This can happen if the pages are all mlocked, or if they are all used by
     * device drivers (say, ZONE_DMA).  Or if they are all in use by hugetlb.
     * What we do is to detect the case where all pages in the zone have been
     * scanned twice and there has been zero successful reclaim.  Mark the zone as
     * dead and from now on, only perform a short scan.  Basically we're polling
     * the zone for when the problem goes away.
     *
     * kswapd scans the zones in the highmem->normal->dma direction.  It skips
    
     * zones which have free_pages > high_wmark_pages(zone), but once a zone is
     * found to have free_pages <= high_wmark_pages(zone), we scan that zone and the
     * lower zones regardless of the number of free pages in the lower zones. This
     * interoperates with the page allocator fallback scheme to ensure that aging
     * of pages is balanced across the zones.
    
     */
    
static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
							int *classzone_idx)
{
    	int i;
    
    	int end_zone = 0;	/* Inclusive.  0 = ZONE_DMA */
    
    	unsigned long nr_soft_reclaimed;
    	unsigned long nr_soft_scanned;
    
    	struct scan_control sc = {
    		.gfp_mask = GFP_KERNEL,
    
    		.priority = DEF_PRIORITY,
    
    		.may_unmap = 1,
    
    		.may_swap = 1,
    
    		.may_writepage = !laptop_mode,
    
		.order = order,
	};
	count_vm_event(PAGEOUTRUN);
    
	do {
		unsigned long lru_pages = 0;
    
    		bool raise_priority = true;
    
    		bool pgdat_needs_compaction = (order > 0);
    
    
    		sc.nr_reclaimed = 0;
    
    
    
    		/*
    		 * Scan in the highmem->dma direction for the highest
    		 * zone which needs scanning
    		 */
    		for (i = pgdat->nr_zones - 1; i >= 0; i--) {
    			struct zone *zone = pgdat->node_zones + i;
    
    
    
    			if (!populated_zone(zone))
    				continue;
    
    
    
			if (sc.priority != DEF_PRIORITY &&
			    !zone_reclaimable(zone))
				continue;
    
    			/*
    			 * Do some background aging of the anon list, to give
    			 * pages a chance to be referenced before reclaiming.
    			 */
    
    			age_active_anon(zone, &sc);
    
    			/*
    			 * If the number of buffer_heads in the machine
    			 * exceeds the maximum allowed level and this node
    			 * has a highmem zone, force kswapd to reclaim from
    			 * it to relieve lowmem pressure.
    			 */
    			if (buffer_heads_over_limit && is_highmem_idx(i)) {
    				end_zone = i;
    				break;
    			}
    
    
    			if (!zone_balanced(zone, order, 0, 0)) {