vmscan.c
  • 					balanced_order == new_order) {
    
    			new_order = pgdat->kswapd_max_order;
    			new_classzone_idx = pgdat->classzone_idx;
    			pgdat->kswapd_max_order =  0;
    			pgdat->classzone_idx = pgdat->nr_zones - 1;
    		}
    
    
    		if (order < new_order || classzone_idx > new_classzone_idx) {
    
    			/*
			 * Don't sleep if someone wants a larger 'order'
			 * allocation or has tighter zone constraints
    			 */
    			order = new_order;
    
    			classzone_idx = new_classzone_idx;
    
    		} else {
    
    			kswapd_try_to_sleep(pgdat, balanced_order,
    						balanced_classzone_idx);
    
    			order = pgdat->kswapd_max_order;
    
    			classzone_idx = pgdat->classzone_idx;
    
    			new_order = order;
    			new_classzone_idx = classzone_idx;
    
			pgdat->kswapd_max_order = 0;
			pgdat->classzone_idx = pgdat->nr_zones - 1;
		}

    		ret = try_to_freeze();
    		if (kthread_should_stop())
    			break;
    
    		/*
    		 * We can speed up thawing tasks if we don't call balance_pgdat
    		 * after returning from the refrigerator
    		 */
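		/* ret is true when the task was frozen and has just thawed. */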
    
    		if (!ret) {
    			trace_mm_vmscan_kswapd_wake(pgdat->node_id, order);
    
    			balanced_classzone_idx = classzone_idx;
    			balanced_order = balance_pgdat(pgdat, order,
    						&balanced_classzone_idx);
    
		}
    	}
    
    	return 0;
    }
    
    /*
     * A zone is low on free memory, so wake its kswapd task to service it.
     */
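/*
 * This is only a hint: kswapd is not woken if the zone is already above
 * its low watermark or if nothing is sleeping on pgdat->kswapd_wait.
 */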
    
    void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx)
    
    {
    	pg_data_t *pgdat;
    
    
    	if (!populated_zone(zone))
    
    		return;
    
    
    	if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
    
    		return;
	pgdat = zone->zone_pgdat;
    
    	if (pgdat->kswapd_max_order < order) {
    
    		pgdat->kswapd_max_order = order;
    
    		pgdat->classzone_idx = min(pgdat->classzone_idx, classzone_idx);
    	}
    
    	if (!waitqueue_active(&pgdat->kswapd_wait))
    
    		return;
    
    	if (zone_watermark_ok_safe(zone, order, low_wmark_pages(zone), 0, 0))
    		return;
    
    	trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, zone_idx(zone), order);
    
    	wake_up_interruptible(&pgdat->kswapd_wait);
}
    
    /*
 * The reclaimable count is mostly accurate.
 * The pages that are harder to reclaim are
 * - mlocked pages, which will be moved to the unevictable list when encountered
 * - mapped pages, which may require several passes to be reclaimed
 * - dirty pages, which are not "instantly" reclaimable
     */
    unsigned long global_reclaimable_pages(void)
{
    
    	int nr;
    
    	nr = global_page_state(NR_ACTIVE_FILE) +
    	     global_page_state(NR_INACTIVE_FILE);
    
    
    	if (get_nr_swap_pages() > 0)
    
    		nr += global_page_state(NR_ACTIVE_ANON) +
    		      global_page_state(NR_INACTIVE_ANON);
    
    	return nr;
    }
    
    unsigned long zone_reclaimable_pages(struct zone *zone)
    {
    	int nr;
    
    	nr = zone_page_state(zone, NR_ACTIVE_FILE) +
    	     zone_page_state(zone, NR_INACTIVE_FILE);
    
    
    	if (get_nr_swap_pages() > 0)
    
    		nr += zone_page_state(zone, NR_ACTIVE_ANON) +
    		      zone_page_state(zone, NR_INACTIVE_ANON);
    
    	return nr;
}
    
    #ifdef CONFIG_HIBERNATION
    
    /*
    
     * Try to free `nr_to_reclaim' of memory, system-wide, and return the number of
    
     * freed pages.
     *
     * Rather than trying to age LRUs the aim is to preserve the overall
     * LRU order by reclaiming preferentially
     * inactive > active > active referenced > active mapped
    
     */
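/*
 * The caller is the hibernation core (kernel/power/), which uses this to
 * free memory before the suspend image is created.
 */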
    
    unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
    
    {
    
    	struct reclaim_state reclaim_state;
    	struct scan_control sc = {
    
    		.gfp_mask = GFP_HIGHUSER_MOVABLE,
    		.may_swap = 1,
    		.may_unmap = 1,
    
    		.may_writepage = 1,
    
    		.nr_to_reclaim = nr_to_reclaim,
    		.hibernation_mode = 1,
    		.order = 0,
    
    	};
    
    	struct shrink_control shrink = {
    		.gfp_mask = sc.gfp_mask,
    	};
    	struct zonelist *zonelist = node_zonelist(numa_node_id(), sc.gfp_mask);
    
    	struct task_struct *p = current;
    	unsigned long nr_reclaimed;
    
    
    
    	p->flags |= PF_MEMALLOC;
    	lockdep_set_current_reclaim_state(sc.gfp_mask);
    	reclaim_state.reclaimed_slab = 0;
    	p->reclaim_state = &reclaim_state;
    
    	nr_reclaimed = do_try_to_free_pages(zonelist, &sc, &shrink);
    
    	p->reclaim_state = NULL;
    	lockdep_clear_current_reclaim_state();
    	p->flags &= ~PF_MEMALLOC;
    
	return nr_reclaimed;
    }
    
    #endif /* CONFIG_HIBERNATION */
    
    
    /* It's optimal to keep kswapds on the same CPUs as their memory, but
       not required for correctness.  So if the last cpu in a node goes
       away, we get changed to run anywhere: as the first one comes back,
       restore their cpu bindings. */
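/* Registered with hotcpu_notifier() from kswapd_init() below. */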
    
    static int cpu_callback(struct notifier_block *nfb, unsigned long action,
    			void *hcpu)
    
    {
    
	int nid;
    
    	if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN) {
    
    		for_each_node_state(nid, N_MEMORY) {
    
    			pg_data_t *pgdat = NODE_DATA(nid);
    
    			const struct cpumask *mask;
    
    			mask = cpumask_of_node(pgdat->node_id);
    
    			if (cpumask_any_and(cpu_online_mask, mask) < nr_cpu_ids)
    
    				/* One of our CPUs online: restore mask */
    
    				set_cpus_allowed_ptr(pgdat->kswapd, mask);
    
    		}
    	}
    	return NOTIFY_OK;
    }
    
    
    /*
     * This kswapd start function will be called by init and node-hot-add.
 * On node-hot-add, kswapd will be moved to the proper cpus if cpus are hot-added.
     */
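/* kswapd_init() below calls this for every node with memory at boot. */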
    int kswapd_run(int nid)
    {
    	pg_data_t *pgdat = NODE_DATA(nid);
    	int ret = 0;
    
    	if (pgdat->kswapd)
    		return 0;
    
    	pgdat->kswapd = kthread_run(kswapd, pgdat, "kswapd%d", nid);
    	if (IS_ERR(pgdat->kswapd)) {
    		/* failure at boot is fatal */
    		BUG_ON(system_state == SYSTEM_BOOTING);
    
    		pr_err("Failed to start kswapd on node %d\n", nid);
    		ret = PTR_ERR(pgdat->kswapd);
		pgdat->kswapd = NULL;
	}
	return ret;
}
    
/*
 * Called by memory hotplug when all memory in a node is offlined.  Caller must
     * hold lock_memory_hotplug().
    
     */
    void kswapd_stop(int nid)
    {
    	struct task_struct *kswapd = NODE_DATA(nid)->kswapd;
    
	if (kswapd) {
		kthread_stop(kswapd);
		NODE_DATA(nid)->kswapd = NULL;
	}
}

    static int __init kswapd_init(void)
    {
	int nid;

    	swap_setup();
    
    	for_each_node_state(nid, N_MEMORY)
		kswapd_run(nid);
    	hotcpu_notifier(cpu_callback, 0);
    	return 0;
    }
    
    module_init(kswapd_init)
    
    
    #ifdef CONFIG_NUMA
    /*
     * Zone reclaim mode
     *
     * If non-zero call zone_reclaim when the number of free pages falls below
     * the watermarks.
     */
    int zone_reclaim_mode __read_mostly;
    
    
    #define RECLAIM_ZONE (1<<0)	/* Run shrink_inactive_list on the zone */
    
    #define RECLAIM_WRITE (1<<1)	/* Writeout pages during reclaim */
    #define RECLAIM_SWAP (1<<2)	/* Swap pages out during reclaim */
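/*
 * The bits above are set via the vm.zone_reclaim_mode sysctl; for example,
 * writing 7 enables RECLAIM_ZONE | RECLAIM_WRITE | RECLAIM_SWAP.
 */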
    
    
    /*
     * Priority for ZONE_RECLAIM. This determines the fraction of pages
     * of a node considered for each zone_reclaim. 4 scans 1/16th of
     * a zone.
     */
    #define ZONE_RECLAIM_PRIORITY 4
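/*
 * For example, at priority 4 a zone with 1000000 pages on its LRU lists
 * gets roughly 1000000 >> 4 = 62500 of them scanned per reclaim pass.
 */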
    
    
    /*
     * Percentage of pages in a zone that must be unmapped for zone_reclaim to
     * occur.
     */
    int sysctl_min_unmapped_ratio = 1;
    
    
    /*
     * If the number of slab pages in a zone grows beyond this percentage then
     * slab reclaim needs to occur.
     */
    int sysctl_min_slab_ratio = 5;
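/*
 * Both ratios are exposed as vm.min_unmapped_ratio and vm.min_slab_ratio;
 * the per-zone thresholds derived from them are zone->min_unmapped_pages
 * and zone->min_slab_pages, used below.
 */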
    
    
    static inline unsigned long zone_unmapped_file_pages(struct zone *zone)
    {
    	unsigned long file_mapped = zone_page_state(zone, NR_FILE_MAPPED);
    	unsigned long file_lru = zone_page_state(zone, NR_INACTIVE_FILE) +
    		zone_page_state(zone, NR_ACTIVE_FILE);
    
    	/*
    	 * It's possible for there to be more file mapped pages than
    	 * accounted for by the pages on the file LRU lists because
    	 * tmpfs pages accounted for as ANON can also be FILE_MAPPED
    	 */
    	return (file_lru > file_mapped) ? (file_lru - file_mapped) : 0;
    }
    
    /* Work out how many page cache pages we can reclaim in this reclaim_mode */
    static long zone_pagecache_reclaimable(struct zone *zone)
    {
    	long nr_pagecache_reclaimable;
    	long delta = 0;
    
    	/*
    	 * If RECLAIM_SWAP is set, then all file pages are considered
    	 * potentially reclaimable. Otherwise, we have to worry about
	 * pages like swapcache, and zone_unmapped_file_pages() provides
	 * a better estimate.
    	 */
    	if (zone_reclaim_mode & RECLAIM_SWAP)
    		nr_pagecache_reclaimable = zone_page_state(zone, NR_FILE_PAGES);
    	else
    		nr_pagecache_reclaimable = zone_unmapped_file_pages(zone);
    
    	/* If we can't clean pages, remove dirty pages from consideration */
    	if (!(zone_reclaim_mode & RECLAIM_WRITE))
    		delta += zone_page_state(zone, NR_FILE_DIRTY);
    
    	/* Watch for any possible underflows due to delta */
    	if (unlikely(delta > nr_pagecache_reclaimable))
    		delta = nr_pagecache_reclaimable;
    
    	return nr_pagecache_reclaimable - delta;
    }
    
    
    /*
     * Try to free up some pages from this zone through reclaim.
     */
    
    static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
{
    
    	/* Minimum pages needed in order to stay on node */
    
    	const unsigned long nr_pages = 1 << order;
    
    	struct task_struct *p = current;
    	struct reclaim_state reclaim_state;
    
    	struct scan_control sc = {
    		.may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE),
    
    		.may_unmap = !!(zone_reclaim_mode & RECLAIM_SWAP),
    
    		.may_swap = 1,
    
    		.nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX),
    
    		.gfp_mask = (gfp_mask = memalloc_noio_flags(gfp_mask)),
		.order = order,
    
    		.priority = ZONE_RECLAIM_PRIORITY,
	};
    
    	struct shrink_control shrink = {
    		.gfp_mask = sc.gfp_mask,
    	};
    
    	unsigned long nr_slab_pages0, nr_slab_pages1;
    
    	/*
    	 * We need to be able to allocate from the reserves for RECLAIM_SWAP
    	 * and we also need to be able to write out pages for RECLAIM_WRITE
    	 * and RECLAIM_SWAP.
    	 */
    	p->flags |= PF_MEMALLOC | PF_SWAPWRITE;
    
    	lockdep_set_current_reclaim_state(gfp_mask);
    
    	reclaim_state.reclaimed_slab = 0;
    	p->reclaim_state = &reclaim_state;
    
    	if (zone_pagecache_reclaimable(zone) > zone->min_unmapped_pages) {
    
    		/*
    		 * Free memory by calling shrink zone with increasing
    		 * priorities until we have enough memory freed.
    		 */
    		do {
    
    			shrink_zone(zone, &sc);
    		} while (sc.nr_reclaimed < nr_pages && --sc.priority >= 0);
	}
    
    	nr_slab_pages0 = zone_page_state(zone, NR_SLAB_RECLAIMABLE);
    	if (nr_slab_pages0 > zone->min_slab_pages) {
		/*
		 * shrink_slab() does not currently allow us to determine how
		 * many pages were freed in this zone. So we take the current
		 * number of slab pages and shake the slab until it is reduced
		 * by the same nr_pages that we used for reclaiming unmapped
		 * pages.
		 *
		 * Note that shrink_slab will free memory on all zones and may
		 * take a long time.
		 */
    		for (;;) {
    			unsigned long lru_pages = zone_reclaimable_pages(zone);
    
    			/* No reclaimable slab or very low memory pressure */
    
    			if (!shrink_slab(&shrink, sc.nr_scanned, lru_pages))
    
    				break;
    
    			/* Freed enough memory */
    			nr_slab_pages1 = zone_page_state(zone,
    							NR_SLAB_RECLAIMABLE);
    			if (nr_slab_pages1 + nr_pages <= nr_slab_pages0)
    				break;
    		}
    
    
    		/*
    		 * Update nr_reclaimed by the number of slab pages we
    		 * reclaimed from this zone.
    		 */
    
    		nr_slab_pages1 = zone_page_state(zone, NR_SLAB_RECLAIMABLE);
    		if (nr_slab_pages1 < nr_slab_pages0)
    			sc.nr_reclaimed += nr_slab_pages0 - nr_slab_pages1;
	}
    
    	p->reclaim_state = NULL;
    
    	current->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE);
    
    	lockdep_clear_current_reclaim_state();
    
    	return sc.nr_reclaimed >= nr_pages;
}
    
    
    int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
    {
    	int node_id;
	int ret;

	/*
	 * Zone reclaim reclaims unmapped file backed pages and
	 * slab pages if we are over the defined limits.
	 *
	 * A small portion of unmapped file backed pages is needed for
	 * file I/O otherwise pages read by file I/O will be immediately
	 * thrown out if the zone is overallocated. So we do not reclaim
	 * if less than a specified percentage of the zone is used by
	 * unmapped file backed pages.
	 */
    	if (zone_pagecache_reclaimable(zone) <= zone->min_unmapped_pages &&
    	    zone_page_state(zone, NR_SLAB_RECLAIMABLE) <= zone->min_slab_pages)
		return ZONE_RECLAIM_FULL;

    
    	if (zone->all_unreclaimable)
		return ZONE_RECLAIM_FULL;
    
	/*
	 * Do not scan if the allocation should not be delayed.
	 */
    
    	if (!(gfp_mask & __GFP_WAIT) || (current->flags & PF_MEMALLOC))
		return ZONE_RECLAIM_NOSCAN;
    
    
    	/*
    	 * Only run zone reclaim on the local zone or on zones that do not
    	 * have associated processors. This will favor the local processor
    	 * over remote processors and spread off node memory allocations
    	 * as wide as possible.
    	 */
    
    	node_id = zone_to_nid(zone);
    
    	if (node_state(node_id, N_CPU) && node_id != numa_node_id())
		return ZONE_RECLAIM_NOSCAN;
    
    
    	if (zone_test_and_set_flag(zone, ZONE_RECLAIM_LOCKED))
		return ZONE_RECLAIM_NOSCAN;
    
    	ret = __zone_reclaim(zone, gfp_mask, order);
    	zone_clear_flag(zone, ZONE_RECLAIM_LOCKED);
    
    
    	if (!ret)
    		count_vm_event(PGSCAN_ZONE_RECLAIM_FAILED);

	return ret;
}
#endif /* CONFIG_NUMA */
    
    
    
    /*
     * page_evictable - test whether a page is evictable
     * @page: the page to test
     *
 * Test whether page is evictable--i.e., should be placed on active/inactive
 * lists vs unevictable list.
 *
 * Reasons page might not be evictable:
 * (1) page's mapping marked unevictable
 * (2) page is part of an mlocked VMA
 */
int page_evictable(struct page *page)
{
	return !mapping_unevictable(page_mapping(page)) && !PageMlocked(page);
}

    #ifdef CONFIG_SHMEM
    
/**
 * check_move_unevictable_pages - check pages for evictability and move to appropriate zone lru list
 * @pages:	array of pages to check
 * @nr_pages:	number of pages to check
 *
 * Checks pages for evictability and moves them to the appropriate lru list.
 *
 * This function is only used for SysV IPC SHM_UNLOCK.
 */
void check_move_unevictable_pages(struct page **pages, int nr_pages)
{
    	struct lruvec *lruvec;
    
    	struct zone *zone = NULL;
    	int pgscanned = 0;
    	int pgrescued = 0;
    	int i;
    
    	for (i = 0; i < nr_pages; i++) {
    		struct page *page = pages[i];
    		struct zone *pagezone;
    
    		pgscanned++;
    		pagezone = page_zone(page);
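		/*
		 * Batch zone->lru_lock: it is only dropped and re-taken when
		 * the walk crosses into a different zone.
		 */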
    		if (pagezone != zone) {
    			if (zone)
    				spin_unlock_irq(&zone->lru_lock);
    			zone = pagezone;
    			spin_lock_irq(&zone->lru_lock);
    		}
    
    		lruvec = mem_cgroup_page_lruvec(page, zone);
    
    		if (!PageLRU(page) || !PageUnevictable(page))
    			continue;
    
    		if (page_evictable(page)) {
    
    			enum lru_list lru = page_lru_base_type(page);
    
    			VM_BUG_ON(PageActive(page));
    			ClearPageUnevictable(page);
    
    			del_page_from_lru_list(page, lruvec, LRU_UNEVICTABLE);
    			add_page_to_lru_list(page, lruvec, lru);
			pgrescued++;
		}
	}
    
    	if (zone) {
    		__count_vm_events(UNEVICTABLE_PGRESCUED, pgrescued);
    		__count_vm_events(UNEVICTABLE_PGSCANNED, pgscanned);
    		spin_unlock_irq(&zone->lru_lock);
	}
}
    
    #endif /* CONFIG_SHMEM */
    
    static void warn_scan_unevictable_pages(void)
{
	printk_once(KERN_WARNING
    		    "%s: The scan_unevictable_pages sysctl/node-interface has been "
    
    		    "disabled for lack of a legitimate use case.  If you have "
    
    		    "one, please send an email to linux-mm@kvack.org.\n",
    		    current->comm);
    
    }
    
    /*
     * scan_unevictable_pages [vm] sysctl handler.  On demand re-scan of
     * all nodes' unevictable lists for evictable pages
     */
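/*
 * The interface is kept only for compatibility: the handlers below emit the
 * deprecation warning above and otherwise do nothing.
 */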
    unsigned long scan_unevictable_pages;
    
    int scan_unevictable_handler(struct ctl_table *table, int write,
			   void __user *buffer,
    			   size_t *length, loff_t *ppos)
    {
	warn_scan_unevictable_pages();
    
    	proc_doulongvec_minmax(table, write, buffer, length, ppos);
    
    	scan_unevictable_pages = 0;
    	return 0;
    }
    
    
    /*
     * per node 'scan_unevictable_pages' attribute.  On demand re-scan of
     * a specified node's per zone unevictable lists for evictable pages.
     */
    
    
    static ssize_t read_scan_unevictable_node(struct device *dev,
    					  struct device_attribute *attr,
					  char *buf)
{
	warn_scan_unevictable_pages();
    	return sprintf(buf, "0\n");	/* always zero; should fit... */
    }
    
    
    static ssize_t write_scan_unevictable_node(struct device *dev,
    					   struct device_attribute *attr,
    
    					const char *buf, size_t count)
    {
	warn_scan_unevictable_pages();
	return 1;
}

    static DEVICE_ATTR(scan_unevictable_pages, S_IRUGO | S_IWUSR,
    
    			read_scan_unevictable_node,
    			write_scan_unevictable_node);
    
    int scan_unevictable_register_node(struct node *node)
    {
    
    	return device_create_file(&node->dev, &dev_attr_scan_unevictable_pages);
    
    }
    
    void scan_unevictable_unregister_node(struct node *node)
    {
    
    	device_remove_file(&node->dev, &dev_attr_scan_unevictable_pages);
}