Newer
Older
balanced_order == new_order) {

Mel Gorman
committed
new_order = pgdat->kswapd_max_order;
new_classzone_idx = pgdat->classzone_idx;
pgdat->kswapd_max_order = 0;
pgdat->classzone_idx = pgdat->nr_zones - 1;
}
if (order < new_order || classzone_idx > new_classzone_idx) {
/*
* Don't sleep if someone wants a larger 'order'
* allocation or has tigher zone constraints
classzone_idx = new_classzone_idx;
kswapd_try_to_sleep(pgdat, balanced_order,
balanced_classzone_idx);
classzone_idx = pgdat->classzone_idx;
new_order = order;
new_classzone_idx = classzone_idx;
pgdat->kswapd_max_order = 0;

Mel Gorman
committed
pgdat->classzone_idx = pgdat->nr_zones - 1;

David Rientjes
committed
ret = try_to_freeze();
if (kthread_should_stop())
break;
/*
* We can speed up thawing tasks if we don't call balance_pgdat
* after returning from the refrigerator
*/

Mel Gorman
committed
if (!ret) {
trace_mm_vmscan_kswapd_wake(pgdat->node_id, order);
balanced_classzone_idx = classzone_idx;
balanced_order = balance_pgdat(pgdat, order,
&balanced_classzone_idx);

Mel Gorman
committed
}

Takamori Yamaguchi
committed
current->reclaim_state = NULL;
return 0;
}
/*
* A zone is low on free memory, so wake its kswapd task to service it.
*/
void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx)
if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
pgdat = zone->zone_pgdat;
if (pgdat->kswapd_max_order < order) {
pgdat->classzone_idx = min(pgdat->classzone_idx, classzone_idx);
}
if (!waitqueue_active(&pgdat->kswapd_wait))
if (zone_watermark_ok_safe(zone, order, low_wmark_pages(zone), 0, 0))
return;
trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, zone_idx(zone), order);
wake_up_interruptible(&pgdat->kswapd_wait);
/*
* The reclaimable count would be mostly accurate.
* The less reclaimable pages may be
* - mlocked pages, which will be moved to unevictable list when encountered
* - mapped pages, which may require several travels to be reclaimed
* - dirty pages, which is not "instantly" reclaimable
*/
unsigned long global_reclaimable_pages(void)
int nr;
nr = global_page_state(NR_ACTIVE_FILE) +
global_page_state(NR_INACTIVE_FILE);
nr += global_page_state(NR_ACTIVE_ANON) +
global_page_state(NR_INACTIVE_ANON);
return nr;
}
unsigned long zone_reclaimable_pages(struct zone *zone)
{
int nr;
nr = zone_page_state(zone, NR_ACTIVE_FILE) +
zone_page_state(zone, NR_INACTIVE_FILE);
nr += zone_page_state(zone, NR_ACTIVE_ANON) +
zone_page_state(zone, NR_INACTIVE_ANON);
return nr;
#ifdef CONFIG_HIBERNATION
* Try to free `nr_to_reclaim' of memory, system-wide, and return the number of
* freed pages.
*
* Rather than trying to age LRUs the aim is to preserve the overall
* LRU order by reclaiming preferentially
* inactive > active > active referenced > active mapped
unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
struct reclaim_state reclaim_state;
struct scan_control sc = {
.gfp_mask = GFP_HIGHUSER_MOVABLE,
.may_swap = 1,
.may_unmap = 1,
.nr_to_reclaim = nr_to_reclaim,
.hibernation_mode = 1,
.order = 0,
.priority = DEF_PRIORITY,
struct shrink_control shrink = {
.gfp_mask = sc.gfp_mask,
};
struct zonelist *zonelist = node_zonelist(numa_node_id(), sc.gfp_mask);
struct task_struct *p = current;
unsigned long nr_reclaimed;
p->flags |= PF_MEMALLOC;
lockdep_set_current_reclaim_state(sc.gfp_mask);
reclaim_state.reclaimed_slab = 0;
p->reclaim_state = &reclaim_state;
nr_reclaimed = do_try_to_free_pages(zonelist, &sc, &shrink);
p->reclaim_state = NULL;
lockdep_clear_current_reclaim_state();
p->flags &= ~PF_MEMALLOC;
return nr_reclaimed;
#endif /* CONFIG_HIBERNATION */
/* It's optimal to keep kswapds on the same CPUs as their memory, but
not required for correctness. So if the last cpu in a node goes
away, we get changed to run anywhere: as the first one comes back,
restore their cpu bindings. */
static int cpu_callback(struct notifier_block *nfb, unsigned long action,
void *hcpu)
if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN) {
for_each_node_state(nid, N_MEMORY) {
pg_data_t *pgdat = NODE_DATA(nid);
const struct cpumask *mask;
mask = cpumask_of_node(pgdat->node_id);
if (cpumask_any_and(cpu_online_mask, mask) < nr_cpu_ids)
set_cpus_allowed_ptr(pgdat->kswapd, mask);
/*
* This kswapd start function will be called by init and node-hot-add.
* On node-hot-add, kswapd will moved to proper cpus if cpus are hot-added.
*/
int kswapd_run(int nid)
{
pg_data_t *pgdat = NODE_DATA(nid);
int ret = 0;
if (pgdat->kswapd)
return 0;
pgdat->kswapd = kthread_run(kswapd, pgdat, "kswapd%d", nid);
if (IS_ERR(pgdat->kswapd)) {
/* failure at boot is fatal */
BUG_ON(system_state == SYSTEM_BOOTING);
pgdat->kswapd = NULL;
pr_err("Failed to start kswapd on node %d\n", nid);
ret = PTR_ERR(pgdat->kswapd);
}
return ret;
}

David Rientjes
committed
/*
* Called by memory hotplug when all memory in a node is offlined. Caller must
* hold lock_memory_hotplug().

David Rientjes
committed
*/
void kswapd_stop(int nid)
{
struct task_struct *kswapd = NODE_DATA(nid)->kswapd;
if (kswapd) {

David Rientjes
committed
kthread_stop(kswapd);
NODE_DATA(nid)->kswapd = NULL;
}

David Rientjes
committed
}
int nid;
for_each_node_state(nid, N_MEMORY)
kswapd_run(nid);
hotcpu_notifier(cpu_callback, 0);
return 0;
}
module_init(kswapd_init)
#ifdef CONFIG_NUMA
/*
* Zone reclaim mode
*
* If non-zero call zone_reclaim when the number of free pages falls below
* the watermarks.
*/
int zone_reclaim_mode __read_mostly;
#define RECLAIM_OFF 0
#define RECLAIM_ZONE (1<<0) /* Run shrink_inactive_list on the zone */
#define RECLAIM_WRITE (1<<1) /* Writeout pages during reclaim */
#define RECLAIM_SWAP (1<<2) /* Swap pages out during reclaim */
/*
* Priority for ZONE_RECLAIM. This determines the fraction of pages
* of a node considered for each zone_reclaim. 4 scans 1/16th of
* a zone.
*/
#define ZONE_RECLAIM_PRIORITY 4

Christoph Lameter
committed
/*
* Percentage of pages in a zone that must be unmapped for zone_reclaim to
* occur.
*/
int sysctl_min_unmapped_ratio = 1;
/*
* If the number of slab pages in a zone grows beyond this percentage then
* slab reclaim needs to occur.
*/
int sysctl_min_slab_ratio = 5;

Mel Gorman
committed
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
static inline unsigned long zone_unmapped_file_pages(struct zone *zone)
{
unsigned long file_mapped = zone_page_state(zone, NR_FILE_MAPPED);
unsigned long file_lru = zone_page_state(zone, NR_INACTIVE_FILE) +
zone_page_state(zone, NR_ACTIVE_FILE);
/*
* It's possible for there to be more file mapped pages than
* accounted for by the pages on the file LRU lists because
* tmpfs pages accounted for as ANON can also be FILE_MAPPED
*/
return (file_lru > file_mapped) ? (file_lru - file_mapped) : 0;
}
/* Work out how many page cache pages we can reclaim in this reclaim_mode */
static long zone_pagecache_reclaimable(struct zone *zone)
{
long nr_pagecache_reclaimable;
long delta = 0;
/*
* If RECLAIM_SWAP is set, then all file pages are considered
* potentially reclaimable. Otherwise, we have to worry about
* pages like swapcache and zone_unmapped_file_pages() provides
* a better estimate
*/
if (zone_reclaim_mode & RECLAIM_SWAP)
nr_pagecache_reclaimable = zone_page_state(zone, NR_FILE_PAGES);
else
nr_pagecache_reclaimable = zone_unmapped_file_pages(zone);
/* If we can't clean pages, remove dirty pages from consideration */
if (!(zone_reclaim_mode & RECLAIM_WRITE))
delta += zone_page_state(zone, NR_FILE_DIRTY);
/* Watch for any possible underflows due to delta */
if (unlikely(delta > nr_pagecache_reclaimable))
delta = nr_pagecache_reclaimable;
return nr_pagecache_reclaimable - delta;
}
/*
* Try to free up some pages from this zone through reclaim.
*/
static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
/* Minimum pages needed in order to stay on node */
const unsigned long nr_pages = 1 << order;
struct task_struct *p = current;
struct reclaim_state reclaim_state;
struct scan_control sc = {
.may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE),
.may_unmap = !!(zone_reclaim_mode & RECLAIM_SWAP),
.nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX),
.gfp_mask = (gfp_mask = memalloc_noio_flags(gfp_mask)),
.priority = ZONE_RECLAIM_PRIORITY,
struct shrink_control shrink = {
.gfp_mask = sc.gfp_mask,
};
unsigned long nr_slab_pages0, nr_slab_pages1;
/*
* We need to be able to allocate from the reserves for RECLAIM_SWAP
* and we also need to be able to write out pages for RECLAIM_WRITE
* and RECLAIM_SWAP.
*/
p->flags |= PF_MEMALLOC | PF_SWAPWRITE;
lockdep_set_current_reclaim_state(gfp_mask);
reclaim_state.reclaimed_slab = 0;
p->reclaim_state = &reclaim_state;

Mel Gorman
committed
if (zone_pagecache_reclaimable(zone) > zone->min_unmapped_pages) {
/*
* Free memory by calling shrink zone with increasing
* priorities until we have enough memory freed.
*/
do {
shrink_zone(zone, &sc);
} while (sc.nr_reclaimed < nr_pages && --sc.priority >= 0);
nr_slab_pages0 = zone_page_state(zone, NR_SLAB_RECLAIMABLE);
if (nr_slab_pages0 > zone->min_slab_pages) {
* shrink_slab() does not currently allow us to determine how
* many pages were freed in this zone. So we take the current
* number of slab pages and shake the slab until it is reduced
* by the same nr_pages that we used for reclaiming unmapped
* pages.
* Note that shrink_slab will free memory on all zones and may
* take a long time.

KOSAKI Motohiro
committed
for (;;) {
unsigned long lru_pages = zone_reclaimable_pages(zone);
/* No reclaimable slab or very low memory pressure */
if (!shrink_slab(&shrink, sc.nr_scanned, lru_pages))

KOSAKI Motohiro
committed
break;
/* Freed enough memory */
nr_slab_pages1 = zone_page_state(zone,
NR_SLAB_RECLAIMABLE);
if (nr_slab_pages1 + nr_pages <= nr_slab_pages0)
break;
}
/*
* Update nr_reclaimed by the number of slab pages we
* reclaimed from this zone.
*/
nr_slab_pages1 = zone_page_state(zone, NR_SLAB_RECLAIMABLE);
if (nr_slab_pages1 < nr_slab_pages0)
sc.nr_reclaimed += nr_slab_pages0 - nr_slab_pages1;
current->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE);
lockdep_clear_current_reclaim_state();
return sc.nr_reclaimed >= nr_pages;
int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
{
int node_id;
* Zone reclaim reclaims unmapped file backed pages and
* slab pages if we are over the defined limits.

Christoph Lameter
committed
*

Christoph Lameter
committed
* A small portion of unmapped file backed pages is needed for
* file I/O otherwise pages read by file I/O will be immediately
* thrown out if the zone is overallocated. So we do not reclaim
* if less than a specified percentage of the zone is used by
* unmapped file backed pages.

Mel Gorman
committed
if (zone_pagecache_reclaimable(zone) <= zone->min_unmapped_pages &&
zone_page_state(zone, NR_SLAB_RECLAIMABLE) <= zone->min_slab_pages)
return ZONE_RECLAIM_FULL;
if (zone->all_unreclaimable)
return ZONE_RECLAIM_FULL;
* Do not scan if the allocation should not be delayed.
if (!(gfp_mask & __GFP_WAIT) || (current->flags & PF_MEMALLOC))
return ZONE_RECLAIM_NOSCAN;
/*
* Only run zone reclaim on the local zone or on zones that do not
* have associated processors. This will favor the local processor
* over remote processors and spread off node memory allocations
* as wide as possible.
*/
node_id = zone_to_nid(zone);
if (node_state(node_id, N_CPU) && node_id != numa_node_id())
return ZONE_RECLAIM_NOSCAN;
if (zone_test_and_set_flag(zone, ZONE_RECLAIM_LOCKED))
return ZONE_RECLAIM_NOSCAN;
ret = __zone_reclaim(zone, gfp_mask, order);
zone_clear_flag(zone, ZONE_RECLAIM_LOCKED);
if (!ret)
count_vm_event(PGSCAN_ZONE_RECLAIM_FAILED);
return ret;
/*
* page_evictable - test whether a page is evictable
* @page: the page to test
*
* Test whether page is evictable--i.e., should be placed on active/inactive
*
* Reasons page might not be evictable:
* (1) page's mapping marked unevictable
* (2) page is part of an mlocked VMA
int page_evictable(struct page *page)
return !mapping_unevictable(page_mapping(page)) && !PageMlocked(page);
* check_move_unevictable_pages - check pages for evictability and move to appropriate zone lru list
* @pages: array of pages to check
* @nr_pages: number of pages to check
* Checks pages for evictability and moves them to the appropriate lru list.
*
* This function is only used for SysV IPC SHM_UNLOCK.
void check_move_unevictable_pages(struct page **pages, int nr_pages)
struct zone *zone = NULL;
int pgscanned = 0;
int pgrescued = 0;
int i;
for (i = 0; i < nr_pages; i++) {
struct page *page = pages[i];
struct zone *pagezone;
pgscanned++;
pagezone = page_zone(page);
if (pagezone != zone) {
if (zone)
spin_unlock_irq(&zone->lru_lock);
zone = pagezone;
spin_lock_irq(&zone->lru_lock);
}
lruvec = mem_cgroup_page_lruvec(page, zone);
if (!PageLRU(page) || !PageUnevictable(page))
continue;
enum lru_list lru = page_lru_base_type(page);
VM_BUG_ON(PageActive(page));
ClearPageUnevictable(page);
del_page_from_lru_list(page, lruvec, LRU_UNEVICTABLE);
add_page_to_lru_list(page, lruvec, lru);
if (zone) {
__count_vm_events(UNEVICTABLE_PGRESCUED, pgrescued);
__count_vm_events(UNEVICTABLE_PGSCANNED, pgscanned);
spin_unlock_irq(&zone->lru_lock);
static void warn_scan_unevictable_pages(void)
printk_once(KERN_WARNING
"%s: The scan_unevictable_pages sysctl/node-interface has been "
"disabled for lack of a legitimate use case. If you have "
"one, please send an email to linux-mm@kvack.org.\n",
current->comm);
}
/*
* scan_unevictable_pages [vm] sysctl handler. On demand re-scan of
* all nodes' unevictable lists for evictable pages
*/
unsigned long scan_unevictable_pages;
int scan_unevictable_handler(struct ctl_table *table, int write,
void __user *buffer,
size_t *length, loff_t *ppos)
{
warn_scan_unevictable_pages();
proc_doulongvec_minmax(table, write, buffer, length, ppos);
scan_unevictable_pages = 0;
return 0;
}

Thadeu Lima de Souza Cascardo
committed
#ifdef CONFIG_NUMA
/*
* per node 'scan_unevictable_pages' attribute. On demand re-scan of
* a specified node's per zone unevictable lists for evictable pages.
*/
static ssize_t read_scan_unevictable_node(struct device *dev,
struct device_attribute *attr,
warn_scan_unevictable_pages();
return sprintf(buf, "0\n"); /* always zero; should fit... */
}
static ssize_t write_scan_unevictable_node(struct device *dev,
struct device_attribute *attr,
const char *buf, size_t count)
{
warn_scan_unevictable_pages();
static DEVICE_ATTR(scan_unevictable_pages, S_IRUGO | S_IWUSR,
read_scan_unevictable_node,
write_scan_unevictable_node);
int scan_unevictable_register_node(struct node *node)
{
return device_create_file(&node->dev, &dev_attr_scan_unevictable_pages);
}
void scan_unevictable_unregister_node(struct node *node)
{
device_remove_file(&node->dev, &dev_attr_scan_unevictable_pages);