	p->flags |= PF_MEMALLOC;
	lockdep_set_current_reclaim_state(sc.gfp_mask);
	reclaim_state.reclaimed_slab = 0;
	p->reclaim_state = &reclaim_state;

	nr_reclaimed = do_try_to_free_pages(zonelist, &sc, &shrink);

	p->reclaim_state = NULL;
	lockdep_clear_current_reclaim_state();
	p->flags &= ~PF_MEMALLOC;

	return nr_reclaimed;
}
#endif /* CONFIG_HIBERNATION */

/* It's optimal to keep kswapds on the same CPUs as their memory, but
   not required for correctness. So if the last cpu in a node goes
   away, we get changed to run anywhere: as the first one comes back,
   restore their cpu bindings. */
static int __devinit cpu_callback(struct notifier_block *nfb,
				  unsigned long action, void *hcpu)
{
	int nid;

	if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN) {
		for_each_node_state(nid, N_HIGH_MEMORY) {
			pg_data_t *pgdat = NODE_DATA(nid);
			const struct cpumask *mask;

			mask = cpumask_of_node(pgdat->node_id);

			if (cpumask_any_and(cpu_online_mask, mask) < nr_cpu_ids)
				set_cpus_allowed_ptr(pgdat->kswapd, mask);
		}
	}
	return NOTIFY_OK;
}

/*
 * This kswapd start function will be called by init and node-hot-add.
 * On node-hot-add, kswapd will be moved to proper cpus if cpus are hot-added.
 */
int kswapd_run(int nid)
{
	pg_data_t *pgdat = NODE_DATA(nid);
	int ret = 0;

	if (pgdat->kswapd)
		return 0;

	pgdat->kswapd = kthread_run(kswapd, pgdat, "kswapd%d", nid);
	if (IS_ERR(pgdat->kswapd)) {
		/* failure at boot is fatal */
		BUG_ON(system_state == SYSTEM_BOOTING);
		printk("Failed to start kswapd on node %d\n", nid);
		ret = -1;
	}
	return ret;
}

/*
 * Called by memory hotplug when all memory in a node is offlined.
 */
void kswapd_stop(int nid)
{
	struct task_struct *kswapd = NODE_DATA(nid)->kswapd;

	if (kswapd)
		kthread_stop(kswapd);
}

static int __init kswapd_init(void)
{
	int nid;

	for_each_node_state(nid, N_HIGH_MEMORY)
		kswapd_run(nid);
	hotcpu_notifier(cpu_callback, 0);
	return 0;
}
module_init(kswapd_init)

#ifdef CONFIG_NUMA
/*
 * Zone reclaim mode
 *
 * If non-zero call zone_reclaim when the number of free pages falls below
 * the watermarks.
 */
int zone_reclaim_mode __read_mostly;
#define RECLAIM_OFF 0
#define RECLAIM_ZONE (1<<0) /* Run shrink_inactive_list on the zone */
#define RECLAIM_WRITE (1<<1) /* Writeout pages during reclaim */
#define RECLAIM_SWAP (1<<2) /* Swap pages out during reclaim */
/*
 * Priority for ZONE_RECLAIM. This determines the fraction of pages
 * of a node considered for each zone_reclaim. 4 scans 1/16th of
 * a zone.
 */
#define ZONE_RECLAIM_PRIORITY 4
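
/*
 * A minimal sketch of how the knobs above combine (the example_* helpers
 * below are hypothetical and not part of this file): zone_reclaim_mode is
 * a bitmask of the RECLAIM_* flags, and a reclaim pass at a given priority
 * considers roughly zone_size >> priority pages, so a ZONE_RECLAIM_PRIORITY
 * of 4 starts with about 1/16th of the zone.
 */
static inline int example_reclaim_may_writepage(int reclaim_mode)
{
	/* e.g. a mode of (RECLAIM_ZONE | RECLAIM_WRITE) permits writeout */
	return !!(reclaim_mode & RECLAIM_WRITE);
}

static inline unsigned long example_zone_reclaim_scan(unsigned long zone_pages,
						      int priority)
{
	/* priority 4 -> zone_pages / 16; priority 0 -> the whole zone */
	return zone_pages >> priority;
}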

/*
 * Percentage of pages in a zone that must be unmapped for zone_reclaim to
 * occur.
 */
int sysctl_min_unmapped_ratio = 1;
/*
 * If the number of slab pages in a zone grows beyond this percentage then
 * slab reclaim needs to occur.
 */
int sysctl_min_slab_ratio = 5;
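
/*
 * A minimal sketch of the unit conversion implied above (hypothetical
 * helper, not part of this file): the *_ratio sysctls are percentages of a
 * zone's pages, and the per-zone thresholds that zone reclaim compares
 * against (min_unmapped_pages, min_slab_pages) follow from them roughly as:
 */
static inline unsigned long example_ratio_to_pages(unsigned long zone_pages,
						   unsigned int ratio)
{
	/* e.g. a 5% slab ratio on a 1,000,000-page zone is 50,000 pages */
	return zone_pages * ratio / 100;
}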

static inline unsigned long zone_unmapped_file_pages(struct zone *zone)
{
	unsigned long file_mapped = zone_page_state(zone, NR_FILE_MAPPED);
	unsigned long file_lru = zone_page_state(zone, NR_INACTIVE_FILE) +
		zone_page_state(zone, NR_ACTIVE_FILE);

	/*
	 * It's possible for there to be more file mapped pages than
	 * accounted for by the pages on the file LRU lists because
	 * tmpfs pages accounted for as ANON can also be FILE_MAPPED
	 */
	return (file_lru > file_mapped) ? (file_lru - file_mapped) : 0;
}

/* Work out how many page cache pages we can reclaim in this reclaim_mode */
static long zone_pagecache_reclaimable(struct zone *zone)
{
	long nr_pagecache_reclaimable;
	long delta = 0;

	/*
	 * If RECLAIM_SWAP is set, then all file pages are considered
	 * potentially reclaimable. Otherwise, we have to worry about
	 * pages like swapcache and zone_unmapped_file_pages() provides
	 * a better estimate
	 */
	if (zone_reclaim_mode & RECLAIM_SWAP)
		nr_pagecache_reclaimable = zone_page_state(zone, NR_FILE_PAGES);
	else
		nr_pagecache_reclaimable = zone_unmapped_file_pages(zone);

	/* If we can't clean pages, remove dirty pages from consideration */
	if (!(zone_reclaim_mode & RECLAIM_WRITE))
		delta += zone_page_state(zone, NR_FILE_DIRTY);

	/* Watch for any possible underflows due to delta */
	if (unlikely(delta > nr_pagecache_reclaimable))
		delta = nr_pagecache_reclaimable;

	return nr_pagecache_reclaimable - delta;
}

/*
 * Try to free up some pages from this zone through reclaim.
 */
static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
{
	/* Minimum pages needed in order to stay on node */
	const unsigned long nr_pages = 1 << order;
	struct task_struct *p = current;
	struct reclaim_state reclaim_state;
	int priority;
	struct scan_control sc = {
		.may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE),
		.may_unmap = !!(zone_reclaim_mode & RECLAIM_SWAP),
		.nr_to_reclaim = max_t(unsigned long, nr_pages,
				       SWAP_CLUSTER_MAX),
		.gfp_mask = gfp_mask,
	};
	struct shrink_control shrink = {
		.gfp_mask = sc.gfp_mask,
	};
	unsigned long nr_slab_pages0, nr_slab_pages1;

	/*
	 * We need to be able to allocate from the reserves for RECLAIM_SWAP
	 * and we also need to be able to write out pages for RECLAIM_WRITE
	 * and RECLAIM_SWAP.
	 */
	p->flags |= PF_MEMALLOC | PF_SWAPWRITE;
	lockdep_set_current_reclaim_state(gfp_mask);
	reclaim_state.reclaimed_slab = 0;
	p->reclaim_state = &reclaim_state;

	if (zone_pagecache_reclaimable(zone) > zone->min_unmapped_pages) {
		/*
		 * Free memory by calling shrink zone with increasing
		 * priorities until we have enough memory freed.
		 */
		priority = ZONE_RECLAIM_PRIORITY;
		do {
			shrink_zone(priority, zone, &sc);
			priority--;
		} while (priority >= 0 && sc.nr_reclaimed < nr_pages);
	}

	nr_slab_pages0 = zone_page_state(zone, NR_SLAB_RECLAIMABLE);
	if (nr_slab_pages0 > zone->min_slab_pages) {
		/*
		 * shrink_slab() does not currently allow us to determine how
		 * many pages were freed in this zone. So we take the current
		 * number of slab pages and shake the slab until it is reduced
		 * by the same nr_pages that we used for reclaiming unmapped
		 * pages.
		 *
		 * Note that shrink_slab will free memory on all zones and may
		 * take a long time.
		 */
		for (;;) {
			unsigned long lru_pages = zone_reclaimable_pages(zone);

			/* No reclaimable slab or very low memory pressure */
			if (!shrink_slab(&shrink, sc.nr_scanned, lru_pages))
				break;

			/* Freed enough memory */
			nr_slab_pages1 = zone_page_state(zone,
							NR_SLAB_RECLAIMABLE);
			if (nr_slab_pages1 + nr_pages <= nr_slab_pages0)
				break;
		}

		/*
		 * Update nr_reclaimed by the number of slab pages we
		 * reclaimed from this zone.
		 */
		nr_slab_pages1 = zone_page_state(zone, NR_SLAB_RECLAIMABLE);
		if (nr_slab_pages1 < nr_slab_pages0)
			sc.nr_reclaimed += nr_slab_pages0 - nr_slab_pages1;
	}

	p->reclaim_state = NULL;
	current->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE);
	lockdep_clear_current_reclaim_state();
	return sc.nr_reclaimed >= nr_pages;
}

int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
{
	int node_id;
	int ret;

	/*
	 * Zone reclaim reclaims unmapped file backed pages and
	 * slab pages if we are over the defined limits.
	 *
	 * A small portion of unmapped file backed pages is needed for
	 * file I/O otherwise pages read by file I/O will be immediately
	 * thrown out if the zone is overallocated. So we do not reclaim
	 * if less than a specified percentage of the zone is used by
	 * unmapped file backed pages.
	 */
	if (zone_pagecache_reclaimable(zone) <= zone->min_unmapped_pages &&
	    zone_page_state(zone, NR_SLAB_RECLAIMABLE) <= zone->min_slab_pages)
		return ZONE_RECLAIM_FULL;

	if (zone->all_unreclaimable)
		return ZONE_RECLAIM_FULL;

	/*
	 * Do not scan if the allocation should not be delayed.
	 */
	if (!(gfp_mask & __GFP_WAIT) || (current->flags & PF_MEMALLOC))
		return ZONE_RECLAIM_NOSCAN;

	/*
	 * Only run zone reclaim on the local zone or on zones that do not
	 * have associated processors. This will favor the local processor
	 * over remote processors and spread off node memory allocations
	 * as wide as possible.
	 */
	node_id = zone_to_nid(zone);
	if (node_state(node_id, N_CPU) && node_id != numa_node_id())
		return ZONE_RECLAIM_NOSCAN;

	if (zone_test_and_set_flag(zone, ZONE_RECLAIM_LOCKED))
		return ZONE_RECLAIM_NOSCAN;

	ret = __zone_reclaim(zone, gfp_mask, order);
	zone_clear_flag(zone, ZONE_RECLAIM_LOCKED);

	if (!ret)
		count_vm_event(PGSCAN_ZONE_RECLAIM_FAILED);

	return ret;
}
#endif

/*
 * page_evictable - test whether a page is evictable
 * @page: the page to test
 * @vma: the VMA in which the page is or will be mapped, may be NULL
 *
 * Test whether page is evictable--i.e., should be placed on active/inactive
 * lists vs unevictable list. The vma argument is !NULL when called from the
 * fault path to determine how to instantiate a new page.
 *
 * Reasons page might not be evictable:
 * (1) page's mapping marked unevictable
 * (2) page is part of an mlocked VMA
 */
int page_evictable(struct page *page, struct vm_area_struct *vma)
{
	if (mapping_unevictable(page_mapping(page)))
		return 0;

	if (PageMlocked(page) || (vma && is_mlocked_vma(vma, page)))
		return 0;

	return 1;
}

/**
 * check_move_unevictable_page - check page for evictability and move to appropriate zone lru list
 * @page: page to check evictability and move to appropriate lru list
 * @zone: zone page is in
 *
 * Checks a page for evictability and moves the page to the appropriate
 * zone lru list.
 *
 * Restrictions: zone->lru_lock must be held, page must be on LRU and must
 * have PageUnevictable set.
 */
static void check_move_unevictable_page(struct page *page, struct zone *zone)
{
	VM_BUG_ON(PageActive(page));

retry:
	ClearPageUnevictable(page);
	if (page_evictable(page, NULL)) {
		enum lru_list l = page_lru_base_type(page);

		__dec_zone_state(zone, NR_UNEVICTABLE);
		list_move(&page->lru, &zone->lru[l].list);
		mem_cgroup_move_lists(page, LRU_UNEVICTABLE, l);
		__inc_zone_state(zone, NR_INACTIVE_ANON + l);
		__count_vm_event(UNEVICTABLE_PGRESCUED);
	} else {
		/*
		 * rotate unevictable list
		 */
		SetPageUnevictable(page);
		list_move(&page->lru, &zone->lru[LRU_UNEVICTABLE].list);
		mem_cgroup_rotate_lru_list(page, LRU_UNEVICTABLE);
		if (page_evictable(page, NULL))
			goto retry;
	}
}

/**
 * scan_mapping_unevictable_pages - scan an address space for evictable pages
 * @mapping: struct address_space to scan for evictable pages
 *
 * Scan all pages in mapping. Check unevictable pages for
 * evictability and move them to the appropriate zone lru list.
 */
void scan_mapping_unevictable_pages(struct address_space *mapping)
{
	pgoff_t next = 0;
	pgoff_t end = (i_size_read(mapping->host) + PAGE_CACHE_SIZE - 1) >>
			PAGE_CACHE_SHIFT;
	struct zone *zone;
	struct pagevec pvec;

	if (mapping->nrpages == 0)
		return;

	pagevec_init(&pvec, 0);
	while (next < end &&
		pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) {
		int i;
		int pg_scanned = 0;

		zone = NULL;

		for (i = 0; i < pagevec_count(&pvec); i++) {
			struct page *page = pvec.pages[i];
			pgoff_t page_index = page->index;
			struct zone *pagezone = page_zone(page);

			pg_scanned++;
			if (page_index > next)
				next = page_index;
			next++;

			if (pagezone != zone) {
				if (zone)
					spin_unlock_irq(&zone->lru_lock);
				zone = pagezone;
				spin_lock_irq(&zone->lru_lock);
			}

			if (PageLRU(page) && PageUnevictable(page))
				check_move_unevictable_page(page, zone);
		}
		if (zone)
			spin_unlock_irq(&zone->lru_lock);
		pagevec_release(&pvec);

		count_vm_events(UNEVICTABLE_PGSCANNED, pg_scanned);
	}
}

/**
 * scan_zone_unevictable_pages - check unevictable list for evictable pages
 * @zone - zone of which to scan the unevictable list
 *
 * Scan @zone's unevictable LRU lists to check for pages that have become
 * evictable. Move those that have to @zone's inactive list where they
 * become candidates for reclaim, unless shrink_inactive_zone() decides
 * to reactivate them. Pages that are still unevictable are rotated
 * back onto @zone's unevictable list.
 */
#define SCAN_UNEVICTABLE_BATCH_SIZE 16UL /* arbitrary lock hold batch size */
static void scan_zone_unevictable_pages(struct zone *zone)
{
	struct list_head *l_unevictable = &zone->lru[LRU_UNEVICTABLE].list;
	unsigned long scan;
	unsigned long nr_to_scan = zone_page_state(zone, NR_UNEVICTABLE);

	while (nr_to_scan > 0) {
		unsigned long batch_size = min(nr_to_scan,
						SCAN_UNEVICTABLE_BATCH_SIZE);

		spin_lock_irq(&zone->lru_lock);
		for (scan = 0; scan < batch_size; scan++) {
			struct page *page = lru_to_page(l_unevictable);

			if (!trylock_page(page))
				continue;

			prefetchw_prev_lru_page(page, l_unevictable, flags);

			if (likely(PageLRU(page) && PageUnevictable(page)))
				check_move_unevictable_page(page, zone);

			unlock_page(page);
		}
		spin_unlock_irq(&zone->lru_lock);

		nr_to_scan -= batch_size;
	}
}

/**
 * scan_all_zones_unevictable_pages - scan all unevictable lists for evictable pages
 *
 * A really big hammer: scan all zones' unevictable LRU lists to check for
 * pages that have become evictable. Move those back to the zones'
 * inactive list where they become candidates for reclaim.
 * This occurs when, e.g., we have unswappable pages on the unevictable lists,
 * and we add swap to the system. As such, it runs in the context of a task
 * that has possibly/probably made some previously unevictable pages
 * evictable.
 */
static void scan_all_zones_unevictable_pages(void)
{
	struct zone *zone;

	for_each_zone(zone) {
		scan_zone_unevictable_pages(zone);
	}
}

/*
 * scan_unevictable_pages [vm] sysctl handler. On demand re-scan of
 * all nodes' unevictable lists for evictable pages
 */
unsigned long scan_unevictable_pages;

int scan_unevictable_handler(struct ctl_table *table, int write,
			   void __user *buffer,
			   size_t *length, loff_t *ppos)
{
	proc_doulongvec_minmax(table, write, buffer, length, ppos);

	if (write && *(unsigned long *)table->data)
		scan_all_zones_unevictable_pages();

	scan_unevictable_pages = 0;
	return 0;
}

#ifdef CONFIG_NUMA
/*
 * per node 'scan_unevictable_pages' attribute. On demand re-scan of
 * a specified node's per zone unevictable lists for evictable pages.
 */
static ssize_t read_scan_unevictable_node(struct sys_device *dev,
					  struct sysdev_attribute *attr,
					  char *buf)
{
	return sprintf(buf, "0\n");	/* always zero; should fit... */
}

static ssize_t write_scan_unevictable_node(struct sys_device *dev,
					   struct sysdev_attribute *attr,
					   const char *buf, size_t count)
{
	struct zone *node_zones = NODE_DATA(dev->id)->node_zones;
	struct zone *zone;
	unsigned long res;
	unsigned long req = strict_strtoul(buf, 10, &res);

	if (!req)
		return 1;	/* zero is no-op */

	for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) {
		if (!populated_zone(zone))
			continue;
		scan_zone_unevictable_pages(zone);
	}
	return 1;
}

static SYSDEV_ATTR(scan_unevictable_pages, S_IRUGO | S_IWUSR,
			read_scan_unevictable_node,
			write_scan_unevictable_node);

int scan_unevictable_register_node(struct node *node)
{
	return sysdev_create_file(&node->sysdev, &attr_scan_unevictable_pages);
}

void scan_unevictable_unregister_node(struct node *node)
{
	sysdev_remove_file(&node->sysdev, &attr_scan_unevictable_pages);
}