Newer
Older
struct zonelist *zonelist = node_zonelist(numa_node_id(), sc.gfp_mask);
struct task_struct *p = current;
unsigned long nr_reclaimed;
p->flags |= PF_MEMALLOC;
lockdep_set_current_reclaim_state(sc.gfp_mask);
reclaim_state.reclaimed_slab = 0;
p->reclaim_state = &reclaim_state;
nr_reclaimed = do_try_to_free_pages(zonelist, &sc, &shrink);
p->reclaim_state = NULL;
lockdep_clear_current_reclaim_state();
p->flags &= ~PF_MEMALLOC;
return nr_reclaimed;
#endif /* CONFIG_HIBERNATION */
/* It's optimal to keep kswapds on the same CPUs as their memory, but
not required for correctness. So if the last cpu in a node goes
away, we get changed to run anywhere: as the first one comes back,
restore their cpu bindings. */
static int __devinit cpu_callback(struct notifier_block *nfb,
if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN) {
for_each_node_state(nid, N_HIGH_MEMORY) {
pg_data_t *pgdat = NODE_DATA(nid);
const struct cpumask *mask;
mask = cpumask_of_node(pgdat->node_id);
if (cpumask_any_and(cpu_online_mask, mask) < nr_cpu_ids)
set_cpus_allowed_ptr(pgdat->kswapd, mask);
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
/*
 * This kswapd start function will be called by init and node-hot-add.
 * On node-hot-add, kswapd will be moved to proper cpus if cpus are hot-added.
 *
 * @nid: node whose kswapd thread should be started.
 *
 * Returns 0 if the node already has a kswapd thread or a new one was
 * started, -1 if the thread could not be created.  On failure
 * pgdat->kswapd is left NULL.
 */
int kswapd_run(int nid)
{
	pg_data_t *pgdat = NODE_DATA(nid);
	int ret = 0;

	/* A thread is already recorded for this node: nothing to do. */
	if (pgdat->kswapd)
		return 0;

	pgdat->kswapd = kthread_run(kswapd, pgdat, "kswapd%d", nid);
	if (IS_ERR(pgdat->kswapd)) {
		/* failure at boot is fatal */
		BUG_ON(system_state == SYSTEM_BOOTING);
		printk("Failed to start kswapd on node %d\n", nid);
		/*
		 * Do not leave the error pointer behind.  It is non-NULL, so
		 * the check at the top of this function would treat the node
		 * as already having a kswapd and never retry, and
		 * kswapd_stop() would hand it to kthread_stop().
		 */
		pgdat->kswapd = NULL;
		ret = -1;
	}
	return ret;
}

David Rientjes
committed
/*
 * Called by memory hotplug when all memory in a node is offlined.
 *
 * @nid: node whose kswapd thread should be stopped.
 *
 * Stops the node's kswapd thread, if one is recorded, and clears the
 * recorded pointer.
 */
void kswapd_stop(int nid)
{
	struct task_struct *kswapd = NODE_DATA(nid)->kswapd;

	if (kswapd) {
		kthread_stop(kswapd);
		/*
		 * Forget the stopped thread.  kswapd_run() returns early
		 * while this pointer is non-NULL, so leaving it set would
		 * prevent a new kswapd from being started for this node and
		 * a second kswapd_stop() would pass a stale task pointer to
		 * kthread_stop().
		 */
		NODE_DATA(nid)->kswapd = NULL;
	}
}
int nid;
for_each_node_state(nid, N_HIGH_MEMORY)
kswapd_run(nid);
hotcpu_notifier(cpu_callback, 0);
return 0;
}
module_init(kswapd_init)
#ifdef CONFIG_NUMA
/*
 * Zone reclaim mode
 *
 * If non-zero call zone_reclaim when the number of free pages falls below
 * the watermarks.
 *
 * The value is treated as a bitmask of the RECLAIM_* flags below: the
 * reclaim code in this file tests RECLAIM_WRITE and RECLAIM_SWAP against
 * it with '&'.
 */
int zone_reclaim_mode __read_mostly;
#define RECLAIM_OFF 0 /* No RECLAIM_* bit set */
#define RECLAIM_ZONE (1<<0) /* Run shrink_inactive_list on the zone */
#define RECLAIM_WRITE (1<<1) /* Writeout pages during reclaim */
#define RECLAIM_SWAP (1<<2) /* Swap pages out during reclaim */
/*
 * Priority for ZONE_RECLAIM. This determines the fraction of pages
 * of a node considered for each zone_reclaim. 4 scans 1/16th of
 * a zone.
 *
 * __zone_reclaim() uses this value as its starting scan priority.
 */
#define ZONE_RECLAIM_PRIORITY 4

Christoph Lameter
committed
/*
 * Percentage of pages in a zone that must be unmapped for zone_reclaim to
 * occur.
 *
 * Initial value: 1 (percent).
 * NOTE(review): presumably the source of zone->min_unmapped_pages, which
 * zone_reclaim() compares against; the conversion is not visible here --
 * confirm.
 */
int sysctl_min_unmapped_ratio = 1;
/*
 * If the number of slab pages in a zone grows beyond this percentage then
 * slab reclaim needs to occur.
 *
 * Initial value: 5 (percent).
 * NOTE(review): presumably the source of zone->min_slab_pages -- confirm.
 */
int sysctl_min_slab_ratio = 5;

Mel Gorman
committed
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
/*
 * Estimate how many file LRU pages of @zone are not mapped.
 *
 * Returns (NR_INACTIVE_FILE + NR_ACTIVE_FILE) - NR_FILE_MAPPED, or 0 when
 * the mapped count is at least as large as the file LRU count.
 */
static inline unsigned long zone_unmapped_file_pages(struct zone *zone)
{
	unsigned long mapped;
	unsigned long on_lru;

	mapped = zone_page_state(zone, NR_FILE_MAPPED);
	on_lru = zone_page_state(zone, NR_INACTIVE_FILE);
	on_lru += zone_page_state(zone, NR_ACTIVE_FILE);

	/*
	 * The mapped counter can exceed what the file LRU lists hold:
	 * tmpfs pages are accounted as ANON yet may still be counted as
	 * FILE_MAPPED.  Clamp instead of letting the subtraction wrap.
	 */
	if (on_lru <= mapped)
		return 0;

	return on_lru - mapped;
}
/*
 * Work out how many page cache pages of @zone can be reclaimed under the
 * current zone_reclaim_mode.
 *
 * Returns the candidate page count minus the pages that must be left out,
 * never less than zero.
 */
static long zone_pagecache_reclaimable(struct zone *zone)
{
	long candidates;
	long excluded = 0;

	/*
	 * With RECLAIM_SWAP every file page is a potential candidate.
	 * Without it, pages such as swapcache get in the way and
	 * zone_unmapped_file_pages() gives the better estimate.
	 */
	candidates = (zone_reclaim_mode & RECLAIM_SWAP) ?
			zone_page_state(zone, NR_FILE_PAGES) :
			zone_unmapped_file_pages(zone);

	/* Dirty pages only count when we are allowed to write them out */
	if ((zone_reclaim_mode & RECLAIM_WRITE) == 0)
		excluded = zone_page_state(zone, NR_FILE_DIRTY);

	/* More excluded than available: report nothing rather than underflow */
	if (unlikely(excluded > candidates))
		return 0;

	return candidates - excluded;
}
/*
* Try to free up some pages from this zone through reclaim.
*/
static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
/* Minimum pages needed in order to stay on node */
const unsigned long nr_pages = 1 << order;
struct task_struct *p = current;
struct reclaim_state reclaim_state;
int priority;
struct scan_control sc = {
.may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE),
.may_unmap = !!(zone_reclaim_mode & RECLAIM_SWAP),
.nr_to_reclaim = max_t(unsigned long, nr_pages,
SWAP_CLUSTER_MAX),
struct shrink_control shrink = {
.gfp_mask = sc.gfp_mask,
};
unsigned long nr_slab_pages0, nr_slab_pages1;
/*
* We need to be able to allocate from the reserves for RECLAIM_SWAP
* and we also need to be able to write out pages for RECLAIM_WRITE
* and RECLAIM_SWAP.
*/
p->flags |= PF_MEMALLOC | PF_SWAPWRITE;
lockdep_set_current_reclaim_state(gfp_mask);
reclaim_state.reclaimed_slab = 0;
p->reclaim_state = &reclaim_state;

Mel Gorman
committed
if (zone_pagecache_reclaimable(zone) > zone->min_unmapped_pages) {
/*
* Free memory by calling shrink zone with increasing
* priorities until we have enough memory freed.
*/
priority = ZONE_RECLAIM_PRIORITY;
do {
shrink_zone(priority, zone, &sc);
} while (priority >= 0 && sc.nr_reclaimed < nr_pages);
nr_slab_pages0 = zone_page_state(zone, NR_SLAB_RECLAIMABLE);
if (nr_slab_pages0 > zone->min_slab_pages) {
* shrink_slab() does not currently allow us to determine how
* many pages were freed in this zone. So we take the current
* number of slab pages and shake the slab until it is reduced
* by the same nr_pages that we used for reclaiming unmapped
* pages.
* Note that shrink_slab will free memory on all zones and may
* take a long time.

KOSAKI Motohiro
committed
for (;;) {
unsigned long lru_pages = zone_reclaimable_pages(zone);
/* No reclaimable slab or very low memory pressure */
if (!shrink_slab(&shrink, sc.nr_scanned, lru_pages))

KOSAKI Motohiro
committed
break;
/* Freed enough memory */
nr_slab_pages1 = zone_page_state(zone,
NR_SLAB_RECLAIMABLE);
if (nr_slab_pages1 + nr_pages <= nr_slab_pages0)
break;
}
/*
* Update nr_reclaimed by the number of slab pages we
* reclaimed from this zone.
*/
nr_slab_pages1 = zone_page_state(zone, NR_SLAB_RECLAIMABLE);
if (nr_slab_pages1 < nr_slab_pages0)
sc.nr_reclaimed += nr_slab_pages0 - nr_slab_pages1;
current->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE);
lockdep_clear_current_reclaim_state();
return sc.nr_reclaimed >= nr_pages;
int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
{
int node_id;
* Zone reclaim reclaims unmapped file backed pages and
* slab pages if we are over the defined limits.

Christoph Lameter
committed
*

Christoph Lameter
committed
* A small portion of unmapped file backed pages is needed for
* file I/O otherwise pages read by file I/O will be immediately
* thrown out if the zone is overallocated. So we do not reclaim
* if less than a specified percentage of the zone is used by
* unmapped file backed pages.

Mel Gorman
committed
if (zone_pagecache_reclaimable(zone) <= zone->min_unmapped_pages &&
zone_page_state(zone, NR_SLAB_RECLAIMABLE) <= zone->min_slab_pages)
return ZONE_RECLAIM_FULL;
if (zone->all_unreclaimable)
return ZONE_RECLAIM_FULL;
* Do not scan if the allocation should not be delayed.
if (!(gfp_mask & __GFP_WAIT) || (current->flags & PF_MEMALLOC))
return ZONE_RECLAIM_NOSCAN;
/*
* Only run zone reclaim on the local zone or on zones that do not
* have associated processors. This will favor the local processor
* over remote processors and spread off node memory allocations
* as wide as possible.
*/
node_id = zone_to_nid(zone);
if (node_state(node_id, N_CPU) && node_id != numa_node_id())
return ZONE_RECLAIM_NOSCAN;
if (zone_test_and_set_flag(zone, ZONE_RECLAIM_LOCKED))
return ZONE_RECLAIM_NOSCAN;
ret = __zone_reclaim(zone, gfp_mask, order);
zone_clear_flag(zone, ZONE_RECLAIM_LOCKED);
if (!ret)
count_vm_event(PGSCAN_ZONE_RECLAIM_FAILED);
return ret;
/*
* page_evictable - test whether a page is evictable
* @page: the page to test
* @vma: the VMA in which the page is or will be mapped, may be NULL
*
* Test whether page is evictable--i.e., should be placed on active/inactive
* lists vs unevictable list. The vma argument is !NULL when called from the
* fault path to determine how to instantiate a new page.
*
* Reasons page might not be evictable:
* (1) page's mapping marked unevictable
* (2) page is part of an mlocked VMA
*/
int page_evictable(struct page *page, struct vm_area_struct *vma)
{
if (mapping_unevictable(page_mapping(page)))
return 0;
if (PageMlocked(page) || (vma && mlocked_vma_newpage(vma, page)))
* check_move_unevictable_pages - check pages for evictability and move to appropriate zone lru list
* @pages: array of pages to check
* @nr_pages: number of pages to check
* Checks pages for evictability and moves them to the appropriate lru list.
*
* This function is only used for SysV IPC SHM_UNLOCK.
void check_move_unevictable_pages(struct page **pages, int nr_pages)
struct zone *zone = NULL;
int pgscanned = 0;
int pgrescued = 0;
int i;
for (i = 0; i < nr_pages; i++) {
struct page *page = pages[i];
struct zone *pagezone;
pgscanned++;
pagezone = page_zone(page);
if (pagezone != zone) {
if (zone)
spin_unlock_irq(&zone->lru_lock);
zone = pagezone;
spin_lock_irq(&zone->lru_lock);
}
if (!PageLRU(page) || !PageUnevictable(page))
continue;
if (page_evictable(page, NULL)) {
enum lru_list lru = page_lru_base_type(page);
VM_BUG_ON(PageActive(page));
ClearPageUnevictable(page);
__dec_zone_state(zone, NR_UNEVICTABLE);
lruvec = mem_cgroup_lru_move_lists(zone, page,
LRU_UNEVICTABLE, lru);
list_move(&page->lru, &lruvec->lists[lru]);
__inc_zone_state(zone, NR_INACTIVE_ANON + lru);
pgrescued++;
if (zone) {
__count_vm_events(UNEVICTABLE_PGRESCUED, pgrescued);
__count_vm_events(UNEVICTABLE_PGSCANNED, pgscanned);
spin_unlock_irq(&zone->lru_lock);
static void warn_scan_unevictable_pages(void)
printk_once(KERN_WARNING
"%s: The scan_unevictable_pages sysctl/node-interface has been "
"disabled for lack of a legitimate use case. If you have "
"one, please send an email to linux-mm@kvack.org.\n",
current->comm);
}
/*
 * Handler for the scan_unevictable_pages [vm] sysctl, originally an
 * on-demand re-scan of all nodes' unevictable lists for evictable pages.
 *
 * The handler now only emits the "interface disabled" warning, lets
 * proc_doulongvec_minmax() process the request, and then forces the
 * backing variable back to zero.
 */
unsigned long scan_unevictable_pages;

int scan_unevictable_handler(struct ctl_table *table, int write,
			     void __user *buffer,
			     size_t *length, loff_t *ppos)
{
	warn_scan_unevictable_pages();

	/*
	 * The parser's return value is not propagated: this handler
	 * reports success unconditionally.
	 */
	(void)proc_doulongvec_minmax(table, write, buffer, length, ppos);

	/* Whatever was just parsed, the stored value always ends up zero. */
	scan_unevictable_pages = 0;

	return 0;
}

Thadeu Lima de Souza Cascardo
committed
#ifdef CONFIG_NUMA
/*
* per node 'scan_unevictable_pages' attribute. On demand re-scan of
* a specified node's per zone unevictable lists for evictable pages.
*/
static ssize_t read_scan_unevictable_node(struct device *dev,
struct device_attribute *attr,
warn_scan_unevictable_pages();
return sprintf(buf, "0\n"); /* always zero; should fit... */
}
static ssize_t write_scan_unevictable_node(struct device *dev,
struct device_attribute *attr,
const char *buf, size_t count)
{
warn_scan_unevictable_pages();
/*
 * Per-node "scan_unevictable_pages" device attribute, declared with mode
 * S_IRUGO | S_IWUSR and backed by the read/write handlers above.  It is
 * referenced as dev_attr_scan_unevictable_pages by the node
 * register/unregister helpers below.
 */
static DEVICE_ATTR(scan_unevictable_pages, S_IRUGO | S_IWUSR,
read_scan_unevictable_node,
write_scan_unevictable_node);
/*
 * Create the scan_unevictable_pages attribute file on @node's device.
 *
 * Returns the result of device_create_file() unchanged.
 */
int scan_unevictable_register_node(struct node *node)
{
	int err;

	err = device_create_file(&node->dev,
				 &dev_attr_scan_unevictable_pages);

	return err;
}
void scan_unevictable_unregister_node(struct node *node)
{
device_remove_file(&node->dev, &dev_attr_scan_unevictable_pages);