Newer
Older
/*
* linux/mm/vmscan.c
*
* Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
*
* Swap reorganised 29.12.95, Stephen Tweedie.
* kswapd added: 7.1.96 sct
* Removed kswapd_ctl limits, and swap out as many pages as needed
* to bring the system back to freepages.high: 2.4.97, Rik van Riel.
* Zone aware kswapd started 02/00, Kanoj Sarcar (kanoj@sgi.com).
* Multiqueue VM started 5.8.00, Rik van Riel.
*/
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/kernel_stat.h>
#include <linux/swap.h>
#include <linux/pagemap.h>
#include <linux/init.h>
#include <linux/highmem.h>
#include <linux/file.h>
#include <linux/writeback.h>
#include <linux/blkdev.h>
#include <linux/buffer_head.h> /* for try_to_release_page(),
buffer_heads_over_limit */
#include <linux/mm_inline.h>
#include <linux/pagevec.h>
#include <linux/backing-dev.h>
#include <linux/rmap.h>
#include <linux/topology.h>
#include <linux/cpu.h>
#include <linux/cpuset.h>
#include <linux/notifier.h>
#include <linux/rwsem.h>
#include <linux/delay.h>
#include <linux/kthread.h>
#include <linux/freezer.h>
#include <linux/memcontrol.h>
#include <linux/delayacct.h>
#include <asm/tlbflush.h>
#include <asm/div64.h>
#include <linux/swapops.h>
struct scan_control {
/* Incremented by the number of inactive pages that were scanned */
unsigned long nr_scanned;
/* This context's GFP mask */
/* Can pages be swapped as part of reclaim? */
int may_swap;
/* This context's SWAP_CLUSTER_MAX. If freeing memory for
* suspend, we effectively ignore SWAP_CLUSTER_MAX.
* In this context, it doesn't matter that we scan the
* whole list at once. */
int swap_cluster_max;
/* Which cgroup do we reclaim from */
struct mem_cgroup *mem_cgroup;
/* Pluggable isolate pages callback */
unsigned long (*isolate_pages)(unsigned long nr, struct list_head *dst,
unsigned long *scanned, int order, int mode,
struct zone *z, struct mem_cgroup *mem_cont,
int active);
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
};
#define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru))
#ifdef ARCH_HAS_PREFETCH
#define prefetch_prev_lru_page(_page, _base, _field) \
do { \
if ((_page)->lru.prev != _base) { \
struct page *prev; \
\
prev = lru_to_page(&(_page->lru)); \
prefetch(&prev->_field); \
} \
} while (0)
#else
#define prefetch_prev_lru_page(_page, _base, _field) do { } while (0)
#endif
#ifdef ARCH_HAS_PREFETCHW
#define prefetchw_prev_lru_page(_page, _base, _field) \
do { \
if ((_page)->lru.prev != _base) { \
struct page *prev; \
\
prev = lru_to_page(&(_page->lru)); \
prefetchw(&prev->_field); \
} \
} while (0)
#else
#define prefetchw_prev_lru_page(_page, _base, _field) do { } while (0)
#endif
/*
* From 0 .. 100. Higher means more swappy.
*/
int vm_swappiness = 60;
long vm_total_pages; /* The total number of pages which the VM controls */
static LIST_HEAD(shrinker_list);
static DECLARE_RWSEM(shrinker_rwsem);
#ifdef CONFIG_CGROUP_MEM_RES_CTLR

KAMEZAWA Hiroyuki
committed
#define scan_global_lru(sc) (!(sc)->mem_cgroup)
#else
#define scan_global_lru(sc) (1)
#endif
/*
* Add a shrinker callback to be called from the vm
*/
void register_shrinker(struct shrinker *shrinker)
shrinker->nr = 0;
down_write(&shrinker_rwsem);
list_add_tail(&shrinker->list, &shrinker_list);
up_write(&shrinker_rwsem);
EXPORT_SYMBOL(register_shrinker);
void unregister_shrinker(struct shrinker *shrinker)
{
down_write(&shrinker_rwsem);
list_del(&shrinker->list);
up_write(&shrinker_rwsem);
}
EXPORT_SYMBOL(unregister_shrinker);
#define SHRINK_BATCH 128
/*
* Call the shrink functions to age shrinkable caches
*
* Here we assume it costs one seek to replace a lru page and that it also
* takes a seek to recreate a cache object. With this in mind we age equal
* percentages of the lru and ageable caches. This should balance the seeks
* generated by these structures.
*
* If the vm encountered mapped pages on the LRU it increase the pressure on
* slab to avoid swapping.
*
* We do weird things to avoid (scanned*seeks*entries) overflowing 32 bits.
*
* `lru_pages' represents the number of on-LRU pages in all the zones which
* are eligible for the caller's allocation attempt. It is used for balancing
* slab reclaim versus page reclaim.
*
* Returns the number of slab objects which we shrunk.
unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask,
unsigned long lru_pages)
if (scanned == 0)
scanned = SWAP_CLUSTER_MAX;
if (!down_read_trylock(&shrinker_rwsem))
return 1; /* Assume we'll be able to shrink next time */
list_for_each_entry(shrinker, &shrinker_list, list) {
unsigned long long delta;
unsigned long total_scan;
unsigned long max_pass = (*shrinker->shrink)(0, gfp_mask);
delta *= max_pass;
do_div(delta, lru_pages + 1);
shrinker->nr += delta;
if (shrinker->nr < 0) {
printk(KERN_ERR "%s: nr=%ld\n",
__func__, shrinker->nr);
shrinker->nr = max_pass;
}
/*
* Avoid risking looping forever due to too large nr value:
* never try to free more than twice the estimate number of
* freeable entries.
*/
if (shrinker->nr > max_pass * 2)
shrinker->nr = max_pass * 2;
total_scan = shrinker->nr;
shrinker->nr = 0;
while (total_scan >= SHRINK_BATCH) {
long this_scan = SHRINK_BATCH;
int shrink_ret;
nr_before = (*shrinker->shrink)(0, gfp_mask);
shrink_ret = (*shrinker->shrink)(this_scan, gfp_mask);
if (shrink_ret < nr_before)
ret += nr_before - shrink_ret;
count_vm_events(SLABS_SCANNED, this_scan);
total_scan -= this_scan;
cond_resched();
}
shrinker->nr += total_scan;
}
up_read(&shrinker_rwsem);
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
}
/* Called without lock on whether page is mapped, so answer is unstable */
static inline int page_mapping_inuse(struct page *page)
{
struct address_space *mapping;
/* Page is in somebody's page tables. */
if (page_mapped(page))
return 1;
/* Be more reluctant to reclaim swapcache than pagecache */
if (PageSwapCache(page))
return 1;
mapping = page_mapping(page);
if (!mapping)
return 0;
/* File is mmap'd by somebody? */
return mapping_mapped(mapping);
}
static inline int is_page_cache_freeable(struct page *page)
{
return page_count(page) - !!PagePrivate(page) == 2;
}
static int may_write_to_queue(struct backing_dev_info *bdi)
{
if (current->flags & PF_SWAPWRITE)
return 1;
if (!bdi_write_congested(bdi))
return 1;
if (bdi == current->backing_dev_info)
return 1;
return 0;
}
/*
* We detected a synchronous write error writing a page out. Probably
* -ENOSPC. We need to propagate that into the address_space for a subsequent
* fsync(), msync() or close().
*
* The tricky part is that after writepage we cannot touch the mapping: nothing
* prevents it from being freed up. But we have a ref on the page and once
* that page is locked, the mapping is pinned.
*
* We're allowed to run sleeping lock_page() here because we know the caller has
* __GFP_FS.
*/
static void handle_write_error(struct address_space *mapping,
struct page *page, int error)
{
lock_page(page);
if (page_mapping(page) == mapping)
mapping_set_error(mapping, error);

Andy Whitcroft
committed
/* Request for sync pageout. */
enum pageout_io {
PAGEOUT_IO_ASYNC,
PAGEOUT_IO_SYNC,
};
/* possible outcome of pageout() */
typedef enum {
/* failed to write page out, page is locked */
PAGE_KEEP,
/* move page to the active list, page is locked */
PAGE_ACTIVATE,
/* page has been sent to the disk successfully, page is unlocked */
PAGE_SUCCESS,
/* page is clean and locked */
PAGE_CLEAN,
} pageout_t;
* pageout is called by shrink_page_list() for each dirty page.
* Calls ->writepage().

Andy Whitcroft
committed
static pageout_t pageout(struct page *page, struct address_space *mapping,
enum pageout_io sync_writeback)
{
/*
* If the page is dirty, only perform writeback if that write
* will be non-blocking. To prevent this allocation from being
* stalled by pagecache activity. But note that there may be
* stalls if we need to run get_block(). We could test
* PagePrivate for that.
*
* If this process is currently in generic_file_write() against
* this page's queue, we can perform writeback even if that
* will block.
*
* If the page is swapcache, write it back even if that would
* block, for some throttling. This happens by accident, because
* swap_backing_dev_info is bust: it doesn't reflect the
* congestion state of the swapdevs. Easy to fix, if needed.
* See swapfile.c:page_queue_congested().
*/
if (!is_page_cache_freeable(page))
return PAGE_KEEP;
if (!mapping) {
/*
* Some data journaling orphaned pages can have
* page->mapping == NULL while being dirty with clean buffers.
*/
if (PagePrivate(page)) {
if (try_to_free_buffers(page)) {
ClearPageDirty(page);
printk("%s: orphaned page\n", __func__);
return PAGE_CLEAN;
}
}
return PAGE_KEEP;
}
if (mapping->a_ops->writepage == NULL)
return PAGE_ACTIVATE;
if (!may_write_to_queue(mapping->backing_dev_info))
return PAGE_KEEP;
if (clear_page_dirty_for_io(page)) {
int res;
struct writeback_control wbc = {
.sync_mode = WB_SYNC_NONE,
.nr_to_write = SWAP_CLUSTER_MAX,
.range_start = 0,
.range_end = LLONG_MAX,
.nonblocking = 1,
.for_reclaim = 1,
};
SetPageReclaim(page);
res = mapping->a_ops->writepage(page, &wbc);
if (res < 0)
handle_write_error(mapping, page, res);
if (res == AOP_WRITEPAGE_ACTIVATE) {
ClearPageReclaim(page);
return PAGE_ACTIVATE;
}

Andy Whitcroft
committed
/*
* Wait on writeback if requested to. This happens when
* direct reclaiming a large contiguous area and the
* first attempt to free a range of pages fails.
*/
if (PageWriteback(page) && sync_writeback == PAGEOUT_IO_SYNC)
wait_on_page_writeback(page);
if (!PageWriteback(page)) {
/* synchronous write or broken a_ops? */
ClearPageReclaim(page);
}
inc_zone_page_state(page, NR_VMSCAN_WRITE);
return PAGE_SUCCESS;
}
return PAGE_CLEAN;
}
* Same as remove_mapping, but if the page is removed from the mapping, it
* gets returned with a refcount of 0.
static int __remove_mapping(struct address_space *mapping, struct page *page)
BUG_ON(!PageLocked(page));
BUG_ON(mapping != page_mapping(page));
write_lock_irq(&mapping->tree_lock);
/*
* The non racy check for a busy page.
*
* Must be careful with the order of the tests. When someone has
* a ref to the page, it may be possible that they dirty it then
* drop the reference. So if PageDirty is tested before page_count
* here, then the following race may occur:
*
* get_user_pages(&page);
* [user mapping goes away]
* write_to(page);
* !PageDirty(page) [good]
* SetPageDirty(page);
* put_page(page);
* !page_count(page) [good, discard it]
*
* [oops, our write_to data is lost]
*
* Reversing the order of the tests ensures such a situation cannot
* escape unnoticed. The smp_rmb is needed to ensure the page->flags
* load is not satisfied before that of page->_count.
*
* Note that if SetPageDirty is always performed via set_page_dirty,
* and thus under tree_lock, then this ordering is not required.
goto cannot_free;
/* note: atomic_cmpxchg in page_freeze_refs provides the smp_rmb */
if (unlikely(PageDirty(page))) {
page_unfreeze_refs(page, 2);
goto cannot_free;
if (PageSwapCache(page)) {
swp_entry_t swap = { .val = page_private(page) };
__delete_from_swap_cache(page);
write_unlock_irq(&mapping->tree_lock);
swap_free(swap);
} else {
__remove_from_page_cache(page);
write_unlock_irq(&mapping->tree_lock);
}
return 1;
cannot_free:
write_unlock_irq(&mapping->tree_lock);
return 0;
}
/*
* Attempt to detach a locked page from its ->mapping. If it is dirty or if
* someone else has a ref on the page, abort and return 0. If it was
* successfully detached, return 1. Assumes the caller has a single ref on
* this page.
*/
int remove_mapping(struct address_space *mapping, struct page *page)
{
if (__remove_mapping(mapping, page)) {
/*
* Unfreezing the refcount with 1 rather than 2 effectively
* drops the pagecache ref for us without requiring another
* atomic operation.
*/
page_unfreeze_refs(page, 1);
return 1;
}
return 0;
}
* shrink_page_list() returns the number of reclaimed pages
static unsigned long shrink_page_list(struct list_head *page_list,

Andy Whitcroft
committed
struct scan_control *sc,
enum pageout_io sync_writeback)
{
LIST_HEAD(ret_pages);
struct pagevec freed_pvec;
int pgactivate = 0;
cond_resched();
pagevec_init(&freed_pvec, 1);
while (!list_empty(page_list)) {
struct address_space *mapping;
struct page *page;
int may_enter_fs;
int referenced;
cond_resched();
page = lru_to_page(page_list);
list_del(&page->lru);
if (TestSetPageLocked(page))
goto keep;

Christoph Lameter
committed
if (!sc->may_swap && page_mapped(page))
goto keep_locked;
/* Double the slab pressure for mapped and swapcache pages */
if (page_mapped(page) || PageSwapCache(page))
sc->nr_scanned++;

Andy Whitcroft
committed
may_enter_fs = (sc->gfp_mask & __GFP_FS) ||
(PageSwapCache(page) && (sc->gfp_mask & __GFP_IO));
if (PageWriteback(page)) {
/*
* Synchronous reclaim is performed in two passes,
* first an asynchronous pass over the list to
* start parallel writeback, and a second synchronous
* pass to wait for the IO to complete. Wait here
* for any page for which writeback has already
* started.
*/
if (sync_writeback == PAGEOUT_IO_SYNC && may_enter_fs)
wait_on_page_writeback(page);

Andy Whitcroft
committed
goto keep_locked;
}
referenced = page_referenced(page, 1, sc->mem_cgroup);
/* In active use or really unfreeable? Activate it. */
if (sc->order <= PAGE_ALLOC_COSTLY_ORDER &&
referenced && page_mapping_inuse(page))
goto activate_locked;
#ifdef CONFIG_SWAP
/*
* Anonymous process memory has backing store?
* Try to allocate it some swap space here.
*/

Christoph Lameter
committed
if (PageAnon(page) && !PageSwapCache(page))
if (!add_to_swap(page, GFP_ATOMIC))
goto activate_locked;
#endif /* CONFIG_SWAP */
mapping = page_mapping(page);
/*
* The page is mapped into the page tables of one or more
* processes. Try to unmap it here.
*/
if (page_mapped(page) && mapping) {
switch (try_to_unmap(page, 0)) {
case SWAP_FAIL:
goto activate_locked;
case SWAP_AGAIN:
goto keep_locked;
case SWAP_SUCCESS:
; /* try to free the page below */
}
}
if (PageDirty(page)) {
if (sc->order <= PAGE_ALLOC_COSTLY_ORDER && referenced)
if (!may_enter_fs)
if (!sc->may_writepage)
goto keep_locked;
/* Page is dirty, try to write it out here */

Andy Whitcroft
committed
switch (pageout(page, mapping, sync_writeback)) {
case PAGE_KEEP:
goto keep_locked;
case PAGE_ACTIVATE:
goto activate_locked;
case PAGE_SUCCESS:
if (PageWriteback(page) || PageDirty(page))
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
goto keep;
/*
* A synchronous write - probably a ramdisk. Go
* ahead and try to reclaim the page.
*/
if (TestSetPageLocked(page))
goto keep;
if (PageDirty(page) || PageWriteback(page))
goto keep_locked;
mapping = page_mapping(page);
case PAGE_CLEAN:
; /* try to free the page below */
}
}
/*
* If the page has buffers, try to free the buffer mappings
* associated with this page. If we succeed we try to free
* the page as well.
*
* We do this even if the page is PageDirty().
* try_to_release_page() does not perform I/O, but it is
* possible for a page to have PageDirty set, but it is actually
* clean (all its buffers are clean). This happens if the
* buffers were written out directly, with submit_bh(). ext3
* will do this, as well as the blockdev mapping.
* try_to_release_page() will discover that cleanness and will
* drop the buffers and mark the page clean - it can be freed.
*
* Rarely, pages can have buffers and no ->mapping. These are
* the pages which were not successfully invalidated in
* truncate_complete_page(). We try to drop those buffers here
* and if that worked, and the page is no longer mapped into
* process address space (page_count == 1) it can be freed.
* Otherwise, leave the page on the LRU so it is swappable.
*/
if (PagePrivate(page)) {
if (!try_to_release_page(page, sc->gfp_mask))
goto activate_locked;
if (!mapping && page_count(page) == 1) {
unlock_page(page);
if (put_page_testzero(page))
goto free_it;
else {
/*
* rare race with speculative reference.
* the speculative reference will free
* this page shortly, so we may
* increment nr_reclaimed here (and
* leave it off the LRU).
*/
nr_reclaimed++;
continue;
}
}
if (!mapping || !__remove_mapping(mapping, page))
goto keep_locked;
if (!pagevec_add(&freed_pvec, page)) {
__pagevec_free(&freed_pvec);
pagevec_reinit(&freed_pvec);
}
continue;
activate_locked:
SetPageActive(page);
pgactivate++;
keep_locked:
unlock_page(page);
keep:
list_add(&page->lru, &ret_pages);
}
list_splice(&ret_pages, page_list);
if (pagevec_count(&freed_pvec))
count_vm_events(PGACTIVATE, pgactivate);
/* LRU Isolation modes. */
#define ISOLATE_INACTIVE 0 /* Isolate inactive pages. */
#define ISOLATE_ACTIVE 1 /* Isolate active pages. */
#define ISOLATE_BOTH 2 /* Isolate both active and inactive pages. */
/*
* Attempt to remove the specified page from its LRU. Only take this page
* if it is of the appropriate PageActive status. Pages which are being
* freed elsewhere are also ignored.
*
* page: page to consider
* mode: one of the LRU isolation modes defined above
*
* returns 0 on success, -ve errno on failure.
*/
int __isolate_lru_page(struct page *page, int mode)
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
{
int ret = -EINVAL;
/* Only take pages on the LRU. */
if (!PageLRU(page))
return ret;
/*
* When checking the active state, we need to be sure we are
* dealing with comparible boolean values. Take the logical not
* of each.
*/
if (mode != ISOLATE_BOTH && (!PageActive(page) != !mode))
return ret;
ret = -EBUSY;
if (likely(get_page_unless_zero(page))) {
/*
* Be careful not to clear PageLRU until after we're
* sure the page is not being freed elsewhere -- the
* page release code relies on it.
*/
ClearPageLRU(page);
ret = 0;
}
return ret;
}
/*
* zone->lru_lock is heavily contended. Some of the functions that
* shrink the lists perform better by taking out a batch of pages
* and working on them outside the LRU lock.
*
* For pagecache intensive workloads, this function is the hottest
* spot in the kernel (apart from copy_*_user functions).
*
* Appropriate locks must be held before calling this function.
*
* @nr_to_scan: The number of pages to look through on the list.
* @src: The LRU list to pull pages off.
* @dst: The temp list to put pages on to.
* @scanned: The number of pages that were scanned.
* @order: The caller's attempted allocation order
* @mode: One of the LRU isolation modes
*
* returns how many pages were moved onto *@dst.
*/
static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
struct list_head *src, struct list_head *dst,
for (scan = 0; scan < nr_to_scan && !list_empty(src); scan++) {
struct page *page;
unsigned long pfn;
unsigned long end_pfn;
unsigned long page_pfn;
int zone_id;
page = lru_to_page(src);
prefetchw_prev_lru_page(page, src, flags);
switch (__isolate_lru_page(page, mode)) {
case 0:
list_move(&page->lru, dst);
break;
case -EBUSY:
/* else it is being freed elsewhere */
list_move(&page->lru, src);
continue;
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
default:
BUG();
}
if (!order)
continue;
/*
* Attempt to take all pages in the order aligned region
* surrounding the tag page. Only take those pages of
* the same active state as that tag page. We may safely
* round the target page pfn down to the requested order
* as the mem_map is guarenteed valid out to MAX_ORDER,
* where that page is in a different zone we will detect
* it from its zone id and abort this block scan.
*/
zone_id = page_zone_id(page);
page_pfn = page_to_pfn(page);
pfn = page_pfn & ~((1 << order) - 1);
end_pfn = pfn + (1 << order);
for (; pfn < end_pfn; pfn++) {
struct page *cursor_page;
/* The target page is in the block, ignore it. */
if (unlikely(pfn == page_pfn))
continue;
/* Avoid holes within the zone. */
if (unlikely(!pfn_valid_within(pfn)))
break;
cursor_page = pfn_to_page(pfn);
/* Check that we have not crossed a zone boundary. */
if (unlikely(page_zone_id(cursor_page) != zone_id))
continue;
switch (__isolate_lru_page(cursor_page, mode)) {
case 0:
list_move(&cursor_page->lru, dst);
nr_taken++;
scan++;
break;
case -EBUSY:
/* else it is being freed elsewhere */
list_move(&cursor_page->lru, src);
default:
break;
}
}
}
*scanned = scan;
return nr_taken;
}
static unsigned long isolate_pages_global(unsigned long nr,
struct list_head *dst,
unsigned long *scanned, int order,
int mode, struct zone *z,
struct mem_cgroup *mem_cont,
int active)
{
if (active)
return isolate_lru_pages(nr, &z->active_list, dst,
scanned, order, mode);
else
return isolate_lru_pages(nr, &z->inactive_list, dst,
scanned, order, mode);
}
/*
* clear_active_flags() is a helper for shrink_active_list(), clearing
* any active bits from the pages in the list.
*/
static unsigned long clear_active_flags(struct list_head *page_list)
{
int nr_active = 0;
struct page *page;
list_for_each_entry(page, page_list, lru)
if (PageActive(page)) {
ClearPageActive(page);
nr_active++;
}
return nr_active;
}
* shrink_inactive_list() is a helper for shrink_zone(). It returns the number
* of reclaimed pages
static unsigned long shrink_inactive_list(unsigned long max_scan,
struct zone *zone, struct scan_control *sc)
pagevec_init(&pvec, 1);
lru_add_drain();
spin_lock_irq(&zone->lru_lock);
unsigned long nr_taken;
unsigned long nr_scan;
unsigned long nr_freed;
nr_taken = sc->isolate_pages(sc->swap_cluster_max,
&page_list, &nr_scan, sc->order,
(sc->order > PAGE_ALLOC_COSTLY_ORDER)?
ISOLATE_BOTH : ISOLATE_INACTIVE,
zone, sc->mem_cgroup, 0);

Andy Whitcroft
committed
__count_vm_events(PGDEACTIVATE, nr_active);
__mod_zone_page_state(zone, NR_ACTIVE, -nr_active);
__mod_zone_page_state(zone, NR_INACTIVE,
-(nr_taken - nr_active));

KAMEZAWA Hiroyuki
committed
if (scan_global_lru(sc))
zone->pages_scanned += nr_scan;

Andy Whitcroft
committed
nr_freed = shrink_page_list(&page_list, sc, PAGEOUT_IO_ASYNC);
/*
* If we are direct reclaiming for contiguous pages and we do
* not reclaim everything in the list, try again and wait
* for IO to complete. This will stall high-order allocations
* but that should be acceptable to the caller
*/
if (nr_freed < nr_taken && !current_is_kswapd() &&
sc->order > PAGE_ALLOC_COSTLY_ORDER) {
congestion_wait(WRITE, HZ/10);
/*
* The attempt at page out may have made some
* of the pages active, mark them inactive again.
*/
nr_active = clear_active_flags(&page_list);
count_vm_events(PGDEACTIVATE, nr_active);
nr_freed += shrink_page_list(&page_list, sc,
PAGEOUT_IO_SYNC);
}
local_irq_disable();
if (current_is_kswapd()) {
__count_zone_vm_events(PGSCAN_KSWAPD, zone, nr_scan);
__count_vm_events(KSWAPD_STEAL, nr_freed);

KAMEZAWA Hiroyuki
committed
} else if (scan_global_lru(sc))
__count_zone_vm_events(PGSCAN_DIRECT, zone, nr_scan);

KAMEZAWA Hiroyuki
committed
__count_zone_vm_events(PGSTEAL, zone, nr_freed);
if (nr_taken == 0)
goto done;
/*
* Put back any unfreeable pages.
*/
while (!list_empty(&page_list)) {
page = lru_to_page(&page_list);
list_del(&page->lru);
if (PageActive(page))
add_page_to_active_list(zone, page);
else
add_page_to_inactive_list(zone, page);
if (!pagevec_add(&pvec, page)) {
spin_unlock_irq(&zone->lru_lock);
__pagevec_release(&pvec);
spin_lock_irq(&zone->lru_lock);
}
}
spin_unlock(&zone->lru_lock);
local_irq_enable();
/*
* We are about to scan this zone at a certain priority level. If that priority
* level is smaller (ie: more urgent) than the previous priority, then note
* that priority level within the zone. This is done so that when the next
* process comes in to scan this zone, it will immediately start out at this
* priority level rather than having to build up its own scanning priority.
* Here, this priority affects only the reclaim-mapped threshold.
*/
static inline void note_zone_scanning_priority(struct zone *zone, int priority)
{
if (priority < zone->prev_priority)
zone->prev_priority = priority;
}
static inline int zone_is_near_oom(struct zone *zone)
{
return zone->pages_scanned >= (zone_page_state(zone, NR_ACTIVE)
+ zone_page_state(zone, NR_INACTIVE))*3;

KAMEZAWA Hiroyuki
committed
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
/*
* Determine we should try to reclaim mapped pages.
* This is called only when sc->mem_cgroup is NULL.
*/
static int calc_reclaim_mapped(struct scan_control *sc, struct zone *zone,
int priority)
{
long mapped_ratio;
long distress;
long swap_tendency;
long imbalance;
int reclaim_mapped = 0;
int prev_priority;
if (scan_global_lru(sc) && zone_is_near_oom(zone))
return 1;
/*
* `distress' is a measure of how much trouble we're having
* reclaiming pages. 0 -> no problems. 100 -> great trouble.
*/
if (scan_global_lru(sc))
prev_priority = zone->prev_priority;
else
prev_priority = mem_cgroup_get_reclaim_priority(sc->mem_cgroup);
distress = 100 >> min(prev_priority, priority);
/*
* The point of this algorithm is to decide when to start
* reclaiming mapped memory instead of just pagecache. Work out
* how much memory
* is mapped.