Newer
Older
ret = mem_cgroup_move_account(pc, child, parent, true);
if (ret)
mem_cgroup_cancel_charge(parent);
return ret;
}
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
/*
* Charge the memory controller for page usage.
* Return
* 0 if the charge was successful
* < 0 if the cgroup is over its limit
*/
static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
gfp_t gfp_mask, enum charge_type ctype,
struct mem_cgroup *memcg)
{
struct mem_cgroup *mem;
struct page_cgroup *pc;
int ret;
pc = lookup_page_cgroup(page);
/* can happen at boot */
if (unlikely(!pc))
return 0;
prefetchw(pc);
mem = memcg;
ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, true);
return ret;
__mem_cgroup_commit_charge(mem, pc, ctype);
int mem_cgroup_newpage_charge(struct page *page,
struct mm_struct *mm, gfp_t gfp_mask)
{
if (PageCompound(page))
return 0;
/*
* If already mapped, we don't have to account.
* If page cache, page->mapping has address_space.
* But page->mapping may have out-of-use anon_vma pointer,
* detecit it by PageAnon() check. newly-mapped-anon's page->mapping
* is NULL.
*/
if (page_mapped(page) || (page->mapping && !PageAnon(page)))
return 0;
if (unlikely(!mm))
mm = &init_mm;
return mem_cgroup_charge_common(page, mm, gfp_mask,
MEM_CGROUP_CHARGE_TYPE_MAPPED, NULL);
}
static void
__mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr,
enum charge_type ctype);
int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
gfp_t gfp_mask)
struct mem_cgroup *mem = NULL;
int ret;
if (PageCompound(page))
return 0;
/*
* Corner case handling. This is called from add_to_page_cache()
* in usual. But some FS (shmem) precharges this page before calling it
* and call add_to_page_cache() with GFP_NOWAIT.
*
* For GFP_NOWAIT case, the page may be pre-charged before calling
* add_to_page_cache(). (See shmem.c) check it here and avoid to call
* charge twice. (It works but has to pay a bit larger cost.)
* And when the page is SwapCache, it should take swap information
* into account. This is under lock_page() now.
*/
if (!(gfp_mask & __GFP_WAIT)) {
struct page_cgroup *pc;
pc = lookup_page_cgroup(page);
if (!pc)
return 0;
lock_page_cgroup(pc);
if (PageCgroupUsed(pc)) {
unlock_page_cgroup(pc);
mm = &init_mm;
if (page_is_file_cache(page))
return mem_cgroup_charge_common(page, mm, gfp_mask,
MEM_CGROUP_CHARGE_TYPE_CACHE, NULL);
/* shmem */
if (PageSwapCache(page)) {
ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &mem);
if (!ret)
__mem_cgroup_commit_charge_swapin(page, mem,
MEM_CGROUP_CHARGE_TYPE_SHMEM);
} else
ret = mem_cgroup_charge_common(page, mm, gfp_mask,
MEM_CGROUP_CHARGE_TYPE_SHMEM, mem);
/*
* While swap-in, try_charge -> commit or cancel, the page is locked.
* And when try_charge() successfully returns, one refcnt to memcg without
* struct page_cgroup is acquired. This refcnt will be consumed by
* "commit()" or removed by "cancel()"
*/
int mem_cgroup_try_charge_swapin(struct mm_struct *mm,
struct page *page,
gfp_t mask, struct mem_cgroup **ptr)
{
struct mem_cgroup *mem;
return 0;
if (!do_swap_account)
goto charge_cur_mm;
/*
* A racing thread's fault, or swapoff, may have already updated
* the pte, and even removed page from swap cache: in those cases
* do_swap_page()'s pte_same() test will fail; but there's also a
* KSM case which does need to charge the page.
mem = try_get_mem_cgroup_from_page(page);
ret = __mem_cgroup_try_charge(NULL, mask, ptr, true);
/* drop extra refcnt from tryget */
css_put(&mem->css);
return ret;
charge_cur_mm:
if (unlikely(!mm))
mm = &init_mm;
return __mem_cgroup_try_charge(mm, mask, ptr, true);
static void
__mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr,
enum charge_type ctype)
{
struct page_cgroup *pc;
return;
if (!ptr)
return;
cgroup_exclude_rmdir(&ptr->css);
pc = lookup_page_cgroup(page);
mem_cgroup_lru_del_before_commit_swapcache(page);
__mem_cgroup_commit_charge(ptr, pc, ctype);
mem_cgroup_lru_add_after_commit_swapcache(page);
/*
* Now swap is on-memory. This means this page may be
* counted both as mem and swap....double count.
* Fix it by uncharging from memsw. Basically, this SwapCache is stable
* under lock_page(). But in do_swap_page()::memory.c, reuse_swap_page()
* may call delete_from_swap_cache() before reach here.
if (do_swap_account && PageSwapCache(page)) {
swp_entry_t ent = {.val = page_private(page)};
unsigned short id;
id = swap_cgroup_record(ent, 0);
rcu_read_lock();
memcg = mem_cgroup_lookup(id);
/*
* This recorded memcg can be obsolete one. So, avoid
* calling css_tryget
*/
if (!mem_cgroup_is_root(memcg))

KAMEZAWA Hiroyuki
committed
res_counter_uncharge(&memcg->memsw, PAGE_SIZE);
mem_cgroup_swap_statistics(memcg, false);
rcu_read_unlock();
/*
* At swapin, we may charge account against cgroup which has no tasks.
* So, rmdir()->pre_destroy() can be called while we do this charge.
* In that case, we need to call pre_destroy() again. check it here.
*/
cgroup_release_and_wakeup_rmdir(&ptr->css);
void mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr)
{
__mem_cgroup_commit_charge_swapin(page, ptr,
MEM_CGROUP_CHARGE_TYPE_MAPPED);
}
void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *mem)
{
return;
if (!mem)
return;
mem_cgroup_cancel_charge(mem);
static void
__do_uncharge(struct mem_cgroup *mem, const enum charge_type ctype)
{
struct memcg_batch_info *batch = NULL;
bool uncharge_memsw = true;
/* If swapout, usage of swap doesn't decrease */
if (!do_swap_account || ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
uncharge_memsw = false;
batch = ¤t->memcg_batch;
/*
* In usual, we do css_get() when we remember memcg pointer.
* But in this case, we keep res->usage until end of a series of
* uncharges. Then, it's ok to ignore memcg's refcnt.
*/
if (!batch->memcg)
batch->memcg = mem;
/*
* do_batch > 0 when unmapping pages or inode invalidate/truncate.
* In those cases, all pages freed continously can be expected to be in
* the same cgroup and we have chance to coalesce uncharges.
* But we do uncharge one by one if this is killed by OOM(TIF_MEMDIE)
* because we want to do uncharge as soon as possible.
*/
if (!batch->do_batch || test_thread_flag(TIF_MEMDIE))
goto direct_uncharge;
/*
* In typical case, batch->memcg == mem. This means we can
* merge a series of uncharges to an uncharge of res_counter.
* If not, we uncharge res_counter ony by one.
*/
if (batch->memcg != mem)
goto direct_uncharge;
/* remember freed charge and uncharge it later */
batch->bytes += PAGE_SIZE;
if (uncharge_memsw)
batch->memsw_bytes += PAGE_SIZE;
return;
direct_uncharge:
res_counter_uncharge(&mem->res, PAGE_SIZE);
if (uncharge_memsw)
res_counter_uncharge(&mem->memsw, PAGE_SIZE);
if (unlikely(batch->memcg != mem))
memcg_oom_recover(mem);
* uncharge if !page_mapped(page)
__mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
* Check if our page_cgroup is valid
pc = lookup_page_cgroup(page);
if (unlikely(!pc || !PageCgroupUsed(pc)))
if (!PageCgroupUsed(pc))
goto unlock_out;
switch (ctype) {
case MEM_CGROUP_CHARGE_TYPE_MAPPED:
/* See mem_cgroup_prepare_migration() */
if (page_mapped(page) || PageCgroupMigration(pc))
goto unlock_out;
break;
case MEM_CGROUP_CHARGE_TYPE_SWAPOUT:
if (!PageAnon(page)) { /* Shared memory */
if (page->mapping && !page_is_file_cache(page))
goto unlock_out;
} else if (page_mapped(page)) /* Anon */
goto unlock_out;
break;
default:
break;
if (!mem_cgroup_is_root(mem))
__do_uncharge(mem, ctype);
if (ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
mem_cgroup_swap_statistics(mem, true);
mem_cgroup_charge_statistics(mem, pc, false);
/*
* pc->mem_cgroup is not cleared here. It will be accessed when it's
* freed from LRU. This is safe because uncharged page is expected not
* to be reused (freed soon). Exception is SwapCache, it's handled by
* special functions.
*/
memcg_check_events(mem, page);
/* at swapout, this memcg will be accessed to record to swap */
if (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
css_put(&mem->css);

KAMEZAWA Hiroyuki
committed
unlock_out:
unlock_page_cgroup(pc);
void mem_cgroup_uncharge_page(struct page *page)
{
/* early check. */
if (page_mapped(page))
return;
if (page->mapping && !PageAnon(page))
return;
__mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_MAPPED);
}
void mem_cgroup_uncharge_cache_page(struct page *page)
{
VM_BUG_ON(page_mapped(page));
VM_BUG_ON(page->mapping);
__mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE);
}
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
/*
* Batch_start/batch_end is called in unmap_page_range/invlidate/trucate.
* In that cases, pages are freed continuously and we can expect pages
* are in the same memcg. All these calls itself limits the number of
* pages freed at once, then uncharge_start/end() is called properly.
* This may be called prural(2) times in a context,
*/
void mem_cgroup_uncharge_start(void)
{
current->memcg_batch.do_batch++;
/* We can do nest. */
if (current->memcg_batch.do_batch == 1) {
current->memcg_batch.memcg = NULL;
current->memcg_batch.bytes = 0;
current->memcg_batch.memsw_bytes = 0;
}
}
void mem_cgroup_uncharge_end(void)
{
struct memcg_batch_info *batch = ¤t->memcg_batch;
if (!batch->do_batch)
return;
batch->do_batch--;
if (batch->do_batch) /* If stacked, do nothing. */
return;
if (!batch->memcg)
return;
/*
* This "batch->memcg" is valid without any css_get/put etc...
* bacause we hide charges behind us.
*/
if (batch->bytes)
res_counter_uncharge(&batch->memcg->res, batch->bytes);
if (batch->memsw_bytes)
res_counter_uncharge(&batch->memcg->memsw, batch->memsw_bytes);
memcg_oom_recover(batch->memcg);
/* forget this pointer (for sanity check) */
batch->memcg = NULL;
}
#ifdef CONFIG_SWAP
* called after __delete_from_swap_cache() and drop "page" account.
* memcg information is recorded to swap_cgroup of "ent"
*/
void
mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout)
int ctype = MEM_CGROUP_CHARGE_TYPE_SWAPOUT;
if (!swapout) /* this was a swap cache but the swap is unused ! */
ctype = MEM_CGROUP_CHARGE_TYPE_DROP;
memcg = __mem_cgroup_uncharge_common(page, ctype);
/* record memcg information */
if (do_swap_account && swapout && memcg) {
swap_cgroup_record(ent, css_id(&memcg->css));
#endif
#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
/*
* called from swap_entry_free(). remove record in swap_cgroup and
* uncharge "memsw" account.
*/
void mem_cgroup_uncharge_swap(swp_entry_t ent)
unsigned short id;
if (!do_swap_account)
return;
id = swap_cgroup_record(ent, 0);
rcu_read_lock();
memcg = mem_cgroup_lookup(id);
/*
* We uncharge this because swap is freed.
* This memcg can be obsolete one. We avoid calling css_tryget
*/
if (!mem_cgroup_is_root(memcg))

KAMEZAWA Hiroyuki
committed
res_counter_uncharge(&memcg->memsw, PAGE_SIZE);
mem_cgroup_swap_statistics(memcg, false);
rcu_read_unlock();
/**
* mem_cgroup_move_swap_account - move swap charge and swap_cgroup's record.
* @entry: swap entry to be moved
* @from: mem_cgroup which the entry is moved from
* @to: mem_cgroup which the entry is moved to
* @need_fixup: whether we should fixup res_counters and refcounts.
*
* It succeeds only when the swap_cgroup's record for this entry is the same
* as the mem_cgroup's id of @from.
*
* Returns 0 on success, -EINVAL on failure.
*
* The caller must have charged to @to, IOW, called res_counter_charge() about
* both res and memsw, and called css_get().
*/
static int mem_cgroup_move_swap_account(swp_entry_t entry,
struct mem_cgroup *from, struct mem_cgroup *to, bool need_fixup)
{
unsigned short old_id, new_id;
old_id = css_id(&from->css);
new_id = css_id(&to->css);
if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) {
mem_cgroup_swap_statistics(from, false);
mem_cgroup_swap_statistics(to, true);
* This function is only called from task migration context now.
* It postpones res_counter and refcount handling till the end
* of task migration(mem_cgroup_clear_mc()) for performance
* improvement. But we cannot postpone mem_cgroup_get(to)
* because if the process that has been moved to @to does
* swap-in, the refcount of @to might be decreased to 0.
if (need_fixup) {
if (!mem_cgroup_is_root(from))
res_counter_uncharge(&from->memsw, PAGE_SIZE);
mem_cgroup_put(from);
/*
* we charged both to->res and to->memsw, so we should
* uncharge to->res.
*/
if (!mem_cgroup_is_root(to))
res_counter_uncharge(&to->res, PAGE_SIZE);
css_put(&to->css);
}
return 0;
}
return -EINVAL;
}
#else
static inline int mem_cgroup_move_swap_account(swp_entry_t entry,
struct mem_cgroup *from, struct mem_cgroup *to, bool need_fixup)
{
return -EINVAL;
}

KAMEZAWA Hiroyuki
committed
/*
* Before starting migration, account PAGE_SIZE to mem_cgroup that the old
* page belongs to.

KAMEZAWA Hiroyuki
committed
*/
int mem_cgroup_prepare_migration(struct page *page,
struct page *newpage, struct mem_cgroup **ptr)

KAMEZAWA Hiroyuki
committed
{
struct page_cgroup *pc;
enum charge_type ctype;
return 0;
pc = lookup_page_cgroup(page);
lock_page_cgroup(pc);
if (PageCgroupUsed(pc)) {
mem = pc->mem_cgroup;
css_get(&mem->css);
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
/*
* At migrating an anonymous page, its mapcount goes down
* to 0 and uncharge() will be called. But, even if it's fully
* unmapped, migration may fail and this page has to be
* charged again. We set MIGRATION flag here and delay uncharge
* until end_migration() is called
*
* Corner Case Thinking
* A)
* When the old page was mapped as Anon and it's unmap-and-freed
* while migration was ongoing.
* If unmap finds the old page, uncharge() of it will be delayed
* until end_migration(). If unmap finds a new page, it's
* uncharged when it make mapcount to be 1->0. If unmap code
* finds swap_migration_entry, the new page will not be mapped
* and end_migration() will find it(mapcount==0).
*
* B)
* When the old page was mapped but migraion fails, the kernel
* remaps it. A charge for it is kept by MIGRATION flag even
* if mapcount goes down to 0. We can do remap successfully
* without charging it again.
*
* C)
* The "old" page is under lock_page() until the end of
* migration, so, the old page itself will not be swapped-out.
* If the new page is swapped out before end_migraton, our
* hook to usual swap-out path will catch the event.
*/
if (PageAnon(page))
SetPageCgroupMigration(pc);
/*
* If the page is not charged at this point,
* we return here.
*/
if (!mem)
return 0;
ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, ptr, false);
css_put(&mem->css);/* drop extra refcnt */
if (ret || *ptr == NULL) {
if (PageAnon(page)) {
lock_page_cgroup(pc);
ClearPageCgroupMigration(pc);
unlock_page_cgroup(pc);
/*
* The old page may be fully unmapped while we kept it.
*/
mem_cgroup_uncharge_page(page);
}
return -ENOMEM;
/*
* We charge new page before it's used/mapped. So, even if unlock_page()
* is called before end_migration, we can catch all events on this new
* page. In the case new page is migrated but not remapped, new page's
* mapcount will be finally 0 and we call uncharge in end_migration().
*/
pc = lookup_page_cgroup(newpage);
if (PageAnon(page))
ctype = MEM_CGROUP_CHARGE_TYPE_MAPPED;
else if (page_is_file_cache(page))
ctype = MEM_CGROUP_CHARGE_TYPE_CACHE;
else
ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM;
__mem_cgroup_commit_charge(mem, pc, ctype);

KAMEZAWA Hiroyuki
committed
}
/* remove redundant charge if migration failed*/
void mem_cgroup_end_migration(struct mem_cgroup *mem,
struct page *oldpage, struct page *newpage)

KAMEZAWA Hiroyuki
committed
{
struct page *used, *unused;
struct page_cgroup *pc;
if (!mem)
return;
/* blocks rmdir() */
cgroup_exclude_rmdir(&mem->css);
/* at migration success, oldpage->mapping is NULL. */
if (oldpage->mapping) {
used = oldpage;
unused = newpage;
used = newpage;
* We disallowed uncharge of pages under migration because mapcount
* of the page goes down to zero, temporarly.
* Clear the flag and check the page should be charged.
pc = lookup_page_cgroup(oldpage);
lock_page_cgroup(pc);
ClearPageCgroupMigration(pc);
unlock_page_cgroup(pc);
__mem_cgroup_uncharge_common(unused, MEM_CGROUP_CHARGE_TYPE_FORCE);
* If a page is a file cache, radix-tree replacement is very atomic
* and we can skip this check. When it was an Anon page, its mapcount
* goes down to 0. But because we added MIGRATION flage, it's not
* uncharged yet. There are several case but page->mapcount check
* and USED bit check in mem_cgroup_uncharge_page() will do enough
* check. (see prepare_charge() also)
if (PageAnon(used))
mem_cgroup_uncharge_page(used);
* At migration, we may charge account against cgroup which has no
* tasks.
* So, rmdir()->pre_destroy() can be called while we do this charge.
* In that case, we need to call pre_destroy() again. check it here.
*/
cgroup_release_and_wakeup_rmdir(&mem->css);

KAMEZAWA Hiroyuki
committed
}
* A call to try to shrink memory usage on charge failure at shmem's swapin.
* Calling hierarchical_reclaim is not enough because we should update
* last_oom_jiffies to prevent pagefault_out_of_memory from invoking global OOM.
* Moreover considering hierarchy, we should reclaim from the mem_over_limit,
* not from the memcg which this page would be charged to.
* try_charge_swapin does all of these works properly.
int mem_cgroup_shmem_charge_fallback(struct page *page,
struct mm_struct *mm,
gfp_t gfp_mask)
ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &mem);
if (!ret)
mem_cgroup_cancel_charge_swapin(mem); /* it does !mem check */
static DEFINE_MUTEX(set_limit_mutex);
static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
int retry_count;
int children = mem_cgroup_count_children(memcg);
u64 curusage, oldusage;
/*
* For keeping hierarchical_reclaim simple, how long we should retry
* is depends on callers. We set our retry-count to be function
* of # of children which we should visit in this loop.
*/
retry_count = MEM_CGROUP_RECLAIM_RETRIES * children;
oldusage = res_counter_read_u64(&memcg->res, RES_USAGE);
if (signal_pending(current)) {
ret = -EINTR;
break;
}
/*
* Rather than hide all in some function, I do this in
* open coded manner. You see what this really does.
* We have to guarantee mem->res.limit < mem->memsw.limit.
*/
mutex_lock(&set_limit_mutex);
memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
if (memswlimit < val) {
ret = -EINVAL;
mutex_unlock(&set_limit_mutex);
memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT);
if (memlimit < val)
enlarge = 1;
ret = res_counter_set_limit(&memcg->res, val);
if (!ret) {
if (memswlimit == val)
memcg->memsw_is_minimum = true;
else
memcg->memsw_is_minimum = false;
}
mutex_unlock(&set_limit_mutex);
if (!ret)
break;
mem_cgroup_hierarchical_reclaim(memcg, NULL, GFP_KERNEL,
MEM_CGROUP_RECLAIM_SHRINK);
curusage = res_counter_read_u64(&memcg->res, RES_USAGE);
/* Usage is reduced ? */
if (curusage >= oldusage)
retry_count--;
else
oldusage = curusage;
if (!ret && enlarge)
memcg_oom_recover(memcg);
static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
unsigned long long val)
int retry_count;
u64 memlimit, memswlimit, oldusage, curusage;
int children = mem_cgroup_count_children(memcg);
int ret = -EBUSY;
/* see mem_cgroup_resize_res_limit */
retry_count = children * MEM_CGROUP_RECLAIM_RETRIES;
oldusage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
while (retry_count) {
if (signal_pending(current)) {
ret = -EINTR;
break;
}
/*
* Rather than hide all in some function, I do this in
* open coded manner. You see what this really does.
* We have to guarantee mem->res.limit < mem->memsw.limit.
*/
mutex_lock(&set_limit_mutex);
memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT);
if (memlimit > val) {
ret = -EINVAL;
mutex_unlock(&set_limit_mutex);
break;
}
memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
if (memswlimit < val)
enlarge = 1;
ret = res_counter_set_limit(&memcg->memsw, val);
if (!ret) {
if (memlimit == val)
memcg->memsw_is_minimum = true;
else
memcg->memsw_is_minimum = false;
}
mutex_unlock(&set_limit_mutex);
if (!ret)
break;
mem_cgroup_hierarchical_reclaim(memcg, NULL, GFP_KERNEL,
MEM_CGROUP_RECLAIM_NOSWAP |
MEM_CGROUP_RECLAIM_SHRINK);
curusage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
/* Usage is reduced ? */
else
oldusage = curusage;
if (!ret && enlarge)
memcg_oom_recover(memcg);
unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
gfp_t gfp_mask, int nid,
int zid)
{
unsigned long nr_reclaimed = 0;
struct mem_cgroup_per_zone *mz, *next_mz = NULL;
unsigned long reclaimed;
int loop = 0;
struct mem_cgroup_tree_per_zone *mctz;
unsigned long long excess;
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
if (order > 0)
return 0;
mctz = soft_limit_tree_node_zone(nid, zid);
/*
* This loop can run a while, specially if mem_cgroup's continuously
* keep exceeding their soft limit and putting the system under
* pressure
*/
do {
if (next_mz)
mz = next_mz;
else
mz = mem_cgroup_largest_soft_limit_node(mctz);
if (!mz)
break;
reclaimed = mem_cgroup_hierarchical_reclaim(mz->mem, zone,
gfp_mask,
MEM_CGROUP_RECLAIM_SOFT);
nr_reclaimed += reclaimed;
spin_lock(&mctz->lock);
/*
* If we failed to reclaim anything from this memory cgroup
* it is time to move on to the next cgroup
*/
next_mz = NULL;
if (!reclaimed) {
do {
/*
* Loop until we find yet another one.
*
* By the time we get the soft_limit lock
* again, someone might have aded the
* group back on the RB tree. Iterate to
* make sure we get a different mem.
* mem_cgroup_largest_soft_limit_node returns
* NULL if no other cgroup is present on
* the tree
*/
next_mz =
__mem_cgroup_largest_soft_limit_node(mctz);
if (next_mz == mz) {
css_put(&next_mz->mem->css);
next_mz = NULL;
} else /* next_mz == NULL or other memcg */
break;
} while (1);
}
__mem_cgroup_remove_exceeded(mz->mem, mz, mctz);
excess = res_counter_soft_limit_excess(&mz->mem->res);
/*
* One school of thought says that we should not add
* back the node to the tree if reclaim returns 0.
* But our reclaim could return 0, simply because due
* to priority we are exposing a smaller subset of
* memory to reclaim from. Consider this as a longer
* term TODO.
*/
/* If excess == 0, no tree ops */
__mem_cgroup_insert_exceeded(mz->mem, mz, mctz, excess);
spin_unlock(&mctz->lock);
css_put(&mz->mem->css);
loop++;
/*
* Could not reclaim anything and there are no more
* mem cgroups to try or we seem to be looping without
* reclaiming anything.
*/
if (!nr_reclaimed &&
(next_mz == NULL ||
loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS))
break;
} while (!nr_reclaimed);
if (next_mz)
css_put(&next_mz->mem->css);
return nr_reclaimed;
}

KAMEZAWA Hiroyuki
committed
/*
* This routine traverse page_cgroup in given list and drop them all.
* *And* this routine doesn't reclaim page itself, just removes page_cgroup.
*/
static int mem_cgroup_force_empty_list(struct mem_cgroup *mem,

KAMEZAWA Hiroyuki
committed
{
struct zone *zone;
struct mem_cgroup_per_zone *mz;
struct page_cgroup *pc, *busy;

KAMEZAWA Hiroyuki
committed
struct list_head *list;

KAMEZAWA Hiroyuki
committed
zone = &NODE_DATA(node)->node_zones[zid];
mz = mem_cgroup_zoneinfo(mem, node, zid);
list = &mz->lists[lru];

KAMEZAWA Hiroyuki
committed
loop = MEM_CGROUP_ZSTAT(mz, lru);
/* give some margin against EBUSY etc...*/
loop += 256;
busy = NULL;
while (loop--) {
ret = 0;
if (list_empty(list)) {
spin_unlock_irqrestore(&zone->lru_lock, flags);
}
pc = list_entry(list->prev, struct page_cgroup, lru);
if (busy == pc) {
list_move(&pc->lru, list);
busy = NULL;
spin_unlock_irqrestore(&zone->lru_lock, flags);
spin_unlock_irqrestore(&zone->lru_lock, flags);
ret = mem_cgroup_move_parent(pc, mem, GFP_KERNEL);
if (ret == -ENOMEM)
if (ret == -EBUSY || ret == -EINVAL) {
/* found lock contention or "pc" is obsolete. */
busy = pc;
cond_resched();
} else
busy = NULL;

KAMEZAWA Hiroyuki
committed
}
if (!ret && !list_empty(list))
return -EBUSY;
return ret;

KAMEZAWA Hiroyuki
committed
}
/*
* make mem_cgroup's charge to be 0 if there is no task.
* This enables deleting this mem_cgroup.
*/
static int mem_cgroup_force_empty(struct mem_cgroup *mem, bool free_all)

KAMEZAWA Hiroyuki
committed
{
int ret;
int node, zid, shrink;
int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
struct cgroup *cgrp = mem->css.cgroup;

KAMEZAWA Hiroyuki
committed
css_get(&mem->css);
/* should free all ? */
if (free_all)
goto try_to_free;
if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children))
goto out;
ret = -EINTR;
if (signal_pending(current))

KAMEZAWA Hiroyuki
committed
goto out;
/* This is for making all *used* pages to be on LRU. */
lru_add_drain_all();
drain_all_stock_sync();