Newer
Older
move_lock_mem_cgroup(from, &flags);
/* Update mapped_file data for mem_cgroup */
preempt_disable();
__this_cpu_dec(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
__this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
preempt_enable();
mem_cgroup_charge_statistics(from, anon, -nr_pages);
/* caller should have done css_get */
mem_cgroup_charge_statistics(to, anon, nr_pages);
move_unlock_mem_cgroup(from, &flags);
ret = 0;
unlock:
/*
* check events
*/
memcg_check_events(to, page);
memcg_check_events(from, page);
return ret;
}
/**
* mem_cgroup_move_parent - moves page to the parent group
* @page: the page to move
* @pc: page_cgroup of the page
* @child: page's cgroup
*
* move charges to its parent or the root cgroup if the group has no
* parent (aka use_hierarchy==0).
* Although this might fail (get_page_unless_zero, isolate_lru_page or
* mem_cgroup_move_account fails) the failure is always temporary and
* it signals a race with a page removal/uncharge or migration. In the
* first case the page is on the way out and it will vanish from the LRU
* on the next attempt and the call should be retried later.
* Isolation from the LRU fails only if page has been isolated from
* the LRU since we looked at it and that usually means either global
* reclaim or migration going on. The page will either get back to the
* LRU or vanish.
* Finaly mem_cgroup_move_account fails only if the page got uncharged
* (!PageCgroupUsed) or moved to a different group. The page will
* disappear in the next attempt.
static int mem_cgroup_move_parent(struct page *page,
struct page_cgroup *pc,
struct mem_cgroup *child)
{
struct mem_cgroup *parent;
unsigned int nr_pages;
unsigned long uninitialized_var(flags);
VM_BUG_ON(mem_cgroup_is_root(child));
ret = -EBUSY;
if (!get_page_unless_zero(page))
goto out;
if (isolate_lru_page(page))
goto put;
nr_pages = hpage_nr_pages(page);
parent = parent_mem_cgroup(child);
/*
* If no parent, move charges to root cgroup.
*/
if (!parent)
parent = root_mem_cgroup;
if (nr_pages > 1) {
VM_BUG_ON(!PageTransHuge(page));
flags = compound_lock_irqsave(page);
ret = mem_cgroup_move_account(page, nr_pages,
pc, child, parent);
if (!ret)
__mem_cgroup_cancel_local_charge(child, nr_pages);
if (nr_pages > 1)
compound_unlock_irqrestore(page, flags);
return ret;
}
/*
* Charge the memory controller for page usage.
* Return
* 0 if the charge was successful
* < 0 if the cgroup is over its limit
*/
static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
gfp_t gfp_mask, enum charge_type ctype)
unsigned int nr_pages = 1;
nr_pages <<= compound_order(page);
/*
* Never OOM-kill a process for a huge page. The
* fault handler will fall back to regular pages.
*/
oom = false;
ret = __mem_cgroup_try_charge(mm, gfp_mask, nr_pages, &memcg, oom);
return ret;
__mem_cgroup_commit_charge(memcg, page, nr_pages, ctype, false);
int mem_cgroup_newpage_charge(struct page *page,
struct mm_struct *mm, gfp_t gfp_mask)
{
VM_BUG_ON(page_mapped(page));
VM_BUG_ON(page->mapping && !PageAnon(page));
VM_BUG_ON(!mm);
return mem_cgroup_charge_common(page, mm, gfp_mask,

Kamezawa Hiroyuki
committed
MEM_CGROUP_CHARGE_TYPE_ANON);
}
/*
* While swap-in, try_charge -> commit or cancel, the page is locked.
* And when try_charge() successfully returns, one refcnt to memcg without
* struct page_cgroup is acquired. This refcnt will be consumed by
* "commit()" or removed by "cancel()"
*/
static int __mem_cgroup_try_charge_swapin(struct mm_struct *mm,
struct page *page,
gfp_t mask,
struct mem_cgroup **memcgp)
struct page_cgroup *pc;
pc = lookup_page_cgroup(page);
/*
* Every swap fault against a single page tries to charge the
* page, bail as early as possible. shmem_unuse() encounters
* already charged pages, too. The USED bit is protected by
* the page lock, which serializes swap cache removal, which
* in turn serializes uncharging.
*/
if (PageCgroupUsed(pc))
return 0;
if (!do_swap_account)
goto charge_cur_mm;
memcg = try_get_mem_cgroup_from_page(page);
if (!memcg)
*memcgp = memcg;
ret = __mem_cgroup_try_charge(NULL, mask, 1, memcgp, true);
if (ret == -EINTR)
ret = 0;
ret = __mem_cgroup_try_charge(mm, mask, 1, memcgp, true);
if (ret == -EINTR)
ret = 0;
return ret;
int mem_cgroup_try_charge_swapin(struct mm_struct *mm, struct page *page,
gfp_t gfp_mask, struct mem_cgroup **memcgp)
{
*memcgp = NULL;
if (mem_cgroup_disabled())
return 0;
/*
* A racing thread's fault, or swapoff, may have already
* updated the pte, and even removed page from swap cache: in
* those cases unuse_pte()'s pte_same() test will fail; but
* there's also a KSM case which does need to charge the page.
*/
if (!PageSwapCache(page)) {
int ret;
ret = __mem_cgroup_try_charge(mm, gfp_mask, 1, memcgp, true);
if (ret == -EINTR)
ret = 0;
return ret;
}
return __mem_cgroup_try_charge_swapin(mm, page, gfp_mask, memcgp);
}
void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *memcg)
{
if (mem_cgroup_disabled())
return;
if (!memcg)
return;
__mem_cgroup_cancel_charge(memcg, 1);
}
__mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *memcg,
if (!memcg)
__mem_cgroup_commit_charge(memcg, page, 1, ctype, true);
/*
* Now swap is on-memory. This means this page may be
* counted both as mem and swap....double count.
* Fix it by uncharging from memsw. Basically, this SwapCache is stable
* under lock_page(). But in do_swap_page()::memory.c, reuse_swap_page()
* may call delete_from_swap_cache() before reach here.
if (do_swap_account && PageSwapCache(page)) {
swp_entry_t ent = {.val = page_private(page)};
mem_cgroup_uncharge_swap(ent);
void mem_cgroup_commit_charge_swapin(struct page *page,
struct mem_cgroup *memcg)
__mem_cgroup_commit_charge_swapin(page, memcg,

Kamezawa Hiroyuki
committed
MEM_CGROUP_CHARGE_TYPE_ANON);
int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
gfp_t gfp_mask)
struct mem_cgroup *memcg = NULL;
enum charge_type type = MEM_CGROUP_CHARGE_TYPE_CACHE;
int ret;
return 0;
if (PageCompound(page))
return 0;
if (!PageSwapCache(page))
ret = mem_cgroup_charge_common(page, mm, gfp_mask, type);
else { /* page is swapcache/shmem */
ret = __mem_cgroup_try_charge_swapin(mm, page,
gfp_mask, &memcg);
if (!ret)
__mem_cgroup_commit_charge_swapin(page, memcg, type);
}
return ret;
static void mem_cgroup_do_uncharge(struct mem_cgroup *memcg,
unsigned int nr_pages,
const enum charge_type ctype)
{
struct memcg_batch_info *batch = NULL;
bool uncharge_memsw = true;
/* If swapout, usage of swap doesn't decrease */
if (!do_swap_account || ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
uncharge_memsw = false;
batch = ¤t->memcg_batch;
/*
* In usual, we do css_get() when we remember memcg pointer.
* But in this case, we keep res->usage until end of a series of
* uncharges. Then, it's ok to ignore memcg's refcnt.
*/
if (!batch->memcg)
/*
* do_batch > 0 when unmapping pages or inode invalidate/truncate.
* In those cases, all pages freed continuously can be expected to be in
* the same cgroup and we have chance to coalesce uncharges.
* But we do uncharge one by one if this is killed by OOM(TIF_MEMDIE)
* because we want to do uncharge as soon as possible.
*/
if (!batch->do_batch || test_thread_flag(TIF_MEMDIE))
goto direct_uncharge;
if (nr_pages > 1)
/*
* In typical case, batch->memcg == mem. This means we can
* merge a series of uncharges to an uncharge of res_counter.
* If not, we uncharge res_counter ony by one.
*/
goto direct_uncharge;
/* remember freed charge and uncharge it later */
batch->nr_pages++;
batch->memsw_nr_pages++;
return;
direct_uncharge:
res_counter_uncharge(&memcg->res, nr_pages * PAGE_SIZE);
res_counter_uncharge(&memcg->memsw, nr_pages * PAGE_SIZE);
if (unlikely(batch->memcg != memcg))
memcg_oom_recover(memcg);
* uncharge if !page_mapped(page)
__mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype,
bool end_migration)
unsigned int nr_pages = 1;
struct page_cgroup *pc;
VM_BUG_ON(PageSwapCache(page));
nr_pages <<= compound_order(page);
* Check if our page_cgroup is valid
pc = lookup_page_cgroup(page);
if (unlikely(!PageCgroupUsed(pc)))
if (!PageCgroupUsed(pc))
goto unlock_out;
anon = PageAnon(page);

Kamezawa Hiroyuki
committed
case MEM_CGROUP_CHARGE_TYPE_ANON:
/*
* Generally PageAnon tells if it's the anon statistics to be
* updated; but sometimes e.g. mem_cgroup_uncharge_page() is
* used before page reached the stage of being marked PageAnon.
*/
anon = true;
/* fallthrough */
/* See mem_cgroup_prepare_migration() */
if (page_mapped(page))
goto unlock_out;
/*
* Pages under migration may not be uncharged. But
* end_migration() /must/ be the one uncharging the
* unused post-migration page and so it has to call
* here with the migration bit still set. See the
* res_counter handling below.
*/
if (!end_migration && PageCgroupMigration(pc))
goto unlock_out;
break;
case MEM_CGROUP_CHARGE_TYPE_SWAPOUT:
if (!PageAnon(page)) { /* Shared memory */
if (page->mapping && !page_is_file_cache(page))
goto unlock_out;
} else if (page_mapped(page)) /* Anon */
goto unlock_out;
break;
default:
break;
mem_cgroup_charge_statistics(memcg, anon, -nr_pages);
/*
* pc->mem_cgroup is not cleared here. It will be accessed when it's
* freed from LRU. This is safe because uncharged page is expected not
* to be reused (freed soon). Exception is SwapCache, it's handled by
* special functions.
*/
* even after unlock, we have memcg->res.usage here and this memcg
if (do_swap_account && ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) {
mem_cgroup_swap_statistics(memcg, true);
mem_cgroup_get(memcg);
/*
* Migration does not charge the res_counter for the
* replacement page, so leave it alone when phasing out the
* page that is unused after the migration.
*/
if (!end_migration && !mem_cgroup_is_root(memcg))
mem_cgroup_do_uncharge(memcg, nr_pages, ctype);

KAMEZAWA Hiroyuki
committed
unlock_out:
unlock_page_cgroup(pc);
void mem_cgroup_uncharge_page(struct page *page)
{
/* early check. */
if (page_mapped(page))
return;
VM_BUG_ON(page->mapping && !PageAnon(page));
if (PageSwapCache(page))
return;
__mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_ANON, false);
}
void mem_cgroup_uncharge_cache_page(struct page *page)
{
VM_BUG_ON(page_mapped(page));
VM_BUG_ON(page->mapping);
__mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE, false);
/*
* Batch_start/batch_end is called in unmap_page_range/invlidate/trucate.
* In that cases, pages are freed continuously and we can expect pages
* are in the same memcg. All these calls itself limits the number of
* pages freed at once, then uncharge_start/end() is called properly.
* This may be called prural(2) times in a context,
*/
void mem_cgroup_uncharge_start(void)
{
current->memcg_batch.do_batch++;
/* We can do nest. */
if (current->memcg_batch.do_batch == 1) {
current->memcg_batch.memcg = NULL;
current->memcg_batch.nr_pages = 0;
current->memcg_batch.memsw_nr_pages = 0;
}
}
void mem_cgroup_uncharge_end(void)
{
struct memcg_batch_info *batch = ¤t->memcg_batch;
if (!batch->do_batch)
return;
batch->do_batch--;
if (batch->do_batch) /* If stacked, do nothing. */
return;
if (!batch->memcg)
return;
/*
* This "batch->memcg" is valid without any css_get/put etc...
* bacause we hide charges behind us.
*/
if (batch->nr_pages)
res_counter_uncharge(&batch->memcg->res,
batch->nr_pages * PAGE_SIZE);
if (batch->memsw_nr_pages)
res_counter_uncharge(&batch->memcg->memsw,
batch->memsw_nr_pages * PAGE_SIZE);
memcg_oom_recover(batch->memcg);
/* forget this pointer (for sanity check) */
batch->memcg = NULL;
}
#ifdef CONFIG_SWAP
* called after __delete_from_swap_cache() and drop "page" account.
* memcg information is recorded to swap_cgroup of "ent"
*/
void
mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout)
int ctype = MEM_CGROUP_CHARGE_TYPE_SWAPOUT;
if (!swapout) /* this was a swap cache but the swap is unused ! */
ctype = MEM_CGROUP_CHARGE_TYPE_DROP;
memcg = __mem_cgroup_uncharge_common(page, ctype, false);
/*
* record memcg information, if swapout && memcg != NULL,
* mem_cgroup_get() was called in uncharge().
*/
if (do_swap_account && swapout && memcg)
swap_cgroup_record(ent, css_id(&memcg->css));
#endif
/*
* called from swap_entry_free(). remove record in swap_cgroup and
* uncharge "memsw" account.
*/
void mem_cgroup_uncharge_swap(swp_entry_t ent)
unsigned short id;
if (!do_swap_account)
return;
id = swap_cgroup_record(ent, 0);
rcu_read_lock();
memcg = mem_cgroup_lookup(id);
/*
* We uncharge this because swap is freed.
* This memcg can be obsolete one. We avoid calling css_tryget
*/
if (!mem_cgroup_is_root(memcg))

KAMEZAWA Hiroyuki
committed
res_counter_uncharge(&memcg->memsw, PAGE_SIZE);
mem_cgroup_swap_statistics(memcg, false);
rcu_read_unlock();
/**
* mem_cgroup_move_swap_account - move swap charge and swap_cgroup's record.
* @entry: swap entry to be moved
* @from: mem_cgroup which the entry is moved from
* @to: mem_cgroup which the entry is moved to
*
* It succeeds only when the swap_cgroup's record for this entry is the same
* as the mem_cgroup's id of @from.
*
* Returns 0 on success, -EINVAL on failure.
*
* The caller must have charged to @to, IOW, called res_counter_charge() about
* both res and memsw, and called css_get().
*/
static int mem_cgroup_move_swap_account(swp_entry_t entry,
struct mem_cgroup *from, struct mem_cgroup *to)
{
unsigned short old_id, new_id;
old_id = css_id(&from->css);
new_id = css_id(&to->css);
if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) {
mem_cgroup_swap_statistics(from, false);
mem_cgroup_swap_statistics(to, true);
* This function is only called from task migration context now.
* It postpones res_counter and refcount handling till the end
* of task migration(mem_cgroup_clear_mc()) for performance
* improvement. But we cannot postpone mem_cgroup_get(to)
* because if the process that has been moved to @to does
* swap-in, the refcount of @to might be decreased to 0.
*/
mem_cgroup_get(to);
return 0;
}
return -EINVAL;
}
#else
static inline int mem_cgroup_move_swap_account(swp_entry_t entry,
struct mem_cgroup *from, struct mem_cgroup *to)
{
return -EINVAL;
}

KAMEZAWA Hiroyuki
committed
/*
* Before starting migration, account PAGE_SIZE to mem_cgroup that the old
* page belongs to.

KAMEZAWA Hiroyuki
committed
*/
void mem_cgroup_prepare_migration(struct page *page, struct page *newpage,
struct mem_cgroup **memcgp)

KAMEZAWA Hiroyuki
committed
{
unsigned int nr_pages = 1;
struct page_cgroup *pc;
enum charge_type ctype;
*memcgp = NULL;
return;
if (PageTransHuge(page))
nr_pages <<= compound_order(page);
pc = lookup_page_cgroup(page);
lock_page_cgroup(pc);
if (PageCgroupUsed(pc)) {
memcg = pc->mem_cgroup;
css_get(&memcg->css);
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
/*
* At migrating an anonymous page, its mapcount goes down
* to 0 and uncharge() will be called. But, even if it's fully
* unmapped, migration may fail and this page has to be
* charged again. We set MIGRATION flag here and delay uncharge
* until end_migration() is called
*
* Corner Case Thinking
* A)
* When the old page was mapped as Anon and it's unmap-and-freed
* while migration was ongoing.
* If unmap finds the old page, uncharge() of it will be delayed
* until end_migration(). If unmap finds a new page, it's
* uncharged when it make mapcount to be 1->0. If unmap code
* finds swap_migration_entry, the new page will not be mapped
* and end_migration() will find it(mapcount==0).
*
* B)
* When the old page was mapped but migraion fails, the kernel
* remaps it. A charge for it is kept by MIGRATION flag even
* if mapcount goes down to 0. We can do remap successfully
* without charging it again.
*
* C)
* The "old" page is under lock_page() until the end of
* migration, so, the old page itself will not be swapped-out.
* If the new page is swapped out before end_migraton, our
* hook to usual swap-out path will catch the event.
*/
if (PageAnon(page))
SetPageCgroupMigration(pc);
/*
* If the page is not charged at this point,
* we return here.
*/
return;
*memcgp = memcg;
/*
* We charge new page before it's used/mapped. So, even if unlock_page()
* is called before end_migration, we can catch all events on this new
* page. In the case new page is migrated but not remapped, new page's
* mapcount will be finally 0 and we call uncharge in end_migration().
*/
if (PageAnon(page))

Kamezawa Hiroyuki
committed
ctype = MEM_CGROUP_CHARGE_TYPE_ANON;
ctype = MEM_CGROUP_CHARGE_TYPE_CACHE;
/*
* The page is committed to the memcg, but it's not actually
* charged to the res_counter since we plan on replacing the
* old one and only one page is going to be left afterwards.
*/
__mem_cgroup_commit_charge(memcg, newpage, nr_pages, ctype, false);

KAMEZAWA Hiroyuki
committed
}
/* remove redundant charge if migration failed*/
void mem_cgroup_end_migration(struct mem_cgroup *memcg,
struct page *oldpage, struct page *newpage, bool migration_ok)

KAMEZAWA Hiroyuki
committed
{
struct page *used, *unused;

Tejun Heo
committed
if (!migration_ok) {
used = oldpage;
unused = newpage;
used = newpage;
anon = PageAnon(used);
__mem_cgroup_uncharge_common(unused,
anon ? MEM_CGROUP_CHARGE_TYPE_ANON
: MEM_CGROUP_CHARGE_TYPE_CACHE,
true);
css_put(&memcg->css);
* We disallowed uncharge of pages under migration because mapcount
* of the page goes down to zero, temporarly.
* Clear the flag and check the page should be charged.
pc = lookup_page_cgroup(oldpage);
lock_page_cgroup(pc);
ClearPageCgroupMigration(pc);
unlock_page_cgroup(pc);
* If a page is a file cache, radix-tree replacement is very atomic
* and we can skip this check. When it was an Anon page, its mapcount
* goes down to 0. But because we added MIGRATION flage, it's not
* uncharged yet. There are several case but page->mapcount check
* and USED bit check in mem_cgroup_uncharge_page() will do enough
* check. (see prepare_charge() also)
mem_cgroup_uncharge_page(used);

KAMEZAWA Hiroyuki
committed
}
/*
* At replace page cache, newpage is not under any memcg but it's on
* LRU. So, this function doesn't touch res_counter but handles LRU
* in correct way. Both pages are locked so we cannot race with uncharge.
*/
void mem_cgroup_replace_page_cache(struct page *oldpage,
struct page *newpage)
{
struct mem_cgroup *memcg = NULL;
struct page_cgroup *pc;
enum charge_type type = MEM_CGROUP_CHARGE_TYPE_CACHE;
if (mem_cgroup_disabled())
return;
pc = lookup_page_cgroup(oldpage);
/* fix accounting on old pages */
lock_page_cgroup(pc);
if (PageCgroupUsed(pc)) {
memcg = pc->mem_cgroup;
mem_cgroup_charge_statistics(memcg, false, -1);
ClearPageCgroupUsed(pc);
}
unlock_page_cgroup(pc);
/*
* When called from shmem_replace_page(), in some cases the
* oldpage has already been charged, and in some cases not.
*/
if (!memcg)
return;
/*
* Even if newpage->mapping was NULL before starting replacement,
* the newpage may be on LRU(or pagevec for LRU) already. We lock
* LRU while we overwrite pc->mem_cgroup.
*/
__mem_cgroup_commit_charge(memcg, newpage, 1, type, true);
#ifdef CONFIG_DEBUG_VM
static struct page_cgroup *lookup_page_cgroup_used(struct page *page)
{
struct page_cgroup *pc;
pc = lookup_page_cgroup(page);
/*
* Can be NULL while feeding pages into the page allocator for
* the first time, i.e. during boot or memory hotplug;
* or when mem_cgroup_disabled().
*/
if (likely(pc) && PageCgroupUsed(pc))
return pc;
return NULL;
}
bool mem_cgroup_bad_page_check(struct page *page)
{
if (mem_cgroup_disabled())
return false;
return lookup_page_cgroup_used(page) != NULL;
}
void mem_cgroup_print_bad_page(struct page *page)
{
struct page_cgroup *pc;
pc = lookup_page_cgroup_used(page);
if (pc) {
printk(KERN_ALERT "pc:%p pc->flags:%lx pc->mem_cgroup:%p\n",
pc, pc->flags, pc->mem_cgroup);
}
}
#endif
static DEFINE_MUTEX(set_limit_mutex);
static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
int retry_count;
int children = mem_cgroup_count_children(memcg);
u64 curusage, oldusage;
/*
* For keeping hierarchical_reclaim simple, how long we should retry
* is depends on callers. We set our retry-count to be function
* of # of children which we should visit in this loop.
*/
retry_count = MEM_CGROUP_RECLAIM_RETRIES * children;
oldusage = res_counter_read_u64(&memcg->res, RES_USAGE);
if (signal_pending(current)) {
ret = -EINTR;
break;
}
/*
* Rather than hide all in some function, I do this in
* open coded manner. You see what this really does.

Wanpeng Li
committed
* We have to guarantee memcg->res.limit <= memcg->memsw.limit.
*/
mutex_lock(&set_limit_mutex);
memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
if (memswlimit < val) {
ret = -EINVAL;
mutex_unlock(&set_limit_mutex);
memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT);
if (memlimit < val)
enlarge = 1;
ret = res_counter_set_limit(&memcg->res, val);
if (!ret) {
if (memswlimit == val)
memcg->memsw_is_minimum = true;
else
memcg->memsw_is_minimum = false;
}
mutex_unlock(&set_limit_mutex);
if (!ret)
break;
mem_cgroup_reclaim(memcg, GFP_KERNEL,
MEM_CGROUP_RECLAIM_SHRINK);
curusage = res_counter_read_u64(&memcg->res, RES_USAGE);
/* Usage is reduced ? */
if (curusage >= oldusage)
retry_count--;
else
oldusage = curusage;
if (!ret && enlarge)
memcg_oom_recover(memcg);
static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
unsigned long long val)
int retry_count;
u64 memlimit, memswlimit, oldusage, curusage;
int children = mem_cgroup_count_children(memcg);
int ret = -EBUSY;
/* see mem_cgroup_resize_res_limit */
retry_count = children * MEM_CGROUP_RECLAIM_RETRIES;
oldusage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
while (retry_count) {
if (signal_pending(current)) {
ret = -EINTR;
break;
}
/*
* Rather than hide all in some function, I do this in
* open coded manner. You see what this really does.

Wanpeng Li
committed
* We have to guarantee memcg->res.limit <= memcg->memsw.limit.
*/
mutex_lock(&set_limit_mutex);
memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT);
if (memlimit > val) {
ret = -EINVAL;
mutex_unlock(&set_limit_mutex);
break;
}
memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
if (memswlimit < val)
enlarge = 1;
ret = res_counter_set_limit(&memcg->memsw, val);
if (!ret) {
if (memlimit == val)
memcg->memsw_is_minimum = true;
else
memcg->memsw_is_minimum = false;
}
mutex_unlock(&set_limit_mutex);
if (!ret)
break;
mem_cgroup_reclaim(memcg, GFP_KERNEL,
MEM_CGROUP_RECLAIM_NOSWAP |
MEM_CGROUP_RECLAIM_SHRINK);
curusage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
/* Usage is reduced ? */
else
oldusage = curusage;
if (!ret && enlarge)
memcg_oom_recover(memcg);
unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
gfp_t gfp_mask,
unsigned long *total_scanned)
{
unsigned long nr_reclaimed = 0;
struct mem_cgroup_per_zone *mz, *next_mz = NULL;
unsigned long reclaimed;
int loop = 0;
struct mem_cgroup_tree_per_zone *mctz;
unsigned long long excess;
unsigned long nr_scanned;
if (order > 0)
return 0;
mctz = soft_limit_tree_node_zone(zone_to_nid(zone), zone_idx(zone));
/*
* This loop can run a while, specially if mem_cgroup's continuously
* keep exceeding their soft limit and putting the system under
* pressure
*/
do {
if (next_mz)
mz = next_mz;
else
mz = mem_cgroup_largest_soft_limit_node(mctz);
if (!mz)
break;
nr_scanned = 0;
reclaimed = mem_cgroup_soft_reclaim(mz->memcg, zone,
gfp_mask, &nr_scanned);
nr_reclaimed += reclaimed;
*total_scanned += nr_scanned;
3960
3961
3962
3963
3964
3965
3966
3967
3968
3969
3970
3971
3972
3973
3974
3975
3976
3977
3978
3979
3980
3981
spin_lock(&mctz->lock);
/*
* If we failed to reclaim anything from this memory cgroup
* it is time to move on to the next cgroup
*/
next_mz = NULL;
if (!reclaimed) {
do {
/*
* Loop until we find yet another one.
*
* By the time we get the soft_limit lock
* again, someone might have aded the
* group back on the RB tree. Iterate to
* make sure we get a different mem.
* mem_cgroup_largest_soft_limit_node returns
* NULL if no other cgroup is present on
* the tree
*/
next_mz =
__mem_cgroup_largest_soft_limit_node(mctz);

Michal Hocko
committed
if (next_mz == mz)
css_put(&next_mz->memcg->css);

Michal Hocko
committed
else /* next_mz == NULL or other memcg */
break;
} while (1);
}
__mem_cgroup_remove_exceeded(mz->memcg, mz, mctz);
excess = res_counter_soft_limit_excess(&mz->memcg->res);
/*
* One school of thought says that we should not add
* back the node to the tree if reclaim returns 0.
* But our reclaim could return 0, simply because due
* to priority we are exposing a smaller subset of
* memory to reclaim from. Consider this as a longer
* term TODO.
*/
/* If excess == 0, no tree ops */
__mem_cgroup_insert_exceeded(mz->memcg, mz, mctz, excess);
spin_unlock(&mctz->lock);