Newer
Older

Andrew Morton
committed
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
int loop = 0;
unsigned long excess;
unsigned long nr_scanned;
struct mem_cgroup_reclaim_cookie reclaim = {
.zone = zone,
.priority = 0,
};
excess = res_counter_soft_limit_excess(&root_memcg->res) >> PAGE_SHIFT;
while (1) {
victim = mem_cgroup_iter(root_memcg, victim, &reclaim);
if (!victim) {
loop++;
if (loop >= 2) {
/*
* If we have not been able to reclaim
* anything, it might because there are
* no reclaimable pages under this hierarchy
*/
if (!total)
break;
/*
* We want to do more targeted reclaim.
* excess >> 2 is not to excessive so as to
* reclaim too much, nor too less that we keep
* coming back to reclaim from this cgroup
*/
if (total >= (excess >> 2) ||
(loop > MEM_CGROUP_MAX_RECLAIM_LOOPS))
break;
}
continue;
}
if (!mem_cgroup_reclaimable(victim, false))
continue;
total += mem_cgroup_shrink_node_zone(victim, gfp_mask, false,
zone, &nr_scanned);
*total_scanned += nr_scanned;
if (!res_counter_soft_limit_excess(&root_memcg->res))
break;

Andrew Morton
committed
mem_cgroup_iter_break(root_memcg, victim);
return total;
static DEFINE_SPINLOCK(memcg_oom_lock);
/*
* Check OOM-Killer is already running under our hierarchy.
* If someone is running, return false.
*/
static bool mem_cgroup_oom_trylock(struct mem_cgroup *memcg)
struct mem_cgroup *iter, *failed = NULL;
spin_lock(&memcg_oom_lock);
for_each_mem_cgroup_tree(iter, memcg) {
/*
* this subtree of our hierarchy is already locked
* so we cannot give a lock.
*/
failed = iter;
mem_cgroup_iter_break(memcg, iter);
break;
} else
iter->oom_lock = true;
if (failed) {
/*
* OK, we failed to lock the whole subtree so we have
* to clean up what we set up to the failing subtree
*/
for_each_mem_cgroup_tree(iter, memcg) {
if (iter == failed) {
mem_cgroup_iter_break(memcg, iter);
break;
}
iter->oom_lock = false;
spin_unlock(&memcg_oom_lock);
return !failed;
static void mem_cgroup_oom_unlock(struct mem_cgroup *memcg)
spin_lock(&memcg_oom_lock);
for_each_mem_cgroup_tree(iter, memcg)
iter->oom_lock = false;
spin_unlock(&memcg_oom_lock);
static void mem_cgroup_mark_under_oom(struct mem_cgroup *memcg)
{
struct mem_cgroup *iter;
for_each_mem_cgroup_tree(iter, memcg)
atomic_inc(&iter->under_oom);
}
/*
 * Decrement the under_oom counter of every cgroup that
 * for_each_mem_cgroup_tree() visits for @memcg, never going below zero.
 */
static void mem_cgroup_unmark_under_oom(struct mem_cgroup *memcg)
{
	struct mem_cgroup *iter;

	/*
	 * When a new child is created while the hierarchy is under oom,
	 * it may never have been marked, so its counter can still be 0.
	 * atomic_add_unless() skips the decrement in that case.
	 */
	for_each_mem_cgroup_tree(iter, memcg)
		atomic_add_unless(&iter->under_oom, -1, 0);
}
static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq);
/*
 * Wait-queue entry used on memcg_oom_waitq. It carries the memcg the
 * sleeper is waiting on so that the wake callback can filter wakeups.
 */
struct oom_wait_info {
	struct mem_cgroup *memcg;
	wait_queue_t wait;
};
/*
 * Wake callback installed on entries of memcg_oom_waitq.
 *
 * @arg is the memcg passed to __wake_up(). The sleeper is only woken when
 * its memcg and the waking memcg are the same or one is in the other's
 * subtree; otherwise 0 is returned and the entry stays queued.
 */
static int memcg_oom_wake_function(wait_queue_t *wait,
	unsigned mode, int sync, void *arg)
{
	struct mem_cgroup *wake_memcg = (struct mem_cgroup *)arg;
	struct mem_cgroup *oom_wait_memcg;
	struct oom_wait_info *oom_wait_info;

	oom_wait_info = container_of(wait, struct oom_wait_info, wait);
	oom_wait_memcg = oom_wait_info->memcg;

	/*
	 * Both of oom_wait_info->memcg and wake_memcg are stable under us.
	 * Then we can do the ancestry checks without taking care of RCU.
	 */
	if (!mem_cgroup_same_or_subtree(oom_wait_memcg, wake_memcg)
		&& !mem_cgroup_same_or_subtree(wake_memcg, oom_wait_memcg))
		return 0;
	return autoremove_wake_function(wait, mode, sync, arg);
}
static void memcg_wakeup_oom(struct mem_cgroup *memcg)
atomic_inc(&memcg->oom_wakeups);
/* for filtering, pass "memcg" as argument. */
__wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg);
static void memcg_oom_recover(struct mem_cgroup *memcg)
if (memcg && atomic_read(&memcg->under_oom))
memcg_wakeup_oom(memcg);
* try to call OOM killer
static void mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order)
int wakeups;
if (!current->memcg_oom.may_oom)
return;
current->memcg_oom.in_memcg_oom = 1;
* As with any blocking lock, a contender needs to start
* listening for wakeups before attempting the trylock,
* otherwise it can miss the wakeup from the unlock and sleep
* indefinitely. This is just open-coded because our locking
* is so particular to memcg hierarchies.
wakeups = atomic_read(&memcg->oom_wakeups);
mem_cgroup_mark_under_oom(memcg);
locked = mem_cgroup_oom_trylock(memcg);
if (locked && !memcg->oom_kill_disable) {
mem_cgroup_unmark_under_oom(memcg);
mem_cgroup_out_of_memory(memcg, mask, order);
mem_cgroup_oom_unlock(memcg);
/*
* There is no guarantee that an OOM-lock contender
* sees the wakeups triggered by the OOM kill
* uncharges. Wake any sleepers explicitely.
*/
memcg_oom_recover(memcg);
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
/*
* A system call can just return -ENOMEM, but if this
* is a page fault and somebody else is handling the
* OOM already, we need to sleep on the OOM waitqueue
* for this memcg until the situation is resolved.
* Which can take some time because it might be
* handled by a userspace task.
*
* However, this is the charge context, which means
* that we may sit on a large call stack and hold
* various filesystem locks, the mmap_sem etc. and we
* don't want the OOM handler to deadlock on them
* while we sit here and wait. Store the current OOM
* context in the task_struct, then return -ENOMEM.
* At the end of the page fault handler, with the
* stack unwound, pagefault_out_of_memory() will check
* back with us by calling
* mem_cgroup_oom_synchronize(), possibly putting the
* task to sleep.
*/
current->memcg_oom.oom_locked = locked;
current->memcg_oom.wakeups = wakeups;
css_get(&memcg->css);
current->memcg_oom.wait_on_memcg = memcg;
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
}
/**
* mem_cgroup_oom_synchronize - complete memcg OOM handling
*
* This has to be called at the end of a page fault if the the memcg
* OOM handler was enabled and the fault is returning %VM_FAULT_OOM.
*
* Memcg supports userspace OOM handling, so failed allocations must
* sleep on a waitqueue until the userspace task resolves the
* situation. Sleeping directly in the charge context with all kinds
* of locks held is not a good idea, instead we remember an OOM state
* in the task and mem_cgroup_oom_synchronize() has to be called at
* the end of the page fault to put the task to sleep and clean up the
* OOM state.
*
* Returns %true if an ongoing memcg OOM situation was detected and
* finalized, %false otherwise.
*/
bool mem_cgroup_oom_synchronize(void)
{
struct oom_wait_info owait;
struct mem_cgroup *memcg;
/* OOM is global, do not handle */
if (!current->memcg_oom.in_memcg_oom)
return false;
/*
* We invoked the OOM killer but there is a chance that a kill
* did not free up any charges. Everybody else might already
* be sleeping, so restart the fault and keep the rampage
* going until some charges are released.
*/
memcg = current->memcg_oom.wait_on_memcg;
if (!memcg)
goto out;
if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current))
goto out_memcg;
owait.memcg = memcg;
owait.wait.flags = 0;
owait.wait.func = memcg_oom_wake_function;
owait.wait.private = current;
INIT_LIST_HEAD(&owait.wait.task_list);
prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE);
/* Only sleep if we didn't miss any wakeups since OOM */
if (atomic_read(&memcg->oom_wakeups) == current->memcg_oom.wakeups)
schedule();
finish_wait(&memcg_oom_waitq, &owait.wait);
out_memcg:
mem_cgroup_unmark_under_oom(memcg);
if (current->memcg_oom.oom_locked) {
mem_cgroup_oom_unlock(memcg);
/*
* There is no guarantee that an OOM-lock contender
* sees the wakeups triggered by the OOM kill
* uncharges. Wake any sleepers explicitely.
*/
memcg_oom_recover(memcg);
}
css_put(&memcg->css);
current->memcg_oom.wait_on_memcg = NULL;
out:
current->memcg_oom.in_memcg_oom = 0;
/*
* Currently used to update mapped file statistics, but the routine can be
* generalized to update other statistics as well.

KAMEZAWA Hiroyuki
committed
*
* Notes: Race condition
*
* We usually use page_cgroup_lock() for accessing page_cgroup member but
* it tends to be costly. But considering some conditions, we doesn't need
* to do so _always_.
*
* Considering "charge", lock_page_cgroup() is not required because all
* file-stat operations happen after a page is attached to radix-tree. There
* are no race with "charge".
*
* Considering "uncharge", we know that memcg doesn't clear pc->mem_cgroup
* at "uncharge" intentionally. So, we always see valid pc->mem_cgroup even
* if there are race with "uncharge". Statistics itself is properly handled
* by flags.
*
* Considering "move", this is an only case we see a race. To make the race
* small, we check mm->moving_account and detect there are possibility of race
* If there is, we take a lock.
void __mem_cgroup_begin_update_page_stat(struct page *page,
bool *locked, unsigned long *flags)
{
struct mem_cgroup *memcg;
struct page_cgroup *pc;
pc = lookup_page_cgroup(page);
again:
memcg = pc->mem_cgroup;
if (unlikely(!memcg || !PageCgroupUsed(pc)))
return;
/*
* If this memory cgroup is not under account moving, we don't

Wanpeng Li
committed
* need to take move_lock_mem_cgroup(). Because we already hold
* rcu_read_lock(), any calls to move_account will be delayed until
* rcu_read_unlock() if mem_cgroup_stolen() == true.
return;
move_lock_mem_cgroup(memcg, flags);
if (memcg != pc->mem_cgroup || !PageCgroupUsed(pc)) {
move_unlock_mem_cgroup(memcg, flags);
goto again;
}
*locked = true;
}
/*
 * Release the move lock of the memcg that @page's page_cgroup points to.
 * @flags is handed to move_unlock_mem_cgroup() unchanged.
 */
void __mem_cgroup_end_update_page_stat(struct page *page, unsigned long *flags)
{
	struct page_cgroup *page_cg;

	page_cg = lookup_page_cgroup(page);

	/*
	 * pc->mem_cgroup is guaranteed not to change while the lock is
	 * held, because any routine that modifies pc->mem_cgroup has to
	 * take move_lock_mem_cgroup() first.
	 */
	move_unlock_mem_cgroup(page_cg->mem_cgroup, flags);
}
void mem_cgroup_update_page_stat(struct page *page,
enum mem_cgroup_stat_index idx, int val)

KAMEZAWA Hiroyuki
committed
struct page_cgroup *pc = lookup_page_cgroup(page);
unsigned long uninitialized_var(flags);
if (mem_cgroup_disabled())
VM_BUG_ON(!rcu_read_lock_held());
memcg = pc->mem_cgroup;
if (unlikely(!memcg || !PageCgroupUsed(pc)))
this_cpu_add(memcg->stat->count[idx], val);
/*
 * Size of the first charge attempt, in pages. "32" comes from vmscan.c's
 * magic value.
 * TODO: a larger batch may be needed on very large machines.
 */
#define CHARGE_BATCH 32U
/*
 * Per-cpu stock of pages that are already charged to one memcg.
 * consume_stock() hands pages out of it and drain_stock() gives whatever
 * is left back to the res_counter.
 */
struct memcg_stock_pcp {
struct mem_cgroup *cached; /* memcg the stock belongs to; never the root cgroup */
/* number of pages currently held in the stock */
unsigned int nr_pages;
/* work item initialised to run drain_local_stock() */
struct work_struct work;
/* bit field; see FLUSHING_CACHED_CHARGE below */
unsigned long flags;
/* bit number in flags: set before a drain is started, cleared once done */
#define FLUSHING_CACHED_CHARGE 0
};
/* One charge stock per cpu; see struct memcg_stock_pcp. */
static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock);
/* Taken by drain_all_stock_async()/drain_all_stock_sync() around drain_all_stock(). */
static DEFINE_MUTEX(percpu_charge_mutex);
/**
* consume_stock: Try to consume stocked charge on this cpu.
* @memcg: memcg to consume from.
* @nr_pages: how many pages to charge.
*
* The charges will only happen if @memcg matches the current cpu's memcg
* stock, and at least @nr_pages are available in that stock. Failure to
* service an allocation will refill the stock.
*
* returns true if successful, false otherwise.
static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
{
struct memcg_stock_pcp *stock;
bool ret = true;
if (nr_pages > CHARGE_BATCH)
return false;
stock = &get_cpu_var(memcg_stock);
if (memcg == stock->cached && stock->nr_pages >= nr_pages)
stock->nr_pages -= nr_pages;
else /* need to call res_counter_charge */
ret = false;
put_cpu_var(memcg_stock);
return ret;
}
/*
* Returns stocks cached in percpu to res_counter and reset cached information.
*/
static void drain_stock(struct memcg_stock_pcp *stock)
{
struct mem_cgroup *old = stock->cached;
if (stock->nr_pages) {
unsigned long bytes = stock->nr_pages * PAGE_SIZE;
res_counter_uncharge(&old->res, bytes);
res_counter_uncharge(&old->memsw, bytes);
stock->nr_pages = 0;
}
stock->cached = NULL;
}
/*
 * Drain this cpu's stock and clear its FLUSHING_CACHED_CHARGE bit.
 *
 * This must be called under preempt disabled or must be called by
 * a thread which is pinned to local cpu. The work_struct argument is
 * not used.
 */
static void drain_local_stock(struct work_struct *dummy)
{
	struct memcg_stock_pcp *stock = &__get_cpu_var(memcg_stock);

	drain_stock(stock);
	clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags);
}
/*
 * Initialise the work item of every possible cpu's stock so that it
 * runs drain_local_stock().
 */
static void __init memcg_stock_init(void)
{
	int cpu;

	for_each_possible_cpu(cpu) {
		struct memcg_stock_pcp *pcp;

		pcp = &per_cpu(memcg_stock, cpu);
		INIT_WORK(&pcp->work, drain_local_stock);
	}
}
/*
* Cache charges(val) which is from res_counter, to local per_cpu area.
* This will be consumed by consume_stock() function, later.
static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
{
struct memcg_stock_pcp *stock = &get_cpu_var(memcg_stock);
if (stock->cached != memcg) { /* reset if necessary */
stock->nr_pages += nr_pages;
put_cpu_var(memcg_stock);
}
/*
* Drains all per-CPU charge caches for given root_memcg resp. subtree
* of the hierarchy under it. sync flag says whether we should block
* until the work is done.
static void drain_all_stock(struct mem_cgroup *root_memcg, bool sync)
int cpu, curcpu;
/* Notify other cpus that system-wide "drain" is running */
get_online_cpus();
curcpu = get_cpu();
for_each_online_cpu(cpu) {
struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
memcg = stock->cached;
if (!memcg || !stock->nr_pages)
if (!mem_cgroup_same_or_subtree(root_memcg, memcg))
if (!test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) {
if (cpu == curcpu)
drain_local_stock(&stock->work);
else
schedule_work_on(cpu, &stock->work);
}
if (!sync)
goto out;
for_each_online_cpu(cpu) {
struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
if (test_bit(FLUSHING_CACHED_CHARGE, &stock->flags))
flush_work(&stock->work);
}
out:
}
/*
* Tries to drain stocked charges in other cpus. This function is asynchronous
* and just put a work per cpu for draining localy on each cpu. Caller can
* expects some charges will be back to res_counter later but cannot wait for
* it.
*/
static void drain_all_stock_async(struct mem_cgroup *root_memcg)
/*
* If someone calls draining, avoid adding more kworker runs.
*/
if (!mutex_trylock(&percpu_charge_mutex))
return;
drain_all_stock(root_memcg, false);
mutex_unlock(&percpu_charge_mutex);
}
/*
 * This is a synchronous drain interface: it waits for percpu_charge_mutex
 * and asks drain_all_stock() to block until the work is done.
 */
static void drain_all_stock_sync(struct mem_cgroup *root_memcg)
{
	/* called when force_empty is called */
	mutex_lock(&percpu_charge_mutex);
	drain_all_stock(root_memcg, true);
	mutex_unlock(&percpu_charge_mutex);
}
/*
* This function drains percpu counter value from DEAD cpu and
* move it to local cpu. Note that this function can be preempted.
*/
static void mem_cgroup_drain_pcp_counter(struct mem_cgroup *memcg, int cpu)
spin_lock(&memcg->pcp_counter_lock);
for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
long x = per_cpu(memcg->stat->count[i], cpu);
per_cpu(memcg->stat->count[i], cpu) = 0;
memcg->nocpu_base.count[i] += x;
for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) {
unsigned long x = per_cpu(memcg->stat->events[i], cpu);
per_cpu(memcg->stat->events[i], cpu) = 0;
memcg->nocpu_base.events[i] += x;
spin_unlock(&memcg->pcp_counter_lock);
/*
 * CPU hotplug notifier. When a cpu has died (CPU_DEAD or CPU_DEAD_FROZEN),
 * fold its per-cpu counters into every memcg's nocpu_base and drain its
 * charge stock. All other actions are ignored.
 *
 * Always returns NOTIFY_OK.
 */
static int memcg_cpu_hotplug_callback(struct notifier_block *nb,
					unsigned long action,
					void *hcpu)
{
	int cpu = (unsigned long)hcpu;
	struct memcg_stock_pcp *stock;
	struct mem_cgroup *iter;

	/* Only a dead cpu has anything to hand back. */
	if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
		return NOTIFY_OK;

	for_each_mem_cgroup(iter)
		mem_cgroup_drain_pcp_counter(iter, cpu);

	stock = &per_cpu(memcg_stock, cpu);
	drain_stock(stock);
	return NOTIFY_OK;
}
/* Result codes of mem_cgroup_do_charge(). See __mem_cgroup_try_charge() for details */
enum {
CHARGE_OK, /* success */
CHARGE_RETRY, /* need to retry but retry is not bad */
CHARGE_NOMEM, /* we can't do more. return -ENOMEM */
CHARGE_WOULDBLOCK, /* __GFP_WAIT wasn't set and there was not enough resource */
};
/*
 * Try once to charge @nr_pages to @memcg, reclaiming if allowed.
 *
 * @memcg:      cgroup to charge.
 * @gfp_mask:   allocation flags; without __GFP_WAIT no reclaim is done,
 *              and with __GFP_NORETRY the charge fails before reclaim.
 * @nr_pages:   number of pages to charge in this attempt.
 * @min_pages:  the number of pages really needed; when @nr_pages is larger
 *              (a batched attempt) a failure just asks the caller to retry.
 * @invoke_oom: call mem_cgroup_oom() when the charge finally fails.
 *
 * Returns one of the CHARGE_* codes.
 */
static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
				unsigned int nr_pages, unsigned int min_pages,
				bool invoke_oom)
{
	unsigned long csize = nr_pages * PAGE_SIZE;
	struct mem_cgroup *mem_over_limit;
	struct res_counter *fail_res;
	unsigned long flags = 0;
	int ret;

	ret = res_counter_charge(&memcg->res, csize, &fail_res);

	if (likely(!ret)) {
		if (!do_swap_account)
			return CHARGE_OK;
		ret = res_counter_charge(&memcg->memsw, csize, &fail_res);
		if (likely(!ret))
			return CHARGE_OK;

		/* memsw failed: undo the res charge made above. */
		res_counter_uncharge(&memcg->res, csize);
		mem_over_limit = mem_cgroup_from_res_counter(fail_res, memsw);
		flags |= MEM_CGROUP_RECLAIM_NOSWAP;
	} else
		mem_over_limit = mem_cgroup_from_res_counter(fail_res, res);
	/*
	 * Never reclaim on behalf of optional batching, retry with a
	 * single page instead.
	 */
	if (nr_pages > min_pages)
		return CHARGE_RETRY;

	if (!(gfp_mask & __GFP_WAIT))
		return CHARGE_WOULDBLOCK;

	if (gfp_mask & __GFP_NORETRY)
		return CHARGE_NOMEM;

	ret = mem_cgroup_reclaim(mem_over_limit, gfp_mask, flags);
	if (mem_cgroup_margin(mem_over_limit) >= nr_pages)
		return CHARGE_RETRY;
	/*
	 * Even though the limit is exceeded at this point, reclaim
	 * may have been able to free some pages.  Retry the charge
	 * before killing the task.
	 *
	 * Only for regular pages, though: huge pages are rather
	 * unlikely to succeed so close to the limit, and we fall back
	 * to regular pages anyway in case of failure.
	 */
	if (nr_pages <= (1 << PAGE_ALLOC_COSTLY_ORDER) && ret)
		return CHARGE_RETRY;

	/*
	 * At task move, charge accounts can be doubly counted. So, it's
	 * better to wait until the end of task_move if something is going on.
	 */
	if (mem_cgroup_wait_acct_move(mem_over_limit))
		return CHARGE_RETRY;

	if (invoke_oom)
		mem_cgroup_oom(mem_over_limit, gfp_mask, get_order(csize));

	return CHARGE_NOMEM;
}
* __mem_cgroup_try_charge() does
* 1. detect memcg to be charged against from passed *mm and *ptr,
* 2. update res_counter
* 3. call memory reclaim if necessary.
*
* In some special case, if the task is fatal, fatal_signal_pending() or
* has TIF_MEMDIE, this function returns -EINTR while writing root_mem_cgroup
* to *ptr. There are two reasons for this. 1: fatal threads should quit as soon
* as possible without any hazards. 2: all pages should have a valid
* pc->mem_cgroup. If mm is NULL and the caller doesn't pass a valid memcg
* pointer, that is treated as a charge to root_mem_cgroup.
*
* So __mem_cgroup_try_charge() will return
* 0 ... on success, filling *ptr with a valid memcg pointer.
* -ENOMEM ... charge failure because of resource limits.
* -EINTR ... if thread is fatal. *ptr is filled with root_mem_cgroup.
*
* Unlike the exported interface, an "oom" parameter is added. if oom==true,
* the oom-killer can be invoked.
static int __mem_cgroup_try_charge(struct mm_struct *mm,
unsigned int nr_pages,
unsigned int batch = max(CHARGE_BATCH, nr_pages);
int nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES;
/*
* Unlike gloval-vm's OOM-kill, we're not in memory shortage
* in system level. So, allow to go ahead dying process in addition to
* MEMDIE process.
*/
if (unlikely(test_thread_flag(TIF_MEMDIE)
|| fatal_signal_pending(current)))
goto bypass;
* We always charge the cgroup the mm_struct belongs to.
* The mm_struct's mem_cgroup changes on task migration if the
* thread group leader migrates. It's possible that mm is not
* set, if so charge the root memcg (happens for pagecache usage).
if (*ptr) { /* css should be a valid one */
memcg = *ptr;
if (mem_cgroup_is_root(memcg))
if (consume_stock(memcg, nr_pages))
rcu_read_lock();
p = rcu_dereference(mm->owner);
/*
* Because we don't have task_lock(), "p" can exit.
* In that case, "memcg" can point to root or p can be NULL with
* race with swapoff. Then, we have small risk of mis-accouning.
* But such kind of mis-account by race always happens because
* we don't have cgroup_mutex(). It's overkill and we allo that
* small race, here.
* (*) swapoff at el will charge against mm-struct not against
* task-struct. So, mm->owner can be NULL.
if (!memcg)
memcg = root_mem_cgroup;
if (mem_cgroup_is_root(memcg)) {
if (consume_stock(memcg, nr_pages)) {
/*
* It seems dagerous to access memcg without css_get().
* But considering how consume_stok works, it's not
* necessary. If consume_stock success, some charges
* from this memcg are cached on this cpu. So, we
* don't need to call css_get()/css_tryget() before
* calling consume_stock().
*/
rcu_read_unlock();
goto done;
}
/* after here, we may be blocked. we need to get refcnt */
rcu_read_unlock();
goto again;
}
rcu_read_unlock();
}
bool invoke_oom = oom && !nr_oom_retries;
/* If killed, bypass charge */
ret = mem_cgroup_do_charge(memcg, gfp_mask, batch,
nr_pages, invoke_oom);
switch (ret) {
case CHARGE_OK:
break;
case CHARGE_RETRY: /* not in OOM situation but retry */
batch = nr_pages;
css_put(&memcg->css);
memcg = NULL;
case CHARGE_WOULDBLOCK: /* !__GFP_WAIT */
goto nomem;
case CHARGE_NOMEM: /* OOM routine works */
if (!oom || invoke_oom) {
} while (ret != CHARGE_OK);
if (batch > nr_pages)
refill_stock(memcg, batch - nr_pages);
css_put(&memcg->css);
return 0;
nomem:
return -ENOMEM;
*ptr = root_mem_cgroup;
return -EINTR;
/*
* Somemtimes we have to undo a charge we got by try_charge().
* This function is for that and do uncharge, put css's refcnt.
* gotten by try_charge().
*/
static void __mem_cgroup_cancel_charge(struct mem_cgroup *memcg,
unsigned int nr_pages)
if (!mem_cgroup_is_root(memcg)) {
unsigned long bytes = nr_pages * PAGE_SIZE;
res_counter_uncharge(&memcg->res, bytes);
res_counter_uncharge(&memcg->memsw, bytes);
/*
 * Cancel charges in this cgroup only; the uncharge stops at the parent
 * counter and is not propagated further up.
 * This is useful when moving usage to the parent cgroup.
 */
static void __mem_cgroup_cancel_local_charge(struct mem_cgroup *memcg,
					unsigned int nr_pages)
{
	unsigned long nr_bytes;

	/* The root cgroup is never uncharged here. */
	if (mem_cgroup_is_root(memcg))
		return;

	nr_bytes = nr_pages * PAGE_SIZE;
	res_counter_uncharge_until(&memcg->res, memcg->res.parent, nr_bytes);

	if (!do_swap_account)
		return;

	res_counter_uncharge_until(&memcg->memsw,
				   memcg->memsw.parent, nr_bytes);
}
/*
 * A helper function to get mem_cgroup from ID. must be called under
 * rcu_read_lock().  The caller is responsible for calling css_tryget if
 * the mem_cgroup is used for charging. (dropping refcnt from swap can be
 * called against removed memcg.)
 *
 * Returns NULL for ID 0 or when no css is found for @id.
 */
static struct mem_cgroup *mem_cgroup_lookup(unsigned short id)
{
	struct cgroup_subsys_state *css;

	/* ID 0 is unused ID */
	if (!id)
		return NULL;
	css = css_lookup(&mem_cgroup_subsys, id);
	if (!css)
		return NULL;
	return mem_cgroup_from_css(css);
}
struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
unsigned short id;
VM_BUG_ON(!PageLocked(page));
pc = lookup_page_cgroup(page);
lock_page_cgroup(pc);
if (PageCgroupUsed(pc)) {
memcg = pc->mem_cgroup;
if (memcg && !css_tryget(&memcg->css))
memcg = NULL;
} else if (PageSwapCache(page)) {
ent.val = page_private(page);
id = lookup_swap_cgroup_id(ent);
rcu_read_lock();
memcg = mem_cgroup_lookup(id);
if (memcg && !css_tryget(&memcg->css))
memcg = NULL;
rcu_read_unlock();
unlock_page_cgroup(pc);
static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,
unsigned int nr_pages,
enum charge_type ctype,
bool lrucare)
struct page_cgroup *pc = lookup_page_cgroup(page);
struct zone *uninitialized_var(zone);
bool was_on_lru = false;
lock_page_cgroup(pc);
VM_BUG_ON(PageCgroupUsed(pc));
/*
* we don't need page_cgroup_lock about tail pages, becase they are not
* accessed by any other context at this point.
*/
/*
* In some cases, SwapCache and FUSE(splice_buf->radixtree), the page
* may already be on some other mem_cgroup's LRU. Take care of it.
*/
if (lrucare) {
zone = page_zone(page);
spin_lock_irq(&zone->lru_lock);
if (PageLRU(page)) {
lruvec = mem_cgroup_zone_lruvec(zone, pc->mem_cgroup);
del_page_from_lru_list(page, lruvec, page_lru(page));
was_on_lru = true;
}
}
/*
* We access a page_cgroup asynchronously without lock_page_cgroup().
* Especially when a page_cgroup is taken from a page, pc->mem_cgroup
* is accessed after testing USED bit. To make pc->mem_cgroup visible
* before USED bit, we need memory barrier here.
* See mem_cgroup_add_lru_list(), etc.
if (lrucare) {
if (was_on_lru) {
lruvec = mem_cgroup_zone_lruvec(zone, pc->mem_cgroup);
VM_BUG_ON(PageLRU(page));
SetPageLRU(page);
add_page_to_lru_list(page, lruvec, page_lru(page));
}
spin_unlock_irq(&zone->lru_lock);
}

Kamezawa Hiroyuki
committed
if (ctype == MEM_CGROUP_CHARGE_TYPE_ANON)
anon = true;
else
anon = false;
mem_cgroup_charge_statistics(memcg, page, anon, nr_pages);
* "charge_statistics" updated event counter. Then, check it.
* Insert ancestor (and ancestor's ancestors), to softlimit RB-tree.
* if they exceeds softlimit.
static DEFINE_MUTEX(set_limit_mutex);
#ifdef CONFIG_MEMCG_KMEM
static inline bool memcg_can_account_kmem(struct mem_cgroup *memcg)
{
return !mem_cgroup_disabled() && !mem_cgroup_is_root(memcg) &&
(memcg->kmem_account_flags & KMEM_ACCOUNTED_MASK);
}
/*
 * Map per-memcg cache parameters back to their kmem_cache by going through
 * the root cache's memcg_caches[] array. A little roundabout, but it is
 * rarely used and saves a backpointer in struct memcg_cache_params.
 *
 * Must not be called with the parameters of a root cache.
 */
static struct kmem_cache *memcg_params_to_cache(struct memcg_cache_params *p)
{
	struct kmem_cache *root_cache;

	VM_BUG_ON(p->is_root_cache);

	root_cache = p->root_cache;
	return root_cache->memcg_params->memcg_caches[memcg_cache_id(p->memcg)];
}
static int mem_cgroup_slabinfo_read(struct cgroup_subsys_state *css,
struct cftype *cft, struct seq_file *m)
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
struct memcg_cache_params *params;
if (!memcg_can_account_kmem(memcg))
return -EIO;
print_slabinfo_header(m);
mutex_lock(&memcg->slab_caches_mutex);
list_for_each_entry(params, &memcg->memcg_slab_caches, list)
cache_show(memcg_params_to_cache(params), m);
mutex_unlock(&memcg->slab_caches_mutex);