Newer
Older
* No LRU may hold pages because all pages are UNEVICTABLE or
* memcg is too small and all pages are not on LRU. In that case,
* we use the current node.
*/
if (unlikely(node == MAX_NUMNODES))
node = numa_node_id();
return node;
}
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
/*
 * Tell whether any node still holds reclaimable pages for @memcg.
 *
 * memcg->scan_nodes is consulted first so that unused nodes can be skipped
 * cheaply.  That mask is updated lazily and may not contain enough recent
 * information, so every remaining node with memory is checked afterwards.
 *
 * @noswap is handed through unchanged to test_mem_cgroup_node_reclaimable().
 *
 * NOTE(review): a second definition with the same name follows in this
 * file; presumably the two are alternatives selected by a preprocessor
 * conditional that is not visible here -- confirm.
 */
static bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap)
{
	int node;

	/* Fast pass: only the nodes recorded in scan_nodes. */
	if (!nodes_empty(memcg->scan_nodes)) {
		node = first_node(memcg->scan_nodes);
		while (node < MAX_NUMNODES) {
			if (test_mem_cgroup_node_reclaimable(memcg, node, noswap))
				return true;
			node = next_node(node, memcg->scan_nodes);
		}
	}

	/* Double check: the nodes with memory that the fast pass skipped. */
	for_each_node_state(node, N_MEMORY) {
		if (!node_isset(node, memcg->scan_nodes) &&
		    test_mem_cgroup_node_reclaimable(memcg, node, noswap))
			return true;
	}

	return false;
}
/*
 * Trivial victim-node selection: node 0 is always chosen and @memcg is
 * not examined.
 *
 * NOTE(review): presumably the variant used when only one node exists; the
 * preprocessor conditional selecting it is not visible here -- confirm.
 */
int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
{
	return 0;
}
/*
 * Tell whether @memcg has reclaimable pages by testing node 0 only.
 * @noswap is handed through unchanged to the per-node test.
 *
 * NOTE(review): presumably the single-node counterpart of the scanning
 * variant defined earlier in this file -- confirm.
 */
static bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap)
{
	const int only_node = 0;

	return test_mem_cgroup_node_reclaimable(memcg, only_node, noswap);
}

Andrew Morton
committed
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
/*
 * Reclaim from the memcgs in the hierarchy below @root_memcg, restricted
 * to @zone.
 *
 * The hierarchy is walked with mem_cgroup_iter(); every time the iterator
 * runs out of victims the pass counter is incremented.  The walk ends when
 *  - at least two passes finished and nothing was reclaimed at all,
 *  - at least two passes finished and a quarter of the initial soft-limit
 *    excess (in pages) has been reclaimed,
 *  - more than MEM_CGROUP_MAX_RECLAIM_LOOPS passes were made, or
 *  - @root_memcg no longer exceeds its soft limit.
 *
 * @gfp_mask is passed to mem_cgroup_shrink_node_zone().
 * @total_scanned is incremented by the scan count reported for each victim.
 *
 * Returns the sum of the mem_cgroup_shrink_node_zone() return values.
 */
static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg,
				   struct zone *zone,
				   gfp_t gfp_mask,
				   unsigned long *total_scanned)
{
	struct mem_cgroup *victim = NULL;
	int total = 0;
	int loop = 0;
	unsigned long excess;
	unsigned long nr_scanned;
	struct mem_cgroup_reclaim_cookie reclaim = {
		.zone = zone,
		.priority = 0,
	};

	excess = res_counter_soft_limit_excess(&root_memcg->res) >> PAGE_SHIFT;

	while (1) {
		victim = mem_cgroup_iter(root_memcg, victim, &reclaim);
		if (!victim) {
			loop++;
			if (loop >= 2) {
				/*
				 * If we have not been able to reclaim
				 * anything, it might be because there are
				 * no reclaimable pages under this hierarchy.
				 */
				if (!total)
					break;
				/*
				 * We want to do more targeted reclaim.
				 * excess >> 2 is neither so large that we
				 * reclaim too much, nor so small that we
				 * keep coming back to reclaim from this
				 * cgroup.
				 */
				if (total >= (excess >> 2) ||
					(loop > MEM_CGROUP_MAX_RECLAIM_LOOPS))
					break;
			}
			continue;
		}
		if (!mem_cgroup_reclaimable(victim, false))
			continue;
		total += mem_cgroup_shrink_node_zone(victim, gfp_mask, false,
						     zone, &nr_scanned);
		*total_scanned += nr_scanned;
		if (!res_counter_soft_limit_excess(&root_memcg->res))
			break;
	}
	/* Leaving the walk early: hand the current position back. */
	mem_cgroup_iter_break(root_memcg, victim);
	return total;
}
/*
 * Lockdep map named "memcg_oom_lock"; it is the object handed to
 * mutex_acquire()/mutex_release() by the OOM trylock/unlock helpers below.
 */
#ifdef CONFIG_LOCKDEP
static struct lockdep_map memcg_oom_lock_dep_map = {
.name = "memcg_oom_lock",
};
#endif
/* Held by the OOM trylock/unlock helpers while they update ->oom_lock. */
static DEFINE_SPINLOCK(memcg_oom_lock);
/*
* Check OOM-Killer is already running under our hierarchy.
* If someone is running, return false.
*/
static bool mem_cgroup_oom_trylock(struct mem_cgroup *memcg)
struct mem_cgroup *iter, *failed = NULL;
spin_lock(&memcg_oom_lock);
for_each_mem_cgroup_tree(iter, memcg) {
/*
* this subtree of our hierarchy is already locked
* so we cannot give a lock.
*/
failed = iter;
mem_cgroup_iter_break(memcg, iter);
break;
} else
iter->oom_lock = true;
if (failed) {
/*
* OK, we failed to lock the whole subtree so we have
* to clean up what we set up to the failing subtree
*/
for_each_mem_cgroup_tree(iter, memcg) {
if (iter == failed) {
mem_cgroup_iter_break(memcg, iter);
break;
}
iter->oom_lock = false;
} else
mutex_acquire(&memcg_oom_lock_dep_map, 0, 1, _RET_IP_);
spin_unlock(&memcg_oom_lock);
return !failed;
static void mem_cgroup_oom_unlock(struct mem_cgroup *memcg)
spin_lock(&memcg_oom_lock);
mutex_release(&memcg_oom_lock_dep_map, 1, _RET_IP_);
for_each_mem_cgroup_tree(iter, memcg)
iter->oom_lock = false;
spin_unlock(&memcg_oom_lock);
static void mem_cgroup_mark_under_oom(struct mem_cgroup *memcg)
{
struct mem_cgroup *iter;
for_each_mem_cgroup_tree(iter, memcg)
atomic_inc(&iter->under_oom);
}
/*
 * Decrement the under_oom counter of every memcg in the subtree rooted at
 * @memcg, never letting a counter drop below zero.
 */
static void mem_cgroup_unmark_under_oom(struct mem_cgroup *memcg)
{
	struct mem_cgroup *iter;

	/*
	 * When a new child is created while the hierarchy is under oom,
	 * mem_cgroup_oom_lock() may not be called. We have to use
	 * atomic_add_unless() here.
	 */
	for_each_mem_cgroup_tree(iter, memcg)
		atomic_add_unless(&iter->under_oom, -1, 0);
}
/* Wait queue on which tasks sleep while a memcg OOM is being handled. */
static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq);

/*
 * One sleeper on memcg_oom_waitq.
 * @memcg: the memcg the sleeper waits for; read by the wake function to
 *         filter wakeups.
 * @wait:  the wait queue entry; container_of() on it recovers this struct.
 */
struct oom_wait_info {
	struct mem_cgroup *memcg;
	wait_queue_t	wait;
};
/*
 * Wake callback for entries on memcg_oom_waitq.
 *
 * @arg is the memcg the wakeup is issued for.  A sleeper is only woken if
 * its memcg and the woken memcg are the same or one is in the subtree of
 * the other; otherwise 0 is returned and the entry stays queued.  A
 * matching entry is handed to autoremove_wake_function().
 */
static int memcg_oom_wake_function(wait_queue_t *wait,
	unsigned mode, int sync, void *arg)
{
	struct mem_cgroup *wake_memcg = arg;
	struct mem_cgroup *oom_wait_memcg;
	struct oom_wait_info *oom_wait_info;

	oom_wait_info = container_of(wait, struct oom_wait_info, wait);
	oom_wait_memcg = oom_wait_info->memcg;

	/*
	 * Both of oom_wait_info->memcg and wake_memcg are stable under us.
	 * Then we can do the ancestry checks below without taking care of
	 * RCU.
	 */
	if (!mem_cgroup_same_or_subtree(oom_wait_memcg, wake_memcg)
		&& !mem_cgroup_same_or_subtree(wake_memcg, oom_wait_memcg))
		return 0;
	return autoremove_wake_function(wait, mode, sync, arg);
}
static void memcg_wakeup_oom(struct mem_cgroup *memcg)
atomic_inc(&memcg->oom_wakeups);
/* for filtering, pass "memcg" as argument. */
__wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg);
static void memcg_oom_recover(struct mem_cgroup *memcg)
if (memcg && atomic_read(&memcg->under_oom))
memcg_wakeup_oom(memcg);
/*
 * Record an OOM situation of @memcg in the current task so that it can be
 * handled later by mem_cgroup_oom_synchronize().
 *
 * Does nothing unless current->memcg_oom.may_oom is set.  Otherwise a css
 * reference on @memcg is taken and @memcg, @mask and @order are stored in
 * current->memcg_oom.
 */
static void mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order)
{
	if (!current->memcg_oom.may_oom)
		return;
	/*
	 * We are in the middle of the charge context here, so we
	 * don't want to block when potentially sitting on a callstack
	 * that holds all kinds of filesystem and mm locks.
	 *
	 * Also, the caller may handle a failed allocation gracefully
	 * (like optional page cache readahead) and so an OOM killer
	 * invocation might not even be necessary.
	 *
	 * That's why we don't do anything here except remember the
	 * OOM context and then deal with it at the end of the page
	 * fault when the stack is unwound, the locks are released,
	 * and when we know whether the fault was overall successful.
	 */
	css_get(&memcg->css);
	current->memcg_oom.memcg = memcg;
	current->memcg_oom.gfp_mask = mask;
	current->memcg_oom.order = order;
}
/**
* mem_cgroup_oom_synchronize - complete memcg OOM handling
* @handle: actually kill/wait or just clean up the OOM state
* This has to be called at the end of a page fault if the memcg OOM
* handler was enabled.
* Memcg supports userspace OOM handling where failed allocations must
* sleep on a waitqueue until the userspace task resolves the
* situation. Sleeping directly in the charge context with all kinds
* of locks held is not a good idea, instead we remember an OOM state
* in the task and mem_cgroup_oom_synchronize() has to be called at
* the end of the page fault to complete the OOM handling.
*
* Returns %true if an ongoing memcg OOM situation was detected and
* completed, %false otherwise.
bool mem_cgroup_oom_synchronize(bool handle)
struct mem_cgroup *memcg = current->memcg_oom.memcg;
struct oom_wait_info owait;
bool locked;
/* OOM is global, do not handle */
if (!memcg)
return false;
if (!handle)
goto cleanup;
owait.memcg = memcg;
owait.wait.flags = 0;
owait.wait.func = memcg_oom_wake_function;
owait.wait.private = current;
INIT_LIST_HEAD(&owait.wait.task_list);
prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE);
mem_cgroup_mark_under_oom(memcg);
locked = mem_cgroup_oom_trylock(memcg);
if (locked)
mem_cgroup_oom_notify(memcg);
if (locked && !memcg->oom_kill_disable) {
mem_cgroup_unmark_under_oom(memcg);
finish_wait(&memcg_oom_waitq, &owait.wait);
mem_cgroup_out_of_memory(memcg, current->memcg_oom.gfp_mask,
current->memcg_oom.order);
} else {
schedule();
mem_cgroup_unmark_under_oom(memcg);
finish_wait(&memcg_oom_waitq, &owait.wait);
}
if (locked) {
mem_cgroup_oom_unlock(memcg);
/*
* There is no guarantee that an OOM-lock contender
* sees the wakeups triggered by the OOM kill
* uncharges. Wake any sleepers explicitely.
*/
memcg_oom_recover(memcg);
}
cleanup:
current->memcg_oom.memcg = NULL;
css_put(&memcg->css);
/*
* Currently used to update mapped file statistics, but the routine can be
* generalized to update other statistics as well.

KAMEZAWA Hiroyuki
committed
*
* Notes: Race condition
*
* We usually use page_cgroup_lock() for accessing page_cgroup member but
* it tends to be costly. But considering some conditions, we doesn't need
* to do so _always_.
*
* Considering "charge", lock_page_cgroup() is not required because all
* file-stat operations happen after a page is attached to radix-tree. There
* are no race with "charge".
*
* Considering "uncharge", we know that memcg doesn't clear pc->mem_cgroup
* at "uncharge" intentionally. So, we always see valid pc->mem_cgroup even
* if there are race with "uncharge". Statistics itself is properly handled
* by flags.
*
* Considering "move", this is an only case we see a race. To make the race
* small, we check mm->moving_account and detect there are possibility of race
* If there is, we take a lock.
void __mem_cgroup_begin_update_page_stat(struct page *page,
bool *locked, unsigned long *flags)
{
struct mem_cgroup *memcg;
struct page_cgroup *pc;
pc = lookup_page_cgroup(page);
again:
memcg = pc->mem_cgroup;
if (unlikely(!memcg || !PageCgroupUsed(pc)))
return;
/*
* If this memory cgroup is not under account moving, we don't

Wanpeng Li
committed
* need to take move_lock_mem_cgroup(). Because we already hold
* rcu_read_lock(), any calls to move_account will be delayed until
* rcu_read_unlock() if mem_cgroup_stolen() == true.
return;
move_lock_mem_cgroup(memcg, flags);
if (memcg != pc->mem_cgroup || !PageCgroupUsed(pc)) {
move_unlock_mem_cgroup(memcg, flags);
goto again;
}
*locked = true;
}
/*
 * Release the move lock of the memcg that @page is currently charged to,
 * restoring the state saved in @flags.
 */
void __mem_cgroup_end_update_page_stat(struct page *page, unsigned long *flags)
{
	struct page_cgroup *page_cg = lookup_page_cgroup(page);
	struct mem_cgroup *owner;

	/*
	 * pc->mem_cgroup is guaranteed not to change while the lock is held,
	 * because any routine that modifies pc->mem_cgroup has to take
	 * move_lock_mem_cgroup() first.
	 */
	owner = page_cg->mem_cgroup;
	move_unlock_mem_cgroup(owner, flags);
}
void mem_cgroup_update_page_stat(struct page *page,
enum mem_cgroup_stat_index idx, int val)

KAMEZAWA Hiroyuki
committed
struct page_cgroup *pc = lookup_page_cgroup(page);
unsigned long uninitialized_var(flags);
if (mem_cgroup_disabled())
VM_BUG_ON(!rcu_read_lock_held());
memcg = pc->mem_cgroup;
if (unlikely(!memcg || !PageCgroupUsed(pc)))
this_cpu_add(memcg->stat->count[idx], val);
/*
* size of first charge trial. "32" comes from vmscan.c's magic value.
* TODO: maybe necessary to use big numbers in big irons.
*/
#define CHARGE_BATCH 32U
/*
 * Per-cpu stock of pages that are already charged to one memcg, so that
 * small charges can be served without touching the res_counter.
 */
struct memcg_stock_pcp {
struct mem_cgroup *cached; /* memcg owning the stock; this is never the root cgroup */
unsigned int nr_pages; /* pages currently available in the stock */
struct work_struct work; /* runs drain_local_stock() on the owning cpu */
unsigned long flags; /* bit field, see FLUSHING_CACHED_CHARGE */
/* Bit number in ->flags: set when a drain is started, cleared once drained. */
#define FLUSHING_CACHED_CHARGE 0
};
static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock);
static DEFINE_MUTEX(percpu_charge_mutex);
/**
* consume_stock: Try to consume stocked charge on this cpu.
* @memcg: memcg to consume from.
* @nr_pages: how many pages to charge.
*
* The charges will only happen if @memcg matches the current cpu's memcg
* stock, and at least @nr_pages are available in that stock. Failure to
* service an allocation will refill the stock.
*
* returns true if successful, false otherwise.
static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
{
struct memcg_stock_pcp *stock;
bool ret = true;
if (nr_pages > CHARGE_BATCH)
return false;
stock = &get_cpu_var(memcg_stock);
if (memcg == stock->cached && stock->nr_pages >= nr_pages)
stock->nr_pages -= nr_pages;
else /* need to call res_counter_charge */
ret = false;
put_cpu_var(memcg_stock);
return ret;
}
/*
* Returns stocks cached in percpu to res_counter and reset cached information.
*/
static void drain_stock(struct memcg_stock_pcp *stock)
{
struct mem_cgroup *old = stock->cached;
if (stock->nr_pages) {
unsigned long bytes = stock->nr_pages * PAGE_SIZE;
res_counter_uncharge(&old->res, bytes);
res_counter_uncharge(&old->memsw, bytes);
stock->nr_pages = 0;
}
stock->cached = NULL;
}
/*
 * Drain the stock of the cpu this runs on and clear its
 * FLUSHING_CACHED_CHARGE bit.  @dummy is not used.
 *
 * This must be called under preempt disabled or must be called by
 * a thread which is pinned to local cpu.
 */
static void drain_local_stock(struct work_struct *dummy)
{
	struct memcg_stock_pcp *stock = &__get_cpu_var(memcg_stock);

	drain_stock(stock);
	clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags);
}
/*
 * Set up the work item of every possible cpu's stock so that it runs
 * drain_local_stock().
 */
static void __init memcg_stock_init(void)
{
	int i;

	for_each_possible_cpu(i) {
		struct memcg_stock_pcp *pcp = &per_cpu(memcg_stock, i);

		INIT_WORK(&pcp->work, drain_local_stock);
	}
}
/*
* Cache charges(val) which is from res_counter, to local per_cpu area.
* This will be consumed by consume_stock() function, later.
static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
{
struct memcg_stock_pcp *stock = &get_cpu_var(memcg_stock);
if (stock->cached != memcg) { /* reset if necessary */
stock->nr_pages += nr_pages;
put_cpu_var(memcg_stock);
}
/*
* Drains all per-CPU charge caches for given root_memcg resp. subtree
* of the hierarchy under it. sync flag says whether we should block
* until the work is done.
static void drain_all_stock(struct mem_cgroup *root_memcg, bool sync)
int cpu, curcpu;
/* Notify other cpus that system-wide "drain" is running */
get_online_cpus();
curcpu = get_cpu();
for_each_online_cpu(cpu) {
struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
memcg = stock->cached;
if (!memcg || !stock->nr_pages)
if (!mem_cgroup_same_or_subtree(root_memcg, memcg))
if (!test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) {
if (cpu == curcpu)
drain_local_stock(&stock->work);
else
schedule_work_on(cpu, &stock->work);
}
if (!sync)
goto out;
for_each_online_cpu(cpu) {
struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
if (test_bit(FLUSHING_CACHED_CHARGE, &stock->flags))
flush_work(&stock->work);
}
out:
}
/*
* Tries to drain stocked charges in other cpus. This function is asynchronous
* and just put a work per cpu for draining localy on each cpu. Caller can
* expects some charges will be back to res_counter later but cannot wait for
* it.
*/
static void drain_all_stock_async(struct mem_cgroup *root_memcg)
/*
* If someone calls draining, avoid adding more kworker runs.
*/
if (!mutex_trylock(&percpu_charge_mutex))
return;
drain_all_stock(root_memcg, false);
mutex_unlock(&percpu_charge_mutex);
}
/*
 * This is a synchronous drain interface: it takes percpu_charge_mutex
 * (blocking if necessary) and waits for the scheduled drain works.
 */
static void drain_all_stock_sync(struct mem_cgroup *root_memcg)
{
	/* called when force_empty is called */
	mutex_lock(&percpu_charge_mutex);
	drain_all_stock(root_memcg, true);
	mutex_unlock(&percpu_charge_mutex);
}
/*
* This function drains percpu counter value from DEAD cpu and
* move it to local cpu. Note that this function can be preempted.
*/
static void mem_cgroup_drain_pcp_counter(struct mem_cgroup *memcg, int cpu)
spin_lock(&memcg->pcp_counter_lock);
for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
long x = per_cpu(memcg->stat->count[i], cpu);
per_cpu(memcg->stat->count[i], cpu) = 0;
memcg->nocpu_base.count[i] += x;
for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) {
unsigned long x = per_cpu(memcg->stat->events[i], cpu);
per_cpu(memcg->stat->events[i], cpu) = 0;
memcg->nocpu_base.events[i] += x;
spin_unlock(&memcg->pcp_counter_lock);
/*
 * CPU hotplug notifier callback.
 *
 * When a cpu has died (CPU_DEAD or CPU_DEAD_FROZEN) its per-cpu counters
 * are folded into every memcg's base counters and its charge stock is
 * drained.  All other actions are ignored.  @nb is unused; @hcpu carries
 * the cpu number.
 *
 * Always returns NOTIFY_OK.
 */
static int memcg_cpu_hotplug_callback(struct notifier_block *nb,
					unsigned long action,
					void *hcpu)
{
	int cpu = (unsigned long)hcpu;
	struct memcg_stock_pcp *stock;
	struct mem_cgroup *iter;

	/* Only a dead cpu leaves counters and stock behind to collect. */
	if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
		return NOTIFY_OK;

	for_each_mem_cgroup(iter)
		mem_cgroup_drain_pcp_counter(iter, cpu);

	stock = &per_cpu(memcg_stock, cpu);
	drain_stock(stock);
	return NOTIFY_OK;
}
/*
 * Result codes returned by mem_cgroup_do_charge().
 * See __mem_cgroup_try_charge() for details.
 */
enum {
CHARGE_OK, /* success */
CHARGE_RETRY, /* need to retry but retry is not bad */
CHARGE_NOMEM, /* we can't do more. return -ENOMEM */
CHARGE_WOULDBLOCK, /* __GFP_WAIT wasn't set and there are not enough resources. */
};
/*
 * Try once to charge @nr_pages pages to @memcg.
 *
 * The res counter is charged first and, when swap accounting is enabled,
 * the memsw counter as well; if memsw fails the res charge is undone.
 * On failure the memcg owning the counter that hit its limit is
 * reclaimed from, unless one of the early exits below applies.
 *
 * @gfp_mask:   allocation flags; without __GFP_WAIT no reclaim is tried,
 *              with __GFP_NORETRY the charge fails without reclaim.
 * @min_pages:  if @nr_pages is larger than this, the request is an
 *              optional batch and CHARGE_RETRY is returned without
 *              reclaiming.
 * @invoke_oom: when true and nothing else helped, the OOM context is
 *              recorded with mem_cgroup_oom() before failing.
 *
 * Returns one of the CHARGE_* codes.
 */
static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
				unsigned int nr_pages, unsigned int min_pages,
				bool invoke_oom)
{
	unsigned long csize = nr_pages * PAGE_SIZE;
	struct mem_cgroup *mem_over_limit;
	struct res_counter *fail_res;
	unsigned long flags = 0;
	int ret;

	ret = res_counter_charge(&memcg->res, csize, &fail_res);

	if (likely(!ret)) {
		if (!do_swap_account)
			return CHARGE_OK;
		ret = res_counter_charge(&memcg->memsw, csize, &fail_res);
		if (likely(!ret))
			return CHARGE_OK;

		/* memsw is over its limit: undo the res charge. */
		res_counter_uncharge(&memcg->res, csize);
		mem_over_limit = mem_cgroup_from_res_counter(fail_res, memsw);
		flags |= MEM_CGROUP_RECLAIM_NOSWAP;
	} else
		mem_over_limit = mem_cgroup_from_res_counter(fail_res, res);
	/*
	 * Never reclaim on behalf of optional batching, retry with a
	 * single page instead.
	 */
	if (nr_pages > min_pages)
		return CHARGE_RETRY;

	if (!(gfp_mask & __GFP_WAIT))
		return CHARGE_WOULDBLOCK;

	if (gfp_mask & __GFP_NORETRY)
		return CHARGE_NOMEM;

	ret = mem_cgroup_reclaim(mem_over_limit, gfp_mask, flags);
	if (mem_cgroup_margin(mem_over_limit) >= nr_pages)
		return CHARGE_RETRY;
	/*
	 * Even though the limit is exceeded at this point, reclaim
	 * may have been able to free some pages.  Retry the charge
	 * before killing the task.
	 *
	 * Only for regular pages, though: huge pages are rather
	 * unlikely to succeed so close to the limit, and we fall back
	 * to regular pages anyway in case of failure.
	 */
	if (nr_pages <= (1 << PAGE_ALLOC_COSTLY_ORDER) && ret)
		return CHARGE_RETRY;

	/*
	 * At task move, charge accounts can be doubly counted. So, it's
	 * better to wait until the end of task_move if something is going on.
	 */
	if (mem_cgroup_wait_acct_move(mem_over_limit))
		return CHARGE_RETRY;

	if (invoke_oom)
		mem_cgroup_oom(mem_over_limit, gfp_mask, get_order(csize));

	return CHARGE_NOMEM;
}
/*
 * __mem_cgroup_try_charge() does
 * 1. detect memcg to be charged against from passed *mm and *ptr,
 * 2. update res_counter
 * 3. call memory reclaim if necessary.
 *
 * In some special cases, if the task is fatal (fatal_signal_pending()) or
 * has TIF_MEMDIE, this function returns -EINTR while writing root_mem_cgroup
 * to *ptr. There are two reasons for this. 1: fatal threads should quit as soon
 * as possible without any hazards. 2: all pages should have a valid
 * pc->mem_cgroup. If mm is NULL and the caller doesn't pass a valid memcg
 * pointer, that is treated as a charge to root_mem_cgroup.
 *
 * So __mem_cgroup_try_charge() will return
 * 0 ... on success, filling *ptr with a valid memcg pointer.
 * -ENOMEM ... charge failure because of resource limits.
 * -EINTR ... if thread is fatal. *ptr is filled with root_mem_cgroup.
 *
 * Unlike the exported interface, an "oom" parameter is added. If oom==true,
 * the oom-killer can be invoked.
 */
static int __mem_cgroup_try_charge(struct mm_struct *mm,
unsigned int nr_pages,
unsigned int batch = max(CHARGE_BATCH, nr_pages);
int nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES;
/*
* Unlike gloval-vm's OOM-kill, we're not in memory shortage
* in system level. So, allow to go ahead dying process in addition to
* MEMDIE process.
*/
if (unlikely(test_thread_flag(TIF_MEMDIE)
|| fatal_signal_pending(current)))
goto bypass;
if (unlikely(task_in_memcg_oom(current)))
goto nomem;
if (gfp_mask & __GFP_NOFAIL)
oom = false;
* We always charge the cgroup the mm_struct belongs to.
* The mm_struct's mem_cgroup changes on task migration if the
* thread group leader migrates. It's possible that mm is not
* set, if so charge the root memcg (happens for pagecache usage).
if (*ptr) { /* css should be a valid one */
memcg = *ptr;
if (mem_cgroup_is_root(memcg))
if (consume_stock(memcg, nr_pages))
rcu_read_lock();
p = rcu_dereference(mm->owner);
/*
* Because we don't have task_lock(), "p" can exit.
* In that case, "memcg" can point to root or p can be NULL with
* race with swapoff. Then, we have small risk of mis-accouning.
* But such kind of mis-account by race always happens because
* we don't have cgroup_mutex(). It's overkill and we allo that
* small race, here.
* (*) swapoff at el will charge against mm-struct not against
* task-struct. So, mm->owner can be NULL.
if (!memcg)
memcg = root_mem_cgroup;
if (mem_cgroup_is_root(memcg)) {
if (consume_stock(memcg, nr_pages)) {
/*
* It seems dagerous to access memcg without css_get().
* But considering how consume_stok works, it's not
* necessary. If consume_stock success, some charges
* from this memcg are cached on this cpu. So, we
* don't need to call css_get()/css_tryget() before
* calling consume_stock().
*/
rcu_read_unlock();
goto done;
}
/* after here, we may be blocked. we need to get refcnt */
rcu_read_unlock();
goto again;
}
rcu_read_unlock();
}
bool invoke_oom = oom && !nr_oom_retries;
/* If killed, bypass charge */
ret = mem_cgroup_do_charge(memcg, gfp_mask, batch,
nr_pages, invoke_oom);
switch (ret) {
case CHARGE_OK:
break;
case CHARGE_RETRY: /* not in OOM situation but retry */
batch = nr_pages;
css_put(&memcg->css);
memcg = NULL;
case CHARGE_WOULDBLOCK: /* !__GFP_WAIT */
goto nomem;
case CHARGE_NOMEM: /* OOM routine works */
if (!oom || invoke_oom) {
} while (ret != CHARGE_OK);
if (batch > nr_pages)
refill_stock(memcg, batch - nr_pages);
css_put(&memcg->css);
return 0;
nomem:
if (!(gfp_mask & __GFP_NOFAIL)) {
*ptr = NULL;
return -ENOMEM;
}
*ptr = root_mem_cgroup;
return -EINTR;
/*
* Somemtimes we have to undo a charge we got by try_charge().
* This function is for that and do uncharge, put css's refcnt.
* gotten by try_charge().
*/
static void __mem_cgroup_cancel_charge(struct mem_cgroup *memcg,
unsigned int nr_pages)
if (!mem_cgroup_is_root(memcg)) {
unsigned long bytes = nr_pages * PAGE_SIZE;
res_counter_uncharge(&memcg->res, bytes);
res_counter_uncharge(&memcg->memsw, bytes);
/*
 * Cancel charges in this cgroup only; the uncharge does not propagate to
 * the parent cgroup.  This is useful when moving usage to the parent
 * cgroup.  The root memcg is left untouched, and the memsw counter is
 * only handled when swap accounting is enabled.
 */
static void __mem_cgroup_cancel_local_charge(struct mem_cgroup *memcg,
					     unsigned int nr_pages)
{
	unsigned long size = nr_pages * PAGE_SIZE;

	if (!mem_cgroup_is_root(memcg)) {
		res_counter_uncharge_until(&memcg->res, memcg->res.parent,
					   size);
		if (do_swap_account) {
			res_counter_uncharge_until(&memcg->memsw,
						   memcg->memsw.parent, size);
		}
	}
}
/*
* A helper function to get mem_cgroup from ID. must be called under
* rcu_read_lock(). The caller is responsible for calling css_tryget if
* the mem_cgroup is used for charging. (dropping refcnt from swap can be
* called against removed memcg.)
*/
static struct mem_cgroup *mem_cgroup_lookup(unsigned short id)
{
/* ID 0 is unused ID */
if (!id)
return NULL;
struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
unsigned short id;
VM_BUG_ON_PAGE(!PageLocked(page), page);
pc = lookup_page_cgroup(page);
lock_page_cgroup(pc);
if (PageCgroupUsed(pc)) {
memcg = pc->mem_cgroup;
if (memcg && !css_tryget(&memcg->css))
memcg = NULL;
} else if (PageSwapCache(page)) {
ent.val = page_private(page);
id = lookup_swap_cgroup_id(ent);
rcu_read_lock();
memcg = mem_cgroup_lookup(id);
if (memcg && !css_tryget(&memcg->css))
memcg = NULL;
rcu_read_unlock();
unlock_page_cgroup(pc);
static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,
unsigned int nr_pages,
enum charge_type ctype,
bool lrucare)
struct page_cgroup *pc = lookup_page_cgroup(page);
struct zone *uninitialized_var(zone);
bool was_on_lru = false;
lock_page_cgroup(pc);
VM_BUG_ON_PAGE(PageCgroupUsed(pc), page);
/*
* we don't need page_cgroup_lock about tail pages, becase they are not
* accessed by any other context at this point.
*/
/*
* In some cases, SwapCache and FUSE(splice_buf->radixtree), the page
* may already be on some other mem_cgroup's LRU. Take care of it.
*/
if (lrucare) {
zone = page_zone(page);
spin_lock_irq(&zone->lru_lock);
if (PageLRU(page)) {
lruvec = mem_cgroup_zone_lruvec(zone, pc->mem_cgroup);
del_page_from_lru_list(page, lruvec, page_lru(page));
was_on_lru = true;
}
}
/*
* We access a page_cgroup asynchronously without lock_page_cgroup().
* Especially when a page_cgroup is taken from a page, pc->mem_cgroup
* is accessed after testing USED bit. To make pc->mem_cgroup visible
* before USED bit, we need memory barrier here.
* See mem_cgroup_add_lru_list(), etc.
if (lrucare) {
if (was_on_lru) {
lruvec = mem_cgroup_zone_lruvec(zone, pc->mem_cgroup);
VM_BUG_ON_PAGE(PageLRU(page), page);
add_page_to_lru_list(page, lruvec, page_lru(page));
}
spin_unlock_irq(&zone->lru_lock);
}

Kamezawa Hiroyuki
committed
if (ctype == MEM_CGROUP_CHARGE_TYPE_ANON)
anon = true;
else
anon = false;
mem_cgroup_charge_statistics(memcg, page, anon, nr_pages);
* "charge_statistics" updated event counter. Then, check it.
* Insert ancestor (and ancestor's ancestors), to softlimit RB-tree.
* if they exceeds softlimit.
static DEFINE_MUTEX(set_limit_mutex);