mz = mem_cgroup_zoneinfo(root, nid, zid);
iter = &mz->reclaim_iter[reclaim->priority];
if (prev && reclaim->generation != iter->generation)
goto out_unlock;
last_visited = mem_cgroup_iter_load(iter, root, &seq);
memcg = __mem_cgroup_iter_next(root, last_visited);
if (reclaim) {
mem_cgroup_iter_update(iter, last_visited, memcg, seq);
if (!memcg)
iter->generation++;
else if (!prev && memcg)
reclaim->generation = iter->generation;
}
if (prev && !memcg)
goto out_unlock;
out_unlock:
rcu_read_unlock();
out_css_put:
if (prev && prev != root)
css_put(&prev->css);
return memcg;
}
/**
* mem_cgroup_iter_break - abort a hierarchy walk prematurely
* @root: hierarchy root
* @prev: last visited hierarchy member as returned by mem_cgroup_iter()
*/
void mem_cgroup_iter_break(struct mem_cgroup *root,
struct mem_cgroup *prev)
{
if (!root)
root = root_mem_cgroup;
if (prev && prev != root)
css_put(&prev->css);
}
/*
* Iteration constructs for visiting all cgroups (under a tree). If
* loops are exited prematurely (break), mem_cgroup_iter_break() must
* be used for reference counting.
*/
#define for_each_mem_cgroup_tree(iter, root) \
for (iter = mem_cgroup_iter(root, NULL, NULL); \
iter != NULL; \
iter = mem_cgroup_iter(root, iter, NULL))
#define for_each_mem_cgroup(iter) \
for (iter = mem_cgroup_iter(NULL, NULL, NULL); \
iter != NULL; \
iter = mem_cgroup_iter(NULL, iter, NULL))
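/*
 * Illustrative sketch (not part of the original file): a walker built on the
 * macros above. The helper name and its "max" cutoff are made up for the
 * example; under_oom is the per-memcg counter used later in this file. It
 * shows the rule from the comment: a loop that is left early with break must
 * call mem_cgroup_iter_break() to drop the css reference that
 * mem_cgroup_iter() still holds on the last visited group.
 */
static int mem_cgroup_count_under_oom_sketch(struct mem_cgroup *root, int max)
{
        struct mem_cgroup *iter;
        int num = 0;

        for_each_mem_cgroup_tree(iter, root) {
                if (atomic_read(&iter->under_oom))
                        num++;
                if (num >= max) {
                        /* early exit: release the reference held on @iter */
                        mem_cgroup_iter_break(root, iter);
                        break;
                }
        }
        return num;
}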
void __mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx)
{
struct mem_cgroup *memcg;
rcu_read_lock();
memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
if (unlikely(!memcg))
goto out;
switch (idx) {
case PGFAULT:
this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGFAULT]);
break;
case PGMAJFAULT:
this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGMAJFAULT]);
break;
default:
BUG();
}
out:
rcu_read_unlock();
}
EXPORT_SYMBOL(__mem_cgroup_count_vm_event);
/**
* mem_cgroup_zone_lruvec - get the lru list vector for a zone and memcg
* @zone: zone of the wanted lruvec
* @memcg: memcg of the wanted lruvec
*
* Returns the lru list vector holding pages for the given @zone and
* @memcg. This can be the global zone lruvec, if the memory controller
* is disabled.
*/
struct lruvec *mem_cgroup_zone_lruvec(struct zone *zone,
struct mem_cgroup *memcg)
{
struct mem_cgroup_per_zone *mz;
struct lruvec *lruvec;
if (mem_cgroup_disabled()) {
lruvec = &zone->lruvec;
goto out;
}
mz = mem_cgroup_zoneinfo(memcg, zone_to_nid(zone), zone_idx(zone));
lruvec = &mz->lruvec;
out:
/*
* Since a node can be onlined after the mem_cgroup was created,
* we have to be prepared to initialize lruvec->zone here;
* and if offlined then reonlined, we need to reinitialize it.
*/
if (unlikely(lruvec->zone != zone))
lruvec->zone = zone;
return lruvec;
}
/*
* Following LRU functions are allowed to be used without PCG_LOCK.
* Operations are called by routine of global LRU independently from memcg.
* What we have to take care of here is validness of pc->mem_cgroup.
*
* Changes to pc->mem_cgroup happens when
* 1. charge
* 2. moving account
* In typical case, "charge" is done before add-to-lru. Exception is SwapCache.
* It is added to LRU before charge.
* If PCG_USED bit is not set, page_cgroup is not added to this private LRU.
* When moving account, the page is not on LRU. It's isolated.
*/
/**
* mem_cgroup_page_lruvec - return lruvec for adding an lru page
* @page: the page
* @zone: zone of the page
*/
struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct zone *zone)
{
struct mem_cgroup_per_zone *mz;
struct mem_cgroup *memcg;
struct page_cgroup *pc;
struct lruvec *lruvec;
if (mem_cgroup_disabled()) {
lruvec = &zone->lruvec;
goto out;
}
pc = lookup_page_cgroup(page);
memcg = pc->mem_cgroup;
/*
* Surreptitiously switch any uncharged offlist page to root:
* an uncharged page off lru does nothing to secure
* its former mem_cgroup from sudden removal.
*
* Our caller holds lru_lock, and PageCgroupUsed is updated
* under page_cgroup lock: between them, they make all uses
* of pc->mem_cgroup safe.
*/
if (!PageLRU(page) && !PageCgroupUsed(pc) && memcg != root_mem_cgroup)
pc->mem_cgroup = memcg = root_mem_cgroup;
mz = page_cgroup_zoneinfo(memcg, page);
lruvec = &mz->lruvec;
out:
/*
* Since a node can be onlined after the mem_cgroup was created,
* we have to be prepared to initialize lruvec->zone here;
* and if offlined then reonlined, we need to reinitialize it.
*/
if (unlikely(lruvec->zone != zone))
lruvec->zone = zone;
return lruvec;
}
/**
* mem_cgroup_update_lru_size - account for adding or removing an lru page
* @lruvec: mem_cgroup per zone lru vector
* @lru: index of lru list the page is sitting on
* @nr_pages: positive when adding or negative when removing
*
* This function must be called when a page is added to or removed from an
* lru list.
*/
void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru,
int nr_pages)
{
struct mem_cgroup_per_zone *mz;
unsigned long *lru_size;
if (mem_cgroup_disabled())
return;
mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec);
lru_size = mz->lru_size + lru;
*lru_size += nr_pages;
VM_BUG_ON((long)(*lru_size) < 0);
}
/*
* Checks whether the given memcg is the same as, or within,
* root_memcg's hierarchy subtree.
*/
bool __mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg,
struct mem_cgroup *memcg)
{
if (root_memcg == memcg)
return true;
if (!root_memcg->use_hierarchy || !memcg)
return false;
return css_is_ancestor(&memcg->css, &root_memcg->css);
}
static bool mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg,
struct mem_cgroup *memcg)
{
bool ret;
rcu_read_lock();
ret = __mem_cgroup_same_or_subtree(root_memcg, memcg);
rcu_read_unlock();
return ret;
}
bool task_in_mem_cgroup(struct task_struct *task,
const struct mem_cgroup *memcg)
{
struct mem_cgroup *curr = NULL;
bool ret;
struct task_struct *p;
p = find_lock_task_mm(task);
if (p) {
curr = try_get_mem_cgroup_from_mm(p->mm);
task_unlock(p);
} else {
/*
* All threads may have already detached their mm's, but the oom
* killer still needs to detect if they have already been oom
* killed to prevent needlessly killing additional tasks.
*/
rcu_read_lock();
curr = mem_cgroup_from_task(task);
if (curr)
css_get(&curr->css);
rcu_read_unlock();
}
if (!curr)
return false;
/*
* We should check use_hierarchy of "memcg" not "curr". Because checking
* use_hierarchy of "curr" here would make this function return true if
* hierarchy is enabled in "curr" and "curr" is a child of "memcg" in the
* *cgroup* hierarchy (even if use_hierarchy is disabled in "memcg").
*/
ret = mem_cgroup_same_or_subtree(memcg, curr);
css_put(&curr->css);
return ret;
}
int mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec)
{
unsigned long inactive_ratio;
unsigned long inactive;
unsigned long active;
unsigned long gb;
inactive = mem_cgroup_get_lru_size(lruvec, LRU_INACTIVE_ANON);
active = mem_cgroup_get_lru_size(lruvec, LRU_ACTIVE_ANON);
gb = (inactive + active) >> (30 - PAGE_SHIFT);
if (gb)
inactive_ratio = int_sqrt(10 * gb);
else
inactive_ratio = 1;
return inactive * inactive_ratio < active;
}
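/*
 * Worked example (illustrative): with roughly 100GB of anon pages on this
 * lruvec, gb == 100 and inactive_ratio == int_sqrt(1000) == 31, so the
 * inactive list is reported as low while it holds less than about 1/32 of
 * the anon pages (inactive * 31 < active). Below 1GB the ratio stays 1,
 * i.e. the target is an even split between active and inactive anon.
 */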
#define mem_cgroup_from_res_counter(counter, member) \
container_of(counter, struct mem_cgroup, member)
/**
* mem_cgroup_margin - calculate chargeable space of a memory cgroup
* @memcg: the memory cgroup
*
* Returns the maximum amount of memory @memcg can be charged with, in
* pages.
*/
static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg)
{
unsigned long long margin;
margin = res_counter_margin(&memcg->res);
if (do_swap_account)
margin = min(margin, res_counter_margin(&memcg->memsw));
return margin >> PAGE_SHIFT;
}
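/*
 * Worked example (illustrative, assuming 4K pages): with a 512MB limit and
 * 384MB of charged memory, res_counter_margin(&memcg->res) is 128MB; if
 * swap accounting is enabled and the memsw margin is only 64MB, the min()
 * picks 64MB and the function returns 64MB >> PAGE_SHIFT == 16384 pages.
 */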
int mem_cgroup_swappiness(struct mem_cgroup *memcg)
/*
* memcg->moving_account is used for checking possibility that some thread is
* calling move_account(). When a thread on CPU-A starts moving pages under
* a memcg, other threads should check memcg->moving_account under
* rcu_read_lock(), like this:
*
* CPU-A CPU-B
* rcu_read_lock()
* memcg->moving_account+1 if (memcg->moving_account)
* take heavy locks.
* synchronize_rcu() update something.
* rcu_read_unlock()
* start move here.
*/
/* for quick checking without looking up memcg */
atomic_t memcg_moving __read_mostly;
static void mem_cgroup_start_move(struct mem_cgroup *memcg)
{
atomic_inc(&memcg_moving);
atomic_inc(&memcg->moving_account);
synchronize_rcu();
}
static void mem_cgroup_end_move(struct mem_cgroup *memcg)
{
/*
* Now, mem_cgroup_clear_mc() may call this function with NULL.
* We check NULL in callee rather than caller.
*/
if (memcg) {
atomic_dec(&memcg_moving);
atomic_dec(&memcg->moving_account);
}
}
/*
* 2 routines for checking "mem" is under move_account() or not.
*
* mem_cgroup_stolen() - checking whether a cgroup is mc.from or not. This
* is used for avoiding races in accounting. If true,
* pc->mem_cgroup may be overwritten.
*
* mem_cgroup_under_move() - checking whether a cgroup is mc.from or mc.to or
* under the hierarchy of a moving cgroup. This is for
* waiting at high memory pressure caused by "move".
*/
static bool mem_cgroup_stolen(struct mem_cgroup *memcg)
{
VM_BUG_ON(!rcu_read_lock_held());
return atomic_read(&memcg->moving_account) > 0;
}
static bool mem_cgroup_under_move(struct mem_cgroup *memcg)
{
struct mem_cgroup *from;
struct mem_cgroup *to;
bool ret = false;
/*
* Unlike task_move routines, we access mc.to, mc.from not under
* mutual exclusion by cgroup_mutex. Here, we take spinlock instead.
*/
spin_lock(&mc.lock);
from = mc.from;
to = mc.to;
if (!from)
goto unlock;
ret = mem_cgroup_same_or_subtree(memcg, from)
|| mem_cgroup_same_or_subtree(memcg, to);
unlock:
spin_unlock(&mc.lock);
return ret;
}
static bool mem_cgroup_wait_acct_move(struct mem_cgroup *memcg)
{
if (mc.moving_task && current != mc.moving_task) {
if (mem_cgroup_under_move(memcg)) {
DEFINE_WAIT(wait);
prepare_to_wait(&mc.waitq, &wait, TASK_INTERRUPTIBLE);
/* moving charge context might have finished. */
if (mc.moving_task)
schedule();
finish_wait(&mc.waitq, &wait);
return true;
}
}
return false;
}
/*
* Take this lock when
* - some code tries to modify a page's memcg while the page is USED.
* - some code tries to modify page state accounting in a memcg.
*/
static void move_lock_mem_cgroup(struct mem_cgroup *memcg,
unsigned long *flags)
{
spin_lock_irqsave(&memcg->move_lock, *flags);
}
static void move_unlock_mem_cgroup(struct mem_cgroup *memcg,
unsigned long *flags)
{
spin_unlock_irqrestore(&memcg->move_lock, *flags);
}
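/*
 * Illustrative sketch (not part of the original file) of the reader side of
 * the protocol described in the moving_account comment above: a hypothetical
 * stat updater first checks, under rcu_read_lock(), whether the memcg might
 * be mid-move via mem_cgroup_stolen(), and only then pays for the heavier
 * per-memcg move_lock before touching pc->mem_cgroup based accounting. The
 * function name and the placeholder update are made up for the example.
 */
static void memcg_stat_update_sketch(struct mem_cgroup *memcg)
{
        unsigned long flags;
        bool locked = false;

        rcu_read_lock();
        if (mem_cgroup_stolen(memcg)) {
                /* a move may be rewriting pc->mem_cgroup: take the heavy lock */
                move_lock_mem_cgroup(memcg, &flags);
                locked = true;
        }

        /* ... update the per-memcg statistic here ... */

        if (locked)
                move_unlock_mem_cgroup(memcg, &flags);
        rcu_read_unlock();
}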
#define K(x) ((x) << (PAGE_SHIFT-10))
/**
* mem_cgroup_print_oom_info: Print OOM information relevant to memory controller.
* @memcg: The memory cgroup that went over limit
* @p: Task that is going to be killed
*
* NOTE: @memcg and @p's mem_cgroup can be different when hierarchy is
* enabled
*/
void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
{
struct cgroup *task_cgrp;
struct cgroup *mem_cgrp;
/*
* Need a buffer in BSS, can't rely on allocations. The code relies
* on the assumption that OOM is serialized for memory controller.
* If this assumption is broken, revisit this code.
*/
static char memcg_name[PATH_MAX];
int ret;
struct mem_cgroup *iter;
unsigned int i;
if (!p)
return;
rcu_read_lock();
mem_cgrp = memcg->css.cgroup;
task_cgrp = task_cgroup(p, mem_cgroup_subsys_id);
ret = cgroup_path(task_cgrp, memcg_name, PATH_MAX);
if (ret < 0) {
/*
* Unfortunately, we are unable to convert to a useful name
* But we'll still print out the usage information
*/
rcu_read_unlock();
goto done;
}
rcu_read_unlock();
pr_info("Task in %s killed", memcg_name);
rcu_read_lock();
ret = cgroup_path(mem_cgrp, memcg_name, PATH_MAX);
if (ret < 0) {
rcu_read_unlock();
goto done;
}
rcu_read_unlock();
/*
* Continues from above, so we don't need a KERN_ level
*/
pr_cont(" as a result of limit of %s\n", memcg_name);
pr_info("memory: usage %llukB, limit %llukB, failcnt %llu\n",
res_counter_read_u64(&memcg->res, RES_USAGE) >> 10,
res_counter_read_u64(&memcg->res, RES_LIMIT) >> 10,
res_counter_read_u64(&memcg->res, RES_FAILCNT));
pr_info("memory+swap: usage %llukB, limit %llukB, failcnt %llu\n",
res_counter_read_u64(&memcg->memsw, RES_USAGE) >> 10,
res_counter_read_u64(&memcg->memsw, RES_LIMIT) >> 10,
res_counter_read_u64(&memcg->memsw, RES_FAILCNT));
pr_info("kmem: usage %llukB, limit %llukB, failcnt %llu\n",
res_counter_read_u64(&memcg->kmem, RES_USAGE) >> 10,
res_counter_read_u64(&memcg->kmem, RES_LIMIT) >> 10,
res_counter_read_u64(&memcg->kmem, RES_FAILCNT));
for_each_mem_cgroup_tree(iter, memcg) {
pr_info("Memory cgroup stats");
rcu_read_lock();
ret = cgroup_path(iter->css.cgroup, memcg_name, PATH_MAX);
if (!ret)
pr_cont(" for %s", memcg_name);
rcu_read_unlock();
pr_cont(":");
for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account)
continue;
pr_cont(" %s:%ldKB", mem_cgroup_stat_names[i],
K(mem_cgroup_read_stat(iter, i)));
}
for (i = 0; i < NR_LRU_LISTS; i++)
pr_cont(" %s:%luKB", mem_cgroup_lru_names[i],
K(mem_cgroup_nr_lru_pages(iter, BIT(i))));
pr_cont("\n");
}
}
/*
* This function returns the number of memcg under the hierarchy tree.
* Returns 1 (self count) if no children.
*/
static int mem_cgroup_count_children(struct mem_cgroup *memcg)
{
int num = 0;
struct mem_cgroup *iter;
for_each_mem_cgroup_tree(iter, memcg)
num++;
return num;
}
/*
* Return the memory (and swap, if configured) limit for a memcg.
*/
static u64 mem_cgroup_get_limit(struct mem_cgroup *memcg)
{
u64 limit;
limit = res_counter_read_u64(&memcg->res, RES_LIMIT);
/*
* Do not consider swap space if we cannot swap due to swappiness
*/
if (mem_cgroup_swappiness(memcg)) {
u64 memsw;
limit += total_swap_pages << PAGE_SHIFT;
memsw = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
/*
* If memsw is finite and limits the amount of swap space
* available to this memcg, return that limit.
*/
limit = min(limit, memsw);
}
return limit;
}
static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
int order)
{
struct mem_cgroup *iter;
unsigned long chosen_points = 0;
unsigned long totalpages;
unsigned int points = 0;
struct task_struct *chosen = NULL;
/*
* If current has a pending SIGKILL or is exiting, then automatically
* select it. The goal is to allow it to allocate so that it may
* quickly exit and free its memory.
*/
if (fatal_signal_pending(current) || current->flags & PF_EXITING) {
set_thread_flag(TIF_MEMDIE);
return;
}
check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, order, NULL);
totalpages = mem_cgroup_get_limit(memcg) >> PAGE_SHIFT ? : 1;
for_each_mem_cgroup_tree(iter, memcg) {
struct css_task_iter it;
struct task_struct *task;
css_task_iter_start(&iter->css, &it);
while ((task = css_task_iter_next(&it))) {
switch (oom_scan_process_thread(task, totalpages, NULL,
false)) {
case OOM_SCAN_SELECT:
if (chosen)
put_task_struct(chosen);
chosen = task;
chosen_points = ULONG_MAX;
get_task_struct(chosen);
/* fall through */
case OOM_SCAN_CONTINUE:
continue;
case OOM_SCAN_ABORT:
css_task_iter_end(&it);
mem_cgroup_iter_break(memcg, iter);
if (chosen)
put_task_struct(chosen);
return;
case OOM_SCAN_OK:
break;
}
points = oom_badness(task, memcg, NULL, totalpages);
if (points > chosen_points) {
if (chosen)
put_task_struct(chosen);
chosen = task;
chosen_points = points;
get_task_struct(chosen);
}
}
css_task_iter_end(&it);
}
if (!chosen)
return;
points = chosen_points * 1000 / totalpages;
oom_kill_process(chosen, gfp_mask, order, points, totalpages, memcg,
NULL, "Memory cgroup out of memory");
}
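/*
 * Worked example (illustrative): for a memcg limited to 1GB with 4K pages,
 * totalpages == 262144; a chosen task whose oom_badness() score is 131072
 * is passed to oom_kill_process() as 131072 * 1000 / 262144 == 500, i.e.
 * the badness score is normalized to a 0-1000 range before the kill.
 */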
static unsigned long mem_cgroup_reclaim(struct mem_cgroup *memcg,
gfp_t gfp_mask,
unsigned long flags)
{
unsigned long total = 0;
bool noswap = false;
int loop;
if (flags & MEM_CGROUP_RECLAIM_NOSWAP)
noswap = true;
if (!(flags & MEM_CGROUP_RECLAIM_SHRINK) && memcg->memsw_is_minimum)
noswap = true;
for (loop = 0; loop < MEM_CGROUP_MAX_RECLAIM_LOOPS; loop++) {
if (loop)
drain_all_stock_async(memcg);
total += try_to_free_mem_cgroup_pages(memcg, gfp_mask, noswap);
/*
* Allow limit shrinkers, which are triggered directly
* by userspace, to catch signals and stop reclaim
* after minimal progress, regardless of the margin.
*/
if (total && (flags & MEM_CGROUP_RECLAIM_SHRINK))
break;
if (mem_cgroup_margin(memcg))
break;
/*
* If nothing was reclaimed after two attempts, there
* may be no reclaimable pages in this hierarchy.
*/
if (loop && !total)
break;
}
return total;
}
/**
* test_mem_cgroup_node_reclaimable
* @memcg: the target memcg
* @nid: the node ID to be checked.
* @noswap: specify true here if the user wants file only information.
*
* This function returns whether the specified memcg contains any
* reclaimable pages on a node. Returns true if there are any reclaimable
* pages in the node.
*/
static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *memcg,
int nid, bool noswap)
{
if (mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL_FILE))
return true;
if (noswap || !total_swap_pages)
return false;
if (mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL_ANON))
return true;
return false;
}
/*
* Always updating the nodemask is not very good - even if we have an empty
* list or the wrong list here, we can start from some node and traverse all
* nodes based on the zonelist. So update the list loosely once per 10 secs.
*
*/
static void mem_cgroup_may_update_nodemask(struct mem_cgroup *memcg)
{
int nid;
/*
* numainfo_events > 0 means there was at least NUMAINFO_EVENTS_TARGET
* pagein/pageout changes since the last update.
*/
if (!atomic_read(&memcg->numainfo_events))
return;
if (atomic_inc_return(&memcg->numainfo_updating) > 1)
return;
/* make a nodemask where this memcg uses memory from */
memcg->scan_nodes = node_states[N_MEMORY];
for_each_node_mask(nid, node_states[N_MEMORY]) {
if (!test_mem_cgroup_node_reclaimable(memcg, nid, false))
node_clear(nid, memcg->scan_nodes);
}
atomic_set(&memcg->numainfo_events, 0);
atomic_set(&memcg->numainfo_updating, 0);
}
/*
* Selecting a node where we start reclaim from. Because what we need is just
* reducing the usage counter, starting from anywhere is OK. Considering
* memory reclaim from current node, there are pros. and cons.
*
* Freeing memory from current node means freeing memory from a node which
* we'll use or we've used. So, it may make LRU bad. And if several threads
* hit limits, it will see a contention on a node. But freeing from remote
* node means more costs for memory reclaim because of memory latency.
*
* Now, we use round-robin. Better algorithm is welcomed.
*/
int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
{
int node;
mem_cgroup_may_update_nodemask(memcg);
node = memcg->last_scanned_node;
node = next_node(node, memcg->scan_nodes);
if (node == MAX_NUMNODES)
node = first_node(memcg->scan_nodes);
/*
* We call this when we hit limit, not when pages are added to LRU.
* No LRU may hold pages because all pages are UNEVICTABLE or
* memcg is too small and all pages are not on LRU. In that case,
* we use the current node.
*/
if (unlikely(node == MAX_NUMNODES))
node = numa_node_id();
memcg->last_scanned_node = node;
return node;
}
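/*
 * Worked example (illustrative): if scan_nodes currently contains nodes
 * {0, 2} and last_scanned_node was 0, next_node() yields 2; on the next
 * call next_node(2, ...) returns MAX_NUMNODES and first_node() wraps the
 * walk back to node 0, giving the round-robin order 0, 2, 0, 2, ...
 */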
#else
int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
{
return 0;
}
#endif
/*
* A group is eligible for the soft limit reclaim if
* a) it is over its soft limit
* b) any parent up the hierarchy is over its soft limit
*/
bool mem_cgroup_soft_reclaim_eligible(struct mem_cgroup *memcg)
{
struct mem_cgroup *parent = memcg;
if (res_counter_soft_limit_excess(&memcg->res))
return true;
/*
* If any parent up the hierarchy is over its soft limit then we
* have to obey and reclaim from this group as well.
*/
while ((parent = parent_mem_cgroup(parent)))
if (res_counter_soft_limit_excess(&parent->res))
return true;
return false;
}
static DEFINE_SPINLOCK(memcg_oom_lock);
/*
* Check OOM-Killer is already running under our hierarchy.
* If someone is running, return false.
*/
static bool mem_cgroup_oom_trylock(struct mem_cgroup *memcg)
{
struct mem_cgroup *iter, *failed = NULL;
spin_lock(&memcg_oom_lock);
for_each_mem_cgroup_tree(iter, memcg) {
if (iter->oom_lock) {
/*
* this subtree of our hierarchy is already locked
* so we cannot give a lock.
*/
failed = iter;
mem_cgroup_iter_break(memcg, iter);
break;
} else
iter->oom_lock = true;
}
if (failed) {
/*
* OK, we failed to lock the whole subtree so we have
* to clean up what we set up to the failing subtree
*/
for_each_mem_cgroup_tree(iter, memcg) {
if (iter == failed) {
mem_cgroup_iter_break(memcg, iter);
break;
}
iter->oom_lock = false;
}
}
spin_unlock(&memcg_oom_lock);
return !failed;
}
static void mem_cgroup_oom_unlock(struct mem_cgroup *memcg)
{
struct mem_cgroup *iter;
spin_lock(&memcg_oom_lock);
for_each_mem_cgroup_tree(iter, memcg)
iter->oom_lock = false;
spin_unlock(&memcg_oom_lock);
}
static void mem_cgroup_mark_under_oom(struct mem_cgroup *memcg)
{
struct mem_cgroup *iter;
for_each_mem_cgroup_tree(iter, memcg)
atomic_inc(&iter->under_oom);
}
static void mem_cgroup_unmark_under_oom(struct mem_cgroup *memcg)
{
struct mem_cgroup *iter;
/*
* When a new child is created while the hierarchy is under oom,
* mem_cgroup_oom_lock() may not be called. We have to use
* atomic_add_unless() here.
*/
for_each_mem_cgroup_tree(iter, memcg)
atomic_add_unless(&iter->under_oom, -1, 0);
}
static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq);
struct oom_wait_info {
struct mem_cgroup *memcg;
wait_queue_t wait;
};
static int memcg_oom_wake_function(wait_queue_t *wait,
unsigned mode, int sync, void *arg)
{
struct mem_cgroup *wake_memcg = (struct mem_cgroup *)arg;
struct mem_cgroup *oom_wait_memcg;
struct oom_wait_info *oom_wait_info;
oom_wait_info = container_of(wait, struct oom_wait_info, wait);
oom_wait_memcg = oom_wait_info->memcg;
/*
* Both of oom_wait_info->memcg and wake_memcg are stable under us.
* Then we can use css_is_ancestor without taking care of RCU.
*/
if (!mem_cgroup_same_or_subtree(oom_wait_memcg, wake_memcg)
&& !mem_cgroup_same_or_subtree(wake_memcg, oom_wait_memcg))
return 0;
return autoremove_wake_function(wait, mode, sync, arg);
}
static void memcg_wakeup_oom(struct mem_cgroup *memcg)
{
atomic_inc(&memcg->oom_wakeups);
/* for filtering, pass "memcg" as argument. */
__wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg);
}
static void memcg_oom_recover(struct mem_cgroup *memcg)
{
if (memcg && atomic_read(&memcg->under_oom))
memcg_wakeup_oom(memcg);
}
/*
* try to call OOM killer
*/
static void mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order)
{
bool locked;
int wakeups;
if (!current->memcg_oom.may_oom)
return;
current->memcg_oom.in_memcg_oom = 1;
/*
* As with any blocking lock, a contender needs to start
* listening for wakeups before attempting the trylock,
* otherwise it can miss the wakeup from the unlock and sleep
* indefinitely. This is just open-coded because our locking
* is so particular to memcg hierarchies.
*/
wakeups = atomic_read(&memcg->oom_wakeups);
mem_cgroup_mark_under_oom(memcg);
locked = mem_cgroup_oom_trylock(memcg);
if (locked && !memcg->oom_kill_disable) {
mem_cgroup_unmark_under_oom(memcg);
mem_cgroup_out_of_memory(memcg, mask, order);
mem_cgroup_oom_unlock(memcg);
/*
* There is no guarantee that an OOM-lock contender
* sees the wakeups triggered by the OOM kill
* uncharges. Wake any sleepers explicitly.
*/
memcg_oom_recover(memcg);
} else {
/*
* A system call can just return -ENOMEM, but if this
* is a page fault and somebody else is handling the
* OOM already, we need to sleep on the OOM waitqueue
* for this memcg until the situation is resolved.
* Which can take some time because it might be
* handled by a userspace task.
*
* However, this is the charge context, which means
* that we may sit on a large call stack and hold
* various filesystem locks, the mmap_sem etc. and we
* don't want the OOM handler to deadlock on them
* while we sit here and wait. Store the current OOM
* context in the task_struct, then return -ENOMEM.
* At the end of the page fault handler, with the
* stack unwound, pagefault_out_of_memory() will check
* back with us by calling
* mem_cgroup_oom_synchronize(), possibly putting the
* task to sleep.
*/
current->memcg_oom.oom_locked = locked;
current->memcg_oom.wakeups = wakeups;
css_get(&memcg->css);
current->memcg_oom.wait_on_memcg = memcg;
}
}
/**
* mem_cgroup_oom_synchronize - complete memcg OOM handling
*
* This has to be called at the end of a page fault if the memcg
* OOM handler was enabled and the fault is returning %VM_FAULT_OOM.
*
* Memcg supports userspace OOM handling, so failed allocations must
* sleep on a waitqueue until the userspace task resolves the
* situation. Sleeping directly in the charge context with all kinds
* of locks held is not a good idea, instead we remember an OOM state
* in the task and mem_cgroup_oom_synchronize() has to be called at
* the end of the page fault to put the task to sleep and clean up the
* OOM state.
*
* Returns %true if an ongoing memcg OOM situation was detected and
* finalized, %false otherwise.
*/
bool mem_cgroup_oom_synchronize(void)
{
struct oom_wait_info owait;
struct mem_cgroup *memcg;
/* OOM is global, do not handle */
if (!current->memcg_oom.in_memcg_oom)
return false;
/*
* We invoked the OOM killer but there is a chance that a kill
* did not free up any charges. Everybody else might already
* be sleeping, so restart the fault and keep the rampage
* going until some charges are released.
*/
memcg = current->memcg_oom.wait_on_memcg;
if (!memcg)