Newer
Older
/*
* Iteration constructs for visiting all cgroups (under a tree). If
* loops are exited prematurely (break), mem_cgroup_iter_break() must
* be used for reference counting.
*/
#define for_each_mem_cgroup_tree(iter, root) \
for (iter = mem_cgroup_iter(root, NULL, NULL); \
iter != NULL; \
iter = mem_cgroup_iter(root, iter, NULL))
#define for_each_mem_cgroup(iter) \
for (iter = mem_cgroup_iter(NULL, NULL, NULL); \
iter != NULL; \
iter = mem_cgroup_iter(NULL, iter, NULL))
void mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx)
{
if (!mm)
return;
rcu_read_lock();
memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
if (unlikely(!memcg))
goto out;
switch (idx) {
case PGFAULT:
this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGFAULT]);
break;
case PGMAJFAULT:
this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGMAJFAULT]);
break;
default:
BUG();
}
out:
rcu_read_unlock();
}
EXPORT_SYMBOL(mem_cgroup_count_vm_event);
/**
* mem_cgroup_zone_lruvec - get the lru list vector for a zone and memcg
* @zone: zone of the wanted lruvec
* @memcg: memcg of the wanted lruvec
*
* Returns the lru list vector holding pages for the given @zone and
* @mem. This can be the global zone lruvec, if the memory controller
* is disabled.
*/
struct lruvec *mem_cgroup_zone_lruvec(struct zone *zone,
struct mem_cgroup *memcg)
{
struct mem_cgroup_per_zone *mz;
if (mem_cgroup_disabled()) {
lruvec = &zone->lruvec;
goto out;
}
mz = mem_cgroup_zoneinfo(memcg, zone_to_nid(zone), zone_idx(zone));
lruvec = &mz->lruvec;
out:
/*
* Since a node can be onlined after the mem_cgroup was created,
* we have to be prepared to initialize lruvec->zone here;
* and if offlined then reonlined, we need to reinitialize it.
*/
if (unlikely(lruvec->zone != zone))
lruvec->zone = zone;
return lruvec;
/*
* Following LRU functions are allowed to be used without PCG_LOCK.
* Operations are called by routine of global LRU independently from memcg.
* What we have to take care of here is validness of pc->mem_cgroup.
*
* Changes to pc->mem_cgroup happens when
* 1. charge
* 2. moving account
* In typical case, "charge" is done before add-to-lru. Exception is SwapCache.
* It is added to LRU before charge.
* If PCG_USED bit is not set, page_cgroup is not added to this private LRU.
* When moving account, the page is not on LRU. It's isolated.
*/
* mem_cgroup_page_lruvec - return lruvec for adding an lru page
struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct zone *zone)
struct mem_cgroup *memcg;
struct page_cgroup *pc;

KAMEZAWA Hiroyuki
committed
if (mem_cgroup_disabled()) {
lruvec = &zone->lruvec;
goto out;
}
* Surreptitiously switch any uncharged offlist page to root:
* an uncharged page off lru does nothing to secure
* its former mem_cgroup from sudden removal.
*
* Our caller holds lru_lock, and PageCgroupUsed is updated
* under page_cgroup lock: between them, they make all uses
* of pc->mem_cgroup safe.
*/
if (!PageLRU(page) && !PageCgroupUsed(pc) && memcg != root_mem_cgroup)
pc->mem_cgroup = memcg = root_mem_cgroup;
mz = page_cgroup_zoneinfo(memcg, page);
lruvec = &mz->lruvec;
out:
/*
* Since a node can be onlined after the mem_cgroup was created,
* we have to be prepared to initialize lruvec->zone here;
* and if offlined then reonlined, we need to reinitialize it.
*/
if (unlikely(lruvec->zone != zone))
lruvec->zone = zone;
return lruvec;
* mem_cgroup_update_lru_size - account for adding or removing an lru page
* @lruvec: mem_cgroup per zone lru vector
* @lru: index of lru list the page is sitting on
* @nr_pages: positive when adding or negative when removing
* This function must be called when a page is added to or removed from an
* lru list.
void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru,
int nr_pages)
{
struct mem_cgroup_per_zone *mz;
if (mem_cgroup_disabled())
return;
mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec);
lru_size = mz->lru_size + lru;
*lru_size += nr_pages;
VM_BUG_ON((long)(*lru_size) < 0);
* Checks whether given mem is same or in the root_mem_cgroup's
* hierarchy subtree
*/

Johannes Weiner
committed
bool __mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg,
struct mem_cgroup *memcg)
if (root_memcg == memcg)
return true;
if (!root_memcg->use_hierarchy || !memcg)
return false;

Johannes Weiner
committed
return css_is_ancestor(&memcg->css, &root_memcg->css);
}
static bool mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg,
struct mem_cgroup *memcg)
{
bool ret;
rcu_read_lock();

Johannes Weiner
committed
ret = __mem_cgroup_same_or_subtree(root_memcg, memcg);
rcu_read_unlock();
return ret;
int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *memcg)
struct task_struct *p;
p = find_lock_task_mm(task);

David Rientjes
committed
if (p) {
curr = try_get_mem_cgroup_from_mm(p->mm);
task_unlock(p);
} else {
/*
* All threads may have already detached their mm's, but the oom
* killer still needs to detect if they have already been oom
* killed to prevent needlessly killing additional tasks.
*/
task_lock(task);
curr = mem_cgroup_from_task(task);
if (curr)
css_get(&curr->css);
task_unlock(task);
}
* We should check use_hierarchy of "memcg" not "curr". Because checking
* use_hierarchy of "curr" here make this function true if hierarchy is
* enabled in "curr" and "curr" is a child of "memcg" in *cgroup*
* hierarchy(even if use_hierarchy is disabled in "memcg").
ret = mem_cgroup_same_or_subtree(memcg, curr);
return ret;
}
int mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec)
unsigned long inactive_ratio;
unsigned long active;
inactive = mem_cgroup_get_lru_size(lruvec, LRU_INACTIVE_ANON);
active = mem_cgroup_get_lru_size(lruvec, LRU_ACTIVE_ANON);
gb = (inactive + active) >> (30 - PAGE_SHIFT);
if (gb)
inactive_ratio = int_sqrt(10 * gb);
else
inactive_ratio = 1;
return inactive * inactive_ratio < active;
int mem_cgroup_inactive_file_is_low(struct lruvec *lruvec)
{
unsigned long active;
unsigned long inactive;
inactive = mem_cgroup_get_lru_size(lruvec, LRU_INACTIVE_FILE);
active = mem_cgroup_get_lru_size(lruvec, LRU_ACTIVE_FILE);
return (active > inactive);
}
#define mem_cgroup_from_res_counter(counter, member) \
container_of(counter, struct mem_cgroup, member)
/**
* mem_cgroup_margin - calculate chargeable space of a memory cgroup
*
* Returns the maximum amount of memory @mem can be charged with, in
*/
static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg)
{
unsigned long long margin;
margin = res_counter_margin(&memcg->res);
if (do_swap_account)
margin = min(margin, res_counter_margin(&memcg->memsw));
return margin >> PAGE_SHIFT;
}
int mem_cgroup_swappiness(struct mem_cgroup *memcg)
{
struct cgroup *cgrp = memcg->css.cgroup;
/* root ? */
if (cgrp->parent == NULL)
return vm_swappiness;
/*
* memcg->moving_account is used for checking possibility that some thread is
* calling move_account(). When a thread on CPU-A starts moving pages under
* a memcg, other threads should check memcg->moving_account under
* rcu_read_lock(), like this:
*
* CPU-A CPU-B
* rcu_read_lock()
* memcg->moving_account+1 if (memcg->mocing_account)
* take heavy locks.
* synchronize_rcu() update something.
* rcu_read_unlock()
* start move here.
*/
/* for quick checking without looking up memcg */
atomic_t memcg_moving __read_mostly;
static void mem_cgroup_start_move(struct mem_cgroup *memcg)

KAMEZAWA Hiroyuki
committed
{
atomic_inc(&memcg_moving);
atomic_inc(&memcg->moving_account);

KAMEZAWA Hiroyuki
committed
synchronize_rcu();
}
static void mem_cgroup_end_move(struct mem_cgroup *memcg)

KAMEZAWA Hiroyuki
committed
{
/*
* Now, mem_cgroup_clear_mc() may call this function with NULL.
* We check NULL in callee rather than caller.
*/
if (memcg) {
atomic_dec(&memcg_moving);
atomic_dec(&memcg->moving_account);

KAMEZAWA Hiroyuki
committed
}

KAMEZAWA Hiroyuki
committed
/*
* 2 routines for checking "mem" is under move_account() or not.
*
* mem_cgroup_stolen() - checking whether a cgroup is mc.from or not. This
* is used for avoiding races in accounting. If true,

KAMEZAWA Hiroyuki
committed
* pc->mem_cgroup may be overwritten.
*
* mem_cgroup_under_move() - checking a cgroup is mc.from or mc.to or
* under hierarchy of moving cgroups. This is for
* waiting at hith-memory prressure caused by "move".
*/
static bool mem_cgroup_stolen(struct mem_cgroup *memcg)

KAMEZAWA Hiroyuki
committed
{
VM_BUG_ON(!rcu_read_lock_held());
return atomic_read(&memcg->moving_account) > 0;

KAMEZAWA Hiroyuki
committed
}
static bool mem_cgroup_under_move(struct mem_cgroup *memcg)
struct mem_cgroup *from;
struct mem_cgroup *to;
/*
* Unlike task_move routines, we access mc.to, mc.from not under
* mutual exclusion by cgroup_mutex. Here, we take spinlock instead.
*/
spin_lock(&mc.lock);
from = mc.from;
to = mc.to;
if (!from)
goto unlock;
ret = mem_cgroup_same_or_subtree(memcg, from)
|| mem_cgroup_same_or_subtree(memcg, to);
unlock:
spin_unlock(&mc.lock);
static bool mem_cgroup_wait_acct_move(struct mem_cgroup *memcg)
{
if (mc.moving_task && current != mc.moving_task) {
if (mem_cgroup_under_move(memcg)) {
DEFINE_WAIT(wait);
prepare_to_wait(&mc.waitq, &wait, TASK_INTERRUPTIBLE);
/* moving charge context might have finished. */
if (mc.moving_task)
schedule();
finish_wait(&mc.waitq, &wait);
return true;
}
}
return false;
}
/*
* Take this lock when
* - a code tries to modify page's memcg while it's USED.
* - a code tries to modify page state accounting in a memcg.
*/
static void move_lock_mem_cgroup(struct mem_cgroup *memcg,
unsigned long *flags)
{
spin_lock_irqsave(&memcg->move_lock, *flags);
}
static void move_unlock_mem_cgroup(struct mem_cgroup *memcg,
unsigned long *flags)
{
spin_unlock_irqrestore(&memcg->move_lock, *flags);
}
* mem_cgroup_print_oom_info: Called from OOM with tasklist_lock held in read mode.
* @memcg: The memory cgroup that went over limit
* @p: Task that is going to be killed
*
* NOTE: @memcg and @p's mem_cgroup can be different when hierarchy is
* enabled
*/
void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
{
struct cgroup *task_cgrp;
struct cgroup *mem_cgrp;
/*
* Need a buffer in BSS, can't rely on allocations. The code relies
* on the assumption that OOM is serialized for memory controller.
* If this assumption is broken, revisit this code.
*/
static char memcg_name[PATH_MAX];
int ret;
if (!memcg || !p)
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
return;
rcu_read_lock();
mem_cgrp = memcg->css.cgroup;
task_cgrp = task_cgroup(p, mem_cgroup_subsys_id);
ret = cgroup_path(task_cgrp, memcg_name, PATH_MAX);
if (ret < 0) {
/*
* Unfortunately, we are unable to convert to a useful name
* But we'll still print out the usage information
*/
rcu_read_unlock();
goto done;
}
rcu_read_unlock();
printk(KERN_INFO "Task in %s killed", memcg_name);
rcu_read_lock();
ret = cgroup_path(mem_cgrp, memcg_name, PATH_MAX);
if (ret < 0) {
rcu_read_unlock();
goto done;
}
rcu_read_unlock();
/*
* Continues from above, so we don't need an KERN_ level
*/
printk(KERN_CONT " as a result of limit of %s\n", memcg_name);
done:
printk(KERN_INFO "memory: usage %llukB, limit %llukB, failcnt %llu\n",
res_counter_read_u64(&memcg->res, RES_USAGE) >> 10,
res_counter_read_u64(&memcg->res, RES_LIMIT) >> 10,
res_counter_read_u64(&memcg->res, RES_FAILCNT));
printk(KERN_INFO "memory+swap: usage %llukB, limit %llukB, "
"failcnt %llu\n",
res_counter_read_u64(&memcg->memsw, RES_USAGE) >> 10,
res_counter_read_u64(&memcg->memsw, RES_LIMIT) >> 10,
res_counter_read_u64(&memcg->memsw, RES_FAILCNT));
}
/*
* This function returns the number of memcg under hierarchy tree. Returns
* 1(self count) if no children.
*/
static int mem_cgroup_count_children(struct mem_cgroup *memcg)
{
int num = 0;
for_each_mem_cgroup_tree(iter, memcg)
return num;
}
/*
* Return the memory (and swap, if configured) limit for a memcg.
*/
static u64 mem_cgroup_get_limit(struct mem_cgroup *memcg)
limit = res_counter_read_u64(&memcg->res, RES_LIMIT);
* Do not consider swap space if we cannot swap due to swappiness
if (mem_cgroup_swappiness(memcg)) {
u64 memsw;
limit += total_swap_pages << PAGE_SHIFT;
memsw = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
/*
* If memsw is finite and limits the amount of swap space
* available to this memcg, return that limit.
*/
limit = min(limit, memsw);
}
return limit;
static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
int order)
{
struct mem_cgroup *iter;
unsigned long chosen_points = 0;
unsigned long totalpages;
unsigned int points = 0;
struct task_struct *chosen = NULL;
/*
* If current has a pending SIGKILL, then automatically select it. The
* goal is to allow it to allocate so that it may quickly exit and free
* its memory.
*/
if (fatal_signal_pending(current)) {
set_thread_flag(TIF_MEMDIE);
return;
}
check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, order, NULL);
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
totalpages = mem_cgroup_get_limit(memcg) >> PAGE_SHIFT ? : 1;
for_each_mem_cgroup_tree(iter, memcg) {
struct cgroup *cgroup = iter->css.cgroup;
struct cgroup_iter it;
struct task_struct *task;
cgroup_iter_start(cgroup, &it);
while ((task = cgroup_iter_next(cgroup, &it))) {
switch (oom_scan_process_thread(task, totalpages, NULL,
false)) {
case OOM_SCAN_SELECT:
if (chosen)
put_task_struct(chosen);
chosen = task;
chosen_points = ULONG_MAX;
get_task_struct(chosen);
/* fall through */
case OOM_SCAN_CONTINUE:
continue;
case OOM_SCAN_ABORT:
cgroup_iter_end(cgroup, &it);
mem_cgroup_iter_break(memcg, iter);
if (chosen)
put_task_struct(chosen);
return;
case OOM_SCAN_OK:
break;
};
points = oom_badness(task, memcg, NULL, totalpages);
if (points > chosen_points) {
if (chosen)
put_task_struct(chosen);
chosen = task;
chosen_points = points;
get_task_struct(chosen);
}
}
cgroup_iter_end(cgroup, &it);
}
if (!chosen)
return;
points = chosen_points * 1000 / totalpages;
oom_kill_process(chosen, gfp_mask, order, points, totalpages, memcg,
NULL, "Memory cgroup out of memory");
}
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
static unsigned long mem_cgroup_reclaim(struct mem_cgroup *memcg,
gfp_t gfp_mask,
unsigned long flags)
{
unsigned long total = 0;
bool noswap = false;
int loop;
if (flags & MEM_CGROUP_RECLAIM_NOSWAP)
noswap = true;
if (!(flags & MEM_CGROUP_RECLAIM_SHRINK) && memcg->memsw_is_minimum)
noswap = true;
for (loop = 0; loop < MEM_CGROUP_MAX_RECLAIM_LOOPS; loop++) {
if (loop)
drain_all_stock_async(memcg);
total += try_to_free_mem_cgroup_pages(memcg, gfp_mask, noswap);
/*
* Allow limit shrinkers, which are triggered directly
* by userspace, to catch signals and stop reclaim
* after minimal progress, regardless of the margin.
*/
if (total && (flags & MEM_CGROUP_RECLAIM_SHRINK))
break;
if (mem_cgroup_margin(memcg))
break;
/*
* If nothing was reclaimed after two attempts, there
* may be no reclaimable pages in this hierarchy.
*/
if (loop && !total)
break;
}
return total;
}
/**
* test_mem_cgroup_node_reclaimable
* @nid: the node ID to be checked.
* @noswap : specify true here if the user wants flle only information.
*
* This function returns whether the specified memcg contains any
* reclaimable pages on a node. Returns true if there are any reclaimable
* pages in the node.
*/
static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *memcg,
int nid, bool noswap)
{
if (mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL_FILE))
return true;
if (noswap || !total_swap_pages)
return false;
if (mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL_ANON))
return true;
return false;
}
#if MAX_NUMNODES > 1
/*
* Always updating the nodemask is not very good - even if we have an empty
* list or the wrong list here, we can start from some node and traverse all
* nodes based on the zonelist. So update the list loosely once per 10 secs.
*
*/
static void mem_cgroup_may_update_nodemask(struct mem_cgroup *memcg)
/*
* numainfo_events > 0 means there was at least NUMAINFO_EVENTS_TARGET
* pagein/pageout changes since the last update.
*/
if (!atomic_read(&memcg->numainfo_events))
return;
if (atomic_inc_return(&memcg->numainfo_updating) > 1)
return;
/* make a nodemask where this memcg uses memory from */
memcg->scan_nodes = node_states[N_HIGH_MEMORY];
for_each_node_mask(nid, node_states[N_HIGH_MEMORY]) {
if (!test_mem_cgroup_node_reclaimable(memcg, nid, false))
node_clear(nid, memcg->scan_nodes);
atomic_set(&memcg->numainfo_events, 0);
atomic_set(&memcg->numainfo_updating, 0);
}
/*
* Selecting a node where we start reclaim from. Because what we need is just
* reducing usage counter, start from anywhere is O,K. Considering
* memory reclaim from current node, there are pros. and cons.
*
* Freeing memory from current node means freeing memory from a node which
* we'll use or we've used. So, it may make LRU bad. And if several threads
* hit limits, it will see a contention on a node. But freeing from remote
* node means more costs for memory reclaim because of memory latency.
*
* Now, we use round-robin. Better algorithm is welcomed.
*/
int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
mem_cgroup_may_update_nodemask(memcg);
node = memcg->last_scanned_node;
node = next_node(node, memcg->scan_nodes);
if (node == MAX_NUMNODES)
node = first_node(memcg->scan_nodes);
/*
* We call this when we hit limit, not when pages are added to LRU.
* No LRU may hold pages because all pages are UNEVICTABLE or
* memcg is too small and all pages are not on LRU. In that case,
* we use curret node.
*/
if (unlikely(node == MAX_NUMNODES))
node = numa_node_id();
return node;
}
/*
* Check all nodes whether it contains reclaimable pages or not.
* For quick scan, we make use of scan_nodes. This will allow us to skip
* unused nodes. But scan_nodes is lazily updated and may not cotain
* enough new information. We need to do double check.
*/
static bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap)
{
int nid;
/*
* quick check...making use of scan_node.
* We can skip unused nodes.
*/
if (!nodes_empty(memcg->scan_nodes)) {
for (nid = first_node(memcg->scan_nodes);
nid = next_node(nid, memcg->scan_nodes)) {
if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap))
return true;
}
}
/*
* Check rest of nodes.
*/
for_each_node_state(nid, N_HIGH_MEMORY) {
if (node_isset(nid, memcg->scan_nodes))
if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap))
return true;
}
return false;
}
int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
{
return 0;
}
static bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap)
return test_mem_cgroup_node_reclaimable(memcg, 0, noswap);
static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg,
struct zone *zone,
gfp_t gfp_mask,
unsigned long *total_scanned)
struct mem_cgroup *victim = NULL;
int total = 0;
unsigned long excess;
unsigned long nr_scanned;
struct mem_cgroup_reclaim_cookie reclaim = {
.zone = zone,
.priority = 0,
};
excess = res_counter_soft_limit_excess(&root_memcg->res) >> PAGE_SHIFT;
victim = mem_cgroup_iter(root_memcg, victim, &reclaim);
if (loop >= 2) {
/*
* If we have not been able to reclaim
* anything, it might because there are
* no reclaimable pages under this hierarchy
*/
* excess >> 2 is not to excessive so as to
* reclaim too much, nor too less that we keep
* coming back to reclaim from this cgroup
*/
if (total >= (excess >> 2) ||
(loop > MEM_CGROUP_MAX_RECLAIM_LOOPS))
if (!mem_cgroup_reclaimable(victim, false))
total += mem_cgroup_shrink_node_zone(victim, gfp_mask, false,
zone, &nr_scanned);
*total_scanned += nr_scanned;
if (!res_counter_soft_limit_excess(&root_memcg->res))
mem_cgroup_iter_break(root_memcg, victim);
/*
* Check OOM-Killer is already running under our hierarchy.
* If someone is running, return false.
* Has to be called with memcg_oom_lock
static bool mem_cgroup_oom_lock(struct mem_cgroup *memcg)
struct mem_cgroup *iter, *failed = NULL;
for_each_mem_cgroup_tree(iter, memcg) {
/*
* this subtree of our hierarchy is already locked
* so we cannot give a lock.
*/
failed = iter;
mem_cgroup_iter_break(memcg, iter);
break;
} else
iter->oom_lock = true;
/*
* OK, we failed to lock the whole subtree so we have to clean up
* what we set up to the failing subtree
*/
for_each_mem_cgroup_tree(iter, memcg) {
if (iter == failed) {
mem_cgroup_iter_break(memcg, iter);
break;
}
iter->oom_lock = false;
}
* Has to be called with memcg_oom_lock
static int mem_cgroup_oom_unlock(struct mem_cgroup *memcg)
for_each_mem_cgroup_tree(iter, memcg)
iter->oom_lock = false;
return 0;
}
static void mem_cgroup_mark_under_oom(struct mem_cgroup *memcg)
{
struct mem_cgroup *iter;
for_each_mem_cgroup_tree(iter, memcg)
atomic_inc(&iter->under_oom);
}
static void mem_cgroup_unmark_under_oom(struct mem_cgroup *memcg)
{
struct mem_cgroup *iter;
/*
* When a new child is created while the hierarchy is under oom,
* mem_cgroup_oom_lock() may not be called. We have to use
* atomic_add_unless() here.
*/
for_each_mem_cgroup_tree(iter, memcg)
atomic_add_unless(&iter->under_oom, -1, 0);
static DEFINE_SPINLOCK(memcg_oom_lock);
static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq);
wait_queue_t wait;
};
static int memcg_oom_wake_function(wait_queue_t *wait,
unsigned mode, int sync, void *arg)
{
struct mem_cgroup *wake_memcg = (struct mem_cgroup *)arg;
struct mem_cgroup *oom_wait_memcg;
struct oom_wait_info *oom_wait_info;
oom_wait_info = container_of(wait, struct oom_wait_info, wait);
oom_wait_memcg = oom_wait_info->memcg;
* Both of oom_wait_info->memcg and wake_memcg are stable under us.
* Then we can use css_is_ancestor without taking care of RCU.
*/
if (!mem_cgroup_same_or_subtree(oom_wait_memcg, wake_memcg)
&& !mem_cgroup_same_or_subtree(wake_memcg, oom_wait_memcg))
return 0;
return autoremove_wake_function(wait, mode, sync, arg);
}
static void memcg_wakeup_oom(struct mem_cgroup *memcg)
/* for filtering, pass "memcg" as argument. */
__wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg);
static void memcg_oom_recover(struct mem_cgroup *memcg)
if (memcg && atomic_read(&memcg->under_oom))
memcg_wakeup_oom(memcg);
/*
* try to call OOM killer. returns false if we should exit memory-reclaim loop.
*/
static bool mem_cgroup_handle_oom(struct mem_cgroup *memcg, gfp_t mask,
int order)
owait.wait.flags = 0;
owait.wait.func = memcg_oom_wake_function;
owait.wait.private = current;
INIT_LIST_HEAD(&owait.wait.task_list);
mem_cgroup_mark_under_oom(memcg);
/* At first, try to OOM lock hierarchy under memcg.*/
locked = mem_cgroup_oom_lock(memcg);
/*
* Even if signal_pending(), we can't quit charge() loop without
* accounting. So, UNINTERRUPTIBLE is appropriate. But SIGKILL
* under OOM is always welcomed, use TASK_KILLABLE here.
*/
prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE);
if (!locked || memcg->oom_kill_disable)
need_to_kill = false;
if (locked)
if (need_to_kill) {
finish_wait(&memcg_oom_waitq, &owait.wait);
mem_cgroup_out_of_memory(memcg, mask, order);
finish_wait(&memcg_oom_waitq, &owait.wait);
mem_cgroup_oom_unlock(memcg);
memcg_wakeup_oom(memcg);
mem_cgroup_unmark_under_oom(memcg);
if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current))
return false;
/* Give chance to dying process */
schedule_timeout_uninterruptible(1);
/*
* Currently used to update mapped file statistics, but the routine can be
* generalized to update other statistics as well.

KAMEZAWA Hiroyuki
committed
*
* Notes: Race condition
*
* We usually use page_cgroup_lock() for accessing page_cgroup member but
* it tends to be costly. But considering some conditions, we doesn't need
* to do so _always_.
*
* Considering "charge", lock_page_cgroup() is not required because all
* file-stat operations happen after a page is attached to radix-tree. There
* are no race with "charge".
*
* Considering "uncharge", we know that memcg doesn't clear pc->mem_cgroup
* at "uncharge" intentionally. So, we always see valid pc->mem_cgroup even
* if there are race with "uncharge". Statistics itself is properly handled
* by flags.
*
* Considering "move", this is an only case we see a race. To make the race
* small, we check mm->moving_account and detect there are possibility of race
* If there is, we take a lock.
void __mem_cgroup_begin_update_page_stat(struct page *page,
bool *locked, unsigned long *flags)
{
struct mem_cgroup *memcg;
struct page_cgroup *pc;
pc = lookup_page_cgroup(page);
again:
memcg = pc->mem_cgroup;
if (unlikely(!memcg || !PageCgroupUsed(pc)))
return;
/*
* If this memory cgroup is not under account moving, we don't

Wanpeng Li
committed
* need to take move_lock_mem_cgroup(). Because we already hold
* rcu_read_lock(), any calls to move_account will be delayed until
* rcu_read_unlock() if mem_cgroup_stolen() == true.