Newer
Older
/* huge page split is done under lru_lock. so, we have no races. */
MEM_CGROUP_ZSTAT(mz, lru) += 1 << compound_order(page);
SetPageCgroupAcctLRU(pc);
if (mem_cgroup_is_root(pc->mem_cgroup))
return;
list_add(&pc->lru, &mz->lists[lru]);
}
* At handling SwapCache and other FUSE stuff, pc->mem_cgroup may be changed
* while it's linked to lru because the page may be reused after it's fully
* uncharged. To handle that, unlink page_cgroup from LRU when charge it again.
* It's done under lock_page and expected that zone->lru_lock isnever held.
static void mem_cgroup_lru_del_before_commit(struct page *page)
unsigned long flags;
struct zone *zone = page_zone(page);
struct page_cgroup *pc = lookup_page_cgroup(page);
/*
* Doing this check without taking ->lru_lock seems wrong but this
* is safe. Because if page_cgroup's USED bit is unset, the page
* will not be added to any memcg's LRU. If page_cgroup's USED bit is
* set, the commit after this will fail, anyway.
* This all charge/uncharge is done under some mutual execustion.
* So, we don't need to taking care of changes in USED bit.
*/
if (likely(!PageLRU(page)))
return;
spin_lock_irqsave(&zone->lru_lock, flags);
/*
* Forget old LRU when this page_cgroup is *not* used. This Used bit
* is guarded by lock_page() because the page is SwapCache.
*/
if (!PageCgroupUsed(pc))
mem_cgroup_del_lru_list(page, page_lru(page));
spin_unlock_irqrestore(&zone->lru_lock, flags);
static void mem_cgroup_lru_add_after_commit(struct page *page)
{
unsigned long flags;
struct zone *zone = page_zone(page);
struct page_cgroup *pc = lookup_page_cgroup(page);
/* taking care of that the page is added to LRU while we commit it */
if (likely(!PageLRU(page)))
return;
spin_lock_irqsave(&zone->lru_lock, flags);
/* link when the page is linked to LRU but page_cgroup isn't */
if (PageLRU(page) && !PageCgroupAcctLRU(pc))
mem_cgroup_add_lru_list(page, page_lru(page));
spin_unlock_irqrestore(&zone->lru_lock, flags);
}
void mem_cgroup_move_lists(struct page *page,
enum lru_list from, enum lru_list to)
{
return;
mem_cgroup_del_lru_list(page, from);
mem_cgroup_add_lru_list(page, to);
* Checks whether given mem is same or in the root_mem_cgroup's
* hierarchy subtree
*/
static bool mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg,
struct mem_cgroup *memcg)
if (root_memcg != memcg) {
return (root_memcg->use_hierarchy &&
css_is_ancestor(&memcg->css, &root_memcg->css));
}
return true;
}
int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *memcg)
struct task_struct *p;
p = find_lock_task_mm(task);
if (!p)
return 0;
curr = try_get_mem_cgroup_from_mm(p->mm);
task_unlock(p);
* We should check use_hierarchy of "memcg" not "curr". Because checking
* use_hierarchy of "curr" here make this function true if hierarchy is
* enabled in "curr" and "curr" is a child of "memcg" in *cgroup*
* hierarchy(even if use_hierarchy is disabled in "memcg").
ret = mem_cgroup_same_or_subtree(memcg, curr);
return ret;
}
static int calc_inactive_ratio(struct mem_cgroup *memcg, unsigned long *present_pages)
{
unsigned long active;
unsigned long inactive;
unsigned long gb;
unsigned long inactive_ratio;
inactive = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_INACTIVE_ANON));
active = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_ACTIVE_ANON));
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
gb = (inactive + active) >> (30 - PAGE_SHIFT);
if (gb)
inactive_ratio = int_sqrt(10 * gb);
else
inactive_ratio = 1;
if (present_pages) {
present_pages[0] = inactive;
present_pages[1] = active;
}
return inactive_ratio;
}
int mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg)
{
unsigned long active;
unsigned long inactive;
unsigned long present_pages[2];
unsigned long inactive_ratio;
inactive_ratio = calc_inactive_ratio(memcg, present_pages);
inactive = present_pages[0];
active = present_pages[1];
if (inactive * inactive_ratio < active)
return 1;
return 0;
}
int mem_cgroup_inactive_file_is_low(struct mem_cgroup *memcg)
{
unsigned long active;
unsigned long inactive;
inactive = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_INACTIVE_FILE));
active = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_ACTIVE_FILE));
return (active > inactive);
}
struct zone_reclaim_stat *mem_cgroup_get_reclaim_stat(struct mem_cgroup *memcg,
struct zone *zone)
{
int nid = zone_to_nid(zone);
int zid = zone_idx(zone);
struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(memcg, nid, zid);
return &mz->reclaim_stat;
}
struct zone_reclaim_stat *
mem_cgroup_get_reclaim_stat_from_page(struct page *page)
{
struct page_cgroup *pc;
struct mem_cgroup_per_zone *mz;
if (mem_cgroup_disabled())
return NULL;
pc = lookup_page_cgroup(page);
if (!PageCgroupUsed(pc))
return NULL;
/* Ensure pc->mem_cgroup is visible after reading PCG_USED. */
smp_rmb();
mz = page_cgroup_zoneinfo(pc->mem_cgroup, page);
unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
struct list_head *dst,
unsigned long *scanned, int order,
isolate_mode_t mode,
struct zone *z,
struct mem_cgroup *mem_cont,
{
unsigned long nr_taken = 0;
struct page *page;
unsigned long scan;
LIST_HEAD(pc_list);
struct list_head *src;

KAMEZAWA Hiroyuki
committed
struct page_cgroup *pc, *tmp;
int nid = zone_to_nid(z);

KAMEZAWA Hiroyuki
committed
int zid = zone_idx(z);
struct mem_cgroup_per_zone *mz;
int lru = LRU_FILE * file + active;

KAMEZAWA Hiroyuki
committed
mz = mem_cgroup_zoneinfo(mem_cont, nid, zid);
src = &mz->lists[lru];

KAMEZAWA Hiroyuki
committed
scan = 0;
list_for_each_entry_safe_reverse(pc, tmp, src, lru) {

KAMEZAWA Hiroyuki
committed
break;
if (unlikely(!PageCgroupUsed(pc)))
continue;
page = lookup_cgroup_page(pc);

KAMEZAWA Hiroyuki
committed
continue;
ret = __isolate_lru_page(page, mode, file);
switch (ret) {
case 0:
list_move(&page->lru, dst);
nr_taken += hpage_nr_pages(page);
break;
case -EBUSY:
/* we don't affect global LRU but rotate in our LRU */
mem_cgroup_rotate_lru_list(page, page_lru(page));
break;
default:
break;
}
}
*scanned = scan;
trace_mm_vmscan_memcg_isolate(0, nr_to_scan, scan, nr_taken,
0, 0, 0, mode);
return nr_taken;
}
#define mem_cgroup_from_res_counter(counter, member) \
container_of(counter, struct mem_cgroup, member)
/**
* mem_cgroup_margin - calculate chargeable space of a memory cgroup
* @mem: the memory cgroup
*
* Returns the maximum amount of memory @mem can be charged with, in
*/
static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg)
{
unsigned long long margin;
margin = res_counter_margin(&memcg->res);
if (do_swap_account)
margin = min(margin, res_counter_margin(&memcg->memsw));
return margin >> PAGE_SHIFT;
}
int mem_cgroup_swappiness(struct mem_cgroup *memcg)
{
struct cgroup *cgrp = memcg->css.cgroup;
/* root ? */
if (cgrp->parent == NULL)
return vm_swappiness;
static void mem_cgroup_start_move(struct mem_cgroup *memcg)

KAMEZAWA Hiroyuki
committed
{
int cpu;
get_online_cpus();
spin_lock(&memcg->pcp_counter_lock);
for_each_online_cpu(cpu)
per_cpu(memcg->stat->count[MEM_CGROUP_ON_MOVE], cpu) += 1;
memcg->nocpu_base.count[MEM_CGROUP_ON_MOVE] += 1;
spin_unlock(&memcg->pcp_counter_lock);
put_online_cpus();

KAMEZAWA Hiroyuki
committed
synchronize_rcu();
}
static void mem_cgroup_end_move(struct mem_cgroup *memcg)

KAMEZAWA Hiroyuki
committed
{
int cpu;

KAMEZAWA Hiroyuki
committed
return;
get_online_cpus();
spin_lock(&memcg->pcp_counter_lock);
for_each_online_cpu(cpu)
per_cpu(memcg->stat->count[MEM_CGROUP_ON_MOVE], cpu) -= 1;
memcg->nocpu_base.count[MEM_CGROUP_ON_MOVE] -= 1;
spin_unlock(&memcg->pcp_counter_lock);
put_online_cpus();

KAMEZAWA Hiroyuki
committed
}
/*
* 2 routines for checking "mem" is under move_account() or not.
*
* mem_cgroup_stealed() - checking a cgroup is mc.from or not. This is used
* for avoiding race in accounting. If true,
* pc->mem_cgroup may be overwritten.
*
* mem_cgroup_under_move() - checking a cgroup is mc.from or mc.to or
* under hierarchy of moving cgroups. This is for
* waiting at hith-memory prressure caused by "move".
*/
static bool mem_cgroup_stealed(struct mem_cgroup *memcg)

KAMEZAWA Hiroyuki
committed
{
VM_BUG_ON(!rcu_read_lock_held());
return this_cpu_read(memcg->stat->count[MEM_CGROUP_ON_MOVE]) > 0;

KAMEZAWA Hiroyuki
committed
}
static bool mem_cgroup_under_move(struct mem_cgroup *memcg)
struct mem_cgroup *from;
struct mem_cgroup *to;
/*
* Unlike task_move routines, we access mc.to, mc.from not under
* mutual exclusion by cgroup_mutex. Here, we take spinlock instead.
*/
spin_lock(&mc.lock);
from = mc.from;
to = mc.to;
if (!from)
goto unlock;
ret = mem_cgroup_same_or_subtree(memcg, from)
|| mem_cgroup_same_or_subtree(memcg, to);
unlock:
spin_unlock(&mc.lock);
static bool mem_cgroup_wait_acct_move(struct mem_cgroup *memcg)
{
if (mc.moving_task && current != mc.moving_task) {
if (mem_cgroup_under_move(memcg)) {
DEFINE_WAIT(wait);
prepare_to_wait(&mc.waitq, &wait, TASK_INTERRUPTIBLE);
/* moving charge context might have finished. */
if (mc.moving_task)
schedule();
finish_wait(&mc.waitq, &wait);
return true;
}
}
return false;
}
* mem_cgroup_print_oom_info: Called from OOM with tasklist_lock held in read mode.
* @memcg: The memory cgroup that went over limit
* @p: Task that is going to be killed
*
* NOTE: @memcg and @p's mem_cgroup can be different when hierarchy is
* enabled
*/
void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
{
struct cgroup *task_cgrp;
struct cgroup *mem_cgrp;
/*
* Need a buffer in BSS, can't rely on allocations. The code relies
* on the assumption that OOM is serialized for memory controller.
* If this assumption is broken, revisit this code.
*/
static char memcg_name[PATH_MAX];
int ret;
if (!memcg || !p)
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
return;
rcu_read_lock();
mem_cgrp = memcg->css.cgroup;
task_cgrp = task_cgroup(p, mem_cgroup_subsys_id);
ret = cgroup_path(task_cgrp, memcg_name, PATH_MAX);
if (ret < 0) {
/*
* Unfortunately, we are unable to convert to a useful name
* But we'll still print out the usage information
*/
rcu_read_unlock();
goto done;
}
rcu_read_unlock();
printk(KERN_INFO "Task in %s killed", memcg_name);
rcu_read_lock();
ret = cgroup_path(mem_cgrp, memcg_name, PATH_MAX);
if (ret < 0) {
rcu_read_unlock();
goto done;
}
rcu_read_unlock();
/*
* Continues from above, so we don't need an KERN_ level
*/
printk(KERN_CONT " as a result of limit of %s\n", memcg_name);
done:
printk(KERN_INFO "memory: usage %llukB, limit %llukB, failcnt %llu\n",
res_counter_read_u64(&memcg->res, RES_USAGE) >> 10,
res_counter_read_u64(&memcg->res, RES_LIMIT) >> 10,
res_counter_read_u64(&memcg->res, RES_FAILCNT));
printk(KERN_INFO "memory+swap: usage %llukB, limit %llukB, "
"failcnt %llu\n",
res_counter_read_u64(&memcg->memsw, RES_USAGE) >> 10,
res_counter_read_u64(&memcg->memsw, RES_LIMIT) >> 10,
res_counter_read_u64(&memcg->memsw, RES_FAILCNT));
}
/*
* This function returns the number of memcg under hierarchy tree. Returns
* 1(self count) if no children.
*/
static int mem_cgroup_count_children(struct mem_cgroup *memcg)
{
int num = 0;
for_each_mem_cgroup_tree(iter, memcg)
return num;
}
/*
* Return the memory (and swap, if configured) limit for a memcg.
*/
u64 mem_cgroup_get_limit(struct mem_cgroup *memcg)
{
u64 limit;
u64 memsw;
limit = res_counter_read_u64(&memcg->res, RES_LIMIT);
limit += total_swap_pages << PAGE_SHIFT;
memsw = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
/*
* If memsw is finite and limits the amount of swap space available
* to this memcg, return that limit.
*/
return min(limit, memsw);
}
* Visit the first child (need not be the first child as per the ordering
* of the cgroup list, since we track last_scanned_child) of @mem and use
* that to reclaim free pages from.
*/
static struct mem_cgroup *
mem_cgroup_select_victim(struct mem_cgroup *root_memcg)
{
struct mem_cgroup *ret = NULL;
struct cgroup_subsys_state *css;
int nextid, found;
if (!root_memcg->use_hierarchy) {
css_get(&root_memcg->css);
ret = root_memcg;
nextid = root_memcg->last_scanned_child + 1;
css = css_get_next(&mem_cgroup_subsys, nextid, &root_memcg->css,
&found);
if (css && css_tryget(css))
ret = container_of(css, struct mem_cgroup, css);
rcu_read_unlock();
/* Updates scanning parameter */
if (!css) {
/* this means start scan from ID:1 */
root_memcg->last_scanned_child = 0;
root_memcg->last_scanned_child = found;
/**
* test_mem_cgroup_node_reclaimable
* @mem: the target memcg
* @nid: the node ID to be checked.
* @noswap : specify true here if the user wants flle only information.
*
* This function returns whether the specified memcg contains any
* reclaimable pages on a node. Returns true if there are any reclaimable
* pages in the node.
*/
static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *memcg,
int nid, bool noswap)
{
if (mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL_FILE))
return true;
if (noswap || !total_swap_pages)
return false;
if (mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL_ANON))
return true;
return false;
}
#if MAX_NUMNODES > 1
/*
* Always updating the nodemask is not very good - even if we have an empty
* list or the wrong list here, we can start from some node and traverse all
* nodes based on the zonelist. So update the list loosely once per 10 secs.
*
*/
static void mem_cgroup_may_update_nodemask(struct mem_cgroup *memcg)
/*
* numainfo_events > 0 means there was at least NUMAINFO_EVENTS_TARGET
* pagein/pageout changes since the last update.
*/
if (!atomic_read(&memcg->numainfo_events))
return;
if (atomic_inc_return(&memcg->numainfo_updating) > 1)
return;
/* make a nodemask where this memcg uses memory from */
memcg->scan_nodes = node_states[N_HIGH_MEMORY];
for_each_node_mask(nid, node_states[N_HIGH_MEMORY]) {
if (!test_mem_cgroup_node_reclaimable(memcg, nid, false))
node_clear(nid, memcg->scan_nodes);
atomic_set(&memcg->numainfo_events, 0);
atomic_set(&memcg->numainfo_updating, 0);
}
/*
* Selecting a node where we start reclaim from. Because what we need is just
* reducing usage counter, start from anywhere is O,K. Considering
* memory reclaim from current node, there are pros. and cons.
*
* Freeing memory from current node means freeing memory from a node which
* we'll use or we've used. So, it may make LRU bad. And if several threads
* hit limits, it will see a contention on a node. But freeing from remote
* node means more costs for memory reclaim because of memory latency.
*
* Now, we use round-robin. Better algorithm is welcomed.
*/
int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
mem_cgroup_may_update_nodemask(memcg);
node = memcg->last_scanned_node;
node = next_node(node, memcg->scan_nodes);
if (node == MAX_NUMNODES)
node = first_node(memcg->scan_nodes);
/*
* We call this when we hit limit, not when pages are added to LRU.
* No LRU may hold pages because all pages are UNEVICTABLE or
* memcg is too small and all pages are not on LRU. In that case,
* we use curret node.
*/
if (unlikely(node == MAX_NUMNODES))
node = numa_node_id();
return node;
}
/*
* Check all nodes whether it contains reclaimable pages or not.
* For quick scan, we make use of scan_nodes. This will allow us to skip
* unused nodes. But scan_nodes is lazily updated and may not cotain
* enough new information. We need to do double check.
*/
bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap)
{
int nid;
/*
* quick check...making use of scan_node.
* We can skip unused nodes.
*/
if (!nodes_empty(memcg->scan_nodes)) {
for (nid = first_node(memcg->scan_nodes);
nid = next_node(nid, memcg->scan_nodes)) {
if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap))
return true;
}
}
/*
* Check rest of nodes.
*/
for_each_node_state(nid, N_HIGH_MEMORY) {
if (node_isset(nid, memcg->scan_nodes))
if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap))
return true;
}
return false;
}
int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
{
return 0;
}
bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap)
return test_mem_cgroup_node_reclaimable(memcg, 0, noswap);
/*
* Scan the hierarchy if needed to reclaim memory. We remember the last child
* we reclaimed from, so that we don't end up penalizing one child extensively
* based on its position in the children list.
* root_memcg is the original ancestor that we've been reclaim from.
* We give up and return to the caller when we visit root_memcg twice.
* (other groups can be removed while we're walking....)
*
* If shrink==true, for avoiding to free too much, this returns immedieately.
static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_memcg,
unsigned long reclaim_options,
unsigned long *total_scanned)
struct mem_cgroup *victim;
int ret, total = 0;
int loop = 0;
bool noswap = reclaim_options & MEM_CGROUP_RECLAIM_NOSWAP;
bool shrink = reclaim_options & MEM_CGROUP_RECLAIM_SHRINK;
bool check_soft = reclaim_options & MEM_CGROUP_RECLAIM_SOFT;
unsigned long excess;
unsigned long nr_scanned;
excess = res_counter_soft_limit_excess(&root_memcg->res) >> PAGE_SHIFT;
/* If memsw_is_minimum==1, swap-out is of-no-use. */
if (!check_soft && !shrink && root_memcg->memsw_is_minimum)
noswap = true;
victim = mem_cgroup_select_victim(root_memcg);
if (victim == root_memcg) {
/*
* We are not draining per cpu cached charges during
* soft limit reclaim because global reclaim doesn't
* care about charges. It tries to free some memory and
* charges will not give any.
*/
if (!check_soft && loop >= 1)
drain_all_stock_async(root_memcg);
if (loop >= 2) {
/*
* If we have not been able to reclaim
* anything, it might because there are
* no reclaimable pages under this hierarchy
*/
if (!check_soft || !total) {
css_put(&victim->css);
break;
}
/*
* excess >> 2 is not to excessive so as to
* reclaim too much, nor too less that we keep
* coming back to reclaim from this cgroup
*/
if (total >= (excess >> 2) ||
(loop > MEM_CGROUP_MAX_RECLAIM_LOOPS)) {
css_put(&victim->css);
break;
}
}
}
if (!mem_cgroup_reclaimable(victim, noswap)) {
/* this cgroup's local usage == 0 */
css_put(&victim->css);
if (check_soft) {
ret = mem_cgroup_shrink_node_zone(victim, gfp_mask,
noswap, zone, &nr_scanned);
*total_scanned += nr_scanned;
ret = try_to_free_mem_cgroup_pages(victim, gfp_mask,
/*
* At shrinking usage, we can't check we should stop here or
* reclaim more. It's depends on callers. last_scanned_child
* will work enough for keeping fairness under tree.
*/
if (shrink)
return ret;
if (!res_counter_soft_limit_excess(&root_memcg->res))
} else if (mem_cgroup_margin(root_memcg))
/*
* Check OOM-Killer is already running under our hierarchy.
* If someone is running, return false.
* Has to be called with memcg_oom_lock
static bool mem_cgroup_oom_lock(struct mem_cgroup *memcg)
struct mem_cgroup *iter, *failed = NULL;
bool cond = true;
for_each_mem_cgroup_tree_cond(iter, memcg, cond) {
/*
* this subtree of our hierarchy is already locked
* so we cannot give a lock.
*/
failed = iter;
cond = false;
} else
iter->oom_lock = true;
/*
* OK, we failed to lock the whole subtree so we have to clean up
* what we set up to the failing subtree
*/
cond = true;
for_each_mem_cgroup_tree_cond(iter, memcg, cond) {
if (iter == failed) {
cond = false;
continue;
}
iter->oom_lock = false;
}
* Has to be called with memcg_oom_lock
static int mem_cgroup_oom_unlock(struct mem_cgroup *memcg)
for_each_mem_cgroup_tree(iter, memcg)
iter->oom_lock = false;
return 0;
}
static void mem_cgroup_mark_under_oom(struct mem_cgroup *memcg)
{
struct mem_cgroup *iter;
for_each_mem_cgroup_tree(iter, memcg)
atomic_inc(&iter->under_oom);
}
static void mem_cgroup_unmark_under_oom(struct mem_cgroup *memcg)
{
struct mem_cgroup *iter;
/*
* When a new child is created while the hierarchy is under oom,
* mem_cgroup_oom_lock() may not be called. We have to use
* atomic_add_unless() here.
*/
for_each_mem_cgroup_tree(iter, memcg)
atomic_add_unless(&iter->under_oom, -1, 0);
static DEFINE_SPINLOCK(memcg_oom_lock);
static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq);
struct oom_wait_info {
struct mem_cgroup *mem;
wait_queue_t wait;
};
static int memcg_oom_wake_function(wait_queue_t *wait,
unsigned mode, int sync, void *arg)
{
struct mem_cgroup *wake_memcg = (struct mem_cgroup *)arg,
*oom_wait_memcg;
struct oom_wait_info *oom_wait_info;
oom_wait_info = container_of(wait, struct oom_wait_info, wait);
oom_wait_memcg = oom_wait_info->mem;
/*
* Both of oom_wait_info->mem and wake_mem are stable under us.
* Then we can use css_is_ancestor without taking care of RCU.
*/
if (!mem_cgroup_same_or_subtree(oom_wait_memcg, wake_memcg)
&& !mem_cgroup_same_or_subtree(wake_memcg, oom_wait_memcg))
return 0;
return autoremove_wake_function(wait, mode, sync, arg);
}
static void memcg_wakeup_oom(struct mem_cgroup *memcg)
/* for filtering, pass "memcg" as argument. */
__wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg);
static void memcg_oom_recover(struct mem_cgroup *memcg)
if (memcg && atomic_read(&memcg->under_oom))
memcg_wakeup_oom(memcg);
/*
* try to call OOM killer. returns false if we should exit memory-reclaim loop.
*/
bool mem_cgroup_handle_oom(struct mem_cgroup *memcg, gfp_t mask)
owait.wait.flags = 0;
owait.wait.func = memcg_oom_wake_function;
owait.wait.private = current;
INIT_LIST_HEAD(&owait.wait.task_list);
mem_cgroup_mark_under_oom(memcg);
/* At first, try to OOM lock hierarchy under memcg.*/
locked = mem_cgroup_oom_lock(memcg);
/*
* Even if signal_pending(), we can't quit charge() loop without
* accounting. So, UNINTERRUPTIBLE is appropriate. But SIGKILL
* under OOM is always welcomed, use TASK_KILLABLE here.
*/
prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE);
if (!locked || memcg->oom_kill_disable)
need_to_kill = false;
if (locked)
if (need_to_kill) {
finish_wait(&memcg_oom_waitq, &owait.wait);
mem_cgroup_out_of_memory(memcg, mask);
finish_wait(&memcg_oom_waitq, &owait.wait);
mem_cgroup_oom_unlock(memcg);
memcg_wakeup_oom(memcg);
mem_cgroup_unmark_under_oom(memcg);
if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current))
return false;
/* Give chance to dying process */
schedule_timeout_uninterruptible(1);
/*
* Currently used to update mapped file statistics, but the routine can be
* generalized to update other statistics as well.

KAMEZAWA Hiroyuki
committed
*
* Notes: Race condition
*
* We usually use page_cgroup_lock() for accessing page_cgroup member but
* it tends to be costly. But considering some conditions, we doesn't need
* to do so _always_.
*
* Considering "charge", lock_page_cgroup() is not required because all
* file-stat operations happen after a page is attached to radix-tree. There
* are no race with "charge".
*
* Considering "uncharge", we know that memcg doesn't clear pc->mem_cgroup
* at "uncharge" intentionally. So, we always see valid pc->mem_cgroup even
* if there are race with "uncharge". Statistics itself is properly handled
* by flags.
*
* Considering "move", this is an only case we see a race. To make the race
* small, we check MEM_CGROUP_ON_MOVE percpu value and detect there are
* possibility of race condition. If there is, we take a lock.
void mem_cgroup_update_page_stat(struct page *page,
enum mem_cgroup_page_stat_item idx, int val)

KAMEZAWA Hiroyuki
committed
struct page_cgroup *pc = lookup_page_cgroup(page);
bool need_unlock = false;
unsigned long uninitialized_var(flags);
if (unlikely(!pc))
return;

KAMEZAWA Hiroyuki
committed
rcu_read_lock();
memcg = pc->mem_cgroup;
if (unlikely(!memcg || !PageCgroupUsed(pc)))

KAMEZAWA Hiroyuki
committed
goto out;
/* pc->mem_cgroup is unstable ? */
if (unlikely(mem_cgroup_stealed(memcg)) || PageTransHuge(page)) {

KAMEZAWA Hiroyuki
committed
/* take a lock against to access pc->mem_cgroup */
move_lock_page_cgroup(pc, &flags);

KAMEZAWA Hiroyuki
committed
need_unlock = true;
memcg = pc->mem_cgroup;
if (!memcg || !PageCgroupUsed(pc))

KAMEZAWA Hiroyuki
committed
goto out;
}
case MEMCG_NR_FILE_MAPPED:
if (val > 0)
SetPageCgroupFileMapped(pc);
else if (!page_mapped(page))
ClearPageCgroupFileMapped(pc);
idx = MEM_CGROUP_STAT_FILE_MAPPED;
break;
default:
BUG();
this_cpu_add(memcg->stat->count[idx], val);

KAMEZAWA Hiroyuki
committed
out:
if (unlikely(need_unlock))
move_unlock_page_cgroup(pc, &flags);

KAMEZAWA Hiroyuki
committed
rcu_read_unlock();
return;
EXPORT_SYMBOL(mem_cgroup_update_page_stat);
/*
* size of first charge trial. "32" comes from vmscan.c's magic value.
* TODO: maybe necessary to use big numbers in big irons.
*/
#define CHARGE_BATCH 32U
struct memcg_stock_pcp {
struct mem_cgroup *cached; /* this never be root cgroup */
unsigned int nr_pages;
struct work_struct work;
unsigned long flags;
#define FLUSHING_CACHED_CHARGE (0)
};
static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock);
static DEFINE_MUTEX(percpu_charge_mutex);
* Try to consume stocked charge on this cpu. If success, one page is consumed
* from local stock and true is returned. If the stock is 0 or charges from a
* cgroup which is not current target, returns false. This stock will be
* refilled.
*/
static bool consume_stock(struct mem_cgroup *memcg)
{
struct memcg_stock_pcp *stock;
bool ret = true;