mz = mem_cgroup_zoneinfo(memcg, zone_to_nid(zone), zone_idx(zone));
return &mz->lruvec;
}
/*
* Following LRU functions are allowed to be used without PCG_LOCK.
* Operations are called by routine of global LRU independently from memcg.
* What we have to take care of here is validness of pc->mem_cgroup.
*
* Changes to pc->mem_cgroup happens when
* 1. charge
* 2. moving account
* In typical case, "charge" is done before add-to-lru. Exception is SwapCache.
* It is added to LRU before charge.
* If PCG_USED bit is not set, page_cgroup is not added to this private LRU.
* When moving account, the page is not on LRU. It's isolated.
*/
/**
* mem_cgroup_lru_add_list - account for adding an lru page and return lruvec
* @zone: zone of the page
* @page: the page
* @lru: current lru
*
* This function accounts for @page being added to @lru, and returns
* the lruvec for the given @zone and the memcg @page is charged to.
*
* The callsite is then responsible for physically linking the page to
* the returned lruvec->lists[@lru].
*/
struct lruvec *mem_cgroup_lru_add_list(struct zone *zone, struct page *page,
enum lru_list lru)
{
struct mem_cgroup_per_zone *mz;
struct mem_cgroup *memcg;
struct page_cgroup *pc;

if (mem_cgroup_disabled())
return &zone->lruvec;

pc = lookup_page_cgroup(page);
memcg = pc->mem_cgroup;
mz = page_cgroup_zoneinfo(memcg, page);
/* compound_order() is stabilized through lru_lock */
MEM_CGROUP_ZSTAT(mz, lru) += 1 << compound_order(page);
return &mz->lruvec;
}
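/*
* Illustrative caller sketch (hypothetical, not taken from this file): the
* callsite does the physical linking itself, roughly
*
*	lruvec = mem_cgroup_lru_add_list(zone, page, lru);
*	list_add(&page->lru, &lruvec->lists[lru]);
*
* with zone->lru_lock held, as the kernel-doc above describes.
*/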
/**
* mem_cgroup_lru_del_list - account for removing an lru page
* @page: the page
* @lru: target lru
*
* This function accounts for @page being removed from @lru.
*
* The callsite is then responsible for physically unlinking
* @page->lru.
*/
void mem_cgroup_lru_del_list(struct page *page, enum lru_list lru)
{
struct mem_cgroup *memcg;
struct mem_cgroup_per_zone *mz;
struct page_cgroup *pc;
if (mem_cgroup_disabled())
return;
pc = lookup_page_cgroup(page);
memcg = pc->mem_cgroup;
VM_BUG_ON(!memcg);
mz = page_cgroup_zoneinfo(memcg, page);
/* huge page split is done under lru_lock. so, we have no races. */
VM_BUG_ON(MEM_CGROUP_ZSTAT(mz, lru) < (1 << compound_order(page)));
MEM_CGROUP_ZSTAT(mz, lru) -= 1 << compound_order(page);
}
void mem_cgroup_lru_del(struct page *page)
{
mem_cgroup_lru_del_list(page, page_lru(page));
}
/**
* mem_cgroup_lru_move_lists - account for moving a page between lrus
* @zone: zone of the page
* @page: the page
* @from: current lru
* @to: target lru
*
* This function accounts for @page being moved between the lrus @from
* and @to, and returns the lruvec for the given @zone and the memcg
* @page is charged to.
*
* The callsite is then responsible for physically relinking
* @page->lru to the returned lruvec->lists[@to].
*/
struct lruvec *mem_cgroup_lru_move_lists(struct zone *zone,
struct page *page,
enum lru_list from,
enum lru_list to)
/* XXX: Optimize this, especially for @from == @to */
mem_cgroup_lru_del_list(page, from);
return mem_cgroup_lru_add_list(zone, page, to);
/*
* Checks whether the given memcg is the same as root_memcg or lies within
* root_memcg's hierarchy subtree.
*/
static bool mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg,
struct mem_cgroup *memcg)
{
if (root_memcg != memcg) {
return (root_memcg->use_hierarchy &&
css_is_ancestor(&memcg->css, &root_memcg->css));
}
return true;
}
int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *memcg)
{
int ret;
struct mem_cgroup *curr = NULL;
struct task_struct *p;

p = find_lock_task_mm(task);
if (p) {
curr = try_get_mem_cgroup_from_mm(p->mm);
task_unlock(p);
} else {
/*
* All threads may have already detached their mm's, but the oom
* killer still needs to detect if they have already been oom
* killed to prevent needlessly killing additional tasks.
*/
task_lock(task);
curr = mem_cgroup_from_task(task);
if (curr)
css_get(&curr->css);
task_unlock(task);
}
if (!curr)
return 0;
/*
* We should check use_hierarchy of "memcg", not "curr". Checking
* use_hierarchy of "curr" here would make this function return true if
* hierarchy is enabled in "curr" and "curr" is a child of "memcg" in the
* *cgroup* hierarchy, even if use_hierarchy is disabled in "memcg".
*/
ret = mem_cgroup_same_or_subtree(memcg, curr);
css_put(&curr->css);
return ret;
}
int mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg, struct zone *zone)
{
unsigned long inactive_ratio;
int nid = zone_to_nid(zone);
int zid = zone_idx(zone);
unsigned long inactive;
unsigned long active;
unsigned long gb;
inactive = mem_cgroup_zone_nr_lru_pages(memcg, nid, zid,
BIT(LRU_INACTIVE_ANON));
active = mem_cgroup_zone_nr_lru_pages(memcg, nid, zid,
BIT(LRU_ACTIVE_ANON));
gb = (inactive + active) >> (30 - PAGE_SHIFT);
if (gb)
inactive_ratio = int_sqrt(10 * gb);
else
inactive_ratio = 1;
return inactive * inactive_ratio < active;
}
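/*
* Illustrative example (numbers assumed, not from the source): with roughly
* 4GB of anon pages in this memcg/zone, gb = 4 and
* inactive_ratio = int_sqrt(10 * 4) = 6, so inactive anon is reported low
* while inactive * 6 < active, i.e. while the inactive list holds less than
* about 1/7 of the anon pages.
*/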
int mem_cgroup_inactive_file_is_low(struct mem_cgroup *memcg, struct zone *zone)
{
unsigned long active;
unsigned long inactive;
int zid = zone_idx(zone);
int nid = zone_to_nid(zone);
inactive = mem_cgroup_zone_nr_lru_pages(memcg, nid, zid,
BIT(LRU_INACTIVE_FILE));
active = mem_cgroup_zone_nr_lru_pages(memcg, nid, zid,
BIT(LRU_ACTIVE_FILE));
return (active > inactive);
}
struct zone_reclaim_stat *mem_cgroup_get_reclaim_stat(struct mem_cgroup *memcg,
struct zone *zone)
{
int nid = zone_to_nid(zone);
int zid = zone_idx(zone);
struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(memcg, nid, zid);
return &mz->reclaim_stat;
}
struct zone_reclaim_stat *
mem_cgroup_get_reclaim_stat_from_page(struct page *page)
{
struct page_cgroup *pc;
struct mem_cgroup_per_zone *mz;
if (mem_cgroup_disabled())
return NULL;
pc = lookup_page_cgroup(page);
if (!PageCgroupUsed(pc))
return NULL;
/* Ensure pc->mem_cgroup is visible after reading PCG_USED. */
smp_rmb();
mz = page_cgroup_zoneinfo(pc->mem_cgroup, page);
return &mz->reclaim_stat;
}
#define mem_cgroup_from_res_counter(counter, member) \
container_of(counter, struct mem_cgroup, member)
/**
* mem_cgroup_margin - calculate chargeable space of a memory cgroup
* @mem: the memory cgroup
*
* Returns the maximum amount of memory @mem can be charged with, in pages.
*/
static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg)
{
unsigned long long margin;
margin = res_counter_margin(&memcg->res);
if (do_swap_account)
margin = min(margin, res_counter_margin(&memcg->memsw));
return margin >> PAGE_SHIFT;
}
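/*
* Illustrative example (values assumed): with usage at 96MB of a 100MB "res"
* limit and 120MB of a 128MB "memsw" limit, the byte margins are 4MB and 8MB;
* the smaller one is returned, i.e. 4MB >> PAGE_SHIFT = 1024 pages with 4KB
* pages.
*/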
int mem_cgroup_swappiness(struct mem_cgroup *memcg)
{
struct cgroup *cgrp = memcg->css.cgroup;
/* root ? */
if (cgrp->parent == NULL)
return vm_swappiness;

return memcg->swappiness;
}
static void mem_cgroup_start_move(struct mem_cgroup *memcg)
{
int cpu;
get_online_cpus();
spin_lock(&memcg->pcp_counter_lock);
for_each_online_cpu(cpu)
per_cpu(memcg->stat->count[MEM_CGROUP_ON_MOVE], cpu) += 1;
memcg->nocpu_base.count[MEM_CGROUP_ON_MOVE] += 1;
spin_unlock(&memcg->pcp_counter_lock);
put_online_cpus();
synchronize_rcu();
}
static void mem_cgroup_end_move(struct mem_cgroup *memcg)
{
int cpu;
if (!memcg)
return;
get_online_cpus();
spin_lock(&memcg->pcp_counter_lock);
for_each_online_cpu(cpu)
per_cpu(memcg->stat->count[MEM_CGROUP_ON_MOVE], cpu) -= 1;
memcg->nocpu_base.count[MEM_CGROUP_ON_MOVE] -= 1;
spin_unlock(&memcg->pcp_counter_lock);
put_online_cpus();
}
/*
* 2 routines for checking "mem" is under move_account() or not.
*
* mem_cgroup_stealed() - checking whether a cgroup is mc.from or not. This is
* used for avoiding races in accounting. If true, pc->mem_cgroup may be
* overwritten.
*
* mem_cgroup_under_move() - checking whether a cgroup is mc.from or mc.to or
* under the hierarchy of a moving cgroup. This is used for waiting at high
* memory pressure caused by "move".
*/
static bool mem_cgroup_stealed(struct mem_cgroup *memcg)
{
VM_BUG_ON(!rcu_read_lock_held());
return this_cpu_read(memcg->stat->count[MEM_CGROUP_ON_MOVE]) > 0;
}
static bool mem_cgroup_under_move(struct mem_cgroup *memcg)
{
struct mem_cgroup *from;
struct mem_cgroup *to;
bool ret = false;
/*
* Unlike task_move routines, we access mc.to, mc.from not under
* mutual exclusion by cgroup_mutex. Here, we take spinlock instead.
*/
spin_lock(&mc.lock);
from = mc.from;
to = mc.to;
if (!from)
goto unlock;
ret = mem_cgroup_same_or_subtree(memcg, from)
|| mem_cgroup_same_or_subtree(memcg, to);
unlock:
spin_unlock(&mc.lock);
return ret;
}
static bool mem_cgroup_wait_acct_move(struct mem_cgroup *memcg)
{
if (mc.moving_task && current != mc.moving_task) {
if (mem_cgroup_under_move(memcg)) {
DEFINE_WAIT(wait);
prepare_to_wait(&mc.waitq, &wait, TASK_INTERRUPTIBLE);
/* moving charge context might have finished. */
if (mc.moving_task)
schedule();
finish_wait(&mc.waitq, &wait);
return true;
}
}
return false;
}
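/*
* Illustrative sequence (simplified): a charging task that hits its limit
* while "memcg" is mc.from or mc.to sleeps on mc.waitq here; once the move
* completes, the moving side wakes mc.waitq and the charger retries.
*/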
/**
* mem_cgroup_print_oom_info: Called from OOM with tasklist_lock held in read mode.
* @memcg: The memory cgroup that went over limit
* @p: Task that is going to be killed
*
* NOTE: @memcg and @p's mem_cgroup can be different when hierarchy is
* enabled
*/
void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
{
struct cgroup *task_cgrp;
struct cgroup *mem_cgrp;
/*
* Need a buffer in BSS, can't rely on allocations. The code relies
* on the assumption that OOM is serialized for memory controller.
* If this assumption is broken, revisit this code.
*/
static char memcg_name[PATH_MAX];
int ret;
if (!memcg || !p)
return;
rcu_read_lock();
mem_cgrp = memcg->css.cgroup;
task_cgrp = task_cgroup(p, mem_cgroup_subsys_id);
ret = cgroup_path(task_cgrp, memcg_name, PATH_MAX);
if (ret < 0) {
/*
* Unfortunately, we are unable to convert to a useful name,
* but we'll still print out the usage information.
*/
rcu_read_unlock();
goto done;
}
rcu_read_unlock();
printk(KERN_INFO "Task in %s killed", memcg_name);
rcu_read_lock();
ret = cgroup_path(mem_cgrp, memcg_name, PATH_MAX);
if (ret < 0) {
rcu_read_unlock();
goto done;
}
rcu_read_unlock();
/*
* Continues from above, so we don't need a KERN_ level
*/
printk(KERN_CONT " as a result of limit of %s\n", memcg_name);
done:
printk(KERN_INFO "memory: usage %llukB, limit %llukB, failcnt %llu\n",
res_counter_read_u64(&memcg->res, RES_USAGE) >> 10,
res_counter_read_u64(&memcg->res, RES_LIMIT) >> 10,
res_counter_read_u64(&memcg->res, RES_FAILCNT));
printk(KERN_INFO "memory+swap: usage %llukB, limit %llukB, "
"failcnt %llu\n",
res_counter_read_u64(&memcg->memsw, RES_USAGE) >> 10,
res_counter_read_u64(&memcg->memsw, RES_LIMIT) >> 10,
res_counter_read_u64(&memcg->memsw, RES_FAILCNT));
}
/*
* This function returns the number of memcg under hierarchy tree. Returns
* 1(self count) if no children.
*/
static int mem_cgroup_count_children(struct mem_cgroup *memcg)
{
int num = 0;
struct mem_cgroup *iter;

for_each_mem_cgroup_tree(iter, memcg)
num++;
return num;
}
/*
* Return the memory (and swap, if configured) limit for a memcg.
*/
u64 mem_cgroup_get_limit(struct mem_cgroup *memcg)
{
u64 limit;
u64 memsw;
limit = res_counter_read_u64(&memcg->res, RES_LIMIT);
limit += total_swap_pages << PAGE_SHIFT;
memsw = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
/*
* If memsw is finite and limits the amount of swap space available
* to this memcg, return that limit.
*/
return min(limit, memsw);
}
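/*
* Illustrative example (values assumed): with a 1GB memory limit, 2GB of
* total swap and a 1.5GB memsw limit, limit becomes 1GB + 2GB = 3GB while
* memsw is 1.5GB, so the stricter 1.5GB is returned as the OOM-relevant
* limit.
*/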
static unsigned long mem_cgroup_reclaim(struct mem_cgroup *memcg,
gfp_t gfp_mask,
unsigned long flags)
{
unsigned long total = 0;
bool noswap = false;
int loop;
if (flags & MEM_CGROUP_RECLAIM_NOSWAP)
noswap = true;
if (!(flags & MEM_CGROUP_RECLAIM_SHRINK) && memcg->memsw_is_minimum)
noswap = true;
for (loop = 0; loop < MEM_CGROUP_MAX_RECLAIM_LOOPS; loop++) {
if (loop)
drain_all_stock_async(memcg);
total += try_to_free_mem_cgroup_pages(memcg, gfp_mask, noswap);
/*
* Allow limit shrinkers, which are triggered directly
* by userspace, to catch signals and stop reclaim
* after minimal progress, regardless of the margin.
*/
if (total && (flags & MEM_CGROUP_RECLAIM_SHRINK))
break;
if (mem_cgroup_margin(memcg))
break;
/*
* If nothing was reclaimed after two attempts, there
* may be no reclaimable pages in this hierarchy.
*/
if (loop && !total)
break;
}
return total;
}
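/*
* Illustrative example (flag combination assumed): a limit shrinker such as
* the write path of memory.limit_in_bytes passes MEM_CGROUP_RECLAIM_SHRINK,
* so the loop above stops as soon as any progress is made, while ordinary
* charge-path reclaim keeps looping until the margin check or the
* two-attempts-without-progress check fires.
*/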
/**
* test_mem_cgroup_node_reclaimable
* @mem: the target memcg
* @nid: the node ID to be checked.
* @noswap : specify true here if the user wants file only information.
*
* This function returns whether the specified memcg contains any
* reclaimable pages on a node. Returns true if there are any reclaimable
* pages in the node.
*/
static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *memcg,
int nid, bool noswap)
{
if (mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL_FILE))
return true;
if (noswap || !total_swap_pages)
return false;
if (mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL_ANON))
return true;
return false;
}
#if MAX_NUMNODES > 1
/*
* Always updating the nodemask is not very good - even if we have an empty
* list or the wrong list here, we can start from some node and traverse all
* nodes based on the zonelist. So update the list loosely once per 10 secs.
*
*/
static void mem_cgroup_may_update_nodemask(struct mem_cgroup *memcg)
{
int nid;
/*
* numainfo_events > 0 means there was at least NUMAINFO_EVENTS_TARGET
* pagein/pageout changes since the last update.
*/
if (!atomic_read(&memcg->numainfo_events))
return;
if (atomic_inc_return(&memcg->numainfo_updating) > 1)
return;
/* make a nodemask where this memcg uses memory from */
memcg->scan_nodes = node_states[N_HIGH_MEMORY];
for_each_node_mask(nid, node_states[N_HIGH_MEMORY]) {
if (!test_mem_cgroup_node_reclaimable(memcg, nid, false))
node_clear(nid, memcg->scan_nodes);
}

atomic_set(&memcg->numainfo_events, 0);
atomic_set(&memcg->numainfo_updating, 0);
}
/*
* Selecting a node where we start reclaim from. Because what we need is just
* reducing the usage counter, starting from anywhere is OK. Considering
* memory reclaim from current node, there are pros. and cons.
*
* Freeing memory from current node means freeing memory from a node which
* we'll use or we've used. So, it may make LRU bad. And if several threads
* hit limits, it will see a contention on a node. But freeing from remote
* node means more costs for memory reclaim because of memory latency.
*
* Now, we use round-robin. A better algorithm is welcome.
*/
int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
{
int node;

mem_cgroup_may_update_nodemask(memcg);
node = memcg->last_scanned_node;
node = next_node(node, memcg->scan_nodes);
if (node == MAX_NUMNODES)
node = first_node(memcg->scan_nodes);
/*
* We call this when we hit limit, not when pages are added to LRU.
* No LRU may hold pages because all pages are UNEVICTABLE or
* memcg is too small and all pages are not on LRU. In that case,
* we use the current node.
*/
if (unlikely(node == MAX_NUMNODES))
node = numa_node_id();

memcg->last_scanned_node = node;
return node;
}
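/*
* Illustrative example (nodemask assumed): with scan_nodes = {0,2} and
* last_scanned_node = 0, next_node() yields node 2; on the following call
* next_node(2, ...) returns MAX_NUMNODES, so the code wraps via first_node()
* back to node 0.
*/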
/*
* Check all nodes whether they contain reclaimable pages or not.
* For quick scan, we make use of scan_nodes. This will allow us to skip
* unused nodes. But scan_nodes is lazily updated and may not contain
* enough new information. We need to do double check.
*/
bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap)
{
int nid;
/*
* quick check...making use of scan_node.
* We can skip unused nodes.
*/
if (!nodes_empty(memcg->scan_nodes)) {
for (nid = first_node(memcg->scan_nodes);
nid < MAX_NUMNODES;
nid = next_node(nid, memcg->scan_nodes)) {
if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap))
return true;
}
}
/*
* Check rest of nodes.
*/
for_each_node_state(nid, N_HIGH_MEMORY) {
if (node_isset(nid, memcg->scan_nodes))
continue;
if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap))
return true;
}
return false;
}
#else
int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
{
return 0;
}
bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap)
{
return test_mem_cgroup_node_reclaimable(memcg, 0, noswap);
}
#endif
static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg,
struct zone *zone,
gfp_t gfp_mask,
unsigned long *total_scanned)
{
struct mem_cgroup *victim = NULL;
int total = 0;
int loop = 0;
unsigned long excess;
unsigned long nr_scanned;
struct mem_cgroup_reclaim_cookie reclaim = {
.zone = zone,
.priority = 0,
};

excess = res_counter_soft_limit_excess(&root_memcg->res) >> PAGE_SHIFT;

while (1) {
victim = mem_cgroup_iter(root_memcg, victim, &reclaim);
if (!victim) {
loop++;
if (loop >= 2) {
/*
 * If we have not been able to reclaim
 * anything, it might be because there are
 * no reclaimable pages under this hierarchy.
 */
if (!total)
break;
/*
 * We want to do more targeted reclaim.
 * excess >> 2 is not too excessive, so we don't
 * reclaim too much, nor too little so that we
 * keep coming back to reclaim from this cgroup.
 */
if (total >= (excess >> 2) ||
(loop > MEM_CGROUP_MAX_RECLAIM_LOOPS))
break;
}
continue;
}
if (!mem_cgroup_reclaimable(victim, false))
continue;
total += mem_cgroup_shrink_node_zone(victim, gfp_mask, false,
zone, &nr_scanned);
*total_scanned += nr_scanned;
if (!res_counter_soft_limit_excess(&root_memcg->res))
break;
}
mem_cgroup_iter_break(root_memcg, victim);
return total;
}
/*
* Check OOM-Killer is already running under our hierarchy.
* If someone is running, return false.
* Has to be called with memcg_oom_lock
*/
static bool mem_cgroup_oom_lock(struct mem_cgroup *memcg)
{
struct mem_cgroup *iter, *failed = NULL;
for_each_mem_cgroup_tree(iter, memcg) {
if (iter->oom_lock) {
/*
* this subtree of our hierarchy is already locked
* so we cannot give a lock.
*/
failed = iter;
mem_cgroup_iter_break(memcg, iter);
break;
} else
iter->oom_lock = true;
}

if (!failed)
return true;

/*
* OK, we failed to lock the whole subtree so we have to clean up
* what we set up to the failing subtree
*/
for_each_mem_cgroup_tree(iter, memcg) {
if (iter == failed) {
mem_cgroup_iter_break(memcg, iter);
break;
}
iter->oom_lock = false;
}
return false;
}

/*
* Has to be called with memcg_oom_lock
*/
static int mem_cgroup_oom_unlock(struct mem_cgroup *memcg)
{
struct mem_cgroup *iter;

for_each_mem_cgroup_tree(iter, memcg)
iter->oom_lock = false;
return 0;
}
static void mem_cgroup_mark_under_oom(struct mem_cgroup *memcg)
{
struct mem_cgroup *iter;
for_each_mem_cgroup_tree(iter, memcg)
atomic_inc(&iter->under_oom);
}
static void mem_cgroup_unmark_under_oom(struct mem_cgroup *memcg)
{
struct mem_cgroup *iter;
/*
* When a new child is created while the hierarchy is under oom,
* mem_cgroup_oom_lock() may not be called. We have to use
* atomic_add_unless() here.
*/
for_each_mem_cgroup_tree(iter, memcg)
atomic_add_unless(&iter->under_oom, -1, 0);
}
static DEFINE_SPINLOCK(memcg_oom_lock);
static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq);
struct oom_wait_info {
struct mem_cgroup *mem;
wait_queue_t wait;
};
static int memcg_oom_wake_function(wait_queue_t *wait,
unsigned mode, int sync, void *arg)
{
struct mem_cgroup *wake_memcg = (struct mem_cgroup *)arg,
*oom_wait_memcg;
struct oom_wait_info *oom_wait_info;
oom_wait_info = container_of(wait, struct oom_wait_info, wait);
oom_wait_memcg = oom_wait_info->mem;
/*
* Both of oom_wait_info->mem and wake_memcg are stable under us.
* Then we can use css_is_ancestor without taking care of RCU.
*/
if (!mem_cgroup_same_or_subtree(oom_wait_memcg, wake_memcg)
&& !mem_cgroup_same_or_subtree(wake_memcg, oom_wait_memcg))
return 0;
return autoremove_wake_function(wait, mode, sync, arg);
}
static void memcg_wakeup_oom(struct mem_cgroup *memcg)
{
/* for filtering, pass "memcg" as argument. */
__wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg);
}
static void memcg_oom_recover(struct mem_cgroup *memcg)
{
if (memcg && atomic_read(&memcg->under_oom))
memcg_wakeup_oom(memcg);
}
/*
* try to call OOM killer. returns false if we should exit memory-reclaim loop.
*/
bool mem_cgroup_handle_oom(struct mem_cgroup *memcg, gfp_t mask)
{
struct oom_wait_info owait;
bool locked, need_to_kill;

owait.mem = memcg;
owait.wait.flags = 0;
owait.wait.func = memcg_oom_wake_function;
owait.wait.private = current;
INIT_LIST_HEAD(&owait.wait.task_list);
need_to_kill = true;
mem_cgroup_mark_under_oom(memcg);
/* At first, try to OOM lock hierarchy under memcg.*/
locked = mem_cgroup_oom_lock(memcg);
/*
* Even if signal_pending(), we can't quit charge() loop without
* accounting. So, UNINTERRUPTIBLE is appropriate. But SIGKILL
* under OOM is always welcomed, use TASK_KILLABLE here.
*/
prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE);
if (!locked || memcg->oom_kill_disable)
need_to_kill = false;
if (locked)
mem_cgroup_oom_notify(memcg);

if (need_to_kill) {
finish_wait(&memcg_oom_waitq, &owait.wait);
mem_cgroup_out_of_memory(memcg, mask);
} else {
schedule();
finish_wait(&memcg_oom_waitq, &owait.wait);
}
if (locked)
mem_cgroup_oom_unlock(memcg);
memcg_wakeup_oom(memcg);
mem_cgroup_unmark_under_oom(memcg);
if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current))
return false;
/* Give chance to dying process */
schedule_timeout_uninterruptible(1);
return true;
}
/*
* Currently used to update mapped file statistics, but the routine can be
* generalized to update other statistics as well.
*
* Notes: Race condition
*
* We usually use page_cgroup_lock() for accessing page_cgroup member but
* it tends to be costly. But considering some conditions, we don't need
* to do so _always_.
*
* Considering "charge", lock_page_cgroup() is not required because all
* file-stat operations happen after a page is attached to radix-tree. There
* is no race with "charge".
*
* Considering "uncharge", we know that memcg doesn't clear pc->mem_cgroup
* at "uncharge" intentionally. So, we always see valid pc->mem_cgroup even
* if there is a race with "uncharge". Statistics itself is properly handled
* by flags.
*
* Considering "move", this is the only case where we see a race. To make the
* race window small, we check the MEM_CGROUP_ON_MOVE percpu value and detect
* whether there is a possibility of a race. If there is, we take a lock.
*/
void mem_cgroup_update_page_stat(struct page *page,
enum mem_cgroup_page_stat_item idx, int val)
{
struct mem_cgroup *memcg;
struct page_cgroup *pc = lookup_page_cgroup(page);
bool need_unlock = false;
unsigned long uninitialized_var(flags);
if (mem_cgroup_disabled())
return;
rcu_read_lock();
memcg = pc->mem_cgroup;
if (unlikely(!memcg || !PageCgroupUsed(pc)))
goto out;
/* pc->mem_cgroup is unstable ? */
if (unlikely(mem_cgroup_stealed(memcg)) || PageTransHuge(page)) {
/* take a lock to access pc->mem_cgroup */
move_lock_page_cgroup(pc, &flags);
need_unlock = true;
memcg = pc->mem_cgroup;
if (!memcg || !PageCgroupUsed(pc))
goto out;
}

switch (idx) {
case MEMCG_NR_FILE_MAPPED:
if (val > 0)
SetPageCgroupFileMapped(pc);
else if (!page_mapped(page))
ClearPageCgroupFileMapped(pc);
idx = MEM_CGROUP_STAT_FILE_MAPPED;
break;
default:
BUG();
}

this_cpu_add(memcg->stat->count[idx], val);
out:
if (unlikely(need_unlock))
move_unlock_page_cgroup(pc, &flags);
rcu_read_unlock();
return;
}
EXPORT_SYMBOL(mem_cgroup_update_page_stat);
/*
* size of first charge trial. "32" comes from vmscan.c's magic value.
* TODO: maybe necessary to use big numbers in big irons.
*/
#define CHARGE_BATCH 32U
struct memcg_stock_pcp {
struct mem_cgroup *cached; /* this is never the root cgroup */
unsigned int nr_pages;
struct work_struct work;
unsigned long flags;
#define FLUSHING_CACHED_CHARGE (0)
};
static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock);
static DEFINE_MUTEX(percpu_charge_mutex);
/*
* Try to consume stocked charge on this cpu. If success, one page is consumed
* from local stock and true is returned. If the stock is 0 or charges from a
* cgroup which is not current target, returns false. This stock will be
* refilled.
*/
static bool consume_stock(struct mem_cgroup *memcg)
{
struct memcg_stock_pcp *stock;
bool ret = true;
stock = &get_cpu_var(memcg_stock);
if (memcg == stock->cached && stock->nr_pages)
stock->nr_pages--;
else /* need to call res_counter_charge */
ret = false;
put_cpu_var(memcg_stock);
return ret;
}
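/*
* Illustrative flow (simplified sketch of how the charge path is expected to
* use the stock): a single-page charge grabs CHARGE_BATCH pages from the
* res_counter at once, keeps one and hands the remaining 31 to refill_stock();
* later single-page charges on this cpu for the same memcg are then satisfied
* by consume_stock() without touching the res_counter, until drain_stock()
* returns the leftover.
*/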
/*
* Returns stocks cached in percpu to res_counter and reset cached information.
*/
static void drain_stock(struct memcg_stock_pcp *stock)
{
struct mem_cgroup *old = stock->cached;
if (stock->nr_pages) {
unsigned long bytes = stock->nr_pages * PAGE_SIZE;
res_counter_uncharge(&old->res, bytes);
res_counter_uncharge(&old->memsw, bytes);
stock->nr_pages = 0;
}
stock->cached = NULL;
}
/*
* This must be called under preempt disabled or must be called by
* a thread which is pinned to local cpu.
*/
static void drain_local_stock(struct work_struct *dummy)
{
struct memcg_stock_pcp *stock = &__get_cpu_var(memcg_stock);
drain_stock(stock);
clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags);
}
/*
* Cache charges(val) which is from res_counter, to local per_cpu area.
* This will be consumed by consume_stock() function, later.
*/
static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
{
struct memcg_stock_pcp *stock = &get_cpu_var(memcg_stock);
if (stock->cached != memcg) { /* reset if necessary */
drain_stock(stock);
stock->cached = memcg;
}
stock->nr_pages += nr_pages;
put_cpu_var(memcg_stock);
}
/*
* Drains all per-CPU charge caches for given root_memcg resp. subtree
* of the hierarchy under it. sync flag says whether we should block
* until the work is done.
*/
static void drain_all_stock(struct mem_cgroup *root_memcg, bool sync)
{
int cpu, curcpu;
/* Notify other cpus that system-wide "drain" is running */
get_online_cpus();
curcpu = get_cpu();
for_each_online_cpu(cpu) {
struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
struct mem_cgroup *memcg;

memcg = stock->cached;
if (!memcg || !stock->nr_pages)
continue;
if (!mem_cgroup_same_or_subtree(root_memcg, memcg))