Newer
Older
int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem)
{
int ret;
struct task_struct *p;
p = find_lock_task_mm(task);
if (!p)
return 0;
curr = try_get_mem_cgroup_from_mm(p->mm);
task_unlock(p);
/*
* We should check use_hierarchy of "mem" not "curr". Because checking
* use_hierarchy of "curr" here make this function true if hierarchy is
* enabled in "curr" and "curr" is a child of "mem" in *cgroup*
* hierarchy(even if use_hierarchy is disabled in "mem").
*/
if (mem->use_hierarchy)
ret = css_is_ancestor(&curr->css, &mem->css);
else
ret = (curr == mem);
css_put(&curr->css);
return ret;
}
static int calc_inactive_ratio(struct mem_cgroup *memcg, unsigned long *present_pages)
{
unsigned long active;
unsigned long inactive;
unsigned long gb;
unsigned long inactive_ratio;
inactive = mem_cgroup_get_local_zonestat(memcg, LRU_INACTIVE_ANON);
active = mem_cgroup_get_local_zonestat(memcg, LRU_ACTIVE_ANON);
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
gb = (inactive + active) >> (30 - PAGE_SHIFT);
if (gb)
inactive_ratio = int_sqrt(10 * gb);
else
inactive_ratio = 1;
if (present_pages) {
present_pages[0] = inactive;
present_pages[1] = active;
}
return inactive_ratio;
}
int mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg)
{
unsigned long active;
unsigned long inactive;
unsigned long present_pages[2];
unsigned long inactive_ratio;
inactive_ratio = calc_inactive_ratio(memcg, present_pages);
inactive = present_pages[0];
active = present_pages[1];
if (inactive * inactive_ratio < active)
return 1;
return 0;
}
int mem_cgroup_inactive_file_is_low(struct mem_cgroup *memcg)
{
unsigned long active;
unsigned long inactive;
inactive = mem_cgroup_get_local_zonestat(memcg, LRU_INACTIVE_FILE);
active = mem_cgroup_get_local_zonestat(memcg, LRU_ACTIVE_FILE);
return (active > inactive);
}
unsigned long mem_cgroup_zone_nr_pages(struct mem_cgroup *memcg,
struct zone *zone,
enum lru_list lru)
{
int nid = zone_to_nid(zone);
int zid = zone_idx(zone);
struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(memcg, nid, zid);
return MEM_CGROUP_ZSTAT(mz, lru);
}
struct zone_reclaim_stat *mem_cgroup_get_reclaim_stat(struct mem_cgroup *memcg,
struct zone *zone)
{
int nid = zone_to_nid(zone);
int zid = zone_idx(zone);
struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(memcg, nid, zid);
return &mz->reclaim_stat;
}
struct zone_reclaim_stat *
mem_cgroup_get_reclaim_stat_from_page(struct page *page)
{
struct page_cgroup *pc;
struct mem_cgroup_per_zone *mz;
if (mem_cgroup_disabled())
return NULL;
pc = lookup_page_cgroup(page);
if (!PageCgroupUsed(pc))
return NULL;
/* Ensure pc->mem_cgroup is visible after reading PCG_USED. */
smp_rmb();
mz = page_cgroup_zoneinfo(pc->mem_cgroup, page);
unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
struct list_head *dst,
unsigned long *scanned, int order,
int mode, struct zone *z,
struct mem_cgroup *mem_cont,
{
unsigned long nr_taken = 0;
struct page *page;
unsigned long scan;
LIST_HEAD(pc_list);
struct list_head *src;

KAMEZAWA Hiroyuki
committed
struct page_cgroup *pc, *tmp;
int nid = zone_to_nid(z);

KAMEZAWA Hiroyuki
committed
int zid = zone_idx(z);
struct mem_cgroup_per_zone *mz;
int lru = LRU_FILE * file + active;

KAMEZAWA Hiroyuki
committed
mz = mem_cgroup_zoneinfo(mem_cont, nid, zid);
src = &mz->lists[lru];

KAMEZAWA Hiroyuki
committed
scan = 0;
list_for_each_entry_safe_reverse(pc, tmp, src, lru) {

KAMEZAWA Hiroyuki
committed
break;
if (unlikely(!PageCgroupUsed(pc)))
continue;
page = lookup_cgroup_page(pc);

KAMEZAWA Hiroyuki
committed
continue;
ret = __isolate_lru_page(page, mode, file);
switch (ret) {
case 0:
list_move(&page->lru, dst);
nr_taken += hpage_nr_pages(page);
break;
case -EBUSY:
/* we don't affect global LRU but rotate in our LRU */
mem_cgroup_rotate_lru_list(page, page_lru(page));
break;
default:
break;
}
}
*scanned = scan;
trace_mm_vmscan_memcg_isolate(0, nr_to_scan, scan, nr_taken,
0, 0, 0, mode);
return nr_taken;
}
#define mem_cgroup_from_res_counter(counter, member) \
container_of(counter, struct mem_cgroup, member)
/**
* mem_cgroup_margin - calculate chargeable space of a memory cgroup
* @mem: the memory cgroup
*
* Returns the maximum amount of memory @mem can be charged with, in
*/
static unsigned long mem_cgroup_margin(struct mem_cgroup *mem)
{
unsigned long long margin;
margin = res_counter_margin(&mem->res);
if (do_swap_account)
margin = min(margin, res_counter_margin(&mem->memsw));
return margin >> PAGE_SHIFT;
}
static unsigned int get_swappiness(struct mem_cgroup *memcg)
{
struct cgroup *cgrp = memcg->css.cgroup;
/* root ? */
if (cgrp->parent == NULL)
return vm_swappiness;

KAMEZAWA Hiroyuki
committed
static void mem_cgroup_start_move(struct mem_cgroup *mem)
{
int cpu;
get_online_cpus();
spin_lock(&mem->pcp_counter_lock);
for_each_online_cpu(cpu)

KAMEZAWA Hiroyuki
committed
per_cpu(mem->stat->count[MEM_CGROUP_ON_MOVE], cpu) += 1;
mem->nocpu_base.count[MEM_CGROUP_ON_MOVE] += 1;
spin_unlock(&mem->pcp_counter_lock);
put_online_cpus();

KAMEZAWA Hiroyuki
committed
synchronize_rcu();
}
static void mem_cgroup_end_move(struct mem_cgroup *mem)
{
int cpu;
if (!mem)
return;
get_online_cpus();
spin_lock(&mem->pcp_counter_lock);
for_each_online_cpu(cpu)

KAMEZAWA Hiroyuki
committed
per_cpu(mem->stat->count[MEM_CGROUP_ON_MOVE], cpu) -= 1;
mem->nocpu_base.count[MEM_CGROUP_ON_MOVE] -= 1;
spin_unlock(&mem->pcp_counter_lock);
put_online_cpus();

KAMEZAWA Hiroyuki
committed
}
/*
* 2 routines for checking "mem" is under move_account() or not.
*
* mem_cgroup_stealed() - checking a cgroup is mc.from or not. This is used
* for avoiding race in accounting. If true,
* pc->mem_cgroup may be overwritten.
*
* mem_cgroup_under_move() - checking a cgroup is mc.from or mc.to or
* under hierarchy of moving cgroups. This is for
* waiting at hith-memory prressure caused by "move".
*/
static bool mem_cgroup_stealed(struct mem_cgroup *mem)
{
VM_BUG_ON(!rcu_read_lock_held());
return this_cpu_read(mem->stat->count[MEM_CGROUP_ON_MOVE]) > 0;
}
static bool mem_cgroup_under_move(struct mem_cgroup *mem)
{
struct mem_cgroup *from;
struct mem_cgroup *to;
/*
* Unlike task_move routines, we access mc.to, mc.from not under
* mutual exclusion by cgroup_mutex. Here, we take spinlock instead.
*/
spin_lock(&mc.lock);
from = mc.from;
to = mc.to;
if (!from)
goto unlock;
if (from == mem || to == mem
|| (mem->use_hierarchy && css_is_ancestor(&from->css, &mem->css))
|| (mem->use_hierarchy && css_is_ancestor(&to->css, &mem->css)))
ret = true;
unlock:
spin_unlock(&mc.lock);
return ret;
}
static bool mem_cgroup_wait_acct_move(struct mem_cgroup *mem)
{
if (mc.moving_task && current != mc.moving_task) {
if (mem_cgroup_under_move(mem)) {
DEFINE_WAIT(wait);
prepare_to_wait(&mc.waitq, &wait, TASK_INTERRUPTIBLE);
/* moving charge context might have finished. */
if (mc.moving_task)
schedule();
finish_wait(&mc.waitq, &wait);
return true;
}
}
return false;
}
* mem_cgroup_print_oom_info: Called from OOM with tasklist_lock held in read mode.
* @memcg: The memory cgroup that went over limit
* @p: Task that is going to be killed
*
* NOTE: @memcg and @p's mem_cgroup can be different when hierarchy is
* enabled
*/
void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
{
struct cgroup *task_cgrp;
struct cgroup *mem_cgrp;
/*
* Need a buffer in BSS, can't rely on allocations. The code relies
* on the assumption that OOM is serialized for memory controller.
* If this assumption is broken, revisit this code.
*/
static char memcg_name[PATH_MAX];
int ret;
if (!memcg || !p)
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
return;
rcu_read_lock();
mem_cgrp = memcg->css.cgroup;
task_cgrp = task_cgroup(p, mem_cgroup_subsys_id);
ret = cgroup_path(task_cgrp, memcg_name, PATH_MAX);
if (ret < 0) {
/*
* Unfortunately, we are unable to convert to a useful name
* But we'll still print out the usage information
*/
rcu_read_unlock();
goto done;
}
rcu_read_unlock();
printk(KERN_INFO "Task in %s killed", memcg_name);
rcu_read_lock();
ret = cgroup_path(mem_cgrp, memcg_name, PATH_MAX);
if (ret < 0) {
rcu_read_unlock();
goto done;
}
rcu_read_unlock();
/*
* Continues from above, so we don't need an KERN_ level
*/
printk(KERN_CONT " as a result of limit of %s\n", memcg_name);
done:
printk(KERN_INFO "memory: usage %llukB, limit %llukB, failcnt %llu\n",
res_counter_read_u64(&memcg->res, RES_USAGE) >> 10,
res_counter_read_u64(&memcg->res, RES_LIMIT) >> 10,
res_counter_read_u64(&memcg->res, RES_FAILCNT));
printk(KERN_INFO "memory+swap: usage %llukB, limit %llukB, "
"failcnt %llu\n",
res_counter_read_u64(&memcg->memsw, RES_USAGE) >> 10,
res_counter_read_u64(&memcg->memsw, RES_LIMIT) >> 10,
res_counter_read_u64(&memcg->memsw, RES_FAILCNT));
}
/*
* This function returns the number of memcg under hierarchy tree. Returns
* 1(self count) if no children.
*/
static int mem_cgroup_count_children(struct mem_cgroup *mem)
{
int num = 0;
struct mem_cgroup *iter;
for_each_mem_cgroup_tree(iter, mem)
num++;
return num;
}
/*
* Return the memory (and swap, if configured) limit for a memcg.
*/
u64 mem_cgroup_get_limit(struct mem_cgroup *memcg)
{
u64 limit;
u64 memsw;
limit = res_counter_read_u64(&memcg->res, RES_LIMIT);
limit += total_swap_pages << PAGE_SHIFT;
memsw = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
/*
* If memsw is finite and limits the amount of swap space available
* to this memcg, return that limit.
*/
return min(limit, memsw);
}
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
* Visit the first child (need not be the first child as per the ordering
* of the cgroup list, since we track last_scanned_child) of @mem and use
* that to reclaim free pages from.
*/
static struct mem_cgroup *
mem_cgroup_select_victim(struct mem_cgroup *root_mem)
{
struct mem_cgroup *ret = NULL;
struct cgroup_subsys_state *css;
int nextid, found;
if (!root_mem->use_hierarchy) {
css_get(&root_mem->css);
ret = root_mem;
}
while (!ret) {
rcu_read_lock();
nextid = root_mem->last_scanned_child + 1;
css = css_get_next(&mem_cgroup_subsys, nextid, &root_mem->css,
&found);
if (css && css_tryget(css))
ret = container_of(css, struct mem_cgroup, css);
rcu_read_unlock();
/* Updates scanning parameter */
if (!css) {
/* this means start scan from ID:1 */
root_mem->last_scanned_child = 0;
} else
root_mem->last_scanned_child = found;
}
return ret;
}
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
#if MAX_NUMNODES > 1
/*
* Always updating the nodemask is not very good - even if we have an empty
* list or the wrong list here, we can start from some node and traverse all
* nodes based on the zonelist. So update the list loosely once per 10 secs.
*
*/
static void mem_cgroup_may_update_nodemask(struct mem_cgroup *mem)
{
int nid;
if (time_after(mem->next_scan_node_update, jiffies))
return;
mem->next_scan_node_update = jiffies + 10*HZ;
/* make a nodemask where this memcg uses memory from */
mem->scan_nodes = node_states[N_HIGH_MEMORY];
for_each_node_mask(nid, node_states[N_HIGH_MEMORY]) {
if (mem_cgroup_get_zonestat_node(mem, nid, LRU_INACTIVE_FILE) ||
mem_cgroup_get_zonestat_node(mem, nid, LRU_ACTIVE_FILE))
continue;
if (total_swap_pages &&
(mem_cgroup_get_zonestat_node(mem, nid, LRU_INACTIVE_ANON) ||
mem_cgroup_get_zonestat_node(mem, nid, LRU_ACTIVE_ANON)))
continue;
node_clear(nid, mem->scan_nodes);
}
}
/*
* Selecting a node where we start reclaim from. Because what we need is just
* reducing usage counter, start from anywhere is O,K. Considering
* memory reclaim from current node, there are pros. and cons.
*
* Freeing memory from current node means freeing memory from a node which
* we'll use or we've used. So, it may make LRU bad. And if several threads
* hit limits, it will see a contention on a node. But freeing from remote
* node means more costs for memory reclaim because of memory latency.
*
* Now, we use round-robin. Better algorithm is welcomed.
*/
int mem_cgroup_select_victim_node(struct mem_cgroup *mem)
{
int node;
mem_cgroup_may_update_nodemask(mem);
node = mem->last_scanned_node;
node = next_node(node, mem->scan_nodes);
if (node == MAX_NUMNODES)
node = first_node(mem->scan_nodes);
/*
* We call this when we hit limit, not when pages are added to LRU.
* No LRU may hold pages because all pages are UNEVICTABLE or
* memcg is too small and all pages are not on LRU. In that case,
* we use curret node.
*/
if (unlikely(node == MAX_NUMNODES))
node = numa_node_id();
mem->last_scanned_node = node;
return node;
}
#else
int mem_cgroup_select_victim_node(struct mem_cgroup *mem)
{
return 0;
}
#endif
/*
* Scan the hierarchy if needed to reclaim memory. We remember the last child
* we reclaimed from, so that we don't end up penalizing one child extensively
* based on its position in the children list.
*
* root_mem is the original ancestor that we've been reclaim from.
*
* We give up and return to the caller when we visit root_mem twice.
* (other groups can be removed while we're walking....)
*
* If shrink==true, for avoiding to free too much, this returns immedieately.
*/
static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
unsigned long reclaim_options,
unsigned long *total_scanned)
struct mem_cgroup *victim;
int ret, total = 0;
int loop = 0;
bool noswap = reclaim_options & MEM_CGROUP_RECLAIM_NOSWAP;
bool shrink = reclaim_options & MEM_CGROUP_RECLAIM_SHRINK;
bool check_soft = reclaim_options & MEM_CGROUP_RECLAIM_SOFT;
unsigned long excess;
unsigned long nr_scanned;
excess = res_counter_soft_limit_excess(&root_mem->res) >> PAGE_SHIFT;
/* If memsw_is_minimum==1, swap-out is of-no-use. */
if (root_mem->memsw_is_minimum)
noswap = true;
if (victim == root_mem) {
if (loop >= 1)
drain_all_stock_async();
if (loop >= 2) {
/*
* If we have not been able to reclaim
* anything, it might because there are
* no reclaimable pages under this hierarchy
*/
if (!check_soft || !total) {
css_put(&victim->css);
break;
}
/*
* excess >> 2 is not to excessive so as to
* reclaim too much, nor too less that we keep
* coming back to reclaim from this cgroup
*/
if (total >= (excess >> 2) ||
(loop > MEM_CGROUP_MAX_RECLAIM_LOOPS)) {
css_put(&victim->css);
break;
}
}
}
if (!mem_cgroup_local_usage(victim)) {
/* this cgroup's local usage == 0 */
css_put(&victim->css);
if (check_soft) {
ret = mem_cgroup_shrink_node_zone(victim, gfp_mask,
noswap, get_swappiness(victim), zone,
&nr_scanned);
*total_scanned += nr_scanned;
} else
ret = try_to_free_mem_cgroup_pages(victim, gfp_mask,
noswap, get_swappiness(victim));
/*
* At shrinking usage, we can't check we should stop here or
* reclaim more. It's depends on callers. last_scanned_child
* will work enough for keeping fairness under tree.
*/
if (shrink)
return ret;
if (!res_counter_soft_limit_excess(&root_mem->res))
} else if (mem_cgroup_margin(root_mem))
/*
* Check OOM-Killer is already running under our hierarchy.
* If someone is running, return false.
*/
static bool mem_cgroup_oom_lock(struct mem_cgroup *mem)
{
int x, lock_count = 0;
struct mem_cgroup *iter;
for_each_mem_cgroup_tree(iter, mem) {
x = atomic_inc_return(&iter->oom_lock);
lock_count = max(x, lock_count);
}
if (lock_count == 1)
return true;
return false;
static int mem_cgroup_oom_unlock(struct mem_cgroup *mem)
/*
* When a new child is created while the hierarchy is under oom,
* mem_cgroup_oom_lock() may not be called. We have to use
* atomic_add_unless() here.
*/
for_each_mem_cgroup_tree(iter, mem)
atomic_add_unless(&iter->oom_lock, -1, 0);
static DEFINE_MUTEX(memcg_oom_mutex);
static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq);
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
struct oom_wait_info {
struct mem_cgroup *mem;
wait_queue_t wait;
};
static int memcg_oom_wake_function(wait_queue_t *wait,
unsigned mode, int sync, void *arg)
{
struct mem_cgroup *wake_mem = (struct mem_cgroup *)arg;
struct oom_wait_info *oom_wait_info;
oom_wait_info = container_of(wait, struct oom_wait_info, wait);
if (oom_wait_info->mem == wake_mem)
goto wakeup;
/* if no hierarchy, no match */
if (!oom_wait_info->mem->use_hierarchy || !wake_mem->use_hierarchy)
return 0;
/*
* Both of oom_wait_info->mem and wake_mem are stable under us.
* Then we can use css_is_ancestor without taking care of RCU.
*/
if (!css_is_ancestor(&oom_wait_info->mem->css, &wake_mem->css) &&
!css_is_ancestor(&wake_mem->css, &oom_wait_info->mem->css))
return 0;
wakeup:
return autoremove_wake_function(wait, mode, sync, arg);
}
static void memcg_wakeup_oom(struct mem_cgroup *mem)
{
/* for filtering, pass "mem" as argument. */
__wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, mem);
}
static void memcg_oom_recover(struct mem_cgroup *mem)
{
if (mem && atomic_read(&mem->oom_lock))
memcg_wakeup_oom(mem);
}
/*
* try to call OOM killer. returns false if we should exit memory-reclaim loop.
*/
bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask)
owait.mem = mem;
owait.wait.flags = 0;
owait.wait.func = memcg_oom_wake_function;
owait.wait.private = current;
INIT_LIST_HEAD(&owait.wait.task_list);
/* At first, try to OOM lock hierarchy under mem.*/
mutex_lock(&memcg_oom_mutex);
locked = mem_cgroup_oom_lock(mem);
/*
* Even if signal_pending(), we can't quit charge() loop without
* accounting. So, UNINTERRUPTIBLE is appropriate. But SIGKILL
* under OOM is always welcomed, use TASK_KILLABLE here.
*/
prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE);
if (!locked || mem->oom_kill_disable)
need_to_kill = false;
if (locked)
if (need_to_kill) {
finish_wait(&memcg_oom_waitq, &owait.wait);
finish_wait(&memcg_oom_waitq, &owait.wait);
}
mutex_lock(&memcg_oom_mutex);
mem_cgroup_oom_unlock(mem);
mutex_unlock(&memcg_oom_mutex);
if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current))
return false;
/* Give chance to dying process */
schedule_timeout(1);
return true;
/*
* Currently used to update mapped file statistics, but the routine can be
* generalized to update other statistics as well.

KAMEZAWA Hiroyuki
committed
*
* Notes: Race condition
*
* We usually use page_cgroup_lock() for accessing page_cgroup member but
* it tends to be costly. But considering some conditions, we doesn't need
* to do so _always_.
*
* Considering "charge", lock_page_cgroup() is not required because all
* file-stat operations happen after a page is attached to radix-tree. There
* are no race with "charge".
*
* Considering "uncharge", we know that memcg doesn't clear pc->mem_cgroup
* at "uncharge" intentionally. So, we always see valid pc->mem_cgroup even
* if there are race with "uncharge". Statistics itself is properly handled
* by flags.
*
* Considering "move", this is an only case we see a race. To make the race
* small, we check MEM_CGROUP_ON_MOVE percpu value and detect there are
* possibility of race condition. If there is, we take a lock.
void mem_cgroup_update_page_stat(struct page *page,
enum mem_cgroup_page_stat_item idx, int val)

KAMEZAWA Hiroyuki
committed
struct page_cgroup *pc = lookup_page_cgroup(page);
bool need_unlock = false;
unsigned long uninitialized_var(flags);
if (unlikely(!pc))
return;

KAMEZAWA Hiroyuki
committed
rcu_read_lock();

KAMEZAWA Hiroyuki
committed
if (unlikely(!mem || !PageCgroupUsed(pc)))
goto out;
/* pc->mem_cgroup is unstable ? */
if (unlikely(mem_cgroup_stealed(mem)) || PageTransHuge(page)) {

KAMEZAWA Hiroyuki
committed
/* take a lock against to access pc->mem_cgroup */
move_lock_page_cgroup(pc, &flags);

KAMEZAWA Hiroyuki
committed
need_unlock = true;
mem = pc->mem_cgroup;
if (!mem || !PageCgroupUsed(pc))
goto out;
}
case MEMCG_NR_FILE_MAPPED:
if (val > 0)
SetPageCgroupFileMapped(pc);
else if (!page_mapped(page))
ClearPageCgroupFileMapped(pc);
idx = MEM_CGROUP_STAT_FILE_MAPPED;
break;
default:
BUG();
this_cpu_add(mem->stat->count[idx], val);

KAMEZAWA Hiroyuki
committed
out:
if (unlikely(need_unlock))
move_unlock_page_cgroup(pc, &flags);

KAMEZAWA Hiroyuki
committed
rcu_read_unlock();
return;
EXPORT_SYMBOL(mem_cgroup_update_page_stat);
/*
* size of first charge trial. "32" comes from vmscan.c's magic value.
* TODO: maybe necessary to use big numbers in big irons.
*/
#define CHARGE_BATCH 32U
struct memcg_stock_pcp {
struct mem_cgroup *cached; /* this never be root cgroup */
unsigned int nr_pages;
struct work_struct work;
};
static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock);
static atomic_t memcg_drain_count;
/*
* Try to consume stocked charge on this cpu. If success, one page is consumed
* from local stock and true is returned. If the stock is 0 or charges from a
* cgroup which is not current target, returns false. This stock will be
* refilled.
*/
static bool consume_stock(struct mem_cgroup *mem)
{
struct memcg_stock_pcp *stock;
bool ret = true;
stock = &get_cpu_var(memcg_stock);
if (mem == stock->cached && stock->nr_pages)
stock->nr_pages--;
else /* need to call res_counter_charge */
ret = false;
put_cpu_var(memcg_stock);
return ret;
}
/*
* Returns stocks cached in percpu to res_counter and reset cached information.
*/
static void drain_stock(struct memcg_stock_pcp *stock)
{
struct mem_cgroup *old = stock->cached;
if (stock->nr_pages) {
unsigned long bytes = stock->nr_pages * PAGE_SIZE;
res_counter_uncharge(&old->res, bytes);
res_counter_uncharge(&old->memsw, bytes);
stock->nr_pages = 0;
}
stock->cached = NULL;
}
/*
* This must be called under preempt disabled or must be called by
* a thread which is pinned to local cpu.
*/
static void drain_local_stock(struct work_struct *dummy)
{
struct memcg_stock_pcp *stock = &__get_cpu_var(memcg_stock);
drain_stock(stock);
}
/*
* Cache charges(val) which is from res_counter, to local per_cpu area.
* This will be consumed by consume_stock() function, later.
static void refill_stock(struct mem_cgroup *mem, unsigned int nr_pages)
{
struct memcg_stock_pcp *stock = &get_cpu_var(memcg_stock);
if (stock->cached != mem) { /* reset if necessary */
drain_stock(stock);
stock->cached = mem;
}
stock->nr_pages += nr_pages;
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
put_cpu_var(memcg_stock);
}
/*
* Tries to drain stocked charges in other cpus. This function is asynchronous
* and just put a work per cpu for draining localy on each cpu. Caller can
* expects some charges will be back to res_counter later but cannot wait for
* it.
*/
static void drain_all_stock_async(void)
{
int cpu;
/* This function is for scheduling "drain" in asynchronous way.
* The result of "drain" is not directly handled by callers. Then,
* if someone is calling drain, we don't have to call drain more.
* Anyway, WORK_STRUCT_PENDING check in queue_work_on() will catch if
* there is a race. We just do loose check here.
*/
if (atomic_read(&memcg_drain_count))
return;
/* Notify other cpus that system-wide "drain" is running */
atomic_inc(&memcg_drain_count);
get_online_cpus();
for_each_online_cpu(cpu) {
struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
schedule_work_on(cpu, &stock->work);
}
put_online_cpus();
atomic_dec(&memcg_drain_count);
/* We don't wait for flush_work */
}
/* This is a synchronous drain interface. */
static void drain_all_stock_sync(void)
{
/* called when force_empty is called */
atomic_inc(&memcg_drain_count);
schedule_on_each_cpu(drain_local_stock);
atomic_dec(&memcg_drain_count);
}
/*
* This function drains percpu counter value from DEAD cpu and
* move it to local cpu. Note that this function can be preempted.
*/
static void mem_cgroup_drain_pcp_counter(struct mem_cgroup *mem, int cpu)
{
int i;
spin_lock(&mem->pcp_counter_lock);
for (i = 0; i < MEM_CGROUP_STAT_DATA; i++) {
long x = per_cpu(mem->stat->count[i], cpu);
per_cpu(mem->stat->count[i], cpu) = 0;
mem->nocpu_base.count[i] += x;
}
for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) {
unsigned long x = per_cpu(mem->stat->events[i], cpu);
per_cpu(mem->stat->events[i], cpu) = 0;
mem->nocpu_base.events[i] += x;
}
/* need to clear ON_MOVE value, works as a kind of lock. */
per_cpu(mem->stat->count[MEM_CGROUP_ON_MOVE], cpu) = 0;
spin_unlock(&mem->pcp_counter_lock);
}
static void synchronize_mem_cgroup_on_move(struct mem_cgroup *mem, int cpu)
{
int idx = MEM_CGROUP_ON_MOVE;
spin_lock(&mem->pcp_counter_lock);
per_cpu(mem->stat->count[idx], cpu) = mem->nocpu_base.count[idx];
spin_unlock(&mem->pcp_counter_lock);
}
static int __cpuinit memcg_cpu_hotplug_callback(struct notifier_block *nb,
unsigned long action,
void *hcpu)
{
int cpu = (unsigned long)hcpu;
struct memcg_stock_pcp *stock;
struct mem_cgroup *iter;
if ((action == CPU_ONLINE)) {
for_each_mem_cgroup_all(iter)
synchronize_mem_cgroup_on_move(iter, cpu);
return NOTIFY_OK;
}
if ((action != CPU_DEAD) || action != CPU_DEAD_FROZEN)
for_each_mem_cgroup_all(iter)
mem_cgroup_drain_pcp_counter(iter, cpu);
stock = &per_cpu(memcg_stock, cpu);
drain_stock(stock);
return NOTIFY_OK;
}
/* See __mem_cgroup_try_charge() for details */
enum {
CHARGE_OK, /* success */
CHARGE_RETRY, /* need to retry but retry is not bad */
CHARGE_NOMEM, /* we can't do more. return -ENOMEM */
CHARGE_WOULDBLOCK, /* GFP_WAIT wasn't set and no enough res. */
CHARGE_OOM_DIE, /* the current is killed because of OOM */
};
static int mem_cgroup_do_charge(struct mem_cgroup *mem, gfp_t gfp_mask,
unsigned int nr_pages, bool oom_check)
unsigned long csize = nr_pages * PAGE_SIZE;
struct mem_cgroup *mem_over_limit;
struct res_counter *fail_res;
unsigned long flags = 0;
int ret;
ret = res_counter_charge(&mem->res, csize, &fail_res);
if (likely(!ret)) {
if (!do_swap_account)
return CHARGE_OK;