Newer
Older
return (active > inactive);
}
unsigned long mem_cgroup_zone_nr_pages(struct mem_cgroup *memcg,
struct zone *zone,
enum lru_list lru)
{
int nid = zone_to_nid(zone);
int zid = zone_idx(zone);
struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(memcg, nid, zid);
return MEM_CGROUP_ZSTAT(mz, lru);
}
struct zone_reclaim_stat *mem_cgroup_get_reclaim_stat(struct mem_cgroup *memcg,
struct zone *zone)
{
int nid = zone_to_nid(zone);
int zid = zone_idx(zone);
struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(memcg, nid, zid);
return &mz->reclaim_stat;
}
struct zone_reclaim_stat *
mem_cgroup_get_reclaim_stat_from_page(struct page *page)
{
struct page_cgroup *pc;
struct mem_cgroup_per_zone *mz;
if (mem_cgroup_disabled())
return NULL;
pc = lookup_page_cgroup(page);
/*
* Used bit is set without atomic ops but after smp_wmb().
* For making pc->mem_cgroup visible, insert smp_rmb() here.
*/
smp_rmb();
if (!PageCgroupUsed(pc))
return NULL;
mz = page_cgroup_zoneinfo(pc);
if (!mz)
return NULL;
return &mz->reclaim_stat;
}
unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
struct list_head *dst,
unsigned long *scanned, int order,
int mode, struct zone *z,
struct mem_cgroup *mem_cont,
{
unsigned long nr_taken = 0;
struct page *page;
unsigned long scan;
LIST_HEAD(pc_list);
struct list_head *src;

KAMEZAWA Hiroyuki
committed
struct page_cgroup *pc, *tmp;
int nid = zone_to_nid(z);

KAMEZAWA Hiroyuki
committed
int zid = zone_idx(z);
struct mem_cgroup_per_zone *mz;
int lru = LRU_FILE * file + active;

KAMEZAWA Hiroyuki
committed
mz = mem_cgroup_zoneinfo(mem_cont, nid, zid);
src = &mz->lists[lru];

KAMEZAWA Hiroyuki
committed
scan = 0;
list_for_each_entry_safe_reverse(pc, tmp, src, lru) {

KAMEZAWA Hiroyuki
committed
break;
if (unlikely(!PageCgroupUsed(pc)))
continue;

KAMEZAWA Hiroyuki
committed
continue;
ret = __isolate_lru_page(page, mode, file);
switch (ret) {
case 0:
list_move(&page->lru, dst);
nr_taken += hpage_nr_pages(page);
break;
case -EBUSY:
/* we don't affect global LRU but rotate in our LRU */
mem_cgroup_rotate_lru_list(page, page_lru(page));
break;
default:
break;
}
}
*scanned = scan;
trace_mm_vmscan_memcg_isolate(0, nr_to_scan, scan, nr_taken,
0, 0, 0, mode);
return nr_taken;
}
#define mem_cgroup_from_res_counter(counter, member) \
container_of(counter, struct mem_cgroup, member)
static bool mem_cgroup_check_under_limit(struct mem_cgroup *mem)
{
if (do_swap_account) {
if (res_counter_check_under_limit(&mem->res) &&
res_counter_check_under_limit(&mem->memsw))
return true;
} else
if (res_counter_check_under_limit(&mem->res))
return true;
return false;
}
static unsigned int get_swappiness(struct mem_cgroup *memcg)
{
struct cgroup *cgrp = memcg->css.cgroup;
unsigned int swappiness;
/* root ? */
if (cgrp->parent == NULL)
return vm_swappiness;
spin_lock(&memcg->reclaim_param_lock);
swappiness = memcg->swappiness;
spin_unlock(&memcg->reclaim_param_lock);
return swappiness;
}

KAMEZAWA Hiroyuki
committed
static void mem_cgroup_start_move(struct mem_cgroup *mem)
{
int cpu;
get_online_cpus();
spin_lock(&mem->pcp_counter_lock);
for_each_online_cpu(cpu)

KAMEZAWA Hiroyuki
committed
per_cpu(mem->stat->count[MEM_CGROUP_ON_MOVE], cpu) += 1;
mem->nocpu_base.count[MEM_CGROUP_ON_MOVE] += 1;
spin_unlock(&mem->pcp_counter_lock);
put_online_cpus();

KAMEZAWA Hiroyuki
committed
synchronize_rcu();
}
static void mem_cgroup_end_move(struct mem_cgroup *mem)
{
int cpu;
if (!mem)
return;
get_online_cpus();
spin_lock(&mem->pcp_counter_lock);
for_each_online_cpu(cpu)

KAMEZAWA Hiroyuki
committed
per_cpu(mem->stat->count[MEM_CGROUP_ON_MOVE], cpu) -= 1;
mem->nocpu_base.count[MEM_CGROUP_ON_MOVE] -= 1;
spin_unlock(&mem->pcp_counter_lock);
put_online_cpus();

KAMEZAWA Hiroyuki
committed
}
/*
* 2 routines for checking "mem" is under move_account() or not.
*
* mem_cgroup_stealed() - checking a cgroup is mc.from or not. This is used
* for avoiding race in accounting. If true,
* pc->mem_cgroup may be overwritten.
*
* mem_cgroup_under_move() - checking a cgroup is mc.from or mc.to or
* under hierarchy of moving cgroups. This is for
* waiting at hith-memory prressure caused by "move".
*/
static bool mem_cgroup_stealed(struct mem_cgroup *mem)
{
VM_BUG_ON(!rcu_read_lock_held());
return this_cpu_read(mem->stat->count[MEM_CGROUP_ON_MOVE]) > 0;
}
static bool mem_cgroup_under_move(struct mem_cgroup *mem)
{
struct mem_cgroup *from;
struct mem_cgroup *to;
/*
* Unlike task_move routines, we access mc.to, mc.from not under
* mutual exclusion by cgroup_mutex. Here, we take spinlock instead.
*/
spin_lock(&mc.lock);
from = mc.from;
to = mc.to;
if (!from)
goto unlock;
if (from == mem || to == mem
|| (mem->use_hierarchy && css_is_ancestor(&from->css, &mem->css))
|| (mem->use_hierarchy && css_is_ancestor(&to->css, &mem->css)))
ret = true;
unlock:
spin_unlock(&mc.lock);
return ret;
}
static bool mem_cgroup_wait_acct_move(struct mem_cgroup *mem)
{
if (mc.moving_task && current != mc.moving_task) {
if (mem_cgroup_under_move(mem)) {
DEFINE_WAIT(wait);
prepare_to_wait(&mc.waitq, &wait, TASK_INTERRUPTIBLE);
/* moving charge context might have finished. */
if (mc.moving_task)
schedule();
finish_wait(&mc.waitq, &wait);
return true;
}
}
return false;
}
* mem_cgroup_print_oom_info: Called from OOM with tasklist_lock held in read mode.
* @memcg: The memory cgroup that went over limit
* @p: Task that is going to be killed
*
* NOTE: @memcg and @p's mem_cgroup can be different when hierarchy is
* enabled
*/
void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
{
struct cgroup *task_cgrp;
struct cgroup *mem_cgrp;
/*
* Need a buffer in BSS, can't rely on allocations. The code relies
* on the assumption that OOM is serialized for memory controller.
* If this assumption is broken, revisit this code.
*/
static char memcg_name[PATH_MAX];
int ret;
if (!memcg || !p)
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
return;
rcu_read_lock();
mem_cgrp = memcg->css.cgroup;
task_cgrp = task_cgroup(p, mem_cgroup_subsys_id);
ret = cgroup_path(task_cgrp, memcg_name, PATH_MAX);
if (ret < 0) {
/*
* Unfortunately, we are unable to convert to a useful name
* But we'll still print out the usage information
*/
rcu_read_unlock();
goto done;
}
rcu_read_unlock();
printk(KERN_INFO "Task in %s killed", memcg_name);
rcu_read_lock();
ret = cgroup_path(mem_cgrp, memcg_name, PATH_MAX);
if (ret < 0) {
rcu_read_unlock();
goto done;
}
rcu_read_unlock();
/*
* Continues from above, so we don't need an KERN_ level
*/
printk(KERN_CONT " as a result of limit of %s\n", memcg_name);
done:
printk(KERN_INFO "memory: usage %llukB, limit %llukB, failcnt %llu\n",
res_counter_read_u64(&memcg->res, RES_USAGE) >> 10,
res_counter_read_u64(&memcg->res, RES_LIMIT) >> 10,
res_counter_read_u64(&memcg->res, RES_FAILCNT));
printk(KERN_INFO "memory+swap: usage %llukB, limit %llukB, "
"failcnt %llu\n",
res_counter_read_u64(&memcg->memsw, RES_USAGE) >> 10,
res_counter_read_u64(&memcg->memsw, RES_LIMIT) >> 10,
res_counter_read_u64(&memcg->memsw, RES_FAILCNT));
}
/*
* This function returns the number of memcg under hierarchy tree. Returns
* 1(self count) if no children.
*/
static int mem_cgroup_count_children(struct mem_cgroup *mem)
{
int num = 0;
struct mem_cgroup *iter;
for_each_mem_cgroup_tree(iter, mem)
num++;
return num;
}
/*
* Return the memory (and swap, if configured) limit for a memcg.
*/
u64 mem_cgroup_get_limit(struct mem_cgroup *memcg)
{
u64 limit;
u64 memsw;
limit = res_counter_read_u64(&memcg->res, RES_LIMIT) +
total_swap_pages;
memsw = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
/*
* If memsw is finite and limits the amount of swap space available
* to this memcg, return that limit.
*/
return min(limit, memsw);
}
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
* Visit the first child (need not be the first child as per the ordering
* of the cgroup list, since we track last_scanned_child) of @mem and use
* that to reclaim free pages from.
*/
static struct mem_cgroup *
mem_cgroup_select_victim(struct mem_cgroup *root_mem)
{
struct mem_cgroup *ret = NULL;
struct cgroup_subsys_state *css;
int nextid, found;
if (!root_mem->use_hierarchy) {
css_get(&root_mem->css);
ret = root_mem;
}
while (!ret) {
rcu_read_lock();
nextid = root_mem->last_scanned_child + 1;
css = css_get_next(&mem_cgroup_subsys, nextid, &root_mem->css,
&found);
if (css && css_tryget(css))
ret = container_of(css, struct mem_cgroup, css);
rcu_read_unlock();
/* Updates scanning parameter */
spin_lock(&root_mem->reclaim_param_lock);
if (!css) {
/* this means start scan from ID:1 */
root_mem->last_scanned_child = 0;
} else
root_mem->last_scanned_child = found;
spin_unlock(&root_mem->reclaim_param_lock);
}
return ret;
}
/*
* Scan the hierarchy if needed to reclaim memory. We remember the last child
* we reclaimed from, so that we don't end up penalizing one child extensively
* based on its position in the children list.
*
* root_mem is the original ancestor that we've been reclaim from.
*
* We give up and return to the caller when we visit root_mem twice.
* (other groups can be removed while we're walking....)
*
* If shrink==true, for avoiding to free too much, this returns immedieately.
*/
static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
gfp_t gfp_mask,
unsigned long reclaim_options)
struct mem_cgroup *victim;
int ret, total = 0;
int loop = 0;
bool noswap = reclaim_options & MEM_CGROUP_RECLAIM_NOSWAP;
bool shrink = reclaim_options & MEM_CGROUP_RECLAIM_SHRINK;
bool check_soft = reclaim_options & MEM_CGROUP_RECLAIM_SOFT;
unsigned long excess = mem_cgroup_get_excess(root_mem);
/* If memsw_is_minimum==1, swap-out is of-no-use. */
if (root_mem->memsw_is_minimum)
noswap = true;
if (victim == root_mem) {
if (loop >= 1)
drain_all_stock_async();
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
if (loop >= 2) {
/*
* If we have not been able to reclaim
* anything, it might because there are
* no reclaimable pages under this hierarchy
*/
if (!check_soft || !total) {
css_put(&victim->css);
break;
}
/*
* We want to do more targetted reclaim.
* excess >> 2 is not to excessive so as to
* reclaim too much, nor too less that we keep
* coming back to reclaim from this cgroup
*/
if (total >= (excess >> 2) ||
(loop > MEM_CGROUP_MAX_RECLAIM_LOOPS)) {
css_put(&victim->css);
break;
}
}
}
if (!mem_cgroup_local_usage(victim)) {
/* this cgroup's local usage == 0 */
css_put(&victim->css);
if (check_soft)
ret = mem_cgroup_shrink_node_zone(victim, gfp_mask,
noswap, get_swappiness(victim), zone);
else
ret = try_to_free_mem_cgroup_pages(victim, gfp_mask,
noswap, get_swappiness(victim));
/*
* At shrinking usage, we can't check we should stop here or
* reclaim more. It's depends on callers. last_scanned_child
* will work enough for keeping fairness under tree.
*/
if (shrink)
return ret;
if (check_soft) {
if (res_counter_check_under_soft_limit(&root_mem->res))
return total;
} else if (mem_cgroup_check_under_limit(root_mem))
/*
* Check OOM-Killer is already running under our hierarchy.
* If someone is running, return false.
*/
static bool mem_cgroup_oom_lock(struct mem_cgroup *mem)
{
int x, lock_count = 0;
struct mem_cgroup *iter;
for_each_mem_cgroup_tree(iter, mem) {
x = atomic_inc_return(&iter->oom_lock);
lock_count = max(x, lock_count);
}
if (lock_count == 1)
return true;
return false;
static int mem_cgroup_oom_unlock(struct mem_cgroup *mem)
/*
* When a new child is created while the hierarchy is under oom,
* mem_cgroup_oom_lock() may not be called. We have to use
* atomic_add_unless() here.
*/
for_each_mem_cgroup_tree(iter, mem)
atomic_add_unless(&iter->oom_lock, -1, 0);
static DEFINE_MUTEX(memcg_oom_mutex);
static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq);
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
struct oom_wait_info {
struct mem_cgroup *mem;
wait_queue_t wait;
};
static int memcg_oom_wake_function(wait_queue_t *wait,
unsigned mode, int sync, void *arg)
{
struct mem_cgroup *wake_mem = (struct mem_cgroup *)arg;
struct oom_wait_info *oom_wait_info;
oom_wait_info = container_of(wait, struct oom_wait_info, wait);
if (oom_wait_info->mem == wake_mem)
goto wakeup;
/* if no hierarchy, no match */
if (!oom_wait_info->mem->use_hierarchy || !wake_mem->use_hierarchy)
return 0;
/*
* Both of oom_wait_info->mem and wake_mem are stable under us.
* Then we can use css_is_ancestor without taking care of RCU.
*/
if (!css_is_ancestor(&oom_wait_info->mem->css, &wake_mem->css) &&
!css_is_ancestor(&wake_mem->css, &oom_wait_info->mem->css))
return 0;
wakeup:
return autoremove_wake_function(wait, mode, sync, arg);
}
static void memcg_wakeup_oom(struct mem_cgroup *mem)
{
/* for filtering, pass "mem" as argument. */
__wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, mem);
}
static void memcg_oom_recover(struct mem_cgroup *mem)
{
if (mem && atomic_read(&mem->oom_lock))
memcg_wakeup_oom(mem);
}
/*
* try to call OOM killer. returns false if we should exit memory-reclaim loop.
*/
bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask)
owait.mem = mem;
owait.wait.flags = 0;
owait.wait.func = memcg_oom_wake_function;
owait.wait.private = current;
INIT_LIST_HEAD(&owait.wait.task_list);
/* At first, try to OOM lock hierarchy under mem.*/
mutex_lock(&memcg_oom_mutex);
locked = mem_cgroup_oom_lock(mem);
/*
* Even if signal_pending(), we can't quit charge() loop without
* accounting. So, UNINTERRUPTIBLE is appropriate. But SIGKILL
* under OOM is always welcomed, use TASK_KILLABLE here.
*/
prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE);
if (!locked || mem->oom_kill_disable)
need_to_kill = false;
if (locked)
if (need_to_kill) {
finish_wait(&memcg_oom_waitq, &owait.wait);
finish_wait(&memcg_oom_waitq, &owait.wait);
}
mutex_lock(&memcg_oom_mutex);
mem_cgroup_oom_unlock(mem);
mutex_unlock(&memcg_oom_mutex);
if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current))
return false;
/* Give chance to dying process */
schedule_timeout(1);
return true;
/*
* Currently used to update mapped file statistics, but the routine can be
* generalized to update other statistics as well.

KAMEZAWA Hiroyuki
committed
*
* Notes: Race condition
*
* We usually use page_cgroup_lock() for accessing page_cgroup member but
* it tends to be costly. But considering some conditions, we doesn't need
* to do so _always_.
*
* Considering "charge", lock_page_cgroup() is not required because all
* file-stat operations happen after a page is attached to radix-tree. There
* are no race with "charge".
*
* Considering "uncharge", we know that memcg doesn't clear pc->mem_cgroup
* at "uncharge" intentionally. So, we always see valid pc->mem_cgroup even
* if there are race with "uncharge". Statistics itself is properly handled
* by flags.
*
* Considering "move", this is an only case we see a race. To make the race
* small, we check MEM_CGROUP_ON_MOVE percpu value and detect there are
* possibility of race condition. If there is, we take a lock.
void mem_cgroup_update_page_stat(struct page *page,
enum mem_cgroup_page_stat_item idx, int val)

KAMEZAWA Hiroyuki
committed
struct page_cgroup *pc = lookup_page_cgroup(page);
bool need_unlock = false;
if (unlikely(!pc))
return;

KAMEZAWA Hiroyuki
committed
rcu_read_lock();

KAMEZAWA Hiroyuki
committed
if (unlikely(!mem || !PageCgroupUsed(pc)))
goto out;
/* pc->mem_cgroup is unstable ? */
if (unlikely(mem_cgroup_stealed(mem))) {
/* take a lock against to access pc->mem_cgroup */
lock_page_cgroup(pc);
need_unlock = true;
mem = pc->mem_cgroup;
if (!mem || !PageCgroupUsed(pc))
goto out;
}
case MEMCG_NR_FILE_MAPPED:
if (val > 0)
SetPageCgroupFileMapped(pc);
else if (!page_mapped(page))
ClearPageCgroupFileMapped(pc);
idx = MEM_CGROUP_STAT_FILE_MAPPED;
break;
default:
BUG();
this_cpu_add(mem->stat->count[idx], val);

KAMEZAWA Hiroyuki
committed
out:
if (unlikely(need_unlock))
unlock_page_cgroup(pc);
rcu_read_unlock();
return;
EXPORT_SYMBOL(mem_cgroup_update_page_stat);
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
/*
* size of first charge trial. "32" comes from vmscan.c's magic value.
* TODO: maybe necessary to use big numbers in big irons.
*/
#define CHARGE_SIZE (32 * PAGE_SIZE)
struct memcg_stock_pcp {
struct mem_cgroup *cached; /* this never be root cgroup */
int charge;
struct work_struct work;
};
static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock);
static atomic_t memcg_drain_count;
/*
* Try to consume stocked charge on this cpu. If success, PAGE_SIZE is consumed
* from local stock and true is returned. If the stock is 0 or charges from a
* cgroup which is not current target, returns false. This stock will be
* refilled.
*/
static bool consume_stock(struct mem_cgroup *mem)
{
struct memcg_stock_pcp *stock;
bool ret = true;
stock = &get_cpu_var(memcg_stock);
if (mem == stock->cached && stock->charge)
stock->charge -= PAGE_SIZE;
else /* need to call res_counter_charge */
ret = false;
put_cpu_var(memcg_stock);
return ret;
}
/*
* Returns stocks cached in percpu to res_counter and reset cached information.
*/
static void drain_stock(struct memcg_stock_pcp *stock)
{
struct mem_cgroup *old = stock->cached;
if (stock->charge) {
res_counter_uncharge(&old->res, stock->charge);
if (do_swap_account)
res_counter_uncharge(&old->memsw, stock->charge);
}
stock->cached = NULL;
stock->charge = 0;
}
/*
* This must be called under preempt disabled or must be called by
* a thread which is pinned to local cpu.
*/
static void drain_local_stock(struct work_struct *dummy)
{
struct memcg_stock_pcp *stock = &__get_cpu_var(memcg_stock);
drain_stock(stock);
}
/*
* Cache charges(val) which is from res_counter, to local per_cpu area.
* This will be consumed by consume_stock() function, later.
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
*/
static void refill_stock(struct mem_cgroup *mem, int val)
{
struct memcg_stock_pcp *stock = &get_cpu_var(memcg_stock);
if (stock->cached != mem) { /* reset if necessary */
drain_stock(stock);
stock->cached = mem;
}
stock->charge += val;
put_cpu_var(memcg_stock);
}
/*
* Tries to drain stocked charges in other cpus. This function is asynchronous
* and just put a work per cpu for draining localy on each cpu. Caller can
* expects some charges will be back to res_counter later but cannot wait for
* it.
*/
static void drain_all_stock_async(void)
{
int cpu;
/* This function is for scheduling "drain" in asynchronous way.
* The result of "drain" is not directly handled by callers. Then,
* if someone is calling drain, we don't have to call drain more.
* Anyway, WORK_STRUCT_PENDING check in queue_work_on() will catch if
* there is a race. We just do loose check here.
*/
if (atomic_read(&memcg_drain_count))
return;
/* Notify other cpus that system-wide "drain" is running */
atomic_inc(&memcg_drain_count);
get_online_cpus();
for_each_online_cpu(cpu) {
struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
schedule_work_on(cpu, &stock->work);
}
put_online_cpus();
atomic_dec(&memcg_drain_count);
/* We don't wait for flush_work */
}
/* This is a synchronous drain interface. */
static void drain_all_stock_sync(void)
{
/* called when force_empty is called */
atomic_inc(&memcg_drain_count);
schedule_on_each_cpu(drain_local_stock);
atomic_dec(&memcg_drain_count);
}
/*
* This function drains percpu counter value from DEAD cpu and
* move it to local cpu. Note that this function can be preempted.
*/
static void mem_cgroup_drain_pcp_counter(struct mem_cgroup *mem, int cpu)
{
int i;
spin_lock(&mem->pcp_counter_lock);
for (i = 0; i < MEM_CGROUP_STAT_DATA; i++) {
s64 x = per_cpu(mem->stat->count[i], cpu);
per_cpu(mem->stat->count[i], cpu) = 0;
mem->nocpu_base.count[i] += x;
}
/* need to clear ON_MOVE value, works as a kind of lock. */
per_cpu(mem->stat->count[MEM_CGROUP_ON_MOVE], cpu) = 0;
spin_unlock(&mem->pcp_counter_lock);
}
static void synchronize_mem_cgroup_on_move(struct mem_cgroup *mem, int cpu)
{
int idx = MEM_CGROUP_ON_MOVE;
spin_lock(&mem->pcp_counter_lock);
per_cpu(mem->stat->count[idx], cpu) = mem->nocpu_base.count[idx];
spin_unlock(&mem->pcp_counter_lock);
}
static int __cpuinit memcg_cpu_hotplug_callback(struct notifier_block *nb,
unsigned long action,
void *hcpu)
{
int cpu = (unsigned long)hcpu;
struct memcg_stock_pcp *stock;
struct mem_cgroup *iter;
if ((action == CPU_ONLINE)) {
for_each_mem_cgroup_all(iter)
synchronize_mem_cgroup_on_move(iter, cpu);
return NOTIFY_OK;
}
if ((action != CPU_DEAD) || action != CPU_DEAD_FROZEN)
for_each_mem_cgroup_all(iter)
mem_cgroup_drain_pcp_counter(iter, cpu);
stock = &per_cpu(memcg_stock, cpu);
drain_stock(stock);
return NOTIFY_OK;
}
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
/* See __mem_cgroup_try_charge() for details */
enum {
CHARGE_OK, /* success */
CHARGE_RETRY, /* need to retry but retry is not bad */
CHARGE_NOMEM, /* we can't do more. return -ENOMEM */
CHARGE_WOULDBLOCK, /* GFP_WAIT wasn't set and no enough res. */
CHARGE_OOM_DIE, /* the current is killed because of OOM */
};
static int __mem_cgroup_do_charge(struct mem_cgroup *mem, gfp_t gfp_mask,
int csize, bool oom_check)
{
struct mem_cgroup *mem_over_limit;
struct res_counter *fail_res;
unsigned long flags = 0;
int ret;
ret = res_counter_charge(&mem->res, csize, &fail_res);
if (likely(!ret)) {
if (!do_swap_account)
return CHARGE_OK;
ret = res_counter_charge(&mem->memsw, csize, &fail_res);
if (likely(!ret))
return CHARGE_OK;
mem_over_limit = mem_cgroup_from_res_counter(fail_res, memsw);
flags |= MEM_CGROUP_RECLAIM_NOSWAP;
} else
mem_over_limit = mem_cgroup_from_res_counter(fail_res, res);
if (csize > PAGE_SIZE) /* change csize and retry */
return CHARGE_RETRY;
if (!(gfp_mask & __GFP_WAIT))
return CHARGE_WOULDBLOCK;
ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, NULL,
gfp_mask, flags);
/*
* try_to_free_mem_cgroup_pages() might not give us a full
* picture of reclaim. Some pages are reclaimed and might be
* moved to swap cache or just unmapped from the cgroup.
* Check the limit again to see if the reclaim reduced the
* current usage of the cgroup before giving up
*/
if (ret || mem_cgroup_check_under_limit(mem_over_limit))
return CHARGE_RETRY;
/*
* At task move, charge accounts can be doubly counted. So, it's
* better to wait until the end of task_move if something is going on.
*/
if (mem_cgroup_wait_acct_move(mem_over_limit))
return CHARGE_RETRY;
/* If we don't need to call oom-killer at el, return immediately */
if (!oom_check)
return CHARGE_NOMEM;
/* check OOM */
if (!mem_cgroup_handle_oom(mem_over_limit, gfp_mask))
return CHARGE_OOM_DIE;
return CHARGE_RETRY;
}
/*
* Unlike exported interface, "oom" parameter is added. if oom==true,
* oom-killer can be invoked.
static int __mem_cgroup_try_charge(struct mm_struct *mm,
gfp_t gfp_mask,
struct mem_cgroup **memcg, bool oom,
int page_size)
int nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES;
struct mem_cgroup *mem = NULL;
int ret;
int csize = max(CHARGE_SIZE, (unsigned long) page_size);
/*
* Unlike gloval-vm's OOM-kill, we're not in memory shortage
* in system level. So, allow to go ahead dying process in addition to
* MEMDIE process.
*/
if (unlikely(test_thread_flag(TIF_MEMDIE)
|| fatal_signal_pending(current)))
goto bypass;
* We always charge the cgroup the mm_struct belongs to.
* The mm_struct's mem_cgroup changes on task migration if the
* thread group leader migrates. It's possible that mm is not
* set, if so charge the init_mm (happens for pagecache usage).
*/
if (!*memcg && !mm)
goto bypass;
again:
if (*memcg) { /* css should be a valid one */
VM_BUG_ON(css_is_removed(&mem->css));
if (mem_cgroup_is_root(mem))
goto done;
if (page_size == PAGE_SIZE && consume_stock(mem))
css_get(&mem->css);
} else {
rcu_read_lock();
p = rcu_dereference(mm->owner);
/*
* Because we don't have task_lock(), "p" can exit.
* In that case, "mem" can point to root or p can be NULL with
* race with swapoff. Then, we have small risk of mis-accouning.
* But such kind of mis-account by race always happens because
* we don't have cgroup_mutex(). It's overkill and we allo that
* small race, here.
* (*) swapoff at el will charge against mm-struct not against
* task-struct. So, mm->owner can be NULL.
if (!mem || mem_cgroup_is_root(mem)) {
if (page_size == PAGE_SIZE && consume_stock(mem)) {
/*
* It seems dagerous to access memcg without css_get().
* But considering how consume_stok works, it's not
* necessary. If consume_stock success, some charges
* from this memcg are cached on this cpu. So, we
* don't need to call css_get()/css_tryget() before
* calling consume_stock().
*/
rcu_read_unlock();
goto done;
}
/* after here, we may be blocked. we need to get refcnt */
if (!css_tryget(&mem->css)) {
rcu_read_unlock();
goto again;
}
rcu_read_unlock();
}
/* If killed, bypass charge */
if (fatal_signal_pending(current)) {
css_put(&mem->css);
oom_check = false;
if (oom && !nr_oom_retries) {
oom_check = true;
nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES;
ret = __mem_cgroup_do_charge(mem, gfp_mask, csize, oom_check);
switch (ret) {
case CHARGE_OK:
break;
case CHARGE_RETRY: /* not in OOM situation but retry */
css_put(&mem->css);
mem = NULL;
goto again;
case CHARGE_WOULDBLOCK: /* !__GFP_WAIT */
goto nomem;
case CHARGE_NOMEM: /* OOM routine works */
/* If oom, we never return -ENOMEM */
nr_oom_retries--;
break;
case CHARGE_OOM_DIE: /* Killed by OOM Killer */