Newer
Older
SetPageCgroupAcctLRU(pc);
if (mem_cgroup_is_root(pc->mem_cgroup))
return;
list_add(&pc->lru, &mz->lists[lru]);
}
* At handling SwapCache and other FUSE stuff, pc->mem_cgroup may be changed
* while it's linked to lru because the page may be reused after it's fully
* uncharged. To handle that, unlink page_cgroup from LRU when charge it again.
* It's done under lock_page and expected that zone->lru_lock isnever held.
static void mem_cgroup_lru_del_before_commit(struct page *page)
unsigned long flags;
struct zone *zone = page_zone(page);
struct page_cgroup *pc = lookup_page_cgroup(page);
/*
* Doing this check without taking ->lru_lock seems wrong but this
* is safe. Because if page_cgroup's USED bit is unset, the page
* will not be added to any memcg's LRU. If page_cgroup's USED bit is
* set, the commit after this will fail, anyway.
* This all charge/uncharge is done under some mutual execustion.
* So, we don't need to taking care of changes in USED bit.
*/
if (likely(!PageLRU(page)))
return;
spin_lock_irqsave(&zone->lru_lock, flags);
/*
* Forget old LRU when this page_cgroup is *not* used. This Used bit
* is guarded by lock_page() because the page is SwapCache.
*/
if (!PageCgroupUsed(pc))
mem_cgroup_del_lru_list(page, page_lru(page));
spin_unlock_irqrestore(&zone->lru_lock, flags);
/*
 * Counterpart of the pre-commit unlink: once the charge has been committed,
 * make sure a page that sits on the zone LRU is also accounted on the memcg
 * LRU.
 */
static void mem_cgroup_lru_add_after_commit(struct page *page)
{
	struct zone *zone;
	struct page_cgroup *pc;
	unsigned long irqflags;

	zone = page_zone(page);
	pc = lookup_page_cgroup(page);

	/* The page may have been put on the LRU while we were committing. */
	if (likely(!PageLRU(page)))
		return;

	spin_lock_irqsave(&zone->lru_lock, irqflags);
	/* Re-check under the lock: link only if the page is on the LRU ... */
	if (PageLRU(page)) {
		/* ... and its page_cgroup is not accounted there yet. */
		if (!PageCgroupAcctLRU(pc))
			mem_cgroup_add_lru_list(page, page_lru(page));
	}
	spin_unlock_irqrestore(&zone->lru_lock, irqflags);
}
/*
 * Move @page's page_cgroup from memcg LRU list @from to memcg LRU list @to.
 * Nothing is done when the memory controller is disabled.
 */
void mem_cgroup_move_lists(struct page *page,
			   enum lru_list from, enum lru_list to)
{
	/*
	 * The early return must be conditional; an unconditional return
	 * would leave the del/add pair below unreachable.
	 */
	if (mem_cgroup_disabled())
		return;
	mem_cgroup_del_lru_list(page, from);
	mem_cgroup_add_lru_list(page, to);
}
int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem)
{
int ret;
struct task_struct *p;
p = find_lock_task_mm(task);
if (!p)
return 0;
curr = try_get_mem_cgroup_from_mm(p->mm);
task_unlock(p);
/*
* We should check use_hierarchy of "mem" not "curr". Because checking
* use_hierarchy of "curr" here make this function true if hierarchy is
* enabled in "curr" and "curr" is a child of "mem" in *cgroup*
* hierarchy(even if use_hierarchy is disabled in "mem").
*/
if (mem->use_hierarchy)
ret = css_is_ancestor(&curr->css, &mem->css);
else
ret = (curr == mem);
css_put(&curr->css);
return ret;
}
static int calc_inactive_ratio(struct mem_cgroup *memcg, unsigned long *present_pages)
{
unsigned long active;
unsigned long inactive;
unsigned long gb;
unsigned long inactive_ratio;
inactive = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_INACTIVE_ANON));
active = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_ACTIVE_ANON));
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
gb = (inactive + active) >> (30 - PAGE_SHIFT);
if (gb)
inactive_ratio = int_sqrt(10 * gb);
else
inactive_ratio = 1;
if (present_pages) {
present_pages[0] = inactive;
present_pages[1] = active;
}
return inactive_ratio;
}
/*
 * Returns 1 when the inactive anon list of @memcg, scaled by the inactive
 * ratio, is smaller than the active anon list; 0 otherwise.
 */
int mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg)
{
	/* pages[0]: inactive anon, pages[1]: active anon */
	unsigned long pages[2];
	unsigned long ratio = calc_inactive_ratio(memcg, pages);

	return (pages[0] * ratio < pages[1]) ? 1 : 0;
}
/*
 * Returns non-zero when @memcg has more active than inactive file pages
 * on its LRU lists.
 */
int mem_cgroup_inactive_file_is_low(struct mem_cgroup *memcg)
{
	unsigned long nr_inactive =
		mem_cgroup_nr_lru_pages(memcg, BIT(LRU_INACTIVE_FILE));
	unsigned long nr_active =
		mem_cgroup_nr_lru_pages(memcg, BIT(LRU_ACTIVE_FILE));

	return nr_inactive < nr_active;
}
/*
 * Return the reclaim statistics kept in @memcg's per-zone info for @zone.
 */
struct zone_reclaim_stat *mem_cgroup_get_reclaim_stat(struct mem_cgroup *memcg,
						      struct zone *zone)
{
	struct mem_cgroup_per_zone *pz;

	pz = mem_cgroup_zoneinfo(memcg, zone_to_nid(zone), zone_idx(zone));
	return &pz->reclaim_stat;
}
struct zone_reclaim_stat *
mem_cgroup_get_reclaim_stat_from_page(struct page *page)
{
struct page_cgroup *pc;
struct mem_cgroup_per_zone *mz;
if (mem_cgroup_disabled())
return NULL;
pc = lookup_page_cgroup(page);
if (!PageCgroupUsed(pc))
return NULL;
/* Ensure pc->mem_cgroup is visible after reading PCG_USED. */
smp_rmb();
mz = page_cgroup_zoneinfo(pc->mem_cgroup, page);
/*
 * Walk one per-zone memcg LRU list of @mem_cont in reverse and try to
 * isolate pages from it onto @dst with __isolate_lru_page(). The scan count
 * is stored in *@scanned and the number of pages taken (counted with
 * hpage_nr_pages()) is returned.
 *
 * NOTE(review): this definition is damaged in this copy of the file: the
 * parameter list ends in a trailing comma, `active`, `file` and `ret` are
 * used but never declared, the first `break` and the second `continue` in
 * the loop have lost their conditions, `scan` is never advanced, and stray
 * non-C text ("KAMEZAWA Hiroyuki" / "committed") is interleaved with the
 * code. Restore it from the authoritative source before relying on it.
 */
unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
struct list_head *dst,
unsigned long *scanned, int order,
int mode, struct zone *z,
struct mem_cgroup *mem_cont,
{
unsigned long nr_taken = 0;
struct page *page;
unsigned long scan;
LIST_HEAD(pc_list);
struct list_head *src;

KAMEZAWA Hiroyuki
committed
struct page_cgroup *pc, *tmp;
int nid = zone_to_nid(z);

KAMEZAWA Hiroyuki
committed
int zid = zone_idx(z);
struct mem_cgroup_per_zone *mz;
/* Index of the LRU list selected by the file/active flags. */
int lru = LRU_FILE * file + active;

KAMEZAWA Hiroyuki
committed
mz = mem_cgroup_zoneinfo(mem_cont, nid, zid);
src = &mz->lists[lru];

KAMEZAWA Hiroyuki
committed
scan = 0;
list_for_each_entry_safe_reverse(pc, tmp, src, lru) {

KAMEZAWA Hiroyuki
committed
/* NOTE(review): the condition guarding this break is missing. */
break;
if (unlikely(!PageCgroupUsed(pc)))
continue;
page = lookup_cgroup_page(pc);

KAMEZAWA Hiroyuki
committed
/* NOTE(review): the condition guarding this continue is missing. */
continue;
ret = __isolate_lru_page(page, mode, file);
switch (ret) {
case 0:
/* Isolated: hand the page over to the caller's list. */
list_move(&page->lru, dst);
nr_taken += hpage_nr_pages(page);
break;
case -EBUSY:
/* we don't affect global LRU but rotate in our LRU */
mem_cgroup_rotate_lru_list(page, page_lru(page));
break;
default:
break;
}
}
*scanned = scan;
trace_mm_vmscan_memcg_isolate(0, nr_to_scan, scan, nr_taken,
0, 0, 0, mode);
return nr_taken;
}
/*
 * Given a pointer to a res_counter embedded in struct mem_cgroup as field
 * @field, return the enclosing struct mem_cgroup.
 */
#define mem_cgroup_from_res_counter(cnt, field)	\
	container_of(cnt, struct mem_cgroup, field)
/**
 * mem_cgroup_margin - calculate chargeable space of a memory cgroup
 * @mem: the memory cgroup
 *
 * Returns the maximum amount of memory @mem can be charged with, in
 * pages.
 */
static unsigned long mem_cgroup_margin(struct mem_cgroup *mem)
{
	unsigned long long room = res_counter_margin(&mem->res);

	/* With swap accounting on, the mem+swap counter may be the tighter bound. */
	if (do_swap_account)
		room = min(room, res_counter_margin(&mem->memsw));

	/* Byte margin -> page count. */
	return room >> PAGE_SHIFT;
}
/*
 * Return the swappiness to use for @memcg. For the root cgroup (a cgroup
 * without a parent) this is the global vm_swappiness.
 *
 * NOTE(review): the definition is cut off after the root check in this copy
 * of the file; the return value for non-root groups and the closing brace
 * are missing and must be restored from the authoritative source.
 */
int mem_cgroup_swappiness(struct mem_cgroup *memcg)
{
struct cgroup *cgrp = memcg->css.cgroup;
/* root ? */
if (cgrp->parent == NULL)
return vm_swappiness;

KAMEZAWA Hiroyuki
committed
static void mem_cgroup_start_move(struct mem_cgroup *mem)
{
int cpu;
get_online_cpus();
spin_lock(&mem->pcp_counter_lock);
for_each_online_cpu(cpu)

KAMEZAWA Hiroyuki
committed
per_cpu(mem->stat->count[MEM_CGROUP_ON_MOVE], cpu) += 1;
mem->nocpu_base.count[MEM_CGROUP_ON_MOVE] += 1;
spin_unlock(&mem->pcp_counter_lock);
put_online_cpus();

KAMEZAWA Hiroyuki
committed
synchronize_rcu();
}
static void mem_cgroup_end_move(struct mem_cgroup *mem)
{
int cpu;
if (!mem)
return;
get_online_cpus();
spin_lock(&mem->pcp_counter_lock);
for_each_online_cpu(cpu)

KAMEZAWA Hiroyuki
committed
per_cpu(mem->stat->count[MEM_CGROUP_ON_MOVE], cpu) -= 1;
mem->nocpu_base.count[MEM_CGROUP_ON_MOVE] -= 1;
spin_unlock(&mem->pcp_counter_lock);
put_online_cpus();

KAMEZAWA Hiroyuki
committed
}
/*
* 2 routines for checking "mem" is under move_account() or not.
*
* mem_cgroup_stealed() - checking a cgroup is mc.from or not. This is used
* for avoiding race in accounting. If true,
* pc->mem_cgroup may be overwritten.
*
* mem_cgroup_under_move() - checking a cgroup is mc.from or mc.to or
* under hierarchy of moving cgroups. This is for
* waiting at high-memory pressure caused by "move".
*/
static bool mem_cgroup_stealed(struct mem_cgroup *mem)
{
VM_BUG_ON(!rcu_read_lock_held());
return this_cpu_read(mem->stat->count[MEM_CGROUP_ON_MOVE]) > 0;
}
static bool mem_cgroup_under_move(struct mem_cgroup *mem)
{
struct mem_cgroup *from;
struct mem_cgroup *to;
/*
* Unlike task_move routines, we access mc.to, mc.from not under
* mutual exclusion by cgroup_mutex. Here, we take spinlock instead.
*/
spin_lock(&mc.lock);
from = mc.from;
to = mc.to;
if (!from)
goto unlock;
if (from == mem || to == mem
|| (mem->use_hierarchy && css_is_ancestor(&from->css, &mem->css))
|| (mem->use_hierarchy && css_is_ancestor(&to->css, &mem->css)))
ret = true;
unlock:
spin_unlock(&mc.lock);
return ret;
}
/*
 * If another task is moving charges and @mem is involved in that move,
 * sleep on mc.waitq and return true. Returns false when there was nothing
 * to wait for.
 */
static bool mem_cgroup_wait_acct_move(struct mem_cgroup *mem)
{
	DEFINE_WAIT(wait);

	/* No move in flight, or we are the mover ourselves. */
	if (!mc.moving_task || current == mc.moving_task)
		return false;
	if (!mem_cgroup_under_move(mem))
		return false;

	prepare_to_wait(&mc.waitq, &wait, TASK_INTERRUPTIBLE);
	/* moving charge context might have finished. */
	if (mc.moving_task)
		schedule();
	finish_wait(&mc.waitq, &wait);
	return true;
}
* mem_cgroup_print_oom_info: Called from OOM with tasklist_lock held in read mode.
* @memcg: The memory cgroup that went over limit
* @p: Task that is going to be killed
*
* NOTE: @memcg and @p's mem_cgroup can be different when hierarchy is
* enabled
*/
void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
{
struct cgroup *task_cgrp;
struct cgroup *mem_cgrp;
/*
* Need a buffer in BSS, can't rely on allocations. The code relies
* on the assumption that OOM is serialized for memory controller.
* If this assumption is broken, revisit this code.
*/
static char memcg_name[PATH_MAX];
int ret;
if (!memcg || !p)
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
return;
rcu_read_lock();
mem_cgrp = memcg->css.cgroup;
task_cgrp = task_cgroup(p, mem_cgroup_subsys_id);
ret = cgroup_path(task_cgrp, memcg_name, PATH_MAX);
if (ret < 0) {
/*
* Unfortunately, we are unable to convert to a useful name
* But we'll still print out the usage information
*/
rcu_read_unlock();
goto done;
}
rcu_read_unlock();
printk(KERN_INFO "Task in %s killed", memcg_name);
rcu_read_lock();
ret = cgroup_path(mem_cgrp, memcg_name, PATH_MAX);
if (ret < 0) {
rcu_read_unlock();
goto done;
}
rcu_read_unlock();
/*
* Continues from above, so we don't need an KERN_ level
*/
printk(KERN_CONT " as a result of limit of %s\n", memcg_name);
done:
printk(KERN_INFO "memory: usage %llukB, limit %llukB, failcnt %llu\n",
res_counter_read_u64(&memcg->res, RES_USAGE) >> 10,
res_counter_read_u64(&memcg->res, RES_LIMIT) >> 10,
res_counter_read_u64(&memcg->res, RES_FAILCNT));
printk(KERN_INFO "memory+swap: usage %llukB, limit %llukB, "
"failcnt %llu\n",
res_counter_read_u64(&memcg->memsw, RES_USAGE) >> 10,
res_counter_read_u64(&memcg->memsw, RES_LIMIT) >> 10,
res_counter_read_u64(&memcg->memsw, RES_FAILCNT));
}
/*
* This function returns the number of memcg under hierarchy tree. Returns
* 1(self count) if no children.
*/
static int mem_cgroup_count_children(struct mem_cgroup *mem)
{
int num = 0;
struct mem_cgroup *iter;
for_each_mem_cgroup_tree(iter, mem)
num++;
return num;
}
/*
* Return the memory (and swap, if configured) limit for a memcg.
*/
u64 mem_cgroup_get_limit(struct mem_cgroup *memcg)
{
u64 limit;
u64 memsw;
limit = res_counter_read_u64(&memcg->res, RES_LIMIT);
limit += total_swap_pages << PAGE_SHIFT;
memsw = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
/*
* If memsw is finite and limits the amount of swap space available
* to this memcg, return that limit.
*/
return min(limit, memsw);
}
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
* Visit the first child (need not be the first child as per the ordering
* of the cgroup list, since we track last_scanned_child) of @mem and use
* that to reclaim free pages from.
*/
static struct mem_cgroup *
mem_cgroup_select_victim(struct mem_cgroup *root_mem)
{
struct mem_cgroup *ret = NULL;
struct cgroup_subsys_state *css;
int nextid, found;
if (!root_mem->use_hierarchy) {
css_get(&root_mem->css);
ret = root_mem;
}
while (!ret) {
rcu_read_lock();
nextid = root_mem->last_scanned_child + 1;
css = css_get_next(&mem_cgroup_subsys, nextid, &root_mem->css,
&found);
if (css && css_tryget(css))
ret = container_of(css, struct mem_cgroup, css);
rcu_read_unlock();
/* Updates scanning parameter */
if (!css) {
/* this means start scan from ID:1 */
root_mem->last_scanned_child = 0;
} else
root_mem->last_scanned_child = found;
}
return ret;
}
/**
 * test_mem_cgroup_node_reclaimable
 * @mem: the target memcg
 * @nid: the node ID to be checked.
 * @noswap : specify true here if the user wants file only information.
 *
 * This function returns whether the specified memcg contains any
 * reclaimable pages on a node. Returns true if there are any reclaimable
 * pages in the node.
 */
static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *mem,
		int nid, bool noswap)
{
	/* File pages count as reclaimable regardless of swap. */
	if (mem_cgroup_node_nr_lru_pages(mem, nid, LRU_ALL_FILE))
		return true;

	/* Anon pages only count when swap is allowed and swap space exists. */
	if (noswap || !total_swap_pages)
		return false;

	return mem_cgroup_node_nr_lru_pages(mem, nid, LRU_ALL_ANON) != 0;
}
#if MAX_NUMNODES > 1
/*
 * Always updating the nodemask is not very good - even if we have an empty
 * list or the wrong list here, we can start from some node and traverse all
 * nodes based on the zonelist. So update the list loosely, only when
 * numainfo_events has been raised and no other update is in flight.
 */
static void mem_cgroup_may_update_nodemask(struct mem_cgroup *mem)
{
	int nid;

	/*
	 * numainfo_events > 0 means there was at least NUMAINFO_EVENTS_TARGET
	 * pagein/pageout changes since the last update.
	 */
	if (!atomic_read(&mem->numainfo_events))
		return;
	/* Only the first caller to raise numainfo_updating proceeds. */
	if (atomic_inc_return(&mem->numainfo_updating) > 1)
		return;

	/* make a nodemask where this memcg uses memory from */
	mem->scan_nodes = node_states[N_HIGH_MEMORY];

	for_each_node_mask(nid, node_states[N_HIGH_MEMORY]) {
		if (!test_mem_cgroup_node_reclaimable(mem, nid, false))
			node_clear(nid, mem->scan_nodes);
	}

	/* Reset the counters once, after the whole mask has been rebuilt. */
	atomic_set(&mem->numainfo_events, 0);
	atomic_set(&mem->numainfo_updating, 0);
}
/*
 * Pick the node to start reclaim from. All we need is to bring the usage
 * counter down, so any starting node is acceptable. Reclaiming from the
 * current node has pros and cons:
 *
 * Freeing from the current node frees memory on a node we will use or have
 * used, which may hurt the LRU, and several threads hitting their limits
 * would contend on that node. Freeing from a remote node costs more because
 * of memory latency.
 *
 * For now this is plain round-robin over mem->scan_nodes. A better
 * algorithm is welcome.
 */
int mem_cgroup_select_victim_node(struct mem_cgroup *mem)
{
	int nid;

	mem_cgroup_may_update_nodemask(mem);

	/* Advance from the last scanned node, wrapping to the first one. */
	nid = next_node(mem->last_scanned_node, mem->scan_nodes);
	if (nid == MAX_NUMNODES)
		nid = first_node(mem->scan_nodes);

	/*
	 * This is called when the limit is hit, not when pages are added to
	 * the LRU. It is possible that no LRU holds pages, because all pages
	 * are UNEVICTABLE or the memcg is too small and none of its pages are
	 * on an LRU. In that case, fall back to the current node.
	 */
	if (unlikely(nid == MAX_NUMNODES))
		nid = numa_node_id();

	mem->last_scanned_node = nid;
	return nid;
}
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
/*
* Check all nodes whether it contains reclaimable pages or not.
* For quick scan, we make use of scan_nodes. This will allow us to skip
* unused nodes. But scan_nodes is lazily updated and may not cotain
* enough new information. We need to do double check.
*/
bool mem_cgroup_reclaimable(struct mem_cgroup *mem, bool noswap)
{
int nid;
/*
* quick check...making use of scan_node.
* We can skip unused nodes.
*/
if (!nodes_empty(mem->scan_nodes)) {
for (nid = first_node(mem->scan_nodes);
nid < MAX_NUMNODES;
nid = next_node(nid, mem->scan_nodes)) {
if (test_mem_cgroup_node_reclaimable(mem, nid, noswap))
return true;
}
}
/*
* Check rest of nodes.
*/
for_each_node_state(nid, N_HIGH_MEMORY) {
if (node_isset(nid, mem->scan_nodes))
continue;
if (test_mem_cgroup_node_reclaimable(mem, nid, noswap))
return true;
}
return false;
}
#else
int mem_cgroup_select_victim_node(struct mem_cgroup *mem)
{
return 0;
}
bool mem_cgroup_reclaimable(struct mem_cgroup *mem, bool noswap)
{
return test_mem_cgroup_node_reclaimable(mem, 0, noswap);
}
/*
 * Scan the hierarchy if needed to reclaim memory. We remember the last child
 * we reclaimed from, so that we don't end up penalizing one child extensively
 * based on its position in the children list.
 *
 * root_mem is the original ancestor that we have been reclaiming from.
 *
 * We give up and return to the caller when we visit root_mem twice.
 * (other groups can be removed while we're walking....)
 *
 * If shrink==true, to avoid freeing too much, this returns immediately.
 *
 * NOTE(review): this definition is damaged in this copy of the file: the
 * opening brace, the enclosing loop and the statement that selects `victim`
 * are missing, `zone` and `gfp_mask` are used but never declared, several
 * branches have lost their bodies and the function has no closing brace.
 * Restore it from the authoritative source before relying on it.
 */
static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
unsigned long reclaim_options,
unsigned long *total_scanned)
struct mem_cgroup *victim;
int ret, total = 0;
int loop = 0;
/* Decode the individual behaviour flags from reclaim_options. */
bool noswap = reclaim_options & MEM_CGROUP_RECLAIM_NOSWAP;
bool shrink = reclaim_options & MEM_CGROUP_RECLAIM_SHRINK;
bool check_soft = reclaim_options & MEM_CGROUP_RECLAIM_SOFT;
unsigned long excess;
unsigned long nr_scanned;
/* Soft-limit excess of the root, converted to pages. */
excess = res_counter_soft_limit_excess(&root_mem->res) >> PAGE_SHIFT;
/* If memsw_is_minimum==1, swap-out is of-no-use. */
if (!check_soft && root_mem->memsw_is_minimum)
noswap = true;
/* NOTE(review): `victim` is read here without any visible assignment. */
if (victim == root_mem) {
/*
* We are not draining per cpu cached charges during
* soft limit reclaim because global reclaim doesn't
* care about charges. It tries to free some memory and
* charges will not give any.
*/
if (!check_soft && loop >= 1)
drain_all_stock_async(root_mem);
if (loop >= 2) {
/*
* If we have not been able to reclaim
* anything, it might because there are
* no reclaimable pages under this hierarchy
*/
if (!check_soft || !total) {
css_put(&victim->css);
break;
}
/*
* excess >> 2 is not to excessive so as to
* reclaim too much, nor too less that we keep
* coming back to reclaim from this cgroup
*/
if (total >= (excess >> 2) ||
(loop > MEM_CGROUP_MAX_RECLAIM_LOOPS)) {
css_put(&victim->css);
break;
}
}
}
if (!mem_cgroup_reclaimable(victim, noswap)) {
/* this cgroup's local usage == 0 */
css_put(&victim->css);
/* NOTE(review): the rest of this branch and its closing brace are missing. */
if (check_soft) {
ret = mem_cgroup_shrink_node_zone(victim, gfp_mask,
noswap, zone, &nr_scanned);
*total_scanned += nr_scanned;
} else
ret = try_to_free_mem_cgroup_pages(victim, gfp_mask,
noswap);
/*
* At shrinking usage, we can't check we should stop here or
* reclaim more. It's depends on callers. last_scanned_child
* will work enough for keeping fairness under tree.
*/
if (shrink)
return ret;
/* NOTE(review): the bodies of the two conditions below are missing. */
if (!res_counter_soft_limit_excess(&root_mem->res))
} else if (mem_cgroup_margin(root_mem))
/*
 * Check OOM-Killer is already running under our hierarchy.
 * If someone is running, return false; otherwise mark every memcg in the
 * hierarchy under @mem as OOM-locked and return true.
 * Has to be called with memcg_oom_mutex
 */
static bool mem_cgroup_oom_lock(struct mem_cgroup *mem)
{
	struct mem_cgroup *iter, *failed = NULL;
	bool cond = true;

	for_each_mem_cgroup_tree_cond(iter, mem, cond) {
		/*
		 * Look at the previous state *before* taking the lock;
		 * sampling oom_lock after setting it always reads true
		 * and hides an existing holder.
		 */
		if (iter->oom_lock) {
			/*
			 * this subtree of our hierarchy is already locked
			 * so we cannot give a lock.
			 */
			failed = iter;
			cond = false;
		} else
			iter->oom_lock = true;
	}

	if (!failed)
		return true;

	/*
	 * OK, we failed to lock the whole subtree so we have to clean up
	 * what we set up to the failing subtree
	 */
	cond = true;
	for_each_mem_cgroup_tree_cond(iter, mem, cond) {
		if (iter == failed) {
			/* Stop here: this lock belongs to somebody else. */
			cond = false;
			continue;
		}
		iter->oom_lock = false;
	}
	return false;
}
/*
* Has to be called with memcg_oom_mutex
*/
static int mem_cgroup_oom_unlock(struct mem_cgroup *mem)
for_each_mem_cgroup_tree(iter, mem)
iter->oom_lock = false;
return 0;
}
static void mem_cgroup_mark_under_oom(struct mem_cgroup *mem)
{
struct mem_cgroup *iter;
for_each_mem_cgroup_tree(iter, mem)
atomic_inc(&iter->under_oom);
}
static void mem_cgroup_unmark_under_oom(struct mem_cgroup *mem)
{
struct mem_cgroup *iter;
/*
* When a new child is created while the hierarchy is under oom,
* mem_cgroup_oom_lock() may not be called. We have to use
* atomic_add_unless() here.
*/
atomic_add_unless(&iter->under_oom, -1, 0);
/* Must be held around mem_cgroup_oom_lock() and mem_cgroup_oom_unlock(). */
static DEFINE_MUTEX(memcg_oom_mutex);
/* Wait queue used by the memcg OOM path; woken through memcg_wakeup_oom(). */
static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq);
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
struct oom_wait_info {
struct mem_cgroup *mem;
wait_queue_t wait;
};
/*
 * Wake callback for entries on memcg_oom_waitq. @arg is the memcg the
 * wakeup is issued for. The waiter is woken only when its memcg is that
 * memcg or, with use_hierarchy set on both, when css_is_ancestor() relates
 * the two in either direction. Returns 0 when the waiter is left asleep.
 */
static int memcg_oom_wake_function(wait_queue_t *wait,
	unsigned mode, int sync, void *arg)
{
	struct mem_cgroup *waker = (struct mem_cgroup *)arg;
	struct oom_wait_info *info =
		container_of(wait, struct oom_wait_info, wait);
	struct mem_cgroup *waiter = info->mem;

	if (waiter != waker) {
		/* if no hierarchy, no match */
		if (!waiter->use_hierarchy || !waker->use_hierarchy)
			return 0;
		/*
		 * Both memcgs are stable under us, so css_is_ancestor() can
		 * be used without taking care of RCU.
		 */
		if (!css_is_ancestor(&waiter->css, &waker->css) &&
		    !css_is_ancestor(&waker->css, &waiter->css))
			return 0;
	}

	return autoremove_wake_function(wait, mode, sync, arg);
}
/* Wake the waiters on memcg_oom_waitq that match @mem. */
static void memcg_wakeup_oom(struct mem_cgroup *mem)
{
	/* The memcg is handed to the wake function as the filter key. */
	void *key = mem;

	__wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, key);
}
/* Wake OOM waiters of @mem, but only if @mem is currently marked under OOM. */
static void memcg_oom_recover(struct mem_cgroup *mem)
{
	if (!mem)
		return;
	if (atomic_read(&mem->under_oom))
		memcg_wakeup_oom(mem);
}
/*
 * Try to call the OOM killer. Returns false if we should exit the
 * memory-reclaim loop.
 *
 * NOTE(review): this definition is damaged in this copy of the file: the
 * opening and closing braces and the declarations of `owait`, `locked` and
 * `need_to_kill` are missing, `if (locked)` and `if (need_to_kill) {` have
 * lost their bodies / else branch (finish_wait() appears twice in a row),
 * and no mutex_unlock() for memcg_oom_mutex is visible. Restore it from the
 * authoritative source before relying on it.
 */
bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask)
/* Set up a wait entry that is filtered by memcg_oom_wake_function(). */
owait.mem = mem;
owait.wait.flags = 0;
owait.wait.func = memcg_oom_wake_function;
owait.wait.private = current;
INIT_LIST_HEAD(&owait.wait.task_list);
mem_cgroup_mark_under_oom(mem);
/* At first, try to OOM lock hierarchy under mem.*/
mutex_lock(&memcg_oom_mutex);
locked = mem_cgroup_oom_lock(mem);
/*
* Even if signal_pending(), we can't quit charge() loop without
* accounting. So, UNINTERRUPTIBLE is appropriate. But SIGKILL
* under OOM is always welcomed, use TASK_KILLABLE here.
*/
prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE);
/* No kill if the lock was not obtained or OOM kill is disabled for mem. */
if (!locked || mem->oom_kill_disable)
need_to_kill = false;
if (locked)
if (need_to_kill) {
finish_wait(&memcg_oom_waitq, &owait.wait);
finish_wait(&memcg_oom_waitq, &owait.wait);
if (locked)
mem_cgroup_oom_unlock(mem);
mem_cgroup_unmark_under_oom(mem);
if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current))
return false;
/* Give chance to dying process */
schedule_timeout(1);
return true;
/*
* Currently used to update mapped file statistics, but the routine can be
* generalized to update other statistics as well.

KAMEZAWA Hiroyuki
committed
*
* Notes: Race condition
*
* We usually use page_cgroup_lock() for accessing page_cgroup member but
* it tends to be costly. But considering some conditions, we doesn't need
* to do so _always_.
*
* Considering "charge", lock_page_cgroup() is not required because all
* file-stat operations happen after a page is attached to radix-tree. There
* are no race with "charge".
*
* Considering "uncharge", we know that memcg doesn't clear pc->mem_cgroup
* at "uncharge" intentionally. So, we always see valid pc->mem_cgroup even
* if there are race with "uncharge". Statistics itself is properly handled
* by flags.
*
* Considering "move", this is an only case we see a race. To make the race
* small, we check MEM_CGROUP_ON_MOVE percpu value and detect there are
* possibility of race condition. If there is, we take a lock.
void mem_cgroup_update_page_stat(struct page *page,
enum mem_cgroup_page_stat_item idx, int val)

KAMEZAWA Hiroyuki
committed
struct page_cgroup *pc = lookup_page_cgroup(page);
bool need_unlock = false;
unsigned long uninitialized_var(flags);
if (unlikely(!pc))
return;

KAMEZAWA Hiroyuki
committed
rcu_read_lock();

KAMEZAWA Hiroyuki
committed
if (unlikely(!mem || !PageCgroupUsed(pc)))
goto out;
/* pc->mem_cgroup is unstable ? */
if (unlikely(mem_cgroup_stealed(mem)) || PageTransHuge(page)) {

KAMEZAWA Hiroyuki
committed
/* take a lock against to access pc->mem_cgroup */
move_lock_page_cgroup(pc, &flags);

KAMEZAWA Hiroyuki
committed
need_unlock = true;
mem = pc->mem_cgroup;
if (!mem || !PageCgroupUsed(pc))
goto out;
}
case MEMCG_NR_FILE_MAPPED:
if (val > 0)
SetPageCgroupFileMapped(pc);
else if (!page_mapped(page))
ClearPageCgroupFileMapped(pc);
idx = MEM_CGROUP_STAT_FILE_MAPPED;
break;
default:
BUG();
this_cpu_add(mem->stat->count[idx], val);

KAMEZAWA Hiroyuki
committed
out:
if (unlikely(need_unlock))
move_unlock_page_cgroup(pc, &flags);

KAMEZAWA Hiroyuki
committed
rcu_read_unlock();
return;
EXPORT_SYMBOL(mem_cgroup_update_page_stat);
/*
 * Size of the first charge attempt. "32" comes from vmscan.c's magic value.
 * TODO: a larger value may be needed on very large machines.
 */
#define CHARGE_BATCH 32U
/* Per-CPU stock of charges; instantiated per CPU as memcg_stock. */
struct memcg_stock_pcp {
struct mem_cgroup *cached; /* memcg the stock belongs to; never the root cgroup */
unsigned int nr_pages; /* presumably the number of pages stocked - TODO confirm */
struct work_struct work; /* NOTE(review): presumably used to drain the stock - confirm */
unsigned long flags; /* state flags, see FLUSHING_CACHED_CHARGE */
/* NOTE(review): looks like a bit number within flags - confirm against users */
#define FLUSHING_CACHED_CHARGE (0)
};
/* Per-CPU charge stock; consume_stock() accesses it with get_cpu_var(). */
static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock);
/* NOTE(review): presumably serializes draining of the per-CPU stocks - confirm against users. */
static DEFINE_MUTEX(percpu_charge_mutex);
* Try to consume stocked charge on this cpu. If success, one page is consumed
* from local stock and true is returned. If the stock is 0 or charges from a
* cgroup which is not current target, returns false. This stock will be
* refilled.
*/
static bool consume_stock(struct mem_cgroup *mem)
{
struct memcg_stock_pcp *stock;
bool ret = true;
stock = &get_cpu_var(memcg_stock);