Newer
Older

KAMEZAWA Hiroyuki
committed
scan = 0;
list_for_each_entry_safe_reverse(pc, tmp, src, lru) {

KAMEZAWA Hiroyuki
committed
break;
if (unlikely(!PageCgroupUsed(pc)))
continue;

KAMEZAWA Hiroyuki
committed
continue;
ret = __isolate_lru_page(page, mode, file);
switch (ret) {
case 0:
list_move(&page->lru, dst);
break;
case -EBUSY:
/* we don't affect global LRU but rotate in our LRU */
mem_cgroup_rotate_lru_list(page, page_lru(page));
break;
default:
break;
}
}
*scanned = scan;
return nr_taken;
}
#define mem_cgroup_from_res_counter(counter, member) \
container_of(counter, struct mem_cgroup, member)
static bool mem_cgroup_check_under_limit(struct mem_cgroup *mem)
{
if (do_swap_account) {
if (res_counter_check_under_limit(&mem->res) &&
res_counter_check_under_limit(&mem->memsw))
return true;
} else
if (res_counter_check_under_limit(&mem->res))
return true;
return false;
}
static unsigned int get_swappiness(struct mem_cgroup *memcg)
{
struct cgroup *cgrp = memcg->css.cgroup;
unsigned int swappiness;
/* root ? */
if (cgrp->parent == NULL)
return vm_swappiness;
spin_lock(&memcg->reclaim_param_lock);
swappiness = memcg->swappiness;
spin_unlock(&memcg->reclaim_param_lock);
return swappiness;
}
static int mem_cgroup_count_children_cb(struct mem_cgroup *mem, void *data)
{
int *val = data;
(*val)++;
return 0;
}
* mem_cgroup_print_oom_info: Called from OOM with tasklist_lock held in read mode.
* @memcg: The memory cgroup that went over limit
* @p: Task that is going to be killed
*
* NOTE: @memcg and @p's mem_cgroup can be different when hierarchy is
* enabled
*/
void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
{
struct cgroup *task_cgrp;
struct cgroup *mem_cgrp;
/*
* Need a buffer in BSS, can't rely on allocations. The code relies
* on the assumption that OOM is serialized for memory controller.
* If this assumption is broken, revisit this code.
*/
static char memcg_name[PATH_MAX];
int ret;
if (!memcg || !p)
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
return;
rcu_read_lock();
mem_cgrp = memcg->css.cgroup;
task_cgrp = task_cgroup(p, mem_cgroup_subsys_id);
ret = cgroup_path(task_cgrp, memcg_name, PATH_MAX);
if (ret < 0) {
/*
* Unfortunately, we are unable to convert to a useful name
* But we'll still print out the usage information
*/
rcu_read_unlock();
goto done;
}
rcu_read_unlock();
printk(KERN_INFO "Task in %s killed", memcg_name);
rcu_read_lock();
ret = cgroup_path(mem_cgrp, memcg_name, PATH_MAX);
if (ret < 0) {
rcu_read_unlock();
goto done;
}
rcu_read_unlock();
/*
* Continues from above, so we don't need an KERN_ level
*/
printk(KERN_CONT " as a result of limit of %s\n", memcg_name);
done:
printk(KERN_INFO "memory: usage %llukB, limit %llukB, failcnt %llu\n",
res_counter_read_u64(&memcg->res, RES_USAGE) >> 10,
res_counter_read_u64(&memcg->res, RES_LIMIT) >> 10,
res_counter_read_u64(&memcg->res, RES_FAILCNT));
printk(KERN_INFO "memory+swap: usage %llukB, limit %llukB, "
"failcnt %llu\n",
res_counter_read_u64(&memcg->memsw, RES_USAGE) >> 10,
res_counter_read_u64(&memcg->memsw, RES_LIMIT) >> 10,
res_counter_read_u64(&memcg->memsw, RES_FAILCNT));
}
/*
* This function returns the number of memcg under hierarchy tree. Returns
* 1(self count) if no children.
*/
static int mem_cgroup_count_children(struct mem_cgroup *mem)
{
int num = 0;
mem_cgroup_walk_tree(mem, &num, mem_cgroup_count_children_cb);
return num;
}
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
* Visit the first child (need not be the first child as per the ordering
* of the cgroup list, since we track last_scanned_child) of @mem and use
* that to reclaim free pages from.
*/
static struct mem_cgroup *
mem_cgroup_select_victim(struct mem_cgroup *root_mem)
{
struct mem_cgroup *ret = NULL;
struct cgroup_subsys_state *css;
int nextid, found;
if (!root_mem->use_hierarchy) {
css_get(&root_mem->css);
ret = root_mem;
}
while (!ret) {
rcu_read_lock();
nextid = root_mem->last_scanned_child + 1;
css = css_get_next(&mem_cgroup_subsys, nextid, &root_mem->css,
&found);
if (css && css_tryget(css))
ret = container_of(css, struct mem_cgroup, css);
rcu_read_unlock();
/* Updates scanning parameter */
spin_lock(&root_mem->reclaim_param_lock);
if (!css) {
/* this means start scan from ID:1 */
root_mem->last_scanned_child = 0;
} else
root_mem->last_scanned_child = found;
spin_unlock(&root_mem->reclaim_param_lock);
}
return ret;
}
/*
* Scan the hierarchy if needed to reclaim memory. We remember the last child
* we reclaimed from, so that we don't end up penalizing one child extensively
* based on its position in the children list.
*
* root_mem is the original ancestor that we've been reclaim from.
*
* We give up and return to the caller when we visit root_mem twice.
* (other groups can be removed while we're walking....)
*
* If shrink==true, for avoiding to free too much, this returns immedieately.
*/
static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
gfp_t gfp_mask,
unsigned long reclaim_options)
struct mem_cgroup *victim;
int ret, total = 0;
int loop = 0;
bool noswap = reclaim_options & MEM_CGROUP_RECLAIM_NOSWAP;
bool shrink = reclaim_options & MEM_CGROUP_RECLAIM_SHRINK;
bool check_soft = reclaim_options & MEM_CGROUP_RECLAIM_SOFT;
unsigned long excess = mem_cgroup_get_excess(root_mem);
/* If memsw_is_minimum==1, swap-out is of-no-use. */
if (root_mem->memsw_is_minimum)
noswap = true;
if (victim == root_mem) {
if (loop >= 1)
drain_all_stock_async();
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
if (loop >= 2) {
/*
* If we have not been able to reclaim
* anything, it might because there are
* no reclaimable pages under this hierarchy
*/
if (!check_soft || !total) {
css_put(&victim->css);
break;
}
/*
* We want to do more targetted reclaim.
* excess >> 2 is not to excessive so as to
* reclaim too much, nor too less that we keep
* coming back to reclaim from this cgroup
*/
if (total >= (excess >> 2) ||
(loop > MEM_CGROUP_MAX_RECLAIM_LOOPS)) {
css_put(&victim->css);
break;
}
}
}
if (!mem_cgroup_local_usage(&victim->stat)) {
/* this cgroup's local usage == 0 */
css_put(&victim->css);
if (check_soft)
ret = mem_cgroup_shrink_node_zone(victim, gfp_mask,
noswap, get_swappiness(victim), zone,
zone->zone_pgdat->node_id);
else
ret = try_to_free_mem_cgroup_pages(victim, gfp_mask,
noswap, get_swappiness(victim));
/*
* At shrinking usage, we can't check we should stop here or
* reclaim more. It's depends on callers. last_scanned_child
* will work enough for keeping fairness under tree.
*/
if (shrink)
return ret;
if (check_soft) {
if (res_counter_check_under_soft_limit(&root_mem->res))
return total;
} else if (mem_cgroup_check_under_limit(root_mem))
bool mem_cgroup_oom_called(struct task_struct *task)
{
bool ret = false;
struct mem_cgroup *mem;
struct mm_struct *mm;
rcu_read_lock();
mm = task->mm;
if (!mm)
mm = &init_mm;
mem = mem_cgroup_from_task(rcu_dereference(mm->owner));
if (mem && time_before(jiffies, mem->last_oom_jiffies + HZ/10))
ret = true;
rcu_read_unlock();
return ret;
}
static int record_last_oom_cb(struct mem_cgroup *mem, void *data)
{
mem->last_oom_jiffies = jiffies;
return 0;
}
static void record_last_oom(struct mem_cgroup *mem)
{
mem_cgroup_walk_tree(mem, NULL, record_last_oom_cb);
}
/*
* Currently used to update mapped file statistics, but the routine can be
* generalized to update other statistics as well.
*/
void mem_cgroup_update_file_mapped(struct page *page, int val)
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
{
struct mem_cgroup *mem;
struct mem_cgroup_stat *stat;
struct mem_cgroup_stat_cpu *cpustat;
int cpu;
struct page_cgroup *pc;
pc = lookup_page_cgroup(page);
if (unlikely(!pc))
return;
lock_page_cgroup(pc);
mem = pc->mem_cgroup;
if (!mem)
goto done;
if (!PageCgroupUsed(pc))
goto done;
/*
* Preemption is already disabled, we don't need get_cpu()
*/
cpu = smp_processor_id();
stat = &mem->stat;
cpustat = &stat->cpustat[cpu];
__mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_FILE_MAPPED, val);
done:
unlock_page_cgroup(pc);
}
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
/*
* size of first charge trial. "32" comes from vmscan.c's magic value.
* TODO: maybe necessary to use big numbers in big irons.
*/
#define CHARGE_SIZE (32 * PAGE_SIZE)
struct memcg_stock_pcp {
struct mem_cgroup *cached; /* this never be root cgroup */
int charge;
struct work_struct work;
};
static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock);
static atomic_t memcg_drain_count;
/*
* Try to consume stocked charge on this cpu. If success, PAGE_SIZE is consumed
* from local stock and true is returned. If the stock is 0 or charges from a
* cgroup which is not current target, returns false. This stock will be
* refilled.
*/
static bool consume_stock(struct mem_cgroup *mem)
{
struct memcg_stock_pcp *stock;
bool ret = true;
stock = &get_cpu_var(memcg_stock);
if (mem == stock->cached && stock->charge)
stock->charge -= PAGE_SIZE;
else /* need to call res_counter_charge */
ret = false;
put_cpu_var(memcg_stock);
return ret;
}
/*
* Returns stocks cached in percpu to res_counter and reset cached information.
*/
static void drain_stock(struct memcg_stock_pcp *stock)
{
struct mem_cgroup *old = stock->cached;
if (stock->charge) {
res_counter_uncharge(&old->res, stock->charge);
if (do_swap_account)
res_counter_uncharge(&old->memsw, stock->charge);
}
stock->cached = NULL;
stock->charge = 0;
}
/*
* This must be called under preempt disabled or must be called by
* a thread which is pinned to local cpu.
*/
static void drain_local_stock(struct work_struct *dummy)
{
struct memcg_stock_pcp *stock = &__get_cpu_var(memcg_stock);
drain_stock(stock);
}
/*
* Cache charges(val) which is from res_counter, to local per_cpu area.
* This will be consumed by consumt_stock() function, later.
*/
static void refill_stock(struct mem_cgroup *mem, int val)
{
struct memcg_stock_pcp *stock = &get_cpu_var(memcg_stock);
if (stock->cached != mem) { /* reset if necessary */
drain_stock(stock);
stock->cached = mem;
}
stock->charge += val;
put_cpu_var(memcg_stock);
}
/*
* Tries to drain stocked charges in other cpus. This function is asynchronous
* and just put a work per cpu for draining localy on each cpu. Caller can
* expects some charges will be back to res_counter later but cannot wait for
* it.
*/
static void drain_all_stock_async(void)
{
int cpu;
/* This function is for scheduling "drain" in asynchronous way.
* The result of "drain" is not directly handled by callers. Then,
* if someone is calling drain, we don't have to call drain more.
* Anyway, WORK_STRUCT_PENDING check in queue_work_on() will catch if
* there is a race. We just do loose check here.
*/
if (atomic_read(&memcg_drain_count))
return;
/* Notify other cpus that system-wide "drain" is running */
atomic_inc(&memcg_drain_count);
get_online_cpus();
for_each_online_cpu(cpu) {
struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
schedule_work_on(cpu, &stock->work);
}
put_online_cpus();
atomic_dec(&memcg_drain_count);
/* We don't wait for flush_work */
}
/* This is a synchronous drain interface. */
static void drain_all_stock_sync(void)
{
/* called when force_empty is called */
atomic_inc(&memcg_drain_count);
schedule_on_each_cpu(drain_local_stock);
atomic_dec(&memcg_drain_count);
}
static int __cpuinit memcg_stock_cpu_callback(struct notifier_block *nb,
unsigned long action,
void *hcpu)
{
int cpu = (unsigned long)hcpu;
struct memcg_stock_pcp *stock;
if (action != CPU_DEAD)
return NOTIFY_OK;
stock = &per_cpu(memcg_stock, cpu);
drain_stock(stock);
return NOTIFY_OK;
}
/*
* Unlike exported interface, "oom" parameter is added. if oom==true,
* oom-killer can be invoked.
static int __mem_cgroup_try_charge(struct mm_struct *mm,
gfp_t gfp_mask, struct mem_cgroup **memcg,
bool oom, struct page *page)

KAMEZAWA Hiroyuki
committed
struct mem_cgroup *mem, *mem_over_limit;
int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;

KAMEZAWA Hiroyuki
committed
struct res_counter *fail_res;
int csize = CHARGE_SIZE;
if (unlikely(test_thread_flag(TIF_MEMDIE))) {
/* Don't account this! */
*memcg = NULL;
return 0;
}
* We always charge the cgroup the mm_struct belongs to.
* The mm_struct's mem_cgroup changes on task migration if the
* thread group leader migrates. It's possible that mm is not
* set, if so charge the init_mm (happens for pagecache usage).
*/
mem = *memcg;
if (likely(!mem)) {
mem = try_get_mem_cgroup_from_mm(mm);
*memcg = mem;
css_get(&mem->css);
if (unlikely(!mem))
return 0;

Nikanth Karthikesan
committed
VM_BUG_ON(css_is_removed(&mem->css));
if (mem_cgroup_is_root(mem))
goto done;
unsigned long flags = 0;
if (consume_stock(mem))
goto charged;
ret = res_counter_charge(&mem->res, csize, &fail_res);
if (likely(!ret)) {
if (!do_swap_account)
break;
ret = res_counter_charge(&mem->memsw, csize, &fail_res);
if (likely(!ret))
break;
/* mem+swap counter fails */
res_counter_uncharge(&mem->res, csize);
flags |= MEM_CGROUP_RECLAIM_NOSWAP;
mem_over_limit = mem_cgroup_from_res_counter(fail_res,
memsw);
} else
/* mem counter fails */
mem_over_limit = mem_cgroup_from_res_counter(fail_res,
res);
/* reduce request size and retry */
if (csize > PAGE_SIZE) {
csize = PAGE_SIZE;
continue;
}
if (!(gfp_mask & __GFP_WAIT))
goto nomem;
ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, NULL,
gfp_mask, flags);
* try_to_free_mem_cgroup_pages() might not give us a full
* picture of reclaim. Some pages are reclaimed and might be
* moved to swap cache or just unmapped from the cgroup.
* Check the limit again to see if the reclaim reduced the
* current usage of the cgroup before giving up
if (mem_cgroup_check_under_limit(mem_over_limit))
continue;
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
/* try to avoid oom while someone is moving charge */
if (mc.moving_task && current != mc.moving_task) {
struct mem_cgroup *from, *to;
bool do_continue = false;
/*
* There is a small race that "from" or "to" can be
* freed by rmdir, so we use css_tryget().
*/
rcu_read_lock();
from = mc.from;
to = mc.to;
if (from && css_tryget(&from->css)) {
if (mem_over_limit->use_hierarchy)
do_continue = css_is_ancestor(
&from->css,
&mem_over_limit->css);
else
do_continue = (from == mem_over_limit);
css_put(&from->css);
}
if (!do_continue && to && css_tryget(&to->css)) {
if (mem_over_limit->use_hierarchy)
do_continue = css_is_ancestor(
&to->css,
&mem_over_limit->css);
else
do_continue = (to == mem_over_limit);
css_put(&to->css);
}
rcu_read_unlock();
if (do_continue) {
DEFINE_WAIT(wait);
prepare_to_wait(&mc.waitq, &wait,
TASK_INTERRUPTIBLE);
/* moving charge context might have finished. */
if (mc.moving_task)
schedule();
finish_wait(&mc.waitq, &wait);
continue;
}
}
mem_cgroup_out_of_memory(mem_over_limit, gfp_mask);
goto nomem;
if (csize > PAGE_SIZE)
refill_stock(mem, csize - PAGE_SIZE);
charged:

KAMEZAWA Hiroyuki
committed
* Insert ancestor (and ancestor's ancestors), to softlimit RB-tree.
* if they exceeds softlimit.
if (page && mem_cgroup_soft_limit_check(mem))

KAMEZAWA Hiroyuki
committed
mem_cgroup_update_tree(mem, page);
if (mem_cgroup_threshold_check(mem))
mem_cgroup_threshold(mem);
return 0;
nomem:
css_put(&mem->css);
return -ENOMEM;
}
/*
* Somemtimes we have to undo a charge we got by try_charge().
* This function is for that and do uncharge, put css's refcnt.
* gotten by try_charge().
*/
static void __mem_cgroup_cancel_charge(struct mem_cgroup *mem,
unsigned long count)
{
if (!mem_cgroup_is_root(mem)) {
res_counter_uncharge(&mem->res, PAGE_SIZE * count);
res_counter_uncharge(&mem->memsw, PAGE_SIZE * count);
VM_BUG_ON(test_bit(CSS_ROOT, &mem->css.flags));
WARN_ON_ONCE(count > INT_MAX);
__css_put(&mem->css, (int)count);
/* we don't need css_put for root */
}
static void mem_cgroup_cancel_charge(struct mem_cgroup *mem)
{
__mem_cgroup_cancel_charge(mem, 1);
/*
* A helper function to get mem_cgroup from ID. must be called under
* rcu_read_lock(). The caller must check css_is_removed() or some if
* it's concern. (dropping refcnt from swap can be called against removed
* memcg.)
*/
static struct mem_cgroup *mem_cgroup_lookup(unsigned short id)
{
struct cgroup_subsys_state *css;
/* ID 0 is unused ID */
if (!id)
return NULL;
css = css_lookup(&mem_cgroup_subsys, id);
if (!css)
return NULL;
return container_of(css, struct mem_cgroup, css);
}
struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
struct mem_cgroup *mem = NULL;
unsigned short id;
VM_BUG_ON(!PageLocked(page));
pc = lookup_page_cgroup(page);
lock_page_cgroup(pc);
if (PageCgroupUsed(pc)) {
if (mem && !css_tryget(&mem->css))
mem = NULL;
} else if (PageSwapCache(page)) {
ent.val = page_private(page);
id = lookup_swap_cgroup(ent);
rcu_read_lock();
mem = mem_cgroup_lookup(id);
if (mem && !css_tryget(&mem->css))
mem = NULL;
rcu_read_unlock();
unlock_page_cgroup(pc);
* commit a charge got by __mem_cgroup_try_charge() and makes page_cgroup to be
* USED state. If already USED, uncharge and return.
*/
static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
struct page_cgroup *pc,
enum charge_type ctype)
{
/* try_charge() can return NULL to *memcg, taking care of it. */
if (!mem)
return;
lock_page_cgroup(pc);
if (unlikely(PageCgroupUsed(pc))) {
unlock_page_cgroup(pc);
mem_cgroup_cancel_charge(mem);
/*
* We access a page_cgroup asynchronously without lock_page_cgroup().
* Especially when a page_cgroup is taken from a page, pc->mem_cgroup
* is accessed after testing USED bit. To make pc->mem_cgroup visible
* before USED bit, we need memory barrier here.
* See mem_cgroup_add_lru_list(), etc.
*/
switch (ctype) {
case MEM_CGROUP_CHARGE_TYPE_CACHE:
case MEM_CGROUP_CHARGE_TYPE_SHMEM:
SetPageCgroupCache(pc);
SetPageCgroupUsed(pc);
break;
case MEM_CGROUP_CHARGE_TYPE_MAPPED:
ClearPageCgroupCache(pc);
SetPageCgroupUsed(pc);
break;
default:
break;
}
mem_cgroup_charge_statistics(mem, pc, true);
unlock_page_cgroup(pc);
* __mem_cgroup_move_account - move account of the page
* @pc: page_cgroup of the page.
* @from: mem_cgroup which the page is moved from.
* @to: mem_cgroup which the page is moved to. @from != @to.
* @uncharge: whether we should call uncharge and css_put against @from.
*
* The caller must confirm following.
* - page is not on LRU (isolate_page() is useful.)
* - the pc is locked, used, and ->mem_cgroup points to @from.
* This function doesn't do "charge" nor css_get to new cgroup. It should be
* done by a caller(__mem_cgroup_try_charge would be usefull). If @uncharge is
* true, this function does "uncharge" from old cgroup, but it doesn't if
* @uncharge is false, so a caller should do "uncharge".
static void __mem_cgroup_move_account(struct page_cgroup *pc,
struct mem_cgroup *from, struct mem_cgroup *to, bool uncharge)
struct page *page;
int cpu;
struct mem_cgroup_stat *stat;
struct mem_cgroup_stat_cpu *cpustat;
VM_BUG_ON(from == to);
VM_BUG_ON(!PageCgroupLocked(pc));
VM_BUG_ON(!PageCgroupUsed(pc));
VM_BUG_ON(pc->mem_cgroup != from);
if (page_mapped(page) && !PageAnon(page)) {
cpu = smp_processor_id();
/* Update mapped_file data for mem_cgroup "from" */
stat = &from->stat;
cpustat = &stat->cpustat[cpu];
__mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_FILE_MAPPED,
-1);
/* Update mapped_file data for mem_cgroup "to" */
stat = &to->stat;
cpustat = &stat->cpustat[cpu];
__mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_FILE_MAPPED,
mem_cgroup_charge_statistics(from, pc, false);
if (uncharge)
/* This is not "cancel", but cancel_charge does all we need. */
mem_cgroup_cancel_charge(from);
/* caller should have done css_get */
pc->mem_cgroup = to;
mem_cgroup_charge_statistics(to, pc, true);
/*
* We charges against "to" which may not have any tasks. Then, "to"
* can be under rmdir(). But in current implementation, caller of
* this function is just force_empty() and move charge, so it's
* garanteed that "to" is never removed. So, we don't check rmdir
* status here.
}
/*
* check whether the @pc is valid for moving account and call
* __mem_cgroup_move_account()
*/
static int mem_cgroup_move_account(struct page_cgroup *pc,
struct mem_cgroup *from, struct mem_cgroup *to, bool uncharge)
{
int ret = -EINVAL;
lock_page_cgroup(pc);
if (PageCgroupUsed(pc) && pc->mem_cgroup == from) {
__mem_cgroup_move_account(pc, from, to, uncharge);
ret = 0;
}
unlock_page_cgroup(pc);
return ret;
}
/*
* move charges to its parent.
*/
static int mem_cgroup_move_parent(struct page_cgroup *pc,
struct mem_cgroup *child,
gfp_t gfp_mask)
{
struct cgroup *cg = child->css.cgroup;
struct cgroup *pcg = cg->parent;
struct mem_cgroup *parent;
int ret;
/* Is ROOT ? */
if (!pcg)
return -EINVAL;
ret = -EBUSY;
if (!get_page_unless_zero(page))
goto out;
if (isolate_lru_page(page))
goto put;
parent = mem_cgroup_from_cont(pcg);
ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false, page);
if (ret || !parent)
ret = mem_cgroup_move_account(pc, child, parent, true);
if (ret)
mem_cgroup_cancel_charge(parent);
return ret;
}
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
/*
* Charge the memory controller for page usage.
* Return
* 0 if the charge was successful
* < 0 if the cgroup is over its limit
*/
static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
gfp_t gfp_mask, enum charge_type ctype,
struct mem_cgroup *memcg)
{
struct mem_cgroup *mem;
struct page_cgroup *pc;
int ret;
pc = lookup_page_cgroup(page);
/* can happen at boot */
if (unlikely(!pc))
return 0;
prefetchw(pc);
mem = memcg;
ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, true, page);
return ret;
__mem_cgroup_commit_charge(mem, pc, ctype);
int mem_cgroup_newpage_charge(struct page *page,
struct mm_struct *mm, gfp_t gfp_mask)
{
if (PageCompound(page))
return 0;
/*
* If already mapped, we don't have to account.
* If page cache, page->mapping has address_space.
* But page->mapping may have out-of-use anon_vma pointer,
* detecit it by PageAnon() check. newly-mapped-anon's page->mapping
* is NULL.
*/
if (page_mapped(page) || (page->mapping && !PageAnon(page)))
return 0;
if (unlikely(!mm))
mm = &init_mm;
return mem_cgroup_charge_common(page, mm, gfp_mask,
MEM_CGROUP_CHARGE_TYPE_MAPPED, NULL);
}
static void
__mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr,
enum charge_type ctype);
int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
gfp_t gfp_mask)
struct mem_cgroup *mem = NULL;
int ret;
if (PageCompound(page))
return 0;
/*
* Corner case handling. This is called from add_to_page_cache()
* in usual. But some FS (shmem) precharges this page before calling it
* and call add_to_page_cache() with GFP_NOWAIT.
*
* For GFP_NOWAIT case, the page may be pre-charged before calling
* add_to_page_cache(). (See shmem.c) check it here and avoid to call
* charge twice. (It works but has to pay a bit larger cost.)
* And when the page is SwapCache, it should take swap information
* into account. This is under lock_page() now.
*/
if (!(gfp_mask & __GFP_WAIT)) {
struct page_cgroup *pc;
pc = lookup_page_cgroup(page);
if (!pc)
return 0;
lock_page_cgroup(pc);
if (PageCgroupUsed(pc)) {
unlock_page_cgroup(pc);
mm = &init_mm;
if (page_is_file_cache(page))
return mem_cgroup_charge_common(page, mm, gfp_mask,
MEM_CGROUP_CHARGE_TYPE_CACHE, NULL);
/* shmem */
if (PageSwapCache(page)) {
ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &mem);
if (!ret)
__mem_cgroup_commit_charge_swapin(page, mem,
MEM_CGROUP_CHARGE_TYPE_SHMEM);
} else
ret = mem_cgroup_charge_common(page, mm, gfp_mask,
MEM_CGROUP_CHARGE_TYPE_SHMEM, mem);
/*
* While swap-in, try_charge -> commit or cancel, the page is locked.
* And when try_charge() successfully returns, one refcnt to memcg without
* struct page_cgroup is acquired. This refcnt will be consumed by
* "commit()" or removed by "cancel()"
*/
int mem_cgroup_try_charge_swapin(struct mm_struct *mm,
struct page *page,
gfp_t mask, struct mem_cgroup **ptr)
{
struct mem_cgroup *mem;
return 0;
if (!do_swap_account)
goto charge_cur_mm;
/*
* A racing thread's fault, or swapoff, may have already updated
* the pte, and even removed page from swap cache: in those cases
* do_swap_page()'s pte_same() test will fail; but there's also a
* KSM case which does need to charge the page.
mem = try_get_mem_cgroup_from_page(page);