Newer
Older
swap_buffers:
/* Swap primary and spare array */
thresholds->spare = thresholds->primary;
rcu_assign_pointer(thresholds->primary, new);
/* To be sure that nobody uses thresholds */
synchronize_rcu();
mutex_unlock(&memcg->thresholds_lock);
}
4018
4019
4020
4021
4022
4023
4024
4025
4026
4027
4028
4029
4030
4031
4032
4033
4034
4035
4036
4037
4038
4039
4040
4041
4042
static int mem_cgroup_oom_register_event(struct cgroup *cgrp,
struct cftype *cft, struct eventfd_ctx *eventfd, const char *args)
{
struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
struct mem_cgroup_eventfd_list *event;
int type = MEMFILE_TYPE(cft->private);
BUG_ON(type != _OOM_TYPE);
event = kmalloc(sizeof(*event), GFP_KERNEL);
if (!event)
return -ENOMEM;
mutex_lock(&memcg_oom_mutex);
event->eventfd = eventfd;
list_add(&event->list, &memcg->oom_notify);
/* already in OOM ? */
if (atomic_read(&memcg->oom_lock))
eventfd_signal(eventfd, 1);
mutex_unlock(&memcg_oom_mutex);
return 0;
}
/*
 * Remove every OOM listener of the memory cgroup behind @cgrp whose eventfd
 * equals @eventfd, freeing each matching list node.  The list is walked and
 * modified under memcg_oom_mutex.
 */
static void mem_cgroup_oom_unregister_event(struct cgroup *cgrp,
	struct cftype *cft, struct eventfd_ctx *eventfd)
{
	struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
	struct mem_cgroup_eventfd_list *cur, *next;

	/* Any file type other than _OOM_TYPE is treated as a bug. */
	BUG_ON(MEMFILE_TYPE(cft->private) != _OOM_TYPE);

	mutex_lock(&memcg_oom_mutex);

	/* The _safe iterator is needed because entries are unlinked in the loop. */
	list_for_each_entry_safe(cur, next, &memcg->oom_notify, list) {
		if (cur->eventfd != eventfd)
			continue;
		list_del(&cur->list);
		kfree(cur);
	}

	mutex_unlock(&memcg_oom_mutex);
}
4064
4065
4066
4067
4068
4069
4070
4071
4072
4073
4074
4075
4076
4077
4078
4079
4080
4081
4082
4083
4084
4085
4086
4087
4088
4089
4090
4091
4092
4093
4094
4095
4096
4097
/*
 * Report the OOM control state of the memory cgroup behind @cgrp through
 * the map callback @cb:
 *   "oom_kill_disable" - current value of mem->oom_kill_disable
 *   "under_oom"        - 1 if mem->oom_lock is non-zero, else 0
 *
 * Always returns 0.
 */
static int mem_cgroup_oom_control_read(struct cgroup *cgrp,
	struct cftype *cft, struct cgroup_map_cb *cb)
{
	struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);

	cb->fill(cb, "oom_kill_disable", memcg->oom_kill_disable);
	cb->fill(cb, "under_oom", atomic_read(&memcg->oom_lock) ? 1 : 0);

	return 0;
}
/*
 * Set the oom_kill_disable flag of the memory cgroup behind @cgrp to @val.
 *
 * Rejected with -EINVAL when:
 *   - @cgrp is the root cgroup (no parent), or @val is neither 0 nor 1;
 *   - the parent uses hierarchy, or this group uses hierarchy and already
 *     has children.
 *
 * Clearing the flag (val == 0) also calls memcg_oom_recover() on the group.
 * The hierarchy check and the update run under cgroup_lock().
 *
 * Returns 0 on success.
 */
static int mem_cgroup_oom_control_write(struct cgroup *cgrp,
	struct cftype *cft, u64 val)
{
	struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
	struct mem_cgroup *parent_memcg;
	int ret = -EINVAL;

	/* cannot set to root cgroup and only 0 and 1 are allowed */
	if (cgrp->parent == NULL || val > 1)
		return -EINVAL;

	parent_memcg = mem_cgroup_from_cont(cgrp->parent);

	cgroup_lock();

	/* oom-kill-disable is a flag for subhierarchy. */
	if (parent_memcg->use_hierarchy)
		goto unlock;
	if (memcg->use_hierarchy && !list_empty(&cgrp->children))
		goto unlock;

	memcg->oom_kill_disable = val;
	if (val == 0)
		memcg_oom_recover(memcg);
	ret = 0;

unlock:
	cgroup_unlock();
	return ret;
}
static struct cftype mem_cgroup_files[] = {
{
.private = MEMFILE_PRIVATE(_MEM, RES_USAGE),
.read_u64 = mem_cgroup_read,
.register_event = mem_cgroup_usage_register_event,
.unregister_event = mem_cgroup_usage_unregister_event,
{
.name = "max_usage_in_bytes",
.private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE),
.read_u64 = mem_cgroup_read,
},
.private = MEMFILE_PRIVATE(_MEM, RES_LIMIT),

Paul Menage
committed
.write_string = mem_cgroup_write,
.read_u64 = mem_cgroup_read,
{
.name = "soft_limit_in_bytes",
.private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT),
.write_string = mem_cgroup_write,
.read_u64 = mem_cgroup_read,
},
.private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT),
.read_u64 = mem_cgroup_read,
{
.name = "stat",
.read_map = mem_control_stat_show,
{
.name = "force_empty",
.trigger = mem_cgroup_force_empty_write,
},
{
.name = "use_hierarchy",
.write_u64 = mem_cgroup_hierarchy_write,
.read_u64 = mem_cgroup_hierarchy_read,
},
{
.name = "swappiness",
.read_u64 = mem_cgroup_swappiness_read,
.write_u64 = mem_cgroup_swappiness_write,
},
{
.name = "move_charge_at_immigrate",
.read_u64 = mem_cgroup_move_charge_read,
.write_u64 = mem_cgroup_move_charge_write,
},
.read_map = mem_cgroup_oom_control_read,
.write_u64 = mem_cgroup_oom_control_write,
.register_event = mem_cgroup_oom_register_event,
.unregister_event = mem_cgroup_oom_unregister_event,
.private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL),
},
#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
/*
 * Control files for memory+swap accounting.  They are added by
 * register_memsw_files() only when do_swap_account is set.
 *
 * Each entry is tagged _MEMSWAP via MEMFILE_PRIVATE() and reuses the generic
 * mem_cgroup_read/write/reset handlers.
 */
static struct cftype memsw_cgroup_files[] = {
	{
		.name = "memsw.usage_in_bytes",
		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE),
		.read_u64 = mem_cgroup_read,
		.register_event = mem_cgroup_usage_register_event,
		.unregister_event = mem_cgroup_usage_unregister_event,
	},
	{
		.name = "memsw.max_usage_in_bytes",
		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE),
		.trigger = mem_cgroup_reset,
		.read_u64 = mem_cgroup_read,
	},
	{
		.name = "memsw.limit_in_bytes",
		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT),
		.write_string = mem_cgroup_write,
		.read_u64 = mem_cgroup_read,
	},
	{
		.name = "memsw.failcnt",
		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT),
		.trigger = mem_cgroup_reset,
		.read_u64 = mem_cgroup_read,
	},
};
/*
 * Add the memsw.* control files to @cont.
 *
 * Does nothing and returns 0 when swap accounting is not enabled
 * (do_swap_account == 0); otherwise returns the result of
 * cgroup_add_files().
 */
static int register_memsw_files(struct cgroup *cont, struct cgroup_subsys *ss)
{
	if (!do_swap_account)
		return 0;
	return cgroup_add_files(cont, ss, memsw_cgroup_files,
				ARRAY_SIZE(memsw_cgroup_files));
}
#else
/*
 * Variant used when CONFIG_CGROUP_MEM_RES_CTLR_SWAP is not set: there are
 * no memsw.* files to add, so this always reports success.
 */
static int register_memsw_files(struct cgroup *cont, struct cgroup_subsys *ss)
{
	return 0;
}
#endif

KAMEZAWA Hiroyuki
committed
static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)
{
struct mem_cgroup_per_node *pn;

KAMEZAWA Hiroyuki
committed
struct mem_cgroup_per_zone *mz;

KAMEZAWA Hiroyuki
committed
/*
* This routine is called against possible nodes.
* But it's BUG to call kmalloc() against offline node.
*
* TODO: this routine can waste much memory for nodes which will
* never be onlined. It's better to use memory hotplug callback
* function.
*/
if (!node_state(node, N_NORMAL_MEMORY))
tmp = -1;
pn = kzalloc_node(sizeof(*pn), GFP_KERNEL, tmp);

KAMEZAWA Hiroyuki
committed
if (!pn)
return 1;

KAMEZAWA Hiroyuki
committed

KAMEZAWA Hiroyuki
committed
mem->info.nodeinfo[node] = pn;

KAMEZAWA Hiroyuki
committed
for (zone = 0; zone < MAX_NR_ZONES; zone++) {
mz = &pn->zoneinfo[zone];
for_each_lru(l)
INIT_LIST_HEAD(&mz->lists[l]);
mz->on_tree = false;
mz->mem = mem;

KAMEZAWA Hiroyuki
committed
}

KAMEZAWA Hiroyuki
committed
return 0;
}

KAMEZAWA Hiroyuki
committed
/* Release the per-node information of @mem for @node. */
static void free_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)
{
	struct mem_cgroup_per_node *pn = mem->info.nodeinfo[node];

	kfree(pn);
}
/*
 * Allocate a zeroed struct mem_cgroup together with its per-cpu statistics.
 *
 * The structure comes from kzalloc() when it is smaller than a page and from
 * vzalloc() otherwise; __mem_cgroup_free() applies the same size test to
 * pick the matching release function.
 *
 * Returns the new object, or NULL if either allocation fails.
 */
static struct mem_cgroup *mem_cgroup_alloc(void)
{
	struct mem_cgroup *mem;
	int size = sizeof(struct mem_cgroup);

	/* Can be very big if MAX_NUMNODES is very big */
	if (size < PAGE_SIZE)
		mem = kzalloc(size, GFP_KERNEL);
	else
		mem = vzalloc(size);

	if (!mem)
		return NULL;

	mem->stat = alloc_percpu(struct mem_cgroup_stat_cpu);
	if (!mem->stat)
		goto out_free;
	spin_lock_init(&mem->pcp_counter_lock);
	return mem;

out_free:
	/* Undo with the allocator that was used above. */
	if (size < PAGE_SIZE)
		kfree(mem);
	else
		vfree(mem);
	return NULL;
}
/*
* At destroying mem_cgroup, references from swap_cgroup can remain.
* (scanning all at force_empty is too costly...)
*
* Instead of clearing all references at force_empty, we remember
* the number of reference from swap_cgroup and free mem_cgroup when
* it goes down to 0.
*
* Removal of cgroup itself succeeds regardless of refs from swap.
*/
static void __mem_cgroup_free(struct mem_cgroup *mem)
mem_cgroup_remove_from_trees(mem);
for_each_node_state(node, N_POSSIBLE)
free_mem_cgroup_per_zone_info(mem, node);
free_percpu(mem->stat);
if (sizeof(struct mem_cgroup) < PAGE_SIZE)
kfree(mem);
else
vfree(mem);
}
/* Take one reference on @mem (mem->refcnt); dropped via mem_cgroup_put(). */
static void mem_cgroup_get(struct mem_cgroup *mem)
{
	atomic_inc(&mem->refcnt);
}
static void __mem_cgroup_put(struct mem_cgroup *mem, int count)
if (atomic_sub_and_test(count, &mem->refcnt)) {
struct mem_cgroup *parent = parent_mem_cgroup(mem);
__mem_cgroup_free(mem);
if (parent)
mem_cgroup_put(parent);
}
/* Drop a single reference on @mem; see __mem_cgroup_put(). */
static void mem_cgroup_put(struct mem_cgroup *mem)
{
	__mem_cgroup_put(mem, 1);
}
/*
 * Return the parent of @mem in the memcgroup hierarchy, i.e. the mem_cgroup
 * that embeds the parent of mem->res, or NULL when mem->res has no parent.
 */
static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem)
{
	if (mem->res.parent)
		return mem_cgroup_from_res_counter(mem->res.parent, res);

	return NULL;
}
#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
/*
 * Turn swap accounting on (do_swap_account = 1) unless the memory
 * controller is disabled or really_do_swap_account is not set.
 */
static void __init enable_swap_cgroup(void)
{
	if (mem_cgroup_disabled() || !really_do_swap_account)
		return;

	do_swap_account = 1;
}
#else
/* Swap accounting is not compiled in: nothing to enable. */
static void __init enable_swap_cgroup(void)
{
}
#endif
4347
4348
4349
4350
4351
4352
4353
4354
4355
4356
4357
4358
4359
4360
4361
4362
4363
4364
4365
4366
4367
4368
4369
4370
4371
/*
 * Allocate the soft-limit tree bookkeeping for every possible node and
 * initialise each zone slot with an empty rb-tree root and its spinlock.
 *
 * Returns 0 on success, 1 as soon as one allocation fails (per-node
 * structures allocated before the failure are left in place).
 */
static int mem_cgroup_soft_limit_tree_init(void)
{
	struct mem_cgroup_tree_per_node *per_node;
	struct mem_cgroup_tree_per_zone *per_zone;
	int nid, zid, alloc_nid;

	for_each_node_state(nid, N_POSSIBLE) {
		/* Nodes without N_NORMAL_MEMORY are allocated with node id -1. */
		alloc_nid = node_state(nid, N_NORMAL_MEMORY) ? nid : -1;

		per_node = kzalloc_node(sizeof(*per_node), GFP_KERNEL,
					alloc_nid);
		if (per_node == NULL)
			return 1;

		soft_limit_tree.rb_tree_per_node[nid] = per_node;

		for (zid = 0; zid < MAX_NR_ZONES; zid++) {
			per_zone = &per_node->rb_tree_per_zone[zid];
			per_zone->rb_root = RB_ROOT;
			spin_lock_init(&per_zone->lock);
		}
	}

	return 0;
}
/*
 * Create the mem_cgroup backing the cgroup @cont.
 *
 * Visible steps: allocate the structure, allocate per-node information for
 * every possible node, perform one-time setup when @cont has no parent
 * (soft-limit trees, per-cpu stock work items, CPU hotplug notifier),
 * inherit settings from the parent, initialise the resource counters and
 * the remaining fields.  Failures jump to free_out, which frees @mem.
 *
 * NOTE(review): the text of this function is incomplete here - no return
 * type precedes the name, `cpu` is not declared, and no return statements
 * or closing brace are visible.  Restore them from the complete file before
 * relying on this block.
 */
mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
{
struct mem_cgroup *mem, *parent;

KAMEZAWA Hiroyuki
committed
int node;
mem = mem_cgroup_alloc();
/* NOTE(review): no statement follows this check in the visible text. */
if (!mem)

KAMEZAWA Hiroyuki
committed
for_each_node_state(node, N_POSSIBLE)
if (alloc_mem_cgroup_per_zone_info(mem, node))
goto free_out;
/* No parent: this is the root cgroup, so do the one-time global setup. */
if (cont->parent == NULL) {
root_mem_cgroup = mem;
if (mem_cgroup_soft_limit_tree_init())
goto free_out;
for_each_possible_cpu(cpu) {
struct memcg_stock_pcp *stock =
&per_cpu(memcg_stock, cpu);
INIT_WORK(&stock->work, drain_local_stock);
}
hotcpu_notifier(memcg_cpu_hotplug_callback, 0);
/*
 * NOTE(review): the lines below dereference the parent, yet they sit inside
 * the "no parent" branch as shown; a `} else {` appears to be missing.
 */
parent = mem_cgroup_from_cont(cont->parent);
mem->use_hierarchy = parent->use_hierarchy;
mem->oom_kill_disable = parent->oom_kill_disable;
if (parent && parent->use_hierarchy) {
res_counter_init(&mem->res, &parent->res);
res_counter_init(&mem->memsw, &parent->memsw);
/*
* We increment refcnt of the parent to ensure that we can
* safely access it on res_counter_charge/uncharge.
* This refcnt will be decremented when freeing this
* mem_cgroup(see mem_cgroup_put).
*/
mem_cgroup_get(parent);
} else {
res_counter_init(&mem->res, NULL);
res_counter_init(&mem->memsw, NULL);
}
spin_lock_init(&mem->reclaim_param_lock);
if (parent)
mem->swappiness = get_swappiness(parent);
/* The initial reference is dropped by mem_cgroup_destroy(). */
atomic_set(&mem->refcnt, 1);
mem->move_charge_at_immigrate = 0;
mutex_init(&mem->thresholds_lock);

KAMEZAWA Hiroyuki
committed
/* Error path: release everything allocated for @mem so far. */
free_out:
__mem_cgroup_free(mem);
root_mem_cgroup = NULL;
/*
 * pre_destroy callback: force the memory cgroup behind @cont empty.
 * Returns the result of mem_cgroup_force_empty().
 */
static int mem_cgroup_pre_destroy(struct cgroup_subsys *ss,
					struct cgroup *cont)
{
	struct mem_cgroup *mem = mem_cgroup_from_cont(cont);

	return mem_cgroup_force_empty(mem, false);
}
/*
 * destroy callback: drop one reference on the memory cgroup behind @cont.
 * The structure itself is freed once the reference count reaches zero
 * (see __mem_cgroup_put()).
 */
static void mem_cgroup_destroy(struct cgroup_subsys *ss,
				struct cgroup *cont)
{
	mem_cgroup_put(mem_cgroup_from_cont(cont));
}
/*
 * populate callback: add the memory control files to @cont, followed by the
 * memsw.* files if the first step succeeded.
 *
 * Returns 0 on success or the first error reported.
 */
static int mem_cgroup_populate(struct cgroup_subsys *ss,
				struct cgroup *cont)
{
	int ret;

	ret = cgroup_add_files(cont, ss, mem_cgroup_files,
				ARRAY_SIZE(mem_cgroup_files));

	if (!ret)
		ret = register_memsw_files(cont, ss);
	return ret;
}
/* Handlers for move charge at task migration. */
#define PRECHARGE_COUNT_AT_ONCE 256
static int mem_cgroup_do_precharge(unsigned long count)
int ret = 0;
int batch_count = PRECHARGE_COUNT_AT_ONCE;
struct mem_cgroup *mem = mc.to;
4475
4476
4477
4478
4479
4480
4481
4482
4483
4484
4485
4486
4487
4488
4489
4490
4491
4492
4493
4494
4495
4496
4497
4498
4499
4500
4501
4502
4503
4504
4505
4506
4507
4508
4509
if (mem_cgroup_is_root(mem)) {
mc.precharge += count;
/* we don't need css_get for root */
return ret;
}
/* try to charge at once */
if (count > 1) {
struct res_counter *dummy;
/*
* "mem" cannot be under rmdir() because we've already checked
* by cgroup_lock_live_cgroup() that it is not removed and we
* are still under the same cgroup_mutex. So we can postpone
* css_get().
*/
if (res_counter_charge(&mem->res, PAGE_SIZE * count, &dummy))
goto one_by_one;
if (do_swap_account && res_counter_charge(&mem->memsw,
PAGE_SIZE * count, &dummy)) {
res_counter_uncharge(&mem->res, PAGE_SIZE * count);
goto one_by_one;
}
mc.precharge += count;
return ret;
}
one_by_one:
/* fall back to one by one charge */
while (count--) {
if (signal_pending(current)) {
ret = -EINTR;
break;
}
if (!batch_count--) {
batch_count = PRECHARGE_COUNT_AT_ONCE;
cond_resched();
}
ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, &mem, false,
PAGE_SIZE);
if (ret || !mem)
/* mem_cgroup_clear_mc() will do uncharge later */
return -ENOMEM;
mc.precharge++;
}
return ret;
}
/**
* is_target_pte_for_mc - check a pte whether it is valid for move charge
* @vma: the vma the pte to be checked belongs
* @addr: the address corresponding to the pte to be checked
* @ptent: the pte to be checked
* @target: the pointer the target page or swap ent will be stored(can be NULL)
*
* Returns
* 0(MC_TARGET_NONE): if the pte is not a target for move charge.
* 1(MC_TARGET_PAGE): if the page corresponding to this pte is a target for
* move charge. if @target is not NULL, the page is stored in target->page
* with extra refcnt got(Callers should handle it).
* 2(MC_TARGET_SWAP): if the swap entry corresponding to this pte is a
* target for charge migration. if @target is not NULL, the entry is stored
* in target->ent.
*
* Called with pte lock held.
*/
union mc_target {
struct page *page;
};
/* Classification returned by is_target_pte_for_mc() (values 0, 1, 2). */
enum mc_target_type {
	MC_TARGET_NONE,	/* not used */
	MC_TARGET_PAGE,
	MC_TARGET_SWAP,
};
/*
 * Return the page mapped by the present pte @ptent if its charge may be
 * moved, with an extra reference taken; NULL otherwise.
 *
 * Anonymous pages qualify only when move_anon() is enabled and
 * page_mapcount() does not exceed 2; other pages only when move_file() is
 * enabled.
 */
static struct page *mc_handle_present_pte(struct vm_area_struct *vma,
						unsigned long addr, pte_t ptent)
{
	struct page *page = vm_normal_page(vma, addr, ptent);

	if (!page || !page_mapped(page))
		return NULL;
	if (PageAnon(page)) {
		/* we don't move shared anon */
		if (!move_anon() || page_mapcount(page) > 2)
			return NULL;
	} else if (!move_file())
		/* we ignore mapcount for file pages */
		return NULL;
	/* The page may be on its way to being freed; then there is no target. */
	if (!get_page_unless_zero(page))
		return NULL;

	return page;
}
/*
 * Handle a swap pte for move charge.
 *
 * Returns the page found for the swap entry by
 * mem_cgroup_count_swap_user() (may be NULL).  When swap accounting is on,
 * the entry is also stored in @entry.  Shared entries (usage count > 1) and
 * non-swap entries are not targets: NULL is returned, @entry is left
 * untouched, and any page handed back by the lookup is released before
 * returning so the caller never sees a page it holds no reference on.
 */
static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
			unsigned long addr, pte_t ptent, swp_entry_t *entry)
{
	int usage_count;
	struct page *page = NULL;
	swp_entry_t ent = pte_to_swp_entry(ptent);

	if (!move_anon() || non_swap_entry(ent))
		return NULL;
	usage_count = mem_cgroup_count_swap_user(ent, &page);
	if (usage_count > 1) { /* we don't move shared anon */
		if (page)
			put_page(page);
		return NULL;
	}
	if (do_swap_account)
		entry->val = ent.val;

	return page;
}
4590
4591
4592
4593
4594
4595
4596
4597
4598
4599
4600
4601
4602
4603
4604
4605
4606
4607
4608
4609
4610
4611
4612
4613
4614
4615
4616
4617
4618
4619
4620
4621
4622
/*
 * Handle a none/file pte of a file-backed vma for move charge.
 *
 * Returns the page found for the corresponding file offset (may be NULL);
 * NULL is also returned for anonymous vmas and when move_file() is off.
 * For swap-backed mappings the lookup goes through
 * mem_cgroup_get_shmem_target() and, when swap accounting is on, the swap
 * entry it reports is stored in @entry.
 */
static struct page *mc_handle_file_pte(struct vm_area_struct *vma,
			unsigned long addr, pte_t ptent, swp_entry_t *entry)
{
	struct page *found = NULL;
	struct address_space *mapping;
	struct inode *inode;
	pgoff_t index;

	/* anonymous vma, or moving file charges is disabled */
	if (!vma->vm_file || !move_file())
		return NULL;

	inode = vma->vm_file->f_path.dentry->d_inode;
	mapping = vma->vm_file->f_mapping;

	/* Anything that is not pte_none() here is a pte_file() entry. */
	index = pte_none(ptent) ? linear_page_index(vma, addr)
				: pte_to_pgoff(ptent);

	/* page is moved even if it's not RSS of this task(page-faulted). */
	if (mapping_cap_swap_backed(mapping)) {
		/* shmem/tmpfs file. we should take account of swap too. */
		swp_entry_t swap;

		mem_cgroup_get_shmem_target(inode, index, &found, &swap);
		if (do_swap_account)
			entry->val = swap.val;
	} else {
		/* normal file */
		found = find_get_page(mapping, index);
	}

	return found;
}
/*
 * Classify @ptent for move charge.
 *
 * Returns MC_TARGET_PAGE when the page behind the pte is charged to mc.from,
 * MC_TARGET_SWAP when a swap entry is recorded for mc.from, and
 * MC_TARGET_NONE (0) otherwise.  With a non-NULL @target the page (with the
 * reference taken by the handler) or the swap entry is stored there; with a
 * NULL @target any page reference is dropped before returning.
 */
static int is_target_pte_for_mc(struct vm_area_struct *vma,
		unsigned long addr, pte_t ptent, union mc_target *target)
{
	swp_entry_t ent = { .val = 0 };
	struct page *page = NULL;
	struct page_cgroup *pc;
	int type = MC_TARGET_NONE;

	if (pte_present(ptent))
		page = mc_handle_present_pte(vma, addr, ptent);
	else if (is_swap_pte(ptent))
		page = mc_handle_swap_pte(vma, addr, ptent, &ent);
	else if (pte_none(ptent) || pte_file(ptent))
		page = mc_handle_file_pte(vma, addr, ptent, &ent);

	if (page == NULL && ent.val == 0)
		return MC_TARGET_NONE;

	if (page != NULL) {
		pc = lookup_page_cgroup(page);
		/*
		 * Do only loose check w/o page_cgroup lock.
		 * mem_cgroup_move_account() checks the pc is valid or not under
		 * the lock.
		 */
		if (PageCgroupUsed(pc) && pc->mem_cgroup == mc.from) {
			type = MC_TARGET_PAGE;
			if (target)
				target->page = page;
		}
		/* Keep the reference only when the page is handed to the caller. */
		if (type == MC_TARGET_NONE || target == NULL)
			put_page(page);
	}

	/* There is a swap entry and a page doesn't exist or isn't charged */
	if (ent.val != 0 && type == MC_TARGET_NONE &&
	    css_id(&mc.from->css) == lookup_swap_cgroup(ent)) {
		type = MC_TARGET_SWAP;
		if (target)
			target->ent = ent;
	}

	return type;
}
/*
 * pmd_entry callback for the precharge counting walk: with the pte lock
 * held, bump mc.precharge once for every pte in [@addr, @end) that
 * is_target_pte_for_mc() reports as a move target.  Always returns 0.
 */
static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
					unsigned long addr, unsigned long end,
					struct mm_walk *walk)
{
	struct vm_area_struct *vma = walk->private;
	spinlock_t *ptl;
	pte_t *pte;

	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
	while (addr != end) {
		if (is_target_pte_for_mc(vma, addr, *pte, NULL))
			mc.precharge++;	/* increment precharge temporarily */
		pte++;
		addr += PAGE_SIZE;
	}
	pte_unmap_unlock(pte - 1, ptl);
	cond_resched();

	return 0;
}
static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm)
{
unsigned long precharge;
struct vm_area_struct *vma;
down_read(&mm->mmap_sem);
for (vma = mm->mmap; vma; vma = vma->vm_next) {
struct mm_walk mem_cgroup_count_precharge_walk = {
.pmd_entry = mem_cgroup_count_precharge_pte_range,
.mm = mm,
.private = vma,
};
if (is_vm_hugetlb_page(vma))
continue;
walk_page_range(vma->vm_start, vma->vm_end,
&mem_cgroup_count_precharge_walk);
}
up_read(&mm->mmap_sem);
precharge = mc.precharge;
mc.precharge = 0;
return precharge;
}
/*
 * Count the move-charge targets of @mm, mark the current task as the one
 * moving charges (mc.moving_task) and precharge that many pages.
 *
 * Returns the result of mem_cgroup_do_precharge().
 */
static int mem_cgroup_precharge_mc(struct mm_struct *mm)
{
	unsigned long precharge = mem_cgroup_count_precharge(mm);

	VM_BUG_ON(mc.moving_task);
	mc.moving_task = current;
	return mem_cgroup_do_precharge(precharge);
}
/* cancels all extra charges on mc.from and mc.to, and wakes up all waiters. */
static void __mem_cgroup_clear_mc(void)
struct mem_cgroup *from = mc.from;
struct mem_cgroup *to = mc.to;
/* we must uncharge all the leftover precharges from mc.to */
if (mc.precharge) {
__mem_cgroup_cancel_charge(mc.to, mc.precharge);
mc.precharge = 0;
}
/*
* we didn't uncharge from mc.from at mem_cgroup_move_account(), so
* we must uncharge here.
*/
if (mc.moved_charge) {
__mem_cgroup_cancel_charge(mc.from, mc.moved_charge);
mc.moved_charge = 0;
/* we must fixup refcnts and charges */
if (mc.moved_swap) {
/* uncharge swap account from the old cgroup */
if (!mem_cgroup_is_root(mc.from))
res_counter_uncharge(&mc.from->memsw,
PAGE_SIZE * mc.moved_swap);
__mem_cgroup_put(mc.from, mc.moved_swap);
if (!mem_cgroup_is_root(mc.to)) {
/*
* we charged both to->res and to->memsw, so we should
* uncharge to->res.
*/
res_counter_uncharge(&mc.to->res,
PAGE_SIZE * mc.moved_swap);
}
/* we've already done mem_cgroup_get(mc.to) */
mc.moved_swap = 0;
}
memcg_oom_recover(from);
memcg_oom_recover(to);
wake_up_all(&mc.waitq);
}
/*
 * Finish (or abort) a charge move: settle all outstanding charges, reset
 * the move-charge state (moving_task, from, to) and end the move on the
 * source group.
 */
static void mem_cgroup_clear_mc(void)
{
	/* Remember the source: mc.from is cleared before end_move needs it. */
	struct mem_cgroup *from = mc.from;

	/*
	 * we must clear moving_task before waking up waiters at the end of
	 * task migration.
	 */
	mc.moving_task = NULL;
	__mem_cgroup_clear_mc();
	mc.from = NULL;
	mc.to = NULL;
	mem_cgroup_end_move(from);
}
/*
 * can_attach callback: prepare moving the charges of task @p into @cgroup.
 *
 * Nothing is done unless the destination has move_charge_at_immigrate set
 * and @p is the owner of its mm.  In that case the move-charge state is set
 * up (mc.from / mc.to) and the required charges are precharged; if that
 * fails the state is cleared again.
 *
 * Returns 0 on success (including "nothing to do") or the precharge error.
 */
static int mem_cgroup_can_attach(struct cgroup_subsys *ss,
				struct cgroup *cgroup,
				struct task_struct *p,
				bool threadgroup)
{
	int ret = 0;
	struct mem_cgroup *mem = mem_cgroup_from_cont(cgroup);

	if (mem->move_charge_at_immigrate) {
		struct mm_struct *mm;
		struct mem_cgroup *from = mem_cgroup_from_task(p);

		VM_BUG_ON(from == mem);

		mm = get_task_mm(p);
		if (!mm)
			return 0;
		/* We move charges only when we move a owner of the mm */
		if (mm->owner == p) {
			VM_BUG_ON(mc.from);
			VM_BUG_ON(mc.to);
			VM_BUG_ON(mc.precharge);
			VM_BUG_ON(mc.moved_charge);
			VM_BUG_ON(mc.moved_swap);
			mem_cgroup_start_move(from);
			mc.from = from;
			mc.to = mem;
			/* We set mc.moving_task later */

			ret = mem_cgroup_precharge_mc(mm);
			if (ret)
				mem_cgroup_clear_mc();
		}
		/* Balance get_task_mm() on every path that obtained an mm. */
		mmput(mm);
	}
	return ret;
}
/*
 * cancel_attach callback: the attach prepared by mem_cgroup_can_attach()
 * will not happen, so cancel the precharges and reset the move-charge
 * state.  mem_cgroup_can_attach() asserts that this state is clean before
 * it starts the next move.
 */
static void mem_cgroup_cancel_attach(struct cgroup_subsys *ss,
				struct cgroup *cgroup,
				struct task_struct *p,
				bool threadgroup)
{
	mem_cgroup_clear_mc();
}
/*
 * pmd_entry callback for the move-charge walk.
 *
 * For every pte in [@addr, @end) that is a move target, move the page or
 * swap-entry accounting from mc.from to mc.to.  Each successful move
 * consumes one precharge.  When the precharges run out before @end, one
 * more page is precharged and the walk resumes from the current address.
 *
 * Returns 0 on success or the error from mem_cgroup_do_precharge().
 */
static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
				unsigned long addr, unsigned long end,
				struct mm_walk *walk)
{
	int ret = 0;
	struct vm_area_struct *vma = walk->private;
	pte_t *pte;
	spinlock_t *ptl;

retry:
	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
	for (; addr != end; addr += PAGE_SIZE) {
		pte_t ptent = *(pte++);
		union mc_target target;
		int type;
		struct page *page;
		struct page_cgroup *pc;
		swp_entry_t ent;

		if (!mc.precharge)
			break;

		type = is_target_pte_for_mc(vma, addr, ptent, &target);
		switch (type) {
		case MC_TARGET_PAGE:
			page = target.page;
			if (isolate_lru_page(page))
				goto put;
			pc = lookup_page_cgroup(page);
			if (!mem_cgroup_move_account(pc,
						mc.from, mc.to, false)) {
				mc.precharge--;
				/* we uncharge from mc.from later. */
				mc.moved_charge++;
			}
			putback_lru_page(page);
put:			/* is_target_pte_for_mc() gets the page */
			put_page(page);
			break;
		case MC_TARGET_SWAP:
			ent = target.ent;
			if (!mem_cgroup_move_swap_account(ent,
						mc.from, mc.to, false)) {
				mc.precharge--;
				/* we fixup refcnts and charges later. */
				mc.moved_swap++;
			}
			break;
		default:
			break;
		}
	}
	pte_unmap_unlock(pte - 1, ptl);
	cond_resched();

	if (addr != end) {
		/*
		 * We have consumed all precharges we got in can_attach().
		 * We try charge one by one, but don't do any additional
		 * charges to mc.to if we have failed in charge once in attach()
		 * phase.
		 */
		ret = mem_cgroup_do_precharge(1);
		if (!ret)
			goto retry;
	}

	return ret;
}
/*
 * Move the charges of every non-hugetlb vma of @mm from mc.from to mc.to.
 *
 * mmap_sem is only try-locked; on contention all extra charges are
 * cancelled, waiters are woken and the lock attempt is repeated.  The walk
 * stops at the first vma for which the page walk reports an error.
 */
static void mem_cgroup_move_charge(struct mm_struct *mm)
{
	struct vm_area_struct *vma;

	lru_add_drain_all();
retry:
	if (unlikely(!down_read_trylock(&mm->mmap_sem))) {
		/*
		 * Someone who are holding the mmap_sem might be waiting in
		 * waitq. So we cancel all extra charges, wake up all waiters,
		 * and retry. Because we cancel precharges, we might not be able
		 * to move enough charges, but moving charge is a best-effort
		 * feature anyway, so it wouldn't be a big problem.
		 */
		__mem_cgroup_clear_mc();
		cond_resched();
		goto retry;
	}
	for (vma = mm->mmap; vma; vma = vma->vm_next) {
		int ret;
		struct mm_walk mem_cgroup_move_charge_walk = {
			.pmd_entry = mem_cgroup_move_charge_pte_range,
			.mm = mm,
			.private = vma,
		};
		if (is_vm_hugetlb_page(vma))
			continue;
		ret = walk_page_range(vma->vm_start, vma->vm_end,
						&mem_cgroup_move_charge_walk);
		if (ret)
			/*
			 * means we have consumed all precharges and failed in
			 * doing additional charge. Just abandon here.
			 */
			break;
	}
	up_read(&mm->mmap_sem);
}
/*
 * attach callback: if mem_cgroup_can_attach() prepared a charge move
 * (mc.to is set), move the charges of @p's mm and then reset the
 * move-charge state so that the next can_attach() starts from a clean
 * slate.
 */
static void mem_cgroup_move_task(struct cgroup_subsys *ss,
				struct cgroup *cont,
				struct cgroup *old_cont,
				struct task_struct *p,
				bool threadgroup)
{
	struct mm_struct *mm;

	if (!mc.to)
		/* no need to move charge */
		return;

	mm = get_task_mm(p);
	if (mm) {
		mem_cgroup_move_charge(mm);
		mmput(mm);
	}
	/* Cancels leftover precharges and clears mc.from / mc.to. */
	mem_cgroup_clear_mc();
}
4955
4956
4957
4958
4959
4960
4961
4962
4963
4964
4965
4966
4967
4968
4969
4970
4971
4972
4973
4974
4975
4976
#else /* !CONFIG_MMU */
/* !CONFIG_MMU variant: there is nothing to prepare, attaching always succeeds. */
static int mem_cgroup_can_attach(struct cgroup_subsys *ss,
				struct cgroup *cgroup,
				struct task_struct *p,
				bool threadgroup)
{
	return 0;
}
/* !CONFIG_MMU variant: can_attach set nothing up, so there is nothing to undo. */
static void mem_cgroup_cancel_attach(struct cgroup_subsys *ss,
				struct cgroup *cgroup,
				struct task_struct *p,
				bool threadgroup)
{
}
/* !CONFIG_MMU variant: no charges are moved when a task changes cgroup. */
static void mem_cgroup_move_task(struct cgroup_subsys *ss,
				struct cgroup *cont,
				struct cgroup *old_cont,
				struct task_struct *p,
				bool threadgroup)
{
}
#endif
struct cgroup_subsys mem_cgroup_subsys = {
.name = "memory",
.subsys_id = mem_cgroup_subsys_id,
.create = mem_cgroup_create,
.pre_destroy = mem_cgroup_pre_destroy,
.destroy = mem_cgroup_destroy,
.populate = mem_cgroup_populate,
.can_attach = mem_cgroup_can_attach,
.cancel_attach = mem_cgroup_cancel_attach,

KAMEZAWA Hiroyuki
committed
.early_init = 0,
#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
static int __init enable_swap_account(char *s)
{
/* consider enabled if no parameter or 1 is given */
if (!s || !strcmp(s, "1"))
really_do_swap_account = 1;
else if (!strcmp(s, "0"))
really_do_swap_account = 0;
return 1;