Newer
Older
for_each_node_state(node, N_POSSIBLE) {
tmp = node;
if (!node_state(node, N_NORMAL_MEMORY))
tmp = -1;
rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, tmp);
if (!rtpn)
return 1;
soft_limit_tree.rb_tree_per_node[node] = rtpn;
for (zone = 0; zone < MAX_NR_ZONES; zone++) {
rtpz = &rtpn->rb_tree_per_zone[zone];
rtpz->rb_root = RB_ROOT;
spin_lock_init(&rtpz->lock);
}
}
return 0;
}
mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
{
struct mem_cgroup *memcg, *parent;

KAMEZAWA Hiroyuki
committed
int node;
memcg = mem_cgroup_alloc();
if (!memcg)

KAMEZAWA Hiroyuki
committed
for_each_node_state(node, N_POSSIBLE)
if (alloc_mem_cgroup_per_zone_info(memcg, node))

KAMEZAWA Hiroyuki
committed
goto free_out;
if (cont->parent == NULL) {
if (mem_cgroup_soft_limit_tree_init())
goto free_out;
for_each_possible_cpu(cpu) {
struct memcg_stock_pcp *stock =
&per_cpu(memcg_stock, cpu);
INIT_WORK(&stock->work, drain_local_stock);
}
hotcpu_notifier(memcg_cpu_hotplug_callback, 0);
parent = mem_cgroup_from_cont(cont->parent);
memcg->use_hierarchy = parent->use_hierarchy;
memcg->oom_kill_disable = parent->oom_kill_disable;
if (parent && parent->use_hierarchy) {
res_counter_init(&memcg->res, &parent->res);
res_counter_init(&memcg->memsw, &parent->memsw);
res_counter_init(&memcg->kmem, &parent->kmem);
/*
* We increment refcnt of the parent to ensure that we can
* safely access it on res_counter_charge/uncharge.
* This refcnt will be decremented when freeing this
* mem_cgroup(see mem_cgroup_put).
*/
mem_cgroup_get(parent);
res_counter_init(&memcg->res, NULL);
res_counter_init(&memcg->memsw, NULL);
res_counter_init(&memcg->kmem, NULL);
memcg->last_scanned_child = 0;
memcg->last_scanned_node = MAX_NUMNODES;
INIT_LIST_HEAD(&memcg->oom_notify);
memcg->swappiness = mem_cgroup_swappiness(parent);
atomic_set(&memcg->refcnt, 1);
memcg->move_charge_at_immigrate = 0;
mutex_init(&memcg->thresholds_lock);
return &memcg->css;

KAMEZAWA Hiroyuki
committed
free_out:
root_mem_cgroup = NULL;
static int mem_cgroup_pre_destroy(struct cgroup_subsys *ss,
struct cgroup *cont)
{
struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
return mem_cgroup_force_empty(memcg, false);
static void mem_cgroup_destroy(struct cgroup_subsys *ss,
struct cgroup *cont)
{
struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
}
static int mem_cgroup_populate(struct cgroup_subsys *ss,
struct cgroup *cont)
{
int ret;
ret = cgroup_add_files(cont, ss, mem_cgroup_files,
ARRAY_SIZE(mem_cgroup_files));
if (!ret)
ret = register_memsw_files(cont, ss);
if (!ret)
ret = register_kmem_files(cont, ss);
/* Handlers for move charge at task migration. */
#define PRECHARGE_COUNT_AT_ONCE 256
static int mem_cgroup_do_precharge(unsigned long count)
int ret = 0;
int batch_count = PRECHARGE_COUNT_AT_ONCE;
struct mem_cgroup *memcg = mc.to;
mc.precharge += count;
/* we don't need css_get for root */
return ret;
}
/* try to charge at once */
if (count > 1) {
struct res_counter *dummy;
/*
* "memcg" cannot be under rmdir() because we've already checked
* by cgroup_lock_live_cgroup() that it is not removed and we
* are still under the same cgroup_mutex. So we can postpone
* css_get().
*/
if (res_counter_charge(&memcg->res, PAGE_SIZE * count, &dummy))
if (do_swap_account && res_counter_charge(&memcg->memsw,
PAGE_SIZE * count, &dummy)) {
res_counter_uncharge(&memcg->res, PAGE_SIZE * count);
goto one_by_one;
}
mc.precharge += count;
return ret;
}
one_by_one:
/* fall back to one by one charge */
while (count--) {
if (signal_pending(current)) {
ret = -EINTR;
break;
}
if (!batch_count--) {
batch_count = PRECHARGE_COUNT_AT_ONCE;
cond_resched();
}
ret = __mem_cgroup_try_charge(NULL,
GFP_KERNEL, 1, &memcg, false);
if (ret || !memcg)
/* mem_cgroup_clear_mc() will do uncharge later */
return -ENOMEM;
mc.precharge++;
}
return ret;
}
/**
* is_target_pte_for_mc - check a pte whether it is valid for move charge
* @vma: the vma the pte to be checked belongs
* @addr: the address corresponding to the pte to be checked
* @ptent: the pte to be checked
* @target: the pointer the target page or swap ent will be stored(can be NULL)
*
* Returns
* 0(MC_TARGET_NONE): if the pte is not a target for move charge.
* 1(MC_TARGET_PAGE): if the page corresponding to this pte is a target for
* move charge. if @target is not NULL, the page is stored in target->page
* with extra refcnt got(Callers should handle it).
* 2(MC_TARGET_SWAP): if the swap entry corresponding to this pte is a
* target for charge migration. if @target is not NULL, the entry is stored
* in target->ent.
*
* Called with pte lock held.
*/
union mc_target {
struct page *page;
};
enum mc_target_type {
MC_TARGET_NONE, /* not used */
MC_TARGET_PAGE,
static struct page *mc_handle_present_pte(struct vm_area_struct *vma,
unsigned long addr, pte_t ptent)
struct page *page = vm_normal_page(vma, addr, ptent);
if (!page || !page_mapped(page))
return NULL;
if (PageAnon(page)) {
/* we don't move shared anon */
if (!move_anon() || page_mapcount(page) > 2)
return NULL;
} else if (!move_file())
/* we ignore mapcount for file pages */
return NULL;
if (!get_page_unless_zero(page))
return NULL;
return page;
}
static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
unsigned long addr, pte_t ptent, swp_entry_t *entry)
{
int usage_count;
struct page *page = NULL;
swp_entry_t ent = pte_to_swp_entry(ptent);
if (!move_anon() || non_swap_entry(ent))
return NULL;
usage_count = mem_cgroup_count_swap_user(ent, &page);
if (usage_count > 1) { /* we don't move shared anon */
if (page)
put_page(page);
if (do_swap_account)
entry->val = ent.val;
return page;
}
5245
5246
5247
5248
5249
5250
5251
5252
5253
5254
5255
5256
5257
5258
5259
5260
5261
5262
5263
5264
5265
static struct page *mc_handle_file_pte(struct vm_area_struct *vma,
unsigned long addr, pte_t ptent, swp_entry_t *entry)
{
struct page *page = NULL;
struct inode *inode;
struct address_space *mapping;
pgoff_t pgoff;
if (!vma->vm_file) /* anonymous vma */
return NULL;
if (!move_file())
return NULL;
inode = vma->vm_file->f_path.dentry->d_inode;
mapping = vma->vm_file->f_mapping;
if (pte_none(ptent))
pgoff = linear_page_index(vma, addr);
else /* pte_file(ptent) is true */
pgoff = pte_to_pgoff(ptent);
/* page is moved even if it's not RSS of this task(page-faulted). */
page = find_get_page(mapping, pgoff);
#ifdef CONFIG_SWAP
/* shmem/tmpfs may report page out on swap: account for that too. */
if (radix_tree_exceptional_entry(page)) {
swp_entry_t swap = radix_to_swp_entry(page);
*entry = swap;
page = find_get_page(&swapper_space, swap.val);
static int is_target_pte_for_mc(struct vm_area_struct *vma,
unsigned long addr, pte_t ptent, union mc_target *target)
{
struct page *page = NULL;
struct page_cgroup *pc;
int ret = 0;
swp_entry_t ent = { .val = 0 };
if (pte_present(ptent))
page = mc_handle_present_pte(vma, addr, ptent);
else if (is_swap_pte(ptent))
page = mc_handle_swap_pte(vma, addr, ptent, &ent);
else if (pte_none(ptent) || pte_file(ptent))
page = mc_handle_file_pte(vma, addr, ptent, &ent);
if (!page && !ent.val)
return 0;
if (page) {
pc = lookup_page_cgroup(page);
/*
* Do only loose check w/o page_cgroup lock.
* mem_cgroup_move_account() checks the pc is valid or not under
* the lock.
*/
if (PageCgroupUsed(pc) && pc->mem_cgroup == mc.from) {
ret = MC_TARGET_PAGE;
if (target)
target->page = page;
}
if (!ret || !target)
put_page(page);
}
/* There is a swap entry and a page doesn't exist or isn't charged */
if (ent.val && !ret &&
css_id(&mc.from->css) == lookup_swap_cgroup(ent)) {
ret = MC_TARGET_SWAP;
if (target)
target->ent = ent;
}
return ret;
}
static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
unsigned long addr, unsigned long end,
struct mm_walk *walk)
{
struct vm_area_struct *vma = walk->private;
pte_t *pte;
spinlock_t *ptl;
split_huge_page_pmd(walk->mm, pmd);
pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
for (; addr != end; pte++, addr += PAGE_SIZE)
if (is_target_pte_for_mc(vma, addr, *pte, NULL))
mc.precharge++; /* increment precharge temporarily */
pte_unmap_unlock(pte - 1, ptl);
cond_resched();
return 0;
}
static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm)
{
unsigned long precharge;
struct vm_area_struct *vma;
down_read(&mm->mmap_sem);
for (vma = mm->mmap; vma; vma = vma->vm_next) {
struct mm_walk mem_cgroup_count_precharge_walk = {
.pmd_entry = mem_cgroup_count_precharge_pte_range,
.mm = mm,
.private = vma,
};
if (is_vm_hugetlb_page(vma))
continue;
walk_page_range(vma->vm_start, vma->vm_end,
&mem_cgroup_count_precharge_walk);
}
up_read(&mm->mmap_sem);
precharge = mc.precharge;
mc.precharge = 0;
return precharge;
}
static int mem_cgroup_precharge_mc(struct mm_struct *mm)
{
unsigned long precharge = mem_cgroup_count_precharge(mm);
VM_BUG_ON(mc.moving_task);
mc.moving_task = current;
return mem_cgroup_do_precharge(precharge);
/* cancels all extra charges on mc.from and mc.to, and wakes up all waiters. */
static void __mem_cgroup_clear_mc(void)
struct mem_cgroup *from = mc.from;
struct mem_cgroup *to = mc.to;
/* we must uncharge all the leftover precharges from mc.to */
if (mc.precharge) {
__mem_cgroup_cancel_charge(mc.to, mc.precharge);
mc.precharge = 0;
}
/*
* we didn't uncharge from mc.from at mem_cgroup_move_account(), so
* we must uncharge here.
*/
if (mc.moved_charge) {
__mem_cgroup_cancel_charge(mc.from, mc.moved_charge);
mc.moved_charge = 0;
/* we must fixup refcnts and charges */
if (mc.moved_swap) {
/* uncharge swap account from the old cgroup */
if (!mem_cgroup_is_root(mc.from))
res_counter_uncharge(&mc.from->memsw,
PAGE_SIZE * mc.moved_swap);
__mem_cgroup_put(mc.from, mc.moved_swap);
if (!mem_cgroup_is_root(mc.to)) {
/*
* we charged both to->res and to->memsw, so we should
* uncharge to->res.
*/
res_counter_uncharge(&mc.to->res,
PAGE_SIZE * mc.moved_swap);
}
/* we've already done mem_cgroup_get(mc.to) */
mc.moved_swap = 0;
}
memcg_oom_recover(from);
memcg_oom_recover(to);
wake_up_all(&mc.waitq);
}
static void mem_cgroup_clear_mc(void)
{
struct mem_cgroup *from = mc.from;
/*
* we must clear moving_task before waking up waiters at the end of
* task migration.
*/
mc.moving_task = NULL;
__mem_cgroup_clear_mc();
mc.from = NULL;
mc.to = NULL;

KAMEZAWA Hiroyuki
committed
mem_cgroup_end_move(from);
static int mem_cgroup_can_attach(struct cgroup_subsys *ss,
struct cgroup *cgroup,
{
int ret = 0;
struct mem_cgroup *memcg = mem_cgroup_from_cont(cgroup);
if (memcg->move_charge_at_immigrate) {
struct mm_struct *mm;
struct mem_cgroup *from = mem_cgroup_from_task(p);
mm = get_task_mm(p);
if (!mm)
return 0;
/* We move charges only when we move a owner of the mm */
if (mm->owner == p) {
VM_BUG_ON(mc.from);
VM_BUG_ON(mc.to);
VM_BUG_ON(mc.precharge);
VM_BUG_ON(mc.moved_charge);
VM_BUG_ON(mc.moved_swap);

KAMEZAWA Hiroyuki
committed
mem_cgroup_start_move(from);
/* We set mc.moving_task later */
ret = mem_cgroup_precharge_mc(mm);
if (ret)
mem_cgroup_clear_mc();
}
return ret;
}
static void mem_cgroup_cancel_attach(struct cgroup_subsys *ss,
struct cgroup *cgroup,
static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
unsigned long addr, unsigned long end,
struct mm_walk *walk)
int ret = 0;
struct vm_area_struct *vma = walk->private;
pte_t *pte;
spinlock_t *ptl;
split_huge_page_pmd(walk->mm, pmd);
retry:
pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
for (; addr != end; addr += PAGE_SIZE) {
pte_t ptent = *(pte++);
union mc_target target;
int type;
struct page *page;
struct page_cgroup *pc;
if (!mc.precharge)
break;
type = is_target_pte_for_mc(vma, addr, ptent, &target);
switch (type) {
case MC_TARGET_PAGE:
page = target.page;
if (isolate_lru_page(page))
goto put;
pc = lookup_page_cgroup(page);
if (!mem_cgroup_move_account(page, 1, pc,
mc.from, mc.to, false)) {
/* we uncharge from mc.from later. */
mc.moved_charge++;
}
putback_lru_page(page);
put: /* is_target_pte_for_mc() gets the page */
put_page(page);
break;
case MC_TARGET_SWAP:
ent = target.ent;
if (!mem_cgroup_move_swap_account(ent,
mc.from, mc.to, false)) {
/* we fixup refcnts and charges later. */
mc.moved_swap++;
}
default:
break;
}
}
pte_unmap_unlock(pte - 1, ptl);
cond_resched();
if (addr != end) {
/*
* We have consumed all precharges we got in can_attach().
* We try charge one by one, but don't do any additional
* charges to mc.to if we have failed in charge once in attach()
* phase.
*/
ret = mem_cgroup_do_precharge(1);
if (!ret)
goto retry;
}
return ret;
}
static void mem_cgroup_move_charge(struct mm_struct *mm)
{
struct vm_area_struct *vma;
lru_add_drain_all();
retry:
if (unlikely(!down_read_trylock(&mm->mmap_sem))) {
/*
* Someone who are holding the mmap_sem might be waiting in
* waitq. So we cancel all extra charges, wake up all waiters,
* and retry. Because we cancel precharges, we might not be able
* to move enough charges, but moving charge is a best-effort
* feature anyway, so it wouldn't be a big problem.
*/
__mem_cgroup_clear_mc();
cond_resched();
goto retry;
}
for (vma = mm->mmap; vma; vma = vma->vm_next) {
int ret;
struct mm_walk mem_cgroup_move_charge_walk = {
.pmd_entry = mem_cgroup_move_charge_pte_range,
.mm = mm,
.private = vma,
};
if (is_vm_hugetlb_page(vma))
continue;
ret = walk_page_range(vma->vm_start, vma->vm_end,
&mem_cgroup_move_charge_walk);
if (ret)
/*
* means we have consumed all precharges and failed in
* doing additional charge. Just abandon here.
*/
break;
}
up_read(&mm->mmap_sem);
static void mem_cgroup_move_task(struct cgroup_subsys *ss,
struct cgroup *cont,
struct cgroup *old_cont,
struct mm_struct *mm = get_task_mm(p);
if (mc.to)
mem_cgroup_move_charge(mm);
put_swap_token(mm);
if (mc.to)
mem_cgroup_clear_mc();
#else /* !CONFIG_MMU */
static int mem_cgroup_can_attach(struct cgroup_subsys *ss,
struct cgroup *cgroup,
{
return 0;
}
static void mem_cgroup_cancel_attach(struct cgroup_subsys *ss,
struct cgroup *cgroup,
{
}
static void mem_cgroup_move_task(struct cgroup_subsys *ss,
struct cgroup *cont,
struct cgroup *old_cont,
struct cgroup_subsys mem_cgroup_subsys = {
.name = "memory",
.subsys_id = mem_cgroup_subsys_id,
.create = mem_cgroup_create,
.pre_destroy = mem_cgroup_pre_destroy,
.destroy = mem_cgroup_destroy,
.populate = mem_cgroup_populate,
.can_attach = mem_cgroup_can_attach,
.cancel_attach = mem_cgroup_cancel_attach,

KAMEZAWA Hiroyuki
committed
.early_init = 0,
#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
static int __init enable_swap_account(char *s)
{
/* consider enabled if no parameter or 1 is given */
really_do_swap_account = 1;
really_do_swap_account = 0;
return 1;
}
__setup("swapaccount=", enable_swap_account);