/**
 * is_target_pte_for_mc - check whether a pte is a valid target for move charge
 * @vma: the vma the pte to be checked belongs to
 * @addr: the address corresponding to the pte to be checked
 * @ptent: the pte to be checked
 * @target: pointer where the target page or swap entry is stored (can be NULL)
 *
 * Returns
 * 0(MC_TARGET_NONE): if the pte is not a target for move charge.
 * 1(MC_TARGET_PAGE): if the page corresponding to this pte is a target for
 * move charge. If @target is not NULL, the page is stored in target->page
 * with an extra refcount taken (callers should handle it).
 * 2(MC_TARGET_SWAP): if the swap entry corresponding to this pte is a
 * target for charge migration. If @target is not NULL, the entry is stored
 * in target->ent.
 *
 * Called with pte lock held.
 */
union mc_target {
struct page *page;
swp_entry_t ent;
};
enum mc_target_type {
MC_TARGET_NONE, /* not used */
MC_TARGET_PAGE,
MC_TARGET_SWAP,
};
static struct page *mc_handle_present_pte(struct vm_area_struct *vma,
unsigned long addr, pte_t ptent)
{
struct page *page = vm_normal_page(vma, addr, ptent);
if (!page || !page_mapped(page))
return NULL;
if (PageAnon(page)) {
/* we don't move shared anon */
if (!move_anon() || page_mapcount(page) > 2)
return NULL;
} else if (!move_file())
/* we ignore mapcount for file pages */
return NULL;
if (!get_page_unless_zero(page))
return NULL;
return page;
}
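/*
* Swapped-out anonymous pte: a swap entry shared by more than one user is
* not moved. Otherwise the swap cache page (if still resident) is returned
* and, when swap accounting is enabled, the entry is reported via @entry.
*/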
static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
unsigned long addr, pte_t ptent, swp_entry_t *entry)
{
int usage_count;
struct page *page = NULL;
swp_entry_t ent = pte_to_swp_entry(ptent);
if (!move_anon() || non_swap_entry(ent))
return NULL;
usage_count = mem_cgroup_count_swap_user(ent, &page);
if (usage_count > 1) { /* we don't move shared anon */
if (page)
put_page(page);
return NULL;
}
if (do_swap_account)
entry->val = ent.val;
return page;
}
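/*
* File-backed (or not-yet-faulted file) pte: look the page up in the page
* cache by file offset. For shmem/tmpfs the backing page may live in swap,
* in which case the swap entry is reported via @entry when swap accounting
* is enabled.
*/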
static struct page *mc_handle_file_pte(struct vm_area_struct *vma,
unsigned long addr, pte_t ptent, swp_entry_t *entry)
{
struct page *page = NULL;
struct inode *inode;
struct address_space *mapping;
pgoff_t pgoff;
if (!vma->vm_file) /* anonymous vma */
return NULL;
if (!move_file())
return NULL;
inode = vma->vm_file->f_path.dentry->d_inode;
mapping = vma->vm_file->f_mapping;
if (pte_none(ptent))
pgoff = linear_page_index(vma, addr);
else /* pte_file(ptent) is true */
pgoff = pte_to_pgoff(ptent);
/* page is moved even if it's not RSS of this task(page-faulted). */
if (!mapping_cap_swap_backed(mapping)) { /* normal file */
page = find_get_page(mapping, pgoff);
} else { /* shmem/tmpfs file. we should take account of swap too. */
swp_entry_t ent;
mem_cgroup_get_shmem_target(inode, pgoff, &page, &ent);
if (do_swap_account)
entry->val = ent.val;
}
return page;
}
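/*
* Classify @ptent for move charge (see the kernel-doc above union mc_target):
* only pages or swap entries currently charged to mc.from qualify.
*/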
static int is_target_pte_for_mc(struct vm_area_struct *vma,
unsigned long addr, pte_t ptent, union mc_target *target)
{
struct page *page = NULL;
struct page_cgroup *pc;
int ret = 0;
swp_entry_t ent = { .val = 0 };
if (pte_present(ptent))
page = mc_handle_present_pte(vma, addr, ptent);
else if (is_swap_pte(ptent))
page = mc_handle_swap_pte(vma, addr, ptent, &ent);
else if (pte_none(ptent) || pte_file(ptent))
page = mc_handle_file_pte(vma, addr, ptent, &ent);
if (!page && !ent.val)
return 0;
if (page) {
pc = lookup_page_cgroup(page);
/*
* Do only loose check w/o page_cgroup lock.
* mem_cgroup_move_account() checks the pc is valid or not under
* the lock.
*/
if (PageCgroupUsed(pc) && pc->mem_cgroup == mc.from) {
ret = MC_TARGET_PAGE;
if (target)
target->page = page;
}
if (!ret || !target)
put_page(page);
}
/* There is a swap entry and a page doesn't exist or isn't charged */
if (ent.val && !ret &&
css_id(&mc.from->css) == lookup_swap_cgroup(ent)) {
ret = MC_TARGET_SWAP;
if (target)
target->ent = ent;
}
return ret;
}
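/*
* First pass over the mm: count the ptes that are candidates for move
* charge, so the same number of charges can be reserved up front.
*/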
static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
unsigned long addr, unsigned long end,
struct mm_walk *walk)
{
struct vm_area_struct *vma = walk->private;
pte_t *pte;
spinlock_t *ptl;
split_huge_page_pmd(walk->mm, pmd);
pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
for (; addr != end; pte++, addr += PAGE_SIZE)
if (is_target_pte_for_mc(vma, addr, *pte, NULL))
mc.precharge++; /* increment precharge temporarily */
pte_unmap_unlock(pte - 1, ptl);
cond_resched();
return 0;
}
static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm)
{
unsigned long precharge;
struct vm_area_struct *vma;
down_read(&mm->mmap_sem);
for (vma = mm->mmap; vma; vma = vma->vm_next) {
struct mm_walk mem_cgroup_count_precharge_walk = {
.pmd_entry = mem_cgroup_count_precharge_pte_range,
.mm = mm,
.private = vma,
};
if (is_vm_hugetlb_page(vma))
continue;
walk_page_range(vma->vm_start, vma->vm_end,
&mem_cgroup_count_precharge_walk);
}
up_read(&mm->mmap_sem);
precharge = mc.precharge;
mc.precharge = 0;
return precharge;
}
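/*
* Reserve one charge against mc.to for every movable pte counted above;
* leftovers are cancelled later in __mem_cgroup_clear_mc().
*/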
static int mem_cgroup_precharge_mc(struct mm_struct *mm)
{
unsigned long precharge = mem_cgroup_count_precharge(mm);
VM_BUG_ON(mc.moving_task);
mc.moving_task = current;
return mem_cgroup_do_precharge(precharge);
}
/* cancels all extra charges on mc.from and mc.to, and wakes up all waiters. */
static void __mem_cgroup_clear_mc(void)
{
struct mem_cgroup *from = mc.from;
struct mem_cgroup *to = mc.to;
/* we must uncharge all the leftover precharges from mc.to */
if (mc.precharge) {
__mem_cgroup_cancel_charge(mc.to, mc.precharge);
mc.precharge = 0;
}
/*
* we didn't uncharge from mc.from at mem_cgroup_move_account(), so
* we must uncharge here.
*/
if (mc.moved_charge) {
__mem_cgroup_cancel_charge(mc.from, mc.moved_charge);
mc.moved_charge = 0;
}
/* we must fixup refcnts and charges */
if (mc.moved_swap) {
/* uncharge swap account from the old cgroup */
if (!mem_cgroup_is_root(mc.from))
res_counter_uncharge(&mc.from->memsw,
PAGE_SIZE * mc.moved_swap);
__mem_cgroup_put(mc.from, mc.moved_swap);
if (!mem_cgroup_is_root(mc.to)) {
/*
* we charged both to->res and to->memsw, so we should
* uncharge to->res.
*/
res_counter_uncharge(&mc.to->res,
PAGE_SIZE * mc.moved_swap);
}
/* we've already done mem_cgroup_get(mc.to) */
mc.moved_swap = 0;
}
memcg_oom_recover(from);
memcg_oom_recover(to);
wake_up_all(&mc.waitq);
}
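/*
* Tear down the whole move context; mem_cgroup_end_move() pairs with the
* mem_cgroup_start_move() done in can_attach().
*/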
static void mem_cgroup_clear_mc(void)
{
struct mem_cgroup *from = mc.from;
/*
* we must clear moving_task before waking up waiters at the end of
* task migration.
*/
mc.moving_task = NULL;
__mem_cgroup_clear_mc();
mc.from = NULL;
mc.to = NULL;

mem_cgroup_end_move(from);
}
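/*
* cgroup can_attach callback: if the destination memcg has
* move_charge_at_immigrate set and the migrating task owns its mm, set up
* mc.from/mc.to and precharge the expected number of pages.
*/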
static int mem_cgroup_can_attach(struct cgroup_subsys *ss,
struct cgroup *cgroup,
struct task_struct *p)
{
int ret = 0;
struct mem_cgroup *mem = mem_cgroup_from_cont(cgroup);
if (mem->move_charge_at_immigrate) {
struct mm_struct *mm;
struct mem_cgroup *from = mem_cgroup_from_task(p);
VM_BUG_ON(from == mem);
mm = get_task_mm(p);
if (!mm)
return 0;
/* We move charges only when we move a owner of the mm */
if (mm->owner == p) {
VM_BUG_ON(mc.from);
VM_BUG_ON(mc.to);
VM_BUG_ON(mc.precharge);
VM_BUG_ON(mc.moved_charge);
VM_BUG_ON(mc.moved_swap);

mem_cgroup_start_move(from);
mc.from = from;
mc.to = mem;
/* We set mc.moving_task later */
ret = mem_cgroup_precharge_mc(mm);
if (ret)
mem_cgroup_clear_mc();
}
mmput(mm);
}
return ret;
}
static void mem_cgroup_cancel_attach(struct cgroup_subsys *ss,
struct cgroup *cgroup,
struct task_struct *p)
{
mem_cgroup_clear_mc();
}
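/*
* Second pass over the mm: move each target pte's charge from mc.from to
* mc.to, consuming one precharge per page or swap entry moved.
*/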
static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
unsigned long addr, unsigned long end,
struct mm_walk *walk)
{
int ret = 0;
struct vm_area_struct *vma = walk->private;
pte_t *pte;
spinlock_t *ptl;
split_huge_page_pmd(walk->mm, pmd);
retry:
pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
for (; addr != end; addr += PAGE_SIZE) {
pte_t ptent = *(pte++);
union mc_target target;
int type;
struct page *page;
struct page_cgroup *pc;
swp_entry_t ent;
if (!mc.precharge)
break;
type = is_target_pte_for_mc(vma, addr, ptent, &target);
switch (type) {
case MC_TARGET_PAGE:
page = target.page;
if (isolate_lru_page(page))
goto put;
pc = lookup_page_cgroup(page);
if (!mem_cgroup_move_account(page, 1, pc,
mc.from, mc.to, false)) {
mc.precharge--;
/* we uncharge from mc.from later. */
mc.moved_charge++;
}
putback_lru_page(page);
put: /* is_target_pte_for_mc() gets the page */
put_page(page);
break;
case MC_TARGET_SWAP:
ent = target.ent;
if (!mem_cgroup_move_swap_account(ent,
mc.from, mc.to, false)) {
mc.precharge--;
/* we fixup refcnts and charges later. */
mc.moved_swap++;
}
break;
default:
break;
}
}
pte_unmap_unlock(pte - 1, ptl);
cond_resched();
if (addr != end) {
/*
* We have consumed all precharges we got in can_attach().
* We try charge one by one, but don't do any additional
* charges to mc.to if we have failed in charge once in attach()
* phase.
*/
ret = mem_cgroup_do_precharge(1);
if (!ret)
goto retry;
}
return ret;
}
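/*
* Called from mem_cgroup_move_task() after the task has migrated: walk
* every vma of the mm and move charges pte by pte.
*/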
static void mem_cgroup_move_charge(struct mm_struct *mm)
{
struct vm_area_struct *vma;
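/* drain per-cpu LRU pagevecs so isolate_lru_page() can find the pages */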
lru_add_drain_all();
retry:
if (unlikely(!down_read_trylock(&mm->mmap_sem))) {
/*
* Someone who is holding the mmap_sem might be waiting in the
* waitq. So we cancel all extra charges, wake up all waiters,
* and retry. Because we cancel precharges, we might not be able
* to move enough charges, but moving charge is a best-effort
* feature anyway, so it wouldn't be a big problem.
*/
__mem_cgroup_clear_mc();
cond_resched();
goto retry;
}
for (vma = mm->mmap; vma; vma = vma->vm_next) {
int ret;
struct mm_walk mem_cgroup_move_charge_walk = {
.pmd_entry = mem_cgroup_move_charge_pte_range,
.mm = mm,
.private = vma,
};
if (is_vm_hugetlb_page(vma))
continue;
ret = walk_page_range(vma->vm_start, vma->vm_end,
&mem_cgroup_move_charge_walk);
if (ret)
/*
* means we have consumed all precharges and failed in
* doing additional charge. Just abandon here.
*/
break;
}
up_read(&mm->mmap_sem);
}
static void mem_cgroup_move_task(struct cgroup_subsys *ss,
struct cgroup *cont,
struct cgroup *old_cont,
struct task_struct *p)
{
struct mm_struct *mm;
if (!mc.to)
/* no need to move charge */
return;
mm = get_task_mm(p);
if (mm) {
mem_cgroup_move_charge(mm);
mmput(mm);
}
mem_cgroup_clear_mc();
}
#else /* !CONFIG_MMU */
static int mem_cgroup_can_attach(struct cgroup_subsys *ss,
struct cgroup *cgroup,
struct task_struct *p)
{
return 0;
}
static void mem_cgroup_cancel_attach(struct cgroup_subsys *ss,
struct cgroup *cgroup,
struct task_struct *p)
{
}
static void mem_cgroup_move_task(struct cgroup_subsys *ss,
struct cgroup *cont,
struct cgroup *old_cont,
struct task_struct *p)
{
}
#endif
struct cgroup_subsys mem_cgroup_subsys = {
.name = "memory",
.subsys_id = mem_cgroup_subsys_id,
.create = mem_cgroup_create,
.pre_destroy = mem_cgroup_pre_destroy,
.destroy = mem_cgroup_destroy,
.populate = mem_cgroup_populate,
.can_attach = mem_cgroup_can_attach,
.cancel_attach = mem_cgroup_cancel_attach,
.attach = mem_cgroup_move_task,
.early_init = 0,
.use_id = 1,
};
#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
static int __init enable_swap_account(char *s)
{
/* consider enabled if no parameter or 1 is given */
if (!strcmp(s, "1"))
really_do_swap_account = 1;
else if (!strcmp(s, "0"))
really_do_swap_account = 0;
return 1;
}
__setup("swapaccount=", enable_swap_account);