Newer
Older
/* Handlers for move charge at task migration. */
#define PRECHARGE_COUNT_AT_ONCE 256
static int mem_cgroup_do_precharge(unsigned long count)
int ret = 0;
int batch_count = PRECHARGE_COUNT_AT_ONCE;
struct mem_cgroup *memcg = mc.to;
mc.precharge += count;
/* we don't need css_get for root */
return ret;
}
/* try to charge at once */
if (count > 1) {
struct res_counter *dummy;
/*
* "memcg" cannot be under rmdir() because we've already checked
* by cgroup_lock_live_cgroup() that it is not removed and we
* are still under the same cgroup_mutex. So we can postpone
* css_get().
*/
if (res_counter_charge(&memcg->res, PAGE_SIZE * count, &dummy))
if (do_swap_account && res_counter_charge(&memcg->memsw,
PAGE_SIZE * count, &dummy)) {
res_counter_uncharge(&memcg->res, PAGE_SIZE * count);
goto one_by_one;
}
mc.precharge += count;
return ret;
}
one_by_one:
/* fall back to one by one charge */
while (count--) {
if (signal_pending(current)) {
ret = -EINTR;
break;
}
if (!batch_count--) {
batch_count = PRECHARGE_COUNT_AT_ONCE;
cond_resched();
}
ret = __mem_cgroup_try_charge(NULL,
GFP_KERNEL, 1, &memcg, false);
/* mem_cgroup_clear_mc() will do uncharge later */
return ret;
}
/**
* get_mctgt_type - get target type of moving charge
* @vma: the vma the pte to be checked belongs
* @addr: the address corresponding to the pte to be checked
* @ptent: the pte to be checked
* @target: the pointer the target page or swap ent will be stored(can be NULL)
*
* Returns
* 0(MC_TARGET_NONE): if the pte is not a target for move charge.
* 1(MC_TARGET_PAGE): if the page corresponding to this pte is a target for
* move charge. if @target is not NULL, the page is stored in target->page
* with extra refcnt got(Callers should handle it).
* 2(MC_TARGET_SWAP): if the swap entry corresponding to this pte is a
* target for charge migration. if @target is not NULL, the entry is stored
* in target->ent.
*
* Called with pte lock held.
*/
union mc_target {
struct page *page;
};
enum mc_target_type {
static struct page *mc_handle_present_pte(struct vm_area_struct *vma,
unsigned long addr, pte_t ptent)
struct page *page = vm_normal_page(vma, addr, ptent);
if (!page || !page_mapped(page))
return NULL;
if (PageAnon(page)) {
/* we don't move shared anon */
if (!move_anon())
} else if (!move_file())
/* we ignore mapcount for file pages */
return NULL;
if (!get_page_unless_zero(page))
return NULL;
return page;
}
#ifdef CONFIG_SWAP
static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
unsigned long addr, pte_t ptent, swp_entry_t *entry)
{
struct page *page = NULL;
swp_entry_t ent = pte_to_swp_entry(ptent);
if (!move_anon() || non_swap_entry(ent))
return NULL;
/*
* Because lookup_swap_cache() updates some statistics counter,
* we call find_get_page() with swapper_space directly.
*/
page = find_get_page(swap_address_space(ent), ent.val);
if (do_swap_account)
entry->val = ent.val;
return page;
}
#else
static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
unsigned long addr, pte_t ptent, swp_entry_t *entry)
{
return NULL;
}
#endif
static struct page *mc_handle_file_pte(struct vm_area_struct *vma,
unsigned long addr, pte_t ptent, swp_entry_t *entry)
{
struct page *page = NULL;
struct address_space *mapping;
pgoff_t pgoff;
if (!vma->vm_file) /* anonymous vma */
return NULL;
if (!move_file())
return NULL;
mapping = vma->vm_file->f_mapping;
if (pte_none(ptent))
pgoff = linear_page_index(vma, addr);
else /* pte_file(ptent) is true */
pgoff = pte_to_pgoff(ptent);
/* page is moved even if it's not RSS of this task(page-faulted). */
page = find_get_page(mapping, pgoff);
#ifdef CONFIG_SWAP
/* shmem/tmpfs may report page out on swap: account for that too. */
if (radix_tree_exceptional_entry(page)) {
swp_entry_t swap = radix_to_swp_entry(page);
page = find_get_page(swap_address_space(swap), swap.val);
static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
unsigned long addr, pte_t ptent, union mc_target *target)
{
struct page *page = NULL;
struct page_cgroup *pc;
enum mc_target_type ret = MC_TARGET_NONE;
swp_entry_t ent = { .val = 0 };
if (pte_present(ptent))
page = mc_handle_present_pte(vma, addr, ptent);
else if (is_swap_pte(ptent))
page = mc_handle_swap_pte(vma, addr, ptent, &ent);
else if (pte_none(ptent) || pte_file(ptent))
page = mc_handle_file_pte(vma, addr, ptent, &ent);
if (page) {
pc = lookup_page_cgroup(page);
/*
* Do only loose check w/o page_cgroup lock.
* mem_cgroup_move_account() checks the pc is valid or not under
* the lock.
*/
if (PageCgroupUsed(pc) && pc->mem_cgroup == mc.from) {
ret = MC_TARGET_PAGE;
if (target)
target->page = page;
}
if (!ret || !target)
put_page(page);
}
/* There is a swap entry and a page doesn't exist or isn't charged */
if (ent.val && !ret &&
css_id(&mc.from->css) == lookup_swap_cgroup_id(ent)) {
ret = MC_TARGET_SWAP;
if (target)
target->ent = ent;
}
return ret;
}
6208
6209
6210
6211
6212
6213
6214
6215
6216
6217
6218
6219
6220
6221
6222
6223
6224
6225
6226
6227
6228
6229
6230
6231
6232
6233
6234
6235
6236
6237
6238
6239
6240
6241
6242
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/*
* We don't consider swapping or file mapped pages because THP does not
* support them for now.
* Caller should make sure that pmd_trans_huge(pmd) is true.
*/
static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,
unsigned long addr, pmd_t pmd, union mc_target *target)
{
struct page *page = NULL;
struct page_cgroup *pc;
enum mc_target_type ret = MC_TARGET_NONE;
page = pmd_page(pmd);
VM_BUG_ON(!page || !PageHead(page));
if (!move_anon())
return ret;
pc = lookup_page_cgroup(page);
if (PageCgroupUsed(pc) && pc->mem_cgroup == mc.from) {
ret = MC_TARGET_PAGE;
if (target) {
get_page(page);
target->page = page;
}
}
return ret;
}
#else
static inline enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,
unsigned long addr, pmd_t pmd, union mc_target *target)
{
return MC_TARGET_NONE;
}
#endif
static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
unsigned long addr, unsigned long end,
struct mm_walk *walk)
{
struct vm_area_struct *vma = walk->private;
pte_t *pte;
spinlock_t *ptl;
if (pmd_trans_huge_lock(pmd, vma) == 1) {
if (get_mctgt_type_thp(vma, addr, *pmd, NULL) == MC_TARGET_PAGE)
mc.precharge += HPAGE_PMD_NR;
spin_unlock(&vma->vm_mm->page_table_lock);

Andrea Arcangeli
committed
return 0;
if (pmd_trans_unstable(pmd))
return 0;
pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
for (; addr != end; pte++, addr += PAGE_SIZE)
if (get_mctgt_type(vma, addr, *pte, NULL))
mc.precharge++; /* increment precharge temporarily */
pte_unmap_unlock(pte - 1, ptl);
cond_resched();
return 0;
}
static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm)
{
unsigned long precharge;
struct vm_area_struct *vma;
down_read(&mm->mmap_sem);
for (vma = mm->mmap; vma; vma = vma->vm_next) {
struct mm_walk mem_cgroup_count_precharge_walk = {
.pmd_entry = mem_cgroup_count_precharge_pte_range,
.mm = mm,
.private = vma,
};
if (is_vm_hugetlb_page(vma))
continue;
walk_page_range(vma->vm_start, vma->vm_end,
&mem_cgroup_count_precharge_walk);
}
up_read(&mm->mmap_sem);
precharge = mc.precharge;
mc.precharge = 0;
return precharge;
}
static int mem_cgroup_precharge_mc(struct mm_struct *mm)
{
unsigned long precharge = mem_cgroup_count_precharge(mm);
VM_BUG_ON(mc.moving_task);
mc.moving_task = current;
return mem_cgroup_do_precharge(precharge);
/* cancels all extra charges on mc.from and mc.to, and wakes up all waiters. */
static void __mem_cgroup_clear_mc(void)
struct mem_cgroup *from = mc.from;
struct mem_cgroup *to = mc.to;
/* we must uncharge all the leftover precharges from mc.to */
if (mc.precharge) {
__mem_cgroup_cancel_charge(mc.to, mc.precharge);
mc.precharge = 0;
}
/*
* we didn't uncharge from mc.from at mem_cgroup_move_account(), so
* we must uncharge here.
*/
if (mc.moved_charge) {
__mem_cgroup_cancel_charge(mc.from, mc.moved_charge);
mc.moved_charge = 0;
/* we must fixup refcnts and charges */
if (mc.moved_swap) {
/* uncharge swap account from the old cgroup */
if (!mem_cgroup_is_root(mc.from))
res_counter_uncharge(&mc.from->memsw,
PAGE_SIZE * mc.moved_swap);
for (i = 0; i < mc.moved_swap; i++)
css_put(&mc.from->css);
if (!mem_cgroup_is_root(mc.to)) {
/*
* we charged both to->res and to->memsw, so we should
* uncharge to->res.
*/
res_counter_uncharge(&mc.to->res,
PAGE_SIZE * mc.moved_swap);
}
/* we've already done css_get(mc.to) */
mc.moved_swap = 0;
}
memcg_oom_recover(from);
memcg_oom_recover(to);
wake_up_all(&mc.waitq);
}
static void mem_cgroup_clear_mc(void)
{
struct mem_cgroup *from = mc.from;
/*
* we must clear moving_task before waking up waiters at the end of
* task migration.
*/
mc.moving_task = NULL;
__mem_cgroup_clear_mc();
mc.from = NULL;
mc.to = NULL;

KAMEZAWA Hiroyuki
committed
mem_cgroup_end_move(from);
static int mem_cgroup_can_attach(struct cgroup_subsys_state *css,
struct cgroup_taskset *tset)

Tejun Heo
committed
struct task_struct *p = cgroup_taskset_first(tset);
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
unsigned long move_charge_at_immigrate;
/*
* We are now commited to this value whatever it is. Changes in this
* tunable will only affect upcoming migrations, not the current one.
* So we need to save it, and keep it going.
*/
move_charge_at_immigrate = memcg->move_charge_at_immigrate;
if (move_charge_at_immigrate) {
struct mm_struct *mm;
struct mem_cgroup *from = mem_cgroup_from_task(p);
mm = get_task_mm(p);
if (!mm)
return 0;
/* We move charges only when we move a owner of the mm */
if (mm->owner == p) {
VM_BUG_ON(mc.from);
VM_BUG_ON(mc.to);
VM_BUG_ON(mc.precharge);
VM_BUG_ON(mc.moved_charge);
VM_BUG_ON(mc.moved_swap);

KAMEZAWA Hiroyuki
committed
mem_cgroup_start_move(from);
mc.immigrate_flags = move_charge_at_immigrate;
/* We set mc.moving_task later */
ret = mem_cgroup_precharge_mc(mm);
if (ret)
mem_cgroup_clear_mc();
}
return ret;
}
static void mem_cgroup_cancel_attach(struct cgroup_subsys_state *css,
struct cgroup_taskset *tset)
static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
unsigned long addr, unsigned long end,
struct mm_walk *walk)
int ret = 0;
struct vm_area_struct *vma = walk->private;
pte_t *pte;
spinlock_t *ptl;
enum mc_target_type target_type;
union mc_target target;
struct page *page;
struct page_cgroup *pc;
/*
* We don't take compound_lock() here but no race with splitting thp
* happens because:
* - if pmd_trans_huge_lock() returns 1, the relevant thp is not
* under splitting, which means there's no concurrent thp split,
* - if another thread runs into split_huge_page() just after we
* entered this if-block, the thread must wait for page table lock
* to be unlocked in __split_huge_page_splitting(), where the main
* part of thp split is not executed yet.
*/
if (pmd_trans_huge_lock(pmd, vma) == 1) {
if (mc.precharge < HPAGE_PMD_NR) {
spin_unlock(&vma->vm_mm->page_table_lock);
return 0;
}
target_type = get_mctgt_type_thp(vma, addr, *pmd, &target);
if (target_type == MC_TARGET_PAGE) {
page = target.page;
if (!isolate_lru_page(page)) {
pc = lookup_page_cgroup(page);
if (!mem_cgroup_move_account(page, HPAGE_PMD_NR,
pc, mc.from, mc.to)) {
mc.precharge -= HPAGE_PMD_NR;
mc.moved_charge += HPAGE_PMD_NR;
}
putback_lru_page(page);
}
put_page(page);
}
spin_unlock(&vma->vm_mm->page_table_lock);

Andrea Arcangeli
committed
return 0;
if (pmd_trans_unstable(pmd))
return 0;
retry:
pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
for (; addr != end; addr += PAGE_SIZE) {
pte_t ptent = *(pte++);
if (!mc.precharge)
break;
switch (get_mctgt_type(vma, addr, ptent, &target)) {
case MC_TARGET_PAGE:
page = target.page;
if (isolate_lru_page(page))
goto put;
pc = lookup_page_cgroup(page);
if (!mem_cgroup_move_account(page, 1, pc,
mc.from, mc.to)) {
/* we uncharge from mc.from later. */
mc.moved_charge++;
}
putback_lru_page(page);
put: /* get_mctgt_type() gets the page */
case MC_TARGET_SWAP:
ent = target.ent;
if (!mem_cgroup_move_swap_account(ent, mc.from, mc.to)) {
/* we fixup refcnts and charges later. */
mc.moved_swap++;
}
default:
break;
}
}
pte_unmap_unlock(pte - 1, ptl);
cond_resched();
if (addr != end) {
/*
* We have consumed all precharges we got in can_attach().
* We try charge one by one, but don't do any additional
* charges to mc.to if we have failed in charge once in attach()
* phase.
*/
ret = mem_cgroup_do_precharge(1);
if (!ret)
goto retry;
}
return ret;
}
static void mem_cgroup_move_charge(struct mm_struct *mm)
{
struct vm_area_struct *vma;
lru_add_drain_all();
retry:
if (unlikely(!down_read_trylock(&mm->mmap_sem))) {
/*
* Someone who are holding the mmap_sem might be waiting in
* waitq. So we cancel all extra charges, wake up all waiters,
* and retry. Because we cancel precharges, we might not be able
* to move enough charges, but moving charge is a best-effort
* feature anyway, so it wouldn't be a big problem.
*/
__mem_cgroup_clear_mc();
cond_resched();
goto retry;
}
for (vma = mm->mmap; vma; vma = vma->vm_next) {
int ret;
struct mm_walk mem_cgroup_move_charge_walk = {
.pmd_entry = mem_cgroup_move_charge_pte_range,
.mm = mm,
.private = vma,
};
if (is_vm_hugetlb_page(vma))
continue;
ret = walk_page_range(vma->vm_start, vma->vm_end,
&mem_cgroup_move_charge_walk);
if (ret)
/*
* means we have consumed all precharges and failed in
* doing additional charge. Just abandon here.
*/
break;
}
up_read(&mm->mmap_sem);
static void mem_cgroup_move_task(struct cgroup_subsys_state *css,
struct cgroup_taskset *tset)

Tejun Heo
committed
struct task_struct *p = cgroup_taskset_first(tset);
struct mm_struct *mm = get_task_mm(p);
if (mc.to)
mem_cgroup_move_charge(mm);
if (mc.to)
mem_cgroup_clear_mc();
static int mem_cgroup_can_attach(struct cgroup_subsys_state *css,
struct cgroup_taskset *tset)
static void mem_cgroup_cancel_attach(struct cgroup_subsys_state *css,
struct cgroup_taskset *tset)
static void mem_cgroup_move_task(struct cgroup_subsys_state *css,
struct cgroup_taskset *tset)
/*
* Cgroup retains root cgroups across [un]mount cycles making it necessary
* to verify sane_behavior flag on each mount attempt.
*/
static void mem_cgroup_bind(struct cgroup_subsys_state *root_css)
{
/*
* use_hierarchy is forced with sane_behavior. cgroup core
* guarantees that @root doesn't have any children, so turning it
* on for the root memcg is enough.
*/
if (cgroup_sane_behavior(root_css->cgroup))
mem_cgroup_from_css(root_css)->use_hierarchy = true;
struct cgroup_subsys mem_cgroup_subsys = {
.name = "memory",
.subsys_id = mem_cgroup_subsys_id,

Tejun Heo
committed
.css_alloc = mem_cgroup_css_alloc,
.css_online = mem_cgroup_css_online,

Tejun Heo
committed
.css_offline = mem_cgroup_css_offline,
.css_free = mem_cgroup_css_free,
.can_attach = mem_cgroup_can_attach,
.cancel_attach = mem_cgroup_cancel_attach,
.base_cftypes = mem_cgroup_files,

KAMEZAWA Hiroyuki
committed
.early_init = 0,
static int __init enable_swap_account(char *s)
{
really_do_swap_account = 1;
really_do_swap_account = 0;
return 1;
}
__setup("swapaccount=", enable_swap_account);
static void __init memsw_file_init(void)
{
WARN_ON(cgroup_add_cftypes(&mem_cgroup_subsys, memsw_cgroup_files));
}
static void __init enable_swap_cgroup(void)
{
if (!mem_cgroup_disabled() && really_do_swap_account) {
do_swap_account = 1;
memsw_file_init();
}
static void __init enable_swap_cgroup(void)
{
}
* subsys_initcall() for memory controller.
*
* Some parts like hotcpu_notifier() have to be initialized from this context
* because of lock dependencies (cgroup_lock -> cpu hotplug) but basically
* everything that doesn't depend on a specific mem_cgroup structure should
* be initialized from here.
*/
static int __init mem_cgroup_init(void)
{
hotcpu_notifier(memcg_cpu_hotplug_callback, 0);
enable_swap_cgroup();
memcg_stock_init();
return 0;
}
subsys_initcall(mem_cgroup_init);