memcontrol.c
  • }
    
/*
 * Cache charges (nr_pages), which are already charged to the res_counter,
 * in the local per-cpu area. They will be consumed by consume_stock() later.
 */
static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
{
	struct memcg_stock_pcp *stock = &get_cpu_var(memcg_stock);

	if (stock->cached != memcg) { /* reset if necessary */
		drain_stock(stock);
		stock->cached = memcg;
	}
	stock->nr_pages += nr_pages;
	put_cpu_var(memcg_stock);
}
    
/*
 * Drains all per-CPU charge caches for the given root_memcg, i.e. the
 * subtree of the hierarchy under it. The sync flag says whether we should
 * block until the work is done.
 */
static void drain_all_stock(struct mem_cgroup *root_memcg, bool sync)
{
	int cpu, curcpu;

	/* Notify other cpus that system-wide "drain" is running */
	get_online_cpus();
	curcpu = get_cpu();
	for_each_online_cpu(cpu) {
		struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
		struct mem_cgroup *memcg;

		memcg = stock->cached;
		if (!memcg || !stock->nr_pages)
			continue;
		if (!mem_cgroup_same_or_subtree(root_memcg, memcg))
			continue;
		if (!test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) {
			if (cpu == curcpu)
				drain_local_stock(&stock->work);
			else
				schedule_work_on(cpu, &stock->work);
		}
	}
	put_cpu();

	if (!sync)
		goto out;

	for_each_online_cpu(cpu) {
		struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);

		if (test_bit(FLUSHING_CACHED_CHARGE, &stock->flags))
			flush_work(&stock->work);
	}
out:
	put_online_cpus();
}
    
    /*
 * Tries to drain stocked charges on other cpus. This function is asynchronous
 * and just schedules one work item per cpu to drain locally on that cpu. The
 * caller can expect that some charges will be returned to the res_counter
 * later, but cannot wait for that to happen.
     */
    
static void drain_all_stock_async(struct mem_cgroup *root_memcg)
{
	/*
	 * If someone is already draining, avoid adding more kworker runs.
	 */
	if (!mutex_trylock(&percpu_charge_mutex))
		return;
    
    	drain_all_stock(root_memcg, false);
    
    	mutex_unlock(&percpu_charge_mutex);
    
    }
    
    /* This is a synchronous drain interface. */
    
    static void drain_all_stock_sync(struct mem_cgroup *root_memcg)
    
    {
    	/* called when force_empty is called */
    
    	mutex_lock(&percpu_charge_mutex);
    
    	drain_all_stock(root_memcg, true);
    
	mutex_unlock(&percpu_charge_mutex);
}
    
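/*
 * Illustrative sketch, not part of the original file: how the two drain
 * flavours above are meant to be used by their callers. A reclaim path that
 * only needs the cached per-cpu charges returned eventually would use the
 * async variant (which silently backs off if a drain is already running),
 * while a path that must observe the uncached usage, e.g. force_empty, would
 * use the sync variant. The function name below is hypothetical.
 */
static inline void example_drain_stocks(struct mem_cgroup *memcg, bool must_wait)
{
	if (must_wait)
		drain_all_stock_sync(memcg);	/* blocks until all per-cpu works finish */
	else
		drain_all_stock_async(memcg);	/* fire-and-forget; may be skipped */
}
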
/*
 * This function drains the percpu counter values from a DEAD cpu and
 * moves them to the local cpu. Note that this function can be preempted.
 */
static void mem_cgroup_drain_pcp_counter(struct mem_cgroup *memcg, int cpu)
{
	int i;

	spin_lock(&memcg->pcp_counter_lock);
	for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
		long x = per_cpu(memcg->stat->count[i], cpu);

		per_cpu(memcg->stat->count[i], cpu) = 0;
		memcg->nocpu_base.count[i] += x;
	}
	for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) {
		unsigned long x = per_cpu(memcg->stat->events[i], cpu);

		per_cpu(memcg->stat->events[i], cpu) = 0;
		memcg->nocpu_base.events[i] += x;
	}
	spin_unlock(&memcg->pcp_counter_lock);
}
    
    static int __cpuinit memcg_cpu_hotplug_callback(struct notifier_block *nb,
    
    					unsigned long action,
    					void *hcpu)
    {
    	int cpu = (unsigned long)hcpu;
    	struct memcg_stock_pcp *stock;
    
    	struct mem_cgroup *iter;
    
	if (action == CPU_ONLINE)
		return NOTIFY_OK;

	if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
		return NOTIFY_OK;

	for_each_mem_cgroup(iter)
		mem_cgroup_drain_pcp_counter(iter, cpu);
    
    
    	stock = &per_cpu(memcg_stock, cpu);
    	drain_stock(stock);
    	return NOTIFY_OK;
    }
    
    
    
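/*
 * Illustrative sketch, not part of the original file: the hotplug callback
 * above is meant to be registered with the CPU hotplug notifier machinery,
 * e.g. via hotcpu_notifier() during memcg initialisation, so that a dead
 * CPU's per-cpu stock and counters are folded back. The function name below
 * is hypothetical.
 */
static inline void example_register_memcg_hotplug(void)
{
	hotcpu_notifier(memcg_cpu_hotplug_callback, 0);
}
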
    /* See __mem_cgroup_try_charge() for details */
    enum {
    	CHARGE_OK,		/* success */
    	CHARGE_RETRY,		/* need to retry but retry is not bad */
    	CHARGE_NOMEM,		/* we can't do more. return -ENOMEM */
	CHARGE_WOULDBLOCK,	/* __GFP_WAIT wasn't set and not enough res. */
	CHARGE_OOM_DIE,		/* the current task is killed because of OOM */
    };
    
    
    static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
    
				unsigned int nr_pages, bool oom_check)
{
	unsigned long csize = nr_pages * PAGE_SIZE;
    
    	struct mem_cgroup *mem_over_limit;
    	struct res_counter *fail_res;
    	unsigned long flags = 0;
    	int ret;
    
    
    	ret = res_counter_charge(&memcg->res, csize, &fail_res);
    
    
    	if (likely(!ret)) {
    		if (!do_swap_account)
    			return CHARGE_OK;
    
    		ret = res_counter_charge(&memcg->memsw, csize, &fail_res);
    
    		if (likely(!ret))
    			return CHARGE_OK;
    
    
    		res_counter_uncharge(&memcg->res, csize);
    
    		mem_over_limit = mem_cgroup_from_res_counter(fail_res, memsw);
    		flags |= MEM_CGROUP_RECLAIM_NOSWAP;
    	} else
    		mem_over_limit = mem_cgroup_from_res_counter(fail_res, res);
    
	/*
	 * nr_pages can be either a huge page (HPAGE_PMD_NR), a batch
	 * of regular pages (CHARGE_BATCH), or a single regular page (1).
	 *
	 * Never reclaim on behalf of optional batching, retry with a
	 * single page instead.
	 */
	if (nr_pages == CHARGE_BATCH)
		return CHARGE_RETRY;
    
    	if (!(gfp_mask & __GFP_WAIT))
    		return CHARGE_WOULDBLOCK;
    
    
    	ret = mem_cgroup_reclaim(mem_over_limit, gfp_mask, flags);
    
	if (mem_cgroup_margin(mem_over_limit) >= nr_pages)
		return CHARGE_RETRY;
	/*
	 * Even though the limit is exceeded at this point, reclaim
	 * may have been able to free some pages.  Retry the charge
	 * before killing the task.
	 *
	 * Only for regular pages, though: huge pages are rather
	 * unlikely to succeed so close to the limit, and we fall back
	 * to regular pages anyway in case of failure.
	 */
	if (nr_pages == 1 && ret)
		return CHARGE_RETRY;
    
    	/*
    	 * At task move, charge accounts can be doubly counted. So, it's
    	 * better to wait until the end of task_move if something is going on.
    	 */
    	if (mem_cgroup_wait_acct_move(mem_over_limit))
    		return CHARGE_RETRY;
    
	/* If we don't need to call the oom-killer at all, return immediately */
    	if (!oom_check)
    		return CHARGE_NOMEM;
    	/* check OOM */
    
    	if (!mem_cgroup_handle_oom(mem_over_limit, gfp_mask, get_order(csize)))
    
    		return CHARGE_OOM_DIE;
    
    	return CHARGE_RETRY;
    }
    
    
/*
 * __mem_cgroup_try_charge() does
 * 1. detect the memcg to be charged against from the passed *mm and *ptr,
 * 2. update the res_counter
 * 3. call memory reclaim if necessary.
 *
 * In some special cases, if the task is dying, has a fatal signal pending
 * or has TIF_MEMDIE set, this function returns -EINTR while writing
 * root_mem_cgroup to *ptr. There are two reasons for this. 1: dying threads
 * should quit as soon as possible without any hazards. 2: all pages should
 * have a valid pc->mem_cgroup. If mm is NULL and the caller doesn't pass a
 * valid memcg pointer, that is treated as a charge to root_mem_cgroup.
 *
 * So __mem_cgroup_try_charge() will return
 *  0       ...  on success, filling *ptr with a valid memcg pointer.
 *  -ENOMEM ...  charge failure because of resource limits.
 *  -EINTR  ...  if the thread is dying. *ptr is filled with root_mem_cgroup.
 *
 * Unlike the exported interface, an "oom" parameter is added. If oom==true,
 * the oom-killer can be invoked.
 */
static int __mem_cgroup_try_charge(struct mm_struct *mm,
				   gfp_t gfp_mask,
				   unsigned int nr_pages,
				   struct mem_cgroup **ptr,
				   bool oom)
{
	unsigned int batch = max(CHARGE_BATCH, nr_pages);
	int nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES;
	struct mem_cgroup *memcg = NULL;
	int ret;
    
    	/*
	 * Unlike the global VM's OOM-kill, we're not in a system-level memory
	 * shortage. So, allow dying processes to proceed with the charge, in
	 * addition to MEMDIE processes.
    	 */
    	if (unlikely(test_thread_flag(TIF_MEMDIE)
    		     || fatal_signal_pending(current)))
    		goto bypass;
    
	/*
	 * We always charge the cgroup the mm_struct belongs to.
    	 * The mm_struct's mem_cgroup changes on task migration if the
    
	 * thread group leader migrates. It's possible that mm is not
	 * set; if so, charge root_mem_cgroup (happens for pagecache usage).
    	 */
    
    	if (!*ptr && !mm)
    
    		*ptr = root_mem_cgroup;
    
    again:
    
    	if (*ptr) { /* css should be a valid one */
    		memcg = *ptr;
    		VM_BUG_ON(css_is_removed(&memcg->css));
    		if (mem_cgroup_is_root(memcg))
    
    			goto done;
    
    		if (nr_pages == 1 && consume_stock(memcg))
    
    			goto done;
    
		css_get(&memcg->css);
	} else {
		struct task_struct *p;
    
    		rcu_read_lock();
    		p = rcu_dereference(mm->owner);
		/*
		 * Because we don't have task_lock(), "p" can exit.
		 * In that case, "memcg" can point to root or "p" can be NULL
		 * with a race against swapoff. Then we have a small risk of
		 * mis-accounting. But that kind of mis-accounting by races
		 * always happens because we don't hold cgroup_mutex(). It's
		 * overkill to prevent it, so we allow that small race here.
		 * (*) swapoff and friends charge against the mm_struct, not
		 * against the task_struct, so mm->owner can be NULL.
		 */
		memcg = mem_cgroup_from_task(p);
    
    		if (!memcg)
    			memcg = root_mem_cgroup;
    		if (mem_cgroup_is_root(memcg)) {
    
    			rcu_read_unlock();
    			goto done;
    		}
    
    		if (nr_pages == 1 && consume_stock(memcg)) {
    
    			/*
			 * It seems dangerous to access memcg without css_get().
			 * But considering how consume_stock works, it's not
			 * necessary. If consume_stock succeeds, some charges
    			 * from this memcg are cached on this cpu. So, we
    			 * don't need to call css_get()/css_tryget() before
    			 * calling consume_stock().
    			 */
    			rcu_read_unlock();
    			goto done;
    		}
    		/* after here, we may be blocked. we need to get refcnt */
    
    		if (!css_tryget(&memcg->css)) {
    
    			rcu_read_unlock();
    			goto again;
    		}
    		rcu_read_unlock();
    	}
    
    	do {
    		bool oom_check;
    
    		/* If killed, bypass charge */
    
    		if (fatal_signal_pending(current)) {
    
			css_put(&memcg->css);
			goto bypass;
		}

		oom_check = false;
    		if (oom && !nr_oom_retries) {
    			oom_check = true;
			nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES;
		}

		ret = mem_cgroup_do_charge(memcg, gfp_mask, batch, oom_check);
    
    		switch (ret) {
    		case CHARGE_OK:
    			break;
		case CHARGE_RETRY: /* not in OOM situation but retry */
			batch = nr_pages;
			css_put(&memcg->css);
			memcg = NULL;
    
    			goto again;
    
    		case CHARGE_WOULDBLOCK: /* !__GFP_WAIT */
    
    			css_put(&memcg->css);
    
    			goto nomem;
    		case CHARGE_NOMEM: /* OOM routine works */
    
    			if (!oom) {
    
    				css_put(&memcg->css);
    
				goto nomem;
			}
			/* If oom, we never return -ENOMEM */
    			nr_oom_retries--;
    			break;
    		case CHARGE_OOM_DIE: /* Killed by OOM Killer */
    
    			css_put(&memcg->css);
    
			goto bypass;
		}
	} while (ret != CHARGE_OK);
    
    
	if (batch > nr_pages)
		refill_stock(memcg, batch - nr_pages);
	css_put(&memcg->css);
done:
	*ptr = memcg;
	return 0;
nomem:
	*ptr = NULL;
	return -ENOMEM;
bypass:
	*ptr = root_mem_cgroup;
	return -EINTR;
}
    
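/*
 * Illustrative sketch, not part of the original file: how a caller consumes
 * the return contract documented above. 0 means the charge succeeded and
 * must later be committed (or cancelled); -EINTR means the charge was
 * bypassed to root_mem_cgroup and the caller should carry on as if it had
 * succeeded; -ENOMEM is a real failure. The function name is hypothetical.
 */
static inline int example_try_charge_one(struct mm_struct *mm, gfp_t gfp_mask,
					 struct mem_cgroup **memcgp)
{
	int ret;

	*memcgp = NULL;
	ret = __mem_cgroup_try_charge(mm, gfp_mask, 1, memcgp, true);
	if (ret == -EINTR)	/* bypassed: *memcgp is root_mem_cgroup */
		ret = 0;
	return ret;		/* 0: commit or cancel later; -ENOMEM: fail */
}
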
    /*
 * Sometimes we have to undo a charge we got by try_charge().
 * This function is for that: it does the uncharge and puts the css refcount
 * taken by try_charge().
     */
    
static void __mem_cgroup_cancel_charge(struct mem_cgroup *memcg,
				       unsigned int nr_pages)
{
	if (!mem_cgroup_is_root(memcg)) {
		unsigned long bytes = nr_pages * PAGE_SIZE;

		res_counter_uncharge(&memcg->res, bytes);
		if (do_swap_account)
			res_counter_uncharge(&memcg->memsw, bytes);
	}
}

    /*
 * Cancel charges in this cgroup; this doesn't propagate to the parent cgroup.
 * This is useful when moving usage to the parent cgroup.
     */
    static void __mem_cgroup_cancel_local_charge(struct mem_cgroup *memcg,
    					unsigned int nr_pages)
    {
    	unsigned long bytes = nr_pages * PAGE_SIZE;
    
    	if (mem_cgroup_is_root(memcg))
    		return;
    
    	res_counter_uncharge_until(&memcg->res, memcg->res.parent, bytes);
    	if (do_swap_account)
    		res_counter_uncharge_until(&memcg->memsw,
    						memcg->memsw.parent, bytes);
    }
    
    
    /*
 * A helper function to get a mem_cgroup from an ID. Must be called under
 * rcu_read_lock(). The caller must check css_is_removed() or similar if
 * that is a concern (dropping the refcnt from swap can be called against a
 * removed memcg).
     */
    static struct mem_cgroup *mem_cgroup_lookup(unsigned short id)
    {
    	struct cgroup_subsys_state *css;
    
    	/* ID 0 is unused ID */
    	if (!id)
    		return NULL;
    	css = css_lookup(&mem_cgroup_subsys, id);
    	if (!css)
    		return NULL;
    	return container_of(css, struct mem_cgroup, css);
    }
    
    
struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
{
	struct mem_cgroup *memcg = NULL;
	struct page_cgroup *pc;
	unsigned short id;
	swp_entry_t ent;

	VM_BUG_ON(!PageLocked(page));

	pc = lookup_page_cgroup(page);
	lock_page_cgroup(pc);
	if (PageCgroupUsed(pc)) {
		memcg = pc->mem_cgroup;
		if (memcg && !css_tryget(&memcg->css))
			memcg = NULL;
	} else if (PageSwapCache(page)) {
		ent.val = page_private(page);
		id = lookup_swap_cgroup_id(ent);
		rcu_read_lock();
		memcg = mem_cgroup_lookup(id);
		if (memcg && !css_tryget(&memcg->css))
			memcg = NULL;
		rcu_read_unlock();
	}
	unlock_page_cgroup(pc);
	return memcg;
}
static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,
				       struct page *page,
				       unsigned int nr_pages,
				       enum charge_type ctype,
				       bool lrucare)
{
	struct page_cgroup *pc = lookup_page_cgroup(page);
	struct zone *uninitialized_var(zone);
	struct lruvec *lruvec;
	bool was_on_lru = false;
	bool anon;

    	lock_page_cgroup(pc);
    	if (unlikely(PageCgroupUsed(pc))) {
    		unlock_page_cgroup(pc);
    
    		__mem_cgroup_cancel_charge(memcg, nr_pages);
    
    		return;
    	}
    	/*
	 * we don't need page_cgroup_lock for tail pages, because they are not
    	 * accessed by any other context at this point.
    	 */
    
    
    	/*
    	 * In some cases, SwapCache and FUSE(splice_buf->radixtree), the page
    	 * may already be on some other mem_cgroup's LRU.  Take care of it.
    	 */
    	if (lrucare) {
    		zone = page_zone(page);
    		spin_lock_irq(&zone->lru_lock);
    		if (PageLRU(page)) {
    
    			lruvec = mem_cgroup_zone_lruvec(zone, pc->mem_cgroup);
    
    			ClearPageLRU(page);
    
			del_page_from_lru_list(page, lruvec, page_lru(page));
			was_on_lru = true;
		}
	}

	pc->mem_cgroup = memcg;
    
    	/*
    	 * We access a page_cgroup asynchronously without lock_page_cgroup().
    	 * Especially when a page_cgroup is taken from a page, pc->mem_cgroup
    	 * is accessed after testing USED bit. To make pc->mem_cgroup visible
    	 * before USED bit, we need memory barrier here.
    	 * See mem_cgroup_add_lru_list(), etc.
     	 */
    
    	smp_wmb();
    
    	SetPageCgroupUsed(pc);
    
    	if (lrucare) {
    		if (was_on_lru) {
    
    			lruvec = mem_cgroup_zone_lruvec(zone, pc->mem_cgroup);
    
    			VM_BUG_ON(PageLRU(page));
    			SetPageLRU(page);
    
    			add_page_to_lru_list(page, lruvec, page_lru(page));
    
    		}
    		spin_unlock_irq(&zone->lru_lock);
    	}
    
    
    	if (ctype == MEM_CGROUP_CHARGE_TYPE_MAPPED)
    		anon = true;
    	else
    		anon = false;
    
    	mem_cgroup_charge_statistics(memcg, anon, nr_pages);
    
    	unlock_page_cgroup(pc);
    
    	/*
    	 * "charge_statistics" updated event counter. Then, check it.
    	 * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree.
    	 * if they exceeds softlimit.
    	 */
    
	memcg_check_events(memcg, page);
}
    
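/*
 * Illustrative sketch, not part of the original file: the reader side that
 * the smp_wmb() in __mem_cgroup_commit_charge() pairs with. A lockless
 * reader tests the USED bit first and issues a read barrier before trusting
 * pc->mem_cgroup; otherwise it could see the bit set but read a stale
 * pointer. The function name is hypothetical.
 */
static inline struct mem_cgroup *example_lockless_pc_memcg(struct page_cgroup *pc)
{
	if (!PageCgroupUsed(pc))
		return NULL;
	smp_rmb();	/* pairs with smp_wmb() before SetPageCgroupUsed() */
	return pc->mem_cgroup;
}
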
    #ifdef CONFIG_TRANSPARENT_HUGEPAGE
    
    
    #define PCGF_NOCOPY_AT_SPLIT (1 << PCG_LOCK | 1 << PCG_MIGRATION)
    
/*
 * Because tail pages are not marked as "used", set it. We're under
 * zone->lru_lock, 'splitting on pmd' and compound_lock.
 * charge/uncharge will never happen and move_account() is done under
 * compound_lock(), so we don't have to take care of races.
 */
    void mem_cgroup_split_huge_fixup(struct page *head)
    
    {
	struct page_cgroup *head_pc = lookup_page_cgroup(head);
	struct page_cgroup *pc;
	int i;

    	if (mem_cgroup_disabled())
    		return;
    
    	for (i = 1; i < HPAGE_PMD_NR; i++) {
    		pc = head_pc + i;
    		pc->mem_cgroup = head_pc->mem_cgroup;
    		smp_wmb();/* see __commit_charge() */
    		pc->flags = head_pc->flags & ~PCGF_NOCOPY_AT_SPLIT;
	}
}

    #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
    
/**
 * mem_cgroup_move_account - move account of the page
 * @page: the page
 * @nr_pages: number of regular pages (>1 for huge pages)
 * @pc:	page_cgroup of the page.
 * @from: mem_cgroup which the page is moved from.
 * @to:	mem_cgroup which the page is moved to. @from != @to.
 *
 * The caller must confirm the following.
 * - the page is not on the LRU (isolate_lru_page() is useful.)
 * - compound_lock is held when nr_pages > 1
 *
 * This function doesn't do "charge" to the new cgroup and doesn't do
 * "uncharge" from the old cgroup.
 */
static int mem_cgroup_move_account(struct page *page,
				   unsigned int nr_pages,
				   struct page_cgroup *pc,
				   struct mem_cgroup *from,
				   struct mem_cgroup *to)
{
	unsigned long flags;
	int ret;
	bool anon = PageAnon(page);
    
    	VM_BUG_ON(from == to);
    
    	VM_BUG_ON(PageLRU(page));
    
    	/*
    	 * The page is isolated from LRU. So, collapse function
    	 * will not handle this page. But page splitting can happen.
    	 * Do this check under compound_page_lock(). The caller should
    	 * hold it.
    	 */
    	ret = -EBUSY;
    
    	if (nr_pages > 1 && !PageTransHuge(page))
    
    		goto out;
    
    	lock_page_cgroup(pc);
    
    	ret = -EINVAL;
    	if (!PageCgroupUsed(pc) || pc->mem_cgroup != from)
    		goto unlock;
    
    
    	move_lock_mem_cgroup(from, &flags);
    
    	if (!anon && page_mapped(page)) {
    
    		/* Update mapped_file data for mem_cgroup */
    		preempt_disable();
    		__this_cpu_dec(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
    		__this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
		preempt_enable();
	}
    	mem_cgroup_charge_statistics(from, anon, -nr_pages);
    
    	/* caller should have done css_get */
    
    	pc->mem_cgroup = to;
    
    	mem_cgroup_charge_statistics(to, anon, nr_pages);
    
	/*
	 * We charge against "to" which may not have any tasks. Then, "to"
	 * can be under rmdir(). But in current implementation, the caller of
	 * this function is just force_empty() and move charge, so it's
	 * guaranteed that "to" is never removed. So, we don't check rmdir
	 * status here.
	 */
	move_unlock_mem_cgroup(from, &flags);
	ret = 0;
unlock:
	unlock_page_cgroup(pc);
	/*
	 * check events
	 */
	memcg_check_events(to, page);
	memcg_check_events(from, page);
out:
	return ret;
}

    static int mem_cgroup_move_parent(struct page *page,
    				  struct page_cgroup *pc,
    
    				  struct mem_cgroup *child,
    				  gfp_t gfp_mask)
{
	struct mem_cgroup *parent;
	unsigned int nr_pages;
	unsigned long uninitialized_var(flags);
	int ret;

	/* Is ROOT ? */
	if (mem_cgroup_is_root(child))
		return -EINVAL;

	ret = -EBUSY;
    	if (!get_page_unless_zero(page))
    		goto out;
    	if (isolate_lru_page(page))
    		goto put;
    
    	nr_pages = hpage_nr_pages(page);
    
    	parent = parent_mem_cgroup(child);
    	/*
    	 * If no parent, move charges to root cgroup.
    	 */
    	if (!parent)
		parent = root_mem_cgroup;

	if (nr_pages > 1)
		flags = compound_lock_irqsave(page);

	ret = mem_cgroup_move_account(page, nr_pages,
				pc, child, parent);
	if (!ret)
		__mem_cgroup_cancel_local_charge(child, nr_pages);

	if (nr_pages > 1)
		compound_unlock_irqrestore(page, flags);
	putback_lru_page(page);
put:
	put_page(page);
out:
	return ret;
}
    
    /*
     * Charge the memory controller for page usage.
     * Return
     * 0 if the charge was successful
     * < 0 if the cgroup is over its limit
     */
    static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
    
    				gfp_t gfp_mask, enum charge_type ctype)
    
    	struct mem_cgroup *memcg = NULL;
    
    	bool oom = true;
    
    	if (PageTransHuge(page)) {
    
    		nr_pages <<= compound_order(page);
    
    		VM_BUG_ON(!PageTransHuge(page));
    
    		/*
    		 * Never OOM-kill a process for a huge page.  The
    		 * fault handler will fall back to regular pages.
    		 */
		oom = false;
	}

	ret = __mem_cgroup_try_charge(mm, gfp_mask, nr_pages, &memcg, oom);
	if (ret == -ENOMEM)
		return ret;
	__mem_cgroup_commit_charge(memcg, page, nr_pages, ctype, false);
	return 0;
}
    
    int mem_cgroup_newpage_charge(struct page *page,
			      struct mm_struct *mm, gfp_t gfp_mask)
{
	if (mem_cgroup_disabled())
		return 0;
    	VM_BUG_ON(page_mapped(page));
    	VM_BUG_ON(page->mapping && !PageAnon(page));
    	VM_BUG_ON(!mm);
    
    	return mem_cgroup_charge_common(page, mm, gfp_mask,
    
					MEM_CGROUP_CHARGE_TYPE_MAPPED);
}

    static void
    __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr,
    					enum charge_type ctype);
    
    
    int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
    				gfp_t gfp_mask)
    
    	struct mem_cgroup *memcg = NULL;
    
    	enum charge_type type = MEM_CGROUP_CHARGE_TYPE_CACHE;
    
    	if (mem_cgroup_disabled())
    
    	if (PageCompound(page))
    		return 0;
    
    	if (!page_is_file_cache(page))
    		type = MEM_CGROUP_CHARGE_TYPE_SHMEM;
    
    	if (!PageSwapCache(page))
    
    		ret = mem_cgroup_charge_common(page, mm, gfp_mask, type);
    
    	else { /* page is swapcache/shmem */
    
    		ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &memcg);
    
    		if (!ret)
    
    			__mem_cgroup_commit_charge_swapin(page, memcg, type);
	}
	return ret;
}

    /*
 * During swap-in (try_charge -> commit or cancel), the page is locked.
 * When try_charge() returns successfully, one refcount on the memcg is
 * acquired without a struct page_cgroup. This refcount will be consumed by
 * "commit()" or dropped by "cancel()".
     */
    
    int mem_cgroup_try_charge_swapin(struct mm_struct *mm,
    				 struct page *page,
    
				 gfp_t mask, struct mem_cgroup **memcgp)
{
	struct mem_cgroup *memcg;
	int ret;

    	if (mem_cgroup_disabled())
    
    		return 0;
    
    	if (!do_swap_account)
    		goto charge_cur_mm;
    	/*
    	 * A racing thread's fault, or swapoff, may have already updated
    
    	 * the pte, and even removed page from swap cache: in those cases
    	 * do_swap_page()'s pte_same() test will fail; but there's also a
    	 * KSM case which does need to charge the page.
    
    	 */
    	if (!PageSwapCache(page))
    
    		goto charge_cur_mm;
    
    	memcg = try_get_mem_cgroup_from_page(page);
    	if (!memcg)
    
    		goto charge_cur_mm;
    
    	*memcgp = memcg;
    	ret = __mem_cgroup_try_charge(NULL, mask, 1, memcgp, true);
    
    	css_put(&memcg->css);
    
    	if (ret == -EINTR)
    		ret = 0;
    
    	return ret;
    
    charge_cur_mm:
    	if (unlikely(!mm))
    		mm = &init_mm;
    
    	ret = __mem_cgroup_try_charge(mm, mask, 1, memcgp, true);
    	if (ret == -EINTR)
    		ret = 0;
	return ret;
}

    static void
    
    __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *memcg,
    
					enum charge_type ctype)
{
	if (mem_cgroup_disabled())
		return;
	if (!memcg)
		return;
	cgroup_exclude_rmdir(&memcg->css);
    
    	__mem_cgroup_commit_charge(memcg, page, 1, ctype, true);
    
	/*
	 * Now swap is on-memory. This means this page may be
	 * counted both as mem and swap....double counted.
	 * Fix it by uncharging from memsw. Basically, this SwapCache is stable
	 * under lock_page(). But in do_swap_page()::memory.c, reuse_swap_page()
	 * may call delete_from_swap_cache() before we reach here.
	 */
	if (do_swap_account && PageSwapCache(page)) {
		swp_entry_t ent = {.val = page_private(page)};
		mem_cgroup_uncharge_swap(ent);
	}
    	/*
	 * At swapin, we may charge against a cgroup which has no tasks, so
	 * rmdir()->pre_destroy() can be called while we do this charge.
	 * In that case, we need to call pre_destroy() again; check it here.
    	 */
    
	cgroup_release_and_wakeup_rmdir(&memcg->css);
}

void mem_cgroup_commit_charge_swapin(struct page *page,
				     struct mem_cgroup *memcg)
{
	__mem_cgroup_commit_charge_swapin(page, memcg,
					  MEM_CGROUP_CHARGE_TYPE_MAPPED);
}

void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *memcg)
{
	if (mem_cgroup_disabled())
		return;
	if (!memcg)
		return;
	__mem_cgroup_cancel_charge(memcg, 1);
}
    
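/*
 * Illustrative sketch, not part of the original file: the swap-in charge
 * protocol described above. With the page locked, try_charge first; then
 * either commit once the page has actually been mapped, or cancel on
 * failure, which consumes or drops the memcg reference taken by try_charge.
 * The function name and the "mapped" flag are hypothetical.
 */
static inline int example_swapin_charge(struct mm_struct *mm, struct page *page,
					gfp_t gfp_mask, bool mapped)
{
	struct mem_cgroup *memcg = NULL;
	int ret;

	ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &memcg);
	if (ret)
		return ret;

	if (mapped)
		mem_cgroup_commit_charge_swapin(page, memcg);
	else
		mem_cgroup_cancel_charge_swapin(memcg);
	return 0;
}
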
    static void mem_cgroup_do_uncharge(struct mem_cgroup *memcg,
    
    				   unsigned int nr_pages,
    				   const enum charge_type ctype)
    
    {
    	struct memcg_batch_info *batch = NULL;
    	bool uncharge_memsw = true;
    
    	/* If swapout, usage of swap doesn't decrease */
    	if (!do_swap_account || ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
    		uncharge_memsw = false;
    
    	batch = &current->memcg_batch;
    	/*
	 * Usually, we do css_get() when we remember a memcg pointer.
	 * But in this case, we keep res->usage until the end of a series of
	 * uncharges. Then, it's ok to ignore the memcg's refcnt.
    	 */
    	if (!batch->memcg)
    
    		batch->memcg = memcg;
    
	/*
	 * do_batch > 0 when unmapping pages or during inode invalidate/truncate.
	 * In those cases, all pages freed continuously can be expected to be in
	 * the same cgroup, and we have a chance to coalesce uncharges.
	 * But we do uncharge one by one if this is killed by OOM (TIF_MEMDIE)
	 * because we want to do uncharge as soon as possible.
	 */
    
    	if (!batch->do_batch || test_thread_flag(TIF_MEMDIE))
		goto direct_uncharge;

	if (nr_pages > 1)
		goto direct_uncharge;
    
    	/*
	 * In the typical case, batch->memcg == memcg. This means we can
	 * merge a series of uncharges into one uncharge of the res_counter.
	 * If not, we uncharge the res_counter one by one.
    	 */
    
	if (batch->memcg != memcg)
		goto direct_uncharge;
	/* remember freed charge and uncharge it later */
	batch->nr_pages++;
	if (uncharge_memsw)
		batch->memsw_nr_pages++;
	return;
direct_uncharge:
	res_counter_uncharge(&memcg->res, nr_pages * PAGE_SIZE);
	if (uncharge_memsw)
		res_counter_uncharge(&memcg->memsw, nr_pages * PAGE_SIZE);
	if (unlikely(batch->memcg != memcg))
		memcg_oom_recover(memcg);
	return;
}
    
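/*
 * Illustrative sketch, not part of the original file: how the uncharge
 * batching above is driven. Callers that free many pages in a row (truncate,
 * unmap) bracket the loop with mem_cgroup_uncharge_start()/_end(), declared
 * in <linux/memcontrol.h>, which raise current->memcg_batch.do_batch so
 * that mem_cgroup_do_uncharge() can coalesce the res_counter updates. The
 * function name is hypothetical.
 */
static inline void example_uncharge_many(struct page **pages, int nr)
{
	int i;

	mem_cgroup_uncharge_start();
	for (i = 0; i < nr; i++)
		mem_cgroup_uncharge_cache_page(pages[i]);
	mem_cgroup_uncharge_end();
}
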
/*
 * uncharge if !page_mapped(page)
 */
static struct mem_cgroup *
__mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
{
	struct mem_cgroup *memcg = NULL;
	unsigned int nr_pages = 1;
	struct page_cgroup *pc;
	bool anon;
    
    	if (mem_cgroup_disabled())
    
    		return NULL;
    
    	if (PageSwapCache(page))
    
    		return NULL;
    
    	if (PageTransHuge(page)) {
    
    		nr_pages <<= compound_order(page);
    
    		VM_BUG_ON(!PageTransHuge(page));
    	}
    
	/*
	 * Check if our page_cgroup is valid
	 */
    	pc = lookup_page_cgroup(page);
    
    	if (unlikely(!PageCgroupUsed(pc)))
    
    		return NULL;
    
    	lock_page_cgroup(pc);
    
    	memcg = pc->mem_cgroup;
    
    	if (!PageCgroupUsed(pc))
    		goto unlock_out;
    
    
    	anon = PageAnon(page);
    
    
    	switch (ctype) {
    	case MEM_CGROUP_CHARGE_TYPE_MAPPED:
    
    		/*
    		 * Generally PageAnon tells if it's the anon statistics to be
    		 * updated; but sometimes e.g. mem_cgroup_uncharge_page() is
    		 * used before page reached the stage of being marked PageAnon.
    		 */
    
    		anon = true;
    		/* fallthrough */
    
    	case MEM_CGROUP_CHARGE_TYPE_DROP:
    
    		/* See mem_cgroup_prepare_migration() */
    		if (page_mapped(page) || PageCgroupMigration(pc))
    
    			goto unlock_out;
    		break;
    	case MEM_CGROUP_CHARGE_TYPE_SWAPOUT:
    		if (!PageAnon(page)) {	/* Shared memory */
    			if (page->mapping && !page_is_file_cache(page))
    				goto unlock_out;
    		} else if (page_mapped(page)) /* Anon */
    				goto unlock_out;
    		break;
    	default:
		break;
	}

    	mem_cgroup_charge_statistics(memcg, anon, -nr_pages);
    
    	ClearPageCgroupUsed(pc);
    
    	/*
    	 * pc->mem_cgroup is not cleared here. It will be accessed when it's
    	 * freed from LRU. This is safe because uncharged page is expected not
    	 * to be reused (freed soon). Exception is SwapCache, it's handled by
    	 * special functions.
    	 */
    
	unlock_page_cgroup(pc);
	/*
	 * even after unlock, we have memcg->res.usage here and this memcg
    
    	 * will never be freed.
    	 */
    
    	memcg_check_events(memcg, page);
    
    	if (do_swap_account && ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) {
    
    		mem_cgroup_swap_statistics(memcg, true);
		mem_cgroup_get(memcg);
	}
    	if (!mem_cgroup_is_root(memcg))
    		mem_cgroup_do_uncharge(memcg, nr_pages, ctype);
    
    	return memcg;
    
    
    unlock_out:
    	unlock_page_cgroup(pc);
    
	return NULL;
}

    void mem_cgroup_uncharge_page(struct page *page)
    {
    
    	/* early check. */
    	if (page_mapped(page))
    		return;
    
    	VM_BUG_ON(page->mapping && !PageAnon(page));
    
    	__mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_MAPPED);
    }
    
    void mem_cgroup_uncharge_cache_page(struct page *page)
    {
    	VM_BUG_ON(page_mapped(page));
    
    	VM_BUG_ON(page->mapping);
    
	__mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE);
}