/*
 * Iteration constructs for visiting all cgroups (under a tree).  If
 * loops are exited prematurely (break), mem_cgroup_iter_break() must
 * be used for reference counting.
 */
#define for_each_mem_cgroup_tree(iter, root)		\
	for (iter = mem_cgroup_iter(root, NULL, NULL);	\
	     iter != NULL;				\
	     iter = mem_cgroup_iter(root, iter, NULL))

#define for_each_mem_cgroup(iter)			\
	for (iter = mem_cgroup_iter(NULL, NULL, NULL);	\
	     iter != NULL;				\
	     iter = mem_cgroup_iter(NULL, iter, NULL))
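
/*
 * Usage sketch (illustrative, not part of the original file): a walk that
 * is abandoned early must drop the iterator's reference explicitly, as the
 * comment above requires. some_condition() is a hypothetical predicate:
 *
 *	struct mem_cgroup *iter;
 *
 *	for_each_mem_cgroup_tree(iter, root) {
 *		if (some_condition(iter)) {
 *			mem_cgroup_iter_break(root, iter);
 *			break;
 *		}
 *	}
 *
 * Loops that run to completion need no extra cleanup: mem_cgroup_iter()
 * returns NULL at the end and releases references as it goes.
 */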
    
static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg)
{
	return (memcg == root_mem_cgroup);
}
void mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx)
{
	struct mem_cgroup *memcg;

	rcu_read_lock();
	memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
	if (unlikely(!memcg))
		goto out;

	switch (idx) {
	case PGFAULT:
		this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGFAULT]);
		break;
	case PGMAJFAULT:
		this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGMAJFAULT]);
		break;
	default:
		BUG();
	}
out:
	rcu_read_unlock();
}
EXPORT_SYMBOL(mem_cgroup_count_vm_event);
    
    
/**
 * mem_cgroup_zone_lruvec - get the lru list vector for a zone and memcg
 * @zone: zone of the wanted lruvec
 * @memcg: memcg of the wanted lruvec
 *
 * Returns the lru list vector holding pages for the given @zone and
 * @memcg.  This can be the global zone lruvec, if the memory controller
 * is disabled.
 */
    struct lruvec *mem_cgroup_zone_lruvec(struct zone *zone,
    				      struct mem_cgroup *memcg)
    {
    	struct mem_cgroup_per_zone *mz;
    
    	if (mem_cgroup_disabled())
    		return &zone->lruvec;
    
    	mz = mem_cgroup_zoneinfo(memcg, zone_to_nid(zone), zone_idx(zone));
    	return &mz->lruvec;
    }
    
    
/*
 * The following LRU functions may be used without holding PCG_LOCK.
 * Operations are called by routines of the global LRU independently of memcg.
 * What we have to take care of here is the validity of pc->mem_cgroup.
 *
 * Changes to pc->mem_cgroup happen when
 * 1. charge
 * 2. moving account
 * In the typical case, "charge" is done before add-to-lru. The exception is
 * SwapCache, which is added to the LRU before being charged.
 * If the PCG_USED bit is not set, the page_cgroup is not added to this
 * private LRU. When moving account, the page is not on the LRU; it is
 * isolated.
 */
    
/**
 * mem_cgroup_page_lruvec - return lruvec for adding an lru page
 * @page: the page
 * @zone: zone of the page
 */
struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct zone *zone)
{
	struct mem_cgroup_per_zone *mz;
	struct mem_cgroup *memcg;
	struct page_cgroup *pc;

	if (mem_cgroup_disabled())
		return &zone->lruvec;

	pc = lookup_page_cgroup(page);
	memcg = pc->mem_cgroup;

	/*
	 * Surreptitiously switch any uncharged offlist page to root:
	 * an uncharged page off lru does nothing to secure
	 * its former mem_cgroup from sudden removal.
	 *
	 * Our caller holds lru_lock, and PageCgroupUsed is updated
	 * under page_cgroup lock: between them, they make all uses
	 * of pc->mem_cgroup safe.
	 */
	if (!PageLRU(page) && !PageCgroupUsed(pc) && memcg != root_mem_cgroup)
		pc->mem_cgroup = memcg = root_mem_cgroup;

	mz = page_cgroup_zoneinfo(memcg, page);
	return &mz->lruvec;
}
/**
 * mem_cgroup_update_lru_size - account for adding or removing an lru page
 * @lruvec: mem_cgroup per zone lru vector
 * @lru: index of lru list the page is sitting on
 * @nr_pages: positive when adding or negative when removing
 *
 * This function must be called when a page is added to or removed from an
 * lru list.
 */
void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru,
				int nr_pages)
{
	struct mem_cgroup_per_zone *mz;
	unsigned long *lru_size;

	if (mem_cgroup_disabled())
		return;

	mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec);
	lru_size = mz->lru_size + lru;
	*lru_size += nr_pages;
	VM_BUG_ON((long)(*lru_size) < 0);
}
    
/*
 * Checks whether the given memcg is the same as root_memcg or lies in
 * root_memcg's hierarchy subtree.
 */
bool __mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg,
				  struct mem_cgroup *memcg)
{
	if (root_memcg == memcg)
		return true;
	if (!root_memcg->use_hierarchy || !memcg)
		return false;
	return css_is_ancestor(&memcg->css, &root_memcg->css);
}
    
    static bool mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg,
    				       struct mem_cgroup *memcg)
    {
	bool ret;

	rcu_read_lock();
	ret = __mem_cgroup_same_or_subtree(root_memcg, memcg);
	rcu_read_unlock();
	return ret;
}

int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *memcg)
{
	int ret;
	struct mem_cgroup *curr = NULL;
	struct task_struct *p;

	p = find_lock_task_mm(task);
    	if (p) {
    		curr = try_get_mem_cgroup_from_mm(p->mm);
    		task_unlock(p);
    	} else {
    		/*
    		 * All threads may have already detached their mm's, but the oom
    		 * killer still needs to detect if they have already been oom
    		 * killed to prevent needlessly killing additional tasks.
    		 */
    		task_lock(task);
    		curr = mem_cgroup_from_task(task);
    		if (curr)
    			css_get(&curr->css);
    		task_unlock(task);
    	}
    
    	if (!curr)
    		return 0;
    
	/*
	 * We should check use_hierarchy of "memcg" not "curr". Checking
	 * use_hierarchy of "curr" here would make this function return true
	 * whenever hierarchy is enabled in "curr" and "curr" is a child of
	 * "memcg" in the *cgroup* hierarchy (even if use_hierarchy is
	 * disabled in "memcg").
	 */
	ret = mem_cgroup_same_or_subtree(memcg, curr);
	css_put(&curr->css);
	return ret;
}
int mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec)
{
	unsigned long inactive_ratio;
	unsigned long inactive;
	unsigned long active;
	unsigned long gb;

    	inactive = mem_cgroup_get_lru_size(lruvec, LRU_INACTIVE_ANON);
    	active = mem_cgroup_get_lru_size(lruvec, LRU_ACTIVE_ANON);
    
    	gb = (inactive + active) >> (30 - PAGE_SHIFT);
    	if (gb)
    		inactive_ratio = int_sqrt(10 * gb);
    	else
    		inactive_ratio = 1;
    
    
	return inactive * inactive_ratio < active;
}

    int mem_cgroup_inactive_file_is_low(struct lruvec *lruvec)
    
    {
    	unsigned long active;
    	unsigned long inactive;
    
    
    	inactive = mem_cgroup_get_lru_size(lruvec, LRU_INACTIVE_FILE);
    	active = mem_cgroup_get_lru_size(lruvec, LRU_ACTIVE_FILE);
    
    
    	return (active > inactive);
    }
    
    
    #define mem_cgroup_from_res_counter(counter, member)	\
    	container_of(counter, struct mem_cgroup, member)
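
/*
 * Usage sketch (illustrative, not part of the original file): given a
 * struct res_counter pointer known to be embedded in a memcg (for example
 * a counter reported back by a failed charge), the owning memcg can be
 * recovered by naming the member it sits in. "fail_res" is a hypothetical
 * variable name:
 *
 *	struct res_counter *fail_res;
 *	struct mem_cgroup *memcg;
 *
 *	memcg = mem_cgroup_from_res_counter(fail_res, res);
 *	memcg = mem_cgroup_from_res_counter(fail_res, memsw);
 *
 * This is plain container_of(), so the member name must match the counter
 * that was actually passed around (res vs. memsw).
 */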
    
    
/**
 * mem_cgroup_margin - calculate chargeable space of a memory cgroup
 * @memcg: the memory cgroup
 *
 * Returns the maximum amount of memory @memcg can be charged with, in
 * pages.
 */
static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg)
{
	unsigned long long margin;

	margin = res_counter_margin(&memcg->res);
	if (do_swap_account)
		margin = min(margin, res_counter_margin(&memcg->memsw));
	return margin >> PAGE_SHIFT;
}

int mem_cgroup_swappiness(struct mem_cgroup *memcg)
{
    	struct cgroup *cgrp = memcg->css.cgroup;
    
    	/* root ? */
    	if (cgrp->parent == NULL)
    		return vm_swappiness;
    
    
	return memcg->swappiness;
}

    /*
 * memcg->moving_account is used to check the possibility that some thread
 * is calling move_account(). When a thread on CPU-A starts moving pages
 * under a memcg, other threads should check memcg->moving_account under
 * rcu_read_lock(), like this:
 *
 *         CPU-A                                    CPU-B
 *                                              rcu_read_lock()
 *         memcg->moving_account+1              if (memcg->moving_account)
     *                                                   take heavy locks.
     *         synchronize_rcu()                    update something.
     *                                              rcu_read_unlock()
     *         start move here.
     */
    
    
    /* for quick checking without looking up memcg */
    atomic_t memcg_moving __read_mostly;
    
    
static void mem_cgroup_start_move(struct mem_cgroup *memcg)
{
	atomic_inc(&memcg_moving);
	atomic_inc(&memcg->moving_account);
	synchronize_rcu();
}

static void mem_cgroup_end_move(struct mem_cgroup *memcg)
{
	/*
	 * Now, mem_cgroup_clear_mc() may call this function with NULL.
	 * We check for NULL in the callee rather than the caller.
	 */
	if (memcg) {
		atomic_dec(&memcg_moving);
		atomic_dec(&memcg->moving_account);
	}
}

/*
 * Two routines for checking whether "mem" is under move_account() or not.
 *
 * mem_cgroup_stolen() -  checking whether a cgroup is mc.from or not. This
 *			  is used for avoiding races in accounting.  If true,
 *			  pc->mem_cgroup may be overwritten.
 *
 * mem_cgroup_under_move() - checking a cgroup is mc.from or mc.to or
 *			  under hierarchy of moving cgroups. This is for
 *			  waiting at high-memory pressure caused by "move".
 */
    
    
static bool mem_cgroup_stolen(struct mem_cgroup *memcg)
{
	return atomic_read(&memcg->moving_account) > 0;
}

static bool mem_cgroup_under_move(struct mem_cgroup *memcg)
{
	struct mem_cgroup *from;
	struct mem_cgroup *to;
    	bool ret = false;
    
    	/*
    	 * Unlike task_move routines, we access mc.to, mc.from not under
    	 * mutual exclusion by cgroup_mutex. Here, we take spinlock instead.
    	 */
    	spin_lock(&mc.lock);
    	from = mc.from;
    	to = mc.to;
    	if (!from)
    		goto unlock;
    
    	ret = mem_cgroup_same_or_subtree(memcg, from)
    		|| mem_cgroup_same_or_subtree(memcg, to);
    
    unlock:
	spin_unlock(&mc.lock);
	return ret;
}

    static bool mem_cgroup_wait_acct_move(struct mem_cgroup *memcg)
    
    {
    	if (mc.moving_task && current != mc.moving_task) {
    
    		if (mem_cgroup_under_move(memcg)) {
    
    			DEFINE_WAIT(wait);
    			prepare_to_wait(&mc.waitq, &wait, TASK_INTERRUPTIBLE);
    			/* moving charge context might have finished. */
    			if (mc.moving_task)
    				schedule();
    			finish_wait(&mc.waitq, &wait);
    			return true;
    		}
    	}
    	return false;
    }
    
    
    /*
     * Take this lock when
 * - code tries to modify a page's memcg while it is USED.
 * - code tries to modify page state accounting in a memcg.
 * See mem_cgroup_stolen(), too.
     */
    static void move_lock_mem_cgroup(struct mem_cgroup *memcg,
    				  unsigned long *flags)
    {
    	spin_lock_irqsave(&memcg->move_lock, *flags);
    }
    
    static void move_unlock_mem_cgroup(struct mem_cgroup *memcg,
    				unsigned long *flags)
    {
    	spin_unlock_irqrestore(&memcg->move_lock, *flags);
    }
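
/*
 * Locking sketch (illustrative, not part of the original file): a writer
 * that rewrites pc->mem_cgroup while the page stays USED is expected to
 * bracket the update with the helpers above, so that the page-stat path
 * further down can take the same lock while a move is in flight. "from"
 * and "to" are placeholders for the source and destination memcgs of an
 * account move:
 *
 *	unsigned long flags;
 *
 *	move_lock_mem_cgroup(from, &flags);
 *	... transfer page statistics from "from" to "to",
 *	    then set pc->mem_cgroup = to ...
 *	move_unlock_mem_cgroup(from, &flags);
 */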
    
    
/**
 * mem_cgroup_print_oom_info: Called from OOM with tasklist_lock held in read mode.
     * @memcg: The memory cgroup that went over limit
     * @p: Task that is going to be killed
     *
     * NOTE: @memcg and @p's mem_cgroup can be different when hierarchy is
     * enabled
     */
    void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
    {
    	struct cgroup *task_cgrp;
    	struct cgroup *mem_cgrp;
    	/*
    	 * Need a buffer in BSS, can't rely on allocations. The code relies
    	 * on the assumption that OOM is serialized for memory controller.
    	 * If this assumption is broken, revisit this code.
    	 */
    	static char memcg_name[PATH_MAX];
    	int ret;
    
    
	if (!memcg || !p)
		return;
    
    	rcu_read_lock();
    
    	mem_cgrp = memcg->css.cgroup;
    	task_cgrp = task_cgroup(p, mem_cgroup_subsys_id);
    
    	ret = cgroup_path(task_cgrp, memcg_name, PATH_MAX);
    	if (ret < 0) {
    		/*
    		 * Unfortunately, we are unable to convert to a useful name
    		 * But we'll still print out the usage information
    		 */
    		rcu_read_unlock();
    		goto done;
    	}
    	rcu_read_unlock();
    
    	printk(KERN_INFO "Task in %s killed", memcg_name);
    
    	rcu_read_lock();
    	ret = cgroup_path(mem_cgrp, memcg_name, PATH_MAX);
    	if (ret < 0) {
    		rcu_read_unlock();
    		goto done;
    	}
    	rcu_read_unlock();
    
    	/*
	 * Continues from above, so we don't need a KERN_ level
    	 */
    	printk(KERN_CONT " as a result of limit of %s\n", memcg_name);
    done:
    
    	printk(KERN_INFO "memory: usage %llukB, limit %llukB, failcnt %llu\n",
    		res_counter_read_u64(&memcg->res, RES_USAGE) >> 10,
    		res_counter_read_u64(&memcg->res, RES_LIMIT) >> 10,
    		res_counter_read_u64(&memcg->res, RES_FAILCNT));
    	printk(KERN_INFO "memory+swap: usage %llukB, limit %llukB, "
    		"failcnt %llu\n",
    		res_counter_read_u64(&memcg->memsw, RES_USAGE) >> 10,
    		res_counter_read_u64(&memcg->memsw, RES_LIMIT) >> 10,
    		res_counter_read_u64(&memcg->memsw, RES_FAILCNT));
    }
    
    
    /*
 * This function returns the number of memcgs under the hierarchy tree.
 * Returns 1 (self count) if there are no children.
 */
static int mem_cgroup_count_children(struct mem_cgroup *memcg)
{
	int num = 0;
	struct mem_cgroup *iter;

	for_each_mem_cgroup_tree(iter, memcg)
		num++;
	return num;
}

    /*
     * Return the memory (and swap, if configured) limit for a memcg.
     */
    u64 mem_cgroup_get_limit(struct mem_cgroup *memcg)
    {
    	u64 limit;
    	u64 memsw;
    
    
    	limit = res_counter_read_u64(&memcg->res, RES_LIMIT);
    	limit += total_swap_pages << PAGE_SHIFT;
    
    
    	memsw = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
    	/*
    	 * If memsw is finite and limits the amount of swap space available
    	 * to this memcg, return that limit.
    	 */
    	return min(limit, memsw);
    }
    
    
    static unsigned long mem_cgroup_reclaim(struct mem_cgroup *memcg,
    					gfp_t gfp_mask,
    					unsigned long flags)
    {
    	unsigned long total = 0;
    	bool noswap = false;
    	int loop;
    
    	if (flags & MEM_CGROUP_RECLAIM_NOSWAP)
    		noswap = true;
    	if (!(flags & MEM_CGROUP_RECLAIM_SHRINK) && memcg->memsw_is_minimum)
    		noswap = true;
    
    	for (loop = 0; loop < MEM_CGROUP_MAX_RECLAIM_LOOPS; loop++) {
    		if (loop)
    			drain_all_stock_async(memcg);
    		total += try_to_free_mem_cgroup_pages(memcg, gfp_mask, noswap);
    		/*
    		 * Allow limit shrinkers, which are triggered directly
    		 * by userspace, to catch signals and stop reclaim
    		 * after minimal progress, regardless of the margin.
    		 */
    		if (total && (flags & MEM_CGROUP_RECLAIM_SHRINK))
    			break;
    		if (mem_cgroup_margin(memcg))
    			break;
    		/*
    		 * If nothing was reclaimed after two attempts, there
    		 * may be no reclaimable pages in this hierarchy.
    		 */
    		if (loop && !total)
    			break;
    	}
    	return total;
    }
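
/*
 * Usage sketch (illustrative, not part of the original file): a
 * limit-shrinking path, triggered directly by userspace, would pass the
 * SHRINK flag so reclaim stops after minimal progress, roughly:
 *
 *	progress = mem_cgroup_reclaim(memcg, GFP_KERNEL,
 *				      MEM_CGROUP_RECLAIM_SHRINK);
 *
 * whereas a limit-hit charge path omits the flag and keeps looping until
 * the margin check or the loop bound above stops it. The NOSWAP flag can
 * be OR'ed in to forbid swapping during the pass.
 */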
    
    
    /**
     * test_mem_cgroup_node_reclaimable
 * @memcg: the target memcg
 * @nid: the node ID to be checked.
 * @noswap: specify true here if the user wants file only information.
     *
     * This function returns whether the specified memcg contains any
     * reclaimable pages on a node. Returns true if there are any reclaimable
     * pages in the node.
     */
    
    static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *memcg,
    
    		int nid, bool noswap)
    {
    
    	if (mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL_FILE))
    
    		return true;
    	if (noswap || !total_swap_pages)
    		return false;
    
	if (mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL_ANON))
		return true;
	return false;
}

    #if MAX_NUMNODES > 1
    
    /*
     * Always updating the nodemask is not very good - even if we have an empty
     * list or the wrong list here, we can start from some node and traverse all
     * nodes based on the zonelist. So update the list loosely once per 10 secs.
     *
     */
    
static void mem_cgroup_may_update_nodemask(struct mem_cgroup *memcg)
{
	int nid;
	/*
	 * numainfo_events > 0 means there was at least NUMAINFO_EVENTS_TARGET
	 * pagein/pageout changes since the last update.
	 */
	if (!atomic_read(&memcg->numainfo_events))
		return;
	if (atomic_inc_return(&memcg->numainfo_updating) > 1)
		return;
    
    	/* make a nodemask where this memcg uses memory from */
    
    	memcg->scan_nodes = node_states[N_HIGH_MEMORY];
    
    
    	for_each_node_mask(nid, node_states[N_HIGH_MEMORY]) {
    
    
    		if (!test_mem_cgroup_node_reclaimable(memcg, nid, false))
    			node_clear(nid, memcg->scan_nodes);
    
	}

	atomic_set(&memcg->numainfo_events, 0);
	atomic_set(&memcg->numainfo_updating, 0);
    
    }
    
    /*
 * Selecting a node where we start reclaim from. Because what we need is just
 * reducing the usage counter, starting from anywhere is OK. Considering
 * memory reclaim from the current node, there are pros and cons:
 *
 * Freeing memory from the current node means freeing memory from a node which
 * we'll use or we've used. So, it may make the LRU bad. And if several threads
 * hit limits, they will see contention on a node. But freeing from a remote
 * node means more costs for memory reclaim because of memory latency.
 *
 * For now, we use round-robin. A better algorithm is welcome.
     */
    
int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
{
	int node;

	mem_cgroup_may_update_nodemask(memcg);
	node = memcg->last_scanned_node;
    
	node = next_node(node, memcg->scan_nodes);
	if (node == MAX_NUMNODES)
		node = first_node(memcg->scan_nodes);
    
    	/*
    	 * We call this when we hit limit, not when pages are added to LRU.
    	 * No LRU may hold pages because all pages are UNEVICTABLE or
    	 * memcg is too small and all pages are not on LRU. In that case,
	 * we use the current node.
    	 */
    	if (unlikely(node == MAX_NUMNODES))
    		node = numa_node_id();
    
    
	memcg->last_scanned_node = node;
	return node;
}

    /*
     * Check all nodes whether it contains reclaimable pages or not.
     * For quick scan, we make use of scan_nodes. This will allow us to skip
 * unused nodes. But scan_nodes is lazily updated and may not contain
     * enough new information. We need to do double check.
     */
    
    static bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap)
    
    {
    	int nid;
    
    	/*
    	 * quick check...making use of scan_node.
    	 * We can skip unused nodes.
    	 */
    
    	if (!nodes_empty(memcg->scan_nodes)) {
		for (nid = first_node(memcg->scan_nodes);
		     nid < MAX_NUMNODES;
		     nid = next_node(nid, memcg->scan_nodes)) {

			if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap))
				return true;
    		}
    	}
    	/*
    	 * Check rest of nodes.
    	 */
    	for_each_node_state(nid, N_HIGH_MEMORY) {
    
		if (node_isset(nid, memcg->scan_nodes))
			continue;
		if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap))
			return true;
	}
	return false;
}

#else
int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
{
	return 0;
}

static bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap)
{
	return test_mem_cgroup_node_reclaimable(memcg, 0, noswap);
}
#endif

static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg,
				   struct zone *zone,
				   gfp_t gfp_mask,
				   unsigned long *total_scanned)
{
	struct mem_cgroup *victim = NULL;
	int total = 0;
	int loop = 0;
	unsigned long excess;
	unsigned long nr_scanned;
	struct mem_cgroup_reclaim_cookie reclaim = {
		.zone = zone,
		.priority = 0,
	};

	excess = res_counter_soft_limit_excess(&root_memcg->res) >> PAGE_SHIFT;

	while (1) {
		victim = mem_cgroup_iter(root_memcg, victim, &reclaim);
		if (!victim) {
			loop++;
			if (loop >= 2) {
				/*
				 * If we have not been able to reclaim
				 * anything, it might be because there are
				 * no reclaimable pages under this hierarchy.
				 */
				if (!total)
					break;
				/*
				 * We want to do more targeted reclaim.
				 * excess >> 2 is not too excessive so as to
				 * reclaim too much, nor too little so that we
				 * keep coming back to reclaim from this cgroup.
				 */
				if (total >= (excess >> 2) ||
					(loop > MEM_CGROUP_MAX_RECLAIM_LOOPS))
					break;
			}
			continue;
		}
		if (!mem_cgroup_reclaimable(victim, false))
			continue;
		total += mem_cgroup_shrink_node_zone(victim, gfp_mask, false,
						     zone, &nr_scanned);
		*total_scanned += nr_scanned;
		if (!res_counter_soft_limit_excess(&root_memcg->res))
			break;
	}
	mem_cgroup_iter_break(root_memcg, victim);
	return total;
}

    
/*
 * Check whether the OOM killer is already running under our hierarchy.
 * If someone is running, return false.
 * Has to be called with memcg_oom_lock held.
 */
static bool mem_cgroup_oom_lock(struct mem_cgroup *memcg)
{
	struct mem_cgroup *iter, *failed = NULL;

	for_each_mem_cgroup_tree(iter, memcg) {
		if (iter->oom_lock) {
			/*
			 * this subtree of our hierarchy is already locked
			 * so we cannot give a lock.
			 */
			failed = iter;
			mem_cgroup_iter_break(memcg, iter);
			break;
		} else
			iter->oom_lock = true;
	}

	if (!failed)
		return true;

	/*
	 * OK, we failed to lock the whole subtree so we have to clean up
	 * what we set up to the failing subtree
	 */
	for_each_mem_cgroup_tree(iter, memcg) {
		if (iter == failed) {
			mem_cgroup_iter_break(memcg, iter);
			break;
		}
		iter->oom_lock = false;
	}
	return false;
}
    
/*
 * Has to be called with memcg_oom_lock held.
 */
static int mem_cgroup_oom_unlock(struct mem_cgroup *memcg)
{
	struct mem_cgroup *iter;

	for_each_mem_cgroup_tree(iter, memcg)
		iter->oom_lock = false;
	return 0;
}

static void mem_cgroup_mark_under_oom(struct mem_cgroup *memcg)
{
	struct mem_cgroup *iter;

	for_each_mem_cgroup_tree(iter, memcg)
		atomic_inc(&iter->under_oom);
}

static void mem_cgroup_unmark_under_oom(struct mem_cgroup *memcg)
{
	struct mem_cgroup *iter;

	/*
	 * When a new child is created while the hierarchy is under oom,
	 * mem_cgroup_oom_lock() may not be called. We have to use
	 * atomic_add_unless() here.
	 */
	for_each_mem_cgroup_tree(iter, memcg)
		atomic_add_unless(&iter->under_oom, -1, 0);
}

    static DEFINE_SPINLOCK(memcg_oom_lock);
    
    static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq);
    
    
    struct oom_wait_info {
    
    	struct mem_cgroup *memcg;
    
    	wait_queue_t	wait;
    };
    
    static int memcg_oom_wake_function(wait_queue_t *wait,
    	unsigned mode, int sync, void *arg)
    {
    
    	struct mem_cgroup *wake_memcg = (struct mem_cgroup *)arg;
    	struct mem_cgroup *oom_wait_memcg;
    
    	struct oom_wait_info *oom_wait_info;
    
    	oom_wait_info = container_of(wait, struct oom_wait_info, wait);
    
    	oom_wait_memcg = oom_wait_info->memcg;
    
	/*
	 * Both of oom_wait_info->memcg and wake_memcg are stable under us.
    
    	 * Then we can use css_is_ancestor without taking care of RCU.
    	 */
    
    	if (!mem_cgroup_same_or_subtree(oom_wait_memcg, wake_memcg)
    		&& !mem_cgroup_same_or_subtree(wake_memcg, oom_wait_memcg))
    
    		return 0;
    	return autoremove_wake_function(wait, mode, sync, arg);
    }
    
    
static void memcg_wakeup_oom(struct mem_cgroup *memcg)
{
	/* for filtering, pass "memcg" as argument. */
	__wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg);
}

static void memcg_oom_recover(struct mem_cgroup *memcg)
{
	if (memcg && atomic_read(&memcg->under_oom))
		memcg_wakeup_oom(memcg);
}

    /*
     * try to call OOM killer. returns false if we should exit memory-reclaim loop.
     */
    
    static bool mem_cgroup_handle_oom(struct mem_cgroup *memcg, gfp_t mask,
				  int order)
{
	struct oom_wait_info owait;
	bool locked, need_to_kill;
    
    	owait.memcg = memcg;
    
    	owait.wait.flags = 0;
    	owait.wait.func = memcg_oom_wake_function;
    	owait.wait.private = current;
    	INIT_LIST_HEAD(&owait.wait.task_list);
    
    	need_to_kill = true;
    
    	mem_cgroup_mark_under_oom(memcg);
    
    	/* At first, try to OOM lock hierarchy under memcg.*/
    
    	spin_lock(&memcg_oom_lock);
    
    	locked = mem_cgroup_oom_lock(memcg);
    
    	/*
    	 * Even if signal_pending(), we can't quit charge() loop without
    	 * accounting. So, UNINTERRUPTIBLE is appropriate. But SIGKILL
    	 * under OOM is always welcomed, use TASK_KILLABLE here.
    	 */
    
    	prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE);
    
	if (!locked || memcg->oom_kill_disable)
		need_to_kill = false;
	if (locked)
		mem_cgroup_oom_notify(memcg);
    
    	spin_unlock(&memcg_oom_lock);
    
    	if (need_to_kill) {
    		finish_wait(&memcg_oom_waitq, &owait.wait);
    
		mem_cgroup_out_of_memory(memcg, mask, order);
	} else {
		schedule();
		finish_wait(&memcg_oom_waitq, &owait.wait);
	}
	spin_lock(&memcg_oom_lock);
	if (locked)
		mem_cgroup_oom_unlock(memcg);
    	memcg_wakeup_oom(memcg);
    
    	spin_unlock(&memcg_oom_lock);
    
    	mem_cgroup_unmark_under_oom(memcg);
    
    	if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current))
    		return false;
    	/* Give chance to dying process */
    
    	schedule_timeout_uninterruptible(1);
    
	return true;
}

    /*
     * Currently used to update mapped file statistics, but the routine can be
     * generalized to update other statistics as well.
    
     *
     * Notes: Race condition
     *
 * We usually use page_cgroup_lock() for accessing page_cgroup member but
 * it tends to be costly. But considering some conditions, we don't need
 * to do so _always_.
 *
 * Considering "charge", lock_page_cgroup() is not required because all
 * file-stat operations happen after a page is attached to radix-tree. There
 * is no race with "charge".
 *
 * Considering "uncharge", we know that memcg doesn't clear pc->mem_cgroup
 * at "uncharge" intentionally. So, we always see valid pc->mem_cgroup even
 * if there is a race with "uncharge". Statistics itself is properly handled
 * by flags.
 *
 * Considering "move", this is the only case where we see a race. To make the
 * race small, we check memcg->moving_account and detect whether there is a
 * possibility of a race. If there is, we take a lock.
 */
    void __mem_cgroup_begin_update_page_stat(struct page *page,
    				bool *locked, unsigned long *flags)
    {
    	struct mem_cgroup *memcg;
    	struct page_cgroup *pc;
    
    	pc = lookup_page_cgroup(page);
    again:
    	memcg = pc->mem_cgroup;
    	if (unlikely(!memcg || !PageCgroupUsed(pc)))
    		return;
    	/*
    	 * If this memory cgroup is not under account moving, we don't
    	 * need to take move_lock_page_cgroup(). Because we already hold
    	 * rcu_read_lock(), any calls to move_account will be delayed until
    
	 * rcu_read_unlock() if mem_cgroup_stolen() == true.
	 */
	if (!mem_cgroup_stolen(memcg))
    
    		return;
    
    	move_lock_mem_cgroup(memcg, flags);
    	if (memcg != pc->mem_cgroup || !PageCgroupUsed(pc)) {
    		move_unlock_mem_cgroup(memcg, flags);
    		goto again;
    	}
    	*locked = true;
    }
    
    void __mem_cgroup_end_update_page_stat(struct page *page, unsigned long *flags)
    {
    	struct page_cgroup *pc = lookup_page_cgroup(page);
    
    	/*
	 * It's guaranteed that pc->mem_cgroup never changes while
	 * the lock is held, because any routine that modifies pc->mem_cgroup
	 * must take move_lock_mem_cgroup().
    	 */
    	move_unlock_mem_cgroup(pc->mem_cgroup, flags);
    }
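
/*
 * Caller-side sketch (illustrative, not part of the original file): the
 * begin/end pair above is meant to wrap a page-stat update, roughly:
 *
 *	bool locked = false;
 *	unsigned long flags;
 *
 *	rcu_read_lock();
 *	__mem_cgroup_begin_update_page_stat(page, &locked, &flags);
 *	mem_cgroup_update_page_stat(page, MEMCG_NR_FILE_MAPPED, 1);
 *	if (locked)
 *		__mem_cgroup_end_update_page_stat(page, &flags);
 *	rcu_read_unlock();
 *
 * The exact wrappers used by callers live outside this file; the point is
 * that the update runs under rcu_read_lock() and, when an account move is
 * possible, under move_lock_mem_cgroup() as well.
 */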
    
    
void mem_cgroup_update_page_stat(struct page *page,
				 enum mem_cgroup_page_stat_item idx, int val)
{
	struct mem_cgroup *memcg;
	struct page_cgroup *pc = lookup_page_cgroup(page);
	unsigned long uninitialized_var(flags);

	if (mem_cgroup_disabled())
		return;

	memcg = pc->mem_cgroup;
	if (unlikely(!memcg || !PageCgroupUsed(pc)))
		return;

	switch (idx) {
	case MEMCG_NR_FILE_MAPPED:
		idx = MEM_CGROUP_STAT_FILE_MAPPED;
		break;
	default:
		BUG();
	}

	this_cpu_add(memcg->stat->count[idx], val);
}
    
/*
 * size of first charge trial. "32" comes from vmscan.c's magic value.
 * TODO: maybe necessary to use big numbers in big irons.
 */
#define CHARGE_BATCH	32U
struct memcg_stock_pcp {
	struct mem_cgroup *cached; /* this is never the root cgroup */
	unsigned int nr_pages;
	struct work_struct work;
	unsigned long flags;
#define FLUSHING_CACHED_CHARGE	0
};
static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock);
static DEFINE_MUTEX(percpu_charge_mutex);
    
/*
 * Try to consume stocked charge on this cpu. On success, one page is consumed
 * from the local stock and true is returned. If the stock is 0 or holds
 * charges from a cgroup which is not the current target, false is returned.
 * This stock will be refilled.
 */
    
    static bool consume_stock(struct mem_cgroup *memcg)
    
    {
    	struct memcg_stock_pcp *stock;
    	bool ret = true;
    
    	stock = &get_cpu_var(memcg_stock);
    
	if (memcg == stock->cached && stock->nr_pages)
		stock->nr_pages--;
    	else /* need to call res_counter_charge */
    		ret = false;
    	put_cpu_var(memcg_stock);
    	return ret;
    }
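
/*
 * Caller-side sketch (illustrative, not part of the original file): the
 * charge path is expected to try the per-cpu stock first and fall back to
 * the res_counter only on a miss, roughly:
 *
 *	if (nr_pages == 1 && consume_stock(memcg))
 *		goto done;
 *	ret = res_counter_charge(&memcg->res, nr_pages * PAGE_SIZE, &fail_res);
 *
 * "done" and "fail_res" are placeholders for the caller's own label and
 * variable; the point is that consume_stock() only hands out single pages
 * that were previously stocked by a batched charge.
 */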
    
    /*
     * Returns stocks cached in percpu to res_counter and reset cached information.
     */
    static void drain_stock(struct memcg_stock_pcp *stock)
    {
    	struct mem_cgroup *old = stock->cached;
    
    
    	if (stock->nr_pages) {
    		unsigned long bytes = stock->nr_pages * PAGE_SIZE;
    
    		res_counter_uncharge(&old->res, bytes);
    
    		if (do_swap_account)