memcontrol.c
			mz = mem_cgroup_zoneinfo(root, nid, zid);
			iter = &mz->reclaim_iter[reclaim->priority];

			if (prev && reclaim->generation != iter->generation) {
				iter->last_visited = NULL;
				goto out_unlock;
			}

			last_visited = mem_cgroup_iter_load(iter, root, &seq);
		}

		memcg = __mem_cgroup_iter_next(root, last_visited);

		if (reclaim) {
			mem_cgroup_iter_update(iter, last_visited, memcg, seq);

			if (!memcg)
				iter->generation++;
			else if (!prev && memcg)
				reclaim->generation = iter->generation;
		}
    
    out_unlock:
    	rcu_read_unlock();
    
    out_css_put:
	if (prev && prev != root)
		css_put(&prev->css);

	return memcg;
}

    /**
     * mem_cgroup_iter_break - abort a hierarchy walk prematurely
     * @root: hierarchy root
     * @prev: last visited hierarchy member as returned by mem_cgroup_iter()
     */
void mem_cgroup_iter_break(struct mem_cgroup *root,
			   struct mem_cgroup *prev)
{
    	if (!root)
    		root = root_mem_cgroup;
    	if (prev && prev != root)
    		css_put(&prev->css);
    }
    
    /*
     * Iteration constructs for visiting all cgroups (under a tree).  If
     * loops are exited prematurely (break), mem_cgroup_iter_break() must
     * be used for reference counting.
     */
#define for_each_mem_cgroup_tree(iter, root)		\
	for (iter = mem_cgroup_iter(root, NULL, NULL);	\
	     iter != NULL;				\
	     iter = mem_cgroup_iter(root, iter, NULL))

#define for_each_mem_cgroup(iter)			\
	for (iter = mem_cgroup_iter(NULL, NULL, NULL);	\
	     iter != NULL;				\
	     iter = mem_cgroup_iter(NULL, iter, NULL))
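
/*
 * Usage sketch (illustrative only, not part of the original file): any
 * walk that leaves the loop early must hand the current position back
 * to mem_cgroup_iter_break() so the css reference taken by
 * mem_cgroup_iter() is dropped.  The helper name below is made up.
 */
static int __maybe_unused mem_cgroup_example_count_up_to(struct mem_cgroup *root,
							 int max)
{
	struct mem_cgroup *iter;
	int num = 0;

	for_each_mem_cgroup_tree(iter, root) {
		if (++num >= max) {
			/* early exit: drop the reference held on @iter */
			mem_cgroup_iter_break(root, iter);
			break;
		}
	}
	return num;
}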
    
void __mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx)
{
	struct mem_cgroup *memcg;

	rcu_read_lock();
	memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
	if (unlikely(!memcg))
		goto out;

	switch (idx) {
	case PGFAULT:
		this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGFAULT]);
		break;
	case PGMAJFAULT:
		this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGMAJFAULT]);
		break;
    	default:
    		BUG();
    	}
    out:
    	rcu_read_unlock();
    }
    
    EXPORT_SYMBOL(__mem_cgroup_count_vm_event);
    
/**
 * mem_cgroup_zone_lruvec - get the lru list vector for a zone and memcg
 * @zone: zone of the wanted lruvec
 * @memcg: memcg of the wanted lruvec
 *
 * Returns the lru list vector holding pages for the given @zone and
 * @memcg.  This can be the global zone lruvec, if the memory controller
 * is disabled.
 */
struct lruvec *mem_cgroup_zone_lruvec(struct zone *zone,
				      struct mem_cgroup *memcg)
{
	struct mem_cgroup_per_zone *mz;
	struct lruvec *lruvec;

	if (mem_cgroup_disabled()) {
		lruvec = &zone->lruvec;
		goto out;
	}

	mz = mem_cgroup_zoneinfo(memcg, zone_to_nid(zone), zone_idx(zone));
	lruvec = &mz->lruvec;
out:
	/*
	 * Since a node can be onlined after the mem_cgroup was created,
	 * we have to be prepared to initialize lruvec->zone here;
	 * and if offlined then reonlined, we need to reinitialize it.
	 */
	if (unlikely(lruvec->zone != zone))
		lruvec->zone = zone;
	return lruvec;
}

/*
 * The following LRU functions may be used without holding PCG_LOCK.
 * Operations are called by the global LRU routines independently of memcg.
 * What we have to take care of here is the validity of pc->mem_cgroup.
 *
 * Changes to pc->mem_cgroup happen when
 * 1. charge
 * 2. moving account
 * In the typical case, "charge" is done before add-to-lru. The exception is
 * SwapCache, which is added to the LRU before being charged.
 * If the PCG_USED bit is not set, the page_cgroup is not added to this
 * private LRU.
 * When moving account, the page is not on the LRU. It's isolated.
 */
    
/**
 * mem_cgroup_page_lruvec - return lruvec for adding an lru page
 * @page: the page
 * @zone: zone of the page
 */
struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct zone *zone)
{
    	struct mem_cgroup_per_zone *mz;
    
    	struct mem_cgroup *memcg;
    	struct page_cgroup *pc;
    
    	struct lruvec *lruvec;
    
    	if (mem_cgroup_disabled()) {
    		lruvec = &zone->lruvec;
    		goto out;
    	}
    
    	pc = lookup_page_cgroup(page);
    
    	memcg = pc->mem_cgroup;
    
	/*
	 * Surreptitiously switch any uncharged offlist page to root:
    	 * an uncharged page off lru does nothing to secure
    	 * its former mem_cgroup from sudden removal.
    	 *
    	 * Our caller holds lru_lock, and PageCgroupUsed is updated
    	 * under page_cgroup lock: between them, they make all uses
    	 * of pc->mem_cgroup safe.
    	 */
    
    	if (!PageLRU(page) && !PageCgroupUsed(pc) && memcg != root_mem_cgroup)
    
    		pc->mem_cgroup = memcg = root_mem_cgroup;
    
    
    	mz = page_cgroup_zoneinfo(memcg, page);
    
    	lruvec = &mz->lruvec;
    out:
    	/*
    	 * Since a node can be onlined after the mem_cgroup was created,
    	 * we have to be prepared to initialize lruvec->zone here;
    	 * and if offlined then reonlined, we need to reinitialize it.
    	 */
    	if (unlikely(lruvec->zone != zone))
    		lruvec->zone = zone;
	return lruvec;
}

/**
 * mem_cgroup_update_lru_size - account for adding or removing an lru page
 * @lruvec: mem_cgroup per zone lru vector
 * @lru: index of lru list the page is sitting on
 * @nr_pages: positive when adding or negative when removing
 *
 * This function must be called when a page is added to or removed from an
 * lru list.
 */
void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru,
				int nr_pages)
{
    	struct mem_cgroup_per_zone *mz;
    
    	unsigned long *lru_size;
    
    	mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec);
    	lru_size = mz->lru_size + lru;
    	*lru_size += nr_pages;
	VM_BUG_ON((long)(*lru_size) < 0);
}

/*
 * Checks whether given mem is same or in the root_mem_cgroup's
 * hierarchy subtree
 */
bool __mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg,
				  struct mem_cgroup *memcg)
{
	if (root_memcg == memcg)
		return true;
	if (!root_memcg->use_hierarchy || !memcg)
		return false;
	return css_is_ancestor(&memcg->css, &root_memcg->css);
    }
    
    static bool mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg,
    				       struct mem_cgroup *memcg)
    {
	bool ret;

	rcu_read_lock();
	ret = __mem_cgroup_same_or_subtree(root_memcg, memcg);
	rcu_read_unlock();

	return ret;
}

bool task_in_mem_cgroup(struct task_struct *task,
			const struct mem_cgroup *memcg)
{
	struct mem_cgroup *curr = NULL;
	struct task_struct *p;
	bool ret;

	p = find_lock_task_mm(task);
	if (p) {
		curr = try_get_mem_cgroup_from_mm(p->mm);
		task_unlock(p);
	} else {
		/*
		 * All threads may have already detached their mm's, but the oom
		 * killer still needs to detect if they have already been oom
		 * killed to prevent needlessly killing additional tasks.
		 */
		rcu_read_lock();
		curr = mem_cgroup_from_task(task);
		if (curr)
			css_get(&curr->css);
		rcu_read_unlock();
	}
	if (!curr)
		return false;
	/*
	 * We should check use_hierarchy of "memcg" not "curr". Because checking
	 * use_hierarchy of "curr" here makes this function return true if
	 * hierarchy is enabled in "curr" and "curr" is a child of "memcg" in
	 * the *cgroup* hierarchy (even if use_hierarchy is disabled in "memcg").
	 */
	ret = mem_cgroup_same_or_subtree(memcg, curr);
	css_put(&curr->css);
	return ret;
}

int mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec)
{
	unsigned long inactive_ratio;
	unsigned long inactive;
	unsigned long active;
	unsigned long gb;

	inactive = mem_cgroup_get_lru_size(lruvec, LRU_INACTIVE_ANON);
	active = mem_cgroup_get_lru_size(lruvec, LRU_ACTIVE_ANON);

	gb = (inactive + active) >> (30 - PAGE_SHIFT);
	if (gb)
		inactive_ratio = int_sqrt(10 * gb);
	else
		inactive_ratio = 1;

	return inactive * inactive_ratio < active;
}
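
/*
 * Worked numbers for the check above (illustrative, not from the original
 * file): with 10GB of anonymous memory, gb = 10 and inactive_ratio =
 * int_sqrt(100) = 10, so the inactive list is only considered low while it
 * holds less than roughly a tenth of what the active list holds; below 1GB
 * the ratio stays 1 and the two list sizes are compared directly.
 */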
    
    #define mem_cgroup_from_res_counter(counter, member)	\
    	container_of(counter, struct mem_cgroup, member)
    
    
/**
 * mem_cgroup_margin - calculate chargeable space of a memory cgroup
 * @memcg: the memory cgroup
 *
 * Returns the maximum amount of memory @memcg can be charged with, in
 * pages.
 */
static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg)
{
	unsigned long long margin;

	margin = res_counter_margin(&memcg->res);
	if (do_swap_account)
		margin = min(margin, res_counter_margin(&memcg->memsw));
	return margin >> PAGE_SHIFT;
}

int mem_cgroup_swappiness(struct mem_cgroup *memcg)
{
	/* root ? */
	if (!css_parent(&memcg->css))
		return vm_swappiness;

	return memcg->swappiness;
}

    /*
     * memcg->moving_account is used for checking possibility that some thread is
     * calling move_account(). When a thread on CPU-A starts moving pages under
     * a memcg, other threads should check memcg->moving_account under
     * rcu_read_lock(), like this:
     *
     *         CPU-A                                    CPU-B
     *                                              rcu_read_lock()
 *         memcg->moving_account+1              if (memcg->moving_account)
     *                                                   take heavy locks.
     *         synchronize_rcu()                    update something.
     *                                              rcu_read_unlock()
     *         start move here.
     */
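
/*
 * Illustrative reader-side sketch of the protocol above (not part of the
 * original file); the page-stat update paths do roughly the following,
 * using the helpers defined further down in this file:
 *
 *	rcu_read_lock();
 *	if (mem_cgroup_stolen(memcg)) {
 *		move_lock_mem_cgroup(memcg, &flags);
 *		locked = true;
 *	}
 *	... update pc->mem_cgroup based accounting ...
 *	if (locked)
 *		move_unlock_mem_cgroup(memcg, &flags);
 *	rcu_read_unlock();
 */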
    
    
    /* for quick checking without looking up memcg */
    atomic_t memcg_moving __read_mostly;
    
    
static void mem_cgroup_start_move(struct mem_cgroup *memcg)
{
	atomic_inc(&memcg_moving);
	atomic_inc(&memcg->moving_account);
	synchronize_rcu();
}

static void mem_cgroup_end_move(struct mem_cgroup *memcg)
{
	/*
	 * Now, mem_cgroup_clear_mc() may call this function with NULL.
	 * We check NULL in callee rather than caller.
	 */
	if (memcg) {
		atomic_dec(&memcg_moving);
		atomic_dec(&memcg->moving_account);
	}
}

    /*
     * 2 routines for checking "mem" is under move_account() or not.
     *
    
     * mem_cgroup_stolen() -  checking whether a cgroup is mc.from or not. This
     *			  is used for avoiding races in accounting.  If true,
    
     *			  pc->mem_cgroup may be overwritten.
     *
     * mem_cgroup_under_move() - checking a cgroup is mc.from or mc.to or
     *			  under hierarchy of moving cgroups. This is for
 *			  waiting at high memory pressure caused by "move".
     */
    
    
static bool mem_cgroup_stolen(struct mem_cgroup *memcg)
{
	return atomic_read(&memcg->moving_account) > 0;
}

static bool mem_cgroup_under_move(struct mem_cgroup *memcg)
{
    	struct mem_cgroup *from;
    	struct mem_cgroup *to;
    
    	bool ret = false;
    
    	/*
    	 * Unlike task_move routines, we access mc.to, mc.from not under
    	 * mutual exclusion by cgroup_mutex. Here, we take spinlock instead.
    	 */
    	spin_lock(&mc.lock);
    	from = mc.from;
    	to = mc.to;
    	if (!from)
    		goto unlock;
    
    	ret = mem_cgroup_same_or_subtree(memcg, from)
    		|| mem_cgroup_same_or_subtree(memcg, to);
    
    unlock:
	spin_unlock(&mc.lock);
	return ret;
}

    static bool mem_cgroup_wait_acct_move(struct mem_cgroup *memcg)
    
    {
    	if (mc.moving_task && current != mc.moving_task) {
    
    		if (mem_cgroup_under_move(memcg)) {
    
    			DEFINE_WAIT(wait);
    			prepare_to_wait(&mc.waitq, &wait, TASK_INTERRUPTIBLE);
    			/* moving charge context might have finished. */
    			if (mc.moving_task)
    				schedule();
    			finish_wait(&mc.waitq, &wait);
    			return true;
    		}
    	}
    	return false;
    }
    
    
/*
 * Take this lock when
 * - some code tries to modify a page's memcg while it's USED.
 * - some code tries to modify page state accounting in a memcg.
 *
 * see mem_cgroup_stolen(), too.
 */
    static void move_lock_mem_cgroup(struct mem_cgroup *memcg,
    				  unsigned long *flags)
    {
    	spin_lock_irqsave(&memcg->move_lock, *flags);
    }
    
    static void move_unlock_mem_cgroup(struct mem_cgroup *memcg,
    				unsigned long *flags)
    {
    	spin_unlock_irqrestore(&memcg->move_lock, *flags);
    }
    
    
#define K(x) ((x) << (PAGE_SHIFT-10))
/**
 * mem_cgroup_print_oom_info: Print OOM information relevant to memory controller.
     * @memcg: The memory cgroup that went over limit
     * @p: Task that is going to be killed
     *
     * NOTE: @memcg and @p's mem_cgroup can be different when hierarchy is
     * enabled
     */
    void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
    {
    	struct cgroup *task_cgrp;
    	struct cgroup *mem_cgrp;
    	/*
    	 * Need a buffer in BSS, can't rely on allocations. The code relies
    	 * on the assumption that OOM is serialized for memory controller.
    	 * If this assumption is broken, revisit this code.
    	 */
    	static char memcg_name[PATH_MAX];
    	int ret;
    
    	struct mem_cgroup *iter;
    	unsigned int i;
    
	if (!p)
		return;
    
    	rcu_read_lock();
    
    	mem_cgrp = memcg->css.cgroup;
    	task_cgrp = task_cgroup(p, mem_cgroup_subsys_id);
    
    	ret = cgroup_path(task_cgrp, memcg_name, PATH_MAX);
    	if (ret < 0) {
    		/*
    		 * Unfortunately, we are unable to convert to a useful name
    		 * But we'll still print out the usage information
    		 */
    		rcu_read_unlock();
    		goto done;
    	}
    	rcu_read_unlock();
    
    
    	pr_info("Task in %s killed", memcg_name);
    
    
    	rcu_read_lock();
    	ret = cgroup_path(mem_cgrp, memcg_name, PATH_MAX);
    	if (ret < 0) {
    		rcu_read_unlock();
    		goto done;
    	}
    	rcu_read_unlock();
    
	/*
	 * Continues from above, so we don't need a KERN_ level
	 */
    
    	pr_cont(" as a result of limit of %s\n", memcg_name);
    
    	pr_info("memory: usage %llukB, limit %llukB, failcnt %llu\n",
    
    		res_counter_read_u64(&memcg->res, RES_USAGE) >> 10,
    		res_counter_read_u64(&memcg->res, RES_LIMIT) >> 10,
    		res_counter_read_u64(&memcg->res, RES_FAILCNT));
    
    	pr_info("memory+swap: usage %llukB, limit %llukB, failcnt %llu\n",
    
    		res_counter_read_u64(&memcg->memsw, RES_USAGE) >> 10,
    		res_counter_read_u64(&memcg->memsw, RES_LIMIT) >> 10,
    		res_counter_read_u64(&memcg->memsw, RES_FAILCNT));
    
    	pr_info("kmem: usage %llukB, limit %llukB, failcnt %llu\n",
    
    		res_counter_read_u64(&memcg->kmem, RES_USAGE) >> 10,
    		res_counter_read_u64(&memcg->kmem, RES_LIMIT) >> 10,
    		res_counter_read_u64(&memcg->kmem, RES_FAILCNT));
    
    
    	for_each_mem_cgroup_tree(iter, memcg) {
    		pr_info("Memory cgroup stats");
    
    		rcu_read_lock();
    		ret = cgroup_path(iter->css.cgroup, memcg_name, PATH_MAX);
    		if (!ret)
    			pr_cont(" for %s", memcg_name);
    		rcu_read_unlock();
    		pr_cont(":");
    
    		for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
    			if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account)
    				continue;
    			pr_cont(" %s:%ldKB", mem_cgroup_stat_names[i],
    				K(mem_cgroup_read_stat(iter, i)));
    		}
    
    		for (i = 0; i < NR_LRU_LISTS; i++)
    			pr_cont(" %s:%luKB", mem_cgroup_lru_names[i],
    				K(mem_cgroup_nr_lru_pages(iter, BIT(i))));
    
    		pr_cont("\n");
	}
}

    /*
     * This function returns the number of memcg under hierarchy tree. Returns
     * 1(self count) if no children.
     */
    
static int mem_cgroup_count_children(struct mem_cgroup *memcg)
{
	int num = 0;
	struct mem_cgroup *iter;

	for_each_mem_cgroup_tree(iter, memcg)
		num++;
	return num;
}

    /*
     * Return the memory (and swap, if configured) limit for a memcg.
     */
    
static u64 mem_cgroup_get_limit(struct mem_cgroup *memcg)
{
	u64 limit;

	limit = res_counter_read_u64(&memcg->res, RES_LIMIT);

	/*
	 * Do not consider swap space if we cannot swap due to swappiness
	 */
	if (mem_cgroup_swappiness(memcg)) {
		u64 memsw;

		limit += total_swap_pages << PAGE_SHIFT;
		memsw = res_counter_read_u64(&memcg->memsw, RES_LIMIT);

		/*
		 * If memsw is finite and limits the amount of swap space
		 * available to this memcg, return that limit.
		 */
		limit = min(limit, memsw);
	}

	return limit;
}
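
/*
 * Worked example for the helper above (illustrative, not from the original
 * file): with a 1GB memory limit, non-zero swappiness, 4GB of total swap
 * and an 8GB memsw limit, limit grows to 1GB + 4GB = 5GB and
 * min(5GB, 8GB) = 5GB is returned; a tighter 2GB memsw limit would cap the
 * result at 2GB instead.
 */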
    
    static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
    				     int order)
    
    {
    	struct mem_cgroup *iter;
    	unsigned long chosen_points = 0;
    	unsigned long totalpages;
    	unsigned int points = 0;
    	struct task_struct *chosen = NULL;
    
    
	/*
	 * If current has a pending SIGKILL or is exiting, then automatically
	 * select it.  The goal is to allow it to allocate so that it may
	 * quickly exit and free its memory.
	 */
	if (fatal_signal_pending(current) || current->flags & PF_EXITING) {
    
    		set_thread_flag(TIF_MEMDIE);
    		return;
    	}
    
    	check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, order, NULL);
    
    	totalpages = mem_cgroup_get_limit(memcg) >> PAGE_SHIFT ? : 1;
	for_each_mem_cgroup_tree(iter, memcg) {
		struct css_task_iter it;
		struct task_struct *task;

    		css_task_iter_start(&iter->css, &it);
    		while ((task = css_task_iter_next(&it))) {
    
    			switch (oom_scan_process_thread(task, totalpages, NULL,
    							false)) {
    			case OOM_SCAN_SELECT:
    				if (chosen)
    					put_task_struct(chosen);
    				chosen = task;
    				chosen_points = ULONG_MAX;
    				get_task_struct(chosen);
    				/* fall through */
    			case OOM_SCAN_CONTINUE:
    				continue;
    			case OOM_SCAN_ABORT:
    
    				mem_cgroup_iter_break(memcg, iter);
    				if (chosen)
    					put_task_struct(chosen);
    				return;
    			case OOM_SCAN_OK:
    				break;
    			};
    			points = oom_badness(task, memcg, NULL, totalpages);
    			if (points > chosen_points) {
    				if (chosen)
    					put_task_struct(chosen);
    				chosen = task;
    				chosen_points = points;
    				get_task_struct(chosen);
    			}
		}
		css_task_iter_end(&it);
    	}
    
    	if (!chosen)
    		return;
    	points = chosen_points * 1000 / totalpages;
    	oom_kill_process(chosen, gfp_mask, order, points, totalpages, memcg,
    			 NULL, "Memory cgroup out of memory");
    }
    
    
    static unsigned long mem_cgroup_reclaim(struct mem_cgroup *memcg,
    					gfp_t gfp_mask,
    					unsigned long flags)
    {
    	unsigned long total = 0;
    	bool noswap = false;
    	int loop;
    
    	if (flags & MEM_CGROUP_RECLAIM_NOSWAP)
    		noswap = true;
    	if (!(flags & MEM_CGROUP_RECLAIM_SHRINK) && memcg->memsw_is_minimum)
    		noswap = true;
    
    	for (loop = 0; loop < MEM_CGROUP_MAX_RECLAIM_LOOPS; loop++) {
    		if (loop)
    			drain_all_stock_async(memcg);
    		total += try_to_free_mem_cgroup_pages(memcg, gfp_mask, noswap);
    		/*
    		 * Allow limit shrinkers, which are triggered directly
    		 * by userspace, to catch signals and stop reclaim
    		 * after minimal progress, regardless of the margin.
    		 */
    		if (total && (flags & MEM_CGROUP_RECLAIM_SHRINK))
    			break;
    		if (mem_cgroup_margin(memcg))
    			break;
    		/*
    		 * If nothing was reclaimed after two attempts, there
    		 * may be no reclaimable pages in this hierarchy.
    		 */
    		if (loop && !total)
    			break;
    	}
    	return total;
    }
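
/*
 * Note (illustrative, not from the original file): the limit-resizing
 * paths pass MEM_CGROUP_RECLAIM_SHRINK, so the loop above stops after the
 * first round that makes any progress and lets userspace catch signals;
 * charge-path callers leave the flag clear and keep retrying until the
 * margin check succeeds or MEM_CGROUP_MAX_RECLAIM_LOOPS is exhausted.
 */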
    
    
    #if MAX_NUMNODES > 1
    
/**
 * test_mem_cgroup_node_reclaimable
 * @memcg: the target memcg
 * @nid: the node ID to be checked.
 * @noswap: specify true here if the user wants file only information.
 *
 * This function returns whether the specified memcg contains any
 * reclaimable pages on a node. Returns true if there are any reclaimable
 * pages in the node.
 */
    
    static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *memcg,
    
    		int nid, bool noswap)
    {
    
    	if (mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL_FILE))
    
    		return true;
    	if (noswap || !total_swap_pages)
    		return false;
    
	if (mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL_ANON))
		return true;
	return false;
}

    /*
     * Always updating the nodemask is not very good - even if we have an empty
     * list or the wrong list here, we can start from some node and traverse all
     * nodes based on the zonelist. So update the list loosely once per 10 secs.
     *
     */
    
static void mem_cgroup_may_update_nodemask(struct mem_cgroup *memcg)
{
	int nid;
	/*
	 * numainfo_events > 0 means there was at least NUMAINFO_EVENTS_TARGET
	 * pagein/pageout changes since the last update.
	 */
	if (!atomic_read(&memcg->numainfo_events))
		return;
    	if (atomic_inc_return(&memcg->numainfo_updating) > 1)
    
    		return;
    
    	/* make a nodemask where this memcg uses memory from */
    
    	memcg->scan_nodes = node_states[N_MEMORY];
    
    	for_each_node_mask(nid, node_states[N_MEMORY]) {
    
    		if (!test_mem_cgroup_node_reclaimable(memcg, nid, false))
			node_clear(nid, memcg->scan_nodes);
	}

    	atomic_set(&memcg->numainfo_events, 0);
    	atomic_set(&memcg->numainfo_updating, 0);
    
    }
    
    /*
     * Selecting a node where we start reclaim from. Because what we need is just
     * reducing usage counter, start from anywhere is O,K. Considering
     * memory reclaim from current node, there are pros. and cons.
     *
     * Freeing memory from current node means freeing memory from a node which
     * we'll use or we've used. So, it may make LRU bad. And if several threads
     * hit limits, it will see a contention on a node. But freeing from remote
     * node means more costs for memory reclaim because of memory latency.
     *
     * Now, we use round-robin. Better algorithm is welcomed.
     */
    
int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
{
	int node;

	mem_cgroup_may_update_nodemask(memcg);
	node = memcg->last_scanned_node;

    	node = next_node(node, memcg->scan_nodes);
    
    	if (node == MAX_NUMNODES)
    
    		node = first_node(memcg->scan_nodes);
    
    	/*
    	 * We call this when we hit limit, not when pages are added to LRU.
    	 * No LRU may hold pages because all pages are UNEVICTABLE or
    	 * memcg is too small and all pages are not on LRU. In that case,
	 * we use the current node.
    	 */
    	if (unlikely(node == MAX_NUMNODES))
    		node = numa_node_id();
    
    
	memcg->last_scanned_node = node;
	return node;
}
#else
int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
{
	return 0;
}
#endif

/*
 * A group is eligible for the soft limit reclaim if
 *	a) it is over its soft limit
 *	b) any parent up the hierarchy is over its soft limit
 */
bool mem_cgroup_soft_reclaim_eligible(struct mem_cgroup *memcg)
{
	struct mem_cgroup *parent = memcg;

	if (res_counter_soft_limit_excess(&memcg->res))
		return true;

	/*
	 * If any parent up the hierarchy is over its soft limit then we
	 * have to obey and reclaim from this group as well.
	 */
	while ((parent = parent_mem_cgroup(parent))) {
		if (res_counter_soft_limit_excess(&parent->res))
			return true;
	}

	return false;
}

    static DEFINE_SPINLOCK(memcg_oom_lock);
    
    
    /*
     * Check OOM-Killer is already running under our hierarchy.
     * If someone is running, return false.
     */
    
static bool mem_cgroup_oom_trylock(struct mem_cgroup *memcg)
{
	struct mem_cgroup *iter, *failed = NULL;

	spin_lock(&memcg_oom_lock);

    	for_each_mem_cgroup_tree(iter, memcg) {
    
    		if (iter->oom_lock) {
    
    			/*
    			 * this subtree of our hierarchy is already locked
    			 * so we cannot give a lock.
    			 */
    			failed = iter;
    
    			mem_cgroup_iter_break(memcg, iter);
    			break;
    
    		} else
			iter->oom_lock = true;
	}

    	if (failed) {
    		/*
    		 * OK, we failed to lock the whole subtree so we have
    		 * to clean up what we set up to the failing subtree
    		 */
    		for_each_mem_cgroup_tree(iter, memcg) {
    			if (iter == failed) {
    				mem_cgroup_iter_break(memcg, iter);
    				break;
    			}
			iter->oom_lock = false;
		}
	}

    	spin_unlock(&memcg_oom_lock);
    
	return !failed;
}

static void mem_cgroup_oom_unlock(struct mem_cgroup *memcg)
{
    	struct mem_cgroup *iter;
    
    
    	spin_lock(&memcg_oom_lock);
    
    	for_each_mem_cgroup_tree(iter, memcg)
    
    		iter->oom_lock = false;
    
	spin_unlock(&memcg_oom_lock);
}

static void mem_cgroup_mark_under_oom(struct mem_cgroup *memcg)
{
	struct mem_cgroup *iter;

	for_each_mem_cgroup_tree(iter, memcg)
		atomic_inc(&iter->under_oom);
}

static void mem_cgroup_unmark_under_oom(struct mem_cgroup *memcg)
{
	struct mem_cgroup *iter;

	/*
	 * When a new child is created while the hierarchy is under oom,
	 * mem_cgroup_oom_lock() may not be called. We have to use
	 * atomic_add_unless() here.
	 */
	for_each_mem_cgroup_tree(iter, memcg)
		atomic_add_unless(&iter->under_oom, -1, 0);
}

    static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq);
    
    
    struct oom_wait_info {
    
    	struct mem_cgroup *memcg;
    
    	wait_queue_t	wait;
    };
    
    static int memcg_oom_wake_function(wait_queue_t *wait,
    	unsigned mode, int sync, void *arg)
    {
    
    	struct mem_cgroup *wake_memcg = (struct mem_cgroup *)arg;
    	struct mem_cgroup *oom_wait_memcg;
    
    	struct oom_wait_info *oom_wait_info;
    
    	oom_wait_info = container_of(wait, struct oom_wait_info, wait);
    
	oom_wait_memcg = oom_wait_info->memcg;

	/*
	 * Both of oom_wait_info->memcg and wake_memcg are stable under us.
    
    	 * Then we can use css_is_ancestor without taking care of RCU.
    	 */
    
    	if (!mem_cgroup_same_or_subtree(oom_wait_memcg, wake_memcg)
    		&& !mem_cgroup_same_or_subtree(wake_memcg, oom_wait_memcg))
    
    		return 0;
    	return autoremove_wake_function(wait, mode, sync, arg);
    }
    
    
static void memcg_wakeup_oom(struct mem_cgroup *memcg)
{
	atomic_inc(&memcg->oom_wakeups);
	/* for filtering, pass "memcg" as argument. */
	__wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg);
}

static void memcg_oom_recover(struct mem_cgroup *memcg)
{
	if (memcg && atomic_read(&memcg->under_oom))
		memcg_wakeup_oom(memcg);
}

static void mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order)
{
	bool locked;
	int wakeups;

    	if (!current->memcg_oom.may_oom)
    		return;
    
	current->memcg_oom.in_memcg_oom = 1;

	/*
	 * As with any blocking lock, a contender needs to start
	 * listening for wakeups before attempting the trylock,
	 * otherwise it can miss the wakeup from the unlock and sleep
	 * indefinitely.  This is just open-coded because our locking
	 * is so particular to memcg hierarchies.
	 */
    	wakeups = atomic_read(&memcg->oom_wakeups);
    
    	mem_cgroup_mark_under_oom(memcg);
    
	locked = mem_cgroup_oom_trylock(memcg);

	if (locked)
		mem_cgroup_oom_notify(memcg);
    
    	if (locked && !memcg->oom_kill_disable) {
    		mem_cgroup_unmark_under_oom(memcg);
    
    		mem_cgroup_out_of_memory(memcg, mask, order);
    
    		mem_cgroup_oom_unlock(memcg);
    		/*
    		 * There is no guarantee that an OOM-lock contender
    		 * sees the wakeups triggered by the OOM kill
		 * uncharges.  Wake any sleepers explicitly.
    		 */
		memcg_oom_recover(memcg);
	} else {
    		/*
    		 * A system call can just return -ENOMEM, but if this
    		 * is a page fault and somebody else is handling the
    		 * OOM already, we need to sleep on the OOM waitqueue
    		 * for this memcg until the situation is resolved.
    		 * Which can take some time because it might be
    		 * handled by a userspace task.
    		 *
    		 * However, this is the charge context, which means
    		 * that we may sit on a large call stack and hold
    		 * various filesystem locks, the mmap_sem etc. and we
    		 * don't want the OOM handler to deadlock on them
    		 * while we sit here and wait.  Store the current OOM
    		 * context in the task_struct, then return -ENOMEM.
    		 * At the end of the page fault handler, with the
    		 * stack unwound, pagefault_out_of_memory() will check
    		 * back with us by calling
    		 * mem_cgroup_oom_synchronize(), possibly putting the
    		 * task to sleep.
    		 */
    		current->memcg_oom.oom_locked = locked;
    		current->memcg_oom.wakeups = wakeups;
    		css_get(&memcg->css);
		current->memcg_oom.wait_on_memcg = memcg;
	}
    }
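
/*
 * Illustrative call flow (not part of the original file): the charge path
 * only records the OOM state via mem_cgroup_oom() and fails the charge;
 * the fault path finishes the handling once the stack is unwound:
 *
 *	handle_mm_fault()
 *	  mem_cgroup_charge_*()        -> mem_cgroup_oom(memcg, ...),
 *	  ...                             charge fails, VM_FAULT_OOM
 *	pagefault_out_of_memory()
 *	  mem_cgroup_oom_synchronize() -> sleeps on memcg_oom_waitq until the
 *	                                  situation is resolved, then clears
 *	                                  the per-task OOM state
 */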
    
    /**
     * mem_cgroup_oom_synchronize - complete memcg OOM handling
     *
 * This has to be called at the end of a page fault if the memcg
     * OOM handler was enabled and the fault is returning %VM_FAULT_OOM.
     *
     * Memcg supports userspace OOM handling, so failed allocations must
     * sleep on a waitqueue until the userspace task resolves the
     * situation.  Sleeping directly in the charge context with all kinds
     * of locks held is not a good idea, instead we remember an OOM state
     * in the task and mem_cgroup_oom_synchronize() has to be called at
     * the end of the page fault to put the task to sleep and clean up the
     * OOM state.
     *
     * Returns %true if an ongoing memcg OOM situation was detected and
     * finalized, %false otherwise.
     */
    bool mem_cgroup_oom_synchronize(void)
    {
    	struct oom_wait_info owait;
    	struct mem_cgroup *memcg;
    
    	/* OOM is global, do not handle */
    	if (!current->memcg_oom.in_memcg_oom)
    		return false;
    
    	/*
    	 * We invoked the OOM killer but there is a chance that a kill
    	 * did not free up any charges.  Everybody else might already
    	 * be sleeping, so restart the fault and keep the rampage
    	 * going until some charges are released.
    	 */
    	memcg = current->memcg_oom.wait_on_memcg;
    	if (!memcg)