/*
 * Writeback is about to end against a page which has been marked for immediate
 * reclaim.  If it still appears to be reclaimable, move it to the tail of the
     * inactive list.
     */
    void mem_cgroup_rotate_reclaimable_page(struct page *page)
    {
    	struct mem_cgroup_per_zone *mz;
    	struct page_cgroup *pc;
    	enum lru_list lru = page_lru(page);
    
    	if (mem_cgroup_disabled())
    		return;
    
    	pc = lookup_page_cgroup(page);
    	/* unused or root page is not rotated. */
    	if (!PageCgroupUsed(pc))
    		return;
    	/* Ensure pc->mem_cgroup is visible after reading PCG_USED. */
    	smp_rmb();
    	if (mem_cgroup_is_root(pc->mem_cgroup))
    		return;
    
    	mz = page_cgroup_zoneinfo(pc->mem_cgroup, page);
    
    	list_move_tail(&pc->lru, &mz->lists[lru]);
    }
    
    
    void mem_cgroup_rotate_lru_list(struct page *page, enum lru_list lru)
    {
    	struct mem_cgroup_per_zone *mz;
    	struct page_cgroup *pc;
    
	if (mem_cgroup_disabled())
		return;

	pc = lookup_page_cgroup(page);
	/* unused or root page is not rotated. */
	if (!PageCgroupUsed(pc))
		return;
	/* Ensure pc->mem_cgroup is visible after reading PCG_USED. */
	smp_rmb();
	if (mem_cgroup_is_root(pc->mem_cgroup))
		return;
	mz = page_cgroup_zoneinfo(pc->mem_cgroup, page);
	list_move(&pc->lru, &mz->lists[lru]);
}

void mem_cgroup_add_lru_list(struct page *page, enum lru_list lru)
{
    	struct page_cgroup *pc;
    	struct mem_cgroup_per_zone *mz;
    
	if (mem_cgroup_disabled())
		return;
    	pc = lookup_page_cgroup(page);
    
    	VM_BUG_ON(PageCgroupAcctLRU(pc));
    
    	/*
    	 * putback:				charge:
    	 * SetPageLRU				SetPageCgroupUsed
    	 * smp_mb				smp_mb
    	 * PageCgroupUsed && add to memcg LRU	PageLRU && add to memcg LRU
    	 *
    	 * Ensure that one of the two sides adds the page to the memcg
    	 * LRU during a race.
    	 */
    	smp_mb();
    
	if (!PageCgroupUsed(pc))
		return;

    	/* Ensure pc->mem_cgroup is visible after reading PCG_USED. */
    	smp_rmb();
    
    	mz = page_cgroup_zoneinfo(pc->mem_cgroup, page);
    
    	/* huge page split is done under lru_lock. so, we have no races. */
    	MEM_CGROUP_ZSTAT(mz, lru) += 1 << compound_order(page);
    
    	SetPageCgroupAcctLRU(pc);
    	if (mem_cgroup_is_root(pc->mem_cgroup))
    		return;
    
    	list_add(&pc->lru, &mz->lists[lru]);
    }
    
/*
 * At handling SwapCache and other FUSE stuff, pc->mem_cgroup may be changed
 * while it's linked to lru because the page may be reused after it's fully
 * uncharged. To handle that, unlink page_cgroup from LRU when charge it again.
 * It's done under lock_page and expected that zone->lru_lock is never held.
 */
static void mem_cgroup_lru_del_before_commit(struct page *page)
{
    	unsigned long flags;
    	struct zone *zone = page_zone(page);
    	struct page_cgroup *pc = lookup_page_cgroup(page);
    
    
    	/*
	 * Doing this check without taking ->lru_lock seems wrong, but it is
	 * safe, because if page_cgroup's USED bit is unset, the page
	 * will not be added to any memcg's LRU. If page_cgroup's USED bit is
	 * set, the commit after this will fail anyway.
	 * All of this charge/uncharge is done under some mutual exclusion,
	 * so we don't need to take care of changes in the USED bit.
    	 */
    	if (likely(!PageLRU(page)))
    		return;
    
    
    	spin_lock_irqsave(&zone->lru_lock, flags);
    	/*
    	 * Forget old LRU when this page_cgroup is *not* used. This Used bit
    	 * is guarded by lock_page() because the page is SwapCache.
    	 */
    	if (!PageCgroupUsed(pc))
    		mem_cgroup_del_lru_list(page, page_lru(page));
	spin_unlock_irqrestore(&zone->lru_lock, flags);
}

static void mem_cgroup_lru_add_after_commit(struct page *page)
{
    	unsigned long flags;
    	struct zone *zone = page_zone(page);
    	struct page_cgroup *pc = lookup_page_cgroup(page);
    
    	/*
    	 * putback:				charge:
    	 * SetPageLRU				SetPageCgroupUsed
    	 * smp_mb				smp_mb
    	 * PageCgroupUsed && add to memcg LRU	PageLRU && add to memcg LRU
    	 *
    	 * Ensure that one of the two sides adds the page to the memcg
    	 * LRU during a race.
    	 */
    	smp_mb();
    
	/* take care of the case where the page is added to LRU while we commit it */
    	if (likely(!PageLRU(page)))
    		return;
    
    	spin_lock_irqsave(&zone->lru_lock, flags);
    	/* link when the page is linked to LRU but page_cgroup isn't */
    
	if (PageLRU(page) && !PageCgroupAcctLRU(pc))
    		mem_cgroup_add_lru_list(page, page_lru(page));
    	spin_unlock_irqrestore(&zone->lru_lock, flags);
    }
    
    
    
    void mem_cgroup_move_lists(struct page *page,
    			   enum lru_list from, enum lru_list to)
    {
    
	if (mem_cgroup_disabled())
		return;
    	mem_cgroup_del_lru_list(page, from);
	mem_cgroup_add_lru_list(page, to);
}

/*
 * Checks whether given mem is same or in the root_mem_cgroup's
 * hierarchy subtree
 */
static bool mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg,
		struct mem_cgroup *memcg)
{
    	if (root_memcg != memcg) {
    		return (root_memcg->use_hierarchy &&
			css_is_ancestor(&memcg->css, &root_memcg->css));
	}

	return true;
}

int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *memcg)
{
	int ret;
	struct mem_cgroup *curr = NULL;
	struct task_struct *p;

    	p = find_lock_task_mm(task);
    	if (!p)
    		return 0;
    	curr = try_get_mem_cgroup_from_mm(p->mm);
    	task_unlock(p);
    
    	if (!curr)
    		return 0;
    
	/*
	 * We should check use_hierarchy of "memcg" not "curr". Checking
	 * use_hierarchy of "curr" here would make this function return true if
	 * hierarchy is enabled in "curr" and "curr" is a child of "memcg" in
	 * the *cgroup* hierarchy (even if use_hierarchy is disabled in "memcg").
	 */
    	ret = mem_cgroup_same_or_subtree(memcg, curr);
    
	css_put(&curr->css);
	return ret;
}

int mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg, struct zone *zone)
{
    	unsigned long inactive_ratio;
    	int nid = zone_to_nid(zone);
    	int zid = zone_idx(zone);
    
	unsigned long inactive;
	unsigned long active;
    	unsigned long gb;
    
    	inactive = mem_cgroup_zone_nr_lru_pages(memcg, nid, zid,
    						BIT(LRU_INACTIVE_ANON));
    	active = mem_cgroup_zone_nr_lru_pages(memcg, nid, zid,
    					      BIT(LRU_ACTIVE_ANON));
    
    	gb = (inactive + active) >> (30 - PAGE_SHIFT);
    	if (gb)
    		inactive_ratio = int_sqrt(10 * gb);
    	else
    		inactive_ratio = 1;
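	/*
	 * Worked example (illustrative): with 4GB of anon pages in this
	 * memcg/zone, gb == 4 and inactive_ratio == int_sqrt(40) == 6, so
	 * the inactive list is considered low once inactive * 6 < active,
	 * i.e. once it falls below roughly 1/6th of the active list.
	 */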
    
    
	return inactive * inactive_ratio < active;
}

    int mem_cgroup_inactive_file_is_low(struct mem_cgroup *memcg, struct zone *zone)
    
    {
    	unsigned long active;
    	unsigned long inactive;
    
    	int zid = zone_idx(zone);
    	int nid = zone_to_nid(zone);
    
    	inactive = mem_cgroup_zone_nr_lru_pages(memcg, nid, zid,
    						BIT(LRU_INACTIVE_FILE));
    	active = mem_cgroup_zone_nr_lru_pages(memcg, nid, zid,
    					      BIT(LRU_ACTIVE_FILE));
    
    
    	return (active > inactive);
    }
    
    
    struct zone_reclaim_stat *mem_cgroup_get_reclaim_stat(struct mem_cgroup *memcg,
    						      struct zone *zone)
    {
    
    	int zid = zone_idx(zone);
    	struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(memcg, nid, zid);
    
    	return &mz->reclaim_stat;
    }
    
    struct zone_reclaim_stat *
    mem_cgroup_get_reclaim_stat_from_page(struct page *page)
    {
    	struct page_cgroup *pc;
    	struct mem_cgroup_per_zone *mz;
    
    	if (mem_cgroup_disabled())
    		return NULL;
    
    	pc = lookup_page_cgroup(page);
    
    	if (!PageCgroupUsed(pc))
    		return NULL;
    
    	/* Ensure pc->mem_cgroup is visible after reading PCG_USED. */
    	smp_rmb();
    
    	mz = page_cgroup_zoneinfo(pc->mem_cgroup, page);
    
    	return &mz->reclaim_stat;
    }
    
    
    unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
    					struct list_head *dst,
					unsigned long *scanned, int order,
					isolate_mode_t mode,
					struct zone *z,
					struct mem_cgroup *mem_cont,
					int active, int file)
    {
    	unsigned long nr_taken = 0;
    	struct page *page;
    	unsigned long scan;
    	LIST_HEAD(pc_list);
	struct list_head *src;
	struct page_cgroup *pc, *tmp;
	int nid = zone_to_nid(z);
	int zid = zone_idx(z);
    	struct mem_cgroup_per_zone *mz;
    
	int lru = LRU_FILE * file + active;
	int ret;

    	BUG_ON(!mem_cont);
    
    	mz = mem_cgroup_zoneinfo(mem_cont, nid, zid);
    
    	src = &mz->lists[lru];
    
    	scan = 0;
    	list_for_each_entry_safe_reverse(pc, tmp, src, lru) {
    
		if (scan >= nr_to_scan)
			break;

    		if (unlikely(!PageCgroupUsed(pc)))
    			continue;
    
    		page = lookup_cgroup_page(pc);
    
		if (unlikely(!PageLRU(page)))
			continue;

		scan++;
    		ret = __isolate_lru_page(page, mode, file);
    		switch (ret) {
    		case 0:
    
    			list_move(&page->lru, dst);
    
    			mem_cgroup_del_lru(page);
    
    			nr_taken += hpage_nr_pages(page);
    
    			break;
    		case -EBUSY:
    			/* we don't affect global LRU but rotate in our LRU */
    			mem_cgroup_rotate_lru_list(page, page_lru(page));
    			break;
    		default:
			break;
		}
	}

	*scanned = scan;

	trace_mm_vmscan_memcg_isolate(0, nr_to_scan, scan, nr_taken,
				      0, 0, 0, mode);

	return nr_taken;
}

    #define mem_cgroup_from_res_counter(counter, member)	\
    	container_of(counter, struct mem_cgroup, member)
    
    
/**
 * mem_cgroup_margin - calculate chargeable space of a memory cgroup
 * @mem: the memory cgroup
 *
 * Returns the maximum amount of memory @mem can be charged with, in
 * pages.
 */
static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg)
{
    	unsigned long long margin;
    
    
	margin = res_counter_margin(&memcg->res);
	if (do_swap_account)
		margin = min(margin, res_counter_margin(&memcg->memsw));
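	/*
	 * e.g. a 100MB res limit with 60MB used gives a 40MB margin; when
	 * swap accounting is in use we take the smaller of the two margins
	 * before converting bytes to pages below.
	 */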
    
	return margin >> PAGE_SHIFT;
}

int mem_cgroup_swappiness(struct mem_cgroup *memcg)
    {
    	struct cgroup *cgrp = memcg->css.cgroup;
    
    	/* root ? */
    	if (cgrp->parent == NULL)
    		return vm_swappiness;
    
    
	return memcg->swappiness;
}

static void mem_cgroup_start_move(struct mem_cgroup *memcg)
{
	int cpu;

	get_online_cpus();
	spin_lock(&memcg->pcp_counter_lock);
	for_each_online_cpu(cpu)
		per_cpu(memcg->stat->count[MEM_CGROUP_ON_MOVE], cpu) += 1;
	memcg->nocpu_base.count[MEM_CGROUP_ON_MOVE] += 1;
	spin_unlock(&memcg->pcp_counter_lock);
	put_online_cpus();

	synchronize_rcu();
}

static void mem_cgroup_end_move(struct mem_cgroup *memcg)
{
	int cpu;

	if (!memcg)
		return;
	get_online_cpus();
	spin_lock(&memcg->pcp_counter_lock);
	for_each_online_cpu(cpu)
		per_cpu(memcg->stat->count[MEM_CGROUP_ON_MOVE], cpu) -= 1;
	memcg->nocpu_base.count[MEM_CGROUP_ON_MOVE] -= 1;
	spin_unlock(&memcg->pcp_counter_lock);
	put_online_cpus();
    }
    /*
     * 2 routines for checking "mem" is under move_account() or not.
     *
     * mem_cgroup_stealed() - checking a cgroup is mc.from or not. This is used
     *			  for avoiding race in accounting. If true,
     *			  pc->mem_cgroup may be overwritten.
     *
     * mem_cgroup_under_move() - checking a cgroup is mc.from or mc.to or
     *			  under hierarchy of moving cgroups. This is for
 *			  waiting at high memory pressure caused by "move".
     */
    
    
static bool mem_cgroup_stealed(struct mem_cgroup *memcg)
{
	return this_cpu_read(memcg->stat->count[MEM_CGROUP_ON_MOVE]) > 0;
}

static bool mem_cgroup_under_move(struct mem_cgroup *memcg)
{
    	struct mem_cgroup *from;
    	struct mem_cgroup *to;
    
    	bool ret = false;
    
    	/*
    	 * Unlike task_move routines, we access mc.to, mc.from not under
    	 * mutual exclusion by cgroup_mutex. Here, we take spinlock instead.
    	 */
    	spin_lock(&mc.lock);
    	from = mc.from;
    	to = mc.to;
    	if (!from)
    		goto unlock;
    
    	ret = mem_cgroup_same_or_subtree(memcg, from)
    		|| mem_cgroup_same_or_subtree(memcg, to);
    
    unlock:
	spin_unlock(&mc.lock);
	return ret;
}

static bool mem_cgroup_wait_acct_move(struct mem_cgroup *memcg)
    {
    	if (mc.moving_task && current != mc.moving_task) {
    
    		if (mem_cgroup_under_move(memcg)) {
    
    			DEFINE_WAIT(wait);
    			prepare_to_wait(&mc.waitq, &wait, TASK_INTERRUPTIBLE);
    			/* moving charge context might have finished. */
    			if (mc.moving_task)
    				schedule();
    			finish_wait(&mc.waitq, &wait);
    			return true;
    		}
    	}
    	return false;
    }
    
    
/**
 * mem_cgroup_print_oom_info: Called from OOM with tasklist_lock held in read mode.
     * @memcg: The memory cgroup that went over limit
     * @p: Task that is going to be killed
     *
     * NOTE: @memcg and @p's mem_cgroup can be different when hierarchy is
     * enabled
     */
    void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
    {
    	struct cgroup *task_cgrp;
    	struct cgroup *mem_cgrp;
    	/*
    	 * Need a buffer in BSS, can't rely on allocations. The code relies
    	 * on the assumption that OOM is serialized for memory controller.
    	 * If this assumption is broken, revisit this code.
    	 */
    	static char memcg_name[PATH_MAX];
    	int ret;
    
    
	if (!memcg || !p)
		return;
    
    
    	rcu_read_lock();
    
    	mem_cgrp = memcg->css.cgroup;
    	task_cgrp = task_cgroup(p, mem_cgroup_subsys_id);
    
    	ret = cgroup_path(task_cgrp, memcg_name, PATH_MAX);
    	if (ret < 0) {
    		/*
    		 * Unfortunately, we are unable to convert to a useful name
    		 * But we'll still print out the usage information
    		 */
    		rcu_read_unlock();
    		goto done;
    	}
    	rcu_read_unlock();
    
    	printk(KERN_INFO "Task in %s killed", memcg_name);
    
    	rcu_read_lock();
    	ret = cgroup_path(mem_cgrp, memcg_name, PATH_MAX);
    	if (ret < 0) {
    		rcu_read_unlock();
    		goto done;
    	}
    	rcu_read_unlock();
    
    	/*
	 * Continues from above, so we don't need a KERN_ level
    	 */
    	printk(KERN_CONT " as a result of limit of %s\n", memcg_name);
    done:
    
    	printk(KERN_INFO "memory: usage %llukB, limit %llukB, failcnt %llu\n",
    		res_counter_read_u64(&memcg->res, RES_USAGE) >> 10,
    		res_counter_read_u64(&memcg->res, RES_LIMIT) >> 10,
    		res_counter_read_u64(&memcg->res, RES_FAILCNT));
    	printk(KERN_INFO "memory+swap: usage %llukB, limit %llukB, "
    		"failcnt %llu\n",
    		res_counter_read_u64(&memcg->memsw, RES_USAGE) >> 10,
    		res_counter_read_u64(&memcg->memsw, RES_LIMIT) >> 10,
    		res_counter_read_u64(&memcg->memsw, RES_FAILCNT));
    }
    
    
    /*
     * This function returns the number of memcg under hierarchy tree. Returns
     * 1(self count) if no children.
     */
    
static int mem_cgroup_count_children(struct mem_cgroup *memcg)
{
	int num = 0;
	struct mem_cgroup *iter;

	for_each_mem_cgroup_tree(iter, memcg)
		num++;
	return num;
}

    /*
     * Return the memory (and swap, if configured) limit for a memcg.
     */
    u64 mem_cgroup_get_limit(struct mem_cgroup *memcg)
    {
    	u64 limit;
    	u64 memsw;
    
    
    	limit = res_counter_read_u64(&memcg->res, RES_LIMIT);
    	limit += total_swap_pages << PAGE_SHIFT;
    
    
    	memsw = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
    	/*
    	 * If memsw is finite and limits the amount of swap space available
    	 * to this memcg, return that limit.
    	 */
    	return min(limit, memsw);
    }
    
    
/*
 * Visit the first child (need not be the first child as per the ordering
     * of the cgroup list, since we track last_scanned_child) of @mem and use
     * that to reclaim free pages from.
     */
static struct mem_cgroup *
mem_cgroup_select_victim(struct mem_cgroup *root_memcg)
    
    {
    	struct mem_cgroup *ret = NULL;
    	struct cgroup_subsys_state *css;
    	int nextid, found;
    
    
    	if (!root_memcg->use_hierarchy) {
    		css_get(&root_memcg->css);
    		ret = root_memcg;
    
    	}
    
    	while (!ret) {
    		rcu_read_lock();
    
    		nextid = root_memcg->last_scanned_child + 1;
		css = css_get_next(&mem_cgroup_subsys, nextid, &root_memcg->css,
				   &found);
    		if (css && css_tryget(css))
    			ret = container_of(css, struct mem_cgroup, css);
    
    		rcu_read_unlock();
    		/* Updates scanning parameter */
    		if (!css) {
    			/* this means start scan from ID:1 */
    
			root_memcg->last_scanned_child = 0;
		} else
			root_memcg->last_scanned_child = found;
	}

	return ret;
}

    /**
     * test_mem_cgroup_node_reclaimable
     * @mem: the target memcg
     * @nid: the node ID to be checked.
 * @noswap : specify true here if the user wants file only information.
     *
     * This function returns whether the specified memcg contains any
     * reclaimable pages on a node. Returns true if there are any reclaimable
     * pages in the node.
     */
    
static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *memcg,
		int nid, bool noswap)
    {
    
	if (mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL_FILE))
    		return true;
    	if (noswap || !total_swap_pages)
    		return false;
    
	if (mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL_ANON))
		return true;
	return false;
}
    #if MAX_NUMNODES > 1
    
    /*
     * Always updating the nodemask is not very good - even if we have an empty
     * list or the wrong list here, we can start from some node and traverse all
     * nodes based on the zonelist. So update the list loosely once per 10 secs.
     *
     */
    
static void mem_cgroup_may_update_nodemask(struct mem_cgroup *memcg)
{
	int nid;
    	/*
    	 * numainfo_events > 0 means there was at least NUMAINFO_EVENTS_TARGET
    	 * pagein/pageout changes since the last update.
    	 */
    
	if (!atomic_read(&memcg->numainfo_events))
		return;
	if (atomic_inc_return(&memcg->numainfo_updating) > 1)
    		return;
    
    	/* make a nodemask where this memcg uses memory from */
    
    	memcg->scan_nodes = node_states[N_HIGH_MEMORY];
    
    
    	for_each_node_mask(nid, node_states[N_HIGH_MEMORY]) {
    
    
    		if (!test_mem_cgroup_node_reclaimable(memcg, nid, false))
			node_clear(nid, memcg->scan_nodes);
	}

    	atomic_set(&memcg->numainfo_events, 0);
    	atomic_set(&memcg->numainfo_updating, 0);
    
    }
    
    /*
     * Selecting a node where we start reclaim from. Because what we need is just
 * reducing usage counter, starting from anywhere is OK. Considering
     * memory reclaim from current node, there are pros. and cons.
     *
     * Freeing memory from current node means freeing memory from a node which
     * we'll use or we've used. So, it may make LRU bad. And if several threads
     * hit limits, it will see a contention on a node. But freeing from remote
     * node means more costs for memory reclaim because of memory latency.
     *
     * Now, we use round-robin. Better algorithm is welcomed.
     */
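/*
 * Illustrative example: with scan_nodes = {0,2,5} and last_scanned_node == 2,
 * next_node() below picks node 5; a later call gets MAX_NUMNODES back and
 * wraps around via first_node() to node 0.
 */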
    
int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
{
	int node;

    	mem_cgroup_may_update_nodemask(memcg);
    	node = memcg->last_scanned_node;
    
    	node = next_node(node, memcg->scan_nodes);
    
	if (node == MAX_NUMNODES)
    		node = first_node(memcg->scan_nodes);
    
    	/*
    	 * We call this when we hit limit, not when pages are added to LRU.
    	 * No LRU may hold pages because all pages are UNEVICTABLE or
    	 * memcg is too small and all pages are not on LRU. In that case,
	 * we use the current node.
    	 */
    	if (unlikely(node == MAX_NUMNODES))
    		node = numa_node_id();
    
    
	memcg->last_scanned_node = node;
	return node;
}

    /*
 * Check all nodes whether they contain reclaimable pages or not.
 * For quick scan, we make use of scan_nodes. This will allow us to skip
 * unused nodes. But scan_nodes is lazily updated and may not contain
 * enough new information. We need to do a double check.
     */
    
    bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap)
    
    {
    	int nid;
    
    	/*
    	 * quick check...making use of scan_node.
    	 * We can skip unused nodes.
    	 */
    
    	if (!nodes_empty(memcg->scan_nodes)) {
		for (nid = first_node(memcg->scan_nodes);
		     nid < MAX_NUMNODES;
		     nid = next_node(nid, memcg->scan_nodes)) {
    
			if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap))
    				return true;
    		}
    	}
    	/*
    	 * Check rest of nodes.
    	 */
    	for_each_node_state(nid, N_HIGH_MEMORY) {
    
		if (node_isset(nid, memcg->scan_nodes))
			continue;
		if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap))
			return true;
	}
	return false;
}

#else
int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
{
	return 0;
}
bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap)
{
	return test_mem_cgroup_node_reclaimable(memcg, 0, noswap);
}
#endif

    /*
     * Scan the hierarchy if needed to reclaim memory. We remember the last child
     * we reclaimed from, so that we don't end up penalizing one child extensively
 * based on its position in the children list.
 *
 * root_memcg is the original ancestor that we've been reclaiming from.
 *
 * We give up and return to the caller when we visit root_memcg twice.
 * (other groups can be removed while we're walking....)
 *
 * If shrink==true, to avoid freeing too much, this returns immediately.
 */
static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_memcg,
						struct zone *zone,
						gfp_t gfp_mask,
						unsigned long reclaim_options,
						unsigned long *total_scanned)
{
    	struct mem_cgroup *victim;
    	int ret, total = 0;
    	int loop = 0;
    
    	bool noswap = reclaim_options & MEM_CGROUP_RECLAIM_NOSWAP;
    	bool shrink = reclaim_options & MEM_CGROUP_RECLAIM_SHRINK;
    
    	bool check_soft = reclaim_options & MEM_CGROUP_RECLAIM_SOFT;
    
    	unsigned long excess;
    
    	unsigned long nr_scanned;
    
    	excess = res_counter_soft_limit_excess(&root_memcg->res) >> PAGE_SHIFT;
    
    	/* If memsw_is_minimum==1, swap-out is of-no-use. */
    
	if (!check_soft && !shrink && root_memcg->memsw_is_minimum)
		noswap = true;

	while (1) {
    		victim = mem_cgroup_select_victim(root_memcg);
    		if (victim == root_memcg) {
    
    			loop++;
    
    			/*
    			 * We are not draining per cpu cached charges during
    			 * soft limit reclaim  because global reclaim doesn't
    			 * care about charges. It tries to free some memory and
    			 * charges will not give any.
    			 */
			if (!check_soft && loop >= 1)
    				drain_all_stock_async(root_memcg);
    
    			if (loop >= 2) {
    				/*
    				 * If we have not been able to reclaim
    				 * anything, it might because there are
    				 * no reclaimable pages under this hierarchy
    				 */
    				if (!check_soft || !total) {
    					css_put(&victim->css);
    					break;
    				}
				/*
				 * We want to do more targeted reclaim.
				 * excess >> 2 is not too excessive so as to
				 * reclaim too much, nor too little so that we
				 * keep coming back to reclaim from this cgroup
    				 */
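				/*
				 * e.g. if this memcg is 1024 pages over its
				 * soft limit, excess == 1024 and we stop this
				 * walk once roughly 256 pages (excess >> 2)
				 * have been reclaimed.
				 */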
    				if (total >= (excess >> 2) ||
    					(loop > MEM_CGROUP_MAX_RECLAIM_LOOPS)) {
    					css_put(&victim->css);
    					break;
    				}
    			}
    		}
    
    		if (!mem_cgroup_reclaimable(victim, noswap)) {
    
    			/* this cgroup's local usage == 0 */
			css_put(&victim->css);
			continue;
		}
		/* we use swappiness of local cgroup */
		if (check_soft) {
			ret = mem_cgroup_shrink_node_zone(victim, gfp_mask,
				noswap, zone, &nr_scanned);
			*total_scanned += nr_scanned;
		} else
			ret = try_to_free_mem_cgroup_pages(victim, gfp_mask,
						noswap);
    		css_put(&victim->css);
    
    		/*
    		 * At shrinking usage, we can't check we should stop here or
    		 * reclaim more. It's depends on callers. last_scanned_child
    		 * will work enough for keeping fairness under tree.
    		 */
    		if (shrink)
    			return ret;
    
		total += ret;
		if (check_soft) {
			if (!res_counter_soft_limit_excess(&root_memcg->res))
				return total;
		} else if (mem_cgroup_margin(root_memcg))
			return total;
	}
	return total;
}

    /*
     * Check OOM-Killer is already running under our hierarchy.
 * If someone is running, return false.
 *
 * Has to be called with memcg_oom_lock
 */
static bool mem_cgroup_oom_lock(struct mem_cgroup *memcg)
{
    	struct mem_cgroup *iter, *failed = NULL;
    	bool cond = true;
    
    	for_each_mem_cgroup_tree_cond(iter, memcg, cond) {
    
    		if (iter->oom_lock) {
    
    			/*
    			 * this subtree of our hierarchy is already locked
    			 * so we cannot give a lock.
    			 */
    			failed = iter;
    			cond = false;
    
		} else
			iter->oom_lock = true;
	}

	if (!failed)
		return true;
    
    
    	/*
    	 * OK, we failed to lock the whole subtree so we have to clean up
    	 * what we set up to the failing subtree
    	 */
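	/*
	 * E.g. for a hierarchy A -> B -> C where B's oom_lock is already
	 * held, the first walk set A->oom_lock and stopped at B (failed);
	 * the walk below only clears the locks we took and stops at "failed".
	 */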
    	cond = true;
    
    	for_each_mem_cgroup_tree_cond(iter, memcg, cond) {
    
    		if (iter == failed) {
    			cond = false;
    			continue;
    		}
    		iter->oom_lock = false;
    	}
    
	return false;
}

/*
 * Has to be called with memcg_oom_lock
 */
static int mem_cgroup_oom_unlock(struct mem_cgroup *memcg)
{
    	struct mem_cgroup *iter;
    
    
	for_each_mem_cgroup_tree(iter, memcg)
		iter->oom_lock = false;
	return 0;
}

static void mem_cgroup_mark_under_oom(struct mem_cgroup *memcg)
{
	struct mem_cgroup *iter;
	for_each_mem_cgroup_tree(iter, memcg)
		atomic_inc(&iter->under_oom);
}

static void mem_cgroup_unmark_under_oom(struct mem_cgroup *memcg)
{
	struct mem_cgroup *iter;

    	/*
    	 * When a new child is created while the hierarchy is under oom,
    	 * mem_cgroup_oom_lock() may not be called. We have to use
    	 * atomic_add_unless() here.
    	 */
    
    	for_each_mem_cgroup_tree(iter, memcg)
    
		atomic_add_unless(&iter->under_oom, -1, 0);
}

    static DEFINE_SPINLOCK(memcg_oom_lock);
    
    static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq);
    
    
    struct oom_wait_info {
    	struct mem_cgroup *mem;
    	wait_queue_t	wait;
    };
    
    static int memcg_oom_wake_function(wait_queue_t *wait,
    	unsigned mode, int sync, void *arg)
    {
    
    	struct mem_cgroup *wake_memcg = (struct mem_cgroup *)arg,
    			  *oom_wait_memcg;
    
    	struct oom_wait_info *oom_wait_info;
    
    	oom_wait_info = container_of(wait, struct oom_wait_info, wait);
    
    	oom_wait_memcg = oom_wait_info->mem;
    
    
    	/*
    	 * Both of oom_wait_info->mem and wake_mem are stable under us.
    	 * Then we can use css_is_ancestor without taking care of RCU.
    	 */
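	/*
	 * In other words, wake this waiter only if its memcg and the memcg
	 * being woken for OOM recovery are the same or one is an ancestor
	 * of the other within the hierarchy.
	 */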
    
    	if (!mem_cgroup_same_or_subtree(oom_wait_memcg, wake_memcg)
    		&& !mem_cgroup_same_or_subtree(wake_memcg, oom_wait_memcg))
    
    		return 0;
    	return autoremove_wake_function(wait, mode, sync, arg);
    }
    
    
static void memcg_wakeup_oom(struct mem_cgroup *memcg)
{
    	/* for filtering, pass "memcg" as argument. */
	__wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg);
}

static void memcg_oom_recover(struct mem_cgroup *memcg)
{
    	if (memcg && atomic_read(&memcg->under_oom))
		memcg_wakeup_oom(memcg);
}

    /*
     * try to call OOM killer. returns false if we should exit memory-reclaim loop.
     */
    
bool mem_cgroup_handle_oom(struct mem_cgroup *memcg, gfp_t mask)
{
    	struct oom_wait_info owait;
    
    	bool locked, need_to_kill;
    
    	owait.mem = memcg;
    
    	owait.wait.flags = 0;
    	owait.wait.func = memcg_oom_wake_function;
    	owait.wait.private = current;
    	INIT_LIST_HEAD(&owait.wait.task_list);
    
    	need_to_kill = true;
    
    	mem_cgroup_mark_under_oom(memcg);
    
    	/* At first, try to OOM lock hierarchy under memcg.*/
    
    	spin_lock(&memcg_oom_lock);
    
    	locked = mem_cgroup_oom_lock(memcg);
    
    	/*
    	 * Even if signal_pending(), we can't quit charge() loop without
    	 * accounting. So, UNINTERRUPTIBLE is appropriate. But SIGKILL
    	 * under OOM is always welcomed, use TASK_KILLABLE here.
    	 */
    
    	prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE);
    
	if (!locked || memcg->oom_kill_disable)
    		need_to_kill = false;
	if (locked)
    		mem_cgroup_oom_notify(memcg);
    
    	spin_unlock(&memcg_oom_lock);
    
    	if (need_to_kill) {
    		finish_wait(&memcg_oom_waitq, &owait.wait);
    
		mem_cgroup_out_of_memory(memcg, mask);
	} else {
		schedule();
		finish_wait(&memcg_oom_waitq, &owait.wait);
	}
	spin_lock(&memcg_oom_lock);
	if (locked)
		mem_cgroup_oom_unlock(memcg);
    	memcg_wakeup_oom(memcg);
    
    	spin_unlock(&memcg_oom_lock);
    
    	mem_cgroup_unmark_under_oom(memcg);
    
    	if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current))
    		return false;
    	/* Give chance to dying process */
    
    	schedule_timeout_uninterruptible(1);
    
	return true;
}

    /*
     * Currently used to update mapped file statistics, but the routine can be
 * generalized to update other statistics as well.
 *
     * Notes: Race condition
     *
     * We usually use page_cgroup_lock() for accessing page_cgroup member but
 * it tends to be costly. But considering some conditions, we don't need
     * to do so _always_.
     *
     * Considering "charge", lock_page_cgroup() is not required because all
     * file-stat operations happen after a page is attached to radix-tree. There
 * is no race with "charge".
     *
     * Considering "uncharge", we know that memcg doesn't clear pc->mem_cgroup
     * at "uncharge" intentionally. So, we always see valid pc->mem_cgroup even
 * if there is a race with "uncharge". Statistics itself is properly handled
     * by flags.
     *
     * Considering "move", this is an only case we see a race. To make the race
     * small, we check MEM_CGROUP_ON_MOVE percpu value and detect there are
     * possibility of race condition. If there is, we take a lock.
    
    void mem_cgroup_update_page_stat(struct page *page,
    				 enum mem_cgroup_page_stat_item idx, int val)
    
    	struct mem_cgroup *memcg;
    
    	struct page_cgroup *pc = lookup_page_cgroup(page);
    	bool need_unlock = false;
    
    	unsigned long uninitialized_var(flags);
    
    
    	if (unlikely(!pc))
    		return;