	if (likely(!PageLRU(page)))
		return;

	spin_lock_irqsave(&zone->lru_lock, flags);
	/*
	 * Forget old LRU when this page_cgroup is *not* used. This Used bit
	 * is guarded by lock_page() because the page is SwapCache.
	 */
	if (!PageCgroupUsed(pc))
		mem_cgroup_del_lru_list(page, page_lru(page));
	spin_unlock_irqrestore(&zone->lru_lock, flags);
}

static void mem_cgroup_lru_add_after_commit(struct page *page)
{
	unsigned long flags;
	struct zone *zone = page_zone(page);
	struct page_cgroup *pc = lookup_page_cgroup(page);

	/*
	 * Take care of the case where the page is added to the LRU while
	 * we commit it.
	 */
	if (likely(!PageLRU(page)))
		return;
	spin_lock_irqsave(&zone->lru_lock, flags);
	/* link when the page is linked to LRU but page_cgroup isn't */
	if (PageLRU(page) && !PageCgroupAcctLRU(pc))
		mem_cgroup_add_lru_list(page, page_lru(page));
	spin_unlock_irqrestore(&zone->lru_lock, flags);
}
    
    
    
    void mem_cgroup_move_lists(struct page *page,
    			   enum lru_list from, enum lru_list to)
    {
    
	if (mem_cgroup_disabled())
		return;
	mem_cgroup_del_lru_list(page, from);
	mem_cgroup_add_lru_list(page, to);
}

    int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem)
    {
	int ret;
	struct mem_cgroup *curr = NULL;
	struct task_struct *p;

    	p = find_lock_task_mm(task);
    	if (!p)
    		return 0;
    	curr = try_get_mem_cgroup_from_mm(p->mm);
    	task_unlock(p);
    
    	if (!curr)
    		return 0;
    
	/*
	 * We should check use_hierarchy of "mem", not "curr". Checking
	 * use_hierarchy of "curr" here would make this function return true
	 * whenever hierarchy is enabled in "curr" and "curr" is a child of
	 * "mem" in the *cgroup* hierarchy, even if use_hierarchy is disabled
	 * in "mem".
	 */
	if (mem->use_hierarchy)
		ret = css_is_ancestor(&curr->css, &mem->css);
	else
		ret = (curr == mem);
	css_put(&curr->css);
	return ret;
}
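
/*
 * Illustrative note, not part of the original source: given a cgroup tree
 * A (use_hierarchy=0) with a child B (use_hierarchy=1), a task in B must not
 * be reported as being "in" A. Testing curr->use_hierarchy (B's flag) would
 * take the css_is_ancestor() branch and wrongly return true, which is why the
 * check above looks at mem->use_hierarchy instead.
 */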
    
    static int calc_inactive_ratio(struct mem_cgroup *memcg, unsigned long *present_pages)
    
    {
    	unsigned long active;
    	unsigned long inactive;
    
    	unsigned long gb;
    	unsigned long inactive_ratio;
    
    	inactive = mem_cgroup_get_local_zonestat(memcg, LRU_INACTIVE_ANON);
    	active = mem_cgroup_get_local_zonestat(memcg, LRU_ACTIVE_ANON);
    
    	gb = (inactive + active) >> (30 - PAGE_SHIFT);
    	if (gb)
    		inactive_ratio = int_sqrt(10 * gb);
    	else
    		inactive_ratio = 1;
    
    	if (present_pages) {
    		present_pages[0] = inactive;
    		present_pages[1] = active;
    	}
    
    	return inactive_ratio;
    }
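
/*
 * Illustrative sketch, not part of the original source: the square-root
 * heuristic above mirrors the global VM's inactive_ratio. Hypothetical
 * numbers, assuming 4KB pages:
 */
#if 0	/* example only */
static void example_inactive_anon_check(void)
{
	unsigned long inactive = 128 * 1024;			/* 0.5GB of anon */
	unsigned long active = 7 * 128 * 1024;			/* 3.5GB of anon */
	unsigned long gb = (inactive + active) >> (30 - PAGE_SHIFT);	/* 4 */
	unsigned long ratio = gb ? int_sqrt(10 * gb) : 1;	/* int_sqrt(40) = 6 */

	/* 131072 * 6 < 917504, so the inactive anon list is considered "low" */
	BUG_ON(!(inactive * ratio < active));
}
#endif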
    
    int mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg)
    {
    	unsigned long active;
    	unsigned long inactive;
    	unsigned long present_pages[2];
    	unsigned long inactive_ratio;
    
    	inactive_ratio = calc_inactive_ratio(memcg, present_pages);
    
    	inactive = present_pages[0];
    	active = present_pages[1];
    
	if (inactive * inactive_ratio < active)
		return 1;

	return 0;
}

    int mem_cgroup_inactive_file_is_low(struct mem_cgroup *memcg)
    {
    	unsigned long active;
    	unsigned long inactive;
    
    	inactive = mem_cgroup_get_local_zonestat(memcg, LRU_INACTIVE_FILE);
    	active = mem_cgroup_get_local_zonestat(memcg, LRU_ACTIVE_FILE);
    
    	return (active > inactive);
    }
    
    
    unsigned long mem_cgroup_zone_nr_lru_pages(struct mem_cgroup *memcg,
    						struct zone *zone,
						enum lru_list lru)
{
	int nid = zone_to_nid(zone);
	int zid = zone_idx(zone);
	struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(memcg, nid, zid);
    
    	return MEM_CGROUP_ZSTAT(mz, lru);
    }
    
    
    #ifdef CONFIG_NUMA
    static unsigned long mem_cgroup_node_nr_file_lru_pages(struct mem_cgroup *memcg,
    							int nid)
    {
    	unsigned long ret;
    
    	ret = mem_cgroup_get_zonestat_node(memcg, nid, LRU_INACTIVE_FILE) +
    		mem_cgroup_get_zonestat_node(memcg, nid, LRU_ACTIVE_FILE);
    
    	return ret;
    }
    
    static unsigned long mem_cgroup_nr_file_lru_pages(struct mem_cgroup *memcg)
    {
    	u64 total = 0;
    	int nid;
    
    	for_each_node_state(nid, N_HIGH_MEMORY)
    		total += mem_cgroup_node_nr_file_lru_pages(memcg, nid);
    
    	return total;
    }
    
    static unsigned long mem_cgroup_node_nr_anon_lru_pages(struct mem_cgroup *memcg,
    							int nid)
    {
    	unsigned long ret;
    
    	ret = mem_cgroup_get_zonestat_node(memcg, nid, LRU_INACTIVE_ANON) +
    		mem_cgroup_get_zonestat_node(memcg, nid, LRU_ACTIVE_ANON);
    
    	return ret;
    }
    
    static unsigned long mem_cgroup_nr_anon_lru_pages(struct mem_cgroup *memcg)
    {
    	u64 total = 0;
    	int nid;
    
    	for_each_node_state(nid, N_HIGH_MEMORY)
    		total += mem_cgroup_node_nr_anon_lru_pages(memcg, nid);
    
    	return total;
    }
    
    static unsigned long
    mem_cgroup_node_nr_unevictable_lru_pages(struct mem_cgroup *memcg, int nid)
    {
    	return mem_cgroup_get_zonestat_node(memcg, nid, LRU_UNEVICTABLE);
    }
    
    static unsigned long
    mem_cgroup_nr_unevictable_lru_pages(struct mem_cgroup *memcg)
    {
    	u64 total = 0;
    	int nid;
    
    	for_each_node_state(nid, N_HIGH_MEMORY)
    		total += mem_cgroup_node_nr_unevictable_lru_pages(memcg, nid);
    
    	return total;
    }
    
    static unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
    							int nid)
    {
    	enum lru_list l;
    	u64 total = 0;
    
    	for_each_lru(l)
    		total += mem_cgroup_get_zonestat_node(memcg, nid, l);
    
    	return total;
    }
    
    static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg)
    {
    	u64 total = 0;
    	int nid;
    
    	for_each_node_state(nid, N_HIGH_MEMORY)
    		total += mem_cgroup_node_nr_lru_pages(memcg, nid);
    
    	return total;
    }
    #endif /* CONFIG_NUMA */
    
    
    struct zone_reclaim_stat *mem_cgroup_get_reclaim_stat(struct mem_cgroup *memcg,
    						      struct zone *zone)
{
	int nid = zone_to_nid(zone);
	int zid = zone_idx(zone);
	struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(memcg, nid, zid);
    
    	return &mz->reclaim_stat;
    }
    
    struct zone_reclaim_stat *
    mem_cgroup_get_reclaim_stat_from_page(struct page *page)
    {
    	struct page_cgroup *pc;
    	struct mem_cgroup_per_zone *mz;
    
    	if (mem_cgroup_disabled())
    		return NULL;
    
    	pc = lookup_page_cgroup(page);
    
    	if (!PageCgroupUsed(pc))
    		return NULL;
    
    	/* Ensure pc->mem_cgroup is visible after reading PCG_USED. */
    	smp_rmb();
    
    	mz = page_cgroup_zoneinfo(pc->mem_cgroup, page);
    
    	return &mz->reclaim_stat;
    }
    
    
    unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
    					struct list_head *dst,
    					unsigned long *scanned, int order,
    					int mode, struct zone *z,
    					struct mem_cgroup *mem_cont,
    
    					int active, int file)
    
    {
    	unsigned long nr_taken = 0;
    	struct page *page;
    	unsigned long scan;
    	LIST_HEAD(pc_list);
	struct list_head *src;
	struct page_cgroup *pc, *tmp;
	int nid = zone_to_nid(z);
	int zid = zone_idx(z);
	struct mem_cgroup_per_zone *mz;
	int lru = LRU_FILE * file + active;
	int ret;

	BUG_ON(!mem_cont);
	mz = mem_cgroup_zoneinfo(mem_cont, nid, zid);
	src = &mz->lists[lru];

	scan = 0;
	list_for_each_entry_safe_reverse(pc, tmp, src, lru) {
    
		if (scan >= nr_to_scan)
			break;

    		if (unlikely(!PageCgroupUsed(pc)))
    			continue;
    
    		page = lookup_cgroup_page(pc);
    
		if (unlikely(!PageLRU(page)))
			continue;

		scan++;
		ret = __isolate_lru_page(page, mode, file);
    		switch (ret) {
    		case 0:
    
    			list_move(&page->lru, dst);
    
    			mem_cgroup_del_lru(page);
    
    			nr_taken += hpage_nr_pages(page);
    
    			break;
    		case -EBUSY:
    			/* we don't affect global LRU but rotate in our LRU */
    			mem_cgroup_rotate_lru_list(page, page_lru(page));
    			break;
    		default:
			break;
		}
	}

	*scanned = scan;

	trace_mm_vmscan_memcg_isolate(0, nr_to_scan, scan, nr_taken,
				      0, 0, 0, mode);

	return nr_taken;
}

    
    #define mem_cgroup_from_res_counter(counter, member)	\
    	container_of(counter, struct mem_cgroup, member)
    
    
/**
 * mem_cgroup_margin - calculate chargeable space of a memory cgroup
 * @mem: the memory cgroup
 *
 * Returns the maximum amount of memory @mem can be charged with, in
 * pages.
 */
static unsigned long mem_cgroup_margin(struct mem_cgroup *mem)
{
	unsigned long long margin;
    
    	margin = res_counter_margin(&mem->res);
    	if (do_swap_account)
    		margin = min(margin, res_counter_margin(&mem->memsw));
    
	return margin >> PAGE_SHIFT;
}

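/*
 * Illustrative note, not part of the original source: with a 512MB memory
 * limit of which 384MB is charged, and a 640MB mem+swap limit of which 600MB
 * is charged, res_counter_margin() yields 128MB and 40MB respectively; the
 * min() makes mem_cgroup_margin() report 40MB worth of pages, the real
 * headroom left for charging.
 */
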
    static unsigned int get_swappiness(struct mem_cgroup *memcg)
    {
    	struct cgroup *cgrp = memcg->css.cgroup;
    
    	/* root ? */
    	if (cgrp->parent == NULL)
    		return vm_swappiness;
    
    
	return memcg->swappiness;
}

    static void mem_cgroup_start_move(struct mem_cgroup *mem)
    {
    	int cpu;
    
    
    	get_online_cpus();
    	spin_lock(&mem->pcp_counter_lock);
    	for_each_online_cpu(cpu)
    
    		per_cpu(mem->stat->count[MEM_CGROUP_ON_MOVE], cpu) += 1;
    
    	mem->nocpu_base.count[MEM_CGROUP_ON_MOVE] += 1;
    	spin_unlock(&mem->pcp_counter_lock);
    	put_online_cpus();
    
    
    	synchronize_rcu();
    }
    
    static void mem_cgroup_end_move(struct mem_cgroup *mem)
    {
    	int cpu;
    
    	if (!mem)
    		return;
    
    	get_online_cpus();
    	spin_lock(&mem->pcp_counter_lock);
    	for_each_online_cpu(cpu)
    
    		per_cpu(mem->stat->count[MEM_CGROUP_ON_MOVE], cpu) -= 1;
    
    	mem->nocpu_base.count[MEM_CGROUP_ON_MOVE] -= 1;
    	spin_unlock(&mem->pcp_counter_lock);
    	put_online_cpus();
    
    }
/*
 * Two routines for checking whether "mem" is under move_account() or not.
 *
 * mem_cgroup_stealed() - checking whether a cgroup is mc.from or not. This
 *			  is used for avoiding races in accounting. If true,
 *			  pc->mem_cgroup may be overwritten.
 *
 * mem_cgroup_under_move() - checking whether a cgroup is mc.from or mc.to
 *			  or under the hierarchy of a moving cgroup. This is
 *			  for waiting at high memory pressure caused by "move".
 */
    
    static bool mem_cgroup_stealed(struct mem_cgroup *mem)
    {
    	VM_BUG_ON(!rcu_read_lock_held());
    	return this_cpu_read(mem->stat->count[MEM_CGROUP_ON_MOVE]) > 0;
    }
    
    
    static bool mem_cgroup_under_move(struct mem_cgroup *mem)
    {
    
    	struct mem_cgroup *from;
    	struct mem_cgroup *to;
    
    	bool ret = false;
    
    	/*
    	 * Unlike task_move routines, we access mc.to, mc.from not under
    	 * mutual exclusion by cgroup_mutex. Here, we take spinlock instead.
    	 */
    	spin_lock(&mc.lock);
    	from = mc.from;
    	to = mc.to;
    	if (!from)
    		goto unlock;
    	if (from == mem || to == mem
    	    || (mem->use_hierarchy && css_is_ancestor(&from->css, &mem->css))
    	    || (mem->use_hierarchy && css_is_ancestor(&to->css,	&mem->css)))
    		ret = true;
    unlock:
    	spin_unlock(&mc.lock);
    
    	return ret;
    }
    
    static bool mem_cgroup_wait_acct_move(struct mem_cgroup *mem)
    {
    	if (mc.moving_task && current != mc.moving_task) {
    		if (mem_cgroup_under_move(mem)) {
    			DEFINE_WAIT(wait);
    			prepare_to_wait(&mc.waitq, &wait, TASK_INTERRUPTIBLE);
    			/* moving charge context might have finished. */
    			if (mc.moving_task)
    				schedule();
    			finish_wait(&mc.waitq, &wait);
    			return true;
    		}
    	}
    	return false;
    }
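
/*
 * Illustrative sketch, not part of the original source: how the helpers above
 * are meant to pair up. The move_account side brackets the transfer with
 * mem_cgroup_start_move()/mem_cgroup_end_move(), while statistics updaters
 * call mem_cgroup_stealed() to decide whether pc->mem_cgroup may change under
 * them and the page_cgroup move-lock is needed (see
 * mem_cgroup_update_page_stat() below).
 */
#if 0	/* example only, hypothetical mover */
	mem_cgroup_start_move(from);
	/* ... move page charges from "from" to "to" ... */
	mem_cgroup_end_move(from);
#endif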
    
    
/**
 * mem_cgroup_print_oom_info: Called from OOM with tasklist_lock held in read mode.
     * @memcg: The memory cgroup that went over limit
     * @p: Task that is going to be killed
     *
     * NOTE: @memcg and @p's mem_cgroup can be different when hierarchy is
     * enabled
     */
    void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
    {
    	struct cgroup *task_cgrp;
    	struct cgroup *mem_cgrp;
    	/*
    	 * Need a buffer in BSS, can't rely on allocations. The code relies
    	 * on the assumption that OOM is serialized for memory controller.
    	 * If this assumption is broken, revisit this code.
    	 */
    	static char memcg_name[PATH_MAX];
	int ret;

	if (!memcg || !p)
		return;

    	rcu_read_lock();
    
    	mem_cgrp = memcg->css.cgroup;
    	task_cgrp = task_cgroup(p, mem_cgroup_subsys_id);
    
    	ret = cgroup_path(task_cgrp, memcg_name, PATH_MAX);
    	if (ret < 0) {
    		/*
    		 * Unfortunately, we are unable to convert to a useful name
    		 * But we'll still print out the usage information
    		 */
    		rcu_read_unlock();
    		goto done;
    	}
    	rcu_read_unlock();
    
    	printk(KERN_INFO "Task in %s killed", memcg_name);
    
    	rcu_read_lock();
    	ret = cgroup_path(mem_cgrp, memcg_name, PATH_MAX);
    	if (ret < 0) {
    		rcu_read_unlock();
    		goto done;
    	}
    	rcu_read_unlock();
    
	/*
	 * Continues from above, so we don't need a KERN_ level
	 */
    	printk(KERN_CONT " as a result of limit of %s\n", memcg_name);
    done:
    
    	printk(KERN_INFO "memory: usage %llukB, limit %llukB, failcnt %llu\n",
    		res_counter_read_u64(&memcg->res, RES_USAGE) >> 10,
    		res_counter_read_u64(&memcg->res, RES_LIMIT) >> 10,
    		res_counter_read_u64(&memcg->res, RES_FAILCNT));
    	printk(KERN_INFO "memory+swap: usage %llukB, limit %llukB, "
    		"failcnt %llu\n",
    		res_counter_read_u64(&memcg->memsw, RES_USAGE) >> 10,
    		res_counter_read_u64(&memcg->memsw, RES_LIMIT) >> 10,
    		res_counter_read_u64(&memcg->memsw, RES_FAILCNT));
    }
    
    
    /*
     * This function returns the number of memcg under hierarchy tree. Returns
     * 1(self count) if no children.
     */
    static int mem_cgroup_count_children(struct mem_cgroup *mem)
    {
    	int num = 0;
    
    	struct mem_cgroup *iter;
    
	for_each_mem_cgroup_tree(iter, mem)
		num++;
	return num;
}

    /*
     * Return the memory (and swap, if configured) limit for a memcg.
     */
    u64 mem_cgroup_get_limit(struct mem_cgroup *memcg)
    {
    	u64 limit;
    	u64 memsw;
    
    
    	limit = res_counter_read_u64(&memcg->res, RES_LIMIT);
    	limit += total_swap_pages << PAGE_SHIFT;
    
    
    	memsw = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
    	/*
    	 * If memsw is finite and limits the amount of swap space available
    	 * to this memcg, return that limit.
    	 */
    	return min(limit, memsw);
    }
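
/*
 * Illustrative note, not part of the original source: with memory.limit = 1GB,
 * 2GB of total swap and memsw.limit = 1.5GB, the first candidate is
 * 1GB + 2GB = 3GB, but the min() with memsw caps the result at 1.5GB, which is
 * all the OOM killer could actually recover from this memcg.
 */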
    
    
/*
 * Visit the first child (need not be the first child as per the ordering
     * of the cgroup list, since we track last_scanned_child) of @mem and use
     * that to reclaim free pages from.
     */
    static struct mem_cgroup *
    mem_cgroup_select_victim(struct mem_cgroup *root_mem)
    {
    	struct mem_cgroup *ret = NULL;
    	struct cgroup_subsys_state *css;
    	int nextid, found;
    
    	if (!root_mem->use_hierarchy) {
    		css_get(&root_mem->css);
    		ret = root_mem;
    	}
    
    	while (!ret) {
    		rcu_read_lock();
    		nextid = root_mem->last_scanned_child + 1;
    		css = css_get_next(&mem_cgroup_subsys, nextid, &root_mem->css,
    				   &found);
    		if (css && css_tryget(css))
    			ret = container_of(css, struct mem_cgroup, css);
    
    		rcu_read_unlock();
    		/* Updates scanning parameter */
    		if (!css) {
    			/* this means start scan from ID:1 */
    			root_mem->last_scanned_child = 0;
    		} else
    			root_mem->last_scanned_child = found;
    	}
    
    	return ret;
    }
    
    
    #if MAX_NUMNODES > 1
    
    /*
     * Always updating the nodemask is not very good - even if we have an empty
     * list or the wrong list here, we can start from some node and traverse all
     * nodes based on the zonelist. So update the list loosely once per 10 secs.
     *
     */
    static void mem_cgroup_may_update_nodemask(struct mem_cgroup *mem)
    {
    	int nid;
    
    	if (time_after(mem->next_scan_node_update, jiffies))
    		return;
    
    	mem->next_scan_node_update = jiffies + 10*HZ;
    	/* make a nodemask where this memcg uses memory from */
    	mem->scan_nodes = node_states[N_HIGH_MEMORY];
    
    	for_each_node_mask(nid, node_states[N_HIGH_MEMORY]) {
    
    		if (mem_cgroup_get_zonestat_node(mem, nid, LRU_INACTIVE_FILE) ||
    		    mem_cgroup_get_zonestat_node(mem, nid, LRU_ACTIVE_FILE))
    			continue;
    
    		if (total_swap_pages &&
    		    (mem_cgroup_get_zonestat_node(mem, nid, LRU_INACTIVE_ANON) ||
    		     mem_cgroup_get_zonestat_node(mem, nid, LRU_ACTIVE_ANON)))
    			continue;
    		node_clear(nid, mem->scan_nodes);
    	}
    }
    
/*
 * Selecting a node where we start reclaim from. Because what we need is just
 * reducing the usage counter, starting from anywhere is OK. Considering
 * memory reclaim from the current node, there are pros and cons.
 *
 * Freeing memory from the current node means freeing memory from a node which
 * we'll use or we've used. So, it may make the LRU bad. And if several threads
 * hit limits, they will see contention on a node. But freeing from a remote
 * node means more costs for memory reclaim because of memory latency.
 *
 * Now, we use round-robin. A better algorithm is welcome.
 */
    int mem_cgroup_select_victim_node(struct mem_cgroup *mem)
    {
    	int node;
    
    	mem_cgroup_may_update_nodemask(mem);
    	node = mem->last_scanned_node;
    
    	node = next_node(node, mem->scan_nodes);
    	if (node == MAX_NUMNODES)
    		node = first_node(mem->scan_nodes);
	/*
	 * We call this when we hit limit, not when pages are added to LRU.
	 * No LRU may hold pages because all pages are UNEVICTABLE or
	 * memcg is too small and all pages are not on LRU. In that case,
	 * we use the current node.
	 */
    	if (unlikely(node == MAX_NUMNODES))
    		node = numa_node_id();
    
    	mem->last_scanned_node = node;
    	return node;
    }
    
    #else
    int mem_cgroup_select_victim_node(struct mem_cgroup *mem)
    {
    	return 0;
    }
    #endif
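
/*
 * Illustrative note, not part of the original source: with scan_nodes =
 * {0, 2, 3} and last_scanned_node = 2, next_node() picks node 3; on the
 * following call next_node(3, ...) returns MAX_NUMNODES, so we wrap around to
 * first_node() and get node 0, giving the round-robin order 3, 0, 2, 3, ...
 */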
    
    
/*
 * Scan the hierarchy if needed to reclaim memory. We remember the last child
 * we reclaimed from, so that we don't end up penalizing one child extensively
 * based on its position in the children list.
 *
 * root_mem is the original ancestor that we've been reclaiming from.
 *
 * We give up and return to the caller when we visit root_mem twice.
 * (other groups can be removed while we're walking....)
 *
 * If shrink==true, to avoid freeing too much, this returns immediately.
 */
static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
						struct zone *zone,
						gfp_t gfp_mask,
						unsigned long reclaim_options,
						unsigned long *total_scanned)
{
	struct mem_cgroup *victim;
	int ret, total = 0;
	int loop = 0;
    	bool noswap = reclaim_options & MEM_CGROUP_RECLAIM_NOSWAP;
    	bool shrink = reclaim_options & MEM_CGROUP_RECLAIM_SHRINK;
    
    	bool check_soft = reclaim_options & MEM_CGROUP_RECLAIM_SOFT;
    
	unsigned long excess;
	unsigned long nr_scanned;

    	excess = res_counter_soft_limit_excess(&root_mem->res) >> PAGE_SHIFT;
    
    	/* If memsw_is_minimum==1, swap-out is of-no-use. */
    	if (root_mem->memsw_is_minimum)
    		noswap = true;
    
    
	while (1) {
		victim = mem_cgroup_select_victim(root_mem);

    		if (victim == root_mem) {
    
			loop++;
    			if (loop >= 1)
    				drain_all_stock_async();
    
    			if (loop >= 2) {
    				/*
    				 * If we have not been able to reclaim
    				 * anything, it might because there are
    				 * no reclaimable pages under this hierarchy
    				 */
    				if (!check_soft || !total) {
    					css_put(&victim->css);
    					break;
    				}
				/*
				 * We want to do more targeted reclaim.
				 * excess >> 2 is not so excessive that we
				 * reclaim too much, nor so little that we
				 * keep coming back to reclaim from this
				 * cgroup
    				 */
    				if (total >= (excess >> 2) ||
    					(loop > MEM_CGROUP_MAX_RECLAIM_LOOPS)) {
    					css_put(&victim->css);
    					break;
    				}
    			}
    		}
    
		if (!mem_cgroup_local_usage(victim)) {
			/* this cgroup's local usage == 0 */
			css_put(&victim->css);
			continue;
		}
		/* we use swappiness of local cgroup */
		if (check_soft) {
    			ret = mem_cgroup_shrink_node_zone(victim, gfp_mask,
    
    				noswap, get_swappiness(victim), zone,
    				&nr_scanned);
    			*total_scanned += nr_scanned;
    		} else
    
    			ret = try_to_free_mem_cgroup_pages(victim, gfp_mask,
    						noswap, get_swappiness(victim));
    
    		css_put(&victim->css);
    
		/*
		 * At shrinking usage, we can't check whether we should stop
		 * here or reclaim more. It depends on the caller.
		 * last_scanned_child will work well enough for keeping
		 * fairness under the tree.
		 */
    		if (shrink)
    			return ret;
    
		total += ret;
		if (check_soft) {
			if (!res_counter_soft_limit_excess(&root_mem->res))
				return total;
		} else if (mem_cgroup_margin(root_mem))
			return total;
	}
	return total;
}
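
/*
 * Illustrative sketch, not part of the original source: how the
 * reclaim_options bits combine at hypothetical call sites. A hard-limit
 * charge failure reclaims with no special flags (or with
 * MEM_CGROUP_RECLAIM_NOSWAP when only the memsw counter is exhausted), limit
 * resizing adds MEM_CGROUP_RECLAIM_SHRINK so a single pass returns early, and
 * soft-limit reclaim passes MEM_CGROUP_RECLAIM_SOFT so progress is judged
 * against the soft-limit excess. "mem", "zone", "gfp_mask" and "nr_scanned"
 * below are caller-side variables.
 */
#if 0	/* example only, hypothetical caller */
	ret = mem_cgroup_hierarchical_reclaim(mem, zone, gfp_mask,
					      MEM_CGROUP_RECLAIM_SOFT,
					      &nr_scanned);
#endif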
    
/*
 * Check whether the OOM killer is already running under our hierarchy.
 * If someone is already running it, return false.
 */
    static bool mem_cgroup_oom_lock(struct mem_cgroup *mem)
    {
    
    	int x, lock_count = 0;
    	struct mem_cgroup *iter;
    
    	for_each_mem_cgroup_tree(iter, mem) {
    		x = atomic_inc_return(&iter->oom_lock);
    		lock_count = max(x, lock_count);
    	}
    
    
    	if (lock_count == 1)
    		return true;
	return false;
}

static int mem_cgroup_oom_unlock(struct mem_cgroup *mem)
{
	struct mem_cgroup *iter;
    
    
    	/*
    	 * When a new child is created while the hierarchy is under oom,
    	 * mem_cgroup_oom_lock() may not be called. We have to use
    	 * atomic_add_unless() here.
    	 */
    
    	for_each_mem_cgroup_tree(iter, mem)
		atomic_add_unless(&iter->oom_lock, -1, 0);
	return 0;
}

    static DEFINE_MUTEX(memcg_oom_mutex);
    static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq);
    
    
    struct oom_wait_info {
    	struct mem_cgroup *mem;
    	wait_queue_t	wait;
    };
    
    static int memcg_oom_wake_function(wait_queue_t *wait,
    	unsigned mode, int sync, void *arg)
    {
    	struct mem_cgroup *wake_mem = (struct mem_cgroup *)arg;
    	struct oom_wait_info *oom_wait_info;
    
    	oom_wait_info = container_of(wait, struct oom_wait_info, wait);
    
    	if (oom_wait_info->mem == wake_mem)
    		goto wakeup;
    	/* if no hierarchy, no match */
    	if (!oom_wait_info->mem->use_hierarchy || !wake_mem->use_hierarchy)
    		return 0;
    	/*
    	 * Both of oom_wait_info->mem and wake_mem are stable under us.
    	 * Then we can use css_is_ancestor without taking care of RCU.
    	 */
    	if (!css_is_ancestor(&oom_wait_info->mem->css, &wake_mem->css) &&
    	    !css_is_ancestor(&wake_mem->css, &oom_wait_info->mem->css))
    		return 0;
    
    wakeup:
    	return autoremove_wake_function(wait, mode, sync, arg);
    }
    
    static void memcg_wakeup_oom(struct mem_cgroup *mem)
    {
    	/* for filtering, pass "mem" as argument. */
    	__wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, mem);
    }
    
    
    static void memcg_oom_recover(struct mem_cgroup *mem)
    {
    
	if (mem && atomic_read(&mem->oom_lock))
		memcg_wakeup_oom(mem);
}

    /*
     * try to call OOM killer. returns false if we should exit memory-reclaim loop.
     */
bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask)
{
    	struct oom_wait_info owait;
    
    	bool locked, need_to_kill;
    
    	owait.mem = mem;
    	owait.wait.flags = 0;
    	owait.wait.func = memcg_oom_wake_function;
    	owait.wait.private = current;
    	INIT_LIST_HEAD(&owait.wait.task_list);
    
    	need_to_kill = true;
    
    	/* At first, try to OOM lock hierarchy under mem.*/
    	mutex_lock(&memcg_oom_mutex);
    	locked = mem_cgroup_oom_lock(mem);
    	/*
    	 * Even if signal_pending(), we can't quit charge() loop without
    	 * accounting. So, UNINTERRUPTIBLE is appropriate. But SIGKILL
    	 * under OOM is always welcomed, use TASK_KILLABLE here.
    	 */
    
    	prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE);
    	if (!locked || mem->oom_kill_disable)
    		need_to_kill = false;
	if (locked)
		mem_cgroup_oom_notify(mem);
    
    	mutex_unlock(&memcg_oom_mutex);
    
    
    	if (need_to_kill) {
    		finish_wait(&memcg_oom_waitq, &owait.wait);
    
		mem_cgroup_out_of_memory(mem, mask);
	} else {
    		schedule();
    
    		finish_wait(&memcg_oom_waitq, &owait.wait);
    
    	}
    	mutex_lock(&memcg_oom_mutex);
    	mem_cgroup_oom_unlock(mem);
    
    	memcg_wakeup_oom(mem);
    
    	mutex_unlock(&memcg_oom_mutex);
    
    	if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current))
    		return false;
    	/* Give chance to dying process */
    	schedule_timeout(1);
	return true;
}
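
/*
 * Illustrative note, not part of the original source: the charge slow path is
 * expected to loop on this helper, retrying the charge while it returns true
 * and giving up with -ENOMEM once it returns false (the current task was
 * selected by the OOM killer or received a fatal signal).
 */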
    
    /*
     * Currently used to update mapped file statistics, but the routine can be
     * generalized to update other statistics as well.
    
     *
     * Notes: Race condition
     *
 * We usually use page_cgroup_lock() for accessing page_cgroup members, but
 * it tends to be costly. Considering some conditions, we don't need
 * to do so _always_.
 *
 * Considering "charge", lock_page_cgroup() is not required because all
 * file-stat operations happen after a page is attached to the radix-tree.
 * There is no race with "charge".
 *
 * Considering "uncharge", we know that memcg doesn't clear pc->mem_cgroup
 * at "uncharge" intentionally. So, we always see a valid pc->mem_cgroup even
 * if there is a race with "uncharge". The statistics themselves are properly
 * handled by flags.
 *
 * Considering "move", this is the only case where we see a race. To make the
 * race small, we check the MEM_CGROUP_ON_MOVE percpu value and detect whether
 * there is a possibility of a race condition. If there is, we take a lock.
 */
    void mem_cgroup_update_page_stat(struct page *page,
    				 enum mem_cgroup_page_stat_item idx, int val)
    
    {
    	struct mem_cgroup *mem;
    
    	struct page_cgroup *pc = lookup_page_cgroup(page);
    	bool need_unlock = false;
    
    	unsigned long uninitialized_var(flags);
    
    
    	if (unlikely(!pc))
		return;

	rcu_read_lock();
	mem = pc->mem_cgroup;
    
    	if (unlikely(!mem || !PageCgroupUsed(pc)))
    		goto out;
    	/* pc->mem_cgroup is unstable ? */
    
    	if (unlikely(mem_cgroup_stealed(mem)) || PageTransHuge(page)) {
    
    		move_lock_page_cgroup(pc, &flags);
    
    		need_unlock = true;
    		mem = pc->mem_cgroup;
    		if (!mem || !PageCgroupUsed(pc))
    			goto out;
	}

	switch (idx) {
	case MEMCG_NR_FILE_MAPPED:
    		if (val > 0)
    			SetPageCgroupFileMapped(pc);
    		else if (!page_mapped(page))
    
    			ClearPageCgroupFileMapped(pc);
    
		idx = MEM_CGROUP_STAT_FILE_MAPPED;
		break;
	default:
		BUG();
	}

	this_cpu_add(mem->stat->count[idx], val);

out:
	if (need_unlock)
		move_unlock_page_cgroup(pc, &flags);
	rcu_read_unlock();
	return;
}
    EXPORT_SYMBOL(mem_cgroup_update_page_stat);
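
/*
 * Illustrative sketch, not part of the original source: a typical caller in
 * the rmap code would bump the FILE_MAPPED statistic when a file page gains
 * or loses a pte mapping, e.g.:
 */
#if 0	/* example only, hypothetical call sites */
	mem_cgroup_update_page_stat(page, MEMCG_NR_FILE_MAPPED, 1);	/* map */
	mem_cgroup_update_page_stat(page, MEMCG_NR_FILE_MAPPED, -1);	/* unmap */
#endif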
    
    /*
     * size of first charge trial. "32" comes from vmscan.c's magic value.
     * TODO: maybe necessary to use big numbers in big irons.
     */
    
#define CHARGE_BATCH	32U
struct memcg_stock_pcp {
	struct mem_cgroup *cached; /* this is never the root cgroup */
	unsigned int nr_pages;
	struct work_struct work;
    };
    static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock);
    static atomic_t memcg_drain_count;
    
    /*
    
 * Try to consume stocked charge on this cpu. If successful, one page's worth
 * of charge is consumed from the local stock and true is returned. If the
 * stock is empty or holds charges for a cgroup other than the current target,
 * false is returned and the stock will be refilled later.
     */
    static bool consume_stock(struct mem_cgroup *mem)
    {
    	struct memcg_stock_pcp *stock;
    	bool ret = true;
    
    	stock = &get_cpu_var(memcg_stock);
    
    	if (mem == stock->cached && stock->nr_pages)
    		stock->nr_pages--;
    
    	else /* need to call res_counter_charge */
    		ret = false;
    	put_cpu_var(memcg_stock);
    	return ret;
    }
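
/*
 * Illustrative sketch, not part of the original source: the intended fast
 * path in the charge code. If the per-cpu stock already holds pre-charged
 * pages for this memcg, one is consumed without touching the res_counter;
 * otherwise the caller charges a batch against the res_counter and parks the
 * surplus in the stock via refill_stock() (defined below). "batch" and
 * "nr_pages" are hypothetical caller-side variables.
 */
#if 0	/* example only, hypothetical caller */
	if (consume_stock(mem))
		return 0;			/* charged from the local stock */
	/* ... res_counter_charge() a batch of pages, then ... */
	refill_stock(mem, batch - nr_pages);
#endif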
    
    /*
     * Returns stocks cached in percpu to res_counter and reset cached information.
     */
    static void drain_stock(struct memcg_stock_pcp *stock)
    {
    	struct mem_cgroup *old = stock->cached;
    
    
    	if (stock->nr_pages) {
    		unsigned long bytes = stock->nr_pages * PAGE_SIZE;
    
    		res_counter_uncharge(&old->res, bytes);
    
    		if (do_swap_account)
    
    			res_counter_uncharge(&old->memsw, bytes);
    		stock->nr_pages = 0;
    
    	}
    	stock->cached = NULL;
    }
    
    /*
     * This must be called under preempt disabled or must be called by
     * a thread which is pinned to local cpu.
     */
    static void drain_local_stock(struct work_struct *dummy)
    {
    	struct memcg_stock_pcp *stock = &__get_cpu_var(memcg_stock);
    	drain_stock(stock);
    }
    
    /*
 * Cache charges(val), which come from the res_counter, in the local per-cpu
 * area. They will be consumed by consume_stock() later.
 */
    static void refill_stock(struct mem_cgroup *mem, unsigned int nr_pages)
    
    {
    	struct memcg_stock_pcp *stock = &get_cpu_var(memcg_stock);
    
    	if (stock->cached != mem) { /* reset if necessary */
    		drain_stock(stock);
    		stock->cached = mem;
    	}