memcontrol.c

		switch (ret) {
		case 0:
			list_move(&page->lru, dst);
			mem_cgroup_del_lru(page);
			nr_taken++;
			break;
		case -EBUSY:
			/* we don't affect global LRU but rotate in our LRU */
			mem_cgroup_rotate_lru_list(page, page_lru(page));
			break;
		default:
			break;
		}
	}

	*scanned = scan;

	trace_mm_vmscan_memcg_isolate(0, nr_to_scan, scan, nr_taken,
				      0, 0, 0, mode);

	return nr_taken;
}

#define mem_cgroup_from_res_counter(counter, member)	\
	container_of(counter, struct mem_cgroup, member)

static bool mem_cgroup_check_under_limit(struct mem_cgroup *mem)
{
	if (do_swap_account) {
		if (res_counter_check_under_limit(&mem->res) &&
			res_counter_check_under_limit(&mem->memsw))
			return true;
	} else
		if (res_counter_check_under_limit(&mem->res))
			return true;
	return false;
}

static unsigned int get_swappiness(struct mem_cgroup *memcg)
{
	struct cgroup *cgrp = memcg->css.cgroup;
	unsigned int swappiness;

	/* root ? */
	if (cgrp->parent == NULL)
		return vm_swappiness;

	spin_lock(&memcg->reclaim_param_lock);
	swappiness = memcg->swappiness;
	spin_unlock(&memcg->reclaim_param_lock);

	return swappiness;
}

static void mem_cgroup_start_move(struct mem_cgroup *mem)
{
	int cpu;
	/* Because this is for moving account, reuse mc.lock */
	spin_lock(&mc.lock);
	for_each_possible_cpu(cpu)
		per_cpu(mem->stat->count[MEM_CGROUP_ON_MOVE], cpu) += 1;
	spin_unlock(&mc.lock);

	synchronize_rcu();
}

static void mem_cgroup_end_move(struct mem_cgroup *mem)
{
	int cpu;

	if (!mem)
		return;
	spin_lock(&mc.lock);
	for_each_possible_cpu(cpu)
		per_cpu(mem->stat->count[MEM_CGROUP_ON_MOVE], cpu) -= 1;
	spin_unlock(&mc.lock);
}
/*
 * 2 routines for checking "mem" is under move_account() or not.
 *
 * mem_cgroup_stealed() - checking a cgroup is mc.from or not. This is used
 *			  for avoiding race in accounting. If true,
 *			  pc->mem_cgroup may be overwritten.
 *
 * mem_cgroup_under_move() - checking a cgroup is mc.from or mc.to or
 *			  under hierarchy of moving cgroups. This is for
 *			  waiting at hith-memory prressure caused by "move".
 */

static bool mem_cgroup_stealed(struct mem_cgroup *mem)
{
	VM_BUG_ON(!rcu_read_lock_held());
	return this_cpu_read(mem->stat->count[MEM_CGROUP_ON_MOVE]) > 0;
}

static bool mem_cgroup_under_move(struct mem_cgroup *mem)
{
	struct mem_cgroup *from;
	struct mem_cgroup *to;
	bool ret = false;
	/*
	 * Unlike task_move routines, we access mc.to, mc.from not under
	 * mutual exclusion by cgroup_mutex. Here, we take spinlock instead.
	 */
	spin_lock(&mc.lock);
	from = mc.from;
	to = mc.to;
	if (!from)
		goto unlock;
	if (from == mem || to == mem
	    || (mem->use_hierarchy && css_is_ancestor(&from->css, &mem->css))
	    || (mem->use_hierarchy && css_is_ancestor(&to->css,	&mem->css)))
		ret = true;
unlock:
	spin_unlock(&mc.lock);
	return ret;
}

static bool mem_cgroup_wait_acct_move(struct mem_cgroup *mem)
{
	if (mc.moving_task && current != mc.moving_task) {
		if (mem_cgroup_under_move(mem)) {
			DEFINE_WAIT(wait);
			prepare_to_wait(&mc.waitq, &wait, TASK_INTERRUPTIBLE);
			/* moving charge context might have finished. */
			if (mc.moving_task)
				schedule();
			finish_wait(&mc.waitq, &wait);
			return true;
		}
	}
	return false;
}

static int mem_cgroup_count_children_cb(struct mem_cgroup *mem, void *data)
{
	int *val = data;
	(*val)++;
	return 0;
}

/**
 * mem_cgroup_print_oom_info: Called from OOM with tasklist_lock held in read mode.
 * @memcg: The memory cgroup that went over limit
 * @p: Task that is going to be killed
 *
 * NOTE: @memcg and @p's mem_cgroup can be different when hierarchy is
 * enabled
 */
void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
{
	struct cgroup *task_cgrp;
	struct cgroup *mem_cgrp;
	/*
	 * Need a buffer in BSS, can't rely on allocations. The code relies
	 * on the assumption that OOM is serialized for memory controller.
	 * If this assumption is broken, revisit this code.
	 */
	static char memcg_name[PATH_MAX];
	int ret;

	if (!memcg || !p)
		return;


	rcu_read_lock();

	mem_cgrp = memcg->css.cgroup;
	task_cgrp = task_cgroup(p, mem_cgroup_subsys_id);

	ret = cgroup_path(task_cgrp, memcg_name, PATH_MAX);
	if (ret < 0) {
		/*
		 * Unfortunately, we are unable to convert to a useful name
		 * But we'll still print out the usage information
		 */
		rcu_read_unlock();
		goto done;
	}
	rcu_read_unlock();

	printk(KERN_INFO "Task in %s killed", memcg_name);

	rcu_read_lock();
	ret = cgroup_path(mem_cgrp, memcg_name, PATH_MAX);
	if (ret < 0) {
		rcu_read_unlock();
		goto done;
	}
	rcu_read_unlock();

	/*
	 * Continues from above, so we don't need an KERN_ level
	 */
	printk(KERN_CONT " as a result of limit of %s\n", memcg_name);
done:

	printk(KERN_INFO "memory: usage %llukB, limit %llukB, failcnt %llu\n",
		res_counter_read_u64(&memcg->res, RES_USAGE) >> 10,
		res_counter_read_u64(&memcg->res, RES_LIMIT) >> 10,
		res_counter_read_u64(&memcg->res, RES_FAILCNT));
	printk(KERN_INFO "memory+swap: usage %llukB, limit %llukB, "
		"failcnt %llu\n",
		res_counter_read_u64(&memcg->memsw, RES_USAGE) >> 10,
		res_counter_read_u64(&memcg->memsw, RES_LIMIT) >> 10,
		res_counter_read_u64(&memcg->memsw, RES_FAILCNT));
}

/*
 * This function returns the number of memcg under hierarchy tree. Returns
 * 1(self count) if no children.
 */
static int mem_cgroup_count_children(struct mem_cgroup *mem)
{
	int num = 0;
 	mem_cgroup_walk_tree(mem, &num, mem_cgroup_count_children_cb);
	return num;
}

/*
 * Return the memory (and swap, if configured) limit for a memcg.
 */
u64 mem_cgroup_get_limit(struct mem_cgroup *memcg)
{
	u64 limit;
	u64 memsw;

	limit = res_counter_read_u64(&memcg->res, RES_LIMIT) +
			total_swap_pages;
	memsw = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
	/*
	 * If memsw is finite and limits the amount of swap space available
	 * to this memcg, return that limit.
	 */
	return min(limit, memsw);
}

/*
 * Visit the first child (need not be the first child as per the ordering
 * of the cgroup list, since we track last_scanned_child) of @mem and use
 * that to reclaim free pages from.
 */
static struct mem_cgroup *
mem_cgroup_select_victim(struct mem_cgroup *root_mem)
{
	struct mem_cgroup *ret = NULL;
	struct cgroup_subsys_state *css;
	int nextid, found;

	if (!root_mem->use_hierarchy) {
		css_get(&root_mem->css);
		ret = root_mem;
	}

	while (!ret) {
		rcu_read_lock();
		nextid = root_mem->last_scanned_child + 1;
		css = css_get_next(&mem_cgroup_subsys, nextid, &root_mem->css,
				   &found);
		if (css && css_tryget(css))
			ret = container_of(css, struct mem_cgroup, css);

		rcu_read_unlock();
		/* Updates scanning parameter */
		spin_lock(&root_mem->reclaim_param_lock);
		if (!css) {
			/* this means start scan from ID:1 */
			root_mem->last_scanned_child = 0;
		} else
			root_mem->last_scanned_child = found;
		spin_unlock(&root_mem->reclaim_param_lock);
	}

	return ret;
}

/*
 * Scan the hierarchy if needed to reclaim memory. We remember the last child
 * we reclaimed from, so that we don't end up penalizing one child extensively
 * based on its position in the children list.
 *
 * root_mem is the original ancestor that we've been reclaim from.
 *
 * We give up and return to the caller when we visit root_mem twice.
 * (other groups can be removed while we're walking....)
 *
 * If shrink==true, for avoiding to free too much, this returns immedieately.
 */
static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
						struct zone *zone,
						gfp_t gfp_mask,
						unsigned long reclaim_options)
{
	struct mem_cgroup *victim;
	int ret, total = 0;
	int loop = 0;
	bool noswap = reclaim_options & MEM_CGROUP_RECLAIM_NOSWAP;
	bool shrink = reclaim_options & MEM_CGROUP_RECLAIM_SHRINK;
	bool check_soft = reclaim_options & MEM_CGROUP_RECLAIM_SOFT;
	unsigned long excess = mem_cgroup_get_excess(root_mem);

	/* If memsw_is_minimum==1, swap-out is of-no-use. */
	if (root_mem->memsw_is_minimum)
		noswap = true;

	while (1) {
		victim = mem_cgroup_select_victim(root_mem);
		if (victim == root_mem) {
			loop++;
			if (loop >= 1)
				drain_all_stock_async();
			if (loop >= 2) {
				/*
				 * If we have not been able to reclaim
				 * anything, it might because there are
				 * no reclaimable pages under this hierarchy
				 */
				if (!check_soft || !total) {
					css_put(&victim->css);
					break;
				}
				/*
				 * We want to do more targetted reclaim.
				 * excess >> 2 is not to excessive so as to
				 * reclaim too much, nor too less that we keep
				 * coming back to reclaim from this cgroup
				 */
				if (total >= (excess >> 2) ||
					(loop > MEM_CGROUP_MAX_RECLAIM_LOOPS)) {
					css_put(&victim->css);
					break;
				}
			}
		}
		if (!mem_cgroup_local_usage(victim)) {
			/* this cgroup's local usage == 0 */
			css_put(&victim->css);
			continue;
		}
		/* we use swappiness of local cgroup */
		if (check_soft)
			ret = mem_cgroup_shrink_node_zone(victim, gfp_mask,
				noswap, get_swappiness(victim), zone);
		else
			ret = try_to_free_mem_cgroup_pages(victim, gfp_mask,
						noswap, get_swappiness(victim));
		css_put(&victim->css);
		/*
		 * At shrinking usage, we can't check we should stop here or
		 * reclaim more. It's depends on callers. last_scanned_child
		 * will work enough for keeping fairness under tree.
		 */
		if (shrink)
			return ret;
		total += ret;
		if (check_soft) {
			if (res_counter_check_under_soft_limit(&root_mem->res))
				return total;
		} else if (mem_cgroup_check_under_limit(root_mem))
			return 1 + total;
	}
	return total;
}

static int mem_cgroup_oom_lock_cb(struct mem_cgroup *mem, void *data)
{
	int *val = (int *)data;
	int x;
	/*
	 * Logically, we can stop scanning immediately when we find
	 * a memcg is already locked. But condidering unlock ops and
	 * creation/removal of memcg, scan-all is simple operation.
	 */
	x = atomic_inc_return(&mem->oom_lock);
	*val = max(x, *val);
	return 0;
}
/*
 * Check OOM-Killer is already running under our hierarchy.
 * If someone is running, return false.
 */
static bool mem_cgroup_oom_lock(struct mem_cgroup *mem)
{
	int lock_count = 0;

	mem_cgroup_walk_tree(mem, &lock_count, mem_cgroup_oom_lock_cb);

	if (lock_count == 1)
		return true;
	return false;
}

static int mem_cgroup_oom_unlock_cb(struct mem_cgroup *mem, void *data)
{
	/*
	 * When a new child is created while the hierarchy is under oom,
	 * mem_cgroup_oom_lock() may not be called. We have to use
	 * atomic_add_unless() here.
	 */
	atomic_add_unless(&mem->oom_lock, -1, 0);
	return 0;
}

static void mem_cgroup_oom_unlock(struct mem_cgroup *mem)
{
	mem_cgroup_walk_tree(mem, NULL,	mem_cgroup_oom_unlock_cb);
}

static DEFINE_MUTEX(memcg_oom_mutex);
static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq);

struct oom_wait_info {
	struct mem_cgroup *mem;
	wait_queue_t	wait;
};

static int memcg_oom_wake_function(wait_queue_t *wait,
	unsigned mode, int sync, void *arg)
{
	struct mem_cgroup *wake_mem = (struct mem_cgroup *)arg;
	struct oom_wait_info *oom_wait_info;

	oom_wait_info = container_of(wait, struct oom_wait_info, wait);

	if (oom_wait_info->mem == wake_mem)
		goto wakeup;
	/* if no hierarchy, no match */
	if (!oom_wait_info->mem->use_hierarchy || !wake_mem->use_hierarchy)
		return 0;
	/*
	 * Both of oom_wait_info->mem and wake_mem are stable under us.
	 * Then we can use css_is_ancestor without taking care of RCU.
	 */
	if (!css_is_ancestor(&oom_wait_info->mem->css, &wake_mem->css) &&
	    !css_is_ancestor(&wake_mem->css, &oom_wait_info->mem->css))
		return 0;

wakeup:
	return autoremove_wake_function(wait, mode, sync, arg);
}

static void memcg_wakeup_oom(struct mem_cgroup *mem)
{
	/* for filtering, pass "mem" as argument. */
	__wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, mem);
}

static void memcg_oom_recover(struct mem_cgroup *mem)
{
	if (mem && atomic_read(&mem->oom_lock))
		memcg_wakeup_oom(mem);
}

/*
 * try to call OOM killer. returns false if we should exit memory-reclaim loop.
 */
bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask)
{
	struct oom_wait_info owait;
	bool locked, need_to_kill;

	owait.mem = mem;
	owait.wait.flags = 0;
	owait.wait.func = memcg_oom_wake_function;
	owait.wait.private = current;
	INIT_LIST_HEAD(&owait.wait.task_list);
	need_to_kill = true;
	/* At first, try to OOM lock hierarchy under mem.*/
	mutex_lock(&memcg_oom_mutex);
	locked = mem_cgroup_oom_lock(mem);
	/*
	 * Even if signal_pending(), we can't quit charge() loop without
	 * accounting. So, UNINTERRUPTIBLE is appropriate. But SIGKILL
	 * under OOM is always welcomed, use TASK_KILLABLE here.
	 */
	prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE);
	if (!locked || mem->oom_kill_disable)
		need_to_kill = false;
	if (locked)
		mem_cgroup_oom_notify(mem);
	mutex_unlock(&memcg_oom_mutex);

	if (need_to_kill) {
		finish_wait(&memcg_oom_waitq, &owait.wait);
		mem_cgroup_out_of_memory(mem, mask);
	} else {
		schedule();
		finish_wait(&memcg_oom_waitq, &owait.wait);
	}
	mutex_lock(&memcg_oom_mutex);
	mem_cgroup_oom_unlock(mem);
	memcg_wakeup_oom(mem);
	mutex_unlock(&memcg_oom_mutex);

	if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current))
		return false;
	/* Give chance to dying process */
	schedule_timeout(1);
	return true;
}

/*
 * Currently used to update mapped file statistics, but the routine can be
 * generalized to update other statistics as well.
 *
 * Notes: Race condition
 *
 * We usually use page_cgroup_lock() for accessing page_cgroup member but
 * it tends to be costly. But considering some conditions, we doesn't need
 * to do so _always_.
 *
 * Considering "charge", lock_page_cgroup() is not required because all
 * file-stat operations happen after a page is attached to radix-tree. There
 * are no race with "charge".
 *
 * Considering "uncharge", we know that memcg doesn't clear pc->mem_cgroup
 * at "uncharge" intentionally. So, we always see valid pc->mem_cgroup even
 * if there are race with "uncharge". Statistics itself is properly handled
 * by flags.
 *
 * Considering "move", this is an only case we see a race. To make the race
 * small, we check MEM_CGROUP_ON_MOVE percpu value and detect there are
 * possibility of race condition. If there is, we take a lock.
 */
void mem_cgroup_update_file_mapped(struct page *page, int val)
{
	struct mem_cgroup *mem;
	struct page_cgroup *pc = lookup_page_cgroup(page);
	bool need_unlock = false;

	if (unlikely(!pc))
		return;

	rcu_read_lock();
	mem = pc->mem_cgroup;
	if (unlikely(!mem || !PageCgroupUsed(pc)))
		goto out;
	/* pc->mem_cgroup is unstable ? */
	if (unlikely(mem_cgroup_stealed(mem))) {
		/* take a lock against to access pc->mem_cgroup */
		lock_page_cgroup(pc);
		need_unlock = true;
		mem = pc->mem_cgroup;
		if (!mem || !PageCgroupUsed(pc))
			goto out;
	}
	if (val > 0) {
		this_cpu_inc(mem->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
		SetPageCgroupFileMapped(pc);
	} else {
		this_cpu_dec(mem->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
		if (!page_mapped(page)) /* for race between dec->inc counter */
			ClearPageCgroupFileMapped(pc);
	}

out:
	if (unlikely(need_unlock))
		unlock_page_cgroup(pc);
	rcu_read_unlock();
	return;
}

/*
 * size of first charge trial. "32" comes from vmscan.c's magic value.
 * TODO: maybe necessary to use big numbers in big irons.
 */
#define CHARGE_SIZE	(32 * PAGE_SIZE)
struct memcg_stock_pcp {
	struct mem_cgroup *cached; /* this never be root cgroup */
	int charge;
	struct work_struct work;
};
static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock);
static atomic_t memcg_drain_count;

/*
 * Try to consume stocked charge on this cpu. If success, PAGE_SIZE is consumed
 * from local stock and true is returned. If the stock is 0 or charges from a
 * cgroup which is not current target, returns false. This stock will be
 * refilled.
 */
static bool consume_stock(struct mem_cgroup *mem)
{
	struct memcg_stock_pcp *stock;
	bool ret = true;

	stock = &get_cpu_var(memcg_stock);
	if (mem == stock->cached && stock->charge)
		stock->charge -= PAGE_SIZE;
	else /* need to call res_counter_charge */
		ret = false;
	put_cpu_var(memcg_stock);
	return ret;
}

/*
 * Returns stocks cached in percpu to res_counter and reset cached information.
 */
static void drain_stock(struct memcg_stock_pcp *stock)
{
	struct mem_cgroup *old = stock->cached;

	if (stock->charge) {
		res_counter_uncharge(&old->res, stock->charge);
		if (do_swap_account)
			res_counter_uncharge(&old->memsw, stock->charge);
	}
	stock->cached = NULL;
	stock->charge = 0;
}

/*
 * This must be called under preempt disabled or must be called by
 * a thread which is pinned to local cpu.
 */
static void drain_local_stock(struct work_struct *dummy)
{
	struct memcg_stock_pcp *stock = &__get_cpu_var(memcg_stock);
	drain_stock(stock);
}

/*
 * Cache charges(val) which is from res_counter, to local per_cpu area.
 * This will be consumed by consume_stock() function, later.
 */
static void refill_stock(struct mem_cgroup *mem, int val)
{
	struct memcg_stock_pcp *stock = &get_cpu_var(memcg_stock);

	if (stock->cached != mem) { /* reset if necessary */
		drain_stock(stock);
		stock->cached = mem;
	}
	stock->charge += val;
	put_cpu_var(memcg_stock);
}

/*
 * Tries to drain stocked charges in other cpus. This function is asynchronous
 * and just put a work per cpu for draining localy on each cpu. Caller can
 * expects some charges will be back to res_counter later but cannot wait for
 * it.
 */
static void drain_all_stock_async(void)
{
	int cpu;
	/* This function is for scheduling "drain" in asynchronous way.
	 * The result of "drain" is not directly handled by callers. Then,
	 * if someone is calling drain, we don't have to call drain more.
	 * Anyway, WORK_STRUCT_PENDING check in queue_work_on() will catch if
	 * there is a race. We just do loose check here.
	 */
	if (atomic_read(&memcg_drain_count))
		return;
	/* Notify other cpus that system-wide "drain" is running */
	atomic_inc(&memcg_drain_count);
	get_online_cpus();
	for_each_online_cpu(cpu) {
		struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
		schedule_work_on(cpu, &stock->work);
	}
 	put_online_cpus();
	atomic_dec(&memcg_drain_count);
	/* We don't wait for flush_work */
}

/* This is a synchronous drain interface. */
static void drain_all_stock_sync(void)
{
	/* called when force_empty is called */
	atomic_inc(&memcg_drain_count);
	schedule_on_each_cpu(drain_local_stock);
	atomic_dec(&memcg_drain_count);
}

static int __cpuinit memcg_stock_cpu_callback(struct notifier_block *nb,
					unsigned long action,
					void *hcpu)
{
	int cpu = (unsigned long)hcpu;
	struct memcg_stock_pcp *stock;

	if (action != CPU_DEAD)
		return NOTIFY_OK;
	stock = &per_cpu(memcg_stock, cpu);
	drain_stock(stock);
	return NOTIFY_OK;
}


/* See __mem_cgroup_try_charge() for details */
enum {
	CHARGE_OK,		/* success */
	CHARGE_RETRY,		/* need to retry but retry is not bad */
	CHARGE_NOMEM,		/* we can't do more. return -ENOMEM */
	CHARGE_WOULDBLOCK,	/* GFP_WAIT wasn't set and no enough res. */
	CHARGE_OOM_DIE,		/* the current is killed because of OOM */
};

static int __mem_cgroup_do_charge(struct mem_cgroup *mem, gfp_t gfp_mask,
				int csize, bool oom_check)
{
	struct mem_cgroup *mem_over_limit;
	struct res_counter *fail_res;
	unsigned long flags = 0;
	int ret;

	ret = res_counter_charge(&mem->res, csize, &fail_res);

	if (likely(!ret)) {
		if (!do_swap_account)
			return CHARGE_OK;
		ret = res_counter_charge(&mem->memsw, csize, &fail_res);
		if (likely(!ret))
			return CHARGE_OK;

		mem_over_limit = mem_cgroup_from_res_counter(fail_res, memsw);
		flags |= MEM_CGROUP_RECLAIM_NOSWAP;
	} else
		mem_over_limit = mem_cgroup_from_res_counter(fail_res, res);

	if (csize > PAGE_SIZE) /* change csize and retry */
		return CHARGE_RETRY;

	if (!(gfp_mask & __GFP_WAIT))
		return CHARGE_WOULDBLOCK;

	ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, NULL,
					gfp_mask, flags);
	/*
	 * try_to_free_mem_cgroup_pages() might not give us a full
	 * picture of reclaim. Some pages are reclaimed and might be
	 * moved to swap cache or just unmapped from the cgroup.
	 * Check the limit again to see if the reclaim reduced the
	 * current usage of the cgroup before giving up
	 */
	if (ret || mem_cgroup_check_under_limit(mem_over_limit))
		return CHARGE_RETRY;

	/*
	 * At task move, charge accounts can be doubly counted. So, it's
	 * better to wait until the end of task_move if something is going on.
	 */
	if (mem_cgroup_wait_acct_move(mem_over_limit))
		return CHARGE_RETRY;

	/* If we don't need to call oom-killer at el, return immediately */
	if (!oom_check)
		return CHARGE_NOMEM;
	/* check OOM */
	if (!mem_cgroup_handle_oom(mem_over_limit, gfp_mask))
		return CHARGE_OOM_DIE;

	return CHARGE_RETRY;
}

/*
 * Unlike exported interface, "oom" parameter is added. if oom==true,
 * oom-killer can be invoked.
 */
static int __mem_cgroup_try_charge(struct mm_struct *mm,
		gfp_t gfp_mask, struct mem_cgroup **memcg, bool oom)
{
	int nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES;
	struct mem_cgroup *mem = NULL;
	int ret;
	int csize = CHARGE_SIZE;

	/*
	 * Unlike gloval-vm's OOM-kill, we're not in memory shortage
	 * in system level. So, allow to go ahead dying process in addition to
	 * MEMDIE process.
	 */
	if (unlikely(test_thread_flag(TIF_MEMDIE)
		     || fatal_signal_pending(current)))
		goto bypass;

	/*
	 * We always charge the cgroup the mm_struct belongs to.
	 * The mm_struct's mem_cgroup changes on task migration if the
	 * thread group leader migrates. It's possible that mm is not
	 * set, if so charge the init_mm (happens for pagecache usage).
	 */
	if (!*memcg && !mm)
		goto bypass;
again:
	if (*memcg) { /* css should be a valid one */
		mem = *memcg;
		VM_BUG_ON(css_is_removed(&mem->css));
		if (mem_cgroup_is_root(mem))
			goto done;
		if (consume_stock(mem))
			goto done;
		css_get(&mem->css);
	} else {
		struct task_struct *p;

		rcu_read_lock();
		p = rcu_dereference(mm->owner);
		VM_BUG_ON(!p);
		/*
		 * because we don't have task_lock(), "p" can exit while
		 * we're here. In that case, "mem" can point to root
		 * cgroup but never be NULL. (and task_struct itself is freed
		 * by RCU, cgroup itself is RCU safe.) Then, we have small
		 * risk here to get wrong cgroup. But such kind of mis-account
		 * by race always happens because we don't have cgroup_mutex().
		 * It's overkill and we allow that small race, here.
		 */
		mem = mem_cgroup_from_task(p);
		VM_BUG_ON(!mem);
		if (mem_cgroup_is_root(mem)) {
			rcu_read_unlock();
			goto done;
		}
		if (consume_stock(mem)) {
			/*
			 * It seems dagerous to access memcg without css_get().
			 * But considering how consume_stok works, it's not
			 * necessary. If consume_stock success, some charges
			 * from this memcg are cached on this cpu. So, we
			 * don't need to call css_get()/css_tryget() before
			 * calling consume_stock().
			 */
			rcu_read_unlock();
			goto done;
		}
		/* after here, we may be blocked. we need to get refcnt */
		if (!css_tryget(&mem->css)) {
			rcu_read_unlock();
			goto again;
		}
		rcu_read_unlock();
	}

	do {
		bool oom_check;

		/* If killed, bypass charge */
		if (fatal_signal_pending(current)) {
			css_put(&mem->css);
			goto bypass;
		}

		oom_check = false;
		if (oom && !nr_oom_retries) {
			oom_check = true;
			nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES;
		}

		ret = __mem_cgroup_do_charge(mem, gfp_mask, csize, oom_check);

		switch (ret) {
		case CHARGE_OK:
			break;
		case CHARGE_RETRY: /* not in OOM situation but retry */
			csize = PAGE_SIZE;
			css_put(&mem->css);
			mem = NULL;
			goto again;
		case CHARGE_WOULDBLOCK: /* !__GFP_WAIT */
			css_put(&mem->css);
			goto nomem;
		case CHARGE_NOMEM: /* OOM routine works */
			if (!oom) {
				css_put(&mem->css);
				goto nomem;
			}
			/* If oom, we never return -ENOMEM */
			nr_oom_retries--;
			break;
		case CHARGE_OOM_DIE: /* Killed by OOM Killer */
			css_put(&mem->css);
			goto bypass;
		}
	} while (ret != CHARGE_OK);

	if (csize > PAGE_SIZE)
		refill_stock(mem, csize - PAGE_SIZE);
	css_put(&mem->css);
done:
	*memcg = mem;
	return 0;
nomem:
	*memcg = NULL;
	return -ENOMEM;
bypass:
	*memcg = NULL;
	return 0;
}

/*
 * Somemtimes we have to undo a charge we got by try_charge().
 * This function is for that and do uncharge, put css's refcnt.
 * gotten by try_charge().
 */
static void __mem_cgroup_cancel_charge(struct mem_cgroup *mem,
							unsigned long count)
{
	if (!mem_cgroup_is_root(mem)) {
		res_counter_uncharge(&mem->res, PAGE_SIZE * count);
		if (do_swap_account)
			res_counter_uncharge(&mem->memsw, PAGE_SIZE * count);
	}
}

static void mem_cgroup_cancel_charge(struct mem_cgroup *mem)
{
	__mem_cgroup_cancel_charge(mem, 1);
}

/*
 * A helper function to get mem_cgroup from ID. must be called under
 * rcu_read_lock(). The caller must check css_is_removed() or some if
 * it's concern. (dropping refcnt from swap can be called against removed
 * memcg.)
 */
static struct mem_cgroup *mem_cgroup_lookup(unsigned short id)
{
	struct cgroup_subsys_state *css;

	/* ID 0 is unused ID */
	if (!id)
		return NULL;
	css = css_lookup(&mem_cgroup_subsys, id);
	if (!css)
		return NULL;
	return container_of(css, struct mem_cgroup, css);
}

struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
{
	struct mem_cgroup *mem = NULL;
	struct page_cgroup *pc;
	unsigned short id;
	swp_entry_t ent;

	VM_BUG_ON(!PageLocked(page));

	pc = lookup_page_cgroup(page);
	lock_page_cgroup(pc);
	if (PageCgroupUsed(pc)) {
		mem = pc->mem_cgroup;
		if (mem && !css_tryget(&mem->css))
			mem = NULL;
	} else if (PageSwapCache(page)) {
		ent.val = page_private(page);
		id = lookup_swap_cgroup(ent);
		rcu_read_lock();
		mem = mem_cgroup_lookup(id);
		if (mem && !css_tryget(&mem->css))
			mem = NULL;
		rcu_read_unlock();
	}
	unlock_page_cgroup(pc);
	return mem;
}

/*
 * commit a charge got by __mem_cgroup_try_charge() and makes page_cgroup to be
 * USED state. If already USED, uncharge and return.
 */

static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
				     struct page_cgroup *pc,
				     enum charge_type ctype)
{
	/* try_charge() can return NULL to *memcg, taking care of it. */
	if (!mem)
		return;

	lock_page_cgroup(pc);
	if (unlikely(PageCgroupUsed(pc))) {
		unlock_page_cgroup(pc);
		mem_cgroup_cancel_charge(mem);
		return;
	}

	pc->mem_cgroup = mem;
	/*
	 * We access a page_cgroup asynchronously without lock_page_cgroup().
	 * Especially when a page_cgroup is taken from a page, pc->mem_cgroup
	 * is accessed after testing USED bit. To make pc->mem_cgroup visible
	 * before USED bit, we need memory barrier here.
	 * See mem_cgroup_add_lru_list(), etc.
 	 */
	smp_wmb();
	switch (ctype) {
	case MEM_CGROUP_CHARGE_TYPE_CACHE:
	case MEM_CGROUP_CHARGE_TYPE_SHMEM:
		SetPageCgroupCache(pc);
		SetPageCgroupUsed(pc);
		break;
	case MEM_CGROUP_CHARGE_TYPE_MAPPED:
		ClearPageCgroupCache(pc);
		SetPageCgroupUsed(pc);
		break;
	default:
		break;
	}