	struct mem_cgroup *victim = NULL;
	int total = 0;
	int loop = 0;
    	unsigned long excess;
    	unsigned long nr_scanned;
    	struct mem_cgroup_reclaim_cookie reclaim = {
    		.zone = zone,
    		.priority = 0,
    	};
    
    	excess = res_counter_soft_limit_excess(&root_memcg->res) >> PAGE_SHIFT;
    
    	while (1) {
    		victim = mem_cgroup_iter(root_memcg, victim, &reclaim);
    		if (!victim) {
    			loop++;
    			if (loop >= 2) {
    				/*
    				 * If we have not been able to reclaim
				 * anything, it might be because there are
    				 * no reclaimable pages under this hierarchy
    				 */
    				if (!total)
    					break;
    				/*
    				 * We want to do more targeted reclaim.
				 * excess >> 2 is not too excessive, so that
				 * we do not reclaim too much, nor too little
				 * so that we keep coming back to reclaim from
				 * this cgroup
    				 */
    				if (total >= (excess >> 2) ||
    					(loop > MEM_CGROUP_MAX_RECLAIM_LOOPS))
    					break;
    			}
    			continue;
    		}
    		if (!mem_cgroup_reclaimable(victim, false))
    			continue;
    		total += mem_cgroup_shrink_node_zone(victim, gfp_mask, false,
    						     zone, &nr_scanned);
    		*total_scanned += nr_scanned;
		if (!res_counter_soft_limit_excess(&root_memcg->res))
			break;
	}
	mem_cgroup_iter_break(root_memcg, victim);
	return total;
}

    static DEFINE_SPINLOCK(memcg_oom_lock);
    
    
    /*
 * Check whether the OOM killer is already running under our hierarchy.
 * If someone is running it, return false.
     */
    
static bool mem_cgroup_oom_trylock(struct mem_cgroup *memcg)
{
	struct mem_cgroup *iter, *failed = NULL;

	spin_lock(&memcg_oom_lock);

    	for_each_mem_cgroup_tree(iter, memcg) {
    
    		if (iter->oom_lock) {
    
    			/*
    			 * this subtree of our hierarchy is already locked
    			 * so we cannot give a lock.
    			 */
    			failed = iter;
    
    			mem_cgroup_iter_break(memcg, iter);
    			break;
    
		} else
			iter->oom_lock = true;
	}

	if (failed) {
		/*
		 * OK, we failed to lock the whole subtree so we have
		 * to clean up what we set up before reaching the
		 * failing subtree.
		 */
		for_each_mem_cgroup_tree(iter, memcg) {
			if (iter == failed) {
				mem_cgroup_iter_break(memcg, iter);
				break;
			}
			iter->oom_lock = false;
		}
	}

	spin_unlock(&memcg_oom_lock);

	return !failed;
}

static void mem_cgroup_oom_unlock(struct mem_cgroup *memcg)
{
	struct mem_cgroup *iter;

	spin_lock(&memcg_oom_lock);
	for_each_mem_cgroup_tree(iter, memcg)
		iter->oom_lock = false;
	spin_unlock(&memcg_oom_lock);
}

static void mem_cgroup_mark_under_oom(struct mem_cgroup *memcg)
{
	struct mem_cgroup *iter;

	for_each_mem_cgroup_tree(iter, memcg)
		atomic_inc(&iter->under_oom);
}

static void mem_cgroup_unmark_under_oom(struct mem_cgroup *memcg)
{
	struct mem_cgroup *iter;

	/*
	 * When a new child is created while the hierarchy is under oom,
	 * mem_cgroup_oom_lock() may not be called. We have to use
	 * atomic_add_unless() here.
	 */
	for_each_mem_cgroup_tree(iter, memcg)
		atomic_add_unless(&iter->under_oom, -1, 0);
}

    static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq);
    
    
    struct oom_wait_info {
    
    	struct mem_cgroup *memcg;
    
    	wait_queue_t	wait;
    };
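
/*
 * Wake function for the memcg OOM waitqueue: it wakes a waiter only if the
 * waiter's memcg and the OOM'ing memcg passed in the wake argument sit in the
 * same branch of the hierarchy (one is the other's ancestor), as checked
 * below with mem_cgroup_same_or_subtree().
 */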
    
    static int memcg_oom_wake_function(wait_queue_t *wait,
    	unsigned mode, int sync, void *arg)
    {
    
    	struct mem_cgroup *wake_memcg = (struct mem_cgroup *)arg;
    	struct mem_cgroup *oom_wait_memcg;
    
    	struct oom_wait_info *oom_wait_info;
    
    	oom_wait_info = container_of(wait, struct oom_wait_info, wait);
    
    	oom_wait_memcg = oom_wait_info->memcg;
    
	/*
	 * Both of oom_wait_info->memcg and wake_memcg are stable under us.
    
    	 * Then we can use css_is_ancestor without taking care of RCU.
    	 */
    
    	if (!mem_cgroup_same_or_subtree(oom_wait_memcg, wake_memcg)
    		&& !mem_cgroup_same_or_subtree(wake_memcg, oom_wait_memcg))
    
    		return 0;
    	return autoremove_wake_function(wait, mode, sync, arg);
    }
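
/*
 * memcg_wakeup_oom() below bumps memcg->oom_wakeups and wakes every waiter on
 * memcg_oom_waitq; the wake function above then filters out waiters from
 * unrelated parts of the hierarchy. memcg_oom_recover() only bothers waking
 * anyone when the memcg is actually marked under_oom.
 */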
    
    
static void memcg_wakeup_oom(struct mem_cgroup *memcg)
{
	atomic_inc(&memcg->oom_wakeups);
	/* for filtering, pass "memcg" as argument. */
	__wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg);
}

static void memcg_oom_recover(struct mem_cgroup *memcg)
{
	if (memcg && atomic_read(&memcg->under_oom))
		memcg_wakeup_oom(memcg);
}

static void mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order)
{
	bool locked;
	int wakeups;

	if (!current->memcg_oom.may_oom)
		return;

	current->memcg_oom.in_memcg_oom = 1;

    
	/*
	 * As with any blocking lock, a contender needs to start
	 * listening for wakeups before attempting the trylock,
	 * otherwise it can miss the wakeup from the unlock and sleep
	 * indefinitely.  This is just open-coded because our locking
	 * is so particular to memcg hierarchies.
	 */
    
    	wakeups = atomic_read(&memcg->oom_wakeups);
    
    	mem_cgroup_mark_under_oom(memcg);
    
	locked = mem_cgroup_oom_trylock(memcg);

	if (locked)
		mem_cgroup_oom_notify(memcg);

    	if (locked && !memcg->oom_kill_disable) {
    		mem_cgroup_unmark_under_oom(memcg);
    
    		mem_cgroup_out_of_memory(memcg, mask, order);
    
    		mem_cgroup_oom_unlock(memcg);
    		/*
    		 * There is no guarantee that an OOM-lock contender
    		 * sees the wakeups triggered by the OOM kill
		 * uncharges.  Wake any sleepers explicitly.
    		 */
		memcg_oom_recover(memcg);
	} else {
    		/*
    		 * A system call can just return -ENOMEM, but if this
    		 * is a page fault and somebody else is handling the
    		 * OOM already, we need to sleep on the OOM waitqueue
    		 * for this memcg until the situation is resolved.
    		 * Which can take some time because it might be
    		 * handled by a userspace task.
    		 *
    		 * However, this is the charge context, which means
    		 * that we may sit on a large call stack and hold
    		 * various filesystem locks, the mmap_sem etc. and we
    		 * don't want the OOM handler to deadlock on them
    		 * while we sit here and wait.  Store the current OOM
    		 * context in the task_struct, then return -ENOMEM.
    		 * At the end of the page fault handler, with the
    		 * stack unwound, pagefault_out_of_memory() will check
    		 * back with us by calling
    		 * mem_cgroup_oom_synchronize(), possibly putting the
    		 * task to sleep.
    		 */
    		current->memcg_oom.oom_locked = locked;
    		current->memcg_oom.wakeups = wakeups;
    		css_get(&memcg->css);
		current->memcg_oom.wait_on_memcg = memcg;
	}
    }
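
/*
 * Hedged sketch (not code from this file): the comment above already names
 * the intended flow, which looks roughly like
 *
 *	charge path:   mem_cgroup_oom() records the OOM context in
 *	               current->memcg_oom and the charge fails with -ENOMEM
 *	fault return:  VM_FAULT_OOM
 *	fault handler: pagefault_out_of_memory() -> mem_cgroup_oom_synchronize()
 *	               which sleeps on memcg_oom_waitq or finishes the OOM
 *	               handling itself
 *
 * The fault-handler side lives outside this file; the call chain above is an
 * assumption based on the comments here, not a definitive description.
 */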
    
    /**
     * mem_cgroup_oom_synchronize - complete memcg OOM handling
     *
 * This has to be called at the end of a page fault if the memcg
     * OOM handler was enabled and the fault is returning %VM_FAULT_OOM.
     *
     * Memcg supports userspace OOM handling, so failed allocations must
     * sleep on a waitqueue until the userspace task resolves the
     * situation.  Sleeping directly in the charge context with all kinds
     * of locks held is not a good idea, instead we remember an OOM state
     * in the task and mem_cgroup_oom_synchronize() has to be called at
     * the end of the page fault to put the task to sleep and clean up the
     * OOM state.
     *
     * Returns %true if an ongoing memcg OOM situation was detected and
     * finalized, %false otherwise.
     */
    bool mem_cgroup_oom_synchronize(void)
    {
    	struct oom_wait_info owait;
    	struct mem_cgroup *memcg;
    
    	/* OOM is global, do not handle */
    	if (!current->memcg_oom.in_memcg_oom)
    		return false;
    
    	/*
    	 * We invoked the OOM killer but there is a chance that a kill
    	 * did not free up any charges.  Everybody else might already
    	 * be sleeping, so restart the fault and keep the rampage
    	 * going until some charges are released.
    	 */
    	memcg = current->memcg_oom.wait_on_memcg;
    	if (!memcg)
    		goto out;
    
    	if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current))
    		goto out_memcg;
    
    	owait.memcg = memcg;
    	owait.wait.flags = 0;
    	owait.wait.func = memcg_oom_wake_function;
    	owait.wait.private = current;
    	INIT_LIST_HEAD(&owait.wait.task_list);
    
    	prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE);
    	/* Only sleep if we didn't miss any wakeups since OOM */
    	if (atomic_read(&memcg->oom_wakeups) == current->memcg_oom.wakeups)
    		schedule();
    	finish_wait(&memcg_oom_waitq, &owait.wait);
    out_memcg:
    	mem_cgroup_unmark_under_oom(memcg);
    	if (current->memcg_oom.oom_locked) {
    
    		mem_cgroup_oom_unlock(memcg);
    		/*
    		 * There is no guarantee that an OOM-lock contender
    		 * sees the wakeups triggered by the OOM kill
		 * uncharges.  Wake any sleepers explicitly.
    		 */
    		memcg_oom_recover(memcg);
    	}
    
    	css_put(&memcg->css);
    	current->memcg_oom.wait_on_memcg = NULL;
    out:
    	current->memcg_oom.in_memcg_oom = 0;
    
	return true;
}
    
    /*
 * Currently used to update mapped file statistics, but the routine can be
 * generalized to update other statistics as well.
 *
 * Notes: Race condition
 *
 * We usually use page_cgroup_lock() for accessing page_cgroup members, but
 * it tends to be costly. Considering some conditions, we don't need
 * to do so _always_.
 *
 * Considering "charge", lock_page_cgroup() is not required because all
 * file-stat operations happen after a page is attached to the radix-tree.
 * There is no race with "charge".
 *
 * Considering "uncharge", we know that memcg doesn't clear pc->mem_cgroup
 * at "uncharge" intentionally. So, we always see a valid pc->mem_cgroup even
 * if there is a race with "uncharge". The statistics themselves are properly
 * handled by flags.
 *
 * Considering "move", this is the only case where we see a race. To make the
 * race window small, we check mm->moving_account and detect whether there is
 * a possibility of a race. If there is, we take a lock.
 */
    void __mem_cgroup_begin_update_page_stat(struct page *page,
    				bool *locked, unsigned long *flags)
    {
    	struct mem_cgroup *memcg;
    	struct page_cgroup *pc;
    
    	pc = lookup_page_cgroup(page);
    again:
    	memcg = pc->mem_cgroup;
    	if (unlikely(!memcg || !PageCgroupUsed(pc)))
    		return;
    	/*
    	 * If this memory cgroup is not under account moving, we don't
    
    	 * need to take move_lock_mem_cgroup(). Because we already hold
    
    	 * rcu_read_lock(), any calls to move_account will be delayed until
    
	 * rcu_read_unlock() if mem_cgroup_stolen() == true.
	 */
    	if (!mem_cgroup_stolen(memcg))
    
    		return;
    
    	move_lock_mem_cgroup(memcg, flags);
    	if (memcg != pc->mem_cgroup || !PageCgroupUsed(pc)) {
    		move_unlock_mem_cgroup(memcg, flags);
    		goto again;
    	}
    	*locked = true;
    }
    
    void __mem_cgroup_end_update_page_stat(struct page *page, unsigned long *flags)
    {
    	struct page_cgroup *pc = lookup_page_cgroup(page);
    
    	/*
	 * It's guaranteed that pc->mem_cgroup never changes while the
	 * lock is held, because any routine that modifies pc->mem_cgroup
	 * must take move_lock_mem_cgroup().
	 */
    	move_unlock_mem_cgroup(pc->mem_cgroup, flags);
    }
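
/*
 * Hedged usage sketch: a caller updating a per-memcg page statistic is
 * expected to bracket the update with the begin/end helpers above, roughly
 *
 *	bool locked;
 *	unsigned long flags;
 *
 *	mem_cgroup_begin_update_page_stat(page, &locked, &flags);
 *	mem_cgroup_update_page_stat(page, idx, val);
 *	mem_cgroup_end_update_page_stat(page, &locked, &flags);
 *
 * The wrapper names are assumptions about the header-side entry points for
 * the __mem_cgroup_* helpers defined here; see the race notes above.
 */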
    
    
    void mem_cgroup_update_page_stat(struct page *page,
    
				 enum mem_cgroup_stat_index idx, int val)
{
	struct mem_cgroup *memcg;
	struct page_cgroup *pc = lookup_page_cgroup(page);
    	unsigned long uninitialized_var(flags);
    
    	VM_BUG_ON(!rcu_read_lock_held());
    
    	memcg = pc->mem_cgroup;
	if (unlikely(!memcg || !PageCgroupUsed(pc)))
		return;

	this_cpu_add(memcg->stat->count[idx], val);
}

    /*
     * size of first charge trial. "32" comes from vmscan.c's magic value.
     * TODO: maybe necessary to use big numbers in big irons.
 */
#define CHARGE_BATCH	32U
    struct memcg_stock_pcp {
	struct mem_cgroup *cached; /* this is never the root cgroup */
	unsigned int nr_pages;
	struct work_struct work;
	unsigned long flags;
#define FLUSHING_CACHED_CHARGE	0
    };
    static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock);
    
    static DEFINE_MUTEX(percpu_charge_mutex);
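
/*
 * Charge stock mechanism, as implemented below: charges are taken from the
 * res_counter in batches of up to CHARGE_BATCH pages and cached in the
 * per-cpu memcg_stock. consume_stock() serves allocations from that cache,
 * refill_stock() puts an unused remainder back into it, and the drain_*
 * helpers return cached charges to the res_counter.
 */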
    
    /**
     * consume_stock: Try to consume stocked charge on this cpu.
     * @memcg: memcg to consume from.
     * @nr_pages: how many pages to charge.
     *
     * The charges will only happen if @memcg matches the current cpu's memcg
     * stock, and at least @nr_pages are available in that stock.  Failure to
     * service an allocation will refill the stock.
     *
 * returns true if successful, false otherwise.
 */
    static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
    
    {
    	struct memcg_stock_pcp *stock;
    	bool ret = true;
    
    
    	if (nr_pages > CHARGE_BATCH)
    		return false;
    
    
    	stock = &get_cpu_var(memcg_stock);
    
    	if (memcg == stock->cached && stock->nr_pages >= nr_pages)
    		stock->nr_pages -= nr_pages;
    
    	else /* need to call res_counter_charge */
    		ret = false;
    	put_cpu_var(memcg_stock);
    	return ret;
    }
    
    /*
 * Returns stocks cached in percpu to the res_counter and resets cached
 * information.
     */
    static void drain_stock(struct memcg_stock_pcp *stock)
    {
    	struct mem_cgroup *old = stock->cached;
    
    
    	if (stock->nr_pages) {
    		unsigned long bytes = stock->nr_pages * PAGE_SIZE;
    
    		res_counter_uncharge(&old->res, bytes);
    
    		if (do_swap_account)
    
    			res_counter_uncharge(&old->memsw, bytes);
    		stock->nr_pages = 0;
    
    	}
    	stock->cached = NULL;
    }
    
    /*
 * This must be called with preemption disabled, or by a thread which is
 * pinned to the local cpu.
     */
    static void drain_local_stock(struct work_struct *dummy)
    {
    	struct memcg_stock_pcp *stock = &__get_cpu_var(memcg_stock);
    	drain_stock(stock);
    
	clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags);
}

    static void __init memcg_stock_init(void)
    {
    	int cpu;
    
    	for_each_possible_cpu(cpu) {
    		struct memcg_stock_pcp *stock =
    					&per_cpu(memcg_stock, cpu);
    		INIT_WORK(&stock->work, drain_local_stock);
    	}
    }
    
    
    /*
 * Cache charges (nr_pages), which come from the res_counter, in the local
 * per-cpu area. They will be consumed by consume_stock() later.
 */
    static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
    
    {
    	struct memcg_stock_pcp *stock = &get_cpu_var(memcg_stock);
    
    
    	if (stock->cached != memcg) { /* reset if necessary */
    
    		drain_stock(stock);
    
		stock->cached = memcg;
	}
	stock->nr_pages += nr_pages;
	put_cpu_var(memcg_stock);
}

/*
 * Drains all per-CPU charge caches for the given root_memcg resp. the
 * subtree of the hierarchy under it. The sync flag says whether we should
 * block until the work is done.
 */
static void drain_all_stock(struct mem_cgroup *root_memcg, bool sync)
{
	int cpu, curcpu;

	/* Notify other cpus that system-wide "drain" is running */
	get_online_cpus();
	curcpu = get_cpu();
    
    	for_each_online_cpu(cpu) {
    		struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
    
    		struct mem_cgroup *memcg;
    
    		memcg = stock->cached;
		if (!memcg || !stock->nr_pages)
			continue;
		if (!mem_cgroup_same_or_subtree(root_memcg, memcg))
			continue;
    		if (!test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) {
    			if (cpu == curcpu)
    				drain_local_stock(&stock->work);
    			else
    				schedule_work_on(cpu, &stock->work);
		}
	}
	put_cpu();

    	if (!sync)
    		goto out;
    
    	for_each_online_cpu(cpu) {
    		struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
    
		if (test_bit(FLUSHING_CACHED_CHARGE, &stock->flags))
			flush_work(&stock->work);
	}
out:
	put_online_cpus();
    
    }
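
/*
 * Note on FLUSHING_CACHED_CHARGE, as used above: the bit is set before a
 * per-cpu drain is scheduled and cleared by drain_local_stock() once the
 * work has run, so the sync path only has to flush_work() on cpus that still
 * have a drain pending.
 */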
    
    /*
 * Tries to drain stocked charges in other cpus. This function is asynchronous
 * and just puts a work item per cpu for draining locally on each cpu. The
 * caller can expect some charges to go back to the res_counter later, but
 * cannot wait for that to happen.
     */
    
static void drain_all_stock_async(struct mem_cgroup *root_memcg)
{
    	/*
    	 * If someone calls draining, avoid adding more kworker runs.
    	 */
    	if (!mutex_trylock(&percpu_charge_mutex))
    		return;
    
    	drain_all_stock(root_memcg, false);
    
    	mutex_unlock(&percpu_charge_mutex);
    
    }
    
    /* This is a synchronous drain interface. */
    
    static void drain_all_stock_sync(struct mem_cgroup *root_memcg)
    
    {
    	/* called when force_empty is called */
    
    	mutex_lock(&percpu_charge_mutex);
    
    	drain_all_stock(root_memcg, true);
    
	mutex_unlock(&percpu_charge_mutex);
}

    /*
 * This function drains the percpu counter value from a DEAD cpu and
 * moves it to the local cpu. Note that this function can be preempted.
     */
    
static void mem_cgroup_drain_pcp_counter(struct mem_cgroup *memcg, int cpu)
{
	int i;

    	spin_lock(&memcg->pcp_counter_lock);
    
    	for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
    
    		long x = per_cpu(memcg->stat->count[i], cpu);
    
    		per_cpu(memcg->stat->count[i], cpu) = 0;
		memcg->nocpu_base.count[i] += x;
	}
    	for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) {
    
    		unsigned long x = per_cpu(memcg->stat->events[i], cpu);
    
    		per_cpu(memcg->stat->events[i], cpu) = 0;
		memcg->nocpu_base.events[i] += x;
	}
	spin_unlock(&memcg->pcp_counter_lock);
}

    static int memcg_cpu_hotplug_callback(struct notifier_block *nb,
    
    					unsigned long action,
    					void *hcpu)
    {
    	int cpu = (unsigned long)hcpu;
    	struct memcg_stock_pcp *stock;
    
    	struct mem_cgroup *iter;
    
	if (action == CPU_ONLINE)
		return NOTIFY_OK;

	if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
		return NOTIFY_OK;

    	for_each_mem_cgroup(iter)
    
    		mem_cgroup_drain_pcp_counter(iter, cpu);
    
    
    	stock = &per_cpu(memcg_stock, cpu);
    	drain_stock(stock);
    	return NOTIFY_OK;
    }
    
    
    
    /* See __mem_cgroup_try_charge() for details */
    enum {
    	CHARGE_OK,		/* success */
    	CHARGE_RETRY,		/* need to retry but retry is not bad */
    	CHARGE_NOMEM,		/* we can't do more. return -ENOMEM */
	CHARGE_WOULDBLOCK,	/* GFP_WAIT wasn't set and not enough res. */
    };
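
/*
 * mem_cgroup_do_charge() below returns one of the values above;
 * __mem_cgroup_try_charge() switches on the result to decide whether to
 * retry, give up with -ENOMEM, or fall back to the OOM path.
 */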
    
    
    static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
    
				unsigned int nr_pages, unsigned int min_pages,
				bool invoke_oom)
{
    	unsigned long csize = nr_pages * PAGE_SIZE;
    
    	struct mem_cgroup *mem_over_limit;
    	struct res_counter *fail_res;
    	unsigned long flags = 0;
    	int ret;
    
    
    	ret = res_counter_charge(&memcg->res, csize, &fail_res);
    
    
    	if (likely(!ret)) {
    		if (!do_swap_account)
    			return CHARGE_OK;
    
    		ret = res_counter_charge(&memcg->memsw, csize, &fail_res);
    
    		if (likely(!ret))
    			return CHARGE_OK;
    
    
    		res_counter_uncharge(&memcg->res, csize);
    
    		mem_over_limit = mem_cgroup_from_res_counter(fail_res, memsw);
    		flags |= MEM_CGROUP_RECLAIM_NOSWAP;
    	} else
    		mem_over_limit = mem_cgroup_from_res_counter(fail_res, res);
    
    	/*
    	 * Never reclaim on behalf of optional batching, retry with a
    	 * single page instead.
    	 */
    
    	if (nr_pages > min_pages)
    
    		return CHARGE_RETRY;
    
    	if (!(gfp_mask & __GFP_WAIT))
    		return CHARGE_WOULDBLOCK;
    
    
    	if (gfp_mask & __GFP_NORETRY)
    		return CHARGE_NOMEM;
    
    
    	ret = mem_cgroup_reclaim(mem_over_limit, gfp_mask, flags);
    
	if (mem_cgroup_margin(mem_over_limit) >= nr_pages)
		return CHARGE_RETRY;
	/*
	 * Even though the limit is exceeded at this point, reclaim
	 * may have been able to free some pages.  Retry the charge
	 * before killing the task.
	 *
	 * Only for regular pages, though: huge pages are rather
	 * unlikely to succeed so close to the limit, and we fall back
	 * to regular pages anyway in case of failure.
	 */
    	if (nr_pages <= (1 << PAGE_ALLOC_COSTLY_ORDER) && ret)
    
    		return CHARGE_RETRY;
    
    	/*
    	 * At task move, charge accounts can be doubly counted. So, it's
    	 * better to wait until the end of task_move if something is going on.
    	 */
    	if (mem_cgroup_wait_acct_move(mem_over_limit))
    		return CHARGE_RETRY;
    
    
    	if (invoke_oom)
		mem_cgroup_oom(mem_over_limit, gfp_mask, get_order(csize));

	return CHARGE_NOMEM;
}

/*
 * __mem_cgroup_try_charge() does
     * 1. detect memcg to be charged against from passed *mm and *ptr,
     * 2. update res_counter
     * 3. call memory reclaim if necessary.
     *
 * In some special cases, if the task is dying (fatal_signal_pending() or
 * TIF_MEMDIE set), this function returns -EINTR while writing root_mem_cgroup
 * to *ptr. There are two reasons for this. 1: fatal threads should quit as soon
     * as possible without any hazards. 2: all pages should have a valid
     * pc->mem_cgroup. If mm is NULL and the caller doesn't pass a valid memcg
     * pointer, that is treated as a charge to root_mem_cgroup.
     *
     * So __mem_cgroup_try_charge() will return
     *  0       ...  on success, filling *ptr with a valid memcg pointer.
     *  -ENOMEM ...  charge failure because of resource limits.
 *  -EINTR  ...  if the thread is dying. *ptr is filled with root_mem_cgroup.
     *
     * Unlike the exported interface, an "oom" parameter is added. if oom==true,
 * the oom-killer can be invoked.
 */
static int __mem_cgroup_try_charge(struct mm_struct *mm,
				   gfp_t gfp_mask,
				   unsigned int nr_pages,
				   struct mem_cgroup **ptr,
				   bool oom)
{
    	unsigned int batch = max(CHARGE_BATCH, nr_pages);
    
    	int nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES;
    
	struct mem_cgroup *memcg = NULL;
	int ret;

    	/*
	 * Unlike the global VM's OOM-kill, we're not in a memory shortage
	 * at the system level. So, allow dying processes to go ahead, in
	 * addition to MEMDIE processes.
    	 */
    	if (unlikely(test_thread_flag(TIF_MEMDIE)
    		     || fatal_signal_pending(current)))
    		goto bypass;
    
	/*
	 * We always charge the cgroup the mm_struct belongs to.
	 * The mm_struct's mem_cgroup changes on task migration if the
	 * thread group leader migrates. It's possible that mm is not
	 * set, if so charge the root memcg (happens for pagecache usage).
	 */
    
    	if (!*ptr && !mm)
    
    		*ptr = root_mem_cgroup;
    
    again:
    
    	if (*ptr) { /* css should be a valid one */
    		memcg = *ptr;
    		if (mem_cgroup_is_root(memcg))
    
    			goto done;
    
    		if (consume_stock(memcg, nr_pages))
    
    			goto done;
    
		css_get(&memcg->css);
	} else {
    		struct task_struct *p;
    
    		rcu_read_lock();
    		p = rcu_dereference(mm->owner);
    		/*
    
		 * Because we don't have task_lock(), "p" can exit.
		 * In that case, "memcg" can point to root or p can be NULL
		 * with a race against swapoff. Then, we have a small risk of
		 * mis-accounting. But such mis-accounting by a race always
		 * happens because we don't hold cgroup_mutex(). Taking it
		 * would be overkill, so we allow that small race here.
		 * (*) swapoff et al. will charge against the mm_struct, not
		 * against the task_struct. So, mm->owner can be NULL.
		 */
    		memcg = mem_cgroup_from_task(p);
    
    		if (!memcg)
    			memcg = root_mem_cgroup;
    		if (mem_cgroup_is_root(memcg)) {
    
    			rcu_read_unlock();
    			goto done;
    		}
    
    		if (consume_stock(memcg, nr_pages)) {
    
    			/*
			 * It seems dangerous to access memcg without css_get().
			 * But considering how consume_stock() works, it's not
			 * necessary. If consume_stock() succeeds, some charges
    			 * from this memcg are cached on this cpu. So, we
    			 * don't need to call css_get()/css_tryget() before
    			 * calling consume_stock().
    			 */
    			rcu_read_unlock();
    			goto done;
    		}
    		/* after here, we may be blocked. we need to get refcnt */
    
    		if (!css_tryget(&memcg->css)) {
    
    			rcu_read_unlock();
    			goto again;
    		}
    		rcu_read_unlock();
	}

	do {
    		bool invoke_oom = oom && !nr_oom_retries;
    
    		/* If killed, bypass charge */
    
    		if (fatal_signal_pending(current)) {
    
			css_put(&memcg->css);
			goto bypass;
		}

    		ret = mem_cgroup_do_charge(memcg, gfp_mask, batch,
    					   nr_pages, invoke_oom);
    
    		switch (ret) {
    		case CHARGE_OK:
    			break;
		case CHARGE_RETRY: /* not in OOM situation but retry */
			batch = nr_pages;
    			css_put(&memcg->css);
    			memcg = NULL;
    
    			goto again;
    
    		case CHARGE_WOULDBLOCK: /* !__GFP_WAIT */
    
    			css_put(&memcg->css);
    
    			goto nomem;
		case CHARGE_NOMEM: /* OOM routine works */
			if (!oom || invoke_oom) {
    				css_put(&memcg->css);
    
				goto nomem;
			}
    			nr_oom_retries--;
			break;
		}
    	} while (ret != CHARGE_OK);
    
    
	if (batch > nr_pages)
		refill_stock(memcg, batch - nr_pages);
	css_put(&memcg->css);
done:
	*ptr = memcg;
	return 0;
nomem:
	*ptr = NULL;
	return -ENOMEM;
bypass:
	*ptr = root_mem_cgroup;
	return -EINTR;
}
    
    /*
 * Sometimes we have to undo a charge we got by try_charge().
 * This function is for that: it does the uncharge and puts the css refcount
 * gotten by try_charge().
     */
    
    static void __mem_cgroup_cancel_charge(struct mem_cgroup *memcg,
    
				       unsigned int nr_pages)
{
    	if (!mem_cgroup_is_root(memcg)) {
    
    		unsigned long bytes = nr_pages * PAGE_SIZE;
    
    
    		res_counter_uncharge(&memcg->res, bytes);
    
    		if (do_swap_account)
    
			res_counter_uncharge(&memcg->memsw, bytes);
	}
}

    /*
 * Cancel charges in this cgroup; this doesn't propagate to the parent cgroup.
     * This is useful when moving usage to parent cgroup.
     */
    static void __mem_cgroup_cancel_local_charge(struct mem_cgroup *memcg,
    					unsigned int nr_pages)
    {
    	unsigned long bytes = nr_pages * PAGE_SIZE;
    
    	if (mem_cgroup_is_root(memcg))
    		return;
    
    	res_counter_uncharge_until(&memcg->res, memcg->res.parent, bytes);
    	if (do_swap_account)
    		res_counter_uncharge_until(&memcg->memsw,
    						memcg->memsw.parent, bytes);
    }
    
    
    /*
 * A helper function to get a mem_cgroup from an ID. It must be called under
 * rcu_read_lock().  The caller is responsible for calling css_tryget if
 * the mem_cgroup is used for charging. (dropping refcnt from swap can be
 * called against a removed memcg.)
     */
    static struct mem_cgroup *mem_cgroup_lookup(unsigned short id)
    {
    	struct cgroup_subsys_state *css;
    
    	/* ID 0 is unused ID */
    	if (!id)
    		return NULL;
    	css = css_lookup(&mem_cgroup_subsys, id);
    	if (!css)
    		return NULL;
    
	return mem_cgroup_from_css(css);
}

struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
{
    	struct mem_cgroup *memcg = NULL;
    
	struct page_cgroup *pc;
	unsigned short id;
	swp_entry_t ent;

    	VM_BUG_ON(!PageLocked(page));
    
    	pc = lookup_page_cgroup(page);
    
	lock_page_cgroup(pc);
	if (PageCgroupUsed(pc)) {
    		memcg = pc->mem_cgroup;
    		if (memcg && !css_tryget(&memcg->css))
    			memcg = NULL;
    
    	} else if (PageSwapCache(page)) {
    
    		ent.val = page_private(page);
    
		id = lookup_swap_cgroup_id(ent);
		rcu_read_lock();
    		memcg = mem_cgroup_lookup(id);
    		if (memcg && !css_tryget(&memcg->css))
			memcg = NULL;
		rcu_read_unlock();
	}
    	unlock_page_cgroup(pc);
    
	return memcg;
}

static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,
				       struct page *page,
				       unsigned int nr_pages,
				       enum charge_type ctype,
				       bool lrucare)
{
    	struct page_cgroup *pc = lookup_page_cgroup(page);
    
    	struct zone *uninitialized_var(zone);
    
    	struct lruvec *lruvec;
    
	bool was_on_lru = false;
	bool anon;

    	lock_page_cgroup(pc);
    
    	VM_BUG_ON(PageCgroupUsed(pc));
    
    	/*
	 * we don't need page_cgroup_lock for tail pages, because they are not
    	 * accessed by any other context at this point.
    	 */
    
    
    	/*
    	 * In some cases, SwapCache and FUSE(splice_buf->radixtree), the page
    	 * may already be on some other mem_cgroup's LRU.  Take care of it.
    	 */
    	if (lrucare) {
    		zone = page_zone(page);
    		spin_lock_irq(&zone->lru_lock);
    		if (PageLRU(page)) {
    
    			lruvec = mem_cgroup_zone_lruvec(zone, pc->mem_cgroup);
    
    			ClearPageLRU(page);
    
			del_page_from_lru_list(page, lruvec, page_lru(page));
			was_on_lru = true;
		}
	}

    	pc->mem_cgroup = memcg;
    
    	/*
    	 * We access a page_cgroup asynchronously without lock_page_cgroup().
    	 * Especially when a page_cgroup is taken from a page, pc->mem_cgroup
    	 * is accessed after testing USED bit. To make pc->mem_cgroup visible
    	 * before USED bit, we need memory barrier here.
	 * See mem_cgroup_add_lru_list(), etc.
	 */
    	smp_wmb();
    
    	SetPageCgroupUsed(pc);
    
    	if (lrucare) {
    		if (was_on_lru) {
    
    			lruvec = mem_cgroup_zone_lruvec(zone, pc->mem_cgroup);
    
    			VM_BUG_ON(PageLRU(page));
    			SetPageLRU(page);
    
    			add_page_to_lru_list(page, lruvec, page_lru(page));
    
    		}
    		spin_unlock_irq(&zone->lru_lock);
    	}
    
    
	if (ctype == MEM_CGROUP_CHARGE_TYPE_ANON)
		anon = true;
	else
		anon = false;

    	mem_cgroup_charge_statistics(memcg, page, anon, nr_pages);
    
    	unlock_page_cgroup(pc);
    
    	 * "charge_statistics" updated event counter. Then, check it.
    	 * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree.
    	 * if they exceeds softlimit.
    
    	memcg_check_events(memcg, page);
    
    static DEFINE_MUTEX(set_limit_mutex);
    
    
    #ifdef CONFIG_MEMCG_KMEM
    static inline bool memcg_can_account_kmem(struct mem_cgroup *memcg)
    {
    	return !mem_cgroup_disabled() && !mem_cgroup_is_root(memcg) &&
    		(memcg->kmem_account_flags & KMEM_ACCOUNTED_MASK);
    }
    
    
    /*
     * This is a bit cumbersome, but it is rarely used and avoids a backpointer
     * in the memcg_cache_params struct.
     */
    static struct kmem_cache *memcg_params_to_cache(struct memcg_cache_params *p)
    {
    	struct kmem_cache *cachep;
    
    	VM_BUG_ON(p->is_root_cache);
    	cachep = p->root_cache;
    	return cachep->memcg_params->memcg_caches[memcg_cache_id(p->memcg)];
    }
    
    
    #ifdef CONFIG_SLABINFO
    
    static int mem_cgroup_slabinfo_read(struct cgroup_subsys_state *css,
				    struct cftype *cft, struct seq_file *m)
{
    	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
    
    	struct memcg_cache_params *params;
    
    	if (!memcg_can_account_kmem(memcg))
    		return -EIO;
    
    	print_slabinfo_header(m);
    
    	mutex_lock(&memcg->slab_caches_mutex);
    	list_for_each_entry(params, &memcg->memcg_slab_caches, list)
    		cache_show(memcg_params_to_cache(params), m);
    	mutex_unlock(&memcg->slab_caches_mutex);