memcontrol.c
static int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, u64 size)
    {
    	struct res_counter *fail_res;
    	struct mem_cgroup *_memcg;
    	int ret = 0;
    	bool may_oom;
    
    	ret = res_counter_charge(&memcg->kmem, size, &fail_res);
    	if (ret)
    		return ret;
    
    	/*
    	 * Conditions under which we can wait for the oom_killer. Those are
    	 * the same conditions tested by the core page allocator
    	 */
    	may_oom = (gfp & __GFP_FS) && !(gfp & __GFP_NORETRY);
    
    	_memcg = memcg;
    	ret = __mem_cgroup_try_charge(NULL, gfp, size >> PAGE_SHIFT,
    				      &_memcg, may_oom);
    
    	if (ret == -EINTR)  {
    		/*
		 * __mem_cgroup_try_charge() chose to bypass to root due to
		 * OOM kill or fatal signal.  Since our only options are to
		 * either fail the allocation or charge it to this cgroup, do
		 * it as a temporary condition. But we can't fail. From a
		 * kmem/slab perspective, the cache has already been selected
		 * by mem_cgroup_kmem_get_cache(), so it is too late to change
    		 * our minds.
    		 *
    		 * This condition will only trigger if the task entered
    		 * memcg_charge_kmem in a sane state, but was OOM-killed during
    		 * __mem_cgroup_try_charge() above. Tasks that were already
    		 * dying when the allocation triggers should have been already
    		 * directed to the root cgroup in memcontrol.h
    		 */
    		res_counter_charge_nofail(&memcg->res, size, &fail_res);
    		if (do_swap_account)
    			res_counter_charge_nofail(&memcg->memsw, size,
    						  &fail_res);
    		ret = 0;
    	} else if (ret)
    		res_counter_uncharge(&memcg->kmem, size);
    
    	return ret;
    }
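
/*
 * Illustrative usage sketch: a caller wanting a kmem-accounted allocation
 * pairs memcg_charge_kmem() with memcg_uncharge_kmem() below, charging in
 * bytes. The "order" here is hypothetical; the real users are the page
 * allocator hooks further down in this file.
 *
 *	if (memcg_charge_kmem(memcg, gfp, PAGE_SIZE << order))
 *		return NULL;	(over the limit: fail the allocation)
 *	... use the memory ...
 *	memcg_uncharge_kmem(memcg, PAGE_SIZE << order);
 */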
    
    static void memcg_uncharge_kmem(struct mem_cgroup *memcg, u64 size)
    {
    	res_counter_uncharge(&memcg->res, size);
    	if (do_swap_account)
		res_counter_uncharge(&memcg->memsw, size);

	/* Not down to 0 */
	if (res_counter_uncharge(&memcg->kmem, size))
		return;

	/*
	 * Releases a reference taken in kmem_cgroup_css_offline in case
	 * this last uncharge is racing with the offlining code or it is
	 * outliving the memcg existence.
	 *
	 * The memory barrier imposed by test&clear is paired with the
	 * explicit one in memcg_kmem_mark_dead().
	 */
	if (memcg_kmem_test_and_clear_dead(memcg))
		css_put(&memcg->css);
}

    void memcg_cache_list_add(struct mem_cgroup *memcg, struct kmem_cache *cachep)
    {
    	if (!memcg)
    		return;
    
    	mutex_lock(&memcg->slab_caches_mutex);
    	list_add(&cachep->memcg_params->list, &memcg->memcg_slab_caches);
    	mutex_unlock(&memcg->slab_caches_mutex);
    }
    
    /*
 * helper for accessing a memcg's index. It will be used as an index in the
     * child cache array in kmem_cache, and also to derive its name. This function
     * will return -1 when this is not a kmem-limited memcg.
     */
    int memcg_cache_id(struct mem_cgroup *memcg)
    {
    	return memcg ? memcg->kmemcg_id : -1;
    }
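
/*
 * Illustrative sketch: the id returned above indexes the per-memcg child
 * cache array of a root cache, as __memcg_kmem_get_cache() does below.
 * "root_cachep" is a hypothetical root (non-memcg) cache.
 *
 *	int idx = memcg_cache_id(memcg);
 *
 *	if (idx >= 0)
 *		child = root_cachep->memcg_params->memcg_caches[idx];
 */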
    
    
    /*
     * This ends up being protected by the set_limit mutex, during normal
     * operation, because that is its main call site.
     *
     * But when we create a new cache, we can call this as well if its parent
     * is kmem-limited. That will have to hold set_limit_mutex as well.
     */
    int memcg_update_cache_sizes(struct mem_cgroup *memcg)
    {
    	int num, ret;
    
    	num = ida_simple_get(&kmem_limited_groups,
    				0, MEMCG_CACHES_MAX_SIZE, GFP_KERNEL);
    	if (num < 0)
    		return num;
    	/*
    	 * After this point, kmem_accounted (that we test atomically in
    	 * the beginning of this conditional), is no longer 0. This
    	 * guarantees only one process will set the following boolean
    	 * to true. We don't need test_and_set because we're protected
    	 * by the set_limit_mutex anyway.
    	 */
    	memcg_kmem_set_activated(memcg);
    
    	ret = memcg_update_all_caches(num+1);
    	if (ret) {
    		ida_simple_remove(&kmem_limited_groups, num);
    		memcg_kmem_clear_activated(memcg);
    		return ret;
    	}
    
    	memcg->kmemcg_id = num;
    	INIT_LIST_HEAD(&memcg->memcg_slab_caches);
    	mutex_init(&memcg->slab_caches_mutex);
    	return 0;
    }
    
    static size_t memcg_caches_array_size(int num_groups)
    {
    	ssize_t size;
    	if (num_groups <= 0)
    		return 0;
    
    	size = 2 * num_groups;
    	if (size < MEMCG_CACHES_MIN_SIZE)
    		size = MEMCG_CACHES_MIN_SIZE;
    	else if (size > MEMCG_CACHES_MAX_SIZE)
    		size = MEMCG_CACHES_MAX_SIZE;
    
    	return size;
    }
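
/*
 * Worked example, assuming MEMCG_CACHES_MIN_SIZE is 4 (see memcontrol.h for
 * the real constants): num_groups == 1 doubles to 2 and is clamped up to 4,
 * num_groups == 3 doubles to 6, and any value whose double exceeds
 * MEMCG_CACHES_MAX_SIZE is clamped down to that maximum. Doubling keeps the
 * array growth done in memcg_update_cache_size() amortized across updates.
 */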
    
    /*
 * We should update the current array size iff all cache updates succeed. This
     * can only be done from the slab side. The slab mutex needs to be held when
     * calling this.
     */
    void memcg_update_array_size(int num)
    {
    	if (num > memcg_limited_groups_array_size)
    		memcg_limited_groups_array_size = memcg_caches_array_size(num);
    }
    
    
    static void kmem_cache_destroy_work_func(struct work_struct *w);
    
    
    int memcg_update_cache_size(struct kmem_cache *s, int num_groups)
    {
    	struct memcg_cache_params *cur_params = s->memcg_params;
    
    	VM_BUG_ON(s->memcg_params && !s->memcg_params->is_root_cache);
    
    	if (num_groups > memcg_limited_groups_array_size) {
    		int i;
    		ssize_t size = memcg_caches_array_size(num_groups);
    
    		size *= sizeof(void *);
    
    		size += offsetof(struct memcg_cache_params, memcg_caches);
    
    
    		s->memcg_params = kzalloc(size, GFP_KERNEL);
    		if (!s->memcg_params) {
    			s->memcg_params = cur_params;
    			return -ENOMEM;
    		}
    
    		s->memcg_params->is_root_cache = true;
    
    		/*
    		 * There is the chance it will be bigger than
    		 * memcg_limited_groups_array_size, if we failed an allocation
		 * in a cache, in which case all caches updated before it will
    		 * have a bigger array.
    		 *
    		 * But if that is the case, the data after
    		 * memcg_limited_groups_array_size is certainly unused
    		 */
    		for (i = 0; i < memcg_limited_groups_array_size; i++) {
    			if (!cur_params->memcg_caches[i])
    				continue;
    			s->memcg_params->memcg_caches[i] =
    						cur_params->memcg_caches[i];
    		}
    
    		/*
    		 * Ideally, we would wait until all caches succeed, and only
    		 * then free the old one. But this is not worth the extra
    		 * pointer per-cache we'd have to have for this.
    		 *
    		 * It is not a big deal if some caches are left with a size
    		 * bigger than the others. And all updates will reset this
    		 * anyway.
    		 */
    		kfree(cur_params);
    	}
    	return 0;
    }
    
    
int memcg_register_cache(struct mem_cgroup *memcg, struct kmem_cache *s,
			 struct kmem_cache *root_cache)
{
	size_t size;

	if (!memcg) {
		size = offsetof(struct memcg_cache_params, memcg_caches);
		size += memcg_limited_groups_array_size * sizeof(void *);
	} else
		size = sizeof(struct memcg_cache_params);
    
    	s->memcg_params = kzalloc(size, GFP_KERNEL);
    	if (!s->memcg_params)
    		return -ENOMEM;
    
    
	if (memcg) {
		s->memcg_params->memcg = memcg;
		s->memcg_params->root_cache = root_cache;
		INIT_WORK(&s->memcg_params->destroy,
				kmem_cache_destroy_work_func);
	} else
		s->memcg_params->is_root_cache = true;

    	return 0;
    }
    
    void memcg_release_cache(struct kmem_cache *s)
    {
    
    	struct kmem_cache *root;
    	struct mem_cgroup *memcg;
    	int id;
    
    	/*
    	 * This happens, for instance, when a root cache goes away before we
    	 * add any memcg.
    	 */
    	if (!s->memcg_params)
    		return;
    
    	if (s->memcg_params->is_root_cache)
    		goto out;
    
    	memcg = s->memcg_params->memcg;
    	id  = memcg_cache_id(memcg);
    
    	root = s->memcg_params->root_cache;
    	root->memcg_params->memcg_caches[id] = NULL;
    
	mutex_lock(&memcg->slab_caches_mutex);
	list_del(&s->memcg_params->list);
	mutex_unlock(&memcg->slab_caches_mutex);

	css_put(&memcg->css);
out:
	kfree(s->memcg_params);
}

    /*
 * During the creation of a new cache, we need to disable our accounting
 * mechanism altogether. This is true even if we are not creating, but rather
 * just enqueuing new caches to be created.
     *
     * This is because that process will trigger allocations; some visible, like
     * explicit kmallocs to auxiliary data structures, name strings and internal
     * cache structures; some well concealed, like INIT_WORK() that can allocate
     * objects during debug.
     *
     * If any allocation happens during memcg_kmem_get_cache, we will recurse back
     * to it. This may not be a bounded recursion: since the first cache creation
     * failed to complete (waiting on the allocation), we'll just try to create the
     * cache again, failing at the same point.
     *
     * memcg_kmem_get_cache is prepared to abort after seeing a positive count of
     * memcg_kmem_skip_account. So we enclose anything that might allocate memory
     * inside the following two functions.
     */
    static inline void memcg_stop_kmem_account(void)
    {
    	VM_BUG_ON(!current->mm);
    	current->memcg_kmem_skip_account++;
    }
    
    static inline void memcg_resume_kmem_account(void)
    {
    	VM_BUG_ON(!current->mm);
    	current->memcg_kmem_skip_account--;
    }
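
/*
 * Illustrative usage sketch, mirroring memcg_create_cache_enqueue() below:
 * any allocation made on behalf of the memcg machinery itself is wrapped in
 * the pair above so that memcg_kmem_get_cache() bails out instead of
 * recursing into cache creation.
 *
 *	memcg_stop_kmem_account();
 *	ptr = kmalloc(size, GFP_KERNEL);	(not routed to a memcg cache)
 *	memcg_resume_kmem_account();
 */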
    
    
    static void kmem_cache_destroy_work_func(struct work_struct *w)
    {
    	struct kmem_cache *cachep;
    	struct memcg_cache_params *p;
    
    	p = container_of(w, struct memcg_cache_params, destroy);
    
    	cachep = memcg_params_to_cache(p);
    
    
    	/*
    	 * If we get down to 0 after shrink, we could delete right away.
    	 * However, memcg_release_pages() already puts us back in the workqueue
    	 * in that case. If we proceed deleting, we'll get a dangling
    	 * reference, and removing the object from the workqueue in that case
    	 * is unnecessary complication. We are not a fast path.
    	 *
    	 * Note that this case is fundamentally different from racing with
	 * shrink_slab(): if mem_cgroup_destroy_cache() is called in
	 * kmem_cache_shrink, not only would we be reinserting a dead cache
    	 * into the queue, but doing so from inside the worker racing to
    	 * destroy it.
    	 *
    	 * So if we aren't down to zero, we'll just schedule a worker and try
    	 * again
    	 */
    	if (atomic_read(&cachep->memcg_params->nr_pages) != 0) {
    		kmem_cache_shrink(cachep);
    		if (atomic_read(&cachep->memcg_params->nr_pages) == 0)
    			return;
	} else
		kmem_cache_destroy(cachep);
    }
    
    void mem_cgroup_destroy_cache(struct kmem_cache *cachep)
    {
    	if (!cachep->memcg_params->dead)
    		return;
    
    
    	/*
    	 * There are many ways in which we can get here.
    	 *
    	 * We can get to a memory-pressure situation while the delayed work is
    	 * still pending to run. The vmscan shrinkers can then release all
    	 * cache memory and get us to destruction. If this is the case, we'll
    	 * be executed twice, which is a bug (the second time will execute over
    	 * bogus data). In this case, cancelling the work should be fine.
    	 *
    	 * But we can also get here from the worker itself, if
    	 * kmem_cache_shrink is enough to shake all the remaining objects and
    	 * get the page count to 0. In this case, we'll deadlock if we try to
    	 * cancel the work (the worker runs with an internal lock held, which
    	 * is the same lock we would hold for cancel_work_sync().)
    	 *
    	 * Since we can't possibly know who got us here, just refrain from
    	 * running if there is already work pending
    	 */
    	if (work_pending(&cachep->memcg_params->destroy))
    		return;
    
    	/*
    	 * We have to defer the actual destroying to a workqueue, because
    	 * we might currently be in a context that cannot sleep.
    	 */
    	schedule_work(&cachep->memcg_params->destroy);
    }
    
    
    /*
     * This lock protects updaters, not readers. We want readers to be as fast as
     * they can, and they will either see NULL or a valid cache value. Our model
 * allows them to see NULL, in which case the root memcg will be selected.
 *
 * We need this lock because multiple allocations to the same cache may be in
 * flight at once, and its creation may then span more than one worker. Only
 * one of them can create the cache.
     */
    static DEFINE_MUTEX(memcg_cache_mutex);
    
    /*
     * Called with memcg_cache_mutex held
     */
    
    static struct kmem_cache *kmem_cache_dup(struct mem_cgroup *memcg,
    					 struct kmem_cache *s)
    {
    	struct kmem_cache *new;
    
    	static char *tmp_name = NULL;
    
    	lockdep_assert_held(&memcg_cache_mutex);
    
    	/*
	 * kmem_cache_create_memcg duplicates the given name, and
	 * cgroup_name() for this name requires RCU context.
	 * This static temporary buffer is used to prevent
	 * pointless short-lived allocations.
    	 */
    	if (!tmp_name) {
    		tmp_name = kmalloc(PATH_MAX, GFP_KERNEL);
    		if (!tmp_name)
    			return NULL;
    	}
    
    	rcu_read_lock();
    	snprintf(tmp_name, PATH_MAX, "%s(%d:%s)", s->name,
    			 memcg_cache_id(memcg), cgroup_name(memcg->css.cgroup));
    	rcu_read_unlock();
    
	new = kmem_cache_create_memcg(memcg, tmp_name, s->object_size, s->align,
				      (s->flags & ~SLAB_PANIC), s->ctor, s);

	if (new)
		new->allocflags |= __GFP_KMEMCG;

	return new;
    }
    
    static struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg,
    						  struct kmem_cache *cachep)
    {
    	struct kmem_cache *new_cachep;
    	int idx;
    
    	BUG_ON(!memcg_can_account_kmem(memcg));
    
    	idx = memcg_cache_id(memcg);
    
    	mutex_lock(&memcg_cache_mutex);
    	new_cachep = cachep->memcg_params->memcg_caches[idx];
    
	if (new_cachep) {
		css_put(&memcg->css);
		goto out;
	}

	new_cachep = kmem_cache_dup(memcg, cachep);
	if (new_cachep == NULL) {
		new_cachep = cachep;
		css_put(&memcg->css);
		goto out;
	}

	atomic_set(&new_cachep->memcg_params->nr_pages, 0);

	cachep->memcg_params->memcg_caches[idx] = new_cachep;
    	/*
    	 * the readers won't lock, make sure everybody sees the updated value,
    	 * so they won't put stuff in the queue again for no reason
    	 */
    	wmb();
    out:
    	mutex_unlock(&memcg_cache_mutex);
    	return new_cachep;
    }
    
    
    void kmem_cache_destroy_memcg_children(struct kmem_cache *s)
    {
    	struct kmem_cache *c;
    	int i;
    
    	if (!s->memcg_params)
    		return;
    	if (!s->memcg_params->is_root_cache)
    		return;
    
    	/*
    	 * If the cache is being destroyed, we trust that there is no one else
    	 * requesting objects from it. Even if there are, the sanity checks in
	 * kmem_cache_destroy should have caught this ill case.
    	 *
    	 * Still, we don't want anyone else freeing memcg_caches under our
    	 * noses, which can happen if a new memcg comes to life. As usual,
    	 * we'll take the set_limit_mutex to protect ourselves against this.
    	 */
    	mutex_lock(&set_limit_mutex);
    	for (i = 0; i < memcg_limited_groups_array_size; i++) {
    		c = s->memcg_params->memcg_caches[i];
    		if (!c)
    			continue;
    
    		/*
    		 * We will now manually delete the caches, so to avoid races
    		 * we need to cancel all pending destruction workers and
    		 * proceed with destruction ourselves.
    		 *
    		 * kmem_cache_destroy() will call kmem_cache_shrink internally,
    		 * and that could spawn the workers again: it is likely that
		 * the cache still has active pages at this very moment.
    		 * This would lead us back to mem_cgroup_destroy_cache.
    		 *
    		 * But that will not execute at all if the "dead" flag is not
    		 * set, so flip it down to guarantee we are in control.
    		 */
    		c->memcg_params->dead = false;
    
    		cancel_work_sync(&c->memcg_params->destroy);
    
    		kmem_cache_destroy(c);
    	}
    	mutex_unlock(&set_limit_mutex);
    }
    
    
    struct create_work {
    	struct mem_cgroup *memcg;
    	struct kmem_cache *cachep;
    	struct work_struct work;
    };
    
    
    static void mem_cgroup_destroy_all_caches(struct mem_cgroup *memcg)
    {
    	struct kmem_cache *cachep;
    	struct memcg_cache_params *params;
    
    	if (!memcg_kmem_is_active(memcg))
    		return;
    
    	mutex_lock(&memcg->slab_caches_mutex);
    	list_for_each_entry(params, &memcg->memcg_slab_caches, list) {
    		cachep = memcg_params_to_cache(params);
    		cachep->memcg_params->dead = true;
    		schedule_work(&cachep->memcg_params->destroy);
    	}
    	mutex_unlock(&memcg->slab_caches_mutex);
    }
    
    
    static void memcg_create_cache_work_func(struct work_struct *w)
    {
    	struct create_work *cw;
    
    	cw = container_of(w, struct create_work, work);
    	memcg_create_kmem_cache(cw->memcg, cw->cachep);
    	kfree(cw);
    }
    
    /*
     * Enqueue the creation of a per-memcg kmem_cache.
     */
    
    static void __memcg_create_cache_enqueue(struct mem_cgroup *memcg,
    					 struct kmem_cache *cachep)
    
    {
    	struct create_work *cw;
    
    	cw = kmalloc(sizeof(struct create_work), GFP_NOWAIT);
    
    	if (cw == NULL) {
    		css_put(&memcg->css);
    
    		return;
    	}
    
    	cw->memcg = memcg;
    	cw->cachep = cachep;
    
    	INIT_WORK(&cw->work, memcg_create_cache_work_func);
    	schedule_work(&cw->work);
    }
    
    
    static void memcg_create_cache_enqueue(struct mem_cgroup *memcg,
    				       struct kmem_cache *cachep)
    {
    	/*
    	 * We need to stop accounting when we kmalloc, because if the
    	 * corresponding kmalloc cache is not yet created, the first allocation
    	 * in __memcg_create_cache_enqueue will recurse.
    	 *
    	 * However, it is better to enclose the whole function. Depending on
    	 * the debugging options enabled, INIT_WORK(), for instance, can
    	 * trigger an allocation. This too, will make us recurse. Because at
    	 * this point we can't allow ourselves back into memcg_kmem_get_cache,
    	 * the safest choice is to do it like this, wrapping the whole function.
    	 */
    	memcg_stop_kmem_account();
    	__memcg_create_cache_enqueue(memcg, cachep);
    	memcg_resume_kmem_account();
    }
    
    /*
     * Return the kmem_cache we're supposed to use for a slab allocation.
     * We try to use the current memcg's version of the cache.
     *
 * If the cache does not exist yet (i.e. we are the first user of it),
     * we either create it immediately, if possible, or create it asynchronously
     * in a workqueue.
     * In the latter case, we will let the current allocation go through with
     * the original cache.
     *
     * Can't be called in interrupt context or from kernel threads.
     * This function needs to be called with rcu_read_lock() held.
     */
    struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep,
    					  gfp_t gfp)
    {
    	struct mem_cgroup *memcg;
    	int idx;
    
    	VM_BUG_ON(!cachep->memcg_params);
    	VM_BUG_ON(!cachep->memcg_params->is_root_cache);
    
    
    	if (!current->mm || current->memcg_kmem_skip_account)
    		return cachep;
    
    
    	rcu_read_lock();
    	memcg = mem_cgroup_from_task(rcu_dereference(current->mm->owner));
    
	if (!memcg_can_account_kmem(memcg))
		goto out;

	idx = memcg_cache_id(memcg);

	/*
	 * barrier to make sure we're always seeing the up to date value.  The
	 * code updating memcg_caches will issue a write barrier to match this.
	 */
	read_barrier_depends();
	if (likely(cachep->memcg_params->memcg_caches[idx])) {
		cachep = cachep->memcg_params->memcg_caches[idx];
		goto out;
	}

	/* The corresponding put will be done in the workqueue. */
	if (!css_tryget(&memcg->css))
		goto out;
    	rcu_read_unlock();
    
    	/*
	 * If we are in a safe context (can wait, and not in interrupt
	 * context), we could be predictable and return right away.
	 * This would guarantee that the allocation being performed
	 * already belongs in the new cache.
	 *
	 * However, there are some clashes that can arise from locking.
	 * For instance, because we acquire the slab_mutex while doing
	 * kmem_cache_dup, this means no further allocation could happen
	 * with the slab_mutex held.
	 *
	 * Also, because cache creation issues get_online_cpus(), this
	 * creates a lock chain: memcg_slab_mutex -> cpu_hotplug_mutex,
	 * that ends up reversed during cpu hotplug. (cpuset allocates
	 * a bunch of GFP_KERNEL memory during cpuup). Due to all that,
    	 * better to defer everything.
    	 */
    	memcg_create_cache_enqueue(memcg, cachep);
    	return cachep;
    out:
    	rcu_read_unlock();
    	return cachep;
    
    }
    EXPORT_SYMBOL(__memcg_kmem_get_cache);
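
/*
 * Illustrative sketch of the caller side: the slab allocation path is
 * expected to go through the memcg_kmem_get_cache() wrapper in memcontrol.h
 * and then simply allocate from whatever cache comes back, either the
 * per-memcg child or the original root cache. slab_alloc_from() is a
 * hypothetical stand-in for the allocator-specific fast path.
 *
 *	cachep = memcg_kmem_get_cache(cachep, flags);
 *	obj = slab_alloc_from(cachep, flags);
 */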
    
    
    /*
     * We need to verify if the allocation against current->mm->owner's memcg is
     * possible for the given order. But the page is not allocated yet, so we'll
     * need a further commit step to do the final arrangements.
     *
 * It is possible for the task to switch cgroups in the meantime, so at
     * commit time, we can't rely on task conversion any longer.  We'll then use
     * the handle argument to return to the caller which cgroup we should commit
     * against. We could also return the memcg directly and avoid the pointer
     * passing, but a boolean return value gives better semantics considering
     * the compiled-out case as well.
     *
     * Returning true means the allocation is possible.
     */
    bool
    __memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **_memcg, int order)
    {
    	struct mem_cgroup *memcg;
    	int ret;
    
    	*_memcg = NULL;
    
    
    	/*
    	 * Disabling accounting is only relevant for some specific memcg
    	 * internal allocations. Therefore we would initially not have such
    	 * check here, since direct calls to the page allocator that are marked
    	 * with GFP_KMEMCG only happen outside memcg core. We are mostly
    	 * concerned with cache allocations, and by having this test at
    	 * memcg_kmem_get_cache, we are already able to relay the allocation to
    	 * the root cache and bypass the memcg cache altogether.
    	 *
    	 * There is one exception, though: the SLUB allocator does not create
    	 * large order caches, but rather service large kmallocs directly from
    	 * the page allocator. Therefore, the following sequence when backed by
    	 * the SLUB allocator:
	 *
	 *	memcg_stop_kmem_account();
	 *	kmalloc(<large_number>)
	 *	memcg_resume_kmem_account();
    	 *
    	 * would effectively ignore the fact that we should skip accounting,
    	 * since it will drive us directly to this function without passing
    	 * through the cache selector memcg_kmem_get_cache. Such large
    	 * allocations are extremely rare but can happen, for instance, for the
    	 * cache arrays. We bring this test here.
    	 */
    	if (!current->mm || current->memcg_kmem_skip_account)
    		return true;
    
    
    	memcg = try_get_mem_cgroup_from_mm(current->mm);
    
    	/*
    	 * very rare case described in mem_cgroup_from_task. Unfortunately there
    	 * isn't much we can do without complicating this too much, and it would
    	 * be gfp-dependent anyway. Just let it go
    	 */
    	if (unlikely(!memcg))
    		return true;
    
    	if (!memcg_can_account_kmem(memcg)) {
    		css_put(&memcg->css);
    		return true;
    	}
    
    	ret = memcg_charge_kmem(memcg, gfp, PAGE_SIZE << order);
    	if (!ret)
    		*_memcg = memcg;
    
    	css_put(&memcg->css);
    	return (ret == 0);
    }
    
    void __memcg_kmem_commit_charge(struct page *page, struct mem_cgroup *memcg,
    			      int order)
    {
    	struct page_cgroup *pc;
    
    	VM_BUG_ON(mem_cgroup_is_root(memcg));
    
    	/* The page allocation failed. Revert */
    	if (!page) {
    		memcg_uncharge_kmem(memcg, PAGE_SIZE << order);
    		return;
    	}
    
    	pc = lookup_page_cgroup(page);
    	lock_page_cgroup(pc);
    	pc->mem_cgroup = memcg;
    	SetPageCgroupUsed(pc);
    	unlock_page_cgroup(pc);
    }
    
    void __memcg_kmem_uncharge_pages(struct page *page, int order)
    {
    	struct mem_cgroup *memcg = NULL;
    	struct page_cgroup *pc;
    
    
    	pc = lookup_page_cgroup(page);
    	/*
    	 * Fast unlocked return. Theoretically might have changed, have to
    	 * check again after locking.
    	 */
    	if (!PageCgroupUsed(pc))
    		return;
    
    	lock_page_cgroup(pc);
    	if (PageCgroupUsed(pc)) {
    		memcg = pc->mem_cgroup;
    		ClearPageCgroupUsed(pc);
    	}
    	unlock_page_cgroup(pc);
    
    	/*
    	 * We trust that only if there is a memcg associated with the page, it
    	 * is a valid allocation
    	 */
    	if (!memcg)
    		return;
    
    	VM_BUG_ON(mem_cgroup_is_root(memcg));
    	memcg_uncharge_kmem(memcg, PAGE_SIZE << order);
    }
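
/*
 * Illustrative sketch of the charge/commit/uncharge protocol described
 * above, as seen from the page allocator side. The real entry points are
 * the memcg_kmem_*() wrappers in memcontrol.h used for __GFP_KMEMCG
 * allocations; alloc_the_pages() is a hypothetical stand-in.
 *
 *	struct mem_cgroup *memcg;
 *
 *	if (!__memcg_kmem_newpage_charge(gfp, &memcg, order))
 *		return NULL;	(charge denied, do not allocate)
 *	page = alloc_the_pages(gfp, order);
 *	if (memcg)
 *		__memcg_kmem_commit_charge(page, memcg, order);
 *	...
 *	__memcg_kmem_uncharge_pages(page, order);	(when freeing)
 */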
    
    #else
    static inline void mem_cgroup_destroy_all_caches(struct mem_cgroup *memcg)
    {
    }
    
    #endif /* CONFIG_MEMCG_KMEM */
    
    
    #ifdef CONFIG_TRANSPARENT_HUGEPAGE
    
    
    #define PCGF_NOCOPY_AT_SPLIT (1 << PCG_LOCK | 1 << PCG_MIGRATION)
    
/*
 * Because tail pages are not marked as "used", set it. We're under
 * zone->lru_lock, 'splitting on pmd' and compound_lock, so charge/uncharge
 * can never happen, and move_account() is done under compound_lock(); we
 * don't have to take care of races.
 */
void mem_cgroup_split_huge_fixup(struct page *head)
{
	struct page_cgroup *head_pc = lookup_page_cgroup(head);
	struct page_cgroup *pc;
	struct mem_cgroup *memcg;
	int i;

	if (mem_cgroup_disabled())
		return;

	memcg = head_pc->mem_cgroup;
	for (i = 1; i < HPAGE_PMD_NR; i++) {
		pc = head_pc + i;
		pc->mem_cgroup = memcg;
		smp_wmb();	/* see __commit_charge() */
		pc->flags = head_pc->flags & ~PCGF_NOCOPY_AT_SPLIT;
	}
	__this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE],
		       HPAGE_PMD_NR);
}
    #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
    
    static inline
    void mem_cgroup_move_account_page_stat(struct mem_cgroup *from,
    					struct mem_cgroup *to,
    					unsigned int nr_pages,
    					enum mem_cgroup_stat_index idx)
    {
    	/* Update stat data for mem_cgroup */
    	preempt_disable();
    	WARN_ON_ONCE(from->stat->count[idx] < nr_pages);
    	__this_cpu_add(from->stat->count[idx], -nr_pages);
    	__this_cpu_add(to->stat->count[idx], nr_pages);
    	preempt_enable();
    }
    
    
/**
 * mem_cgroup_move_account - move account of the page
 * @page: the page to move
 * @nr_pages: number of regular pages (>1 for huge pages)
 * @pc:	page_cgroup of the page.
 * @from: mem_cgroup which the page is moved from.
 * @to:	mem_cgroup which the page is moved to. @from != @to.
 *
 * The caller must confirm the following:
 * - page is not on LRU (isolate_page() is useful.)
 * - compound_lock is held when nr_pages > 1
 *
 * This function doesn't do "charge" to the new cgroup and doesn't do
 * "uncharge" from the old cgroup.
 */
static int mem_cgroup_move_account(struct page *page,
				   unsigned int nr_pages,
				   struct page_cgroup *pc,
				   struct mem_cgroup *from,
				   struct mem_cgroup *to)
{
    	unsigned long flags;
    	int ret;
    
    	bool anon = PageAnon(page);
    
    	VM_BUG_ON(from == to);
    
    	VM_BUG_ON(PageLRU(page));
    
    	/*
    	 * The page is isolated from LRU. So, collapse function
    	 * will not handle this page. But page splitting can happen.
    	 * Do this check under compound_page_lock(). The caller should
    	 * hold it.
    	 */
	ret = -EBUSY;
	if (nr_pages > 1 && !PageTransHuge(page))
		goto out;
    
    	lock_page_cgroup(pc);
    
    	ret = -EINVAL;
    	if (!PageCgroupUsed(pc) || pc->mem_cgroup != from)
    		goto unlock;
    
    
    	move_lock_mem_cgroup(from, &flags);
    
    	if (!anon && page_mapped(page))
    		mem_cgroup_move_account_page_stat(from, to, nr_pages,
    			MEM_CGROUP_STAT_FILE_MAPPED);
    
    	if (PageWriteback(page))
    		mem_cgroup_move_account_page_stat(from, to, nr_pages,
    			MEM_CGROUP_STAT_WRITEBACK);
    
    
	mem_cgroup_charge_statistics(from, page, anon, -nr_pages);

	/* caller should have done css_get */
	pc->mem_cgroup = to;
	mem_cgroup_charge_statistics(to, page, anon, nr_pages);
	move_unlock_mem_cgroup(from, &flags);
	ret = 0;
unlock:
	unlock_page_cgroup(pc);
	memcg_check_events(to, page);
	memcg_check_events(from, page);
out:
	return ret;
}

    /**
     * mem_cgroup_move_parent - moves page to the parent group
     * @page: the page to move
     * @pc: page_cgroup of the page
     * @child: page's cgroup
     *
     * move charges to its parent or the root cgroup if the group has no
     * parent (aka use_hierarchy==0).
     * Although this might fail (get_page_unless_zero, isolate_lru_page or
     * mem_cgroup_move_account fails) the failure is always temporary and
     * it signals a race with a page removal/uncharge or migration. In the
     * first case the page is on the way out and it will vanish from the LRU
     * on the next attempt and the call should be retried later.
     * Isolation from the LRU fails only if page has been isolated from
     * the LRU since we looked at it and that usually means either global
     * reclaim or migration going on. The page will either get back to the
     * LRU or vanish.
 * Finally mem_cgroup_move_account fails only if the page got uncharged
 * (!PageCgroupUsed) or moved to a different group. The page will
 * disappear in the next attempt.
 */
    static int mem_cgroup_move_parent(struct page *page,
    				  struct page_cgroup *pc,
    
    				  struct mem_cgroup *child)
    
{
	struct mem_cgroup *parent;
	unsigned int nr_pages;
	int ret;
	unsigned long uninitialized_var(flags);
    
    	VM_BUG_ON(mem_cgroup_is_root(child));
    
    	ret = -EBUSY;
    	if (!get_page_unless_zero(page))
    		goto out;
    	if (isolate_lru_page(page))
    		goto put;
    
    	nr_pages = hpage_nr_pages(page);
    
    	parent = parent_mem_cgroup(child);
    	/*
    	 * If no parent, move charges to root cgroup.
    	 */
    	if (!parent)
    		parent = root_mem_cgroup;
    
	if (nr_pages > 1) {
		VM_BUG_ON(!PageTransHuge(page));
		flags = compound_lock_irqsave(page);
	}

	ret = mem_cgroup_move_account(page, nr_pages,
				pc, child, parent);
	if (!ret)
		__mem_cgroup_cancel_local_charge(child, nr_pages);

	if (nr_pages > 1)
		compound_unlock_irqrestore(page, flags);
	putback_lru_page(page);
put:
	put_page(page);
out:
	return ret;
}
    
    /*
     * Charge the memory controller for page usage.
     * Return
     * 0 if the charge was successful
     * < 0 if the cgroup is over its limit
     */
    static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
    
    				gfp_t gfp_mask, enum charge_type ctype)
    
    	struct mem_cgroup *memcg = NULL;
    
    	bool oom = true;
    
    	if (PageTransHuge(page)) {
    
    		nr_pages <<= compound_order(page);
    
    		VM_BUG_ON(!PageTransHuge(page));
    
    		/*
    		 * Never OOM-kill a process for a huge page.  The
    		 * fault handler will fall back to regular pages.
    		 */
		oom = false;
	}

	ret = __mem_cgroup_try_charge(mm, gfp_mask, nr_pages, &memcg, oom);
	if (ret == -ENOMEM)
		return ret;
	__mem_cgroup_commit_charge(memcg, page, nr_pages, ctype, false);
	return 0;
}

int mem_cgroup_newpage_charge(struct page *page,
			      struct mm_struct *mm, gfp_t gfp_mask)
{
	if (mem_cgroup_disabled())
		return 0;
	VM_BUG_ON(page_mapped(page));
	VM_BUG_ON(page->mapping && !PageAnon(page));
	VM_BUG_ON(!mm);
	return mem_cgroup_charge_common(page, mm, gfp_mask,
					MEM_CGROUP_CHARGE_TYPE_ANON);
}

    /*
     * While swap-in, try_charge -> commit or cancel, the page is locked.
 * And when try_charge() successfully returns, one refcnt to memcg without
 * struct page_cgroup is acquired. This refcnt will be consumed by
 * "commit()" or removed by "cancel()"
     */