Newer
Older
/*
* Migration does not charge the res_counter for the
* replacement page, so leave it alone when phasing out the
* page that is unused after the migration.
*/
if (!end_migration && !mem_cgroup_is_root(memcg))
mem_cgroup_do_uncharge(memcg, nr_pages, ctype);

KAMEZAWA Hiroyuki
committed
unlock_out:
unlock_page_cgroup(pc);
void mem_cgroup_uncharge_page(struct page *page)
{
/* early check. */
if (page_mapped(page))
return;
VM_BUG_ON(page->mapping && !PageAnon(page));
/*
* If the page is in swap cache, uncharge should be deferred
* to the swap path, which also properly accounts swap usage
* and handles memcg lifetime.
*
* Note that this check is not stable and reclaim may add the
* page to swap cache at any time after this. However, if the
* page is not in swap cache by the time page->mapcount hits
* 0, there won't be any page table references to the swap
* slot, and reclaim will free it and not actually write the
* page to disk.
*/
if (PageSwapCache(page))
return;
__mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_ANON, false);
}
void mem_cgroup_uncharge_cache_page(struct page *page)
{
VM_BUG_ON(page_mapped(page));
VM_BUG_ON(page->mapping);
__mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE, false);
/*
* Batch_start/batch_end is called in unmap_page_range/invlidate/trucate.
* In that cases, pages are freed continuously and we can expect pages
* are in the same memcg. All these calls itself limits the number of
* pages freed at once, then uncharge_start/end() is called properly.
* This may be called prural(2) times in a context,
*/
void mem_cgroup_uncharge_start(void)
{
current->memcg_batch.do_batch++;
/* We can do nest. */
if (current->memcg_batch.do_batch == 1) {
current->memcg_batch.memcg = NULL;
current->memcg_batch.nr_pages = 0;
current->memcg_batch.memsw_nr_pages = 0;
}
}
void mem_cgroup_uncharge_end(void)
{
struct memcg_batch_info *batch = ¤t->memcg_batch;
if (!batch->do_batch)
return;
batch->do_batch--;
if (batch->do_batch) /* If stacked, do nothing. */
return;
if (!batch->memcg)
return;
/*
* This "batch->memcg" is valid without any css_get/put etc...
* bacause we hide charges behind us.
*/
if (batch->nr_pages)
res_counter_uncharge(&batch->memcg->res,
batch->nr_pages * PAGE_SIZE);
if (batch->memsw_nr_pages)
res_counter_uncharge(&batch->memcg->memsw,
batch->memsw_nr_pages * PAGE_SIZE);
memcg_oom_recover(batch->memcg);
/* forget this pointer (for sanity check) */
batch->memcg = NULL;
}
#ifdef CONFIG_SWAP
* called after __delete_from_swap_cache() and drop "page" account.
* memcg information is recorded to swap_cgroup of "ent"
*/
void
mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout)
int ctype = MEM_CGROUP_CHARGE_TYPE_SWAPOUT;
if (!swapout) /* this was a swap cache but the swap is unused ! */
ctype = MEM_CGROUP_CHARGE_TYPE_DROP;
memcg = __mem_cgroup_uncharge_common(page, ctype, false);
/*
* record memcg information, if swapout && memcg != NULL,
*/
if (do_swap_account && swapout && memcg)
swap_cgroup_record(ent, css_id(&memcg->css));
#endif
/*
* called from swap_entry_free(). remove record in swap_cgroup and
* uncharge "memsw" account.
*/
void mem_cgroup_uncharge_swap(swp_entry_t ent)
unsigned short id;
if (!do_swap_account)
return;
id = swap_cgroup_record(ent, 0);
rcu_read_lock();
memcg = mem_cgroup_lookup(id);
/*
* We uncharge this because swap is freed.
* This memcg can be obsolete one. We avoid calling css_tryget
*/
if (!mem_cgroup_is_root(memcg))

KAMEZAWA Hiroyuki
committed
res_counter_uncharge(&memcg->memsw, PAGE_SIZE);
mem_cgroup_swap_statistics(memcg, false);
rcu_read_unlock();
/**
* mem_cgroup_move_swap_account - move swap charge and swap_cgroup's record.
* @entry: swap entry to be moved
* @from: mem_cgroup which the entry is moved from
* @to: mem_cgroup which the entry is moved to
*
* It succeeds only when the swap_cgroup's record for this entry is the same
* as the mem_cgroup's id of @from.
*
* Returns 0 on success, -EINVAL on failure.
*
* The caller must have charged to @to, IOW, called res_counter_charge() about
* both res and memsw, and called css_get().
*/
static int mem_cgroup_move_swap_account(swp_entry_t entry,
struct mem_cgroup *from, struct mem_cgroup *to)
{
unsigned short old_id, new_id;
old_id = css_id(&from->css);
new_id = css_id(&to->css);
if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) {
mem_cgroup_swap_statistics(from, false);
mem_cgroup_swap_statistics(to, true);
* This function is only called from task migration context now.
* It postpones res_counter and refcount handling till the end
* of task migration(mem_cgroup_clear_mc()) for performance
* improvement. But we cannot postpone css_get(to) because if
* the process that has been moved to @to does swap-in, the
* refcount of @to might be decreased to 0.
*
* We are in attach() phase, so the cgroup is guaranteed to be
* alive, so we can just call css_get().
return 0;
}
return -EINVAL;
}
#else
static inline int mem_cgroup_move_swap_account(swp_entry_t entry,
struct mem_cgroup *from, struct mem_cgroup *to)
{
return -EINVAL;
}

KAMEZAWA Hiroyuki
committed
/*
* Before starting migration, account PAGE_SIZE to mem_cgroup that the old
* page belongs to.

KAMEZAWA Hiroyuki
committed
*/
void mem_cgroup_prepare_migration(struct page *page, struct page *newpage,
struct mem_cgroup **memcgp)

KAMEZAWA Hiroyuki
committed
{
unsigned int nr_pages = 1;
struct page_cgroup *pc;
enum charge_type ctype;
*memcgp = NULL;
return;
if (PageTransHuge(page))
nr_pages <<= compound_order(page);
pc = lookup_page_cgroup(page);
lock_page_cgroup(pc);
if (PageCgroupUsed(pc)) {
memcg = pc->mem_cgroup;
css_get(&memcg->css);
4222
4223
4224
4225
4226
4227
4228
4229
4230
4231
4232
4233
4234
4235
4236
4237
4238
4239
4240
4241
4242
4243
4244
4245
4246
4247
4248
4249
4250
4251
4252
/*
* At migrating an anonymous page, its mapcount goes down
* to 0 and uncharge() will be called. But, even if it's fully
* unmapped, migration may fail and this page has to be
* charged again. We set MIGRATION flag here and delay uncharge
* until end_migration() is called
*
* Corner Case Thinking
* A)
* When the old page was mapped as Anon and it's unmap-and-freed
* while migration was ongoing.
* If unmap finds the old page, uncharge() of it will be delayed
* until end_migration(). If unmap finds a new page, it's
* uncharged when it make mapcount to be 1->0. If unmap code
* finds swap_migration_entry, the new page will not be mapped
* and end_migration() will find it(mapcount==0).
*
* B)
* When the old page was mapped but migraion fails, the kernel
* remaps it. A charge for it is kept by MIGRATION flag even
* if mapcount goes down to 0. We can do remap successfully
* without charging it again.
*
* C)
* The "old" page is under lock_page() until the end of
* migration, so, the old page itself will not be swapped-out.
* If the new page is swapped out before end_migraton, our
* hook to usual swap-out path will catch the event.
*/
if (PageAnon(page))
SetPageCgroupMigration(pc);
/*
* If the page is not charged at this point,
* we return here.
*/
return;
*memcgp = memcg;
/*
* We charge new page before it's used/mapped. So, even if unlock_page()
* is called before end_migration, we can catch all events on this new
* page. In the case new page is migrated but not remapped, new page's
* mapcount will be finally 0 and we call uncharge in end_migration().
*/
if (PageAnon(page))

Kamezawa Hiroyuki
committed
ctype = MEM_CGROUP_CHARGE_TYPE_ANON;
ctype = MEM_CGROUP_CHARGE_TYPE_CACHE;
/*
* The page is committed to the memcg, but it's not actually
* charged to the res_counter since we plan on replacing the
* old one and only one page is going to be left afterwards.
*/
__mem_cgroup_commit_charge(memcg, newpage, nr_pages, ctype, false);

KAMEZAWA Hiroyuki
committed
}
/* remove redundant charge if migration failed*/
void mem_cgroup_end_migration(struct mem_cgroup *memcg,
struct page *oldpage, struct page *newpage, bool migration_ok)

KAMEZAWA Hiroyuki
committed
{
struct page *used, *unused;

Tejun Heo
committed
if (!migration_ok) {
used = oldpage;
unused = newpage;
used = newpage;
anon = PageAnon(used);
__mem_cgroup_uncharge_common(unused,
anon ? MEM_CGROUP_CHARGE_TYPE_ANON
: MEM_CGROUP_CHARGE_TYPE_CACHE,
true);
css_put(&memcg->css);
* We disallowed uncharge of pages under migration because mapcount
* of the page goes down to zero, temporarly.
* Clear the flag and check the page should be charged.
pc = lookup_page_cgroup(oldpage);
lock_page_cgroup(pc);
ClearPageCgroupMigration(pc);
unlock_page_cgroup(pc);
* If a page is a file cache, radix-tree replacement is very atomic
* and we can skip this check. When it was an Anon page, its mapcount
* goes down to 0. But because we added MIGRATION flage, it's not
* uncharged yet. There are several case but page->mapcount check
* and USED bit check in mem_cgroup_uncharge_page() will do enough
* check. (see prepare_charge() also)
mem_cgroup_uncharge_page(used);

KAMEZAWA Hiroyuki
committed
}
/*
* At replace page cache, newpage is not under any memcg but it's on
* LRU. So, this function doesn't touch res_counter but handles LRU
* in correct way. Both pages are locked so we cannot race with uncharge.
*/
void mem_cgroup_replace_page_cache(struct page *oldpage,
struct page *newpage)
{
struct mem_cgroup *memcg = NULL;
struct page_cgroup *pc;
enum charge_type type = MEM_CGROUP_CHARGE_TYPE_CACHE;
if (mem_cgroup_disabled())
return;
pc = lookup_page_cgroup(oldpage);
/* fix accounting on old pages */
lock_page_cgroup(pc);
if (PageCgroupUsed(pc)) {
memcg = pc->mem_cgroup;
mem_cgroup_charge_statistics(memcg, oldpage, false, -1);
ClearPageCgroupUsed(pc);
}
unlock_page_cgroup(pc);
/*
* When called from shmem_replace_page(), in some cases the
* oldpage has already been charged, and in some cases not.
*/
if (!memcg)
return;
/*
* Even if newpage->mapping was NULL before starting replacement,
* the newpage may be on LRU(or pagevec for LRU) already. We lock
* LRU while we overwrite pc->mem_cgroup.
*/
__mem_cgroup_commit_charge(memcg, newpage, 1, type, true);
#ifdef CONFIG_DEBUG_VM
static struct page_cgroup *lookup_page_cgroup_used(struct page *page)
{
struct page_cgroup *pc;
pc = lookup_page_cgroup(page);
/*
* Can be NULL while feeding pages into the page allocator for
* the first time, i.e. during boot or memory hotplug;
* or when mem_cgroup_disabled().
*/
if (likely(pc) && PageCgroupUsed(pc))
return pc;
return NULL;
}
bool mem_cgroup_bad_page_check(struct page *page)
{
if (mem_cgroup_disabled())
return false;
return lookup_page_cgroup_used(page) != NULL;
}
void mem_cgroup_print_bad_page(struct page *page)
{
struct page_cgroup *pc;
pc = lookup_page_cgroup_used(page);
if (pc) {
pr_alert("pc:%p pc->flags:%lx pc->mem_cgroup:%p\n",
pc, pc->flags, pc->mem_cgroup);
}
}
#endif
static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
int retry_count;
int children = mem_cgroup_count_children(memcg);
u64 curusage, oldusage;
/*
* For keeping hierarchical_reclaim simple, how long we should retry
* is depends on callers. We set our retry-count to be function
* of # of children which we should visit in this loop.
*/
retry_count = MEM_CGROUP_RECLAIM_RETRIES * children;
oldusage = res_counter_read_u64(&memcg->res, RES_USAGE);
if (signal_pending(current)) {
ret = -EINTR;
break;
}
/*
* Rather than hide all in some function, I do this in
* open coded manner. You see what this really does.

Wanpeng Li
committed
* We have to guarantee memcg->res.limit <= memcg->memsw.limit.
*/
mutex_lock(&set_limit_mutex);
memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
if (memswlimit < val) {
ret = -EINVAL;
mutex_unlock(&set_limit_mutex);
memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT);
if (memlimit < val)
enlarge = 1;
ret = res_counter_set_limit(&memcg->res, val);
if (!ret) {
if (memswlimit == val)
memcg->memsw_is_minimum = true;
else
memcg->memsw_is_minimum = false;
}
mutex_unlock(&set_limit_mutex);
if (!ret)
break;
mem_cgroup_reclaim(memcg, GFP_KERNEL,
MEM_CGROUP_RECLAIM_SHRINK);
curusage = res_counter_read_u64(&memcg->res, RES_USAGE);
/* Usage is reduced ? */
retry_count--;
else
oldusage = curusage;
if (!ret && enlarge)
memcg_oom_recover(memcg);
static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
unsigned long long val)
int retry_count;
u64 memlimit, memswlimit, oldusage, curusage;
int children = mem_cgroup_count_children(memcg);
int ret = -EBUSY;
/* see mem_cgroup_resize_res_limit */
retry_count = children * MEM_CGROUP_RECLAIM_RETRIES;
oldusage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
while (retry_count) {
if (signal_pending(current)) {
ret = -EINTR;
break;
}
/*
* Rather than hide all in some function, I do this in
* open coded manner. You see what this really does.

Wanpeng Li
committed
* We have to guarantee memcg->res.limit <= memcg->memsw.limit.
*/
mutex_lock(&set_limit_mutex);
memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT);
if (memlimit > val) {
ret = -EINVAL;
mutex_unlock(&set_limit_mutex);
break;
}
memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
if (memswlimit < val)
enlarge = 1;
ret = res_counter_set_limit(&memcg->memsw, val);
if (!ret) {
if (memlimit == val)
memcg->memsw_is_minimum = true;
else
memcg->memsw_is_minimum = false;
}
mutex_unlock(&set_limit_mutex);
if (!ret)
break;
mem_cgroup_reclaim(memcg, GFP_KERNEL,
MEM_CGROUP_RECLAIM_NOSWAP |
MEM_CGROUP_RECLAIM_SHRINK);
curusage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
/* Usage is reduced ? */
else
oldusage = curusage;
if (!ret && enlarge)
memcg_oom_recover(memcg);
/**
* mem_cgroup_force_empty_list - clears LRU of a group
* @memcg: group to clear
* @node: NUMA node
* @zid: zone id
* @lru: lru to to clear
*
* Traverse a specified page_cgroup list and try to drop them all. This doesn't
* reclaim the pages page themselves - pages are moved to the parent (or root)
* group.

KAMEZAWA Hiroyuki
committed
*/
static void mem_cgroup_force_empty_list(struct mem_cgroup *memcg,

KAMEZAWA Hiroyuki
committed
{
unsigned long flags;

KAMEZAWA Hiroyuki
committed
struct list_head *list;
struct page *busy;
struct zone *zone;

KAMEZAWA Hiroyuki
committed
lruvec = mem_cgroup_zone_lruvec(zone, memcg);
list = &lruvec->lists[lru];

KAMEZAWA Hiroyuki
committed
struct page *page;
if (list_empty(list)) {
spin_unlock_irqrestore(&zone->lru_lock, flags);
page = list_entry(list->prev, struct page, lru);
if (busy == page) {
list_move(&page->lru, list);
busy = NULL;
spin_unlock_irqrestore(&zone->lru_lock, flags);
spin_unlock_irqrestore(&zone->lru_lock, flags);
pc = lookup_page_cgroup(page);
if (mem_cgroup_move_parent(page, pc, memcg)) {
/* found lock contention or "pc" is obsolete. */
cond_resched();
} else
busy = NULL;
} while (!list_empty(list));

KAMEZAWA Hiroyuki
committed
}
/*
* make mem_cgroup's charge to be 0 if there is no task by moving
* all the charges and pages to the parent.

KAMEZAWA Hiroyuki
committed
* This enables deleting this mem_cgroup.
*
* Caller is responsible for holding css reference on the memcg.

KAMEZAWA Hiroyuki
committed
*/
static void mem_cgroup_reparent_charges(struct mem_cgroup *memcg)

KAMEZAWA Hiroyuki
committed
{
int node, zid;
/* This is for making all *used* pages to be on LRU. */
lru_add_drain_all();
drain_all_stock_sync(memcg);
mem_cgroup_start_move(memcg);
for_each_node_state(node, N_MEMORY) {
for (zid = 0; zid < MAX_NR_ZONES; zid++) {
mem_cgroup_force_empty_list(memcg,

KAMEZAWA Hiroyuki
committed
}
mem_cgroup_end_move(memcg);
memcg_oom_recover(memcg);
* Kernel memory may not necessarily be trackable to a specific
* process. So they are not migrated, and therefore we can't
* expect their value to drop to 0 here.
* Having res filled up with kmem only is enough.
*
* This is a safety check because mem_cgroup_force_empty_list
* could have raced with mem_cgroup_replace_page_cache callers
* so the lru seemed empty but the page could have been added
* right after the check. RES_USAGE should be safe as we always
* charge before adding to the LRU.
*/
usage = res_counter_read_u64(&memcg->res, RES_USAGE) -
res_counter_read_u64(&memcg->kmem, RES_USAGE);
} while (usage > 0);
}
/*
* This mainly exists for tests during the setting of set of use_hierarchy.
* Since this is the very setting we are changing, the current hierarchy value
* is meaningless
*/
static inline bool __memcg_has_children(struct mem_cgroup *memcg)
{

Tejun Heo
committed
struct cgroup_subsys_state *pos;
/* bounce at first found */

Tejun Heo
committed
css_for_each_child(pos, &memcg->css)
return true;
return false;
}
/*
* Must be called with memcg_create_mutex held, unless the cgroup is guaranteed
* to be already dead (as in mem_cgroup_force_empty, for instance). This is
* from mem_cgroup_count_children(), in the sense that we don't really care how
* many children we have; we only need to know if we have any. It also counts
* any memcg without hierarchy as infertile.
*/
static inline bool memcg_has_children(struct mem_cgroup *memcg)
{
return memcg->use_hierarchy && __memcg_has_children(memcg);
}
/*
* Reclaims as many pages from the given memcg as possible and moves
* the rest to the parent.
*
* Caller is responsible for holding css reference for memcg.
*/
static int mem_cgroup_force_empty(struct mem_cgroup *memcg)
{
int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
struct cgroup *cgrp = memcg->css.cgroup;
/* returns EBUSY if there is a task or if we come here twice. */
if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children))
return -EBUSY;
/* we call try-to-free pages for make this cgroup empty */
lru_add_drain_all();
/* try to free all pages in this cgroup */
while (nr_retries && res_counter_read_u64(&memcg->res, RES_USAGE) > 0) {
if (signal_pending(current))
return -EINTR;
progress = try_to_free_mem_cgroup_pages(memcg, GFP_KERNEL,
/* maybe some writeback is necessary */
congestion_wait(BLK_RW_ASYNC, HZ/10);
mem_cgroup_reparent_charges(memcg);
return 0;

KAMEZAWA Hiroyuki
committed
}
static int mem_cgroup_force_empty_write(struct cgroup_subsys_state *css,
unsigned int event)
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
if (mem_cgroup_is_root(memcg))
return -EINVAL;
return mem_cgroup_force_empty(memcg);
static u64 mem_cgroup_hierarchy_read(struct cgroup_subsys_state *css,
struct cftype *cft)
return mem_cgroup_from_css(css)->use_hierarchy;
static int mem_cgroup_hierarchy_write(struct cgroup_subsys_state *css,
struct cftype *cft, u64 val)
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
struct mem_cgroup *parent_memcg = mem_cgroup_from_css(css_parent(&memcg->css));
mutex_lock(&memcg_create_mutex);
if (memcg->use_hierarchy == val)
goto out;
* If parent's use_hierarchy is set, we can't make any modifications
* in the child subtrees. If it is unset, then the change can
* occur, provided the current cgroup has no children.
*
* For the root cgroup, parent_mem is NULL, we allow value to be
* set if there are no children.
*/
if ((!parent_memcg || !parent_memcg->use_hierarchy) &&
(val == 1 || val == 0)) {
if (!__memcg_has_children(memcg))
else
retval = -EBUSY;
} else
retval = -EINVAL;
mutex_unlock(&memcg_create_mutex);
return retval;
}
static unsigned long mem_cgroup_recursive_stat(struct mem_cgroup *memcg,
enum mem_cgroup_stat_index idx)
/* Per-cpu values can be negative, use a signed accumulator */
for_each_mem_cgroup_tree(iter, memcg)
val += mem_cgroup_read_stat(iter, idx);
if (val < 0) /* race ? */
val = 0;
return val;
static inline u64 mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
if (!mem_cgroup_is_root(memcg)) {
if (!swap)

Glauber Costa
committed
return res_counter_read_u64(&memcg->res, RES_USAGE);

Glauber Costa
committed
return res_counter_read_u64(&memcg->memsw, RES_USAGE);
/*
* Transparent hugepages are still accounted for in MEM_CGROUP_STAT_RSS
* as well as in MEM_CGROUP_STAT_RSS_HUGE.
*/
val = mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_CACHE);
val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_RSS);
val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_SWAP);
return val << PAGE_SHIFT;
}
static ssize_t mem_cgroup_read(struct cgroup_subsys_state *css,
struct cftype *cft, struct file *file,
char __user *buf, size_t nbytes, loff_t *ppos)
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
char str[64];
int name, len;
enum res_type type;
type = MEMFILE_TYPE(cft->private);
name = MEMFILE_ATTR(cft->private);
if (name == RES_USAGE)
val = mem_cgroup_usage(memcg, false);
val = res_counter_read_u64(&memcg->res, name);
if (name == RES_USAGE)
val = mem_cgroup_usage(memcg, true);
val = res_counter_read_u64(&memcg->memsw, name);
case _KMEM:
val = res_counter_read_u64(&memcg->kmem, name);
break;
len = scnprintf(str, sizeof(str), "%llu\n", (unsigned long long)val);
return simple_read_from_buffer(buf, nbytes, ppos, str, len);
static int memcg_update_kmem_limit(struct cgroup_subsys_state *css, u64 val)
{
int ret = -EINVAL;
#ifdef CONFIG_MEMCG_KMEM
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
/*
* For simplicity, we won't allow this to be disabled. It also can't
* be changed if the cgroup has children already, or if tasks had
* already joined.
*
* If tasks join before we set the limit, a person looking at
* kmem.usage_in_bytes will have no way to determine when it took
* place, which makes the value quite meaningless.
*
* After it first became limited, changes in the value of the limit are
* of course permitted.
*/
mutex_lock(&memcg_create_mutex);
mutex_lock(&set_limit_mutex);
if (!memcg->kmem_account_flags && val != RES_COUNTER_MAX) {
if (cgroup_task_count(css->cgroup) || memcg_has_children(memcg)) {
ret = -EBUSY;
goto out;
}
ret = res_counter_set_limit(&memcg->kmem, val);
VM_BUG_ON(ret);
ret = memcg_update_cache_sizes(memcg);
if (ret) {
res_counter_set_limit(&memcg->kmem, RES_COUNTER_MAX);
goto out;
}
static_key_slow_inc(&memcg_kmem_enabled_key);
/*
* setting the active bit after the inc will guarantee no one
* starts accounting before all call sites are patched
*/
memcg_kmem_set_active(memcg);
} else
ret = res_counter_set_limit(&memcg->kmem, val);
out:
mutex_unlock(&set_limit_mutex);
mutex_unlock(&memcg_create_mutex);
#endif
return ret;
}
static int memcg_propagate_kmem(struct mem_cgroup *memcg)
int ret = 0;
struct mem_cgroup *parent = parent_mem_cgroup(memcg);
if (!parent)
goto out;
memcg->kmem_account_flags = parent->kmem_account_flags;
/*
* When that happen, we need to disable the static branch only on those
* memcgs that enabled it. To achieve this, we would be forced to
* complicate the code by keeping track of which memcgs were the ones
* that actually enabled limits, and which ones got it from its
* parents.
*
* It is a lot simpler just to do static_key_slow_inc() on every child
* that is accounted.
*/
if (!memcg_kmem_is_active(memcg))
goto out;
/*
* __mem_cgroup_free() will issue static_key_slow_dec() because this
* memcg is active already. If the later initialization fails then the
* cgroup core triggers the cleanup so we do not have to do it here.
*/
static_key_slow_inc(&memcg_kmem_enabled_key);
mutex_lock(&set_limit_mutex);
memcg_stop_kmem_account();
ret = memcg_update_cache_sizes(memcg);
memcg_resume_kmem_account();
mutex_unlock(&set_limit_mutex);
out:
return ret;
#endif /* CONFIG_MEMCG_KMEM */
/*
* The user of this function is...
* RES_LIMIT.
*/
static int mem_cgroup_write(struct cgroup_subsys_state *css, struct cftype *cft,

Paul Menage
committed
const char *buffer)
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
unsigned long long val;
int ret;
type = MEMFILE_TYPE(cft->private);
name = MEMFILE_ATTR(cft->private);
if (mem_cgroup_is_root(memcg)) { /* Can't set limit on root */
ret = -EINVAL;
break;
}
/* This function does all necessary parse...reuse it */
ret = res_counter_memparse_write_strategy(buffer, &val);
if (ret)
break;
if (type == _MEM)
ret = mem_cgroup_resize_limit(memcg, val);
ret = mem_cgroup_resize_memsw_limit(memcg, val);
ret = memcg_update_kmem_limit(css, val);
case RES_SOFT_LIMIT:
ret = res_counter_memparse_write_strategy(buffer, &val);
if (ret)
break;
/*
* For memsw, soft limits are hard to implement in terms
* of semantics, for now, we support soft limits for
* control without swap
*/
if (type == _MEM)
ret = res_counter_set_soft_limit(&memcg->res, val);
else
ret = -EINVAL;
break;
default:
ret = -EINVAL; /* should be BUG() ? */
break;
}
return ret;
static void memcg_get_hierarchical_limit(struct mem_cgroup *memcg,
unsigned long long *mem_limit, unsigned long long *memsw_limit)
{
unsigned long long min_limit, min_memsw_limit, tmp;
min_limit = res_counter_read_u64(&memcg->res, RES_LIMIT);
min_memsw_limit = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
if (!memcg->use_hierarchy)
goto out;
while (css_parent(&memcg->css)) {
memcg = mem_cgroup_from_css(css_parent(&memcg->css));
if (!memcg->use_hierarchy)
break;
tmp = res_counter_read_u64(&memcg->res, RES_LIMIT);
min_limit = min(min_limit, tmp);
tmp = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
min_memsw_limit = min(min_memsw_limit, tmp);
}
out:
*mem_limit = min_limit;
*memsw_limit = min_memsw_limit;
}
static int mem_cgroup_reset(struct cgroup_subsys_state *css, unsigned int event)
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
type = MEMFILE_TYPE(event);
name = MEMFILE_ATTR(event);