Newer
Older
ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, &mem, false,
page);

KAMEZAWA Hiroyuki
committed
}
/* remove redundant charge if migration failed*/
void mem_cgroup_end_migration(struct mem_cgroup *mem,
struct page *oldpage, struct page *newpage)

KAMEZAWA Hiroyuki
committed
{
struct page *target, *unused;
struct page_cgroup *pc;
enum charge_type ctype;
if (!mem)
return;
cgroup_exclude_rmdir(&mem->css);
/* at migration success, oldpage->mapping is NULL. */
if (oldpage->mapping) {
target = oldpage;
unused = NULL;
} else {
target = newpage;
unused = oldpage;
}
if (PageAnon(target))
ctype = MEM_CGROUP_CHARGE_TYPE_MAPPED;
else if (page_is_file_cache(target))
ctype = MEM_CGROUP_CHARGE_TYPE_CACHE;
else
ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM;
/* unused page is not on radix-tree now. */
__mem_cgroup_uncharge_common(unused, ctype);
pc = lookup_page_cgroup(target);
* __mem_cgroup_commit_charge() check PCG_USED bit of page_cgroup.
* So, double-counting is effectively avoided.
*/
__mem_cgroup_commit_charge(mem, pc, ctype);
/*
* Both of oldpage and newpage are still under lock_page().
* Then, we don't have to care about race in radix-tree.
* But we have to be careful that this page is unmapped or not.
*
* There is a case for !page_mapped(). At the start of
* migration, oldpage was mapped. But now, it's zapped.
* But we know *target* page is not freed/reused under us.
* mem_cgroup_uncharge_page() does all necessary checks.
if (ctype == MEM_CGROUP_CHARGE_TYPE_MAPPED)
mem_cgroup_uncharge_page(target);
/*
* At migration, we may charge account against cgroup which has no tasks
* So, rmdir()->pre_destroy() can be called while we do this charge.
* In that case, we need to call pre_destroy() again. check it here.
*/
cgroup_release_and_wakeup_rmdir(&mem->css);

KAMEZAWA Hiroyuki
committed
}
* A call to try to shrink memory usage on charge failure at shmem's swapin.
* Calling hierarchical_reclaim is not enough because we should update
* last_oom_jiffies to prevent pagefault_out_of_memory from invoking global OOM.
* Moreover considering hierarchy, we should reclaim from the mem_over_limit,
* not from the memcg which this page would be charged to.
* try_charge_swapin does all of these works properly.
int mem_cgroup_shmem_charge_fallback(struct page *page,
struct mm_struct *mm,
gfp_t gfp_mask)
ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &mem);
if (!ret)
mem_cgroup_cancel_charge_swapin(mem); /* it does !mem check */
static DEFINE_MUTEX(set_limit_mutex);
static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
int retry_count;
int children = mem_cgroup_count_children(memcg);
u64 curusage, oldusage;
/*
* For keeping hierarchical_reclaim simple, how long we should retry
* is depends on callers. We set our retry-count to be function
* of # of children which we should visit in this loop.
*/
retry_count = MEM_CGROUP_RECLAIM_RETRIES * children;
oldusage = res_counter_read_u64(&memcg->res, RES_USAGE);
if (signal_pending(current)) {
ret = -EINTR;
break;
}
/*
* Rather than hide all in some function, I do this in
* open coded manner. You see what this really does.
* We have to guarantee mem->res.limit < mem->memsw.limit.
*/
mutex_lock(&set_limit_mutex);
memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
if (memswlimit < val) {
ret = -EINVAL;
mutex_unlock(&set_limit_mutex);
ret = res_counter_set_limit(&memcg->res, val);
if (!ret) {
if (memswlimit == val)
memcg->memsw_is_minimum = true;
else
memcg->memsw_is_minimum = false;
}
mutex_unlock(&set_limit_mutex);
if (!ret)
break;
progress = mem_cgroup_hierarchical_reclaim(memcg, NULL,
GFP_KERNEL,
MEM_CGROUP_RECLAIM_SHRINK);
curusage = res_counter_read_u64(&memcg->res, RES_USAGE);
/* Usage is reduced ? */
if (curusage >= oldusage)
retry_count--;
else
oldusage = curusage;
static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
unsigned long long val)
int retry_count;
int children = mem_cgroup_count_children(memcg);
int ret = -EBUSY;
/* see mem_cgroup_resize_res_limit */
retry_count = children * MEM_CGROUP_RECLAIM_RETRIES;
oldusage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
while (retry_count) {
if (signal_pending(current)) {
ret = -EINTR;
break;
}
/*
* Rather than hide all in some function, I do this in
* open coded manner. You see what this really does.
* We have to guarantee mem->res.limit < mem->memsw.limit.
*/
mutex_lock(&set_limit_mutex);
memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT);
if (memlimit > val) {
ret = -EINVAL;
mutex_unlock(&set_limit_mutex);
break;
}
ret = res_counter_set_limit(&memcg->memsw, val);
if (!ret) {
if (memlimit == val)
memcg->memsw_is_minimum = true;
else
memcg->memsw_is_minimum = false;
}
mutex_unlock(&set_limit_mutex);
if (!ret)
break;
mem_cgroup_hierarchical_reclaim(memcg, NULL, GFP_KERNEL,
MEM_CGROUP_RECLAIM_NOSWAP |
MEM_CGROUP_RECLAIM_SHRINK);
curusage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
/* Usage is reduced ? */
else
oldusage = curusage;
unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
gfp_t gfp_mask, int nid,
int zid)
{
unsigned long nr_reclaimed = 0;
struct mem_cgroup_per_zone *mz, *next_mz = NULL;
unsigned long reclaimed;
int loop = 0;
struct mem_cgroup_tree_per_zone *mctz;
unsigned long long excess;
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
if (order > 0)
return 0;
mctz = soft_limit_tree_node_zone(nid, zid);
/*
* This loop can run a while, specially if mem_cgroup's continuously
* keep exceeding their soft limit and putting the system under
* pressure
*/
do {
if (next_mz)
mz = next_mz;
else
mz = mem_cgroup_largest_soft_limit_node(mctz);
if (!mz)
break;
reclaimed = mem_cgroup_hierarchical_reclaim(mz->mem, zone,
gfp_mask,
MEM_CGROUP_RECLAIM_SOFT);
nr_reclaimed += reclaimed;
spin_lock(&mctz->lock);
/*
* If we failed to reclaim anything from this memory cgroup
* it is time to move on to the next cgroup
*/
next_mz = NULL;
if (!reclaimed) {
do {
/*
* Loop until we find yet another one.
*
* By the time we get the soft_limit lock
* again, someone might have aded the
* group back on the RB tree. Iterate to
* make sure we get a different mem.
* mem_cgroup_largest_soft_limit_node returns
* NULL if no other cgroup is present on
* the tree
*/
next_mz =
__mem_cgroup_largest_soft_limit_node(mctz);
if (next_mz == mz) {
css_put(&next_mz->mem->css);
next_mz = NULL;
} else /* next_mz == NULL or other memcg */
break;
} while (1);
}
__mem_cgroup_remove_exceeded(mz->mem, mz, mctz);
excess = res_counter_soft_limit_excess(&mz->mem->res);
/*
* One school of thought says that we should not add
* back the node to the tree if reclaim returns 0.
* But our reclaim could return 0, simply because due
* to priority we are exposing a smaller subset of
* memory to reclaim from. Consider this as a longer
* term TODO.
*/
/* If excess == 0, no tree ops */
__mem_cgroup_insert_exceeded(mz->mem, mz, mctz, excess);
spin_unlock(&mctz->lock);
css_put(&mz->mem->css);
loop++;
/*
* Could not reclaim anything and there are no more
* mem cgroups to try or we seem to be looping without
* reclaiming anything.
*/
if (!nr_reclaimed &&
(next_mz == NULL ||
loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS))
break;
} while (!nr_reclaimed);
if (next_mz)
css_put(&next_mz->mem->css);
return nr_reclaimed;
}

KAMEZAWA Hiroyuki
committed
/*
* This routine traverse page_cgroup in given list and drop them all.
* *And* this routine doesn't reclaim page itself, just removes page_cgroup.
*/
static int mem_cgroup_force_empty_list(struct mem_cgroup *mem,

KAMEZAWA Hiroyuki
committed
{
struct zone *zone;
struct mem_cgroup_per_zone *mz;
struct page_cgroup *pc, *busy;

KAMEZAWA Hiroyuki
committed
struct list_head *list;

KAMEZAWA Hiroyuki
committed
zone = &NODE_DATA(node)->node_zones[zid];
mz = mem_cgroup_zoneinfo(mem, node, zid);
list = &mz->lists[lru];

KAMEZAWA Hiroyuki
committed
loop = MEM_CGROUP_ZSTAT(mz, lru);
/* give some margin against EBUSY etc...*/
loop += 256;
busy = NULL;
while (loop--) {
ret = 0;
if (list_empty(list)) {
spin_unlock_irqrestore(&zone->lru_lock, flags);
}
pc = list_entry(list->prev, struct page_cgroup, lru);
if (busy == pc) {
list_move(&pc->lru, list);
busy = 0;
spin_unlock_irqrestore(&zone->lru_lock, flags);
spin_unlock_irqrestore(&zone->lru_lock, flags);
ret = mem_cgroup_move_parent(pc, mem, GFP_KERNEL);
if (ret == -ENOMEM)
if (ret == -EBUSY || ret == -EINVAL) {
/* found lock contention or "pc" is obsolete. */
busy = pc;
cond_resched();
} else
busy = NULL;

KAMEZAWA Hiroyuki
committed
}
if (!ret && !list_empty(list))
return -EBUSY;
return ret;

KAMEZAWA Hiroyuki
committed
}
/*
* make mem_cgroup's charge to be 0 if there is no task.
* This enables deleting this mem_cgroup.
*/
static int mem_cgroup_force_empty(struct mem_cgroup *mem, bool free_all)

KAMEZAWA Hiroyuki
committed
{
int ret;
int node, zid, shrink;
int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
struct cgroup *cgrp = mem->css.cgroup;

KAMEZAWA Hiroyuki
committed
css_get(&mem->css);
/* should free all ? */
if (free_all)
goto try_to_free;

KAMEZAWA Hiroyuki
committed
while (mem->res.usage > 0) {
if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children))
goto out;
ret = -EINTR;
if (signal_pending(current))

KAMEZAWA Hiroyuki
committed
goto out;
/* This is for making all *used* pages to be on LRU. */
lru_add_drain_all();
for_each_node_state(node, N_HIGH_MEMORY) {
for (zid = 0; !ret && zid < MAX_NR_ZONES; zid++) {
for_each_lru(l) {
ret = mem_cgroup_force_empty_list(mem,
if (ret)
break;
}

KAMEZAWA Hiroyuki
committed
}
if (ret)
break;
}
/* it seems parent cgroup doesn't have enough mem */
if (ret == -ENOMEM)
goto try_to_free;

KAMEZAWA Hiroyuki
committed
}
ret = 0;
out:
css_put(&mem->css);
return ret;
/* returns EBUSY if there is a task or if we come here twice. */
if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children) || shrink) {
ret = -EBUSY;
goto out;
}
/* we call try-to-free pages for make this cgroup empty */
lru_add_drain_all();
/* try to free all pages in this cgroup */
shrink = 1;
while (nr_retries && mem->res.usage > 0) {
int progress;
if (signal_pending(current)) {
ret = -EINTR;
goto out;
}
progress = try_to_free_mem_cgroup_pages(mem, GFP_KERNEL,
false, get_swappiness(mem));
/* maybe some writeback is necessary */
congestion_wait(BLK_RW_ASYNC, HZ/10);
/* try move_account...there may be some *locked* pages. */
if (mem->res.usage)
goto move_account;
ret = 0;
goto out;

KAMEZAWA Hiroyuki
committed
}
int mem_cgroup_force_empty_write(struct cgroup *cont, unsigned int event)
{
return mem_cgroup_force_empty(mem_cgroup_from_cont(cont), true);
}
static u64 mem_cgroup_hierarchy_read(struct cgroup *cont, struct cftype *cft)
{
return mem_cgroup_from_cont(cont)->use_hierarchy;
}
static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft,
u64 val)
{
int retval = 0;
struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
struct cgroup *parent = cont->parent;
struct mem_cgroup *parent_mem = NULL;
if (parent)
parent_mem = mem_cgroup_from_cont(parent);
cgroup_lock();
/*
* If parent's use_hierarchy is set, we can't make any modifications
* in the child subtrees. If it is unset, then the change can
* occur, provided the current cgroup has no children.
*
* For the root cgroup, parent_mem is NULL, we allow value to be
* set if there are no children.
*/
if ((!parent_mem || !parent_mem->use_hierarchy) &&
(val == 1 || val == 0)) {
if (list_empty(&cont->children))
mem->use_hierarchy = val;
else
retval = -EBUSY;
} else
retval = -EINVAL;
cgroup_unlock();
return retval;
}
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
struct mem_cgroup_idx_data {
s64 val;
enum mem_cgroup_stat_index idx;
};
static int
mem_cgroup_get_idx_stat(struct mem_cgroup *mem, void *data)
{
struct mem_cgroup_idx_data *d = data;
d->val += mem_cgroup_read_stat(&mem->stat, d->idx);
return 0;
}
static void
mem_cgroup_get_recursive_idx_stat(struct mem_cgroup *mem,
enum mem_cgroup_stat_index idx, s64 *val)
{
struct mem_cgroup_idx_data d;
d.idx = idx;
d.val = 0;
mem_cgroup_walk_tree(mem, &d, mem_cgroup_get_idx_stat);
*val = d.val;
}
static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft)
struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
int type, name;
type = MEMFILE_TYPE(cft->private);
name = MEMFILE_ATTR(cft->private);
switch (type) {
case _MEM:
if (name == RES_USAGE && mem_cgroup_is_root(mem)) {
mem_cgroup_get_recursive_idx_stat(mem,
MEM_CGROUP_STAT_CACHE, &idx_val);
val = idx_val;
mem_cgroup_get_recursive_idx_stat(mem,
MEM_CGROUP_STAT_RSS, &idx_val);
val += idx_val;
val <<= PAGE_SHIFT;
} else
val = res_counter_read_u64(&mem->res, name);
if (name == RES_USAGE && mem_cgroup_is_root(mem)) {
mem_cgroup_get_recursive_idx_stat(mem,
MEM_CGROUP_STAT_CACHE, &idx_val);
val = idx_val;
mem_cgroup_get_recursive_idx_stat(mem,
MEM_CGROUP_STAT_RSS, &idx_val);
val += idx_val;
mem_cgroup_get_recursive_idx_stat(mem,
MEM_CGROUP_STAT_SWAPOUT, &idx_val);
val <<= PAGE_SHIFT;
} else
val = res_counter_read_u64(&mem->memsw, name);
break;
default:
BUG();
break;
}
return val;
/*
* The user of this function is...
* RES_LIMIT.
*/

Paul Menage
committed
static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft,
const char *buffer)
struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
unsigned long long val;
int ret;
type = MEMFILE_TYPE(cft->private);
name = MEMFILE_ATTR(cft->private);
switch (name) {
if (mem_cgroup_is_root(memcg)) { /* Can't set limit on root */
ret = -EINVAL;
break;
}
/* This function does all necessary parse...reuse it */
ret = res_counter_memparse_write_strategy(buffer, &val);
if (ret)
break;
if (type == _MEM)
ret = mem_cgroup_resize_limit(memcg, val);
else
ret = mem_cgroup_resize_memsw_limit(memcg, val);
case RES_SOFT_LIMIT:
ret = res_counter_memparse_write_strategy(buffer, &val);
if (ret)
break;
/*
* For memsw, soft limits are hard to implement in terms
* of semantics, for now, we support soft limits for
* control without swap
*/
if (type == _MEM)
ret = res_counter_set_soft_limit(&memcg->res, val);
else
ret = -EINVAL;
break;
default:
ret = -EINVAL; /* should be BUG() ? */
break;
}
return ret;
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
static void memcg_get_hierarchical_limit(struct mem_cgroup *memcg,
unsigned long long *mem_limit, unsigned long long *memsw_limit)
{
struct cgroup *cgroup;
unsigned long long min_limit, min_memsw_limit, tmp;
min_limit = res_counter_read_u64(&memcg->res, RES_LIMIT);
min_memsw_limit = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
cgroup = memcg->css.cgroup;
if (!memcg->use_hierarchy)
goto out;
while (cgroup->parent) {
cgroup = cgroup->parent;
memcg = mem_cgroup_from_cont(cgroup);
if (!memcg->use_hierarchy)
break;
tmp = res_counter_read_u64(&memcg->res, RES_LIMIT);
min_limit = min(min_limit, tmp);
tmp = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
min_memsw_limit = min(min_memsw_limit, tmp);
}
out:
*mem_limit = min_limit;
*memsw_limit = min_memsw_limit;
return;
}
static int mem_cgroup_reset(struct cgroup *cont, unsigned int event)
{
struct mem_cgroup *mem;
mem = mem_cgroup_from_cont(cont);
type = MEMFILE_TYPE(event);
name = MEMFILE_ATTR(event);
switch (name) {
if (type == _MEM)
res_counter_reset_max(&mem->res);
else
res_counter_reset_max(&mem->memsw);
if (type == _MEM)
res_counter_reset_failcnt(&mem->res);
else
res_counter_reset_failcnt(&mem->memsw);
/* For read statistics */
enum {
MCS_CACHE,
MCS_RSS,
MCS_INACTIVE_ANON,
MCS_ACTIVE_ANON,
MCS_INACTIVE_FILE,
MCS_ACTIVE_FILE,
MCS_UNEVICTABLE,
NR_MCS_STAT,
};
struct mcs_total_stat {
s64 stat[NR_MCS_STAT];
struct {
char *local_name;
char *total_name;
} memcg_stat_strings[NR_MCS_STAT] = {
{"cache", "total_cache"},
{"rss", "total_rss"},
{"mapped_file", "total_mapped_file"},
{"pgpgin", "total_pgpgin"},
{"pgpgout", "total_pgpgout"},
{"inactive_anon", "total_inactive_anon"},
{"active_anon", "total_active_anon"},
{"inactive_file", "total_inactive_file"},
{"active_file", "total_active_file"},
{"unevictable", "total_unevictable"}
};
static int mem_cgroup_get_local_stat(struct mem_cgroup *mem, void *data)
{
struct mcs_total_stat *s = data;
s64 val;
/* per cpu stat */
val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_CACHE);
s->stat[MCS_CACHE] += val * PAGE_SIZE;
val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_RSS);
s->stat[MCS_RSS] += val * PAGE_SIZE;
val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_MAPPED_FILE);
s->stat[MCS_MAPPED_FILE] += val * PAGE_SIZE;
val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_PGPGIN_COUNT);
s->stat[MCS_PGPGIN] += val;
val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_PGPGOUT_COUNT);
s->stat[MCS_PGPGOUT] += val;
if (do_swap_account) {
val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_SWAPOUT);
s->stat[MCS_SWAP] += val * PAGE_SIZE;
}
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
/* per zone stat */
val = mem_cgroup_get_local_zonestat(mem, LRU_INACTIVE_ANON);
s->stat[MCS_INACTIVE_ANON] += val * PAGE_SIZE;
val = mem_cgroup_get_local_zonestat(mem, LRU_ACTIVE_ANON);
s->stat[MCS_ACTIVE_ANON] += val * PAGE_SIZE;
val = mem_cgroup_get_local_zonestat(mem, LRU_INACTIVE_FILE);
s->stat[MCS_INACTIVE_FILE] += val * PAGE_SIZE;
val = mem_cgroup_get_local_zonestat(mem, LRU_ACTIVE_FILE);
s->stat[MCS_ACTIVE_FILE] += val * PAGE_SIZE;
val = mem_cgroup_get_local_zonestat(mem, LRU_UNEVICTABLE);
s->stat[MCS_UNEVICTABLE] += val * PAGE_SIZE;
return 0;
}
static void
mem_cgroup_get_total_stat(struct mem_cgroup *mem, struct mcs_total_stat *s)
{
mem_cgroup_walk_tree(mem, s, mem_cgroup_get_local_stat);
}
static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft,
struct cgroup_map_cb *cb)
{
struct mem_cgroup *mem_cont = mem_cgroup_from_cont(cont);
memset(&mystat, 0, sizeof(mystat));
mem_cgroup_get_local_stat(mem_cont, &mystat);
for (i = 0; i < NR_MCS_STAT; i++) {
if (i == MCS_SWAP && !do_swap_account)
continue;
cb->fill(cb, memcg_stat_strings[i].local_name, mystat.stat[i]);
{
unsigned long long limit, memsw_limit;
memcg_get_hierarchical_limit(mem_cont, &limit, &memsw_limit);
cb->fill(cb, "hierarchical_memory_limit", limit);
if (do_swap_account)
cb->fill(cb, "hierarchical_memsw_limit", memsw_limit);
}
memset(&mystat, 0, sizeof(mystat));
mem_cgroup_get_total_stat(mem_cont, &mystat);
for (i = 0; i < NR_MCS_STAT; i++) {
if (i == MCS_SWAP && !do_swap_account)
continue;
cb->fill(cb, memcg_stat_strings[i].total_name, mystat.stat[i]);
cb->fill(cb, "inactive_ratio", calc_inactive_ratio(mem_cont, NULL));
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
{
int nid, zid;
struct mem_cgroup_per_zone *mz;
unsigned long recent_rotated[2] = {0, 0};
unsigned long recent_scanned[2] = {0, 0};
for_each_online_node(nid)
for (zid = 0; zid < MAX_NR_ZONES; zid++) {
mz = mem_cgroup_zoneinfo(mem_cont, nid, zid);
recent_rotated[0] +=
mz->reclaim_stat.recent_rotated[0];
recent_rotated[1] +=
mz->reclaim_stat.recent_rotated[1];
recent_scanned[0] +=
mz->reclaim_stat.recent_scanned[0];
recent_scanned[1] +=
mz->reclaim_stat.recent_scanned[1];
}
cb->fill(cb, "recent_rotated_anon", recent_rotated[0]);
cb->fill(cb, "recent_rotated_file", recent_rotated[1]);
cb->fill(cb, "recent_scanned_anon", recent_scanned[0]);
cb->fill(cb, "recent_scanned_file", recent_scanned[1]);
}
#endif
return 0;
}
static u64 mem_cgroup_swappiness_read(struct cgroup *cgrp, struct cftype *cft)
{
struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
return get_swappiness(memcg);
}
static int mem_cgroup_swappiness_write(struct cgroup *cgrp, struct cftype *cft,
u64 val)
{
struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
struct mem_cgroup *parent;
if (val > 100)
return -EINVAL;
if (cgrp->parent == NULL)
return -EINVAL;
parent = mem_cgroup_from_cont(cgrp->parent);
/* If under hierarchy, only empty-root can set this value */
if ((parent->use_hierarchy) ||
(memcg->use_hierarchy && !list_empty(&cgrp->children))) {
cgroup_unlock();
spin_lock(&memcg->reclaim_param_lock);
memcg->swappiness = val;
spin_unlock(&memcg->reclaim_param_lock);
static struct cftype mem_cgroup_files[] = {
{
.private = MEMFILE_PRIVATE(_MEM, RES_USAGE),
.read_u64 = mem_cgroup_read,
{
.name = "max_usage_in_bytes",
.private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE),
.read_u64 = mem_cgroup_read,
},
.private = MEMFILE_PRIVATE(_MEM, RES_LIMIT),

Paul Menage
committed
.write_string = mem_cgroup_write,
.read_u64 = mem_cgroup_read,
{
.name = "soft_limit_in_bytes",
.private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT),
.write_string = mem_cgroup_write,
.read_u64 = mem_cgroup_read,
},
.private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT),
.read_u64 = mem_cgroup_read,
{
.name = "stat",
.read_map = mem_control_stat_show,
{
.name = "force_empty",
.trigger = mem_cgroup_force_empty_write,
},
{
.name = "use_hierarchy",
.write_u64 = mem_cgroup_hierarchy_write,
.read_u64 = mem_cgroup_hierarchy_read,
},
{
.name = "swappiness",
.read_u64 = mem_cgroup_swappiness_read,
.write_u64 = mem_cgroup_swappiness_write,
},
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
static struct cftype memsw_cgroup_files[] = {
{
.name = "memsw.usage_in_bytes",
.private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE),
.read_u64 = mem_cgroup_read,
},
{
.name = "memsw.max_usage_in_bytes",
.private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE),
.trigger = mem_cgroup_reset,
.read_u64 = mem_cgroup_read,
},
{
.name = "memsw.limit_in_bytes",
.private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT),
.write_string = mem_cgroup_write,
.read_u64 = mem_cgroup_read,
},
{
.name = "memsw.failcnt",
.private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT),
.trigger = mem_cgroup_reset,
.read_u64 = mem_cgroup_read,
},
};
static int register_memsw_files(struct cgroup *cont, struct cgroup_subsys *ss)
{
if (!do_swap_account)
return 0;
return cgroup_add_files(cont, ss, memsw_cgroup_files,
ARRAY_SIZE(memsw_cgroup_files));
};
#else
static int register_memsw_files(struct cgroup *cont, struct cgroup_subsys *ss)
{
return 0;
}
#endif

KAMEZAWA Hiroyuki
committed
static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)
{
struct mem_cgroup_per_node *pn;

KAMEZAWA Hiroyuki
committed
struct mem_cgroup_per_zone *mz;

KAMEZAWA Hiroyuki
committed
/*
* This routine is called against possible nodes.
* But it's BUG to call kmalloc() against offline node.
*
* TODO: this routine can waste much memory for nodes which will
* never be onlined. It's better to use memory hotplug callback
* function.
*/
if (!node_state(node, N_NORMAL_MEMORY))
tmp = -1;
pn = kmalloc_node(sizeof(*pn), GFP_KERNEL, tmp);

KAMEZAWA Hiroyuki
committed
if (!pn)
return 1;

KAMEZAWA Hiroyuki
committed

KAMEZAWA Hiroyuki
committed
mem->info.nodeinfo[node] = pn;
memset(pn, 0, sizeof(*pn));

KAMEZAWA Hiroyuki
committed
for (zone = 0; zone < MAX_NR_ZONES; zone++) {
mz = &pn->zoneinfo[zone];
for_each_lru(l)
INIT_LIST_HEAD(&mz->lists[l]);
mz->on_tree = false;
mz->mem = mem;

KAMEZAWA Hiroyuki
committed
}

KAMEZAWA Hiroyuki
committed
return 0;
}

KAMEZAWA Hiroyuki
committed
static void free_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)
{
kfree(mem->info.nodeinfo[node]);
}
static int mem_cgroup_size(void)
{
int cpustat_size = nr_cpu_ids * sizeof(struct mem_cgroup_stat_cpu);
return sizeof(struct mem_cgroup) + cpustat_size;
}
static struct mem_cgroup *mem_cgroup_alloc(void)
{
struct mem_cgroup *mem;
int size = mem_cgroup_size();
if (size < PAGE_SIZE)
mem = kmalloc(size, GFP_KERNEL);
memset(mem, 0, size);
return mem;
}
/*
* At destroying mem_cgroup, references from swap_cgroup can remain.
* (scanning all at force_empty is too costly...)
*
* Instead of clearing all references at force_empty, we remember
* the number of reference from swap_cgroup and free mem_cgroup when
* it goes down to 0.