Newer
Older
MCS_INACTIVE_ANON,
MCS_ACTIVE_ANON,
MCS_INACTIVE_FILE,
MCS_ACTIVE_FILE,
MCS_UNEVICTABLE,
NR_MCS_STAT,
};
struct mcs_total_stat {
s64 stat[NR_MCS_STAT];
struct {
char *local_name;
char *total_name;
} memcg_stat_strings[NR_MCS_STAT] = {
{"cache", "total_cache"},
{"rss", "total_rss"},
{"mapped_file", "total_mapped_file"},
{"pgpgin", "total_pgpgin"},
{"pgpgout", "total_pgpgout"},
{"pgfault", "total_pgfault"},
{"pgmajfault", "total_pgmajfault"},
{"inactive_anon", "total_inactive_anon"},
{"active_anon", "total_active_anon"},
{"inactive_file", "total_inactive_file"},
{"active_file", "total_active_file"},
{"unevictable", "total_unevictable"}
};
static void
mem_cgroup_get_local_stat(struct mem_cgroup *mem, struct mcs_total_stat *s)
{
s64 val;
/* per cpu stat */
val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_CACHE);
val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_RSS);
val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_FILE_MAPPED);
s->stat[MCS_FILE_MAPPED] += val * PAGE_SIZE;
val = mem_cgroup_read_events(mem, MEM_CGROUP_EVENTS_PGPGIN);
val = mem_cgroup_read_events(mem, MEM_CGROUP_EVENTS_PGPGOUT);
val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_SWAPOUT);
s->stat[MCS_SWAP] += val * PAGE_SIZE;
}
val = mem_cgroup_read_events(mem, MEM_CGROUP_EVENTS_PGFAULT);
s->stat[MCS_PGFAULT] += val;
val = mem_cgroup_read_events(mem, MEM_CGROUP_EVENTS_PGMAJFAULT);
s->stat[MCS_PGMAJFAULT] += val;
/* per zone stat */
val = mem_cgroup_get_local_zonestat(mem, LRU_INACTIVE_ANON);
s->stat[MCS_INACTIVE_ANON] += val * PAGE_SIZE;
val = mem_cgroup_get_local_zonestat(mem, LRU_ACTIVE_ANON);
s->stat[MCS_ACTIVE_ANON] += val * PAGE_SIZE;
val = mem_cgroup_get_local_zonestat(mem, LRU_INACTIVE_FILE);
s->stat[MCS_INACTIVE_FILE] += val * PAGE_SIZE;
val = mem_cgroup_get_local_zonestat(mem, LRU_ACTIVE_FILE);
s->stat[MCS_ACTIVE_FILE] += val * PAGE_SIZE;
val = mem_cgroup_get_local_zonestat(mem, LRU_UNEVICTABLE);
s->stat[MCS_UNEVICTABLE] += val * PAGE_SIZE;
}
static void
mem_cgroup_get_total_stat(struct mem_cgroup *mem, struct mcs_total_stat *s)
{
struct mem_cgroup *iter;
for_each_mem_cgroup_tree(iter, mem)
mem_cgroup_get_local_stat(iter, s);
4081
4082
4083
4084
4085
4086
4087
4088
4089
4090
4091
4092
4093
4094
4095
4096
4097
4098
4099
4100
4101
4102
4103
4104
4105
4106
4107
4108
4109
4110
4111
4112
4113
4114
4115
4116
4117
4118
4119
4120
4121
4122
4123
4124
4125
#ifdef CONFIG_NUMA
static int mem_control_numa_stat_show(struct seq_file *m, void *arg)
{
int nid;
unsigned long total_nr, file_nr, anon_nr, unevictable_nr;
unsigned long node_nr;
struct cgroup *cont = m->private;
struct mem_cgroup *mem_cont = mem_cgroup_from_cont(cont);
total_nr = mem_cgroup_nr_lru_pages(mem_cont);
seq_printf(m, "total=%lu", total_nr);
for_each_node_state(nid, N_HIGH_MEMORY) {
node_nr = mem_cgroup_node_nr_lru_pages(mem_cont, nid);
seq_printf(m, " N%d=%lu", nid, node_nr);
}
seq_putc(m, '\n');
file_nr = mem_cgroup_nr_file_lru_pages(mem_cont);
seq_printf(m, "file=%lu", file_nr);
for_each_node_state(nid, N_HIGH_MEMORY) {
node_nr = mem_cgroup_node_nr_file_lru_pages(mem_cont, nid);
seq_printf(m, " N%d=%lu", nid, node_nr);
}
seq_putc(m, '\n');
anon_nr = mem_cgroup_nr_anon_lru_pages(mem_cont);
seq_printf(m, "anon=%lu", anon_nr);
for_each_node_state(nid, N_HIGH_MEMORY) {
node_nr = mem_cgroup_node_nr_anon_lru_pages(mem_cont, nid);
seq_printf(m, " N%d=%lu", nid, node_nr);
}
seq_putc(m, '\n');
unevictable_nr = mem_cgroup_nr_unevictable_lru_pages(mem_cont);
seq_printf(m, "unevictable=%lu", unevictable_nr);
for_each_node_state(nid, N_HIGH_MEMORY) {
node_nr = mem_cgroup_node_nr_unevictable_lru_pages(mem_cont,
nid);
seq_printf(m, " N%d=%lu", nid, node_nr);
}
seq_putc(m, '\n');
return 0;
}
#endif /* CONFIG_NUMA */
static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft,
struct cgroup_map_cb *cb)
{
struct mem_cgroup *mem_cont = mem_cgroup_from_cont(cont);
memset(&mystat, 0, sizeof(mystat));
mem_cgroup_get_local_stat(mem_cont, &mystat);
for (i = 0; i < NR_MCS_STAT; i++) {
if (i == MCS_SWAP && !do_swap_account)
continue;
cb->fill(cb, memcg_stat_strings[i].local_name, mystat.stat[i]);
{
unsigned long long limit, memsw_limit;
memcg_get_hierarchical_limit(mem_cont, &limit, &memsw_limit);
cb->fill(cb, "hierarchical_memory_limit", limit);
if (do_swap_account)
cb->fill(cb, "hierarchical_memsw_limit", memsw_limit);
}
memset(&mystat, 0, sizeof(mystat));
mem_cgroup_get_total_stat(mem_cont, &mystat);
for (i = 0; i < NR_MCS_STAT; i++) {
if (i == MCS_SWAP && !do_swap_account)
continue;
cb->fill(cb, memcg_stat_strings[i].total_name, mystat.stat[i]);
cb->fill(cb, "inactive_ratio", calc_inactive_ratio(mem_cont, NULL));
4162
4163
4164
4165
4166
4167
4168
4169
4170
4171
4172
4173
4174
4175
4176
4177
4178
4179
4180
4181
4182
4183
4184
4185
4186
4187
4188
{
int nid, zid;
struct mem_cgroup_per_zone *mz;
unsigned long recent_rotated[2] = {0, 0};
unsigned long recent_scanned[2] = {0, 0};
for_each_online_node(nid)
for (zid = 0; zid < MAX_NR_ZONES; zid++) {
mz = mem_cgroup_zoneinfo(mem_cont, nid, zid);
recent_rotated[0] +=
mz->reclaim_stat.recent_rotated[0];
recent_rotated[1] +=
mz->reclaim_stat.recent_rotated[1];
recent_scanned[0] +=
mz->reclaim_stat.recent_scanned[0];
recent_scanned[1] +=
mz->reclaim_stat.recent_scanned[1];
}
cb->fill(cb, "recent_rotated_anon", recent_rotated[0]);
cb->fill(cb, "recent_rotated_file", recent_rotated[1]);
cb->fill(cb, "recent_scanned_anon", recent_scanned[0]);
cb->fill(cb, "recent_scanned_file", recent_scanned[1]);
}
#endif
return 0;
}
static u64 mem_cgroup_swappiness_read(struct cgroup *cgrp, struct cftype *cft)
{
struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
return get_swappiness(memcg);
}
static int mem_cgroup_swappiness_write(struct cgroup *cgrp, struct cftype *cft,
u64 val)
{
struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
struct mem_cgroup *parent;
if (val > 100)
return -EINVAL;
if (cgrp->parent == NULL)
return -EINVAL;
parent = mem_cgroup_from_cont(cgrp->parent);
/* If under hierarchy, only empty-root can set this value */
if ((parent->use_hierarchy) ||
(memcg->use_hierarchy && !list_empty(&cgrp->children))) {
cgroup_unlock();
static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap)
{
struct mem_cgroup_threshold_ary *t;
u64 usage;
int i;
rcu_read_lock();
if (!swap)
t = rcu_dereference(memcg->thresholds.primary);
t = rcu_dereference(memcg->memsw_thresholds.primary);
if (!t)
goto unlock;
usage = mem_cgroup_usage(memcg, swap);
/*
* current_threshold points to threshold just below usage.
* If it's not true, a threshold was crossed after last
* call of __mem_cgroup_threshold().
*/
4252
4253
4254
4255
4256
4257
4258
4259
4260
4261
4262
4263
4264
4265
4266
4267
4268
4269
4270
4271
4272
4273
4274
/*
* Iterate backward over array of thresholds starting from
* current_threshold and check if a threshold is crossed.
* If none of thresholds below usage is crossed, we read
* only one element of the array here.
*/
for (; i >= 0 && unlikely(t->entries[i].threshold > usage); i--)
eventfd_signal(t->entries[i].eventfd, 1);
/* i = current_threshold + 1 */
i++;
/*
* Iterate forward over array of thresholds starting from
* current_threshold+1 and check if a threshold is crossed.
* If none of thresholds above usage is crossed, we read
* only one element of the array here.
*/
for (; i < t->size && unlikely(t->entries[i].threshold <= usage); i++)
eventfd_signal(t->entries[i].eventfd, 1);
/* Update current_threshold */
unlock:
rcu_read_unlock();
}
static void mem_cgroup_threshold(struct mem_cgroup *memcg)
{
while (memcg) {
__mem_cgroup_threshold(memcg, false);
if (do_swap_account)
__mem_cgroup_threshold(memcg, true);
memcg = parent_mem_cgroup(memcg);
}
}
static int compare_thresholds(const void *a, const void *b)
{
const struct mem_cgroup_threshold *_a = a;
const struct mem_cgroup_threshold *_b = b;
return _a->threshold - _b->threshold;
}
static int mem_cgroup_oom_notify_cb(struct mem_cgroup *mem)
{
struct mem_cgroup_eventfd_list *ev;
list_for_each_entry(ev, &mem->oom_notify, list)
eventfd_signal(ev->eventfd, 1);
return 0;
}
static void mem_cgroup_oom_notify(struct mem_cgroup *mem)
{
struct mem_cgroup *iter;
for_each_mem_cgroup_tree(iter, mem)
mem_cgroup_oom_notify_cb(iter);
}
static int mem_cgroup_usage_register_event(struct cgroup *cgrp,
struct cftype *cft, struct eventfd_ctx *eventfd, const char *args)
{
struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
struct mem_cgroup_thresholds *thresholds;
struct mem_cgroup_threshold_ary *new;
int type = MEMFILE_TYPE(cft->private);
u64 threshold, usage;
ret = res_counter_memparse_write_strategy(args, &threshold);
if (ret)
return ret;
mutex_lock(&memcg->thresholds_lock);
thresholds = &memcg->thresholds;
thresholds = &memcg->memsw_thresholds;
else
BUG();
usage = mem_cgroup_usage(memcg, type == _MEMSWAP);
/* Check if a threshold crossed before adding a new one */
__mem_cgroup_threshold(memcg, type == _MEMSWAP);
size = thresholds->primary ? thresholds->primary->size + 1 : 1;
/* Allocate memory for new array of thresholds */
new = kmalloc(sizeof(*new) + size * sizeof(struct mem_cgroup_threshold),
ret = -ENOMEM;
goto unlock;
}
/* Copy thresholds (if any) to new array */
if (thresholds->primary) {
memcpy(new->entries, thresholds->primary->entries, (size - 1) *
sizeof(struct mem_cgroup_threshold));
new->entries[size - 1].eventfd = eventfd;
new->entries[size - 1].threshold = threshold;
/* Sort thresholds. Registering of new threshold isn't time-critical */
sort(new->entries, size, sizeof(struct mem_cgroup_threshold),
compare_thresholds, NULL);
/* Find current threshold */
if (new->entries[i].threshold < usage) {
* new->current_threshold will not be used until
* rcu_assign_pointer(), so it's safe to increment
/* Free old spare buffer and save old primary buffer as spare */
kfree(thresholds->spare);
thresholds->spare = thresholds->primary;
rcu_assign_pointer(thresholds->primary, new);
/* To be sure that nobody uses thresholds */
synchronize_rcu();
unlock:
mutex_unlock(&memcg->thresholds_lock);
return ret;
}
static void mem_cgroup_usage_unregister_event(struct cgroup *cgrp,
struct cftype *cft, struct eventfd_ctx *eventfd)
{
struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
struct mem_cgroup_thresholds *thresholds;
struct mem_cgroup_threshold_ary *new;
int type = MEMFILE_TYPE(cft->private);
u64 usage;
mutex_lock(&memcg->thresholds_lock);
if (type == _MEM)
thresholds = &memcg->thresholds;
thresholds = &memcg->memsw_thresholds;
else
BUG();
/*
* Something went wrong if we trying to unregister a threshold
* if we don't have thresholds
*/
BUG_ON(!thresholds);
usage = mem_cgroup_usage(memcg, type == _MEMSWAP);
/* Check if a threshold crossed before removing */
__mem_cgroup_threshold(memcg, type == _MEMSWAP);
/* Calculate new number of threshold */
size = 0;
for (i = 0; i < thresholds->primary->size; i++) {
if (thresholds->primary->entries[i].eventfd != eventfd)
/* Set thresholds array to NULL if we don't have thresholds */
if (!size) {
goto swap_buffers;
/* Copy thresholds and find current threshold */
new->current_threshold = -1;
for (i = 0, j = 0; i < thresholds->primary->size; i++) {
if (thresholds->primary->entries[i].eventfd == eventfd)
new->entries[j] = thresholds->primary->entries[i];
if (new->entries[j].threshold < usage) {
* new->current_threshold will not be used
* until rcu_assign_pointer(), so it's safe to increment
* it here.
*/
swap_buffers:
/* Swap primary and spare array */
thresholds->spare = thresholds->primary;
rcu_assign_pointer(thresholds->primary, new);
/* To be sure that nobody uses thresholds */
synchronize_rcu();
mutex_unlock(&memcg->thresholds_lock);
}
4474
4475
4476
4477
4478
4479
4480
4481
4482
4483
4484
4485
4486
4487
4488
4489
4490
4491
4492
4493
4494
4495
4496
4497
4498
static int mem_cgroup_oom_register_event(struct cgroup *cgrp,
struct cftype *cft, struct eventfd_ctx *eventfd, const char *args)
{
struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
struct mem_cgroup_eventfd_list *event;
int type = MEMFILE_TYPE(cft->private);
BUG_ON(type != _OOM_TYPE);
event = kmalloc(sizeof(*event), GFP_KERNEL);
if (!event)
return -ENOMEM;
mutex_lock(&memcg_oom_mutex);
event->eventfd = eventfd;
list_add(&event->list, &memcg->oom_notify);
/* already in OOM ? */
if (atomic_read(&memcg->oom_lock))
eventfd_signal(eventfd, 1);
mutex_unlock(&memcg_oom_mutex);
return 0;
}
static void mem_cgroup_oom_unregister_event(struct cgroup *cgrp,
struct cftype *cft, struct eventfd_ctx *eventfd)
{
struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp);
struct mem_cgroup_eventfd_list *ev, *tmp;
int type = MEMFILE_TYPE(cft->private);
BUG_ON(type != _OOM_TYPE);
mutex_lock(&memcg_oom_mutex);
list_for_each_entry_safe(ev, tmp, &mem->oom_notify, list) {
if (ev->eventfd == eventfd) {
list_del(&ev->list);
kfree(ev);
}
}
mutex_unlock(&memcg_oom_mutex);
}
4520
4521
4522
4523
4524
4525
4526
4527
4528
4529
4530
4531
4532
4533
4534
4535
4536
4537
4538
4539
4540
4541
4542
4543
4544
4545
4546
4547
4548
4549
4550
4551
4552
4553
static int mem_cgroup_oom_control_read(struct cgroup *cgrp,
struct cftype *cft, struct cgroup_map_cb *cb)
{
struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp);
cb->fill(cb, "oom_kill_disable", mem->oom_kill_disable);
if (atomic_read(&mem->oom_lock))
cb->fill(cb, "under_oom", 1);
else
cb->fill(cb, "under_oom", 0);
return 0;
}
static int mem_cgroup_oom_control_write(struct cgroup *cgrp,
struct cftype *cft, u64 val)
{
struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp);
struct mem_cgroup *parent;
/* cannot set to root cgroup and only 0 and 1 are allowed */
if (!cgrp->parent || !((val == 0) || (val == 1)))
return -EINVAL;
parent = mem_cgroup_from_cont(cgrp->parent);
cgroup_lock();
/* oom-kill-disable is a flag for subhierarchy. */
if ((parent->use_hierarchy) ||
(mem->use_hierarchy && !list_empty(&cgrp->children))) {
cgroup_unlock();
return -EINVAL;
}
mem->oom_kill_disable = val;
if (!val)
memcg_oom_recover(mem);
cgroup_unlock();
return 0;
}
#ifdef CONFIG_NUMA
static const struct file_operations mem_control_numa_stat_file_operations = {
.read = seq_read,
.llseek = seq_lseek,
.release = single_release,
};
static int mem_control_numa_stat_open(struct inode *unused, struct file *file)
{
struct cgroup *cont = file->f_dentry->d_parent->d_fsdata;
file->f_op = &mem_control_numa_stat_file_operations;
return single_open(file, mem_control_numa_stat_show, cont);
}
#endif /* CONFIG_NUMA */
static struct cftype mem_cgroup_files[] = {
{
.private = MEMFILE_PRIVATE(_MEM, RES_USAGE),
.read_u64 = mem_cgroup_read,
.register_event = mem_cgroup_usage_register_event,
.unregister_event = mem_cgroup_usage_unregister_event,
{
.name = "max_usage_in_bytes",
.private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE),
.read_u64 = mem_cgroup_read,
},
.private = MEMFILE_PRIVATE(_MEM, RES_LIMIT),

Paul Menage
committed
.write_string = mem_cgroup_write,
.read_u64 = mem_cgroup_read,
{
.name = "soft_limit_in_bytes",
.private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT),
.write_string = mem_cgroup_write,
.read_u64 = mem_cgroup_read,
},
.private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT),
.read_u64 = mem_cgroup_read,
{
.name = "stat",
.read_map = mem_control_stat_show,
{
.name = "force_empty",
.trigger = mem_cgroup_force_empty_write,
},
{
.name = "use_hierarchy",
.write_u64 = mem_cgroup_hierarchy_write,
.read_u64 = mem_cgroup_hierarchy_read,
},
{
.name = "swappiness",
.read_u64 = mem_cgroup_swappiness_read,
.write_u64 = mem_cgroup_swappiness_write,
},
{
.name = "move_charge_at_immigrate",
.read_u64 = mem_cgroup_move_charge_read,
.write_u64 = mem_cgroup_move_charge_write,
},
.read_map = mem_cgroup_oom_control_read,
.write_u64 = mem_cgroup_oom_control_write,
.register_event = mem_cgroup_oom_register_event,
.unregister_event = mem_cgroup_oom_unregister_event,
.private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL),
},
#ifdef CONFIG_NUMA
{
.name = "numa_stat",
.open = mem_control_numa_stat_open,
},
#endif
#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
static struct cftype memsw_cgroup_files[] = {
{
.name = "memsw.usage_in_bytes",
.private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE),
.read_u64 = mem_cgroup_read,
.register_event = mem_cgroup_usage_register_event,
.unregister_event = mem_cgroup_usage_unregister_event,
4655
4656
4657
4658
4659
4660
4661
4662
4663
4664
4665
4666
4667
4668
4669
4670
4671
4672
4673
4674
4675
4676
4677
4678
4679
4680
4681
4682
4683
4684
4685
4686
4687
4688
4689
},
{
.name = "memsw.max_usage_in_bytes",
.private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE),
.trigger = mem_cgroup_reset,
.read_u64 = mem_cgroup_read,
},
{
.name = "memsw.limit_in_bytes",
.private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT),
.write_string = mem_cgroup_write,
.read_u64 = mem_cgroup_read,
},
{
.name = "memsw.failcnt",
.private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT),
.trigger = mem_cgroup_reset,
.read_u64 = mem_cgroup_read,
},
};
static int register_memsw_files(struct cgroup *cont, struct cgroup_subsys *ss)
{
if (!do_swap_account)
return 0;
return cgroup_add_files(cont, ss, memsw_cgroup_files,
ARRAY_SIZE(memsw_cgroup_files));
};
#else
static int register_memsw_files(struct cgroup *cont, struct cgroup_subsys *ss)
{
return 0;
}
#endif

KAMEZAWA Hiroyuki
committed
static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)
{
struct mem_cgroup_per_node *pn;

KAMEZAWA Hiroyuki
committed
struct mem_cgroup_per_zone *mz;

KAMEZAWA Hiroyuki
committed
/*
* This routine is called against possible nodes.
* But it's BUG to call kmalloc() against offline node.
*
* TODO: this routine can waste much memory for nodes which will
* never be onlined. It's better to use memory hotplug callback
* function.
*/
if (!node_state(node, N_NORMAL_MEMORY))
tmp = -1;
pn = kzalloc_node(sizeof(*pn), GFP_KERNEL, tmp);

KAMEZAWA Hiroyuki
committed
if (!pn)
return 1;

KAMEZAWA Hiroyuki
committed

KAMEZAWA Hiroyuki
committed
mem->info.nodeinfo[node] = pn;

KAMEZAWA Hiroyuki
committed
for (zone = 0; zone < MAX_NR_ZONES; zone++) {
mz = &pn->zoneinfo[zone];
for_each_lru(l)
INIT_LIST_HEAD(&mz->lists[l]);
mz->on_tree = false;
mz->mem = mem;

KAMEZAWA Hiroyuki
committed
}

KAMEZAWA Hiroyuki
committed
return 0;
}

KAMEZAWA Hiroyuki
committed
static void free_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)
{
kfree(mem->info.nodeinfo[node]);
}
static struct mem_cgroup *mem_cgroup_alloc(void)
{
struct mem_cgroup *mem;
int size = sizeof(struct mem_cgroup);
/* Can be very big if MAX_NUMNODES is very big */
if (size < PAGE_SIZE)
mem = kzalloc(size, GFP_KERNEL);
mem = vzalloc(size);
mem->stat = alloc_percpu(struct mem_cgroup_stat_cpu);
if (!mem->stat)
goto out_free;
spin_lock_init(&mem->pcp_counter_lock);
out_free:
if (size < PAGE_SIZE)
kfree(mem);
else
vfree(mem);
return NULL;
/*
* At destroying mem_cgroup, references from swap_cgroup can remain.
* (scanning all at force_empty is too costly...)
*
* Instead of clearing all references at force_empty, we remember
* the number of reference from swap_cgroup and free mem_cgroup when
* it goes down to 0.
*
* Removal of cgroup itself succeeds regardless of refs from swap.
*/
static void __mem_cgroup_free(struct mem_cgroup *mem)
mem_cgroup_remove_from_trees(mem);
for_each_node_state(node, N_POSSIBLE)
free_mem_cgroup_per_zone_info(mem, node);
free_percpu(mem->stat);
if (sizeof(struct mem_cgroup) < PAGE_SIZE)
kfree(mem);
else
vfree(mem);
}
static void mem_cgroup_get(struct mem_cgroup *mem)
{
atomic_inc(&mem->refcnt);
}
static void __mem_cgroup_put(struct mem_cgroup *mem, int count)
if (atomic_sub_and_test(count, &mem->refcnt)) {
struct mem_cgroup *parent = parent_mem_cgroup(mem);
__mem_cgroup_free(mem);
if (parent)
mem_cgroup_put(parent);
}
static void mem_cgroup_put(struct mem_cgroup *mem)
{
__mem_cgroup_put(mem, 1);
}
/*
* Returns the parent mem_cgroup in memcgroup hierarchy with hierarchy enabled.
*/
static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem)
{
if (!mem->res.parent)
return NULL;
return mem_cgroup_from_res_counter(mem->res.parent, res);
}
#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
static void __init enable_swap_cgroup(void)
{
if (!mem_cgroup_disabled() && really_do_swap_account)
do_swap_account = 1;
}
#else
static void __init enable_swap_cgroup(void)
{
}
#endif
4825
4826
4827
4828
4829
4830
4831
4832
4833
4834
4835
4836
4837
4838
4839
4840
4841
4842
4843
4844
4845
4846
4847
4848
4849
static int mem_cgroup_soft_limit_tree_init(void)
{
struct mem_cgroup_tree_per_node *rtpn;
struct mem_cgroup_tree_per_zone *rtpz;
int tmp, node, zone;
for_each_node_state(node, N_POSSIBLE) {
tmp = node;
if (!node_state(node, N_NORMAL_MEMORY))
tmp = -1;
rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, tmp);
if (!rtpn)
return 1;
soft_limit_tree.rb_tree_per_node[node] = rtpn;
for (zone = 0; zone < MAX_NR_ZONES; zone++) {
rtpz = &rtpn->rb_tree_per_zone[zone];
rtpz->rb_root = RB_ROOT;
spin_lock_init(&rtpz->lock);
}
}
return 0;
}
mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
{
struct mem_cgroup *mem, *parent;

KAMEZAWA Hiroyuki
committed
int node;
mem = mem_cgroup_alloc();
if (!mem)

KAMEZAWA Hiroyuki
committed
for_each_node_state(node, N_POSSIBLE)
if (alloc_mem_cgroup_per_zone_info(mem, node))
goto free_out;
if (cont->parent == NULL) {
root_mem_cgroup = mem;
if (mem_cgroup_soft_limit_tree_init())
goto free_out;
for_each_possible_cpu(cpu) {
struct memcg_stock_pcp *stock =
&per_cpu(memcg_stock, cpu);
INIT_WORK(&stock->work, drain_local_stock);
}
hotcpu_notifier(memcg_cpu_hotplug_callback, 0);
parent = mem_cgroup_from_cont(cont->parent);
mem->use_hierarchy = parent->use_hierarchy;
mem->oom_kill_disable = parent->oom_kill_disable;
if (parent && parent->use_hierarchy) {
res_counter_init(&mem->res, &parent->res);
res_counter_init(&mem->memsw, &parent->memsw);
/*
* We increment refcnt of the parent to ensure that we can
* safely access it on res_counter_charge/uncharge.
* This refcnt will be decremented when freeing this
* mem_cgroup(see mem_cgroup_put).
*/
mem_cgroup_get(parent);
} else {
res_counter_init(&mem->res, NULL);
res_counter_init(&mem->memsw, NULL);
}
mem->last_scanned_node = MAX_NUMNODES;
if (parent)
mem->swappiness = get_swappiness(parent);
atomic_set(&mem->refcnt, 1);
mem->move_charge_at_immigrate = 0;
mutex_init(&mem->thresholds_lock);

KAMEZAWA Hiroyuki
committed
free_out:
__mem_cgroup_free(mem);
root_mem_cgroup = NULL;
static int mem_cgroup_pre_destroy(struct cgroup_subsys *ss,
struct cgroup *cont)
{
struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
return mem_cgroup_force_empty(mem, false);
static void mem_cgroup_destroy(struct cgroup_subsys *ss,
struct cgroup *cont)
{
struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
mem_cgroup_put(mem);
}
static int mem_cgroup_populate(struct cgroup_subsys *ss,
struct cgroup *cont)
{
int ret;
ret = cgroup_add_files(cont, ss, mem_cgroup_files,
ARRAY_SIZE(mem_cgroup_files));
if (!ret)
ret = register_memsw_files(cont, ss);
return ret;
/* Handlers for move charge at task migration. */
#define PRECHARGE_COUNT_AT_ONCE 256
static int mem_cgroup_do_precharge(unsigned long count)
int ret = 0;
int batch_count = PRECHARGE_COUNT_AT_ONCE;
struct mem_cgroup *mem = mc.to;
4953
4954
4955
4956
4957
4958
4959
4960
4961
4962
4963
4964
4965
4966
4967
4968
4969
4970
4971
4972
4973
4974
4975
4976
4977
4978
4979
4980
4981
4982
4983
4984
4985
4986
4987
if (mem_cgroup_is_root(mem)) {
mc.precharge += count;
/* we don't need css_get for root */
return ret;
}
/* try to charge at once */
if (count > 1) {
struct res_counter *dummy;
/*
* "mem" cannot be under rmdir() because we've already checked
* by cgroup_lock_live_cgroup() that it is not removed and we
* are still under the same cgroup_mutex. So we can postpone
* css_get().
*/
if (res_counter_charge(&mem->res, PAGE_SIZE * count, &dummy))
goto one_by_one;
if (do_swap_account && res_counter_charge(&mem->memsw,
PAGE_SIZE * count, &dummy)) {
res_counter_uncharge(&mem->res, PAGE_SIZE * count);
goto one_by_one;
}
mc.precharge += count;
return ret;
}
one_by_one:
/* fall back to one by one charge */
while (count--) {
if (signal_pending(current)) {
ret = -EINTR;
break;
}
if (!batch_count--) {
batch_count = PRECHARGE_COUNT_AT_ONCE;
cond_resched();
}
ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, 1, &mem, false);
if (ret || !mem)
/* mem_cgroup_clear_mc() will do uncharge later */
return -ENOMEM;
mc.precharge++;
}
return ret;
}
/**
* is_target_pte_for_mc - check a pte whether it is valid for move charge
* @vma: the vma the pte to be checked belongs
* @addr: the address corresponding to the pte to be checked