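/*
 * Walk up the cgroup tree and report the tightest memory and mem+swap
 * limits that apply to @memcg; the results are shown in memory.stat as
 * hierarchical_memory_limit and hierarchical_memsw_limit.
 */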
static void memcg_get_hierarchical_limit(struct mem_cgroup *memcg,
		unsigned long long *mem_limit, unsigned long long *memsw_limit)
{
	struct cgroup *cgroup;
	unsigned long long min_limit, min_memsw_limit, tmp;

	min_limit = res_counter_read_u64(&memcg->res, RES_LIMIT);
	min_memsw_limit = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
	cgroup = memcg->css.cgroup;
	if (!memcg->use_hierarchy)
		goto out;

	while (cgroup->parent) {
		cgroup = cgroup->parent;
		memcg = mem_cgroup_from_cont(cgroup);
		if (!memcg->use_hierarchy)
			break;
		tmp = res_counter_read_u64(&memcg->res, RES_LIMIT);
		min_limit = min(min_limit, tmp);
		tmp = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
		min_memsw_limit = min(min_memsw_limit, tmp);
	}
out:
	*mem_limit = min_limit;
	*memsw_limit = min_memsw_limit;
}
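/*
 * Trigger handler shared by memory.max_usage_in_bytes and
 * memory.failcnt (and their memsw counterparts): any write resets the
 * corresponding watermark or failure counter.
 */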
static int mem_cgroup_reset(struct cgroup *cont, unsigned int event)
{
	struct mem_cgroup *mem;
	int type, name;

	mem = mem_cgroup_from_cont(cont);
	type = MEMFILE_TYPE(event);
	name = MEMFILE_ATTR(event);
	switch (name) {
	case RES_MAX_USAGE:
		if (type == _MEM)
			res_counter_reset_max(&mem->res);
		else
			res_counter_reset_max(&mem->memsw);
		break;
	case RES_FAILCNT:
		if (type == _MEM)
			res_counter_reset_failcnt(&mem->res);
		else
			res_counter_reset_failcnt(&mem->memsw);
		break;
	}

	return 0;
}
static u64 mem_cgroup_move_charge_read(struct cgroup *cgrp,
					struct cftype *cft)
{
	return mem_cgroup_from_cont(cgrp)->move_charge_at_immigrate;
}
#ifdef CONFIG_MMU
static int mem_cgroup_move_charge_write(struct cgroup *cgrp,
					struct cftype *cft, u64 val)
{
	struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp);

	if (val >= (1 << NR_MOVE_TYPE))
		return -EINVAL;
	/*
	 * We check this value several times, both in can_attach() and
	 * attach(), so we need the cgroup lock to keep it from changing
	 * underneath us.
	 */
	cgroup_lock();
	mem->move_charge_at_immigrate = val;
	cgroup_unlock();

	return 0;
}
#else
static int mem_cgroup_move_charge_write(struct cgroup *cgrp,
					struct cftype *cft, u64 val)
{
	return -ENOSYS;
}
#endif
/* For read statistics */
enum {
	MCS_CACHE,
	MCS_RSS,
	MCS_FILE_MAPPED,
	MCS_PGPGIN,
	MCS_PGPGOUT,
	MCS_SWAP,
	MCS_PGFAULT,
	MCS_PGMAJFAULT,
	MCS_INACTIVE_ANON,
	MCS_ACTIVE_ANON,
	MCS_INACTIVE_FILE,
	MCS_ACTIVE_FILE,
	MCS_UNEVICTABLE,
	NR_MCS_STAT,
};

struct mcs_total_stat {
	s64 stat[NR_MCS_STAT];
};

struct {
	char *local_name;
	char *total_name;
} memcg_stat_strings[NR_MCS_STAT] = {
	{"cache", "total_cache"},
	{"rss", "total_rss"},
	{"mapped_file", "total_mapped_file"},
	{"pgpgin", "total_pgpgin"},
	{"pgpgout", "total_pgpgout"},
	{"swap", "total_swap"},
	{"pgfault", "total_pgfault"},
	{"pgmajfault", "total_pgmajfault"},
	{"inactive_anon", "total_inactive_anon"},
	{"active_anon", "total_active_anon"},
	{"inactive_file", "total_inactive_file"},
	{"active_file", "total_active_file"},
	{"unevictable", "total_unevictable"}
};
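/*
 * Accumulate the statistics of @mem alone (no hierarchy walk) into @s.
 * Page counters are scaled to bytes; event counters (pgpgin, pgpgout,
 * pgfault, pgmajfault) are raw counts.
 */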
static void
mem_cgroup_get_local_stat(struct mem_cgroup *mem, struct mcs_total_stat *s)
{
	s64 val;

	/* per cpu stat */
	val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_CACHE);
	s->stat[MCS_CACHE] += val * PAGE_SIZE;
	val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_RSS);
	s->stat[MCS_RSS] += val * PAGE_SIZE;
	val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_FILE_MAPPED);
	s->stat[MCS_FILE_MAPPED] += val * PAGE_SIZE;
	val = mem_cgroup_read_events(mem, MEM_CGROUP_EVENTS_PGPGIN);
	s->stat[MCS_PGPGIN] += val;
	val = mem_cgroup_read_events(mem, MEM_CGROUP_EVENTS_PGPGOUT);
	s->stat[MCS_PGPGOUT] += val;
	if (do_swap_account) {
		val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_SWAPOUT);
		s->stat[MCS_SWAP] += val * PAGE_SIZE;
	}
	val = mem_cgroup_read_events(mem, MEM_CGROUP_EVENTS_PGFAULT);
	s->stat[MCS_PGFAULT] += val;
	val = mem_cgroup_read_events(mem, MEM_CGROUP_EVENTS_PGMAJFAULT);
	s->stat[MCS_PGMAJFAULT] += val;

	/* per zone stat */
	val = mem_cgroup_get_local_zonestat(mem, LRU_INACTIVE_ANON);
	s->stat[MCS_INACTIVE_ANON] += val * PAGE_SIZE;
	val = mem_cgroup_get_local_zonestat(mem, LRU_ACTIVE_ANON);
	s->stat[MCS_ACTIVE_ANON] += val * PAGE_SIZE;
	val = mem_cgroup_get_local_zonestat(mem, LRU_INACTIVE_FILE);
	s->stat[MCS_INACTIVE_FILE] += val * PAGE_SIZE;
	val = mem_cgroup_get_local_zonestat(mem, LRU_ACTIVE_FILE);
	s->stat[MCS_ACTIVE_FILE] += val * PAGE_SIZE;
	val = mem_cgroup_get_local_zonestat(mem, LRU_UNEVICTABLE);
	s->stat[MCS_UNEVICTABLE] += val * PAGE_SIZE;
}
static void
mem_cgroup_get_total_stat(struct mem_cgroup *mem, struct mcs_total_stat *s)
{
	struct mem_cgroup *iter;

	for_each_mem_cgroup_tree(iter, mem)
		mem_cgroup_get_local_stat(iter, s);
}
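/*
 * memory.numa_stat layout: one line each for the total, file, anon and
 * unevictable LRU page counts, each followed by a per-node breakdown,
 * e.g. "total=1024 N0=512 N1=512".
 */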
#ifdef CONFIG_NUMA
static int mem_control_numa_stat_show(struct seq_file *m, void *arg)
{
	int nid;
	unsigned long total_nr, file_nr, anon_nr, unevictable_nr;
	unsigned long node_nr;
	struct cgroup *cont = m->private;
	struct mem_cgroup *mem_cont = mem_cgroup_from_cont(cont);

	total_nr = mem_cgroup_nr_lru_pages(mem_cont);
	seq_printf(m, "total=%lu", total_nr);
	for_each_node_state(nid, N_HIGH_MEMORY) {
		node_nr = mem_cgroup_node_nr_lru_pages(mem_cont, nid);
		seq_printf(m, " N%d=%lu", nid, node_nr);
	}
	seq_putc(m, '\n');

	file_nr = mem_cgroup_nr_file_lru_pages(mem_cont);
	seq_printf(m, "file=%lu", file_nr);
	for_each_node_state(nid, N_HIGH_MEMORY) {
		node_nr = mem_cgroup_node_nr_file_lru_pages(mem_cont, nid);
		seq_printf(m, " N%d=%lu", nid, node_nr);
	}
	seq_putc(m, '\n');

	anon_nr = mem_cgroup_nr_anon_lru_pages(mem_cont);
	seq_printf(m, "anon=%lu", anon_nr);
	for_each_node_state(nid, N_HIGH_MEMORY) {
		node_nr = mem_cgroup_node_nr_anon_lru_pages(mem_cont, nid);
		seq_printf(m, " N%d=%lu", nid, node_nr);
	}
	seq_putc(m, '\n');

	unevictable_nr = mem_cgroup_nr_unevictable_lru_pages(mem_cont);
	seq_printf(m, "unevictable=%lu", unevictable_nr);
	for_each_node_state(nid, N_HIGH_MEMORY) {
		node_nr = mem_cgroup_node_nr_unevictable_lru_pages(mem_cont,
									nid);
		seq_printf(m, " N%d=%lu", nid, node_nr);
	}
	seq_putc(m, '\n');
	return 0;
}
#endif /* CONFIG_NUMA */
static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft,
				 struct cgroup_map_cb *cb)
{
	struct mem_cgroup *mem_cont = mem_cgroup_from_cont(cont);
	struct mcs_total_stat mystat;
	int i;

	memset(&mystat, 0, sizeof(mystat));
	mem_cgroup_get_local_stat(mem_cont, &mystat);

	for (i = 0; i < NR_MCS_STAT; i++) {
		if (i == MCS_SWAP && !do_swap_account)
			continue;
		cb->fill(cb, memcg_stat_strings[i].local_name, mystat.stat[i]);
	}

	/* Hierarchical information */
	{
		unsigned long long limit, memsw_limit;
		memcg_get_hierarchical_limit(mem_cont, &limit, &memsw_limit);
		cb->fill(cb, "hierarchical_memory_limit", limit);
		if (do_swap_account)
			cb->fill(cb, "hierarchical_memsw_limit", memsw_limit);
	}

	memset(&mystat, 0, sizeof(mystat));
	mem_cgroup_get_total_stat(mem_cont, &mystat);
	for (i = 0; i < NR_MCS_STAT; i++) {
		if (i == MCS_SWAP && !do_swap_account)
			continue;
		cb->fill(cb, memcg_stat_strings[i].total_name, mystat.stat[i]);
	}

#ifdef CONFIG_DEBUG_VM
	cb->fill(cb, "inactive_ratio", calc_inactive_ratio(mem_cont, NULL));
	{
		int nid, zid;
		struct mem_cgroup_per_zone *mz;
		unsigned long recent_rotated[2] = {0, 0};
		unsigned long recent_scanned[2] = {0, 0};

		for_each_online_node(nid)
			for (zid = 0; zid < MAX_NR_ZONES; zid++) {
				mz = mem_cgroup_zoneinfo(mem_cont, nid, zid);

				recent_rotated[0] +=
					mz->reclaim_stat.recent_rotated[0];
				recent_rotated[1] +=
					mz->reclaim_stat.recent_rotated[1];
				recent_scanned[0] +=
					mz->reclaim_stat.recent_scanned[0];
				recent_scanned[1] +=
					mz->reclaim_stat.recent_scanned[1];
			}
		cb->fill(cb, "recent_rotated_anon", recent_rotated[0]);
		cb->fill(cb, "recent_rotated_file", recent_rotated[1]);
		cb->fill(cb, "recent_scanned_anon", recent_scanned[0]);
		cb->fill(cb, "recent_scanned_file", recent_scanned[1]);
	}
#endif

	return 0;
}
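/*
 * memory.swappiness: per-memcg counterpart of /proc/sys/vm/swappiness,
 * consulted by reclaim. Under use_hierarchy only an "empty root" (no
 * hierarchical parent, no children) may change it, so the whole tree
 * sees a single value.
 */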
static u64 mem_cgroup_swappiness_read(struct cgroup *cgrp, struct cftype *cft)
{
	struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);

	return get_swappiness(memcg);
}

static int mem_cgroup_swappiness_write(struct cgroup *cgrp, struct cftype *cft,
				       u64 val)
{
	struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
	struct mem_cgroup *parent;

	if (val > 100)
		return -EINVAL;

	if (cgrp->parent == NULL)
		return -EINVAL;

	parent = mem_cgroup_from_cont(cgrp->parent);

	cgroup_lock();

	/* If under hierarchy, only empty-root can set this value */
	if ((parent->use_hierarchy) ||
	    (memcg->use_hierarchy && !list_empty(&cgrp->children))) {
		cgroup_unlock();
		return -EINVAL;
	}

	memcg->swappiness = val;

	cgroup_unlock();

	return 0;
}
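/*
 * Threshold notification: compare the current usage against the sorted
 * threshold array and signal the eventfd of every threshold crossed
 * since the previous check. current_threshold caches the index of the
 * last threshold at or below usage, so the common case touches only
 * one or two array entries.
 */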
static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap)
{
	struct mem_cgroup_threshold_ary *t;
	u64 usage;
	int i;

	rcu_read_lock();
	if (!swap)
		t = rcu_dereference(memcg->thresholds.primary);
	else
		t = rcu_dereference(memcg->memsw_thresholds.primary);

	if (!t)
		goto unlock;

	usage = mem_cgroup_usage(memcg, swap);

	/*
	 * current_threshold points to the threshold just below usage.
	 * If that's not true, a threshold was crossed after the last
	 * call of __mem_cgroup_threshold().
	 */
	i = t->current_threshold;
	/*
	 * Iterate backward over the array of thresholds starting from
	 * current_threshold and check if a threshold is crossed.
	 * If none of the thresholds below usage is crossed, we read
	 * only one element of the array here.
	 */
	for (; i >= 0 && unlikely(t->entries[i].threshold > usage); i--)
		eventfd_signal(t->entries[i].eventfd, 1);

	/* i = current_threshold + 1 */
	i++;

	/*
	 * Iterate forward over the array of thresholds starting from
	 * current_threshold+1 and check if a threshold is crossed.
	 * If none of the thresholds above usage is crossed, we read
	 * only one element of the array here.
	 */
	for (; i < t->size && unlikely(t->entries[i].threshold <= usage); i++)
		eventfd_signal(t->entries[i].eventfd, 1);

	/* Update current_threshold */
	t->current_threshold = i - 1;
unlock:
	rcu_read_unlock();
}
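/*
 * Called when usage changes: the usage of every ancestor changed as
 * well, so re-check thresholds all the way up the hierarchy.
 */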
static void mem_cgroup_threshold(struct mem_cgroup *memcg)
{
	while (memcg) {
		__mem_cgroup_threshold(memcg, false);
		if (do_swap_account)
			__mem_cgroup_threshold(memcg, true);
		memcg = parent_mem_cgroup(memcg);
	}
}
static int compare_thresholds(const void *a, const void *b)
{
	const struct mem_cgroup_threshold *_a = a;
	const struct mem_cgroup_threshold *_b = b;

	/* u64 difference may not fit in an int; compare explicitly */
	if (_a->threshold > _b->threshold)
		return 1;
	if (_a->threshold < _b->threshold)
		return -1;
	return 0;
}
static int mem_cgroup_oom_notify_cb(struct mem_cgroup *mem)
{
	struct mem_cgroup_eventfd_list *ev;

	list_for_each_entry(ev, &mem->oom_notify, list)
		eventfd_signal(ev->eventfd, 1);
	return 0;
}

static void mem_cgroup_oom_notify(struct mem_cgroup *mem)
{
	struct mem_cgroup *iter;

	for_each_mem_cgroup_tree(iter, mem)
		mem_cgroup_oom_notify_cb(iter);
}
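/*
 * Register an eventfd against memory[.memsw].usage_in_bytes: @args
 * carries the threshold in bytes. The handler builds a new sorted copy
 * of the threshold array, publishes it with rcu_assign_pointer(), and
 * keeps the old array as a spare buffer for the next update.
 */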
static int mem_cgroup_usage_register_event(struct cgroup *cgrp,
	struct cftype *cft, struct eventfd_ctx *eventfd, const char *args)
{
	struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
	struct mem_cgroup_thresholds *thresholds;
	struct mem_cgroup_threshold_ary *new;
	int type = MEMFILE_TYPE(cft->private);
	u64 threshold, usage;
	int i, size, ret;

	ret = res_counter_memparse_write_strategy(args, &threshold);
	if (ret)
		return ret;

	mutex_lock(&memcg->thresholds_lock);

	if (type == _MEM)
		thresholds = &memcg->thresholds;
	else if (type == _MEMSWAP)
		thresholds = &memcg->memsw_thresholds;
	else
		BUG();

	usage = mem_cgroup_usage(memcg, type == _MEMSWAP);

	/* Check if a threshold crossed before adding a new one */
	if (thresholds->primary)
		__mem_cgroup_threshold(memcg, type == _MEMSWAP);

	size = thresholds->primary ? thresholds->primary->size + 1 : 1;

	/* Allocate memory for new array of thresholds */
	new = kmalloc(sizeof(*new) + size * sizeof(struct mem_cgroup_threshold),
			GFP_KERNEL);
	if (!new) {
		ret = -ENOMEM;
		goto unlock;
	}
	new->size = size;

	/* Copy thresholds (if any) to new array */
	if (thresholds->primary)
		memcpy(new->entries, thresholds->primary->entries, (size - 1) *
				sizeof(struct mem_cgroup_threshold));

	/* Add new threshold */
	new->entries[size - 1].eventfd = eventfd;
	new->entries[size - 1].threshold = threshold;

	/* Sort thresholds. Registering of new threshold isn't time-critical */
	sort(new->entries, size, sizeof(struct mem_cgroup_threshold),
			compare_thresholds, NULL);

	/* Find current threshold */
	new->current_threshold = -1;
	for (i = 0; i < size; i++) {
		if (new->entries[i].threshold < usage) {
			/*
			 * new->current_threshold will not be used until
			 * rcu_assign_pointer(), so it's safe to increment
			 * it here.
			 */
			++new->current_threshold;
		}
	}

	/* Free old spare buffer and save old primary buffer as spare */
	kfree(thresholds->spare);
	thresholds->spare = thresholds->primary;

	rcu_assign_pointer(thresholds->primary, new);

	/* To be sure that nobody uses thresholds */
	synchronize_rcu();

unlock:
	mutex_unlock(&memcg->thresholds_lock);

	return ret;
}
static void mem_cgroup_usage_unregister_event(struct cgroup *cgrp,
	struct cftype *cft, struct eventfd_ctx *eventfd)
{
	struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
	struct mem_cgroup_thresholds *thresholds;
	struct mem_cgroup_threshold_ary *new;
	int type = MEMFILE_TYPE(cft->private);
	u64 usage;
	int i, j, size;

	mutex_lock(&memcg->thresholds_lock);
	if (type == _MEM)
		thresholds = &memcg->thresholds;
	else if (type == _MEMSWAP)
		thresholds = &memcg->memsw_thresholds;
	else
		BUG();

	/*
	 * Something went wrong if we're trying to unregister a threshold
	 * when we don't have any thresholds.
	 */
	BUG_ON(!thresholds);

	usage = mem_cgroup_usage(memcg, type == _MEMSWAP);

	/* Check if a threshold crossed before removing */
	__mem_cgroup_threshold(memcg, type == _MEMSWAP);

	/* Calculate new number of thresholds */
	size = 0;
	for (i = 0; i < thresholds->primary->size; i++) {
		if (thresholds->primary->entries[i].eventfd != eventfd)
			size++;
	}

	new = thresholds->spare;

	/* Set thresholds array to NULL if we don't have thresholds */
	if (!size) {
		kfree(new);
		new = NULL;
		goto swap_buffers;
	}

	new->size = size;

	/* Copy thresholds and find current threshold */
	new->current_threshold = -1;
	for (i = 0, j = 0; i < thresholds->primary->size; i++) {
		if (thresholds->primary->entries[i].eventfd == eventfd)
			continue;

		new->entries[j] = thresholds->primary->entries[i];
		if (new->entries[j].threshold < usage) {
			/*
			 * new->current_threshold will not be used
			 * until rcu_assign_pointer(), so it's safe to
			 * increment it here.
			 */
			++new->current_threshold;
		}
		j++;
	}

swap_buffers:
	/* Swap primary and spare array */
	thresholds->spare = thresholds->primary;
	rcu_assign_pointer(thresholds->primary, new);

	/* To be sure that nobody uses thresholds */
	synchronize_rcu();

	mutex_unlock(&memcg->thresholds_lock);
}
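/*
 * memory.oom_control eventfd registration: listeners are signalled by
 * mem_cgroup_oom_notify() when the group enters OOM; a group already
 * under OOM signals the new listener immediately.
 */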
static int mem_cgroup_oom_register_event(struct cgroup *cgrp,
	struct cftype *cft, struct eventfd_ctx *eventfd, const char *args)
{
	struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
	struct mem_cgroup_eventfd_list *event;
	int type = MEMFILE_TYPE(cft->private);

	BUG_ON(type != _OOM_TYPE);
	event = kmalloc(sizeof(*event), GFP_KERNEL);
	if (!event)
		return -ENOMEM;

	mutex_lock(&memcg_oom_mutex);

	event->eventfd = eventfd;
	list_add(&event->list, &memcg->oom_notify);

	/* already in OOM ? */
	if (atomic_read(&memcg->oom_lock))
		eventfd_signal(eventfd, 1);
	mutex_unlock(&memcg_oom_mutex);

	return 0;
}
static void mem_cgroup_oom_unregister_event(struct cgroup *cgrp,
	struct cftype *cft, struct eventfd_ctx *eventfd)
{
	struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp);
	struct mem_cgroup_eventfd_list *ev, *tmp;
	int type = MEMFILE_TYPE(cft->private);

	BUG_ON(type != _OOM_TYPE);

	mutex_lock(&memcg_oom_mutex);

	list_for_each_entry_safe(ev, tmp, &mem->oom_notify, list) {
		if (ev->eventfd == eventfd) {
			list_del(&ev->list);
			kfree(ev);
		}
	}

	mutex_unlock(&memcg_oom_mutex);
}
static int mem_cgroup_oom_control_read(struct cgroup *cgrp,
	struct cftype *cft, struct cgroup_map_cb *cb)
{
	struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp);

	cb->fill(cb, "oom_kill_disable", mem->oom_kill_disable);

	if (atomic_read(&mem->oom_lock))
		cb->fill(cb, "under_oom", 1);
	else
		cb->fill(cb, "under_oom", 0);
	return 0;
}
static int mem_cgroup_oom_control_write(struct cgroup *cgrp,
	struct cftype *cft, u64 val)
{
	struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp);
	struct mem_cgroup *parent;

	/* cannot set to root cgroup and only 0 and 1 are allowed */
	if (!cgrp->parent || !((val == 0) || (val == 1)))
		return -EINVAL;

	parent = mem_cgroup_from_cont(cgrp->parent);

	cgroup_lock();
	/* oom-kill-disable is a flag for subhierarchy. */
	if ((parent->use_hierarchy) ||
	    (mem->use_hierarchy && !list_empty(&cgrp->children))) {
		cgroup_unlock();
		return -EINVAL;
	}
	mem->oom_kill_disable = val;
	if (!val)
		memcg_oom_recover(mem);
	cgroup_unlock();
	return 0;
}
#ifdef CONFIG_NUMA
static const struct file_operations mem_control_numa_stat_file_operations = {
	.read = seq_read,
	.llseek = seq_lseek,
	.release = single_release,
};

static int mem_control_numa_stat_open(struct inode *unused, struct file *file)
{
	struct cgroup *cont = file->f_dentry->d_parent->d_fsdata;

	file->f_op = &mem_control_numa_stat_file_operations;
	return single_open(file, mem_control_numa_stat_show, cont);
}
#endif /* CONFIG_NUMA */
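/*
 * Control files created in every memory cgroup directory. cgroup core
 * routes reads and writes to the handlers above; .private encodes the
 * (counter, attribute) pair via MEMFILE_PRIVATE().
 */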
static struct cftype mem_cgroup_files[] = {
	{
		.name = "usage_in_bytes",
		.private = MEMFILE_PRIVATE(_MEM, RES_USAGE),
		.read_u64 = mem_cgroup_read,
		.register_event = mem_cgroup_usage_register_event,
		.unregister_event = mem_cgroup_usage_unregister_event,
	},
	{
		.name = "max_usage_in_bytes",
		.private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE),
		.trigger = mem_cgroup_reset,
		.read_u64 = mem_cgroup_read,
	},
	{
		.name = "limit_in_bytes",
		.private = MEMFILE_PRIVATE(_MEM, RES_LIMIT),
		.write_string = mem_cgroup_write,
		.read_u64 = mem_cgroup_read,
	},
	{
		.name = "soft_limit_in_bytes",
		.private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT),
		.write_string = mem_cgroup_write,
		.read_u64 = mem_cgroup_read,
	},
	{
		.name = "failcnt",
		.private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT),
		.trigger = mem_cgroup_reset,
		.read_u64 = mem_cgroup_read,
	},
	{
		.name = "stat",
		.read_map = mem_control_stat_show,
	},
	{
		.name = "force_empty",
		.trigger = mem_cgroup_force_empty_write,
	},
	{
		.name = "use_hierarchy",
		.write_u64 = mem_cgroup_hierarchy_write,
		.read_u64 = mem_cgroup_hierarchy_read,
	},
	{
		.name = "swappiness",
		.read_u64 = mem_cgroup_swappiness_read,
		.write_u64 = mem_cgroup_swappiness_write,
	},
	{
		.name = "move_charge_at_immigrate",
		.read_u64 = mem_cgroup_move_charge_read,
		.write_u64 = mem_cgroup_move_charge_write,
	},
	{
		.name = "oom_control",
		.read_map = mem_cgroup_oom_control_read,
		.write_u64 = mem_cgroup_oom_control_write,
		.register_event = mem_cgroup_oom_register_event,
		.unregister_event = mem_cgroup_oom_unregister_event,
		.private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL),
	},
#ifdef CONFIG_NUMA
	{
		.name = "numa_stat",
		.open = mem_control_numa_stat_open,
		.mode = S_IRUGO,
	},
#endif
};
#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
static struct cftype memsw_cgroup_files[] = {
	{
		.name = "memsw.usage_in_bytes",
		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE),
		.read_u64 = mem_cgroup_read,
		.register_event = mem_cgroup_usage_register_event,
		.unregister_event = mem_cgroup_usage_unregister_event,
	},
	{
		.name = "memsw.max_usage_in_bytes",
		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE),
		.trigger = mem_cgroup_reset,
		.read_u64 = mem_cgroup_read,
	},
	{
		.name = "memsw.limit_in_bytes",
		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT),
		.write_string = mem_cgroup_write,
		.read_u64 = mem_cgroup_read,
	},
	{
		.name = "memsw.failcnt",
		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT),
		.trigger = mem_cgroup_reset,
		.read_u64 = mem_cgroup_read,
	},
};
static int register_memsw_files(struct cgroup *cont, struct cgroup_subsys *ss)
{
	if (!do_swap_account)
		return 0;
	return cgroup_add_files(cont, ss, memsw_cgroup_files,
				ARRAY_SIZE(memsw_cgroup_files));
}
#else
static int register_memsw_files(struct cgroup *cont, struct cgroup_subsys *ss)
{
	return 0;
}
#endif
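/*
 * Per-node bookkeeping: each memcg carries per-node, per-zone LRU
 * lists and reclaim statistics. Allocation covers every possible node
 * so memory hotplug keeps working; on nodes without normal memory,
 * kzalloc_node() falls back to any node (-1).
 */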

static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)
{
	struct mem_cgroup_per_node *pn;
	struct mem_cgroup_per_zone *mz;
	enum lru_list l;
	int zone, tmp = node;
	/*
	 * This routine is called against possible nodes.
	 * But it's BUG to call kmalloc() against offline node.
	 *
	 * TODO: this routine can waste much memory for nodes which will
	 *       never be onlined. It's better to use memory hotplug callback
	 *       function.
	 */
	if (!node_state(node, N_NORMAL_MEMORY))
		tmp = -1;
	pn = kzalloc_node(sizeof(*pn), GFP_KERNEL, tmp);
	if (!pn)
		return 1;

	mem->info.nodeinfo[node] = pn;
	for (zone = 0; zone < MAX_NR_ZONES; zone++) {
		mz = &pn->zoneinfo[zone];
		for_each_lru(l)
			INIT_LIST_HEAD(&mz->lists[l]);
		mz->usage_in_excess = 0;
		mz->on_tree = false;
		mz->mem = mem;
	}
	return 0;
}

static void free_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)
{
	kfree(mem->info.nodeinfo[node]);
}
static struct mem_cgroup *mem_cgroup_alloc(void)
{
	struct mem_cgroup *mem;
	int size = sizeof(struct mem_cgroup);

	/* Can be very big if MAX_NUMNODES is very big */
	if (size < PAGE_SIZE)
		mem = kzalloc(size, GFP_KERNEL);
	else
		mem = vzalloc(size);

	if (!mem)
		return NULL;

	mem->stat = alloc_percpu(struct mem_cgroup_stat_cpu);
	if (!mem->stat)
		goto out_free;
	spin_lock_init(&mem->pcp_counter_lock);
	return mem;

out_free:
	if (size < PAGE_SIZE)
		kfree(mem);
	else
		vfree(mem);
	return NULL;
}
/*
 * At destroying mem_cgroup, references from swap_cgroup can remain.
 * (scanning all at force_empty is too costly...)
 *
 * Instead of clearing all references at force_empty, we remember
 * the number of references from swap_cgroup and free mem_cgroup when
 * it goes down to 0.
 *
 * Removal of the cgroup itself succeeds regardless of refs from swap.
 */
static void __mem_cgroup_free(struct mem_cgroup *mem)
{
	int node;

	mem_cgroup_remove_from_trees(mem);
	free_css_id(&mem_cgroup_subsys, &mem->css);

	for_each_node_state(node, N_POSSIBLE)
		free_mem_cgroup_per_zone_info(mem, node);

	free_percpu(mem->stat);
	if (sizeof(struct mem_cgroup) < PAGE_SIZE)
		kfree(mem);
	else
		vfree(mem);
}
static void mem_cgroup_get(struct mem_cgroup *mem)
{
	atomic_inc(&mem->refcnt);
}

static void __mem_cgroup_put(struct mem_cgroup *mem, int count)
{
	if (atomic_sub_and_test(count, &mem->refcnt)) {
		struct mem_cgroup *parent = parent_mem_cgroup(mem);
		__mem_cgroup_free(mem);
		if (parent)
			mem_cgroup_put(parent);
	}
}

static void mem_cgroup_put(struct mem_cgroup *mem)
{
	__mem_cgroup_put(mem, 1);
}
/*
 * Returns the parent mem_cgroup of @mem, or NULL at the root of a
 * hierarchy (res counters are only chained when use_hierarchy is set).
 */
static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem)
{
	if (!mem->res.parent)
		return NULL;
	return mem_cgroup_from_res_counter(mem->res.parent, res);
}
#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
static void __init enable_swap_cgroup(void)
{
	if (!mem_cgroup_disabled() && really_do_swap_account)
		do_swap_account = 1;
}
#else
static void __init enable_swap_cgroup(void)
{
}
#endif
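/*
 * Soft-limit reclaim keeps one red-black tree per zone, ordering
 * memcgs by how far their usage exceeds the soft limit; set up the
 * per-node, per-zone roots once, when the root cgroup is created.
 */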
static int mem_cgroup_soft_limit_tree_init(void)
{
	struct mem_cgroup_tree_per_node *rtpn;
	struct mem_cgroup_tree_per_zone *rtpz;
	int tmp, node, zone;

	for_each_node_state(node, N_POSSIBLE) {
		tmp = node;
		if (!node_state(node, N_NORMAL_MEMORY))
			tmp = -1;
		rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, tmp);
		if (!rtpn)
			return 1;

		soft_limit_tree.rb_tree_per_node[node] = rtpn;

		for (zone = 0; zone < MAX_NR_ZONES; zone++) {
			rtpz = &rtpn->rb_tree_per_zone[zone];
			rtpz->rb_root = RB_ROOT;
			spin_lock_init(&rtpz->lock);
		}
	}
	return 0;
}
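/*
 * cgroup_subsys create callback. The first call (root cgroup) also
 * performs one-time setup: swap accounting, the soft-limit trees, the
 * per-cpu charge stock and its CPU-hotplug notifier.
 */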
static struct cgroup_subsys_state * __ref
mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
{
	struct mem_cgroup *mem, *parent;
	long error = -ENOMEM;
	int node;

	mem = mem_cgroup_alloc();
	if (!mem)
		return ERR_PTR(error);

	for_each_node_state(node, N_POSSIBLE)
		if (alloc_mem_cgroup_per_zone_info(mem, node))
			goto free_out;

	/* root ? */
	if (cont->parent == NULL) {
		int cpu;
		enable_swap_cgroup();
		parent = NULL;
		root_mem_cgroup = mem;
		if (mem_cgroup_soft_limit_tree_init())
			goto free_out;
		for_each_possible_cpu(cpu) {
			struct memcg_stock_pcp *stock =
						&per_cpu(memcg_stock, cpu);
			INIT_WORK(&stock->work, drain_local_stock);
		}
		hotcpu_notifier(memcg_cpu_hotplug_callback, 0);
	} else {
		parent = mem_cgroup_from_cont(cont->parent);
		mem->use_hierarchy = parent->use_hierarchy;
		mem->oom_kill_disable = parent->oom_kill_disable;
	}

	if (parent && parent->use_hierarchy) {
		res_counter_init(&mem->res, &parent->res);
		res_counter_init(&mem->memsw, &parent->memsw);
		/*
		 * We increment the refcnt of the parent to ensure that we can
		 * safely access it on res_counter_charge/uncharge.
		 * This refcnt will be decremented when freeing this
		 * mem_cgroup (see mem_cgroup_put).
		 */
		mem_cgroup_get(parent);
	} else {
		res_counter_init(&mem->res, NULL);
		res_counter_init(&mem->memsw, NULL);
	}
	mem->last_scanned_node = MAX_NUMNODES;
	INIT_LIST_HEAD(&mem->oom_notify);

	if (parent)
		mem->swappiness = get_swappiness(parent);