Newer
Older
u64 memlimit, memswlimit, oldusage, curusage;
int children = mem_cgroup_count_children(memcg);
int ret = -EBUSY;
/* see mem_cgroup_resize_res_limit */
retry_count = children * MEM_CGROUP_RECLAIM_RETRIES;
oldusage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
while (retry_count) {
if (signal_pending(current)) {
ret = -EINTR;
break;
}
/*
* Rather than hide all in some function, I do this in
* open coded manner. You see what this really does.
* We have to guarantee mem->res.limit < mem->memsw.limit.
*/
mutex_lock(&set_limit_mutex);
memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT);
if (memlimit > val) {
ret = -EINVAL;
mutex_unlock(&set_limit_mutex);
break;
}
memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
if (memswlimit < val)
enlarge = 1;
ret = res_counter_set_limit(&memcg->memsw, val);
if (!ret) {
if (memlimit == val)
memcg->memsw_is_minimum = true;
else
memcg->memsw_is_minimum = false;
}
mutex_unlock(&set_limit_mutex);
if (!ret)
break;
mem_cgroup_hierarchical_reclaim(memcg, NULL, GFP_KERNEL,
MEM_CGROUP_RECLAIM_NOSWAP |
MEM_CGROUP_RECLAIM_SHRINK);
curusage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
/* Usage is reduced ? */
else
oldusage = curusage;
if (!ret && enlarge)
memcg_oom_recover(memcg);
/**
 * mem_cgroup_soft_limit_reclaim - reclaim from memcgs on a zone's soft-limit tree
 * @zone: zone to reclaim from; its node id and zone index select the tree
 * @order: allocation order; for any order above 0 nothing is done
 * @gfp_mask: passed unchanged to mem_cgroup_hierarchical_reclaim()
 *
 * Each pass takes a candidate from the tree, runs hierarchical reclaim on it
 * with MEM_CGROUP_RECLAIM_SOFT, then removes the candidate from the tree and
 * re-inserts it with its freshly computed soft-limit excess.  The loop stops
 * as soon as anything has been reclaimed, when no further candidate exists,
 * or after more than MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS fruitless passes.
 *
 * Returns the sum of the values returned by mem_cgroup_hierarchical_reclaim()
 * (0 when @order > 0).
 */
unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
					    gfp_t gfp_mask)
{
	unsigned long nr_reclaimed = 0;
	struct mem_cgroup_per_zone *mz, *next_mz = NULL;
	unsigned long reclaimed;
	int loop = 0;
	struct mem_cgroup_tree_per_zone *mctz;
	unsigned long long excess;

	if (order > 0)
		return 0;

	mctz = soft_limit_tree_node_zone(zone_to_nid(zone), zone_idx(zone));
	/*
	 * This loop can run a while, specially if mem_cgroup's continuously
	 * keep exceeding their soft limit and putting the system under
	 * pressure
	 */
	do {
		if (next_mz)
			mz = next_mz;
		else
			mz = mem_cgroup_largest_soft_limit_node(mctz);
		if (!mz)
			break;

		reclaimed = mem_cgroup_hierarchical_reclaim(mz->mem, zone,
						gfp_mask,
						MEM_CGROUP_RECLAIM_SOFT);
		nr_reclaimed += reclaimed;
		spin_lock(&mctz->lock);

		/*
		 * If we failed to reclaim anything from this memory cgroup
		 * it is time to move on to the next cgroup
		 */
		next_mz = NULL;
		if (!reclaimed) {
			do {
				/*
				 * Loop until we find yet another one.
				 *
				 * By the time we get the soft_limit lock
				 * again, someone might have added the
				 * group back on the RB tree. Iterate to
				 * make sure we get a different mem.
				 * mem_cgroup_largest_soft_limit_node returns
				 * NULL if no other cgroup is present on
				 * the tree
				 */
				next_mz =
				__mem_cgroup_largest_soft_limit_node(mctz);
				if (next_mz == mz) {
					css_put(&next_mz->mem->css);
					next_mz = NULL;
				} else /* next_mz == NULL or other memcg */
					break;
			} while (1);
		}
		__mem_cgroup_remove_exceeded(mz->mem, mz, mctz);
		excess = res_counter_soft_limit_excess(&mz->mem->res);
		/*
		 * One school of thought says that we should not add
		 * back the node to the tree if reclaim returns 0.
		 * But our reclaim could return 0, simply because due
		 * to priority we are exposing a smaller subset of
		 * memory to reclaim from. Consider this as a longer
		 * term TODO.
		 */
		/* If excess == 0, no tree ops */
		__mem_cgroup_insert_exceeded(mz->mem, mz, mctz, excess);
		spin_unlock(&mctz->lock);
		/*
		 * NOTE(review): this drops a css reference on mz->mem that is
		 * presumably taken by the *_largest_soft_limit_node() lookup;
		 * confirm against those helpers.
		 */
		css_put(&mz->mem->css);
		loop++;
		/*
		 * Could not reclaim anything and there are no more
		 * mem cgroups to try or we seem to be looping without
		 * reclaiming anything.
		 */
		if (!nr_reclaimed &&
			(next_mz == NULL ||
			loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS))
			break;
	} while (!nr_reclaimed);

	/* A successor picked in the last pass but never visited is released. */
	if (next_mz)
		css_put(&next_mz->mem->css);
	return nr_reclaimed;
}

KAMEZAWA Hiroyuki
committed
/*
 * This routine traverses the page_cgroup entries on the given list and
 * drops them all by handing each one to mem_cgroup_move_parent().
 * *And* this routine doesn't reclaim the page itself, it just removes the
 * page_cgroup.
 *
 * Returns -EBUSY when the last pass succeeded (ret == 0) but the list is
 * still not empty; otherwise returns the last value of ret.
 *
 * NOTE(review): this copy of the function is incomplete. The rest of the
 * parameter list, the declarations of node/zid/lru/loop/ret/flags, the lock
 * calls matching the spin_unlock_irqrestore() calls, the statement guarded by
 * "if (ret == -ENOMEM)" and several closing braces are not present, and the
 * "KAMEZAWA Hiroyuki / committed" lines are not C (they look like blame-view
 * residue). Restore from the upstream source before relying on this text.
 */
static int mem_cgroup_force_empty_list(struct mem_cgroup *mem,

KAMEZAWA Hiroyuki
committed
{
struct zone *zone;
struct mem_cgroup_per_zone *mz;
struct page_cgroup *pc, *busy;

KAMEZAWA Hiroyuki
committed
struct list_head *list;

KAMEZAWA Hiroyuki
committed
zone = &NODE_DATA(node)->node_zones[zid];
mz = mem_cgroup_zoneinfo(mem, node, zid);
list = &mz->lists[lru];

KAMEZAWA Hiroyuki
committed
/* Bound the number of passes by the list's current count plus a margin. */
loop = MEM_CGROUP_ZSTAT(mz, lru);
/* give some margin against EBUSY etc...*/
loop += 256;
busy = NULL;
while (loop--) {
ret = 0;
if (list_empty(list)) {
spin_unlock_irqrestore(&zone->lru_lock, flags);
}
/* Always work on the entry at the tail of the list. */
pc = list_entry(list->prev, struct page_cgroup, lru);
if (busy == pc) {
/* Same entry failed last time: rotate it to the head and retry later. */
list_move(&pc->lru, list);
busy = NULL;
spin_unlock_irqrestore(&zone->lru_lock, flags);
spin_unlock_irqrestore(&zone->lru_lock, flags);
ret = mem_cgroup_move_parent(pc, mem, GFP_KERNEL);
if (ret == -ENOMEM)
if (ret == -EBUSY || ret == -EINVAL) {
/* found lock contention or "pc" is obsolete. */
busy = pc;
cond_resched();
} else
busy = NULL;

KAMEZAWA Hiroyuki
committed
}
if (!ret && !list_empty(list))
return -EBUSY;
return ret;

KAMEZAWA Hiroyuki
committed
}
/*
 * Make mem_cgroup's charge 0 if there is no task.
 * This enables deleting this mem_cgroup.
 *
 * @mem:      memory cgroup to empty; a css reference is held for the
 *            duration of the call (css_get() on entry, css_put() at "out").
 * @free_all: when true, jump straight to the try_to_free path instead of
 *            first moving charges list by list.
 *
 * Returns -EINTR when a signal is pending and -EBUSY when the cgroup still
 * has tasks or children (or the try_to_free path is entered a second time);
 * otherwise the last value stored in ret.
 *
 * NOTE(review): this copy of the function is incomplete. The "do {" matching
 * the "} while (...)" below, the "try_to_free:" label targeted by two gotos,
 * the declaration of "l", the remaining arguments of the
 * mem_cgroup_force_empty_list() call, the assignment made before the first
 * "goto out" and the end of the function are not present; the
 * "KAMEZAWA Hiroyuki / committed" lines are not C. Restore from upstream.
 */
static int mem_cgroup_force_empty(struct mem_cgroup *mem, bool free_all)

KAMEZAWA Hiroyuki
committed
{
int ret;
int node, zid, shrink;
int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
struct cgroup *cgrp = mem->css.cgroup;

KAMEZAWA Hiroyuki
committed
css_get(&mem->css);
/* should free all ? */
if (free_all)
goto try_to_free;
/* Refuse to move charges while the cgroup still has tasks or children. */
if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children))
goto out;
ret = -EINTR;
if (signal_pending(current))

KAMEZAWA Hiroyuki
committed
goto out;
/* This is for making all *used* pages to be on LRU. */
lru_add_drain_all();
drain_all_stock_sync();

KAMEZAWA Hiroyuki
committed
/* Walk every node, zone and LRU list, stopping at the first error. */
mem_cgroup_start_move(mem);
for_each_node_state(node, N_HIGH_MEMORY) {
for (zid = 0; !ret && zid < MAX_NR_ZONES; zid++) {
for_each_lru(l) {
ret = mem_cgroup_force_empty_list(mem,
if (ret)
break;
}

KAMEZAWA Hiroyuki
committed
}
if (ret)
break;
}

KAMEZAWA Hiroyuki
committed
mem_cgroup_end_move(mem);
/* it seems parent cgroup doesn't have enough mem */
if (ret == -ENOMEM)
goto try_to_free;
/* "ret" should also be checked to ensure all lists are empty. */
} while (mem->res.usage > 0 || ret);

KAMEZAWA Hiroyuki
committed
out:
css_put(&mem->css);
return ret;
/* returns EBUSY if there is a task or if we come here twice. */
if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children) || shrink) {
ret = -EBUSY;
goto out;
}
/* we call try-to-free pages for make this cgroup empty */
lru_add_drain_all();
/* try to free all pages in this cgroup */
shrink = 1;
while (nr_retries && mem->res.usage > 0) {
int progress;
if (signal_pending(current)) {
ret = -EINTR;
goto out;
}
progress = try_to_free_mem_cgroup_pages(mem, GFP_KERNEL,
false, get_swappiness(mem));
/* maybe some writeback is necessary */
congestion_wait(BLK_RW_ASYNC, HZ/10);
/* try move_account...there may be some *locked* pages. */

KAMEZAWA Hiroyuki
committed
}
/*
 * Write handler: empties the memory cgroup behind @cont.  @event is not
 * looked at; free_all is always requested on this path.
 * Returns whatever mem_cgroup_force_empty() returns.
 */
int mem_cgroup_force_empty_write(struct cgroup *cont, unsigned int event)
{
	struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);

	return mem_cgroup_force_empty(memcg, true);
}
static u64 mem_cgroup_hierarchy_read(struct cgroup *cont, struct cftype *cft)
{
return mem_cgroup_from_cont(cont)->use_hierarchy;
}
/*
 * Write handler for use_hierarchy.
 *
 * If the parent's use_hierarchy is set, no modification is allowed in the
 * child subtree.  If it is unset (or there is no parent, i.e. this is the
 * root cgroup), the value may change provided this cgroup has no children.
 *
 * Returns 0 on success, -EINVAL when the parent uses hierarchy or @val is
 * neither 0 nor 1, and -EBUSY when the cgroup has children.
 */
static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft,
				      u64 val)
{
	struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
	struct mem_cgroup *parent_memcg = NULL;
	int err;

	if (cont->parent)
		parent_memcg = mem_cgroup_from_cont(cont->parent);

	/* All checks and the update itself happen under the cgroup lock. */
	cgroup_lock();
	if ((parent_memcg && parent_memcg->use_hierarchy) || val > 1) {
		err = -EINVAL;
	} else if (!list_empty(&cont->children)) {
		err = -EBUSY;
	} else {
		memcg->use_hierarchy = val;
		err = 0;
	}
	cgroup_unlock();

	return err;
}
static u64 mem_cgroup_get_recursive_idx_stat(struct mem_cgroup *mem,
enum mem_cgroup_stat_index idx)
struct mem_cgroup *iter;
s64 val = 0;
/* each per cpu's value can be minus.Then, use s64 */
for_each_mem_cgroup_tree(iter, mem)
val += mem_cgroup_read_stat(iter, idx);
if (val < 0) /* race ? */
val = 0;
return val;
/*
 * Returns the usage of @mem in bytes; with @swap true the memory+swap
 * figure is returned instead of the memory-only one.
 *
 * For a non-root cgroup the value is read from the res_counter (res or
 * memsw).  For the root cgroup it is built from the recursive CACHE and RSS
 * statistics (plus SWAPOUT when @swap is set) and shifted by PAGE_SHIFT.
 */
static inline u64 mem_cgroup_usage(struct mem_cgroup *mem, bool swap)
{
	u64 val;

	if (!mem_cgroup_is_root(mem)) {
		if (!swap)
			return res_counter_read_u64(&mem->res, RES_USAGE);
		else
			return res_counter_read_u64(&mem->memsw, RES_USAGE);
	}

	val = mem_cgroup_get_recursive_idx_stat(mem, MEM_CGROUP_STAT_CACHE);
	val += mem_cgroup_get_recursive_idx_stat(mem, MEM_CGROUP_STAT_RSS);

	if (swap)
		val += mem_cgroup_get_recursive_idx_stat(mem,
				MEM_CGROUP_STAT_SWAPOUT);

	return val << PAGE_SHIFT;
}
static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft)
struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
int type, name;
type = MEMFILE_TYPE(cft->private);
name = MEMFILE_ATTR(cft->private);
switch (type) {
case _MEM:
if (name == RES_USAGE)
val = mem_cgroup_usage(mem, false);
else
val = res_counter_read_u64(&mem->res, name);
if (name == RES_USAGE)
val = mem_cgroup_usage(mem, true);
else
val = res_counter_read_u64(&mem->memsw, name);
break;
default:
BUG();
break;
}
return val;
/*
* The user of this function is...
* RES_LIMIT.
*/

Paul Menage
committed
static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft,
const char *buffer)
struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
unsigned long long val;
int ret;
type = MEMFILE_TYPE(cft->private);
name = MEMFILE_ATTR(cft->private);
switch (name) {
if (mem_cgroup_is_root(memcg)) { /* Can't set limit on root */
ret = -EINVAL;
break;
}
/* This function does all necessary parse...reuse it */
ret = res_counter_memparse_write_strategy(buffer, &val);
if (ret)
break;
if (type == _MEM)
ret = mem_cgroup_resize_limit(memcg, val);
else
ret = mem_cgroup_resize_memsw_limit(memcg, val);
case RES_SOFT_LIMIT:
ret = res_counter_memparse_write_strategy(buffer, &val);
if (ret)
break;
/*
* For memsw, soft limits are hard to implement in terms
* of semantics, for now, we support soft limits for
* control without swap
*/
if (type == _MEM)
ret = res_counter_set_soft_limit(&memcg->res, val);
else
ret = -EINVAL;
break;
default:
ret = -EINVAL; /* should be BUG() ? */
break;
}
return ret;
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
/*
 * Computes the effective limits of @memcg: the smallest RES_LIMIT of the
 * memory counter and of the memory+swap counter found on @memcg itself and
 * on its ancestors.
 *
 * Ancestors are only consulted when @memcg has use_hierarchy set, and the
 * upward walk stops at the first ancestor that does not use hierarchy.
 * Results are stored through @mem_limit and @memsw_limit.
 */
static void memcg_get_hierarchical_limit(struct mem_cgroup *memcg,
		unsigned long long *mem_limit, unsigned long long *memsw_limit)
{
	unsigned long long lowest = res_counter_read_u64(&memcg->res, RES_LIMIT);
	unsigned long long lowest_memsw =
		res_counter_read_u64(&memcg->memsw, RES_LIMIT);
	struct cgroup *cg = memcg->css.cgroup;

	if (memcg->use_hierarchy) {
		for (cg = cg->parent; cg; cg = cg->parent) {
			struct mem_cgroup *ancestor = mem_cgroup_from_cont(cg);
			unsigned long long cur;

			if (!ancestor->use_hierarchy)
				break;

			cur = res_counter_read_u64(&ancestor->res, RES_LIMIT);
			if (cur < lowest)
				lowest = cur;

			cur = res_counter_read_u64(&ancestor->memsw, RES_LIMIT);
			if (cur < lowest_memsw)
				lowest_memsw = cur;
		}
	}

	*mem_limit = lowest;
	*memsw_limit = lowest_memsw;
}
/*
 * Reset handler.  @event encodes the counter (MEMFILE_TYPE) and the
 * attribute (MEMFILE_ATTR).  The visible statements reset either the
 * recorded maximum (res_counter_reset_max) or the failure count
 * (res_counter_reset_failcnt) of the memory counter for _MEM, and of the
 * memory+swap counter otherwise.
 *
 * NOTE(review): this copy is incomplete. The declarations of type/name, the
 * case labels and break statements inside the switch, the return statement
 * and the closing braces are not present, so which attribute selects which
 * reset cannot be read from here. Restore from upstream.
 */
static int mem_cgroup_reset(struct cgroup *cont, unsigned int event)
{
struct mem_cgroup *mem;
mem = mem_cgroup_from_cont(cont);
type = MEMFILE_TYPE(event);
name = MEMFILE_ATTR(event);
switch (name) {
if (type == _MEM)
res_counter_reset_max(&mem->res);
else
res_counter_reset_max(&mem->memsw);
if (type == _MEM)
res_counter_reset_failcnt(&mem->res);
else
res_counter_reset_failcnt(&mem->memsw);
static u64 mem_cgroup_move_charge_read(struct cgroup *cgrp,
struct cftype *cft)
{
return mem_cgroup_from_cont(cgrp)->move_charge_at_immigrate;
}
/*
 * Write handler for move_charge_at_immigrate.
 *
 * @val is a bitmask; any value with a bit at or above NR_MOVE_TYPE set is
 * rejected with -EINVAL.  Otherwise the value is stored and 0 is returned.
 */
static int mem_cgroup_move_charge_write(struct cgroup *cgrp,
					struct cftype *cft, u64 val)
{
	struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);

	if (val < (1 << NR_MOVE_TYPE)) {
		/*
		 * We check this value several times in both in can_attach()
		 * and attach(), so we need cgroup lock to prevent this value
		 * from being inconsistent.
		 */
		cgroup_lock();
		memcg->move_charge_at_immigrate = val;
		cgroup_unlock();
		return 0;
	}

	return -EINVAL;
}
#else
/*
 * Variant built in the #else branch of the surrounding conditional:
 * the setting cannot be written here, so every call fails with -ENOSYS.
 */
static int mem_cgroup_move_charge_write(struct cgroup *cgrp,
					struct cftype *cft, u64 val)
{
	return -ENOSYS;
}
#endif
/*
 * For read statistics.
 *
 * Indices into mcs_total_stat.stat[] and memcg_stat_strings[]; the order of
 * the enumerators must match the order of the rows in memcg_stat_strings[].
 */
enum {
	MCS_CACHE,
	MCS_RSS,
	MCS_FILE_MAPPED,
	MCS_PGPGIN,
	MCS_PGPGOUT,
	MCS_SWAP,
	MCS_INACTIVE_ANON,
	MCS_ACTIVE_ANON,
	MCS_INACTIVE_FILE,
	MCS_ACTIVE_FILE,
	MCS_UNEVICTABLE,
	NR_MCS_STAT,
};

/* One accumulator per MCS_* index. */
struct mcs_total_stat {
	s64 stat[NR_MCS_STAT];
};

/*
 * Names reported for each MCS_* index: local_name for the cgroup's own
 * value, total_name for the value summed over its hierarchy.
 */
struct {
	char *local_name;
	char *total_name;
} memcg_stat_strings[NR_MCS_STAT] = {
	{"cache", "total_cache"},
	{"rss", "total_rss"},
	{"mapped_file", "total_mapped_file"},
	{"pgpgin", "total_pgpgin"},
	{"pgpgout", "total_pgpgout"},
	{"swap", "total_swap"},
	{"inactive_anon", "total_inactive_anon"},
	{"active_anon", "total_active_anon"},
	{"inactive_file", "total_inactive_file"},
	{"active_file", "total_active_file"},
	{"unevictable", "total_unevictable"}
};
/*
 * Adds the statistics of @mem itself into @s.
 *
 * Per-cpu counters are read with mem_cgroup_read_stat() and per-LRU counts
 * with mem_cgroup_get_local_zonestat(); the values that are accumulated
 * below are scaled by PAGE_SIZE before being added to s->stat[].
 *
 * NOTE(review): this copy is incomplete. The values read for CACHE, RSS,
 * PGPGIN_COUNT and PGPGOUT_COUNT are overwritten by the next read without
 * ever being stored in s->stat[], and the '}' after the MCS_SWAP line has no
 * matching opening brace (a conditional around the swap statistic appears to
 * be missing). Restore from upstream.
 */
static void
mem_cgroup_get_local_stat(struct mem_cgroup *mem, struct mcs_total_stat *s)
{
s64 val;
/* per cpu stat */
val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_CACHE);
val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_RSS);
val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_FILE_MAPPED);
s->stat[MCS_FILE_MAPPED] += val * PAGE_SIZE;
val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_PGPGIN_COUNT);
val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_PGPGOUT_COUNT);
val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_SWAPOUT);
s->stat[MCS_SWAP] += val * PAGE_SIZE;
}
/* per zone stat */
val = mem_cgroup_get_local_zonestat(mem, LRU_INACTIVE_ANON);
s->stat[MCS_INACTIVE_ANON] += val * PAGE_SIZE;
val = mem_cgroup_get_local_zonestat(mem, LRU_ACTIVE_ANON);
s->stat[MCS_ACTIVE_ANON] += val * PAGE_SIZE;
val = mem_cgroup_get_local_zonestat(mem, LRU_INACTIVE_FILE);
s->stat[MCS_INACTIVE_FILE] += val * PAGE_SIZE;
val = mem_cgroup_get_local_zonestat(mem, LRU_ACTIVE_FILE);
s->stat[MCS_ACTIVE_FILE] += val * PAGE_SIZE;
val = mem_cgroup_get_local_zonestat(mem, LRU_UNEVICTABLE);
s->stat[MCS_UNEVICTABLE] += val * PAGE_SIZE;
}
/*
 * Accumulates into @s the local statistics of every memory cgroup visited
 * by for_each_mem_cgroup_tree() starting at @mem.  @s is only added to, so
 * the caller is expected to have initialised it.
 */
static void
mem_cgroup_get_total_stat(struct mem_cgroup *mem, struct mcs_total_stat *s)
{
	struct mem_cgroup *iter;

	for_each_mem_cgroup_tree(iter, mem)
		mem_cgroup_get_local_stat(iter, s);
}
/*
 * Fills the stat map @cb for the memory cgroup behind @cont.
 *
 * In order: the cgroup's local statistics (local_name keys), the
 * hierarchical memory / memory+swap limits, the hierarchy totals
 * (total_name keys), "inactive_ratio", and the recent rotated/scanned
 * reclaim counters summed over all online nodes and zones.  The MCS_SWAP
 * entry and the memsw limit are only emitted when do_swap_account is set.
 * Returns 0.
 *
 * NOTE(review): this copy is incomplete. The declarations of mystat and i,
 * the closing braces of both for loops, and the conditional that the #endif
 * below closes are not present; the bare numeric lines are not C (they look
 * like line-number residue). Restore from upstream.
 */
static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft,
struct cgroup_map_cb *cb)
{
struct mem_cgroup *mem_cont = mem_cgroup_from_cont(cont);
/* Local statistics of this cgroup only. */
memset(&mystat, 0, sizeof(mystat));
mem_cgroup_get_local_stat(mem_cont, &mystat);
for (i = 0; i < NR_MCS_STAT; i++) {
if (i == MCS_SWAP && !do_swap_account)
continue;
cb->fill(cb, memcg_stat_strings[i].local_name, mystat.stat[i]);
{
unsigned long long limit, memsw_limit;
memcg_get_hierarchical_limit(mem_cont, &limit, &memsw_limit);
cb->fill(cb, "hierarchical_memory_limit", limit);
if (do_swap_account)
cb->fill(cb, "hierarchical_memsw_limit", memsw_limit);
}
/* Totals over the hierarchy; mystat is reused after being cleared. */
memset(&mystat, 0, sizeof(mystat));
mem_cgroup_get_total_stat(mem_cont, &mystat);
for (i = 0; i < NR_MCS_STAT; i++) {
if (i == MCS_SWAP && !do_swap_account)
continue;
cb->fill(cb, memcg_stat_strings[i].total_name, mystat.stat[i]);
cb->fill(cb, "inactive_ratio", calc_inactive_ratio(mem_cont, NULL));
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
{
int nid, zid;
struct mem_cgroup_per_zone *mz;
unsigned long recent_rotated[2] = {0, 0};
unsigned long recent_scanned[2] = {0, 0};
/* Index 0 is reported as "anon", index 1 as "file" below. */
for_each_online_node(nid)
for (zid = 0; zid < MAX_NR_ZONES; zid++) {
mz = mem_cgroup_zoneinfo(mem_cont, nid, zid);
recent_rotated[0] +=
mz->reclaim_stat.recent_rotated[0];
recent_rotated[1] +=
mz->reclaim_stat.recent_rotated[1];
recent_scanned[0] +=
mz->reclaim_stat.recent_scanned[0];
recent_scanned[1] +=
mz->reclaim_stat.recent_scanned[1];
}
cb->fill(cb, "recent_rotated_anon", recent_rotated[0]);
cb->fill(cb, "recent_rotated_file", recent_rotated[1]);
cb->fill(cb, "recent_scanned_anon", recent_scanned[0]);
cb->fill(cb, "recent_scanned_file", recent_scanned[1]);
}
#endif
return 0;
}
/*
 * Read handler: returns get_swappiness() of the memory cgroup behind
 * @cgrp.  @cft is unused.
 */
static u64 mem_cgroup_swappiness_read(struct cgroup *cgrp, struct cftype *cft)
{
	return get_swappiness(mem_cgroup_from_cont(cgrp));
}
/*
 * Write handler for swappiness.
 *
 * Values above 100 and writes to a cgroup without a parent (the root) are
 * rejected with -EINVAL.  The new value is stored in memcg->swappiness
 * under reclaim_param_lock.
 *
 * NOTE(review): this copy is incomplete. There is a cgroup_unlock() with no
 * visible cgroup_lock(), the body of the hierarchy check has no return
 * statement or closing brace, and the end of the function (final unlock,
 * return value, closing brace) is not present. Restore from upstream.
 */
static int mem_cgroup_swappiness_write(struct cgroup *cgrp, struct cftype *cft,
u64 val)
{
struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
struct mem_cgroup *parent;
if (val > 100)
return -EINVAL;
if (cgrp->parent == NULL)
return -EINVAL;
parent = mem_cgroup_from_cont(cgrp->parent);
/* If under hierarchy, only empty-root can set this value */
if ((parent->use_hierarchy) ||
(memcg->use_hierarchy && !list_empty(&cgrp->children))) {
cgroup_unlock();
spin_lock(&memcg->reclaim_param_lock);
memcg->swappiness = val;
spin_unlock(&memcg->reclaim_param_lock);
/*
 * Signals the eventfds of all thresholds of @memcg that the current usage
 * has crossed since the last call.  @swap selects the memory+swap
 * thresholds and usage instead of the memory-only ones.
 *
 * The threshold array is read under rcu_read_lock(); if no array is
 * installed the function just unlocks and returns.
 *
 * NOTE(review): this copy is incomplete. The "else" between the two
 * rcu_dereference() assignments, the initialisation of i from the current
 * threshold index and the statement that stores the updated index (only its
 * comment "Update current_threshold" remains) are not present; the bare
 * numeric lines are not C. Restore from upstream.
 */
static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap)
{
struct mem_cgroup_threshold_ary *t;
u64 usage;
int i;
rcu_read_lock();
if (!swap)
t = rcu_dereference(memcg->thresholds.primary);
t = rcu_dereference(memcg->memsw_thresholds.primary);
if (!t)
goto unlock;
usage = mem_cgroup_usage(memcg, swap);
/*
* current_threshold points to threshold just below usage.
* If it's not true, a threshold was crossed after last
* call of __mem_cgroup_threshold().
*/
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
/*
* Iterate backward over array of thresholds starting from
* current_threshold and check if a threshold is crossed.
* If none of thresholds below usage is crossed, we read
* only one element of the array here.
*/
for (; i >= 0 && unlikely(t->entries[i].threshold > usage); i--)
eventfd_signal(t->entries[i].eventfd, 1);
/* i = current_threshold + 1 */
i++;
/*
* Iterate forward over array of thresholds starting from
* current_threshold+1 and check if a threshold is crossed.
* If none of thresholds above usage is crossed, we read
* only one element of the array here.
*/
for (; i < t->size && unlikely(t->entries[i].threshold <= usage); i++)
eventfd_signal(t->entries[i].eventfd, 1);
/* Update current_threshold */
unlock:
rcu_read_unlock();
}
/*
 * Runs the threshold check on @memcg and then on each cgroup returned by
 * parent_mem_cgroup() in turn, until that returns NULL.  The memory+swap
 * thresholds are only checked when do_swap_account is set.
 */
static void mem_cgroup_threshold(struct mem_cgroup *memcg)
{
	for (; memcg; memcg = parent_mem_cgroup(memcg)) {
		__mem_cgroup_threshold(memcg, false);
		if (do_swap_account)
			__mem_cgroup_threshold(memcg, true);
	}
}
/*
 * sort() comparator ordering struct mem_cgroup_threshold entries by
 * ascending threshold.
 *
 * Returns a negative value, 0 or a positive value when @a's threshold is
 * lower than, equal to or higher than @b's.
 *
 * The thresholds are compared explicitly rather than subtracted: they are
 * 64-bit byte counts, and narrowing their difference to int can yield the
 * wrong sign (or 0) for values that differ by 2^31 or more, which would
 * leave the array mis-sorted.
 */
static int compare_thresholds(const void *a, const void *b)
{
	const struct mem_cgroup_threshold *_a = a;
	const struct mem_cgroup_threshold *_b = b;

	if (_a->threshold > _b->threshold)
		return 1;

	if (_a->threshold < _b->threshold)
		return -1;

	return 0;
}
static int mem_cgroup_oom_notify_cb(struct mem_cgroup *mem)
{
struct mem_cgroup_eventfd_list *ev;
list_for_each_entry(ev, &mem->oom_notify, list)
eventfd_signal(ev->eventfd, 1);
return 0;
}
static void mem_cgroup_oom_notify(struct mem_cgroup *mem)
{
struct mem_cgroup *iter;
for_each_mem_cgroup_tree(iter, mem)
mem_cgroup_oom_notify_cb(iter);
}
/*
 * Registers @eventfd to be signalled when usage crosses the threshold
 * parsed from @args (res_counter_memparse_write_strategy()).
 *
 * Under memcg->thresholds_lock a new array one entry larger than the
 * current primary array is allocated, the existing entries are copied, the
 * new entry is appended and the array is sorted with compare_thresholds().
 * The old primary array is kept as the spare, the new one is published with
 * rcu_assign_pointer() and synchronize_rcu() waits for readers.
 *
 * Returns the parse error if @args is invalid; otherwise ret.
 *
 * NOTE(review): this copy is incomplete. The declarations of size, i and
 * ret, the if/else-if selecting between thresholds and memsw_thresholds,
 * the rest of the kmalloc() call and its failure check, the loop that finds
 * the current threshold and several closing braces are not present; some
 * comment lines have lost their opening. Restore from upstream.
 */
static int mem_cgroup_usage_register_event(struct cgroup *cgrp,
struct cftype *cft, struct eventfd_ctx *eventfd, const char *args)
{
struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
struct mem_cgroup_thresholds *thresholds;
struct mem_cgroup_threshold_ary *new;
int type = MEMFILE_TYPE(cft->private);
u64 threshold, usage;
ret = res_counter_memparse_write_strategy(args, &threshold);
if (ret)
return ret;
mutex_lock(&memcg->thresholds_lock);
thresholds = &memcg->thresholds;
thresholds = &memcg->memsw_thresholds;
else
BUG();
usage = mem_cgroup_usage(memcg, type == _MEMSWAP);
/* Check if a threshold crossed before adding a new one */
__mem_cgroup_threshold(memcg, type == _MEMSWAP);
size = thresholds->primary ? thresholds->primary->size + 1 : 1;
/* Allocate memory for new array of thresholds */
new = kmalloc(sizeof(*new) + size * sizeof(struct mem_cgroup_threshold),
ret = -ENOMEM;
goto unlock;
}
/* Copy thresholds (if any) to new array */
if (thresholds->primary) {
memcpy(new->entries, thresholds->primary->entries, (size - 1) *
sizeof(struct mem_cgroup_threshold));
new->entries[size - 1].eventfd = eventfd;
new->entries[size - 1].threshold = threshold;
/* Sort thresholds. Registering of new threshold isn't time-critical */
sort(new->entries, size, sizeof(struct mem_cgroup_threshold),
compare_thresholds, NULL);
/* Find current threshold */
if (new->entries[i].threshold < usage) {
* new->current_threshold will not be used until
* rcu_assign_pointer(), so it's safe to increment
/* Free old spare buffer and save old primary buffer as spare */
kfree(thresholds->spare);
thresholds->spare = thresholds->primary;
rcu_assign_pointer(thresholds->primary, new);
/* To be sure that nobody uses thresholds */
synchronize_rcu();
unlock:
mutex_unlock(&memcg->thresholds_lock);
return ret;
}
/*
 * Removes every threshold registered with @eventfd.
 *
 * Under memcg->thresholds_lock the entries of the primary array that do not
 * belong to @eventfd are copied into a replacement array, the old primary
 * becomes the spare, the replacement is published with rcu_assign_pointer()
 * and synchronize_rcu() waits for readers.
 *
 * NOTE(review): this copy is incomplete. The declarations of size, i and j,
 * the else-if selecting memsw_thresholds, the statement that counts the
 * remaining entries, the assignment of "new" (including the empty-array
 * case), the "continue"/index updates in the copy loop and several closing
 * braces are not present; some comment lines have lost their opening.
 * Restore from upstream.
 */
static void mem_cgroup_usage_unregister_event(struct cgroup *cgrp,
struct cftype *cft, struct eventfd_ctx *eventfd)
{
struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
struct mem_cgroup_thresholds *thresholds;
struct mem_cgroup_threshold_ary *new;
int type = MEMFILE_TYPE(cft->private);
u64 usage;
mutex_lock(&memcg->thresholds_lock);
if (type == _MEM)
thresholds = &memcg->thresholds;
thresholds = &memcg->memsw_thresholds;
else
BUG();
/*
* Something went wrong if we trying to unregister a threshold
* if we don't have thresholds
*/
BUG_ON(!thresholds);
usage = mem_cgroup_usage(memcg, type == _MEMSWAP);
/* Check if a threshold crossed before removing */
__mem_cgroup_threshold(memcg, type == _MEMSWAP);
/* Calculate new number of threshold */
size = 0;
for (i = 0; i < thresholds->primary->size; i++) {
if (thresholds->primary->entries[i].eventfd != eventfd)
/* Set thresholds array to NULL if we don't have thresholds */
if (!size) {
goto swap_buffers;
/* Copy thresholds and find current threshold */
new->current_threshold = -1;
for (i = 0, j = 0; i < thresholds->primary->size; i++) {
if (thresholds->primary->entries[i].eventfd == eventfd)
new->entries[j] = thresholds->primary->entries[i];
if (new->entries[j].threshold < usage) {
* new->current_threshold will not be used
* until rcu_assign_pointer(), so it's safe to increment
* it here.
*/
swap_buffers:
/* Swap primary and spare array */
thresholds->spare = thresholds->primary;
rcu_assign_pointer(thresholds->primary, new);
/* To be sure that nobody uses thresholds */
synchronize_rcu();
mutex_unlock(&memcg->thresholds_lock);
}
3963
3964
3965
3966
3967
3968
3969
3970
3971
3972
3973
3974
3975
3976
3977
3978
3979
3980
3981
3982
3983
3984
3985
3986
3987
static int mem_cgroup_oom_register_event(struct cgroup *cgrp,
struct cftype *cft, struct eventfd_ctx *eventfd, const char *args)
{
struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
struct mem_cgroup_eventfd_list *event;
int type = MEMFILE_TYPE(cft->private);
BUG_ON(type != _OOM_TYPE);
event = kmalloc(sizeof(*event), GFP_KERNEL);
if (!event)
return -ENOMEM;
mutex_lock(&memcg_oom_mutex);
event->eventfd = eventfd;
list_add(&event->list, &memcg->oom_notify);
/* already in OOM ? */
if (atomic_read(&memcg->oom_lock))
eventfd_signal(eventfd, 1);
mutex_unlock(&memcg_oom_mutex);
return 0;
}
static void mem_cgroup_oom_unregister_event(struct cgroup *cgrp,
struct cftype *cft, struct eventfd_ctx *eventfd)
{
struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp);
struct mem_cgroup_eventfd_list *ev, *tmp;
int type = MEMFILE_TYPE(cft->private);
BUG_ON(type != _OOM_TYPE);
mutex_lock(&memcg_oom_mutex);
list_for_each_entry_safe(ev, tmp, &mem->oom_notify, list) {
if (ev->eventfd == eventfd) {