Newer
Older
static void memcg_check_events(struct mem_cgroup *memcg, struct page *page)
preempt_disable();
/* threshold event is triggered in finer grain than soft limit */
if (unlikely(mem_cgroup_event_ratelimit(memcg,
MEM_CGROUP_TARGET_THRESH))) {
bool do_softlimit;
bool do_numainfo __maybe_unused;
do_softlimit = mem_cgroup_event_ratelimit(memcg,
MEM_CGROUP_TARGET_SOFTLIMIT);
#if MAX_NUMNODES > 1
do_numainfo = mem_cgroup_event_ratelimit(memcg,
MEM_CGROUP_TARGET_NUMAINFO);
#endif
preempt_enable();
if (unlikely(do_softlimit))
mem_cgroup_update_tree(memcg, page);
#if MAX_NUMNODES > 1
if (unlikely(do_numainfo))
atomic_inc(&memcg->numainfo_events);
#endif
} else
preempt_enable();
struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont)
return mem_cgroup_from_css(
cgroup_subsys_state(cont, mem_cgroup_subsys_id));
struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
/*
* mm_update_next_owner() may clear mm->owner to NULL
* if it races with swapoff, page migration, etc.
* So this can be called with p == NULL.
*/
if (unlikely(!p))
return NULL;
return mem_cgroup_from_css(task_subsys_state(p, mem_cgroup_subsys_id));
struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm)
/*
* Because we have no locks, mm->owner's may be being moved to other
* cgroup. We use css_tryget() here even if this looks
* pessimistic (rather than adding locks here).
*/
rcu_read_lock();
do {
memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
if (unlikely(!memcg))
} while (!css_tryget(&memcg->css));
/**
* mem_cgroup_iter - iterate over memory cgroup hierarchy
* @root: hierarchy root
* @prev: previously returned memcg, NULL on first invocation
* @reclaim: cookie for shared reclaim walks, NULL for full walks
*
* Returns references to children of the hierarchy below @root, or
* @root itself, or %NULL after a full round-trip.
*
* Caller must pass the return value in @prev on subsequent
* invocations for reference counting, or use mem_cgroup_iter_break()
* to cancel a hierarchy walk before the round-trip is complete.
*
* Reclaimers can specify a zone and a priority level in @reclaim to
* divide up the memcgs in the hierarchy among all concurrent
* reclaimers operating on the same zone and priority.
*/
struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
struct mem_cgroup *prev,
struct mem_cgroup_reclaim_cookie *reclaim)
struct mem_cgroup *memcg = NULL;
int id = 0;
if (mem_cgroup_disabled())
return NULL;
if (!root)
root = root_mem_cgroup;
if (prev && !reclaim)
id = css_id(&prev->css);
if (!root->use_hierarchy && root != root_mem_cgroup) {
if (prev)
goto out_css_put;
return root;
}
while (!memcg) {
struct mem_cgroup_reclaim_iter *uninitialized_var(iter);
struct cgroup_subsys_state *css;
if (reclaim) {
int nid = zone_to_nid(reclaim->zone);
int zid = zone_idx(reclaim->zone);
struct mem_cgroup_per_zone *mz;
mz = mem_cgroup_zoneinfo(root, nid, zid);
iter = &mz->reclaim_iter[reclaim->priority];
if (prev && reclaim->generation != iter->generation)
goto out_css_put;
id = iter->position;
}
rcu_read_lock();
css = css_get_next(&mem_cgroup_subsys, id + 1, &root->css, &id);
if (css) {
if (css == &root->css || css_tryget(css))
} else
id = 0;
if (reclaim) {
iter->position = id;
if (!css)
iter->generation++;
else if (!prev && memcg)
reclaim->generation = iter->generation;
}
if (prev && !css)
goto out_css_put;
out_css_put:
if (prev && prev != root)
css_put(&prev->css);
/**
* mem_cgroup_iter_break - abort a hierarchy walk prematurely
* @root: hierarchy root
* @prev: last visited hierarchy member as returned by mem_cgroup_iter()
*/
void mem_cgroup_iter_break(struct mem_cgroup *root,
struct mem_cgroup *prev)
{
if (!root)
root = root_mem_cgroup;
if (prev && prev != root)
css_put(&prev->css);
}
/*
* Iteration constructs for visiting all cgroups (under a tree). If
* loops are exited prematurely (break), mem_cgroup_iter_break() must
* be used for reference counting.
*/
#define for_each_mem_cgroup_tree(iter, root) \
for (iter = mem_cgroup_iter(root, NULL, NULL); \
iter != NULL; \
iter = mem_cgroup_iter(root, iter, NULL))
#define for_each_mem_cgroup(iter) \
for (iter = mem_cgroup_iter(NULL, NULL, NULL); \
iter != NULL; \
iter = mem_cgroup_iter(NULL, iter, NULL))
void __mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx)
memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
if (unlikely(!memcg))
goto out;
switch (idx) {
case PGFAULT:
this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGFAULT]);
break;
case PGMAJFAULT:
this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGMAJFAULT]);
break;
default:
BUG();
}
out:
rcu_read_unlock();
}
EXPORT_SYMBOL(__mem_cgroup_count_vm_event);
/**
* mem_cgroup_zone_lruvec - get the lru list vector for a zone and memcg
* @zone: zone of the wanted lruvec
* @memcg: memcg of the wanted lruvec
*
* Returns the lru list vector holding pages for the given @zone and
* @mem. This can be the global zone lruvec, if the memory controller
* is disabled.
*/
struct lruvec *mem_cgroup_zone_lruvec(struct zone *zone,
struct mem_cgroup *memcg)
{
struct mem_cgroup_per_zone *mz;
if (mem_cgroup_disabled()) {
lruvec = &zone->lruvec;
goto out;
}
mz = mem_cgroup_zoneinfo(memcg, zone_to_nid(zone), zone_idx(zone));
lruvec = &mz->lruvec;
out:
/*
* Since a node can be onlined after the mem_cgroup was created,
* we have to be prepared to initialize lruvec->zone here;
* and if offlined then reonlined, we need to reinitialize it.
*/
if (unlikely(lruvec->zone != zone))
lruvec->zone = zone;
return lruvec;
/*
* Following LRU functions are allowed to be used without PCG_LOCK.
* Operations are called by routine of global LRU independently from memcg.
* What we have to take care of here is validness of pc->mem_cgroup.
*
* Changes to pc->mem_cgroup happens when
* 1. charge
* 2. moving account
* In typical case, "charge" is done before add-to-lru. Exception is SwapCache.
* It is added to LRU before charge.
* If PCG_USED bit is not set, page_cgroup is not added to this private LRU.
* When moving account, the page is not on LRU. It's isolated.
*/
* mem_cgroup_page_lruvec - return lruvec for adding an lru page
struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct zone *zone)
struct mem_cgroup *memcg;
struct page_cgroup *pc;

KAMEZAWA Hiroyuki
committed
if (mem_cgroup_disabled()) {
lruvec = &zone->lruvec;
goto out;
}
* Surreptitiously switch any uncharged offlist page to root:
* an uncharged page off lru does nothing to secure
* its former mem_cgroup from sudden removal.
*
* Our caller holds lru_lock, and PageCgroupUsed is updated
* under page_cgroup lock: between them, they make all uses
* of pc->mem_cgroup safe.
*/
if (!PageLRU(page) && !PageCgroupUsed(pc) && memcg != root_mem_cgroup)
pc->mem_cgroup = memcg = root_mem_cgroup;
mz = page_cgroup_zoneinfo(memcg, page);
lruvec = &mz->lruvec;
out:
/*
* Since a node can be onlined after the mem_cgroup was created,
* we have to be prepared to initialize lruvec->zone here;
* and if offlined then reonlined, we need to reinitialize it.
*/
if (unlikely(lruvec->zone != zone))
lruvec->zone = zone;
return lruvec;
* mem_cgroup_update_lru_size - account for adding or removing an lru page
* @lruvec: mem_cgroup per zone lru vector
* @lru: index of lru list the page is sitting on
* @nr_pages: positive when adding or negative when removing
* This function must be called when a page is added to or removed from an
* lru list.
void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru,
int nr_pages)
{
struct mem_cgroup_per_zone *mz;
if (mem_cgroup_disabled())
return;
mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec);
lru_size = mz->lru_size + lru;
*lru_size += nr_pages;
VM_BUG_ON((long)(*lru_size) < 0);
* Checks whether given mem is same or in the root_mem_cgroup's
* hierarchy subtree
*/

Johannes Weiner
committed
bool __mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg,
struct mem_cgroup *memcg)
if (root_memcg == memcg)
return true;
if (!root_memcg->use_hierarchy || !memcg)
return false;

Johannes Weiner
committed
return css_is_ancestor(&memcg->css, &root_memcg->css);
}
static bool mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg,
struct mem_cgroup *memcg)
{
bool ret;
rcu_read_lock();

Johannes Weiner
committed
ret = __mem_cgroup_same_or_subtree(root_memcg, memcg);
rcu_read_unlock();
return ret;
int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *memcg)
struct task_struct *p;
p = find_lock_task_mm(task);

David Rientjes
committed
if (p) {
curr = try_get_mem_cgroup_from_mm(p->mm);
task_unlock(p);
} else {
/*
* All threads may have already detached their mm's, but the oom
* killer still needs to detect if they have already been oom
* killed to prevent needlessly killing additional tasks.
*/
task_lock(task);
curr = mem_cgroup_from_task(task);
if (curr)
css_get(&curr->css);
task_unlock(task);
}
* We should check use_hierarchy of "memcg" not "curr". Because checking
* use_hierarchy of "curr" here make this function true if hierarchy is
* enabled in "curr" and "curr" is a child of "memcg" in *cgroup*
* hierarchy(even if use_hierarchy is disabled in "memcg").
ret = mem_cgroup_same_or_subtree(memcg, curr);
return ret;
}
int mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec)
unsigned long inactive_ratio;
unsigned long active;
inactive = mem_cgroup_get_lru_size(lruvec, LRU_INACTIVE_ANON);
active = mem_cgroup_get_lru_size(lruvec, LRU_ACTIVE_ANON);
gb = (inactive + active) >> (30 - PAGE_SHIFT);
if (gb)
inactive_ratio = int_sqrt(10 * gb);
else
inactive_ratio = 1;
return inactive * inactive_ratio < active;
#define mem_cgroup_from_res_counter(counter, member) \
container_of(counter, struct mem_cgroup, member)
/**
* mem_cgroup_margin - calculate chargeable space of a memory cgroup
*
* Returns the maximum amount of memory @mem can be charged with, in
*/
static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg)
{
unsigned long long margin;
margin = res_counter_margin(&memcg->res);
if (do_swap_account)
margin = min(margin, res_counter_margin(&memcg->memsw));
return margin >> PAGE_SHIFT;
}
int mem_cgroup_swappiness(struct mem_cgroup *memcg)
{
struct cgroup *cgrp = memcg->css.cgroup;
/* root ? */
if (cgrp->parent == NULL)
return vm_swappiness;
/*
* memcg->moving_account is used for checking possibility that some thread is
* calling move_account(). When a thread on CPU-A starts moving pages under
* a memcg, other threads should check memcg->moving_account under
* rcu_read_lock(), like this:
*
* CPU-A CPU-B
* rcu_read_lock()
* memcg->moving_account+1 if (memcg->mocing_account)
* take heavy locks.
* synchronize_rcu() update something.
* rcu_read_unlock()
* start move here.
*/
/* for quick checking without looking up memcg */
atomic_t memcg_moving __read_mostly;
static void mem_cgroup_start_move(struct mem_cgroup *memcg)

KAMEZAWA Hiroyuki
committed
{
atomic_inc(&memcg_moving);
atomic_inc(&memcg->moving_account);

KAMEZAWA Hiroyuki
committed
synchronize_rcu();
}
static void mem_cgroup_end_move(struct mem_cgroup *memcg)

KAMEZAWA Hiroyuki
committed
{
/*
* Now, mem_cgroup_clear_mc() may call this function with NULL.
* We check NULL in callee rather than caller.
*/
if (memcg) {
atomic_dec(&memcg_moving);
atomic_dec(&memcg->moving_account);

KAMEZAWA Hiroyuki
committed
}

KAMEZAWA Hiroyuki
committed
/*
* 2 routines for checking "mem" is under move_account() or not.
*
* mem_cgroup_stolen() - checking whether a cgroup is mc.from or not. This
* is used for avoiding races in accounting. If true,

KAMEZAWA Hiroyuki
committed
* pc->mem_cgroup may be overwritten.
*
* mem_cgroup_under_move() - checking a cgroup is mc.from or mc.to or
* under hierarchy of moving cgroups. This is for
* waiting at hith-memory prressure caused by "move".
*/
static bool mem_cgroup_stolen(struct mem_cgroup *memcg)

KAMEZAWA Hiroyuki
committed
{
VM_BUG_ON(!rcu_read_lock_held());
return atomic_read(&memcg->moving_account) > 0;

KAMEZAWA Hiroyuki
committed
}
static bool mem_cgroup_under_move(struct mem_cgroup *memcg)
struct mem_cgroup *from;
struct mem_cgroup *to;
/*
* Unlike task_move routines, we access mc.to, mc.from not under
* mutual exclusion by cgroup_mutex. Here, we take spinlock instead.
*/
spin_lock(&mc.lock);
from = mc.from;
to = mc.to;
if (!from)
goto unlock;
ret = mem_cgroup_same_or_subtree(memcg, from)
|| mem_cgroup_same_or_subtree(memcg, to);
unlock:
spin_unlock(&mc.lock);
static bool mem_cgroup_wait_acct_move(struct mem_cgroup *memcg)
{
if (mc.moving_task && current != mc.moving_task) {
if (mem_cgroup_under_move(memcg)) {
DEFINE_WAIT(wait);
prepare_to_wait(&mc.waitq, &wait, TASK_INTERRUPTIBLE);
/* moving charge context might have finished. */
if (mc.moving_task)
schedule();
finish_wait(&mc.waitq, &wait);
return true;
}
}
return false;
}
/*
* Take this lock when
* - a code tries to modify page's memcg while it's USED.
* - a code tries to modify page state accounting in a memcg.
*/
static void move_lock_mem_cgroup(struct mem_cgroup *memcg,
unsigned long *flags)
{
spin_lock_irqsave(&memcg->move_lock, *flags);
}
static void move_unlock_mem_cgroup(struct mem_cgroup *memcg,
unsigned long *flags)
{
spin_unlock_irqrestore(&memcg->move_lock, *flags);
}
#define K(x) ((x) << (PAGE_SHIFT-10))
* mem_cgroup_print_oom_info: Print OOM information relevant to memory controller.
* @memcg: The memory cgroup that went over limit
* @p: Task that is going to be killed
*
* NOTE: @memcg and @p's mem_cgroup can be different when hierarchy is
* enabled
*/
void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
{
struct cgroup *task_cgrp;
struct cgroup *mem_cgrp;
/*
* Need a buffer in BSS, can't rely on allocations. The code relies
* on the assumption that OOM is serialized for memory controller.
* If this assumption is broken, revisit this code.
*/
static char memcg_name[PATH_MAX];
int ret;
struct mem_cgroup *iter;
unsigned int i;
if (!p)
return;
rcu_read_lock();
mem_cgrp = memcg->css.cgroup;
task_cgrp = task_cgroup(p, mem_cgroup_subsys_id);
ret = cgroup_path(task_cgrp, memcg_name, PATH_MAX);
if (ret < 0) {
/*
* Unfortunately, we are unable to convert to a useful name
* But we'll still print out the usage information
*/
rcu_read_unlock();
goto done;
}
rcu_read_unlock();
pr_info("Task in %s killed", memcg_name);
rcu_read_lock();
ret = cgroup_path(mem_cgrp, memcg_name, PATH_MAX);
if (ret < 0) {
rcu_read_unlock();
goto done;
}
rcu_read_unlock();
/*
* Continues from above, so we don't need an KERN_ level
*/
pr_cont(" as a result of limit of %s\n", memcg_name);
pr_info("memory: usage %llukB, limit %llukB, failcnt %llu\n",
res_counter_read_u64(&memcg->res, RES_USAGE) >> 10,
res_counter_read_u64(&memcg->res, RES_LIMIT) >> 10,
res_counter_read_u64(&memcg->res, RES_FAILCNT));
pr_info("memory+swap: usage %llukB, limit %llukB, failcnt %llu\n",
res_counter_read_u64(&memcg->memsw, RES_USAGE) >> 10,
res_counter_read_u64(&memcg->memsw, RES_LIMIT) >> 10,
res_counter_read_u64(&memcg->memsw, RES_FAILCNT));
pr_info("kmem: usage %llukB, limit %llukB, failcnt %llu\n",
res_counter_read_u64(&memcg->kmem, RES_USAGE) >> 10,
res_counter_read_u64(&memcg->kmem, RES_LIMIT) >> 10,
res_counter_read_u64(&memcg->kmem, RES_FAILCNT));
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
for_each_mem_cgroup_tree(iter, memcg) {
pr_info("Memory cgroup stats");
rcu_read_lock();
ret = cgroup_path(iter->css.cgroup, memcg_name, PATH_MAX);
if (!ret)
pr_cont(" for %s", memcg_name);
rcu_read_unlock();
pr_cont(":");
for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account)
continue;
pr_cont(" %s:%ldKB", mem_cgroup_stat_names[i],
K(mem_cgroup_read_stat(iter, i)));
}
for (i = 0; i < NR_LRU_LISTS; i++)
pr_cont(" %s:%luKB", mem_cgroup_lru_names[i],
K(mem_cgroup_nr_lru_pages(iter, BIT(i))));
pr_cont("\n");
}
/*
* This function returns the number of memcg under hierarchy tree. Returns
* 1(self count) if no children.
*/
static int mem_cgroup_count_children(struct mem_cgroup *memcg)
{
int num = 0;
for_each_mem_cgroup_tree(iter, memcg)
return num;
}
/*
* Return the memory (and swap, if configured) limit for a memcg.
*/
static u64 mem_cgroup_get_limit(struct mem_cgroup *memcg)
limit = res_counter_read_u64(&memcg->res, RES_LIMIT);
* Do not consider swap space if we cannot swap due to swappiness
if (mem_cgroup_swappiness(memcg)) {
u64 memsw;
limit += total_swap_pages << PAGE_SHIFT;
memsw = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
/*
* If memsw is finite and limits the amount of swap space
* available to this memcg, return that limit.
*/
limit = min(limit, memsw);
}
return limit;
static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
int order)
{
struct mem_cgroup *iter;
unsigned long chosen_points = 0;
unsigned long totalpages;
unsigned int points = 0;
struct task_struct *chosen = NULL;
/*
* If current has a pending SIGKILL, then automatically select it. The
* goal is to allow it to allocate so that it may quickly exit and free
* its memory.
*/
if (fatal_signal_pending(current)) {
set_thread_flag(TIF_MEMDIE);
return;
}
check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, order, NULL);
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
totalpages = mem_cgroup_get_limit(memcg) >> PAGE_SHIFT ? : 1;
for_each_mem_cgroup_tree(iter, memcg) {
struct cgroup *cgroup = iter->css.cgroup;
struct cgroup_iter it;
struct task_struct *task;
cgroup_iter_start(cgroup, &it);
while ((task = cgroup_iter_next(cgroup, &it))) {
switch (oom_scan_process_thread(task, totalpages, NULL,
false)) {
case OOM_SCAN_SELECT:
if (chosen)
put_task_struct(chosen);
chosen = task;
chosen_points = ULONG_MAX;
get_task_struct(chosen);
/* fall through */
case OOM_SCAN_CONTINUE:
continue;
case OOM_SCAN_ABORT:
cgroup_iter_end(cgroup, &it);
mem_cgroup_iter_break(memcg, iter);
if (chosen)
put_task_struct(chosen);
return;
case OOM_SCAN_OK:
break;
};
points = oom_badness(task, memcg, NULL, totalpages);
if (points > chosen_points) {
if (chosen)
put_task_struct(chosen);
chosen = task;
chosen_points = points;
get_task_struct(chosen);
}
}
cgroup_iter_end(cgroup, &it);
}
if (!chosen)
return;
points = chosen_points * 1000 / totalpages;
oom_kill_process(chosen, gfp_mask, order, points, totalpages, memcg,
NULL, "Memory cgroup out of memory");
}
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
static unsigned long mem_cgroup_reclaim(struct mem_cgroup *memcg,
gfp_t gfp_mask,
unsigned long flags)
{
unsigned long total = 0;
bool noswap = false;
int loop;
if (flags & MEM_CGROUP_RECLAIM_NOSWAP)
noswap = true;
if (!(flags & MEM_CGROUP_RECLAIM_SHRINK) && memcg->memsw_is_minimum)
noswap = true;
for (loop = 0; loop < MEM_CGROUP_MAX_RECLAIM_LOOPS; loop++) {
if (loop)
drain_all_stock_async(memcg);
total += try_to_free_mem_cgroup_pages(memcg, gfp_mask, noswap);
/*
* Allow limit shrinkers, which are triggered directly
* by userspace, to catch signals and stop reclaim
* after minimal progress, regardless of the margin.
*/
if (total && (flags & MEM_CGROUP_RECLAIM_SHRINK))
break;
if (mem_cgroup_margin(memcg))
break;
/*
* If nothing was reclaimed after two attempts, there
* may be no reclaimable pages in this hierarchy.
*/
if (loop && !total)
break;
}
return total;
}
/**
* test_mem_cgroup_node_reclaimable
* @nid: the node ID to be checked.
* @noswap : specify true here if the user wants flle only information.
*
* This function returns whether the specified memcg contains any
* reclaimable pages on a node. Returns true if there are any reclaimable
* pages in the node.
*/
static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *memcg,
int nid, bool noswap)
{
if (mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL_FILE))
return true;
if (noswap || !total_swap_pages)
return false;
if (mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL_ANON))
return true;
return false;
}
#if MAX_NUMNODES > 1
/*
* Always updating the nodemask is not very good - even if we have an empty
* list or the wrong list here, we can start from some node and traverse all
* nodes based on the zonelist. So update the list loosely once per 10 secs.
*
*/
static void mem_cgroup_may_update_nodemask(struct mem_cgroup *memcg)
/*
* numainfo_events > 0 means there was at least NUMAINFO_EVENTS_TARGET
* pagein/pageout changes since the last update.
*/
if (!atomic_read(&memcg->numainfo_events))
return;
if (atomic_inc_return(&memcg->numainfo_updating) > 1)
return;
/* make a nodemask where this memcg uses memory from */
memcg->scan_nodes = node_states[N_MEMORY];
for_each_node_mask(nid, node_states[N_MEMORY]) {
if (!test_mem_cgroup_node_reclaimable(memcg, nid, false))
node_clear(nid, memcg->scan_nodes);
atomic_set(&memcg->numainfo_events, 0);
atomic_set(&memcg->numainfo_updating, 0);
}
/*
* Selecting a node where we start reclaim from. Because what we need is just
* reducing usage counter, start from anywhere is O,K. Considering
* memory reclaim from current node, there are pros. and cons.
*
* Freeing memory from current node means freeing memory from a node which
* we'll use or we've used. So, it may make LRU bad. And if several threads
* hit limits, it will see a contention on a node. But freeing from remote
* node means more costs for memory reclaim because of memory latency.
*
* Now, we use round-robin. Better algorithm is welcomed.
*/
int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
mem_cgroup_may_update_nodemask(memcg);
node = memcg->last_scanned_node;
node = next_node(node, memcg->scan_nodes);
if (node == MAX_NUMNODES)
node = first_node(memcg->scan_nodes);
/*
* We call this when we hit limit, not when pages are added to LRU.
* No LRU may hold pages because all pages are UNEVICTABLE or
* memcg is too small and all pages are not on LRU. In that case,
* we use curret node.
*/
if (unlikely(node == MAX_NUMNODES))
node = numa_node_id();
return node;
}
/*
* Check all nodes whether it contains reclaimable pages or not.
* For quick scan, we make use of scan_nodes. This will allow us to skip
* unused nodes. But scan_nodes is lazily updated and may not cotain
* enough new information. We need to do double check.
*/
static bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap)
{
int nid;
/*
* quick check...making use of scan_node.
* We can skip unused nodes.
*/
if (!nodes_empty(memcg->scan_nodes)) {
for (nid = first_node(memcg->scan_nodes);
nid = next_node(nid, memcg->scan_nodes)) {
if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap))
return true;
}
}
/*
* Check rest of nodes.
*/
for_each_node_state(nid, N_MEMORY) {
if (node_isset(nid, memcg->scan_nodes))
if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap))
return true;
}
return false;
}
int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
{
return 0;
}
static bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap)
return test_mem_cgroup_node_reclaimable(memcg, 0, noswap);
static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg,
struct zone *zone,
gfp_t gfp_mask,
unsigned long *total_scanned)
struct mem_cgroup *victim = NULL;
int total = 0;
unsigned long excess;
unsigned long nr_scanned;
struct mem_cgroup_reclaim_cookie reclaim = {
.zone = zone,
.priority = 0,
};
excess = res_counter_soft_limit_excess(&root_memcg->res) >> PAGE_SHIFT;
victim = mem_cgroup_iter(root_memcg, victim, &reclaim);
if (loop >= 2) {
/*
* If we have not been able to reclaim
* anything, it might because there are
* no reclaimable pages under this hierarchy
*/
* excess >> 2 is not to excessive so as to
* reclaim too much, nor too less that we keep
* coming back to reclaim from this cgroup
*/
if (total >= (excess >> 2) ||
(loop > MEM_CGROUP_MAX_RECLAIM_LOOPS))
if (!mem_cgroup_reclaimable(victim, false))
total += mem_cgroup_shrink_node_zone(victim, gfp_mask, false,
zone, &nr_scanned);
*total_scanned += nr_scanned;
if (!res_counter_soft_limit_excess(&root_memcg->res))
mem_cgroup_iter_break(root_memcg, victim);
/*
* Check OOM-Killer is already running under our hierarchy.
* If someone is running, return false.
* Has to be called with memcg_oom_lock
static bool mem_cgroup_oom_lock(struct mem_cgroup *memcg)
struct mem_cgroup *iter, *failed = NULL;
for_each_mem_cgroup_tree(iter, memcg) {
/*
* this subtree of our hierarchy is already locked
* so we cannot give a lock.
*/
failed = iter;
mem_cgroup_iter_break(memcg, iter);
break;
} else
iter->oom_lock = true;
/*
* OK, we failed to lock the whole subtree so we have to clean up
* what we set up to the failing subtree