Newer
Older
/* memcontrol.c - Memory Controller
*
* Copyright IBM Corporation, 2007
* Author Balbir Singh <balbir@linux.vnet.ibm.com>
*
* Copyright 2007 OpenVZ SWsoft Inc
* Author: Pavel Emelianov <xemul@openvz.org>
*
* Memory thresholds
* Copyright (C) 2009 Nokia Corporation
* Author: Kirill A. Shutemov
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*/
#include <linux/res_counter.h>
#include <linux/memcontrol.h>
#include <linux/cgroup.h>

KAMEZAWA Hiroyuki
committed
#include <linux/smp.h>
#include <linux/backing-dev.h>
#include <linux/bit_spinlock.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/eventfd.h>
#include <linux/sort.h>
#include <linux/seq_file.h>
#include <linux/vmalloc.h>
#include <linux/mm_inline.h>
#include <linux/page_cgroup.h>
#include <linux/oom.h>
#include <asm/uaccess.h>
#include <trace/events/vmscan.h>
struct cgroup_subsys mem_cgroup_subsys __read_mostly;
#define MEM_CGROUP_RECLAIM_RETRIES 5
struct mem_cgroup *root_mem_cgroup __read_mostly;
#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
/* Turned on only when memory cgroup is enabled && really_do_swap_account = 1 */
int do_swap_account __read_mostly;
/* for remember boot option*/
#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP_ENABLED
static int really_do_swap_account __initdata = 1;
#else
static int really_do_swap_account __initdata = 0;
#endif
#else
#define do_swap_account (0)
#endif

KAMEZAWA Hiroyuki
committed
/*
* Statistics for memory cgroup.
*/
enum mem_cgroup_stat_index {
/*
* For MEM_CONTAINER_TYPE_ALL, usage = pagecache + rss.
*/
MEM_CGROUP_STAT_CACHE, /* # of pages charged as cache */
MEM_CGROUP_STAT_RSS, /* # of pages charged as anon rss */
MEM_CGROUP_STAT_FILE_MAPPED, /* # of pages charged as file rss */
MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */
MEM_CGROUP_STAT_DATA, /* end of data requires synchronization */

KAMEZAWA Hiroyuki
committed
MEM_CGROUP_ON_MOVE, /* someone is moving account between groups */

KAMEZAWA Hiroyuki
committed
MEM_CGROUP_STAT_NSTATS,
};
enum mem_cgroup_events_index {
MEM_CGROUP_EVENTS_PGPGIN, /* # of pages paged in */
MEM_CGROUP_EVENTS_PGPGOUT, /* # of pages paged out */
MEM_CGROUP_EVENTS_COUNT, /* # of pages paged in/out */
MEM_CGROUP_EVENTS_PGFAULT, /* # of page-faults */
MEM_CGROUP_EVENTS_PGMAJFAULT, /* # of major page-faults */
MEM_CGROUP_EVENTS_NSTATS,
};
/*
* Per memcg event counter is incremented at every pagein/pageout. With THP,
* it will be incremated by the number of pages. This counter is used for
* for trigger some periodic events. This is straightforward and better
* than using jiffies etc. to handle periodic memcg event.
*/
enum mem_cgroup_events_target {
MEM_CGROUP_TARGET_THRESH,
MEM_CGROUP_TARGET_SOFTLIMIT,
MEM_CGROUP_TARGET_NUMAINFO,
MEM_CGROUP_NTARGETS,
};
#define THRESHOLDS_EVENTS_TARGET (128)
#define SOFTLIMIT_EVENTS_TARGET (1024)
#define NUMAINFO_EVENTS_TARGET (1024)

KAMEZAWA Hiroyuki
committed
struct mem_cgroup_stat_cpu {
long count[MEM_CGROUP_STAT_NSTATS];
unsigned long events[MEM_CGROUP_EVENTS_NSTATS];
unsigned long targets[MEM_CGROUP_NTARGETS];

KAMEZAWA Hiroyuki
committed
};

KAMEZAWA Hiroyuki
committed
/*
* per-zone information in memory controller.
*/
struct mem_cgroup_per_zone {

KAMEZAWA Hiroyuki
committed
/*
* spin_lock to protect the per cgroup LRU
*/
struct list_head lists[NR_LRU_LISTS];
unsigned long count[NR_LRU_LISTS];
struct zone_reclaim_stat reclaim_stat;
struct rb_node tree_node; /* RB tree node */
unsigned long long usage_in_excess;/* Set to the value by which */
/* the soft limit is exceeded*/
bool on_tree;
struct mem_cgroup *mem; /* Back pointer, we cannot */
/* use container_of */

KAMEZAWA Hiroyuki
committed
};
/* Macro for accessing counter */
#define MEM_CGROUP_ZSTAT(mz, idx) ((mz)->count[(idx)])
struct mem_cgroup_per_node {
struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES];
};
struct mem_cgroup_lru_info {
struct mem_cgroup_per_node *nodeinfo[MAX_NUMNODES];
};
/*
* Cgroups above their limits are maintained in a RB-Tree, independent of
* their hierarchy representation
*/
struct mem_cgroup_tree_per_zone {
struct rb_root rb_root;
spinlock_t lock;
};
struct mem_cgroup_tree_per_node {
struct mem_cgroup_tree_per_zone rb_tree_per_zone[MAX_NR_ZONES];
};
struct mem_cgroup_tree {
struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES];
};
static struct mem_cgroup_tree soft_limit_tree __read_mostly;
struct mem_cgroup_threshold {
struct eventfd_ctx *eventfd;
u64 threshold;
};
struct mem_cgroup_threshold_ary {
/* An array index points to threshold just below usage. */
/* Size of entries[] */
unsigned int size;
/* Array of thresholds */
struct mem_cgroup_threshold entries[0];
};
struct mem_cgroup_thresholds {
/* Primary thresholds array */
struct mem_cgroup_threshold_ary *primary;
/*
* Spare threshold array.
* This is needed to make mem_cgroup_unregister_event() "never fail".
* It must be able to store at least primary->size - 1 entries.
*/
struct mem_cgroup_threshold_ary *spare;
};
/* for OOM */
struct mem_cgroup_eventfd_list {
struct list_head list;
struct eventfd_ctx *eventfd;
};
static void mem_cgroup_threshold(struct mem_cgroup *mem);
static void mem_cgroup_oom_notify(struct mem_cgroup *mem);
/*
* The memory controller data structure. The memory controller controls both
* page cache and RSS per cgroup. We would eventually like to provide
* statistics based on the statistics developed by Rik Van Riel for clock-pro,
* to help the administrator determine what knobs to tune.
*
* TODO: Add a water mark for the memory controller. Reclaim will begin when
* we hit the water mark. May be even add a low water mark, such that
* no reclaim occurs from a cgroup at it's low water mark, this is
* a feature that will be implemented much later in the future.
*/
struct mem_cgroup {
struct cgroup_subsys_state css;
/*
* the counter to account for memory usage
*/
struct res_counter res;
/*
* the counter to account for mem+swap usage.
*/
struct res_counter memsw;
/*
* Per cgroup active and inactive list, similar to the
* per zone LRU lists.
*/

KAMEZAWA Hiroyuki
committed
struct mem_cgroup_lru_info info;
* While reclaiming in a hierarchy, we cache the last child we
int last_scanned_node;
#if MAX_NUMNODES > 1
nodemask_t scan_nodes;
atomic_t numainfo_events;
atomic_t numainfo_updating;
/*
* Should the accounting and control be hierarchical, per subtree?
*/
bool use_hierarchy;
/* OOM-Killer disable */
int oom_kill_disable;
/* set when res.limit == memsw.limit */
bool memsw_is_minimum;
/* protect arrays of thresholds */
struct mutex thresholds_lock;
/* thresholds for memory usage. RCU-protected */
struct mem_cgroup_thresholds thresholds;
/* thresholds for mem+swap usage. RCU-protected */
struct mem_cgroup_thresholds memsw_thresholds;
/* For oom notifier event fd */
struct list_head oom_notify;
/*
* Should we move charges of a task when a task is moved into this
* mem_cgroup ? And what type of charges should we move ?
*/
unsigned long move_charge_at_immigrate;

KAMEZAWA Hiroyuki
committed
/*
* percpu counter.

KAMEZAWA Hiroyuki
committed
*/
struct mem_cgroup_stat_cpu *stat;
/*
* used when a cpu is offlined or other synchronizations
* See mem_cgroup_read_stat().
*/
struct mem_cgroup_stat_cpu nocpu_base;
spinlock_t pcp_counter_lock;
/* Stuffs for move charges at task migration. */
/*
* Types of charges to be moved. "move_charge_at_immitgrate" is treated as a
* left-shifted bitmap of these types.
*/
enum move_type {
MOVE_CHARGE_TYPE_ANON, /* private anonymous page and swap of it */
MOVE_CHARGE_TYPE_FILE, /* file page(including tmpfs) and swap of it */
NR_MOVE_TYPE,
};
/* "mc" and its members are protected by cgroup_mutex */
static struct move_charge_struct {
spinlock_t lock; /* for from, to */
struct mem_cgroup *from;
struct mem_cgroup *to;
unsigned long precharge;
unsigned long moved_charge;
unsigned long moved_swap;
struct task_struct *moving_task; /* a task moving charges */
wait_queue_head_t waitq; /* a waitq for other context */
} mc = {
.lock = __SPIN_LOCK_UNLOCKED(mc.lock),
.waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq),
};
static bool move_anon(void)
{
return test_bit(MOVE_CHARGE_TYPE_ANON,
&mc.to->move_charge_at_immigrate);
}
static bool move_file(void)
{
return test_bit(MOVE_CHARGE_TYPE_FILE,
&mc.to->move_charge_at_immigrate);
}
/*
* Maximum loops in mem_cgroup_hierarchical_reclaim(), used for soft
* limit reclaim to prevent infinite loops, if they ever occur.
*/
#define MEM_CGROUP_MAX_RECLAIM_LOOPS (100)
#define MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS (2)
enum charge_type {
MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
MEM_CGROUP_CHARGE_TYPE_MAPPED,
MEM_CGROUP_CHARGE_TYPE_SHMEM, /* used by page migration of shmem */
MEM_CGROUP_CHARGE_TYPE_FORCE, /* used by force_empty */
MEM_CGROUP_CHARGE_TYPE_SWAPOUT, /* for accounting swapcache */
MEM_CGROUP_CHARGE_TYPE_DROP, /* a page was unused swap cache */
/* for encoding cft->private value on file */
#define _MEM (0)
#define _MEMSWAP (1)
#define MEMFILE_PRIVATE(x, val) (((x) << 16) | (val))
#define MEMFILE_TYPE(val) (((val) >> 16) & 0xffff)
#define MEMFILE_ATTR(val) ((val) & 0xffff)
/* Used for OOM nofiier */
#define OOM_CONTROL (0)
/*
* Reclaim flags for mem_cgroup_hierarchical_reclaim
*/
#define MEM_CGROUP_RECLAIM_NOSWAP_BIT 0x0
#define MEM_CGROUP_RECLAIM_NOSWAP (1 << MEM_CGROUP_RECLAIM_NOSWAP_BIT)
#define MEM_CGROUP_RECLAIM_SHRINK_BIT 0x1
#define MEM_CGROUP_RECLAIM_SHRINK (1 << MEM_CGROUP_RECLAIM_SHRINK_BIT)
#define MEM_CGROUP_RECLAIM_SOFT_BIT 0x2
#define MEM_CGROUP_RECLAIM_SOFT (1 << MEM_CGROUP_RECLAIM_SOFT_BIT)
static void mem_cgroup_get(struct mem_cgroup *mem);
static void mem_cgroup_put(struct mem_cgroup *mem);
static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem);
static void drain_all_stock_async(struct mem_cgroup *mem);
static struct mem_cgroup_per_zone *
mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid)
{
return &mem->info.nodeinfo[nid]->zoneinfo[zid];
}
struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *mem)
{
return &mem->css;
}
static struct mem_cgroup_per_zone *
page_cgroup_zoneinfo(struct mem_cgroup *mem, struct page *page)
int nid = page_to_nid(page);
int zid = page_zonenum(page);
return mem_cgroup_zoneinfo(mem, nid, zid);
}
static struct mem_cgroup_tree_per_zone *
soft_limit_tree_node_zone(int nid, int zid)
{
return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid];
}
static struct mem_cgroup_tree_per_zone *
soft_limit_tree_from_page(struct page *page)
{
int nid = page_to_nid(page);
int zid = page_zonenum(page);
return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid];
}
static void
__mem_cgroup_insert_exceeded(struct mem_cgroup *mem,
struct mem_cgroup_per_zone *mz,
struct mem_cgroup_tree_per_zone *mctz,
unsigned long long new_usage_in_excess)
{
struct rb_node **p = &mctz->rb_root.rb_node;
struct rb_node *parent = NULL;
struct mem_cgroup_per_zone *mz_node;
if (mz->on_tree)
return;
mz->usage_in_excess = new_usage_in_excess;
if (!mz->usage_in_excess)
return;
while (*p) {
parent = *p;
mz_node = rb_entry(parent, struct mem_cgroup_per_zone,
tree_node);
if (mz->usage_in_excess < mz_node->usage_in_excess)
p = &(*p)->rb_left;
/*
* We can't avoid mem cgroups that are over their soft
* limit by the same amount
*/
else if (mz->usage_in_excess >= mz_node->usage_in_excess)
p = &(*p)->rb_right;
}
rb_link_node(&mz->tree_node, parent, p);
rb_insert_color(&mz->tree_node, &mctz->rb_root);
mz->on_tree = true;
}
static void
__mem_cgroup_remove_exceeded(struct mem_cgroup *mem,
struct mem_cgroup_per_zone *mz,
struct mem_cgroup_tree_per_zone *mctz)
{
if (!mz->on_tree)
return;
rb_erase(&mz->tree_node, &mctz->rb_root);
mz->on_tree = false;
}
static void
mem_cgroup_remove_exceeded(struct mem_cgroup *mem,
struct mem_cgroup_per_zone *mz,
struct mem_cgroup_tree_per_zone *mctz)
{
spin_lock(&mctz->lock);
__mem_cgroup_remove_exceeded(mem, mz, mctz);
spin_unlock(&mctz->lock);
}
static void mem_cgroup_update_tree(struct mem_cgroup *mem, struct page *page)
{
struct mem_cgroup_per_zone *mz;
struct mem_cgroup_tree_per_zone *mctz;

KAMEZAWA Hiroyuki
committed
int nid = page_to_nid(page);
int zid = page_zonenum(page);
mctz = soft_limit_tree_from_page(page);
/*

KAMEZAWA Hiroyuki
committed
* Necessary to update all ancestors when hierarchy is used.
* because their event counter is not touched.

KAMEZAWA Hiroyuki
committed
for (; mem; mem = parent_mem_cgroup(mem)) {
mz = mem_cgroup_zoneinfo(mem, nid, zid);
excess = res_counter_soft_limit_excess(&mem->res);

KAMEZAWA Hiroyuki
committed
/*
* We have to update the tree if mz is on RB-tree or
* mem is over its softlimit.
*/
if (excess || mz->on_tree) {

KAMEZAWA Hiroyuki
committed
spin_lock(&mctz->lock);
/* if on-tree, remove it */
if (mz->on_tree)
__mem_cgroup_remove_exceeded(mem, mz, mctz);
/*
* Insert again. mz->usage_in_excess will be updated.
* If excess is 0, no tree ops.

KAMEZAWA Hiroyuki
committed
*/
__mem_cgroup_insert_exceeded(mem, mz, mctz, excess);

KAMEZAWA Hiroyuki
committed
spin_unlock(&mctz->lock);
}
}
}
static void mem_cgroup_remove_from_trees(struct mem_cgroup *mem)
{
int node, zone;
struct mem_cgroup_per_zone *mz;
struct mem_cgroup_tree_per_zone *mctz;
for_each_node_state(node, N_POSSIBLE) {
for (zone = 0; zone < MAX_NR_ZONES; zone++) {
mz = mem_cgroup_zoneinfo(mem, node, zone);
mctz = soft_limit_tree_node_zone(node, zone);
mem_cgroup_remove_exceeded(mem, mz, mctz);
}
}
}
static struct mem_cgroup_per_zone *
__mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
{
struct rb_node *rightmost = NULL;
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
rightmost = rb_last(&mctz->rb_root);
if (!rightmost)
goto done; /* Nothing to reclaim from */
mz = rb_entry(rightmost, struct mem_cgroup_per_zone, tree_node);
/*
* Remove the node now but someone else can add it back,
* we will to add it back at the end of reclaim to its correct
* position in the tree.
*/
__mem_cgroup_remove_exceeded(mz->mem, mz, mctz);
if (!res_counter_soft_limit_excess(&mz->mem->res) ||
!css_tryget(&mz->mem->css))
goto retry;
done:
return mz;
}
static struct mem_cgroup_per_zone *
mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
{
struct mem_cgroup_per_zone *mz;
spin_lock(&mctz->lock);
mz = __mem_cgroup_largest_soft_limit_node(mctz);
spin_unlock(&mctz->lock);
return mz;
}
/*
* Implementation Note: reading percpu statistics for memcg.
*
* Both of vmstat[] and percpu_counter has threshold and do periodic
* synchronization to implement "quick" read. There are trade-off between
* reading cost and precision of value. Then, we may have a chance to implement
* a periodic synchronizion of counter in memcg's counter.
*
* But this _read() function is used for user interface now. The user accounts
* memory usage by memory cgroup and he _always_ requires exact value because
* he accounts memory. Even if we provide quick-and-fuzzy read, we always
* have to visit all online cpus and make sum. So, for now, unnecessary
* synchronization is not implemented. (just implemented for cpu hotplug)
*
* If there are kernel internal actions which can make use of some not-exact
* value, and reading all cpu value can be performance bottleneck in some
* common workload, threashold and synchonization as vmstat[] should be
* implemented.
*/
static long mem_cgroup_read_stat(struct mem_cgroup *mem,
enum mem_cgroup_stat_index idx)
int cpu;
get_online_cpus();
for_each_online_cpu(cpu)
val += per_cpu(mem->stat->count[idx], cpu);
#ifdef CONFIG_HOTPLUG_CPU
spin_lock(&mem->pcp_counter_lock);
val += mem->nocpu_base.count[idx];
spin_unlock(&mem->pcp_counter_lock);
#endif
put_online_cpus();
return val;
}
static void mem_cgroup_swap_statistics(struct mem_cgroup *mem,
bool charge)
{
int val = (charge) ? 1 : -1;
this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_SWAPOUT], val);
void mem_cgroup_pgfault(struct mem_cgroup *mem, int val)
{
this_cpu_add(mem->stat->events[MEM_CGROUP_EVENTS_PGFAULT], val);
}
void mem_cgroup_pgmajfault(struct mem_cgroup *mem, int val)
{
this_cpu_add(mem->stat->events[MEM_CGROUP_EVENTS_PGMAJFAULT], val);
}
static unsigned long mem_cgroup_read_events(struct mem_cgroup *mem,
enum mem_cgroup_events_index idx)
{
unsigned long val = 0;
int cpu;
for_each_online_cpu(cpu)
val += per_cpu(mem->stat->events[idx], cpu);
#ifdef CONFIG_HOTPLUG_CPU
spin_lock(&mem->pcp_counter_lock);
val += mem->nocpu_base.events[idx];
spin_unlock(&mem->pcp_counter_lock);
#endif
return val;
}
static void mem_cgroup_charge_statistics(struct mem_cgroup *mem,
bool file, int nr_pages)

KAMEZAWA Hiroyuki
committed
{
preempt_disable();
if (file)
__this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_CACHE], nr_pages);

KAMEZAWA Hiroyuki
committed
else
__this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_RSS], nr_pages);
/* pagein of a big page is an event. So, ignore page size */
if (nr_pages > 0)
__this_cpu_inc(mem->stat->events[MEM_CGROUP_EVENTS_PGPGIN]);
__this_cpu_inc(mem->stat->events[MEM_CGROUP_EVENTS_PGPGOUT]);
nr_pages = -nr_pages; /* for event */
}
__this_cpu_add(mem->stat->events[MEM_CGROUP_EVENTS_COUNT], nr_pages);
preempt_enable();

KAMEZAWA Hiroyuki
committed
}
static unsigned long
mem_cgroup_get_zonestat_node(struct mem_cgroup *mem, int nid, enum lru_list idx)
{
struct mem_cgroup_per_zone *mz;
u64 total = 0;
int zid;
for (zid = 0; zid < MAX_NR_ZONES; zid++) {
mz = mem_cgroup_zoneinfo(mem, nid, zid);
total += MEM_CGROUP_ZSTAT(mz, idx);
}
return total;
}
static unsigned long mem_cgroup_get_local_zonestat(struct mem_cgroup *mem,

KAMEZAWA Hiroyuki
committed
{

KAMEZAWA Hiroyuki
committed
u64 total = 0;
for_each_online_node(nid)
total += mem_cgroup_get_zonestat_node(mem, nid, idx);

KAMEZAWA Hiroyuki
committed
return total;

KAMEZAWA Hiroyuki
committed
}
static bool __memcg_event_check(struct mem_cgroup *mem, int target)
{
unsigned long val, next;
val = this_cpu_read(mem->stat->events[MEM_CGROUP_EVENTS_COUNT]);
next = this_cpu_read(mem->stat->targets[target]);
/* from time_after() in jiffies.h */
return ((long)next - (long)val < 0);
}
static void __mem_cgroup_target_update(struct mem_cgroup *mem, int target)
unsigned long val, next;
val = this_cpu_read(mem->stat->events[MEM_CGROUP_EVENTS_COUNT]);
switch (target) {
case MEM_CGROUP_TARGET_THRESH:
next = val + THRESHOLDS_EVENTS_TARGET;
break;
case MEM_CGROUP_TARGET_SOFTLIMIT:
next = val + SOFTLIMIT_EVENTS_TARGET;
break;
case MEM_CGROUP_TARGET_NUMAINFO:
next = val + NUMAINFO_EVENTS_TARGET;
break;
default:
return;
}
this_cpu_write(mem->stat->targets[target], next);
}
/*
* Check events in order.
*
*/
static void memcg_check_events(struct mem_cgroup *mem, struct page *page)
{
/* threshold event is triggered in finer grain than soft limit */
if (unlikely(__memcg_event_check(mem, MEM_CGROUP_TARGET_THRESH))) {
mem_cgroup_threshold(mem);
__mem_cgroup_target_update(mem, MEM_CGROUP_TARGET_THRESH);
if (unlikely(__memcg_event_check(mem,
MEM_CGROUP_TARGET_SOFTLIMIT))) {
mem_cgroup_update_tree(mem, page);
__mem_cgroup_target_update(mem,
MEM_CGROUP_TARGET_SOFTLIMIT);
}
#if MAX_NUMNODES > 1
if (unlikely(__memcg_event_check(mem,
MEM_CGROUP_TARGET_NUMAINFO))) {
atomic_inc(&mem->numainfo_events);
__mem_cgroup_target_update(mem,
MEM_CGROUP_TARGET_NUMAINFO);
#endif
static struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont)
{
return container_of(cgroup_subsys_state(cont,
mem_cgroup_subsys_id), struct mem_cgroup,
css);
}
struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
/*
* mm_update_next_owner() may clear mm->owner to NULL
* if it races with swapoff, page migration, etc.
* So this can be called with p == NULL.
*/
if (unlikely(!p))
return NULL;
return container_of(task_subsys_state(p, mem_cgroup_subsys_id),
struct mem_cgroup, css);
}
struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm)
/*
* Because we have no locks, mm->owner's may be being moved to other
* cgroup. We use css_tryget() here even if this looks
* pessimistic (rather than adding locks here).
*/
rcu_read_lock();
do {
mem = mem_cgroup_from_task(rcu_dereference(mm->owner));
if (unlikely(!mem))
break;
} while (!css_tryget(&mem->css));
rcu_read_unlock();
return mem;
}
/* The caller has to guarantee "mem" exists before calling this */
static struct mem_cgroup *mem_cgroup_start_loop(struct mem_cgroup *mem)
struct cgroup_subsys_state *css;
int found;
if (!mem) /* ROOT cgroup has the smallest ID */
return root_mem_cgroup; /*css_put/get against root is ignored*/
if (!mem->use_hierarchy) {
if (css_tryget(&mem->css))
return mem;
return NULL;
}
rcu_read_lock();
/*
* searching a memory cgroup which has the smallest ID under given
* ROOT cgroup. (ID >= 1)
*/
css = css_get_next(&mem_cgroup_subsys, 1, &mem->css, &found);
if (css && css_tryget(css))
mem = container_of(css, struct mem_cgroup, css);
else
mem = NULL;
rcu_read_unlock();
return mem;
}
static struct mem_cgroup *mem_cgroup_get_next(struct mem_cgroup *iter,
struct mem_cgroup *root,
bool cond)
{
int nextid = css_id(&iter->css) + 1;
int found;
int hierarchy_used;
hierarchy_used = iter->use_hierarchy;
/* If no ROOT, walk all, ignore hierarchy */
if (!cond || (root && !hierarchy_used))
if (!root)
root = root_mem_cgroup;
css = css_get_next(&mem_cgroup_subsys, nextid,
&root->css, &found);
iter = container_of(css, struct mem_cgroup, css);
/* If css is NULL, no more cgroups will be found */
/*
* for_eacn_mem_cgroup_tree() for visiting all cgroup under tree. Please
* be careful that "break" loop is not allowed. We have reference count.
* Instead of that modify "cond" to be false and "continue" to exit the loop.
*/
#define for_each_mem_cgroup_tree_cond(iter, root, cond) \
for (iter = mem_cgroup_start_loop(root);\
iter != NULL;\
iter = mem_cgroup_get_next(iter, root, cond))
#define for_each_mem_cgroup_tree(iter, root) \
for_each_mem_cgroup_tree_cond(iter, root, true)
#define for_each_mem_cgroup_all(iter) \
for_each_mem_cgroup_tree_cond(iter, NULL, true)
static inline bool mem_cgroup_is_root(struct mem_cgroup *mem)
{
return (mem == root_mem_cgroup);
}
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
void mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx)
{
struct mem_cgroup *mem;
if (!mm)
return;
rcu_read_lock();
mem = mem_cgroup_from_task(rcu_dereference(mm->owner));
if (unlikely(!mem))
goto out;
switch (idx) {
case PGMAJFAULT:
mem_cgroup_pgmajfault(mem, 1);
break;
case PGFAULT:
mem_cgroup_pgfault(mem, 1);
break;
default:
BUG();
}
out:
rcu_read_unlock();
}
EXPORT_SYMBOL(mem_cgroup_count_vm_event);
/*
* Following LRU functions are allowed to be used without PCG_LOCK.
* Operations are called by routine of global LRU independently from memcg.
* What we have to take care of here is validness of pc->mem_cgroup.
*
* Changes to pc->mem_cgroup happens when
* 1. charge
* 2. moving account
* In typical case, "charge" is done before add-to-lru. Exception is SwapCache.
* It is added to LRU before charge.
* If PCG_USED bit is not set, page_cgroup is not added to this private LRU.
* When moving account, the page is not on LRU. It's isolated.
*/
void mem_cgroup_del_lru_list(struct page *page, enum lru_list lru)
{
struct page_cgroup *pc;
struct mem_cgroup_per_zone *mz;

KAMEZAWA Hiroyuki
committed
return;
pc = lookup_page_cgroup(page);
/* can happen while we handle swapcache. */
if (!TestClearPageCgroupAcctLRU(pc))
VM_BUG_ON(!pc->mem_cgroup);
/*
* We don't check PCG_USED bit. It's cleared when the "page" is finally
* removed from global LRU.
*/
mz = page_cgroup_zoneinfo(pc->mem_cgroup, page);
/* huge page split is done under lru_lock. so, we have no races. */
MEM_CGROUP_ZSTAT(mz, lru) -= 1 << compound_order(page);
if (mem_cgroup_is_root(pc->mem_cgroup))
return;
VM_BUG_ON(list_empty(&pc->lru));

KAMEZAWA Hiroyuki
committed
}

KAMEZAWA Hiroyuki
committed
{
mem_cgroup_del_lru_list(page, page_lru(page));
}
/*
* Writeback is about to end against a page which has been marked for immediate
* reclaim. If it still appears to be reclaimable, move it to the tail of the
* inactive list.
*/
void mem_cgroup_rotate_reclaimable_page(struct page *page)
{
struct mem_cgroup_per_zone *mz;
struct page_cgroup *pc;
enum lru_list lru = page_lru(page);
if (mem_cgroup_disabled())
return;
pc = lookup_page_cgroup(page);
/* unused or root page is not rotated. */
if (!PageCgroupUsed(pc))
return;
/* Ensure pc->mem_cgroup is visible after reading PCG_USED. */
smp_rmb();
if (mem_cgroup_is_root(pc->mem_cgroup))
return;
mz = page_cgroup_zoneinfo(pc->mem_cgroup, page);
list_move_tail(&pc->lru, &mz->lists[lru]);
}
void mem_cgroup_rotate_lru_list(struct page *page, enum lru_list lru)
{
struct mem_cgroup_per_zone *mz;
struct page_cgroup *pc;

KAMEZAWA Hiroyuki
committed
/* unused or root page is not rotated. */
if (!PageCgroupUsed(pc))
return;
/* Ensure pc->mem_cgroup is visible after reading PCG_USED. */
smp_rmb();
if (mem_cgroup_is_root(pc->mem_cgroup))
mz = page_cgroup_zoneinfo(pc->mem_cgroup, page);

KAMEZAWA Hiroyuki
committed
}
void mem_cgroup_add_lru_list(struct page *page, enum lru_list lru)
struct page_cgroup *pc;
struct mem_cgroup_per_zone *mz;

KAMEZAWA Hiroyuki
committed
VM_BUG_ON(PageCgroupAcctLRU(pc));
/* Ensure pc->mem_cgroup is visible after reading PCG_USED. */
smp_rmb();
mz = page_cgroup_zoneinfo(pc->mem_cgroup, page);
/* huge page split is done under lru_lock. so, we have no races. */
MEM_CGROUP_ZSTAT(mz, lru) += 1 << compound_order(page);
SetPageCgroupAcctLRU(pc);
if (mem_cgroup_is_root(pc->mem_cgroup))
return;
* At handling SwapCache and other FUSE stuff, pc->mem_cgroup may be changed
* while it's linked to lru because the page may be reused after it's fully
* uncharged. To handle that, unlink page_cgroup from LRU when charge it again.
* It's done under lock_page and expected that zone->lru_lock isnever held.
static void mem_cgroup_lru_del_before_commit(struct page *page)
unsigned long flags;
struct zone *zone = page_zone(page);
struct page_cgroup *pc = lookup_page_cgroup(page);
/*
* Doing this check without taking ->lru_lock seems wrong but this