Newer
Older
/* memcontrol.c - Memory Controller
*
* Copyright IBM Corporation, 2007
* Author Balbir Singh <balbir@linux.vnet.ibm.com>
*
* Copyright 2007 OpenVZ SWsoft Inc
* Author: Pavel Emelianov <xemul@openvz.org>
*
* Memory thresholds
* Copyright (C) 2009 Nokia Corporation
* Author: Kirill A. Shutemov
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*/
#include <linux/res_counter.h>
#include <linux/memcontrol.h>
#include <linux/cgroup.h>

KAMEZAWA Hiroyuki
committed
#include <linux/smp.h>
#include <linux/backing-dev.h>
#include <linux/bit_spinlock.h>
#include <linux/rcupdate.h>
#include <linux/export.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/eventfd.h>
#include <linux/sort.h>
#include <linux/seq_file.h>
#include <linux/vmalloc.h>
#include <linux/mm_inline.h>
#include <linux/page_cgroup.h>
#include <linux/oom.h>
#include <net/sock.h>
#include <net/tcp_memcontrol.h>
#include <asm/uaccess.h>
#include <trace/events/vmscan.h>
struct cgroup_subsys mem_cgroup_subsys __read_mostly;
#define MEM_CGROUP_RECLAIM_RETRIES 5
static struct mem_cgroup *root_mem_cgroup __read_mostly;
#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
/* Turned on only when memory cgroup is enabled && really_do_swap_account = 1 */
int do_swap_account __read_mostly;
/* for remember boot option*/
#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP_ENABLED
static int really_do_swap_account __initdata = 1;
#else
static int really_do_swap_account __initdata = 0;
#endif

KAMEZAWA Hiroyuki
committed
/*
* Statistics for memory cgroup.
*/
enum mem_cgroup_stat_index {
/*
* For MEM_CONTAINER_TYPE_ALL, usage = pagecache + rss.
*/
MEM_CGROUP_STAT_CACHE, /* # of pages charged as cache */
MEM_CGROUP_STAT_RSS, /* # of pages charged as anon rss */
MEM_CGROUP_STAT_FILE_MAPPED, /* # of pages charged as file rss */
MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */

KAMEZAWA Hiroyuki
committed
MEM_CGROUP_STAT_NSTATS,
};
static const char * const mem_cgroup_stat_names[] = {
"cache",
"rss",
"mapped_file",
"swap",
};
enum mem_cgroup_events_index {
MEM_CGROUP_EVENTS_PGPGIN, /* # of pages paged in */
MEM_CGROUP_EVENTS_PGPGOUT, /* # of pages paged out */
MEM_CGROUP_EVENTS_PGFAULT, /* # of page-faults */
MEM_CGROUP_EVENTS_PGMAJFAULT, /* # of major page-faults */
MEM_CGROUP_EVENTS_NSTATS,
};
static const char * const mem_cgroup_events_names[] = {
"pgpgin",
"pgpgout",
"pgfault",
"pgmajfault",
};
/*
* Per memcg event counter is incremented at every pagein/pageout. With THP,
* it will be incremated by the number of pages. This counter is used for
* for trigger some periodic events. This is straightforward and better
* than using jiffies etc. to handle periodic memcg event.
*/
enum mem_cgroup_events_target {
MEM_CGROUP_TARGET_THRESH,
MEM_CGROUP_TARGET_SOFTLIMIT,
MEM_CGROUP_TARGET_NUMAINFO,
MEM_CGROUP_NTARGETS,
};
#define THRESHOLDS_EVENTS_TARGET 128
#define SOFTLIMIT_EVENTS_TARGET 1024
#define NUMAINFO_EVENTS_TARGET 1024

KAMEZAWA Hiroyuki
committed
struct mem_cgroup_stat_cpu {
long count[MEM_CGROUP_STAT_NSTATS];
unsigned long events[MEM_CGROUP_EVENTS_NSTATS];
unsigned long nr_page_events;
unsigned long targets[MEM_CGROUP_NTARGETS];

KAMEZAWA Hiroyuki
committed
};
struct mem_cgroup_reclaim_iter {
/* css_id of the last scanned hierarchy member */
int position;
/* scan generation, increased every round-trip */
unsigned int generation;
};

KAMEZAWA Hiroyuki
committed
/*
* per-zone information in memory controller.
*/
struct mem_cgroup_per_zone {
unsigned long lru_size[NR_LRU_LISTS];
struct mem_cgroup_reclaim_iter reclaim_iter[DEF_PRIORITY + 1];
struct rb_node tree_node; /* RB tree node */
unsigned long long usage_in_excess;/* Set to the value by which */
/* the soft limit is exceeded*/
bool on_tree;
struct mem_cgroup *memcg; /* Back pointer, we cannot */
/* use container_of */

KAMEZAWA Hiroyuki
committed
};
struct mem_cgroup_per_node {
struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES];
};
struct mem_cgroup_lru_info {
struct mem_cgroup_per_node *nodeinfo[MAX_NUMNODES];
};
/*
* Cgroups above their limits are maintained in a RB-Tree, independent of
* their hierarchy representation
*/
struct mem_cgroup_tree_per_zone {
struct rb_root rb_root;
spinlock_t lock;
};
struct mem_cgroup_tree_per_node {
struct mem_cgroup_tree_per_zone rb_tree_per_zone[MAX_NR_ZONES];
};
struct mem_cgroup_tree {
struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES];
};
static struct mem_cgroup_tree soft_limit_tree __read_mostly;
struct mem_cgroup_threshold {
struct eventfd_ctx *eventfd;
u64 threshold;
};
struct mem_cgroup_threshold_ary {
/* An array index points to threshold just below or equal to usage. */
/* Size of entries[] */
unsigned int size;
/* Array of thresholds */
struct mem_cgroup_threshold entries[0];
};
struct mem_cgroup_thresholds {
/* Primary thresholds array */
struct mem_cgroup_threshold_ary *primary;
/*
* Spare threshold array.
* This is needed to make mem_cgroup_unregister_event() "never fail".
* It must be able to store at least primary->size - 1 entries.
*/
struct mem_cgroup_threshold_ary *spare;
};
/* for OOM */
struct mem_cgroup_eventfd_list {
struct list_head list;
struct eventfd_ctx *eventfd;
};
static void mem_cgroup_threshold(struct mem_cgroup *memcg);
static void mem_cgroup_oom_notify(struct mem_cgroup *memcg);
/*
* The memory controller data structure. The memory controller controls both
* page cache and RSS per cgroup. We would eventually like to provide
* statistics based on the statistics developed by Rik Van Riel for clock-pro,
* to help the administrator determine what knobs to tune.
*
* TODO: Add a water mark for the memory controller. Reclaim will begin when
* we hit the water mark. May be even add a low water mark, such that
* no reclaim occurs from a cgroup at it's low water mark, this is
* a feature that will be implemented much later in the future.
*/
struct mem_cgroup {
struct cgroup_subsys_state css;
/*
* the counter to account for memory usage
*/
struct res_counter res;
union {
/*
* the counter to account for mem+swap usage.
*/
struct res_counter memsw;
/*
* rcu_freeing is used only when freeing struct mem_cgroup,
* so put it into a union to avoid wasting more memory.
* It must be disjoint from the css field. It could be
* in a union with the res field, but res plays a much
* larger part in mem_cgroup life than memsw, and might
* be of interest, even at time of free, when debugging.
* So share rcu_head with the less interesting memsw.
*/
struct rcu_head rcu_freeing;
/*
* We also need some space for a worker in deferred freeing.
* By the time we call it, rcu_freeing is no longer in use.
*/
struct work_struct work_freeing;
};
/*
* Per cgroup active and inactive list, similar to the
* per zone LRU lists.
*/

KAMEZAWA Hiroyuki
committed
struct mem_cgroup_lru_info info;
int last_scanned_node;
#if MAX_NUMNODES > 1
nodemask_t scan_nodes;
atomic_t numainfo_events;
atomic_t numainfo_updating;
/*
* Should the accounting and control be hierarchical, per subtree?
*/
bool use_hierarchy;
bool oom_lock;
atomic_t under_oom;
int swappiness;
/* OOM-Killer disable */
int oom_kill_disable;
/* set when res.limit == memsw.limit */
bool memsw_is_minimum;
/* protect arrays of thresholds */
struct mutex thresholds_lock;
/* thresholds for memory usage. RCU-protected */
struct mem_cgroup_thresholds thresholds;
/* thresholds for mem+swap usage. RCU-protected */
struct mem_cgroup_thresholds memsw_thresholds;
/* For oom notifier event fd */
struct list_head oom_notify;
/*
* Should we move charges of a task when a task is moved into this
* mem_cgroup ? And what type of charges should we move ?
*/
unsigned long move_charge_at_immigrate;
/*
* set > 0 if pages under this cgroup are moving to other cgroup.
*/
atomic_t moving_account;
/* taken only while moving_account > 0 */
spinlock_t move_lock;

KAMEZAWA Hiroyuki
committed
/*
* percpu counter.

KAMEZAWA Hiroyuki
committed
*/
struct mem_cgroup_stat_cpu __percpu *stat;
/*
* used when a cpu is offlined or other synchronizations
* See mem_cgroup_read_stat().
*/
struct mem_cgroup_stat_cpu nocpu_base;
spinlock_t pcp_counter_lock;
#ifdef CONFIG_INET
struct tcp_memcontrol tcp_mem;
#endif
/* Stuffs for move charges at task migration. */
/*
* Types of charges to be moved. "move_charge_at_immitgrate" is treated as a
* left-shifted bitmap of these types.
*/
enum move_type {
MOVE_CHARGE_TYPE_ANON, /* private anonymous page and swap of it */
MOVE_CHARGE_TYPE_FILE, /* file page(including tmpfs) and swap of it */
NR_MOVE_TYPE,
};
/* "mc" and its members are protected by cgroup_mutex */
static struct move_charge_struct {
spinlock_t lock; /* for from, to */
struct mem_cgroup *from;
struct mem_cgroup *to;
unsigned long precharge;
unsigned long moved_charge;
unsigned long moved_swap;
struct task_struct *moving_task; /* a task moving charges */
wait_queue_head_t waitq; /* a waitq for other context */
} mc = {
.lock = __SPIN_LOCK_UNLOCKED(mc.lock),
.waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq),
};
static bool move_anon(void)
{
return test_bit(MOVE_CHARGE_TYPE_ANON,
&mc.to->move_charge_at_immigrate);
}
static bool move_file(void)
{
return test_bit(MOVE_CHARGE_TYPE_FILE,
&mc.to->move_charge_at_immigrate);
}
/*
* Maximum loops in mem_cgroup_hierarchical_reclaim(), used for soft
* limit reclaim to prevent infinite loops, if they ever occur.
*/
#define MEM_CGROUP_MAX_RECLAIM_LOOPS 100
#define MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS 2
enum charge_type {
MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
MEM_CGROUP_CHARGE_TYPE_MAPPED,
MEM_CGROUP_CHARGE_TYPE_SHMEM, /* used by page migration of shmem */
MEM_CGROUP_CHARGE_TYPE_FORCE, /* used by force_empty */
MEM_CGROUP_CHARGE_TYPE_SWAPOUT, /* for accounting swapcache */
MEM_CGROUP_CHARGE_TYPE_DROP, /* a page was unused swap cache */
/* for encoding cft->private value on file */

Glauber Costa
committed
#define _MEM (0)
#define _MEMSWAP (1)
#define _OOM_TYPE (2)
#define MEMFILE_PRIVATE(x, val) ((x) << 16 | (val))
#define MEMFILE_TYPE(val) ((val) >> 16 & 0xffff)
#define MEMFILE_ATTR(val) ((val) & 0xffff)
/* Used for OOM nofiier */
#define OOM_CONTROL (0)
/*
* Reclaim flags for mem_cgroup_hierarchical_reclaim
*/
#define MEM_CGROUP_RECLAIM_NOSWAP_BIT 0x0
#define MEM_CGROUP_RECLAIM_NOSWAP (1 << MEM_CGROUP_RECLAIM_NOSWAP_BIT)
#define MEM_CGROUP_RECLAIM_SHRINK_BIT 0x1
#define MEM_CGROUP_RECLAIM_SHRINK (1 << MEM_CGROUP_RECLAIM_SHRINK_BIT)
static void mem_cgroup_get(struct mem_cgroup *memcg);
static void mem_cgroup_put(struct mem_cgroup *memcg);
/* Writing them here to avoid exposing memcg's inner layout */
#ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM
#include <net/sock.h>
static bool mem_cgroup_is_root(struct mem_cgroup *memcg);
void sock_update_memcg(struct sock *sk)
{
if (mem_cgroup_sockets_enabled) {
struct mem_cgroup *memcg;
BUG_ON(!sk->sk_prot->proto_cgroup);
/* Socket cloning can throw us here with sk_cgrp already
* filled. It won't however, necessarily happen from
* process context. So the test for root memcg given
* the current task's memcg won't help us in this case.
*
* Respecting the original socket's memcg is a better
* decision in this case.
*/
if (sk->sk_cgrp) {
BUG_ON(mem_cgroup_is_root(sk->sk_cgrp->memcg));
mem_cgroup_get(sk->sk_cgrp->memcg);
return;
}
rcu_read_lock();
memcg = mem_cgroup_from_task(current);
if (!mem_cgroup_is_root(memcg)) {
mem_cgroup_get(memcg);
sk->sk_cgrp = sk->sk_prot->proto_cgroup(memcg);
}
rcu_read_unlock();
}
}
EXPORT_SYMBOL(sock_update_memcg);
void sock_release_memcg(struct sock *sk)
{
if (mem_cgroup_sockets_enabled && sk->sk_cgrp) {
struct mem_cgroup *memcg;
WARN_ON(!sk->sk_cgrp->memcg);
memcg = sk->sk_cgrp->memcg;
mem_cgroup_put(memcg);
}
}
struct cg_proto *tcp_proto_cgroup(struct mem_cgroup *memcg)
{
if (!memcg || mem_cgroup_is_root(memcg))
return NULL;
return &memcg->tcp_mem.cg_proto;
}
EXPORT_SYMBOL(tcp_proto_cgroup);
#endif /* CONFIG_INET */
#endif /* CONFIG_CGROUP_MEM_RES_CTLR_KMEM */
static void drain_all_stock_async(struct mem_cgroup *memcg);
static struct mem_cgroup_per_zone *
mem_cgroup_zoneinfo(struct mem_cgroup *memcg, int nid, int zid)
return &memcg->info.nodeinfo[nid]->zoneinfo[zid];
struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *memcg)
static struct mem_cgroup_per_zone *
page_cgroup_zoneinfo(struct mem_cgroup *memcg, struct page *page)
int nid = page_to_nid(page);
int zid = page_zonenum(page);
return mem_cgroup_zoneinfo(memcg, nid, zid);
}
static struct mem_cgroup_tree_per_zone *
soft_limit_tree_node_zone(int nid, int zid)
{
return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid];
}
static struct mem_cgroup_tree_per_zone *
soft_limit_tree_from_page(struct page *page)
{
int nid = page_to_nid(page);
int zid = page_zonenum(page);
return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid];
}
static void
__mem_cgroup_insert_exceeded(struct mem_cgroup *memcg,
struct mem_cgroup_per_zone *mz,
struct mem_cgroup_tree_per_zone *mctz,
unsigned long long new_usage_in_excess)
{
struct rb_node **p = &mctz->rb_root.rb_node;
struct rb_node *parent = NULL;
struct mem_cgroup_per_zone *mz_node;
if (mz->on_tree)
return;
mz->usage_in_excess = new_usage_in_excess;
if (!mz->usage_in_excess)
return;
while (*p) {
parent = *p;
mz_node = rb_entry(parent, struct mem_cgroup_per_zone,
tree_node);
if (mz->usage_in_excess < mz_node->usage_in_excess)
p = &(*p)->rb_left;
/*
* We can't avoid mem cgroups that are over their soft
* limit by the same amount
*/
else if (mz->usage_in_excess >= mz_node->usage_in_excess)
p = &(*p)->rb_right;
}
rb_link_node(&mz->tree_node, parent, p);
rb_insert_color(&mz->tree_node, &mctz->rb_root);
mz->on_tree = true;
}
static void
__mem_cgroup_remove_exceeded(struct mem_cgroup *memcg,
struct mem_cgroup_per_zone *mz,
struct mem_cgroup_tree_per_zone *mctz)
{
if (!mz->on_tree)
return;
rb_erase(&mz->tree_node, &mctz->rb_root);
mz->on_tree = false;
}
mem_cgroup_remove_exceeded(struct mem_cgroup *memcg,
struct mem_cgroup_per_zone *mz,
struct mem_cgroup_tree_per_zone *mctz)
{
spin_lock(&mctz->lock);
__mem_cgroup_remove_exceeded(memcg, mz, mctz);
spin_unlock(&mctz->lock);
}
static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page)
struct mem_cgroup_per_zone *mz;
struct mem_cgroup_tree_per_zone *mctz;

KAMEZAWA Hiroyuki
committed
int nid = page_to_nid(page);
int zid = page_zonenum(page);
mctz = soft_limit_tree_from_page(page);
/*

KAMEZAWA Hiroyuki
committed
* Necessary to update all ancestors when hierarchy is used.
* because their event counter is not touched.
for (; memcg; memcg = parent_mem_cgroup(memcg)) {
mz = mem_cgroup_zoneinfo(memcg, nid, zid);
excess = res_counter_soft_limit_excess(&memcg->res);

KAMEZAWA Hiroyuki
committed
/*
* We have to update the tree if mz is on RB-tree or
* mem is over its softlimit.
*/
if (excess || mz->on_tree) {

KAMEZAWA Hiroyuki
committed
spin_lock(&mctz->lock);
/* if on-tree, remove it */
if (mz->on_tree)
__mem_cgroup_remove_exceeded(memcg, mz, mctz);

KAMEZAWA Hiroyuki
committed
/*
* Insert again. mz->usage_in_excess will be updated.
* If excess is 0, no tree ops.

KAMEZAWA Hiroyuki
committed
*/
__mem_cgroup_insert_exceeded(memcg, mz, mctz, excess);

KAMEZAWA Hiroyuki
committed
spin_unlock(&mctz->lock);
}
static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg)
{
int node, zone;
struct mem_cgroup_per_zone *mz;
struct mem_cgroup_tree_per_zone *mctz;
for (zone = 0; zone < MAX_NR_ZONES; zone++) {
mz = mem_cgroup_zoneinfo(memcg, node, zone);
mctz = soft_limit_tree_node_zone(node, zone);
mem_cgroup_remove_exceeded(memcg, mz, mctz);
static struct mem_cgroup_per_zone *
__mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
{
struct rb_node *rightmost = NULL;
rightmost = rb_last(&mctz->rb_root);
if (!rightmost)
goto done; /* Nothing to reclaim from */
mz = rb_entry(rightmost, struct mem_cgroup_per_zone, tree_node);
/*
* Remove the node now but someone else can add it back,
* we will to add it back at the end of reclaim to its correct
* position in the tree.
*/
__mem_cgroup_remove_exceeded(mz->memcg, mz, mctz);
if (!res_counter_soft_limit_excess(&mz->memcg->res) ||
!css_tryget(&mz->memcg->css))
goto retry;
done:
return mz;
}
static struct mem_cgroup_per_zone *
mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
{
struct mem_cgroup_per_zone *mz;
spin_lock(&mctz->lock);
mz = __mem_cgroup_largest_soft_limit_node(mctz);
spin_unlock(&mctz->lock);
return mz;
}
/*
* Implementation Note: reading percpu statistics for memcg.
*
* Both of vmstat[] and percpu_counter has threshold and do periodic
* synchronization to implement "quick" read. There are trade-off between
* reading cost and precision of value. Then, we may have a chance to implement
* a periodic synchronizion of counter in memcg's counter.
*
* But this _read() function is used for user interface now. The user accounts
* memory usage by memory cgroup and he _always_ requires exact value because
* he accounts memory. Even if we provide quick-and-fuzzy read, we always
* have to visit all online cpus and make sum. So, for now, unnecessary
* synchronization is not implemented. (just implemented for cpu hotplug)
*
* If there are kernel internal actions which can make use of some not-exact
* value, and reading all cpu value can be performance bottleneck in some
* common workload, threashold and synchonization as vmstat[] should be
* implemented.
*/
static long mem_cgroup_read_stat(struct mem_cgroup *memcg,
enum mem_cgroup_stat_index idx)
int cpu;
get_online_cpus();
for_each_online_cpu(cpu)
val += per_cpu(memcg->stat->count[idx], cpu);
#ifdef CONFIG_HOTPLUG_CPU
spin_lock(&memcg->pcp_counter_lock);
val += memcg->nocpu_base.count[idx];
spin_unlock(&memcg->pcp_counter_lock);
#endif
put_online_cpus();
return val;
}
static void mem_cgroup_swap_statistics(struct mem_cgroup *memcg,
bool charge)
{
int val = (charge) ? 1 : -1;
this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_SWAPOUT], val);
static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg,
enum mem_cgroup_events_index idx)
{
unsigned long val = 0;
int cpu;
for_each_online_cpu(cpu)
val += per_cpu(memcg->stat->events[idx], cpu);
#ifdef CONFIG_HOTPLUG_CPU
spin_lock(&memcg->pcp_counter_lock);
val += memcg->nocpu_base.events[idx];
spin_unlock(&memcg->pcp_counter_lock);
#endif
return val;
}
static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,

KAMEZAWA Hiroyuki
committed
{
preempt_disable();
/*
* Here, RSS means 'mapped anon' and anon's SwapCache. Shmem/tmpfs is
* counted as CACHE even if it's on ANON LRU.
*/
if (anon)
__this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS],

KAMEZAWA Hiroyuki
committed
else
__this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_CACHE],
/* pagein of a big page is an event. So, ignore page size */
if (nr_pages > 0)
__this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGIN]);
__this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGOUT]);
nr_pages = -nr_pages; /* for event */
}
__this_cpu_add(memcg->stat->nr_page_events, nr_pages);
preempt_enable();

KAMEZAWA Hiroyuki
committed
}
mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru)
{
struct mem_cgroup_per_zone *mz;
mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec);
return mz->lru_size[lru];
}
static unsigned long
mem_cgroup_zone_nr_lru_pages(struct mem_cgroup *memcg, int nid, int zid,
unsigned int lru_mask)
{
struct mem_cgroup_per_zone *mz;
unsigned long ret = 0;
mz = mem_cgroup_zoneinfo(memcg, nid, zid);
for_each_lru(lru) {
if (BIT(lru) & lru_mask)
ret += mz->lru_size[lru];
}
return ret;
}
static unsigned long
mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
int nid, unsigned int lru_mask)
{
u64 total = 0;
int zid;
for (zid = 0; zid < MAX_NR_ZONES; zid++)
total += mem_cgroup_zone_nr_lru_pages(memcg,
nid, zid, lru_mask);
static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg,
unsigned int lru_mask)

KAMEZAWA Hiroyuki
committed
{

KAMEZAWA Hiroyuki
committed
u64 total = 0;
for_each_node_state(nid, N_HIGH_MEMORY)
total += mem_cgroup_node_nr_lru_pages(memcg, nid, lru_mask);

KAMEZAWA Hiroyuki
committed
return total;

KAMEZAWA Hiroyuki
committed
}
static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
enum mem_cgroup_events_target target)
{
unsigned long val, next;
val = __this_cpu_read(memcg->stat->nr_page_events);
next = __this_cpu_read(memcg->stat->targets[target]);
/* from time_after() in jiffies.h */
if ((long)next - (long)val < 0) {
switch (target) {
case MEM_CGROUP_TARGET_THRESH:
next = val + THRESHOLDS_EVENTS_TARGET;
break;
case MEM_CGROUP_TARGET_SOFTLIMIT:
next = val + SOFTLIMIT_EVENTS_TARGET;
break;
case MEM_CGROUP_TARGET_NUMAINFO:
next = val + NUMAINFO_EVENTS_TARGET;
break;
default:
break;
}
__this_cpu_write(memcg->stat->targets[target], next);
return true;
return false;
}
/*
* Check events in order.
*
*/
static void memcg_check_events(struct mem_cgroup *memcg, struct page *page)
preempt_disable();
/* threshold event is triggered in finer grain than soft limit */
if (unlikely(mem_cgroup_event_ratelimit(memcg,
MEM_CGROUP_TARGET_THRESH))) {
bool do_softlimit;
bool do_numainfo __maybe_unused;
do_softlimit = mem_cgroup_event_ratelimit(memcg,
MEM_CGROUP_TARGET_SOFTLIMIT);
#if MAX_NUMNODES > 1
do_numainfo = mem_cgroup_event_ratelimit(memcg,
MEM_CGROUP_TARGET_NUMAINFO);
#endif
preempt_enable();
if (unlikely(do_softlimit))
mem_cgroup_update_tree(memcg, page);
#if MAX_NUMNODES > 1
if (unlikely(do_numainfo))
atomic_inc(&memcg->numainfo_events);
#endif
} else
preempt_enable();
struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont)
{
return container_of(cgroup_subsys_state(cont,
mem_cgroup_subsys_id), struct mem_cgroup,
css);
}
struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
/*
* mm_update_next_owner() may clear mm->owner to NULL
* if it races with swapoff, page migration, etc.
* So this can be called with p == NULL.
*/
if (unlikely(!p))
return NULL;
return container_of(task_subsys_state(p, mem_cgroup_subsys_id),
struct mem_cgroup, css);
}
struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm)
/*
* Because we have no locks, mm->owner's may be being moved to other
* cgroup. We use css_tryget() here even if this looks
* pessimistic (rather than adding locks here).
*/
rcu_read_lock();
do {
memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
if (unlikely(!memcg))
} while (!css_tryget(&memcg->css));
/**
* mem_cgroup_iter - iterate over memory cgroup hierarchy
* @root: hierarchy root
* @prev: previously returned memcg, NULL on first invocation
* @reclaim: cookie for shared reclaim walks, NULL for full walks
*
* Returns references to children of the hierarchy below @root, or
* @root itself, or %NULL after a full round-trip.
*
* Caller must pass the return value in @prev on subsequent
* invocations for reference counting, or use mem_cgroup_iter_break()
* to cancel a hierarchy walk before the round-trip is complete.
*
* Reclaimers can specify a zone and a priority level in @reclaim to
* divide up the memcgs in the hierarchy among all concurrent
* reclaimers operating on the same zone and priority.
*/
struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
struct mem_cgroup *prev,
struct mem_cgroup_reclaim_cookie *reclaim)
struct mem_cgroup *memcg = NULL;
int id = 0;
if (mem_cgroup_disabled())
return NULL;
if (!root)
root = root_mem_cgroup;
if (prev && !reclaim)
id = css_id(&prev->css);
if (prev && prev != root)
css_put(&prev->css);
if (!root->use_hierarchy && root != root_mem_cgroup) {
if (prev)
return NULL;
return root;
}
while (!memcg) {
struct mem_cgroup_reclaim_iter *uninitialized_var(iter);
struct cgroup_subsys_state *css;
if (reclaim) {
int nid = zone_to_nid(reclaim->zone);
int zid = zone_idx(reclaim->zone);
struct mem_cgroup_per_zone *mz;
mz = mem_cgroup_zoneinfo(root, nid, zid);
iter = &mz->reclaim_iter[reclaim->priority];
if (prev && reclaim->generation != iter->generation)
return NULL;
id = iter->position;
}
rcu_read_lock();
css = css_get_next(&mem_cgroup_subsys, id + 1, &root->css, &id);
if (css) {
if (css == &root->css || css_tryget(css))
memcg = container_of(css,
struct mem_cgroup, css);
} else
id = 0;
if (reclaim) {
iter->position = id;
if (!css)
iter->generation++;
else if (!prev && memcg)
reclaim->generation = iter->generation;
}
if (prev && !css)
return NULL;
}
return memcg;
/**
* mem_cgroup_iter_break - abort a hierarchy walk prematurely
* @root: hierarchy root
* @prev: last visited hierarchy member as returned by mem_cgroup_iter()
*/
void mem_cgroup_iter_break(struct mem_cgroup *root,
struct mem_cgroup *prev)
{
if (!root)
root = root_mem_cgroup;
if (prev && prev != root)
css_put(&prev->css);
}
/*
* Iteration constructs for visiting all cgroups (under a tree). If
* loops are exited prematurely (break), mem_cgroup_iter_break() must
* be used for reference counting.
*/
#define for_each_mem_cgroup_tree(iter, root) \
for (iter = mem_cgroup_iter(root, NULL, NULL); \
iter = mem_cgroup_iter(root, iter, NULL))
#define for_each_mem_cgroup(iter) \
for (iter = mem_cgroup_iter(NULL, NULL, NULL); \
iter != NULL; \