Newer
Older
/* memcontrol.c - Memory Controller
*
* Copyright IBM Corporation, 2007
* Author Balbir Singh <balbir@linux.vnet.ibm.com>
*
* Copyright 2007 OpenVZ SWsoft Inc
* Author: Pavel Emelianov <xemul@openvz.org>
*
* Memory thresholds
* Copyright (C) 2009 Nokia Corporation
* Author: Kirill A. Shutemov
*
* Kernel Memory Controller
* Copyright (C) 2012 Parallels Inc. and Google Inc.
* Authors: Glauber Costa and Suleiman Souhlal
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*/
#include <linux/res_counter.h>
#include <linux/memcontrol.h>
#include <linux/cgroup.h>

KAMEZAWA Hiroyuki
committed
#include <linux/smp.h>
#include <linux/backing-dev.h>
#include <linux/bit_spinlock.h>
#include <linux/rcupdate.h>
#include <linux/export.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/eventfd.h>
#include <linux/sort.h>
#include <linux/seq_file.h>
#include <linux/vmalloc.h>
#include <linux/mm_inline.h>
#include <linux/page_cgroup.h>
#include <linux/oom.h>
#include <asm/uaccess.h>
#include <trace/events/vmscan.h>
struct cgroup_subsys mem_cgroup_subsys __read_mostly;
EXPORT_SYMBOL(mem_cgroup_subsys);
#define MEM_CGROUP_RECLAIM_RETRIES 5
static struct mem_cgroup *root_mem_cgroup __read_mostly;
/* Turned on only when memory cgroup is enabled && really_do_swap_account = 1 */
int do_swap_account __read_mostly;
/* for remember boot option*/
static int really_do_swap_account __initdata = 1;
#else
static int really_do_swap_account __initdata = 0;
#endif
static const char * const mem_cgroup_stat_names[] = {
"cache",
"rss",
enum mem_cgroup_events_index {
MEM_CGROUP_EVENTS_PGPGIN, /* # of pages paged in */
MEM_CGROUP_EVENTS_PGPGOUT, /* # of pages paged out */
MEM_CGROUP_EVENTS_PGFAULT, /* # of page-faults */
MEM_CGROUP_EVENTS_PGMAJFAULT, /* # of major page-faults */
MEM_CGROUP_EVENTS_NSTATS,
};
static const char * const mem_cgroup_events_names[] = {
"pgpgin",
"pgpgout",
"pgfault",
"pgmajfault",
};
static const char * const mem_cgroup_lru_names[] = {
"inactive_anon",
"active_anon",
"inactive_file",
"active_file",
"unevictable",
};
/*
* Per memcg event counter is incremented at every pagein/pageout. With THP,
* it will be incremated by the number of pages. This counter is used for
* for trigger some periodic events. This is straightforward and better
* than using jiffies etc. to handle periodic memcg event.
*/
enum mem_cgroup_events_target {
MEM_CGROUP_TARGET_THRESH,
MEM_CGROUP_TARGET_SOFTLIMIT,
MEM_CGROUP_TARGET_NUMAINFO,
MEM_CGROUP_NTARGETS,
};
#define THRESHOLDS_EVENTS_TARGET 128
#define SOFTLIMIT_EVENTS_TARGET 1024
#define NUMAINFO_EVENTS_TARGET 1024

KAMEZAWA Hiroyuki
committed
struct mem_cgroup_stat_cpu {
long count[MEM_CGROUP_STAT_NSTATS];
unsigned long events[MEM_CGROUP_EVENTS_NSTATS];
unsigned long nr_page_events;
unsigned long targets[MEM_CGROUP_NTARGETS];

KAMEZAWA Hiroyuki
committed
};
struct mem_cgroup_reclaim_iter {
/*
* last scanned hierarchy member. Valid only if last_dead_count
* matches memcg->dead_count of the hierarchy root group.
*/
struct mem_cgroup *last_visited;
/* scan generation, increased every round-trip */
unsigned int generation;
};

KAMEZAWA Hiroyuki
committed
/*
* per-zone information in memory controller.
*/
struct mem_cgroup_per_zone {
unsigned long lru_size[NR_LRU_LISTS];
struct mem_cgroup_reclaim_iter reclaim_iter[DEF_PRIORITY + 1];
struct mem_cgroup *memcg; /* Back pointer, we cannot */
/* use container_of */

KAMEZAWA Hiroyuki
committed
};
struct mem_cgroup_per_node {
struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES];
};
struct mem_cgroup_threshold {
struct eventfd_ctx *eventfd;
u64 threshold;
};
struct mem_cgroup_threshold_ary {
/* An array index points to threshold just below or equal to usage. */
/* Size of entries[] */
unsigned int size;
/* Array of thresholds */
struct mem_cgroup_threshold entries[0];
};
struct mem_cgroup_thresholds {
/* Primary thresholds array */
struct mem_cgroup_threshold_ary *primary;
/*
* Spare threshold array.
* This is needed to make mem_cgroup_unregister_event() "never fail".
* It must be able to store at least primary->size - 1 entries.
*/
struct mem_cgroup_threshold_ary *spare;
};
/* for OOM */
struct mem_cgroup_eventfd_list {
struct list_head list;
struct eventfd_ctx *eventfd;
};
static void mem_cgroup_threshold(struct mem_cgroup *memcg);
static void mem_cgroup_oom_notify(struct mem_cgroup *memcg);
/*
* The memory controller data structure. The memory controller controls both
* page cache and RSS per cgroup. We would eventually like to provide
* statistics based on the statistics developed by Rik Van Riel for clock-pro,
* to help the administrator determine what knobs to tune.
*
* TODO: Add a water mark for the memory controller. Reclaim will begin when
* we hit the water mark. May be even add a low water mark, such that
* no reclaim occurs from a cgroup at it's low water mark, this is
* a feature that will be implemented much later in the future.
*/
struct mem_cgroup {
struct cgroup_subsys_state css;
/*
* the counter to account for memory usage
*/
struct res_counter res;
/* vmpressure notifications */
struct vmpressure vmpressure;
/*
* the counter to account for mem+swap usage.
*/
struct res_counter memsw;
/*
* the counter to account for kernel memory usage.
*/
struct res_counter kmem;
/*
* Should the accounting and control be hierarchical, per subtree?
*/
bool use_hierarchy;
unsigned long kmem_account_flags; /* See KMEM_ACCOUNTED_*, below */
bool oom_lock;
atomic_t under_oom;
atomic_t oom_wakeups;
int swappiness;
/* OOM-Killer disable */
int oom_kill_disable;
/* set when res.limit == memsw.limit */
bool memsw_is_minimum;
/* protect arrays of thresholds */
struct mutex thresholds_lock;
/* thresholds for memory usage. RCU-protected */
struct mem_cgroup_thresholds thresholds;
/* thresholds for mem+swap usage. RCU-protected */
struct mem_cgroup_thresholds memsw_thresholds;
/* For oom notifier event fd */
struct list_head oom_notify;
/*
* Should we move charges of a task when a task is moved into this
* mem_cgroup ? And what type of charges should we move ?
*/
/*
* set > 0 if pages under this cgroup are moving to other cgroup.
*/
atomic_t moving_account;
/* taken only while moving_account > 0 */
spinlock_t move_lock;

KAMEZAWA Hiroyuki
committed
/*
* percpu counter.

KAMEZAWA Hiroyuki
committed
*/
struct mem_cgroup_stat_cpu __percpu *stat;
/*
* used when a cpu is offlined or other synchronizations
* See mem_cgroup_read_stat().
*/
struct mem_cgroup_stat_cpu nocpu_base;
spinlock_t pcp_counter_lock;
#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_INET)
#if defined(CONFIG_MEMCG_KMEM)
/* analogous to slab_common's slab_caches list. per-memcg */
struct list_head memcg_slab_caches;
/* Not a spinlock, we can take a lot of time walking the list */
struct mutex slab_caches_mutex;
/* Index in the kmem_cache->memcg_params->memcg_caches array */
int kmemcg_id;
#endif
int last_scanned_node;
#if MAX_NUMNODES > 1
nodemask_t scan_nodes;
atomic_t numainfo_events;
atomic_t numainfo_updating;
#endif
/*
* Protects soft_contributed transitions.
* See mem_cgroup_update_soft_limit
*/
spinlock_t soft_lock;
/*
* If true then this group has increased parents' children_in_excess
* When a group falls bellow the soft limit, parents' children_in_excess
* is decreased and soft_contributed changed to false.
*/
bool soft_contributed;
/* Number of children that are in soft limit excess */
atomic_t children_in_excess;
struct mem_cgroup_per_node *nodeinfo[0];
/* WARNING: nodeinfo must be the last member here */
static size_t memcg_size(void)
{
return sizeof(struct mem_cgroup) +
nr_node_ids * sizeof(struct mem_cgroup_per_node);
}
/* internal only representation about the status of kmem accounting. */
enum {
KMEM_ACCOUNTED_ACTIVE = 0, /* accounted by this cgroup itself */
KMEM_ACCOUNTED_ACTIVATED, /* static key enabled. */
KMEM_ACCOUNTED_DEAD, /* dead memcg with pending kmem charges */
/* We account when limit is on, but only after call sites are patched */
#define KMEM_ACCOUNTED_MASK \
((1 << KMEM_ACCOUNTED_ACTIVE) | (1 << KMEM_ACCOUNTED_ACTIVATED))
#ifdef CONFIG_MEMCG_KMEM
static inline void memcg_kmem_set_active(struct mem_cgroup *memcg)
{
set_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags);
}
static bool memcg_kmem_is_active(struct mem_cgroup *memcg)
{
return test_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags);
}
static void memcg_kmem_set_activated(struct mem_cgroup *memcg)
{
set_bit(KMEM_ACCOUNTED_ACTIVATED, &memcg->kmem_account_flags);
}
static void memcg_kmem_clear_activated(struct mem_cgroup *memcg)
{
clear_bit(KMEM_ACCOUNTED_ACTIVATED, &memcg->kmem_account_flags);
}
static void memcg_kmem_mark_dead(struct mem_cgroup *memcg)
{
/*
* Our caller must use css_get() first, because memcg_uncharge_kmem()
* will call css_put() if it sees the memcg is dead.
*/
smp_wmb();
if (test_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags))
set_bit(KMEM_ACCOUNTED_DEAD, &memcg->kmem_account_flags);
}
static bool memcg_kmem_test_and_clear_dead(struct mem_cgroup *memcg)
{
return test_and_clear_bit(KMEM_ACCOUNTED_DEAD,
&memcg->kmem_account_flags);
}
/* Stuffs for move charges at task migration. */
/*
* Types of charges to be moved. "move_charge_at_immitgrate" and
* "immigrate_flags" are treated as a left-shifted bitmap of these types.
*/
enum move_type {
MOVE_CHARGE_TYPE_ANON, /* private anonymous page and swap of it */
MOVE_CHARGE_TYPE_FILE, /* file page(including tmpfs) and swap of it */
NR_MOVE_TYPE,
};
/* "mc" and its members are protected by cgroup_mutex */
static struct move_charge_struct {
spinlock_t lock; /* for from, to */
struct mem_cgroup *from;
struct mem_cgroup *to;
unsigned long immigrate_flags;
unsigned long moved_charge;
unsigned long moved_swap;
struct task_struct *moving_task; /* a task moving charges */
wait_queue_head_t waitq; /* a waitq for other context */
} mc = {
.lock = __SPIN_LOCK_UNLOCKED(mc.lock),
.waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq),
};
return test_bit(MOVE_CHARGE_TYPE_ANON, &mc.immigrate_flags);
return test_bit(MOVE_CHARGE_TYPE_FILE, &mc.immigrate_flags);
/*
* Maximum loops in mem_cgroup_hierarchical_reclaim(), used for soft
* limit reclaim to prevent infinite loops, if they ever occur.
*/
#define MEM_CGROUP_MAX_RECLAIM_LOOPS 100
enum charge_type {
MEM_CGROUP_CHARGE_TYPE_CACHE = 0,

Kamezawa Hiroyuki
committed
MEM_CGROUP_CHARGE_TYPE_ANON,
MEM_CGROUP_CHARGE_TYPE_SWAPOUT, /* for accounting swapcache */
MEM_CGROUP_CHARGE_TYPE_DROP, /* a page was unused swap cache */
/* for encoding cft->private value on file */
enum res_type {
_MEM,
_MEMSWAP,
_OOM_TYPE,
#define MEMFILE_PRIVATE(x, val) ((x) << 16 | (val))
#define MEMFILE_TYPE(val) ((val) >> 16 & 0xffff)
#define MEMFILE_ATTR(val) ((val) & 0xffff)
/* Used for OOM nofiier */
#define OOM_CONTROL (0)
/*
* Reclaim flags for mem_cgroup_hierarchical_reclaim
*/
#define MEM_CGROUP_RECLAIM_NOSWAP_BIT 0x0
#define MEM_CGROUP_RECLAIM_NOSWAP (1 << MEM_CGROUP_RECLAIM_NOSWAP_BIT)
#define MEM_CGROUP_RECLAIM_SHRINK_BIT 0x1
#define MEM_CGROUP_RECLAIM_SHRINK (1 << MEM_CGROUP_RECLAIM_SHRINK_BIT)
/*
* The memcg_create_mutex will be held whenever a new cgroup is created.
* As a consequence, any change that needs to protect against new child cgroups
* appearing has to hold it as well.
*/
static DEFINE_MUTEX(memcg_create_mutex);
struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *s)
{
return s ? container_of(s, struct mem_cgroup, css) : NULL;
/* Some nice accessors for the vmpressure. */
struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg)
{
if (!memcg)
memcg = root_mem_cgroup;
return &memcg->vmpressure;
}
struct cgroup_subsys_state *vmpressure_to_css(struct vmpressure *vmpr)
{
return &container_of(vmpr, struct mem_cgroup, vmpressure)->css;
}
struct vmpressure *css_to_vmpressure(struct cgroup_subsys_state *css)
{
return &mem_cgroup_from_css(css)->vmpressure;
}
static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg)
{
return (memcg == root_mem_cgroup);
}
/*
* We restrict the id in the range of [1, 65535], so it can fit into
* an unsigned short.
*/
#define MEM_CGROUP_ID_MAX USHRT_MAX
static inline unsigned short mem_cgroup_id(struct mem_cgroup *memcg)
{
/*
* The ID of the root cgroup is 0, but memcg treat 0 as an
* invalid ID, so we return (cgroup_id + 1).
*/
return memcg->css.cgroup->id + 1;
}
static inline struct mem_cgroup *mem_cgroup_from_id(unsigned short id)
{
struct cgroup_subsys_state *css;
css = css_from_id(id - 1, &mem_cgroup_subsys);
return mem_cgroup_from_css(css);
}
/* Writing them here to avoid exposing memcg's inner layout */
#if defined(CONFIG_INET) && defined(CONFIG_MEMCG_KMEM)
void sock_update_memcg(struct sock *sk)
{
if (mem_cgroup_sockets_enabled) {
struct cg_proto *cg_proto;
BUG_ON(!sk->sk_prot->proto_cgroup);
/* Socket cloning can throw us here with sk_cgrp already
* filled. It won't however, necessarily happen from
* process context. So the test for root memcg given
* the current task's memcg won't help us in this case.
*
* Respecting the original socket's memcg is a better
* decision in this case.
*/
if (sk->sk_cgrp) {
BUG_ON(mem_cgroup_is_root(sk->sk_cgrp->memcg));
css_get(&sk->sk_cgrp->memcg->css);
return;
}
rcu_read_lock();
memcg = mem_cgroup_from_task(current);
cg_proto = sk->sk_prot->proto_cgroup(memcg);
if (!mem_cgroup_is_root(memcg) &&
memcg_proto_active(cg_proto) && css_tryget(&memcg->css)) {
sk->sk_cgrp = cg_proto;
}
rcu_read_unlock();
}
}
EXPORT_SYMBOL(sock_update_memcg);
void sock_release_memcg(struct sock *sk)
{
if (mem_cgroup_sockets_enabled && sk->sk_cgrp) {
struct mem_cgroup *memcg;
WARN_ON(!sk->sk_cgrp->memcg);
memcg = sk->sk_cgrp->memcg;
css_put(&sk->sk_cgrp->memcg->css);
struct cg_proto *tcp_proto_cgroup(struct mem_cgroup *memcg)
{
if (!memcg || mem_cgroup_is_root(memcg))
return NULL;
return &memcg->tcp_mem.cg_proto;
}
EXPORT_SYMBOL(tcp_proto_cgroup);
static void disarm_sock_keys(struct mem_cgroup *memcg)
{
if (!memcg_proto_activated(&memcg->tcp_mem.cg_proto))
return;
static_key_slow_dec(&memcg_socket_limit_enabled);
}
#else
static void disarm_sock_keys(struct mem_cgroup *memcg)
{
}
#endif
#ifdef CONFIG_MEMCG_KMEM
/*
* This will be the memcg's index in each cache's ->memcg_params->memcg_caches.
* There are two main reasons for not using the css_id for this:
* 1) this works better in sparse environments, where we have a lot of memcgs,
* but only a few kmem-limited. Or also, if we have, for instance, 200
* memcgs, and none but the 200th is kmem-limited, we'd have to have a
* 200 entry array for that.
*
* 2) In order not to violate the cgroup API, we would like to do all memory
* allocation in ->create(). At that point, we haven't yet allocated the
* css_id. Having a separate index prevents us from messing with the cgroup
* core for this
*
* The current size of the caches array is stored in
* memcg_limited_groups_array_size. It will double each time we have to
* increase it.
*/
static DEFINE_IDA(kmem_limited_groups);
int memcg_limited_groups_array_size;
/*
* MIN_SIZE is different than 1, because we would like to avoid going through
* the alloc/free process all the time. In a small machine, 4 kmem-limited
* cgroups is a reasonable guess. In the future, it could be a parameter or
* tunable, but that is strictly not necessary.
*
* MAX_SIZE should be as large as the number of css_ids. Ideally, we could get
* this constant directly from cgroup, but it is understandable that this is
* better kept as an internal representation in cgroup.c. In any case, the
* css_id space is not getting any smaller, and we don't have to necessarily
* increase ours as well if it increases.
*/
#define MEMCG_CACHES_MIN_SIZE 4
#define MEMCG_CACHES_MAX_SIZE 65535
/*
* A lot of the calls to the cache allocation functions are expected to be
* inlined by the compiler. Since the calls to memcg_kmem_get_cache are
* conditional to this static branch, we'll have to allow modules that does
* kmem_cache_alloc and the such to see this symbol as well
*/
struct static_key memcg_kmem_enabled_key;
EXPORT_SYMBOL(memcg_kmem_enabled_key);
static void disarm_kmem_keys(struct mem_cgroup *memcg)
{
if (memcg_kmem_is_active(memcg)) {
static_key_slow_dec(&memcg_kmem_enabled_key);
ida_simple_remove(&kmem_limited_groups, memcg->kmemcg_id);
}
/*
* This check can't live in kmem destruction function,
* since the charges will outlive the cgroup
*/
WARN_ON(res_counter_read_u64(&memcg->kmem, RES_USAGE) != 0);
}
#else
static void disarm_kmem_keys(struct mem_cgroup *memcg)
{
}
#endif /* CONFIG_MEMCG_KMEM */
static void disarm_static_keys(struct mem_cgroup *memcg)
{
disarm_sock_keys(memcg);
disarm_kmem_keys(memcg);
}
static void drain_all_stock_async(struct mem_cgroup *memcg);
static struct mem_cgroup_per_zone *
mem_cgroup_zoneinfo(struct mem_cgroup *memcg, int nid, int zid)
VM_BUG_ON((unsigned)nid >= nr_node_ids);
return &memcg->nodeinfo[nid]->zoneinfo[zid];
struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *memcg)
static struct mem_cgroup_per_zone *
page_cgroup_zoneinfo(struct mem_cgroup *memcg, struct page *page)
int nid = page_to_nid(page);
int zid = page_zonenum(page);
return mem_cgroup_zoneinfo(memcg, nid, zid);
/*
* Implementation Note: reading percpu statistics for memcg.
*
* Both of vmstat[] and percpu_counter has threshold and do periodic
* synchronization to implement "quick" read. There are trade-off between
* reading cost and precision of value. Then, we may have a chance to implement
* a periodic synchronizion of counter in memcg's counter.
*
* But this _read() function is used for user interface now. The user accounts
* memory usage by memory cgroup and he _always_ requires exact value because
* he accounts memory. Even if we provide quick-and-fuzzy read, we always
* have to visit all online cpus and make sum. So, for now, unnecessary
* synchronization is not implemented. (just implemented for cpu hotplug)
*
* If there are kernel internal actions which can make use of some not-exact
* value, and reading all cpu value can be performance bottleneck in some
* common workload, threashold and synchonization as vmstat[] should be
* implemented.
*/
static long mem_cgroup_read_stat(struct mem_cgroup *memcg,
enum mem_cgroup_stat_index idx)
int cpu;
get_online_cpus();
for_each_online_cpu(cpu)
val += per_cpu(memcg->stat->count[idx], cpu);
#ifdef CONFIG_HOTPLUG_CPU
spin_lock(&memcg->pcp_counter_lock);
val += memcg->nocpu_base.count[idx];
spin_unlock(&memcg->pcp_counter_lock);
#endif
put_online_cpus();
return val;
}
static void mem_cgroup_swap_statistics(struct mem_cgroup *memcg,
bool charge)
{
int val = (charge) ? 1 : -1;
this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_SWAP], val);
static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg,
enum mem_cgroup_events_index idx)
{
unsigned long val = 0;
int cpu;
for_each_online_cpu(cpu)
val += per_cpu(memcg->stat->events[idx], cpu);
#ifdef CONFIG_HOTPLUG_CPU
spin_lock(&memcg->pcp_counter_lock);
val += memcg->nocpu_base.events[idx];
spin_unlock(&memcg->pcp_counter_lock);
#endif
return val;
}
static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,

KAMEZAWA Hiroyuki
committed
{
preempt_disable();
/*
* Here, RSS means 'mapped anon' and anon's SwapCache. Shmem/tmpfs is
* counted as CACHE even if it's on ANON LRU.
*/
if (anon)
__this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS],

KAMEZAWA Hiroyuki
committed
else
__this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_CACHE],
if (PageTransHuge(page))
__this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE],
nr_pages);
/* pagein of a big page is an event. So, ignore page size */
if (nr_pages > 0)
__this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGIN]);
__this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGOUT]);
nr_pages = -nr_pages; /* for event */
}
__this_cpu_add(memcg->stat->nr_page_events, nr_pages);
preempt_enable();

KAMEZAWA Hiroyuki
committed
}
mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru)
{
struct mem_cgroup_per_zone *mz;
mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec);
return mz->lru_size[lru];
}
static unsigned long
mem_cgroup_zone_nr_lru_pages(struct mem_cgroup *memcg, int nid, int zid,
unsigned int lru_mask)
{
struct mem_cgroup_per_zone *mz;
unsigned long ret = 0;
mz = mem_cgroup_zoneinfo(memcg, nid, zid);
for_each_lru(lru) {
if (BIT(lru) & lru_mask)
ret += mz->lru_size[lru];
}
return ret;
}
static unsigned long
mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
int nid, unsigned int lru_mask)
{
u64 total = 0;
int zid;
for (zid = 0; zid < MAX_NR_ZONES; zid++)
total += mem_cgroup_zone_nr_lru_pages(memcg,
nid, zid, lru_mask);
static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg,
unsigned int lru_mask)

KAMEZAWA Hiroyuki
committed
{

KAMEZAWA Hiroyuki
committed
u64 total = 0;
for_each_node_state(nid, N_MEMORY)
total += mem_cgroup_node_nr_lru_pages(memcg, nid, lru_mask);

KAMEZAWA Hiroyuki
committed
return total;

KAMEZAWA Hiroyuki
committed
}
static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
enum mem_cgroup_events_target target)
{
unsigned long val, next;
val = __this_cpu_read(memcg->stat->nr_page_events);
next = __this_cpu_read(memcg->stat->targets[target]);
/* from time_after() in jiffies.h */
if ((long)next - (long)val < 0) {
switch (target) {
case MEM_CGROUP_TARGET_THRESH:
next = val + THRESHOLDS_EVENTS_TARGET;
break;
case MEM_CGROUP_TARGET_SOFTLIMIT:
next = val + SOFTLIMIT_EVENTS_TARGET;
break;
case MEM_CGROUP_TARGET_NUMAINFO:
next = val + NUMAINFO_EVENTS_TARGET;
break;
default:
break;
}
__this_cpu_write(memcg->stat->targets[target], next);
return true;
return false;
* Called from rate-limited memcg_check_events when enough
* MEM_CGROUP_TARGET_SOFTLIMIT events are accumulated and it makes sure
* that all the parents up the hierarchy will be notified that this group
* is in excess or that it is not in excess anymore. mmecg->soft_contributed
* makes the transition a single action whenever the state flips from one to
*/
static void mem_cgroup_update_soft_limit(struct mem_cgroup *memcg)
{
unsigned long long excess = res_counter_soft_limit_excess(&memcg->res);
struct mem_cgroup *parent = memcg;
int delta = 0;
spin_lock(&memcg->soft_lock);
if (excess) {
if (!memcg->soft_contributed) {
delta = 1;
memcg->soft_contributed = true;
}
} else {
if (memcg->soft_contributed) {
delta = -1;
memcg->soft_contributed = false;
}
}
/*
* Necessary to update all ancestors when hierarchy is used
* because their event counter is not touched.
* We track children even outside the hierarchy for the root
* cgroup because tree walk starting at root should visit
* all cgroups and we want to prevent from pointless tree
* walk if no children is below the limit.
*/
while (delta && (parent = parent_mem_cgroup(parent)))
atomic_add(delta, &parent->children_in_excess);
if (memcg != root_mem_cgroup && !root_mem_cgroup->use_hierarchy)
atomic_add(delta, &root_mem_cgroup->children_in_excess);
spin_unlock(&memcg->soft_lock);
}
/*
* Check events in order.
*
*/
static void memcg_check_events(struct mem_cgroup *memcg, struct page *page)
preempt_disable();
/* threshold event is triggered in finer grain than soft limit */
if (unlikely(mem_cgroup_event_ratelimit(memcg,
MEM_CGROUP_TARGET_THRESH))) {
bool do_softlimit;
bool do_numainfo __maybe_unused;
do_softlimit = mem_cgroup_event_ratelimit(memcg,
MEM_CGROUP_TARGET_SOFTLIMIT);
#if MAX_NUMNODES > 1
do_numainfo = mem_cgroup_event_ratelimit(memcg,
MEM_CGROUP_TARGET_NUMAINFO);
#endif
preempt_enable();
if (unlikely(do_softlimit))
mem_cgroup_update_soft_limit(memcg);
#if MAX_NUMNODES > 1
if (unlikely(do_numainfo))
atomic_inc(&memcg->numainfo_events);
#endif
} else
preempt_enable();
struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
/*
* mm_update_next_owner() may clear mm->owner to NULL
* if it races with swapoff, page migration, etc.
* So this can be called with p == NULL.
*/
if (unlikely(!p))
return NULL;
return mem_cgroup_from_css(task_css(p, mem_cgroup_subsys_id));
struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm)
/*
* Because we have no locks, mm->owner's may be being moved to other
* cgroup. We use css_tryget() here even if this looks
* pessimistic (rather than adding locks here).
*/
rcu_read_lock();
do {
memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
if (unlikely(!memcg))
} while (!css_tryget(&memcg->css));
static enum mem_cgroup_filter_t
mem_cgroup_filter(struct mem_cgroup *memcg, struct mem_cgroup *root,
mem_cgroup_iter_filter cond)
{
if (!cond)
return VISIT;
return cond(memcg, root);
}
/*
* Returns a next (in a pre-order walk) alive memcg (with elevated css
* ref. count) or NULL if the whole root's subtree has been visited.
*
* helper function to be used by mem_cgroup_iter
*/
static struct mem_cgroup *__mem_cgroup_iter_next(struct mem_cgroup *root,
struct mem_cgroup *last_visited, mem_cgroup_iter_filter cond)

Tejun Heo
committed
struct cgroup_subsys_state *prev_css, *next_css;

Tejun Heo
committed
prev_css = last_visited ? &last_visited->css : NULL;

Tejun Heo
committed
next_css = css_next_descendant_pre(prev_css, &root->css);
/*
* Even if we found a group we have to make sure it is
* alive. css && !memcg means that the groups should be
* skipped and we should continue the tree walk.
* last_visited css is safe to use because it is
* protected by css_get and the tree walk is rcu safe.
*/

Tejun Heo
committed
if (next_css) {
struct mem_cgroup *mem = mem_cgroup_from_css(next_css);
switch (mem_cgroup_filter(mem, root, cond)) {
case SKIP:

Tejun Heo
committed
prev_css = next_css;
case SKIP_TREE:
if (mem == root)
return NULL;
/*
* css_rightmost_descendant is not an optimal way to
* skip through a subtree (especially for imbalanced
* trees leaning to right) but that's what we have right
* now. More effective solution would be traversing