Newer
Older
/* memcontrol.c - Memory Controller
*
* Copyright IBM Corporation, 2007
* Author Balbir Singh <balbir@linux.vnet.ibm.com>
*
* Copyright 2007 OpenVZ SWsoft Inc
* Author: Pavel Emelianov <xemul@openvz.org>
*
* Memory thresholds
* Copyright (C) 2009 Nokia Corporation
* Author: Kirill A. Shutemov
*
* Kernel Memory Controller
* Copyright (C) 2012 Parallels Inc. and Google Inc.
* Authors: Glauber Costa and Suleiman Souhlal
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*/
#include <linux/res_counter.h>
#include <linux/memcontrol.h>
#include <linux/cgroup.h>

KAMEZAWA Hiroyuki
committed
#include <linux/smp.h>
#include <linux/backing-dev.h>
#include <linux/bit_spinlock.h>
#include <linux/rcupdate.h>
#include <linux/export.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/eventfd.h>
#include <linux/sort.h>
#include <linux/seq_file.h>
#include <linux/vmalloc.h>
#include <linux/mm_inline.h>
#include <linux/page_cgroup.h>
#include <linux/oom.h>
#include <asm/uaccess.h>
#include <trace/events/vmscan.h>
struct cgroup_subsys mem_cgroup_subsys __read_mostly;
EXPORT_SYMBOL(mem_cgroup_subsys);
#define MEM_CGROUP_RECLAIM_RETRIES 5
static struct mem_cgroup *root_mem_cgroup __read_mostly;
/* Turned on only when memory cgroup is enabled && really_do_swap_account = 1 */
int do_swap_account __read_mostly;
/* remembers the boot option */
static int really_do_swap_account __initdata = 1;
#else
static int really_do_swap_account __initdata = 0;
#endif
static const char * const mem_cgroup_stat_names[] = {
"cache",
"rss",
/*
 * Indices into mem_cgroup_stat_cpu.events[]; MEM_CGROUP_EVENTS_NSTATS is
 * the number of event counters and must stay last.
 */
enum mem_cgroup_events_index {
	MEM_CGROUP_EVENTS_PGPGIN = 0,	/* # of pages paged in */
	MEM_CGROUP_EVENTS_PGPGOUT,	/* # of pages paged out */
	MEM_CGROUP_EVENTS_PGFAULT,	/* # of page-faults */
	MEM_CGROUP_EVENTS_PGMAJFAULT,	/* # of major page-faults */
	MEM_CGROUP_EVENTS_NSTATS,
};

/* Printable name of each event counter, keyed by its enum index. */
static const char * const mem_cgroup_events_names[] = {
	[MEM_CGROUP_EVENTS_PGPGIN]	= "pgpgin",
	[MEM_CGROUP_EVENTS_PGPGOUT]	= "pgpgout",
	[MEM_CGROUP_EVENTS_PGFAULT]	= "pgfault",
	[MEM_CGROUP_EVENTS_PGMAJFAULT]	= "pgmajfault",
};
/* Printable names of the LRU lists, in table order. */
static const char * const mem_cgroup_lru_names[] = {
	"inactive_anon", "active_anon",
	"inactive_file", "active_file",
	"unevictable",
};
/*
 * The per-memcg event counter is incremented at every pagein/pageout.
 * With THP it is incremented by the number of pages. The counter is used
 * to trigger the periodic events listed here, which is straightforward
 * and better than using jiffies etc. to handle periodic memcg events.
 */
enum mem_cgroup_events_target {
	MEM_CGROUP_TARGET_THRESH = 0,
	MEM_CGROUP_TARGET_NUMAINFO,
	MEM_CGROUP_NTARGETS,	/* number of targets, keep last */
};
/*
 * Distance, in page events, between two firings of a periodic target:
 * mem_cgroup_event_ratelimit() adds the matching value to the current
 * event count to compute the next trigger point.
 * NOTE(review): no MEM_CGROUP_TARGET_* entry corresponds to
 * SOFTLIMIT_EVENTS_TARGET - confirm it is still needed.
 */
#define THRESHOLDS_EVENTS_TARGET 128
#define SOFTLIMIT_EVENTS_TARGET 1024
#define NUMAINFO_EVENTS_TARGET 1024

KAMEZAWA Hiroyuki
committed
struct mem_cgroup_stat_cpu {
long count[MEM_CGROUP_STAT_NSTATS];
unsigned long events[MEM_CGROUP_EVENTS_NSTATS];
unsigned long nr_page_events;
unsigned long targets[MEM_CGROUP_NTARGETS];

KAMEZAWA Hiroyuki
committed
};
/* Shared position of a reclaim walk over a memcg hierarchy. */
struct mem_cgroup_reclaim_iter {
	/*
	 * last scanned hierarchy member. Valid only if last_dead_count
	 * matches memcg->dead_count of the hierarchy root group.
	 */
	struct mem_cgroup *last_visited;
	/*
	 * dead_count of the hierarchy root sampled when last_visited was
	 * stored; written by mem_cgroup_iter_update() and compared by
	 * mem_cgroup_iter_load(), both of which pass it around as an int.
	 */
	int last_dead_count;
	/* scan generation, increased every round-trip */
	unsigned int generation;
};

KAMEZAWA Hiroyuki
committed
/*
* per-zone information in memory controller.
*/
struct mem_cgroup_per_zone {
unsigned long lru_size[NR_LRU_LISTS];
struct mem_cgroup_reclaim_iter reclaim_iter[DEF_PRIORITY + 1];
struct mem_cgroup *memcg; /* Back pointer, we cannot */
/* use container_of */

KAMEZAWA Hiroyuki
committed
};
/* Per-node memcg data: one mem_cgroup_per_zone slot for each zone index. */
struct mem_cgroup_per_node {
	struct mem_cgroup_per_zone	zoneinfo[MAX_NR_ZONES];
};
/* A single threshold entry: an eventfd context paired with a u64 value. */
struct mem_cgroup_threshold {
	struct eventfd_ctx	*eventfd;
	u64			threshold;
};
/* Variable-length array of thresholds, allocated with its entries. */
struct mem_cgroup_threshold_ary {
	/* An array index points to threshold just below or equal to usage. */
	int current_threshold;
	/* Size of entries[] */
	unsigned int size;
	/* Array of thresholds */
	struct mem_cgroup_threshold entries[];
};
/* The pair of threshold arrays kept for one kind of usage counter. */
struct mem_cgroup_thresholds {
	/* Primary thresholds array */
	struct mem_cgroup_threshold_ary	*primary;

	/*
	 * Spare threshold array.
	 * This is needed to make mem_cgroup_unregister_event() "never fail".
	 * It must be able to store at least primary->size - 1 entries.
	 */
	struct mem_cgroup_threshold_ary	*spare;
};
/* for OOM: list node carrying one eventfd context */
struct mem_cgroup_eventfd_list {
	struct list_head	list;
	struct eventfd_ctx	*eventfd;
};
static void mem_cgroup_threshold(struct mem_cgroup *memcg);
static void mem_cgroup_oom_notify(struct mem_cgroup *memcg);
/*
* The memory controller data structure. The memory controller controls both
* page cache and RSS per cgroup. We would eventually like to provide
* statistics based on the statistics developed by Rik Van Riel for clock-pro,
* to help the administrator determine what knobs to tune.
*
* TODO: Add a water mark for the memory controller. Reclaim will begin when
* we hit the water mark. May be even add a low water mark, such that
* no reclaim occurs from a cgroup at it's low water mark, this is
* a feature that will be implemented much later in the future.
*/
struct mem_cgroup {
struct cgroup_subsys_state css;
/*
* the counter to account for memory usage
*/
struct res_counter res;
/* vmpressure notifications */
struct vmpressure vmpressure;
/*
* the counter to account for mem+swap usage.
*/
struct res_counter memsw;
/*
* the counter to account for kernel memory usage.
*/
struct res_counter kmem;
/*
* Should the accounting and control be hierarchical, per subtree?
*/
bool use_hierarchy;
unsigned long kmem_account_flags; /* See KMEM_ACCOUNTED_*, below */
bool oom_lock;
atomic_t under_oom;
atomic_t oom_wakeups;
int swappiness;
/* OOM-Killer disable */
int oom_kill_disable;
/* set when res.limit == memsw.limit */
bool memsw_is_minimum;
/* protect arrays of thresholds */
struct mutex thresholds_lock;
/* thresholds for memory usage. RCU-protected */
struct mem_cgroup_thresholds thresholds;
/* thresholds for mem+swap usage. RCU-protected */
struct mem_cgroup_thresholds memsw_thresholds;
/* For oom notifier event fd */
struct list_head oom_notify;
/*
* Should we move charges of a task when a task is moved into this
* mem_cgroup ? And what type of charges should we move ?
*/
/*
* set > 0 if pages under this cgroup are moving to other cgroup.
*/
atomic_t moving_account;
/* taken only while moving_account > 0 */
spinlock_t move_lock;

KAMEZAWA Hiroyuki
committed
/*
* percpu counter.

KAMEZAWA Hiroyuki
committed
*/
struct mem_cgroup_stat_cpu __percpu *stat;
/*
* used when a cpu is offlined or other synchronizations
* See mem_cgroup_read_stat().
*/
struct mem_cgroup_stat_cpu nocpu_base;
spinlock_t pcp_counter_lock;
#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_INET)
#if defined(CONFIG_MEMCG_KMEM)
/* analogous to slab_common's slab_caches list. per-memcg */
struct list_head memcg_slab_caches;
/* Not a spinlock, we can take a lot of time walking the list */
struct mutex slab_caches_mutex;
/* Index in the kmem_cache->memcg_params->memcg_caches array */
int kmemcg_id;
#endif
int last_scanned_node;
#if MAX_NUMNODES > 1
nodemask_t scan_nodes;
atomic_t numainfo_events;
atomic_t numainfo_updating;
#endif
struct mem_cgroup_per_node *nodeinfo[0];
/* WARNING: nodeinfo must be the last member here */
/*
 * Number of bytes to allocate for a mem_cgroup: the fixed part plus
 * nr_node_ids trailing nodeinfo[] slots. nodeinfo[] is an array of
 * pointers, so each slot is pointer-sized; sizing the slots with
 * sizeof(struct mem_cgroup_per_node) would grossly over-allocate.
 */
static size_t memcg_size(void)
{
	return sizeof(struct mem_cgroup) +
		nr_node_ids * sizeof(struct mem_cgroup_per_node *);
}
/*
 * internal only representation about the status of kmem accounting.
 * The values are bit numbers within mem_cgroup->kmem_account_flags.
 */
enum {
	KMEM_ACCOUNTED_ACTIVE = 0, /* accounted by this cgroup itself */
	KMEM_ACCOUNTED_ACTIVATED, /* static key enabled. */
	KMEM_ACCOUNTED_DEAD, /* dead memcg with pending kmem charges */
};

/* We account when limit is on, but only after call sites are patched */
#define KMEM_ACCOUNTED_MASK \
	((1 << KMEM_ACCOUNTED_ACTIVE) | (1 << KMEM_ACCOUNTED_ACTIVATED))
#ifdef CONFIG_MEMCG_KMEM
/* Set the KMEM_ACCOUNTED_ACTIVE bit in @memcg's kmem_account_flags. */
static inline void memcg_kmem_set_active(struct mem_cgroup *memcg)
{
	set_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags);
}

/* Return true if the KMEM_ACCOUNTED_ACTIVE bit is set for @memcg. */
static bool memcg_kmem_is_active(struct mem_cgroup *memcg)
{
	return test_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags);
}

/* Set the KMEM_ACCOUNTED_ACTIVATED bit in @memcg's kmem_account_flags. */
static void memcg_kmem_set_activated(struct mem_cgroup *memcg)
{
	set_bit(KMEM_ACCOUNTED_ACTIVATED, &memcg->kmem_account_flags);
}

/* Clear the KMEM_ACCOUNTED_ACTIVATED bit in @memcg's kmem_account_flags. */
static void memcg_kmem_clear_activated(struct mem_cgroup *memcg)
{
	clear_bit(KMEM_ACCOUNTED_ACTIVATED, &memcg->kmem_account_flags);
}

/*
 * Flag @memcg as dead (KMEM_ACCOUNTED_DEAD), but only if kmem accounting
 * is active for it. A write barrier is issued before the flags are
 * examined.
 */
static void memcg_kmem_mark_dead(struct mem_cgroup *memcg)
{
	/*
	 * Our caller must use css_get() first, because memcg_uncharge_kmem()
	 * will call css_put() if it sees the memcg is dead.
	 */
	smp_wmb();
	if (test_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags))
		set_bit(KMEM_ACCOUNTED_DEAD, &memcg->kmem_account_flags);
}

/*
 * Atomically clear KMEM_ACCOUNTED_DEAD and return whether it was set
 * before the call.
 */
static bool memcg_kmem_test_and_clear_dead(struct mem_cgroup *memcg)
{
	return test_and_clear_bit(KMEM_ACCOUNTED_DEAD,
				  &memcg->kmem_account_flags);
}
#endif /* CONFIG_MEMCG_KMEM */
/* Support for moving charges at task migration. */
/*
 * Types of charges to be moved. "move_charge_at_immigrate" and
 * "immigrate_flags" are treated as a left-shifted bitmap of these types.
 */
enum move_type {
	MOVE_CHARGE_TYPE_ANON = 0,	/* private anonymous page and swap of it */
	MOVE_CHARGE_TYPE_FILE,		/* file page(including tmpfs) and swap of it */
	NR_MOVE_TYPE,			/* number of types, keep last */
};
/* "mc" and its members are protected by cgroup_mutex */
static struct move_charge_struct {
	spinlock_t		lock;		/* for from, to */
	struct mem_cgroup	*from;
	struct mem_cgroup	*to;
	unsigned long		immigrate_flags;
	unsigned long		moved_charge;
	unsigned long		moved_swap;
	struct task_struct	*moving_task;	/* a task moving charges */
	wait_queue_head_t	waitq;		/* a waitq for other context */
} mc = {
	/* only the lock and the wait queue need static initializers */
	.lock	= __SPIN_LOCK_UNLOCKED(mc.lock),
	.waitq	= __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq),
};
return test_bit(MOVE_CHARGE_TYPE_ANON, &mc.immigrate_flags);
return test_bit(MOVE_CHARGE_TYPE_FILE, &mc.immigrate_flags);
/*
* Maximum loops in mem_cgroup_hierarchical_reclaim(), used for soft
* limit reclaim to prevent infinite loops, if they ever occur.
*/
#define MEM_CGROUP_MAX_RECLAIM_LOOPS 100
/* Kind of charge being accounted. */
enum charge_type {
	MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
	MEM_CGROUP_CHARGE_TYPE_ANON,
	MEM_CGROUP_CHARGE_TYPE_SWAPOUT, /* for accounting swapcache */
	MEM_CGROUP_CHARGE_TYPE_DROP, /* a page was unused swap cache */
};
/*
 * for encoding cft->private value on file: the resource type goes into
 * the upper 16 bits (see MEMFILE_PRIVATE / MEMFILE_TYPE).
 */
enum res_type {
	_MEM,
	_MEMSWAP,
	_OOM_TYPE,
};
#define MEMFILE_PRIVATE(x, val) ((x) << 16 | (val))
#define MEMFILE_TYPE(val) ((val) >> 16 & 0xffff)
#define MEMFILE_ATTR(val) ((val) & 0xffff)
/* Used for OOM notifier */
#define OOM_CONTROL (0)
/*
* Reclaim flags for mem_cgroup_hierarchical_reclaim
*/
#define MEM_CGROUP_RECLAIM_NOSWAP_BIT 0x0
#define MEM_CGROUP_RECLAIM_NOSWAP (1 << MEM_CGROUP_RECLAIM_NOSWAP_BIT)
#define MEM_CGROUP_RECLAIM_SHRINK_BIT 0x1
#define MEM_CGROUP_RECLAIM_SHRINK (1 << MEM_CGROUP_RECLAIM_SHRINK_BIT)
/*
* The memcg_create_mutex will be held whenever a new cgroup is created.
* As a consequence, any change that needs to protect against new child cgroups
* appearing has to hold it as well.
*/
static DEFINE_MUTEX(memcg_create_mutex);
/*
 * Map a cgroup_subsys_state back to the mem_cgroup that embeds it as
 * ->css. A NULL @s yields NULL.
 */
struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *s)
{
	return s ? container_of(s, struct mem_cgroup, css) : NULL;
}
/* Some nice accessors for the vmpressure. */
/*
 * Return the vmpressure structure embedded in @memcg. A NULL @memcg
 * selects root_mem_cgroup.
 */
struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg)
{
	struct mem_cgroup *owner = memcg ? memcg : root_mem_cgroup;

	return &owner->vmpressure;
}
struct cgroup_subsys_state *vmpressure_to_css(struct vmpressure *vmpr)
{
return &container_of(vmpr, struct mem_cgroup, vmpressure)->css;
}
struct vmpressure *css_to_vmpressure(struct cgroup_subsys_state *css)
{
return &mem_cgroup_from_css(css)->vmpressure;
}
/* True when @memcg is the root memory cgroup (pointer comparison). */
static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg)
{
	return memcg == root_mem_cgroup;
}
/* Writing them here to avoid exposing memcg's inner layout */
#if defined(CONFIG_INET) && defined(CONFIG_MEMCG_KMEM)
void sock_update_memcg(struct sock *sk)
{
if (mem_cgroup_sockets_enabled) {
struct cg_proto *cg_proto;
BUG_ON(!sk->sk_prot->proto_cgroup);
/* Socket cloning can throw us here with sk_cgrp already
* filled. It won't however, necessarily happen from
* process context. So the test for root memcg given
* the current task's memcg won't help us in this case.
*
* Respecting the original socket's memcg is a better
* decision in this case.
*/
if (sk->sk_cgrp) {
BUG_ON(mem_cgroup_is_root(sk->sk_cgrp->memcg));
css_get(&sk->sk_cgrp->memcg->css);
return;
}
rcu_read_lock();
memcg = mem_cgroup_from_task(current);
cg_proto = sk->sk_prot->proto_cgroup(memcg);
if (!mem_cgroup_is_root(memcg) &&
memcg_proto_active(cg_proto) && css_tryget(&memcg->css)) {
sk->sk_cgrp = cg_proto;
}
rcu_read_unlock();
}
}
EXPORT_SYMBOL(sock_update_memcg);
void sock_release_memcg(struct sock *sk)
{
if (mem_cgroup_sockets_enabled && sk->sk_cgrp) {
struct mem_cgroup *memcg;
WARN_ON(!sk->sk_cgrp->memcg);
memcg = sk->sk_cgrp->memcg;
css_put(&sk->sk_cgrp->memcg->css);
/*
 * Return the TCP cg_proto embedded in @memcg, or NULL when @memcg is
 * NULL or the root memcg.
 */
struct cg_proto *tcp_proto_cgroup(struct mem_cgroup *memcg)
{
	if (memcg && !mem_cgroup_is_root(memcg))
		return &memcg->tcp_mem.cg_proto;

	return NULL;
}
EXPORT_SYMBOL(tcp_proto_cgroup);
/*
 * Drop one reference on the memcg_socket_limit_enabled static key, but
 * only if @memcg's TCP cg_proto reports itself as activated.
 */
static void disarm_sock_keys(struct mem_cgroup *memcg)
{
	if (memcg_proto_activated(&memcg->tcp_mem.cg_proto))
		static_key_slow_dec(&memcg_socket_limit_enabled);
}
#else
/*
 * Empty variant for builds without both CONFIG_INET and
 * CONFIG_MEMCG_KMEM: there is no socket key to drop.
 */
static void disarm_sock_keys(struct mem_cgroup *memcg)
{
}
#endif
#ifdef CONFIG_MEMCG_KMEM
/*
* This will be the memcg's index in each cache's ->memcg_params->memcg_caches.
* There are two main reasons for not using the css_id for this:
* 1) this works better in sparse environments, where we have a lot of memcgs,
* but only a few kmem-limited. Or also, if we have, for instance, 200
* memcgs, and none but the 200th is kmem-limited, we'd have to have a
* 200 entry array for that.
*
* 2) In order not to violate the cgroup API, we would like to do all memory
* allocation in ->create(). At that point, we haven't yet allocated the
* css_id. Having a separate index prevents us from messing with the cgroup
* core for this
*
* The current size of the caches array is stored in
* memcg_limited_groups_array_size. It will double each time we have to
* increase it.
*/
static DEFINE_IDA(kmem_limited_groups);
int memcg_limited_groups_array_size;
/*
* MIN_SIZE is different than 1, because we would like to avoid going through
* the alloc/free process all the time. In a small machine, 4 kmem-limited
* cgroups is a reasonable guess. In the future, it could be a parameter or
* tunable, but that is strictly not necessary.
*
* MAX_SIZE should be as large as the number of css_ids. Ideally, we could get
* this constant directly from cgroup, but it is understandable that this is
* better kept as an internal representation in cgroup.c. In any case, the
* css_id space is not getting any smaller, and we don't have to necessarily
* increase ours as well if it increases.
*/
#define MEMCG_CACHES_MIN_SIZE 4
#define MEMCG_CACHES_MAX_SIZE 65535
/*
* A lot of the calls to the cache allocation functions are expected to be
* inlined by the compiler. Since the calls to memcg_kmem_get_cache are
* conditional on this static branch, modules that call kmem_cache_alloc
* and the like must be able to see this symbol as well.
*/
struct static_key memcg_kmem_enabled_key;
EXPORT_SYMBOL(memcg_kmem_enabled_key);
static void disarm_kmem_keys(struct mem_cgroup *memcg)
{
if (memcg_kmem_is_active(memcg)) {
static_key_slow_dec(&memcg_kmem_enabled_key);
ida_simple_remove(&kmem_limited_groups, memcg->kmemcg_id);
}
/*
* This check can't live in kmem destruction function,
* since the charges will outlive the cgroup
*/
WARN_ON(res_counter_read_u64(&memcg->kmem, RES_USAGE) != 0);
}
#else
/* Empty variant for builds without CONFIG_MEMCG_KMEM. */
static void disarm_kmem_keys(struct mem_cgroup *memcg)
{
}
#endif /* CONFIG_MEMCG_KMEM */
/* Release @memcg's static keys: the socket one first, then the kmem one. */
static void disarm_static_keys(struct mem_cgroup *memcg)
{
	disarm_sock_keys(memcg);
	disarm_kmem_keys(memcg);
}
static void drain_all_stock_async(struct mem_cgroup *memcg);
static struct mem_cgroup_per_zone *
mem_cgroup_zoneinfo(struct mem_cgroup *memcg, int nid, int zid)
VM_BUG_ON((unsigned)nid >= nr_node_ids);
return &memcg->nodeinfo[nid]->zoneinfo[zid];
struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *memcg)
/*
 * Return @memcg's per-zone data for the node and zone that @page
 * belongs to.
 */
static struct mem_cgroup_per_zone *
page_cgroup_zoneinfo(struct mem_cgroup *memcg, struct page *page)
{
	int nid = page_to_nid(page);
	int zid = page_zonenum(page);

	return mem_cgroup_zoneinfo(memcg, nid, zid);
}
/*
* Implementation Note: reading percpu statistics for memcg.
*
* Both vmstat[] and percpu_counter use a threshold and periodic
* synchronization to implement a "quick" read. There is a trade-off between
* reading cost and precision of the value. So we may get a chance to
* implement periodic synchronization of memcg's counters as well.
*
* But this _read() function is used for user interface now. The user accounts
* memory usage by memory cgroup and he _always_ requires exact value because
* he accounts memory. Even if we provide quick-and-fuzzy read, we always
* have to visit all online cpus and make sum. So, for now, unnecessary
* synchronization is not implemented. (just implemented for cpu hotplug)
*
* If there are kernel-internal actions which can make use of an inexact
* value, and reading every cpu's value becomes a performance bottleneck in
* some common workload, a threshold and synchronization like vmstat[]'s
* should be implemented.
*/
static long mem_cgroup_read_stat(struct mem_cgroup *memcg,
enum mem_cgroup_stat_index idx)
int cpu;
get_online_cpus();
for_each_online_cpu(cpu)
val += per_cpu(memcg->stat->count[idx], cpu);
#ifdef CONFIG_HOTPLUG_CPU
spin_lock(&memcg->pcp_counter_lock);
val += memcg->nocpu_base.count[idx];
spin_unlock(&memcg->pcp_counter_lock);
#endif
put_online_cpus();
return val;
}
/*
 * Adjust this cpu's MEM_CGROUP_STAT_SWAP counter of @memcg by one:
 * +1 when @charge is true, -1 otherwise.
 */
static void mem_cgroup_swap_statistics(struct mem_cgroup *memcg,
				       bool charge)
{
	int val = (charge) ? 1 : -1;

	this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_SWAP], val);
}
/*
 * Sum the per-cpu event counter @idx of @memcg over all online cpus.
 * With CONFIG_HOTPLUG_CPU the nocpu_base share is added under
 * pcp_counter_lock.
 *
 * Like mem_cgroup_read_stat(), the walk is bracketed by
 * get_online_cpus()/put_online_cpus() so the set of online cpus cannot
 * change between the per-cpu sum and the nocpu_base read.
 */
static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg,
					    enum mem_cgroup_events_index idx)
{
	unsigned long val = 0;
	int cpu;

	get_online_cpus();
	for_each_online_cpu(cpu)
		val += per_cpu(memcg->stat->events[idx], cpu);
#ifdef CONFIG_HOTPLUG_CPU
	spin_lock(&memcg->pcp_counter_lock);
	val += memcg->nocpu_base.events[idx];
	spin_unlock(&memcg->pcp_counter_lock);
#endif
	put_online_cpus();
	return val;
}
static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,

KAMEZAWA Hiroyuki
committed
{
preempt_disable();
/*
* Here, RSS means 'mapped anon' and anon's SwapCache. Shmem/tmpfs is
* counted as CACHE even if it's on ANON LRU.
*/
if (anon)
__this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS],

KAMEZAWA Hiroyuki
committed
else
__this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_CACHE],
if (PageTransHuge(page))
__this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE],
nr_pages);
/* pagein of a big page is an event. So, ignore page size */
if (nr_pages > 0)
__this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGIN]);
__this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGOUT]);
nr_pages = -nr_pages; /* for event */
}
__this_cpu_add(memcg->stat->nr_page_events, nr_pages);
preempt_enable();

KAMEZAWA Hiroyuki
committed
}
/*
 * Return the recorded size of LRU list @lru for the per-zone memcg data
 * that embeds @lruvec. The value comes from the unsigned long lru_size[]
 * array, hence the return type.
 */
unsigned long
mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru)
{
	struct mem_cgroup_per_zone *mz;

	mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec);
	return mz->lru_size[lru];
}
static unsigned long
mem_cgroup_zone_nr_lru_pages(struct mem_cgroup *memcg, int nid, int zid,
unsigned int lru_mask)
{
struct mem_cgroup_per_zone *mz;
unsigned long ret = 0;
mz = mem_cgroup_zoneinfo(memcg, nid, zid);
for_each_lru(lru) {
if (BIT(lru) & lru_mask)
ret += mz->lru_size[lru];
}
return ret;
}
/*
 * Sum the sizes of @memcg's LRU lists selected by @lru_mask over every
 * zone index of node @nid.
 */
static unsigned long
mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
			     int nid, unsigned int lru_mask)
{
	u64 total = 0;
	int zid;

	for (zid = 0; zid < MAX_NR_ZONES; zid++)
		total += mem_cgroup_zone_nr_lru_pages(memcg,
						      nid, zid, lru_mask);

	return total;
}
static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg,
unsigned int lru_mask)

KAMEZAWA Hiroyuki
committed
{

KAMEZAWA Hiroyuki
committed
u64 total = 0;
for_each_node_state(nid, N_MEMORY)
total += mem_cgroup_node_nr_lru_pages(memcg, nid, lru_mask);

KAMEZAWA Hiroyuki
committed
return total;

KAMEZAWA Hiroyuki
committed
}
/*
 * Check whether this cpu's page event count of @memcg has passed the
 * trigger point stored for @target.
 *
 * Returns true, after storing the next trigger point, when the target is
 * due; false otherwise. Uses __this_cpu accessors; memcg_check_events()
 * calls it with preemption disabled.
 */
static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
				       enum mem_cgroup_events_target target)
{
	unsigned long val, next;

	val = __this_cpu_read(memcg->stat->nr_page_events);
	next = __this_cpu_read(memcg->stat->targets[target]);
	/* from time_after() in jiffies.h */
	if ((long)next - (long)val < 0) {
		switch (target) {
		case MEM_CGROUP_TARGET_THRESH:
			next = val + THRESHOLDS_EVENTS_TARGET;
			break;
		case MEM_CGROUP_TARGET_NUMAINFO:
			next = val + NUMAINFO_EVENTS_TARGET;
			break;
		default:
			break;
		}
		__this_cpu_write(memcg->stat->targets[target], next);
		return true;
	}
	return false;
}
/*
* Check events in order.
*
*/
static void memcg_check_events(struct mem_cgroup *memcg, struct page *page)
preempt_disable();
/* threshold event is triggered in finer grain than soft limit */
if (unlikely(mem_cgroup_event_ratelimit(memcg,
MEM_CGROUP_TARGET_THRESH))) {
bool do_numainfo __maybe_unused;
#if MAX_NUMNODES > 1
do_numainfo = mem_cgroup_event_ratelimit(memcg,
MEM_CGROUP_TARGET_NUMAINFO);
#endif
preempt_enable();
#if MAX_NUMNODES > 1
if (unlikely(do_numainfo))
atomic_inc(&memcg->numainfo_events);
#endif
} else
preempt_enable();
struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
/*
* mm_update_next_owner() may clear mm->owner to NULL
* if it races with swapoff, page migration, etc.
* So this can be called with p == NULL.
*/
if (unlikely(!p))
return NULL;
return mem_cgroup_from_css(task_css(p, mem_cgroup_subsys_id));
struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm)
/*
* Because we have no locks, mm->owner's may be being moved to other
* cgroup. We use css_tryget() here even if this looks
* pessimistic (rather than adding locks here).
*/
rcu_read_lock();
do {
memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
if (unlikely(!memcg))
} while (!css_tryget(&memcg->css));
/*
* Returns a next (in a pre-order walk) alive memcg (with elevated css
* ref. count) or NULL if the whole root's subtree has been visited.
*
* helper function to be used by mem_cgroup_iter
*/
static struct mem_cgroup *__mem_cgroup_iter_next(struct mem_cgroup *root,
struct mem_cgroup *last_visited)

Tejun Heo
committed
struct cgroup_subsys_state *prev_css, *next_css;

Tejun Heo
committed
prev_css = last_visited ? &last_visited->css : NULL;

Tejun Heo
committed
next_css = css_next_descendant_pre(prev_css, &root->css);
/*
* Even if we found a group we have to make sure it is
* alive. css && !memcg means that the groups should be
* skipped and we should continue the tree walk.
* last_visited css is safe to use because it is
* protected by css_get and the tree walk is rcu safe.
*/

Tejun Heo
committed
if (next_css) {
struct mem_cgroup *mem = mem_cgroup_from_css(next_css);
if (css_tryget(&mem->css))
return mem;
else {

Tejun Heo
committed
prev_css = next_css;
goto skip_node;
}
}
return NULL;
}
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
/*
 * Bump @root's dead_count so that cached iterator positions recorded
 * against the old count are no longer accepted.
 */
static void mem_cgroup_iter_invalidate(struct mem_cgroup *root)
{
	/*
	 * Once a group below root has been destroyed, a cached iterator
	 * may point at the destroyed group and cannot be trusted any
	 * more. Invalidate it.
	 */
	atomic_inc(&root->dead_count);
}
/*
 * Load the cached walk position from @iter.
 *
 * @sequence receives @root's current dead_count. The cached position is
 * returned, with a css reference taken, only if @iter was stored against
 * that same count; otherwise NULL is returned.
 */
static struct mem_cgroup *
mem_cgroup_iter_load(struct mem_cgroup_reclaim_iter *iter,
		     struct mem_cgroup *root,
		     int *sequence)
{
	struct mem_cgroup *cached;

	/*
	 * A cgroup destruction happens in two stages: offlining and
	 * release. They are separated by a RCU grace period.
	 *
	 * If the iterator is valid, we may still race with an
	 * offlining. The RCU lock ensures the object won't be
	 * released, tryget will fail if we lost the race.
	 */
	*sequence = atomic_read(&root->dead_count);
	if (iter->last_dead_count != *sequence)
		return NULL;

	smp_rmb();
	cached = iter->last_visited;
	if (!cached)
		return NULL;

	return css_tryget(&cached->css) ? cached : NULL;
}
/*
 * Store @new_position as the cached walk position of @iter.
 *
 * Drops the css reference on @last_visited (if any), then publishes the
 * new position followed by @sequence, with a write barrier between the
 * two stores.
 */
static void mem_cgroup_iter_update(struct mem_cgroup_reclaim_iter *iter,
struct mem_cgroup *last_visited,
struct mem_cgroup *new_position,
int sequence)
{
if (last_visited)
css_put(&last_visited->css);
/*
* We store the sequence count from the time @last_visited was
* loaded successfully instead of rereading it here so that we
* don't lose destruction events in between. We could have
* raced with the destruction of @new_position after all.
*/
iter->last_visited = new_position;
/* order the position store before the sequence store */
smp_wmb();
iter->last_dead_count = sequence;
}
/**
* mem_cgroup_iter - iterate over memory cgroup hierarchy
* @root: hierarchy root
* @prev: previously returned memcg, NULL on first invocation
* @reclaim: cookie for shared reclaim walks, NULL for full walks
*
* Returns references to children of the hierarchy below @root, or
* @root itself, or %NULL after a full round-trip.
*
* Caller must pass the return value in @prev on subsequent
* invocations for reference counting, or use mem_cgroup_iter_break()
* to cancel a hierarchy walk before the round-trip is complete.
*
* Reclaimers can specify a zone and a priority level in @reclaim to
* divide up the memcgs in the hierarchy among all concurrent
* reclaimers operating on the same zone and priority.
*/
struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
struct mem_cgroup *prev,
struct mem_cgroup_reclaim_cookie *reclaim)
struct mem_cgroup *memcg = NULL;
struct mem_cgroup *last_visited = NULL;
if (mem_cgroup_disabled())
return NULL;
if (!root)
root = root_mem_cgroup;
if (prev && !reclaim)
last_visited = prev;
if (!root->use_hierarchy && root != root_mem_cgroup) {
if (prev)
goto out_css_put;
return root;
while (!memcg) {
struct mem_cgroup_reclaim_iter *uninitialized_var(iter);
int uninitialized_var(seq);
if (reclaim) {
int nid = zone_to_nid(reclaim->zone);
int zid = zone_idx(reclaim->zone);
struct mem_cgroup_per_zone *mz;