/* memcontrol.c - Memory Controller
 *
 * Copyright IBM Corporation, 2007
 * Author Balbir Singh <balbir@linux.vnet.ibm.com>
 *
 * Copyright 2007 OpenVZ SWsoft Inc
 * Author: Pavel Emelianov <xemul@openvz.org>
 *
 * Memory thresholds
 * Copyright (C) 2009 Nokia Corporation
 * Author: Kirill A. Shutemov
 *
 * Kernel Memory Controller
 * Copyright (C) 2012 Parallels Inc. and Google Inc.
 * Authors: Glauber Costa and Suleiman Souhlal
 *
     * This program is free software; you can redistribute it and/or modify
     * it under the terms of the GNU General Public License as published by
     * the Free Software Foundation; either version 2 of the License, or
     * (at your option) any later version.
     *
     * This program is distributed in the hope that it will be useful,
     * but WITHOUT ANY WARRANTY; without even the implied warranty of
     * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     * GNU General Public License for more details.
     */
    
#include <linux/res_counter.h>
#include <linux/memcontrol.h>
#include <linux/cgroup.h>
#include <linux/mm.h>
#include <linux/hugetlb.h>
#include <linux/pagemap.h>
#include <linux/page-flags.h>
#include <linux/backing-dev.h>
#include <linux/bit_spinlock.h>
#include <linux/rcupdate.h>
#include <linux/limits.h>
#include <linux/mutex.h>
#include <linux/rbtree.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/spinlock.h>
#include <linux/eventfd.h>
#include <linux/sort.h>
#include <linux/fs.h>
#include <linux/seq_file.h>
#include <linux/vmalloc.h>
#include <linux/vmpressure.h>
#include <linux/mm_inline.h>
#include <linux/page_cgroup.h>
#include <linux/cpu.h>
#include "internal.h"
#include <net/sock.h>
#include <net/ip.h>
#include <net/tcp_memcontrol.h>

#include <trace/events/vmscan.h>
    
    
    struct cgroup_subsys mem_cgroup_subsys __read_mostly;
    
    #define MEM_CGROUP_RECLAIM_RETRIES	5
    
    static struct mem_cgroup *root_mem_cgroup __read_mostly;
    
    #ifdef CONFIG_MEMCG_SWAP
    
    /* Turned on only when memory cgroup is enabled && really_do_swap_account = 1 */
    
    int do_swap_account __read_mostly;
    
    
/* to remember the boot option */
    
    #ifdef CONFIG_MEMCG_SWAP_ENABLED
    
    static int really_do_swap_account __initdata = 1;
    #else
    static int really_do_swap_account __initdata = 0;
    #endif
    
    
#else
#define do_swap_account		0
#endif

static const char * const mem_cgroup_stat_names[] = {
	"cache",
	"rss",
	"rss_huge",
	"mapped_file",
	"swap",
};

    enum mem_cgroup_events_index {
    	MEM_CGROUP_EVENTS_PGPGIN,	/* # of pages paged in */
    	MEM_CGROUP_EVENTS_PGPGOUT,	/* # of pages paged out */
    
    	MEM_CGROUP_EVENTS_PGFAULT,	/* # of page-faults */
    	MEM_CGROUP_EVENTS_PGMAJFAULT,	/* # of major page-faults */
    
    	MEM_CGROUP_EVENTS_NSTATS,
    };
    
    
    static const char * const mem_cgroup_events_names[] = {
    	"pgpgin",
    	"pgpgout",
    	"pgfault",
    	"pgmajfault",
    };
    
    
    static const char * const mem_cgroup_lru_names[] = {
    	"inactive_anon",
    	"active_anon",
    	"inactive_file",
    	"active_file",
    	"unevictable",
    };
    
    
/*
 * Per memcg event counter is incremented at every pagein/pageout. With THP,
 * it will be incremented by the number of pages. This counter is used
 * to trigger some periodic events. This is straightforward and better
 * than using jiffies etc. to handle periodic memcg events.
 */
enum mem_cgroup_events_target {
	MEM_CGROUP_TARGET_THRESH,
	MEM_CGROUP_TARGET_SOFTLIMIT,
	MEM_CGROUP_TARGET_NUMAINFO,
	MEM_CGROUP_NTARGETS,
};
#define THRESHOLDS_EVENTS_TARGET 128
#define SOFTLIMIT_EVENTS_TARGET 1024
#define NUMAINFO_EVENTS_TARGET	1024

struct mem_cgroup_stat_cpu {
	long count[MEM_CGROUP_STAT_NSTATS];
	unsigned long events[MEM_CGROUP_EVENTS_NSTATS];
	unsigned long nr_page_events;
	unsigned long targets[MEM_CGROUP_NTARGETS];
};

    struct mem_cgroup_reclaim_iter {
    
    	/*
    	 * last scanned hierarchy member. Valid only if last_dead_count
    	 * matches memcg->dead_count of the hierarchy root group.
    	 */
    
    	struct mem_cgroup *last_visited;
    
    	unsigned long last_dead_count;
    
    
    	/* scan generation, increased every round-trip */
    	unsigned int generation;
    };
    
    
    /*
     * per-zone information in memory controller.
     */
    struct mem_cgroup_per_zone {
    
    	struct lruvec		lruvec;
    
    	unsigned long		lru_size[NR_LRU_LISTS];
    
    	struct mem_cgroup_reclaim_iter reclaim_iter[DEF_PRIORITY + 1];
    
    
    	struct rb_node		tree_node;	/* RB tree node */
    	unsigned long long	usage_in_excess;/* Set to the value by which */
    						/* the soft limit is exceeded*/
    	bool			on_tree;
    
    	struct mem_cgroup	*memcg;		/* Back pointer, we cannot */
    
    						/* use container_of	   */
    
    };
    
    struct mem_cgroup_per_node {
    	struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES];
    };
    
    
    /*
     * Cgroups above their limits are maintained in a RB-Tree, independent of
     * their hierarchy representation
     */
    
    struct mem_cgroup_tree_per_zone {
    	struct rb_root rb_root;
    	spinlock_t lock;
    };
    
    struct mem_cgroup_tree_per_node {
    	struct mem_cgroup_tree_per_zone rb_tree_per_zone[MAX_NR_ZONES];
    };
    
    struct mem_cgroup_tree {
    	struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES];
    };
    
    static struct mem_cgroup_tree soft_limit_tree __read_mostly;
    
    
    struct mem_cgroup_threshold {
    	struct eventfd_ctx *eventfd;
    	u64 threshold;
    };
    
    
    /* For threshold */
    
    struct mem_cgroup_threshold_ary {
    
    	/* An array index points to threshold just below or equal to usage. */
    
    	int current_threshold;
    
    	/* Size of entries[] */
    	unsigned int size;
    	/* Array of thresholds */
    	struct mem_cgroup_threshold entries[0];
    };
    
    
    struct mem_cgroup_thresholds {
    	/* Primary thresholds array */
    	struct mem_cgroup_threshold_ary *primary;
    	/*
    	 * Spare threshold array.
    	 * This is needed to make mem_cgroup_unregister_event() "never fail".
    	 * It must be able to store at least primary->size - 1 entries.
    	 */
    	struct mem_cgroup_threshold_ary *spare;
    };
    
    
    /* for OOM */
    struct mem_cgroup_eventfd_list {
    	struct list_head list;
    	struct eventfd_ctx *eventfd;
    };
    
    static void mem_cgroup_threshold(struct mem_cgroup *memcg);
    static void mem_cgroup_oom_notify(struct mem_cgroup *memcg);
    
    /*
     * The memory controller data structure. The memory controller controls both
     * page cache and RSS per cgroup. We would eventually like to provide
     * statistics based on the statistics developed by Rik Van Riel for clock-pro,
     * to help the administrator determine what knobs to tune.
     *
     * TODO: Add a water mark for the memory controller. Reclaim will begin when
    
 * we hit the water mark. Maybe even add a low water mark, such that
 * no reclaim occurs from a cgroup at its low water mark; this is
 * a feature that will be implemented much later in the future.
    
     */
    struct mem_cgroup {
    	struct cgroup_subsys_state css;
    	/*
    	 * the counter to account for memory usage
    	 */
    	struct res_counter res;
    
    	/* vmpressure notifications */
    	struct vmpressure vmpressure;
    
    
    	/*
    	 * the counter to account for mem+swap usage.
    	 */
    	struct res_counter memsw;
    
    	/*
    	 * the counter to account for kernel memory usage.
    	 */
    	struct res_counter kmem;
    
    	/*
    	 * Should the accounting and control be hierarchical, per subtree?
    	 */
    	bool use_hierarchy;
    
    	unsigned long kmem_account_flags; /* See KMEM_ACCOUNTED_*, below */
    
    
    	bool		oom_lock;
    	atomic_t	under_oom;
    
    	/* OOM-Killer disable */
    	int		oom_kill_disable;
    
    
    
    	/* set when res.limit == memsw.limit */
    	bool		memsw_is_minimum;
    
    
    	/* protect arrays of thresholds */
    	struct mutex thresholds_lock;
    
    	/* thresholds for memory usage. RCU-protected */
    
    	struct mem_cgroup_thresholds thresholds;
    
    	/* thresholds for mem+swap usage. RCU-protected */
    
    	struct mem_cgroup_thresholds memsw_thresholds;
    
    	/* For oom notifier event fd */
    	struct list_head oom_notify;
    
    	/*
    	 * Should we move charges of a task when a task is moved into this
    	 * mem_cgroup ? And what type of charges should we move ?
    	 */
    
    	unsigned long move_charge_at_immigrate;
    
    	/*
    	 * set > 0 if pages under this cgroup are moving to other cgroup.
    	 */
    	atomic_t	moving_account;
    
    	/* taken only while moving_account > 0 */
    	spinlock_t	move_lock;
    
    	struct mem_cgroup_stat_cpu __percpu *stat;
    
    	/*
    	 * used when a cpu is offlined or other synchronizations
    	 * See mem_cgroup_read_stat().
    	 */
    	struct mem_cgroup_stat_cpu nocpu_base;
    	spinlock_t pcp_counter_lock;
    
    	atomic_t	dead_count;
    
    #if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_INET)
    
    	struct tcp_memcontrol tcp_mem;
    #endif
    
    #if defined(CONFIG_MEMCG_KMEM)
    	/* analogous to slab_common's slab_caches list. per-memcg */
    	struct list_head memcg_slab_caches;
    	/* Not a spinlock, we can take a lot of time walking the list */
    	struct mutex slab_caches_mutex;
            /* Index in the kmem_cache->memcg_params->memcg_caches array */
    	int kmemcg_id;
    #endif
    
    
    	int last_scanned_node;
    #if MAX_NUMNODES > 1
    	nodemask_t	scan_nodes;
    	atomic_t	numainfo_events;
    	atomic_t	numainfo_updating;
    #endif
    
    	struct mem_cgroup_per_node *nodeinfo[0];
	/* WARNING: nodeinfo must be the last member here */
};
    
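/*
 * struct mem_cgroup is allocated together with its flexible nodeinfo[]
 * array; memcg_size() returns the size of that combined allocation for
 * the current number of node IDs.
 */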
    static size_t memcg_size(void)
    {
    	return sizeof(struct mem_cgroup) +
    		nr_node_ids * sizeof(struct mem_cgroup_per_node);
    }
    
    
    /* internal only representation about the status of kmem accounting. */
    enum {
    	KMEM_ACCOUNTED_ACTIVE = 0, /* accounted by this cgroup itself */
    
    	KMEM_ACCOUNTED_ACTIVATED, /* static key enabled. */
    
	KMEM_ACCOUNTED_DEAD, /* dead memcg with pending kmem charges */
};

    /* We account when limit is on, but only after call sites are patched */
    #define KMEM_ACCOUNTED_MASK \
    		((1 << KMEM_ACCOUNTED_ACTIVE) | (1 << KMEM_ACCOUNTED_ACTIVATED))
    
    
    #ifdef CONFIG_MEMCG_KMEM
    static inline void memcg_kmem_set_active(struct mem_cgroup *memcg)
    {
    	set_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags);
    }
    
    
    static bool memcg_kmem_is_active(struct mem_cgroup *memcg)
    {
    	return test_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags);
    }
    
    
    static void memcg_kmem_set_activated(struct mem_cgroup *memcg)
    {
    	set_bit(KMEM_ACCOUNTED_ACTIVATED, &memcg->kmem_account_flags);
    }
    
    
    static void memcg_kmem_clear_activated(struct mem_cgroup *memcg)
    {
    	clear_bit(KMEM_ACCOUNTED_ACTIVATED, &memcg->kmem_account_flags);
    }
    
    
    static void memcg_kmem_mark_dead(struct mem_cgroup *memcg)
    {
    
    	/*
    	 * Our caller must use css_get() first, because memcg_uncharge_kmem()
    	 * will call css_put() if it sees the memcg is dead.
    	 */
    	smp_wmb();
    
    	if (test_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags))
    		set_bit(KMEM_ACCOUNTED_DEAD, &memcg->kmem_account_flags);
    }
    
    static bool memcg_kmem_test_and_clear_dead(struct mem_cgroup *memcg)
    {
    	return test_and_clear_bit(KMEM_ACCOUNTED_DEAD,
    				  &memcg->kmem_account_flags);
    }
    
/* Stuff for move charges at task migration. */
/*
 * Types of charges to be moved. "move_charge_at_immigrate" and
 * "immigrate_flags" are treated as a left-shifted bitmap of these types.
 */
enum move_type {
	MOVE_CHARGE_TYPE_ANON,	/* private anonymous page and swap of it */
	MOVE_CHARGE_TYPE_FILE,	/* file page (including tmpfs) and swap of it */
	NR_MOVE_TYPE,
};

    /* "mc" and its members are protected by cgroup_mutex */
    static struct move_charge_struct {
    
    	spinlock_t	  lock; /* for from, to */
    
    	struct mem_cgroup *from;
    	struct mem_cgroup *to;
    
	unsigned long immigrate_flags;
	unsigned long precharge;
    
    	unsigned long moved_charge;
    
    	unsigned long moved_swap;
    
    	struct task_struct *moving_task;	/* a task moving charges */
    	wait_queue_head_t waitq;		/* a waitq for other context */
    } mc = {
    
    	.lock = __SPIN_LOCK_UNLOCKED(mc.lock),
    
    	.waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq),
    };
    
    static bool move_anon(void)
    {
    
	return test_bit(MOVE_CHARGE_TYPE_ANON, &mc.immigrate_flags);
}

static bool move_file(void)
{
	return test_bit(MOVE_CHARGE_TYPE_FILE, &mc.immigrate_flags);
}

    /*
     * Maximum loops in mem_cgroup_hierarchical_reclaim(), used for soft
     * limit reclaim to prevent infinite loops, if they ever occur.
     */
    
    #define	MEM_CGROUP_MAX_RECLAIM_LOOPS		100
    
    #define	MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS	2
    
    enum charge_type {
	MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
	MEM_CGROUP_CHARGE_TYPE_ANON,
	MEM_CGROUP_CHARGE_TYPE_SWAPOUT,	/* for accounting swapcache */
	MEM_CGROUP_CHARGE_TYPE_DROP,	/* a page was unused swap cache */
	NR_CHARGE_TYPE,
};

    /* for encoding cft->private value on file */
    
    enum res_type {
    	_MEM,
    	_MEMSWAP,
	_OOM_TYPE,
	_KMEM,
};

    #define MEMFILE_PRIVATE(x, val)	((x) << 16 | (val))
    #define MEMFILE_TYPE(val)	((val) >> 16 & 0xffff)
    
    #define MEMFILE_ATTR(val)	((val) & 0xffff)
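/*
 * Example: MEMFILE_PRIVATE(_MEM, RES_LIMIT) packs the resource type into
 * the upper 16 bits and the res_counter member (here RES_LIMIT from
 * <linux/res_counter.h>) into the lower 16 bits; MEMFILE_TYPE() and
 * MEMFILE_ATTR() recover the two halves.
 */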
    
/* Used for OOM notifier */
    #define OOM_CONTROL		(0)
    
    /*
     * Reclaim flags for mem_cgroup_hierarchical_reclaim
     */
    #define MEM_CGROUP_RECLAIM_NOSWAP_BIT	0x0
    #define MEM_CGROUP_RECLAIM_NOSWAP	(1 << MEM_CGROUP_RECLAIM_NOSWAP_BIT)
    #define MEM_CGROUP_RECLAIM_SHRINK_BIT	0x1
    #define MEM_CGROUP_RECLAIM_SHRINK	(1 << MEM_CGROUP_RECLAIM_SHRINK_BIT)
    
    
    /*
     * The memcg_create_mutex will be held whenever a new cgroup is created.
     * As a consequence, any change that needs to protect against new child cgroups
     * appearing has to hold it as well.
     */
    static DEFINE_MUTEX(memcg_create_mutex);
    
    
    struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *s)
    {
    
	return s ? container_of(s, struct mem_cgroup, css) : NULL;
}

    /* Some nice accessors for the vmpressure. */
    struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg)
    {
    	if (!memcg)
    		memcg = root_mem_cgroup;
    	return &memcg->vmpressure;
    }
    
    struct cgroup_subsys_state *vmpressure_to_css(struct vmpressure *vmpr)
    {
    	return &container_of(vmpr, struct mem_cgroup, vmpressure)->css;
    }
    
    struct vmpressure *css_to_vmpressure(struct cgroup_subsys_state *css)
    {
    	return &mem_cgroup_from_css(css)->vmpressure;
    }
    
    
    static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg)
    {
    	return (memcg == root_mem_cgroup);
    }
    
    
    /* Writing them here to avoid exposing memcg's inner layout */
    
    #if defined(CONFIG_INET) && defined(CONFIG_MEMCG_KMEM)
    
    
    void sock_update_memcg(struct sock *sk)
    {
    
    	if (mem_cgroup_sockets_enabled) {
    
    		struct mem_cgroup *memcg;
    
    		struct cg_proto *cg_proto;
    
    
    		BUG_ON(!sk->sk_prot->proto_cgroup);
    
    
    		/* Socket cloning can throw us here with sk_cgrp already
    		 * filled. It won't however, necessarily happen from
    		 * process context. So the test for root memcg given
    		 * the current task's memcg won't help us in this case.
    		 *
    		 * Respecting the original socket's memcg is a better
    		 * decision in this case.
    		 */
    		if (sk->sk_cgrp) {
    			BUG_ON(mem_cgroup_is_root(sk->sk_cgrp->memcg));
    
			css_get(&sk->sk_cgrp->memcg->css);
			return;
		}

    		rcu_read_lock();
    		memcg = mem_cgroup_from_task(current);
    
    		cg_proto = sk->sk_prot->proto_cgroup(memcg);
    
    		if (!mem_cgroup_is_root(memcg) &&
    		    memcg_proto_active(cg_proto) && css_tryget(&memcg->css)) {
    
    			sk->sk_cgrp = cg_proto;
    
    		}
    		rcu_read_unlock();
    	}
    }
    EXPORT_SYMBOL(sock_update_memcg);
    
    void sock_release_memcg(struct sock *sk)
    {
    
    	if (mem_cgroup_sockets_enabled && sk->sk_cgrp) {
    
    		struct mem_cgroup *memcg;
    		WARN_ON(!sk->sk_cgrp->memcg);
    		memcg = sk->sk_cgrp->memcg;
    
		css_put(&sk->sk_cgrp->memcg->css);
	}
}

    struct cg_proto *tcp_proto_cgroup(struct mem_cgroup *memcg)
    {
    	if (!memcg || mem_cgroup_is_root(memcg))
    		return NULL;
    
    	return &memcg->tcp_mem.cg_proto;
    }
    EXPORT_SYMBOL(tcp_proto_cgroup);
    
    static void disarm_sock_keys(struct mem_cgroup *memcg)
    {
    	if (!memcg_proto_activated(&memcg->tcp_mem.cg_proto))
    		return;
    	static_key_slow_dec(&memcg_socket_limit_enabled);
    }
    #else
    static void disarm_sock_keys(struct mem_cgroup *memcg)
    {
    }
    #endif
    
    
    #ifdef CONFIG_MEMCG_KMEM
    
    /*
     * This will be the memcg's index in each cache's ->memcg_params->memcg_caches.
     * There are two main reasons for not using the css_id for this:
     *  1) this works better in sparse environments, where we have a lot of memcgs,
     *     but only a few kmem-limited. Or also, if we have, for instance, 200
     *     memcgs, and none but the 200th is kmem-limited, we'd have to have a
     *     200 entry array for that.
     *
     *  2) In order not to violate the cgroup API, we would like to do all memory
     *     allocation in ->create(). At that point, we haven't yet allocated the
     *     css_id. Having a separate index prevents us from messing with the cgroup
     *     core for this
     *
     * The current size of the caches array is stored in
     * memcg_limited_groups_array_size.  It will double each time we have to
     * increase it.
     */
    static DEFINE_IDA(kmem_limited_groups);
    
    int memcg_limited_groups_array_size;
    
    
    /*
     * MIN_SIZE is different than 1, because we would like to avoid going through
     * the alloc/free process all the time. In a small machine, 4 kmem-limited
     * cgroups is a reasonable guess. In the future, it could be a parameter or
     * tunable, but that is strictly not necessary.
     *
     * MAX_SIZE should be as large as the number of css_ids. Ideally, we could get
     * this constant directly from cgroup, but it is understandable that this is
     * better kept as an internal representation in cgroup.c. In any case, the
     * css_id space is not getting any smaller, and we don't have to necessarily
     * increase ours as well if it increases.
     */
    #define MEMCG_CACHES_MIN_SIZE 4
    #define MEMCG_CACHES_MAX_SIZE 65535
    
    
    /*
     * A lot of the calls to the cache allocation functions are expected to be
     * inlined by the compiler. Since the calls to memcg_kmem_get_cache are
 * conditional on this static branch, we'll have to allow modules that do
 * kmem_cache_alloc and the like to see this symbol as well.
     */
    
    struct static_key memcg_kmem_enabled_key;
    
    EXPORT_SYMBOL(memcg_kmem_enabled_key);
    
    
    static void disarm_kmem_keys(struct mem_cgroup *memcg)
    {
    
    	if (memcg_kmem_is_active(memcg)) {
    
    		static_key_slow_dec(&memcg_kmem_enabled_key);
    
    		ida_simple_remove(&kmem_limited_groups, memcg->kmemcg_id);
    	}
    
    	/*
    	 * This check can't live in kmem destruction function,
    	 * since the charges will outlive the cgroup
    	 */
    	WARN_ON(res_counter_read_u64(&memcg->kmem, RES_USAGE) != 0);
    
    }
    #else
    static void disarm_kmem_keys(struct mem_cgroup *memcg)
    {
    }
    #endif /* CONFIG_MEMCG_KMEM */
    
    static void disarm_static_keys(struct mem_cgroup *memcg)
    {
    	disarm_sock_keys(memcg);
    	disarm_kmem_keys(memcg);
    }
    
    
    static void drain_all_stock_async(struct mem_cgroup *memcg);
    
static struct mem_cgroup_per_zone *
mem_cgroup_zoneinfo(struct mem_cgroup *memcg, int nid, int zid)
{
	VM_BUG_ON((unsigned)nid >= nr_node_ids);
	return &memcg->nodeinfo[nid]->zoneinfo[zid];
}

struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *memcg)
{
	return &memcg->css;
}

static struct mem_cgroup_per_zone *
page_cgroup_zoneinfo(struct mem_cgroup *memcg, struct page *page)
{
	int nid = page_to_nid(page);
	int zid = page_zonenum(page);

	return mem_cgroup_zoneinfo(memcg, nid, zid);
}

    static struct mem_cgroup_tree_per_zone *
    soft_limit_tree_node_zone(int nid, int zid)
    {
    	return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid];
    }
    
    static struct mem_cgroup_tree_per_zone *
    soft_limit_tree_from_page(struct page *page)
    {
    	int nid = page_to_nid(page);
    	int zid = page_zonenum(page);
    
    	return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid];
    }
    
    static void
    __mem_cgroup_insert_exceeded(struct mem_cgroup *memcg,
    				struct mem_cgroup_per_zone *mz,
    				struct mem_cgroup_tree_per_zone *mctz,
    				unsigned long long new_usage_in_excess)
    {
    	struct rb_node **p = &mctz->rb_root.rb_node;
    	struct rb_node *parent = NULL;
    	struct mem_cgroup_per_zone *mz_node;
    
    	if (mz->on_tree)
    		return;
    
    	mz->usage_in_excess = new_usage_in_excess;
    	if (!mz->usage_in_excess)
    		return;
    	while (*p) {
    		parent = *p;
    		mz_node = rb_entry(parent, struct mem_cgroup_per_zone,
    					tree_node);
    		if (mz->usage_in_excess < mz_node->usage_in_excess)
    			p = &(*p)->rb_left;
    		/*
    		 * We can't avoid mem cgroups that are over their soft
    		 * limit by the same amount
    		 */
    		else if (mz->usage_in_excess >= mz_node->usage_in_excess)
    			p = &(*p)->rb_right;
    	}
    	rb_link_node(&mz->tree_node, parent, p);
    	rb_insert_color(&mz->tree_node, &mctz->rb_root);
    	mz->on_tree = true;
    }
    
    static void
    __mem_cgroup_remove_exceeded(struct mem_cgroup *memcg,
    				struct mem_cgroup_per_zone *mz,
    				struct mem_cgroup_tree_per_zone *mctz)
    {
    	if (!mz->on_tree)
    		return;
    	rb_erase(&mz->tree_node, &mctz->rb_root);
    	mz->on_tree = false;
    }
    
    static void
    mem_cgroup_remove_exceeded(struct mem_cgroup *memcg,
    				struct mem_cgroup_per_zone *mz,
    				struct mem_cgroup_tree_per_zone *mctz)
    {
    	spin_lock(&mctz->lock);
    	__mem_cgroup_remove_exceeded(memcg, mz, mctz);
    	spin_unlock(&mctz->lock);
    }
    
    
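/*
 * Update the position of @memcg and all of its ancestors in the per-zone
 * soft limit tree of @page's zone, according to how far each one currently
 * exceeds its soft limit.
 */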
    static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page)
    {
    	unsigned long long excess;
    	struct mem_cgroup_per_zone *mz;
    	struct mem_cgroup_tree_per_zone *mctz;
    	int nid = page_to_nid(page);
    	int zid = page_zonenum(page);
    	mctz = soft_limit_tree_from_page(page);
    
    	/*
	 * Necessary to update all ancestors when hierarchy is used,
	 * because their event counter is not touched.
    	 */
    	for (; memcg; memcg = parent_mem_cgroup(memcg)) {
    		mz = mem_cgroup_zoneinfo(memcg, nid, zid);
    		excess = res_counter_soft_limit_excess(&memcg->res);
    		/*
    		 * We have to update the tree if mz is on RB-tree or
    		 * mem is over its softlimit.
    		 */
    		if (excess || mz->on_tree) {
    			spin_lock(&mctz->lock);
    			/* if on-tree, remove it */
    			if (mz->on_tree)
    				__mem_cgroup_remove_exceeded(memcg, mz, mctz);
    			/*
    			 * Insert again. mz->usage_in_excess will be updated.
    			 * If excess is 0, no tree ops.
    			 */
    			__mem_cgroup_insert_exceeded(memcg, mz, mctz, excess);
    			spin_unlock(&mctz->lock);
    		}
    	}
    }
    
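/* Remove every per-zone node of @memcg from the soft limit trees. */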
    static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg)
    {
    	int node, zone;
    	struct mem_cgroup_per_zone *mz;
    	struct mem_cgroup_tree_per_zone *mctz;
    
    	for_each_node(node) {
    		for (zone = 0; zone < MAX_NR_ZONES; zone++) {
    			mz = mem_cgroup_zoneinfo(memcg, node, zone);
    			mctz = soft_limit_tree_node_zone(node, zone);
    			mem_cgroup_remove_exceeded(memcg, mz, mctz);
    		}
    	}
    }
    
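/*
 * Pick the memcg with the largest soft limit excess in this tree: take the
 * rightmost node, unlink it and grab a css reference, retrying if the memcg
 * no longer exceeds its soft limit or its css cannot be pinned.
 */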
    static struct mem_cgroup_per_zone *
    __mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
    {
    	struct rb_node *rightmost = NULL;
    	struct mem_cgroup_per_zone *mz;
    
    retry:
    	mz = NULL;
    	rightmost = rb_last(&mctz->rb_root);
    	if (!rightmost)
    		goto done;		/* Nothing to reclaim from */
    
    	mz = rb_entry(rightmost, struct mem_cgroup_per_zone, tree_node);
    	/*
    	 * Remove the node now but someone else can add it back,
	 * we will add it back at the end of reclaim to its correct
    	 * position in the tree.
    	 */
    	__mem_cgroup_remove_exceeded(mz->memcg, mz, mctz);
    	if (!res_counter_soft_limit_excess(&mz->memcg->res) ||
    		!css_tryget(&mz->memcg->css))
    		goto retry;
    done:
    	return mz;
    }
    
    static struct mem_cgroup_per_zone *
    mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
    {
    	struct mem_cgroup_per_zone *mz;
    
    	spin_lock(&mctz->lock);
    	mz = __mem_cgroup_largest_soft_limit_node(mctz);
    	spin_unlock(&mctz->lock);
    	return mz;
    }
    
    
    /*
     * Implementation Note: reading percpu statistics for memcg.
     *
 * Both vmstat[] and percpu_counter use thresholds and do periodic
 * synchronization to implement a "quick" read. There is a trade-off between
 * reading cost and precision of the value. We may therefore have a chance to
 * implement a similar periodic synchronization for memcg's counters.
 *
 * But this _read() function is used for the user interface now. The user
 * accounts memory usage by memory cgroup and _always_ requires an exact value
 * because he accounts memory. Even if we provided a quick-and-fuzzy read, we
 * would always have to visit all online cpus and sum them up. So, for now,
 * the synchronization is not implemented (it is only implemented for cpu
 * hotplug).
 *
 * If there are kernel internal actions which can make use of a not-exact
 * value, and reading all cpu values can be a performance bottleneck in some
 * common workload, a threshold and synchronization as in vmstat[] should be
 * implemented.
     */
    
static long mem_cgroup_read_stat(struct mem_cgroup *memcg,
				 enum mem_cgroup_stat_index idx)
{
	long val = 0;
	int cpu;

	get_online_cpus();
	for_each_online_cpu(cpu)
		val += per_cpu(memcg->stat->count[idx], cpu);
#ifdef CONFIG_HOTPLUG_CPU
	spin_lock(&memcg->pcp_counter_lock);
	val += memcg->nocpu_base.count[idx];
	spin_unlock(&memcg->pcp_counter_lock);
#endif
	put_online_cpus();
	return val;
}
    
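/* Adjust the per-cpu swap statistics by one page: up on charge, down on uncharge. */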
    static void mem_cgroup_swap_statistics(struct mem_cgroup *memcg,
    
    					 bool charge)
    {
    	int val = (charge) ? 1 : -1;
    
	this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_SWAP], val);
}

    static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg,
    
    					    enum mem_cgroup_events_index idx)
    {
    	unsigned long val = 0;
    	int cpu;
    
    	for_each_online_cpu(cpu)
    
    		val += per_cpu(memcg->stat->events[idx], cpu);
    
    #ifdef CONFIG_HOTPLUG_CPU
    
    	spin_lock(&memcg->pcp_counter_lock);
    	val += memcg->nocpu_base.events[idx];
	spin_unlock(&memcg->pcp_counter_lock);
#endif
	return val;
}
    
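/*
 * Account a (un)charge of @nr_pages against the per-cpu statistics:
 * RSS or CACHE depending on @anon, RSS_HUGE for transparent huge pages,
 * plus the pgpgin/pgpgout event counters and nr_page_events.
 */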
static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
					 struct page *page,
					 bool anon, int nr_pages)
{
	/*
	 * Here, RSS means 'mapped anon' and anon's SwapCache. Shmem/tmpfs is
	 * counted as CACHE even if it's on ANON LRU.
	 */
	if (anon)
		__this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS],
				nr_pages);
	else
		__this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_CACHE],
				nr_pages);

	if (PageTransHuge(page))
		__this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE],
				nr_pages);

	/* pagein of a big page is an event. So, ignore page size */
	if (nr_pages > 0)
		__this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGIN]);
	else {
		__this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGOUT]);
		nr_pages = -nr_pages; /* for event */
	}

	__this_cpu_add(memcg->stat->nr_page_events, nr_pages);
}
    
unsigned long
mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru)
    
    {
    	struct mem_cgroup_per_zone *mz;
    
    	mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec);
    	return mz->lru_size[lru];
    }
    
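/*
 * Sum the sizes of the LRU lists selected by @lru_mask for one zone of
 * @memcg; the node- and memcg-wide helpers below build on this.
 */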
static unsigned long
mem_cgroup_zone_nr_lru_pages(struct mem_cgroup *memcg, int nid, int zid,
			unsigned int lru_mask)
{
	struct mem_cgroup_per_zone *mz;
	enum lru_list lru;
	unsigned long ret = 0;

	mz = mem_cgroup_zoneinfo(memcg, nid, zid);

	for_each_lru(lru) {
		if (BIT(lru) & lru_mask)
			ret += mz->lru_size[lru];
	}
	return ret;
}
    
static unsigned long
mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
			int nid, unsigned int lru_mask)
{
	u64 total = 0;
	int zid;

	for (zid = 0; zid < MAX_NR_ZONES; zid++)
		total += mem_cgroup_zone_nr_lru_pages(memcg,
						nid, zid, lru_mask);
	return total;
}

static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg,
			unsigned int lru_mask)
{
	u64 total = 0;
	int nid;

	for_each_node_state(nid, N_MEMORY)
		total += mem_cgroup_node_nr_lru_pages(memcg, nid, lru_mask);
	return total;
}
    
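/*
 * Returns true each time the per-cpu page event counter crosses the next
 * threshold for @target, and advances that threshold by the corresponding
 * *_EVENTS_TARGET step.
 */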
    static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
				       enum mem_cgroup_events_target target)
{
	unsigned long val, next;

    	val = __this_cpu_read(memcg->stat->nr_page_events);
    
    	next = __this_cpu_read(memcg->stat->targets[target]);
    
    	/* from time_after() in jiffies.h */
    
    	if ((long)next - (long)val < 0) {
    		switch (target) {
    		case MEM_CGROUP_TARGET_THRESH:
    			next = val + THRESHOLDS_EVENTS_TARGET;
    			break;
    
    		case MEM_CGROUP_TARGET_SOFTLIMIT:
    			next = val + SOFTLIMIT_EVENTS_TARGET;
    			break;
    
    		case MEM_CGROUP_TARGET_NUMAINFO:
    			next = val + NUMAINFO_EVENTS_TARGET;
    			break;
    		default:
    			break;
    		}
    		__this_cpu_write(memcg->stat->targets[target], next);
		return true;
	}
	return false;
}

static void memcg_check_events(struct mem_cgroup *memcg, struct page *page)
{
    	/* threshold event is triggered in finer grain than soft limit */
    
    	if (unlikely(mem_cgroup_event_ratelimit(memcg,