/* memcontrol.c - Memory Controller
     *
     * Copyright IBM Corporation, 2007
     * Author Balbir Singh <balbir@linux.vnet.ibm.com>
     *
    
     * Copyright 2007 OpenVZ SWsoft Inc
     * Author: Pavel Emelianov <xemul@openvz.org>
     *
    
     * Memory thresholds
     * Copyright (C) 2009 Nokia Corporation
     * Author: Kirill A. Shutemov
     *
    
     * Kernel Memory Controller
     * Copyright (C) 2012 Parallels Inc. and Google Inc.
     * Authors: Glauber Costa and Suleiman Souhlal
     *
    
     * This program is free software; you can redistribute it and/or modify
     * it under the terms of the GNU General Public License as published by
     * the Free Software Foundation; either version 2 of the License, or
     * (at your option) any later version.
     *
     * This program is distributed in the hope that it will be useful,
     * but WITHOUT ANY WARRANTY; without even the implied warranty of
     * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     * GNU General Public License for more details.
     */
    
    #include <linux/res_counter.h>
    #include <linux/memcontrol.h>
    #include <linux/cgroup.h>
    
    #include <linux/mm.h>
    
    #include <linux/hugetlb.h>
    
    #include <linux/pagemap.h>
    
    #include <linux/page-flags.h>
    
    #include <linux/backing-dev.h>
    
    #include <linux/bit_spinlock.h>
    #include <linux/rcupdate.h>
    
    #include <linux/limits.h>
    
    #include <linux/mutex.h>
    
    #include <linux/swap.h>
    
    #include <linux/swapops.h>
    
    #include <linux/spinlock.h>
    
    #include <linux/eventfd.h>
    #include <linux/sort.h>
    
    #include <linux/fs.h>
    
    #include <linux/seq_file.h>
    
    #include <linux/vmalloc.h>
    
    #include <linux/vmpressure.h>
    
    #include <linux/mm_inline.h>
    
    #include <linux/page_cgroup.h>
    
    #include <linux/cpu.h>
    
    #include "internal.h"
    
    #include <net/sock.h>
    
    #include <net/ip.h>
    
    #include <net/tcp_memcontrol.h>
    
    #include <trace/events/vmscan.h>
    
    
    struct cgroup_subsys mem_cgroup_subsys __read_mostly;
    
    #define MEM_CGROUP_RECLAIM_RETRIES	5
    
    static struct mem_cgroup *root_mem_cgroup __read_mostly;
    
    #ifdef CONFIG_MEMCG_SWAP
    
    /* Turned on only when memory cgroup is enabled && really_do_swap_account = 1 */
    
    int do_swap_account __read_mostly;
    
    
/* for remembering the boot option */
    
    #ifdef CONFIG_MEMCG_SWAP_ENABLED
    
    static int really_do_swap_account __initdata = 1;
    #else
    static int really_do_swap_account __initdata = 0;
    #endif
    
    
#else
#define do_swap_account		0
#endif
    
static const char * const mem_cgroup_stat_names[] = {
	"cache",
	"rss",
	"rss_huge",
	"mapped_file",
	"swap",
};
    
    enum mem_cgroup_events_index {
    	MEM_CGROUP_EVENTS_PGPGIN,	/* # of pages paged in */
    	MEM_CGROUP_EVENTS_PGPGOUT,	/* # of pages paged out */
    
    	MEM_CGROUP_EVENTS_PGFAULT,	/* # of page-faults */
    	MEM_CGROUP_EVENTS_PGMAJFAULT,	/* # of major page-faults */
    
    	MEM_CGROUP_EVENTS_NSTATS,
    };
    
    
    static const char * const mem_cgroup_events_names[] = {
    	"pgpgin",
    	"pgpgout",
    	"pgfault",
    	"pgmajfault",
    };
    
    
    static const char * const mem_cgroup_lru_names[] = {
    	"inactive_anon",
    	"active_anon",
    	"inactive_file",
    	"active_file",
    	"unevictable",
    };
    
    
/*
 * The per-memcg event counter is incremented at every pagein/pageout. With THP,
 * it will be incremented by the number of pages. This counter is used to
 * trigger some periodic events. This is straightforward and better
 * than using jiffies etc. to handle periodic memcg events.
 */
enum mem_cgroup_events_target {
	MEM_CGROUP_TARGET_THRESH,
	MEM_CGROUP_TARGET_SOFTLIMIT,
	MEM_CGROUP_TARGET_NUMAINFO,
	MEM_CGROUP_NTARGETS,
};
#define THRESHOLDS_EVENTS_TARGET 128
#define SOFTLIMIT_EVENTS_TARGET 1024
#define NUMAINFO_EVENTS_TARGET	1024

struct mem_cgroup_stat_cpu {
	long count[MEM_CGROUP_STAT_NSTATS];
	unsigned long events[MEM_CGROUP_EVENTS_NSTATS];
	unsigned long nr_page_events;
	unsigned long targets[MEM_CGROUP_NTARGETS];
};
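/*
 * Illustrative sketch of how the targets above are consumed (see
 * mem_cgroup_event_ratelimit() further down): a target counts as hit once
 * nr_page_events passes the stored value, using a wrap-safe comparison,
 * e.g. for the threshold target roughly:
 *
 *	if ((long)targets[MEM_CGROUP_TARGET_THRESH] -
 *	    (long)nr_page_events < 0)
 *		targets[MEM_CGROUP_TARGET_THRESH] =
 *			nr_page_events + THRESHOLDS_EVENTS_TARGET;
 */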
    
    struct mem_cgroup_reclaim_iter {
    
    	/*
    	 * last scanned hierarchy member. Valid only if last_dead_count
    	 * matches memcg->dead_count of the hierarchy root group.
    	 */
    
    	struct mem_cgroup *last_visited;
    
    	unsigned long last_dead_count;
    
    
    	/* scan generation, increased every round-trip */
    	unsigned int generation;
    };
    
    
    /*
     * per-zone information in memory controller.
     */
    struct mem_cgroup_per_zone {
    
    	struct lruvec		lruvec;
    
    	unsigned long		lru_size[NR_LRU_LISTS];
    
    	struct mem_cgroup_reclaim_iter reclaim_iter[DEF_PRIORITY + 1];
    
    
    	struct mem_cgroup	*memcg;		/* Back pointer, we cannot */
    
    						/* use container_of	   */
    
    };
    
    struct mem_cgroup_per_node {
    	struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES];
    };
    
    
    struct mem_cgroup_threshold {
    	struct eventfd_ctx *eventfd;
    	u64 threshold;
    };
    
    
    /* For threshold */
    
    struct mem_cgroup_threshold_ary {
    
    	/* An array index points to threshold just below or equal to usage. */
    
    	int current_threshold;
    
    	/* Size of entries[] */
    	unsigned int size;
    	/* Array of thresholds */
    	struct mem_cgroup_threshold entries[0];
    };
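/*
 * Example (illustrative): with thresholds registered at 4M, 8M and 16M and a
 * current usage of 10M, current_threshold indexes the 8M entry; notifier
 * walks go downward from there when usage shrinks and upward when it grows.
 */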
    
    
    struct mem_cgroup_thresholds {
    	/* Primary thresholds array */
    	struct mem_cgroup_threshold_ary *primary;
    	/*
    	 * Spare threshold array.
    	 * This is needed to make mem_cgroup_unregister_event() "never fail".
    	 * It must be able to store at least primary->size - 1 entries.
    	 */
    	struct mem_cgroup_threshold_ary *spare;
    };
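/*
 * Illustrative sketch of why the spare array makes unregister "never fail":
 * removal only copies the surviving entries into the pre-sized spare and
 * swaps pointers under thresholds_lock, roughly:
 *
 *	mutex_lock(&memcg->thresholds_lock);
 *	new = thresholds->spare;
 *	... copy all entries except the removed one into new ...
 *	thresholds->spare = thresholds->primary;
 *	rcu_assign_pointer(thresholds->primary, new);
 *	synchronize_rcu();	 readers access primary under RCU
 *	mutex_unlock(&memcg->thresholds_lock);
 */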
    
    
    /* for OOM */
    struct mem_cgroup_eventfd_list {
    	struct list_head list;
    	struct eventfd_ctx *eventfd;
    };
    
    static void mem_cgroup_threshold(struct mem_cgroup *memcg);
    static void mem_cgroup_oom_notify(struct mem_cgroup *memcg);
    
    /*
     * The memory controller data structure. The memory controller controls both
     * page cache and RSS per cgroup. We would eventually like to provide
     * statistics based on the statistics developed by Rik Van Riel for clock-pro,
     * to help the administrator determine what knobs to tune.
     *
     * TODO: Add a water mark for the memory controller. Reclaim will begin when
    
 * we hit the water mark. Maybe even add a low water mark, such that
 * no reclaim occurs from a cgroup at its low water mark; this is
 * a feature that will be implemented much later in the future.
    
     */
    struct mem_cgroup {
    	struct cgroup_subsys_state css;
    	/*
    	 * the counter to account for memory usage
    	 */
    	struct res_counter res;
    
    	/* vmpressure notifications */
    	struct vmpressure vmpressure;
    
    
    	/*
    	 * the counter to account for mem+swap usage.
    	 */
    	struct res_counter memsw;
    
    	/*
    	 * the counter to account for kernel memory usage.
    	 */
    	struct res_counter kmem;
    
    	/*
    	 * Should the accounting and control be hierarchical, per subtree?
    	 */
    	bool use_hierarchy;
    
    	unsigned long kmem_account_flags; /* See KMEM_ACCOUNTED_*, below */
    
    
    	bool		oom_lock;
    	atomic_t	under_oom;
    
    	/* OOM-Killer disable */
    	int		oom_kill_disable;
    
    
    
    	/* set when res.limit == memsw.limit */
    	bool		memsw_is_minimum;
    
    
    	/* protect arrays of thresholds */
    	struct mutex thresholds_lock;
    
    	/* thresholds for memory usage. RCU-protected */
    
    	struct mem_cgroup_thresholds thresholds;
    
    	/* thresholds for mem+swap usage. RCU-protected */
    
    	struct mem_cgroup_thresholds memsw_thresholds;
    
    	/* For oom notifier event fd */
    	struct list_head oom_notify;
    
    	/*
    	 * Should we move charges of a task when a task is moved into this
    	 * mem_cgroup ? And what type of charges should we move ?
    	 */
    
    	unsigned long move_charge_at_immigrate;
    
    	/*
    	 * set > 0 if pages under this cgroup are moving to other cgroup.
    	 */
    	atomic_t	moving_account;
    
    	/* taken only while moving_account > 0 */
    	spinlock_t	move_lock;
    
    	struct mem_cgroup_stat_cpu __percpu *stat;
    
    	/*
    	 * used when a cpu is offlined or other synchronizations
    	 * See mem_cgroup_read_stat().
    	 */
    	struct mem_cgroup_stat_cpu nocpu_base;
    	spinlock_t pcp_counter_lock;
    
    	atomic_t	dead_count;
    
    #if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_INET)
    
    	struct tcp_memcontrol tcp_mem;
    #endif
    
    #if defined(CONFIG_MEMCG_KMEM)
    	/* analogous to slab_common's slab_caches list. per-memcg */
    	struct list_head memcg_slab_caches;
    	/* Not a spinlock, we can take a lot of time walking the list */
    	struct mutex slab_caches_mutex;
            /* Index in the kmem_cache->memcg_params->memcg_caches array */
    	int kmemcg_id;
    #endif
    
    
    	int last_scanned_node;
    #if MAX_NUMNODES > 1
    	nodemask_t	scan_nodes;
    	atomic_t	numainfo_events;
    	atomic_t	numainfo_updating;
    #endif
    
    	struct mem_cgroup_per_node *nodeinfo[0];
	/* WARNING: nodeinfo must be the last member here */
};
    
    static size_t memcg_size(void)
    {
    	return sizeof(struct mem_cgroup) +
    		nr_node_ids * sizeof(struct mem_cgroup_per_node);
    }
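/*
 * For illustration: since nodeinfo[] is a flexible array sized by
 * nr_node_ids, the whole controller is allocated in one chunk of
 * memcg_size() bytes, e.g. (assuming the size stays small enough for a
 * kmalloc-backed allocation):
 *
 *	struct mem_cgroup *memcg = kzalloc(memcg_size(), GFP_KERNEL);
 */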
    
    
/* internal-only representation of the status of kmem accounting. */
enum {
	KMEM_ACCOUNTED_ACTIVE = 0, /* accounted by this cgroup itself */
	KMEM_ACCOUNTED_ACTIVATED, /* static key enabled. */
	KMEM_ACCOUNTED_DEAD, /* dead memcg with pending kmem charges */
};
    
    /* We account when limit is on, but only after call sites are patched */
    #define KMEM_ACCOUNTED_MASK \
    		((1 << KMEM_ACCOUNTED_ACTIVE) | (1 << KMEM_ACCOUNTED_ACTIVATED))
    
    
    #ifdef CONFIG_MEMCG_KMEM
    static inline void memcg_kmem_set_active(struct mem_cgroup *memcg)
    {
    	set_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags);
    }
    
    
    static bool memcg_kmem_is_active(struct mem_cgroup *memcg)
    {
    	return test_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags);
    }
    
    
    static void memcg_kmem_set_activated(struct mem_cgroup *memcg)
    {
    	set_bit(KMEM_ACCOUNTED_ACTIVATED, &memcg->kmem_account_flags);
    }
    
    
    static void memcg_kmem_clear_activated(struct mem_cgroup *memcg)
    {
    	clear_bit(KMEM_ACCOUNTED_ACTIVATED, &memcg->kmem_account_flags);
    }
    
    
    static void memcg_kmem_mark_dead(struct mem_cgroup *memcg)
    {
    
    	/*
    	 * Our caller must use css_get() first, because memcg_uncharge_kmem()
    	 * will call css_put() if it sees the memcg is dead.
    	 */
    	smp_wmb();
    
    	if (test_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags))
    		set_bit(KMEM_ACCOUNTED_DEAD, &memcg->kmem_account_flags);
    }
    
    static bool memcg_kmem_test_and_clear_dead(struct mem_cgroup *memcg)
    {
    	return test_and_clear_bit(KMEM_ACCOUNTED_DEAD,
    				  &memcg->kmem_account_flags);
    }
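/*
 * Illustrative caller pattern for the two helpers above: the offlining path
 * pins the css before marking the group dead, and the last kmem uncharge
 * that observes the dead flag drops that pin:
 *
 *	css_get(&memcg->css);
 *	memcg_kmem_mark_dead(memcg);
 *	...
 *	if (memcg_kmem_test_and_clear_dead(memcg))
 *		css_put(&memcg->css);
 */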
    
/* Stuff for moving charges at task migration. */
/*
 * Types of charges to be moved. "move_charge_at_immigrate" and
 * "immigrate_flags" are treated as a left-shifted bitmap of these types.
 */
enum move_type {
	MOVE_CHARGE_TYPE_ANON,	/* private anonymous page and swap of it */
	MOVE_CHARGE_TYPE_FILE,	/* file page (including tmpfs) and swap of it */
	NR_MOVE_TYPE,
};
    
    /* "mc" and its members are protected by cgroup_mutex */
    static struct move_charge_struct {
    
    	spinlock_t	  lock; /* for from, to */
    
    	struct mem_cgroup *from;
	struct mem_cgroup *to;
	unsigned long immigrate_flags;
	unsigned long precharge;
    
    	unsigned long moved_charge;
    
    	unsigned long moved_swap;
    
    	struct task_struct *moving_task;	/* a task moving charges */
    	wait_queue_head_t waitq;		/* a waitq for other context */
    } mc = {
    
    	.lock = __SPIN_LOCK_UNLOCKED(mc.lock),
    
    	.waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq),
    };
    
static bool move_anon(void)
{
	return test_bit(MOVE_CHARGE_TYPE_ANON, &mc.immigrate_flags);
}

static bool move_file(void)
{
	return test_bit(MOVE_CHARGE_TYPE_FILE, &mc.immigrate_flags);
}
    
    /*
     * Maximum loops in mem_cgroup_hierarchical_reclaim(), used for soft
     * limit reclaim to prevent infinite loops, if they ever occur.
     */
    
    #define	MEM_CGROUP_MAX_RECLAIM_LOOPS		100
    
    enum charge_type {
    	MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
    
    	MEM_CGROUP_CHARGE_TYPE_SWAPOUT,	/* for accounting swapcache */
    
	MEM_CGROUP_CHARGE_TYPE_DROP,	/* a page was unused swap cache */
};

    /* for encoding cft->private value on file */
    
    enum res_type {
    	_MEM,
    	_MEMSWAP,
	_OOM_TYPE,
	_KMEM,
};

    #define MEMFILE_PRIVATE(x, val)	((x) << 16 | (val))
    #define MEMFILE_TYPE(val)	((val) >> 16 & 0xffff)
    
    #define MEMFILE_ATTR(val)	((val) & 0xffff)
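/*
 * Example (illustrative): a mem+swap limit file would be wired up with
 * .private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT), and its handler would
 * decode the pieces with:
 *
 *	type = MEMFILE_TYPE(cft->private);	yields _MEMSWAP
 *	name = MEMFILE_ATTR(cft->private);	yields RES_LIMIT
 */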
    
/* Used for OOM notifier */
    #define OOM_CONTROL		(0)
    
    /*
     * Reclaim flags for mem_cgroup_hierarchical_reclaim
     */
    #define MEM_CGROUP_RECLAIM_NOSWAP_BIT	0x0
    #define MEM_CGROUP_RECLAIM_NOSWAP	(1 << MEM_CGROUP_RECLAIM_NOSWAP_BIT)
    #define MEM_CGROUP_RECLAIM_SHRINK_BIT	0x1
    #define MEM_CGROUP_RECLAIM_SHRINK	(1 << MEM_CGROUP_RECLAIM_SHRINK_BIT)
    
    
    /*
     * The memcg_create_mutex will be held whenever a new cgroup is created.
     * As a consequence, any change that needs to protect against new child cgroups
     * appearing has to hold it as well.
     */
    static DEFINE_MUTEX(memcg_create_mutex);
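/*
 * Typical usage (illustrative sketch): hierarchy-wide settings may only
 * change while no child groups exist, so writers do roughly:
 *
 *	mutex_lock(&memcg_create_mutex);
 *	if (list_empty(&cgroup->children))
 *		memcg->use_hierarchy = val;
 *	else
 *		retval = -EBUSY;
 *	mutex_unlock(&memcg_create_mutex);
 */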
    
    
    struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *s)
    {
    
	return s ? container_of(s, struct mem_cgroup, css) : NULL;
}

    /* Some nice accessors for the vmpressure. */
    struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg)
    {
    	if (!memcg)
    		memcg = root_mem_cgroup;
    	return &memcg->vmpressure;
    }
    
    struct cgroup_subsys_state *vmpressure_to_css(struct vmpressure *vmpr)
    {
    	return &container_of(vmpr, struct mem_cgroup, vmpressure)->css;
    }
    
    struct vmpressure *css_to_vmpressure(struct cgroup_subsys_state *css)
    {
    	return &mem_cgroup_from_css(css)->vmpressure;
    }
    
    
    static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg)
    {
    	return (memcg == root_mem_cgroup);
    }
    
    
    /* Writing them here to avoid exposing memcg's inner layout */
    
    #if defined(CONFIG_INET) && defined(CONFIG_MEMCG_KMEM)
    
    
    void sock_update_memcg(struct sock *sk)
    {
    
    	if (mem_cgroup_sockets_enabled) {
    
    		struct mem_cgroup *memcg;
    
    		struct cg_proto *cg_proto;
    
    
    		BUG_ON(!sk->sk_prot->proto_cgroup);
    
    
		/* Socket cloning can throw us here with sk_cgrp already
		 * filled. It won't, however, necessarily happen from
		 * process context. So the test for root memcg given
		 * the current task's memcg won't help us in this case.
		 *
		 * Respecting the original socket's memcg is a better
		 * decision in this case.
		 */
		if (sk->sk_cgrp) {
			BUG_ON(mem_cgroup_is_root(sk->sk_cgrp->memcg));
			css_get(&sk->sk_cgrp->memcg->css);
			return;
		}

		rcu_read_lock();
    		memcg = mem_cgroup_from_task(current);
    
    		cg_proto = sk->sk_prot->proto_cgroup(memcg);
    
    		if (!mem_cgroup_is_root(memcg) &&
    		    memcg_proto_active(cg_proto) && css_tryget(&memcg->css)) {
    
    			sk->sk_cgrp = cg_proto;
    
    		}
    		rcu_read_unlock();
    	}
    }
    EXPORT_SYMBOL(sock_update_memcg);
    
    void sock_release_memcg(struct sock *sk)
    {
    
    	if (mem_cgroup_sockets_enabled && sk->sk_cgrp) {
    
    		struct mem_cgroup *memcg;
    		WARN_ON(!sk->sk_cgrp->memcg);
    		memcg = sk->sk_cgrp->memcg;
    
		css_put(&sk->sk_cgrp->memcg->css);
	}
}

    struct cg_proto *tcp_proto_cgroup(struct mem_cgroup *memcg)
    {
    	if (!memcg || mem_cgroup_is_root(memcg))
    		return NULL;
    
    	return &memcg->tcp_mem.cg_proto;
    }
    EXPORT_SYMBOL(tcp_proto_cgroup);
    
    static void disarm_sock_keys(struct mem_cgroup *memcg)
    {
    	if (!memcg_proto_activated(&memcg->tcp_mem.cg_proto))
    		return;
    	static_key_slow_dec(&memcg_socket_limit_enabled);
    }
    #else
    static void disarm_sock_keys(struct mem_cgroup *memcg)
    {
    }
    #endif
    
    
    #ifdef CONFIG_MEMCG_KMEM
    
    /*
     * This will be the memcg's index in each cache's ->memcg_params->memcg_caches.
     * There are two main reasons for not using the css_id for this:
 *  1) this works better in sparse environments, where we have a lot of memcgs,
 *     but only a few kmem-limited ones. If we had, for instance, 200 memcgs
 *     and none but the 200th were kmem-limited, we would need a 200-entry
 *     array for that.
 *
 *  2) In order not to violate the cgroup API, we would like to do all memory
 *     allocation in ->create(). At that point, we haven't yet allocated the
 *     css_id. Having a separate index prevents us from messing with the cgroup
 *     core for this.
     *
     * The current size of the caches array is stored in
     * memcg_limited_groups_array_size.  It will double each time we have to
     * increase it.
     */
    static DEFINE_IDA(kmem_limited_groups);
    
    int memcg_limited_groups_array_size;
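/*
 * Illustrative sketch of how an index is handed out when a group becomes
 * kmem-limited (the real code lives further down in this file):
 *
 *	id = ida_simple_get(&kmem_limited_groups, 0, MEMCG_CACHES_MAX_SIZE,
 *			    GFP_KERNEL);
 *	if (id < 0)
 *		return id;
 *	memcg->kmemcg_id = id;
 */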
    
    
    /*
 * MIN_SIZE is different from 1, because we would like to avoid going through
     * the alloc/free process all the time. In a small machine, 4 kmem-limited
     * cgroups is a reasonable guess. In the future, it could be a parameter or
     * tunable, but that is strictly not necessary.
     *
     * MAX_SIZE should be as large as the number of css_ids. Ideally, we could get
     * this constant directly from cgroup, but it is understandable that this is
     * better kept as an internal representation in cgroup.c. In any case, the
     * css_id space is not getting any smaller, and we don't have to necessarily
     * increase ours as well if it increases.
     */
    #define MEMCG_CACHES_MIN_SIZE 4
    #define MEMCG_CACHES_MAX_SIZE 65535
    
    
/*
 * A lot of the calls to the cache allocation functions are expected to be
 * inlined by the compiler. Since the calls to memcg_kmem_get_cache are
 * conditional on this static branch, we have to allow modules that do
 * kmem_cache_alloc and the like to see this symbol as well.
 */
    
    struct static_key memcg_kmem_enabled_key;
    
    EXPORT_SYMBOL(memcg_kmem_enabled_key);
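/*
 * Illustrative fast-path guard (memcg_kmem_enabled() in memcontrol.h):
 * allocation hooks bail out before doing any memcg work unless the key has
 * been enabled by at least one kmem-limited group, roughly:
 *
 *	if (!static_key_false(&memcg_kmem_enabled_key))
 *		return cachep;
 */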
    
    
    static void disarm_kmem_keys(struct mem_cgroup *memcg)
    {
    
    	if (memcg_kmem_is_active(memcg)) {
    
    		static_key_slow_dec(&memcg_kmem_enabled_key);
    
    		ida_simple_remove(&kmem_limited_groups, memcg->kmemcg_id);
    	}
    
    	/*
    	 * This check can't live in kmem destruction function,
    	 * since the charges will outlive the cgroup
    	 */
    	WARN_ON(res_counter_read_u64(&memcg->kmem, RES_USAGE) != 0);
    
    }
    #else
    static void disarm_kmem_keys(struct mem_cgroup *memcg)
    {
    }
    #endif /* CONFIG_MEMCG_KMEM */
    
    static void disarm_static_keys(struct mem_cgroup *memcg)
    {
    	disarm_sock_keys(memcg);
    	disarm_kmem_keys(memcg);
    }
    
    
    static void drain_all_stock_async(struct mem_cgroup *memcg);
    
static struct mem_cgroup_per_zone *
mem_cgroup_zoneinfo(struct mem_cgroup *memcg, int nid, int zid)
{
	VM_BUG_ON((unsigned)nid >= nr_node_ids);

	return &memcg->nodeinfo[nid]->zoneinfo[zid];
}

struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *memcg)
{
	return &memcg->css;
}

static struct mem_cgroup_per_zone *
page_cgroup_zoneinfo(struct mem_cgroup *memcg, struct page *page)
{
	int nid = page_to_nid(page);
	int zid = page_zonenum(page);

	return mem_cgroup_zoneinfo(memcg, nid, zid);
}

/*
 * Implementation Note: reading percpu statistics for memcg.
 *
 * Both vmstat[] and percpu_counter use a threshold and do periodic
 * synchronization to implement "quick" reads. There is a trade-off between
 * reading cost and precision of the value, so we could implement a similar
 * periodic synchronization for memcg's counters.
 *
 * But this _read() function is currently used for the user interface. Users
 * account memory usage per memory cgroup and _always_ require an exact value.
 * Even if we provided a quick-and-fuzzy read, we would still have to visit
 * all online cpus and sum up. So, for now, the unnecessary synchronization
 * is not implemented (it is only implemented for cpu hotplug).
 *
 * If there are kernel-internal users that can make do with an inexact value,
 * and reading all cpu values becomes a performance bottleneck in some common
 * workload, a threshold and synchronization scheme like vmstat[] should be
 * implemented.
 */
    
static long mem_cgroup_read_stat(struct mem_cgroup *memcg,
				 enum mem_cgroup_stat_index idx)
{
	long val = 0;
	int cpu;

	get_online_cpus();
	for_each_online_cpu(cpu)
		val += per_cpu(memcg->stat->count[idx], cpu);
#ifdef CONFIG_HOTPLUG_CPU
	spin_lock(&memcg->pcp_counter_lock);
	val += memcg->nocpu_base.count[idx];
	spin_unlock(&memcg->pcp_counter_lock);
#endif
	put_online_cpus();
	return val;
}

    static void mem_cgroup_swap_statistics(struct mem_cgroup *memcg,
    
    					 bool charge)
    {
    	int val = (charge) ? 1 : -1;
    
	this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_SWAP], val);
}

    static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg,
    
    					    enum mem_cgroup_events_index idx)
    {
    	unsigned long val = 0;
    	int cpu;
    
    	for_each_online_cpu(cpu)
    
    		val += per_cpu(memcg->stat->events[idx], cpu);
    
    #ifdef CONFIG_HOTPLUG_CPU
    
    	spin_lock(&memcg->pcp_counter_lock);
    	val += memcg->nocpu_base.events[idx];
	spin_unlock(&memcg->pcp_counter_lock);
#endif
	return val;
}

static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
					 struct page *page,
					 bool anon, int nr_pages)
{
	preempt_disable();

	/*
	 * Here, RSS means 'mapped anon' and anon's SwapCache. Shmem/tmpfs is
	 * counted as CACHE even if it's on ANON LRU.
	 */
	if (anon)
		__this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS],
				nr_pages);
	else
		__this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_CACHE],
				nr_pages);

	if (PageTransHuge(page))
		__this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE],
				nr_pages);

	/* pagein of a big page is an event. So, ignore page size */
	if (nr_pages > 0)
		__this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGIN]);
	else {
		__this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGOUT]);
		nr_pages = -nr_pages; /* for event */
	}

	__this_cpu_add(memcg->stat->nr_page_events, nr_pages);

	preempt_enable();
}
    
unsigned long
mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru)
    
    {
    	struct mem_cgroup_per_zone *mz;
    
    	mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec);
    	return mz->lru_size[lru];
    }
    
static unsigned long
mem_cgroup_zone_nr_lru_pages(struct mem_cgroup *memcg, int nid, int zid,
			unsigned int lru_mask)
{
	struct mem_cgroup_per_zone *mz;
	enum lru_list lru;
	unsigned long ret = 0;

	mz = mem_cgroup_zoneinfo(memcg, nid, zid);

	for_each_lru(lru) {
		if (BIT(lru) & lru_mask)
			ret += mz->lru_size[lru];
	}
	return ret;
}

static unsigned long
mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
			int nid, unsigned int lru_mask)
{
	u64 total = 0;
	int zid;

	for (zid = 0; zid < MAX_NR_ZONES; zid++)
		total += mem_cgroup_zone_nr_lru_pages(memcg,
						nid, zid, lru_mask);

	return total;
}

static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg,
			unsigned int lru_mask)
{
	u64 total = 0;
	int nid;

	for_each_node_state(nid, N_MEMORY)
		total += mem_cgroup_node_nr_lru_pages(memcg, nid, lru_mask);

	return total;
}

static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
				       enum mem_cgroup_events_target target)
{
	unsigned long val, next;

	val = __this_cpu_read(memcg->stat->nr_page_events);
	next = __this_cpu_read(memcg->stat->targets[target]);
    
    	/* from time_after() in jiffies.h */
    
    	if ((long)next - (long)val < 0) {
    		switch (target) {
    		case MEM_CGROUP_TARGET_THRESH:
    			next = val + THRESHOLDS_EVENTS_TARGET;
    			break;
    		case MEM_CGROUP_TARGET_NUMAINFO:
    			next = val + NUMAINFO_EVENTS_TARGET;
    			break;
    		default:
    			break;
    		}
    		__this_cpu_write(memcg->stat->targets[target], next);
		return true;
	}
	return false;
}

static void memcg_check_events(struct mem_cgroup *memcg, struct page *page)
{
	preempt_disable();
	/* threshold event is triggered in finer grain than soft limit */
	if (unlikely(mem_cgroup_event_ratelimit(memcg,
						MEM_CGROUP_TARGET_THRESH))) {
		bool do_numainfo __maybe_unused;

#if MAX_NUMNODES > 1
		do_numainfo = mem_cgroup_event_ratelimit(memcg,
						MEM_CGROUP_TARGET_NUMAINFO);
#endif
		preempt_enable();

		mem_cgroup_threshold(memcg);
#if MAX_NUMNODES > 1
		if (unlikely(do_numainfo))
			atomic_inc(&memcg->numainfo_events);
#endif
	} else
		preempt_enable();
}

struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
{
	/*
	 * mm_update_next_owner() may clear mm->owner to NULL
	 * if it races with swapoff, page migration, etc.
	 * So this can be called with p == NULL.
	 */
	if (unlikely(!p))
		return NULL;

	return mem_cgroup_from_css(task_css(p, mem_cgroup_subsys_id));
}
    
struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm)
{
	struct mem_cgroup *memcg = NULL;

	if (!mm)
		return NULL;
	/*
	 * Because we have no locks, mm->owner may be being moved to another
	 * cgroup. We use css_tryget() here even if this looks
	 * pessimistic (rather than adding locks here).
	 */
	rcu_read_lock();
	do {
		memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
		if (unlikely(!memcg))
			break;
	} while (!css_tryget(&memcg->css));
	rcu_read_unlock();
	return memcg;
}
    
    /*
     * Returns a next (in a pre-order walk) alive memcg (with elevated css
     * ref. count) or NULL if the whole root's subtree has been visited.
     *
     * helper function to be used by mem_cgroup_iter
     */
static struct mem_cgroup *__mem_cgroup_iter_next(struct mem_cgroup *root,
		struct mem_cgroup *last_visited)
{
	struct cgroup_subsys_state *prev_css, *next_css;

	prev_css = last_visited ? &last_visited->css : NULL;
skip_node:
	next_css = css_next_descendant_pre(prev_css, &root->css);

	/*
	 * Even if we found a group we have to make sure it is
	 * alive. css && !memcg means that the groups should be
	 * skipped and we should continue the tree walk.
	 * last_visited css is safe to use because it is
	 * protected by css_get and the tree walk is rcu safe.
	 */
	if (next_css) {
		struct mem_cgroup *mem = mem_cgroup_from_css(next_css);

		if (css_tryget(&mem->css))
			return mem;
		else {
			prev_css = next_css;
			goto skip_node;
		}
	}

	return NULL;
}

    static void mem_cgroup_iter_invalidate(struct mem_cgroup *root)
    {
    	/*
    	 * When a group in the hierarchy below root is destroyed, the
    	 * hierarchy iterator can no longer be trusted since it might
    	 * have pointed to the destroyed group.  Invalidate it.
    	 */
    	atomic_inc(&root->dead_count);
    }
    
    static struct mem_cgroup *
    mem_cgroup_iter_load(struct mem_cgroup_reclaim_iter *iter,
    		     struct mem_cgroup *root,
    		     int *sequence)
    {
    	struct mem_cgroup *position = NULL;
    	/*
    	 * A cgroup destruction happens in two stages: offlining and
    	 * release.  They are separated by a RCU grace period.
    	 *
    	 * If the iterator is valid, we may still race with an
    	 * offlining.  The RCU lock ensures the object won't be
    	 * released, tryget will fail if we lost the race.
    	 */
    	*sequence = atomic_read(&root->dead_count);
    	if (iter->last_dead_count == *sequence) {
    		smp_rmb();
    		position = iter->last_visited;
    		if (position && !css_tryget(&position->css))
    			position = NULL;
    	}
    	return position;
    }
    
    static void mem_cgroup_iter_update(struct mem_cgroup_reclaim_iter *iter,
    				   struct mem_cgroup *last_visited,
    				   struct mem_cgroup *new_position,
    				   int sequence)
    {
    	if (last_visited)
    		css_put(&last_visited->css);
    	/*
    	 * We store the sequence count from the time @last_visited was
    	 * loaded successfully instead of rereading it here so that we
    	 * don't lose destruction events in between.  We could have
    	 * raced with the destruction of @new_position after all.
    	 */
    	iter->last_visited = new_position;
    	smp_wmb();
    	iter->last_dead_count = sequence;
    }
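/*
 * Typical usage of the iterator below (illustrative): a full pre-order walk
 * over a hierarchy, dropping the reference by passing the previous memcg
 * back in on the next call:
 *
 *	struct mem_cgroup *iter;
 *
 *	for (iter = mem_cgroup_iter(root, NULL, NULL);
 *	     iter != NULL;
 *	     iter = mem_cgroup_iter(root, iter, NULL))
 *		...
 *
 * An early break out of such a loop must call mem_cgroup_iter_break(root,
 * iter) to release the reference held on the last returned group.
 */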
    
    
    /**
     * mem_cgroup_iter - iterate over memory cgroup hierarchy
     * @root: hierarchy root
     * @prev: previously returned memcg, NULL on first invocation
     * @reclaim: cookie for shared reclaim walks, NULL for full walks
     *
     * Returns references to children of the hierarchy below @root, or
     * @root itself, or %NULL after a full round-trip.
     *
     * Caller must pass the return value in @prev on subsequent
     * invocations for reference counting, or use mem_cgroup_iter_break()
     * to cancel a hierarchy walk before the round-trip is complete.
     *
     * Reclaimers can specify a zone and a priority level in @reclaim to
     * divide up the memcgs in the hierarchy among all concurrent
     * reclaimers operating on the same zone and priority.
     */
    
struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
				   struct mem_cgroup *prev,
				   struct mem_cgroup_reclaim_cookie *reclaim)
{
	struct mem_cgroup *memcg = NULL;
	struct mem_cgroup *last_visited = NULL;
    
    	if (mem_cgroup_disabled())
    		return NULL;
    
    	if (!root)
    		root = root_mem_cgroup;
    
    	if (!root->use_hierarchy && root != root_mem_cgroup) {
		if (prev)
			goto out_css_put;
		return root;
	}

	rcu_read_lock();
	while (!memcg) {
		struct mem_cgroup_reclaim_iter *uninitialized_var(iter);
		int uninitialized_var(seq);

    		if (reclaim) {
    			int nid = zone_to_nid(reclaim->zone);
    			int zid = zone_idx(reclaim->zone);
    			struct mem_cgroup_per_zone *mz;