/*
     *  Generic process-grouping system.
     *
     *  Based originally on the cpuset system, extracted by Paul Menage
     *  Copyright (C) 2006 Google, Inc
     *
    
     *  Notifications support
     *  Copyright (C) 2009 Nokia Corporation
     *  Author: Kirill A. Shutemov
     *
    
     *  Copyright notices from the original cpuset code:
     *  --------------------------------------------------
     *  Copyright (C) 2003 BULL SA.
     *  Copyright (C) 2004-2006 Silicon Graphics, Inc.
     *
     *  Portions derived from Patrick Mochel's sysfs code.
     *  sysfs is Copyright (c) 2001-3 Patrick Mochel
     *
     *  2003-10-10 Written by Simon Derr.
     *  2003-10-22 Updates by Stephen Hemminger.
     *  2004 May-July Rework by Paul Jackson.
     *  ---------------------------------------------------
     *
     *  This file is subject to the terms and conditions of the GNU General Public
     *  License.  See the file COPYING in the main directory of the Linux
     *  distribution for more details.
     */
    
#include <linux/cgroup.h>
#include <linux/ctype.h>
#include <linux/errno.h>
#include <linux/fs.h>
#include <linux/init_task.h>
#include <linux/kernel.h>
#include <linux/list.h>
#include <linux/mm.h>
#include <linux/mutex.h>
#include <linux/mount.h>
#include <linux/pagemap.h>
#include <linux/proc_fs.h>
#include <linux/rcupdate.h>
#include <linux/sched.h>
#include <linux/backing-dev.h>
#include <linux/seq_file.h>
#include <linux/slab.h>
#include <linux/magic.h>
#include <linux/spinlock.h>
#include <linux/string.h>
#include <linux/sort.h>
#include <linux/module.h>
#include <linux/delayacct.h>
#include <linux/cgroupstats.h>
#include <linux/hash.h>
#include <linux/namei.h>
#include <linux/pid_namespace.h>
#include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */
#include <linux/eventfd.h>
#include <linux/poll.h>
#include <linux/flex_array.h> /* used in cgroup_attach_proc */

#include <linux/atomic.h>
    
    /* css deactivation bias, makes css->refcnt negative to deny new trygets */
    #define CSS_DEACT_BIAS		INT_MIN
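
/*
 * Worked example (illustrative, not from the original source): a live
 * css with refcnt == 3 that is deactivated by adding CSS_DEACT_BIAS
 * ends up at INT_MIN + 3, which is negative, so css_tryget() callers
 * see the deactivation and fail.  css_unbias_refcnt() below recovers
 * the true count:
 *
 *	css_unbias_refcnt(INT_MIN + 3) == (INT_MIN + 3) - INT_MIN == 3
 */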
    
    
    /*
     * cgroup_mutex is the master lock.  Any modification to cgroup or its
     * hierarchy must be performed while holding it.
     *
     * cgroup_root_mutex nests inside cgroup_mutex and should be held to modify
     * cgroupfs_root of any cgroup hierarchy - subsys list, flags,
     * release_agent_path and so on.  Modifying requires both cgroup_mutex and
     * cgroup_root_mutex.  Readers can acquire either of the two.  This is to
     * break the following locking order cycle.
     *
     *  A. cgroup_mutex -> cred_guard_mutex -> s_type->i_mutex_key -> namespace_sem
     *  B. namespace_sem -> cgroup_mutex
     *
     * B happens only through cgroup_show_options() and using cgroup_root_mutex
     * breaks it.
     */
    
    static DEFINE_MUTEX(cgroup_mutex);
    
    static DEFINE_MUTEX(cgroup_root_mutex);
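
/*
 * Illustrative nesting sketch (not a function in this file): a writer
 * updating root state takes both mutexes in the documented order, while
 * a reader may take either one alone.
 *
 *	mutex_lock(&cgroup_mutex);
 *	mutex_lock(&cgroup_root_mutex);
 *	strcpy(root->release_agent_path, new_path);
 *	mutex_unlock(&cgroup_root_mutex);
 *	mutex_unlock(&cgroup_mutex);
 */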
    
/*
 * Generate an array of cgroup subsystem pointers. At boot time, this is
 * populated with the built in subsystems, and modular subsystems are
 * registered after that. The mutable section of this array is protected by
 * cgroup_mutex.
 */
    
    #define SUBSYS(_x) [_x ## _subsys_id] = &_x ## _subsys,
    
    #define IS_SUBSYS_ENABLED(option) IS_BUILTIN(option)
    
    static struct cgroup_subsys *subsys[CGROUP_SUBSYS_COUNT] = {
    
    #include <linux/cgroup_subsys.h>
    };
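
/*
 * For illustration: with CONFIG_CPUSETS built in, the SUBSYS() entry
 * generated from <linux/cgroup_subsys.h> expands to an initializer like
 *
 *	[cpuset_subsys_id] = &cpuset_subsys,
 *
 * so every enabled subsystem sits at its own fixed index in subsys[].
 */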
    
    
    #define MAX_CGROUP_ROOT_NAMELEN 64
    
    
    /*
     * A cgroupfs_root represents the root of a cgroup hierarchy,
     * and may be associated with a superblock to form an active
     * hierarchy
     */
    struct cgroupfs_root {
    	struct super_block *sb;
    
    	/*
    	 * The bitmask of subsystems intended to be attached to this
    	 * hierarchy
    	 */
    
    	unsigned long subsys_mask;
    
    	/* Unique id for this hierarchy. */
    	int hierarchy_id;
    
    
    	/* The bitmask of subsystems currently attached to this hierarchy */
    
    	unsigned long actual_subsys_mask;
    
    
    	/* A list running through the attached subsystems */
    	struct list_head subsys_list;
    
    	/* The root cgroup for this hierarchy */
    	struct cgroup top_cgroup;
    
    	/* Tracks how many cgroups are currently defined in hierarchy.*/
    	int number_of_cgroups;
    
    
    	/* A list running through the active hierarchies */
    
    	struct list_head root_list;
    
    
    	/* All cgroups on this root, cgroup_mutex protected */
    	struct list_head allcg_list;
    
    
    	/* Hierarchy-specific flags */
    	unsigned long flags;
    
    	/* The path to use for release notifications. */
    
    	char release_agent_path[PATH_MAX];
    
    
    	/* The name for this hierarchy - may be empty */
    	char name[MAX_CGROUP_ROOT_NAMELEN];
    
    };
    
    /*
     * The "rootnode" hierarchy is the "dummy hierarchy", reserved for the
     * subsystems that are otherwise unattached - it never has more than a
     * single cgroup, and all tasks are part of that cgroup.
     */
    static struct cgroupfs_root rootnode;
    
    
    /*
     * cgroupfs file entry, pointed to from leaf dentry->d_fsdata.
     */
    struct cfent {
    	struct list_head		node;
    	struct dentry			*dentry;
    	struct cftype			*type;
    };
    
    
    /*
     * CSS ID -- ID per subsys's Cgroup Subsys State(CSS). used only when
     * cgroup_subsys->use_id != 0.
     */
    #define CSS_ID_MAX	(65535)
    struct css_id {
    	/*
    	 * The css to which this ID points. This pointer is set to valid value
    	 * after cgroup is populated. If cgroup is removed, this will be NULL.
	 * This pointer is expected to be RCU-safe because destroy()
	 * is called after synchronize_rcu(). But for safe use, css_tryget()
	 * should be used for avoiding race.
	 */
    	struct cgroup_subsys_state __rcu *css;
    
    	/*
    	 * ID of this css.
    	 */
    	unsigned short id;
    	/*
    	 * Depth in hierarchy which this ID belongs to.
    	 */
    	unsigned short depth;
    	/*
    	 * ID is freed by RCU. (and lookup routine is RCU safe.)
    	 */
    	struct rcu_head rcu_head;
    	/*
    	 * Hierarchy of CSS ID belongs to.
    	 */
    	unsigned short stack[0]; /* Array of Length (depth+1) */
    };
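
/*
 * Illustrative layout (made-up IDs): a css at depth 2 whose ancestors
 * have IDs 1 and 7, and whose own ID is 42, carries
 *
 *	depth = 2, stack[] = { 1, 7, 42 }
 *
 * so an ancestry test reduces to comparing
 * child->stack[ancestor->depth] with ancestor->id.
 */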
    
    
/*
 * cgroup_event represents events which userspace wants to receive.
 */
    struct cgroup_event {
    	/*
    	 * Cgroup which the event belongs to.
    	 */
    	struct cgroup *cgrp;
    	/*
    	 * Control file which the event associated.
    	 */
    	struct cftype *cft;
    	/*
    	 * eventfd to signal userspace about the event.
    	 */
    	struct eventfd_ctx *eventfd;
    	/*
    	 * Each of these stored in a list by the cgroup.
    	 */
    	struct list_head list;
    	/*
    	 * All fields below needed to unregister event when
    	 * userspace closes eventfd.
    	 */
    	poll_table pt;
    	wait_queue_head_t *wqh;
    	wait_queue_t wait;
    	struct work_struct remove;
    };
    
    /* The list of hierarchy roots */
    
static LIST_HEAD(roots);
static int root_count;
    
    static DEFINE_IDA(hierarchy_ida);
    static int next_hierarchy_id;
    static DEFINE_SPINLOCK(hierarchy_id_lock);
    
    
    /* dummytop is a shorthand for the dummy hierarchy's top cgroup */
    #define dummytop (&rootnode.top_cgroup)
    
/* This flag indicates whether tasks in the fork and exit paths should
 * check for fork/exit handlers to call. This avoids us having to do
 * extra work in the fork/exit path if none of the subsystems need to
 * be called.
 */
    static int need_forkexit_callback __read_mostly;
    
    #ifdef CONFIG_PROVE_LOCKING
    int cgroup_lock_is_held(void)
    {
    	return lockdep_is_held(&cgroup_mutex);
    }
    #else /* #ifdef CONFIG_PROVE_LOCKING */
    int cgroup_lock_is_held(void)
    {
    	return mutex_is_locked(&cgroup_mutex);
    }
    #endif /* #else #ifdef CONFIG_PROVE_LOCKING */
    
    EXPORT_SYMBOL_GPL(cgroup_lock_is_held);
    
    
    static int css_unbias_refcnt(int refcnt)
    {
    	return refcnt >= 0 ? refcnt : refcnt - CSS_DEACT_BIAS;
    }
    
    
    /* the current nr of refs, always >= 0 whether @css is deactivated or not */
    static int css_refcnt(struct cgroup_subsys_state *css)
    {
	int v = atomic_read(&css->refcnt);

	return css_unbias_refcnt(v);
}

/* convenient tests for these bits */
inline int cgroup_is_removed(const struct cgroup *cgrp)
{
	return test_bit(CGRP_REMOVED, &cgrp->flags);
}
    
/* bits in struct cgroupfs_root flags field */
enum {
	ROOT_NOPREFIX,	/* mounted subsystems have no named prefix */
	ROOT_XATTR,	/* supports extended attributes */
};

static int cgroup_is_releasable(const struct cgroup *cgrp)
{
	const int bits =
		(1 << CGRP_RELEASABLE) |
		(1 << CGRP_NOTIFY_ON_RELEASE);
	return (cgrp->flags & bits) == bits;
}

static int notify_on_release(const struct cgroup *cgrp)
{
	return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
}
    static int clone_children(const struct cgroup *cgrp)
    {
    	return test_bit(CGRP_CLONE_CHILDREN, &cgrp->flags);
    }
    
    
    /*
     * for_each_subsys() allows you to iterate on each subsystem attached to
     * an active hierarchy
     */
    #define for_each_subsys(_root, _ss) \
    list_for_each_entry(_ss, &_root->subsys_list, sibling)
    
    
    /* for_each_active_root() allows you to iterate across the active hierarchies */
#define for_each_active_root(_root) \
list_for_each_entry(_root, &roots, root_list)
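
/*
 * Usage sketch (illustrative): counting every bound subsystem across
 * all active hierarchies.  The caller must hold cgroup_mutex so that
 * neither list changes underneath it.
 *
 *	struct cgroupfs_root *root;
 *	struct cgroup_subsys *ss;
 *	int n = 0;
 *
 *	for_each_active_root(root)
 *		for_each_subsys(root, ss)
 *			n++;
 */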
    
    
    static inline struct cgroup *__d_cgrp(struct dentry *dentry)
    {
    	return dentry->d_fsdata;
    }
    
    
    static inline struct cfent *__d_cfe(struct dentry *dentry)
    
    {
    	return dentry->d_fsdata;
    }
    
    
    static inline struct cftype *__d_cft(struct dentry *dentry)
    {
    	return __d_cfe(dentry)->type;
    }
    
    
    /* the list of cgroups eligible for automatic release. Protected by
     * release_list_lock */
    static LIST_HEAD(release_list);
    
    static DEFINE_RAW_SPINLOCK(release_list_lock);
    
    static void cgroup_release_agent(struct work_struct *work);
    static DECLARE_WORK(release_agent_work, cgroup_release_agent);
    
    static void check_for_release(struct cgroup *cgrp);
    
    /* Link structure for associating css_set objects with cgroups */
    struct cg_cgroup_link {
    	/*
    	 * List running through cg_cgroup_links associated with a
    	 * cgroup, anchored on cgroup->css_sets
    	 */
    
    	struct list_head cgrp_link_list;
    
    	/*
    	 * List running through cg_cgroup_links pointing at a
    	 * single css_set object, anchored on css_set->cg_links
    	 */
    	struct list_head cg_link_list;
    	struct css_set *cg;
    };
    
    /* The default css_set - used by init and its children prior to any
     * hierarchies being mounted. It contains a pointer to the root state
     * for each subsystem. Also used to anchor the list of css_sets. Not
     * reference-counted, to improve performance when child cgroups
     * haven't been created.
     */
    
    static struct css_set init_css_set;
    static struct cg_cgroup_link init_css_set_link;
    
    
    static int cgroup_init_idr(struct cgroup_subsys *ss,
    			   struct cgroup_subsys_state *css);
    
    /* css_set_lock protects the list of css_set objects, and the
     * chain of tasks off each css_set.  Nests outside task->alloc_lock
     * due to cgroup_iter_start() */
    static DEFINE_RWLOCK(css_set_lock);
    static int css_set_count;
    
    
    /*
     * hash table for cgroup groups. This improves the performance to find
     * an existing css_set. This hash doesn't (currently) take into
     * account cgroups in empty hierarchies.
     */
    
    #define CSS_SET_HASH_BITS	7
    #define CSS_SET_TABLE_SIZE	(1 << CSS_SET_HASH_BITS)
    static struct hlist_head css_set_table[CSS_SET_TABLE_SIZE];
    
    static struct hlist_head *css_set_hash(struct cgroup_subsys_state *css[])
    {
    	int i;
    	int index;
    	unsigned long tmp = 0UL;
    
    	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++)
    		tmp += (unsigned long)css[i];
    	tmp = (tmp >> 16) ^ tmp;
    
    	index = hash_long(tmp, CSS_SET_HASH_BITS);
    
    	return &css_set_table[index];
    }
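
/*
 * For illustration: css_set_hash() sums the css pointers, folds the
 * high bits into the low bits with (tmp >> 16) ^ tmp, and lets
 * hash_long() reduce the result to one of 128 buckets
 * (CSS_SET_HASH_BITS == 7), so css_sets with identical subsystem state
 * always land in the same bucket.
 */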
    
    
    /* We don't maintain the lists running through each css_set to its
     * task until after the first call to cgroup_iter_start(). This
     * reduces the fork()/exit() overhead for people who have cgroups
     * compiled into their kernel but not actually in use */
    
    static int use_task_css_set_links __read_mostly;
    
static void __put_css_set(struct css_set *cg, int taskexit)
{
    	struct cg_cgroup_link *link;
    	struct cg_cgroup_link *saved_link;
    
    	/*
    	 * Ensure that the refcount doesn't hit zero while any readers
    	 * can see it. Similar to atomic_dec_and_lock(), but for an
    	 * rwlock
    	 */
    	if (atomic_add_unless(&cg->refcount, -1, 1))
    		return;
    	write_lock(&css_set_lock);
    	if (!atomic_dec_and_test(&cg->refcount)) {
    		write_unlock(&css_set_lock);
    		return;
    	}
    
    	/* This css_set is dead. unlink it and release cgroup refcounts */
    	hlist_del(&cg->hlist);
    	css_set_count--;
    
    	list_for_each_entry_safe(link, saved_link, &cg->cg_links,
    				 cg_link_list) {
    		struct cgroup *cgrp = link->cgrp;
    		list_del(&link->cg_link_list);
    		list_del(&link->cgrp_link_list);
    
    		if (atomic_dec_and_test(&cgrp->count) &&
		    notify_on_release(cgrp)) {
			if (taskexit)
				set_bit(CGRP_RELEASABLE, &cgrp->flags);
			check_for_release(cgrp);
		}

		kfree(link);
	}

	write_unlock(&css_set_lock);
	kfree_rcu(cg, rcu_head);
}

    /*
     * refcounted get/put for css_set objects
     */
static inline void get_css_set(struct css_set *cg)
{
	atomic_inc(&cg->refcount);
}

static inline void put_css_set(struct css_set *cg)
{
	__put_css_set(cg, 0);
}

static inline void put_css_set_taskexit(struct css_set *cg)
{
	__put_css_set(cg, 1);
}
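
/*
 * Usage sketch (illustrative): a caller that needs a css_set to stay
 * alive across a traversal takes a temporary reference.
 *
 *	get_css_set(cg);
 *	...walk cg->tasks under css_set_lock...
 *	put_css_set(cg);
 *
 * put_css_set_taskexit() is the exit-path variant: the taskexit flag
 * lets __put_css_set() mark a cgroup that just lost its last task as
 * CGRP_RELEASABLE before checking for release.
 */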
    
    /*
     * compare_css_sets - helper function for find_existing_css_set().
     * @cg: candidate css_set being tested
     * @old_cg: existing css_set for a task
     * @new_cgrp: cgroup that's being entered by the task
     * @template: desired set of css pointers in css_set (pre-calculated)
     *
     * Returns true if "cg" matches "old_cg" except for the hierarchy
     * which "new_cgrp" belongs to, for which it should match "new_cgrp".
     */
    static bool compare_css_sets(struct css_set *cg,
    			     struct css_set *old_cg,
    			     struct cgroup *new_cgrp,
    			     struct cgroup_subsys_state *template[])
    {
    	struct list_head *l1, *l2;
    
    	if (memcmp(template, cg->subsys, sizeof(cg->subsys))) {
    		/* Not all subsystems matched */
    		return false;
    	}
    
    	/*
    	 * Compare cgroup pointers in order to distinguish between
	 * different cgroups in hierarchies with no subsystems. We
    	 * could get by with just this check alone (and skip the
    	 * memcmp above) but on most setups the memcmp check will
    	 * avoid the need for this more expensive check on almost all
    	 * candidates.
    	 */
    
    	l1 = &cg->cg_links;
    	l2 = &old_cg->cg_links;
    	while (1) {
    		struct cg_cgroup_link *cgl1, *cgl2;
    		struct cgroup *cg1, *cg2;
    
    		l1 = l1->next;
    		l2 = l2->next;
    		/* See if we reached the end - both lists are equal length. */
    		if (l1 == &cg->cg_links) {
    			BUG_ON(l2 != &old_cg->cg_links);
    			break;
    		} else {
    			BUG_ON(l2 == &old_cg->cg_links);
    		}
    		/* Locate the cgroups associated with these links. */
    		cgl1 = list_entry(l1, struct cg_cgroup_link, cg_link_list);
    		cgl2 = list_entry(l2, struct cg_cgroup_link, cg_link_list);
    		cg1 = cgl1->cgrp;
    		cg2 = cgl2->cgrp;
    		/* Hierarchies should be linked in the same order. */
    		BUG_ON(cg1->root != cg2->root);
    
    		/*
    		 * If this hierarchy is the hierarchy of the cgroup
    		 * that's changing, then we need to check that this
    		 * css_set points to the new cgroup; if it's any other
    		 * hierarchy, then this css_set should point to the
    		 * same cgroup as the old css_set.
    		 */
    		if (cg1->root == new_cgrp->root) {
    			if (cg1 != new_cgrp)
    				return false;
    		} else {
    			if (cg1 != cg2)
    				return false;
    		}
    	}
    	return true;
    }
    
    
/*
 * find_existing_css_set() is a helper for
 * find_css_set(), and checks to see whether an existing
 * css_set is suitable.
 *
 * oldcg: the cgroup group that we're using before the cgroup
 * transition
 *
 * cgrp: the cgroup that we're moving into
 *
 * template: location in which to build the desired set of subsystem
 * state objects for the new cgroup group
 */
static struct css_set *find_existing_css_set(
	struct css_set *oldcg,
	struct cgroup *cgrp,
	struct cgroup_subsys_state *template[])
{
	int i;
	struct cgroupfs_root *root = cgrp->root;
	struct hlist_head *hhead;
	struct hlist_node *node;
	struct css_set *cg;

    	/*
    	 * Build the set of subsystem state objects that we want to see in the
    	 * new css_set. while subsystems can change globally, the entries here
    	 * won't change, so no need for locking.
    	 */
    
    	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
    
    		if (root->subsys_mask & (1UL << i)) {
    
    			/* Subsystem is in this hierarchy. So we want
    			 * the subsystem state from the new
    			 * cgroup */
    
    			template[i] = cgrp->subsys[i];
    
    		} else {
    			/* Subsystem is not in this hierarchy, so we
    			 * don't want to change the subsystem state */
    			template[i] = oldcg->subsys[i];
    		}
    	}
    
    
    	hhead = css_set_hash(template);
    	hlist_for_each_entry(cg, node, hhead, hlist) {
    
    		if (!compare_css_sets(cg, oldcg, cgrp, template))
    			continue;
    
		/* This css_set matches what we need */
		return cg;
	}

    
    	/* No existing cgroup group matched */
    	return NULL;
    }
    
    
    static void free_cg_links(struct list_head *tmp)
    {
    	struct cg_cgroup_link *link;
    	struct cg_cgroup_link *saved_link;
    
    	list_for_each_entry_safe(link, saved_link, tmp, cgrp_link_list) {
    		list_del(&link->cgrp_link_list);
    		kfree(link);
    	}
    }
    
    
    /*
     * allocate_cg_links() allocates "count" cg_cgroup_link structures
    
     * and chains them on tmp through their cgrp_link_list fields. Returns 0 on
    
     * success or a negative error
     */
    static int allocate_cg_links(int count, struct list_head *tmp)
    {
    	struct cg_cgroup_link *link;
    	int i;
    	INIT_LIST_HEAD(tmp);
    	for (i = 0; i < count; i++) {
    		link = kmalloc(sizeof(*link), GFP_KERNEL);
		if (!link) {
			free_cg_links(tmp);
			return -ENOMEM;
		}
		list_add(&link->cgrp_link_list, tmp);
	}
	return 0;
}

    /**
     * link_css_set - a helper function to link a css_set to a cgroup
     * @tmp_cg_links: cg_cgroup_link objects allocated by allocate_cg_links()
     * @cg: the css_set to be linked
     * @cgrp: the destination cgroup
     */
    static void link_css_set(struct list_head *tmp_cg_links,
    			 struct css_set *cg, struct cgroup *cgrp)
    {
    	struct cg_cgroup_link *link;
    
    	BUG_ON(list_empty(tmp_cg_links));
    	link = list_first_entry(tmp_cg_links, struct cg_cgroup_link,
    				cgrp_link_list);
    	link->cg = cg;
    
    	list_move(&link->cgrp_link_list, &cgrp->css_sets);
    
    	/*
    	 * Always add links to the tail of the list so that the list
    	 * is sorted by order of hierarchy creation
    	 */
	list_add_tail(&link->cg_link_list, &cg->cg_links);
}

    /*
     * find_css_set() takes an existing cgroup group and a
     * cgroup object, and returns a css_set object that's
     * equivalent to the old group, but with the given cgroup
     * substituted into the appropriate hierarchy. Must be called with
     * cgroup_mutex held
     */
static struct css_set *find_css_set(
	struct css_set *oldcg, struct cgroup *cgrp)
{
	struct css_set *res;
	struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT];
	struct list_head tmp_cg_links;
	struct cg_cgroup_link *link;
	struct hlist_head *hhead;
    
    	/* First see if we already have a cgroup group that matches
    	 * the desired set */
    
	read_lock(&css_set_lock);
	res = find_existing_css_set(oldcg, cgrp, template);
	if (res)
		get_css_set(res);
	read_unlock(&css_set_lock);
    
    
    	if (res)
    		return res;
    
    	res = kmalloc(sizeof(*res), GFP_KERNEL);
    	if (!res)
    		return NULL;
    
    	/* Allocate all the cg_cgroup_link objects that we'll need */
    	if (allocate_cg_links(root_count, &tmp_cg_links) < 0) {
    		kfree(res);
    		return NULL;
    	}
    
    
	atomic_set(&res->refcount, 1);
	INIT_LIST_HEAD(&res->cg_links);
	INIT_LIST_HEAD(&res->tasks);
	INIT_HLIST_NODE(&res->hlist);
    
    
    	/* Copy the set of subsystem state objects generated in
    	 * find_existing_css_set() */
    	memcpy(res->subsys, template, sizeof(res->subsys));
    
    	write_lock(&css_set_lock);
    	/* Add reference counts and links from the new css_set. */
    
    	list_for_each_entry(link, &oldcg->cg_links, cg_link_list) {
    		struct cgroup *c = link->cgrp;
    		if (c->root == cgrp->root)
    			c = cgrp;
    		link_css_set(&tmp_cg_links, res, c);
    	}
    
    
    	BUG_ON(!list_empty(&tmp_cg_links));
    
    	css_set_count++;
    
    
    	/* Add this cgroup group to the hash table */
    	hhead = css_set_hash(res->subsys);
    	hlist_add_head(&res->hlist, hhead);
    
    
    	write_unlock(&css_set_lock);
    
	return res;
}
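
/*
 * Usage sketch (illustrative, simplified from the attach path): moving
 * a task into @cgrp conceptually does
 *
 *	newcg = find_css_set(task->cgroups, cgrp);
 *	if (!newcg)
 *		return -ENOMEM;
 *	...switch task->cgroups to newcg under task_lock()...
 *	put_css_set(oldcg);
 *
 * find_css_set() returns an existing matching css_set when one is found
 * in the hash table, so repeated attaches don't allocate.
 */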
    
    /*
     * Return the cgroup for "task" from the given hierarchy. Must be
     * called with cgroup_mutex held.
     */
    static struct cgroup *task_cgroup_from_root(struct task_struct *task,
    					    struct cgroupfs_root *root)
    {
    	struct css_set *css;
    	struct cgroup *res = NULL;
    
    	BUG_ON(!mutex_is_locked(&cgroup_mutex));
    	read_lock(&css_set_lock);
    	/*
    	 * No need to lock the task - since we hold cgroup_mutex the
    	 * task can't change groups, so the only thing that can happen
    	 * is that it exits and its css is set back to init_css_set.
    	 */
    	css = task->cgroups;
    	if (css == &init_css_set) {
    		res = &root->top_cgroup;
    	} else {
    		struct cg_cgroup_link *link;
    		list_for_each_entry(link, &css->cg_links, cg_link_list) {
    			struct cgroup *c = link->cgrp;
    			if (c->root == root) {
    				res = c;
    				break;
    			}
    		}
    	}
    	read_unlock(&css_set_lock);
    	BUG_ON(!res);
    	return res;
    }
    
    
    /*
     * There is one global cgroup mutex. We also require taking
     * task_lock() when dereferencing a task's cgroup subsys pointers.
     * See "The task_lock() exception", at the end of this comment.
     *
     * A task must hold cgroup_mutex to modify cgroups.
     *
     * Any task can increment and decrement the count field without lock.
     * So in general, code holding cgroup_mutex can't rely on the count
     * field not changing.  However, if the count goes to zero, then only
    
     * cgroup_attach_task() can increment it again.  Because a count of zero
    
     * means that no tasks are currently attached, therefore there is no
     * way a task attached to that cgroup can fork (the other way to
     * increment the count).  So code holding cgroup_mutex can safely
     * assume that if the count is zero, it will stay zero. Similarly, if
     * a task holds cgroup_mutex on a cgroup with zero count, it
     * knows that the cgroup won't be removed, as cgroup_rmdir()
     * needs that mutex.
     *
     * The fork and exit callbacks cgroup_fork() and cgroup_exit(), don't
     * (usually) take cgroup_mutex.  These are the two most performance
     * critical pieces of code here.  The exception occurs on cgroup_exit(),
     * when a task in a notify_on_release cgroup exits.  Then cgroup_mutex
     * is taken, and if the cgroup count is zero, a usermode call made
    
     * to the release agent with the name of the cgroup (path relative to
     * the root of cgroup file system) as the argument.
    
     *
     * A cgroup can only be deleted if both its 'count' of using tasks
     * is zero, and its list of 'children' cgroups is empty.  Since all
     * tasks in the system use _some_ cgroup, and since there is always at
     * least one task in the system (init, pid == 1), therefore, top_cgroup
     * always has either children cgroups and/or using tasks.  So we don't
     * need a special hack to ensure that top_cgroup cannot be deleted.
     *
     *	The task_lock() exception
     *
     * The need for this exception arises from the action of
    
 * cgroup_attach_task(), which overwrites one task's cgroup pointer with
    
     * another.  It does so using cgroup_mutex, however there are
    
     * several performance critical places that need to reference
     * task->cgroup without the expense of grabbing a system global
     * mutex.  Therefore except as noted below, when dereferencing or, as
    
 * in cgroup_attach_task(), modifying a task's cgroup pointer we use
    
     * task_lock(), which acts on a spinlock (task->alloc_lock) already in
     * the task_struct routinely used for such matters.
     *
     * P.S.  One more locking exception.  RCU is used to guard the
    
 * update of a task's cgroup pointer by cgroup_attach_task()
    
     */
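
/*
 * Illustrative sketch of the task_lock() exception described above
 * (hypothetical helper, not part of this file): taking a stable
 * reference to a task's css_set without touching cgroup_mutex.
 *
 *	static struct css_set *get_task_css_set(struct task_struct *tsk)
 *	{
 *		struct css_set *cg;
 *
 *		task_lock(tsk);
 *		cg = tsk->cgroups;
 *		get_css_set(cg);
 *		task_unlock(tsk);
 *		return cg;
 *	}
 */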
    
    /**
     * cgroup_lock - lock out any changes to cgroup structures
     *
     */
    void cgroup_lock(void)
    {
    	mutex_lock(&cgroup_mutex);
    }
    
    EXPORT_SYMBOL_GPL(cgroup_lock);
    
    
    /**
     * cgroup_unlock - release lock on cgroup changes
     *
     * Undo the lock taken in a previous cgroup_lock() call.
     */
    void cgroup_unlock(void)
    {
    	mutex_unlock(&cgroup_mutex);
    }
    
    EXPORT_SYMBOL_GPL(cgroup_unlock);
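
/*
 * Typical usage from a subsystem (illustrative):
 *
 *	cgroup_lock();
 *	...inspect or modify hierarchy state...
 *	cgroup_unlock();
 *
 * Both helpers just wrap cgroup_mutex; they exist so that modular
 * subsystems can take the lock without the static mutex itself being
 * exported.
 */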
    
    
    /*
     * A couple of forward declarations required, due to cyclic reference loop:
     * cgroup_mkdir -> cgroup_create -> cgroup_populate_dir ->
     * cgroup_add_file -> cgroup_create_file -> cgroup_dir_inode_operations
     * -> cgroup_mkdir.
     */
    
    
    static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode);
    
    static struct dentry *cgroup_lookup(struct inode *, struct dentry *, unsigned int);
    
    static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry);
    
    static int cgroup_populate_dir(struct cgroup *cgrp, bool base_files,
    			       unsigned long subsys_mask);
    
    static const struct inode_operations cgroup_dir_inode_operations;
    
    static const struct file_operations proc_cgroupstats_operations;
    
    
static struct backing_dev_info cgroup_backing_dev_info = {
	.name		= "cgroup",
	.capabilities	= BDI_CAP_NO_ACCT_AND_WRITEBACK,
};

    static int alloc_css_id(struct cgroup_subsys *ss,
    			struct cgroup *parent, struct cgroup *child);
    
    
    static struct inode *cgroup_new_inode(umode_t mode, struct super_block *sb)
    
    {
    	struct inode *inode = new_inode(sb);
    
    	if (inode) {
    
    		inode->i_ino = get_next_ino();
    
    		inode->i_mode = mode;
    
    		inode->i_uid = current_fsuid();
    		inode->i_gid = current_fsgid();
    
    		inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
    		inode->i_mapping->backing_dev_info = &cgroup_backing_dev_info;
    	}
    	return inode;
    }
    
    static void cgroup_diput(struct dentry *dentry, struct inode *inode)
    {
    	/* is dentry a directory ? if so, kfree() associated cgroup */
    	if (S_ISDIR(inode->i_mode)) {
    
    		struct cgroup *cgrp = dentry->d_fsdata;
    
    		struct cgroup_subsys *ss;
    
    		BUG_ON(!(cgroup_is_removed(cgrp)));
    
    		/* It's possible for external users to be holding css
    		 * reference counts on a cgroup; css_put() needs to
    		 * be able to access the cgroup after decrementing
    		 * the reference count in order to know if it needs to
    		 * queue the cgroup to be handled by the release
    		 * agent */
    		synchronize_rcu();
    
    
    		mutex_lock(&cgroup_mutex);
    		/*
    		 * Release the subsystem state objects.
    		 */
    
    		for_each_subsys(cgrp->root, ss)
    
    			ss->destroy(cgrp);
    
    
    		cgrp->root->number_of_cgroups--;
    		mutex_unlock(&cgroup_mutex);
    
    
		/*
		 * Drop the active superblock reference that we took when we
		 * created the cgroup
		 */
    		deactivate_super(cgrp->root->sb);
    
    		/*
    		 * if we're getting rid of the cgroup, refcount should ensure
    		 * that there are no pidlists left.
    		 */
    		BUG_ON(!list_empty(&cgrp->pidlists));
    
    
    		simple_xattrs_free(&cgrp->xattrs);
    
    
    	} else {
    		struct cfent *cfe = __d_cfe(dentry);
    		struct cgroup *cgrp = dentry->d_parent->d_fsdata;
    
    		struct cftype *cft = cfe->type;
    
    
    		WARN_ONCE(!list_empty(&cfe->node) &&
    			  cgrp != &cgrp->root->top_cgroup,
    			  "cfe still linked for %s\n", cfe->type->name);
    		kfree(cfe);
    
		simple_xattrs_free(&cft->xattrs);
	}
	iput(inode);
}

    static int cgroup_delete(const struct dentry *d)
    {
    	return 1;
    }
    
    
    static void remove_dir(struct dentry *d)
    {
    	struct dentry *parent = dget(d->d_parent);
    
    	d_delete(d);
    	simple_rmdir(parent->d_inode, d);
    	dput(parent);
    }
    
    
    static int cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)
    {
    	struct cfent *cfe;
    
    	lockdep_assert_held(&cgrp->dentry->d_inode->i_mutex);
    	lockdep_assert_held(&cgroup_mutex);
    
    	list_for_each_entry(cfe, &cgrp->files, node) {
    		struct dentry *d = cfe->dentry;
    
    		if (cft && cfe->type != cft)
    			continue;
    
    		dget(d);
    		d_delete(d);
    
    		simple_unlink(cgrp->dentry->d_inode, d);
    
    		list_del_init(&cfe->node);
    		dput(d);
    
		return 0;
	}
    	return -ENOENT;
    }
    
    
    /**
     * cgroup_clear_directory - selective removal of base and subsystem files
     * @dir: directory containing the files
     * @base_files: true if the base files should be removed
     * @subsys_mask: mask of the subsystem ids whose files should be removed
     */
    static void cgroup_clear_directory(struct dentry *dir, bool base_files,
    				   unsigned long subsys_mask)
    
    {
    	struct cgroup *cgrp = __d_cgrp(dir);
    
    	struct cgroup_subsys *ss;
    
    	for_each_subsys(cgrp->root, ss) {
    		struct cftype_set *set;
    		if (!test_bit(ss->subsys_id, &subsys_mask))
    			continue;
    		list_for_each_entry(set, &ss->cftsets, node)
    			cgroup_rm_file(cgrp, set->cfts);
    	}
    	if (base_files) {
    		while (!list_empty(&cgrp->files))
    			cgroup_rm_file(cgrp, NULL);
    	}
    
    }
    
    /*
     * NOTE : the dentry must have been dget()'ed
     */
    static void cgroup_d_remove_dir(struct dentry *dentry)
    {
    
    	struct dentry *parent;
    
    	struct cgroupfs_root *root = dentry->d_sb->s_fs_info;
    
    	cgroup_clear_directory(dentry, true, root->subsys_mask);
    
    	parent = dentry->d_parent;
    	spin_lock(&parent->d_lock);
    
    	spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
    
    	list_del_init(&dentry->d_u.d_child);
    
    	spin_unlock(&dentry->d_lock);
	spin_unlock(&parent->d_lock);
	remove_dir(dentry);
}

    /*
    
     * Call with cgroup_mutex held. Drops reference counts on modules, including
     * any duplicate ones that parse_cgroupfs_options took. If this function
 * returns an error, no reference counts are touched.
     */
    
static int rebind_subsystems(struct cgroupfs_root *root,
			     unsigned long final_subsys_mask)