Skip to content
Snippets Groups Projects
namei.c 82.4 KiB
Newer Older
  • Learn to ignore specific revisions
  • Linus Torvalds's avatar
    Linus Torvalds committed
    /*
     *  linux/fs/namei.c
     *
     *  Copyright (C) 1991, 1992  Linus Torvalds
     */
    
    /*
     * Some corrections by tytso.
     */
    
    /* [Feb 1997 T. Schoebel-Theuer] Complete rewrite of the pathname
     * lookup logic.
     */
    /* [Feb-Apr 2000, AV] Rewrite to the new namespace architecture.
     */
    
    #include <linux/init.h>
    #include <linux/module.h>
    #include <linux/slab.h>
    #include <linux/fs.h>
    #include <linux/namei.h>
    #include <linux/pagemap.h>
    
    Robert Love's avatar
    Robert Love committed
    #include <linux/fsnotify.h>
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    #include <linux/personality.h>
    #include <linux/security.h>
    
    Mimi Zohar's avatar
    Mimi Zohar committed
    #include <linux/ima.h>
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    #include <linux/syscalls.h>
    #include <linux/mount.h>
    #include <linux/audit.h>
    
    #include <linux/capability.h>
    
    #include <linux/fcntl.h>
    
    #include <linux/device_cgroup.h>
    
    #include <linux/fs_struct.h>
    
    #include <linux/posix_acl.h>
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    #include <asm/uaccess.h>
    
    
    #include "internal.h"
    
    #include "mount.h"
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    /* [Feb-1997 T. Schoebel-Theuer]
     * Fundamental changes in the pathname lookup mechanisms (namei)
     * were necessary because of omirr.  The reason is that omirr needs
     * to know the _real_ pathname, not the user-supplied one, in case
     * of symlinks (and also when transname replacements occur).
     *
     * The new code replaces the old recursive symlink resolution with
     * an iterative one (in case of non-nested symlink chains).  It does
     * this with calls to <fs>_follow_link().
     * As a side effect, dir_namei(), _namei() and follow_link() are now 
     * replaced with a single function lookup_dentry() that can handle all 
     * the special cases of the former code.
     *
     * With the new dcache, the pathname is stored at each inode, at least as
     * long as the refcount of the inode is positive.  As a side effect, the
     * size of the dcache depends on the inode cache and thus is dynamic.
     *
     * [29-Apr-1998 C. Scott Ananian] Updated above description of symlink
     * resolution to correspond with current state of the code.
     *
     * Note that the symlink resolution is not *completely* iterative.
     * There is still a significant amount of tail- and mid- recursion in
     * the algorithm.  Also, note that <fs>_readlink() is not used in
     * lookup_dentry(): lookup_dentry() on the result of <fs>_readlink()
     * may return different results than <fs>_follow_link().  Many virtual
     * filesystems (including /proc) exhibit this behavior.
     */
    
    /* [24-Feb-97 T. Schoebel-Theuer] Side effects caused by new implementation:
     * New symlink semantics: when open() is called with flags O_CREAT | O_EXCL
     * and the name already exists in form of a symlink, try to create the new
     * name indicated by the symlink. The old code always complained that the
     * name already exists, due to not following the symlink even if its target
     * is nonexistent.  The new semantics also affect mknod() and link() when
     * the name is a symlink pointing to a non-existent name.
     *
     * I don't know which semantics is the right one, since I have no access
     * to standards. But I found by trial that HP-UX 9.0 has the full "new"
     * semantics implemented, while SunOS 4.1.1 and Solaris (SunOS 5.4) have the
     * "old" one. Personally, I think the new semantics is much more logical.
     * Note that "ln old new" where "new" is a symlink pointing to a non-existing
     * file does succeed in both HP-UX and SunOs, but not in Solaris
     * and in the old Linux semantics.
     */
    
    /* [16-Dec-97 Kevin Buhr] For security reasons, we change some symlink
     * semantics.  See the comments in "open_namei" and "do_link" below.
     *
     * [10-Sep-98 Alan Modra] Another symlink change.
     */
    
    /* [Feb-Apr 2000 AV] Complete rewrite. Rules for symlinks:
     *	inside the path - always follow.
     *	in the last component in creation/removal/renaming - never follow.
     *	if LOOKUP_FOLLOW passed - follow.
     *	if the pathname has trailing slashes - follow.
     *	otherwise - don't follow.
     * (applied in that order).
     *
     * [Jun 2000 AV] Inconsistent behaviour of open() in case if flags==O_CREAT
     * restored for 2.4. This is the last surviving part of old 4.2BSD bug.
     * During the 2.4 we need to fix the userland stuff depending on it -
     * hopefully we will be able to get rid of that wart in 2.5. So far only
     * XEmacs seems to be relying on it...
     */
    /*
     * [Sep 2001 AV] Single-semaphore locking scheme (kudos to David Holland)
     * implemented.  Let's see if raised priority of ->s_vfs_rename_mutex gives
     * any extra contention...
     */
    
    /* In order to reduce some races, while at the same time doing additional
     * checking and hopefully speeding things up, we copy filenames to the
     * kernel data space before using them..
     *
     * POSIX.1 2.4: an empty pathname is invalid (ENOENT).
     * PATH_MAX includes the nul terminator --RR.
     */
    
    /*
     * Copy the user-supplied pathname @filename into the kernel buffer @page.
     *
     * Returns 0 on success, -EFAULT when @filename is at or above TASK_SIZE
     * (only checked when the address limit is not KERNEL_DS), -ENOENT when
     * nothing was copied (empty name), -ENAMETOOLONG when the copy used up
     * the whole length limit, or the negative value handed back by
     * strncpy_from_user().
     */
    static int do_getname(const char __user *filename, char *page)
    {
    	unsigned long limit = PATH_MAX;
    	int copied;

    	if (!segment_eq(get_fs(), KERNEL_DS)) {
    		unsigned long start = (unsigned long) filename;

    		if (start >= TASK_SIZE)
    			return -EFAULT;
    		/* Clamp the copy so it cannot run past TASK_SIZE. */
    		if (TASK_SIZE - start < PATH_MAX)
    			limit = TASK_SIZE - start;
    	}

    	copied = strncpy_from_user(page, filename, limit);
    	if (copied < 0)
    		return copied;
    	if (copied == 0)
    		return -ENOENT;

    	/* A copy that filled the whole limit left no room for the terminator. */
    	return copied < limit ? 0 : -ENAMETOOLONG;
    }
    
    
    /*
     * Allocate a name buffer with __getname() and fill it from user space.
     *
     * @filename: user-space pathname
     * @flags:    lookup flags; with LOOKUP_EMPTY an -ENOENT result from
     *            do_getname() is tolerated and the buffer is returned anyway
     * @empty:    if non-NULL, set to 1 when do_getname() reported -ENOENT
     *
     * Returns the buffer, or an ERR_PTR() (-ENOMEM when no buffer could be
     * allocated, otherwise the do_getname() error).  The result, error or
     * not, is always passed to audit_getname() before returning.
     */
    static char *getname_flags(const char __user *filename, int flags, int *empty)
    {
    	char *buf, *result;
    	int err;

    	buf = __getname();
    	if (!buf) {
    		result = ERR_PTR(-ENOMEM);
    		goto out;
    	}

    	result = buf;
    	err = do_getname(filename, buf);
    	if (err >= 0)
    		goto out;

    	if (err == -ENOENT) {
    		if (empty)
    			*empty = 1;
    		/* Caller explicitly allows an empty name: keep the buffer. */
    		if (flags & LOOKUP_EMPTY)
    			goto out;
    	}

    	__putname(buf);
    	result = ERR_PTR(err);
    out:
    	audit_getname(result);
    	return result;
    }
    
    
    Al Viro's avatar
    Al Viro committed
    char *getname(const char __user * filename)
    {
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    #ifdef CONFIG_AUDITSYSCALL
    /*
     * Release a pathname buffer.  With a dummy audit context the buffer is
     * released through __putname(); otherwise it is handed to audit_putname().
     */
    void putname(const char *name)
    {
    	if (likely(audit_dummy_context())) {
    		__putname(name);
    		return;
    	}
    	audit_putname(name);
    }
    EXPORT_SYMBOL(putname);
    #endif
    
    
    /*
     * Check @mask against the cached/fetched POSIX access ACL of @inode.
     *
     * Returns the posix_acl_permission() verdict when an ACL exists,
     * -EAGAIN when there is no ACL to consult (the caller then falls back to
     * the mode bits), -ECHILD when called with MAY_NOT_BLOCK and the ACL is
     * not cached, or the error returned by ->get_acl().
     */
    static int check_acl(struct inode *inode, int mask)
    {
    	struct posix_acl *acl;

    	if (mask & MAY_NOT_BLOCK) {
    		acl = get_cached_acl_rcu(inode, ACL_TYPE_ACCESS);
    		/* Cached "no ACL": same answer as the blocking path below. */
    		if (!acl)
    			return -EAGAIN;
    		/* no ->get_acl() calls in RCU mode... */
    		if (acl == ACL_NOT_CACHED)
    			return -ECHILD;
    		return posix_acl_permission(inode, acl, mask & ~MAY_NOT_BLOCK);
    	}

    	acl = get_cached_acl(inode, ACL_TYPE_ACCESS);

    	/*
    	 * A filesystem can force a ACL callback by just never filling the
    	 * ACL cache. But normally you'd fill the cache either at inode
    	 * instantiation time, or on the first ->get_acl call.
    	 *
    	 * If the filesystem doesn't have a get_acl() function at all, we'll
    	 * just create the negative cache entry.
    	 */
    	if (acl == ACL_NOT_CACHED) {
    		if (inode->i_op->get_acl) {
    			acl = inode->i_op->get_acl(inode, ACL_TYPE_ACCESS);
    			if (IS_ERR(acl))
    				return PTR_ERR(acl);
    		} else {
    			set_cached_acl(inode, ACL_TYPE_ACCESS, NULL);
    			return -EAGAIN;
    		}
    	}

    	if (acl) {
    		int error = posix_acl_permission(inode, acl, mask);
    		posix_acl_release(acl);
    		return error;
    	}

    	/* No ACL on this inode. */
    	return -EAGAIN;
    }
    
    /*
     * This does the basic permission checking
     */
    
    static int acl_permission_check(struct inode *inode, int mask)
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    {
    
    	unsigned int mode = inode->i_mode;
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    
    
    	if (current_user_ns() != inode_userns(inode))
    		goto other_perms;
    
    
    	if (likely(current_fsuid() == inode->i_uid))
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    		mode >>= 6;
    	else {
    
    		if (IS_POSIXACL(inode) && (mode & S_IRWXG)) {
    
    			int error = check_acl(inode, mask);
    
    			if (error != -EAGAIN)
    				return error;
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    		}
    
    		if (in_group_p(inode->i_gid))
    			mode >>= 3;
    	}
    
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    	/*
    	 * If the DACs are ok we don't need any capability check.
    	 */
    
    	if ((mask & ~mode & (MAY_READ | MAY_WRITE | MAY_EXEC)) == 0)
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    		return 0;
    
    /**
     * generic_permission -  check for access rights on a Posix-like filesystem
     * @inode:	inode to check access rights for
     * @mask:	right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC, ...)
     *
     * Used to check for read/write/execute permissions on a file.
     * We use "fsuid" for this, letting us set arbitrary permissions
     * for filesystem access without changing the "normal" uids which
     * are used for other things.
     *
     * generic_permission is rcu-walk aware. It returns -ECHILD in case an rcu-walk
     * request cannot be satisfied (eg. requires blocking or too much complexity).
     * It would then be called again in ref-walk mode.
     */
    
    int generic_permission(struct inode *inode, int mask)
    
    	 * Do the basic permission checks.
    
    	ret = acl_permission_check(inode, mask);
    
    	if (ret != -EACCES)
    		return ret;
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    
    
    	if (S_ISDIR(inode->i_mode)) {
    		/* DACs are overridable for directories */
    		if (ns_capable(inode_userns(inode), CAP_DAC_OVERRIDE))
    			return 0;
    		if (!(mask & MAY_WRITE))
    			if (ns_capable(inode_userns(inode), CAP_DAC_READ_SEARCH))
    				return 0;
    		return -EACCES;
    	}
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    	/*
    	 * Read/write DACs are always overridable.
    
    	 * Executable DACs are overridable when there is
    	 * at least one exec bit set.
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    	 */
    
    	if (!(mask & MAY_EXEC) || (inode->i_mode & S_IXUGO))
    
    		if (ns_capable(inode_userns(inode), CAP_DAC_OVERRIDE))
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    			return 0;
    
    	/*
    	 * Searching includes executable on directories, else just read.
    	 */
    
    	mask &= MAY_READ | MAY_WRITE | MAY_EXEC;
    
    		if (ns_capable(inode_userns(inode), CAP_DAC_READ_SEARCH))
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    			return 0;
    
    	return -EACCES;
    }
    
    
    /*
     * We _really_ want to just do "generic_permission()" without
     * even looking at the inode->i_op values. So we keep a cache
     * flag in inode->i_opflags, that says "this has not special
     * permission function, use the fast case".
     */
    static inline int do_inode_permission(struct inode *inode, int mask)
    {
    	/* Fast case: already known to have no ->permission() method. */
    	if (likely(inode->i_opflags & IOP_FASTPERM))
    		return generic_permission(inode, mask);

    	if (likely(inode->i_op->permission))
    		return inode->i_op->permission(inode, mask);

    	/* This gets set once for the inode lifetime */
    	spin_lock(&inode->i_lock);
    	inode->i_opflags |= IOP_FASTPERM;
    	spin_unlock(&inode->i_lock);

    	return generic_permission(inode, mask);
    }
    
    
    Christoph Hellwig's avatar
    Christoph Hellwig committed
    /**
     * inode_permission  -  check for access rights to a given inode
     * @inode:	inode to check permission on
     * @mask:	right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC, ...)
     *
     * Used to check for read/write/execute permissions on an inode.
     * We use "fsuid" for this, letting us set arbitrary permissions
     * for filesystem access without changing the "normal" uids which
     * are used for other things.
     *
     * When checking for MAY_APPEND, MAY_WRITE must also be set in @mask.
     */
    
    int inode_permission(struct inode *inode, int mask)
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    {
    
    	int retval;
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    
    
    	if (unlikely(mask & MAY_WRITE)) {
    
    		umode_t mode = inode->i_mode;
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    
    		/*
    		 * Nobody gets write access to a read-only fs.
    		 */
    		if (IS_RDONLY(inode) &&
    		    (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)))
    			return -EROFS;
    
    		/*
    		 * Nobody gets write access to an immutable file.
    		 */
    		if (IS_IMMUTABLE(inode))
    			return -EACCES;
    	}
    
    
    	retval = do_inode_permission(inode, mask);
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    	if (retval)
    		return retval;
    
    
    	retval = devcgroup_inode_permission(inode, mask);
    	if (retval)
    		return retval;
    
    
    	return security_inode_permission(inode, mask);
    
    Jan Blunck's avatar
    Jan Blunck committed
    /**
     * path_get - get a reference to a path
     * @path: path to get the reference to
     *
     * Given a path increment the reference count to the dentry and the vfsmount.
     */
    void path_get(struct path *path)
    {
    	struct vfsmount *mnt = path->mnt;
    	struct dentry *dentry = path->dentry;

    	/* Take the mount reference first, then the dentry reference. */
    	mntget(mnt);
    	dget(dentry);
    }
    EXPORT_SYMBOL(path_get);
    
    
    Jan Blunck's avatar
    Jan Blunck committed
    /**
     * path_put - put a reference to a path
     * @path: path to put the reference to
     *
     * Given a path decrement the reference count to the dentry and the vfsmount.
     */
    void path_put(struct path *path)
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    {
    
    Jan Blunck's avatar
    Jan Blunck committed
    	dput(path->dentry);
    	mntput(path->mnt);
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    }
    
    Jan Blunck's avatar
    Jan Blunck committed
    EXPORT_SYMBOL(path_put);
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    
    
    /*
     * Path walking has 2 modes, rcu-walk and ref-walk (see
     * Documentation/filesystems/path-lookup.txt).  In situations when we can't
     * continue in RCU mode, we attempt to drop out of rcu-walk mode and grab
     * normal reference counts on dentries and vfsmounts to transition to ref-walk
     * mode.  Refcounts are grabbed at the last known good point before rcu-walk
     * got stuck, so ref-walk may continue from there. If this is not successful
     * (eg. a seqcount has changed), then failure is returned and it's up to caller
     * to restart the path walk from the beginning in ref-walk mode.
     */
    
    /**
     * unlazy_walk - try to switch to ref-walk mode.
     * @nd: nameidata pathwalk data
     * @dentry: child of nd->path.dentry or NULL
     * Returns: 0 on success, -ECHILD on failure
     *
     * unlazy_walk attempts to legitimize the current nd->path, nd->root and dentry
     * for ref-walk mode.  @dentry must be a path found by a do_lookup call on
     * @nd or NULL.  Must be called from rcu-walk context.
     */
    
    static int unlazy_walk(struct nameidata *nd, struct dentry *dentry)
    {
    	struct fs_struct *fs = current->fs;
    	struct dentry *parent = nd->path.dentry;
    	int want_root = 0;

    	BUG_ON(!(nd->flags & LOOKUP_RCU));

    	/*
    	 * If nd->root was filled in from the task's fs_struct, it can only
    	 * be pinned while it still matches fs->root under fs->lock.
    	 */
    	if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT)) {
    		want_root = 1;
    		spin_lock(&fs->lock);
    		if (nd->root.mnt != fs->root.mnt ||
    				nd->root.dentry != fs->root.dentry)
    			goto err_root;
    	}
    	spin_lock(&parent->d_lock);
    	if (!dentry) {
    		if (!__d_rcu_to_refcount(parent, nd->seq))
    			goto err_parent;
    		BUG_ON(nd->inode != parent->d_inode);
    	} else {
    		if (dentry->d_parent != parent)
    			goto err_parent;
    		spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
    		if (!__d_rcu_to_refcount(dentry, nd->seq))
    			goto err_child;
    		/*
    		 * If the sequence check on the child dentry passed, then
    		 * the child has not been removed from its parent. This
    		 * means the parent dentry must be valid and able to take
    		 * a reference at this point.
    		 */
    		BUG_ON(!IS_ROOT(dentry) && dentry->d_parent != parent);
    		BUG_ON(!parent->d_count);
    		parent->d_count++;
    		spin_unlock(&dentry->d_lock);
    	}
    	spin_unlock(&parent->d_lock);
    	if (want_root) {
    		path_get(&nd->root);
    		spin_unlock(&fs->lock);
    	}
    	mntget(nd->path.mnt);

    	rcu_read_unlock();
    	br_read_unlock(vfsmount_lock);
    	nd->flags &= ~LOOKUP_RCU;
    	return 0;

    	/* Error unwinding: release the locks in reverse order of acquisition. */
    err_child:
    	spin_unlock(&dentry->d_lock);
    err_parent:
    	spin_unlock(&parent->d_lock);
    err_root:
    	if (want_root)
    		spin_unlock(&fs->lock);
    	return -ECHILD;
    }
    
    
    /**
     * release_open_intent - free up open intent resources
     * @nd: pointer to nameidata
     */
    void release_open_intent(struct nameidata *nd)
    {
    	struct file *file = nd->intent.open.file;

    	if (file && !IS_ERR(file)) {
    		/* A file with no dentry attached is released via put_filp(). */
    		if (file->f_path.dentry == NULL)
    			put_filp(file);
    		else
    			fput(file);
    	}
    }
    
    Al Viro's avatar
    Al Viro committed
    static inline int d_revalidate(struct dentry *dentry, struct nameidata *nd)
    
    Al Viro's avatar
    Al Viro committed
    	return dentry->d_op->d_revalidate(dentry, nd);
    
    /**
     * complete_walk - successful completion of path walk
     * @nd:  pointer nameidata
    
     * If we had been in RCU mode, drop out of it and legitimize nd->path.
     * Revalidate the final result, unless we'd already done that during
     * the path walk or the filesystem doesn't ask for it.  Return 0 on
     * success, -error on failure.  In case of failure caller does not
     * need to drop nd->path.
    
    static int complete_walk(struct nameidata *nd)
    
    	struct dentry *dentry = nd->path.dentry;
    
    	if (nd->flags & LOOKUP_RCU) {
    		nd->flags &= ~LOOKUP_RCU;
    		if (!(nd->flags & LOOKUP_ROOT))
    			nd->root.mnt = NULL;
    		spin_lock(&dentry->d_lock);
    		if (unlikely(!__d_rcu_to_refcount(dentry, nd->seq))) {
    			spin_unlock(&dentry->d_lock);
    			rcu_read_unlock();
    			br_read_unlock(vfsmount_lock);
    			return -ECHILD;
    		}
    		BUG_ON(nd->inode != dentry->d_inode);
    		spin_unlock(&dentry->d_lock);
    		mntget(nd->path.mnt);
    		rcu_read_unlock();
    		br_read_unlock(vfsmount_lock);
    	}
    
    
    	if (likely(!(nd->flags & LOOKUP_JUMPED)))
    		return 0;
    
    	if (likely(!(dentry->d_flags & DCACHE_OP_REVALIDATE)))
    
    	if (likely(!(dentry->d_sb->s_type->fs_flags & FS_REVAL_DOT)))
    		return 0;
    
    	/* Note: we do not d_invalidate() */
    
    	status = d_revalidate(dentry, nd);
    
    	if (!status)
    
    	path_put(&nd->path);
    
    Al Viro's avatar
    Al Viro committed
    /*
     * Fill in nd->root from the current task's fs_struct via get_fs_root(),
     * unless nd->root.mnt is already set.
     */
    static __always_inline void set_root(struct nameidata *nd)
    {
    	if (!nd->root.mnt)
    		get_fs_root(current->fs, &nd->root);
    }
    
    static int link_path_walk(const char *, struct nameidata *);
    
    
    /*
     * Variant of set_root() that copies fs->root into nd->root without
     * calling get_fs_root().  The copy is retried until fs->seq is stable,
     * and nd->seq is sampled from the root dentry's d_seq.
     */
    static __always_inline void set_root_rcu(struct nameidata *nd)
    {
    	if (!nd->root.mnt) {
    		struct fs_struct *fs = current->fs;
    		unsigned seq;

    		do {
    			seq = read_seqcount_begin(&fs->seq);
    			nd->root = fs->root;
    			nd->seq = __read_seqcount_begin(&nd->root.dentry->d_seq);
    		} while (read_seqcount_retry(&fs->seq, seq));
    	}
    }
    
    static __always_inline int __vfs_follow_link(struct nameidata *nd, const char *link)
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    {
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    	if (IS_ERR(link))
    		goto fail;
    
    	if (*link == '/') {
    
    Al Viro's avatar
    Al Viro committed
    		set_root(nd);
    
    Jan Blunck's avatar
    Jan Blunck committed
    		path_put(&nd->path);
    
    Al Viro's avatar
    Al Viro committed
    		nd->path = nd->root;
    		path_get(&nd->root);
    
    		nd->flags |= LOOKUP_JUMPED;
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    	}
    
    	nd->inode = nd->path.dentry->d_inode;
    
    	ret = link_path_walk(link, nd);
    	return ret;
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    fail:
    
    Jan Blunck's avatar
    Jan Blunck committed
    	path_put(&nd->path);
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    	return PTR_ERR(link);
    }
    
    
    Jan Blunck's avatar
    Jan Blunck committed
    static void path_put_conditional(struct path *path, struct nameidata *nd)
    
    static inline void path_to_nameidata(const struct path *path,
    					struct nameidata *nd)
    
    	if (!(nd->flags & LOOKUP_RCU)) {
    		dput(nd->path.dentry);
    		if (nd->path.mnt != path->mnt)
    			mntput(nd->path.mnt);
    
    	nd->path.mnt = path->mnt;
    
    	nd->path.dentry = path->dentry;
    
    static inline void put_link(struct nameidata *nd, struct path *link, void *cookie)
    {
    	struct inode *inode = link->dentry->d_inode;
    	if (!IS_ERR(cookie) && inode->i_op->put_link)
    		inode->i_op->put_link(link->dentry, nd, cookie);
    	path_put(link);
    }
    
    
    Al Viro's avatar
    Al Viro committed
    static __always_inline int
    
    follow_link(struct path *link, struct nameidata *nd, void **p)
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    {
    	int error;
    
    	struct dentry *dentry = link->dentry;
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    
    
    	BUG_ON(nd->flags & LOOKUP_RCU);
    
    
    Al Viro's avatar
    Al Viro committed
    	if (link->mnt == nd->path.mnt)
    		mntget(link->mnt);
    
    
    	if (unlikely(current->total_link_count >= 40)) {
    		*p = ERR_PTR(-ELOOP); /* no ->put_link(), please */
    		path_put(&nd->path);
    		return -ELOOP;
    	}
    	cond_resched();
    	current->total_link_count++;
    
    
    	touch_atime(link->mnt, dentry);
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    	nd_set_link(nd, NULL);
    
    	error = security_inode_follow_link(link->dentry, nd);
    	if (error) {
    		*p = ERR_PTR(error); /* no ->put_link(), please */
    		path_put(&nd->path);
    		return error;
    	}
    
    
    	nd->last_type = LAST_BIND;
    
    Al Viro's avatar
    Al Viro committed
    	*p = dentry->d_inode->i_op->follow_link(dentry, nd);
    	error = PTR_ERR(*p);
    	if (!IS_ERR(*p)) {
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    		char *s = nd_get_link(nd);
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    		if (s)
    			error = __vfs_follow_link(nd, s);
    
    Al Viro's avatar
    Al Viro committed
    		else if (nd->last_type == LAST_BIND) {
    
    			nd->flags |= LOOKUP_JUMPED;
    
    			nd->inode = nd->path.dentry->d_inode;
    			if (nd->inode->i_op->follow_link) {
    
    Al Viro's avatar
    Al Viro committed
    				/* stepped on a _really_ weird one */
    				path_put(&nd->path);
    				error = -ELOOP;
    			}
    		}
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    	}
    	return error;
    }
    
    
    static int follow_up_rcu(struct path *path)
    {
    	struct vfsmount *parent;
    	struct dentry *mountpoint;
    
    	parent = path->mnt->mnt_parent;
    	if (parent == path->mnt)
    		return 0;
    	mountpoint = path->mnt->mnt_mountpoint;
    	path->dentry = mountpoint;
    	path->mnt = parent;
    	return 1;
    }
    
    
    int follow_up(struct path *path)
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    {
    	struct vfsmount *parent;
    	struct dentry *mountpoint;
    
    
    	br_read_lock(vfsmount_lock);
    
    	parent = path->mnt->mnt_parent;
    	if (parent == path->mnt) {
    
    		br_read_unlock(vfsmount_lock);
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    		return 0;
    	}
    	mntget(parent);
    
    	mountpoint = dget(path->mnt->mnt_mountpoint);
    
    	br_read_unlock(vfsmount_lock);
    
    	dput(path->dentry);
    	path->dentry = mountpoint;
    	mntput(path->mnt);
    	path->mnt = parent;
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    	return 1;
    }
    
    
    /*
     * Perform an automount
     * - return -EISDIR to tell follow_managed() to stop and return the path we
     *   were called with.
     */
    
    static int follow_automount(struct path *path, unsigned flags,
    			    bool *need_mntput)
    
    
    	if (!path->dentry->d_op || !path->dentry->d_op->d_automount)
    		return -EREMOTE;
    
    
    	/* We don't want to mount if someone's just doing a stat -
    	 * unless they're stat'ing a directory and appended a '/' to
    	 * the name.
    	 *
    	 * We do, however, want to mount if someone wants to open or
    	 * create a file of any type under the mountpoint, wants to
    	 * traverse through the mountpoint or wants to open the
    	 * mounted directory.  Also, autofs may mark negative dentries
    	 * as being automount points.  These will need the attentions
    	 * of the daemon to instantiate them before they can be used.
    
    	if (!(flags & (LOOKUP_PARENT | LOOKUP_DIRECTORY |
    
    		     LOOKUP_OPEN | LOOKUP_CREATE | LOOKUP_AUTOMOUNT)) &&
    
    	    path->dentry->d_inode)
    		return -EISDIR;
    
    
    	current->total_link_count++;
    	if (current->total_link_count >= 40)
    		return -ELOOP;
    
    	mnt = path->dentry->d_op->d_automount(path);
    	if (IS_ERR(mnt)) {
    		/*
    		 * The filesystem is allowed to return -EISDIR here to indicate
    		 * it doesn't want to automount.  For instance, autofs would do
    		 * this so that its userspace daemon can mount on this dentry.
    		 *
    		 * However, we can only permit this if it's a terminal point in
    		 * the path being looked up; if it wasn't then the remainder of
    		 * the path is inaccessible and we should say so.
    		 */
    
    Al Viro's avatar
    Al Viro committed
    		if (PTR_ERR(mnt) == -EISDIR && (flags & LOOKUP_PARENT))
    
    	if (!mnt) /* mount collision */
    		return 0;
    
    	if (!*need_mntput) {
    		/* lock_mount() may release path->mnt on error */
    		mntget(path->mnt);
    		*need_mntput = true;
    	}
    
    	err = finish_automount(mnt, path);
    
    	switch (err) {
    	case -EBUSY:
    		/* Someone else made a mount here whilst we were busy */
    
    		path->mnt = mnt;
    		path->dentry = dget(mnt->mnt_root);
    		return 0;
    
    	default:
    		return err;
    
    /*
     * Handle a dentry that is managed in some way.
    
     * - Flagged for transit management (autofs)
    
     * - Flagged as mountpoint
     * - Flagged as automount point
     *
     * This may only be called in refwalk mode.
     *
     * Serialization is taken care of in namespace.c
     */
    static int follow_managed(struct path *path, unsigned flags)
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    {
    
    	struct vfsmount *mnt = path->mnt; /* held by caller, must be left alone */
    
    	unsigned managed;
    	bool need_mntput = false;
    
    
    	/* Given that we're not holding a lock here, we retain the value in a
    	 * local variable for each dentry as we look at it so that we don't see
    	 * the components of that value change under us */
    	while (managed = ACCESS_ONCE(path->dentry->d_flags),
    	       managed &= DCACHE_MANAGED_DENTRY,
    	       unlikely(managed != 0)) {
    
    		/* Allow the filesystem to manage the transit without i_mutex
    		 * being held. */
    		if (managed & DCACHE_MANAGE_TRANSIT) {
    			BUG_ON(!path->dentry->d_op);
    			BUG_ON(!path->dentry->d_op->d_manage);
    
    			ret = path->dentry->d_op->d_manage(path->dentry, false);
    
    		/* Transit to a mounted filesystem. */
    		if (managed & DCACHE_MOUNTED) {
    			struct vfsmount *mounted = lookup_mnt(path);
    			if (mounted) {
    				dput(path->dentry);
    				if (need_mntput)
    					mntput(path->mnt);
    				path->mnt = mounted;
    				path->dentry = dget(mounted->mnt_root);
    				need_mntput = true;
    				continue;
    			}
    
    			/* Something is mounted on this dentry in another
    			 * namespace and/or whatever was mounted there in this
    			 * namespace got unmounted before we managed to get the
    			 * vfsmount_lock */
    		}
    
    		/* Handle an automount point */
    		if (managed & DCACHE_NEED_AUTOMOUNT) {
    			ret = follow_automount(path, flags, &need_mntput);
    			if (ret < 0)
    
    			continue;
    		}
    
    		/* We didn't change the current path point */
    		break;
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    	}
    
    
    	if (need_mntput && path->mnt == mnt)
    		mntput(path->mnt);
    	if (ret == -EISDIR)
    		ret = 0;
    
    	return ret < 0 ? ret : need_mntput;
    
    int follow_down_one(struct path *path)
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    {
    	struct vfsmount *mounted;
    
    
    Al Viro's avatar
    Al Viro committed
    	mounted = lookup_mnt(path);
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    	if (mounted) {
    
    Al Viro's avatar
    Al Viro committed
    		dput(path->dentry);
    		mntput(path->mnt);
    		path->mnt = mounted;
    		path->dentry = dget(mounted->mnt_root);
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    		return 1;
    	}
    	return 0;
    }
    
    
    static inline bool managed_dentry_might_block(struct dentry *dentry)
    {
    	return (dentry->d_flags & DCACHE_MANAGE_TRANSIT &&
    		dentry->d_op->d_manage(dentry, true) < 0);
    }
    
    
    /*
     * Try to skip to top of mountpoint pile in rcuwalk mode.  Fail if
     * we meet a managed dentry that would need blocking.
     */
    static bool __follow_mount_rcu(struct nameidata *nd, struct path *path,
    
    			       struct inode **inode)
    
    		struct mount *mounted;
    
    		/*
    		 * Don't forget we might have a non-mountpoint managed dentry
    		 * that wants to block transit.
    		 */
    
    		if (unlikely(managed_dentry_might_block(path->dentry)))
    
    		mounted = __lookup_mnt(path->mnt, path->dentry, 1);
    		if (!mounted)
    			break;
    
    		path->mnt = &mounted->mnt;
    		path->dentry = mounted->mnt.mnt_root;
    
    		nd->flags |= LOOKUP_JUMPED;
    
    		nd->seq = read_seqcount_begin(&path->dentry->d_seq);
    
    		/*
    		 * Update the inode too. We don't need to re-check the
    		 * dentry sequence number here after this d_inode read,
    		 * because a mount-point is always pinned.
    		 */
    		*inode = path->dentry->d_inode;
    
    static void follow_mount_rcu(struct nameidata *nd)
    
    	while (d_mountpoint(nd->path.dentry)) {
    
    		struct mount *mounted;
    
    		mounted = __lookup_mnt(nd->path.mnt, nd->path.dentry, 1);
    
    		nd->path.mnt = &mounted->mnt;
    		nd->path.dentry = mounted->mnt.mnt_root;
    
    		nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq);
    
    static int follow_dotdot_rcu(struct nameidata *nd)
    {
    	set_root_rcu(nd);
    
    
    		if (nd->path.dentry == nd->root.dentry &&
    		    nd->path.mnt == nd->root.mnt) {
    			break;
    		}
    		if (nd->path.dentry != nd->path.mnt->mnt_root) {
    			struct dentry *old = nd->path.dentry;
    			struct dentry *parent = old->d_parent;
    			unsigned seq;
    
    			seq = read_seqcount_begin(&parent->d_seq);
    			if (read_seqcount_retry(&old->d_seq, nd->seq))
    
    				goto failed;
    
    			nd->path.dentry = parent;
    			nd->seq = seq;
    			break;
    		}
    		if (!follow_up_rcu(&nd->path))
    			break;
    		nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq);
    	}
    
    	follow_mount_rcu(nd);
    	nd->inode = nd->path.dentry->d_inode;
    
    	return 0;
    
    
    failed:
    	nd->flags &= ~LOOKUP_RCU;
    
    	if (!(nd->flags & LOOKUP_ROOT))
    		nd->root.mnt = NULL;
    
    	rcu_read_unlock();
    	br_read_unlock(vfsmount_lock);
    	return -ECHILD;
    
    /*
     * Follow down to the covering mount currently visible to userspace.  At each
     * point, the filesystem owning that dentry may be queried as to whether the
     * caller is permitted to proceed or not.
     */
    
    int follow_down(struct path *path)
    
    {
    	unsigned managed;
    	int ret;
    
    	while (managed = ACCESS_ONCE(path->dentry->d_flags),
    	       unlikely(managed & DCACHE_MANAGED_DENTRY)) {
    		/* Allow the filesystem to manage the transit without i_mutex
    		 * being held.
    		 *
    		 * We indicate to the filesystem if someone is trying to mount
    		 * something here.  This gives autofs the chance to deny anyone
    		 * other than its daemon the right to mount on its
    		 * superstructure.
    		 *
    		 * The filesystem may sleep at this point.
    		 */
    		if (managed & DCACHE_MANAGE_TRANSIT) {
    			BUG_ON(!path->dentry->d_op);
    			BUG_ON(!path->dentry->d_op->d_manage);
    
    			ret = path->dentry->d_op->d_manage(
    
    				path->dentry, false);
    
    			if (ret < 0)
    				return ret == -EISDIR ? 0 : ret;
    		}
    
    		/* Transit to a mounted filesystem. */
    		if (managed & DCACHE_MOUNTED) {
    			struct vfsmount *mounted = lookup_mnt(path);