Newer
Older
/*
* linux/fs/namespace.c
*
* (C) Copyright Al Viro 2000, 2001
* Released under GPL v2.
*
* Based on code from fs/super.c, copyright Linus Torvalds and others.
* Heavily rewritten.
*/
#include <linux/syscalls.h>
#include <linux/mnt_namespace.h>
#include <linux/namei.h>
#include <linux/security.h>
#include <linux/acct.h> /* acct_auto_close_mnt */
#include <linux/ramfs.h> /* init_rootfs */
#include <linux/fs_struct.h> /* get_fs_root et.al. */
#include <linux/fsnotify.h> /* fsnotify_vfsmount_delete */
#include <linux/uaccess.h>
/*
 * Mount hash table geometry: HASH_SHIFT is log2 of the number of
 * struct list_head entries that fit in PAGE_SIZE bytes, and HASH_SIZE
 * is the resulting bucket count (always a power of two, so that
 * hash() can mask with HASH_SIZE - 1).
 */
#define HASH_SHIFT ilog2(PAGE_SIZE / sizeof(struct list_head))
#define HASH_SIZE (1UL << HASH_SHIFT)
/* IDA from which peer group IDs are handed out (see mnt_alloc_group_id()). */
static DEFINE_IDA(mnt_group_ida);
/* Lower bound passed to ida_get_new_above() when allocating a mount ID. */
static int mnt_id_start = 0;
/* Lower bound passed to ida_get_new_above() when allocating a group ID. */
static int mnt_group_start = 1;
/* Array of hash chains, indexed by hash(parent vfsmount, mountpoint dentry). */
static struct list_head *mount_hashtable __read_mostly;
/* Slab cache that struct mount objects are allocated from and freed to. */
static struct kmem_cache *mnt_cache __read_mostly;
/* Exported kobject; NOTE(review): presumably the /sys/fs anchor - confirm. */
struct kobject *fs_kobj;
EXPORT_SYMBOL_GPL(fs_kobj);
/*
* vfsmount lock may be taken for read to prevent changes to the
* vfsmount hash, ie. during mountpoint lookups or walking back
* up the tree.
*
* It should be taken for write in all cases where the vfsmount
* tree or hash is modified or when a vfsmount structure is modified.
*/
DEFINE_BRLOCK(vfsmount_lock);
static inline unsigned long hash(struct vfsmount *mnt, struct dentry *dentry)
{
unsigned long tmp = ((unsigned long)mnt / L1_CACHE_BYTES);
tmp += ((unsigned long)dentry / L1_CACHE_BYTES);
tmp = tmp + (tmp >> HASH_SHIFT);
return tmp & (HASH_SIZE - 1);
#define MNT_WRITER_UNDERFLOW_LIMIT -(1<<16)
/*
* allocation is serialized by namespace_sem, but we need the spinlock to
* serialize with freeing.
*/
static int mnt_alloc_id(struct mount *mnt)
{
int res;
retry:
ida_pre_get(&mnt_id_ida, GFP_KERNEL);
res = ida_get_new_above(&mnt_id_ida, mnt_id_start, &mnt->mnt_id);
if (res == -EAGAIN)
goto retry;
return res;
}
static void mnt_free_id(struct mount *mnt)
ida_remove(&mnt_id_ida, id);
if (mnt_id_start > id)
mnt_id_start = id;
/*
* Allocate a new peer group ID
*
* mnt_group_ida is protected by namespace_sem
*/
static int mnt_alloc_group_id(struct mount *mnt)
if (!ida_pre_get(&mnt_group_ida, GFP_KERNEL))
return -ENOMEM;
res = ida_get_new_above(&mnt_group_ida,
mnt_group_start,
}
/*
* Release a peer group ID
*/
void mnt_release_group_id(struct mount *mnt)
ida_remove(&mnt_group_ida, id);
if (mnt_group_start > id)
mnt_group_start = id;
/*
* vfsmount lock must be held for read
*/
static inline void mnt_add_count(struct mount *mnt, int n)
this_cpu_add(mnt->mnt_pcp->mnt_count, n);
preempt_enable();
#endif
}
/*
* vfsmount lock must be held for write
*/
unsigned int mnt_get_count(struct mount *mnt)
count += per_cpu_ptr(mnt->mnt_pcp, cpu)->mnt_count;
static struct mount *alloc_vfsmnt(const char *name)
struct mount *mnt = kmem_cache_zalloc(mnt_cache, GFP_KERNEL);
if (mnt) {
if (err)
goto out_free_cache;
if (name) {
mnt->mnt_devname = kstrdup(name, GFP_KERNEL);
if (!mnt->mnt_devname)
mnt->mnt_pcp = alloc_percpu(struct mnt_pcp);
if (!mnt->mnt_pcp)
this_cpu_add(mnt->mnt_pcp->mnt_count, 1);
mnt->mnt_count = 1;
mnt->mnt_writers = 0;
INIT_LIST_HEAD(&mnt->mnt_hash);
INIT_LIST_HEAD(&mnt->mnt_child);
INIT_LIST_HEAD(&mnt->mnt_mounts);
INIT_LIST_HEAD(&mnt->mnt_list);
INIT_LIST_HEAD(&mnt->mnt_expire);
INIT_LIST_HEAD(&mnt->mnt_share);
INIT_LIST_HEAD(&mnt->mnt_slave_list);
INIT_LIST_HEAD(&mnt->mnt_slave);
#ifdef CONFIG_FSNOTIFY
INIT_HLIST_HEAD(&mnt->mnt_fsnotify_marks);
/*
* Most r/o checks on a fs are for operations that take
* discrete amounts of time, like a write() or unlink().
* We must keep track of when those operations start
* (for permission checks) and when they end, so that
* we can determine when writes are able to occur to
* a filesystem.
*/
/*
* __mnt_is_readonly: check whether a mount is read-only
* @mnt: the mount to check for its write status
*
* This shouldn't be used directly outside of the VFS.
* It does not guarantee that the filesystem will stay
* r/w, just that it is right *now*. This can not and
* should not be used in place of IS_RDONLY(inode).
* mnt_want/drop_write() will _keep_ the filesystem
* r/w.
*/
int __mnt_is_readonly(struct vfsmount *mnt)
{
	/*
	 * Read-only when either the mount itself carries MNT_READONLY or
	 * its superblock carries MS_RDONLY.  The superblock is only looked
	 * at when the mount flag is clear.
	 */
	return (mnt->mnt_flags & MNT_READONLY) ||
	       (mnt->mnt_sb->s_flags & MS_RDONLY);
}
EXPORT_SYMBOL_GPL(__mnt_is_readonly);
static inline void mnt_inc_writers(struct mount *mnt)
this_cpu_inc(mnt->mnt_pcp->mnt_writers);
static inline void mnt_dec_writers(struct mount *mnt)
this_cpu_dec(mnt->mnt_pcp->mnt_writers);
static unsigned int mnt_get_writers(struct mount *mnt)
#ifdef CONFIG_SMP
unsigned int count = 0;
int cpu;
for_each_possible_cpu(cpu) {
count += per_cpu_ptr(mnt->mnt_pcp, cpu)->mnt_writers;
return count;
#else
return mnt->mnt_writers;
#endif
static int mnt_is_readonly(struct vfsmount *mnt)
{
	int readonly = 1;

	/* A pending read-only remount already counts as read-only. */
	if (!mnt->mnt_sb->s_readonly_remount) {
		/*
		 * Pairs with the ordering of the s_flags/s_readonly_remount
		 * stores in do_remount(): the flag must be read before the
		 * mount and superblock flags are.
		 */
		smp_rmb();
		readonly = __mnt_is_readonly(mnt);
	}

	return readonly;
}
/*
* Most r/o checks on a fs are for operations that take
* discrete amounts of time, like a write() or unlink().
* We must keep track of when those operations start
* (for permission checks) and when they end, so that
* we can determine when writes are able to occur to
* a filesystem.
*/
/**
* mnt_want_write - get write access to a mount
* @m: the mount on which to take a write
*
* This tells the low-level filesystem that a write is
* about to be performed to it, and makes sure that
* writes are allowed before returning success. When
* the write operation is finished, mnt_drop_write()
* must be called. This is effectively a refcount.
*/
int mnt_want_write(struct vfsmount *m)
struct mount *mnt = real_mount(m);
int ret = 0;
* The store to mnt_inc_writers must be visible before we pass
* MNT_WRITE_HOLD loop below, so that the slowpath can see our
* incremented count after it has set MNT_WRITE_HOLD.
*/
smp_mb();
while (mnt->mnt.mnt_flags & MNT_WRITE_HOLD)
cpu_relax();
/*
* After the slowpath clears MNT_WRITE_HOLD, mnt_is_readonly will
* be set to match its requirements. So we must not load that until
* MNT_WRITE_HOLD is cleared.
*/
smp_rmb();
ret = -EROFS;
}
}
EXPORT_SYMBOL_GPL(mnt_want_write);
/**
* mnt_clone_write - get write access to a mount
* @mnt: the mount on which to take a write
*
* This is effectively like mnt_want_write, except
* it must only be used to take an extra write reference
* on a mountpoint that we already know has a write reference
* on it. This allows some optimisation.
*
* After finished, mnt_drop_write must be called as usual to
* drop the reference.
*/
int mnt_clone_write(struct vfsmount *mnt)
{
	/*
	 * Even with an existing write reference the superblock may have
	 * gone read-only, so the check cannot be skipped.
	 */
	if (!__mnt_is_readonly(mnt)) {
		/* Bump the per-CPU writer count with preemption off. */
		preempt_disable();
		mnt_inc_writers(real_mount(mnt));
		preempt_enable();
		return 0;
	}

	return -EROFS;
}
EXPORT_SYMBOL_GPL(mnt_clone_write);
/**
* mnt_want_write_file - get write access to a file's mount
* @file: the file on whose mount to take a write
*
* This is like mnt_want_write, but it takes a file and can
* do some optimisations if the file is open for write already
*/
int mnt_want_write_file(struct file *file)
{
	struct vfsmount *mnt = file->f_path.mnt;
	struct inode *inode = file->f_dentry->d_inode;

	/*
	 * A file opened for write on a non-special inode takes the cheaper
	 * mnt_clone_write() path; every other case goes through the full
	 * mnt_want_write().
	 */
	if ((file->f_mode & FMODE_WRITE) && !special_file(inode->i_mode))
		return mnt_clone_write(mnt);

	return mnt_want_write(mnt);
}
EXPORT_SYMBOL_GPL(mnt_want_write_file);
/**
* mnt_drop_write - give up write access to a mount
* @mnt: the mount on which to give up write access
*
* Tells the low-level filesystem that we are done
* performing writes to it. Must be matched with
* mnt_want_write() call above.
*/
void mnt_drop_write(struct vfsmount *mnt)
{
	/*
	 * Drop the writer reference with preemption disabled, mirroring
	 * the increment in mnt_clone_write(): the counter is either a
	 * per-CPU slot or, on !CONFIG_SMP, a plain field that is updated
	 * with a non-atomic read-modify-write.
	 */
	preempt_disable();
	mnt_dec_writers(real_mount(mnt));
	preempt_enable();
}
EXPORT_SYMBOL_GPL(mnt_drop_write);
void mnt_drop_write_file(struct file *file)
{
	struct vfsmount *mnt = file->f_path.mnt;

	/* Give back the write access held on the mount that backs @file. */
	mnt_drop_write(mnt);
}
EXPORT_SYMBOL(mnt_drop_write_file);
static int mnt_make_readonly(struct mount *mnt)
int ret = 0;
mnt->mnt.mnt_flags |= MNT_WRITE_HOLD;
* After storing MNT_WRITE_HOLD, we'll read the counters. This store
* should be visible before we do.
* With writers on hold, if this value is zero, then there are
* definitely no active writers (although held writers may subsequently
* increment the count, they'll have to wait, and decrement it after
* seeing MNT_READONLY).
*
* It is OK to have counter incremented on one CPU and decremented on
* another: the sum will add up correctly. The danger would be when we
* sum up each counter, if we read a counter before it is incremented,
* but then read another CPU's count which it has been subsequently
* decremented from -- we would see more decrements than we should.
* MNT_WRITE_HOLD protects against this scenario, because
* mnt_want_write first increments count, then smp_mb, then spins on
* MNT_WRITE_HOLD, so it can't be decremented by another CPU while
* we're counting up here.
mnt->mnt.mnt_flags |= MNT_READONLY;
/*
* MNT_READONLY must become visible before ~MNT_WRITE_HOLD, so writers
* that become unheld will see MNT_READONLY.
*/
smp_wmb();
mnt->mnt.mnt_flags &= ~MNT_WRITE_HOLD;
static void __mnt_unmake_readonly(struct mount *mnt)
mnt->mnt.mnt_flags &= ~MNT_READONLY;
int sb_prepare_remount_readonly(struct super_block *sb)
{
struct mount *mnt;
int err = 0;
/* Racy optimization. Recheck the counter under MNT_WRITE_HOLD */
if (atomic_long_read(&sb->s_remove_count))
return -EBUSY;
list_for_each_entry(mnt, &sb->s_mounts, mnt_instance) {
if (!(mnt->mnt.mnt_flags & MNT_READONLY)) {
mnt->mnt.mnt_flags |= MNT_WRITE_HOLD;
smp_mb();
if (mnt_get_writers(mnt) > 0) {
err = -EBUSY;
break;
}
}
}
if (!err && atomic_long_read(&sb->s_remove_count))
err = -EBUSY;
if (!err) {
sb->s_readonly_remount = 1;
smp_wmb();
}
list_for_each_entry(mnt, &sb->s_mounts, mnt_instance) {
if (mnt->mnt.mnt_flags & MNT_WRITE_HOLD)
mnt->mnt.mnt_flags &= ~MNT_WRITE_HOLD;
}
static void free_vfsmnt(struct mount *mnt)
kmem_cache_free(mnt_cache, mnt);
* find the first or last mount at @dentry on vfsmount @mnt depending on
* @dir. If @dir is set return the first mount else return the last mount.
* vfsmount_lock must be held for read or write.
struct mount *__lookup_mnt(struct vfsmount *mnt, struct dentry *dentry,
struct list_head *head = mount_hashtable + hash(mnt, dentry);
struct list_head *tmp = head;
struct mount *p, *found = NULL;
p = list_entry(tmp, struct mount, mnt_hash);
if (&p->mnt_parent->mnt == mnt && p->mnt_mountpoint == dentry) {
/*
* lookup_mnt increments the ref count before returning
* the vfsmount struct.
*/
child_mnt = __lookup_mnt(path->mnt, path->dentry, 1);
if (child_mnt) {
mnt_add_count(child_mnt, 1);
return &child_mnt->mnt;
} else {
static inline int check_mnt(struct mount *mnt)
return mnt->mnt_ns == current->nsproxy->mnt_ns;
/*
* vfsmount lock must be held for write
*/
static void touch_mnt_namespace(struct mnt_namespace *ns)
{
	if (!ns)
		return;

	/* Advance the global event counter and stamp it on the namespace. */
	event++;
	ns->event = event;
	/* Let anyone sleeping on the namespace's poll queue notice the change. */
	wake_up_interruptible(&ns->poll);
}
/*
* vfsmount lock must be held for write
*/
static void __touch_mnt_namespace(struct mnt_namespace *ns)
{
	/* Nothing to do without a namespace or when it is already current. */
	if (!ns || ns->event == event)
		return;

	/* Catch up with the global event counter without advancing it. */
	ns->event = event;
	wake_up_interruptible(&ns->poll);
}
/*
* Clear dentry's mounted state if it has no remaining mounts.
* vfsmount_lock must be held for write.
*/
static void dentry_reset_mounted(struct dentry *dentry)
{
	unsigned u;

	/*
	 * Walk every hash chain: as soon as any mount still uses @dentry
	 * as its mountpoint the DCACHE_MOUNTED flag has to stay set.
	 */
	for (u = 0; u < HASH_SIZE; u++) {
		struct mount *p;

		list_for_each_entry(p, &mount_hashtable[u], mnt_hash) {
			if (p->mnt_mountpoint == dentry)
				return;
		}
	}

	/* No mount is left on @dentry; d_flags is updated under d_lock. */
	spin_lock(&dentry->d_lock);
	dentry->d_flags &= ~DCACHE_MOUNTED;
	spin_unlock(&dentry->d_lock);
}
/*
* vfsmount lock must be held for write
*/
static void detach_mnt(struct mount *mnt, struct path *old_path)
{
old_path->dentry = mnt->mnt_mountpoint;
old_path->mnt = &mnt->mnt_parent->mnt;
mnt->mnt_parent = mnt;
mnt->mnt_mountpoint = mnt->mnt.mnt_root;
list_del_init(&mnt->mnt_child);
dentry_reset_mounted(old_path->dentry);
/*
* vfsmount lock must be held for write
*/
void mnt_set_mountpoint(struct mount *mnt, struct dentry *dentry,
struct mount *child_mnt)
mnt_add_count(mnt, 1); /* essentially, that's mntget */
child_mnt->mnt_mountpoint = dget(dentry);
spin_lock(&dentry->d_lock);
dentry->d_flags |= DCACHE_MOUNTED;
spin_unlock(&dentry->d_lock);
/*
* vfsmount lock must be held for write
*/
static void attach_mnt(struct mount *mnt, struct path *path)
mnt_set_mountpoint(real_mount(path->mnt), path->dentry, mnt);
list_add_tail(&mnt->mnt_hash, mount_hashtable +
list_add_tail(&mnt->mnt_child, &real_mount(path->mnt)->mnt_mounts);
static void commit_tree(struct mount *mnt)
struct mount *parent = mnt->mnt_parent;
BUG_ON(parent == mnt);
list_splice(&head, n->list.prev);
list_add_tail(&mnt->mnt_hash, mount_hashtable +
hash(&parent->mnt, mnt->mnt_mountpoint));
list_add_tail(&mnt->mnt_child, &parent->mnt_mounts);
touch_mnt_namespace(n);
static struct mount *next_mnt(struct mount *p, struct mount *root)
struct list_head *next = p->mnt_mounts.next;
if (next == &p->mnt_mounts) {
next = p->mnt_child.next;
if (next != &p->mnt_parent->mnt_mounts)
p = p->mnt_parent;
return list_entry(next, struct mount, mnt_child);
static struct mount *skip_mnt_tree(struct mount *p)
struct list_head *prev = p->mnt_mounts.prev;
while (prev != &p->mnt_mounts) {
p = list_entry(prev, struct mount, mnt_child);
prev = p->mnt_mounts.prev;
struct vfsmount *
vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void *data)
{
struct mount *mnt;
struct dentry *root;
if (!type)
return ERR_PTR(-ENODEV);
mnt = alloc_vfsmnt(name);
if (!mnt)
return ERR_PTR(-ENOMEM);
if (flags & MS_KERNMOUNT)
mnt->mnt.mnt_flags = MNT_INTERNAL;
root = mount_fs(type, flags, name, data);
if (IS_ERR(root)) {
free_vfsmnt(mnt);
return ERR_CAST(root);
}
mnt->mnt.mnt_root = root;
mnt->mnt.mnt_sb = root->d_sb;
mnt->mnt_mountpoint = mnt->mnt.mnt_root;
mnt->mnt_parent = mnt;
list_add_tail(&mnt->mnt_instance, &root->d_sb->s_mounts);
return &mnt->mnt;
}
EXPORT_SYMBOL_GPL(vfs_kern_mount);
static struct mount *clone_mnt(struct mount *old, struct dentry *root,
struct super_block *sb = old->mnt.mnt_sb;
struct mount *mnt;
int err;
mnt = alloc_vfsmnt(old->mnt_devname);
if (!mnt)
return ERR_PTR(-ENOMEM);
if (flag & (CL_SLAVE | CL_PRIVATE))
mnt->mnt_group_id = 0; /* not a peer of original */
else
mnt->mnt_group_id = old->mnt_group_id;
if ((flag & CL_MAKE_SHARED) && !mnt->mnt_group_id) {
err = mnt_alloc_group_id(mnt);
if (err)
goto out_free;
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
mnt->mnt.mnt_flags = old->mnt.mnt_flags & ~MNT_WRITE_HOLD;
atomic_inc(&sb->s_active);
mnt->mnt.mnt_sb = sb;
mnt->mnt.mnt_root = dget(root);
mnt->mnt_mountpoint = mnt->mnt.mnt_root;
mnt->mnt_parent = mnt;
br_write_lock(&vfsmount_lock);
list_add_tail(&mnt->mnt_instance, &sb->s_mounts);
br_write_unlock(&vfsmount_lock);
if (flag & CL_SLAVE) {
list_add(&mnt->mnt_slave, &old->mnt_slave_list);
mnt->mnt_master = old;
CLEAR_MNT_SHARED(mnt);
} else if (!(flag & CL_PRIVATE)) {
if ((flag & CL_MAKE_SHARED) || IS_MNT_SHARED(old))
list_add(&mnt->mnt_share, &old->mnt_share);
if (IS_MNT_SLAVE(old))
list_add(&mnt->mnt_slave, &old->mnt_slave);
mnt->mnt_master = old->mnt_master;
}
if (flag & CL_MAKE_SHARED)
set_mnt_shared(mnt);
/* stick the duplicate mount on the same expiry list
* as the original if that was on one */
if (flag & CL_EXPIRE) {
if (!list_empty(&old->mnt_expire))
list_add(&mnt->mnt_expire, &old->mnt_expire);
}
out_free:
free_vfsmnt(mnt);
return ERR_PTR(err);
static inline void mntfree(struct mount *mnt)
struct vfsmount *m = &mnt->mnt;
struct super_block *sb = m->mnt_sb;
/*
* This probably indicates that somebody messed
* up a mnt_want/drop_write() pair. If this
* happens, the filesystem was probably unable
* to make r/w->r/o transitions.
*/
* The locking used to deal with mnt_count decrement provides barriers,
* so mnt_get_writers() below is safe.
fsnotify_vfsmount_delete(m);
dput(m->mnt_root);
free_vfsmnt(mnt);
static void mntput_no_expire(struct mount *mnt)
if (likely(mnt->mnt_ns)) {
/* shouldn't be the last one */
if (unlikely(mnt->mnt_pinned)) {
mnt_add_count(mnt, mnt->mnt_pinned + 1);
mnt->mnt_pinned = 0;
list_del(&mnt->mnt_instance);
mntfree(mnt);
}
void mntput(struct vfsmount *mnt)
{
	struct mount *m;

	/* Dropping a NULL mount is a no-op. */
	if (!mnt)
		return;

	m = real_mount(mnt);
	/*
	 * Test before writing so the field is only stored to when the
	 * mark is actually set (avoids cacheline pingpong, provided gcc
	 * doesn't get "smart").
	 */
	if (unlikely(m->mnt_expiry_mark))
		m->mnt_expiry_mark = 0;
	mntput_no_expire(m);
}
EXPORT_SYMBOL(mntput);
struct vfsmount *mntget(struct vfsmount *mnt)
{
if (mnt)
mnt_add_count(real_mount(mnt), 1);
return mnt;
}
EXPORT_SYMBOL(mntget);
void mnt_pin(struct vfsmount *mnt)
{
real_mount(mnt)->mnt_pinned++;
}
EXPORT_SYMBOL(mnt_pin);
void mnt_unpin(struct vfsmount *m)
struct mount *mnt = real_mount(m);
if (mnt->mnt_pinned) {
mnt->mnt_pinned--;
}
}
EXPORT_SYMBOL(mnt_unpin);
static inline void mangle(struct seq_file *m, const char *s)
{
	/* Characters escaped on output: space, tab, newline and backslash. */
	static const char esc[] = " \t\n\\";

	seq_escape(m, s, esc);
}
/*
* Simple .show_options callback for filesystems which don't want to
* implement more complex mount option showing.
*
* See also save_mount_options().
*/
int generic_show_options(struct seq_file *m, struct dentry *root)
const char *options;
rcu_read_lock();
options = rcu_dereference(root->d_sb->s_options);
if (options != NULL && options[0]) {
seq_putc(m, ',');
mangle(m, options);
}
return 0;
}
EXPORT_SYMBOL(generic_show_options);
/*
* If filesystem uses generic_show_options(), this function should be
* called from the fill_super() callback.
*
* The .remount_fs callback usually needs to be handled in a special
* way, to make sure that the previous options are not overwritten if
* the remount fails.
*
* Also note that if the filesystem's .remount_fs function doesn't
* reset all options to their default values, but changes only the newly
* given options, then the displayed options will no longer reflect
* reality.
*/
void save_mount_options(struct super_block *sb, char *options)
{
	char *copy;

	/* Options may be saved only once per superblock. */
	BUG_ON(sb->s_options);

	/* Publish a private copy of the option string for RCU readers. */
	copy = kstrdup(options, GFP_KERNEL);
	rcu_assign_pointer(sb->s_options, copy);
}
EXPORT_SYMBOL(save_mount_options);
void replace_mount_options(struct super_block *sb, char *options)
{
	char *previous = sb->s_options;

	/* Publish the new string first so readers switch over to it. */
	rcu_assign_pointer(sb->s_options, options);
	if (!previous)
		return;

	/* Wait for readers that may still hold the old string, then free it. */
	synchronize_rcu();
	kfree(previous);
}
EXPORT_SYMBOL(replace_mount_options);
#ifdef CONFIG_PROC_FS
/* iterator; we want it to have access to namespace_sem, thus here... */
static void *m_start(struct seq_file *m, loff_t *pos)
{
return seq_list_start(&p->ns->list, *pos);
}
static void *m_next(struct seq_file *m, void *v, loff_t *pos)
{
return seq_list_next(v, &p->ns->list, pos);
}
static void m_stop(struct seq_file *m, void *v)
{
static int m_show(struct seq_file *m, void *v)
struct mount *r = list_entry(v, struct mount, mnt_list);
return p->show(m, &r->mnt);
const struct seq_operations mounts_op = {
.start = m_start,
.next = m_next,
.stop = m_stop,
#endif /* CONFIG_PROC_FS */
/**
* may_umount_tree - check if a mount tree is busy
* @mnt: root of mount tree
*
* This is called to check if a tree of mounts has any
* open files, pwds, chroots or sub mounts that are
* busy.
*/
int may_umount_tree(struct vfsmount *m)
struct mount *mnt = real_mount(m);
for (p = mnt; p; p = next_mnt(p, mnt)) {
actual_refs += mnt_get_count(p);