Skip to content
Snippets Groups Projects
fork.c 45.7 KiB
Newer Older
  • Learn to ignore specific revisions
  • Linus Torvalds's avatar
    Linus Torvalds committed
    /*
     *  linux/kernel/fork.c
     *
     *  Copyright (C) 1991, 1992  Linus Torvalds
     */
    
    /*
     *  'fork.c' contains the help-routines for the 'fork' system call
     * (see also entry.S and others).
     * Fork is rather simple, once you get the hang of it, but the memory
     * management can be a bitch. See 'mm/memory.c': 'copy_page_range()'
     */
    
    #include <linux/slab.h>
    #include <linux/init.h>
    #include <linux/unistd.h>
    #include <linux/module.h>
    #include <linux/vmalloc.h>
    #include <linux/completion.h>
    #include <linux/personality.h>
    #include <linux/mempolicy.h>
    #include <linux/sem.h>
    #include <linux/file.h>
    
    Al Viro's avatar
    Al Viro committed
    #include <linux/fdtable.h>
    
    #include <linux/iocontext.h>
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    #include <linux/key.h>
    #include <linux/binfmts.h>
    #include <linux/mman.h>
    
    Andrea Arcangeli's avatar
    Andrea Arcangeli committed
    #include <linux/mmu_notifier.h>
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    #include <linux/fs.h>
    
    #include <linux/nsproxy.h>
    
    #include <linux/capability.h>
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    #include <linux/cpu.h>
    
    #include <linux/cgroup.h>
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    #include <linux/security.h>
    
    #include <linux/seccomp.h>
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    #include <linux/swap.h>
    #include <linux/syscalls.h>
    #include <linux/jiffies.h>
    #include <linux/futex.h>
    
    #include <linux/compat.h>
    
    #include <linux/kthread.h>
    
    #include <linux/task_io_accounting_ops.h>
    
    #include <linux/rcupdate.h>
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    #include <linux/ptrace.h>
    #include <linux/mount.h>
    #include <linux/audit.h>
    
    #include <linux/memcontrol.h>
    
    #include <linux/proc_fs.h>
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    #include <linux/profile.h>
    #include <linux/rmap.h>
    
    #include <linux/ksm.h>
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    #include <linux/acct.h>
    
    #include <linux/tsacct_kern.h>
    
    #include <linux/cn_proc.h>
    
    #include <linux/freezer.h>
    
    #include <linux/delayacct.h>
    
    #include <linux/taskstats_kern.h>
    
    #include <linux/tty.h>
    
    #include <linux/blkdev.h>
    
    #include <linux/fs_struct.h>
    
    #include <linux/perf_event.h>
    
    #include <linux/posix-timers.h>
    
    #include <linux/user-return-notifier.h>
    
    #include <linux/oom.h>
    
    Andrea Arcangeli's avatar
    Andrea Arcangeli committed
    #include <linux/khugepaged.h>
    
    #include <linux/aio.h>
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    
    #include <asm/pgtable.h>
    #include <asm/pgalloc.h>
    #include <asm/uaccess.h>
    #include <asm/mmu_context.h>
    #include <asm/cacheflush.h>
    #include <asm/tlbflush.h>
    
    
    #define CREATE_TRACE_POINTS
    #include <trace/events/task.h>
    
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    /*
     * Protected counters by write_lock_irq(&tasklist_lock)
     */
    unsigned long total_forks;	/* Handle normal Linux uptimes. */
    
    int nr_threads;			/* The idle threads do not count.. */
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    
    int max_threads;		/* tunable limit on nr_threads */
    
    DEFINE_PER_CPU(unsigned long, process_counts) = 0;
    
    
    __cacheline_aligned DEFINE_RWLOCK(tasklist_lock);  /* outer */
    
    
    #ifdef CONFIG_PROVE_RCU
/* Lockdep helper: nonzero when the caller holds tasklist_lock. */
int lockdep_tasklist_lock_is_held(void)
{
	return lockdep_is_held(&tasklist_lock);
}
EXPORT_SYMBOL_GPL(lockdep_tasklist_lock_is_held);
#endif /* #ifdef CONFIG_PROVE_RCU */
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    
    int nr_processes(void)
    {
    	int cpu;
    	int total = 0;
    
    
    	for_each_possible_cpu(cpu)
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    		total += per_cpu(process_counts, cpu);
    
    	return total;
    }
    
    
/* Weak no-op stub; architectures may override to release arch-private
 * task_struct state before the struct itself is freed. */
void __weak arch_release_task_struct(struct task_struct *tsk)
{
}
    
    
    #ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR
    
/* Slab cache backing all task_struct allocations (generic allocator case). */
static struct kmem_cache *task_struct_cachep;


/* Allocate a task_struct from the slab cache on the given NUMA node. */
static inline struct task_struct *alloc_task_struct_node(int node)
{
	return kmem_cache_alloc_node(task_struct_cachep, GFP_KERNEL, node);
}

/* Return a task_struct to the slab cache. */
static inline void free_task_struct(struct task_struct *tsk)
{
	kmem_cache_free(task_struct_cachep, tsk);
}
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    #endif
    
    
/* Weak no-op stub; architectures may override to release arch-private
 * thread_info state before the stack is freed. */
void __weak arch_release_thread_info(struct thread_info *ti)
{
}
    
    
    #ifndef CONFIG_ARCH_THREAD_INFO_ALLOCATOR
    
    /*
     * Allocate pages if THREAD_SIZE is >= PAGE_SIZE, otherwise use a
     * kmemcache based allocator.
     */
    # if THREAD_SIZE >= PAGE_SIZE
    
    static struct thread_info *alloc_thread_info_node(struct task_struct *tsk,
    						  int node)
    
    	struct page *page = alloc_pages_node(node, THREADINFO_GFP_ACCOUNTED,
    
    					     THREAD_SIZE_ORDER);
    
    
    	return page ? page_address(page) : NULL;
    
    }
    
    static inline void free_thread_info(struct thread_info *ti)
    {
    
    	free_memcg_kmem_pages((unsigned long)ti, THREAD_SIZE_ORDER);
    
# else
/* THREAD_SIZE < PAGE_SIZE: carve thread_infos out of a dedicated slab. */
static struct kmem_cache *thread_info_cache;

static struct thread_info *alloc_thread_info_node(struct task_struct *tsk,
						  int node)
{
	return kmem_cache_alloc_node(thread_info_cache, THREADINFO_GFP, node);
}

static void free_thread_info(struct thread_info *ti)
{
	kmem_cache_free(thread_info_cache, ti);
}

/* Create the thread_info slab; called early during kernel init. */
void thread_info_cache_init(void)
{
	thread_info_cache = kmem_cache_create("thread_info", THREAD_SIZE,
					      THREAD_SIZE, 0, NULL);
	BUG_ON(thread_info_cache == NULL);
}
# endif
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    /* SLAB cache for signal_struct structures (tsk->signal) */
    
    static struct kmem_cache *signal_cachep;
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    
    /* SLAB cache for sighand_struct structures (tsk->sighand) */
    
    struct kmem_cache *sighand_cachep;
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    
    /* SLAB cache for files_struct structures (tsk->files) */
    
    struct kmem_cache *files_cachep;
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    
    /* SLAB cache for fs_struct structures (tsk->fs) */
    
    struct kmem_cache *fs_cachep;
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    
    /* SLAB cache for vm_area_struct structures */
    
    struct kmem_cache *vm_area_cachep;
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    
    /* SLAB cache for mm_struct structures (tsk->mm) */
    
    static struct kmem_cache *mm_cachep;
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    
    
/* Adjust the NR_KERNEL_STACK statistic of the zone holding this stack;
 * @account is +1 when a stack is created, -1 when it is freed. */
static void account_kernel_stack(struct thread_info *ti, int account)
{
	struct zone *zone = page_zone(virt_to_page(ti));

	mod_zone_page_state(zone, NR_KERNEL_STACK, account);
}
    
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    void free_task(struct task_struct *tsk)
    {
    
    	arch_release_thread_info(tsk->stack);
    
    	free_thread_info(tsk->stack);
    
    	rt_mutex_debug_task_free(tsk);
    
    	put_seccomp_filter(tsk);
    
    	arch_release_task_struct(tsk);
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    	free_task_struct(tsk);
    }
    EXPORT_SYMBOL(free_task);
    
    
/* Tear down a signal_struct once its last sigcnt reference is dropped. */
static inline void free_signal_struct(struct signal_struct *sig)
{

	sched_autogroup_exit(sig);

	kmem_cache_free(signal_cachep, sig);
}
    
    static inline void put_signal_struct(struct signal_struct *sig)
    {
    
    	if (atomic_dec_and_test(&sig->sigcnt))
    
    void __put_task_struct(struct task_struct *tsk)
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    {
    
    Eugene Teo's avatar
    Eugene Teo committed
    	WARN_ON(!tsk->exit_state);
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    	WARN_ON(atomic_read(&tsk->usage));
    	WARN_ON(tsk == current);
    
    
    	security_task_free(tsk);
    
    	delayacct_tsk_free(tsk);
    
    	put_signal_struct(tsk->signal);
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    
    	if (!profile_handoff_task(tsk))
    		free_task(tsk);
    }
    
    EXPORT_SYMBOL_GPL(__put_task_struct);
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    
    
    void __init __weak arch_task_cache_init(void) { }
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    void __init fork_init(unsigned long mempages)
    {
    
    #ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    #ifndef ARCH_MIN_TASKALIGN
    #define ARCH_MIN_TASKALIGN	L1_CACHE_BYTES
    #endif
    	/* create a slab on which task_structs can be allocated */
    	task_struct_cachep =
    		kmem_cache_create("task_struct", sizeof(struct task_struct),
    
    			ARCH_MIN_TASKALIGN, SLAB_PANIC | SLAB_NOTRACK, NULL);
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    #endif
    
    
    	/* do the arch specific task caches init */
    	arch_task_cache_init();
    
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    	/*
    	 * The default maximum number of threads is set to a safe
    	 * value: the thread structures can take up at most half
    	 * of memory.
    	 */
    	max_threads = mempages / (8 * THREAD_SIZE / PAGE_SIZE);
    
    	/*
    	 * we need to allow at least 20 threads to boot a system
    	 */
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    		max_threads = 20;
    
    	init_task.signal->rlim[RLIMIT_NPROC].rlim_cur = max_threads/2;
    	init_task.signal->rlim[RLIMIT_NPROC].rlim_max = max_threads/2;
    	init_task.signal->rlim[RLIMIT_SIGPENDING] =
    		init_task.signal->rlim[RLIMIT_NPROC];
    }
    
    
/* Default task_struct duplication: plain structure assignment.  Declared
 * weak so architectures with extra per-task state can override it. */
int __attribute__((weak)) arch_dup_task_struct(struct task_struct *dst,
					       struct task_struct *src)
{
	*dst = *src;
	return 0;
}
    
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    static struct task_struct *dup_task_struct(struct task_struct *orig)
    {
    	struct task_struct *tsk;
    	struct thread_info *ti;
    
    	int node = tsk_fork_get_node(orig);
    
    	int err;
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    
    
    	tsk = alloc_task_struct_node(node);
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    	if (!tsk)
    		return NULL;
    
    
    	ti = alloc_thread_info_node(tsk, node);
    
    	if (!ti)
    		goto free_tsk;
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    
    
    	err = arch_dup_task_struct(tsk, orig);
    
    		goto free_ti;
    
    	tsk->stack = ti;
    
    	setup_thread_stack(tsk, orig);
    
    	clear_user_return_notifier(tsk);
    
    	clear_tsk_need_resched(tsk);
    
    	stackend = end_of_stack(tsk);
    	*stackend = STACK_END_MAGIC;	/* for overflow detection */
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    
    
    #ifdef CONFIG_CC_STACKPROTECTOR
    	tsk->stack_canary = get_random_int();
    #endif
    
    
    	/*
    	 * One for us, one for whoever does the "release_task()" (usually
    	 * parent)
    	 */
    	atomic_set(&tsk->usage, 2);
    
    #ifdef CONFIG_BLK_DEV_IO_TRACE
    
    	tsk->splice_pipe = NULL;
    
    	tsk->task_frag.page = NULL;
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    	return tsk;
    
    	free_thread_info(ti);
    
    	free_task_struct(tsk);
    	return NULL;
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    }
    
    #ifdef CONFIG_MMU
    
    Alexey Dobriyan's avatar
    Alexey Dobriyan committed
    static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    {
    
    	struct vm_area_struct *mpnt, *tmp, *prev, **pprev;
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    	struct rb_node **rb_link, *rb_parent;
    	int retval;
    	unsigned long charge;
    	struct mempolicy *pol;
    
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    	down_write(&oldmm->mmap_sem);
    
    	flush_cache_dup_mm(oldmm);
    
    	uprobe_dup_mmap(oldmm, mm);
    
    	/*
    	 * Not linked in yet - no deadlock potential:
    	 */
    	down_write_nested(&mm->mmap_sem, SINGLE_DEPTH_NESTING);
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    	mm->locked_vm = 0;
    	mm->mmap = NULL;
    	mm->mmap_cache = NULL;
    	mm->map_count = 0;
    
    	cpumask_clear(mm_cpumask(mm));
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    	mm->mm_rb = RB_ROOT;
    	rb_link = &mm->mm_rb.rb_node;
    	rb_parent = NULL;
    	pprev = &mm->mmap;
    
    	retval = ksm_fork(mm, oldmm);
    
    Andrea Arcangeli's avatar
    Andrea Arcangeli committed
    	if (retval)
    		goto out;
    	retval = khugepaged_fork(mm, oldmm);
    
    	if (retval)
    		goto out;
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    
    
    	for (mpnt = oldmm->mmap; mpnt; mpnt = mpnt->vm_next) {
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    		struct file *file;
    
    		if (mpnt->vm_flags & VM_DONTCOPY) {
    
    			vm_stat_account(mm, mpnt->vm_flags, mpnt->vm_file,
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    			continue;
    		}
    		charge = 0;
    		if (mpnt->vm_flags & VM_ACCOUNT) {
    
    			unsigned long len = vma_pages(mpnt);
    
    
    			if (security_vm_enough_memory_mm(oldmm, len)) /* sic */
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    				goto fail_nomem;
    			charge = len;
    		}
    
    		tmp = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    		if (!tmp)
    			goto fail_nomem;
    		*tmp = *mpnt;
    
    		INIT_LIST_HEAD(&tmp->anon_vma_chain);
    
    		pol = mpol_dup(vma_policy(mpnt));
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    		retval = PTR_ERR(pol);
    		if (IS_ERR(pol))
    			goto fail_nomem_policy;
    		vma_set_policy(tmp, pol);
    
    		tmp->vm_mm = mm;
    
    		if (anon_vma_fork(tmp, mpnt))
    			goto fail_nomem_anon_vma_fork;
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    		tmp->vm_flags &= ~VM_LOCKED;
    
    		tmp->vm_next = tmp->vm_prev = NULL;
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    		file = tmp->vm_file;
    		if (file) {
    
    Al Viro's avatar
    Al Viro committed
    			struct inode *inode = file_inode(file);
    
    			struct address_space *mapping = file->f_mapping;
    
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    			get_file(file);
    			if (tmp->vm_flags & VM_DENYWRITE)
    				atomic_dec(&inode->i_writecount);
    
    			mutex_lock(&mapping->i_mmap_mutex);
    
    			if (tmp->vm_flags & VM_SHARED)
    				mapping->i_mmap_writable++;
    			flush_dcache_mmap_lock(mapping);
    			/* insert tmp into the share list, just after mpnt */
    
    			if (unlikely(tmp->vm_flags & VM_NONLINEAR))
    				vma_nonlinear_insert(tmp,
    						&mapping->i_mmap_nonlinear);
    			else
    				vma_interval_tree_insert_after(tmp, mpnt,
    							&mapping->i_mmap);
    
    			flush_dcache_mmap_unlock(mapping);
    
    			mutex_unlock(&mapping->i_mmap_mutex);
    
    		/*
    		 * Clear hugetlb-related page reserves for children. This only
    		 * affects MAP_PRIVATE mappings. Faults generated by the child
    		 * are not guaranteed to succeed, even if read-only
    		 */
    		if (is_vm_hugetlb_page(tmp))
    			reset_vma_resv_huge_pages(tmp);
    
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    		/*
    
    		 * Link in the new vma and copy the page table entries.
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    		 */
    		*pprev = tmp;
    		pprev = &tmp->vm_next;
    
    		tmp->vm_prev = prev;
    		prev = tmp;
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    
    		__vma_link_rb(mm, tmp, rb_link, rb_parent);
    		rb_link = &tmp->vm_rb.rb_right;
    		rb_parent = &tmp->vm_rb;
    
    		mm->map_count++;
    
    		retval = copy_page_range(mm, oldmm, mpnt);
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    
    		if (tmp->vm_ops && tmp->vm_ops->open)
    			tmp->vm_ops->open(tmp);
    
    		if (retval)
    			goto out;
    	}
    
    	/* a new mm has just been created */
    	arch_dup_mmap(oldmm, mm);
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    	retval = 0;
    out:
    
    	up_write(&mm->mmap_sem);
    
    	flush_tlb_mm(oldmm);
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    	up_write(&oldmm->mmap_sem);
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    	return retval;
    
    fail_nomem_anon_vma_fork:
    	mpol_put(pol);
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    fail_nomem_policy:
    	kmem_cache_free(vm_area_cachep, tmp);
    fail_nomem:
    	retval = -ENOMEM;
    	vm_unacct_memory(charge);
    	goto out;
    }
    
    
    static inline int mm_alloc_pgd(struct mm_struct *mm)
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    {
    	mm->pgd = pgd_alloc(mm);
    	if (unlikely(!mm->pgd))
    		return -ENOMEM;
    	return 0;
    }
    
    
    static inline void mm_free_pgd(struct mm_struct *mm)
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    {
    
    	pgd_free(mm, mm->pgd);
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    }
    #else
    #define dup_mmap(mm, oldmm)	(0)
    #define mm_alloc_pgd(mm)	(0)
    #define mm_free_pgd(mm)
    #endif /* CONFIG_MMU */
    
    
    Daniel Walker's avatar
    Daniel Walker committed
    __cacheline_aligned_in_smp DEFINE_SPINLOCK(mmlist_lock);
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    
    
    #define allocate_mm()	(kmem_cache_alloc(mm_cachep, GFP_KERNEL))
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    #define free_mm(mm)	(kmem_cache_free(mm_cachep, (mm)))
    
    
/* Default coredump filter applied to mm's with no parent to inherit from. */
static unsigned long default_dump_filter = MMF_DUMP_FILTER_DEFAULT;

/* Parse the "coredump_filter=" boot option: a numeric bitmask shifted
 * into the MMF dump-filter bits. */
static int __init coredump_filter_setup(char *s)
{
	default_dump_filter =
		(simple_strtoul(s, NULL, 0) << MMF_DUMP_FILTER_SHIFT) &
		MMF_DUMP_FILTER_MASK;
	return 1;
}

__setup("coredump_filter=", coredump_filter_setup);
    
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    #include <linux/init_task.h>
    
    
    static void mm_init_aio(struct mm_struct *mm)
    {
    #ifdef CONFIG_AIO
    	spin_lock_init(&mm->ioctx_lock);
    	INIT_HLIST_HEAD(&mm->ioctx_list);
    #endif
    }
    
    
    static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p)
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    {
    	atomic_set(&mm->mm_users, 1);
    	atomic_set(&mm->mm_count, 1);
    	init_rwsem(&mm->mmap_sem);
    	INIT_LIST_HEAD(&mm->mmlist);
    
    	mm->flags = (current->mm) ?
    		(current->mm->flags & MMF_INIT_MASK) : default_dump_filter;
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    	mm->nr_ptes = 0;
    
    	memset(&mm->rss_stat, 0, sizeof(mm->rss_stat));
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    	spin_lock_init(&mm->page_table_lock);
    
    	mm_init_aio(mm);
    
    	mm_init_owner(mm, p);
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    
    	if (likely(!mm_alloc_pgd(mm))) {
    		mm->def_flags = 0;
    
    Andrea Arcangeli's avatar
    Andrea Arcangeli committed
    		mmu_notifier_mm_init(mm);
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    		return mm;
    	}
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    	free_mm(mm);
    	return NULL;
    }
    
    
/* Debug check on an mm being freed: report any RSS counters that did not
 * drain to zero and (with THP) assert no leftover preallocated pmd page. */
static void check_mm(struct mm_struct *mm)
{
	int i;

	for (i = 0; i < NR_MM_COUNTERS; i++) {
		long x = atomic_long_read(&mm->rss_stat.count[i]);

		if (unlikely(x))
			printk(KERN_ALERT "BUG: Bad rss-counter state "
					  "mm:%p idx:%d val:%ld\n", mm, i, x);
	}

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	VM_BUG_ON(mm->pmd_huge_pte);
#endif
}
    
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    /*
     * Allocate and initialize an mm_struct.
     */
    
    struct mm_struct *mm_alloc(void)
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    {
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    
    	mm = allocate_mm();
    
    	if (!mm)
    		return NULL;
    
    	memset(mm, 0, sizeof(*mm));
    
    	mm_init_cpumask(mm);
    	return mm_init(mm, current);
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    }
    
    /*
     * Called when the last reference to the mm
     * is dropped: either by a lazy thread or by
     * mmput. Free the page directory and the mm.
     */
    
    void __mmdrop(struct mm_struct *mm)
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    {
    	BUG_ON(mm == &init_mm);
    	mm_free_pgd(mm);
    	destroy_context(mm);
    
    Andrea Arcangeli's avatar
    Andrea Arcangeli committed
    	mmu_notifier_mm_destroy(mm);
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    	free_mm(mm);
    }
    
    EXPORT_SYMBOL_GPL(__mmdrop);
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    
    /*
     * Decrement the use count and release all resources for an mm.
     */
    void mmput(struct mm_struct *mm)
    {
    
    	might_sleep();
    
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    	if (atomic_dec_and_test(&mm->mm_users)) {
    
    		uprobe_clear_state(mm);
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    		exit_aio(mm);
    
    Andrea Arcangeli's avatar
    Andrea Arcangeli committed
    		khugepaged_exit(mm); /* must run before exit_mmap */
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    		exit_mmap(mm);
    
    Matt Helsley's avatar
    Matt Helsley committed
    		set_mm_exe_file(mm, NULL);
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    		if (!list_empty(&mm->mmlist)) {
    			spin_lock(&mmlist_lock);
    			list_del(&mm->mmlist);
    			spin_unlock(&mmlist_lock);
    		}
    
    		if (mm->binfmt)
    			module_put(mm->binfmt->module);
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    		mmdrop(mm);
    	}
    }
    EXPORT_SYMBOL_GPL(mmput);
    
    
/* Replace mm->exe_file: take a reference on the new file (if any) and
 * drop the old one.  NOTE(review): no locking visible here — callers
 * appear responsible for serialization; confirm against call sites. */
void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file)
{
	if (new_exe_file)
		get_file(new_exe_file);
	if (mm->exe_file)
		fput(mm->exe_file);
	mm->exe_file = new_exe_file;
}
    
/* Return mm->exe_file with an extra reference taken, or NULL.  Caller
 * must fput() the result. */
struct file *get_mm_exe_file(struct mm_struct *mm)
{
	struct file *exe_file;


	/* We need mmap_sem to protect against races with removal of exe_file */

	down_read(&mm->mmap_sem);
	exe_file = mm->exe_file;
	if (exe_file)
		get_file(exe_file);
	up_read(&mm->mmap_sem);
	return exe_file;
}
    
/* Point a newly forked mm at the parent's exe_file (with a reference). */
static void dup_mm_exe_file(struct mm_struct *oldmm, struct mm_struct *newmm)
{
	/* It's safe to write the exe_file pointer without exe_file_lock because
	 * this is called during fork when the task is not yet in /proc */
	newmm->exe_file = get_mm_exe_file(oldmm);
}
    
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    /**
     * get_task_mm - acquire a reference to the task's mm
     *
    
     * Returns %NULL if the task has no mm.  Checks PF_KTHREAD (meaning
    
    Linus Torvalds's avatar
    Linus Torvalds committed
     * this kernel workthread has transiently adopted a user mm with use_mm,
     * to do its AIO) is not set and if so returns a reference to it, after
     * bumping up the use count.  User must release the mm via mmput()
     * after use.  Typically used by /proc and ptrace.
     */
    struct mm_struct *get_task_mm(struct task_struct *task)
    {
    	struct mm_struct *mm;
    
    	task_lock(task);
    	mm = task->mm;
    	if (mm) {
    
    		if (task->flags & PF_KTHREAD)
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    			mm = NULL;
    		else
    			atomic_inc(&mm->mm_users);
    	}
    	task_unlock(task);
    	return mm;
    }
    EXPORT_SYMBOL_GPL(get_task_mm);
    
    
/*
 * Access another task's mm for inspection: takes cred_guard_mutex
 * (killable) and requires ptrace permission in @mode unless the mm is
 * the caller's own.  Returns the mm with a reference, NULL if the task
 * has none, or an ERR_PTR on interruption/permission failure.
 */
struct mm_struct *mm_access(struct task_struct *task, unsigned int mode)
{
	struct mm_struct *mm;
	int err;

	err =  mutex_lock_killable(&task->signal->cred_guard_mutex);
	if (err)
		return ERR_PTR(err);

	mm = get_task_mm(task);
	if (mm && mm != current->mm &&
			!ptrace_may_access(task, mode)) {
		mmput(mm);
		mm = ERR_PTR(-EACCES);
	}
	mutex_unlock(&task->signal->cred_guard_mutex);

	return mm;
}
    
    
    static void complete_vfork_done(struct task_struct *tsk)
    
    Oleg Nesterov's avatar
    Oleg Nesterov committed
    	struct completion *vfork;
    
    Oleg Nesterov's avatar
    Oleg Nesterov committed
    	task_lock(tsk);
    	vfork = tsk->vfork_done;
    	if (likely(vfork)) {
    		tsk->vfork_done = NULL;
    		complete(vfork);
    	}
    	task_unlock(tsk);
    }
    
    static int wait_for_vfork_done(struct task_struct *child,
    				struct completion *vfork)
    {
    	int killed;
    
    	freezer_do_not_count();
    	killed = wait_for_completion_killable(vfork);
    	freezer_count();
    
    	if (killed) {
    		task_lock(child);
    		child->vfork_done = NULL;
    		task_unlock(child);
    	}
    
    	put_task_struct(child);
    	return killed;
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    /* Please note the differences between mmput and mm_release.
     * mmput is called whenever we stop holding onto a mm_struct,
     * error success whatever.
     *
     * mm_release is called after a mm_struct has been removed
     * from the current process.
     *
     * This difference is important for error handling, when we
     * only half set up a mm_struct for a new process and need to restore
     * the old one.  Because we mmput the new mm_struct before
     * restoring the old one. . .
     * Eric Biederman 10 January 1998
     */
    void mm_release(struct task_struct *tsk, struct mm_struct *mm)
    {
    
    	/* Get rid of any futexes when releasing the mm */
    #ifdef CONFIG_FUTEX
    
    	if (unlikely(tsk->robust_list)) {
    
    		exit_robust_list(tsk);
    
    		tsk->robust_list = NULL;
    	}
    
    #ifdef CONFIG_COMPAT
    
    	if (unlikely(tsk->compat_robust_list)) {
    
    		compat_exit_robust_list(tsk);
    
    		tsk->compat_robust_list = NULL;
    	}
    
    	if (unlikely(!list_empty(&tsk->pi_state_list)))
    		exit_pi_state_list(tsk);
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    	/* Get rid of any cached register state */
    	deactivate_mm(tsk, mm);
    
    
    	/*
    	 * If we're exiting normally, clear a user-space tid field if
    	 * requested.  We leave this alone when dying by signal, to leave
    	 * the value intact in a core dump, and to save the unnecessary
    
    Oleg Nesterov's avatar
    Oleg Nesterov committed
    	 * trouble, say, a killed vfork parent shouldn't touch this mm.
    	 * Userland only wants this done for a sys_exit.
    
    	if (tsk->clear_child_tid) {
    		if (!(tsk->flags & PF_SIGNALED) &&
    		    atomic_read(&mm->mm_users) > 1) {
    			/*
    			 * We don't check the error code - if userspace has
    			 * not set up a proper pointer then tough luck.
    			 */
    			put_user(0, tsk->clear_child_tid);
    			sys_futex(tsk->clear_child_tid, FUTEX_WAKE,
    					1, NULL, NULL, 0);
    		}
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    		tsk->clear_child_tid = NULL;
    	}
    
    
    	/*
    	 * All done, finally we can wake up parent and return this mm to him.
    	 * Also kthread_stop() uses this completion for synchronization.
    	 */
    	if (tsk->vfork_done)
    		complete_vfork_done(tsk);
    
    /*
     * Allocate a new mm structure and copy contents from the
     * mm structure of the passed in task structure.
     */
    
    struct mm_struct *dup_mm(struct task_struct *tsk)
    
    {
    	struct mm_struct *mm, *oldmm = current->mm;
    	int err;
    
    	if (!oldmm)
    		return NULL;
    
    	mm = allocate_mm();
    	if (!mm)
    		goto fail_nomem;
    
    	memcpy(mm, oldmm, sizeof(*mm));
    
    	mm_init_cpumask(mm);
    
    #ifdef CONFIG_TRANSPARENT_HUGEPAGE
    	mm->pmd_huge_pte = NULL;
    
    #endif
    #ifdef CONFIG_NUMA_BALANCING
    	mm->first_nid = NUMA_PTE_SCAN_INIT;
    
    	if (!mm_init(mm, tsk))
    
    		goto fail_nomem;
    
    	if (init_new_context(tsk, mm))
    		goto fail_nocontext;
    
    
    Matt Helsley's avatar
    Matt Helsley committed
    	dup_mm_exe_file(oldmm, mm);
    
    
    	err = dup_mmap(mm, oldmm);
    	if (err)
    		goto free_pt;
    
    	mm->hiwater_rss = get_mm_rss(mm);
    	mm->hiwater_vm = mm->total_vm;
    
    
    	if (mm->binfmt && !try_module_get(mm->binfmt->module))
    		goto free_pt;
    
    
    	/* don't put binfmt in mmput, we haven't got module yet */
    	mm->binfmt = NULL;
    
    	mmput(mm);
    
    fail_nomem:
    	return NULL;
    
    fail_nocontext:
    	/*
    	 * If init_new_context() failed, we cannot use mmput() to free the mm
    	 * because it calls destroy_context()
    	 */
    	mm_free_pgd(mm);
    	free_mm(mm);
    	return NULL;
    }
    
    
    static int copy_mm(unsigned long clone_flags, struct task_struct *tsk)
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    {
    
    	struct mm_struct *mm, *oldmm;
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    	int retval;
    
    	tsk->min_flt = tsk->maj_flt = 0;
    	tsk->nvcsw = tsk->nivcsw = 0;
    
    #ifdef CONFIG_DETECT_HUNG_TASK
    	tsk->last_switch_count = tsk->nvcsw + tsk->nivcsw;
    #endif
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    
    	tsk->mm = NULL;
    	tsk->active_mm = NULL;
    
    	/*
    	 * Are we cloning a kernel thread?
    	 *
    	 * We need to steal a active VM for that..
    	 */
    	oldmm = current->mm;
    	if (!oldmm)
    		return 0;
    
    	if (clone_flags & CLONE_VM) {
    		atomic_inc(&oldmm->mm_users);
    		mm = oldmm;
    		goto good_mm;
    	}
    
    	retval = -ENOMEM;
    
    	mm = dup_mm(tsk);
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    	if (!mm)
    		goto fail_nomem;
    
    good_mm:
    	tsk->mm = mm;
    	tsk->active_mm = mm;
    	return 0;
    
    fail_nomem:
    	return retval;
    }
    
    
    Alexey Dobriyan's avatar
    Alexey Dobriyan committed
    static int copy_fs(unsigned long clone_flags, struct task_struct *tsk)
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    {
    
    	struct fs_struct *fs = current->fs;
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    	if (clone_flags & CLONE_FS) {
    
    		/* tsk->fs is already what we want */
    
    		spin_lock(&fs->lock);
    
    		if (fs->in_exec) {
    
    			spin_unlock(&fs->lock);
    
    			return -EAGAIN;
    		}
    		fs->users++;
    
    		spin_unlock(&fs->lock);
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    		return 0;
    	}
    
    	tsk->fs = copy_fs_struct(fs);
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    	if (!tsk->fs)
    		return -ENOMEM;
    	return 0;
    }
    
    
    static int copy_files(unsigned long clone_flags, struct task_struct *tsk)
    
    {
    	struct files_struct *oldf, *newf;
    	int error = 0;
    
    	/*
    	 * A background process may not have any files ...
    	 */
    	oldf = current->files;
    	if (!oldf)
    		goto out;
    
    	if (clone_flags & CLONE_FILES) {
    		atomic_inc(&oldf->count);
    		goto out;
    	}
    
    	newf = dup_fd(oldf, &error);
    	if (!newf)
    		goto out;
    
    	tsk->files = newf;
    	error = 0;
    out:
    	return error;
    }
    
    
    static int copy_io(unsigned long clone_flags, struct task_struct *tsk)
    
    {
    #ifdef CONFIG_BLOCK
    	struct io_context *ioc = current->io_context;
    
    	/*
    	 * Share io context with parent, if CLONE_IO is set
    	 */
    	if (clone_flags & CLONE_IO) {
    
    		ioc_task_link(ioc);
    		tsk->io_context = ioc;
    
    	} else if (ioprio_valid(ioc->ioprio)) {
    
    		new_ioc = get_task_io_context(tsk, GFP_KERNEL, NUMA_NO_NODE);
    		if (unlikely(!new_ioc))
    
    		put_io_context(new_ioc);
    
    Alexey Dobriyan's avatar
    Alexey Dobriyan committed
    static int copy_sighand(unsigned long clone_flags, struct task_struct *tsk)
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    {
    	struct sighand_struct *sig;
    
    
    	if (clone_flags & CLONE_SIGHAND) {
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    		atomic_inc(&current->sighand->count);
    		return 0;
    	}
    	sig = kmem_cache_alloc(sighand_cachep, GFP_KERNEL);
    
    Ingo Molnar's avatar
    Ingo Molnar committed
    	rcu_assign_pointer(tsk->sighand, sig);
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    	if (!sig)
    		return -ENOMEM;
    	atomic_set(&sig->count, 1);
    	memcpy(sig->action, current->sighand->action, sizeof(sig->action));
    	return 0;
    }
    
    
    void __cleanup_sighand(struct sighand_struct *sighand)
    
    	if (atomic_dec_and_test(&sighand->count)) {
    		signalfd_cleanup(sighand);