static inline int copy_pud_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
    		pgd_t *dst_pgd, pgd_t *src_pgd, struct vm_area_struct *vma,
    		unsigned long addr, unsigned long end)
    {
    	pud_t *src_pud, *dst_pud;
    	unsigned long next;
    
    	dst_pud = pud_alloc(dst_mm, dst_pgd, addr);
    	if (!dst_pud)
    		return -ENOMEM;
    	src_pud = pud_offset(src_pgd, addr);
    	do {
    		next = pud_addr_end(addr, end);
    		if (pud_none_or_clear_bad(src_pud))
    			continue;
    		if (copy_pmd_range(dst_mm, src_mm, dst_pud, src_pud,
    						vma, addr, next))
    			return -ENOMEM;
    	} while (dst_pud++, src_pud++, addr = next, addr != end);
    	return 0;
    }
    
    int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
    		struct vm_area_struct *vma)
    {
    	pgd_t *src_pgd, *dst_pgd;
    	unsigned long next;
    	unsigned long addr = vma->vm_start;
    	unsigned long end = vma->vm_end;
    
    	unsigned long mmun_start;	/* For mmu_notifiers */
    	unsigned long mmun_end;		/* For mmu_notifiers */
    	bool is_cow;
    
	int ret;

    	/*
    	 * Don't copy ptes where a page fault will fill them correctly.
    	 * Fork becomes much lighter when there are big shared or private
    	 * readonly mappings. The tradeoff is that copy_page_range is more
    	 * efficient than faulting.
    	 */
    
    	if (!(vma->vm_flags & (VM_HUGETLB | VM_NONLINEAR |
    			       VM_PFNMAP | VM_MIXEDMAP))) {
    
    		if (!vma->anon_vma)
    			return 0;
	}

    	if (is_vm_hugetlb_page(vma))
    		return copy_hugetlb_page_range(dst_mm, src_mm, vma);
    
    
    	if (unlikely(vma->vm_flags & VM_PFNMAP)) {
    
    		/*
    		 * We do not free on error cases below as remove_vma
    		 * gets called on error from higher level routine
		 */
		ret = track_pfn_copy(vma);
		if (ret)
			return ret;
	}

    	/*
    	 * We need to invalidate the secondary MMU mappings only when
    	 * there could be a permission downgrade on the ptes of the
    	 * parent mm. And a permission downgrade will only happen if
    	 * is_cow_mapping() returns true.
    	 */
    
    	is_cow = is_cow_mapping(vma->vm_flags);
    	mmun_start = addr;
    	mmun_end   = end;
    	if (is_cow)
    		mmu_notifier_invalidate_range_start(src_mm, mmun_start,
    						    mmun_end);
    
	ret = 0;
    	dst_pgd = pgd_offset(dst_mm, addr);
    	src_pgd = pgd_offset(src_mm, addr);
    	do {
    		next = pgd_addr_end(addr, end);
    		if (pgd_none_or_clear_bad(src_pgd))
			continue;
    		if (unlikely(copy_pud_range(dst_mm, src_mm, dst_pgd, src_pgd,
    					    vma, addr, next))) {
    			ret = -ENOMEM;
    			break;
		}
    	} while (dst_pgd++, src_pgd++, addr = next, addr != end);
    
    	if (is_cow)
		mmu_notifier_invalidate_range_end(src_mm, mmun_start, mmun_end);
	return ret;
}
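
/*
 * Illustrative sketch, not part of the original file: how a fork-style
 * caller might drive copy_page_range() once per VMA, in the spirit of
 * dup_mmap() in kernel/fork.c. The helper name and the trimmed locking
 * and error handling are assumptions for illustration only.
 */
static int example_copy_all_vmas(struct mm_struct *dst_mm,
				 struct mm_struct *src_mm)
{
	struct vm_area_struct *vma;
	int ret = 0;

	/* callers like dup_mmap() hold the relevant mmap_sem(s) here */
	for (vma = src_mm->mmap; vma; vma = vma->vm_next) {
		ret = copy_page_range(dst_mm, src_mm, vma);
		if (ret)
			break;
	}
	return ret;
}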
    
static unsigned long zap_pte_range(struct mmu_gather *tlb,
				struct vm_area_struct *vma, pmd_t *pmd,
				unsigned long addr, unsigned long end,
				struct zap_details *details)
{
	struct mm_struct *mm = tlb->mm;
	int force_flush = 0;
	int rss[NR_MM_COUNTERS];
	spinlock_t *ptl;
	pte_t *start_pte;
	pte_t *pte;
	unsigned long range_start = addr;

again:
	init_rss_vec(rss);
	start_pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
	pte = start_pte;
	arch_enter_lazy_mmu_mode();
	do {
		pte_t ptent = *pte;
		if (pte_none(ptent)) {
			continue;
		}

		if (pte_present(ptent)) {
			struct page *page;

			page = vm_normal_page(vma, addr, ptent);
    			if (unlikely(details) && page) {
    				/*
    				 * unmap_shared_mapping_pages() wants to
    				 * invalidate cache without truncating:
    				 * unmap shared but keep private pages.
    				 */
    				if (details->check_mapping &&
    				    details->check_mapping != page->mapping)
    					continue;
    				/*
    				 * Each page->index must be checked when
    				 * invalidating or truncating nonlinear.
    				 */
    				if (details->nonlinear_vma &&
    				    (page->index < details->first_index ||
    				     page->index > details->last_index))
    					continue;
			}
			ptent = ptep_get_and_clear_full(mm, addr, pte,
							tlb->fullmm);
    			tlb_remove_tlb_entry(tlb, pte, addr);
    			if (unlikely(!page))
    				continue;
    			if (unlikely(details) && details->nonlinear_vma
    			    && linear_page_index(details->nonlinear_vma,
						addr) != page->index)
				set_pte_at(mm, addr, pte,
					   pgoff_to_pte(page->index));
			if (PageAnon(page))
				rss[MM_ANONPAGES]--;
    			else {
    				if (pte_dirty(ptent))
					set_page_dirty(page);
				if (pte_young(ptent) &&
				    likely(!(vma->vm_flags & VM_SEQ_READ)))
					mark_page_accessed(page);
				rss[MM_FILEPAGES]--;
			}
			page_remove_rmap(page);
    
    			if (unlikely(page_mapcount(page) < 0))
				print_bad_pte(vma, addr, ptent, page);
    			force_flush = !__tlb_remove_page(tlb, page);
    			if (force_flush)
				break;
    			continue;
    		}
    		/*
    		 * If details->check_mapping, we leave swap entries;
    		 * if details->nonlinear_vma, we leave file entries.
    		 */
    		if (unlikely(details))
    			continue;
    
    		if (pte_file(ptent)) {
    			if (unlikely(!(vma->vm_flags & VM_NONLINEAR)))
				print_bad_pte(vma, addr, ptent, NULL);
    		} else {
    			swp_entry_t entry = pte_to_swp_entry(ptent);
    
    			if (!non_swap_entry(entry))
    				rss[MM_SWAPENTS]--;
    
    			else if (is_migration_entry(entry)) {
    				struct page *page;
    
    				page = migration_entry_to_page(entry);
    
    				if (PageAnon(page))
    					rss[MM_ANONPAGES]--;
    				else
    					rss[MM_FILEPAGES]--;
			}
    			if (unlikely(!free_swap_and_cache(entry)))
    				print_bad_pte(vma, addr, ptent, NULL);
    		}
    
    		pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
    
    	} while (pte++, addr += PAGE_SIZE, addr != end);
    
    	add_mm_rss_vec(mm, rss);
    
    	arch_leave_lazy_mmu_mode();
    
	pte_unmap_unlock(start_pte, ptl);

    	/*
    	 * mmu_gather ran out of room to batch pages, we break out of
    	 * the PTE lock to avoid doing the potential expensive TLB invalidate
    	 * and page-free while holding it.
    	 */
	if (force_flush) {
		force_flush = 0;

#ifdef HAVE_GENERIC_MMU_GATHER
		tlb->start = range_start;
		tlb->end = addr;
#endif
		tlb_flush_mmu(tlb);
		if (addr != end) {
			range_start = addr;
			goto again;
		}
	}

	return addr;
}

static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
				struct vm_area_struct *vma, pud_t *pud,
				unsigned long addr, unsigned long end,
				struct zap_details *details)
{
    	pmd_t *pmd;
    	unsigned long next;
    
    	pmd = pmd_offset(pud, addr);
    	do {
		next = pmd_addr_end(addr, end);
		if (pmd_trans_huge(*pmd)) {
			if (next - addr != HPAGE_PMD_SIZE) {
#ifdef CONFIG_DEBUG_VM
				if (!rwsem_is_locked(&tlb->mm->mmap_sem)) {
					pr_err("%s: mmap_sem is unlocked! addr=0x%lx end=0x%lx vma->vm_start=0x%lx vma->vm_end=0x%lx\n",
						__func__, addr, end,
						vma->vm_start,
						vma->vm_end);
					BUG();
				}
#endif
				split_huge_page_pmd(vma, addr, pmd);
			} else if (zap_huge_pmd(tlb, vma, pmd, addr))
				goto next;
			/* fall through */
		}
    
    		/*
    		 * Here there can be other concurrent MADV_DONTNEED or
    		 * trans huge page faults running, and if the pmd is
    		 * none or trans huge it can change under us. This is
    		 * because MADV_DONTNEED holds the mmap_sem in read
    		 * mode.
    		 */
    		if (pmd_none_or_trans_huge_or_clear_bad(pmd))
			goto next;
		next = zap_pte_range(tlb, vma, pmd, addr, next, details);
next:
		cond_resched();
	} while (pmd++, addr = next, addr != end);

	return addr;
}

static inline unsigned long zap_pud_range(struct mmu_gather *tlb,
				struct vm_area_struct *vma, pgd_t *pgd,
				unsigned long addr, unsigned long end,
				struct zap_details *details)
{
    	pud_t *pud;
    	unsigned long next;
    
    	pud = pud_offset(pgd, addr);
    	do {
		next = pud_addr_end(addr, end);
		if (pud_none_or_clear_bad(pud))
			continue;
		next = zap_pmd_range(tlb, vma, pud, addr, next, details);
	} while (pud++, addr = next, addr != end);

	return addr;
}

    static void unmap_page_range(struct mmu_gather *tlb,
    			     struct vm_area_struct *vma,
    			     unsigned long addr, unsigned long end,
			     struct zap_details *details)
    {
    	pgd_t *pgd;
    	unsigned long next;
    
    	if (details && !details->check_mapping && !details->nonlinear_vma)
    		details = NULL;
    
    	BUG_ON(addr >= end);
    
	mem_cgroup_uncharge_start();
    	tlb_start_vma(tlb, vma);
    	pgd = pgd_offset(vma->vm_mm, addr);
    	do {
    		next = pgd_addr_end(addr, end);
    
		if (pgd_none_or_clear_bad(pgd))
    			continue;
    
    		next = zap_pud_range(tlb, vma, pgd, addr, next, details);
	} while (pgd++, addr = next, addr != end);
    	tlb_end_vma(tlb, vma);
    
	mem_cgroup_uncharge_end();
    }
    
    
    static void unmap_single_vma(struct mmu_gather *tlb,
		struct vm_area_struct *vma, unsigned long start_addr,
		unsigned long end_addr,
		struct zap_details *details)
    {
    	unsigned long start = max(vma->vm_start, start_addr);
    	unsigned long end;
    
    	if (start >= vma->vm_end)
    		return;
    	end = min(vma->vm_end, end_addr);
    	if (end <= vma->vm_start)
    		return;
    
    
    	if (vma->vm_file)
    		uprobe_munmap(vma, start, end);
    
    
	if (unlikely(vma->vm_flags & VM_PFNMAP))
		untrack_pfn(vma, 0, 0);

    	if (start != end) {
    		if (unlikely(is_vm_hugetlb_page(vma))) {
    			/*
    			 * It is undesirable to test vma->vm_file as it
    			 * should be non-null for valid hugetlb area.
    			 * However, vm_file will be NULL in the error
    			 * cleanup path of do_mmap_pgoff. When
    			 * hugetlbfs ->mmap method fails,
    			 * do_mmap_pgoff() nullifies vma->vm_file
    			 * before calling this function to clean up.
    			 * Since no pte has actually been setup, it is
    			 * safe to do nothing in this case.
    			 */
    
    			if (vma->vm_file) {
    				mutex_lock(&vma->vm_file->f_mapping->i_mmap_mutex);
    
    				__unmap_hugepage_range_final(tlb, vma, start, end, NULL);
    
    				mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex);
    			}
    
    		} else
    			unmap_page_range(tlb, vma, start, end, details);
	}
    }
    
    /**
 * unmap_vmas - unmap a range of memory covered by a list of vma's
 * @tlb: address of the caller's struct mmu_gather
     * @vma: the starting vma
     * @start_addr: virtual address at which to start unmapping
     * @end_addr: virtual address at which to end unmapping
 *
 * Unmap all pages in the vma list.
     *
     * Only addresses between `start' and `end' will be unmapped.
     *
     * The VMA list must be sorted in ascending virtual address order.
     *
     * unmap_vmas() assumes that the caller will flush the whole unmapped address
     * range after unmap_vmas() returns.  So the only responsibility here is to
     * ensure that any thus-far unmapped pages are flushed before unmap_vmas()
     * drops the lock and schedules.
 */
void unmap_vmas(struct mmu_gather *tlb,
		struct vm_area_struct *vma, unsigned long start_addr,
		unsigned long end_addr)
{
	struct mm_struct *mm = vma->vm_mm;

	mmu_notifier_invalidate_range_start(mm, start_addr, end_addr);
	for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next)
		unmap_single_vma(tlb, vma, start_addr, end_addr, NULL);
	mmu_notifier_invalidate_range_end(mm, start_addr, end_addr);
}
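
/*
 * Illustrative sketch, not part of the original file: the usual caller
 * pattern around unmap_vmas(), loosely modeled on unmap_region() and
 * exit_mmap() in mm/mmap.c. The helper name is hypothetical and the
 * free_pgtables() details are omitted.
 */
static void example_teardown_range(struct mm_struct *mm,
		struct vm_area_struct *vma,
		unsigned long start, unsigned long end)
{
	struct mmu_gather tlb;

	lru_add_drain();
	tlb_gather_mmu(&tlb, mm, 0);	/* 0: not a full-mm teardown */
	update_hiwater_rss(mm);
	unmap_vmas(&tlb, vma, start, end);
	/* a real caller would also free_pgtables() before finishing */
	tlb_finish_mmu(&tlb, start, end);
}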
    
    /**
     * zap_page_range - remove user pages in a given range
 * @vma: vm_area_struct holding the applicable pages
 * @start: starting address of pages to zap
     * @size: number of bytes to zap
 * @details: details of nonlinear truncation or shared cache invalidation
 */
void zap_page_range(struct vm_area_struct *vma, unsigned long start,
    		unsigned long size, struct zap_details *details)
    {
	struct mm_struct *mm = vma->vm_mm;
	struct mmu_gather tlb;
	unsigned long end = start + size;

	lru_add_drain();
	tlb_gather_mmu(&tlb, mm, 0);
	update_hiwater_rss(mm);
	mmu_notifier_invalidate_range_start(mm, start, end);
	for ( ; vma && vma->vm_start < end; vma = vma->vm_next)
		unmap_single_vma(&tlb, vma, start, end, details);
	mmu_notifier_invalidate_range_end(mm, start, end);
	tlb_finish_mmu(&tlb, start, end);
}
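
/*
 * Illustrative sketch, not part of the original file: an MADV_DONTNEED
 * style caller, loosely modeled on madvise_dontneed() in mm/madvise.c.
 * It only holds mmap_sem for read, which is why zap_pmd_range() above
 * must tolerate a pmd going none or trans-huge underneath it. The
 * function name and flag checks are simplified assumptions.
 */
static int example_discard_range(struct vm_area_struct *vma,
		unsigned long start, unsigned long end)
{
	if (vma->vm_flags & (VM_LOCKED | VM_HUGETLB | VM_PFNMAP))
		return -EINVAL;

	/* caller holds down_read(&vma->vm_mm->mmap_sem) */
	zap_page_range(vma, start, end - start, NULL);
	return 0;
}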
    
    /**
     * zap_page_range_single - remove user pages in a given range
     * @vma: vm_area_struct holding the applicable pages
     * @address: starting address of pages to zap
     * @size: number of bytes to zap
     * @details: details of nonlinear truncation or shared cache invalidation
     *
 * The range must fit into one VMA.
 */
static void zap_page_range_single(struct vm_area_struct *vma, unsigned long address,
    		unsigned long size, struct zap_details *details)
    {
	struct mm_struct *mm = vma->vm_mm;
	struct mmu_gather tlb;
	unsigned long end = address + size;

	lru_add_drain();
	tlb_gather_mmu(&tlb, mm, 0);
	update_hiwater_rss(mm);
	mmu_notifier_invalidate_range_start(mm, address, end);
	unmap_single_vma(&tlb, vma, address, end, details);
	mmu_notifier_invalidate_range_end(mm, address, end);
	tlb_finish_mmu(&tlb, address, end);
}
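
/*
 * Illustrative sketch, not part of the original file: how an
 * invalidation path can fill struct zap_details before calling
 * zap_page_range_single(), in the spirit of unmap_mapping_range().
 * The field choices shown are assumptions for illustration.
 */
static void example_unmap_mapping_pages(struct vm_area_struct *vma,
		unsigned long start, unsigned long size,
		struct address_space *mapping,
		pgoff_t first_index, pgoff_t last_index,
		int even_cows)
{
	struct zap_details details;

	/* unmap shared pages; skip private COWed copies unless even_cows */
	details.check_mapping = even_cows ? NULL : mapping;
	details.nonlinear_vma = NULL;
	details.first_index = first_index;
	details.last_index = last_index;

	zap_page_range_single(vma, start, size, &details);
}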
    
    /**
     * zap_vma_ptes - remove ptes mapping the vma
     * @vma: vm_area_struct holding ptes to be zapped
     * @address: starting address of pages to zap
     * @size: number of bytes to zap
     *
     * This function only unmaps ptes assigned to VM_PFNMAP vmas.
     *
     * The entire address range must be fully contained within the vma.
     *
     * Returns 0 if successful.
     */
    int zap_vma_ptes(struct vm_area_struct *vma, unsigned long address,
    		unsigned long size)
    {
    	if (address < vma->vm_start || address + size > vma->vm_end ||
    	    		!(vma->vm_flags & VM_PFNMAP))
    		return -1;
    
    	zap_page_range_single(vma, address, size, NULL);
    
    	return 0;
    }
    EXPORT_SYMBOL_GPL(zap_vma_ptes);
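
/*
 * Illustrative sketch, not part of the original file: a driver that set
 * up a VM_PFNMAP mapping (e.g. with remap_pfn_range()) tearing the ptes
 * down again via zap_vma_ptes(). The surrounding driver structure is a
 * hypothetical assumption.
 */
static void example_drv_unmap_bar(struct vm_area_struct *vma)
{
	/* the range must lie fully inside a VM_PFNMAP vma */
	if (zap_vma_ptes(vma, vma->vm_start, vma->vm_end - vma->vm_start))
		pr_warn("example_drv: zap_vma_ptes() failed\n");
}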
    
    
/**
 * follow_page_mask - look up a page descriptor from a user-virtual address
 * @vma: vm_area_struct mapping @address
 * @address: virtual address to look up
 * @flags: flags modifying lookup behaviour
 * @page_mask: on output, *page_mask is set according to the size of the page
 *
 * @flags can have FOLL_ flags set, defined in <linux/mm.h>
 *
 * Returns the mapped (struct page *), %NULL if no mapping exists, or
 * an error pointer if there is a mapping to something not represented
 * by a page descriptor (see also vm_normal_page()).
     */
    
    struct page *follow_page_mask(struct vm_area_struct *vma,
    			      unsigned long address, unsigned int flags,
			      unsigned int *page_mask)
    {
    	pgd_t *pgd;
    	pud_t *pud;
    	pmd_t *pmd;
	pte_t *ptep, pte;
	spinlock_t *ptl;
	struct page *page;
	struct mm_struct *mm = vma->vm_mm;

	*page_mask = 0;

    	page = follow_huge_addr(mm, address, flags & FOLL_WRITE);
    	if (!IS_ERR(page)) {
    		BUG_ON(flags & FOLL_GET);
    		goto out;
	}

    	pgd = pgd_offset(mm, address);
	if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
		goto no_page_table;

	pud = pud_offset(pgd, address);
	if (pud_none(*pud))
		goto no_page_table;
	if (pud_huge(*pud) && vma->vm_flags & VM_HUGETLB) {
    		BUG_ON(flags & FOLL_GET);
    		page = follow_huge_pud(mm, address, pud, flags & FOLL_WRITE);
    		goto out;
    	}
    	if (unlikely(pud_bad(*pud)))
		goto no_page_table;

	pmd = pmd_offset(pud, address);
	if (pmd_none(*pmd))
		goto no_page_table;
	if (pmd_huge(*pmd) && vma->vm_flags & VM_HUGETLB) {
		BUG_ON(flags & FOLL_GET);
		page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE);
		goto out;
	}
    	if ((flags & FOLL_NUMA) && pmd_numa(*pmd))
    		goto no_page_table;
    
    	if (pmd_trans_huge(*pmd)) {
    
    		if (flags & FOLL_SPLIT) {
    
    			split_huge_page_pmd(vma, address, pmd);
    
    			goto split_fallthrough;
    		}
    
    		spin_lock(&mm->page_table_lock);
    		if (likely(pmd_trans_huge(*pmd))) {
    			if (unlikely(pmd_trans_splitting(*pmd))) {
    				spin_unlock(&mm->page_table_lock);
    				wait_split_huge_page(vma->anon_vma, pmd);
    			} else {
    
    				page = follow_trans_huge_pmd(vma, address,
    
    							     pmd, flags);
    				spin_unlock(&mm->page_table_lock);
    
    				*page_mask = HPAGE_PMD_NR - 1;
    
    				goto out;
    			}
    		} else
    			spin_unlock(&mm->page_table_lock);
    		/* fall through */
    	}
    
    split_fallthrough:
    
    	if (unlikely(pmd_bad(*pmd)))
    		goto no_page_table;
    
    
	ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
	pte = *ptep;
    	if (!pte_present(pte)) {
    		swp_entry_t entry;
    		/*
    		 * KSM's break_ksm() relies upon recognizing a ksm page
    		 * even while it is being migrated, so for that case we
    		 * need migration_entry_wait().
    		 */
    		if (likely(!(flags & FOLL_MIGRATION)))
    			goto no_page;
    		if (pte_none(pte) || pte_file(pte))
    			goto no_page;
    		entry = pte_to_swp_entry(pte);
    		if (!is_migration_entry(entry))
    			goto no_page;
    		pte_unmap_unlock(ptep, ptl);
    		migration_entry_wait(mm, pmd, address);
    		goto split_fallthrough;
    	}
    
    	if ((flags & FOLL_NUMA) && pte_numa(pte))
    		goto no_page;
    
    	if ((flags & FOLL_WRITE) && !pte_write(pte))
    		goto unlock;
    
	page = vm_normal_page(vma, address, pte);
	if (unlikely(!page)) {
		if ((flags & FOLL_DUMP) ||
		    !is_zero_pfn(pte_pfn(pte)))
			goto bad_page;
    		page = pte_page(pte);
	}

	if (flags & FOLL_GET)
		get_page_foll(page);
    	if (flags & FOLL_TOUCH) {
    		if ((flags & FOLL_WRITE) &&
    		    !pte_dirty(pte) && !PageDirty(page))
    			set_page_dirty(page);
    
    		/*
    		 * pte_mkyoung() would be more correct here, but atomic care
    		 * is needed to avoid losing the dirty bit: it is easier to use
    		 * mark_page_accessed().
    		 */
    
    		mark_page_accessed(page);
    	}
    
    	if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) {
    
    		/*
    		 * The preliminary mapping check is mainly to avoid the
    		 * pointless overhead of lock_page on the ZERO_PAGE
    		 * which might bounce very badly if there is contention.
    		 *
    		 * If the page is already locked, we don't need to
    		 * handle it now - vmscan will handle it later if and
    		 * when it attempts to reclaim the page.
    		 */
    		if (page->mapping && trylock_page(page)) {
    			lru_add_drain();  /* push cached pages to LRU */
    			/*
    
    			 * Because we lock page here, and migration is
    			 * blocked by the pte's page reference, and we
    			 * know the page is still mapped, we don't even
			 * need to check for file-cache page truncation.
			 */
			mlock_vma_page(page);
			unlock_page(page);
		}
	}
    unlock:
	pte_unmap_unlock(ptep, ptl);
out:
	return page;

    bad_page:
    	pte_unmap_unlock(ptep, ptl);
    	return ERR_PTR(-EFAULT);
    
    no_page:
    	pte_unmap_unlock(ptep, ptl);
    	if (!pte_none(pte))
    		return page;
    
    no_page_table:
    	/*
    	 * When core dumping an enormous anonymous area that nobody
    
    	 * has touched so far, we don't want to allocate unnecessary pages or
    	 * page tables.  Return error instead of NULL to skip handle_mm_fault,
    	 * then get_dump_page() will return NULL to leave a hole in the dump.
    	 * But we can only make this optimization where a hole would surely
	 * be zero-filled if handle_mm_fault() actually did handle it.
	 */
	if ((flags & FOLL_DUMP) &&
	    (!vma->vm_ops || !vma->vm_ops->fault))
		return ERR_PTR(-EFAULT);
	return page;
}
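
/*
 * Illustrative sketch, not part of the original file: most callers reach
 * follow_page_mask() through the follow_page() wrapper in <linux/mm.h>,
 * which discards the page-mask output. Assumes that wrapper and FOLL_GET
 * semantics; with FOLL_GET the reference must be dropped with put_page().
 */
static int example_page_is_mapped(struct vm_area_struct *vma,
				  unsigned long addr)
{
	struct page *page;

	/* caller holds down_read(&vma->vm_mm->mmap_sem) */
	page = follow_page(vma, addr, FOLL_GET);
	if (IS_ERR_OR_NULL(page))
		return 0;	/* no struct page behind this address */
	put_page(page);
	return 1;
}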
    
    static inline int stack_guard_page(struct vm_area_struct *vma, unsigned long addr)
{
	return stack_guard_page_start(vma, addr) ||
	       stack_guard_page_end(vma, addr+PAGE_SIZE);
}

    /**
     * __get_user_pages() - pin user pages in memory
     * @tsk:	task_struct of target task
     * @mm:		mm_struct of target mm
     * @start:	starting user address
     * @nr_pages:	number of pages from start to pin
     * @gup_flags:	flags modifying pin behaviour
     * @pages:	array that receives pointers to the pages pinned.
     *		Should be at least nr_pages long. Or NULL, if caller
     *		only intends to ensure the pages are faulted in.
     * @vmas:	array of pointers to vmas corresponding to each page.
     *		Or NULL if the caller does not require them.
     * @nonblocking: whether waiting for disk IO or mmap_sem contention
     *
     * Returns number of pages pinned. This may be fewer than the number
     * requested. If nr_pages is 0 or negative, returns 0. If no pages
     * were pinned, returns -errno. Each page returned must be released
     * with a put_page() call when it is finished with. vmas will only
     * remain valid while mmap_sem is held.
     *
     * Must be called with mmap_sem held for read or write.
     *
     * __get_user_pages walks a process's page tables and takes a reference to
     * each struct page that each user address corresponds to at a given
     * instant. That is, it takes the page that would be accessed if a user
     * thread accesses the given user virtual address at that instant.
     *
     * This does not guarantee that the page exists in the user mappings when
     * __get_user_pages returns, and there may even be a completely different
     * page there in some cases (eg. if mmapped pagecache has been invalidated
     * and subsequently re faulted). However it does guarantee that the page
     * won't be freed completely. And mostly callers simply care that the page
     * contains data that was valid *at some point in time*. Typically, an IO
     * or similar operation cannot guarantee anything stronger anyway because
     * locks can't be held over the syscall boundary.
     *
     * If @gup_flags & FOLL_WRITE == 0, the page must not be written to. If
     * the page is written to, set_page_dirty (or set_page_dirty_lock, as
     * appropriate) must be called after the page is finished with, and
     * before put_page is called.
     *
     * If @nonblocking != NULL, __get_user_pages will not wait for disk IO
     * or mmap_sem contention, and if waiting is needed to pin all pages,
     * *@nonblocking will be set to 0.
     *
     * In most cases, get_user_pages or get_user_pages_fast should be used
     * instead of __get_user_pages. __get_user_pages should be used only if
     * you need some special @gup_flags.
     */
    
    long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
    		unsigned long start, unsigned long nr_pages,
    		unsigned int gup_flags, struct page **pages,
		struct vm_area_struct **vmas, int *nonblocking)
{
	long i;
	unsigned long vm_flags;
	unsigned int page_mask;

	if (!nr_pages)
		return 0;

	VM_BUG_ON(!!pages != !!(gup_flags & FOLL_GET));

    	/* 
	 * Require read or write permissions.
	 * If FOLL_FORCE is set, we only require the "MAY" flags.
	 */
    
    	vm_flags  = (gup_flags & FOLL_WRITE) ?
    			(VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD);
    	vm_flags &= (gup_flags & FOLL_FORCE) ?
    			(VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE);
    
    
    	/*
    	 * If FOLL_FORCE and FOLL_NUMA are both set, handle_mm_fault
    	 * would be called on PROT_NONE ranges. We must never invoke
    	 * handle_mm_fault on PROT_NONE ranges or the NUMA hinting
    	 * page faults would unprotect the PROT_NONE ranges if
    	 * _PAGE_NUMA and _PAGE_PROTNONE are sharing the same pte/pmd
    	 * bitflag. So to avoid that, don't set FOLL_NUMA if
    	 * FOLL_FORCE is set.
    	 */
    	if (!(gup_flags & FOLL_FORCE))
		gup_flags |= FOLL_NUMA;

	i = 0;

	do {
		struct vm_area_struct *vma;

		vma = find_extend_vma(mm, start);
		if (!vma && in_gate_area(mm, start)) {
    			unsigned long pg = start & PAGE_MASK;
    			pgd_t *pgd;
    			pud_t *pud;
    			pmd_t *pmd;
    			pte_t *pte;
    
    
			/* user gate pages are read-only */
			if (gup_flags & FOLL_WRITE)
    				return i ? : -EFAULT;
    			if (pg > TASK_SIZE)
    				pgd = pgd_offset_k(pg);
    			else
    				pgd = pgd_offset_gate(mm, pg);
    			BUG_ON(pgd_none(*pgd));
    			pud = pud_offset(pgd, pg);
    			BUG_ON(pud_none(*pud));
    			pmd = pmd_offset(pud, pg);
    
    			if (pmd_none(*pmd))
    				return i ? : -EFAULT;
    
			VM_BUG_ON(pmd_trans_huge(*pmd));
    			pte = pte_offset_map(pmd, pg);
    
    			if (pte_none(*pte)) {
    				pte_unmap(pte);
    				return i ? : -EFAULT;
    			}
    
    			vma = get_gate_vma(mm);
    
    Linus Torvalds's avatar
    Linus Torvalds committed
			if (pages) {
				struct page *page;

				page = vm_normal_page(vma, start, *pte);
				if (!page) {
					if (!(gup_flags & FOLL_DUMP) &&
					     is_zero_pfn(pte_pfn(*pte)))
						page = pte_page(*pte);
					else {
						pte_unmap(pte);
						return i ? : -EFAULT;
					}
				}
				pages[i] = page;
				get_page(page);
			}
			pte_unmap(pte);
			page_mask = 0;
			goto next_page;
    
    		if (!vma ||
		    (vma->vm_flags & (VM_IO | VM_PFNMAP)) ||
		    !(vm_flags & vma->vm_flags))
    			return i ? : -EFAULT;
    
    
		if (is_vm_hugetlb_page(vma)) {
			i = follow_hugetlb_page(mm, vma, pages, vmas,
					&start, &nr_pages, i, gup_flags);
			continue;
		}

		do {
			struct page *page;
			unsigned int foll_flags = gup_flags;
			unsigned int page_increm;

			/*
			 * If we have a pending SIGKILL, don't keep faulting
			 * pages and potentially allocating memory.
			 */
			if (unlikely(fatal_signal_pending(current)))
				return i ? i : -ERESTARTSYS;

			cond_resched();
    			while (!(page = follow_page_mask(vma, start,
						foll_flags, &page_mask))) {
				int ret;
				unsigned int fault_flags = 0;

				/* For mlock, just skip the stack guard page. */
				if (foll_flags & FOLL_MLOCK) {
					if (stack_guard_page(vma, start))
						goto next_page;
				}
				if (foll_flags & FOLL_WRITE)
					fault_flags |= FAULT_FLAG_WRITE;
				if (nonblocking)
					fault_flags |= FAULT_FLAG_ALLOW_RETRY;
				if (foll_flags & FOLL_NOWAIT)
					fault_flags |= (FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_RETRY_NOWAIT);

				ret = handle_mm_fault(mm, vma, start,
							fault_flags);

    				if (ret & VM_FAULT_ERROR) {
    					if (ret & VM_FAULT_OOM)
    						return i ? i : -ENOMEM;
    
    					if (ret & (VM_FAULT_HWPOISON |
    						   VM_FAULT_HWPOISON_LARGE)) {
    						if (i)
    							return i;
    						else if (gup_flags & FOLL_HWPOISON)
    							return -EHWPOISON;
    						else
    							return -EFAULT;
    					}
					if (ret & VM_FAULT_SIGBUS)
    						return i ? i : -EFAULT;
    					BUG();
    				}
    
    
    				if (tsk) {
    					if (ret & VM_FAULT_MAJOR)
    						tsk->maj_flt++;
    					else
    						tsk->min_flt++;
    				}
    
    				if (ret & VM_FAULT_RETRY) {
    
    					if (nonblocking)
						*nonblocking = 0;
					return i;
				}

				/*
    				 * The VM_FAULT_WRITE bit tells us that
    				 * do_wp_page has broken COW when necessary,
    				 * even if maybe_mkwrite decided not to set
    				 * pte_write. We can thus safely do subsequent
    
    				 * page lookups as if they were reads. But only
    				 * do so when looping for pte_write is futile:
    				 * in some cases userspace may also be wanting
    				 * to write to the gotten user page, which a
    				 * read fault here might prevent (a readonly
				 * page might get reCOWed by userspace write).
				 */
				if ((ret & VM_FAULT_WRITE) &&
				    !(vma->vm_flags & VM_WRITE))
					foll_flags &= ~FOLL_WRITE;

				cond_resched();
    			}
    
    			if (IS_ERR(page))
				return i ? i : PTR_ERR(page);

			if (pages) {
				pages[i] = page;

				flush_anon_page(vma, page, start);
				flush_dcache_page(page);
				page_mask = 0;
			}
next_page:
			if (vmas) {
				vmas[i] = vma;
				page_mask = 0;
			}
    			page_increm = 1 + (~(start >> PAGE_SHIFT) & page_mask);
    			if (page_increm > nr_pages)
    				page_increm = nr_pages;
    			i += page_increm;
    			start += page_increm * PAGE_SIZE;
    			nr_pages -= page_increm;
    
    		} while (nr_pages && start < vma->vm_end);
	} while (nr_pages);
    	return i;
    }
    
    EXPORT_SYMBOL(__get_user_pages);
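
/*
 * Illustrative sketch, not part of the original file: the common way to
 * use this era's get_user_pages() (documented below) to pin a user
 * buffer for kernel-side access. The helper name is hypothetical and
 * error handling is trimmed to the essentials.
 */
static long example_pin_user_buffer(unsigned long start, unsigned long nr_pages,
				    int write, struct page **pages)
{
	struct mm_struct *mm = current->mm;
	long pinned, i;

	down_read(&mm->mmap_sem);
	pinned = get_user_pages(current, mm, start, nr_pages,
				write, 0 /* force */, pages, NULL);
	up_read(&mm->mmap_sem);
	if (pinned <= 0)
		return pinned ? pinned : -EFAULT;

	/* ... access the pages; set_page_dirty_lock() any page written ... */

	for (i = 0; i < pinned; i++)
		put_page(pages[i]);
	return pinned;
}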
    
    /*
     * fixup_user_fault() - manually resolve a user page fault
     * @tsk:	the task_struct to use for page fault accounting, or
     *		NULL if faults are not to be recorded.
     * @mm:		mm_struct of target mm
     * @address:	user address
     * @fault_flags:flags to pass down to handle_mm_fault()
     *
     * This is meant to be called in the specific scenario where for locking reasons
     * we try to access user memory in atomic context (within a pagefault_disable()
     * section), this returns -EFAULT, and we want to resolve the user fault before
     * trying again.
     *
     * Typically this is meant to be used by the futex code.
     *
     * The main difference with get_user_pages() is that this function will
     * unconditionally call handle_mm_fault() which will in turn perform all the
     * necessary SW fixup of the dirty and young bits in the PTE, while
     * handle_mm_fault() only guarantees to update these in the struct page.
     *
     * This is important for some architectures where those bits also gate the
     * access permission to the page because they are maintained in software.  On
     * such architectures, gup() will not be enough to make a subsequent access
     * succeed.
     *
     * This should be called with the mm_sem held for read.
     */
    int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm,
    		     unsigned long address, unsigned int fault_flags)
    {
    	struct vm_area_struct *vma;
    	int ret;
    
    	vma = find_extend_vma(mm, address);
    	if (!vma || address < vma->vm_start)
    		return -EFAULT;
    
    	ret = handle_mm_fault(mm, vma, address, fault_flags);
    	if (ret & VM_FAULT_ERROR) {
    		if (ret & VM_FAULT_OOM)
    			return -ENOMEM;
    		if (ret & (VM_FAULT_HWPOISON | VM_FAULT_HWPOISON_LARGE))
    			return -EHWPOISON;
    		if (ret & VM_FAULT_SIGBUS)
    			return -EFAULT;
    		BUG();
    	}
    	if (tsk) {
    		if (ret & VM_FAULT_MAJOR)
    			tsk->maj_flt++;
    		else
    			tsk->min_flt++;
    	}
    	return 0;
    }
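
/*
 * Illustrative sketch, not part of the original file: the futex-style
 * pattern described above, loosely modeled on fault_in_user_writeable()
 * in kernel/futex.c. An access made under pagefault_disable() failed, so
 * the fault is resolved explicitly and the caller retries the access.
 */
static int example_fault_in_writeable(u32 __user *uaddr)
{
	struct mm_struct *mm = current->mm;
	int ret;

	down_read(&mm->mmap_sem);
	ret = fixup_user_fault(current, mm, (unsigned long)uaddr,
			       FAULT_FLAG_WRITE);
	up_read(&mm->mmap_sem);

	return ret < 0 ? ret : 0;
}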
    
/*
 * get_user_pages() - pin user pages in memory
 * @tsk:	the task_struct to use for page fault accounting, or
 *		NULL if faults are not to be recorded.
 * @mm:		mm_struct of target mm
 * @start:	starting user address
 * @nr_pages:	number of pages from start to pin
     * @write:	whether pages will be written to by the caller
     * @force:	whether to force write access even if user mapping is
     *		readonly. This will result in the page being COWed even
     *		in MAP_SHARED mappings. You do not want this.
     * @pages:	array that receives pointers to the pages pinned.
     *		Should be at least nr_pages long. Or NULL, if caller
     *		only intends to ensure the pages are faulted in.
     * @vmas:	array of pointers to vmas corresponding to each page.
     *		Or NULL if the caller does not require them.
     *
 * Returns number of pages pinned. This may be fewer than the number
 * requested. If nr_pages is 0 or negative, returns 0. If no pages
 * were pinned, returns -errno. Each page returned must be released
     * with a put_page() call when it is finished with. vmas will only
     * remain valid while mmap_sem is held.
     *
     * Must be called with mmap_sem held for read or write.
     *
     * get_user_pages walks a process's page tables and takes a reference to
     * each struct page that each user address corresponds to at a given
     * instant. That is, it takes the page that would be accessed if a user
     * thread accesses the given user virtual address at that instant.
     *
     * This does not guarantee that the page exists in the user mappings when
     * get_user_pages returns, and there may even be a completely different
     * page there in some cases (eg. if mmapped pagecache has been invalidated
     * and subsequently re faulted). However it does guarantee that the page
     * won't be freed completely. And mostly callers simply care that the page
     * contains data that was valid *at some point in time*. Typically, an IO
     * or similar operation cannot guarantee anything stronger anyway because
     * locks can't be held over the syscall boundary.
     *
     * If write=0, the page must not be written to. If the page is written to,
     * set_page_dirty (or set_page_dirty_lock, as appropriate) must be called
     * after the page is finished with, and before put_page is called.
     *
     * get_user_pages is typically used for fewer-copy IO operations, to get a