    	do {
    
    		struct vm_area_struct *vma;
    		unsigned int foll_flags;
    
    		vma = find_extend_vma(mm, start);
    		if (!vma && in_gate_area(tsk, start)) {
    			unsigned long pg = start & PAGE_MASK;
    			struct vm_area_struct *gate_vma = get_gate_vma(tsk);
    			pgd_t *pgd;
    			pud_t *pud;
    			pmd_t *pmd;
    			pte_t *pte;
    			if (write) /* user gate pages are read-only */
    				return i ? : -EFAULT;
    			if (pg > TASK_SIZE)
    				pgd = pgd_offset_k(pg);
    			else
    				pgd = pgd_offset_gate(mm, pg);
    			BUG_ON(pgd_none(*pgd));
    			pud = pud_offset(pgd, pg);
    			BUG_ON(pud_none(*pud));
    			pmd = pmd_offset(pud, pg);
    
    			if (pmd_none(*pmd))
    				return i ? : -EFAULT;
    
    			pte = pte_offset_map(pmd, pg);
    
    			if (pte_none(*pte)) {
    				pte_unmap(pte);
    				return i ? : -EFAULT;
    			}
    
    			if (pages) {
    
    				struct page *page = vm_normal_page(gate_vma, start, *pte);
    
    				pages[i] = page;
    				if (page)
    					get_page(page);
    
    			}
    			pte_unmap(pte);
    			if (vmas)
    				vmas[i] = gate_vma;
    			i++;
    			start += PAGE_SIZE;
    			len--;
    			continue;
    		}
    
    
		if (!vma || (vma->vm_flags & (VM_IO | VM_PFNMAP))
				|| !(vm_flags & vma->vm_flags))
			return i ? : -EFAULT;
    
    		if (is_vm_hugetlb_page(vma)) {
			i = follow_hugetlb_page(mm, vma, pages, vmas,
						&start, &len, i, write);
			continue;
    		}
    
    
    		foll_flags = FOLL_TOUCH;
    		if (pages)
    			foll_flags |= FOLL_GET;
		if (!write && !(vma->vm_flags & VM_LOCKED) &&
		    (!vma->vm_ops || (!vma->vm_ops->nopage &&
					!vma->vm_ops->fault)))
			foll_flags |= FOLL_ANON;
    
    
		do {
			struct page *page;

    			/*
    			 * If tsk is ooming, cut off its access to large memory
    			 * allocations. It has a pending SIGKILL, but it can't
    			 * be processed until returning to user space.
    			 */
    			if (unlikely(test_tsk_thread_flag(tsk, TIF_MEMDIE)))
    				return -ENOMEM;
    
    
    			if (write)
    				foll_flags |= FOLL_WRITE;
    
    			cond_resched();
    
			while (!(page = follow_page(vma, start, foll_flags))) {
				int ret;

				ret = handle_mm_fault(mm, vma, start,
						foll_flags & FOLL_WRITE);
    
    				if (ret & VM_FAULT_ERROR) {
    					if (ret & VM_FAULT_OOM)
    						return i ? i : -ENOMEM;
    					else if (ret & VM_FAULT_SIGBUS)
    						return i ? i : -EFAULT;
    					BUG();
    				}
    				if (ret & VM_FAULT_MAJOR)
    					tsk->maj_flt++;
    				else
					tsk->min_flt++;

				/*
    				 * The VM_FAULT_WRITE bit tells us that
    				 * do_wp_page has broken COW when necessary,
    				 * even if maybe_mkwrite decided not to set
    				 * pte_write. We can thus safely do subsequent
    				 * page lookups as if they were reads.
    
    				 */
				if (ret & VM_FAULT_WRITE)
					foll_flags &= ~FOLL_WRITE;
    
    			}
			if (pages) {
				pages[i] = page;
				flush_anon_page(vma, page, start);
				flush_dcache_page(page);
			}
    			if (vmas)
    				vmas[i] = vma;
    			i++;
    			start += PAGE_SIZE;
    			len--;
    
    		} while (len && start < vma->vm_end);
    	} while (len);
    
    	return i;
    }
    EXPORT_SYMBOL(get_user_pages);
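
/*
 * Illustrative sketch (not part of the original file): a typical in-kernel
 * caller pins a user buffer with get_user_pages() while holding mmap_sem
 * for reading, then drops the page references when done.  The user address,
 * page count and the pin_user_buffer() wrapper are assumptions made for
 * this example only.
 *
 *	static int pin_user_buffer(unsigned long uaddr, int nr_pages,
 *				   struct page **pages)
 *	{
 *		int got;
 *
 *		down_read(&current->mm->mmap_sem);
 *		got = get_user_pages(current, current->mm, uaddr & PAGE_MASK,
 *				     nr_pages, 1, 0, pages, NULL);
 *		up_read(&current->mm->mmap_sem);
 *		return got;
 *	}
 *
 * "got" is the number of pages actually pinned (possibly fewer than
 * requested) or a negative errno; each page pinned with FOLL_GET must later
 * be released with page_cache_release().
 */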
    
    
    pte_t *get_locked_pte(struct mm_struct *mm, unsigned long addr,
    			spinlock_t **ptl)
    
    {
    	pgd_t * pgd = pgd_offset(mm, addr);
    	pud_t * pud = pud_alloc(mm, pgd, addr);
    	if (pud) {
    
    		pmd_t * pmd = pmd_alloc(mm, pud, addr);
    
    		if (pmd)
    			return pte_alloc_map_lock(mm, pmd, addr, ptl);
    	}
    	return NULL;
    }
    
    
    /*
     * This is the old fallback for page remapping.
     *
     * For historical reasons, it only allows reserved pages. Only
     * old drivers should use this, and they needed to mark their
     * pages reserved for the old functions anyway.
     */
    static int insert_page(struct mm_struct *mm, unsigned long addr, struct page *page, pgprot_t prot)
    {
	int retval;
	pte_t *pte;
	spinlock_t *ptl;
    
    
    	retval = mem_cgroup_charge(page, mm, GFP_KERNEL);
    
    	if (retval)
    		goto out;
    
	retval = -EINVAL;
	if (PageAnon(page))
		goto out_uncharge;
    
    	retval = -ENOMEM;
    	flush_dcache_page(page);
    
	pte = get_locked_pte(mm, addr, &ptl);
	if (!pte)
		goto out_uncharge;
    
    	retval = -EBUSY;
    	if (!pte_none(*pte))
    		goto out_unlock;
    
    	/* Ok, finally just insert the thing.. */
    	get_page(page);
    	inc_mm_counter(mm, file_rss);
    	page_add_file_rmap(page);
    	set_pte_at(mm, addr, pte, mk_pte(page, prot));
    
    	retval = 0;
    
    	pte_unmap_unlock(pte, ptl);
    	return retval;
    
    out_unlock:
    	pte_unmap_unlock(pte, ptl);
    
    out_uncharge:
	mem_cgroup_uncharge_page(page);
out:
	return retval;
}

    /**
     * vm_insert_page - insert single page into user vma
     * @vma: user vma to map to
     * @addr: target user address of this page
     * @page: source kernel page
     *
    
     * This allows drivers to insert individual pages they've allocated
     * into a user vma.
     *
     * The page has to be a nice clean _individual_ kernel allocation.
     * If you allocate a compound page, you need to have marked it as
     * such (__GFP_COMP), or manually just split the page up yourself
    
     * (see split_page()).
    
     *
     * NOTE! Traditionally this was done with "remap_pfn_range()" which
     * took an arbitrary page protection parameter. This doesn't allow
     * that. Your vma protection will have to be set up correctly, which
     * means that if you want a shared writable mapping, you'd better
     * ask for a shared writable mapping!
     *
     * The page does not need to be reserved.
     */
    int vm_insert_page(struct vm_area_struct *vma, unsigned long addr, struct page *page)
    {
    	if (addr < vma->vm_start || addr >= vma->vm_end)
    		return -EFAULT;
    	if (!page_count(page))
    		return -EINVAL;
    
    	vma->vm_flags |= VM_INSERTPAGE;
    
    	return insert_page(vma->vm_mm, addr, page, vma->vm_page_prot);
    }
    
    EXPORT_SYMBOL(vm_insert_page);
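
/*
 * Illustrative sketch (not part of the original file): a driver whose
 * buffer is built from individual order-0 kernel pages can map it into
 * userspace from its ->mmap handler with vm_insert_page().  The
 * my_buf_pages[] array and its bounds are assumptions for this example.
 *
 *	static int mydrv_mmap(struct file *file, struct vm_area_struct *vma)
 *	{
 *		unsigned long addr = vma->vm_start;
 *		unsigned long i = vma->vm_pgoff;
 *
 *		while (addr < vma->vm_end) {
 *			int err = vm_insert_page(vma, addr, my_buf_pages[i++]);
 *			if (err)
 *				return err;
 *			addr += PAGE_SIZE;
 *		}
 *		return 0;
 *	}
 *
 * The pages remain owned by the driver and must stay allocated for the
 * lifetime of the mapping.
 */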
    
    /**
     * vm_insert_pfn - insert single pfn into user vma
     * @vma: user vma to map to
     * @addr: target user address of this page
     * @pfn: source kernel pfn
     *
 * Similar to vm_insert_page, this allows drivers to insert individual pages
     * they've allocated into a user vma. Same comments apply.
     *
     * This function should only be called from a vm_ops->fault handler, and
     * in that case the handler should return NULL.
     */
    int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
    		unsigned long pfn)
    {
    	struct mm_struct *mm = vma->vm_mm;
    	int retval;
    	pte_t *pte, entry;
    	spinlock_t *ptl;
    
    	BUG_ON(!(vma->vm_flags & VM_PFNMAP));
    	BUG_ON(is_cow_mapping(vma->vm_flags));
    
    	retval = -ENOMEM;
    	pte = get_locked_pte(mm, addr, &ptl);
    	if (!pte)
    		goto out;
    	retval = -EBUSY;
    	if (!pte_none(*pte))
    		goto out_unlock;
    
    	/* Ok, finally just insert the thing.. */
    	entry = pfn_pte(pfn, vma->vm_page_prot);
    	set_pte_at(mm, addr, pte, entry);
    	update_mmu_cache(vma, addr, entry);
    
    	retval = 0;
    out_unlock:
    	pte_unmap_unlock(pte, ptl);
    
    out:
    	return retval;
    }
    EXPORT_SYMBOL(vm_insert_pfn);
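
/*
 * Illustrative sketch (not part of the original file): a driver backing a
 * VM_PFNMAP vma that has no struct pages would, from its fault path, work
 * out which pfn belongs at the faulting address and hand it to
 * vm_insert_pfn().  The my_dev_base physical base and the address
 * arithmetic are assumptions for this example; the surrounding
 * vm_ops->fault plumbing is omitted.
 *
 *	unsigned long pfn = (my_dev_base >> PAGE_SHIFT) + vma->vm_pgoff +
 *			((address - vma->vm_start) >> PAGE_SHIFT);
 *
 *	err = vm_insert_pfn(vma, address & PAGE_MASK, pfn);
 *
 * The vma must already have VM_PFNMAP set (normally done in the driver's
 * ->mmap handler), otherwise the BUG_ON above fires.
 */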
    
    
    /*
 * maps a range of physical memory into the requested pages. the old
 * mappings are removed. any references to nonexistent pages result
 * in null mappings (currently treated as "copy-on-access")
     */
    static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd,
    			unsigned long addr, unsigned long end,
    			unsigned long pfn, pgprot_t prot)
    {
	pte_t *pte;
	spinlock_t *ptl;

	pte = pte_alloc_map_lock(mm, pmd, addr, &ptl);
	if (!pte)
		return -ENOMEM;
	arch_enter_lazy_mmu_mode();
	do {
		BUG_ON(!pte_none(*pte));
		set_pte_at(mm, addr, pte, pfn_pte(pfn, prot));
		pfn++;
	} while (pte++, addr += PAGE_SIZE, addr != end);
	arch_leave_lazy_mmu_mode();
	pte_unmap_unlock(pte - 1, ptl);
	return 0;
    }
    
    static inline int remap_pmd_range(struct mm_struct *mm, pud_t *pud,
    			unsigned long addr, unsigned long end,
    			unsigned long pfn, pgprot_t prot)
    {
    	pmd_t *pmd;
    	unsigned long next;
    
    	pfn -= addr >> PAGE_SHIFT;
    	pmd = pmd_alloc(mm, pud, addr);
    	if (!pmd)
    		return -ENOMEM;
    	do {
    		next = pmd_addr_end(addr, end);
    		if (remap_pte_range(mm, pmd, addr, next,
    				pfn + (addr >> PAGE_SHIFT), prot))
    			return -ENOMEM;
    	} while (pmd++, addr = next, addr != end);
    	return 0;
    }
    
    static inline int remap_pud_range(struct mm_struct *mm, pgd_t *pgd,
    			unsigned long addr, unsigned long end,
    			unsigned long pfn, pgprot_t prot)
    {
    	pud_t *pud;
    	unsigned long next;
    
    	pfn -= addr >> PAGE_SHIFT;
    	pud = pud_alloc(mm, pgd, addr);
    	if (!pud)
    		return -ENOMEM;
    	do {
    		next = pud_addr_end(addr, end);
    		if (remap_pmd_range(mm, pud, addr, next,
    				pfn + (addr >> PAGE_SHIFT), prot))
    			return -ENOMEM;
    	} while (pud++, addr = next, addr != end);
    	return 0;
    }
    
    
    /**
     * remap_pfn_range - remap kernel memory to userspace
     * @vma: user vma to map to
     * @addr: target user address to start at
     * @pfn: physical address of kernel memory
     * @size: size of map area
     * @prot: page protection flags for this mapping
     *
     *  Note: this is only safe if the mm semaphore is held when called.
     */
    
    int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
    		    unsigned long pfn, unsigned long size, pgprot_t prot)
    {
    	pgd_t *pgd;
    	unsigned long next;
    
	unsigned long end = addr + PAGE_ALIGN(size);
    	struct mm_struct *mm = vma->vm_mm;
    	int err;
    
    	/*
    	 * Physically remapped pages are special. Tell the
    	 * rest of the world about it:
    	 *   VM_IO tells people not to look at these pages
    	 *	(accesses can have side effects).
    
    	 *   VM_RESERVED is specified all over the place, because
    	 *	in 2.4 it kept swapout's vma scan off this vma; but
    	 *	in 2.6 the LRU scan won't even find its pages, so this
    	 *	flag means no more than count its pages in reserved_vm,
    	 * 	and omit it from core dump, even when VM_IO turned off.
    
    	 *   VM_PFNMAP tells the core MM that the base pages are just
    	 *	raw PFN mappings, and do not have a "struct page" associated
    	 *	with them.
    
    	 *
    	 * There's a horrible special case to handle copy-on-write
    	 * behaviour that some programs depend on. We mark the "original"
    	 * un-COW'ed pages by matching them up with "vma->vm_pgoff".
    
    	 */
    
	if (is_cow_mapping(vma->vm_flags)) {
		if (addr != vma->vm_start || end != vma->vm_end)
			return -EINVAL;
    		vma->vm_pgoff = pfn;
    	}
    
    
    	vma->vm_flags |= VM_IO | VM_RESERVED | VM_PFNMAP;
    
    	BUG_ON(addr >= end);
    	pfn -= addr >> PAGE_SHIFT;
    	pgd = pgd_offset(mm, addr);
    	flush_cache_range(vma, addr, end);
    	do {
    		next = pgd_addr_end(addr, end);
    		err = remap_pud_range(mm, pgd, addr, next,
    				pfn + (addr >> PAGE_SHIFT), prot);
    		if (err)
    			break;
    	} while (pgd++, addr = next, addr != end);
    	return err;
    }
    EXPORT_SYMBOL(remap_pfn_range);
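
/*
 * Illustrative sketch (not part of the original file): the classic use of
 * remap_pfn_range() is a driver ->mmap handler exposing a physically
 * contiguous region.  my_region_phys and my_region_size are assumptions
 * for this example.
 *
 *	static int mydrv_mmap(struct file *file, struct vm_area_struct *vma)
 *	{
 *		unsigned long size = vma->vm_end - vma->vm_start;
 *
 *		if (size > my_region_size)
 *			return -EINVAL;
 *		return remap_pfn_range(vma, vma->vm_start,
 *				       my_region_phys >> PAGE_SHIFT,
 *				       size, vma->vm_page_prot);
 *	}
 */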
    
    
    static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd,
    				     unsigned long addr, unsigned long end,
    				     pte_fn_t fn, void *data)
    {
    	pte_t *pte;
    	int err;
    	struct page *pmd_page;
    
    	spinlock_t *uninitialized_var(ptl);
    
    
    	pte = (mm == &init_mm) ?
    		pte_alloc_kernel(pmd, addr) :
    		pte_alloc_map_lock(mm, pmd, addr, &ptl);
    	if (!pte)
    		return -ENOMEM;
    
    	BUG_ON(pmd_huge(*pmd));
    
    	pmd_page = pmd_page(*pmd);
    
    	do {
    		err = fn(pte, pmd_page, addr, data);
    		if (err)
    			break;
    	} while (pte++, addr += PAGE_SIZE, addr != end);
    
    	if (mm != &init_mm)
    		pte_unmap_unlock(pte-1, ptl);
    	return err;
    }
    
    static int apply_to_pmd_range(struct mm_struct *mm, pud_t *pud,
    				     unsigned long addr, unsigned long end,
    				     pte_fn_t fn, void *data)
    {
    	pmd_t *pmd;
    	unsigned long next;
    	int err;
    
    	pmd = pmd_alloc(mm, pud, addr);
    	if (!pmd)
    		return -ENOMEM;
    	do {
    		next = pmd_addr_end(addr, end);
    		err = apply_to_pte_range(mm, pmd, addr, next, fn, data);
    		if (err)
    			break;
    	} while (pmd++, addr = next, addr != end);
    	return err;
    }
    
    static int apply_to_pud_range(struct mm_struct *mm, pgd_t *pgd,
    				     unsigned long addr, unsigned long end,
    				     pte_fn_t fn, void *data)
    {
    	pud_t *pud;
    	unsigned long next;
    	int err;
    
    	pud = pud_alloc(mm, pgd, addr);
    	if (!pud)
    		return -ENOMEM;
    	do {
    		next = pud_addr_end(addr, end);
    		err = apply_to_pmd_range(mm, pud, addr, next, fn, data);
    		if (err)
    			break;
    	} while (pud++, addr = next, addr != end);
    	return err;
    }
    
    /*
     * Scan a region of virtual memory, filling in page tables as necessary
     * and calling a provided function on each leaf page table.
     */
    int apply_to_page_range(struct mm_struct *mm, unsigned long addr,
    			unsigned long size, pte_fn_t fn, void *data)
    {
    	pgd_t *pgd;
    	unsigned long next;
    	unsigned long end = addr + size;
    	int err;
    
    	BUG_ON(addr >= end);
    	pgd = pgd_offset(mm, addr);
    	do {
    		next = pgd_addr_end(addr, end);
    		err = apply_to_pud_range(mm, pgd, addr, next, fn, data);
    		if (err)
    			break;
    	} while (pgd++, addr = next, addr != end);
    	return err;
    }
    EXPORT_SYMBOL_GPL(apply_to_page_range);
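
/*
 * Illustrative sketch (not part of the original file): the pte_fn_t
 * callback is invoked once per leaf pte in the range, matching the
 * fn(pte, pmd_page, addr, data) call above.  Here a hypothetical caller
 * points every pte of a kernel range at one "scratch" page; scratch_pfn,
 * the range, and the use of PAGE_KERNEL are assumptions for this example.
 *
 *	static int set_scratch_pte(pte_t *pte, struct page *pmd_page,
 *				   unsigned long addr, void *data)
 *	{
 *		unsigned long pfn = *(unsigned long *)data;
 *
 *		set_pte_at(&init_mm, addr, pte, pfn_pte(pfn, PAGE_KERNEL));
 *		return 0;
 *	}
 *
 *	err = apply_to_page_range(&init_mm, start, size,
 *				  set_scratch_pte, &scratch_pfn);
 */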
    
    
    /*
     * handle_pte_fault chooses page fault handler according to an entry
     * which was read non-atomically.  Before making any commitment, on
     * those architectures or configurations (e.g. i386 with PAE) which
     * might give a mix of unmatched parts, do_swap_page and do_file_page
     * must check under lock before unmapping the pte and proceeding
     * (but do_wp_page is only called after already making such a check;
     * and do_anonymous_page and do_no_page can safely check later on).
     */
    
static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd,
				pte_t *page_table, pte_t orig_pte)
    {
    	int same = 1;
    #if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT)
    	if (sizeof(pte_t) > sizeof(unsigned long)) {
    
    		spinlock_t *ptl = pte_lockptr(mm, pmd);
    		spin_lock(ptl);
    
    		same = pte_same(*page_table, orig_pte);
    
    		spin_unlock(ptl);
    
    	}
    #endif
    	pte_unmap(page_table);
    	return same;
    }
    
    
    /*
 * Do pte_mkwrite, but only if the vma says VM_WRITE.  We do this when
 * servicing faults for write access.  In the normal case, we always want
 * pte_mkwrite.  But get_user_pages can cause write faults for mappings
 * that do not have writing enabled, when used by access_process_vm.
     */
    static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma)
    {
    	if (likely(vma->vm_flags & VM_WRITE))
    		pte = pte_mkwrite(pte);
    	return pte;
    }
    
    
    static inline void cow_user_page(struct page *dst, struct page *src, unsigned long va, struct vm_area_struct *vma)
    
    {
    	/*
    	 * If the source page was a PFN mapping, we don't have
    	 * a "struct page" for it. We do a best-effort copy by
    	 * just copying from the original user address. If that
    	 * fails, we just zero-fill it. Live with it.
    	 */
    	if (unlikely(!src)) {
    		void *kaddr = kmap_atomic(dst, KM_USER0);
    
    		void __user *uaddr = (void __user *)(va & PAGE_MASK);
    
    		/*
    		 * This really shouldn't fail, because the page is there
    		 * in the page tables. But it might just be unreadable,
    		 * in which case we just give up and fill the result with
    		 * zeroes.
    		 */
		if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE))
			memset(kaddr, 0, PAGE_SIZE);
    		kunmap_atomic(kaddr, KM_USER0);
    
    		flush_dcache_page(dst);
    
    	} else
		copy_user_highpage(dst, src, va, vma);
}

    /*
     * This routine handles present pages, when users try to write
     * to a shared page. It is done by copying the page to a new address
     * and decrementing the shared-page counter for the old page.
     *
     * Note that this routine assumes that the protection checks have been
     * done by the caller (the low-level page fault routine in most cases).
     * Thus we can safely just mark it writable once we've done any necessary
     * COW.
     *
     * We also mark the page dirty at this point even though the page will
     * change only once the write actually happens. This avoids a few races,
     * and potentially makes it more efficient.
     *
    
     * We enter with non-exclusive mmap_sem (to exclude vma changes,
     * but allow concurrent faults), with pte both mapped and locked.
     * We return with mmap_sem still held, but pte unmapped and unlocked.
    
     */
    
static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
		unsigned long address, pte_t *page_table, pmd_t *pmd,
		spinlock_t *ptl, pte_t orig_pte)
{
    
	struct page *old_page, *new_page;
	pte_t entry;
	int reuse = 0, ret = 0;
	int page_mkwrite = 0;
	struct page *dirty_page = NULL;

    	old_page = vm_normal_page(vma, address, orig_pte);
    	if (!old_page)
    		goto gotten;
    
	/*
	 * Take out anonymous pages first, anonymous shared vmas are
	 * not dirty accountable.
	 */
    	if (PageAnon(old_page)) {
    		if (!TestSetPageLocked(old_page)) {
    			reuse = can_share_swap_page(old_page);
    			unlock_page(old_page);
    		}
	} else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
					(VM_WRITE|VM_SHARED))) {
    
    		/*
    		 * Only catch write-faults on shared writable pages,
    		 * read-only shared pages can get COWed by
    		 * get_user_pages(.write=1, .force=1).
    		 */
    
    		if (vma->vm_ops && vma->vm_ops->page_mkwrite) {
    			/*
    			 * Notify the address space that the page is about to
    			 * become writable so that it can prohibit this or wait
    			 * for the page to get into an appropriate state.
    			 *
    			 * We do this without the lock held, so that it can
    			 * sleep if it needs to.
    			 */
    			page_cache_get(old_page);
    			pte_unmap_unlock(page_table, ptl);
    
    			if (vma->vm_ops->page_mkwrite(vma, old_page) < 0)
    				goto unwritable_page;
    
    			/*
    			 * Since we dropped the lock we need to revalidate
    			 * the PTE as someone else may have changed it.  If
    			 * they did, we just return, as we can count on the
    			 * MMU to tell us if they didn't also make it writable.
    			 */
    			page_table = pte_offset_map_lock(mm, pmd, address,
    							 &ptl);
    
    			page_cache_release(old_page);
    
    			if (!pte_same(*page_table, orig_pte))
    				goto unlock;
    
    		}
    
    		dirty_page = old_page;
    		get_page(dirty_page);
    
    		reuse = 1;
    	}
    
    	if (reuse) {
    		flush_cache_page(vma, address, pte_pfn(orig_pte));
    		entry = pte_mkyoung(orig_pte);
    		entry = maybe_mkwrite(pte_mkdirty(entry), vma);
    
		if (ptep_set_access_flags(vma, address, page_table, entry,1))
			update_mmu_cache(vma, address, entry);
		ret |= VM_FAULT_WRITE;
    		goto unlock;
    
    	}
    
    	/*
    	 * Ok, we need to copy. Oh, well..
    	 */
    
	page_cache_get(old_page);
gotten:
    	pte_unmap_unlock(page_table, ptl);
    
	if (unlikely(anon_vma_prepare(vma)))
		goto oom;
    	VM_BUG_ON(old_page == ZERO_PAGE(0));
    	new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
    	if (!new_page)
    		goto oom;
    	cow_user_page(new_page, old_page, address, vma);
    
    	__SetPageUptodate(new_page);
    
	if (mem_cgroup_charge(new_page, mm, GFP_KERNEL))
		goto oom_free_new;
    
    
    	/*
    	 * Re-check the pte - we dropped the lock
    	 */
    
    	page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
    
    	if (likely(pte_same(*page_table, orig_pte))) {
    
    		if (old_page) {
    
    			page_remove_rmap(old_page, vma);
    
    			if (!PageAnon(old_page)) {
    				dec_mm_counter(mm, file_rss);
    				inc_mm_counter(mm, anon_rss);
    			}
		} else
			inc_mm_counter(mm, anon_rss);
    
    		flush_cache_page(vma, address, pte_pfn(orig_pte));
    
    		entry = mk_pte(new_page, vma->vm_page_prot);
    		entry = maybe_mkwrite(pte_mkdirty(entry), vma);
    
    		/*
    		 * Clear the pte entry and flush it first, before updating the
    		 * pte with the new entry. This will avoid a race condition
    		 * seen in the presence of one thread doing SMC and another
    		 * thread doing COW.
    		 */
    		ptep_clear_flush(vma, address, page_table);
    		set_pte_at(mm, address, page_table, entry);
    
    		update_mmu_cache(vma, address, entry);
    
    		lru_cache_add_active(new_page);
    
    		page_add_new_anon_rmap(new_page, vma, address);
    
    		/* Free the old page.. */
    		new_page = old_page;
    
    		ret |= VM_FAULT_WRITE;
    
    	} else
    		mem_cgroup_uncharge_page(new_page);
    
    
    	if (new_page)
    		page_cache_release(new_page);
    	if (old_page)
		page_cache_release(old_page);
unlock:
    	pte_unmap_unlock(page_table, ptl);
    
    	if (dirty_page) {
    
    		if (vma->vm_file)
    			file_update_time(vma->vm_file);
    
    
    		/*
    		 * Yes, Virginia, this is actually required to prevent a race
    		 * with clear_page_dirty_for_io() from clearing the page dirty
    		 * bit after it clear all dirty ptes, but before a racing
    		 * do_wp_page installs a dirty pte.
    		 *
    		 * do_no_page is protected similarly.
    		 */
    		wait_on_page_locked(dirty_page);
    
    		set_page_dirty_balance(dirty_page, page_mkwrite);
    
    		put_page(dirty_page);
    	}
    
    	return ret;
    
    oom_free_new:
	__free_page(new_page);
oom:
	if (old_page)
		page_cache_release(old_page);
    	return VM_FAULT_OOM;
    
    
    unwritable_page:
    	page_cache_release(old_page);
    	return VM_FAULT_SIGBUS;
    
    }
    
    /*
     * Helper functions for unmap_mapping_range().
     *
     * __ Notes on dropping i_mmap_lock to reduce latency while unmapping __
     *
     * We have to restart searching the prio_tree whenever we drop the lock,
     * since the iterator is only valid while the lock is held, and anyway
     * a later vma might be split and reinserted earlier while lock dropped.
     *
     * The list of nonlinear vmas could be handled more efficiently, using
     * a placeholder, but handle it in the same way until a need is shown.
     * It is important to search the prio_tree before nonlinear list: a vma
     * may become nonlinear and be shifted from prio_tree to nonlinear list
     * while the lock is dropped; but never shifted from list to prio_tree.
     *
     * In order to make forward progress despite restarting the search,
     * vm_truncate_count is used to mark a vma as now dealt with, so we can
     * quickly skip it next time around.  Since the prio_tree search only
     * shows us those vmas affected by unmapping the range in question, we
     * can't efficiently keep all vmas in step with mapping->truncate_count:
     * so instead reset them all whenever it wraps back to 0 (then go to 1).
     * mapping->truncate_count and vma->vm_truncate_count are protected by
     * i_mmap_lock.
     *
     * In order to make forward progress despite repeatedly restarting some
    
     * large vma, note the restart_addr from unmap_vmas when it breaks out:
    
     * and restart from that address when we reach that vma again.  It might
     * have been split or merged, shrunk or extended, but never shifted: so
     * restart_addr remains valid so long as it remains in the vma's range.
     * unmap_mapping_range forces truncate_count to leap over page-aligned
     * values so we can save vma's restart_addr in its truncate_count field.
     */
    #define is_restart_addr(truncate_count) (!((truncate_count) & ~PAGE_MASK))
    
    static void reset_vma_truncate_counts(struct address_space *mapping)
    {
    	struct vm_area_struct *vma;
    	struct prio_tree_iter iter;
    
    	vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, 0, ULONG_MAX)
    		vma->vm_truncate_count = 0;
    	list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list)
    		vma->vm_truncate_count = 0;
    }
    
    static int unmap_mapping_range_vma(struct vm_area_struct *vma,
    		unsigned long start_addr, unsigned long end_addr,
    		struct zap_details *details)
    {
    	unsigned long restart_addr;
    	int need_break;
    
    
	/*
	 * files that support invalidating or truncating portions of the
	 * file from under mmaped areas must have their ->fault function
	 * return a locked page (and set VM_FAULT_LOCKED in the return).
	 * This provides synchronisation against concurrent unmapping here.
	 */
    again:
    	restart_addr = vma->vm_truncate_count;
    	if (is_restart_addr(restart_addr) && start_addr < restart_addr) {
    		start_addr = restart_addr;
    		if (start_addr >= end_addr) {
    			/* Top of vma has been split off since last time */
    			vma->vm_truncate_count = details->truncate_count;
    			return 0;
    		}
    	}
    
    
    	restart_addr = zap_page_range(vma, start_addr,
    					end_addr - start_addr, details);
    
    	need_break = need_resched() || spin_needbreak(details->i_mmap_lock);
    
	if (restart_addr >= end_addr) {
    		/* We have now completed this vma: mark it so */
    		vma->vm_truncate_count = details->truncate_count;
    		if (!need_break)
    			return 0;
    	} else {
    		/* Note restart_addr in vma's truncate_count field */
    
    		vma->vm_truncate_count = restart_addr;
    
    		if (!need_break)
    			goto again;
    	}
    
    	spin_unlock(details->i_mmap_lock);
    	cond_resched();
    	spin_lock(details->i_mmap_lock);
    	return -EINTR;
    }
    
    static inline void unmap_mapping_range_tree(struct prio_tree_root *root,
    					    struct zap_details *details)
    {
    	struct vm_area_struct *vma;
    	struct prio_tree_iter iter;
    	pgoff_t vba, vea, zba, zea;
    
    restart:
    	vma_prio_tree_foreach(vma, &iter, root,
    			details->first_index, details->last_index) {
    		/* Skip quickly over those we have already dealt with */
    		if (vma->vm_truncate_count == details->truncate_count)
    			continue;
    
    		vba = vma->vm_pgoff;
    		vea = vba + ((vma->vm_end - vma->vm_start) >> PAGE_SHIFT) - 1;
    		/* Assume for now that PAGE_CACHE_SHIFT == PAGE_SHIFT */
    		zba = details->first_index;
    		if (zba < vba)
    			zba = vba;
    		zea = details->last_index;
    		if (zea > vea)
    			zea = vea;
    
    		if (unmap_mapping_range_vma(vma,
    			((zba - vba) << PAGE_SHIFT) + vma->vm_start,
    			((zea - vba + 1) << PAGE_SHIFT) + vma->vm_start,
    				details) < 0)
    			goto restart;
    	}
    }
    
    static inline void unmap_mapping_range_list(struct list_head *head,
    					    struct zap_details *details)
    {
    	struct vm_area_struct *vma;
    
    	/*
    	 * In nonlinear VMAs there is no correspondence between virtual address
    	 * offset and file offset.  So we must perform an exhaustive search
    	 * across *all* the pages in each nonlinear VMA, not just the pages
    	 * whose virtual address lies outside the file truncation point.
    	 */
    restart:
    	list_for_each_entry(vma, head, shared.vm_set.list) {
    		/* Skip quickly over those we have already dealt with */
    		if (vma->vm_truncate_count == details->truncate_count)
    			continue;
    		details->nonlinear_vma = vma;
    		if (unmap_mapping_range_vma(vma, vma->vm_start,
    					vma->vm_end, details) < 0)
    			goto restart;
    	}
    }
    
/**
 * unmap_mapping_range - unmap the portion of all mmaps in the specified address_space corresponding to the specified page range in the underlying file.
 * @mapping: the address space containing mmaps to be unmapped.
     * @holebegin: byte in first page to unmap, relative to the start of
     * the underlying file.  This will be rounded down to a PAGE_SIZE
     * boundary.  Note that this is different from vmtruncate(), which
     * must keep the partial page.  In contrast, we must get rid of
     * partial pages.
     * @holelen: size of prospective hole in bytes.  This will be rounded
     * up to a PAGE_SIZE boundary.  A holelen of zero truncates to the
     * end of the file.
     * @even_cows: 1 when truncating a file, unmap even private COWed pages;
     * but 0 when invalidating pagecache, don't throw away private data.
     */
    void unmap_mapping_range(struct address_space *mapping,
    		loff_t const holebegin, loff_t const holelen, int even_cows)
    {
    	struct zap_details details;
    	pgoff_t hba = holebegin >> PAGE_SHIFT;
    	pgoff_t hlen = (holelen + PAGE_SIZE - 1) >> PAGE_SHIFT;
    
    	/* Check for overflow. */
    	if (sizeof(holelen) > sizeof(hlen)) {
    		long long holeend =
    			(holebegin + holelen + PAGE_SIZE - 1) >> PAGE_SHIFT;
    		if (holeend & ~(long long)ULONG_MAX)
    			hlen = ULONG_MAX - hba + 1;
    	}
    
    	details.check_mapping = even_cows? NULL: mapping;
    	details.nonlinear_vma = NULL;
    	details.first_index = hba;
    	details.last_index = hba + hlen - 1;
    	if (details.last_index < details.first_index)
    		details.last_index = ULONG_MAX;
    	details.i_mmap_lock = &mapping->i_mmap_lock;
    
    	spin_lock(&mapping->i_mmap_lock);
    
    
    	/* Protect against endless unmapping loops */
    
    	mapping->truncate_count++;
    	if (unlikely(is_restart_addr(mapping->truncate_count))) {
    		if (mapping->truncate_count == 0)
    			reset_vma_truncate_counts(mapping);
    		mapping->truncate_count++;
    	}
    	details.truncate_count = mapping->truncate_count;
    
    	if (unlikely(!prio_tree_empty(&mapping->i_mmap)))
    		unmap_mapping_range_tree(&mapping->i_mmap, &details);
    	if (unlikely(!list_empty(&mapping->i_mmap_nonlinear)))
    		unmap_mapping_range_list(&mapping->i_mmap_nonlinear, &details);
    	spin_unlock(&mapping->i_mmap_lock);
    }
    EXPORT_SYMBOL(unmap_mapping_range);
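
/*
 * Illustrative sketch (not part of the original file): a filesystem
 * punching a hole would first zap any user mappings of the affected byte
 * range and then drop the page cache behind it; offset and len are
 * assumptions for this example.
 *
 *	unmap_mapping_range(inode->i_mapping, offset, len, 1);
 *	truncate_inode_pages_range(inode->i_mapping, offset,
 *				   offset + len - 1);
 *
 * Passing even_cows == 0 instead is the choice for plain page-cache
 * invalidation, where private COWed copies may be left in place.
 */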
    
    
    /**
     * vmtruncate - unmap mappings "freed" by truncate() syscall
     * @inode: inode of the file used
     * @offset: file offset to start truncating
    
     *
     * NOTE! We have to be ready to update the memory sharing
     * between the file and the memory map for a potential last
     * incomplete page.  Ugly, but necessary.
     */
    int vmtruncate(struct inode * inode, loff_t offset)
    {
    
    	if (inode->i_size < offset) {
    		unsigned long limit;
    
    		limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
    		if (limit != RLIM_INFINITY && offset > limit)
    			goto out_sig;
    		if (offset > inode->i_sb->s_maxbytes)
    			goto out_big;
    		i_size_write(inode, offset);
    	} else {
    		struct address_space *mapping = inode->i_mapping;
    
    		/*
    		 * truncation of in-use swapfiles is disallowed - it would
    		 * cause subsequent swapout to scribble on the now-freed
    		 * blocks.
    		 */
    		if (IS_SWAPFILE(inode))
    			return -ETXTBSY;
    		i_size_write(inode, offset);
    
    		/*
    		 * unmap_mapping_range is called twice, first simply for
    		 * efficiency so that truncate_inode_pages does fewer
    		 * single-page unmaps.  However after this first call, and
    		 * before truncate_inode_pages finishes, it is possible for
    		 * private pages to be COWed, which remain after
    		 * truncate_inode_pages finishes, hence the second
    		 * unmap_mapping_range call must be made for correctness.
    		 */
    		unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1);
    		truncate_inode_pages(mapping, offset);
    		unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1);
    	}
    
    	if (inode->i_op && inode->i_op->truncate)
    		inode->i_op->truncate(inode);
    	return 0;
    
    out_sig:
    	send_sig(SIGXFSZ, current, 0);
    out_big:
    	return -EFBIG;
    }
    EXPORT_SYMBOL(vmtruncate);
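
/*
 * Illustrative sketch (not part of the original file): in this era a
 * filesystem's ->setattr usually reaches vmtruncate() via inode_setattr();
 * a simplified open-coded size change would look roughly like this,
 * assuming attr is the struct iattr passed to setattr:
 *
 *	if (attr->ia_valid & ATTR_SIZE) {
 *		int err = vmtruncate(inode, attr->ia_size);
 *		if (err)
 *			return err;
 *	}
 */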
    
    
    int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end)
    {
    	struct address_space *mapping = inode->i_mapping;
    
    	/*
    	 * If the underlying filesystem is not going to provide
    	 * a way to truncate a range of blocks (punch a hole) -
    	 * we should return failure right now.
    	 */
    	if (!inode->i_op || !inode->i_op->truncate_range)
    		return -ENOSYS;
    
    
    	mutex_lock(&inode->i_mutex);
    
    	down_write(&inode->i_alloc_sem);
    	unmap_mapping_range(mapping, offset, (end - offset), 1);
    	truncate_inode_pages_range(mapping, offset, end);