Skip to content
Snippets Groups Projects
memory.c 91.7 KiB
Newer Older
  • Learn to ignore specific revisions
  • 		unlock_page(old_page);
    
    	} else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
    
    					(VM_WRITE|VM_SHARED))) {
    
    		/*
    		 * Only catch write-faults on shared writable pages,
    		 * read-only shared pages can get COWed by
    		 * get_user_pages(.write=1, .force=1).
    		 */
    
    		if (vma->vm_ops && vma->vm_ops->page_mkwrite) {
    
    			struct vm_fault vmf;
    			int tmp;
    
    			vmf.virtual_address = (void __user *)(address &
    								PAGE_MASK);
    			vmf.pgoff = old_page->index;
    			vmf.flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE;
    			vmf.page = old_page;
    
    
    			/*
    			 * Notify the address space that the page is about to
    			 * become writable so that it can prohibit this or wait
    			 * for the page to get into an appropriate state.
    			 *
    			 * We do this without the lock held, so that it can
    			 * sleep if it needs to.
    			 */
    			page_cache_get(old_page);
    			pte_unmap_unlock(page_table, ptl);
    
    
    			tmp = vma->vm_ops->page_mkwrite(vma, &vmf);
    			if (unlikely(tmp &
    					(VM_FAULT_ERROR | VM_FAULT_NOPAGE))) {
    				ret = tmp;
    
    				goto unwritable_page;
    
    			if (unlikely(!(tmp & VM_FAULT_LOCKED))) {
    				lock_page(old_page);
    				if (!old_page->mapping) {
    					ret = 0; /* retry the fault */
    					unlock_page(old_page);
    					goto unwritable_page;
    				}
    			} else
    				VM_BUG_ON(!PageLocked(old_page));
    
    
    			/*
    			 * Since we dropped the lock we need to revalidate
    			 * the PTE as someone else may have changed it.  If
    			 * they did, we just return, as we can count on the
    			 * MMU to tell us if they didn't also make it writable.
    			 */
    			page_table = pte_offset_map_lock(mm, pmd, address,
    							 &ptl);
    
    			if (!pte_same(*page_table, orig_pte)) {
    				unlock_page(old_page);
    				page_cache_release(old_page);
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    		}
    
    		dirty_page = old_page;
    		get_page(dirty_page);
    
    		flush_cache_page(vma, address, pte_pfn(orig_pte));
    		entry = pte_mkyoung(orig_pte);
    		entry = maybe_mkwrite(pte_mkdirty(entry), vma);
    
    		if (ptep_set_access_flags(vma, address, page_table, entry,1))
    
    			update_mmu_cache(vma, address, entry);
    
    		ret |= VM_FAULT_WRITE;
    		goto unlock;
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    	}
    
    	/*
    	 * Ok, we need to copy. Oh, well..
    	 */
    
    	page_cache_get(old_page);
    
    	pte_unmap_unlock(page_table, ptl);
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    
    	if (unlikely(anon_vma_prepare(vma)))
    
    Nicholas Piggin's avatar
    Nicholas Piggin committed
    	VM_BUG_ON(old_page == ZERO_PAGE(0));
    	new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
    	if (!new_page)
    		goto oom;
    
    	/*
    	 * Don't let another task, with possibly unlocked vma,
    	 * keep the mlocked page.
    	 */
    
    	if ((vma->vm_flags & VM_LOCKED) && old_page) {
    
    		lock_page(old_page);	/* for LRU manipulation */
    		clear_page_mlock(old_page);
    		unlock_page(old_page);
    	}
    
    Nicholas Piggin's avatar
    Nicholas Piggin committed
    	cow_user_page(new_page, old_page, address, vma);
    
    	__SetPageUptodate(new_page);
    
    	if (mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))
    
    		goto oom_free_new;
    
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    	/*
    	 * Re-check the pte - we dropped the lock
    	 */
    
    	page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
    
    	if (likely(pte_same(*page_table, orig_pte))) {
    
    		if (old_page) {
    			if (!PageAnon(old_page)) {
    				dec_mm_counter(mm, file_rss);
    				inc_mm_counter(mm, anon_rss);
    			}
    		} else
    
    			inc_mm_counter(mm, anon_rss);
    
    		flush_cache_page(vma, address, pte_pfn(orig_pte));
    
    		entry = mk_pte(new_page, vma->vm_page_prot);
    		entry = maybe_mkwrite(pte_mkdirty(entry), vma);
    
    		/*
    		 * Clear the pte entry and flush it first, before updating the
    		 * pte with the new entry. This will avoid a race condition
    		 * seen in the presence of one thread doing SMC and another
    		 * thread doing COW.
    		 */
    
    		ptep_clear_flush(vma, address, page_table);
    
    		page_add_new_anon_rmap(new_page, vma, address);
    
    		/*
    		 * We call the notify macro here because, when using secondary
    		 * mmu page tables (such as kvm shadow page tables), we want the
    		 * new page to be mapped directly into the secondary page table.
    		 */
    		set_pte_at_notify(mm, address, page_table, entry);
    
    		update_mmu_cache(vma, address, entry);
    
    		if (old_page) {
    			/*
    			 * Only after switching the pte to the new page may
    			 * we remove the mapcount here. Otherwise another
    			 * process may come and find the rmap count decremented
    			 * before the pte is switched to the new page, and
    			 * "reuse" the old page writing into it while our pte
    			 * here still points into it and can be read by other
    			 * threads.
    			 *
    			 * The critical issue is to order this
    			 * page_remove_rmap with the ptp_clear_flush above.
    			 * Those stores are ordered by (if nothing else,)
    			 * the barrier present in the atomic_add_negative
    			 * in page_remove_rmap.
    			 *
    			 * Then the TLB flush in ptep_clear_flush ensures that
    			 * no process can access the old page before the
    			 * decremented mapcount is visible. And the old page
    			 * cannot be reused until after the decremented
    			 * mapcount is visible. So transitively, TLBs to
    			 * old page will be flushed before it can be reused.
    			 */
    
    			page_remove_rmap(old_page);
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    		/* Free the old page.. */
    		new_page = old_page;
    
    		ret |= VM_FAULT_WRITE;
    
    	} else
    		mem_cgroup_uncharge_page(new_page);
    
    
    	if (new_page)
    		page_cache_release(new_page);
    	if (old_page)
    		page_cache_release(old_page);
    
    	pte_unmap_unlock(page_table, ptl);
    
    	if (dirty_page) {
    
    		/*
    		 * Yes, Virginia, this is actually required to prevent a race
    		 * with clear_page_dirty_for_io() from clearing the page dirty
    		 * bit after it clear all dirty ptes, but before a racing
    		 * do_wp_page installs a dirty pte.
    		 *
    		 * do_no_page is protected similarly.
    		 */
    
    		if (!page_mkwrite) {
    			wait_on_page_locked(dirty_page);
    			set_page_dirty_balance(dirty_page, page_mkwrite);
    		}
    
    		put_page(dirty_page);
    
    		if (page_mkwrite) {
    			struct address_space *mapping = dirty_page->mapping;
    
    			set_page_dirty(dirty_page);
    			unlock_page(dirty_page);
    			page_cache_release(dirty_page);
    			if (mapping)	{
    				/*
    				 * Some device drivers do not set page.mapping
    				 * but still dirty their pages
    				 */
    				balance_dirty_pages_ratelimited(mapping);
    			}
    		}
    
    		/* file_update_time outside page_lock */
    		if (vma->vm_file)
    			file_update_time(vma->vm_file);
    
    	return ret;
    
    oom_free_new:
    
    	page_cache_release(new_page);
    
    	if (old_page) {
    		if (page_mkwrite) {
    			unlock_page(old_page);
    			page_cache_release(old_page);
    		}
    
    		page_cache_release(old_page);
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    	return VM_FAULT_OOM;
    
    
    unwritable_page:
    	page_cache_release(old_page);
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    }
    
    /*
     * Helper functions for unmap_mapping_range().
     *
     * __ Notes on dropping i_mmap_lock to reduce latency while unmapping __
     *
     * We have to restart searching the prio_tree whenever we drop the lock,
     * since the iterator is only valid while the lock is held, and anyway
     * a later vma might be split and reinserted earlier while lock dropped.
     *
     * The list of nonlinear vmas could be handled more efficiently, using
     * a placeholder, but handle it in the same way until a need is shown.
     * It is important to search the prio_tree before nonlinear list: a vma
     * may become nonlinear and be shifted from prio_tree to nonlinear list
     * while the lock is dropped; but never shifted from list to prio_tree.
     *
     * In order to make forward progress despite restarting the search,
     * vm_truncate_count is used to mark a vma as now dealt with, so we can
     * quickly skip it next time around.  Since the prio_tree search only
     * shows us those vmas affected by unmapping the range in question, we
     * can't efficiently keep all vmas in step with mapping->truncate_count:
     * so instead reset them all whenever it wraps back to 0 (then go to 1).
     * mapping->truncate_count and vma->vm_truncate_count are protected by
     * i_mmap_lock.
     *
     * In order to make forward progress despite repeatedly restarting some
     * large vma, note the restart_addr from unmap_vmas when it breaks out:
     * and restart from that address when we reach that vma again.  It might
     * have been split or merged, shrunk or extended, but never shifted: so
     * restart_addr remains valid so long as it remains in the vma's range.
     * unmap_mapping_range forces truncate_count to leap over page-aligned
     * values so we can save vma's restart_addr in its truncate_count field.
     */
    /* Non-zero when truncate_count has none of the ~PAGE_MASK bits set. */
    #define is_restart_addr(truncate_count) (((truncate_count) & ~PAGE_MASK) == 0)
    
    static void reset_vma_truncate_counts(struct address_space *mapping)
    {
    	struct vm_area_struct *vma;
    	struct prio_tree_iter iter;
    
    	vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, 0, ULONG_MAX)
    		vma->vm_truncate_count = 0;
    	list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list)
    		vma->vm_truncate_count = 0;
    }
    
    static int unmap_mapping_range_vma(struct vm_area_struct *vma,
    		unsigned long start_addr, unsigned long end_addr,
    		struct zap_details *details)
    {
    	unsigned long restart_addr;
    	int need_break;
    
    
    	/*
    	 * files that support invalidating or truncating portions of the
    
    Nicholas Piggin's avatar
    Nicholas Piggin committed
    	 * file from under mmaped areas must have their ->fault function
    
    Nicholas Piggin's avatar
    Nicholas Piggin committed
    	 * return a locked page (and set VM_FAULT_LOCKED in the return).
    	 * This provides synchronisation against concurrent unmapping here.
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    again:
    	restart_addr = vma->vm_truncate_count;
    	if (is_restart_addr(restart_addr) && start_addr < restart_addr) {
    		start_addr = restart_addr;
    		if (start_addr >= end_addr) {
    			/* Top of vma has been split off since last time */
    			vma->vm_truncate_count = details->truncate_count;
    			return 0;
    		}
    	}
    
    
    	restart_addr = zap_page_range(vma, start_addr,
    					end_addr - start_addr, details);
    
    	need_break = need_resched() || spin_needbreak(details->i_mmap_lock);
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    
    
    	if (restart_addr >= end_addr) {
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    		/* We have now completed this vma: mark it so */
    		vma->vm_truncate_count = details->truncate_count;
    		if (!need_break)
    			return 0;
    	} else {
    		/* Note restart_addr in vma's truncate_count field */
    
    		vma->vm_truncate_count = restart_addr;
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    		if (!need_break)
    			goto again;
    	}
    
    	spin_unlock(details->i_mmap_lock);
    	cond_resched();
    	spin_lock(details->i_mmap_lock);
    	return -EINTR;
    }
    
    /*
     * Visit the vmas that vma_prio_tree_foreach() yields from @root for the
     * index range details->first_index..details->last_index, and unmap from
     * each the part of that range which falls inside the vma.  The walk is
     * started over whenever unmap_mapping_range_vma() returns a negative value.
     */
    static inline void unmap_mapping_range_tree(struct prio_tree_root *root,
    					    struct zap_details *details)
    {
    	struct prio_tree_iter iter;
    	struct vm_area_struct *vma;
    	pgoff_t vma_first, vma_last, zap_first, zap_last;
    	unsigned long zap_start, zap_end;

    restart:
    	vma_prio_tree_foreach(vma, &iter, root,
    			details->first_index, details->last_index) {
    		/* Already marked as done for this truncate_count: skip it */
    		if (vma->vm_truncate_count == details->truncate_count)
    			continue;

    		/* First and last file page index covered by this vma */
    		vma_first = vma->vm_pgoff;
    		vma_last = vma_first +
    			((vma->vm_end - vma->vm_start) >> PAGE_SHIFT) - 1;

    		/* Assume for now that PAGE_CACHE_SHIFT == PAGE_SHIFT */
    		/* Clamp the requested index range to the vma's own range */
    		zap_first = details->first_index < vma_first ?
    				vma_first : details->first_index;
    		zap_last = details->last_index > vma_last ?
    				vma_last : details->last_index;

    		/* Translate page indexes back into addresses within the vma */
    		zap_start = ((zap_first - vma_first) << PAGE_SHIFT) +
    				vma->vm_start;
    		zap_end = ((zap_last - vma_first + 1) << PAGE_SHIFT) +
    				vma->vm_start;

    		if (unmap_mapping_range_vma(vma, zap_start, zap_end,
    					    details) < 0)
    			goto restart;
    	}
    }
    
    /*
     * Unmap every vma linked on @head over its whole vm_start..vm_end span,
     * recording the vma in details->nonlinear_vma first.  The list walk is
     * started over whenever unmap_mapping_range_vma() returns a negative value.
     */
    static inline void unmap_mapping_range_list(struct list_head *head,
    					    struct zap_details *details)
    {
    	struct vm_area_struct *vma;
    	int rc;

    	/*
    	 * In nonlinear VMAs there is no correspondence between virtual address
    	 * offset and file offset.  So we must perform an exhaustive search
    	 * across *all* the pages in each nonlinear VMA, not just the pages
    	 * whose virtual address lies outside the file truncation point.
    	 */
    restart:
    	list_for_each_entry(vma, head, shared.vm_set.list) {
    		/* Only touch vmas not yet marked done for this pass */
    		if (vma->vm_truncate_count != details->truncate_count) {
    			details->nonlinear_vma = vma;
    			rc = unmap_mapping_range_vma(vma, vma->vm_start,
    						     vma->vm_end, details);
    			if (rc < 0)
    				goto restart;
    		}
    	}
    }
    
    /**
     * unmap_mapping_range - unmap the portion of all mmaps in the specified address_space corresponding to the specified page range in the underlying file.
     * @mapping: the address space containing mmaps to be unmapped.
     * @holebegin: byte in first page to unmap, relative to the start of
     * the underlying file.  This will be rounded down to a PAGE_SIZE
     * boundary.  Note that this is different from vmtruncate(), which
     * must keep the partial page.  In contrast, we must get rid of
     * partial pages.
     * @holelen: size of prospective hole in bytes.  This will be rounded
     * up to a PAGE_SIZE boundary.  A holelen of zero truncates to the
     * end of the file.
     * @even_cows: 1 when truncating a file, unmap even private COWed pages;
     * but 0 when invalidating pagecache, don't throw away private data.
     *
     * Takes mapping->i_mmap_lock around the walk of the i_mmap prio tree and
     * the i_mmap_nonlinear list.
     */
    void unmap_mapping_range(struct address_space *mapping,
    		loff_t const holebegin, loff_t const holelen, int even_cows)
    {
    	struct zap_details details;
    	pgoff_t hba = holebegin >> PAGE_SHIFT;
    	pgoff_t hlen = (holelen + PAGE_SIZE - 1) >> PAGE_SHIFT;

    	/* Check for overflow. */
    	if (sizeof(holelen) > sizeof(hlen)) {
    		long long holeend =
    			(holebegin + holelen + PAGE_SIZE - 1) >> PAGE_SHIFT;
    		if (holeend & ~(long long)ULONG_MAX)
    			hlen = ULONG_MAX - hba + 1;
    	}

    	details.check_mapping = even_cows? NULL: mapping;
    	details.nonlinear_vma = NULL;
    	details.first_index = hba;
    	details.last_index = hba + hlen - 1;
    	/* hlen == 0 (or wrap-around) means "to the end of the file" */
    	if (details.last_index < details.first_index)
    		details.last_index = ULONG_MAX;
    	details.i_mmap_lock = &mapping->i_mmap_lock;

    	spin_lock(&mapping->i_mmap_lock);

    	/* Protect against endless unmapping loops */
    	mapping->truncate_count++;
    	if (unlikely(is_restart_addr(mapping->truncate_count))) {
    		/*
    		 * Values for which is_restart_addr() is true are skipped
    		 * (per-vma counts holding such values are treated as restart
    		 * addresses by unmap_mapping_range_vma()); on wrap to 0 all
    		 * per-vma counts are reset first.
    		 */
    		if (mapping->truncate_count == 0)
    			reset_vma_truncate_counts(mapping);
    		mapping->truncate_count++;
    	}
    	details.truncate_count = mapping->truncate_count;

    	if (unlikely(!prio_tree_empty(&mapping->i_mmap)))
    		unmap_mapping_range_tree(&mapping->i_mmap, &details);
    	if (unlikely(!list_empty(&mapping->i_mmap_nonlinear)))
    		unmap_mapping_range_list(&mapping->i_mmap_nonlinear, &details);
    	spin_unlock(&mapping->i_mmap_lock);
    }
    EXPORT_SYMBOL(unmap_mapping_range);
    
    
    /**
     * vmtruncate - unmap mappings "freed" by truncate() syscall
     * @inode: inode of the file used
     * @offset: file offset to start truncating
     *
     * NOTE! We have to be ready to update the memory sharing
     * between the file and the memory map for a potential last
     * incomplete page.  Ugly, but necessary.
     *
     * Returns 0 on success.  When growing the file, returns -EFBIG if @offset
     * exceeds the RLIMIT_FSIZE soft limit (after sending SIGXFSZ to current)
     * or the superblock's s_maxbytes.  When not growing, returns -ETXTBSY if
     * the inode is an in-use swapfile.
     */
    int vmtruncate(struct inode * inode, loff_t offset)
    {
    	if (inode->i_size < offset) {
    		unsigned long limit;

    		limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
    		if (limit != RLIM_INFINITY && offset > limit)
    			goto out_sig;
    		if (offset > inode->i_sb->s_maxbytes)
    			goto out_big;
    		i_size_write(inode, offset);
    	} else {
    		struct address_space *mapping = inode->i_mapping;

    		/*
    		 * truncation of in-use swapfiles is disallowed - it would
    		 * cause subsequent swapout to scribble on the now-freed
    		 * blocks.
    		 */
    		if (IS_SWAPFILE(inode))
    			return -ETXTBSY;
    		i_size_write(inode, offset);

    		/*
    		 * unmap_mapping_range is called twice, first simply for
    		 * efficiency so that truncate_inode_pages does fewer
    		 * single-page unmaps.  However after this first call, and
    		 * before truncate_inode_pages finishes, it is possible for
    		 * private pages to be COWed, which remain after
    		 * truncate_inode_pages finishes, hence the second
    		 * unmap_mapping_range call must be made for correctness.
    		 */
    		unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1);
    		truncate_inode_pages(mapping, offset);
    		unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1);
    	}

    	/* The ->truncate hook is optional */
    	if (inode->i_op->truncate)
    		inode->i_op->truncate(inode);
    	return 0;

    out_sig:
    	send_sig(SIGXFSZ, current, 0);
    	/* fall through: exceeding the rlimit also reports -EFBIG */
    out_big:
    	return -EFBIG;
    }
    EXPORT_SYMBOL(vmtruncate);
    
    
    /*
     * Punch a hole in @inode between @offset and @end: unmap and drop the
     * page cache for that range, then call the inode's ->truncate_range hook.
     * i_mutex and i_alloc_sem (for write) are held around the operation.
     *
     * Returns 0 on success, or -ENOSYS when the inode provides no
     * ->truncate_range operation.
     */
    int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end)
    {
    	struct address_space *mapping = inode->i_mapping;

    	/*
    	 * If the underlying filesystem is not going to provide
    	 * a way to truncate a range of blocks (punch a hole) -
    	 * we should return failure right now.
    	 */
    	if (!inode->i_op->truncate_range)
    		return -ENOSYS;

    	mutex_lock(&inode->i_mutex);
    	down_write(&inode->i_alloc_sem);
    	/*
    	 * unmap_mapping_range() is called both before and after the page
    	 * cache is truncated, the same sequence vmtruncate() uses.
    	 */
    	unmap_mapping_range(mapping, offset, (end - offset), 1);
    	truncate_inode_pages_range(mapping, offset, end);
    	unmap_mapping_range(mapping, offset, (end - offset), 1);
    	inode->i_op->truncate_range(inode, offset, end);
    	up_write(&inode->i_alloc_sem);
    	mutex_unlock(&inode->i_mutex);

    	return 0;
    }

    /*
     * We enter with non-exclusive mmap_sem (to exclude vma changes,
     * but allow concurrent faults), and pte mapped but not yet locked.
     * We return with mmap_sem still held, but pte unmapped and unlocked.
     */
    
    static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
    		unsigned long address, pte_t *page_table, pmd_t *pmd,
    
    		unsigned int flags, pte_t orig_pte)
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    {
    
    	spinlock_t *ptl;
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    	struct page *page;
    
    	swp_entry_t entry;
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    	pte_t pte;
    
    	struct mem_cgroup *ptr = NULL;
    
    Nicholas Piggin's avatar
    Nicholas Piggin committed
    	int ret = 0;
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    
    
    	if (!pte_unmap_same(mm, pmd, page_table, orig_pte))
    
    
    	entry = pte_to_swp_entry(orig_pte);
    
    	if (is_migration_entry(entry)) {
    		migration_entry_wait(mm, pmd, address);
    		goto out;
    	}
    
    	delayacct_set_flag(DELAYACCT_PF_SWAPIN);
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    	page = lookup_swap_cache(entry);
    	if (!page) {
    
    		grab_swap_token(mm); /* Contend for token _before_ read-in */
    
    		page = swapin_readahead(entry,
    					GFP_HIGHUSER_MOVABLE, vma, address);
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    		if (!page) {
    			/*
    
    			 * Back out if somebody else faulted in this pte
    			 * while we released the pte lock.
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    			 */
    
    			page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    			if (likely(pte_same(*page_table, orig_pte)))
    				ret = VM_FAULT_OOM;
    
    			delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
    
    			goto unlock;
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    		}
    
    		/* Had to read the page from swap area: Major fault */
    		ret = VM_FAULT_MAJOR;
    
    		count_vm_event(PGMAJFAULT);
    
    	lock_page(page);
    	delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
    
    
    	if (mem_cgroup_try_charge_swapin(mm, page, GFP_KERNEL, &ptr)) {
    
    		ret = VM_FAULT_OOM;
    
    		goto out_page;
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    	/*
    
    	 * Back out if somebody else already faulted in this pte.
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    	 */
    
    	page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
    
    	if (unlikely(!pte_same(*page_table, orig_pte)))
    
    		goto out_nomap;
    
    	if (unlikely(!PageUptodate(page))) {
    		ret = VM_FAULT_SIGBUS;
    		goto out_nomap;
    
    	/*
    	 * The page isn't present yet, go ahead with the fault.
    	 *
    	 * Be careful about the sequence of operations here.
    	 * To get its accounting right, reuse_swap_page() must be called
    	 * while the page is counted on swap but not yet in mapcount i.e.
    	 * before page_add_anon_rmap() and swap_free(); try_to_free_swap()
    	 * must be called after the swap_free(), or it will never succeed.
    
    	 * Because delete_from_swap_page() may be called by reuse_swap_page(),
    	 * mem_cgroup_commit_charge_swapin() may not be able to find swp_entry
    	 * in page->private. In this case, a record in swap_cgroup  is silently
    	 * discarded at swap_free().
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    
    
    	inc_mm_counter(mm, anon_rss);
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    	pte = mk_pte(page, vma->vm_page_prot);
    
    	if ((flags & FAULT_FLAG_WRITE) && reuse_swap_page(page)) {
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    		pte = maybe_mkwrite(pte_mkdirty(pte), vma);
    
    		flags &= ~FAULT_FLAG_WRITE;
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    	}
    	flush_icache_page(vma, page);
    	set_pte_at(mm, address, page_table, pte);
    	page_add_anon_rmap(page, vma, address);
    
    	/* It's better to call commit-charge after rmap is established */
    	mem_cgroup_commit_charge_swapin(page, ptr);
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    
    
    	swap_free(entry);
    
    	if (vm_swap_full() || (vma->vm_flags & VM_LOCKED) || PageMlocked(page))
    
    	if (flags & FAULT_FLAG_WRITE) {
    
    		ret |= do_wp_page(mm, vma, address, page_table, pmd, ptl, pte);
    		if (ret & VM_FAULT_ERROR)
    			ret &= VM_FAULT_ERROR;
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    		goto out;
    	}
    
    	/* No need to invalidate - it was non-present before */
    	update_mmu_cache(vma, address, pte);
    
    	pte_unmap_unlock(page_table, ptl);
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    out:
    	return ret;
    
    	mem_cgroup_cancel_charge_swapin(ptr);
    
    	pte_unmap_unlock(page_table, ptl);
    
    	unlock_page(page);
    	page_cache_release(page);
    
     * We enter with non-exclusive mmap_sem (to exclude vma changes,
     * but allow concurrent faults), and pte mapped but not yet locked.
     * We return with mmap_sem still held, but pte unmapped and unlocked.
    
    Linus Torvalds's avatar
    Linus Torvalds committed
     */
    
    static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
    		unsigned long address, pte_t *page_table, pmd_t *pmd,
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    {
    
    	struct page *page;
    	spinlock_t *ptl;
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    	pte_t entry;
    
    
    Nicholas Piggin's avatar
    Nicholas Piggin committed
    	/* Allocate our own private page. */
    	pte_unmap(page_table);
    
    Nicholas Piggin's avatar
    Nicholas Piggin committed
    	if (unlikely(anon_vma_prepare(vma)))
    		goto oom;
    	page = alloc_zeroed_user_highpage_movable(vma, address);
    	if (!page)
    		goto oom;
    
    	__SetPageUptodate(page);
    
    	if (mem_cgroup_newpage_charge(page, mm, GFP_KERNEL))
    
    		goto oom_free_page;
    
    
    Nicholas Piggin's avatar
    Nicholas Piggin committed
    	entry = mk_pte(page, vma->vm_page_prot);
    
    Hugh Dickins's avatar
    Hugh Dickins committed
    	if (vma->vm_flags & VM_WRITE)
    		entry = pte_mkwrite(pte_mkdirty(entry));
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    
    
    Nicholas Piggin's avatar
    Nicholas Piggin committed
    	page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
    
    	if (!pte_none(*page_table))
    
    Nicholas Piggin's avatar
    Nicholas Piggin committed
    		goto release;
    
    Nicholas Piggin's avatar
    Nicholas Piggin committed
    	inc_mm_counter(mm, anon_rss);
    	page_add_new_anon_rmap(page, vma, address);
    
    	set_pte_at(mm, address, page_table, entry);
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    
    	/* No need to invalidate - it was non-present before */
    
    	update_mmu_cache(vma, address, entry);
    unlock:
    
    	pte_unmap_unlock(page_table, ptl);
    
    Nicholas Piggin's avatar
    Nicholas Piggin committed
    	return 0;
    
    	mem_cgroup_uncharge_page(page);
    
    	page_cache_release(page);
    	goto unlock;
    
    oom_free_page:
    
    	page_cache_release(page);
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    	return VM_FAULT_OOM;
    }
    
    /*
    
     * __do_fault() tries to create a new page mapping. It aggressively
    
    Linus Torvalds's avatar
    Linus Torvalds committed
     * tries to share with existing pages, but makes a separate copy if
    
     * the FAULT_FLAG_WRITE is set in the flags parameter in order to avoid
     * the next page fault.
    
    Linus Torvalds's avatar
    Linus Torvalds committed
     *
     * As this is called only for pages that do not currently exist, we
     * do not need to flush old virtual caches or the TLB.
     *
    
     * We enter with non-exclusive mmap_sem (to exclude vma changes,
    
     * but allow concurrent faults), and pte neither mapped nor locked.
    
     * We return with mmap_sem still held, but pte unmapped and unlocked.
    
    Linus Torvalds's avatar
    Linus Torvalds committed
     */
    
    static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
    
    		unsigned long address, pmd_t *pmd,
    
    		pgoff_t pgoff, unsigned int flags, pte_t orig_pte)
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    {
    
    	pte_t *page_table;
    
    	spinlock_t *ptl;
    
    Nicholas Piggin's avatar
    Nicholas Piggin committed
    	struct page *page;
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    	pte_t entry;
    	int anon = 0;
    
    	int charged = 0;
    
    	struct page *dirty_page = NULL;
    
    Nicholas Piggin's avatar
    Nicholas Piggin committed
    	struct vm_fault vmf;
    	int ret;
    
    	int page_mkwrite = 0;
    
    Nicholas Piggin's avatar
    Nicholas Piggin committed
    	vmf.virtual_address = (void __user *)(address & PAGE_MASK);
    	vmf.pgoff = pgoff;
    	vmf.flags = flags;
    	vmf.page = NULL;
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    
    
    Nicholas Piggin's avatar
    Nicholas Piggin committed
    	ret = vma->vm_ops->fault(vma, &vmf);
    	if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))
    		return ret;
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    
    
    Nicholas Piggin's avatar
    Nicholas Piggin committed
    	 * For consistency in subsequent calls, make the faulted page always
    
    Nicholas Piggin's avatar
    Nicholas Piggin committed
    	if (unlikely(!(ret & VM_FAULT_LOCKED)))
    
    Nicholas Piggin's avatar
    Nicholas Piggin committed
    		lock_page(vmf.page);
    
    Nicholas Piggin's avatar
    Nicholas Piggin committed
    		VM_BUG_ON(!PageLocked(vmf.page));
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    	/*
    	 * Should we do an early C-O-W break?
    	 */
    
    Nicholas Piggin's avatar
    Nicholas Piggin committed
    	page = vmf.page;
    
    	if (flags & FAULT_FLAG_WRITE) {
    
    		if (!(vma->vm_flags & VM_SHARED)) {
    
    			if (unlikely(anon_vma_prepare(vma))) {
    
    Nicholas Piggin's avatar
    Nicholas Piggin committed
    				ret = VM_FAULT_OOM;
    
    Nicholas Piggin's avatar
    Nicholas Piggin committed
    			page = alloc_page_vma(GFP_HIGHUSER_MOVABLE,
    						vma, address);
    
    Nicholas Piggin's avatar
    Nicholas Piggin committed
    				ret = VM_FAULT_OOM;
    
    			if (mem_cgroup_newpage_charge(page, mm, GFP_KERNEL)) {
    
    				ret = VM_FAULT_OOM;
    				page_cache_release(page);
    				goto out;
    			}
    			charged = 1;
    
    			/*
    			 * Don't let another task, with possibly unlocked vma,
    			 * keep the mlocked page.
    			 */
    			if (vma->vm_flags & VM_LOCKED)
    				clear_page_mlock(vmf.page);
    
    Nicholas Piggin's avatar
    Nicholas Piggin committed
    			copy_user_highpage(page, vmf.page, address, vma);
    
    			__SetPageUptodate(page);
    
    			/*
    			 * If the page will be shareable, see if the backing
    
    			 * address space wants to know that the page is about
    
    			if (vma->vm_ops->page_mkwrite) {
    
    				vmf.flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE;
    
    				tmp = vma->vm_ops->page_mkwrite(vma, &vmf);
    				if (unlikely(tmp &
    					  (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) {
    					ret = tmp;
    
    					goto unwritable_page;
    
    				if (unlikely(!(tmp & VM_FAULT_LOCKED))) {
    					lock_page(page);
    					if (!page->mapping) {
    						ret = 0; /* retry the fault */
    						unlock_page(page);
    						goto unwritable_page;
    					}
    				} else
    					VM_BUG_ON(!PageLocked(page));
    
    	page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    
    	/*
    	 * This silly early PAGE_DIRTY setting removes a race
    	 * due to the bad i386 page protection. But it's valid
    	 * for other architectures too.
    	 *
    
    	 * Note that if FAULT_FLAG_WRITE is set, we either now have
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    	 * an exclusive copy of the page, or this is a shared mapping,
    	 * so we can make it writable and dirty to avoid having to
    	 * handle that later.
    	 */
    	/* Only go through if we didn't race with anybody else... */
    
    	if (likely(pte_same(*page_table, orig_pte))) {
    
    		flush_icache_page(vma, page);
    		entry = mk_pte(page, vma->vm_page_prot);
    
    		if (flags & FAULT_FLAG_WRITE)
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    			entry = maybe_mkwrite(pte_mkdirty(entry), vma);
    		if (anon) {
    
    			inc_mm_counter(mm, anon_rss);
    			page_add_new_anon_rmap(page, vma, address);
    
    			inc_mm_counter(mm, file_rss);
    
    			page_add_file_rmap(page);
    
    			if (flags & FAULT_FLAG_WRITE) {
    
    				get_page(dirty_page);
    			}
    
    		set_pte_at(mm, address, page_table, entry);
    
    
    		/* no need to invalidate: a not-present page won't be cached */
    		update_mmu_cache(vma, address, entry);
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    	} else {
    
    		if (charged)
    			mem_cgroup_uncharge_page(page);
    
    		if (anon)
    			page_cache_release(page);
    		else
    
    			anon = 1; /* no anon but release faulted_page */
    
    	pte_unmap_unlock(page_table, ptl);
    
    	if (dirty_page) {
    		struct address_space *mapping = page->mapping;
    
    		if (set_page_dirty(dirty_page))
    			page_mkwrite = 1;
    		unlock_page(dirty_page);
    
    		put_page(dirty_page);
    
    		if (page_mkwrite && mapping) {
    			/*
    			 * Some device drivers do not set page.mapping but still
    			 * dirty their pages
    			 */
    			balance_dirty_pages_ratelimited(mapping);
    		}
    
    		/* file_update_time outside page_lock */
    		if (vma->vm_file)
    			file_update_time(vma->vm_file);
    	} else {
    		unlock_page(vmf.page);
    		if (anon)
    			page_cache_release(vmf.page);
    
    Nicholas Piggin's avatar
    Nicholas Piggin committed
    	return ret;
    
    
    unwritable_page:
    	page_cache_release(page);
    	return ret;
    
    /*
     * Handle a fault in a linear file mapping: compute the page offset that
     * corresponds to the faulting address, unmap the pte and hand the fault
     * to __do_fault().
     *
     * Returns whatever __do_fault() returns.
     */
    static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
    		unsigned long address, pte_t *page_table, pmd_t *pmd,
    		unsigned int flags, pte_t orig_pte)
    {
    	/* page index within the vma, offset by where the vma starts (vm_pgoff) */
    	pgoff_t pgoff = (((address & PAGE_MASK)
    			- vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;

    	/* __do_fault() takes the pmd and re-maps the pte itself */
    	pte_unmap(page_table);
    	return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);
    }

    /*
     * Fault of a previously existing named mapping. Repopulate the pte
     * from the encoded file_pte if possible. This enables swappable
     * nonlinear vmas.
     *
     * We enter with non-exclusive mmap_sem (to exclude vma changes,
     * but allow concurrent faults), and pte mapped but not yet locked.
     * We return with mmap_sem still held, but pte unmapped and unlocked.
     */
    static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
    		unsigned long address, pte_t *page_table, pmd_t *pmd,
    		unsigned int flags, pte_t orig_pte)
    {
    	pgoff_t pgoff;

    	/* Tell __do_fault() that pgoff comes from the pte, not the address. */
    	flags |= FAULT_FLAG_NONLINEAR;

    	/* The pte changed since orig_pte was read: nothing to do here. */
    	if (!pte_unmap_same(mm, pmd, page_table, orig_pte))
    		return 0;

    	if (unlikely(!(vma->vm_flags & VM_NONLINEAR))) {
    		/*
    		 * Page table corrupted: show pte and kill process.
    		 */
    		print_bad_pte(vma, address, orig_pte, NULL);
    		return VM_FAULT_OOM;
    	}

    	/* The file offset is encoded in the (non-present) pte itself. */
    	pgoff = pte_to_pgoff(orig_pte);
    	return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);
    }
    
    /*
     * These routines also need to handle stuff like marking pages dirty
     * and/or accessed for architectures that don't do it in hardware (most
     * RISC architectures).  The early dirtying is also good on the i386.
     *
     * There is also a hook called "update_mmu_cache()" that architectures
     * with external mmu caches can use to update those (ie the Sparc or
     * PowerPC hashed page tables that act as extended TLBs).
     *
     * We enter with non-exclusive mmap_sem (to exclude vma changes,
     * but allow concurrent faults), and pte mapped but not yet locked.
     * We return with mmap_sem still held, but pte unmapped and unlocked.
     */
    static inline int handle_pte_fault(struct mm_struct *mm,
    		struct vm_area_struct *vma, unsigned long address,
    		pte_t *pte, pmd_t *pmd, unsigned int flags)
    {
    	pte_t entry;
    	spinlock_t *ptl;

    	/*
    	 * Snapshot the pte without the lock; it is compared against *pte
    	 * again once the page table lock is held.
    	 */
    	entry = *pte;
    	if (!pte_present(entry)) {
    		if (pte_none(entry)) {
    			if (vma->vm_ops) {
    				if (likely(vma->vm_ops->fault))
    					return do_linear_fault(mm, vma, address,
    						pte, pmd, flags, entry);
    			}
    			/*
    			 * NOTE(review): the trailing arguments of this call
    			 * were reconstructed; confirm them against the
    			 * do_anonymous_page() prototype.
    			 */
    			return do_anonymous_page(mm, vma, address,
    						 pte, pmd, flags);
    		}
    		if (pte_file(entry))
    			return do_nonlinear_fault(mm, vma, address,
    					pte, pmd, flags, entry);
    		return do_swap_page(mm, vma, address,
    					pte, pmd, flags, entry);
    	}

    	/* Present pte: take the lock and make sure nobody changed it. */
    	ptl = pte_lockptr(mm, pmd);
    	spin_lock(ptl);
    	if (unlikely(!pte_same(*pte, entry)))
    		goto unlock;
    	if (flags & FAULT_FLAG_WRITE) {
    		/* do_wp_page() is handed ptl along with the pte. */
    		if (!pte_write(entry))
    			return do_wp_page(mm, vma, address,
    					pte, pmd, ptl, entry);
    		entry = pte_mkdirty(entry);
    	}
    	entry = pte_mkyoung(entry);
    	if (ptep_set_access_flags(vma, address, pte, entry, flags & FAULT_FLAG_WRITE)) {
    		update_mmu_cache(vma, address, entry);
    	} else {
    		/*
    		 * This is needed only for protection faults but the arch code
    		 * is not yet telling us if this is a protection fault or not.
    		 * This still avoids useless tlb flushes for .text page faults
    		 * with threads.
    		 */
    		if (flags & FAULT_FLAG_WRITE)
    			flush_tlb_page(vma, address);
    	}
    unlock:
    	pte_unmap_unlock(pte, ptl);
    	return 0;
    }
    
    /*
     * By the time we get here, we already hold the mm semaphore
     */
    
    Nicholas Piggin's avatar
    Nicholas Piggin committed
    int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
    
    		unsigned long address, unsigned int flags)
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    {
    	pgd_t *pgd;
    	pud_t *pud;
    	pmd_t *pmd;
    	pte_t *pte;
    
    	__set_current_state(TASK_RUNNING);
    
    
    	count_vm_event(PGFAULT);
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    
    
    	if (unlikely(is_vm_hugetlb_page(vma)))
    
    		return hugetlb_fault(mm, vma, address, flags);
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    
    	pgd = pgd_offset(mm, address);
    	pud = pud_alloc(mm, pgd, address);
    	if (!pud)
    
    		return VM_FAULT_OOM;
    
    Linus Torvalds's avatar
    Linus Torvalds committed
    	pmd = pmd_alloc(mm, pud, address);
    	if (!pmd)
    
    		return VM_FAULT_OOM;