Newer
Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
/*
* linux/mm/memory.c
*
* Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
*/
/*
* demand-loading started 01.12.91 - seems it is high on the list of
* things wanted, and it should be easy to implement. - Linus
*/
/*
* Ok, demand-loading was easy, shared pages a little bit tricker. Shared
* pages started 02.12.91, seems to work. - Linus.
*
* Tested sharing by executing about 30 /bin/sh: under the old kernel it
* would have taken more than the 6M I have free, but it worked well as
* far as I could see.
*
* Also corrected some "invalidate()"s - I wasn't doing enough of them.
*/
/*
* Real VM (paging to/from disk) started 18.12.91. Much more work and
* thought has to go into this. Oh, well..
* 19.12.91 - works, somewhat. Sometimes I get faults, don't know why.
* Found it. Everything seems to work now.
* 20.12.91 - Ok, making the swap-device changeable like the root.
*/
/*
* 05.04.94 - Multi-page memory management added for v1.1.
* Idea by Alex Bligh (alex@cconcepts.co.uk)
*
* 16.07.99 - Support of BIGMEM added by Gerhard Wichert, Siemens AG
* (Gerhard.Wichert@pdb.siemens.de)
*
* Aug/Sep 2004 Changed to four level page tables (Andi Kleen)
*/
#include <linux/kernel_stat.h>
#include <linux/mm.h>
#include <linux/hugetlb.h>
#include <linux/mman.h>
#include <linux/swap.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>
#include <linux/export.h>

Shailabh Nagar
committed
#include <linux/delayacct.h>
#include <linux/kallsyms.h>
#include <linux/swapops.h>
#include <linux/elf.h>
#include <linux/gfp.h>
#include <asm/pgalloc.h>
#include <asm/uaccess.h>
#include <asm/tlb.h>
#include <asm/tlbflush.h>
#include <asm/pgtable.h>
#include "internal.h"
#ifdef LAST_NID_NOT_IN_PAGE_FLAGS
#warning Unfortunate NUMA and NUMA Balancing config, growing page-frame for last_nid.
#endif
/* use the per-pgdat data instead for discontigmem - mbligh */
unsigned long max_mapnr;
struct page *mem_map;
EXPORT_SYMBOL(max_mapnr);
EXPORT_SYMBOL(mem_map);
#endif
/*
* A number of key systems in x86 including ioremap() rely on the assumption
* that high_memory defines the upper bound on direct map memory, then end
* of ZONE_NORMAL. Under CONFIG_DISCONTIG this means that max_low_pfn and
* highstart_pfn must be the same; there must be no gap between ZONE_NORMAL
* and ZONE_HIGHMEM.
*/
void * high_memory;
EXPORT_SYMBOL(high_memory);
/*
* Randomize the address space (stacks, mmaps, brk, etc.).
*
* ( When CONFIG_COMPAT_BRK=y we exclude brk from randomization,
* as ancient (libc5 based) binaries can segfault. )
*/
int randomize_va_space __read_mostly =
#ifdef CONFIG_COMPAT_BRK
1;
#else
2;
#endif
static int __init disable_randmaps(char *s)
{
randomize_va_space = 0;
}
__setup("norandmaps", disable_randmaps);
unsigned long highest_memmap_pfn __read_mostly;
/*
* CONFIG_MMU architectures set up ZERO_PAGE in their paging_init()
*/
static int __init init_zero_pfn(void)
{
zero_pfn = page_to_pfn(ZERO_PAGE(0));
return 0;
}
core_initcall(init_zero_pfn);
#if defined(SPLIT_RSS_COUNTING)
void sync_mm_rss(struct mm_struct *mm)
{
int i;
for (i = 0; i < NR_MM_COUNTERS; i++) {

David Rientjes
committed
if (current->rss_stat.count[i]) {
add_mm_counter(mm, i, current->rss_stat.count[i]);
current->rss_stat.count[i] = 0;

David Rientjes
committed
current->rss_stat.events = 0;
}
static void add_mm_counter_fast(struct mm_struct *mm, int member, int val)
{
struct task_struct *task = current;
if (likely(task->mm == mm))
task->rss_stat.count[member] += val;
else
add_mm_counter(mm, member, val);
}
#define inc_mm_counter_fast(mm, member) add_mm_counter_fast(mm, member, 1)
#define dec_mm_counter_fast(mm, member) add_mm_counter_fast(mm, member, -1)
/* sync counter once per 64 page faults */
#define TASK_RSS_EVENTS_THRESH (64)
static void check_sync_rss_stat(struct task_struct *task)
{
if (unlikely(task != current))
return;
if (unlikely(task->rss_stat.events++ > TASK_RSS_EVENTS_THRESH))
sync_mm_rss(task->mm);
#else /* SPLIT_RSS_COUNTING */
#define inc_mm_counter_fast(mm, member) inc_mm_counter(mm, member)
#define dec_mm_counter_fast(mm, member) dec_mm_counter(mm, member)
static void check_sync_rss_stat(struct task_struct *task)
{
}
#endif /* SPLIT_RSS_COUNTING */
#ifdef HAVE_GENERIC_MMU_GATHER
static int tlb_next_batch(struct mmu_gather *tlb)
{
struct mmu_gather_batch *batch;
batch = tlb->active;
if (batch->next) {
tlb->active = batch->next;
return 1;
}
if (tlb->batch_count == MAX_GATHER_BATCH_COUNT)
return 0;
batch = (void *)__get_free_pages(GFP_NOWAIT | __GFP_NOWARN, 0);
if (!batch)
return 0;
tlb->batch_count++;
batch->next = NULL;
batch->nr = 0;
batch->max = MAX_GATHER_BATCH;
tlb->active->next = batch;
tlb->active = batch;
return 1;
}
/* tlb_gather_mmu
* Called to initialize an (on-stack) mmu_gather structure for page-table
* tear-down from @mm. The @fullmm argument is used when @mm is without
* users and we're going to destroy the full address space (exit/execve).
*/
void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, bool fullmm)
{
tlb->mm = mm;
tlb->fullmm = fullmm;
tlb->need_flush_all = 0;
tlb->start = -1UL;
tlb->end = 0;
tlb->need_flush = 0;
tlb->local.next = NULL;
tlb->local.nr = 0;
tlb->local.max = ARRAY_SIZE(tlb->__pages);
tlb->active = &tlb->local;
tlb->batch_count = 0;
#ifdef CONFIG_HAVE_RCU_TABLE_FREE
tlb->batch = NULL;
#endif
}
void tlb_flush_mmu(struct mmu_gather *tlb)
{
struct mmu_gather_batch *batch;
if (!tlb->need_flush)
return;
tlb->need_flush = 0;
tlb_flush(tlb);
#ifdef CONFIG_HAVE_RCU_TABLE_FREE
tlb_table_flush(tlb);
for (batch = &tlb->local; batch; batch = batch->next) {
free_pages_and_swap_cache(batch->pages, batch->nr);
batch->nr = 0;
}
tlb->active = &tlb->local;
}
/* tlb_finish_mmu
* Called at the end of the shootdown operation to free up any resources
* that were required.
*/
void tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end)
{
struct mmu_gather_batch *batch, *next;
tlb->start = start;
tlb->end = end;
tlb_flush_mmu(tlb);
/* keep the page table cache within bounds */
check_pgt_cache();
for (batch = tlb->local.next; batch; batch = next) {
next = batch->next;
free_pages((unsigned long)batch, 0);
}
tlb->local.next = NULL;
}
/* __tlb_remove_page
* Must perform the equivalent to __free_pte(pte_get_and_clear(ptep)), while
* handling the additional races in SMP caused by other CPUs caching valid
* mappings in their TLBs. Returns the number of free page slots left.
* When out of page slots we must call tlb_flush_mmu().
*/
int __tlb_remove_page(struct mmu_gather *tlb, struct page *page)
{
struct mmu_gather_batch *batch;
batch = tlb->active;
batch->pages[batch->nr++] = page;
if (batch->nr == batch->max) {
if (!tlb_next_batch(tlb))
return 0;
}
VM_BUG_ON(batch->nr > batch->max);
return batch->max - batch->nr;
}
#endif /* HAVE_GENERIC_MMU_GATHER */
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
#ifdef CONFIG_HAVE_RCU_TABLE_FREE
/*
* See the comment near struct mmu_table_batch.
*/
static void tlb_remove_table_smp_sync(void *arg)
{
/* Simply deliver the interrupt */
}
static void tlb_remove_table_one(void *table)
{
/*
* This isn't an RCU grace period and hence the page-tables cannot be
* assumed to be actually RCU-freed.
*
* It is however sufficient for software page-table walkers that rely on
* IRQ disabling. See the comment near struct mmu_table_batch.
*/
smp_call_function(tlb_remove_table_smp_sync, NULL, 1);
__tlb_remove_table(table);
}
static void tlb_remove_table_rcu(struct rcu_head *head)
{
struct mmu_table_batch *batch;
int i;
batch = container_of(head, struct mmu_table_batch, rcu);
for (i = 0; i < batch->nr; i++)
__tlb_remove_table(batch->tables[i]);
free_page((unsigned long)batch);
}
void tlb_table_flush(struct mmu_gather *tlb)
{
struct mmu_table_batch **batch = &tlb->batch;
if (*batch) {
call_rcu_sched(&(*batch)->rcu, tlb_remove_table_rcu);
*batch = NULL;
}
}
void tlb_remove_table(struct mmu_gather *tlb, void *table)
{
struct mmu_table_batch **batch = &tlb->batch;
tlb->need_flush = 1;
/*
* When there's less then two users of this mm there cannot be a
* concurrent page-table walk.
*/
if (atomic_read(&tlb->mm->mm_users) < 2) {
__tlb_remove_table(table);
return;
}
if (*batch == NULL) {
*batch = (struct mmu_table_batch *)__get_free_page(GFP_NOWAIT | __GFP_NOWARN);
if (*batch == NULL) {
tlb_remove_table_one(table);
return;
}
(*batch)->nr = 0;
}
(*batch)->tables[(*batch)->nr++] = table;
if ((*batch)->nr == MAX_TABLE_BATCH)
tlb_table_flush(tlb);
}
#endif /* CONFIG_HAVE_RCU_TABLE_FREE */
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
/*
* If a p?d_bad entry is found while walking page tables, report
* the error, before resetting entry to p?d_none. Usually (but
* very seldom) called out from the p?d_none_or_clear_bad macros.
*/
void pgd_clear_bad(pgd_t *pgd)
{
pgd_ERROR(*pgd);
pgd_clear(pgd);
}
void pud_clear_bad(pud_t *pud)
{
pud_ERROR(*pud);
pud_clear(pud);
}
void pmd_clear_bad(pmd_t *pmd)
{
pmd_ERROR(*pmd);
pmd_clear(pmd);
}
/*
* Note: this doesn't free the actual pages themselves. That
* has been handled earlier when unmapping all the memory regions.
*/
static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd,
unsigned long addr)
pgtable_t token = pmd_pgtable(*pmd);
pte_free_tlb(tlb, token, addr);
static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
unsigned long addr, unsigned long end,
unsigned long floor, unsigned long ceiling)
pmd = pmd_offset(pud, addr);
do {
next = pmd_addr_end(addr, end);
if (pmd_none_or_clear_bad(pmd))
continue;
free_pte_range(tlb, pmd, addr);
start &= PUD_MASK;
if (start < floor)
return;
if (ceiling) {
ceiling &= PUD_MASK;
if (!ceiling)
return;
if (end - 1 > ceiling - 1)
return;
pmd = pmd_offset(pud, start);
pud_clear(pud);
pmd_free_tlb(tlb, pmd, start);
static inline void free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
unsigned long addr, unsigned long end,
unsigned long floor, unsigned long ceiling)
pud = pud_offset(pgd, addr);
do {
next = pud_addr_end(addr, end);
if (pud_none_or_clear_bad(pud))
continue;
free_pmd_range(tlb, pud, addr, next, floor, ceiling);
start &= PGDIR_MASK;
if (start < floor)
return;
if (ceiling) {
ceiling &= PGDIR_MASK;
if (!ceiling)
return;
if (end - 1 > ceiling - 1)
return;
pud = pud_offset(pgd, start);
pgd_clear(pgd);
pud_free_tlb(tlb, pud, start);
* This function frees user-level page tables of a process.
*
void free_pgd_range(struct mmu_gather *tlb,
unsigned long addr, unsigned long end,
unsigned long floor, unsigned long ceiling)
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
/*
* The next few lines have given us lots of grief...
*
* Why are we testing PMD* at this top level? Because often
* there will be no work to do at all, and we'd prefer not to
* go all the way down to the bottom just to discover that.
*
* Why all these "- 1"s? Because 0 represents both the bottom
* of the address space and the top of it (using -1 for the
* top wouldn't help much: the masks would do the wrong thing).
* The rule is that addr 0 and floor 0 refer to the bottom of
* the address space, but end 0 and ceiling 0 refer to the top
* Comparisons need to use "end - 1" and "ceiling - 1" (though
* that end 0 case should be mythical).
*
* Wherever addr is brought up or ceiling brought down, we must
* be careful to reject "the opposite 0" before it confuses the
* subsequent tests. But what about where end is brought down
* by PMD_SIZE below? no, end can't go down to 0 there.
*
* Whereas we round start (addr) and ceiling down, by different
* masks at different levels, in order to test whether a table
* now has no other vmas using it, so can be freed, we don't
* bother to round floor or end up - the tests don't need that.
*/
addr &= PMD_MASK;
if (addr < floor) {
addr += PMD_SIZE;
if (!addr)
return;
}
if (ceiling) {
ceiling &= PMD_MASK;
if (!ceiling)
return;
}
if (end - 1 > ceiling - 1)
end -= PMD_SIZE;
if (addr > end - 1)
return;
pgd = pgd_offset(tlb->mm, addr);
do {
next = pgd_addr_end(addr, end);
if (pgd_none_or_clear_bad(pgd))
continue;
free_pud_range(tlb, pgd, addr, next, floor, ceiling);
void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma,
unsigned long floor, unsigned long ceiling)
{
while (vma) {
struct vm_area_struct *next = vma->vm_next;
unsigned long addr = vma->vm_start;
* Hide vma from rmap and truncate_pagecache before freeing
* pgtables
unlink_anon_vmas(vma);
if (is_vm_hugetlb_page(vma)) {
hugetlb_free_pgd_range(tlb, addr, vma->vm_end,
floor, next? next->vm_start: ceiling);
} else {
/*
* Optimization: gather nearby vmas into one call down
*/
while (next && next->vm_start <= vma->vm_end + PMD_SIZE
&& !is_vm_hugetlb_page(next)) {
vma = next;
next = vma->vm_next;
unlink_anon_vmas(vma);
}
free_pgd_range(tlb, addr, vma->vm_end,
floor, next? next->vm_start: ceiling);
}
int __pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
pmd_t *pmd, unsigned long address)
pgtable_t new = pte_alloc_one(mm, address);
/*
* Ensure all pte setup (eg. pte page lock and page clearing) are
* visible before the pte is made visible to other CPUs by being
* put into page tables.
*
* The other side of the story is the pointer chasing in the page
* table walking code (when walking the page table without locking;
* ie. most of the time). Fortunately, these data accesses consist
* of a chain of data-dependent loads, meaning most CPUs (alpha
* being the notable exception) will already guarantee loads are
* seen in-order. See the alpha page table accessors for the
* smp_read_barrier_depends() barriers in page table walking code.
*/
smp_wmb(); /* Could be smp_wmb__xxx(before|after)_spin_lock */
wait_split_huge_page = 0;
if (likely(pmd_none(*pmd))) { /* Has another populated it ? */
} else if (unlikely(pmd_trans_splitting(*pmd)))
wait_split_huge_page = 1;
if (new)
pte_free(mm, new);
if (wait_split_huge_page)
wait_split_huge_page(vma->anon_vma, pmd);
int __pte_alloc_kernel(pmd_t *pmd, unsigned long address)
pte_t *new = pte_alloc_one_kernel(&init_mm, address);
if (!new)
return -ENOMEM;
smp_wmb(); /* See comment in __pte_alloc */
spin_lock(&init_mm.page_table_lock);
if (likely(pmd_none(*pmd))) { /* Has another populated it ? */
pmd_populate_kernel(&init_mm, pmd, new);
} else
VM_BUG_ON(pmd_trans_splitting(*pmd));
spin_unlock(&init_mm.page_table_lock);
if (new)
pte_free_kernel(&init_mm, new);
static inline void init_rss_vec(int *rss)
{
memset(rss, 0, sizeof(int) * NR_MM_COUNTERS);
}
static inline void add_mm_rss_vec(struct mm_struct *mm, int *rss)

David Rientjes
committed
sync_mm_rss(mm);
for (i = 0; i < NR_MM_COUNTERS; i++)
if (rss[i])
add_mm_counter(mm, i, rss[i]);
* This function is called to print an error when a bad pte
* is found. For example, we might have a PFN-mapped pte in
* a region that doesn't allow it.
*
* The calling function must still handle the error.
*/
static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr,
pte_t pte, struct page *page)
pgd_t *pgd = pgd_offset(vma->vm_mm, addr);
pud_t *pud = pud_offset(pgd, addr);
pmd_t *pmd = pmd_offset(pud, addr);
struct address_space *mapping;
pgoff_t index;
static unsigned long resume;
static unsigned long nr_shown;
static unsigned long nr_unshown;
/*
* Allow a burst of 60 reports, then keep quiet for that minute;
* or allow a steady drip of one report per second.
*/
if (nr_shown == 60) {
if (time_before(jiffies, resume)) {
nr_unshown++;
return;
}
if (nr_unshown) {
printk(KERN_ALERT
"BUG: Bad page map: %lu messages suppressed\n",
nr_unshown);
nr_unshown = 0;
}
nr_shown = 0;
}
if (nr_shown++ == 0)
resume = jiffies + 60 * HZ;
mapping = vma->vm_file ? vma->vm_file->f_mapping : NULL;
index = linear_page_index(vma, addr);
printk(KERN_ALERT
"BUG: Bad page map in process %s pte:%08llx pmd:%08llx\n",
current->comm,
(long long)pte_val(pte), (long long)pmd_val(*pmd));
if (page)
dump_page(page);
"addr:%p vm_flags:%08lx anon_vma:%p mapping:%p index:%lx\n",
(void *)addr, vma->vm_flags, vma->anon_vma, mapping, index);
/*
* Choose text because data symbols depend on CONFIG_KALLSYMS_ALL=y
*/
if (vma->vm_ops)
printk(KERN_ALERT "vma->vm_ops->fault: %pSR\n",
vma->vm_ops->fault);
if (vma->vm_file && vma->vm_file->f_op)
printk(KERN_ALERT "vma->vm_file->f_op->mmap: %pSR\n",
vma->vm_file->f_op->mmap);
add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
static inline bool is_cow_mapping(vm_flags_t flags)
{
return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
}
* vm_normal_page -- This function gets the "struct page" associated with a pte.
* "Special" mappings do not wish to be associated with a "struct page" (either
* it doesn't exist, or it exists but they don't want to touch it). In this
* case, NULL is returned here. "Normal" mappings do have a struct page.
* There are 2 broad cases. Firstly, an architecture may define a pte_special()
* pte bit, in which case this function is trivial. Secondly, an architecture
* may not have a spare pte bit, which requires a more complicated scheme,
* described below.
*
* A raw VM_PFNMAP mapping (ie. one that is not COWed) is always considered a
* special mapping (even if there are underlying and valid "struct pages").
* COWed pages of a VM_PFNMAP are always normal.
* The way we recognize COWed pages within VM_PFNMAP mappings is through the
* rules set up by "remap_pfn_range()": the vma will have the VM_PFNMAP bit
* set, and the vm_pgoff will point to the first PFN mapped: thus every special
* mapping will always honor the rule
*
* pfn_of_page == vma->vm_pgoff + ((addr - vma->vm_start) >> PAGE_SHIFT)
*
* And for normal mappings this is false.
*
* This restricts such mappings to be a linear translation from virtual address
* to pfn. To get around this restriction, we allow arbitrary mappings so long
* as the vma is not a COW mapping; in that case, we know that all ptes are
* special (because none can have been COWed).
* In order to support COW of arbitrary special mappings, we have VM_MIXEDMAP.
*
* VM_MIXEDMAP mappings can likewise contain memory with or without "struct
* page" backing, however the difference is that _all_ pages with a struct
* page (that is, those where pfn_valid is true) are refcounted and considered
* normal pages by the VM. The disadvantage is that pages are refcounted
* (which can be slower and simply not an option for some PFNMAP users). The
* advantage is that we don't have to follow the strict linearity rule of
* PFNMAP mappings in order to support COWable mappings.
*
#ifdef __HAVE_ARCH_PTE_SPECIAL
# define HAVE_PTE_SPECIAL 1
#else
# define HAVE_PTE_SPECIAL 0
#endif
struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
pte_t pte)
unsigned long pfn = pte_pfn(pte);
if (likely(!pte_special(pte)))
goto check_pfn;
if (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP))
return NULL;
print_bad_pte(vma, addr, pte, NULL);
return NULL;
}
/* !HAVE_PTE_SPECIAL case follows: */
if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) {
if (vma->vm_flags & VM_MIXEDMAP) {
if (!pfn_valid(pfn))
return NULL;
goto out;
} else {
unsigned long off;
off = (addr - vma->vm_start) >> PAGE_SHIFT;
if (pfn == vma->vm_pgoff + off)
return NULL;
if (!is_cow_mapping(vma->vm_flags))
return NULL;
}
check_pfn:
if (unlikely(pfn > highest_memmap_pfn)) {
print_bad_pte(vma, addr, pte, NULL);
return NULL;
}
* NOTE! We still have PageReserved() pages in the page tables.
* eg. VDSO mappings can cause them to exist.
/*
* copy one vm_area from one task to the other. Assumes the page tables
* already present in the new task to be cleared in the whole range
* covered by this vma.
*/
copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *vma,
unsigned long vm_flags = vma->vm_flags;
pte_t pte = *src_pte;
struct page *page;
/* pte contains position in swap or file, so copy. */
if (unlikely(!pte_present(pte))) {
if (!pte_file(pte)) {
swp_entry_t entry = pte_to_swp_entry(pte);
if (swap_duplicate(entry) < 0)
return entry.val;
/* make sure dst_mm is on swapoff's mmlist. */
if (unlikely(list_empty(&dst_mm->mmlist))) {
spin_lock(&mmlist_lock);
if (list_empty(&dst_mm->mmlist))
list_add(&dst_mm->mmlist,
&src_mm->mmlist);
if (likely(!non_swap_entry(entry)))
rss[MM_SWAPENTS]++;
else if (is_migration_entry(entry)) {
page = migration_entry_to_page(entry);
if (PageAnon(page))
rss[MM_ANONPAGES]++;
else
rss[MM_FILEPAGES]++;
if (is_write_migration_entry(entry) &&
is_cow_mapping(vm_flags)) {
/*
* COW mappings require pages in both
* parent and child to be set to read.
*/
make_migration_entry_read(&entry);
pte = swp_entry_to_pte(entry);
set_pte_at(src_mm, addr, src_pte, pte);
}
}
/*
* If it's a COW mapping, write protect it both
* in the parent and the child
*/
if (is_cow_mapping(vm_flags)) {
pte = pte_wrprotect(pte);
}
/*
* If it's a shared mapping, mark it clean in
* the child
*/
if (vm_flags & VM_SHARED)
pte = pte_mkclean(pte);
pte = pte_mkold(pte);
page = vm_normal_page(vma, addr, pte);
if (page) {
get_page(page);
if (PageAnon(page))
rss[MM_ANONPAGES]++;
else
rss[MM_FILEPAGES]++;
out_set_pte:
set_pte_at(dst_mm, addr, dst_pte, pte);
int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
pmd_t *dst_pmd, pmd_t *src_pmd, struct vm_area_struct *vma,
unsigned long addr, unsigned long end)
pte_t *orig_src_pte, *orig_dst_pte;
swp_entry_t entry = (swp_entry_t){0};
dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl);
src_ptl = pte_lockptr(src_mm, src_pmd);
spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
orig_src_pte = src_pte;
orig_dst_pte = dst_pte;
arch_enter_lazy_mmu_mode();
do {
/*
* We are holding two locks at this point - either of them
* could generate latencies in another task on another CPU.
*/
if (progress >= 32) {
progress = 0;
if (need_resched() ||
spin_needbreak(src_ptl) || spin_needbreak(dst_ptl))
if (pte_none(*src_pte)) {
progress++;
continue;
}
entry.val = copy_one_pte(dst_mm, src_mm, dst_pte, src_pte,
vma, addr, rss);
if (entry.val)
break;
progress += 8;
} while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end);
arch_leave_lazy_mmu_mode();
pte_unmap_unlock(orig_dst_pte, dst_ptl);
if (entry.val) {
if (add_swap_count_continuation(entry, GFP_KERNEL) < 0)
return -ENOMEM;
progress = 0;
}
if (addr != end)
goto again;
return 0;
}
static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
pud_t *dst_pud, pud_t *src_pud, struct vm_area_struct *vma,
unsigned long addr, unsigned long end)
{
pmd_t *src_pmd, *dst_pmd;
unsigned long next;
dst_pmd = pmd_alloc(dst_mm, dst_pud, addr);
if (!dst_pmd)
return -ENOMEM;
src_pmd = pmd_offset(src_pud, addr);
do {
next = pmd_addr_end(addr, end);
if (pmd_trans_huge(*src_pmd)) {
int err;
VM_BUG_ON(next-addr != HPAGE_PMD_SIZE);
err = copy_huge_pmd(dst_mm, src_mm,
dst_pmd, src_pmd, addr, vma);
if (err == -ENOMEM)
return -ENOMEM;
if (!err)
continue;
/* fall through */
}
if (pmd_none_or_clear_bad(src_pmd))
continue;
if (copy_pte_range(dst_mm, src_mm, dst_pmd, src_pmd,
vma, addr, next))
return -ENOMEM;
} while (dst_pmd++, src_pmd++, addr = next, addr != end);
return 0;
}
static inline int copy_pud_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,