Newer
Older
#include <linux/slab.h>
#include <linux/pagemap.h>
#include <linux/mempolicy.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/mmu_notifier.h>
void task_mem(struct seq_file *m, struct mm_struct *mm)
unsigned long hiwater_vm, total_vm, hiwater_rss, total_rss;
/*
* Note: to minimize their overhead, mm maintains hiwater_vm and
* hiwater_rss only when about to *lower* total_vm or rss. Any
* collector of these hiwater stats must therefore get total_vm
* and rss too, which will usually be the higher. Barriers? not
* worth the effort, such snapshots can always be inconsistent.
*/
hiwater_vm = total_vm = mm->total_vm;
if (hiwater_vm < mm->hiwater_vm)
hiwater_vm = mm->hiwater_vm;
hiwater_rss = total_rss = get_mm_rss(mm);
if (hiwater_rss < mm->hiwater_rss)
hiwater_rss = mm->hiwater_rss;
data = mm->total_vm - mm->shared_vm - mm->stack_vm;
text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK)) >> 10;
lib = (mm->exec_vm << (PAGE_SHIFT-10)) - text;
seq_printf(m,
"VmRSS:\t%8lu kB\n"
"VmData:\t%8lu kB\n"
"VmStk:\t%8lu kB\n"
"VmExe:\t%8lu kB\n"
"VmLib:\t%8lu kB\n"
total_vm << (PAGE_SHIFT-10),
mm->pinned_vm << (PAGE_SHIFT-10),
hiwater_rss << (PAGE_SHIFT-10),
total_rss << (PAGE_SHIFT-10),
data << (PAGE_SHIFT-10),
mm->stack_vm << (PAGE_SHIFT-10), text, lib,
(PTRS_PER_PTE*sizeof(pte_t)*mm->nr_ptes) >> 10,
swap << (PAGE_SHIFT-10));
}
unsigned long task_vsize(struct mm_struct *mm)
{
return PAGE_SIZE * mm->total_vm;
}
unsigned long task_statm(struct mm_struct *mm,
unsigned long *shared, unsigned long *text,
unsigned long *data, unsigned long *resident)
*text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK))
>> PAGE_SHIFT;
*data = mm->total_vm - mm->shared_vm;
*resident = *shared + get_mm_counter(mm, MM_ANONPAGES);
return mm->total_vm;
}
static void pad_len_spaces(struct seq_file *m, int len)
{
len = 25 + sizeof(void*) * 6 - len;
if (len < 1)
len = 1;
seq_printf(m, "%*c", len, ' ');
}
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
#ifdef CONFIG_NUMA
/*
* These functions are for numa_maps but called in generic **maps seq_file
* ->start(), ->stop() ops.
*
* numa_maps scans all vmas under mmap_sem and checks their mempolicy.
* Each mempolicy object is controlled by reference counting. The problem here
* is how to avoid accessing dead mempolicy object.
*
* Because we're holding mmap_sem while reading seq_file, it's safe to access
* each vma's mempolicy, no vma objects will never drop refs to mempolicy.
*
* A task's mempolicy (task->mempolicy) has different behavior. task->mempolicy
* is set and replaced under mmap_sem but unrefed and cleared under task_lock().
* So, without task_lock(), we cannot trust get_vma_policy() because we cannot
* gurantee the task never exits under us. But taking task_lock() around
* get_vma_plicy() causes lock order problem.
*
* To access task->mempolicy without lock, we hold a reference count of an
* object pointed by task->mempolicy and remember it. This will guarantee
* that task->mempolicy points to an alive object or NULL in numa_maps accesses.
*/
static void hold_task_mempolicy(struct proc_maps_private *priv)
{
struct task_struct *task = priv->task;
task_lock(task);
priv->task_mempolicy = task->mempolicy;
mpol_get(priv->task_mempolicy);
task_unlock(task);
}
static void release_task_mempolicy(struct proc_maps_private *priv)
{
mpol_put(priv->task_mempolicy);
}
#else
static void hold_task_mempolicy(struct proc_maps_private *priv)
{
}
static void release_task_mempolicy(struct proc_maps_private *priv)
{
}
#endif
static void vma_stop(struct proc_maps_private *priv, struct vm_area_struct *vma)
{
if (vma && vma != priv->tail_vma) {
struct mm_struct *mm = vma->vm_mm;
release_task_mempolicy(priv);
up_read(&mm->mmap_sem);
mmput(mm);
}
}
static void *m_start(struct seq_file *m, loff_t *pos)
struct proc_maps_private *priv = m->private;
unsigned long last_addr = m->version;
struct mm_struct *mm;
struct vm_area_struct *vma, *tail_vma = NULL;
loff_t l = *pos;
/* Clear the per syscall fields in priv */
priv->task = NULL;
priv->tail_vma = NULL;
/*
* We remember last_addr rather than next_addr to hit with
* mmap_cache most of the time. We have zero last_addr at
* the beginning and also after lseek. We will have -1 last_addr
* after the end of the vmas.
*/
if (last_addr == -1UL)
return NULL;
priv->task = get_pid_task(priv->pid, PIDTYPE_PID);
if (!priv->task)
down_read(&mm->mmap_sem);
tail_vma = get_gate_vma(priv->task->mm);
hold_task_mempolicy(priv);
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
/* Start with last addr hint */
vma = find_vma(mm, last_addr);
if (last_addr && vma) {
vma = vma->vm_next;
goto out;
}
/*
* Check the vma index is within the range and do
* sequential scan until m_index.
*/
vma = NULL;
if ((unsigned long)l < mm->map_count) {
vma = mm->mmap;
while (l-- && vma)
vma = vma->vm_next;
goto out;
}
if (l != mm->map_count)
tail_vma = NULL; /* After gate vma */
out:
if (vma)
return vma;
release_task_mempolicy(priv);
/* End of vmas has been reached */
m->version = (tail_vma != NULL)? 0: -1UL;
up_read(&mm->mmap_sem);
mmput(mm);
return tail_vma;
}
static void *m_next(struct seq_file *m, void *v, loff_t *pos)
{
struct proc_maps_private *priv = m->private;
struct vm_area_struct *vma = v;
struct vm_area_struct *tail_vma = priv->tail_vma;
(*pos)++;
if (vma && (vma != tail_vma) && vma->vm_next)
return vma->vm_next;
vma_stop(priv, vma);
return (vma != tail_vma)? tail_vma: NULL;
}
static void m_stop(struct seq_file *m, void *v)
{
struct proc_maps_private *priv = m->private;
struct vm_area_struct *vma = v;
if (!IS_ERR(vma))
vma_stop(priv, vma);
if (priv->task)
put_task_struct(priv->task);
}
static int do_maps_open(struct inode *inode, struct file *file,
const struct seq_operations *ops)
{
struct proc_maps_private *priv;
int ret = -ENOMEM;
priv = kzalloc(sizeof(*priv), GFP_KERNEL);
if (priv) {
priv->pid = proc_pid(inode);
ret = seq_open(file, ops);
if (!ret) {
struct seq_file *m = file->private_data;
m->private = priv;
} else {
kfree(priv);
}
}
return ret;
}
static void
show_map_vma(struct seq_file *m, struct vm_area_struct *vma, int is_pid)
struct mm_struct *mm = vma->vm_mm;
struct file *file = vma->vm_file;
struct proc_maps_private *priv = m->private;
struct task_struct *task = priv->task;
unsigned long long pgoff = 0;
unsigned long start, end;
const char *name = NULL;
pgoff = ((loff_t)vma->vm_pgoff) << PAGE_SHIFT;
/* We don't show the stack guard page in /proc/maps */
start = vma->vm_start;
if (stack_guard_page_start(vma, start))
start += PAGE_SIZE;
end = vma->vm_end;
if (stack_guard_page_end(vma, end))
end -= PAGE_SIZE;
seq_printf(m, "%08lx-%08lx %c%c%c%c %08llx %02x:%02x %lu %n",
flags & VM_READ ? 'r' : '-',
flags & VM_WRITE ? 'w' : '-',
flags & VM_EXEC ? 'x' : '-',
flags & VM_MAYSHARE ? 's' : 'p',
MAJOR(dev), MINOR(dev), ino, &len);
/*
* Print the dentry name for named mappings, and a
* special [heap] marker for the heap:
*/
seq_path(m, &file->f_path, "\n");
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
goto done;
}
name = arch_vma_name(vma);
if (!name) {
pid_t tid;
if (!mm) {
name = "[vdso]";
goto done;
}
if (vma->vm_start <= mm->brk &&
vma->vm_end >= mm->start_brk) {
name = "[heap]";
goto done;
}
tid = vm_is_stack(task, vma, is_pid);
if (tid != 0) {
/*
* Thread stack in /proc/PID/task/TID/maps or
* the main process stack.
*/
if (!is_pid || (vma->vm_start <= mm->start_stack &&
vma->vm_end >= mm->start_stack)) {
name = "[stack]";
/* Thread stack in /proc/PID/maps */
pad_len_spaces(m, len);
seq_printf(m, "[stack:%d]", tid);
}
done:
if (name) {
pad_len_spaces(m, len);
seq_puts(m, name);
static int show_map(struct seq_file *m, void *v, int is_pid)
{
struct vm_area_struct *vma = v;
struct proc_maps_private *priv = m->private;
struct task_struct *task = priv->task;
show_map_vma(m, vma, is_pid);
if (m->count < m->size) /* vma is copied successfully */
m->version = (vma != get_gate_vma(task->mm))
? vma->vm_start : 0;
static int show_pid_map(struct seq_file *m, void *v)
{
return show_map(m, v, 1);
}
static int show_tid_map(struct seq_file *m, void *v)
{
return show_map(m, v, 0);
}
static const struct seq_operations proc_pid_maps_op = {
.start = m_start,
.next = m_next,
.stop = m_stop,
.show = show_pid_map
};
static const struct seq_operations proc_tid_maps_op = {
.start = m_start,
.next = m_next,
.stop = m_stop,
.show = show_tid_map
static int pid_maps_open(struct inode *inode, struct file *file)
{
return do_maps_open(inode, file, &proc_pid_maps_op);
}
static int tid_maps_open(struct inode *inode, struct file *file)
{
return do_maps_open(inode, file, &proc_tid_maps_op);
}
const struct file_operations proc_pid_maps_operations = {
.open = pid_maps_open,
.read = seq_read,
.llseek = seq_lseek,
.release = seq_release_private,
};
const struct file_operations proc_tid_maps_operations = {
.open = tid_maps_open,
.read = seq_read,
.llseek = seq_lseek,
.release = seq_release_private,
};
/*
* Proportional Set Size(PSS): my share of RSS.
*
* PSS of a process is the count of pages it has in memory, where each
* page is divided by the number of processes sharing it. So if a
* process has 1000 pages all to itself, and 1000 shared with one other
* process, its PSS will be 1500.
*
* To keep (accumulated) division errors low, we adopt a 64bit
* fixed-point pss counter to minimize division errors. So (pss >>
* PSS_SHIFT) would be the real byte count.
*
* A shift of 12 before division means (assuming 4K page size):
* - 1M 3-user-pages add up to 8KB errors;
* - supports mapcount up to 2^24, or 16M;
* - supports PSS up to 2^52 bytes, or 4PB.
*/
#define PSS_SHIFT 12
#ifdef CONFIG_PROC_PAGE_MONITOR
struct vm_area_struct *vma;
unsigned long resident;
unsigned long shared_clean;
unsigned long shared_dirty;
unsigned long private_clean;
unsigned long private_dirty;
unsigned long referenced;
unsigned long anonymous;
unsigned long anonymous_thp;
unsigned long nonlinear;
static void smaps_pte_entry(pte_t ptent, unsigned long addr,
unsigned long ptent_size, struct mm_walk *walk)
{
struct mem_size_stats *mss = walk->private;
struct vm_area_struct *vma = mss->vma;
pgoff_t pgoff = linear_page_index(vma, addr);
struct page *page = NULL;
int mapcount;
if (pte_present(ptent)) {
page = vm_normal_page(vma, addr, ptent);
} else if (is_swap_pte(ptent)) {
swp_entry_t swpent = pte_to_swp_entry(ptent);
if (!non_swap_entry(swpent))
mss->swap += ptent_size;
else if (is_migration_entry(swpent))
page = migration_entry_to_page(swpent);
} else if (pte_file(ptent)) {
if (pte_to_pgoff(ptent) != pgoff)
mss->nonlinear += ptent_size;
if (!page)
return;
if (PageAnon(page))
mss->anonymous += ptent_size;
if (page->index != pgoff)
mss->nonlinear += ptent_size;
mss->resident += ptent_size;
/* Accumulate the size in pages that have been accessed. */
if (pte_young(ptent) || PageReferenced(page))
mss->referenced += ptent_size;
mapcount = page_mapcount(page);
if (mapcount >= 2) {
if (pte_dirty(ptent) || PageDirty(page))
mss->shared_dirty += ptent_size;
mss->shared_clean += ptent_size;
mss->pss += (ptent_size << PSS_SHIFT) / mapcount;
} else {
if (pte_dirty(ptent) || PageDirty(page))
mss->private_dirty += ptent_size;
mss->private_clean += ptent_size;
mss->pss += (ptent_size << PSS_SHIFT);
static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
struct mem_size_stats *mss = walk->private;
struct vm_area_struct *vma = mss->vma;
if (pmd_trans_huge_lock(pmd, vma) == 1) {
smaps_pte_entry(*(pte_t *)pmd, addr, HPAGE_PMD_SIZE, walk);
spin_unlock(&walk->mm->page_table_lock);
mss->anonymous_thp += HPAGE_PMD_SIZE;
return 0;

Andrea Arcangeli
committed
if (pmd_trans_unstable(pmd))
return 0;
/*
* The mmap_sem held all the way back in m_start() is what
* keeps khugepaged out of here and from collapsing things
* in here.
*/
pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
for (; addr != end; pte++, addr += PAGE_SIZE)
smaps_pte_entry(*pte, addr, PAGE_SIZE, walk);
pte_unmap_unlock(pte - 1, ptl);
cond_resched();
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma)
{
/*
* Don't forget to update Documentation/ on changes.
*/
static const char mnemonics[BITS_PER_LONG][2] = {
/*
* In case if we meet a flag we don't know about.
*/
[0 ... (BITS_PER_LONG-1)] = "??",
[ilog2(VM_READ)] = "rd",
[ilog2(VM_WRITE)] = "wr",
[ilog2(VM_EXEC)] = "ex",
[ilog2(VM_SHARED)] = "sh",
[ilog2(VM_MAYREAD)] = "mr",
[ilog2(VM_MAYWRITE)] = "mw",
[ilog2(VM_MAYEXEC)] = "me",
[ilog2(VM_MAYSHARE)] = "ms",
[ilog2(VM_GROWSDOWN)] = "gd",
[ilog2(VM_PFNMAP)] = "pf",
[ilog2(VM_DENYWRITE)] = "dw",
[ilog2(VM_LOCKED)] = "lo",
[ilog2(VM_IO)] = "io",
[ilog2(VM_SEQ_READ)] = "sr",
[ilog2(VM_RAND_READ)] = "rr",
[ilog2(VM_DONTCOPY)] = "dc",
[ilog2(VM_DONTEXPAND)] = "de",
[ilog2(VM_ACCOUNT)] = "ac",
[ilog2(VM_NORESERVE)] = "nr",
[ilog2(VM_HUGETLB)] = "ht",
[ilog2(VM_NONLINEAR)] = "nl",
[ilog2(VM_ARCH_1)] = "ar",
[ilog2(VM_DONTDUMP)] = "dd",
[ilog2(VM_MIXEDMAP)] = "mm",
[ilog2(VM_HUGEPAGE)] = "hg",
[ilog2(VM_NOHUGEPAGE)] = "nh",
[ilog2(VM_MERGEABLE)] = "mg",
};
size_t i;
seq_puts(m, "VmFlags: ");
for (i = 0; i < BITS_PER_LONG; i++) {
if (vma->vm_flags & (1UL << i)) {
seq_printf(m, "%c%c ",
mnemonics[i][0], mnemonics[i][1]);
}
}
seq_putc(m, '\n');
}
static int show_smap(struct seq_file *m, void *v, int is_pid)
struct proc_maps_private *priv = m->private;
struct task_struct *task = priv->task;
struct vm_area_struct *vma = v;
struct mem_size_stats mss;
struct mm_walk smaps_walk = {
.pmd_entry = smaps_pte_range,
.mm = vma->vm_mm,
.private = &mss,
};
/* mmap_sem is held in m_start */
if (vma->vm_mm && !is_vm_hugetlb_page(vma))
walk_page_range(vma->vm_start, vma->vm_end, &smaps_walk);
show_map_vma(m, vma, is_pid);
seq_printf(m,
"Size: %8lu kB\n"
"Rss: %8lu kB\n"
"Pss: %8lu kB\n"
"Shared_Clean: %8lu kB\n"
"Shared_Dirty: %8lu kB\n"
"Private_Clean: %8lu kB\n"
"Private_Dirty: %8lu kB\n"
"Anonymous: %8lu kB\n"
"MMUPageSize: %8lu kB\n"
"Locked: %8lu kB\n",
(vma->vm_end - vma->vm_start) >> 10,
mss.resident >> 10,
(unsigned long)(mss.pss >> (10 + PSS_SHIFT)),
mss.shared_clean >> 10,
mss.shared_dirty >> 10,
mss.private_clean >> 10,
mss.private_dirty >> 10,
mss.anonymous >> 10,
vma_kernel_pagesize(vma) >> 10,
vma_mmu_pagesize(vma) >> 10,
(vma->vm_flags & VM_LOCKED) ?
(unsigned long)(mss.pss >> (10 + PSS_SHIFT)) : 0);
if (vma->vm_flags & VM_NONLINEAR)
seq_printf(m, "Nonlinear: %8lu kB\n",
mss.nonlinear >> 10);
show_smap_vma_flags(m, vma);
if (m->count < m->size) /* vma is copied successfully */
m->version = (vma != get_gate_vma(task->mm))
? vma->vm_start : 0;
static int show_pid_smap(struct seq_file *m, void *v)
{
return show_smap(m, v, 1);
}
static int show_tid_smap(struct seq_file *m, void *v)
{
return show_smap(m, v, 0);
}
static const struct seq_operations proc_pid_smaps_op = {
.start = m_start,
.next = m_next,
.stop = m_stop,
.show = show_pid_smap
};
static const struct seq_operations proc_tid_smaps_op = {
.start = m_start,
.next = m_next,
.stop = m_stop,
.show = show_tid_smap
static int pid_smaps_open(struct inode *inode, struct file *file)
{
return do_maps_open(inode, file, &proc_pid_smaps_op);
}
static int tid_smaps_open(struct inode *inode, struct file *file)
{
return do_maps_open(inode, file, &proc_tid_smaps_op);
}
const struct file_operations proc_pid_smaps_operations = {
.open = pid_smaps_open,
.read = seq_read,
.llseek = seq_lseek,
.release = seq_release_private,
};
const struct file_operations proc_tid_smaps_operations = {
.open = tid_smaps_open,
.read = seq_read,
.llseek = seq_lseek,
.release = seq_release_private,
};
/*
* We do not want to have constant page-shift bits sitting in
* pagemap entries and are about to reuse them some time soon.
*
* Here's the "migration strategy":
* 1. when the system boots these bits remain what they are,
* but a warning about future change is printed in log;
* 2. once anyone clears soft-dirty bits via clear_refs file,
* these flag is set to denote, that user is aware of the
* new API and those page-shift bits change their meaning.
* The respective warning is printed in dmesg;
* 3. In a couple of releases we will remove all the mentions
* of page-shift in pagemap entries.
*/
static bool soft_dirty_cleared __read_mostly;
enum clear_refs_types {
CLEAR_REFS_ALL = 1,
CLEAR_REFS_ANON,
CLEAR_REFS_MAPPED,
CLEAR_REFS_SOFT_DIRTY,
CLEAR_REFS_LAST,
};
struct clear_refs_private {
struct vm_area_struct *vma;
enum clear_refs_types type;
static inline void clear_soft_dirty(struct vm_area_struct *vma,
unsigned long addr, pte_t *pte)
{
#ifdef CONFIG_MEM_SOFT_DIRTY
/*
* The soft-dirty tracker uses #PF-s to catch writes
* to pages, so write-protect the pte as well. See the
* Documentation/vm/soft-dirty.txt for full description
* of how soft-dirty works.
*/
pte_t ptent = *pte;
if (pte_present(ptent)) {
ptent = pte_wrprotect(ptent);
ptent = pte_clear_flags(ptent, _PAGE_SOFT_DIRTY);
} else if (is_swap_pte(ptent)) {
ptent = pte_swp_clear_soft_dirty(ptent);
} else if (pte_file(ptent)) {
ptent = pte_file_clear_soft_dirty(ptent);
set_pte_at(vma->vm_mm, addr, pte, ptent);
#endif
}
static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr,
unsigned long end, struct mm_walk *walk)
struct clear_refs_private *cp = walk->private;
struct vm_area_struct *vma = cp->vma;
pte_t *pte, ptent;
spinlock_t *ptl;
struct page *page;
split_huge_page_pmd(vma, addr, pmd);

Andrea Arcangeli
committed
if (pmd_trans_unstable(pmd))
return 0;
pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
for (; addr != end; pte++, addr += PAGE_SIZE) {
ptent = *pte;
if (cp->type == CLEAR_REFS_SOFT_DIRTY) {
clear_soft_dirty(vma, addr, pte);
continue;
}
if (!pte_present(ptent))
continue;
page = vm_normal_page(vma, addr, ptent);
if (!page)
continue;
/* Clear accessed and referenced bits. */
ptep_test_and_clear_young(vma, addr, pte);
ClearPageReferenced(page);
}
pte_unmap_unlock(pte - 1, ptl);
cond_resched();
return 0;
}
static ssize_t clear_refs_write(struct file *file, const char __user *buf,
size_t count, loff_t *ppos)
char buffer[PROC_NUMBUF];
struct vm_area_struct *vma;
enum clear_refs_types type;
int itype;
memset(buffer, 0, sizeof(buffer));
if (count > sizeof(buffer) - 1)
count = sizeof(buffer) - 1;
if (copy_from_user(buffer, buf, count))
return -EFAULT;
rv = kstrtoint(strstrip(buffer), 10, &itype);
type = (enum clear_refs_types)itype;
if (type < CLEAR_REFS_ALL || type >= CLEAR_REFS_LAST)
if (type == CLEAR_REFS_SOFT_DIRTY) {
soft_dirty_cleared = true;
pr_warn_once("The pagemap bits 55-60 has changed their meaning! "
"See the linux/Documentation/vm/pagemap.txt for details.\n");
}
if (!task)
return -ESRCH;
mm = get_task_mm(task);
if (mm) {
struct clear_refs_private cp = {
struct mm_walk clear_refs_walk = {
.pmd_entry = clear_refs_pte_range,
.mm = mm,
if (type == CLEAR_REFS_SOFT_DIRTY)
mmu_notifier_invalidate_range_start(mm, 0, -1);
for (vma = mm->mmap; vma; vma = vma->vm_next) {
if (is_vm_hugetlb_page(vma))
continue;
/*
* Writing 1 to /proc/pid/clear_refs affects all pages.
*
* Writing 2 to /proc/pid/clear_refs only affects
* Anonymous pages.
*
* Writing 3 to /proc/pid/clear_refs only affects file
* mapped pages.
*/
if (type == CLEAR_REFS_ANON && vma->vm_file)
continue;
if (type == CLEAR_REFS_MAPPED && !vma->vm_file)
continue;
walk_page_range(vma->vm_start, vma->vm_end,
&clear_refs_walk);
if (type == CLEAR_REFS_SOFT_DIRTY)
mmu_notifier_invalidate_range_end(mm, 0, -1);
flush_tlb_mm(mm);
up_read(&mm->mmap_sem);
mmput(mm);
}
put_task_struct(task);
return count;
const struct file_operations proc_clear_refs_operations = {
.write = clear_refs_write,
typedef struct {
u64 pme;
} pagemap_entry_t;
pagemap_entry_t *buffer;
#define PAGEMAP_WALK_SIZE (PMD_SIZE)
#define PAGEMAP_WALK_MASK (PMD_MASK)
#define PM_ENTRY_BYTES sizeof(u64)
#define PM_STATUS_BITS 3
#define PM_STATUS_OFFSET (64 - PM_STATUS_BITS)
#define PM_STATUS_MASK (((1LL << PM_STATUS_BITS) - 1) << PM_STATUS_OFFSET)
#define PM_STATUS(nr) (((nr) << PM_STATUS_OFFSET) & PM_STATUS_MASK)
#define PM_PSHIFT_BITS 6
#define PM_PSHIFT_OFFSET (PM_STATUS_OFFSET - PM_PSHIFT_BITS)
#define PM_PSHIFT_MASK (((1LL << PM_PSHIFT_BITS) - 1) << PM_PSHIFT_OFFSET)
#define __PM_PSHIFT(x) (((u64) (x) << PM_PSHIFT_OFFSET) & PM_PSHIFT_MASK)
#define PM_PFRAME_MASK ((1LL << PM_PSHIFT_OFFSET) - 1)
#define PM_PFRAME(x) ((x) & PM_PFRAME_MASK)
/* in "new" pagemap pshift bits are occupied with more status bits */
#define PM_STATUS2(v2, x) (__PM_PSHIFT(v2 ? x : PAGE_SHIFT))
#define __PM_SOFT_DIRTY (1LL)
#define PM_PRESENT PM_STATUS(4LL)
#define PM_SWAP PM_STATUS(2LL)
#define PM_FILE PM_STATUS(1LL)
#define PM_NOT_PRESENT(v2) PM_STATUS2(v2, 0)
static inline pagemap_entry_t make_pme(u64 val)
{
return (pagemap_entry_t) { .pme = val };
}
static int add_to_pagemap(unsigned long addr, pagemap_entry_t *pme,
pm->buffer[pm->pos++] = *pme;
if (pm->pos >= pm->len)

Thomas Tuttle
committed
return PM_END_OF_BUFFER;
return 0;
}
static int pagemap_pte_hole(unsigned long start, unsigned long end,
unsigned long addr;
int err = 0;
pagemap_entry_t pme = make_pme(PM_NOT_PRESENT(pm->v2));
for (addr = start; addr < end; addr += PAGE_SIZE) {
err = add_to_pagemap(addr, &pme, pm);
if (err)
break;
}
return err;
}
static void pte_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm,
struct vm_area_struct *vma, unsigned long addr, pte_t pte)
u64 frame, flags;
struct page *page = NULL;
if (pte_present(pte)) {
frame = pte_pfn(pte);
flags = PM_PRESENT;
page = vm_normal_page(vma, addr, pte);
} else if (is_swap_pte(pte)) {
swp_entry_t entry;
if (pte_swp_soft_dirty(pte))
flags2 |= __PM_SOFT_DIRTY;
entry = pte_to_swp_entry(pte);
frame = swp_type(entry) |
(swp_offset(entry) << MAX_SWAPFILES_SHIFT);
flags = PM_SWAP;
if (is_migration_entry(entry))
page = migration_entry_to_page(entry);
} else {
*pme = make_pme(PM_NOT_PRESENT(pm->v2));
return;
}
if (page && !PageAnon(page))
flags |= PM_FILE;
if (pte_soft_dirty(pte))
flags2 |= __PM_SOFT_DIRTY;
*pme = make_pme(PM_PFRAME(frame) | PM_STATUS2(pm->v2, flags2) | flags);
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
static void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm,
pmd_t pmd, int offset, int pmd_flags2)
{
/*
* Currently pmd for thp is always present because thp can not be
* swapped-out, migrated, or HWPOISONed (split in such cases instead.)
* This if-check is just to prepare for future implementation.
*/
if (pmd_present(pmd))
*pme = make_pme(PM_PFRAME(pmd_pfn(pmd) + offset)
| PM_STATUS2(pm->v2, pmd_flags2) | PM_PRESENT);

Konstantin Khlebnikov
committed
else
*pme = make_pme(PM_NOT_PRESENT(pm->v2));
}
#else
static inline void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm,
pmd_t pmd, int offset, int pmd_flags2)
{
}
#endif
static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
pagemap_entry_t pme = make_pme(PM_NOT_PRESENT(pm->v2));
/* find the first VMA at or above 'addr' */
vma = find_vma(walk->mm, addr);
if (vma && pmd_trans_huge_lock(pmd, vma) == 1) {
int pmd_flags2;
pmd_flags2 = (pmd_soft_dirty(*pmd) ? __PM_SOFT_DIRTY : 0);