/*
 *  linux/mm/memory.c
 *
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 */

/*
 * demand-loading started 01.12.91 - seems it is high on the list of
 * things wanted, and it should be easy to implement. - Linus
 */

/*
 * Ok, demand-loading was easy, shared pages a little bit trickier. Shared
 * pages started 02.12.91, seems to work. - Linus.
 *
 * Tested sharing by executing about 30 /bin/sh: under the old kernel it
 * would have taken more than the 6M I have free, but it worked well as
 * far as I could see.
 *
 * Also corrected some "invalidate()"s - I wasn't doing enough of them.
 */

/*
 * Real VM (paging to/from disk) started 18.12.91. Much more work and
 * thought has to go into this. Oh, well..
 * 19.12.91  -  works, somewhat. Sometimes I get faults, don't know why.
 *		Found it. Everything seems to work now.
 * 20.12.91  -  Ok, making the swap-device changeable like the root.
 */

/*
 * 05.04.94  -  Multi-page memory management added for v1.1.
 * 		Idea by Alex Bligh (alex@cconcepts.co.uk)
 *
 * 16.07.99  -  Support of BIGMEM added by Gerhard Wichert, Siemens AG
 *		(Gerhard.Wichert@pdb.siemens.de)
 *
 * Aug/Sep 2004 Changed to four level page tables (Andi Kleen)
 */

#include <linux/kernel_stat.h>
#include <linux/mm.h>
#include <linux/hugetlb.h>
#include <linux/mman.h>
#include <linux/swap.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>
#include <linux/ksm.h>
#include <linux/rmap.h>
#include <linux/export.h>
#include <linux/delayacct.h>
#include <linux/init.h>
#include <linux/writeback.h>
#include <linux/memcontrol.h>
#include <linux/mmu_notifier.h>
#include <linux/kallsyms.h>
#include <linux/swapops.h>
#include <linux/elf.h>
#include <linux/gfp.h>

#include <asm/io.h>
#include <asm/pgalloc.h>
#include <asm/uaccess.h>
#include <asm/tlb.h>
#include <asm/tlbflush.h>
#include <asm/pgtable.h>

#include "internal.h"

#ifndef CONFIG_NEED_MULTIPLE_NODES
/* use the per-pgdat data instead for discontigmem - mbligh */
unsigned long max_mapnr;
struct page *mem_map;

EXPORT_SYMBOL(max_mapnr);
EXPORT_SYMBOL(mem_map);
#endif

unsigned long num_physpages;
/*
 * A number of key systems in x86 including ioremap() rely on the assumption
 * that high_memory defines the upper bound on direct map memory, the end
 * of ZONE_NORMAL.  Under CONFIG_DISCONTIG this means that max_low_pfn and
 * highstart_pfn must be the same; there must be no gap between ZONE_NORMAL
 * and ZONE_HIGHMEM.
 */
void * high_memory;

EXPORT_SYMBOL(num_physpages);
EXPORT_SYMBOL(high_memory);

/*
 * Randomize the address space (stacks, mmaps, brk, etc.).
 *
 * ( When CONFIG_COMPAT_BRK=y we exclude brk from randomization,
 *   as ancient (libc5 based) binaries can segfault. )
 */
int randomize_va_space __read_mostly =
#ifdef CONFIG_COMPAT_BRK
					1;
#else
					2;
#endif

static int __init disable_randmaps(char *s)
{
	randomize_va_space = 0;
	return 1;
}
__setup("norandmaps", disable_randmaps);
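/*
 * Booting with "norandmaps" thus has the same effect as setting the
 * kernel.randomize_va_space sysctl to 0.
 */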

unsigned long zero_pfn __read_mostly;
unsigned long highest_memmap_pfn __read_mostly;

/*
 * CONFIG_MMU architectures set up ZERO_PAGE in their paging_init()
 */
static int __init init_zero_pfn(void)
{
	zero_pfn = page_to_pfn(ZERO_PAGE(0));
	return 0;
}
core_initcall(init_zero_pfn);


#if defined(SPLIT_RSS_COUNTING)

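/*
 * Fold current's cached RSS deltas back into the mm-wide counters and
 * reset the per-task cache.
 */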
void sync_mm_rss(struct mm_struct *mm)
{
	int i;

	for (i = 0; i < NR_MM_COUNTERS; i++) {
		if (current->rss_stat.count[i]) {
			add_mm_counter(mm, i, current->rss_stat.count[i]);
			current->rss_stat.count[i] = 0;
		}
	}
	current->rss_stat.events = 0;
}

static void add_mm_counter_fast(struct mm_struct *mm, int member, int val)
{
	struct task_struct *task = current;

	if (likely(task->mm == mm))
		task->rss_stat.count[member] += val;
	else
		add_mm_counter(mm, member, val);
}
#define inc_mm_counter_fast(mm, member) add_mm_counter_fast(mm, member, 1)
#define dec_mm_counter_fast(mm, member) add_mm_counter_fast(mm, member, -1)

/* sync counter once per 64 page faults */
#define TASK_RSS_EVENTS_THRESH	(64)
static void check_sync_rss_stat(struct task_struct *task)
{
	if (unlikely(task != current))
		return;
	if (unlikely(task->rss_stat.events++ > TASK_RSS_EVENTS_THRESH))
		sync_mm_rss(task->mm);
}
#else /* SPLIT_RSS_COUNTING */

#define inc_mm_counter_fast(mm, member) inc_mm_counter(mm, member)
#define dec_mm_counter_fast(mm, member) dec_mm_counter(mm, member)

static void check_sync_rss_stat(struct task_struct *task)
{
}

#endif /* SPLIT_RSS_COUNTING */

#ifdef HAVE_GENERIC_MMU_GATHER

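/*
 * Move tlb->active on to the next mmu_gather batch, allocating a new
 * one with GFP_NOWAIT if none is queued yet. Returns 1 on success and
 * 0 if the allocation fails.
 */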
static int tlb_next_batch(struct mmu_gather *tlb)
{
	struct mmu_gather_batch *batch;

	batch = tlb->active;
	if (batch->next) {
		tlb->active = batch->next;
		return 1;
	}

	batch = (void *)__get_free_pages(GFP_NOWAIT | __GFP_NOWARN, 0);
	if (!batch)
		return 0;

	batch->next = NULL;
	batch->nr   = 0;
	batch->max  = MAX_GATHER_BATCH;

	tlb->active->next = batch;
	tlb->active = batch;

	return 1;
}

/* tlb_gather_mmu
 *	Called to initialize an (on-stack) mmu_gather structure for page-table
 *	tear-down from @mm. The @fullmm argument is used when @mm is without
 *	users and we're going to destroy the full address space (exit/execve).
 */
void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, bool fullmm)
{
	tlb->mm = mm;

	tlb->fullmm     = fullmm;
	tlb->need_flush = 0;
	tlb->fast_mode  = (num_possible_cpus() == 1);
	tlb->local.next = NULL;
	tlb->local.nr   = 0;
	tlb->local.max  = ARRAY_SIZE(tlb->__pages);
	tlb->active     = &tlb->local;

#ifdef CONFIG_HAVE_RCU_TABLE_FREE
	tlb->batch = NULL;
#endif
}

void tlb_flush_mmu(struct mmu_gather *tlb)
{
	struct mmu_gather_batch *batch;

	if (!tlb->need_flush)
		return;
	tlb->need_flush = 0;
	tlb_flush(tlb);
#ifdef CONFIG_HAVE_RCU_TABLE_FREE
	tlb_table_flush(tlb);
#endif

	if (tlb_fast_mode(tlb))
		return;

	for (batch = &tlb->local; batch; batch = batch->next) {
		free_pages_and_swap_cache(batch->pages, batch->nr);
		batch->nr = 0;
	}
	tlb->active = &tlb->local;
}

/* tlb_finish_mmu
 *	Called at the end of the shootdown operation to free up any resources
 *	that were required.
 */
void tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end)
{
	struct mmu_gather_batch *batch, *next;

	tlb_flush_mmu(tlb);

	/* keep the page table cache within bounds */
	check_pgt_cache();

	for (batch = tlb->local.next; batch; batch = next) {
		next = batch->next;
		free_pages((unsigned long)batch, 0);
	}
	tlb->local.next = NULL;
}

/* __tlb_remove_page
 *	Must perform the equivalent to __free_pte(pte_get_and_clear(ptep)), while
 *	handling the additional races in SMP caused by other CPUs caching valid
 *	mappings in their TLBs. Returns the number of free page slots left.
 *	When out of page slots we must call tlb_flush_mmu().
 */
int __tlb_remove_page(struct mmu_gather *tlb, struct page *page)
{
	struct mmu_gather_batch *batch;

	VM_BUG_ON(!tlb->need_flush);

	if (tlb_fast_mode(tlb)) {
		free_page_and_swap_cache(page);
		return 1; /* avoid calling tlb_flush_mmu() */
	}

	batch = tlb->active;
	batch->pages[batch->nr++] = page;
	if (batch->nr == batch->max) {
		if (!tlb_next_batch(tlb))
			return 0;
		batch = tlb->active;
	}
	VM_BUG_ON(batch->nr > batch->max);

	return batch->max - batch->nr;
}

#endif /* HAVE_GENERIC_MMU_GATHER */

#ifdef CONFIG_HAVE_RCU_TABLE_FREE

/*
 * See the comment near struct mmu_table_batch.
 */

static void tlb_remove_table_smp_sync(void *arg)
{
	/* Simply deliver the interrupt */
}

static void tlb_remove_table_one(void *table)
{
	/*
	 * This isn't an RCU grace period and hence the page-tables cannot be
	 * assumed to be actually RCU-freed.
	 *
	 * It is however sufficient for software page-table walkers that rely on
	 * IRQ disabling. See the comment near struct mmu_table_batch.
	 */
	smp_call_function(tlb_remove_table_smp_sync, NULL, 1);
	__tlb_remove_table(table);
}

static void tlb_remove_table_rcu(struct rcu_head *head)
{
	struct mmu_table_batch *batch;
	int i;

	batch = container_of(head, struct mmu_table_batch, rcu);

	for (i = 0; i < batch->nr; i++)
		__tlb_remove_table(batch->tables[i]);

	free_page((unsigned long)batch);
}

void tlb_table_flush(struct mmu_gather *tlb)
{
	struct mmu_table_batch **batch = &tlb->batch;

	if (*batch) {
		call_rcu_sched(&(*batch)->rcu, tlb_remove_table_rcu);
		*batch = NULL;
	}
}

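/*
 * Queue a page-table page for deferred freeing. If the mm has only a
 * single user, or no batch page can be allocated, the table is freed
 * synchronously instead (in the latter case after an IPI that waits
 * out any concurrent software page-table walker).
 */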
void tlb_remove_table(struct mmu_gather *tlb, void *table)
{
	struct mmu_table_batch **batch = &tlb->batch;

	tlb->need_flush = 1;

	/*
	 * When there are fewer than two users of this mm there cannot be a
	 * concurrent page-table walk.
	 */
	if (atomic_read(&tlb->mm->mm_users) < 2) {
		__tlb_remove_table(table);
		return;
	}

	if (*batch == NULL) {
		*batch = (struct mmu_table_batch *)__get_free_page(GFP_NOWAIT | __GFP_NOWARN);
		if (*batch == NULL) {
			tlb_remove_table_one(table);
			return;
		}
		(*batch)->nr = 0;
	}
	(*batch)->tables[(*batch)->nr++] = table;
	if ((*batch)->nr == MAX_TABLE_BATCH)
		tlb_table_flush(tlb);
}

#endif /* CONFIG_HAVE_RCU_TABLE_FREE */

/*
 * If a p?d_bad entry is found while walking page tables, report
 * the error, before resetting entry to p?d_none.  Usually (but
 * very seldom) called out from the p?d_none_or_clear_bad macros.
 */

void pgd_clear_bad(pgd_t *pgd)
{
	pgd_ERROR(*pgd);
	pgd_clear(pgd);
}

void pud_clear_bad(pud_t *pud)
{
	pud_ERROR(*pud);
	pud_clear(pud);
}

void pmd_clear_bad(pmd_t *pmd)
{
	pmd_ERROR(*pmd);
	pmd_clear(pmd);
}

/*
 * Note: this doesn't free the actual pages themselves. That
 * has been handled earlier when unmapping all the memory regions.
 */
static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd,
			   unsigned long addr)
{
	pgtable_t token = pmd_pgtable(*pmd);
	pmd_clear(pmd);
	pte_free_tlb(tlb, token, addr);
	tlb->mm->nr_ptes--;
}

static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
				unsigned long addr, unsigned long end,
				unsigned long floor, unsigned long ceiling)
{
	pmd_t *pmd;
	unsigned long next;
	unsigned long start;

	start = addr;
	pmd = pmd_offset(pud, addr);
	do {
		next = pmd_addr_end(addr, end);
		if (pmd_none_or_clear_bad(pmd))
			continue;
		free_pte_range(tlb, pmd, addr);
	} while (pmd++, addr = next, addr != end);

	start &= PUD_MASK;
	if (start < floor)
		return;
	if (ceiling) {
		ceiling &= PUD_MASK;
		if (!ceiling)
			return;
	}
	if (end - 1 > ceiling - 1)
		return;

	pmd = pmd_offset(pud, start);
	pud_clear(pud);
	pmd_free_tlb(tlb, pmd, start);
}

static inline void free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
				unsigned long addr, unsigned long end,
				unsigned long floor, unsigned long ceiling)
{
	pud_t *pud;
	unsigned long next;
	unsigned long start;

	start = addr;
	pud = pud_offset(pgd, addr);
	do {
		next = pud_addr_end(addr, end);
		if (pud_none_or_clear_bad(pud))
			continue;
		free_pmd_range(tlb, pud, addr, next, floor, ceiling);
	} while (pud++, addr = next, addr != end);

	start &= PGDIR_MASK;
	if (start < floor)
		return;
	if (ceiling) {
		ceiling &= PGDIR_MASK;
		if (!ceiling)
			return;
	}
	if (end - 1 > ceiling - 1)
		return;

	pud = pud_offset(pgd, start);
	pgd_clear(pgd);
	pud_free_tlb(tlb, pud, start);
}

/*
 * This function frees user-level page tables of a process.
 *
 * Must be called with pagetable lock held.
 */
void free_pgd_range(struct mmu_gather *tlb,
			unsigned long addr, unsigned long end,
			unsigned long floor, unsigned long ceiling)
{
	pgd_t *pgd;
	unsigned long next;

	/*
	 * The next few lines have given us lots of grief...
	 *
	 * Why are we testing PMD* at this top level?  Because often
	 * there will be no work to do at all, and we'd prefer not to
	 * go all the way down to the bottom just to discover that.
	 *
	 * Why all these "- 1"s?  Because 0 represents both the bottom
	 * of the address space and the top of it (using -1 for the
	 * top wouldn't help much: the masks would do the wrong thing).
	 * The rule is that addr 0 and floor 0 refer to the bottom of
 * the address space, but end 0 and ceiling 0 refer to the top.
	 * Comparisons need to use "end - 1" and "ceiling - 1" (though
	 * that end 0 case should be mythical).
	 *
	 * Wherever addr is brought up or ceiling brought down, we must
	 * be careful to reject "the opposite 0" before it confuses the
	 * subsequent tests.  But what about where end is brought down
	 * by PMD_SIZE below? no, end can't go down to 0 there.
	 *
	 * Whereas we round start (addr) and ceiling down, by different
	 * masks at different levels, in order to test whether a table
	 * now has no other vmas using it, so can be freed, we don't
	 * bother to round floor or end up - the tests don't need that.
	 */

	addr &= PMD_MASK;
	if (addr < floor) {
		addr += PMD_SIZE;
		if (!addr)
			return;
	}
	if (ceiling) {
		ceiling &= PMD_MASK;
		if (!ceiling)
			return;
	}
	if (end - 1 > ceiling - 1)
		end -= PMD_SIZE;
	if (addr > end - 1)
		return;

	pgd = pgd_offset(tlb->mm, addr);
	do {
		next = pgd_addr_end(addr, end);
		if (pgd_none_or_clear_bad(pgd))
			continue;
		free_pud_range(tlb, pgd, addr, next, floor, ceiling);
	} while (pgd++, addr = next, addr != end);
}

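/*
 * Free the page tables backing a list of vmas. Each vma is unlinked
 * from its anon_vma and file rmap first, so that rmap and truncate
 * walkers cannot see page tables that are being torn down; runs of
 * neighbouring non-hugetlb vmas are batched into a single
 * free_pgd_range() call.
 */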
void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma,
		unsigned long floor, unsigned long ceiling)
{
	while (vma) {
		struct vm_area_struct *next = vma->vm_next;
		unsigned long addr = vma->vm_start;

		/*
		 * Hide vma from rmap and truncate_pagecache before freeing
		 * pgtables
		 */
		unlink_anon_vmas(vma);
		unlink_file_vma(vma);

		if (is_vm_hugetlb_page(vma)) {
			hugetlb_free_pgd_range(tlb, addr, vma->vm_end,
				floor, next? next->vm_start: ceiling);
		} else {
			/*
			 * Optimization: gather nearby vmas into one call down
			 */
			while (next && next->vm_start <= vma->vm_end + PMD_SIZE
			       && !is_vm_hugetlb_page(next)) {
				vma = next;
				next = vma->vm_next;
				unlink_anon_vmas(vma);
				unlink_file_vma(vma);
			}
			free_pgd_range(tlb, addr, vma->vm_end,
				floor, next? next->vm_start: ceiling);
		}
		vma = next;
	}
}

int __pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
		pmd_t *pmd, unsigned long address)
{
	pgtable_t new = pte_alloc_one(mm, address);
	int wait_split_huge_page;
	if (!new)
		return -ENOMEM;

	/*
	 * Ensure all pte setup (eg. pte page lock and page clearing) are
	 * visible before the pte is made visible to other CPUs by being
	 * put into page tables.
	 *
	 * The other side of the story is the pointer chasing in the page
	 * table walking code (when walking the page table without locking;
	 * ie. most of the time). Fortunately, these data accesses consist
	 * of a chain of data-dependent loads, meaning most CPUs (alpha
	 * being the notable exception) will already guarantee loads are
	 * seen in-order. See the alpha page table accessors for the
	 * smp_read_barrier_depends() barriers in page table walking code.
	 */
	smp_wmb(); /* Could be smp_wmb__xxx(before|after)_spin_lock */

	spin_lock(&mm->page_table_lock);
	wait_split_huge_page = 0;
	if (likely(pmd_none(*pmd))) {	/* Has another populated it ? */
		mm->nr_ptes++;
		pmd_populate(mm, pmd, new);
		new = NULL;
	} else if (unlikely(pmd_trans_splitting(*pmd)))
		wait_split_huge_page = 1;
	spin_unlock(&mm->page_table_lock);
	if (new)
		pte_free(mm, new);
	if (wait_split_huge_page)
		wait_split_huge_page(vma->anon_vma, pmd);
	return 0;
}

int __pte_alloc_kernel(pmd_t *pmd, unsigned long address)
{
	pte_t *new = pte_alloc_one_kernel(&init_mm, address);
	if (!new)
		return -ENOMEM;

	smp_wmb(); /* See comment in __pte_alloc */

	spin_lock(&init_mm.page_table_lock);
	if (likely(pmd_none(*pmd))) {	/* Has another populated it ? */
		pmd_populate_kernel(&init_mm, pmd, new);
		new = NULL;
	} else
		VM_BUG_ON(pmd_trans_splitting(*pmd));
	spin_unlock(&init_mm.page_table_lock);
	if (new)
		pte_free_kernel(&init_mm, new);
	return 0;
}

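/*
 * Helpers for batching RSS updates while copying or zapping ptes:
 * callers accumulate deltas in a local NR_MM_COUNTERS-sized vector and
 * flush them into the mm in one go.
 */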
static inline void init_rss_vec(int *rss)
{
	memset(rss, 0, sizeof(int) * NR_MM_COUNTERS);
}

static inline void add_mm_rss_vec(struct mm_struct *mm, int *rss)
{
	int i;

	if (current->mm == mm)
		sync_mm_rss(mm);
	for (i = 0; i < NR_MM_COUNTERS; i++)
		if (rss[i])
			add_mm_counter(mm, i, rss[i]);
}

/*
 * This function is called to print an error when a bad pte
 * is found. For example, we might have a PFN-mapped pte in
 * a region that doesn't allow it.
 *
 * The calling function must still handle the error.
 */
static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr,
			  pte_t pte, struct page *page)
{
	pgd_t *pgd = pgd_offset(vma->vm_mm, addr);
	pud_t *pud = pud_offset(pgd, addr);
	pmd_t *pmd = pmd_offset(pud, addr);
	struct address_space *mapping;
	pgoff_t index;
	static unsigned long resume;
	static unsigned long nr_shown;
	static unsigned long nr_unshown;

	/*
	 * Allow a burst of 60 reports, then keep quiet for that minute;
	 * or allow a steady drip of one report per second.
	 */
	if (nr_shown == 60) {
		if (time_before(jiffies, resume)) {
			nr_unshown++;
			return;
		}
		if (nr_unshown) {
			printk(KERN_ALERT
				"BUG: Bad page map: %lu messages suppressed\n",
				nr_unshown);
			nr_unshown = 0;
		}
		nr_shown = 0;
	}
	if (nr_shown++ == 0)
		resume = jiffies + 60 * HZ;

	mapping = vma->vm_file ? vma->vm_file->f_mapping : NULL;
	index = linear_page_index(vma, addr);

	printk(KERN_ALERT
		"BUG: Bad page map in process %s  pte:%08llx pmd:%08llx\n",
		current->comm,
		(long long)pte_val(pte), (long long)pmd_val(*pmd));
	if (page)
		dump_page(page);
	printk(KERN_ALERT
		"addr:%p vm_flags:%08lx anon_vma:%p mapping:%p index:%lx\n",
		(void *)addr, vma->vm_flags, vma->anon_vma, mapping, index);
	/*
	 * Choose text because data symbols depend on CONFIG_KALLSYMS_ALL=y
	 */
	if (vma->vm_ops)
		print_symbol(KERN_ALERT "vma->vm_ops->fault: %s\n",
				(unsigned long)vma->vm_ops->fault);
	if (vma->vm_file && vma->vm_file->f_op)
		print_symbol(KERN_ALERT "vma->vm_file->f_op->mmap: %s\n",
				(unsigned long)vma->vm_file->f_op->mmap);
	dump_stack();
	add_taint(TAINT_BAD_PAGE);
}

static inline int is_cow_mapping(vm_flags_t flags)
{
	return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
}

#ifndef is_zero_pfn
static inline int is_zero_pfn(unsigned long pfn)
{
	return pfn == zero_pfn;
}
#endif

#ifndef my_zero_pfn
static inline unsigned long my_zero_pfn(unsigned long addr)
{
	return zero_pfn;
}
#endif

/*
 * vm_normal_page -- This function gets the "struct page" associated with a pte.
 *
 * "Special" mappings do not wish to be associated with a "struct page" (either
 * it doesn't exist, or it exists but they don't want to touch it). In this
 * case, NULL is returned here. "Normal" mappings do have a struct page.
 *
 * There are 2 broad cases. Firstly, an architecture may define a pte_special()
 * pte bit, in which case this function is trivial. Secondly, an architecture
 * may not have a spare pte bit, which requires a more complicated scheme,
 * described below.
 *
 * A raw VM_PFNMAP mapping (ie. one that is not COWed) is always considered a
 * special mapping (even if there are underlying and valid "struct pages").
 * COWed pages of a VM_PFNMAP are always normal.
 *
 * The way we recognize COWed pages within VM_PFNMAP mappings is through the
 * rules set up by "remap_pfn_range()": the vma will have the VM_PFNMAP bit
 * set, and the vm_pgoff will point to the first PFN mapped: thus every special
 * mapping will always honor the rule
 *
 *	pfn_of_page == vma->vm_pgoff + ((addr - vma->vm_start) >> PAGE_SHIFT)
 *
 * And for normal mappings this is false.
 *
 * This restricts such mappings to be a linear translation from virtual address
 * to pfn. To get around this restriction, we allow arbitrary mappings so long
 * as the vma is not a COW mapping; in that case, we know that all ptes are
 * special (because none can have been COWed).
 *
 *
 * In order to support COW of arbitrary special mappings, we have VM_MIXEDMAP.
 *
 * VM_MIXEDMAP mappings can likewise contain memory with or without "struct
 * page" backing, however the difference is that _all_ pages with a struct
 * page (that is, those where pfn_valid is true) are refcounted and considered
 * normal pages by the VM. The disadvantage is that pages are refcounted
 * (which can be slower and simply not an option for some PFNMAP users). The
 * advantage is that we don't have to follow the strict linearity rule of
 * PFNMAP mappings in order to support COWable mappings.
 *
 */
#ifdef __HAVE_ARCH_PTE_SPECIAL
# define HAVE_PTE_SPECIAL 1
#else
# define HAVE_PTE_SPECIAL 0
#endif
struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
				pte_t pte)
{
	unsigned long pfn = pte_pfn(pte);

	if (HAVE_PTE_SPECIAL) {
		if (likely(!pte_special(pte)))
			goto check_pfn;
		if (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP))
			return NULL;
		if (!is_zero_pfn(pfn))
			print_bad_pte(vma, addr, pte, NULL);
		return NULL;
	}

	/* !HAVE_PTE_SPECIAL case follows: */

	if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) {
		if (vma->vm_flags & VM_MIXEDMAP) {
			if (!pfn_valid(pfn))
				return NULL;
			goto out;
		} else {
			unsigned long off;
			off = (addr - vma->vm_start) >> PAGE_SHIFT;
			if (pfn == vma->vm_pgoff + off)
				return NULL;
			if (!is_cow_mapping(vma->vm_flags))
				return NULL;
		}
	}

	if (is_zero_pfn(pfn))
		return NULL;
check_pfn:
	if (unlikely(pfn > highest_memmap_pfn)) {
		print_bad_pte(vma, addr, pte, NULL);
		return NULL;
	}

	/*
	 * NOTE! We still have PageReserved() pages in the page tables.
	 * eg. VDSO mappings can cause them to exist.
	 */
out:
	return pfn_to_page(pfn);
}

/*
 * copy one vm_area from one task to the other. Assumes the page tables
 * already present in the new task to be cleared in the whole range
 * covered by this vma.
 */

static inline unsigned long
copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
		pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *vma,
		unsigned long addr, int *rss)
{
	unsigned long vm_flags = vma->vm_flags;
	pte_t pte = *src_pte;
	struct page *page;

	/* pte contains position in swap or file, so copy. */
	if (unlikely(!pte_present(pte))) {
		if (!pte_file(pte)) {
			swp_entry_t entry = pte_to_swp_entry(pte);

			if (swap_duplicate(entry) < 0)
				return entry.val;

			/* make sure dst_mm is on swapoff's mmlist. */
			if (unlikely(list_empty(&dst_mm->mmlist))) {
				spin_lock(&mmlist_lock);
				if (list_empty(&dst_mm->mmlist))
					list_add(&dst_mm->mmlist,
						 &src_mm->mmlist);
				spin_unlock(&mmlist_lock);
			}
			if (likely(!non_swap_entry(entry)))
				rss[MM_SWAPENTS]++;
			else if (is_migration_entry(entry)) {
				page = migration_entry_to_page(entry);

				if (PageAnon(page))
					rss[MM_ANONPAGES]++;
				else
					rss[MM_FILEPAGES]++;

				if (is_write_migration_entry(entry) &&
				    is_cow_mapping(vm_flags)) {
					/*
					 * COW mappings require pages in both
					 * parent and child to be set to read.
					 */
					make_migration_entry_read(&entry);
					pte = swp_entry_to_pte(entry);
					set_pte_at(src_mm, addr, src_pte, pte);
				}
			}
		}
		goto out_set_pte;
	}

	/*
	 * If it's a COW mapping, write protect it both
	 * in the parent and the child
	 */
	if (is_cow_mapping(vm_flags)) {
		ptep_set_wrprotect(src_mm, addr, src_pte);
		pte = pte_wrprotect(pte);
	}

	/*
	 * If it's a shared mapping, mark it clean in
	 * the child
	 */
	if (vm_flags & VM_SHARED)
		pte = pte_mkclean(pte);
	pte = pte_mkold(pte);
897 898 899 900

	page = vm_normal_page(vma, addr, pte);
	if (page) {
		get_page(page);
901
		page_dup_rmap(page);
902 903 904 905
		if (PageAnon(page))
			rss[MM_ANONPAGES]++;
		else
			rss[MM_FILEPAGES]++;
906
	}
907 908 909

out_set_pte:
	set_pte_at(dst_mm, addr, dst_pte, pte);
910
	return 0;
Linus Torvalds's avatar
Linus Torvalds committed
911 912
}

913 914 915
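/*
 * Copy one pte page worth of mappings from @src_mm to @dst_mm. The
 * page table locks are dropped and re-taken periodically so that large
 * copies do not cause excessive scheduling latency. Returns -ENOMEM if
 * a destination pte page or a swap-count continuation cannot be
 * allocated.
 */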
int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
		   pmd_t *dst_pmd, pmd_t *src_pmd, struct vm_area_struct *vma,
		   unsigned long addr, unsigned long end)
Linus Torvalds's avatar
Linus Torvalds committed
916
{
917
	pte_t *orig_src_pte, *orig_dst_pte;
Linus Torvalds's avatar
Linus Torvalds committed
918
	pte_t *src_pte, *dst_pte;
919
	spinlock_t *src_ptl, *dst_ptl;
920
	int progress = 0;
921
	int rss[NR_MM_COUNTERS];
922
	swp_entry_t entry = (swp_entry_t){0};
Linus Torvalds's avatar
Linus Torvalds committed
923 924

again:
925 926
	init_rss_vec(rss);

927
	dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl);
Linus Torvalds's avatar
Linus Torvalds committed
928 929
	if (!dst_pte)
		return -ENOMEM;
930
	src_pte = pte_offset_map(src_pmd, addr);
931
	src_ptl = pte_lockptr(src_mm, src_pmd);
932
	spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
933 934
	orig_src_pte = src_pte;
	orig_dst_pte = dst_pte;
935
	arch_enter_lazy_mmu_mode();
Linus Torvalds's avatar
Linus Torvalds committed
936 937 938 939 940 941

	do {
		/*
		 * We are holding two locks at this point - either of them
		 * could generate latencies in another task on another CPU.
		 */
942 943 944
		if (progress >= 32) {
			progress = 0;
			if (need_resched() ||
945
			    spin_needbreak(src_ptl) || spin_needbreak(dst_ptl))
946 947
				break;
		}
Linus Torvalds's avatar
Linus Torvalds committed
948 949 950 951
		if (pte_none(*src_pte)) {
			progress++;
			continue;
		}
952 953 954 955
		entry.val = copy_one_pte(dst_mm, src_mm, dst_pte, src_pte,
							vma, addr, rss);
		if (entry.val)
			break;
Linus Torvalds's avatar
Linus Torvalds committed
956 957 958
		progress += 8;
	} while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end);

959
	arch_leave_lazy_mmu_mode();
960
	spin_unlock(src_ptl);
961
	pte_unmap(orig_src_pte);
962
	add_mm_rss_vec(dst_mm, rss);
963
	pte_unmap_unlock(orig_dst_pte, dst_ptl);
964
	cond_resched();
965 966 967 968 969 970

	if (entry.val) {
		if (add_swap_count_continuation(entry, GFP_KERNEL) < 0)
			return -ENOMEM;
		progress = 0;
	}
Linus Torvalds's avatar
Linus Torvalds committed
971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988
	if (addr != end)
		goto again;
	return 0;
}

static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
		pud_t *dst_pud, pud_t *src_pud, struct vm_area_struct *vma,
		unsigned long addr, unsigned long end)
{
	pmd_t *src_pmd, *dst_pmd;
	unsigned long next;

	dst_pmd = pmd_alloc(dst_mm, dst_pud, addr);
	if (!dst_pmd)
		return -ENOMEM;
	src_pmd = pmd_offset(src_pud, addr);
	do {
		next = pmd_addr_end(addr, end);
989 990
		if (pmd_trans_huge(*src_pmd)) {
			int err;
991
			VM_BUG_ON(next-addr != HPAGE_PMD_SIZE);
992 993 994 995 996 997 998 999
			err = copy_huge_pmd(dst_mm, src_mm,
					    dst_pmd, src_pmd, addr, vma);
			if (err == -ENOMEM)
				return -ENOMEM;
			if (!err)
				continue;
			/* fall through */
		}
Linus Torvalds's avatar
Linus Torvalds committed
1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037
		if (pmd_none_or_clear_bad(src_pmd))
			continue;
		if (copy_pte_range(dst_mm, src_mm, dst_pmd, src_pmd,
						vma, addr, next))
			return -ENOMEM;
	} while (dst_pmd++, src_pmd++, addr = next, addr != end);
	return 0;
}

static inline int copy_pud_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
		pgd_t *dst_pgd, pgd_t *src_pgd, struct vm_area_struct *vma,
		unsigned long addr, unsigned long end)
{
	pud_t *src_pud, *dst_pud;
	unsigned long next;

	dst_pud = pud_alloc(dst_mm, dst_pgd, addr);
	if (!dst_pud)
		return -ENOMEM;
	src_pud = pud_offset(src_pgd, addr);
	do {
		next = pud_addr_end(addr, end);
		if (pud_none_or_clear_bad(src_pud))
			continue;
		if (copy_pmd_range(dst_mm, src_mm, dst_pud, src_pud,
						vma, addr, next))
			return -ENOMEM;
	} while (dst_pud++, src_pud++, addr = next, addr != end);
	return 0;
}

int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
		struct vm_area_struct *vma)
{
	pgd_t *src_pgd, *dst_pgd;
	unsigned long next;
	unsigned long addr = vma->vm_start;
	unsigned long end = vma->vm_end;
Andrea Arcangeli's avatar
Andrea Arcangeli committed
1038
	int ret;
Linus Torvalds's avatar
Linus Torvalds committed
1039

1040 1041 1042 1043 1044 1045
	/*
	 * Don't copy ptes where a page fault will fill them correctly.
	 * Fork becomes much lighter when there are big shared or private
	 * readonly mappings. The tradeoff is that copy_page_range is more
	 * efficient than faulting.
	 */
1046
	if (!(vma->vm_flags & (VM_HUGETLB|VM_NONLINEAR|VM_PFNMAP|VM_INSERTPAGE))) {
1047 1048 1049 1050
		if (!vma->anon_vma)
			return 0;
	}

Linus Torvalds's avatar
Linus Torvalds committed
1051 1052 1053
	if (is_vm_hugetlb_page(vma))
		return copy_hugetlb_page_range(dst_mm, src_mm, vma);

1054
	if (unlikely(is_pfn_mapping(vma))) {
1055 1056 1057 1058 1059 1060 1061 1062 1063
		/*
		 * We do not free on error cases below as remove_vma
		 * gets called on error from higher level routine
		 */
		ret = track_pfn_vma_copy(vma);
		if (ret)
			return ret;
	}

	/*
	 * We need to invalidate the secondary MMU mappings only when
	 * there could be a permission downgrade on the ptes of the
	 * parent mm. And a permission downgrade will only happen if
	 * is_cow_mapping() returns true.
	 */
	if (is_cow_mapping(vma->vm_flags))
		mmu_notifier_invalidate_range_start(src_mm, addr, end);

	ret = 0;
Linus Torvalds's avatar
Linus Torvalds committed
1074 1075 1076 1077 1078 1079
	dst_pgd = pgd_offset(dst_mm, addr);
	src_pgd = pgd_offset(src_mm, addr);
	do {
		next = pgd_addr_end(addr, end);
		if (pgd_none_or_clear_bad(src_pgd))
			continue;
Andrea Arcangeli's avatar
Andrea Arcangeli committed
1080 1081 1082 1083 1084
		if (unlikely(copy_pud_range(dst_mm, src_mm, dst_pgd, src_pgd,
					    vma, addr, next))) {
			ret = -ENOMEM;
			break;
		}
Linus Torvalds's avatar
Linus Torvalds committed
1085
	} while (dst_pgd++, src_pgd++, addr = next, addr != end);
Andrea Arcangeli's avatar
Andrea Arcangeli committed
1086 1087 1088 1089 1090

	if (is_cow_mapping(vma->vm_flags))
		mmu_notifier_invalidate_range_end(src_mm,
						  vma->vm_start, end);
	return ret;
Linus Torvalds's avatar
Linus Torvalds committed
1091 1092
}

1093
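/*
 * Unmap one pte page worth of mappings, accumulating the struct pages
 * in the mmu_gather so that TLB flushing and page freeing can be
 * batched. If the gather runs out of room we drop the pte lock, flush,
 * and restart at the current address.
 */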
static unsigned long zap_pte_range(struct mmu_gather *tlb,
1094
				struct vm_area_struct *vma, pmd_t *pmd,
Linus Torvalds's avatar
Linus Torvalds committed
1095
				unsigned long addr, unsigned long end,
1096
				struct zap_details *details)
Linus Torvalds's avatar
Linus Torvalds committed
1097
{
1098
	struct mm_struct *mm = tlb->mm;
Peter Zijlstra's avatar
Peter Zijlstra committed
1099
	int force_flush = 0;
1100
	int rss[NR_MM_COUNTERS];
1101
	spinlock_t *ptl;
1102
	pte_t *start_pte;
1103
	pte_t *pte;
1104

Peter Zijlstra's avatar
Peter Zijlstra committed
1105
again:
1106
	init_rss_vec(rss);
1107 1108
	start_pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
	pte = start_pte;
1109
	arch_enter_lazy_mmu_mode();
Linus Torvalds's avatar
Linus Torvalds committed
1110 1111
	do {
		pte_t ptent = *pte;
1112
		if (pte_none(ptent)) {
Linus Torvalds's avatar
Linus Torvalds committed
1113
			continue;
1114
		}
1115

Linus Torvalds's avatar
Linus Torvalds committed
1116
		if (pte_present(ptent)) {
1117
			struct page *page;
1118

1119
			page = vm_normal_page(vma, addr, ptent);
Linus Torvalds's avatar
Linus Torvalds committed
1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137
			if (unlikely(details) && page) {
				/*
				 * unmap_shared_mapping_pages() wants to
				 * invalidate cache without truncating:
				 * unmap shared but keep private pages.
				 */
				if (details->check_mapping &&
				    details->check_mapping != page->mapping)
					continue;
				/*
				 * Each page->index must be checked when
				 * invalidating or truncating nonlinear.
				 */
				if (details->nonlinear_vma &&
				    (page->index < details->first_index ||
				     page->index > details->last_index))
					continue;
			}
			ptent = ptep_get_and_clear_full(mm, addr, pte,
							tlb->fullmm);
			tlb_remove_tlb_entry(tlb, pte, addr);
			if (unlikely(!page))
				continue;
			if (unlikely(details) && details->nonlinear_vma
			    && linear_page_index(details->nonlinear_vma,
						addr) != page->index)
				set_pte_at(mm, addr, pte,
					   pgoff_to_pte(page->index));
			if (PageAnon(page))
				rss[MM_ANONPAGES]--;
			else {
				if (pte_dirty(ptent))
					set_page_dirty(page);
				if (pte_young(ptent) &&
				    likely(!VM_SequentialReadHint(vma)))
					mark_page_accessed(page);
				rss[MM_FILEPAGES]--;
			}
			page_remove_rmap(page);
			if (unlikely(page_mapcount(page) < 0))
				print_bad_pte(vma, addr, ptent, page);
			force_flush = !__tlb_remove_page(tlb, page);
			if (force_flush)
				break;
			continue;
		}
		/*
		 * If details->check_mapping, we leave swap entries;
		 * if details->nonlinear_vma, we leave file entries.
		 */
		if (unlikely(details))
			continue;
		if (pte_file(ptent)) {
			if (unlikely(!(vma->vm_flags & VM_NONLINEAR)))
				print_bad_pte(vma, addr, ptent, NULL);
		} else {
			swp_entry_t entry = pte_to_swp_entry(ptent);

			if (!non_swap_entry(entry))
				rss[MM_SWAPENTS]--;
			else if (is_migration_entry(entry)) {
				struct page *page;

				page = migration_entry_to_page(entry);

				if (PageAnon(page))
					rss[MM_ANONPAGES]--;
				else
					rss[MM_FILEPAGES]--;
			}
			if (unlikely(!free_swap_and_cache(entry)))
				print_bad_pte(vma, addr, ptent, NULL);
		}
		pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
	} while (pte++, addr += PAGE_SIZE, addr != end);

	add_mm_rss_vec(mm, rss);
	arch_leave_lazy_mmu_mode();
	pte_unmap_unlock(start_pte, ptl);

	/*
	 * mmu_gather ran out of room to batch pages, we break out of
	 * the PTE lock to avoid doing the potential expensive TLB invalidate
	 * and page-free while holding it.
	 */
	if (force_flush) {
		force_flush = 0;
		tlb_flush_mmu(tlb);
		if (addr != end)
			goto again;
	}

	return addr;
}

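/*
 * Walk the pmd entries covering [addr, end), splitting or zapping
 * transparent huge pmds as needed and descending into zap_pte_range()
 * for the rest.
 */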
static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
				struct vm_area_struct *vma, pud_t *pud,
				unsigned long addr, unsigned long end,
				struct zap_details *details)
{
	pmd_t *pmd;
	unsigned long next;

	pmd = pmd_offset(pud, addr);
	do {
		next = pmd_addr_end(addr, end);
		if (pmd_trans_huge(*pmd)) {
			if (next - addr != HPAGE_PMD_SIZE) {
				VM_BUG_ON(!rwsem_is_locked(&tlb->mm->mmap_sem));
				split_huge_page_pmd(vma->vm_mm, pmd);
			} else if (zap_huge_pmd(tlb, vma, pmd, addr))
				goto next;
			/* fall through */
		}
		/*
		 * Here there can be other concurrent MADV_DONTNEED or
		 * trans huge page faults running, and if the pmd is
		 * none or trans huge it can change under us. This is
		 * because MADV_DONTNEED holds the mmap_sem in read
		 * mode.
		 */
		if (pmd_none_or_trans_huge_or_clear_bad(pmd))
			goto next;
		next = zap_pte_range(tlb, vma, pmd, addr, next, details);
next:
		cond_resched();
	} while (pmd++, addr = next, addr != end);

	return addr;
}

static inline unsigned long zap_pud_range(struct mmu_gather *tlb,
				struct vm_area_struct *vma, pgd_t *pgd,
				unsigned long addr, unsigned long end,
				struct zap_details *details)
{
	pud_t *pud;
	unsigned long next;

	pud = pud_offset(pgd, addr);
	do {
		next = pud_addr_end(addr, end);
		if (pud_none_or_clear_bad(pud))
			continue;
		next = zap_pmd_range(tlb, vma, pud, addr, next, details);
	} while (pud++, addr = next, addr != end);

	return addr;
}

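/*
 * Unmap every page in [addr, end) within one vma, walking the page
 * tables from the pgd level down.
 */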
static void unmap_page_range(struct mmu_gather *tlb,
			     struct vm_area_struct *vma,
			     unsigned long addr, unsigned long end,
			     struct zap_details *details)
{
	pgd_t *pgd;
	unsigned long next;

	if (details && !details->check_mapping && !details->nonlinear_vma)
		details = NULL;

	BUG_ON(addr >= end);
	mem_cgroup_uncharge_start();
	tlb_start_vma(tlb, vma);
	pgd = pgd_offset(vma->vm_mm, addr);
	do {
		next = pgd_addr_end(addr, end);
		if (pgd_none_or_clear_bad(pgd))
			continue;
		next = zap_pud_range(tlb, vma, pgd, addr, next, details);
	} while (pgd++, addr = next, addr != end);
	tlb_end_vma(tlb, vma);
	mem_cgroup_uncharge_end();
}

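/*
 * Unmap the part of @vma that overlaps [start_addr, end_addr),
 * untracking PFN mappings and deferring hugetlb areas to
 * unmap_hugepage_range().
 */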
static void unmap_single_vma(struct mmu_gather *tlb,
		struct vm_area_struct *vma, unsigned long start_addr,
		unsigned long end_addr,
		struct zap_details *details)
{
	unsigned long start = max(vma->vm_start, start_addr);
	unsigned long end;

	if (start >= vma->vm_end)
		return;
	end = min(vma->vm_end, end_addr);
	if (end <= vma->vm_start)
		return;

	if (unlikely(is_pfn_mapping(vma)))
		untrack_pfn_vma(vma, 0, 0);

	if (start != end) {
		if (unlikely(is_vm_hugetlb_page(vma))) {
			/*
			 * It is undesirable to test vma->vm_file as it
			 * should be non-null for valid hugetlb area.
			 * However, vm_file will be NULL in the error
			 * cleanup path of do_mmap_pgoff. When
			 * hugetlbfs ->mmap method fails,
			 * do_mmap_pgoff() nullifies vma->vm_file
			 * before calling this function to clean up.
			 * Since no pte has actually been setup, it is
			 * safe to do nothing in this case.
			 */
			if (vma->vm_file)
				unmap_hugepage_range(vma, start, end, NULL);
		} else
			unmap_page_range(tlb, vma, start, end, details);
	}
}

/**
 * unmap_vmas - unmap a range of memory covered by a list of vma's
 * @tlb: address of the caller's struct mmu_gather
 * @vma: the starting vma
 * @start_addr: virtual address at which to start unmapping
 * @end_addr: virtual address at which to end unmapping
 *
 * Unmap all pages in the vma list.
 *
 * Only addresses between `start' and `end' will be unmapped.
 *
 * The VMA list must be sorted in ascending virtual address order.
 *
 * unmap_vmas() assumes that the caller will flush the whole unmapped address
 * range after unmap_vmas() returns.  So the only responsibility here is to
 * ensure that any thus-far unmapped pages are flushed before unmap_vmas()
 * drops the lock and schedules.
 */
void unmap_vmas(struct mmu_gather *tlb,
		struct vm_area_struct *vma, unsigned long start_addr,
		unsigned long end_addr)
{
	struct mm_struct *mm = vma->vm_mm;

	mmu_notifier_invalidate_range_start(mm, start_addr, end_addr);
	for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next)
		unmap_single_vma(tlb, vma, start_addr, end_addr, NULL);
	mmu_notifier_invalidate_range_end(mm, start_addr, end_addr);
}

/**
 * zap_page_range - remove user pages in a given range
 * @vma: vm_area_struct holding the applicable pages
 * @start: starting address of pages to zap
 * @size: number of bytes to zap
 * @details: details of nonlinear truncation or shared cache invalidation
 *
 * Caller must protect the VMA list
 */
void zap_page_range(struct vm_area_struct *vma, unsigned long start,
		unsigned long size, struct zap_details *details)
{
	struct mm_struct *mm = vma->vm_mm;
	struct mmu_gather tlb;
	unsigned long end = start + size;

	lru_add_drain();
	tlb_gather_mmu(&tlb, mm, 0);
	update_hiwater_rss(mm);
	mmu_notifier_invalidate_range_start(mm, start, end);
	for ( ; vma && vma->vm_start < end; vma = vma->vm_next)
		unmap_single_vma(&tlb, vma, start, end, details);
	mmu_notifier_invalidate_range_end(mm, start, end);
	tlb_finish_mmu(&tlb, start, end);
}

/**
 * zap_page_range_single - remove user pages in a given range
 * @vma: vm_area_struct holding the applicable pages
 * @address: starting address of pages to zap
 * @size: number of bytes to zap
 * @details: details of nonlinear truncation or shared cache invalidation
 *
 * The range must fit into one VMA.
 */
static void zap_page_range_single(struct vm_area_struct *vma, unsigned long address,
		unsigned long size, struct zap_details *details)
{
	struct mm_struct *mm = vma->vm_mm;
	struct mmu_gather tlb;
	unsigned long end = address + size;

	lru_add_drain();
	tlb_gather_mmu(&tlb, mm, 0);
	update_hiwater_rss(mm);
	mmu_notifier_invalidate_range_start(mm, address, end);
	unmap_single_vma(&tlb, vma, address, end, details);
	mmu_notifier_invalidate_range_end(mm, address, end);
	tlb_finish_mmu(&tlb, address, end);
}

/**
 * zap_vma_ptes - remove ptes mapping the vma
 * @vma: vm_area_struct holding ptes to be zapped
 * @address: starting address of pages to zap
 * @size: number of bytes to zap
 *
 * This function only unmaps ptes assigned to VM_PFNMAP vmas.
 *
 * The entire address range must be fully contained within the vma.
 *
 * Returns 0 if successful.
 */
int zap_vma_ptes(struct vm_area_struct *vma, unsigned long address,
		unsigned long size)
{
	if (address < vma->vm_start || address + size > vma->vm_end ||
	    		!(vma->vm_flags & VM_PFNMAP))
		return -1;
	zap_page_range_single(vma, address, size, NULL);
	return 0;
}
EXPORT_SYMBOL_GPL(zap_vma_ptes);
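/*
 * Illustrative use only (hypothetical driver code): a driver that owns
 * a VM_PFNMAP mapping could tear down its ptes before inserting new
 * pfns with something like
 *
 *	if (zap_vma_ptes(vma, vma->vm_start, vma->vm_end - vma->vm_start))
 *		return -EINVAL;
 *
 * subject to the range checks made above.
 */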

/**
 * follow_page - look up a page descriptor from a user-virtual address
 * @vma: vm_area_struct mapping @address
 * @address: virtual address to look up
 * @flags: flags modifying lookup behaviour
 *
 * @flags can have FOLL_ flags set, defined in <linux/mm.h>
 *
 * Returns the mapped (struct page *), %NULL if no mapping exists, or
 * an error pointer if there is a mapping to something not represented
 * by a page descriptor (see also vm_normal_page()).
 */
struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
			unsigned int flags)
{
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *ptep, pte;
	spinlock_t *ptl;
	struct page *page;
	struct mm_struct *mm = vma->vm_mm;

	page = follow_huge_addr(mm, address, flags & FOLL_WRITE);
	if (!IS_ERR(page)) {
		BUG_ON(flags & FOLL_GET);
		goto out;
	}

	page = NULL;
	pgd = pgd_offset(mm, address);
	if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
		goto no_page_table;

	pud = pud_offset(pgd, address);
	if (pud_none(*pud))
		goto no_page_table;
	if (pud_huge(*pud) && vma->vm_flags & VM_HUGETLB) {
		BUG_ON(flags & FOLL_GET);
		page = follow_huge_pud(mm, address, pud, flags & FOLL_WRITE);
		goto out;
	}
	if (unlikely(pud_bad(*pud)))
		goto no_page_table;

	pmd = pmd_offset(pud, address);
	if (pmd_none(*pmd))
		goto no_page_table;
	if (pmd_huge(*pmd) && vma->vm_flags & VM_HUGETLB) {
		BUG_ON(flags & FOLL_GET);
		page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE);
		goto out;
	}
	if (pmd_trans_huge(*pmd)) {
		if (flags & FOLL_SPLIT) {
			split_huge_page_pmd(mm, pmd);
			goto split_fallthrough;
		}
		spin_lock(&mm->page_table_lock);
		if (likely(pmd_trans_huge(*pmd))) {
			if (unlikely(pmd_trans_splitting(*pmd))) {
				spin_unlock(&mm->page_table_lock);
				wait_split_huge_page(vma->anon_vma, pmd);
			} else {
				page = follow_trans_huge_pmd(mm, address,
							     pmd, flags);
				spin_unlock(&mm->page_table_lock);
				goto out;
			}
		} else
			spin_unlock(&mm->page_table_lock);
		/* fall through */
	}
split_fallthrough:
	if (unlikely(pmd_bad(*pmd)))
		goto no_page_table;

	ptep = pte_offset_map_lock(mm, pmd, address, &ptl);

	pte = *ptep;
	if (!pte_present(pte))
		goto no_page;
	if ((flags & FOLL_WRITE) && !pte_write(pte))
		goto unlock;