/*
 * Generic hugetlb support.
 * (C) Nadia Yvette Chambers, April 2004
 */
#include <linux/list.h>
#include <linux/init.h>
#include <linux/mm.h>
#include <linux/seq_file.h>
#include <linux/sysctl.h>
#include <linux/highmem.h>
#include <linux/mmu_notifier.h>
#include <linux/nodemask.h>
#include <linux/pagemap.h>
#include <linux/mempolicy.h>
#include <linux/compiler.h>
#include <linux/cpuset.h>
#include <linux/mutex.h>
#include <linux/bootmem.h>
#include <linux/sysfs.h>
#include <linux/slab.h>
#include <linux/mmdebug.h>
#include <linux/sched/signal.h>
#include <linux/rmap.h>
#include <linux/string_helpers.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/jhash.h>

#include <asm/page.h>
#include <asm/pgtable.h>
#include <asm/tlb.h>

#include <linux/io.h>
#include <linux/hugetlb.h>
#include <linux/hugetlb_cgroup.h>
#include <linux/node.h>
#include <linux/userfaultfd_k.h>
#include <linux/page_owner.h>
#include "internal.h"

int hugetlb_max_hstate __read_mostly;
unsigned int default_hstate_idx;
struct hstate hstates[HUGE_MAX_HSTATE];
/*
 * Minimum page order among possible hugepage sizes, set to a proper value
 * at boot time.
 */
static unsigned int minimum_order __read_mostly = UINT_MAX;

__initdata LIST_HEAD(huge_boot_pages);

/* for command line parsing */
static struct hstate * __initdata parsed_hstate;
static unsigned long __initdata default_hstate_max_huge_pages;
static unsigned long __initdata default_hstate_size;
static bool __initdata parsed_valid_hugepagesz = true;

/*
 * Protects updates to hugepage_freelists, hugepage_activelist, nr_huge_pages,
 * free_huge_pages, and surplus_huge_pages.
 */
DEFINE_SPINLOCK(hugetlb_lock);

/*
 * Serializes faults on the same logical page.  This is used to
 * prevent spurious OOMs when the hugepage pool is fully utilized.
 */
static int num_fault_mutexes;
struct mutex *hugetlb_fault_mutex_table ____cacheline_aligned_in_smp;

/* Forward declaration */
static int hugetlb_acct_memory(struct hstate *h, long delta);

static inline void unlock_or_release_subpool(struct hugepage_subpool *spool)
{
	bool free = (spool->count == 0) && (spool->used_hpages == 0);

	spin_unlock(&spool->lock);

	/* If no pages are used, and no other handles to the subpool
	 * remain, give up any reservations based on minimum size and
	 * free the subpool */
	if (free) {
		if (spool->min_hpages != -1)
			hugetlb_acct_memory(spool->hstate,
						-spool->min_hpages);
		kfree(spool);
	}
}

struct hugepage_subpool *hugepage_new_subpool(struct hstate *h, long max_hpages,
						long min_hpages)
{
	struct hugepage_subpool *spool;

	spool = kzalloc(sizeof(*spool), GFP_KERNEL);
	if (!spool)
		return NULL;

	spin_lock_init(&spool->lock);
	spool->count = 1;
	spool->max_hpages = max_hpages;
	spool->hstate = h;
	spool->min_hpages = min_hpages;

	if (min_hpages != -1 && hugetlb_acct_memory(h, min_hpages)) {
		kfree(spool);
		return NULL;
	}
	spool->rsv_hpages = min_hpages;

	return spool;
}

void hugepage_put_subpool(struct hugepage_subpool *spool)
{
	spin_lock(&spool->lock);
	BUG_ON(!spool->count);
	spool->count--;
	unlock_or_release_subpool(spool);
}

/*
 * Subpool accounting for allocating and reserving pages.
 * Return -ENOMEM if there are not enough resources to satisfy
 * the request.  Otherwise, return the number of pages by which the
 * global pools must be adjusted (upward).  The returned value may
 * only be different than the passed value (delta) in the case where
 * a subpool minimum size must be maintained.
 */
static long hugepage_subpool_get_pages(struct hugepage_subpool *spool,
				      long delta)
{
	long ret = delta;

	if (!spool)
		return ret;

	spin_lock(&spool->lock);

	if (spool->max_hpages != -1) {		/* maximum size accounting */
		if ((spool->used_hpages + delta) <= spool->max_hpages)
			spool->used_hpages += delta;
		else {
			ret = -ENOMEM;
			goto unlock_ret;
		}
	}

	/* minimum size accounting */
	if (spool->min_hpages != -1 && spool->rsv_hpages) {
		if (delta > spool->rsv_hpages) {
			/*
			 * Asking for more reserves than those already taken on
			 * behalf of subpool.  Return difference.
			 */
			ret = delta - spool->rsv_hpages;
			spool->rsv_hpages = 0;
		} else {
			ret = 0;	/* reserves already accounted for */
			spool->rsv_hpages -= delta;
		}
	}

unlock_ret:
	spin_unlock(&spool->lock);
	return ret;
}

/*
 * Subpool accounting for freeing and unreserving pages.
 * Return the number of global page reservations that must be dropped.
 * The return value may only be different than the passed value (delta)
 * in the case where a subpool minimum size must be maintained.
 */
static long hugepage_subpool_put_pages(struct hugepage_subpool *spool,
				       long delta)
{
	long ret = delta;

	if (!spool)
		return delta;

	spin_lock(&spool->lock);

	if (spool->max_hpages != -1)		/* maximum size accounting */
		spool->used_hpages -= delta;

	 /* minimum size accounting */
	if (spool->min_hpages != -1 && spool->used_hpages < spool->min_hpages) {
		if (spool->rsv_hpages + delta <= spool->min_hpages)
			ret = 0;
		else
			ret = spool->rsv_hpages + delta - spool->min_hpages;

		spool->rsv_hpages += delta;
		if (spool->rsv_hpages > spool->min_hpages)
			spool->rsv_hpages = spool->min_hpages;
	}

	/*
	 * If hugetlbfs_put_super couldn't free spool due to an outstanding
	 * quota reference, free it now.
	 */
	unlock_or_release_subpool(spool);

	return ret;
}
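
/*
 * Illustrative example (not part of the original file): take a subpool
 * created with max_hpages == -1 and min_hpages == 2, so rsv_hpages
 * starts at 2 (two global reservations were taken up front by
 * hugepage_new_subpool).  A call to hugepage_subpool_get_pages(spool, 3)
 * consumes both reserves and returns 1, i.e. only one extra page must
 * be charged to the global pool.  Returning the pages later with
 * hugepage_subpool_put_pages(spool, 3) refills rsv_hpages up to the
 * minimum of 2 and returns 1, so only that one extra global
 * reservation is dropped.
 */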

static inline struct hugepage_subpool *subpool_inode(struct inode *inode)
{
	return HUGETLBFS_SB(inode->i_sb)->spool;
}

static inline struct hugepage_subpool *subpool_vma(struct vm_area_struct *vma)
{
	return subpool_inode(file_inode(vma->vm_file));
}

/*
 * Region tracking -- allows tracking of reservations and instantiated pages
 *                    across the pages in a mapping.
 *
 * The region data structures are embedded into a resv_map and protected
 * by a resv_map's lock.  The set of regions within the resv_map represent
 * reservations for huge pages, or huge pages that have already been
 * instantiated within the map.  The from and to elements are huge page
 * indices into the associated mapping.  from indicates the starting index
 * of the region.  to represents the first index past the end of the region.
 *
 * For example, a file region structure with from == 0 and to == 4 represents
 * four huge pages in a mapping.  It is important to note that the to element
 * represents the first element past the end of the region. This is used in
 * arithmetic as 4(to) - 0(from) = 4 huge pages in the region.
 *
 * Interval notation of the form [from, to) will be used to indicate that
 * the endpoint from is inclusive and to is exclusive.
 */
struct file_region {
	struct list_head link;
	long from;
	long to;
};

/*
 * Add the huge page range represented by [f, t) to the reserve
 * map.  In the normal case, existing regions will be expanded
 * to accommodate the specified range.  Sufficient regions should
 * exist for expansion due to the previous call to region_chg
 * with the same range.  However, it is possible that region_del
 * could have been called after region_chg and modified the map
 * in such a way that no region exists to be expanded.  In this
 * case, pull a region descriptor from the cache associated with
 * the map and use that for the new range.
 *
 * Return the number of new huge pages added to the map.  This
 * number is greater than or equal to zero.
 */
static long region_add(struct resv_map *resv, long f, long t)
{
	struct list_head *head = &resv->regions;
	struct file_region *rg, *nrg, *trg;
	long add = 0;

	spin_lock(&resv->lock);
	/* Locate the region we are either in or before. */
	list_for_each_entry(rg, head, link)
		if (f <= rg->to)
			break;

	/*
	 * If no region exists which can be expanded to include the
	 * specified range, the list must have been modified by an
	 * interleaving call to region_del().  Pull a region descriptor
	 * from the cache and use it for this range.
	 */
	if (&rg->link == head || t < rg->from) {
		VM_BUG_ON(resv->region_cache_count <= 0);

		resv->region_cache_count--;
		nrg = list_first_entry(&resv->region_cache, struct file_region,
					link);
		list_del(&nrg->link);

		nrg->from = f;
		nrg->to = t;
		list_add(&nrg->link, rg->link.prev);

		add += t - f;
		goto out_locked;
	}

	/* Round our left edge to the current segment if it encloses us. */
	if (f > rg->from)
		f = rg->from;

	/* Check for and consume any regions we now overlap with. */
	nrg = rg;
	list_for_each_entry_safe(rg, trg, rg->link.prev, link) {
		if (&rg->link == head)
			break;
		if (rg->from > t)
			break;

		/* If this area reaches higher then extend our area to
		 * include it completely.  If this is not the first area
		 * which we intend to reuse, free it. */
		if (rg->to > t)
			t = rg->to;
		if (rg != nrg) {
			/* Decrement return value by the deleted range.
			 * Another range will span this area so that by
			 * end of routine add will be >= zero
			 */
			add -= (rg->to - rg->from);
			list_del(&rg->link);
			kfree(rg);
		}
	}

	add += (nrg->from - f);		/* Added to beginning of region */
	nrg->from = f;
	add += t - nrg->to;		/* Added to end of region */
	nrg->to = t;

out_locked:
	resv->adds_in_progress--;
	spin_unlock(&resv->lock);
	VM_BUG_ON(add < 0);
	return add;
}

/*
 * Examine the existing reserve map and determine how many
 * huge pages in the specified range [f, t) are NOT currently
 * represented.  This routine is called before a subsequent
 * call to region_add that will actually modify the reserve
 * map to add the specified range [f, t).  region_chg does
 * not change the number of huge pages represented by the
 * map.  However, if the existing regions in the map can not
 * be expanded to represent the new range, a new file_region
 * structure is added to the map as a placeholder.  This is
 * so that the subsequent region_add call will have all the
 * regions it needs and will not fail.
 *
 * Upon entry, region_chg will also examine the cache of region descriptors
 * associated with the map.  If there are not enough descriptors cached, one
 * will be allocated for the in progress add operation.
 *
 * Returns the number of huge pages that need to be added to the existing
 * reservation map for the range [f, t).  This number is greater than or
 * equal to zero.  -ENOMEM is returned if a new file_region structure or
 * cache entry is needed and can not be allocated.
 */
static long region_chg(struct resv_map *resv, long f, long t)
{
	struct list_head *head = &resv->regions;
	struct file_region *rg, *nrg = NULL;
	long chg = 0;

retry:
	spin_lock(&resv->lock);
retry_locked:
	resv->adds_in_progress++;

	/*
	 * Check for sufficient descriptors in the cache to accommodate
	 * the number of in progress add operations.
	 */
	if (resv->adds_in_progress > resv->region_cache_count) {
		struct file_region *trg;

		VM_BUG_ON(resv->adds_in_progress - resv->region_cache_count > 1);
		/* Must drop lock to allocate a new descriptor. */
		resv->adds_in_progress--;
		spin_unlock(&resv->lock);

		trg = kmalloc(sizeof(*trg), GFP_KERNEL);
		if (!trg) {
			kfree(nrg);
			return -ENOMEM;
		}

		spin_lock(&resv->lock);
		list_add(&trg->link, &resv->region_cache);
		resv->region_cache_count++;
		goto retry_locked;
	}

	/* Locate the region we are before or in. */
	list_for_each_entry(rg, head, link)
		if (f <= rg->to)
			break;

	/* If we are below the current region then a new region is required.
	 * Subtle, allocate a new region at the position but make it zero
	 * size such that we can guarantee to record the reservation. */
	if (&rg->link == head || t < rg->from) {
		if (!nrg) {
			resv->adds_in_progress--;
			spin_unlock(&resv->lock);
			nrg = kmalloc(sizeof(*nrg), GFP_KERNEL);
			if (!nrg)
				return -ENOMEM;

			nrg->from = f;
			nrg->to   = f;
			INIT_LIST_HEAD(&nrg->link);
			goto retry;
		}

		list_add(&nrg->link, rg->link.prev);
		chg = t - f;
		goto out_nrg;
	}

	/* Round our left edge to the current segment if it encloses us. */
	if (f > rg->from)
		f = rg->from;
	chg = t - f;

	/* Check for and consume any regions we now overlap with. */
	list_for_each_entry(rg, rg->link.prev, link) {
		if (&rg->link == head)
			break;
		if (rg->from > t)
			goto out;

		/* We overlap with this area, if it extends further than
		 * us then we must extend ourselves.  Account for its
		 * existing reservation. */
		if (rg->to > t) {
			chg += rg->to - t;
			t = rg->to;
		}
		chg -= rg->to - rg->from;
	}

out:
	spin_unlock(&resv->lock);
	/*  We already know we raced and no longer need the new region */
	kfree(nrg);
	return chg;
out_nrg:
	spin_unlock(&resv->lock);
	return chg;
}
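
/*
 * Illustrative example (not part of the original file): region_chg() and
 * region_add() form a two-stage protocol.  region_chg(resv, f, t) reports
 * how many pages in [f, t) still need reserves and guarantees a descriptor
 * exists; the caller then commits with region_add(resv, f, t), or backs
 * out with region_abort() below.  On an empty map, region_chg(resv, 0, 4)
 * returns 4 and leaves a zero-size placeholder, and the matching
 * region_add(resv, 0, 4) expands that placeholder to [0, 4) and also
 * returns 4.
 */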

/*
 * Abort the in progress add operation.  The adds_in_progress field
 * of the resv_map keeps track of the operations in progress between
 * calls to region_chg and region_add.  Operations are sometimes
 * aborted after the call to region_chg.  In such cases, region_abort
 * is called to decrement the adds_in_progress counter.
 *
 * NOTE: The range arguments [f, t) are not needed or used in this
 * routine.  They are kept to make reading the calling code easier as
 * arguments will match the associated region_chg call.
 */
static void region_abort(struct resv_map *resv, long f, long t)
{
	spin_lock(&resv->lock);
	VM_BUG_ON(!resv->region_cache_count);
	resv->adds_in_progress--;
	spin_unlock(&resv->lock);
}

/*
 * Delete the specified range [f, t) from the reserve map.  If the
 * t parameter is LONG_MAX, this indicates that ALL regions after f
 * should be deleted.  Locate the regions which intersect [f, t)
 * and either trim, delete or split the existing regions.
 *
 * Returns the number of huge pages deleted from the reserve map.
 * In the normal case, the return value is zero or more.  In the
 * case where a region must be split, a new region descriptor must
 * be allocated.  If the allocation fails, -ENOMEM will be returned.
 * NOTE: If the parameter t == LONG_MAX, then we will never split
 * a region and thus never return -ENOMEM.  Callers specifying
 * t == LONG_MAX do not need to check for -ENOMEM error.
 */
static long region_del(struct resv_map *resv, long f, long t)
{
	struct list_head *head = &resv->regions;
	struct file_region *rg, *trg;
	struct file_region *nrg = NULL;
	long del = 0;

retry:
	spin_lock(&resv->lock);
	list_for_each_entry_safe(rg, trg, head, link) {
		/*
		 * Skip regions before the range to be deleted.  file_region
		 * ranges are normally of the form [from, to).  However, there
		 * may be a "placeholder" entry in the map which is of the form
		 * (from, to) with from == to.  Check for placeholder entries
		 * at the beginning of the range to be deleted.
		 */
		if (rg->to <= f && (rg->to != rg->from || rg->to != f))
			continue;

		if (rg->from >= t)
			break;

		if (f > rg->from && t < rg->to) { /* Must split region */
			/*
			 * Check for an entry in the cache before dropping
			 * lock and attempting allocation.
			 */
			if (!nrg &&
			    resv->region_cache_count > resv->adds_in_progress) {
				nrg = list_first_entry(&resv->region_cache,
							struct file_region,
							link);
				list_del(&nrg->link);
				resv->region_cache_count--;
			}

			if (!nrg) {
				spin_unlock(&resv->lock);
				nrg = kmalloc(sizeof(*nrg), GFP_KERNEL);
				if (!nrg)
					return -ENOMEM;
				goto retry;
			}

			del += t - f;

			/* New entry for end of split region */
			nrg->from = t;
			nrg->to = rg->to;
			INIT_LIST_HEAD(&nrg->link);

			/* Original entry is trimmed */
			rg->to = f;

			list_add(&nrg->link, &rg->link);
			nrg = NULL;
			break;
		}

		if (f <= rg->from && t >= rg->to) { /* Remove entire region */
			del += rg->to - rg->from;
			list_del(&rg->link);
			kfree(rg);
			continue;
		}

		if (f <= rg->from) {	/* Trim beginning of region */
			del += t - rg->from;
			rg->from = t;
		} else {		/* Trim end of region */
			del += rg->to - f;
			rg->to = f;
		}
	}

	spin_unlock(&resv->lock);
	kfree(nrg);
	return del;
}
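
/*
 * Illustrative example (not part of the original file): deleting [2, 4)
 * from a map holding the single region [0, 6) takes the split path above:
 * the existing entry is trimmed to [0, 2), a descriptor for [4, 6) is
 * inserted after it, and the function returns 2.
 */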

/*
 * A rare out of memory error was encountered which prevented removal of
 * the reserve map region for a page.  The huge page itself was freed
 * and removed from the page cache.  This routine will adjust the subpool
 * usage count, and the global reserve count if needed.  By incrementing
 * these counts, the reserve map entry which could not be deleted will
 * appear as a "reserved" entry instead of simply dangling with incorrect
 * counts.
 */
void hugetlb_fix_reserve_counts(struct inode *inode)
{
	struct hugepage_subpool *spool = subpool_inode(inode);
	long rsv_adjust;

	rsv_adjust = hugepage_subpool_get_pages(spool, 1);
	if (rsv_adjust) {
		struct hstate *h = hstate_inode(inode);

		hugetlb_acct_memory(h, 1);
	}
}

/*
 * Count and return the number of huge pages in the reserve map
 * that intersect with the range [f, t).
 */
static long region_count(struct resv_map *resv, long f, long t)
{
	struct list_head *head = &resv->regions;
	struct file_region *rg;
	long chg = 0;

	spin_lock(&resv->lock);
	/* Locate each segment we overlap with, and count that overlap. */
	list_for_each_entry(rg, head, link) {
		long seg_from;
		long seg_to;

		if (rg->to <= f)
			continue;
		if (rg->from >= t)
			break;

		seg_from = max(rg->from, f);
		seg_to = min(rg->to, t);

		chg += seg_to - seg_from;
	}
	spin_unlock(&resv->lock);

	return chg;
}

/*
 * Convert the address within this vma to the page offset within
 * the mapping, in pagecache page units; huge pages here.
 */
static pgoff_t vma_hugecache_offset(struct hstate *h,
			struct vm_area_struct *vma, unsigned long address)
{
	return ((address - vma->vm_start) >> huge_page_shift(h)) +
			(vma->vm_pgoff >> huge_page_order(h));
}
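
/*
 * Illustrative example (not part of the original file): with 2MB huge
 * pages (huge_page_shift() == 21, huge_page_order() == 9), a mapping
 * with vm_pgoff == 0, and a fault address 6MB past vm_start, the result
 * is (6MB >> 21) + 0 == 3, i.e. the fourth huge page in the file.
 */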

pgoff_t linear_hugepage_index(struct vm_area_struct *vma,
				     unsigned long address)
{
	return vma_hugecache_offset(hstate_vma(vma), vma, address);
}
EXPORT_SYMBOL_GPL(linear_hugepage_index);

/*
 * Return the size of the pages allocated when backing a VMA. In the majority
 * of cases this will be the same size as used by the page table entries.
 */
unsigned long vma_kernel_pagesize(struct vm_area_struct *vma)
{
	if (vma->vm_ops && vma->vm_ops->pagesize)
		return vma->vm_ops->pagesize(vma);
	return PAGE_SIZE;
}
EXPORT_SYMBOL_GPL(vma_kernel_pagesize);

/*
 * Return the page size being used by the MMU to back a VMA. In the majority
 * of cases, the page size used by the kernel matches the MMU size. On
 * architectures where it differs, an architecture-specific 'strong'
 * version of this symbol is required.
 */
__weak unsigned long vma_mmu_pagesize(struct vm_area_struct *vma)
{
	return vma_kernel_pagesize(vma);
}

/*
 * Flags for MAP_PRIVATE reservations.  These are stored in the bottom
 * bits of the reservation map pointer, which are always clear due to
 * alignment.
 */
#define HPAGE_RESV_OWNER    (1UL << 0)
#define HPAGE_RESV_UNMAPPED (1UL << 1)
#define HPAGE_RESV_MASK (HPAGE_RESV_OWNER | HPAGE_RESV_UNMAPPED)

/*
 * These helpers are used to track how many pages are reserved for
 * faults in a MAP_PRIVATE mapping. Only the process that called mmap()
 * is guaranteed to have their future faults succeed.
 *
 * With the exception of reset_vma_resv_huge_pages() which is called at fork(),
 * the reserve counters are updated with the hugetlb_lock held. It is safe
 * to reset the VMA at fork() time as it is not in use yet and there is no
 * chance of the global counters getting corrupted as a result of the values.
 *
 * The private mapping reservation is represented in a subtly different
 * manner to a shared mapping.  A shared mapping has a region map associated
 * with the underlying file; this region map represents the backing file
 * pages which have ever had a reservation assigned, and it persists even
 * after the page is instantiated.  A private mapping has a region map
 * associated with the original mmap which is attached to all VMAs which
 * reference it; this region map represents those offsets which have consumed
 * reservation, i.e. where pages have been instantiated.
 */
static unsigned long get_vma_private_data(struct vm_area_struct *vma)
{
	return (unsigned long)vma->vm_private_data;
}

static void set_vma_private_data(struct vm_area_struct *vma,
							unsigned long value)
{
	vma->vm_private_data = (void *)value;
}

struct resv_map *resv_map_alloc(void)
{
	struct resv_map *resv_map = kmalloc(sizeof(*resv_map), GFP_KERNEL);
	struct file_region *rg = kmalloc(sizeof(*rg), GFP_KERNEL);

	if (!resv_map || !rg) {
		kfree(resv_map);
		kfree(rg);
		return NULL;
	}

	kref_init(&resv_map->refs);
	spin_lock_init(&resv_map->lock);
	INIT_LIST_HEAD(&resv_map->regions);

	resv_map->adds_in_progress = 0;

	INIT_LIST_HEAD(&resv_map->region_cache);
	list_add(&rg->link, &resv_map->region_cache);
	resv_map->region_cache_count = 1;

	return resv_map;
}

void resv_map_release(struct kref *ref)
{
	struct resv_map *resv_map = container_of(ref, struct resv_map, refs);
	struct list_head *head = &resv_map->region_cache;
	struct file_region *rg, *trg;

	/* Clear out any active regions before we release the map. */
	region_del(resv_map, 0, LONG_MAX);

	/* ... and any entries left in the cache */
	list_for_each_entry_safe(rg, trg, head, link) {
		list_del(&rg->link);
		kfree(rg);
	}

	VM_BUG_ON(resv_map->adds_in_progress);

	kfree(resv_map);
}

static inline struct resv_map *inode_resv_map(struct inode *inode)
{
	return inode->i_mapping->private_data;
}

static struct resv_map *vma_resv_map(struct vm_area_struct *vma)
{
	VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma);
	if (vma->vm_flags & VM_MAYSHARE) {
		struct address_space *mapping = vma->vm_file->f_mapping;
		struct inode *inode = mapping->host;

		return inode_resv_map(inode);

	} else {
		return (struct resv_map *)(get_vma_private_data(vma) &
							~HPAGE_RESV_MASK);
	}
}

static void set_vma_resv_map(struct vm_area_struct *vma, struct resv_map *map)
{
	VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma);
	VM_BUG_ON_VMA(vma->vm_flags & VM_MAYSHARE, vma);

	set_vma_private_data(vma, (get_vma_private_data(vma) &
				HPAGE_RESV_MASK) | (unsigned long)map);
}

static void set_vma_resv_flags(struct vm_area_struct *vma, unsigned long flags)
{
	VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma);
	VM_BUG_ON_VMA(vma->vm_flags & VM_MAYSHARE, vma);

	set_vma_private_data(vma, get_vma_private_data(vma) | flags);
}

static int is_vma_resv_set(struct vm_area_struct *vma, unsigned long flag)
{
	VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma);

	return (get_vma_private_data(vma) & flag) != 0;
}

/* Reset counters to 0 and clear all HPAGE_RESV_* flags */
void reset_vma_resv_huge_pages(struct vm_area_struct *vma)
{
	VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma);
	if (!(vma->vm_flags & VM_MAYSHARE))
		vma->vm_private_data = (void *)0;
}

/* Returns true if the VMA has associated reserve pages */
static bool vma_has_reserves(struct vm_area_struct *vma, long chg)
{
	if (vma->vm_flags & VM_NORESERVE) {
		/*
		 * This address is already reserved by another process (chg == 0),
		 * so, we should decrement reserved count. Without decrementing,
		 * reserve count remains after releasing inode, because this
		 * allocated page will go into page cache and is regarded as
		 * coming from reserved pool in releasing step.  Currently, we
		 * don't have any other solution to deal with this situation
		 * properly, so add work-around here.
		 */
		if (vma->vm_flags & VM_MAYSHARE && chg == 0)
			return true;
		else
			return false;
	}

	/* Shared mappings always use reserves */
	if (vma->vm_flags & VM_MAYSHARE) {
		/*
		 * We know VM_NORESERVE is not set.  Therefore, there SHOULD
		 * be a region map for all pages.  The only situation where
		 * there is no region map is if a hole was punched via
		 * fallocate.  In this case, there really are no reserves to
		 * use.  This situation is indicated if chg != 0.
		 */
		if (chg)
			return false;
		else
			return true;
	}

	/*
	 * Only the process that called mmap() has reserves for
	 * private mappings.
	 */
	if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) {
		/*
		 * Like the shared case above, a hole punch or truncate
		 * could have been performed on the private mapping.
		 * Examine the value of chg to determine if reserves
		 * actually exist or were previously consumed.
		 * Very Subtle - The value of chg comes from a previous
		 * call to vma_needs_reserves().  The reserve map for
		 * private mappings has different (opposite) semantics
		 * than that of shared mappings.  vma_needs_reserves()
		 * has already taken this difference in semantics into
		 * account.  Therefore, the meaning of chg is the same
		 * as in the shared case above.  Code could easily be
		 * combined, but keeping it separate draws attention to
		 * subtle differences.
		 */
		if (chg)
			return false;
		else
			return true;
	}

	return false;
}

static void enqueue_huge_page(struct hstate *h, struct page *page)
{
	int nid = page_to_nid(page);
	list_move(&page->lru, &h->hugepage_freelists[nid]);
	h->free_huge_pages++;
	h->free_huge_pages_node[nid]++;
}

static struct page *dequeue_huge_page_node_exact(struct hstate *h, int nid)
{
	struct page *page;

	list_for_each_entry(page, &h->hugepage_freelists[nid], lru)
		if (!PageHWPoison(page))
			break;
	/*
	 * if 'non-isolated free hugepage' not found on the list,
	 * the allocation fails.
	 */
	if (&h->hugepage_freelists[nid] == &page->lru)
		return NULL;
	list_move(&page->lru, &h->hugepage_activelist);
	set_page_refcounted(page);
	h->free_huge_pages--;
	h->free_huge_pages_node[nid]--;
	return page;
}

static struct page *dequeue_huge_page_nodemask(struct hstate *h, gfp_t gfp_mask, int nid,
		nodemask_t *nmask)
{
	unsigned int cpuset_mems_cookie;
	struct zonelist *zonelist;
	struct zone *zone;
	struct zoneref *z;
	int node = -1;

	zonelist = node_zonelist(nid, gfp_mask);

retry_cpuset:
	cpuset_mems_cookie = read_mems_allowed_begin();
	for_each_zone_zonelist_nodemask(zone, z, zonelist, gfp_zone(gfp_mask), nmask) {
		struct page *page;

		if (!cpuset_zone_allowed(zone, gfp_mask))
			continue;
		/*
		 * no need to ask again on the same node. Pool is node rather than
		 * zone aware
		 */
		if (zone_to_nid(zone) == node)
			continue;
		node = zone_to_nid(zone);

		page = dequeue_huge_page_node_exact(h, node);
		if (page)
			return page;
	}
	if (unlikely(read_mems_allowed_retry(cpuset_mems_cookie)))
		goto retry_cpuset;

	return NULL;
}

/* Movability of hugepages depends on migration support. */
static inline gfp_t htlb_alloc_mask(struct hstate *h)
{
	if (hugepage_migration_supported(h))
		return GFP_HIGHUSER_MOVABLE;
	else
		return GFP_HIGHUSER;
}

static struct page *dequeue_huge_page_vma(struct hstate *h,
				struct vm_area_struct *vma,
				unsigned long address, int avoid_reserve,
				long chg)
{
	struct page *page;
	struct mempolicy *mpol;
	gfp_t gfp_mask;
	nodemask_t *nodemask;
	int nid;

	/*
	 * A child process with MAP_PRIVATE mappings created by its parent
	 * has no page reserves. This check ensures that reservations are
	 * not "stolen". The child may still get SIGKILLed
	 */
	if (!vma_has_reserves(vma, chg) &&
			h->free_huge_pages - h->resv_huge_pages == 0)
		goto err;

	/* If reserves cannot be used, ensure enough pages are in the pool */
	if (avoid_reserve && h->free_huge_pages - h->resv_huge_pages == 0)
		goto err;

	gfp_mask = htlb_alloc_mask(h);
	nid = huge_node(vma, address, gfp_mask, &mpol, &nodemask);
	page = dequeue_huge_page_nodemask(h, gfp_mask, nid, nodemask);
	if (page && !avoid_reserve && vma_has_reserves(vma, chg)) {
		SetPagePrivate(page);
		h->resv_huge_pages--;
	}

	mpol_cond_put(mpol);
	return page;

err:
	return NULL;
}

/*
 * common helper functions for hstate_next_node_to_{alloc|free}.
 * We may have allocated or freed a huge page based on a different
 * nodes_allowed previously, so h->next_node_to_{alloc|free} might
 * be outside of *nodes_allowed.  Ensure that we use an allowed
 * node for alloc or free.
 */
static int next_node_allowed(int nid, nodemask_t *nodes_allowed)
{
	nid = next_node_in(nid, *nodes_allowed);
	VM_BUG_ON(nid >= MAX_NUMNODES);

	return nid;
}

static int get_valid_node_allowed(int nid, nodemask_t *nodes_allowed)
{
	if (!node_isset(nid, *nodes_allowed))
		nid = next_node_allowed(nid, nodes_allowed);
	return nid;
}

/*
 * returns the previously saved node ["this node"] from which to
 * allocate a persistent huge page for the pool and advance the
 * next node from which to allocate, handling wrap at end of node
 * mask.
 */
static int hstate_next_node_to_alloc(struct hstate *h,
					nodemask_t *nodes_allowed)
{
	int nid;

	VM_BUG_ON(!nodes_allowed);

	nid = get_valid_node_allowed(h->next_nid_to_alloc, nodes_allowed);
	h->next_nid_to_alloc = next_node_allowed(nid, nodes_allowed);

	return nid;
}

/*
 * helper for free_pool_huge_page() - return the previously saved
 * node ["this node"] from which to free a huge page.  Advance the
 * next node id whether or not we find a free huge page to free so
 * that the next attempt to free addresses the next node.
 */
static int hstate_next_node_to_free(struct hstate *h, nodemask_t *nodes_allowed)
{
	int nid;

	VM_BUG_ON(!nodes_allowed);

	nid = get_valid_node_allowed(h->next_nid_to_free, nodes_allowed);
	h->next_nid_to_free = next_node_allowed(nid, nodes_allowed);

	return nid;
}

#define for_each_node_mask_to_alloc(hs, nr_nodes, node, mask)		\
	for (nr_nodes = nodes_weight(*mask);				\
		nr_nodes > 0 &&						\
		((node = hstate_next_node_to_alloc(hs, mask)) || 1);	\
		nr_nodes--)

#define for_each_node_mask_to_free(hs, nr_nodes, node, mask)		\
	for (nr_nodes = nodes_weight(*mask);				\
		nr_nodes > 0 &&						\
		((node = hstate_next_node_to_free(hs, mask)) || 1);	\
		nr_nodes--)

#ifdef CONFIG_ARCH_HAS_GIGANTIC_PAGE
static void destroy_compound_gigantic_page(struct page *page,
					unsigned int order)
{
	int i;
	int nr_pages = 1 << order;
	struct page *p = page + 1;

	atomic_set(compound_mapcount_ptr(page), 0);
	for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) {
		clear_compound_head(p);
		set_page_refcounted(p);
	}

	set_compound_order(page, 0);
	__ClearPageHead(page);
}

static void free_gigantic_page(struct page *page, unsigned int order)
{
	free_contig_range(page_to_pfn(page), 1 << order);
}

static int __alloc_gigantic_page(unsigned long start_pfn,
				unsigned long nr_pages, gfp_t gfp_mask)
{
	unsigned long end_pfn = start_pfn + nr_pages;
	return alloc_contig_range(start_pfn, end_pfn, MIGRATE_MOVABLE,
				  gfp_mask);
}

static bool pfn_range_valid_gigantic(struct zone *z,
			unsigned long start_pfn, unsigned long nr_pages)
{
	unsigned long i, end_pfn = start_pfn + nr_pages;
	struct page *page;

	for (i = start_pfn; i < end_pfn; i++) {
		if (!pfn_valid(i))
			return false;

		page = pfn_to_page(i);

		if (page_zone(page) != z)
			return false;

		if (PageReserved(page))
			return false;

		if (page_count(page) > 0)
			return false;

		if (PageHuge(page))
			return false;
	}

	return true;
}

static bool zone_spans_last_pfn(const struct zone *zone,
			unsigned long start_pfn, unsigned long nr_pages)
{
	unsigned long last_pfn = start_pfn + nr_pages - 1;
	return zone_spans_pfn(zone, last_pfn);
}

static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask,
		int nid, nodemask_t *nodemask)
{
	unsigned int order = huge_page_order(h);
	unsigned long nr_pages = 1 << order;
	unsigned long ret, pfn, flags;
	struct zonelist *zonelist;
	struct zone *zone;
	struct zoneref *z;

	zonelist = node_zonelist(nid, gfp_mask);
	for_each_zone_zonelist_nodemask(zone, z, zonelist, gfp_zone(gfp_mask), nodemask) {
		spin_lock_irqsave(&zone->lock, flags);

		pfn = ALIGN(zone->zone_start_pfn, nr_pages);
		while (zone_spans_last_pfn(zone, pfn, nr_pages)) {
			if (pfn_range_valid_gigantic(zone, pfn, nr_pages)) {
				/*
				 * We release the zone lock here because
				 * alloc_contig_range() will also lock the zone
				 * at some point. If there's an allocation
				 * spinning on this lock, it may win the race
				 * and cause alloc_contig_range() to fail...
				 */
				spin_unlock_irqrestore(&zone->lock, flags);
				ret = __alloc_gigantic_page(pfn, nr_pages, gfp_mask);
				if (!ret)
					return pfn_to_page(pfn);
				spin_lock_irqsave(&zone->lock, flags);
			}
			pfn += nr_pages;
		}

		spin_unlock_irqrestore(&zone->lock, flags);
	}

	return NULL;
}

static void prep_new_huge_page(struct hstate *h, struct page *page, int nid);
static void prep_compound_gigantic_page(struct page *page, unsigned int order);

#else /* !CONFIG_ARCH_HAS_GIGANTIC_PAGE */
static inline bool gigantic_page_supported(void) { return false; }
static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask,
		int nid, nodemask_t *nodemask) { return NULL; }
static inline void free_gigantic_page(struct page *page, unsigned int order) { }
static inline void destroy_compound_gigantic_page(struct page *page,
						unsigned int order) { }
#endif

static void update_and_free_page(struct hstate *h, struct page *page)
{
	int i;

	if (hstate_is_gigantic(h) && !gigantic_page_supported())
		return;

	h->nr_huge_pages--;
	h->nr_huge_pages_node[page_to_nid(page)]--;
	for (i = 0; i < pages_per_huge_page(h); i++) {
		page[i].flags &= ~(1 << PG_locked | 1 << PG_error |
				1 << PG_referenced | 1 << PG_dirty |
				1 << PG_active | 1 << PG_private |
				1 << PG_writeback);
	}
	VM_BUG_ON_PAGE(hugetlb_cgroup_from_page(page), page);
	set_compound_page_dtor(page, NULL_COMPOUND_DTOR);
	set_page_refcounted(page);
	if (hstate_is_gigantic(h)) {
		destroy_compound_gigantic_page(page, huge_page_order(h));
		free_gigantic_page(page, huge_page_order(h));
	} else {
		__free_pages(page, huge_page_order(h));
	}
}

struct hstate *size_to_hstate(unsigned long size)
{
	struct hstate *h;

	for_each_hstate(h) {
		if (huge_page_size(h) == size)
			return h;
	}
	return NULL;
}

/*
 * Test to determine whether the hugepage is "active/in-use" (i.e. being linked
 * to hstate->hugepage_activelist.)
 *
 * This function can be called for tail pages, but never returns true for them.
 */
bool page_huge_active(struct page *page)
{
	VM_BUG_ON_PAGE(!PageHuge(page), page);
	return PageHead(page) && PagePrivate(&page[1]);
}

/* never called for tail page */
static void set_page_huge_active(struct page *page)
{
	VM_BUG_ON_PAGE(!PageHeadHuge(page), page);
	SetPagePrivate(&page[1]);
}

static void clear_page_huge_active(struct page *page)
{
	VM_BUG_ON_PAGE(!PageHeadHuge(page), page);
	ClearPagePrivate(&page[1]);
}

/*
 * Internal hugetlb specific page flag. Do not use outside of the hugetlb
 * code
 */
static inline bool PageHugeTemporary(struct page *page)
{
	if (!PageHuge(page))
		return false;

	return (unsigned long)page[2].mapping == -1U;
}

static inline void SetPageHugeTemporary(struct page *page)
{
	page[2].mapping = (void *)-1U;
}

static inline void ClearPageHugeTemporary(struct page *page)
{
	page[2].mapping = NULL;
}

void free_huge_page(struct page *page)
{
	/*
	 * Can't pass hstate in here because it is called from the
	 * compound page destructor.
	 */
	struct hstate *h = page_hstate(page);
	int nid = page_to_nid(page);
	struct hugepage_subpool *spool =
		(struct hugepage_subpool *)page_private(page);
	bool restore_reserve;

	set_page_private(page, 0);
	page->mapping = NULL;
	VM_BUG_ON_PAGE(page_count(page), page);
	VM_BUG_ON_PAGE(page_mapcount(page), page);
	restore_reserve = PagePrivate(page);
	ClearPagePrivate(page);

	/*
	 * A return code of zero implies that the subpool will be under its
	 * minimum size if the reservation is not restored after page is free.
	 * Therefore, force restore_reserve operation.
	 */
	if (hugepage_subpool_put_pages(spool, 1) == 0)
		restore_reserve = true;

	spin_lock(&hugetlb_lock);
	clear_page_huge_active(page);
	hugetlb_cgroup_uncharge_page(hstate_index(h),
				     pages_per_huge_page(h), page);
	if (restore_reserve)
		h->resv_huge_pages++;

	if (PageHugeTemporary(page)) {
		list_del(&page->lru);
		ClearPageHugeTemporary(page);
		update_and_free_page(h, page);
	} else if (h->surplus_huge_pages_node[nid]) {
		/* remove the page from active list */
		list_del(&page->lru);
		update_and_free_page(h, page);
		h->surplus_huge_pages--;
		h->surplus_huge_pages_node[nid]--;
	} else {
		arch_clear_hugepage_flags(page);
		enqueue_huge_page(h, page);
	}
	spin_unlock(&hugetlb_lock);
}

static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)
{
	INIT_LIST_HEAD(&page->lru);
	set_compound_page_dtor(page, HUGETLB_PAGE_DTOR);
	spin_lock(&hugetlb_lock);
	set_hugetlb_cgroup(page, NULL);
	h->nr_huge_pages++;
	h->nr_huge_pages_node[nid]++;
	spin_unlock(&hugetlb_lock);
}

static void prep_compound_gigantic_page(struct page *page, unsigned int order)
{
	int i;
	int nr_pages = 1 << order;
	struct page *p = page + 1;

	/* we rely on prep_new_huge_page to set the destructor */
	set_compound_order(page, order);
	__ClearPageReserved(page);
	__SetPageHead(page);
	for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) {
		/*
		 * For gigantic hugepages allocated through bootmem at
		 * boot, it's safer to be consistent with the not-gigantic
		 * hugepages and clear the PG_reserved bit from all tail pages
		 * too.  Otherwise drivers using get_user_pages() to access tail
		 * pages may get the reference counting wrong if they see
		 * PG_reserved set on a tail page (despite the head page not
		 * having PG_reserved set).  Enforcing this consistency between
		 * head and tail pages allows drivers to optimize away a check
		 * on the head page when they need to know if put_page() is
		 * needed after get_user_pages().
		 */
		__ClearPageReserved(p);
		set_page_count(p, 0);
		set_compound_head(p, page);
	}
	atomic_set(compound_mapcount_ptr(page), -1);
}

/*
 * PageHuge() only returns true for hugetlbfs pages, but not for normal or
 * transparent huge pages.  See the PageTransHuge() documentation for more
 * details.
 */
int PageHuge(struct page *page)
{
	if (!PageCompound(page))
		return 0;

	page = compound_head(page);
	return page[1].compound_dtor == HUGETLB_PAGE_DTOR;
}
EXPORT_SYMBOL_GPL(PageHuge);

/*
 * PageHeadHuge() only returns true for hugetlbfs head page, but not for
 * normal or transparent huge pages.
 */
int PageHeadHuge(struct page *page_head)
{
	if (!PageHead(page_head))
		return 0;

	return get_compound_page_dtor(page_head) == free_huge_page;
}

pgoff_t __basepage_index(struct page *page)
{
	struct page *page_head = compound_head(page);
	pgoff_t index = page_index(page_head);
	unsigned long compound_idx;

	if (!PageHuge(page_head))
		return page_index(page);

	if (compound_order(page_head) >= MAX_ORDER)
		compound_idx = page_to_pfn(page) - page_to_pfn(page_head);
	else
		compound_idx = page - page_head;

	return (index << compound_order(page_head)) + compound_idx;
}

static struct page *alloc_buddy_huge_page(struct hstate *h,
		gfp_t gfp_mask, int nid, nodemask_t *nmask)
{
	int order = huge_page_order(h);
	struct page *page;

	gfp_mask |= __GFP_COMP|__GFP_RETRY_MAYFAIL|__GFP_NOWARN;
	if (nid == NUMA_NO_NODE)
		nid = numa_mem_id();
	page = __alloc_pages_nodemask(gfp_mask, order, nid, nmask);
	if (page)
		__count_vm_event(HTLB_BUDDY_PGALLOC);
	else
		__count_vm_event(HTLB_BUDDY_PGALLOC_FAIL);

	return page;
}

/*
 * Common helper to allocate a fresh hugetlb page. All specific allocators
 * should use this function to get new hugetlb pages
 */
static struct page *alloc_fresh_huge_page(struct hstate *h,
		gfp_t gfp_mask, int nid, nodemask_t *nmask)
{
	struct page *page;

	if (hstate_is_gigantic(h))
		page = alloc_gigantic_page(h, gfp_mask, nid, nmask);
	else
		page = alloc_buddy_huge_page(h, gfp_mask,
				nid, nmask);
	if (!page)
		return NULL;

	if (hstate_is_gigantic(h))
		prep_compound_gigantic_page(page, huge_page_order(h));
	prep_new_huge_page(h, page, page_to_nid(page));

	return page;
}

/*
 * Allocates a fresh page to the hugetlb allocator pool in a node interleaved
 * manner.
 */
static int alloc_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed)
{
	struct page *page;
	int nr_nodes, node;
	gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE;

	for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) {
		page = alloc_fresh_huge_page(h, gfp_mask, node, nodes_allowed);
		if (page)
			break;
	}

	if (!page)
		return 0;

	put_page(page); /* free it into the hugepage allocator */

	return 1;
}

/*
 * Free huge page from pool from next node to free.
 * Attempt to keep persistent huge pages more or less
 * balanced over allowed nodes.
 * Called with hugetlb_lock locked.
 */
static int free_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed,
							 bool acct_surplus)
{
	int nr_nodes, node;
	int ret = 0;

	for_each_node_mask_to_free(h, nr_nodes, node, nodes_allowed) {
		/*
		 * If we're returning unused surplus pages, only examine
		 * nodes with surplus pages.
		 */
		if ((!acct_surplus || h->surplus_huge_pages_node[node]) &&
		    !list_empty(&h->hugepage_freelists[node])) {
			struct page *page =
				list_entry(h->hugepage_freelists[node].next,
					  struct page, lru);
			list_del(&page->lru);
			h->free_huge_pages--;
			h->free_huge_pages_node[node]--;
			if (acct_surplus) {
				h->surplus_huge_pages--;
				h->surplus_huge_pages_node[node]--;
			}
			update_and_free_page(h, page);
			ret = 1;
			break;