/*
 * mm/percpu.c - percpu memory allocator
 *
 * Copyright (C) 2009		SUSE Linux Products GmbH
 * Copyright (C) 2009		Tejun Heo <tj@kernel.org>
 *
 * Copyright (C) 2017		Facebook Inc.
 * Copyright (C) 2017		Dennis Zhou <dennisszhou@gmail.com>
 *
 * This file is released under the GPLv2 license.
 *
 * The percpu allocator handles both static and dynamic areas.  Percpu
 * areas are allocated in chunks which are divided into units.  There is
 * a 1-to-1 mapping for units to possible cpus.  These units are grouped
 * based on NUMA properties of the machine.
 *
 *  c0                           c1                         c2
 *  -------------------          -------------------        ------------
 * | u0 | u1 | u2 | u3 |        | u0 | u1 | u2 | u3 |      | u0 | u1 | u
 *  -------------------  ......  -------------------  ....  ------------
 *
 * Allocation is done by offsets into a unit's address space.  Ie., an
 * area of 512 bytes at 6k in c1 occupies 512 bytes at 6k in c1:u0,
 * c1:u1, c1:u2, etc.  On NUMA machines, the mapping may be non-linear
 * and even sparse.  Access is handled by configuring percpu base
 * registers according to the cpu to unit mappings and offsetting the
 * base address using pcpu_unit_size.
 *
 * There is special consideration for the first chunk which must handle
 * the static percpu variables in the kernel image as allocation services
 * are not online yet.  In short, the first chunk is structured like so:
 *
 *                  <Static | [Reserved] | Dynamic>
 *
 * The static data is copied from the original section managed by the
 * linker.  The reserved section, if non-zero, primarily manages static
 * percpu variables from kernel modules.  Finally, the dynamic section
 * takes care of normal allocations.
 *
 * The allocator organizes chunks into lists according to free size and
 * tries to allocate from the fullest chunk first.  Each chunk is managed
 * by a bitmap with metadata blocks.  The allocation map is updated on
 * every allocation and free to reflect the current state while the boundary
 * map is only updated on allocation.  Each metadata block contains
 * information to help mitigate the need to iterate over large portions
 * of the bitmap.  The reverse mapping from page to chunk is stored in
 * the page's index.  Lastly, units are lazily backed and grow in unison.
 *
 * There is a unique conversion that goes on here between bytes and bits.
 * Each bit represents a fragment of size PCPU_MIN_ALLOC_SIZE.  The chunk
 * tracks the number of pages it is responsible for in nr_pages.  Helper
 * functions are used to convert from between the bytes, bits, and blocks.
 * All hints are managed in bits unless explicitly stated.
 *
 * To use this allocator, arch code should do the following:
 *
 * - define __addr_to_pcpu_ptr() and __pcpu_ptr_to_addr() to translate
 *   regular address to percpu pointer and back if they need to be
 *   different from the default
 *
 * - use pcpu_setup_first_chunk() during percpu area initialization to
 *   setup the first chunk containing the kernel static percpu area
 */
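
/*
 * Illustrative sketch (not part of the allocator's interfaces): the dynamic
 * region described above ultimately backs the public percpu API, which
 * reaches pcpu_alloc() further down in this file.  A minimal consumer,
 * assuming a simple per-cpu counter, looks roughly like:
 *
 *	unsigned long __percpu *cnt = alloc_percpu(unsigned long);
 *	unsigned long sum = 0;
 *	unsigned int cpu;
 *
 *	if (cnt) {
 *		this_cpu_inc(*cnt);
 *		for_each_possible_cpu(cpu)
 *			sum += *per_cpu_ptr(cnt, cpu);
 *		free_percpu(cnt);
 *	}
 */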

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/bitmap.h>
#include <linux/bootmem.h>
#include <linux/err.h>
#include <linux/lcm.h>
#include <linux/list.h>
#include <linux/log2.h>
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/percpu.h>
#include <linux/pfn.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/vmalloc.h>
#include <linux/workqueue.h>
#include <linux/kmemleak.h>
#include <linux/sched.h>

#include <asm/cacheflush.h>
#include <asm/sections.h>
#include <asm/tlbflush.h>
#include <asm/io.h>

#define CREATE_TRACE_POINTS
#include <trace/events/percpu.h>

#include "percpu-internal.h"

/* the slots are sorted by free bytes left, 1-31 bytes share the same slot */
#define PCPU_SLOT_BASE_SHIFT		5

#define PCPU_EMPTY_POP_PAGES_LOW	2
#define PCPU_EMPTY_POP_PAGES_HIGH	4

#ifdef CONFIG_SMP
/* default addr <-> pcpu_ptr mapping, override in asm/percpu.h if necessary */
#ifndef __addr_to_pcpu_ptr
#define __addr_to_pcpu_ptr(addr)					\
	(void __percpu *)((unsigned long)(addr) -			\
			  (unsigned long)pcpu_base_addr	+		\
			  (unsigned long)__per_cpu_start)
#endif
#ifndef __pcpu_ptr_to_addr
#define __pcpu_ptr_to_addr(ptr)						\
	(void __force *)((unsigned long)(ptr) +				\
			 (unsigned long)pcpu_base_addr -		\
			 (unsigned long)__per_cpu_start)
#endif
#else	/* CONFIG_SMP */
/* on UP, it's always identity mapped */
#define __addr_to_pcpu_ptr(addr)	(void __percpu *)(addr)
#define __pcpu_ptr_to_addr(ptr)		(void __force *)(ptr)
#endif	/* CONFIG_SMP */

static int pcpu_unit_pages __ro_after_init;
static int pcpu_unit_size __ro_after_init;
static int pcpu_nr_units __ro_after_init;
static int pcpu_atom_size __ro_after_init;
int pcpu_nr_slots __ro_after_init;
static size_t pcpu_chunk_struct_size __ro_after_init;

/* cpus with the lowest and highest unit addresses */
static unsigned int pcpu_low_unit_cpu __ro_after_init;
static unsigned int pcpu_high_unit_cpu __ro_after_init;

/* the address of the first chunk which starts with the kernel static area */
void *pcpu_base_addr __ro_after_init;
EXPORT_SYMBOL_GPL(pcpu_base_addr);

static const int *pcpu_unit_map __ro_after_init;		/* cpu -> unit */
const unsigned long *pcpu_unit_offsets __ro_after_init;	/* cpu -> unit offset */

/* group information, used for vm allocation */
static int pcpu_nr_groups __ro_after_init;
static const unsigned long *pcpu_group_offsets __ro_after_init;
static const size_t *pcpu_group_sizes __ro_after_init;

/*
 * The first chunk which always exists.  Note that unlike other
 * chunks, this one can be allocated and mapped in several different
 * ways and thus often doesn't live in the vmalloc area.
 */
struct pcpu_chunk *pcpu_first_chunk __ro_after_init;

/*
 * Optional reserved chunk.  This chunk reserves part of the first
 * chunk and serves it for reserved allocations.  When the reserved
 * region doesn't exist, the following variable is NULL.
 */
struct pcpu_chunk *pcpu_reserved_chunk __ro_after_init;

DEFINE_SPINLOCK(pcpu_lock);	/* all internal data structures */
static DEFINE_MUTEX(pcpu_alloc_mutex);	/* chunk create/destroy, [de]pop, map ext */

struct list_head *pcpu_slot __ro_after_init; /* chunk list slots */

/* chunks which need their map areas extended, protected by pcpu_lock */
static LIST_HEAD(pcpu_map_extend_chunks);

/*
 * The number of empty populated pages, protected by pcpu_lock.  The
 * reserved chunk doesn't contribute to the count.
 */
int pcpu_nr_empty_pop_pages;

/*
 * Balance work is used to populate or destroy chunks asynchronously.  We
 * try to keep the number of populated free pages between
 * PCPU_EMPTY_POP_PAGES_LOW and HIGH for atomic allocations and at most one
 * empty chunk.
 */
static void pcpu_balance_workfn(struct work_struct *work);
static DECLARE_WORK(pcpu_balance_work, pcpu_balance_workfn);
static bool pcpu_async_enabled __read_mostly;
static bool pcpu_atomic_alloc_failed;

static void pcpu_schedule_balance_work(void)
{
	if (pcpu_async_enabled)
		schedule_work(&pcpu_balance_work);
}

/**
 * pcpu_addr_in_chunk - check if the address is served from this chunk
 * @chunk: chunk of interest
 * @addr: percpu address
 *
 * RETURNS:
 * True if the address is served from this chunk.
 */
static bool pcpu_addr_in_chunk(struct pcpu_chunk *chunk, void *addr)
{
	void *start_addr, *end_addr;

	if (!chunk)
		return false;

	start_addr = chunk->base_addr + chunk->start_offset;
	end_addr = chunk->base_addr + chunk->nr_pages * PAGE_SIZE -
		   chunk->end_offset;

	return addr >= start_addr && addr < end_addr;
}

static int __pcpu_size_to_slot(int size)
{
	int highbit = fls(size);	/* size is in bytes */
	return max(highbit - PCPU_SLOT_BASE_SHIFT + 2, 1);
}

static int pcpu_size_to_slot(int size)
{
	if (size == pcpu_unit_size)
		return pcpu_nr_slots - 1;
	return __pcpu_size_to_slot(size);
}
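
/*
 * Worked example (illustration only): with PCPU_SLOT_BASE_SHIFT == 5, a
 * chunk with 200 bytes free has fls(200) == 8 and is kept in slot
 * max(8 - 5 + 2, 1) == 5; with 1024 bytes free, fls(1024) == 11 gives
 * slot 8.  A chunk whose free space equals pcpu_unit_size always goes to
 * the last slot, pcpu_nr_slots - 1.
 */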

static int pcpu_chunk_slot(const struct pcpu_chunk *chunk)
{
	if (chunk->free_bytes < PCPU_MIN_ALLOC_SIZE || chunk->contig_bits == 0)
		return 0;

	return pcpu_size_to_slot(chunk->free_bytes);
}

/* set the pointer to a chunk in a page struct */
static void pcpu_set_page_chunk(struct page *page, struct pcpu_chunk *pcpu)
{
	page->index = (unsigned long)pcpu;
}

/* obtain pointer to a chunk from a page struct */
static struct pcpu_chunk *pcpu_get_page_chunk(struct page *page)
{
	return (struct pcpu_chunk *)page->index;
}

static int __maybe_unused pcpu_page_idx(unsigned int cpu, int page_idx)
{
	return pcpu_unit_map[cpu] * pcpu_unit_pages + page_idx;
}

static unsigned long pcpu_unit_page_offset(unsigned int cpu, int page_idx)
{
	return pcpu_unit_offsets[cpu] + (page_idx << PAGE_SHIFT);
}

static unsigned long pcpu_chunk_addr(struct pcpu_chunk *chunk,
				     unsigned int cpu, int page_idx)
{
	return (unsigned long)chunk->base_addr +
	       pcpu_unit_page_offset(cpu, page_idx);
}

static void pcpu_next_unpop(unsigned long *bitmap, int *rs, int *re, int end)
{
	*rs = find_next_zero_bit(bitmap, end, *rs);
	*re = find_next_bit(bitmap, end, *rs + 1);
}

static void pcpu_next_pop(unsigned long *bitmap, int *rs, int *re, int end)
{
	*rs = find_next_bit(bitmap, end, *rs);
	*re = find_next_zero_bit(bitmap, end, *rs + 1);
}

/*
 * Bitmap region iterators.  Iterates over the bitmap between
 * [@start, @end) in @chunk.  @rs and @re should be integer variables
 * and will be set to start and end index of the current free region.
 */
#define pcpu_for_each_unpop_region(bitmap, rs, re, start, end)		     \
	for ((rs) = (start), pcpu_next_unpop((bitmap), &(rs), &(re), (end)); \
	     (rs) < (re);						     \
	     (rs) = (re) + 1, pcpu_next_unpop((bitmap), &(rs), &(re), (end)))

#define pcpu_for_each_pop_region(bitmap, rs, re, start, end)		     \
	for ((rs) = (start), pcpu_next_pop((bitmap), &(rs), &(re), (end));   \
	     (rs) < (re);						     \
	     (rs) = (re) + 1, pcpu_next_pop((bitmap), &(rs), &(re), (end)))

/*
 * The following are helper functions to help access bitmaps and convert
 * between bitmap offsets to address offsets.
 */
static unsigned long *pcpu_index_alloc_map(struct pcpu_chunk *chunk, int index)
{
	return chunk->alloc_map +
	       (index * PCPU_BITMAP_BLOCK_BITS / BITS_PER_LONG);
}

static unsigned long pcpu_off_to_block_index(int off)
{
	return off / PCPU_BITMAP_BLOCK_BITS;
}

static unsigned long pcpu_off_to_block_off(int off)
{
	return off & (PCPU_BITMAP_BLOCK_BITS - 1);
}

static unsigned long pcpu_block_off_to_off(int index, int off)
{
	return index * PCPU_BITMAP_BLOCK_BITS + off;
}
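
/*
 * Worked example (illustration only), assuming PCPU_MIN_ALLOC_SIZE == 4
 * bytes and 4K pages so that PCPU_BITMAP_BLOCK_BITS == 1024: a chunk
 * offset of 6144 bytes is bit offset 1536, pcpu_off_to_block_index(1536)
 * selects block 1, pcpu_off_to_block_off(1536) yields 512 within that
 * block, and pcpu_block_off_to_off(1, 512) maps back to bit 1536.
 */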

/**
 * pcpu_next_md_free_region - finds the next hint free area
 * @chunk: chunk of interest
 * @bit_off: chunk offset
 * @bits: size of free area
 *
 * Helper function for pcpu_for_each_md_free_region.  It checks
 * block->contig_hint and performs aggregation across blocks to find the
 * next hint.  It modifies bit_off and bits in-place to be consumed in the
 * loop.
 */
static void pcpu_next_md_free_region(struct pcpu_chunk *chunk, int *bit_off,
				     int *bits)
{
	int i = pcpu_off_to_block_index(*bit_off);
	int block_off = pcpu_off_to_block_off(*bit_off);
	struct pcpu_block_md *block;

	*bits = 0;
	for (block = chunk->md_blocks + i; i < pcpu_chunk_nr_blocks(chunk);
	     block++, i++) {
		/* handles contig area across blocks */
		if (*bits) {
			*bits += block->left_free;
			if (block->left_free == PCPU_BITMAP_BLOCK_BITS)
				continue;
			return;
		}

		/*
		 * This checks three things.  First is there a contig_hint to
		 * check.  Second, have we checked this hint before by
		 * comparing the block_off.  Third, is this the same as the
		 * right contig hint.  In the last case, it spills over into
		 * the next block and should be handled by the contig area
		 * across blocks code.
		 */
		*bits = block->contig_hint;
		if (*bits && block->contig_hint_start >= block_off &&
		    *bits + block->contig_hint_start < PCPU_BITMAP_BLOCK_BITS) {
			*bit_off = pcpu_block_off_to_off(i,
					block->contig_hint_start);
			return;
		}
		/* reset to satisfy the second predicate above */
		block_off = 0;

		*bits = block->right_free;
		*bit_off = (i + 1) * PCPU_BITMAP_BLOCK_BITS - block->right_free;
	}
}

/**
 * pcpu_next_fit_region - finds fit areas for a given allocation request
 * @chunk: chunk of interest
 * @alloc_bits: size of allocation
 * @align: alignment of area (max PAGE_SIZE)
 * @bit_off: chunk offset
 * @bits: size of free area
 *
 * Finds the next free region that is viable for use with a given size and
 * alignment.  This only returns if there is a valid area to be used for this
 * allocation.  block->first_free is returned if the allocation request fits
 * within the block to see if the request can be fulfilled prior to the contig
 * hint.
 */
static void pcpu_next_fit_region(struct pcpu_chunk *chunk, int alloc_bits,
				 int align, int *bit_off, int *bits)
{
	int i = pcpu_off_to_block_index(*bit_off);
	int block_off = pcpu_off_to_block_off(*bit_off);
	struct pcpu_block_md *block;

	*bits = 0;
	for (block = chunk->md_blocks + i; i < pcpu_chunk_nr_blocks(chunk);
	     block++, i++) {
		/* handles contig area across blocks */
		if (*bits) {
			*bits += block->left_free;
			if (*bits >= alloc_bits)
				return;
			if (block->left_free == PCPU_BITMAP_BLOCK_BITS)
				continue;
		}

		/* check block->contig_hint */
		*bits = ALIGN(block->contig_hint_start, align) -
			block->contig_hint_start;
		/*
		 * This uses the block offset to determine if this has been
		 * checked in the prior iteration.
		 */
		if (block->contig_hint &&
		    block->contig_hint_start >= block_off &&
		    block->contig_hint >= *bits + alloc_bits) {
			*bits += alloc_bits + block->contig_hint_start -
				 block->first_free;
			*bit_off = pcpu_block_off_to_off(i, block->first_free);
			return;
		}
		/* reset to satisfy the second predicate above */
		block_off = 0;

		*bit_off = ALIGN(PCPU_BITMAP_BLOCK_BITS - block->right_free,
				 align);
		*bits = PCPU_BITMAP_BLOCK_BITS - *bit_off;
		*bit_off = pcpu_block_off_to_off(i, *bit_off);
		if (*bits >= alloc_bits)
			return;
	}

	/* no valid offsets were found - fail condition */
	*bit_off = pcpu_chunk_map_bits(chunk);
}

/*
 * Metadata free area iterators.  These perform aggregation of free areas
 * based on the metadata blocks and return the offset @bit_off and size in
 * bits of the free area @bits.  pcpu_for_each_fit_region only returns when
 * a fit is found for the allocation request.
 */
#define pcpu_for_each_md_free_region(chunk, bit_off, bits)		\
	for (pcpu_next_md_free_region((chunk), &(bit_off), &(bits));	\
	     (bit_off) < pcpu_chunk_map_bits((chunk));			\
	     (bit_off) += (bits) + 1,					\
	     pcpu_next_md_free_region((chunk), &(bit_off), &(bits)))

#define pcpu_for_each_fit_region(chunk, alloc_bits, align, bit_off, bits)     \
	for (pcpu_next_fit_region((chunk), (alloc_bits), (align), &(bit_off), \
				  &(bits));				      \
	     (bit_off) < pcpu_chunk_map_bits((chunk));			      \
	     (bit_off) += (bits),					      \
	     pcpu_next_fit_region((chunk), (alloc_bits), (align), &(bit_off), \
				  &(bits)))

/**
 * pcpu_mem_zalloc - allocate memory
 * @size: bytes to allocate
 * @gfp: allocation flags
 *
 * Allocate @size bytes.  If @size is smaller than PAGE_SIZE,
 * kzalloc() is used; otherwise, the equivalent of vzalloc() is used.
 * This is to facilitate passing through whitelisted flags.  The
 * returned memory is always zeroed.
 *
 * RETURNS:
 * Pointer to the allocated area on success, NULL on failure.
 */
static void *pcpu_mem_zalloc(size_t size, gfp_t gfp)
{
	if (WARN_ON_ONCE(!slab_is_available()))
		return NULL;

	if (size <= PAGE_SIZE)
		return kzalloc(size, gfp);
	else
		return __vmalloc(size, gfp | __GFP_ZERO, PAGE_KERNEL);
}

/**
 * pcpu_mem_free - free memory
 * @ptr: memory to free
 *
 * Free @ptr.  @ptr should have been allocated using pcpu_mem_zalloc().
 */
static void pcpu_mem_free(void *ptr)
{
	kvfree(ptr);
}

/**
 * pcpu_chunk_relocate - put chunk in the appropriate chunk slot
 * @chunk: chunk of interest
 * @oslot: the previous slot it was on
 *
 * This function is called after an allocation or free changed @chunk.
 * New slot according to the changed state is determined and @chunk is
 * moved to the slot.  Note that the reserved chunk is never put on
 * chunk slots.
 *
 * CONTEXT:
 * pcpu_lock.
 */
static void pcpu_chunk_relocate(struct pcpu_chunk *chunk, int oslot)
{
	int nslot = pcpu_chunk_slot(chunk);

	if (chunk != pcpu_reserved_chunk && oslot != nslot) {
		if (oslot < nslot)
			list_move(&chunk->list, &pcpu_slot[nslot]);
		else
			list_move_tail(&chunk->list, &pcpu_slot[nslot]);
	}
}

/**
 * pcpu_cnt_pop_pages - counts populated backing pages in range
 * @chunk: chunk of interest
 * @bit_off: start offset
 * @bits: size of area to check
 *
 * Calculates the number of populated pages in the region
 * [page_start, page_end).  This keeps track of how many empty populated
 * pages are available and decides if async work should be scheduled.
 *
 * RETURNS:
 * The nr of populated pages.
 */
static inline int pcpu_cnt_pop_pages(struct pcpu_chunk *chunk, int bit_off,
				     int bits)
{
	int page_start = PFN_UP(bit_off * PCPU_MIN_ALLOC_SIZE);
	int page_end = PFN_DOWN((bit_off + bits) * PCPU_MIN_ALLOC_SIZE);

	if (page_start >= page_end)
		return 0;

	/*
	 * bitmap_weight counts the number of bits set in a bitmap up to
	 * the specified number of bits.  This is counting the populated
	 * pages up to page_end and then subtracting the populated pages
	 * up to page_start to count the populated pages in
	 * [page_start, page_end).
	 */
	return bitmap_weight(chunk->populated, page_end) -
	       bitmap_weight(chunk->populated, page_start);
}

/**
 * pcpu_chunk_update - updates the chunk metadata given a free area
 * @chunk: chunk of interest
 * @bit_off: chunk offset
 * @bits: size of free area
 *
 * This updates the chunk's contig hint and starting offset given a free area.
 * Choose the best starting offset if the contig hint is equal.
 */
static void pcpu_chunk_update(struct pcpu_chunk *chunk, int bit_off, int bits)
{
	if (bits > chunk->contig_bits) {
		chunk->contig_bits_start = bit_off;
		chunk->contig_bits = bits;
	} else if (bits == chunk->contig_bits && chunk->contig_bits_start &&
		   (!bit_off ||
		    __ffs(bit_off) > __ffs(chunk->contig_bits_start))) {
		/* use the start with the best alignment */
		chunk->contig_bits_start = bit_off;
	}
}

/**
 * pcpu_chunk_refresh_hint - updates metadata about a chunk
 * @chunk: chunk of interest
566
 *
567 568 569
 * Iterates over the metadata blocks to find the largest contig area.
 * It also counts the populated pages and uses the delta to update the
 * global count.
570
 *
571 572
 * Updates:
 *      chunk->contig_bits
573
 *      chunk->contig_bits_start
574
 *      nr_empty_pop_pages (chunk and global)
575
 */
576
static void pcpu_chunk_refresh_hint(struct pcpu_chunk *chunk)
577
{
578
	int bit_off, bits, nr_empty_pop_pages;
579

580 581
	/* clear metadata */
	chunk->contig_bits = 0;
582

583
	bit_off = chunk->first_bit;
584
	bits = nr_empty_pop_pages = 0;
585 586
	pcpu_for_each_md_free_region(chunk, bit_off, bits) {
		pcpu_chunk_update(chunk, bit_off, bits);
587

588
		nr_empty_pop_pages += pcpu_cnt_pop_pages(chunk, bit_off, bits);
589
	}
590

591 592 593 594 595 596 597 598 599 600 601
	/*
	 * Keep track of nr_empty_pop_pages.
	 *
	 * The chunk maintains the previous number of free pages it held,
	 * so the delta is used to update the global counter.  The reserved
	 * chunk is not part of the free page count as they are populated
	 * at init and are special to serving reserved allocations.
	 */
	if (chunk != pcpu_reserved_chunk)
		pcpu_nr_empty_pop_pages +=
			(nr_empty_pop_pages - chunk->nr_empty_pop_pages);

	chunk->nr_empty_pop_pages = nr_empty_pop_pages;
}

/**
 * pcpu_block_update - updates a block given a free area
 * @block: block of interest
 * @start: start offset in block
 * @end: end offset in block
 *
 * Updates a block given a known free area.  The region [start, end) is
 * expected to be the entirety of the free area within a block.  Chooses
 * the best starting offset if the contig hints are equal.
 */
static void pcpu_block_update(struct pcpu_block_md *block, int start, int end)
{
	int contig = end - start;

	block->first_free = min(block->first_free, start);
	if (start == 0)
		block->left_free = contig;

	if (end == PCPU_BITMAP_BLOCK_BITS)
		block->right_free = contig;

	if (contig > block->contig_hint) {
		block->contig_hint_start = start;
		block->contig_hint = contig;
	} else if (block->contig_hint_start && contig == block->contig_hint &&
		   (!start || __ffs(start) > __ffs(block->contig_hint_start))) {
		/* use the start with the best alignment */
		block->contig_hint_start = start;
	}
}

/**
 * pcpu_block_refresh_hint
 * @chunk: chunk of interest
 * @index: index of the metadata block
 *
 * Scans over the block beginning at first_free and updates the block
 * metadata accordingly.
 */
static void pcpu_block_refresh_hint(struct pcpu_chunk *chunk, int index)
{
	struct pcpu_block_md *block = chunk->md_blocks + index;
	unsigned long *alloc_map = pcpu_index_alloc_map(chunk, index);
	int rs, re;	/* region start, region end */

	/* clear hints */
	block->contig_hint = 0;
	block->left_free = block->right_free = 0;

	/* iterate over free areas and update the contig hints */
	pcpu_for_each_unpop_region(alloc_map, rs, re, block->first_free,
				   PCPU_BITMAP_BLOCK_BITS) {
		pcpu_block_update(block, rs, re);
	}
}

/**
 * pcpu_block_update_hint_alloc - update hint on allocation path
 * @chunk: chunk of interest
 * @bit_off: chunk offset
 * @bits: size of request
 *
 * Updates metadata for the allocation path.  The metadata only has to be
 * refreshed by a full scan iff the chunk's contig hint is broken.  Block level
 * scans are required if the block's contig hint is broken.
 */
static void pcpu_block_update_hint_alloc(struct pcpu_chunk *chunk, int bit_off,
					 int bits)
{
	struct pcpu_block_md *s_block, *e_block, *block;
	int s_index, e_index;	/* block indexes of the freed allocation */
	int s_off, e_off;	/* block offsets of the freed allocation */

	/*
	 * Calculate per block offsets.
	 * The calculation uses an inclusive range, but the resulting offsets
	 * are [start, end).  e_index always points to the last block in the
	 * range.
	 */
	s_index = pcpu_off_to_block_index(bit_off);
	e_index = pcpu_off_to_block_index(bit_off + bits - 1);
	s_off = pcpu_off_to_block_off(bit_off);
	e_off = pcpu_off_to_block_off(bit_off + bits - 1) + 1;

	s_block = chunk->md_blocks + s_index;
	e_block = chunk->md_blocks + e_index;

	/*
	 * Update s_block.
	 * block->first_free must be updated if the allocation takes its place.
	 * If the allocation breaks the contig_hint, a scan is required to
	 * restore this hint.
	 */
	if (s_off == s_block->first_free)
		s_block->first_free = find_next_zero_bit(
					pcpu_index_alloc_map(chunk, s_index),
					PCPU_BITMAP_BLOCK_BITS,
					s_off + bits);

	if (s_off >= s_block->contig_hint_start &&
	    s_off < s_block->contig_hint_start + s_block->contig_hint) {
		/* block contig hint is broken - scan to fix it */
		pcpu_block_refresh_hint(chunk, s_index);
	} else {
		/* update left and right contig manually */
		s_block->left_free = min(s_block->left_free, s_off);
		if (s_index == e_index)
			s_block->right_free = min_t(int, s_block->right_free,
					PCPU_BITMAP_BLOCK_BITS - e_off);
		else
			s_block->right_free = 0;
	}

	/*
	 * Update e_block.
	 */
	if (s_index != e_index) {
		/*
		 * When the allocation is across blocks, the end is along
		 * the left part of the e_block.
		 */
		e_block->first_free = find_next_zero_bit(
				pcpu_index_alloc_map(chunk, e_index),
				PCPU_BITMAP_BLOCK_BITS, e_off);

		if (e_off == PCPU_BITMAP_BLOCK_BITS) {
			/* reset the block */
			e_block++;
		} else {
			if (e_off > e_block->contig_hint_start) {
				/* contig hint is broken - scan to fix it */
				pcpu_block_refresh_hint(chunk, e_index);
			} else {
				e_block->left_free = 0;
				e_block->right_free =
					min_t(int, e_block->right_free,
					      PCPU_BITMAP_BLOCK_BITS - e_off);
			}
		}

		/* update in-between md_blocks */
		for (block = s_block + 1; block < e_block; block++) {
			block->contig_hint = 0;
			block->left_free = 0;
			block->right_free = 0;
		}
	}

	/*
	 * The only time a full chunk scan is required is if the chunk
	 * contig hint is broken.  Otherwise, it means a smaller space
	 * was used and therefore the chunk contig hint is still correct.
	 */
	if (bit_off >= chunk->contig_bits_start  &&
	    bit_off < chunk->contig_bits_start + chunk->contig_bits)
		pcpu_chunk_refresh_hint(chunk);
}

/**
 * pcpu_block_update_hint_free - updates the block hints on the free path
 * @chunk: chunk of interest
 * @bit_off: chunk offset
 * @bits: size of request
 *
 * Updates metadata for the free path.  This avoids a blind block
 * refresh by making use of the block contig hints.  If this fails, it scans
 * forward and backward to determine the extent of the free area.  This is
 * capped at the boundary of blocks.
 *
 * A chunk update is triggered if a page becomes free, a block becomes free,
 * or the free spans across blocks.  This tradeoff is to minimize iterating
 * over the block metadata to update chunk->contig_bits.  chunk->contig_bits
 * may be off by up to a page, but it will never be more than the available
 * space.  If the contig hint is contained in one block, it will be accurate.
 */
static void pcpu_block_update_hint_free(struct pcpu_chunk *chunk, int bit_off,
					int bits)
{
	struct pcpu_block_md *s_block, *e_block, *block;
	int s_index, e_index;	/* block indexes of the freed allocation */
	int s_off, e_off;	/* block offsets of the freed allocation */
	int start, end;		/* start and end of the whole free area */

	/*
	 * Calculate per block offsets.
	 * The calculation uses an inclusive range, but the resulting offsets
	 * are [start, end).  e_index always points to the last block in the
	 * range.
	 */
	s_index = pcpu_off_to_block_index(bit_off);
	e_index = pcpu_off_to_block_index(bit_off + bits - 1);
	s_off = pcpu_off_to_block_off(bit_off);
	e_off = pcpu_off_to_block_off(bit_off + bits - 1) + 1;

	s_block = chunk->md_blocks + s_index;
	e_block = chunk->md_blocks + e_index;

	/*
	 * Check if the freed area aligns with the block->contig_hint.
	 * If it does, then the scan to find the beginning/end of the
	 * larger free area can be avoided.
	 *
	 * start and end refer to beginning and end of the free area
	 * within each their respective blocks.  This is not necessarily
	 * the entire free area as it may span blocks past the beginning
	 * or end of the block.
	 */
	start = s_off;
	if (s_off == s_block->contig_hint + s_block->contig_hint_start) {
		start = s_block->contig_hint_start;
	} else {
		/*
		 * Scan backwards to find the extent of the free area.
		 * find_last_bit returns the starting bit, so if the start bit
		 * is returned, that means there was no last bit and the
		 * remainder of the chunk is free.
		 */
		int l_bit = find_last_bit(pcpu_index_alloc_map(chunk, s_index),
					  start);
		start = (start == l_bit) ? 0 : l_bit + 1;
	}

	end = e_off;
	if (e_off == e_block->contig_hint_start)
		end = e_block->contig_hint_start + e_block->contig_hint;
	else
		end = find_next_bit(pcpu_index_alloc_map(chunk, e_index),
				    PCPU_BITMAP_BLOCK_BITS, end);

	/* update s_block */
	e_off = (s_index == e_index) ? end : PCPU_BITMAP_BLOCK_BITS;
	pcpu_block_update(s_block, start, e_off);

	/* the free spans across blocks */
	if (s_index != e_index) {
		/* update e_block */
		pcpu_block_update(e_block, 0, end);

		/* reset md_blocks in the middle */
		for (block = s_block + 1; block < e_block; block++) {
			block->first_free = 0;
			block->contig_hint_start = 0;
			block->contig_hint = PCPU_BITMAP_BLOCK_BITS;
			block->left_free = PCPU_BITMAP_BLOCK_BITS;
			block->right_free = PCPU_BITMAP_BLOCK_BITS;
		}
	}

	/*
	 * Refresh chunk metadata when the free makes a page free, a block
	 * free, or spans across blocks.  The contig hint may be off by up to
	 * a page, but if the hint is contained in a block, it will be accurate
	 * with the else condition below.
	 */
	if ((ALIGN_DOWN(end, min(PCPU_BITS_PER_PAGE, PCPU_BITMAP_BLOCK_BITS)) >
	     ALIGN(start, min(PCPU_BITS_PER_PAGE, PCPU_BITMAP_BLOCK_BITS))) ||
	    s_index != e_index)
		pcpu_chunk_refresh_hint(chunk);
	else
		pcpu_chunk_update(chunk, pcpu_block_off_to_off(s_index, start),
				  s_block->contig_hint);
}

/**
 * pcpu_is_populated - determines if the region is populated
 * @chunk: chunk of interest
 * @bit_off: chunk offset
 * @bits: size of area
 * @next_off: return value for the next offset to start searching
 *
 * For atomic allocations, check if the backing pages are populated.
 *
 * RETURNS:
 * Bool if the backing pages are populated.
 * next_index is to skip over unpopulated blocks in pcpu_find_block_fit.
 */
static bool pcpu_is_populated(struct pcpu_chunk *chunk, int bit_off, int bits,
			      int *next_off)
{
	int page_start, page_end, rs, re;

	page_start = PFN_DOWN(bit_off * PCPU_MIN_ALLOC_SIZE);
	page_end = PFN_UP((bit_off + bits) * PCPU_MIN_ALLOC_SIZE);

	rs = page_start;
	pcpu_next_unpop(chunk->populated, &rs, &re, page_end);
	if (rs >= page_end)
		return true;

	*next_off = re * PAGE_SIZE / PCPU_MIN_ALLOC_SIZE;
	return false;
}

/**
 * pcpu_find_block_fit - finds the block index to start searching
 * @chunk: chunk of interest
 * @alloc_bits: size of request in allocation units
 * @align: alignment of area (max PAGE_SIZE bytes)
 * @pop_only: use populated regions only
 *
 * Given a chunk and an allocation spec, find the offset to begin searching
 * for a free region.  This iterates over the bitmap metadata blocks to
 * find an offset that will be guaranteed to fit the requirements.  It is
 * not quite first fit as if the allocation does not fit in the contig hint
 * of a block or chunk, it is skipped.  This errs on the side of caution
 * to prevent excess iteration.  Poor alignment can cause the allocator to
 * skip over blocks and chunks that have valid free areas.
 *
 * RETURNS:
 * The offset in the bitmap to begin searching.
 * -1 if no offset is found.
 */
static int pcpu_find_block_fit(struct pcpu_chunk *chunk, int alloc_bits,
			       size_t align, bool pop_only)
{
	int bit_off, bits, next_off;

	/*
	 * Check to see if the allocation can fit in the chunk's contig hint.
	 * This is an optimization to prevent scanning by assuming if it
	 * cannot fit in the global hint, there is memory pressure and creating
	 * a new chunk would happen soon.
	 */
	bit_off = ALIGN(chunk->contig_bits_start, align) -
		  chunk->contig_bits_start;
	if (bit_off + alloc_bits > chunk->contig_bits)
		return -1;

	bit_off = chunk->first_bit;
	bits = 0;
	pcpu_for_each_fit_region(chunk, alloc_bits, align, bit_off, bits) {
		if (!pop_only || pcpu_is_populated(chunk, bit_off, bits,
						   &next_off))
			break;

		bit_off = next_off;
		bits = 0;
	}

	if (bit_off == pcpu_chunk_map_bits(chunk))
		return -1;

	return bit_off;
}

/**
 * pcpu_alloc_area - allocates an area from a pcpu_chunk
 * @chunk: chunk of interest
 * @alloc_bits: size of request in allocation units
 * @align: alignment of area (max PAGE_SIZE)
 * @start: bit_off to start searching
 *
 * This function takes in a @start offset to begin searching to fit an
 * allocation of @alloc_bits with alignment @align.  It needs to scan
 * the allocation map because if it fits within the block's contig hint,
 * @start will be block->first_free. This is an attempt to fill the
 * allocation prior to breaking the contig hint.  The allocation and
 * boundary maps are updated accordingly if it confirms a valid
 * free area.
 *
 * RETURNS:
 * Allocated addr offset in @chunk on success.
 * -1 if no matching area is found.
 */
static int pcpu_alloc_area(struct pcpu_chunk *chunk, int alloc_bits,
			   size_t align, int start)
{
	size_t align_mask = (align) ? (align - 1) : 0;
	int bit_off, end, oslot;

	lockdep_assert_held(&pcpu_lock);

	oslot = pcpu_chunk_slot(chunk);

	/*
	 * Search to find a fit.
	 */
	end = start + alloc_bits + PCPU_BITMAP_BLOCK_BITS;
	bit_off = bitmap_find_next_zero_area(chunk->alloc_map, end, start,
					     alloc_bits, align_mask);
	if (bit_off >= end)
		return -1;

	/* update alloc map */
	bitmap_set(chunk->alloc_map, bit_off, alloc_bits);

	/* update boundary map */
	set_bit(bit_off, chunk->bound_map);
	bitmap_clear(chunk->bound_map, bit_off + 1, alloc_bits - 1);
	set_bit(bit_off + alloc_bits, chunk->bound_map);

	chunk->free_bytes -= alloc_bits * PCPU_MIN_ALLOC_SIZE;

	/* update first free bit */
	if (bit_off == chunk->first_bit)
		chunk->first_bit = find_next_zero_bit(
					chunk->alloc_map,
					pcpu_chunk_map_bits(chunk),
					bit_off + alloc_bits);

	pcpu_block_update_hint_alloc(chunk, bit_off, alloc_bits);

	pcpu_chunk_relocate(chunk, oslot);

	return bit_off * PCPU_MIN_ALLOC_SIZE;
}

/**
 * pcpu_free_area - frees the corresponding offset
 * @chunk: chunk of interest
 * @off: addr offset into chunk
 *
 * This function determines the size of an allocation to free using
 * the boundary bitmap and clears the allocation map.
 */
static void pcpu_free_area(struct pcpu_chunk *chunk, int off)
{
	int bit_off, bits, end, oslot;

	lockdep_assert_held(&pcpu_lock);
	pcpu_stats_area_dealloc(chunk);

	oslot = pcpu_chunk_slot(chunk);

	bit_off = off / PCPU_MIN_ALLOC_SIZE;

	/* find end index */
	end = find_next_bit(chunk->bound_map, pcpu_chunk_map_bits(chunk),
			    bit_off + 1);
	bits = end - bit_off;
	bitmap_clear(chunk->alloc_map, bit_off, bits);

	/* update metadata */
	chunk->free_bytes += bits * PCPU_MIN_ALLOC_SIZE;

	/* update first free bit */
	chunk->first_bit = min(chunk->first_bit, bit_off);

	pcpu_block_update_hint_free(chunk, bit_off, bits);

	pcpu_chunk_relocate(chunk, oslot);
}

static void pcpu_init_md_blocks(struct pcpu_chunk *chunk)
{
	struct pcpu_block_md *md_block;

	for (md_block = chunk->md_blocks;
	     md_block != chunk->md_blocks + pcpu_chunk_nr_blocks(chunk);
	     md_block++) {
		md_block->contig_hint = PCPU_BITMAP_BLOCK_BITS;
		md_block->left_free = PCPU_BITMAP_BLOCK_BITS;
		md_block->right_free = PCPU_BITMAP_BLOCK_BITS;
	}
}

/**
 * pcpu_alloc_first_chunk - creates chunks that serve the first chunk
 * @tmp_addr: the start of the region served
 * @map_size: size of the region served
 *
 * This is responsible for creating the chunks that serve the first chunk.  The
 * base_addr is page aligned down of @tmp_addr while the region end is page
 * aligned up.  Offsets are kept track of to determine the region served. All
 * this is done to appease the bitmap allocator in avoiding partial blocks.
 *
 * RETURNS:
 * Chunk serving the region at @tmp_addr of @map_size.
 */
static struct pcpu_chunk * __init pcpu_alloc_first_chunk(unsigned long tmp_addr,
							 int map_size)
{
	struct pcpu_chunk *chunk;
	unsigned long aligned_addr, lcm_align;
	int start_offset, offset_bits, region_size, region_bits;

	/* region calculations */
	aligned_addr = tmp_addr & PAGE_MASK;

	start_offset = tmp_addr - aligned_addr;

	/*
	 * Align the end of the region with the LCM of PAGE_SIZE and
	 * PCPU_BITMAP_BLOCK_SIZE.  One of these constants is a multiple of
	 * the other.
	 */
	lcm_align = lcm(PAGE_SIZE, PCPU_BITMAP_BLOCK_SIZE);
	region_size = ALIGN(start_offset + map_size, lcm_align);

	/* allocate chunk */
	chunk = memblock_virt_alloc(sizeof(struct pcpu_chunk) +
				    BITS_TO_LONGS(region_size >> PAGE_SHIFT),
				    0);

	INIT_LIST_HEAD(&chunk->list);

	chunk->base_addr = (void *)aligned_addr;
	chunk->start_offset = start_offset;
	chunk->end_offset = region_size - chunk->start_offset - map_size;

	chunk->nr_pages = region_size >> PAGE_SHIFT;
	region_bits = pcpu_chunk_map_bits(chunk);

	chunk->alloc_map = memblock_virt_alloc(BITS_TO_LONGS(region_bits) *
					       sizeof(chunk->alloc_map[0]), 0);
	chunk->bound_map = memblock_virt_alloc(BITS_TO_LONGS(region_bits + 1) *
					       sizeof(chunk->bound_map[0]), 0);
	chunk->md_blocks = memblock_virt_alloc(pcpu_chunk_nr_blocks(chunk) *
					       sizeof(chunk->md_blocks[0]), 0);
	pcpu_init_md_blocks(chunk);

	/* manage populated page bitmap */
	chunk->immutable = true;
	bitmap_fill(chunk->populated, chunk->nr_pages);
	chunk->nr_populated = chunk->nr_pages;
	chunk->nr_empty_pop_pages =
		pcpu_cnt_pop_pages(chunk, start_offset / PCPU_MIN_ALLOC_SIZE,
				   map_size / PCPU_MIN_ALLOC_SIZE);

	chunk->contig_bits = map_size / PCPU_MIN_ALLOC_SIZE;
	chunk->free_bytes = map_size;

	if (chunk->start_offset) {
		/* hide the beginning of the bitmap */
		offset_bits = chunk->start_offset / PCPU_MIN_ALLOC_SIZE;
		bitmap_set(chunk->alloc_map, 0, offset_bits);
		set_bit(0, chunk->bound_map);
		set_bit(offset_bits, chunk->bound_map);

		chunk->first_bit = offset_bits;

		pcpu_block_update_hint_alloc(chunk, 0, offset_bits);
	}

	if (chunk->end_offset) {
		/* hide the end of the bitmap */
		offset_bits = chunk->end_offset / PCPU_MIN_ALLOC_SIZE;
		bitmap_set(chunk->alloc_map,
			   pcpu_chunk_map_bits(chunk) - offset_bits,
			   offset_bits);
		set_bit((start_offset + map_size) / PCPU_MIN_ALLOC_SIZE,
			chunk->bound_map);
		set_bit(region_bits, chunk->bound_map);

		pcpu_block_update_hint_alloc(chunk, pcpu_chunk_map_bits(chunk)
					     - offset_bits, offset_bits);
	}

	return chunk;
}

static struct pcpu_chunk *pcpu_alloc_chunk(gfp_t gfp)
{
	struct pcpu_chunk *chunk;
	int region_bits;

	chunk = pcpu_mem_zalloc(pcpu_chunk_struct_size, gfp);
	if (!chunk)
		return NULL;

	INIT_LIST_HEAD(&chunk->list);
	chunk->nr_pages = pcpu_unit_pages;
	region_bits = pcpu_chunk_map_bits(chunk);

	chunk->alloc_map = pcpu_mem_zalloc(BITS_TO_LONGS(region_bits) *
					   sizeof(chunk->alloc_map[0]), gfp);
	if (!chunk->alloc_map)
		goto alloc_map_fail;

	chunk->bound_map = pcpu_mem_zalloc(BITS_TO_LONGS(region_bits + 1) *
					   sizeof(chunk->bound_map[0]), gfp);
	if (!chunk->bound_map)
		goto bound_map_fail;

	chunk->md_blocks = pcpu_mem_zalloc(pcpu_chunk_nr_blocks(chunk) *
					   sizeof(chunk->md_blocks[0]), gfp);
	if (!chunk->md_blocks)
		goto md_blocks_fail;

	pcpu_init_md_blocks(chunk);

	/* init metadata */
	chunk->contig_bits = region_bits;
	chunk->free_bytes = chunk->nr_pages * PAGE_SIZE;

	return chunk;

md_blocks_fail:
	pcpu_mem_free(chunk->bound_map);
bound_map_fail:
	pcpu_mem_free(chunk->alloc_map);
alloc_map_fail:
	pcpu_mem_free(chunk);

	return NULL;
}

static void pcpu_free_chunk(struct pcpu_chunk *chunk)
{
	if (!chunk)
		return;
	pcpu_mem_free(chunk->bound_map);
	pcpu_mem_free(chunk->alloc_map);
	pcpu_mem_free(chunk);
}

/**
 * pcpu_chunk_populated - post-population bookkeeping
 * @chunk: pcpu_chunk which got populated
 * @page_start: the start page
 * @page_end: the end page
 * @for_alloc: if this is to populate for allocation
 *
 * Pages in [@page_start,@page_end) have been populated to @chunk.  Update
 * the bookkeeping information accordingly.  Must be called after each
 * successful population.
 *
 * If this is @for_alloc, do not increment pcpu_nr_empty_pop_pages because it
 * is to serve an allocation in that area.
 */
static void pcpu_chunk_populated(struct pcpu_chunk *chunk, int page_start,
				 int page_end, bool for_alloc)
{
	int nr = page_end - page_start;

	lockdep_assert_held(&pcpu_lock);

	bitmap_set(chunk->populated, page_start, nr);
	chunk->nr_populated += nr;

	if (!for_alloc) {
		chunk->nr_empty_pop_pages += nr;
		pcpu_nr_empty_pop_pages += nr;
	}
}

/**
 * pcpu_chunk_depopulated - post-depopulation bookkeeping
 * @chunk: pcpu_chunk which got depopulated
 * @page_start: the start page
 * @page_end: the end page
 *
 * Pages in [@page_start,@page_end) have been depopulated from @chunk.
 * Update the bookkeeping information accordingly.  Must be called after
 * each successful depopulation.
 */
static void pcpu_chunk_depopulated(struct pcpu_chunk *chunk,
				   int page_start, int page_end)
{
	int nr = page_end - page_start;

	lockdep_assert_held(&pcpu_lock);

	bitmap_clear(chunk->populated, page_start, nr);
	chunk->nr_populated -= nr;
	chunk->nr_empty_pop_pages -= nr;
	pcpu_nr_empty_pop_pages -= nr;
}

/*
 * Chunk management implementation.
 *
 * To allow different implementations, chunk alloc/free and
 * [de]population are implemented in a separate file which is pulled
 * into this file and compiled together.  The following functions
 * should be implemented.
 *
 * pcpu_populate_chunk		- populate the specified range of a chunk
 * pcpu_depopulate_chunk	- depopulate the specified range of a chunk
 * pcpu_create_chunk		- create a new chunk
 * pcpu_destroy_chunk		- destroy a chunk, always preceded by full depop
 * pcpu_addr_to_page		- translate address to physical page
 * pcpu_verify_alloc_info	- check alloc_info is acceptable during init
1279
 */
1280
static int pcpu_populate_chunk(struct pcpu_chunk *chunk,
1281
			       int page_start, int page_end, gfp_t gfp);
1282 1283
static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk,
				  int page_start, int page_end);
1284
static struct pcpu_chunk *pcpu_create_chunk(gfp_t gfp);
1285 1286 1287
static void pcpu_destroy_chunk(struct pcpu_chunk *chunk);
static struct page *pcpu_addr_to_page(void *addr);
static int __init pcpu_verify_alloc_info(const struct pcpu_alloc_info *ai);
1288

1289 1290 1291
#ifdef CONFIG_NEED_PER_CPU_KM
#include "percpu-km.c"
#else
1292
#include "percpu-vm.c"
1293
#endif
1294

1295 1296 1297 1298
/**
 * pcpu_chunk_addr_search - determine chunk containing specified address
 * @addr: address for which the chunk needs to be determined.
 *
1299 1300 1301
 * This is an internal function that handles all but static allocations.
 * Static percpu address values should never be passed into the allocator.
 *
1302 1303 1304 1305 1306
 * RETURNS:
 * The address of the found chunk.
 */
static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr)
{
1307
	/* is it in the dynamic region (first chunk)? */
1308
	if (pcpu_addr_in_chunk(pcpu_first_chunk, addr))
1309
		return pcpu_first_chunk;
1310 1311

	/* is it in the reserved region? */
1312
	if (pcpu_addr_in_chunk(pcpu_reserved_chunk, addr))
1313
		return pcpu_reserved_chunk;
1314 1315 1316 1317 1318 1319 1320 1321 1322

	/*
	 * The address is relative to unit0 which might be unused and
	 * thus unmapped.  Offset the address to the unit space of the
	 * current processor before looking it up in the vmalloc
	 * space.  Note that any possible cpu id can be used here, so
	 * there's no need to worry about preemption or cpu hotplug.
	 */
	addr += pcpu_unit_offsets[raw_smp_processor_id()];
1323
	return pcpu_get_page_chunk(pcpu_addr_to_page(addr));
1324 1325
}

1326
/**
1327
 * pcpu_alloc - the percpu allocator
 * @size: size of area to allocate in bytes
 * @align: alignment of area (max PAGE_SIZE)
 * @reserved: allocate from the reserved chunk if available
 * @gfp: allocation flags
 *
 * Allocate percpu area of @size bytes aligned at @align.  If @gfp doesn't
 * contain %GFP_KERNEL, the allocation is atomic. If @gfp has __GFP_NOWARN
 * then no warning will be triggered on invalid or failed allocation
 * requests.
 *
 * RETURNS:
 * Percpu pointer to the allocated area on success, NULL on failure.
 */
static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved,
				 gfp_t gfp)
{
	/* whitelisted flags that can be passed to the backing allocators */
	gfp_t pcpu_gfp = gfp & (GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN);
	bool is_atomic = (gfp & GFP_KERNEL) != GFP_KERNEL;
	bool do_warn = !(gfp & __GFP_NOWARN);
	static int warn_limit = 10;
	struct pcpu_chunk *chunk;
	const char *err;
	int slot, off, cpu, ret;
	unsigned long flags;
	void __percpu *ptr;
	size_t bits, bit_align;

	/*
	 * There is now a minimum allocation size of PCPU_MIN_ALLOC_SIZE,
	 * therefore alignment must be a minimum of that many bytes.
	 * An allocation may have up to PCPU_MIN_ALLOC_SIZE - 1 bytes of
	 * internal fragmentation from rounding up.
	 */
	if (unlikely(align < PCPU_MIN_ALLOC_SIZE))
		align = PCPU_MIN_ALLOC_SIZE;

	size = ALIGN(size, PCPU_MIN_ALLOC_SIZE);
	bits = size >> PCPU_MIN_ALLOC_SHIFT;
	bit_align = align >> PCPU_MIN_ALLOC_SHIFT;

	if (unlikely(!size || size > PCPU_MIN_UNIT_SIZE || align > PAGE_SIZE ||
		     !is_power_of_2(align))) {
		WARN(do_warn, "illegal size (%zu) or align (%zu) for percpu allocation\n",
		     size, align);
		return NULL;
	}

	if (!is_atomic) {
		/*
		 * pcpu_balance_workfn() allocates memory under this mutex,
		 * and it may wait for memory reclaim. Allow current task
		 * to become OOM victim, in case of memory pressure.
		 */
		if (gfp & __GFP_NOFAIL)
			mutex_lock(&pcpu_alloc_mutex);
		else if (mutex_lock_killable(&pcpu_alloc_mutex))
			return NULL;
	}

	spin_lock_irqsave(&pcpu_lock, flags);

	/* serve reserved allocations from the reserved chunk if available */
	if (reserved && pcpu_reserved_chunk) {
		chunk = pcpu_reserved_chunk;

		off = pcpu_find_block_fit(chunk, bits, bit_align, is_atomic);
		if (off < 0) {
			err = "alloc from reserved chunk failed";
			goto fail_unlock;
		}

		off = pcpu_alloc_area(chunk, bits, bit_align, off);
		if (off >= 0)
			goto area_found;

		err = "alloc from reserved chunk failed";
		goto fail_unlock;
	}

restart:
	/* search through normal chunks */
	for (slot = pcpu_size_to_slot(size); slot < pcpu_nr_slots; slot++) {
		list_for_each_entry(chunk, &pcpu_slot[slot], list) {
			off = pcpu_find_block_fit(chunk, bits, bit_align,
						  is_atomic);
			if (off < 0)
				continue;

			off = pcpu_alloc_area(chunk