/*
 * Resizable virtual memory filesystem for Linux.
 *
 * Copyright (C) 2000 Linus Torvalds.
 *		 2000 Transmeta Corp.
 *		 2000-2001 Christoph Rohland
 *		 2000-2001 SAP AG
 *		 2002 Red Hat Inc.
 * Copyright (C) 2002-2011 Hugh Dickins.
 * Copyright (C) 2011 Google Inc.
 * Copyright (C) 2002-2005 VERITAS Software Corporation.
 * Copyright (C) 2004 Andi Kleen, SuSE Labs
 *
 * Extended attribute support for tmpfs:
 * Copyright (c) 2004, Luke Kenneth Casson Leighton <lkcl@lkcl.net>
 * Copyright (c) 2004 Red Hat, Inc., James Morris <jmorris@redhat.com>
 *
 * tiny-shmem:
 * Copyright (c) 2004, 2008 Matt Mackall <mpm@selenic.com>
 *
 * This file is released under the GPL.
 */

#include <linux/fs.h>
#include <linux/init.h>
#include <linux/vfs.h>
#include <linux/mount.h>
#include <linux/ramfs.h>
#include <linux/pagemap.h>
#include <linux/file.h>
#include <linux/mm.h>
#include <linux/random.h>
#include <linux/sched/signal.h>
#include <linux/export.h>
#include <linux/swap.h>
#include <linux/uio.h>
#include <linux/khugepaged.h>
#include <linux/hugetlb.h>

#include <asm/tlbflush.h> /* for arch/microblaze update_mmu_cache() */

static struct vfsmount *shm_mnt;

#ifdef CONFIG_SHMEM
/*
 * This virtual memory filesystem is heavily based on the ramfs. It
 * extends ramfs by the ability to use swap and honor resource limits
 * which makes it a completely usable filesystem.
 */

#include <linux/xattr.h>
#include <linux/exportfs.h>
#include <linux/posix_acl.h>
#include <linux/posix_acl_xattr.h>
#include <linux/mman.h>
#include <linux/string.h>
#include <linux/slab.h>
#include <linux/backing-dev.h>
#include <linux/shmem_fs.h>
#include <linux/writeback.h>
#include <linux/blkdev.h>
#include <linux/pagevec.h>
#include <linux/percpu_counter.h>
#include <linux/falloc.h>
#include <linux/splice.h>
#include <linux/security.h>
#include <linux/swapops.h>
#include <linux/mempolicy.h>
#include <linux/namei.h>
#include <linux/ctype.h>
#include <linux/migrate.h>
#include <linux/highmem.h>
#include <linux/seq_file.h>
#include <linux/magic.h>
#include <linux/syscalls.h>
#include <linux/fcntl.h>
#include <uapi/linux/memfd.h>
#include <linux/userfaultfd_k.h>
#include <linux/rmap.h>
#include <linux/uuid.h>

#include <linux/uaccess.h>
#include <asm/pgtable.h>

#include "internal.h"

#define BLOCKS_PER_PAGE  (PAGE_SIZE/512)
#define VM_ACCT(size)    (PAGE_ALIGN(size) >> PAGE_SHIFT)

/* Pretend that each entry is of this size in directory's i_size */
#define BOGO_DIRENT_SIZE 20

/* Symlink up to this size is kmalloc'ed instead of using a swappable page */
#define SHORT_SYMLINK_LEN 128

/*
 * shmem_fallocate communicates with shmem_fault or shmem_writepage via
 * inode->i_private (with i_mutex making sure that it has only one user at
 * a time): we would prefer not to enlarge the shmem inode just for that.
 */
struct shmem_falloc {
	wait_queue_head_t *waitq; /* faults into hole wait for punch to end */
	pgoff_t start;		/* start of range currently being fallocated */
	pgoff_t next;		/* the next page offset to be fallocated */
	pgoff_t nr_falloced;	/* how many new pages have been fallocated */
	pgoff_t nr_unswapped;	/* how often writepage refused to swap out */
};

#ifdef CONFIG_TMPFS
static unsigned long shmem_default_max_blocks(void)
{
	return totalram_pages() / 2;
}

static unsigned long shmem_default_max_inodes(void)
{
	unsigned long nr_pages = totalram_pages();

	return min(nr_pages - totalhigh_pages(), nr_pages / 2);
}
#endif

static bool shmem_should_replace_page(struct page *page, gfp_t gfp);
static int shmem_replace_page(struct page **pagep, gfp_t gfp,
				struct shmem_inode_info *info, pgoff_t index);
static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
		struct page **pagep, enum sgp_type sgp,
		gfp_t gfp, struct vm_area_struct *vma,
		struct vm_fault *vmf, vm_fault_t *fault_type);

int shmem_getpage(struct inode *inode, pgoff_t index,
		struct page **pagep, enum sgp_type sgp)
{
	return shmem_getpage_gfp(inode, index, pagep, sgp,
		mapping_gfp_mask(inode->i_mapping), NULL, NULL, NULL);
}

static inline struct shmem_sb_info *SHMEM_SB(struct super_block *sb)
{
	return sb->s_fs_info;
}

/*
 * shmem_file_setup pre-accounts the whole fixed size of a VM object,
 * for shared memory and for shared anonymous (/dev/zero) mappings
 * (unless MAP_NORESERVE and sysctl_overcommit_memory <= 1),
 * consistent with the pre-accounting of private mappings ...
 */
static inline int shmem_acct_size(unsigned long flags, loff_t size)
{
	return (flags & VM_NORESERVE) ?
		0 : security_vm_enough_memory_mm(current->mm, VM_ACCT(size));
}

static inline void shmem_unacct_size(unsigned long flags, loff_t size)
{
	if (!(flags & VM_NORESERVE))
		vm_unacct_memory(VM_ACCT(size));
}

static inline int shmem_reacct_size(unsigned long flags,
		loff_t oldsize, loff_t newsize)
{
	if (!(flags & VM_NORESERVE)) {
		if (VM_ACCT(newsize) > VM_ACCT(oldsize))
			return security_vm_enough_memory_mm(current->mm,
					VM_ACCT(newsize) - VM_ACCT(oldsize));
		else if (VM_ACCT(newsize) < VM_ACCT(oldsize))
			vm_unacct_memory(VM_ACCT(oldsize) - VM_ACCT(newsize));
	}
	return 0;
}

/*
 * ... whereas tmpfs objects are accounted incrementally as
 * pages are allocated, in order to allow large sparse files.
 * shmem_getpage reports shmem_acct_block failure as -ENOSPC not -ENOMEM,
 * so that a failure on a sparse tmpfs mapping will give SIGBUS not OOM.
 */
static inline int shmem_acct_block(unsigned long flags, long pages)
{
	if (!(flags & VM_NORESERVE))
		return 0;

	return security_vm_enough_memory_mm(current->mm,
			pages * VM_ACCT(PAGE_SIZE));
}

static inline void shmem_unacct_blocks(unsigned long flags, long pages)
{
	if (flags & VM_NORESERVE)
		vm_unacct_memory(pages * VM_ACCT(PAGE_SIZE));
}

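/*
 * Charge @pages against both the per-mm overcommit accounting (only for
 * VM_NORESERVE objects, whose size was not pre-accounted up front) and,
 * on a size-limited mount, the superblock's used_blocks counter.
 * Returns false and undoes the charge if either limit would be exceeded.
 */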
static inline bool shmem_inode_acct_block(struct inode *inode, long pages)
{
	struct shmem_inode_info *info = SHMEM_I(inode);
	struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);

	if (shmem_acct_block(info->flags, pages))
		return false;

	if (sbinfo->max_blocks) {
		if (percpu_counter_compare(&sbinfo->used_blocks,
					   sbinfo->max_blocks - pages) > 0)
			goto unacct;
		percpu_counter_add(&sbinfo->used_blocks, pages);
	}

	return true;

unacct:
	shmem_unacct_blocks(info->flags, pages);
	return false;
}

static inline void shmem_inode_unacct_blocks(struct inode *inode, long pages)
{
	struct shmem_inode_info *info = SHMEM_I(inode);
	struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);

	if (sbinfo->max_blocks)
		percpu_counter_sub(&sbinfo->used_blocks, pages);
	shmem_unacct_blocks(info->flags, pages);
}

static const struct super_operations shmem_ops;
static const struct address_space_operations shmem_aops;
static const struct file_operations shmem_file_operations;
static const struct inode_operations shmem_inode_operations;
static const struct inode_operations shmem_dir_inode_operations;
static const struct inode_operations shmem_special_inode_operations;
static const struct vm_operations_struct shmem_vm_ops;
static struct file_system_type shmem_fs_type;

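/* True if the vma is backed by shmem/tmpfs, i.e. uses shmem's vm_ops. */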
bool vma_is_shmem(struct vm_area_struct *vma)
{
	return vma->vm_ops == &shmem_vm_ops;
}

static LIST_HEAD(shmem_swaplist);
static DEFINE_MUTEX(shmem_swaplist_mutex);

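/*
 * Reserve or release one inode against the mount's inode limit, under
 * stat_lock; both are no-ops when the mount was not given a limit.
 */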
static int shmem_reserve_inode(struct super_block *sb)
{
	struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
	if (sbinfo->max_inodes) {
		spin_lock(&sbinfo->stat_lock);
		if (!sbinfo->free_inodes) {
			spin_unlock(&sbinfo->stat_lock);
			return -ENOSPC;
		}
		sbinfo->free_inodes--;
		spin_unlock(&sbinfo->stat_lock);
	}
	return 0;
}

static void shmem_free_inode(struct super_block *sb)
{
	struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
	if (sbinfo->max_inodes) {
		spin_lock(&sbinfo->stat_lock);
		sbinfo->free_inodes++;
		spin_unlock(&sbinfo->stat_lock);
	}
}

/**
 * shmem_recalc_inode - recalculate the block usage of an inode
 * @inode: inode to recalc
 *
 * We have to calculate the free blocks since the mm can drop
 * undirtied hole pages behind our back.
 *
 * But normally   info->alloced == inode->i_mapping->nrpages + info->swapped
 * So mm freed is info->alloced - (inode->i_mapping->nrpages + info->swapped)
 *
 * It has to be called with the spinlock held.
 */
static void shmem_recalc_inode(struct inode *inode)
{
	struct shmem_inode_info *info = SHMEM_I(inode);
	long freed;

	freed = info->alloced - info->swapped - inode->i_mapping->nrpages;
	if (freed > 0) {
		info->alloced -= freed;
		inode->i_blocks -= freed * BLOCKS_PER_PAGE;
		shmem_inode_unacct_blocks(inode, freed);
	}
}

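/*
 * shmem_charge() and shmem_uncharge() adjust the inode's accounting for
 * pages added to or removed from its page cache without going through
 * shmem_getpage(), e.g. when khugepaged collapses them into a huge page.
 */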
bool shmem_charge(struct inode *inode, long pages)
{
	struct shmem_inode_info *info = SHMEM_I(inode);
	unsigned long flags;

	if (!shmem_inode_acct_block(inode, pages))
		return false;

	/* nrpages adjustment first, then shmem_recalc_inode() when balanced */
	inode->i_mapping->nrpages += pages;

	spin_lock_irqsave(&info->lock, flags);
	info->alloced += pages;
	inode->i_blocks += pages * BLOCKS_PER_PAGE;
	shmem_recalc_inode(inode);
	spin_unlock_irqrestore(&info->lock, flags);

	return true;
}

void shmem_uncharge(struct inode *inode, long pages)
{
	struct shmem_inode_info *info = SHMEM_I(inode);
	unsigned long flags;

	/* nrpages adjustment done by __delete_from_page_cache() or caller */

	spin_lock_irqsave(&info->lock, flags);
	info->alloced -= pages;
	inode->i_blocks -= pages * BLOCKS_PER_PAGE;
	shmem_recalc_inode(inode);
	spin_unlock_irqrestore(&info->lock, flags);

	shmem_inode_unacct_blocks(inode, pages);
}

/*
 * Replace item expected in xarray by a new item, while holding xa_lock.
 */
static int shmem_replace_entry(struct address_space *mapping,
			pgoff_t index, void *expected, void *replacement)
{
	XA_STATE(xas, &mapping->i_pages, index);
	void *item;

	VM_BUG_ON(!expected);
	VM_BUG_ON(!replacement);
	item = xas_load(&xas);
	if (item != expected)
		return -ENOENT;
	xas_store(&xas, replacement);
	return 0;
}

/*
 * Sometimes, before we decide whether to proceed or to fail, we must check
 * that an entry was not already brought back from swap by a racing thread.
 *
 * Checking page is not enough: by the time a SwapCache page is locked, it
 * might be reused, and again be SwapCache, using the same swap as before.
 */
static bool shmem_confirm_swap(struct address_space *mapping,
			       pgoff_t index, swp_entry_t swap)
{
	return xa_load(&mapping->i_pages, index) == swp_to_radix_entry(swap);
}

/*
 * Definitions for "huge tmpfs": tmpfs mounted with the huge= option
 *
 * SHMEM_HUGE_NEVER:
 *	disables huge pages for the mount;
 * SHMEM_HUGE_ALWAYS:
 *	enables huge pages for the mount;
 * SHMEM_HUGE_WITHIN_SIZE:
 *	only allocate huge pages if the page will be fully within i_size,
 *	also respect fadvise()/madvise() hints;
 * SHMEM_HUGE_ADVISE:
 *	only allocate huge pages if requested with fadvise()/madvise();
 */

#define SHMEM_HUGE_NEVER	0
#define SHMEM_HUGE_ALWAYS	1
#define SHMEM_HUGE_WITHIN_SIZE	2
#define SHMEM_HUGE_ADVISE	3

/*
 * Special values.
 * Only can be set via /sys/kernel/mm/transparent_hugepage/shmem_enabled:
 *
 * SHMEM_HUGE_DENY:
 *	disables huge on shm_mnt and all mounts, for emergency use;
 * SHMEM_HUGE_FORCE:
 *	enables huge on shm_mnt and all mounts, w/o needing option, for testing;
 *
 */
#define SHMEM_HUGE_DENY		(-1)
#define SHMEM_HUGE_FORCE	(-2)

#ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE
/* ifdef here to avoid bloating shmem.o when not necessary */

static int shmem_huge __read_mostly;

#if defined(CONFIG_SYSFS) || defined(CONFIG_TMPFS)
static int shmem_parse_huge(const char *str)
{
	if (!strcmp(str, "never"))
		return SHMEM_HUGE_NEVER;
	if (!strcmp(str, "always"))
		return SHMEM_HUGE_ALWAYS;
	if (!strcmp(str, "within_size"))
		return SHMEM_HUGE_WITHIN_SIZE;
	if (!strcmp(str, "advise"))
		return SHMEM_HUGE_ADVISE;
	if (!strcmp(str, "deny"))
		return SHMEM_HUGE_DENY;
	if (!strcmp(str, "force"))
		return SHMEM_HUGE_FORCE;
	return -EINVAL;
}

static const char *shmem_format_huge(int huge)
{
	switch (huge) {
	case SHMEM_HUGE_NEVER:
		return "never";
	case SHMEM_HUGE_ALWAYS:
		return "always";
	case SHMEM_HUGE_WITHIN_SIZE:
		return "within_size";
	case SHMEM_HUGE_ADVISE:
		return "advise";
	case SHMEM_HUGE_DENY:
		return "deny";
	case SHMEM_HUGE_FORCE:
		return "force";
	default:
		VM_BUG_ON(1);
		return "bad_val";
	}
}
#endif

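/*
 * Walk sbinfo->shrinklist and try to split huge pages which extend
 * beyond i_size, so that the tail beyond EOF can be reclaimed; inodes
 * whose page could not be split this time are left on the list.
 */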
static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo,
		struct shrink_control *sc, unsigned long nr_to_split)
{
	LIST_HEAD(list), *pos, *next;
	LIST_HEAD(to_remove);
	struct inode *inode;
	struct shmem_inode_info *info;
	struct page *page;
	unsigned long batch = sc ? sc->nr_to_scan : 128;
	int removed = 0, split = 0;

	if (list_empty(&sbinfo->shrinklist))
		return SHRINK_STOP;

	spin_lock(&sbinfo->shrinklist_lock);
	list_for_each_safe(pos, next, &sbinfo->shrinklist) {
		info = list_entry(pos, struct shmem_inode_info, shrinklist);

		/* pin the inode */
		inode = igrab(&info->vfs_inode);

		/* inode is about to be evicted */
		if (!inode) {
			list_del_init(&info->shrinklist);
			removed++;
			goto next;
		}

		/* Check if there's anything to gain */
		if (round_up(inode->i_size, PAGE_SIZE) ==
				round_up(inode->i_size, HPAGE_PMD_SIZE)) {
			list_move(&info->shrinklist, &to_remove);
			removed++;
			goto next;
		}

		list_move(&info->shrinklist, &list);
next:
		if (!--batch)
			break;
	}
	spin_unlock(&sbinfo->shrinklist_lock);

	list_for_each_safe(pos, next, &to_remove) {
		info = list_entry(pos, struct shmem_inode_info, shrinklist);
		inode = &info->vfs_inode;
		list_del_init(&info->shrinklist);
		iput(inode);
	}

	list_for_each_safe(pos, next, &list) {
		int ret;

		info = list_entry(pos, struct shmem_inode_info, shrinklist);
		inode = &info->vfs_inode;

		if (nr_to_split && split >= nr_to_split)
			goto leave;

		page = find_get_page(inode->i_mapping,
				(inode->i_size & HPAGE_PMD_MASK) >> PAGE_SHIFT);
		if (!page)
			goto drop;

		/* No huge page at the end of the file: nothing to split */
		if (!PageTransHuge(page)) {
			put_page(page);
			goto drop;
		}

		/*
		 * Leave the inode on the list if we failed to lock
		 * the page at this time.
		 *
		 * Waiting for the lock may lead to deadlock in the
		 * reclaim path.
		 */
		if (!trylock_page(page)) {
			put_page(page);
			goto leave;
		}

		ret = split_huge_page(page);
		unlock_page(page);
		put_page(page);

		/* If split failed leave the inode on the list */
		if (ret)
			goto leave;

		split++;
drop:
		list_del_init(&info->shrinklist);
		removed++;
leave:
		iput(inode);
	}

	spin_lock(&sbinfo->shrinklist_lock);
	list_splice_tail(&list, &sbinfo->shrinklist);
	sbinfo->shrinklist_len -= removed;
	spin_unlock(&sbinfo->shrinklist_lock);

	return split;
}

static long shmem_unused_huge_scan(struct super_block *sb,
		struct shrink_control *sc)
{
	struct shmem_sb_info *sbinfo = SHMEM_SB(sb);

	if (!READ_ONCE(sbinfo->shrinklist_len))
		return SHRINK_STOP;

	return shmem_unused_huge_shrink(sbinfo, sc, 0);
}

static long shmem_unused_huge_count(struct super_block *sb,
		struct shrink_control *sc)
{
	struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
	return READ_ONCE(sbinfo->shrinklist_len);
}
#else /* !CONFIG_TRANSPARENT_HUGE_PAGECACHE */

#define shmem_huge SHMEM_HUGE_DENY

static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo,
		struct shrink_control *sc, unsigned long nr_to_split)
{
	return 0;
}
#endif /* CONFIG_TRANSPARENT_HUGE_PAGECACHE */

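/* May this mount use huge pages, taking the shmem_enabled override into account? */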
static inline bool is_huge_enabled(struct shmem_sb_info *sbinfo)
{
	if (IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE) &&
	    (shmem_huge == SHMEM_HUGE_FORCE || sbinfo->huge) &&
	    shmem_huge != SHMEM_HUGE_DENY)
		return true;
	return false;
}

/*
 * Like add_to_page_cache_locked, but error if expected item has gone.
 */
static int shmem_add_to_page_cache(struct page *page,
				   struct address_space *mapping,
				   pgoff_t index, void *expected, gfp_t gfp)
{
	XA_STATE_ORDER(xas, &mapping->i_pages, index, compound_order(page));
	unsigned long i = 0;
	unsigned long nr = 1UL << compound_order(page);

	VM_BUG_ON_PAGE(PageTail(page), page);
	VM_BUG_ON_PAGE(index != round_down(index, nr), page);
	VM_BUG_ON_PAGE(!PageLocked(page), page);
	VM_BUG_ON_PAGE(!PageSwapBacked(page), page);
	VM_BUG_ON(expected && PageTransHuge(page));

	page_ref_add(page, nr);
	page->mapping = mapping;
	page->index = index;

	do {
		void *entry;
		xas_lock_irq(&xas);
		entry = xas_find_conflict(&xas);
		if (entry != expected)
			xas_set_err(&xas, -EEXIST);
		xas_create_range(&xas);
		if (xas_error(&xas))
			goto unlock;
next:
		xas_store(&xas, page + i);
		if (++i < nr) {
			xas_next(&xas);
			goto next;
		}
		if (PageTransHuge(page)) {
			count_vm_event(THP_FILE_ALLOC);
			__inc_node_page_state(page, NR_SHMEM_THPS);
		}
		mapping->nrpages += nr;
		__mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, nr);
		__mod_node_page_state(page_pgdat(page), NR_SHMEM, nr);
unlock:
		xas_unlock_irq(&xas);
	} while (xas_nomem(&xas, gfp));

	if (xas_error(&xas)) {
		page->mapping = NULL;
		page_ref_sub(page, nr);
		return xas_error(&xas);
	}

	return 0;
}

/*
 * Like delete_from_page_cache, but substitutes swap for page.
 */
static void shmem_delete_from_page_cache(struct page *page, void *radswap)
{
	struct address_space *mapping = page->mapping;
	int error;

	VM_BUG_ON_PAGE(PageCompound(page), page);

	xa_lock_irq(&mapping->i_pages);
	error = shmem_replace_entry(mapping, page->index, page, radswap);
	page->mapping = NULL;
	mapping->nrpages--;
	__dec_node_page_state(page, NR_FILE_PAGES);
	__dec_node_page_state(page, NR_SHMEM);
	xa_unlock_irq(&mapping->i_pages);
	put_page(page);
	BUG_ON(error);
}

/*
 * Remove swap entry from page cache, free the swap and its page cache.
 */
static int shmem_free_swap(struct address_space *mapping,
			   pgoff_t index, void *radswap)
{
	void *old;

	old = xa_cmpxchg_irq(&mapping->i_pages, index, radswap, NULL, 0);
	if (old != radswap)
		return -ENOENT;
	free_swap_and_cache(radix_to_swp_entry(radswap));
	return 0;
}

/*
 * Determine (in bytes) how many of the shmem object's pages mapped by the
 * given offsets are swapped out.
 *
 * This is safe to call without i_mutex or the i_pages lock thanks to RCU,
 * as long as the inode doesn't go away and racy results are not a problem.
 */
unsigned long shmem_partial_swap_usage(struct address_space *mapping,
						pgoff_t start, pgoff_t end)
{
	XA_STATE(xas, &mapping->i_pages, start);
	struct page *page;
	unsigned long swapped = 0;

	rcu_read_lock();
	xas_for_each(&xas, page, end - 1) {
		if (xas_retry(&xas, page))
			continue;
		if (xa_is_value(page))
			swapped++;

		if (need_resched()) {
			xas_pause(&xas);
			cond_resched_rcu();
		}
	}

	rcu_read_unlock();

	return swapped << PAGE_SHIFT;
}

/*
 * Determine (in bytes) how many of the shmem object's pages mapped by the
 * given vma is swapped out.
 *
 * This is safe to call without i_mutex or the i_pages lock thanks to RCU,
 * as long as the inode doesn't go away and racy results are not a problem.
 */
unsigned long shmem_swap_usage(struct vm_area_struct *vma)
{
	struct inode *inode = file_inode(vma->vm_file);
	struct shmem_inode_info *info = SHMEM_I(inode);
	struct address_space *mapping = inode->i_mapping;
	unsigned long swapped;

	/* Be careful as we don't hold info->lock */
	swapped = READ_ONCE(info->swapped);

	/*
	 * The easier cases are when the shmem object has nothing in swap, or
	 * the vma maps it whole. Then we can simply use the stats that we
	 * already track.
	 */
	if (!swapped)
		return 0;

	if (!vma->vm_pgoff && vma->vm_end - vma->vm_start >= inode->i_size)
		return swapped << PAGE_SHIFT;

	/* Here comes the more involved part */
	return shmem_partial_swap_usage(mapping,
			linear_page_index(vma, vma->vm_start),
			linear_page_index(vma, vma->vm_end));
}

/*
 * SysV IPC SHM_UNLOCK restore Unevictable pages to their evictable lists.
 */
void shmem_unlock_mapping(struct address_space *mapping)
{
	struct pagevec pvec;
	pgoff_t indices[PAGEVEC_SIZE];
	pgoff_t index = 0;

	pagevec_init(&pvec);
	/*
	 * Minor point, but we might as well stop if someone else SHM_LOCKs it.
	 */
	while (!mapping_unevictable(mapping)) {
		/*
		 * Avoid pagevec_lookup(): find_get_pages() returns 0 as if it
		 * has finished, if it hits a row of PAGEVEC_SIZE swap entries.
		 */
		pvec.nr = find_get_entries(mapping, index,
					   PAGEVEC_SIZE, pvec.pages, indices);
		if (!pvec.nr)
			break;
		index = indices[pvec.nr - 1] + 1;
		pagevec_remove_exceptionals(&pvec);
		check_move_unevictable_pages(&pvec);
		pagevec_release(&pvec);
		cond_resched();
	}
}

/*
 * Remove range of pages and swap entries from page cache, and free them.
 * If !unfalloc, truncate or punch hole; if unfalloc, undo failed fallocate.
 */
static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
								 bool unfalloc)
{
	struct address_space *mapping = inode->i_mapping;
	struct shmem_inode_info *info = SHMEM_I(inode);
	pgoff_t start = (lstart + PAGE_SIZE - 1) >> PAGE_SHIFT;
	pgoff_t end = (lend + 1) >> PAGE_SHIFT;
	unsigned int partial_start = lstart & (PAGE_SIZE - 1);
	unsigned int partial_end = (lend + 1) & (PAGE_SIZE - 1);
	struct pagevec pvec;
	pgoff_t indices[PAGEVEC_SIZE];
	long nr_swaps_freed = 0;
	pgoff_t index;
	int i;

	if (lend == -1)
		end = -1;	/* unsigned, so actually very big */

	pagevec_init(&pvec);
	index = start;
	while (index < end) {
		pvec.nr = find_get_entries(mapping, index,
			min(end - index, (pgoff_t)PAGEVEC_SIZE),
			pvec.pages, indices);
		if (!pvec.nr)
			break;
		for (i = 0; i < pagevec_count(&pvec); i++) {
			struct page *page = pvec.pages[i];

			index = indices[i];
			if (index >= end)
				break;

			if (xa_is_value(page)) {
				if (unfalloc)
					continue;
				nr_swaps_freed += !shmem_free_swap(mapping,
								index, page);
				continue;
			}

			VM_BUG_ON_PAGE(page_to_pgoff(page) != index, page);

			if (!trylock_page(page))
				continue;

			if (PageTransTail(page)) {
				/* Middle of THP: zero out the page */
				clear_highpage(page);
				unlock_page(page);
				continue;
			} else if (PageTransHuge(page)) {
				if (index == round_down(end, HPAGE_PMD_NR)) {
					/*
					 * Range ends in the middle of THP:
					 * zero out the page
					 */
					clear_highpage(page);
					unlock_page(page);
					continue;
				}
				index += HPAGE_PMD_NR - 1;
				i += HPAGE_PMD_NR - 1;
			}

			if (!unfalloc || !PageUptodate(page)) {
				VM_BUG_ON_PAGE(PageTail(page), page);
				if (page_mapping(page) == mapping) {
					VM_BUG_ON_PAGE(PageWriteback(page), page);
					truncate_inode_page(mapping, page);
				}
			}
			unlock_page(page);
		}
		pagevec_remove_exceptionals(&pvec);
		pagevec_release(&pvec);
		cond_resched();
		index++;
	}

	if (partial_start) {
		struct page *page = NULL;
		shmem_getpage(inode, start - 1, &page, SGP_READ);
		if (page) {
			unsigned int top = PAGE_SIZE;
			if (start > end) {
				top = partial_end;
				partial_end = 0;
			}
			zero_user_segment(page, partial_start, top);
			set_page_dirty(page);
			unlock_page(page);
			put_page(page);
		}
	}
	if (partial_end) {
		struct page *page = NULL;
		shmem_getpage(inode, end, &page, SGP_READ);
		if (page) {
			zero_user_segment(page, 0, partial_end);
			set_page_dirty(page);
			unlock_page(page);
			put_page(page);
		}
	}
	if (start >= end)
		return;

	index = start;
	while (index < end) {
		cond_resched();

		pvec.nr = find_get_entries(mapping, index,
				min(end - index, (pgoff_t)PAGEVEC_SIZE),
				pvec.pages, indices);
		if (!pvec.nr) {
			/* If all gone or hole-punch or unfalloc, we're done */
			if (index == start || end != -1)
				break;
			/* But if truncating, restart to make sure all gone */
			index = start;
			continue;
		}
		for (i = 0; i < pagevec_count(&pvec); i++) {
			struct page *page = pvec.pages[i];

			index = indices[i];
			if (index >= end)
				break;

			if (xa_is_value(page)) {
				if (unfalloc)
					continue;
				if (shmem_free_swap(mapping, index, page)) {
					/* Swap was replaced by page: retry */
					index--;
					break;
				}
				nr_swaps_freed++;
				continue;
			}

			lock_page(page);

			if (PageTransTail(page)) {
				/* Middle of THP: zero out the page */
				clear_highpage(page);
				unlock_page(page);
				/*
				 * Partial thp truncate due 'start' in middle
				 * of THP: don't need to look on these pages
				 * again on !pvec.nr restart.
				 */
				if (index != round_down(end, HPAGE_PMD_NR))
					start++;
				continue;
			} else if (PageTransHuge(page)) {
				if (index == round_down(end, HPAGE_PMD_NR)) {
					/*
					 * Range ends in the middle of THP:
					 * zero out the page
					 */
					clear_highpage(page);
					unlock_page(page);
					continue;
				}
				index += HPAGE_PMD_NR - 1;
				i += HPAGE_PMD_NR - 1;
			}

			if (!unfalloc || !PageUptodate(page)) {
				VM_BUG_ON_PAGE(PageTail(page), page);
				if (page_mapping(page) == mapping) {
					VM_BUG_ON_PAGE(PageWriteback(page), page);
					truncate_inode_page(mapping, page);
				} else {
					/* Page was replaced by swap: retry */
					unlock_page(page);
					index--;
					break;
				}
			}
			unlock_page(page);
		}
		pagevec_remove_exceptionals(&pvec);
		pagevec_release(&pvec);
		index++;
	}

	spin_lock_irq(&info->lock);
	info->swapped -= nr_swaps_freed;
	shmem_recalc_inode(inode);
	spin_unlock_irq(&info->lock);
}

void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
{
	shmem_undo_range(inode, lstart, lend, false);
	inode->i_ctime = inode->i_mtime = current_time(inode);
}
EXPORT_SYMBOL_GPL(shmem_truncate_range);

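/*
 * getattr: fold any pages freed behind our back into the accounting
 * before reporting, and advertise a huge blocksize where appropriate.
 */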
static int shmem_getattr(const struct path *path, struct kstat *stat,
			 u32 request_mask, unsigned int query_flags)
{
	struct inode *inode = path->dentry->d_inode;
	struct shmem_inode_info *info = SHMEM_I(inode);
	struct shmem_sb_info *sb_info = SHMEM_SB(inode->i_sb);

	if (info->alloced - info->swapped != inode->i_mapping->nrpages) {
		spin_lock_irq(&info->lock);
		shmem_recalc_inode(inode);
		spin_unlock_irq(&info->lock);
	}
	generic_fillattr(inode, stat);

	if (is_huge_enabled(sb_info))
		stat->blksize = HPAGE_PMD_SIZE;

	return 0;
}

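/*
 * setattr: a size change must honour memfd seals, re-account the object,
 * unmap and truncate the removed range, and leave any partially-truncated
 * huge page to the shrinker.
 */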
static int shmem_setattr(struct dentry *dentry, struct iattr *attr)
{
	struct inode *inode = d_inode(dentry);
	struct shmem_inode_info *info = SHMEM_I(inode);
	struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
	int error;

	error = setattr_prepare(dentry, attr);
	if (error)
		return error;

	if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) {
		loff_t oldsize = inode->i_size;
		loff_t newsize = attr->ia_size;

		/* protected by i_mutex */
		if ((newsize < oldsize && (info->seals & F_SEAL_SHRINK)) ||
		    (newsize > oldsize && (info->seals & F_SEAL_GROW)))
			return -EPERM;

		if (newsize != oldsize) {
			error = shmem_reacct_size(SHMEM_I(inode)->flags,
					oldsize, newsize);
			if (error)
				return error;
			i_size_write(inode, newsize);
			inode->i_ctime = inode->i_mtime = current_time(inode);
		}
		if (newsize <= oldsize) {
			loff_t holebegin = round_up(newsize, PAGE_SIZE);
			if (oldsize > holebegin)
				unmap_mapping_range(inode->i_mapping,
							holebegin, 0, 1);
			if (info->alloced)
				shmem_truncate_range(inode,
							newsize, (loff_t)-1);
			/* unmap again to remove racily COWed private pages */
			if (oldsize > holebegin)
				unmap_mapping_range(inode->i_mapping,
							holebegin, 0, 1);

			/*
			 * Part of the huge page can be beyond i_size: subject
			 * to shrink under memory pressure.
			 */
			if (IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE)) {
				spin_lock(&sbinfo->shrinklist_lock);
				/*
				 * _careful to defend against unlocked access to
				 * ->shrink_list in shmem_unused_huge_shrink()
				 */
				if (list_empty_careful(&info->shrinklist)) {
					list_add_tail(&info->shrinklist,
							&sbinfo->shrinklist);
					sbinfo->shrinklist_len++;
				}
				spin_unlock(&sbinfo->shrinklist_lock);
			}
		}
	}

	setattr_copy(inode, attr);
	if (attr->ia_valid & ATTR_MODE)
		error = posix_acl_chmod(inode, inode->i_mode);
	return error;
}

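/*
 * Final iput: give back the object's size and blocks, empty its page
 * cache and swap, and take it off the swaplist and shrinklist.
 */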
static void shmem_evict_inode(struct inode *inode)
{
	struct shmem_inode_info *info = SHMEM_I(inode);
	struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);

	if (inode->i_mapping->a_ops == &shmem_aops) {
		shmem_unacct_size(info->flags, inode->i_size);
		inode->i_size = 0;
		shmem_truncate_range(inode, 0, (loff_t)-1);
		if (!list_empty(&info->shrinklist)) {
			spin_lock(&sbinfo->shrinklist_lock);
			if (!list_empty(&info->shrinklist)) {
				list_del_init(&info->shrinklist);
				sbinfo->shrinklist_len--;
			}
			spin_unlock(&sbinfo->shrinklist_lock);
		}
		if (!list_empty(&info->swaplist)) {
			mutex_lock(&shmem_swaplist_mutex);
			list_del_init(&info->swaplist);
			mutex_unlock(&shmem_swaplist_mutex);
		}
	}

	simple_xattrs_free(&info->xattrs);
	WARN_ON(inode->i_blocks);
	shmem_free_inode(inode->i_sb);
	clear_inode(inode);
}

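/*
 * Scan the whole xarray for an entry matching @item (a swap radix
 * entry), returning its index, or -1 if it is not present.
 */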
static unsigned long find_swap_entry(struct xarray *xa, void *item)
{
	XA_STATE(xas, xa, 0);
	unsigned int checked = 0;
	void *entry;

	rcu_read_lock();
	xas_for_each(&xas, entry, ULONG_MAX) {
		if (xas_retry(&xas, entry))
			continue;
		if (entry == item)
			break;
		checked++;
		if ((checked % XA_CHECK_SCHED) != 0)
			continue;
		xas_pause(&xas);
		cond_resched_rcu();
	}
	rcu_read_unlock();

	return entry ? xas.xa_index : -1;
}

/*
 * If swap found in inode, free it and move page from swapcache to filecache.
 */
static int shmem_unuse_inode(struct shmem_inode_info *info,
			     swp_entry_t swap, struct page **pagep)
{
	struct address_space *mapping = info->vfs_inode.i_mapping;
	void *radswap;
	pgoff_t index;
	gfp_t gfp;
	int error = 0;

	radswap = swp_to_radix_entry(swap);
	index = find_swap_entry(&mapping->i_pages, radswap);
	if (index == -1)
		return -EAGAIN;	/* tell shmem_unuse we found nothing */

	/*
	 * Move _head_ to start search for next from here.
	 * But be careful: shmem_evict_inode checks list_empty without taking
	 * mutex, and there's an instant in list_move_tail when info->swaplist
	 * would appear empty, if it were the only one on shmem_swaplist.
	 */
	if (shmem_swaplist.next != &info->swaplist)
		list_move_tail(&shmem_swaplist, &info->swaplist);

	gfp = mapping_gfp_mask(mapping);
	if (shmem_should_replace_page(*pagep, gfp)) {
		mutex_unlock(&shmem_swaplist_mutex);
		error = shmem_replace_page(pagep, gfp, info, index);
		mutex_lock(&shmem_swaplist_mutex);
		/*
		 * We needed to drop mutex to make that restrictive page
		 * allocation, but the inode might have been freed while we
		 * dropped it: although a racing shmem_evict_inode() cannot
		 * complete without emptying the page cache, our page lock
		 * on this swapcache page is not enough to prevent that -
		 * free_swap_and_cache() of our swap entry will only
		 * trylock_page(), removing swap from page cache whatever.
		 *
		 * We must not proceed to shmem_add_to_page_cache() if the
		 * inode has been freed, but of course we cannot rely on
		 * inode or mapping or info to check that.  However, we can
		 * safely check if our swap entry is still in use (and here
		 * it can't have got reused for another page): if it's still
		 * in use, then the inode cannot have been freed yet, and we
		 * can safely proceed (if it's no longer in use, that tells
		 * nothing about the inode, but we don't need to unuse swap).
		 */
		if (!page_swapcount(*pagep))
			error = -ENOENT;
	}

	/*
	 * We rely on shmem_swaplist_mutex, not only to protect the swaplist,
	 * but also to hold up shmem_evict_inode(): so inode cannot be freed
	 * beneath us (pagelock doesn't help until the page is in pagecache).
	 */
	if (!error)
		error = shmem_add_to_page_cache(*pagep, mapping, index,
						radswap, gfp);
	if (error != -ENOMEM) {