namei.c 123 KB
Newer Older
1
// SPDX-License-Identifier: GPL-2.0
Linus Torvalds's avatar
Linus Torvalds committed
2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18
/*
 *  linux/fs/namei.c
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 */

/*
 * Some corrections by tytso.
 */

/* [Feb 1997 T. Schoebel-Theuer] Complete rewrite of the pathname
 * lookup logic.
 */
/* [Feb-Apr 2000, AV] Rewrite to the new namespace architecture.
 */

#include <linux/init.h>
19
#include <linux/export.h>
20
#include <linux/kernel.h>
Linus Torvalds's avatar
Linus Torvalds committed
21 22 23 24
#include <linux/slab.h>
#include <linux/fs.h>
#include <linux/namei.h>
#include <linux/pagemap.h>
Robert Love's avatar
Robert Love committed
25
#include <linux/fsnotify.h>
Linus Torvalds's avatar
Linus Torvalds committed
26 27
#include <linux/personality.h>
#include <linux/security.h>
Mimi Zohar's avatar
Mimi Zohar committed
28
#include <linux/ima.h>
Linus Torvalds's avatar
Linus Torvalds committed
29 30 31
#include <linux/syscalls.h>
#include <linux/mount.h>
#include <linux/audit.h>
32
#include <linux/capability.h>
33
#include <linux/file.h>
34
#include <linux/fcntl.h>
35
#include <linux/device_cgroup.h>
36
#include <linux/fs_struct.h>
37
#include <linux/posix_acl.h>
38
#include <linux/hash.h>
39
#include <linux/bitops.h>
40
#include <linux/init_task.h>
41
#include <linux/uaccess.h>
42
#include <linux/build_bug.h>
Linus Torvalds's avatar
Linus Torvalds committed
43

44
#include "internal.h"
45
#include "mount.h"
46

Linus Torvalds's avatar
Linus Torvalds committed
47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80
/* [Feb-1997 T. Schoebel-Theuer]
 * Fundamental changes in the pathname lookup mechanisms (namei)
 * were necessary because of omirr.  The reason is that omirr needs
 * to know the _real_ pathname, not the user-supplied one, in case
 * of symlinks (and also when transname replacements occur).
 *
 * The new code replaces the old recursive symlink resolution with
 * an iterative one (in case of non-nested symlink chains).  It does
 * this with calls to <fs>_follow_link().
 * As a side effect, dir_namei(), _namei() and follow_link() are now 
 * replaced with a single function lookup_dentry() that can handle all 
 * the special cases of the former code.
 *
 * With the new dcache, the pathname is stored at each inode, at least as
 * long as the refcount of the inode is positive.  As a side effect, the
 * size of the dcache depends on the inode cache and thus is dynamic.
 *
 * [29-Apr-1998 C. Scott Ananian] Updated above description of symlink
 * resolution to correspond with current state of the code.
 *
 * Note that the symlink resolution is not *completely* iterative.
 * There is still a significant amount of tail- and mid- recursion in
 * the algorithm.  Also, note that <fs>_readlink() is not used in
 * lookup_dentry(): lookup_dentry() on the result of <fs>_readlink()
 * may return different results than <fs>_follow_link().  Many virtual
 * filesystems (including /proc) exhibit this behavior.
 */

/* [24-Feb-97 T. Schoebel-Theuer] Side effects caused by new implementation:
 * New symlink semantics: when open() is called with flags O_CREAT | O_EXCL
 * and the name already exists in form of a symlink, try to create the new
 * name indicated by the symlink. The old code always complained that the
 * name already exists, due to not following the symlink even if its target
 * is nonexistent.  The new semantics affects also mknod() and link() when
81
 * the name is a symlink pointing to a non-existent name.
Linus Torvalds's avatar
Linus Torvalds committed
82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113
 *
 * I don't know which semantics is the right one, since I have no access
 * to standards. But I found by trial that HP-UX 9.0 has the full "new"
 * semantics implemented, while SunOS 4.1.1 and Solaris (SunOS 5.4) have the
 * "old" one. Personally, I think the new semantics is much more logical.
 * Note that "ln old new" where "new" is a symlink pointing to a non-existing
 * file does succeed in both HP-UX and SunOS, but not in Solaris
 * and in the old Linux semantics.
 */

/* [16-Dec-97 Kevin Buhr] For security reasons, we change some symlink
 * semantics.  See the comments in "open_namei" and "do_link" below.
 *
 * [10-Sep-98 Alan Modra] Another symlink change.
 */

/* [Feb-Apr 2000 AV] Complete rewrite. Rules for symlinks:
 *	inside the path - always follow.
 *	in the last component in creation/removal/renaming - never follow.
 *	if LOOKUP_FOLLOW passed - follow.
 *	if the pathname has trailing slashes - follow.
 *	otherwise - don't follow.
 * (applied in that order).
 *
 * [Jun 2000 AV] Inconsistent behaviour of open() in case if flags==O_CREAT
 * restored for 2.4. This is the last surviving part of old 4.2BSD bug.
 * During the 2.4 we need to fix the userland stuff depending on it -
 * hopefully we will be able to get rid of that wart in 2.5. So far only
 * XEmacs seems to be relying on it...
 */
/*
 * [Sep 2001 AV] Single-semaphore locking scheme (kudos to David Holland)
114
 * implemented.  Let's see if raised priority of ->s_vfs_rename_mutex gives
Linus Torvalds's avatar
Linus Torvalds committed
115 116 117 118 119 120 121 122 123 124
 * any extra contention...
 */

/* In order to reduce some races, while at the same time doing additional
 * checking and hopefully speeding things up, we copy filenames to the
 * kernel data space before using them.
 *
 * POSIX.1 2.4: an empty pathname is invalid (ENOENT).
 * PATH_MAX includes the nul terminator --RR.
 */
125

Al Viro's avatar
Al Viro committed
126
/*
 * Number of bytes available for a pathname that is stored inline in a
 * struct filename: PATH_MAX minus the offset of the iname member.
 */
#define EMBEDDED_NAME_MAX	(PATH_MAX - offsetof(struct filename, iname))
127

128
struct filename *
129 130
getname_flags(const char __user *filename, int flags, int *empty)
{
131
	struct filename *result;
132
	char *kname;
133
	int len;
134
	BUILD_BUG_ON(offsetof(struct filename, iname) % sizeof(long) != 0);
135

136 137 138 139
	result = audit_reusename(filename);
	if (result)
		return result;

140
	result = __getname();
141
	if (unlikely(!result))
142 143
		return ERR_PTR(-ENOMEM);

144 145 146 147
	/*
	 * First, try to embed the struct filename inside the names_cache
	 * allocation
	 */
Al Viro's avatar
Al Viro committed
148
	kname = (char *)result->iname;
149
	result->name = kname;
150

151
	len = strncpy_from_user(kname, filename, EMBEDDED_NAME_MAX);
152
	if (unlikely(len < 0)) {
153 154
		__putname(result);
		return ERR_PTR(len);
155
	}
156

157 158 159 160 161 162
	/*
	 * Uh-oh. We have a name that's approaching PATH_MAX. Allocate a
	 * separate struct filename so we can dedicate the entire
	 * names_cache allocation for the pathname, and re-do the copy from
	 * userland.
	 */
163
	if (unlikely(len == EMBEDDED_NAME_MAX)) {
Al Viro's avatar
Al Viro committed
164
		const size_t size = offsetof(struct filename, iname[1]);
165 166
		kname = (char *)result;

Al Viro's avatar
Al Viro committed
167 168 169 170 171 172
		/*
		 * size is chosen that way we to guarantee that
		 * result->iname[0] is within the same object and that
		 * kname can't be equal to result->iname, no matter what.
		 */
		result = kzalloc(size, GFP_KERNEL);
173 174 175
		if (unlikely(!result)) {
			__putname(kname);
			return ERR_PTR(-ENOMEM);
176 177
		}
		result->name = kname;
178 179 180 181 182 183 184 185 186 187 188
		len = strncpy_from_user(kname, filename, PATH_MAX);
		if (unlikely(len < 0)) {
			__putname(kname);
			kfree(result);
			return ERR_PTR(len);
		}
		if (unlikely(len == PATH_MAX)) {
			__putname(kname);
			kfree(result);
			return ERR_PTR(-ENAMETOOLONG);
		}
189 190
	}

191
	result->refcnt = 1;
192 193 194
	/* The empty path is special. */
	if (unlikely(!len)) {
		if (empty)
195
			*empty = 1;
196 197 198 199
		if (!(flags & LOOKUP_EMPTY)) {
			putname(result);
			return ERR_PTR(-ENOENT);
		}
Linus Torvalds's avatar
Linus Torvalds committed
200
	}
201

202
	result->uptr = filename;
203
	result->aname = NULL;
204 205
	audit_getname(result);
	return result;
Linus Torvalds's avatar
Linus Torvalds committed
206 207
}

208 209
/*
 * getname - copy a user-space pathname with default handling
 * @filename:	user-space pointer to the pathname
 *
 * Shorthand for getname_flags() with no flags and no "empty" out-parameter.
 */
struct filename *
getname(const char __user * filename)
{
	const int no_flags = 0;

	return getname_flags(filename, no_flags, NULL);
}

214 215 216 217
struct filename *
getname_kernel(const char * filename)
{
	struct filename *result;
218
	int len = strlen(filename) + 1;
219 220 221 222 223

	result = __getname();
	if (unlikely(!result))
		return ERR_PTR(-ENOMEM);

224
	if (len <= EMBEDDED_NAME_MAX) {
Al Viro's avatar
Al Viro committed
225
		result->name = (char *)result->iname;
226
	} else if (len <= PATH_MAX) {
227
		const size_t size = offsetof(struct filename, iname[1]);
228 229
		struct filename *tmp;

230
		tmp = kmalloc(size, GFP_KERNEL);
231 232 233 234 235 236 237 238 239 240 241
		if (unlikely(!tmp)) {
			__putname(result);
			return ERR_PTR(-ENOMEM);
		}
		tmp->name = (char *)result;
		result = tmp;
	} else {
		__putname(result);
		return ERR_PTR(-ENAMETOOLONG);
	}
	memcpy((char *)result->name, filename, len);
242 243
	result->uptr = NULL;
	result->aname = NULL;
244
	result->refcnt = 1;
245
	audit_getname(result);
246 247 248 249

	return result;
}

250
void putname(struct filename *name)
Linus Torvalds's avatar
Linus Torvalds committed
251
{
252 253 254 255 256
	BUG_ON(name->refcnt <= 0);

	if (--name->refcnt > 0)
		return;

Al Viro's avatar
Al Viro committed
257
	if (name->name != name->iname) {
258 259 260 261
		__putname(name->name);
		kfree(name);
	} else
		__putname(name);
Linus Torvalds's avatar
Linus Torvalds committed
262 263
}

264 265
static int check_acl(struct inode *inode, int mask)
{
266
#ifdef CONFIG_FS_POSIX_ACL
267 268 269
	struct posix_acl *acl;

	if (mask & MAY_NOT_BLOCK) {
270 271
		acl = get_cached_acl_rcu(inode, ACL_TYPE_ACCESS);
	        if (!acl)
272
	                return -EAGAIN;
273
		/* no ->get_acl() calls in RCU mode... */
274
		if (is_uncached_acl(acl))
275
			return -ECHILD;
276
	        return posix_acl_permission(inode, acl, mask & ~MAY_NOT_BLOCK);
277 278
	}

279 280 281
	acl = get_acl(inode, ACL_TYPE_ACCESS);
	if (IS_ERR(acl))
		return PTR_ERR(acl);
282 283 284 285 286
	if (acl) {
	        int error = posix_acl_permission(inode, acl, mask);
	        posix_acl_release(acl);
	        return error;
	}
287
#endif
288 289 290 291

	return -EAGAIN;
}

292
/*
293
 * This does the basic permission checking
Linus Torvalds's avatar
Linus Torvalds committed
294
 */
295
static int acl_permission_check(struct inode *inode, int mask)
Linus Torvalds's avatar
Linus Torvalds committed
296
{
297
	unsigned int mode = inode->i_mode;
Linus Torvalds's avatar
Linus Torvalds committed
298

299
	if (likely(uid_eq(current_fsuid(), inode->i_uid)))
Linus Torvalds's avatar
Linus Torvalds committed
300 301
		mode >>= 6;
	else {
302
		if (IS_POSIXACL(inode) && (mode & S_IRWXG)) {
303
			int error = check_acl(inode, mask);
304 305
			if (error != -EAGAIN)
				return error;
Linus Torvalds's avatar
Linus Torvalds committed
306 307 308 309 310 311 312 313 314
		}

		if (in_group_p(inode->i_gid))
			mode >>= 3;
	}

	/*
	 * If the DACs are ok we don't need any capability check.
	 */
315
	if ((mask & ~mode & (MAY_READ | MAY_WRITE | MAY_EXEC)) == 0)
Linus Torvalds's avatar
Linus Torvalds committed
316
		return 0;
317 318 319 320
	return -EACCES;
}

/**
321
 * generic_permission -  check for access rights on a Posix-like filesystem
322
 * @inode:	inode to check access rights for
323
 * @mask:	right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC, ...)
324 325 326 327
 *
 * Used to check for read/write/execute permissions on a file.
 * We use "fsuid" for this, letting us set arbitrary permissions
 * for filesystem access without changing the "normal" uids which
328 329 330 331 332
 * are used for other things.
 *
 * generic_permission is rcu-walk aware. It returns -ECHILD in case an rcu-walk
 * request cannot be satisfied (eg. requires blocking or too much complexity).
 * It would then be called again in ref-walk mode.
333
 */
334
int generic_permission(struct inode *inode, int mask)
335 336 337 338
{
	int ret;

	/*
339
	 * Do the basic permission checks.
340
	 */
341
	ret = acl_permission_check(inode, mask);
342 343
	if (ret != -EACCES)
		return ret;
Linus Torvalds's avatar
Linus Torvalds committed
344

345 346 347
	if (S_ISDIR(inode->i_mode)) {
		/* DACs are overridable for directories */
		if (!(mask & MAY_WRITE))
348 349
			if (capable_wrt_inode_uidgid(inode,
						     CAP_DAC_READ_SEARCH))
350
				return 0;
351
		if (capable_wrt_inode_uidgid(inode, CAP_DAC_OVERRIDE))
Linus Torvalds's avatar
Linus Torvalds committed
352
			return 0;
353 354
		return -EACCES;
	}
Linus Torvalds's avatar
Linus Torvalds committed
355 356 357 358

	/*
	 * Searching includes executable on directories, else just read.
	 */
359
	mask &= MAY_READ | MAY_WRITE | MAY_EXEC;
360
	if (mask == MAY_READ)
361
		if (capable_wrt_inode_uidgid(inode, CAP_DAC_READ_SEARCH))
Linus Torvalds's avatar
Linus Torvalds committed
362
			return 0;
363 364 365 366 367 368 369 370
	/*
	 * Read/write DACs are always overridable.
	 * Executable DACs are overridable when there is
	 * at least one exec bit set.
	 */
	if (!(mask & MAY_EXEC) || (inode->i_mode & S_IXUGO))
		if (capable_wrt_inode_uidgid(inode, CAP_DAC_OVERRIDE))
			return 0;
Linus Torvalds's avatar
Linus Torvalds committed
371 372 373

	return -EACCES;
}
374
EXPORT_SYMBOL(generic_permission);
Linus Torvalds's avatar
Linus Torvalds committed
375

376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395
/*
 * The common case should go straight to generic_permission() without
 * looking at inode->i_op at all.  IOP_FASTPERM in inode->i_opflags caches
 * the fact that the inode has no ->permission method of its own.
 */
static inline int do_inode_permission(struct inode *inode, int mask)
{
	if (likely(inode->i_opflags & IOP_FASTPERM))
		return generic_permission(inode, mask);

	if (likely(inode->i_op->permission))
		return inode->i_op->permission(inode, mask);

	/* No method: remember that, once per inode lifetime. */
	spin_lock(&inode->i_lock);
	inode->i_opflags |= IOP_FASTPERM;
	spin_unlock(&inode->i_lock);

	return generic_permission(inode, mask);
}

396 397 398
/**
 * sb_permission - Check superblock-level permissions
 * @sb: Superblock of inode to check permission on
399
 * @inode: Inode to check permission on
400 401 402 403 404 405 406 407 408 409
 * @mask: Right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC)
 *
 * Separate out file-system wide checks from inode-specific permission checks.
 */
static int sb_permission(struct super_block *sb, struct inode *inode, int mask)
{
	if (unlikely(mask & MAY_WRITE)) {
		umode_t mode = inode->i_mode;

		/* Nobody gets write access to a read-only fs. */
410
		if (sb_rdonly(sb) && (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)))
411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433
			return -EROFS;
	}
	return 0;
}

/**
 * inode_permission - Check for access rights to a given inode
 * @inode: Inode to check permission on
 * @mask: Right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC)
 *
 * Check for read/write/execute permissions on an inode.  We use fs[ug]id for
 * this, letting us set arbitrary permissions for filesystem access without
 * changing the "normal" UIDs which are used for other things.
 *
 * When checking for MAY_APPEND, MAY_WRITE must also be set in @mask.
 */
int inode_permission(struct inode *inode, int mask)
{
	int retval;

	retval = sb_permission(inode->i_sb, inode, mask);
	if (retval)
		return retval;
434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459

	if (unlikely(mask & MAY_WRITE)) {
		/*
		 * Nobody gets write access to an immutable file.
		 */
		if (IS_IMMUTABLE(inode))
			return -EPERM;

		/*
		 * Updating mtime will likely cause i_uid and i_gid to be
		 * written back improperly if their true value is unknown
		 * to the vfs.
		 */
		if (HAS_UNMAPPED_ID(inode))
			return -EACCES;
	}

	retval = do_inode_permission(inode, mask);
	if (retval)
		return retval;

	retval = devcgroup_inode_permission(inode, mask);
	if (retval)
		return retval;

	return security_inode_permission(inode, mask);
460
}
461
EXPORT_SYMBOL(inode_permission);
462

Jan Blunck's avatar
Jan Blunck committed
463 464 465 466 467 468
/**
 * path_get - get a reference to a path
 * @path: path to get the reference to
 *
 * Given a path increment the reference count to the dentry and the vfsmount.
 */
469
void path_get(const struct path *path)
Jan Blunck's avatar
Jan Blunck committed
470 471 472 473 474 475
{
	mntget(path->mnt);
	dget(path->dentry);
}
EXPORT_SYMBOL(path_get);

Jan Blunck's avatar
Jan Blunck committed
476 477 478 479 480 481
/**
 * path_put - put a reference to a path
 * @path: path to put the reference to
 *
 * Given a path decrement the reference count to the dentry and the vfsmount.
 */
482
void path_put(const struct path *path)
Linus Torvalds's avatar
Linus Torvalds committed
483
{
Jan Blunck's avatar
Jan Blunck committed
484 485
	dput(path->dentry);
	mntput(path->mnt);
Linus Torvalds's avatar
Linus Torvalds committed
486
}
Jan Blunck's avatar
Jan Blunck committed
487
EXPORT_SYMBOL(path_put);
Linus Torvalds's avatar
Linus Torvalds committed
488

489
#define EMBEDDED_LEVELS 2
490 491
struct nameidata {
	struct path	path;
Al Viro's avatar
Al Viro committed
492
	struct qstr	last;
493 494 495
	struct path	root;
	struct inode	*inode; /* path.dentry.d_inode */
	unsigned int	flags;
496
	unsigned	seq, m_seq;
497 498
	int		last_type;
	unsigned	depth;
499
	int		total_link_count;
500 501
	struct saved {
		struct path link;
502
		struct delayed_call done;
503
		const char *name;
504
		unsigned seq;
505
	} *stack, internal[EMBEDDED_LEVELS];
506 507
	struct filename	*name;
	struct nameidata *saved;
508
	struct inode	*link_inode;
509 510
	unsigned	root_seq;
	int		dfd;
511
} __randomize_layout;
512

513
static void set_nameidata(struct nameidata *p, int dfd, struct filename *name)
514
{
515 516
	struct nameidata *old = current->nameidata;
	p->stack = p->internal;
517 518
	p->dfd = dfd;
	p->name = name;
519
	p->total_link_count = old ? old->total_link_count : 0;
520
	p->saved = old;
521
	current->nameidata = p;
522 523
}

524
static void restore_nameidata(void)
525
{
526
	struct nameidata *now = current->nameidata, *old = now->saved;
527 528 529 530

	current->nameidata = old;
	if (old)
		old->total_link_count = now->total_link_count;
531
	if (now->stack != now->internal)
532
		kfree(now->stack);
533 534 535 536
}

static int __nd_alloc_stack(struct nameidata *nd)
{
537 538 539
	struct saved *p;

	if (nd->flags & LOOKUP_RCU) {
540
		p= kmalloc_array(MAXSYMLINKS, sizeof(struct saved),
541 542 543 544
				  GFP_ATOMIC);
		if (unlikely(!p))
			return -ECHILD;
	} else {
545
		p= kmalloc_array(MAXSYMLINKS, sizeof(struct saved),
546
				  GFP_KERNEL);
547 548 549
		if (unlikely(!p))
			return -ENOMEM;
	}
550 551 552 553 554
	memcpy(p, nd->internal, sizeof(nd->internal));
	nd->stack = p;
	return 0;
}

555 556 557 558 559 560 561 562 563 564
/**
 * path_connected - Verify that a path->dentry is below path->mnt.mnt_root
 * @path: nameidate to verify
 *
 * Rename can sometimes move a file or directory outside of a bind
 * mount, path_connected allows those cases to be detected.
 */
static bool path_connected(const struct path *path)
{
	struct vfsmount *mnt = path->mnt;
565
	struct super_block *sb = mnt->mnt_sb;
566

567 568
	/* Bind mounts and multi-root filesystems can have disconnected paths */
	if (!(sb->s_iflags & SB_I_MULTIROOT) && (mnt->mnt_root == sb->s_root))
569 570 571 572 573
		return true;

	return is_subdir(path->dentry, mnt->mnt_root);
}

574 575
static inline int nd_alloc_stack(struct nameidata *nd)
{
576
	if (likely(nd->depth != EMBEDDED_LEVELS))
577 578 579 580 581 582
		return 0;
	if (likely(nd->stack != nd->internal))
		return 0;
	return __nd_alloc_stack(nd);
}

583 584 585 586 587
/*
 * Run and then clear the delayed call of every entry on the symlink
 * stack, from the top entry down to entry 0.  nd->depth is not changed.
 */
static void drop_links(struct nameidata *nd)
{
	int i;

	for (i = nd->depth; i--; ) {
		struct saved *entry = &nd->stack[i];

		do_delayed_call(&entry->done);
		clear_delayed_call(&entry->done);
	}
}

/*
 * End a walk.  In RCU mode this leaves RCU mode and forgets nd->root
 * (unless LOOKUP_ROOT); in ref-walk mode it drops the references held on
 * nd->path, on every stacked link and, unless LOOKUP_ROOT, on nd->root.
 * In both modes the pending delayed calls are run and the symlink stack
 * is emptied.
 */
static void terminate_walk(struct nameidata *nd)
{
	drop_links(nd);

	if (nd->flags & LOOKUP_RCU) {
		nd->flags &= ~LOOKUP_RCU;
		if (!(nd->flags & LOOKUP_ROOT))
			nd->root.mnt = NULL;
		rcu_read_unlock();
	} else {
		int level;

		path_put(&nd->path);
		for (level = 0; level < nd->depth; level++)
			path_put(&nd->stack[level].link);

		if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT)) {
			path_put(&nd->root);
			nd->root.mnt = NULL;
		}
	}

	nd->depth = 0;
}

/*
 * Try to turn @path, observed without references, into a properly
 * referenced path: legitimize the mount against nd->m_seq, take a
 * reference on the dentry and re-check the dentry's d_seq against @seq.
 *
 * Returns true only if all three steps succeed.  On the failure paths
 * the members that must not be put are set to NULL, which is why
 * path_put is needed afterwards regardless of success or failure.
 */
static bool legitimize_path(struct nameidata *nd,
			    struct path *path, unsigned seq)
{
	int res = __legitimize_mnt(path->mnt, nd->m_seq);
	if (unlikely(res)) {
		/*
		 * NOTE(review): a positive result presumably means no mount
		 * reference is held - confirm against __legitimize_mnt().
		 */
		if (res > 0)
			path->mnt = NULL;
		/* The dentry reference was never attempted. */
		path->dentry = NULL;
		return false;
	}
	if (unlikely(!lockref_get_not_dead(&path->dentry->d_lockref))) {
		/* No dentry reference was obtained; the mount stays set. */
		path->dentry = NULL;
		return false;
	}
	/* Both members stay set here, even when the seq check fails. */
	return !read_seqcount_retry(&path->dentry->d_seq, seq);
}

/*
 * Legitimize every link on the symlink stack, bottom to top.
 *
 * On the first failure the delayed calls of all entries are dropped and
 * nd->depth is cut down to include the failed entry (legitimize_path()
 * has already NULLed whatever must not be put), then false is returned.
 * Returns true if every entry could be legitimized.
 */
static bool legitimize_links(struct nameidata *nd)
{
	int level = 0;

	while (level < nd->depth) {
		struct saved *entry = &nd->stack[level];

		if (likely(legitimize_path(nd, &entry->link, entry->seq))) {
			level++;
			continue;
		}

		drop_links(nd);
		nd->depth = level + 1;
		return false;
	}

	return true;
}

646
/*
647
 * Path walking has 2 modes, rcu-walk and ref-walk (see
648 649
 * Documentation/filesystems/path-lookup.txt).  In situations when we can't
 * continue in RCU mode, we attempt to drop out of rcu-walk mode and grab
650
 * normal reference counts on dentries and vfsmounts to transition to ref-walk
651 652 653 654
 * mode.  Refcounts are grabbed at the last known good point before rcu-walk
 * got stuck, so ref-walk may continue from there. If this is not successful
 * (eg. a seqcount has changed), then failure is returned and it's up to caller
 * to restart the path walk from the beginning in ref-walk mode.
655 656 657
 */

/**
658 659
 * unlazy_walk - try to switch to ref-walk mode.
 * @nd: nameidata pathwalk data
660
 * Returns: 0 on success, -ECHILD on failure
661
 *
Al Viro's avatar
Al Viro committed
662 663 664
 * unlazy_walk attempts to legitimize the current nd->path and nd->root
 * for ref-walk mode.
 * Must be called from rcu-walk context.
665 666
 * Nothing should touch nameidata between unlazy_walk() failure and
 * terminate_walk().
667
 */
Al Viro's avatar
Al Viro committed
668
static int unlazy_walk(struct nameidata *nd)
669 670 671 672
{
	struct dentry *parent = nd->path.dentry;

	BUG_ON(!(nd->flags & LOOKUP_RCU));
673

Al Viro's avatar
Al Viro committed
674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714
	nd->flags &= ~LOOKUP_RCU;
	if (unlikely(!legitimize_links(nd)))
		goto out2;
	if (unlikely(!legitimize_path(nd, &nd->path, nd->seq)))
		goto out1;
	if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT)) {
		if (unlikely(!legitimize_path(nd, &nd->root, nd->root_seq)))
			goto out;
	}
	rcu_read_unlock();
	BUG_ON(nd->inode != parent->d_inode);
	return 0;

out2:
	nd->path.mnt = NULL;
	nd->path.dentry = NULL;
out1:
	if (!(nd->flags & LOOKUP_ROOT))
		nd->root.mnt = NULL;
out:
	rcu_read_unlock();
	return -ECHILD;
}

/**
 * unlazy_child - try to switch to ref-walk mode.
 * @nd: nameidata pathwalk data
 * @dentry: child of nd->path.dentry
 * @seq: seq number to check dentry against
 * Returns: 0 on success, -ECHILD on failure
 *
 * unlazy_child attempts to legitimize the current nd->path, nd->root and dentry
 * for ref-walk mode.  @dentry must be a path found by a do_lookup call on
 * @nd.  Must be called from rcu-walk context.
 * Nothing should touch nameidata between unlazy_child() failure and
 * terminate_walk().
 */
static int unlazy_child(struct nameidata *nd, struct dentry *dentry, unsigned seq)
{
	BUG_ON(!(nd->flags & LOOKUP_RCU));

715
	nd->flags &= ~LOOKUP_RCU;
716 717 718 719
	if (unlikely(!legitimize_links(nd)))
		goto out2;
	if (unlikely(!legitimize_mnt(nd->path.mnt, nd->m_seq)))
		goto out2;
Al Viro's avatar
Al Viro committed
720
	if (unlikely(!lockref_get_not_dead(&nd->path.dentry->d_lockref)))
721
		goto out1;
Al Viro's avatar
Al Viro committed
722

723
	/*
Al Viro's avatar
Al Viro committed
724 725 726 727 728
	 * We need to move both the parent and the dentry from the RCU domain
	 * to be properly refcounted. And the sequence number in the dentry
	 * validates *both* dentry counters, since we checked the sequence
	 * number of the parent after we got the child sequence number. So we
	 * know the parent must still be valid if the child sequence number is
729
	 */
Al Viro's avatar
Al Viro committed
730 731 732 733 734 735
	if (unlikely(!lockref_get_not_dead(&dentry->d_lockref)))
		goto out;
	if (unlikely(read_seqcount_retry(&dentry->d_seq, seq))) {
		rcu_read_unlock();
		dput(dentry);
		goto drop_root_mnt;
736
	}
737 738 739 740 741
	/*
	 * Sequence counts matched. Now make sure that the root is
	 * still valid and get it if required.
	 */
	if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT)) {
742 743 744 745
		if (unlikely(!legitimize_path(nd, &nd->root, nd->root_seq))) {
			rcu_read_unlock();
			dput(dentry);
			return -ECHILD;
746
		}
747 748
	}

749
	rcu_read_unlock();
750
	return 0;
751

752 753 754 755
out2:
	nd->path.mnt = NULL;
out1:
	nd->path.dentry = NULL;
756
out:
757
	rcu_read_unlock();
758 759 760
drop_root_mnt:
	if (!(nd->flags & LOOKUP_ROOT))
		nd->root.mnt = NULL;
761 762 763
	return -ECHILD;
}

764
static inline int d_revalidate(struct dentry *dentry, unsigned int flags)
765
{
766 767 768 769
	if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE))
		return dentry->d_op->d_revalidate(dentry, flags);
	else
		return 1;
770 771
}

772 773 774
/**
 * complete_walk - successful completion of path walk
 * @nd:  pointer nameidata
775
 *
776 777 778 779 780
 * If we had been in RCU mode, drop out of it and legitimize nd->path.
 * Revalidate the final result, unless we'd already done that during
 * the path walk or the filesystem doesn't ask for it.  Return 0 on
 * success, -error on failure.  In case of failure caller does not
 * need to drop nd->path.
781
 */
782
static int complete_walk(struct nameidata *nd)
783
{
784
	struct dentry *dentry = nd->path.dentry;
785 786
	int status;

787 788 789
	if (nd->flags & LOOKUP_RCU) {
		if (!(nd->flags & LOOKUP_ROOT))
			nd->root.mnt = NULL;
Al Viro's avatar
Al Viro committed
790
		if (unlikely(unlazy_walk(nd)))
791 792 793
			return -ECHILD;
	}

794 795 796
	if (likely(!(nd->flags & LOOKUP_JUMPED)))
		return 0;

797
	if (likely(!(dentry->d_flags & DCACHE_OP_WEAK_REVALIDATE)))
798 799
		return 0;

800
	status = dentry->d_op->d_weak_revalidate(dentry, nd->flags);
801 802 803
	if (status > 0)
		return 0;

804
	if (!status)
805
		status = -ESTALE;
806

807 808 809
	return status;
}

810
static void set_root(struct nameidata *nd)
811
{
812
	struct fs_struct *fs = current->fs;
Nick Piggin's avatar
Nick Piggin committed
813

814 815 816 817 818 819 820 821 822 823 824
	if (nd->flags & LOOKUP_RCU) {
		unsigned seq;

		do {
			seq = read_seqcount_begin(&fs->seq);
			nd->root = fs->root;
			nd->root_seq = __read_seqcount_begin(&nd->root.dentry->d_seq);
		} while (read_seqcount_retry(&fs->seq, seq));
	} else {
		get_fs_root(fs, &nd->root);
	}
825 826
}

Jan Blunck's avatar
Jan Blunck committed
827
static void path_put_conditional(struct path *path, struct nameidata *nd)
828 829
{
	dput(path->dentry);
830
	if (path->mnt != nd->path.mnt)
831 832 833
		mntput(path->mnt);
}

834 835
static inline void path_to_nameidata(const struct path *path,
					struct nameidata *nd)
836
{
837 838 839 840
	if (!(nd->flags & LOOKUP_RCU)) {
		dput(nd->path.dentry);
		if (nd->path.mnt != path->mnt)
			mntput(nd->path.mnt);
841
	}
842
	nd->path.mnt = path->mnt;
843
	nd->path.dentry = path->dentry;
844 845
}

846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865
/*
 * Restart the walk at nd->root and mark it LOOKUP_JUMPED.
 *
 * Returns 0 on success, or -ECHILD in RCU mode when the root dentry's
 * d_seq no longer matches nd->root_seq.
 */
static int nd_jump_root(struct nameidata *nd)
{
	if (!(nd->flags & LOOKUP_RCU)) {
		/* ref-walk: trade the reference on the old position. */
		path_put(&nd->path);
		nd->path = nd->root;
		path_get(&nd->path);
		nd->inode = nd->path.dentry->d_inode;
	} else {
		struct dentry *root;

		nd->path = nd->root;
		root = nd->path.dentry;
		nd->inode = root->d_inode;
		nd->seq = nd->root_seq;
		if (unlikely(read_seqcount_retry(&root->d_seq, nd->seq)))
			return -ECHILD;
	}

	nd->flags |= LOOKUP_JUMPED;
	return 0;
}

Christoph Hellwig's avatar
Christoph Hellwig committed
866
/*
867
 * Helper to directly jump to a known parsed path from ->get_link,
Christoph Hellwig's avatar
Christoph Hellwig committed
868 869
 * caller must have taken a reference to path beforehand.
 */
870
void nd_jump_link(struct path *path)
Christoph Hellwig's avatar
Christoph Hellwig committed
871
{
872
	struct nameidata *nd = current->nameidata;
Christoph Hellwig's avatar
Christoph Hellwig committed
873 874 875 876 877 878 879
	path_put(&nd->path);

	nd->path = *path;
	nd->inode = nd->path.dentry->d_inode;
	nd->flags |= LOOKUP_JUMPED;
}

880
static inline void put_link(struct nameidata *nd)
881
{
882
	struct saved *last = nd->stack + --nd->depth;
883
	do_delayed_call(&last->done);
884 885
	if (!(nd->flags & LOOKUP_RCU))
		path_put(&last->link);
886 887
}

888 889
/*
 * Protection switches, all initially 0.  A zero value of
 * sysctl_protected_symlinks / sysctl_protected_hardlinks disables the
 * corresponding restriction in may_follow_link() / may_linkat().
 */
int sysctl_protected_symlinks __read_mostly;
int sysctl_protected_hardlinks __read_mostly;
int sysctl_protected_fifos __read_mostly;
int sysctl_protected_regular __read_mostly;
Kees Cook's avatar
Kees Cook committed
892 893 894

/**
 * may_follow_link - Check symlink following for unsafe situations
895
 * @nd: nameidata pathwalk data
Kees Cook's avatar
Kees Cook committed
896 897 898 899 900 901 902 903 904 905 906 907
 *
 * In the case of the sysctl_protected_symlinks sysctl being enabled,
 * CAP_DAC_OVERRIDE needs to be specifically ignored if the symlink is
 * in a sticky world-writable directory. This is to protect privileged
 * processes from failing races against path names that may change out
 * from under them by way of other users creating malicious symlinks.
 * It will permit symlinks to be followed only when outside a sticky
 * world-writable directory, or when the uid of the symlink and follower
 * match, or when the directory owner matches the symlink's owner.
 *
 * Returns 0 if following the symlink is allowed, -ve on error.
 */
908
static inline int may_follow_link(struct nameidata *nd)
Kees Cook's avatar
Kees Cook committed
909 910 911
{
	const struct inode *inode;
	const struct inode *parent;
912
	kuid_t puid;
Kees Cook's avatar
Kees Cook committed
913 914 915 916 917

	if (!sysctl_protected_symlinks)
		return 0;

	/* Allowed if owner and follower match. */
918
	inode = nd->link_inode;
919
	if (uid_eq(current_cred()->fsuid, inode->i_uid))
Kees Cook's avatar
Kees Cook committed
920 921 922
		return 0;

	/* Allowed if parent directory not sticky and world-writable. */
923
	parent = nd->inode;
Kees Cook's avatar
Kees Cook committed
924 925 926 927
	if ((parent->i_mode & (S_ISVTX|S_IWOTH)) != (S_ISVTX|S_IWOTH))
		return 0;

	/* Allowed if parent directory and link owner match. */
928 929
	puid = parent->i_uid;
	if (uid_valid(puid) && uid_eq(puid, inode->i_uid))
Kees Cook's avatar
Kees Cook committed
930 931
		return 0;

932 933 934
	if (nd->flags & LOOKUP_RCU)
		return -ECHILD;

935
	audit_inode(nd->name, nd->stack[0].link.dentry, 0);
936
	audit_log_link_denied("follow_link");
Kees Cook's avatar
Kees Cook committed
937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982
	return -EACCES;
}

/**
 * safe_hardlink_source - Check for safe hardlink conditions
 * @inode: the source inode to hardlink from
 *
 * Return false if at least one of the following conditions:
 *    - inode is not a regular file
 *    - inode is setuid
 *    - inode is setgid and group-exec
 *    - access failure for read and write
 *
 * Otherwise returns true.
 */
static bool safe_hardlink_source(struct inode *inode)
{
	const umode_t setgid_exec = S_ISGID | S_IXGRP;
	umode_t mode = inode->i_mode;

	/*
	 * Neither special files nor setuid files should get pinned to
	 * the filesystem.
	 */
	if (!S_ISREG(mode) || (mode & S_ISUID))
		return false;

	/* Executable setgid files should not get pinned to the filesystem. */
	if ((mode & setgid_exec) == setgid_exec)
		return false;

	/* Hardlinking to unreadable or unwritable sources is dangerous. */
	return inode_permission(inode, MAY_READ | MAY_WRITE) == 0;
}

/**
 * may_linkat - Check permissions for creating a hardlink
 * @link: the source to hardlink from
 *
 * Block hardlink when all of:
 *  - sysctl_protected_hardlinks enabled
 *  - fsuid does not match inode
 *  - hardlink source is unsafe (see safe_hardlink_source() above)
983
 *  - not CAP_FOWNER in a namespace with the inode owner uid mapped
Kees Cook's avatar
Kees Cook committed
984 985 986 987 988
 *
 * Returns 0 if successful, -ve on error.
 */
static int may_linkat(struct path *link)
{
989 990 991 992 993
	struct inode *inode = link->dentry->d_inode;

	/* Inode writeback is not safe when the uid or gid are invalid. */
	if (!uid_valid(inode->i_uid) || !gid_valid(inode->i_gid))
		return -EOVERFLOW;
Kees Cook's avatar
Kees Cook committed
994 995 996 997 998 999 1000

	if (!sysctl_protected_hardlinks)
		return 0;

	/* Source inode owner (or CAP_FOWNER) can hardlink all they like,
	 * otherwise, it must be a safe source.
	 */
1001
	if (safe_hardlink_source(inode) || inode_owner_or_capable(inode))
Kees Cook's avatar
Kees Cook committed
1002 1003
		return 0;

1004
	audit_log_link_denied("linkat");
Kees Cook's avatar
Kees Cook committed
1005 1006 1007
	return -EPERM;
}

/**
 * may_create_in_sticky - Check whether an O_CREAT open in a sticky directory
 *			  should be allowed, or not, on files that already
 *			  exist.
 * @dir: the sticky parent directory
 * @inode: the inode of the file to open
 *
 * Block an O_CREAT open of a FIFO (or a regular file) when:
 *   - sysctl_protected_fifos (or sysctl_protected_regular) is enabled
 *   - the file already exists
 *   - we are in a sticky directory
 *   - we don't own the file
 *   - the owner of the directory doesn't own the file
 *   - the directory is world writable
 * If the sysctl_protected_fifos (or sysctl_protected_regular) is set to 2
 * the directory doesn't have to be world writable: being group writable will
 * be enough.
 *
 * Returns 0 if the open is allowed, -ve on error.
 */
static int may_create_in_sticky(struct dentry * const dir,
				struct inode * const inode)
{
	if ((!sysctl_protected_fifos && S_ISFIFO(inode->i_mode)) ||
	    (!sysctl_protected_regular && S_ISREG(inode->i_mode)) ||
	    likely(!(dir->d_inode->i_mode & S_ISVTX)) ||
	    uid_eq(inode->i_uid, dir->d_inode->i_uid) ||
	    uid_eq(current_fsuid(), inode->i_uid))
		return 0;

	if (likely(dir->d_inode->i_mode & 0002) ||
	    (dir->d_inode->i_mode & 0020 &&
	     ((sysctl_protected_fifos >= 2 && S_ISFIFO(inode->i_mode)) ||
	      (sysctl_protected_regular >= 2 && S_ISREG(inode->i_mode))))) {
		return -EACCES;
	}
	return 0;
}

static __always_inline
const char *get_link(struct nameidata *nd)
Linus Torvalds's avatar
Linus Torvalds committed
1049
{
1050
	struct saved *last = nd->stack + nd->depth - 1;
Al Viro's avatar
Al Viro committed
1051
	struct dentry *dentry = last->link.dentry;
1052
	struct inode *inode = nd->link_inode;
1053
	int error;
1054
	const char *res;
Linus Torvalds's avatar
Linus Torvalds committed
1055

1056 1057 1058
	if (!(nd->flags & LOOKUP_RCU)) {
		touch_atime(&last->link);
		cond_resched();
1059
	} else if (atime_needs_update(&last->link, inode)) {
Al Viro's avatar
Al Viro committed
1060
		if (unlikely(unlazy_walk(nd)))
1061
			return ERR_PTR(-ECHILD);
1062
		touch_atime(&last->link);
1063
	}
1064

1065 1066 1067
	error = security_inode_follow_link(dentry, inode,
					   nd->flags & LOOKUP_RCU);
	if (unlikely(error))
1068
		return ERR_PTR(error);
1069

1070
	nd->last_type = LAST_BIND;
1071 1072
	res = inode->i_link;
	if (!res) {
1073 1074 1075
		const char * (*get)(struct dentry *, struct inode *,
				struct delayed_call *);
		get = inode->i_op->get_link;
1076
		if (nd->flags & LOOKUP_RCU) {
1077
			res = get(NULL, inode, &last->done);
1078
			if (res == ERR_PTR(-ECHILD)) {
Al Viro's avatar
Al Viro committed
1079
				if (unlikely(unlazy_walk(nd)))
1080
					return ERR_PTR(-ECHILD);
1081
				res = get(dentry, inode, &last->done);
1082 1083
			}
		} else {
1084
			res = get(dentry, inode, &last->done);
1085
		}
1086
		if (IS_ERR_OR_NULL(res))
1087 1088 1089
			return res;
	}
	if (*res == '/') {
1090 1091
		if (!nd->root.mnt)
			set_root(nd);
1092 1093
		if (unlikely(nd_jump_root(nd)))
			return ERR_PTR(-ECHILD);
1094 1095
		while (unlikely(*++res == '/'))
			;
Linus Torvalds's avatar
Linus Torvalds committed
1096
	}
1097 1098
	if (!*res)
		res = NULL;
1099 1100
	return res;
}
/*
 * follow_up - Find the mountpoint of path's vfsmount
 *
 * Given a path, find the mountpoint of its source file system.
 * Replace @path with the path of the mountpoint in the parent mount.
 * Up is towards /.
 *
 * Return 1 if we went up a level and 0 if we were already at the
 * root.
 */
1112
int follow_up(struct path *path)
Linus Torvalds's avatar
Linus Torvalds committed
1113
{
1114 1115
	struct mount *mnt = real_mount(path->mnt);
	struct mount *parent;
Linus Torvalds's avatar
Linus Torvalds committed
1116
	struct dentry *mountpoint;
Nick Piggin's avatar
Nick Piggin committed
1117

Al Viro's avatar
Al Viro committed
1118
	read_seqlock_excl(&mount_lock);
1119
	parent = mnt->mnt_parent;
Al Viro's avatar
Al Viro committed
1120
	if (parent == mnt) {
Al Viro's avatar
Al Viro committed
1121
		read_sequnlock_excl(&mount_lock);
Linus Torvalds's avatar
Linus Torvalds committed
1122 1123
		return 0;
	}
1124
	mntget(&parent->mnt);
1125
	mountpoint = dget(mnt->mnt_mountpoint);
Al Viro's avatar
Al Viro committed
1126
	read_sequnlock_excl(&mount_lock);
1127 1128 1129
	dput(path->dentry);
	path->dentry = mountpoint;
	mntput(path->mnt);
1130
	path->mnt = &parent->mnt;
Linus Torvalds's avatar
Linus Torvalds committed
1131 1132
	return 1;
}
1133
EXPORT_SYMBOL(follow_up);
/*
1136 1137 1138
 * Perform an automount
 * - return -EISDIR to tell follow_managed() to stop and return the path we
 *   were called with.
Linus Torvalds's avatar
Linus Torvalds committed
1139
 */
1140
static int follow_automount(struct path *path, struct nameidata *nd,
1141
			    bool *need_mntput)
1142
{
1143
	struct vfsmount *mnt;
1144
	int err;
1145 1146 1147 1148

	if (!path->dentry->d_op || !path->dentry->d_op->d_automount)
		return -EREMOTE;

1149 1150 1151 1152 1153 1154 1155 1156 1157 1158
	/* We don't want to mount if someone's just doing a stat -
	 * unless they're stat'ing a directory and appended a '/' to
	 * the name.
	 *
	 * We do, however, want to mount if someone wants to open or
	 * create a file of any type under the mountpoint, wants to
	 * traverse through the mountpoint or wants to open the
	 * mounted directory.  Also, autofs may mark negative dentries
	 * as being automount points.  These will need the attentions
	 * of the daemon to instantiate them before they can be used.
1159
	 */
1160
	if (!(nd->flags & (LOOKUP_PARENT | LOOKUP_DIRECTORY |
1161 1162 1163
			   LOOKUP_OPEN | LOOKUP_CREATE | LOOKUP_AUTOMOUNT)) &&
	    path->dentry->d_inode)
		return -EISDIR;
1164

1165 1166
	nd->total_link_count++;
	if (nd->total_link_count >= 40)
1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179
		return -ELOOP;

	mnt = path->dentry->d_op->d_automount(path);
	if (IS_ERR(mnt)) {
		/*
		 * The filesystem is allowed to return -EISDIR here to indicate
		 * it doesn't want to automount.  For instance, autofs would do
		 * this so that its userspace daemon can mount on this dentry.
		 *
		 * However, we can only permit this if it's a terminal point in
		 * the path being looked up; if it wasn't then the remainder of
		 * the path is inaccessible and we should say so.
		 */
1180
		if (PTR_ERR(mnt) == -EISDIR && (nd->flags & LOOKUP_PARENT))
1181 1182
			return -EREMOTE;
		return PTR_ERR(mnt);
1183
	}
1184

1185 1186
	if (!mnt) /* mount collision */
		return 0;
1187

1188 1189 1190 1191 1192
	if (!*need_mntput) {
		/* lock_mount() may release path->mnt on error */
		mntget(path->mnt);
		*need_mntput = true;
	}
1193
	err = finish_automount(mnt, path);
1194

1195 1196 1197
	switch (err) {
	case -EBUSY:
		/* Someone else made a mount here whilst we were busy */
1198
		return 0;
1199
	case 0:
1200
		path_put(path);
1201 1202 1203
		path->mnt = mnt;
		path->dentry = dget(mnt->mnt_root);
		return 0;
1204 1205
	default:
		return err;
1206
	}
1207

Al Viro's avatar
Al Viro committed
1208 1209
}

/*
 * Handle a dentry that is managed in some way.
1212
 * - Flagged for transit management (autofs)
1213 1214 1215 1216 1217 1218 1219
 * - Flagged as mountpoint
 * - Flagged as automount point
 *
 * This may only be called in refwalk mode.
 *
 * Serialization is taken care of in namespace.c
 */
1220
static int follow_managed(struct path *path, struct nameidata *nd)
Linus Torvalds's avatar
Linus Torvalds committed
1221
{
1222
	struct vfsmount *mnt = path->mnt; /* held by caller, must be left alone */
1223 1224
	unsigned managed;
	bool need_mntput = false;
1225
	int ret = 0;
1226 1227 1228 1229

	/* Given that we're not holding a lock here, we retain the value in a
	 * local variable for each dentry as we look at it so that we don't see
	 * the components of that value change under us */
1230
	while (managed = READ_ONCE(path->dentry->d_flags),
1231 1232
	       managed &= DCACHE_MANAGED_DENTRY,
	       unlikely(managed != 0)) {
1233 1234 1235 1236 1237
		/* Allow the filesystem to manage the transit without i_mutex
		 * being held. */
		if (managed & DCACHE_MANAGE_TRANSIT) {
			BUG_ON(!path->dentry->d_op);
			BUG_ON(!path->dentry->d_op->d_manage);
1238
			ret = path->dentry->d_op->d_manage(path, false);
1239
			if (ret < 0)
1240
				break;
1241 1242
		}

1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257
		/* Transit to a mounted filesystem. */
		if (managed & DCACHE_MOUNTED) {
			struct vfsmount *mounted = lookup_mnt(path);
			if (mounted) {
				dput(path->dentry);
				if (need_mntput)
					mntput(path->mnt);
				path->mnt = mounted;
				path->dentry = dget(mounted->mnt_root);
				need_mntput = true;
				continue;
			}

			/* Something is mounted on this dentry in another
			 * namespace and/or whatever was mounted there in this
Al Viro's avatar
Al Viro committed
1258 1259
			 * namespace got unmounted before lookup_mnt() could
			 * get it */
1260 1261 1262 1263
		}

		/* Handle an automount point */
		if (managed & DCACHE_NEED_AUTOMOUNT) {
1264
			ret = follow_automount(path, nd, &need_mntput);
1265
			if (ret < 0)
1266
				break;
1267 1268 1269 1270 1271
			continue;
		}

		/* We didn't change the current path point */
		break;
Linus Torvalds's avatar
Linus Torvalds committed
1272
	}
1273 1274 1275

	if (need_mntput && path->mnt == mnt)
		mntput(path->mnt);
1276 1277
	if (ret == -EISDIR || !ret)
		ret = 1;
1278 1279 1280 1281 1282
	if (need_mntput)
		nd->flags |= LOOKUP_JUMPED;
	if (unlikely(ret < 0))
		path_put_conditional(path, nd);
	return ret;
Linus Torvalds's avatar
Linus Torvalds committed
1283 1284
}

int follow_down_one(struct path *path)
Linus Torvalds's avatar
Linus Torvalds committed
1286 1287 1288
{
	struct vfsmount *mounted;

Al Viro's avatar
Al Viro committed
1289
	mounted = lookup_mnt(path);
Linus Torvalds's avatar
Linus Torvalds committed
1290
	if (mounted) {
Al Viro's avatar
Al Viro committed
1291 1292 1293 1294
		dput(path->dentry);
		mntput(path->mnt);
		path->mnt = mounted;
		path->dentry = dget(mounted->mnt_root);
Linus Torvalds's avatar
Linus Torvalds committed
1295 1296 1297 1298
		return 1;
	}
	return 0;
}
1299
EXPORT_SYMBOL(follow_down_one);
static inline int managed_dentry_rcu(const struct path *path)
1302
{
1303 1304
	return (path->dentry->d_flags & DCACHE_MANAGE_TRANSIT) ?
		path->dentry->d_op->d_manage(path, true) : 0;
1305 1306
}

/*
1308 1309
 * Try to skip to top of mountpoint pile in rcuwalk mode.  Fail if
 * we meet a managed dentry that would need blocking.
1310 1311
 */
static bool __follow_mount_rcu(struct nameidata *nd, struct path *path,
1312
			       struct inode **inode, unsigned *seqp)
1313
{
1314
	for (;;) {
1315
		struct mount *mounted;
1316 1317 1318 1319
		/*
		 * Don't forget we might have a non-mountpoint managed dentry
		 * that wants to block transit.
		 */
1320
		switch (managed_dentry_rcu(path)) {
1321 1322
		case -ECHILD:
		default:
1323
			return false;
1324 1325 1326 1327 1328
		case -EISDIR:
			return true;
		case 0:
			break;
		}
1329 1330

		if (!d_mountpoint(path->dentry))
1331
			return !(path->dentry->d_flags & DCACHE_NEED_AUTOMOUNT);
1332

1333
		mounted = __lookup_mnt(path->mnt, path->dentry);
1334 1335
		if (!mounted)
			break;
1336 1337
		path->mnt = &mounted->mnt;
		path->dentry = mounted->mnt.mnt_root;
1338
		nd->flags |= LOOKUP_JUMPED;
1339
		*seqp = read_seqcount_begin(&path->dentry->d_seq);
1340 1341 1342 1343 1344 1345
		/*
		 * Update the inode too. We don't need to re-check the
		 * dentry sequence number here after this d_inode read,
		 * because a mount-point is always pinned.
		 */
		*inode = path->dentry->d_inode;
1346
	}
1347
	return !read_seqretry(&mount_lock, nd->m_seq) &&
1348
		!(path->dentry->d_flags & DCACHE_NEED_AUTOMOUNT);
1349 1350
}

static int follow_dotdot_rcu(struct nameidata *nd)
{
1353
	struct inode *inode = nd->inode;
1354

1355
	while (1) {
1356
		if (path_equal(&nd->path, &nd->root))
1357 1358 1359 1360 1361 1362
			break;
		if (nd->path.dentry != nd->path.mnt->mnt_root) {
			struct dentry *old = nd->path.dentry;
			struct dentry *parent = old->d_parent;
			unsigned seq;

1363
			inode = parent->d_inode;
1364