futex.c
/*
 *  Fast Userspace Mutexes (which I call "Futexes!").
 *  (C) Rusty Russell, IBM 2002
 *
 *  Generalized futexes, futex requeueing, misc fixes by Ingo Molnar
 *  (C) Copyright 2003 Red Hat Inc, All Rights Reserved
 *
 *  Removed page pinning, fix privately mapped COW pages and other cleanups
 *  (C) Copyright 2003, 2004 Jamie Lokier
 *
 *  Robust futex support started by Ingo Molnar
 *  (C) Copyright 2006 Red Hat Inc, All Rights Reserved
 *  Thanks to Thomas Gleixner for suggestions, analysis and fixes.
 *
 *  PI-futex support started by Ingo Molnar and Thomas Gleixner
 *  Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
 *  Copyright (C) 2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com>
 *
 *  PRIVATE futexes by Eric Dumazet
 *  Copyright (C) 2007 Eric Dumazet <dada1@cosmosbay.com>
 *
 *  Requeue-PI support by Darren Hart <dvhltc@us.ibm.com>
 *  Copyright (C) IBM Corporation, 2009
 *  Thanks to Thomas Gleixner for conceptual design and careful reviews.
 *
 *  Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly
 *  enough at me, Linus for the original (flawed) idea, Matthew
 *  Kirkwood for proof-of-concept implementation.
 *
 *  "The futexes are also cursed."
 *  "But they come in a choice of three flavours!"
 *
 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 2 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program; if not, write to the Free Software
 *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */
#include <linux/slab.h>
#include <linux/poll.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/jhash.h>
#include <linux/init.h>
#include <linux/futex.h>
#include <linux/mount.h>
#include <linux/pagemap.h>
#include <linux/syscalls.h>
#include <linux/signal.h>
#include <linux/export.h>
#include <linux/magic.h>
#include <linux/pid.h>
#include <linux/nsproxy.h>
#include <linux/ptrace.h>
#include <linux/sched/rt.h>
#include <linux/sched/wake_q.h>
#include <linux/sched/mm.h>
#include <linux/hugetlb.h>
#include <linux/freezer.h>
#include <linux/bootmem.h>
#include <linux/fault-inject.h>

#include <asm/futex.h>

#include "locking/rtmutex_common.h"

/*
 * READ this before attempting to hack on futexes!
 *
 * Basic futex operation and ordering guarantees
 * =============================================
 *
 * The waiter reads the futex value in user space and calls
 * futex_wait(). This function computes the hash bucket and acquires
 * the hash bucket lock. After that it reads the futex user space value
 * again and verifies that the data has not changed. If it has not changed
 * it enqueues itself into the hash bucket, releases the hash bucket lock
 * and schedules.
 *
 * The waker side modifies the user space value of the futex and calls
 * futex_wake(). This function computes the hash bucket and acquires the
 * hash bucket lock. Then it looks for waiters on that futex in the hash
 * bucket and wakes them.
 *
 * In futex wake up scenarios where no tasks are blocked on a futex, taking
 * the hb spinlock can be avoided and the waker can simply return. In order
 * for this optimization to work, ordering guarantees must exist so that the
 * waiter being added to the list is acknowledged when the list is
 * concurrently being checked by the waker, avoiding scenarios like the
 * following:
 *
 * CPU 0                               CPU 1
 * val = *futex;
 * sys_futex(WAIT, futex, val);
 *   futex_wait(futex, val);
 *   uval = *futex;
 *                                     *futex = newval;
 *                                     sys_futex(WAKE, futex);
 *                                       futex_wake(futex);
 *                                       if (queue_empty())
 *                                         return;
 *   if (uval == val)
 *      lock(hash_bucket(futex));
 *      queue();
 *     unlock(hash_bucket(futex));
 *     schedule();
 *
 * This would cause the waiter on CPU 0 to wait forever because it
 * missed the transition of the user space value from val to newval
 * and the waker did not find the waiter in the hash bucket queue.
 *
 * The correct serialization ensures that a waiter either observes
 * the changed user space value before blocking or is woken by a
 * concurrent waker:
 *
 * CPU 0                                 CPU 1
 * val = *futex;
 * sys_futex(WAIT, futex, val);
 *   futex_wait(futex, val);
 *
 *   waiters++; (a)
 *   smp_mb(); (A) <-- paired with -.
 *                                  |
 *   lock(hash_bucket(futex));      |
 *                                  |
 *   uval = *futex;                 |
 *                                  |        *futex = newval;
 *                                  |        sys_futex(WAKE, futex);
 *                                  |          futex_wake(futex);
 *                                  |
 *                                  `--------> smp_mb(); (B)
 *   if (uval == val)
 *     queue();
 *     unlock(hash_bucket(futex));
 *     schedule();                         if (waiters)
 *                                           lock(hash_bucket(futex));
 *   else                                    wake_waiters(futex);
 *     waiters--; (b)                        unlock(hash_bucket(futex));
 *
 * Where (A) orders the waiters increment and the futex value read through
 * atomic operations (see hb_waiters_inc) and where (B) orders the write
 * to futex and the waiters read -- this is done by the barriers for both
 * shared and private futexes in get_futex_key_refs().
 *
 * This yields the following case (where X:=waiters, Y:=futex):
 *
 *	X = Y = 0
 *
 *	w[X]=1		w[Y]=1
 *	MB		MB
 *	r[Y]=y		r[X]=x
 *
 * Which guarantees that x==0 && y==0 is impossible; which translates back into
 * the guarantee that we cannot both miss the futex variable change and the
 * enqueue.
 *
 * Note that a new waiter is accounted for in (a) even when it is possible that
 * the wait call can return an error, in which case we backtrack from it in (b).
 * Refer to the comment in queue_lock().
 *
 * Similarly, in order to account for waiters being requeued on another
 * address we always increment the waiters for the destination bucket before
 * acquiring the lock and decrement them again after releasing it - the code
 * that actually moves the futex(es) between hash buckets (requeue_futex)
 * will do the additional required waiter count housekeeping. This is done
 * in double_lock_hb() and double_unlock_hb(), respectively.
 */
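
/*
 * Illustrative sketch, not part of the kernel source: the user space side of
 * the wait/wake protocol described above, assuming Linux with the futex(2)
 * system call reachable via syscall(2). The helper names below are made up
 * for this example and error handling is omitted.
 *
 *	#include <linux/futex.h>
 *	#include <limits.h>
 *	#include <stdatomic.h>
 *	#include <sys/syscall.h>
 *	#include <unistd.h>
 *
 *	static long futex_wait(atomic_int *uaddr, int val)
 *	{
 *		// The kernel sleeps only if *uaddr still equals val, so a
 *		// wakeup that happens after the caller's check cannot be lost.
 *		return syscall(SYS_futex, uaddr, FUTEX_WAIT_PRIVATE, val,
 *			       NULL, NULL, 0);
 *	}
 *
 *	static long futex_wake(atomic_int *uaddr, int nr)
 *	{
 *		return syscall(SYS_futex, uaddr, FUTEX_WAKE_PRIVATE, nr,
 *			       NULL, NULL, 0);
 *	}
 *
 *	// Waiter: block until *flag becomes nonzero.
 *	static void wait_for_flag(atomic_int *flag)
 *	{
 *		while (atomic_load(flag) == 0)
 *			futex_wait(flag, 0);
 *	}
 *
 *	// Waker: set the flag, then wake everybody sleeping on it.
 *	static void set_flag(atomic_int *flag)
 *	{
 *		atomic_store(flag, 1);
 *		futex_wake(flag, INT_MAX);
 *	}
 */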

#ifndef CONFIG_HAVE_FUTEX_CMPXCHG
int __read_mostly futex_cmpxchg_enabled;
#endif

/*
 * Futex flags used to encode options to functions and preserve them across
 * restarts.
 */
#ifdef CONFIG_MMU
# define FLAGS_SHARED		0x01
#else
/*
 * NOMMU does not have a per-process address space. Let the compiler optimize
 * code away.
 */
# define FLAGS_SHARED		0x00
#endif
#define FLAGS_CLOCKRT		0x02
#define FLAGS_HAS_TIMEOUT	0x04

/*
 * Priority Inheritance state:
 */
struct futex_pi_state {
	/*
	 * list of 'owned' pi_state instances - these have to be
	 * cleaned up in do_exit() if the task exits prematurely:
	 */
	struct list_head list;

	/*
	 * The PI object:
	 */
	struct rt_mutex pi_mutex;

	struct task_struct *owner;
	atomic_t refcount;

	union futex_key key;
} __randomize_layout;

/**
 * struct futex_q - The hashed futex queue entry, one per waiting task
 * @list:		priority-sorted list of tasks waiting on this futex
 * @task:		the task waiting on the futex
 * @lock_ptr:		the hash bucket lock
 * @key:		the key the futex is hashed on
 * @pi_state:		optional priority inheritance state
 * @rt_waiter:		rt_waiter storage for use with requeue_pi
 * @requeue_pi_key:	the requeue_pi target futex key
 * @bitset:		bitset for the optional bitmasked wakeup
 *
 * We use this hashed waitqueue, instead of a normal wait_queue_entry_t, so
 * we can wake only the relevant ones (hashed queues may be shared).
 *
 * A futex_q has a woken state, just like tasks have TASK_RUNNING.
 * It is considered woken when plist_node_empty(&q->list) || q->lock_ptr == 0.
 * The order of wakeup is always to make the first condition true, then
 * the second.
 *
 * PI futexes are typically woken before they are removed from the hash list via
 * the rt_mutex code. See unqueue_me_pi().
 */
struct futex_q {
	struct plist_node list;

	struct task_struct *task;
	spinlock_t *lock_ptr;
	union futex_key key;
	struct futex_pi_state *pi_state;
	struct rt_mutex_waiter *rt_waiter;
	union futex_key *requeue_pi_key;
	u32 bitset;
} __randomize_layout;

static const struct futex_q futex_q_init = {
	/* list gets initialized in queue_me()*/
	.key = FUTEX_KEY_INIT,
	.bitset = FUTEX_BITSET_MATCH_ANY
};

/*
 * Hash buckets are shared by all the futex_keys that hash to the same
 * location.  Each key may have multiple futex_q structures, one for each task
 * waiting on a futex.
 */
struct futex_hash_bucket {
	atomic_t waiters;
	spinlock_t lock;
	struct plist_head chain;
} ____cacheline_aligned_in_smp;

/*
 * The base of the bucket array and its size are always used together
 * (after initialization only in hash_futex()), so ensure that they
 * reside in the same cacheline.
 */
static struct {
	struct futex_hash_bucket *queues;
	unsigned long            hashsize;
} __futex_data __read_mostly __aligned(2*sizeof(long));
#define futex_queues   (__futex_data.queues)
#define futex_hashsize (__futex_data.hashsize)


/*
 * Fault injections for futexes.
 */
#ifdef CONFIG_FAIL_FUTEX

static struct {
	struct fault_attr attr;

	bool ignore_private;
} fail_futex = {
	.attr = FAULT_ATTR_INITIALIZER,
	.ignore_private = false,
};

static int __init setup_fail_futex(char *str)
{
	return setup_fault_attr(&fail_futex.attr, str);
}
__setup("fail_futex=", setup_fail_futex);

static bool should_fail_futex(bool fshared)
{
	if (fail_futex.ignore_private && !fshared)
		return false;

	return should_fail(&fail_futex.attr, 1);
}

#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS

static int __init fail_futex_debugfs(void)
{
	umode_t mode = S_IFREG | S_IRUSR | S_IWUSR;
	struct dentry *dir;

	dir = fault_create_debugfs_attr("fail_futex", NULL,
					&fail_futex.attr);
	if (IS_ERR(dir))
		return PTR_ERR(dir);

	if (!debugfs_create_bool("ignore-private", mode, dir,
				 &fail_futex.ignore_private)) {
		debugfs_remove_recursive(dir);
		return -ENOMEM;
	}

	return 0;
}

late_initcall(fail_futex_debugfs);

#endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */

#else
static inline bool should_fail_futex(bool fshared)
{
	return false;
}
#endif /* CONFIG_FAIL_FUTEX */

static inline void futex_get_mm(union futex_key *key)
{
	mmgrab(key->private.mm);
	/*
	 * Ensure futex_get_mm() implies a full barrier such that
	 * get_futex_key() implies a full barrier. This is relied upon
	 * as smp_mb(); (B), see the ordering comment above.
	 */
	smp_mb__after_atomic();
}

/*
 * Reflects a new waiter being added to the waitqueue.
 */
static inline void hb_waiters_inc(struct futex_hash_bucket *hb)
{
#ifdef CONFIG_SMP
	atomic_inc(&hb->waiters);
	/*
	 * Full barrier (A), see the ordering comment above.
	 */
	smp_mb__after_atomic();
#endif
}

/*
 * Reflects a waiter being removed from the waitqueue by wakeup
 * paths.
 */
static inline void hb_waiters_dec(struct futex_hash_bucket *hb)
{
#ifdef CONFIG_SMP
	atomic_dec(&hb->waiters);
#endif
}

static inline int hb_waiters_pending(struct futex_hash_bucket *hb)
{
#ifdef CONFIG_SMP
	return atomic_read(&hb->waiters);
#else
	return 1;
#endif
}

/**
 * hash_futex - Return the hash bucket in the global hash
 * @key:	Pointer to the futex key for which the hash is calculated
 *
 * We hash on the keys returned from get_futex_key (see below) and return the
 * corresponding hash bucket in the global hash.
 */
static struct futex_hash_bucket *hash_futex(union futex_key *key)
{
	u32 hash = jhash2((u32*)&key->both.word,
			  (sizeof(key->both.word)+sizeof(key->both.ptr))/4,
			  key->both.offset);
	return &futex_queues[hash & (futex_hashsize - 1)];
}


/**
 * match_futex - Check whether two futex keys are equal
 * @key1:	Pointer to key1
 * @key2:	Pointer to key2
 *
 * Return 1 if two futex_keys are equal, 0 otherwise.
 */
static inline int match_futex(union futex_key *key1, union futex_key *key2)
{
	return (key1 && key2
		&& key1->both.word == key2->both.word
		&& key1->both.ptr == key2->both.ptr
		&& key1->both.offset == key2->both.offset);
}

/*
 * Take a reference to the resource addressed by a key.
 * Can be called while holding spinlocks.
 *
 */
static void get_futex_key_refs(union futex_key *key)
{
	if (!key->both.ptr)
		return;

	/*
	 * On MMU-less systems futexes are always "private" as there is no per
	 * process address space. We need the smp wmb nevertheless - yes,
	 * arch/blackfin has MMU-less SMP ...
	 */
	if (!IS_ENABLED(CONFIG_MMU)) {
		smp_mb(); /* explicit smp_mb(); (B) */
		return;
	}

	switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) {
	case FUT_OFF_INODE:
		ihold(key->shared.inode); /* implies smp_mb(); (B) */
		break;
	case FUT_OFF_MMSHARED:
		futex_get_mm(key); /* implies smp_mb(); (B) */
		break;
	default:
		/*
		 * Private futexes do not hold a reference on an inode or
		 * mm, therefore the only purpose of calling get_futex_key_refs()
		 * is that we need the barrier for the lockless waiter check.
		 */
		smp_mb(); /* explicit smp_mb(); (B) */
	}
}

/*
 * Drop a reference to the resource addressed by a key.
 * The hash bucket spinlock must not be held. This is
 * a no-op for private futexes, see comment in the get
 * counterpart.
 */
static void drop_futex_key_refs(union futex_key *key)
{
	if (!key->both.ptr) {
		/* If we're here then we tried to put a key we failed to get */
		WARN_ON_ONCE(1);
		return;
	}

	if (!IS_ENABLED(CONFIG_MMU))
		return;

	switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) {
	case FUT_OFF_INODE:
		iput(key->shared.inode);
		break;
	case FUT_OFF_MMSHARED:
		mmdrop(key->private.mm);
		break;
	}
}

/**
 * get_futex_key() - Get parameters which are the keys for a futex
 * @uaddr:	virtual address of the futex
 * @fshared:	0 for a PROCESS_PRIVATE futex, 1 for PROCESS_SHARED
 * @key:	address where result is stored.
 * @rw:		mapping needs to be read/write (values: VERIFY_READ,
 *              VERIFY_WRITE)
 *
 * Return: a negative error code or 0
 *
 * The key words are stored in @key on success.
 *
 * For shared mappings, it's (page->index, file_inode(vma->vm_file),
 * offset_within_page).  For private mappings, it's (uaddr, current->mm).
 * We can usually work out the index without swapping in the page.
 *
 * lock_page() might sleep, the caller should not hold a spinlock.
 */
static int
get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw)
{
	unsigned long address = (unsigned long)uaddr;
	struct mm_struct *mm = current->mm;
	struct page *page, *tail;
	struct address_space *mapping;
	int err, ro = 0;

	/*
	 * The futex address must be "naturally" aligned.
	 */
	key->both.offset = address % PAGE_SIZE;
	if (unlikely((address % sizeof(u32)) != 0))
		return -EINVAL;
	address -= key->both.offset;

	if (unlikely(!access_ok(rw, uaddr, sizeof(u32))))
		return -EFAULT;

	if (unlikely(should_fail_futex(fshared)))
		return -EFAULT;

	/*
	 * PROCESS_PRIVATE futexes are fast.
	 * As the mm cannot disappear under us and the 'key' only needs the
	 * virtual address, we don't even have to find the underlying vma.
	 * Note : We do have to check 'uaddr' is a valid user address,
	 *        but access_ok() should be faster than find_vma()
	 */
	if (!fshared) {
		key->private.mm = mm;
		key->private.address = address;
		get_futex_key_refs(key);  /* implies smp_mb(); (B) */
		return 0;
	}

again:
	/* Ignore any VERIFY_READ mapping (futex common case) */
	if (unlikely(should_fail_futex(fshared)))
		return -EFAULT;

	err = get_user_pages_fast(address, 1, 1, &page);
	/*
	 * If write access is not required (eg. FUTEX_WAIT), try
	 * and get read-only access.
	 */
	if (err == -EFAULT && rw == VERIFY_READ) {
		err = get_user_pages_fast(address, 1, 0, &page);
		ro = 1;
	}
	if (err < 0)
		return err;
	else
		err = 0;

	/*
	 * The treatment of mapping from this point on is critical. The page
	 * lock protects many things but in this context the page lock
	 * stabilizes mapping, prevents inode freeing in the shared
	 * file-backed region case and guards against movement to swap cache.
	 *
	 * Strictly speaking the page lock is not needed in all cases being
	 * considered here and the page lock forces unnecessary serialization.
	 * From this point on, mapping will be re-verified if necessary and
	 * the page lock will be acquired only if it is unavoidable.
	 *
	 * Mapping checks require the head page for any compound page so the
	 * head page and mapping is looked up now. For anonymous pages, it
	 * does not matter if the page splits in the future as the key is
	 * based on the address. For filesystem-backed pages, the tail is
	 * required as the index of the page determines the key. For
	 * base pages, there is no tail page and tail == page.
	 */
	tail = page;
	page = compound_head(page);
	mapping = READ_ONCE(page->mapping);

	/*
	 * If page->mapping is NULL, then it cannot be a PageAnon
	 * page; but it might be the ZERO_PAGE or in the gate area or
	 * in a special mapping (all cases which we are happy to fail);
	 * or it may have been a good file page when get_user_pages_fast
	 * found it, but truncated or holepunched or subjected to
	 * invalidate_complete_page2 before we got the page lock (also
	 * cases which we are happy to fail).  And we hold a reference,
	 * so refcount care in invalidate_complete_page's remove_mapping
	 * prevents drop_caches from setting mapping to NULL beneath us.
	 *
	 * The case we do have to guard against is when memory pressure made
	 * shmem_writepage move it from filecache to swapcache beneath us:
	 * an unlikely race, but we do need to retry for page->mapping.
	 */
	if (unlikely(!mapping)) {
		int shmem_swizzled;

		/*
		 * Page lock is required to identify which special case above
		 * applies. If this is really a shmem page then the page lock
		 * will prevent unexpected transitions.
		 */
		lock_page(page);
		shmem_swizzled = PageSwapCache(page) || page->mapping;
		unlock_page(page);
		put_page(page);

		if (shmem_swizzled)
			goto again;

		return -EFAULT;
	}

	/*
	 * Private mappings are handled in a simple way.
	 *
	 * If the futex key is stored on an anonymous page, then the associated
	 * object is the mm which is implicitly pinned by the calling process.
	 *
	 * NOTE: When userspace waits on a MAP_SHARED mapping, even if
	 * it's a read-only handle, it's expected that futexes attach to
	 * the object not the particular process.
	 */
	if (PageAnon(page)) {
		/*
		 * A RO anonymous page will never change and thus doesn't make
		 * sense for futex operations.
		 */
		if (unlikely(should_fail_futex(fshared)) || ro) {
			err = -EFAULT;
			goto out;
		}

		key->both.offset |= FUT_OFF_MMSHARED; /* ref taken on mm */
		key->private.mm = mm;
		key->private.address = address;

		get_futex_key_refs(key); /* implies smp_mb(); (B) */

	} else {
		struct inode *inode;

		/*
		 * The associated futex object in this case is the inode and
		 * the page->mapping must be traversed. Ordinarily this should
		 * be stabilised under page lock but it's not strictly
		 * necessary in this case as we just want to pin the inode, not
		 * update the radix tree or anything like that.
		 *
		 * The RCU read lock is taken as the inode is finally freed
		 * under RCU. If the mapping still matches expectations then the
		 * mapping->host can be safely accessed as being a valid inode.
		 */
		rcu_read_lock();

		if (READ_ONCE(page->mapping) != mapping) {
			rcu_read_unlock();
			put_page(page);

			goto again;
		}

		inode = READ_ONCE(mapping->host);
		if (!inode) {
			rcu_read_unlock();
			put_page(page);

			goto again;
		}

		/*
		 * Take a reference unless it is about to be freed. Previously
		 * this reference was taken by ihold under the page lock
		 * pinning the inode in place so i_lock was unnecessary. The
		 * only way for this check to fail is if the inode was
		 * truncated in parallel which is almost certainly an
		 * application bug. In such a case, just retry.
		 *
		 * We are not calling into get_futex_key_refs() in file-backed
		 * cases, therefore a successful atomic_inc return below will
		 * guarantee that get_futex_key() will still imply smp_mb(); (B).
		 */
		if (!atomic_inc_not_zero(&inode->i_count)) {
			rcu_read_unlock();
			put_page(page);

			goto again;
		}

		/* Should be impossible but let's be paranoid for now */
		if (WARN_ON_ONCE(inode->i_mapping != mapping)) {
			err = -EFAULT;
			rcu_read_unlock();
			iput(inode);

			goto out;
		}

		key->both.offset |= FUT_OFF_INODE; /* inode-based key */
		key->shared.inode = inode;
		key->shared.pgoff = basepage_index(tail);
		rcu_read_unlock();
	}

out:
	put_page(page);
	return err;
}

static inline void put_futex_key(union futex_key *key)
{
	drop_futex_key_refs(key);
}

/**
 * fault_in_user_writeable() - Fault in user address and verify RW access
 * @uaddr:	pointer to faulting user space address
 *
 * Slow path to fixup the fault we just took in the atomic write
 * access to @uaddr.
 *
 * We have no generic implementation of a non-destructive write to the
 * user address. We know that we faulted in the atomic pagefault
 * disabled section so we can as well avoid the #PF overhead by
 * calling get_user_pages() right away.
 */
static int fault_in_user_writeable(u32 __user *uaddr)
{
	struct mm_struct *mm = current->mm;
	int ret;

	down_read(&mm->mmap_sem);
	ret = fixup_user_fault(current, mm, (unsigned long)uaddr,
			       FAULT_FLAG_WRITE, NULL);
	up_read(&mm->mmap_sem);

	return ret < 0 ? ret : 0;
}

/**
 * futex_top_waiter() - Return the highest priority waiter on a futex
 * @hb:		the hash bucket the futex_q's reside in
 * @key:	the futex key (to distinguish it from other futex_q's)
 *
 * Must be called with the hb lock held.
 */
static struct futex_q *futex_top_waiter(struct futex_hash_bucket *hb,
					union futex_key *key)
{
	struct futex_q *this;

	plist_for_each_entry(this, &hb->chain, list) {
		if (match_futex(&this->key, key))
			return this;
	}
	return NULL;
}

static int cmpxchg_futex_value_locked(u32 *curval, u32 __user *uaddr,
				      u32 uval, u32 newval)
{
	int ret;

	pagefault_disable();
	ret = futex_atomic_cmpxchg_inatomic(curval, uaddr, uval, newval);
	pagefault_enable();

	return ret;
}

static int get_futex_value_locked(u32 *dest, u32 __user *from)
{
	int ret;

	pagefault_disable();
	ret = __get_user(*dest, from);
	pagefault_enable();

	return ret ? -EFAULT : 0;
}


/*
 * PI code:
 */
static int refill_pi_state_cache(void)
{
	struct futex_pi_state *pi_state;

	if (likely(current->pi_state_cache))
		return 0;

	pi_state = kzalloc(sizeof(*pi_state), GFP_KERNEL);

	if (!pi_state)
		return -ENOMEM;

	INIT_LIST_HEAD(&pi_state->list);
	/* pi_mutex gets initialized later */
	pi_state->owner = NULL;
	atomic_set(&pi_state->refcount, 1);
	pi_state->key = FUTEX_KEY_INIT;

	current->pi_state_cache = pi_state;

	return 0;
}

static struct futex_pi_state *alloc_pi_state(void)
{
	struct futex_pi_state *pi_state = current->pi_state_cache;

	WARN_ON(!pi_state);
	current->pi_state_cache = NULL;

	return pi_state;
}

static void get_pi_state(struct futex_pi_state *pi_state)
{
	WARN_ON_ONCE(!atomic_inc_not_zero(&pi_state->refcount));
}

/*
 * Drops a reference to the pi_state object and frees or caches it
 * when the last reference is gone.
 */
static void put_pi_state(struct futex_pi_state *pi_state)
{
	if (!pi_state)
		return;

	if (!atomic_dec_and_test(&pi_state->refcount))
		return;

	/*
	 * If pi_state->owner is NULL, the owner is most probably dying
	 * and has cleaned up the pi_state already
	 */
	if (pi_state->owner) {
		struct task_struct *owner;

		raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
		owner = pi_state->owner;
		if (owner) {
			raw_spin_lock(&owner->pi_lock);
			list_del_init(&pi_state->list);
			raw_spin_unlock(&owner->pi_lock);
		}
		rt_mutex_proxy_unlock(&pi_state->pi_mutex, owner);
		raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
	}

	if (current->pi_state_cache) {
		kfree(pi_state);
	} else {
		/*
		 * pi_state->list is already empty.
		 * clear pi_state->owner.
		 * refcount is at 0 - put it back to 1.
		 */
		pi_state->owner = NULL;
		atomic_set(&pi_state->refcount, 1);
		current->pi_state_cache = pi_state;
	}
}

#ifdef CONFIG_FUTEX_PI

/*
 * This task is holding PI mutexes at exit time => bad.
 * Kernel cleans up PI-state, but userspace is likely hosed.
 * (Robust-futex cleanup is separate and might save the day for userspace.)
 */
void exit_pi_state_list(struct task_struct *curr)
{
	struct list_head *next, *head = &curr->pi_state_list;
	struct futex_pi_state *pi_state;
	struct futex_hash_bucket *hb;
	union futex_key key = FUTEX_KEY_INIT;

	if (!futex_cmpxchg_enabled)
		return;
	/*
	 * We are a ZOMBIE and nobody can enqueue itself on
	 * pi_state_list anymore, but we have to be careful
	 * versus waiters unqueueing themselves:
	 */
	raw_spin_lock_irq(&curr->pi_lock);
	while (!list_empty(head)) {
		next = head->next;
		pi_state = list_entry(next, struct futex_pi_state, list);
		key = pi_state->key;
		hb = hash_futex(&key);

		/*
		 * We can race against put_pi_state() removing itself from the
		 * list (a waiter going away). put_pi_state() will first
		 * decrement the reference count and then modify the list, so
		 * it's possible to see the list entry but fail this reference
		 * acquire.
		 *
		 * In that case, drop the locks to let put_pi_state() make
		 * progress and retry the loop.
		 */
		if (!atomic_inc_not_zero(&pi_state->refcount)) {
			raw_spin_unlock_irq(&curr->pi_lock);
			cpu_relax();
			raw_spin_lock_irq(&curr->pi_lock);
			continue;
		}
		raw_spin_unlock_irq(&curr->pi_lock);

		spin_lock(&hb->lock);
		raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
		raw_spin_lock(&curr->pi_lock);
		/*
		 * We dropped the pi-lock, so re-check whether this
		 * task still owns the PI-state:
		 */
		if (head->next != next) {
			/* retain curr->pi_lock for the loop invariant */
			raw_spin_unlock(&pi_state->pi_mutex.wait_lock);
			spin_unlock(&hb->lock);
			put_pi_state(pi_state);
			continue;
		}

		WARN_ON(pi_state->owner != curr);
		WARN_ON(list_empty(&pi_state->list));
		list_del_init(&pi_state->list);
		pi_state->owner = NULL;

		raw_spin_unlock(&curr->pi_lock);
		raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
		spin_unlock(&hb->lock);

		rt_mutex_futex_unlock(&pi_state->pi_mutex);
		put_pi_state(pi_state);

		raw_spin_lock_irq(&curr->pi_lock);
	}
	raw_spin_unlock_irq(&curr->pi_lock);
}

#endif

/*
 * We need to check the following states:
 *
 *      Waiter | pi_state | pi->owner | uTID      | uODIED | ?
 *
 * [1]  NULL   | ---      | ---       | 0         | 0/1    | Valid
 * [2]  NULL   | ---      | ---       | >0        | 0/1    | Valid
 *
 * [3]  Found  | NULL     | --        | Any       | 0/1    | Invalid
 *
 * [4]  Found  | Found    | NULL      | 0         | 1      | Valid
 * [5]  Found  | Found    | NULL      | >0        | 1      | Invalid
 *
 * [6]  Found  | Found    | task      | 0         | 1      | Valid
 *
 * [7]  Found  | Found    | NULL      | Any       | 0      | Invalid
 *
 * [8]  Found  | Found    | task      | ==taskTID | 0/1    | Valid
 * [9]  Found  | Found    | task      | 0         | 0      | Invalid
 * [10] Found  | Found    | task      | !=taskTID | 0/1    | Invalid
 *
 * [1]	Indicates that the kernel can acquire the futex atomically. We
 *	came here due to a stale FUTEX_WAITERS/FUTEX_OWNER_DIED bit.
 *
 * [2]	Valid, if TID does not belong to a kernel thread. If no matching
 *      thread is found then it indicates that the owner TID has died.
 *
 * [3]	Invalid. The waiter is queued on a non PI futex
 *
 * [4]	Valid state after exit_robust_list(), which sets the user space
 *	value to FUTEX_WAITERS | FUTEX_OWNER_DIED.
 *
 * [5]	The user space value got manipulated between exit_robust_list()
 *	and exit_pi_state_list()
 *
 * [6]	Valid state after exit_pi_state_list() which sets the new owner in
 *	the pi_state but cannot access the user space value.
 *
 * [7]	pi_state->owner can only be NULL when the OWNER_DIED bit is set.
 *
 * [8]	Owner and user space value match
 *
 * [9]	There is no transient state which sets the user space TID to 0
 *	except exit_robust_list(), but this is indicated by the
 *	FUTEX_OWNER_DIED bit. See [4]
 *
 * [10] There is no transient state which leaves owner and user space
 *	TID out of sync.
 *
 *
 * Serialization and lifetime rules:
 *
 * hb->lock:
 *
 *	hb -> futex_q, relation
 *	futex_q -> pi_state, relation
 *
 *	(cannot be raw because hb can contain arbitrary amount
 *	 of futex_q's)
 *
 * pi_mutex->wait_lock:
 *
 *	{uval, pi_state}
 *
 *	(and pi_mutex 'obviously')
 *
 * p->pi_lock:
 *
 *	p->pi_state_list -> pi_state->list, relation
 *
 * pi_state->refcount:
 *
 *	pi_state lifetime
 *
 *
 * Lock order:
 *
 *   hb->lock
 *     pi_mutex->wait_lock
 *       p->pi_lock
 *
 */

/*
 * Validate that the existing waiter has a pi_state and sanity check
 * the pi_state against the user space value. If correct, attach to
 * it.
 */
static int attach_to_pi_state(u32 __user *uaddr, u32 uval,
			      struct futex_pi_state *pi_state,
			      struct futex_pi_state **ps)
{
	pid_t pid = uval & FUTEX_TID_MASK;
	u32 uval2;
	int ret;

	/*
	 * Userspace might have messed up non-PI and PI futexes [3]
	 */
	if (unlikely(!pi_state))
		return -EINVAL;

	/*
	 * We get here with hb->lock held, and having found a
	 * futex_top_waiter(). This means that futex_lock_pi() of said futex_q
	 * has dropped the hb->lock in between queue_me() and unqueue_me_pi(),
	 * which in turn means that futex_lock_pi() still has a reference on
	 * our pi_state.
	 *
	 * The waiter holding a reference on @pi_state also protects against
	 * the unlocked put_pi_state() in futex_unlock_pi(), futex_lock_pi()
	 * and futex_wait_requeue_pi() as it cannot go to 0 and consequently
	 * free pi_state before we can take a reference ourselves.
	 */
	WARN_ON(!atomic_read(&pi_state->refcount));

	/*
	 * Now that we have a pi_state, we can acquire wait_lock
	 * and do the state validation.
	 */
	raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);

	/*
	 * Since {uval, pi_state} is serialized by wait_lock, and our current
	 * uval was read without holding it, it can have changed. Verify it
	 * still is what we expect it to be, otherwise retry the entire
	 * operation.
	 */
	if (get_futex_value_locked(&uval2, uaddr))
		goto out_efault;

	if (uval != uval2)
		goto out_eagain;

	/*
	 * Handle the owner died case:
	 */
	if (uval & FUTEX_OWNER_DIED) {
		/*
		 * exit_pi_state_list sets owner to NULL and wakes the
		 * topmost waiter. The task which acquires the
		 * pi_state->rt_mutex will fixup owner.
		 */
		if (!pi_state->owner) {
			/*
			 * No pi state owner, but the user space TID
			 * is not 0. Inconsistent state. [5]
			 */
			if (pid)
				goto out_einval;
			/*
			 * Take a ref on the state and return success. [4]
			 */
			goto out_attach;
		}

		/*
		 * If TID is 0, then either the dying owner has not
		 * yet executed exit_pi_state_list() or some waiter
		 * acquired the rtmutex in the pi state, but did not
		 * yet fixup the TID in user space.
		 *
		 * Take a ref on the state and return success. [6]
		 */
		if (!pid)
			goto out_attach;
	} else {
		/*
		 * If the owner died bit is not set, then the pi_state
		 * must have an owner. [7]
		 */
		if (!pi_state->owner)
			goto out_einval;
	}

	/*
	 * Bail out if user space manipulated the futex value. If pi
	 * state exists then the owner TID must be the same as the
	 * user space TID. [9/10]
	 */
	if (pid != task_pid_vnr(pi_state->owner))
		goto out_einval;

out_attach:
	get_pi_state(pi_state);
	raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
	*ps = pi_state;
	return 0;

out_einval:
	ret = -EINVAL;
	goto out_error;

out_eagain:
	ret = -EAGAIN;
	goto out_error;

out_efault:
	ret = -EFAULT;
	goto out_error;

out_error:
	raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
	return ret;
}

/*
 * Lookup the task for the TID provided from user space and attach to
 * it after doing proper sanity checks.
 */
static int attach_to_pi_owner(u32 uval, union futex_key *key,
			      struct futex_pi_state **ps)
{
	pid_t pid = uval & FUTEX_TID_MASK;
	struct futex_pi_state *pi_state;
	struct task_struct *p;

	/*
	 * We are the first waiter - try to look up the real owner and attach
	 * the new pi_state to it, but bail out when TID = 0 [1]
	 */
	if (!pid)
		return -ESRCH;
	p = find_get_task_by_vpid(pid);
	if (!p)
		return -ESRCH;

	if (unlikely(p->flags & PF_KTHREAD)) {
		put_task_struct(p);
		return -EPERM;
	}

	/*
	 * We need to look at the task state flags to figure out whether
	 * the task is exiting. To protect against the do_exit
	 * change of the task flags, we do this protected by
	 * p->pi_lock:
	 */
	raw_spin_lock_irq(&p->pi_lock);
	if (unlikely(p->flags & PF_EXITING)) {
		/*
		 * The task is on the way out. When PF_EXITPIDONE is
		 * set, we know that the task has finished the
		 * cleanup:
		 */
		int ret = (p->flags & PF_EXITPIDONE) ? -ESRCH : -EAGAIN;

		raw_spin_unlock_irq(&p->pi_lock);
		put_task_struct(p);
		return ret;
	}

	/*
	 * No existing pi state. First waiter. [2]
	 *
	 * This creates pi_state, we have hb->lock held, this means nothing can
	 * observe this state, wait_lock is irrelevant.
	 */
	pi_state = alloc_pi_state();

	/*
	 * Initialize the pi_mutex in locked state and make @p
	 * the owner of it:
	 */
	rt_mutex_init_proxy_locked(&pi_state->pi_mutex, p);

	/* Store the key for possible exit cleanups: */
	pi_state->key = *key;

	WARN_ON(!list_empty(&pi_state->list));
	list_add(&pi_state->list, &p->pi_state_list);
	/*
	 * Assignment without holding pi_state->pi_mutex.wait_lock is safe
	 * because there is no concurrency as the object is not published yet.
	 */
	pi_state->owner = p;
	raw_spin_unlock_irq(&p->pi_lock);

	put_task_struct(p);

	*ps = pi_state;

	return 0;
}

static int lookup_pi_state(u32 __user *uaddr, u32 uval,
			   struct futex_hash_bucket *hb,
			   union futex_key *key, struct futex_pi_state **ps)
{
	struct futex_q *top_waiter = futex_top_waiter(hb, key);

	/*
	 * If there is a waiter on that futex, validate it and
	 * attach to the pi_state when the validation succeeds.
	 */
	if (top_waiter)
		return attach_to_pi_state(uaddr, uval, top_waiter->pi_state, ps);

	/*
	 * We are the first waiter - try to look up the owner based on
	 * @uval and attach to it.
	 */
	return attach_to_pi_owner(uval, key, ps);
}

static int lock_pi_update_atomic(u32 __user *uaddr, u32 uval, u32 newval)
{
	u32 uninitialized_var(curval);

	if (unlikely(should_fail_futex(true)))
		return -EFAULT;

	if (unlikely(cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)))
		return -EFAULT;

	/* If user space value changed, let the caller retry */
	return curval != uval ? -EAGAIN : 0;
}

/**
 * futex_lock_pi_atomic() - Atomic work required to acquire a pi aware futex
 * @uaddr:		the pi futex user address
 * @hb:			the pi futex hash bucket
 * @key:		the futex key associated with uaddr and hb
 * @ps:			the pi_state pointer where we store the result of the
 *			lookup
 * @task:		the task to perform the atomic lock work for.  This will
 *			be "current" except in the case of requeue pi.
 * @set_waiters:	force setting the FUTEX_WAITERS bit (1) or not (0)
 *
 * Return:
 *  -  0 - ready to wait;
 *  -  1 - acquired the lock;
 *  - <0 - error
 *
 * The hb->lock and futex_key refs shall be held by the caller.
 */
static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb,
				union futex_key *key,
				struct futex_pi_state **ps,
				struct task_struct *task, int set_waiters)
{
	u32 uval, newval, vpid = task_pid_vnr(task);
	struct futex_q *top_waiter;
	int ret;

	/*
	 * Read the user space value first so we can validate a few
	 * things before proceeding further.
	 */
	if (get_futex_value_locked(&uval, uaddr))
		return -EFAULT;

	if (unlikely(should_fail_futex(true)))
		return -EFAULT;

	/*
	 * Detect deadlocks.
	 */
	if ((unlikely((uval & FUTEX_TID_MASK) == vpid)))
		return -EDEADLK;

	if ((unlikely(should_fail_futex(true))))
		return -EDEADLK;

	/*
	 * Lookup existing state first. If it exists, try to attach to
	 * its pi_state.
	 */
	top_waiter = futex_top_waiter(hb, key);
	if (top_waiter)
		return attach_to_pi_state(uaddr, uval, top_waiter->pi_state, ps);

	/*
	 * No waiter and the user TID is 0. We are here because the
	 * waiters or the owner died bit is set, or we were called from
	 * requeue_cmp_pi, or for whatever reason something took the
	 * syscall.
	 */
	if (!(uval & FUTEX_TID_MASK)) {
		/*
		 * We take over the futex. No other waiters and the user space
		 * TID is 0. We preserve the owner died bit.
		 */
		newval = uval & FUTEX_OWNER_DIED;
		newval |= vpid;

		/* The futex requeue_pi code can enforce the waiters bit */
		if (set_waiters)
			newval |= FUTEX_WAITERS;

		ret = lock_pi_update_atomic(uaddr, uval, newval);
		/* If the take over worked, return 1 */
		return ret < 0 ? ret : 1;
	}

	/*
	 * First waiter. Set the waiters bit before attaching ourself to
	 * the owner. If the owner tries to unlock, it will be forced into
	 * the kernel and blocked on hb->lock.
	 */
	newval = uval | FUTEX_WAITERS;
	ret = lock_pi_update_atomic(uaddr, uval, newval);
	if (ret)
		return ret;
	/*
	 * If the update of the user space value succeeded, we try to
	 * attach to the owner. If that fails, no harm done, we only
	 * set the FUTEX_WAITERS bit in the user space variable.
	 */
	return attach_to_pi_owner(uval, key, ps);
}
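
/*
 * Illustrative sketch, not part of the kernel source: the user space side of
 * the PI-futex protocol whose kernel side futex_lock_pi_atomic() implements.
 * The helper names are assumptions for this example and error handling is
 * omitted.
 *
 *	#include <linux/futex.h>
 *	#include <stdatomic.h>
 *	#include <sys/syscall.h>
 *	#include <unistd.h>
 *
 *	static void pi_lock(atomic_uint *futex)
 *	{
 *		unsigned int expected = 0;
 *		unsigned int tid = syscall(SYS_gettid);
 *
 *		// Uncontended fast path: 0 -> TID entirely in user space.
 *		if (atomic_compare_exchange_strong(futex, &expected, tid))
 *			return;
 *		// Contended: the kernel queues us and boosts the owner.
 *		syscall(SYS_futex, futex, FUTEX_LOCK_PI, 0, NULL, NULL, 0);
 *	}
 *
 *	static void pi_unlock(atomic_uint *futex)
 *	{
 *		unsigned int tid = syscall(SYS_gettid);
 *
 *		// Uncontended fast path: TID -> 0 only if no waiters are queued.
 *		if (atomic_compare_exchange_strong(futex, &tid, 0))
 *			return;
 *		// FUTEX_WAITERS is set: the kernel hands the lock to the top waiter.
 *		syscall(SYS_futex, futex, FUTEX_UNLOCK_PI, 0, NULL, NULL, 0);
 *	}
 */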

/**
 * __unqueue_futex() - Remove the futex_q from its futex_hash_bucket
 * @q:	The futex_q to unqueue
 *
 * The q->lock_ptr must not be NULL and must be held by the caller.
 */
static void __unqueue_futex(struct futex_q *q)
{
	struct futex_hash_bucket *hb;

	if (WARN_ON_SMP(!q->lock_ptr || !spin_is_locked(q->lock_ptr))
	    || WARN_ON(plist_node_empty(&q->list)))
		return;

	hb = container_of(q->lock_ptr, struct futex_hash_bucket, lock);
	plist_del(&q->list, &hb->chain);
	hb_waiters_dec(hb);
}

/*
 * The hash bucket lock must be held when this is called.
 * Afterwards, the futex_q must not be accessed. Callers
 * must ensure to later call wake_up_q() for the actual
 * wakeups to occur.
 */
static void mark_wake_futex(struct wake_q_head *wake_q, struct futex_q *q)
{
	struct task_struct *p = q->task;

	if (WARN(q->pi_state || q->rt_waiter, "refusing to wake PI futex\n"))
		return;

	/*
	 * Queue the task for later wakeup, after we've released
	 * the hb->lock. wake_q_add() grabs a reference to p.
	 */
	wake_q_add(wake_q, p);
	__unqueue_futex(q);
	/*
	 * The waiting task can free the futex_q as soon as q->lock_ptr = NULL
	 * is written, without taking any locks. This is possible in the event
	 * of a spurious wakeup, for example. A memory barrier is required here
	 * to prevent the following store to lock_ptr from getting ahead of the
	 * plist_del in __unqueue_futex().
	 */
	smp_store_release(&q->lock_ptr, NULL);
}

/*
 * Caller must hold a reference on @pi_state.
 */
static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_pi_state *pi_state)
{
	u32 uninitialized_var(curval), newval;
	struct task_struct *new_owner;
	bool postunlock = false;
	DEFINE_WAKE_Q(wake_q);
	int ret = 0;

	new_owner = rt_mutex_next_owner(&pi_state->pi_mutex);
	if (WARN_ON_ONCE(!new_owner)) {
		/*
		 * As per the comment in futex_unlock_pi() this should not happen.
		 *
		 * When this happens, give up our locks and try again, giving
		 * the futex_lock_pi() instance time to complete, either by
		 * waiting on the rtmutex or removing itself from the futex
		 * queue.
		 */
		ret = -EAGAIN;
		goto out_unlock;
	}

	/*
	 * We pass it to the next owner. The WAITERS bit is always kept
	 * enabled while there is PI state around. We clean up the owner
	 * died bit, because we are the owner.
	 */
	newval = FUTEX_WAITERS | task_pid_vnr(new_owner);

	if (unlikely(should_fail_futex(true)))
		ret = -EFAULT;

	if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)) {
		ret = -EFAULT;

	} else if (curval != uval) {
		/*
		 * If an unconditional UNLOCK_PI operation (user space did not
		 * try the TID->0 transition) raced with a waiter setting the
		 * FUTEX_WAITERS flag between get_user() and locking the hash
		 * bucket lock, retry the operation.
		 */
		if ((FUTEX_TID_MASK & curval) == uval)
			ret = -EAGAIN;
		else
			ret = -EINVAL;
	}

	if (ret)
		goto out_unlock;

	/*
	 * This is a point of no return; once we modify the uval there is no
	 * going back and subsequent operations must not fail.
	 */

	raw_spin_lock(&pi_state->owner->pi_lock);
	WARN_ON(list_empty(&pi_state->list));
	list_del_init(&pi_state->list);
	raw_spin_unlock(&pi_state->owner->pi_lock);

	raw_spin_lock(&new_owner->pi_lock);
	WARN_ON(!list_empty(&pi_state->list));
	list_add(&pi_state->list, &new_owner->pi_state_list);
	pi_state->owner = new_owner;
	raw_spin_unlock(&new_owner->pi_lock);

	postunlock = __rt_mutex_futex_unlock(&pi_state->pi_mutex, &wake_q);