/*
 *  Fast Userspace Mutexes (which I call "Futexes!").
 *  (C) Rusty Russell, IBM 2002
 *
 *  Generalized futexes, futex requeueing, misc fixes by Ingo Molnar
 *  (C) Copyright 2003 Red Hat Inc, All Rights Reserved
 *
 *  Removed page pinning, fix privately mapped COW pages and other cleanups
 *  (C) Copyright 2003, 2004 Jamie Lokier
 *
 *  Robust futex support started by Ingo Molnar
 *  (C) Copyright 2006 Red Hat Inc, All Rights Reserved
 *  Thanks to Thomas Gleixner for suggestions, analysis and fixes.
 *
 *  PI-futex support started by Ingo Molnar and Thomas Gleixner
 *  Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
 *  Copyright (C) 2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com>
 *
 *  PRIVATE futexes by Eric Dumazet
 *  Copyright (C) 2007 Eric Dumazet <dada1@cosmosbay.com>
 *
 *  Requeue-PI support by Darren Hart <dvhltc@us.ibm.com>
 *  Copyright (C) IBM Corporation, 2009
 *  Thanks to Thomas Gleixner for conceptual design and careful reviews.
 *
 *  Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly
 *  enough at me, Linus for the original (flawed) idea, Matthew
 *  Kirkwood for proof-of-concept implementation.
 *
 *  "The futexes are also cursed."
 *  "But they come in a choice of three flavours!"
 *
 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 2 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program; if not, write to the Free Software
 *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */
#include <linux/slab.h>
#include <linux/poll.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/jhash.h>
#include <linux/init.h>
#include <linux/futex.h>
#include <linux/mount.h>
#include <linux/pagemap.h>
#include <linux/syscalls.h>
#include <linux/signal.h>
#include <linux/export.h>
#include <linux/magic.h>
#include <linux/pid.h>
#include <linux/nsproxy.h>
#include <linux/ptrace.h>
#include <linux/sched/rt.h>
#include <linux/sched/wake_q.h>
#include <linux/sched/mm.h>
#include <linux/hugetlb.h>
#include <linux/freezer.h>
#include <linux/bootmem.h>
#include <linux/fault-inject.h>

#include <asm/futex.h>

#include "locking/rtmutex_common.h"

/*
 * READ this before attempting to hack on futexes!
 *
 * Basic futex operation and ordering guarantees
 * =============================================
 *
 * The waiter reads the futex value in user space and calls
 * futex_wait(). This function computes the hash bucket and acquires
 * the hash bucket lock. After that it reads the futex user space value
 * again and verifies that the data has not changed. If it has not changed
 * it enqueues itself into the hash bucket, releases the hash bucket lock
 * and schedules.
 *
 * The waker side modifies the user space value of the futex and calls
 * futex_wake(). This function computes the hash bucket and acquires the
 * hash bucket lock. Then it looks for waiters on that futex in the hash
 * bucket and wakes them.
 *
 * In futex wake up scenarios where no tasks are blocked on a futex, we can
 * avoid taking the hb spinlock and simply return. In order for this
 * optimization to work, ordering guarantees must exist so that the waiter
 * being added to the list is acknowledged when the list is concurrently being
 * checked by the waker, avoiding scenarios like the following:
 *
 * CPU 0                               CPU 1
 * val = *futex;
 * sys_futex(WAIT, futex, val);
 *   futex_wait(futex, val);
 *   uval = *futex;
 *                                     *futex = newval;
 *                                     sys_futex(WAKE, futex);
 *                                       futex_wake(futex);
 *                                       if (queue_empty())
 *                                         return;
 *   if (uval == val)
 *      lock(hash_bucket(futex));
 *      queue();
 *     unlock(hash_bucket(futex));
 *     schedule();
 *
 * This would cause the waiter on CPU 0 to wait forever because it
 * missed the transition of the user space value from val to newval
 * and the waker did not find the waiter in the hash bucket queue.
 *
 * The correct serialization ensures that a waiter either observes
 * the changed user space value before blocking or is woken by a
 * concurrent waker:
 *
 * CPU 0                                 CPU 1
 * val = *futex;
 * sys_futex(WAIT, futex, val);
 *   futex_wait(futex, val);
 *
 *   waiters++; (a)
 *   smp_mb(); (A) <-- paired with -.
 *                                  |
 *   lock(hash_bucket(futex));      |
 *                                  |
 *   uval = *futex;                 |
 *                                  |        *futex = newval;
 *                                  |        sys_futex(WAKE, futex);
 *                                  |          futex_wake(futex);
 *                                  |
 *                                  `--------> smp_mb(); (B)
 *   if (uval == val)
 *     queue();
 *     unlock(hash_bucket(futex));
 *     schedule();                         if (waiters)
 *                                           lock(hash_bucket(futex));
 *   else                                    wake_waiters(futex);
 *     waiters--; (b)                        unlock(hash_bucket(futex));
 *
 * Where (A) orders the waiters increment and the futex value read through
 * atomic operations (see hb_waiters_inc) and where (B) orders the write
 * to futex and the waiters read -- this is done by the barriers for both
 * shared and private futexes in get_futex_key_refs().
 *
 * This yields the following case (where X:=waiters, Y:=futex):
 *
 *	X = Y = 0
 *
 *	w[X]=1		w[Y]=1
 *	MB		MB
 *	r[Y]=y		r[X]=x
 *
 * Which guarantees that x==0 && y==0 is impossible; which translates back into
 * the guarantee that we cannot both miss the futex variable change and the
 * enqueue.
 *
 * Note that a new waiter is accounted for in (a) even when it is possible that
 * the wait call can return an error, in which case we backtrack from it in (b).
 * Refer to the comment in queue_lock().
 *
 * Similarly, in order to account for waiters being requeued on another
 * address we always increment the waiters for the destination bucket before
 * acquiring the lock, and decrement them again after releasing it -
 * the code that actually moves the futex(es) between hash buckets (requeue_futex)
 * will do the additional required waiter count housekeeping. This is done for
 * double_lock_hb() and double_unlock_hb(), respectively.
 */
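
/*
 * For illustration only - a minimal user space counterpart of the wait/wake
 * protocol described above might look like the sketch below (futex_word,
 * waiter() and waker() are made-up names; error handling and the retry
 * loop around spurious wakeups are omitted):
 *
 *	#include <linux/futex.h>
 *	#include <sys/syscall.h>
 *	#include <unistd.h>
 *
 *	static unsigned int futex_word;
 *
 *	static void waiter(void)
 *	{
 *		unsigned int val = __atomic_load_n(&futex_word, __ATOMIC_ACQUIRE);
 *
 *		// Blocks only while futex_word still contains val; otherwise
 *		// returns immediately with EAGAIN. This is the uval == val
 *		// check done under the hash bucket lock in futex_wait().
 *		syscall(SYS_futex, &futex_word, FUTEX_WAIT, val, NULL, NULL, 0);
 *	}
 *
 *	static void waker(void)
 *	{
 *		__atomic_store_n(&futex_word, 1, __ATOMIC_RELEASE);
 *
 *		// Wake at most one task blocked in futex_wait() on futex_word.
 *		syscall(SYS_futex, &futex_word, FUTEX_WAKE, 1, NULL, NULL, 0);
 *	}
 */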

#ifndef CONFIG_HAVE_FUTEX_CMPXCHG
int __read_mostly futex_cmpxchg_enabled;
#endif

/*
 * Futex flags used to encode options to functions and preserve them across
 * restarts.
 */
#ifdef CONFIG_MMU
# define FLAGS_SHARED		0x01
#else
/*
 * NOMMU does not have per process address space. Let the compiler optimize
 * code away.
 */
# define FLAGS_SHARED		0x00
#endif
#define FLAGS_CLOCKRT		0x02
#define FLAGS_HAS_TIMEOUT	0x04

/*
 * Priority Inheritance state:
 */
struct futex_pi_state {
	/*
	 * list of 'owned' pi_state instances - these have to be
	 * cleaned up in do_exit() if the task exits prematurely:
	 */
	struct list_head list;

	/*
	 * The PI object:
	 */
	struct rt_mutex pi_mutex;

	struct task_struct *owner;
	atomic_t refcount;

	union futex_key key;
};
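
/*
 * For reference, the user space side that gives rise to a pi_state is the
 * FUTEX_LOCK_PI/FUTEX_UNLOCK_PI protocol documented in futex(2). The helpers
 * below are an illustrative sketch only (pi_lock()/pi_unlock() are made-up
 * names), not code from this file:
 *
 *	#include <linux/futex.h>
 *	#include <sys/syscall.h>
 *	#include <unistd.h>
 *
 *	// Uncontended lock: 0 -> TID entirely in user space. On contention
 *	// the kernel sets FUTEX_WAITERS, allocates a pi_state and boosts
 *	// the owner via the rt_mutex embedded above.
 *	static void pi_lock(unsigned int *futex, unsigned int tid)
 *	{
 *		unsigned int expected = 0;
 *
 *		if (!__atomic_compare_exchange_n(futex, &expected, tid, 0,
 *						 __ATOMIC_ACQUIRE, __ATOMIC_RELAXED))
 *			syscall(SYS_futex, futex, FUTEX_LOCK_PI, 0, NULL, NULL, 0);
 *	}
 *
 *	// Uncontended unlock: TID -> 0. If FUTEX_WAITERS is set the cmpxchg
 *	// fails and the kernel hands the futex to the topmost waiter.
 *	static void pi_unlock(unsigned int *futex, unsigned int tid)
 *	{
 *		unsigned int expected = tid;
 *
 *		if (!__atomic_compare_exchange_n(futex, &expected, 0, 0,
 *						 __ATOMIC_RELEASE, __ATOMIC_RELAXED))
 *			syscall(SYS_futex, futex, FUTEX_UNLOCK_PI, 0, NULL, NULL, 0);
 *	}
 */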

/**
 * struct futex_q - The hashed futex queue entry, one per waiting task
 * @list:		priority-sorted list of tasks waiting on this futex
 * @task:		the task waiting on the futex
 * @lock_ptr:		the hash bucket lock
 * @key:		the key the futex is hashed on
 * @pi_state:		optional priority inheritance state
 * @rt_waiter:		rt_waiter storage for use with requeue_pi
 * @requeue_pi_key:	the requeue_pi target futex key
 * @bitset:		bitset for the optional bitmasked wakeup
 *
 * We use this hashed waitqueue, instead of a normal wait_queue_t, so
 * we can wake only the relevant ones (hashed queues may be shared).
 *
 * A futex_q has a woken state, just like tasks have TASK_RUNNING.
 * It is considered woken when plist_node_empty(&q->list) || q->lock_ptr == 0.
 * The order of wakeup is always to make the first condition true, then
 * the second.
 *
 * PI futexes are typically woken before they are removed from the hash list via
 * the rt_mutex code. See unqueue_me_pi().
 */
struct futex_q {
	struct plist_node list;

	struct task_struct *task;
	spinlock_t *lock_ptr;
	union futex_key key;
	struct futex_pi_state *pi_state;
	struct rt_mutex_waiter *rt_waiter;
	union futex_key *requeue_pi_key;
	u32 bitset;
};

static const struct futex_q futex_q_init = {
	/* list gets initialized in queue_me() */
	.key = FUTEX_KEY_INIT,
	.bitset = FUTEX_BITSET_MATCH_ANY
};

/*
 * Hash buckets are shared by all the futex_keys that hash to the same
 * location.  Each key may have multiple futex_q structures, one for each task
 * waiting on a futex.
 */
struct futex_hash_bucket {
	atomic_t waiters;
	spinlock_t lock;
	struct plist_head chain;
} ____cacheline_aligned_in_smp;

/*
 * The base of the bucket array and its size are always used together
 * (after initialization only in hash_futex()), so ensure that they
 * reside in the same cacheline.
 */
static struct {
	struct futex_hash_bucket *queues;
	unsigned long            hashsize;
} __futex_data __read_mostly __aligned(2*sizeof(long));
#define futex_queues   (__futex_data.queues)
#define futex_hashsize (__futex_data.hashsize)


/*
 * Fault injections for futexes.
 */
#ifdef CONFIG_FAIL_FUTEX

static struct {
	struct fault_attr attr;

	bool ignore_private;
} fail_futex = {
	.attr = FAULT_ATTR_INITIALIZER,
	.ignore_private = false,
};

static int __init setup_fail_futex(char *str)
{
	return setup_fault_attr(&fail_futex.attr, str);
}
__setup("fail_futex=", setup_fail_futex);

static bool should_fail_futex(bool fshared)
{
	if (fail_futex.ignore_private && !fshared)
		return false;

	return should_fail(&fail_futex.attr, 1);
}

#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS

static int __init fail_futex_debugfs(void)
{
	umode_t mode = S_IFREG | S_IRUSR | S_IWUSR;
	struct dentry *dir;

	dir = fault_create_debugfs_attr("fail_futex", NULL,
					&fail_futex.attr);
	if (IS_ERR(dir))
		return PTR_ERR(dir);

	if (!debugfs_create_bool("ignore-private", mode, dir,
				 &fail_futex.ignore_private)) {
		debugfs_remove_recursive(dir);
		return -ENOMEM;
	}

	return 0;
}

late_initcall(fail_futex_debugfs);

#endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */

#else
static inline bool should_fail_futex(bool fshared)
{
	return false;
}
#endif /* CONFIG_FAIL_FUTEX */

static inline void futex_get_mm(union futex_key *key)
{
	mmgrab(key->private.mm);
	/*
	 * Ensure futex_get_mm() implies a full barrier such that
	 * get_futex_key() implies a full barrier. This is relied upon
	 * as smp_mb(); (B), see the ordering comment above.
	 */
	smp_mb__after_atomic();
}

/*
 * Reflects a new waiter being added to the waitqueue.
 */
static inline void hb_waiters_inc(struct futex_hash_bucket *hb)
{
#ifdef CONFIG_SMP
	atomic_inc(&hb->waiters);
	/*
	 * Full barrier (A), see the ordering comment above.
	 */
	smp_mb__after_atomic();
#endif
}

/*
 * Reflects a waiter being removed from the waitqueue by wakeup
 * paths.
 */
static inline void hb_waiters_dec(struct futex_hash_bucket *hb)
{
#ifdef CONFIG_SMP
	atomic_dec(&hb->waiters);
#endif
}

static inline int hb_waiters_pending(struct futex_hash_bucket *hb)
{
#ifdef CONFIG_SMP
	return atomic_read(&hb->waiters);
#else
	return 1;
#endif
}

/**
 * hash_futex - Return the hash bucket in the global hash
 * @key:	Pointer to the futex key for which the hash is calculated
 *
 * We hash on the keys returned from get_futex_key (see below) and return the
 * corresponding hash bucket in the global hash.
 */
static struct futex_hash_bucket *hash_futex(union futex_key *key)
{
	u32 hash = jhash2((u32*)&key->both.word,
			  (sizeof(key->both.word)+sizeof(key->both.ptr))/4,
			  key->both.offset);
	return &futex_queues[hash & (futex_hashsize - 1)];
}


/**
 * match_futex - Check whether two futex keys are equal
 * @key1:	Pointer to key1
 * @key2:	Pointer to key2
 *
 * Return 1 if two futex_keys are equal, 0 otherwise.
 */
static inline int match_futex(union futex_key *key1, union futex_key *key2)
{
	return (key1 && key2
		&& key1->both.word == key2->both.word
		&& key1->both.ptr == key2->both.ptr
		&& key1->both.offset == key2->both.offset);
}

/*
 * Take a reference to the resource addressed by a key.
 * Can be called while holding spinlocks.
 *
 */
static void get_futex_key_refs(union futex_key *key)
{
	if (!key->both.ptr)
		return;

	/*
	 * On MMU-less systems futexes are always "private" as there is no per
	 * process address space. We need the smp wmb nevertheless - yes,
	 * arch/blackfin has MMU-less SMP ...
	 */
	if (!IS_ENABLED(CONFIG_MMU)) {
		smp_mb(); /* explicit smp_mb(); (B) */
		return;
	}

	switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) {
	case FUT_OFF_INODE:
		ihold(key->shared.inode); /* implies smp_mb(); (B) */
		break;
	case FUT_OFF_MMSHARED:
		futex_get_mm(key); /* implies smp_mb(); (B) */
		break;
	default:
		/*
		 * Private futexes do not hold a reference on an inode or
		 * mm, therefore the only purpose of calling get_futex_key_refs()
		 * here is that we need the barrier for the lockless waiter check.
		 */
		smp_mb(); /* explicit smp_mb(); (B) */
	}
}

/*
 * Drop a reference to the resource addressed by a key.
 * The hash bucket spinlock must not be held. This is
 * a no-op for private futexes, see comment in the get
 * counterpart.
 */
static void drop_futex_key_refs(union futex_key *key)
{
	if (!key->both.ptr) {
		/* If we're here then we tried to put a key we failed to get */
		WARN_ON_ONCE(1);
		return;
	}

	if (!IS_ENABLED(CONFIG_MMU))
		return;

	switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) {
	case FUT_OFF_INODE:
		iput(key->shared.inode);
		break;
	case FUT_OFF_MMSHARED:
		mmdrop(key->private.mm);
		break;
	}
}

/**
 * get_futex_key() - Get parameters which are the keys for a futex
 * @uaddr:	virtual address of the futex
 * @fshared:	0 for a PROCESS_PRIVATE futex, 1 for PROCESS_SHARED
 * @key:	address where result is stored.
 * @rw:		mapping needs to be read/write (values: VERIFY_READ,
 *              VERIFY_WRITE)
 *
 * Return: a negative error code or 0
 *
 * The key words are stored in *key on success.
 *
 * For shared mappings, it's (page->index, file_inode(vma->vm_file),
 * offset_within_page).  For private mappings, it's (uaddr, current->mm).
 * We can usually work out the index without swapping in the page.
 *
 * lock_page() might sleep, the caller should not hold a spinlock.
 */
static int
get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw)
{
	unsigned long address = (unsigned long)uaddr;
	struct mm_struct *mm = current->mm;
	struct page *page, *tail;
	struct address_space *mapping;
	int err, ro = 0;

	/*
	 * The futex address must be "naturally" aligned.
	 */
	key->both.offset = address % PAGE_SIZE;
	if (unlikely((address % sizeof(u32)) != 0))
		return -EINVAL;
	address -= key->both.offset;

	if (unlikely(!access_ok(rw, uaddr, sizeof(u32))))
		return -EFAULT;

	if (unlikely(should_fail_futex(fshared)))
		return -EFAULT;

	/*
	 * PROCESS_PRIVATE futexes are fast.
	 * As the mm cannot disappear under us and the 'key' only needs
	 * virtual address, we don't even have to find the underlying vma.
	 * Note: We do have to check 'uaddr' is a valid user address,
	 *        but access_ok() should be faster than find_vma()
	 */
	if (!fshared) {
		key->private.mm = mm;
		key->private.address = address;
		get_futex_key_refs(key);  /* implies smp_mb(); (B) */
		return 0;
	}

again:
	/* Ignore any VERIFY_READ mapping (futex common case) */
	if (unlikely(should_fail_futex(fshared)))
		return -EFAULT;

	err = get_user_pages_fast(address, 1, 1, &page);
	/*
	 * If write access is not required (eg. FUTEX_WAIT), try
	 * and get read-only access.
	 */
	if (err == -EFAULT && rw == VERIFY_READ) {
		err = get_user_pages_fast(address, 1, 0, &page);
		ro = 1;
	}
	if (err < 0)
		return err;
	else
		err = 0;

	/*
	 * The treatment of mapping from this point on is critical. The page
	 * lock protects many things but in this context the page lock
	 * stabilizes mapping, prevents inode freeing in the shared
	 * file-backed region case and guards against movement to swap cache.
	 *
	 * Strictly speaking the page lock is not needed in all cases being
	 * considered here and the page lock forces unnecessary serialization.
	 * From this point on, mapping will be re-verified if necessary and
	 * the page lock will be acquired only if it is unavoidable.
	 *
	 * Mapping checks require the head page for any compound page so the
	 * head page and mapping is looked up now. For anonymous pages, it
	 * does not matter if the page splits in the future as the key is
	 * based on the address. For filesystem-backed pages, the tail is
	 * required as the index of the page determines the key. For
	 * base pages, there is no tail page and tail == page.
	 */
	tail = page;
	page = compound_head(page);
	mapping = READ_ONCE(page->mapping);

	/*
	 * If page->mapping is NULL, then it cannot be a PageAnon
	 * page; but it might be the ZERO_PAGE or in the gate area or
	 * in a special mapping (all cases which we are happy to fail);
	 * or it may have been a good file page when get_user_pages_fast
	 * found it, but truncated or holepunched or subjected to
	 * invalidate_complete_page2 before we got the page lock (also
	 * cases which we are happy to fail).  And we hold a reference,
	 * so refcount care in invalidate_complete_page's remove_mapping
	 * prevents drop_caches from setting mapping to NULL beneath us.
	 *
	 * The case we do have to guard against is when memory pressure made
	 * shmem_writepage move it from filecache to swapcache beneath us:
	 * an unlikely race, but we do need to retry for page->mapping.
	 */
	if (unlikely(!mapping)) {
		int shmem_swizzled;

		/*
		 * Page lock is required to identify which special case above
		 * applies. If this is really a shmem page then the page lock
		 * will prevent unexpected transitions.
		 */
		lock_page(page);
		shmem_swizzled = PageSwapCache(page) || page->mapping;
		unlock_page(page);
		put_page(page);

		if (shmem_swizzled)
			goto again;

		return -EFAULT;
	}

	/*
	 * Private mappings are handled in a simple way.
	 *
	 * If the futex key is stored on an anonymous page, then the associated
	 * object is the mm which is implicitly pinned by the calling process.
	 *
	 * NOTE: When userspace waits on a MAP_SHARED mapping, even if
	 * it's a read-only handle, it's expected that futexes attach to
	 * the object not the particular process.
	 */
	if (PageAnon(page)) {
		/*
		 * A RO anonymous page will never change and thus doesn't make
		 * sense for futex operations.
		 */
		if (unlikely(should_fail_futex(fshared)) || ro) {
			err = -EFAULT;
			goto out;
		}

		key->both.offset |= FUT_OFF_MMSHARED; /* ref taken on mm */
		key->private.mm = mm;
		key->private.address = address;

		get_futex_key_refs(key); /* implies smp_mb(); (B) */

	} else {
		struct inode *inode;

		/*
		 * The associated futex object in this case is the inode and
		 * the page->mapping must be traversed. Ordinarily this should
		 * be stabilised under page lock but it's not strictly
		 * necessary in this case as we just want to pin the inode, not
		 * update the radix tree or anything like that.
		 *
		 * The RCU read lock is taken as the inode is finally freed
		 * under RCU. If the mapping still matches expectations then the
		 * mapping->host can be safely accessed as being a valid inode.
		 */
		rcu_read_lock();

		if (READ_ONCE(page->mapping) != mapping) {
			rcu_read_unlock();
			put_page(page);

			goto again;
		}

		inode = READ_ONCE(mapping->host);
		if (!inode) {
			rcu_read_unlock();
			put_page(page);

			goto again;
		}

		/*
		 * Take a reference unless it is about to be freed. Previously
		 * this reference was taken by ihold under the page lock
		 * pinning the inode in place so i_lock was unnecessary. The
		 * only way for this check to fail is if the inode was
		 * truncated in parallel so warn for now if this happens.
		 *
		 * We are not calling into get_futex_key_refs() in file-backed
		 * cases, therefore a successful atomic_inc return below will
		 * guarantee that get_futex_key() will still imply smp_mb(); (B).
		 */
		if (WARN_ON_ONCE(!atomic_inc_not_zero(&inode->i_count))) {
			rcu_read_unlock();
			put_page(page);

			goto again;
		}

		/* Should be impossible but lets be paranoid for now */
		if (WARN_ON_ONCE(inode->i_mapping != mapping)) {
			err = -EFAULT;
			rcu_read_unlock();
			iput(inode);

			goto out;
		}

		key->both.offset |= FUT_OFF_INODE; /* inode-based key */
		key->shared.inode = inode;
		key->shared.pgoff = basepage_index(tail);
		rcu_read_unlock();
	}

out:
	put_page(page);
	return err;
}

static inline void put_futex_key(union futex_key *key)
{
	drop_futex_key_refs(key);
}

/**
 * fault_in_user_writeable() - Fault in user address and verify RW access
 * @uaddr:	pointer to faulting user space address
 *
 * Slow path to fixup the fault we just took in the atomic write
 * access to @uaddr.
 *
 * We have no generic implementation of a non-destructive write to the
 * user address. We know that we faulted in the atomic pagefault
 * disabled section so we can as well avoid the #PF overhead by
 * calling get_user_pages() right away.
 */
static int fault_in_user_writeable(u32 __user *uaddr)
{
	struct mm_struct *mm = current->mm;
	int ret;

	down_read(&mm->mmap_sem);
	ret = fixup_user_fault(current, mm, (unsigned long)uaddr,
			       FAULT_FLAG_WRITE, NULL);
	up_read(&mm->mmap_sem);

	return ret < 0 ? ret : 0;
}

/**
 * futex_top_waiter() - Return the highest priority waiter on a futex
 * @hb:		the hash bucket the futex_q's reside in
 * @key:	the futex key (to distinguish it from other futex futex_q's)
 *
 * Must be called with the hb lock held.
 */
static struct futex_q *futex_top_waiter(struct futex_hash_bucket *hb,
					union futex_key *key)
{
	struct futex_q *this;

	plist_for_each_entry(this, &hb->chain, list) {
		if (match_futex(&this->key, key))
			return this;
	}
	return NULL;
}

static int cmpxchg_futex_value_locked(u32 *curval, u32 __user *uaddr,
				      u32 uval, u32 newval)
{
	int ret;

	pagefault_disable();
	ret = futex_atomic_cmpxchg_inatomic(curval, uaddr, uval, newval);
	pagefault_enable();

	return ret;
}

static int get_futex_value_locked(u32 *dest, u32 __user *from)
{
	int ret;

	pagefault_disable();
	ret = __get_user(*dest, from);
	pagefault_enable();

	return ret ? -EFAULT : 0;
}


/*
 * PI code:
 */
static int refill_pi_state_cache(void)
{
	struct futex_pi_state *pi_state;

	if (likely(current->pi_state_cache))
		return 0;

	pi_state = kzalloc(sizeof(*pi_state), GFP_KERNEL);

	if (!pi_state)
		return -ENOMEM;

	INIT_LIST_HEAD(&pi_state->list);
	/* pi_mutex gets initialized later */
	pi_state->owner = NULL;
	atomic_set(&pi_state->refcount, 1);
	pi_state->key = FUTEX_KEY_INIT;

	current->pi_state_cache = pi_state;

	return 0;
}

static struct futex_pi_state * alloc_pi_state(void)
{
	struct futex_pi_state *pi_state = current->pi_state_cache;

	WARN_ON(!pi_state);
	current->pi_state_cache = NULL;

	return pi_state;
}

/*
 * Drops a reference to the pi_state object and frees or caches it
 * when the last reference is gone.
 *
 * Must be called with the hb lock held.
 */
static void put_pi_state(struct futex_pi_state *pi_state)
{
	if (!pi_state)
		return;

	if (!atomic_dec_and_test(&pi_state->refcount))
		return;

	/*
	 * If pi_state->owner is NULL, the owner is most probably dying
	 * and has cleaned up the pi_state already
	 */
	if (pi_state->owner) {
		raw_spin_lock_irq(&pi_state->owner->pi_lock);
		list_del_init(&pi_state->list);
		raw_spin_unlock_irq(&pi_state->owner->pi_lock);

		rt_mutex_proxy_unlock(&pi_state->pi_mutex, pi_state->owner);
	}

	if (current->pi_state_cache)
		kfree(pi_state);
	else {
		/*
		 * pi_state->list is already empty.
		 * clear pi_state->owner.
		 * refcount is at 0 - put it back to 1.
		 */
		pi_state->owner = NULL;
		atomic_set(&pi_state->refcount, 1);
		current->pi_state_cache = pi_state;
	}
}

/*
 * Look up the task based on what TID userspace gave us.
 * We don't trust it.
 */
static struct task_struct * futex_find_get_task(pid_t pid)
{
	struct task_struct *p;

	rcu_read_lock();
	p = find_task_by_vpid(pid);
	if (p)
		get_task_struct(p);

	rcu_read_unlock();

	return p;
}

/*
 * This task is holding PI mutexes at exit time => bad.
 * Kernel cleans up PI-state, but userspace is likely hosed.
 * (Robust-futex cleanup is separate and might save the day for userspace.)
 */
void exit_pi_state_list(struct task_struct *curr)
{
	struct list_head *next, *head = &curr->pi_state_list;
	struct futex_pi_state *pi_state;
	struct futex_hash_bucket *hb;
	union futex_key key = FUTEX_KEY_INIT;

	if (!futex_cmpxchg_enabled)
		return;
	/*
	 * We are a ZOMBIE and nobody can enqueue itself on
	 * pi_state_list anymore, but we have to be careful
	 * versus waiters unqueueing themselves:
	 */
	raw_spin_lock_irq(&curr->pi_lock);
	while (!list_empty(head)) {

		next = head->next;
		pi_state = list_entry(next, struct futex_pi_state, list);
		key = pi_state->key;
		hb = hash_futex(&key);
		raw_spin_unlock_irq(&curr->pi_lock);

		spin_lock(&hb->lock);

		raw_spin_lock_irq(&curr->pi_lock);
		/*
		 * We dropped the pi-lock, so re-check whether this
		 * task still owns the PI-state:
		 */
		if (head->next != next) {
			spin_unlock(&hb->lock);
			continue;
		}

		WARN_ON(pi_state->owner != curr);
		WARN_ON(list_empty(&pi_state->list));
		list_del_init(&pi_state->list);
		pi_state->owner = NULL;
		raw_spin_unlock_irq(&curr->pi_lock);

		rt_mutex_unlock(&pi_state->pi_mutex);

		spin_unlock(&hb->lock);

		raw_spin_lock_irq(&curr->pi_lock);
	}
	raw_spin_unlock_irq(&curr->pi_lock);
}

/*
 * We need to check the following states:
 *
 *      Waiter | pi_state | pi->owner | uTID      | uODIED | ?
 *
 * [1]  NULL   | ---      | ---       | 0         | 0/1    | Valid
 * [2]  NULL   | ---      | ---       | >0        | 0/1    | Valid
 *
 * [3]  Found  | NULL     | --        | Any       | 0/1    | Invalid
 *
 * [4]  Found  | Found    | NULL      | 0         | 1      | Valid
 * [5]  Found  | Found    | NULL      | >0        | 1      | Invalid
 *
 * [6]  Found  | Found    | task      | 0         | 1      | Valid
 *
 * [7]  Found  | Found    | NULL      | Any       | 0      | Invalid
 *
 * [8]  Found  | Found    | task      | ==taskTID | 0/1    | Valid
 * [9]  Found  | Found    | task      | 0         | 0      | Invalid
 * [10] Found  | Found    | task      | !=taskTID | 0/1    | Invalid
 *
 * [1]	Indicates that the kernel can acquire the futex atomically. We
 *	came here due to a stale FUTEX_WAITERS/FUTEX_OWNER_DIED bit.
 *
 * [2]	Valid, if TID does not belong to a kernel thread. If no matching
 *      thread is found then it indicates that the owner TID has died.
 *
 * [3]	Invalid. The waiter is queued on a non PI futex
 *
 * [4]	Valid state after exit_robust_list(), which sets the user space
 *	value to FUTEX_WAITERS | FUTEX_OWNER_DIED.
 *
 * [5]	The user space value got manipulated between exit_robust_list()
 *	and exit_pi_state_list()
 *
 * [6]	Valid state after exit_pi_state_list() which sets the new owner in
 *	the pi_state but cannot access the user space value.
 *
 * [7]	pi_state->owner can only be NULL when the OWNER_DIED bit is set.
 *
 * [8]	Owner and user space value match
 *
 * [9]	There is no transient state which sets the user space TID to 0
 *	except exit_robust_list(), but this is indicated by the
 *	FUTEX_OWNER_DIED bit. See [4]
 *
 * [10] There is no transient state which leaves owner and user space
 *	TID out of sync.
 */

/*
 * Validate that the existing waiter has a pi_state and sanity check
 * the pi_state against the user space value. If correct, attach to
 * it.
 */
static int attach_to_pi_state(u32 uval, struct futex_pi_state *pi_state,
			      struct futex_pi_state **ps)
{
	pid_t pid = uval & FUTEX_TID_MASK;

	/*
	 * Userspace might have messed up non-PI and PI futexes [3]
	 */
	if (unlikely(!pi_state))
		return -EINVAL;

	WARN_ON(!atomic_read(&pi_state->refcount));

	/*
	 * Handle the owner died case:
	 */
	if (uval & FUTEX_OWNER_DIED) {
		/*
		 * exit_pi_state_list sets owner to NULL and wakes the
		 * topmost waiter. The task which acquires the
		 * pi_state->pi_mutex will fix up the owner.
		 */
		if (!pi_state->owner) {
			/*
			 * No pi state owner, but the user space TID
			 * is not 0. Inconsistent state. [5]
			 */
			if (pid)
				return -EINVAL;
			/*
			 * Take a ref on the state and return success. [4]
			 */
			goto out_state;
		}

		/*
		 * If TID is 0, then either the dying owner has not
		 * yet executed exit_pi_state_list() or some waiter
		 * acquired the rtmutex in the pi state, but did not
		 * yet fixup the TID in user space.
		 *
		 * Take a ref on the state and return success. [6]
		 */
		if (!pid)
			goto out_state;
	} else {
		/*
		 * If the owner died bit is not set, then the pi_state
		 * must have an owner. [7]
		 */
		if (!pi_state->owner)
			return -EINVAL;
	}

	/*
	 * Bail out if user space manipulated the futex value. If pi
	 * state exists then the owner TID must be the same as the
	 * user space TID. [9/10]
	 */
	if (pid != task_pid_vnr(pi_state->owner))
		return -EINVAL;
out_state:
	atomic_inc(&pi_state->refcount);
	*ps = pi_state;
	return 0;
}

/*
 * Lookup the task for the TID provided from user space and attach to
 * it after doing proper sanity checks.
 */
static int attach_to_pi_owner(u32 uval, union futex_key *key,
			      struct futex_pi_state **ps)
{
	pid_t pid = uval & FUTEX_TID_MASK;
	struct futex_pi_state *pi_state;
	struct task_struct *p;

	/*
	 * We are the first waiter - try to look up the real owner and attach
	 * the new pi_state to it, but bail out when TID = 0 [1]
	 */
	if (!pid)
		return -ESRCH;
	p = futex_find_get_task(pid);
	if (!p)
		return -ESRCH;

	if (unlikely(p->flags & PF_KTHREAD)) {
		put_task_struct(p);
		return -EPERM;
	}

	/*
	 * We need to look at the task state flags to figure out
	 * whether the task is exiting. To protect against the do_exit
	 * change of the task flags, we do this protected by
	 * p->pi_lock:
	 */
	raw_spin_lock_irq(&p->pi_lock);
	if (unlikely(p->flags & PF_EXITING)) {
		/*
		 * The task is on the way out. When PF_EXITPIDONE is
		 * set, we know that the task has finished the
		 * cleanup:
		 */
		int ret = (p->flags & PF_EXITPIDONE) ? -ESRCH : -EAGAIN;

		raw_spin_unlock_irq(&p->pi_lock);
		put_task_struct(p);
		return ret;
	}

	/*
	 * No existing pi state. First waiter. [2]
	 */
	pi_state = alloc_pi_state();

	/*
	 * Initialize the pi_mutex in locked state and make @p
	 * the owner of it:
	 */
	rt_mutex_init_proxy_locked(&pi_state->pi_mutex, p);

	/* Store the key for possible exit cleanups: */
	pi_state->key = *key;

	WARN_ON(!list_empty(&pi_state->list));
	list_add(&pi_state->list, &p->pi_state_list);
	pi_state->owner = p;
	raw_spin_unlock_irq(&p->pi_lock);

	put_task_struct(p);

	*ps = pi_state;

	return 0;
}

static int lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
			   union futex_key *key, struct futex_pi_state **ps)
{
	struct futex_q *match = futex_top_waiter(hb, key);

	/*
	 * If there is a waiter on that futex, validate it and
	 * attach to the pi_state when the validation succeeds.
	 */
	if (match)
		return attach_to_pi_state(uval, match->pi_state, ps);

	/*
	 * We are the first waiter - try to look up the owner based on
	 * @uval and attach to it.
	 */
	return attach_to_pi_owner(uval, key, ps);
}

static int lock_pi_update_atomic(u32 __user *uaddr, u32 uval, u32 newval)
{
	u32 uninitialized_var(curval);

	if (unlikely(should_fail_futex(true)))
		return -EFAULT;

	if (unlikely(cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)))
		return -EFAULT;

	/* If user space value changed, let the caller retry */
	return curval != uval ? -EAGAIN : 0;
}

/**
 * futex_lock_pi_atomic() - Atomic work required to acquire a pi aware futex
 * @uaddr:		the pi futex user address
 * @hb:			the pi futex hash bucket
 * @key:		the futex key associated with uaddr and hb
 * @ps:			the pi_state pointer where we store the result of the
 *			lookup
 * @task:		the task to perform the atomic lock work for.  This will
 *			be "current" except in the case of requeue pi.
 * @set_waiters:	force setting the FUTEX_WAITERS bit (1) or not (0)
 *
 * Return:
 *  0 - ready to wait;
 *  1 - acquired the lock;
 * <0 - error
 *
 * The hb->lock and futex_key refs shall be held by the caller.
 */
static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb,
				union futex_key *key,
				struct futex_pi_state **ps,
				struct task_struct *task, int set_waiters)
{
	u32 uval, newval, vpid = task_pid_vnr(task);
	struct futex_q *match;
	int ret;

	/*
	 * Read the user space value first so we can validate a few
	 * things before proceeding further.
	 */
	if (get_futex_value_locked(&uval, uaddr))
		return -EFAULT;

	if (unlikely(should_fail_futex(true)))
		return -EFAULT;

	/*
	 * Detect deadlocks.
	 */
	if ((unlikely((uval & FUTEX_TID_MASK) == vpid)))
		return -EDEADLK;

	if ((unlikely(should_fail_futex(true))))
		return -EDEADLK;

	/*
	 * Lookup existing state first. If it exists, try to attach to
	 * its pi_state.
	 */
	match = futex_top_waiter(hb, key);
	if (match)
		return attach_to_pi_state(uval, match->pi_state, ps);

	/*
	 * No waiter and user TID is 0. We are here because the
	 * waiters or the owner died bit is set or called from
	 * requeue_cmp_pi or for whatever reason something took the
	 * syscall.
	 */
	if (!(uval & FUTEX_TID_MASK)) {
		/*
		 * We take over the futex. No other waiters and the user space
		 * TID is 0. We preserve the owner died bit.
		 */
		newval = uval & FUTEX_OWNER_DIED;
		newval |= vpid;

		/* The futex requeue_pi code can enforce the waiters bit */
		if (set_waiters)
			newval |= FUTEX_WAITERS;

		ret = lock_pi_update_atomic(uaddr, uval, newval);
		/* If the take over worked, return 1 */
		return ret < 0 ? ret : 1;
	}

	/*
	 * First waiter. Set the waiters bit before attaching ourself to
	 * the owner. If owner tries to unlock, it will be forced into
	 * the kernel and blocked on hb->lock.
	 */
	newval = uval | FUTEX_WAITERS;
	ret = lock_pi_update_atomic(uaddr, uval, newval);
	if (ret)
		return ret;
	/*
	 * If the update of the user space value succeeded, we try to
	 * attach to the owner. If that fails, no harm done, we only
	 * set the FUTEX_WAITERS bit in the user space variable.
	 */
	return attach_to_pi_owner(uval, key, ps);
}

/**
 * __unqueue_futex() - Remove the futex_q from its futex_hash_bucket
 * @q:	The futex_q to unqueue
 *
 * The q->lock_ptr must not be NULL and must be held by the caller.
 */
static void __unqueue_futex(struct futex_q *q)
{
	struct futex_hash_bucket *hb;

	if (WARN_ON_SMP(!q->lock_ptr || !spin_is_locked(q->lock_ptr))
	    || WARN_ON(plist_node_empty(&q->list)))
		return;

	hb = container_of(q->lock_ptr, struct futex_hash_bucket, lock);
	plist_del(&q->list, &hb->chain);
	hb_waiters_dec(hb);
}

/*
 * The hash bucket lock must be held when this is called.
 * Afterwards, the futex_q must not be accessed. Callers
 * must ensure to later call wake_up_q() for the actual
 * wakeups to occur.
 */
static void mark_wake_futex(struct wake_q_head *wake_q, struct futex_q *q)
{
	struct task_struct *p = q->task;

	if (WARN(q->pi_state || q->rt_waiter, "refusing to wake PI futex\n"))
		return;

	/*
	 * Queue the task for later wakeup for after we've released
	 * the hb->lock. wake_q_add() grabs reference to p.
	 */
	wake_q_add(wake_q, p);
	__unqueue_futex(q);
	/*
	 * The waiting task can free the futex_q as soon as
	 * q->lock_ptr = NULL is written, without taking any locks. A
	 * memory barrier is required here to prevent the following
	 * store to lock_ptr from getting ahead of the plist_del.
	 */
	smp_wmb();
	q->lock_ptr = NULL;
}

static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this,
			 struct futex_hash_bucket *hb)
{
	struct task_struct *new_owner;
	struct futex_pi_state *pi_state = this->pi_state;
	u32 uninitialized_var(curval), newval;
	DEFINE_WAKE_Q(wake_q);
	bool deboost;
	int ret = 0;

	if (!pi_state)
		return -EINVAL;

	/*
	 * If current does not own the pi_state then the futex is
	 * inconsistent and user space fiddled with the futex value.
	 */
	if (pi_state->owner != current)
		return -EINVAL;

	raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
	new_owner = rt_mutex_next_owner(&pi_state->pi_mutex);

	/*
	 * It is possible that the next waiter (the one that brought
	 * this owner to the kernel) timed out and is no longer
	 * waiting on the lock.
	 */
	if (!new_owner)
		new_owner = this->task;

	/*
	 * We pass it to the next owner. The WAITERS bit is always
	 * kept enabled while there is PI state around. We cleanup the
	 * owner died bit, because we are the owner.
	 */
	newval = FUTEX_WAITERS | task_pid_vnr(new_owner);

	if (unlikely(should_fail_futex(true)))
		ret = -EFAULT;

	if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)) {
		ret = -EFAULT;
	} else if (curval != uval) {
		/*
		 * If an unconditional UNLOCK_PI operation (user space did not
		 * try the TID->0 transition) raced with a waiter setting the
		 * FUTEX_WAITERS flag between get_user() and locking the hash
		 * bucket lock, retry the operation.
		 */
		if ((FUTEX_TID_MASK & curval) == uval)
			ret = -EAGAIN;
		else
			ret = -EINVAL;
	}
	if (ret) {
		raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
		return ret;
	}

	raw_spin_lock(&pi_state->owner->pi_lock);
	WARN_ON(list_empty(&pi_state->list));
	list_del_init(&pi_state->list);
	raw_spin_unlock(&pi_state->owner->pi_lock);

	raw_spin_lock(&new_owner->pi_lock);
	WARN_ON(!list_empty(&pi_state->list));
	list_add(&pi_state->list, &new_owner->pi_state_list);
	pi_state->owner = new_owner;
	raw_spin_unlock(&new_owner->pi_lock);

	raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);

	deboost = rt_mutex_futex_unlock(&pi_state->pi_mutex, &wake_q);

	/*
	 * First unlock HB so the waiter does not spin on it once it is woken
	 * up. Second, wake up the waiter before the priority is adjusted. If we
	 * deboost first (and lose our higher priority), then the task might get
	 * scheduled away before the wake up can take place.
	 */
	spin_unlock(&hb->lock);
	wake_up_q(&wake_q);
	if (deboost)
		rt_mutex_adjust_prio(current);

	return 0;
}

/*
 * Express the locking dependencies for lockdep:
 */
static inline void
double_lock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2)
{
	if (hb1 <= hb2) {
		spin_lock(&hb1->lock);
		if (hb1 < hb2)
			spin_lock_nested(&hb2->lock, SINGLE_DEPTH_NESTING);
	} else { /* hb1 > hb2 */
		spin_lock(&hb2->lock);
		spin_lock_nested(&hb1->lock, SINGLE_DEPTH_NESTING);
	}
}

static inline void
double_unlock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2)
{
	spin_unlock(&hb1->lock);
	if (hb1 != hb2)
		spin_unlock(&hb2->lock);
}

/*
 * Wake up waiters matching bitset queued on this futex (uaddr).
 */
static int
futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset)
{
	struct futex_hash_bucket *hb;
	struct futex_q *this, *next;
	union futex_key key = FUTEX_KEY_INIT;
	int ret;
	DEFINE_WAKE_Q(wake_q);

	if (!bitset)
		return -EINVAL;

	ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key, VERIFY_READ);
	if (unlikely(ret != 0))
		goto out;

	hb = hash_futex(&key);

	/* Make sure we really have tasks to wakeup */
	if (!hb_waiters_pending(hb))
		goto out_put_key;

	spin_lock(&hb->lock);

	plist_for_each_entry_safe(this, next, &hb->chain, list) {
		if (match_futex(&this->key, &key)) {
			if (this->pi_state || this->rt_waiter) {
				ret = -EINVAL;
				break;
			}

			/* Check if one of the bits is set in both bitsets */
			if (!(this->bitset & bitset))
				continue;

			mark_wake_futex(&wake_q, this);
			if (++ret >= nr_wake)
				break;
		}
	}

	spin_unlock(&hb->lock);
	wake_up_q(&wake_q);
out_put_key:
	put_futex_key(&key);
out:
	return ret;
}
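
/*
 * Illustrative user space use of the bitset matching done above (a sketch,
 * not part of this file): a waiter passes a wakeup bitset via
 * FUTEX_WAIT_BITSET and is only woken by a FUTEX_WAKE_BITSET whose bitset
 * intersects it. Plain FUTEX_WAIT/FUTEX_WAKE use FUTEX_BITSET_MATCH_ANY
 * and therefore always match:
 *
 *	// Wait on "channel" bit 0 only:
 *	syscall(SYS_futex, &word, FUTEX_WAIT_BITSET, val, NULL, NULL, 0x1);
 *
 *	// Wakes the waiter above (0x1 & 0x1 != 0):
 *	syscall(SYS_futex, &word, FUTEX_WAKE_BITSET, 1, NULL, NULL, 0x1);
 *
 *	// Does not wake it (0x2 & 0x1 == 0):
 *	syscall(SYS_futex, &word, FUTEX_WAKE_BITSET, 1, NULL, NULL, 0x2);
 */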

/*
 * Wake up all waiters hashed on the physical page that is mapped
 * to this virtual address:
 */
static int
futex_wake_op(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2,
	      int nr_wake, int nr_wake2, int op)
{
	union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT;
	struct futex_hash_bucket *hb1, *hb2;
	struct futex_q *this, *next;
	int ret, op_ret;
	DEFINE_WAKE_Q(wake_q);

retry:
	ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, VERIFY_READ);