/*	$NetBSD: sys_futex.c,v 1.11.2.1 2020/11/01 15:16:43 thorpej Exp $	*/

/*-
 * Copyright (c) 2018, 2019, 2020 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Taylor R. Campbell and Jason R. Thorpe.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: sys_futex.c,v 1.11.2.1 2020/11/01 15:16:43 thorpej Exp $");

/*
 * Futexes
 *
 *	The futex system call coordinates notifying threads waiting for
 *	changes on a 32-bit word of memory.  The word can be managed by
 *	CPU atomic operations in userland, without system calls, as long
 *	as there is no contention.
 *
 *	The simplest use case demonstrating the utility is:
 *
 *		// 32-bit word of memory shared among threads or
 *		// processes in userland.  lock & 1 means owned;
 *		// lock & 2 means there are waiters waiting.
 *		volatile int lock = 0;
 *
 *		int v;
 *
 *		// Acquire a lock.
 *		do {
 *			v = lock;
 *			if (v & 1) {
 *				// Lock is held.  Set a bit to say that
 *				// there are waiters, and wait for lock
 *				// to change to anything other than v;
 *				// then retry.
 *				if (atomic_cas_uint(&lock, v, v | 2) != v)
 *					continue;
 *				futex(FUTEX_WAIT, &lock, v | 2, NULL, NULL, 0);
 *				continue;
 *			}
 *		} while (atomic_cas_uint(&lock, v, v | 1) != v);
 *		membar_enter();
 *
 *		...
 *
 *		// Release the lock.  Optimistically assume there are
 *		// no waiters first until demonstrated otherwise.
 *		membar_exit();
 *		if (atomic_cas_uint(&lock, 1, 0) != 1) {
 *			// There may be waiters.
 *			v = atomic_swap_uint(&lock, 0);
 *			// If there are still waiters, wake one.
 *			if (v & 2)
 *				futex(FUTEX_WAKE, &lock, 1, NULL, NULL, 0);
 *		}
 *
 *	The goal is to avoid the futex system call unless there is
 *	contention; then if there is contention, to guarantee no missed
 *	wakeups.
 *
 *	For a simple implementation, futex(FUTEX_WAIT) could queue
 *	itself to be woken, double-check the lock word, and then sleep;
 *	spurious wakeups are generally a fact of life, so any
 *	FUTEX_WAKE could just wake every FUTEX_WAIT in the system.
 *
 *	If this were all there is to it, we could then increase
 *	parallelism by refining the approximation: partition the
 *	waiters into buckets by hashing the lock addresses to reduce
 *	the incidence of spurious wakeups.  But this is not all.
 *
 *	The futex(FUTEX_CMP_REQUEUE, &lock, n, &lock2, m, val)
 *	operation not only wakes n waiters on lock if lock == val, but
 *	also _transfers_ m additional waiters to lock2.  Unless wakeups
 *	on lock2 also trigger wakeups on lock, we cannot move waiters
 *	to lock2 if they merely share the same hash as waiters on lock.
 *	Thus, we can't approximately distribute waiters into queues by
 *	a hash function; we must distinguish futex queues exactly by
 *	lock address.
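 *
 *	(An illustrative sketch of the classic use, assuming a userland
 *	condition variable whose sequence word seq is paired with a
 *	mutex word lock: a broadcast can wake one waiter and requeue
 *	the rest onto the mutex, avoiding a thundering herd of wakeups:
 *
 *		v = seq;
 *		...
 *		futex(FUTEX_CMP_REQUEUE, &seq, 1, &lock, INT_MAX, v);
 *
 *	If seq has changed from v in the meantime, the call fails with
 *	EAGAIN and the broadcast can be retried.)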
 *
 *	For now, we use a global red/black tree to index futexes.  This
 *	should be replaced by a lockless radix tree with a thread to
 *	free entries no longer in use once all lookups on all CPUs have
 *	completed.
 *
 *	Specifically, we maintain two maps:
 *
 *	futex_tab.va[vmspace, va] for private futexes
 *	futex_tab.oa[uvm_voaddr] for shared futexes
 *
 *	This implementation does not support priority inheritance.
 */

#include <sys/types.h>
#include <sys/atomic.h>
#include <sys/condvar.h>
#include <sys/futex.h>
#include <sys/mutex.h>
#include <sys/rbtree.h>
#include <sys/sleepq.h>
#include <sys/queue.h>
#include <sys/sdt.h>

#include <sys/syscall.h>
#include <sys/syscallargs.h>
#include <sys/syscallvar.h>

#include <uvm/uvm_extern.h>

/*
 * DTrace probes.
 */
SDT_PROVIDER_DEFINE(futex);

/* entry: uaddr, val, bitset, timeout, clkflags, fflags */
/* exit: error */
SDT_PROBE_DEFINE6(futex, func, wait, entry, "int *", "int", "int",
		  "struct timespec *", "int", "int");
SDT_PROBE_DEFINE1(futex, func, wait, exit, "int");

/* entry: uaddr, nwake, bitset, fflags */
/* exit: error, nwoken */
SDT_PROBE_DEFINE4(futex, func, wake, entry, "int *", "int", "int", "int");
SDT_PROBE_DEFINE2(futex, func, wake, exit, "int", "int");

/* entry: uaddr, nwake, uaddr2, nrequeue, fflags */
/* exit: error, nwoken */
SDT_PROBE_DEFINE5(futex, func, requeue, entry, "int *", "int", "int *", "int",
		  "int");
SDT_PROBE_DEFINE2(futex, func, requeue, exit, "int", "int");

/* entry: uaddr, nwake, uaddr2, nrequeue, val3, fflags */
/* exit: error, nwoken */
SDT_PROBE_DEFINE6(futex, func, cmp_requeue, entry, "int *", "int", "int *",
		  "int", "int", "int");
SDT_PROBE_DEFINE2(futex, func, cmp_requeue, exit, "int", "int");

/* entry: uaddr, nwake, uaddr2, nwake2, wakeop, fflags */
/* exit: error, nwoken */
SDT_PROBE_DEFINE6(futex, func, wake_op, entry, "int *", "int", "int *", "int",
		  "int", "int");
SDT_PROBE_DEFINE2(futex, func, wake_op, exit, "int", "int");

/* entry: uaddr, val, r/w, abstime, fflags */
/* exit: error */
SDT_PROBE_DEFINE5(futex, func, rw_wait, entry, "int *", "int", "int",
		  "struct timespec *", "int");
SDT_PROBE_DEFINE1(futex, func, rw_wait, exit, "int");

/* entry: uaddr, val, fflags */
/* exit: error, nwoken */
SDT_PROBE_DEFINE3(futex, func, rw_handoff, entry, "int *", "int", "int");
SDT_PROBE_DEFINE2(futex, func, rw_handoff, exit, "int", "int");

SDT_PROBE_DEFINE0(futex, wait, finish, normally);
SDT_PROBE_DEFINE0(futex, wait, finish, wakerace);
SDT_PROBE_DEFINE0(futex, wait, finish, aborted);

/* entry: timo */
/* exit: error */
SDT_PROBE_DEFINE1(futex, wait, sleepq_block, entry, "int");
SDT_PROBE_DEFINE1(futex, wait, sleepq_block, exit, "int");

/*
 * Lock order:
 *
 *	futex_tab.lock
 *	futex::fx_op_lock		ordered by kva of struct futex
 *	 -> futex::fx_sq_lock		ordered by kva of sleepq lock
 *
 * N.B. multiple futexes can share a single sleepq lock.
 */
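
/*
 * For example (an illustrative sketch): operations that need both a
 * futex's op lock and its sleepq lock, such as futex_wait(), take
 * them in this order and never the reverse:
 *
 *	futex_op_lock(f);			// adaptive; may sleep
 *	mutex_spin_enter(f->fx_sq_lock);	// spin
 *	...
 *	mutex_spin_exit(f->fx_sq_lock);
 *	futex_op_unlock(f);
 */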

/*
 * union futex_key
 *
 *	A futex is addressed either by a vmspace+va (private) or by
 *	a uvm_voaddr (shared).
 */
union futex_key {
	struct {
		struct vmspace			*vmspace;
		vaddr_t				va;
	}			fk_private;
	struct uvm_voaddr	fk_shared;
};

/*
 * struct futex
 *
 *	Kernel state for a futex located at a particular address in a
 *	particular virtual address space.
 *
 *	N.B. fx_refcnt is an unsigned long because we need to be able
 *	to operate on it atomically on all systems while at the same
 *	time rendering practically impossible the chance of it reaching
 *	its max value.  In practice, we're limited by the number of LWPs
 *	that can be present on the system at any given time, and the
 *	assumption is that limit will be good enough on a 32-bit platform.
 *	See futex_wake() for why overflow needs to be avoided.
 *
 *	XXX Since futex addresses must be 4-byte aligned, we could
 *	XXX squirrel away fx_shared and fx_on_tree bits in the "va"
 *	XXX field of the key.  Worth it?
 */
struct futex {
	union futex_key		fx_key;
	struct rb_node		fx_node;
	unsigned long		fx_refcnt;
	bool			fx_shared;
	bool			fx_on_tree;
	uint8_t			fx_class;

	kmutex_t		fx_op_lock;	/* adaptive */
	kmutex_t *		fx_sq_lock;	/* &sleepq_locks[...] */
	sleepq_t		fx_sqs[2];	/* 0=reader, 1=writer */
	unsigned int		fx_nwaiters[2];
};

/*
 * futex classes: Some futex operations can only be carried out on
 * futexes that are known to be abiding by a certain protocol.  These
 * classes are assigned to a futex when created due to a wait event,
 * and when subsequent wake or requeue operations are issued, the
 * class is checked at futex lookup time.  If the class does not match,
 * EINVAL is the result.
 */
#define	FUTEX_CLASS_ANY		0		/* match any class in lookup */
#define	FUTEX_CLASS_NORMAL	1		/* normal futex */
#define	FUTEX_CLASS_RWLOCK	2		/* rwlock futex */
#define	FUTEX_CLASS_PI		3		/* for FUTEX_*_PI ops */
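
/*
 * For example (illustrative): an address first waited on with
 * FUTEX_RW_WAIT gets a futex of class FUTEX_CLASS_RWLOCK; a later
 * plain FUTEX_WAKE on the same address looks the futex up with
 * FUTEX_CLASS_NORMAL, fails the class check, and returns EINVAL.
 */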

/* sleepq definitions */
#define	FUTEX_READERQ		0
#define	FUTEX_WRITERQ		1

CTASSERT(FUTEX_READERQ == FUTEX_RW_READER);
CTASSERT(FUTEX_WRITERQ == FUTEX_RW_WRITER);

static const char futex_wmesg[] = "futex";

static void	futex_unsleep(lwp_t *, bool);

static syncobj_t futex_syncobj = {
	.sobj_flag	= SOBJ_SLEEPQ_SORTED,
	.sobj_unsleep	= futex_unsleep,
	.sobj_changepri	= sleepq_changepri,
	.sobj_lendpri	= sleepq_lendpri,
	.sobj_owner	= syncobj_noowner,
};

/*
 * futex_tab
 *
 *	Global trees of futexes by vmspace/va and VM object address.
 *
 *	XXX This obviously doesn't scale in parallel.  We could use a
 *	pserialize-safe data structure, but there may be a high cost to
 *	frequent deletion since we don't cache futexes after we're done
 *	with them.  We could use hashed locks.  But for now, just make
 *	sure userland can't DoS the serial performance, by using a
 *	balanced binary tree for lookup.
 *
 *	XXX We could use a per-process tree for the table indexed by
 *	virtual address to reduce contention between processes.
 */
static struct {
	kmutex_t	lock;
	struct rb_tree	va;
	struct rb_tree	oa;
} futex_tab __cacheline_aligned;

static int
compare_futex_key(void *cookie, const void *n, const void *k)
{
	const struct futex *fa = n;
	const union futex_key *fka = &fa->fx_key;
	const union futex_key *fkb = k;

	if ((uintptr_t)fka->fk_private.vmspace <
	    (uintptr_t)fkb->fk_private.vmspace)
		return -1;
	if ((uintptr_t)fka->fk_private.vmspace >
	    (uintptr_t)fkb->fk_private.vmspace)
		return +1;
	if (fka->fk_private.va < fkb->fk_private.va)
		return -1;
	if (fka->fk_private.va > fkb->fk_private.va)
		return +1;
	return 0;
}

static int
compare_futex(void *cookie, const void *na, const void *nb)
{
	const struct futex *fa = na;
	const struct futex *fb = nb;

	return compare_futex_key(cookie, fa, &fb->fx_key);
}

static const rb_tree_ops_t futex_rb_ops = {
	.rbto_compare_nodes = compare_futex,
	.rbto_compare_key = compare_futex_key,
	.rbto_node_offset = offsetof(struct futex, fx_node),
};

static int
compare_futex_shared_key(void *cookie, const void *n, const void *k)
{
	const struct futex *fa = n;
	const union futex_key *fka = &fa->fx_key;
	const union futex_key *fkb = k;

	return uvm_voaddr_compare(&fka->fk_shared, &fkb->fk_shared);
}

static int
compare_futex_shared(void *cookie, const void *na, const void *nb)
{
	const struct futex *fa = na;
	const struct futex *fb = nb;

	return compare_futex_shared_key(cookie, fa, &fb->fx_key);
}

static const rb_tree_ops_t futex_shared_rb_ops = {
	.rbto_compare_nodes = compare_futex_shared,
	.rbto_compare_key = compare_futex_shared_key,
	.rbto_node_offset = offsetof(struct futex, fx_node),
};

/*
 * futex_sq_lock2(f, f2)
 *
 *	Acquire the sleepq locks for f and f2, which may be null, or
 *	which may be the same.  If they are distinct, an arbitrary total
 *	order is chosen on the locks.
 *
 *	Callers should only ever acquire multiple sleepq locks
 *	simultaneously using futex_sq_lock2.
 */
static void
futex_sq_lock2(struct futex * const f, struct futex * const f2)
{

	/*
	 * If both are null, do nothing; if one is null and the other
	 * is not, lock the other and be done with it.
	 */
	if (f == NULL && f2 == NULL) {
		return;
	} else if (f == NULL) {
		mutex_spin_enter(f2->fx_sq_lock);
		return;
	} else if (f2 == NULL) {
		mutex_spin_enter(f->fx_sq_lock);
		return;
	}

	kmutex_t * const m = f->fx_sq_lock;
	kmutex_t * const m2 = f2->fx_sq_lock;

	/* If both locks are the same, acquire only one. */
	if (m == m2) {
		mutex_spin_enter(m);
		return;
	}

	/* Otherwise, use the ordering on the kva of the lock pointer.  */
	if ((uintptr_t)m < (uintptr_t)m2) {
		mutex_spin_enter(m);
		mutex_spin_enter(m2);
	} else {
		mutex_spin_enter(m2);
		mutex_spin_enter(m);
	}
}

/*
 * futex_sq_unlock2(f, f2)
 *
 *	Release the sleep queue locks for f and f2, which may be null, or
 *	which may have the same underlying lock.
 */
static void
futex_sq_unlock2(struct futex * const f, struct futex * const f2)
{

	/*
	 * If both are null, do nothing; if one is null and the other
	 * is not, unlock the other and be done with it.
	 */
	if (f == NULL && f2 == NULL) {
		return;
	} else if (f == NULL) {
		mutex_spin_exit(f2->fx_sq_lock);
		return;
	} else if (f2 == NULL) {
		mutex_spin_exit(f->fx_sq_lock);
		return;
	}

	kmutex_t * const m = f->fx_sq_lock;
	kmutex_t * const m2 = f2->fx_sq_lock;

	/* If both locks are the same, release only one. */
	if (m == m2) {
		mutex_spin_exit(m);
		return;
	}

	/* Otherwise, use the ordering on the kva of the lock pointer.  */
	if ((uintptr_t)m < (uintptr_t)m2) {
		mutex_spin_exit(m2);
		mutex_spin_exit(m);
	} else {
		mutex_spin_exit(m);
		mutex_spin_exit(m2);
	}
}

/*
 * futex_load(uaddr, kaddr)
 *
 *	Perform a single atomic load to read *uaddr, and return the
 *	result in *kaddr.  Return 0 on success, EFAULT if uaddr is not
 *	mapped.
 */
static inline int
futex_load(int *uaddr, int *kaddr)
{
	return ufetch_int((u_int *)uaddr, (u_int *)kaddr);
}

/*
 * futex_test(uaddr, expected)
 *
 *	True if *uaddr == expected.  False if *uaddr != expected, or if
 *	uaddr is not mapped.
 */
static bool
futex_test(int *uaddr, int expected)
{
	int val;
	int error;

	error = futex_load(uaddr, &val);
	if (error)
		return false;
	return val == expected;
}

static pool_cache_t	futex_cache		__read_mostly;

static int	futex_ctor(void *, void *, int);
static void	futex_dtor(void *, void *);

/*
 * futex_sys_init()
 *
 *	Initialize the futex subsystem.
 */
void
futex_sys_init(void)
{

	mutex_init(&futex_tab.lock, MUTEX_DEFAULT, IPL_NONE);
	rb_tree_init(&futex_tab.va, &futex_rb_ops);
	rb_tree_init(&futex_tab.oa, &futex_shared_rb_ops);

	futex_cache = pool_cache_init(sizeof(struct futex),
	    coherency_unit, 0, 0, "futex", NULL, IPL_NONE, futex_ctor,
	    futex_dtor, NULL);
}

/*
 * futex_sys_fini()
 *
 *	Finalize the futex subsystem.
 */
void
futex_sys_fini(void)
{

	KASSERT(RB_TREE_MIN(&futex_tab.oa) == NULL);
	KASSERT(RB_TREE_MIN(&futex_tab.va) == NULL);
	mutex_destroy(&futex_tab.lock);
}

/*
 * futex_ctor()
 *
 *	Futex object constructor.
 */
static int
futex_ctor(void *arg __unused, void *obj, int flags __unused)
{
	extern sleepqlock_t sleepq_locks[SLEEPTAB_HASH_SIZE];
	struct futex * const f = obj;

	mutex_init(&f->fx_op_lock, MUTEX_DEFAULT, IPL_NONE);
	f->fx_sq_lock = &sleepq_locks[SLEEPTAB_HASH(f)].lock;

	sleepq_init(&f->fx_sqs[FUTEX_READERQ]);
	sleepq_init(&f->fx_sqs[FUTEX_WRITERQ]);
	f->fx_nwaiters[FUTEX_READERQ] = f->fx_nwaiters[FUTEX_WRITERQ] = 0;

	return 0;
}

/*
 * futex_dtor()
 *
 *	Futex object destructor.
 */
static void
futex_dtor(void *arg __unused, void *obj)
{
	struct futex * const f = obj;

	mutex_destroy(&f->fx_op_lock);
	f->fx_sq_lock = NULL;
}

/*
 * futex_key_init(key, vm, va, shared)
 *
 *	Initialize a futex key for lookup, etc.
 */
static int
futex_key_init(union futex_key *fk, struct vmspace *vm, vaddr_t va, bool shared)
{
	int error = 0;

	if (__predict_false(shared)) {
		if (!uvm_voaddr_acquire(&vm->vm_map, va, &fk->fk_shared))
			error = EFAULT;
	} else {
		fk->fk_private.vmspace = vm;
		fk->fk_private.va = va;
	}

	return error;
}

/*
 * futex_key_fini(key, shared)
 *
 *	Release a futex key.
 */
static void
futex_key_fini(union futex_key *fk, bool shared)
{
	if (__predict_false(shared))
		uvm_voaddr_release(&fk->fk_shared);
	memset(fk, 0, sizeof(*fk));
}

/*
 * futex_create(fk, shared)
 *
 *	Create a futex.  Initial reference count is 1, representing the
 *	caller.  Returns NULL on failure.  Always takes ownership of the
 *	key, either transferring it to the newly-created futex, or releasing
 *	the key if creation fails.
 *
 *	Never sleeps for memory, but may sleep to acquire a lock.
 */
static struct futex *
futex_create(union futex_key *fk, bool shared, uint8_t class)
{
	struct futex *f;

	f = pool_cache_get(futex_cache, PR_NOWAIT);
	if (f == NULL) {
		futex_key_fini(fk, shared);
		return NULL;
	}
	f->fx_key = *fk;
	f->fx_refcnt = 1;
	f->fx_shared = shared;
	f->fx_on_tree = false;
	f->fx_class = class;

	return f;
}

/*
 * futex_destroy(f)
 *
 *	Destroy a futex created with futex_create.  Reference count
 *	must be zero.
 *
 *	May sleep.
 */
static void
futex_destroy(struct futex *f)
{

	ASSERT_SLEEPABLE();

	KASSERT(atomic_load_relaxed(&f->fx_refcnt) == 0);
	KASSERT(!f->fx_on_tree);
	KASSERT(LIST_EMPTY(&f->fx_sqs[FUTEX_READERQ]));
	KASSERT(LIST_EMPTY(&f->fx_sqs[FUTEX_WRITERQ]));
	KASSERT(f->fx_nwaiters[FUTEX_READERQ] == 0);
	KASSERT(f->fx_nwaiters[FUTEX_WRITERQ] == 0);

	futex_key_fini(&f->fx_key, f->fx_shared);

	pool_cache_put(futex_cache, f);
}

/*
 * futex_hold_count(f, n)
 *
 *	Attempt to acquire a reference to f.  Return 0 on success,
 *	ENFILE on too many references.
 *
 *	Never sleeps.
 */
static int
futex_hold_count(struct futex *f, unsigned long const count)
{
	unsigned long refcnt;

	do {
		refcnt = atomic_load_relaxed(&f->fx_refcnt);
		if (ULONG_MAX - refcnt < count)
			return ENFILE;
	} while (atomic_cas_ulong(&f->fx_refcnt, refcnt,
				  refcnt + count) != refcnt);

	return 0;
}
#define	futex_hold(f)	futex_hold_count(f, 1)

/*
 * futex_rele_count(f, n)
 *
 *	Release a reference to f acquired with futex_create or
 *	futex_hold.
 *
 *	May sleep to free f.
 */
static void
futex_rele_count(struct futex *f, unsigned long const count)
{
	unsigned long refcnt;

	ASSERT_SLEEPABLE();

	do {
		refcnt = atomic_load_relaxed(&f->fx_refcnt);
		KASSERT(refcnt >= count);
		if (refcnt - count == 0)
			goto trylast;
	} while (atomic_cas_ulong(&f->fx_refcnt, refcnt,
				  refcnt - count) != refcnt);
	return;

trylast:
	KASSERT(count <= LONG_MAX);
	mutex_enter(&futex_tab.lock);
	if (atomic_add_long_nv(&f->fx_refcnt, -(long)count) == 0) {
		if (f->fx_on_tree) {
			if (__predict_false(f->fx_shared))
				rb_tree_remove_node(&futex_tab.oa, f);
			else
				rb_tree_remove_node(&futex_tab.va, f);
			f->fx_on_tree = false;
		}
	} else {
		/* References remain -- don't destroy it.  */
		f = NULL;
	}
	mutex_exit(&futex_tab.lock);
	if (f != NULL)
		futex_destroy(f);
}
#define	futex_rele(f)	futex_rele_count(f, 1)

/*
 * futex_rele_count_not_last(f, n)
 *
 *	Release a reference to f acquired with futex_create or
 *	futex_hold.
 *
 *	This version asserts that we are not dropping the last
 *	reference to f.
 */
static void
futex_rele_count_not_last(struct futex *f, unsigned long const count)
{
	unsigned long refcnt;

	do {
		refcnt = atomic_load_relaxed(&f->fx_refcnt);
		KASSERT(refcnt >= count);
	} while (atomic_cas_ulong(&f->fx_refcnt, refcnt,
				  refcnt - count) != refcnt);
}

/*
 * futex_lookup_by_key(key, shared, class, &f)
 *
 *	Try to find an existing futex va reference in the specified key.
 *	On success, return 0, set f to found futex or to NULL if not found,
 *	and increment f's reference count if found.
 *
 *	Return ENFILE if reference count too high.
 *
 *	Internal lookup routine shared by futex_lookup() and
 *	futex_lookup_create().
 */
static int
futex_lookup_by_key(union futex_key *fk, bool shared, uint8_t class,
    struct futex **fp)
{
	struct futex *f;
	int error = 0;

	mutex_enter(&futex_tab.lock);
	if (__predict_false(shared)) {
		f = rb_tree_find_node(&futex_tab.oa, fk);
	} else {
		f = rb_tree_find_node(&futex_tab.va, fk);
	}
	if (f) {
		if (__predict_true(f->fx_class == class ||
				   class == FUTEX_CLASS_ANY))
			error = futex_hold(f);
		else
			error = EINVAL;
		if (error)
			f = NULL;
	}
	*fp = f;
	mutex_exit(&futex_tab.lock);

	return error;
}

/*
 * futex_insert(f, fp)
 *
 *	Try to insert the futex f into the tree by va.  If there
 *	already is a futex for its va, acquire a reference to it, and
 *	store it in *fp; otherwise store f in *fp.
 *
 *	Return 0 on success, ENFILE if there already is a futex but its
 *	reference count is too high.
 */
static int
futex_insert(struct futex *f, struct futex **fp)
{
	struct futex *f0;
	int error;

	KASSERT(atomic_load_relaxed(&f->fx_refcnt) != 0);
	KASSERT(!f->fx_on_tree);

	mutex_enter(&futex_tab.lock);
	if (__predict_false(f->fx_shared))
		f0 = rb_tree_insert_node(&futex_tab.oa, f);
	else
		f0 = rb_tree_insert_node(&futex_tab.va, f);
	if (f0 == f) {
		f->fx_on_tree = true;
		error = 0;
	} else {
		KASSERT(atomic_load_relaxed(&f0->fx_refcnt) != 0);
		KASSERT(f0->fx_on_tree);
		error = futex_hold(f0);
		if (error)
			goto out;
	}
	*fp = f0;
out:	mutex_exit(&futex_tab.lock);

	return error;
}

/*
 * futex_lookup(uaddr, shared, class, &f)
 *
 *	Find a futex at the userland pointer uaddr in the current
 *	process's VM space.  On success, return the futex in f and
 *	increment its reference count.
 *
 *	Caller must call futex_rele when done.
 */
static int
futex_lookup(int *uaddr, bool shared, uint8_t class, struct futex **fp)
{
	union futex_key fk;
	struct vmspace *vm = curproc->p_vmspace;
	vaddr_t va = (vaddr_t)uaddr;
	int error;

	/*
	 * Reject unaligned user pointers so we don't cross page
	 * boundaries and so atomics will work.
	 */
	if (__predict_false((va & 3) != 0))
		return EINVAL;

	/* Look it up. */
	error = futex_key_init(&fk, vm, va, shared);
	if (error)
		return error;

	error = futex_lookup_by_key(&fk, shared, class, fp);
	futex_key_fini(&fk, shared);
	if (error)
		return error;

	KASSERT(*fp == NULL || (*fp)->fx_shared == shared);
	KASSERT(*fp == NULL || (*fp)->fx_class == class);
	KASSERT(*fp == NULL || atomic_load_relaxed(&(*fp)->fx_refcnt) != 0);

	/*
	 * Success!  (Caller must still check whether we found
	 * anything, but nothing went _wrong_ like trying to use
	 * unmapped memory.)
	 */
	KASSERT(error == 0);

	return error;
}

/*
 * futex_lookup_create(uaddr, shared, class, &f)
 *
 *	Find or create a futex at the userland pointer uaddr in the
 *	current process's VM space.  On success, return the futex in f
 *	and increment its reference count.
 *
 *	Caller must call futex_rele when done.
 */
static int
futex_lookup_create(int *uaddr, bool shared, uint8_t class, struct futex **fp)
{
	union futex_key fk;
	struct vmspace *vm = curproc->p_vmspace;
	struct futex *f = NULL;
	vaddr_t va = (vaddr_t)uaddr;
	int error;

	/*
	 * Reject unaligned user pointers so we don't cross page
	 * boundaries and so atomics will work.
	 */
	if (__predict_false((va & 3) != 0))
		return EINVAL;

	if (__predict_false(class == FUTEX_CLASS_ANY))
		return EINVAL;

	error = futex_key_init(&fk, vm, va, shared);
	if (error)
		return error;

	/*
	 * Optimistically assume there already is one, and try to find
	 * it.
	 */
	error = futex_lookup_by_key(&fk, shared, class, fp);
	if (error || *fp != NULL) {
		/*
		 * We either found one, or there was an error.
		 * In either case, we are done with the key.
		 */
		futex_key_fini(&fk, shared);
		goto out;
	}

	/*
	 * Create a futex record.  This transfers ownership of the key
	 * in all cases.
	 */
	f = futex_create(&fk, shared, class);
	if (f == NULL) {
		error = ENOMEM;
		goto out;
	}

	/*
	 * Insert our new futex, or use existing if someone else beat
	 * us to it.
	 */
	error = futex_insert(f, fp);
	if (error)
		goto out;
	if (*fp == f)
		f = NULL;	/* don't release on exit */

	/* Success!  */
	KASSERT(error == 0);

out:	if (f != NULL)
		futex_rele(f);
	KASSERT(error || *fp != NULL);
	KASSERT(error || atomic_load_relaxed(&(*fp)->fx_refcnt) != 0);
	return error;
}

/*
 * futex_unsleep:
 *
 *	Remove an LWP from the futex and sleep queue.  This is called when
 *	the LWP has not been awoken normally but instead interrupted: for
 *	example, when a signal is received.  Must be called with the LWP
 *	locked.  Will unlock if "unlock" is true.
 */
static void
futex_unsleep(lwp_t *l, bool unlock)
{
	struct futex *f __diagused;

	f = (struct futex *)(uintptr_t)l->l_wchan;

	KASSERT(mutex_owned(f->fx_sq_lock));

	/* WRITERQ is more likely */
	if (__predict_true(l->l_sleepq == &f->fx_sqs[FUTEX_WRITERQ])) {
		KASSERT(f->fx_nwaiters[FUTEX_WRITERQ] > 0);
		f->fx_nwaiters[FUTEX_WRITERQ]--;
	} else {
		KASSERT(l->l_sleepq == &f->fx_sqs[FUTEX_READERQ]);
		KASSERT(f->fx_nwaiters[FUTEX_READERQ] > 0);
		f->fx_nwaiters[FUTEX_READERQ]--;
	}

	sleepq_unsleep(l, unlock);
}

/*
 * futex_wait(f, q, deadline, clkid, bitset)
 *
 *	Wait until deadline on the clock clkid, or forever if deadline
 *	is NULL, for a futex wakeup.  Return 0 on explicit wakeup,
 *	ETIMEDOUT on timeout, EINTR on signal.
 */
static int
futex_wait(struct futex *f, int q, const struct timespec *deadline,
    clockid_t clkid, int bitset)
{

	/*
	 * Some things to note about how this function works:
	 *
	 * ==> When we enter futex_wait(), the futex's op lock is
	 * held, preventing us from missing wakes.  Once we are on
	 * the futex's sleepq, it is safe to release the op lock.
	 *
	 * ==> We have a single early escape to avoid calling
	 * sleepq_block(): our deadline already having passed.
	 * If this is a no-timeout wait, or if the deadline has
	 * not already passed, we are guaranteed to call sleepq_block().
	 *
	 * ==> sleepq_block() contains all of the necessary logic
	 * for handling signals; we do not need to check for them
	 * ourselves.
	 *
	 * ==> Once we have blocked, other futex operations will
	 * be able to wake us or requeue us.  Until we have blocked,
	 * those other futex operations will not be able to acquire
	 * the sleepq locks in order to perform those operations,
	 * even if we have dropped the op lock.  Thus, we will not
	 * miss wakes.  This fundamental invariant is relied upon by
	 * every other synchronization object in the kernel.
	 *
	 * ==> sleepq_block() returns for one of three reasons:
	 *
	 *	-- We were awakened.
	 *	-- We were signaled.
	 *	-- We timed out.
	 *
	 * ==> Once sleepq_block() returns, we will not call
	 * sleepq_block() again.
	 *
	 * ==> If sleepq_block() returns due to being signaled,
	 * we must check to see if we were also awakened.  This
	 * is to handle the following sequence:
	 *
	 *	-- Futex wake takes us off sleepq, places us on runq.
	 *	-- We are signaled while sitting on the runq.
	 *	-- We run, and sleepq_block() notices the signal and
	 *	   returns EINTR/ERESTART.
	 *
	 * In this situation, we want to indicate a successful wake
	 * to the caller, because that wake has been reported to the
	 * thread that issued it.
	 *
	 * Note that it is NOT possible for a wake to be issued after
	 * a signal; the LWP will already have been taken off the sleepq
	 * by the signal, so the wake will never happen.  Note that for
	 * this reason, we must *never* return ERESTART, because it could
	 * result in us waiting again with no one to awaken us.
	 */

	struct lwp * const l = curlwp;
	int timo;
	int error;

	/*
	 * Compute our timeout before taking any locks.
	 */
	if (deadline) {
		struct timespec ts;

		/* Check our watch.  */
		error = clock_gettime1(clkid, &ts);
		if (error) {
			mutex_exit(&f->fx_op_lock);
			return error;
		}

		/*
		 * If we're past the deadline, ETIMEDOUT.
		 */
		if (timespeccmp(deadline, &ts, <=)) {
			mutex_exit(&f->fx_op_lock);
			return ETIMEDOUT;
		} else {
			/* Count how much time is left.  */
			timespecsub(deadline, &ts, &ts);
			timo = tstohz(&ts);
			if (timo == 0) {
				/*
				 * tstohz() already rounds up, and
				 * a return value of 0 ticks means
				 * "expired now or in the past".
				 */
				mutex_exit(&f->fx_op_lock);
				return ETIMEDOUT;
			}
		}
	} else {
		timo = 0;
	}

	/*
	 * Acquire in sleepq -> lwp order.  While we're at it, ensure
	 * that there's room for us to block.
	 */
	mutex_spin_enter(f->fx_sq_lock);
	if (__predict_false(q == FUTEX_READERQ &&
			    f->fx_nwaiters[q] == FUTEX_TID_MASK)) {
		mutex_spin_exit(f->fx_sq_lock);
		mutex_exit(&f->fx_op_lock);
		return ENFILE;
	}
	lwp_lock(l);

	/*
	 * No need for locks here; until we're on a futex's sleepq, these
	 * values are private to the LWP, and the locks needed to get onto
	 * the sleepq provide the memory barriers needed to ensure visibility.
	 */
	l->l_futex = f;
	l->l_futex_wakesel = bitset;

	/*
	 * This is an inlined bit of sleepq_enter();
	 * we already hold the lwp lock.
	 */
	lwp_unlock_to(l, f->fx_sq_lock);
	KERNEL_UNLOCK_ALL(NULL, &l->l_biglocks);
	KASSERT(lwp_locked(l, f->fx_sq_lock));

	sleepq_enqueue(&f->fx_sqs[q], f, futex_wmesg, &futex_syncobj, true);
	f->fx_nwaiters[q]++;

	/* We can now safely release the op lock. */
	mutex_exit(&f->fx_op_lock);

	SDT_PROBE1(futex, wait, sleepq_block, entry, timo);
	error = sleepq_block(timo, true);
	SDT_PROBE1(futex, wait, sleepq_block, exit, error);

	/*
	 * f is no longer valid; we may have been moved to another
	 * futex sleepq while we slept.
	 */

	/*
	 * If something went wrong, we may still have a futex reference.
	 * Check for that here and drop it if so.  We can do this without
	 * taking any locks because l->l_futex is private to the LWP when
	 * not on any futex's sleepq, and we are not on any sleepq because
	 * we are running.
	 *
	 * Convert EWOULDBLOCK to ETIMEDOUT in case sleepq_block() returned
	 * EWOULDBLOCK, and ensure the only other error we return is EINTR.
	 */
	if (error) {
		f = l->l_futex;
		if (f != NULL) {
			SDT_PROBE0(futex, wait, finish, aborted);
			l->l_futex = NULL;
			futex_rele(f);
		} else {
			/* Raced with wakeup; report success. */
			SDT_PROBE0(futex, wait, finish, wakerace);
			error = 0;
		}
		if (error == EWOULDBLOCK)
			error = ETIMEDOUT;
		else if (error != ETIMEDOUT)
			error = EINTR;
	} else {
		KASSERT(l->l_futex == NULL);
		SDT_PROBE0(futex, wait, finish, normally);
	}

	return error;
}

/*
 * futex_wake(f, q, nwake, f2, q2, nrequeue, bitset)
 *
 *	Wake up to nwake waiters on f matching bitset; then, if f2 is
 *	provided, move up to nrequeue remaining waiters on f matching
 *	bitset to f2.  Return the number of waiters actually woken.
 *	Caller must hold the locks of f and f2, if provided.
 */
static unsigned
futex_wake(struct futex *f, int q, unsigned int const nwake,
    struct futex *f2, int q2, unsigned int const nrequeue,
    int bitset)
{
	struct lwp *l, *l_next;
	unsigned nwoken = 0;
	unsigned nrequeued = 0;

	KASSERT(mutex_owned(&f->fx_op_lock));
	KASSERT(f2 == NULL || mutex_owned(&f2->fx_op_lock));

	futex_sq_lock2(f, f2);

	/* Wake up to nwake waiters, and count the number woken.  */
	LIST_FOREACH_SAFE(l, &f->fx_sqs[q], l_sleepchain, l_next) {
		if (nwoken == nwake)
			break;

		KASSERT(l->l_syncobj == &futex_syncobj);
		KASSERT(l->l_wchan == (wchan_t)f);
		KASSERT(l->l_futex == f);
		KASSERT(l->l_sleepq == &f->fx_sqs[q]);
		KASSERT(l->l_mutex == f->fx_sq_lock);

		if ((l->l_futex_wakesel & bitset) == 0)
			continue;

		l->l_futex_wakesel = 0;
		l->l_futex = NULL;
		sleepq_remove(&f->fx_sqs[q], l);
		f->fx_nwaiters[q]--;
		nwoken++;
		/* hold counts adjusted in bulk below */
	}

	if (nrequeue) {
		KASSERT(f2 != NULL);
		/* Move up to nrequeue waiters from f's queue to f2's queue. */
		LIST_FOREACH_SAFE(l, &f->fx_sqs[q], l_sleepchain, l_next) {
			if (nrequeued == nrequeue)
				break;

			KASSERT(l->l_syncobj == &futex_syncobj);
			KASSERT(l->l_wchan == (wchan_t)f);
			KASSERT(l->l_futex == f);
			KASSERT(l->l_sleepq == &f->fx_sqs[q]);
			KASSERT(l->l_mutex == f->fx_sq_lock);

			if ((l->l_futex_wakesel & bitset) == 0)
				continue;

			l->l_futex = f2;
			sleepq_transfer(l, &f->fx_sqs[q],
			    &f2->fx_sqs[q2], f2, futex_wmesg,
			    &futex_syncobj, f2->fx_sq_lock, true);
			f->fx_nwaiters[q]--;
			f2->fx_nwaiters[q2]++;
			nrequeued++;
			/* hold counts adjusted in bulk below */
		}
		if (nrequeued) {
			/* XXX futex_hold() could theoretically fail here. */
			int hold_error __diagused =
			    futex_hold_count(f2, nrequeued);
			KASSERT(hold_error == 0);
		}
	}

	/*
	 * Adjust the futex reference counts for the wakes and
	 * requeues.
	 */
	KASSERT(nwoken + nrequeued <= LONG_MAX);
	futex_rele_count_not_last(f, nwoken + nrequeued);

	futex_sq_unlock2(f, f2);

	/* Return the number of waiters woken and requeued.  */
	return nwoken + nrequeued;
}

/*
 * futex_op_lock(f)
 *
 *	Acquire the op lock of f.  Pair with futex_op_unlock.  Do
 *	not use if caller needs to acquire two locks; use
 *	futex_op_lock2 instead.
 */
static void
futex_op_lock(struct futex *f)
{
	mutex_enter(&f->fx_op_lock);
}

/*
 * futex_op_unlock(f)
 *
 *	Release the op lock of f.
 */
static void
futex_op_unlock(struct futex *f)
{
	mutex_exit(&f->fx_op_lock);
}

/*
 * futex_op_lock2(f, f2)
 *
 *	Acquire the op locks of both f and f2, which may be null, or
 *	which may be the same.  If they are distinct, an arbitrary total
 *	order is chosen on the locks.
 *
 *	Callers should only ever acquire multiple op locks
 *	simultaneously using futex_op_lock2.
 */
static void
futex_op_lock2(struct futex *f, struct futex *f2)
{

	/*
	 * If both are null, do nothing; if one is null and the other
	 * is not, lock the other and be done with it.
	 */
	if (f == NULL && f2 == NULL) {
		return;
	} else if (f == NULL) {
		mutex_enter(&f2->fx_op_lock);
		return;
	} else if (f2 == NULL) {
		mutex_enter(&f->fx_op_lock);
		return;
	}

	/* If both futexes are the same, acquire only one. */
	if (f == f2) {
		mutex_enter(&f->fx_op_lock);
		return;
	}

	/* Otherwise, use the ordering on the kva of the futex pointer.  */
	if ((uintptr_t)f < (uintptr_t)f2) {
		mutex_enter(&f->fx_op_lock);
		mutex_enter(&f2->fx_op_lock);
	} else {
		mutex_enter(&f2->fx_op_lock);
		mutex_enter(&f->fx_op_lock);
	}
}

/*
 * futex_op_unlock2(f, f2)
 *
 *	Release the op locks of both f and f2, which may be null, or
 *	which may have the same underlying lock.
 */
static void
futex_op_unlock2(struct futex *f, struct futex *f2)
{

	/*
	 * If both are null, do nothing; if one is null and the other
	 * is not, unlock the other and be done with it.
	 */
	if (f == NULL && f2 == NULL) {
		return;
	} else if (f == NULL) {
		mutex_exit(&f2->fx_op_lock);
		return;
	} else if (f2 == NULL) {
		mutex_exit(&f->fx_op_lock);
		return;
	}

	/* If both futexes are the same, release only one. */
	if (f == f2) {
		mutex_exit(&f->fx_op_lock);
		return;
	}

	/* Otherwise, use the ordering on the kva of the futex pointer.  */
	if ((uintptr_t)f < (uintptr_t)f2) {
		mutex_exit(&f2->fx_op_lock);
		mutex_exit(&f->fx_op_lock);
	} else {
		mutex_exit(&f->fx_op_lock);
		mutex_exit(&f2->fx_op_lock);
	}
}

/*
 * futex_func_wait(uaddr, val, val3, timeout, clkid, clkflags, retval)
 *
 *	Implement futex(FUTEX_WAIT) and futex(FUTEX_WAIT_BITSET).
 */
static int
futex_func_wait(bool shared, int *uaddr, int val, int val3,
    const struct timespec *timeout, clockid_t clkid, int clkflags,
    register_t *retval)
{
	struct futex *f;
	struct timespec ts;
	const struct timespec *deadline;
	int error;

	/*
	 * If there's nothing to wait for, and nobody will ever wake
	 * us, then don't set anything up to wait -- just stop here.
	 */
	if (val3 == 0)
		return EINVAL;

	/* Optimistically test before anything else.  */
	if (!futex_test(uaddr, val))
		return EAGAIN;

	/* Determine a deadline on the specified clock.  */
	if (timeout == NULL || (clkflags & TIMER_ABSTIME) == TIMER_ABSTIME) {
		deadline = timeout;
	} else {
		struct timespec interval = *timeout;

		error = itimespecfix(&interval);
		if (error)
			return error;
		error = clock_gettime1(clkid, &ts);
		if (error)
			return error;
		timespecadd(&ts, &interval, &ts);
		deadline = &ts;
	}

	/* Get the futex, creating it if necessary.  */
	error = futex_lookup_create(uaddr, shared, FUTEX_CLASS_NORMAL, &f);
	if (error)
		return error;
	KASSERT(f);

	/*
	 * Under the op lock, check the value again: if it has
	 * already changed, EAGAIN; otherwise enqueue the waiter.
	 * Since FUTEX_WAKE will use the same lock and be done after
	 * modifying the value, the order in which we check and enqueue
	 * is immaterial.
	 */
	futex_op_lock(f);
	if (!futex_test(uaddr, val)) {
		futex_op_unlock(f);
		error = EAGAIN;
		goto out;
	}

	/*
	 * Now wait.  futex_wait() will drop our op lock once we
	 * are entered into the sleep queue, thus ensuring atomicity
	 * of wakes with respect to waits.
	 */
	error = futex_wait(f, FUTEX_WRITERQ, deadline, clkid, val3);

	/*
	 * We cannot drop our reference to the futex here, because
	 * we might be enqueued on a different one when we are awakened.
	 * The references will be managed on our behalf in the requeue,
	 * wake, and error cases.
	 */
	f = NULL;

	if (__predict_true(error == 0)) {
		/* Return 0 on success, error on failure. */
		*retval = 0;
	}

out:	if (f != NULL)
		futex_rele(f);
	return error;
}

/*
 * futex_func_wake(uaddr, val, val3, retval)
 *
 *	Implement futex(FUTEX_WAKE) and futex(FUTEX_WAKE_BITSET).
 */
static int
futex_func_wake(bool shared, int *uaddr, int val, int val3, register_t *retval)
{
	struct futex *f;
	unsigned int nwoken = 0;
	int error = 0;

	/* Reject negative number of wakeups.  */
	if (val < 0) {
		error = EINVAL;
		goto out;
	}

	/* Look up the futex, if any.  */
	error = futex_lookup(uaddr, shared, FUTEX_CLASS_NORMAL, &f);
	if (error)
		goto out;

	/* If there's no futex, there are no waiters to wake.  */
	if (f == NULL)
		goto out;

	/*
	 * Under f's op lock, wake the waiters and remember the
	 * number woken.
	 */
	futex_op_lock(f);
	nwoken = futex_wake(f, FUTEX_WRITERQ, val,
			    NULL, FUTEX_WRITERQ, 0, val3);
	futex_op_unlock(f);

	/* Release the futex.  */
	futex_rele(f);

out:
	/* Return the number of waiters woken.  */
	*retval = nwoken;

	/* Success!  */
	return error;
}

/*
 * futex_func_requeue(op, uaddr, val, uaddr2, val2, val3, retval)
 *
 *	Implement futex(FUTEX_REQUEUE) and futex(FUTEX_CMP_REQUEUE).
 */
static int
futex_func_requeue(bool shared, int op, int *uaddr, int val, int *uaddr2,
    int val2, int val3, register_t *retval)
{
	struct futex *f = NULL, *f2 = NULL;
	unsigned nwoken = 0;	/* default to zero woken on early return */
	int error;

	/* Reject negative number of wakeups or requeues. */
	if (val < 0 || val2 < 0) {
		error = EINVAL;
		goto out;
	}

	/* Look up the source futex, if any. */
	error = futex_lookup(uaddr, shared, FUTEX_CLASS_NORMAL, &f);
	if (error)
		goto out;

	/* If there is none, nothing to do. */
	if (f == NULL)
		goto out;

	/*
	 * We may need to create the destination futex because it's
	 * entirely possible it does not currently have any waiters.
	 */
	error = futex_lookup_create(uaddr2, shared, FUTEX_CLASS_NORMAL, &f2);
	if (error)
		goto out;

	/*
	 * Under the futexes' op locks, check the value; if
	 * unchanged from val3, wake the waiters.
	 */
	futex_op_lock2(f, f2);
	if (op == FUTEX_CMP_REQUEUE && !futex_test(uaddr, val3)) {
		error = EAGAIN;
	} else {
		error = 0;
		nwoken = futex_wake(f, FUTEX_WRITERQ, val,
				    f2, FUTEX_WRITERQ, val2,
				    FUTEX_BITSET_MATCH_ANY);
	}
	futex_op_unlock2(f, f2);

out:
	/* Return the number of waiters woken.  */
	*retval = nwoken;

	/* Release the futexes if we got them.  */
	if (f2)
		futex_rele(f2);
	if (f)
		futex_rele(f);
	return error;
}

/*
 * futex_validate_op_cmp(val3)
 *
 *	Validate an op/cmp argument for FUTEX_WAKE_OP.
 */
static int
futex_validate_op_cmp(int val3)
{
	int op = __SHIFTOUT(val3, FUTEX_OP_OP_MASK);
	int cmp = __SHIFTOUT(val3, FUTEX_OP_CMP_MASK);

	if (op & FUTEX_OP_OPARG_SHIFT) {
		int oparg = __SHIFTOUT(val3, FUTEX_OP_OPARG_MASK);
		if (oparg < 0)
			return EINVAL;
		if (oparg >= 32)
			return EINVAL;
		op &= ~FUTEX_OP_OPARG_SHIFT;
	}

	switch (op) {
	case FUTEX_OP_SET:
	case FUTEX_OP_ADD:
	case FUTEX_OP_OR:
	case FUTEX_OP_ANDN:
	case FUTEX_OP_XOR:
		break;
	default:
		return EINVAL;
	}

	switch (cmp) {
	case FUTEX_OP_CMP_EQ:
	case FUTEX_OP_CMP_NE:
	case FUTEX_OP_CMP_LT:
	case FUTEX_OP_CMP_LE:
	case FUTEX_OP_CMP_GT:
	case FUTEX_OP_CMP_GE:
		break;
	default:
		return EINVAL;
	}

	return 0;
}
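
/*
 * Example encoding (illustrative only; __SHIFTIN() is the inverse of
 * the __SHIFTOUT() extractions used here): "set *uaddr2 to 1, and
 * wake uaddr2's waiters if its old value was 0" could be encoded as:
 *
 *	val3 = __SHIFTIN(FUTEX_OP_SET, FUTEX_OP_OP_MASK) |
 *	    __SHIFTIN(1, FUTEX_OP_OPARG_MASK) |
 *	    __SHIFTIN(FUTEX_OP_CMP_EQ, FUTEX_OP_CMP_MASK) |
 *	    __SHIFTIN(0, FUTEX_OP_CMPARG_MASK);
 */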

/*
 * futex_compute_op(oldval, val3)
 *
 *	Apply a FUTEX_WAKE_OP operation to oldval.
 */
static int
futex_compute_op(int oldval, int val3)
{
	int op = __SHIFTOUT(val3, FUTEX_OP_OP_MASK);
	int oparg = __SHIFTOUT(val3, FUTEX_OP_OPARG_MASK);

	if (op & FUTEX_OP_OPARG_SHIFT) {
		KASSERT(oparg >= 0);
		KASSERT(oparg < 32);
		oparg = 1u << oparg;
		op &= ~FUTEX_OP_OPARG_SHIFT;
	}

	switch (op) {
	case FUTEX_OP_SET:
		return oparg;

	case FUTEX_OP_ADD:
		/*
		 * Avoid signed arithmetic overflow by doing
		 * arithmetic unsigned and converting back to signed
		 * at the end.
		 */
		return (int)((unsigned)oldval + (unsigned)oparg);

	case FUTEX_OP_OR:
		return oldval | oparg;

	case FUTEX_OP_ANDN:
		return oldval & ~oparg;

	case FUTEX_OP_XOR:
		return oldval ^ oparg;

	default:
		panic("invalid futex op");
	}
}

/*
 * futex_compute_cmp(oldval, val3)
 *
 *	Apply a FUTEX_WAKE_OP comparison to oldval.
   1634  */
   1635 static bool
   1636 futex_compute_cmp(int oldval, int val3)
   1637 {
   1638 	int cmp = __SHIFTOUT(val3, FUTEX_OP_CMP_MASK);
   1639 	int cmparg = __SHIFTOUT(val3, FUTEX_OP_CMPARG_MASK);
   1640 
   1641 	switch (cmp) {
   1642 	case FUTEX_OP_CMP_EQ:
   1643 		return (oldval == cmparg);
   1644 
   1645 	case FUTEX_OP_CMP_NE:
   1646 		return (oldval != cmparg);
   1647 
   1648 	case FUTEX_OP_CMP_LT:
   1649 		return (oldval < cmparg);
   1650 
   1651 	case FUTEX_OP_CMP_LE:
   1652 		return (oldval <= cmparg);
   1653 
   1654 	case FUTEX_OP_CMP_GT:
   1655 		return (oldval > cmparg);
   1656 
   1657 	case FUTEX_OP_CMP_GE:
   1658 		return (oldval >= cmparg);
   1659 
   1660 	default:
   1661 		panic("invalid futex cmp operation");
   1662 	}
   1663 }
   1664 
   1665 /*
   1666  * futex_func_wake_op(uaddr, val, uaddr2, val2, val3, retval)
   1667  *
   1668  *	Implement futex(FUTEX_WAKE_OP).
   1669  */
   1670 static int
   1671 futex_func_wake_op(bool shared, int *uaddr, int val, int *uaddr2, int val2,
   1672     int val3, register_t *retval)
   1673 {
   1674 	struct futex *f = NULL, *f2 = NULL;
   1675 	int oldval, newval, actual;
   1676 	unsigned nwoken = 0;
   1677 	int error;
   1678 
   1679 	/* Reject negative number of wakeups.  */
   1680 	if (val < 0 || val2 < 0) {
   1681 		error = EINVAL;
   1682 		goto out;
   1683 	}
   1684 
   1685 	/* Reject invalid operations before we start doing things.  */
   1686 	if ((error = futex_validate_op_cmp(val3)) != 0)
   1687 		goto out;
   1688 
   1689 	/* Look up the first futex, if any.  */
   1690 	error = futex_lookup(uaddr, shared, FUTEX_CLASS_NORMAL, &f);
   1691 	if (error)
   1692 		goto out;
   1693 
   1694 	/* Look up the second futex, if any.  */
   1695 	error = futex_lookup(uaddr2, shared, FUTEX_CLASS_NORMAL, &f2);
   1696 	if (error)
   1697 		goto out;
   1698 
   1699 	/*
   1700 	 * Under the op locks:
   1701 	 *
   1702 	 * 1. Read/modify/write: *uaddr2 op= oparg.
   1703 	 * 2. Unconditionally wake uaddr.
   1704 	 * 3. Conditionally wake uaddr2, if it previously matched val3.
   1705 	 */
   1706 	futex_op_lock2(f, f2);
   1707 	do {
   1708 		error = futex_load(uaddr2, &oldval);
   1709 		if (error)
   1710 			goto out_unlock;
   1711 		newval = futex_compute_op(oldval, val3);
   1712 		error = ucas_int(uaddr2, oldval, newval, &actual);
   1713 		if (error)
   1714 			goto out_unlock;
   1715 	} while (actual != oldval);
   1716 	nwoken = (f ? futex_wake(f, FUTEX_WRITERQ, val,
   1717 				 NULL, FUTEX_WRITERQ, 0,
   1718 				 FUTEX_BITSET_MATCH_ANY) : 0);
   1719 	if (f2 && futex_compute_cmp(oldval, val3))
   1720 		nwoken += futex_wake(f2, FUTEX_WRITERQ, val2,
   1721 				     NULL, FUTEX_WRITERQ, 0,
   1722 				     FUTEX_BITSET_MATCH_ANY);
   1723 
   1724 	/* Success! */
   1725 	error = 0;
   1726 out_unlock:
   1727 	futex_op_unlock2(f, f2);
   1728 
   1729 out:
   1730 	/* Return the number of waiters woken. */
   1731 	*retval = nwoken;
   1732 
   1733 	/* Release the futexes, if we got them. */
   1734 	if (f2)
   1735 		futex_rele(f2);
   1736 	if (f)
   1737 		futex_rele(f);
   1738 	return error;
   1739 }
   1740 
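/*
 * Illustrative use of FUTEX_WAKE_OP (a sketch only; val3 is assumed
 * to be encoded as in the example above): the classic condition-
 * variable signal optimization, which updates the associated mutex
 * word and wakes both queues in a single system call:
 *
 *	// Apply the encoded op to mutexword; wake one waiter on
 *	// &cond; and, if the old mutex word matched the encoded
 *	// comparison, wake one (val2 = 1) waiter on &mutexword too.
 *	__futex(&cond, FUTEX_WAKE_OP | FUTEX_PRIVATE_FLAG, 1, NULL,
 *	    &mutexword, 1, val3);
 */
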
   1741 /*
   1742  * futex_read_waiter_prio(l)
   1743  *
   1744  *	Returns the read-waiter priority for purposes of comparing
   1745  *	against a write-waiter's priority.  Read-waiters are only
   1746  *	prioritized if they are real-time threads.
   1747  */
   1748 static inline pri_t
   1749 futex_read_waiter_prio(struct lwp * const l)
   1750 {
   1751 	const pri_t pri = lwp_eprio(l);
   1752 	if (__predict_false(pri >= PRI_USER_RT))
   1753 		return pri;
   1754 	return PRI_NONE;
   1755 }
   1756 
   1757 #if 0 /* XXX */
   1758 /*
    1759  * futex_rw_handle_rt_reader(f, uaddr, val, pri_reader)
    1760  *
    1761  *	Attempt to resolve the case of a real-time thread attempting
    1762  *	to acquire a read lock.  Returns EJUSTRETURN if the lock was
    1763  *	acquired (eliding the wait), 0 to wait normally, or an error.
   1764  */
   1765 static int __noinline
   1766 futex_rw_handle_rt_reader(struct futex *f, int *uaddr, int val,
   1767     pri_t pri_reader)
   1768 {
   1769 	struct lwp *l_writer = NULL;
   1770 	int newval, next;
   1771 	int error;
   1772 
   1773 	KASSERT(mutex_owned(&f->fx_op_lock));
   1774 
   1775 	/*
   1776 	 * If the lock is write-locked, there isn't anything we
   1777 	 * can do but wait.
   1778 	 */
   1779 	if (val & FUTEX_RW_WRITE_LOCKED) {
   1780 		return 0;
   1781 	}
   1782 
   1783 	/*
   1784 	 * If we're maxed out on readers already, nothing we can do.
   1785 	 */
   1786 	if ((val & FUTEX_TID_MASK) == FUTEX_TID_MASK) {
    1787 		return ENFILE;		/* distinct from EAGAIN */
   1788 	}
   1789 
   1790 	/*
   1791 	 * The assumption then is that we arrived here with WRITE_WANTED
   1792 	 * set.  We're not going to bother testing that, however.  We
   1793 	 * will preserve it if we see it.
   1794 	 *
    1795 	 * Because WRITE_WANTED is set, this will cause everyone to enter
    1796 	 * the futex via rw_wait.  And we are holding the op lock, so no
    1797 	 * additional waiters will appear on the sleepq.  We are going
   1798 	 * to do the same tricky dance as rw_handoff to let higher-priority
   1799 	 * real-time waiters slip past the gate.
   1800 	 */
   1801 
   1802 	/*
   1803 	 * All we want to do here is check if there is a writer waiting.
    1804 	 * If there is, and it is of equal or greater priority, this
    1805 	 * reader loses; otherwise we will just make note of it to ensure
   1806 	 * the WRITE_WANTED bit is set when we update the futex word.
   1807 	 *
   1808 	 * Since we are not going to actually wake someone from the
   1809 	 * queue here, we're not really interested if the write-waiter
   1810 	 * happens to leave based on a timeout or signal; we know that
   1811 	 * any write-waiter *after* the first one is of equal or lower
   1812 	 * priority, so we would still best it.
   1813 	 */
   1814 	mutex_spin_enter(f->fx_sq_lock);
   1815 	l_writer = LIST_FIRST(&f->fx_sqs[FUTEX_WRITERQ]);
    1816 	if (__predict_true(l_writer != NULL)) {
    1817 		if (pri_reader <= lwp_eprio(l_writer)) {
         			/* Drop the sleepq lock before giving up. */
         			mutex_spin_exit(f->fx_sq_lock);
    1818 			return 0;
    1819 		}
    1820 	}
    1821 	mutex_spin_exit(f->fx_sq_lock);
   1822 
   1823 	for (;; val = next) {
   1824 		if (__predict_true(l_writer != NULL)) {
   1825 			newval = (val + 1) | FUTEX_RW_WRITE_WANTED;
   1826 		} else {
   1827 			/*
   1828 			 * No writer waiting; increment the reader
   1829 			 * count, preserve any WRITE_WANTED bit.
   1830 			 */
   1831 			newval = val + 1;
   1832 			KASSERT(((newval ^ val) & FUTEX_RW_WRITE_WANTED) == 0);
   1833 		}
   1834 
   1835 		error = ucas_int(uaddr, val, newval, &next);
   1836 		if (__predict_false(error != 0)) {
   1837 			return error;
   1838 		}
   1839 		if (next == val) {
   1840 			/* Successfully acquired the read lock. */
   1841 			return EJUSTRETURN;
   1842 		}
   1843 		/*
   1844 		 * Repeat the terminal checks from above on the new
   1845 		 * value.
   1846 		 */
   1847 		if (next & FUTEX_RW_WRITE_LOCKED) {
   1848 			return 0;
   1849 		}
   1850 		if ((next & FUTEX_TID_MASK) == FUTEX_TID_MASK) {
    1851 			return ENFILE;		/* distinct from EAGAIN */
   1852 		}
   1853 	}
   1854 }
   1855 #endif /* XXX */
   1856 
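/*
 * For reference while reading the RW operations below (a summary
 * inferred from this code, not an authoritative ABI statement): the
 * RW futex word packs the reader count (or the write owner's TID)
 * into FUTEX_TID_MASK, alongside the FUTEX_RW_WRITE_LOCKED,
 * FUTEX_RW_WRITE_WANTED, and FUTEX_WAITERS flag bits.  A userland
 * read-lock fast path might look roughly like:
 *
 *	int v = rwword;
 *	if ((v & (FUTEX_RW_WRITE_LOCKED|FUTEX_RW_WRITE_WANTED)) == 0 &&
 *	    atomic_cas_uint(&rwword, v, v + 1) == v) {
 *		// Read lock acquired; reader count incremented.
 *	} else {
 *		// Contended: enter the kernel to wait.
 *		__futex(&rwword, FUTEX_NETBSD_RW_WAIT, v, NULL, NULL,
 *		    0, FUTEX_RW_READER);
 *	}
 */
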
   1857 /*
   1858  * futex_func_rw_wait(uaddr, val, val3, abstime, clkid, retval)
   1859  *
   1860  *	Implement futex(FUTEX_NETBSD_RW_WAIT).
   1861  */
   1862 static int
   1863 futex_func_rw_wait(bool shared, int *uaddr, int val, int val3,
   1864     const struct timespec *abstime, clockid_t clkid, register_t *retval)
   1865 {
   1866 #if 1
   1867 	/* XXX NETBSD_RW ops are currently broken XXX */
   1868 	return ENOSYS;
   1869 #else
   1870 	struct futex *f;
   1871 	int error;
   1872 
   1873 	/* Must specify READER or WRITER. */
   1874 	if (__predict_false(val3 != FUTEX_RW_READER &&
   1875 			    val3 != FUTEX_RW_WRITER))
   1876 		return EINVAL;
   1877 
   1878 	/* Optimistically test before anything else.  */
   1879 	if (!futex_test(uaddr, val))
   1880 		return EAGAIN;
   1881 
   1882 	/* Get the futex, creating it if necessary.  */
   1883 	error = futex_lookup_create(uaddr, shared, FUTEX_CLASS_RWLOCK, &f);
   1884 	if (error)
   1885 		return error;
   1886 	KASSERT(f);
   1887 
   1888 	/*
   1889 	 * Under the op lock, check the value again: if it has
   1890 	 * already changed, EAGAIN; otherwise, enqueue the waiter
   1891 	 * on the specified queue.
   1892 	 */
   1893 	futex_op_lock(f);
   1894 	if (!futex_test(uaddr, val)) {
   1895 		futex_op_unlock(f);
   1896 		error = EAGAIN;
   1897 		goto out;
   1898 	}
   1899 
   1900 	/*
   1901 	 * POSIX dictates that a real-time reader will be prioritized
   1902 	 * over writers of lesser priority, when normally we would
   1903 	 * prefer the writers.
   1904 	 */
   1905 	if (__predict_false(val3 == FUTEX_RW_READER)) {
   1906 		struct lwp * const l = curlwp;
   1907 		const pri_t pri_reader = futex_read_waiter_prio(l);
   1908 		if (__predict_false(pri_reader > PRI_NONE)) {
   1909 			error = futex_rw_handle_rt_reader(f, uaddr, val,
   1910 			    pri_reader);
   1911 			if (error) {
   1912 				if (__predict_true(error == EJUSTRETURN)) {
   1913 					/* RT reader acquired the lock. */
   1914 					error = 0;
   1915 				}
   1916 				futex_op_unlock(f);
   1917 				goto out;
   1918 			}
   1919 		}
   1920 	}
   1921 
   1922 	/*
    1923 	 * Now wait.  futex_wait() will drop our op lock once we
   1924 	 * are entered into the sleep queue, thus ensuring atomicity
   1925 	 * of wakes with respect to waits.
   1926 	 *
   1927 	 * Use a wake selector of 0 so that this waiter won't match on any
   1928 	 * of the other operations in case someone makes that error; only
   1929 	 * rw_handoff is allowed!  This is critical because a waiter that
   1930 	 * awakens from an rw_wait assumes it has been given ownership of
   1931 	 * the lock.
   1932 	 */
   1933 	error = futex_wait(f, val3, abstime, clkid, 0);
   1934 
   1935 	/*
   1936 	 * Don't drop our reference here.  We won't be requeued, but
    1937 	 * it's best to maintain symmetry with other operations.
   1938 	 */
   1939 	f = NULL;
   1940 
   1941 out:	if (__predict_true(error == 0)) {
   1942 		/* Return 0 on success, error on failure. */
   1943 		*retval = 0;
   1944 	}
   1945 
   1946 	if (f != NULL)
   1947 		futex_rele(f);
   1948 	return error;
   1949 #endif
   1950 }
   1951 
   1952 /*
   1953  * futex_func_rw_handoff(uaddr, val, retval)
   1954  *
   1955  *	Implement futex(FUTEX_NETBSD_RW_HANDOFF).
   1956  *
   1957  *	This implements direct hand-off to either a single writer
   1958  *	or all readers.
   1959  */
   1960 static int
   1961 futex_func_rw_handoff(bool shared, int *uaddr, int val, register_t *retval)
   1962 {
   1963 #if 1
   1964 	/* XXX NETBSD_RW ops are currently broken XXX */
   1965 	return ENOSYS;
   1966 #else
   1967 	struct lwp *l, *l_next, *l_writer, *l_reader;
   1968 	struct futex *f;
   1969 	sleepq_t wsq, *sq;
   1970 	int error = 0;
   1971 	int newval, next, nwake, nwoken = 0;
   1972 	int allreaders __diagused = 0;
   1973 	unsigned int *nwaitersp;
   1974 	pri_t pri_writer;
   1975 
   1976 	/* Look up the futex, if any.  */
   1977 	error = futex_lookup(uaddr, shared, FUTEX_CLASS_RWLOCK, &f);
   1978 	if (error)
   1979 		goto out;
   1980 
   1981 	/*
    1982 	 * There are a couple of different scenarios where a thread
   1983 	 * releasing a RW lock would call rw_handoff, yet we find no
   1984 	 * waiters:
   1985 	 *
   1986 	 * ==> There were waiters on the queue, but they left the queue
   1987 	 *     due to a signal.
   1988 	 * ==> The waiting thread set the waiter bit(s), but decided to
   1989 	 *     try spinning before calling rw_wait.
   1990 	 *
   1991 	 * In both of these cases, we will ensure that the lock word
   1992 	 * gets cleared.
   1993 	 */
   1994 
   1995 	/* If there's no futex, there are no waiters to wake. */
   1996 	if (__predict_false(f == NULL)) {
   1997 		/*
   1998 		 * If there are no waiters, ensure that the lock
   1999 		 * word is cleared.
   2000 		 */
   2001 		error = ucas_int(uaddr, val, 0, &next);
   2002 		if (__predict_true(error == 0)) {
   2003 			if (__predict_false(val != next))
   2004 				error = EAGAIN;
   2005 		}
   2006 		goto out;
   2007 	}
   2008 
   2009 	/*
   2010 	 * Compute the new value and store it in the futex word.
   2011 	 *
   2012 	 * This is a little tricky because the ucas could cause
   2013 	 * a page fault, and we can't let that happen while holding
    2014 	 * the sleepq locks.  But we have to make sure that the choices
    2015 	 * we make regarding which threads to wake are accurately
   2016 	 * reflected in the futex word and that the futex word is
   2017 	 * updated before the LWPs can run.
   2018 	 *
   2019 	 * This is easy enough ... we can transfer the LWPs to a private
   2020 	 * sleepq to prevent changes in the original sleepq while we have
   2021 	 * it unlocked from affecting the results, but we must also
   2022 	 * consider that LWPs might be using timed-wait, so we have to
   2023 	 * make sure that won't wake the LWP up out from under us if the
   2024 	 * timer fires.  To do this, we have to set the "wchan" to NULL
   2025 	 * and use a dummy syncobj that takes no action on "unsleep".
   2026 	 *
   2027 	 * We thus perform the hand-off in three steps, all with the op
   2028 	 * lock held:
   2029 	 *
   2030 	 * ==> Wake selection: sleepq locked, select LWPs to wake,
    2031 	 *     compute new futex word.  LWPs are transferred from the
   2032 	 *     futex sleepq to the private sleepq and further sedated.
   2033 	 *
   2034 	 * ==> Futex word update: sleepq unlocked, use a loop around ucas
   2035 	 *     to update the futex word.  There are no scenarios where
   2036 	 *     user space code can update the futex in a way that would
   2037 	 *     impact the work we do here; because we've been asked to do
   2038 	 *     the hand-off, any new LWPs attempting to acquire the lock
   2039 	 *     would be entering rw_wait by definition (either because
   2040 	 *     they're read-lockers and the lock is write-wanted, or they're
   2041 	 *     write-lockers and the lock is read-held).  Those new rw_wait
   2042 	 *     LWPs will serialize against the op lock.  We DO, however,
   2043 	 *     need to preserve any newly-set WANTED bits, hence the ucas
   2044 	 *     loop.  Those newly-set WANTED bits, however, will not impact
   2045 	 *     our decisions.  Those LWPs have simply lost the race to
   2046 	 *     acquire the lock, and we don't consult those bits anyway;
   2047 	 *     we instead use the contents of the sleepqs as the truth
    2048 	 *     about who is waiting, and no new waiters will appear on
   2049 	 *     the sleepqs while we hold the op lock.
   2050 	 *
   2051 	 * ==> Wake waiters: sleepq locked, run down our private sleepq
   2052 	 *     and actually awaken all of the LWPs we selected earlier.
   2053 	 *
    2054 	 * If, for some reason, the ucas fails because the page backing it
   2055 	 * was unmapped, all bets are off.  We still awaken the waiters.
   2056 	 * This is either a malicious attempt to screw things up, or a
   2057 	 * programming error, and we don't care about either one.
   2058 	 */
   2059 	sleepq_init(&wsq);
   2060 
   2061 	futex_op_lock(f);
   2062 	mutex_spin_enter(f->fx_sq_lock);
   2063 
   2064 	/*
   2065 	 * STEP 1
   2066 	 */
   2067 
   2068 	/*
   2069 	 * POSIX dictates that any real-time waiters will acquire the
   2070 	 * lock in priority order.  This implies that a real-time
   2071 	 * read-waiter has priority over a non-real-time write-waiter,
   2072 	 * where we would otherwise prioritize waking the write-waiter.
   2073 	 */
   2074 	l_writer = LIST_FIRST(&f->fx_sqs[FUTEX_WRITERQ]);
   2075 	l_reader = LIST_FIRST(&f->fx_sqs[FUTEX_READERQ]);
   2076 	if (__predict_true(l_writer == NULL)) {
   2077 		/* We will wake all the readers. */
   2078 		sq = &f->fx_sqs[FUTEX_READERQ];
   2079 		nwaitersp = &f->fx_nwaiters[FUTEX_READERQ];
   2080 		nwake = allreaders = f->fx_nwaiters[FUTEX_READERQ];
   2081 		KASSERT(nwake >= 0 && nwake <= FUTEX_TID_MASK);
   2082 		if (__predict_false(nwake == 0)) {
   2083 			KASSERT(l_reader == NULL);
   2084 			newval = 0;
   2085 		} else {
   2086 			KASSERT(l_reader != NULL);
   2087 			newval = nwake;
   2088 		}
   2089 		l = l_reader;
   2090 	} else if (__predict_false(l_reader != NULL &&
   2091 				   futex_read_waiter_prio(l_reader) >
   2092 					(pri_writer = lwp_eprio(l_writer)))) {
   2093 		/*
   2094 		 * Count the number of real-time read-waiters that
   2095 		 * exceed the write-waiter's priority.  We will
    2096 	 * wake that many (we already know it's at least one).
   2097 		 */
   2098 		sq = &f->fx_sqs[FUTEX_READERQ];
   2099 		nwaitersp = &f->fx_nwaiters[FUTEX_READERQ];
   2100 		for (nwake = 1, l = LIST_NEXT(l_reader, l_sleepchain);
   2101 		     l != NULL && futex_read_waiter_prio(l) > pri_writer;
   2102 		     l = LIST_NEXT(l, l_sleepchain)) {
   2103 			nwake++;
   2104 		}
   2105 		KASSERT(nwake >= 0 && nwake <= FUTEX_TID_MASK);
   2106 		/* We know there is at least one write-waiter. */
   2107 		newval = nwake | FUTEX_WAITERS | FUTEX_RW_WRITE_WANTED;
   2108 		l = l_reader;
   2109 	} else {
   2110 		/* We will wake one writer. */
   2111 		sq = &f->fx_sqs[FUTEX_WRITERQ];
   2112 		nwaitersp = &f->fx_nwaiters[FUTEX_WRITERQ];
   2113 		nwake = 1;
   2114 		newval = FUTEX_RW_WRITE_LOCKED | l_writer->l_lid;
   2115 		if (__predict_false(f->fx_nwaiters[FUTEX_WRITERQ] > 1)) {
   2116 			KASSERT(LIST_NEXT(l_writer, l_sleepchain) != NULL);
   2117 			newval |= FUTEX_WAITERS | FUTEX_RW_WRITE_WANTED;
   2118 		} else if (__predict_true(f->fx_nwaiters[FUTEX_READERQ] != 0)) {
   2119 			KASSERT(!LIST_EMPTY(&f->fx_sqs[FUTEX_READERQ]));
   2120 			newval |= FUTEX_WAITERS;
   2121 		} else {
   2122 			KASSERT(LIST_EMPTY(&f->fx_sqs[FUTEX_READERQ]));
   2123 		}
   2124 		l = l_writer;
   2125 	}
   2126 
   2127 	KASSERT(sq == &f->fx_sqs[FUTEX_READERQ] ||
   2128 		sq == &f->fx_sqs[FUTEX_WRITERQ]);
   2129 	while (nwake != 0 && l != NULL) {
   2130 		l_next = LIST_NEXT(l, l_sleepchain);
   2131 		KASSERT(l->l_syncobj == &futex_syncobj);
   2132 		KASSERT(l->l_wchan == (wchan_t)f);
   2133 		KASSERT(l->l_futex == f);
   2134 		KASSERT(l->l_sleepq == sq);
   2135 		KASSERT(l->l_mutex == f->fx_sq_lock);
   2136 
   2137 		/*
   2138 		 * NULL wchan --> timeout will not wake LWP.
   2139 		 * NULL lock --> keep existing lock.
   2140 		 */
   2141 		sleepq_transfer(l, sq, &wsq, NULL/*wchan*/, futex_wmesg,
   2142 		    &sched_syncobj, NULL/*lock*/, false);
   2143 
   2144 		KASSERT(*nwaitersp > 0);
   2145 		(*nwaitersp)--;
   2146 		nwoken++;
   2147 		nwake--;
   2148 		/* hold count adjusted below */
   2149 		l = l_next;
   2150 	}
   2151 
   2152 	mutex_spin_exit(f->fx_sq_lock);
   2153 
   2154 	/*
   2155 	 * STEP 2
   2156 	 */
   2157 	for (;; val = next) {
   2158 		error = ucas_int(uaddr, val, newval, &next);
   2159 		if (__predict_false(error != 0)) {
   2160 			/*
   2161 			 * The futex word has become unmapped.  All bets
   2162 			 * are off.  Break out of the loop and awaken the
   2163 			 * waiters; this is easier than trying to stuff
   2164 			 * them back into their previous sleepq, and the
   2165 			 * application is screwed anyway.
   2166 			 */
   2167 			break;
   2168 		}
   2169 		if (__predict_true(next == val)) {
   2170 			/*
   2171 			 * Successfully updated the futex word!
   2172 			 */
   2173 			break;
   2174 		}
   2175 		/*
   2176 		 * The only thing that could have possibly happened
   2177 		 * (barring some bug in the thread library) is that
   2178 		 * additional waiter bits arrived.  Those new waiters
   2179 		 * have lost the race to acquire the lock, but we
   2180 		 * need to preserve those bits.
   2181 		 */
   2182 		newval |= next & (FUTEX_WAITERS | FUTEX_RW_WRITE_WANTED);
   2183 	}
   2184 
   2185 	mutex_spin_enter(f->fx_sq_lock);
   2186 
   2187 	/*
   2188 	 * STEP 3
   2189 	 */
   2190 
   2191 	LIST_FOREACH_SAFE(l, &wsq, l_sleepchain, l_next) {
   2192 		KASSERT(l->l_syncobj == &sched_syncobj);
   2193 		KASSERT(l->l_wchan == NULL);
   2194 		KASSERT(l->l_futex == f);
   2195 		KASSERT(l->l_sleepq == &wsq);
   2196 		KASSERT(l->l_mutex == f->fx_sq_lock);
   2197 
   2198 		l->l_futex_wakesel = 0;
   2199 		l->l_futex = NULL;
   2200 		sleepq_remove(&wsq, l);
   2201 	}
    2202 	/* If we selected all readers, assert that we woke them all. */
   2203 	KASSERT(allreaders == 0 || allreaders == nwoken);
   2204 	KASSERT(allreaders == 0 || LIST_EMPTY(&f->fx_sqs[FUTEX_READERQ]));
   2205 	KASSERT(allreaders == 0 || f->fx_nwaiters[FUTEX_READERQ] == 0);
   2206 
   2207 	mutex_spin_exit(f->fx_sq_lock);
   2208 
   2209 	/* Adjust hold count. */
   2210 	futex_rele_count_not_last(f, nwoken);
   2211 
   2212 	futex_op_unlock(f);
   2213 
   2214 	/* Release the futex.  */
   2215 	futex_rele(f);
   2216 
   2217 out:	if (__predict_true(error == 0)) {
   2218 		/* Return the number of waiters woken.  */
   2219 		*retval = nwoken;
   2220 	}
   2221 
    2223 	return error;
   2224 #endif
   2225 }
   2226 
   2227 /*
   2228  * do_futex(uaddr, op, val, timeout, uaddr2, val2, val3)
   2229  *
   2230  *	Implement the futex system call with all the parameters
   2231  *	parsed out.
   2232  */
   2233 int
   2234 do_futex(int *uaddr, int op, int val, const struct timespec *timeout,
   2235     int *uaddr2, int val2, int val3, register_t *retval)
   2236 {
   2237 	const bool shared = (op & FUTEX_PRIVATE_FLAG) ? false : true;
   2238 	const clockid_t clkid = (op & FUTEX_CLOCK_REALTIME) ? CLOCK_REALTIME
   2239 							    : CLOCK_MONOTONIC;
   2240 	int error;
   2241 
   2242 	op &= FUTEX_CMD_MASK;
   2243 
   2244 	switch (op) {
   2245 	case FUTEX_WAIT:
   2246 		SDT_PROBE6(futex, func, wait, entry,
   2247 		    uaddr, val, FUTEX_BITSET_MATCH_ANY, timeout,
   2248 		    TIMER_RELTIME, op & ~FUTEX_CMD_MASK);
   2249 		error = futex_func_wait(shared, uaddr, val,
   2250 		    FUTEX_BITSET_MATCH_ANY, timeout, clkid, TIMER_RELTIME,
   2251 		    retval);
   2252 		SDT_PROBE1(futex, func, wait, exit, error);
   2253 		break;
   2254 
   2255 	case FUTEX_WAIT_BITSET:
   2256 		SDT_PROBE6(futex, func, wait, entry,
   2257 		    uaddr, val, val3, timeout,
   2258 		    TIMER_ABSTIME, op & ~FUTEX_CMD_MASK);
   2259 		error = futex_func_wait(shared, uaddr, val, val3, timeout,
   2260 		    clkid, TIMER_ABSTIME, retval);
   2261 		SDT_PROBE1(futex, func, wait, exit, error);
   2262 		break;
   2263 
   2264 	case FUTEX_WAKE:
   2265 		SDT_PROBE4(futex, func, wake, entry,
   2266 		    uaddr, val, FUTEX_BITSET_MATCH_ANY, op & ~FUTEX_CMD_MASK);
   2267 		error = futex_func_wake(shared, uaddr, val,
   2268 		    FUTEX_BITSET_MATCH_ANY, retval);
   2269 		SDT_PROBE2(futex, func, wake, exit, error, *retval);
   2270 		break;
   2271 
   2272 	case FUTEX_WAKE_BITSET:
   2273 		SDT_PROBE4(futex, func, wake, entry,
   2274 		    uaddr, val, val3, op & ~FUTEX_CMD_MASK);
   2275 		error = futex_func_wake(shared, uaddr, val, val3, retval);
   2276 		SDT_PROBE2(futex, func, wake, exit, error, *retval);
   2277 		break;
   2278 
   2279 	case FUTEX_REQUEUE:
   2280 		SDT_PROBE5(futex, func, requeue, entry,
   2281 		    uaddr, val, uaddr2, val2, op & ~FUTEX_CMD_MASK);
   2282 		error = futex_func_requeue(shared, op, uaddr, val, uaddr2,
   2283 		    val2, 0, retval);
   2284 		SDT_PROBE2(futex, func, requeue, exit, error, *retval);
   2285 		break;
   2286 
   2287 	case FUTEX_CMP_REQUEUE:
   2288 		SDT_PROBE6(futex, func, cmp_requeue, entry,
   2289 		    uaddr, val, uaddr2, val2, val3, op & ~FUTEX_CMD_MASK);
   2290 		error = futex_func_requeue(shared, op, uaddr, val, uaddr2,
   2291 		    val2, val3, retval);
   2292 		SDT_PROBE2(futex, func, cmp_requeue, exit, error, *retval);
   2293 		break;
   2294 
   2295 	case FUTEX_WAKE_OP:
   2296 		SDT_PROBE6(futex, func, wake_op, entry,
   2297 		    uaddr, val, uaddr2, val2, val3, op & ~FUTEX_CMD_MASK);
   2298 		error = futex_func_wake_op(shared, uaddr, val, uaddr2, val2,
   2299 		    val3, retval);
   2300 		SDT_PROBE2(futex, func, wake_op, exit, error, *retval);
   2301 		break;
   2302 
   2303 	case FUTEX_NETBSD_RW_WAIT:
   2304 		SDT_PROBE5(futex, func, rw_wait, entry,
   2305 		    uaddr, val, val3, timeout, op & ~FUTEX_CMD_MASK);
   2306 		error = futex_func_rw_wait(shared, uaddr, val, val3,
   2307 		    timeout, clkid, retval);
   2308 		SDT_PROBE1(futex, func, rw_wait, exit, error);
   2309 		break;
   2310 
   2311 	case FUTEX_NETBSD_RW_HANDOFF:
   2312 		SDT_PROBE3(futex, func, rw_handoff, entry,
   2313 		    uaddr, val, op & ~FUTEX_CMD_MASK);
   2314 		error = futex_func_rw_handoff(shared, uaddr, val, retval);
   2315 		SDT_PROBE2(futex, func, rw_handoff, exit, error, *retval);
   2316 		break;
   2317 
   2318 	case FUTEX_FD:
   2319 	default:
   2320 		error = ENOSYS;
   2321 		break;
   2322 	}
   2323 
   2324 	return error;
   2325 }
   2326 
   2327 /*
   2328  * sys___futex(l, uap, retval)
   2329  *
   2330  *	__futex(2) system call: generic futex operations.
   2331  */
   2332 int
   2333 sys___futex(struct lwp *l, const struct sys___futex_args *uap,
   2334     register_t *retval)
   2335 {
   2336 	/* {
   2337 		syscallarg(int *) uaddr;
   2338 		syscallarg(int) op;
   2339 		syscallarg(int) val;
   2340 		syscallarg(const struct timespec *) timeout;
   2341 		syscallarg(int *) uaddr2;
   2342 		syscallarg(int) val2;
   2343 		syscallarg(int) val3;
   2344 	} */
   2345 	struct timespec ts, *tsp;
   2346 	int error;
   2347 
   2348 	/*
   2349 	 * Copy in the timeout argument, if specified.
   2350 	 */
   2351 	if (SCARG(uap, timeout)) {
   2352 		error = copyin(SCARG(uap, timeout), &ts, sizeof(ts));
   2353 		if (error)
   2354 			return error;
   2355 		tsp = &ts;
   2356 	} else {
   2357 		tsp = NULL;
   2358 	}
   2359 
   2360 	return do_futex(SCARG(uap, uaddr), SCARG(uap, op), SCARG(uap, val),
   2361 	    tsp, SCARG(uap, uaddr2), SCARG(uap, val2), SCARG(uap, val3),
   2362 	    retval);
   2363 }
   2364 
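/*
 * Illustrative direct invocation (a sketch; applications normally
 * reach this through libc/libpthread rather than calling __futex(2)
 * themselves):
 *
 *	struct timespec ts = { .tv_sec = 1, .tv_nsec = 0 };
 *	// Wait while *uaddr still contains val, with a relative
 *	// one-second timeout; uaddr2 and val2 are unused here, and
 *	// FUTEX_WAIT ignores val3 (do_futex() substitutes
 *	// FUTEX_BITSET_MATCH_ANY).
 *	if (__futex(uaddr, FUTEX_WAIT | FUTEX_PRIVATE_FLAG, val,
 *	    &ts, NULL, 0, 0) == -1)
 *		err(1, "__futex");
 */
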
   2365 /*
   2366  * sys___futex_set_robust_list(l, uap, retval)
   2367  *
   2368  *	__futex_set_robust_list(2) system call for robust futexes.
   2369  */
   2370 int
   2371 sys___futex_set_robust_list(struct lwp *l,
   2372     const struct sys___futex_set_robust_list_args *uap, register_t *retval)
   2373 {
   2374 	/* {
   2375 		syscallarg(void *) head;
   2376 		syscallarg(size_t) len;
   2377 	} */
   2378 	void *head = SCARG(uap, head);
   2379 
   2380 	if (SCARG(uap, len) != _FUTEX_ROBUST_HEAD_SIZE)
   2381 		return EINVAL;
   2382 	if ((uintptr_t)head % sizeof(u_long))
   2383 		return EINVAL;
   2384 
   2385 	l->l_robust_head = (uintptr_t)head;
   2386 
   2387 	return 0;
   2388 }
   2389 
   2390 /*
   2391  * sys___futex_get_robust_list(l, uap, retval)
   2392  *
   2393  *	__futex_get_robust_list(2) system call for robust futexes.
   2394  */
   2395 int
   2396 sys___futex_get_robust_list(struct lwp *l,
   2397     const struct sys___futex_get_robust_list_args *uap, register_t *retval)
   2398 {
   2399 	/* {
   2400 		syscallarg(lwpid_t) lwpid;
   2401 		syscallarg(void **) headp;
   2402 		syscallarg(size_t *) lenp;
   2403 	} */
   2404 	void *head;
   2405 	const size_t len = _FUTEX_ROBUST_HEAD_SIZE;
   2406 	int error;
   2407 
   2408 	error = futex_robust_head_lookup(l, SCARG(uap, lwpid), &head);
   2409 	if (error)
   2410 		return error;
   2411 
   2412 	/* Copy out the head pointer and the head structure length. */
   2413 	error = copyout(&head, SCARG(uap, headp), sizeof(head));
   2414 	if (__predict_true(error == 0)) {
   2415 		error = copyout(&len, SCARG(uap, lenp), sizeof(len));
   2416 	}
   2417 
   2418 	return error;
   2419 }
   2420 
   2421 /*
   2422  * release_futex(uva, tid)
   2423  *
   2424  *	Try to release the robust futex at uva in the current process
   2425  *	on lwp exit.  If anything goes wrong, silently fail.  It is the
   2426  *	userland program's obligation to arrange correct behaviour.
   2427  */
   2428 static void
   2429 release_futex(uintptr_t const uptr, lwpid_t const tid, bool const is_pi,
   2430     bool const is_pending)
   2431 {
   2432 	int *uaddr;
   2433 	struct futex *f;
   2434 	int oldval, newval, actual;
   2435 	int error;
   2436 	bool shared;
   2437 
   2438 	/* If it's misaligned, tough.  */
   2439 	if (__predict_false(uptr & 3))
   2440 		return;
   2441 	uaddr = (int *)uptr;
   2442 
   2443 	error = futex_load(uaddr, &oldval);
   2444 	if (__predict_false(error))
   2445 		return;
   2446 
   2447 	/*
   2448 	 * There are two race conditions we need to handle here:
   2449 	 *
   2450 	 * 1. User space cleared the futex word but died before
   2451 	 *    being able to issue the wakeup.  No wakeups will
   2452 	 *    ever be issued, oops!
   2453 	 *
   2454 	 * 2. Awakened waiter died before being able to acquire
   2455 	 *    the futex in user space.  Any other waiters are
   2456 	 *    now stuck, oops!
   2457 	 *
   2458 	 * In both of these cases, the futex word will be 0 (because
   2459 	 * it's updated before the wake is issued).  The best we can
   2460 	 * do is detect this situation if it's the pending futex and
   2461 	 * issue a wake without modifying the futex word.
   2462 	 *
   2463 	 * XXX eventual PI handling?
   2464 	 */
   2465 	if (__predict_false(is_pending && (oldval & ~FUTEX_WAITERS) == 0)) {
   2466 		register_t retval;
   2467 		error = futex_func_wake(/*shared*/true, uaddr, 1,
   2468 		    FUTEX_BITSET_MATCH_ANY, &retval);
   2469 		if (error != 0 || retval == 0)
   2470 			(void) futex_func_wake(/*shared*/false, uaddr, 1,
   2471 			    FUTEX_BITSET_MATCH_ANY, &retval);
   2472 		return;
   2473 	}
   2474 
   2475 	/* Optimistically test whether we need to do anything at all.  */
   2476 	if ((oldval & FUTEX_TID_MASK) != tid)
   2477 		return;
   2478 
   2479 	/*
   2480 	 * We need to handle the case where this thread owned the futex,
   2481 	 * but it was uncontended.  In this case, there won't be any
   2482 	 * kernel state to look up.  All we can do is mark the futex
   2483 	 * as a zombie to be mopped up the next time another thread
   2484 	 * attempts to acquire it.
   2485 	 *
    2486 	 * N.B. It's important to set FUTEX_OWNER_DIED in this
    2487 	 * loop, even if waiters appear while we are doing so.
    2488 	 * This is because FUTEX_WAITERS is set by user space
   2489 	 * before calling __futex() to wait, and the futex needs
   2490 	 * to be marked as a zombie when the new waiter gets into
   2491 	 * the kernel.
   2492 	 */
   2493 	if ((oldval & FUTEX_WAITERS) == 0) {
   2494 		do {
   2495 			error = futex_load(uaddr, &oldval);
   2496 			if (error)
   2497 				return;
   2498 			if ((oldval & FUTEX_TID_MASK) != tid)
   2499 				return;
   2500 			newval = oldval | FUTEX_OWNER_DIED;
   2501 			error = ucas_int(uaddr, oldval, newval, &actual);
   2502 			if (error)
   2503 				return;
   2504 		} while (actual != oldval);
   2505 
   2506 		/*
    2507 		 * If there is still no indication of waiters, then there is
   2508 		 * no more work for us to do.
   2509 		 */
   2510 		if ((oldval & FUTEX_WAITERS) == 0)
   2511 			return;
   2512 	}
   2513 
   2514 	/*
   2515 	 * Look for a futex.  Try shared first, then private.  If we
    2516 	 * can't find one, tough.
   2517 	 *
   2518 	 * Note: the assumption here is that anyone placing a futex on
   2519 	 * the robust list is adhering to the rules, regardless of the
   2520 	 * futex class.
   2521 	 */
   2522 	for (f = NULL, shared = true; f == NULL; shared = false) {
   2523 		error = futex_lookup(uaddr, shared, FUTEX_CLASS_ANY, &f);
   2524 		if (error)
   2525 			return;
   2526 		if (f == NULL && shared == false)
   2527 			return;
   2528 	}
   2529 
   2530 	/* Work under the futex op lock.  */
   2531 	futex_op_lock(f);
   2532 
   2533 	/*
   2534 	 * Fetch the word: if the tid doesn't match ours, skip;
   2535 	 * otherwise, set the owner-died bit, atomically.
   2536 	 */
   2537 	do {
   2538 		error = futex_load(uaddr, &oldval);
   2539 		if (error)
   2540 			goto out;
   2541 		if ((oldval & FUTEX_TID_MASK) != tid)
   2542 			goto out;
   2543 		newval = oldval | FUTEX_OWNER_DIED;
   2544 		error = ucas_int(uaddr, oldval, newval, &actual);
   2545 		if (error)
   2546 			goto out;
   2547 	} while (actual != oldval);
   2548 
   2549 	/*
   2550 	 * If there may be waiters, try to wake one.  If anything goes
   2551 	 * wrong, tough.
   2552 	 *
   2553 	 * XXX eventual PI handling?
   2554 	 */
   2555 	if (oldval & FUTEX_WAITERS) {
   2556 		(void)futex_wake(f, FUTEX_WRITERQ, 1,
   2557 				 NULL, FUTEX_WRITERQ, 0,
   2558 				 FUTEX_BITSET_MATCH_ANY);
   2559 	}
   2560 
   2561 	/* Unlock the queue and release the futex.  */
   2562 out:	futex_op_unlock(f);
   2563 	futex_rele(f);
   2564 }
   2565 
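/*
 * Illustrative userland counterpart to the above (a sketch;
 * recover_state() stands in for whatever application-specific
 * recovery is needed and is not a real interface): a later lock
 * acquisition can detect that the previous owner died and repair
 * the protected state before reusing the lock:
 *
 *	int v = lockword;
 *	if (v & FUTEX_OWNER_DIED) {
 *		// The kernel marked this futex as a zombie; try to
 *		// claim it and clean up after the dead owner.
 *		if (atomic_cas_uint(&lockword, v, mytid) == v)
 *			recover_state();
 *	}
 */
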
   2566 /*
    2567  * futex_robust_head_lookup(l, lwpid, headp)
   2568  *
   2569  *	Helper function to look up a robust head by LWP ID.
   2570  */
   2571 int
   2572 futex_robust_head_lookup(struct lwp *l, lwpid_t lwpid, void **headp)
   2573 {
   2574 	struct proc *p = l->l_proc;
   2575 
   2576 	/* Find the other lwp, if requested; otherwise use our robust head.  */
   2577 	if (lwpid) {
   2578 		mutex_enter(p->p_lock);
   2579 		l = lwp_find(p, lwpid);
   2580 		if (l == NULL) {
   2581 			mutex_exit(p->p_lock);
   2582 			return ESRCH;
   2583 		}
   2584 		*headp = (void *)l->l_robust_head;
   2585 		mutex_exit(p->p_lock);
   2586 	} else {
   2587 		*headp = (void *)l->l_robust_head;
   2588 	}
   2589 	return 0;
   2590 }
   2591 
   2592 /*
   2593  * futex_fetch_robust_head(uaddr)
   2594  *
   2595  *	Helper routine to fetch the futex robust list head that
   2596  *	handles 32-bit binaries running on 64-bit kernels.
   2597  */
   2598 static int
   2599 futex_fetch_robust_head(uintptr_t uaddr, u_long *rhead)
   2600 {
   2601 #ifdef _LP64
   2602 	if (curproc->p_flag & PK_32) {
   2603 		uint32_t rhead32[_FUTEX_ROBUST_HEAD_NWORDS];
   2604 		int error;
   2605 
   2606 		error = copyin((void *)uaddr, rhead32, sizeof(rhead32));
   2607 		if (__predict_true(error == 0)) {
   2608 			for (int i = 0; i < _FUTEX_ROBUST_HEAD_NWORDS; i++) {
   2609 				if (i == _FUTEX_ROBUST_HEAD_OFFSET) {
   2610 					/*
   2611 					 * Make sure the offset is sign-
   2612 					 * extended.
   2613 					 */
   2614 					rhead[i] = (int32_t)rhead32[i];
   2615 				} else {
   2616 					rhead[i] = rhead32[i];
   2617 				}
   2618 			}
   2619 		}
   2620 		return error;
   2621 	}
    2622 #endif /* _LP64 */
   2623 
   2624 	return copyin((void *)uaddr, rhead,
   2625 	    sizeof(*rhead) * _FUTEX_ROBUST_HEAD_NWORDS);
   2626 }
   2627 
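/*
 * For reference (inferred from the indices used below in
 * futex_release_all_lwp()): the robust head fetched above is an
 * array of _FUTEX_ROBUST_HEAD_NWORDS u_longs, laid out as:
 *
 *	rhead[_FUTEX_ROBUST_HEAD_LIST]		link word of the first
 *						entry in the circular
 *						list of held futexes
 *	rhead[_FUTEX_ROBUST_HEAD_OFFSET]	signed offset from a
 *						link word to its futex
 *						word
 *	rhead[_FUTEX_ROBUST_HEAD_PENDING]	entry being acquired or
 *						released when the LWP
 *						exited, if any
 */
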
   2628 /*
   2629  * futex_decode_robust_word(word)
   2630  *
   2631  *	Decode a robust futex list word into the entry and entry
   2632  *	properties.
   2633  */
   2634 static inline void
   2635 futex_decode_robust_word(uintptr_t const word, uintptr_t * const entry,
   2636     bool * const is_pi)
   2637 {
   2638 	*is_pi = (word & _FUTEX_ROBUST_ENTRY_PI) ? true : false;
   2639 	*entry = word & ~_FUTEX_ROBUST_ENTRY_PI;
   2640 }
   2641 
   2642 /*
   2643  * futex_fetch_robust_entry(uaddr)
   2644  *
   2645  *	Helper routine to fetch and decode a robust futex entry
   2646  *	that handles 32-bit binaries running on 64-bit kernels.
   2647  */
   2648 static int
   2649 futex_fetch_robust_entry(uintptr_t const uaddr, uintptr_t * const valp,
   2650     bool * const is_pi)
   2651 {
   2652 	uintptr_t val = 0;
   2653 	int error = 0;
   2654 
   2655 #ifdef _LP64
   2656 	if (curproc->p_flag & PK_32) {
   2657 		uint32_t val32;
   2658 
   2659 		error = ufetch_32((uint32_t *)uaddr, &val32);
   2660 		if (__predict_true(error == 0))
   2661 			val = val32;
   2662 	} else
   2663 #endif /* _LP64 */
   2664 		error = ufetch_long((u_long *)uaddr, (u_long *)&val);
   2665 	if (__predict_false(error))
   2666 		return error;
   2667 
   2668 	futex_decode_robust_word(val, valp, is_pi);
   2669 	return 0;
   2670 }
   2671 
   2672 /*
   2673  * futex_release_all_lwp(l, tid)
   2674  *
   2675  *	Release all l's robust futexes.  If anything looks funny in
   2676  *	the process, give up -- it's userland's responsibility to dot
   2677  *	the i's and cross the t's.
   2678  */
   2679 void
   2680 futex_release_all_lwp(struct lwp * const l, lwpid_t const tid)
   2681 {
   2682 	u_long rhead[_FUTEX_ROBUST_HEAD_NWORDS];
   2683 	int limit = 1000000;
   2684 	int error;
   2685 
   2686 	/* If there's no robust list there's nothing to do. */
   2687 	if (l->l_robust_head == 0)
   2688 		return;
   2689 
   2690 	/* Read the final snapshot of the robust list head. */
   2691 	error = futex_fetch_robust_head(l->l_robust_head, rhead);
   2692 	if (error) {
   2693 		printf("WARNING: pid %jd (%s) lwp %jd tid %jd:"
   2694 		    " unmapped robust futex list head\n",
   2695 		    (uintmax_t)l->l_proc->p_pid, l->l_proc->p_comm,
   2696 		    (uintmax_t)l->l_lid, (uintmax_t)tid);
   2697 		return;
   2698 	}
   2699 
   2700 	const long offset = (long)rhead[_FUTEX_ROBUST_HEAD_OFFSET];
   2701 
   2702 	uintptr_t next, pending;
   2703 	bool is_pi, pending_is_pi;
   2704 
   2705 	futex_decode_robust_word(rhead[_FUTEX_ROBUST_HEAD_LIST],
   2706 	    &next, &is_pi);
   2707 	futex_decode_robust_word(rhead[_FUTEX_ROBUST_HEAD_PENDING],
   2708 	    &pending, &pending_is_pi);
   2709 
   2710 	/*
   2711 	 * Walk down the list of locked futexes and release them, up
   2712 	 * to one million of them before we give up.
   2713 	 */
   2714 
   2715 	while (next != l->l_robust_head && limit-- > 0) {
   2716 		/* pending handled below. */
   2717 		if (next != pending)
   2718 			release_futex(next + offset, tid, is_pi, false);
   2719 		error = futex_fetch_robust_entry(next, &next, &is_pi);
   2720 		if (error)
   2721 			break;
   2722 		preempt_point();
   2723 	}
   2724 	if (limit <= 0) {
   2725 		printf("WARNING: pid %jd (%s) lwp %jd tid %jd:"
   2726 		    " exhausted robust futex limit\n",
   2727 		    (uintmax_t)l->l_proc->p_pid, l->l_proc->p_comm,
   2728 		    (uintmax_t)l->l_lid, (uintmax_t)tid);
   2729 	}
   2730 
   2731 	/* If there's a pending futex, it may need to be released too. */
   2732 	if (pending != 0) {
   2733 		release_futex(pending + offset, tid, pending_is_pi, true);
   2734 	}
   2735 }
   2736