      1 /*	$NetBSD: sys_futex.c,v 1.12.4.3 2021/08/06 18:23:57 thorpej Exp $	*/
      2 
      3 /*-
      4  * Copyright (c) 2018, 2019, 2020 The NetBSD Foundation, Inc.
      5  * All rights reserved.
      6  *
      7  * This code is derived from software contributed to The NetBSD Foundation
      8  * by Taylor R. Campbell and Jason R. Thorpe.
      9  *
     10  * Redistribution and use in source and binary forms, with or without
     11  * modification, are permitted provided that the following conditions
     12  * are met:
     13  * 1. Redistributions of source code must retain the above copyright
     14  *    notice, this list of conditions and the following disclaimer.
     15  * 2. Redistributions in binary form must reproduce the above copyright
     16  *    notice, this list of conditions and the following disclaimer in the
     17  *    documentation and/or other materials provided with the distribution.
     18  *
     19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
     20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
     21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
     23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
     24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
     25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
     26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
     27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
     28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
     29  * POSSIBILITY OF SUCH DAMAGE.
     30  */
     31 
     32 #include <sys/cdefs.h>
     33 __KERNEL_RCSID(0, "$NetBSD: sys_futex.c,v 1.12.4.3 2021/08/06 18:23:57 thorpej Exp $");
     34 
     35 /*
     36  * Futexes
     37  *
     38  *	The futex system call coordinates notifying threads waiting for
     39  *	changes on a 32-bit word of memory.  The word can be managed by
     40  *	CPU atomic operations in userland, without system calls, as long
     41  *	as there is no contention.
     42  *
     43  *	The simplest use case demonstrating the utility is:
     44  *
     45  *		// 32-bit word of memory shared among threads or
     46  *		// processes in userland.  lock & 1 means owned;
     47  *		// lock & 2 means there are waiters waiting.
     48  *		volatile int lock = 0;
     49  *
     50  *		int v;
     51  *
     52  *		// Acquire a lock.
     53  *		do {
     54  *			v = lock;
     55  *			if (v & 1) {
     56  *				// Lock is held.  Set a bit to say that
     57  *				// there are waiters, and wait for lock
     58  *				// to change to anything other than v;
     59  *				// then retry.
     60  *				if (atomic_cas_uint(&lock, v, v | 2) != v)
     61  *					continue;
     62  *				futex(FUTEX_WAIT, &lock, v | 2, NULL, NULL, 0);
     63  *				continue;
     64  *			}
      65  *		} while (atomic_cas_uint(&lock, v, v | 1) != v);
     66  *		membar_enter();
     67  *
     68  *		...
     69  *
     70  *		// Release the lock.  Optimistically assume there are
      71  *		// no waiters, until demonstrated otherwise.
     72  *		membar_exit();
     73  *		if (atomic_cas_uint(&lock, 1, 0) != 1) {
     74  *			// There may be waiters.
     75  *			v = atomic_swap_uint(&lock, 0);
     76  *			// If there are still waiters, wake one.
     77  *			if (v & 2)
     78  *				futex(FUTEX_WAKE, &lock, 1, NULL, NULL, 0);
     79  *		}
     80  *
     81  *	The goal is to avoid the futex system call unless there is
     82  *	contention; then if there is contention, to guarantee no missed
     83  *	wakeups.
     84  *
     85  *	For a simple implementation, futex(FUTEX_WAIT) could queue
     86  *	itself to be woken, double-check the lock word, and then sleep;
     87  *	spurious wakeups are generally a fact of life, so any
     88  *	FUTEX_WAKE could just wake every FUTEX_WAIT in the system.
     89  *
     90  *	If this were all there is to it, we could then increase
     91  *	parallelism by refining the approximation: partition the
     92  *	waiters into buckets by hashing the lock addresses to reduce
     93  *	the incidence of spurious wakeups.  But this is not all.
     94  *
     95  *	The futex(FUTEX_CMP_REQUEUE, &lock, n, &lock2, m, val)
     96  *	operation not only wakes n waiters on lock if lock == val, but
     97  *	also _transfers_ m additional waiters to lock2.  Unless wakeups
     98  *	on lock2 also trigger wakeups on lock, we cannot move waiters
     99  *	to lock2 if they merely share the same hash as waiters on lock.
    100  *	Thus, we can't approximately distribute waiters into queues by
    101  *	a hash function; we must distinguish futex queues exactly by
    102  *	lock address.
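 *
 *	For example (a sketch only, with hypothetical cv_seq/mtx_lock
 *	variables, not an excerpt from any particular userland
 *	library): a condition-variable broadcast can wake one waiter
 *	on the condvar word and requeue the remaining waiters onto the
 *	word serving as the mutex, so they contend for the lock one at
 *	a time instead of stampeding:
 *
 *		// cv_seq is the condvar futex word; mtx_lock is the
 *		// mutex futex word; seq is the value of cv_seq
 *		// sampled before broadcasting.
 *		futex(FUTEX_CMP_REQUEUE, &cv_seq, 1, &mtx_lock,
 *		    INT_MAX, seq);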
    103  *
    104  *	For now, we use a global red/black tree to index futexes.  This
    105  *	should be replaced by a lockless radix tree with a thread to
    106  *	free entries no longer in use once all lookups on all CPUs have
    107  *	completed.
    108  *
    109  *	Specifically, we maintain two maps:
    110  *
    111  *	futex_tab.va[vmspace, va] for private futexes
    112  *	futex_tab.oa[uvm_voaddr] for shared futexes
    113  *
    114  *	This implementation does not support priority inheritance.
    115  */
    116 
    117 #include <sys/param.h>
    118 #include <sys/types.h>
    119 #include <sys/atomic.h>
    120 #include <sys/condvar.h>
    121 #include <sys/futex.h>
    122 #include <sys/mutex.h>
    123 #include <sys/rbtree.h>
    124 #include <sys/sleepq.h>
    125 #include <sys/queue.h>
    126 #include <sys/sdt.h>
    127 
    128 #include <sys/syscall.h>
    129 #include <sys/syscallargs.h>
    130 #include <sys/syscallvar.h>
    131 
    132 #include <uvm/uvm_extern.h>
    133 
    134 /*
    135  * DTrace probes.
    136  */
    137 SDT_PROVIDER_DEFINE(futex);
    138 
    139 /* entry: uaddr, val, bitset, timeout, clkflags, fflags */
    140 /* exit: error */
    141 SDT_PROBE_DEFINE6(futex, func, wait, entry, "int *", "int", "int",
    142 		  "struct timespec *", "int", "int");
    143 SDT_PROBE_DEFINE1(futex, func, wait, exit, "int");
    144 
    145 /* entry: uaddr, nwake, bitset, fflags */
    146 /* exit: error, nwoken */
    147 SDT_PROBE_DEFINE4(futex, func, wake, entry, "int *", "int", "int", "int");
    148 SDT_PROBE_DEFINE2(futex, func, wake, exit, "int", "int");
    149 
    150 /* entry: uaddr, nwake, uaddr2, nrequeue, fflags */
    151 /* exit: error, nwoken */
    152 SDT_PROBE_DEFINE5(futex, func, requeue, entry, "int *", "int", "int *", "int",
    153 		  "int");
    154 SDT_PROBE_DEFINE2(futex, func, requeue, exit, "int", "int");
    155 
    156 /* entry: uaddr, nwake, uaddr2, nrequeue, val3, fflags */
    157 /* exit: error, nwoken */
    158 SDT_PROBE_DEFINE6(futex, func, cmp_requeue, entry, "int *", "int", "int *",
    159 		  "int", "int", "int");
    160 SDT_PROBE_DEFINE2(futex, func, cmp_requeue, exit, "int", "int");
    161 
    162 /* entry: uaddr, nwake, uaddr2, nwake2, wakeop, fflags */
    163 /* exit: error, nwoken */
    164 SDT_PROBE_DEFINE6(futex, func, wake_op, entry, "int *", "int", "int *", "int",
    165 		  "int", "int");
    166 SDT_PROBE_DEFINE2(futex, func, wake_op, exit, "int", "int");
    167 
    168 SDT_PROBE_DEFINE0(futex, wait, finish, normally);
    169 SDT_PROBE_DEFINE0(futex, wait, finish, wakerace);
    170 SDT_PROBE_DEFINE0(futex, wait, finish, aborted);
    171 
    172 /* entry: timo */
    173 /* exit: error */
    174 SDT_PROBE_DEFINE1(futex, wait, sleepq_block, entry, "int");
    175 SDT_PROBE_DEFINE1(futex, wait, sleepq_block, exit, "int");
    176 
    177 /*
    178  * Lock order:
    179  *
    180  *	futex_tab.lock
    181  *	futex::fx_op_lock		ordered by kva of struct futex
    182  *	 -> futex::fx_sq_lock		ordered by kva of sleepq lock
    183  *
    184  * N.B. multiple futexes can share a single sleepq lock.
    185  */
    186 
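/*
 * For example (an illustrative sketch mirroring futex_wait() and
 * futex_wake() below), an operation that serializes against other
 * futex operations and then manipulates a sleep queue acquires the
 * locks in op-lock -> sleepq-lock order:
 *
 *	futex_op_lock(f);
 *	mutex_spin_enter(f->fx_sq_lock);
 *	...wake, requeue, or enqueue...
 *	mutex_spin_exit(f->fx_sq_lock);
 *	futex_op_unlock(f);
 */
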
    187 /*
    188  * union futex_key
    189  *
    190  *	A futex is addressed either by a vmspace+va (private) or by
    191  *	a uvm_voaddr (shared).
    192  */
    193 union futex_key {
    194 	struct {
    195 		struct vmspace			*vmspace;
    196 		vaddr_t				va;
    197 	}			fk_private;
    198 	struct uvm_voaddr	fk_shared;
    199 };
    200 
    201 /*
    202  * struct futex
    203  *
    204  *	Kernel state for a futex located at a particular address in a
    205  *	particular virtual address space.
    206  *
    207  *	N.B. fx_refcnt is an unsigned long because we need to be able
    208  *	to operate on it atomically on all systems while at the same
    209  *	time rendering practically impossible the chance of it reaching
    210  *	its max value.  In practice, we're limited by the number of LWPs
    211  *	that can be present on the system at any given time, and the
    212  *	assumption is that limit will be good enough on a 32-bit platform.
    213  *	See futex_wake() for why overflow needs to be avoided.
    214  *
    215  *	XXX Since futex addresses must be 4-byte aligned, we could
    216  *	XXX squirrel away fx_shared and fx_on_tree bits in the "va"
    217  *	XXX field of the key.  Worth it?
    218  */
    219 struct futex {
    220 	union futex_key		fx_key;
    221 	struct rb_node		fx_node;
    222 	unsigned long		fx_refcnt;
    223 	bool			fx_shared;
    224 	bool			fx_on_tree;
    225 	uint8_t			fx_class;
    226 
    227 	kmutex_t		fx_op_lock;	/* adaptive */
    228 	kmutex_t *		fx_sq_lock;	/* &sleepq_locks[...] */
    229 	sleepq_t		fx_sqs[2];	/* 0=reader, 1=writer */
    230 	unsigned int		fx_nwaiters[2];
    231 };
    232 
    233 /*
    234  * futex classes: Some futex operations can only be carried out on
    235  * futexes that are known to be abiding by a certain protocol.  These
    236  * classes are assigned to a futex when created due to a wait event,
    237  * and when subsequent wake or requeue operations are issued, the
    238  * class is checked at futex lookup time.  If the class does not match,
    239  * EINVAL is the result.
    240  */
    241 #define	FUTEX_CLASS_ANY		0		/* match any class in lookup */
    242 #define	FUTEX_CLASS_NORMAL	1		/* normal futex */
    243 #define	FUTEX_CLASS_RWLOCK	2		/* rwlock futex */
    244 #define	FUTEX_CLASS_PI		3		/* for FUTEX_*_PI ops */
    245 
    246 /* sleepq definitions */
    247 #define	FUTEX_READERQ		0
    248 #define	FUTEX_WRITERQ		1
    249 
    250 static const char futex_wmesg[] = "futex";
    251 
    252 static void	futex_unsleep(lwp_t *, bool);
    253 
    254 static syncobj_t futex_syncobj = {
    255 	.sobj_flag	= SOBJ_SLEEPQ_SORTED,
    256 	.sobj_unsleep	= futex_unsleep,
    257 	.sobj_changepri	= sleepq_changepri,
    258 	.sobj_lendpri	= sleepq_lendpri,
    259 	.sobj_owner	= syncobj_noowner,
    260 };
    261 
    262 /*
    263  * futex_tab
    264  *
    265  *	Global trees of futexes by vmspace/va and VM object address.
    266  *
    267  *	XXX This obviously doesn't scale in parallel.  We could use a
    268  *	pserialize-safe data structure, but there may be a high cost to
    269  *	frequent deletion since we don't cache futexes after we're done
    270  *	with them.  We could use hashed locks.  But for now, just make
    271  *	sure userland can't DoS the serial performance, by using a
    272  *	balanced binary tree for lookup.
    273  *
    274  *	XXX We could use a per-process tree for the table indexed by
    275  *	virtual address to reduce contention between processes.
    276  */
    277 static struct {
    278 	kmutex_t	lock;
    279 	struct rb_tree	va;
    280 	struct rb_tree	oa;
    281 } futex_tab __cacheline_aligned;
    282 
    283 static int
    284 compare_futex_key(void *cookie, const void *n, const void *k)
    285 {
    286 	const struct futex *fa = n;
    287 	const union futex_key *fka = &fa->fx_key;
    288 	const union futex_key *fkb = k;
    289 
    290 	if ((uintptr_t)fka->fk_private.vmspace <
    291 	    (uintptr_t)fkb->fk_private.vmspace)
    292 		return -1;
    293 	if ((uintptr_t)fka->fk_private.vmspace >
    294 	    (uintptr_t)fkb->fk_private.vmspace)
    295 		return +1;
    296 	if (fka->fk_private.va < fkb->fk_private.va)
    297 		return -1;
    298 	if (fka->fk_private.va > fkb->fk_private.va)
     299 		return +1;
    300 	return 0;
    301 }
    302 
    303 static int
    304 compare_futex(void *cookie, const void *na, const void *nb)
    305 {
    306 	const struct futex *fa = na;
    307 	const struct futex *fb = nb;
    308 
    309 	return compare_futex_key(cookie, fa, &fb->fx_key);
    310 }
    311 
    312 static const rb_tree_ops_t futex_rb_ops = {
    313 	.rbto_compare_nodes = compare_futex,
    314 	.rbto_compare_key = compare_futex_key,
    315 	.rbto_node_offset = offsetof(struct futex, fx_node),
    316 };
    317 
    318 static int
    319 compare_futex_shared_key(void *cookie, const void *n, const void *k)
    320 {
    321 	const struct futex *fa = n;
    322 	const union futex_key *fka = &fa->fx_key;
    323 	const union futex_key *fkb = k;
    324 
    325 	return uvm_voaddr_compare(&fka->fk_shared, &fkb->fk_shared);
    326 }
    327 
    328 static int
    329 compare_futex_shared(void *cookie, const void *na, const void *nb)
    330 {
    331 	const struct futex *fa = na;
    332 	const struct futex *fb = nb;
    333 
    334 	return compare_futex_shared_key(cookie, fa, &fb->fx_key);
    335 }
    336 
    337 static const rb_tree_ops_t futex_shared_rb_ops = {
    338 	.rbto_compare_nodes = compare_futex_shared,
    339 	.rbto_compare_key = compare_futex_shared_key,
    340 	.rbto_node_offset = offsetof(struct futex, fx_node),
    341 };
    342 
    343 /*
    344  * futex_sq_lock2(f, f2)
    345  *
    346  *	Acquire the sleepq locks for f and f2, which may be null, or
    347  *	which may be the same.  If they are distinct, an arbitrary total
    348  *	order is chosen on the locks.
    349  *
    350  *	Callers should only ever acquire multiple sleepq locks
    351  *	simultaneously using futex_sq_lock2.
    352  */
    353 static void
    354 futex_sq_lock2(struct futex * const f, struct futex * const f2)
    355 {
    356 
    357 	/*
    358 	 * If both are null, do nothing; if one is null and the other
    359 	 * is not, lock the other and be done with it.
    360 	 */
    361 	if (f == NULL && f2 == NULL) {
    362 		return;
    363 	} else if (f == NULL) {
    364 		mutex_spin_enter(f2->fx_sq_lock);
    365 		return;
    366 	} else if (f2 == NULL) {
    367 		mutex_spin_enter(f->fx_sq_lock);
    368 		return;
    369 	}
    370 
    371 	kmutex_t * const m = f->fx_sq_lock;
    372 	kmutex_t * const m2 = f2->fx_sq_lock;
    373 
    374 	/* If both locks are the same, acquire only one. */
    375 	if (m == m2) {
    376 		mutex_spin_enter(m);
    377 		return;
    378 	}
    379 
    380 	/* Otherwise, use the ordering on the kva of the lock pointer.  */
    381 	if ((uintptr_t)m < (uintptr_t)m2) {
    382 		mutex_spin_enter(m);
    383 		mutex_spin_enter(m2);
    384 	} else {
    385 		mutex_spin_enter(m2);
    386 		mutex_spin_enter(m);
    387 	}
    388 }
    389 
    390 /*
    391  * futex_sq_unlock2(f, f2)
    392  *
    393  *	Release the sleep queue locks for f and f2, which may be null, or
    394  *	which may have the same underlying lock.
    395  */
    396 static void
    397 futex_sq_unlock2(struct futex * const f, struct futex * const f2)
    398 {
    399 
    400 	/*
    401 	 * If both are null, do nothing; if one is null and the other
    402 	 * is not, unlock the other and be done with it.
    403 	 */
    404 	if (f == NULL && f2 == NULL) {
    405 		return;
    406 	} else if (f == NULL) {
    407 		mutex_spin_exit(f2->fx_sq_lock);
    408 		return;
    409 	} else if (f2 == NULL) {
    410 		mutex_spin_exit(f->fx_sq_lock);
    411 		return;
    412 	}
    413 
    414 	kmutex_t * const m = f->fx_sq_lock;
    415 	kmutex_t * const m2 = f2->fx_sq_lock;
    416 
    417 	/* If both locks are the same, release only one. */
    418 	if (m == m2) {
    419 		mutex_spin_exit(m);
    420 		return;
    421 	}
    422 
    423 	/* Otherwise, use the ordering on the kva of the lock pointer.  */
    424 	if ((uintptr_t)m < (uintptr_t)m2) {
    425 		mutex_spin_exit(m2);
    426 		mutex_spin_exit(m);
    427 	} else {
    428 		mutex_spin_exit(m);
    429 		mutex_spin_exit(m2);
    430 	}
    431 }
    432 
    433 /*
    434  * futex_load(uaddr, kaddr)
    435  *
    436  *	Perform a single atomic load to read *uaddr, and return the
    437  *	result in *kaddr.  Return 0 on success, EFAULT if uaddr is not
    438  *	mapped.
    439  */
    440 static inline int
    441 futex_load(int *uaddr, int *kaddr)
    442 {
    443 	return ufetch_int((u_int *)uaddr, (u_int *)kaddr);
    444 }
    445 
    446 /*
    447  * futex_test(uaddr, expected)
    448  *
    449  *	True if *uaddr == expected.  False if *uaddr != expected, or if
    450  *	uaddr is not mapped.
    451  */
    452 static bool
    453 futex_test(int *uaddr, int expected)
    454 {
    455 	int val;
    456 	int error;
    457 
    458 	error = futex_load(uaddr, &val);
    459 	if (error)
    460 		return false;
    461 	return val == expected;
    462 }
    463 
    464 static pool_cache_t	futex_cache		__read_mostly;
    465 
    466 static int	futex_ctor(void *, void *, int);
    467 static void	futex_dtor(void *, void *);
    468 
    469 /*
    470  * futex_sys_init()
    471  *
    472  *	Initialize the futex subsystem.
    473  */
    474 void
    475 futex_sys_init(void)
    476 {
    477 
    478 	mutex_init(&futex_tab.lock, MUTEX_DEFAULT, IPL_NONE);
    479 	rb_tree_init(&futex_tab.va, &futex_rb_ops);
    480 	rb_tree_init(&futex_tab.oa, &futex_shared_rb_ops);
    481 
    482 	futex_cache = pool_cache_init(sizeof(struct futex),
    483 	    coherency_unit, 0, 0, "futex", NULL, IPL_NONE, futex_ctor,
    484 	    futex_dtor, NULL);
    485 }
    486 
    487 /*
    488  * futex_sys_fini()
    489  *
    490  *	Finalize the futex subsystem.
    491  */
    492 void
    493 futex_sys_fini(void)
    494 {
    495 
    496 	KASSERT(RB_TREE_MIN(&futex_tab.oa) == NULL);
    497 	KASSERT(RB_TREE_MIN(&futex_tab.va) == NULL);
    498 	mutex_destroy(&futex_tab.lock);
    499 }
    500 
    501 /*
    502  * futex_ctor()
    503  *
    504  *	Futex object constructor.
    505  */
    506 static int
    507 futex_ctor(void *arg __unused, void *obj, int flags __unused)
    508 {
    509 	extern sleepqlock_t sleepq_locks[SLEEPTAB_HASH_SIZE];
    510 	struct futex * const f = obj;
    511 
    512 	mutex_init(&f->fx_op_lock, MUTEX_DEFAULT, IPL_NONE);
    513 	f->fx_sq_lock = &sleepq_locks[SLEEPTAB_HASH(f)].lock;
    514 
    515 	sleepq_init(&f->fx_sqs[FUTEX_READERQ]);
    516 	sleepq_init(&f->fx_sqs[FUTEX_WRITERQ]);
    517 	f->fx_nwaiters[FUTEX_READERQ] = f->fx_nwaiters[FUTEX_WRITERQ] = 0;
    518 
    519 	return 0;
    520 }
    521 
    522 /*
    523  * futex_dtor()
    524  *
    525  *	Futex object destructor.
    526  */
    527 static void
    528 futex_dtor(void *arg __unused, void *obj)
    529 {
    530 	struct futex * const f = obj;
    531 
    532 	mutex_destroy(&f->fx_op_lock);
    533 	f->fx_sq_lock = NULL;
    534 }
    535 
    536 /*
    537  * futex_key_init(key, vm, va, shared)
    538  *
    539  *	Initialize a futex key for lookup, etc.
    540  */
    541 static int
    542 futex_key_init(union futex_key *fk, struct vmspace *vm, vaddr_t va, bool shared)
    543 {
    544 	int error = 0;
    545 
    546 	if (__predict_false(shared)) {
    547 		if (!uvm_voaddr_acquire(&vm->vm_map, va, &fk->fk_shared))
    548 			error = EFAULT;
    549 	} else {
    550 		fk->fk_private.vmspace = vm;
    551 		fk->fk_private.va = va;
    552 	}
    553 
    554 	return error;
    555 }
    556 
    557 /*
    558  * futex_key_fini(key, shared)
    559  *
    560  *	Release a futex key.
    561  */
    562 static void
    563 futex_key_fini(union futex_key *fk, bool shared)
    564 {
    565 	if (__predict_false(shared))
    566 		uvm_voaddr_release(&fk->fk_shared);
    567 	memset(fk, 0, sizeof(*fk));
    568 }
    569 
    570 /*
    571  * futex_create(fk, shared)
    572  *
    573  *	Create a futex.  Initial reference count is 1, representing the
    574  *	caller.  Returns NULL on failure.  Always takes ownership of the
    575  *	key, either transferring it to the newly-created futex, or releasing
    576  *	the key if creation fails.
    577  *
    578  *	Never sleeps for memory, but may sleep to acquire a lock.
    579  */
    580 static struct futex *
    581 futex_create(union futex_key *fk, bool shared, uint8_t class)
    582 {
    583 	struct futex *f;
    584 
    585 	f = pool_cache_get(futex_cache, PR_NOWAIT);
    586 	if (f == NULL) {
    587 		futex_key_fini(fk, shared);
    588 		return NULL;
    589 	}
    590 	f->fx_key = *fk;
    591 	f->fx_refcnt = 1;
    592 	f->fx_shared = shared;
    593 	f->fx_on_tree = false;
    594 	f->fx_class = class;
    595 
    596 	return f;
    597 }
    598 
    599 /*
    600  * futex_destroy(f)
    601  *
    602  *	Destroy a futex created with futex_create.  Reference count
    603  *	must be zero.
    604  *
    605  *	May sleep.
    606  */
    607 static void
    608 futex_destroy(struct futex *f)
    609 {
    610 
    611 	ASSERT_SLEEPABLE();
    612 
    613 	KASSERT(atomic_load_relaxed(&f->fx_refcnt) == 0);
    614 	KASSERT(!f->fx_on_tree);
    615 	KASSERT(LIST_EMPTY(&f->fx_sqs[FUTEX_READERQ]));
    616 	KASSERT(LIST_EMPTY(&f->fx_sqs[FUTEX_WRITERQ]));
    617 	KASSERT(f->fx_nwaiters[FUTEX_READERQ] == 0);
    618 	KASSERT(f->fx_nwaiters[FUTEX_WRITERQ] == 0);
    619 
    620 	futex_key_fini(&f->fx_key, f->fx_shared);
    621 
    622 	pool_cache_put(futex_cache, f);
    623 }
    624 
    625 /*
    626  * futex_hold_count(f, n)
    627  *
    628  *	Attempt to acquire a reference to f.  Return 0 on success,
    629  *	ENFILE on too many references.
    630  *
    631  *	Never sleeps.
    632  */
    633 static int
    634 futex_hold_count(struct futex *f, unsigned long const count)
    635 {
    636 	unsigned long refcnt;
    637 
    638 	do {
    639 		refcnt = atomic_load_relaxed(&f->fx_refcnt);
    640 		if (ULONG_MAX - refcnt < count)
    641 			return ENFILE;
    642 	} while (atomic_cas_ulong(&f->fx_refcnt, refcnt,
    643 				  refcnt + count) != refcnt);
    644 
    645 	return 0;
    646 }
    647 #define	futex_hold(f)	futex_hold_count(f, 1)
    648 
    649 /*
    650  * futex_rele_count(f, n)
    651  *
    652  *	Release a reference to f acquired with futex_create or
    653  *	futex_hold.
    654  *
    655  *	May sleep to free f.
    656  */
    657 static void
    658 futex_rele_count(struct futex *f, unsigned long const count)
    659 {
    660 	unsigned long refcnt;
    661 
    662 	ASSERT_SLEEPABLE();
    663 
    664 	do {
    665 		refcnt = atomic_load_relaxed(&f->fx_refcnt);
    666 		KASSERT(refcnt >= count);
    667 		if (refcnt - count == 0)
    668 			goto trylast;
    669 	} while (atomic_cas_ulong(&f->fx_refcnt, refcnt,
    670 				  refcnt - count) != refcnt);
    671 	return;
    672 
    673 trylast:
    674 	KASSERT(count <= LONG_MAX);
    675 	mutex_enter(&futex_tab.lock);
    676 	if (atomic_add_long_nv(&f->fx_refcnt, -(long)count) == 0) {
    677 		if (f->fx_on_tree) {
    678 			if (__predict_false(f->fx_shared))
    679 				rb_tree_remove_node(&futex_tab.oa, f);
    680 			else
    681 				rb_tree_remove_node(&futex_tab.va, f);
    682 			f->fx_on_tree = false;
    683 		}
    684 	} else {
    685 		/* References remain -- don't destroy it.  */
    686 		f = NULL;
    687 	}
    688 	mutex_exit(&futex_tab.lock);
    689 	if (f != NULL)
    690 		futex_destroy(f);
    691 }
    692 #define	futex_rele(f)	futex_rele_count(f, 1)
    693 
    694 /*
    695  * futex_rele_count_not_last(f, n)
    696  *
    697  *	Release a reference to f acquired with futex_create or
    698  *	futex_hold.
    699  *
    700  *	This version asserts that we are not dropping the last
    701  *	reference to f.
    702  */
    703 static void
    704 futex_rele_count_not_last(struct futex *f, unsigned long const count)
    705 {
    706 	unsigned long refcnt;
    707 
    708 	do {
    709 		refcnt = atomic_load_relaxed(&f->fx_refcnt);
    710 		KASSERT(refcnt >= count);
    711 	} while (atomic_cas_ulong(&f->fx_refcnt, refcnt,
    712 				  refcnt - count) != refcnt);
    713 }
    714 
    715 /*
    716  * futex_lookup_by_key(key, shared, class, &f)
    717  *
     718  *	Try to find an existing futex va reference for the specified key.
     719  *	On success, return 0, set f to the found futex or to NULL if not found,
    720  *	and increment f's reference count if found.
    721  *
    722  *	Return ENFILE if reference count too high.
    723  *
    724  *	Internal lookup routine shared by futex_lookup() and
    725  *	futex_lookup_create().
    726  */
    727 static int
    728 futex_lookup_by_key(union futex_key *fk, bool shared, uint8_t class,
    729     struct futex **fp)
    730 {
    731 	struct futex *f;
    732 	int error = 0;
    733 
    734 	mutex_enter(&futex_tab.lock);
    735 	if (__predict_false(shared)) {
    736 		f = rb_tree_find_node(&futex_tab.oa, fk);
    737 	} else {
    738 		f = rb_tree_find_node(&futex_tab.va, fk);
    739 	}
    740 	if (f) {
    741 		if (__predict_true(f->fx_class == class ||
    742 				   class == FUTEX_CLASS_ANY))
    743 			error = futex_hold(f);
    744 		else
    745 			error = EINVAL;
    746 		if (error)
    747 			f = NULL;
    748 	}
     749 	*fp = f;
    750 	mutex_exit(&futex_tab.lock);
    751 
    752 	return error;
    753 }
    754 
    755 /*
    756  * futex_insert(f, fp)
    757  *
    758  *	Try to insert the futex f into the tree by va.  If there
    759  *	already is a futex for its va, acquire a reference to it, and
    760  *	store it in *fp; otherwise store f in *fp.
    761  *
    762  *	Return 0 on success, ENFILE if there already is a futex but its
    763  *	reference count is too high.
    764  */
    765 static int
    766 futex_insert(struct futex *f, struct futex **fp)
    767 {
    768 	struct futex *f0;
    769 	int error;
    770 
    771 	KASSERT(atomic_load_relaxed(&f->fx_refcnt) != 0);
    772 	KASSERT(!f->fx_on_tree);
    773 
    774 	mutex_enter(&futex_tab.lock);
    775 	if (__predict_false(f->fx_shared))
    776 		f0 = rb_tree_insert_node(&futex_tab.oa, f);
    777 	else
    778 		f0 = rb_tree_insert_node(&futex_tab.va, f);
    779 	if (f0 == f) {
    780 		f->fx_on_tree = true;
    781 		error = 0;
    782 	} else {
    783 		KASSERT(atomic_load_relaxed(&f0->fx_refcnt) != 0);
    784 		KASSERT(f0->fx_on_tree);
    785 		error = futex_hold(f0);
    786 		if (error)
    787 			goto out;
    788 	}
    789 	*fp = f0;
    790 out:	mutex_exit(&futex_tab.lock);
    791 
    792 	return error;
    793 }
    794 
    795 /*
    796  * futex_lookup(uaddr, shared, class, &f)
    797  *
    798  *	Find a futex at the userland pointer uaddr in the current
    799  *	process's VM space.  On success, return the futex in f and
    800  *	increment its reference count.
    801  *
    802  *	Caller must call futex_rele when done.
    803  */
    804 static int
    805 futex_lookup(int *uaddr, bool shared, uint8_t class, struct futex **fp)
    806 {
    807 	union futex_key fk;
    808 	struct vmspace *vm = curproc->p_vmspace;
    809 	vaddr_t va = (vaddr_t)uaddr;
    810 	int error;
    811 
    812 	/*
    813 	 * Reject unaligned user pointers so we don't cross page
    814 	 * boundaries and so atomics will work.
    815 	 */
    816 	if (__predict_false((va & 3) != 0))
    817 		return EINVAL;
    818 
    819 	/* Look it up. */
    820 	error = futex_key_init(&fk, vm, va, shared);
    821 	if (error)
    822 		return error;
    823 
    824 	error = futex_lookup_by_key(&fk, shared, class, fp);
    825 	futex_key_fini(&fk, shared);
    826 	if (error)
    827 		return error;
    828 
    829 	KASSERT(*fp == NULL || (*fp)->fx_shared == shared);
    830 	KASSERT(*fp == NULL || (*fp)->fx_class == class);
    831 	KASSERT(*fp == NULL || atomic_load_relaxed(&(*fp)->fx_refcnt) != 0);
    832 
    833 	/*
    834 	 * Success!  (Caller must still check whether we found
    835 	 * anything, but nothing went _wrong_ like trying to use
    836 	 * unmapped memory.)
    837 	 */
    838 	KASSERT(error == 0);
    839 
    840 	return error;
    841 }
    842 
    843 /*
    844  * futex_lookup_create(uaddr, shared, class, &f)
    845  *
    846  *	Find or create a futex at the userland pointer uaddr in the
    847  *	current process's VM space.  On success, return the futex in f
    848  *	and increment its reference count.
    849  *
    850  *	Caller must call futex_rele when done.
    851  */
    852 static int
    853 futex_lookup_create(int *uaddr, bool shared, uint8_t class, struct futex **fp)
    854 {
    855 	union futex_key fk;
    856 	struct vmspace *vm = curproc->p_vmspace;
    857 	struct futex *f = NULL;
    858 	vaddr_t va = (vaddr_t)uaddr;
    859 	int error;
    860 
    861 	/*
    862 	 * Reject unaligned user pointers so we don't cross page
    863 	 * boundaries and so atomics will work.
    864 	 */
    865 	if (__predict_false((va & 3) != 0))
    866 		return EINVAL;
    867 
    868 	if (__predict_false(class == FUTEX_CLASS_ANY))
    869 		return EINVAL;
    870 
    871 	error = futex_key_init(&fk, vm, va, shared);
    872 	if (error)
    873 		return error;
    874 
    875 	/*
    876 	 * Optimistically assume there already is one, and try to find
    877 	 * it.
    878 	 */
    879 	error = futex_lookup_by_key(&fk, shared, class, fp);
    880 	if (error || *fp != NULL) {
    881 		/*
    882 		 * We either found one, or there was an error.
    883 		 * In either case, we are done with the key.
    884 		 */
    885 		futex_key_fini(&fk, shared);
    886 		goto out;
    887 	}
    888 
    889 	/*
     890 	 * Create a futex record.  This transfers ownership of the key
    891 	 * in all cases.
    892 	 */
    893 	f = futex_create(&fk, shared, class);
    894 	if (f == NULL) {
    895 		error = ENOMEM;
    896 		goto out;
    897 	}
    898 
    899 	/*
    900 	 * Insert our new futex, or use existing if someone else beat
    901 	 * us to it.
    902 	 */
    903 	error = futex_insert(f, fp);
    904 	if (error)
    905 		goto out;
    906 	if (*fp == f)
    907 		f = NULL;	/* don't release on exit */
    908 
    909 	/* Success!  */
    910 	KASSERT(error == 0);
    911 
    912 out:	if (f != NULL)
    913 		futex_rele(f);
    914 	KASSERT(error || *fp != NULL);
    915 	KASSERT(error || atomic_load_relaxed(&(*fp)->fx_refcnt) != 0);
    916 	return error;
    917 }
    918 
    919 /*
    920  * futex_unsleep:
    921  *
    922  *	Remove an LWP from the futex and sleep queue.  This is called when
    923  *	the LWP has not been awoken normally but instead interrupted: for
    924  *	example, when a signal is received.  Must be called with the LWP
    925  *	locked.  Will unlock if "unlock" is true.
    926  */
    927 static void
    928 futex_unsleep(lwp_t *l, bool unlock)
    929 {
    930 	struct futex *f __diagused;
    931 
    932 	f = (struct futex *)(uintptr_t)l->l_wchan;
    933 
    934 	KASSERT(mutex_owned(f->fx_sq_lock));
    935 
    936 	/* WRITERQ is more likely */
    937 	if (__predict_true(l->l_sleepq == &f->fx_sqs[FUTEX_WRITERQ])) {
    938 		KASSERT(f->fx_nwaiters[FUTEX_WRITERQ] > 0);
    939 		f->fx_nwaiters[FUTEX_WRITERQ]--;
    940 	} else {
    941 		KASSERT(l->l_sleepq == &f->fx_sqs[FUTEX_READERQ]);
    942 		KASSERT(f->fx_nwaiters[FUTEX_READERQ] > 0);
    943 		f->fx_nwaiters[FUTEX_READERQ]--;
    944 	}
    945 
    946 	sleepq_unsleep(l, unlock);
    947 }
    948 
    949 /*
     950  * futex_wait(f, q, deadline, clkid, bitset)
    951  *
    952  *	Wait until deadline on the clock clkid, or forever if deadline
    953  *	is NULL, for a futex wakeup.  Return 0 on explicit wakeup,
    954  *	ETIMEDOUT on timeout, EINTR on signal.
    955  */
    956 static int
    957 futex_wait(struct futex *f, int q, const struct timespec *deadline,
    958     clockid_t clkid, int bitset)
    959 {
    960 
    961 	/*
    962 	 * Some things to note about how this function works:
    963 	 *
    964 	 * ==> When we enter futex_wait(), the futex's op lock is
    965 	 * held, preventing us from missing wakes.  Once we are on
    966 	 * the futex's sleepq, it is safe to release the op lock.
    967 	 *
    968 	 * ==> We have a single early escape to avoid calling
    969 	 * sleepq_block(): our deadline already having passed.
    970 	 * If this is a no-timeout wait, or if the deadline has
    971 	 * not already passed, we are guaranteed to call sleepq_block().
    972 	 *
    973 	 * ==> sleepq_block() contains all of the necessary logic
    974 	 * for handling signals; we do not need to check for them
    975 	 * ourselves.
    976 	 *
    977 	 * ==> Once we have blocked, other futex operations will
    978 	 * be able to wake us or requeue us.  Until we have blocked,
    979 	 * those other futex operations will not be able to acquire
    980 	 * the sleepq locks in order to perform those operations,
    981 	 * even if we have dropped the op lock.  Thus, we will not
    982 	 * miss wakes.  This fundamental invariant is relied upon by
    983 	 * every other synchronization object in the kernel.
    984 	 *
    985 	 * ==> sleepq_block() returns for one of three reasons:
    986 	 *
    987 	 *	-- We were awakened.
    988 	 *	-- We were signaled.
    989 	 *	-- We timed out.
    990 	 *
    991 	 * ==> Once sleepq_block() returns, we will not call
    992 	 * sleepq_block() again.
    993 	 *
    994 	 * ==> If sleepq_block() returns due to being signaled,
    995 	 * we must check to see if we were also awakened.  This
    996 	 * is to handle the following sequence:
    997 	 *
    998 	 *	-- Futex wake takes us off sleepq, places us on runq.
    999 	 *	-- We are signaled while sitting on the runq.
   1000 	 *	-- We run, and sleepq_block() notices the signal and
    1001 	 *	   returns EINTR/ERESTART.
   1002 	 *
   1003 	 * In this situation, we want to indicate a successful wake
   1004 	 * to the caller, because that wake has been reported to the
   1005 	 * thread that issued it.
   1006 	 *
   1007 	 * Note that it is NOT possible for a wake to be issued after
   1008 	 * a signal; the LWP will already have been taken off the sleepq
   1009 	 * by the signal, so the wake will never happen.  Note that for
   1010 	 * this reason, we must *never* return ERESTART, because it could
   1011 	 * result in us waiting again with no one to awaken us.
   1012 	 */
   1013 
   1014 	struct lwp * const l = curlwp;
   1015 	int timo;
   1016 	int error;
   1017 
   1018 	/*
   1019 	 * Compute our timeout before taking any locks.
   1020 	 */
   1021 	if (deadline) {
   1022 		struct timespec ts;
   1023 
   1024 		/* Check our watch.  */
   1025 		error = clock_gettime1(clkid, &ts);
   1026 		if (error) {
   1027 			mutex_exit(&f->fx_op_lock);
   1028 			return error;
   1029 		}
   1030 
   1031 		/*
   1032 		 * If we're past the deadline, ETIMEDOUT.
   1033 		 */
   1034 		if (timespeccmp(deadline, &ts, <=)) {
   1035 			mutex_exit(&f->fx_op_lock);
   1036 			return ETIMEDOUT;
   1037 		} else {
   1038 			/* Count how much time is left.  */
   1039 			timespecsub(deadline, &ts, &ts);
   1040 			timo = tstohz(&ts);
   1041 			if (timo == 0) {
   1042 				/*
   1043 				 * tstohz() already rounds up, and
   1044 				 * a return value of 0 ticks means
   1045 				 * "expired now or in the past".
   1046 				 */
   1047 				mutex_exit(&f->fx_op_lock);
   1048 				return ETIMEDOUT;
   1049 			}
   1050 		}
   1051 	} else {
   1052 		timo = 0;
   1053 	}
   1054 
   1055 	/*
   1056 	 * Acquire in sleepq -> lwp order.  While we're at it, ensure
   1057 	 * that there's room for us to block.
   1058 	 */
   1059 	mutex_spin_enter(f->fx_sq_lock);
   1060 	if (__predict_false(q == FUTEX_READERQ &&
   1061 			    f->fx_nwaiters[q] == FUTEX_TID_MASK)) {
   1062 		mutex_spin_exit(f->fx_sq_lock);
   1063 		mutex_exit(&f->fx_op_lock);
   1064 		return ENFILE;
   1065 	}
   1066 	lwp_lock(l);
   1067 
   1068 	/*
   1069 	 * No need for locks here; until we're on a futex's sleepq, these
   1070 	 * values are private to the LWP, and the locks needed to get onto
   1071 	 * the sleepq provide the memory barriers needed to ensure visibility.
   1072 	 */
   1073 	l->l_futex = f;
   1074 	l->l_futex_wakesel = bitset;
   1075 
   1076 	/*
   1077 	 * This is an inlined bit of sleepq_enter();
   1078 	 * we already hold the lwp lock.
   1079 	 */
   1080 	lwp_unlock_to(l, f->fx_sq_lock);
   1081 	KERNEL_UNLOCK_ALL(NULL, &l->l_biglocks);
   1082 	KASSERT(lwp_locked(l, f->fx_sq_lock));
   1083 
   1084 	sleepq_enqueue(&f->fx_sqs[q], f, futex_wmesg, &futex_syncobj, true);
   1085 	f->fx_nwaiters[q]++;
   1086 
   1087 	/* We can now safely release the op lock. */
   1088 	mutex_exit(&f->fx_op_lock);
   1089 
   1090 	SDT_PROBE1(futex, wait, sleepq_block, entry, timo);
   1091 	error = sleepq_block(timo, true);
   1092 	SDT_PROBE1(futex, wait, sleepq_block, exit, error);
   1093 
   1094 	/*
   1095 	 * f is no longer valid; we may have been moved to another
   1096 	 * futex sleepq while we slept.
   1097 	 */
   1098 
   1099 	/*
   1100 	 * If something went wrong, we may still have a futex reference.
   1101 	 * Check for that here and drop it if so.  We can do this without
   1102 	 * taking any locks because l->l_futex is private to the LWP when
   1103 	 * not on any futex's sleepq, and we are not on any sleepq because
   1104 	 * we are running.
   1105 	 *
   1106 	 * Convert EWOULDBLOCK to ETIMEDOUT in case sleepq_block() returned
   1107 	 * EWOULDBLOCK, and ensure the only other error we return is EINTR.
   1108 	 */
   1109 	if (error) {
   1110 		f = l->l_futex;
   1111 		if (f != NULL) {
   1112 			SDT_PROBE0(futex, wait, finish, aborted);
   1113 			l->l_futex = NULL;
   1114 			futex_rele(f);
   1115 		} else {
   1116 			/* Raced with wakeup; report success. */
   1117 			SDT_PROBE0(futex, wait, finish, wakerace);
   1118 			error = 0;
   1119 		}
   1120 		if (error == EWOULDBLOCK)
   1121 			error = ETIMEDOUT;
    1122 		else if (error != 0 && error != ETIMEDOUT)
   1123 			error = EINTR;
   1124 	} else {
   1125 		f = l->l_futex;
   1126 		if (__predict_false(f != NULL)) {
   1127 			/*
   1128 			 * There are certain situations that can cause
   1129 			 * sleepq_block() to return 0 even if we were
   1130 			 * signalled (by e.g. SIGKILL).  In this case,
   1131 			 * we will need to release our futex hold and
   1132 			 * return EINTR (we're probably about to die
   1133 			 * anyway).
   1134 			 */
   1135 			SDT_PROBE0(futex, wait, finish, aborted);
   1136 			l->l_futex = NULL;
   1137 			futex_rele(f);
   1138 			error = EINTR;
   1139 		} else {
   1140 			SDT_PROBE0(futex, wait, finish, normally);
   1141 		}
   1142 	}
   1143 
   1144 	return error;
   1145 }
   1146 
   1147 /*
   1148  * futex_wake(f, q, nwake, f2, q2, nrequeue, bitset)
   1149  *
   1150  *	Wake up to nwake waiters on f matching bitset; then, if f2 is
   1151  *	provided, move up to nrequeue remaining waiters on f matching
   1152  *	bitset to f2.  Return the number of waiters actually woken.
   1153  *	Caller must hold the locks of f and f2, if provided.
   1154  */
   1155 static unsigned
   1156 futex_wake(struct futex *f, int q, unsigned int const nwake,
   1157     struct futex *f2, int q2, unsigned int const nrequeue,
   1158     int bitset)
   1159 {
   1160 	struct lwp *l, *l_next;
   1161 	unsigned nwoken = 0;
   1162 	unsigned nrequeued = 0;
   1163 
   1164 	KASSERT(mutex_owned(&f->fx_op_lock));
   1165 	KASSERT(f2 == NULL || mutex_owned(&f2->fx_op_lock));
   1166 
   1167 	futex_sq_lock2(f, f2);
   1168 
   1169 	/* Wake up to nwake waiters, and count the number woken.  */
   1170 	LIST_FOREACH_SAFE(l, &f->fx_sqs[q], l_sleepchain, l_next) {
   1171 		if (nwoken == nwake)
   1172 			break;
   1173 
   1174 		KASSERT(l->l_syncobj == &futex_syncobj);
   1175 		KASSERT(l->l_wchan == (wchan_t)f);
   1176 		KASSERT(l->l_futex == f);
   1177 		KASSERT(l->l_sleepq == &f->fx_sqs[q]);
   1178 		KASSERT(l->l_mutex == f->fx_sq_lock);
   1179 
   1180 		if ((l->l_futex_wakesel & bitset) == 0)
   1181 			continue;
   1182 
   1183 		l->l_futex_wakesel = 0;
   1184 		l->l_futex = NULL;
   1185 		sleepq_remove(&f->fx_sqs[q], l);
   1186 		f->fx_nwaiters[q]--;
   1187 		nwoken++;
   1188 		/* hold counts adjusted in bulk below */
   1189 	}
   1190 
   1191 	if (nrequeue) {
   1192 		KASSERT(f2 != NULL);
   1193 		/* Move up to nrequeue waiters from f's queue to f2's queue. */
   1194 		LIST_FOREACH_SAFE(l, &f->fx_sqs[q], l_sleepchain, l_next) {
   1195 			if (nrequeued == nrequeue)
   1196 				break;
   1197 
   1198 			KASSERT(l->l_syncobj == &futex_syncobj);
   1199 			KASSERT(l->l_wchan == (wchan_t)f);
   1200 			KASSERT(l->l_futex == f);
   1201 			KASSERT(l->l_sleepq == &f->fx_sqs[q]);
   1202 			KASSERT(l->l_mutex == f->fx_sq_lock);
   1203 
   1204 			if ((l->l_futex_wakesel & bitset) == 0)
   1205 				continue;
   1206 
   1207 			l->l_futex = f2;
   1208 			sleepq_transfer(l, &f->fx_sqs[q],
   1209 			    &f2->fx_sqs[q2], f2, futex_wmesg,
   1210 			    &futex_syncobj, f2->fx_sq_lock, true);
   1211 			f->fx_nwaiters[q]--;
   1212 			f2->fx_nwaiters[q2]++;
   1213 			nrequeued++;
   1214 			/* hold counts adjusted in bulk below */
   1215 		}
   1216 		if (nrequeued) {
   1217 			/* XXX futex_hold() could theoretically fail here. */
   1218 			int hold_error __diagused =
   1219 			    futex_hold_count(f2, nrequeued);
   1220 			KASSERT(hold_error == 0);
   1221 		}
   1222 	}
   1223 
   1224 	/*
   1225 	 * Adjust the futex reference counts for the wakes and
   1226 	 * requeues.
   1227 	 */
   1228 	KASSERT(nwoken + nrequeued <= LONG_MAX);
   1229 	futex_rele_count_not_last(f, nwoken + nrequeued);
   1230 
   1231 	futex_sq_unlock2(f, f2);
   1232 
   1233 	/* Return the number of waiters woken and requeued.  */
   1234 	return nwoken + nrequeued;
   1235 }
   1236 
   1237 /*
   1238  * futex_op_lock(f)
   1239  *
   1240  *	Acquire the op lock of f.  Pair with futex_op_unlock.  Do
   1241  *	not use if caller needs to acquire two locks; use
   1242  *	futex_op_lock2 instead.
   1243  */
   1244 static void
   1245 futex_op_lock(struct futex *f)
   1246 {
   1247 	mutex_enter(&f->fx_op_lock);
   1248 }
   1249 
   1250 /*
   1251  * futex_op_unlock(f)
   1252  *
   1253  *	Release the op lock of f.
   1254  */
   1255 static void
   1256 futex_op_unlock(struct futex *f)
   1257 {
   1258 	mutex_exit(&f->fx_op_lock);
   1259 }
   1260 
   1261 /*
   1262  * futex_op_lock2(f, f2)
   1263  *
   1264  *	Acquire the op locks of both f and f2, which may be null, or
   1265  *	which may be the same.  If they are distinct, an arbitrary total
   1266  *	order is chosen on the locks.
   1267  *
   1268  *	Callers should only ever acquire multiple op locks
   1269  *	simultaneously using futex_op_lock2.
   1270  */
   1271 static void
   1272 futex_op_lock2(struct futex *f, struct futex *f2)
   1273 {
   1274 
   1275 	/*
   1276 	 * If both are null, do nothing; if one is null and the other
   1277 	 * is not, lock the other and be done with it.
   1278 	 */
   1279 	if (f == NULL && f2 == NULL) {
   1280 		return;
   1281 	} else if (f == NULL) {
   1282 		mutex_enter(&f2->fx_op_lock);
   1283 		return;
   1284 	} else if (f2 == NULL) {
   1285 		mutex_enter(&f->fx_op_lock);
   1286 		return;
   1287 	}
   1288 
   1289 	/* If both futexes are the same, acquire only one. */
   1290 	if (f == f2) {
   1291 		mutex_enter(&f->fx_op_lock);
   1292 		return;
   1293 	}
   1294 
   1295 	/* Otherwise, use the ordering on the kva of the futex pointer.  */
   1296 	if ((uintptr_t)f < (uintptr_t)f2) {
   1297 		mutex_enter(&f->fx_op_lock);
   1298 		mutex_enter(&f2->fx_op_lock);
   1299 	} else {
   1300 		mutex_enter(&f2->fx_op_lock);
   1301 		mutex_enter(&f->fx_op_lock);
   1302 	}
   1303 }
   1304 
   1305 /*
   1306  * futex_op_unlock2(f, f2)
   1307  *
   1308  *	Release the queue locks of both f and f2, which may be null, or
   1309  *	which may have the same underlying queue.
   1310  */
   1311 static void
   1312 futex_op_unlock2(struct futex *f, struct futex *f2)
   1313 {
   1314 
   1315 	/*
   1316 	 * If both are null, do nothing; if one is null and the other
   1317 	 * is not, unlock the other and be done with it.
   1318 	 */
   1319 	if (f == NULL && f2 == NULL) {
   1320 		return;
   1321 	} else if (f == NULL) {
   1322 		mutex_exit(&f2->fx_op_lock);
   1323 		return;
   1324 	} else if (f2 == NULL) {
   1325 		mutex_exit(&f->fx_op_lock);
   1326 		return;
   1327 	}
   1328 
   1329 	/* If both futexes are the same, release only one. */
   1330 	if (f == f2) {
   1331 		mutex_exit(&f->fx_op_lock);
   1332 		return;
   1333 	}
   1334 
   1335 	/* Otherwise, use the ordering on the kva of the futex pointer.  */
   1336 	if ((uintptr_t)f < (uintptr_t)f2) {
   1337 		mutex_exit(&f2->fx_op_lock);
   1338 		mutex_exit(&f->fx_op_lock);
   1339 	} else {
   1340 		mutex_exit(&f->fx_op_lock);
   1341 		mutex_exit(&f2->fx_op_lock);
   1342 	}
   1343 }
   1344 
   1345 /*
   1346  * futex_func_wait(uaddr, val, val3, timeout, clkid, clkflags, retval)
   1347  *
   1348  *	Implement futex(FUTEX_WAIT) and futex(FUTEX_WAIT_BITSET).
   1349  */
   1350 static int
   1351 futex_func_wait(bool shared, int *uaddr, int val, int val3,
   1352     const struct timespec *timeout, clockid_t clkid, int clkflags,
   1353     register_t *retval)
   1354 {
   1355 	struct futex *f;
   1356 	struct timespec ts;
   1357 	const struct timespec *deadline;
   1358 	int error;
   1359 
   1360 	/*
   1361 	 * If there's nothing to wait for, and nobody will ever wake
   1362 	 * us, then don't set anything up to wait -- just stop here.
   1363 	 */
   1364 	if (val3 == 0)
   1365 		return EINVAL;
   1366 
   1367 	/* Optimistically test before anything else.  */
   1368 	if (!futex_test(uaddr, val))
   1369 		return EAGAIN;
   1370 
   1371 	/* Determine a deadline on the specified clock.  */
   1372 	if (timeout == NULL) {
   1373 		deadline = timeout;
   1374 	} else if ((clkflags & TIMER_ABSTIME) == TIMER_ABSTIME) {
   1375 		/* Sanity-check the deadline. */
   1376 		if (timeout->tv_sec < 0 ||
   1377 		    timeout->tv_nsec < 0 ||
   1378 		    timeout->tv_nsec >= 1000000000L) {
   1379 			return EINVAL;
   1380 		}
   1381 		deadline = timeout;
   1382 	} else {
   1383 		struct timespec interval = *timeout;
   1384 
   1385 		error = itimespecfix(&interval);
   1386 		if (error)
   1387 			return error;
   1388 		error = clock_gettime1(clkid, &ts);
   1389 		if (error)
   1390 			return error;
   1391 		timespecadd(&ts, &interval, &ts);
   1392 		deadline = &ts;
   1393 	}
   1394 
   1395 	/* Get the futex, creating it if necessary.  */
   1396 	error = futex_lookup_create(uaddr, shared, FUTEX_CLASS_NORMAL, &f);
   1397 	if (error)
   1398 		return error;
   1399 	KASSERT(f);
   1400 
   1401 	/*
   1402 	 * Under the op lock, check the value again: if it has
   1403 	 * already changed, EAGAIN; otherwise enqueue the waiter.
   1404 	 * Since FUTEX_WAKE will use the same lock and be done after
   1405 	 * modifying the value, the order in which we check and enqueue
   1406 	 * is immaterial.
   1407 	 */
   1408 	futex_op_lock(f);
   1409 	if (!futex_test(uaddr, val)) {
   1410 		futex_op_unlock(f);
   1411 		error = EAGAIN;
   1412 		goto out;
   1413 	}
   1414 
   1415 	/*
   1416 	 * Now wait.  futex_wait() will drop our op lock once we
   1417 	 * are entered into the sleep queue, thus ensuring atomicity
   1418 	 * of wakes with respect to waits.
   1419 	 */
   1420 	error = futex_wait(f, FUTEX_WRITERQ, deadline, clkid, val3);
   1421 
   1422 	/*
   1423 	 * We cannot drop our reference to the futex here, because
   1424 	 * we might be enqueued on a different one when we are awakened.
   1425 	 * The references will be managed on our behalf in the requeue,
   1426 	 * wake, and error cases.
   1427 	 */
   1428 	f = NULL;
   1429 
   1430 	if (__predict_true(error == 0)) {
   1431 		/* Return 0 on success, error on failure. */
   1432 		*retval = 0;
   1433 	}
   1434 
   1435 out:	if (f != NULL)
   1436 		futex_rele(f);
   1437 	return error;
   1438 }
   1439 
   1440 /*
   1441  * futex_func_wake(uaddr, val, val3, retval)
   1442  *
   1443  *	Implement futex(FUTEX_WAKE) and futex(FUTEX_WAKE_BITSET).
   1444  */
   1445 static int
   1446 futex_func_wake(bool shared, int *uaddr, int val, int val3, register_t *retval)
   1447 {
   1448 	struct futex *f;
   1449 	unsigned int nwoken = 0;
   1450 	int error = 0;
   1451 
   1452 	/* Reject negative number of wakeups.  */
   1453 	if (val < 0) {
   1454 		error = EINVAL;
   1455 		goto out;
   1456 	}
   1457 
   1458 	/* Look up the futex, if any.  */
   1459 	error = futex_lookup(uaddr, shared, FUTEX_CLASS_NORMAL, &f);
   1460 	if (error)
   1461 		goto out;
   1462 
   1463 	/* If there's no futex, there are no waiters to wake.  */
   1464 	if (f == NULL)
   1465 		goto out;
   1466 
   1467 	/*
   1468 	 * Under f's op lock, wake the waiters and remember the
   1469 	 * number woken.
   1470 	 */
   1471 	futex_op_lock(f);
   1472 	nwoken = futex_wake(f, FUTEX_WRITERQ, val,
   1473 			    NULL, FUTEX_WRITERQ, 0, val3);
   1474 	futex_op_unlock(f);
   1475 
   1476 	/* Release the futex.  */
   1477 	futex_rele(f);
   1478 
   1479 out:
   1480 	/* Return the number of waiters woken.  */
   1481 	*retval = nwoken;
   1482 
   1483 	/* Success!  */
   1484 	return error;
   1485 }
   1486 
   1487 /*
   1488  * futex_func_requeue(op, uaddr, val, uaddr2, val2, val3, retval)
   1489  *
   1490  *	Implement futex(FUTEX_REQUEUE) and futex(FUTEX_CMP_REQUEUE).
   1491  */
   1492 static int
   1493 futex_func_requeue(bool shared, int op, int *uaddr, int val, int *uaddr2,
   1494     int val2, int val3, register_t *retval)
   1495 {
   1496 	struct futex *f = NULL, *f2 = NULL;
   1497 	unsigned nwoken = 0;	/* default to zero woken on early return */
   1498 	int error;
   1499 
   1500 	/* Reject negative number of wakeups or requeues. */
   1501 	if (val < 0 || val2 < 0) {
   1502 		error = EINVAL;
   1503 		goto out;
   1504 	}
   1505 
   1506 	/* Look up the source futex, if any. */
   1507 	error = futex_lookup(uaddr, shared, FUTEX_CLASS_NORMAL, &f);
   1508 	if (error)
   1509 		goto out;
   1510 
   1511 	/* If there is none, nothing to do. */
   1512 	if (f == NULL)
   1513 		goto out;
   1514 
   1515 	/*
   1516 	 * We may need to create the destination futex because it's
   1517 	 * entirely possible it does not currently have any waiters.
   1518 	 */
   1519 	error = futex_lookup_create(uaddr2, shared, FUTEX_CLASS_NORMAL, &f2);
   1520 	if (error)
   1521 		goto out;
   1522 
   1523 	/*
   1524 	 * Under the futexes' op locks, check the value; if
   1525 	 * unchanged from val3, wake the waiters.
   1526 	 */
   1527 	futex_op_lock2(f, f2);
   1528 	if (op == FUTEX_CMP_REQUEUE && !futex_test(uaddr, val3)) {
   1529 		error = EAGAIN;
   1530 	} else {
   1531 		error = 0;
   1532 		nwoken = futex_wake(f, FUTEX_WRITERQ, val,
   1533 				    f2, FUTEX_WRITERQ, val2,
   1534 				    FUTEX_BITSET_MATCH_ANY);
   1535 	}
   1536 	futex_op_unlock2(f, f2);
   1537 
   1538 out:
   1539 	/* Return the number of waiters woken.  */
   1540 	*retval = nwoken;
   1541 
   1542 	/* Release the futexes if we got them.  */
   1543 	if (f2)
   1544 		futex_rele(f2);
   1545 	if (f)
   1546 		futex_rele(f);
   1547 	return error;
   1548 }
   1549 
   1550 /*
   1551  * futex_validate_op_cmp(val3)
   1552  *
   1553  *	Validate an op/cmp argument for FUTEX_WAKE_OP.
   1554  */
   1555 static int
   1556 futex_validate_op_cmp(int val3)
   1557 {
   1558 	int op = __SHIFTOUT(val3, FUTEX_OP_OP_MASK);
   1559 	int cmp = __SHIFTOUT(val3, FUTEX_OP_CMP_MASK);
   1560 
   1561 	if (op & FUTEX_OP_OPARG_SHIFT) {
   1562 		int oparg = __SHIFTOUT(val3, FUTEX_OP_OPARG_MASK);
   1563 		if (oparg < 0)
   1564 			return EINVAL;
   1565 		if (oparg >= 32)
   1566 			return EINVAL;
   1567 		op &= ~FUTEX_OP_OPARG_SHIFT;
   1568 	}
   1569 
   1570 	switch (op) {
   1571 	case FUTEX_OP_SET:
   1572 	case FUTEX_OP_ADD:
   1573 	case FUTEX_OP_OR:
   1574 	case FUTEX_OP_ANDN:
   1575 	case FUTEX_OP_XOR:
   1576 		break;
   1577 	default:
   1578 		return EINVAL;
   1579 	}
   1580 
   1581 	switch (cmp) {
   1582 	case FUTEX_OP_CMP_EQ:
   1583 	case FUTEX_OP_CMP_NE:
   1584 	case FUTEX_OP_CMP_LT:
   1585 	case FUTEX_OP_CMP_LE:
   1586 	case FUTEX_OP_CMP_GT:
   1587 	case FUTEX_OP_CMP_GE:
   1588 		break;
   1589 	default:
   1590 		return EINVAL;
   1591 	}
   1592 
   1593 	return 0;
   1594 }
   1595 
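/*
 * For example (a sketch; __SHIFTIN() is the inverse of the
 * __SHIFTOUT() decoding used below): to build a FUTEX_WAKE_OP val3
 * meaning "*uaddr2 |= 1, and wake uaddr2's waiters only if the old
 * value was 0", i.e. FUTEX_OP_OR with FUTEX_OP_CMP_EQ, a caller
 * could encode:
 *
 *	val3 = __SHIFTIN(FUTEX_OP_OR, FUTEX_OP_OP_MASK) |
 *	    __SHIFTIN(1, FUTEX_OP_OPARG_MASK) |
 *	    __SHIFTIN(FUTEX_OP_CMP_EQ, FUTEX_OP_CMP_MASK) |
 *	    __SHIFTIN(0, FUTEX_OP_CMPARG_MASK);
 */
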
   1596 /*
   1597  * futex_compute_op(oldval, val3)
   1598  *
    1599  *	Apply a FUTEX_WAKE_OP operation to oldval.
   1600  */
   1601 static int
   1602 futex_compute_op(int oldval, int val3)
   1603 {
   1604 	int op = __SHIFTOUT(val3, FUTEX_OP_OP_MASK);
   1605 	int oparg = __SHIFTOUT(val3, FUTEX_OP_OPARG_MASK);
   1606 
   1607 	if (op & FUTEX_OP_OPARG_SHIFT) {
   1608 		KASSERT(oparg >= 0);
   1609 		KASSERT(oparg < 32);
   1610 		oparg = 1u << oparg;
   1611 		op &= ~FUTEX_OP_OPARG_SHIFT;
   1612 	}
   1613 
   1614 	switch (op) {
   1615 	case FUTEX_OP_SET:
   1616 		return oparg;
   1617 
   1618 	case FUTEX_OP_ADD:
   1619 		/*
   1620 		 * Avoid signed arithmetic overflow by doing
   1621 		 * arithmetic unsigned and converting back to signed
   1622 		 * at the end.
   1623 		 */
   1624 		return (int)((unsigned)oldval + (unsigned)oparg);
   1625 
   1626 	case FUTEX_OP_OR:
   1627 		return oldval | oparg;
   1628 
   1629 	case FUTEX_OP_ANDN:
   1630 		return oldval & ~oparg;
   1631 
   1632 	case FUTEX_OP_XOR:
   1633 		return oldval ^ oparg;
   1634 
   1635 	default:
   1636 		panic("invalid futex op");
   1637 	}
   1638 }
   1639 
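/*
 * Worked example: if val3 has FUTEX_OP_OPARG_SHIFT set with
 * oparg == 5, the effective operand becomes 1u << 5 == 0x20, so
 * FUTEX_OP_OR applied to oldval == 0x03 yields 0x23.
 */
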
   1640 /*
   1641  * futex_compute_cmp(oldval, val3)
   1642  *
    1643  *	Apply a FUTEX_WAKE_OP comparison to oldval.
   1644  */
   1645 static bool
   1646 futex_compute_cmp(int oldval, int val3)
   1647 {
   1648 	int cmp = __SHIFTOUT(val3, FUTEX_OP_CMP_MASK);
   1649 	int cmparg = __SHIFTOUT(val3, FUTEX_OP_CMPARG_MASK);
   1650 
   1651 	switch (cmp) {
   1652 	case FUTEX_OP_CMP_EQ:
   1653 		return (oldval == cmparg);
   1654 
   1655 	case FUTEX_OP_CMP_NE:
   1656 		return (oldval != cmparg);
   1657 
   1658 	case FUTEX_OP_CMP_LT:
   1659 		return (oldval < cmparg);
   1660 
   1661 	case FUTEX_OP_CMP_LE:
   1662 		return (oldval <= cmparg);
   1663 
   1664 	case FUTEX_OP_CMP_GT:
   1665 		return (oldval > cmparg);
   1666 
   1667 	case FUTEX_OP_CMP_GE:
   1668 		return (oldval >= cmparg);
   1669 
   1670 	default:
   1671 		panic("invalid futex cmp operation");
   1672 	}
   1673 }
   1674 
   1675 /*
   1676  * futex_func_wake_op(uaddr, val, uaddr2, val2, val3, retval)
   1677  *
   1678  *	Implement futex(FUTEX_WAKE_OP).
   1679  */
   1680 static int
   1681 futex_func_wake_op(bool shared, int *uaddr, int val, int *uaddr2, int val2,
   1682     int val3, register_t *retval)
   1683 {
   1684 	struct futex *f = NULL, *f2 = NULL;
   1685 	int oldval, newval, actual;
   1686 	unsigned nwoken = 0;
   1687 	int error;
   1688 
   1689 	/* Reject negative number of wakeups.  */
   1690 	if (val < 0 || val2 < 0) {
   1691 		error = EINVAL;
   1692 		goto out;
   1693 	}
   1694 
   1695 	/* Reject invalid operations before we start doing things.  */
   1696 	if ((error = futex_validate_op_cmp(val3)) != 0)
   1697 		goto out;
   1698 
   1699 	/* Look up the first futex, if any.  */
   1700 	error = futex_lookup(uaddr, shared, FUTEX_CLASS_NORMAL, &f);
   1701 	if (error)
   1702 		goto out;
   1703 
   1704 	/* Look up the second futex, if any.  */
   1705 	error = futex_lookup(uaddr2, shared, FUTEX_CLASS_NORMAL, &f2);
   1706 	if (error)
   1707 		goto out;
   1708 
   1709 	/*
   1710 	 * Under the op locks:
   1711 	 *
   1712 	 * 1. Read/modify/write: *uaddr2 op= oparg.
   1713 	 * 2. Unconditionally wake uaddr.
   1714 	 * 3. Conditionally wake uaddr2, if it previously matched val3.
   1715 	 */
   1716 	futex_op_lock2(f, f2);
   1717 	do {
   1718 		error = futex_load(uaddr2, &oldval);
   1719 		if (error)
   1720 			goto out_unlock;
   1721 		newval = futex_compute_op(oldval, val3);
   1722 		error = ucas_int(uaddr2, oldval, newval, &actual);
   1723 		if (error)
   1724 			goto out_unlock;
   1725 	} while (actual != oldval);
   1726 	nwoken = (f ? futex_wake(f, FUTEX_WRITERQ, val,
   1727 				 NULL, FUTEX_WRITERQ, 0,
   1728 				 FUTEX_BITSET_MATCH_ANY) : 0);
   1729 	if (f2 && futex_compute_cmp(oldval, val3))
   1730 		nwoken += futex_wake(f2, FUTEX_WRITERQ, val2,
   1731 				     NULL, FUTEX_WRITERQ, 0,
   1732 				     FUTEX_BITSET_MATCH_ANY);
   1733 
   1734 	/* Success! */
   1735 	error = 0;
   1736 out_unlock:
   1737 	futex_op_unlock2(f, f2);
   1738 
   1739 out:
   1740 	/* Return the number of waiters woken. */
   1741 	*retval = nwoken;
   1742 
   1743 	/* Release the futexes, if we got them. */
   1744 	if (f2)
   1745 		futex_rele(f2);
   1746 	if (f)
   1747 		futex_rele(f);
   1748 	return error;
   1749 }
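
/*
 * A sketch of the userland side of FUTEX_WAKE_OP, using the
 * hypothetical FUTEX_OP() encoder sketched above and the
 * __futex(2) argument order handled by sys___futex() below:
 * atomically store 0 into b, wake one waiter on a, and also wake
 * all waiters on b if b's old value was greater than 1.
 *
 *	__futex(&a, FUTEX_WAKE_OP | FUTEX_PRIVATE_FLAG, 1, NULL,
 *	    &b, INT_MAX, FUTEX_OP(FUTEX_OP_SET, 0, FUTEX_OP_CMP_GT, 1));
 */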
   1750 
   1751 /*
    1752  * do_futex(uaddr, op, val, timeout, uaddr2, val2, val3, retval)
   1753  *
   1754  *	Implement the futex system call with all the parameters
   1755  *	parsed out.
   1756  */
   1757 int
   1758 do_futex(int *uaddr, int op, int val, const struct timespec *timeout,
   1759     int *uaddr2, int val2, int val3, register_t *retval)
   1760 {
   1761 	const bool shared = (op & FUTEX_PRIVATE_FLAG) ? false : true;
   1762 	const clockid_t clkid = (op & FUTEX_CLOCK_REALTIME) ? CLOCK_REALTIME
   1763 							    : CLOCK_MONOTONIC;
   1764 	int error;
   1765 
   1766 	op &= FUTEX_CMD_MASK;
   1767 
   1768 	switch (op) {
   1769 	case FUTEX_WAIT:
   1770 		SDT_PROBE6(futex, func, wait, entry,
   1771 		    uaddr, val, FUTEX_BITSET_MATCH_ANY, timeout,
   1772 		    TIMER_RELTIME, op & ~FUTEX_CMD_MASK);
   1773 		error = futex_func_wait(shared, uaddr, val,
   1774 		    FUTEX_BITSET_MATCH_ANY, timeout, clkid, TIMER_RELTIME,
   1775 		    retval);
   1776 		SDT_PROBE1(futex, func, wait, exit, error);
   1777 		break;
   1778 
   1779 	case FUTEX_WAIT_BITSET:
   1780 		SDT_PROBE6(futex, func, wait, entry,
   1781 		    uaddr, val, val3, timeout,
   1782 		    TIMER_ABSTIME, op & ~FUTEX_CMD_MASK);
   1783 		error = futex_func_wait(shared, uaddr, val, val3, timeout,
   1784 		    clkid, TIMER_ABSTIME, retval);
   1785 		SDT_PROBE1(futex, func, wait, exit, error);
   1786 		break;
   1787 
   1788 	case FUTEX_WAKE:
   1789 		SDT_PROBE4(futex, func, wake, entry,
   1790 		    uaddr, val, FUTEX_BITSET_MATCH_ANY, op & ~FUTEX_CMD_MASK);
   1791 		error = futex_func_wake(shared, uaddr, val,
   1792 		    FUTEX_BITSET_MATCH_ANY, retval);
   1793 		SDT_PROBE2(futex, func, wake, exit, error, *retval);
   1794 		break;
   1795 
   1796 	case FUTEX_WAKE_BITSET:
   1797 		SDT_PROBE4(futex, func, wake, entry,
   1798 		    uaddr, val, val3, op & ~FUTEX_CMD_MASK);
   1799 		error = futex_func_wake(shared, uaddr, val, val3, retval);
   1800 		SDT_PROBE2(futex, func, wake, exit, error, *retval);
   1801 		break;
   1802 
   1803 	case FUTEX_REQUEUE:
   1804 		SDT_PROBE5(futex, func, requeue, entry,
   1805 		    uaddr, val, uaddr2, val2, op & ~FUTEX_CMD_MASK);
   1806 		error = futex_func_requeue(shared, op, uaddr, val, uaddr2,
   1807 		    val2, 0, retval);
   1808 		SDT_PROBE2(futex, func, requeue, exit, error, *retval);
   1809 		break;
   1810 
   1811 	case FUTEX_CMP_REQUEUE:
   1812 		SDT_PROBE6(futex, func, cmp_requeue, entry,
   1813 		    uaddr, val, uaddr2, val2, val3, op & ~FUTEX_CMD_MASK);
   1814 		error = futex_func_requeue(shared, op, uaddr, val, uaddr2,
   1815 		    val2, val3, retval);
   1816 		SDT_PROBE2(futex, func, cmp_requeue, exit, error, *retval);
   1817 		break;
   1818 
   1819 	case FUTEX_WAKE_OP:
   1820 		SDT_PROBE6(futex, func, wake_op, entry,
   1821 		    uaddr, val, uaddr2, val2, val3, op & ~FUTEX_CMD_MASK);
   1822 		error = futex_func_wake_op(shared, uaddr, val, uaddr2, val2,
   1823 		    val3, retval);
   1824 		SDT_PROBE2(futex, func, wake_op, exit, error, *retval);
   1825 		break;
   1826 
   1827 	case FUTEX_FD:
   1828 	default:
   1829 		error = ENOSYS;
   1830 		break;
   1831 	}
   1832 
   1833 	return error;
   1834 }
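
/*
 * Note the timeout conventions dispatched above: FUTEX_WAIT takes
 * a relative timeout, while FUTEX_WAIT_BITSET takes an absolute
 * one.  A sketch of a five-second absolute-deadline wait from
 * userland (illustrative only):
 *
 *	struct timespec deadline;
 *	clock_gettime(CLOCK_MONOTONIC, &deadline);
 *	deadline.tv_sec += 5;
 *	__futex(&word, FUTEX_WAIT_BITSET | FUTEX_PRIVATE_FLAG,
 *	    expected, &deadline, NULL, 0, FUTEX_BITSET_MATCH_ANY);
 */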
   1835 
   1836 /*
   1837  * sys___futex(l, uap, retval)
   1838  *
   1839  *	__futex(2) system call: generic futex operations.
   1840  */
   1841 int
   1842 sys___futex(struct lwp *l, const struct sys___futex_args *uap,
   1843     register_t *retval)
   1844 {
   1845 	/* {
   1846 		syscallarg(int *) uaddr;
   1847 		syscallarg(int) op;
   1848 		syscallarg(int) val;
   1849 		syscallarg(const struct timespec *) timeout;
   1850 		syscallarg(int *) uaddr2;
   1851 		syscallarg(int) val2;
   1852 		syscallarg(int) val3;
   1853 	} */
   1854 	struct timespec ts, *tsp;
   1855 	int error;
   1856 
   1857 	/*
   1858 	 * Copy in the timeout argument, if specified.
   1859 	 */
   1860 	if (SCARG(uap, timeout)) {
   1861 		error = copyin(SCARG(uap, timeout), &ts, sizeof(ts));
   1862 		if (error)
   1863 			return error;
   1864 		tsp = &ts;
   1865 	} else {
   1866 		tsp = NULL;
   1867 	}
   1868 
   1869 	return do_futex(SCARG(uap, uaddr), SCARG(uap, op), SCARG(uap, val),
   1870 	    tsp, SCARG(uap, uaddr2), SCARG(uap, val2), SCARG(uap, val3),
   1871 	    retval);
   1872 }
   1873 
   1874 /*
   1875  * sys___futex_set_robust_list(l, uap, retval)
   1876  *
   1877  *	__futex_set_robust_list(2) system call for robust futexes.
   1878  */
   1879 int
   1880 sys___futex_set_robust_list(struct lwp *l,
   1881     const struct sys___futex_set_robust_list_args *uap, register_t *retval)
   1882 {
   1883 	/* {
   1884 		syscallarg(void *) head;
   1885 		syscallarg(size_t) len;
   1886 	} */
   1887 	void *head = SCARG(uap, head);
   1888 
   1889 	if (SCARG(uap, len) != _FUTEX_ROBUST_HEAD_SIZE)
   1890 		return EINVAL;
   1891 	if ((uintptr_t)head % sizeof(u_long))
   1892 		return EINVAL;
   1893 
   1894 	l->l_robust_head = (uintptr_t)head;
   1895 
   1896 	return 0;
   1897 }
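
/*
 * A sketch of what a thread library might do with this.  The head
 * layout mirrors the list/offset/pending words consumed by
 * futex_release_all_lwp() below; the struct and field names are
 * illustrative, not definitions from this file:
 *
 *	struct robust_head {
 *		void *list;	// circular list of held locks;
 *				// points back at &head when empty
 *		long offset;	// offset from a list entry to its
 *				// futex word
 *		void *pending;	// entry being acquired or released
 *	} head = { (void *)&head, offsetof(struct mylock, word), NULL };
 *
 *	__futex_set_robust_list(&head, _FUTEX_ROBUST_HEAD_SIZE);
 */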
   1898 
   1899 /*
   1900  * sys___futex_get_robust_list(l, uap, retval)
   1901  *
   1902  *	__futex_get_robust_list(2) system call for robust futexes.
   1903  */
   1904 int
   1905 sys___futex_get_robust_list(struct lwp *l,
   1906     const struct sys___futex_get_robust_list_args *uap, register_t *retval)
   1907 {
   1908 	/* {
   1909 		syscallarg(lwpid_t) lwpid;
   1910 		syscallarg(void **) headp;
   1911 		syscallarg(size_t *) lenp;
   1912 	} */
   1913 	void *head;
   1914 	const size_t len = _FUTEX_ROBUST_HEAD_SIZE;
   1915 	int error;
   1916 
   1917 	error = futex_robust_head_lookup(l, SCARG(uap, lwpid), &head);
   1918 	if (error)
   1919 		return error;
   1920 
   1921 	/* Copy out the head pointer and the head structure length. */
   1922 	error = copyout(&head, SCARG(uap, headp), sizeof(head));
   1923 	if (__predict_true(error == 0)) {
   1924 		error = copyout(&len, SCARG(uap, lenp), sizeof(len));
   1925 	}
   1926 
   1927 	return error;
   1928 }
   1929 
   1930 /*
    1931  * release_futex(uptr, tid, is_pi, is_pending)
    1932  *
    1933  *	Try to release the robust futex at uptr in the current process
   1934  *	on lwp exit.  If anything goes wrong, silently fail.  It is the
   1935  *	userland program's obligation to arrange correct behaviour.
   1936  */
   1937 static void
   1938 release_futex(uintptr_t const uptr, lwpid_t const tid, bool const is_pi,
   1939     bool const is_pending)
   1940 {
   1941 	int *uaddr;
   1942 	struct futex *f;
   1943 	int oldval, newval, actual;
   1944 	int error;
   1945 	bool shared;
   1946 
   1947 	/* If it's misaligned, tough.  */
   1948 	if (__predict_false(uptr & 3))
   1949 		return;
   1950 	uaddr = (int *)uptr;
   1951 
   1952 	error = futex_load(uaddr, &oldval);
   1953 	if (__predict_false(error))
   1954 		return;
   1955 
   1956 	/*
   1957 	 * There are two race conditions we need to handle here:
   1958 	 *
   1959 	 * 1. User space cleared the futex word but died before
   1960 	 *    being able to issue the wakeup.  No wakeups will
   1961 	 *    ever be issued, oops!
   1962 	 *
   1963 	 * 2. Awakened waiter died before being able to acquire
   1964 	 *    the futex in user space.  Any other waiters are
   1965 	 *    now stuck, oops!
   1966 	 *
   1967 	 * In both of these cases, the futex word will be 0 (because
   1968 	 * it's updated before the wake is issued).  The best we can
   1969 	 * do is detect this situation if it's the pending futex and
   1970 	 * issue a wake without modifying the futex word.
   1971 	 *
   1972 	 * XXX eventual PI handling?
   1973 	 */
   1974 	if (__predict_false(is_pending && (oldval & ~FUTEX_WAITERS) == 0)) {
   1975 		register_t retval;
   1976 		error = futex_func_wake(/*shared*/true, uaddr, 1,
   1977 		    FUTEX_BITSET_MATCH_ANY, &retval);
   1978 		if (error != 0 || retval == 0)
   1979 			(void) futex_func_wake(/*shared*/false, uaddr, 1,
   1980 			    FUTEX_BITSET_MATCH_ANY, &retval);
   1981 		return;
   1982 	}
   1983 
   1984 	/* Optimistically test whether we need to do anything at all.  */
   1985 	if ((oldval & FUTEX_TID_MASK) != tid)
   1986 		return;
   1987 
   1988 	/*
   1989 	 * We need to handle the case where this thread owned the futex,
   1990 	 * but it was uncontended.  In this case, there won't be any
   1991 	 * kernel state to look up.  All we can do is mark the futex
   1992 	 * as a zombie to be mopped up the next time another thread
   1993 	 * attempts to acquire it.
   1994 	 *
    1995 	 * N.B. It's important to ensure that FUTEX_OWNER_DIED is set
    1996 	 * in this loop, even if waiters appear while we are doing
    1997 	 * so.  This is because FUTEX_WAITERS is set by user space
   1998 	 * before calling __futex() to wait, and the futex needs
   1999 	 * to be marked as a zombie when the new waiter gets into
   2000 	 * the kernel.
   2001 	 */
   2002 	if ((oldval & FUTEX_WAITERS) == 0) {
   2003 		do {
   2004 			error = futex_load(uaddr, &oldval);
   2005 			if (error)
   2006 				return;
   2007 			if ((oldval & FUTEX_TID_MASK) != tid)
   2008 				return;
   2009 			newval = oldval | FUTEX_OWNER_DIED;
   2010 			error = ucas_int(uaddr, oldval, newval, &actual);
   2011 			if (error)
   2012 				return;
   2013 		} while (actual != oldval);
   2014 
   2015 		/*
    2016 		 * If there is still no indication of waiters, then there is
   2017 		 * no more work for us to do.
   2018 		 */
   2019 		if ((oldval & FUTEX_WAITERS) == 0)
   2020 			return;
   2021 	}
   2022 
   2023 	/*
   2024 	 * Look for a futex.  Try shared first, then private.  If we
    2025 	 * can't find one, tough.
   2026 	 *
   2027 	 * Note: the assumption here is that anyone placing a futex on
   2028 	 * the robust list is adhering to the rules, regardless of the
   2029 	 * futex class.
   2030 	 */
   2031 	for (f = NULL, shared = true; f == NULL; shared = false) {
   2032 		error = futex_lookup(uaddr, shared, FUTEX_CLASS_ANY, &f);
   2033 		if (error)
   2034 			return;
   2035 		if (f == NULL && shared == false)
   2036 			return;
   2037 	}
   2038 
   2039 	/* Work under the futex op lock.  */
   2040 	futex_op_lock(f);
   2041 
   2042 	/*
   2043 	 * Fetch the word: if the tid doesn't match ours, skip;
   2044 	 * otherwise, set the owner-died bit, atomically.
   2045 	 */
   2046 	do {
   2047 		error = futex_load(uaddr, &oldval);
   2048 		if (error)
   2049 			goto out;
   2050 		if ((oldval & FUTEX_TID_MASK) != tid)
   2051 			goto out;
   2052 		newval = oldval | FUTEX_OWNER_DIED;
   2053 		error = ucas_int(uaddr, oldval, newval, &actual);
   2054 		if (error)
   2055 			goto out;
   2056 	} while (actual != oldval);
   2057 
   2058 	/*
   2059 	 * If there may be waiters, try to wake one.  If anything goes
   2060 	 * wrong, tough.
   2061 	 *
   2062 	 * XXX eventual PI handling?
   2063 	 */
   2064 	if (oldval & FUTEX_WAITERS) {
   2065 		(void)futex_wake(f, FUTEX_WRITERQ, 1,
   2066 				 NULL, FUTEX_WRITERQ, 0,
   2067 				 FUTEX_BITSET_MATCH_ANY);
   2068 	}
   2069 
   2070 	/* Unlock the queue and release the futex.  */
   2071 out:	futex_op_unlock(f);
   2072 	futex_rele(f);
   2073 }
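
/*
 * From userland's point of view, the effect is (a sketch; the lock
 * layout and recover_state() are hypothetical): the next thread to
 * attempt the lock observes FUTEX_OWNER_DIED and can recover:
 *
 *	unsigned old = lk->word;
 *	if (old & FUTEX_OWNER_DIED) {
 *		// Claim ownership, preserving the waiters bit, then
 *		// repair whatever state the dead owner left behind.
 *		if (atomic_cas_uint(&lk->word, old,
 *		    (old & FUTEX_WAITERS) | my_lwpid) == old)
 *			recover_state(lk);
 *	}
 */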
   2074 
   2075 /*
   2076  * futex_robust_head_lookup(l, lwpid)
   2077  *
   2078  *	Helper function to look up a robust head by LWP ID.
   2079  */
   2080 int
   2081 futex_robust_head_lookup(struct lwp *l, lwpid_t lwpid, void **headp)
   2082 {
   2083 	struct proc *p = l->l_proc;
   2084 
   2085 	/* Find the other lwp, if requested; otherwise use our robust head.  */
   2086 	if (lwpid) {
   2087 		mutex_enter(p->p_lock);
   2088 		l = lwp_find(p, lwpid);
   2089 		if (l == NULL) {
   2090 			mutex_exit(p->p_lock);
   2091 			return ESRCH;
   2092 		}
   2093 		*headp = (void *)l->l_robust_head;
   2094 		mutex_exit(p->p_lock);
   2095 	} else {
   2096 		*headp = (void *)l->l_robust_head;
   2097 	}
   2098 	return 0;
   2099 }
   2100 
   2101 /*
    2102  * futex_fetch_robust_head(uaddr, rhead)
   2103  *
   2104  *	Helper routine to fetch the futex robust list head that
   2105  *	handles 32-bit binaries running on 64-bit kernels.
   2106  */
   2107 static int
   2108 futex_fetch_robust_head(uintptr_t uaddr, u_long *rhead)
   2109 {
   2110 #ifdef _LP64
   2111 	if (curproc->p_flag & PK_32) {
   2112 		uint32_t rhead32[_FUTEX_ROBUST_HEAD_NWORDS];
   2113 		int error;
   2114 
   2115 		error = copyin((void *)uaddr, rhead32, sizeof(rhead32));
   2116 		if (__predict_true(error == 0)) {
   2117 			for (int i = 0; i < _FUTEX_ROBUST_HEAD_NWORDS; i++) {
   2118 				if (i == _FUTEX_ROBUST_HEAD_OFFSET) {
   2119 					/*
   2120 					 * Make sure the offset is sign-
   2121 					 * extended.
   2122 					 */
   2123 					rhead[i] = (int32_t)rhead32[i];
   2124 				} else {
   2125 					rhead[i] = rhead32[i];
   2126 				}
   2127 			}
   2128 		}
   2129 		return error;
   2130 	}
    2131 #endif /* _LP64 */
   2132 
   2133 	return copyin((void *)uaddr, rhead,
   2134 	    sizeof(*rhead) * _FUTEX_ROBUST_HEAD_NWORDS);
   2135 }
   2136 
   2137 /*
    2138  * futex_decode_robust_word(word, entry, is_pi)
   2139  *
   2140  *	Decode a robust futex list word into the entry and entry
   2141  *	properties.
   2142  */
   2143 static inline void
   2144 futex_decode_robust_word(uintptr_t const word, uintptr_t * const entry,
   2145     bool * const is_pi)
   2146 {
   2147 	*is_pi = (word & _FUTEX_ROBUST_ENTRY_PI) ? true : false;
   2148 	*entry = word & ~_FUTEX_ROBUST_ENTRY_PI;
   2149 }
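
/*
 * Example (illustrative, assuming _FUTEX_ROBUST_ENTRY_PI is the
 * low bit, which the word alignment of list entries leaves free):
 * a list word of 0x800004a1 decodes to *entry == 0x800004a0 with
 * *is_pi == true, i.e. the entry lives at 0x800004a0 and names a
 * PI futex.
 */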
   2150 
   2151 /*
    2152  * futex_fetch_robust_entry(uaddr, valp, is_pi)
   2153  *
   2154  *	Helper routine to fetch and decode a robust futex entry
   2155  *	that handles 32-bit binaries running on 64-bit kernels.
   2156  */
   2157 static int
   2158 futex_fetch_robust_entry(uintptr_t const uaddr, uintptr_t * const valp,
   2159     bool * const is_pi)
   2160 {
   2161 	uintptr_t val = 0;
   2162 	int error = 0;
   2163 
   2164 #ifdef _LP64
   2165 	if (curproc->p_flag & PK_32) {
   2166 		uint32_t val32;
   2167 
   2168 		error = ufetch_32((uint32_t *)uaddr, &val32);
   2169 		if (__predict_true(error == 0))
   2170 			val = val32;
   2171 	} else
   2172 #endif /* _LP64 */
   2173 		error = ufetch_long((u_long *)uaddr, (u_long *)&val);
   2174 	if (__predict_false(error))
   2175 		return error;
   2176 
   2177 	futex_decode_robust_word(val, valp, is_pi);
   2178 	return 0;
   2179 }
   2180 
   2181 /*
   2182  * futex_release_all_lwp(l, tid)
   2183  *
   2184  *	Release all l's robust futexes.  If anything looks funny in
   2185  *	the process, give up -- it's userland's responsibility to dot
   2186  *	the i's and cross the t's.
   2187  */
   2188 void
   2189 futex_release_all_lwp(struct lwp * const l, lwpid_t const tid)
   2190 {
   2191 	u_long rhead[_FUTEX_ROBUST_HEAD_NWORDS];
   2192 	int limit = 1000000;
   2193 	int error;
   2194 
   2195 	/* If there's no robust list there's nothing to do. */
   2196 	if (l->l_robust_head == 0)
   2197 		return;
   2198 
   2199 	/* Read the final snapshot of the robust list head. */
   2200 	error = futex_fetch_robust_head(l->l_robust_head, rhead);
   2201 	if (error) {
   2202 		printf("WARNING: pid %jd (%s) lwp %jd tid %jd:"
   2203 		    " unmapped robust futex list head\n",
   2204 		    (uintmax_t)l->l_proc->p_pid, l->l_proc->p_comm,
   2205 		    (uintmax_t)l->l_lid, (uintmax_t)tid);
   2206 		return;
   2207 	}
   2208 
   2209 	const long offset = (long)rhead[_FUTEX_ROBUST_HEAD_OFFSET];
   2210 
   2211 	uintptr_t next, pending;
   2212 	bool is_pi, pending_is_pi;
   2213 
   2214 	futex_decode_robust_word(rhead[_FUTEX_ROBUST_HEAD_LIST],
   2215 	    &next, &is_pi);
   2216 	futex_decode_robust_word(rhead[_FUTEX_ROBUST_HEAD_PENDING],
   2217 	    &pending, &pending_is_pi);
   2218 
   2219 	/*
   2220 	 * Walk down the list of locked futexes and release them, up
   2221 	 * to one million of them before we give up.
   2222 	 */
   2223 
   2224 	while (next != l->l_robust_head && limit-- > 0) {
   2225 		/* pending handled below. */
   2226 		if (next != pending)
   2227 			release_futex(next + offset, tid, is_pi, false);
   2228 		error = futex_fetch_robust_entry(next, &next, &is_pi);
   2229 		if (error)
   2230 			break;
   2231 		preempt_point();
   2232 	}
   2233 	if (limit <= 0) {
   2234 		printf("WARNING: pid %jd (%s) lwp %jd tid %jd:"
   2235 		    " exhausted robust futex limit\n",
   2236 		    (uintmax_t)l->l_proc->p_pid, l->l_proc->p_comm,
   2237 		    (uintmax_t)l->l_lid, (uintmax_t)tid);
   2238 	}
   2239 
   2240 	/* If there's a pending futex, it may need to be released too. */
   2241 	if (pending != 0) {
   2242 		release_futex(pending + offset, tid, pending_is_pi, true);
   2243 	}
   2244 }
   2245