      1 /*	$NetBSD: sys_futex.c,v 1.12.4.6 2021/11/01 08:40:16 chs Exp $	*/
      2 
      3 /*-
      4  * Copyright (c) 2018, 2019, 2020 The NetBSD Foundation, Inc.
      5  * All rights reserved.
      6  *
      7  * This code is derived from software contributed to The NetBSD Foundation
      8  * by Taylor R. Campbell and Jason R. Thorpe.
      9  *
     10  * Redistribution and use in source and binary forms, with or without
     11  * modification, are permitted provided that the following conditions
     12  * are met:
     13  * 1. Redistributions of source code must retain the above copyright
     14  *    notice, this list of conditions and the following disclaimer.
     15  * 2. Redistributions in binary form must reproduce the above copyright
     16  *    notice, this list of conditions and the following disclaimer in the
     17  *    documentation and/or other materials provided with the distribution.
     18  *
     19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
     20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
     21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
     23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
     24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
     25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
     26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
     27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
     28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
     29  * POSSIBILITY OF SUCH DAMAGE.
     30  */
     31 
     32 #include <sys/cdefs.h>
     33 __KERNEL_RCSID(0, "$NetBSD: sys_futex.c,v 1.12.4.6 2021/11/01 08:40:16 chs Exp $");
     34 
     35 /*
     36  * Futexes
     37  *
     38  *	The futex system call coordinates notifying threads waiting for
     39  *	changes on a 32-bit word of memory.  The word can be managed by
     40  *	CPU atomic operations in userland, without system calls, as long
     41  *	as there is no contention.
     42  *
     43  *	The simplest use case demonstrating the utility is:
     44  *
     45  *		// 32-bit word of memory shared among threads or
     46  *		// processes in userland.  lock & 1 means owned;
     47  *		// lock & 2 means there are waiters waiting.
     48  *		volatile int lock = 0;
     49  *
     50  *		int v;
     51  *
     52  *		// Acquire a lock.
     53  *		do {
     54  *			v = lock;
     55  *			if (v & 1) {
     56  *				// Lock is held.  Set a bit to say that
     57  *				// there are waiters, and wait for lock
     58  *				// to change to anything other than v;
     59  *				// then retry.
     60  *				if (atomic_cas_uint(&lock, v, v | 2) != v)
     61  *					continue;
     62  *				futex(FUTEX_WAIT, &lock, v | 2, NULL, NULL, 0);
     63  *				continue;
     64  *			}
      65  *		} while (atomic_cas_uint(&lock, v, v | 1) != v);
     66  *		membar_enter();
     67  *
     68  *		...
     69  *
     70  *		// Release the lock.  Optimistically assume there are
     71  *		// no waiters first until demonstrated otherwise.
     72  *		membar_exit();
     73  *		if (atomic_cas_uint(&lock, 1, 0) != 1) {
     74  *			// There may be waiters.
     75  *			v = atomic_swap_uint(&lock, 0);
     76  *			// If there are still waiters, wake one.
     77  *			if (v & 2)
     78  *				futex(FUTEX_WAKE, &lock, 1, NULL, NULL, 0);
     79  *		}
     80  *
     81  *	The goal is to avoid the futex system call unless there is
     82  *	contention; then if there is contention, to guarantee no missed
     83  *	wakeups.
     84  *
     85  *	For a simple implementation, futex(FUTEX_WAIT) could queue
     86  *	itself to be woken, double-check the lock word, and then sleep;
     87  *	spurious wakeups are generally a fact of life, so any
     88  *	FUTEX_WAKE could just wake every FUTEX_WAIT in the system.
     89  *
     90  *	If this were all there is to it, we could then increase
     91  *	parallelism by refining the approximation: partition the
     92  *	waiters into buckets by hashing the lock addresses to reduce
     93  *	the incidence of spurious wakeups.  But this is not all.
     94  *
     95  *	The futex(FUTEX_CMP_REQUEUE, &lock, n, &lock2, m, val)
     96  *	operation not only wakes n waiters on lock if lock == val, but
     97  *	also _transfers_ m additional waiters to lock2.  Unless wakeups
     98  *	on lock2 also trigger wakeups on lock, we cannot move waiters
     99  *	to lock2 if they merely share the same hash as waiters on lock.
    100  *	Thus, we can't approximately distribute waiters into queues by
    101  *	a hash function; we must distinguish futex queues exactly by
    102  *	lock address.
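          *
          *	As an illustration only (a sketch, not taken from any
          *	particular userland implementation), a condition variable
          *	broadcast might wake one waiter and transfer the rest
          *	directly onto the associated mutex word, instead of waking
          *	them all to contend for the mutex:
          *
          *		// cv and mtx are 32-bit futex words; cv_val is the
          *		// broadcaster's snapshot of cv.
          *		futex(FUTEX_CMP_REQUEUE, &cv, 1, &mtx, INT_MAX, cv_val);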
    103  *
    104  *	For now, we use a global red/black tree to index futexes.  This
    105  *	should be replaced by a lockless radix tree with a thread to
    106  *	free entries no longer in use once all lookups on all CPUs have
    107  *	completed.
    108  *
    109  *	Specifically, we maintain two maps:
    110  *
    111  *	futex_tab.va[vmspace, va] for private futexes
    112  *	futex_tab.oa[uvm_voaddr] for shared futexes
    113  *
    114  *	This implementation does not support priority inheritance.
    115  */
    116 
    117 #include <sys/param.h>
    118 #include <sys/types.h>
    119 #include <sys/atomic.h>
    120 #include <sys/condvar.h>
    121 #include <sys/futex.h>
    122 #include <sys/mutex.h>
    123 #include <sys/rbtree.h>
    124 #include <sys/sleepq.h>
    125 #include <sys/queue.h>
    126 #include <sys/sdt.h>
    127 
    128 #include <sys/syscall.h>
    129 #include <sys/syscallargs.h>
    130 #include <sys/syscallvar.h>
    131 
    132 #include <uvm/uvm_extern.h>
    133 
    134 /*
    135  * DTrace probes.
    136  */
    137 SDT_PROVIDER_DEFINE(futex);
    138 
    139 /* entry: uaddr, val, bitset, timeout, clkflags, fflags */
    140 /* exit: error */
    141 SDT_PROBE_DEFINE6(futex, func, wait, entry, "int *", "int", "int",
    142 		  "struct timespec *", "int", "int");
    143 SDT_PROBE_DEFINE1(futex, func, wait, exit, "int");
    144 
    145 /* entry: uaddr, nwake, bitset, fflags */
    146 /* exit: error, nwoken */
    147 SDT_PROBE_DEFINE4(futex, func, wake, entry, "int *", "int", "int", "int");
    148 SDT_PROBE_DEFINE2(futex, func, wake, exit, "int", "int");
    149 
    150 /* entry: uaddr, nwake, uaddr2, nrequeue, fflags */
    151 /* exit: error, nwoken */
    152 SDT_PROBE_DEFINE5(futex, func, requeue, entry, "int *", "int", "int *", "int",
    153 		  "int");
    154 SDT_PROBE_DEFINE2(futex, func, requeue, exit, "int", "int");
    155 
    156 /* entry: uaddr, nwake, uaddr2, nrequeue, val3, fflags */
    157 /* exit: error, nwoken */
    158 SDT_PROBE_DEFINE6(futex, func, cmp_requeue, entry, "int *", "int", "int *",
    159 		  "int", "int", "int");
    160 SDT_PROBE_DEFINE2(futex, func, cmp_requeue, exit, "int", "int");
    161 
    162 /* entry: uaddr, nwake, uaddr2, nwake2, wakeop, fflags */
    163 /* exit: error, nwoken */
    164 SDT_PROBE_DEFINE6(futex, func, wake_op, entry, "int *", "int", "int *", "int",
    165 		  "int", "int");
    166 SDT_PROBE_DEFINE2(futex, func, wake_op, exit, "int", "int");
    167 
    168 SDT_PROBE_DEFINE0(futex, wait, finish, normally);
    169 SDT_PROBE_DEFINE0(futex, wait, finish, wakerace);
    170 SDT_PROBE_DEFINE0(futex, wait, finish, aborted);
    171 
    172 /* entry: timo */
    173 /* exit: error */
    174 SDT_PROBE_DEFINE1(futex, wait, sleepq_block, entry, "int");
    175 SDT_PROBE_DEFINE1(futex, wait, sleepq_block, exit, "int");
    176 
    177 /*
    178  * Lock order:
    179  *
    180  *	futex_tab.lock
    181  *	futex::fx_op_lock		ordered by kva of struct futex
    182  *	 -> futex::fx_sq_lock		ordered by kva of sleepq lock
    183  *
    184  * N.B. multiple futexes can share a single sleepq lock.
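          *
          * For example, futex_func_requeue() looks up both futexes
          * (taking and dropping futex_tab.lock in futex_lookup_by_key()
          * or futex_insert()), then takes both op locks with
          * futex_op_lock2(), and only then takes the sleepq locks via
          * futex_sq_lock2() from within futex_wake().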
    185  */
    186 
    187 /*
    188  * union futex_key
    189  *
    190  *	A futex is addressed either by a vmspace+va (private) or by
    191  *	a uvm_voaddr (shared).
    192  */
    193 typedef union futex_key {
    194 	struct {
    195 		struct vmspace			*vmspace;
    196 		vaddr_t				va;
    197 	}			fk_private;
    198 	struct uvm_voaddr	fk_shared;
    199 } futex_key_t;
    200 
    201 CTASSERT(offsetof(futex_key_t, fk_private.va) ==
    202 	 offsetof(futex_key_t, fk_shared.offset));
    203 
    204 /*
    205  * struct futex
    206  *
    207  *	Kernel state for a futex located at a particular address in a
    208  *	particular virtual address space.
    209  *
    210  *	N.B. fx_refcnt is an unsigned long because we need to be able
    211  *	to operate on it atomically on all systems while at the same
    212  *	time rendering practically impossible the chance of it reaching
    213  *	its max value.  In practice, we're limited by the number of LWPs
    214  *	that can be present on the system at any given time, and the
    215  *	assumption is that limit will be good enough on a 32-bit platform.
    216  *	See futex_wake() for why overflow needs to be avoided.
    217  *
    218  *	XXX Since futex addresses must be 4-byte aligned, we could
    219  *	XXX squirrel away fx_shared and fx_on_tree bits in the "va"
    220  *	XXX field of the key.  Worth it?
    221  */
    222 struct futex {
    223 	futex_key_t		fx_key;
    224 	struct rb_node		fx_node;
    225 	unsigned long		fx_refcnt;
    226 	bool			fx_shared;
    227 	bool			fx_on_tree;
    228 	uint8_t			fx_class;
    229 
    230 	kmutex_t		fx_op_lock;	/* adaptive */
    231 	kmutex_t *		fx_sq_lock;	/* &sleepq_locks[...] */
    232 	sleepq_t		fx_sqs[2];	/* 0=reader, 1=writer */
    233 	unsigned int		fx_nwaiters[2];
    234 };
    235 
    236 /*
    237  * futex classes: Some futex operations can only be carried out on
    238  * futexes that are known to be abiding by a certain protocol.  These
    239  * classes are assigned to a futex when created due to a wait event,
    240  * and when subsequent wake or requeue operations are issued, the
    241  * class is checked at futex lookup time.  If the class does not match,
    242  * EINVAL is the result.
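          *
          * For example, a FUTEX_WAIT on a word creates its futex with
          * FUTEX_CLASS_NORMAL; a later operation that looks up the same
          * word expecting a different class (e.g. FUTEX_CLASS_RWLOCK)
          * fails the class check in futex_lookup_by_key() and returns
          * EINVAL.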
    243  */
    244 #define	FUTEX_CLASS_ANY		0		/* match any class in lookup */
    245 #define	FUTEX_CLASS_NORMAL	1		/* normal futex */
    246 #define	FUTEX_CLASS_RWLOCK	2		/* rwlock futex */
    247 #define	FUTEX_CLASS_PI		3		/* for FUTEX_*_PI ops */
    248 
    249 /* sleepq definitions */
    250 #define	FUTEX_READERQ		0
    251 #define	FUTEX_WRITERQ		1
    252 
    253 static const char futex_wmesg[] = "futex";
    254 
    255 static void	futex_unsleep(lwp_t *, bool);
    256 
    257 static syncobj_t futex_syncobj = {
    258 	.sobj_flag	= SOBJ_SLEEPQ_SORTED,
    259 	.sobj_unsleep	= futex_unsleep,
    260 	.sobj_changepri	= sleepq_changepri,
    261 	.sobj_lendpri	= sleepq_lendpri,
    262 	.sobj_owner	= syncobj_noowner,
    263 };
    264 
    265 /*
    266  * futex_tab
    267  *
    268  *	Global trees of futexes by vmspace/va and VM object address.
    269  *
    270  *	XXX This obviously doesn't scale in parallel.  We could use a
    271  *	pserialize-safe data structure, but there may be a high cost to
    272  *	frequent deletion since we don't cache futexes after we're done
    273  *	with them.  We could use hashed locks.  But for now, just make
    274  *	sure userland can't DoS the serial performance, by using a
    275  *	balanced binary tree for lookup.
    276  *
    277  *	XXX We could use a per-process tree for the table indexed by
    278  *	virtual address to reduce contention between processes.
    279  */
    280 static struct {
    281 	kmutex_t	lock;
    282 	struct rb_tree	va;
    283 	struct rb_tree	oa;
    284 } futex_tab __cacheline_aligned;
    285 
    286 static int
    287 compare_futex_key(void *cookie, const void *n, const void *k)
    288 {
    289 	const struct futex *fa = n;
    290 	const futex_key_t *fka = &fa->fx_key;
    291 	const futex_key_t *fkb = k;
    292 
    293 	if ((uintptr_t)fka->fk_private.vmspace <
    294 	    (uintptr_t)fkb->fk_private.vmspace)
    295 		return -1;
    296 	if ((uintptr_t)fka->fk_private.vmspace >
    297 	    (uintptr_t)fkb->fk_private.vmspace)
    298 		return +1;
    299 	if (fka->fk_private.va < fkb->fk_private.va)
    300 		return -1;
    301 	if (fka->fk_private.va > fkb->fk_private.va)
    302 		return +1;
    303 	return 0;
    304 }
    305 
    306 static int
    307 compare_futex(void *cookie, const void *na, const void *nb)
    308 {
    309 	const struct futex *fa = na;
    310 	const struct futex *fb = nb;
    311 
    312 	return compare_futex_key(cookie, fa, &fb->fx_key);
    313 }
    314 
    315 static const rb_tree_ops_t futex_rb_ops = {
    316 	.rbto_compare_nodes = compare_futex,
    317 	.rbto_compare_key = compare_futex_key,
    318 	.rbto_node_offset = offsetof(struct futex, fx_node),
    319 };
    320 
    321 static int
    322 compare_futex_shared_key(void *cookie, const void *n, const void *k)
    323 {
    324 	const struct futex *fa = n;
    325 	const futex_key_t *fka = &fa->fx_key;
    326 	const futex_key_t *fkb = k;
    327 
    328 	return uvm_voaddr_compare(&fka->fk_shared, &fkb->fk_shared);
    329 }
    330 
    331 static int
    332 compare_futex_shared(void *cookie, const void *na, const void *nb)
    333 {
    334 	const struct futex *fa = na;
    335 	const struct futex *fb = nb;
    336 
    337 	return compare_futex_shared_key(cookie, fa, &fb->fx_key);
    338 }
    339 
    340 static const rb_tree_ops_t futex_shared_rb_ops = {
    341 	.rbto_compare_nodes = compare_futex_shared,
    342 	.rbto_compare_key = compare_futex_shared_key,
    343 	.rbto_node_offset = offsetof(struct futex, fx_node),
    344 };
    345 
    346 /*
    347  * futex_sq_lock2(f, f2)
    348  *
    349  *	Acquire the sleepq locks for f and f2, which may be null, or
    350  *	which may be the same.  If they are distinct, an arbitrary total
    351  *	order is chosen on the locks.
    352  *
    353  *	Callers should only ever acquire multiple sleepq locks
    354  *	simultaneously using futex_sq_lock2.
    355  */
    356 static void
    357 futex_sq_lock2(struct futex * const f, struct futex * const f2)
    358 {
    359 
    360 	/*
    361 	 * If both are null, do nothing; if one is null and the other
    362 	 * is not, lock the other and be done with it.
    363 	 */
    364 	if (f == NULL && f2 == NULL) {
    365 		return;
    366 	} else if (f == NULL) {
    367 		mutex_spin_enter(f2->fx_sq_lock);
    368 		return;
    369 	} else if (f2 == NULL) {
    370 		mutex_spin_enter(f->fx_sq_lock);
    371 		return;
    372 	}
    373 
    374 	kmutex_t * const m = f->fx_sq_lock;
    375 	kmutex_t * const m2 = f2->fx_sq_lock;
    376 
    377 	/* If both locks are the same, acquire only one. */
    378 	if (m == m2) {
    379 		mutex_spin_enter(m);
    380 		return;
    381 	}
    382 
    383 	/* Otherwise, use the ordering on the kva of the lock pointer.  */
    384 	if ((uintptr_t)m < (uintptr_t)m2) {
    385 		mutex_spin_enter(m);
    386 		mutex_spin_enter(m2);
    387 	} else {
    388 		mutex_spin_enter(m2);
    389 		mutex_spin_enter(m);
    390 	}
    391 }
    392 
    393 /*
    394  * futex_sq_unlock2(f, f2)
    395  *
    396  *	Release the sleep queue locks for f and f2, which may be null, or
    397  *	which may have the same underlying lock.
    398  */
    399 static void
    400 futex_sq_unlock2(struct futex * const f, struct futex * const f2)
    401 {
    402 
    403 	/*
    404 	 * If both are null, do nothing; if one is null and the other
    405 	 * is not, unlock the other and be done with it.
    406 	 */
    407 	if (f == NULL && f2 == NULL) {
    408 		return;
    409 	} else if (f == NULL) {
    410 		mutex_spin_exit(f2->fx_sq_lock);
    411 		return;
    412 	} else if (f2 == NULL) {
    413 		mutex_spin_exit(f->fx_sq_lock);
    414 		return;
    415 	}
    416 
    417 	kmutex_t * const m = f->fx_sq_lock;
    418 	kmutex_t * const m2 = f2->fx_sq_lock;
    419 
    420 	/* If both locks are the same, release only one. */
    421 	if (m == m2) {
    422 		mutex_spin_exit(m);
    423 		return;
    424 	}
    425 
    426 	/* Otherwise, use the ordering on the kva of the lock pointer.  */
    427 	if ((uintptr_t)m < (uintptr_t)m2) {
    428 		mutex_spin_exit(m2);
    429 		mutex_spin_exit(m);
    430 	} else {
    431 		mutex_spin_exit(m);
    432 		mutex_spin_exit(m2);
    433 	}
    434 }
    435 
    436 /*
    437  * futex_load(uaddr, kaddr)
    438  *
    439  *	Perform a single atomic load to read *uaddr, and return the
    440  *	result in *kaddr.  Return 0 on success, EFAULT if uaddr is not
    441  *	mapped.
    442  */
    443 static inline int
    444 futex_load(int *uaddr, int *kaddr)
    445 {
    446 	return ufetch_int((u_int *)uaddr, (u_int *)kaddr);
    447 }
    448 
    449 /*
    450  * futex_test(uaddr, expected)
    451  *
    452  *	True if *uaddr == expected.  False if *uaddr != expected, or if
    453  *	uaddr is not mapped.
    454  */
    455 static bool
    456 futex_test(int *uaddr, int expected)
    457 {
    458 	int val;
    459 	int error;
    460 
    461 	error = futex_load(uaddr, &val);
    462 	if (error)
    463 		return false;
    464 	return val == expected;
    465 }
    466 
    467 static pool_cache_t	futex_cache		__read_mostly;
    468 
    469 static int	futex_ctor(void *, void *, int);
    470 static void	futex_dtor(void *, void *);
    471 
    472 /*
    473  * futex_sys_init()
    474  *
    475  *	Initialize the futex subsystem.
    476  */
    477 void
    478 futex_sys_init(void)
    479 {
    480 
    481 	mutex_init(&futex_tab.lock, MUTEX_DEFAULT, IPL_NONE);
    482 	rb_tree_init(&futex_tab.va, &futex_rb_ops);
    483 	rb_tree_init(&futex_tab.oa, &futex_shared_rb_ops);
    484 
    485 	futex_cache = pool_cache_init(sizeof(struct futex),
    486 	    coherency_unit, 0, 0, "futex", NULL, IPL_NONE, futex_ctor,
    487 	    futex_dtor, NULL);
    488 }
    489 
    490 /*
    491  * futex_sys_fini()
    492  *
    493  *	Finalize the futex subsystem.
    494  */
    495 void
    496 futex_sys_fini(void)
    497 {
    498 
    499 	KASSERT(RB_TREE_MIN(&futex_tab.oa) == NULL);
    500 	KASSERT(RB_TREE_MIN(&futex_tab.va) == NULL);
    501 	mutex_destroy(&futex_tab.lock);
    502 }
    503 
    504 /*
    505  * futex_ctor()
    506  *
    507  *	Futex object constructor.
    508  */
    509 static int
    510 futex_ctor(void *arg __unused, void *obj, int flags __unused)
    511 {
    512 	extern sleepqlock_t sleepq_locks[SLEEPTAB_HASH_SIZE];
    513 	struct futex * const f = obj;
    514 
    515 	mutex_init(&f->fx_op_lock, MUTEX_DEFAULT, IPL_NONE);
    516 	f->fx_sq_lock = &sleepq_locks[SLEEPTAB_HASH(f)].lock;
    517 
    518 	sleepq_init(&f->fx_sqs[FUTEX_READERQ]);
    519 	sleepq_init(&f->fx_sqs[FUTEX_WRITERQ]);
    520 	f->fx_nwaiters[FUTEX_READERQ] = f->fx_nwaiters[FUTEX_WRITERQ] = 0;
    521 
    522 	return 0;
    523 }
    524 
    525 /*
    526  * futex_dtor()
    527  *
    528  *	Futex object destructor.
    529  */
    530 static void
    531 futex_dtor(void *arg __unused, void *obj)
    532 {
    533 	struct futex * const f = obj;
    534 
    535 	mutex_destroy(&f->fx_op_lock);
    536 	f->fx_sq_lock = NULL;
    537 }
    538 
    539 /*
    540  * futex_key_init(key, vm, va, shared)
    541  *
    542  *	Initialize a futex key for lookup, etc.
    543  */
    544 static int
    545 futex_key_init(futex_key_t *fk, struct vmspace *vm, vaddr_t va, bool shared)
    546 {
    547 	int error = 0;
    548 
    549 	if (__predict_false(shared)) {
    550 		if (!uvm_voaddr_acquire(&vm->vm_map, va, &fk->fk_shared))
    551 			error = EFAULT;
    552 	} else {
    553 		fk->fk_private.vmspace = vm;
    554 		fk->fk_private.va = va;
    555 	}
    556 
    557 	return error;
    558 }
    559 
    560 /*
    561  * futex_key_fini(key, shared)
    562  *
    563  *	Release a futex key.
    564  */
    565 static void
    566 futex_key_fini(futex_key_t *fk, bool shared)
    567 {
    568 	if (__predict_false(shared))
    569 		uvm_voaddr_release(&fk->fk_shared);
    570 	memset(fk, 0, sizeof(*fk));
    571 }
    572 
    573 /*
    574  * futex_create(fk, shared)
    575  *
    576  *	Create a futex.  Initial reference count is 1, representing the
    577  *	caller.  Returns NULL on failure.  Always takes ownership of the
    578  *	key, either transferring it to the newly-created futex, or releasing
    579  *	the key if creation fails.
    580  *
    581  *	Never sleeps for memory, but may sleep to acquire a lock.
    582  */
    583 static struct futex *
    584 futex_create(futex_key_t *fk, bool shared, uint8_t class)
    585 {
    586 	struct futex *f;
    587 
    588 	f = pool_cache_get(futex_cache, PR_NOWAIT);
    589 	if (f == NULL) {
    590 		futex_key_fini(fk, shared);
    591 		return NULL;
    592 	}
    593 	f->fx_key = *fk;
    594 	f->fx_refcnt = 1;
    595 	f->fx_shared = shared;
    596 	f->fx_on_tree = false;
    597 	f->fx_class = class;
    598 
    599 	return f;
    600 }
    601 
    602 /*
    603  * futex_destroy(f)
    604  *
    605  *	Destroy a futex created with futex_create.  Reference count
    606  *	must be zero.
    607  *
    608  *	May sleep.
    609  */
    610 static void
    611 futex_destroy(struct futex *f)
    612 {
    613 
    614 	ASSERT_SLEEPABLE();
    615 
    616 	KASSERT(atomic_load_relaxed(&f->fx_refcnt) == 0);
    617 	KASSERT(!f->fx_on_tree);
    618 	KASSERT(LIST_EMPTY(&f->fx_sqs[FUTEX_READERQ]));
    619 	KASSERT(LIST_EMPTY(&f->fx_sqs[FUTEX_WRITERQ]));
    620 	KASSERT(f->fx_nwaiters[FUTEX_READERQ] == 0);
    621 	KASSERT(f->fx_nwaiters[FUTEX_WRITERQ] == 0);
    622 
    623 	futex_key_fini(&f->fx_key, f->fx_shared);
    624 
    625 	pool_cache_put(futex_cache, f);
    626 }
    627 
    628 /*
    629  * futex_hold_count(f, n)
    630  *
    631  *	Attempt to acquire a reference to f.  Return 0 on success,
    632  *	ENFILE on too many references.
    633  *
    634  *	Never sleeps.
    635  */
    636 static int
    637 futex_hold_count(struct futex *f, unsigned long const count)
    638 {
    639 	unsigned long refcnt;
    640 
    641 	do {
    642 		refcnt = atomic_load_relaxed(&f->fx_refcnt);
    643 		if (ULONG_MAX - refcnt < count)
    644 			return ENFILE;
    645 	} while (atomic_cas_ulong(&f->fx_refcnt, refcnt,
    646 				  refcnt + count) != refcnt);
    647 
    648 	return 0;
    649 }
    650 #define	futex_hold(f)	futex_hold_count(f, 1)
    651 
    652 /*
    653  * futex_rele_count(f, n)
    654  *
    655  *	Release a reference to f acquired with futex_create or
    656  *	futex_hold.
    657  *
    658  *	May sleep to free f.
    659  */
    660 static void
    661 futex_rele_count(struct futex *f, unsigned long const count)
    662 {
    663 	unsigned long refcnt;
    664 
    665 	ASSERT_SLEEPABLE();
    666 
    667 	do {
    668 		refcnt = atomic_load_relaxed(&f->fx_refcnt);
    669 		KASSERT(refcnt >= count);
    670 		if (refcnt - count == 0)
    671 			goto trylast;
    672 	} while (atomic_cas_ulong(&f->fx_refcnt, refcnt,
    673 				  refcnt - count) != refcnt);
    674 	return;
    675 
    676 trylast:
    677 	KASSERT(count <= LONG_MAX);
    678 	mutex_enter(&futex_tab.lock);
    679 	if (atomic_add_long_nv(&f->fx_refcnt, -(long)count) == 0) {
    680 		if (f->fx_on_tree) {
    681 			if (__predict_false(f->fx_shared))
    682 				rb_tree_remove_node(&futex_tab.oa, f);
    683 			else
    684 				rb_tree_remove_node(&futex_tab.va, f);
    685 			f->fx_on_tree = false;
    686 		}
    687 	} else {
    688 		/* References remain -- don't destroy it.  */
    689 		f = NULL;
    690 	}
    691 	mutex_exit(&futex_tab.lock);
    692 	if (f != NULL)
    693 		futex_destroy(f);
    694 }
    695 #define	futex_rele(f)	futex_rele_count(f, 1)
    696 
    697 /*
    698  * futex_rele_count_not_last(f, n)
    699  *
    700  *	Release a reference to f acquired with futex_create or
    701  *	futex_hold.
    702  *
    703  *	This version asserts that we are not dropping the last
    704  *	reference to f.
    705  */
    706 static void
    707 futex_rele_count_not_last(struct futex *f, unsigned long const count)
    708 {
    709 	unsigned long refcnt;
    710 
    711 	do {
    712 		refcnt = atomic_load_relaxed(&f->fx_refcnt);
    713 		KASSERT(refcnt >= count);
    714 	} while (atomic_cas_ulong(&f->fx_refcnt, refcnt,
    715 				  refcnt - count) != refcnt);
    716 }
    717 
    718 /*
    719  * futex_lookup_by_key(key, shared, class, &f)
    720  *
     721  *	Try to find an existing futex matching the specified key.  On
     722  *	success, return 0, set f to the found futex or to NULL if not
     723  *	found, and increment f's reference count if found.
    724  *
    725  *	Return ENFILE if reference count too high.
    726  *
    727  *	Internal lookup routine shared by futex_lookup() and
    728  *	futex_lookup_create().
    729  */
    730 static int
    731 futex_lookup_by_key(futex_key_t *fk, bool shared, uint8_t class,
    732     struct futex **fp)
    733 {
    734 	struct futex *f;
    735 	int error = 0;
    736 
    737 	mutex_enter(&futex_tab.lock);
    738 	if (__predict_false(shared)) {
    739 		f = rb_tree_find_node(&futex_tab.oa, fk);
    740 	} else {
    741 		f = rb_tree_find_node(&futex_tab.va, fk);
    742 	}
    743 	if (f) {
    744 		if (__predict_true(f->fx_class == class ||
    745 				   class == FUTEX_CLASS_ANY))
    746 			error = futex_hold(f);
    747 		else
    748 			error = EINVAL;
    749 		if (error)
    750 			f = NULL;
    751 	}
    752  	*fp = f;
    753 	mutex_exit(&futex_tab.lock);
    754 
    755 	return error;
    756 }
    757 
    758 /*
    759  * futex_insert(f, fp)
    760  *
    761  *	Try to insert the futex f into the tree by va.  If there
    762  *	already is a futex for its va, acquire a reference to it, and
    763  *	store it in *fp; otherwise store f in *fp.
    764  *
    765  *	Return 0 on success, ENFILE if there already is a futex but its
    766  *	reference count is too high.
    767  */
    768 static int
    769 futex_insert(struct futex *f, struct futex **fp)
    770 {
    771 	struct futex *f0;
    772 	int error;
    773 
    774 	KASSERT(atomic_load_relaxed(&f->fx_refcnt) != 0);
    775 	KASSERT(!f->fx_on_tree);
    776 
    777 	mutex_enter(&futex_tab.lock);
    778 	if (__predict_false(f->fx_shared))
    779 		f0 = rb_tree_insert_node(&futex_tab.oa, f);
    780 	else
    781 		f0 = rb_tree_insert_node(&futex_tab.va, f);
    782 	if (f0 == f) {
    783 		f->fx_on_tree = true;
    784 		error = 0;
    785 	} else {
    786 		KASSERT(atomic_load_relaxed(&f0->fx_refcnt) != 0);
    787 		KASSERT(f0->fx_on_tree);
    788 		error = futex_hold(f0);
    789 		if (error)
    790 			goto out;
    791 	}
    792 	*fp = f0;
    793 out:	mutex_exit(&futex_tab.lock);
    794 
    795 	return error;
    796 }
    797 
    798 /*
    799  * futex_lookup(uaddr, shared, class, &f)
    800  *
    801  *	Find a futex at the userland pointer uaddr in the current
    802  *	process's VM space.  On success, return the futex in f and
    803  *	increment its reference count.
    804  *
    805  *	Caller must call futex_rele when done.
    806  */
    807 static int
    808 futex_lookup(int *uaddr, bool shared, uint8_t class, struct futex **fp)
    809 {
    810 	futex_key_t fk;
    811 	struct vmspace *vm = curproc->p_vmspace;
    812 	vaddr_t va = (vaddr_t)uaddr;
    813 	int error;
    814 
    815 	/*
    816 	 * Reject unaligned user pointers so we don't cross page
    817 	 * boundaries and so atomics will work.
    818 	 */
    819 	if (__predict_false((va & 3) != 0))
    820 		return EINVAL;
    821 
    822 	/* Look it up. */
    823 	error = futex_key_init(&fk, vm, va, shared);
    824 	if (error)
    825 		return error;
    826 
    827 	error = futex_lookup_by_key(&fk, shared, class, fp);
    828 	futex_key_fini(&fk, shared);
    829 	if (error)
    830 		return error;
    831 
    832 	KASSERT(*fp == NULL || (*fp)->fx_shared == shared);
    833 	KASSERT(*fp == NULL || (*fp)->fx_class == class);
    834 	KASSERT(*fp == NULL || atomic_load_relaxed(&(*fp)->fx_refcnt) != 0);
    835 
    836 	/*
    837 	 * Success!  (Caller must still check whether we found
    838 	 * anything, but nothing went _wrong_ like trying to use
    839 	 * unmapped memory.)
    840 	 */
    841 	KASSERT(error == 0);
    842 
    843 	return error;
    844 }
    845 
    846 /*
    847  * futex_lookup_create(uaddr, shared, class, &f)
    848  *
    849  *	Find or create a futex at the userland pointer uaddr in the
    850  *	current process's VM space.  On success, return the futex in f
    851  *	and increment its reference count.
    852  *
    853  *	Caller must call futex_rele when done.
    854  */
    855 static int
    856 futex_lookup_create(int *uaddr, bool shared, uint8_t class, struct futex **fp)
    857 {
    858 	futex_key_t fk;
    859 	struct vmspace *vm = curproc->p_vmspace;
    860 	struct futex *f = NULL;
    861 	vaddr_t va = (vaddr_t)uaddr;
    862 	int error;
    863 
    864 	/*
    865 	 * Reject unaligned user pointers so we don't cross page
    866 	 * boundaries and so atomics will work.
    867 	 */
    868 	if (__predict_false((va & 3) != 0))
    869 		return EINVAL;
    870 
    871 	if (__predict_false(class == FUTEX_CLASS_ANY))
    872 		return EINVAL;
    873 
    874 	error = futex_key_init(&fk, vm, va, shared);
    875 	if (error)
    876 		return error;
    877 
    878 	/*
    879 	 * Optimistically assume there already is one, and try to find
    880 	 * it.
    881 	 */
    882 	error = futex_lookup_by_key(&fk, shared, class, fp);
    883 	if (error || *fp != NULL) {
    884 		/*
    885 		 * We either found one, or there was an error.
    886 		 * In either case, we are done with the key.
    887 		 */
    888 		futex_key_fini(&fk, shared);
    889 		goto out;
    890 	}
    891 
    892 	/*
     893 	 * Create a futex record.  This transfers ownership of the key
    894 	 * in all cases.
    895 	 */
    896 	f = futex_create(&fk, shared, class);
    897 	if (f == NULL) {
    898 		error = ENOMEM;
    899 		goto out;
    900 	}
    901 
    902 	/*
    903 	 * Insert our new futex, or use existing if someone else beat
    904 	 * us to it.
    905 	 */
    906 	error = futex_insert(f, fp);
    907 	if (error)
    908 		goto out;
    909 	if (*fp == f)
    910 		f = NULL;	/* don't release on exit */
    911 
    912 	/* Success!  */
    913 	KASSERT(error == 0);
    914 
    915 out:	if (f != NULL)
    916 		futex_rele(f);
    917 	KASSERT(error || *fp != NULL);
    918 	KASSERT(error || atomic_load_relaxed(&(*fp)->fx_refcnt) != 0);
    919 	return error;
    920 }
    921 
    922 /*
    923  * futex_unsleep:
    924  *
    925  *	Remove an LWP from the futex and sleep queue.  This is called when
    926  *	the LWP has not been awoken normally but instead interrupted: for
    927  *	example, when a signal is received.  Must be called with the LWP
    928  *	locked.  Will unlock if "unlock" is true.
    929  */
    930 static void
    931 futex_unsleep(lwp_t *l, bool unlock)
    932 {
    933 	struct futex *f __diagused;
    934 
    935 	f = (struct futex *)(uintptr_t)l->l_wchan;
    936 
    937 	KASSERT(mutex_owned(f->fx_sq_lock));
    938 
    939 	/* WRITERQ is more likely */
    940 	if (__predict_true(l->l_sleepq == &f->fx_sqs[FUTEX_WRITERQ])) {
    941 		KASSERT(f->fx_nwaiters[FUTEX_WRITERQ] > 0);
    942 		f->fx_nwaiters[FUTEX_WRITERQ]--;
    943 	} else {
    944 		KASSERT(l->l_sleepq == &f->fx_sqs[FUTEX_READERQ]);
    945 		KASSERT(f->fx_nwaiters[FUTEX_READERQ] > 0);
    946 		f->fx_nwaiters[FUTEX_READERQ]--;
    947 	}
    948 
    949 	sleepq_unsleep(l, unlock);
    950 }
    951 
    952 /*
    953  * futex_wait(f, timeout, clkid, bitset)
    954  *
    955  *	Wait until deadline on the clock clkid, or forever if deadline
    956  *	is NULL, for a futex wakeup.  Return 0 on explicit wakeup,
    957  *	ETIMEDOUT on timeout, EINTR on signal.
    958  */
    959 static int
    960 futex_wait(struct futex *f, int q, const struct timespec *deadline,
    961     clockid_t clkid, int bitset)
    962 {
    963 
    964 	/*
    965 	 * Some things to note about how this function works:
    966 	 *
    967 	 * ==> When we enter futex_wait(), the futex's op lock is
    968 	 * held, preventing us from missing wakes.  Once we are on
    969 	 * the futex's sleepq, it is safe to release the op lock.
    970 	 *
    971 	 * ==> We have a single early escape to avoid calling
    972 	 * sleepq_block(): our deadline already having passed.
    973 	 * If this is a no-timeout wait, or if the deadline has
    974 	 * not already passed, we are guaranteed to call sleepq_block().
    975 	 *
    976 	 * ==> sleepq_block() contains all of the necessary logic
    977 	 * for handling signals; we do not need to check for them
    978 	 * ourselves.
    979 	 *
    980 	 * ==> Once we have blocked, other futex operations will
    981 	 * be able to wake us or requeue us.  Until we have blocked,
    982 	 * those other futex operations will not be able to acquire
    983 	 * the sleepq locks in order to perform those operations,
    984 	 * even if we have dropped the op lock.  Thus, we will not
    985 	 * miss wakes.  This fundamental invariant is relied upon by
    986 	 * every other synchronization object in the kernel.
    987 	 *
    988 	 * ==> sleepq_block() returns for one of three reasons:
    989 	 *
    990 	 *	-- We were awakened.
    991 	 *	-- We were signaled.
    992 	 *	-- We timed out.
    993 	 *
    994 	 * ==> Once sleepq_block() returns, we will not call
    995 	 * sleepq_block() again.
    996 	 *
    997 	 * ==> If sleepq_block() returns due to being signaled,
    998 	 * we must check to see if we were also awakened.  This
    999 	 * is to handle the following sequence:
   1000 	 *
   1001 	 *	-- Futex wake takes us off sleepq, places us on runq.
   1002 	 *	-- We are signaled while sitting on the runq.
   1003 	 *	-- We run, and sleepq_block() notices the signal and
   1004 	 *	   returns  EINTR/ERESTART.
   1005 	 *
   1006 	 * In this situation, we want to indicate a successful wake
   1007 	 * to the caller, because that wake has been reported to the
   1008 	 * thread that issued it.
   1009 	 *
   1010 	 * Note that it is NOT possible for a wake to be issued after
   1011 	 * a signal; the LWP will already have been taken off the sleepq
   1012 	 * by the signal, so the wake will never happen.  Note that for
   1013 	 * this reason, we must *never* return ERESTART, because it could
   1014 	 * result in us waiting again with no one to awaken us.
   1015 	 */
   1016 
   1017 	struct lwp * const l = curlwp;
   1018 	int timo;
   1019 	int error;
   1020 
   1021 	/*
   1022 	 * Compute our timeout before taking any locks.
   1023 	 */
   1024 	if (deadline) {
   1025 		struct timespec ts;
   1026 
   1027 		/* Check our watch.  */
   1028 		error = clock_gettime1(clkid, &ts);
   1029 		if (error) {
   1030 			mutex_exit(&f->fx_op_lock);
   1031 			return error;
   1032 		}
   1033 
   1034 		/*
   1035 		 * If we're past the deadline, ETIMEDOUT.
   1036 		 */
   1037 		if (timespeccmp(deadline, &ts, <=)) {
   1038 			mutex_exit(&f->fx_op_lock);
   1039 			return ETIMEDOUT;
   1040 		} else {
   1041 			/* Count how much time is left.  */
   1042 			timespecsub(deadline, &ts, &ts);
   1043 			timo = tstohz(&ts);
   1044 			if (timo == 0) {
   1045 				/*
   1046 				 * tstohz() already rounds up, and
   1047 				 * a return value of 0 ticks means
   1048 				 * "expired now or in the past".
   1049 				 */
   1050 				mutex_exit(&f->fx_op_lock);
   1051 				return ETIMEDOUT;
   1052 			}
   1053 		}
   1054 	} else {
   1055 		timo = 0;
   1056 	}
   1057 
   1058 	/*
   1059 	 * Acquire in sleepq -> lwp order.  While we're at it, ensure
   1060 	 * that there's room for us to block.
   1061 	 */
   1062 	mutex_spin_enter(f->fx_sq_lock);
   1063 	if (__predict_false(q == FUTEX_READERQ &&
   1064 			    f->fx_nwaiters[q] == FUTEX_TID_MASK)) {
   1065 		mutex_spin_exit(f->fx_sq_lock);
   1066 		mutex_exit(&f->fx_op_lock);
   1067 		return ENFILE;
   1068 	}
   1069 	lwp_lock(l);
   1070 
   1071 	/*
   1072 	 * No need for locks here; until we're on a futex's sleepq, these
   1073 	 * values are private to the LWP, and the locks needed to get onto
   1074 	 * the sleepq provide the memory barriers needed to ensure visibility.
   1075 	 */
   1076 	l->l_futex = f;
   1077 	l->l_futex_wakesel = bitset;
   1078 
   1079 	/*
   1080 	 * This is an inlined bit of sleepq_enter();
   1081 	 * we already hold the lwp lock.
   1082 	 */
   1083 	lwp_unlock_to(l, f->fx_sq_lock);
   1084 	KERNEL_UNLOCK_ALL(NULL, &l->l_biglocks);
   1085 	KASSERT(lwp_locked(l, f->fx_sq_lock));
   1086 
   1087 	sleepq_enqueue(&f->fx_sqs[q], f, futex_wmesg, &futex_syncobj, true);
   1088 	f->fx_nwaiters[q]++;
   1089 
   1090 	/* We can now safely release the op lock. */
   1091 	mutex_exit(&f->fx_op_lock);
   1092 
   1093 	SDT_PROBE1(futex, wait, sleepq_block, entry, timo);
   1094 	error = sleepq_block(timo, true);
   1095 	SDT_PROBE1(futex, wait, sleepq_block, exit, error);
   1096 
   1097 	/*
   1098 	 * f is no longer valid; we may have been moved to another
   1099 	 * futex sleepq while we slept.
   1100 	 */
   1101 
   1102 	/*
   1103 	 * If something went wrong, we may still have a futex reference.
   1104 	 * Check for that here and drop it if so.  We can do this without
   1105 	 * taking any locks because l->l_futex is private to the LWP when
   1106 	 * not on any futex's sleepq, and we are not on any sleepq because
   1107 	 * we are running.
   1108 	 *
   1109 	 * Convert EWOULDBLOCK to ETIMEDOUT in case sleepq_block() returned
   1110 	 * EWOULDBLOCK, and ensure the only other error we return is EINTR.
   1111 	 */
   1112 	if (error) {
   1113 		f = l->l_futex;
   1114 		if (f != NULL) {
   1115 			SDT_PROBE0(futex, wait, finish, aborted);
   1116 			l->l_futex = NULL;
   1117 			futex_rele(f);
   1118 		} else {
   1119 			/* Raced with wakeup; report success. */
   1120 			SDT_PROBE0(futex, wait, finish, wakerace);
   1121 			error = 0;
   1122 		}
   1123 		if (error == EWOULDBLOCK)
   1124 			error = ETIMEDOUT;
   1125 		else if (error != ETIMEDOUT)
   1126 			error = EINTR;
   1127 	} else {
   1128 		f = l->l_futex;
   1129 		if (__predict_false(f != NULL)) {
   1130 			/*
   1131 			 * There are certain situations that can cause
   1132 			 * sleepq_block() to return 0 even if we were
   1133 			 * signalled (by e.g. SIGKILL).  In this case,
   1134 			 * we will need to release our futex hold and
   1135 			 * return EINTR (we're probably about to die
   1136 			 * anyway).
   1137 			 */
   1138 			SDT_PROBE0(futex, wait, finish, aborted);
   1139 			l->l_futex = NULL;
   1140 			futex_rele(f);
   1141 			error = EINTR;
   1142 		} else {
   1143 			SDT_PROBE0(futex, wait, finish, normally);
   1144 		}
   1145 	}
   1146 
   1147 	return error;
   1148 }
   1149 
   1150 /*
   1151  * futex_wake(f, q, nwake, f2, q2, nrequeue, bitset)
   1152  *
   1153  *	Wake up to nwake waiters on f matching bitset; then, if f2 is
   1154  *	provided, move up to nrequeue remaining waiters on f matching
   1155  *	bitset to f2.  Return the number of waiters actually woken.
   1156  *	Caller must hold the locks of f and f2, if provided.
   1157  */
   1158 static unsigned
   1159 futex_wake(struct futex *f, int q, unsigned int const nwake,
   1160     struct futex *f2, int q2, unsigned int const nrequeue,
   1161     int bitset, unsigned int *nrequeuedp)
   1162 {
   1163 	struct lwp *l, *l_next;
   1164 	unsigned nwoken = 0;
   1165 	unsigned nrequeued = 0;
   1166 
   1167 	KASSERT(mutex_owned(&f->fx_op_lock));
   1168 	KASSERT(f2 == NULL || mutex_owned(&f2->fx_op_lock));
   1169 
   1170 	futex_sq_lock2(f, f2);
   1171 
   1172 	/* Wake up to nwake waiters, and count the number woken.  */
   1173 	LIST_FOREACH_SAFE(l, &f->fx_sqs[q], l_sleepchain, l_next) {
   1174 		if (nwoken == nwake)
   1175 			break;
   1176 
   1177 		KASSERT(l->l_syncobj == &futex_syncobj);
   1178 		KASSERT(l->l_wchan == (wchan_t)f);
   1179 		KASSERT(l->l_futex == f);
   1180 		KASSERT(l->l_sleepq == &f->fx_sqs[q]);
   1181 		KASSERT(l->l_mutex == f->fx_sq_lock);
   1182 
   1183 		if ((l->l_futex_wakesel & bitset) == 0)
   1184 			continue;
   1185 
   1186 		l->l_futex_wakesel = 0;
   1187 		l->l_futex = NULL;
   1188 		sleepq_remove(&f->fx_sqs[q], l);
   1189 		f->fx_nwaiters[q]--;
   1190 		nwoken++;
   1191 		/* hold counts adjusted in bulk below */
   1192 	}
   1193 
   1194 	if (nrequeue) {
   1195 		KASSERT(f2 != NULL);
   1196 		/* Move up to nrequeue waiters from f's queue to f2's queue. */
   1197 		LIST_FOREACH_SAFE(l, &f->fx_sqs[q], l_sleepchain, l_next) {
   1198 			if (nrequeued == nrequeue)
   1199 				break;
   1200 
   1201 			KASSERT(l->l_syncobj == &futex_syncobj);
   1202 			KASSERT(l->l_wchan == (wchan_t)f);
   1203 			KASSERT(l->l_futex == f);
   1204 			KASSERT(l->l_sleepq == &f->fx_sqs[q]);
   1205 			KASSERT(l->l_mutex == f->fx_sq_lock);
   1206 
   1207 			if ((l->l_futex_wakesel & bitset) == 0)
   1208 				continue;
   1209 
   1210 			l->l_futex = f2;
   1211 			sleepq_transfer(l, &f->fx_sqs[q],
   1212 			    &f2->fx_sqs[q2], f2, futex_wmesg,
   1213 			    &futex_syncobj, f2->fx_sq_lock, true);
   1214 			f->fx_nwaiters[q]--;
   1215 			f2->fx_nwaiters[q2]++;
   1216 			nrequeued++;
   1217 			/* hold counts adjusted in bulk below */
   1218 		}
   1219 		if (nrequeued) {
   1220 			/* XXX futex_hold() could theoretically fail here. */
   1221 			int hold_error __diagused =
   1222 			    futex_hold_count(f2, nrequeued);
   1223 			KASSERT(hold_error == 0);
   1224 		}
   1225 	}
   1226 
   1227 	/*
   1228 	 * Adjust the futex reference counts for the wakes and
   1229 	 * requeues.
   1230 	 */
   1231 	KASSERT(nwoken + nrequeued <= LONG_MAX);
   1232 	futex_rele_count_not_last(f, nwoken + nrequeued);
   1233 
   1234 	futex_sq_unlock2(f, f2);
   1235 
   1236 	/* Return the number of waiters woken and requeued.  */
   1237 	if (nrequeuedp != NULL) {
   1238 		*nrequeuedp = nrequeued;
   1239 	}
   1240 	return nwoken;
   1241 }
   1242 
   1243 /*
   1244  * futex_op_lock(f)
   1245  *
   1246  *	Acquire the op lock of f.  Pair with futex_op_unlock.  Do
   1247  *	not use if caller needs to acquire two locks; use
   1248  *	futex_op_lock2 instead.
   1249  */
   1250 static void
   1251 futex_op_lock(struct futex *f)
   1252 {
   1253 	mutex_enter(&f->fx_op_lock);
   1254 }
   1255 
   1256 /*
   1257  * futex_op_unlock(f)
   1258  *
   1259  *	Release the op lock of f.
   1260  */
   1261 static void
   1262 futex_op_unlock(struct futex *f)
   1263 {
   1264 	mutex_exit(&f->fx_op_lock);
   1265 }
   1266 
   1267 /*
   1268  * futex_op_lock2(f, f2)
   1269  *
   1270  *	Acquire the op locks of both f and f2, which may be null, or
   1271  *	which may be the same.  If they are distinct, an arbitrary total
   1272  *	order is chosen on the locks.
   1273  *
   1274  *	Callers should only ever acquire multiple op locks
   1275  *	simultaneously using futex_op_lock2.
   1276  */
   1277 static void
   1278 futex_op_lock2(struct futex *f, struct futex *f2)
   1279 {
   1280 
   1281 	/*
   1282 	 * If both are null, do nothing; if one is null and the other
   1283 	 * is not, lock the other and be done with it.
   1284 	 */
   1285 	if (f == NULL && f2 == NULL) {
   1286 		return;
   1287 	} else if (f == NULL) {
   1288 		mutex_enter(&f2->fx_op_lock);
   1289 		return;
   1290 	} else if (f2 == NULL) {
   1291 		mutex_enter(&f->fx_op_lock);
   1292 		return;
   1293 	}
   1294 
   1295 	/* If both futexes are the same, acquire only one. */
   1296 	if (f == f2) {
   1297 		mutex_enter(&f->fx_op_lock);
   1298 		return;
   1299 	}
   1300 
   1301 	/* Otherwise, use the ordering on the kva of the futex pointer.  */
   1302 	if ((uintptr_t)f < (uintptr_t)f2) {
   1303 		mutex_enter(&f->fx_op_lock);
   1304 		mutex_enter(&f2->fx_op_lock);
   1305 	} else {
   1306 		mutex_enter(&f2->fx_op_lock);
   1307 		mutex_enter(&f->fx_op_lock);
   1308 	}
   1309 }
   1310 
   1311 /*
   1312  * futex_op_unlock2(f, f2)
   1313  *
   1314  *	Release the queue locks of both f and f2, which may be null, or
   1315  *	which may have the same underlying queue.
   1316  */
   1317 static void
   1318 futex_op_unlock2(struct futex *f, struct futex *f2)
   1319 {
   1320 
   1321 	/*
   1322 	 * If both are null, do nothing; if one is null and the other
   1323 	 * is not, unlock the other and be done with it.
   1324 	 */
   1325 	if (f == NULL && f2 == NULL) {
   1326 		return;
   1327 	} else if (f == NULL) {
   1328 		mutex_exit(&f2->fx_op_lock);
   1329 		return;
   1330 	} else if (f2 == NULL) {
   1331 		mutex_exit(&f->fx_op_lock);
   1332 		return;
   1333 	}
   1334 
   1335 	/* If both futexes are the same, release only one. */
   1336 	if (f == f2) {
   1337 		mutex_exit(&f->fx_op_lock);
   1338 		return;
   1339 	}
   1340 
   1341 	/* Otherwise, use the ordering on the kva of the futex pointer.  */
   1342 	if ((uintptr_t)f < (uintptr_t)f2) {
   1343 		mutex_exit(&f2->fx_op_lock);
   1344 		mutex_exit(&f->fx_op_lock);
   1345 	} else {
   1346 		mutex_exit(&f->fx_op_lock);
   1347 		mutex_exit(&f2->fx_op_lock);
   1348 	}
   1349 }
   1350 
   1351 /*
   1352  * futex_func_wait(uaddr, val, val3, timeout, clkid, clkflags, retval)
   1353  *
   1354  *	Implement futex(FUTEX_WAIT) and futex(FUTEX_WAIT_BITSET).
   1355  */
   1356 static int
   1357 futex_func_wait(bool shared, int *uaddr, int val, int val3,
   1358     const struct timespec *timeout, clockid_t clkid, int clkflags,
   1359     register_t *retval)
   1360 {
   1361 	struct futex *f;
   1362 	struct timespec ts;
   1363 	const struct timespec *deadline;
   1364 	int error;
   1365 
   1366 	/*
   1367 	 * If there's nothing to wait for, and nobody will ever wake
   1368 	 * us, then don't set anything up to wait -- just stop here.
   1369 	 */
   1370 	if (val3 == 0)
   1371 		return EINVAL;
   1372 
   1373 	/* Optimistically test before anything else.  */
   1374 	if (!futex_test(uaddr, val))
   1375 		return EAGAIN;
   1376 
   1377 	/* Determine a deadline on the specified clock.  */
   1378 	if (timeout == NULL) {
   1379 		deadline = timeout;
   1380 	} else if ((clkflags & TIMER_ABSTIME) == TIMER_ABSTIME) {
   1381 		/* Sanity-check the deadline. */
   1382 		if (timeout->tv_sec < 0 ||
   1383 		    timeout->tv_nsec < 0 ||
   1384 		    timeout->tv_nsec >= 1000000000L) {
   1385 			return EINVAL;
   1386 		}
   1387 		deadline = timeout;
   1388 	} else {
   1389 		struct timespec interval = *timeout;
   1390 
   1391 		error = itimespecfix(&interval);
   1392 		if (error)
   1393 			return error;
   1394 		error = clock_gettime1(clkid, &ts);
   1395 		if (error)
   1396 			return error;
   1397 		timespecadd(&ts, &interval, &ts);
   1398 		deadline = &ts;
   1399 	}
   1400 
   1401 	/* Get the futex, creating it if necessary.  */
   1402 	error = futex_lookup_create(uaddr, shared, FUTEX_CLASS_NORMAL, &f);
   1403 	if (error)
   1404 		return error;
   1405 	KASSERT(f);
   1406 
   1407 	/*
   1408 	 * Under the op lock, check the value again: if it has
   1409 	 * already changed, EAGAIN; otherwise enqueue the waiter.
   1410 	 * Since FUTEX_WAKE will use the same lock and be done after
   1411 	 * modifying the value, the order in which we check and enqueue
   1412 	 * is immaterial.
   1413 	 */
   1414 	futex_op_lock(f);
   1415 	if (!futex_test(uaddr, val)) {
   1416 		futex_op_unlock(f);
   1417 		error = EAGAIN;
   1418 		goto out;
   1419 	}
   1420 
   1421 	/*
   1422 	 * Now wait.  futex_wait() will drop our op lock once we
   1423 	 * are entered into the sleep queue, thus ensuring atomicity
   1424 	 * of wakes with respect to waits.
   1425 	 */
   1426 	error = futex_wait(f, FUTEX_WRITERQ, deadline, clkid, val3);
   1427 
   1428 	/*
   1429 	 * We cannot drop our reference to the futex here, because
   1430 	 * we might be enqueued on a different one when we are awakened.
   1431 	 * The references will be managed on our behalf in the requeue,
   1432 	 * wake, and error cases.
   1433 	 */
   1434 	f = NULL;
   1435 
   1436 	if (__predict_true(error == 0)) {
   1437 		/* Return 0 on success, error on failure. */
   1438 		*retval = 0;
   1439 	}
   1440 
   1441 out:	if (f != NULL)
   1442 		futex_rele(f);
   1443 	return error;
   1444 }
   1445 
   1446 /*
   1447  * futex_func_wake(uaddr, val, val3, retval)
   1448  *
   1449  *	Implement futex(FUTEX_WAKE) and futex(FUTEX_WAKE_BITSET).
   1450  */
   1451 static int
   1452 futex_func_wake(bool shared, int *uaddr, int val, int val3, register_t *retval)
   1453 {
   1454 	struct futex *f;
   1455 	unsigned int nwoken = 0;
   1456 	int error = 0;
   1457 
   1458 	/* Reject negative number of wakeups.  */
   1459 	if (val < 0) {
   1460 		error = EINVAL;
   1461 		goto out;
   1462 	}
   1463 
   1464 	/* Look up the futex, if any.  */
   1465 	error = futex_lookup(uaddr, shared, FUTEX_CLASS_NORMAL, &f);
   1466 	if (error)
   1467 		goto out;
   1468 
   1469 	/* If there's no futex, there are no waiters to wake.  */
   1470 	if (f == NULL)
   1471 		goto out;
   1472 
   1473 	/*
   1474 	 * Under f's op lock, wake the waiters and remember the
   1475 	 * number woken.
   1476 	 */
   1477 	futex_op_lock(f);
   1478 	nwoken = futex_wake(f, FUTEX_WRITERQ, val,
   1479 			    NULL, FUTEX_WRITERQ, 0, val3, NULL);
   1480 	futex_op_unlock(f);
   1481 
   1482 	/* Release the futex.  */
   1483 	futex_rele(f);
   1484 
   1485 out:
   1486 	/* Return the number of waiters woken.  */
   1487 	*retval = nwoken;
   1488 
   1489 	/* Success!  */
   1490 	return error;
   1491 }
   1492 
   1493 /*
   1494  * futex_func_requeue(op, uaddr, val, uaddr2, val2, val3, retval)
   1495  *
   1496  *	Implement futex(FUTEX_REQUEUE) and futex(FUTEX_CMP_REQUEUE).
   1497  */
   1498 static int
   1499 futex_func_requeue(bool shared, int op, int *uaddr, int val, int *uaddr2,
   1500     int val2, int val3, register_t *retval)
   1501 {
   1502 	struct futex *f = NULL, *f2 = NULL;
   1503 	unsigned nwoken = 0;	/* default to zero woken on early return */
   1504 	unsigned nrequeued = 0;
   1505 	int error;
   1506 
   1507 	/* Reject negative number of wakeups or requeues. */
   1508 	if (val < 0 || val2 < 0) {
   1509 		error = EINVAL;
   1510 		goto out;
   1511 	}
   1512 
   1513 	/* Look up the source futex, if any. */
   1514 	error = futex_lookup(uaddr, shared, FUTEX_CLASS_NORMAL, &f);
   1515 	if (error)
   1516 		goto out;
   1517 
   1518 	/* If there is none, nothing to do. */
   1519 	if (f == NULL)
   1520 		goto out;
   1521 
   1522 	/*
   1523 	 * We may need to create the destination futex because it's
   1524 	 * entirely possible it does not currently have any waiters.
   1525 	 */
   1526 	error = futex_lookup_create(uaddr2, shared, FUTEX_CLASS_NORMAL, &f2);
   1527 	if (error)
   1528 		goto out;
   1529 
   1530 	/*
   1531 	 * Under the futexes' op locks, check the value; if
   1532 	 * unchanged from val3, wake the waiters.
   1533 	 */
   1534 	futex_op_lock2(f, f2);
   1535 	if (op == FUTEX_CMP_REQUEUE && !futex_test(uaddr, val3)) {
   1536 		error = EAGAIN;
   1537 	} else {
   1538 		error = 0;
   1539 		nwoken = futex_wake(f, FUTEX_WRITERQ, val,
   1540 				    f2, FUTEX_WRITERQ, val2,
   1541 				    FUTEX_BITSET_MATCH_ANY,
   1542 				    &nrequeued);
   1543 	}
   1544 	futex_op_unlock2(f, f2);
   1545 
   1546 out:
   1547 	/*
    1548 	 * For FUTEX_REQUEUE, return the number of waiters woken.
   1549 	 *
   1550 	 * For FUTEX_CMP_REQUEUE, return the number of waiters woken
   1551 	 * **and** requeued.
   1552 	 */
    1553 	*retval = nwoken + ((op == FUTEX_CMP_REQUEUE) ? nrequeued : 0);
   1554 
   1555 	/* Release the futexes if we got them.  */
   1556 	if (f2)
   1557 		futex_rele(f2);
   1558 	if (f)
   1559 		futex_rele(f);
   1560 	return error;
   1561 }
   1562 
   1563 /*
   1564  * futex_validate_op_cmp(val3)
   1565  *
   1566  *	Validate an op/cmp argument for FUTEX_WAKE_OP.
   1567  */
   1568 static int
   1569 futex_validate_op_cmp(int val3)
   1570 {
   1571 	int op = __SHIFTOUT(val3, FUTEX_OP_OP_MASK);
   1572 	int cmp = __SHIFTOUT(val3, FUTEX_OP_CMP_MASK);
   1573 
   1574 	if (op & FUTEX_OP_OPARG_SHIFT) {
   1575 		int oparg = __SHIFTOUT(val3, FUTEX_OP_OPARG_MASK);
   1576 		if (oparg < 0)
   1577 			return EINVAL;
   1578 		if (oparg >= 32)
   1579 			return EINVAL;
   1580 		op &= ~FUTEX_OP_OPARG_SHIFT;
   1581 	}
   1582 
   1583 	switch (op) {
   1584 	case FUTEX_OP_SET:
   1585 	case FUTEX_OP_ADD:
   1586 	case FUTEX_OP_OR:
   1587 	case FUTEX_OP_ANDN:
   1588 	case FUTEX_OP_XOR:
   1589 		break;
   1590 	default:
   1591 		return EINVAL;
   1592 	}
   1593 
   1594 	switch (cmp) {
   1595 	case FUTEX_OP_CMP_EQ:
   1596 	case FUTEX_OP_CMP_NE:
   1597 	case FUTEX_OP_CMP_LT:
   1598 	case FUTEX_OP_CMP_LE:
   1599 	case FUTEX_OP_CMP_GT:
   1600 	case FUTEX_OP_CMP_GE:
   1601 		break;
   1602 	default:
   1603 		return EINVAL;
   1604 	}
   1605 
   1606 	return 0;
   1607 }
   1608 
   1609 /*
   1610  * futex_compute_op(oldval, val3)
   1611  *
    1612  *	Apply a FUTEX_WAKE_OP operation to oldval.
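          *
          *	For example (illustrative values only): with op FUTEX_OP_ADD
          *	and oparg 1 encoded in val3, an oldval of 5 yields 6.  If
          *	FUTEX_OP_OPARG_SHIFT is set, oparg is treated as a bit index,
          *	so FUTEX_OP_OR with oparg 3 ORs in (1u << 3) == 8.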
   1613  */
   1614 static int
   1615 futex_compute_op(int oldval, int val3)
   1616 {
   1617 	int op = __SHIFTOUT(val3, FUTEX_OP_OP_MASK);
   1618 	int oparg = __SHIFTOUT(val3, FUTEX_OP_OPARG_MASK);
   1619 
   1620 	if (op & FUTEX_OP_OPARG_SHIFT) {
   1621 		KASSERT(oparg >= 0);
   1622 		KASSERT(oparg < 32);
   1623 		oparg = 1u << oparg;
   1624 		op &= ~FUTEX_OP_OPARG_SHIFT;
   1625 	}
   1626 
   1627 	switch (op) {
   1628 	case FUTEX_OP_SET:
   1629 		return oparg;
   1630 
   1631 	case FUTEX_OP_ADD:
   1632 		/*
   1633 		 * Avoid signed arithmetic overflow by doing
   1634 		 * arithmetic unsigned and converting back to signed
   1635 		 * at the end.
   1636 		 */
   1637 		return (int)((unsigned)oldval + (unsigned)oparg);
   1638 
   1639 	case FUTEX_OP_OR:
   1640 		return oldval | oparg;
   1641 
   1642 	case FUTEX_OP_ANDN:
   1643 		return oldval & ~oparg;
   1644 
   1645 	case FUTEX_OP_XOR:
   1646 		return oldval ^ oparg;
   1647 
   1648 	default:
   1649 		panic("invalid futex op");
   1650 	}
   1651 }
   1652 
   1653 /*
   1654  * futex_compute_cmp(oldval, val3)
   1655  *
    1656  *	Apply a FUTEX_WAKE_OP comparison to oldval.
   1657  */
   1658 static bool
   1659 futex_compute_cmp(int oldval, int val3)
   1660 {
   1661 	int cmp = __SHIFTOUT(val3, FUTEX_OP_CMP_MASK);
   1662 	int cmparg = __SHIFTOUT(val3, FUTEX_OP_CMPARG_MASK);
   1663 
   1664 	switch (cmp) {
   1665 	case FUTEX_OP_CMP_EQ:
   1666 		return (oldval == cmparg);
   1667 
   1668 	case FUTEX_OP_CMP_NE:
   1669 		return (oldval != cmparg);
   1670 
   1671 	case FUTEX_OP_CMP_LT:
   1672 		return (oldval < cmparg);
   1673 
   1674 	case FUTEX_OP_CMP_LE:
   1675 		return (oldval <= cmparg);
   1676 
   1677 	case FUTEX_OP_CMP_GT:
   1678 		return (oldval > cmparg);
   1679 
   1680 	case FUTEX_OP_CMP_GE:
   1681 		return (oldval >= cmparg);
   1682 
   1683 	default:
   1684 		panic("invalid futex cmp operation");
   1685 	}
   1686 }
   1687 
   1688 /*
   1689  * futex_func_wake_op(uaddr, val, uaddr2, val2, val3, retval)
   1690  *
   1691  *	Implement futex(FUTEX_WAKE_OP).
   1692  */
   1693 static int
   1694 futex_func_wake_op(bool shared, int *uaddr, int val, int *uaddr2, int val2,
   1695     int val3, register_t *retval)
   1696 {
   1697 	struct futex *f = NULL, *f2 = NULL;
   1698 	int oldval, newval, actual;
   1699 	unsigned nwoken = 0;
   1700 	int error;
   1701 
   1702 	/* Reject negative number of wakeups.  */
   1703 	if (val < 0 || val2 < 0) {
   1704 		error = EINVAL;
   1705 		goto out;
   1706 	}
   1707 
   1708 	/* Reject invalid operations before we start doing things.  */
   1709 	if ((error = futex_validate_op_cmp(val3)) != 0)
   1710 		goto out;
   1711 
   1712 	/* Look up the first futex, if any.  */
   1713 	error = futex_lookup(uaddr, shared, FUTEX_CLASS_NORMAL, &f);
   1714 	if (error)
   1715 		goto out;
   1716 
   1717 	/* Look up the second futex, if any.  */
   1718 	error = futex_lookup(uaddr2, shared, FUTEX_CLASS_NORMAL, &f2);
   1719 	if (error)
   1720 		goto out;
   1721 
   1722 	/*
   1723 	 * Under the op locks:
   1724 	 *
   1725 	 * 1. Read/modify/write: *uaddr2 op= oparg.
   1726 	 * 2. Unconditionally wake uaddr.
   1727 	 * 3. Conditionally wake uaddr2, if it previously matched val3.
   1728 	 */
   1729 	futex_op_lock2(f, f2);
   1730 	do {
   1731 		error = futex_load(uaddr2, &oldval);
   1732 		if (error)
   1733 			goto out_unlock;
   1734 		newval = futex_compute_op(oldval, val3);
   1735 		error = ucas_int(uaddr2, oldval, newval, &actual);
   1736 		if (error)
   1737 			goto out_unlock;
   1738 	} while (actual != oldval);
   1739 	nwoken = (f ? futex_wake(f, FUTEX_WRITERQ, val,
   1740 				 NULL, FUTEX_WRITERQ, 0,
   1741 				 FUTEX_BITSET_MATCH_ANY, NULL) : 0);
   1742 	if (f2 && futex_compute_cmp(oldval, val3))
   1743 		nwoken += futex_wake(f2, FUTEX_WRITERQ, val2,
   1744 				     NULL, FUTEX_WRITERQ, 0,
   1745 				     FUTEX_BITSET_MATCH_ANY, NULL);
   1746 
   1747 	/* Success! */
   1748 	error = 0;
   1749 out_unlock:
   1750 	futex_op_unlock2(f, f2);
   1751 
   1752 out:
   1753 	/* Return the number of waiters woken. */
   1754 	*retval = nwoken;
   1755 
   1756 	/* Release the futexes, if we got them. */
   1757 	if (f2)
   1758 		futex_rele(f2);
   1759 	if (f)
   1760 		futex_rele(f);
   1761 	return error;
   1762 }
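
/*
 * A hedged userland sketch of FUTEX_WAKE_OP, assuming a direct stub
 * for the __futex(2) system call with the argument order used by
 * sys___futex() below (uaddr, op, val, timeout, uaddr2, val2, val3);
 * mutex_word and cond_word are illustrative names:
 *
 *	// Atomically set *(&cond_word) = 1, wake up to 1 waiter on
 *	// mutex_word, and wake up to INT_MAX waiters on cond_word if
 *	// its old value was 0.
 *	__futex(&mutex_word, FUTEX_WAKE_OP | FUTEX_PRIVATE_FLAG,
 *	    1, NULL, &cond_word, INT_MAX,
 *	    FUTEX_OP(FUTEX_OP_SET, 1, FUTEX_OP_CMP_EQ, 0));
 */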
   1763 
   1764 /*
   1765  * do_futex(uaddr, op, val, timeout, uaddr2, val2, val3)
   1766  *
   1767  *	Implement the futex system call with all the parameters
   1768  *	parsed out.
   1769  */
   1770 int
   1771 do_futex(int *uaddr, int op, int val, const struct timespec *timeout,
   1772     int *uaddr2, int val2, int val3, register_t *retval)
   1773 {
   1774 	const bool shared = (op & FUTEX_PRIVATE_FLAG) ? false : true;
   1775 	const clockid_t clkid = (op & FUTEX_CLOCK_REALTIME) ? CLOCK_REALTIME
   1776 							    : CLOCK_MONOTONIC;
   1777 	int error;
   1778 
   1779 	op &= FUTEX_CMD_MASK;
   1780 
   1781 	switch (op) {
   1782 	case FUTEX_WAIT:
   1783 		SDT_PROBE6(futex, func, wait, entry,
   1784 		    uaddr, val, FUTEX_BITSET_MATCH_ANY, timeout,
   1785 		    TIMER_RELTIME, op & ~FUTEX_CMD_MASK);
   1786 		error = futex_func_wait(shared, uaddr, val,
   1787 		    FUTEX_BITSET_MATCH_ANY, timeout, clkid, TIMER_RELTIME,
   1788 		    retval);
   1789 		SDT_PROBE1(futex, func, wait, exit, error);
   1790 		break;
   1791 
   1792 	case FUTEX_WAIT_BITSET:
   1793 		SDT_PROBE6(futex, func, wait, entry,
   1794 		    uaddr, val, val3, timeout,
   1795 		    TIMER_ABSTIME, op & ~FUTEX_CMD_MASK);
   1796 		error = futex_func_wait(shared, uaddr, val, val3, timeout,
   1797 		    clkid, TIMER_ABSTIME, retval);
   1798 		SDT_PROBE1(futex, func, wait, exit, error);
   1799 		break;
   1800 
   1801 	case FUTEX_WAKE:
   1802 		SDT_PROBE4(futex, func, wake, entry,
   1803 		    uaddr, val, FUTEX_BITSET_MATCH_ANY, op & ~FUTEX_CMD_MASK);
   1804 		error = futex_func_wake(shared, uaddr, val,
   1805 		    FUTEX_BITSET_MATCH_ANY, retval);
   1806 		SDT_PROBE2(futex, func, wake, exit, error, *retval);
   1807 		break;
   1808 
   1809 	case FUTEX_WAKE_BITSET:
   1810 		SDT_PROBE4(futex, func, wake, entry,
   1811 		    uaddr, val, val3, op & ~FUTEX_CMD_MASK);
   1812 		error = futex_func_wake(shared, uaddr, val, val3, retval);
   1813 		SDT_PROBE2(futex, func, wake, exit, error, *retval);
   1814 		break;
   1815 
   1816 	case FUTEX_REQUEUE:
   1817 		SDT_PROBE5(futex, func, requeue, entry,
   1818 		    uaddr, val, uaddr2, val2, op & ~FUTEX_CMD_MASK);
   1819 		error = futex_func_requeue(shared, op, uaddr, val, uaddr2,
   1820 		    val2, 0, retval);
   1821 		SDT_PROBE2(futex, func, requeue, exit, error, *retval);
   1822 		break;
   1823 
   1824 	case FUTEX_CMP_REQUEUE:
   1825 		SDT_PROBE6(futex, func, cmp_requeue, entry,
   1826 		    uaddr, val, uaddr2, val2, val3, op & ~FUTEX_CMD_MASK);
   1827 		error = futex_func_requeue(shared, op, uaddr, val, uaddr2,
   1828 		    val2, val3, retval);
   1829 		SDT_PROBE2(futex, func, cmp_requeue, exit, error, *retval);
   1830 		break;
   1831 
   1832 	case FUTEX_WAKE_OP:
   1833 		SDT_PROBE6(futex, func, wake_op, entry,
   1834 		    uaddr, val, uaddr2, val2, val3, op & ~FUTEX_CMD_MASK);
   1835 		error = futex_func_wake_op(shared, uaddr, val, uaddr2, val2,
   1836 		    val3, retval);
   1837 		SDT_PROBE2(futex, func, wake_op, exit, error, *retval);
   1838 		break;
   1839 
   1840 	case FUTEX_FD:
   1841 	default:
   1842 		error = ENOSYS;
   1843 		break;
   1844 	}
   1845 
   1846 	return error;
   1847 }
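
/*
 * The op word decoded above is a command (FUTEX_CMD_MASK) combined
 * with modifier flags.  A minimal userland sketch of an absolute-
 * deadline wait, again assuming the raw __futex(2) argument order;
 * word and expected are illustrative:
 *
 *	struct timespec deadline;
 *	clock_gettime(CLOCK_REALTIME, &deadline);
 *	deadline.tv_sec += 5;	// give up 5 seconds from now
 *
 *	// Sleep as long as *(&word) == expected (any wake bit),
 *	// process-private, timing out at the absolute CLOCK_REALTIME
 *	// deadline -- FUTEX_WAIT_BITSET takes TIMER_ABSTIME above.
 *	__futex(&word, FUTEX_WAIT_BITSET | FUTEX_PRIVATE_FLAG |
 *	    FUTEX_CLOCK_REALTIME, expected, &deadline, NULL, 0,
 *	    FUTEX_BITSET_MATCH_ANY);
 */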
   1848 
   1849 /*
   1850  * sys___futex(l, uap, retval)
   1851  *
   1852  *	__futex(2) system call: generic futex operations.
   1853  */
   1854 int
   1855 sys___futex(struct lwp *l, const struct sys___futex_args *uap,
   1856     register_t *retval)
   1857 {
   1858 	/* {
   1859 		syscallarg(int *) uaddr;
   1860 		syscallarg(int) op;
   1861 		syscallarg(int) val;
   1862 		syscallarg(const struct timespec *) timeout;
   1863 		syscallarg(int *) uaddr2;
   1864 		syscallarg(int) val2;
   1865 		syscallarg(int) val3;
   1866 	} */
   1867 	struct timespec ts, *tsp;
   1868 	int error;
   1869 
   1870 	/*
   1871 	 * Copy in the timeout argument, if specified.
   1872 	 */
   1873 	if (SCARG(uap, timeout)) {
   1874 		error = copyin(SCARG(uap, timeout), &ts, sizeof(ts));
   1875 		if (error)
   1876 			return error;
   1877 		tsp = &ts;
   1878 	} else {
   1879 		tsp = NULL;
   1880 	}
   1881 
   1882 	return do_futex(SCARG(uap, uaddr), SCARG(uap, op), SCARG(uap, val),
   1883 	    tsp, SCARG(uap, uaddr2), SCARG(uap, val2), SCARG(uap, val3),
   1884 	    retval);
   1885 }
   1886 
   1887 /*
   1888  * sys___futex_set_robust_list(l, uap, retval)
   1889  *
   1890  *	__futex_set_robust_list(2) system call for robust futexes.
   1891  */
   1892 int
   1893 sys___futex_set_robust_list(struct lwp *l,
   1894     const struct sys___futex_set_robust_list_args *uap, register_t *retval)
   1895 {
   1896 	/* {
   1897 		syscallarg(void *) head;
   1898 		syscallarg(size_t) len;
   1899 	} */
   1900 	void *head = SCARG(uap, head);
   1901 
   1902 	if (SCARG(uap, len) != _FUTEX_ROBUST_HEAD_SIZE)
   1903 		return EINVAL;
   1904 	if ((uintptr_t)head % sizeof(u_long))
   1905 		return EINVAL;
   1906 
   1907 	l->l_robust_head = (uintptr_t)head;
   1908 
   1909 	return 0;
   1910 }
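
/*
 * A hedged sketch of the userland side, assuming the conventional
 * three-word robust head layout implied by the
 * _FUTEX_ROBUST_HEAD_{LIST,OFFSET,PENDING} indices consumed in
 * futex_release_all_lwp() below; the struct and field names here are
 * illustrative, not the installed header's:
 *
 *	struct my_robust_mutex {
 *		void		*node;	// link in the robust list
 *		volatile int	 word;	// the futex word itself
 *	};
 *
 *	struct my_robust_head {
 *		void	*list;		// circular list of held futexes
 *		long	 offset;	// node -> futex word offset
 *		void	*pending;	// entry being acquired/released
 *	} rh = {
 *		.list	 = &rh,		// empty list points at the head
 *		.offset	 = offsetof(struct my_robust_mutex, word),
 *		.pending = NULL,
 *	};
 *
 *	// Register once per thread, typically at thread start.
 *	__futex_set_robust_list(&rh, _FUTEX_ROBUST_HEAD_SIZE);
 */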
   1911 
   1912 /*
   1913  * sys___futex_get_robust_list(l, uap, retval)
   1914  *
   1915  *	__futex_get_robust_list(2) system call for robust futexes.
   1916  */
   1917 int
   1918 sys___futex_get_robust_list(struct lwp *l,
   1919     const struct sys___futex_get_robust_list_args *uap, register_t *retval)
   1920 {
   1921 	/* {
   1922 		syscallarg(lwpid_t) lwpid;
   1923 		syscallarg(void **) headp;
   1924 		syscallarg(size_t *) lenp;
   1925 	} */
   1926 	void *head;
   1927 	const size_t len = _FUTEX_ROBUST_HEAD_SIZE;
   1928 	int error;
   1929 
   1930 	error = futex_robust_head_lookup(l, SCARG(uap, lwpid), &head);
   1931 	if (error)
   1932 		return error;
   1933 
   1934 	/* Copy out the head pointer and the head structure length. */
   1935 	error = copyout(&head, SCARG(uap, headp), sizeof(head));
   1936 	if (__predict_true(error == 0)) {
   1937 		error = copyout(&len, SCARG(uap, lenp), sizeof(len));
   1938 	}
   1939 
   1940 	return error;
   1941 }
   1942 
   1943 /*
    1944  * release_futex(uptr, tid, is_pi, is_pending)
   1945  *
   1946  *	Try to release the robust futex at uva in the current process
   1947  *	on lwp exit.  If anything goes wrong, silently fail.  It is the
   1948  *	userland program's obligation to arrange correct behaviour.
   1949  */
   1950 static void
   1951 release_futex(uintptr_t const uptr, lwpid_t const tid, bool const is_pi,
   1952     bool const is_pending)
   1953 {
   1954 	int *uaddr;
   1955 	struct futex *f;
   1956 	int oldval, newval, actual;
   1957 	int error;
   1958 	bool shared;
   1959 
   1960 	/* If it's misaligned, tough.  */
   1961 	if (__predict_false(uptr & 3))
   1962 		return;
   1963 	uaddr = (int *)uptr;
   1964 
   1965 	error = futex_load(uaddr, &oldval);
   1966 	if (__predict_false(error))
   1967 		return;
   1968 
   1969 	/*
   1970 	 * There are two race conditions we need to handle here:
   1971 	 *
   1972 	 * 1. User space cleared the futex word but died before
   1973 	 *    being able to issue the wakeup.  No wakeups will
   1974 	 *    ever be issued, oops!
   1975 	 *
   1976 	 * 2. Awakened waiter died before being able to acquire
   1977 	 *    the futex in user space.  Any other waiters are
   1978 	 *    now stuck, oops!
   1979 	 *
   1980 	 * In both of these cases, the futex word will be 0 (because
   1981 	 * it's updated before the wake is issued).  The best we can
   1982 	 * do is detect this situation if it's the pending futex and
   1983 	 * issue a wake without modifying the futex word.
   1984 	 *
   1985 	 * XXX eventual PI handling?
   1986 	 */
   1987 	if (__predict_false(is_pending && (oldval & ~FUTEX_WAITERS) == 0)) {
   1988 		register_t retval;
   1989 		error = futex_func_wake(/*shared*/true, uaddr, 1,
   1990 		    FUTEX_BITSET_MATCH_ANY, &retval);
   1991 		if (error != 0 || retval == 0)
   1992 			(void) futex_func_wake(/*shared*/false, uaddr, 1,
   1993 			    FUTEX_BITSET_MATCH_ANY, &retval);
   1994 		return;
   1995 	}
   1996 
   1997 	/* Optimistically test whether we need to do anything at all.  */
   1998 	if ((oldval & FUTEX_TID_MASK) != tid)
   1999 		return;
   2000 
   2001 	/*
   2002 	 * We need to handle the case where this thread owned the futex,
   2003 	 * but it was uncontended.  In this case, there won't be any
   2004 	 * kernel state to look up.  All we can do is mark the futex
   2005 	 * as a zombie to be mopped up the next time another thread
   2006 	 * attempts to acquire it.
   2007 	 *
    2008 	 * N.B. It's important to make sure FUTEX_OWNER_DIED gets set
    2009 	 * in this loop, even if waiters appear while we are doing
    2010 	 * so.  This is because FUTEX_WAITERS is set by user space
   2011 	 * before calling __futex() to wait, and the futex needs
   2012 	 * to be marked as a zombie when the new waiter gets into
   2013 	 * the kernel.
   2014 	 */
   2015 	if ((oldval & FUTEX_WAITERS) == 0) {
   2016 		do {
   2017 			error = futex_load(uaddr, &oldval);
   2018 			if (error)
   2019 				return;
   2020 			if ((oldval & FUTEX_TID_MASK) != tid)
   2021 				return;
   2022 			newval = oldval | FUTEX_OWNER_DIED;
   2023 			error = ucas_int(uaddr, oldval, newval, &actual);
   2024 			if (error)
   2025 				return;
   2026 		} while (actual != oldval);
   2027 
   2028 		/*
    2029 		 * If there is still no indication of waiters, then there is
   2030 		 * no more work for us to do.
   2031 		 */
   2032 		if ((oldval & FUTEX_WAITERS) == 0)
   2033 			return;
   2034 	}
   2035 
   2036 	/*
   2037 	 * Look for a futex.  Try shared first, then private.  If we
    2038 	 * can't find one, tough.
   2039 	 *
   2040 	 * Note: the assumption here is that anyone placing a futex on
   2041 	 * the robust list is adhering to the rules, regardless of the
   2042 	 * futex class.
   2043 	 */
   2044 	for (f = NULL, shared = true; f == NULL; shared = false) {
   2045 		error = futex_lookup(uaddr, shared, FUTEX_CLASS_ANY, &f);
   2046 		if (error)
   2047 			return;
   2048 		if (f == NULL && shared == false)
   2049 			return;
   2050 	}
   2051 
   2052 	/* Work under the futex op lock.  */
   2053 	futex_op_lock(f);
   2054 
   2055 	/*
   2056 	 * Fetch the word: if the tid doesn't match ours, skip;
   2057 	 * otherwise, set the owner-died bit, atomically.
   2058 	 */
   2059 	do {
   2060 		error = futex_load(uaddr, &oldval);
   2061 		if (error)
   2062 			goto out;
   2063 		if ((oldval & FUTEX_TID_MASK) != tid)
   2064 			goto out;
   2065 		newval = oldval | FUTEX_OWNER_DIED;
   2066 		error = ucas_int(uaddr, oldval, newval, &actual);
   2067 		if (error)
   2068 			goto out;
   2069 	} while (actual != oldval);
   2070 
   2071 	/*
   2072 	 * If there may be waiters, try to wake one.  If anything goes
   2073 	 * wrong, tough.
   2074 	 *
   2075 	 * XXX eventual PI handling?
   2076 	 */
   2077 	if (oldval & FUTEX_WAITERS) {
   2078 		(void)futex_wake(f, FUTEX_WRITERQ, 1,
   2079 				 NULL, FUTEX_WRITERQ, 0,
   2080 				 FUTEX_BITSET_MATCH_ANY, NULL);
   2081 	}
   2082 
   2083 	/* Unlock the queue and release the futex.  */
   2084 out:	futex_op_unlock(f);
   2085 	futex_rele(f);
   2086 }
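
/*
 * For reference, the robust futex word manipulated above follows the
 * usual TID-based protocol: the owner stores its thread ID in the low
 * bits (FUTEX_TID_MASK), FUTEX_WAITERS is set once anybody has to
 * sleep, and FUTEX_OWNER_DIED is how the kernel tells the next
 * acquirer that the previous owner exited without unlocking.  A
 * hedged userland acquire sketch (illustrative names, error handling
 * omitted; the exact bit values live in <sys/futex.h>):
 *
 *	// Fast path: 0 -> my_tid means we own it and nobody waits.
 *	if (atomic_cas_uint(&m->word, 0, my_tid) != 0) {
 *		// Contended (or FUTEX_OWNER_DIED already set): set
 *		// FUTEX_WAITERS, FUTEX_WAIT on the word, and check
 *		// FUTEX_OWNER_DIED once we finally win the CAS.
 *	}
 */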
   2087 
   2088 /*
   2089  * futex_robust_head_lookup(l, lwpid)
   2090  *
   2091  *	Helper function to look up a robust head by LWP ID.
   2092  */
   2093 int
   2094 futex_robust_head_lookup(struct lwp *l, lwpid_t lwpid, void **headp)
   2095 {
   2096 	struct proc *p = l->l_proc;
   2097 
   2098 	/* Find the other lwp, if requested; otherwise use our robust head.  */
   2099 	if (lwpid) {
   2100 		mutex_enter(p->p_lock);
   2101 		l = lwp_find(p, lwpid);
   2102 		if (l == NULL) {
   2103 			mutex_exit(p->p_lock);
   2104 			return ESRCH;
   2105 		}
   2106 		*headp = (void *)l->l_robust_head;
   2107 		mutex_exit(p->p_lock);
   2108 	} else {
   2109 		*headp = (void *)l->l_robust_head;
   2110 	}
   2111 	return 0;
   2112 }
   2113 
   2114 /*
   2115  * futex_fetch_robust_head(uaddr)
   2116  *
   2117  *	Helper routine to fetch the futex robust list head that
   2118  *	handles 32-bit binaries running on 64-bit kernels.
   2119  */
   2120 static int
   2121 futex_fetch_robust_head(uintptr_t uaddr, u_long *rhead)
   2122 {
   2123 #ifdef _LP64
   2124 	if (curproc->p_flag & PK_32) {
   2125 		uint32_t rhead32[_FUTEX_ROBUST_HEAD_NWORDS];
   2126 		int error;
   2127 
   2128 		error = copyin((void *)uaddr, rhead32, sizeof(rhead32));
   2129 		if (__predict_true(error == 0)) {
   2130 			for (int i = 0; i < _FUTEX_ROBUST_HEAD_NWORDS; i++) {
   2131 				if (i == _FUTEX_ROBUST_HEAD_OFFSET) {
   2132 					/*
   2133 					 * Make sure the offset is sign-
   2134 					 * extended.
   2135 					 */
   2136 					rhead[i] = (int32_t)rhead32[i];
   2137 				} else {
   2138 					rhead[i] = rhead32[i];
   2139 				}
   2140 			}
   2141 		}
   2142 		return error;
   2143 	}
    2144 #endif /* _LP64 */
   2145 
   2146 	return copyin((void *)uaddr, rhead,
   2147 	    sizeof(*rhead) * _FUTEX_ROBUST_HEAD_NWORDS);
   2148 }
   2149 
   2150 /*
    2151  * futex_decode_robust_word(word, entry, is_pi)
   2152  *
   2153  *	Decode a robust futex list word into the entry and entry
   2154  *	properties.
   2155  */
   2156 static inline void
   2157 futex_decode_robust_word(uintptr_t const word, uintptr_t * const entry,
   2158     bool * const is_pi)
   2159 {
   2160 	*is_pi = (word & _FUTEX_ROBUST_ENTRY_PI) ? true : false;
   2161 	*entry = word & ~_FUTEX_ROBUST_ENTRY_PI;
   2162 }
   2163 
   2164 /*
    2165  * futex_fetch_robust_entry(uaddr, valp, is_pi)
   2166  *
   2167  *	Helper routine to fetch and decode a robust futex entry
   2168  *	that handles 32-bit binaries running on 64-bit kernels.
   2169  */
   2170 static int
   2171 futex_fetch_robust_entry(uintptr_t const uaddr, uintptr_t * const valp,
   2172     bool * const is_pi)
   2173 {
   2174 	uintptr_t val = 0;
   2175 	int error = 0;
   2176 
   2177 #ifdef _LP64
   2178 	if (curproc->p_flag & PK_32) {
   2179 		uint32_t val32;
   2180 
   2181 		error = ufetch_32((uint32_t *)uaddr, &val32);
   2182 		if (__predict_true(error == 0))
   2183 			val = val32;
   2184 	} else
   2185 #endif /* _LP64 */
   2186 		error = ufetch_long((u_long *)uaddr, (u_long *)&val);
   2187 	if (__predict_false(error))
   2188 		return error;
   2189 
   2190 	futex_decode_robust_word(val, valp, is_pi);
   2191 	return 0;
   2192 }
   2193 
   2194 /*
   2195  * futex_release_all_lwp(l, tid)
   2196  *
   2197  *	Release all l's robust futexes.  If anything looks funny in
   2198  *	the process, give up -- it's userland's responsibility to dot
   2199  *	the i's and cross the t's.
   2200  */
   2201 void
   2202 futex_release_all_lwp(struct lwp * const l, lwpid_t const tid)
   2203 {
   2204 	u_long rhead[_FUTEX_ROBUST_HEAD_NWORDS];
   2205 	int limit = 1000000;
   2206 	int error;
   2207 
   2208 	/* If there's no robust list there's nothing to do. */
   2209 	if (l->l_robust_head == 0)
   2210 		return;
   2211 
   2212 	/* Read the final snapshot of the robust list head. */
   2213 	error = futex_fetch_robust_head(l->l_robust_head, rhead);
   2214 	if (error) {
   2215 		printf("WARNING: pid %jd (%s) lwp %jd tid %jd:"
   2216 		    " unmapped robust futex list head\n",
   2217 		    (uintmax_t)l->l_proc->p_pid, l->l_proc->p_comm,
   2218 		    (uintmax_t)l->l_lid, (uintmax_t)tid);
   2219 		return;
   2220 	}
   2221 
   2222 	const long offset = (long)rhead[_FUTEX_ROBUST_HEAD_OFFSET];
   2223 
   2224 	uintptr_t next, pending;
   2225 	bool is_pi, pending_is_pi;
   2226 
   2227 	futex_decode_robust_word(rhead[_FUTEX_ROBUST_HEAD_LIST],
   2228 	    &next, &is_pi);
   2229 	futex_decode_robust_word(rhead[_FUTEX_ROBUST_HEAD_PENDING],
   2230 	    &pending, &pending_is_pi);
   2231 
   2232 	/*
    2233 	 * Walk down the list of locked futexes and release them,
    2234 	 * giving up after one million entries.
   2235 	 */
   2236 
   2237 	while (next != l->l_robust_head && limit-- > 0) {
   2238 		/* pending handled below. */
   2239 		if (next != pending)
   2240 			release_futex(next + offset, tid, is_pi, false);
   2241 		error = futex_fetch_robust_entry(next, &next, &is_pi);
   2242 		if (error)
   2243 			break;
   2244 		preempt_point();
   2245 	}
   2246 	if (limit <= 0) {
   2247 		printf("WARNING: pid %jd (%s) lwp %jd tid %jd:"
   2248 		    " exhausted robust futex limit\n",
   2249 		    (uintmax_t)l->l_proc->p_pid, l->l_proc->p_comm,
   2250 		    (uintmax_t)l->l_lid, (uintmax_t)tid);
   2251 	}
   2252 
   2253 	/* If there's a pending futex, it may need to be released too. */
   2254 	if (pending != 0) {
   2255 		release_futex(pending + offset, tid, pending_is_pi, true);
   2256 	}
   2257 }
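
/*
 * The "pending" slot consumed above exists so that userland can
 * publish an acquire or release that is in progress before it touches
 * the futex word.  A hedged sketch of the discipline, reusing the
 * illustrative head layout sketched earlier:
 *
 *	rh.pending = &m->node;		// about to lock/unlock m
 *	... CAS on m->word, link or unlink m->node in rh.list ...
 *	rh.pending = NULL;		// fully done
 *
 * If the thread dies anywhere in between, futex_release_all_lwp()
 * still sees m via the pending slot, and release_futex() can issue
 * the wakeup that userland never got to.
 */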
   2258