      1 /*	$NetBSD: sys_futex.c,v 1.12.4.4 2021/08/06 23:53:53 thorpej Exp $	*/
      2 
      3 /*-
      4  * Copyright (c) 2018, 2019, 2020 The NetBSD Foundation, Inc.
      5  * All rights reserved.
      6  *
      7  * This code is derived from software contributed to The NetBSD Foundation
      8  * by Taylor R. Campbell and Jason R. Thorpe.
      9  *
     10  * Redistribution and use in source and binary forms, with or without
     11  * modification, are permitted provided that the following conditions
     12  * are met:
     13  * 1. Redistributions of source code must retain the above copyright
     14  *    notice, this list of conditions and the following disclaimer.
     15  * 2. Redistributions in binary form must reproduce the above copyright
     16  *    notice, this list of conditions and the following disclaimer in the
     17  *    documentation and/or other materials provided with the distribution.
     18  *
     19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
     20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
     21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
     23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
     24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
     25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
     26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
     27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
     28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
     29  * POSSIBILITY OF SUCH DAMAGE.
     30  */
     31 
     32 #include <sys/cdefs.h>
     33 __KERNEL_RCSID(0, "$NetBSD: sys_futex.c,v 1.12.4.4 2021/08/06 23:53:53 thorpej Exp $");
     34 
     35 /*
     36  * Futexes
     37  *
     38  *	The futex system call coordinates notifying threads waiting for
     39  *	changes on a 32-bit word of memory.  The word can be managed by
     40  *	CPU atomic operations in userland, without system calls, as long
     41  *	as there is no contention.
     42  *
     43  *	The simplest use case demonstrating the utility is:
     44  *
     45  *		// 32-bit word of memory shared among threads or
     46  *		// processes in userland.  lock & 1 means owned;
     47  *		// lock & 2 means there are waiters waiting.
     48  *		volatile int lock = 0;
     49  *
     50  *		int v;
     51  *
     52  *		// Acquire a lock.
     53  *		do {
     54  *			v = lock;
     55  *			if (v & 1) {
     56  *				// Lock is held.  Set a bit to say that
     57  *				// there are waiters, and wait for lock
     58  *				// to change to anything other than v;
     59  *				// then retry.
     60  *				if (atomic_cas_uint(&lock, v, v | 2) != v)
     61  *					continue;
     62  *				futex(FUTEX_WAIT, &lock, v | 2, NULL, NULL, 0);
     63  *				continue;
     64  *			}
     65  *		} while (atomic_cas_uint(&lock, v, v | 1) != v);
     66  *		membar_enter();
     67  *
     68  *		...
     69  *
     70  *		// Release the lock.  Optimistically assume there are
     71  *		// no waiters first until demonstrated otherwise.
     72  *		membar_exit();
     73  *		if (atomic_cas_uint(&lock, 1, 0) != 1) {
     74  *			// There may be waiters.
     75  *			v = atomic_swap_uint(&lock, 0);
     76  *			// If there are still waiters, wake one.
     77  *			if (v & 2)
     78  *				futex(FUTEX_WAKE, &lock, 1, NULL, NULL, 0);
     79  *		}
     80  *
     81  *	The goal is to avoid the futex system call unless there is
     82  *	contention; then if there is contention, to guarantee no missed
     83  *	wakeups.
     84  *
     85  *	For a simple implementation, futex(FUTEX_WAIT) could queue
     86  *	itself to be woken, double-check the lock word, and then sleep;
     87  *	spurious wakeups are generally a fact of life, so any
     88  *	FUTEX_WAKE could just wake every FUTEX_WAIT in the system.
     89  *
     90  *	If this were all there is to it, we could then increase
     91  *	parallelism by refining the approximation: partition the
     92  *	waiters into buckets by hashing the lock addresses to reduce
     93  *	the incidence of spurious wakeups.  But this is not all.
     94  *
     95  *	The futex(FUTEX_CMP_REQUEUE, &lock, n, &lock2, m, val)
     96  *	operation not only wakes n waiters on lock if lock == val, but
     97  *	also _transfers_ m additional waiters to lock2.  Unless wakeups
     98  *	on lock2 also trigger wakeups on lock, we cannot move waiters
     99  *	to lock2 if they merely share the same hash as waiters on lock.
    100  *	Thus, we can't approximately distribute waiters into queues by
    101  *	a hash function; we must distinguish futex queues exactly by
    102  *	lock address.
    103  *	lock address.  (A sketch of such a requeue follows this comment.)
    104  *	For now, we use a global red/black tree to index futexes.  This
    105  *	should be replaced by a lockless radix tree with a thread to
    106  *	free entries no longer in use once all lookups on all CPUs have
    107  *	completed.
    108  *
    109  *	Specifically, we maintain two maps:
    110  *
    111  *	futex_tab.va[vmspace, va] for private futexes
    112  *	futex_tab.oa[uvm_voaddr] for shared futexes
    113  *
    114  *	This implementation does not support priority inheritance.
    115  */
    116 
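/*
 * An illustrative sketch of the FUTEX_CMP_REQUEUE operation described
 * above, in the same informal futex(op, ...) notation as the example
 * in the comment: a condition-variable broadcast that wakes one waiter
 * and transfers the rest to the mutex word, so they are woken one at a
 * time as the mutex is released rather than stampeding all at once.
 * The names cv_seq, mtx, lock(), and unlock() are hypothetical
 * placeholders, and error handling (e.g. retrying on EAGAIN) is
 * omitted; this is not part of the implementation below.
 *
 *	// Waiter: sample the sequence word while holding the mutex,
 *	// drop the mutex, and sleep until the sequence word changes.
 *	unsigned seq = cv_seq;
 *	unlock(&mtx);
 *	futex(FUTEX_WAIT, &cv_seq, seq, NULL, NULL, 0);
 *	lock(&mtx);
 *
 *	// Broadcaster: bump the sequence word, wake one waiter, and
 *	// requeue the remaining waiters onto the mutex word.
 *	atomic_inc_uint(&cv_seq);
 *	futex(FUTEX_CMP_REQUEUE, &cv_seq, 1, &mtx, INT_MAX, cv_seq);
 */
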
    117 #include <sys/param.h>
    118 #include <sys/types.h>
    119 #include <sys/atomic.h>
    120 #include <sys/condvar.h>
    121 #include <sys/futex.h>
    122 #include <sys/mutex.h>
    123 #include <sys/rbtree.h>
    124 #include <sys/sleepq.h>
    125 #include <sys/queue.h>
    126 #include <sys/sdt.h>
    127 
    128 #include <sys/syscall.h>
    129 #include <sys/syscallargs.h>
    130 #include <sys/syscallvar.h>
    131 
    132 #include <uvm/uvm_extern.h>
    133 
    134 /*
    135  * DTrace probes.
    136  */
    137 SDT_PROVIDER_DEFINE(futex);
    138 
    139 /* entry: uaddr, val, bitset, timeout, clkflags, fflags */
    140 /* exit: error */
    141 SDT_PROBE_DEFINE6(futex, func, wait, entry, "int *", "int", "int",
    142 		  "struct timespec *", "int", "int");
    143 SDT_PROBE_DEFINE1(futex, func, wait, exit, "int");
    144 
    145 /* entry: uaddr, nwake, bitset, fflags */
    146 /* exit: error, nwoken */
    147 SDT_PROBE_DEFINE4(futex, func, wake, entry, "int *", "int", "int", "int");
    148 SDT_PROBE_DEFINE2(futex, func, wake, exit, "int", "int");
    149 
    150 /* entry: uaddr, nwake, uaddr2, nrequeue, fflags */
    151 /* exit: error, nwoken */
    152 SDT_PROBE_DEFINE5(futex, func, requeue, entry, "int *", "int", "int *", "int",
    153 		  "int");
    154 SDT_PROBE_DEFINE2(futex, func, requeue, exit, "int", "int");
    155 
    156 /* entry: uaddr, nwake, uaddr2, nrequeue, val3, fflags */
    157 /* exit: error, nwoken */
    158 SDT_PROBE_DEFINE6(futex, func, cmp_requeue, entry, "int *", "int", "int *",
    159 		  "int", "int", "int");
    160 SDT_PROBE_DEFINE2(futex, func, cmp_requeue, exit, "int", "int");
    161 
    162 /* entry: uaddr, nwake, uaddr2, nwake2, wakeop, fflags */
    163 /* exit: error, nwoken */
    164 SDT_PROBE_DEFINE6(futex, func, wake_op, entry, "int *", "int", "int *", "int",
    165 		  "int", "int");
    166 SDT_PROBE_DEFINE2(futex, func, wake_op, exit, "int", "int");
    167 
    168 SDT_PROBE_DEFINE0(futex, wait, finish, normally);
    169 SDT_PROBE_DEFINE0(futex, wait, finish, wakerace);
    170 SDT_PROBE_DEFINE0(futex, wait, finish, aborted);
    171 
    172 /* entry: timo */
    173 /* exit: error */
    174 SDT_PROBE_DEFINE1(futex, wait, sleepq_block, entry, "int");
    175 SDT_PROBE_DEFINE1(futex, wait, sleepq_block, exit, "int");
    176 
    177 /*
    178  * Lock order:
    179  *
    180  *	futex_tab.lock
    181  *	futex::fx_op_lock		ordered by kva of struct futex
    182  *	 -> futex::fx_sq_lock		ordered by kva of sleepq lock
    183  *
    184  * N.B. multiple futexes can share a single sleepq lock.
    185  */
    186 
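/*
 * A sketch of how a cross-futex operation (e.g. a requeue) follows
 * this order, using the helpers defined later in this file; this only
 * restates the pattern used by futex_func_requeue() and futex_wake()
 * below, and is not additional API:
 *
 *	futex_op_lock2(f, f2);		// fx_op_lock, ordered by futex kva
 *	futex_sq_lock2(f, f2);		// fx_sq_lock, ordered by lock kva
 *	... wake and/or transfer waiters ...
 *	futex_sq_unlock2(f, f2);
 *	futex_op_unlock2(f, f2);
 */
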
    187 /*
    188  * union futex_key
    189  *
    190  *	A futex is addressed either by a vmspace+va (private) or by
    191  *	a uvm_voaddr (shared).
    192  */
    193 typedef union futex_key {
    194 	struct {
    195 		struct vmspace			*vmspace;
    196 		vaddr_t				va;
    197 	}			fk_private;
    198 	struct uvm_voaddr	fk_shared;
    199 } futex_key_t;
    200 
    201 CTASSERT(offsetof(futex_key_t, fk_private.va) ==
    202 	 offsetof(futex_key_t, fk_shared.offset));
    203 
    204 /*
    205  * struct futex
    206  *
    207  *	Kernel state for a futex located at a particular address in a
    208  *	particular virtual address space.
    209  *
    210  *	N.B. fx_refcnt is an unsigned long because we need to be able
    211  *	to operate on it atomically on all systems while at the same
    212  *	time rendering practically impossible the chance of it reaching
    213  *	its max value.  In practice, we're limited by the number of LWPs
    214  *	that can be present on the system at any given time, and the
    215  *	assumption is that limit will be good enough on a 32-bit platform.
    216  *	See futex_wake() for why overflow needs to be avoided.
    217  *
    218  *	XXX Since futex addresses must be 4-byte aligned, we could
    219  *	XXX squirrel away fx_shared and fx_on_tree bits in the "va"
    220  *	XXX field of the key.  Worth it?
    221  */
    222 struct futex {
    223 	futex_key_t		fx_key;
    224 	struct rb_node		fx_node;
    225 	unsigned long		fx_refcnt;
    226 	bool			fx_shared;
    227 	bool			fx_on_tree;
    228 	uint8_t			fx_class;
    229 
    230 	kmutex_t		fx_op_lock;	/* adaptive */
    231 	kmutex_t *		fx_sq_lock;	/* &sleepq_locks[...] */
    232 	sleepq_t		fx_sqs[2];	/* 0=reader, 1=writer */
    233 	unsigned int		fx_nwaiters[2];
    234 };
    235 
    236 /*
    237  * futex classes: Some futex operations can only be carried out on
    238  * futexes that are known to be abiding by a certain protocol.  These
    239  * classes are assigned to a futex when created due to a wait event,
    240  * and when subsequent wake or requeue operations are issued, the
    241  * class is checked at futex lookup time.  If the class does not match,
    242  * EINVAL is the result.
    243  */
    244 #define	FUTEX_CLASS_ANY		0		/* match any class in lookup */
    245 #define	FUTEX_CLASS_NORMAL	1		/* normal futex */
    246 #define	FUTEX_CLASS_RWLOCK	2		/* rwlock futex */
    247 #define	FUTEX_CLASS_PI		3		/* for FUTEX_*_PI ops */
    248 
    249 /* sleepq definitions */
    250 #define	FUTEX_READERQ		0
    251 #define	FUTEX_WRITERQ		1
    252 
    253 static const char futex_wmesg[] = "futex";
    254 
    255 static void	futex_unsleep(lwp_t *, bool);
    256 
    257 static syncobj_t futex_syncobj = {
    258 	.sobj_flag	= SOBJ_SLEEPQ_SORTED,
    259 	.sobj_unsleep	= futex_unsleep,
    260 	.sobj_changepri	= sleepq_changepri,
    261 	.sobj_lendpri	= sleepq_lendpri,
    262 	.sobj_owner	= syncobj_noowner,
    263 };
    264 
    265 /*
    266  * futex_tab
    267  *
    268  *	Global trees of futexes by vmspace/va and VM object address.
    269  *
    270  *	XXX This obviously doesn't scale in parallel.  We could use a
    271  *	pserialize-safe data structure, but there may be a high cost to
    272  *	frequent deletion since we don't cache futexes after we're done
    273  *	with them.  We could use hashed locks.  But for now, just make
    274  *	sure userland can't DoS the serial performance, by using a
    275  *	balanced binary tree for lookup.
    276  *
    277  *	XXX We could use a per-process tree for the table indexed by
    278  *	virtual address to reduce contention between processes.
    279  */
    280 static struct {
    281 	kmutex_t	lock;
    282 	struct rb_tree	va;
    283 	struct rb_tree	oa;
    284 } futex_tab __cacheline_aligned;
    285 
    286 static int
    287 compare_futex_key(void *cookie, const void *n, const void *k)
    288 {
    289 	const struct futex *fa = n;
    290 	const futex_key_t *fka = &fa->fx_key;
    291 	const futex_key_t *fkb = k;
    292 
    293 	if ((uintptr_t)fka->fk_private.vmspace <
    294 	    (uintptr_t)fkb->fk_private.vmspace)
    295 		return -1;
    296 	if ((uintptr_t)fka->fk_private.vmspace >
    297 	    (uintptr_t)fkb->fk_private.vmspace)
    298 		return +1;
    299 	if (fka->fk_private.va < fkb->fk_private.va)
    300 		return -1;
    301 	if (fka->fk_private.va > fkb->fk_private.va)
    302 		return +1;
    303 	return 0;
    304 }
    305 
    306 static int
    307 compare_futex(void *cookie, const void *na, const void *nb)
    308 {
    309 	const struct futex *fa = na;
    310 	const struct futex *fb = nb;
    311 
    312 	return compare_futex_key(cookie, fa, &fb->fx_key);
    313 }
    314 
    315 static const rb_tree_ops_t futex_rb_ops = {
    316 	.rbto_compare_nodes = compare_futex,
    317 	.rbto_compare_key = compare_futex_key,
    318 	.rbto_node_offset = offsetof(struct futex, fx_node),
    319 };
    320 
    321 static int
    322 compare_futex_shared_key(void *cookie, const void *n, const void *k)
    323 {
    324 	const struct futex *fa = n;
    325 	const futex_key_t *fka = &fa->fx_key;
    326 	const futex_key_t *fkb = k;
    327 
    328 	return uvm_voaddr_compare(&fka->fk_shared, &fkb->fk_shared);
    329 }
    330 
    331 static int
    332 compare_futex_shared(void *cookie, const void *na, const void *nb)
    333 {
    334 	const struct futex *fa = na;
    335 	const struct futex *fb = nb;
    336 
    337 	return compare_futex_shared_key(cookie, fa, &fb->fx_key);
    338 }
    339 
    340 static const rb_tree_ops_t futex_shared_rb_ops = {
    341 	.rbto_compare_nodes = compare_futex_shared,
    342 	.rbto_compare_key = compare_futex_shared_key,
    343 	.rbto_node_offset = offsetof(struct futex, fx_node),
    344 };
    345 
    346 /*
    347  * futex_sq_lock2(f, f2)
    348  *
    349  *	Acquire the sleepq locks for f and f2, which may be null, or
    350  *	which may be the same.  If they are distinct, an arbitrary total
    351  *	order is chosen on the locks.
    352  *
    353  *	Callers should only ever acquire multiple sleepq locks
    354  *	simultaneously using futex_sq_lock2.
    355  */
    356 static void
    357 futex_sq_lock2(struct futex * const f, struct futex * const f2)
    358 {
    359 
    360 	/*
    361 	 * If both are null, do nothing; if one is null and the other
    362 	 * is not, lock the other and be done with it.
    363 	 */
    364 	if (f == NULL && f2 == NULL) {
    365 		return;
    366 	} else if (f == NULL) {
    367 		mutex_spin_enter(f2->fx_sq_lock);
    368 		return;
    369 	} else if (f2 == NULL) {
    370 		mutex_spin_enter(f->fx_sq_lock);
    371 		return;
    372 	}
    373 
    374 	kmutex_t * const m = f->fx_sq_lock;
    375 	kmutex_t * const m2 = f2->fx_sq_lock;
    376 
    377 	/* If both locks are the same, acquire only one. */
    378 	if (m == m2) {
    379 		mutex_spin_enter(m);
    380 		return;
    381 	}
    382 
    383 	/* Otherwise, use the ordering on the kva of the lock pointer.  */
    384 	if ((uintptr_t)m < (uintptr_t)m2) {
    385 		mutex_spin_enter(m);
    386 		mutex_spin_enter(m2);
    387 	} else {
    388 		mutex_spin_enter(m2);
    389 		mutex_spin_enter(m);
    390 	}
    391 }
    392 
    393 /*
    394  * futex_sq_unlock2(f, f2)
    395  *
    396  *	Release the sleep queue locks for f and f2, which may be null, or
    397  *	which may have the same underlying lock.
    398  */
    399 static void
    400 futex_sq_unlock2(struct futex * const f, struct futex * const f2)
    401 {
    402 
    403 	/*
    404 	 * If both are null, do nothing; if one is null and the other
    405 	 * is not, unlock the other and be done with it.
    406 	 */
    407 	if (f == NULL && f2 == NULL) {
    408 		return;
    409 	} else if (f == NULL) {
    410 		mutex_spin_exit(f2->fx_sq_lock);
    411 		return;
    412 	} else if (f2 == NULL) {
    413 		mutex_spin_exit(f->fx_sq_lock);
    414 		return;
    415 	}
    416 
    417 	kmutex_t * const m = f->fx_sq_lock;
    418 	kmutex_t * const m2 = f2->fx_sq_lock;
    419 
    420 	/* If both locks are the same, release only one. */
    421 	if (m == m2) {
    422 		mutex_spin_exit(m);
    423 		return;
    424 	}
    425 
    426 	/* Otherwise, use the ordering on the kva of the lock pointer.  */
    427 	if ((uintptr_t)m < (uintptr_t)m2) {
    428 		mutex_spin_exit(m2);
    429 		mutex_spin_exit(m);
    430 	} else {
    431 		mutex_spin_exit(m);
    432 		mutex_spin_exit(m2);
    433 	}
    434 }
    435 
    436 /*
    437  * futex_load(uaddr, kaddr)
    438  *
    439  *	Perform a single atomic load to read *uaddr, and return the
    440  *	result in *kaddr.  Return 0 on success, EFAULT if uaddr is not
    441  *	mapped.
    442  */
    443 static inline int
    444 futex_load(int *uaddr, int *kaddr)
    445 {
    446 	return ufetch_int((u_int *)uaddr, (u_int *)kaddr);
    447 }
    448 
    449 /*
    450  * futex_test(uaddr, expected)
    451  *
    452  *	True if *uaddr == expected.  False if *uaddr != expected, or if
    453  *	uaddr is not mapped.
    454  */
    455 static bool
    456 futex_test(int *uaddr, int expected)
    457 {
    458 	int val;
    459 	int error;
    460 
    461 	error = futex_load(uaddr, &val);
    462 	if (error)
    463 		return false;
    464 	return val == expected;
    465 }
    466 
    467 static pool_cache_t	futex_cache		__read_mostly;
    468 
    469 static int	futex_ctor(void *, void *, int);
    470 static void	futex_dtor(void *, void *);
    471 
    472 /*
    473  * futex_sys_init()
    474  *
    475  *	Initialize the futex subsystem.
    476  */
    477 void
    478 futex_sys_init(void)
    479 {
    480 
    481 	mutex_init(&futex_tab.lock, MUTEX_DEFAULT, IPL_NONE);
    482 	rb_tree_init(&futex_tab.va, &futex_rb_ops);
    483 	rb_tree_init(&futex_tab.oa, &futex_shared_rb_ops);
    484 
    485 	futex_cache = pool_cache_init(sizeof(struct futex),
    486 	    coherency_unit, 0, 0, "futex", NULL, IPL_NONE, futex_ctor,
    487 	    futex_dtor, NULL);
    488 }
    489 
    490 /*
    491  * futex_sys_fini()
    492  *
    493  *	Finalize the futex subsystem.
    494  */
    495 void
    496 futex_sys_fini(void)
    497 {
    498 
    499 	KASSERT(RB_TREE_MIN(&futex_tab.oa) == NULL);
    500 	KASSERT(RB_TREE_MIN(&futex_tab.va) == NULL);
    501 	mutex_destroy(&futex_tab.lock);
    502 }
    503 
    504 /*
    505  * futex_ctor()
    506  *
    507  *	Futex object constructor.
    508  */
    509 static int
    510 futex_ctor(void *arg __unused, void *obj, int flags __unused)
    511 {
    512 	extern sleepqlock_t sleepq_locks[SLEEPTAB_HASH_SIZE];
    513 	struct futex * const f = obj;
    514 
    515 	mutex_init(&f->fx_op_lock, MUTEX_DEFAULT, IPL_NONE);
    516 	f->fx_sq_lock = &sleepq_locks[SLEEPTAB_HASH(f)].lock;
    517 
    518 	sleepq_init(&f->fx_sqs[FUTEX_READERQ]);
    519 	sleepq_init(&f->fx_sqs[FUTEX_WRITERQ]);
    520 	f->fx_nwaiters[FUTEX_READERQ] = f->fx_nwaiters[FUTEX_WRITERQ] = 0;
    521 
    522 	return 0;
    523 }
    524 
    525 /*
    526  * futex_dtor()
    527  *
    528  *	Futex object destructor.
    529  */
    530 static void
    531 futex_dtor(void *arg __unused, void *obj)
    532 {
    533 	struct futex * const f = obj;
    534 
    535 	mutex_destroy(&f->fx_op_lock);
    536 	f->fx_sq_lock = NULL;
    537 }
    538 
    539 /*
    540  * futex_key_init(key, vm, va, shared)
    541  *
    542  *	Initialize a futex key for lookup, etc.
    543  */
    544 static int
    545 futex_key_init(futex_key_t *fk, struct vmspace *vm, vaddr_t va, bool shared)
    546 {
    547 	int error = 0;
    548 
    549 	if (__predict_false(shared)) {
    550 		if (!uvm_voaddr_acquire(&vm->vm_map, va, &fk->fk_shared))
    551 			error = EFAULT;
    552 	} else {
    553 		fk->fk_private.vmspace = vm;
    554 		fk->fk_private.va = va;
    555 	}
    556 
    557 	return error;
    558 }
    559 
    560 /*
    561  * futex_key_fini(key, shared)
    562  *
    563  *	Release a futex key.
    564  */
    565 static void
    566 futex_key_fini(futex_key_t *fk, bool shared)
    567 {
    568 	if (__predict_false(shared))
    569 		uvm_voaddr_release(&fk->fk_shared);
    570 	memset(fk, 0, sizeof(*fk));
    571 }
    572 
    573 /*
    574  * futex_create(fk, shared, class)
    575  *
    576  *	Create a futex.  Initial reference count is 1, representing the
    577  *	caller.  Returns NULL on failure.  Always takes ownership of the
    578  *	key, either transferring it to the newly-created futex, or releasing
    579  *	the key if creation fails.
    580  *
    581  *	Never sleeps for memory, but may sleep to acquire a lock.
    582  */
    583 static struct futex *
    584 futex_create(futex_key_t *fk, bool shared, uint8_t class)
    585 {
    586 	struct futex *f;
    587 
    588 	f = pool_cache_get(futex_cache, PR_NOWAIT);
    589 	if (f == NULL) {
    590 		futex_key_fini(fk, shared);
    591 		return NULL;
    592 	}
    593 	f->fx_key = *fk;
    594 	f->fx_refcnt = 1;
    595 	f->fx_shared = shared;
    596 	f->fx_on_tree = false;
    597 	f->fx_class = class;
    598 
    599 	return f;
    600 }
    601 
    602 /*
    603  * futex_destroy(f)
    604  *
    605  *	Destroy a futex created with futex_create.  Reference count
    606  *	must be zero.
    607  *
    608  *	May sleep.
    609  */
    610 static void
    611 futex_destroy(struct futex *f)
    612 {
    613 
    614 	ASSERT_SLEEPABLE();
    615 
    616 	KASSERT(atomic_load_relaxed(&f->fx_refcnt) == 0);
    617 	KASSERT(!f->fx_on_tree);
    618 	KASSERT(LIST_EMPTY(&f->fx_sqs[FUTEX_READERQ]));
    619 	KASSERT(LIST_EMPTY(&f->fx_sqs[FUTEX_WRITERQ]));
    620 	KASSERT(f->fx_nwaiters[FUTEX_READERQ] == 0);
    621 	KASSERT(f->fx_nwaiters[FUTEX_WRITERQ] == 0);
    622 
    623 	futex_key_fini(&f->fx_key, f->fx_shared);
    624 
    625 	pool_cache_put(futex_cache, f);
    626 }
    627 
    628 /*
    629  * futex_hold_count(f, n)
    630  *
    631  *	Attempt to acquire a reference to f.  Return 0 on success,
    632  *	ENFILE on too many references.
    633  *
    634  *	Never sleeps.
    635  */
    636 static int
    637 futex_hold_count(struct futex *f, unsigned long const count)
    638 {
    639 	unsigned long refcnt;
    640 
    641 	do {
    642 		refcnt = atomic_load_relaxed(&f->fx_refcnt);
    643 		if (ULONG_MAX - refcnt < count)
    644 			return ENFILE;
    645 	} while (atomic_cas_ulong(&f->fx_refcnt, refcnt,
    646 				  refcnt + count) != refcnt);
    647 
    648 	return 0;
    649 }
    650 #define	futex_hold(f)	futex_hold_count(f, 1)
    651 
    652 /*
    653  * futex_rele_count(f, n)
    654  *
    655  *	Release a reference to f acquired with futex_create or
    656  *	futex_hold.
    657  *
    658  *	May sleep to free f.
    659  */
    660 static void
    661 futex_rele_count(struct futex *f, unsigned long const count)
    662 {
    663 	unsigned long refcnt;
    664 
    665 	ASSERT_SLEEPABLE();
    666 
    667 	do {
    668 		refcnt = atomic_load_relaxed(&f->fx_refcnt);
    669 		KASSERT(refcnt >= count);
    670 		if (refcnt - count == 0)
    671 			goto trylast;
    672 	} while (atomic_cas_ulong(&f->fx_refcnt, refcnt,
    673 				  refcnt - count) != refcnt);
    674 	return;
    675 
    676 trylast:
    677 	KASSERT(count <= LONG_MAX);
    678 	mutex_enter(&futex_tab.lock);
    679 	if (atomic_add_long_nv(&f->fx_refcnt, -(long)count) == 0) {
    680 		if (f->fx_on_tree) {
    681 			if (__predict_false(f->fx_shared))
    682 				rb_tree_remove_node(&futex_tab.oa, f);
    683 			else
    684 				rb_tree_remove_node(&futex_tab.va, f);
    685 			f->fx_on_tree = false;
    686 		}
    687 	} else {
    688 		/* References remain -- don't destroy it.  */
    689 		f = NULL;
    690 	}
    691 	mutex_exit(&futex_tab.lock);
    692 	if (f != NULL)
    693 		futex_destroy(f);
    694 }
    695 #define	futex_rele(f)	futex_rele_count(f, 1)
    696 
    697 /*
    698  * futex_rele_count_not_last(f, n)
    699  *
    700  *	Release a reference to f acquired with futex_create or
    701  *	futex_hold.
    702  *
    703  *	This version asserts that we are not dropping the last
    704  *	reference to f.
    705  */
    706 static void
    707 futex_rele_count_not_last(struct futex *f, unsigned long const count)
    708 {
    709 	unsigned long refcnt;
    710 
    711 	do {
    712 		refcnt = atomic_load_relaxed(&f->fx_refcnt);
    713 		KASSERT(refcnt >= count);
    714 	} while (atomic_cas_ulong(&f->fx_refcnt, refcnt,
    715 				  refcnt - count) != refcnt);
    716 }
    717 
    718 /*
    719  * futex_lookup_by_key(key, shared, class, &f)
    720  *
    721  *	Try to find an existing futex matching the specified key.
    722  *	On success, return 0, set *fp to the futex found or to NULL
    723  *	if none was found, and increment its reference count if found.
    724  *
    725  *	Return ENFILE if reference count too high.
    726  *
    727  *	Internal lookup routine shared by futex_lookup() and
    728  *	futex_lookup_create().
    729  */
    730 static int
    731 futex_lookup_by_key(futex_key_t *fk, bool shared, uint8_t class,
    732     struct futex **fp)
    733 {
    734 	struct futex *f;
    735 	int error = 0;
    736 
    737 	mutex_enter(&futex_tab.lock);
    738 	if (__predict_false(shared)) {
    739 		f = rb_tree_find_node(&futex_tab.oa, fk);
    740 	} else {
    741 		f = rb_tree_find_node(&futex_tab.va, fk);
    742 	}
    743 	if (f) {
    744 		if (__predict_true(f->fx_class == class ||
    745 				   class == FUTEX_CLASS_ANY))
    746 			error = futex_hold(f);
    747 		else
    748 			error = EINVAL;
    749 		if (error)
    750 			f = NULL;
    751 	}
    752  	*fp = f;
    753 	mutex_exit(&futex_tab.lock);
    754 
    755 	return error;
    756 }
    757 
    758 /*
    759  * futex_insert(f, fp)
    760  *
    761  *	Try to insert the futex f into the tree by va.  If there
    762  *	already is a futex for its va, acquire a reference to it, and
    763  *	store it in *fp; otherwise store f in *fp.
    764  *
    765  *	Return 0 on success, ENFILE if there already is a futex but its
    766  *	reference count is too high.
    767  */
    768 static int
    769 futex_insert(struct futex *f, struct futex **fp)
    770 {
    771 	struct futex *f0;
    772 	int error;
    773 
    774 	KASSERT(atomic_load_relaxed(&f->fx_refcnt) != 0);
    775 	KASSERT(!f->fx_on_tree);
    776 
    777 	mutex_enter(&futex_tab.lock);
    778 	if (__predict_false(f->fx_shared))
    779 		f0 = rb_tree_insert_node(&futex_tab.oa, f);
    780 	else
    781 		f0 = rb_tree_insert_node(&futex_tab.va, f);
    782 	if (f0 == f) {
    783 		f->fx_on_tree = true;
    784 		error = 0;
    785 	} else {
    786 		KASSERT(atomic_load_relaxed(&f0->fx_refcnt) != 0);
    787 		KASSERT(f0->fx_on_tree);
    788 		error = futex_hold(f0);
    789 		if (error)
    790 			goto out;
    791 	}
    792 	*fp = f0;
    793 out:	mutex_exit(&futex_tab.lock);
    794 
    795 	return error;
    796 }
    797 
    798 /*
    799  * futex_lookup(uaddr, shared, class, &f)
    800  *
    801  *	Find a futex at the userland pointer uaddr in the current
    802  *	process's VM space.  On success, return the futex in f and
    803  *	increment its reference count.
    804  *
    805  *	Caller must call futex_rele when done.
    806  */
    807 static int
    808 futex_lookup(int *uaddr, bool shared, uint8_t class, struct futex **fp)
    809 {
    810 	futex_key_t fk;
    811 	struct vmspace *vm = curproc->p_vmspace;
    812 	vaddr_t va = (vaddr_t)uaddr;
    813 	int error;
    814 
    815 	/*
    816 	 * Reject unaligned user pointers so we don't cross page
    817 	 * boundaries and so atomics will work.
    818 	 */
    819 	if (__predict_false((va & 3) != 0))
    820 		return EINVAL;
    821 
    822 	/* Look it up. */
    823 	error = futex_key_init(&fk, vm, va, shared);
    824 	if (error)
    825 		return error;
    826 
    827 	error = futex_lookup_by_key(&fk, shared, class, fp);
    828 	futex_key_fini(&fk, shared);
    829 	if (error)
    830 		return error;
    831 
    832 	KASSERT(*fp == NULL || (*fp)->fx_shared == shared);
    833 	KASSERT(*fp == NULL || (*fp)->fx_class == class);
    834 	KASSERT(*fp == NULL || atomic_load_relaxed(&(*fp)->fx_refcnt) != 0);
    835 
    836 	/*
    837 	 * Success!  (Caller must still check whether we found
    838 	 * anything, but nothing went _wrong_ like trying to use
    839 	 * unmapped memory.)
    840 	 */
    841 	KASSERT(error == 0);
    842 
    843 	return error;
    844 }
    845 
    846 /*
    847  * futex_lookup_create(uaddr, shared, class, &f)
    848  *
    849  *	Find or create a futex at the userland pointer uaddr in the
    850  *	current process's VM space.  On success, return the futex in f
    851  *	and increment its reference count.
    852  *
    853  *	Caller must call futex_rele when done.
    854  */
    855 static int
    856 futex_lookup_create(int *uaddr, bool shared, uint8_t class, struct futex **fp)
    857 {
    858 	futex_key_t fk;
    859 	struct vmspace *vm = curproc->p_vmspace;
    860 	struct futex *f = NULL;
    861 	vaddr_t va = (vaddr_t)uaddr;
    862 	int error;
    863 
    864 	/*
    865 	 * Reject unaligned user pointers so we don't cross page
    866 	 * boundaries and so atomics will work.
    867 	 */
    868 	if (__predict_false((va & 3) != 0))
    869 		return EINVAL;
    870 
    871 	if (__predict_false(class == FUTEX_CLASS_ANY))
    872 		return EINVAL;
    873 
    874 	error = futex_key_init(&fk, vm, va, shared);
    875 	if (error)
    876 		return error;
    877 
    878 	/*
    879 	 * Optimistically assume there already is one, and try to find
    880 	 * it.
    881 	 */
    882 	error = futex_lookup_by_key(&fk, shared, class, fp);
    883 	if (error || *fp != NULL) {
    884 		/*
    885 		 * We either found one, or there was an error.
    886 		 * In either case, we are done with the key.
    887 		 */
    888 		futex_key_fini(&fk, shared);
    889 		goto out;
    890 	}
    891 
    892 	/*
    893 	 * Create a futex record.  This transfers ownership of the key
    894 	 * in all cases.
    895 	 */
    896 	f = futex_create(&fk, shared, class);
    897 	if (f == NULL) {
    898 		error = ENOMEM;
    899 		goto out;
    900 	}
    901 
    902 	/*
    903 	 * Insert our new futex, or use existing if someone else beat
    904 	 * us to it.
    905 	 */
    906 	error = futex_insert(f, fp);
    907 	if (error)
    908 		goto out;
    909 	if (*fp == f)
    910 		f = NULL;	/* don't release on exit */
    911 
    912 	/* Success!  */
    913 	KASSERT(error == 0);
    914 
    915 out:	if (f != NULL)
    916 		futex_rele(f);
    917 	KASSERT(error || *fp != NULL);
    918 	KASSERT(error || atomic_load_relaxed(&(*fp)->fx_refcnt) != 0);
    919 	return error;
    920 }
    921 
    922 /*
    923  * futex_unsleep:
    924  *
    925  *	Remove an LWP from the futex and sleep queue.  This is called when
    926  *	the LWP has not been awoken normally but instead interrupted: for
    927  *	example, when a signal is received.  Must be called with the LWP
    928  *	locked.  Will unlock if "unlock" is true.
    929  */
    930 static void
    931 futex_unsleep(lwp_t *l, bool unlock)
    932 {
    933 	struct futex *f __diagused;
    934 
    935 	f = (struct futex *)(uintptr_t)l->l_wchan;
    936 
    937 	KASSERT(mutex_owned(f->fx_sq_lock));
    938 
    939 	/* WRITERQ is more likely */
    940 	if (__predict_true(l->l_sleepq == &f->fx_sqs[FUTEX_WRITERQ])) {
    941 		KASSERT(f->fx_nwaiters[FUTEX_WRITERQ] > 0);
    942 		f->fx_nwaiters[FUTEX_WRITERQ]--;
    943 	} else {
    944 		KASSERT(l->l_sleepq == &f->fx_sqs[FUTEX_READERQ]);
    945 		KASSERT(f->fx_nwaiters[FUTEX_READERQ] > 0);
    946 		f->fx_nwaiters[FUTEX_READERQ]--;
    947 	}
    948 
    949 	sleepq_unsleep(l, unlock);
    950 }
    951 
    952 /*
    953  * futex_wait(f, q, deadline, clkid, bitset)
    954  *
    955  *	Wait on sleep queue q of f until the deadline on the clock clkid,
    956  *	or forever if deadline is NULL, for a futex wakeup.  Return 0 on
    957  *	explicit wakeup, ETIMEDOUT on timeout, EINTR on signal.
    958  */
    959 static int
    960 futex_wait(struct futex *f, int q, const struct timespec *deadline,
    961     clockid_t clkid, int bitset)
    962 {
    963 
    964 	/*
    965 	 * Some things to note about how this function works:
    966 	 *
    967 	 * ==> When we enter futex_wait(), the futex's op lock is
    968 	 * held, preventing us from missing wakes.  Once we are on
    969 	 * the futex's sleepq, it is safe to release the op lock.
    970 	 *
    971 	 * ==> We have a single early escape to avoid calling
    972 	 * sleepq_block(): our deadline already having passed.
    973 	 * If this is a no-timeout wait, or if the deadline has
    974 	 * not already passed, we are guaranteed to call sleepq_block().
    975 	 *
    976 	 * ==> sleepq_block() contains all of the necessary logic
    977 	 * for handling signals; we do not need to check for them
    978 	 * ourselves.
    979 	 *
    980 	 * ==> Once we have blocked, other futex operations will
    981 	 * be able to wake us or requeue us.  Until we have blocked,
    982 	 * those other futex operations will not be able to acquire
    983 	 * the sleepq locks in order to perform those operations,
    984 	 * even if we have dropped the op lock.  Thus, we will not
    985 	 * miss wakes.  This fundamental invariant is relied upon by
    986 	 * every other synchronization object in the kernel.
    987 	 *
    988 	 * ==> sleepq_block() returns for one of three reasons:
    989 	 *
    990 	 *	-- We were awakened.
    991 	 *	-- We were signaled.
    992 	 *	-- We timed out.
    993 	 *
    994 	 * ==> Once sleepq_block() returns, we will not call
    995 	 * sleepq_block() again.
    996 	 *
    997 	 * ==> If sleepq_block() returns due to being signaled,
    998 	 * we must check to see if we were also awakened.  This
    999 	 * is to handle the following sequence:
   1000 	 *
   1001 	 *	-- Futex wake takes us off sleepq, places us on runq.
   1002 	 *	-- We are signaled while sitting on the runq.
   1003 	 *	-- We run, and sleepq_block() notices the signal and
   1004 	 *	   returns  EINTR/ERESTART.
   1005 	 *
   1006 	 * In this situation, we want to indicate a successful wake
   1007 	 * to the caller, because that wake has been reported to the
   1008 	 * thread that issued it.
   1009 	 *
   1010 	 * Note that it is NOT possible for a wake to be issued after
   1011 	 * a signal; the LWP will already have been taken off the sleepq
   1012 	 * by the signal, so the wake will never happen.  Note that for
   1013 	 * this reason, we must *never* return ERESTART, because it could
   1014 	 * result in us waiting again with no one to awaken us.
   1015 	 */
   1016 
   1017 	struct lwp * const l = curlwp;
   1018 	int timo;
   1019 	int error;
   1020 
   1021 	/*
   1022 	 * Compute our timeout before taking any locks.
   1023 	 */
   1024 	if (deadline) {
   1025 		struct timespec ts;
   1026 
   1027 		/* Check our watch.  */
   1028 		error = clock_gettime1(clkid, &ts);
   1029 		if (error) {
   1030 			mutex_exit(&f->fx_op_lock);
   1031 			return error;
   1032 		}
   1033 
   1034 		/*
   1035 		 * If we're past the deadline, ETIMEDOUT.
   1036 		 */
   1037 		if (timespeccmp(deadline, &ts, <=)) {
   1038 			mutex_exit(&f->fx_op_lock);
   1039 			return ETIMEDOUT;
   1040 		} else {
   1041 			/* Count how much time is left.  */
   1042 			timespecsub(deadline, &ts, &ts);
   1043 			timo = tstohz(&ts);
   1044 			if (timo == 0) {
   1045 				/*
   1046 				 * tstohz() already rounds up, and
   1047 				 * a return value of 0 ticks means
   1048 				 * "expired now or in the past".
   1049 				 */
   1050 				mutex_exit(&f->fx_op_lock);
   1051 				return ETIMEDOUT;
   1052 			}
   1053 		}
   1054 	} else {
   1055 		timo = 0;
   1056 	}
   1057 
   1058 	/*
   1059 	 * Acquire in sleepq -> lwp order.  While we're at it, ensure
   1060 	 * that there's room for us to block.
   1061 	 */
   1062 	mutex_spin_enter(f->fx_sq_lock);
   1063 	if (__predict_false(q == FUTEX_READERQ &&
   1064 			    f->fx_nwaiters[q] == FUTEX_TID_MASK)) {
   1065 		mutex_spin_exit(f->fx_sq_lock);
   1066 		mutex_exit(&f->fx_op_lock);
   1067 		return ENFILE;
   1068 	}
   1069 	lwp_lock(l);
   1070 
   1071 	/*
   1072 	 * No need for locks here; until we're on a futex's sleepq, these
   1073 	 * values are private to the LWP, and the locks needed to get onto
   1074 	 * the sleepq provide the memory barriers needed to ensure visibility.
   1075 	 */
   1076 	l->l_futex = f;
   1077 	l->l_futex_wakesel = bitset;
   1078 
   1079 	/*
   1080 	 * This is an inlined bit of sleepq_enter();
   1081 	 * we already hold the lwp lock.
   1082 	 */
   1083 	lwp_unlock_to(l, f->fx_sq_lock);
   1084 	KERNEL_UNLOCK_ALL(NULL, &l->l_biglocks);
   1085 	KASSERT(lwp_locked(l, f->fx_sq_lock));
   1086 
   1087 	sleepq_enqueue(&f->fx_sqs[q], f, futex_wmesg, &futex_syncobj, true);
   1088 	f->fx_nwaiters[q]++;
   1089 
   1090 	/* We can now safely release the op lock. */
   1091 	mutex_exit(&f->fx_op_lock);
   1092 
   1093 	SDT_PROBE1(futex, wait, sleepq_block, entry, timo);
   1094 	error = sleepq_block(timo, true);
   1095 	SDT_PROBE1(futex, wait, sleepq_block, exit, error);
   1096 
   1097 	/*
   1098 	 * f is no longer valid; we may have been moved to another
   1099 	 * futex sleepq while we slept.
   1100 	 */
   1101 
   1102 	/*
   1103 	 * If something went wrong, we may still have a futex reference.
   1104 	 * Check for that here and drop it if so.  We can do this without
   1105 	 * taking any locks because l->l_futex is private to the LWP when
   1106 	 * not on any futex's sleepq, and we are not on any sleepq because
   1107 	 * we are running.
   1108 	 *
   1109 	 * Convert EWOULDBLOCK to ETIMEDOUT in case sleepq_block() returned
   1110 	 * EWOULDBLOCK, and ensure the only other error we return is EINTR.
   1111 	 */
   1112 	if (error) {
   1113 		f = l->l_futex;
   1114 		if (f != NULL) {
   1115 			SDT_PROBE0(futex, wait, finish, aborted);
   1116 			l->l_futex = NULL;
   1117 			futex_rele(f);
   1118 		} else {
   1119 			/* Raced with wakeup; report success. */
   1120 			SDT_PROBE0(futex, wait, finish, wakerace);
   1121 			error = 0;
   1122 		}
   1123 		if (error == EWOULDBLOCK)
   1124 			error = ETIMEDOUT;
   1125 		else if (error != ETIMEDOUT)
   1126 			error = EINTR;
   1127 	} else {
   1128 		f = l->l_futex;
   1129 		if (__predict_false(f != NULL)) {
   1130 			/*
   1131 			 * There are certain situations that can cause
   1132 			 * sleepq_block() to return 0 even if we were
   1133 			 * signalled (by e.g. SIGKILL).  In this case,
   1134 			 * we will need to release our futex hold and
   1135 			 * return EINTR (we're probably about to die
   1136 			 * anyway).
   1137 			 */
   1138 			SDT_PROBE0(futex, wait, finish, aborted);
   1139 			l->l_futex = NULL;
   1140 			futex_rele(f);
   1141 			error = EINTR;
   1142 		} else {
   1143 			SDT_PROBE0(futex, wait, finish, normally);
   1144 		}
   1145 	}
   1146 
   1147 	return error;
   1148 }
   1149 
   1150 /*
   1151  * futex_wake(f, q, nwake, f2, q2, nrequeue, bitset)
   1152  *
   1153  *	Wake up to nwake waiters on f matching bitset; then, if f2 is
   1154  *	provided, move up to nrequeue remaining waiters on f matching
   1155  *	bitset to f2.  Return the number of waiters actually woken.
   1156  *	Caller must hold the op locks of f and f2, if provided.
   1157  */
   1158 static unsigned
   1159 futex_wake(struct futex *f, int q, unsigned int const nwake,
   1160     struct futex *f2, int q2, unsigned int const nrequeue,
   1161     int bitset)
   1162 {
   1163 	struct lwp *l, *l_next;
   1164 	unsigned nwoken = 0;
   1165 	unsigned nrequeued = 0;
   1166 
   1167 	KASSERT(mutex_owned(&f->fx_op_lock));
   1168 	KASSERT(f2 == NULL || mutex_owned(&f2->fx_op_lock));
   1169 
   1170 	futex_sq_lock2(f, f2);
   1171 
   1172 	/* Wake up to nwake waiters, and count the number woken.  */
   1173 	LIST_FOREACH_SAFE(l, &f->fx_sqs[q], l_sleepchain, l_next) {
   1174 		if (nwoken == nwake)
   1175 			break;
   1176 
   1177 		KASSERT(l->l_syncobj == &futex_syncobj);
   1178 		KASSERT(l->l_wchan == (wchan_t)f);
   1179 		KASSERT(l->l_futex == f);
   1180 		KASSERT(l->l_sleepq == &f->fx_sqs[q]);
   1181 		KASSERT(l->l_mutex == f->fx_sq_lock);
   1182 
   1183 		if ((l->l_futex_wakesel & bitset) == 0)
   1184 			continue;
   1185 
   1186 		l->l_futex_wakesel = 0;
   1187 		l->l_futex = NULL;
   1188 		sleepq_remove(&f->fx_sqs[q], l);
   1189 		f->fx_nwaiters[q]--;
   1190 		nwoken++;
   1191 		/* hold counts adjusted in bulk below */
   1192 	}
   1193 
   1194 	if (nrequeue) {
   1195 		KASSERT(f2 != NULL);
   1196 		/* Move up to nrequeue waiters from f's queue to f2's queue. */
   1197 		LIST_FOREACH_SAFE(l, &f->fx_sqs[q], l_sleepchain, l_next) {
   1198 			if (nrequeued == nrequeue)
   1199 				break;
   1200 
   1201 			KASSERT(l->l_syncobj == &futex_syncobj);
   1202 			KASSERT(l->l_wchan == (wchan_t)f);
   1203 			KASSERT(l->l_futex == f);
   1204 			KASSERT(l->l_sleepq == &f->fx_sqs[q]);
   1205 			KASSERT(l->l_mutex == f->fx_sq_lock);
   1206 
   1207 			if ((l->l_futex_wakesel & bitset) == 0)
   1208 				continue;
   1209 
   1210 			l->l_futex = f2;
   1211 			sleepq_transfer(l, &f->fx_sqs[q],
   1212 			    &f2->fx_sqs[q2], f2, futex_wmesg,
   1213 			    &futex_syncobj, f2->fx_sq_lock, true);
   1214 			f->fx_nwaiters[q]--;
   1215 			f2->fx_nwaiters[q2]++;
   1216 			nrequeued++;
   1217 			/* hold counts adjusted in bulk below */
   1218 		}
   1219 		if (nrequeued) {
   1220 			/* XXX futex_hold() could theoretically fail here. */
   1221 			int hold_error __diagused =
   1222 			    futex_hold_count(f2, nrequeued);
   1223 			KASSERT(hold_error == 0);
   1224 		}
   1225 	}
   1226 
   1227 	/*
   1228 	 * Adjust the futex reference counts for the wakes and
   1229 	 * requeues.
   1230 	 */
   1231 	KASSERT(nwoken + nrequeued <= LONG_MAX);
   1232 	futex_rele_count_not_last(f, nwoken + nrequeued);
   1233 
   1234 	futex_sq_unlock2(f, f2);
   1235 
   1236 	/* Return the number of waiters woken and requeued.  */
   1237 	return nwoken + nrequeued;
   1238 }
   1239 
   1240 /*
   1241  * futex_op_lock(f)
   1242  *
   1243  *	Acquire the op lock of f.  Pair with futex_op_unlock.  Do
   1244  *	not use if caller needs to acquire two locks; use
   1245  *	futex_op_lock2 instead.
   1246  */
   1247 static void
   1248 futex_op_lock(struct futex *f)
   1249 {
   1250 	mutex_enter(&f->fx_op_lock);
   1251 }
   1252 
   1253 /*
   1254  * futex_op_unlock(f)
   1255  *
   1256  *	Release the op lock of f.
   1257  */
   1258 static void
   1259 futex_op_unlock(struct futex *f)
   1260 {
   1261 	mutex_exit(&f->fx_op_lock);
   1262 }
   1263 
   1264 /*
   1265  * futex_op_lock2(f, f2)
   1266  *
   1267  *	Acquire the op locks of both f and f2, which may be null, or
   1268  *	which may be the same.  If they are distinct, an arbitrary total
   1269  *	order is chosen on the locks.
   1270  *
   1271  *	Callers should only ever acquire multiple op locks
   1272  *	simultaneously using futex_op_lock2.
   1273  */
   1274 static void
   1275 futex_op_lock2(struct futex *f, struct futex *f2)
   1276 {
   1277 
   1278 	/*
   1279 	 * If both are null, do nothing; if one is null and the other
   1280 	 * is not, lock the other and be done with it.
   1281 	 */
   1282 	if (f == NULL && f2 == NULL) {
   1283 		return;
   1284 	} else if (f == NULL) {
   1285 		mutex_enter(&f2->fx_op_lock);
   1286 		return;
   1287 	} else if (f2 == NULL) {
   1288 		mutex_enter(&f->fx_op_lock);
   1289 		return;
   1290 	}
   1291 
   1292 	/* If both futexes are the same, acquire only one. */
   1293 	if (f == f2) {
   1294 		mutex_enter(&f->fx_op_lock);
   1295 		return;
   1296 	}
   1297 
   1298 	/* Otherwise, use the ordering on the kva of the futex pointer.  */
   1299 	if ((uintptr_t)f < (uintptr_t)f2) {
   1300 		mutex_enter(&f->fx_op_lock);
   1301 		mutex_enter(&f2->fx_op_lock);
   1302 	} else {
   1303 		mutex_enter(&f2->fx_op_lock);
   1304 		mutex_enter(&f->fx_op_lock);
   1305 	}
   1306 }
   1307 
   1308 /*
   1309  * futex_op_unlock2(f, f2)
   1310  *
   1311  *	Release the op locks of both f and f2, which may be null, or
   1312  *	which may be the same.
   1313  */
   1314 static void
   1315 futex_op_unlock2(struct futex *f, struct futex *f2)
   1316 {
   1317 
   1318 	/*
   1319 	 * If both are null, do nothing; if one is null and the other
   1320 	 * is not, unlock the other and be done with it.
   1321 	 */
   1322 	if (f == NULL && f2 == NULL) {
   1323 		return;
   1324 	} else if (f == NULL) {
   1325 		mutex_exit(&f2->fx_op_lock);
   1326 		return;
   1327 	} else if (f2 == NULL) {
   1328 		mutex_exit(&f->fx_op_lock);
   1329 		return;
   1330 	}
   1331 
   1332 	/* If both futexes are the same, release only one. */
   1333 	if (f == f2) {
   1334 		mutex_exit(&f->fx_op_lock);
   1335 		return;
   1336 	}
   1337 
   1338 	/* Otherwise, use the ordering on the kva of the futex pointer.  */
   1339 	if ((uintptr_t)f < (uintptr_t)f2) {
   1340 		mutex_exit(&f2->fx_op_lock);
   1341 		mutex_exit(&f->fx_op_lock);
   1342 	} else {
   1343 		mutex_exit(&f->fx_op_lock);
   1344 		mutex_exit(&f2->fx_op_lock);
   1345 	}
   1346 }
   1347 
   1348 /*
   1349  * futex_func_wait(uaddr, val, val3, timeout, clkid, clkflags, retval)
   1350  *
   1351  *	Implement futex(FUTEX_WAIT) and futex(FUTEX_WAIT_BITSET).
   1352  */
   1353 static int
   1354 futex_func_wait(bool shared, int *uaddr, int val, int val3,
   1355     const struct timespec *timeout, clockid_t clkid, int clkflags,
   1356     register_t *retval)
   1357 {
   1358 	struct futex *f;
   1359 	struct timespec ts;
   1360 	const struct timespec *deadline;
   1361 	int error;
   1362 
   1363 	/*
   1364 	 * If there's nothing to wait for, and nobody will ever wake
   1365 	 * us, then don't set anything up to wait -- just stop here.
   1366 	 */
   1367 	if (val3 == 0)
   1368 		return EINVAL;
   1369 
   1370 	/* Optimistically test before anything else.  */
   1371 	if (!futex_test(uaddr, val))
   1372 		return EAGAIN;
   1373 
   1374 	/* Determine a deadline on the specified clock.  */
   1375 	if (timeout == NULL) {
   1376 		deadline = timeout;
   1377 	} else if ((clkflags & TIMER_ABSTIME) == TIMER_ABSTIME) {
   1378 		/* Sanity-check the deadline. */
   1379 		if (timeout->tv_sec < 0 ||
   1380 		    timeout->tv_nsec < 0 ||
   1381 		    timeout->tv_nsec >= 1000000000L) {
   1382 			return EINVAL;
   1383 		}
   1384 		deadline = timeout;
   1385 	} else {
   1386 		struct timespec interval = *timeout;
   1387 
   1388 		error = itimespecfix(&interval);
   1389 		if (error)
   1390 			return error;
   1391 		error = clock_gettime1(clkid, &ts);
   1392 		if (error)
   1393 			return error;
   1394 		timespecadd(&ts, &interval, &ts);
   1395 		deadline = &ts;
   1396 	}
   1397 
   1398 	/* Get the futex, creating it if necessary.  */
   1399 	error = futex_lookup_create(uaddr, shared, FUTEX_CLASS_NORMAL, &f);
   1400 	if (error)
   1401 		return error;
   1402 	KASSERT(f);
   1403 
   1404 	/*
   1405 	 * Under the op lock, check the value again: if it has
   1406 	 * already changed, EAGAIN; otherwise enqueue the waiter.
   1407 	 * Since FUTEX_WAKE will use the same lock and be done after
   1408 	 * modifying the value, the order in which we check and enqueue
   1409 	 * is immaterial.
   1410 	 */
   1411 	futex_op_lock(f);
   1412 	if (!futex_test(uaddr, val)) {
   1413 		futex_op_unlock(f);
   1414 		error = EAGAIN;
   1415 		goto out;
   1416 	}
   1417 
   1418 	/*
   1419 	 * Now wait.  futex_wait() will drop our op lock once we
   1420 	 * are entered into the sleep queue, thus ensuring atomicity
   1421 	 * of wakes with respect to waits.
   1422 	 */
   1423 	error = futex_wait(f, FUTEX_WRITERQ, deadline, clkid, val3);
   1424 
   1425 	/*
   1426 	 * We cannot drop our reference to the futex here, because
   1427 	 * we might be enqueued on a different one when we are awakened.
   1428 	 * The references will be managed on our behalf in the requeue,
   1429 	 * wake, and error cases.
   1430 	 */
   1431 	f = NULL;
   1432 
   1433 	if (__predict_true(error == 0)) {
   1434 		/* Return 0 on success, error on failure. */
   1435 		*retval = 0;
   1436 	}
   1437 
   1438 out:	if (f != NULL)
   1439 		futex_rele(f);
   1440 	return error;
   1441 }
   1442 
   1443 /*
   1444  * futex_func_wake(uaddr, val, val3, retval)
   1445  *
   1446  *	Implement futex(FUTEX_WAKE) and futex(FUTEX_WAKE_BITSET).
   1447  */
   1448 static int
   1449 futex_func_wake(bool shared, int *uaddr, int val, int val3, register_t *retval)
   1450 {
   1451 	struct futex *f;
   1452 	unsigned int nwoken = 0;
   1453 	int error = 0;
   1454 
   1455 	/* Reject negative number of wakeups.  */
   1456 	if (val < 0) {
   1457 		error = EINVAL;
   1458 		goto out;
   1459 	}
   1460 
   1461 	/* Look up the futex, if any.  */
   1462 	error = futex_lookup(uaddr, shared, FUTEX_CLASS_NORMAL, &f);
   1463 	if (error)
   1464 		goto out;
   1465 
   1466 	/* If there's no futex, there are no waiters to wake.  */
   1467 	if (f == NULL)
   1468 		goto out;
   1469 
   1470 	/*
   1471 	 * Under f's op lock, wake the waiters and remember the
   1472 	 * number woken.
   1473 	 */
   1474 	futex_op_lock(f);
   1475 	nwoken = futex_wake(f, FUTEX_WRITERQ, val,
   1476 			    NULL, FUTEX_WRITERQ, 0, val3);
   1477 	futex_op_unlock(f);
   1478 
   1479 	/* Release the futex.  */
   1480 	futex_rele(f);
   1481 
   1482 out:
   1483 	/* Return the number of waiters woken.  */
   1484 	*retval = nwoken;
   1485 
   1486 	/* Success!  */
   1487 	return error;
   1488 }
   1489 
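/*
 * An illustrative sketch (hypothetical userland usage, not part of
 * this file) of the bitset variants handled above, in the informal
 * futex(op, ...) notation of the comment at the top of this file:
 * waiters subscribe to "channels" encoded as bits in val3, and a
 * waker can target a subset of them without disturbing the rest.
 *
 *	// Wait, interested only in wakeups on channel 0.
 *	futex(FUTEX_WAIT_BITSET, &word, val, NULL, NULL, __BIT(0));
 *
 *	// Wake up to INT_MAX waiters subscribed to channel 0.
 *	futex(FUTEX_WAKE_BITSET, &word, INT_MAX, NULL, NULL, __BIT(0));
 */
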
   1490 /*
   1491  * futex_func_requeue(op, uaddr, val, uaddr2, val2, val3, retval)
   1492  *
   1493  *	Implement futex(FUTEX_REQUEUE) and futex(FUTEX_CMP_REQUEUE).
   1494  */
   1495 static int
   1496 futex_func_requeue(bool shared, int op, int *uaddr, int val, int *uaddr2,
   1497     int val2, int val3, register_t *retval)
   1498 {
   1499 	struct futex *f = NULL, *f2 = NULL;
   1500 	unsigned nwoken = 0;	/* default to zero woken on early return */
   1501 	int error;
   1502 
   1503 	/* Reject negative number of wakeups or requeues. */
   1504 	if (val < 0 || val2 < 0) {
   1505 		error = EINVAL;
   1506 		goto out;
   1507 	}
   1508 
   1509 	/* Look up the source futex, if any. */
   1510 	error = futex_lookup(uaddr, shared, FUTEX_CLASS_NORMAL, &f);
   1511 	if (error)
   1512 		goto out;
   1513 
   1514 	/* If there is none, nothing to do. */
   1515 	if (f == NULL)
   1516 		goto out;
   1517 
   1518 	/*
   1519 	 * We may need to create the destination futex because it's
   1520 	 * entirely possible it does not currently have any waiters.
   1521 	 */
   1522 	error = futex_lookup_create(uaddr2, shared, FUTEX_CLASS_NORMAL, &f2);
   1523 	if (error)
   1524 		goto out;
   1525 
   1526 	/*
   1527 	 * Under the futexes' op locks, check the value; if
   1528 	 * unchanged from val3, wake the waiters.
   1529 	 */
   1530 	futex_op_lock2(f, f2);
   1531 	if (op == FUTEX_CMP_REQUEUE && !futex_test(uaddr, val3)) {
   1532 		error = EAGAIN;
   1533 	} else {
   1534 		error = 0;
   1535 		nwoken = futex_wake(f, FUTEX_WRITERQ, val,
   1536 				    f2, FUTEX_WRITERQ, val2,
   1537 				    FUTEX_BITSET_MATCH_ANY);
   1538 	}
   1539 	futex_op_unlock2(f, f2);
   1540 
   1541 out:
   1542 	/* Return the number of waiters woken.  */
   1543 	*retval = nwoken;
   1544 
   1545 	/* Release the futexes if we got them.  */
   1546 	if (f2)
   1547 		futex_rele(f2);
   1548 	if (f)
   1549 		futex_rele(f);
   1550 	return error;
   1551 }
   1552 
   1553 /*
   1554  * futex_validate_op_cmp(val3)
   1555  *
   1556  *	Validate an op/cmp argument for FUTEX_WAKE_OP.
   1557  */
   1558 static int
   1559 futex_validate_op_cmp(int val3)
   1560 {
   1561 	int op = __SHIFTOUT(val3, FUTEX_OP_OP_MASK);
   1562 	int cmp = __SHIFTOUT(val3, FUTEX_OP_CMP_MASK);
   1563 
   1564 	if (op & FUTEX_OP_OPARG_SHIFT) {
   1565 		int oparg = __SHIFTOUT(val3, FUTEX_OP_OPARG_MASK);
   1566 		if (oparg < 0)
   1567 			return EINVAL;
   1568 		if (oparg >= 32)
   1569 			return EINVAL;
   1570 		op &= ~FUTEX_OP_OPARG_SHIFT;
   1571 	}
   1572 
   1573 	switch (op) {
   1574 	case FUTEX_OP_SET:
   1575 	case FUTEX_OP_ADD:
   1576 	case FUTEX_OP_OR:
   1577 	case FUTEX_OP_ANDN:
   1578 	case FUTEX_OP_XOR:
   1579 		break;
   1580 	default:
   1581 		return EINVAL;
   1582 	}
   1583 
   1584 	switch (cmp) {
   1585 	case FUTEX_OP_CMP_EQ:
   1586 	case FUTEX_OP_CMP_NE:
   1587 	case FUTEX_OP_CMP_LT:
   1588 	case FUTEX_OP_CMP_LE:
   1589 	case FUTEX_OP_CMP_GT:
   1590 	case FUTEX_OP_CMP_GE:
   1591 		break;
   1592 	default:
   1593 		return EINVAL;
   1594 	}
   1595 
   1596 	return 0;
   1597 }
   1598 
   1599 /*
   1600  * futex_compute_op(oldval, val3)
   1601  *
   1602  *	Apply a FUTEX_WAKE_OP operation to oldval.
   1603  */
   1604 static int
   1605 futex_compute_op(int oldval, int val3)
   1606 {
   1607 	int op = __SHIFTOUT(val3, FUTEX_OP_OP_MASK);
   1608 	int oparg = __SHIFTOUT(val3, FUTEX_OP_OPARG_MASK);
   1609 
   1610 	if (op & FUTEX_OP_OPARG_SHIFT) {
   1611 		KASSERT(oparg >= 0);
   1612 		KASSERT(oparg < 32);
   1613 		oparg = 1u << oparg;
   1614 		op &= ~FUTEX_OP_OPARG_SHIFT;
   1615 	}
   1616 
   1617 	switch (op) {
   1618 	case FUTEX_OP_SET:
   1619 		return oparg;
   1620 
   1621 	case FUTEX_OP_ADD:
   1622 		/*
   1623 		 * Avoid signed arithmetic overflow by doing
   1624 		 * arithmetic unsigned and converting back to signed
   1625 		 * at the end.
   1626 		 */
   1627 		return (int)((unsigned)oldval + (unsigned)oparg);
   1628 
   1629 	case FUTEX_OP_OR:
   1630 		return oldval | oparg;
   1631 
   1632 	case FUTEX_OP_ANDN:
   1633 		return oldval & ~oparg;
   1634 
   1635 	case FUTEX_OP_XOR:
   1636 		return oldval ^ oparg;
   1637 
   1638 	default:
   1639 		panic("invalid futex op");
   1640 	}
   1641 }
   1642 
   1643 /*
   1644  * futex_compute_cmp(oldval, val3)
   1645  *
   1646  *	Apply a FUTEX_WAKE_OP comparison to oldval.
   1647  */
   1648 static bool
   1649 futex_compute_cmp(int oldval, int val3)
   1650 {
   1651 	int cmp = __SHIFTOUT(val3, FUTEX_OP_CMP_MASK);
   1652 	int cmparg = __SHIFTOUT(val3, FUTEX_OP_CMPARG_MASK);
   1653 
   1654 	switch (cmp) {
   1655 	case FUTEX_OP_CMP_EQ:
   1656 		return (oldval == cmparg);
   1657 
   1658 	case FUTEX_OP_CMP_NE:
   1659 		return (oldval != cmparg);
   1660 
   1661 	case FUTEX_OP_CMP_LT:
   1662 		return (oldval < cmparg);
   1663 
   1664 	case FUTEX_OP_CMP_LE:
   1665 		return (oldval <= cmparg);
   1666 
   1667 	case FUTEX_OP_CMP_GT:
   1668 		return (oldval > cmparg);
   1669 
   1670 	case FUTEX_OP_CMP_GE:
   1671 		return (oldval >= cmparg);
   1672 
   1673 	default:
   1674 		panic("invalid futex cmp operation");
   1675 	}
   1676 }
   1677 
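/*
 * An illustrative sketch (an assumption about userland usage, not
 * defined in this file) of how a FUTEX_WAKE_OP val3 argument can be
 * packed, mirroring the __SHIFTOUT() decoding in the helpers above
 * with its __SHIFTIN() counterpart:
 *
 *	// "Set *uaddr2 to 1; also wake uaddr2's waiters if the old
 *	// value of *uaddr2 was 0."
 *	int val3 = __SHIFTIN(FUTEX_OP_SET, FUTEX_OP_OP_MASK) |
 *	    __SHIFTIN(1, FUTEX_OP_OPARG_MASK) |
 *	    __SHIFTIN(FUTEX_OP_CMP_EQ, FUTEX_OP_CMP_MASK) |
 *	    __SHIFTIN(0, FUTEX_OP_CMPARG_MASK);
 *	futex(FUTEX_WAKE_OP, uaddr, nwake, uaddr2, nwake2, val3);
 */
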
   1678 /*
   1679  * futex_func_wake_op(uaddr, val, uaddr2, val2, val3, retval)
   1680  *
   1681  *	Implement futex(FUTEX_WAKE_OP).
   1682  */
   1683 static int
   1684 futex_func_wake_op(bool shared, int *uaddr, int val, int *uaddr2, int val2,
   1685     int val3, register_t *retval)
   1686 {
   1687 	struct futex *f = NULL, *f2 = NULL;
   1688 	int oldval, newval, actual;
   1689 	unsigned nwoken = 0;
   1690 	int error;
   1691 
   1692 	/* Reject negative number of wakeups.  */
   1693 	if (val < 0 || val2 < 0) {
   1694 		error = EINVAL;
   1695 		goto out;
   1696 	}
   1697 
   1698 	/* Reject invalid operations before we start doing things.  */
   1699 	if ((error = futex_validate_op_cmp(val3)) != 0)
   1700 		goto out;
   1701 
   1702 	/* Look up the first futex, if any.  */
   1703 	error = futex_lookup(uaddr, shared, FUTEX_CLASS_NORMAL, &f);
   1704 	if (error)
   1705 		goto out;
   1706 
   1707 	/* Look up the second futex, if any.  */
   1708 	error = futex_lookup(uaddr2, shared, FUTEX_CLASS_NORMAL, &f2);
   1709 	if (error)
   1710 		goto out;
   1711 
   1712 	/*
   1713 	 * Under the op locks:
   1714 	 *
   1715 	 * 1. Read/modify/write: *uaddr2 op= oparg.
   1716 	 * 2. Unconditionally wake uaddr.
   1717 	 * 3. Wake uaddr2 too, if the old value passes the cmp in val3.
   1718 	 */
   1719 	futex_op_lock2(f, f2);
   1720 	do {
   1721 		error = futex_load(uaddr2, &oldval);
   1722 		if (error)
   1723 			goto out_unlock;
   1724 		newval = futex_compute_op(oldval, val3);
   1725 		error = ucas_int(uaddr2, oldval, newval, &actual);
   1726 		if (error)
   1727 			goto out_unlock;
   1728 	} while (actual != oldval);
   1729 	nwoken = (f ? futex_wake(f, FUTEX_WRITERQ, val,
   1730 				 NULL, FUTEX_WRITERQ, 0,
   1731 				 FUTEX_BITSET_MATCH_ANY) : 0);
   1732 	if (f2 && futex_compute_cmp(oldval, val3))
   1733 		nwoken += futex_wake(f2, FUTEX_WRITERQ, val2,
   1734 				     NULL, FUTEX_WRITERQ, 0,
   1735 				     FUTEX_BITSET_MATCH_ANY);
   1736 
   1737 	/* Success! */
   1738 	error = 0;
   1739 out_unlock:
   1740 	futex_op_unlock2(f, f2);
   1741 
   1742 out:
   1743 	/* Return the number of waiters woken. */
   1744 	*retval = nwoken;
   1745 
   1746 	/* Release the futexes, if we got them. */
   1747 	if (f2)
   1748 		futex_rele(f2);
   1749 	if (f)
   1750 		futex_rele(f);
   1751 	return error;
   1752 }
   1753 
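/*
 * Illustrative sketch, not part of this file: FUTEX_WAKE_OP lets user
 * space update one word and wake two wait queues in a single system
 * call.  A schematic call (wrapper and argument order hypothetical)
 * that releases a lock word while signalling a condition word:
 *
 *	// Wake up to 1 waiter on &cond unconditionally; atomically
 *	// store 0 in lock; if the old value of lock was nonzero
 *	// (i.e. contended), also wake up to 1 waiter on &lock.
 *	futex(FUTEX_WAKE_OP, &cond, 1, NULL, &lock, 1,
 *	    FUTEX_OP(FUTEX_OP_SET, 0, FUTEX_OP_CMP_NE, 0));
 *
 * which corresponds to uaddr = &cond, val = 1, uaddr2 = &lock,
 * val2 = 1, and the val3 encoding sketched above.
 */
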
   1754 /*
   1755  * do_futex(uaddr, op, val, timeout, uaddr2, val2, val3)
   1756  *
   1757  *	Implement the futex system call with all the parameters
   1758  *	parsed out.
   1759  */
   1760 int
   1761 do_futex(int *uaddr, int op, int val, const struct timespec *timeout,
   1762     int *uaddr2, int val2, int val3, register_t *retval)
   1763 {
   1764 	const bool shared = (op & FUTEX_PRIVATE_FLAG) ? false : true;
   1765 	const clockid_t clkid = (op & FUTEX_CLOCK_REALTIME) ? CLOCK_REALTIME
   1766 							    : CLOCK_MONOTONIC;
   1767 	int error;
   1768 
   1769 	op &= FUTEX_CMD_MASK;
   1770 
   1771 	switch (op) {
   1772 	case FUTEX_WAIT:
   1773 		SDT_PROBE6(futex, func, wait, entry,
   1774 		    uaddr, val, FUTEX_BITSET_MATCH_ANY, timeout,
   1775 		    TIMER_RELTIME, op & ~FUTEX_CMD_MASK);
   1776 		error = futex_func_wait(shared, uaddr, val,
   1777 		    FUTEX_BITSET_MATCH_ANY, timeout, clkid, TIMER_RELTIME,
   1778 		    retval);
   1779 		SDT_PROBE1(futex, func, wait, exit, error);
   1780 		break;
   1781 
   1782 	case FUTEX_WAIT_BITSET:
   1783 		SDT_PROBE6(futex, func, wait, entry,
   1784 		    uaddr, val, val3, timeout,
   1785 		    TIMER_ABSTIME, op & ~FUTEX_CMD_MASK);
   1786 		error = futex_func_wait(shared, uaddr, val, val3, timeout,
   1787 		    clkid, TIMER_ABSTIME, retval);
   1788 		SDT_PROBE1(futex, func, wait, exit, error);
   1789 		break;
   1790 
   1791 	case FUTEX_WAKE:
   1792 		SDT_PROBE4(futex, func, wake, entry,
   1793 		    uaddr, val, FUTEX_BITSET_MATCH_ANY, op & ~FUTEX_CMD_MASK);
   1794 		error = futex_func_wake(shared, uaddr, val,
   1795 		    FUTEX_BITSET_MATCH_ANY, retval);
   1796 		SDT_PROBE2(futex, func, wake, exit, error, *retval);
   1797 		break;
   1798 
   1799 	case FUTEX_WAKE_BITSET:
   1800 		SDT_PROBE4(futex, func, wake, entry,
   1801 		    uaddr, val, val3, op & ~FUTEX_CMD_MASK);
   1802 		error = futex_func_wake(shared, uaddr, val, val3, retval);
   1803 		SDT_PROBE2(futex, func, wake, exit, error, *retval);
   1804 		break;
   1805 
   1806 	case FUTEX_REQUEUE:
   1807 		SDT_PROBE5(futex, func, requeue, entry,
   1808 		    uaddr, val, uaddr2, val2, op & ~FUTEX_CMD_MASK);
   1809 		error = futex_func_requeue(shared, op, uaddr, val, uaddr2,
   1810 		    val2, 0, retval);
   1811 		SDT_PROBE2(futex, func, requeue, exit, error, *retval);
   1812 		break;
   1813 
   1814 	case FUTEX_CMP_REQUEUE:
   1815 		SDT_PROBE6(futex, func, cmp_requeue, entry,
   1816 		    uaddr, val, uaddr2, val2, val3, op & ~FUTEX_CMD_MASK);
   1817 		error = futex_func_requeue(shared, op, uaddr, val, uaddr2,
   1818 		    val2, val3, retval);
   1819 		SDT_PROBE2(futex, func, cmp_requeue, exit, error, *retval);
   1820 		break;
   1821 
   1822 	case FUTEX_WAKE_OP:
   1823 		SDT_PROBE6(futex, func, wake_op, entry,
   1824 		    uaddr, val, uaddr2, val2, val3, op & ~FUTEX_CMD_MASK);
   1825 		error = futex_func_wake_op(shared, uaddr, val, uaddr2, val2,
   1826 		    val3, retval);
   1827 		SDT_PROBE2(futex, func, wake_op, exit, error, *retval);
   1828 		break;
   1829 
   1830 	case FUTEX_FD:
   1831 	default:
   1832 		error = ENOSYS;
   1833 		break;
   1834 	}
   1835 
   1836 	return error;
   1837 }
   1838 
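/*
 * Illustrative sketch, not part of this file: as dispatched above,
 * FUTEX_WAIT takes a relative timeout while FUTEX_WAIT_BITSET takes
 * an absolute one, measured on CLOCK_REALTIME if FUTEX_CLOCK_REALTIME
 * is set and on CLOCK_MONOTONIC otherwise.  A schematic
 * absolute-timeout wait (wrapper and argument order hypothetical):
 *
 *	struct timespec abs;
 *	clock_gettime(CLOCK_MONOTONIC, &abs);
 *	abs.tv_sec += 5;	// give up five seconds from now
 *	futex(FUTEX_WAIT_BITSET, &lock, v, &abs, NULL, 0,
 *	    FUTEX_BITSET_MATCH_ANY);
 */
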
   1839 /*
   1840  * sys___futex(l, uap, retval)
   1841  *
   1842  *	__futex(2) system call: generic futex operations.
   1843  */
   1844 int
   1845 sys___futex(struct lwp *l, const struct sys___futex_args *uap,
   1846     register_t *retval)
   1847 {
   1848 	/* {
   1849 		syscallarg(int *) uaddr;
   1850 		syscallarg(int) op;
   1851 		syscallarg(int) val;
   1852 		syscallarg(const struct timespec *) timeout;
   1853 		syscallarg(int *) uaddr2;
   1854 		syscallarg(int) val2;
   1855 		syscallarg(int) val3;
   1856 	} */
   1857 	struct timespec ts, *tsp;
   1858 	int error;
   1859 
   1860 	/*
   1861 	 * Copy in the timeout argument, if specified.
   1862 	 */
   1863 	if (SCARG(uap, timeout)) {
   1864 		error = copyin(SCARG(uap, timeout), &ts, sizeof(ts));
   1865 		if (error)
   1866 			return error;
   1867 		tsp = &ts;
   1868 	} else {
   1869 		tsp = NULL;
   1870 	}
   1871 
   1872 	return do_futex(SCARG(uap, uaddr), SCARG(uap, op), SCARG(uap, val),
   1873 	    tsp, SCARG(uap, uaddr2), SCARG(uap, val2), SCARG(uap, val3),
   1874 	    retval);
   1875 }
   1876 
   1877 /*
   1878  * sys___futex_set_robust_list(l, uap, retval)
   1879  *
   1880  *	__futex_set_robust_list(2) system call for robust futexes.
   1881  */
   1882 int
   1883 sys___futex_set_robust_list(struct lwp *l,
   1884     const struct sys___futex_set_robust_list_args *uap, register_t *retval)
   1885 {
   1886 	/* {
   1887 		syscallarg(void *) head;
   1888 		syscallarg(size_t) len;
   1889 	} */
   1890 	void *head = SCARG(uap, head);
   1891 
   1892 	if (SCARG(uap, len) != _FUTEX_ROBUST_HEAD_SIZE)
   1893 		return EINVAL;
   1894 	if ((uintptr_t)head % sizeof(u_long))
   1895 		return EINVAL;
   1896 
   1897 	l->l_robust_head = (uintptr_t)head;
   1898 
   1899 	return 0;
   1900 }
   1901 
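/*
 * Illustrative sketch, not part of this file: the robust list head
 * registered above is _FUTEX_ROBUST_HEAD_NWORDS pointer-sized words,
 * indexed by _FUTEX_ROBUST_HEAD_LIST, _FUTEX_ROBUST_HEAD_OFFSET and
 * _FUTEX_ROBUST_HEAD_PENDING (see futex_release_all_lwp() below).
 * A hypothetical userland view, assuming a Linux-compatible layout:
 *
 *	struct robust_head {
 *		void	*list;		// circular list of held robust
 *					// futex entries; points back
 *					// at the head when empty
 *		long	 offset;	// signed offset from an entry's
 *					// link word to its futex word
 *		void	*pending;	// entry currently being
 *					// acquired or released, if any
 *	};
 *
 * Each list entry is the address of a per-lock link word; the lock's
 * futex word then lives at (entry + offset).
 */
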
   1902 /*
   1903  * sys___futex_get_robust_list(l, uap, retval)
   1904  *
   1905  *	__futex_get_robust_list(2) system call for robust futexes.
   1906  */
   1907 int
   1908 sys___futex_get_robust_list(struct lwp *l,
   1909     const struct sys___futex_get_robust_list_args *uap, register_t *retval)
   1910 {
   1911 	/* {
   1912 		syscallarg(lwpid_t) lwpid;
   1913 		syscallarg(void **) headp;
   1914 		syscallarg(size_t *) lenp;
   1915 	} */
   1916 	void *head;
   1917 	const size_t len = _FUTEX_ROBUST_HEAD_SIZE;
   1918 	int error;
   1919 
   1920 	error = futex_robust_head_lookup(l, SCARG(uap, lwpid), &head);
   1921 	if (error)
   1922 		return error;
   1923 
   1924 	/* Copy out the head pointer and the head structure length. */
   1925 	error = copyout(&head, SCARG(uap, headp), sizeof(head));
   1926 	if (__predict_true(error == 0)) {
   1927 		error = copyout(&len, SCARG(uap, lenp), sizeof(len));
   1928 	}
   1929 
   1930 	return error;
   1931 }
   1932 
   1933 /*
    1934  * release_futex(uptr, tid)
    1935  *
    1936  *	Try to release the robust futex at uptr in the current process
   1937  *	on lwp exit.  If anything goes wrong, silently fail.  It is the
   1938  *	userland program's obligation to arrange correct behaviour.
   1939  */
   1940 static void
   1941 release_futex(uintptr_t const uptr, lwpid_t const tid, bool const is_pi,
   1942     bool const is_pending)
   1943 {
   1944 	int *uaddr;
   1945 	struct futex *f;
   1946 	int oldval, newval, actual;
   1947 	int error;
   1948 	bool shared;
   1949 
   1950 	/* If it's misaligned, tough.  */
   1951 	if (__predict_false(uptr & 3))
   1952 		return;
   1953 	uaddr = (int *)uptr;
   1954 
   1955 	error = futex_load(uaddr, &oldval);
   1956 	if (__predict_false(error))
   1957 		return;
   1958 
   1959 	/*
   1960 	 * There are two race conditions we need to handle here:
   1961 	 *
   1962 	 * 1. User space cleared the futex word but died before
   1963 	 *    being able to issue the wakeup.  No wakeups will
   1964 	 *    ever be issued, oops!
   1965 	 *
   1966 	 * 2. Awakened waiter died before being able to acquire
   1967 	 *    the futex in user space.  Any other waiters are
   1968 	 *    now stuck, oops!
   1969 	 *
   1970 	 * In both of these cases, the futex word will be 0 (because
   1971 	 * it's updated before the wake is issued).  The best we can
   1972 	 * do is detect this situation if it's the pending futex and
   1973 	 * issue a wake without modifying the futex word.
   1974 	 *
   1975 	 * XXX eventual PI handling?
   1976 	 */
   1977 	if (__predict_false(is_pending && (oldval & ~FUTEX_WAITERS) == 0)) {
   1978 		register_t retval;
   1979 		error = futex_func_wake(/*shared*/true, uaddr, 1,
   1980 		    FUTEX_BITSET_MATCH_ANY, &retval);
   1981 		if (error != 0 || retval == 0)
   1982 			(void) futex_func_wake(/*shared*/false, uaddr, 1,
   1983 			    FUTEX_BITSET_MATCH_ANY, &retval);
   1984 		return;
   1985 	}
   1986 
   1987 	/* Optimistically test whether we need to do anything at all.  */
   1988 	if ((oldval & FUTEX_TID_MASK) != tid)
   1989 		return;
   1990 
   1991 	/*
   1992 	 * We need to handle the case where this thread owned the futex,
   1993 	 * but it was uncontended.  In this case, there won't be any
   1994 	 * kernel state to look up.  All we can do is mark the futex
   1995 	 * as a zombie to be mopped up the next time another thread
   1996 	 * attempts to acquire it.
   1997 	 *
    1998 	 * N.B. It's important to ensure we set FUTEX_OWNER_DIED in
    1999 	 * this loop, even if waiters appear while we are doing
    2000 	 * so.  This is because FUTEX_WAITERS is set by user space
   2001 	 * before calling __futex() to wait, and the futex needs
   2002 	 * to be marked as a zombie when the new waiter gets into
   2003 	 * the kernel.
   2004 	 */
   2005 	if ((oldval & FUTEX_WAITERS) == 0) {
   2006 		do {
   2007 			error = futex_load(uaddr, &oldval);
   2008 			if (error)
   2009 				return;
   2010 			if ((oldval & FUTEX_TID_MASK) != tid)
   2011 				return;
   2012 			newval = oldval | FUTEX_OWNER_DIED;
   2013 			error = ucas_int(uaddr, oldval, newval, &actual);
   2014 			if (error)
   2015 				return;
   2016 		} while (actual != oldval);
   2017 
   2018 		/*
    2019 		 * If there is still no indication of waiters, then there is
   2020 		 * no more work for us to do.
   2021 		 */
   2022 		if ((oldval & FUTEX_WAITERS) == 0)
   2023 			return;
   2024 	}
   2025 
   2026 	/*
   2027 	 * Look for a futex.  Try shared first, then private.  If we
    2028 	 * can't find one, tough.
   2029 	 *
   2030 	 * Note: the assumption here is that anyone placing a futex on
   2031 	 * the robust list is adhering to the rules, regardless of the
   2032 	 * futex class.
   2033 	 */
   2034 	for (f = NULL, shared = true; f == NULL; shared = false) {
   2035 		error = futex_lookup(uaddr, shared, FUTEX_CLASS_ANY, &f);
   2036 		if (error)
   2037 			return;
   2038 		if (f == NULL && shared == false)
   2039 			return;
   2040 	}
   2041 
   2042 	/* Work under the futex op lock.  */
   2043 	futex_op_lock(f);
   2044 
   2045 	/*
   2046 	 * Fetch the word: if the tid doesn't match ours, skip;
   2047 	 * otherwise, set the owner-died bit, atomically.
   2048 	 */
   2049 	do {
   2050 		error = futex_load(uaddr, &oldval);
   2051 		if (error)
   2052 			goto out;
   2053 		if ((oldval & FUTEX_TID_MASK) != tid)
   2054 			goto out;
   2055 		newval = oldval | FUTEX_OWNER_DIED;
   2056 		error = ucas_int(uaddr, oldval, newval, &actual);
   2057 		if (error)
   2058 			goto out;
   2059 	} while (actual != oldval);
   2060 
   2061 	/*
   2062 	 * If there may be waiters, try to wake one.  If anything goes
   2063 	 * wrong, tough.
   2064 	 *
   2065 	 * XXX eventual PI handling?
   2066 	 */
   2067 	if (oldval & FUTEX_WAITERS) {
   2068 		(void)futex_wake(f, FUTEX_WRITERQ, 1,
   2069 				 NULL, FUTEX_WRITERQ, 0,
   2070 				 FUTEX_BITSET_MATCH_ANY);
   2071 	}
   2072 
   2073 	/* Unlock the queue and release the futex.  */
   2074 out:	futex_op_unlock(f);
   2075 	futex_rele(f);
   2076 }
   2077 
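/*
 * Illustrative sketch, not part of this file: user space is expected
 * to notice the FUTEX_OWNER_DIED marking left by release_futex() the
 * next time the lock is acquired and to repair the protected state,
 * roughly (my_tid and recover_state() are hypothetical names):
 *
 *	int v = lock;
 *	if (v & FUTEX_OWNER_DIED) {
 *		// Previous owner died holding the lock; take it over,
 *		// preserving any waiter indication, then restore the
 *		// protected data to a consistent state before use.
 *		if (atomic_cas_uint(&lock, v,
 *		    (v & FUTEX_WAITERS) | my_tid) == v)
 *			recover_state();
 *	}
 */
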
   2078 /*
   2079  * futex_robust_head_lookup(l, lwpid)
   2080  *
   2081  *	Helper function to look up a robust head by LWP ID.
   2082  */
   2083 int
   2084 futex_robust_head_lookup(struct lwp *l, lwpid_t lwpid, void **headp)
   2085 {
   2086 	struct proc *p = l->l_proc;
   2087 
   2088 	/* Find the other lwp, if requested; otherwise use our robust head.  */
   2089 	if (lwpid) {
   2090 		mutex_enter(p->p_lock);
   2091 		l = lwp_find(p, lwpid);
   2092 		if (l == NULL) {
   2093 			mutex_exit(p->p_lock);
   2094 			return ESRCH;
   2095 		}
   2096 		*headp = (void *)l->l_robust_head;
   2097 		mutex_exit(p->p_lock);
   2098 	} else {
   2099 		*headp = (void *)l->l_robust_head;
   2100 	}
   2101 	return 0;
   2102 }
   2103 
   2104 /*
   2105  * futex_fetch_robust_head(uaddr)
   2106  *
   2107  *	Helper routine to fetch the futex robust list head that
   2108  *	handles 32-bit binaries running on 64-bit kernels.
   2109  */
   2110 static int
   2111 futex_fetch_robust_head(uintptr_t uaddr, u_long *rhead)
   2112 {
   2113 #ifdef _LP64
   2114 	if (curproc->p_flag & PK_32) {
   2115 		uint32_t rhead32[_FUTEX_ROBUST_HEAD_NWORDS];
   2116 		int error;
   2117 
   2118 		error = copyin((void *)uaddr, rhead32, sizeof(rhead32));
   2119 		if (__predict_true(error == 0)) {
   2120 			for (int i = 0; i < _FUTEX_ROBUST_HEAD_NWORDS; i++) {
   2121 				if (i == _FUTEX_ROBUST_HEAD_OFFSET) {
   2122 					/*
   2123 					 * Make sure the offset is sign-
   2124 					 * extended.
   2125 					 */
   2126 					rhead[i] = (int32_t)rhead32[i];
   2127 				} else {
   2128 					rhead[i] = rhead32[i];
   2129 				}
   2130 			}
   2131 		}
   2132 		return error;
   2133 	}
    2134 #endif /* _LP64 */
   2135 
   2136 	return copyin((void *)uaddr, rhead,
   2137 	    sizeof(*rhead) * _FUTEX_ROBUST_HEAD_NWORDS);
   2138 }
   2139 
   2140 /*
   2141  * futex_decode_robust_word(word)
   2142  *
   2143  *	Decode a robust futex list word into the entry and entry
   2144  *	properties.
   2145  */
   2146 static inline void
   2147 futex_decode_robust_word(uintptr_t const word, uintptr_t * const entry,
   2148     bool * const is_pi)
   2149 {
   2150 	*is_pi = (word & _FUTEX_ROBUST_ENTRY_PI) ? true : false;
   2151 	*entry = word & ~_FUTEX_ROBUST_ENTRY_PI;
   2152 }
   2153 
   2154 /*
   2155  * futex_fetch_robust_entry(uaddr)
   2156  *
   2157  *	Helper routine to fetch and decode a robust futex entry
   2158  *	that handles 32-bit binaries running on 64-bit kernels.
   2159  */
   2160 static int
   2161 futex_fetch_robust_entry(uintptr_t const uaddr, uintptr_t * const valp,
   2162     bool * const is_pi)
   2163 {
   2164 	uintptr_t val = 0;
   2165 	int error = 0;
   2166 
   2167 #ifdef _LP64
   2168 	if (curproc->p_flag & PK_32) {
   2169 		uint32_t val32;
   2170 
   2171 		error = ufetch_32((uint32_t *)uaddr, &val32);
   2172 		if (__predict_true(error == 0))
   2173 			val = val32;
   2174 	} else
   2175 #endif /* _LP64 */
   2176 		error = ufetch_long((u_long *)uaddr, (u_long *)&val);
   2177 	if (__predict_false(error))
   2178 		return error;
   2179 
   2180 	futex_decode_robust_word(val, valp, is_pi);
   2181 	return 0;
   2182 }
   2183 
   2184 /*
   2185  * futex_release_all_lwp(l, tid)
   2186  *
   2187  *	Release all l's robust futexes.  If anything looks funny in
   2188  *	the process, give up -- it's userland's responsibility to dot
   2189  *	the i's and cross the t's.
   2190  */
   2191 void
   2192 futex_release_all_lwp(struct lwp * const l, lwpid_t const tid)
   2193 {
   2194 	u_long rhead[_FUTEX_ROBUST_HEAD_NWORDS];
   2195 	int limit = 1000000;
   2196 	int error;
   2197 
   2198 	/* If there's no robust list there's nothing to do. */
   2199 	if (l->l_robust_head == 0)
   2200 		return;
   2201 
   2202 	/* Read the final snapshot of the robust list head. */
   2203 	error = futex_fetch_robust_head(l->l_robust_head, rhead);
   2204 	if (error) {
   2205 		printf("WARNING: pid %jd (%s) lwp %jd tid %jd:"
   2206 		    " unmapped robust futex list head\n",
   2207 		    (uintmax_t)l->l_proc->p_pid, l->l_proc->p_comm,
   2208 		    (uintmax_t)l->l_lid, (uintmax_t)tid);
   2209 		return;
   2210 	}
   2211 
   2212 	const long offset = (long)rhead[_FUTEX_ROBUST_HEAD_OFFSET];
   2213 
   2214 	uintptr_t next, pending;
   2215 	bool is_pi, pending_is_pi;
   2216 
   2217 	futex_decode_robust_word(rhead[_FUTEX_ROBUST_HEAD_LIST],
   2218 	    &next, &is_pi);
   2219 	futex_decode_robust_word(rhead[_FUTEX_ROBUST_HEAD_PENDING],
   2220 	    &pending, &pending_is_pi);
   2221 
   2222 	/*
   2223 	 * Walk down the list of locked futexes and release them, up
   2224 	 * to one million of them before we give up.
   2225 	 */
   2226 
   2227 	while (next != l->l_robust_head && limit-- > 0) {
   2228 		/* pending handled below. */
   2229 		if (next != pending)
   2230 			release_futex(next + offset, tid, is_pi, false);
   2231 		error = futex_fetch_robust_entry(next, &next, &is_pi);
   2232 		if (error)
   2233 			break;
   2234 		preempt_point();
   2235 	}
   2236 	if (limit <= 0) {
   2237 		printf("WARNING: pid %jd (%s) lwp %jd tid %jd:"
   2238 		    " exhausted robust futex limit\n",
   2239 		    (uintmax_t)l->l_proc->p_pid, l->l_proc->p_comm,
   2240 		    (uintmax_t)l->l_lid, (uintmax_t)tid);
   2241 	}
   2242 
   2243 	/* If there's a pending futex, it may need to be released too. */
   2244 	if (pending != 0) {
   2245 		release_futex(pending + offset, tid, pending_is_pi, true);
   2246 	}
   2247 }
   2248