sys_futex.c revision 1.12.4.6
      1  1.12.4.6       chs /*	$NetBSD: sys_futex.c,v 1.12.4.6 2021/11/01 08:40:16 chs Exp $	*/
      2       1.1   thorpej 
      3       1.1   thorpej /*-
      4       1.1   thorpej  * Copyright (c) 2018, 2019, 2020 The NetBSD Foundation, Inc.
      5       1.1   thorpej  * All rights reserved.
      6       1.1   thorpej  *
      7       1.1   thorpej  * This code is derived from software contributed to The NetBSD Foundation
      8       1.1   thorpej  * by Taylor R. Campbell and Jason R. Thorpe.
      9       1.1   thorpej  *
     10       1.1   thorpej  * Redistribution and use in source and binary forms, with or without
     11       1.1   thorpej  * modification, are permitted provided that the following conditions
     12       1.1   thorpej  * are met:
     13       1.1   thorpej  * 1. Redistributions of source code must retain the above copyright
     14       1.1   thorpej  *    notice, this list of conditions and the following disclaimer.
     15       1.1   thorpej  * 2. Redistributions in binary form must reproduce the above copyright
     16       1.1   thorpej  *    notice, this list of conditions and the following disclaimer in the
     17       1.1   thorpej  *    documentation and/or other materials provided with the distribution.
     18       1.1   thorpej  *
     19       1.1   thorpej  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
     20       1.1   thorpej  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
     21       1.1   thorpej  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     22       1.1   thorpej  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
     23       1.1   thorpej  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
     24       1.1   thorpej  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
     25       1.1   thorpej  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
     26       1.1   thorpej  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
     27       1.1   thorpej  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
     28       1.1   thorpej  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
     29       1.1   thorpej  * POSSIBILITY OF SUCH DAMAGE.
     30       1.1   thorpej  */
     31       1.1   thorpej 
     32       1.1   thorpej #include <sys/cdefs.h>
     33  1.12.4.6       chs __KERNEL_RCSID(0, "$NetBSD: sys_futex.c,v 1.12.4.6 2021/11/01 08:40:16 chs Exp $");
     34       1.1   thorpej 
     35       1.1   thorpej /*
     36       1.1   thorpej  * Futexes
     37       1.1   thorpej  *
     38       1.1   thorpej  *	The futex system call coordinates notifying threads waiting for
     39       1.1   thorpej  *	changes on a 32-bit word of memory.  The word can be managed by
     40       1.1   thorpej  *	CPU atomic operations in userland, without system calls, as long
     41       1.1   thorpej  *	as there is no contention.
     42       1.1   thorpej  *
     43       1.1   thorpej  *	The simplest use case demonstrating the utility is:
     44       1.1   thorpej  *
     45       1.1   thorpej  *		// 32-bit word of memory shared among threads or
     46       1.1   thorpej  *		// processes in userland.  lock & 1 means owned;
     47       1.1   thorpej  *		// lock & 2 means there are waiters waiting.
     48       1.1   thorpej  *		volatile int lock = 0;
     49       1.1   thorpej  *
     50       1.1   thorpej  *		int v;
     51       1.1   thorpej  *
     52       1.1   thorpej  *		// Acquire a lock.
     53       1.1   thorpej  *		do {
     54       1.1   thorpej  *			v = lock;
     55       1.1   thorpej  *			if (v & 1) {
     56       1.1   thorpej  *				// Lock is held.  Set a bit to say that
     57       1.1   thorpej  *				// there are waiters, and wait for lock
     58       1.1   thorpej  *				// to change to anything other than v;
     59       1.1   thorpej  *				// then retry.
     60       1.1   thorpej  *				if (atomic_cas_uint(&lock, v, v | 2) != v)
     61       1.1   thorpej  *					continue;
     62       1.1   thorpej  *				futex(FUTEX_WAIT, &lock, v | 2, NULL, NULL, 0);
     63       1.1   thorpej  *				continue;
     64       1.1   thorpej  *			}
      65       1.1   thorpej  *		} while (atomic_cas_uint(&lock, v, v | 1) != v);
     66       1.1   thorpej  *		membar_enter();
     67       1.1   thorpej  *
     68       1.1   thorpej  *		...
     69       1.1   thorpej  *
     70       1.1   thorpej  *		// Release the lock.  Optimistically assume there are
     71       1.1   thorpej  *		// no waiters first until demonstrated otherwise.
     72       1.1   thorpej  *		membar_exit();
     73       1.1   thorpej  *		if (atomic_cas_uint(&lock, 1, 0) != 1) {
     74       1.1   thorpej  *			// There may be waiters.
     75       1.1   thorpej  *			v = atomic_swap_uint(&lock, 0);
     76       1.1   thorpej  *			// If there are still waiters, wake one.
     77       1.1   thorpej  *			if (v & 2)
     78       1.1   thorpej  *				futex(FUTEX_WAKE, &lock, 1, NULL, NULL, 0);
     79       1.1   thorpej  *		}
     80       1.1   thorpej  *
     81       1.1   thorpej  *	The goal is to avoid the futex system call unless there is
     82       1.1   thorpej  *	contention; then if there is contention, to guarantee no missed
     83       1.1   thorpej  *	wakeups.
     84       1.1   thorpej  *
     85       1.1   thorpej  *	For a simple implementation, futex(FUTEX_WAIT) could queue
     86       1.1   thorpej  *	itself to be woken, double-check the lock word, and then sleep;
     87       1.1   thorpej  *	spurious wakeups are generally a fact of life, so any
     88       1.1   thorpej  *	FUTEX_WAKE could just wake every FUTEX_WAIT in the system.
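 *
 *	In pseudocode (a sketch only; the helper names here are
 *	hypothetical), such a naive FUTEX_WAIT would be:
 *
 *		queue_self_on(uaddr);		// publish ourselves as a waiter
 *		if (*uaddr != val) {		// double-check the lock word
 *			dequeue_self();		// it already changed; don't sleep
 *			return EAGAIN;
 *		}
 *		sleep_until_woken_or_timeout();
 *
 *	Because the double-check happens only after we are queued, a
 *	FUTEX_WAKE issued after the word changes cannot be missed.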
     89       1.1   thorpej  *
     90       1.1   thorpej  *	If this were all there is to it, we could then increase
     91       1.1   thorpej  *	parallelism by refining the approximation: partition the
     92       1.1   thorpej  *	waiters into buckets by hashing the lock addresses to reduce
     93       1.1   thorpej  *	the incidence of spurious wakeups.  But this is not all.
     94       1.1   thorpej  *
     95       1.1   thorpej  *	The futex(FUTEX_CMP_REQUEUE, &lock, n, &lock2, m, val)
     96       1.1   thorpej  *	operation not only wakes n waiters on lock if lock == val, but
     97       1.1   thorpej  *	also _transfers_ m additional waiters to lock2.  Unless wakeups
     98       1.1   thorpej  *	on lock2 also trigger wakeups on lock, we cannot move waiters
     99       1.1   thorpej  *	to lock2 if they merely share the same hash as waiters on lock.
    100       1.1   thorpej  *	Thus, we can't approximately distribute waiters into queues by
    101       1.1   thorpej  *	a hash function; we must distinguish futex queues exactly by
    102       1.1   thorpej  *	lock address.
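 *
 *	As a hypothetical sketch (not part of this implementation), a
 *	userland condition variable built on a futex word could use
 *	FUTEX_CMP_REQUEUE in its broadcast path to wake one waiter and
 *	requeue the remainder directly onto the mutex word from the
 *	example above, instead of waking them all at once:
 *
 *		// cv is a 32-bit generation counter shared with waiters;
 *		// lock is the mutex word from the acquire/release example.
 *		unsigned seq = atomic_inc_uint_nv(&cv);
 *		futex(FUTEX_CMP_REQUEUE, &cv, 1, &lock, INT_MAX, seq);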
    103       1.1   thorpej  *
    104       1.1   thorpej  *	For now, we use a global red/black tree to index futexes.  This
    105       1.1   thorpej  *	should be replaced by a lockless radix tree with a thread to
    106       1.1   thorpej  *	free entries no longer in use once all lookups on all CPUs have
    107       1.1   thorpej  *	completed.
    108       1.1   thorpej  *
    109       1.1   thorpej  *	Specifically, we maintain two maps:
    110       1.1   thorpej  *
    111       1.1   thorpej  *	futex_tab.va[vmspace, va] for private futexes
    112       1.1   thorpej  *	futex_tab.oa[uvm_voaddr] for shared futexes
    113       1.1   thorpej  *
    114       1.1   thorpej  *	This implementation does not support priority inheritance.
    115       1.1   thorpej  */
    116       1.1   thorpej 
    117      1.12     skrll #include <sys/param.h>
    118       1.1   thorpej #include <sys/types.h>
    119       1.1   thorpej #include <sys/atomic.h>
    120       1.1   thorpej #include <sys/condvar.h>
    121       1.1   thorpej #include <sys/futex.h>
    122       1.1   thorpej #include <sys/mutex.h>
    123       1.1   thorpej #include <sys/rbtree.h>
    124  1.12.4.1   thorpej #include <sys/sleepq.h>
    125       1.1   thorpej #include <sys/queue.h>
    126  1.12.4.1   thorpej #include <sys/sdt.h>
    127       1.1   thorpej 
    128       1.1   thorpej #include <sys/syscall.h>
    129       1.1   thorpej #include <sys/syscallargs.h>
    130       1.1   thorpej #include <sys/syscallvar.h>
    131       1.1   thorpej 
    132       1.1   thorpej #include <uvm/uvm_extern.h>
    133       1.1   thorpej 
    134       1.1   thorpej /*
    135  1.12.4.1   thorpej  * DTrace probes.
    136  1.12.4.1   thorpej  */
    137  1.12.4.1   thorpej SDT_PROVIDER_DEFINE(futex);
    138  1.12.4.1   thorpej 
    139  1.12.4.1   thorpej /* entry: uaddr, val, bitset, timeout, clkflags, fflags */
    140  1.12.4.1   thorpej /* exit: error */
    141  1.12.4.1   thorpej SDT_PROBE_DEFINE6(futex, func, wait, entry, "int *", "int", "int",
    142  1.12.4.1   thorpej 		  "struct timespec *", "int", "int");
    143  1.12.4.1   thorpej SDT_PROBE_DEFINE1(futex, func, wait, exit, "int");
    144  1.12.4.1   thorpej 
    145  1.12.4.1   thorpej /* entry: uaddr, nwake, bitset, fflags */
    146  1.12.4.1   thorpej /* exit: error, nwoken */
    147  1.12.4.1   thorpej SDT_PROBE_DEFINE4(futex, func, wake, entry, "int *", "int", "int", "int");
    148  1.12.4.1   thorpej SDT_PROBE_DEFINE2(futex, func, wake, exit, "int", "int");
    149  1.12.4.1   thorpej 
    150  1.12.4.1   thorpej /* entry: uaddr, nwake, uaddr2, nrequeue, fflags */
    151  1.12.4.1   thorpej /* exit: error, nwoken */
    152  1.12.4.1   thorpej SDT_PROBE_DEFINE5(futex, func, requeue, entry, "int *", "int", "int *", "int",
    153  1.12.4.1   thorpej 		  "int");
    154  1.12.4.1   thorpej SDT_PROBE_DEFINE2(futex, func, requeue, exit, "int", "int");
    155  1.12.4.1   thorpej 
    156  1.12.4.1   thorpej /* entry: uaddr, nwake, uaddr2, nrequeue, val3, fflags */
    157  1.12.4.1   thorpej /* exit: error, nwoken */
    158  1.12.4.1   thorpej SDT_PROBE_DEFINE6(futex, func, cmp_requeue, entry, "int *", "int", "int *",
    159  1.12.4.1   thorpej 		  "int", "int", "int");
    160  1.12.4.1   thorpej SDT_PROBE_DEFINE2(futex, func, cmp_requeue, exit, "int", "int");
    161  1.12.4.1   thorpej 
    162  1.12.4.1   thorpej /* entry: uaddr, nwake, uaddr2, nwake2, wakeop, fflags */
    163  1.12.4.1   thorpej /* exit: error, nwoken */
    164  1.12.4.1   thorpej SDT_PROBE_DEFINE6(futex, func, wake_op, entry, "int *", "int", "int *", "int",
    165  1.12.4.1   thorpej 		  "int", "int");
    166  1.12.4.1   thorpej SDT_PROBE_DEFINE2(futex, func, wake_op, exit, "int", "int");
    167  1.12.4.1   thorpej 
    168  1.12.4.1   thorpej SDT_PROBE_DEFINE0(futex, wait, finish, normally);
    169  1.12.4.1   thorpej SDT_PROBE_DEFINE0(futex, wait, finish, wakerace);
    170  1.12.4.1   thorpej SDT_PROBE_DEFINE0(futex, wait, finish, aborted);
    171  1.12.4.1   thorpej 
    172  1.12.4.1   thorpej /* entry: timo */
    173  1.12.4.1   thorpej /* exit: error */
    174  1.12.4.1   thorpej SDT_PROBE_DEFINE1(futex, wait, sleepq_block, entry, "int");
    175  1.12.4.1   thorpej SDT_PROBE_DEFINE1(futex, wait, sleepq_block, exit, "int");
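
/*
 * Illustrative only: assuming the usual provider:module:function:name
 * mapping of the SDT_PROBE_DEFINE* macros, the FUTEX_WAIT entry probe
 * above can be observed with a dtrace(1) one-liner along the lines of:
 *
 *	dtrace -n 'futex:func:wait:entry { trace(arg0); trace(arg1); }'
 */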
    176  1.12.4.1   thorpej 
    177  1.12.4.1   thorpej /*
    178       1.1   thorpej  * Lock order:
    179       1.1   thorpej  *
    180       1.1   thorpej  *	futex_tab.lock
    181  1.12.4.1   thorpej  *	futex::fx_op_lock		ordered by kva of struct futex
    182  1.12.4.1   thorpej  *	 -> futex::fx_sq_lock		ordered by kva of sleepq lock
    183  1.12.4.1   thorpej  *
    184  1.12.4.1   thorpej  * N.B. multiple futexes can share a single sleepq lock.
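 *
 * As an illustrative sketch only (not a literal function in this file),
 * a wake-style operation takes and releases these locks in exactly
 * that order:
 *
 *	mutex_enter(&f->fx_op_lock);		// serialize ops on f
 *	mutex_spin_enter(f->fx_sq_lock);	// protect f's sleepqs
 *	// ... wake or requeue waiters on f->fx_sqs[] ...
 *	mutex_spin_exit(f->fx_sq_lock);
 *	mutex_exit(&f->fx_op_lock);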
    185       1.1   thorpej  */
    186       1.1   thorpej 
    187       1.1   thorpej /*
    188       1.1   thorpej  * union futex_key
    189       1.1   thorpej  *
    190       1.1   thorpej  *	A futex is addressed either by a vmspace+va (private) or by
    191       1.1   thorpej  *	a uvm_voaddr (shared).
    192       1.1   thorpej  */
    193  1.12.4.4   thorpej typedef union futex_key {
    194       1.1   thorpej 	struct {
    195       1.1   thorpej 		struct vmspace			*vmspace;
    196       1.1   thorpej 		vaddr_t				va;
    197       1.1   thorpej 	}			fk_private;
    198       1.1   thorpej 	struct uvm_voaddr	fk_shared;
    199  1.12.4.4   thorpej } futex_key_t;
    200  1.12.4.4   thorpej 
    201  1.12.4.4   thorpej CTASSERT(offsetof(futex_key_t, fk_private.va) ==
    202  1.12.4.4   thorpej 	 offsetof(futex_key_t, fk_shared.offset));
    203       1.1   thorpej 
    204       1.1   thorpej /*
    205       1.1   thorpej  * struct futex
    206       1.1   thorpej  *
    207       1.1   thorpej  *	Kernel state for a futex located at a particular address in a
    208       1.1   thorpej  *	particular virtual address space.
    209       1.1   thorpej  *
    210       1.1   thorpej  *	N.B. fx_refcnt is an unsigned long because we need to be able
    211       1.1   thorpej  *	to operate on it atomically on all systems while at the same
    212       1.1   thorpej  *	time rendering practically impossible the chance of it reaching
    213       1.1   thorpej  *	its max value.  In practice, we're limited by the number of LWPs
    214       1.1   thorpej  *	that can be present on the system at any given time, and the
    215       1.1   thorpej  *	assumption is that limit will be good enough on a 32-bit platform.
    216       1.1   thorpej  *	See futex_wake() for why overflow needs to be avoided.
    217  1.12.4.1   thorpej  *
    218  1.12.4.1   thorpej  *	XXX Since futex addresses must be 4-byte aligned, we could
    219  1.12.4.1   thorpej  *	XXX squirrel away fx_shared and fx_on_tree bits in the "va"
    220  1.12.4.1   thorpej  *	XXX field of the key.  Worth it?
    221       1.1   thorpej  */
    222       1.1   thorpej struct futex {
    223  1.12.4.4   thorpej 	futex_key_t		fx_key;
    224  1.12.4.1   thorpej 	struct rb_node		fx_node;
    225       1.1   thorpej 	unsigned long		fx_refcnt;
    226       1.1   thorpej 	bool			fx_shared;
    227       1.1   thorpej 	bool			fx_on_tree;
    228  1.12.4.1   thorpej 	uint8_t			fx_class;
    229       1.1   thorpej 
    230  1.12.4.1   thorpej 	kmutex_t		fx_op_lock;	/* adaptive */
    231  1.12.4.1   thorpej 	kmutex_t *		fx_sq_lock;	/* &sleepq_locks[...] */
    232  1.12.4.1   thorpej 	sleepq_t		fx_sqs[2];	/* 0=reader, 1=writer */
    233  1.12.4.1   thorpej 	unsigned int		fx_nwaiters[2];
    234       1.1   thorpej };
    235       1.1   thorpej 
    236       1.1   thorpej /*
    237  1.12.4.1   thorpej  * futex classes: Some futex operations can only be carried out on
    238  1.12.4.1   thorpej  * futexes that are known to be abiding by a certain protocol.  These
    239  1.12.4.1   thorpej  * classes are assigned to a futex when created due to a wait event,
    240  1.12.4.1   thorpej  * and when subsequent wake or requeue operations are issued, the
    241  1.12.4.1   thorpej  * class is checked at futex lookup time.  If the class does not match,
    242  1.12.4.1   thorpej  * EINVAL is the result.
    243  1.12.4.1   thorpej  */
    244  1.12.4.1   thorpej #define	FUTEX_CLASS_ANY		0		/* match any class in lookup */
    245  1.12.4.1   thorpej #define	FUTEX_CLASS_NORMAL	1		/* normal futex */
    246  1.12.4.1   thorpej #define	FUTEX_CLASS_RWLOCK	2		/* rwlock futex */
    247  1.12.4.1   thorpej #define	FUTEX_CLASS_PI		3		/* for FUTEX_*_PI ops */
    248  1.12.4.1   thorpej 
    249  1.12.4.1   thorpej /* sleepq definitions */
    250  1.12.4.1   thorpej #define	FUTEX_READERQ		0
    251  1.12.4.1   thorpej #define	FUTEX_WRITERQ		1
    252  1.12.4.1   thorpej 
    253  1.12.4.1   thorpej static const char futex_wmesg[] = "futex";
    254  1.12.4.1   thorpej 
    255  1.12.4.1   thorpej static void	futex_unsleep(lwp_t *, bool);
    256  1.12.4.1   thorpej 
    257  1.12.4.1   thorpej static syncobj_t futex_syncobj = {
    258  1.12.4.1   thorpej 	.sobj_flag	= SOBJ_SLEEPQ_SORTED,
    259  1.12.4.1   thorpej 	.sobj_unsleep	= futex_unsleep,
    260  1.12.4.1   thorpej 	.sobj_changepri	= sleepq_changepri,
    261  1.12.4.1   thorpej 	.sobj_lendpri	= sleepq_lendpri,
    262  1.12.4.1   thorpej 	.sobj_owner	= syncobj_noowner,
    263       1.1   thorpej };
    264       1.1   thorpej 
    265       1.1   thorpej /*
    266       1.1   thorpej  * futex_tab
    267       1.1   thorpej  *
    268       1.1   thorpej  *	Global trees of futexes by vmspace/va and VM object address.
    269       1.1   thorpej  *
    270       1.1   thorpej  *	XXX This obviously doesn't scale in parallel.  We could use a
    271       1.1   thorpej  *	pserialize-safe data structure, but there may be a high cost to
    272       1.1   thorpej  *	frequent deletion since we don't cache futexes after we're done
    273       1.1   thorpej  *	with them.  We could use hashed locks.  But for now, just make
    274       1.1   thorpej  *	sure userland can't DoS the serial performance, by using a
    275       1.1   thorpej  *	balanced binary tree for lookup.
    276       1.1   thorpej  *
    277       1.1   thorpej  *	XXX We could use a per-process tree for the table indexed by
    278       1.1   thorpej  *	virtual address to reduce contention between processes.
    279       1.1   thorpej  */
    280       1.1   thorpej static struct {
    281       1.1   thorpej 	kmutex_t	lock;
    282       1.1   thorpej 	struct rb_tree	va;
    283       1.1   thorpej 	struct rb_tree	oa;
    284       1.1   thorpej } futex_tab __cacheline_aligned;
    285       1.1   thorpej 
    286       1.1   thorpej static int
    287       1.1   thorpej compare_futex_key(void *cookie, const void *n, const void *k)
    288       1.1   thorpej {
    289       1.1   thorpej 	const struct futex *fa = n;
    290  1.12.4.4   thorpej 	const futex_key_t *fka = &fa->fx_key;
    291  1.12.4.4   thorpej 	const futex_key_t *fkb = k;
    292       1.1   thorpej 
    293       1.1   thorpej 	if ((uintptr_t)fka->fk_private.vmspace <
    294       1.1   thorpej 	    (uintptr_t)fkb->fk_private.vmspace)
    295       1.1   thorpej 		return -1;
    296       1.1   thorpej 	if ((uintptr_t)fka->fk_private.vmspace >
    297       1.1   thorpej 	    (uintptr_t)fkb->fk_private.vmspace)
    298       1.1   thorpej 		return +1;
    299       1.1   thorpej 	if (fka->fk_private.va < fkb->fk_private.va)
    300       1.1   thorpej 		return -1;
    301       1.1   thorpej 	if (fka->fk_private.va > fkb->fk_private.va)
    302  1.12.4.6       chs 		return +1;
    303       1.1   thorpej 	return 0;
    304       1.1   thorpej }
    305       1.1   thorpej 
    306       1.1   thorpej static int
    307       1.1   thorpej compare_futex(void *cookie, const void *na, const void *nb)
    308       1.1   thorpej {
    309       1.1   thorpej 	const struct futex *fa = na;
    310       1.1   thorpej 	const struct futex *fb = nb;
    311       1.1   thorpej 
    312       1.1   thorpej 	return compare_futex_key(cookie, fa, &fb->fx_key);
    313       1.1   thorpej }
    314       1.1   thorpej 
    315       1.1   thorpej static const rb_tree_ops_t futex_rb_ops = {
    316       1.1   thorpej 	.rbto_compare_nodes = compare_futex,
    317       1.1   thorpej 	.rbto_compare_key = compare_futex_key,
    318       1.1   thorpej 	.rbto_node_offset = offsetof(struct futex, fx_node),
    319       1.1   thorpej };
    320       1.1   thorpej 
    321       1.1   thorpej static int
    322       1.1   thorpej compare_futex_shared_key(void *cookie, const void *n, const void *k)
    323       1.1   thorpej {
    324       1.1   thorpej 	const struct futex *fa = n;
    325  1.12.4.4   thorpej 	const futex_key_t *fka = &fa->fx_key;
    326  1.12.4.4   thorpej 	const futex_key_t *fkb = k;
    327       1.1   thorpej 
    328       1.1   thorpej 	return uvm_voaddr_compare(&fka->fk_shared, &fkb->fk_shared);
    329       1.1   thorpej }
    330       1.1   thorpej 
    331       1.1   thorpej static int
    332       1.1   thorpej compare_futex_shared(void *cookie, const void *na, const void *nb)
    333       1.1   thorpej {
    334       1.1   thorpej 	const struct futex *fa = na;
    335       1.1   thorpej 	const struct futex *fb = nb;
    336       1.1   thorpej 
    337       1.1   thorpej 	return compare_futex_shared_key(cookie, fa, &fb->fx_key);
    338       1.1   thorpej }
    339       1.1   thorpej 
    340       1.1   thorpej static const rb_tree_ops_t futex_shared_rb_ops = {
    341       1.1   thorpej 	.rbto_compare_nodes = compare_futex_shared,
    342       1.1   thorpej 	.rbto_compare_key = compare_futex_shared_key,
    343       1.1   thorpej 	.rbto_node_offset = offsetof(struct futex, fx_node),
    344       1.1   thorpej };
    345       1.1   thorpej 
    346  1.12.4.1   thorpej /*
    347  1.12.4.1   thorpej  * futex_sq_lock2(f, f2)
    348  1.12.4.1   thorpej  *
    349  1.12.4.1   thorpej  *	Acquire the sleepq locks for f and f2, which may be null, or
    350  1.12.4.1   thorpej  *	which may be the same.  If they are distinct, an arbitrary total
    351  1.12.4.1   thorpej  *	order is chosen on the locks.
    352  1.12.4.1   thorpej  *
    353  1.12.4.1   thorpej  *	Callers should only ever acquire multiple sleepq locks
    354  1.12.4.1   thorpej  *	simultaneously using futex_sq_lock2.
    355  1.12.4.1   thorpej  */
    356  1.12.4.1   thorpej static void
    357  1.12.4.1   thorpej futex_sq_lock2(struct futex * const f, struct futex * const f2)
    358  1.12.4.1   thorpej {
    359  1.12.4.1   thorpej 
    360  1.12.4.1   thorpej 	/*
    361  1.12.4.1   thorpej 	 * If both are null, do nothing; if one is null and the other
    362  1.12.4.1   thorpej 	 * is not, lock the other and be done with it.
    363  1.12.4.1   thorpej 	 */
    364  1.12.4.1   thorpej 	if (f == NULL && f2 == NULL) {
    365  1.12.4.1   thorpej 		return;
    366  1.12.4.1   thorpej 	} else if (f == NULL) {
    367  1.12.4.1   thorpej 		mutex_spin_enter(f2->fx_sq_lock);
    368  1.12.4.1   thorpej 		return;
    369  1.12.4.1   thorpej 	} else if (f2 == NULL) {
    370  1.12.4.1   thorpej 		mutex_spin_enter(f->fx_sq_lock);
    371  1.12.4.1   thorpej 		return;
    372  1.12.4.1   thorpej 	}
    373  1.12.4.1   thorpej 
    374  1.12.4.1   thorpej 	kmutex_t * const m = f->fx_sq_lock;
    375  1.12.4.1   thorpej 	kmutex_t * const m2 = f2->fx_sq_lock;
    376  1.12.4.1   thorpej 
    377  1.12.4.1   thorpej 	/* If both locks are the same, acquire only one. */
    378  1.12.4.1   thorpej 	if (m == m2) {
    379  1.12.4.1   thorpej 		mutex_spin_enter(m);
    380  1.12.4.1   thorpej 		return;
    381  1.12.4.1   thorpej 	}
    382  1.12.4.1   thorpej 
    383  1.12.4.1   thorpej 	/* Otherwise, use the ordering on the kva of the lock pointer.  */
    384  1.12.4.1   thorpej 	if ((uintptr_t)m < (uintptr_t)m2) {
    385  1.12.4.1   thorpej 		mutex_spin_enter(m);
    386  1.12.4.1   thorpej 		mutex_spin_enter(m2);
    387  1.12.4.1   thorpej 	} else {
    388  1.12.4.1   thorpej 		mutex_spin_enter(m2);
    389  1.12.4.1   thorpej 		mutex_spin_enter(m);
    390  1.12.4.1   thorpej 	}
    391  1.12.4.1   thorpej }
    392  1.12.4.1   thorpej 
    393  1.12.4.1   thorpej /*
    394  1.12.4.1   thorpej  * futex_sq_unlock2(f, f2)
    395  1.12.4.1   thorpej  *
    396  1.12.4.1   thorpej  *	Release the sleep queue locks for f and f2, which may be null, or
    397  1.12.4.1   thorpej  *	which may have the same underlying lock.
    398  1.12.4.1   thorpej  */
    399  1.12.4.1   thorpej static void
    400  1.12.4.1   thorpej futex_sq_unlock2(struct futex * const f, struct futex * const f2)
    401  1.12.4.1   thorpej {
    402  1.12.4.1   thorpej 
    403  1.12.4.1   thorpej 	/*
    404  1.12.4.1   thorpej 	 * If both are null, do nothing; if one is null and the other
    405  1.12.4.1   thorpej 	 * is not, unlock the other and be done with it.
    406  1.12.4.1   thorpej 	 */
    407  1.12.4.1   thorpej 	if (f == NULL && f2 == NULL) {
    408  1.12.4.1   thorpej 		return;
    409  1.12.4.1   thorpej 	} else if (f == NULL) {
    410  1.12.4.1   thorpej 		mutex_spin_exit(f2->fx_sq_lock);
    411  1.12.4.1   thorpej 		return;
    412  1.12.4.1   thorpej 	} else if (f2 == NULL) {
    413  1.12.4.1   thorpej 		mutex_spin_exit(f->fx_sq_lock);
    414  1.12.4.1   thorpej 		return;
    415  1.12.4.1   thorpej 	}
    416  1.12.4.1   thorpej 
    417  1.12.4.1   thorpej 	kmutex_t * const m = f->fx_sq_lock;
    418  1.12.4.1   thorpej 	kmutex_t * const m2 = f2->fx_sq_lock;
    419  1.12.4.1   thorpej 
    420  1.12.4.1   thorpej 	/* If both locks are the same, release only one. */
    421  1.12.4.1   thorpej 	if (m == m2) {
    422  1.12.4.1   thorpej 		mutex_spin_exit(m);
    423  1.12.4.1   thorpej 		return;
    424  1.12.4.1   thorpej 	}
    425  1.12.4.1   thorpej 
    426  1.12.4.1   thorpej 	/* Otherwise, use the ordering on the kva of the lock pointer.  */
    427  1.12.4.1   thorpej 	if ((uintptr_t)m < (uintptr_t)m2) {
    428  1.12.4.1   thorpej 		mutex_spin_exit(m2);
    429  1.12.4.1   thorpej 		mutex_spin_exit(m);
    430  1.12.4.1   thorpej 	} else {
    431  1.12.4.1   thorpej 		mutex_spin_exit(m);
    432  1.12.4.1   thorpej 		mutex_spin_exit(m2);
    433  1.12.4.1   thorpej 	}
    434  1.12.4.1   thorpej }
    435       1.1   thorpej 
    436       1.1   thorpej /*
    437       1.1   thorpej  * futex_load(uaddr, kaddr)
    438       1.1   thorpej  *
    439       1.1   thorpej  *	Perform a single atomic load to read *uaddr, and return the
    440       1.1   thorpej  *	result in *kaddr.  Return 0 on success, EFAULT if uaddr is not
    441       1.1   thorpej  *	mapped.
    442       1.1   thorpej  */
    443       1.1   thorpej static inline int
    444       1.1   thorpej futex_load(int *uaddr, int *kaddr)
    445       1.1   thorpej {
    446       1.1   thorpej 	return ufetch_int((u_int *)uaddr, (u_int *)kaddr);
    447       1.1   thorpej }
    448       1.1   thorpej 
    449       1.1   thorpej /*
    450       1.1   thorpej  * futex_test(uaddr, expected)
    451       1.1   thorpej  *
    452       1.1   thorpej  *	True if *uaddr == expected.  False if *uaddr != expected, or if
    453       1.1   thorpej  *	uaddr is not mapped.
    454       1.1   thorpej  */
    455       1.1   thorpej static bool
    456       1.1   thorpej futex_test(int *uaddr, int expected)
    457       1.1   thorpej {
    458       1.1   thorpej 	int val;
    459       1.1   thorpej 	int error;
    460       1.1   thorpej 
    461       1.1   thorpej 	error = futex_load(uaddr, &val);
    462       1.1   thorpej 	if (error)
    463       1.1   thorpej 		return false;
    464       1.1   thorpej 	return val == expected;
    465       1.1   thorpej }
    466       1.1   thorpej 
    467  1.12.4.1   thorpej static pool_cache_t	futex_cache		__read_mostly;
    468  1.12.4.1   thorpej 
    469  1.12.4.1   thorpej static int	futex_ctor(void *, void *, int);
    470  1.12.4.1   thorpej static void	futex_dtor(void *, void *);
    471  1.12.4.1   thorpej 
    472       1.1   thorpej /*
    473       1.1   thorpej  * futex_sys_init()
    474       1.1   thorpej  *
    475       1.1   thorpej  *	Initialize the futex subsystem.
    476       1.1   thorpej  */
    477       1.1   thorpej void
    478       1.1   thorpej futex_sys_init(void)
    479       1.1   thorpej {
    480       1.1   thorpej 
    481       1.1   thorpej 	mutex_init(&futex_tab.lock, MUTEX_DEFAULT, IPL_NONE);
    482       1.1   thorpej 	rb_tree_init(&futex_tab.va, &futex_rb_ops);
    483       1.1   thorpej 	rb_tree_init(&futex_tab.oa, &futex_shared_rb_ops);
    484  1.12.4.1   thorpej 
    485  1.12.4.1   thorpej 	futex_cache = pool_cache_init(sizeof(struct futex),
    486  1.12.4.1   thorpej 	    coherency_unit, 0, 0, "futex", NULL, IPL_NONE, futex_ctor,
    487  1.12.4.1   thorpej 	    futex_dtor, NULL);
    488       1.1   thorpej }
    489       1.1   thorpej 
    490       1.1   thorpej /*
    491       1.1   thorpej  * futex_sys_fini()
    492       1.1   thorpej  *
    493       1.1   thorpej  *	Finalize the futex subsystem.
    494       1.1   thorpej  */
    495       1.1   thorpej void
    496       1.1   thorpej futex_sys_fini(void)
    497       1.1   thorpej {
    498       1.1   thorpej 
    499       1.1   thorpej 	KASSERT(RB_TREE_MIN(&futex_tab.oa) == NULL);
    500       1.1   thorpej 	KASSERT(RB_TREE_MIN(&futex_tab.va) == NULL);
    501       1.1   thorpej 	mutex_destroy(&futex_tab.lock);
    502       1.1   thorpej }
    503       1.1   thorpej 
    504       1.1   thorpej /*
    505  1.12.4.1   thorpej  * futex_ctor()
    506       1.1   thorpej  *
    507  1.12.4.1   thorpej  *	Futex object constructor.
    508       1.1   thorpej  */
    509  1.12.4.1   thorpej static int
    510  1.12.4.1   thorpej futex_ctor(void *arg __unused, void *obj, int flags __unused)
    511       1.1   thorpej {
    512  1.12.4.1   thorpej 	extern sleepqlock_t sleepq_locks[SLEEPTAB_HASH_SIZE];
    513  1.12.4.1   thorpej 	struct futex * const f = obj;
    514       1.1   thorpej 
    515  1.12.4.1   thorpej 	mutex_init(&f->fx_op_lock, MUTEX_DEFAULT, IPL_NONE);
    516  1.12.4.1   thorpej 	f->fx_sq_lock = &sleepq_locks[SLEEPTAB_HASH(f)].lock;
    517       1.1   thorpej 
    518  1.12.4.1   thorpej 	sleepq_init(&f->fx_sqs[FUTEX_READERQ]);
    519  1.12.4.1   thorpej 	sleepq_init(&f->fx_sqs[FUTEX_WRITERQ]);
    520  1.12.4.1   thorpej 	f->fx_nwaiters[FUTEX_READERQ] = f->fx_nwaiters[FUTEX_WRITERQ] = 0;
    521       1.1   thorpej 
    522  1.12.4.1   thorpej 	return 0;
    523       1.1   thorpej }
    524       1.1   thorpej 
    525       1.1   thorpej /*
    526  1.12.4.1   thorpej  * futex_dtor()
    527       1.1   thorpej  *
    528  1.12.4.1   thorpej  *	Futex object destructor.
    529       1.1   thorpej  */
    530       1.1   thorpej static void
    531  1.12.4.1   thorpej futex_dtor(void *arg __unused, void *obj)
    532       1.1   thorpej {
    533  1.12.4.1   thorpej 	struct futex * const f = obj;
    534       1.1   thorpej 
    535  1.12.4.1   thorpej 	mutex_destroy(&f->fx_op_lock);
    536  1.12.4.1   thorpej 	f->fx_sq_lock = NULL;
    537       1.1   thorpej }
    538       1.1   thorpej 
    539       1.1   thorpej /*
    540       1.1   thorpej  * futex_key_init(key, vm, va, shared)
    541       1.1   thorpej  *
    542       1.1   thorpej  *	Initialize a futex key for lookup, etc.
    543       1.1   thorpej  */
    544       1.1   thorpej static int
    545  1.12.4.4   thorpej futex_key_init(futex_key_t *fk, struct vmspace *vm, vaddr_t va, bool shared)
    546       1.1   thorpej {
    547       1.1   thorpej 	int error = 0;
    548       1.1   thorpej 
    549       1.1   thorpej 	if (__predict_false(shared)) {
    550       1.1   thorpej 		if (!uvm_voaddr_acquire(&vm->vm_map, va, &fk->fk_shared))
    551       1.1   thorpej 			error = EFAULT;
    552       1.1   thorpej 	} else {
    553       1.1   thorpej 		fk->fk_private.vmspace = vm;
    554       1.1   thorpej 		fk->fk_private.va = va;
    555       1.1   thorpej 	}
    556       1.1   thorpej 
    557       1.1   thorpej 	return error;
    558       1.1   thorpej }
    559       1.1   thorpej 
    560       1.1   thorpej /*
    561       1.1   thorpej  * futex_key_fini(key, shared)
    562       1.1   thorpej  *
    563       1.1   thorpej  *	Release a futex key.
    564       1.1   thorpej  */
    565       1.1   thorpej static void
    566  1.12.4.4   thorpej futex_key_fini(futex_key_t *fk, bool shared)
    567       1.1   thorpej {
    568       1.1   thorpej 	if (__predict_false(shared))
    569       1.1   thorpej 		uvm_voaddr_release(&fk->fk_shared);
    570       1.1   thorpej 	memset(fk, 0, sizeof(*fk));
    571       1.1   thorpej }
    572       1.1   thorpej 
    573       1.1   thorpej /*
    574       1.1   thorpej  * futex_create(fk, shared)
    575       1.1   thorpej  *
    576       1.1   thorpej  *	Create a futex.  Initial reference count is 1, representing the
    577       1.1   thorpej  *	caller.  Returns NULL on failure.  Always takes ownership of the
    578       1.1   thorpej  *	key, either transferring it to the newly-created futex, or releasing
    579       1.1   thorpej  *	the key if creation fails.
    580       1.1   thorpej  *
    581       1.1   thorpej  *	Never sleeps for memory, but may sleep to acquire a lock.
    582       1.1   thorpej  */
    583       1.1   thorpej static struct futex *
    584  1.12.4.4   thorpej futex_create(futex_key_t *fk, bool shared, uint8_t class)
    585       1.1   thorpej {
    586       1.1   thorpej 	struct futex *f;
    587       1.1   thorpej 
    588  1.12.4.1   thorpej 	f = pool_cache_get(futex_cache, PR_NOWAIT);
    589       1.1   thorpej 	if (f == NULL) {
    590       1.1   thorpej 		futex_key_fini(fk, shared);
    591       1.1   thorpej 		return NULL;
    592       1.1   thorpej 	}
    593       1.1   thorpej 	f->fx_key = *fk;
    594       1.1   thorpej 	f->fx_refcnt = 1;
    595       1.1   thorpej 	f->fx_shared = shared;
    596       1.1   thorpej 	f->fx_on_tree = false;
    597  1.12.4.1   thorpej 	f->fx_class = class;
    598       1.1   thorpej 
    599       1.1   thorpej 	return f;
    600       1.1   thorpej }
    601       1.1   thorpej 
    602       1.1   thorpej /*
    603       1.1   thorpej  * futex_destroy(f)
    604       1.1   thorpej  *
    605       1.1   thorpej  *	Destroy a futex created with futex_create.  Reference count
    606       1.1   thorpej  *	must be zero.
    607       1.1   thorpej  *
    608       1.1   thorpej  *	May sleep.
    609       1.1   thorpej  */
    610       1.1   thorpej static void
    611       1.1   thorpej futex_destroy(struct futex *f)
    612       1.1   thorpej {
    613       1.1   thorpej 
    614       1.1   thorpej 	ASSERT_SLEEPABLE();
    615       1.1   thorpej 
    616       1.1   thorpej 	KASSERT(atomic_load_relaxed(&f->fx_refcnt) == 0);
    617       1.1   thorpej 	KASSERT(!f->fx_on_tree);
    618  1.12.4.1   thorpej 	KASSERT(LIST_EMPTY(&f->fx_sqs[FUTEX_READERQ]));
    619  1.12.4.1   thorpej 	KASSERT(LIST_EMPTY(&f->fx_sqs[FUTEX_WRITERQ]));
    620  1.12.4.1   thorpej 	KASSERT(f->fx_nwaiters[FUTEX_READERQ] == 0);
    621  1.12.4.1   thorpej 	KASSERT(f->fx_nwaiters[FUTEX_WRITERQ] == 0);
    622       1.1   thorpej 
    623       1.1   thorpej 	futex_key_fini(&f->fx_key, f->fx_shared);
    624       1.1   thorpej 
    625  1.12.4.1   thorpej 	pool_cache_put(futex_cache, f);
    626       1.1   thorpej }
    627       1.1   thorpej 
    628       1.1   thorpej /*
    629  1.12.4.1   thorpej  * futex_hold_count(f, n)
    630       1.1   thorpej  *
    631       1.1   thorpej  *	Attempt to acquire a reference to f.  Return 0 on success,
    632       1.1   thorpej  *	ENFILE on too many references.
    633       1.1   thorpej  *
    634       1.1   thorpej  *	Never sleeps.
    635       1.1   thorpej  */
    636       1.1   thorpej static int
    637  1.12.4.1   thorpej futex_hold_count(struct futex *f, unsigned long const count)
    638       1.1   thorpej {
    639       1.1   thorpej 	unsigned long refcnt;
    640       1.1   thorpej 
    641       1.1   thorpej 	do {
    642       1.1   thorpej 		refcnt = atomic_load_relaxed(&f->fx_refcnt);
    643  1.12.4.1   thorpej 		if (ULONG_MAX - refcnt < count)
    644       1.1   thorpej 			return ENFILE;
    645  1.12.4.1   thorpej 	} while (atomic_cas_ulong(&f->fx_refcnt, refcnt,
    646  1.12.4.1   thorpej 				  refcnt + count) != refcnt);
    647       1.1   thorpej 
    648       1.1   thorpej 	return 0;
    649       1.1   thorpej }
    650  1.12.4.1   thorpej #define	futex_hold(f)	futex_hold_count(f, 1)
    651       1.1   thorpej 
    652       1.1   thorpej /*
    653  1.12.4.1   thorpej  * futex_rele_count(f, n)
    654       1.1   thorpej  *
    655       1.1   thorpej  *	Release a reference to f acquired with futex_create or
    656       1.1   thorpej  *	futex_hold.
    657       1.1   thorpej  *
    658       1.1   thorpej  *	May sleep to free f.
    659       1.1   thorpej  */
    660       1.1   thorpej static void
    661  1.12.4.1   thorpej futex_rele_count(struct futex *f, unsigned long const count)
    662       1.1   thorpej {
    663       1.1   thorpej 	unsigned long refcnt;
    664       1.1   thorpej 
    665       1.1   thorpej 	ASSERT_SLEEPABLE();
    666       1.1   thorpej 
    667       1.1   thorpej 	do {
    668       1.1   thorpej 		refcnt = atomic_load_relaxed(&f->fx_refcnt);
    669  1.12.4.1   thorpej 		KASSERT(refcnt >= count);
    670  1.12.4.1   thorpej 		if (refcnt - count == 0)
    671       1.1   thorpej 			goto trylast;
    672  1.12.4.1   thorpej 	} while (atomic_cas_ulong(&f->fx_refcnt, refcnt,
    673  1.12.4.1   thorpej 				  refcnt - count) != refcnt);
    674       1.1   thorpej 	return;
    675       1.1   thorpej 
    676       1.1   thorpej trylast:
    677  1.12.4.1   thorpej 	KASSERT(count <= LONG_MAX);
    678       1.1   thorpej 	mutex_enter(&futex_tab.lock);
    679  1.12.4.1   thorpej 	if (atomic_add_long_nv(&f->fx_refcnt, -(long)count) == 0) {
    680       1.1   thorpej 		if (f->fx_on_tree) {
    681       1.1   thorpej 			if (__predict_false(f->fx_shared))
    682       1.1   thorpej 				rb_tree_remove_node(&futex_tab.oa, f);
    683       1.1   thorpej 			else
    684       1.1   thorpej 				rb_tree_remove_node(&futex_tab.va, f);
    685       1.1   thorpej 			f->fx_on_tree = false;
    686       1.1   thorpej 		}
    687       1.1   thorpej 	} else {
    688       1.1   thorpej 		/* References remain -- don't destroy it.  */
    689       1.1   thorpej 		f = NULL;
    690       1.1   thorpej 	}
    691       1.1   thorpej 	mutex_exit(&futex_tab.lock);
    692       1.1   thorpej 	if (f != NULL)
    693       1.1   thorpej 		futex_destroy(f);
    694       1.1   thorpej }
    695  1.12.4.1   thorpej #define	futex_rele(f)	futex_rele_count(f, 1)
    696       1.1   thorpej 
    697       1.1   thorpej /*
    698  1.12.4.1   thorpej  * futex_rele_count_not_last(f, n)
    699       1.1   thorpej  *
    700       1.1   thorpej  *	Release a reference to f acquired with futex_create or
    701       1.1   thorpej  *	futex_hold.
    702       1.1   thorpej  *
    703       1.1   thorpej  *	This version asserts that we are not dropping the last
    704       1.1   thorpej  *	reference to f.
    705       1.1   thorpej  */
    706       1.1   thorpej static void
    707  1.12.4.1   thorpej futex_rele_count_not_last(struct futex *f, unsigned long const count)
    708       1.1   thorpej {
    709       1.1   thorpej 	unsigned long refcnt;
    710       1.1   thorpej 
    711       1.1   thorpej 	do {
    712       1.1   thorpej 		refcnt = atomic_load_relaxed(&f->fx_refcnt);
    713  1.12.4.1   thorpej 		KASSERT(refcnt >= count);
    714  1.12.4.1   thorpej 	} while (atomic_cas_ulong(&f->fx_refcnt, refcnt,
    715  1.12.4.1   thorpej 				  refcnt - count) != refcnt);
    716       1.1   thorpej }
    717       1.1   thorpej 
    718       1.1   thorpej /*
    719  1.12.4.1   thorpej  * futex_lookup_by_key(key, shared, class, &f)
    720       1.1   thorpej  *
     721       1.1   thorpej  *	Try to find an existing futex va reference for the specified key.
     722       1.1   thorpej  *	On success, return 0, set f to the futex found (or to NULL if not
     723       1.1   thorpej  *	found), and increment f's reference count if found.
    724       1.1   thorpej  *
    725       1.1   thorpej  *	Return ENFILE if reference count too high.
    726       1.1   thorpej  *
    727       1.1   thorpej  *	Internal lookup routine shared by futex_lookup() and
    728       1.5  riastrad  *	futex_lookup_create().
    729       1.1   thorpej  */
    730       1.1   thorpej static int
    731  1.12.4.4   thorpej futex_lookup_by_key(futex_key_t *fk, bool shared, uint8_t class,
    732  1.12.4.1   thorpej     struct futex **fp)
    733       1.1   thorpej {
    734       1.1   thorpej 	struct futex *f;
    735       1.1   thorpej 	int error = 0;
    736       1.1   thorpej 
    737       1.1   thorpej 	mutex_enter(&futex_tab.lock);
    738       1.1   thorpej 	if (__predict_false(shared)) {
    739       1.1   thorpej 		f = rb_tree_find_node(&futex_tab.oa, fk);
    740       1.1   thorpej 	} else {
    741       1.1   thorpej 		f = rb_tree_find_node(&futex_tab.va, fk);
    742       1.1   thorpej 	}
    743       1.1   thorpej 	if (f) {
    744  1.12.4.1   thorpej 		if (__predict_true(f->fx_class == class ||
    745  1.12.4.1   thorpej 				   class == FUTEX_CLASS_ANY))
    746  1.12.4.1   thorpej 			error = futex_hold(f);
    747  1.12.4.1   thorpej 		else
    748  1.12.4.1   thorpej 			error = EINVAL;
    749       1.1   thorpej 		if (error)
    750       1.1   thorpej 			f = NULL;
    751       1.1   thorpej 	}
    752       1.1   thorpej  	*fp = f;
    753       1.1   thorpej 	mutex_exit(&futex_tab.lock);
    754       1.1   thorpej 
    755       1.1   thorpej 	return error;
    756       1.1   thorpej }
    757       1.1   thorpej 
    758       1.1   thorpej /*
    759       1.1   thorpej  * futex_insert(f, fp)
    760       1.1   thorpej  *
    761       1.1   thorpej  *	Try to insert the futex f into the tree by va.  If there
    762       1.1   thorpej  *	already is a futex for its va, acquire a reference to it, and
    763       1.1   thorpej  *	store it in *fp; otherwise store f in *fp.
    764       1.1   thorpej  *
    765       1.1   thorpej  *	Return 0 on success, ENFILE if there already is a futex but its
    766       1.1   thorpej  *	reference count is too high.
    767       1.1   thorpej  */
    768       1.1   thorpej static int
    769       1.1   thorpej futex_insert(struct futex *f, struct futex **fp)
    770       1.1   thorpej {
    771       1.1   thorpej 	struct futex *f0;
    772       1.1   thorpej 	int error;
    773       1.1   thorpej 
    774       1.1   thorpej 	KASSERT(atomic_load_relaxed(&f->fx_refcnt) != 0);
    775       1.1   thorpej 	KASSERT(!f->fx_on_tree);
    776       1.1   thorpej 
    777       1.1   thorpej 	mutex_enter(&futex_tab.lock);
    778       1.1   thorpej 	if (__predict_false(f->fx_shared))
    779       1.1   thorpej 		f0 = rb_tree_insert_node(&futex_tab.oa, f);
    780       1.1   thorpej 	else
    781       1.1   thorpej 		f0 = rb_tree_insert_node(&futex_tab.va, f);
    782       1.1   thorpej 	if (f0 == f) {
    783       1.1   thorpej 		f->fx_on_tree = true;
    784       1.1   thorpej 		error = 0;
    785       1.1   thorpej 	} else {
    786       1.1   thorpej 		KASSERT(atomic_load_relaxed(&f0->fx_refcnt) != 0);
    787       1.1   thorpej 		KASSERT(f0->fx_on_tree);
    788       1.1   thorpej 		error = futex_hold(f0);
    789       1.1   thorpej 		if (error)
    790       1.1   thorpej 			goto out;
    791       1.1   thorpej 	}
    792       1.1   thorpej 	*fp = f0;
    793       1.1   thorpej out:	mutex_exit(&futex_tab.lock);
    794       1.1   thorpej 
    795       1.1   thorpej 	return error;
    796       1.1   thorpej }
    797       1.1   thorpej 
    798       1.1   thorpej /*
    799  1.12.4.1   thorpej  * futex_lookup(uaddr, shared, class, &f)
    800       1.1   thorpej  *
    801       1.1   thorpej  *	Find a futex at the userland pointer uaddr in the current
    802       1.1   thorpej  *	process's VM space.  On success, return the futex in f and
    803       1.1   thorpej  *	increment its reference count.
    804       1.1   thorpej  *
    805       1.5  riastrad  *	Caller must call futex_rele when done.
    806       1.1   thorpej  */
    807       1.1   thorpej static int
    808  1.12.4.1   thorpej futex_lookup(int *uaddr, bool shared, uint8_t class, struct futex **fp)
    809       1.1   thorpej {
    810  1.12.4.4   thorpej 	futex_key_t fk;
    811       1.1   thorpej 	struct vmspace *vm = curproc->p_vmspace;
    812       1.1   thorpej 	vaddr_t va = (vaddr_t)uaddr;
    813       1.1   thorpej 	int error;
    814       1.1   thorpej 
    815       1.1   thorpej 	/*
    816       1.1   thorpej 	 * Reject unaligned user pointers so we don't cross page
    817       1.1   thorpej 	 * boundaries and so atomics will work.
    818       1.1   thorpej 	 */
    819  1.12.4.1   thorpej 	if (__predict_false((va & 3) != 0))
    820       1.1   thorpej 		return EINVAL;
    821       1.1   thorpej 
    822       1.1   thorpej 	/* Look it up. */
    823       1.1   thorpej 	error = futex_key_init(&fk, vm, va, shared);
    824       1.1   thorpej 	if (error)
    825       1.1   thorpej 		return error;
    826       1.1   thorpej 
    827  1.12.4.1   thorpej 	error = futex_lookup_by_key(&fk, shared, class, fp);
    828       1.1   thorpej 	futex_key_fini(&fk, shared);
    829       1.1   thorpej 	if (error)
    830       1.1   thorpej 		return error;
    831       1.1   thorpej 
    832       1.1   thorpej 	KASSERT(*fp == NULL || (*fp)->fx_shared == shared);
    833  1.12.4.1   thorpej 	KASSERT(*fp == NULL || (*fp)->fx_class == class);
    834       1.1   thorpej 	KASSERT(*fp == NULL || atomic_load_relaxed(&(*fp)->fx_refcnt) != 0);
    835       1.1   thorpej 
    836       1.1   thorpej 	/*
    837       1.1   thorpej 	 * Success!  (Caller must still check whether we found
    838       1.1   thorpej 	 * anything, but nothing went _wrong_ like trying to use
    839       1.1   thorpej 	 * unmapped memory.)
    840       1.1   thorpej 	 */
    841       1.1   thorpej 	KASSERT(error == 0);
    842       1.1   thorpej 
    843       1.1   thorpej 	return error;
    844       1.1   thorpej }
    845       1.1   thorpej 
    846       1.1   thorpej /*
    847  1.12.4.1   thorpej  * futex_lookup_create(uaddr, shared, class, &f)
    848       1.1   thorpej  *
    849       1.1   thorpej  *	Find or create a futex at the userland pointer uaddr in the
    850       1.1   thorpej  *	current process's VM space.  On success, return the futex in f
    851       1.1   thorpej  *	and increment its reference count.
    852       1.1   thorpej  *
    853       1.5  riastrad  *	Caller must call futex_rele when done.
    854       1.1   thorpej  */
    855       1.1   thorpej static int
    856  1.12.4.1   thorpej futex_lookup_create(int *uaddr, bool shared, uint8_t class, struct futex **fp)
    857       1.1   thorpej {
    858  1.12.4.4   thorpej 	futex_key_t fk;
    859       1.1   thorpej 	struct vmspace *vm = curproc->p_vmspace;
    860       1.1   thorpej 	struct futex *f = NULL;
    861       1.1   thorpej 	vaddr_t va = (vaddr_t)uaddr;
    862       1.1   thorpej 	int error;
    863       1.1   thorpej 
    864       1.1   thorpej 	/*
    865       1.1   thorpej 	 * Reject unaligned user pointers so we don't cross page
    866       1.1   thorpej 	 * boundaries and so atomics will work.
    867       1.1   thorpej 	 */
    868  1.12.4.1   thorpej 	if (__predict_false((va & 3) != 0))
    869  1.12.4.1   thorpej 		return EINVAL;
    870  1.12.4.1   thorpej 
    871  1.12.4.1   thorpej 	if (__predict_false(class == FUTEX_CLASS_ANY))
    872       1.1   thorpej 		return EINVAL;
    873       1.1   thorpej 
    874       1.1   thorpej 	error = futex_key_init(&fk, vm, va, shared);
    875       1.1   thorpej 	if (error)
    876       1.1   thorpej 		return error;
    877       1.1   thorpej 
    878       1.1   thorpej 	/*
    879       1.1   thorpej 	 * Optimistically assume there already is one, and try to find
    880       1.1   thorpej 	 * it.
    881       1.1   thorpej 	 */
    882  1.12.4.1   thorpej 	error = futex_lookup_by_key(&fk, shared, class, fp);
    883       1.1   thorpej 	if (error || *fp != NULL) {
    884       1.1   thorpej 		/*
    885       1.1   thorpej 		 * We either found one, or there was an error.
    886       1.1   thorpej 		 * In either case, we are done with the key.
    887       1.1   thorpej 		 */
    888       1.1   thorpej 		futex_key_fini(&fk, shared);
    889       1.1   thorpej 		goto out;
    890       1.1   thorpej 	}
    891       1.1   thorpej 
     892       1.1   thorpej 	 * Create a futex record.  This transfers ownership of the key
    893       1.1   thorpej 	 * Create a futex recoard.  This tranfers ownership of the key
    894       1.1   thorpej 	 * in all cases.
    895       1.1   thorpej 	 */
    896  1.12.4.1   thorpej 	f = futex_create(&fk, shared, class);
    897       1.1   thorpej 	if (f == NULL) {
    898       1.1   thorpej 		error = ENOMEM;
    899       1.1   thorpej 		goto out;
    900       1.1   thorpej 	}
    901       1.1   thorpej 
    902       1.1   thorpej 	/*
    903       1.1   thorpej 	 * Insert our new futex, or use existing if someone else beat
    904       1.1   thorpej 	 * us to it.
    905       1.1   thorpej 	 */
    906       1.1   thorpej 	error = futex_insert(f, fp);
    907       1.1   thorpej 	if (error)
    908       1.1   thorpej 		goto out;
    909       1.1   thorpej 	if (*fp == f)
    910       1.1   thorpej 		f = NULL;	/* don't release on exit */
    911       1.1   thorpej 
    912       1.1   thorpej 	/* Success!  */
    913       1.1   thorpej 	KASSERT(error == 0);
    914       1.1   thorpej 
    915       1.1   thorpej out:	if (f != NULL)
    916       1.1   thorpej 		futex_rele(f);
    917       1.1   thorpej 	KASSERT(error || *fp != NULL);
    918       1.1   thorpej 	KASSERT(error || atomic_load_relaxed(&(*fp)->fx_refcnt) != 0);
    919       1.1   thorpej 	return error;
    920       1.1   thorpej }
    921       1.1   thorpej 
    922       1.1   thorpej /*
    923  1.12.4.1   thorpej  * futex_unsleep:
    924       1.1   thorpej  *
    925  1.12.4.1   thorpej  *	Remove an LWP from the futex and sleep queue.  This is called when
    926  1.12.4.1   thorpej  *	the LWP has not been awoken normally but instead interrupted: for
    927  1.12.4.1   thorpej  *	example, when a signal is received.  Must be called with the LWP
    928  1.12.4.1   thorpej  *	locked.  Will unlock if "unlock" is true.
    929       1.1   thorpej  */
    930       1.1   thorpej static void
    931  1.12.4.1   thorpej futex_unsleep(lwp_t *l, bool unlock)
    932       1.1   thorpej {
    933  1.12.4.1   thorpej 	struct futex *f __diagused;
    934       1.1   thorpej 
    935  1.12.4.1   thorpej 	f = (struct futex *)(uintptr_t)l->l_wchan;
    936       1.1   thorpej 
    937  1.12.4.1   thorpej 	KASSERT(mutex_owned(f->fx_sq_lock));
    938       1.1   thorpej 
    939  1.12.4.1   thorpej 	/* WRITERQ is more likely */
    940  1.12.4.1   thorpej 	if (__predict_true(l->l_sleepq == &f->fx_sqs[FUTEX_WRITERQ])) {
    941  1.12.4.1   thorpej 		KASSERT(f->fx_nwaiters[FUTEX_WRITERQ] > 0);
    942  1.12.4.1   thorpej 		f->fx_nwaiters[FUTEX_WRITERQ]--;
    943  1.12.4.1   thorpej 	} else {
    944  1.12.4.1   thorpej 		KASSERT(l->l_sleepq == &f->fx_sqs[FUTEX_READERQ]);
    945  1.12.4.1   thorpej 		KASSERT(f->fx_nwaiters[FUTEX_READERQ] > 0);
    946  1.12.4.1   thorpej 		f->fx_nwaiters[FUTEX_READERQ]--;
    947  1.12.4.1   thorpej 	}
    948       1.1   thorpej 
    949  1.12.4.1   thorpej 	sleepq_unsleep(l, unlock);
    950       1.1   thorpej }
    951       1.1   thorpej 
    952       1.1   thorpej /*
    953  1.12.4.1   thorpej  * futex_wait(f, timeout, clkid, bitset)
    954       1.1   thorpej  *
    955  1.12.4.1   thorpej  *	Wait until deadline on the clock clkid, or forever if deadline
    956  1.12.4.1   thorpej  *	is NULL, for a futex wakeup.  Return 0 on explicit wakeup,
    957  1.12.4.1   thorpej  *	ETIMEDOUT on timeout, EINTR on signal.
    958       1.1   thorpej  */
    959  1.12.4.1   thorpej static int
    960  1.12.4.1   thorpej futex_wait(struct futex *f, int q, const struct timespec *deadline,
    961  1.12.4.1   thorpej     clockid_t clkid, int bitset)
    962       1.1   thorpej {
    963       1.1   thorpej 
    964       1.1   thorpej 	/*
    965  1.12.4.1   thorpej 	 * Some things to note about how this function works:
    966  1.12.4.1   thorpej 	 *
    967  1.12.4.1   thorpej 	 * ==> When we enter futex_wait(), the futex's op lock is
    968  1.12.4.1   thorpej 	 * held, preventing us from missing wakes.  Once we are on
    969  1.12.4.1   thorpej 	 * the futex's sleepq, it is safe to release the op lock.
    970  1.12.4.1   thorpej 	 *
    971  1.12.4.1   thorpej 	 * ==> We have a single early escape to avoid calling
    972  1.12.4.1   thorpej 	 * sleepq_block(): our deadline already having passed.
    973  1.12.4.1   thorpej 	 * If this is a no-timeout wait, or if the deadline has
    974  1.12.4.1   thorpej 	 * not already passed, we are guaranteed to call sleepq_block().
    975  1.12.4.1   thorpej 	 *
    976  1.12.4.1   thorpej 	 * ==> sleepq_block() contains all of the necessary logic
    977  1.12.4.1   thorpej 	 * for handling signals; we do not need to check for them
    978  1.12.4.1   thorpej 	 * ourselves.
    979  1.12.4.1   thorpej 	 *
    980  1.12.4.1   thorpej 	 * ==> Once we have blocked, other futex operations will
    981  1.12.4.1   thorpej 	 * be able to wake us or requeue us.  Until we have blocked,
    982  1.12.4.1   thorpej 	 * those other futex operations will not be able to acquire
    983  1.12.4.1   thorpej 	 * the sleepq locks in order to perform those operations,
    984  1.12.4.1   thorpej 	 * even if we have dropped the op lock.  Thus, we will not
    985  1.12.4.1   thorpej 	 * miss wakes.  This fundamental invariant is relied upon by
    986  1.12.4.1   thorpej 	 * every other synchronization object in the kernel.
    987  1.12.4.1   thorpej 	 *
    988  1.12.4.1   thorpej 	 * ==> sleepq_block() returns for one of three reasons:
    989  1.12.4.1   thorpej 	 *
    990  1.12.4.1   thorpej 	 *	-- We were awakened.
    991  1.12.4.1   thorpej 	 *	-- We were signaled.
    992  1.12.4.1   thorpej 	 *	-- We timed out.
    993  1.12.4.1   thorpej 	 *
    994  1.12.4.1   thorpej 	 * ==> Once sleepq_block() returns, we will not call
    995  1.12.4.1   thorpej 	 * sleepq_block() again.
    996  1.12.4.1   thorpej 	 *
    997  1.12.4.1   thorpej 	 * ==> If sleepq_block() returns due to being signaled,
    998  1.12.4.1   thorpej 	 * we must check to see if we were also awakened.  This
    999  1.12.4.1   thorpej 	 * is to handle the following sequence:
   1000  1.12.4.1   thorpej 	 *
   1001  1.12.4.1   thorpej 	 *	-- Futex wake takes us off sleepq, places us on runq.
   1002  1.12.4.1   thorpej 	 *	-- We are signaled while sitting on the runq.
   1003  1.12.4.1   thorpej 	 *	-- We run, and sleepq_block() notices the signal and
   1004  1.12.4.1   thorpej 	 *	   returns  EINTR/ERESTART.
   1005  1.12.4.1   thorpej 	 *
   1006  1.12.4.1   thorpej 	 * In this situation, we want to indicate a successful wake
   1007  1.12.4.1   thorpej 	 * to the caller, because that wake has been reported to the
   1008  1.12.4.1   thorpej 	 * thread that issued it.
   1009  1.12.4.1   thorpej 	 *
   1010  1.12.4.1   thorpej 	 * Note that it is NOT possible for a wake to be issued after
   1011  1.12.4.1   thorpej 	 * a signal; the LWP will already have been taken off the sleepq
   1012  1.12.4.1   thorpej 	 * by the signal, so the wake will never happen.  Note that for
   1013  1.12.4.1   thorpej 	 * this reason, we must *never* return ERESTART, because it could
   1014  1.12.4.1   thorpej 	 * result in us waiting again with no one to awaken us.
   1015       1.1   thorpej 	 */
   1016       1.1   thorpej 
   1017  1.12.4.1   thorpej 	struct lwp * const l = curlwp;
   1018  1.12.4.1   thorpej 	int timo;
   1019  1.12.4.1   thorpej 	int error;
   1020       1.1   thorpej 
   1021       1.4  riastrad 	/*
   1022  1.12.4.1   thorpej 	 * Compute our timeout before taking any locks.
   1023       1.4  riastrad 	 */
   1024  1.12.4.1   thorpej 	if (deadline) {
   1025  1.12.4.1   thorpej 		struct timespec ts;
   1026       1.4  riastrad 
   1027  1.12.4.1   thorpej 		/* Check our watch.  */
   1028  1.12.4.1   thorpej 		error = clock_gettime1(clkid, &ts);
   1029  1.12.4.1   thorpej 		if (error) {
   1030  1.12.4.1   thorpej 			mutex_exit(&f->fx_op_lock);
   1031  1.12.4.1   thorpej 			return error;
   1032  1.12.4.1   thorpej 		}
   1033       1.1   thorpej 
   1034  1.12.4.1   thorpej 		/*
   1035  1.12.4.1   thorpej 		 * If we're past the deadline, ETIMEDOUT.
   1036  1.12.4.1   thorpej 		 */
   1037  1.12.4.1   thorpej 		if (timespeccmp(deadline, &ts, <=)) {
   1038  1.12.4.1   thorpej 			mutex_exit(&f->fx_op_lock);
   1039  1.12.4.1   thorpej 			return ETIMEDOUT;
   1040  1.12.4.1   thorpej 		} else {
   1041  1.12.4.1   thorpej 			/* Count how much time is left.  */
   1042  1.12.4.1   thorpej 			timespecsub(deadline, &ts, &ts);
   1043  1.12.4.1   thorpej 			timo = tstohz(&ts);
   1044  1.12.4.1   thorpej 			if (timo == 0) {
   1045  1.12.4.1   thorpej 				/*
   1046  1.12.4.1   thorpej 				 * tstohz() already rounds up, and
   1047  1.12.4.1   thorpej 				 * a return value of 0 ticks means
   1048  1.12.4.1   thorpej 				 * "expired now or in the past".
   1049  1.12.4.1   thorpej 				 */
   1050  1.12.4.1   thorpej 				mutex_exit(&f->fx_op_lock);
   1051  1.12.4.1   thorpej 				return ETIMEDOUT;
   1052  1.12.4.1   thorpej 			}
   1053  1.12.4.1   thorpej 		}
   1054  1.12.4.1   thorpej 	} else {
   1055  1.12.4.1   thorpej 		timo = 0;
   1056  1.12.4.1   thorpej 	}
   1057       1.1   thorpej 
   1058       1.1   thorpej 	/*
   1059  1.12.4.1   thorpej 	 * Acquire in sleepq -> lwp order.  While we're at it, ensure
   1060  1.12.4.1   thorpej 	 * that there's room for us to block.
   1061       1.1   thorpej 	 */
   1062  1.12.4.1   thorpej 	mutex_spin_enter(f->fx_sq_lock);
   1063  1.12.4.1   thorpej 	if (__predict_false(q == FUTEX_READERQ &&
   1064  1.12.4.1   thorpej 			    f->fx_nwaiters[q] == FUTEX_TID_MASK)) {
   1065  1.12.4.1   thorpej 		mutex_spin_exit(f->fx_sq_lock);
   1066  1.12.4.1   thorpej 		mutex_exit(&f->fx_op_lock);
   1067  1.12.4.1   thorpej 		return ENFILE;
   1068  1.12.4.1   thorpej 	}
   1069  1.12.4.1   thorpej 	lwp_lock(l);
   1070       1.4  riastrad 
   1071       1.4  riastrad 	/*
   1072  1.12.4.1   thorpej 	 * No need for locks here; until we're on a futex's sleepq, these
   1073  1.12.4.1   thorpej 	 * values are private to the LWP, and the locks needed to get onto
   1074  1.12.4.1   thorpej 	 * the sleepq provide the memory barriers needed to ensure visibility.
   1075       1.4  riastrad 	 */
   1076  1.12.4.1   thorpej 	l->l_futex = f;
   1077  1.12.4.1   thorpej 	l->l_futex_wakesel = bitset;
   1078       1.4  riastrad 
   1079       1.4  riastrad 	/*
   1080  1.12.4.1   thorpej 	 * This is an inlined bit of sleepq_enter();
   1081  1.12.4.1   thorpej 	 * we already hold the lwp lock.
   1082       1.4  riastrad 	 */
   1083  1.12.4.1   thorpej 	lwp_unlock_to(l, f->fx_sq_lock);
   1084  1.12.4.1   thorpej 	KERNEL_UNLOCK_ALL(NULL, &l->l_biglocks);
   1085  1.12.4.1   thorpej 	KASSERT(lwp_locked(l, f->fx_sq_lock));
   1086       1.4  riastrad 
   1087  1.12.4.1   thorpej 	sleepq_enqueue(&f->fx_sqs[q], f, futex_wmesg, &futex_syncobj, true);
   1088  1.12.4.1   thorpej 	f->fx_nwaiters[q]++;
   1089      1.11  riastrad 
   1090  1.12.4.1   thorpej 	/* We can now safely release the op lock. */
   1091  1.12.4.1   thorpej 	mutex_exit(&f->fx_op_lock);
   1092      1.11  riastrad 
   1093  1.12.4.1   thorpej 	SDT_PROBE1(futex, wait, sleepq_block, entry, timo);
   1094  1.12.4.1   thorpej 	error = sleepq_block(timo, true);
   1095  1.12.4.1   thorpej 	SDT_PROBE1(futex, wait, sleepq_block, exit, error);
   1096      1.11  riastrad 
   1097  1.12.4.1   thorpej 	/*
   1098  1.12.4.1   thorpej 	 * f is no longer valid; we may have been moved to another
   1099  1.12.4.1   thorpej 	 * futex sleepq while we slept.
   1100  1.12.4.1   thorpej 	 */
   1101       1.4  riastrad 
   1102       1.4  riastrad 	/*
   1103  1.12.4.1   thorpej 	 * If something went wrong, we may still have a futex reference.
   1104  1.12.4.1   thorpej 	 * Check for that here and drop it if so.  We can do this without
   1105  1.12.4.1   thorpej 	 * taking any locks because l->l_futex is private to the LWP when
   1106  1.12.4.1   thorpej 	 * not on any futex's sleepq, and we are not on any sleepq because
   1107  1.12.4.1   thorpej 	 * we are running.
   1108  1.12.4.1   thorpej 	 *
   1109  1.12.4.1   thorpej 	 * Convert EWOULDBLOCK to ETIMEDOUT in case sleepq_block() returned
   1110  1.12.4.1   thorpej 	 * EWOULDBLOCK, and ensure the only other error we return is EINTR.
   1111       1.4  riastrad 	 */
   1112       1.4  riastrad 	if (error) {
   1113  1.12.4.1   thorpej 		f = l->l_futex;
   1114  1.12.4.1   thorpej 		if (f != NULL) {
   1115  1.12.4.1   thorpej 			SDT_PROBE0(futex, wait, finish, aborted);
   1116  1.12.4.1   thorpej 			l->l_futex = NULL;
   1117  1.12.4.1   thorpej 			futex_rele(f);
   1118  1.12.4.1   thorpej 		} else {
   1119  1.12.4.1   thorpej 			/* Raced with wakeup; report success. */
   1120  1.12.4.1   thorpej 			SDT_PROBE0(futex, wait, finish, wakerace);
   1121  1.12.4.1   thorpej 			error = 0;
   1122  1.12.4.1   thorpej 		}
   1123       1.4  riastrad 		if (error == EWOULDBLOCK)
   1124       1.4  riastrad 			error = ETIMEDOUT;
   1125  1.12.4.1   thorpej 		else if (error != ETIMEDOUT)
   1126  1.12.4.1   thorpej 			error = EINTR;
   1127  1.12.4.1   thorpej 	} else {
   1128  1.12.4.2   thorpej 		f = l->l_futex;
   1129  1.12.4.2   thorpej 		if (__predict_false(f != NULL)) {
   1130  1.12.4.2   thorpej 			/*
   1131  1.12.4.2   thorpej 			 * There are certain situations that can cause
   1132  1.12.4.2   thorpej 			 * sleepq_block() to return 0 even if we were
   1133  1.12.4.2   thorpej 			 * signalled (by e.g. SIGKILL).  In this case,
   1134  1.12.4.2   thorpej 			 * we will need to release our futex hold and
   1135  1.12.4.2   thorpej 			 * return EINTR (we're probably about to die
   1136  1.12.4.2   thorpej 			 * anyway).
   1137  1.12.4.2   thorpej 			 */
   1138  1.12.4.2   thorpej 			SDT_PROBE0(futex, wait, finish, aborted);
   1139  1.12.4.2   thorpej 			l->l_futex = NULL;
   1140  1.12.4.2   thorpej 			futex_rele(f);
   1141  1.12.4.2   thorpej 			error = EINTR;
   1142  1.12.4.2   thorpej 		} else {
   1143  1.12.4.2   thorpej 			SDT_PROBE0(futex, wait, finish, normally);
   1144  1.12.4.2   thorpej 		}
   1145       1.4  riastrad 	}
   1146       1.4  riastrad 
   1147       1.1   thorpej 	return error;
   1148       1.1   thorpej }
   1149       1.1   thorpej 
   1150       1.1   thorpej /*
   1151  1.12.4.1   thorpej  * futex_wake(f, q, nwake, f2, q2, nrequeue, bitset)
   1152       1.1   thorpej  *
   1153       1.1   thorpej  *	Wake up to nwake waiters on f matching bitset; then, if f2 is
   1154       1.1   thorpej  *	provided, move up to nrequeue remaining waiters on f matching
   1155       1.1   thorpej  *	bitset to f2.  Return the number of waiters actually woken.
   1156       1.1   thorpej  *	Caller must hold the locks of f and f2, if provided.
   1157       1.1   thorpej  */
   1158       1.1   thorpej static unsigned
   1159  1.12.4.1   thorpej futex_wake(struct futex *f, int q, unsigned int const nwake,
   1160  1.12.4.1   thorpej     struct futex *f2, int q2, unsigned int const nrequeue,
   1161  1.12.4.5   thorpej     int bitset, unsigned int *nrequeuedp)
   1162       1.1   thorpej {
   1163  1.12.4.1   thorpej 	struct lwp *l, *l_next;
   1164       1.1   thorpej 	unsigned nwoken = 0;
   1165  1.12.4.1   thorpej 	unsigned nrequeued = 0;
   1166       1.1   thorpej 
   1167  1.12.4.1   thorpej 	KASSERT(mutex_owned(&f->fx_op_lock));
   1168  1.12.4.1   thorpej 	KASSERT(f2 == NULL || mutex_owned(&f2->fx_op_lock));
   1169  1.12.4.1   thorpej 
   1170  1.12.4.1   thorpej 	futex_sq_lock2(f, f2);
   1171       1.1   thorpej 
   1172       1.1   thorpej 	/* Wake up to nwake waiters, and count the number woken.  */
   1173  1.12.4.1   thorpej 	LIST_FOREACH_SAFE(l, &f->fx_sqs[q], l_sleepchain, l_next) {
   1174  1.12.4.1   thorpej 		if (nwoken == nwake)
   1175       1.1   thorpej 			break;
   1176  1.12.4.1   thorpej 
   1177  1.12.4.1   thorpej 		KASSERT(l->l_syncobj == &futex_syncobj);
   1178  1.12.4.1   thorpej 		KASSERT(l->l_wchan == (wchan_t)f);
   1179  1.12.4.1   thorpej 		KASSERT(l->l_futex == f);
   1180  1.12.4.1   thorpej 		KASSERT(l->l_sleepq == &f->fx_sqs[q]);
   1181  1.12.4.1   thorpej 		KASSERT(l->l_mutex == f->fx_sq_lock);
   1182  1.12.4.1   thorpej 
   1183  1.12.4.1   thorpej 		if ((l->l_futex_wakesel & bitset) == 0)
   1184  1.12.4.1   thorpej 			continue;
   1185  1.12.4.1   thorpej 
   1186  1.12.4.1   thorpej 		l->l_futex_wakesel = 0;
   1187  1.12.4.1   thorpej 		l->l_futex = NULL;
   1188  1.12.4.1   thorpej 		sleepq_remove(&f->fx_sqs[q], l);
   1189  1.12.4.1   thorpej 		f->fx_nwaiters[q]--;
   1190  1.12.4.1   thorpej 		nwoken++;
   1191  1.12.4.1   thorpej 		/* hold counts adjusted in bulk below */
   1192       1.1   thorpej 	}
   1193       1.1   thorpej 
   1194  1.12.4.1   thorpej 	if (nrequeue) {
   1195  1.12.4.1   thorpej 		KASSERT(f2 != NULL);
   1196       1.1   thorpej 		/* Move up to nrequeue waiters from f's queue to f2's queue. */
   1197  1.12.4.1   thorpej 		LIST_FOREACH_SAFE(l, &f->fx_sqs[q], l_sleepchain, l_next) {
   1198  1.12.4.1   thorpej 			if (nrequeued == nrequeue)
   1199       1.1   thorpej 				break;
   1200  1.12.4.1   thorpej 
   1201  1.12.4.1   thorpej 			KASSERT(l->l_syncobj == &futex_syncobj);
   1202  1.12.4.1   thorpej 			KASSERT(l->l_wchan == (wchan_t)f);
   1203  1.12.4.1   thorpej 			KASSERT(l->l_futex == f);
   1204  1.12.4.1   thorpej 			KASSERT(l->l_sleepq == &f->fx_sqs[q]);
   1205  1.12.4.1   thorpej 			KASSERT(l->l_mutex == f->fx_sq_lock);
   1206  1.12.4.1   thorpej 
   1207  1.12.4.1   thorpej 			if ((l->l_futex_wakesel & bitset) == 0)
   1208  1.12.4.1   thorpej 				continue;
   1209  1.12.4.1   thorpej 
   1210  1.12.4.1   thorpej 			l->l_futex = f2;
   1211  1.12.4.1   thorpej 			sleepq_transfer(l, &f->fx_sqs[q],
   1212  1.12.4.1   thorpej 			    &f2->fx_sqs[q2], f2, futex_wmesg,
   1213  1.12.4.1   thorpej 			    &futex_syncobj, f2->fx_sq_lock, true);
   1214  1.12.4.1   thorpej 			f->fx_nwaiters[q]--;
   1215  1.12.4.1   thorpej 			f2->fx_nwaiters[q2]++;
   1216  1.12.4.1   thorpej 			nrequeued++;
   1217  1.12.4.1   thorpej 			/* hold counts adjusted in bulk below */
   1218  1.12.4.1   thorpej 		}
   1219  1.12.4.1   thorpej 		if (nrequeued) {
   1220  1.12.4.1   thorpej 			/* XXX futex_hold() could theoretically fail here. */
   1221  1.12.4.1   thorpej 			int hold_error __diagused =
   1222  1.12.4.1   thorpej 			    futex_hold_count(f2, nrequeued);
   1223  1.12.4.1   thorpej 			KASSERT(hold_error == 0);
   1224       1.1   thorpej 		}
   1225       1.1   thorpej 	}
   1226       1.1   thorpej 
   1227  1.12.4.1   thorpej 	/*
   1228  1.12.4.1   thorpej 	 * Adjust the futex reference counts for the wakes and
   1229  1.12.4.1   thorpej 	 * requeues.
   1230  1.12.4.1   thorpej 	 */
   1231  1.12.4.1   thorpej 	KASSERT(nwoken + nrequeued <= LONG_MAX);
   1232  1.12.4.1   thorpej 	futex_rele_count_not_last(f, nwoken + nrequeued);
   1233  1.12.4.1   thorpej 
   1234  1.12.4.1   thorpej 	futex_sq_unlock2(f, f2);
   1235  1.12.4.1   thorpej 
   1236  1.12.4.1   thorpej 	/* Return the number of waiters woken and requeued.  */
   1237  1.12.4.5   thorpej 	if (nrequeuedp != NULL) {
   1238  1.12.4.5   thorpej 		*nrequeuedp = nrequeued;
   1239  1.12.4.5   thorpej 	}
   1240  1.12.4.5   thorpej 	return nwoken;
   1241       1.1   thorpej }
   1242       1.1   thorpej 
   1243       1.1   thorpej /*
   1244  1.12.4.1   thorpej  * futex_op_lock(f)
   1245       1.1   thorpej  *
   1246  1.12.4.1   thorpej  *	Acquire the op lock of f.  Pair with futex_op_unlock.  Do
   1247       1.1   thorpej  *	not use if caller needs to acquire two locks; use
   1248  1.12.4.1   thorpej  *	futex_op_lock2 instead.
   1249       1.1   thorpej  */
   1250       1.1   thorpej static void
   1251  1.12.4.1   thorpej futex_op_lock(struct futex *f)
   1252       1.1   thorpej {
   1253  1.12.4.1   thorpej 	mutex_enter(&f->fx_op_lock);
   1254       1.1   thorpej }
   1255       1.1   thorpej 
   1256       1.1   thorpej /*
   1257  1.12.4.1   thorpej  * futex_op_unlock(f)
   1258       1.1   thorpej  *
   1259  1.12.4.3   thorpej  *	Release the op lock of f.
   1260       1.1   thorpej  */
   1261       1.1   thorpej static void
   1262  1.12.4.1   thorpej futex_op_unlock(struct futex *f)
   1263       1.1   thorpej {
   1264  1.12.4.1   thorpej 	mutex_exit(&f->fx_op_lock);
   1265       1.1   thorpej }
   1266       1.1   thorpej 
   1267       1.1   thorpej /*
   1268  1.12.4.1   thorpej  * futex_op_lock2(f, f2)
   1269       1.1   thorpej  *
   1270  1.12.4.1   thorpej  *	Acquire the op locks of both f and f2, which may be null, or
   1271  1.12.4.1   thorpej  *	which may be the same.  If they are distinct, an arbitrary total
   1272  1.12.4.1   thorpej  *	order is chosen on the locks.
   1273       1.1   thorpej  *
   1274  1.12.4.1   thorpej  *	Callers should only ever acquire multiple op locks
   1275  1.12.4.1   thorpej  *	simultaneously using futex_op_lock2.
   1276       1.1   thorpej  */
   1277       1.1   thorpej static void
   1278  1.12.4.1   thorpej futex_op_lock2(struct futex *f, struct futex *f2)
   1279       1.1   thorpej {
   1280       1.1   thorpej 
   1281       1.1   thorpej 	/*
   1282       1.1   thorpej 	 * If both are null, do nothing; if one is null and the other
   1283       1.1   thorpej 	 * is not, lock the other and be done with it.
   1284       1.1   thorpej 	 */
   1285       1.1   thorpej 	if (f == NULL && f2 == NULL) {
   1286       1.1   thorpej 		return;
   1287       1.1   thorpej 	} else if (f == NULL) {
   1288  1.12.4.1   thorpej 		mutex_enter(&f2->fx_op_lock);
   1289       1.1   thorpej 		return;
   1290       1.1   thorpej 	} else if (f2 == NULL) {
   1291  1.12.4.1   thorpej 		mutex_enter(&f->fx_op_lock);
   1292       1.1   thorpej 		return;
   1293       1.1   thorpej 	}
   1294       1.1   thorpej 
   1295       1.1   thorpej 	/* If both futexes are the same, acquire only one. */
   1296       1.1   thorpej 	if (f == f2) {
   1297  1.12.4.1   thorpej 		mutex_enter(&f->fx_op_lock);
   1298       1.1   thorpej 		return;
   1299       1.1   thorpej 	}
   1300       1.1   thorpej 
   1301       1.1   thorpej 	/* Otherwise, use the ordering on the kva of the futex pointer.  */
   1302       1.1   thorpej 	if ((uintptr_t)f < (uintptr_t)f2) {
   1303  1.12.4.1   thorpej 		mutex_enter(&f->fx_op_lock);
   1304  1.12.4.1   thorpej 		mutex_enter(&f2->fx_op_lock);
   1305       1.1   thorpej 	} else {
   1306  1.12.4.1   thorpej 		mutex_enter(&f2->fx_op_lock);
   1307  1.12.4.1   thorpej 		mutex_enter(&f->fx_op_lock);
   1308       1.1   thorpej 	}
   1309       1.1   thorpej }
   1310       1.1   thorpej 
   1311       1.1   thorpej /*
   1312  1.12.4.1   thorpej  * futex_op_unlock2(f, f2)
   1313       1.1   thorpej  *
    1314       1.1   thorpej  *	Release the op locks of both f and f2, which may be null, or
    1315       1.1   thorpej  *	which may be the same.
   1316       1.1   thorpej  */
   1317       1.1   thorpej static void
   1318  1.12.4.1   thorpej futex_op_unlock2(struct futex *f, struct futex *f2)
   1319       1.1   thorpej {
   1320       1.1   thorpej 
   1321       1.1   thorpej 	/*
   1322       1.1   thorpej 	 * If both are null, do nothing; if one is null and the other
   1323       1.1   thorpej 	 * is not, unlock the other and be done with it.
   1324       1.1   thorpej 	 */
   1325       1.1   thorpej 	if (f == NULL && f2 == NULL) {
   1326       1.1   thorpej 		return;
   1327       1.1   thorpej 	} else if (f == NULL) {
   1328  1.12.4.1   thorpej 		mutex_exit(&f2->fx_op_lock);
   1329       1.1   thorpej 		return;
   1330       1.1   thorpej 	} else if (f2 == NULL) {
   1331  1.12.4.1   thorpej 		mutex_exit(&f->fx_op_lock);
   1332       1.1   thorpej 		return;
   1333       1.1   thorpej 	}
   1334       1.1   thorpej 
   1335       1.1   thorpej 	/* If both futexes are the same, release only one. */
   1336       1.1   thorpej 	if (f == f2) {
   1337  1.12.4.1   thorpej 		mutex_exit(&f->fx_op_lock);
   1338       1.1   thorpej 		return;
   1339       1.1   thorpej 	}
   1340       1.1   thorpej 
   1341       1.1   thorpej 	/* Otherwise, use the ordering on the kva of the futex pointer.  */
   1342       1.1   thorpej 	if ((uintptr_t)f < (uintptr_t)f2) {
   1343  1.12.4.1   thorpej 		mutex_exit(&f2->fx_op_lock);
   1344  1.12.4.1   thorpej 		mutex_exit(&f->fx_op_lock);
   1345       1.1   thorpej 	} else {
   1346  1.12.4.1   thorpej 		mutex_exit(&f->fx_op_lock);
   1347  1.12.4.1   thorpej 		mutex_exit(&f2->fx_op_lock);
   1348       1.1   thorpej 	}
   1349       1.1   thorpej }
   1350       1.1   thorpej 
   1351       1.1   thorpej /*
   1352       1.1   thorpej  * futex_func_wait(uaddr, val, val3, timeout, clkid, clkflags, retval)
   1353       1.1   thorpej  *
   1354  1.12.4.1   thorpej  *	Implement futex(FUTEX_WAIT) and futex(FUTEX_WAIT_BITSET).
   1355       1.1   thorpej  */
   1356       1.1   thorpej static int
   1357       1.1   thorpej futex_func_wait(bool shared, int *uaddr, int val, int val3,
   1358      1.11  riastrad     const struct timespec *timeout, clockid_t clkid, int clkflags,
   1359       1.1   thorpej     register_t *retval)
   1360       1.1   thorpej {
   1361       1.1   thorpej 	struct futex *f;
   1362      1.11  riastrad 	struct timespec ts;
   1363      1.11  riastrad 	const struct timespec *deadline;
   1364       1.1   thorpej 	int error;
   1365       1.1   thorpej 
   1366       1.6  riastrad 	/*
   1367       1.6  riastrad 	 * If there's nothing to wait for, and nobody will ever wake
   1368       1.6  riastrad 	 * us, then don't set anything up to wait -- just stop here.
   1369       1.6  riastrad 	 */
   1370       1.6  riastrad 	if (val3 == 0)
   1371       1.7  riastrad 		return EINVAL;
   1372       1.6  riastrad 
   1373       1.1   thorpej 	/* Optimistically test before anything else.  */
   1374       1.1   thorpej 	if (!futex_test(uaddr, val))
   1375       1.1   thorpej 		return EAGAIN;
   1376       1.1   thorpej 
   1377      1.11  riastrad 	/* Determine a deadline on the specified clock.  */
   1378  1.12.4.1   thorpej 	if (timeout == NULL) {
   1379  1.12.4.1   thorpej 		deadline = timeout;
   1380  1.12.4.1   thorpej 	} else if ((clkflags & TIMER_ABSTIME) == TIMER_ABSTIME) {
   1381  1.12.4.1   thorpej 		/* Sanity-check the deadline. */
   1382  1.12.4.1   thorpej 		if (timeout->tv_sec < 0 ||
   1383  1.12.4.1   thorpej 		    timeout->tv_nsec < 0 ||
   1384  1.12.4.1   thorpej 		    timeout->tv_nsec >= 1000000000L) {
   1385  1.12.4.1   thorpej 			return EINVAL;
   1386  1.12.4.1   thorpej 		}
   1387      1.11  riastrad 		deadline = timeout;
   1388      1.11  riastrad 	} else {
   1389  1.12.4.1   thorpej 		struct timespec interval = *timeout;
   1390  1.12.4.1   thorpej 
   1391  1.12.4.1   thorpej 		error = itimespecfix(&interval);
   1392  1.12.4.1   thorpej 		if (error)
   1393  1.12.4.1   thorpej 			return error;
   1394      1.11  riastrad 		error = clock_gettime1(clkid, &ts);
   1395      1.11  riastrad 		if (error)
   1396      1.11  riastrad 			return error;
   1397  1.12.4.1   thorpej 		timespecadd(&ts, &interval, &ts);
   1398      1.11  riastrad 		deadline = &ts;
   1399      1.11  riastrad 	}
   1400      1.11  riastrad 
   1401       1.1   thorpej 	/* Get the futex, creating it if necessary.  */
   1402  1.12.4.1   thorpej 	error = futex_lookup_create(uaddr, shared, FUTEX_CLASS_NORMAL, &f);
   1403       1.1   thorpej 	if (error)
   1404       1.1   thorpej 		return error;
   1405       1.1   thorpej 	KASSERT(f);
   1406       1.1   thorpej 
   1407       1.1   thorpej 	/*
   1408  1.12.4.1   thorpej 	 * Under the op lock, check the value again: if it has
   1409       1.1   thorpej 	 * already changed, EAGAIN; otherwise enqueue the waiter.
   1410       1.1   thorpej 	 * Since FUTEX_WAKE will use the same lock and be done after
   1411       1.1   thorpej 	 * modifying the value, the order in which we check and enqueue
   1412       1.1   thorpej 	 * is immaterial.
   1413       1.1   thorpej 	 */
   1414  1.12.4.1   thorpej 	futex_op_lock(f);
   1415       1.1   thorpej 	if (!futex_test(uaddr, val)) {
   1416  1.12.4.1   thorpej 		futex_op_unlock(f);
   1417       1.1   thorpej 		error = EAGAIN;
   1418       1.1   thorpej 		goto out;
   1419       1.1   thorpej 	}
   1420  1.12.4.1   thorpej 
   1421  1.12.4.1   thorpej 	/*
   1422  1.12.4.1   thorpej 	 * Now wait.  futex_wait() will drop our op lock once we
   1423  1.12.4.1   thorpej 	 * are entered into the sleep queue, thus ensuring atomicity
   1424  1.12.4.1   thorpej 	 * of wakes with respect to waits.
   1425  1.12.4.1   thorpej 	 */
   1426  1.12.4.1   thorpej 	error = futex_wait(f, FUTEX_WRITERQ, deadline, clkid, val3);
   1427       1.1   thorpej 
   1428       1.1   thorpej 	/*
   1429       1.1   thorpej 	 * We cannot drop our reference to the futex here, because
   1430       1.1   thorpej 	 * we might be enqueued on a different one when we are awakened.
   1431  1.12.4.1   thorpej 	 * The references will be managed on our behalf in the requeue,
   1432  1.12.4.1   thorpej 	 * wake, and error cases.
   1433       1.1   thorpej 	 */
   1434       1.1   thorpej 	f = NULL;
   1435       1.1   thorpej 
   1436  1.12.4.1   thorpej 	if (__predict_true(error == 0)) {
   1437  1.12.4.1   thorpej 		/* Return 0 on success, error on failure. */
   1438  1.12.4.1   thorpej 		*retval = 0;
   1439  1.12.4.1   thorpej 	}
   1440       1.1   thorpej 
   1441       1.1   thorpej out:	if (f != NULL)
   1442       1.5  riastrad 		futex_rele(f);
   1443       1.1   thorpej 	return error;
   1444       1.1   thorpej }
   1445       1.1   thorpej 
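/*
 * Illustrative sketch, not part of the kernel code path: from
 * userland's point of view FUTEX_WAIT takes a relative timeout,
 * while FUTEX_WAIT_BITSET takes an absolute deadline on the
 * selected clock (CLOCK_MONOTONIC unless FUTEX_CLOCK_REALTIME is
 * set).  A caller wanting "wait at most 100ms" with the bitset
 * variant might do something like the following; the identifiers
 * other than the futex constants are hypothetical:
 *
 *	struct timespec deadline;
 *	clock_gettime(CLOCK_MONOTONIC, &deadline);
 *	deadline.tv_nsec += 100 * 1000 * 1000;
 *	if (deadline.tv_nsec >= 1000000000L) {
 *		deadline.tv_sec++;
 *		deadline.tv_nsec -= 1000000000L;
 *	}
 *	__futex(&word, FUTEX_WAIT_BITSET | FUTEX_PRIVATE_FLAG,
 *	    expected, &deadline, NULL, 0, FUTEX_BITSET_MATCH_ANY);
 */
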
   1446       1.1   thorpej /*
   1447       1.1   thorpej  * futex_func_wake(uaddr, val, val3, retval)
   1448       1.1   thorpej  *
   1449       1.1   thorpej  *	Implement futex(FUTEX_WAKE) and futex(FUTEX_WAKE_BITSET).
   1450       1.1   thorpej  */
   1451       1.1   thorpej static int
   1452       1.1   thorpej futex_func_wake(bool shared, int *uaddr, int val, int val3, register_t *retval)
   1453       1.1   thorpej {
   1454       1.1   thorpej 	struct futex *f;
   1455       1.1   thorpej 	unsigned int nwoken = 0;
   1456       1.1   thorpej 	int error = 0;
   1457       1.1   thorpej 
   1458       1.1   thorpej 	/* Reject negative number of wakeups.  */
   1459       1.1   thorpej 	if (val < 0) {
   1460       1.1   thorpej 		error = EINVAL;
   1461       1.1   thorpej 		goto out;
   1462       1.1   thorpej 	}
   1463       1.1   thorpej 
   1464       1.1   thorpej 	/* Look up the futex, if any.  */
   1465  1.12.4.1   thorpej 	error = futex_lookup(uaddr, shared, FUTEX_CLASS_NORMAL, &f);
   1466       1.1   thorpej 	if (error)
   1467       1.1   thorpej 		goto out;
   1468       1.1   thorpej 
   1469       1.1   thorpej 	/* If there's no futex, there are no waiters to wake.  */
   1470       1.1   thorpej 	if (f == NULL)
   1471       1.1   thorpej 		goto out;
   1472       1.1   thorpej 
   1473       1.1   thorpej 	/*
   1474  1.12.4.1   thorpej 	 * Under f's op lock, wake the waiters and remember the
   1475       1.1   thorpej 	 * number woken.
   1476       1.1   thorpej 	 */
   1477  1.12.4.1   thorpej 	futex_op_lock(f);
   1478  1.12.4.1   thorpej 	nwoken = futex_wake(f, FUTEX_WRITERQ, val,
   1479  1.12.4.5   thorpej 			    NULL, FUTEX_WRITERQ, 0, val3, NULL);
   1480  1.12.4.1   thorpej 	futex_op_unlock(f);
   1481       1.1   thorpej 
   1482       1.1   thorpej 	/* Release the futex.  */
   1483       1.5  riastrad 	futex_rele(f);
   1484       1.1   thorpej 
   1485       1.1   thorpej out:
   1486       1.1   thorpej 	/* Return the number of waiters woken.  */
   1487       1.1   thorpej 	*retval = nwoken;
   1488       1.1   thorpej 
   1489       1.1   thorpej 	/* Success!  */
   1490       1.1   thorpej 	return error;
   1491       1.1   thorpej }
   1492       1.1   thorpej 
   1493       1.1   thorpej /*
   1494       1.1   thorpej  * futex_func_requeue(op, uaddr, val, uaddr2, val2, val3, retval)
   1495       1.1   thorpej  *
   1496       1.1   thorpej  *	Implement futex(FUTEX_REQUEUE) and futex(FUTEX_CMP_REQUEUE).
   1497       1.1   thorpej  */
   1498       1.1   thorpej static int
   1499       1.1   thorpej futex_func_requeue(bool shared, int op, int *uaddr, int val, int *uaddr2,
   1500       1.1   thorpej     int val2, int val3, register_t *retval)
   1501       1.1   thorpej {
   1502       1.1   thorpej 	struct futex *f = NULL, *f2 = NULL;
   1503       1.1   thorpej 	unsigned nwoken = 0;	/* default to zero woken on early return */
   1504  1.12.4.5   thorpej 	unsigned nrequeued = 0;
   1505       1.1   thorpej 	int error;
   1506       1.1   thorpej 
   1507       1.1   thorpej 	/* Reject negative number of wakeups or requeues. */
   1508       1.1   thorpej 	if (val < 0 || val2 < 0) {
   1509       1.1   thorpej 		error = EINVAL;
   1510       1.1   thorpej 		goto out;
   1511       1.1   thorpej 	}
   1512       1.1   thorpej 
   1513       1.1   thorpej 	/* Look up the source futex, if any. */
   1514  1.12.4.1   thorpej 	error = futex_lookup(uaddr, shared, FUTEX_CLASS_NORMAL, &f);
   1515       1.1   thorpej 	if (error)
   1516       1.1   thorpej 		goto out;
   1517       1.1   thorpej 
   1518       1.1   thorpej 	/* If there is none, nothing to do. */
   1519       1.1   thorpej 	if (f == NULL)
   1520       1.1   thorpej 		goto out;
   1521       1.1   thorpej 
   1522       1.1   thorpej 	/*
   1523       1.1   thorpej 	 * We may need to create the destination futex because it's
   1524       1.1   thorpej 	 * entirely possible it does not currently have any waiters.
   1525       1.1   thorpej 	 */
   1526  1.12.4.1   thorpej 	error = futex_lookup_create(uaddr2, shared, FUTEX_CLASS_NORMAL, &f2);
   1527       1.1   thorpej 	if (error)
   1528       1.1   thorpej 		goto out;
   1529       1.1   thorpej 
   1530       1.1   thorpej 	/*
   1531  1.12.4.1   thorpej 	 * Under the futexes' op locks, check the value; if
   1532       1.1   thorpej 	 * unchanged from val3, wake the waiters.
   1533       1.1   thorpej 	 */
   1534  1.12.4.1   thorpej 	futex_op_lock2(f, f2);
   1535       1.1   thorpej 	if (op == FUTEX_CMP_REQUEUE && !futex_test(uaddr, val3)) {
   1536       1.1   thorpej 		error = EAGAIN;
   1537       1.1   thorpej 	} else {
   1538       1.1   thorpej 		error = 0;
   1539  1.12.4.1   thorpej 		nwoken = futex_wake(f, FUTEX_WRITERQ, val,
   1540  1.12.4.1   thorpej 				    f2, FUTEX_WRITERQ, val2,
   1541  1.12.4.5   thorpej 				    FUTEX_BITSET_MATCH_ANY,
   1542  1.12.4.5   thorpej 				    &nrequeued);
   1543       1.1   thorpej 	}
   1544  1.12.4.1   thorpej 	futex_op_unlock2(f, f2);
   1545       1.1   thorpej 
   1546       1.1   thorpej out:
   1547  1.12.4.5   thorpej 	/*
    1548  1.12.4.5   thorpej 	 * For FUTEX_REQUEUE, return the number of waiters woken.
   1549  1.12.4.5   thorpej 	 *
   1550  1.12.4.5   thorpej 	 * For FUTEX_CMP_REQUEUE, return the number of waiters woken
   1551  1.12.4.5   thorpej 	 * **and** requeued.
   1552  1.12.4.5   thorpej 	 */
    1553  1.12.4.5   thorpej 	*retval = nwoken + ((op == FUTEX_CMP_REQUEUE) ? nrequeued : 0);
   1554       1.1   thorpej 
   1555       1.1   thorpej 	/* Release the futexes if we got them.  */
   1556       1.1   thorpej 	if (f2)
   1557       1.5  riastrad 		futex_rele(f2);
   1558       1.1   thorpej 	if (f)
   1559       1.5  riastrad 		futex_rele(f);
   1560       1.1   thorpej 	return error;
   1561       1.1   thorpej }
   1562       1.1   thorpej 
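/*
 * Illustrative sketch, not a kernel code path: FUTEX_CMP_REQUEUE is
 * the usual building block for a condition-variable broadcast, where
 * waking every waiter would only make them all collide on the mutex.
 * A hypothetical userland broadcast wakes one waiter and shunts the
 * rest onto the mutex's futex word instead (cv, mtx, and seq_snapshot
 * are made-up names):
 *
 *	__futex(&cv->seq, FUTEX_CMP_REQUEUE | FUTEX_PRIVATE_FLAG,
 *	    1,				// wake at most one waiter
 *	    NULL,			// no timeout for requeue ops
 *	    &mtx->word,			// requeue target
 *	    INT_MAX,			// requeue all the rest (val2)
 *	    seq_snapshot);		// EAGAIN if cv->seq has moved
 */
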
   1563       1.1   thorpej /*
   1564       1.1   thorpej  * futex_validate_op_cmp(val3)
   1565       1.1   thorpej  *
   1566       1.1   thorpej  *	Validate an op/cmp argument for FUTEX_WAKE_OP.
   1567       1.1   thorpej  */
   1568       1.1   thorpej static int
   1569       1.1   thorpej futex_validate_op_cmp(int val3)
   1570       1.1   thorpej {
   1571       1.1   thorpej 	int op = __SHIFTOUT(val3, FUTEX_OP_OP_MASK);
   1572       1.1   thorpej 	int cmp = __SHIFTOUT(val3, FUTEX_OP_CMP_MASK);
   1573       1.1   thorpej 
   1574       1.1   thorpej 	if (op & FUTEX_OP_OPARG_SHIFT) {
   1575       1.1   thorpej 		int oparg = __SHIFTOUT(val3, FUTEX_OP_OPARG_MASK);
   1576       1.1   thorpej 		if (oparg < 0)
   1577       1.1   thorpej 			return EINVAL;
   1578       1.1   thorpej 		if (oparg >= 32)
   1579       1.1   thorpej 			return EINVAL;
   1580       1.1   thorpej 		op &= ~FUTEX_OP_OPARG_SHIFT;
   1581       1.1   thorpej 	}
   1582       1.1   thorpej 
   1583       1.1   thorpej 	switch (op) {
   1584       1.1   thorpej 	case FUTEX_OP_SET:
   1585       1.1   thorpej 	case FUTEX_OP_ADD:
   1586       1.1   thorpej 	case FUTEX_OP_OR:
   1587       1.1   thorpej 	case FUTEX_OP_ANDN:
   1588       1.1   thorpej 	case FUTEX_OP_XOR:
   1589       1.1   thorpej 		break;
   1590       1.1   thorpej 	default:
   1591       1.1   thorpej 		return EINVAL;
   1592       1.1   thorpej 	}
   1593       1.1   thorpej 
   1594       1.1   thorpej 	switch (cmp) {
   1595       1.1   thorpej 	case FUTEX_OP_CMP_EQ:
   1596       1.1   thorpej 	case FUTEX_OP_CMP_NE:
   1597       1.1   thorpej 	case FUTEX_OP_CMP_LT:
   1598       1.1   thorpej 	case FUTEX_OP_CMP_LE:
   1599       1.1   thorpej 	case FUTEX_OP_CMP_GT:
   1600       1.1   thorpej 	case FUTEX_OP_CMP_GE:
   1601       1.1   thorpej 		break;
   1602       1.1   thorpej 	default:
   1603       1.1   thorpej 		return EINVAL;
   1604       1.1   thorpej 	}
   1605       1.1   thorpej 
   1606       1.1   thorpej 	return 0;
   1607       1.1   thorpej }
   1608       1.1   thorpej 
   1609       1.1   thorpej /*
   1610       1.1   thorpej  * futex_compute_op(oldval, val3)
   1611       1.1   thorpej  *
    1612       1.1   thorpej  *	Apply a FUTEX_WAKE_OP operation to oldval.
   1613       1.1   thorpej  */
   1614       1.1   thorpej static int
   1615       1.1   thorpej futex_compute_op(int oldval, int val3)
   1616       1.1   thorpej {
   1617       1.1   thorpej 	int op = __SHIFTOUT(val3, FUTEX_OP_OP_MASK);
   1618       1.1   thorpej 	int oparg = __SHIFTOUT(val3, FUTEX_OP_OPARG_MASK);
   1619       1.1   thorpej 
   1620       1.1   thorpej 	if (op & FUTEX_OP_OPARG_SHIFT) {
   1621       1.1   thorpej 		KASSERT(oparg >= 0);
   1622       1.1   thorpej 		KASSERT(oparg < 32);
   1623       1.1   thorpej 		oparg = 1u << oparg;
   1624       1.1   thorpej 		op &= ~FUTEX_OP_OPARG_SHIFT;
   1625       1.1   thorpej 	}
   1626       1.1   thorpej 
   1627       1.1   thorpej 	switch (op) {
   1628       1.1   thorpej 	case FUTEX_OP_SET:
   1629       1.1   thorpej 		return oparg;
   1630       1.1   thorpej 
   1631       1.1   thorpej 	case FUTEX_OP_ADD:
   1632       1.1   thorpej 		/*
   1633       1.1   thorpej 		 * Avoid signed arithmetic overflow by doing
   1634       1.1   thorpej 		 * arithmetic unsigned and converting back to signed
   1635       1.1   thorpej 		 * at the end.
   1636       1.1   thorpej 		 */
   1637       1.1   thorpej 		return (int)((unsigned)oldval + (unsigned)oparg);
   1638       1.1   thorpej 
   1639       1.1   thorpej 	case FUTEX_OP_OR:
   1640       1.1   thorpej 		return oldval | oparg;
   1641       1.1   thorpej 
   1642       1.1   thorpej 	case FUTEX_OP_ANDN:
   1643       1.1   thorpej 		return oldval & ~oparg;
   1644       1.1   thorpej 
   1645       1.1   thorpej 	case FUTEX_OP_XOR:
   1646       1.1   thorpej 		return oldval ^ oparg;
   1647       1.1   thorpej 
   1648       1.1   thorpej 	default:
   1649       1.1   thorpej 		panic("invalid futex op");
   1650       1.1   thorpej 	}
   1651       1.1   thorpej }
   1652       1.1   thorpej 
   1653       1.1   thorpej /*
   1654       1.1   thorpej  * futex_compute_cmp(oldval, val3)
   1655       1.1   thorpej  *
    1656       1.1   thorpej  *	Apply a FUTEX_WAKE_OP comparison to oldval.
   1657       1.1   thorpej  */
   1658       1.1   thorpej static bool
   1659       1.1   thorpej futex_compute_cmp(int oldval, int val3)
   1660       1.1   thorpej {
   1661       1.1   thorpej 	int cmp = __SHIFTOUT(val3, FUTEX_OP_CMP_MASK);
   1662       1.1   thorpej 	int cmparg = __SHIFTOUT(val3, FUTEX_OP_CMPARG_MASK);
   1663       1.1   thorpej 
   1664       1.1   thorpej 	switch (cmp) {
   1665       1.1   thorpej 	case FUTEX_OP_CMP_EQ:
   1666       1.1   thorpej 		return (oldval == cmparg);
   1667       1.1   thorpej 
   1668       1.1   thorpej 	case FUTEX_OP_CMP_NE:
   1669       1.1   thorpej 		return (oldval != cmparg);
   1670       1.1   thorpej 
   1671       1.1   thorpej 	case FUTEX_OP_CMP_LT:
   1672       1.1   thorpej 		return (oldval < cmparg);
   1673       1.1   thorpej 
   1674       1.1   thorpej 	case FUTEX_OP_CMP_LE:
   1675       1.1   thorpej 		return (oldval <= cmparg);
   1676       1.1   thorpej 
   1677       1.1   thorpej 	case FUTEX_OP_CMP_GT:
   1678       1.1   thorpej 		return (oldval > cmparg);
   1679       1.1   thorpej 
   1680       1.1   thorpej 	case FUTEX_OP_CMP_GE:
   1681       1.1   thorpej 		return (oldval >= cmparg);
   1682       1.1   thorpej 
   1683       1.1   thorpej 	default:
   1684       1.1   thorpej 		panic("invalid futex cmp operation");
   1685       1.1   thorpej 	}
   1686       1.1   thorpej }
   1687       1.1   thorpej 
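/*
 * Illustrative note: the val3 word decoded by the helpers above packs
 * four fields.  Assuming the Linux-compatible layout behind the
 * FUTEX_OP_*_MASK definitions (op in the top nibble, cmp in the next
 * nibble, then 12-bit oparg and cmparg), userland could build it with
 * a hypothetical helper such as:
 *
 *	#define FUTEX_OP_ENCODE(op, oparg, cmp, cmparg)		  \
 *		(((op) << 28) | ((cmp) << 24) |			  \
 *		 (((oparg) & 0xfff) << 12) | ((cmparg) & 0xfff))
 *
 *	// "set *uaddr2 = 1, then wake uaddr2 waiters if it was 0"
 *	int val3 = FUTEX_OP_ENCODE(FUTEX_OP_SET, 1, FUTEX_OP_CMP_EQ, 0);
 */
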
   1688       1.1   thorpej /*
   1689       1.1   thorpej  * futex_func_wake_op(uaddr, val, uaddr2, val2, val3, retval)
   1690       1.1   thorpej  *
   1691       1.1   thorpej  *	Implement futex(FUTEX_WAKE_OP).
   1692       1.1   thorpej  */
   1693       1.1   thorpej static int
   1694       1.1   thorpej futex_func_wake_op(bool shared, int *uaddr, int val, int *uaddr2, int val2,
   1695       1.1   thorpej     int val3, register_t *retval)
   1696       1.1   thorpej {
   1697       1.1   thorpej 	struct futex *f = NULL, *f2 = NULL;
   1698       1.1   thorpej 	int oldval, newval, actual;
   1699       1.1   thorpej 	unsigned nwoken = 0;
   1700       1.1   thorpej 	int error;
   1701       1.1   thorpej 
   1702       1.1   thorpej 	/* Reject negative number of wakeups.  */
   1703       1.1   thorpej 	if (val < 0 || val2 < 0) {
   1704       1.1   thorpej 		error = EINVAL;
   1705       1.1   thorpej 		goto out;
   1706       1.1   thorpej 	}
   1707       1.1   thorpej 
   1708       1.1   thorpej 	/* Reject invalid operations before we start doing things.  */
   1709       1.1   thorpej 	if ((error = futex_validate_op_cmp(val3)) != 0)
   1710       1.1   thorpej 		goto out;
   1711       1.1   thorpej 
   1712       1.1   thorpej 	/* Look up the first futex, if any.  */
   1713  1.12.4.1   thorpej 	error = futex_lookup(uaddr, shared, FUTEX_CLASS_NORMAL, &f);
   1714       1.1   thorpej 	if (error)
   1715       1.1   thorpej 		goto out;
   1716       1.1   thorpej 
   1717       1.1   thorpej 	/* Look up the second futex, if any.  */
   1718  1.12.4.1   thorpej 	error = futex_lookup(uaddr2, shared, FUTEX_CLASS_NORMAL, &f2);
   1719       1.1   thorpej 	if (error)
   1720       1.1   thorpej 		goto out;
   1721       1.1   thorpej 
   1722       1.1   thorpej 	/*
   1723  1.12.4.1   thorpej 	 * Under the op locks:
   1724       1.1   thorpej 	 *
   1725       1.1   thorpej 	 * 1. Read/modify/write: *uaddr2 op= oparg.
   1726       1.1   thorpej 	 * 2. Unconditionally wake uaddr.
    1727  1.12.4.1   thorpej 	 * 3. Conditionally wake uaddr2, if its old value satisfies the val3 comparison.
   1728       1.1   thorpej 	 */
   1729  1.12.4.1   thorpej 	futex_op_lock2(f, f2);
   1730       1.1   thorpej 	do {
   1731       1.1   thorpej 		error = futex_load(uaddr2, &oldval);
   1732       1.1   thorpej 		if (error)
   1733       1.1   thorpej 			goto out_unlock;
   1734       1.1   thorpej 		newval = futex_compute_op(oldval, val3);
   1735       1.1   thorpej 		error = ucas_int(uaddr2, oldval, newval, &actual);
   1736       1.1   thorpej 		if (error)
   1737       1.1   thorpej 			goto out_unlock;
   1738       1.1   thorpej 	} while (actual != oldval);
   1739  1.12.4.1   thorpej 	nwoken = (f ? futex_wake(f, FUTEX_WRITERQ, val,
   1740  1.12.4.1   thorpej 				 NULL, FUTEX_WRITERQ, 0,
   1741  1.12.4.5   thorpej 				 FUTEX_BITSET_MATCH_ANY, NULL) : 0);
   1742       1.1   thorpej 	if (f2 && futex_compute_cmp(oldval, val3))
   1743  1.12.4.1   thorpej 		nwoken += futex_wake(f2, FUTEX_WRITERQ, val2,
   1744  1.12.4.1   thorpej 				     NULL, FUTEX_WRITERQ, 0,
   1745  1.12.4.5   thorpej 				     FUTEX_BITSET_MATCH_ANY, NULL);
   1746       1.1   thorpej 
   1747       1.1   thorpej 	/* Success! */
   1748       1.1   thorpej 	error = 0;
   1749       1.1   thorpej out_unlock:
   1750  1.12.4.1   thorpej 	futex_op_unlock2(f, f2);
   1751       1.1   thorpej 
   1752       1.1   thorpej out:
   1753       1.1   thorpej 	/* Return the number of waiters woken. */
   1754       1.1   thorpej 	*retval = nwoken;
   1755       1.1   thorpej 
   1756       1.1   thorpej 	/* Release the futexes, if we got them. */
   1757       1.1   thorpej 	if (f2)
   1758       1.5  riastrad 		futex_rele(f2);
   1759       1.1   thorpej 	if (f)
   1760       1.5  riastrad 		futex_rele(f);
   1761       1.1   thorpej 	return error;
   1762       1.1   thorpej }
   1763       1.1   thorpej 
   1764       1.1   thorpej /*
   1765       1.1   thorpej  * do_futex(uaddr, op, val, timeout, uaddr2, val2, val3)
   1766       1.1   thorpej  *
   1767       1.1   thorpej  *	Implement the futex system call with all the parameters
   1768       1.1   thorpej  *	parsed out.
   1769       1.1   thorpej  */
   1770       1.1   thorpej int
   1771      1.11  riastrad do_futex(int *uaddr, int op, int val, const struct timespec *timeout,
   1772      1.11  riastrad     int *uaddr2, int val2, int val3, register_t *retval)
   1773       1.1   thorpej {
   1774       1.1   thorpej 	const bool shared = (op & FUTEX_PRIVATE_FLAG) ? false : true;
   1775       1.1   thorpej 	const clockid_t clkid = (op & FUTEX_CLOCK_REALTIME) ? CLOCK_REALTIME
   1776       1.1   thorpej 							    : CLOCK_MONOTONIC;
   1777  1.12.4.1   thorpej 	int error;
   1778       1.1   thorpej 
   1779       1.1   thorpej 	op &= FUTEX_CMD_MASK;
   1780       1.1   thorpej 
   1781       1.1   thorpej 	switch (op) {
   1782       1.1   thorpej 	case FUTEX_WAIT:
   1783  1.12.4.1   thorpej 		SDT_PROBE6(futex, func, wait, entry,
   1784  1.12.4.1   thorpej 		    uaddr, val, FUTEX_BITSET_MATCH_ANY, timeout,
   1785  1.12.4.1   thorpej 		    TIMER_RELTIME, op & ~FUTEX_CMD_MASK);
   1786  1.12.4.1   thorpej 		error = futex_func_wait(shared, uaddr, val,
   1787       1.1   thorpej 		    FUTEX_BITSET_MATCH_ANY, timeout, clkid, TIMER_RELTIME,
   1788       1.1   thorpej 		    retval);
   1789  1.12.4.1   thorpej 		SDT_PROBE1(futex, func, wait, exit, error);
   1790  1.12.4.1   thorpej 		break;
   1791  1.12.4.1   thorpej 
   1792  1.12.4.1   thorpej 	case FUTEX_WAIT_BITSET:
   1793  1.12.4.1   thorpej 		SDT_PROBE6(futex, func, wait, entry,
   1794  1.12.4.1   thorpej 		    uaddr, val, val3, timeout,
   1795  1.12.4.1   thorpej 		    TIMER_ABSTIME, op & ~FUTEX_CMD_MASK);
   1796  1.12.4.1   thorpej 		error = futex_func_wait(shared, uaddr, val, val3, timeout,
   1797  1.12.4.1   thorpej 		    clkid, TIMER_ABSTIME, retval);
   1798  1.12.4.1   thorpej 		SDT_PROBE1(futex, func, wait, exit, error);
   1799  1.12.4.1   thorpej 		break;
   1800       1.1   thorpej 
   1801       1.1   thorpej 	case FUTEX_WAKE:
   1802  1.12.4.1   thorpej 		SDT_PROBE4(futex, func, wake, entry,
   1803  1.12.4.1   thorpej 		    uaddr, val, FUTEX_BITSET_MATCH_ANY, op & ~FUTEX_CMD_MASK);
   1804  1.12.4.1   thorpej 		error = futex_func_wake(shared, uaddr, val,
   1805  1.12.4.1   thorpej 		    FUTEX_BITSET_MATCH_ANY, retval);
   1806  1.12.4.1   thorpej 		SDT_PROBE2(futex, func, wake, exit, error, *retval);
   1807  1.12.4.1   thorpej 		break;
   1808  1.12.4.1   thorpej 
   1809       1.1   thorpej 	case FUTEX_WAKE_BITSET:
   1810  1.12.4.1   thorpej 		SDT_PROBE4(futex, func, wake, entry,
   1811  1.12.4.1   thorpej 		    uaddr, val, val3, op & ~FUTEX_CMD_MASK);
   1812  1.12.4.1   thorpej 		error = futex_func_wake(shared, uaddr, val, val3, retval);
   1813  1.12.4.1   thorpej 		SDT_PROBE2(futex, func, wake, exit, error, *retval);
   1814  1.12.4.1   thorpej 		break;
   1815       1.1   thorpej 
   1816       1.1   thorpej 	case FUTEX_REQUEUE:
   1817  1.12.4.1   thorpej 		SDT_PROBE5(futex, func, requeue, entry,
   1818  1.12.4.1   thorpej 		    uaddr, val, uaddr2, val2, op & ~FUTEX_CMD_MASK);
   1819  1.12.4.1   thorpej 		error = futex_func_requeue(shared, op, uaddr, val, uaddr2,
   1820  1.12.4.1   thorpej 		    val2, 0, retval);
   1821  1.12.4.1   thorpej 		SDT_PROBE2(futex, func, requeue, exit, error, *retval);
   1822  1.12.4.1   thorpej 		break;
   1823  1.12.4.1   thorpej 
   1824       1.1   thorpej 	case FUTEX_CMP_REQUEUE:
   1825  1.12.4.1   thorpej 		SDT_PROBE6(futex, func, cmp_requeue, entry,
   1826  1.12.4.1   thorpej 		    uaddr, val, uaddr2, val2, val3, op & ~FUTEX_CMD_MASK);
   1827  1.12.4.1   thorpej 		error = futex_func_requeue(shared, op, uaddr, val, uaddr2,
   1828       1.1   thorpej 		    val2, val3, retval);
   1829  1.12.4.1   thorpej 		SDT_PROBE2(futex, func, cmp_requeue, exit, error, *retval);
   1830  1.12.4.1   thorpej 		break;
   1831       1.1   thorpej 
   1832       1.1   thorpej 	case FUTEX_WAKE_OP:
   1833  1.12.4.1   thorpej 		SDT_PROBE6(futex, func, wake_op, entry,
   1834  1.12.4.1   thorpej 		    uaddr, val, uaddr2, val2, val3, op & ~FUTEX_CMD_MASK);
   1835  1.12.4.1   thorpej 		error = futex_func_wake_op(shared, uaddr, val, uaddr2, val2,
   1836       1.1   thorpej 		    val3, retval);
   1837  1.12.4.1   thorpej 		SDT_PROBE2(futex, func, wake_op, exit, error, *retval);
   1838  1.12.4.1   thorpej 		break;
   1839       1.1   thorpej 
   1840       1.1   thorpej 	case FUTEX_FD:
   1841       1.1   thorpej 	default:
   1842  1.12.4.1   thorpej 		error = ENOSYS;
   1843  1.12.4.1   thorpej 		break;
   1844       1.1   thorpej 	}
   1845  1.12.4.1   thorpej 
   1846  1.12.4.1   thorpej 	return error;
   1847       1.1   thorpej }
   1848       1.1   thorpej 
   1849       1.1   thorpej /*
   1850       1.1   thorpej  * sys___futex(l, uap, retval)
   1851       1.1   thorpej  *
   1852       1.1   thorpej  *	__futex(2) system call: generic futex operations.
   1853       1.1   thorpej  */
   1854       1.1   thorpej int
   1855       1.1   thorpej sys___futex(struct lwp *l, const struct sys___futex_args *uap,
   1856       1.1   thorpej     register_t *retval)
   1857       1.1   thorpej {
   1858       1.1   thorpej 	/* {
   1859       1.1   thorpej 		syscallarg(int *) uaddr;
   1860       1.1   thorpej 		syscallarg(int) op;
   1861       1.1   thorpej 		syscallarg(int) val;
   1862       1.1   thorpej 		syscallarg(const struct timespec *) timeout;
   1863       1.1   thorpej 		syscallarg(int *) uaddr2;
   1864       1.1   thorpej 		syscallarg(int) val2;
   1865       1.1   thorpej 		syscallarg(int) val3;
   1866       1.1   thorpej 	} */
   1867       1.1   thorpej 	struct timespec ts, *tsp;
   1868       1.1   thorpej 	int error;
   1869       1.1   thorpej 
   1870       1.1   thorpej 	/*
   1871       1.1   thorpej 	 * Copy in the timeout argument, if specified.
   1872       1.1   thorpej 	 */
   1873       1.1   thorpej 	if (SCARG(uap, timeout)) {
   1874       1.1   thorpej 		error = copyin(SCARG(uap, timeout), &ts, sizeof(ts));
   1875       1.1   thorpej 		if (error)
   1876       1.1   thorpej 			return error;
   1877       1.1   thorpej 		tsp = &ts;
   1878       1.1   thorpej 	} else {
   1879       1.1   thorpej 		tsp = NULL;
   1880       1.1   thorpej 	}
   1881       1.1   thorpej 
   1882       1.1   thorpej 	return do_futex(SCARG(uap, uaddr), SCARG(uap, op), SCARG(uap, val),
   1883       1.1   thorpej 	    tsp, SCARG(uap, uaddr2), SCARG(uap, val2), SCARG(uap, val3),
   1884       1.1   thorpej 	    retval);
   1885       1.1   thorpej }
   1886       1.1   thorpej 
   1887       1.1   thorpej /*
   1888       1.1   thorpej  * sys___futex_set_robust_list(l, uap, retval)
   1889       1.1   thorpej  *
   1890       1.1   thorpej  *	__futex_set_robust_list(2) system call for robust futexes.
   1891       1.1   thorpej  */
   1892       1.1   thorpej int
   1893       1.1   thorpej sys___futex_set_robust_list(struct lwp *l,
   1894       1.1   thorpej     const struct sys___futex_set_robust_list_args *uap, register_t *retval)
   1895       1.1   thorpej {
   1896       1.1   thorpej 	/* {
   1897       1.1   thorpej 		syscallarg(void *) head;
   1898       1.1   thorpej 		syscallarg(size_t) len;
   1899       1.1   thorpej 	} */
   1900       1.1   thorpej 	void *head = SCARG(uap, head);
   1901       1.1   thorpej 
   1902       1.1   thorpej 	if (SCARG(uap, len) != _FUTEX_ROBUST_HEAD_SIZE)
   1903       1.1   thorpej 		return EINVAL;
   1904       1.1   thorpej 	if ((uintptr_t)head % sizeof(u_long))
   1905       1.1   thorpej 		return EINVAL;
   1906       1.1   thorpej 
   1907       1.1   thorpej 	l->l_robust_head = (uintptr_t)head;
   1908       1.1   thorpej 
   1909       1.1   thorpej 	return 0;
   1910       1.1   thorpej }
   1911       1.1   thorpej 
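/*
 * Illustrative note, descriptive only: the robust list head registered
 * above is assumed here to follow the conventional Linux-style layout
 * of three machine words, roughly:
 *
 *	struct robust_head_sketch {
 *		uintptr_t	list;		// first list entry
 *		long		futex_offset;	// entry -> futex word
 *		uintptr_t	pending;	// entry being (un)locked
 *	};
 *
 * The signed offset word is why futex_fetch_robust_head() below takes
 * care to sign-extend that slot for 32-bit binaries.
 */
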
   1912       1.1   thorpej /*
   1913       1.1   thorpej  * sys___futex_get_robust_list(l, uap, retval)
   1914       1.1   thorpej  *
   1915       1.1   thorpej  *	__futex_get_robust_list(2) system call for robust futexes.
   1916       1.1   thorpej  */
   1917       1.1   thorpej int
   1918       1.1   thorpej sys___futex_get_robust_list(struct lwp *l,
   1919       1.1   thorpej     const struct sys___futex_get_robust_list_args *uap, register_t *retval)
   1920       1.1   thorpej {
   1921       1.1   thorpej 	/* {
   1922       1.1   thorpej 		syscallarg(lwpid_t) lwpid;
   1923       1.1   thorpej 		syscallarg(void **) headp;
   1924       1.1   thorpej 		syscallarg(size_t *) lenp;
   1925       1.1   thorpej 	} */
   1926       1.1   thorpej 	void *head;
   1927       1.1   thorpej 	const size_t len = _FUTEX_ROBUST_HEAD_SIZE;
   1928       1.1   thorpej 	int error;
   1929       1.1   thorpej 
   1930       1.1   thorpej 	error = futex_robust_head_lookup(l, SCARG(uap, lwpid), &head);
   1931       1.1   thorpej 	if (error)
   1932       1.1   thorpej 		return error;
   1933       1.1   thorpej 
   1934       1.1   thorpej 	/* Copy out the head pointer and the head structure length. */
   1935       1.1   thorpej 	error = copyout(&head, SCARG(uap, headp), sizeof(head));
   1936       1.1   thorpej 	if (__predict_true(error == 0)) {
   1937       1.1   thorpej 		error = copyout(&len, SCARG(uap, lenp), sizeof(len));
   1938       1.1   thorpej 	}
   1939       1.1   thorpej 
   1940       1.1   thorpej 	return error;
   1941       1.1   thorpej }
   1942       1.1   thorpej 
   1943       1.1   thorpej /*
   1944       1.1   thorpej  * release_futex(uva, tid)
   1945       1.1   thorpej  *
   1946       1.1   thorpej  *	Try to release the robust futex at uva in the current process
   1947       1.1   thorpej  *	on lwp exit.  If anything goes wrong, silently fail.  It is the
   1948       1.1   thorpej  *	userland program's obligation to arrange correct behaviour.
   1949       1.1   thorpej  */
   1950       1.1   thorpej static void
   1951       1.1   thorpej release_futex(uintptr_t const uptr, lwpid_t const tid, bool const is_pi,
   1952       1.1   thorpej     bool const is_pending)
   1953       1.1   thorpej {
   1954       1.1   thorpej 	int *uaddr;
   1955       1.1   thorpej 	struct futex *f;
   1956       1.1   thorpej 	int oldval, newval, actual;
   1957       1.1   thorpej 	int error;
   1958  1.12.4.1   thorpej 	bool shared;
   1959       1.1   thorpej 
   1960       1.1   thorpej 	/* If it's misaligned, tough.  */
   1961       1.1   thorpej 	if (__predict_false(uptr & 3))
   1962       1.1   thorpej 		return;
   1963       1.1   thorpej 	uaddr = (int *)uptr;
   1964       1.1   thorpej 
   1965       1.1   thorpej 	error = futex_load(uaddr, &oldval);
   1966       1.1   thorpej 	if (__predict_false(error))
   1967       1.1   thorpej 		return;
   1968       1.1   thorpej 
   1969       1.1   thorpej 	/*
   1970       1.1   thorpej 	 * There are two race conditions we need to handle here:
   1971       1.1   thorpej 	 *
   1972       1.1   thorpej 	 * 1. User space cleared the futex word but died before
   1973       1.1   thorpej 	 *    being able to issue the wakeup.  No wakeups will
   1974       1.1   thorpej 	 *    ever be issued, oops!
   1975       1.1   thorpej 	 *
   1976       1.1   thorpej 	 * 2. Awakened waiter died before being able to acquire
   1977       1.1   thorpej 	 *    the futex in user space.  Any other waiters are
   1978       1.1   thorpej 	 *    now stuck, oops!
   1979       1.1   thorpej 	 *
   1980       1.1   thorpej 	 * In both of these cases, the futex word will be 0 (because
   1981       1.1   thorpej 	 * it's updated before the wake is issued).  The best we can
   1982       1.1   thorpej 	 * do is detect this situation if it's the pending futex and
   1983       1.1   thorpej 	 * issue a wake without modifying the futex word.
   1984       1.1   thorpej 	 *
   1985       1.1   thorpej 	 * XXX eventual PI handling?
   1986       1.1   thorpej 	 */
   1987       1.1   thorpej 	if (__predict_false(is_pending && (oldval & ~FUTEX_WAITERS) == 0)) {
   1988       1.1   thorpej 		register_t retval;
   1989  1.12.4.1   thorpej 		error = futex_func_wake(/*shared*/true, uaddr, 1,
   1990       1.1   thorpej 		    FUTEX_BITSET_MATCH_ANY, &retval);
   1991  1.12.4.1   thorpej 		if (error != 0 || retval == 0)
   1992  1.12.4.1   thorpej 			(void) futex_func_wake(/*shared*/false, uaddr, 1,
   1993  1.12.4.1   thorpej 			    FUTEX_BITSET_MATCH_ANY, &retval);
   1994       1.1   thorpej 		return;
   1995       1.1   thorpej 	}
   1996       1.1   thorpej 
   1997       1.1   thorpej 	/* Optimistically test whether we need to do anything at all.  */
   1998       1.1   thorpej 	if ((oldval & FUTEX_TID_MASK) != tid)
   1999       1.1   thorpej 		return;
   2000       1.1   thorpej 
   2001       1.1   thorpej 	/*
   2002       1.1   thorpej 	 * We need to handle the case where this thread owned the futex,
   2003       1.1   thorpej 	 * but it was uncontended.  In this case, there won't be any
   2004       1.1   thorpej 	 * kernel state to look up.  All we can do is mark the futex
   2005       1.1   thorpej 	 * as a zombie to be mopped up the next time another thread
   2006       1.1   thorpej 	 * attempts to acquire it.
   2007       1.1   thorpej 	 *
    2008       1.1   thorpej 	 * N.B. It's important to ensure we set FUTEX_OWNER_DIED in
    2009       1.1   thorpej 	 * this loop, even if waiters appear while we are doing
    2010       1.1   thorpej 	 * so.  This is because FUTEX_WAITERS is set by user space
   2011       1.1   thorpej 	 * before calling __futex() to wait, and the futex needs
   2012       1.1   thorpej 	 * to be marked as a zombie when the new waiter gets into
   2013       1.1   thorpej 	 * the kernel.
   2014       1.1   thorpej 	 */
   2015       1.1   thorpej 	if ((oldval & FUTEX_WAITERS) == 0) {
   2016       1.1   thorpej 		do {
   2017       1.1   thorpej 			error = futex_load(uaddr, &oldval);
   2018       1.1   thorpej 			if (error)
   2019       1.1   thorpej 				return;
   2020       1.1   thorpej 			if ((oldval & FUTEX_TID_MASK) != tid)
   2021       1.1   thorpej 				return;
   2022       1.1   thorpej 			newval = oldval | FUTEX_OWNER_DIED;
   2023       1.1   thorpej 			error = ucas_int(uaddr, oldval, newval, &actual);
   2024       1.1   thorpej 			if (error)
   2025       1.1   thorpej 				return;
   2026       1.1   thorpej 		} while (actual != oldval);
   2027       1.1   thorpej 
   2028       1.1   thorpej 		/*
    2029       1.1   thorpej 		 * If there is still no indication of waiters, then there is
   2030       1.1   thorpej 		 * no more work for us to do.
   2031       1.1   thorpej 		 */
   2032       1.1   thorpej 		if ((oldval & FUTEX_WAITERS) == 0)
   2033       1.1   thorpej 			return;
   2034       1.1   thorpej 	}
   2035       1.1   thorpej 
   2036       1.1   thorpej 	/*
   2037  1.12.4.1   thorpej 	 * Look for a futex.  Try shared first, then private.  If we
    2038  1.12.4.1   thorpej 	 * can't find one, tough.
   2039  1.12.4.1   thorpej 	 *
   2040  1.12.4.1   thorpej 	 * Note: the assumption here is that anyone placing a futex on
   2041  1.12.4.1   thorpej 	 * the robust list is adhering to the rules, regardless of the
   2042  1.12.4.1   thorpej 	 * futex class.
   2043       1.1   thorpej 	 */
   2044  1.12.4.1   thorpej 	for (f = NULL, shared = true; f == NULL; shared = false) {
   2045  1.12.4.1   thorpej 		error = futex_lookup(uaddr, shared, FUTEX_CLASS_ANY, &f);
   2046  1.12.4.1   thorpej 		if (error)
   2047  1.12.4.1   thorpej 			return;
   2048  1.12.4.1   thorpej 		if (f == NULL && shared == false)
   2049  1.12.4.1   thorpej 			return;
   2050  1.12.4.1   thorpej 	}
   2051       1.1   thorpej 
   2052  1.12.4.1   thorpej 	/* Work under the futex op lock.  */
   2053  1.12.4.1   thorpej 	futex_op_lock(f);
   2054       1.1   thorpej 
   2055       1.1   thorpej 	/*
   2056       1.1   thorpej 	 * Fetch the word: if the tid doesn't match ours, skip;
   2057       1.1   thorpej 	 * otherwise, set the owner-died bit, atomically.
   2058       1.1   thorpej 	 */
   2059       1.1   thorpej 	do {
   2060       1.1   thorpej 		error = futex_load(uaddr, &oldval);
   2061       1.1   thorpej 		if (error)
   2062       1.1   thorpej 			goto out;
   2063       1.1   thorpej 		if ((oldval & FUTEX_TID_MASK) != tid)
   2064       1.1   thorpej 			goto out;
   2065       1.1   thorpej 		newval = oldval | FUTEX_OWNER_DIED;
   2066       1.1   thorpej 		error = ucas_int(uaddr, oldval, newval, &actual);
   2067       1.1   thorpej 		if (error)
   2068       1.1   thorpej 			goto out;
   2069       1.1   thorpej 	} while (actual != oldval);
   2070       1.1   thorpej 
   2071       1.1   thorpej 	/*
   2072       1.1   thorpej 	 * If there may be waiters, try to wake one.  If anything goes
   2073       1.1   thorpej 	 * wrong, tough.
   2074       1.1   thorpej 	 *
   2075       1.1   thorpej 	 * XXX eventual PI handling?
   2076       1.1   thorpej 	 */
   2077  1.12.4.1   thorpej 	if (oldval & FUTEX_WAITERS) {
   2078  1.12.4.1   thorpej 		(void)futex_wake(f, FUTEX_WRITERQ, 1,
   2079  1.12.4.1   thorpej 				 NULL, FUTEX_WRITERQ, 0,
   2080  1.12.4.5   thorpej 				 FUTEX_BITSET_MATCH_ANY, NULL);
   2081  1.12.4.1   thorpej 	}
   2082       1.1   thorpej 
   2083       1.1   thorpej 	/* Unlock the queue and release the futex.  */
   2084  1.12.4.1   thorpej out:	futex_op_unlock(f);
   2085       1.5  riastrad 	futex_rele(f);
   2086       1.1   thorpej }
   2087       1.1   thorpej 
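/*
 * Illustrative note on the word layout release_futex() manipulates:
 * the low bits of a robust futex word hold the owner's TID
 * (FUTEX_TID_MASK) and the bits above it carry the FUTEX_WAITERS and
 * FUTEX_OWNER_DIED flags.  A hypothetical userland recovery path,
 * after noticing the dead-owner bit, might claim the lock with a
 * single compare-and-swap (lock and my_tid are made-up names):
 *
 *	int oldval = atomic_load(&lock->word);
 *	while (oldval & FUTEX_OWNER_DIED) {
 *		if (atomic_compare_exchange_weak(&lock->word, &oldval,
 *			my_tid | (oldval & FUTEX_WAITERS)))
 *			break;		// we now own the lock
 *	}
 */
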
   2088       1.1   thorpej /*
   2089       1.1   thorpej  * futex_robust_head_lookup(l, lwpid)
   2090       1.1   thorpej  *
   2091       1.1   thorpej  *	Helper function to look up a robust head by LWP ID.
   2092       1.1   thorpej  */
   2093       1.1   thorpej int
   2094       1.1   thorpej futex_robust_head_lookup(struct lwp *l, lwpid_t lwpid, void **headp)
   2095       1.1   thorpej {
   2096       1.1   thorpej 	struct proc *p = l->l_proc;
   2097       1.1   thorpej 
   2098       1.1   thorpej 	/* Find the other lwp, if requested; otherwise use our robust head.  */
   2099       1.1   thorpej 	if (lwpid) {
   2100       1.1   thorpej 		mutex_enter(p->p_lock);
   2101       1.1   thorpej 		l = lwp_find(p, lwpid);
   2102       1.1   thorpej 		if (l == NULL) {
   2103       1.1   thorpej 			mutex_exit(p->p_lock);
   2104       1.1   thorpej 			return ESRCH;
   2105       1.1   thorpej 		}
   2106       1.1   thorpej 		*headp = (void *)l->l_robust_head;
   2107       1.1   thorpej 		mutex_exit(p->p_lock);
   2108       1.1   thorpej 	} else {
   2109       1.1   thorpej 		*headp = (void *)l->l_robust_head;
   2110       1.1   thorpej 	}
   2111       1.1   thorpej 	return 0;
   2112       1.1   thorpej }
   2113       1.1   thorpej 
   2114       1.1   thorpej /*
   2115       1.1   thorpej  * futex_fetch_robust_head(uaddr)
   2116       1.1   thorpej  *
   2117       1.1   thorpej  *	Helper routine to fetch the futex robust list head that
   2118       1.1   thorpej  *	handles 32-bit binaries running on 64-bit kernels.
   2119       1.1   thorpej  */
   2120       1.1   thorpej static int
   2121       1.1   thorpej futex_fetch_robust_head(uintptr_t uaddr, u_long *rhead)
   2122       1.1   thorpej {
   2123       1.1   thorpej #ifdef _LP64
   2124       1.1   thorpej 	if (curproc->p_flag & PK_32) {
   2125       1.1   thorpej 		uint32_t rhead32[_FUTEX_ROBUST_HEAD_NWORDS];
   2126       1.1   thorpej 		int error;
   2127       1.1   thorpej 
   2128       1.1   thorpej 		error = copyin((void *)uaddr, rhead32, sizeof(rhead32));
   2129       1.1   thorpej 		if (__predict_true(error == 0)) {
   2130       1.1   thorpej 			for (int i = 0; i < _FUTEX_ROBUST_HEAD_NWORDS; i++) {
   2131       1.1   thorpej 				if (i == _FUTEX_ROBUST_HEAD_OFFSET) {
   2132       1.1   thorpej 					/*
   2133       1.1   thorpej 					 * Make sure the offset is sign-
   2134       1.1   thorpej 					 * extended.
   2135       1.1   thorpej 					 */
   2136       1.1   thorpej 					rhead[i] = (int32_t)rhead32[i];
   2137       1.1   thorpej 				} else {
   2138       1.1   thorpej 					rhead[i] = rhead32[i];
   2139       1.1   thorpej 				}
   2140       1.1   thorpej 			}
   2141       1.1   thorpej 		}
   2142       1.1   thorpej 		return error;
   2143       1.1   thorpej 	}
    2144       1.1   thorpej #endif /* _LP64 */
   2145       1.1   thorpej 
   2146       1.1   thorpej 	return copyin((void *)uaddr, rhead,
   2147       1.1   thorpej 	    sizeof(*rhead) * _FUTEX_ROBUST_HEAD_NWORDS);
   2148       1.1   thorpej }
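
/*
 * The offset slot is deliberately signed: the futex word need not follow
 * the list linkage inside the userland object, so the value added to each
 * entry address in futex_release_all_lwp() below may be negative, which
 * is why the 32-bit fetch above sign-extends it.  A hypothetical layout
 * where that matters:
 *
 *	struct my_mutex {			// illustrative only
 *		volatile uint32_t futex;	// the 32-bit futex word
 *		void		 *link;		// robust-list linkage word
 *	};
 *
 *	// offset = (char *)&m->futex - (char *)&m->link is negative here,
 *	// so release_futex(next + offset, ...) lands on the futex word.
 */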
   2149       1.1   thorpej 
   2150       1.1   thorpej /*
   2151       1.1   thorpej  * futex_decode_robust_word(word)
   2152       1.1   thorpej  *
   2153       1.1   thorpej  *	Decode a robust futex list word into the entry and entry
   2154       1.1   thorpej  *	properties.
   2155       1.1   thorpej  */
   2156       1.1   thorpej static inline void
   2157       1.1   thorpej futex_decode_robust_word(uintptr_t const word, uintptr_t * const entry,
   2158       1.1   thorpej     bool * const is_pi)
   2159       1.1   thorpej {
   2160       1.1   thorpej 	*is_pi = (word & _FUTEX_ROBUST_ENTRY_PI) ? true : false;
   2161       1.1   thorpej 	*entry = word & ~_FUTEX_ROBUST_ENTRY_PI;
   2162       1.1   thorpej }
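
/*
 * For example, a list word of ((uintptr_t)&m->link | _FUTEX_ROBUST_ENTRY_PI)
 * decodes to *entry == (uintptr_t)&m->link with *is_pi == true, while the
 * same address without the flag yields *is_pi == false.  (The object m is
 * hypothetical; only the bit-packing comes from the decode above.)
 */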
   2163       1.1   thorpej 
   2164       1.1   thorpej /*
   2165       1.1   thorpej  * futex_fetch_robust_entry(uaddr)
   2166       1.1   thorpej  *
   2167       1.1   thorpej  *	Helper routine to fetch and decode a robust futex entry
   2168       1.1   thorpej  *	that handles 32-bit binaries running on 64-bit kernels.
   2169       1.1   thorpej  */
   2170       1.1   thorpej static int
   2171       1.1   thorpej futex_fetch_robust_entry(uintptr_t const uaddr, uintptr_t * const valp,
   2172       1.1   thorpej     bool * const is_pi)
   2173       1.1   thorpej {
   2174       1.1   thorpej 	uintptr_t val = 0;
   2175       1.1   thorpej 	int error = 0;
   2176       1.1   thorpej 
   2177       1.1   thorpej #ifdef _LP64
   2178       1.1   thorpej 	if (curproc->p_flag & PK_32) {
   2179       1.1   thorpej 		uint32_t val32;
   2180       1.1   thorpej 
   2181       1.1   thorpej 		error = ufetch_32((uint32_t *)uaddr, &val32);
   2182       1.1   thorpej 		if (__predict_true(error == 0))
   2183       1.1   thorpej 			val = val32;
   2184       1.1   thorpej 	} else
   2185       1.1   thorpej #endif /* _LP64 */
   2186       1.1   thorpej 		error = ufetch_long((u_long *)uaddr, (u_long *)&val);
   2187       1.1   thorpej 	if (__predict_false(error))
   2188       1.1   thorpej 		return error;
   2189       1.1   thorpej 
   2190       1.1   thorpej 	futex_decode_robust_word(val, valp, is_pi);
   2191       1.1   thorpej 	return 0;
   2192       1.1   thorpej }
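
/*
 * An entry's address is also the address of its link word, so the list
 * walk below advances with one fetch-and-decode per node, roughly
 * (error handling elided):
 *
 *	error = futex_fetch_robust_entry(next, &next, &is_pi);
 */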
   2193       1.1   thorpej 
   2194       1.1   thorpej /*
   2195       1.1   thorpej  * futex_release_all_lwp(l, tid)
   2196       1.1   thorpej  *
   2197       1.1   thorpej  *	Release all l's robust futexes.  If anything looks funny in
   2198       1.1   thorpej  *	the process, give up -- it's userland's responsibility to dot
   2199       1.1   thorpej  *	the i's and cross the t's.
   2200       1.1   thorpej  */
   2201       1.1   thorpej void
   2202       1.1   thorpej futex_release_all_lwp(struct lwp * const l, lwpid_t const tid)
   2203       1.1   thorpej {
   2204       1.1   thorpej 	u_long rhead[_FUTEX_ROBUST_HEAD_NWORDS];
   2205       1.1   thorpej 	int limit = 1000000;
   2206       1.1   thorpej 	int error;
   2207       1.1   thorpej 
   2208       1.1   thorpej 	/* If there's no robust list there's nothing to do. */
   2209       1.1   thorpej 	if (l->l_robust_head == 0)
   2210       1.1   thorpej 		return;
   2211       1.1   thorpej 
   2212       1.1   thorpej 	/* Read the final snapshot of the robust list head. */
   2213       1.1   thorpej 	error = futex_fetch_robust_head(l->l_robust_head, rhead);
   2214       1.1   thorpej 	if (error) {
   2215       1.1   thorpej 		printf("WARNING: pid %jd (%s) lwp %jd tid %jd:"
   2216       1.1   thorpej 		    " unmapped robust futex list head\n",
   2217       1.1   thorpej 		    (uintmax_t)l->l_proc->p_pid, l->l_proc->p_comm,
   2218       1.1   thorpej 		    (uintmax_t)l->l_lid, (uintmax_t)tid);
   2219       1.1   thorpej 		return;
   2220       1.1   thorpej 	}
   2221       1.1   thorpej 
   2222       1.1   thorpej 	const long offset = (long)rhead[_FUTEX_ROBUST_HEAD_OFFSET];
   2223       1.1   thorpej 
   2224       1.1   thorpej 	uintptr_t next, pending;
   2225       1.1   thorpej 	bool is_pi, pending_is_pi;
   2226       1.1   thorpej 
   2227       1.1   thorpej 	futex_decode_robust_word(rhead[_FUTEX_ROBUST_HEAD_LIST],
   2228       1.1   thorpej 	    &next, &is_pi);
   2229       1.1   thorpej 	futex_decode_robust_word(rhead[_FUTEX_ROBUST_HEAD_PENDING],
   2230       1.1   thorpej 	    &pending, &pending_is_pi);
   2231       1.1   thorpej 
   2232       1.1   thorpej 	/*
   2233       1.1   thorpej 	 * Walk down the list of locked futexes and release them, up
   2234       1.1   thorpej 	 * to one million of them before we give up.
   2235       1.1   thorpej 	 */
   2236       1.1   thorpej 
   2237       1.1   thorpej 	while (next != l->l_robust_head && limit-- > 0) {
   2238       1.1   thorpej 		/* pending handled below. */
   2239       1.1   thorpej 		if (next != pending)
   2240       1.1   thorpej 			release_futex(next + offset, tid, is_pi, false);
   2241       1.1   thorpej 		error = futex_fetch_robust_entry(next, &next, &is_pi);
   2242       1.1   thorpej 		if (error)
   2243       1.1   thorpej 			break;
   2244       1.1   thorpej 		preempt_point();
   2245       1.1   thorpej 	}
   2246       1.1   thorpej 	if (limit <= 0) {
   2247       1.1   thorpej 		printf("WARNING: pid %jd (%s) lwp %jd tid %jd:"
   2248       1.1   thorpej 		    " exhausted robust futex limit\n",
   2249       1.1   thorpej 		    (uintmax_t)l->l_proc->p_pid, l->l_proc->p_comm,
   2250       1.1   thorpej 		    (uintmax_t)l->l_lid, (uintmax_t)tid);
   2251       1.1   thorpej 	}
   2252       1.1   thorpej 
   2253       1.1   thorpej 	/* If there's a pending futex, it may need to be released too. */
   2254       1.1   thorpej 	if (pending != 0) {
   2255       1.1   thorpej 		release_futex(pending + offset, tid, pending_is_pi, true);
   2256       1.1   thorpej 	}
   2257       1.1   thorpej }
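
/*
 * Putting it together (member names illustrative): the robust list is
 * circular and threads back through the head, so the walk above stops
 * when it returns to l->l_robust_head or exhausts the one-million-entry
 * limit:
 *
 *	head.list -> m1.link -> m2.link -> ... -> &head (== l->l_robust_head)
 *
 * The head's offset slot locates each futex word relative to its link
 * word, and the pending slot names at most one entry that was in the
 * middle of being acquired or released when the LWP exited, which is why
 * it is skipped during the walk and released separately at the end.
 */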
   2258