sys_futex.c revision 1.12.4.1
      1  1.12.4.1   thorpej /*	$NetBSD: sys_futex.c,v 1.12.4.1 2021/08/05 23:14:21 thorpej Exp $	*/
      2       1.1   thorpej 
      3       1.1   thorpej /*-
      4       1.1   thorpej  * Copyright (c) 2018, 2019, 2020 The NetBSD Foundation, Inc.
      5       1.1   thorpej  * All rights reserved.
      6       1.1   thorpej  *
      7       1.1   thorpej  * This code is derived from software contributed to The NetBSD Foundation
      8       1.1   thorpej  * by Taylor R. Campbell and Jason R. Thorpe.
      9       1.1   thorpej  *
     10       1.1   thorpej  * Redistribution and use in source and binary forms, with or without
     11       1.1   thorpej  * modification, are permitted provided that the following conditions
     12       1.1   thorpej  * are met:
     13       1.1   thorpej  * 1. Redistributions of source code must retain the above copyright
     14       1.1   thorpej  *    notice, this list of conditions and the following disclaimer.
     15       1.1   thorpej  * 2. Redistributions in binary form must reproduce the above copyright
     16       1.1   thorpej  *    notice, this list of conditions and the following disclaimer in the
     17       1.1   thorpej  *    documentation and/or other materials provided with the distribution.
     18       1.1   thorpej  *
     19       1.1   thorpej  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
     20       1.1   thorpej  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
     21       1.1   thorpej  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     22       1.1   thorpej  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
     23       1.1   thorpej  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
     24       1.1   thorpej  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
     25       1.1   thorpej  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
     26       1.1   thorpej  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
     27       1.1   thorpej  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
     28       1.1   thorpej  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
     29       1.1   thorpej  * POSSIBILITY OF SUCH DAMAGE.
     30       1.1   thorpej  */
     31       1.1   thorpej 
     32       1.1   thorpej #include <sys/cdefs.h>
     33  1.12.4.1   thorpej __KERNEL_RCSID(0, "$NetBSD: sys_futex.c,v 1.12.4.1 2021/08/05 23:14:21 thorpej Exp $");
     34       1.1   thorpej 
     35       1.1   thorpej /*
     36       1.1   thorpej  * Futexes
     37       1.1   thorpej  *
     38       1.1   thorpej  *	The futex system call coordinates notifying threads waiting for
     39       1.1   thorpej  *	changes on a 32-bit word of memory.  The word can be managed by
     40       1.1   thorpej  *	CPU atomic operations in userland, without system calls, as long
     41       1.1   thorpej  *	as there is no contention.
     42       1.1   thorpej  *
     43       1.1   thorpej  *	The simplest use case demonstrating the utility is:
     44       1.1   thorpej  *
     45       1.1   thorpej  *		// 32-bit word of memory shared among threads or
     46       1.1   thorpej  *		// processes in userland.  lock & 1 means owned;
     47       1.1   thorpej  *		// lock & 2 means there are waiters waiting.
     48       1.1   thorpej  *		volatile int lock = 0;
     49       1.1   thorpej  *
     50       1.1   thorpej  *		int v;
     51       1.1   thorpej  *
     52       1.1   thorpej  *		// Acquire a lock.
     53       1.1   thorpej  *		do {
     54       1.1   thorpej  *			v = lock;
     55       1.1   thorpej  *			if (v & 1) {
     56       1.1   thorpej  *				// Lock is held.  Set a bit to say that
     57       1.1   thorpej  *				// there are waiters, and wait for lock
     58       1.1   thorpej  *				// to change to anything other than v;
     59       1.1   thorpej  *				// then retry.
     60       1.1   thorpej  *				if (atomic_cas_uint(&lock, v, v | 2) != v)
     61       1.1   thorpej  *					continue;
     62       1.1   thorpej  *				futex(FUTEX_WAIT, &lock, v | 2, NULL, NULL, 0);
     63       1.1   thorpej  *				continue;
     64       1.1   thorpej  *			}
      65       1.1   thorpej  *		} while (atomic_cas_uint(&lock, v, v | 1) != v);
     66       1.1   thorpej  *		membar_enter();
     67       1.1   thorpej  *
     68       1.1   thorpej  *		...
     69       1.1   thorpej  *
     70       1.1   thorpej  *		// Release the lock.  Optimistically assume there are
     71       1.1   thorpej  *		// no waiters first until demonstrated otherwise.
     72       1.1   thorpej  *		membar_exit();
     73       1.1   thorpej  *		if (atomic_cas_uint(&lock, 1, 0) != 1) {
     74       1.1   thorpej  *			// There may be waiters.
     75       1.1   thorpej  *			v = atomic_swap_uint(&lock, 0);
     76       1.1   thorpej  *			// If there are still waiters, wake one.
     77       1.1   thorpej  *			if (v & 2)
     78       1.1   thorpej  *				futex(FUTEX_WAKE, &lock, 1, NULL, NULL, 0);
     79       1.1   thorpej  *		}
     80       1.1   thorpej  *
     81       1.1   thorpej  *	The goal is to avoid the futex system call unless there is
     82       1.1   thorpej  *	contention; then if there is contention, to guarantee no missed
     83       1.1   thorpej  *	wakeups.
     84       1.1   thorpej  *
     85       1.1   thorpej  *	For a simple implementation, futex(FUTEX_WAIT) could queue
     86       1.1   thorpej  *	itself to be woken, double-check the lock word, and then sleep;
     87       1.1   thorpej  *	spurious wakeups are generally a fact of life, so any
     88       1.1   thorpej  *	FUTEX_WAKE could just wake every FUTEX_WAIT in the system.
     89       1.1   thorpej  *
     90       1.1   thorpej  *	If this were all there is to it, we could then increase
     91       1.1   thorpej  *	parallelism by refining the approximation: partition the
     92       1.1   thorpej  *	waiters into buckets by hashing the lock addresses to reduce
     93       1.1   thorpej  *	the incidence of spurious wakeups.  But this is not all.
     94       1.1   thorpej  *
     95       1.1   thorpej  *	The futex(FUTEX_CMP_REQUEUE, &lock, n, &lock2, m, val)
     96       1.1   thorpej  *	operation not only wakes n waiters on lock if lock == val, but
     97       1.1   thorpej  *	also _transfers_ m additional waiters to lock2.  Unless wakeups
     98       1.1   thorpej  *	on lock2 also trigger wakeups on lock, we cannot move waiters
     99       1.1   thorpej  *	to lock2 if they merely share the same hash as waiters on lock.
    100       1.1   thorpej  *	Thus, we can't approximately distribute waiters into queues by
    101       1.1   thorpej  *	a hash function; we must distinguish futex queues exactly by
    102       1.1   thorpej  *	lock address.
    103       1.1   thorpej  *
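 *	For example (an illustrative sketch only, not taken from any
 *	particular userland library), a condition variable built on a
 *	sequence word can broadcast by waking one waiter and requeueing
 *	the rest onto the mutex word, so they are woken one at a time
 *	by later unlocks instead of stampeding:
 *
 *		// cond_wait(cond, mutex): seq was sampled while the
 *		// mutex was held; the mutex is released around the wait.
 *		futex(FUTEX_WAIT, &cond->seq, seq, NULL, NULL, 0);
 *
 *		// cond_broadcast(cond): wake one waiter and move the
 *		// rest to the mutex word -- but only if cond->seq is
 *		// still seq.  (Details such as marking the mutex
 *		// contended are omitted here.)
 *		futex(FUTEX_CMP_REQUEUE, &cond->seq, 1, &mutex->lock,
 *		    INT_MAX, seq);
 *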
    104       1.1   thorpej  *	For now, we use a global red/black tree to index futexes.  This
    105       1.1   thorpej  *	should be replaced by a lockless radix tree with a thread to
    106       1.1   thorpej  *	free entries no longer in use once all lookups on all CPUs have
    107       1.1   thorpej  *	completed.
    108       1.1   thorpej  *
    109       1.1   thorpej  *	Specifically, we maintain two maps:
    110       1.1   thorpej  *
    111       1.1   thorpej  *	futex_tab.va[vmspace, va] for private futexes
    112       1.1   thorpej  *	futex_tab.oa[uvm_voaddr] for shared futexes
    113       1.1   thorpej  *
    114       1.1   thorpej  *	This implementation does not support priority inheritance.
    115       1.1   thorpej  */
    116       1.1   thorpej 
    117      1.12     skrll #include <sys/param.h>
    118       1.1   thorpej #include <sys/types.h>
    119       1.1   thorpej #include <sys/atomic.h>
    120       1.1   thorpej #include <sys/condvar.h>
    121       1.1   thorpej #include <sys/futex.h>
    122       1.1   thorpej #include <sys/mutex.h>
    123       1.1   thorpej #include <sys/rbtree.h>
    124  1.12.4.1   thorpej #include <sys/sleepq.h>
    125       1.1   thorpej #include <sys/queue.h>
    126  1.12.4.1   thorpej #include <sys/sdt.h>
    127       1.1   thorpej 
    128       1.1   thorpej #include <sys/syscall.h>
    129       1.1   thorpej #include <sys/syscallargs.h>
    130       1.1   thorpej #include <sys/syscallvar.h>
    131       1.1   thorpej 
    132       1.1   thorpej #include <uvm/uvm_extern.h>
    133       1.1   thorpej 
    134       1.1   thorpej /*
    135  1.12.4.1   thorpej  * DTrace probes.
    136  1.12.4.1   thorpej  */
    137  1.12.4.1   thorpej SDT_PROVIDER_DEFINE(futex);
    138  1.12.4.1   thorpej 
    139  1.12.4.1   thorpej /* entry: uaddr, val, bitset, timeout, clkflags, fflags */
    140  1.12.4.1   thorpej /* exit: error */
    141  1.12.4.1   thorpej SDT_PROBE_DEFINE6(futex, func, wait, entry, "int *", "int", "int",
    142  1.12.4.1   thorpej 		  "struct timespec *", "int", "int");
    143  1.12.4.1   thorpej SDT_PROBE_DEFINE1(futex, func, wait, exit, "int");
    144  1.12.4.1   thorpej 
    145  1.12.4.1   thorpej /* entry: uaddr, nwake, bitset, fflags */
    146  1.12.4.1   thorpej /* exit: error, nwoken */
    147  1.12.4.1   thorpej SDT_PROBE_DEFINE4(futex, func, wake, entry, "int *", "int", "int", "int");
    148  1.12.4.1   thorpej SDT_PROBE_DEFINE2(futex, func, wake, exit, "int", "int");
    149  1.12.4.1   thorpej 
    150  1.12.4.1   thorpej /* entry: uaddr, nwake, uaddr2, nrequeue, fflags */
    151  1.12.4.1   thorpej /* exit: error, nwoken */
    152  1.12.4.1   thorpej SDT_PROBE_DEFINE5(futex, func, requeue, entry, "int *", "int", "int *", "int",
    153  1.12.4.1   thorpej 		  "int");
    154  1.12.4.1   thorpej SDT_PROBE_DEFINE2(futex, func, requeue, exit, "int", "int");
    155  1.12.4.1   thorpej 
    156  1.12.4.1   thorpej /* entry: uaddr, nwake, uaddr2, nrequeue, val3, fflags */
    157  1.12.4.1   thorpej /* exit: error, nwoken */
    158  1.12.4.1   thorpej SDT_PROBE_DEFINE6(futex, func, cmp_requeue, entry, "int *", "int", "int *",
    159  1.12.4.1   thorpej 		  "int", "int", "int");
    160  1.12.4.1   thorpej SDT_PROBE_DEFINE2(futex, func, cmp_requeue, exit, "int", "int");
    161  1.12.4.1   thorpej 
    162  1.12.4.1   thorpej /* entry: uaddr, nwake, uaddr2, nwake2, wakeop, fflags */
    163  1.12.4.1   thorpej /* exit: error, nwoken */
    164  1.12.4.1   thorpej SDT_PROBE_DEFINE6(futex, func, wake_op, entry, "int *", "int", "int *", "int",
    165  1.12.4.1   thorpej 		  "int", "int");
    166  1.12.4.1   thorpej SDT_PROBE_DEFINE2(futex, func, wake_op, exit, "int", "int");
    167  1.12.4.1   thorpej 
    168  1.12.4.1   thorpej SDT_PROBE_DEFINE0(futex, wait, finish, normally);
    169  1.12.4.1   thorpej SDT_PROBE_DEFINE0(futex, wait, finish, wakerace);
    170  1.12.4.1   thorpej SDT_PROBE_DEFINE0(futex, wait, finish, aborted);
    171  1.12.4.1   thorpej 
    172  1.12.4.1   thorpej /* entry: timo */
    173  1.12.4.1   thorpej /* exit: error */
    174  1.12.4.1   thorpej SDT_PROBE_DEFINE1(futex, wait, sleepq_block, entry, "int");
    175  1.12.4.1   thorpej SDT_PROBE_DEFINE1(futex, wait, sleepq_block, exit, "int");
    176  1.12.4.1   thorpej 
    177  1.12.4.1   thorpej /*
    178       1.1   thorpej  * Lock order:
    179       1.1   thorpej  *
    180       1.1   thorpej  *	futex_tab.lock
    181  1.12.4.1   thorpej  *	futex::fx_op_lock		ordered by kva of struct futex
    182  1.12.4.1   thorpej  *	 -> futex::fx_sq_lock		ordered by kva of sleepq lock
    183  1.12.4.1   thorpej  *
    184  1.12.4.1   thorpej  * N.B. multiple futexes can share a single sleepq lock.
    185       1.1   thorpej  */
    186       1.1   thorpej 
    187       1.1   thorpej /*
    188       1.1   thorpej  * union futex_key
    189       1.1   thorpej  *
    190       1.1   thorpej  *	A futex is addressed either by a vmspace+va (private) or by
    191       1.1   thorpej  *	a uvm_voaddr (shared).
    192       1.1   thorpej  */
    193       1.1   thorpej union futex_key {
    194       1.1   thorpej 	struct {
    195       1.1   thorpej 		struct vmspace			*vmspace;
    196       1.1   thorpej 		vaddr_t				va;
    197       1.1   thorpej 	}			fk_private;
    198       1.1   thorpej 	struct uvm_voaddr	fk_shared;
    199       1.1   thorpej };
    200       1.1   thorpej 
    201       1.1   thorpej /*
    202       1.1   thorpej  * struct futex
    203       1.1   thorpej  *
    204       1.1   thorpej  *	Kernel state for a futex located at a particular address in a
    205       1.1   thorpej  *	particular virtual address space.
    206       1.1   thorpej  *
    207       1.1   thorpej  *	N.B. fx_refcnt is an unsigned long because we need to be able
    208       1.1   thorpej  *	to operate on it atomically on all systems while at the same
    209       1.1   thorpej  *	time rendering practically impossible the chance of it reaching
    210       1.1   thorpej  *	its max value.  In practice, we're limited by the number of LWPs
    211       1.1   thorpej  *	that can be present on the system at any given time, and the
    212       1.1   thorpej  *	assumption is that limit will be good enough on a 32-bit platform.
    213       1.1   thorpej  *	See futex_wake() for why overflow needs to be avoided.
    214  1.12.4.1   thorpej  *
    215  1.12.4.1   thorpej  *	XXX Since futex addresses must be 4-byte aligned, we could
    216  1.12.4.1   thorpej  *	XXX squirrel away fx_shared and fx_on_tree bits in the "va"
    217  1.12.4.1   thorpej  *	XXX field of the key.  Worth it?
    218       1.1   thorpej  */
    219       1.1   thorpej struct futex {
    220       1.1   thorpej 	union futex_key		fx_key;
    221  1.12.4.1   thorpej 	struct rb_node		fx_node;
    222       1.1   thorpej 	unsigned long		fx_refcnt;
    223       1.1   thorpej 	bool			fx_shared;
    224       1.1   thorpej 	bool			fx_on_tree;
    225  1.12.4.1   thorpej 	uint8_t			fx_class;
    226       1.1   thorpej 
    227  1.12.4.1   thorpej 	kmutex_t		fx_op_lock;	/* adaptive */
    228  1.12.4.1   thorpej 	kmutex_t *		fx_sq_lock;	/* &sleepq_locks[...] */
    229  1.12.4.1   thorpej 	sleepq_t		fx_sqs[2];	/* 0=reader, 1=writer */
    230  1.12.4.1   thorpej 	unsigned int		fx_nwaiters[2];
    231       1.1   thorpej };
    232       1.1   thorpej 
    233       1.1   thorpej /*
    234  1.12.4.1   thorpej  * futex classes: Some futex operations can only be carried out on
    235  1.12.4.1   thorpej  * futexes that are known to be abiding by a certain protocol.  These
    236  1.12.4.1   thorpej  * classes are assigned to a futex when created due to a wait event,
    237  1.12.4.1   thorpej  * and when subsequent wake or requeue operations are issued, the
    238  1.12.4.1   thorpej  * class is checked at futex lookup time.  If the class does not match,
    239  1.12.4.1   thorpej  * EINVAL is the result.
    240  1.12.4.1   thorpej  */
    241  1.12.4.1   thorpej #define	FUTEX_CLASS_ANY		0		/* match any class in lookup */
    242  1.12.4.1   thorpej #define	FUTEX_CLASS_NORMAL	1		/* normal futex */
    243  1.12.4.1   thorpej #define	FUTEX_CLASS_RWLOCK	2		/* rwlock futex */
    244  1.12.4.1   thorpej #define	FUTEX_CLASS_PI		3		/* for FUTEX_*_PI ops */
    245  1.12.4.1   thorpej 
    246  1.12.4.1   thorpej /* sleepq definitions */
    247  1.12.4.1   thorpej #define	FUTEX_READERQ		0
    248  1.12.4.1   thorpej #define	FUTEX_WRITERQ		1
    249  1.12.4.1   thorpej 
    250  1.12.4.1   thorpej static const char futex_wmesg[] = "futex";
    251  1.12.4.1   thorpej 
    252  1.12.4.1   thorpej static void	futex_unsleep(lwp_t *, bool);
    253  1.12.4.1   thorpej 
    254  1.12.4.1   thorpej static syncobj_t futex_syncobj = {
    255  1.12.4.1   thorpej 	.sobj_flag	= SOBJ_SLEEPQ_SORTED,
    256  1.12.4.1   thorpej 	.sobj_unsleep	= futex_unsleep,
    257  1.12.4.1   thorpej 	.sobj_changepri	= sleepq_changepri,
    258  1.12.4.1   thorpej 	.sobj_lendpri	= sleepq_lendpri,
    259  1.12.4.1   thorpej 	.sobj_owner	= syncobj_noowner,
    260       1.1   thorpej };
    261       1.1   thorpej 
    262       1.1   thorpej /*
    263       1.1   thorpej  * futex_tab
    264       1.1   thorpej  *
    265       1.1   thorpej  *	Global trees of futexes by vmspace/va and VM object address.
    266       1.1   thorpej  *
    267       1.1   thorpej  *	XXX This obviously doesn't scale in parallel.  We could use a
    268       1.1   thorpej  *	pserialize-safe data structure, but there may be a high cost to
    269       1.1   thorpej  *	frequent deletion since we don't cache futexes after we're done
    270       1.1   thorpej  *	with them.  We could use hashed locks.  But for now, just make
    271       1.1   thorpej  *	sure userland can't DoS the serial performance, by using a
    272       1.1   thorpej  *	balanced binary tree for lookup.
    273       1.1   thorpej  *
    274       1.1   thorpej  *	XXX We could use a per-process tree for the table indexed by
    275       1.1   thorpej  *	virtual address to reduce contention between processes.
    276       1.1   thorpej  */
    277       1.1   thorpej static struct {
    278       1.1   thorpej 	kmutex_t	lock;
    279       1.1   thorpej 	struct rb_tree	va;
    280       1.1   thorpej 	struct rb_tree	oa;
    281       1.1   thorpej } futex_tab __cacheline_aligned;
    282       1.1   thorpej 
    283       1.1   thorpej static int
    284       1.1   thorpej compare_futex_key(void *cookie, const void *n, const void *k)
    285       1.1   thorpej {
    286       1.1   thorpej 	const struct futex *fa = n;
    287       1.1   thorpej 	const union futex_key *fka = &fa->fx_key;
    288       1.1   thorpej 	const union futex_key *fkb = k;
    289       1.1   thorpej 
    290       1.1   thorpej 	if ((uintptr_t)fka->fk_private.vmspace <
    291       1.1   thorpej 	    (uintptr_t)fkb->fk_private.vmspace)
    292       1.1   thorpej 		return -1;
    293       1.1   thorpej 	if ((uintptr_t)fka->fk_private.vmspace >
    294       1.1   thorpej 	    (uintptr_t)fkb->fk_private.vmspace)
    295       1.1   thorpej 		return +1;
    296       1.1   thorpej 	if (fka->fk_private.va < fkb->fk_private.va)
    297       1.1   thorpej 		return -1;
    298       1.1   thorpej 	if (fka->fk_private.va > fkb->fk_private.va)
     299       1.1   thorpej 		return +1;
    300       1.1   thorpej 	return 0;
    301       1.1   thorpej }
    302       1.1   thorpej 
    303       1.1   thorpej static int
    304       1.1   thorpej compare_futex(void *cookie, const void *na, const void *nb)
    305       1.1   thorpej {
    306       1.1   thorpej 	const struct futex *fa = na;
    307       1.1   thorpej 	const struct futex *fb = nb;
    308       1.1   thorpej 
    309       1.1   thorpej 	return compare_futex_key(cookie, fa, &fb->fx_key);
    310       1.1   thorpej }
    311       1.1   thorpej 
    312       1.1   thorpej static const rb_tree_ops_t futex_rb_ops = {
    313       1.1   thorpej 	.rbto_compare_nodes = compare_futex,
    314       1.1   thorpej 	.rbto_compare_key = compare_futex_key,
    315       1.1   thorpej 	.rbto_node_offset = offsetof(struct futex, fx_node),
    316       1.1   thorpej };
    317       1.1   thorpej 
    318       1.1   thorpej static int
    319       1.1   thorpej compare_futex_shared_key(void *cookie, const void *n, const void *k)
    320       1.1   thorpej {
    321       1.1   thorpej 	const struct futex *fa = n;
    322       1.1   thorpej 	const union futex_key *fka = &fa->fx_key;
    323       1.1   thorpej 	const union futex_key *fkb = k;
    324       1.1   thorpej 
    325       1.1   thorpej 	return uvm_voaddr_compare(&fka->fk_shared, &fkb->fk_shared);
    326       1.1   thorpej }
    327       1.1   thorpej 
    328       1.1   thorpej static int
    329       1.1   thorpej compare_futex_shared(void *cookie, const void *na, const void *nb)
    330       1.1   thorpej {
    331       1.1   thorpej 	const struct futex *fa = na;
    332       1.1   thorpej 	const struct futex *fb = nb;
    333       1.1   thorpej 
    334       1.1   thorpej 	return compare_futex_shared_key(cookie, fa, &fb->fx_key);
    335       1.1   thorpej }
    336       1.1   thorpej 
    337       1.1   thorpej static const rb_tree_ops_t futex_shared_rb_ops = {
    338       1.1   thorpej 	.rbto_compare_nodes = compare_futex_shared,
    339       1.1   thorpej 	.rbto_compare_key = compare_futex_shared_key,
    340       1.1   thorpej 	.rbto_node_offset = offsetof(struct futex, fx_node),
    341       1.1   thorpej };
    342       1.1   thorpej 
    343  1.12.4.1   thorpej /*
    344  1.12.4.1   thorpej  * futex_sq_lock2(f, f2)
    345  1.12.4.1   thorpej  *
    346  1.12.4.1   thorpej  *	Acquire the sleepq locks for f and f2, which may be null, or
    347  1.12.4.1   thorpej  *	which may be the same.  If they are distinct, an arbitrary total
    348  1.12.4.1   thorpej  *	order is chosen on the locks.
    349  1.12.4.1   thorpej  *
    350  1.12.4.1   thorpej  *	Callers should only ever acquire multiple sleepq locks
    351  1.12.4.1   thorpej  *	simultaneously using futex_sq_lock2.
    352  1.12.4.1   thorpej  */
    353  1.12.4.1   thorpej static void
    354  1.12.4.1   thorpej futex_sq_lock2(struct futex * const f, struct futex * const f2)
    355  1.12.4.1   thorpej {
    356  1.12.4.1   thorpej 
    357  1.12.4.1   thorpej 	/*
    358  1.12.4.1   thorpej 	 * If both are null, do nothing; if one is null and the other
    359  1.12.4.1   thorpej 	 * is not, lock the other and be done with it.
    360  1.12.4.1   thorpej 	 */
    361  1.12.4.1   thorpej 	if (f == NULL && f2 == NULL) {
    362  1.12.4.1   thorpej 		return;
    363  1.12.4.1   thorpej 	} else if (f == NULL) {
    364  1.12.4.1   thorpej 		mutex_spin_enter(f2->fx_sq_lock);
    365  1.12.4.1   thorpej 		return;
    366  1.12.4.1   thorpej 	} else if (f2 == NULL) {
    367  1.12.4.1   thorpej 		mutex_spin_enter(f->fx_sq_lock);
    368  1.12.4.1   thorpej 		return;
    369  1.12.4.1   thorpej 	}
    370  1.12.4.1   thorpej 
    371  1.12.4.1   thorpej 	kmutex_t * const m = f->fx_sq_lock;
    372  1.12.4.1   thorpej 	kmutex_t * const m2 = f2->fx_sq_lock;
    373  1.12.4.1   thorpej 
    374  1.12.4.1   thorpej 	/* If both locks are the same, acquire only one. */
    375  1.12.4.1   thorpej 	if (m == m2) {
    376  1.12.4.1   thorpej 		mutex_spin_enter(m);
    377  1.12.4.1   thorpej 		return;
    378  1.12.4.1   thorpej 	}
    379  1.12.4.1   thorpej 
    380  1.12.4.1   thorpej 	/* Otherwise, use the ordering on the kva of the lock pointer.  */
    381  1.12.4.1   thorpej 	if ((uintptr_t)m < (uintptr_t)m2) {
    382  1.12.4.1   thorpej 		mutex_spin_enter(m);
    383  1.12.4.1   thorpej 		mutex_spin_enter(m2);
    384  1.12.4.1   thorpej 	} else {
    385  1.12.4.1   thorpej 		mutex_spin_enter(m2);
    386  1.12.4.1   thorpej 		mutex_spin_enter(m);
    387  1.12.4.1   thorpej 	}
    388  1.12.4.1   thorpej }
    389  1.12.4.1   thorpej 
    390  1.12.4.1   thorpej /*
    391  1.12.4.1   thorpej  * futex_sq_unlock2(f, f2)
    392  1.12.4.1   thorpej  *
    393  1.12.4.1   thorpej  *	Release the sleep queue locks for f and f2, which may be null, or
    394  1.12.4.1   thorpej  *	which may have the same underlying lock.
    395  1.12.4.1   thorpej  */
    396  1.12.4.1   thorpej static void
    397  1.12.4.1   thorpej futex_sq_unlock2(struct futex * const f, struct futex * const f2)
    398  1.12.4.1   thorpej {
    399  1.12.4.1   thorpej 
    400  1.12.4.1   thorpej 	/*
    401  1.12.4.1   thorpej 	 * If both are null, do nothing; if one is null and the other
    402  1.12.4.1   thorpej 	 * is not, unlock the other and be done with it.
    403  1.12.4.1   thorpej 	 */
    404  1.12.4.1   thorpej 	if (f == NULL && f2 == NULL) {
    405  1.12.4.1   thorpej 		return;
    406  1.12.4.1   thorpej 	} else if (f == NULL) {
    407  1.12.4.1   thorpej 		mutex_spin_exit(f2->fx_sq_lock);
    408  1.12.4.1   thorpej 		return;
    409  1.12.4.1   thorpej 	} else if (f2 == NULL) {
    410  1.12.4.1   thorpej 		mutex_spin_exit(f->fx_sq_lock);
    411  1.12.4.1   thorpej 		return;
    412  1.12.4.1   thorpej 	}
    413  1.12.4.1   thorpej 
    414  1.12.4.1   thorpej 	kmutex_t * const m = f->fx_sq_lock;
    415  1.12.4.1   thorpej 	kmutex_t * const m2 = f2->fx_sq_lock;
    416  1.12.4.1   thorpej 
    417  1.12.4.1   thorpej 	/* If both locks are the same, release only one. */
    418  1.12.4.1   thorpej 	if (m == m2) {
    419  1.12.4.1   thorpej 		mutex_spin_exit(m);
    420  1.12.4.1   thorpej 		return;
    421  1.12.4.1   thorpej 	}
    422  1.12.4.1   thorpej 
    423  1.12.4.1   thorpej 	/* Otherwise, use the ordering on the kva of the lock pointer.  */
    424  1.12.4.1   thorpej 	if ((uintptr_t)m < (uintptr_t)m2) {
    425  1.12.4.1   thorpej 		mutex_spin_exit(m2);
    426  1.12.4.1   thorpej 		mutex_spin_exit(m);
    427  1.12.4.1   thorpej 	} else {
    428  1.12.4.1   thorpej 		mutex_spin_exit(m);
    429  1.12.4.1   thorpej 		mutex_spin_exit(m2);
    430  1.12.4.1   thorpej 	}
    431  1.12.4.1   thorpej }
    432       1.1   thorpej 
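/*
 * Typical use of these helpers (a sketch only; the real wake and
 * requeue paths appear later in this file): an operation that moves
 * waiters between two futexes holds both futexes' op locks, then
 * takes the sleepq locks in the canonical order before touching the
 * queues:
 *
 *	futex_sq_lock2(f, f2);
 *	// ... move LWPs from f->fx_sqs[q] to f2->fx_sqs[q],
 *	//     adjusting fx_nwaiters[] on both futexes ...
 *	futex_sq_unlock2(f, f2);
 */
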
    433       1.1   thorpej /*
    434       1.1   thorpej  * futex_load(uaddr, kaddr)
    435       1.1   thorpej  *
    436       1.1   thorpej  *	Perform a single atomic load to read *uaddr, and return the
    437       1.1   thorpej  *	result in *kaddr.  Return 0 on success, EFAULT if uaddr is not
    438       1.1   thorpej  *	mapped.
    439       1.1   thorpej  */
    440       1.1   thorpej static inline int
    441       1.1   thorpej futex_load(int *uaddr, int *kaddr)
    442       1.1   thorpej {
    443       1.1   thorpej 	return ufetch_int((u_int *)uaddr, (u_int *)kaddr);
    444       1.1   thorpej }
    445       1.1   thorpej 
    446       1.1   thorpej /*
    447       1.1   thorpej  * futex_test(uaddr, expected)
    448       1.1   thorpej  *
    449       1.1   thorpej  *	True if *uaddr == expected.  False if *uaddr != expected, or if
    450       1.1   thorpej  *	uaddr is not mapped.
    451       1.1   thorpej  */
    452       1.1   thorpej static bool
    453       1.1   thorpej futex_test(int *uaddr, int expected)
    454       1.1   thorpej {
    455       1.1   thorpej 	int val;
    456       1.1   thorpej 	int error;
    457       1.1   thorpej 
    458       1.1   thorpej 	error = futex_load(uaddr, &val);
    459       1.1   thorpej 	if (error)
    460       1.1   thorpej 		return false;
    461       1.1   thorpej 	return val == expected;
    462       1.1   thorpej }
    463       1.1   thorpej 
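/*
 * How the wait path uses futex_test() to avoid missed wakeups (a
 * sketch, not the exact code in the syscall front-end later in this
 * file): the futex word is re-tested only after the futex's op lock
 * is held, so a concurrent wake -- which serializes on the same op
 * lock -- either finds the waiter on the sleepq or the waiter sees
 * the changed word and backs out with EAGAIN:
 *
 *	mutex_enter(&f->fx_op_lock);
 *	if (!futex_test(uaddr, val)) {
 *		mutex_exit(&f->fx_op_lock);
 *		error = EAGAIN;
 *	} else {
 *		// futex_wait() releases fx_op_lock once the LWP is
 *		// safely on the sleep queue.
 *		error = futex_wait(f, q, deadline, clkid, bitset);
 *	}
 */
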
    464  1.12.4.1   thorpej static pool_cache_t	futex_cache		__read_mostly;
    465  1.12.4.1   thorpej 
    466  1.12.4.1   thorpej static int	futex_ctor(void *, void *, int);
    467  1.12.4.1   thorpej static void	futex_dtor(void *, void *);
    468  1.12.4.1   thorpej 
    469       1.1   thorpej /*
    470       1.1   thorpej  * futex_sys_init()
    471       1.1   thorpej  *
    472       1.1   thorpej  *	Initialize the futex subsystem.
    473       1.1   thorpej  */
    474       1.1   thorpej void
    475       1.1   thorpej futex_sys_init(void)
    476       1.1   thorpej {
    477       1.1   thorpej 
    478       1.1   thorpej 	mutex_init(&futex_tab.lock, MUTEX_DEFAULT, IPL_NONE);
    479       1.1   thorpej 	rb_tree_init(&futex_tab.va, &futex_rb_ops);
    480       1.1   thorpej 	rb_tree_init(&futex_tab.oa, &futex_shared_rb_ops);
    481  1.12.4.1   thorpej 
    482  1.12.4.1   thorpej 	futex_cache = pool_cache_init(sizeof(struct futex),
    483  1.12.4.1   thorpej 	    coherency_unit, 0, 0, "futex", NULL, IPL_NONE, futex_ctor,
    484  1.12.4.1   thorpej 	    futex_dtor, NULL);
    485       1.1   thorpej }
    486       1.1   thorpej 
    487       1.1   thorpej /*
    488       1.1   thorpej  * futex_sys_fini()
    489       1.1   thorpej  *
    490       1.1   thorpej  *	Finalize the futex subsystem.
    491       1.1   thorpej  */
    492       1.1   thorpej void
    493       1.1   thorpej futex_sys_fini(void)
    494       1.1   thorpej {
    495       1.1   thorpej 
    496       1.1   thorpej 	KASSERT(RB_TREE_MIN(&futex_tab.oa) == NULL);
    497       1.1   thorpej 	KASSERT(RB_TREE_MIN(&futex_tab.va) == NULL);
    498       1.1   thorpej 	mutex_destroy(&futex_tab.lock);
    499       1.1   thorpej }
    500       1.1   thorpej 
    501       1.1   thorpej /*
    502  1.12.4.1   thorpej  * futex_ctor()
    503       1.1   thorpej  *
    504  1.12.4.1   thorpej  *	Futex object constructor.
    505       1.1   thorpej  */
    506  1.12.4.1   thorpej static int
    507  1.12.4.1   thorpej futex_ctor(void *arg __unused, void *obj, int flags __unused)
    508       1.1   thorpej {
    509  1.12.4.1   thorpej 	extern sleepqlock_t sleepq_locks[SLEEPTAB_HASH_SIZE];
    510  1.12.4.1   thorpej 	struct futex * const f = obj;
    511       1.1   thorpej 
    512  1.12.4.1   thorpej 	mutex_init(&f->fx_op_lock, MUTEX_DEFAULT, IPL_NONE);
    513  1.12.4.1   thorpej 	f->fx_sq_lock = &sleepq_locks[SLEEPTAB_HASH(f)].lock;
    514       1.1   thorpej 
    515  1.12.4.1   thorpej 	sleepq_init(&f->fx_sqs[FUTEX_READERQ]);
    516  1.12.4.1   thorpej 	sleepq_init(&f->fx_sqs[FUTEX_WRITERQ]);
    517  1.12.4.1   thorpej 	f->fx_nwaiters[FUTEX_READERQ] = f->fx_nwaiters[FUTEX_WRITERQ] = 0;
    518       1.1   thorpej 
    519  1.12.4.1   thorpej 	return 0;
    520       1.1   thorpej }
    521       1.1   thorpej 
    522       1.1   thorpej /*
    523  1.12.4.1   thorpej  * futex_dtor()
    524       1.1   thorpej  *
    525  1.12.4.1   thorpej  *	Futex object destructor.
    526       1.1   thorpej  */
    527       1.1   thorpej static void
    528  1.12.4.1   thorpej futex_dtor(void *arg __unused, void *obj)
    529       1.1   thorpej {
    530  1.12.4.1   thorpej 	struct futex * const f = obj;
    531       1.1   thorpej 
    532  1.12.4.1   thorpej 	mutex_destroy(&f->fx_op_lock);
    533  1.12.4.1   thorpej 	f->fx_sq_lock = NULL;
    534       1.1   thorpej }
    535       1.1   thorpej 
    536       1.1   thorpej /*
    537       1.1   thorpej  * futex_key_init(key, vm, va, shared)
    538       1.1   thorpej  *
    539       1.1   thorpej  *	Initialize a futex key for lookup, etc.
    540       1.1   thorpej  */
    541       1.1   thorpej static int
    542       1.1   thorpej futex_key_init(union futex_key *fk, struct vmspace *vm, vaddr_t va, bool shared)
    543       1.1   thorpej {
    544       1.1   thorpej 	int error = 0;
    545       1.1   thorpej 
    546       1.1   thorpej 	if (__predict_false(shared)) {
    547       1.1   thorpej 		if (!uvm_voaddr_acquire(&vm->vm_map, va, &fk->fk_shared))
    548       1.1   thorpej 			error = EFAULT;
    549       1.1   thorpej 	} else {
    550       1.1   thorpej 		fk->fk_private.vmspace = vm;
    551       1.1   thorpej 		fk->fk_private.va = va;
    552       1.1   thorpej 	}
    553       1.1   thorpej 
    554       1.1   thorpej 	return error;
    555       1.1   thorpej }
    556       1.1   thorpej 
    557       1.1   thorpej /*
    558       1.1   thorpej  * futex_key_fini(key, shared)
    559       1.1   thorpej  *
    560       1.1   thorpej  *	Release a futex key.
    561       1.1   thorpej  */
    562       1.1   thorpej static void
    563       1.1   thorpej futex_key_fini(union futex_key *fk, bool shared)
    564       1.1   thorpej {
    565       1.1   thorpej 	if (__predict_false(shared))
    566       1.1   thorpej 		uvm_voaddr_release(&fk->fk_shared);
    567       1.1   thorpej 	memset(fk, 0, sizeof(*fk));
    568       1.1   thorpej }
    569       1.1   thorpej 
    570       1.1   thorpej /*
    571       1.1   thorpej  * futex_create(fk, shared)
    572       1.1   thorpej  *
    573       1.1   thorpej  *	Create a futex.  Initial reference count is 1, representing the
    574       1.1   thorpej  *	caller.  Returns NULL on failure.  Always takes ownership of the
    575       1.1   thorpej  *	key, either transferring it to the newly-created futex, or releasing
    576       1.1   thorpej  *	the key if creation fails.
    577       1.1   thorpej  *
    578       1.1   thorpej  *	Never sleeps for memory, but may sleep to acquire a lock.
    579       1.1   thorpej  */
    580       1.1   thorpej static struct futex *
    581  1.12.4.1   thorpej futex_create(union futex_key *fk, bool shared, uint8_t class)
    582       1.1   thorpej {
    583       1.1   thorpej 	struct futex *f;
    584       1.1   thorpej 
    585  1.12.4.1   thorpej 	f = pool_cache_get(futex_cache, PR_NOWAIT);
    586       1.1   thorpej 	if (f == NULL) {
    587       1.1   thorpej 		futex_key_fini(fk, shared);
    588       1.1   thorpej 		return NULL;
    589       1.1   thorpej 	}
    590       1.1   thorpej 	f->fx_key = *fk;
    591       1.1   thorpej 	f->fx_refcnt = 1;
    592       1.1   thorpej 	f->fx_shared = shared;
    593       1.1   thorpej 	f->fx_on_tree = false;
    594  1.12.4.1   thorpej 	f->fx_class = class;
    595       1.1   thorpej 
    596       1.1   thorpej 	return f;
    597       1.1   thorpej }
    598       1.1   thorpej 
    599       1.1   thorpej /*
    600       1.1   thorpej  * futex_destroy(f)
    601       1.1   thorpej  *
    602       1.1   thorpej  *	Destroy a futex created with futex_create.  Reference count
    603       1.1   thorpej  *	must be zero.
    604       1.1   thorpej  *
    605       1.1   thorpej  *	May sleep.
    606       1.1   thorpej  */
    607       1.1   thorpej static void
    608       1.1   thorpej futex_destroy(struct futex *f)
    609       1.1   thorpej {
    610       1.1   thorpej 
    611       1.1   thorpej 	ASSERT_SLEEPABLE();
    612       1.1   thorpej 
    613       1.1   thorpej 	KASSERT(atomic_load_relaxed(&f->fx_refcnt) == 0);
    614       1.1   thorpej 	KASSERT(!f->fx_on_tree);
    615  1.12.4.1   thorpej 	KASSERT(LIST_EMPTY(&f->fx_sqs[FUTEX_READERQ]));
    616  1.12.4.1   thorpej 	KASSERT(LIST_EMPTY(&f->fx_sqs[FUTEX_WRITERQ]));
    617  1.12.4.1   thorpej 	KASSERT(f->fx_nwaiters[FUTEX_READERQ] == 0);
    618  1.12.4.1   thorpej 	KASSERT(f->fx_nwaiters[FUTEX_WRITERQ] == 0);
    619       1.1   thorpej 
    620       1.1   thorpej 	futex_key_fini(&f->fx_key, f->fx_shared);
    621       1.1   thorpej 
    622  1.12.4.1   thorpej 	pool_cache_put(futex_cache, f);
    623       1.1   thorpej }
    624       1.1   thorpej 
    625       1.1   thorpej /*
    626  1.12.4.1   thorpej  * futex_hold_count(f, n)
    627       1.1   thorpej  *
     628       1.1   thorpej  *	Attempt to acquire n references to f.  Return 0 on success,
    629       1.1   thorpej  *	ENFILE on too many references.
    630       1.1   thorpej  *
    631       1.1   thorpej  *	Never sleeps.
    632       1.1   thorpej  */
    633       1.1   thorpej static int
    634  1.12.4.1   thorpej futex_hold_count(struct futex *f, unsigned long const count)
    635       1.1   thorpej {
    636       1.1   thorpej 	unsigned long refcnt;
    637       1.1   thorpej 
    638       1.1   thorpej 	do {
    639       1.1   thorpej 		refcnt = atomic_load_relaxed(&f->fx_refcnt);
    640  1.12.4.1   thorpej 		if (ULONG_MAX - refcnt < count)
    641       1.1   thorpej 			return ENFILE;
    642  1.12.4.1   thorpej 	} while (atomic_cas_ulong(&f->fx_refcnt, refcnt,
    643  1.12.4.1   thorpej 				  refcnt + count) != refcnt);
    644       1.1   thorpej 
    645       1.1   thorpej 	return 0;
    646       1.1   thorpej }
    647  1.12.4.1   thorpej #define	futex_hold(f)	futex_hold_count(f, 1)
    648       1.1   thorpej 
    649       1.1   thorpej /*
    650  1.12.4.1   thorpej  * futex_rele_count(f, n)
    651       1.1   thorpej  *
     652       1.1   thorpej  *	Release n references to f acquired with futex_create or
    653       1.1   thorpej  *	futex_hold.
    654       1.1   thorpej  *
    655       1.1   thorpej  *	May sleep to free f.
    656       1.1   thorpej  */
    657       1.1   thorpej static void
    658  1.12.4.1   thorpej futex_rele_count(struct futex *f, unsigned long const count)
    659       1.1   thorpej {
    660       1.1   thorpej 	unsigned long refcnt;
    661       1.1   thorpej 
    662       1.1   thorpej 	ASSERT_SLEEPABLE();
    663       1.1   thorpej 
    664       1.1   thorpej 	do {
    665       1.1   thorpej 		refcnt = atomic_load_relaxed(&f->fx_refcnt);
    666  1.12.4.1   thorpej 		KASSERT(refcnt >= count);
    667  1.12.4.1   thorpej 		if (refcnt - count == 0)
    668       1.1   thorpej 			goto trylast;
    669  1.12.4.1   thorpej 	} while (atomic_cas_ulong(&f->fx_refcnt, refcnt,
    670  1.12.4.1   thorpej 				  refcnt - count) != refcnt);
    671       1.1   thorpej 	return;
    672       1.1   thorpej 
    673       1.1   thorpej trylast:
    674  1.12.4.1   thorpej 	KASSERT(count <= LONG_MAX);
    675       1.1   thorpej 	mutex_enter(&futex_tab.lock);
    676  1.12.4.1   thorpej 	if (atomic_add_long_nv(&f->fx_refcnt, -(long)count) == 0) {
    677       1.1   thorpej 		if (f->fx_on_tree) {
    678       1.1   thorpej 			if (__predict_false(f->fx_shared))
    679       1.1   thorpej 				rb_tree_remove_node(&futex_tab.oa, f);
    680       1.1   thorpej 			else
    681       1.1   thorpej 				rb_tree_remove_node(&futex_tab.va, f);
    682       1.1   thorpej 			f->fx_on_tree = false;
    683       1.1   thorpej 		}
    684       1.1   thorpej 	} else {
    685       1.1   thorpej 		/* References remain -- don't destroy it.  */
    686       1.1   thorpej 		f = NULL;
    687       1.1   thorpej 	}
    688       1.1   thorpej 	mutex_exit(&futex_tab.lock);
    689       1.1   thorpej 	if (f != NULL)
    690       1.1   thorpej 		futex_destroy(f);
    691       1.1   thorpej }
    692  1.12.4.1   thorpej #define	futex_rele(f)	futex_rele_count(f, 1)
    693       1.1   thorpej 
    694       1.1   thorpej /*
    695  1.12.4.1   thorpej  * futex_rele_count_not_last(f, n)
    696       1.1   thorpej  *
     697       1.1   thorpej  *	Release n references to f acquired with futex_create or
    698       1.1   thorpej  *	futex_hold.
    699       1.1   thorpej  *
    700       1.1   thorpej  *	This version asserts that we are not dropping the last
    701       1.1   thorpej  *	reference to f.
    702       1.1   thorpej  */
    703       1.1   thorpej static void
    704  1.12.4.1   thorpej futex_rele_count_not_last(struct futex *f, unsigned long const count)
    705       1.1   thorpej {
    706       1.1   thorpej 	unsigned long refcnt;
    707       1.1   thorpej 
    708       1.1   thorpej 	do {
    709       1.1   thorpej 		refcnt = atomic_load_relaxed(&f->fx_refcnt);
    710  1.12.4.1   thorpej 		KASSERT(refcnt >= count);
    711  1.12.4.1   thorpej 	} while (atomic_cas_ulong(&f->fx_refcnt, refcnt,
    712  1.12.4.1   thorpej 				  refcnt - count) != refcnt);
    713       1.1   thorpej }
    714       1.1   thorpej 
    715       1.1   thorpej /*
    716  1.12.4.1   thorpej  * futex_lookup_by_key(key, shared, class, &f)
    717       1.1   thorpej  *
     718       1.1   thorpej  *	Try to find an existing futex matching the specified key.
    719       1.1   thorpej  *	On success, return 0, set f to found futex or to NULL if not found,
    720       1.1   thorpej  *	and increment f's reference count if found.
    721       1.1   thorpej  *
    722       1.1   thorpej  *	Return ENFILE if reference count too high.
    723       1.1   thorpej  *
    724       1.1   thorpej  *	Internal lookup routine shared by futex_lookup() and
    725       1.5  riastrad  *	futex_lookup_create().
    726       1.1   thorpej  */
    727       1.1   thorpej static int
    728  1.12.4.1   thorpej futex_lookup_by_key(union futex_key *fk, bool shared, uint8_t class,
    729  1.12.4.1   thorpej     struct futex **fp)
    730       1.1   thorpej {
    731       1.1   thorpej 	struct futex *f;
    732       1.1   thorpej 	int error = 0;
    733       1.1   thorpej 
    734       1.1   thorpej 	mutex_enter(&futex_tab.lock);
    735       1.1   thorpej 	if (__predict_false(shared)) {
    736       1.1   thorpej 		f = rb_tree_find_node(&futex_tab.oa, fk);
    737       1.1   thorpej 	} else {
    738       1.1   thorpej 		f = rb_tree_find_node(&futex_tab.va, fk);
    739       1.1   thorpej 	}
    740       1.1   thorpej 	if (f) {
    741  1.12.4.1   thorpej 		if (__predict_true(f->fx_class == class ||
    742  1.12.4.1   thorpej 				   class == FUTEX_CLASS_ANY))
    743  1.12.4.1   thorpej 			error = futex_hold(f);
    744  1.12.4.1   thorpej 		else
    745  1.12.4.1   thorpej 			error = EINVAL;
    746       1.1   thorpej 		if (error)
    747       1.1   thorpej 			f = NULL;
    748       1.1   thorpej 	}
    749       1.1   thorpej  	*fp = f;
    750       1.1   thorpej 	mutex_exit(&futex_tab.lock);
    751       1.1   thorpej 
    752       1.1   thorpej 	return error;
    753       1.1   thorpej }
    754       1.1   thorpej 
    755       1.1   thorpej /*
    756       1.1   thorpej  * futex_insert(f, fp)
    757       1.1   thorpej  *
    758       1.1   thorpej  *	Try to insert the futex f into the tree by va.  If there
    759       1.1   thorpej  *	already is a futex for its va, acquire a reference to it, and
    760       1.1   thorpej  *	store it in *fp; otherwise store f in *fp.
    761       1.1   thorpej  *
    762       1.1   thorpej  *	Return 0 on success, ENFILE if there already is a futex but its
    763       1.1   thorpej  *	reference count is too high.
    764       1.1   thorpej  */
    765       1.1   thorpej static int
    766       1.1   thorpej futex_insert(struct futex *f, struct futex **fp)
    767       1.1   thorpej {
    768       1.1   thorpej 	struct futex *f0;
    769       1.1   thorpej 	int error;
    770       1.1   thorpej 
    771       1.1   thorpej 	KASSERT(atomic_load_relaxed(&f->fx_refcnt) != 0);
    772       1.1   thorpej 	KASSERT(!f->fx_on_tree);
    773       1.1   thorpej 
    774       1.1   thorpej 	mutex_enter(&futex_tab.lock);
    775       1.1   thorpej 	if (__predict_false(f->fx_shared))
    776       1.1   thorpej 		f0 = rb_tree_insert_node(&futex_tab.oa, f);
    777       1.1   thorpej 	else
    778       1.1   thorpej 		f0 = rb_tree_insert_node(&futex_tab.va, f);
    779       1.1   thorpej 	if (f0 == f) {
    780       1.1   thorpej 		f->fx_on_tree = true;
    781       1.1   thorpej 		error = 0;
    782       1.1   thorpej 	} else {
    783       1.1   thorpej 		KASSERT(atomic_load_relaxed(&f0->fx_refcnt) != 0);
    784       1.1   thorpej 		KASSERT(f0->fx_on_tree);
    785       1.1   thorpej 		error = futex_hold(f0);
    786       1.1   thorpej 		if (error)
    787       1.1   thorpej 			goto out;
    788       1.1   thorpej 	}
    789       1.1   thorpej 	*fp = f0;
    790       1.1   thorpej out:	mutex_exit(&futex_tab.lock);
    791       1.1   thorpej 
    792       1.1   thorpej 	return error;
    793       1.1   thorpej }
    794       1.1   thorpej 
    795       1.1   thorpej /*
    796  1.12.4.1   thorpej  * futex_lookup(uaddr, shared, class, &f)
    797       1.1   thorpej  *
    798       1.1   thorpej  *	Find a futex at the userland pointer uaddr in the current
    799       1.1   thorpej  *	process's VM space.  On success, return the futex in f and
    800       1.1   thorpej  *	increment its reference count.
    801       1.1   thorpej  *
    802       1.5  riastrad  *	Caller must call futex_rele when done.
    803       1.1   thorpej  */
    804       1.1   thorpej static int
    805  1.12.4.1   thorpej futex_lookup(int *uaddr, bool shared, uint8_t class, struct futex **fp)
    806       1.1   thorpej {
    807       1.1   thorpej 	union futex_key fk;
    808       1.1   thorpej 	struct vmspace *vm = curproc->p_vmspace;
    809       1.1   thorpej 	vaddr_t va = (vaddr_t)uaddr;
    810       1.1   thorpej 	int error;
    811       1.1   thorpej 
    812       1.1   thorpej 	/*
    813       1.1   thorpej 	 * Reject unaligned user pointers so we don't cross page
    814       1.1   thorpej 	 * boundaries and so atomics will work.
    815       1.1   thorpej 	 */
    816  1.12.4.1   thorpej 	if (__predict_false((va & 3) != 0))
    817       1.1   thorpej 		return EINVAL;
    818       1.1   thorpej 
    819       1.1   thorpej 	/* Look it up. */
    820       1.1   thorpej 	error = futex_key_init(&fk, vm, va, shared);
    821       1.1   thorpej 	if (error)
    822       1.1   thorpej 		return error;
    823       1.1   thorpej 
    824  1.12.4.1   thorpej 	error = futex_lookup_by_key(&fk, shared, class, fp);
    825       1.1   thorpej 	futex_key_fini(&fk, shared);
    826       1.1   thorpej 	if (error)
    827       1.1   thorpej 		return error;
    828       1.1   thorpej 
    829       1.1   thorpej 	KASSERT(*fp == NULL || (*fp)->fx_shared == shared);
    830  1.12.4.1   thorpej 	KASSERT(*fp == NULL || (*fp)->fx_class == class);
    831       1.1   thorpej 	KASSERT(*fp == NULL || atomic_load_relaxed(&(*fp)->fx_refcnt) != 0);
    832       1.1   thorpej 
    833       1.1   thorpej 	/*
    834       1.1   thorpej 	 * Success!  (Caller must still check whether we found
    835       1.1   thorpej 	 * anything, but nothing went _wrong_ like trying to use
    836       1.1   thorpej 	 * unmapped memory.)
    837       1.1   thorpej 	 */
    838       1.1   thorpej 	KASSERT(error == 0);
    839       1.1   thorpej 
    840       1.1   thorpej 	return error;
    841       1.1   thorpej }
    842       1.1   thorpej 
    843       1.1   thorpej /*
    844  1.12.4.1   thorpej  * futex_lookup_create(uaddr, shared, class, &f)
    845       1.1   thorpej  *
    846       1.1   thorpej  *	Find or create a futex at the userland pointer uaddr in the
    847       1.1   thorpej  *	current process's VM space.  On success, return the futex in f
    848       1.1   thorpej  *	and increment its reference count.
    849       1.1   thorpej  *
    850       1.5  riastrad  *	Caller must call futex_rele when done.
    851       1.1   thorpej  */
    852       1.1   thorpej static int
    853  1.12.4.1   thorpej futex_lookup_create(int *uaddr, bool shared, uint8_t class, struct futex **fp)
    854       1.1   thorpej {
    855       1.1   thorpej 	union futex_key fk;
    856       1.1   thorpej 	struct vmspace *vm = curproc->p_vmspace;
    857       1.1   thorpej 	struct futex *f = NULL;
    858       1.1   thorpej 	vaddr_t va = (vaddr_t)uaddr;
    859       1.1   thorpej 	int error;
    860       1.1   thorpej 
    861       1.1   thorpej 	/*
    862       1.1   thorpej 	 * Reject unaligned user pointers so we don't cross page
    863       1.1   thorpej 	 * boundaries and so atomics will work.
    864       1.1   thorpej 	 */
    865  1.12.4.1   thorpej 	if (__predict_false((va & 3) != 0))
    866  1.12.4.1   thorpej 		return EINVAL;
    867  1.12.4.1   thorpej 
    868  1.12.4.1   thorpej 	if (__predict_false(class == FUTEX_CLASS_ANY))
    869       1.1   thorpej 		return EINVAL;
    870       1.1   thorpej 
    871       1.1   thorpej 	error = futex_key_init(&fk, vm, va, shared);
    872       1.1   thorpej 	if (error)
    873       1.1   thorpej 		return error;
    874       1.1   thorpej 
    875       1.1   thorpej 	/*
    876       1.1   thorpej 	 * Optimistically assume there already is one, and try to find
    877       1.1   thorpej 	 * it.
    878       1.1   thorpej 	 */
    879  1.12.4.1   thorpej 	error = futex_lookup_by_key(&fk, shared, class, fp);
    880       1.1   thorpej 	if (error || *fp != NULL) {
    881       1.1   thorpej 		/*
    882       1.1   thorpej 		 * We either found one, or there was an error.
    883       1.1   thorpej 		 * In either case, we are done with the key.
    884       1.1   thorpej 		 */
    885       1.1   thorpej 		futex_key_fini(&fk, shared);
    886       1.1   thorpej 		goto out;
    887       1.1   thorpej 	}
    888       1.1   thorpej 
    889       1.1   thorpej 	/*
     890       1.1   thorpej 	 * Create a futex record.  This transfers ownership of the key
    891       1.1   thorpej 	 * in all cases.
    892       1.1   thorpej 	 */
    893  1.12.4.1   thorpej 	f = futex_create(&fk, shared, class);
    894       1.1   thorpej 	if (f == NULL) {
    895       1.1   thorpej 		error = ENOMEM;
    896       1.1   thorpej 		goto out;
    897       1.1   thorpej 	}
    898       1.1   thorpej 
    899       1.1   thorpej 	/*
    900       1.1   thorpej 	 * Insert our new futex, or use existing if someone else beat
    901       1.1   thorpej 	 * us to it.
    902       1.1   thorpej 	 */
    903       1.1   thorpej 	error = futex_insert(f, fp);
    904       1.1   thorpej 	if (error)
    905       1.1   thorpej 		goto out;
    906       1.1   thorpej 	if (*fp == f)
    907       1.1   thorpej 		f = NULL;	/* don't release on exit */
    908       1.1   thorpej 
    909       1.1   thorpej 	/* Success!  */
    910       1.1   thorpej 	KASSERT(error == 0);
    911       1.1   thorpej 
    912       1.1   thorpej out:	if (f != NULL)
    913       1.1   thorpej 		futex_rele(f);
    914       1.1   thorpej 	KASSERT(error || *fp != NULL);
    915       1.1   thorpej 	KASSERT(error || atomic_load_relaxed(&(*fp)->fx_refcnt) != 0);
    916       1.1   thorpej 	return error;
    917       1.1   thorpej }
    918       1.1   thorpej 
    919       1.1   thorpej /*
    920  1.12.4.1   thorpej  * futex_unsleep:
    921       1.1   thorpej  *
    922  1.12.4.1   thorpej  *	Remove an LWP from the futex and sleep queue.  This is called when
    923  1.12.4.1   thorpej  *	the LWP has not been awoken normally but instead interrupted: for
    924  1.12.4.1   thorpej  *	example, when a signal is received.  Must be called with the LWP
    925  1.12.4.1   thorpej  *	locked.  Will unlock if "unlock" is true.
    926       1.1   thorpej  */
    927       1.1   thorpej static void
    928  1.12.4.1   thorpej futex_unsleep(lwp_t *l, bool unlock)
    929       1.1   thorpej {
    930  1.12.4.1   thorpej 	struct futex *f __diagused;
    931       1.1   thorpej 
    932  1.12.4.1   thorpej 	f = (struct futex *)(uintptr_t)l->l_wchan;
    933       1.1   thorpej 
    934  1.12.4.1   thorpej 	KASSERT(mutex_owned(f->fx_sq_lock));
    935       1.1   thorpej 
    936  1.12.4.1   thorpej 	/* WRITERQ is more likely */
    937  1.12.4.1   thorpej 	if (__predict_true(l->l_sleepq == &f->fx_sqs[FUTEX_WRITERQ])) {
    938  1.12.4.1   thorpej 		KASSERT(f->fx_nwaiters[FUTEX_WRITERQ] > 0);
    939  1.12.4.1   thorpej 		f->fx_nwaiters[FUTEX_WRITERQ]--;
    940  1.12.4.1   thorpej 	} else {
    941  1.12.4.1   thorpej 		KASSERT(l->l_sleepq == &f->fx_sqs[FUTEX_READERQ]);
    942  1.12.4.1   thorpej 		KASSERT(f->fx_nwaiters[FUTEX_READERQ] > 0);
    943  1.12.4.1   thorpej 		f->fx_nwaiters[FUTEX_READERQ]--;
    944  1.12.4.1   thorpej 	}
    945       1.1   thorpej 
    946  1.12.4.1   thorpej 	sleepq_unsleep(l, unlock);
    947       1.1   thorpej }
    948       1.1   thorpej 
    949       1.1   thorpej /*
     950  1.12.4.1   thorpej  * futex_wait(f, q, deadline, clkid, bitset)
    951       1.1   thorpej  *
    952  1.12.4.1   thorpej  *	Wait until deadline on the clock clkid, or forever if deadline
    953  1.12.4.1   thorpej  *	is NULL, for a futex wakeup.  Return 0 on explicit wakeup,
    954  1.12.4.1   thorpej  *	ETIMEDOUT on timeout, EINTR on signal.
    955       1.1   thorpej  */
    956  1.12.4.1   thorpej static int
    957  1.12.4.1   thorpej futex_wait(struct futex *f, int q, const struct timespec *deadline,
    958  1.12.4.1   thorpej     clockid_t clkid, int bitset)
    959       1.1   thorpej {
    960       1.1   thorpej 
    961       1.1   thorpej 	/*
    962  1.12.4.1   thorpej 	 * Some things to note about how this function works:
    963  1.12.4.1   thorpej 	 *
    964  1.12.4.1   thorpej 	 * ==> When we enter futex_wait(), the futex's op lock is
    965  1.12.4.1   thorpej 	 * held, preventing us from missing wakes.  Once we are on
    966  1.12.4.1   thorpej 	 * the futex's sleepq, it is safe to release the op lock.
    967  1.12.4.1   thorpej 	 *
    968  1.12.4.1   thorpej 	 * ==> We have a single early escape to avoid calling
    969  1.12.4.1   thorpej 	 * sleepq_block(): our deadline already having passed.
    970  1.12.4.1   thorpej 	 * If this is a no-timeout wait, or if the deadline has
    971  1.12.4.1   thorpej 	 * not already passed, we are guaranteed to call sleepq_block().
    972  1.12.4.1   thorpej 	 *
    973  1.12.4.1   thorpej 	 * ==> sleepq_block() contains all of the necessary logic
    974  1.12.4.1   thorpej 	 * for handling signals; we do not need to check for them
    975  1.12.4.1   thorpej 	 * ourselves.
    976  1.12.4.1   thorpej 	 *
    977  1.12.4.1   thorpej 	 * ==> Once we have blocked, other futex operations will
    978  1.12.4.1   thorpej 	 * be able to wake us or requeue us.  Until we have blocked,
    979  1.12.4.1   thorpej 	 * those other futex operations will not be able to acquire
    980  1.12.4.1   thorpej 	 * the sleepq locks in order to perform those operations,
    981  1.12.4.1   thorpej 	 * even if we have dropped the op lock.  Thus, we will not
    982  1.12.4.1   thorpej 	 * miss wakes.  This fundamental invariant is relied upon by
    983  1.12.4.1   thorpej 	 * every other synchronization object in the kernel.
    984  1.12.4.1   thorpej 	 *
    985  1.12.4.1   thorpej 	 * ==> sleepq_block() returns for one of three reasons:
    986  1.12.4.1   thorpej 	 *
    987  1.12.4.1   thorpej 	 *	-- We were awakened.
    988  1.12.4.1   thorpej 	 *	-- We were signaled.
    989  1.12.4.1   thorpej 	 *	-- We timed out.
    990  1.12.4.1   thorpej 	 *
    991  1.12.4.1   thorpej 	 * ==> Once sleepq_block() returns, we will not call
    992  1.12.4.1   thorpej 	 * sleepq_block() again.
    993  1.12.4.1   thorpej 	 *
    994  1.12.4.1   thorpej 	 * ==> If sleepq_block() returns due to being signaled,
    995  1.12.4.1   thorpej 	 * we must check to see if we were also awakened.  This
    996  1.12.4.1   thorpej 	 * is to handle the following sequence:
    997  1.12.4.1   thorpej 	 *
    998  1.12.4.1   thorpej 	 *	-- Futex wake takes us off sleepq, places us on runq.
    999  1.12.4.1   thorpej 	 *	-- We are signaled while sitting on the runq.
   1000  1.12.4.1   thorpej 	 *	-- We run, and sleepq_block() notices the signal and
   1001  1.12.4.1   thorpej 	 *	   returns  EINTR/ERESTART.
   1002  1.12.4.1   thorpej 	 *
   1003  1.12.4.1   thorpej 	 * In this situation, we want to indicate a successful wake
   1004  1.12.4.1   thorpej 	 * to the caller, because that wake has been reported to the
   1005  1.12.4.1   thorpej 	 * thread that issued it.
   1006  1.12.4.1   thorpej 	 *
   1007  1.12.4.1   thorpej 	 * Note that it is NOT possible for a wake to be issued after
   1008  1.12.4.1   thorpej 	 * a signal; the LWP will already have been taken off the sleepq
   1009  1.12.4.1   thorpej 	 * by the signal, so the wake will never happen.  Note that for
   1010  1.12.4.1   thorpej 	 * this reason, we must *never* return ERESTART, because it could
   1011  1.12.4.1   thorpej 	 * result in us waiting again with no one to awaken us.
   1012       1.1   thorpej 	 */
   1013       1.1   thorpej 
   1014  1.12.4.1   thorpej 	struct lwp * const l = curlwp;
   1015  1.12.4.1   thorpej 	int timo;
   1016  1.12.4.1   thorpej 	int error;
   1017       1.1   thorpej 
   1018       1.4  riastrad 	/*
    1019  1.12.4.1   thorpej 	 * Compute our timeout before taking any further locks.
   1020       1.4  riastrad 	 */
   1021  1.12.4.1   thorpej 	if (deadline) {
   1022  1.12.4.1   thorpej 		struct timespec ts;
   1023       1.4  riastrad 
   1024  1.12.4.1   thorpej 		/* Check our watch.  */
   1025  1.12.4.1   thorpej 		error = clock_gettime1(clkid, &ts);
   1026  1.12.4.1   thorpej 		if (error) {
   1027  1.12.4.1   thorpej 			mutex_exit(&f->fx_op_lock);
   1028  1.12.4.1   thorpej 			return error;
   1029  1.12.4.1   thorpej 		}
   1030       1.1   thorpej 
   1031  1.12.4.1   thorpej 		/*
   1032  1.12.4.1   thorpej 		 * If we're past the deadline, ETIMEDOUT.
   1033  1.12.4.1   thorpej 		 */
   1034  1.12.4.1   thorpej 		if (timespeccmp(deadline, &ts, <=)) {
   1035  1.12.4.1   thorpej 			mutex_exit(&f->fx_op_lock);
   1036  1.12.4.1   thorpej 			return ETIMEDOUT;
   1037  1.12.4.1   thorpej 		} else {
   1038  1.12.4.1   thorpej 			/* Count how much time is left.  */
   1039  1.12.4.1   thorpej 			timespecsub(deadline, &ts, &ts);
   1040  1.12.4.1   thorpej 			timo = tstohz(&ts);
   1041  1.12.4.1   thorpej 			if (timo == 0) {
   1042  1.12.4.1   thorpej 				/*
   1043  1.12.4.1   thorpej 				 * tstohz() already rounds up, and
   1044  1.12.4.1   thorpej 				 * a return value of 0 ticks means
   1045  1.12.4.1   thorpej 				 * "expired now or in the past".
   1046  1.12.4.1   thorpej 				 */
   1047  1.12.4.1   thorpej 				mutex_exit(&f->fx_op_lock);
   1048  1.12.4.1   thorpej 				return ETIMEDOUT;
   1049  1.12.4.1   thorpej 			}
   1050  1.12.4.1   thorpej 		}
   1051  1.12.4.1   thorpej 	} else {
   1052  1.12.4.1   thorpej 		timo = 0;
   1053  1.12.4.1   thorpej 	}
   1054       1.1   thorpej 
   1055       1.1   thorpej 	/*
   1056  1.12.4.1   thorpej 	 * Acquire in sleepq -> lwp order.  While we're at it, ensure
   1057  1.12.4.1   thorpej 	 * that there's room for us to block.
   1058       1.1   thorpej 	 */
   1059  1.12.4.1   thorpej 	mutex_spin_enter(f->fx_sq_lock);
   1060  1.12.4.1   thorpej 	if (__predict_false(q == FUTEX_READERQ &&
   1061  1.12.4.1   thorpej 			    f->fx_nwaiters[q] == FUTEX_TID_MASK)) {
   1062  1.12.4.1   thorpej 		mutex_spin_exit(f->fx_sq_lock);
   1063  1.12.4.1   thorpej 		mutex_exit(&f->fx_op_lock);
   1064  1.12.4.1   thorpej 		return ENFILE;
   1065  1.12.4.1   thorpej 	}
   1066  1.12.4.1   thorpej 	lwp_lock(l);
   1067       1.4  riastrad 
   1068       1.4  riastrad 	/*
   1069  1.12.4.1   thorpej 	 * No need for locks here; until we're on a futex's sleepq, these
   1070  1.12.4.1   thorpej 	 * values are private to the LWP, and the locks needed to get onto
   1071  1.12.4.1   thorpej 	 * the sleepq provide the memory barriers needed to ensure visibility.
   1072       1.4  riastrad 	 */
   1073  1.12.4.1   thorpej 	l->l_futex = f;
   1074  1.12.4.1   thorpej 	l->l_futex_wakesel = bitset;
   1075       1.4  riastrad 
   1076       1.4  riastrad 	/*
   1077  1.12.4.1   thorpej 	 * This is an inlined bit of sleepq_enter();
   1078  1.12.4.1   thorpej 	 * we already hold the lwp lock.
   1079       1.4  riastrad 	 */
   1080  1.12.4.1   thorpej 	lwp_unlock_to(l, f->fx_sq_lock);
   1081  1.12.4.1   thorpej 	KERNEL_UNLOCK_ALL(NULL, &l->l_biglocks);
   1082  1.12.4.1   thorpej 	KASSERT(lwp_locked(l, f->fx_sq_lock));
   1083       1.4  riastrad 
   1084  1.12.4.1   thorpej 	sleepq_enqueue(&f->fx_sqs[q], f, futex_wmesg, &futex_syncobj, true);
   1085  1.12.4.1   thorpej 	f->fx_nwaiters[q]++;
   1086      1.11  riastrad 
   1087  1.12.4.1   thorpej 	/* We can now safely release the op lock. */
   1088  1.12.4.1   thorpej 	mutex_exit(&f->fx_op_lock);
   1089      1.11  riastrad 
   1090  1.12.4.1   thorpej 	SDT_PROBE1(futex, wait, sleepq_block, entry, timo);
   1091  1.12.4.1   thorpej 	error = sleepq_block(timo, true);
   1092  1.12.4.1   thorpej 	SDT_PROBE1(futex, wait, sleepq_block, exit, error);
   1093      1.11  riastrad 
   1094  1.12.4.1   thorpej 	/*
   1095  1.12.4.1   thorpej 	 * f is no longer valid; we may have been moved to another
   1096  1.12.4.1   thorpej 	 * futex sleepq while we slept.
   1097  1.12.4.1   thorpej 	 */
   1098       1.4  riastrad 
   1099       1.4  riastrad 	/*
   1100  1.12.4.1   thorpej 	 * If something went wrong, we may still have a futex reference.
   1101  1.12.4.1   thorpej 	 * Check for that here and drop it if so.  We can do this without
   1102  1.12.4.1   thorpej 	 * taking any locks because l->l_futex is private to the LWP when
   1103  1.12.4.1   thorpej 	 * not on any futex's sleepq, and we are not on any sleepq because
   1104  1.12.4.1   thorpej 	 * we are running.
   1105  1.12.4.1   thorpej 	 *
    1106  1.12.4.1   thorpej 	 * Map EWOULDBLOCK from sleepq_block() to ETIMEDOUT, and ensure
    1107  1.12.4.1   thorpej 	 * the only other error we return is EINTR.
   1108       1.4  riastrad 	 */
   1109       1.4  riastrad 	if (error) {
   1110  1.12.4.1   thorpej 		f = l->l_futex;
   1111  1.12.4.1   thorpej 		if (f != NULL) {
   1112  1.12.4.1   thorpej 			SDT_PROBE0(futex, wait, finish, aborted);
   1113  1.12.4.1   thorpej 			l->l_futex = NULL;
   1114  1.12.4.1   thorpej 			futex_rele(f);
   1115  1.12.4.1   thorpej 		} else {
   1116  1.12.4.1   thorpej 			/* Raced with wakeup; report success. */
   1117  1.12.4.1   thorpej 			SDT_PROBE0(futex, wait, finish, wakerace);
   1118  1.12.4.1   thorpej 			error = 0;
   1119  1.12.4.1   thorpej 		}
   1120       1.4  riastrad 		if (error == EWOULDBLOCK)
   1121       1.4  riastrad 			error = ETIMEDOUT;
   1122  1.12.4.1   thorpej 		else if (error != ETIMEDOUT)
   1123  1.12.4.1   thorpej 			error = EINTR;
   1124  1.12.4.1   thorpej 	} else {
   1125  1.12.4.1   thorpej 		KASSERT(l->l_futex == NULL);
   1126  1.12.4.1   thorpej 		SDT_PROBE0(futex, wait, finish, normally);
   1127       1.4  riastrad 	}
   1128       1.4  riastrad 
   1129       1.1   thorpej 	return error;
   1130       1.1   thorpej }
   1131       1.1   thorpej 
   1132       1.1   thorpej /*
   1133  1.12.4.1   thorpej  * futex_wake(f, q, nwake, f2, q2, nrequeue, bitset)
   1134       1.1   thorpej  *
   1135       1.1   thorpej  *	Wake up to nwake waiters on f matching bitset; then, if f2 is
   1136       1.1   thorpej  *	provided, move up to nrequeue remaining waiters on f matching
   1137       1.1   thorpej  *	bitset to f2.  Return the number of waiters actually woken.
   1138       1.1   thorpej  *	Caller must hold the locks of f and f2, if provided.
   1139       1.1   thorpej  */
   1140       1.1   thorpej static unsigned
   1141  1.12.4.1   thorpej futex_wake(struct futex *f, int q, unsigned int const nwake,
   1142  1.12.4.1   thorpej     struct futex *f2, int q2, unsigned int const nrequeue,
   1143  1.12.4.1   thorpej     int bitset)
   1144       1.1   thorpej {
   1145  1.12.4.1   thorpej 	struct lwp *l, *l_next;
   1146       1.1   thorpej 	unsigned nwoken = 0;
   1147  1.12.4.1   thorpej 	unsigned nrequeued = 0;
   1148       1.1   thorpej 
   1149  1.12.4.1   thorpej 	KASSERT(mutex_owned(&f->fx_op_lock));
   1150  1.12.4.1   thorpej 	KASSERT(f2 == NULL || mutex_owned(&f2->fx_op_lock));
   1151  1.12.4.1   thorpej 
   1152  1.12.4.1   thorpej 	futex_sq_lock2(f, f2);
   1153       1.1   thorpej 
   1154       1.1   thorpej 	/* Wake up to nwake waiters, and count the number woken.  */
   1155  1.12.4.1   thorpej 	LIST_FOREACH_SAFE(l, &f->fx_sqs[q], l_sleepchain, l_next) {
   1156  1.12.4.1   thorpej 		if (nwoken == nwake)
   1157       1.1   thorpej 			break;
   1158  1.12.4.1   thorpej 
   1159  1.12.4.1   thorpej 		KASSERT(l->l_syncobj == &futex_syncobj);
   1160  1.12.4.1   thorpej 		KASSERT(l->l_wchan == (wchan_t)f);
   1161  1.12.4.1   thorpej 		KASSERT(l->l_futex == f);
   1162  1.12.4.1   thorpej 		KASSERT(l->l_sleepq == &f->fx_sqs[q]);
   1163  1.12.4.1   thorpej 		KASSERT(l->l_mutex == f->fx_sq_lock);
   1164  1.12.4.1   thorpej 
   1165  1.12.4.1   thorpej 		if ((l->l_futex_wakesel & bitset) == 0)
   1166  1.12.4.1   thorpej 			continue;
   1167  1.12.4.1   thorpej 
   1168  1.12.4.1   thorpej 		l->l_futex_wakesel = 0;
   1169  1.12.4.1   thorpej 		l->l_futex = NULL;
   1170  1.12.4.1   thorpej 		sleepq_remove(&f->fx_sqs[q], l);
   1171  1.12.4.1   thorpej 		f->fx_nwaiters[q]--;
   1172  1.12.4.1   thorpej 		nwoken++;
   1173  1.12.4.1   thorpej 		/* hold counts adjusted in bulk below */
   1174       1.1   thorpej 	}
   1175       1.1   thorpej 
   1176  1.12.4.1   thorpej 	if (nrequeue) {
   1177  1.12.4.1   thorpej 		KASSERT(f2 != NULL);
   1178       1.1   thorpej 		/* Move up to nrequeue waiters from f's queue to f2's queue. */
   1179  1.12.4.1   thorpej 		LIST_FOREACH_SAFE(l, &f->fx_sqs[q], l_sleepchain, l_next) {
   1180  1.12.4.1   thorpej 			if (nrequeued == nrequeue)
   1181       1.1   thorpej 				break;
   1182  1.12.4.1   thorpej 
   1183  1.12.4.1   thorpej 			KASSERT(l->l_syncobj == &futex_syncobj);
   1184  1.12.4.1   thorpej 			KASSERT(l->l_wchan == (wchan_t)f);
   1185  1.12.4.1   thorpej 			KASSERT(l->l_futex == f);
   1186  1.12.4.1   thorpej 			KASSERT(l->l_sleepq == &f->fx_sqs[q]);
   1187  1.12.4.1   thorpej 			KASSERT(l->l_mutex == f->fx_sq_lock);
   1188  1.12.4.1   thorpej 
   1189  1.12.4.1   thorpej 			if ((l->l_futex_wakesel & bitset) == 0)
   1190  1.12.4.1   thorpej 				continue;
   1191  1.12.4.1   thorpej 
   1192  1.12.4.1   thorpej 			l->l_futex = f2;
   1193  1.12.4.1   thorpej 			sleepq_transfer(l, &f->fx_sqs[q],
   1194  1.12.4.1   thorpej 			    &f2->fx_sqs[q2], f2, futex_wmesg,
   1195  1.12.4.1   thorpej 			    &futex_syncobj, f2->fx_sq_lock, true);
   1196  1.12.4.1   thorpej 			f->fx_nwaiters[q]--;
   1197  1.12.4.1   thorpej 			f2->fx_nwaiters[q2]++;
   1198  1.12.4.1   thorpej 			nrequeued++;
   1199  1.12.4.1   thorpej 			/* hold counts adjusted in bulk below */
   1200  1.12.4.1   thorpej 		}
   1201  1.12.4.1   thorpej 		if (nrequeued) {
   1202  1.12.4.1   thorpej 			/* XXX futex_hold() could theoretically fail here. */
   1203  1.12.4.1   thorpej 			int hold_error __diagused =
   1204  1.12.4.1   thorpej 			    futex_hold_count(f2, nrequeued);
   1205  1.12.4.1   thorpej 			KASSERT(hold_error == 0);
   1206       1.1   thorpej 		}
   1207       1.1   thorpej 	}
   1208       1.1   thorpej 
   1209  1.12.4.1   thorpej 	/*
   1210  1.12.4.1   thorpej 	 * Adjust the futex reference counts for the wakes and
   1211  1.12.4.1   thorpej 	 * requeues.
   1212  1.12.4.1   thorpej 	 */
   1213  1.12.4.1   thorpej 	KASSERT(nwoken + nrequeued <= LONG_MAX);
   1214  1.12.4.1   thorpej 	futex_rele_count_not_last(f, nwoken + nrequeued);
   1215  1.12.4.1   thorpej 
   1216  1.12.4.1   thorpej 	futex_sq_unlock2(f, f2);
   1217  1.12.4.1   thorpej 
   1218  1.12.4.1   thorpej 	/* Return the number of waiters woken and requeued.  */
   1219  1.12.4.1   thorpej 	return nwoken + nrequeued;
   1220       1.1   thorpej }
   1221       1.1   thorpej 
   1222       1.1   thorpej /*
   1223  1.12.4.1   thorpej  * futex_op_lock(f)
   1224       1.1   thorpej  *
   1225  1.12.4.1   thorpej  *	Acquire the op lock of f.  Pair with futex_op_unlock.  Do
   1226       1.1   thorpej  *	not use if caller needs to acquire two locks; use
   1227  1.12.4.1   thorpej  *	futex_op_lock2 instead.
   1228       1.1   thorpej  */
   1229       1.1   thorpej static void
   1230  1.12.4.1   thorpej futex_op_lock(struct futex *f)
   1231       1.1   thorpej {
   1232  1.12.4.1   thorpej 	mutex_enter(&f->fx_op_lock);
   1233       1.1   thorpej }
   1234       1.1   thorpej 
   1235       1.1   thorpej /*
   1236  1.12.4.1   thorpej  * futex_op_unlock(f)
   1237       1.1   thorpej  *
    1238       1.1   thorpej  *	Release the op lock of f.
   1239       1.1   thorpej  */
   1240       1.1   thorpej static void
   1241  1.12.4.1   thorpej futex_op_unlock(struct futex *f)
   1242       1.1   thorpej {
   1243  1.12.4.1   thorpej 	mutex_exit(&f->fx_op_lock);
   1244       1.1   thorpej }
   1245       1.1   thorpej 
   1246       1.1   thorpej /*
   1247  1.12.4.1   thorpej  * futex_op_lock2(f, f2)
   1248       1.1   thorpej  *
   1249  1.12.4.1   thorpej  *	Acquire the op locks of both f and f2, which may be null, or
   1250  1.12.4.1   thorpej  *	which may be the same.  If they are distinct, an arbitrary total
   1251  1.12.4.1   thorpej  *	order is chosen on the locks.
   1252       1.1   thorpej  *
   1253  1.12.4.1   thorpej  *	Callers should only ever acquire multiple op locks
   1254  1.12.4.1   thorpej  *	simultaneously using futex_op_lock2.
   1255       1.1   thorpej  */
   1256       1.1   thorpej static void
   1257  1.12.4.1   thorpej futex_op_lock2(struct futex *f, struct futex *f2)
   1258       1.1   thorpej {
   1259       1.1   thorpej 
   1260       1.1   thorpej 	/*
   1261       1.1   thorpej 	 * If both are null, do nothing; if one is null and the other
   1262       1.1   thorpej 	 * is not, lock the other and be done with it.
   1263       1.1   thorpej 	 */
   1264       1.1   thorpej 	if (f == NULL && f2 == NULL) {
   1265       1.1   thorpej 		return;
   1266       1.1   thorpej 	} else if (f == NULL) {
   1267  1.12.4.1   thorpej 		mutex_enter(&f2->fx_op_lock);
   1268       1.1   thorpej 		return;
   1269       1.1   thorpej 	} else if (f2 == NULL) {
   1270  1.12.4.1   thorpej 		mutex_enter(&f->fx_op_lock);
   1271       1.1   thorpej 		return;
   1272       1.1   thorpej 	}
   1273       1.1   thorpej 
   1274       1.1   thorpej 	/* If both futexes are the same, acquire only one. */
   1275       1.1   thorpej 	if (f == f2) {
   1276  1.12.4.1   thorpej 		mutex_enter(&f->fx_op_lock);
   1277       1.1   thorpej 		return;
   1278       1.1   thorpej 	}
   1279       1.1   thorpej 
   1280       1.1   thorpej 	/* Otherwise, use the ordering on the kva of the futex pointer.  */
   1281       1.1   thorpej 	if ((uintptr_t)f < (uintptr_t)f2) {
   1282  1.12.4.1   thorpej 		mutex_enter(&f->fx_op_lock);
   1283  1.12.4.1   thorpej 		mutex_enter(&f2->fx_op_lock);
   1284       1.1   thorpej 	} else {
   1285  1.12.4.1   thorpej 		mutex_enter(&f2->fx_op_lock);
   1286  1.12.4.1   thorpej 		mutex_enter(&f->fx_op_lock);
   1287       1.1   thorpej 	}
   1288       1.1   thorpej }
   1289       1.1   thorpej 
   1290       1.1   thorpej /*
   1291  1.12.4.1   thorpej  * futex_op_unlock2(f, f2)
   1292       1.1   thorpej  *
    1293       1.1   thorpej  *	Release the op locks of both f and f2, which may be null, or
    1294       1.1   thorpej  *	which may be the same.
   1295       1.1   thorpej  */
   1296       1.1   thorpej static void
   1297  1.12.4.1   thorpej futex_op_unlock2(struct futex *f, struct futex *f2)
   1298       1.1   thorpej {
   1299       1.1   thorpej 
   1300       1.1   thorpej 	/*
   1301       1.1   thorpej 	 * If both are null, do nothing; if one is null and the other
   1302       1.1   thorpej 	 * is not, unlock the other and be done with it.
   1303       1.1   thorpej 	 */
   1304       1.1   thorpej 	if (f == NULL && f2 == NULL) {
   1305       1.1   thorpej 		return;
   1306       1.1   thorpej 	} else if (f == NULL) {
   1307  1.12.4.1   thorpej 		mutex_exit(&f2->fx_op_lock);
   1308       1.1   thorpej 		return;
   1309       1.1   thorpej 	} else if (f2 == NULL) {
   1310  1.12.4.1   thorpej 		mutex_exit(&f->fx_op_lock);
   1311       1.1   thorpej 		return;
   1312       1.1   thorpej 	}
   1313       1.1   thorpej 
   1314       1.1   thorpej 	/* If both futexes are the same, release only one. */
   1315       1.1   thorpej 	if (f == f2) {
   1316  1.12.4.1   thorpej 		mutex_exit(&f->fx_op_lock);
   1317       1.1   thorpej 		return;
   1318       1.1   thorpej 	}
   1319       1.1   thorpej 
   1320       1.1   thorpej 	/* Otherwise, use the ordering on the kva of the futex pointer.  */
   1321       1.1   thorpej 	if ((uintptr_t)f < (uintptr_t)f2) {
   1322  1.12.4.1   thorpej 		mutex_exit(&f2->fx_op_lock);
   1323  1.12.4.1   thorpej 		mutex_exit(&f->fx_op_lock);
   1324       1.1   thorpej 	} else {
   1325  1.12.4.1   thorpej 		mutex_exit(&f->fx_op_lock);
   1326  1.12.4.1   thorpej 		mutex_exit(&f2->fx_op_lock);
   1327       1.1   thorpej 	}
   1328       1.1   thorpej }
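
/*
 * Illustrative sketch, not compiled: the pairing expected of callers
 * that operate on two futexes at once, as the requeue and wake-op
 * paths below do.  futex_op_lock2()/futex_op_unlock2() tolerate NULL
 * and identical arguments, so callers need not special-case those.
 *
 *	futex_op_lock2(f, f2);
 *	...check the futex word, wake or requeue waiters...
 *	futex_op_unlock2(f, f2);
 *
 * Taking the two op locks individually with futex_op_lock() would
 * bypass the kva ordering chosen by futex_op_lock2() and risk
 * deadlock, which is why its comment insists on it for multiple locks.
 */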
   1329       1.1   thorpej 
   1330       1.1   thorpej /*
   1331       1.1   thorpej  * futex_func_wait(uaddr, val, val3, timeout, clkid, clkflags, retval)
   1332       1.1   thorpej  *
   1333  1.12.4.1   thorpej  *	Implement futex(FUTEX_WAIT) and futex(FUTEX_WAIT_BITSET).
   1334       1.1   thorpej  */
   1335       1.1   thorpej static int
   1336       1.1   thorpej futex_func_wait(bool shared, int *uaddr, int val, int val3,
   1337      1.11  riastrad     const struct timespec *timeout, clockid_t clkid, int clkflags,
   1338       1.1   thorpej     register_t *retval)
   1339       1.1   thorpej {
   1340       1.1   thorpej 	struct futex *f;
   1341      1.11  riastrad 	struct timespec ts;
   1342      1.11  riastrad 	const struct timespec *deadline;
   1343       1.1   thorpej 	int error;
   1344       1.1   thorpej 
   1345       1.6  riastrad 	/*
   1346       1.6  riastrad 	 * If there's nothing to wait for, and nobody will ever wake
   1347       1.6  riastrad 	 * us, then don't set anything up to wait -- just stop here.
   1348       1.6  riastrad 	 */
   1349       1.6  riastrad 	if (val3 == 0)
   1350       1.7  riastrad 		return EINVAL;
   1351       1.6  riastrad 
   1352       1.1   thorpej 	/* Optimistically test before anything else.  */
   1353       1.1   thorpej 	if (!futex_test(uaddr, val))
   1354       1.1   thorpej 		return EAGAIN;
   1355       1.1   thorpej 
   1356      1.11  riastrad 	/* Determine a deadline on the specified clock.  */
   1357  1.12.4.1   thorpej 	if (timeout == NULL) {
   1358  1.12.4.1   thorpej 		deadline = timeout;
   1359  1.12.4.1   thorpej 	} else if ((clkflags & TIMER_ABSTIME) == TIMER_ABSTIME) {
   1360  1.12.4.1   thorpej 		/* Sanity-check the deadline. */
   1361  1.12.4.1   thorpej 		if (timeout->tv_sec < 0 ||
   1362  1.12.4.1   thorpej 		    timeout->tv_nsec < 0 ||
   1363  1.12.4.1   thorpej 		    timeout->tv_nsec >= 1000000000L) {
   1364  1.12.4.1   thorpej 			return EINVAL;
   1365  1.12.4.1   thorpej 		}
   1366      1.11  riastrad 		deadline = timeout;
   1367      1.11  riastrad 	} else {
   1368  1.12.4.1   thorpej 		struct timespec interval = *timeout;
   1369  1.12.4.1   thorpej 
   1370  1.12.4.1   thorpej 		error = itimespecfix(&interval);
   1371  1.12.4.1   thorpej 		if (error)
   1372  1.12.4.1   thorpej 			return error;
   1373      1.11  riastrad 		error = clock_gettime1(clkid, &ts);
   1374      1.11  riastrad 		if (error)
   1375      1.11  riastrad 			return error;
   1376  1.12.4.1   thorpej 		timespecadd(&ts, &interval, &ts);
   1377      1.11  riastrad 		deadline = &ts;
   1378      1.11  riastrad 	}
   1379      1.11  riastrad 
   1380       1.1   thorpej 	/* Get the futex, creating it if necessary.  */
   1381  1.12.4.1   thorpej 	error = futex_lookup_create(uaddr, shared, FUTEX_CLASS_NORMAL, &f);
   1382       1.1   thorpej 	if (error)
   1383       1.1   thorpej 		return error;
   1384       1.1   thorpej 	KASSERT(f);
   1385       1.1   thorpej 
   1386       1.1   thorpej 	/*
   1387  1.12.4.1   thorpej 	 * Under the op lock, check the value again: if it has
   1388       1.1   thorpej 	 * already changed, EAGAIN; otherwise enqueue the waiter.
   1389       1.1   thorpej 	 * Since FUTEX_WAKE will use the same lock and be done after
   1390       1.1   thorpej 	 * modifying the value, the order in which we check and enqueue
   1391       1.1   thorpej 	 * is immaterial.
   1392       1.1   thorpej 	 */
   1393  1.12.4.1   thorpej 	futex_op_lock(f);
   1394       1.1   thorpej 	if (!futex_test(uaddr, val)) {
   1395  1.12.4.1   thorpej 		futex_op_unlock(f);
   1396       1.1   thorpej 		error = EAGAIN;
   1397       1.1   thorpej 		goto out;
   1398       1.1   thorpej 	}
   1399  1.12.4.1   thorpej 
   1400  1.12.4.1   thorpej 	/*
   1401  1.12.4.1   thorpej 	 * Now wait.  futex_wait() will drop our op lock once we
   1402  1.12.4.1   thorpej 	 * are entered into the sleep queue, thus ensuring atomicity
   1403  1.12.4.1   thorpej 	 * of wakes with respect to waits.
   1404  1.12.4.1   thorpej 	 */
   1405  1.12.4.1   thorpej 	error = futex_wait(f, FUTEX_WRITERQ, deadline, clkid, val3);
   1406       1.1   thorpej 
   1407       1.1   thorpej 	/*
   1408       1.1   thorpej 	 * We cannot drop our reference to the futex here, because
   1409       1.1   thorpej 	 * we might be enqueued on a different one when we are awakened.
   1410  1.12.4.1   thorpej 	 * The references will be managed on our behalf in the requeue,
   1411  1.12.4.1   thorpej 	 * wake, and error cases.
   1412       1.1   thorpej 	 */
   1413       1.1   thorpej 	f = NULL;
   1414       1.1   thorpej 
   1415  1.12.4.1   thorpej 	if (__predict_true(error == 0)) {
   1416  1.12.4.1   thorpej 		/* Return 0 on success, error on failure. */
   1417  1.12.4.1   thorpej 		*retval = 0;
   1418  1.12.4.1   thorpej 	}
   1419       1.1   thorpej 
   1420       1.1   thorpej out:	if (f != NULL)
   1421       1.5  riastrad 		futex_rele(f);
   1422       1.1   thorpej 	return error;
   1423       1.1   thorpej }
   1424       1.1   thorpej 
   1425       1.1   thorpej /*
   1426       1.1   thorpej  * futex_func_wake(uaddr, val, val3, retval)
   1427       1.1   thorpej  *
   1428       1.1   thorpej  *	Implement futex(FUTEX_WAKE) and futex(FUTEX_WAKE_BITSET).
   1429       1.1   thorpej  */
   1430       1.1   thorpej static int
   1431       1.1   thorpej futex_func_wake(bool shared, int *uaddr, int val, int val3, register_t *retval)
   1432       1.1   thorpej {
   1433       1.1   thorpej 	struct futex *f;
   1434       1.1   thorpej 	unsigned int nwoken = 0;
   1435       1.1   thorpej 	int error = 0;
   1436       1.1   thorpej 
   1437       1.1   thorpej 	/* Reject negative number of wakeups.  */
   1438       1.1   thorpej 	if (val < 0) {
   1439       1.1   thorpej 		error = EINVAL;
   1440       1.1   thorpej 		goto out;
   1441       1.1   thorpej 	}
   1442       1.1   thorpej 
   1443       1.1   thorpej 	/* Look up the futex, if any.  */
   1444  1.12.4.1   thorpej 	error = futex_lookup(uaddr, shared, FUTEX_CLASS_NORMAL, &f);
   1445       1.1   thorpej 	if (error)
   1446       1.1   thorpej 		goto out;
   1447       1.1   thorpej 
   1448       1.1   thorpej 	/* If there's no futex, there are no waiters to wake.  */
   1449       1.1   thorpej 	if (f == NULL)
   1450       1.1   thorpej 		goto out;
   1451       1.1   thorpej 
   1452       1.1   thorpej 	/*
   1453  1.12.4.1   thorpej 	 * Under f's op lock, wake the waiters and remember the
   1454       1.1   thorpej 	 * number woken.
   1455       1.1   thorpej 	 */
   1456  1.12.4.1   thorpej 	futex_op_lock(f);
   1457  1.12.4.1   thorpej 	nwoken = futex_wake(f, FUTEX_WRITERQ, val,
   1458  1.12.4.1   thorpej 			    NULL, FUTEX_WRITERQ, 0, val3);
   1459  1.12.4.1   thorpej 	futex_op_unlock(f);
   1460       1.1   thorpej 
   1461       1.1   thorpej 	/* Release the futex.  */
   1462       1.5  riastrad 	futex_rele(f);
   1463       1.1   thorpej 
   1464       1.1   thorpej out:
   1465       1.1   thorpej 	/* Return the number of waiters woken.  */
   1466       1.1   thorpej 	*retval = nwoken;
   1467       1.1   thorpej 
   1468       1.1   thorpej 	/* Success!  */
   1469       1.1   thorpej 	return error;
   1470       1.1   thorpej }
   1471       1.1   thorpej 
   1472       1.1   thorpej /*
   1473       1.1   thorpej  * futex_func_requeue(op, uaddr, val, uaddr2, val2, val3, retval)
   1474       1.1   thorpej  *
   1475       1.1   thorpej  *	Implement futex(FUTEX_REQUEUE) and futex(FUTEX_CMP_REQUEUE).
   1476       1.1   thorpej  */
   1477       1.1   thorpej static int
   1478       1.1   thorpej futex_func_requeue(bool shared, int op, int *uaddr, int val, int *uaddr2,
   1479       1.1   thorpej     int val2, int val3, register_t *retval)
   1480       1.1   thorpej {
   1481       1.1   thorpej 	struct futex *f = NULL, *f2 = NULL;
   1482       1.1   thorpej 	unsigned nwoken = 0;	/* default to zero woken on early return */
   1483       1.1   thorpej 	int error;
   1484       1.1   thorpej 
   1485       1.1   thorpej 	/* Reject negative number of wakeups or requeues. */
   1486       1.1   thorpej 	if (val < 0 || val2 < 0) {
   1487       1.1   thorpej 		error = EINVAL;
   1488       1.1   thorpej 		goto out;
   1489       1.1   thorpej 	}
   1490       1.1   thorpej 
   1491       1.1   thorpej 	/* Look up the source futex, if any. */
   1492  1.12.4.1   thorpej 	error = futex_lookup(uaddr, shared, FUTEX_CLASS_NORMAL, &f);
   1493       1.1   thorpej 	if (error)
   1494       1.1   thorpej 		goto out;
   1495       1.1   thorpej 
   1496       1.1   thorpej 	/* If there is none, nothing to do. */
   1497       1.1   thorpej 	if (f == NULL)
   1498       1.1   thorpej 		goto out;
   1499       1.1   thorpej 
   1500       1.1   thorpej 	/*
   1501       1.1   thorpej 	 * We may need to create the destination futex because it's
   1502       1.1   thorpej 	 * entirely possible it does not currently have any waiters.
   1503       1.1   thorpej 	 */
   1504  1.12.4.1   thorpej 	error = futex_lookup_create(uaddr2, shared, FUTEX_CLASS_NORMAL, &f2);
   1505       1.1   thorpej 	if (error)
   1506       1.1   thorpej 		goto out;
   1507       1.1   thorpej 
   1508       1.1   thorpej 	/*
   1509  1.12.4.1   thorpej 	 * Under the futexes' op locks, check the value; if
   1510       1.1   thorpej 	 * unchanged from val3, wake the waiters.
   1511       1.1   thorpej 	 */
   1512  1.12.4.1   thorpej 	futex_op_lock2(f, f2);
   1513       1.1   thorpej 	if (op == FUTEX_CMP_REQUEUE && !futex_test(uaddr, val3)) {
   1514       1.1   thorpej 		error = EAGAIN;
   1515       1.1   thorpej 	} else {
   1516       1.1   thorpej 		error = 0;
   1517  1.12.4.1   thorpej 		nwoken = futex_wake(f, FUTEX_WRITERQ, val,
   1518  1.12.4.1   thorpej 				    f2, FUTEX_WRITERQ, val2,
   1519  1.12.4.1   thorpej 				    FUTEX_BITSET_MATCH_ANY);
   1520       1.1   thorpej 	}
   1521  1.12.4.1   thorpej 	futex_op_unlock2(f, f2);
   1522       1.1   thorpej 
   1523       1.1   thorpej out:
   1524       1.1   thorpej 	/* Return the number of waiters woken.  */
   1525       1.1   thorpej 	*retval = nwoken;
   1526       1.1   thorpej 
   1527       1.1   thorpej 	/* Release the futexes if we got them.  */
   1528       1.1   thorpej 	if (f2)
   1529       1.5  riastrad 		futex_rele(f2);
   1530       1.1   thorpej 	if (f)
   1531       1.5  riastrad 		futex_rele(f);
   1532       1.1   thorpej 	return error;
   1533       1.1   thorpej }
   1534       1.1   thorpej 
   1535       1.1   thorpej /*
   1536       1.1   thorpej  * futex_validate_op_cmp(val3)
   1537       1.1   thorpej  *
   1538       1.1   thorpej  *	Validate an op/cmp argument for FUTEX_WAKE_OP.
   1539       1.1   thorpej  */
   1540       1.1   thorpej static int
   1541       1.1   thorpej futex_validate_op_cmp(int val3)
   1542       1.1   thorpej {
   1543       1.1   thorpej 	int op = __SHIFTOUT(val3, FUTEX_OP_OP_MASK);
   1544       1.1   thorpej 	int cmp = __SHIFTOUT(val3, FUTEX_OP_CMP_MASK);
   1545       1.1   thorpej 
   1546       1.1   thorpej 	if (op & FUTEX_OP_OPARG_SHIFT) {
   1547       1.1   thorpej 		int oparg = __SHIFTOUT(val3, FUTEX_OP_OPARG_MASK);
   1548       1.1   thorpej 		if (oparg < 0)
   1549       1.1   thorpej 			return EINVAL;
   1550       1.1   thorpej 		if (oparg >= 32)
   1551       1.1   thorpej 			return EINVAL;
   1552       1.1   thorpej 		op &= ~FUTEX_OP_OPARG_SHIFT;
   1553       1.1   thorpej 	}
   1554       1.1   thorpej 
   1555       1.1   thorpej 	switch (op) {
   1556       1.1   thorpej 	case FUTEX_OP_SET:
   1557       1.1   thorpej 	case FUTEX_OP_ADD:
   1558       1.1   thorpej 	case FUTEX_OP_OR:
   1559       1.1   thorpej 	case FUTEX_OP_ANDN:
   1560       1.1   thorpej 	case FUTEX_OP_XOR:
   1561       1.1   thorpej 		break;
   1562       1.1   thorpej 	default:
   1563       1.1   thorpej 		return EINVAL;
   1564       1.1   thorpej 	}
   1565       1.1   thorpej 
   1566       1.1   thorpej 	switch (cmp) {
   1567       1.1   thorpej 	case FUTEX_OP_CMP_EQ:
   1568       1.1   thorpej 	case FUTEX_OP_CMP_NE:
   1569       1.1   thorpej 	case FUTEX_OP_CMP_LT:
   1570       1.1   thorpej 	case FUTEX_OP_CMP_LE:
   1571       1.1   thorpej 	case FUTEX_OP_CMP_GT:
   1572       1.1   thorpej 	case FUTEX_OP_CMP_GE:
   1573       1.1   thorpej 		break;
   1574       1.1   thorpej 	default:
   1575       1.1   thorpej 		return EINVAL;
   1576       1.1   thorpej 	}
   1577       1.1   thorpej 
   1578       1.1   thorpej 	return 0;
   1579       1.1   thorpej }
   1580       1.1   thorpej 
   1581       1.1   thorpej /*
   1582       1.1   thorpej  * futex_compute_op(oldval, val3)
   1583       1.1   thorpej  *
    1584       1.1   thorpej  *	Apply a FUTEX_WAKE_OP operation to oldval.
   1585       1.1   thorpej  */
   1586       1.1   thorpej static int
   1587       1.1   thorpej futex_compute_op(int oldval, int val3)
   1588       1.1   thorpej {
   1589       1.1   thorpej 	int op = __SHIFTOUT(val3, FUTEX_OP_OP_MASK);
   1590       1.1   thorpej 	int oparg = __SHIFTOUT(val3, FUTEX_OP_OPARG_MASK);
   1591       1.1   thorpej 
   1592       1.1   thorpej 	if (op & FUTEX_OP_OPARG_SHIFT) {
   1593       1.1   thorpej 		KASSERT(oparg >= 0);
   1594       1.1   thorpej 		KASSERT(oparg < 32);
   1595       1.1   thorpej 		oparg = 1u << oparg;
   1596       1.1   thorpej 		op &= ~FUTEX_OP_OPARG_SHIFT;
   1597       1.1   thorpej 	}
   1598       1.1   thorpej 
   1599       1.1   thorpej 	switch (op) {
   1600       1.1   thorpej 	case FUTEX_OP_SET:
   1601       1.1   thorpej 		return oparg;
   1602       1.1   thorpej 
   1603       1.1   thorpej 	case FUTEX_OP_ADD:
   1604       1.1   thorpej 		/*
   1605       1.1   thorpej 		 * Avoid signed arithmetic overflow by doing
   1606       1.1   thorpej 		 * arithmetic unsigned and converting back to signed
   1607       1.1   thorpej 		 * at the end.
   1608       1.1   thorpej 		 */
   1609       1.1   thorpej 		return (int)((unsigned)oldval + (unsigned)oparg);
   1610       1.1   thorpej 
   1611       1.1   thorpej 	case FUTEX_OP_OR:
   1612       1.1   thorpej 		return oldval | oparg;
   1613       1.1   thorpej 
   1614       1.1   thorpej 	case FUTEX_OP_ANDN:
   1615       1.1   thorpej 		return oldval & ~oparg;
   1616       1.1   thorpej 
   1617       1.1   thorpej 	case FUTEX_OP_XOR:
   1618       1.1   thorpej 		return oldval ^ oparg;
   1619       1.1   thorpej 
   1620       1.1   thorpej 	default:
   1621       1.1   thorpej 		panic("invalid futex op");
   1622       1.1   thorpej 	}
   1623       1.1   thorpej }
   1624       1.1   thorpej 
   1625       1.1   thorpej /*
   1626       1.1   thorpej  * futex_compute_cmp(oldval, val3)
   1627       1.1   thorpej  *
    1628       1.1   thorpej  *	Apply a FUTEX_WAKE_OP comparison to oldval.
   1629       1.1   thorpej  */
   1630       1.1   thorpej static bool
   1631       1.1   thorpej futex_compute_cmp(int oldval, int val3)
   1632       1.1   thorpej {
   1633       1.1   thorpej 	int cmp = __SHIFTOUT(val3, FUTEX_OP_CMP_MASK);
   1634       1.1   thorpej 	int cmparg = __SHIFTOUT(val3, FUTEX_OP_CMPARG_MASK);
   1635       1.1   thorpej 
   1636       1.1   thorpej 	switch (cmp) {
   1637       1.1   thorpej 	case FUTEX_OP_CMP_EQ:
   1638       1.1   thorpej 		return (oldval == cmparg);
   1639       1.1   thorpej 
   1640       1.1   thorpej 	case FUTEX_OP_CMP_NE:
   1641       1.1   thorpej 		return (oldval != cmparg);
   1642       1.1   thorpej 
   1643       1.1   thorpej 	case FUTEX_OP_CMP_LT:
   1644       1.1   thorpej 		return (oldval < cmparg);
   1645       1.1   thorpej 
   1646       1.1   thorpej 	case FUTEX_OP_CMP_LE:
   1647       1.1   thorpej 		return (oldval <= cmparg);
   1648       1.1   thorpej 
   1649       1.1   thorpej 	case FUTEX_OP_CMP_GT:
   1650       1.1   thorpej 		return (oldval > cmparg);
   1651       1.1   thorpej 
   1652       1.1   thorpej 	case FUTEX_OP_CMP_GE:
   1653       1.1   thorpej 		return (oldval >= cmparg);
   1654       1.1   thorpej 
   1655       1.1   thorpej 	default:
   1656       1.1   thorpej 		panic("invalid futex cmp operation");
   1657       1.1   thorpej 	}
   1658       1.1   thorpej }
   1659       1.1   thorpej 
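/*
 * Illustrative sketch, not compiled: one way user space might pack a
 * FUTEX_WAKE_OP argument, written with the same masks decoded by
 * futex_compute_op()/futex_compute_cmp() above.  __SHIFTIN() is used
 * here only as the mirror of the __SHIFTOUT() calls in those
 * functions; an application would use whatever encoding helpers its
 * futex header provides.
 *
 *	// "*uaddr2 &= ~1; then wake waiters on uaddr2 if its old
 *	// value was != 0"
 *	const int val3 =
 *	    __SHIFTIN(FUTEX_OP_ANDN, FUTEX_OP_OP_MASK) |
 *	    __SHIFTIN(1, FUTEX_OP_OPARG_MASK) |
 *	    __SHIFTIN(FUTEX_OP_CMP_NE, FUTEX_OP_CMP_MASK) |
 *	    __SHIFTIN(0, FUTEX_OP_CMPARG_MASK);
 */
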
   1660       1.1   thorpej /*
   1661       1.1   thorpej  * futex_func_wake_op(uaddr, val, uaddr2, val2, val3, retval)
   1662       1.1   thorpej  *
   1663       1.1   thorpej  *	Implement futex(FUTEX_WAKE_OP).
   1664       1.1   thorpej  */
   1665       1.1   thorpej static int
   1666       1.1   thorpej futex_func_wake_op(bool shared, int *uaddr, int val, int *uaddr2, int val2,
   1667       1.1   thorpej     int val3, register_t *retval)
   1668       1.1   thorpej {
   1669       1.1   thorpej 	struct futex *f = NULL, *f2 = NULL;
   1670       1.1   thorpej 	int oldval, newval, actual;
   1671       1.1   thorpej 	unsigned nwoken = 0;
   1672       1.1   thorpej 	int error;
   1673       1.1   thorpej 
   1674       1.1   thorpej 	/* Reject negative number of wakeups.  */
   1675       1.1   thorpej 	if (val < 0 || val2 < 0) {
   1676       1.1   thorpej 		error = EINVAL;
   1677       1.1   thorpej 		goto out;
   1678       1.1   thorpej 	}
   1679       1.1   thorpej 
   1680       1.1   thorpej 	/* Reject invalid operations before we start doing things.  */
   1681       1.1   thorpej 	if ((error = futex_validate_op_cmp(val3)) != 0)
   1682       1.1   thorpej 		goto out;
   1683       1.1   thorpej 
   1684       1.1   thorpej 	/* Look up the first futex, if any.  */
   1685  1.12.4.1   thorpej 	error = futex_lookup(uaddr, shared, FUTEX_CLASS_NORMAL, &f);
   1686       1.1   thorpej 	if (error)
   1687       1.1   thorpej 		goto out;
   1688       1.1   thorpej 
   1689       1.1   thorpej 	/* Look up the second futex, if any.  */
   1690  1.12.4.1   thorpej 	error = futex_lookup(uaddr2, shared, FUTEX_CLASS_NORMAL, &f2);
   1691       1.1   thorpej 	if (error)
   1692       1.1   thorpej 		goto out;
   1693       1.1   thorpej 
   1694       1.1   thorpej 	/*
   1695  1.12.4.1   thorpej 	 * Under the op locks:
   1696       1.1   thorpej 	 *
   1697       1.1   thorpej 	 * 1. Read/modify/write: *uaddr2 op= oparg.
   1698       1.1   thorpej 	 * 2. Unconditionally wake uaddr.
    1699  1.12.4.1   thorpej 	 * 3. Conditionally wake uaddr2, if oldval satisfies the cmp in val3.
   1700       1.1   thorpej 	 */
   1701  1.12.4.1   thorpej 	futex_op_lock2(f, f2);
   1702       1.1   thorpej 	do {
   1703       1.1   thorpej 		error = futex_load(uaddr2, &oldval);
   1704       1.1   thorpej 		if (error)
   1705       1.1   thorpej 			goto out_unlock;
   1706       1.1   thorpej 		newval = futex_compute_op(oldval, val3);
   1707       1.1   thorpej 		error = ucas_int(uaddr2, oldval, newval, &actual);
   1708       1.1   thorpej 		if (error)
   1709       1.1   thorpej 			goto out_unlock;
   1710       1.1   thorpej 	} while (actual != oldval);
   1711  1.12.4.1   thorpej 	nwoken = (f ? futex_wake(f, FUTEX_WRITERQ, val,
   1712  1.12.4.1   thorpej 				 NULL, FUTEX_WRITERQ, 0,
   1713  1.12.4.1   thorpej 				 FUTEX_BITSET_MATCH_ANY) : 0);
   1714       1.1   thorpej 	if (f2 && futex_compute_cmp(oldval, val3))
   1715  1.12.4.1   thorpej 		nwoken += futex_wake(f2, FUTEX_WRITERQ, val2,
   1716  1.12.4.1   thorpej 				     NULL, FUTEX_WRITERQ, 0,
   1717  1.12.4.1   thorpej 				     FUTEX_BITSET_MATCH_ANY);
   1718       1.1   thorpej 
   1719       1.1   thorpej 	/* Success! */
   1720       1.1   thorpej 	error = 0;
   1721       1.1   thorpej out_unlock:
   1722  1.12.4.1   thorpej 	futex_op_unlock2(f, f2);
   1723       1.1   thorpej 
   1724       1.1   thorpej out:
   1725       1.1   thorpej 	/* Return the number of waiters woken. */
   1726       1.1   thorpej 	*retval = nwoken;
   1727       1.1   thorpej 
   1728       1.1   thorpej 	/* Release the futexes, if we got them. */
   1729       1.1   thorpej 	if (f2)
   1730       1.5  riastrad 		futex_rele(f2);
   1731       1.1   thorpej 	if (f)
   1732       1.5  riastrad 		futex_rele(f);
   1733       1.1   thorpej 	return error;
   1734       1.1   thorpej }
   1735       1.1   thorpej 
   1736       1.1   thorpej /*
   1737       1.1   thorpej  * do_futex(uaddr, op, val, timeout, uaddr2, val2, val3)
   1738       1.1   thorpej  *
   1739       1.1   thorpej  *	Implement the futex system call with all the parameters
   1740       1.1   thorpej  *	parsed out.
   1741       1.1   thorpej  */
   1742       1.1   thorpej int
   1743      1.11  riastrad do_futex(int *uaddr, int op, int val, const struct timespec *timeout,
   1744      1.11  riastrad     int *uaddr2, int val2, int val3, register_t *retval)
   1745       1.1   thorpej {
   1746       1.1   thorpej 	const bool shared = (op & FUTEX_PRIVATE_FLAG) ? false : true;
   1747       1.1   thorpej 	const clockid_t clkid = (op & FUTEX_CLOCK_REALTIME) ? CLOCK_REALTIME
   1748       1.1   thorpej 							    : CLOCK_MONOTONIC;
   1749  1.12.4.1   thorpej 	int error;
   1750       1.1   thorpej 
   1751       1.1   thorpej 	op &= FUTEX_CMD_MASK;
   1752       1.1   thorpej 
   1753       1.1   thorpej 	switch (op) {
   1754       1.1   thorpej 	case FUTEX_WAIT:
   1755  1.12.4.1   thorpej 		SDT_PROBE6(futex, func, wait, entry,
   1756  1.12.4.1   thorpej 		    uaddr, val, FUTEX_BITSET_MATCH_ANY, timeout,
   1757  1.12.4.1   thorpej 		    TIMER_RELTIME, op & ~FUTEX_CMD_MASK);
   1758  1.12.4.1   thorpej 		error = futex_func_wait(shared, uaddr, val,
   1759       1.1   thorpej 		    FUTEX_BITSET_MATCH_ANY, timeout, clkid, TIMER_RELTIME,
   1760       1.1   thorpej 		    retval);
   1761  1.12.4.1   thorpej 		SDT_PROBE1(futex, func, wait, exit, error);
   1762  1.12.4.1   thorpej 		break;
   1763  1.12.4.1   thorpej 
   1764  1.12.4.1   thorpej 	case FUTEX_WAIT_BITSET:
   1765  1.12.4.1   thorpej 		SDT_PROBE6(futex, func, wait, entry,
   1766  1.12.4.1   thorpej 		    uaddr, val, val3, timeout,
   1767  1.12.4.1   thorpej 		    TIMER_ABSTIME, op & ~FUTEX_CMD_MASK);
   1768  1.12.4.1   thorpej 		error = futex_func_wait(shared, uaddr, val, val3, timeout,
   1769  1.12.4.1   thorpej 		    clkid, TIMER_ABSTIME, retval);
   1770  1.12.4.1   thorpej 		SDT_PROBE1(futex, func, wait, exit, error);
   1771  1.12.4.1   thorpej 		break;
   1772       1.1   thorpej 
   1773       1.1   thorpej 	case FUTEX_WAKE:
   1774  1.12.4.1   thorpej 		SDT_PROBE4(futex, func, wake, entry,
   1775  1.12.4.1   thorpej 		    uaddr, val, FUTEX_BITSET_MATCH_ANY, op & ~FUTEX_CMD_MASK);
   1776  1.12.4.1   thorpej 		error = futex_func_wake(shared, uaddr, val,
   1777  1.12.4.1   thorpej 		    FUTEX_BITSET_MATCH_ANY, retval);
   1778  1.12.4.1   thorpej 		SDT_PROBE2(futex, func, wake, exit, error, *retval);
   1779  1.12.4.1   thorpej 		break;
   1780  1.12.4.1   thorpej 
   1781       1.1   thorpej 	case FUTEX_WAKE_BITSET:
   1782  1.12.4.1   thorpej 		SDT_PROBE4(futex, func, wake, entry,
   1783  1.12.4.1   thorpej 		    uaddr, val, val3, op & ~FUTEX_CMD_MASK);
   1784  1.12.4.1   thorpej 		error = futex_func_wake(shared, uaddr, val, val3, retval);
   1785  1.12.4.1   thorpej 		SDT_PROBE2(futex, func, wake, exit, error, *retval);
   1786  1.12.4.1   thorpej 		break;
   1787       1.1   thorpej 
   1788       1.1   thorpej 	case FUTEX_REQUEUE:
   1789  1.12.4.1   thorpej 		SDT_PROBE5(futex, func, requeue, entry,
   1790  1.12.4.1   thorpej 		    uaddr, val, uaddr2, val2, op & ~FUTEX_CMD_MASK);
   1791  1.12.4.1   thorpej 		error = futex_func_requeue(shared, op, uaddr, val, uaddr2,
   1792  1.12.4.1   thorpej 		    val2, 0, retval);
   1793  1.12.4.1   thorpej 		SDT_PROBE2(futex, func, requeue, exit, error, *retval);
   1794  1.12.4.1   thorpej 		break;
   1795  1.12.4.1   thorpej 
   1796       1.1   thorpej 	case FUTEX_CMP_REQUEUE:
   1797  1.12.4.1   thorpej 		SDT_PROBE6(futex, func, cmp_requeue, entry,
   1798  1.12.4.1   thorpej 		    uaddr, val, uaddr2, val2, val3, op & ~FUTEX_CMD_MASK);
   1799  1.12.4.1   thorpej 		error = futex_func_requeue(shared, op, uaddr, val, uaddr2,
   1800       1.1   thorpej 		    val2, val3, retval);
   1801  1.12.4.1   thorpej 		SDT_PROBE2(futex, func, cmp_requeue, exit, error, *retval);
   1802  1.12.4.1   thorpej 		break;
   1803       1.1   thorpej 
   1804       1.1   thorpej 	case FUTEX_WAKE_OP:
   1805  1.12.4.1   thorpej 		SDT_PROBE6(futex, func, wake_op, entry,
   1806  1.12.4.1   thorpej 		    uaddr, val, uaddr2, val2, val3, op & ~FUTEX_CMD_MASK);
   1807  1.12.4.1   thorpej 		error = futex_func_wake_op(shared, uaddr, val, uaddr2, val2,
   1808       1.1   thorpej 		    val3, retval);
   1809  1.12.4.1   thorpej 		SDT_PROBE2(futex, func, wake_op, exit, error, *retval);
   1810  1.12.4.1   thorpej 		break;
   1811       1.1   thorpej 
   1812       1.1   thorpej 	case FUTEX_FD:
   1813       1.1   thorpej 	default:
   1814  1.12.4.1   thorpej 		error = ENOSYS;
   1815  1.12.4.1   thorpej 		break;
   1816       1.1   thorpej 	}
   1817  1.12.4.1   thorpej 
   1818  1.12.4.1   thorpej 	return error;
   1819       1.1   thorpej }
   1820       1.1   thorpej 
   1821       1.1   thorpej /*
   1822       1.1   thorpej  * sys___futex(l, uap, retval)
   1823       1.1   thorpej  *
   1824       1.1   thorpej  *	__futex(2) system call: generic futex operations.
   1825       1.1   thorpej  */
   1826       1.1   thorpej int
   1827       1.1   thorpej sys___futex(struct lwp *l, const struct sys___futex_args *uap,
   1828       1.1   thorpej     register_t *retval)
   1829       1.1   thorpej {
   1830       1.1   thorpej 	/* {
   1831       1.1   thorpej 		syscallarg(int *) uaddr;
   1832       1.1   thorpej 		syscallarg(int) op;
   1833       1.1   thorpej 		syscallarg(int) val;
   1834       1.1   thorpej 		syscallarg(const struct timespec *) timeout;
   1835       1.1   thorpej 		syscallarg(int *) uaddr2;
   1836       1.1   thorpej 		syscallarg(int) val2;
   1837       1.1   thorpej 		syscallarg(int) val3;
   1838       1.1   thorpej 	} */
   1839       1.1   thorpej 	struct timespec ts, *tsp;
   1840       1.1   thorpej 	int error;
   1841       1.1   thorpej 
   1842       1.1   thorpej 	/*
   1843       1.1   thorpej 	 * Copy in the timeout argument, if specified.
   1844       1.1   thorpej 	 */
   1845       1.1   thorpej 	if (SCARG(uap, timeout)) {
   1846       1.1   thorpej 		error = copyin(SCARG(uap, timeout), &ts, sizeof(ts));
   1847       1.1   thorpej 		if (error)
   1848       1.1   thorpej 			return error;
   1849       1.1   thorpej 		tsp = &ts;
   1850       1.1   thorpej 	} else {
   1851       1.1   thorpej 		tsp = NULL;
   1852       1.1   thorpej 	}
   1853       1.1   thorpej 
   1854       1.1   thorpej 	return do_futex(SCARG(uap, uaddr), SCARG(uap, op), SCARG(uap, val),
   1855       1.1   thorpej 	    tsp, SCARG(uap, uaddr2), SCARG(uap, val2), SCARG(uap, val3),
   1856       1.1   thorpej 	    retval);
   1857       1.1   thorpej }
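
/*
 * Illustrative sketch, not compiled: waking one waiter on a
 * process-private futex word through the raw system call, assuming a
 * libc stub that takes its arguments in the same order as
 * sys___futex() above.  The number of waiters woken comes back as the
 * syscall's return value (retval).
 *
 *	int word;	// the futex word, shared among threads
 *	...
 *	__futex(&word, FUTEX_WAKE | FUTEX_PRIVATE_FLAG, 1,
 *	    NULL, NULL, 0, 0);
 */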
   1858       1.1   thorpej 
   1859       1.1   thorpej /*
   1860       1.1   thorpej  * sys___futex_set_robust_list(l, uap, retval)
   1861       1.1   thorpej  *
   1862       1.1   thorpej  *	__futex_set_robust_list(2) system call for robust futexes.
   1863       1.1   thorpej  */
   1864       1.1   thorpej int
   1865       1.1   thorpej sys___futex_set_robust_list(struct lwp *l,
   1866       1.1   thorpej     const struct sys___futex_set_robust_list_args *uap, register_t *retval)
   1867       1.1   thorpej {
   1868       1.1   thorpej 	/* {
   1869       1.1   thorpej 		syscallarg(void *) head;
   1870       1.1   thorpej 		syscallarg(size_t) len;
   1871       1.1   thorpej 	} */
   1872       1.1   thorpej 	void *head = SCARG(uap, head);
   1873       1.1   thorpej 
   1874       1.1   thorpej 	if (SCARG(uap, len) != _FUTEX_ROBUST_HEAD_SIZE)
   1875       1.1   thorpej 		return EINVAL;
   1876       1.1   thorpej 	if ((uintptr_t)head % sizeof(u_long))
   1877       1.1   thorpej 		return EINVAL;
   1878       1.1   thorpej 
   1879       1.1   thorpej 	l->l_robust_head = (uintptr_t)head;
   1880       1.1   thorpej 
   1881       1.1   thorpej 	return 0;
   1882       1.1   thorpej }
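
/*
 * Illustrative sketch, not compiled: registering a robust list head,
 * again assuming a libc stub matching the argument order above.  The
 * buffer is shown as an array of u_long purely because that is how
 * futex_fetch_robust_head() below reads it; in practice the thread
 * library owns this structure and applications never touch it.
 *
 *	static u_long robust_head[_FUTEX_ROBUST_HEAD_NWORDS];
 *	__futex_set_robust_list(robust_head, _FUTEX_ROBUST_HEAD_SIZE);
 */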
   1883       1.1   thorpej 
   1884       1.1   thorpej /*
   1885       1.1   thorpej  * sys___futex_get_robust_list(l, uap, retval)
   1886       1.1   thorpej  *
   1887       1.1   thorpej  *	__futex_get_robust_list(2) system call for robust futexes.
   1888       1.1   thorpej  */
   1889       1.1   thorpej int
   1890       1.1   thorpej sys___futex_get_robust_list(struct lwp *l,
   1891       1.1   thorpej     const struct sys___futex_get_robust_list_args *uap, register_t *retval)
   1892       1.1   thorpej {
   1893       1.1   thorpej 	/* {
   1894       1.1   thorpej 		syscallarg(lwpid_t) lwpid;
   1895       1.1   thorpej 		syscallarg(void **) headp;
   1896       1.1   thorpej 		syscallarg(size_t *) lenp;
   1897       1.1   thorpej 	} */
   1898       1.1   thorpej 	void *head;
   1899       1.1   thorpej 	const size_t len = _FUTEX_ROBUST_HEAD_SIZE;
   1900       1.1   thorpej 	int error;
   1901       1.1   thorpej 
   1902       1.1   thorpej 	error = futex_robust_head_lookup(l, SCARG(uap, lwpid), &head);
   1903       1.1   thorpej 	if (error)
   1904       1.1   thorpej 		return error;
   1905       1.1   thorpej 
   1906       1.1   thorpej 	/* Copy out the head pointer and the head structure length. */
   1907       1.1   thorpej 	error = copyout(&head, SCARG(uap, headp), sizeof(head));
   1908       1.1   thorpej 	if (__predict_true(error == 0)) {
   1909       1.1   thorpej 		error = copyout(&len, SCARG(uap, lenp), sizeof(len));
   1910       1.1   thorpej 	}
   1911       1.1   thorpej 
   1912       1.1   thorpej 	return error;
   1913       1.1   thorpej }
   1914       1.1   thorpej 
   1915       1.1   thorpej /*
   1916       1.1   thorpej  * release_futex(uva, tid)
   1917       1.1   thorpej  *
   1918       1.1   thorpej  *	Try to release the robust futex at uva in the current process
   1919       1.1   thorpej  *	on lwp exit.  If anything goes wrong, silently fail.  It is the
   1920       1.1   thorpej  *	userland program's obligation to arrange correct behaviour.
   1921       1.1   thorpej  */
   1922       1.1   thorpej static void
   1923       1.1   thorpej release_futex(uintptr_t const uptr, lwpid_t const tid, bool const is_pi,
   1924       1.1   thorpej     bool const is_pending)
   1925       1.1   thorpej {
   1926       1.1   thorpej 	int *uaddr;
   1927       1.1   thorpej 	struct futex *f;
   1928       1.1   thorpej 	int oldval, newval, actual;
   1929       1.1   thorpej 	int error;
   1930  1.12.4.1   thorpej 	bool shared;
   1931       1.1   thorpej 
   1932       1.1   thorpej 	/* If it's misaligned, tough.  */
   1933       1.1   thorpej 	if (__predict_false(uptr & 3))
   1934       1.1   thorpej 		return;
   1935       1.1   thorpej 	uaddr = (int *)uptr;
   1936       1.1   thorpej 
   1937       1.1   thorpej 	error = futex_load(uaddr, &oldval);
   1938       1.1   thorpej 	if (__predict_false(error))
   1939       1.1   thorpej 		return;
   1940       1.1   thorpej 
   1941       1.1   thorpej 	/*
   1942       1.1   thorpej 	 * There are two race conditions we need to handle here:
   1943       1.1   thorpej 	 *
   1944       1.1   thorpej 	 * 1. User space cleared the futex word but died before
   1945       1.1   thorpej 	 *    being able to issue the wakeup.  No wakeups will
   1946       1.1   thorpej 	 *    ever be issued, oops!
   1947       1.1   thorpej 	 *
   1948       1.1   thorpej 	 * 2. Awakened waiter died before being able to acquire
   1949       1.1   thorpej 	 *    the futex in user space.  Any other waiters are
   1950       1.1   thorpej 	 *    now stuck, oops!
   1951       1.1   thorpej 	 *
   1952       1.1   thorpej 	 * In both of these cases, the futex word will be 0 (because
   1953       1.1   thorpej 	 * it's updated before the wake is issued).  The best we can
   1954       1.1   thorpej 	 * do is detect this situation if it's the pending futex and
   1955       1.1   thorpej 	 * issue a wake without modifying the futex word.
   1956       1.1   thorpej 	 *
   1957       1.1   thorpej 	 * XXX eventual PI handling?
   1958       1.1   thorpej 	 */
   1959       1.1   thorpej 	if (__predict_false(is_pending && (oldval & ~FUTEX_WAITERS) == 0)) {
   1960       1.1   thorpej 		register_t retval;
   1961  1.12.4.1   thorpej 		error = futex_func_wake(/*shared*/true, uaddr, 1,
   1962       1.1   thorpej 		    FUTEX_BITSET_MATCH_ANY, &retval);
   1963  1.12.4.1   thorpej 		if (error != 0 || retval == 0)
   1964  1.12.4.1   thorpej 			(void) futex_func_wake(/*shared*/false, uaddr, 1,
   1965  1.12.4.1   thorpej 			    FUTEX_BITSET_MATCH_ANY, &retval);
   1966       1.1   thorpej 		return;
   1967       1.1   thorpej 	}
   1968       1.1   thorpej 
   1969       1.1   thorpej 	/* Optimistically test whether we need to do anything at all.  */
   1970       1.1   thorpej 	if ((oldval & FUTEX_TID_MASK) != tid)
   1971       1.1   thorpej 		return;
   1972       1.1   thorpej 
   1973       1.1   thorpej 	/*
   1974       1.1   thorpej 	 * We need to handle the case where this thread owned the futex,
   1975       1.1   thorpej 	 * but it was uncontended.  In this case, there won't be any
   1976       1.1   thorpej 	 * kernel state to look up.  All we can do is mark the futex
   1977       1.1   thorpej 	 * as a zombie to be mopped up the next time another thread
   1978       1.1   thorpej 	 * attempts to acquire it.
   1979       1.1   thorpej 	 *
   1980       1.1   thorpej 	 * N.B. It's important to ensure to set FUTEX_OWNER_DIED in
    1981       1.1   thorpej 	 * this loop, even if waiters appear while we are doing
    1982       1.1   thorpej 	 * so.  This is because FUTEX_WAITERS is set by user space
   1983       1.1   thorpej 	 * before calling __futex() to wait, and the futex needs
   1984       1.1   thorpej 	 * to be marked as a zombie when the new waiter gets into
   1985       1.1   thorpej 	 * the kernel.
   1986       1.1   thorpej 	 */
   1987       1.1   thorpej 	if ((oldval & FUTEX_WAITERS) == 0) {
   1988       1.1   thorpej 		do {
   1989       1.1   thorpej 			error = futex_load(uaddr, &oldval);
   1990       1.1   thorpej 			if (error)
   1991       1.1   thorpej 				return;
   1992       1.1   thorpej 			if ((oldval & FUTEX_TID_MASK) != tid)
   1993       1.1   thorpej 				return;
   1994       1.1   thorpej 			newval = oldval | FUTEX_OWNER_DIED;
   1995       1.1   thorpej 			error = ucas_int(uaddr, oldval, newval, &actual);
   1996       1.1   thorpej 			if (error)
   1997       1.1   thorpej 				return;
   1998       1.1   thorpej 		} while (actual != oldval);
   1999       1.1   thorpej 
   2000       1.1   thorpej 		/*
    2001       1.1   thorpej 		 * If there is still no indication of waiters, then there is
   2002       1.1   thorpej 		 * no more work for us to do.
   2003       1.1   thorpej 		 */
   2004       1.1   thorpej 		if ((oldval & FUTEX_WAITERS) == 0)
   2005       1.1   thorpej 			return;
   2006       1.1   thorpej 	}
   2007       1.1   thorpej 
   2008       1.1   thorpej 	/*
   2009  1.12.4.1   thorpej 	 * Look for a futex.  Try shared first, then private.  If we
    2010  1.12.4.1   thorpej 	 * can't find one, tough.
   2011  1.12.4.1   thorpej 	 *
   2012  1.12.4.1   thorpej 	 * Note: the assumption here is that anyone placing a futex on
   2013  1.12.4.1   thorpej 	 * the robust list is adhering to the rules, regardless of the
   2014  1.12.4.1   thorpej 	 * futex class.
   2015       1.1   thorpej 	 */
   2016  1.12.4.1   thorpej 	for (f = NULL, shared = true; f == NULL; shared = false) {
   2017  1.12.4.1   thorpej 		error = futex_lookup(uaddr, shared, FUTEX_CLASS_ANY, &f);
   2018  1.12.4.1   thorpej 		if (error)
   2019  1.12.4.1   thorpej 			return;
   2020  1.12.4.1   thorpej 		if (f == NULL && shared == false)
   2021  1.12.4.1   thorpej 			return;
   2022  1.12.4.1   thorpej 	}
   2023       1.1   thorpej 
   2024  1.12.4.1   thorpej 	/* Work under the futex op lock.  */
   2025  1.12.4.1   thorpej 	futex_op_lock(f);
   2026       1.1   thorpej 
   2027       1.1   thorpej 	/*
   2028       1.1   thorpej 	 * Fetch the word: if the tid doesn't match ours, skip;
   2029       1.1   thorpej 	 * otherwise, set the owner-died bit, atomically.
   2030       1.1   thorpej 	 */
   2031       1.1   thorpej 	do {
   2032       1.1   thorpej 		error = futex_load(uaddr, &oldval);
   2033       1.1   thorpej 		if (error)
   2034       1.1   thorpej 			goto out;
   2035       1.1   thorpej 		if ((oldval & FUTEX_TID_MASK) != tid)
   2036       1.1   thorpej 			goto out;
   2037       1.1   thorpej 		newval = oldval | FUTEX_OWNER_DIED;
   2038       1.1   thorpej 		error = ucas_int(uaddr, oldval, newval, &actual);
   2039       1.1   thorpej 		if (error)
   2040       1.1   thorpej 			goto out;
   2041       1.1   thorpej 	} while (actual != oldval);
   2042       1.1   thorpej 
   2043       1.1   thorpej 	/*
   2044       1.1   thorpej 	 * If there may be waiters, try to wake one.  If anything goes
   2045       1.1   thorpej 	 * wrong, tough.
   2046       1.1   thorpej 	 *
   2047       1.1   thorpej 	 * XXX eventual PI handling?
   2048       1.1   thorpej 	 */
   2049  1.12.4.1   thorpej 	if (oldval & FUTEX_WAITERS) {
   2050  1.12.4.1   thorpej 		(void)futex_wake(f, FUTEX_WRITERQ, 1,
   2051  1.12.4.1   thorpej 				 NULL, FUTEX_WRITERQ, 0,
   2052  1.12.4.1   thorpej 				 FUTEX_BITSET_MATCH_ANY);
   2053  1.12.4.1   thorpej 	}
   2054       1.1   thorpej 
   2055       1.1   thorpej 	/* Unlock the queue and release the futex.  */
   2056  1.12.4.1   thorpej out:	futex_op_unlock(f);
   2057       1.5  riastrad 	futex_rele(f);
   2058       1.1   thorpej }
   2059       1.1   thorpej 
   2060       1.1   thorpej /*
   2061       1.1   thorpej  * futex_robust_head_lookup(l, lwpid)
   2062       1.1   thorpej  *
   2063       1.1   thorpej  *	Helper function to look up a robust head by LWP ID.
   2064       1.1   thorpej  */
   2065       1.1   thorpej int
   2066       1.1   thorpej futex_robust_head_lookup(struct lwp *l, lwpid_t lwpid, void **headp)
   2067       1.1   thorpej {
   2068       1.1   thorpej 	struct proc *p = l->l_proc;
   2069       1.1   thorpej 
   2070       1.1   thorpej 	/* Find the other lwp, if requested; otherwise use our robust head.  */
   2071       1.1   thorpej 	if (lwpid) {
   2072       1.1   thorpej 		mutex_enter(p->p_lock);
   2073       1.1   thorpej 		l = lwp_find(p, lwpid);
   2074       1.1   thorpej 		if (l == NULL) {
   2075       1.1   thorpej 			mutex_exit(p->p_lock);
   2076       1.1   thorpej 			return ESRCH;
   2077       1.1   thorpej 		}
   2078       1.1   thorpej 		*headp = (void *)l->l_robust_head;
   2079       1.1   thorpej 		mutex_exit(p->p_lock);
   2080       1.1   thorpej 	} else {
   2081       1.1   thorpej 		*headp = (void *)l->l_robust_head;
   2082       1.1   thorpej 	}
   2083       1.1   thorpej 	return 0;
   2084       1.1   thorpej }
   2085       1.1   thorpej 
   2086       1.1   thorpej /*
   2087       1.1   thorpej  * futex_fetch_robust_head(uaddr)
   2088       1.1   thorpej  *
   2089       1.1   thorpej  *	Helper routine to fetch the futex robust list head that
   2090       1.1   thorpej  *	handles 32-bit binaries running on 64-bit kernels.
   2091       1.1   thorpej  */
   2092       1.1   thorpej static int
   2093       1.1   thorpej futex_fetch_robust_head(uintptr_t uaddr, u_long *rhead)
   2094       1.1   thorpej {
   2095       1.1   thorpej #ifdef _LP64
   2096       1.1   thorpej 	if (curproc->p_flag & PK_32) {
   2097       1.1   thorpej 		uint32_t rhead32[_FUTEX_ROBUST_HEAD_NWORDS];
   2098       1.1   thorpej 		int error;
   2099       1.1   thorpej 
   2100       1.1   thorpej 		error = copyin((void *)uaddr, rhead32, sizeof(rhead32));
   2101       1.1   thorpej 		if (__predict_true(error == 0)) {
   2102       1.1   thorpej 			for (int i = 0; i < _FUTEX_ROBUST_HEAD_NWORDS; i++) {
   2103       1.1   thorpej 				if (i == _FUTEX_ROBUST_HEAD_OFFSET) {
   2104       1.1   thorpej 					/*
   2105       1.1   thorpej 					 * Make sure the offset is sign-
   2106       1.1   thorpej 					 * extended.
   2107       1.1   thorpej 					 */
   2108       1.1   thorpej 					rhead[i] = (int32_t)rhead32[i];
   2109       1.1   thorpej 				} else {
   2110       1.1   thorpej 					rhead[i] = rhead32[i];
   2111       1.1   thorpej 				}
   2112       1.1   thorpej 			}
   2113       1.1   thorpej 		}
   2114       1.1   thorpej 		return error;
   2115       1.1   thorpej 	}
    2116       1.1   thorpej #endif /* _LP64 */
   2117       1.1   thorpej 
   2118       1.1   thorpej 	return copyin((void *)uaddr, rhead,
   2119       1.1   thorpej 	    sizeof(*rhead) * _FUTEX_ROBUST_HEAD_NWORDS);
   2120       1.1   thorpej }
   2121       1.1   thorpej 
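/*
 * For reference, the three head words this file actually consumes,
 * shown as they are pulled out of the fetched array in
 * futex_release_all_lwp() below; the variable names here are
 * illustrative only:
 *
 *	u_long rhead[_FUTEX_ROBUST_HEAD_NWORDS];
 *	uintptr_t list    = rhead[_FUTEX_ROBUST_HEAD_LIST];
 *	long      offset  = (long)rhead[_FUTEX_ROBUST_HEAD_OFFSET];
 *	uintptr_t pending = rhead[_FUTEX_ROBUST_HEAD_PENDING];
 *
 * list and pending are robust-list entry pointers (possibly tagged
 * with _FUTEX_ROBUST_ENTRY_PI); offset is the signed distance from
 * each entry to its futex word, hence the sign extension above.
 */
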
   2122       1.1   thorpej /*
   2123       1.1   thorpej  * futex_decode_robust_word(word)
   2124       1.1   thorpej  *
   2125       1.1   thorpej  *	Decode a robust futex list word into the entry and entry
   2126       1.1   thorpej  *	properties.
   2127       1.1   thorpej  */
   2128       1.1   thorpej static inline void
   2129       1.1   thorpej futex_decode_robust_word(uintptr_t const word, uintptr_t * const entry,
   2130       1.1   thorpej     bool * const is_pi)
   2131       1.1   thorpej {
   2132       1.1   thorpej 	*is_pi = (word & _FUTEX_ROBUST_ENTRY_PI) ? true : false;
   2133       1.1   thorpej 	*entry = word & ~_FUTEX_ROBUST_ENTRY_PI;
   2134       1.1   thorpej }
   2135       1.1   thorpej 
   2136       1.1   thorpej /*
   2137       1.1   thorpej  * futex_fetch_robust_entry(uaddr)
   2138       1.1   thorpej  *
   2139       1.1   thorpej  *	Helper routine to fetch and decode a robust futex entry
   2140       1.1   thorpej  *	that handles 32-bit binaries running on 64-bit kernels.
   2141       1.1   thorpej  */
   2142       1.1   thorpej static int
   2143       1.1   thorpej futex_fetch_robust_entry(uintptr_t const uaddr, uintptr_t * const valp,
   2144       1.1   thorpej     bool * const is_pi)
   2145       1.1   thorpej {
   2146       1.1   thorpej 	uintptr_t val = 0;
   2147       1.1   thorpej 	int error = 0;
   2148       1.1   thorpej 
   2149       1.1   thorpej #ifdef _LP64
   2150       1.1   thorpej 	if (curproc->p_flag & PK_32) {
   2151       1.1   thorpej 		uint32_t val32;
   2152       1.1   thorpej 
   2153       1.1   thorpej 		error = ufetch_32((uint32_t *)uaddr, &val32);
   2154       1.1   thorpej 		if (__predict_true(error == 0))
   2155       1.1   thorpej 			val = val32;
   2156       1.1   thorpej 	} else
   2157       1.1   thorpej #endif /* _LP64 */
   2158       1.1   thorpej 		error = ufetch_long((u_long *)uaddr, (u_long *)&val);
   2159       1.1   thorpej 	if (__predict_false(error))
   2160       1.1   thorpej 		return error;
   2161       1.1   thorpej 
   2162       1.1   thorpej 	futex_decode_robust_word(val, valp, is_pi);
   2163       1.1   thorpej 	return 0;
   2164       1.1   thorpej }
   2165       1.1   thorpej 
   2166       1.1   thorpej /*
   2167       1.1   thorpej  * futex_release_all_lwp(l, tid)
   2168       1.1   thorpej  *
    2169       1.1   thorpej  *	Release all of l's robust futexes; tid is the owner TID in the
    2170       1.1   thorpej  *	futex words.  If anything looks funny along the way, give up --
    2171       1.1   thorpej  *	it's userland's responsibility to dot the i's and cross the t's.
   2172       1.1   thorpej  */
   2173       1.1   thorpej void
   2174       1.1   thorpej futex_release_all_lwp(struct lwp * const l, lwpid_t const tid)
   2175       1.1   thorpej {
   2176       1.1   thorpej 	u_long rhead[_FUTEX_ROBUST_HEAD_NWORDS];
   2177       1.1   thorpej 	int limit = 1000000;
   2178       1.1   thorpej 	int error;
   2179       1.1   thorpej 
   2180       1.1   thorpej 	/* If there's no robust list there's nothing to do. */
   2181       1.1   thorpej 	if (l->l_robust_head == 0)
   2182       1.1   thorpej 		return;
   2183       1.1   thorpej 
   2184       1.1   thorpej 	/* Read the final snapshot of the robust list head. */
   2185       1.1   thorpej 	error = futex_fetch_robust_head(l->l_robust_head, rhead);
   2186       1.1   thorpej 	if (error) {
   2187       1.1   thorpej 		printf("WARNING: pid %jd (%s) lwp %jd tid %jd:"
   2188       1.1   thorpej 		    " unmapped robust futex list head\n",
    2189       1.1   thorpej 		    (intmax_t)l->l_proc->p_pid, l->l_proc->p_comm,
    2190       1.1   thorpej 		    (intmax_t)l->l_lid, (intmax_t)tid);
   2191       1.1   thorpej 		return;
   2192       1.1   thorpej 	}
   2193       1.1   thorpej 
   2194       1.1   thorpej 	const long offset = (long)rhead[_FUTEX_ROBUST_HEAD_OFFSET];
   2195       1.1   thorpej 
   2196       1.1   thorpej 	uintptr_t next, pending;
   2197       1.1   thorpej 	bool is_pi, pending_is_pi;
   2198       1.1   thorpej 
   2199       1.1   thorpej 	futex_decode_robust_word(rhead[_FUTEX_ROBUST_HEAD_LIST],
   2200       1.1   thorpej 	    &next, &is_pi);
   2201       1.1   thorpej 	futex_decode_robust_word(rhead[_FUTEX_ROBUST_HEAD_PENDING],
   2202       1.1   thorpej 	    &pending, &pending_is_pi);
   2203       1.1   thorpej 
   2204       1.1   thorpej 	/*
    2205       1.1   thorpej 	 * Walk the circular list of locked futexes, which ends when it
    2206       1.1   thorpej 	 * points back at the head, and release up to one million of them.
   2207       1.1   thorpej 	 */
   2208       1.1   thorpej 
   2209       1.1   thorpej 	while (next != l->l_robust_head && limit-- > 0) {
   2210       1.1   thorpej 		/* pending handled below. */
   2211       1.1   thorpej 		if (next != pending)
   2212       1.1   thorpej 			release_futex(next + offset, tid, is_pi, false);
   2213       1.1   thorpej 		error = futex_fetch_robust_entry(next, &next, &is_pi);
   2214       1.1   thorpej 		if (error)
   2215       1.1   thorpej 			break;
   2216       1.1   thorpej 		preempt_point();
   2217       1.1   thorpej 	}
   2218       1.1   thorpej 	if (limit <= 0) {
   2219       1.1   thorpej 		printf("WARNING: pid %jd (%s) lwp %jd tid %jd:"
   2220       1.1   thorpej 		    " exhausted robust futex limit\n",
    2221       1.1   thorpej 		    (intmax_t)l->l_proc->p_pid, l->l_proc->p_comm,
    2222       1.1   thorpej 		    (intmax_t)l->l_lid, (intmax_t)tid);
   2223       1.1   thorpej 	}
   2224       1.1   thorpej 
   2225       1.1   thorpej 	/* If there's a pending futex, it may need to be released too. */
   2226       1.1   thorpej 	if (pending != 0) {
   2227       1.1   thorpej 		release_futex(pending + offset, tid, pending_is_pi, true);
   2228       1.1   thorpej 	}
   2229       1.1   thorpej }
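
/*
 * A rough sketch of the userland protocol assumed here (the syscall and
 * flag names come from the futex(2) interface, not from this file): each
 * thread registers its robust list head once, e.g. with
 * __futex_set_robust_list(), links a per-mutex entry into the list before
 * storing its TID in the futex word, and unlinks the entry after clearing
 * the word on unlock.  If the thread exits while still holding locks, the
 * walk above finds the leftover entries so that release_futex() can flag
 * each word (FUTEX_OWNER_DIED) and wake a waiter, keeping other threads
 * from blocking forever on a lock whose owner died.
 */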
   2230