sys_futex.c revision 1.11.2.1
      1  1.11.2.1   thorpej /*	$NetBSD: sys_futex.c,v 1.11.2.1 2020/11/01 15:16:43 thorpej Exp $	*/
      2       1.1   thorpej 
      3       1.1   thorpej /*-
      4       1.1   thorpej  * Copyright (c) 2018, 2019, 2020 The NetBSD Foundation, Inc.
      5       1.1   thorpej  * All rights reserved.
      6       1.1   thorpej  *
      7       1.1   thorpej  * This code is derived from software contributed to The NetBSD Foundation
      8       1.1   thorpej  * by Taylor R. Campbell and Jason R. Thorpe.
      9       1.1   thorpej  *
     10       1.1   thorpej  * Redistribution and use in source and binary forms, with or without
     11       1.1   thorpej  * modification, are permitted provided that the following conditions
     12       1.1   thorpej  * are met:
     13       1.1   thorpej  * 1. Redistributions of source code must retain the above copyright
     14       1.1   thorpej  *    notice, this list of conditions and the following disclaimer.
     15       1.1   thorpej  * 2. Redistributions in binary form must reproduce the above copyright
     16       1.1   thorpej  *    notice, this list of conditions and the following disclaimer in the
     17       1.1   thorpej  *    documentation and/or other materials provided with the distribution.
     18       1.1   thorpej  *
     19       1.1   thorpej  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
     20       1.1   thorpej  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
     21       1.1   thorpej  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     22       1.1   thorpej  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
     23       1.1   thorpej  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
     24       1.1   thorpej  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
     25       1.1   thorpej  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
     26       1.1   thorpej  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
     27       1.1   thorpej  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
     28       1.1   thorpej  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
     29       1.1   thorpej  * POSSIBILITY OF SUCH DAMAGE.
     30       1.1   thorpej  */
     31       1.1   thorpej 
     32       1.1   thorpej #include <sys/cdefs.h>
     33  1.11.2.1   thorpej __KERNEL_RCSID(0, "$NetBSD: sys_futex.c,v 1.11.2.1 2020/11/01 15:16:43 thorpej Exp $");
     34       1.1   thorpej 
     35       1.1   thorpej /*
     36       1.1   thorpej  * Futexes
     37       1.1   thorpej  *
     38       1.1   thorpej  *	The futex system call coordinates notifying threads waiting for
     39       1.1   thorpej  *	changes on a 32-bit word of memory.  The word can be managed by
     40       1.1   thorpej  *	CPU atomic operations in userland, without system calls, as long
     41       1.1   thorpej  *	as there is no contention.
     42       1.1   thorpej  *
     43       1.1   thorpej  *	The simplest use case demonstrating the utility is:
     44       1.1   thorpej  *
     45       1.1   thorpej  *		// 32-bit word of memory shared among threads or
     46       1.1   thorpej  *		// processes in userland.  lock & 1 means owned;
     47       1.1   thorpej  *		// lock & 2 means there are waiters waiting.
     48       1.1   thorpej  *		volatile int lock = 0;
     49       1.1   thorpej  *
     50       1.1   thorpej  *		int v;
     51       1.1   thorpej  *
     52       1.1   thorpej  *		// Acquire a lock.
     53       1.1   thorpej  *		do {
     54       1.1   thorpej  *			v = lock;
     55       1.1   thorpej  *			if (v & 1) {
     56       1.1   thorpej  *				// Lock is held.  Set a bit to say that
     57       1.1   thorpej  *				// there are waiters, and wait for lock
     58       1.1   thorpej  *				// to change to anything other than v;
     59       1.1   thorpej  *				// then retry.
     60       1.1   thorpej  *				if (atomic_cas_uint(&lock, v, v | 2) != v)
     61       1.1   thorpej  *					continue;
     62       1.1   thorpej  *				futex(FUTEX_WAIT, &lock, v | 2, NULL, NULL, 0);
     63       1.1   thorpej  *				continue;
     64       1.1   thorpej  *			}
      65       1.1   thorpej  *		} while (atomic_cas_uint(&lock, v, v | 1) != v);
     66       1.1   thorpej  *		membar_enter();
     67       1.1   thorpej  *
     68       1.1   thorpej  *		...
     69       1.1   thorpej  *
     70       1.1   thorpej  *		// Release the lock.  Optimistically assume there are
     71       1.1   thorpej  *		// no waiters first until demonstrated otherwise.
     72       1.1   thorpej  *		membar_exit();
     73       1.1   thorpej  *		if (atomic_cas_uint(&lock, 1, 0) != 1) {
     74       1.1   thorpej  *			// There may be waiters.
     75       1.1   thorpej  *			v = atomic_swap_uint(&lock, 0);
     76       1.1   thorpej  *			// If there are still waiters, wake one.
     77       1.1   thorpej  *			if (v & 2)
     78       1.1   thorpej  *				futex(FUTEX_WAKE, &lock, 1, NULL, NULL, 0);
     79       1.1   thorpej  *		}
     80       1.1   thorpej  *
     81       1.1   thorpej  *	The goal is to avoid the futex system call unless there is
     82       1.1   thorpej  *	contention; then if there is contention, to guarantee no missed
     83       1.1   thorpej  *	wakeups.
     84       1.1   thorpej  *
     85       1.1   thorpej  *	For a simple implementation, futex(FUTEX_WAIT) could queue
     86       1.1   thorpej  *	itself to be woken, double-check the lock word, and then sleep;
     87       1.1   thorpej  *	spurious wakeups are generally a fact of life, so any
     88       1.1   thorpej  *	FUTEX_WAKE could just wake every FUTEX_WAIT in the system.
     89       1.1   thorpej  *
     90       1.1   thorpej  *	If this were all there is to it, we could then increase
     91       1.1   thorpej  *	parallelism by refining the approximation: partition the
     92       1.1   thorpej  *	waiters into buckets by hashing the lock addresses to reduce
     93       1.1   thorpej  *	the incidence of spurious wakeups.  But this is not all.
     94       1.1   thorpej  *
     95       1.1   thorpej  *	The futex(FUTEX_CMP_REQUEUE, &lock, n, &lock2, m, val)
     96       1.1   thorpej  *	operation not only wakes n waiters on lock if lock == val, but
     97       1.1   thorpej  *	also _transfers_ m additional waiters to lock2.  Unless wakeups
     98       1.1   thorpej  *	on lock2 also trigger wakeups on lock, we cannot move waiters
     99       1.1   thorpej  *	to lock2 if they merely share the same hash as waiters on lock.
    100       1.1   thorpej  *	Thus, we can't approximately distribute waiters into queues by
    101       1.1   thorpej  *	a hash function; we must distinguish futex queues exactly by
    102       1.1   thorpej  *	lock address.
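                             *
                             *	For illustration only (a hedged userland sketch, not part of
                             *	this file): a condition variable broadcast might use this to
                             *	wake one waiter and requeue the rest onto the associated
                             *	mutex word, so they contend on the mutex as they wake up
                             *	instead of stampeding back to the condition variable:
                             *
                             *		// cvseq and mtx are hypothetical userland words.
                             *		v = atomic_add_int_nv(&cvseq, 1);
                             *		// Wake 1 waiter on cvseq if it still equals v;
                             *		// transfer up to INT_MAX more waiters to mtx.
                             *		futex(FUTEX_CMP_REQUEUE, &cvseq, 1, &mtx, INT_MAX, v);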
    103       1.1   thorpej  *
    104       1.1   thorpej  *	For now, we use a global red/black tree to index futexes.  This
    105       1.1   thorpej  *	should be replaced by a lockless radix tree with a thread to
    106       1.1   thorpej  *	free entries no longer in use once all lookups on all CPUs have
    107       1.1   thorpej  *	completed.
    108       1.1   thorpej  *
    109       1.1   thorpej  *	Specifically, we maintain two maps:
    110       1.1   thorpej  *
    111       1.1   thorpej  *	futex_tab.va[vmspace, va] for private futexes
    112       1.1   thorpej  *	futex_tab.oa[uvm_voaddr] for shared futexes
    113       1.1   thorpej  *
    114       1.1   thorpej  *	This implementation does not support priority inheritance.
    115       1.1   thorpej  */
    116       1.1   thorpej 
    117       1.1   thorpej #include <sys/types.h>
    118       1.1   thorpej #include <sys/atomic.h>
    119       1.1   thorpej #include <sys/condvar.h>
    120       1.1   thorpej #include <sys/futex.h>
    121       1.1   thorpej #include <sys/mutex.h>
    122       1.1   thorpej #include <sys/rbtree.h>
    123  1.11.2.1   thorpej #include <sys/sleepq.h>
    124       1.1   thorpej #include <sys/queue.h>
    125  1.11.2.1   thorpej #include <sys/sdt.h>
    126       1.1   thorpej 
    127       1.1   thorpej #include <sys/syscall.h>
    128       1.1   thorpej #include <sys/syscallargs.h>
    129       1.1   thorpej #include <sys/syscallvar.h>
    130       1.1   thorpej 
    131       1.1   thorpej #include <uvm/uvm_extern.h>
    132       1.1   thorpej 
    133       1.1   thorpej /*
    134  1.11.2.1   thorpej  * DTrace probes.
    135  1.11.2.1   thorpej  */
    136  1.11.2.1   thorpej SDT_PROVIDER_DEFINE(futex);
    137  1.11.2.1   thorpej 
    138  1.11.2.1   thorpej /* entry: uaddr, val, bitset, timeout, clkflags, fflags */
    139  1.11.2.1   thorpej /* exit: error */
    140  1.11.2.1   thorpej SDT_PROBE_DEFINE6(futex, func, wait, entry, "int *", "int", "int",
    141  1.11.2.1   thorpej 		  "struct timespec *", "int", "int");
    142  1.11.2.1   thorpej SDT_PROBE_DEFINE1(futex, func, wait, exit, "int");
    143  1.11.2.1   thorpej 
    144  1.11.2.1   thorpej /* entry: uaddr, nwake, bitset, fflags */
    145  1.11.2.1   thorpej /* exit: error, nwoken */
    146  1.11.2.1   thorpej SDT_PROBE_DEFINE4(futex, func, wake, entry, "int *", "int", "int", "int");
    147  1.11.2.1   thorpej SDT_PROBE_DEFINE2(futex, func, wake, exit, "int", "int");
    148  1.11.2.1   thorpej 
    149  1.11.2.1   thorpej /* entry: uaddr, nwake, uaddr2, nrequeue, fflags */
    150  1.11.2.1   thorpej /* exit: error, nwoken */
    151  1.11.2.1   thorpej SDT_PROBE_DEFINE5(futex, func, requeue, entry, "int *", "int", "int *", "int",
    152  1.11.2.1   thorpej 		  "int");
    153  1.11.2.1   thorpej SDT_PROBE_DEFINE2(futex, func, requeue, exit, "int", "int");
    154  1.11.2.1   thorpej 
    155  1.11.2.1   thorpej /* entry: uaddr, nwake, uaddr2, nrequeue, val3, fflags */
    156  1.11.2.1   thorpej /* exit: error, nwoken */
    157  1.11.2.1   thorpej SDT_PROBE_DEFINE6(futex, func, cmp_requeue, entry, "int *", "int", "int *",
    158  1.11.2.1   thorpej 		  "int", "int", "int");
    159  1.11.2.1   thorpej SDT_PROBE_DEFINE2(futex, func, cmp_requeue, exit, "int", "int");
    160  1.11.2.1   thorpej 
    161  1.11.2.1   thorpej /* entry: uaddr, nwake, uaddr2, nwake2, wakeop, fflags */
    162  1.11.2.1   thorpej /* exit: error, nwoken */
    163  1.11.2.1   thorpej SDT_PROBE_DEFINE6(futex, func, wake_op, entry, "int *", "int", "int *", "int",
    164  1.11.2.1   thorpej 		  "int", "int");
    165  1.11.2.1   thorpej SDT_PROBE_DEFINE2(futex, func, wake_op, exit, "int", "int");
    166  1.11.2.1   thorpej 
    167  1.11.2.1   thorpej /* entry: uaddr, val, r/w, abstime, fflags */
    168  1.11.2.1   thorpej /* exit: error */
    169  1.11.2.1   thorpej SDT_PROBE_DEFINE5(futex, func, rw_wait, entry, "int *", "int", "int",
    170  1.11.2.1   thorpej 		  "struct timespec *", "int");
    171  1.11.2.1   thorpej SDT_PROBE_DEFINE1(futex, func, rw_wait, exit, "int");
    172  1.11.2.1   thorpej 
    173  1.11.2.1   thorpej /* entry: uaddr, val, fflags */
    174  1.11.2.1   thorpej /* exit: error, nwoken */
    175  1.11.2.1   thorpej SDT_PROBE_DEFINE3(futex, func, rw_handoff, entry, "int *", "int", "int");
    176  1.11.2.1   thorpej SDT_PROBE_DEFINE2(futex, func, rw_handoff, exit, "int", "int");
    177  1.11.2.1   thorpej 
    178  1.11.2.1   thorpej SDT_PROBE_DEFINE0(futex, wait, finish, normally);
    179  1.11.2.1   thorpej SDT_PROBE_DEFINE0(futex, wait, finish, wakerace);
    180  1.11.2.1   thorpej SDT_PROBE_DEFINE0(futex, wait, finish, aborted);
    181  1.11.2.1   thorpej 
    182  1.11.2.1   thorpej /* entry: timo */
    183  1.11.2.1   thorpej /* exit: error */
    184  1.11.2.1   thorpej SDT_PROBE_DEFINE1(futex, wait, sleepq_block, entry, "int");
    185  1.11.2.1   thorpej SDT_PROBE_DEFINE1(futex, wait, sleepq_block, exit, "int");
    186  1.11.2.1   thorpej 
    187  1.11.2.1   thorpej /*
    188       1.1   thorpej  * Lock order:
    189       1.1   thorpej  *
    190       1.1   thorpej  *	futex_tab.lock
    191  1.11.2.1   thorpej  *	futex::fx_op_lock		ordered by kva of struct futex
    192  1.11.2.1   thorpej  *	 -> futex::fx_sq_lock		ordered by kva of sleepq lock
    193  1.11.2.1   thorpej  *
    194  1.11.2.1   thorpej  * N.B. multiple futexes can share a single sleepq lock.
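                             *
                             *	For example (a sketch only; the real operations appear later
                             *	in this file), an operation on a single futex is expected to
                             *	take its locks in this order:
                             *
                             *		mutex_enter(&f->fx_op_lock);
                             *		mutex_spin_enter(f->fx_sq_lock);
                             *		... wake or requeue waiters on f->fx_sqs[] ...
                             *		mutex_spin_exit(f->fx_sq_lock);
                             *		mutex_exit(&f->fx_op_lock);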
    195       1.1   thorpej  */
    196       1.1   thorpej 
    197       1.1   thorpej /*
    198       1.1   thorpej  * union futex_key
    199       1.1   thorpej  *
    200       1.1   thorpej  *	A futex is addressed either by a vmspace+va (private) or by
    201       1.1   thorpej  *	a uvm_voaddr (shared).
    202       1.1   thorpej  */
    203       1.1   thorpej union futex_key {
    204       1.1   thorpej 	struct {
    205       1.1   thorpej 		struct vmspace			*vmspace;
    206       1.1   thorpej 		vaddr_t				va;
    207       1.1   thorpej 	}			fk_private;
    208       1.1   thorpej 	struct uvm_voaddr	fk_shared;
    209       1.1   thorpej };
    210       1.1   thorpej 
    211       1.1   thorpej /*
    212       1.1   thorpej  * struct futex
    213       1.1   thorpej  *
    214       1.1   thorpej  *	Kernel state for a futex located at a particular address in a
    215       1.1   thorpej  *	particular virtual address space.
    216       1.1   thorpej  *
    217       1.1   thorpej  *	N.B. fx_refcnt is an unsigned long because we need to be able
    218       1.1   thorpej  *	to operate on it atomically on all systems while at the same
    219       1.1   thorpej  *	time rendering practically impossible the chance of it reaching
    220       1.1   thorpej  *	its max value.  In practice, we're limited by the number of LWPs
    221       1.1   thorpej  *	that can be present on the system at any given time, and the
    222       1.1   thorpej  *	assumption is that limit will be good enough on a 32-bit platform.
    223       1.1   thorpej  *	See futex_wake() for why overflow needs to be avoided.
    224  1.11.2.1   thorpej  *
    225  1.11.2.1   thorpej  *	XXX Since futex addresses must be 4-byte aligned, we could
    226  1.11.2.1   thorpej  *	XXX squirrel away fx_shared and fx_on_tree bits in the "va"
    227  1.11.2.1   thorpej  *	XXX field of the key.  Worth it?
    228       1.1   thorpej  */
    229       1.1   thorpej struct futex {
    230       1.1   thorpej 	union futex_key		fx_key;
    231  1.11.2.1   thorpej 	struct rb_node		fx_node;
    232       1.1   thorpej 	unsigned long		fx_refcnt;
    233       1.1   thorpej 	bool			fx_shared;
    234       1.1   thorpej 	bool			fx_on_tree;
    235  1.11.2.1   thorpej 	uint8_t			fx_class;
    236       1.1   thorpej 
    237  1.11.2.1   thorpej 	kmutex_t		fx_op_lock;	/* adaptive */
    238  1.11.2.1   thorpej 	kmutex_t *		fx_sq_lock;	/* &sleepq_locks[...] */
    239  1.11.2.1   thorpej 	sleepq_t		fx_sqs[2];	/* 0=reader, 1=writer */
    240  1.11.2.1   thorpej 	unsigned int		fx_nwaiters[2];
    241       1.1   thorpej };
    242       1.1   thorpej 
    243       1.1   thorpej /*
    244  1.11.2.1   thorpej  * futex classes: Some futex operations can only be carried out on
    245  1.11.2.1   thorpej  * futexes that are known to be abiding by a certain protocol.  These
    246  1.11.2.1   thorpej  * classes are assigned to a futex when created due to a wait event,
    247  1.11.2.1   thorpej  * and when subsequent wake or requeue operations are issued, the
    248  1.11.2.1   thorpej  * class is checked at futex lookup time.  If the class does not match,
    249  1.11.2.1   thorpej  * EINVAL is the result.
    250  1.11.2.1   thorpej  */
    251  1.11.2.1   thorpej #define	FUTEX_CLASS_ANY		0		/* match any class in lookup */
    252  1.11.2.1   thorpej #define	FUTEX_CLASS_NORMAL	1		/* normal futex */
    253  1.11.2.1   thorpej #define	FUTEX_CLASS_RWLOCK	2		/* rwlock futex */
    254  1.11.2.1   thorpej #define	FUTEX_CLASS_PI		3		/* for FUTEX_*_PI ops */
    255  1.11.2.1   thorpej 
    256  1.11.2.1   thorpej /* sleepq definitions */
    257  1.11.2.1   thorpej #define	FUTEX_READERQ		0
    258  1.11.2.1   thorpej #define	FUTEX_WRITERQ		1
    259  1.11.2.1   thorpej 
    260  1.11.2.1   thorpej CTASSERT(FUTEX_READERQ == FUTEX_RW_READER);
    261  1.11.2.1   thorpej CTASSERT(FUTEX_WRITERQ == FUTEX_RW_WRITER);
    262  1.11.2.1   thorpej 
    263  1.11.2.1   thorpej static const char futex_wmesg[] = "futex";
    264  1.11.2.1   thorpej 
    265  1.11.2.1   thorpej static void	futex_unsleep(lwp_t *, bool);
    266  1.11.2.1   thorpej 
    267  1.11.2.1   thorpej static syncobj_t futex_syncobj = {
    268  1.11.2.1   thorpej 	.sobj_flag	= SOBJ_SLEEPQ_SORTED,
    269  1.11.2.1   thorpej 	.sobj_unsleep	= futex_unsleep,
    270  1.11.2.1   thorpej 	.sobj_changepri	= sleepq_changepri,
    271  1.11.2.1   thorpej 	.sobj_lendpri	= sleepq_lendpri,
    272  1.11.2.1   thorpej 	.sobj_owner	= syncobj_noowner,
    273       1.1   thorpej };
    274       1.1   thorpej 
    275       1.1   thorpej /*
    276       1.1   thorpej  * futex_tab
    277       1.1   thorpej  *
    278       1.1   thorpej  *	Global trees of futexes by vmspace/va and VM object address.
    279       1.1   thorpej  *
    280       1.1   thorpej  *	XXX This obviously doesn't scale in parallel.  We could use a
    281       1.1   thorpej  *	pserialize-safe data structure, but there may be a high cost to
    282       1.1   thorpej  *	frequent deletion since we don't cache futexes after we're done
    283       1.1   thorpej  *	with them.  We could use hashed locks.  But for now, just make
    284       1.1   thorpej  *	sure userland can't DoS the serial performance, by using a
    285       1.1   thorpej  *	balanced binary tree for lookup.
    286       1.1   thorpej  *
    287       1.1   thorpej  *	XXX We could use a per-process tree for the table indexed by
    288       1.1   thorpej  *	virtual address to reduce contention between processes.
    289       1.1   thorpej  */
    290       1.1   thorpej static struct {
    291       1.1   thorpej 	kmutex_t	lock;
    292       1.1   thorpej 	struct rb_tree	va;
    293       1.1   thorpej 	struct rb_tree	oa;
    294       1.1   thorpej } futex_tab __cacheline_aligned;
    295       1.1   thorpej 
    296       1.1   thorpej static int
    297       1.1   thorpej compare_futex_key(void *cookie, const void *n, const void *k)
    298       1.1   thorpej {
    299       1.1   thorpej 	const struct futex *fa = n;
    300       1.1   thorpej 	const union futex_key *fka = &fa->fx_key;
    301       1.1   thorpej 	const union futex_key *fkb = k;
    302       1.1   thorpej 
    303       1.1   thorpej 	if ((uintptr_t)fka->fk_private.vmspace <
    304       1.1   thorpej 	    (uintptr_t)fkb->fk_private.vmspace)
    305       1.1   thorpej 		return -1;
    306       1.1   thorpej 	if ((uintptr_t)fka->fk_private.vmspace >
    307       1.1   thorpej 	    (uintptr_t)fkb->fk_private.vmspace)
    308       1.1   thorpej 		return +1;
    309       1.1   thorpej 	if (fka->fk_private.va < fkb->fk_private.va)
    310       1.1   thorpej 		return -1;
    311       1.1   thorpej 	if (fka->fk_private.va > fkb->fk_private.va)
     312       1.1   thorpej 		return +1;
    313       1.1   thorpej 	return 0;
    314       1.1   thorpej }
    315       1.1   thorpej 
    316       1.1   thorpej static int
    317       1.1   thorpej compare_futex(void *cookie, const void *na, const void *nb)
    318       1.1   thorpej {
    319       1.1   thorpej 	const struct futex *fa = na;
    320       1.1   thorpej 	const struct futex *fb = nb;
    321       1.1   thorpej 
    322       1.1   thorpej 	return compare_futex_key(cookie, fa, &fb->fx_key);
    323       1.1   thorpej }
    324       1.1   thorpej 
    325       1.1   thorpej static const rb_tree_ops_t futex_rb_ops = {
    326       1.1   thorpej 	.rbto_compare_nodes = compare_futex,
    327       1.1   thorpej 	.rbto_compare_key = compare_futex_key,
    328       1.1   thorpej 	.rbto_node_offset = offsetof(struct futex, fx_node),
    329       1.1   thorpej };
    330       1.1   thorpej 
    331       1.1   thorpej static int
    332       1.1   thorpej compare_futex_shared_key(void *cookie, const void *n, const void *k)
    333       1.1   thorpej {
    334       1.1   thorpej 	const struct futex *fa = n;
    335       1.1   thorpej 	const union futex_key *fka = &fa->fx_key;
    336       1.1   thorpej 	const union futex_key *fkb = k;
    337       1.1   thorpej 
    338       1.1   thorpej 	return uvm_voaddr_compare(&fka->fk_shared, &fkb->fk_shared);
    339       1.1   thorpej }
    340       1.1   thorpej 
    341       1.1   thorpej static int
    342       1.1   thorpej compare_futex_shared(void *cookie, const void *na, const void *nb)
    343       1.1   thorpej {
    344       1.1   thorpej 	const struct futex *fa = na;
    345       1.1   thorpej 	const struct futex *fb = nb;
    346       1.1   thorpej 
    347       1.1   thorpej 	return compare_futex_shared_key(cookie, fa, &fb->fx_key);
    348       1.1   thorpej }
    349       1.1   thorpej 
    350       1.1   thorpej static const rb_tree_ops_t futex_shared_rb_ops = {
    351       1.1   thorpej 	.rbto_compare_nodes = compare_futex_shared,
    352       1.1   thorpej 	.rbto_compare_key = compare_futex_shared_key,
    353       1.1   thorpej 	.rbto_node_offset = offsetof(struct futex, fx_node),
    354       1.1   thorpej };
    355       1.1   thorpej 
    356  1.11.2.1   thorpej /*
    357  1.11.2.1   thorpej  * futex_sq_lock2(f, f2)
    358  1.11.2.1   thorpej  *
    359  1.11.2.1   thorpej  *	Acquire the sleepq locks for f and f2, which may be null, or
    360  1.11.2.1   thorpej  *	which may be the same.  If they are distinct, an arbitrary total
    361  1.11.2.1   thorpej  *	order is chosen on the locks.
    362  1.11.2.1   thorpej  *
    363  1.11.2.1   thorpej  *	Callers should only ever acquire multiple sleepq locks
    364  1.11.2.1   thorpej  *	simultaneously using futex_sq_lock2.
    365  1.11.2.1   thorpej  */
    366  1.11.2.1   thorpej static void
    367  1.11.2.1   thorpej futex_sq_lock2(struct futex * const f, struct futex * const f2)
    368  1.11.2.1   thorpej {
    369  1.11.2.1   thorpej 
    370  1.11.2.1   thorpej 	/*
    371  1.11.2.1   thorpej 	 * If both are null, do nothing; if one is null and the other
    372  1.11.2.1   thorpej 	 * is not, lock the other and be done with it.
    373  1.11.2.1   thorpej 	 */
    374  1.11.2.1   thorpej 	if (f == NULL && f2 == NULL) {
    375  1.11.2.1   thorpej 		return;
    376  1.11.2.1   thorpej 	} else if (f == NULL) {
    377  1.11.2.1   thorpej 		mutex_spin_enter(f2->fx_sq_lock);
    378  1.11.2.1   thorpej 		return;
    379  1.11.2.1   thorpej 	} else if (f2 == NULL) {
    380  1.11.2.1   thorpej 		mutex_spin_enter(f->fx_sq_lock);
    381  1.11.2.1   thorpej 		return;
    382  1.11.2.1   thorpej 	}
    383  1.11.2.1   thorpej 
    384  1.11.2.1   thorpej 	kmutex_t * const m = f->fx_sq_lock;
    385  1.11.2.1   thorpej 	kmutex_t * const m2 = f2->fx_sq_lock;
    386  1.11.2.1   thorpej 
    387  1.11.2.1   thorpej 	/* If both locks are the same, acquire only one. */
    388  1.11.2.1   thorpej 	if (m == m2) {
    389  1.11.2.1   thorpej 		mutex_spin_enter(m);
    390  1.11.2.1   thorpej 		return;
    391  1.11.2.1   thorpej 	}
    392  1.11.2.1   thorpej 
    393  1.11.2.1   thorpej 	/* Otherwise, use the ordering on the kva of the lock pointer.  */
    394  1.11.2.1   thorpej 	if ((uintptr_t)m < (uintptr_t)m2) {
    395  1.11.2.1   thorpej 		mutex_spin_enter(m);
    396  1.11.2.1   thorpej 		mutex_spin_enter(m2);
    397  1.11.2.1   thorpej 	} else {
    398  1.11.2.1   thorpej 		mutex_spin_enter(m2);
    399  1.11.2.1   thorpej 		mutex_spin_enter(m);
    400  1.11.2.1   thorpej 	}
    401  1.11.2.1   thorpej }
    402  1.11.2.1   thorpej 
    403  1.11.2.1   thorpej /*
    404  1.11.2.1   thorpej  * futex_sq_unlock2(f, f2)
    405  1.11.2.1   thorpej  *
    406  1.11.2.1   thorpej  *	Release the sleep queue locks for f and f2, which may be null, or
    407  1.11.2.1   thorpej  *	which may have the same underlying lock.
    408  1.11.2.1   thorpej  */
    409  1.11.2.1   thorpej static void
    410  1.11.2.1   thorpej futex_sq_unlock2(struct futex * const f, struct futex * const f2)
    411  1.11.2.1   thorpej {
    412  1.11.2.1   thorpej 
    413  1.11.2.1   thorpej 	/*
    414  1.11.2.1   thorpej 	 * If both are null, do nothing; if one is null and the other
    415  1.11.2.1   thorpej 	 * is not, unlock the other and be done with it.
    416  1.11.2.1   thorpej 	 */
    417  1.11.2.1   thorpej 	if (f == NULL && f2 == NULL) {
    418  1.11.2.1   thorpej 		return;
    419  1.11.2.1   thorpej 	} else if (f == NULL) {
    420  1.11.2.1   thorpej 		mutex_spin_exit(f2->fx_sq_lock);
    421  1.11.2.1   thorpej 		return;
    422  1.11.2.1   thorpej 	} else if (f2 == NULL) {
    423  1.11.2.1   thorpej 		mutex_spin_exit(f->fx_sq_lock);
    424  1.11.2.1   thorpej 		return;
    425  1.11.2.1   thorpej 	}
    426  1.11.2.1   thorpej 
    427  1.11.2.1   thorpej 	kmutex_t * const m = f->fx_sq_lock;
    428  1.11.2.1   thorpej 	kmutex_t * const m2 = f2->fx_sq_lock;
    429  1.11.2.1   thorpej 
    430  1.11.2.1   thorpej 	/* If both locks are the same, release only one. */
    431  1.11.2.1   thorpej 	if (m == m2) {
    432  1.11.2.1   thorpej 		mutex_spin_exit(m);
    433  1.11.2.1   thorpej 		return;
    434  1.11.2.1   thorpej 	}
    435  1.11.2.1   thorpej 
    436  1.11.2.1   thorpej 	/* Otherwise, use the ordering on the kva of the lock pointer.  */
    437  1.11.2.1   thorpej 	if ((uintptr_t)m < (uintptr_t)m2) {
    438  1.11.2.1   thorpej 		mutex_spin_exit(m2);
    439  1.11.2.1   thorpej 		mutex_spin_exit(m);
    440  1.11.2.1   thorpej 	} else {
    441  1.11.2.1   thorpej 		mutex_spin_exit(m);
    442  1.11.2.1   thorpej 		mutex_spin_exit(m2);
    443  1.11.2.1   thorpej 	}
    444  1.11.2.1   thorpej }
    445       1.1   thorpej 
    446       1.1   thorpej /*
    447       1.1   thorpej  * futex_load(uaddr, kaddr)
    448       1.1   thorpej  *
    449       1.1   thorpej  *	Perform a single atomic load to read *uaddr, and return the
    450       1.1   thorpej  *	result in *kaddr.  Return 0 on success, EFAULT if uaddr is not
    451       1.1   thorpej  *	mapped.
    452       1.1   thorpej  */
    453       1.1   thorpej static inline int
    454       1.1   thorpej futex_load(int *uaddr, int *kaddr)
    455       1.1   thorpej {
    456       1.1   thorpej 	return ufetch_int((u_int *)uaddr, (u_int *)kaddr);
    457       1.1   thorpej }
    458       1.1   thorpej 
    459       1.1   thorpej /*
    460       1.1   thorpej  * futex_test(uaddr, expected)
    461       1.1   thorpej  *
    462       1.1   thorpej  *	True if *uaddr == expected.  False if *uaddr != expected, or if
    463       1.1   thorpej  *	uaddr is not mapped.
    464       1.1   thorpej  */
    465       1.1   thorpej static bool
    466       1.1   thorpej futex_test(int *uaddr, int expected)
    467       1.1   thorpej {
    468       1.1   thorpej 	int val;
    469       1.1   thorpej 	int error;
    470       1.1   thorpej 
    471       1.1   thorpej 	error = futex_load(uaddr, &val);
    472       1.1   thorpej 	if (error)
    473       1.1   thorpej 		return false;
    474       1.1   thorpej 	return val == expected;
    475       1.1   thorpej }
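
                             /*
                              * For example (a sketch of intended usage, not a function defined
                              * here): the wait paths elsewhere in this file are expected to
                              * re-check the futex word while holding the op lock, and to refuse
                              * to sleep if it no longer holds the expected value:
                              *
                              *	if (!futex_test(uaddr, val))
                              *		return EAGAIN;	// *uaddr changed; do not sleep
                              */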
    476       1.1   thorpej 
    477  1.11.2.1   thorpej static pool_cache_t	futex_cache		__read_mostly;
    478  1.11.2.1   thorpej 
    479  1.11.2.1   thorpej static int	futex_ctor(void *, void *, int);
    480  1.11.2.1   thorpej static void	futex_dtor(void *, void *);
    481  1.11.2.1   thorpej 
    482       1.1   thorpej /*
    483       1.1   thorpej  * futex_sys_init()
    484       1.1   thorpej  *
    485       1.1   thorpej  *	Initialize the futex subsystem.
    486       1.1   thorpej  */
    487       1.1   thorpej void
    488       1.1   thorpej futex_sys_init(void)
    489       1.1   thorpej {
    490       1.1   thorpej 
    491       1.1   thorpej 	mutex_init(&futex_tab.lock, MUTEX_DEFAULT, IPL_NONE);
    492       1.1   thorpej 	rb_tree_init(&futex_tab.va, &futex_rb_ops);
    493       1.1   thorpej 	rb_tree_init(&futex_tab.oa, &futex_shared_rb_ops);
    494  1.11.2.1   thorpej 
    495  1.11.2.1   thorpej 	futex_cache = pool_cache_init(sizeof(struct futex),
    496  1.11.2.1   thorpej 	    coherency_unit, 0, 0, "futex", NULL, IPL_NONE, futex_ctor,
    497  1.11.2.1   thorpej 	    futex_dtor, NULL);
    498       1.1   thorpej }
    499       1.1   thorpej 
    500       1.1   thorpej /*
    501       1.1   thorpej  * futex_sys_fini()
    502       1.1   thorpej  *
    503       1.1   thorpej  *	Finalize the futex subsystem.
    504       1.1   thorpej  */
    505       1.1   thorpej void
    506       1.1   thorpej futex_sys_fini(void)
    507       1.1   thorpej {
    508       1.1   thorpej 
    509       1.1   thorpej 	KASSERT(RB_TREE_MIN(&futex_tab.oa) == NULL);
    510       1.1   thorpej 	KASSERT(RB_TREE_MIN(&futex_tab.va) == NULL);
    511       1.1   thorpej 	mutex_destroy(&futex_tab.lock);
    512       1.1   thorpej }
    513       1.1   thorpej 
    514       1.1   thorpej /*
    515  1.11.2.1   thorpej  * futex_ctor()
    516       1.1   thorpej  *
    517  1.11.2.1   thorpej  *	Futex object constructor.
    518       1.1   thorpej  */
    519  1.11.2.1   thorpej static int
    520  1.11.2.1   thorpej futex_ctor(void *arg __unused, void *obj, int flags __unused)
    521       1.1   thorpej {
    522  1.11.2.1   thorpej 	extern sleepqlock_t sleepq_locks[SLEEPTAB_HASH_SIZE];
    523  1.11.2.1   thorpej 	struct futex * const f = obj;
    524       1.1   thorpej 
    525  1.11.2.1   thorpej 	mutex_init(&f->fx_op_lock, MUTEX_DEFAULT, IPL_NONE);
    526  1.11.2.1   thorpej 	f->fx_sq_lock = &sleepq_locks[SLEEPTAB_HASH(f)].lock;
    527       1.1   thorpej 
    528  1.11.2.1   thorpej 	sleepq_init(&f->fx_sqs[FUTEX_READERQ]);
    529  1.11.2.1   thorpej 	sleepq_init(&f->fx_sqs[FUTEX_WRITERQ]);
    530  1.11.2.1   thorpej 	f->fx_nwaiters[FUTEX_READERQ] = f->fx_nwaiters[FUTEX_WRITERQ] = 0;
    531       1.1   thorpej 
    532  1.11.2.1   thorpej 	return 0;
    533       1.1   thorpej }
    534       1.1   thorpej 
    535       1.1   thorpej /*
    536  1.11.2.1   thorpej  * futex_dtor()
    537       1.1   thorpej  *
    538  1.11.2.1   thorpej  *	Futex object destructor.
    539       1.1   thorpej  */
    540       1.1   thorpej static void
    541  1.11.2.1   thorpej futex_dtor(void *arg __unused, void *obj)
    542       1.1   thorpej {
    543  1.11.2.1   thorpej 	struct futex * const f = obj;
    544       1.1   thorpej 
    545  1.11.2.1   thorpej 	mutex_destroy(&f->fx_op_lock);
    546  1.11.2.1   thorpej 	f->fx_sq_lock = NULL;
    547       1.1   thorpej }
    548       1.1   thorpej 
    549       1.1   thorpej /*
    550       1.1   thorpej  * futex_key_init(key, vm, va, shared)
    551       1.1   thorpej  *
    552       1.1   thorpej  *	Initialize a futex key for lookup, etc.
    553       1.1   thorpej  */
    554       1.1   thorpej static int
    555       1.1   thorpej futex_key_init(union futex_key *fk, struct vmspace *vm, vaddr_t va, bool shared)
    556       1.1   thorpej {
    557       1.1   thorpej 	int error = 0;
    558       1.1   thorpej 
    559       1.1   thorpej 	if (__predict_false(shared)) {
    560       1.1   thorpej 		if (!uvm_voaddr_acquire(&vm->vm_map, va, &fk->fk_shared))
    561       1.1   thorpej 			error = EFAULT;
    562       1.1   thorpej 	} else {
    563       1.1   thorpej 		fk->fk_private.vmspace = vm;
    564       1.1   thorpej 		fk->fk_private.va = va;
    565       1.1   thorpej 	}
    566       1.1   thorpej 
    567       1.1   thorpej 	return error;
    568       1.1   thorpej }
    569       1.1   thorpej 
    570       1.1   thorpej /*
    571       1.1   thorpej  * futex_key_fini(key, shared)
    572       1.1   thorpej  *
    573       1.1   thorpej  *	Release a futex key.
    574       1.1   thorpej  */
    575       1.1   thorpej static void
    576       1.1   thorpej futex_key_fini(union futex_key *fk, bool shared)
    577       1.1   thorpej {
    578       1.1   thorpej 	if (__predict_false(shared))
    579       1.1   thorpej 		uvm_voaddr_release(&fk->fk_shared);
    580       1.1   thorpej 	memset(fk, 0, sizeof(*fk));
    581       1.1   thorpej }
    582       1.1   thorpej 
    583       1.1   thorpej /*
    584       1.1   thorpej  * futex_create(fk, shared)
    585       1.1   thorpej  *
    586       1.1   thorpej  *	Create a futex.  Initial reference count is 1, representing the
    587       1.1   thorpej  *	caller.  Returns NULL on failure.  Always takes ownership of the
    588       1.1   thorpej  *	key, either transferring it to the newly-created futex, or releasing
    589       1.1   thorpej  *	the key if creation fails.
    590       1.1   thorpej  *
    591       1.1   thorpej  *	Never sleeps for memory, but may sleep to acquire a lock.
    592       1.1   thorpej  */
    593       1.1   thorpej static struct futex *
    594  1.11.2.1   thorpej futex_create(union futex_key *fk, bool shared, uint8_t class)
    595       1.1   thorpej {
    596       1.1   thorpej 	struct futex *f;
    597       1.1   thorpej 
    598  1.11.2.1   thorpej 	f = pool_cache_get(futex_cache, PR_NOWAIT);
    599       1.1   thorpej 	if (f == NULL) {
    600       1.1   thorpej 		futex_key_fini(fk, shared);
    601       1.1   thorpej 		return NULL;
    602       1.1   thorpej 	}
    603       1.1   thorpej 	f->fx_key = *fk;
    604       1.1   thorpej 	f->fx_refcnt = 1;
    605       1.1   thorpej 	f->fx_shared = shared;
    606       1.1   thorpej 	f->fx_on_tree = false;
    607  1.11.2.1   thorpej 	f->fx_class = class;
    608       1.1   thorpej 
    609       1.1   thorpej 	return f;
    610       1.1   thorpej }
    611       1.1   thorpej 
    612       1.1   thorpej /*
    613       1.1   thorpej  * futex_destroy(f)
    614       1.1   thorpej  *
    615       1.1   thorpej  *	Destroy a futex created with futex_create.  Reference count
    616       1.1   thorpej  *	must be zero.
    617       1.1   thorpej  *
    618       1.1   thorpej  *	May sleep.
    619       1.1   thorpej  */
    620       1.1   thorpej static void
    621       1.1   thorpej futex_destroy(struct futex *f)
    622       1.1   thorpej {
    623       1.1   thorpej 
    624       1.1   thorpej 	ASSERT_SLEEPABLE();
    625       1.1   thorpej 
    626       1.1   thorpej 	KASSERT(atomic_load_relaxed(&f->fx_refcnt) == 0);
    627       1.1   thorpej 	KASSERT(!f->fx_on_tree);
    628  1.11.2.1   thorpej 	KASSERT(LIST_EMPTY(&f->fx_sqs[FUTEX_READERQ]));
    629  1.11.2.1   thorpej 	KASSERT(LIST_EMPTY(&f->fx_sqs[FUTEX_WRITERQ]));
    630  1.11.2.1   thorpej 	KASSERT(f->fx_nwaiters[FUTEX_READERQ] == 0);
    631  1.11.2.1   thorpej 	KASSERT(f->fx_nwaiters[FUTEX_WRITERQ] == 0);
    632       1.1   thorpej 
    633       1.1   thorpej 	futex_key_fini(&f->fx_key, f->fx_shared);
    634       1.1   thorpej 
    635  1.11.2.1   thorpej 	pool_cache_put(futex_cache, f);
    636       1.1   thorpej }
    637       1.1   thorpej 
    638       1.1   thorpej /*
    639  1.11.2.1   thorpej  * futex_hold_count(f, n)
    640       1.1   thorpej  *
    641       1.1   thorpej  *	Attempt to acquire a reference to f.  Return 0 on success,
    642       1.1   thorpej  *	ENFILE on too many references.
    643       1.1   thorpej  *
    644       1.1   thorpej  *	Never sleeps.
    645       1.1   thorpej  */
    646       1.1   thorpej static int
    647  1.11.2.1   thorpej futex_hold_count(struct futex *f, unsigned long const count)
    648       1.1   thorpej {
    649       1.1   thorpej 	unsigned long refcnt;
    650       1.1   thorpej 
    651       1.1   thorpej 	do {
    652       1.1   thorpej 		refcnt = atomic_load_relaxed(&f->fx_refcnt);
    653  1.11.2.1   thorpej 		if (ULONG_MAX - refcnt < count)
    654       1.1   thorpej 			return ENFILE;
    655  1.11.2.1   thorpej 	} while (atomic_cas_ulong(&f->fx_refcnt, refcnt,
    656  1.11.2.1   thorpej 				  refcnt + count) != refcnt);
    657       1.1   thorpej 
    658       1.1   thorpej 	return 0;
    659       1.1   thorpej }
    660  1.11.2.1   thorpej #define	futex_hold(f)	futex_hold_count(f, 1)
    661       1.1   thorpej 
    662       1.1   thorpej /*
    663  1.11.2.1   thorpej  * futex_rele_count(f, n)
    664       1.1   thorpej  *
    665       1.1   thorpej  *	Release a reference to f acquired with futex_create or
    666       1.1   thorpej  *	futex_hold.
    667       1.1   thorpej  *
    668       1.1   thorpej  *	May sleep to free f.
    669       1.1   thorpej  */
    670       1.1   thorpej static void
    671  1.11.2.1   thorpej futex_rele_count(struct futex *f, unsigned long const count)
    672       1.1   thorpej {
    673       1.1   thorpej 	unsigned long refcnt;
    674       1.1   thorpej 
    675       1.1   thorpej 	ASSERT_SLEEPABLE();
    676       1.1   thorpej 
    677       1.1   thorpej 	do {
    678       1.1   thorpej 		refcnt = atomic_load_relaxed(&f->fx_refcnt);
    679  1.11.2.1   thorpej 		KASSERT(refcnt >= count);
    680  1.11.2.1   thorpej 		if (refcnt - count == 0)
    681       1.1   thorpej 			goto trylast;
    682  1.11.2.1   thorpej 	} while (atomic_cas_ulong(&f->fx_refcnt, refcnt,
    683  1.11.2.1   thorpej 				  refcnt - count) != refcnt);
    684       1.1   thorpej 	return;
    685       1.1   thorpej 
    686       1.1   thorpej trylast:
    687  1.11.2.1   thorpej 	KASSERT(count <= LONG_MAX);
    688       1.1   thorpej 	mutex_enter(&futex_tab.lock);
    689  1.11.2.1   thorpej 	if (atomic_add_long_nv(&f->fx_refcnt, -(long)count) == 0) {
    690       1.1   thorpej 		if (f->fx_on_tree) {
    691       1.1   thorpej 			if (__predict_false(f->fx_shared))
    692       1.1   thorpej 				rb_tree_remove_node(&futex_tab.oa, f);
    693       1.1   thorpej 			else
    694       1.1   thorpej 				rb_tree_remove_node(&futex_tab.va, f);
    695       1.1   thorpej 			f->fx_on_tree = false;
    696       1.1   thorpej 		}
    697       1.1   thorpej 	} else {
    698       1.1   thorpej 		/* References remain -- don't destroy it.  */
    699       1.1   thorpej 		f = NULL;
    700       1.1   thorpej 	}
    701       1.1   thorpej 	mutex_exit(&futex_tab.lock);
    702       1.1   thorpej 	if (f != NULL)
    703       1.1   thorpej 		futex_destroy(f);
    704       1.1   thorpej }
    705  1.11.2.1   thorpej #define	futex_rele(f)	futex_rele_count(f, 1)
    706       1.1   thorpej 
    707       1.1   thorpej /*
    708  1.11.2.1   thorpej  * futex_rele_count_not_last(f, n)
    709       1.1   thorpej  *
    710       1.1   thorpej  *	Release a reference to f acquired with futex_create or
    711       1.1   thorpej  *	futex_hold.
    712       1.1   thorpej  *
    713       1.1   thorpej  *	This version asserts that we are not dropping the last
    714       1.1   thorpej  *	reference to f.
    715       1.1   thorpej  */
    716       1.1   thorpej static void
    717  1.11.2.1   thorpej futex_rele_count_not_last(struct futex *f, unsigned long const count)
    718       1.1   thorpej {
    719       1.1   thorpej 	unsigned long refcnt;
    720       1.1   thorpej 
    721       1.1   thorpej 	do {
    722       1.1   thorpej 		refcnt = atomic_load_relaxed(&f->fx_refcnt);
    723  1.11.2.1   thorpej 		KASSERT(refcnt >= count);
    724  1.11.2.1   thorpej 	} while (atomic_cas_ulong(&f->fx_refcnt, refcnt,
    725  1.11.2.1   thorpej 				  refcnt - count) != refcnt);
    726       1.1   thorpej }
    727       1.1   thorpej 
    728       1.1   thorpej /*
    729  1.11.2.1   thorpej  * futex_lookup_by_key(key, shared, class, &f)
    730       1.1   thorpej  *
     731       1.1   thorpej  *	Try to find an existing futex matching the specified key.
     732       1.1   thorpej  *	On success, return 0, set f to the futex found or to NULL if
     733       1.1   thorpej  *	not found, and increment f's reference count if found.
    734       1.1   thorpej  *
    735       1.1   thorpej  *	Return ENFILE if reference count too high.
    736       1.1   thorpej  *
    737       1.1   thorpej  *	Internal lookup routine shared by futex_lookup() and
    738       1.5  riastrad  *	futex_lookup_create().
    739       1.1   thorpej  */
    740       1.1   thorpej static int
    741  1.11.2.1   thorpej futex_lookup_by_key(union futex_key *fk, bool shared, uint8_t class,
    742  1.11.2.1   thorpej     struct futex **fp)
    743       1.1   thorpej {
    744       1.1   thorpej 	struct futex *f;
    745       1.1   thorpej 	int error = 0;
    746       1.1   thorpej 
    747       1.1   thorpej 	mutex_enter(&futex_tab.lock);
    748       1.1   thorpej 	if (__predict_false(shared)) {
    749       1.1   thorpej 		f = rb_tree_find_node(&futex_tab.oa, fk);
    750       1.1   thorpej 	} else {
    751       1.1   thorpej 		f = rb_tree_find_node(&futex_tab.va, fk);
    752       1.1   thorpej 	}
    753       1.1   thorpej 	if (f) {
    754  1.11.2.1   thorpej 		if (__predict_true(f->fx_class == class ||
    755  1.11.2.1   thorpej 				   class == FUTEX_CLASS_ANY))
    756  1.11.2.1   thorpej 			error = futex_hold(f);
    757  1.11.2.1   thorpej 		else
    758  1.11.2.1   thorpej 			error = EINVAL;
    759       1.1   thorpej 		if (error)
    760       1.1   thorpej 			f = NULL;
    761       1.1   thorpej 	}
    762       1.1   thorpej  	*fp = f;
    763       1.1   thorpej 	mutex_exit(&futex_tab.lock);
    764       1.1   thorpej 
    765       1.1   thorpej 	return error;
    766       1.1   thorpej }
    767       1.1   thorpej 
    768       1.1   thorpej /*
    769       1.1   thorpej  * futex_insert(f, fp)
    770       1.1   thorpej  *
    771       1.1   thorpej  *	Try to insert the futex f into the tree by va.  If there
    772       1.1   thorpej  *	already is a futex for its va, acquire a reference to it, and
    773       1.1   thorpej  *	store it in *fp; otherwise store f in *fp.
    774       1.1   thorpej  *
    775       1.1   thorpej  *	Return 0 on success, ENFILE if there already is a futex but its
    776       1.1   thorpej  *	reference count is too high.
    777       1.1   thorpej  */
    778       1.1   thorpej static int
    779       1.1   thorpej futex_insert(struct futex *f, struct futex **fp)
    780       1.1   thorpej {
    781       1.1   thorpej 	struct futex *f0;
    782       1.1   thorpej 	int error;
    783       1.1   thorpej 
    784       1.1   thorpej 	KASSERT(atomic_load_relaxed(&f->fx_refcnt) != 0);
    785       1.1   thorpej 	KASSERT(!f->fx_on_tree);
    786       1.1   thorpej 
    787       1.1   thorpej 	mutex_enter(&futex_tab.lock);
    788       1.1   thorpej 	if (__predict_false(f->fx_shared))
    789       1.1   thorpej 		f0 = rb_tree_insert_node(&futex_tab.oa, f);
    790       1.1   thorpej 	else
    791       1.1   thorpej 		f0 = rb_tree_insert_node(&futex_tab.va, f);
    792       1.1   thorpej 	if (f0 == f) {
    793       1.1   thorpej 		f->fx_on_tree = true;
    794       1.1   thorpej 		error = 0;
    795       1.1   thorpej 	} else {
    796       1.1   thorpej 		KASSERT(atomic_load_relaxed(&f0->fx_refcnt) != 0);
    797       1.1   thorpej 		KASSERT(f0->fx_on_tree);
    798       1.1   thorpej 		error = futex_hold(f0);
    799       1.1   thorpej 		if (error)
    800       1.1   thorpej 			goto out;
    801       1.1   thorpej 	}
    802       1.1   thorpej 	*fp = f0;
    803       1.1   thorpej out:	mutex_exit(&futex_tab.lock);
    804       1.1   thorpej 
    805       1.1   thorpej 	return error;
    806       1.1   thorpej }
    807       1.1   thorpej 
    808       1.1   thorpej /*
    809  1.11.2.1   thorpej  * futex_lookup(uaddr, shared, class, &f)
    810       1.1   thorpej  *
    811       1.1   thorpej  *	Find a futex at the userland pointer uaddr in the current
    812       1.1   thorpej  *	process's VM space.  On success, return the futex in f and
    813       1.1   thorpej  *	increment its reference count.
    814       1.1   thorpej  *
    815       1.5  riastrad  *	Caller must call futex_rele when done.
    816       1.1   thorpej  */
    817       1.1   thorpej static int
    818  1.11.2.1   thorpej futex_lookup(int *uaddr, bool shared, uint8_t class, struct futex **fp)
    819       1.1   thorpej {
    820       1.1   thorpej 	union futex_key fk;
    821       1.1   thorpej 	struct vmspace *vm = curproc->p_vmspace;
    822       1.1   thorpej 	vaddr_t va = (vaddr_t)uaddr;
    823       1.1   thorpej 	int error;
    824       1.1   thorpej 
    825       1.1   thorpej 	/*
    826       1.1   thorpej 	 * Reject unaligned user pointers so we don't cross page
    827       1.1   thorpej 	 * boundaries and so atomics will work.
    828       1.1   thorpej 	 */
    829  1.11.2.1   thorpej 	if (__predict_false((va & 3) != 0))
    830       1.1   thorpej 		return EINVAL;
    831       1.1   thorpej 
    832       1.1   thorpej 	/* Look it up. */
    833       1.1   thorpej 	error = futex_key_init(&fk, vm, va, shared);
    834       1.1   thorpej 	if (error)
    835       1.1   thorpej 		return error;
    836       1.1   thorpej 
    837  1.11.2.1   thorpej 	error = futex_lookup_by_key(&fk, shared, class, fp);
    838       1.1   thorpej 	futex_key_fini(&fk, shared);
    839       1.1   thorpej 	if (error)
    840       1.1   thorpej 		return error;
    841       1.1   thorpej 
    842       1.1   thorpej 	KASSERT(*fp == NULL || (*fp)->fx_shared == shared);
    843  1.11.2.1   thorpej 	KASSERT(*fp == NULL || (*fp)->fx_class == class);
    844       1.1   thorpej 	KASSERT(*fp == NULL || atomic_load_relaxed(&(*fp)->fx_refcnt) != 0);
    845       1.1   thorpej 
    846       1.1   thorpej 	/*
    847       1.1   thorpej 	 * Success!  (Caller must still check whether we found
    848       1.1   thorpej 	 * anything, but nothing went _wrong_ like trying to use
    849       1.1   thorpej 	 * unmapped memory.)
    850       1.1   thorpej 	 */
    851       1.1   thorpej 	KASSERT(error == 0);
    852       1.1   thorpej 
    853       1.1   thorpej 	return error;
    854       1.1   thorpej }
    855       1.1   thorpej 
    856       1.1   thorpej /*
    857  1.11.2.1   thorpej  * futex_lookup_create(uaddr, shared, class, &f)
    858       1.1   thorpej  *
    859       1.1   thorpej  *	Find or create a futex at the userland pointer uaddr in the
    860       1.1   thorpej  *	current process's VM space.  On success, return the futex in f
    861       1.1   thorpej  *	and increment its reference count.
    862       1.1   thorpej  *
    863       1.5  riastrad  *	Caller must call futex_rele when done.
    864       1.1   thorpej  */
    865       1.1   thorpej static int
    866  1.11.2.1   thorpej futex_lookup_create(int *uaddr, bool shared, uint8_t class, struct futex **fp)
    867       1.1   thorpej {
    868       1.1   thorpej 	union futex_key fk;
    869       1.1   thorpej 	struct vmspace *vm = curproc->p_vmspace;
    870       1.1   thorpej 	struct futex *f = NULL;
    871       1.1   thorpej 	vaddr_t va = (vaddr_t)uaddr;
    872       1.1   thorpej 	int error;
    873       1.1   thorpej 
    874       1.1   thorpej 	/*
    875       1.1   thorpej 	 * Reject unaligned user pointers so we don't cross page
    876       1.1   thorpej 	 * boundaries and so atomics will work.
    877       1.1   thorpej 	 */
    878  1.11.2.1   thorpej 	if (__predict_false((va & 3) != 0))
    879  1.11.2.1   thorpej 		return EINVAL;
    880  1.11.2.1   thorpej 
    881  1.11.2.1   thorpej 	if (__predict_false(class == FUTEX_CLASS_ANY))
    882       1.1   thorpej 		return EINVAL;
    883       1.1   thorpej 
    884       1.1   thorpej 	error = futex_key_init(&fk, vm, va, shared);
    885       1.1   thorpej 	if (error)
    886       1.1   thorpej 		return error;
    887       1.1   thorpej 
    888       1.1   thorpej 	/*
    889       1.1   thorpej 	 * Optimistically assume there already is one, and try to find
    890       1.1   thorpej 	 * it.
    891       1.1   thorpej 	 */
    892  1.11.2.1   thorpej 	error = futex_lookup_by_key(&fk, shared, class, fp);
    893       1.1   thorpej 	if (error || *fp != NULL) {
    894       1.1   thorpej 		/*
    895       1.1   thorpej 		 * We either found one, or there was an error.
    896       1.1   thorpej 		 * In either case, we are done with the key.
    897       1.1   thorpej 		 */
    898       1.1   thorpej 		futex_key_fini(&fk, shared);
    899       1.1   thorpej 		goto out;
    900       1.1   thorpej 	}
    901       1.1   thorpej 
    902       1.1   thorpej 	/*
     903       1.1   thorpej 	 * Create a futex record.  This transfers ownership of the key
    904       1.1   thorpej 	 * in all cases.
    905       1.1   thorpej 	 */
    906  1.11.2.1   thorpej 	f = futex_create(&fk, shared, class);
    907       1.1   thorpej 	if (f == NULL) {
    908       1.1   thorpej 		error = ENOMEM;
    909       1.1   thorpej 		goto out;
    910       1.1   thorpej 	}
    911       1.1   thorpej 
    912       1.1   thorpej 	/*
    913       1.1   thorpej 	 * Insert our new futex, or use existing if someone else beat
    914       1.1   thorpej 	 * us to it.
    915       1.1   thorpej 	 */
    916       1.1   thorpej 	error = futex_insert(f, fp);
    917       1.1   thorpej 	if (error)
    918       1.1   thorpej 		goto out;
    919       1.1   thorpej 	if (*fp == f)
    920       1.1   thorpej 		f = NULL;	/* don't release on exit */
    921       1.1   thorpej 
    922       1.1   thorpej 	/* Success!  */
    923       1.1   thorpej 	KASSERT(error == 0);
    924       1.1   thorpej 
    925       1.1   thorpej out:	if (f != NULL)
    926       1.1   thorpej 		futex_rele(f);
    927       1.1   thorpej 	KASSERT(error || *fp != NULL);
    928       1.1   thorpej 	KASSERT(error || atomic_load_relaxed(&(*fp)->fx_refcnt) != 0);
    929       1.1   thorpej 	return error;
    930       1.1   thorpej }
    931       1.1   thorpej 
    932       1.1   thorpej /*
    933  1.11.2.1   thorpej  * futex_unsleep:
    934       1.1   thorpej  *
    935  1.11.2.1   thorpej  *	Remove an LWP from the futex and sleep queue.  This is called when
    936  1.11.2.1   thorpej  *	the LWP has not been awoken normally but instead interrupted: for
    937  1.11.2.1   thorpej  *	example, when a signal is received.  Must be called with the LWP
    938  1.11.2.1   thorpej  *	locked.  Will unlock if "unlock" is true.
    939       1.1   thorpej  */
    940       1.1   thorpej static void
    941  1.11.2.1   thorpej futex_unsleep(lwp_t *l, bool unlock)
    942       1.1   thorpej {
    943  1.11.2.1   thorpej 	struct futex *f __diagused;
    944       1.1   thorpej 
    945  1.11.2.1   thorpej 	f = (struct futex *)(uintptr_t)l->l_wchan;
    946       1.6  riastrad 
    947  1.11.2.1   thorpej 	KASSERT(mutex_owned(f->fx_sq_lock));
    948       1.1   thorpej 
    949  1.11.2.1   thorpej 	/* WRITERQ is more likely */
    950  1.11.2.1   thorpej 	if (__predict_true(l->l_sleepq == &f->fx_sqs[FUTEX_WRITERQ])) {
    951  1.11.2.1   thorpej 		KASSERT(f->fx_nwaiters[FUTEX_WRITERQ] > 0);
    952  1.11.2.1   thorpej 		f->fx_nwaiters[FUTEX_WRITERQ]--;
    953  1.11.2.1   thorpej 	} else {
    954  1.11.2.1   thorpej 		KASSERT(l->l_sleepq == &f->fx_sqs[FUTEX_READERQ]);
    955  1.11.2.1   thorpej 		KASSERT(f->fx_nwaiters[FUTEX_READERQ] > 0);
    956  1.11.2.1   thorpej 		f->fx_nwaiters[FUTEX_READERQ]--;
    957  1.11.2.1   thorpej 	}
    958       1.1   thorpej 
    959  1.11.2.1   thorpej 	sleepq_unsleep(l, unlock);
    960       1.1   thorpej }
    961       1.1   thorpej 
    962       1.1   thorpej /*
    963  1.11.2.1   thorpej  * futex_wait(f, timeout, clkid, bitset)
    964       1.1   thorpej  *
    965  1.11.2.1   thorpej  *	Wait until deadline on the clock clkid, or forever if deadline
    966  1.11.2.1   thorpej  *	is NULL, for a futex wakeup.  Return 0 on explicit wakeup,
    967  1.11.2.1   thorpej  *	ETIMEDOUT on timeout, EINTR on signal.
    968       1.1   thorpej  */
    969  1.11.2.1   thorpej static int
    970  1.11.2.1   thorpej futex_wait(struct futex *f, int q, const struct timespec *deadline,
    971  1.11.2.1   thorpej     clockid_t clkid, int bitset)
    972       1.1   thorpej {
    973       1.1   thorpej 
    974       1.1   thorpej 	/*
    975  1.11.2.1   thorpej 	 * Some things to note about how this function works:
    976  1.11.2.1   thorpej 	 *
    977  1.11.2.1   thorpej 	 * ==> When we enter futex_wait(), the futex's op lock is
    978  1.11.2.1   thorpej 	 * held, preventing us from missing wakes.  Once we are on
    979  1.11.2.1   thorpej 	 * the futex's sleepq, it is safe to release the op lock.
    980  1.11.2.1   thorpej 	 *
    981  1.11.2.1   thorpej 	 * ==> We have a single early escape to avoid calling
    982  1.11.2.1   thorpej 	 * sleepq_block(): our deadline already having passed.
    983  1.11.2.1   thorpej 	 * If this is a no-timeout wait, or if the deadline has
    984  1.11.2.1   thorpej 	 * not already passed, we are guaranteed to call sleepq_block().
    985  1.11.2.1   thorpej 	 *
    986  1.11.2.1   thorpej 	 * ==> sleepq_block() contains all of the necessary logic
    987  1.11.2.1   thorpej 	 * for handling signals; we do not need to check for them
    988  1.11.2.1   thorpej 	 * ourselves.
    989  1.11.2.1   thorpej 	 *
    990  1.11.2.1   thorpej 	 * ==> Once we have blocked, other futex operations will
    991  1.11.2.1   thorpej 	 * be able to wake us or requeue us.  Until we have blocked,
    992  1.11.2.1   thorpej 	 * those other futex operations will not be able to acquire
    993  1.11.2.1   thorpej 	 * the sleepq locks in order to perform those operations,
    994  1.11.2.1   thorpej 	 * even if we have dropped the op lock.  Thus, we will not
    995  1.11.2.1   thorpej 	 * miss wakes.  This fundamental invariant is relied upon by
    996  1.11.2.1   thorpej 	 * every other synchronization object in the kernel.
    997  1.11.2.1   thorpej 	 *
    998  1.11.2.1   thorpej 	 * ==> sleepq_block() returns for one of three reasons:
    999  1.11.2.1   thorpej 	 *
   1000  1.11.2.1   thorpej 	 *	-- We were awakened.
   1001  1.11.2.1   thorpej 	 *	-- We were signaled.
   1002  1.11.2.1   thorpej 	 *	-- We timed out.
   1003  1.11.2.1   thorpej 	 *
   1004  1.11.2.1   thorpej 	 * ==> Once sleepq_block() returns, we will not call
   1005  1.11.2.1   thorpej 	 * sleepq_block() again.
   1006  1.11.2.1   thorpej 	 *
   1007  1.11.2.1   thorpej 	 * ==> If sleepq_block() returns due to being signaled,
   1008  1.11.2.1   thorpej 	 * we must check to see if we were also awakened.  This
   1009  1.11.2.1   thorpej 	 * is to handle the following sequence:
   1010  1.11.2.1   thorpej 	 *
   1011  1.11.2.1   thorpej 	 *	-- Futex wake takes us off sleepq, places us on runq.
   1012  1.11.2.1   thorpej 	 *	-- We are signaled while sitting on the runq.
   1013  1.11.2.1   thorpej 	 *	-- We run, and sleepq_block() notices the signal and
   1014  1.11.2.1   thorpej 	 *	   returns  EINTR/ERESTART.
   1015  1.11.2.1   thorpej 	 *
   1016  1.11.2.1   thorpej 	 * In this situation, we want to indicate a successful wake
   1017  1.11.2.1   thorpej 	 * to the caller, because that wake has been reported to the
   1018  1.11.2.1   thorpej 	 * thread that issued it.
   1019  1.11.2.1   thorpej 	 *
   1020  1.11.2.1   thorpej 	 * Note that it is NOT possible for a wake to be issued after
   1021  1.11.2.1   thorpej 	 * a signal; the LWP will already have been taken off the sleepq
   1022  1.11.2.1   thorpej 	 * by the signal, so the wake will never happen.  Note that for
   1023  1.11.2.1   thorpej 	 * this reason, we must *never* return ERESTART, because it could
   1024  1.11.2.1   thorpej 	 * result in us waiting again with no one to awaken us.
   1025       1.1   thorpej 	 */
   1026       1.1   thorpej 
   1027  1.11.2.1   thorpej 	struct lwp * const l = curlwp;
   1028  1.11.2.1   thorpej 	int timo;
   1029  1.11.2.1   thorpej 	int error;
   1030       1.1   thorpej 
   1031       1.4  riastrad 	/*
   1032  1.11.2.1   thorpej 	 * Compute our timeout before taking any locks.
   1033       1.4  riastrad 	 */
   1034  1.11.2.1   thorpej 	if (deadline) {
   1035  1.11.2.1   thorpej 		struct timespec ts;
   1036       1.4  riastrad 
   1037  1.11.2.1   thorpej 		/* Check our watch.  */
   1038  1.11.2.1   thorpej 		error = clock_gettime1(clkid, &ts);
   1039  1.11.2.1   thorpej 		if (error) {
   1040  1.11.2.1   thorpej 			mutex_exit(&f->fx_op_lock);
   1041  1.11.2.1   thorpej 			return error;
   1042  1.11.2.1   thorpej 		}
   1043       1.1   thorpej 
   1044  1.11.2.1   thorpej 		/*
   1045  1.11.2.1   thorpej 		 * If we're past the deadline, ETIMEDOUT.
   1046  1.11.2.1   thorpej 		 */
   1047  1.11.2.1   thorpej 		if (timespeccmp(deadline, &ts, <=)) {
   1048  1.11.2.1   thorpej 			mutex_exit(&f->fx_op_lock);
   1049  1.11.2.1   thorpej 			return ETIMEDOUT;
   1050  1.11.2.1   thorpej 		} else {
   1051  1.11.2.1   thorpej 			/* Count how much time is left.  */
   1052  1.11.2.1   thorpej 			timespecsub(deadline, &ts, &ts);
   1053  1.11.2.1   thorpej 			timo = tstohz(&ts);
   1054  1.11.2.1   thorpej 			if (timo == 0) {
   1055  1.11.2.1   thorpej 				/*
   1056  1.11.2.1   thorpej 				 * tstohz() already rounds up, and
   1057  1.11.2.1   thorpej 				 * a return value of 0 ticks means
   1058  1.11.2.1   thorpej 				 * "expired now or in the past".
   1059  1.11.2.1   thorpej 				 */
   1060  1.11.2.1   thorpej 				mutex_exit(&f->fx_op_lock);
   1061  1.11.2.1   thorpej 				return ETIMEDOUT;
   1062  1.11.2.1   thorpej 			}
   1063  1.11.2.1   thorpej 		}
   1064  1.11.2.1   thorpej 	} else {
   1065  1.11.2.1   thorpej 		timo = 0;
   1066  1.11.2.1   thorpej 	}
   1067       1.1   thorpej 
   1068       1.1   thorpej 	/*
   1069  1.11.2.1   thorpej 	 * Acquire in sleepq -> lwp order.  While we're at it, ensure
   1070  1.11.2.1   thorpej 	 * that there's room for us to block.
   1071       1.1   thorpej 	 */
   1072  1.11.2.1   thorpej 	mutex_spin_enter(f->fx_sq_lock);
   1073  1.11.2.1   thorpej 	if (__predict_false(q == FUTEX_READERQ &&
   1074  1.11.2.1   thorpej 			    f->fx_nwaiters[q] == FUTEX_TID_MASK)) {
   1075  1.11.2.1   thorpej 		mutex_spin_exit(f->fx_sq_lock);
   1076  1.11.2.1   thorpej 		mutex_exit(&f->fx_op_lock);
   1077  1.11.2.1   thorpej 		return ENFILE;
   1078  1.11.2.1   thorpej 	}
   1079  1.11.2.1   thorpej 	lwp_lock(l);
   1080       1.4  riastrad 
   1081       1.4  riastrad 	/*
   1082  1.11.2.1   thorpej 	 * No need for locks here; until we're on a futex's sleepq, these
   1083  1.11.2.1   thorpej 	 * values are private to the LWP, and the locks needed to get onto
   1084  1.11.2.1   thorpej 	 * the sleepq provide the memory barriers needed to ensure visibility.
   1085       1.4  riastrad 	 */
   1086  1.11.2.1   thorpej 	l->l_futex = f;
   1087  1.11.2.1   thorpej 	l->l_futex_wakesel = bitset;
   1088       1.4  riastrad 
   1089       1.4  riastrad 	/*
   1090  1.11.2.1   thorpej 	 * This is an inlined bit of sleepq_enter();
   1091  1.11.2.1   thorpej 	 * we already hold the lwp lock.
   1092       1.4  riastrad 	 */
   1093  1.11.2.1   thorpej 	lwp_unlock_to(l, f->fx_sq_lock);
   1094  1.11.2.1   thorpej 	KERNEL_UNLOCK_ALL(NULL, &l->l_biglocks);
   1095  1.11.2.1   thorpej 	KASSERT(lwp_locked(l, f->fx_sq_lock));
   1096       1.1   thorpej 
   1097  1.11.2.1   thorpej 	sleepq_enqueue(&f->fx_sqs[q], f, futex_wmesg, &futex_syncobj, true);
   1098  1.11.2.1   thorpej 	f->fx_nwaiters[q]++;
   1099       1.4  riastrad 
   1100  1.11.2.1   thorpej 	/* We can now safely release the op lock. */
   1101  1.11.2.1   thorpej 	mutex_exit(&f->fx_op_lock);
   1102      1.11  riastrad 
   1103  1.11.2.1   thorpej 	SDT_PROBE1(futex, wait, sleepq_block, entry, timo);
   1104  1.11.2.1   thorpej 	error = sleepq_block(timo, true);
   1105  1.11.2.1   thorpej 	SDT_PROBE1(futex, wait, sleepq_block, exit, error);
   1106      1.11  riastrad 
   1107  1.11.2.1   thorpej 	/*
   1108  1.11.2.1   thorpej 	 * f is no longer valid; we may have been moved to another
   1109  1.11.2.1   thorpej 	 * futex sleepq while we slept.
   1110  1.11.2.1   thorpej 	 */
   1111       1.4  riastrad 
   1112       1.4  riastrad 	/*
   1113  1.11.2.1   thorpej 	 * If something went wrong, we may still have a futex reference.
   1114  1.11.2.1   thorpej 	 * Check for that here and drop it if so.  We can do this without
   1115  1.11.2.1   thorpej 	 * taking any locks because l->l_futex is private to the LWP when
   1116  1.11.2.1   thorpej 	 * not on any futex's sleepq, and we are not on any sleepq because
   1117  1.11.2.1   thorpej 	 * we are running.
   1118  1.11.2.1   thorpej 	 *
   1119  1.11.2.1   thorpej 	 * Convert EWOULDBLOCK to ETIMEDOUT in case sleepq_block() returned
   1120  1.11.2.1   thorpej 	 * EWOULDBLOCK, and ensure the only other error we return is EINTR.
   1121       1.4  riastrad 	 */
   1122       1.4  riastrad 	if (error) {
   1123  1.11.2.1   thorpej 		f = l->l_futex;
   1124  1.11.2.1   thorpej 		if (f != NULL) {
   1125  1.11.2.1   thorpej 			SDT_PROBE0(futex, wait, finish, aborted);
   1126  1.11.2.1   thorpej 			l->l_futex = NULL;
   1127  1.11.2.1   thorpej 			futex_rele(f);
   1128  1.11.2.1   thorpej 		} else {
   1129  1.11.2.1   thorpej 			/* Raced with wakeup; report success. */
   1130  1.11.2.1   thorpej 			SDT_PROBE0(futex, wait, finish, wakerace);
   1131  1.11.2.1   thorpej 			error = 0;
   1132  1.11.2.1   thorpej 		}
   1133       1.4  riastrad 		if (error == EWOULDBLOCK)
   1134       1.4  riastrad 			error = ETIMEDOUT;
    1135  1.11.2.1   thorpej 		else if (error != 0 && error != ETIMEDOUT)
   1136  1.11.2.1   thorpej 			error = EINTR;
   1137  1.11.2.1   thorpej 	} else {
   1138  1.11.2.1   thorpej 		KASSERT(l->l_futex == NULL);
   1139  1.11.2.1   thorpej 		SDT_PROBE0(futex, wait, finish, normally);
   1140       1.4  riastrad 	}
   1141       1.4  riastrad 
   1142       1.1   thorpej 	return error;
   1143       1.1   thorpej }
   1144       1.1   thorpej 
   1145       1.1   thorpej /*
   1146  1.11.2.1   thorpej  * futex_wake(f, q, nwake, f2, q2, nrequeue, bitset)
   1147       1.1   thorpej  *
   1148       1.1   thorpej  *	Wake up to nwake waiters on f matching bitset; then, if f2 is
   1149       1.1   thorpej  *	provided, move up to nrequeue remaining waiters on f matching
   1150       1.1   thorpej  *	bitset to f2.  Return the number of waiters actually woken.
    1151       1.1   thorpej  *	Caller must hold the op locks of f and f2, if provided.
   1152       1.1   thorpej  */
   1153       1.1   thorpej static unsigned
   1154  1.11.2.1   thorpej futex_wake(struct futex *f, int q, unsigned int const nwake,
   1155  1.11.2.1   thorpej     struct futex *f2, int q2, unsigned int const nrequeue,
   1156  1.11.2.1   thorpej     int bitset)
   1157       1.1   thorpej {
   1158  1.11.2.1   thorpej 	struct lwp *l, *l_next;
   1159       1.1   thorpej 	unsigned nwoken = 0;
   1160  1.11.2.1   thorpej 	unsigned nrequeued = 0;
   1161  1.11.2.1   thorpej 
   1162  1.11.2.1   thorpej 	KASSERT(mutex_owned(&f->fx_op_lock));
   1163  1.11.2.1   thorpej 	KASSERT(f2 == NULL || mutex_owned(&f2->fx_op_lock));
   1164       1.1   thorpej 
   1165  1.11.2.1   thorpej 	futex_sq_lock2(f, f2);
   1166       1.1   thorpej 
   1167       1.1   thorpej 	/* Wake up to nwake waiters, and count the number woken.  */
   1168  1.11.2.1   thorpej 	LIST_FOREACH_SAFE(l, &f->fx_sqs[q], l_sleepchain, l_next) {
   1169  1.11.2.1   thorpej 		if (nwoken == nwake)
   1170       1.1   thorpej 			break;
   1171  1.11.2.1   thorpej 
   1172  1.11.2.1   thorpej 		KASSERT(l->l_syncobj == &futex_syncobj);
   1173  1.11.2.1   thorpej 		KASSERT(l->l_wchan == (wchan_t)f);
   1174  1.11.2.1   thorpej 		KASSERT(l->l_futex == f);
   1175  1.11.2.1   thorpej 		KASSERT(l->l_sleepq == &f->fx_sqs[q]);
   1176  1.11.2.1   thorpej 		KASSERT(l->l_mutex == f->fx_sq_lock);
   1177  1.11.2.1   thorpej 
   1178  1.11.2.1   thorpej 		if ((l->l_futex_wakesel & bitset) == 0)
   1179  1.11.2.1   thorpej 			continue;
   1180  1.11.2.1   thorpej 
   1181  1.11.2.1   thorpej 		l->l_futex_wakesel = 0;
   1182  1.11.2.1   thorpej 		l->l_futex = NULL;
   1183  1.11.2.1   thorpej 		sleepq_remove(&f->fx_sqs[q], l);
   1184  1.11.2.1   thorpej 		f->fx_nwaiters[q]--;
   1185  1.11.2.1   thorpej 		nwoken++;
   1186  1.11.2.1   thorpej 		/* hold counts adjusted in bulk below */
   1187       1.1   thorpej 	}
   1188       1.1   thorpej 
   1189  1.11.2.1   thorpej 	if (nrequeue) {
   1190  1.11.2.1   thorpej 		KASSERT(f2 != NULL);
   1191       1.1   thorpej 		/* Move up to nrequeue waiters from f's queue to f2's queue. */
   1192  1.11.2.1   thorpej 		LIST_FOREACH_SAFE(l, &f->fx_sqs[q], l_sleepchain, l_next) {
   1193  1.11.2.1   thorpej 			if (nrequeued == nrequeue)
   1194       1.1   thorpej 				break;
   1195  1.11.2.1   thorpej 
   1196  1.11.2.1   thorpej 			KASSERT(l->l_syncobj == &futex_syncobj);
   1197  1.11.2.1   thorpej 			KASSERT(l->l_wchan == (wchan_t)f);
   1198  1.11.2.1   thorpej 			KASSERT(l->l_futex == f);
   1199  1.11.2.1   thorpej 			KASSERT(l->l_sleepq == &f->fx_sqs[q]);
   1200  1.11.2.1   thorpej 			KASSERT(l->l_mutex == f->fx_sq_lock);
   1201  1.11.2.1   thorpej 
   1202  1.11.2.1   thorpej 			if ((l->l_futex_wakesel & bitset) == 0)
   1203  1.11.2.1   thorpej 				continue;
   1204  1.11.2.1   thorpej 
   1205  1.11.2.1   thorpej 			l->l_futex = f2;
   1206  1.11.2.1   thorpej 			sleepq_transfer(l, &f->fx_sqs[q],
   1207  1.11.2.1   thorpej 			    &f2->fx_sqs[q2], f2, futex_wmesg,
   1208  1.11.2.1   thorpej 			    &futex_syncobj, f2->fx_sq_lock, true);
   1209  1.11.2.1   thorpej 			f->fx_nwaiters[q]--;
   1210  1.11.2.1   thorpej 			f2->fx_nwaiters[q2]++;
   1211  1.11.2.1   thorpej 			nrequeued++;
   1212  1.11.2.1   thorpej 			/* hold counts adjusted in bulk below */
   1213  1.11.2.1   thorpej 		}
   1214  1.11.2.1   thorpej 		if (nrequeued) {
   1215  1.11.2.1   thorpej 			/* XXX futex_hold() could theoretically fail here. */
   1216  1.11.2.1   thorpej 			int hold_error __diagused =
   1217  1.11.2.1   thorpej 			    futex_hold_count(f2, nrequeued);
   1218  1.11.2.1   thorpej 			KASSERT(hold_error == 0);
   1219       1.1   thorpej 		}
   1220       1.1   thorpej 	}
   1221       1.1   thorpej 
   1222  1.11.2.1   thorpej 	/*
   1223  1.11.2.1   thorpej 	 * Adjust the futex reference counts for the wakes and
   1224  1.11.2.1   thorpej 	 * requeues.
   1225  1.11.2.1   thorpej 	 */
   1226  1.11.2.1   thorpej 	KASSERT(nwoken + nrequeued <= LONG_MAX);
   1227  1.11.2.1   thorpej 	futex_rele_count_not_last(f, nwoken + nrequeued);
   1228  1.11.2.1   thorpej 
   1229  1.11.2.1   thorpej 	futex_sq_unlock2(f, f2);
   1230  1.11.2.1   thorpej 
   1231  1.11.2.1   thorpej 	/* Return the number of waiters woken and requeued.  */
   1232  1.11.2.1   thorpej 	return nwoken + nrequeued;
   1233       1.1   thorpej }
   1234       1.1   thorpej 
   1235       1.1   thorpej /*
   1236  1.11.2.1   thorpej  * futex_op_lock(f)
   1237       1.1   thorpej  *
   1238  1.11.2.1   thorpej  *	Acquire the op lock of f.  Pair with futex_op_unlock.  Do
   1239       1.1   thorpej  *	not use if caller needs to acquire two locks; use
   1240  1.11.2.1   thorpej  *	futex_op_lock2 instead.
   1241       1.1   thorpej  */
   1242       1.1   thorpej static void
   1243  1.11.2.1   thorpej futex_op_lock(struct futex *f)
   1244       1.1   thorpej {
   1245  1.11.2.1   thorpej 	mutex_enter(&f->fx_op_lock);
   1246       1.1   thorpej }
   1247       1.1   thorpej 
   1248       1.1   thorpej /*
   1249  1.11.2.1   thorpej  * futex_op_unlock(f)
   1250       1.1   thorpej  *
    1251       1.1   thorpej  *	Release the op lock of f.
   1252       1.1   thorpej  */
   1253       1.1   thorpej static void
   1254  1.11.2.1   thorpej futex_op_unlock(struct futex *f)
   1255       1.1   thorpej {
   1256  1.11.2.1   thorpej 	mutex_exit(&f->fx_op_lock);
   1257       1.1   thorpej }
   1258       1.1   thorpej 
   1259       1.1   thorpej /*
   1260  1.11.2.1   thorpej  * futex_op_lock2(f, f2)
   1261       1.1   thorpej  *
   1262  1.11.2.1   thorpej  *	Acquire the op locks of both f and f2, which may be null, or
   1263  1.11.2.1   thorpej  *	which may be the same.  If they are distinct, an arbitrary total
   1264  1.11.2.1   thorpej  *	order is chosen on the locks.
   1265       1.1   thorpej  *
   1266  1.11.2.1   thorpej  *	Callers should only ever acquire multiple op locks
   1267  1.11.2.1   thorpej  *	simultaneously using futex_op_lock2.
   1268       1.1   thorpej  */
   1269       1.1   thorpej static void
   1270  1.11.2.1   thorpej futex_op_lock2(struct futex *f, struct futex *f2)
   1271       1.1   thorpej {
   1272       1.1   thorpej 
   1273       1.1   thorpej 	/*
   1274       1.1   thorpej 	 * If both are null, do nothing; if one is null and the other
   1275       1.1   thorpej 	 * is not, lock the other and be done with it.
   1276       1.1   thorpej 	 */
   1277       1.1   thorpej 	if (f == NULL && f2 == NULL) {
   1278       1.1   thorpej 		return;
   1279       1.1   thorpej 	} else if (f == NULL) {
   1280  1.11.2.1   thorpej 		mutex_enter(&f2->fx_op_lock);
   1281       1.1   thorpej 		return;
   1282       1.1   thorpej 	} else if (f2 == NULL) {
   1283  1.11.2.1   thorpej 		mutex_enter(&f->fx_op_lock);
   1284       1.1   thorpej 		return;
   1285       1.1   thorpej 	}
   1286       1.1   thorpej 
   1287       1.1   thorpej 	/* If both futexes are the same, acquire only one. */
   1288       1.1   thorpej 	if (f == f2) {
   1289  1.11.2.1   thorpej 		mutex_enter(&f->fx_op_lock);
   1290       1.1   thorpej 		return;
   1291       1.1   thorpej 	}
   1292       1.1   thorpej 
   1293       1.1   thorpej 	/* Otherwise, use the ordering on the kva of the futex pointer.  */
   1294       1.1   thorpej 	if ((uintptr_t)f < (uintptr_t)f2) {
   1295  1.11.2.1   thorpej 		mutex_enter(&f->fx_op_lock);
   1296  1.11.2.1   thorpej 		mutex_enter(&f2->fx_op_lock);
   1297       1.1   thorpej 	} else {
   1298  1.11.2.1   thorpej 		mutex_enter(&f2->fx_op_lock);
   1299  1.11.2.1   thorpej 		mutex_enter(&f->fx_op_lock);
   1300       1.1   thorpej 	}
   1301       1.1   thorpej }
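
/*
 * Editorial sketch (not part of the original source) of the ABBA
 * deadlock that the kva ordering above avoids.  If two threads could
 * take a pair of op locks in opposite orders, e.g.
 *
 *	// thread A (hypothetical)        // thread B (hypothetical)
 *	mutex_enter(&f->fx_op_lock);      mutex_enter(&f2->fx_op_lock);
 *	mutex_enter(&f2->fx_op_lock);     mutex_enter(&f->fx_op_lock);
 *
 * each could block holding the lock the other needs.  Sorting by the
 * futex pointer guarantees every caller of futex_op_lock2() acquires
 * a given pair in the same order, so this interleaving cannot occur.
 */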
   1302       1.1   thorpej 
   1303       1.1   thorpej /*
   1304  1.11.2.1   thorpej  * futex_op_unlock2(f, f2)
   1305       1.1   thorpej  *
    1306       1.1   thorpej  *	Release the op locks of both f and f2, which may be null, or
    1307       1.1   thorpej  *	which may be the same.
   1308       1.1   thorpej  */
   1309       1.1   thorpej static void
   1310  1.11.2.1   thorpej futex_op_unlock2(struct futex *f, struct futex *f2)
   1311       1.1   thorpej {
   1312       1.1   thorpej 
   1313       1.1   thorpej 	/*
   1314       1.1   thorpej 	 * If both are null, do nothing; if one is null and the other
   1315       1.1   thorpej 	 * is not, unlock the other and be done with it.
   1316       1.1   thorpej 	 */
   1317       1.1   thorpej 	if (f == NULL && f2 == NULL) {
   1318       1.1   thorpej 		return;
   1319       1.1   thorpej 	} else if (f == NULL) {
   1320  1.11.2.1   thorpej 		mutex_exit(&f2->fx_op_lock);
   1321       1.1   thorpej 		return;
   1322       1.1   thorpej 	} else if (f2 == NULL) {
   1323  1.11.2.1   thorpej 		mutex_exit(&f->fx_op_lock);
   1324       1.1   thorpej 		return;
   1325       1.1   thorpej 	}
   1326       1.1   thorpej 
   1327       1.1   thorpej 	/* If both futexes are the same, release only one. */
   1328       1.1   thorpej 	if (f == f2) {
   1329  1.11.2.1   thorpej 		mutex_exit(&f->fx_op_lock);
   1330       1.1   thorpej 		return;
   1331       1.1   thorpej 	}
   1332       1.1   thorpej 
   1333       1.1   thorpej 	/* Otherwise, use the ordering on the kva of the futex pointer.  */
   1334       1.1   thorpej 	if ((uintptr_t)f < (uintptr_t)f2) {
   1335  1.11.2.1   thorpej 		mutex_exit(&f2->fx_op_lock);
   1336  1.11.2.1   thorpej 		mutex_exit(&f->fx_op_lock);
   1337       1.1   thorpej 	} else {
   1338  1.11.2.1   thorpej 		mutex_exit(&f->fx_op_lock);
   1339  1.11.2.1   thorpej 		mutex_exit(&f2->fx_op_lock);
   1340       1.1   thorpej 	}
   1341       1.1   thorpej }
   1342       1.1   thorpej 
   1343       1.1   thorpej /*
   1344       1.1   thorpej  * futex_func_wait(uaddr, val, val3, timeout, clkid, clkflags, retval)
   1345       1.1   thorpej  *
   1346  1.11.2.1   thorpej  *	Implement futex(FUTEX_WAIT) and futex(FUTEX_WAIT_BITSET).
   1347       1.1   thorpej  */
   1348       1.1   thorpej static int
   1349       1.1   thorpej futex_func_wait(bool shared, int *uaddr, int val, int val3,
   1350      1.11  riastrad     const struct timespec *timeout, clockid_t clkid, int clkflags,
   1351       1.1   thorpej     register_t *retval)
   1352       1.1   thorpej {
   1353       1.1   thorpej 	struct futex *f;
   1354      1.11  riastrad 	struct timespec ts;
   1355      1.11  riastrad 	const struct timespec *deadline;
   1356       1.1   thorpej 	int error;
   1357       1.1   thorpej 
   1358       1.6  riastrad 	/*
   1359       1.6  riastrad 	 * If there's nothing to wait for, and nobody will ever wake
   1360       1.6  riastrad 	 * us, then don't set anything up to wait -- just stop here.
   1361       1.6  riastrad 	 */
   1362       1.6  riastrad 	if (val3 == 0)
   1363       1.7  riastrad 		return EINVAL;
   1364       1.6  riastrad 
   1365       1.1   thorpej 	/* Optimistically test before anything else.  */
   1366       1.1   thorpej 	if (!futex_test(uaddr, val))
   1367       1.1   thorpej 		return EAGAIN;
   1368       1.1   thorpej 
   1369      1.11  riastrad 	/* Determine a deadline on the specified clock.  */
   1370      1.11  riastrad 	if (timeout == NULL || (clkflags & TIMER_ABSTIME) == TIMER_ABSTIME) {
   1371      1.11  riastrad 		deadline = timeout;
   1372      1.11  riastrad 	} else {
   1373  1.11.2.1   thorpej 		struct timespec interval = *timeout;
   1374  1.11.2.1   thorpej 
   1375  1.11.2.1   thorpej 		error = itimespecfix(&interval);
   1376  1.11.2.1   thorpej 		if (error)
   1377  1.11.2.1   thorpej 			return error;
   1378      1.11  riastrad 		error = clock_gettime1(clkid, &ts);
   1379      1.11  riastrad 		if (error)
   1380      1.11  riastrad 			return error;
   1381  1.11.2.1   thorpej 		timespecadd(&ts, &interval, &ts);
   1382      1.11  riastrad 		deadline = &ts;
   1383      1.11  riastrad 	}
   1384      1.11  riastrad 
   1385       1.1   thorpej 	/* Get the futex, creating it if necessary.  */
   1386  1.11.2.1   thorpej 	error = futex_lookup_create(uaddr, shared, FUTEX_CLASS_NORMAL, &f);
   1387       1.1   thorpej 	if (error)
   1388       1.1   thorpej 		return error;
   1389       1.1   thorpej 	KASSERT(f);
   1390       1.1   thorpej 
   1391       1.1   thorpej 	/*
   1392  1.11.2.1   thorpej 	 * Under the op lock, check the value again: if it has
   1393       1.1   thorpej 	 * already changed, EAGAIN; otherwise enqueue the waiter.
   1394       1.1   thorpej 	 * Since FUTEX_WAKE will use the same lock and be done after
   1395       1.1   thorpej 	 * modifying the value, the order in which we check and enqueue
   1396       1.1   thorpej 	 * is immaterial.
   1397       1.1   thorpej 	 */
   1398  1.11.2.1   thorpej 	futex_op_lock(f);
   1399       1.1   thorpej 	if (!futex_test(uaddr, val)) {
   1400  1.11.2.1   thorpej 		futex_op_unlock(f);
   1401       1.1   thorpej 		error = EAGAIN;
   1402       1.1   thorpej 		goto out;
   1403       1.1   thorpej 	}
   1404  1.11.2.1   thorpej 
   1405  1.11.2.1   thorpej 	/*
   1406  1.11.2.1   thorpej 	 * Now wait.  futex_wait() will drop our op lock once we
   1407  1.11.2.1   thorpej 	 * are entered into the sleep queue, thus ensuring atomicity
   1408  1.11.2.1   thorpej 	 * of wakes with respect to waits.
   1409  1.11.2.1   thorpej 	 */
   1410  1.11.2.1   thorpej 	error = futex_wait(f, FUTEX_WRITERQ, deadline, clkid, val3);
   1411       1.1   thorpej 
   1412       1.1   thorpej 	/*
   1413       1.1   thorpej 	 * We cannot drop our reference to the futex here, because
   1414       1.1   thorpej 	 * we might be enqueued on a different one when we are awakened.
   1415  1.11.2.1   thorpej 	 * The references will be managed on our behalf in the requeue,
   1416  1.11.2.1   thorpej 	 * wake, and error cases.
   1417       1.1   thorpej 	 */
   1418       1.1   thorpej 	f = NULL;
   1419       1.1   thorpej 
   1420  1.11.2.1   thorpej 	if (__predict_true(error == 0)) {
   1421  1.11.2.1   thorpej 		/* Return 0 on success, error on failure. */
   1422  1.11.2.1   thorpej 		*retval = 0;
   1423  1.11.2.1   thorpej 	}
   1424       1.1   thorpej 
   1425       1.1   thorpej out:	if (f != NULL)
   1426       1.5  riastrad 		futex_rele(f);
   1427       1.1   thorpej 	return error;
   1428       1.1   thorpej }
   1429       1.1   thorpej 
   1430       1.1   thorpej /*
   1431       1.1   thorpej  * futex_func_wake(uaddr, val, val3, retval)
   1432       1.1   thorpej  *
   1433       1.1   thorpej  *	Implement futex(FUTEX_WAKE) and futex(FUTEX_WAKE_BITSET).
   1434       1.1   thorpej  */
   1435       1.1   thorpej static int
   1436       1.1   thorpej futex_func_wake(bool shared, int *uaddr, int val, int val3, register_t *retval)
   1437       1.1   thorpej {
   1438       1.1   thorpej 	struct futex *f;
   1439       1.1   thorpej 	unsigned int nwoken = 0;
   1440       1.1   thorpej 	int error = 0;
   1441       1.1   thorpej 
   1442       1.1   thorpej 	/* Reject negative number of wakeups.  */
   1443       1.1   thorpej 	if (val < 0) {
   1444       1.1   thorpej 		error = EINVAL;
   1445       1.1   thorpej 		goto out;
   1446       1.1   thorpej 	}
   1447       1.1   thorpej 
   1448       1.1   thorpej 	/* Look up the futex, if any.  */
   1449  1.11.2.1   thorpej 	error = futex_lookup(uaddr, shared, FUTEX_CLASS_NORMAL, &f);
   1450       1.1   thorpej 	if (error)
   1451       1.1   thorpej 		goto out;
   1452       1.1   thorpej 
   1453       1.1   thorpej 	/* If there's no futex, there are no waiters to wake.  */
   1454       1.1   thorpej 	if (f == NULL)
   1455       1.1   thorpej 		goto out;
   1456       1.1   thorpej 
   1457       1.1   thorpej 	/*
   1458  1.11.2.1   thorpej 	 * Under f's op lock, wake the waiters and remember the
   1459       1.1   thorpej 	 * number woken.
   1460       1.1   thorpej 	 */
   1461  1.11.2.1   thorpej 	futex_op_lock(f);
   1462  1.11.2.1   thorpej 	nwoken = futex_wake(f, FUTEX_WRITERQ, val,
   1463  1.11.2.1   thorpej 			    NULL, FUTEX_WRITERQ, 0, val3);
   1464  1.11.2.1   thorpej 	futex_op_unlock(f);
   1465       1.1   thorpej 
   1466       1.1   thorpej 	/* Release the futex.  */
   1467       1.5  riastrad 	futex_rele(f);
   1468       1.1   thorpej 
   1469       1.1   thorpej out:
   1470       1.1   thorpej 	/* Return the number of waiters woken.  */
   1471       1.1   thorpej 	*retval = nwoken;
   1472       1.1   thorpej 
   1473       1.1   thorpej 	/* Success!  */
   1474       1.1   thorpej 	return error;
   1475       1.1   thorpej }
   1476       1.1   thorpej 
   1477       1.1   thorpej /*
   1478       1.1   thorpej  * futex_func_requeue(op, uaddr, val, uaddr2, val2, val3, retval)
   1479       1.1   thorpej  *
   1480       1.1   thorpej  *	Implement futex(FUTEX_REQUEUE) and futex(FUTEX_CMP_REQUEUE).
   1481       1.1   thorpej  */
   1482       1.1   thorpej static int
   1483       1.1   thorpej futex_func_requeue(bool shared, int op, int *uaddr, int val, int *uaddr2,
   1484       1.1   thorpej     int val2, int val3, register_t *retval)
   1485       1.1   thorpej {
   1486       1.1   thorpej 	struct futex *f = NULL, *f2 = NULL;
   1487       1.1   thorpej 	unsigned nwoken = 0;	/* default to zero woken on early return */
   1488       1.1   thorpej 	int error;
   1489       1.1   thorpej 
   1490       1.1   thorpej 	/* Reject negative number of wakeups or requeues. */
   1491       1.1   thorpej 	if (val < 0 || val2 < 0) {
   1492       1.1   thorpej 		error = EINVAL;
   1493       1.1   thorpej 		goto out;
   1494       1.1   thorpej 	}
   1495       1.1   thorpej 
   1496       1.1   thorpej 	/* Look up the source futex, if any. */
   1497  1.11.2.1   thorpej 	error = futex_lookup(uaddr, shared, FUTEX_CLASS_NORMAL, &f);
   1498       1.1   thorpej 	if (error)
   1499       1.1   thorpej 		goto out;
   1500       1.1   thorpej 
   1501       1.1   thorpej 	/* If there is none, nothing to do. */
   1502       1.1   thorpej 	if (f == NULL)
   1503       1.1   thorpej 		goto out;
   1504       1.1   thorpej 
   1505       1.1   thorpej 	/*
   1506       1.1   thorpej 	 * We may need to create the destination futex because it's
   1507       1.1   thorpej 	 * entirely possible it does not currently have any waiters.
   1508       1.1   thorpej 	 */
   1509  1.11.2.1   thorpej 	error = futex_lookup_create(uaddr2, shared, FUTEX_CLASS_NORMAL, &f2);
   1510       1.1   thorpej 	if (error)
   1511       1.1   thorpej 		goto out;
   1512       1.1   thorpej 
   1513       1.1   thorpej 	/*
   1514  1.11.2.1   thorpej 	 * Under the futexes' op locks, check the value; if
   1515       1.1   thorpej 	 * unchanged from val3, wake the waiters.
   1516       1.1   thorpej 	 */
   1517  1.11.2.1   thorpej 	futex_op_lock2(f, f2);
   1518       1.1   thorpej 	if (op == FUTEX_CMP_REQUEUE && !futex_test(uaddr, val3)) {
   1519       1.1   thorpej 		error = EAGAIN;
   1520       1.1   thorpej 	} else {
   1521       1.1   thorpej 		error = 0;
   1522  1.11.2.1   thorpej 		nwoken = futex_wake(f, FUTEX_WRITERQ, val,
   1523  1.11.2.1   thorpej 				    f2, FUTEX_WRITERQ, val2,
   1524  1.11.2.1   thorpej 				    FUTEX_BITSET_MATCH_ANY);
   1525       1.1   thorpej 	}
   1526  1.11.2.1   thorpej 	futex_op_unlock2(f, f2);
   1527       1.1   thorpej 
   1528       1.1   thorpej out:
   1529       1.1   thorpej 	/* Return the number of waiters woken.  */
   1530       1.1   thorpej 	*retval = nwoken;
   1531       1.1   thorpej 
   1532       1.1   thorpej 	/* Release the futexes if we got them.  */
   1533       1.1   thorpej 	if (f2)
   1534       1.5  riastrad 		futex_rele(f2);
   1535       1.1   thorpej 	if (f)
   1536       1.5  riastrad 		futex_rele(f);
   1537       1.1   thorpej 	return error;
   1538       1.1   thorpej }
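
/*
 * Editorial usage sketch (hypothetical, not part of the original
 * source): a classic consumer of FUTEX_CMP_REQUEUE is a condition-
 * variable broadcast, which wakes one waiter and requeues the rest
 * onto the mutex's futex word so they do not all wake up only to
 * block on the mutex again.  The cv/mtx fields and the futex()
 * wrapper below are made-up names for illustration; the argument
 * order mirrors futex_func_requeue() above (nwake, then uaddr2,
 * nrequeue, and the expected value of *uaddr).
 *
 *	int seq = atomic_load_relaxed(&cv->seq);
 *	...
 *	// broadcast: wake 1, requeue the rest onto the mutex, but
 *	// only if cv->seq still holds the value we last observed.
 *	futex(&cv->seq, FUTEX_CMP_REQUEUE, 1, NULL,
 *	    &mtx->word, INT_MAX, seq);
 */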
   1539       1.1   thorpej 
   1540       1.1   thorpej /*
   1541       1.1   thorpej  * futex_validate_op_cmp(val3)
   1542       1.1   thorpej  *
   1543       1.1   thorpej  *	Validate an op/cmp argument for FUTEX_WAKE_OP.
   1544       1.1   thorpej  */
   1545       1.1   thorpej static int
   1546       1.1   thorpej futex_validate_op_cmp(int val3)
   1547       1.1   thorpej {
   1548       1.1   thorpej 	int op = __SHIFTOUT(val3, FUTEX_OP_OP_MASK);
   1549       1.1   thorpej 	int cmp = __SHIFTOUT(val3, FUTEX_OP_CMP_MASK);
   1550       1.1   thorpej 
   1551       1.1   thorpej 	if (op & FUTEX_OP_OPARG_SHIFT) {
   1552       1.1   thorpej 		int oparg = __SHIFTOUT(val3, FUTEX_OP_OPARG_MASK);
   1553       1.1   thorpej 		if (oparg < 0)
   1554       1.1   thorpej 			return EINVAL;
   1555       1.1   thorpej 		if (oparg >= 32)
   1556       1.1   thorpej 			return EINVAL;
   1557       1.1   thorpej 		op &= ~FUTEX_OP_OPARG_SHIFT;
   1558       1.1   thorpej 	}
   1559       1.1   thorpej 
   1560       1.1   thorpej 	switch (op) {
   1561       1.1   thorpej 	case FUTEX_OP_SET:
   1562       1.1   thorpej 	case FUTEX_OP_ADD:
   1563       1.1   thorpej 	case FUTEX_OP_OR:
   1564       1.1   thorpej 	case FUTEX_OP_ANDN:
   1565       1.1   thorpej 	case FUTEX_OP_XOR:
   1566       1.1   thorpej 		break;
   1567       1.1   thorpej 	default:
   1568       1.1   thorpej 		return EINVAL;
   1569       1.1   thorpej 	}
   1570       1.1   thorpej 
   1571       1.1   thorpej 	switch (cmp) {
   1572       1.1   thorpej 	case FUTEX_OP_CMP_EQ:
   1573       1.1   thorpej 	case FUTEX_OP_CMP_NE:
   1574       1.1   thorpej 	case FUTEX_OP_CMP_LT:
   1575       1.1   thorpej 	case FUTEX_OP_CMP_LE:
   1576       1.1   thorpej 	case FUTEX_OP_CMP_GT:
   1577       1.1   thorpej 	case FUTEX_OP_CMP_GE:
   1578       1.1   thorpej 		break;
   1579       1.1   thorpej 	default:
   1580       1.1   thorpej 		return EINVAL;
   1581       1.1   thorpej 	}
   1582       1.1   thorpej 
   1583       1.1   thorpej 	return 0;
   1584       1.1   thorpej }
   1585       1.1   thorpej 
   1586       1.1   thorpej /*
   1587       1.1   thorpej  * futex_compute_op(oldval, val3)
   1588       1.1   thorpej  *
    1589       1.1   thorpej  *	Apply a FUTEX_WAKE_OP operation to oldval.
   1590       1.1   thorpej  */
   1591       1.1   thorpej static int
   1592       1.1   thorpej futex_compute_op(int oldval, int val3)
   1593       1.1   thorpej {
   1594       1.1   thorpej 	int op = __SHIFTOUT(val3, FUTEX_OP_OP_MASK);
   1595       1.1   thorpej 	int oparg = __SHIFTOUT(val3, FUTEX_OP_OPARG_MASK);
   1596       1.1   thorpej 
   1597       1.1   thorpej 	if (op & FUTEX_OP_OPARG_SHIFT) {
   1598       1.1   thorpej 		KASSERT(oparg >= 0);
   1599       1.1   thorpej 		KASSERT(oparg < 32);
   1600       1.1   thorpej 		oparg = 1u << oparg;
   1601       1.1   thorpej 		op &= ~FUTEX_OP_OPARG_SHIFT;
   1602       1.1   thorpej 	}
   1603       1.1   thorpej 
   1604       1.1   thorpej 	switch (op) {
   1605       1.1   thorpej 	case FUTEX_OP_SET:
   1606       1.1   thorpej 		return oparg;
   1607       1.1   thorpej 
   1608       1.1   thorpej 	case FUTEX_OP_ADD:
   1609       1.1   thorpej 		/*
   1610       1.1   thorpej 		 * Avoid signed arithmetic overflow by doing
   1611       1.1   thorpej 		 * arithmetic unsigned and converting back to signed
   1612       1.1   thorpej 		 * at the end.
   1613       1.1   thorpej 		 */
   1614       1.1   thorpej 		return (int)((unsigned)oldval + (unsigned)oparg);
   1615       1.1   thorpej 
   1616       1.1   thorpej 	case FUTEX_OP_OR:
   1617       1.1   thorpej 		return oldval | oparg;
   1618       1.1   thorpej 
   1619       1.1   thorpej 	case FUTEX_OP_ANDN:
   1620       1.1   thorpej 		return oldval & ~oparg;
   1621       1.1   thorpej 
   1622       1.1   thorpej 	case FUTEX_OP_XOR:
   1623       1.1   thorpej 		return oldval ^ oparg;
   1624       1.1   thorpej 
   1625       1.1   thorpej 	default:
   1626       1.1   thorpej 		panic("invalid futex op");
   1627       1.1   thorpej 	}
   1628       1.1   thorpej }
   1629       1.1   thorpej 
   1630       1.1   thorpej /*
   1631       1.1   thorpej  * futex_compute_cmp(oldval, val3)
   1632       1.1   thorpej  *
    1633       1.1   thorpej  *	Apply a FUTEX_WAKE_OP comparison to oldval.
   1634       1.1   thorpej  */
   1635       1.1   thorpej static bool
   1636       1.1   thorpej futex_compute_cmp(int oldval, int val3)
   1637       1.1   thorpej {
   1638       1.1   thorpej 	int cmp = __SHIFTOUT(val3, FUTEX_OP_CMP_MASK);
   1639       1.1   thorpej 	int cmparg = __SHIFTOUT(val3, FUTEX_OP_CMPARG_MASK);
   1640       1.1   thorpej 
   1641       1.1   thorpej 	switch (cmp) {
   1642       1.1   thorpej 	case FUTEX_OP_CMP_EQ:
   1643       1.1   thorpej 		return (oldval == cmparg);
   1644       1.1   thorpej 
   1645       1.1   thorpej 	case FUTEX_OP_CMP_NE:
   1646       1.1   thorpej 		return (oldval != cmparg);
   1647       1.1   thorpej 
   1648       1.1   thorpej 	case FUTEX_OP_CMP_LT:
   1649       1.1   thorpej 		return (oldval < cmparg);
   1650       1.1   thorpej 
   1651       1.1   thorpej 	case FUTEX_OP_CMP_LE:
   1652       1.1   thorpej 		return (oldval <= cmparg);
   1653       1.1   thorpej 
   1654       1.1   thorpej 	case FUTEX_OP_CMP_GT:
   1655       1.1   thorpej 		return (oldval > cmparg);
   1656       1.1   thorpej 
   1657       1.1   thorpej 	case FUTEX_OP_CMP_GE:
   1658       1.1   thorpej 		return (oldval >= cmparg);
   1659       1.1   thorpej 
   1660       1.1   thorpej 	default:
   1661       1.1   thorpej 		panic("invalid futex cmp operation");
   1662       1.1   thorpej 	}
   1663       1.1   thorpej }
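
/*
 * Editorial worked example (not part of the original source) of how
 * the two helpers above consume a FUTEX_WAKE_OP val3.  Suppose
 * userland encodes "store 1 into *uaddr2, and also wake uaddr2's
 * waiters if the old value was 0":
 *
 *	val3 = __SHIFTIN(FUTEX_OP_SET, FUTEX_OP_OP_MASK) |
 *	       __SHIFTIN(1, FUTEX_OP_OPARG_MASK) |
 *	       __SHIFTIN(FUTEX_OP_CMP_EQ, FUTEX_OP_CMP_MASK) |
 *	       __SHIFTIN(0, FUTEX_OP_CMPARG_MASK);
 *
 * With oldval == 0, futex_compute_op(oldval, val3) yields 1 -- the
 * value the ucas loop in futex_func_wake_op() stores -- and
 * futex_compute_cmp(oldval, val3) is true (0 == 0), so waiters on
 * uaddr2 are woken in addition to those on uaddr.
 */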
   1664       1.1   thorpej 
   1665       1.1   thorpej /*
   1666       1.1   thorpej  * futex_func_wake_op(uaddr, val, uaddr2, val2, val3, retval)
   1667       1.1   thorpej  *
   1668       1.1   thorpej  *	Implement futex(FUTEX_WAKE_OP).
   1669       1.1   thorpej  */
   1670       1.1   thorpej static int
   1671       1.1   thorpej futex_func_wake_op(bool shared, int *uaddr, int val, int *uaddr2, int val2,
   1672       1.1   thorpej     int val3, register_t *retval)
   1673       1.1   thorpej {
   1674       1.1   thorpej 	struct futex *f = NULL, *f2 = NULL;
   1675       1.1   thorpej 	int oldval, newval, actual;
   1676       1.1   thorpej 	unsigned nwoken = 0;
   1677       1.1   thorpej 	int error;
   1678       1.1   thorpej 
   1679       1.1   thorpej 	/* Reject negative number of wakeups.  */
   1680       1.1   thorpej 	if (val < 0 || val2 < 0) {
   1681       1.1   thorpej 		error = EINVAL;
   1682       1.1   thorpej 		goto out;
   1683       1.1   thorpej 	}
   1684       1.1   thorpej 
   1685       1.1   thorpej 	/* Reject invalid operations before we start doing things.  */
   1686       1.1   thorpej 	if ((error = futex_validate_op_cmp(val3)) != 0)
   1687       1.1   thorpej 		goto out;
   1688       1.1   thorpej 
   1689       1.1   thorpej 	/* Look up the first futex, if any.  */
   1690  1.11.2.1   thorpej 	error = futex_lookup(uaddr, shared, FUTEX_CLASS_NORMAL, &f);
   1691       1.1   thorpej 	if (error)
   1692       1.1   thorpej 		goto out;
   1693       1.1   thorpej 
   1694       1.1   thorpej 	/* Look up the second futex, if any.  */
   1695  1.11.2.1   thorpej 	error = futex_lookup(uaddr2, shared, FUTEX_CLASS_NORMAL, &f2);
   1696       1.1   thorpej 	if (error)
   1697       1.1   thorpej 		goto out;
   1698       1.1   thorpej 
   1699       1.1   thorpej 	/*
   1700  1.11.2.1   thorpej 	 * Under the op locks:
   1701       1.1   thorpej 	 *
   1702       1.1   thorpej 	 * 1. Read/modify/write: *uaddr2 op= oparg.
   1703       1.1   thorpej 	 * 2. Unconditionally wake uaddr.
    1704  1.11.2.1   thorpej 	 * 3. Conditionally wake uaddr2, if the old value of *uaddr2
    1704  1.11.2.1   thorpej 	 *    satisfies the comparison encoded in val3.
   1705       1.1   thorpej 	 */
   1706  1.11.2.1   thorpej 	futex_op_lock2(f, f2);
   1707       1.1   thorpej 	do {
   1708       1.1   thorpej 		error = futex_load(uaddr2, &oldval);
   1709       1.1   thorpej 		if (error)
   1710       1.1   thorpej 			goto out_unlock;
   1711       1.1   thorpej 		newval = futex_compute_op(oldval, val3);
   1712       1.1   thorpej 		error = ucas_int(uaddr2, oldval, newval, &actual);
   1713       1.1   thorpej 		if (error)
   1714       1.1   thorpej 			goto out_unlock;
   1715       1.1   thorpej 	} while (actual != oldval);
   1716  1.11.2.1   thorpej 	nwoken = (f ? futex_wake(f, FUTEX_WRITERQ, val,
   1717  1.11.2.1   thorpej 				 NULL, FUTEX_WRITERQ, 0,
   1718  1.11.2.1   thorpej 				 FUTEX_BITSET_MATCH_ANY) : 0);
   1719       1.1   thorpej 	if (f2 && futex_compute_cmp(oldval, val3))
   1720  1.11.2.1   thorpej 		nwoken += futex_wake(f2, FUTEX_WRITERQ, val2,
   1721  1.11.2.1   thorpej 				     NULL, FUTEX_WRITERQ, 0,
   1722  1.11.2.1   thorpej 				     FUTEX_BITSET_MATCH_ANY);
   1723       1.1   thorpej 
   1724       1.1   thorpej 	/* Success! */
   1725       1.1   thorpej 	error = 0;
   1726       1.1   thorpej out_unlock:
   1727  1.11.2.1   thorpej 	futex_op_unlock2(f, f2);
   1728       1.1   thorpej 
   1729       1.1   thorpej out:
   1730       1.1   thorpej 	/* Return the number of waiters woken. */
   1731       1.1   thorpej 	*retval = nwoken;
   1732       1.1   thorpej 
   1733       1.1   thorpej 	/* Release the futexes, if we got them. */
   1734       1.1   thorpej 	if (f2)
   1735       1.5  riastrad 		futex_rele(f2);
   1736       1.1   thorpej 	if (f)
   1737       1.5  riastrad 		futex_rele(f);
   1738       1.1   thorpej 	return error;
   1739       1.1   thorpej }
   1740       1.1   thorpej 
   1741       1.1   thorpej /*
   1742  1.11.2.1   thorpej  * futex_read_waiter_prio(l)
   1743  1.11.2.1   thorpej  *
   1744  1.11.2.1   thorpej  *	Returns the read-waiter priority for purposes of comparing
   1745  1.11.2.1   thorpej  *	against a write-waiter's priority.  Read-waiters are only
   1746  1.11.2.1   thorpej  *	prioritized if they are real-time threads.
   1747  1.11.2.1   thorpej  */
   1748  1.11.2.1   thorpej static inline pri_t
   1749  1.11.2.1   thorpej futex_read_waiter_prio(struct lwp * const l)
   1750  1.11.2.1   thorpej {
   1751  1.11.2.1   thorpej 	const pri_t pri = lwp_eprio(l);
   1752  1.11.2.1   thorpej 	if (__predict_false(pri >= PRI_USER_RT))
   1753  1.11.2.1   thorpej 		return pri;
   1754  1.11.2.1   thorpej 	return PRI_NONE;
   1755  1.11.2.1   thorpej }
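
/*
 * Editorial example (not part of the original source), assuming
 * PRI_NONE sorts below every valid priority, as the comparisons in
 * the rw code rely on:
 *
 *	futex_read_waiter_prio(timeshare lwp) == PRI_NONE
 *		--> a waiting writer is always preferred
 *	futex_read_waiter_prio(SCHED_FIFO/RR lwp) == lwp_eprio(lwp)
 *		--> beats the writer only if it exceeds the head
 *		    write-waiter's lwp_eprio()
 */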
   1756  1.11.2.1   thorpej 
   1757  1.11.2.1   thorpej #if 0 /* XXX */
   1758  1.11.2.1   thorpej /*
    1759  1.11.2.1   thorpej  * futex_rw_handle_rt_reader(f, uaddr, val, pri_reader)
   1760  1.11.2.1   thorpej  *
   1761  1.11.2.1   thorpej  *	Attempt to resolve the case of a real-time thread attempting
    1762  1.11.2.1   thorpej  *	to acquire a read lock.  Returns EJUSTRETURN if the lock was
    1763  1.11.2.1   thorpej  *	acquired (the wait should be elided), 0 if the caller should
    1763  1.11.2.1   thorpej  *	go on to wait, or another error on failure.
   1764  1.11.2.1   thorpej  */
   1765  1.11.2.1   thorpej static int __noinline
   1766  1.11.2.1   thorpej futex_rw_handle_rt_reader(struct futex *f, int *uaddr, int val,
   1767  1.11.2.1   thorpej     pri_t pri_reader)
   1768  1.11.2.1   thorpej {
   1769  1.11.2.1   thorpej 	struct lwp *l_writer = NULL;
   1770  1.11.2.1   thorpej 	int newval, next;
   1771  1.11.2.1   thorpej 	int error;
   1772  1.11.2.1   thorpej 
   1773  1.11.2.1   thorpej 	KASSERT(mutex_owned(&f->fx_op_lock));
   1774  1.11.2.1   thorpej 
   1775  1.11.2.1   thorpej 	/*
   1776  1.11.2.1   thorpej 	 * If the lock is write-locked, there isn't anything we
   1777  1.11.2.1   thorpej 	 * can do but wait.
   1778  1.11.2.1   thorpej 	 */
   1779  1.11.2.1   thorpej 	if (val & FUTEX_RW_WRITE_LOCKED) {
   1780  1.11.2.1   thorpej 		return 0;
   1781  1.11.2.1   thorpej 	}
   1782  1.11.2.1   thorpej 
   1783  1.11.2.1   thorpej 	/*
   1784  1.11.2.1   thorpej 	 * If we're maxed out on readers already, nothing we can do.
   1785  1.11.2.1   thorpej 	 */
   1786  1.11.2.1   thorpej 	if ((val & FUTEX_TID_MASK) == FUTEX_TID_MASK) {
    1787  1.11.2.1   thorpej 		return ENFILE;		/* distinct from EAGAIN */
   1788  1.11.2.1   thorpej 	}
   1789  1.11.2.1   thorpej 
   1790  1.11.2.1   thorpej 	/*
   1791  1.11.2.1   thorpej 	 * The assumption then is that we arrived here with WRITE_WANTED
   1792  1.11.2.1   thorpej 	 * set.  We're not going to bother testing that, however.  We
   1793  1.11.2.1   thorpej 	 * will preserve it if we see it.
   1794  1.11.2.1   thorpej 	 *
    1795  1.11.2.1   thorpej 	 * Because WRITE_WANTED is set, everyone else will enter the
    1796  1.11.2.1   thorpej 	 * kernel via rw_wait.  And we are holding the op lock, so no
    1797  1.11.2.1   thorpej 	 * additional waiters will appear on the sleepq.  We are going
   1798  1.11.2.1   thorpej 	 * to do the same tricky dance as rw_handoff to let higher-priority
   1799  1.11.2.1   thorpej 	 * real-time waiters slip past the gate.
   1800  1.11.2.1   thorpej 	 */
   1801  1.11.2.1   thorpej 
   1802  1.11.2.1   thorpej 	/*
   1803  1.11.2.1   thorpej 	 * All we want to do here is check if there is a writer waiting.
    1804  1.11.2.1   thorpej 	 * If there is, and it is of equal or greater priority, this reader
    1805  1.11.2.1   thorpej 	 * loses; otherwise we just make note of it to ensure that
   1806  1.11.2.1   thorpej 	 * the WRITE_WANTED bit is set when we update the futex word.
   1807  1.11.2.1   thorpej 	 *
   1808  1.11.2.1   thorpej 	 * Since we are not going to actually wake someone from the
   1809  1.11.2.1   thorpej 	 * queue here, we're not really interested if the write-waiter
   1810  1.11.2.1   thorpej 	 * happens to leave based on a timeout or signal; we know that
   1811  1.11.2.1   thorpej 	 * any write-waiter *after* the first one is of equal or lower
   1812  1.11.2.1   thorpej 	 * priority, so we would still best it.
   1813  1.11.2.1   thorpej 	 */
   1814  1.11.2.1   thorpej 	mutex_spin_enter(f->fx_sq_lock);
   1815  1.11.2.1   thorpej 	l_writer = LIST_FIRST(&f->fx_sqs[FUTEX_WRITERQ]);
   1816  1.11.2.1   thorpej 	if (__predict_true(l_writer != NULL)) {
   1817  1.11.2.1   thorpej 		if (pri_reader <= lwp_eprio(l_writer)) {
    1818  1.11.2.1   thorpej 			/* Writer wins; drop the sleepq lock before waiting. */
    1818  1.11.2.1   thorpej 			mutex_spin_exit(f->fx_sq_lock);
    1818  1.11.2.1   thorpej 			return 0;
   1819  1.11.2.1   thorpej 		}
   1820  1.11.2.1   thorpej 	}
   1821  1.11.2.1   thorpej 	mutex_spin_exit(f->fx_sq_lock);
   1822  1.11.2.1   thorpej 
   1823  1.11.2.1   thorpej 	for (;; val = next) {
   1824  1.11.2.1   thorpej 		if (__predict_true(l_writer != NULL)) {
   1825  1.11.2.1   thorpej 			newval = (val + 1) | FUTEX_RW_WRITE_WANTED;
   1826  1.11.2.1   thorpej 		} else {
   1827  1.11.2.1   thorpej 			/*
   1828  1.11.2.1   thorpej 			 * No writer waiting; increment the reader
   1829  1.11.2.1   thorpej 			 * count, preserve any WRITE_WANTED bit.
   1830  1.11.2.1   thorpej 			 */
   1831  1.11.2.1   thorpej 			newval = val + 1;
   1832  1.11.2.1   thorpej 			KASSERT(((newval ^ val) & FUTEX_RW_WRITE_WANTED) == 0);
   1833  1.11.2.1   thorpej 		}
   1834  1.11.2.1   thorpej 
   1835  1.11.2.1   thorpej 		error = ucas_int(uaddr, val, newval, &next);
   1836  1.11.2.1   thorpej 		if (__predict_false(error != 0)) {
   1837  1.11.2.1   thorpej 			return error;
   1838  1.11.2.1   thorpej 		}
   1839  1.11.2.1   thorpej 		if (next == val) {
   1840  1.11.2.1   thorpej 			/* Successfully acquired the read lock. */
   1841  1.11.2.1   thorpej 			return EJUSTRETURN;
   1842  1.11.2.1   thorpej 		}
   1843  1.11.2.1   thorpej 		/*
   1844  1.11.2.1   thorpej 		 * Repeat the terminal checks from above on the new
   1845  1.11.2.1   thorpej 		 * value.
   1846  1.11.2.1   thorpej 		 */
   1847  1.11.2.1   thorpej 		if (next & FUTEX_RW_WRITE_LOCKED) {
   1848  1.11.2.1   thorpej 			return 0;
   1849  1.11.2.1   thorpej 		}
   1850  1.11.2.1   thorpej 		if ((next & FUTEX_TID_MASK) == FUTEX_TID_MASK) {
    1851  1.11.2.1   thorpej 			return ENFILE;		/* distinct from EAGAIN */
   1852  1.11.2.1   thorpej 		}
   1853  1.11.2.1   thorpej 	}
   1854  1.11.2.1   thorpej }
   1855  1.11.2.1   thorpej #endif /* XXX */
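
/*
 * Editorial note (inferred from the disabled code in this file, not
 * part of the original source) on the layout the NETBSD_RW operations
 * assume for the futex word: the FUTEX_TID_MASK field holds the
 * owner's LWP id while write-locked, or the number of readers while
 * read-locked; FUTEX_RW_WRITE_LOCKED marks a write-held lock;
 * FUTEX_RW_WRITE_WANTED records that a writer is waiting; and
 * FUTEX_WAITERS records that waiters remain on the kernel sleepqs.
 * For example, (FUTEX_RW_WRITE_LOCKED | tid) is write-held with no
 * waiters, while a bare count of 3 is read-held by three readers.
 */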
   1856  1.11.2.1   thorpej 
   1857  1.11.2.1   thorpej /*
   1858  1.11.2.1   thorpej  * futex_func_rw_wait(uaddr, val, val3, abstime, clkid, retval)
   1859  1.11.2.1   thorpej  *
   1860  1.11.2.1   thorpej  *	Implement futex(FUTEX_NETBSD_RW_WAIT).
   1861  1.11.2.1   thorpej  */
   1862  1.11.2.1   thorpej static int
   1863  1.11.2.1   thorpej futex_func_rw_wait(bool shared, int *uaddr, int val, int val3,
   1864  1.11.2.1   thorpej     const struct timespec *abstime, clockid_t clkid, register_t *retval)
   1865  1.11.2.1   thorpej {
   1866  1.11.2.1   thorpej #if 1
   1867  1.11.2.1   thorpej 	/* XXX NETBSD_RW ops are currently broken XXX */
   1868  1.11.2.1   thorpej 	return ENOSYS;
   1869  1.11.2.1   thorpej #else
   1870  1.11.2.1   thorpej 	struct futex *f;
   1871  1.11.2.1   thorpej 	int error;
   1872  1.11.2.1   thorpej 
   1873  1.11.2.1   thorpej 	/* Must specify READER or WRITER. */
   1874  1.11.2.1   thorpej 	if (__predict_false(val3 != FUTEX_RW_READER &&
   1875  1.11.2.1   thorpej 			    val3 != FUTEX_RW_WRITER))
   1876  1.11.2.1   thorpej 		return EINVAL;
   1877  1.11.2.1   thorpej 
   1878  1.11.2.1   thorpej 	/* Optimistically test before anything else.  */
   1879  1.11.2.1   thorpej 	if (!futex_test(uaddr, val))
   1880  1.11.2.1   thorpej 		return EAGAIN;
   1881  1.11.2.1   thorpej 
   1882  1.11.2.1   thorpej 	/* Get the futex, creating it if necessary.  */
   1883  1.11.2.1   thorpej 	error = futex_lookup_create(uaddr, shared, FUTEX_CLASS_RWLOCK, &f);
   1884  1.11.2.1   thorpej 	if (error)
   1885  1.11.2.1   thorpej 		return error;
   1886  1.11.2.1   thorpej 	KASSERT(f);
   1887  1.11.2.1   thorpej 
   1888  1.11.2.1   thorpej 	/*
   1889  1.11.2.1   thorpej 	 * Under the op lock, check the value again: if it has
   1890  1.11.2.1   thorpej 	 * already changed, EAGAIN; otherwise, enqueue the waiter
   1891  1.11.2.1   thorpej 	 * on the specified queue.
   1892  1.11.2.1   thorpej 	 */
   1893  1.11.2.1   thorpej 	futex_op_lock(f);
   1894  1.11.2.1   thorpej 	if (!futex_test(uaddr, val)) {
   1895  1.11.2.1   thorpej 		futex_op_unlock(f);
   1896  1.11.2.1   thorpej 		error = EAGAIN;
   1897  1.11.2.1   thorpej 		goto out;
   1898  1.11.2.1   thorpej 	}
   1899  1.11.2.1   thorpej 
   1900  1.11.2.1   thorpej 	/*
   1901  1.11.2.1   thorpej 	 * POSIX dictates that a real-time reader will be prioritized
   1902  1.11.2.1   thorpej 	 * over writers of lesser priority, when normally we would
   1903  1.11.2.1   thorpej 	 * prefer the writers.
   1904  1.11.2.1   thorpej 	 */
   1905  1.11.2.1   thorpej 	if (__predict_false(val3 == FUTEX_RW_READER)) {
   1906  1.11.2.1   thorpej 		struct lwp * const l = curlwp;
   1907  1.11.2.1   thorpej 		const pri_t pri_reader = futex_read_waiter_prio(l);
   1908  1.11.2.1   thorpej 		if (__predict_false(pri_reader > PRI_NONE)) {
   1909  1.11.2.1   thorpej 			error = futex_rw_handle_rt_reader(f, uaddr, val,
   1910  1.11.2.1   thorpej 			    pri_reader);
   1911  1.11.2.1   thorpej 			if (error) {
   1912  1.11.2.1   thorpej 				if (__predict_true(error == EJUSTRETURN)) {
   1913  1.11.2.1   thorpej 					/* RT reader acquired the lock. */
   1914  1.11.2.1   thorpej 					error = 0;
   1915  1.11.2.1   thorpej 				}
   1916  1.11.2.1   thorpej 				futex_op_unlock(f);
   1917  1.11.2.1   thorpej 				goto out;
   1918  1.11.2.1   thorpej 			}
   1919  1.11.2.1   thorpej 		}
   1920  1.11.2.1   thorpej 	}
   1921  1.11.2.1   thorpej 
   1922  1.11.2.1   thorpej 	/*
    1923  1.11.2.1   thorpej 	 * Now wait.  futex_wait() will drop our op lock once we
   1924  1.11.2.1   thorpej 	 * are entered into the sleep queue, thus ensuring atomicity
   1925  1.11.2.1   thorpej 	 * of wakes with respect to waits.
   1926  1.11.2.1   thorpej 	 *
   1927  1.11.2.1   thorpej 	 * Use a wake selector of 0 so that this waiter won't match on any
   1928  1.11.2.1   thorpej 	 * of the other operations in case someone makes that error; only
   1929  1.11.2.1   thorpej 	 * rw_handoff is allowed!  This is critical because a waiter that
   1930  1.11.2.1   thorpej 	 * awakens from an rw_wait assumes it has been given ownership of
   1931  1.11.2.1   thorpej 	 * the lock.
   1932  1.11.2.1   thorpej 	 */
   1933  1.11.2.1   thorpej 	error = futex_wait(f, val3, abstime, clkid, 0);
   1934  1.11.2.1   thorpej 
   1935  1.11.2.1   thorpej 	/*
   1936  1.11.2.1   thorpej 	 * Don't drop our reference here.  We won't be requeued, but
    1937  1.11.2.1   thorpej 	 * it's best to maintain symmetry with other operations.
   1938  1.11.2.1   thorpej 	 */
   1939  1.11.2.1   thorpej 	f = NULL;
   1940  1.11.2.1   thorpej 
   1941  1.11.2.1   thorpej out:	if (__predict_true(error == 0)) {
   1942  1.11.2.1   thorpej 		/* Return 0 on success, error on failure. */
   1943  1.11.2.1   thorpej 		*retval = 0;
   1944  1.11.2.1   thorpej 	}
   1945  1.11.2.1   thorpej 
   1946  1.11.2.1   thorpej 	if (f != NULL)
   1947  1.11.2.1   thorpej 		futex_rele(f);
   1948  1.11.2.1   thorpej 	return error;
   1949  1.11.2.1   thorpej #endif
   1950  1.11.2.1   thorpej }
   1951  1.11.2.1   thorpej 
   1952  1.11.2.1   thorpej /*
   1953  1.11.2.1   thorpej  * futex_func_rw_handoff(uaddr, val, retval)
   1954  1.11.2.1   thorpej  *
   1955  1.11.2.1   thorpej  *	Implement futex(FUTEX_NETBSD_RW_HANDOFF).
   1956  1.11.2.1   thorpej  *
   1957  1.11.2.1   thorpej  *	This implements direct hand-off to either a single writer
   1958  1.11.2.1   thorpej  *	or all readers.
   1959  1.11.2.1   thorpej  */
   1960  1.11.2.1   thorpej static int
   1961  1.11.2.1   thorpej futex_func_rw_handoff(bool shared, int *uaddr, int val, register_t *retval)
   1962  1.11.2.1   thorpej {
   1963  1.11.2.1   thorpej #if 1
   1964  1.11.2.1   thorpej 	/* XXX NETBSD_RW ops are currently broken XXX */
   1965  1.11.2.1   thorpej 	return ENOSYS;
   1966  1.11.2.1   thorpej #else
   1967  1.11.2.1   thorpej 	struct lwp *l, *l_next, *l_writer, *l_reader;
   1968  1.11.2.1   thorpej 	struct futex *f;
   1969  1.11.2.1   thorpej 	sleepq_t wsq, *sq;
   1970  1.11.2.1   thorpej 	int error = 0;
   1971  1.11.2.1   thorpej 	int newval, next, nwake, nwoken = 0;
   1972  1.11.2.1   thorpej 	int allreaders __diagused = 0;
   1973  1.11.2.1   thorpej 	unsigned int *nwaitersp;
   1974  1.11.2.1   thorpej 	pri_t pri_writer;
   1975  1.11.2.1   thorpej 
   1976  1.11.2.1   thorpej 	/* Look up the futex, if any.  */
   1977  1.11.2.1   thorpej 	error = futex_lookup(uaddr, shared, FUTEX_CLASS_RWLOCK, &f);
   1978  1.11.2.1   thorpej 	if (error)
   1979  1.11.2.1   thorpej 		goto out;
   1980  1.11.2.1   thorpej 
   1981  1.11.2.1   thorpej 	/*
    1982  1.11.2.1   thorpej 	 * There are a couple of different scenarios where a thread
    1983  1.11.2.1   thorpej 	 * releasing an RW lock would call rw_handoff, yet we find no
   1984  1.11.2.1   thorpej 	 * waiters:
   1985  1.11.2.1   thorpej 	 *
   1986  1.11.2.1   thorpej 	 * ==> There were waiters on the queue, but they left the queue
   1987  1.11.2.1   thorpej 	 *     due to a signal.
   1988  1.11.2.1   thorpej 	 * ==> The waiting thread set the waiter bit(s), but decided to
   1989  1.11.2.1   thorpej 	 *     try spinning before calling rw_wait.
   1990  1.11.2.1   thorpej 	 *
   1991  1.11.2.1   thorpej 	 * In both of these cases, we will ensure that the lock word
   1992  1.11.2.1   thorpej 	 * gets cleared.
   1993  1.11.2.1   thorpej 	 */
   1994  1.11.2.1   thorpej 
   1995  1.11.2.1   thorpej 	/* If there's no futex, there are no waiters to wake. */
   1996  1.11.2.1   thorpej 	if (__predict_false(f == NULL)) {
   1997  1.11.2.1   thorpej 		/*
   1998  1.11.2.1   thorpej 		 * If there are no waiters, ensure that the lock
   1999  1.11.2.1   thorpej 		 * word is cleared.
   2000  1.11.2.1   thorpej 		 */
   2001  1.11.2.1   thorpej 		error = ucas_int(uaddr, val, 0, &next);
   2002  1.11.2.1   thorpej 		if (__predict_true(error == 0)) {
   2003  1.11.2.1   thorpej 			if (__predict_false(val != next))
   2004  1.11.2.1   thorpej 				error = EAGAIN;
   2005  1.11.2.1   thorpej 		}
   2006  1.11.2.1   thorpej 		goto out;
   2007  1.11.2.1   thorpej 	}
   2008  1.11.2.1   thorpej 
   2009  1.11.2.1   thorpej 	/*
   2010  1.11.2.1   thorpej 	 * Compute the new value and store it in the futex word.
   2011  1.11.2.1   thorpej 	 *
   2012  1.11.2.1   thorpej 	 * This is a little tricky because the ucas could cause
   2013  1.11.2.1   thorpej 	 * a page fault, and we can't let that happen while holding
   2014  1.11.2.1   thorpej 	 * the sleepq locks.  But we have to make sure that choices
   2015  1.11.2.1   thorpej 	 * we make regarding what threads to wake is accurately
   2016  1.11.2.1   thorpej 	 * reflected in the futex word and that the futex word is
   2017  1.11.2.1   thorpej 	 * updated before the LWPs can run.
   2018  1.11.2.1   thorpej 	 *
   2019  1.11.2.1   thorpej 	 * This is easy enough ... we can transfer the LWPs to a private
   2020  1.11.2.1   thorpej 	 * sleepq to prevent changes in the original sleepq while we have
   2021  1.11.2.1   thorpej 	 * it unlocked from affecting the results, but we must also
   2022  1.11.2.1   thorpej 	 * consider that LWPs might be using timed-wait, so we have to
   2023  1.11.2.1   thorpej 	 * make sure that won't wake the LWP up out from under us if the
   2024  1.11.2.1   thorpej 	 * timer fires.  To do this, we have to set the "wchan" to NULL
   2025  1.11.2.1   thorpej 	 * and use a dummy syncobj that takes no action on "unsleep".
   2026  1.11.2.1   thorpej 	 *
   2027  1.11.2.1   thorpej 	 * We thus perform the hand-off in three steps, all with the op
   2028  1.11.2.1   thorpej 	 * lock held:
   2029  1.11.2.1   thorpej 	 *
   2030  1.11.2.1   thorpej 	 * ==> Wake selection: sleepq locked, select LWPs to wake,
   2031  1.11.2.1   thorpej 	 *     compute new futex word.  LWPs are tranferred from the
    2032  1.11.2.1   thorpej 	 *     compute new futex word.  LWPs are transferred from the
   2033  1.11.2.1   thorpej 	 *
   2034  1.11.2.1   thorpej 	 * ==> Futex word update: sleepq unlocked, use a loop around ucas
   2035  1.11.2.1   thorpej 	 *     to update the futex word.  There are no scenarios where
   2036  1.11.2.1   thorpej 	 *     user space code can update the futex in a way that would
   2037  1.11.2.1   thorpej 	 *     impact the work we do here; because we've been asked to do
   2038  1.11.2.1   thorpej 	 *     the hand-off, any new LWPs attempting to acquire the lock
   2039  1.11.2.1   thorpej 	 *     would be entering rw_wait by definition (either because
   2040  1.11.2.1   thorpej 	 *     they're read-lockers and the lock is write-wanted, or they're
   2041  1.11.2.1   thorpej 	 *     write-lockers and the lock is read-held).  Those new rw_wait
   2042  1.11.2.1   thorpej 	 *     LWPs will serialize against the op lock.  We DO, however,
   2043  1.11.2.1   thorpej 	 *     need to preserve any newly-set WANTED bits, hence the ucas
   2044  1.11.2.1   thorpej 	 *     loop.  Those newly-set WANTED bits, however, will not impact
   2045  1.11.2.1   thorpej 	 *     our decisions.  Those LWPs have simply lost the race to
   2046  1.11.2.1   thorpej 	 *     acquire the lock, and we don't consult those bits anyway;
   2047  1.11.2.1   thorpej 	 *     we instead use the contents of the sleepqs as the truth
   2048  1.11.2.1   thorpej 	 *     about who is waiting, and now new waiters will appear on
    2049  1.11.2.1   thorpej 	 *     about who is waiting, and no new waiters will appear on
   2050  1.11.2.1   thorpej 	 *
   2051  1.11.2.1   thorpej 	 * ==> Wake waiters: sleepq locked, run down our private sleepq
   2052  1.11.2.1   thorpej 	 *     and actually awaken all of the LWPs we selected earlier.
   2053  1.11.2.1   thorpej 	 *
    2054  1.11.2.1   thorpej 	 * If for some reason the ucas fails because the page backing it
   2055  1.11.2.1   thorpej 	 * was unmapped, all bets are off.  We still awaken the waiters.
   2056  1.11.2.1   thorpej 	 * This is either a malicious attempt to screw things up, or a
   2057  1.11.2.1   thorpej 	 * programming error, and we don't care about either one.
   2058  1.11.2.1   thorpej 	 */
   2059  1.11.2.1   thorpej 	sleepq_init(&wsq);
   2060  1.11.2.1   thorpej 
   2061  1.11.2.1   thorpej 	futex_op_lock(f);
   2062  1.11.2.1   thorpej 	mutex_spin_enter(f->fx_sq_lock);
   2063  1.11.2.1   thorpej 
   2064  1.11.2.1   thorpej 	/*
   2065  1.11.2.1   thorpej 	 * STEP 1
   2066  1.11.2.1   thorpej 	 */
   2067  1.11.2.1   thorpej 
   2068  1.11.2.1   thorpej 	/*
   2069  1.11.2.1   thorpej 	 * POSIX dictates that any real-time waiters will acquire the
   2070  1.11.2.1   thorpej 	 * lock in priority order.  This implies that a real-time
   2071  1.11.2.1   thorpej 	 * read-waiter has priority over a non-real-time write-waiter,
   2072  1.11.2.1   thorpej 	 * where we would otherwise prioritize waking the write-waiter.
   2073  1.11.2.1   thorpej 	 */
   2074  1.11.2.1   thorpej 	l_writer = LIST_FIRST(&f->fx_sqs[FUTEX_WRITERQ]);
   2075  1.11.2.1   thorpej 	l_reader = LIST_FIRST(&f->fx_sqs[FUTEX_READERQ]);
   2076  1.11.2.1   thorpej 	if (__predict_true(l_writer == NULL)) {
   2077  1.11.2.1   thorpej 		/* We will wake all the readers. */
   2078  1.11.2.1   thorpej 		sq = &f->fx_sqs[FUTEX_READERQ];
   2079  1.11.2.1   thorpej 		nwaitersp = &f->fx_nwaiters[FUTEX_READERQ];
   2080  1.11.2.1   thorpej 		nwake = allreaders = f->fx_nwaiters[FUTEX_READERQ];
   2081  1.11.2.1   thorpej 		KASSERT(nwake >= 0 && nwake <= FUTEX_TID_MASK);
   2082  1.11.2.1   thorpej 		if (__predict_false(nwake == 0)) {
   2083  1.11.2.1   thorpej 			KASSERT(l_reader == NULL);
   2084  1.11.2.1   thorpej 			newval = 0;
   2085  1.11.2.1   thorpej 		} else {
   2086  1.11.2.1   thorpej 			KASSERT(l_reader != NULL);
   2087  1.11.2.1   thorpej 			newval = nwake;
   2088  1.11.2.1   thorpej 		}
   2089  1.11.2.1   thorpej 		l = l_reader;
   2090  1.11.2.1   thorpej 	} else if (__predict_false(l_reader != NULL &&
   2091  1.11.2.1   thorpej 				   futex_read_waiter_prio(l_reader) >
   2092  1.11.2.1   thorpej 					(pri_writer = lwp_eprio(l_writer)))) {
   2093  1.11.2.1   thorpej 		/*
   2094  1.11.2.1   thorpej 		 * Count the number of real-time read-waiters that
   2095  1.11.2.1   thorpej 		 * exceed the write-waiter's priority.  We will
    2096  1.11.2.1   thorpej 		 * wake that many (we already know it's at least one).
   2097  1.11.2.1   thorpej 		 */
   2098  1.11.2.1   thorpej 		sq = &f->fx_sqs[FUTEX_READERQ];
   2099  1.11.2.1   thorpej 		nwaitersp = &f->fx_nwaiters[FUTEX_READERQ];
   2100  1.11.2.1   thorpej 		for (nwake = 1, l = LIST_NEXT(l_reader, l_sleepchain);
   2101  1.11.2.1   thorpej 		     l != NULL && futex_read_waiter_prio(l) > pri_writer;
   2102  1.11.2.1   thorpej 		     l = LIST_NEXT(l, l_sleepchain)) {
   2103  1.11.2.1   thorpej 			nwake++;
   2104  1.11.2.1   thorpej 		}
   2105  1.11.2.1   thorpej 		KASSERT(nwake >= 0 && nwake <= FUTEX_TID_MASK);
   2106  1.11.2.1   thorpej 		/* We know there is at least one write-waiter. */
   2107  1.11.2.1   thorpej 		newval = nwake | FUTEX_WAITERS | FUTEX_RW_WRITE_WANTED;
   2108  1.11.2.1   thorpej 		l = l_reader;
   2109  1.11.2.1   thorpej 	} else {
   2110  1.11.2.1   thorpej 		/* We will wake one writer. */
   2111  1.11.2.1   thorpej 		sq = &f->fx_sqs[FUTEX_WRITERQ];
   2112  1.11.2.1   thorpej 		nwaitersp = &f->fx_nwaiters[FUTEX_WRITERQ];
   2113  1.11.2.1   thorpej 		nwake = 1;
   2114  1.11.2.1   thorpej 		newval = FUTEX_RW_WRITE_LOCKED | l_writer->l_lid;
   2115  1.11.2.1   thorpej 		if (__predict_false(f->fx_nwaiters[FUTEX_WRITERQ] > 1)) {
   2116  1.11.2.1   thorpej 			KASSERT(LIST_NEXT(l_writer, l_sleepchain) != NULL);
   2117  1.11.2.1   thorpej 			newval |= FUTEX_WAITERS | FUTEX_RW_WRITE_WANTED;
   2118  1.11.2.1   thorpej 		} else if (__predict_true(f->fx_nwaiters[FUTEX_READERQ] != 0)) {
   2119  1.11.2.1   thorpej 			KASSERT(!LIST_EMPTY(&f->fx_sqs[FUTEX_READERQ]));
   2120  1.11.2.1   thorpej 			newval |= FUTEX_WAITERS;
   2121  1.11.2.1   thorpej 		} else {
   2122  1.11.2.1   thorpej 			KASSERT(LIST_EMPTY(&f->fx_sqs[FUTEX_READERQ]));
   2123  1.11.2.1   thorpej 		}
   2124  1.11.2.1   thorpej 		l = l_writer;
   2125  1.11.2.1   thorpej 	}
   2126  1.11.2.1   thorpej 
   2127  1.11.2.1   thorpej 	KASSERT(sq == &f->fx_sqs[FUTEX_READERQ] ||
   2128  1.11.2.1   thorpej 		sq == &f->fx_sqs[FUTEX_WRITERQ]);
   2129  1.11.2.1   thorpej 	while (nwake != 0 && l != NULL) {
   2130  1.11.2.1   thorpej 		l_next = LIST_NEXT(l, l_sleepchain);
   2131  1.11.2.1   thorpej 		KASSERT(l->l_syncobj == &futex_syncobj);
   2132  1.11.2.1   thorpej 		KASSERT(l->l_wchan == (wchan_t)f);
   2133  1.11.2.1   thorpej 		KASSERT(l->l_futex == f);
   2134  1.11.2.1   thorpej 		KASSERT(l->l_sleepq == sq);
   2135  1.11.2.1   thorpej 		KASSERT(l->l_mutex == f->fx_sq_lock);
   2136  1.11.2.1   thorpej 
   2137  1.11.2.1   thorpej 		/*
   2138  1.11.2.1   thorpej 		 * NULL wchan --> timeout will not wake LWP.
   2139  1.11.2.1   thorpej 		 * NULL lock --> keep existing lock.
   2140  1.11.2.1   thorpej 		 */
   2141  1.11.2.1   thorpej 		sleepq_transfer(l, sq, &wsq, NULL/*wchan*/, futex_wmesg,
   2142  1.11.2.1   thorpej 		    &sched_syncobj, NULL/*lock*/, false);
   2143  1.11.2.1   thorpej 
   2144  1.11.2.1   thorpej 		KASSERT(*nwaitersp > 0);
   2145  1.11.2.1   thorpej 		(*nwaitersp)--;
   2146  1.11.2.1   thorpej 		nwoken++;
   2147  1.11.2.1   thorpej 		nwake--;
   2148  1.11.2.1   thorpej 		/* hold count adjusted below */
   2149  1.11.2.1   thorpej 		l = l_next;
   2150  1.11.2.1   thorpej 	}
   2151  1.11.2.1   thorpej 
   2152  1.11.2.1   thorpej 	mutex_spin_exit(f->fx_sq_lock);
   2153  1.11.2.1   thorpej 
   2154  1.11.2.1   thorpej 	/*
   2155  1.11.2.1   thorpej 	 * STEP 2
   2156  1.11.2.1   thorpej 	 */
   2157  1.11.2.1   thorpej 	for (;; val = next) {
   2158  1.11.2.1   thorpej 		error = ucas_int(uaddr, val, newval, &next);
   2159  1.11.2.1   thorpej 		if (__predict_false(error != 0)) {
   2160  1.11.2.1   thorpej 			/*
   2161  1.11.2.1   thorpej 			 * The futex word has become unmapped.  All bets
   2162  1.11.2.1   thorpej 			 * are off.  Break out of the loop and awaken the
   2163  1.11.2.1   thorpej 			 * waiters; this is easier than trying to stuff
   2164  1.11.2.1   thorpej 			 * them back into their previous sleepq, and the
   2165  1.11.2.1   thorpej 			 * application is screwed anyway.
   2166  1.11.2.1   thorpej 			 */
   2167  1.11.2.1   thorpej 			break;
   2168  1.11.2.1   thorpej 		}
   2169  1.11.2.1   thorpej 		if (__predict_true(next == val)) {
   2170  1.11.2.1   thorpej 			/*
   2171  1.11.2.1   thorpej 			 * Successfully updated the futex word!
   2172  1.11.2.1   thorpej 			 */
   2173  1.11.2.1   thorpej 			break;
   2174  1.11.2.1   thorpej 		}
   2175  1.11.2.1   thorpej 		/*
   2176  1.11.2.1   thorpej 		 * The only thing that could have possibly happened
   2177  1.11.2.1   thorpej 		 * (barring some bug in the thread library) is that
   2178  1.11.2.1   thorpej 		 * additional waiter bits arrived.  Those new waiters
   2179  1.11.2.1   thorpej 		 * have lost the race to acquire the lock, but we
   2180  1.11.2.1   thorpej 		 * need to preserve those bits.
   2181  1.11.2.1   thorpej 		 */
   2182  1.11.2.1   thorpej 		newval |= next & (FUTEX_WAITERS | FUTEX_RW_WRITE_WANTED);
   2183  1.11.2.1   thorpej 	}
   2184  1.11.2.1   thorpej 
   2185  1.11.2.1   thorpej 	mutex_spin_enter(f->fx_sq_lock);
   2186  1.11.2.1   thorpej 
   2187  1.11.2.1   thorpej 	/*
   2188  1.11.2.1   thorpej 	 * STEP 3
   2189  1.11.2.1   thorpej 	 */
   2190  1.11.2.1   thorpej 
   2191  1.11.2.1   thorpej 	LIST_FOREACH_SAFE(l, &wsq, l_sleepchain, l_next) {
   2192  1.11.2.1   thorpej 		KASSERT(l->l_syncobj == &sched_syncobj);
   2193  1.11.2.1   thorpej 		KASSERT(l->l_wchan == NULL);
   2194  1.11.2.1   thorpej 		KASSERT(l->l_futex == f);
   2195  1.11.2.1   thorpej 		KASSERT(l->l_sleepq == &wsq);
   2196  1.11.2.1   thorpej 		KASSERT(l->l_mutex == f->fx_sq_lock);
   2197  1.11.2.1   thorpej 
   2198  1.11.2.1   thorpej 		l->l_futex_wakesel = 0;
   2199  1.11.2.1   thorpej 		l->l_futex = NULL;
   2200  1.11.2.1   thorpej 		sleepq_remove(&wsq, l);
   2201  1.11.2.1   thorpej 	}
    2202  1.11.2.1   thorpej 	/* If we chose to wake all readers, assert that we woke them all. */
   2203  1.11.2.1   thorpej 	KASSERT(allreaders == 0 || allreaders == nwoken);
   2204  1.11.2.1   thorpej 	KASSERT(allreaders == 0 || LIST_EMPTY(&f->fx_sqs[FUTEX_READERQ]));
   2205  1.11.2.1   thorpej 	KASSERT(allreaders == 0 || f->fx_nwaiters[FUTEX_READERQ] == 0);
   2206  1.11.2.1   thorpej 
   2207  1.11.2.1   thorpej 	mutex_spin_exit(f->fx_sq_lock);
   2208  1.11.2.1   thorpej 
   2209  1.11.2.1   thorpej 	/* Adjust hold count. */
   2210  1.11.2.1   thorpej 	futex_rele_count_not_last(f, nwoken);
   2211  1.11.2.1   thorpej 
   2212  1.11.2.1   thorpej 	futex_op_unlock(f);
   2213  1.11.2.1   thorpej 
   2214  1.11.2.1   thorpej 	/* Release the futex.  */
   2215  1.11.2.1   thorpej 	futex_rele(f);
   2216  1.11.2.1   thorpej 
   2217  1.11.2.1   thorpej out:	if (__predict_true(error == 0)) {
   2218  1.11.2.1   thorpej 		/* Return the number of waiters woken.  */
   2219  1.11.2.1   thorpej 		*retval = nwoken;
   2220  1.11.2.1   thorpej 	}
   2221  1.11.2.1   thorpej 
   2222  1.11.2.1   thorpej 	/* Success!  */
   2223  1.11.2.1   thorpej 	return error;
   2224  1.11.2.1   thorpej #endif
   2225  1.11.2.1   thorpej }
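
/*
 * To illustrate the userland half of the handoff above, here is a rough
 * sketch of how a write owner's unlock path might use
 * FUTEX_NETBSD_RW_HANDOFF.  The rw_word field, the __futex() stub, and
 * the fast-path shape are assumptions for the example, not a description
 * of any particular thread library; error handling is omitted.
 *
 *	uint32_t v = rw->rw_word;    // FUTEX_RW_WRITE_LOCKED | our lid, maybe waiter bits
 *	if ((v & (FUTEX_WAITERS | FUTEX_RW_WRITE_WANTED)) == 0 &&
 *	    atomic_cas_32(&rw->rw_word, v, 0) == v) {
 *		return;              // uncontended; released entirely in userland
 *	}
 *	// Contended: the kernel picks the next owner(s), installs the new
 *	// word (a reader count, or FUTEX_RW_WRITE_LOCKED | the new owner's
 *	// lid, plus any surviving waiter bits) as computed in STEP 1
 *	// above, and wakes the chosen waiters.
 *	(void)__futex((int *)&rw->rw_word,
 *	    FUTEX_NETBSD_RW_HANDOFF | FUTEX_PRIVATE_FLAG,
 *	    v, NULL, NULL, 0, 0);
 */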
   2226  1.11.2.1   thorpej 
   2227  1.11.2.1   thorpej /*
   2228       1.1   thorpej  * do_futex(uaddr, op, val, timeout, uaddr2, val2, val3)
   2229       1.1   thorpej  *
   2230       1.1   thorpej  *	Implement the futex system call with all the parameters
   2231       1.1   thorpej  *	parsed out.
   2232       1.1   thorpej  */
   2233       1.1   thorpej int
   2234      1.11  riastrad do_futex(int *uaddr, int op, int val, const struct timespec *timeout,
   2235      1.11  riastrad     int *uaddr2, int val2, int val3, register_t *retval)
   2236       1.1   thorpej {
   2237       1.1   thorpej 	const bool shared = (op & FUTEX_PRIVATE_FLAG) ? false : true;
   2238       1.1   thorpej 	const clockid_t clkid = (op & FUTEX_CLOCK_REALTIME) ? CLOCK_REALTIME
   2239       1.1   thorpej 							    : CLOCK_MONOTONIC;
   2240  1.11.2.1   thorpej 	int error;
   2241       1.1   thorpej 
   2242       1.1   thorpej 	op &= FUTEX_CMD_MASK;
   2243       1.1   thorpej 
   2244       1.1   thorpej 	switch (op) {
   2245       1.1   thorpej 	case FUTEX_WAIT:
   2246  1.11.2.1   thorpej 		SDT_PROBE6(futex, func, wait, entry,
   2247  1.11.2.1   thorpej 		    uaddr, val, FUTEX_BITSET_MATCH_ANY, timeout,
   2248  1.11.2.1   thorpej 		    TIMER_RELTIME, op & ~FUTEX_CMD_MASK);
   2249  1.11.2.1   thorpej 		error = futex_func_wait(shared, uaddr, val,
   2250       1.1   thorpej 		    FUTEX_BITSET_MATCH_ANY, timeout, clkid, TIMER_RELTIME,
   2251       1.1   thorpej 		    retval);
   2252  1.11.2.1   thorpej 		SDT_PROBE1(futex, func, wait, exit, error);
   2253  1.11.2.1   thorpej 		break;
   2254  1.11.2.1   thorpej 
   2255  1.11.2.1   thorpej 	case FUTEX_WAIT_BITSET:
   2256  1.11.2.1   thorpej 		SDT_PROBE6(futex, func, wait, entry,
   2257  1.11.2.1   thorpej 		    uaddr, val, val3, timeout,
   2258  1.11.2.1   thorpej 		    TIMER_ABSTIME, op & ~FUTEX_CMD_MASK);
   2259  1.11.2.1   thorpej 		error = futex_func_wait(shared, uaddr, val, val3, timeout,
   2260  1.11.2.1   thorpej 		    clkid, TIMER_ABSTIME, retval);
   2261  1.11.2.1   thorpej 		SDT_PROBE1(futex, func, wait, exit, error);
   2262  1.11.2.1   thorpej 		break;
   2263       1.1   thorpej 
   2264       1.1   thorpej 	case FUTEX_WAKE:
   2265  1.11.2.1   thorpej 		SDT_PROBE4(futex, func, wake, entry,
   2266  1.11.2.1   thorpej 		    uaddr, val, FUTEX_BITSET_MATCH_ANY, op & ~FUTEX_CMD_MASK);
   2267  1.11.2.1   thorpej 		error = futex_func_wake(shared, uaddr, val,
   2268  1.11.2.1   thorpej 		    FUTEX_BITSET_MATCH_ANY, retval);
   2269  1.11.2.1   thorpej 		SDT_PROBE2(futex, func, wake, exit, error, *retval);
   2270  1.11.2.1   thorpej 		break;
   2271  1.11.2.1   thorpej 
   2272       1.1   thorpej 	case FUTEX_WAKE_BITSET:
   2273  1.11.2.1   thorpej 		SDT_PROBE4(futex, func, wake, entry,
   2274  1.11.2.1   thorpej 		    uaddr, val, val3, op & ~FUTEX_CMD_MASK);
   2275  1.11.2.1   thorpej 		error = futex_func_wake(shared, uaddr, val, val3, retval);
   2276  1.11.2.1   thorpej 		SDT_PROBE2(futex, func, wake, exit, error, *retval);
   2277  1.11.2.1   thorpej 		break;
   2278       1.1   thorpej 
   2279       1.1   thorpej 	case FUTEX_REQUEUE:
   2280  1.11.2.1   thorpej 		SDT_PROBE5(futex, func, requeue, entry,
   2281  1.11.2.1   thorpej 		    uaddr, val, uaddr2, val2, op & ~FUTEX_CMD_MASK);
   2282  1.11.2.1   thorpej 		error = futex_func_requeue(shared, op, uaddr, val, uaddr2,
   2283  1.11.2.1   thorpej 		    val2, 0, retval);
   2284  1.11.2.1   thorpej 		SDT_PROBE2(futex, func, requeue, exit, error, *retval);
   2285  1.11.2.1   thorpej 		break;
   2286  1.11.2.1   thorpej 
   2287       1.1   thorpej 	case FUTEX_CMP_REQUEUE:
   2288  1.11.2.1   thorpej 		SDT_PROBE6(futex, func, cmp_requeue, entry,
   2289  1.11.2.1   thorpej 		    uaddr, val, uaddr2, val2, val3, op & ~FUTEX_CMD_MASK);
   2290  1.11.2.1   thorpej 		error = futex_func_requeue(shared, op, uaddr, val, uaddr2,
   2291       1.1   thorpej 		    val2, val3, retval);
   2292  1.11.2.1   thorpej 		SDT_PROBE2(futex, func, cmp_requeue, exit, error, *retval);
   2293  1.11.2.1   thorpej 		break;
   2294       1.1   thorpej 
   2295       1.1   thorpej 	case FUTEX_WAKE_OP:
   2296  1.11.2.1   thorpej 		SDT_PROBE6(futex, func, wake_op, entry,
   2297  1.11.2.1   thorpej 		    uaddr, val, uaddr2, val2, val3, op & ~FUTEX_CMD_MASK);
   2298  1.11.2.1   thorpej 		error = futex_func_wake_op(shared, uaddr, val, uaddr2, val2,
   2299       1.1   thorpej 		    val3, retval);
   2300  1.11.2.1   thorpej 		SDT_PROBE2(futex, func, wake_op, exit, error, *retval);
   2301  1.11.2.1   thorpej 		break;
   2302  1.11.2.1   thorpej 
   2303  1.11.2.1   thorpej 	case FUTEX_NETBSD_RW_WAIT:
   2304  1.11.2.1   thorpej 		SDT_PROBE5(futex, func, rw_wait, entry,
   2305  1.11.2.1   thorpej 		    uaddr, val, val3, timeout, op & ~FUTEX_CMD_MASK);
   2306  1.11.2.1   thorpej 		error = futex_func_rw_wait(shared, uaddr, val, val3,
   2307  1.11.2.1   thorpej 		    timeout, clkid, retval);
   2308  1.11.2.1   thorpej 		SDT_PROBE1(futex, func, rw_wait, exit, error);
   2309  1.11.2.1   thorpej 		break;
   2310  1.11.2.1   thorpej 
   2311  1.11.2.1   thorpej 	case FUTEX_NETBSD_RW_HANDOFF:
   2312  1.11.2.1   thorpej 		SDT_PROBE3(futex, func, rw_handoff, entry,
   2313  1.11.2.1   thorpej 		    uaddr, val, op & ~FUTEX_CMD_MASK);
   2314  1.11.2.1   thorpej 		error = futex_func_rw_handoff(shared, uaddr, val, retval);
   2315  1.11.2.1   thorpej 		SDT_PROBE2(futex, func, rw_handoff, exit, error, *retval);
   2316  1.11.2.1   thorpej 		break;
   2317       1.1   thorpej 
   2318       1.1   thorpej 	case FUTEX_FD:
   2319       1.1   thorpej 	default:
   2320  1.11.2.1   thorpej 		error = ENOSYS;
   2321  1.11.2.1   thorpej 		break;
   2322       1.1   thorpej 	}
   2323  1.11.2.1   thorpej 
   2324  1.11.2.1   thorpej 	return error;
   2325       1.1   thorpej }
   2326       1.1   thorpej 
   2327       1.1   thorpej /*
   2328       1.1   thorpej  * sys___futex(l, uap, retval)
   2329       1.1   thorpej  *
   2330       1.1   thorpej  *	__futex(2) system call: generic futex operations.
   2331       1.1   thorpej  */
   2332       1.1   thorpej int
   2333       1.1   thorpej sys___futex(struct lwp *l, const struct sys___futex_args *uap,
   2334       1.1   thorpej     register_t *retval)
   2335       1.1   thorpej {
   2336       1.1   thorpej 	/* {
   2337       1.1   thorpej 		syscallarg(int *) uaddr;
   2338       1.1   thorpej 		syscallarg(int) op;
   2339       1.1   thorpej 		syscallarg(int) val;
   2340       1.1   thorpej 		syscallarg(const struct timespec *) timeout;
   2341       1.1   thorpej 		syscallarg(int *) uaddr2;
   2342       1.1   thorpej 		syscallarg(int) val2;
   2343       1.1   thorpej 		syscallarg(int) val3;
   2344       1.1   thorpej 	} */
   2345       1.1   thorpej 	struct timespec ts, *tsp;
   2346       1.1   thorpej 	int error;
   2347       1.1   thorpej 
   2348       1.1   thorpej 	/*
   2349       1.1   thorpej 	 * Copy in the timeout argument, if specified.
   2350       1.1   thorpej 	 */
   2351       1.1   thorpej 	if (SCARG(uap, timeout)) {
   2352       1.1   thorpej 		error = copyin(SCARG(uap, timeout), &ts, sizeof(ts));
   2353       1.1   thorpej 		if (error)
   2354       1.1   thorpej 			return error;
   2355       1.1   thorpej 		tsp = &ts;
   2356       1.1   thorpej 	} else {
   2357       1.1   thorpej 		tsp = NULL;
   2358       1.1   thorpej 	}
   2359       1.1   thorpej 
   2360       1.1   thorpej 	return do_futex(SCARG(uap, uaddr), SCARG(uap, op), SCARG(uap, val),
   2361       1.1   thorpej 	    tsp, SCARG(uap, uaddr2), SCARG(uap, val2), SCARG(uap, val3),
   2362       1.1   thorpej 	    retval);
   2363       1.1   thorpej }
   2364       1.1   thorpej 
   2365       1.1   thorpej /*
   2366       1.1   thorpej  * sys___futex_set_robust_list(l, uap, retval)
   2367       1.1   thorpej  *
   2368       1.1   thorpej  *	__futex_set_robust_list(2) system call for robust futexes.
   2369       1.1   thorpej  */
   2370       1.1   thorpej int
   2371       1.1   thorpej sys___futex_set_robust_list(struct lwp *l,
   2372       1.1   thorpej     const struct sys___futex_set_robust_list_args *uap, register_t *retval)
   2373       1.1   thorpej {
   2374       1.1   thorpej 	/* {
   2375       1.1   thorpej 		syscallarg(void *) head;
   2376       1.1   thorpej 		syscallarg(size_t) len;
   2377       1.1   thorpej 	} */
   2378       1.1   thorpej 	void *head = SCARG(uap, head);
   2379       1.1   thorpej 
   2380       1.1   thorpej 	if (SCARG(uap, len) != _FUTEX_ROBUST_HEAD_SIZE)
   2381       1.1   thorpej 		return EINVAL;
   2382       1.1   thorpej 	if ((uintptr_t)head % sizeof(u_long))
   2383       1.1   thorpej 		return EINVAL;
   2384       1.1   thorpej 
   2385       1.1   thorpej 	l->l_robust_head = (uintptr_t)head;
   2386       1.1   thorpej 
   2387       1.1   thorpej 	return 0;
   2388       1.1   thorpej }
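
/*
 * For illustration, a minimal sketch of how a userland thread library
 * might register its (per-LWP) robust list with the system call above.
 * The structure, the my_mutex type, and the __futex_set_robust_list()
 * stub shown here are hypothetical; the kernel only requires that the
 * head be u_long aligned, that the length equal _FUTEX_ROBUST_HEAD_SIZE,
 * and that its words line up with the
 * _FUTEX_ROBUST_HEAD_{LIST,OFFSET,PENDING} indices consumed by
 * futex_fetch_robust_head() and futex_release_all_lwp() below.
 *
 *	static __thread struct {
 *		uintptr_t	list;		// links of held robust futexes
 *		long		offset;		// list entry address to futex word address
 *		uintptr_t	pending;	// entry being acquired or released
 *	} head;
 *
 *	head.list = (uintptr_t)&head;		// empty list points back at the head
 *	head.offset = offsetof(struct my_mutex, m_futexword) -
 *	    offsetof(struct my_mutex, m_node);
 *	head.pending = 0;
 *	__futex_set_robust_list(&head, sizeof(head));
 */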
   2389       1.1   thorpej 
   2390       1.1   thorpej /*
   2391       1.1   thorpej  * sys___futex_get_robust_list(l, uap, retval)
   2392       1.1   thorpej  *
   2393       1.1   thorpej  *	__futex_get_robust_list(2) system call for robust futexes.
   2394       1.1   thorpej  */
   2395       1.1   thorpej int
   2396       1.1   thorpej sys___futex_get_robust_list(struct lwp *l,
   2397       1.1   thorpej     const struct sys___futex_get_robust_list_args *uap, register_t *retval)
   2398       1.1   thorpej {
   2399       1.1   thorpej 	/* {
   2400       1.1   thorpej 		syscallarg(lwpid_t) lwpid;
   2401       1.1   thorpej 		syscallarg(void **) headp;
   2402       1.1   thorpej 		syscallarg(size_t *) lenp;
   2403       1.1   thorpej 	} */
   2404       1.1   thorpej 	void *head;
   2405       1.1   thorpej 	const size_t len = _FUTEX_ROBUST_HEAD_SIZE;
   2406       1.1   thorpej 	int error;
   2407       1.1   thorpej 
   2408       1.1   thorpej 	error = futex_robust_head_lookup(l, SCARG(uap, lwpid), &head);
   2409       1.1   thorpej 	if (error)
   2410       1.1   thorpej 		return error;
   2411       1.1   thorpej 
   2412       1.1   thorpej 	/* Copy out the head pointer and the head structure length. */
   2413       1.1   thorpej 	error = copyout(&head, SCARG(uap, headp), sizeof(head));
   2414       1.1   thorpej 	if (__predict_true(error == 0)) {
   2415       1.1   thorpej 		error = copyout(&len, SCARG(uap, lenp), sizeof(len));
   2416       1.1   thorpej 	}
   2417       1.1   thorpej 
   2418       1.1   thorpej 	return error;
   2419       1.1   thorpej }
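
/*
 * Usage sketch for the call above (the __futex_get_robust_list() stub
 * name is an assumption): an introspection or debugging tool can fetch
 * the head registered by another LWP in the same process:
 *
 *	void *head;
 *	size_t len;
 *	if (__futex_get_robust_list(lwpid, &head, &len) == 0) {
 *		// head/len now describe that LWP's robust list head;
 *		// lwpid == 0 means the calling LWP itself.
 *	}
 */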
   2420       1.1   thorpej 
   2421       1.1   thorpej /*
    2422       1.1   thorpej  * release_futex(uptr, tid, is_pi, is_pending)
   2423       1.1   thorpej  *
    2424       1.1   thorpej  *	Try to release the robust futex at uptr in the current process
   2425       1.1   thorpej  *	on lwp exit.  If anything goes wrong, silently fail.  It is the
   2426       1.1   thorpej  *	userland program's obligation to arrange correct behaviour.
   2427       1.1   thorpej  */
   2428       1.1   thorpej static void
   2429       1.1   thorpej release_futex(uintptr_t const uptr, lwpid_t const tid, bool const is_pi,
   2430       1.1   thorpej     bool const is_pending)
   2431       1.1   thorpej {
   2432       1.1   thorpej 	int *uaddr;
   2433       1.1   thorpej 	struct futex *f;
   2434       1.1   thorpej 	int oldval, newval, actual;
   2435       1.1   thorpej 	int error;
   2436  1.11.2.1   thorpej 	bool shared;
   2437       1.1   thorpej 
   2438       1.1   thorpej 	/* If it's misaligned, tough.  */
   2439       1.1   thorpej 	if (__predict_false(uptr & 3))
   2440       1.1   thorpej 		return;
   2441       1.1   thorpej 	uaddr = (int *)uptr;
   2442       1.1   thorpej 
   2443       1.1   thorpej 	error = futex_load(uaddr, &oldval);
   2444       1.1   thorpej 	if (__predict_false(error))
   2445       1.1   thorpej 		return;
   2446       1.1   thorpej 
   2447       1.1   thorpej 	/*
   2448       1.1   thorpej 	 * There are two race conditions we need to handle here:
   2449       1.1   thorpej 	 *
   2450       1.1   thorpej 	 * 1. User space cleared the futex word but died before
   2451       1.1   thorpej 	 *    being able to issue the wakeup.  No wakeups will
   2452       1.1   thorpej 	 *    ever be issued, oops!
   2453       1.1   thorpej 	 *
   2454       1.1   thorpej 	 * 2. Awakened waiter died before being able to acquire
   2455       1.1   thorpej 	 *    the futex in user space.  Any other waiters are
   2456       1.1   thorpej 	 *    now stuck, oops!
   2457       1.1   thorpej 	 *
   2458       1.1   thorpej 	 * In both of these cases, the futex word will be 0 (because
   2459       1.1   thorpej 	 * it's updated before the wake is issued).  The best we can
   2460       1.1   thorpej 	 * do is detect this situation if it's the pending futex and
   2461       1.1   thorpej 	 * issue a wake without modifying the futex word.
   2462       1.1   thorpej 	 *
   2463       1.1   thorpej 	 * XXX eventual PI handling?
   2464       1.1   thorpej 	 */
   2465       1.1   thorpej 	if (__predict_false(is_pending && (oldval & ~FUTEX_WAITERS) == 0)) {
   2466       1.1   thorpej 		register_t retval;
   2467  1.11.2.1   thorpej 		error = futex_func_wake(/*shared*/true, uaddr, 1,
   2468       1.1   thorpej 		    FUTEX_BITSET_MATCH_ANY, &retval);
   2469  1.11.2.1   thorpej 		if (error != 0 || retval == 0)
   2470  1.11.2.1   thorpej 			(void) futex_func_wake(/*shared*/false, uaddr, 1,
   2471  1.11.2.1   thorpej 			    FUTEX_BITSET_MATCH_ANY, &retval);
   2472       1.1   thorpej 		return;
   2473       1.1   thorpej 	}
   2474       1.1   thorpej 
   2475       1.1   thorpej 	/* Optimistically test whether we need to do anything at all.  */
   2476       1.1   thorpej 	if ((oldval & FUTEX_TID_MASK) != tid)
   2477       1.1   thorpej 		return;
   2478       1.1   thorpej 
   2479       1.1   thorpej 	/*
   2480       1.1   thorpej 	 * We need to handle the case where this thread owned the futex,
   2481       1.1   thorpej 	 * but it was uncontended.  In this case, there won't be any
   2482       1.1   thorpej 	 * kernel state to look up.  All we can do is mark the futex
   2483       1.1   thorpej 	 * as a zombie to be mopped up the next time another thread
   2484       1.1   thorpej 	 * attempts to acquire it.
   2485       1.1   thorpej 	 *
    2486       1.1   thorpej 	 * N.B. It's important to ensure that we set FUTEX_OWNER_DIED in
    2487       1.1   thorpej 	 * this loop, even if waiters appear while we are doing
    2488       1.1   thorpej 	 * so.  This is because FUTEX_WAITERS is set by user space
   2489       1.1   thorpej 	 * before calling __futex() to wait, and the futex needs
   2490       1.1   thorpej 	 * to be marked as a zombie when the new waiter gets into
   2491       1.1   thorpej 	 * the kernel.
   2492       1.1   thorpej 	 */
   2493       1.1   thorpej 	if ((oldval & FUTEX_WAITERS) == 0) {
   2494       1.1   thorpej 		do {
   2495       1.1   thorpej 			error = futex_load(uaddr, &oldval);
   2496       1.1   thorpej 			if (error)
   2497       1.1   thorpej 				return;
   2498       1.1   thorpej 			if ((oldval & FUTEX_TID_MASK) != tid)
   2499       1.1   thorpej 				return;
   2500       1.1   thorpej 			newval = oldval | FUTEX_OWNER_DIED;
   2501       1.1   thorpej 			error = ucas_int(uaddr, oldval, newval, &actual);
   2502       1.1   thorpej 			if (error)
   2503       1.1   thorpej 				return;
   2504       1.1   thorpej 		} while (actual != oldval);
   2505       1.1   thorpej 
   2506       1.1   thorpej 		/*
    2507       1.1   thorpej 		 * If there is still no indication of waiters, then there is
   2508       1.1   thorpej 		 * no more work for us to do.
   2509       1.1   thorpej 		 */
   2510       1.1   thorpej 		if ((oldval & FUTEX_WAITERS) == 0)
   2511       1.1   thorpej 			return;
   2512       1.1   thorpej 	}
   2513       1.1   thorpej 
   2514       1.1   thorpej 	/*
   2515  1.11.2.1   thorpej 	 * Look for a futex.  Try shared first, then private.  If we
    2516  1.11.2.1   thorpej 	 * can't find one, tough.
   2517  1.11.2.1   thorpej 	 *
   2518  1.11.2.1   thorpej 	 * Note: the assumption here is that anyone placing a futex on
   2519  1.11.2.1   thorpej 	 * the robust list is adhering to the rules, regardless of the
   2520  1.11.2.1   thorpej 	 * futex class.
   2521       1.1   thorpej 	 */
   2522  1.11.2.1   thorpej 	for (f = NULL, shared = true; f == NULL; shared = false) {
   2523  1.11.2.1   thorpej 		error = futex_lookup(uaddr, shared, FUTEX_CLASS_ANY, &f);
   2524  1.11.2.1   thorpej 		if (error)
   2525  1.11.2.1   thorpej 			return;
   2526  1.11.2.1   thorpej 		if (f == NULL && shared == false)
   2527  1.11.2.1   thorpej 			return;
   2528  1.11.2.1   thorpej 	}
   2529       1.1   thorpej 
   2530  1.11.2.1   thorpej 	/* Work under the futex op lock.  */
   2531  1.11.2.1   thorpej 	futex_op_lock(f);
   2532       1.1   thorpej 
   2533       1.1   thorpej 	/*
   2534       1.1   thorpej 	 * Fetch the word: if the tid doesn't match ours, skip;
   2535       1.1   thorpej 	 * otherwise, set the owner-died bit, atomically.
   2536       1.1   thorpej 	 */
   2537       1.1   thorpej 	do {
   2538       1.1   thorpej 		error = futex_load(uaddr, &oldval);
   2539       1.1   thorpej 		if (error)
   2540       1.1   thorpej 			goto out;
   2541       1.1   thorpej 		if ((oldval & FUTEX_TID_MASK) != tid)
   2542       1.1   thorpej 			goto out;
   2543       1.1   thorpej 		newval = oldval | FUTEX_OWNER_DIED;
   2544       1.1   thorpej 		error = ucas_int(uaddr, oldval, newval, &actual);
   2545       1.1   thorpej 		if (error)
   2546       1.1   thorpej 			goto out;
   2547       1.1   thorpej 	} while (actual != oldval);
   2548       1.1   thorpej 
   2549       1.1   thorpej 	/*
   2550       1.1   thorpej 	 * If there may be waiters, try to wake one.  If anything goes
   2551       1.1   thorpej 	 * wrong, tough.
   2552       1.1   thorpej 	 *
   2553       1.1   thorpej 	 * XXX eventual PI handling?
   2554       1.1   thorpej 	 */
   2555  1.11.2.1   thorpej 	if (oldval & FUTEX_WAITERS) {
   2556  1.11.2.1   thorpej 		(void)futex_wake(f, FUTEX_WRITERQ, 1,
   2557  1.11.2.1   thorpej 				 NULL, FUTEX_WRITERQ, 0,
   2558  1.11.2.1   thorpej 				 FUTEX_BITSET_MATCH_ANY);
   2559  1.11.2.1   thorpej 	}
   2560       1.1   thorpej 
   2561       1.1   thorpej 	/* Unlock the queue and release the futex.  */
   2562  1.11.2.1   thorpej out:	futex_op_unlock(f);
   2563       1.5  riastrad 	futex_rele(f);
   2564       1.1   thorpej }
   2565       1.1   thorpej 
   2566       1.1   thorpej /*
   2567       1.1   thorpej  * futex_robust_head_lookup(l, lwpid)
   2568       1.1   thorpej  *
   2569       1.1   thorpej  *	Helper function to look up a robust head by LWP ID.
   2570       1.1   thorpej  */
   2571       1.1   thorpej int
   2572       1.1   thorpej futex_robust_head_lookup(struct lwp *l, lwpid_t lwpid, void **headp)
   2573       1.1   thorpej {
   2574       1.1   thorpej 	struct proc *p = l->l_proc;
   2575       1.1   thorpej 
   2576       1.1   thorpej 	/* Find the other lwp, if requested; otherwise use our robust head.  */
   2577       1.1   thorpej 	if (lwpid) {
   2578       1.1   thorpej 		mutex_enter(p->p_lock);
   2579       1.1   thorpej 		l = lwp_find(p, lwpid);
   2580       1.1   thorpej 		if (l == NULL) {
   2581       1.1   thorpej 			mutex_exit(p->p_lock);
   2582       1.1   thorpej 			return ESRCH;
   2583       1.1   thorpej 		}
   2584       1.1   thorpej 		*headp = (void *)l->l_robust_head;
   2585       1.1   thorpej 		mutex_exit(p->p_lock);
   2586       1.1   thorpej 	} else {
   2587       1.1   thorpej 		*headp = (void *)l->l_robust_head;
   2588       1.1   thorpej 	}
   2589       1.1   thorpej 	return 0;
   2590       1.1   thorpej }
   2591       1.1   thorpej 
   2592       1.1   thorpej /*
   2593       1.1   thorpej  * futex_fetch_robust_head(uaddr)
   2594       1.1   thorpej  *
   2595       1.1   thorpej  *	Helper routine to fetch the futex robust list head that
   2596       1.1   thorpej  *	handles 32-bit binaries running on 64-bit kernels.
   2597       1.1   thorpej  */
   2598       1.1   thorpej static int
   2599       1.1   thorpej futex_fetch_robust_head(uintptr_t uaddr, u_long *rhead)
   2600       1.1   thorpej {
   2601       1.1   thorpej #ifdef _LP64
   2602       1.1   thorpej 	if (curproc->p_flag & PK_32) {
   2603       1.1   thorpej 		uint32_t rhead32[_FUTEX_ROBUST_HEAD_NWORDS];
   2604       1.1   thorpej 		int error;
   2605       1.1   thorpej 
   2606       1.1   thorpej 		error = copyin((void *)uaddr, rhead32, sizeof(rhead32));
   2607       1.1   thorpej 		if (__predict_true(error == 0)) {
   2608       1.1   thorpej 			for (int i = 0; i < _FUTEX_ROBUST_HEAD_NWORDS; i++) {
   2609       1.1   thorpej 				if (i == _FUTEX_ROBUST_HEAD_OFFSET) {
   2610       1.1   thorpej 					/*
   2611       1.1   thorpej 					 * Make sure the offset is sign-
   2612       1.1   thorpej 					 * extended.
   2613       1.1   thorpej 					 */
   2614       1.1   thorpej 					rhead[i] = (int32_t)rhead32[i];
   2615       1.1   thorpej 				} else {
   2616       1.1   thorpej 					rhead[i] = rhead32[i];
   2617       1.1   thorpej 				}
   2618       1.1   thorpej 			}
   2619       1.1   thorpej 		}
   2620       1.1   thorpej 		return error;
   2621       1.1   thorpej 	}
    2622       1.1   thorpej #endif /* _LP64 */
   2623       1.1   thorpej 
   2624       1.1   thorpej 	return copyin((void *)uaddr, rhead,
   2625       1.1   thorpej 	    sizeof(*rhead) * _FUTEX_ROBUST_HEAD_NWORDS);
   2626       1.1   thorpej }
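
/*
 * The sign extension of _FUTEX_ROBUST_HEAD_OFFSET above matters because
 * the recorded offset may be negative.  For example, with a hypothetical
 * userland layout in which the futex word precedes the list entry, a
 * 32-bit binary stores the offset as a negative 32-bit value that must
 * widen correctly on an _LP64 kernel:
 *
 *	struct my_mutex {
 *		volatile uint32_t	m_futexword;	// the 32-bit futex word
 *		uintptr_t		m_node;		// robust list link word
 *	};
 *	// offset = offsetof(struct my_mutex, m_futexword) -
 *	//          offsetof(struct my_mutex, m_node), which is negative
 *
 * futex_release_all_lwp() below then hands (entry + offset) to
 * release_futex() to locate the futex word for each entry.
 */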
   2627       1.1   thorpej 
   2628       1.1   thorpej /*
   2629       1.1   thorpej  * futex_decode_robust_word(word)
   2630       1.1   thorpej  *
   2631       1.1   thorpej  *	Decode a robust futex list word into the entry and entry
   2632       1.1   thorpej  *	properties.
   2633       1.1   thorpej  */
   2634       1.1   thorpej static inline void
   2635       1.1   thorpej futex_decode_robust_word(uintptr_t const word, uintptr_t * const entry,
   2636       1.1   thorpej     bool * const is_pi)
   2637       1.1   thorpej {
   2638       1.1   thorpej 	*is_pi = (word & _FUTEX_ROBUST_ENTRY_PI) ? true : false;
   2639       1.1   thorpej 	*entry = word & ~_FUTEX_ROBUST_ENTRY_PI;
   2640       1.1   thorpej }
   2641       1.1   thorpej 
   2642       1.1   thorpej /*
   2643       1.1   thorpej  * futex_fetch_robust_entry(uaddr)
   2644       1.1   thorpej  *
   2645       1.1   thorpej  *	Helper routine to fetch and decode a robust futex entry
   2646       1.1   thorpej  *	that handles 32-bit binaries running on 64-bit kernels.
   2647       1.1   thorpej  */
   2648       1.1   thorpej static int
   2649       1.1   thorpej futex_fetch_robust_entry(uintptr_t const uaddr, uintptr_t * const valp,
   2650       1.1   thorpej     bool * const is_pi)
   2651       1.1   thorpej {
   2652       1.1   thorpej 	uintptr_t val = 0;
   2653       1.1   thorpej 	int error = 0;
   2654       1.1   thorpej 
   2655       1.1   thorpej #ifdef _LP64
   2656       1.1   thorpej 	if (curproc->p_flag & PK_32) {
   2657       1.1   thorpej 		uint32_t val32;
   2658       1.1   thorpej 
   2659       1.1   thorpej 		error = ufetch_32((uint32_t *)uaddr, &val32);
   2660       1.1   thorpej 		if (__predict_true(error == 0))
   2661       1.1   thorpej 			val = val32;
   2662       1.1   thorpej 	} else
   2663       1.1   thorpej #endif /* _LP64 */
   2664       1.1   thorpej 		error = ufetch_long((u_long *)uaddr, (u_long *)&val);
   2665       1.1   thorpej 	if (__predict_false(error))
   2666       1.1   thorpej 		return error;
   2667       1.1   thorpej 
   2668       1.1   thorpej 	futex_decode_robust_word(val, valp, is_pi);
   2669       1.1   thorpej 	return 0;
   2670       1.1   thorpej }
   2671       1.1   thorpej 
   2672       1.1   thorpej /*
   2673       1.1   thorpej  * futex_release_all_lwp(l, tid)
   2674       1.1   thorpej  *
   2675       1.1   thorpej  *	Release all l's robust futexes.  If anything looks funny in
   2676       1.1   thorpej  *	the process, give up -- it's userland's responsibility to dot
   2677       1.1   thorpej  *	the i's and cross the t's.
   2678       1.1   thorpej  */
   2679       1.1   thorpej void
   2680       1.1   thorpej futex_release_all_lwp(struct lwp * const l, lwpid_t const tid)
   2681       1.1   thorpej {
   2682       1.1   thorpej 	u_long rhead[_FUTEX_ROBUST_HEAD_NWORDS];
   2683       1.1   thorpej 	int limit = 1000000;
   2684       1.1   thorpej 	int error;
   2685       1.1   thorpej 
   2686       1.1   thorpej 	/* If there's no robust list there's nothing to do. */
   2687       1.1   thorpej 	if (l->l_robust_head == 0)
   2688       1.1   thorpej 		return;
   2689       1.1   thorpej 
   2690       1.1   thorpej 	/* Read the final snapshot of the robust list head. */
   2691       1.1   thorpej 	error = futex_fetch_robust_head(l->l_robust_head, rhead);
   2692       1.1   thorpej 	if (error) {
   2693       1.1   thorpej 		printf("WARNING: pid %jd (%s) lwp %jd tid %jd:"
   2694       1.1   thorpej 		    " unmapped robust futex list head\n",
   2695       1.1   thorpej 		    (uintmax_t)l->l_proc->p_pid, l->l_proc->p_comm,
   2696       1.1   thorpej 		    (uintmax_t)l->l_lid, (uintmax_t)tid);
   2697       1.1   thorpej 		return;
   2698       1.1   thorpej 	}
   2699       1.1   thorpej 
   2700       1.1   thorpej 	const long offset = (long)rhead[_FUTEX_ROBUST_HEAD_OFFSET];
   2701       1.1   thorpej 
   2702       1.1   thorpej 	uintptr_t next, pending;
   2703       1.1   thorpej 	bool is_pi, pending_is_pi;
   2704       1.1   thorpej 
   2705       1.1   thorpej 	futex_decode_robust_word(rhead[_FUTEX_ROBUST_HEAD_LIST],
   2706       1.1   thorpej 	    &next, &is_pi);
   2707       1.1   thorpej 	futex_decode_robust_word(rhead[_FUTEX_ROBUST_HEAD_PENDING],
   2708       1.1   thorpej 	    &pending, &pending_is_pi);
   2709       1.1   thorpej 
   2710       1.1   thorpej 	/*
   2711       1.1   thorpej 	 * Walk down the list of locked futexes and release them, up
   2712       1.1   thorpej 	 * to one million of them before we give up.
   2713       1.1   thorpej 	 */
   2714       1.1   thorpej 
   2715       1.1   thorpej 	while (next != l->l_robust_head && limit-- > 0) {
   2716       1.1   thorpej 		/* pending handled below. */
   2717       1.1   thorpej 		if (next != pending)
   2718       1.1   thorpej 			release_futex(next + offset, tid, is_pi, false);
   2719       1.1   thorpej 		error = futex_fetch_robust_entry(next, &next, &is_pi);
   2720       1.1   thorpej 		if (error)
   2721       1.1   thorpej 			break;
   2722       1.1   thorpej 		preempt_point();
   2723       1.1   thorpej 	}
   2724       1.1   thorpej 	if (limit <= 0) {
   2725       1.1   thorpej 		printf("WARNING: pid %jd (%s) lwp %jd tid %jd:"
   2726       1.1   thorpej 		    " exhausted robust futex limit\n",
   2727       1.1   thorpej 		    (uintmax_t)l->l_proc->p_pid, l->l_proc->p_comm,
   2728       1.1   thorpej 		    (uintmax_t)l->l_lid, (uintmax_t)tid);
   2729       1.1   thorpej 	}
   2730       1.1   thorpej 
   2731       1.1   thorpej 	/* If there's a pending futex, it may need to be released too. */
   2732       1.1   thorpej 	if (pending != 0) {
   2733       1.1   thorpej 		release_futex(pending + offset, tid, pending_is_pi, true);
   2734       1.1   thorpej 	}
   2735       1.1   thorpej }
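
/*
 * For context, a sketch of the userland discipline that the walk above
 * assumes.  The helper names are hypothetical; this is one way a thread
 * library might maintain the list, not a specification:
 *
 *	// acquire
 *	head.pending = (uintptr_t)&m->m_node;	// publish intent first
 *	lock_futex_word(m);			// CAS our tid into the word
 *	link_after_head(&head, &m->m_node);	// now on the list proper
 *	head.pending = 0;
 *
 *	// release
 *	head.pending = (uintptr_t)&m->m_node;
 *	unlink_from_list(&head, &m->m_node);
 *	unlock_futex_word(m);			// clear the word, then wake
 *	head.pending = 0;
 *
 * Keeping the entry in the pending slot across both windows is what
 * allows release_futex() to handle the word == 0 races described there:
 * an owner that died after clearing the word but before issuing the
 * wake, and an awakened waiter that died before re-acquiring the lock
 * in userland.
 */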
   2736