Home | History | Annotate | Line # | Download | only in common
linux_futex.c revision 1.33
      1 /*	$NetBSD: linux_futex.c,v 1.32 2013/10/17 21:08:16 christos Exp $ */
      2 
      3 /*-
      4  * Copyright (c) 2005 Emmanuel Dreyfus, all rights reserved.
      5  *
      6  * Redistribution and use in source and binary forms, with or without
      7  * modification, are permitted provided that the following conditions
      8  * are met:
      9  * 1. Redistributions of source code must retain the above copyright
     10  *    notice, this list of conditions and the following disclaimer.
     11  * 2. Redistributions in binary form must reproduce the above copyright
     12  *    notice, this list of conditions and the following disclaimer in the
     13  *    documentation and/or other materials provided with the distribution.
     14  * 3. All advertising materials mentioning features or use of this software
     15  *    must display the following acknowledgement:
     16  *	This product includes software developed by Emmanuel Dreyfus
     17  * 4. The name of the author may not be used to endorse or promote
     18  *    products derived from this software without specific prior written
     19  *    permission.
     20  *
     21  * THIS SOFTWARE IS PROVIDED BY THE THE AUTHOR AND CONTRIBUTORS ``AS IS''
     22  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
     23  * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     24  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS
     25  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
     26  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
     27  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
     28  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
     29  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
     30  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
     31  * POSSIBILITY OF SUCH DAMAGE.
     32  */
     33 
     34 #include <sys/cdefs.h>
     35 __KERNEL_RCSID(1, "$NetBSD: linux_futex.c,v 1.32 2013/10/17 21:08:16 christos Exp $");
     36 
     37 #include <sys/param.h>
     38 #include <sys/time.h>
     39 #include <sys/systm.h>
     40 #include <sys/proc.h>
     41 #include <sys/lwp.h>
     42 #include <sys/queue.h>
     43 #include <sys/condvar.h>
     44 #include <sys/mutex.h>
     45 #include <sys/kmem.h>
     46 #include <sys/kernel.h>
     47 #include <sys/atomic.h>
     48 
     49 #include <compat/linux/common/linux_types.h>
     50 #include <compat/linux/common/linux_emuldata.h>
     51 #include <compat/linux/common/linux_exec.h>
     52 #include <compat/linux/common/linux_signal.h>
     53 #include <compat/linux/common/linux_futex.h>
     54 #include <compat/linux/common/linux_sched.h>
     55 #include <compat/linux/common/linux_machdep.h>
     56 #include <compat/linux/linux_syscallargs.h>
     57 
struct futex;

/*
 * One thread blocked on a futex.  Allocated by the sleeping thread
 * itself (futex_wp_alloc()) and linked onto the futex's wait queue
 * for the duration of the sleep.
 */
struct waiting_proc {
	lwp_t *wp_l;			/* the sleeping LWP */
	struct futex *wp_new_futex;	/* set by futex_wake() to request requeue */
	kcondvar_t wp_futex_cv;		/* per-thread wakeup condvar */
	TAILQ_ENTRY(waiting_proc) wp_list;	/* entry on f_waiting_proc */
	TAILQ_ENTRY(waiting_proc) wp_rqlist;	/* entry on f_requeue_proc */
};
/*
 * In-kernel futex object, keyed solely by user-space address.
 * Reference counted: one reference per futex_get()/futex_ref(),
 * dropped by futex_put(); freed when the count reaches zero.
 */
struct futex {
	void *f_uaddr;		/* user-space address identifying this futex */
	int f_refcount;		/* references; protected by futex_lock */
	uint32_t f_bitset;	/* wait bitset (not honoured for matching, see
				 * the comment in linux_do_futex()) */
	LIST_ENTRY(futex) f_list;	/* entry on the global futex_list */
	TAILQ_HEAD(, waiting_proc) f_waiting_proc;	/* threads asleep here */
	TAILQ_HEAD(, waiting_proc) f_requeue_proc;	/* threads being requeued here */
};

/* Global table of active futexes and the mutex protecting it. */
static LIST_HEAD(futex_list, futex) futex_list;
static kmutex_t futex_lock;

#define FUTEX_LOCK	mutex_enter(&futex_lock)
#define FUTEX_UNLOCK	mutex_exit(&futex_lock)
#define FUTEX_LOCKASSERT	KASSERT(mutex_owned(&futex_lock))

/* Serializes whole futex operations against each other (big lock). */
#define FUTEX_SYSTEM_LOCK	KERNEL_LOCK(1, NULL)
#define FUTEX_SYSTEM_UNLOCK	KERNEL_UNLOCK_ONE(0)

#ifdef DEBUG_LINUX_FUTEX
int debug_futex = 1;
#define FUTEXPRINTF(a) do { if (debug_futex) printf a; } while (0)
#else
#define FUTEXPRINTF(a)
#endif
     92 
/* One-time module initialization: set up the global futex mutex. */
void
linux_futex_init(void)
{
	FUTEXPRINTF(("%s: initializing futex\n", __func__));
	mutex_init(&futex_lock, MUTEX_DEFAULT, IPL_NONE);
}
     99 
/* Module teardown: destroy the global futex mutex. */
void
linux_futex_fini(void)
{
	FUTEXPRINTF(("%s: destroying futex\n", __func__));
	mutex_destroy(&futex_lock);
}
    106 
/* Forward declarations for the internal futex machinery below. */
static struct waiting_proc *futex_wp_alloc(void);
static void futex_wp_free(struct waiting_proc *);
static struct futex *futex_get(void *, uint32_t);
static void futex_ref(struct futex *);
static void futex_put(struct futex *);
static int futex_sleep(struct futex **, lwp_t *, int, struct waiting_proc *);
static int futex_wake(struct futex *, int, struct futex *, int);
static int futex_atomic_op(lwp_t *, int, void *);
    115 
    116 int
    117 linux_sys_futex(struct lwp *l, const struct linux_sys_futex_args *uap, register_t *retval)
    118 {
    119 	/* {
    120 		syscallarg(int *) uaddr;
    121 		syscallarg(int) op;
    122 		syscallarg(int) val;
    123 		syscallarg(const struct linux_timespec *) timeout;
    124 		syscallarg(int *) uaddr2;
    125 		syscallarg(int) val3;
    126 	} */
    127 	struct linux_timespec lts;
    128 	struct timespec ts = { 0, 0 };
    129 	int error;
    130 
    131 	if ((SCARG(uap, op) & LINUX_FUTEX_CMD_MASK) == LINUX_FUTEX_WAIT &&
    132 	    SCARG(uap, timeout) != NULL) {
    133 		if ((error = copyin(SCARG(uap, timeout),
    134 		    &lts, sizeof(lts))) != 0) {
    135 			return error;
    136 		}
    137 		linux_to_native_timespec(&ts, &lts);
    138 	}
    139 	return linux_do_futex(l, uap, retval, &ts);
    140 }
    141 
/*
 * Core dispatcher for the Linux futex(2) operations.  *ts is the
 * already-converted (relative) timeout, {0,0} when none was given.
 * Locking: FUTEX_SYSTEM_LOCK (kernel lock) serializes whole
 * operations; futex_lock protects the futex table and wait queues.
 */
int
linux_do_futex(struct lwp *l, const struct linux_sys_futex_args *uap, register_t *retval, struct timespec *ts)
{
	/* {
		syscallarg(int *) uaddr;
		syscallarg(int) op;
		syscallarg(int) val;
		syscallarg(const struct linux_timespec *) timeout;
		syscallarg(int *) uaddr2;
		syscallarg(int) val3;
	} */
	int val, val3;
	int ret;
	int error = 0;
	struct futex *f;
	struct futex *newf;
	int tout;
	struct futex *f2;
	struct waiting_proc *wp;
	int op_ret, cmd;
	clockid_t clk;

	cmd = SCARG(uap, op) & LINUX_FUTEX_CMD_MASK;
	val3 = SCARG(uap, val3);

	/*
	 * FUTEX_CLOCK_REALTIME is only valid for the WAIT variants;
	 * all other commands time out on the monotonic clock.
	 */
	if (SCARG(uap, op) & LINUX_FUTEX_CLOCK_REALTIME) {
		switch (cmd) {
		case LINUX_FUTEX_WAIT_BITSET:
		case LINUX_FUTEX_WAIT:
			clk = CLOCK_REALTIME;
			break;
		default:
			return ENOSYS;
		}
	} else
		clk = CLOCK_MONOTONIC;

	/*
	 * Our implementation provides only private futexes. Most of the apps
	 * should use private futexes but don't claim so. Therefore we treat
	 * all futexes as private by clearing the FUTEX_PRIVATE_FLAG. It works
	 * in most cases (ie. when futexes are not shared on file descriptor
	 * or between different processes).
	 *
	 * Note that we don't handle bitsets at all at the moment. We need
	 * to move from refcounting uaddr's to handling multiple futex entries
	 * pointing to the same uaddr, but having possibly different bitmask.
	 * Perhaps move to an implementation where each uaddr has a list of
	 * futexes.
	 */
	switch (cmd) {
	case LINUX_FUTEX_WAIT:
		/* A plain WAIT is a WAIT_BITSET with an all-ones bitset. */
		val3 = FUTEX_BITSET_MATCH_ANY;
		/*FALLTHROUGH*/
	case LINUX_FUTEX_WAIT_BITSET:
		/* Convert *ts to a tick count; tout == 0 sleeps forever. */
		if ((error = ts2timo(clk, 0, ts, &tout, NULL)) != 0) {
			if (error != ETIMEDOUT)
				return error;
			/*
			 * If the user process requests a non null timeout,
			 * make sure we do not turn it into an infinite
			 * timeout because tout is 0.
			 *
			 * We use a minimal timeout of 1/hz. Maybe it would make
			 * sense to just return ETIMEDOUT without sleeping.
			 */
			if (SCARG(uap, timeout) != NULL)
				tout = 1;
			else
				tout = 0;
		}
		FUTEX_SYSTEM_LOCK;
		/* Re-read the futex word and verify the expected value. */
		if ((error = copyin(SCARG(uap, uaddr),
		    &val, sizeof(val))) != 0) {
			FUTEX_SYSTEM_UNLOCK;
			return error;
		}

		if (val != SCARG(uap, val)) {
			/* Value changed under us: caller must retry. */
			FUTEX_SYSTEM_UNLOCK;
			return EWOULDBLOCK;
		}

		FUTEXPRINTF(("FUTEX_WAIT %d.%d: val = %d, uaddr = %p, "
		    "*uaddr = %d, timeout = %lld.%09ld\n",
		    l->l_proc->p_pid, l->l_lid, SCARG(uap, val),
		    SCARG(uap, uaddr), val, (long long)ts->tv_sec,
		    ts->tv_nsec));


		/* Allocate the wait record before taking futex_lock. */
		wp = futex_wp_alloc();
		FUTEX_LOCK;
		f = futex_get(SCARG(uap, uaddr), val3);
		/* futex_sleep() may replace f if we get requeued. */
		ret = futex_sleep(&f, l, tout, wp);
		futex_put(f);
		FUTEX_UNLOCK;
		futex_wp_free(wp);

		FUTEXPRINTF(("FUTEX_WAIT %d.%d: uaddr = %p, "
		    "ret = %d\n", l->l_proc->p_pid, l->l_lid,
		    SCARG(uap, uaddr), ret));

		FUTEX_SYSTEM_UNLOCK;
		/* Map cv_timedwait_sig() results to Linux errnos. */
		switch (ret) {
		case EWOULDBLOCK:	/* timeout */
			return ETIMEDOUT;
			break;
		case EINTR:		/* signal */
			return EINTR;
			break;
		case 0:			/* FUTEX_WAKE received */
			FUTEXPRINTF(("FUTEX_WAIT %d.%d: uaddr = %p, got it\n",
			    l->l_proc->p_pid, l->l_lid, SCARG(uap, uaddr)));
			return 0;
			break;
		default:
			FUTEXPRINTF(("FUTEX_WAIT: unexpected ret = %d\n", ret));
			break;
		}

		/* NOTREACHED */
		break;

	case LINUX_FUTEX_WAKE:
		val = FUTEX_BITSET_MATCH_ANY;
		/*FALLTHROUGH*/
	case LINUX_FUTEX_WAKE_BITSET:
		/*
		 * XXX: Linux is able cope with different addresses
		 * corresponding to the same mapped memory in the sleeping
		 * and the waker process(es).
		 */
		FUTEXPRINTF(("FUTEX_WAKE %d.%d: uaddr = %p, val = %d\n",
		    l->l_proc->p_pid, l->l_lid,
		    SCARG(uap, uaddr), SCARG(uap, val)));

		FUTEX_SYSTEM_LOCK;
		FUTEX_LOCK;
		f = futex_get(SCARG(uap, uaddr), val3);
		*retval = futex_wake(f, SCARG(uap, val), NULL, 0);
		futex_put(f);
		FUTEX_UNLOCK;
		FUTEX_SYSTEM_UNLOCK;

		break;

	case LINUX_FUTEX_CMP_REQUEUE:
		FUTEX_SYSTEM_LOCK;

		/* CMP_REQUEUE checks *uaddr against val3 first. */
		if ((error = copyin(SCARG(uap, uaddr),
		    &val, sizeof(val))) != 0) {
			FUTEX_SYSTEM_UNLOCK;
			return error;
		}

		if (val != val3) {
			FUTEX_SYSTEM_UNLOCK;
			return EAGAIN;
		}

		FUTEXPRINTF(("FUTEX_CMP_REQUEUE %d.%d: uaddr = %p, val = %d, "
		    "uaddr2 = %p, val2 = %d\n",
		    l->l_proc->p_pid, l->l_lid,
		    SCARG(uap, uaddr), SCARG(uap, val), SCARG(uap, uaddr2),
		    (int)(unsigned long)SCARG(uap, timeout)));

		/* Wake val waiters; requeue up to "timeout" more to uaddr2. */
		FUTEX_LOCK;
		f = futex_get(SCARG(uap, uaddr), val3);
		newf = futex_get(SCARG(uap, uaddr2), val3);
		*retval = futex_wake(f, SCARG(uap, val), newf,
		    (int)(unsigned long)SCARG(uap, timeout));
		futex_put(f);
		futex_put(newf);
		FUTEX_UNLOCK;

		FUTEX_SYSTEM_UNLOCK;
		break;

	case LINUX_FUTEX_REQUEUE:
		/* Like CMP_REQUEUE, but without the *uaddr == val3 check. */
		FUTEX_SYSTEM_LOCK;

		FUTEXPRINTF(("FUTEX_REQUEUE %d.%d: uaddr = %p, val = %d, "
		    "uaddr2 = %p, val2 = %d\n",
		    l->l_proc->p_pid, l->l_lid,
		    SCARG(uap, uaddr), SCARG(uap, val), SCARG(uap, uaddr2),
		    (int)(unsigned long)SCARG(uap, timeout)));

		FUTEX_LOCK;
		f = futex_get(SCARG(uap, uaddr), val3);
		newf = futex_get(SCARG(uap, uaddr2), val3);
		*retval = futex_wake(f, SCARG(uap, val), newf,
		    (int)(unsigned long)SCARG(uap, timeout));
		futex_put(f);
		futex_put(newf);
		FUTEX_UNLOCK;

		FUTEX_SYSTEM_UNLOCK;
		break;

	case LINUX_FUTEX_FD:
		FUTEXPRINTF(("%s: unimplemented op %d\n", __func__, cmd));
		return ENOSYS;
	case LINUX_FUTEX_WAKE_OP:
		FUTEX_SYSTEM_LOCK;

		FUTEXPRINTF(("FUTEX_WAKE_OP %d.%d: uaddr = %p, op = %d, "
		    "val = %d, uaddr2 = %p, val2 = %d\n",
		    l->l_proc->p_pid, l->l_lid,
		    SCARG(uap, uaddr), cmd, SCARG(uap, val),
		    SCARG(uap, uaddr2),
		    (int)(unsigned long)SCARG(uap, timeout)));

		/*
		 * Take references on both futexes, then drop futex_lock
		 * while touching user memory (futex_atomic_op() may fault).
		 */
		FUTEX_LOCK;
		f = futex_get(SCARG(uap, uaddr), val3);
		f2 = futex_get(SCARG(uap, uaddr2), val3);
		FUTEX_UNLOCK;

		/*
		 * This function returns positive number as results and
		 * negative as errors
		 */
		op_ret = futex_atomic_op(l, val3, SCARG(uap, uaddr2));
		FUTEX_LOCK;
		if (op_ret < 0) {
			futex_put(f);
			futex_put(f2);
			FUTEX_UNLOCK;
			FUTEX_SYSTEM_UNLOCK;
			return -op_ret;
		}

		/* Always wake uaddr; wake uaddr2 only if the op test held. */
		ret = futex_wake(f, SCARG(uap, val), NULL, 0);
		futex_put(f);
		if (op_ret > 0) {
			op_ret = 0;
			/*
			 * Linux abuses the address of the timespec parameter
			 * as the number of retries
			 */
			op_ret += futex_wake(f2,
			    (int)(unsigned long)SCARG(uap, timeout), NULL, 0);
			ret += op_ret;
		}
		futex_put(f2);
		FUTEX_UNLOCK;
		FUTEX_SYSTEM_UNLOCK;
		*retval = ret;
		break;
	default:
		FUTEXPRINTF(("%s: unknown op %d\n", __func__, cmd));
		return ENOSYS;
	}
	return 0;
}
    396 
    397 static struct waiting_proc *
    398 futex_wp_alloc(void)
    399 {
    400 	struct waiting_proc *wp;
    401 
    402 	wp = kmem_zalloc(sizeof(*wp), KM_SLEEP);
    403 	cv_init(&wp->wp_futex_cv, "futex");
    404 	return wp;
    405 }
    406 
    407 static void
    408 futex_wp_free(struct waiting_proc *wp)
    409 {
    410 
    411 	cv_destroy(&wp->wp_futex_cv);
    412 	kmem_free(wp, sizeof(*wp));
    413 }
    414 
    415 static struct futex *
    416 futex_get(void *uaddr, uint32_t bitset)
    417 {
    418 	struct futex *f;
    419 
    420 	FUTEX_LOCKASSERT;
    421 
    422 	LIST_FOREACH(f, &futex_list, f_list) {
    423 		if (f->f_uaddr == uaddr) {
    424 			f->f_refcount++;
    425 			return f;
    426 		}
    427 	}
    428 
    429 	/* Not found, create it */
    430 	f = kmem_zalloc(sizeof(*f), KM_SLEEP);
    431 	f->f_uaddr = uaddr;
    432 	f->f_bitset = bitset;
    433 	f->f_refcount = 1;
    434 	TAILQ_INIT(&f->f_waiting_proc);
    435 	TAILQ_INIT(&f->f_requeue_proc);
    436 	LIST_INSERT_HEAD(&futex_list, f, f_list);
    437 
    438 	return f;
    439 }
    440 
    441 static void
    442 futex_ref(struct futex *f)
    443 {
    444 
    445 	FUTEX_LOCKASSERT;
    446 
    447 	f->f_refcount++;
    448 }
    449 
    450 static void
    451 futex_put(struct futex *f)
    452 {
    453 
    454 	FUTEX_LOCKASSERT;
    455 
    456 	f->f_refcount--;
    457 	if (f->f_refcount == 0) {
    458 		KASSERT(TAILQ_EMPTY(&f->f_waiting_proc));
    459 		KASSERT(TAILQ_EMPTY(&f->f_requeue_proc));
    460 		LIST_REMOVE(f, f_list);
    461 		kmem_free(f, sizeof(*f));
    462 	}
    463 }
    464 
/*
 * Block the calling LWP on futex *fp until woken by futex_wake() or
 * until the tick timeout expires (0 = sleep forever).  Called and
 * returns with futex_lock held (cv_timedwait_sig() drops it while
 * asleep).  If the waker requeued us, *fp is replaced by the target
 * futex (ownership of the reference taken by the waker passes to our
 * caller) and we go back to sleep on it.  Returns 0 on wakeup,
 * EWOULDBLOCK on timeout, or the signal-related cv error.
 */
static int
futex_sleep(struct futex **fp, lwp_t *l, int timeout, struct waiting_proc *wp)
{
	struct futex *f, *newf;
	int ret;

	FUTEX_LOCKASSERT;

	f = *fp;
	wp->wp_l = l;
	wp->wp_new_futex = NULL;

requeue:
	TAILQ_INSERT_TAIL(&f->f_waiting_proc, wp, wp_list);
	ret = cv_timedwait_sig(&wp->wp_futex_cv, &futex_lock, timeout);
	TAILQ_REMOVE(&f->f_waiting_proc, wp, wp_list);

	/* if futex_wake() tells us to requeue ... */
	newf = wp->wp_new_futex;
	if (ret == 0 && newf != NULL) {
		/* ... requeue ourselves on the new futex */
		futex_put(f);
		wp->wp_new_futex = NULL;
		TAILQ_REMOVE(&newf->f_requeue_proc, wp, wp_rqlist);
		*fp = f = newf;
		goto requeue;
	}
	return ret;
}
    494 
/*
 * Wake up to n threads sleeping on futex f; when newf is non-NULL,
 * transfer (requeue) up to n2 further waiters onto newf instead of
 * waking them (REQUEUE/CMP_REQUEUE semantics).  Caller holds
 * futex_lock.  Returns the running count used for the wake/requeue
 * accounting.
 *
 * NOTE(review): count starts at 1 in the plain-wake case, so the
 * value returned when the queue is empty is 1 (Linux returns the
 * number actually woken), and a wake with n == 0 still signals one
 * waiter because cv_signal() happens before the count test -- confirm
 * whether callers depend on the exact return value.
 */
static int
futex_wake(struct futex *f, int n, struct futex *newf, int n2)
{
	struct waiting_proc *wp, *wpnext;
	int count;

	FUTEX_LOCKASSERT;

	count = newf ? 0 : 1;

	/*
	 * first, wake up any threads sleeping on this futex.
	 * note that sleeping threads are not in the process of requeueing.
	 */

	TAILQ_FOREACH(wp, &f->f_waiting_proc, wp_list) {
		KASSERT(wp->wp_new_futex == NULL);

		FUTEXPRINTF(("%s: signal f %p l %p ref %d\n", __func__,
		    f, wp->wp_l, f->f_refcount));
		cv_signal(&wp->wp_futex_cv);
		if (count <= n) {
			count++;
		} else {
			if (newf == NULL)
				break;

			/* matching futex_put() is called by the other thread. */
			futex_ref(newf);
			wp->wp_new_futex = newf;
			TAILQ_INSERT_TAIL(&newf->f_requeue_proc, wp, wp_rqlist);
			FUTEXPRINTF(("%s: requeue newf %p l %p ref %d\n",
			    __func__, newf, wp->wp_l, newf->f_refcount));
			if (count - n >= n2)
				goto out;
		}
	}

	/*
	 * next, deal with threads that are requeuing to this futex.
	 * we don't need to signal these threads, any thread on the
	 * requeue list has already been signaled but hasn't had a chance
	 * to run and requeue itself yet.  if we would normally wake
	 * a thread, just remove the requeue info.  if we would normally
	 * requeue a thread, change the requeue target.
	 */

	TAILQ_FOREACH_SAFE(wp, &f->f_requeue_proc, wp_rqlist, wpnext) {
		KASSERT(wp->wp_new_futex == f);

		FUTEXPRINTF(("%s: unrequeue f %p l %p ref %d\n", __func__,
		    f, wp->wp_l, f->f_refcount));
		wp->wp_new_futex = NULL;
		TAILQ_REMOVE(&f->f_requeue_proc, wp, wp_rqlist);
		/* drop the reference the previous waker took for us */
		futex_put(f);

		if (count <= n) {
			count++;
		} else {
			if (newf == NULL)
				break;

			/* matching futex_put() is called by the other thread. */
			futex_ref(newf);
			wp->wp_new_futex = newf;
			TAILQ_INSERT_TAIL(&newf->f_requeue_proc, wp, wp_rqlist);
			FUTEXPRINTF(("%s: rerequeue newf %p l %p ref %d\n",
			    __func__, newf, wp->wp_l, newf->f_refcount));
			if (count - n >= n2)
				break;
		}
	}

out:
	return count;
}
    571 
/*
 * Decode a FUTEX_WAKE_OP operation word and apply it atomically to
 * the user word at uaddr, then evaluate the comparison against the
 * old value.  encoded_op layout (matching Linux):
 *	bits 28..31  op   (bit 31 doubles as FUTEX_OP_OPARG_SHIFT)
 *	bits 24..27  cmp
 *	bits 12..23  oparg  (sign-extended)
 *	bits  0..11  cmparg (sign-extended)
 * Returns the non-negative comparison result on success, or a
 * negated errno on failure (Linux convention; caller negates).
 */
static int
futex_atomic_op(lwp_t *l, int encoded_op, void *uaddr)
{
	const int op = (encoded_op >> 28) & 7;
	const int cmp = (encoded_op >> 24) & 15;
	/*
	 * Shift-based sign extension of the 12-bit argument fields.
	 * NOTE(review): relies on arithmetic >> of a signed int
	 * (implementation-defined, but true for all supported compilers).
	 */
	const int cmparg = (encoded_op << 20) >> 20;
	int oparg = (encoded_op << 8) >> 20;
	int error, oldval, cval;

	if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
		oparg = 1 << oparg;

	/* XXX: linux verifies access here and returns EFAULT */

	if (copyin(uaddr, &cval, sizeof(int)) != 0)
		return -EFAULT;

	/* CAS loop: recompute from the latest value until it sticks. */
	for (;;) {
		int nval;

		switch (op) {
		case FUTEX_OP_SET:
			nval = oparg;
			break;
		case FUTEX_OP_ADD:
			nval = cval + oparg;
			break;
		case FUTEX_OP_OR:
			nval = cval | oparg;
			break;
		case FUTEX_OP_ANDN:
			nval = cval & ~oparg;
			break;
		case FUTEX_OP_XOR:
			nval = cval ^ oparg;
			break;
		default:
			return -ENOSYS;
		}

		error = ucas_int(uaddr, cval, nval, &oldval);
		if (error || oldval == cval) {
			break;
		}
		cval = oldval;
	}

	if (error)
		return -EFAULT;

	/* Compare the pre-operation value against cmparg. */
	switch (cmp) {
	case FUTEX_OP_CMP_EQ:
		return (oldval == cmparg);
	case FUTEX_OP_CMP_NE:
		return (oldval != cmparg);
	case FUTEX_OP_CMP_LT:
		return (oldval < cmparg);
	case FUTEX_OP_CMP_GE:
		return (oldval >= cmparg);
	case FUTEX_OP_CMP_LE:
		return (oldval <= cmparg);
	case FUTEX_OP_CMP_GT:
		return (oldval > cmparg);
	default:
		return -ENOSYS;
	}
}
    639 
    640 int
    641 linux_sys_set_robust_list(struct lwp *l,
    642     const struct linux_sys_set_robust_list_args *uap, register_t *retval)
    643 {
    644 	/* {
    645 		syscallarg(struct linux_robust_list_head *) head;
    646 		syscallarg(size_t) len;
    647 	} */
    648 	struct linux_emuldata *led;
    649 
    650 	if (SCARG(uap, len) != sizeof(struct linux_robust_list_head))
    651 		return EINVAL;
    652 	led = l->l_emuldata;
    653 	led->led_robust_head = SCARG(uap, head);
    654 	*retval = 0;
    655 	return 0;
    656 }
    657 
/*
 * get_robust_list(2): return the robust list head pointer and its
 * size for the calling thread (pid == 0) or another LWP of the
 * current process.  Handles the 32-bit-on-64-bit (PK_32) ABI.
 */
int
linux_sys_get_robust_list(struct lwp *l,
    const struct linux_sys_get_robust_list_args *uap, register_t *retval)
{
	/* {
		syscallarg(int) pid;
		syscallarg(struct linux_robust_list_head **) head;
		syscallarg(size_t *) len;
	} */
	struct proc *p;
	struct linux_emuldata *led;
	struct linux_robust_list_head *head;
	size_t len;
	int error = 0;

	p = l->l_proc;
	if (!SCARG(uap, pid)) {
		led = l->l_emuldata;
		head = led->led_robust_head;
	} else {
		/*
		 * NOTE(review): only LWPs of the current process are
		 * searched here; Linux resolves the TID system-wide
		 * (subject to permission checks) -- confirm whether
		 * cross-process lookups matter for emulated apps.
		 */
		mutex_enter(p->p_lock);
		l = lwp_find(p, SCARG(uap, pid));
		if (l != NULL) {
			led = l->l_emuldata;
			head = led->led_robust_head;
		}
		mutex_exit(p->p_lock);
		if (l == NULL) {
			return ESRCH;
		}
	}
#ifdef __arch64__
	/* 32-bit process on a 64-bit kernel: copy out 32-bit values. */
	if (p->p_flag & PK_32) {
		uint32_t u32;

		u32 = 12;	/* size of the 32-bit robust_list_head */
		error = copyout(&u32, SCARG(uap, len), sizeof(u32));
		if (error)
			return error;
		u32 = (uint32_t)(uintptr_t)head;
		return copyout(&u32, SCARG(uap, head), sizeof(u32));
	}
#endif

	len = sizeof(*head);
	error = copyout(&len, SCARG(uap, len), sizeof(len));
	if (error)
		return error;
	return copyout(&head, SCARG(uap, head), sizeof(head));
}
    708 
    709 static int
    710 handle_futex_death(void *uaddr, pid_t pid, int pi)
    711 {
    712 	int uval, nval, mval;
    713 	struct futex *f;
    714 
    715 retry:
    716 	if (copyin(uaddr, &uval, sizeof(uval)))
    717 		return EFAULT;
    718 
    719 	if ((uval & FUTEX_TID_MASK) == pid) {
    720 		mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED;
    721 		nval = atomic_cas_32(uaddr, uval, mval);
    722 
    723 		if (nval == -1)
    724 			return EFAULT;
    725 
    726 		if (nval != uval)
    727 			goto retry;
    728 
    729 		if (!pi && (uval & FUTEX_WAITERS)) {
    730 			FUTEX_LOCK;
    731 			f = futex_get(uaddr, FUTEX_BITSET_MATCH_ANY);
    732 			futex_wake(f, 1, NULL, 0);
    733 			FUTEX_UNLOCK;
    734 		}
    735 	}
    736 
    737 	return 0;
    738 }
    739 
/*
 * Fetch one robust-list pointer from user space at *head, handling
 * the 32-bit-on-64-bit layout.  The low bit of the stored value
 * flags a PI futex: it is returned in *pi and masked off the entry
 * pointer.  Returns 0 or EFAULT.
 */
static int
fetch_robust_entry(struct lwp *l, struct linux_robust_list **entry,
    struct linux_robust_list **head, int *pi)
{
	unsigned long uentry;

#ifdef __arch64__
	if (l->l_proc->p_flag & PK_32) {
		uint32_t u32;

		if (copyin(head, &u32, sizeof(u32)))
			return EFAULT;
		uentry = (unsigned long)u32;
	} else
#endif
	if (copyin(head, &uentry, sizeof(uentry)))
		return EFAULT;

	*entry = (void *)(uentry & ~1UL);
	*pi = uentry & 1;

	return 0;
}
    763 
    764 /* This walks the list of robust futexes, releasing them. */
    765 void
    766 release_futexes(struct lwp *l)
    767 {
    768 	struct linux_robust_list_head head;
    769 	struct linux_robust_list *entry, *next_entry = NULL, *pending;
    770 	unsigned int limit = 2048, pi, next_pi, pip;
    771 	struct linux_emuldata *led;
    772 	unsigned long futex_offset;
    773 	int rc;
    774 
    775 	led = l->l_emuldata;
    776 	if (led->led_robust_head == NULL)
    777 		return;
    778 
    779 #ifdef __arch64__
    780 	if (l->l_proc->p_flag & PK_32) {
    781 		uint32_t u32s[3];
    782 
    783 		if (copyin(led->led_robust_head, u32s, sizeof(u32s)))
    784 			return;
    785 
    786 		head.list.next = (void *)(uintptr_t)u32s[0];
    787 		head.futex_offset = (unsigned long)u32s[1];
    788 		head.pending_list = (void *)(uintptr_t)u32s[2];
    789 	} else
    790 #endif
    791 	if (copyin(led->led_robust_head, &head, sizeof(head)))
    792 		return;
    793 
    794 	if (fetch_robust_entry(l, &entry, &head.list.next, &pi))
    795 		return;
    796 
    797 #ifdef __arch64__
    798 	if (l->l_proc->p_flag & PK_32) {
    799 		uint32_t u32;
    800 
    801 		if (copyin(led->led_robust_head, &u32, sizeof(u32)))
    802 			return;
    803 
    804 		head.futex_offset = (unsigned long)u32;
    805 	} else
    806 #endif
    807 	if (copyin(&head.futex_offset, &futex_offset, sizeof(unsigned long)))
    808 		return;
    809 
    810 	if (fetch_robust_entry(l, &pending, &head.pending_list, &pip))
    811 		return;
    812 
    813 	while (entry != &head.list) {
    814 		rc = fetch_robust_entry(l, &next_entry, &entry->next, &next_pi);
    815 
    816 		if (entry != pending)
    817 			if (handle_futex_death((char *)entry + futex_offset,
    818 			    l->l_lid, pi))
    819 				return;
    820 
    821 		if (rc)
    822 			return;
    823 
    824 		entry = next_entry;
    825 		pi = next_pi;
    826 
    827 		if (!--limit)
    828 			break;
    829 
    830 		yield();	/* XXX why? */
    831 	}
    832 
    833 	if (pending)
    834 		handle_futex_death((char *)pending + futex_offset,
    835 		    l->l_lid, pip);
    836 }
    837