    1	/*	$NetBSD: sys_futex.c,v 1.12.4.6 2021/11/01 08:40:16 chs Exp $	*/
2
3 /*-
4 * Copyright (c) 2018, 2019, 2020 The NetBSD Foundation, Inc.
5 * All rights reserved.
6 *
7 * This code is derived from software contributed to The NetBSD Foundation
8 * by Taylor R. Campbell and Jason R. Thorpe.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 * POSSIBILITY OF SUCH DAMAGE.
30 */
31
32 #include <sys/cdefs.h>
33 __KERNEL_RCSID(0, "$NetBSD: sys_futex.c,v 1.12.4.6 2021/11/01 08:40:16 chs Exp $");
34
35 /*
36 * Futexes
37 *
38 * The futex system call coordinates notifying threads waiting for
39 * changes on a 32-bit word of memory. The word can be managed by
40 * CPU atomic operations in userland, without system calls, as long
41 * as there is no contention.
42 *
43 * The simplest use case demonstrating the utility is:
44 *
45 * // 32-bit word of memory shared among threads or
46 * // processes in userland. lock & 1 means owned;
47 * // lock & 2 means there are waiters waiting.
48 * volatile int lock = 0;
49 *
50 * int v;
51 *
52 * // Acquire a lock.
53 * do {
54 * v = lock;
55 * if (v & 1) {
56 * // Lock is held. Set a bit to say that
57 * // there are waiters, and wait for lock
58 * // to change to anything other than v;
59 * // then retry.
60 * if (atomic_cas_uint(&lock, v, v | 2) != v)
61 * continue;
62 * futex(FUTEX_WAIT, &lock, v | 2, NULL, NULL, 0);
63 * continue;
64 * }
   65	 *	} while (atomic_cas_uint(&lock, v, v | 1) != v);
66 * membar_enter();
67 *
68 * ...
69 *
70 * // Release the lock. Optimistically assume there are
71 * // no waiters first until demonstrated otherwise.
72 * membar_exit();
73 * if (atomic_cas_uint(&lock, 1, 0) != 1) {
74 * // There may be waiters.
75 * v = atomic_swap_uint(&lock, 0);
76 * // If there are still waiters, wake one.
77 * if (v & 2)
78 * futex(FUTEX_WAKE, &lock, 1, NULL, NULL, 0);
79 * }
80 *
81 * The goal is to avoid the futex system call unless there is
82 * contention; then if there is contention, to guarantee no missed
83 * wakeups.
84 *
85 * For a simple implementation, futex(FUTEX_WAIT) could queue
86 * itself to be woken, double-check the lock word, and then sleep;
87 * spurious wakeups are generally a fact of life, so any
88 * FUTEX_WAKE could just wake every FUTEX_WAIT in the system.
89 *
90 * If this were all there is to it, we could then increase
91 * parallelism by refining the approximation: partition the
92 * waiters into buckets by hashing the lock addresses to reduce
93 * the incidence of spurious wakeups. But this is not all.
94 *
95 * The futex(FUTEX_CMP_REQUEUE, &lock, n, &lock2, m, val)
96 * operation not only wakes n waiters on lock if lock == val, but
97 * also _transfers_ m additional waiters to lock2. Unless wakeups
98 * on lock2 also trigger wakeups on lock, we cannot move waiters
99 * to lock2 if they merely share the same hash as waiters on lock.
100 * Thus, we can't approximately distribute waiters into queues by
101 * a hash function; we must distinguish futex queues exactly by
102 * lock address.
103 *
104 * For now, we use a global red/black tree to index futexes. This
105 * should be replaced by a lockless radix tree with a thread to
106 * free entries no longer in use once all lookups on all CPUs have
107 * completed.
108 *
109 * Specifically, we maintain two maps:
110 *
111 * futex_tab.va[vmspace, va] for private futexes
112 * futex_tab.oa[uvm_voaddr] for shared futexes
113 *
114 * This implementation does not support priority inheritance.
115 */
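
/*
 * As a sketch of the FUTEX_CMP_REQUEUE semantics described above (an
 * illustration only, not a complete condition variable -- error
 * handling and the waiter's loop around spurious wakeups are omitted):
 *
 *	// Waiters sleep on cv_seq; "lock" is the lock word from the
 *	// example above and is held around both operations.
 *	volatile unsigned int cv_seq;
 *
 *	// Wait: sample the sequence number, release the lock (as
 *	// above), and sleep until the sequence number changes; then
 *	// reacquire the lock (as above) and re-check the predicate.
 *	unsigned int seq = cv_seq;
 *	... release the lock ...
 *	futex(FUTEX_WAIT, &cv_seq, seq, NULL, NULL, 0);
 *	... acquire the lock ...
 *
 *	// Broadcast: bump the sequence number, mark the lock as having
 *	// waiters, wake one waiter, and transfer up to INT_MAX others
 *	// onto the lock word, so they are woken one at a time by the
 *	// release path above rather than stampeding.  If cv_seq has
 *	// already moved on, the kernel returns EAGAIN and the
 *	// broadcast can simply be retried.
 *	unsigned int newseq = atomic_add_int_nv(&cv_seq, 1);
 *	atomic_or_uint(&lock, 2);
 *	futex(FUTEX_CMP_REQUEUE, &cv_seq, 1, &lock, INT_MAX, newseq);
 */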
116
117 #include <sys/param.h>
118 #include <sys/types.h>
119 #include <sys/atomic.h>
120 #include <sys/condvar.h>
121 #include <sys/futex.h>
122 #include <sys/mutex.h>
123 #include <sys/rbtree.h>
124 #include <sys/sleepq.h>
125 #include <sys/queue.h>
126 #include <sys/sdt.h>
127
128 #include <sys/syscall.h>
129 #include <sys/syscallargs.h>
130 #include <sys/syscallvar.h>
131
132 #include <uvm/uvm_extern.h>
133
134 /*
135 * DTrace probes.
136 */
137 SDT_PROVIDER_DEFINE(futex);
138
139 /* entry: uaddr, val, bitset, timeout, clkflags, fflags */
140 /* exit: error */
141 SDT_PROBE_DEFINE6(futex, func, wait, entry, "int *", "int", "int",
142 "struct timespec *", "int", "int");
143 SDT_PROBE_DEFINE1(futex, func, wait, exit, "int");
144
145 /* entry: uaddr, nwake, bitset, fflags */
146 /* exit: error, nwoken */
147 SDT_PROBE_DEFINE4(futex, func, wake, entry, "int *", "int", "int", "int");
148 SDT_PROBE_DEFINE2(futex, func, wake, exit, "int", "int");
149
150 /* entry: uaddr, nwake, uaddr2, nrequeue, fflags */
151 /* exit: error, nwoken */
152 SDT_PROBE_DEFINE5(futex, func, requeue, entry, "int *", "int", "int *", "int",
153 "int");
154 SDT_PROBE_DEFINE2(futex, func, requeue, exit, "int", "int");
155
156 /* entry: uaddr, nwake, uaddr2, nrequeue, val3, fflags */
157 /* exit: error, nwoken */
158 SDT_PROBE_DEFINE6(futex, func, cmp_requeue, entry, "int *", "int", "int *",
159 "int", "int", "int");
160 SDT_PROBE_DEFINE2(futex, func, cmp_requeue, exit, "int", "int");
161
162 /* entry: uaddr, nwake, uaddr2, nwake2, wakeop, fflags */
163 /* exit: error, nwoken */
164 SDT_PROBE_DEFINE6(futex, func, wake_op, entry, "int *", "int", "int *", "int",
165 "int", "int");
166 SDT_PROBE_DEFINE2(futex, func, wake_op, exit, "int", "int");
167
168 SDT_PROBE_DEFINE0(futex, wait, finish, normally);
169 SDT_PROBE_DEFINE0(futex, wait, finish, wakerace);
170 SDT_PROBE_DEFINE0(futex, wait, finish, aborted);
171
172 /* entry: timo */
173 /* exit: error */
174 SDT_PROBE_DEFINE1(futex, wait, sleepq_block, entry, "int");
175 SDT_PROBE_DEFINE1(futex, wait, sleepq_block, exit, "int");
176
177 /*
178 * Lock order:
179 *
180 * futex_tab.lock
181 * futex::fx_op_lock ordered by kva of struct futex
182 * -> futex::fx_sq_lock ordered by kva of sleepq lock
183 *
184 * N.B. multiple futexes can share a single sleepq lock.
185 */
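
/*
 * For example, the wake and requeue paths below follow this pattern
 * (a sketch of futex_func_wake() calling futex_wake()):
 *
 *	futex_op_lock(f);		// or futex_op_lock2(f, f2)
 *	futex_sq_lock2(f, f2);		//   taken inside futex_wake()
 *	... wake and/or transfer waiters ...
 *	futex_sq_unlock2(f, f2);
 *	futex_op_unlock(f);		// or futex_op_unlock2(f, f2)
 */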
186
187 /*
188 * union futex_key
189 *
190 * A futex is addressed either by a vmspace+va (private) or by
191 * a uvm_voaddr (shared).
192 */
193 typedef union futex_key {
194 struct {
195 struct vmspace *vmspace;
196 vaddr_t va;
197 } fk_private;
198 struct uvm_voaddr fk_shared;
199 } futex_key_t;
200
201 CTASSERT(offsetof(futex_key_t, fk_private.va) ==
202 offsetof(futex_key_t, fk_shared.offset));
203
204 /*
205 * struct futex
206 *
207 * Kernel state for a futex located at a particular address in a
208 * particular virtual address space.
209 *
210 * N.B. fx_refcnt is an unsigned long because we need to be able
211 * to operate on it atomically on all systems while at the same
212 * time rendering practically impossible the chance of it reaching
213 * its max value. In practice, we're limited by the number of LWPs
214 * that can be present on the system at any given time, and the
215 * assumption is that limit will be good enough on a 32-bit platform.
216 * See futex_wake() for why overflow needs to be avoided.
217 *
218 * XXX Since futex addresses must be 4-byte aligned, we could
219 * XXX squirrel away fx_shared and fx_on_tree bits in the "va"
220 * XXX field of the key. Worth it?
221 */
222 struct futex {
223 futex_key_t fx_key;
224 struct rb_node fx_node;
225 unsigned long fx_refcnt;
226 bool fx_shared;
227 bool fx_on_tree;
228 uint8_t fx_class;
229
230 kmutex_t fx_op_lock; /* adaptive */
231 kmutex_t * fx_sq_lock; /* &sleepq_locks[...] */
232 sleepq_t fx_sqs[2]; /* 0=reader, 1=writer */
233 unsigned int fx_nwaiters[2];
234 };
235
236 /*
237 * futex classes: Some futex operations can only be carried out on
238 * futexes that are known to be abiding by a certain protocol. These
239 * classes are assigned to a futex when created due to a wait event,
240 * and when subsequent wake or requeue operations are issued, the
241 * class is checked at futex lookup time. If the class does not match,
242 * EINVAL is the result.
243 */
244 #define FUTEX_CLASS_ANY 0 /* match any class in lookup */
245 #define FUTEX_CLASS_NORMAL 1 /* normal futex */
246 #define FUTEX_CLASS_RWLOCK 2 /* rwlock futex */
247 #define FUTEX_CLASS_PI 3 /* for FUTEX_*_PI ops */
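
/*
 * For example, a futex created by a FUTEX_WAIT (and thus of class
 * FUTEX_CLASS_NORMAL) fails the class check in futex_lookup_by_key()
 * with EINVAL if a later operation on the same address asks for a
 * different class.  FUTEX_CLASS_ANY bypasses the check; it is used by
 * the robust-futex cleanup path, which must accept any class.
 */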
248
249 /* sleepq definitions */
250 #define FUTEX_READERQ 0
251 #define FUTEX_WRITERQ 1
252
253 static const char futex_wmesg[] = "futex";
254
255 static void futex_unsleep(lwp_t *, bool);
256
257 static syncobj_t futex_syncobj = {
258 .sobj_flag = SOBJ_SLEEPQ_SORTED,
259 .sobj_unsleep = futex_unsleep,
260 .sobj_changepri = sleepq_changepri,
261 .sobj_lendpri = sleepq_lendpri,
262 .sobj_owner = syncobj_noowner,
263 };
264
265 /*
266 * futex_tab
267 *
268 * Global trees of futexes by vmspace/va and VM object address.
269 *
270 * XXX This obviously doesn't scale in parallel. We could use a
271 * pserialize-safe data structure, but there may be a high cost to
272 * frequent deletion since we don't cache futexes after we're done
273 * with them. We could use hashed locks. But for now, just make
274 * sure userland can't DoS the serial performance, by using a
275 * balanced binary tree for lookup.
276 *
277 * XXX We could use a per-process tree for the table indexed by
278 * virtual address to reduce contention between processes.
279 */
280 static struct {
281 kmutex_t lock;
282 struct rb_tree va;
283 struct rb_tree oa;
284 } futex_tab __cacheline_aligned;
285
286 static int
287 compare_futex_key(void *cookie, const void *n, const void *k)
288 {
289 const struct futex *fa = n;
290 const futex_key_t *fka = &fa->fx_key;
291 const futex_key_t *fkb = k;
292
293 if ((uintptr_t)fka->fk_private.vmspace <
294 (uintptr_t)fkb->fk_private.vmspace)
295 return -1;
296 if ((uintptr_t)fka->fk_private.vmspace >
297 (uintptr_t)fkb->fk_private.vmspace)
298 return +1;
299 if (fka->fk_private.va < fkb->fk_private.va)
300 return -1;
301 if (fka->fk_private.va > fkb->fk_private.va)
302 return +1;
303 return 0;
304 }
305
306 static int
307 compare_futex(void *cookie, const void *na, const void *nb)
308 {
309 const struct futex *fa = na;
310 const struct futex *fb = nb;
311
312 return compare_futex_key(cookie, fa, &fb->fx_key);
313 }
314
315 static const rb_tree_ops_t futex_rb_ops = {
316 .rbto_compare_nodes = compare_futex,
317 .rbto_compare_key = compare_futex_key,
318 .rbto_node_offset = offsetof(struct futex, fx_node),
319 };
320
321 static int
322 compare_futex_shared_key(void *cookie, const void *n, const void *k)
323 {
324 const struct futex *fa = n;
325 const futex_key_t *fka = &fa->fx_key;
326 const futex_key_t *fkb = k;
327
328 return uvm_voaddr_compare(&fka->fk_shared, &fkb->fk_shared);
329 }
330
331 static int
332 compare_futex_shared(void *cookie, const void *na, const void *nb)
333 {
334 const struct futex *fa = na;
335 const struct futex *fb = nb;
336
337 return compare_futex_shared_key(cookie, fa, &fb->fx_key);
338 }
339
340 static const rb_tree_ops_t futex_shared_rb_ops = {
341 .rbto_compare_nodes = compare_futex_shared,
342 .rbto_compare_key = compare_futex_shared_key,
343 .rbto_node_offset = offsetof(struct futex, fx_node),
344 };
345
346 /*
347 * futex_sq_lock2(f, f2)
348 *
349 * Acquire the sleepq locks for f and f2, which may be null, or
350 * which may be the same. If they are distinct, an arbitrary total
351 * order is chosen on the locks.
352 *
353 * Callers should only ever acquire multiple sleepq locks
354 * simultaneously using futex_sq_lock2.
355 */
356 static void
357 futex_sq_lock2(struct futex * const f, struct futex * const f2)
358 {
359
360 /*
361 * If both are null, do nothing; if one is null and the other
362 * is not, lock the other and be done with it.
363 */
364 if (f == NULL && f2 == NULL) {
365 return;
366 } else if (f == NULL) {
367 mutex_spin_enter(f2->fx_sq_lock);
368 return;
369 } else if (f2 == NULL) {
370 mutex_spin_enter(f->fx_sq_lock);
371 return;
372 }
373
374 kmutex_t * const m = f->fx_sq_lock;
375 kmutex_t * const m2 = f2->fx_sq_lock;
376
377 /* If both locks are the same, acquire only one. */
378 if (m == m2) {
379 mutex_spin_enter(m);
380 return;
381 }
382
383 /* Otherwise, use the ordering on the kva of the lock pointer. */
384 if ((uintptr_t)m < (uintptr_t)m2) {
385 mutex_spin_enter(m);
386 mutex_spin_enter(m2);
387 } else {
388 mutex_spin_enter(m2);
389 mutex_spin_enter(m);
390 }
391 }
392
393 /*
394 * futex_sq_unlock2(f, f2)
395 *
396 * Release the sleep queue locks for f and f2, which may be null, or
397 * which may have the same underlying lock.
398 */
399 static void
400 futex_sq_unlock2(struct futex * const f, struct futex * const f2)
401 {
402
403 /*
404 * If both are null, do nothing; if one is null and the other
405 * is not, unlock the other and be done with it.
406 */
407 if (f == NULL && f2 == NULL) {
408 return;
409 } else if (f == NULL) {
410 mutex_spin_exit(f2->fx_sq_lock);
411 return;
412 } else if (f2 == NULL) {
413 mutex_spin_exit(f->fx_sq_lock);
414 return;
415 }
416
417 kmutex_t * const m = f->fx_sq_lock;
418 kmutex_t * const m2 = f2->fx_sq_lock;
419
420 /* If both locks are the same, release only one. */
421 if (m == m2) {
422 mutex_spin_exit(m);
423 return;
424 }
425
426 /* Otherwise, use the ordering on the kva of the lock pointer. */
427 if ((uintptr_t)m < (uintptr_t)m2) {
428 mutex_spin_exit(m2);
429 mutex_spin_exit(m);
430 } else {
431 mutex_spin_exit(m);
432 mutex_spin_exit(m2);
433 }
434 }
435
436 /*
437 * futex_load(uaddr, kaddr)
438 *
439 * Perform a single atomic load to read *uaddr, and return the
440 * result in *kaddr. Return 0 on success, EFAULT if uaddr is not
441 * mapped.
442 */
443 static inline int
444 futex_load(int *uaddr, int *kaddr)
445 {
446 return ufetch_int((u_int *)uaddr, (u_int *)kaddr);
447 }
448
449 /*
450 * futex_test(uaddr, expected)
451 *
452 * True if *uaddr == expected. False if *uaddr != expected, or if
453 * uaddr is not mapped.
454 */
455 static bool
456 futex_test(int *uaddr, int expected)
457 {
458 int val;
459 int error;
460
461 error = futex_load(uaddr, &val);
462 if (error)
463 return false;
464 return val == expected;
465 }
466
467 static pool_cache_t futex_cache __read_mostly;
468
469 static int futex_ctor(void *, void *, int);
470 static void futex_dtor(void *, void *);
471
472 /*
473 * futex_sys_init()
474 *
475 * Initialize the futex subsystem.
476 */
477 void
478 futex_sys_init(void)
479 {
480
481 mutex_init(&futex_tab.lock, MUTEX_DEFAULT, IPL_NONE);
482 rb_tree_init(&futex_tab.va, &futex_rb_ops);
483 rb_tree_init(&futex_tab.oa, &futex_shared_rb_ops);
484
485 futex_cache = pool_cache_init(sizeof(struct futex),
486 coherency_unit, 0, 0, "futex", NULL, IPL_NONE, futex_ctor,
487 futex_dtor, NULL);
488 }
489
490 /*
491 * futex_sys_fini()
492 *
493 * Finalize the futex subsystem.
494 */
495 void
496 futex_sys_fini(void)
497 {
498
499 KASSERT(RB_TREE_MIN(&futex_tab.oa) == NULL);
500 KASSERT(RB_TREE_MIN(&futex_tab.va) == NULL);
501 mutex_destroy(&futex_tab.lock);
502 }
503
504 /*
505 * futex_ctor()
506 *
507 * Futex object constructor.
508 */
509 static int
510 futex_ctor(void *arg __unused, void *obj, int flags __unused)
511 {
512 extern sleepqlock_t sleepq_locks[SLEEPTAB_HASH_SIZE];
513 struct futex * const f = obj;
514
515 mutex_init(&f->fx_op_lock, MUTEX_DEFAULT, IPL_NONE);
516 f->fx_sq_lock = &sleepq_locks[SLEEPTAB_HASH(f)].lock;
517
518 sleepq_init(&f->fx_sqs[FUTEX_READERQ]);
519 sleepq_init(&f->fx_sqs[FUTEX_WRITERQ]);
520 f->fx_nwaiters[FUTEX_READERQ] = f->fx_nwaiters[FUTEX_WRITERQ] = 0;
521
522 return 0;
523 }
524
525 /*
526 * futex_dtor()
527 *
528 * Futex object destructor.
529 */
530 static void
531 futex_dtor(void *arg __unused, void *obj)
532 {
533 struct futex * const f = obj;
534
535 mutex_destroy(&f->fx_op_lock);
536 f->fx_sq_lock = NULL;
537 }
538
539 /*
540 * futex_key_init(key, vm, va, shared)
541 *
542 * Initialize a futex key for lookup, etc.
543 */
544 static int
545 futex_key_init(futex_key_t *fk, struct vmspace *vm, vaddr_t va, bool shared)
546 {
547 int error = 0;
548
549 if (__predict_false(shared)) {
550 if (!uvm_voaddr_acquire(&vm->vm_map, va, &fk->fk_shared))
551 error = EFAULT;
552 } else {
553 fk->fk_private.vmspace = vm;
554 fk->fk_private.va = va;
555 }
556
557 return error;
558 }
559
560 /*
561 * futex_key_fini(key, shared)
562 *
563 * Release a futex key.
564 */
565 static void
566 futex_key_fini(futex_key_t *fk, bool shared)
567 {
568 if (__predict_false(shared))
569 uvm_voaddr_release(&fk->fk_shared);
570 memset(fk, 0, sizeof(*fk));
571 }
572
573 /*
574 * futex_create(fk, shared)
575 *
576 * Create a futex. Initial reference count is 1, representing the
577 * caller. Returns NULL on failure. Always takes ownership of the
578 * key, either transferring it to the newly-created futex, or releasing
579 * the key if creation fails.
580 *
581 * Never sleeps for memory, but may sleep to acquire a lock.
582 */
583 static struct futex *
584 futex_create(futex_key_t *fk, bool shared, uint8_t class)
585 {
586 struct futex *f;
587
588 f = pool_cache_get(futex_cache, PR_NOWAIT);
589 if (f == NULL) {
590 futex_key_fini(fk, shared);
591 return NULL;
592 }
593 f->fx_key = *fk;
594 f->fx_refcnt = 1;
595 f->fx_shared = shared;
596 f->fx_on_tree = false;
597 f->fx_class = class;
598
599 return f;
600 }
601
602 /*
603 * futex_destroy(f)
604 *
605 * Destroy a futex created with futex_create. Reference count
606 * must be zero.
607 *
608 * May sleep.
609 */
610 static void
611 futex_destroy(struct futex *f)
612 {
613
614 ASSERT_SLEEPABLE();
615
616 KASSERT(atomic_load_relaxed(&f->fx_refcnt) == 0);
617 KASSERT(!f->fx_on_tree);
618 KASSERT(LIST_EMPTY(&f->fx_sqs[FUTEX_READERQ]));
619 KASSERT(LIST_EMPTY(&f->fx_sqs[FUTEX_WRITERQ]));
620 KASSERT(f->fx_nwaiters[FUTEX_READERQ] == 0);
621 KASSERT(f->fx_nwaiters[FUTEX_WRITERQ] == 0);
622
623 futex_key_fini(&f->fx_key, f->fx_shared);
624
625 pool_cache_put(futex_cache, f);
626 }
627
628 /*
629 * futex_hold_count(f, n)
630 *
631 * Attempt to acquire a reference to f. Return 0 on success,
632 * ENFILE on too many references.
633 *
634 * Never sleeps.
635 */
636 static int
637 futex_hold_count(struct futex *f, unsigned long const count)
638 {
639 unsigned long refcnt;
640
641 do {
642 refcnt = atomic_load_relaxed(&f->fx_refcnt);
643 if (ULONG_MAX - refcnt < count)
644 return ENFILE;
645 } while (atomic_cas_ulong(&f->fx_refcnt, refcnt,
646 refcnt + count) != refcnt);
647
648 return 0;
649 }
650 #define futex_hold(f) futex_hold_count(f, 1)
651
652 /*
653 * futex_rele_count(f, n)
654 *
655 * Release a reference to f acquired with futex_create or
656 * futex_hold.
657 *
658 * May sleep to free f.
659 */
660 static void
661 futex_rele_count(struct futex *f, unsigned long const count)
662 {
663 unsigned long refcnt;
664
665 ASSERT_SLEEPABLE();
666
667 do {
668 refcnt = atomic_load_relaxed(&f->fx_refcnt);
669 KASSERT(refcnt >= count);
670 if (refcnt - count == 0)
671 goto trylast;
672 } while (atomic_cas_ulong(&f->fx_refcnt, refcnt,
673 refcnt - count) != refcnt);
674 return;
675
676 trylast:
677 KASSERT(count <= LONG_MAX);
678 mutex_enter(&futex_tab.lock);
679 if (atomic_add_long_nv(&f->fx_refcnt, -(long)count) == 0) {
680 if (f->fx_on_tree) {
681 if (__predict_false(f->fx_shared))
682 rb_tree_remove_node(&futex_tab.oa, f);
683 else
684 rb_tree_remove_node(&futex_tab.va, f);
685 f->fx_on_tree = false;
686 }
687 } else {
688 /* References remain -- don't destroy it. */
689 f = NULL;
690 }
691 mutex_exit(&futex_tab.lock);
692 if (f != NULL)
693 futex_destroy(f);
694 }
695 #define futex_rele(f) futex_rele_count(f, 1)
696
697 /*
698 * futex_rele_count_not_last(f, n)
699 *
700 * Release a reference to f acquired with futex_create or
701 * futex_hold.
702 *
703 * This version asserts that we are not dropping the last
704 * reference to f.
705 */
706 static void
707 futex_rele_count_not_last(struct futex *f, unsigned long const count)
708 {
709 unsigned long refcnt;
710
711 do {
712 refcnt = atomic_load_relaxed(&f->fx_refcnt);
713 KASSERT(refcnt >= count);
714 } while (atomic_cas_ulong(&f->fx_refcnt, refcnt,
715 refcnt - count) != refcnt);
716 }
717
718 /*
719 * futex_lookup_by_key(key, shared, class, &f)
720 *
721 * Try to find an existing futex va reference in the specified key
  722	 *	Try to find an existing futex va reference in the specified key.
723 * and increment f's reference count if found.
724 *
725 * Return ENFILE if reference count too high.
726 *
727 * Internal lookup routine shared by futex_lookup() and
728 * futex_lookup_create().
729 */
730 static int
731 futex_lookup_by_key(futex_key_t *fk, bool shared, uint8_t class,
732 struct futex **fp)
733 {
734 struct futex *f;
735 int error = 0;
736
737 mutex_enter(&futex_tab.lock);
738 if (__predict_false(shared)) {
739 f = rb_tree_find_node(&futex_tab.oa, fk);
740 } else {
741 f = rb_tree_find_node(&futex_tab.va, fk);
742 }
743 if (f) {
744 if (__predict_true(f->fx_class == class ||
745 class == FUTEX_CLASS_ANY))
746 error = futex_hold(f);
747 else
748 error = EINVAL;
749 if (error)
750 f = NULL;
751 }
752 *fp = f;
753 mutex_exit(&futex_tab.lock);
754
755 return error;
756 }
757
758 /*
759 * futex_insert(f, fp)
760 *
761 * Try to insert the futex f into the tree by va. If there
762 * already is a futex for its va, acquire a reference to it, and
763 * store it in *fp; otherwise store f in *fp.
764 *
765 * Return 0 on success, ENFILE if there already is a futex but its
766 * reference count is too high.
767 */
768 static int
769 futex_insert(struct futex *f, struct futex **fp)
770 {
771 struct futex *f0;
772 int error;
773
774 KASSERT(atomic_load_relaxed(&f->fx_refcnt) != 0);
775 KASSERT(!f->fx_on_tree);
776
777 mutex_enter(&futex_tab.lock);
778 if (__predict_false(f->fx_shared))
779 f0 = rb_tree_insert_node(&futex_tab.oa, f);
780 else
781 f0 = rb_tree_insert_node(&futex_tab.va, f);
782 if (f0 == f) {
783 f->fx_on_tree = true;
784 error = 0;
785 } else {
786 KASSERT(atomic_load_relaxed(&f0->fx_refcnt) != 0);
787 KASSERT(f0->fx_on_tree);
788 error = futex_hold(f0);
789 if (error)
790 goto out;
791 }
792 *fp = f0;
793 out: mutex_exit(&futex_tab.lock);
794
795 return error;
796 }
797
798 /*
799 * futex_lookup(uaddr, shared, class, &f)
800 *
801 * Find a futex at the userland pointer uaddr in the current
802 * process's VM space. On success, return the futex in f and
803 * increment its reference count.
804 *
805 * Caller must call futex_rele when done.
806 */
807 static int
808 futex_lookup(int *uaddr, bool shared, uint8_t class, struct futex **fp)
809 {
810 futex_key_t fk;
811 struct vmspace *vm = curproc->p_vmspace;
812 vaddr_t va = (vaddr_t)uaddr;
813 int error;
814
815 /*
816 * Reject unaligned user pointers so we don't cross page
817 * boundaries and so atomics will work.
818 */
819 if (__predict_false((va & 3) != 0))
820 return EINVAL;
821
822 /* Look it up. */
823 error = futex_key_init(&fk, vm, va, shared);
824 if (error)
825 return error;
826
827 error = futex_lookup_by_key(&fk, shared, class, fp);
828 futex_key_fini(&fk, shared);
829 if (error)
830 return error;
831
832 KASSERT(*fp == NULL || (*fp)->fx_shared == shared);
833 KASSERT(*fp == NULL || (*fp)->fx_class == class);
834 KASSERT(*fp == NULL || atomic_load_relaxed(&(*fp)->fx_refcnt) != 0);
835
836 /*
837 * Success! (Caller must still check whether we found
838 * anything, but nothing went _wrong_ like trying to use
839 * unmapped memory.)
840 */
841 KASSERT(error == 0);
842
843 return error;
844 }
845
846 /*
847 * futex_lookup_create(uaddr, shared, class, &f)
848 *
849 * Find or create a futex at the userland pointer uaddr in the
850 * current process's VM space. On success, return the futex in f
851 * and increment its reference count.
852 *
853 * Caller must call futex_rele when done.
854 */
855 static int
856 futex_lookup_create(int *uaddr, bool shared, uint8_t class, struct futex **fp)
857 {
858 futex_key_t fk;
859 struct vmspace *vm = curproc->p_vmspace;
860 struct futex *f = NULL;
861 vaddr_t va = (vaddr_t)uaddr;
862 int error;
863
864 /*
865 * Reject unaligned user pointers so we don't cross page
866 * boundaries and so atomics will work.
867 */
868 if (__predict_false((va & 3) != 0))
869 return EINVAL;
870
871 if (__predict_false(class == FUTEX_CLASS_ANY))
872 return EINVAL;
873
874 error = futex_key_init(&fk, vm, va, shared);
875 if (error)
876 return error;
877
878 /*
879 * Optimistically assume there already is one, and try to find
880 * it.
881 */
882 error = futex_lookup_by_key(&fk, shared, class, fp);
883 if (error || *fp != NULL) {
884 /*
885 * We either found one, or there was an error.
886 * In either case, we are done with the key.
887 */
888 futex_key_fini(&fk, shared);
889 goto out;
890 }
891
892 /*
  893	 * Create a futex record.  This transfers ownership of the key
894 * in all cases.
895 */
896 f = futex_create(&fk, shared, class);
897 if (f == NULL) {
898 error = ENOMEM;
899 goto out;
900 }
901
902 /*
903 * Insert our new futex, or use existing if someone else beat
904 * us to it.
905 */
906 error = futex_insert(f, fp);
907 if (error)
908 goto out;
909 if (*fp == f)
910 f = NULL; /* don't release on exit */
911
912 /* Success! */
913 KASSERT(error == 0);
914
915 out: if (f != NULL)
916 futex_rele(f);
917 KASSERT(error || *fp != NULL);
918 KASSERT(error || atomic_load_relaxed(&(*fp)->fx_refcnt) != 0);
919 return error;
920 }
921
922 /*
923 * futex_unsleep:
924 *
925 * Remove an LWP from the futex and sleep queue. This is called when
926 * the LWP has not been awoken normally but instead interrupted: for
927 * example, when a signal is received. Must be called with the LWP
928 * locked. Will unlock if "unlock" is true.
929 */
930 static void
931 futex_unsleep(lwp_t *l, bool unlock)
932 {
933 struct futex *f __diagused;
934
935 f = (struct futex *)(uintptr_t)l->l_wchan;
936
937 KASSERT(mutex_owned(f->fx_sq_lock));
938
939 /* WRITERQ is more likely */
940 if (__predict_true(l->l_sleepq == &f->fx_sqs[FUTEX_WRITERQ])) {
941 KASSERT(f->fx_nwaiters[FUTEX_WRITERQ] > 0);
942 f->fx_nwaiters[FUTEX_WRITERQ]--;
943 } else {
944 KASSERT(l->l_sleepq == &f->fx_sqs[FUTEX_READERQ]);
945 KASSERT(f->fx_nwaiters[FUTEX_READERQ] > 0);
946 f->fx_nwaiters[FUTEX_READERQ]--;
947 }
948
949 sleepq_unsleep(l, unlock);
950 }
951
952 /*
  953	 * futex_wait(f, q, deadline, clkid, bitset)
954 *
955 * Wait until deadline on the clock clkid, or forever if deadline
956 * is NULL, for a futex wakeup. Return 0 on explicit wakeup,
957 * ETIMEDOUT on timeout, EINTR on signal.
958 */
959 static int
960 futex_wait(struct futex *f, int q, const struct timespec *deadline,
961 clockid_t clkid, int bitset)
962 {
963
964 /*
965 * Some things to note about how this function works:
966 *
967 * ==> When we enter futex_wait(), the futex's op lock is
968 * held, preventing us from missing wakes. Once we are on
969 * the futex's sleepq, it is safe to release the op lock.
970 *
971 * ==> We have a single early escape to avoid calling
972 * sleepq_block(): our deadline already having passed.
973 * If this is a no-timeout wait, or if the deadline has
974 * not already passed, we are guaranteed to call sleepq_block().
975 *
976 * ==> sleepq_block() contains all of the necessary logic
977 * for handling signals; we do not need to check for them
978 * ourselves.
979 *
980 * ==> Once we have blocked, other futex operations will
981 * be able to wake us or requeue us. Until we have blocked,
982 * those other futex operations will not be able to acquire
983 * the sleepq locks in order to perform those operations,
984 * even if we have dropped the op lock. Thus, we will not
985 * miss wakes. This fundamental invariant is relied upon by
986 * every other synchronization object in the kernel.
987 *
988 * ==> sleepq_block() returns for one of three reasons:
989 *
990 * -- We were awakened.
991 * -- We were signaled.
992 * -- We timed out.
993 *
994 * ==> Once sleepq_block() returns, we will not call
995 * sleepq_block() again.
996 *
997 * ==> If sleepq_block() returns due to being signaled,
998 * we must check to see if we were also awakened. This
999 * is to handle the following sequence:
1000 *
1001 * -- Futex wake takes us off sleepq, places us on runq.
1002 * -- We are signaled while sitting on the runq.
1003 * -- We run, and sleepq_block() notices the signal and
1004 * returns EINTR/ERESTART.
1005 *
1006 * In this situation, we want to indicate a successful wake
1007 * to the caller, because that wake has been reported to the
1008 * thread that issued it.
1009 *
1010 * Note that it is NOT possible for a wake to be issued after
1011 * a signal; the LWP will already have been taken off the sleepq
1012 * by the signal, so the wake will never happen. Note that for
1013 * this reason, we must *never* return ERESTART, because it could
1014 * result in us waiting again with no one to awaken us.
1015 */
1016
1017 struct lwp * const l = curlwp;
1018 int timo;
1019 int error;
1020
1021 /*
1022 * Compute our timeout before taking any locks.
1023 */
1024 if (deadline) {
1025 struct timespec ts;
1026
1027 /* Check our watch. */
1028 error = clock_gettime1(clkid, &ts);
1029 if (error) {
1030 mutex_exit(&f->fx_op_lock);
1031 return error;
1032 }
1033
1034 /*
1035 * If we're past the deadline, ETIMEDOUT.
1036 */
1037 if (timespeccmp(deadline, &ts, <=)) {
1038 mutex_exit(&f->fx_op_lock);
1039 return ETIMEDOUT;
1040 } else {
1041 /* Count how much time is left. */
1042 timespecsub(deadline, &ts, &ts);
1043 timo = tstohz(&ts);
1044 if (timo == 0) {
1045 /*
1046 * tstohz() already rounds up, and
1047 * a return value of 0 ticks means
1048 * "expired now or in the past".
1049 */
1050 mutex_exit(&f->fx_op_lock);
1051 return ETIMEDOUT;
1052 }
1053 }
1054 } else {
1055 timo = 0;
1056 }
1057
1058 /*
1059 * Acquire in sleepq -> lwp order. While we're at it, ensure
1060 * that there's room for us to block.
1061 */
1062 mutex_spin_enter(f->fx_sq_lock);
1063 if (__predict_false(q == FUTEX_READERQ &&
1064 f->fx_nwaiters[q] == FUTEX_TID_MASK)) {
1065 mutex_spin_exit(f->fx_sq_lock);
1066 mutex_exit(&f->fx_op_lock);
1067 return ENFILE;
1068 }
1069 lwp_lock(l);
1070
1071 /*
1072 * No need for locks here; until we're on a futex's sleepq, these
1073 * values are private to the LWP, and the locks needed to get onto
1074 * the sleepq provide the memory barriers needed to ensure visibility.
1075 */
1076 l->l_futex = f;
1077 l->l_futex_wakesel = bitset;
1078
1079 /*
1080 * This is an inlined bit of sleepq_enter();
1081 * we already hold the lwp lock.
1082 */
1083 lwp_unlock_to(l, f->fx_sq_lock);
1084 KERNEL_UNLOCK_ALL(NULL, &l->l_biglocks);
1085 KASSERT(lwp_locked(l, f->fx_sq_lock));
1086
1087 sleepq_enqueue(&f->fx_sqs[q], f, futex_wmesg, &futex_syncobj, true);
1088 f->fx_nwaiters[q]++;
1089
1090 /* We can now safely release the op lock. */
1091 mutex_exit(&f->fx_op_lock);
1092
1093 SDT_PROBE1(futex, wait, sleepq_block, entry, timo);
1094 error = sleepq_block(timo, true);
1095 SDT_PROBE1(futex, wait, sleepq_block, exit, error);
1096
1097 /*
1098 * f is no longer valid; we may have been moved to another
1099 * futex sleepq while we slept.
1100 */
1101
1102 /*
1103 * If something went wrong, we may still have a futex reference.
1104 * Check for that here and drop it if so. We can do this without
1105 * taking any locks because l->l_futex is private to the LWP when
1106 * not on any futex's sleepq, and we are not on any sleepq because
1107 * we are running.
1108 *
1109 * Convert EWOULDBLOCK to ETIMEDOUT in case sleepq_block() returned
1110 * EWOULDBLOCK, and ensure the only other error we return is EINTR.
1111 */
1112 if (error) {
1113 f = l->l_futex;
1114 if (f != NULL) {
1115 SDT_PROBE0(futex, wait, finish, aborted);
1116 l->l_futex = NULL;
1117 futex_rele(f);
1118 } else {
1119 /* Raced with wakeup; report success. */
1120 SDT_PROBE0(futex, wait, finish, wakerace);
1121 error = 0;
1122 }
1123 if (error == EWOULDBLOCK)
1124 error = ETIMEDOUT;
1125 else if (error != ETIMEDOUT)
1126 error = EINTR;
1127 } else {
1128 f = l->l_futex;
1129 if (__predict_false(f != NULL)) {
1130 /*
1131 * There are certain situations that can cause
1132 * sleepq_block() to return 0 even if we were
1133 * signalled (by e.g. SIGKILL). In this case,
1134 * we will need to release our futex hold and
1135 * return EINTR (we're probably about to die
1136 * anyway).
1137 */
1138 SDT_PROBE0(futex, wait, finish, aborted);
1139 l->l_futex = NULL;
1140 futex_rele(f);
1141 error = EINTR;
1142 } else {
1143 SDT_PROBE0(futex, wait, finish, normally);
1144 }
1145 }
1146
1147 return error;
1148 }
1149
1150 /*
1151 * futex_wake(f, q, nwake, f2, q2, nrequeue, bitset)
1152 *
1153 * Wake up to nwake waiters on f matching bitset; then, if f2 is
1154 * provided, move up to nrequeue remaining waiters on f matching
1155 * bitset to f2. Return the number of waiters actually woken.
1156 * Caller must hold the locks of f and f2, if provided.
1157 */
1158 static unsigned
1159 futex_wake(struct futex *f, int q, unsigned int const nwake,
1160 struct futex *f2, int q2, unsigned int const nrequeue,
1161 int bitset, unsigned int *nrequeuedp)
1162 {
1163 struct lwp *l, *l_next;
1164 unsigned nwoken = 0;
1165 unsigned nrequeued = 0;
1166
1167 KASSERT(mutex_owned(&f->fx_op_lock));
1168 KASSERT(f2 == NULL || mutex_owned(&f2->fx_op_lock));
1169
1170 futex_sq_lock2(f, f2);
1171
1172 /* Wake up to nwake waiters, and count the number woken. */
1173 LIST_FOREACH_SAFE(l, &f->fx_sqs[q], l_sleepchain, l_next) {
1174 if (nwoken == nwake)
1175 break;
1176
1177 KASSERT(l->l_syncobj == &futex_syncobj);
1178 KASSERT(l->l_wchan == (wchan_t)f);
1179 KASSERT(l->l_futex == f);
1180 KASSERT(l->l_sleepq == &f->fx_sqs[q]);
1181 KASSERT(l->l_mutex == f->fx_sq_lock);
1182
1183 if ((l->l_futex_wakesel & bitset) == 0)
1184 continue;
1185
1186 l->l_futex_wakesel = 0;
1187 l->l_futex = NULL;
1188 sleepq_remove(&f->fx_sqs[q], l);
1189 f->fx_nwaiters[q]--;
1190 nwoken++;
1191 /* hold counts adjusted in bulk below */
1192 }
1193
1194 if (nrequeue) {
1195 KASSERT(f2 != NULL);
1196 /* Move up to nrequeue waiters from f's queue to f2's queue. */
1197 LIST_FOREACH_SAFE(l, &f->fx_sqs[q], l_sleepchain, l_next) {
1198 if (nrequeued == nrequeue)
1199 break;
1200
1201 KASSERT(l->l_syncobj == &futex_syncobj);
1202 KASSERT(l->l_wchan == (wchan_t)f);
1203 KASSERT(l->l_futex == f);
1204 KASSERT(l->l_sleepq == &f->fx_sqs[q]);
1205 KASSERT(l->l_mutex == f->fx_sq_lock);
1206
1207 if ((l->l_futex_wakesel & bitset) == 0)
1208 continue;
1209
1210 l->l_futex = f2;
1211 sleepq_transfer(l, &f->fx_sqs[q],
1212 &f2->fx_sqs[q2], f2, futex_wmesg,
1213 &futex_syncobj, f2->fx_sq_lock, true);
1214 f->fx_nwaiters[q]--;
1215 f2->fx_nwaiters[q2]++;
1216 nrequeued++;
1217 /* hold counts adjusted in bulk below */
1218 }
1219 if (nrequeued) {
1220 /* XXX futex_hold() could theoretically fail here. */
1221 int hold_error __diagused =
1222 futex_hold_count(f2, nrequeued);
1223 KASSERT(hold_error == 0);
1224 }
1225 }
1226
1227 /*
1228 * Adjust the futex reference counts for the wakes and
1229 * requeues.
1230 */
1231 KASSERT(nwoken + nrequeued <= LONG_MAX);
1232 futex_rele_count_not_last(f, nwoken + nrequeued);
1233
1234 futex_sq_unlock2(f, f2);
1235
1236 /* Return the number of waiters woken and requeued. */
1237 if (nrequeuedp != NULL) {
1238 *nrequeuedp = nrequeued;
1239 }
1240 return nwoken;
1241 }
1242
1243 /*
1244 * futex_op_lock(f)
1245 *
1246 * Acquire the op lock of f. Pair with futex_op_unlock. Do
1247 * not use if caller needs to acquire two locks; use
1248 * futex_op_lock2 instead.
1249 */
1250 static void
1251 futex_op_lock(struct futex *f)
1252 {
1253 mutex_enter(&f->fx_op_lock);
1254 }
1255
1256 /*
1257 * futex_op_unlock(f)
1258 *
1259 * Release the op lock of f.
1260 */
1261 static void
1262 futex_op_unlock(struct futex *f)
1263 {
1264 mutex_exit(&f->fx_op_lock);
1265 }
1266
1267 /*
1268 * futex_op_lock2(f, f2)
1269 *
1270 * Acquire the op locks of both f and f2, which may be null, or
1271 * which may be the same. If they are distinct, an arbitrary total
1272 * order is chosen on the locks.
1273 *
1274 * Callers should only ever acquire multiple op locks
1275 * simultaneously using futex_op_lock2.
1276 */
1277 static void
1278 futex_op_lock2(struct futex *f, struct futex *f2)
1279 {
1280
1281 /*
1282 * If both are null, do nothing; if one is null and the other
1283 * is not, lock the other and be done with it.
1284 */
1285 if (f == NULL && f2 == NULL) {
1286 return;
1287 } else if (f == NULL) {
1288 mutex_enter(&f2->fx_op_lock);
1289 return;
1290 } else if (f2 == NULL) {
1291 mutex_enter(&f->fx_op_lock);
1292 return;
1293 }
1294
1295 /* If both futexes are the same, acquire only one. */
1296 if (f == f2) {
1297 mutex_enter(&f->fx_op_lock);
1298 return;
1299 }
1300
1301 /* Otherwise, use the ordering on the kva of the futex pointer. */
1302 if ((uintptr_t)f < (uintptr_t)f2) {
1303 mutex_enter(&f->fx_op_lock);
1304 mutex_enter(&f2->fx_op_lock);
1305 } else {
1306 mutex_enter(&f2->fx_op_lock);
1307 mutex_enter(&f->fx_op_lock);
1308 }
1309 }
1310
1311 /*
1312 * futex_op_unlock2(f, f2)
1313 *
 1314	 *	Release the op locks of both f and f2, which may be null, or
 1315	 *	which may be the same futex.
1316 */
1317 static void
1318 futex_op_unlock2(struct futex *f, struct futex *f2)
1319 {
1320
1321 /*
1322 * If both are null, do nothing; if one is null and the other
1323 * is not, unlock the other and be done with it.
1324 */
1325 if (f == NULL && f2 == NULL) {
1326 return;
1327 } else if (f == NULL) {
1328 mutex_exit(&f2->fx_op_lock);
1329 return;
1330 } else if (f2 == NULL) {
1331 mutex_exit(&f->fx_op_lock);
1332 return;
1333 }
1334
1335 /* If both futexes are the same, release only one. */
1336 if (f == f2) {
1337 mutex_exit(&f->fx_op_lock);
1338 return;
1339 }
1340
1341 /* Otherwise, use the ordering on the kva of the futex pointer. */
1342 if ((uintptr_t)f < (uintptr_t)f2) {
1343 mutex_exit(&f2->fx_op_lock);
1344 mutex_exit(&f->fx_op_lock);
1345 } else {
1346 mutex_exit(&f->fx_op_lock);
1347 mutex_exit(&f2->fx_op_lock);
1348 }
1349 }
1350
1351 /*
1352 * futex_func_wait(uaddr, val, val3, timeout, clkid, clkflags, retval)
1353 *
1354 * Implement futex(FUTEX_WAIT) and futex(FUTEX_WAIT_BITSET).
1355 */
1356 static int
1357 futex_func_wait(bool shared, int *uaddr, int val, int val3,
1358 const struct timespec *timeout, clockid_t clkid, int clkflags,
1359 register_t *retval)
1360 {
1361 struct futex *f;
1362 struct timespec ts;
1363 const struct timespec *deadline;
1364 int error;
1365
1366 /*
1367 * If there's nothing to wait for, and nobody will ever wake
1368 * us, then don't set anything up to wait -- just stop here.
1369 */
1370 if (val3 == 0)
1371 return EINVAL;
1372
1373 /* Optimistically test before anything else. */
1374 if (!futex_test(uaddr, val))
1375 return EAGAIN;
1376
1377 /* Determine a deadline on the specified clock. */
1378 if (timeout == NULL) {
1379 deadline = timeout;
1380 } else if ((clkflags & TIMER_ABSTIME) == TIMER_ABSTIME) {
1381 /* Sanity-check the deadline. */
1382 if (timeout->tv_sec < 0 ||
1383 timeout->tv_nsec < 0 ||
1384 timeout->tv_nsec >= 1000000000L) {
1385 return EINVAL;
1386 }
1387 deadline = timeout;
1388 } else {
1389 struct timespec interval = *timeout;
1390
1391 error = itimespecfix(&interval);
1392 if (error)
1393 return error;
1394 error = clock_gettime1(clkid, &ts);
1395 if (error)
1396 return error;
1397 timespecadd(&ts, &interval, &ts);
1398 deadline = &ts;
1399 }
1400
1401 /* Get the futex, creating it if necessary. */
1402 error = futex_lookup_create(uaddr, shared, FUTEX_CLASS_NORMAL, &f);
1403 if (error)
1404 return error;
1405 KASSERT(f);
1406
1407 /*
1408 * Under the op lock, check the value again: if it has
1409 * already changed, EAGAIN; otherwise enqueue the waiter.
1410 * Since FUTEX_WAKE will use the same lock and be done after
1411 * modifying the value, the order in which we check and enqueue
1412 * is immaterial.
1413 */
1414 futex_op_lock(f);
1415 if (!futex_test(uaddr, val)) {
1416 futex_op_unlock(f);
1417 error = EAGAIN;
1418 goto out;
1419 }
1420
1421 /*
1422 * Now wait. futex_wait() will drop our op lock once we
1423 * are entered into the sleep queue, thus ensuring atomicity
1424 * of wakes with respect to waits.
1425 */
1426 error = futex_wait(f, FUTEX_WRITERQ, deadline, clkid, val3);
1427
1428 /*
1429 * We cannot drop our reference to the futex here, because
1430 * we might be enqueued on a different one when we are awakened.
1431 * The references will be managed on our behalf in the requeue,
1432 * wake, and error cases.
1433 */
1434 f = NULL;
1435
1436 if (__predict_true(error == 0)) {
1437 /* Return 0 on success, error on failure. */
1438 *retval = 0;
1439 }
1440
1441 out: if (f != NULL)
1442 futex_rele(f);
1443 return error;
1444 }
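
/*
 * Illustration of the two timeout modes handled above, in the same
 * informal futex() notation as the example at the top of this file
 * (a sketch only; the real entry point is __futex(2), see
 * sys___futex() below):
 *
 *	// FUTEX_WAIT: the timeout is relative to now, measured on
 *	// CLOCK_MONOTONIC unless FUTEX_CLOCK_REALTIME is or'd into
 *	// the operation.
 *	struct timespec rel = { .tv_sec = 1, .tv_nsec = 0 };
 *	futex(FUTEX_WAIT, &lock, expected, &rel, NULL, 0);
 *
 *	// FUTEX_WAIT_BITSET: the timeout is an absolute deadline on
 *	// the chosen clock, and val3 (which must be nonzero) selects
 *	// which wake bits will rouse us.
 *	struct timespec abs;
 *	clock_gettime(CLOCK_MONOTONIC, &abs);
 *	abs.tv_sec += 1;
 *	futex(FUTEX_WAIT_BITSET, &lock, expected, &abs, NULL,
 *	    FUTEX_BITSET_MATCH_ANY);
 *
 * Both fail with EAGAIN if *uaddr != expected at the time of the
 * call, with ETIMEDOUT once the deadline passes, and with EINTR if
 * interrupted by a signal.
 */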
1445
1446 /*
1447 * futex_func_wake(uaddr, val, val3, retval)
1448 *
1449 * Implement futex(FUTEX_WAKE) and futex(FUTEX_WAKE_BITSET).
1450 */
1451 static int
1452 futex_func_wake(bool shared, int *uaddr, int val, int val3, register_t *retval)
1453 {
1454 struct futex *f;
1455 unsigned int nwoken = 0;
1456 int error = 0;
1457
1458 /* Reject negative number of wakeups. */
1459 if (val < 0) {
1460 error = EINVAL;
1461 goto out;
1462 }
1463
1464 /* Look up the futex, if any. */
1465 error = futex_lookup(uaddr, shared, FUTEX_CLASS_NORMAL, &f);
1466 if (error)
1467 goto out;
1468
1469 /* If there's no futex, there are no waiters to wake. */
1470 if (f == NULL)
1471 goto out;
1472
1473 /*
1474 * Under f's op lock, wake the waiters and remember the
1475 * number woken.
1476 */
1477 futex_op_lock(f);
1478 nwoken = futex_wake(f, FUTEX_WRITERQ, val,
1479 NULL, FUTEX_WRITERQ, 0, val3, NULL);
1480 futex_op_unlock(f);
1481
1482 /* Release the futex. */
1483 futex_rele(f);
1484
1485 out:
1486 /* Return the number of waiters woken. */
1487 *retval = nwoken;
1488
1489 /* Success! */
1490 return error;
1491 }
1492
1493 /*
1494 * futex_func_requeue(op, uaddr, val, uaddr2, val2, val3, retval)
1495 *
1496 * Implement futex(FUTEX_REQUEUE) and futex(FUTEX_CMP_REQUEUE).
1497 */
1498 static int
1499 futex_func_requeue(bool shared, int op, int *uaddr, int val, int *uaddr2,
1500 int val2, int val3, register_t *retval)
1501 {
1502 struct futex *f = NULL, *f2 = NULL;
1503 unsigned nwoken = 0; /* default to zero woken on early return */
1504 unsigned nrequeued = 0;
1505 int error;
1506
1507 /* Reject negative number of wakeups or requeues. */
1508 if (val < 0 || val2 < 0) {
1509 error = EINVAL;
1510 goto out;
1511 }
1512
1513 /* Look up the source futex, if any. */
1514 error = futex_lookup(uaddr, shared, FUTEX_CLASS_NORMAL, &f);
1515 if (error)
1516 goto out;
1517
1518 /* If there is none, nothing to do. */
1519 if (f == NULL)
1520 goto out;
1521
1522 /*
1523 * We may need to create the destination futex because it's
1524 * entirely possible it does not currently have any waiters.
1525 */
1526 error = futex_lookup_create(uaddr2, shared, FUTEX_CLASS_NORMAL, &f2);
1527 if (error)
1528 goto out;
1529
1530 /*
1531 * Under the futexes' op locks, check the value; if
1532 * unchanged from val3, wake the waiters.
1533 */
1534 futex_op_lock2(f, f2);
1535 if (op == FUTEX_CMP_REQUEUE && !futex_test(uaddr, val3)) {
1536 error = EAGAIN;
1537 } else {
1538 error = 0;
1539 nwoken = futex_wake(f, FUTEX_WRITERQ, val,
1540 f2, FUTEX_WRITERQ, val2,
1541 FUTEX_BITSET_MATCH_ANY,
1542 &nrequeued);
1543 }
1544 futex_op_unlock2(f, f2);
1545
1546 out:
1547 /*
 1548	 * For FUTEX_REQUEUE, return the number of waiters woken.
1549 *
1550 * For FUTEX_CMP_REQUEUE, return the number of waiters woken
1551 * **and** requeued.
1552 */
 1553	*retval = nwoken + (op == FUTEX_CMP_REQUEUE ? nrequeued : 0);
1554
1555 /* Release the futexes if we got them. */
1556 if (f2)
1557 futex_rele(f2);
1558 if (f)
1559 futex_rele(f);
1560 return error;
1561 }
1562
1563 /*
1564 * futex_validate_op_cmp(val3)
1565 *
1566 * Validate an op/cmp argument for FUTEX_WAKE_OP.
1567 */
1568 static int
1569 futex_validate_op_cmp(int val3)
1570 {
1571 int op = __SHIFTOUT(val3, FUTEX_OP_OP_MASK);
1572 int cmp = __SHIFTOUT(val3, FUTEX_OP_CMP_MASK);
1573
1574 if (op & FUTEX_OP_OPARG_SHIFT) {
1575 int oparg = __SHIFTOUT(val3, FUTEX_OP_OPARG_MASK);
1576 if (oparg < 0)
1577 return EINVAL;
1578 if (oparg >= 32)
1579 return EINVAL;
1580 op &= ~FUTEX_OP_OPARG_SHIFT;
1581 }
1582
1583 switch (op) {
1584 case FUTEX_OP_SET:
1585 case FUTEX_OP_ADD:
1586 case FUTEX_OP_OR:
1587 case FUTEX_OP_ANDN:
1588 case FUTEX_OP_XOR:
1589 break;
1590 default:
1591 return EINVAL;
1592 }
1593
1594 switch (cmp) {
1595 case FUTEX_OP_CMP_EQ:
1596 case FUTEX_OP_CMP_NE:
1597 case FUTEX_OP_CMP_LT:
1598 case FUTEX_OP_CMP_LE:
1599 case FUTEX_OP_CMP_GT:
1600 case FUTEX_OP_CMP_GE:
1601 break;
1602 default:
1603 return EINVAL;
1604 }
1605
1606 return 0;
1607 }
1608
1609 /*
1610 * futex_compute_op(oldval, val3)
1611 *
 1612	 *	Apply a FUTEX_WAKE_OP operation to oldval.
1613 */
1614 static int
1615 futex_compute_op(int oldval, int val3)
1616 {
1617 int op = __SHIFTOUT(val3, FUTEX_OP_OP_MASK);
1618 int oparg = __SHIFTOUT(val3, FUTEX_OP_OPARG_MASK);
1619
1620 if (op & FUTEX_OP_OPARG_SHIFT) {
1621 KASSERT(oparg >= 0);
1622 KASSERT(oparg < 32);
1623 oparg = 1u << oparg;
1624 op &= ~FUTEX_OP_OPARG_SHIFT;
1625 }
1626
1627 switch (op) {
1628 case FUTEX_OP_SET:
1629 return oparg;
1630
1631 case FUTEX_OP_ADD:
1632 /*
1633 * Avoid signed arithmetic overflow by doing
1634 * arithmetic unsigned and converting back to signed
1635 * at the end.
1636 */
1637 return (int)((unsigned)oldval + (unsigned)oparg);
1638
1639 case FUTEX_OP_OR:
1640 return oldval | oparg;
1641
1642 case FUTEX_OP_ANDN:
1643 return oldval & ~oparg;
1644
1645 case FUTEX_OP_XOR:
1646 return oldval ^ oparg;
1647
1648 default:
1649 panic("invalid futex op");
1650 }
1651 }
1652
1653 /*
1654 * futex_compute_cmp(oldval, val3)
1655 *
 1656	 *	Apply a FUTEX_WAKE_OP comparison to oldval.
1657 */
1658 static bool
1659 futex_compute_cmp(int oldval, int val3)
1660 {
1661 int cmp = __SHIFTOUT(val3, FUTEX_OP_CMP_MASK);
1662 int cmparg = __SHIFTOUT(val3, FUTEX_OP_CMPARG_MASK);
1663
1664 switch (cmp) {
1665 case FUTEX_OP_CMP_EQ:
1666 return (oldval == cmparg);
1667
1668 case FUTEX_OP_CMP_NE:
1669 return (oldval != cmparg);
1670
1671 case FUTEX_OP_CMP_LT:
1672 return (oldval < cmparg);
1673
1674 case FUTEX_OP_CMP_LE:
1675 return (oldval <= cmparg);
1676
1677 case FUTEX_OP_CMP_GT:
1678 return (oldval > cmparg);
1679
1680 case FUTEX_OP_CMP_GE:
1681 return (oldval >= cmparg);
1682
1683 default:
1684 panic("invalid futex cmp operation");
1685 }
1686 }
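
/*
 * Worked example of the val3 encoding consumed by the helpers above,
 * expressed with __SHIFTIN() (the inverse of __SHIFTOUT()); this is a
 * sketch of the encoding, not a userland API:
 *
 *	int val3 = __SHIFTIN(FUTEX_OP_ADD, FUTEX_OP_OP_MASK) |
 *	    __SHIFTIN(1, FUTEX_OP_OPARG_MASK) |
 *	    __SHIFTIN(FUTEX_OP_CMP_EQ, FUTEX_OP_CMP_MASK) |
 *	    __SHIFTIN(0, FUTEX_OP_CMPARG_MASK);
 *
 * With this val3, futex_func_wake_op() below atomically performs
 * *uaddr2 += 1, always wakes up to val waiters on uaddr, and wakes up
 * to val2 additional waiters on uaddr2 only if the old value of
 * *uaddr2 compared equal to 0.
 */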
1687
1688 /*
1689 * futex_func_wake_op(uaddr, val, uaddr2, val2, val3, retval)
1690 *
1691 * Implement futex(FUTEX_WAKE_OP).
1692 */
1693 static int
1694 futex_func_wake_op(bool shared, int *uaddr, int val, int *uaddr2, int val2,
1695 int val3, register_t *retval)
1696 {
1697 struct futex *f = NULL, *f2 = NULL;
1698 int oldval, newval, actual;
1699 unsigned nwoken = 0;
1700 int error;
1701
1702 /* Reject negative number of wakeups. */
1703 if (val < 0 || val2 < 0) {
1704 error = EINVAL;
1705 goto out;
1706 }
1707
1708 /* Reject invalid operations before we start doing things. */
1709 if ((error = futex_validate_op_cmp(val3)) != 0)
1710 goto out;
1711
1712 /* Look up the first futex, if any. */
1713 error = futex_lookup(uaddr, shared, FUTEX_CLASS_NORMAL, &f);
1714 if (error)
1715 goto out;
1716
1717 /* Look up the second futex, if any. */
1718 error = futex_lookup(uaddr2, shared, FUTEX_CLASS_NORMAL, &f2);
1719 if (error)
1720 goto out;
1721
1722 /*
1723 * Under the op locks:
1724 *
1725 * 1. Read/modify/write: *uaddr2 op= oparg.
1726 * 2. Unconditionally wake uaddr.
1727 * 3. Conditionally wake uaddr2, if it previously matched val3.
1728 */
1729 futex_op_lock2(f, f2);
1730 do {
1731 error = futex_load(uaddr2, &oldval);
1732 if (error)
1733 goto out_unlock;
1734 newval = futex_compute_op(oldval, val3);
1735 error = ucas_int(uaddr2, oldval, newval, &actual);
1736 if (error)
1737 goto out_unlock;
1738 } while (actual != oldval);
1739 nwoken = (f ? futex_wake(f, FUTEX_WRITERQ, val,
1740 NULL, FUTEX_WRITERQ, 0,
1741 FUTEX_BITSET_MATCH_ANY, NULL) : 0);
1742 if (f2 && futex_compute_cmp(oldval, val3))
1743 nwoken += futex_wake(f2, FUTEX_WRITERQ, val2,
1744 NULL, FUTEX_WRITERQ, 0,
1745 FUTEX_BITSET_MATCH_ANY, NULL);
1746
1747 /* Success! */
1748 error = 0;
1749 out_unlock:
1750 futex_op_unlock2(f, f2);
1751
1752 out:
1753 /* Return the number of waiters woken. */
1754 *retval = nwoken;
1755
1756 /* Release the futexes, if we got them. */
1757 if (f2)
1758 futex_rele(f2);
1759 if (f)
1760 futex_rele(f);
1761 return error;
1762 }
1763
1764 /*
1765 * do_futex(uaddr, op, val, timeout, uaddr2, val2, val3)
1766 *
1767 * Implement the futex system call with all the parameters
1768 * parsed out.
1769 */
1770 int
1771 do_futex(int *uaddr, int op, int val, const struct timespec *timeout,
1772 int *uaddr2, int val2, int val3, register_t *retval)
1773 {
1774 const bool shared = (op & FUTEX_PRIVATE_FLAG) ? false : true;
1775 const clockid_t clkid = (op & FUTEX_CLOCK_REALTIME) ? CLOCK_REALTIME
1776 : CLOCK_MONOTONIC;
1777 int error;
1778
1779 op &= FUTEX_CMD_MASK;
1780
1781 switch (op) {
1782 case FUTEX_WAIT:
1783 SDT_PROBE6(futex, func, wait, entry,
1784 uaddr, val, FUTEX_BITSET_MATCH_ANY, timeout,
1785 TIMER_RELTIME, op & ~FUTEX_CMD_MASK);
1786 error = futex_func_wait(shared, uaddr, val,
1787 FUTEX_BITSET_MATCH_ANY, timeout, clkid, TIMER_RELTIME,
1788 retval);
1789 SDT_PROBE1(futex, func, wait, exit, error);
1790 break;
1791
1792 case FUTEX_WAIT_BITSET:
1793 SDT_PROBE6(futex, func, wait, entry,
1794 uaddr, val, val3, timeout,
1795 TIMER_ABSTIME, op & ~FUTEX_CMD_MASK);
1796 error = futex_func_wait(shared, uaddr, val, val3, timeout,
1797 clkid, TIMER_ABSTIME, retval);
1798 SDT_PROBE1(futex, func, wait, exit, error);
1799 break;
1800
1801 case FUTEX_WAKE:
1802 SDT_PROBE4(futex, func, wake, entry,
1803 uaddr, val, FUTEX_BITSET_MATCH_ANY, op & ~FUTEX_CMD_MASK);
1804 error = futex_func_wake(shared, uaddr, val,
1805 FUTEX_BITSET_MATCH_ANY, retval);
1806 SDT_PROBE2(futex, func, wake, exit, error, *retval);
1807 break;
1808
1809 case FUTEX_WAKE_BITSET:
1810 SDT_PROBE4(futex, func, wake, entry,
1811 uaddr, val, val3, op & ~FUTEX_CMD_MASK);
1812 error = futex_func_wake(shared, uaddr, val, val3, retval);
1813 SDT_PROBE2(futex, func, wake, exit, error, *retval);
1814 break;
1815
1816 case FUTEX_REQUEUE:
1817 SDT_PROBE5(futex, func, requeue, entry,
1818 uaddr, val, uaddr2, val2, op & ~FUTEX_CMD_MASK);
1819 error = futex_func_requeue(shared, op, uaddr, val, uaddr2,
1820 val2, 0, retval);
1821 SDT_PROBE2(futex, func, requeue, exit, error, *retval);
1822 break;
1823
1824 case FUTEX_CMP_REQUEUE:
1825 SDT_PROBE6(futex, func, cmp_requeue, entry,
1826 uaddr, val, uaddr2, val2, val3, op & ~FUTEX_CMD_MASK);
1827 error = futex_func_requeue(shared, op, uaddr, val, uaddr2,
1828 val2, val3, retval);
1829 SDT_PROBE2(futex, func, cmp_requeue, exit, error, *retval);
1830 break;
1831
1832 case FUTEX_WAKE_OP:
1833 SDT_PROBE6(futex, func, wake_op, entry,
1834 uaddr, val, uaddr2, val2, val3, op & ~FUTEX_CMD_MASK);
1835 error = futex_func_wake_op(shared, uaddr, val, uaddr2, val2,
1836 val3, retval);
1837 SDT_PROBE2(futex, func, wake_op, exit, error, *retval);
1838 break;
1839
1840 case FUTEX_FD:
1841 default:
1842 error = ENOSYS;
1843 break;
1844 }
1845
1846 return error;
1847 }
1848
1849 /*
1850 * sys___futex(l, uap, retval)
1851 *
1852 * __futex(2) system call: generic futex operations.
1853 */
1854 int
1855 sys___futex(struct lwp *l, const struct sys___futex_args *uap,
1856 register_t *retval)
1857 {
1858 /* {
1859 syscallarg(int *) uaddr;
1860 syscallarg(int) op;
1861 syscallarg(int) val;
1862 syscallarg(const struct timespec *) timeout;
1863 syscallarg(int *) uaddr2;
1864 syscallarg(int) val2;
1865 syscallarg(int) val3;
1866 } */
1867 struct timespec ts, *tsp;
1868 int error;
1869
1870 /*
1871 * Copy in the timeout argument, if specified.
1872 */
1873 if (SCARG(uap, timeout)) {
1874 error = copyin(SCARG(uap, timeout), &ts, sizeof(ts));
1875 if (error)
1876 return error;
1877 tsp = &ts;
1878 } else {
1879 tsp = NULL;
1880 }
1881
1882 return do_futex(SCARG(uap, uaddr), SCARG(uap, op), SCARG(uap, val),
1883 tsp, SCARG(uap, uaddr2), SCARG(uap, val2), SCARG(uap, val3),
1884 retval);
1885 }
1886
1887 /*
1888 * sys___futex_set_robust_list(l, uap, retval)
1889 *
1890 * __futex_set_robust_list(2) system call for robust futexes.
1891 */
1892 int
1893 sys___futex_set_robust_list(struct lwp *l,
1894 const struct sys___futex_set_robust_list_args *uap, register_t *retval)
1895 {
1896 /* {
1897 syscallarg(void *) head;
1898 syscallarg(size_t) len;
1899 } */
1900 void *head = SCARG(uap, head);
1901
1902 if (SCARG(uap, len) != _FUTEX_ROBUST_HEAD_SIZE)
1903 return EINVAL;
1904 if ((uintptr_t)head % sizeof(u_long))
1905 return EINVAL;
1906
1907 l->l_robust_head = (uintptr_t)head;
1908
1909 return 0;
1910 }
1911
1912 /*
1913 * sys___futex_get_robust_list(l, uap, retval)
1914 *
1915 * __futex_get_robust_list(2) system call for robust futexes.
1916 */
1917 int
1918 sys___futex_get_robust_list(struct lwp *l,
1919 const struct sys___futex_get_robust_list_args *uap, register_t *retval)
1920 {
1921 /* {
1922 syscallarg(lwpid_t) lwpid;
1923 syscallarg(void **) headp;
1924 syscallarg(size_t *) lenp;
1925 } */
1926 void *head;
1927 const size_t len = _FUTEX_ROBUST_HEAD_SIZE;
1928 int error;
1929
1930 error = futex_robust_head_lookup(l, SCARG(uap, lwpid), &head);
1931 if (error)
1932 return error;
1933
1934 /* Copy out the head pointer and the head structure length. */
1935 error = copyout(&head, SCARG(uap, headp), sizeof(head));
1936 if (__predict_true(error == 0)) {
1937 error = copyout(&len, SCARG(uap, lenp), sizeof(len));
1938 }
1939
1940 return error;
1941 }
1942
1943 /*
1944 * release_futex(uva, tid)
1945 *
1946 * Try to release the robust futex at uva in the current process
1947 * on lwp exit. If anything goes wrong, silently fail. It is the
1948 * userland program's obligation to arrange correct behaviour.
1949 */
1950 static void
1951 release_futex(uintptr_t const uptr, lwpid_t const tid, bool const is_pi,
1952 bool const is_pending)
1953 {
1954 int *uaddr;
1955 struct futex *f;
1956 int oldval, newval, actual;
1957 int error;
1958 bool shared;
1959
1960 /* If it's misaligned, tough. */
1961 if (__predict_false(uptr & 3))
1962 return;
1963 uaddr = (int *)uptr;
1964
1965 error = futex_load(uaddr, &oldval);
1966 if (__predict_false(error))
1967 return;
1968
1969 /*
1970 * There are two race conditions we need to handle here:
1971 *
1972 * 1. User space cleared the futex word but died before
1973 * being able to issue the wakeup. No wakeups will
1974 * ever be issued, oops!
1975 *
1976 * 2. Awakened waiter died before being able to acquire
1977 * the futex in user space. Any other waiters are
1978 * now stuck, oops!
1979 *
1980 * In both of these cases, the futex word will be 0 (because
1981 * it's updated before the wake is issued). The best we can
1982 * do is detect this situation if it's the pending futex and
1983 * issue a wake without modifying the futex word.
1984 *
1985 * XXX eventual PI handling?
1986 */
1987 if (__predict_false(is_pending && (oldval & ~FUTEX_WAITERS) == 0)) {
1988 register_t retval;
1989 error = futex_func_wake(/*shared*/true, uaddr, 1,
1990 FUTEX_BITSET_MATCH_ANY, &retval);
1991 if (error != 0 || retval == 0)
1992 (void) futex_func_wake(/*shared*/false, uaddr, 1,
1993 FUTEX_BITSET_MATCH_ANY, &retval);
1994 return;
1995 }
1996
1997 /* Optimistically test whether we need to do anything at all. */
1998 if ((oldval & FUTEX_TID_MASK) != tid)
1999 return;
2000
2001 /*
2002 * We need to handle the case where this thread owned the futex,
2003 * but it was uncontended. In this case, there won't be any
2004 * kernel state to look up. All we can do is mark the futex
2005 * as a zombie to be mopped up the next time another thread
2006 * attempts to acquire it.
2007 *
2008 * N.B. It's important to ensure to set FUTEX_OWNER_DIED in
 2009	 * this loop, even if waiters appear while we are doing
 2010	 * so.  This is because FUTEX_WAITERS is set by user space
2011 * before calling __futex() to wait, and the futex needs
2012 * to be marked as a zombie when the new waiter gets into
2013 * the kernel.
2014 */
2015 if ((oldval & FUTEX_WAITERS) == 0) {
2016 do {
2017 error = futex_load(uaddr, &oldval);
2018 if (error)
2019 return;
2020 if ((oldval & FUTEX_TID_MASK) != tid)
2021 return;
2022 newval = oldval | FUTEX_OWNER_DIED;
2023 error = ucas_int(uaddr, oldval, newval, &actual);
2024 if (error)
2025 return;
2026 } while (actual != oldval);
2027
2028 /*
 2029	 * If there is still no indication of waiters, then there is
2030 * no more work for us to do.
2031 */
2032 if ((oldval & FUTEX_WAITERS) == 0)
2033 return;
2034 }
2035
2036 /*
2037 * Look for a futex. Try shared first, then private. If we
 2038	 * can't find one, tough.
2039 *
2040 * Note: the assumption here is that anyone placing a futex on
2041 * the robust list is adhering to the rules, regardless of the
2042 * futex class.
2043 */
2044 for (f = NULL, shared = true; f == NULL; shared = false) {
2045 error = futex_lookup(uaddr, shared, FUTEX_CLASS_ANY, &f);
2046 if (error)
2047 return;
2048 if (f == NULL && shared == false)
2049 return;
2050 }
2051
2052 /* Work under the futex op lock. */
2053 futex_op_lock(f);
2054
2055 /*
2056 * Fetch the word: if the tid doesn't match ours, skip;
2057 * otherwise, set the owner-died bit, atomically.
2058 */
2059 do {
2060 error = futex_load(uaddr, &oldval);
2061 if (error)
2062 goto out;
2063 if ((oldval & FUTEX_TID_MASK) != tid)
2064 goto out;
2065 newval = oldval | FUTEX_OWNER_DIED;
2066 error = ucas_int(uaddr, oldval, newval, &actual);
2067 if (error)
2068 goto out;
2069 } while (actual != oldval);
2070
2071 /*
2072 * If there may be waiters, try to wake one. If anything goes
2073 * wrong, tough.
2074 *
2075 * XXX eventual PI handling?
2076 */
2077 if (oldval & FUTEX_WAITERS) {
2078 (void)futex_wake(f, FUTEX_WRITERQ, 1,
2079 NULL, FUTEX_WRITERQ, 0,
2080 FUTEX_BITSET_MATCH_ANY, NULL);
2081 }
2082
2083 /* Unlock the queue and release the futex. */
2084 out: futex_op_unlock(f);
2085 futex_rele(f);
2086 }
2087
2088 /*
2089 * futex_robust_head_lookup(l, lwpid)
2090 *
2091 * Helper function to look up a robust head by LWP ID.
2092 */
2093 int
2094 futex_robust_head_lookup(struct lwp *l, lwpid_t lwpid, void **headp)
2095 {
2096 struct proc *p = l->l_proc;
2097
2098 /* Find the other lwp, if requested; otherwise use our robust head. */
2099 if (lwpid) {
2100 mutex_enter(p->p_lock);
2101 l = lwp_find(p, lwpid);
2102 if (l == NULL) {
2103 mutex_exit(p->p_lock);
2104 return ESRCH;
2105 }
2106 *headp = (void *)l->l_robust_head;
2107 mutex_exit(p->p_lock);
2108 } else {
2109 *headp = (void *)l->l_robust_head;
2110 }
2111 return 0;
2112 }
2113
2114 /*
2115 * futex_fetch_robust_head(uaddr)
2116 *
2117 * Helper routine to fetch the futex robust list head that
2118 * handles 32-bit binaries running on 64-bit kernels.
2119 */
2120 static int
2121 futex_fetch_robust_head(uintptr_t uaddr, u_long *rhead)
2122 {
2123 #ifdef _LP64
2124 if (curproc->p_flag & PK_32) {
2125 uint32_t rhead32[_FUTEX_ROBUST_HEAD_NWORDS];
2126 int error;
2127
2128 error = copyin((void *)uaddr, rhead32, sizeof(rhead32));
2129 if (__predict_true(error == 0)) {
2130 for (int i = 0; i < _FUTEX_ROBUST_HEAD_NWORDS; i++) {
2131 if (i == _FUTEX_ROBUST_HEAD_OFFSET) {
2132 /*
2133 * Make sure the offset is sign-
2134 * extended.
2135 */
2136 rhead[i] = (int32_t)rhead32[i];
2137 } else {
2138 rhead[i] = rhead32[i];
2139 }
2140 }
2141 }
2142 return error;
2143 }
 2144	#endif /* _LP64 */
2145
2146 return copyin((void *)uaddr, rhead,
2147 sizeof(*rhead) * _FUTEX_ROBUST_HEAD_NWORDS);
2148 }
2149
2150 /*
2151 * futex_decode_robust_word(word)
2152 *
2153 * Decode a robust futex list word into the entry and entry
2154 * properties.
2155 */
2156 static inline void
2157 futex_decode_robust_word(uintptr_t const word, uintptr_t * const entry,
2158 bool * const is_pi)
2159 {
2160 *is_pi = (word & _FUTEX_ROBUST_ENTRY_PI) ? true : false;
2161 *entry = word & ~_FUTEX_ROBUST_ENTRY_PI;
2162 }
2163
2164 /*
2165 * futex_fetch_robust_entry(uaddr)
2166 *
2167 * Helper routine to fetch and decode a robust futex entry
2168 * that handles 32-bit binaries running on 64-bit kernels.
2169 */
2170 static int
2171 futex_fetch_robust_entry(uintptr_t const uaddr, uintptr_t * const valp,
2172 bool * const is_pi)
2173 {
2174 uintptr_t val = 0;
2175 int error = 0;
2176
2177 #ifdef _LP64
2178 if (curproc->p_flag & PK_32) {
2179 uint32_t val32;
2180
2181 error = ufetch_32((uint32_t *)uaddr, &val32);
2182 if (__predict_true(error == 0))
2183 val = val32;
2184 } else
2185 #endif /* _LP64 */
2186 error = ufetch_long((u_long *)uaddr, (u_long *)&val);
2187 if (__predict_false(error))
2188 return error;
2189
2190 futex_decode_robust_word(val, valp, is_pi);
2191 return 0;
2192 }
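
/*
 * For reference, the userland layout consumed by the walker below is
 * an array of _FUTEX_ROBUST_HEAD_NWORDS words holding a list pointer,
 * a signed offset, and a "pending" pointer.  Expressed as structures
 * (the names here are illustrative only -- the kernel consumes the
 * layout via the _FUTEX_ROBUST_HEAD_* indices from <sys/futex.h>):
 *
 *	struct robust_entry {			// embedded in each robust lock
 *		struct robust_entry *next;	// low bit: _FUTEX_ROBUST_ENTRY_PI
 *	};
 *	struct robust_head {
 *		struct robust_entry *list;	// _FUTEX_ROBUST_HEAD_LIST
 *		long offset;			// _FUTEX_ROBUST_HEAD_OFFSET
 *		struct robust_entry *pending;	// _FUTEX_ROBUST_HEAD_PENDING
 *	};
 *
 * The list is circular: traversal in futex_release_all_lwp() stops
 * when an entry's next pointer points back at the head registered
 * with __futex_set_robust_list(2), and the futex word for an entry
 * lives at (uintptr_t)entry + offset.
 */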
2193
2194 /*
2195 * futex_release_all_lwp(l, tid)
2196 *
2197 * Release all l's robust futexes. If anything looks funny in
2198 * the process, give up -- it's userland's responsibility to dot
2199 * the i's and cross the t's.
2200 */
2201 void
2202 futex_release_all_lwp(struct lwp * const l, lwpid_t const tid)
2203 {
2204 u_long rhead[_FUTEX_ROBUST_HEAD_NWORDS];
2205 int limit = 1000000;
2206 int error;
2207
2208 /* If there's no robust list there's nothing to do. */
2209 if (l->l_robust_head == 0)
2210 return;
2211
2212 /* Read the final snapshot of the robust list head. */
2213 error = futex_fetch_robust_head(l->l_robust_head, rhead);
2214 if (error) {
2215 printf("WARNING: pid %jd (%s) lwp %jd tid %jd:"
2216 " unmapped robust futex list head\n",
2217 (uintmax_t)l->l_proc->p_pid, l->l_proc->p_comm,
2218 (uintmax_t)l->l_lid, (uintmax_t)tid);
2219 return;
2220 }
2221
2222 const long offset = (long)rhead[_FUTEX_ROBUST_HEAD_OFFSET];
2223
2224 uintptr_t next, pending;
2225 bool is_pi, pending_is_pi;
2226
2227 futex_decode_robust_word(rhead[_FUTEX_ROBUST_HEAD_LIST],
2228 &next, &is_pi);
2229 futex_decode_robust_word(rhead[_FUTEX_ROBUST_HEAD_PENDING],
2230 &pending, &pending_is_pi);
2231
2232 /*
2233 * Walk down the list of locked futexes and release them, up
2234 * to one million of them before we give up.
2235 */
2236
2237 while (next != l->l_robust_head && limit-- > 0) {
2238 /* pending handled below. */
2239 if (next != pending)
2240 release_futex(next + offset, tid, is_pi, false);
2241 error = futex_fetch_robust_entry(next, &next, &is_pi);
2242 if (error)
2243 break;
2244 preempt_point();
2245 }
2246 if (limit <= 0) {
2247 printf("WARNING: pid %jd (%s) lwp %jd tid %jd:"
2248 " exhausted robust futex limit\n",
2249 (uintmax_t)l->l_proc->p_pid, l->l_proc->p_comm,
2250 (uintmax_t)l->l_lid, (uintmax_t)tid);
2251 }
2252
2253 /* If there's a pending futex, it may need to be released too. */
2254 if (pending != 0) {
2255 release_futex(pending + offset, tid, pending_is_pi, true);
2256 }
2257 }
2258