1 /* $NetBSD: sys_futex.c,v 1.12.4.4 2021/08/06 23:53:53 thorpej Exp $ */
2
3 /*-
4 * Copyright (c) 2018, 2019, 2020 The NetBSD Foundation, Inc.
5 * All rights reserved.
6 *
7 * This code is derived from software contributed to The NetBSD Foundation
8 * by Taylor R. Campbell and Jason R. Thorpe.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 * POSSIBILITY OF SUCH DAMAGE.
30 */
31
32 #include <sys/cdefs.h>
33 __KERNEL_RCSID(0, "$NetBSD: sys_futex.c,v 1.12.4.4 2021/08/06 23:53:53 thorpej Exp $");
34
35 /*
36 * Futexes
37 *
38 * The futex system call coordinates notifying threads waiting for
39 * changes on a 32-bit word of memory. The word can be managed by
40 * CPU atomic operations in userland, without system calls, as long
41 * as there is no contention.
42 *
43 * The simplest use case demonstrating the utility is:
44 *
45 * // 32-bit word of memory shared among threads or
46 * // processes in userland. lock & 1 means owned;
47 * // lock & 2 means there are waiters waiting.
48 * volatile int lock = 0;
49 *
50 * int v;
51 *
52 * // Acquire a lock.
53 * do {
54 * v = lock;
55 * if (v & 1) {
56 * // Lock is held. Set a bit to say that
57 * // there are waiters, and wait for lock
58 * // to change to anything other than v;
59 * // then retry.
60 * if (atomic_cas_uint(&lock, v, v | 2) != v)
61 * continue;
62 * futex(FUTEX_WAIT, &lock, v | 2, NULL, NULL, 0);
63 * continue;
64 * }
65 * } while (atomic_cas_uint(&lock, v, v | 1) != v);
66 * membar_enter();
67 *
68 * ...
69 *
70 * // Release the lock. Optimistically assume there are
71 * // no waiters first until demonstrated otherwise.
72 * membar_exit();
73 * if (atomic_cas_uint(&lock, 1, 0) != 1) {
74 * // There may be waiters.
75 * v = atomic_swap_uint(&lock, 0);
76 * // If there are still waiters, wake one.
77 * if (v & 2)
78 * futex(FUTEX_WAKE, &lock, 1, NULL, NULL, 0);
79 * }
80 *
81 * The goal is to avoid the futex system call unless there is
82 * contention; then if there is contention, to guarantee no missed
83 * wakeups.
84 *
85 * For a simple implementation, futex(FUTEX_WAIT) could queue
86 * itself to be woken, double-check the lock word, and then sleep;
87 * spurious wakeups are generally a fact of life, so any
88 * FUTEX_WAKE could just wake every FUTEX_WAIT in the system.
89 *
90 * If this were all there is to it, we could then increase
91 * parallelism by refining the approximation: partition the
92 * waiters into buckets by hashing the lock addresses to reduce
93 * the incidence of spurious wakeups. But this is not all.
94 *
95 * The futex(FUTEX_CMP_REQUEUE, &lock, n, &lock2, m, val)
96 * operation not only wakes n waiters on lock if lock == val, but
97 * also _transfers_ m additional waiters to lock2. Unless wakeups
98 * on lock2 also trigger wakeups on lock, we cannot move waiters
99 * to lock2 if they merely share the same hash as waiters on lock.
100 * Thus, we can't approximately distribute waiters into queues by
101 * a hash function; we must distinguish futex queues exactly by
102 * lock address.
103 *
104 * For now, we use a global red/black tree to index futexes. This
105 * should be replaced by a lockless radix tree with a thread to
106 * free entries no longer in use once all lookups on all CPUs have
107 * completed.
108 *
109 * Specifically, we maintain two maps:
110 *
111 * futex_tab.va[vmspace, va] for private futexes
112 * futex_tab.oa[uvm_voaddr] for shared futexes
113 *
114 * This implementation does not support priority inheritance.
115 */
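
/*
 * To illustrate why exact-address requeueing matters, here is a
 * hedged sketch of a condition-variable broadcast built on the
 * protocol above.  It is illustrative only: "cv_seq" and "mtx" are
 * hypothetical userland words (mtx follows the acquire/release
 * protocol shown above), and the EAGAIN retry on a racing broadcast
 * is omitted.
 *
 *	// Wait: caller holds mtx; cv_seq is a generation counter.
 *	unsigned seq = cv_seq;
 *	mtx_release(&mtx);
 *	futex(FUTEX_WAIT, &cv_seq, seq, NULL, NULL, 0);
 *	mtx_acquire(&mtx);
 *
 *	// Broadcast: wake one waiter and transfer the rest onto the
 *	// mutex word, so they are woken one at a time as mtx is
 *	// released rather than stampeding for it all at once.
 *	unsigned v = atomic_add_int_nv(&cv_seq, 1);
 *	futex(FUTEX_CMP_REQUEUE, &cv_seq, 1, &mtx, INT_MAX, v);
 */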
116
117 #include <sys/param.h>
118 #include <sys/types.h>
119 #include <sys/atomic.h>
120 #include <sys/condvar.h>
121 #include <sys/futex.h>
122 #include <sys/mutex.h>
123 #include <sys/rbtree.h>
124 #include <sys/sleepq.h>
125 #include <sys/queue.h>
126 #include <sys/sdt.h>
127
128 #include <sys/syscall.h>
129 #include <sys/syscallargs.h>
130 #include <sys/syscallvar.h>
131
132 #include <uvm/uvm_extern.h>
133
134 /*
135 * DTrace probes.
136 */
137 SDT_PROVIDER_DEFINE(futex);
138
139 /* entry: uaddr, val, bitset, timeout, clkflags, fflags */
140 /* exit: error */
141 SDT_PROBE_DEFINE6(futex, func, wait, entry, "int *", "int", "int",
142 "struct timespec *", "int", "int");
143 SDT_PROBE_DEFINE1(futex, func, wait, exit, "int");
144
145 /* entry: uaddr, nwake, bitset, fflags */
146 /* exit: error, nwoken */
147 SDT_PROBE_DEFINE4(futex, func, wake, entry, "int *", "int", "int", "int");
148 SDT_PROBE_DEFINE2(futex, func, wake, exit, "int", "int");
149
150 /* entry: uaddr, nwake, uaddr2, nrequeue, fflags */
151 /* exit: error, nwoken */
152 SDT_PROBE_DEFINE5(futex, func, requeue, entry, "int *", "int", "int *", "int",
153 "int");
154 SDT_PROBE_DEFINE2(futex, func, requeue, exit, "int", "int");
155
156 /* entry: uaddr, nwake, uaddr2, nrequeue, val3, fflags */
157 /* exit: error, nwoken */
158 SDT_PROBE_DEFINE6(futex, func, cmp_requeue, entry, "int *", "int", "int *",
159 "int", "int", "int");
160 SDT_PROBE_DEFINE2(futex, func, cmp_requeue, exit, "int", "int");
161
162 /* entry: uaddr, nwake, uaddr2, nwake2, wakeop, fflags */
163 /* exit: error, nwoken */
164 SDT_PROBE_DEFINE6(futex, func, wake_op, entry, "int *", "int", "int *", "int",
165 "int", "int");
166 SDT_PROBE_DEFINE2(futex, func, wake_op, exit, "int", "int");
167
168 SDT_PROBE_DEFINE0(futex, wait, finish, normally);
169 SDT_PROBE_DEFINE0(futex, wait, finish, wakerace);
170 SDT_PROBE_DEFINE0(futex, wait, finish, aborted);
171
172 /* entry: timo */
173 /* exit: error */
174 SDT_PROBE_DEFINE1(futex, wait, sleepq_block, entry, "int");
175 SDT_PROBE_DEFINE1(futex, wait, sleepq_block, exit, "int");
176
177 /*
178 * Lock order:
179 *
180 * futex_tab.lock
181 * futex::fx_op_lock ordered by kva of struct futex
182 * -> futex::fx_sq_lock ordered by kva of sleepq lock
183 *
184 * N.B. multiple futexes can share a single sleepq lock.
185 */
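
/*
 * For example, a cross-futex operation such as FUTEX_CMP_REQUEUE
 * nests the locks in exactly that order (a sketch of the pattern
 * used by the functions below):
 *
 *	futex_op_lock2(f, f2);		// op locks, ordered by futex kva
 *	futex_sq_lock2(f, f2);		//  -> sleepq locks, ordered by lock kva
 *	... wake and/or transfer waiters ...
 *	futex_sq_unlock2(f, f2);
 *	futex_op_unlock2(f, f2);
 */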
186
187 /*
188 * union futex_key
189 *
190 * A futex is addressed either by a vmspace+va (private) or by
191 * a uvm_voaddr (shared).
192 */
193 typedef union futex_key {
194 struct {
195 struct vmspace *vmspace;
196 vaddr_t va;
197 } fk_private;
198 struct uvm_voaddr fk_shared;
199 } futex_key_t;
200
201 CTASSERT(offsetof(futex_key_t, fk_private.va) ==
202 offsetof(futex_key_t, fk_shared.offset));
203
204 /*
205 * struct futex
206 *
207 * Kernel state for a futex located at a particular address in a
208 * particular virtual address space.
209 *
210 * N.B. fx_refcnt is an unsigned long because we need to be able
211 * to operate on it atomically on all systems while at the same
212 * time rendering practically impossible the chance of it reaching
213 * its max value. In practice, we're limited by the number of LWPs
214 * that can be present on the system at any given time, and the
215 * assumption is that limit will be good enough on a 32-bit platform.
216 * See futex_wake() for why overflow needs to be avoided.
217 *
218 * XXX Since futex addresses must be 4-byte aligned, we could
219 * XXX squirrel away fx_shared and fx_on_tree bits in the "va"
220 * XXX field of the key. Worth it?
221 */
222 struct futex {
223 futex_key_t fx_key;
224 struct rb_node fx_node;
225 unsigned long fx_refcnt;
226 bool fx_shared;
227 bool fx_on_tree;
228 uint8_t fx_class;
229
230 kmutex_t fx_op_lock; /* adaptive */
231 kmutex_t * fx_sq_lock; /* &sleepq_locks[...] */
232 sleepq_t fx_sqs[2]; /* 0=reader, 1=writer */
233 unsigned int fx_nwaiters[2];
234 };
235
236 /*
237 * futex classes: Some futex operations can only be carried out on
238 * futexes that are known to be abiding by a certain protocol. These
239 * classes are assigned to a futex when created due to a wait event,
240 * and when subsequent wake or requeue operations are issued, the
241 * class is checked at futex lookup time. If the class does not match,
242 * EINVAL is the result.
243 */
244 #define FUTEX_CLASS_ANY 0 /* match any class in lookup */
245 #define FUTEX_CLASS_NORMAL 1 /* normal futex */
246 #define FUTEX_CLASS_RWLOCK 2 /* rwlock futex */
247 #define FUTEX_CLASS_PI 3 /* for FUTEX_*_PI ops */
248
249 /* sleepq definitions */
250 #define FUTEX_READERQ 0
251 #define FUTEX_WRITERQ 1
252
253 static const char futex_wmesg[] = "futex";
254
255 static void futex_unsleep(lwp_t *, bool);
256
257 static syncobj_t futex_syncobj = {
258 .sobj_flag = SOBJ_SLEEPQ_SORTED,
259 .sobj_unsleep = futex_unsleep,
260 .sobj_changepri = sleepq_changepri,
261 .sobj_lendpri = sleepq_lendpri,
262 .sobj_owner = syncobj_noowner,
263 };
264
265 /*
266 * futex_tab
267 *
268 * Global trees of futexes by vmspace/va and VM object address.
269 *
270 * XXX This obviously doesn't scale in parallel. We could use a
271 * pserialize-safe data structure, but there may be a high cost to
272 * frequent deletion since we don't cache futexes after we're done
273 * with them. We could use hashed locks. But for now, just make
274 * sure userland can't DoS the serial performance, by using a
275 * balanced binary tree for lookup.
276 *
277 * XXX We could use a per-process tree for the table indexed by
278 * virtual address to reduce contention between processes.
279 */
280 static struct {
281 kmutex_t lock;
282 struct rb_tree va;
283 struct rb_tree oa;
284 } futex_tab __cacheline_aligned;
285
286 static int
287 compare_futex_key(void *cookie, const void *n, const void *k)
288 {
289 const struct futex *fa = n;
290 const futex_key_t *fka = &fa->fx_key;
291 const futex_key_t *fkb = k;
292
293 if ((uintptr_t)fka->fk_private.vmspace <
294 (uintptr_t)fkb->fk_private.vmspace)
295 return -1;
296 if ((uintptr_t)fka->fk_private.vmspace >
297 (uintptr_t)fkb->fk_private.vmspace)
298 return +1;
299 if (fka->fk_private.va < fkb->fk_private.va)
300 return -1;
301 if (fka->fk_private.va > fkb->fk_private.va)
302 return +1;
303 return 0;
304 }
305
306 static int
307 compare_futex(void *cookie, const void *na, const void *nb)
308 {
309 const struct futex *fa = na;
310 const struct futex *fb = nb;
311
312 return compare_futex_key(cookie, fa, &fb->fx_key);
313 }
314
315 static const rb_tree_ops_t futex_rb_ops = {
316 .rbto_compare_nodes = compare_futex,
317 .rbto_compare_key = compare_futex_key,
318 .rbto_node_offset = offsetof(struct futex, fx_node),
319 };
320
321 static int
322 compare_futex_shared_key(void *cookie, const void *n, const void *k)
323 {
324 const struct futex *fa = n;
325 const futex_key_t *fka = &fa->fx_key;
326 const futex_key_t *fkb = k;
327
328 return uvm_voaddr_compare(&fka->fk_shared, &fkb->fk_shared);
329 }
330
331 static int
332 compare_futex_shared(void *cookie, const void *na, const void *nb)
333 {
334 const struct futex *fa = na;
335 const struct futex *fb = nb;
336
337 return compare_futex_shared_key(cookie, fa, &fb->fx_key);
338 }
339
340 static const rb_tree_ops_t futex_shared_rb_ops = {
341 .rbto_compare_nodes = compare_futex_shared,
342 .rbto_compare_key = compare_futex_shared_key,
343 .rbto_node_offset = offsetof(struct futex, fx_node),
344 };
345
346 /*
347 * futex_sq_lock2(f, f2)
348 *
349 * Acquire the sleepq locks for f and f2, which may be null, or
350 * which may be the same. If they are distinct, an arbitrary total
351 * order is chosen on the locks.
352 *
353 * Callers should only ever acquire multiple sleepq locks
354 * simultaneously using futex_sq_lock2.
355 */
356 static void
357 futex_sq_lock2(struct futex * const f, struct futex * const f2)
358 {
359
360 /*
361 * If both are null, do nothing; if one is null and the other
362 * is not, lock the other and be done with it.
363 */
364 if (f == NULL && f2 == NULL) {
365 return;
366 } else if (f == NULL) {
367 mutex_spin_enter(f2->fx_sq_lock);
368 return;
369 } else if (f2 == NULL) {
370 mutex_spin_enter(f->fx_sq_lock);
371 return;
372 }
373
374 kmutex_t * const m = f->fx_sq_lock;
375 kmutex_t * const m2 = f2->fx_sq_lock;
376
377 /* If both locks are the same, acquire only one. */
378 if (m == m2) {
379 mutex_spin_enter(m);
380 return;
381 }
382
383 /* Otherwise, use the ordering on the kva of the lock pointer. */
384 if ((uintptr_t)m < (uintptr_t)m2) {
385 mutex_spin_enter(m);
386 mutex_spin_enter(m2);
387 } else {
388 mutex_spin_enter(m2);
389 mutex_spin_enter(m);
390 }
391 }
392
393 /*
394 * futex_sq_unlock2(f, f2)
395 *
396 * Release the sleep queue locks for f and f2, which may be null, or
397 * which may have the same underlying lock.
398 */
399 static void
400 futex_sq_unlock2(struct futex * const f, struct futex * const f2)
401 {
402
403 /*
404 * If both are null, do nothing; if one is null and the other
405 * is not, unlock the other and be done with it.
406 */
407 if (f == NULL && f2 == NULL) {
408 return;
409 } else if (f == NULL) {
410 mutex_spin_exit(f2->fx_sq_lock);
411 return;
412 } else if (f2 == NULL) {
413 mutex_spin_exit(f->fx_sq_lock);
414 return;
415 }
416
417 kmutex_t * const m = f->fx_sq_lock;
418 kmutex_t * const m2 = f2->fx_sq_lock;
419
420 /* If both locks are the same, release only one. */
421 if (m == m2) {
422 mutex_spin_exit(m);
423 return;
424 }
425
426 /* Otherwise, use the ordering on the kva of the lock pointer. */
427 if ((uintptr_t)m < (uintptr_t)m2) {
428 mutex_spin_exit(m2);
429 mutex_spin_exit(m);
430 } else {
431 mutex_spin_exit(m);
432 mutex_spin_exit(m2);
433 }
434 }
435
436 /*
437 * futex_load(uaddr, kaddr)
438 *
439 * Perform a single atomic load to read *uaddr, and return the
440 * result in *kaddr. Return 0 on success, EFAULT if uaddr is not
441 * mapped.
442 */
443 static inline int
444 futex_load(int *uaddr, int *kaddr)
445 {
446 return ufetch_int((u_int *)uaddr, (u_int *)kaddr);
447 }
448
449 /*
450 * futex_test(uaddr, expected)
451 *
452 * True if *uaddr == expected. False if *uaddr != expected, or if
453 * uaddr is not mapped.
454 */
455 static bool
456 futex_test(int *uaddr, int expected)
457 {
458 int val;
459 int error;
460
461 error = futex_load(uaddr, &val);
462 if (error)
463 return false;
464 return val == expected;
465 }
466
467 static pool_cache_t futex_cache __read_mostly;
468
469 static int futex_ctor(void *, void *, int);
470 static void futex_dtor(void *, void *);
471
472 /*
473 * futex_sys_init()
474 *
475 * Initialize the futex subsystem.
476 */
477 void
478 futex_sys_init(void)
479 {
480
481 mutex_init(&futex_tab.lock, MUTEX_DEFAULT, IPL_NONE);
482 rb_tree_init(&futex_tab.va, &futex_rb_ops);
483 rb_tree_init(&futex_tab.oa, &futex_shared_rb_ops);
484
485 futex_cache = pool_cache_init(sizeof(struct futex),
486 coherency_unit, 0, 0, "futex", NULL, IPL_NONE, futex_ctor,
487 futex_dtor, NULL);
488 }
489
490 /*
491 * futex_sys_fini()
492 *
493 * Finalize the futex subsystem.
494 */
495 void
496 futex_sys_fini(void)
497 {
498
499 KASSERT(RB_TREE_MIN(&futex_tab.oa) == NULL);
500 KASSERT(RB_TREE_MIN(&futex_tab.va) == NULL);
501 mutex_destroy(&futex_tab.lock);
502 }
503
504 /*
505 * futex_ctor()
506 *
507 * Futex object constructor.
508 */
509 static int
510 futex_ctor(void *arg __unused, void *obj, int flags __unused)
511 {
512 extern sleepqlock_t sleepq_locks[SLEEPTAB_HASH_SIZE];
513 struct futex * const f = obj;
514
515 mutex_init(&f->fx_op_lock, MUTEX_DEFAULT, IPL_NONE);
516 f->fx_sq_lock = &sleepq_locks[SLEEPTAB_HASH(f)].lock;
517
518 sleepq_init(&f->fx_sqs[FUTEX_READERQ]);
519 sleepq_init(&f->fx_sqs[FUTEX_WRITERQ]);
520 f->fx_nwaiters[FUTEX_READERQ] = f->fx_nwaiters[FUTEX_WRITERQ] = 0;
521
522 return 0;
523 }
524
525 /*
526 * futex_dtor()
527 *
528 * Futex object destructor.
529 */
530 static void
531 futex_dtor(void *arg __unused, void *obj)
532 {
533 struct futex * const f = obj;
534
535 mutex_destroy(&f->fx_op_lock);
536 f->fx_sq_lock = NULL;
537 }
538
539 /*
540 * futex_key_init(key, vm, va, shared)
541 *
542 * Initialize a futex key for lookup, etc.
543 */
544 static int
545 futex_key_init(futex_key_t *fk, struct vmspace *vm, vaddr_t va, bool shared)
546 {
547 int error = 0;
548
549 if (__predict_false(shared)) {
550 if (!uvm_voaddr_acquire(&vm->vm_map, va, &fk->fk_shared))
551 error = EFAULT;
552 } else {
553 fk->fk_private.vmspace = vm;
554 fk->fk_private.va = va;
555 }
556
557 return error;
558 }
559
560 /*
561 * futex_key_fini(key, shared)
562 *
563 * Release a futex key.
564 */
565 static void
566 futex_key_fini(futex_key_t *fk, bool shared)
567 {
568 if (__predict_false(shared))
569 uvm_voaddr_release(&fk->fk_shared);
570 memset(fk, 0, sizeof(*fk));
571 }
572
573 /*
574 * futex_create(fk, shared)
575 *
576 * Create a futex. Initial reference count is 1, representing the
577 * caller. Returns NULL on failure. Always takes ownership of the
578 * key, either transferring it to the newly-created futex, or releasing
579 * the key if creation fails.
580 *
581 * Never sleeps for memory, but may sleep to acquire a lock.
582 */
583 static struct futex *
584 futex_create(futex_key_t *fk, bool shared, uint8_t class)
585 {
586 struct futex *f;
587
588 f = pool_cache_get(futex_cache, PR_NOWAIT);
589 if (f == NULL) {
590 futex_key_fini(fk, shared);
591 return NULL;
592 }
593 f->fx_key = *fk;
594 f->fx_refcnt = 1;
595 f->fx_shared = shared;
596 f->fx_on_tree = false;
597 f->fx_class = class;
598
599 return f;
600 }
601
602 /*
603 * futex_destroy(f)
604 *
605 * Destroy a futex created with futex_create. Reference count
606 * must be zero.
607 *
608 * May sleep.
609 */
610 static void
611 futex_destroy(struct futex *f)
612 {
613
614 ASSERT_SLEEPABLE();
615
616 KASSERT(atomic_load_relaxed(&f->fx_refcnt) == 0);
617 KASSERT(!f->fx_on_tree);
618 KASSERT(LIST_EMPTY(&f->fx_sqs[FUTEX_READERQ]));
619 KASSERT(LIST_EMPTY(&f->fx_sqs[FUTEX_WRITERQ]));
620 KASSERT(f->fx_nwaiters[FUTEX_READERQ] == 0);
621 KASSERT(f->fx_nwaiters[FUTEX_WRITERQ] == 0);
622
623 futex_key_fini(&f->fx_key, f->fx_shared);
624
625 pool_cache_put(futex_cache, f);
626 }
627
628 /*
629 * futex_hold_count(f, n)
630 *
631 * Attempt to acquire a reference to f. Return 0 on success,
632 * ENFILE on too many references.
633 *
634 * Never sleeps.
635 */
636 static int
637 futex_hold_count(struct futex *f, unsigned long const count)
638 {
639 unsigned long refcnt;
640
641 do {
642 refcnt = atomic_load_relaxed(&f->fx_refcnt);
643 if (ULONG_MAX - refcnt < count)
644 return ENFILE;
645 } while (atomic_cas_ulong(&f->fx_refcnt, refcnt,
646 refcnt + count) != refcnt);
647
648 return 0;
649 }
650 #define futex_hold(f) futex_hold_count(f, 1)
651
652 /*
653 * futex_rele_count(f, n)
654 *
655 * Release a reference to f acquired with futex_create or
656 * futex_hold.
657 *
658 * May sleep to free f.
659 */
660 static void
661 futex_rele_count(struct futex *f, unsigned long const count)
662 {
663 unsigned long refcnt;
664
665 ASSERT_SLEEPABLE();
666
667 do {
668 refcnt = atomic_load_relaxed(&f->fx_refcnt);
669 KASSERT(refcnt >= count);
670 if (refcnt - count == 0)
671 goto trylast;
672 } while (atomic_cas_ulong(&f->fx_refcnt, refcnt,
673 refcnt - count) != refcnt);
674 return;
675
676 trylast:
677 KASSERT(count <= LONG_MAX);
678 mutex_enter(&futex_tab.lock);
679 if (atomic_add_long_nv(&f->fx_refcnt, -(long)count) == 0) {
680 if (f->fx_on_tree) {
681 if (__predict_false(f->fx_shared))
682 rb_tree_remove_node(&futex_tab.oa, f);
683 else
684 rb_tree_remove_node(&futex_tab.va, f);
685 f->fx_on_tree = false;
686 }
687 } else {
688 /* References remain -- don't destroy it. */
689 f = NULL;
690 }
691 mutex_exit(&futex_tab.lock);
692 if (f != NULL)
693 futex_destroy(f);
694 }
695 #define futex_rele(f) futex_rele_count(f, 1)
696
697 /*
698 * futex_rele_count_not_last(f, n)
699 *
700 * Release a reference to f acquired with futex_create or
701 * futex_hold.
702 *
703 * This version asserts that we are not dropping the last
704 * reference to f.
705 */
706 static void
707 futex_rele_count_not_last(struct futex *f, unsigned long const count)
708 {
709 unsigned long refcnt;
710
711 do {
712 refcnt = atomic_load_relaxed(&f->fx_refcnt);
713 KASSERT(refcnt >= count);
714 } while (atomic_cas_ulong(&f->fx_refcnt, refcnt,
715 refcnt - count) != refcnt);
716 }
717
718 /*
719 * futex_lookup_by_key(key, shared, class, &f)
720 *
721 * Try to find an existing futex va reference for the specified key.
722 * On success, return 0, set *fp to the futex found or to NULL if not
723 * found, and increment its reference count if found.
724 *
725 * Return ENFILE if reference count too high.
726 *
727 * Internal lookup routine shared by futex_lookup() and
728 * futex_lookup_create().
729 */
730 static int
731 futex_lookup_by_key(futex_key_t *fk, bool shared, uint8_t class,
732 struct futex **fp)
733 {
734 struct futex *f;
735 int error = 0;
736
737 mutex_enter(&futex_tab.lock);
738 if (__predict_false(shared)) {
739 f = rb_tree_find_node(&futex_tab.oa, fk);
740 } else {
741 f = rb_tree_find_node(&futex_tab.va, fk);
742 }
743 if (f) {
744 if (__predict_true(f->fx_class == class ||
745 class == FUTEX_CLASS_ANY))
746 error = futex_hold(f);
747 else
748 error = EINVAL;
749 if (error)
750 f = NULL;
751 }
752 *fp = f;
753 mutex_exit(&futex_tab.lock);
754
755 return error;
756 }
757
758 /*
759 * futex_insert(f, fp)
760 *
761 * Try to insert the futex f into the tree by va. If there
762 * already is a futex for its va, acquire a reference to it, and
763 * store it in *fp; otherwise store f in *fp.
764 *
765 * Return 0 on success, ENFILE if there already is a futex but its
766 * reference count is too high.
767 */
768 static int
769 futex_insert(struct futex *f, struct futex **fp)
770 {
771 struct futex *f0;
772 int error;
773
774 KASSERT(atomic_load_relaxed(&f->fx_refcnt) != 0);
775 KASSERT(!f->fx_on_tree);
776
777 mutex_enter(&futex_tab.lock);
778 if (__predict_false(f->fx_shared))
779 f0 = rb_tree_insert_node(&futex_tab.oa, f);
780 else
781 f0 = rb_tree_insert_node(&futex_tab.va, f);
782 if (f0 == f) {
783 f->fx_on_tree = true;
784 error = 0;
785 } else {
786 KASSERT(atomic_load_relaxed(&f0->fx_refcnt) != 0);
787 KASSERT(f0->fx_on_tree);
788 error = futex_hold(f0);
789 if (error)
790 goto out;
791 }
792 *fp = f0;
793 out: mutex_exit(&futex_tab.lock);
794
795 return error;
796 }
797
798 /*
799 * futex_lookup(uaddr, shared, class, &f)
800 *
801 * Find a futex at the userland pointer uaddr in the current
802 * process's VM space. On success, return the futex in f and
803 * increment its reference count.
804 *
805 * Caller must call futex_rele when done.
806 */
807 static int
808 futex_lookup(int *uaddr, bool shared, uint8_t class, struct futex **fp)
809 {
810 futex_key_t fk;
811 struct vmspace *vm = curproc->p_vmspace;
812 vaddr_t va = (vaddr_t)uaddr;
813 int error;
814
815 /*
816 * Reject unaligned user pointers so we don't cross page
817 * boundaries and so atomics will work.
818 */
819 if (__predict_false((va & 3) != 0))
820 return EINVAL;
821
822 /* Look it up. */
823 error = futex_key_init(&fk, vm, va, shared);
824 if (error)
825 return error;
826
827 error = futex_lookup_by_key(&fk, shared, class, fp);
828 futex_key_fini(&fk, shared);
829 if (error)
830 return error;
831
832 KASSERT(*fp == NULL || (*fp)->fx_shared == shared);
833 KASSERT(*fp == NULL || class == FUTEX_CLASS_ANY || (*fp)->fx_class == class);
834 KASSERT(*fp == NULL || atomic_load_relaxed(&(*fp)->fx_refcnt) != 0);
835
836 /*
837 * Success! (Caller must still check whether we found
838 * anything, but nothing went _wrong_ like trying to use
839 * unmapped memory.)
840 */
841 KASSERT(error == 0);
842
843 return error;
844 }
845
846 /*
847 * futex_lookup_create(uaddr, shared, class, &f)
848 *
849 * Find or create a futex at the userland pointer uaddr in the
850 * current process's VM space. On success, return the futex in f
851 * and increment its reference count.
852 *
853 * Caller must call futex_rele when done.
854 */
855 static int
856 futex_lookup_create(int *uaddr, bool shared, uint8_t class, struct futex **fp)
857 {
858 futex_key_t fk;
859 struct vmspace *vm = curproc->p_vmspace;
860 struct futex *f = NULL;
861 vaddr_t va = (vaddr_t)uaddr;
862 int error;
863
864 /*
865 * Reject unaligned user pointers so we don't cross page
866 * boundaries and so atomics will work.
867 */
868 if (__predict_false((va & 3) != 0))
869 return EINVAL;
870
871 if (__predict_false(class == FUTEX_CLASS_ANY))
872 return EINVAL;
873
874 error = futex_key_init(&fk, vm, va, shared);
875 if (error)
876 return error;
877
878 /*
879 * Optimistically assume there already is one, and try to find
880 * it.
881 */
882 error = futex_lookup_by_key(&fk, shared, class, fp);
883 if (error || *fp != NULL) {
884 /*
885 * We either found one, or there was an error.
886 * In either case, we are done with the key.
887 */
888 futex_key_fini(&fk, shared);
889 goto out;
890 }
891
892 /*
893 * Create a futex record. This transfers ownership of the key
894 * in all cases.
895 */
896 f = futex_create(&fk, shared, class);
897 if (f == NULL) {
898 error = ENOMEM;
899 goto out;
900 }
901
902 /*
903 * Insert our new futex, or use existing if someone else beat
904 * us to it.
905 */
906 error = futex_insert(f, fp);
907 if (error)
908 goto out;
909 if (*fp == f)
910 f = NULL; /* don't release on exit */
911
912 /* Success! */
913 KASSERT(error == 0);
914
915 out: if (f != NULL)
916 futex_rele(f);
917 KASSERT(error || *fp != NULL);
918 KASSERT(error || atomic_load_relaxed(&(*fp)->fx_refcnt) != 0);
919 return error;
920 }
921
922 /*
923 * futex_unsleep:
924 *
925 * Remove an LWP from the futex and sleep queue. This is called when
926 * the LWP has not been awoken normally but instead interrupted: for
927 * example, when a signal is received. Must be called with the LWP
928 * locked. Will unlock if "unlock" is true.
929 */
930 static void
931 futex_unsleep(lwp_t *l, bool unlock)
932 {
933 struct futex *f __diagused;
934
935 f = (struct futex *)(uintptr_t)l->l_wchan;
936
937 KASSERT(mutex_owned(f->fx_sq_lock));
938
939 /* WRITERQ is more likely */
940 if (__predict_true(l->l_sleepq == &f->fx_sqs[FUTEX_WRITERQ])) {
941 KASSERT(f->fx_nwaiters[FUTEX_WRITERQ] > 0);
942 f->fx_nwaiters[FUTEX_WRITERQ]--;
943 } else {
944 KASSERT(l->l_sleepq == &f->fx_sqs[FUTEX_READERQ]);
945 KASSERT(f->fx_nwaiters[FUTEX_READERQ] > 0);
946 f->fx_nwaiters[FUTEX_READERQ]--;
947 }
948
949 sleepq_unsleep(l, unlock);
950 }
951
952 /*
953 * futex_wait(f, q, deadline, clkid, bitset)
954 *
955 * Wait until deadline on the clock clkid, or forever if deadline
956 * is NULL, for a futex wakeup. Return 0 on explicit wakeup,
957 * ETIMEDOUT on timeout, EINTR on signal.
958 */
959 static int
960 futex_wait(struct futex *f, int q, const struct timespec *deadline,
961 clockid_t clkid, int bitset)
962 {
963
964 /*
965 * Some things to note about how this function works:
966 *
967 * ==> When we enter futex_wait(), the futex's op lock is
968 * held, preventing us from missing wakes. Once we are on
969 * the futex's sleepq, it is safe to release the op lock.
970 *
971 * ==> We have a single early escape to avoid calling
972 * sleepq_block(): our deadline already having passed.
973 * If this is a no-timeout wait, or if the deadline has
974 * not already passed, we are guaranteed to call sleepq_block().
975 *
976 * ==> sleepq_block() contains all of the necessary logic
977 * for handling signals; we do not need to check for them
978 * ourselves.
979 *
980 * ==> Once we have blocked, other futex operations will
981 * be able to wake us or requeue us. Until we have blocked,
982 * those other futex operations will not be able to acquire
983 * the sleepq locks in order to perform those operations,
984 * even if we have dropped the op lock. Thus, we will not
985 * miss wakes. This fundamental invariant is relied upon by
986 * every other synchronization object in the kernel.
987 *
988 * ==> sleepq_block() returns for one of three reasons:
989 *
990 * -- We were awakened.
991 * -- We were signaled.
992 * -- We timed out.
993 *
994 * ==> Once sleepq_block() returns, we will not call
995 * sleepq_block() again.
996 *
997 * ==> If sleepq_block() returns due to being signaled,
998 * we must check to see if we were also awakened. This
999 * is to handle the following sequence:
1000 *
1001 * -- Futex wake takes us off sleepq, places us on runq.
1002 * -- We are signaled while sitting on the runq.
1003 * -- We run, and sleepq_block() notices the signal and
1004 * returns EINTR/ERESTART.
1005 *
1006 * In this situation, we want to indicate a successful wake
1007 * to the caller, because that wake has been reported to the
1008 * thread that issued it.
1009 *
1010 * Note that it is NOT possible for a wake to be issued after
1011 * a signal; the LWP will already have been taken off the sleepq
1012 * by the signal, so the wake will never happen. Note that for
1013 * this reason, we must *never* return ERESTART, because it could
1014 * result in us waiting again with no one to awaken us.
1015 */
1016
1017 struct lwp * const l = curlwp;
1018 int timo;
1019 int error;
1020
1021 /*
1022 * Compute our timeout before taking any locks.
1023 */
1024 if (deadline) {
1025 struct timespec ts;
1026
1027 /* Check our watch. */
1028 error = clock_gettime1(clkid, &ts);
1029 if (error) {
1030 mutex_exit(&f->fx_op_lock);
1031 return error;
1032 }
1033
1034 /*
1035 * If we're past the deadline, ETIMEDOUT.
1036 */
1037 if (timespeccmp(deadline, &ts, <=)) {
1038 mutex_exit(&f->fx_op_lock);
1039 return ETIMEDOUT;
1040 } else {
1041 /* Count how much time is left. */
1042 timespecsub(deadline, &ts, &ts);
1043 timo = tstohz(&ts);
1044 if (timo == 0) {
1045 /*
1046 * tstohz() already rounds up, and
1047 * a return value of 0 ticks means
1048 * "expired now or in the past".
1049 */
1050 mutex_exit(&f->fx_op_lock);
1051 return ETIMEDOUT;
1052 }
1053 }
1054 } else {
1055 timo = 0;
1056 }
1057
1058 /*
1059 * Acquire in sleepq -> lwp order. While we're at it, ensure
1060 * that there's room for us to block.
1061 */
1062 mutex_spin_enter(f->fx_sq_lock);
1063 if (__predict_false(q == FUTEX_READERQ &&
1064 f->fx_nwaiters[q] == FUTEX_TID_MASK)) {
1065 mutex_spin_exit(f->fx_sq_lock);
1066 mutex_exit(&f->fx_op_lock);
1067 return ENFILE;
1068 }
1069 lwp_lock(l);
1070
1071 /*
1072 * No need for locks here; until we're on a futex's sleepq, these
1073 * values are private to the LWP, and the locks needed to get onto
1074 * the sleepq provide the memory barriers needed to ensure visibility.
1075 */
1076 l->l_futex = f;
1077 l->l_futex_wakesel = bitset;
1078
1079 /*
1080 * This is an inlined bit of sleepq_enter();
1081 * we already hold the lwp lock.
1082 */
1083 lwp_unlock_to(l, f->fx_sq_lock);
1084 KERNEL_UNLOCK_ALL(NULL, &l->l_biglocks);
1085 KASSERT(lwp_locked(l, f->fx_sq_lock));
1086
1087 sleepq_enqueue(&f->fx_sqs[q], f, futex_wmesg, &futex_syncobj, true);
1088 f->fx_nwaiters[q]++;
1089
1090 /* We can now safely release the op lock. */
1091 mutex_exit(&f->fx_op_lock);
1092
1093 SDT_PROBE1(futex, wait, sleepq_block, entry, timo);
1094 error = sleepq_block(timo, true);
1095 SDT_PROBE1(futex, wait, sleepq_block, exit, error);
1096
1097 /*
1098 * f is no longer valid; we may have been moved to another
1099 * futex sleepq while we slept.
1100 */
1101
1102 /*
1103 * If something went wrong, we may still have a futex reference.
1104 * Check for that here and drop it if so. We can do this without
1105 * taking any locks because l->l_futex is private to the LWP when
1106 * not on any futex's sleepq, and we are not on any sleepq because
1107 * we are running.
1108 *
1109 * Convert EWOULDBLOCK to ETIMEDOUT in case sleepq_block() returned
1110 * EWOULDBLOCK, and ensure the only other error we return is EINTR.
1111 */
1112 if (error) {
1113 f = l->l_futex;
1114 if (f != NULL) {
1115 SDT_PROBE0(futex, wait, finish, aborted);
1116 l->l_futex = NULL;
1117 futex_rele(f);
1118 } else {
1119 /* Raced with wakeup; report success. */
1120 SDT_PROBE0(futex, wait, finish, wakerace);
1121 error = 0;
1122 }
1123 if (error == EWOULDBLOCK)
1124 error = ETIMEDOUT;
1125 else if (error != ETIMEDOUT)
1126 error = EINTR;
1127 } else {
1128 f = l->l_futex;
1129 if (__predict_false(f != NULL)) {
1130 /*
1131 * There are certain situations that can cause
1132 * sleepq_block() to return 0 even if we were
1133 * signalled (by e.g. SIGKILL). In this case,
1134 * we will need to release our futex hold and
1135 * return EINTR (we're probably about to die
1136 * anyway).
1137 */
1138 SDT_PROBE0(futex, wait, finish, aborted);
1139 l->l_futex = NULL;
1140 futex_rele(f);
1141 error = EINTR;
1142 } else {
1143 SDT_PROBE0(futex, wait, finish, normally);
1144 }
1145 }
1146
1147 return error;
1148 }
1149
1150 /*
1151 * futex_wake(f, q, nwake, f2, q2, nrequeue, bitset)
1152 *
1153 * Wake up to nwake waiters on f matching bitset; then, if f2 is
1154 * provided, move up to nrequeue remaining waiters on f matching
1155 * bitset to f2. Return the number of waiters woken and requeued.
1156 * Caller must hold the locks of f and f2, if provided.
1157 */
1158 static unsigned
1159 futex_wake(struct futex *f, int q, unsigned int const nwake,
1160 struct futex *f2, int q2, unsigned int const nrequeue,
1161 int bitset)
1162 {
1163 struct lwp *l, *l_next;
1164 unsigned nwoken = 0;
1165 unsigned nrequeued = 0;
1166
1167 KASSERT(mutex_owned(&f->fx_op_lock));
1168 KASSERT(f2 == NULL || mutex_owned(&f2->fx_op_lock));
1169
1170 futex_sq_lock2(f, f2);
1171
1172 /* Wake up to nwake waiters, and count the number woken. */
1173 LIST_FOREACH_SAFE(l, &f->fx_sqs[q], l_sleepchain, l_next) {
1174 if (nwoken == nwake)
1175 break;
1176
1177 KASSERT(l->l_syncobj == &futex_syncobj);
1178 KASSERT(l->l_wchan == (wchan_t)f);
1179 KASSERT(l->l_futex == f);
1180 KASSERT(l->l_sleepq == &f->fx_sqs[q]);
1181 KASSERT(l->l_mutex == f->fx_sq_lock);
1182
1183 if ((l->l_futex_wakesel & bitset) == 0)
1184 continue;
1185
1186 l->l_futex_wakesel = 0;
1187 l->l_futex = NULL;
1188 sleepq_remove(&f->fx_sqs[q], l);
1189 f->fx_nwaiters[q]--;
1190 nwoken++;
1191 /* hold counts adjusted in bulk below */
1192 }
1193
1194 if (nrequeue) {
1195 KASSERT(f2 != NULL);
1196 /* Move up to nrequeue waiters from f's queue to f2's queue. */
1197 LIST_FOREACH_SAFE(l, &f->fx_sqs[q], l_sleepchain, l_next) {
1198 if (nrequeued == nrequeue)
1199 break;
1200
1201 KASSERT(l->l_syncobj == &futex_syncobj);
1202 KASSERT(l->l_wchan == (wchan_t)f);
1203 KASSERT(l->l_futex == f);
1204 KASSERT(l->l_sleepq == &f->fx_sqs[q]);
1205 KASSERT(l->l_mutex == f->fx_sq_lock);
1206
1207 if ((l->l_futex_wakesel & bitset) == 0)
1208 continue;
1209
1210 l->l_futex = f2;
1211 sleepq_transfer(l, &f->fx_sqs[q],
1212 &f2->fx_sqs[q2], f2, futex_wmesg,
1213 &futex_syncobj, f2->fx_sq_lock, true);
1214 f->fx_nwaiters[q]--;
1215 f2->fx_nwaiters[q2]++;
1216 nrequeued++;
1217 /* hold counts adjusted in bulk below */
1218 }
1219 if (nrequeued) {
1220 /* XXX futex_hold() could theoretically fail here. */
1221 int hold_error __diagused =
1222 futex_hold_count(f2, nrequeued);
1223 KASSERT(hold_error == 0);
1224 }
1225 }
1226
1227 /*
1228 * Adjust the futex reference counts for the wakes and
1229 * requeues.
1230 */
1231 KASSERT(nwoken + nrequeued <= LONG_MAX);
1232 futex_rele_count_not_last(f, nwoken + nrequeued);
1233
1234 futex_sq_unlock2(f, f2);
1235
1236 /* Return the number of waiters woken and requeued. */
1237 return nwoken + nrequeued;
1238 }
1239
1240 /*
1241 * futex_op_lock(f)
1242 *
1243 * Acquire the op lock of f. Pair with futex_op_unlock. Do
1244 * not use if caller needs to acquire two locks; use
1245 * futex_op_lock2 instead.
1246 */
1247 static void
1248 futex_op_lock(struct futex *f)
1249 {
1250 mutex_enter(&f->fx_op_lock);
1251 }
1252
1253 /*
1254 * futex_op_unlock(f)
1255 *
1256 * Release the op lock of f.
1257 */
1258 static void
1259 futex_op_unlock(struct futex *f)
1260 {
1261 mutex_exit(&f->fx_op_lock);
1262 }
1263
1264 /*
1265 * futex_op_lock2(f, f2)
1266 *
1267 * Acquire the op locks of both f and f2, which may be null, or
1268 * which may be the same. If they are distinct, an arbitrary total
1269 * order is chosen on the locks.
1270 *
1271 * Callers should only ever acquire multiple op locks
1272 * simultaneously using futex_op_lock2.
1273 */
1274 static void
1275 futex_op_lock2(struct futex *f, struct futex *f2)
1276 {
1277
1278 /*
1279 * If both are null, do nothing; if one is null and the other
1280 * is not, lock the other and be done with it.
1281 */
1282 if (f == NULL && f2 == NULL) {
1283 return;
1284 } else if (f == NULL) {
1285 mutex_enter(&f2->fx_op_lock);
1286 return;
1287 } else if (f2 == NULL) {
1288 mutex_enter(&f->fx_op_lock);
1289 return;
1290 }
1291
1292 /* If both futexes are the same, acquire only one. */
1293 if (f == f2) {
1294 mutex_enter(&f->fx_op_lock);
1295 return;
1296 }
1297
1298 /* Otherwise, use the ordering on the kva of the futex pointer. */
1299 if ((uintptr_t)f < (uintptr_t)f2) {
1300 mutex_enter(&f->fx_op_lock);
1301 mutex_enter(&f2->fx_op_lock);
1302 } else {
1303 mutex_enter(&f2->fx_op_lock);
1304 mutex_enter(&f->fx_op_lock);
1305 }
1306 }
1307
1308 /*
1309 * futex_op_unlock2(f, f2)
1310 *
1311 * Release the op locks of both f and f2, which may be null, or
1312 * which may be the same.
1313 */
1314 static void
1315 futex_op_unlock2(struct futex *f, struct futex *f2)
1316 {
1317
1318 /*
1319 * If both are null, do nothing; if one is null and the other
1320 * is not, unlock the other and be done with it.
1321 */
1322 if (f == NULL && f2 == NULL) {
1323 return;
1324 } else if (f == NULL) {
1325 mutex_exit(&f2->fx_op_lock);
1326 return;
1327 } else if (f2 == NULL) {
1328 mutex_exit(&f->fx_op_lock);
1329 return;
1330 }
1331
1332 /* If both futexes are the same, release only one. */
1333 if (f == f2) {
1334 mutex_exit(&f->fx_op_lock);
1335 return;
1336 }
1337
1338 /* Otherwise, use the ordering on the kva of the futex pointer. */
1339 if ((uintptr_t)f < (uintptr_t)f2) {
1340 mutex_exit(&f2->fx_op_lock);
1341 mutex_exit(&f->fx_op_lock);
1342 } else {
1343 mutex_exit(&f->fx_op_lock);
1344 mutex_exit(&f2->fx_op_lock);
1345 }
1346 }
1347
1348 /*
1349 * futex_func_wait(uaddr, val, val3, timeout, clkid, clkflags, retval)
1350 *
1351 * Implement futex(FUTEX_WAIT) and futex(FUTEX_WAIT_BITSET).
1352 */
1353 static int
1354 futex_func_wait(bool shared, int *uaddr, int val, int val3,
1355 const struct timespec *timeout, clockid_t clkid, int clkflags,
1356 register_t *retval)
1357 {
1358 struct futex *f;
1359 struct timespec ts;
1360 const struct timespec *deadline;
1361 int error;
1362
1363 /*
1364 * If there's nothing to wait for, and nobody will ever wake
1365 * us, then don't set anything up to wait -- just stop here.
1366 */
1367 if (val3 == 0)
1368 return EINVAL;
1369
1370 /* Optimistically test before anything else. */
1371 if (!futex_test(uaddr, val))
1372 return EAGAIN;
1373
1374 /* Determine a deadline on the specified clock. */
1375 if (timeout == NULL) {
1376 deadline = timeout;
1377 } else if ((clkflags & TIMER_ABSTIME) == TIMER_ABSTIME) {
1378 /* Sanity-check the deadline. */
1379 if (timeout->tv_sec < 0 ||
1380 timeout->tv_nsec < 0 ||
1381 timeout->tv_nsec >= 1000000000L) {
1382 return EINVAL;
1383 }
1384 deadline = timeout;
1385 } else {
1386 struct timespec interval = *timeout;
1387
1388 error = itimespecfix(&interval);
1389 if (error)
1390 return error;
1391 error = clock_gettime1(clkid, &ts);
1392 if (error)
1393 return error;
1394 timespecadd(&ts, &interval, &ts);
1395 deadline = &ts;
1396 }
1397
1398 /* Get the futex, creating it if necessary. */
1399 error = futex_lookup_create(uaddr, shared, FUTEX_CLASS_NORMAL, &f);
1400 if (error)
1401 return error;
1402 KASSERT(f);
1403
1404 /*
1405 * Under the op lock, check the value again: if it has
1406 * already changed, EAGAIN; otherwise enqueue the waiter.
1407 * Since FUTEX_WAKE will use the same lock and be done after
1408 * modifying the value, the order in which we check and enqueue
1409 * is immaterial.
1410 */
1411 futex_op_lock(f);
1412 if (!futex_test(uaddr, val)) {
1413 futex_op_unlock(f);
1414 error = EAGAIN;
1415 goto out;
1416 }
1417
1418 /*
1419 * Now wait. futex_wait() will drop our op lock once we
1420 * are entered into the sleep queue, thus ensuring atomicity
1421 * of wakes with respect to waits.
1422 */
1423 error = futex_wait(f, FUTEX_WRITERQ, deadline, clkid, val3);
1424
1425 /*
1426 * We cannot drop our reference to the futex here, because
1427 * we might be enqueued on a different one when we are awakened.
1428 * The references will be managed on our behalf in the requeue,
1429 * wake, and error cases.
1430 */
1431 f = NULL;
1432
1433 if (__predict_true(error == 0)) {
1434 /* Return 0 on success, error on failure. */
1435 *retval = 0;
1436 }
1437
1438 out: if (f != NULL)
1439 futex_rele(f);
1440 return error;
1441 }
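
/*
 * As a hedged illustration of how userland reaches this path (the
 * "lock" and "v" names follow the example at the top of this file
 * and are not part of the kernel interface):
 *
 *	// FUTEX_WAIT takes a relative timeout (TIMER_RELTIME above);
 *	// here, wait at most 100ms for the word to leave the value v | 2.
 *	struct timespec ts = { .tv_sec = 0, .tv_nsec = 100000000 };
 *	int error = futex(FUTEX_WAIT, &lock, v | 2, &ts, NULL, 0);
 *	// 0         - woken (or lost a wake/signal race); retry the lock.
 *	// EAGAIN    - the word no longer equals v | 2; retry immediately.
 *	// ETIMEDOUT - the 100ms expired.
 *	// EINTR     - interrupted by a signal.
 */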
1442
1443 /*
1444 * futex_func_wake(uaddr, val, val3, retval)
1445 *
1446 * Implement futex(FUTEX_WAKE) and futex(FUTEX_WAKE_BITSET).
1447 */
1448 static int
1449 futex_func_wake(bool shared, int *uaddr, int val, int val3, register_t *retval)
1450 {
1451 struct futex *f;
1452 unsigned int nwoken = 0;
1453 int error = 0;
1454
1455 /* Reject negative number of wakeups. */
1456 if (val < 0) {
1457 error = EINVAL;
1458 goto out;
1459 }
1460
1461 /* Look up the futex, if any. */
1462 error = futex_lookup(uaddr, shared, FUTEX_CLASS_NORMAL, &f);
1463 if (error)
1464 goto out;
1465
1466 /* If there's no futex, there are no waiters to wake. */
1467 if (f == NULL)
1468 goto out;
1469
1470 /*
1471 * Under f's op lock, wake the waiters and remember the
1472 * number woken.
1473 */
1474 futex_op_lock(f);
1475 nwoken = futex_wake(f, FUTEX_WRITERQ, val,
1476 NULL, FUTEX_WRITERQ, 0, val3);
1477 futex_op_unlock(f);
1478
1479 /* Release the futex. */
1480 futex_rele(f);
1481
1482 out:
1483 /* Return the number of waiters woken. */
1484 *retval = nwoken;
1485
1486 /* Success! */
1487 return error;
1488 }
1489
1490 /*
1491 * futex_func_requeue(op, uaddr, val, uaddr2, val2, val3, retval)
1492 *
1493 * Implement futex(FUTEX_REQUEUE) and futex(FUTEX_CMP_REQUEUE).
1494 */
1495 static int
1496 futex_func_requeue(bool shared, int op, int *uaddr, int val, int *uaddr2,
1497 int val2, int val3, register_t *retval)
1498 {
1499 struct futex *f = NULL, *f2 = NULL;
1500 unsigned nwoken = 0; /* default to zero woken on early return */
1501 int error;
1502
1503 /* Reject negative number of wakeups or requeues. */
1504 if (val < 0 || val2 < 0) {
1505 error = EINVAL;
1506 goto out;
1507 }
1508
1509 /* Look up the source futex, if any. */
1510 error = futex_lookup(uaddr, shared, FUTEX_CLASS_NORMAL, &f);
1511 if (error)
1512 goto out;
1513
1514 /* If there is none, nothing to do. */
1515 if (f == NULL)
1516 goto out;
1517
1518 /*
1519 * We may need to create the destination futex because it's
1520 * entirely possible it does not currently have any waiters.
1521 */
1522 error = futex_lookup_create(uaddr2, shared, FUTEX_CLASS_NORMAL, &f2);
1523 if (error)
1524 goto out;
1525
1526 /*
1527 * Under the futexes' op locks, check the value; if
1528 * unchanged from val3, wake the waiters.
1529 */
1530 futex_op_lock2(f, f2);
1531 if (op == FUTEX_CMP_REQUEUE && !futex_test(uaddr, val3)) {
1532 error = EAGAIN;
1533 } else {
1534 error = 0;
1535 nwoken = futex_wake(f, FUTEX_WRITERQ, val,
1536 f2, FUTEX_WRITERQ, val2,
1537 FUTEX_BITSET_MATCH_ANY);
1538 }
1539 futex_op_unlock2(f, f2);
1540
1541 out:
1542 /* Return the number of waiters woken. */
1543 *retval = nwoken;
1544
1545 /* Release the futexes if we got them. */
1546 if (f2)
1547 futex_rele(f2);
1548 if (f)
1549 futex_rele(f);
1550 return error;
1551 }
1552
1553 /*
1554 * futex_validate_op_cmp(val3)
1555 *
1556 * Validate an op/cmp argument for FUTEX_WAKE_OP.
1557 */
1558 static int
1559 futex_validate_op_cmp(int val3)
1560 {
1561 int op = __SHIFTOUT(val3, FUTEX_OP_OP_MASK);
1562 int cmp = __SHIFTOUT(val3, FUTEX_OP_CMP_MASK);
1563
1564 if (op & FUTEX_OP_OPARG_SHIFT) {
1565 int oparg = __SHIFTOUT(val3, FUTEX_OP_OPARG_MASK);
1566 if (oparg < 0)
1567 return EINVAL;
1568 if (oparg >= 32)
1569 return EINVAL;
1570 op &= ~FUTEX_OP_OPARG_SHIFT;
1571 }
1572
1573 switch (op) {
1574 case FUTEX_OP_SET:
1575 case FUTEX_OP_ADD:
1576 case FUTEX_OP_OR:
1577 case FUTEX_OP_ANDN:
1578 case FUTEX_OP_XOR:
1579 break;
1580 default:
1581 return EINVAL;
1582 }
1583
1584 switch (cmp) {
1585 case FUTEX_OP_CMP_EQ:
1586 case FUTEX_OP_CMP_NE:
1587 case FUTEX_OP_CMP_LT:
1588 case FUTEX_OP_CMP_LE:
1589 case FUTEX_OP_CMP_GT:
1590 case FUTEX_OP_CMP_GE:
1591 break;
1592 default:
1593 return EINVAL;
1594 }
1595
1596 return 0;
1597 }
1598
1599 /*
1600 * futex_compute_op(oldval, val3)
1601 *
1602 * Apply a FUTEX_WAKE_OP operation to oldval.
1603 */
1604 static int
1605 futex_compute_op(int oldval, int val3)
1606 {
1607 int op = __SHIFTOUT(val3, FUTEX_OP_OP_MASK);
1608 int oparg = __SHIFTOUT(val3, FUTEX_OP_OPARG_MASK);
1609
1610 if (op & FUTEX_OP_OPARG_SHIFT) {
1611 KASSERT(oparg >= 0);
1612 KASSERT(oparg < 32);
1613 oparg = 1u << oparg;
1614 op &= ~FUTEX_OP_OPARG_SHIFT;
1615 }
1616
1617 switch (op) {
1618 case FUTEX_OP_SET:
1619 return oparg;
1620
1621 case FUTEX_OP_ADD:
1622 /*
1623 * Avoid signed arithmetic overflow by doing
1624 * arithmetic unsigned and converting back to signed
1625 * at the end.
1626 */
1627 return (int)((unsigned)oldval + (unsigned)oparg);
1628
1629 case FUTEX_OP_OR:
1630 return oldval | oparg;
1631
1632 case FUTEX_OP_ANDN:
1633 return oldval & ~oparg;
1634
1635 case FUTEX_OP_XOR:
1636 return oldval ^ oparg;
1637
1638 default:
1639 panic("invalid futex op");
1640 }
1641 }
1642
1643 /*
1644 * futex_compute_cmp(oldval, val3)
1645 *
1646 * Apply a FUTEX_WAKE_OP comparison to oldval.
1647 */
1648 static bool
1649 futex_compute_cmp(int oldval, int val3)
1650 {
1651 int cmp = __SHIFTOUT(val3, FUTEX_OP_CMP_MASK);
1652 int cmparg = __SHIFTOUT(val3, FUTEX_OP_CMPARG_MASK);
1653
1654 switch (cmp) {
1655 case FUTEX_OP_CMP_EQ:
1656 return (oldval == cmparg);
1657
1658 case FUTEX_OP_CMP_NE:
1659 return (oldval != cmparg);
1660
1661 case FUTEX_OP_CMP_LT:
1662 return (oldval < cmparg);
1663
1664 case FUTEX_OP_CMP_LE:
1665 return (oldval <= cmparg);
1666
1667 case FUTEX_OP_CMP_GT:
1668 return (oldval > cmparg);
1669
1670 case FUTEX_OP_CMP_GE:
1671 return (oldval >= cmparg);
1672
1673 default:
1674 panic("invalid futex cmp operation");
1675 }
1676 }
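
/*
 * A worked example of the val3 encoding consumed by the two helpers
 * above (a sketch; userland typically builds this word with its own
 * FUTEX_OP()-style constructor macro):
 *
 *	val3 = __SHIFTIN(FUTEX_OP_ADD, FUTEX_OP_OP_MASK) |
 *	       __SHIFTIN(1, FUTEX_OP_OPARG_MASK) |
 *	       __SHIFTIN(FUTEX_OP_CMP_GT, FUTEX_OP_CMP_MASK) |
 *	       __SHIFTIN(0, FUTEX_OP_CMPARG_MASK);
 *
 * With this val3, futex_func_wake_op() below atomically performs
 * *uaddr2 += 1, always wakes up to val waiters on uaddr, and, because
 * the old value of *uaddr2 is compared as "oldval > 0", additionally
 * wakes up to val2 waiters on uaddr2 when the old value was positive.
 */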
1677
1678 /*
1679 * futex_func_wake_op(uaddr, val, uaddr2, val2, val3, retval)
1680 *
1681 * Implement futex(FUTEX_WAKE_OP).
1682 */
1683 static int
1684 futex_func_wake_op(bool shared, int *uaddr, int val, int *uaddr2, int val2,
1685 int val3, register_t *retval)
1686 {
1687 struct futex *f = NULL, *f2 = NULL;
1688 int oldval, newval, actual;
1689 unsigned nwoken = 0;
1690 int error;
1691
1692 /* Reject negative number of wakeups. */
1693 if (val < 0 || val2 < 0) {
1694 error = EINVAL;
1695 goto out;
1696 }
1697
1698 /* Reject invalid operations before we start doing things. */
1699 if ((error = futex_validate_op_cmp(val3)) != 0)
1700 goto out;
1701
1702 /* Look up the first futex, if any. */
1703 error = futex_lookup(uaddr, shared, FUTEX_CLASS_NORMAL, &f);
1704 if (error)
1705 goto out;
1706
1707 /* Look up the second futex, if any. */
1708 error = futex_lookup(uaddr2, shared, FUTEX_CLASS_NORMAL, &f2);
1709 if (error)
1710 goto out;
1711
1712 /*
1713 * Under the op locks:
1714 *
1715 * 1. Read/modify/write: *uaddr2 op= oparg.
1716 * 2. Unconditionally wake uaddr.
1717 * 3. Conditionally wake uaddr2, if it previously matched val3.
1718 */
1719 futex_op_lock2(f, f2);
1720 do {
1721 error = futex_load(uaddr2, &oldval);
1722 if (error)
1723 goto out_unlock;
1724 newval = futex_compute_op(oldval, val3);
1725 error = ucas_int(uaddr2, oldval, newval, &actual);
1726 if (error)
1727 goto out_unlock;
1728 } while (actual != oldval);
1729 nwoken = (f ? futex_wake(f, FUTEX_WRITERQ, val,
1730 NULL, FUTEX_WRITERQ, 0,
1731 FUTEX_BITSET_MATCH_ANY) : 0);
1732 if (f2 && futex_compute_cmp(oldval, val3))
1733 nwoken += futex_wake(f2, FUTEX_WRITERQ, val2,
1734 NULL, FUTEX_WRITERQ, 0,
1735 FUTEX_BITSET_MATCH_ANY);
1736
1737 /* Success! */
1738 error = 0;
1739 out_unlock:
1740 futex_op_unlock2(f, f2);
1741
1742 out:
1743 /* Return the number of waiters woken. */
1744 *retval = nwoken;
1745
1746 /* Release the futexes, if we got them. */
1747 if (f2)
1748 futex_rele(f2);
1749 if (f)
1750 futex_rele(f);
1751 return error;
1752 }
1753
1754 /*
1755 * do_futex(uaddr, op, val, timeout, uaddr2, val2, val3)
1756 *
1757 * Implement the futex system call with all the parameters
1758 * parsed out.
1759 */
1760 int
1761 do_futex(int *uaddr, int op, int val, const struct timespec *timeout,
1762 int *uaddr2, int val2, int val3, register_t *retval)
1763 {
1764 const bool shared = (op & FUTEX_PRIVATE_FLAG) ? false : true;
1765 const clockid_t clkid = (op & FUTEX_CLOCK_REALTIME) ? CLOCK_REALTIME
1766 : CLOCK_MONOTONIC;
1767 int error;
1768
1769 op &= FUTEX_CMD_MASK;
1770
1771 switch (op) {
1772 case FUTEX_WAIT:
1773 SDT_PROBE6(futex, func, wait, entry,
1774 uaddr, val, FUTEX_BITSET_MATCH_ANY, timeout,
1775 TIMER_RELTIME, op & ~FUTEX_CMD_MASK);
1776 error = futex_func_wait(shared, uaddr, val,
1777 FUTEX_BITSET_MATCH_ANY, timeout, clkid, TIMER_RELTIME,
1778 retval);
1779 SDT_PROBE1(futex, func, wait, exit, error);
1780 break;
1781
1782 case FUTEX_WAIT_BITSET:
1783 SDT_PROBE6(futex, func, wait, entry,
1784 uaddr, val, val3, timeout,
1785 TIMER_ABSTIME, op & ~FUTEX_CMD_MASK);
1786 error = futex_func_wait(shared, uaddr, val, val3, timeout,
1787 clkid, TIMER_ABSTIME, retval);
1788 SDT_PROBE1(futex, func, wait, exit, error);
1789 break;
1790
1791 case FUTEX_WAKE:
1792 SDT_PROBE4(futex, func, wake, entry,
1793 uaddr, val, FUTEX_BITSET_MATCH_ANY, op & ~FUTEX_CMD_MASK);
1794 error = futex_func_wake(shared, uaddr, val,
1795 FUTEX_BITSET_MATCH_ANY, retval);
1796 SDT_PROBE2(futex, func, wake, exit, error, *retval);
1797 break;
1798
1799 case FUTEX_WAKE_BITSET:
1800 SDT_PROBE4(futex, func, wake, entry,
1801 uaddr, val, val3, op & ~FUTEX_CMD_MASK);
1802 error = futex_func_wake(shared, uaddr, val, val3, retval);
1803 SDT_PROBE2(futex, func, wake, exit, error, *retval);
1804 break;
1805
1806 case FUTEX_REQUEUE:
1807 SDT_PROBE5(futex, func, requeue, entry,
1808 uaddr, val, uaddr2, val2, op & ~FUTEX_CMD_MASK);
1809 error = futex_func_requeue(shared, op, uaddr, val, uaddr2,
1810 val2, 0, retval);
1811 SDT_PROBE2(futex, func, requeue, exit, error, *retval);
1812 break;
1813
1814 case FUTEX_CMP_REQUEUE:
1815 SDT_PROBE6(futex, func, cmp_requeue, entry,
1816 uaddr, val, uaddr2, val2, val3, op & ~FUTEX_CMD_MASK);
1817 error = futex_func_requeue(shared, op, uaddr, val, uaddr2,
1818 val2, val3, retval);
1819 SDT_PROBE2(futex, func, cmp_requeue, exit, error, *retval);
1820 break;
1821
1822 case FUTEX_WAKE_OP:
1823 SDT_PROBE6(futex, func, wake_op, entry,
1824 uaddr, val, uaddr2, val2, val3, op & ~FUTEX_CMD_MASK);
1825 error = futex_func_wake_op(shared, uaddr, val, uaddr2, val2,
1826 val3, retval);
1827 SDT_PROBE2(futex, func, wake_op, exit, error, *retval);
1828 break;
1829
1830 case FUTEX_FD:
1831 default:
1832 error = ENOSYS;
1833 break;
1834 }
1835
1836 return error;
1837 }
1838
1839 /*
1840 * sys___futex(l, uap, retval)
1841 *
1842 * __futex(2) system call: generic futex operations.
1843 */
1844 int
1845 sys___futex(struct lwp *l, const struct sys___futex_args *uap,
1846 register_t *retval)
1847 {
1848 /* {
1849 syscallarg(int *) uaddr;
1850 syscallarg(int) op;
1851 syscallarg(int) val;
1852 syscallarg(const struct timespec *) timeout;
1853 syscallarg(int *) uaddr2;
1854 syscallarg(int) val2;
1855 syscallarg(int) val3;
1856 } */
1857 struct timespec ts, *tsp;
1858 int error;
1859
1860 /*
1861 * Copy in the timeout argument, if specified.
1862 */
1863 if (SCARG(uap, timeout)) {
1864 error = copyin(SCARG(uap, timeout), &ts, sizeof(ts));
1865 if (error)
1866 return error;
1867 tsp = &ts;
1868 } else {
1869 tsp = NULL;
1870 }
1871
1872 return do_futex(SCARG(uap, uaddr), SCARG(uap, op), SCARG(uap, val),
1873 tsp, SCARG(uap, uaddr2), SCARG(uap, val2), SCARG(uap, val3),
1874 retval);
1875 }
1876
1877 /*
1878 * sys___futex_set_robust_list(l, uap, retval)
1879 *
1880 * __futex_set_robust_list(2) system call for robust futexes.
1881 */
1882 int
1883 sys___futex_set_robust_list(struct lwp *l,
1884 const struct sys___futex_set_robust_list_args *uap, register_t *retval)
1885 {
1886 /* {
1887 syscallarg(void *) head;
1888 syscallarg(size_t) len;
1889 } */
1890 void *head = SCARG(uap, head);
1891
1892 if (SCARG(uap, len) != _FUTEX_ROBUST_HEAD_SIZE)
1893 return EINVAL;
1894 if ((uintptr_t)head % sizeof(u_long))
1895 return EINVAL;
1896
1897 l->l_robust_head = (uintptr_t)head;
1898
1899 return 0;
1900 }
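
/*
 * A hedged sketch of the userland side of this interface.  The head
 * layout mirrors the _FUTEX_ROBUST_HEAD_{LIST,OFFSET,PENDING} words
 * consumed by futex_release_all_lwp() below; the struct and field
 * names here are illustrative, not a published API:
 *
 *	struct robust_head {
 *		void	*list;		// circular list of held robust futexes
 *		long	 offset;	// entry address -> futex word address
 *		void	*pending;	// entry being acquired/released right now
 *	};
 *
 *	static struct robust_head head;
 *
 *	head.list = &head;		// empty list points back at the head
 *	head.offset = offsetof(struct my_mutex, futex_word);
 *	head.pending = NULL;
 *	__futex_set_robust_list(&head, _FUTEX_ROBUST_HEAD_SIZE);
 */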
1901
1902 /*
1903 * sys___futex_get_robust_list(l, uap, retval)
1904 *
1905 * __futex_get_robust_list(2) system call for robust futexes.
1906 */
1907 int
1908 sys___futex_get_robust_list(struct lwp *l,
1909 const struct sys___futex_get_robust_list_args *uap, register_t *retval)
1910 {
1911 /* {
1912 syscallarg(lwpid_t) lwpid;
1913 syscallarg(void **) headp;
1914 syscallarg(size_t *) lenp;
1915 } */
1916 void *head;
1917 const size_t len = _FUTEX_ROBUST_HEAD_SIZE;
1918 int error;
1919
1920 error = futex_robust_head_lookup(l, SCARG(uap, lwpid), &head);
1921 if (error)
1922 return error;
1923
1924 /* Copy out the head pointer and the head structure length. */
1925 error = copyout(&head, SCARG(uap, headp), sizeof(head));
1926 if (__predict_true(error == 0)) {
1927 error = copyout(&len, SCARG(uap, lenp), sizeof(len));
1928 }
1929
1930 return error;
1931 }
1932
1933 /*
1934 * release_futex(uva, tid)
1935 *
1936 * Try to release the robust futex at uva in the current process
1937 * on lwp exit. If anything goes wrong, silently fail. It is the
1938 * userland program's obligation to arrange correct behaviour.
1939 */
1940 static void
1941 release_futex(uintptr_t const uptr, lwpid_t const tid, bool const is_pi,
1942 bool const is_pending)
1943 {
1944 int *uaddr;
1945 struct futex *f;
1946 int oldval, newval, actual;
1947 int error;
1948 bool shared;
1949
1950 /* If it's misaligned, tough. */
1951 if (__predict_false(uptr & 3))
1952 return;
1953 uaddr = (int *)uptr;
1954
1955 error = futex_load(uaddr, &oldval);
1956 if (__predict_false(error))
1957 return;
1958
1959 /*
1960 * There are two race conditions we need to handle here:
1961 *
1962 * 1. User space cleared the futex word but died before
1963 * being able to issue the wakeup. No wakeups will
1964 * ever be issued, oops!
1965 *
1966 * 2. Awakened waiter died before being able to acquire
1967 * the futex in user space. Any other waiters are
1968 * now stuck, oops!
1969 *
1970 * In both of these cases, the futex word will be 0 (because
1971 * it's updated before the wake is issued). The best we can
1972 * do is detect this situation if it's the pending futex and
1973 * issue a wake without modifying the futex word.
1974 *
1975 * XXX eventual PI handling?
1976 */
1977 if (__predict_false(is_pending && (oldval & ~FUTEX_WAITERS) == 0)) {
1978 register_t retval;
1979 error = futex_func_wake(/*shared*/true, uaddr, 1,
1980 FUTEX_BITSET_MATCH_ANY, &retval);
1981 if (error != 0 || retval == 0)
1982 (void) futex_func_wake(/*shared*/false, uaddr, 1,
1983 FUTEX_BITSET_MATCH_ANY, &retval);
1984 return;
1985 }
1986
1987 /* Optimistically test whether we need to do anything at all. */
1988 if ((oldval & FUTEX_TID_MASK) != tid)
1989 return;
1990
1991 /*
1992 * We need to handle the case where this thread owned the futex,
1993 * but it was uncontended. In this case, there won't be any
1994 * kernel state to look up. All we can do is mark the futex
1995 * as a zombie to be mopped up the next time another thread
1996 * attempts to acquire it.
1997 *
1998 * N.B. It's important to ensure FUTEX_OWNER_DIED gets set in
1999 * this loop, even if waiters appear while we're doing
2000 * so. This is because FUTEX_WAITERS is set by user space
2001 * before calling __futex() to wait, and the futex needs
2002 * to be marked as a zombie when the new waiter gets into
2003 * the kernel.
2004 */
2005 if ((oldval & FUTEX_WAITERS) == 0) {
2006 do {
2007 error = futex_load(uaddr, &oldval);
2008 if (error)
2009 return;
2010 if ((oldval & FUTEX_TID_MASK) != tid)
2011 return;
2012 newval = oldval | FUTEX_OWNER_DIED;
2013 error = ucas_int(uaddr, oldval, newval, &actual);
2014 if (error)
2015 return;
2016 } while (actual != oldval);
2017
2018 /*
2019 * If there is still no indication of waiters, then there is
2020 * no more work for us to do.
2021 */
2022 if ((oldval & FUTEX_WAITERS) == 0)
2023 return;
2024 }
2025
2026 /*
2027 * Look for a futex. Try shared first, then private. If we
2028 * can't find one, tough.
2029 *
2030 * Note: the assumption here is that anyone placing a futex on
2031 * the robust list is adhering to the rules, regardless of the
2032 * futex class.
2033 */
2034 for (f = NULL, shared = true; f == NULL; shared = false) {
2035 error = futex_lookup(uaddr, shared, FUTEX_CLASS_ANY, &f);
2036 if (error)
2037 return;
2038 if (f == NULL && shared == false)
2039 return;
2040 }
2041
2042 /* Work under the futex op lock. */
2043 futex_op_lock(f);
2044
2045 /*
2046 * Fetch the word: if the tid doesn't match ours, skip;
2047 * otherwise, set the owner-died bit, atomically.
2048 */
2049 do {
2050 error = futex_load(uaddr, &oldval);
2051 if (error)
2052 goto out;
2053 if ((oldval & FUTEX_TID_MASK) != tid)
2054 goto out;
2055 newval = oldval | FUTEX_OWNER_DIED;
2056 error = ucas_int(uaddr, oldval, newval, &actual);
2057 if (error)
2058 goto out;
2059 } while (actual != oldval);
2060
2061 /*
2062 * If there may be waiters, try to wake one. If anything goes
2063 * wrong, tough.
2064 *
2065 * XXX eventual PI handling?
2066 */
2067 if (oldval & FUTEX_WAITERS) {
2068 (void)futex_wake(f, FUTEX_WRITERQ, 1,
2069 NULL, FUTEX_WRITERQ, 0,
2070 FUTEX_BITSET_MATCH_ANY);
2071 }
2072
2073 /* Unlock the queue and release the futex. */
2074 out: futex_op_unlock(f);
2075 futex_rele(f);
2076 }
2077
2078 /*
2079 * futex_robust_head_lookup(l, lwpid)
2080 *
2081 * Helper function to look up a robust head by LWP ID.
2082 */
2083 int
2084 futex_robust_head_lookup(struct lwp *l, lwpid_t lwpid, void **headp)
2085 {
2086 struct proc *p = l->l_proc;
2087
2088 /* Find the other lwp, if requested; otherwise use our robust head. */
2089 if (lwpid) {
2090 mutex_enter(p->p_lock);
2091 l = lwp_find(p, lwpid);
2092 if (l == NULL) {
2093 mutex_exit(p->p_lock);
2094 return ESRCH;
2095 }
2096 *headp = (void *)l->l_robust_head;
2097 mutex_exit(p->p_lock);
2098 } else {
2099 *headp = (void *)l->l_robust_head;
2100 }
2101 return 0;
2102 }
2103
2104 /*
2105 * futex_fetch_robust_head(uaddr)
2106 *
2107 * Helper routine to fetch the futex robust list head that
2108 * handles 32-bit binaries running on 64-bit kernels.
2109 */
2110 static int
2111 futex_fetch_robust_head(uintptr_t uaddr, u_long *rhead)
2112 {
2113 #ifdef _LP64
2114 if (curproc->p_flag & PK_32) {
2115 uint32_t rhead32[_FUTEX_ROBUST_HEAD_NWORDS];
2116 int error;
2117
2118 error = copyin((void *)uaddr, rhead32, sizeof(rhead32));
2119 if (__predict_true(error == 0)) {
2120 for (int i = 0; i < _FUTEX_ROBUST_HEAD_NWORDS; i++) {
2121 if (i == _FUTEX_ROBUST_HEAD_OFFSET) {
2122 /*
2123 * Make sure the offset is sign-
2124 * extended.
2125 */
2126 rhead[i] = (int32_t)rhead32[i];
2127 } else {
2128 rhead[i] = rhead32[i];
2129 }
2130 }
2131 }
2132 return error;
2133 }
2134 #endif /* _LP64 */
2135
2136 return copyin((void *)uaddr, rhead,
2137 sizeof(*rhead) * _FUTEX_ROBUST_HEAD_NWORDS);
2138 }
2139
2140 /*
2141 * futex_decode_robust_word(word)
2142 *
2143 * Decode a robust futex list word into the entry and entry
2144 * properties.
2145 */
2146 static inline void
2147 futex_decode_robust_word(uintptr_t const word, uintptr_t * const entry,
2148 bool * const is_pi)
2149 {
2150 *is_pi = (word & _FUTEX_ROBUST_ENTRY_PI) ? true : false;
2151 *entry = word & ~_FUTEX_ROBUST_ENTRY_PI;
2152 }
2153
2154 /*
2155 * futex_fetch_robust_entry(uaddr)
2156 *
2157 * Helper routine to fetch and decode a robust futex entry
2158 * that handles 32-bit binaries running on 64-bit kernels.
2159 */
2160 static int
2161 futex_fetch_robust_entry(uintptr_t const uaddr, uintptr_t * const valp,
2162 bool * const is_pi)
2163 {
2164 uintptr_t val = 0;
2165 int error = 0;
2166
2167 #ifdef _LP64
2168 if (curproc->p_flag & PK_32) {
2169 uint32_t val32;
2170
2171 error = ufetch_32((uint32_t *)uaddr, &val32);
2172 if (__predict_true(error == 0))
2173 val = val32;
2174 } else
2175 #endif /* _LP64 */
2176 error = ufetch_long((u_long *)uaddr, (u_long *)&val);
2177 if (__predict_false(error))
2178 return error;
2179
2180 futex_decode_robust_word(val, valp, is_pi);
2181 return 0;
2182 }
2183
2184 /*
2185 * futex_release_all_lwp(l, tid)
2186 *
2187 * Release all l's robust futexes. If anything looks funny in
2188 * the process, give up -- it's userland's responsibility to dot
2189 * the i's and cross the t's.
2190 */
2191 void
2192 futex_release_all_lwp(struct lwp * const l, lwpid_t const tid)
2193 {
2194 u_long rhead[_FUTEX_ROBUST_HEAD_NWORDS];
2195 int limit = 1000000;
2196 int error;
2197
2198 /* If there's no robust list there's nothing to do. */
2199 if (l->l_robust_head == 0)
2200 return;
2201
2202 /* Read the final snapshot of the robust list head. */
2203 error = futex_fetch_robust_head(l->l_robust_head, rhead);
2204 if (error) {
2205 printf("WARNING: pid %jd (%s) lwp %jd tid %jd:"
2206 " unmapped robust futex list head\n",
2207 (uintmax_t)l->l_proc->p_pid, l->l_proc->p_comm,
2208 (uintmax_t)l->l_lid, (uintmax_t)tid);
2209 return;
2210 }
2211
2212 const long offset = (long)rhead[_FUTEX_ROBUST_HEAD_OFFSET];
2213
2214 uintptr_t next, pending;
2215 bool is_pi, pending_is_pi;
2216
2217 futex_decode_robust_word(rhead[_FUTEX_ROBUST_HEAD_LIST],
2218 &next, &is_pi);
2219 futex_decode_robust_word(rhead[_FUTEX_ROBUST_HEAD_PENDING],
2220 &pending, &pending_is_pi);
2221
2222 /*
2223 * Walk down the list of locked futexes and release them, up
2224 * to one million of them before we give up.
2225 */
2226
2227 while (next != l->l_robust_head && limit-- > 0) {
2228 /* pending handled below. */
2229 if (next != pending)
2230 release_futex(next + offset, tid, is_pi, false);
2231 error = futex_fetch_robust_entry(next, &next, &is_pi);
2232 if (error)
2233 break;
2234 preempt_point();
2235 }
2236 if (limit <= 0) {
2237 printf("WARNING: pid %jd (%s) lwp %jd tid %jd:"
2238 " exhausted robust futex limit\n",
2239 (uintmax_t)l->l_proc->p_pid, l->l_proc->p_comm,
2240 (uintmax_t)l->l_lid, (uintmax_t)tid);
2241 }
2242
2243 /* If there's a pending futex, it may need to be released too. */
2244 if (pending != 0) {
2245 release_futex(pending + offset, tid, pending_is_pi, true);
2246 }
2247 }
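
/*
 * A hedged sketch of the per-lock protocol the walk above assumes,
 * using the illustrative robust_head layout sketched near
 * sys___futex_set_robust_list(); the entry and field names are not
 * a published API:
 *
 *	head.pending = &m->robust_entry;	// about to take m
 *	lock(m);				// CAS our TID into m->futex_word
 *	enqueue(&head.list, &m->robust_entry);	// now reachable from the list
 *	head.pending = NULL;
 *
 * and the reverse on unlock, so that at every instant each held lock
 * is reachable from either head.list or head.pending, and a lock that
 * dies mid-release is caught by the is_pending case in release_futex().
 */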
2248