1 /* $NetBSD: sys_futex.c,v 1.12.4.1 2021/08/05 23:14:21 thorpej Exp $ */
2
3 /*-
4 * Copyright (c) 2018, 2019, 2020 The NetBSD Foundation, Inc.
5 * All rights reserved.
6 *
7 * This code is derived from software contributed to The NetBSD Foundation
8 * by Taylor R. Campbell and Jason R. Thorpe.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 * POSSIBILITY OF SUCH DAMAGE.
30 */
31
32 #include <sys/cdefs.h>
33 __KERNEL_RCSID(0, "$NetBSD: sys_futex.c,v 1.12.4.1 2021/08/05 23:14:21 thorpej Exp $");
34
35 /*
36 * Futexes
37 *
38 * The futex system call coordinates notifying threads waiting for
39 * changes on a 32-bit word of memory. The word can be managed by
40 * CPU atomic operations in userland, without system calls, as long
41 * as there is no contention.
42 *
43 * The simplest use case demonstrating the utility is:
44 *
45 * // 32-bit word of memory shared among threads or
46 * // processes in userland. lock & 1 means owned;
47 * // lock & 2 means there are waiters waiting.
48 * volatile int lock = 0;
49 *
50 * int v;
51 *
52 * // Acquire a lock.
53 * do {
54 * v = lock;
55 * if (v & 1) {
56 * // Lock is held. Set a bit to say that
57 * // there are waiters, and wait for lock
58 * // to change to anything other than v;
59 * // then retry.
60 * if (atomic_cas_uint(&lock, v, v | 2) != v)
61 * continue;
62 * futex(FUTEX_WAIT, &lock, v | 2, NULL, NULL, 0);
63 * continue;
64 * }
65 * } while (atomic_cas_uint(&lock, v, v | 1) != v);
66 * membar_enter();
67 *
68 * ...
69 *
70 * // Release the lock. Optimistically assume there are
71 * // no waiters first until demonstrated otherwise.
72 * membar_exit();
73 * if (atomic_cas_uint(&lock, 1, 0) != 1) {
74 * // There may be waiters.
75 * v = atomic_swap_uint(&lock, 0);
76 * // If there are still waiters, wake one.
77 * if (v & 2)
78 * futex(FUTEX_WAKE, &lock, 1, NULL, NULL, 0);
79 * }
80 *
81 * The goal is to avoid the futex system call unless there is
82 * contention; then if there is contention, to guarantee no missed
83 * wakeups.
84 *
85 * For a simple implementation, futex(FUTEX_WAIT) could queue
86 * itself to be woken, double-check the lock word, and then sleep;
87 * spurious wakeups are generally a fact of life, so any
88 * FUTEX_WAKE could just wake every FUTEX_WAIT in the system.
89 *
90 * If this were all there is to it, we could then increase
91 * parallelism by refining the approximation: partition the
92 * waiters into buckets by hashing the lock addresses to reduce
93 * the incidence of spurious wakeups. But this is not all.
94 *
95 * The futex(FUTEX_CMP_REQUEUE, &lock, n, &lock2, m, val)
96 * operation not only wakes n waiters on lock if lock == val, but
97 * also _transfers_ m additional waiters to lock2. Unless wakeups
98 * on lock2 also trigger wakeups on lock, we cannot move waiters
99 * to lock2 if they merely share the same hash as waiters on lock.
100 * Thus, we can't approximately distribute waiters into queues by
101 * a hash function; we must distinguish futex queues exactly by
102 * lock address.
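 *
 * As a sketch only (not part of the protocol above): a condition-
 * variable broadcast could use this to wake one waiter and shovel
 * the remaining waiters onto the mutex word, avoiding a thundering
 * herd.  Here cond, mtx, and seq are hypothetical userland words:
 *
 * // Wake one waiter on cond; if cond still equals seq, transfer
 * // up to INT_MAX remaining waiters onto mtx instead of waking
 * // them.  Fails with EAGAIN if cond != seq, in which case the
 * // caller re-reads seq and retries the broadcast.
 * futex(FUTEX_CMP_REQUEUE, &cond, 1, &mtx, INT_MAX, seq);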
103 *
104 * For now, we use a global red/black tree to index futexes. This
105 * should be replaced by a lockless radix tree with a thread to
106 * free entries no longer in use once all lookups on all CPUs have
107 * completed.
108 *
109 * Specifically, we maintain two maps:
110 *
111 * futex_tab.va[vmspace, va] for private futexes
112 * futex_tab.oa[uvm_voaddr] for shared futexes
113 *
114 * This implementation does not support priority inheritance.
115 */
116
117 #include <sys/param.h>
118 #include <sys/types.h>
119 #include <sys/atomic.h>
120 #include <sys/condvar.h>
121 #include <sys/futex.h>
122 #include <sys/mutex.h>
123 #include <sys/rbtree.h>
124 #include <sys/sleepq.h>
125 #include <sys/queue.h>
126 #include <sys/sdt.h>
127
128 #include <sys/syscall.h>
129 #include <sys/syscallargs.h>
130 #include <sys/syscallvar.h>
131
132 #include <uvm/uvm_extern.h>
133
134 /*
135 * DTrace probes.
136 */
137 SDT_PROVIDER_DEFINE(futex);
138
139 /* entry: uaddr, val, bitset, timeout, clkflags, fflags */
140 /* exit: error */
141 SDT_PROBE_DEFINE6(futex, func, wait, entry, "int *", "int", "int",
142 "struct timespec *", "int", "int");
143 SDT_PROBE_DEFINE1(futex, func, wait, exit, "int");
144
145 /* entry: uaddr, nwake, bitset, fflags */
146 /* exit: error, nwoken */
147 SDT_PROBE_DEFINE4(futex, func, wake, entry, "int *", "int", "int", "int");
148 SDT_PROBE_DEFINE2(futex, func, wake, exit, "int", "int");
149
150 /* entry: uaddr, nwake, uaddr2, nrequeue, fflags */
151 /* exit: error, nwoken */
152 SDT_PROBE_DEFINE5(futex, func, requeue, entry, "int *", "int", "int *", "int",
153 "int");
154 SDT_PROBE_DEFINE2(futex, func, requeue, exit, "int", "int");
155
156 /* entry: uaddr, nwake, uaddr2, nrequeue, val3, fflags */
157 /* exit: error, nwoken */
158 SDT_PROBE_DEFINE6(futex, func, cmp_requeue, entry, "int *", "int", "int *",
159 "int", "int", "int");
160 SDT_PROBE_DEFINE2(futex, func, cmp_requeue, exit, "int", "int");
161
162 /* entry: uaddr, nwake, uaddr2, nwake2, wakeop, fflags */
163 /* exit: error, nwoken */
164 SDT_PROBE_DEFINE6(futex, func, wake_op, entry, "int *", "int", "int *", "int",
165 "int", "int");
166 SDT_PROBE_DEFINE2(futex, func, wake_op, exit, "int", "int");
167
168 SDT_PROBE_DEFINE0(futex, wait, finish, normally);
169 SDT_PROBE_DEFINE0(futex, wait, finish, wakerace);
170 SDT_PROBE_DEFINE0(futex, wait, finish, aborted);
171
172 /* entry: timo */
173 /* exit: error */
174 SDT_PROBE_DEFINE1(futex, wait, sleepq_block, entry, "int");
175 SDT_PROBE_DEFINE1(futex, wait, sleepq_block, exit, "int");
176
177 /*
178 * Lock order:
179 *
180 * futex_tab.lock
181 * futex::fx_op_lock ordered by kva of struct futex
182 * -> futex::fx_sq_lock ordered by kva of sleepq lock
183 *
184 * N.B. multiple futexes can share a single sleepq lock.
185 */
186
187 /*
188 * union futex_key
189 *
190 * A futex is addressed either by a vmspace+va (private) or by
191 * a uvm_voaddr (shared).
192 */
193 union futex_key {
194 struct {
195 struct vmspace *vmspace;
196 vaddr_t va;
197 } fk_private;
198 struct uvm_voaddr fk_shared;
199 };
200
201 /*
202 * struct futex
203 *
204 * Kernel state for a futex located at a particular address in a
205 * particular virtual address space.
206 *
207 * N.B. fx_refcnt is an unsigned long because we need to be able
208 * to operate on it atomically on all systems while at the same
209 * time making it practically impossible for the count to reach
210 * its max value. In practice, we're limited by the number of LWPs
211 * that can be present on the system at any given time, and the
212 * assumption is that limit will be good enough on a 32-bit platform.
213 * See futex_wake() for why overflow needs to be avoided.
214 *
215 * XXX Since futex addresses must be 4-byte aligned, we could
216 * XXX squirrel away fx_shared and fx_on_tree bits in the "va"
217 * XXX field of the key. Worth it?
218 */
219 struct futex {
220 union futex_key fx_key;
221 struct rb_node fx_node;
222 unsigned long fx_refcnt;
223 bool fx_shared;
224 bool fx_on_tree;
225 uint8_t fx_class;
226
227 kmutex_t fx_op_lock; /* adaptive */
228 kmutex_t * fx_sq_lock; /* &sleepq_locks[...] */
229 sleepq_t fx_sqs[2]; /* 0=reader, 1=writer */
230 unsigned int fx_nwaiters[2];
231 };
232
233 /*
234 * futex classes: Some futex operations can only be carried out on
235 * futexes that are known to be abiding by a certain protocol. These
236 * classes are assigned to a futex when created due to a wait event,
237 * and when subsequent wake or requeue operations are issued, the
238 * class is checked at futex lookup time. If the class does not match,
239 * EINVAL is the result.
240 */
241 #define FUTEX_CLASS_ANY 0 /* match any class in lookup */
242 #define FUTEX_CLASS_NORMAL 1 /* normal futex */
243 #define FUTEX_CLASS_RWLOCK 2 /* rwlock futex */
244 #define FUTEX_CLASS_PI 3 /* for FUTEX_*_PI ops */
245
246 /* sleepq definitions */
247 #define FUTEX_READERQ 0
248 #define FUTEX_WRITERQ 1
249
250 static const char futex_wmesg[] = "futex";
251
252 static void futex_unsleep(lwp_t *, bool);
253
254 static syncobj_t futex_syncobj = {
255 .sobj_flag = SOBJ_SLEEPQ_SORTED,
256 .sobj_unsleep = futex_unsleep,
257 .sobj_changepri = sleepq_changepri,
258 .sobj_lendpri = sleepq_lendpri,
259 .sobj_owner = syncobj_noowner,
260 };
261
262 /*
263 * futex_tab
264 *
265 * Global trees of futexes by vmspace/va and VM object address.
266 *
267 * XXX This obviously doesn't scale in parallel. We could use a
268 * pserialize-safe data structure, but there may be a high cost to
269 * frequent deletion since we don't cache futexes after we're done
270 * with them. We could use hashed locks. But for now, just make
271 * sure userland can't DoS the serial performance, by using a
272 * balanced binary tree for lookup.
273 *
274 * XXX We could use a per-process tree for the table indexed by
275 * virtual address to reduce contention between processes.
276 */
277 static struct {
278 kmutex_t lock;
279 struct rb_tree va;
280 struct rb_tree oa;
281 } futex_tab __cacheline_aligned;
282
283 static int
284 compare_futex_key(void *cookie, const void *n, const void *k)
285 {
286 const struct futex *fa = n;
287 const union futex_key *fka = &fa->fx_key;
288 const union futex_key *fkb = k;
289
290 if ((uintptr_t)fka->fk_private.vmspace <
291 (uintptr_t)fkb->fk_private.vmspace)
292 return -1;
293 if ((uintptr_t)fka->fk_private.vmspace >
294 (uintptr_t)fkb->fk_private.vmspace)
295 return +1;
296 if (fka->fk_private.va < fkb->fk_private.va)
297 return -1;
298 if (fka->fk_private.va > fkb->fk_private.va)
299 return +1;
300 return 0;
301 }
302
303 static int
304 compare_futex(void *cookie, const void *na, const void *nb)
305 {
306 const struct futex *fa = na;
307 const struct futex *fb = nb;
308
309 return compare_futex_key(cookie, fa, &fb->fx_key);
310 }
311
312 static const rb_tree_ops_t futex_rb_ops = {
313 .rbto_compare_nodes = compare_futex,
314 .rbto_compare_key = compare_futex_key,
315 .rbto_node_offset = offsetof(struct futex, fx_node),
316 };
317
318 static int
319 compare_futex_shared_key(void *cookie, const void *n, const void *k)
320 {
321 const struct futex *fa = n;
322 const union futex_key *fka = &fa->fx_key;
323 const union futex_key *fkb = k;
324
325 return uvm_voaddr_compare(&fka->fk_shared, &fkb->fk_shared);
326 }
327
328 static int
329 compare_futex_shared(void *cookie, const void *na, const void *nb)
330 {
331 const struct futex *fa = na;
332 const struct futex *fb = nb;
333
334 return compare_futex_shared_key(cookie, fa, &fb->fx_key);
335 }
336
337 static const rb_tree_ops_t futex_shared_rb_ops = {
338 .rbto_compare_nodes = compare_futex_shared,
339 .rbto_compare_key = compare_futex_shared_key,
340 .rbto_node_offset = offsetof(struct futex, fx_node),
341 };
342
343 /*
344 * futex_sq_lock2(f, f2)
345 *
346 * Acquire the sleepq locks for f and f2, which may be null, or
347 * which may be the same. If they are distinct, an arbitrary total
348 * order is chosen on the locks.
349 *
350 * Callers should only ever acquire multiple sleepq locks
351 * simultaneously using futex_sq_lock2.
352 */
353 static void
354 futex_sq_lock2(struct futex * const f, struct futex * const f2)
355 {
356
357 /*
358 * If both are null, do nothing; if one is null and the other
359 * is not, lock the other and be done with it.
360 */
361 if (f == NULL && f2 == NULL) {
362 return;
363 } else if (f == NULL) {
364 mutex_spin_enter(f2->fx_sq_lock);
365 return;
366 } else if (f2 == NULL) {
367 mutex_spin_enter(f->fx_sq_lock);
368 return;
369 }
370
371 kmutex_t * const m = f->fx_sq_lock;
372 kmutex_t * const m2 = f2->fx_sq_lock;
373
374 /* If both locks are the same, acquire only one. */
375 if (m == m2) {
376 mutex_spin_enter(m);
377 return;
378 }
379
380 /* Otherwise, use the ordering on the kva of the lock pointer. */
381 if ((uintptr_t)m < (uintptr_t)m2) {
382 mutex_spin_enter(m);
383 mutex_spin_enter(m2);
384 } else {
385 mutex_spin_enter(m2);
386 mutex_spin_enter(m);
387 }
388 }
389
390 /*
391 * futex_sq_unlock2(f, f2)
392 *
393 * Release the sleep queue locks for f and f2, which may be null, or
394 * which may have the same underlying lock.
395 */
396 static void
397 futex_sq_unlock2(struct futex * const f, struct futex * const f2)
398 {
399
400 /*
401 * If both are null, do nothing; if one is null and the other
402 * is not, unlock the other and be done with it.
403 */
404 if (f == NULL && f2 == NULL) {
405 return;
406 } else if (f == NULL) {
407 mutex_spin_exit(f2->fx_sq_lock);
408 return;
409 } else if (f2 == NULL) {
410 mutex_spin_exit(f->fx_sq_lock);
411 return;
412 }
413
414 kmutex_t * const m = f->fx_sq_lock;
415 kmutex_t * const m2 = f2->fx_sq_lock;
416
417 /* If both locks are the same, release only one. */
418 if (m == m2) {
419 mutex_spin_exit(m);
420 return;
421 }
422
423 /* Otherwise, use the ordering on the kva of the lock pointer. */
424 if ((uintptr_t)m < (uintptr_t)m2) {
425 mutex_spin_exit(m2);
426 mutex_spin_exit(m);
427 } else {
428 mutex_spin_exit(m);
429 mutex_spin_exit(m2);
430 }
431 }
432
433 /*
434 * futex_load(uaddr, kaddr)
435 *
436 * Perform a single atomic load to read *uaddr, and return the
437 * result in *kaddr. Return 0 on success, EFAULT if uaddr is not
438 * mapped.
439 */
440 static inline int
441 futex_load(int *uaddr, int *kaddr)
442 {
443 return ufetch_int((u_int *)uaddr, (u_int *)kaddr);
444 }
445
446 /*
447 * futex_test(uaddr, expected)
448 *
449 * True if *uaddr == expected. False if *uaddr != expected, or if
450 * uaddr is not mapped.
451 */
452 static bool
453 futex_test(int *uaddr, int expected)
454 {
455 int val;
456 int error;
457
458 error = futex_load(uaddr, &val);
459 if (error)
460 return false;
461 return val == expected;
462 }
463
464 static pool_cache_t futex_cache __read_mostly;
465
466 static int futex_ctor(void *, void *, int);
467 static void futex_dtor(void *, void *);
468
469 /*
470 * futex_sys_init()
471 *
472 * Initialize the futex subsystem.
473 */
474 void
475 futex_sys_init(void)
476 {
477
478 mutex_init(&futex_tab.lock, MUTEX_DEFAULT, IPL_NONE);
479 rb_tree_init(&futex_tab.va, &futex_rb_ops);
480 rb_tree_init(&futex_tab.oa, &futex_shared_rb_ops);
481
482 futex_cache = pool_cache_init(sizeof(struct futex),
483 coherency_unit, 0, 0, "futex", NULL, IPL_NONE, futex_ctor,
484 futex_dtor, NULL);
485 }
486
487 /*
488 * futex_sys_fini()
489 *
490 * Finalize the futex subsystem.
491 */
492 void
493 futex_sys_fini(void)
494 {
495
496 KASSERT(RB_TREE_MIN(&futex_tab.oa) == NULL);
497 KASSERT(RB_TREE_MIN(&futex_tab.va) == NULL);
498 mutex_destroy(&futex_tab.lock);
499 }
500
501 /*
502 * futex_ctor()
503 *
504 * Futex object constructor.
505 */
506 static int
507 futex_ctor(void *arg __unused, void *obj, int flags __unused)
508 {
509 extern sleepqlock_t sleepq_locks[SLEEPTAB_HASH_SIZE];
510 struct futex * const f = obj;
511
512 mutex_init(&f->fx_op_lock, MUTEX_DEFAULT, IPL_NONE);
513 f->fx_sq_lock = &sleepq_locks[SLEEPTAB_HASH(f)].lock;
514
515 sleepq_init(&f->fx_sqs[FUTEX_READERQ]);
516 sleepq_init(&f->fx_sqs[FUTEX_WRITERQ]);
517 f->fx_nwaiters[FUTEX_READERQ] = f->fx_nwaiters[FUTEX_WRITERQ] = 0;
518
519 return 0;
520 }
521
522 /*
523 * futex_dtor()
524 *
525 * Futex object destructor.
526 */
527 static void
528 futex_dtor(void *arg __unused, void *obj)
529 {
530 struct futex * const f = obj;
531
532 mutex_destroy(&f->fx_op_lock);
533 f->fx_sq_lock = NULL;
534 }
535
536 /*
537 * futex_key_init(key, vm, va, shared)
538 *
539 * Initialize a futex key for lookup, etc.
540 */
541 static int
542 futex_key_init(union futex_key *fk, struct vmspace *vm, vaddr_t va, bool shared)
543 {
544 int error = 0;
545
546 if (__predict_false(shared)) {
547 if (!uvm_voaddr_acquire(&vm->vm_map, va, &fk->fk_shared))
548 error = EFAULT;
549 } else {
550 fk->fk_private.vmspace = vm;
551 fk->fk_private.va = va;
552 }
553
554 return error;
555 }
556
557 /*
558 * futex_key_fini(key, shared)
559 *
560 * Release a futex key.
561 */
562 static void
563 futex_key_fini(union futex_key *fk, bool shared)
564 {
565 if (__predict_false(shared))
566 uvm_voaddr_release(&fk->fk_shared);
567 memset(fk, 0, sizeof(*fk));
568 }
569
570 /*
571 * futex_create(fk, shared, class)
572 *
573 * Create a futex. Initial reference count is 1, representing the
574 * caller. Returns NULL on failure. Always takes ownership of the
575 * key, either transferring it to the newly-created futex, or releasing
576 * the key if creation fails.
577 *
578 * Never sleeps for memory, but may sleep to acquire a lock.
579 */
580 static struct futex *
581 futex_create(union futex_key *fk, bool shared, uint8_t class)
582 {
583 struct futex *f;
584
585 f = pool_cache_get(futex_cache, PR_NOWAIT);
586 if (f == NULL) {
587 futex_key_fini(fk, shared);
588 return NULL;
589 }
590 f->fx_key = *fk;
591 f->fx_refcnt = 1;
592 f->fx_shared = shared;
593 f->fx_on_tree = false;
594 f->fx_class = class;
595
596 return f;
597 }
598
599 /*
600 * futex_destroy(f)
601 *
602 * Destroy a futex created with futex_create. Reference count
603 * must be zero.
604 *
605 * May sleep.
606 */
607 static void
608 futex_destroy(struct futex *f)
609 {
610
611 ASSERT_SLEEPABLE();
612
613 KASSERT(atomic_load_relaxed(&f->fx_refcnt) == 0);
614 KASSERT(!f->fx_on_tree);
615 KASSERT(LIST_EMPTY(&f->fx_sqs[FUTEX_READERQ]));
616 KASSERT(LIST_EMPTY(&f->fx_sqs[FUTEX_WRITERQ]));
617 KASSERT(f->fx_nwaiters[FUTEX_READERQ] == 0);
618 KASSERT(f->fx_nwaiters[FUTEX_WRITERQ] == 0);
619
620 futex_key_fini(&f->fx_key, f->fx_shared);
621
622 pool_cache_put(futex_cache, f);
623 }
624
625 /*
626 * futex_hold_count(f, n)
627 *
628 * Attempt to acquire a reference to f. Return 0 on success,
629 * ENFILE on too many references.
630 *
631 * Never sleeps.
632 */
633 static int
634 futex_hold_count(struct futex *f, unsigned long const count)
635 {
636 unsigned long refcnt;
637
638 do {
639 refcnt = atomic_load_relaxed(&f->fx_refcnt);
640 if (ULONG_MAX - refcnt < count)
641 return ENFILE;
642 } while (atomic_cas_ulong(&f->fx_refcnt, refcnt,
643 refcnt + count) != refcnt);
644
645 return 0;
646 }
647 #define futex_hold(f) futex_hold_count(f, 1)
648
649 /*
650 * futex_rele_count(f, n)
651 *
652 * Release a reference to f acquired with futex_create or
653 * futex_hold.
654 *
655 * May sleep to free f.
656 */
657 static void
658 futex_rele_count(struct futex *f, unsigned long const count)
659 {
660 unsigned long refcnt;
661
662 ASSERT_SLEEPABLE();
663
664 do {
665 refcnt = atomic_load_relaxed(&f->fx_refcnt);
666 KASSERT(refcnt >= count);
667 if (refcnt - count == 0)
668 goto trylast;
669 } while (atomic_cas_ulong(&f->fx_refcnt, refcnt,
670 refcnt - count) != refcnt);
671 return;
672
673 trylast:
674 KASSERT(count <= LONG_MAX);
675 mutex_enter(&futex_tab.lock);
676 if (atomic_add_long_nv(&f->fx_refcnt, -(long)count) == 0) {
677 if (f->fx_on_tree) {
678 if (__predict_false(f->fx_shared))
679 rb_tree_remove_node(&futex_tab.oa, f);
680 else
681 rb_tree_remove_node(&futex_tab.va, f);
682 f->fx_on_tree = false;
683 }
684 } else {
685 /* References remain -- don't destroy it. */
686 f = NULL;
687 }
688 mutex_exit(&futex_tab.lock);
689 if (f != NULL)
690 futex_destroy(f);
691 }
692 #define futex_rele(f) futex_rele_count(f, 1)
693
694 /*
695 * futex_rele_count_not_last(f, n)
696 *
697 * Release a reference to f acquired with futex_create or
698 * futex_hold.
699 *
700 * This version asserts that we are not dropping the last
701 * reference to f.
702 */
703 static void
704 futex_rele_count_not_last(struct futex *f, unsigned long const count)
705 {
706 unsigned long refcnt;
707
708 do {
709 refcnt = atomic_load_relaxed(&f->fx_refcnt);
710 KASSERT(refcnt >= count);
711 } while (atomic_cas_ulong(&f->fx_refcnt, refcnt,
712 refcnt - count) != refcnt);
713 }
714
715 /*
716 * futex_lookup_by_key(key, shared, class, &f)
717 *
718 * Try to find an existing futex matching the specified key.
719 * On success, return 0, set f to the futex found or to NULL if not found,
720 * and increment f's reference count if found.
721 *
722 * Return ENFILE if reference count too high.
723 *
724 * Internal lookup routine shared by futex_lookup() and
725 * futex_lookup_create().
726 */
727 static int
728 futex_lookup_by_key(union futex_key *fk, bool shared, uint8_t class,
729 struct futex **fp)
730 {
731 struct futex *f;
732 int error = 0;
733
734 mutex_enter(&futex_tab.lock);
735 if (__predict_false(shared)) {
736 f = rb_tree_find_node(&futex_tab.oa, fk);
737 } else {
738 f = rb_tree_find_node(&futex_tab.va, fk);
739 }
740 if (f) {
741 if (__predict_true(f->fx_class == class ||
742 class == FUTEX_CLASS_ANY))
743 error = futex_hold(f);
744 else
745 error = EINVAL;
746 if (error)
747 f = NULL;
748 }
749 *fp = f;
750 mutex_exit(&futex_tab.lock);
751
752 return error;
753 }
754
755 /*
756 * futex_insert(f, fp)
757 *
758 * Try to insert the futex f into the tree by va. If there
759 * already is a futex for its va, acquire a reference to it, and
760 * store it in *fp; otherwise store f in *fp.
761 *
762 * Return 0 on success, ENFILE if there already is a futex but its
763 * reference count is too high.
764 */
765 static int
766 futex_insert(struct futex *f, struct futex **fp)
767 {
768 struct futex *f0;
769 int error;
770
771 KASSERT(atomic_load_relaxed(&f->fx_refcnt) != 0);
772 KASSERT(!f->fx_on_tree);
773
774 mutex_enter(&futex_tab.lock);
775 if (__predict_false(f->fx_shared))
776 f0 = rb_tree_insert_node(&futex_tab.oa, f);
777 else
778 f0 = rb_tree_insert_node(&futex_tab.va, f);
779 if (f0 == f) {
780 f->fx_on_tree = true;
781 error = 0;
782 } else {
783 KASSERT(atomic_load_relaxed(&f0->fx_refcnt) != 0);
784 KASSERT(f0->fx_on_tree);
785 error = futex_hold(f0);
786 if (error)
787 goto out;
788 }
789 *fp = f0;
790 out: mutex_exit(&futex_tab.lock);
791
792 return error;
793 }
794
795 /*
796 * futex_lookup(uaddr, shared, class, &f)
797 *
798 * Find a futex at the userland pointer uaddr in the current
799 * process's VM space. On success, return the futex in f and
800 * increment its reference count.
801 *
802 * Caller must call futex_rele when done.
803 */
804 static int
805 futex_lookup(int *uaddr, bool shared, uint8_t class, struct futex **fp)
806 {
807 union futex_key fk;
808 struct vmspace *vm = curproc->p_vmspace;
809 vaddr_t va = (vaddr_t)uaddr;
810 int error;
811
812 /*
813 * Reject unaligned user pointers so we don't cross page
814 * boundaries and so atomics will work.
815 */
816 if (__predict_false((va & 3) != 0))
817 return EINVAL;
818
819 /* Look it up. */
820 error = futex_key_init(&fk, vm, va, shared);
821 if (error)
822 return error;
823
824 error = futex_lookup_by_key(&fk, shared, class, fp);
825 futex_key_fini(&fk, shared);
826 if (error)
827 return error;
828
829 KASSERT(*fp == NULL || (*fp)->fx_shared == shared);
830 KASSERT(*fp == NULL || class == FUTEX_CLASS_ANY || (*fp)->fx_class == class);
831 KASSERT(*fp == NULL || atomic_load_relaxed(&(*fp)->fx_refcnt) != 0);
832
833 /*
834 * Success! (Caller must still check whether we found
835 * anything, but nothing went _wrong_ like trying to use
836 * unmapped memory.)
837 */
838 KASSERT(error == 0);
839
840 return error;
841 }
842
843 /*
844 * futex_lookup_create(uaddr, shared, class, &f)
845 *
846 * Find or create a futex at the userland pointer uaddr in the
847 * current process's VM space. On success, return the futex in f
848 * and increment its reference count.
849 *
850 * Caller must call futex_rele when done.
851 */
852 static int
853 futex_lookup_create(int *uaddr, bool shared, uint8_t class, struct futex **fp)
854 {
855 union futex_key fk;
856 struct vmspace *vm = curproc->p_vmspace;
857 struct futex *f = NULL;
858 vaddr_t va = (vaddr_t)uaddr;
859 int error;
860
861 /*
862 * Reject unaligned user pointers so we don't cross page
863 * boundaries and so atomics will work.
864 */
865 if (__predict_false((va & 3) != 0))
866 return EINVAL;
867
868 if (__predict_false(class == FUTEX_CLASS_ANY))
869 return EINVAL;
870
871 error = futex_key_init(&fk, vm, va, shared);
872 if (error)
873 return error;
874
875 /*
876 * Optimistically assume there already is one, and try to find
877 * it.
878 */
879 error = futex_lookup_by_key(&fk, shared, class, fp);
880 if (error || *fp != NULL) {
881 /*
882 * We either found one, or there was an error.
883 * In either case, we are done with the key.
884 */
885 futex_key_fini(&fk, shared);
886 goto out;
887 }
888
889 /*
890 * Create a futex record. This transfers ownership of the key
891 * in all cases.
892 */
893 f = futex_create(&fk, shared, class);
894 if (f == NULL) {
895 error = ENOMEM;
896 goto out;
897 }
898
899 /*
900 * Insert our new futex, or use existing if someone else beat
901 * us to it.
902 */
903 error = futex_insert(f, fp);
904 if (error)
905 goto out;
906 if (*fp == f)
907 f = NULL; /* don't release on exit */
908
909 /* Success! */
910 KASSERT(error == 0);
911
912 out: if (f != NULL)
913 futex_rele(f);
914 KASSERT(error || *fp != NULL);
915 KASSERT(error || atomic_load_relaxed(&(*fp)->fx_refcnt) != 0);
916 return error;
917 }
918
919 /*
920 * futex_unsleep:
921 *
922 * Remove an LWP from the futex and sleep queue. This is called when
923 * the LWP has not been awoken normally but instead interrupted: for
924 * example, when a signal is received. Must be called with the LWP
925 * locked. Will unlock if "unlock" is true.
926 */
927 static void
928 futex_unsleep(lwp_t *l, bool unlock)
929 {
930 struct futex *f __diagused;
931
932 f = (struct futex *)(uintptr_t)l->l_wchan;
933
934 KASSERT(mutex_owned(f->fx_sq_lock));
935
936 /* WRITERQ is more likely */
937 if (__predict_true(l->l_sleepq == &f->fx_sqs[FUTEX_WRITERQ])) {
938 KASSERT(f->fx_nwaiters[FUTEX_WRITERQ] > 0);
939 f->fx_nwaiters[FUTEX_WRITERQ]--;
940 } else {
941 KASSERT(l->l_sleepq == &f->fx_sqs[FUTEX_READERQ]);
942 KASSERT(f->fx_nwaiters[FUTEX_READERQ] > 0);
943 f->fx_nwaiters[FUTEX_READERQ]--;
944 }
945
946 sleepq_unsleep(l, unlock);
947 }
948
949 /*
950 * futex_wait(f, q, deadline, clkid, bitset)
951 *
952 * Wait on sleep queue q of futex f, for a wakeup matching bitset,
953 * until the deadline on the clock clkid (or forever if NULL).
954 * Return 0 on explicit wakeup, ETIMEDOUT on timeout, EINTR on signal.
955 */
956 static int
957 futex_wait(struct futex *f, int q, const struct timespec *deadline,
958 clockid_t clkid, int bitset)
959 {
960
961 /*
962 * Some things to note about how this function works:
963 *
964 * ==> When we enter futex_wait(), the futex's op lock is
965 * held, preventing us from missing wakes. Once we are on
966 * the futex's sleepq, it is safe to release the op lock.
967 *
968 * ==> We have a single early escape to avoid calling
969 * sleepq_block(): our deadline already having passed.
970 * If this is a no-timeout wait, or if the deadline has
971 * not already passed, we are guaranteed to call sleepq_block().
972 *
973 * ==> sleepq_block() contains all of the necessary logic
974 * for handling signals; we do not need to check for them
975 * ourselves.
976 *
977 * ==> Once we have blocked, other futex operations will
978 * be able to wake us or requeue us. Until we have blocked,
979 * those other futex operations will not be able to acquire
980 * the sleepq locks in order to perform those operations,
981 * even if we have dropped the op lock. Thus, we will not
982 * miss wakes. This fundamental invariant is relied upon by
983 * every other synchronization object in the kernel.
984 *
985 * ==> sleepq_block() returns for one of three reasons:
986 *
987 * -- We were awakened.
988 * -- We were signaled.
989 * -- We timed out.
990 *
991 * ==> Once sleepq_block() returns, we will not call
992 * sleepq_block() again.
993 *
994 * ==> If sleepq_block() returns due to being signaled,
995 * we must check to see if we were also awakened. This
996 * is to handle the following sequence:
997 *
998 * -- Futex wake takes us off sleepq, places us on runq.
999 * -- We are signaled while sitting on the runq.
1000 * -- We run, and sleepq_block() notices the signal and
1001 * returns EINTR/ERESTART.
1002 *
1003 * In this situation, we want to indicate a successful wake
1004 * to the caller, because that wake has been reported to the
1005 * thread that issued it.
1006 *
1007 * Note that it is NOT possible for a wake to be issued after
1008 * a signal; the LWP will already have been taken off the sleepq
1009 * by the signal, so the wake will never happen. Note that for
1010 * this reason, we must *never* return ERESTART, because it could
1011 * result in us waiting again with no one to awaken us.
1012 */
1013
1014 struct lwp * const l = curlwp;
1015 int timo;
1016 int error;
1017
1018 /*
1019 * Compute our timeout before taking any locks.
1020 */
1021 if (deadline) {
1022 struct timespec ts;
1023
1024 /* Check our watch. */
1025 error = clock_gettime1(clkid, &ts);
1026 if (error) {
1027 mutex_exit(&f->fx_op_lock);
1028 return error;
1029 }
1030
1031 /*
1032 * If we're past the deadline, ETIMEDOUT.
1033 */
1034 if (timespeccmp(deadline, &ts, <=)) {
1035 mutex_exit(&f->fx_op_lock);
1036 return ETIMEDOUT;
1037 } else {
1038 /* Count how much time is left. */
1039 timespecsub(deadline, &ts, &ts);
1040 timo = tstohz(&ts);
1041 if (timo == 0) {
1042 /*
1043 * tstohz() already rounds up, and
1044 * a return value of 0 ticks means
1045 * "expired now or in the past".
1046 */
1047 mutex_exit(&f->fx_op_lock);
1048 return ETIMEDOUT;
1049 }
1050 }
1051 } else {
1052 timo = 0;
1053 }
1054
1055 /*
1056 * Acquire in sleepq -> lwp order. While we're at it, ensure
1057 * that there's room for us to block.
1058 */
1059 mutex_spin_enter(f->fx_sq_lock);
1060 if (__predict_false(q == FUTEX_READERQ &&
1061 f->fx_nwaiters[q] == FUTEX_TID_MASK)) {
1062 mutex_spin_exit(f->fx_sq_lock);
1063 mutex_exit(&f->fx_op_lock);
1064 return ENFILE;
1065 }
1066 lwp_lock(l);
1067
1068 /*
1069 * No need for locks here; until we're on a futex's sleepq, these
1070 * values are private to the LWP, and the locks needed to get onto
1071 * the sleepq provide the memory barriers needed to ensure visibility.
1072 */
1073 l->l_futex = f;
1074 l->l_futex_wakesel = bitset;
1075
1076 /*
1077 * This is an inlined bit of sleepq_enter();
1078 * we already hold the lwp lock.
1079 */
1080 lwp_unlock_to(l, f->fx_sq_lock);
1081 KERNEL_UNLOCK_ALL(NULL, &l->l_biglocks);
1082 KASSERT(lwp_locked(l, f->fx_sq_lock));
1083
1084 sleepq_enqueue(&f->fx_sqs[q], f, futex_wmesg, &futex_syncobj, true);
1085 f->fx_nwaiters[q]++;
1086
1087 /* We can now safely release the op lock. */
1088 mutex_exit(&f->fx_op_lock);
1089
1090 SDT_PROBE1(futex, wait, sleepq_block, entry, timo);
1091 error = sleepq_block(timo, true);
1092 SDT_PROBE1(futex, wait, sleepq_block, exit, error);
1093
1094 /*
1095 * f is no longer valid; we may have been moved to another
1096 * futex sleepq while we slept.
1097 */
1098
1099 /*
1100 * If something went wrong, we may still have a futex reference.
1101 * Check for that here and drop it if so. We can do this without
1102 * taking any locks because l->l_futex is private to the LWP when
1103 * not on any futex's sleepq, and we are not on any sleepq because
1104 * we are running.
1105 *
1106 * Convert EWOULDBLOCK to ETIMEDOUT in case sleepq_block() returned
1107 * EWOULDBLOCK, and ensure the only other error we return is EINTR.
1108 */
1109 if (error) {
1110 f = l->l_futex;
1111 if (f != NULL) {
1112 SDT_PROBE0(futex, wait, finish, aborted);
1113 l->l_futex = NULL;
1114 futex_rele(f);
1115 } else {
1116 /* Raced with wakeup; report success. */
1117 SDT_PROBE0(futex, wait, finish, wakerace);
1118 error = 0;
1119 }
1120 if (error == EWOULDBLOCK)
1121 error = ETIMEDOUT;
1122 else if (error != 0 && error != ETIMEDOUT)
1123 error = EINTR;
1124 } else {
1125 KASSERT(l->l_futex == NULL);
1126 SDT_PROBE0(futex, wait, finish, normally);
1127 }
1128
1129 return error;
1130 }
1131
1132 /*
1133 * futex_wake(f, q, nwake, f2, q2, nrequeue, bitset)
1134 *
1135 * Wake up to nwake waiters on f matching bitset; then, if f2 is
1136 * provided, move up to nrequeue remaining waiters on f matching
1137 * bitset to f2. Return the number of waiters woken or requeued.
1138 * Caller must hold the op locks of f and f2, if provided.
1139 */
1140 static unsigned
1141 futex_wake(struct futex *f, int q, unsigned int const nwake,
1142 struct futex *f2, int q2, unsigned int const nrequeue,
1143 int bitset)
1144 {
1145 struct lwp *l, *l_next;
1146 unsigned nwoken = 0;
1147 unsigned nrequeued = 0;
1148
1149 KASSERT(mutex_owned(&f->fx_op_lock));
1150 KASSERT(f2 == NULL || mutex_owned(&f2->fx_op_lock));
1151
1152 futex_sq_lock2(f, f2);
1153
1154 /* Wake up to nwake waiters, and count the number woken. */
1155 LIST_FOREACH_SAFE(l, &f->fx_sqs[q], l_sleepchain, l_next) {
1156 if (nwoken == nwake)
1157 break;
1158
1159 KASSERT(l->l_syncobj == &futex_syncobj);
1160 KASSERT(l->l_wchan == (wchan_t)f);
1161 KASSERT(l->l_futex == f);
1162 KASSERT(l->l_sleepq == &f->fx_sqs[q]);
1163 KASSERT(l->l_mutex == f->fx_sq_lock);
1164
1165 if ((l->l_futex_wakesel & bitset) == 0)
1166 continue;
1167
1168 l->l_futex_wakesel = 0;
1169 l->l_futex = NULL;
1170 sleepq_remove(&f->fx_sqs[q], l);
1171 f->fx_nwaiters[q]--;
1172 nwoken++;
1173 /* hold counts adjusted in bulk below */
1174 }
1175
1176 if (nrequeue) {
1177 KASSERT(f2 != NULL);
1178 /* Move up to nrequeue waiters from f's queue to f2's queue. */
1179 LIST_FOREACH_SAFE(l, &f->fx_sqs[q], l_sleepchain, l_next) {
1180 if (nrequeued == nrequeue)
1181 break;
1182
1183 KASSERT(l->l_syncobj == &futex_syncobj);
1184 KASSERT(l->l_wchan == (wchan_t)f);
1185 KASSERT(l->l_futex == f);
1186 KASSERT(l->l_sleepq == &f->fx_sqs[q]);
1187 KASSERT(l->l_mutex == f->fx_sq_lock);
1188
1189 if ((l->l_futex_wakesel & bitset) == 0)
1190 continue;
1191
1192 l->l_futex = f2;
1193 sleepq_transfer(l, &f->fx_sqs[q],
1194 &f2->fx_sqs[q2], f2, futex_wmesg,
1195 &futex_syncobj, f2->fx_sq_lock, true);
1196 f->fx_nwaiters[q]--;
1197 f2->fx_nwaiters[q2]++;
1198 nrequeued++;
1199 /* hold counts adjusted in bulk below */
1200 }
1201 if (nrequeued) {
1202 /* XXX futex_hold() could theoretically fail here. */
1203 int hold_error __diagused =
1204 futex_hold_count(f2, nrequeued);
1205 KASSERT(hold_error == 0);
1206 }
1207 }
1208
1209 /*
1210 * Adjust the futex reference counts for the wakes and
1211 * requeues.
1212 */
1213 KASSERT(nwoken + nrequeued <= LONG_MAX);
1214 futex_rele_count_not_last(f, nwoken + nrequeued);
1215
1216 futex_sq_unlock2(f, f2);
1217
1218 /* Return the number of waiters woken and requeued. */
1219 return nwoken + nrequeued;
1220 }
1221
1222 /*
1223 * futex_op_lock(f)
1224 *
1225 * Acquire the op lock of f. Pair with futex_op_unlock. Do
1226 * not use if caller needs to acquire two locks; use
1227 * futex_op_lock2 instead.
1228 */
1229 static void
1230 futex_op_lock(struct futex *f)
1231 {
1232 mutex_enter(&f->fx_op_lock);
1233 }
1234
1235 /*
1236 * futex_op_unlock(f)
1237 *
1238 * Release the op lock of f.
1239 */
1240 static void
1241 futex_op_unlock(struct futex *f)
1242 {
1243 mutex_exit(&f->fx_op_lock);
1244 }
1245
1246 /*
1247 * futex_op_lock2(f, f2)
1248 *
1249 * Acquire the op locks of both f and f2, which may be null, or
1250 * which may be the same. If they are distinct, an arbitrary total
1251 * order is chosen on the locks.
1252 *
1253 * Callers should only ever acquire multiple op locks
1254 * simultaneously using futex_op_lock2.
1255 */
1256 static void
1257 futex_op_lock2(struct futex *f, struct futex *f2)
1258 {
1259
1260 /*
1261 * If both are null, do nothing; if one is null and the other
1262 * is not, lock the other and be done with it.
1263 */
1264 if (f == NULL && f2 == NULL) {
1265 return;
1266 } else if (f == NULL) {
1267 mutex_enter(&f2->fx_op_lock);
1268 return;
1269 } else if (f2 == NULL) {
1270 mutex_enter(&f->fx_op_lock);
1271 return;
1272 }
1273
1274 /* If both futexes are the same, acquire only one. */
1275 if (f == f2) {
1276 mutex_enter(&f->fx_op_lock);
1277 return;
1278 }
1279
1280 /* Otherwise, use the ordering on the kva of the futex pointer. */
1281 if ((uintptr_t)f < (uintptr_t)f2) {
1282 mutex_enter(&f->fx_op_lock);
1283 mutex_enter(&f2->fx_op_lock);
1284 } else {
1285 mutex_enter(&f2->fx_op_lock);
1286 mutex_enter(&f->fx_op_lock);
1287 }
1288 }
1289
1290 /*
1291 * futex_op_unlock2(f, f2)
1292 *
1293 * Release the op locks of both f and f2, which may be null, or
1294 * which may be the same.
1295 */
1296 static void
1297 futex_op_unlock2(struct futex *f, struct futex *f2)
1298 {
1299
1300 /*
1301 * If both are null, do nothing; if one is null and the other
1302 * is not, unlock the other and be done with it.
1303 */
1304 if (f == NULL && f2 == NULL) {
1305 return;
1306 } else if (f == NULL) {
1307 mutex_exit(&f2->fx_op_lock);
1308 return;
1309 } else if (f2 == NULL) {
1310 mutex_exit(&f->fx_op_lock);
1311 return;
1312 }
1313
1314 /* If both futexes are the same, release only one. */
1315 if (f == f2) {
1316 mutex_exit(&f->fx_op_lock);
1317 return;
1318 }
1319
1320 /* Otherwise, use the ordering on the kva of the futex pointer. */
1321 if ((uintptr_t)f < (uintptr_t)f2) {
1322 mutex_exit(&f2->fx_op_lock);
1323 mutex_exit(&f->fx_op_lock);
1324 } else {
1325 mutex_exit(&f->fx_op_lock);
1326 mutex_exit(&f2->fx_op_lock);
1327 }
1328 }
1329
1330 /*
1331 * futex_func_wait(uaddr, val, val3, timeout, clkid, clkflags, retval)
1332 *
1333 * Implement futex(FUTEX_WAIT) and futex(FUTEX_WAIT_BITSET).
1334 */
1335 static int
1336 futex_func_wait(bool shared, int *uaddr, int val, int val3,
1337 const struct timespec *timeout, clockid_t clkid, int clkflags,
1338 register_t *retval)
1339 {
1340 struct futex *f;
1341 struct timespec ts;
1342 const struct timespec *deadline;
1343 int error;
1344
1345 /*
1346 * If there's nothing to wait for, and nobody will ever wake
1347 * us, then don't set anything up to wait -- just stop here.
1348 */
1349 if (val3 == 0)
1350 return EINVAL;
1351
1352 /* Optimistically test before anything else. */
1353 if (!futex_test(uaddr, val))
1354 return EAGAIN;
1355
1356 /* Determine a deadline on the specified clock. */
1357 if (timeout == NULL) {
1358 deadline = timeout;
1359 } else if ((clkflags & TIMER_ABSTIME) == TIMER_ABSTIME) {
1360 /* Sanity-check the deadline. */
1361 if (timeout->tv_sec < 0 ||
1362 timeout->tv_nsec < 0 ||
1363 timeout->tv_nsec >= 1000000000L) {
1364 return EINVAL;
1365 }
1366 deadline = timeout;
1367 } else {
1368 struct timespec interval = *timeout;
1369
1370 error = itimespecfix(&interval);
1371 if (error)
1372 return error;
1373 error = clock_gettime1(clkid, &ts);
1374 if (error)
1375 return error;
1376 timespecadd(&ts, &interval, &ts);
1377 deadline = &ts;
1378 }
1379
1380 /* Get the futex, creating it if necessary. */
1381 error = futex_lookup_create(uaddr, shared, FUTEX_CLASS_NORMAL, &f);
1382 if (error)
1383 return error;
1384 KASSERT(f);
1385
1386 /*
1387 * Under the op lock, check the value again: if it has
1388 * already changed, EAGAIN; otherwise enqueue the waiter.
1389 * Since FUTEX_WAKE will use the same lock and be done after
1390 * modifying the value, the order in which we check and enqueue
1391 * is immaterial.
1392 */
1393 futex_op_lock(f);
1394 if (!futex_test(uaddr, val)) {
1395 futex_op_unlock(f);
1396 error = EAGAIN;
1397 goto out;
1398 }
1399
1400 /*
1401 * Now wait. futex_wait() will drop our op lock once we
1402 * are entered into the sleep queue, thus ensuring atomicity
1403 * of wakes with respect to waits.
1404 */
1405 error = futex_wait(f, FUTEX_WRITERQ, deadline, clkid, val3);
1406
1407 /*
1408 * We cannot drop our reference to the futex here, because
1409 * we might be enqueued on a different one when we are awakened.
1410 * The references will be managed on our behalf in the requeue,
1411 * wake, and error cases.
1412 */
1413 f = NULL;
1414
1415 if (__predict_true(error == 0)) {
1416 /* Return 0 on success, error on failure. */
1417 *retval = 0;
1418 }
1419
1420 out: if (f != NULL)
1421 futex_rele(f);
1422 return error;
1423 }
1424
1425 /*
1426 * futex_func_wake(uaddr, val, val3, retval)
1427 *
1428 * Implement futex(FUTEX_WAKE) and futex(FUTEX_WAKE_BITSET).
1429 */
1430 static int
1431 futex_func_wake(bool shared, int *uaddr, int val, int val3, register_t *retval)
1432 {
1433 struct futex *f;
1434 unsigned int nwoken = 0;
1435 int error = 0;
1436
1437 /* Reject negative number of wakeups. */
1438 if (val < 0) {
1439 error = EINVAL;
1440 goto out;
1441 }
1442
1443 /* Look up the futex, if any. */
1444 error = futex_lookup(uaddr, shared, FUTEX_CLASS_NORMAL, &f);
1445 if (error)
1446 goto out;
1447
1448 /* If there's no futex, there are no waiters to wake. */
1449 if (f == NULL)
1450 goto out;
1451
1452 /*
1453 * Under f's op lock, wake the waiters and remember the
1454 * number woken.
1455 */
1456 futex_op_lock(f);
1457 nwoken = futex_wake(f, FUTEX_WRITERQ, val,
1458 NULL, FUTEX_WRITERQ, 0, val3);
1459 futex_op_unlock(f);
1460
1461 /* Release the futex. */
1462 futex_rele(f);
1463
1464 out:
1465 /* Return the number of waiters woken. */
1466 *retval = nwoken;
1467
1468 /* Success! */
1469 return error;
1470 }
1471
1472 /*
1473 * futex_func_requeue(op, uaddr, val, uaddr2, val2, val3, retval)
1474 *
1475 * Implement futex(FUTEX_REQUEUE) and futex(FUTEX_CMP_REQUEUE).
1476 */
1477 static int
1478 futex_func_requeue(bool shared, int op, int *uaddr, int val, int *uaddr2,
1479 int val2, int val3, register_t *retval)
1480 {
1481 struct futex *f = NULL, *f2 = NULL;
1482 unsigned nwoken = 0; /* default to zero woken on early return */
1483 int error;
1484
1485 /* Reject negative number of wakeups or requeues. */
1486 if (val < 0 || val2 < 0) {
1487 error = EINVAL;
1488 goto out;
1489 }
1490
1491 /* Look up the source futex, if any. */
1492 error = futex_lookup(uaddr, shared, FUTEX_CLASS_NORMAL, &f);
1493 if (error)
1494 goto out;
1495
1496 /* If there is none, nothing to do. */
1497 if (f == NULL)
1498 goto out;
1499
1500 /*
1501 * We may need to create the destination futex because it's
1502 * entirely possible it does not currently have any waiters.
1503 */
1504 error = futex_lookup_create(uaddr2, shared, FUTEX_CLASS_NORMAL, &f2);
1505 if (error)
1506 goto out;
1507
1508 /*
1509 * Under the futexes' op locks, check the value; if
1510 * unchanged from val3, wake the waiters.
1511 */
1512 futex_op_lock2(f, f2);
1513 if (op == FUTEX_CMP_REQUEUE && !futex_test(uaddr, val3)) {
1514 error = EAGAIN;
1515 } else {
1516 error = 0;
1517 nwoken = futex_wake(f, FUTEX_WRITERQ, val,
1518 f2, FUTEX_WRITERQ, val2,
1519 FUTEX_BITSET_MATCH_ANY);
1520 }
1521 futex_op_unlock2(f, f2);
1522
1523 out:
1524 /* Return the number of waiters woken. */
1525 *retval = nwoken;
1526
1527 /* Release the futexes if we got them. */
1528 if (f2)
1529 futex_rele(f2);
1530 if (f)
1531 futex_rele(f);
1532 return error;
1533 }
1534
1535 /*
1536 * futex_validate_op_cmp(val3)
1537 *
1538 * Validate an op/cmp argument for FUTEX_WAKE_OP.
1539 */
1540 static int
1541 futex_validate_op_cmp(int val3)
1542 {
1543 int op = __SHIFTOUT(val3, FUTEX_OP_OP_MASK);
1544 int cmp = __SHIFTOUT(val3, FUTEX_OP_CMP_MASK);
1545
1546 if (op & FUTEX_OP_OPARG_SHIFT) {
1547 int oparg = __SHIFTOUT(val3, FUTEX_OP_OPARG_MASK);
1548 if (oparg < 0)
1549 return EINVAL;
1550 if (oparg >= 32)
1551 return EINVAL;
1552 op &= ~FUTEX_OP_OPARG_SHIFT;
1553 }
1554
1555 switch (op) {
1556 case FUTEX_OP_SET:
1557 case FUTEX_OP_ADD:
1558 case FUTEX_OP_OR:
1559 case FUTEX_OP_ANDN:
1560 case FUTEX_OP_XOR:
1561 break;
1562 default:
1563 return EINVAL;
1564 }
1565
1566 switch (cmp) {
1567 case FUTEX_OP_CMP_EQ:
1568 case FUTEX_OP_CMP_NE:
1569 case FUTEX_OP_CMP_LT:
1570 case FUTEX_OP_CMP_LE:
1571 case FUTEX_OP_CMP_GT:
1572 case FUTEX_OP_CMP_GE:
1573 break;
1574 default:
1575 return EINVAL;
1576 }
1577
1578 return 0;
1579 }
1580
1581 /*
1582 * futex_compute_op(oldval, val3)
1583 *
1584 * Apply a FUTEX_WAKE_OP operation to oldval.
1585 */
1586 static int
1587 futex_compute_op(int oldval, int val3)
1588 {
1589 int op = __SHIFTOUT(val3, FUTEX_OP_OP_MASK);
1590 int oparg = __SHIFTOUT(val3, FUTEX_OP_OPARG_MASK);
1591
1592 if (op & FUTEX_OP_OPARG_SHIFT) {
1593 KASSERT(oparg >= 0);
1594 KASSERT(oparg < 32);
1595 oparg = 1u << oparg;
1596 op &= ~FUTEX_OP_OPARG_SHIFT;
1597 }
1598
1599 switch (op) {
1600 case FUTEX_OP_SET:
1601 return oparg;
1602
1603 case FUTEX_OP_ADD:
1604 /*
1605 * Avoid signed arithmetic overflow by doing
1606 * arithmetic unsigned and converting back to signed
1607 * at the end.
1608 */
1609 return (int)((unsigned)oldval + (unsigned)oparg);
1610
1611 case FUTEX_OP_OR:
1612 return oldval | oparg;
1613
1614 case FUTEX_OP_ANDN:
1615 return oldval & ~oparg;
1616
1617 case FUTEX_OP_XOR:
1618 return oldval ^ oparg;
1619
1620 default:
1621 panic("invalid futex op");
1622 }
1623 }
1624
1625 /*
1626 * futex_compute_cmp(oldval, val3)
1627 *
1628 * Apply a FUTEX_WAKE_OP comparison to oldval.
1629 */
1630 static bool
1631 futex_compute_cmp(int oldval, int val3)
1632 {
1633 int cmp = __SHIFTOUT(val3, FUTEX_OP_CMP_MASK);
1634 int cmparg = __SHIFTOUT(val3, FUTEX_OP_CMPARG_MASK);
1635
1636 switch (cmp) {
1637 case FUTEX_OP_CMP_EQ:
1638 return (oldval == cmparg);
1639
1640 case FUTEX_OP_CMP_NE:
1641 return (oldval != cmparg);
1642
1643 case FUTEX_OP_CMP_LT:
1644 return (oldval < cmparg);
1645
1646 case FUTEX_OP_CMP_LE:
1647 return (oldval <= cmparg);
1648
1649 case FUTEX_OP_CMP_GT:
1650 return (oldval > cmparg);
1651
1652 case FUTEX_OP_CMP_GE:
1653 return (oldval >= cmparg);
1654
1655 default:
1656 panic("invalid futex cmp operation");
1657 }
1658 }
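
/*
 * Worked example (illustration only; not used by the kernel): how a
 * FUTEX_WAKE_OP val3 encoding drives the two helpers above.  The
 * masks are the same ones decoded with __SHIFTOUT(); __SHIFTIN() is
 * its inverse.
 *
 * // "Set *uaddr2 to 0, and wake waiters on uaddr2 if the old
 * // value was greater than 0."
 * int val3 = __SHIFTIN(FUTEX_OP_SET, FUTEX_OP_OP_MASK) |
 *     __SHIFTIN(0, FUTEX_OP_OPARG_MASK) |
 *     __SHIFTIN(FUTEX_OP_CMP_GT, FUTEX_OP_CMP_MASK) |
 *     __SHIFTIN(0, FUTEX_OP_CMPARG_MASK);
 *
 * // With oldval == 3:
 * //   futex_compute_op(3, val3)  == 0    (new value stored at uaddr2)
 * //   futex_compute_cmp(3, val3) == true (3 > 0, so uaddr2 is woken)
 */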
1659
1660 /*
1661 * futex_func_wake_op(uaddr, val, uaddr2, val2, val3, retval)
1662 *
1663 * Implement futex(FUTEX_WAKE_OP).
1664 */
1665 static int
1666 futex_func_wake_op(bool shared, int *uaddr, int val, int *uaddr2, int val2,
1667 int val3, register_t *retval)
1668 {
1669 struct futex *f = NULL, *f2 = NULL;
1670 int oldval, newval, actual;
1671 unsigned nwoken = 0;
1672 int error;
1673
1674 /* Reject negative number of wakeups. */
1675 if (val < 0 || val2 < 0) {
1676 error = EINVAL;
1677 goto out;
1678 }
1679
1680 /* Reject invalid operations before we start doing things. */
1681 if ((error = futex_validate_op_cmp(val3)) != 0)
1682 goto out;
1683
1684 /* Look up the first futex, if any. */
1685 error = futex_lookup(uaddr, shared, FUTEX_CLASS_NORMAL, &f);
1686 if (error)
1687 goto out;
1688
1689 /* Look up the second futex, if any. */
1690 error = futex_lookup(uaddr2, shared, FUTEX_CLASS_NORMAL, &f2);
1691 if (error)
1692 goto out;
1693
1694 /*
1695 * Under the op locks:
1696 *
1697 * 1. Read/modify/write: *uaddr2 op= oparg.
1698 * 2. Unconditionally wake uaddr.
1699 * 3. Conditionally wake uaddr2, if it previously matched val3.
1700 */
1701 futex_op_lock2(f, f2);
1702 do {
1703 error = futex_load(uaddr2, &oldval);
1704 if (error)
1705 goto out_unlock;
1706 newval = futex_compute_op(oldval, val3);
1707 error = ucas_int(uaddr2, oldval, newval, &actual);
1708 if (error)
1709 goto out_unlock;
1710 } while (actual != oldval);
1711 nwoken = (f ? futex_wake(f, FUTEX_WRITERQ, val,
1712 NULL, FUTEX_WRITERQ, 0,
1713 FUTEX_BITSET_MATCH_ANY) : 0);
1714 if (f2 && futex_compute_cmp(oldval, val3))
1715 nwoken += futex_wake(f2, FUTEX_WRITERQ, val2,
1716 NULL, FUTEX_WRITERQ, 0,
1717 FUTEX_BITSET_MATCH_ANY);
1718
1719 /* Success! */
1720 error = 0;
1721 out_unlock:
1722 futex_op_unlock2(f, f2);
1723
1724 out:
1725 /* Return the number of waiters woken. */
1726 *retval = nwoken;
1727
1728 /* Release the futexes, if we got them. */
1729 if (f2)
1730 futex_rele(f2);
1731 if (f)
1732 futex_rele(f);
1733 return error;
1734 }
1735
1736 /*
1737 * do_futex(uaddr, op, val, timeout, uaddr2, val2, val3)
1738 *
1739 * Implement the futex system call with all the parameters
1740 * parsed out.
1741 */
1742 int
1743 do_futex(int *uaddr, int op, int val, const struct timespec *timeout,
1744 int *uaddr2, int val2, int val3, register_t *retval)
1745 {
1746 const bool shared = (op & FUTEX_PRIVATE_FLAG) ? false : true;
1747 const clockid_t clkid = (op & FUTEX_CLOCK_REALTIME) ? CLOCK_REALTIME
1748 : CLOCK_MONOTONIC;
1749 int error;
1750
1751 op &= FUTEX_CMD_MASK;
1752
1753 switch (op) {
1754 case FUTEX_WAIT:
1755 SDT_PROBE6(futex, func, wait, entry,
1756 uaddr, val, FUTEX_BITSET_MATCH_ANY, timeout,
1757 TIMER_RELTIME, op & ~FUTEX_CMD_MASK);
1758 error = futex_func_wait(shared, uaddr, val,
1759 FUTEX_BITSET_MATCH_ANY, timeout, clkid, TIMER_RELTIME,
1760 retval);
1761 SDT_PROBE1(futex, func, wait, exit, error);
1762 break;
1763
1764 case FUTEX_WAIT_BITSET:
1765 SDT_PROBE6(futex, func, wait, entry,
1766 uaddr, val, val3, timeout,
1767 TIMER_ABSTIME, op & ~FUTEX_CMD_MASK);
1768 error = futex_func_wait(shared, uaddr, val, val3, timeout,
1769 clkid, TIMER_ABSTIME, retval);
1770 SDT_PROBE1(futex, func, wait, exit, error);
1771 break;
1772
1773 case FUTEX_WAKE:
1774 SDT_PROBE4(futex, func, wake, entry,
1775 uaddr, val, FUTEX_BITSET_MATCH_ANY, op & ~FUTEX_CMD_MASK);
1776 error = futex_func_wake(shared, uaddr, val,
1777 FUTEX_BITSET_MATCH_ANY, retval);
1778 SDT_PROBE2(futex, func, wake, exit, error, *retval);
1779 break;
1780
1781 case FUTEX_WAKE_BITSET:
1782 SDT_PROBE4(futex, func, wake, entry,
1783 uaddr, val, val3, op & ~FUTEX_CMD_MASK);
1784 error = futex_func_wake(shared, uaddr, val, val3, retval);
1785 SDT_PROBE2(futex, func, wake, exit, error, *retval);
1786 break;
1787
1788 case FUTEX_REQUEUE:
1789 SDT_PROBE5(futex, func, requeue, entry,
1790 uaddr, val, uaddr2, val2, op & ~FUTEX_CMD_MASK);
1791 error = futex_func_requeue(shared, op, uaddr, val, uaddr2,
1792 val2, 0, retval);
1793 SDT_PROBE2(futex, func, requeue, exit, error, *retval);
1794 break;
1795
1796 case FUTEX_CMP_REQUEUE:
1797 SDT_PROBE6(futex, func, cmp_requeue, entry,
1798 uaddr, val, uaddr2, val2, val3, op & ~FUTEX_CMD_MASK);
1799 error = futex_func_requeue(shared, op, uaddr, val, uaddr2,
1800 val2, val3, retval);
1801 SDT_PROBE2(futex, func, cmp_requeue, exit, error, *retval);
1802 break;
1803
1804 case FUTEX_WAKE_OP:
1805 SDT_PROBE6(futex, func, wake_op, entry,
1806 uaddr, val, uaddr2, val2, val3, op & ~FUTEX_CMD_MASK);
1807 error = futex_func_wake_op(shared, uaddr, val, uaddr2, val2,
1808 val3, retval);
1809 SDT_PROBE2(futex, func, wake_op, exit, error, *retval);
1810 break;
1811
1812 case FUTEX_FD:
1813 default:
1814 error = ENOSYS;
1815 break;
1816 }
1817
1818 return error;
1819 }
1820
1821 /*
1822 * sys___futex(l, uap, retval)
1823 *
1824 * __futex(2) system call: generic futex operations.
1825 */
1826 int
1827 sys___futex(struct lwp *l, const struct sys___futex_args *uap,
1828 register_t *retval)
1829 {
1830 /* {
1831 syscallarg(int *) uaddr;
1832 syscallarg(int) op;
1833 syscallarg(int) val;
1834 syscallarg(const struct timespec *) timeout;
1835 syscallarg(int *) uaddr2;
1836 syscallarg(int) val2;
1837 syscallarg(int) val3;
1838 } */
1839 struct timespec ts, *tsp;
1840 int error;
1841
1842 /*
1843 * Copy in the timeout argument, if specified.
1844 */
1845 if (SCARG(uap, timeout)) {
1846 error = copyin(SCARG(uap, timeout), &ts, sizeof(ts));
1847 if (error)
1848 return error;
1849 tsp = &ts;
1850 } else {
1851 tsp = NULL;
1852 }
1853
1854 return do_futex(SCARG(uap, uaddr), SCARG(uap, op), SCARG(uap, val),
1855 tsp, SCARG(uap, uaddr2), SCARG(uap, val2), SCARG(uap, val3),
1856 retval);
1857 }
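
/*
 * A hedged userland sketch (not part of this file): invoking the
 * system call directly via syscall(2), assuming the SYS___futex
 * number from <sys/syscall.h>.  Arguments follow the order of the
 * uap structure above: uaddr, op, val, timeout, uaddr2, val2, val3.
 *
 * // Sleep only if *word still equals `expected` (EAGAIN otherwise);
 * // private futex, no timeout.
 * syscall(SYS___futex, word, FUTEX_WAIT | FUTEX_PRIVATE_FLAG,
 *     expected, NULL, NULL, 0, 0);
 *
 * // Later, wake one waiter on word.
 * syscall(SYS___futex, word, FUTEX_WAKE | FUTEX_PRIVATE_FLAG,
 *     1, NULL, NULL, 0, 0);
 */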
1858
1859 /*
1860 * sys___futex_set_robust_list(l, uap, retval)
1861 *
1862 * __futex_set_robust_list(2) system call for robust futexes.
1863 */
1864 int
1865 sys___futex_set_robust_list(struct lwp *l,
1866 const struct sys___futex_set_robust_list_args *uap, register_t *retval)
1867 {
1868 /* {
1869 syscallarg(void *) head;
1870 syscallarg(size_t) len;
1871 } */
1872 void *head = SCARG(uap, head);
1873
1874 if (SCARG(uap, len) != _FUTEX_ROBUST_HEAD_SIZE)
1875 return EINVAL;
1876 if ((uintptr_t)head % sizeof(u_long))
1877 return EINVAL;
1878
1879 l->l_robust_head = (uintptr_t)head;
1880
1881 return 0;
1882 }
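
/*
 * For orientation (a sketch, not a definition): the head registered
 * above is read back by futex_release_all_lwp() below as
 * _FUTEX_ROBUST_HEAD_NWORDS words -- a pointer to a circular list of
 * entries, a signed offset from each entry to its futex word, and a
 * "pending" pointer naming an entry currently being acquired or
 * released.  A hypothetical userland view consistent with that walker
 * (the real definitions live in the futex headers):
 *
 * struct robust_head {
 *     void *list;    // circular list of held robust futexes
 *     long  offset;  // entry address + offset == futex word
 *     void *pending; // entry being acquired/released, or NULL
 * };
 */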
1883
1884 /*
1885 * sys___futex_get_robust_list(l, uap, retval)
1886 *
1887 * __futex_get_robust_list(2) system call for robust futexes.
1888 */
1889 int
1890 sys___futex_get_robust_list(struct lwp *l,
1891 const struct sys___futex_get_robust_list_args *uap, register_t *retval)
1892 {
1893 /* {
1894 syscallarg(lwpid_t) lwpid;
1895 syscallarg(void **) headp;
1896 syscallarg(size_t *) lenp;
1897 } */
1898 void *head;
1899 const size_t len = _FUTEX_ROBUST_HEAD_SIZE;
1900 int error;
1901
1902 error = futex_robust_head_lookup(l, SCARG(uap, lwpid), &head);
1903 if (error)
1904 return error;
1905
1906 /* Copy out the head pointer and the head structure length. */
1907 error = copyout(&head, SCARG(uap, headp), sizeof(head));
1908 if (__predict_true(error == 0)) {
1909 error = copyout(&len, SCARG(uap, lenp), sizeof(len));
1910 }
1911
1912 return error;
1913 }
1914
1915 /*
1916 * release_futex(uva, tid)
1917 *
1918 * Try to release the robust futex at uva in the current process
1919 * on lwp exit. If anything goes wrong, silently fail. It is the
1920 * userland program's obligation to arrange correct behaviour.
1921 */
1922 static void
1923 release_futex(uintptr_t const uptr, lwpid_t const tid, bool const is_pi,
1924 bool const is_pending)
1925 {
1926 int *uaddr;
1927 struct futex *f;
1928 int oldval, newval, actual;
1929 int error;
1930 bool shared;
1931
1932 /* If it's misaligned, tough. */
1933 if (__predict_false(uptr & 3))
1934 return;
1935 uaddr = (int *)uptr;
1936
1937 error = futex_load(uaddr, &oldval);
1938 if (__predict_false(error))
1939 return;
1940
1941 /*
1942 * There are two race conditions we need to handle here:
1943 *
1944 * 1. User space cleared the futex word but died before
1945 * being able to issue the wakeup. No wakeups will
1946 * ever be issued, oops!
1947 *
1948 * 2. Awakened waiter died before being able to acquire
1949 * the futex in user space. Any other waiters are
1950 * now stuck, oops!
1951 *
1952 * In both of these cases, the futex word will be 0 (because
1953 * it's updated before the wake is issued). The best we can
1954 * do is detect this situation if it's the pending futex and
1955 * issue a wake without modifying the futex word.
1956 *
1957 * XXX eventual PI handling?
1958 */
1959 if (__predict_false(is_pending && (oldval & ~FUTEX_WAITERS) == 0)) {
1960 register_t retval;
1961 error = futex_func_wake(/*shared*/true, uaddr, 1,
1962 FUTEX_BITSET_MATCH_ANY, &retval);
1963 if (error != 0 || retval == 0)
1964 (void) futex_func_wake(/*shared*/false, uaddr, 1,
1965 FUTEX_BITSET_MATCH_ANY, &retval);
1966 return;
1967 }
1968
1969 /* Optimistically test whether we need to do anything at all. */
1970 if ((oldval & FUTEX_TID_MASK) != tid)
1971 return;
1972
1973 /*
1974 * We need to handle the case where this thread owned the futex,
1975 * but it was uncontended. In this case, there won't be any
1976 * kernel state to look up. All we can do is mark the futex
1977 * as a zombie to be mopped up the next time another thread
1978 * attempts to acquire it.
1979 *
1980 * N.B. It's important to ensure to set FUTEX_OWNER_DIED in
1981 * this loop, even if waiters appear while we're doing
1982 * so. This is because FUTEX_WAITERS is set by user space
1983 * before calling __futex() to wait, and the futex needs
1984 * to be marked as a zombie when the new waiter gets into
1985 * the kernel.
1986 */
1987 if ((oldval & FUTEX_WAITERS) == 0) {
1988 do {
1989 error = futex_load(uaddr, &oldval);
1990 if (error)
1991 return;
1992 if ((oldval & FUTEX_TID_MASK) != tid)
1993 return;
1994 newval = oldval | FUTEX_OWNER_DIED;
1995 error = ucas_int(uaddr, oldval, newval, &actual);
1996 if (error)
1997 return;
1998 } while (actual != oldval);
1999
2000 /*
2001 * If there is still no indication of waiters, then there is
2002 * no more work for us to do.
2003 */
2004 if ((oldval & FUTEX_WAITERS) == 0)
2005 return;
2006 }
2007
2008 /*
2009 * Look for a futex. Try shared first, then private. If we
2010 * can't find one, tough.
2011 *
2012 * Note: the assumption here is that anyone placing a futex on
2013 * the robust list is adhering to the rules, regardless of the
2014 * futex class.
2015 */
2016 for (f = NULL, shared = true; f == NULL; shared = false) {
2017 error = futex_lookup(uaddr, shared, FUTEX_CLASS_ANY, &f);
2018 if (error)
2019 return;
2020 if (f == NULL && shared == false)
2021 return;
2022 }
2023
2024 /* Work under the futex op lock. */
2025 futex_op_lock(f);
2026
2027 /*
2028 * Fetch the word: if the tid doesn't match ours, skip;
2029 * otherwise, set the owner-died bit, atomically.
2030 */
2031 do {
2032 error = futex_load(uaddr, &oldval);
2033 if (error)
2034 goto out;
2035 if ((oldval & FUTEX_TID_MASK) != tid)
2036 goto out;
2037 newval = oldval | FUTEX_OWNER_DIED;
2038 error = ucas_int(uaddr, oldval, newval, &actual);
2039 if (error)
2040 goto out;
2041 } while (actual != oldval);
2042
2043 /*
2044 * If there may be waiters, try to wake one. If anything goes
2045 * wrong, tough.
2046 *
2047 * XXX eventual PI handling?
2048 */
2049 if (oldval & FUTEX_WAITERS) {
2050 (void)futex_wake(f, FUTEX_WRITERQ, 1,
2051 NULL, FUTEX_WRITERQ, 0,
2052 FUTEX_BITSET_MATCH_ANY);
2053 }
2054
2055 /* Unlock the queue and release the futex. */
2056 out: futex_op_unlock(f);
2057 futex_rele(f);
2058 }
2059
2060 /*
2061 * futex_robust_head_lookup(l, lwpid)
2062 *
2063 * Helper function to look up a robust head by LWP ID.
2064 */
2065 int
2066 futex_robust_head_lookup(struct lwp *l, lwpid_t lwpid, void **headp)
2067 {
2068 struct proc *p = l->l_proc;
2069
2070 /* Find the other lwp, if requested; otherwise use our robust head. */
2071 if (lwpid) {
2072 mutex_enter(p->p_lock);
2073 l = lwp_find(p, lwpid);
2074 if (l == NULL) {
2075 mutex_exit(p->p_lock);
2076 return ESRCH;
2077 }
2078 *headp = (void *)l->l_robust_head;
2079 mutex_exit(p->p_lock);
2080 } else {
2081 *headp = (void *)l->l_robust_head;
2082 }
2083 return 0;
2084 }
2085
2086 /*
2087 * futex_fetch_robust_head(uaddr)
2088 *
2089 * Helper routine to fetch the futex robust list head that
2090 * handles 32-bit binaries running on 64-bit kernels.
2091 */
2092 static int
2093 futex_fetch_robust_head(uintptr_t uaddr, u_long *rhead)
2094 {
2095 #ifdef _LP64
2096 if (curproc->p_flag & PK_32) {
2097 uint32_t rhead32[_FUTEX_ROBUST_HEAD_NWORDS];
2098 int error;
2099
2100 error = copyin((void *)uaddr, rhead32, sizeof(rhead32));
2101 if (__predict_true(error == 0)) {
2102 for (int i = 0; i < _FUTEX_ROBUST_HEAD_NWORDS; i++) {
2103 if (i == _FUTEX_ROBUST_HEAD_OFFSET) {
2104 /*
2105 * Make sure the offset is sign-
2106 * extended.
2107 */
2108 rhead[i] = (int32_t)rhead32[i];
2109 } else {
2110 rhead[i] = rhead32[i];
2111 }
2112 }
2113 }
2114 return error;
2115 }
2116 #endif /* _LP64 */
2117
2118 return copyin((void *)uaddr, rhead,
2119 sizeof(*rhead) * _FUTEX_ROBUST_HEAD_NWORDS);
2120 }
2121
2122 /*
2123 * futex_decode_robust_word(word)
2124 *
2125 * Decode a robust futex list word into the entry and entry
2126 * properties.
2127 */
2128 static inline void
2129 futex_decode_robust_word(uintptr_t const word, uintptr_t * const entry,
2130 bool * const is_pi)
2131 {
2132 *is_pi = (word & _FUTEX_ROBUST_ENTRY_PI) ? true : false;
2133 *entry = word & ~_FUTEX_ROBUST_ENTRY_PI;
2134 }
2135
2136 /*
2137 * futex_fetch_robust_entry(uaddr)
2138 *
2139 * Helper routine to fetch and decode a robust futex entry
2140 * that handles 32-bit binaries running on 64-bit kernels.
2141 */
2142 static int
2143 futex_fetch_robust_entry(uintptr_t const uaddr, uintptr_t * const valp,
2144 bool * const is_pi)
2145 {
2146 uintptr_t val = 0;
2147 int error = 0;
2148
2149 #ifdef _LP64
2150 if (curproc->p_flag & PK_32) {
2151 uint32_t val32;
2152
2153 error = ufetch_32((uint32_t *)uaddr, &val32);
2154 if (__predict_true(error == 0))
2155 val = val32;
2156 } else
2157 #endif /* _LP64 */
2158 error = ufetch_long((u_long *)uaddr, (u_long *)&val);
2159 if (__predict_false(error))
2160 return error;
2161
2162 futex_decode_robust_word(val, valp, is_pi);
2163 return 0;
2164 }
2165
2166 /*
2167 * futex_release_all_lwp(l, tid)
2168 *
2169 * Release all l's robust futexes. If anything looks funny in
2170 * the process, give up -- it's userland's responsibility to dot
2171 * the i's and cross the t's.
2172 */
2173 void
2174 futex_release_all_lwp(struct lwp * const l, lwpid_t const tid)
2175 {
2176 u_long rhead[_FUTEX_ROBUST_HEAD_NWORDS];
2177 int limit = 1000000;
2178 int error;
2179
2180 /* If there's no robust list there's nothing to do. */
2181 if (l->l_robust_head == 0)
2182 return;
2183
2184 /* Read the final snapshot of the robust list head. */
2185 error = futex_fetch_robust_head(l->l_robust_head, rhead);
2186 if (error) {
2187 printf("WARNING: pid %jd (%s) lwp %jd tid %jd:"
2188 " unmapped robust futex list head\n",
2189 (uintmax_t)l->l_proc->p_pid, l->l_proc->p_comm,
2190 (uintmax_t)l->l_lid, (uintmax_t)tid);
2191 return;
2192 }
2193
2194 const long offset = (long)rhead[_FUTEX_ROBUST_HEAD_OFFSET];
2195
2196 uintptr_t next, pending;
2197 bool is_pi, pending_is_pi;
2198
2199 futex_decode_robust_word(rhead[_FUTEX_ROBUST_HEAD_LIST],
2200 &next, &is_pi);
2201 futex_decode_robust_word(rhead[_FUTEX_ROBUST_HEAD_PENDING],
2202 &pending, &pending_is_pi);
2203
2204 /*
2205 * Walk down the list of locked futexes and release them, up
2206 * to one million of them before we give up.
2207 */
2208
2209 while (next != l->l_robust_head && limit-- > 0) {
2210 /* pending handled below. */
2211 if (next != pending)
2212 release_futex(next + offset, tid, is_pi, false);
2213 error = futex_fetch_robust_entry(next, &next, &is_pi);
2214 if (error)
2215 break;
2216 preempt_point();
2217 }
2218 if (limit <= 0) {
2219 printf("WARNING: pid %jd (%s) lwp %jd tid %jd:"
2220 " exhausted robust futex limit\n",
2221 (uintmax_t)l->l_proc->p_pid, l->l_proc->p_comm,
2222 (uintmax_t)l->l_lid, (uintmax_t)tid);
2223 }
2224
2225 /* If there's a pending futex, it may need to be released too. */
2226 if (pending != 0) {
2227 release_futex(pending + offset, tid, pending_is_pi, true);
2228 }
2229 }
2230