1 /* $NetBSD: sys_futex.c,v 1.12.4.2 2021/08/05 23:23:50 thorpej Exp $ */
2
3 /*-
4 * Copyright (c) 2018, 2019, 2020 The NetBSD Foundation, Inc.
5 * All rights reserved.
6 *
7 * This code is derived from software contributed to The NetBSD Foundation
8 * by Taylor R. Campbell and Jason R. Thorpe.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 * POSSIBILITY OF SUCH DAMAGE.
30 */
31
32 #include <sys/cdefs.h>
33 __KERNEL_RCSID(0, "$NetBSD: sys_futex.c,v 1.12.4.2 2021/08/05 23:23:50 thorpej Exp $");
34
35 /*
36 * Futexes
37 *
38 * The futex system call coordinates notifying threads waiting for
39 * changes on a 32-bit word of memory. The word can be managed by
40 * CPU atomic operations in userland, without system calls, as long
41 * as there is no contention.
42 *
43 * The simplest use case demonstrating the utility is:
44 *
45 * // 32-bit word of memory shared among threads or
46 * // processes in userland. lock & 1 means owned;
47 * // lock & 2 means there are waiters waiting.
48 * volatile int lock = 0;
49 *
50 * int v;
51 *
52 * // Acquire a lock.
53 * do {
54 * v = lock;
55 * if (v & 1) {
56 * // Lock is held. Set a bit to say that
57 * // there are waiters, and wait for lock
58 * // to change to anything other than v;
59 * // then retry.
60 * if (atomic_cas_uint(&lock, v, v | 2) != v)
61 * continue;
62 * futex(FUTEX_WAIT, &lock, v | 2, NULL, NULL, 0);
63 * continue;
64 * }
65 * } while (atomic_cas_uint(&lock, v, v | 1) != v);
66 * membar_enter();
67 *
68 * ...
69 *
70 * // Release the lock. Optimistically assume there are
71 * // no waiters first until demonstrated otherwise.
72 * membar_exit();
73 * if (atomic_cas_uint(&lock, 1, 0) != 1) {
74 * // There may be waiters.
75 * v = atomic_swap_uint(&lock, 0);
76 * // If there are still waiters, wake one.
77 * if (v & 2)
78 * futex(FUTEX_WAKE, &lock, 1, NULL, NULL, 0);
79 * }
80 *
81 * The goal is to avoid the futex system call unless there is
82 * contention; then if there is contention, to guarantee no missed
83 * wakeups.
84 *
85 * For a simple implementation, futex(FUTEX_WAIT) could queue
86 * itself to be woken, double-check the lock word, and then sleep;
87 * spurious wakeups are generally a fact of life, so any
88 * FUTEX_WAKE could just wake every FUTEX_WAIT in the system.
89 *
90 * If this were all there is to it, we could then increase
91 * parallelism by refining the approximation: partition the
92 * waiters into buckets by hashing the lock addresses to reduce
93 * the incidence of spurious wakeups. But this is not all.
94 *
95 * The futex(FUTEX_CMP_REQUEUE, &lock, n, &lock2, m, val)
96 * operation not only wakes n waiters on lock if lock == val, but
97 * also _transfers_ m additional waiters to lock2. Unless wakeups
98 * on lock2 also trigger wakeups on lock, we cannot move waiters
99 * to lock2 if they merely share the same hash as waiters on lock.
100 * Thus, we can't approximately distribute waiters into queues by
101 * a hash function; we must distinguish futex queues exactly by
102 * lock address.
103 *
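 *	To illustrate (this sketch is not part of the implementation;
 *	cv, mtx, and seq are hypothetical userland objects): a condition
 *	variable broadcast can wake one waiter and transfer the rest
 *	directly onto the mutex word, but only if the sequence word has
 *	not changed since the waiters went to sleep:
 *
 *	// Wake 1 waiter on cv; requeue up to INT_MAX more onto mtx,
 *	// provided cv still contains seq (otherwise EAGAIN).
 *	futex(FUTEX_CMP_REQUEUE, &cv, 1, &mtx, INT_MAX, seq);
 *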
104 * For now, we use a global red/black tree to index futexes. This
105 * should be replaced by a lockless radix tree with a thread to
106 * free entries no longer in use once all lookups on all CPUs have
107 * completed.
108 *
109 * Specifically, we maintain two maps:
110 *
111 * futex_tab.va[vmspace, va] for private futexes
112 * futex_tab.oa[uvm_voaddr] for shared futexes
113 *
114 * This implementation does not support priority inheritance.
115 */
116
117 #include <sys/param.h>
118 #include <sys/types.h>
119 #include <sys/atomic.h>
120 #include <sys/condvar.h>
121 #include <sys/futex.h>
122 #include <sys/mutex.h>
123 #include <sys/rbtree.h>
124 #include <sys/sleepq.h>
125 #include <sys/queue.h>
126 #include <sys/sdt.h>
127
128 #include <sys/syscall.h>
129 #include <sys/syscallargs.h>
130 #include <sys/syscallvar.h>
131
132 #include <uvm/uvm_extern.h>
133
134 /*
135 * DTrace probes.
136 */
137 SDT_PROVIDER_DEFINE(futex);
138
139 /* entry: uaddr, val, bitset, timeout, clkflags, fflags */
140 /* exit: error */
141 SDT_PROBE_DEFINE6(futex, func, wait, entry, "int *", "int", "int",
142 "struct timespec *", "int", "int");
143 SDT_PROBE_DEFINE1(futex, func, wait, exit, "int");
144
145 /* entry: uaddr, nwake, bitset, fflags */
146 /* exit: error, nwoken */
147 SDT_PROBE_DEFINE4(futex, func, wake, entry, "int *", "int", "int", "int");
148 SDT_PROBE_DEFINE2(futex, func, wake, exit, "int", "int");
149
150 /* entry: uaddr, nwake, uaddr2, nrequeue, fflags */
151 /* exit: error, nwoken */
152 SDT_PROBE_DEFINE5(futex, func, requeue, entry, "int *", "int", "int *", "int",
153 "int");
154 SDT_PROBE_DEFINE2(futex, func, requeue, exit, "int", "int");
155
156 /* entry: uaddr, nwake, uaddr2, nrequeue, val3, fflags */
157 /* exit: error, nwoken */
158 SDT_PROBE_DEFINE6(futex, func, cmp_requeue, entry, "int *", "int", "int *",
159 "int", "int", "int");
160 SDT_PROBE_DEFINE2(futex, func, cmp_requeue, exit, "int", "int");
161
162 /* entry: uaddr, nwake, uaddr2, nwake2, wakeop, fflags */
163 /* exit: error, nwoken */
164 SDT_PROBE_DEFINE6(futex, func, wake_op, entry, "int *", "int", "int *", "int",
165 "int", "int");
166 SDT_PROBE_DEFINE2(futex, func, wake_op, exit, "int", "int");
167
168 SDT_PROBE_DEFINE0(futex, wait, finish, normally);
169 SDT_PROBE_DEFINE0(futex, wait, finish, wakerace);
170 SDT_PROBE_DEFINE0(futex, wait, finish, aborted);
171
172 /* entry: timo */
173 /* exit: error */
174 SDT_PROBE_DEFINE1(futex, wait, sleepq_block, entry, "int");
175 SDT_PROBE_DEFINE1(futex, wait, sleepq_block, exit, "int");
176
177 /*
178 * Lock order:
179 *
180 * futex_tab.lock
181 * futex::fx_op_lock ordered by kva of struct futex
182 * -> futex::fx_sq_lock ordered by kva of sleepq lock
183 *
184 * N.B. multiple futexes can share a single sleepq lock.
185 */
186
187 /*
188 * union futex_key
189 *
190 * A futex is addressed either by a vmspace+va (private) or by
191 * a uvm_voaddr (shared).
192 */
193 union futex_key {
194 struct {
195 struct vmspace *vmspace;
196 vaddr_t va;
197 } fk_private;
198 struct uvm_voaddr fk_shared;
199 };
200
201 /*
202 * struct futex
203 *
204 * Kernel state for a futex located at a particular address in a
205 * particular virtual address space.
206 *
207 * N.B. fx_refcnt is an unsigned long because we need to be able
208 * to operate on it atomically on all systems while at the same
209 * time rendering practically impossible the chance of it reaching
210 * its max value. In practice, we're limited by the number of LWPs
211 * that can be present on the system at any given time, and the
212 * assumption is that limit will be good enough on a 32-bit platform.
213 * See futex_wake() for why overflow needs to be avoided.
214 *
215 * XXX Since futex addresses must be 4-byte aligned, we could
216 * XXX squirrel away fx_shared and fx_on_tree bits in the "va"
217 * XXX field of the key. Worth it?
218 */
219 struct futex {
220 union futex_key fx_key;
221 struct rb_node fx_node;
222 unsigned long fx_refcnt;
223 bool fx_shared;
224 bool fx_on_tree;
225 uint8_t fx_class;
226
227 kmutex_t fx_op_lock; /* adaptive */
228 kmutex_t * fx_sq_lock; /* &sleepq_locks[...] */
229 sleepq_t fx_sqs[2]; /* 0=reader, 1=writer */
230 unsigned int fx_nwaiters[2];
231 };
232
233 /*
234 * futex classes: Some futex operations can only be carried out on
235 * futexes that are known to be abiding by a certain protocol. These
236 * classes are assigned to a futex when created due to a wait event,
237 * and when subsequent wake or requeue operations are issued, the
238 * class is checked at futex lookup time. If the class does not match,
239 * EINVAL is the result.
240 */
241 #define FUTEX_CLASS_ANY 0 /* match any class in lookup */
242 #define FUTEX_CLASS_NORMAL 1 /* normal futex */
243 #define FUTEX_CLASS_RWLOCK 2 /* rwlock futex */
244 #define FUTEX_CLASS_PI 3 /* for FUTEX_*_PI ops */
245
246 /* sleepq definitions */
247 #define FUTEX_READERQ 0
248 #define FUTEX_WRITERQ 1
249
250 static const char futex_wmesg[] = "futex";
251
252 static void futex_unsleep(lwp_t *, bool);
253
254 static syncobj_t futex_syncobj = {
255 .sobj_flag = SOBJ_SLEEPQ_SORTED,
256 .sobj_unsleep = futex_unsleep,
257 .sobj_changepri = sleepq_changepri,
258 .sobj_lendpri = sleepq_lendpri,
259 .sobj_owner = syncobj_noowner,
260 };
261
262 /*
263 * futex_tab
264 *
265 * Global trees of futexes by vmspace/va and VM object address.
266 *
267 * XXX This obviously doesn't scale in parallel. We could use a
268 * pserialize-safe data structure, but there may be a high cost to
269 * frequent deletion since we don't cache futexes after we're done
270 * with them. We could use hashed locks. But for now, just make
271 * sure userland can't DoS the serial performance, by using a
272 * balanced binary tree for lookup.
273 *
274 * XXX We could use a per-process tree for the table indexed by
275 * virtual address to reduce contention between processes.
276 */
277 static struct {
278 kmutex_t lock;
279 struct rb_tree va;
280 struct rb_tree oa;
281 } futex_tab __cacheline_aligned;
282
283 static int
284 compare_futex_key(void *cookie, const void *n, const void *k)
285 {
286 const struct futex *fa = n;
287 const union futex_key *fka = &fa->fx_key;
288 const union futex_key *fkb = k;
289
290 if ((uintptr_t)fka->fk_private.vmspace <
291 (uintptr_t)fkb->fk_private.vmspace)
292 return -1;
293 if ((uintptr_t)fka->fk_private.vmspace >
294 (uintptr_t)fkb->fk_private.vmspace)
295 return +1;
296 if (fka->fk_private.va < fkb->fk_private.va)
297 return -1;
298 if (fka->fk_private.va > fkb->fk_private.va)
299 return +1;
300 return 0;
301 }
302
303 static int
304 compare_futex(void *cookie, const void *na, const void *nb)
305 {
306 const struct futex *fa = na;
307 const struct futex *fb = nb;
308
309 return compare_futex_key(cookie, fa, &fb->fx_key);
310 }
311
312 static const rb_tree_ops_t futex_rb_ops = {
313 .rbto_compare_nodes = compare_futex,
314 .rbto_compare_key = compare_futex_key,
315 .rbto_node_offset = offsetof(struct futex, fx_node),
316 };
317
318 static int
319 compare_futex_shared_key(void *cookie, const void *n, const void *k)
320 {
321 const struct futex *fa = n;
322 const union futex_key *fka = &fa->fx_key;
323 const union futex_key *fkb = k;
324
325 return uvm_voaddr_compare(&fka->fk_shared, &fkb->fk_shared);
326 }
327
328 static int
329 compare_futex_shared(void *cookie, const void *na, const void *nb)
330 {
331 const struct futex *fa = na;
332 const struct futex *fb = nb;
333
334 return compare_futex_shared_key(cookie, fa, &fb->fx_key);
335 }
336
337 static const rb_tree_ops_t futex_shared_rb_ops = {
338 .rbto_compare_nodes = compare_futex_shared,
339 .rbto_compare_key = compare_futex_shared_key,
340 .rbto_node_offset = offsetof(struct futex, fx_node),
341 };
342
343 /*
344 * futex_sq_lock2(f, f2)
345 *
346 * Acquire the sleepq locks for f and f2, which may be null, or
347 * which may be the same. If they are distinct, an arbitrary total
348 * order is chosen on the locks.
349 *
350 * Callers should only ever acquire multiple sleepq locks
351 * simultaneously using futex_sq_lock2.
352 */
353 static void
354 futex_sq_lock2(struct futex * const f, struct futex * const f2)
355 {
356
357 /*
358 * If both are null, do nothing; if one is null and the other
359 * is not, lock the other and be done with it.
360 */
361 if (f == NULL && f2 == NULL) {
362 return;
363 } else if (f == NULL) {
364 mutex_spin_enter(f2->fx_sq_lock);
365 return;
366 } else if (f2 == NULL) {
367 mutex_spin_enter(f->fx_sq_lock);
368 return;
369 }
370
371 kmutex_t * const m = f->fx_sq_lock;
372 kmutex_t * const m2 = f2->fx_sq_lock;
373
374 /* If both locks are the same, acquire only one. */
375 if (m == m2) {
376 mutex_spin_enter(m);
377 return;
378 }
379
380 /* Otherwise, use the ordering on the kva of the lock pointer. */
381 if ((uintptr_t)m < (uintptr_t)m2) {
382 mutex_spin_enter(m);
383 mutex_spin_enter(m2);
384 } else {
385 mutex_spin_enter(m2);
386 mutex_spin_enter(m);
387 }
388 }
389
390 /*
391 * futex_sq_unlock2(f, f2)
392 *
393 * Release the sleep queue locks for f and f2, which may be null, or
394 * which may have the same underlying lock.
395 */
396 static void
397 futex_sq_unlock2(struct futex * const f, struct futex * const f2)
398 {
399
400 /*
401 * If both are null, do nothing; if one is null and the other
402 * is not, unlock the other and be done with it.
403 */
404 if (f == NULL && f2 == NULL) {
405 return;
406 } else if (f == NULL) {
407 mutex_spin_exit(f2->fx_sq_lock);
408 return;
409 } else if (f2 == NULL) {
410 mutex_spin_exit(f->fx_sq_lock);
411 return;
412 }
413
414 kmutex_t * const m = f->fx_sq_lock;
415 kmutex_t * const m2 = f2->fx_sq_lock;
416
417 /* If both locks are the same, release only one. */
418 if (m == m2) {
419 mutex_spin_exit(m);
420 return;
421 }
422
423 /* Otherwise, use the ordering on the kva of the lock pointer. */
424 if ((uintptr_t)m < (uintptr_t)m2) {
425 mutex_spin_exit(m2);
426 mutex_spin_exit(m);
427 } else {
428 mutex_spin_exit(m);
429 mutex_spin_exit(m2);
430 }
431 }
432
433 /*
434 * futex_load(uaddr, kaddr)
435 *
436 * Perform a single atomic load to read *uaddr, and return the
437 * result in *kaddr. Return 0 on success, EFAULT if uaddr is not
438 * mapped.
439 */
440 static inline int
441 futex_load(int *uaddr, int *kaddr)
442 {
443 return ufetch_int((u_int *)uaddr, (u_int *)kaddr);
444 }
445
446 /*
447 * futex_test(uaddr, expected)
448 *
449 * True if *uaddr == expected. False if *uaddr != expected, or if
450 * uaddr is not mapped.
451 */
452 static bool
453 futex_test(int *uaddr, int expected)
454 {
455 int val;
456 int error;
457
458 error = futex_load(uaddr, &val);
459 if (error)
460 return false;
461 return val == expected;
462 }
463
464 static pool_cache_t futex_cache __read_mostly;
465
466 static int futex_ctor(void *, void *, int);
467 static void futex_dtor(void *, void *);
468
469 /*
470 * futex_sys_init()
471 *
472 * Initialize the futex subsystem.
473 */
474 void
475 futex_sys_init(void)
476 {
477
478 mutex_init(&futex_tab.lock, MUTEX_DEFAULT, IPL_NONE);
479 rb_tree_init(&futex_tab.va, &futex_rb_ops);
480 rb_tree_init(&futex_tab.oa, &futex_shared_rb_ops);
481
482 futex_cache = pool_cache_init(sizeof(struct futex),
483 coherency_unit, 0, 0, "futex", NULL, IPL_NONE, futex_ctor,
484 futex_dtor, NULL);
485 }
486
487 /*
488 * futex_sys_fini()
489 *
490 * Finalize the futex subsystem.
491 */
492 void
493 futex_sys_fini(void)
494 {
495
496 KASSERT(RB_TREE_MIN(&futex_tab.oa) == NULL);
497 KASSERT(RB_TREE_MIN(&futex_tab.va) == NULL);
498 mutex_destroy(&futex_tab.lock);
499 }
500
501 /*
502 * futex_ctor()
503 *
504 * Futex object constructor.
505 */
506 static int
507 futex_ctor(void *arg __unused, void *obj, int flags __unused)
508 {
509 extern sleepqlock_t sleepq_locks[SLEEPTAB_HASH_SIZE];
510 struct futex * const f = obj;
511
512 mutex_init(&f->fx_op_lock, MUTEX_DEFAULT, IPL_NONE);
513 f->fx_sq_lock = &sleepq_locks[SLEEPTAB_HASH(f)].lock;
514
515 sleepq_init(&f->fx_sqs[FUTEX_READERQ]);
516 sleepq_init(&f->fx_sqs[FUTEX_WRITERQ]);
517 f->fx_nwaiters[FUTEX_READERQ] = f->fx_nwaiters[FUTEX_WRITERQ] = 0;
518
519 return 0;
520 }
521
522 /*
523 * futex_dtor()
524 *
525 * Futex object destructor.
526 */
527 static void
528 futex_dtor(void *arg __unused, void *obj)
529 {
530 struct futex * const f = obj;
531
532 mutex_destroy(&f->fx_op_lock);
533 f->fx_sq_lock = NULL;
534 }
535
536 /*
537 * futex_key_init(key, vm, va, shared)
538 *
539 * Initialize a futex key for lookup, etc.
540 */
541 static int
542 futex_key_init(union futex_key *fk, struct vmspace *vm, vaddr_t va, bool shared)
543 {
544 int error = 0;
545
546 if (__predict_false(shared)) {
547 if (!uvm_voaddr_acquire(&vm->vm_map, va, &fk->fk_shared))
548 error = EFAULT;
549 } else {
550 fk->fk_private.vmspace = vm;
551 fk->fk_private.va = va;
552 }
553
554 return error;
555 }
556
557 /*
558 * futex_key_fini(key, shared)
559 *
560 * Release a futex key.
561 */
562 static void
563 futex_key_fini(union futex_key *fk, bool shared)
564 {
565 if (__predict_false(shared))
566 uvm_voaddr_release(&fk->fk_shared);
567 memset(fk, 0, sizeof(*fk));
568 }
569
570 /*
571 * futex_create(fk, shared)
572 *
573 * Create a futex. Initial reference count is 1, representing the
574 * caller. Returns NULL on failure. Always takes ownership of the
575 * key, either transferring it to the newly-created futex, or releasing
576 * the key if creation fails.
577 *
578 * Never sleeps for memory, but may sleep to acquire a lock.
579 */
580 static struct futex *
581 futex_create(union futex_key *fk, bool shared, uint8_t class)
582 {
583 struct futex *f;
584
585 f = pool_cache_get(futex_cache, PR_NOWAIT);
586 if (f == NULL) {
587 futex_key_fini(fk, shared);
588 return NULL;
589 }
590 f->fx_key = *fk;
591 f->fx_refcnt = 1;
592 f->fx_shared = shared;
593 f->fx_on_tree = false;
594 f->fx_class = class;
595
596 return f;
597 }
598
599 /*
600 * futex_destroy(f)
601 *
602 * Destroy a futex created with futex_create. Reference count
603 * must be zero.
604 *
605 * May sleep.
606 */
607 static void
608 futex_destroy(struct futex *f)
609 {
610
611 ASSERT_SLEEPABLE();
612
613 KASSERT(atomic_load_relaxed(&f->fx_refcnt) == 0);
614 KASSERT(!f->fx_on_tree);
615 KASSERT(LIST_EMPTY(&f->fx_sqs[FUTEX_READERQ]));
616 KASSERT(LIST_EMPTY(&f->fx_sqs[FUTEX_WRITERQ]));
617 KASSERT(f->fx_nwaiters[FUTEX_READERQ] == 0);
618 KASSERT(f->fx_nwaiters[FUTEX_WRITERQ] == 0);
619
620 futex_key_fini(&f->fx_key, f->fx_shared);
621
622 pool_cache_put(futex_cache, f);
623 }
624
625 /*
626 * futex_hold_count(f, n)
627 *
628 * Attempt to acquire a reference to f. Return 0 on success,
629 * ENFILE on too many references.
630 *
631 * Never sleeps.
632 */
633 static int
634 futex_hold_count(struct futex *f, unsigned long const count)
635 {
636 unsigned long refcnt;
637
638 do {
639 refcnt = atomic_load_relaxed(&f->fx_refcnt);
640 if (ULONG_MAX - refcnt < count)
641 return ENFILE;
642 } while (atomic_cas_ulong(&f->fx_refcnt, refcnt,
643 refcnt + count) != refcnt);
644
645 return 0;
646 }
647 #define futex_hold(f) futex_hold_count(f, 1)
648
649 /*
650 * futex_rele_count(f, n)
651 *
652 * Release a reference to f acquired with futex_create or
653 * futex_hold.
654 *
655 * May sleep to free f.
656 */
657 static void
658 futex_rele_count(struct futex *f, unsigned long const count)
659 {
660 unsigned long refcnt;
661
662 ASSERT_SLEEPABLE();
663
664 do {
665 refcnt = atomic_load_relaxed(&f->fx_refcnt);
666 KASSERT(refcnt >= count);
667 if (refcnt - count == 0)
668 goto trylast;
669 } while (atomic_cas_ulong(&f->fx_refcnt, refcnt,
670 refcnt - count) != refcnt);
671 return;
672
673 trylast:
674 KASSERT(count <= LONG_MAX);
675 mutex_enter(&futex_tab.lock);
676 if (atomic_add_long_nv(&f->fx_refcnt, -(long)count) == 0) {
677 if (f->fx_on_tree) {
678 if (__predict_false(f->fx_shared))
679 rb_tree_remove_node(&futex_tab.oa, f);
680 else
681 rb_tree_remove_node(&futex_tab.va, f);
682 f->fx_on_tree = false;
683 }
684 } else {
685 /* References remain -- don't destroy it. */
686 f = NULL;
687 }
688 mutex_exit(&futex_tab.lock);
689 if (f != NULL)
690 futex_destroy(f);
691 }
692 #define futex_rele(f) futex_rele_count(f, 1)
693
694 /*
695 * futex_rele_count_not_last(f, n)
696 *
697 * Release a reference to f acquired with futex_create or
698 * futex_hold.
699 *
700 * This version asserts that we are not dropping the last
701 * reference to f.
702 */
703 static void
704 futex_rele_count_not_last(struct futex *f, unsigned long const count)
705 {
706 unsigned long refcnt;
707
708 do {
709 refcnt = atomic_load_relaxed(&f->fx_refcnt);
710 KASSERT(refcnt >= count);
711 } while (atomic_cas_ulong(&f->fx_refcnt, refcnt,
712 refcnt - count) != refcnt);
713 }
714
715 /*
716 * futex_lookup_by_key(key, shared, class, &f)
717 *
718 * Try to find an existing futex va reference for the specified key.
719 * On success, return 0, set f to the found futex or to NULL if not found,
720 * and increment f's reference count if found.
721 *
722 * Return ENFILE if reference count too high.
723 *
724 * Internal lookup routine shared by futex_lookup() and
725 * futex_lookup_create().
726 */
727 static int
728 futex_lookup_by_key(union futex_key *fk, bool shared, uint8_t class,
729 struct futex **fp)
730 {
731 struct futex *f;
732 int error = 0;
733
734 mutex_enter(&futex_tab.lock);
735 if (__predict_false(shared)) {
736 f = rb_tree_find_node(&futex_tab.oa, fk);
737 } else {
738 f = rb_tree_find_node(&futex_tab.va, fk);
739 }
740 if (f) {
741 if (__predict_true(f->fx_class == class ||
742 class == FUTEX_CLASS_ANY))
743 error = futex_hold(f);
744 else
745 error = EINVAL;
746 if (error)
747 f = NULL;
748 }
749 *fp = f;
750 mutex_exit(&futex_tab.lock);
751
752 return error;
753 }
754
755 /*
756 * futex_insert(f, fp)
757 *
758 * Try to insert the futex f into the tree by va. If there
759 * already is a futex for its va, acquire a reference to it, and
760 * store it in *fp; otherwise store f in *fp.
761 *
762 * Return 0 on success, ENFILE if there already is a futex but its
763 * reference count is too high.
764 */
765 static int
766 futex_insert(struct futex *f, struct futex **fp)
767 {
768 struct futex *f0;
769 int error;
770
771 KASSERT(atomic_load_relaxed(&f->fx_refcnt) != 0);
772 KASSERT(!f->fx_on_tree);
773
774 mutex_enter(&futex_tab.lock);
775 if (__predict_false(f->fx_shared))
776 f0 = rb_tree_insert_node(&futex_tab.oa, f);
777 else
778 f0 = rb_tree_insert_node(&futex_tab.va, f);
779 if (f0 == f) {
780 f->fx_on_tree = true;
781 error = 0;
782 } else {
783 KASSERT(atomic_load_relaxed(&f0->fx_refcnt) != 0);
784 KASSERT(f0->fx_on_tree);
785 error = futex_hold(f0);
786 if (error)
787 goto out;
788 }
789 *fp = f0;
790 out: mutex_exit(&futex_tab.lock);
791
792 return error;
793 }
794
795 /*
796 * futex_lookup(uaddr, shared, class, &f)
797 *
798 * Find a futex at the userland pointer uaddr in the current
799 * process's VM space. On success, return the futex in f and
800 * increment its reference count.
801 *
802 * Caller must call futex_rele when done.
803 */
804 static int
805 futex_lookup(int *uaddr, bool shared, uint8_t class, struct futex **fp)
806 {
807 union futex_key fk;
808 struct vmspace *vm = curproc->p_vmspace;
809 vaddr_t va = (vaddr_t)uaddr;
810 int error;
811
812 /*
813 * Reject unaligned user pointers so we don't cross page
814 * boundaries and so atomics will work.
815 */
816 if (__predict_false((va & 3) != 0))
817 return EINVAL;
818
819 /* Look it up. */
820 error = futex_key_init(&fk, vm, va, shared);
821 if (error)
822 return error;
823
824 error = futex_lookup_by_key(&fk, shared, class, fp);
825 futex_key_fini(&fk, shared);
826 if (error)
827 return error;
828
829 KASSERT(*fp == NULL || (*fp)->fx_shared == shared);
830 KASSERT(*fp == NULL || (*fp)->fx_class == class);
831 KASSERT(*fp == NULL || atomic_load_relaxed(&(*fp)->fx_refcnt) != 0);
832
833 /*
834 * Success! (Caller must still check whether we found
835 * anything, but nothing went _wrong_ like trying to use
836 * unmapped memory.)
837 */
838 KASSERT(error == 0);
839
840 return error;
841 }
842
843 /*
844 * futex_lookup_create(uaddr, shared, class, &f)
845 *
846 * Find or create a futex at the userland pointer uaddr in the
847 * current process's VM space. On success, return the futex in f
848 * and increment its reference count.
849 *
850 * Caller must call futex_rele when done.
851 */
852 static int
853 futex_lookup_create(int *uaddr, bool shared, uint8_t class, struct futex **fp)
854 {
855 union futex_key fk;
856 struct vmspace *vm = curproc->p_vmspace;
857 struct futex *f = NULL;
858 vaddr_t va = (vaddr_t)uaddr;
859 int error;
860
861 /*
862 * Reject unaligned user pointers so we don't cross page
863 * boundaries and so atomics will work.
864 */
865 if (__predict_false((va & 3) != 0))
866 return EINVAL;
867
868 if (__predict_false(class == FUTEX_CLASS_ANY))
869 return EINVAL;
870
871 error = futex_key_init(&fk, vm, va, shared);
872 if (error)
873 return error;
874
875 /*
876 * Optimistically assume there already is one, and try to find
877 * it.
878 */
879 error = futex_lookup_by_key(&fk, shared, class, fp);
880 if (error || *fp != NULL) {
881 /*
882 * We either found one, or there was an error.
883 * In either case, we are done with the key.
884 */
885 futex_key_fini(&fk, shared);
886 goto out;
887 }
888
889 /*
890 * Create a futex record. This transfers ownership of the key
891 * in all cases.
892 */
893 f = futex_create(&fk, shared, class);
894 if (f == NULL) {
895 error = ENOMEM;
896 goto out;
897 }
898
899 /*
900 * Insert our new futex, or use existing if someone else beat
901 * us to it.
902 */
903 error = futex_insert(f, fp);
904 if (error)
905 goto out;
906 if (*fp == f)
907 f = NULL; /* don't release on exit */
908
909 /* Success! */
910 KASSERT(error == 0);
911
912 out: if (f != NULL)
913 futex_rele(f);
914 KASSERT(error || *fp != NULL);
915 KASSERT(error || atomic_load_relaxed(&(*fp)->fx_refcnt) != 0);
916 return error;
917 }
918
919 /*
920 * futex_unsleep:
921 *
922 * Remove an LWP from the futex and sleep queue. This is called when
923 * the LWP has not been awoken normally but instead interrupted: for
924 * example, when a signal is received. Must be called with the LWP
925 * locked. Will unlock if "unlock" is true.
926 */
927 static void
928 futex_unsleep(lwp_t *l, bool unlock)
929 {
930 struct futex *f __diagused;
931
932 f = (struct futex *)(uintptr_t)l->l_wchan;
933
934 KASSERT(mutex_owned(f->fx_sq_lock));
935
936 /* WRITERQ is more likely */
937 if (__predict_true(l->l_sleepq == &f->fx_sqs[FUTEX_WRITERQ])) {
938 KASSERT(f->fx_nwaiters[FUTEX_WRITERQ] > 0);
939 f->fx_nwaiters[FUTEX_WRITERQ]--;
940 } else {
941 KASSERT(l->l_sleepq == &f->fx_sqs[FUTEX_READERQ]);
942 KASSERT(f->fx_nwaiters[FUTEX_READERQ] > 0);
943 f->fx_nwaiters[FUTEX_READERQ]--;
944 }
945
946 sleepq_unsleep(l, unlock);
947 }
948
949 /*
950 * futex_wait(f, timeout, clkid, bitset)
951 *
952 * Wait until deadline on the clock clkid, or forever if deadline
953 * is NULL, for a futex wakeup. Return 0 on explicit wakeup,
954 * ETIMEDOUT on timeout, EINTR on signal.
955 */
956 static int
957 futex_wait(struct futex *f, int q, const struct timespec *deadline,
958 clockid_t clkid, int bitset)
959 {
960
961 /*
962 * Some things to note about how this function works:
963 *
964 * ==> When we enter futex_wait(), the futex's op lock is
965 * held, preventing us from missing wakes. Once we are on
966 * the futex's sleepq, it is safe to release the op lock.
967 *
968 * ==> We have a single early escape to avoid calling
969 * sleepq_block(): our deadline already having passed.
970 * If this is a no-timeout wait, or if the deadline has
971 * not already passed, we are guaranteed to call sleepq_block().
972 *
973 * ==> sleepq_block() contains all of the necessary logic
974 * for handling signals; we do not need to check for them
975 * ourselves.
976 *
977 * ==> Once we have blocked, other futex operations will
978 * be able to wake us or requeue us. Until we have blocked,
979 * those other futex operations will not be able to acquire
980 * the sleepq locks in order to perform those operations,
981 * even if we have dropped the op lock. Thus, we will not
982 * miss wakes. This fundamental invariant is relied upon by
983 * every other synchronization object in the kernel.
984 *
985 * ==> sleepq_block() returns for one of three reasons:
986 *
987 * -- We were awakened.
988 * -- We were signaled.
989 * -- We timed out.
990 *
991 * ==> Once sleepq_block() returns, we will not call
992 * sleepq_block() again.
993 *
994 * ==> If sleepq_block() returns due to being signaled,
995 * we must check to see if we were also awakened. This
996 * is to handle the following sequence:
997 *
998 * -- Futex wake takes us off sleepq, places us on runq.
999 * -- We are signaled while sitting on the runq.
1000 * -- We run, and sleepq_block() notices the signal and
1001 * returns EINTR/ERESTART.
1002 *
1003 * In this situation, we want to indicate a successful wake
1004 * to the caller, because that wake has been reported to the
1005 * thread that issued it.
1006 *
1007 * Note that it is NOT possible for a wake to be issued after
1008 * a signal; the LWP will already have been taken off the sleepq
1009 * by the signal, so the wake will never happen. Note that for
1010 * this reason, we must *never* return ERESTART, because it could
1011 * result in us waiting again with no one to awaken us.
1012 */
1013
1014 struct lwp * const l = curlwp;
1015 int timo;
1016 int error;
1017
1018 /*
1019 * Compute our timeout before taking any locks.
1020 */
1021 if (deadline) {
1022 struct timespec ts;
1023
1024 /* Check our watch. */
1025 error = clock_gettime1(clkid, &ts);
1026 if (error) {
1027 mutex_exit(&f->fx_op_lock);
1028 return error;
1029 }
1030
1031 /*
1032 * If we're past the deadline, ETIMEDOUT.
1033 */
1034 if (timespeccmp(deadline, &ts, <=)) {
1035 mutex_exit(&f->fx_op_lock);
1036 return ETIMEDOUT;
1037 } else {
1038 /* Count how much time is left. */
1039 timespecsub(deadline, &ts, &ts);
1040 timo = tstohz(&ts);
1041 if (timo == 0) {
1042 /*
1043 * tstohz() already rounds up, and
1044 * a return value of 0 ticks means
1045 * "expired now or in the past".
1046 */
1047 mutex_exit(&f->fx_op_lock);
1048 return ETIMEDOUT;
1049 }
1050 }
1051 } else {
1052 timo = 0;
1053 }
1054
1055 /*
1056 * Acquire in sleepq -> lwp order. While we're at it, ensure
1057 * that there's room for us to block.
1058 */
1059 mutex_spin_enter(f->fx_sq_lock);
1060 if (__predict_false(q == FUTEX_READERQ &&
1061 f->fx_nwaiters[q] == FUTEX_TID_MASK)) {
1062 mutex_spin_exit(f->fx_sq_lock);
1063 mutex_exit(&f->fx_op_lock);
1064 return ENFILE;
1065 }
1066 lwp_lock(l);
1067
1068 /*
1069 * No need for locks here; until we're on a futex's sleepq, these
1070 * values are private to the LWP, and the locks needed to get onto
1071 * the sleepq provide the memory barriers needed to ensure visibility.
1072 */
1073 l->l_futex = f;
1074 l->l_futex_wakesel = bitset;
1075
1076 /*
1077 * This is an inlined bit of sleepq_enter();
1078 * we already hold the lwp lock.
1079 */
1080 lwp_unlock_to(l, f->fx_sq_lock);
1081 KERNEL_UNLOCK_ALL(NULL, &l->l_biglocks);
1082 KASSERT(lwp_locked(l, f->fx_sq_lock));
1083
1084 sleepq_enqueue(&f->fx_sqs[q], f, futex_wmesg, &futex_syncobj, true);
1085 f->fx_nwaiters[q]++;
1086
1087 /* We can now safely release the op lock. */
1088 mutex_exit(&f->fx_op_lock);
1089
1090 SDT_PROBE1(futex, wait, sleepq_block, entry, timo);
1091 error = sleepq_block(timo, true);
1092 SDT_PROBE1(futex, wait, sleepq_block, exit, error);
1093
1094 /*
1095 * f is no longer valid; we may have been moved to another
1096 * futex sleepq while we slept.
1097 */
1098
1099 /*
1100 * If something went wrong, we may still have a futex reference.
1101 * Check for that here and drop it if so. We can do this without
1102 * taking any locks because l->l_futex is private to the LWP when
1103 * not on any futex's sleepq, and we are not on any sleepq because
1104 * we are running.
1105 *
1106 * Convert EWOULDBLOCK to ETIMEDOUT in case sleepq_block() returned
1107 * EWOULDBLOCK, and ensure the only other error we return is EINTR.
1108 */
1109 if (error) {
1110 f = l->l_futex;
1111 if (f != NULL) {
1112 SDT_PROBE0(futex, wait, finish, aborted);
1113 l->l_futex = NULL;
1114 futex_rele(f);
1115 } else {
1116 /* Raced with wakeup; report success. */
1117 SDT_PROBE0(futex, wait, finish, wakerace);
1118 error = 0;
1119 }
1120 if (error == EWOULDBLOCK)
1121 error = ETIMEDOUT;
1122 else if (error != ETIMEDOUT)
1123 error = EINTR;
1124 } else {
1125 f = l->l_futex;
1126 if (__predict_false(f != NULL)) {
1127 /*
1128 * There are certain situations that can cause
1129 * sleepq_block() to return 0 even if we were
1130 * signalled (by e.g. SIGKILL). In this case,
1131 * we will need to release our futex hold and
1132 * return EINTR (we're probably about to die
1133 * anyway).
1134 */
1135 SDT_PROBE0(futex, wait, finish, aborted);
1136 l->l_futex = NULL;
1137 futex_rele(f);
1138 error = EINTR;
1139 } else {
1140 SDT_PROBE0(futex, wait, finish, normally);
1141 }
1142 }
1143
1144 return error;
1145 }
1146
1147 /*
1148 * futex_wake(f, q, nwake, f2, q2, nrequeue, bitset)
1149 *
1150 * Wake up to nwake waiters on f matching bitset; then, if f2 is
1151 * provided, move up to nrequeue remaining waiters on f matching
1152 * bitset to f2. Return the total number of waiters woken or requeued.
1153 * Caller must hold the locks of f and f2, if provided.
1154 */
1155 static unsigned
1156 futex_wake(struct futex *f, int q, unsigned int const nwake,
1157 struct futex *f2, int q2, unsigned int const nrequeue,
1158 int bitset)
1159 {
1160 struct lwp *l, *l_next;
1161 unsigned nwoken = 0;
1162 unsigned nrequeued = 0;
1163
1164 KASSERT(mutex_owned(&f->fx_op_lock));
1165 KASSERT(f2 == NULL || mutex_owned(&f2->fx_op_lock));
1166
1167 futex_sq_lock2(f, f2);
1168
1169 /* Wake up to nwake waiters, and count the number woken. */
1170 LIST_FOREACH_SAFE(l, &f->fx_sqs[q], l_sleepchain, l_next) {
1171 if (nwoken == nwake)
1172 break;
1173
1174 KASSERT(l->l_syncobj == &futex_syncobj);
1175 KASSERT(l->l_wchan == (wchan_t)f);
1176 KASSERT(l->l_futex == f);
1177 KASSERT(l->l_sleepq == &f->fx_sqs[q]);
1178 KASSERT(l->l_mutex == f->fx_sq_lock);
1179
1180 if ((l->l_futex_wakesel & bitset) == 0)
1181 continue;
1182
1183 l->l_futex_wakesel = 0;
1184 l->l_futex = NULL;
1185 sleepq_remove(&f->fx_sqs[q], l);
1186 f->fx_nwaiters[q]--;
1187 nwoken++;
1188 /* hold counts adjusted in bulk below */
1189 }
1190
1191 if (nrequeue) {
1192 KASSERT(f2 != NULL);
1193 /* Move up to nrequeue waiters from f's queue to f2's queue. */
1194 LIST_FOREACH_SAFE(l, &f->fx_sqs[q], l_sleepchain, l_next) {
1195 if (nrequeued == nrequeue)
1196 break;
1197
1198 KASSERT(l->l_syncobj == &futex_syncobj);
1199 KASSERT(l->l_wchan == (wchan_t)f);
1200 KASSERT(l->l_futex == f);
1201 KASSERT(l->l_sleepq == &f->fx_sqs[q]);
1202 KASSERT(l->l_mutex == f->fx_sq_lock);
1203
1204 if ((l->l_futex_wakesel & bitset) == 0)
1205 continue;
1206
1207 l->l_futex = f2;
1208 sleepq_transfer(l, &f->fx_sqs[q],
1209 &f2->fx_sqs[q2], f2, futex_wmesg,
1210 &futex_syncobj, f2->fx_sq_lock, true);
1211 f->fx_nwaiters[q]--;
1212 f2->fx_nwaiters[q2]++;
1213 nrequeued++;
1214 /* hold counts adjusted in bulk below */
1215 }
1216 if (nrequeued) {
1217 /* XXX futex_hold() could theoretically fail here. */
1218 int hold_error __diagused =
1219 futex_hold_count(f2, nrequeued);
1220 KASSERT(hold_error == 0);
1221 }
1222 }
1223
1224 /*
1225 * Adjust the futex reference counts for the wakes and
1226 * requeues.
1227 */
1228 KASSERT(nwoken + nrequeued <= LONG_MAX);
1229 futex_rele_count_not_last(f, nwoken + nrequeued);
1230
1231 futex_sq_unlock2(f, f2);
1232
1233 /* Return the number of waiters woken and requeued. */
1234 return nwoken + nrequeued;
1235 }
1236
1237 /*
1238 * futex_op_lock(f)
1239 *
1240 * Acquire the op lock of f. Pair with futex_op_unlock. Do
1241 * not use if caller needs to acquire two locks; use
1242 * futex_op_lock2 instead.
1243 */
1244 static void
1245 futex_op_lock(struct futex *f)
1246 {
1247 mutex_enter(&f->fx_op_lock);
1248 }
1249
1250 /*
1251 * futex_op_unlock(f)
1252 *
1253 * Release the op lock of f.
1254 */
1255 static void
1256 futex_op_unlock(struct futex *f)
1257 {
1258 mutex_exit(&f->fx_op_lock);
1259 }
1260
1261 /*
1262 * futex_op_lock2(f, f2)
1263 *
1264 * Acquire the op locks of both f and f2, which may be null, or
1265 * which may be the same. If they are distinct, an arbitrary total
1266 * order is chosen on the locks.
1267 *
1268 * Callers should only ever acquire multiple op locks
1269 * simultaneously using futex_op_lock2.
1270 */
1271 static void
1272 futex_op_lock2(struct futex *f, struct futex *f2)
1273 {
1274
1275 /*
1276 * If both are null, do nothing; if one is null and the other
1277 * is not, lock the other and be done with it.
1278 */
1279 if (f == NULL && f2 == NULL) {
1280 return;
1281 } else if (f == NULL) {
1282 mutex_enter(&f2->fx_op_lock);
1283 return;
1284 } else if (f2 == NULL) {
1285 mutex_enter(&f->fx_op_lock);
1286 return;
1287 }
1288
1289 /* If both futexes are the same, acquire only one. */
1290 if (f == f2) {
1291 mutex_enter(&f->fx_op_lock);
1292 return;
1293 }
1294
1295 /* Otherwise, use the ordering on the kva of the futex pointer. */
1296 if ((uintptr_t)f < (uintptr_t)f2) {
1297 mutex_enter(&f->fx_op_lock);
1298 mutex_enter(&f2->fx_op_lock);
1299 } else {
1300 mutex_enter(&f2->fx_op_lock);
1301 mutex_enter(&f->fx_op_lock);
1302 }
1303 }
1304
1305 /*
1306 * futex_op_unlock2(f, f2)
1307 *
1308 * Release the op locks of both f and f2, which may be null, or
1309 * which may be the same futex (and thus share the same op lock).
1310 */
1311 static void
1312 futex_op_unlock2(struct futex *f, struct futex *f2)
1313 {
1314
1315 /*
1316 * If both are null, do nothing; if one is null and the other
1317 * is not, unlock the other and be done with it.
1318 */
1319 if (f == NULL && f2 == NULL) {
1320 return;
1321 } else if (f == NULL) {
1322 mutex_exit(&f2->fx_op_lock);
1323 return;
1324 } else if (f2 == NULL) {
1325 mutex_exit(&f->fx_op_lock);
1326 return;
1327 }
1328
1329 /* If both futexes are the same, release only one. */
1330 if (f == f2) {
1331 mutex_exit(&f->fx_op_lock);
1332 return;
1333 }
1334
1335 /* Otherwise, use the ordering on the kva of the futex pointer. */
1336 if ((uintptr_t)f < (uintptr_t)f2) {
1337 mutex_exit(&f2->fx_op_lock);
1338 mutex_exit(&f->fx_op_lock);
1339 } else {
1340 mutex_exit(&f->fx_op_lock);
1341 mutex_exit(&f2->fx_op_lock);
1342 }
1343 }
1344
1345 /*
1346 * futex_func_wait(uaddr, val, val3, timeout, clkid, clkflags, retval)
1347 *
1348 * Implement futex(FUTEX_WAIT) and futex(FUTEX_WAIT_BITSET).
1349 */
1350 static int
1351 futex_func_wait(bool shared, int *uaddr, int val, int val3,
1352 const struct timespec *timeout, clockid_t clkid, int clkflags,
1353 register_t *retval)
1354 {
1355 struct futex *f;
1356 struct timespec ts;
1357 const struct timespec *deadline;
1358 int error;
1359
1360 /*
1361 * If there's nothing to wait for, and nobody will ever wake
1362 * us, then don't set anything up to wait -- just stop here.
1363 */
1364 if (val3 == 0)
1365 return EINVAL;
1366
1367 /* Optimistically test before anything else. */
1368 if (!futex_test(uaddr, val))
1369 return EAGAIN;
1370
1371 /* Determine a deadline on the specified clock. */
1372 if (timeout == NULL) {
1373 deadline = timeout;
1374 } else if ((clkflags & TIMER_ABSTIME) == TIMER_ABSTIME) {
1375 /* Sanity-check the deadline. */
1376 if (timeout->tv_sec < 0 ||
1377 timeout->tv_nsec < 0 ||
1378 timeout->tv_nsec >= 1000000000L) {
1379 return EINVAL;
1380 }
1381 deadline = timeout;
1382 } else {
1383 struct timespec interval = *timeout;
1384
1385 error = itimespecfix(&interval);
1386 if (error)
1387 return error;
1388 error = clock_gettime1(clkid, &ts);
1389 if (error)
1390 return error;
1391 timespecadd(&ts, &interval, &ts);
1392 deadline = &ts;
1393 }
1394
1395 /* Get the futex, creating it if necessary. */
1396 error = futex_lookup_create(uaddr, shared, FUTEX_CLASS_NORMAL, &f);
1397 if (error)
1398 return error;
1399 KASSERT(f);
1400
1401 /*
1402 * Under the op lock, check the value again: if it has
1403 * already changed, EAGAIN; otherwise enqueue the waiter.
1404 * Since FUTEX_WAKE will use the same lock and be done after
1405 * modifying the value, the order in which we check and enqueue
1406 * is immaterial.
1407 */
1408 futex_op_lock(f);
1409 if (!futex_test(uaddr, val)) {
1410 futex_op_unlock(f);
1411 error = EAGAIN;
1412 goto out;
1413 }
1414
1415 /*
1416 * Now wait. futex_wait() will drop our op lock once we
1417 * are entered into the sleep queue, thus ensuring atomicity
1418 * of wakes with respect to waits.
1419 */
1420 error = futex_wait(f, FUTEX_WRITERQ, deadline, clkid, val3);
1421
1422 /*
1423 * We cannot drop our reference to the futex here, because
1424 * we might be enqueued on a different one when we are awakened.
1425 * The references will be managed on our behalf in the requeue,
1426 * wake, and error cases.
1427 */
1428 f = NULL;
1429
1430 if (__predict_true(error == 0)) {
1431 /* Return 0 on success, error on failure. */
1432 *retval = 0;
1433 }
1434
1435 out: if (f != NULL)
1436 futex_rele(f);
1437 return error;
1438 }
1439
1440 /*
1441 * futex_func_wake(uaddr, val, val3, retval)
1442 *
1443 * Implement futex(FUTEX_WAKE) and futex(FUTEX_WAKE_BITSET).
1444 */
1445 static int
1446 futex_func_wake(bool shared, int *uaddr, int val, int val3, register_t *retval)
1447 {
1448 struct futex *f;
1449 unsigned int nwoken = 0;
1450 int error = 0;
1451
1452 /* Reject negative number of wakeups. */
1453 if (val < 0) {
1454 error = EINVAL;
1455 goto out;
1456 }
1457
1458 /* Look up the futex, if any. */
1459 error = futex_lookup(uaddr, shared, FUTEX_CLASS_NORMAL, &f);
1460 if (error)
1461 goto out;
1462
1463 /* If there's no futex, there are no waiters to wake. */
1464 if (f == NULL)
1465 goto out;
1466
1467 /*
1468 * Under f's op lock, wake the waiters and remember the
1469 * number woken.
1470 */
1471 futex_op_lock(f);
1472 nwoken = futex_wake(f, FUTEX_WRITERQ, val,
1473 NULL, FUTEX_WRITERQ, 0, val3);
1474 futex_op_unlock(f);
1475
1476 /* Release the futex. */
1477 futex_rele(f);
1478
1479 out:
1480 /* Return the number of waiters woken. */
1481 *retval = nwoken;
1482
1483 /* Success! */
1484 return error;
1485 }
1486
1487 /*
1488 * futex_func_requeue(op, uaddr, val, uaddr2, val2, val3, retval)
1489 *
1490 * Implement futex(FUTEX_REQUEUE) and futex(FUTEX_CMP_REQUEUE).
1491 */
1492 static int
1493 futex_func_requeue(bool shared, int op, int *uaddr, int val, int *uaddr2,
1494 int val2, int val3, register_t *retval)
1495 {
1496 struct futex *f = NULL, *f2 = NULL;
1497 unsigned nwoken = 0; /* default to zero woken on early return */
1498 int error;
1499
1500 /* Reject negative number of wakeups or requeues. */
1501 if (val < 0 || val2 < 0) {
1502 error = EINVAL;
1503 goto out;
1504 }
1505
1506 /* Look up the source futex, if any. */
1507 error = futex_lookup(uaddr, shared, FUTEX_CLASS_NORMAL, &f);
1508 if (error)
1509 goto out;
1510
1511 /* If there is none, nothing to do. */
1512 if (f == NULL)
1513 goto out;
1514
1515 /*
1516 * We may need to create the destination futex because it's
1517 * entirely possible it does not currently have any waiters.
1518 */
1519 error = futex_lookup_create(uaddr2, shared, FUTEX_CLASS_NORMAL, &f2);
1520 if (error)
1521 goto out;
1522
1523 /*
1524 * Under the futexes' op locks, check the value; if
1525 * unchanged from val3, wake the waiters.
1526 */
1527 futex_op_lock2(f, f2);
1528 if (op == FUTEX_CMP_REQUEUE && !futex_test(uaddr, val3)) {
1529 error = EAGAIN;
1530 } else {
1531 error = 0;
1532 nwoken = futex_wake(f, FUTEX_WRITERQ, val,
1533 f2, FUTEX_WRITERQ, val2,
1534 FUTEX_BITSET_MATCH_ANY);
1535 }
1536 futex_op_unlock2(f, f2);
1537
1538 out:
1539 /* Return the number of waiters woken. */
1540 *retval = nwoken;
1541
1542 /* Release the futexes if we got them. */
1543 if (f2)
1544 futex_rele(f2);
1545 if (f)
1546 futex_rele(f);
1547 return error;
1548 }
1549
1550 /*
1551 * futex_validate_op_cmp(val3)
1552 *
1553 * Validate an op/cmp argument for FUTEX_WAKE_OP.
1554 */
1555 static int
1556 futex_validate_op_cmp(int val3)
1557 {
1558 int op = __SHIFTOUT(val3, FUTEX_OP_OP_MASK);
1559 int cmp = __SHIFTOUT(val3, FUTEX_OP_CMP_MASK);
1560
1561 if (op & FUTEX_OP_OPARG_SHIFT) {
1562 int oparg = __SHIFTOUT(val3, FUTEX_OP_OPARG_MASK);
1563 if (oparg < 0)
1564 return EINVAL;
1565 if (oparg >= 32)
1566 return EINVAL;
1567 op &= ~FUTEX_OP_OPARG_SHIFT;
1568 }
1569
1570 switch (op) {
1571 case FUTEX_OP_SET:
1572 case FUTEX_OP_ADD:
1573 case FUTEX_OP_OR:
1574 case FUTEX_OP_ANDN:
1575 case FUTEX_OP_XOR:
1576 break;
1577 default:
1578 return EINVAL;
1579 }
1580
1581 switch (cmp) {
1582 case FUTEX_OP_CMP_EQ:
1583 case FUTEX_OP_CMP_NE:
1584 case FUTEX_OP_CMP_LT:
1585 case FUTEX_OP_CMP_LE:
1586 case FUTEX_OP_CMP_GT:
1587 case FUTEX_OP_CMP_GE:
1588 break;
1589 default:
1590 return EINVAL;
1591 }
1592
1593 return 0;
1594 }
1595
1596 /*
1597 * futex_compute_op(oldval, val3)
1598 *
1599 * Apply a FUTEX_WAKE_OP operation to oldval.
1600 */
1601 static int
1602 futex_compute_op(int oldval, int val3)
1603 {
1604 int op = __SHIFTOUT(val3, FUTEX_OP_OP_MASK);
1605 int oparg = __SHIFTOUT(val3, FUTEX_OP_OPARG_MASK);
1606
1607 if (op & FUTEX_OP_OPARG_SHIFT) {
1608 KASSERT(oparg >= 0);
1609 KASSERT(oparg < 32);
1610 oparg = 1u << oparg;
1611 op &= ~FUTEX_OP_OPARG_SHIFT;
1612 }
1613
1614 switch (op) {
1615 case FUTEX_OP_SET:
1616 return oparg;
1617
1618 case FUTEX_OP_ADD:
1619 /*
1620 * Avoid signed arithmetic overflow by doing
1621 * arithmetic unsigned and converting back to signed
1622 * at the end.
1623 */
1624 return (int)((unsigned)oldval + (unsigned)oparg);
1625
1626 case FUTEX_OP_OR:
1627 return oldval | oparg;
1628
1629 case FUTEX_OP_ANDN:
1630 return oldval & ~oparg;
1631
1632 case FUTEX_OP_XOR:
1633 return oldval ^ oparg;
1634
1635 default:
1636 panic("invalid futex op");
1637 }
1638 }
1639
1640 /*
1641 * futex_compute_cmp(oldval, val3)
1642 *
1643 * Apply a FUTEX_WAKE_OP comparison to oldval.
1644 */
1645 static bool
1646 futex_compute_cmp(int oldval, int val3)
1647 {
1648 int cmp = __SHIFTOUT(val3, FUTEX_OP_CMP_MASK);
1649 int cmparg = __SHIFTOUT(val3, FUTEX_OP_CMPARG_MASK);
1650
1651 switch (cmp) {
1652 case FUTEX_OP_CMP_EQ:
1653 return (oldval == cmparg);
1654
1655 case FUTEX_OP_CMP_NE:
1656 return (oldval != cmparg);
1657
1658 case FUTEX_OP_CMP_LT:
1659 return (oldval < cmparg);
1660
1661 case FUTEX_OP_CMP_LE:
1662 return (oldval <= cmparg);
1663
1664 case FUTEX_OP_CMP_GT:
1665 return (oldval > cmparg);
1666
1667 case FUTEX_OP_CMP_GE:
1668 return (oldval >= cmparg);
1669
1670 default:
1671 panic("invalid futex cmp operation");
1672 }
1673 }
1674
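/*
 * Worked example (for illustration only; the operand values are made
 * up): if val3 encodes op = FUTEX_OP_ADD | FUTEX_OP_OPARG_SHIFT with
 * oparg = 3, and cmp = FUTEX_OP_CMP_GT with cmparg = 0, then
 * futex_compute_op() rewrites *uaddr2 from oldval to oldval + (1 << 3),
 * and futex_func_wake_op() below wakes the waiters on uaddr2 only if
 * the old value satisfied oldval > 0.
 */
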
1675 /*
1676 * futex_func_wake_op(uaddr, val, uaddr2, val2, val3, retval)
1677 *
1678 * Implement futex(FUTEX_WAKE_OP).
1679 */
1680 static int
1681 futex_func_wake_op(bool shared, int *uaddr, int val, int *uaddr2, int val2,
1682 int val3, register_t *retval)
1683 {
1684 struct futex *f = NULL, *f2 = NULL;
1685 int oldval, newval, actual;
1686 unsigned nwoken = 0;
1687 int error;
1688
1689 /* Reject negative number of wakeups. */
1690 if (val < 0 || val2 < 0) {
1691 error = EINVAL;
1692 goto out;
1693 }
1694
1695 /* Reject invalid operations before we start doing things. */
1696 if ((error = futex_validate_op_cmp(val3)) != 0)
1697 goto out;
1698
1699 /* Look up the first futex, if any. */
1700 error = futex_lookup(uaddr, shared, FUTEX_CLASS_NORMAL, &f);
1701 if (error)
1702 goto out;
1703
1704 /* Look up the second futex, if any. */
1705 error = futex_lookup(uaddr2, shared, FUTEX_CLASS_NORMAL, &f2);
1706 if (error)
1707 goto out;
1708
1709 /*
1710 * Under the op locks:
1711 *
1712 * 1. Read/modify/write: *uaddr2 op= oparg.
1713 * 2. Unconditionally wake uaddr.
1714 * 3. Conditionally wake uaddr2, if it previously matched val3.
1715 */
1716 futex_op_lock2(f, f2);
1717 do {
1718 error = futex_load(uaddr2, &oldval);
1719 if (error)
1720 goto out_unlock;
1721 newval = futex_compute_op(oldval, val3);
1722 error = ucas_int(uaddr2, oldval, newval, &actual);
1723 if (error)
1724 goto out_unlock;
1725 } while (actual != oldval);
1726 nwoken = (f ? futex_wake(f, FUTEX_WRITERQ, val,
1727 NULL, FUTEX_WRITERQ, 0,
1728 FUTEX_BITSET_MATCH_ANY) : 0);
1729 if (f2 && futex_compute_cmp(oldval, val3))
1730 nwoken += futex_wake(f2, FUTEX_WRITERQ, val2,
1731 NULL, FUTEX_WRITERQ, 0,
1732 FUTEX_BITSET_MATCH_ANY);
1733
1734 /* Success! */
1735 error = 0;
1736 out_unlock:
1737 futex_op_unlock2(f, f2);
1738
1739 out:
1740 /* Return the number of waiters woken. */
1741 *retval = nwoken;
1742
1743 /* Release the futexes, if we got them. */
1744 if (f2)
1745 futex_rele(f2);
1746 if (f)
1747 futex_rele(f);
1748 return error;
1749 }
1750
1751 /*
1752 * do_futex(uaddr, op, val, timeout, uaddr2, val2, val3)
1753 *
1754 * Implement the futex system call with all the parameters
1755 * parsed out.
1756 */
1757 int
1758 do_futex(int *uaddr, int op, int val, const struct timespec *timeout,
1759 int *uaddr2, int val2, int val3, register_t *retval)
1760 {
1761 const bool shared = (op & FUTEX_PRIVATE_FLAG) ? false : true;
1762 const clockid_t clkid = (op & FUTEX_CLOCK_REALTIME) ? CLOCK_REALTIME
1763 : CLOCK_MONOTONIC;
1764 int error;
1765
1766 op &= FUTEX_CMD_MASK;
1767
1768 switch (op) {
1769 case FUTEX_WAIT:
1770 SDT_PROBE6(futex, func, wait, entry,
1771 uaddr, val, FUTEX_BITSET_MATCH_ANY, timeout,
1772 TIMER_RELTIME, op & ~FUTEX_CMD_MASK);
1773 error = futex_func_wait(shared, uaddr, val,
1774 FUTEX_BITSET_MATCH_ANY, timeout, clkid, TIMER_RELTIME,
1775 retval);
1776 SDT_PROBE1(futex, func, wait, exit, error);
1777 break;
1778
1779 case FUTEX_WAIT_BITSET:
1780 SDT_PROBE6(futex, func, wait, entry,
1781 uaddr, val, val3, timeout,
1782 TIMER_ABSTIME, op & ~FUTEX_CMD_MASK);
1783 error = futex_func_wait(shared, uaddr, val, val3, timeout,
1784 clkid, TIMER_ABSTIME, retval);
1785 SDT_PROBE1(futex, func, wait, exit, error);
1786 break;
1787
1788 case FUTEX_WAKE:
1789 SDT_PROBE4(futex, func, wake, entry,
1790 uaddr, val, FUTEX_BITSET_MATCH_ANY, op & ~FUTEX_CMD_MASK);
1791 error = futex_func_wake(shared, uaddr, val,
1792 FUTEX_BITSET_MATCH_ANY, retval);
1793 SDT_PROBE2(futex, func, wake, exit, error, *retval);
1794 break;
1795
1796 case FUTEX_WAKE_BITSET:
1797 SDT_PROBE4(futex, func, wake, entry,
1798 uaddr, val, val3, op & ~FUTEX_CMD_MASK);
1799 error = futex_func_wake(shared, uaddr, val, val3, retval);
1800 SDT_PROBE2(futex, func, wake, exit, error, *retval);
1801 break;
1802
1803 case FUTEX_REQUEUE:
1804 SDT_PROBE5(futex, func, requeue, entry,
1805 uaddr, val, uaddr2, val2, op & ~FUTEX_CMD_MASK);
1806 error = futex_func_requeue(shared, op, uaddr, val, uaddr2,
1807 val2, 0, retval);
1808 SDT_PROBE2(futex, func, requeue, exit, error, *retval);
1809 break;
1810
1811 case FUTEX_CMP_REQUEUE:
1812 SDT_PROBE6(futex, func, cmp_requeue, entry,
1813 uaddr, val, uaddr2, val2, val3, op & ~FUTEX_CMD_MASK);
1814 error = futex_func_requeue(shared, op, uaddr, val, uaddr2,
1815 val2, val3, retval);
1816 SDT_PROBE2(futex, func, cmp_requeue, exit, error, *retval);
1817 break;
1818
1819 case FUTEX_WAKE_OP:
1820 SDT_PROBE6(futex, func, wake_op, entry,
1821 uaddr, val, uaddr2, val2, val3, op & ~FUTEX_CMD_MASK);
1822 error = futex_func_wake_op(shared, uaddr, val, uaddr2, val2,
1823 val3, retval);
1824 SDT_PROBE2(futex, func, wake_op, exit, error, *retval);
1825 break;
1826
1827 case FUTEX_FD:
1828 default:
1829 error = ENOSYS;
1830 break;
1831 }
1832
1833 return error;
1834 }
1835
1836 /*
1837 * sys___futex(l, uap, retval)
1838 *
1839 * __futex(2) system call: generic futex operations.
1840 */
1841 int
1842 sys___futex(struct lwp *l, const struct sys___futex_args *uap,
1843 register_t *retval)
1844 {
1845 /* {
1846 syscallarg(int *) uaddr;
1847 syscallarg(int) op;
1848 syscallarg(int) val;
1849 syscallarg(const struct timespec *) timeout;
1850 syscallarg(int *) uaddr2;
1851 syscallarg(int) val2;
1852 syscallarg(int) val3;
1853 } */
1854 struct timespec ts, *tsp;
1855 int error;
1856
1857 /*
1858 * Copy in the timeout argument, if specified.
1859 */
1860 if (SCARG(uap, timeout)) {
1861 error = copyin(SCARG(uap, timeout), &ts, sizeof(ts));
1862 if (error)
1863 return error;
1864 tsp = &ts;
1865 } else {
1866 tsp = NULL;
1867 }
1868
1869 return do_futex(SCARG(uap, uaddr), SCARG(uap, op), SCARG(uap, val),
1870 tsp, SCARG(uap, uaddr2), SCARG(uap, val2), SCARG(uap, val3),
1871 retval);
1872 }
1873
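/*
 * Illustrative userland invocation only (not part of this file; it
 * assumes the generated SYS___futex constant and the standard
 * syscall(2) stub):
 *
 *	// Wait for a wakeup, provided *uaddr still equals `expected',
 *	// with a 1-second relative timeout on CLOCK_MONOTONIC.
 *	struct timespec ts = { .tv_sec = 1, .tv_nsec = 0 };
 *	syscall(SYS___futex, uaddr, FUTEX_WAIT | FUTEX_PRIVATE_FLAG,
 *	    expected, &ts, NULL, 0, 0);
 */
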
1874 /*
1875 * sys___futex_set_robust_list(l, uap, retval)
1876 *
1877 * __futex_set_robust_list(2) system call for robust futexes.
1878 */
1879 int
1880 sys___futex_set_robust_list(struct lwp *l,
1881 const struct sys___futex_set_robust_list_args *uap, register_t *retval)
1882 {
1883 /* {
1884 syscallarg(void *) head;
1885 syscallarg(size_t) len;
1886 } */
1887 void *head = SCARG(uap, head);
1888
1889 if (SCARG(uap, len) != _FUTEX_ROBUST_HEAD_SIZE)
1890 return EINVAL;
1891 if ((uintptr_t)head % sizeof(u_long))
1892 return EINVAL;
1893
1894 l->l_robust_head = (uintptr_t)head;
1895
1896 return 0;
1897 }
1898
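/*
 * For illustration only (the struct and field names are hypothetical;
 * the kernel sees only _FUTEX_ROBUST_HEAD_NWORDS raw words): a
 * threading library might register a per-thread head of the form
 *
 *	struct robust_head {
 *		uintptr_t	list;	// circular entry list, initially
 *					// pointing back at the head
 *		long		offset;	// signed offset from an entry to
 *					// its futex word
 *		uintptr_t	pending;// entry being acquired or released
 *	};
 *
 * passing its address and _FUTEX_ROBUST_HEAD_SIZE to this system call
 * once at thread start-up.
 */
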
1899 /*
1900 * sys___futex_get_robust_list(l, uap, retval)
1901 *
1902 * __futex_get_robust_list(2) system call for robust futexes.
1903 */
1904 int
1905 sys___futex_get_robust_list(struct lwp *l,
1906 const struct sys___futex_get_robust_list_args *uap, register_t *retval)
1907 {
1908 /* {
1909 syscallarg(lwpid_t) lwpid;
1910 syscallarg(void **) headp;
1911 syscallarg(size_t *) lenp;
1912 } */
1913 void *head;
1914 const size_t len = _FUTEX_ROBUST_HEAD_SIZE;
1915 int error;
1916
1917 error = futex_robust_head_lookup(l, SCARG(uap, lwpid), &head);
1918 if (error)
1919 return error;
1920
1921 /* Copy out the head pointer and the head structure length. */
1922 error = copyout(&head, SCARG(uap, headp), sizeof(head));
1923 if (__predict_true(error == 0)) {
1924 error = copyout(&len, SCARG(uap, lenp), sizeof(len));
1925 }
1926
1927 return error;
1928 }
1929
1930 /*
1931 * release_futex(uva, tid)
1932 *
1933 * Try to release the robust futex at uva in the current process
1934 * on lwp exit. If anything goes wrong, silently fail. It is the
1935 * userland program's obligation to arrange correct behaviour.
1936 */
1937 static void
1938 release_futex(uintptr_t const uptr, lwpid_t const tid, bool const is_pi,
1939 bool const is_pending)
1940 {
1941 int *uaddr;
1942 struct futex *f;
1943 int oldval, newval, actual;
1944 int error;
1945 bool shared;
1946
1947 /* If it's misaligned, tough. */
1948 if (__predict_false(uptr & 3))
1949 return;
1950 uaddr = (int *)uptr;
1951
1952 error = futex_load(uaddr, &oldval);
1953 if (__predict_false(error))
1954 return;
1955
1956 /*
1957 * There are two race conditions we need to handle here:
1958 *
1959 * 1. User space cleared the futex word but died before
1960 * being able to issue the wakeup. No wakeups will
1961 * ever be issued, oops!
1962 *
1963 * 2. Awakened waiter died before being able to acquire
1964 * the futex in user space. Any other waiters are
1965 * now stuck, oops!
1966 *
1967 * In both of these cases, the futex word will be 0 (because
1968 * it's updated before the wake is issued). The best we can
1969 * do is detect this situation if it's the pending futex and
1970 * issue a wake without modifying the futex word.
1971 *
1972 * XXX eventual PI handling?
1973 */
1974 if (__predict_false(is_pending && (oldval & ~FUTEX_WAITERS) == 0)) {
1975 register_t retval;
1976 error = futex_func_wake(/*shared*/true, uaddr, 1,
1977 FUTEX_BITSET_MATCH_ANY, &retval);
1978 if (error != 0 || retval == 0)
1979 (void) futex_func_wake(/*shared*/false, uaddr, 1,
1980 FUTEX_BITSET_MATCH_ANY, &retval);
1981 return;
1982 }
1983
1984 /* Optimistically test whether we need to do anything at all. */
1985 if ((oldval & FUTEX_TID_MASK) != tid)
1986 return;
1987
1988 /*
1989 * We need to handle the case where this thread owned the futex,
1990 * but it was uncontended. In this case, there won't be any
1991 * kernel state to look up. All we can do is mark the futex
1992 * as a zombie to be mopped up the next time another thread
1993 * attempts to acquire it.
1994 *
1995 	 * N.B. It's important to ensure that FUTEX_OWNER_DIED is set in
1996 	 * this loop, even if waiters appear while we're doing
1997 	 * so. This is because FUTEX_WAITERS is set by user space
1998 * before calling __futex() to wait, and the futex needs
1999 * to be marked as a zombie when the new waiter gets into
2000 * the kernel.
2001 */
2002 if ((oldval & FUTEX_WAITERS) == 0) {
2003 do {
2004 error = futex_load(uaddr, &oldval);
2005 if (error)
2006 return;
2007 if ((oldval & FUTEX_TID_MASK) != tid)
2008 return;
2009 newval = oldval | FUTEX_OWNER_DIED;
2010 error = ucas_int(uaddr, oldval, newval, &actual);
2011 if (error)
2012 return;
2013 } while (actual != oldval);
2014
2015 /*
2016 * If there is still no indication of waiters, then there is
2017 * no more work for us to do.
2018 */
2019 if ((oldval & FUTEX_WAITERS) == 0)
2020 return;
2021 }
2022
2023 /*
2024 * Look for a futex. Try shared first, then private. If we
2025 * can't find one, tough.
2026 *
2027 * Note: the assumption here is that anyone placing a futex on
2028 * the robust list is adhering to the rules, regardless of the
2029 * futex class.
2030 */
2031 for (f = NULL, shared = true; f == NULL; shared = false) {
2032 error = futex_lookup(uaddr, shared, FUTEX_CLASS_ANY, &f);
2033 if (error)
2034 return;
2035 if (f == NULL && shared == false)
2036 return;
2037 }
2038
2039 /* Work under the futex op lock. */
2040 futex_op_lock(f);
2041
2042 /*
2043 * Fetch the word: if the tid doesn't match ours, skip;
2044 * otherwise, set the owner-died bit, atomically.
2045 */
2046 do {
2047 error = futex_load(uaddr, &oldval);
2048 if (error)
2049 goto out;
2050 if ((oldval & FUTEX_TID_MASK) != tid)
2051 goto out;
2052 newval = oldval | FUTEX_OWNER_DIED;
2053 error = ucas_int(uaddr, oldval, newval, &actual);
2054 if (error)
2055 goto out;
2056 } while (actual != oldval);
2057
2058 /*
2059 * If there may be waiters, try to wake one. If anything goes
2060 * wrong, tough.
2061 *
2062 * XXX eventual PI handling?
2063 */
2064 if (oldval & FUTEX_WAITERS) {
2065 (void)futex_wake(f, FUTEX_WRITERQ, 1,
2066 NULL, FUTEX_WRITERQ, 0,
2067 FUTEX_BITSET_MATCH_ANY);
2068 }
2069
2070 /* Release the op lock and our reference to the futex. */
2071 out: futex_op_unlock(f);
2072 futex_rele(f);
2073 }
2074
2075 /*
2076 * futex_robust_head_lookup(l, lwpid)
2077 *
2078 * Helper function to look up a robust head by LWP ID.
2079 */
2080 int
2081 futex_robust_head_lookup(struct lwp *l, lwpid_t lwpid, void **headp)
2082 {
2083 struct proc *p = l->l_proc;
2084
2085 /* Find the other lwp, if requested; otherwise use our robust head. */
2086 if (lwpid) {
2087 mutex_enter(p->p_lock);
2088 l = lwp_find(p, lwpid);
2089 if (l == NULL) {
2090 mutex_exit(p->p_lock);
2091 return ESRCH;
2092 }
2093 *headp = (void *)l->l_robust_head;
2094 mutex_exit(p->p_lock);
2095 } else {
2096 *headp = (void *)l->l_robust_head;
2097 }
2098 return 0;
2099 }
2100
2101 /*
2102 * futex_fetch_robust_head(uaddr)
2103 *
2104 * Helper routine to fetch the futex robust list head that
2105 * handles 32-bit binaries running on 64-bit kernels.
2106 */
2107 static int
2108 futex_fetch_robust_head(uintptr_t uaddr, u_long *rhead)
2109 {
2110 #ifdef _LP64
2111 if (curproc->p_flag & PK_32) {
2112 uint32_t rhead32[_FUTEX_ROBUST_HEAD_NWORDS];
2113 int error;
2114
2115 error = copyin((void *)uaddr, rhead32, sizeof(rhead32));
2116 if (__predict_true(error == 0)) {
2117 for (int i = 0; i < _FUTEX_ROBUST_HEAD_NWORDS; i++) {
2118 if (i == _FUTEX_ROBUST_HEAD_OFFSET) {
2119 /*
2120 * Make sure the offset is sign-
2121 * extended.
2122 */
2123 rhead[i] = (int32_t)rhead32[i];
2124 } else {
2125 rhead[i] = rhead32[i];
2126 }
2127 }
2128 }
2129 return error;
2130 }
2131 #endif /* _LP64 */
2132
2133 return copyin((void *)uaddr, rhead,
2134 sizeof(*rhead) * _FUTEX_ROBUST_HEAD_NWORDS);
2135 }
2136
2137 /*
2138 * futex_decode_robust_word(word)
2139 *
2140 * Decode a robust futex list word into the entry and entry
2141 * properties.
2142 */
2143 static inline void
2144 futex_decode_robust_word(uintptr_t const word, uintptr_t * const entry,
2145 bool * const is_pi)
2146 {
2147 *is_pi = (word & _FUTEX_ROBUST_ENTRY_PI) ? true : false;
2148 *entry = word & ~_FUTEX_ROBUST_ENTRY_PI;
2149 }
2150
2151 /*
2152 * futex_fetch_robust_entry(uaddr)
2153 *
2154 * Helper routine to fetch and decode a robust futex entry
2155 * that handles 32-bit binaries running on 64-bit kernels.
2156 */
2157 static int
2158 futex_fetch_robust_entry(uintptr_t const uaddr, uintptr_t * const valp,
2159 bool * const is_pi)
2160 {
2161 uintptr_t val = 0;
2162 int error = 0;
2163
2164 #ifdef _LP64
2165 if (curproc->p_flag & PK_32) {
2166 uint32_t val32;
2167
2168 error = ufetch_32((uint32_t *)uaddr, &val32);
2169 if (__predict_true(error == 0))
2170 val = val32;
2171 } else
2172 #endif /* _LP64 */
2173 error = ufetch_long((u_long *)uaddr, (u_long *)&val);
2174 if (__predict_false(error))
2175 return error;
2176
2177 futex_decode_robust_word(val, valp, is_pi);
2178 return 0;
2179 }
2180
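/*
 * Illustrative entry layout only (an assumption about conforming
 * userland, not something this file defines): each robust-list entry
 * is a pointer-sized link word embedded in a userland lock object,
 * with the lock's futex word at a fixed signed offset from that link,
 * e.g.
 *
 *	struct robust_mutex {
 *		volatile unsigned int	futex_word;	// TID | FUTEX_WAITERS
 *		uintptr_t		link;		// next entry or head
 *	};
 *
 * so that the walk below can release the futex at (entry + offset) for
 * every entry reachable from the robust head.
 */
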
2181 /*
2182 * futex_release_all_lwp(l, tid)
2183 *
2184 * Release all l's robust futexes. If anything looks funny in
2185 * the process, give up -- it's userland's responsibility to dot
2186 * the i's and cross the t's.
2187 */
2188 void
2189 futex_release_all_lwp(struct lwp * const l, lwpid_t const tid)
2190 {
2191 u_long rhead[_FUTEX_ROBUST_HEAD_NWORDS];
2192 int limit = 1000000;
2193 int error;
2194
2195 /* If there's no robust list there's nothing to do. */
2196 if (l->l_robust_head == 0)
2197 return;
2198
2199 /* Read the final snapshot of the robust list head. */
2200 error = futex_fetch_robust_head(l->l_robust_head, rhead);
2201 if (error) {
2202 printf("WARNING: pid %jd (%s) lwp %jd tid %jd:"
2203 " unmapped robust futex list head\n",
2204 (uintmax_t)l->l_proc->p_pid, l->l_proc->p_comm,
2205 (uintmax_t)l->l_lid, (uintmax_t)tid);
2206 return;
2207 }
2208
2209 const long offset = (long)rhead[_FUTEX_ROBUST_HEAD_OFFSET];
2210
2211 uintptr_t next, pending;
2212 bool is_pi, pending_is_pi;
2213
2214 futex_decode_robust_word(rhead[_FUTEX_ROBUST_HEAD_LIST],
2215 &next, &is_pi);
2216 futex_decode_robust_word(rhead[_FUTEX_ROBUST_HEAD_PENDING],
2217 &pending, &pending_is_pi);
2218
2219 /*
2220 * Walk down the list of locked futexes and release them, up
2221 * to one million of them before we give up.
2222 */
2223
2224 while (next != l->l_robust_head && limit-- > 0) {
2225 /* pending handled below. */
2226 if (next != pending)
2227 release_futex(next + offset, tid, is_pi, false);
2228 error = futex_fetch_robust_entry(next, &next, &is_pi);
2229 if (error)
2230 break;
2231 preempt_point();
2232 }
2233 if (limit <= 0) {
2234 printf("WARNING: pid %jd (%s) lwp %jd tid %jd:"
2235 " exhausted robust futex limit\n",
2236 (uintmax_t)l->l_proc->p_pid, l->l_proc->p_comm,
2237 (uintmax_t)l->l_lid, (uintmax_t)tid);
2238 }
2239
2240 /* If there's a pending futex, it may need to be released too. */
2241 if (pending != 0) {
2242 release_futex(pending + offset, tid, pending_is_pi, true);
2243 }
2244 }
2245