1 /* $NetBSD: sys_futex.c,v 1.1 2020/04/26 18:53:33 thorpej Exp $ */
2
3 /*-
4 * Copyright (c) 2018, 2019, 2020 The NetBSD Foundation, Inc.
5 * All rights reserved.
6 *
7 * This code is derived from software contributed to The NetBSD Foundation
8 * by Taylor R. Campbell and Jason R. Thorpe.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 * POSSIBILITY OF SUCH DAMAGE.
30 */
31
32 #include <sys/cdefs.h>
33 __KERNEL_RCSID(0, "$NetBSD: sys_futex.c,v 1.1 2020/04/26 18:53:33 thorpej Exp $");
34
35 /*
36 * Futexes
37 *
38 * The futex system call coordinates notifying threads waiting for
39 * changes on a 32-bit word of memory. The word can be managed by
40 * CPU atomic operations in userland, without system calls, as long
41 * as there is no contention.
42 *
43 * The simplest use case demonstrating the utility is:
44 *
45 * // 32-bit word of memory shared among threads or
46 * // processes in userland. lock & 1 means owned;
47 * // lock & 2 means there are waiters waiting.
48 * volatile int lock = 0;
49 *
50 * int v;
51 *
52 * // Acquire a lock.
53 * do {
54 * v = lock;
55 * if (v & 1) {
56 * // Lock is held. Set a bit to say that
57 * // there are waiters, and wait for lock
58 * // to change to anything other than v;
59 * // then retry.
60 * if (atomic_cas_uint(&lock, v, v | 2) != v)
61 * continue;
62 * futex(FUTEX_WAIT, &lock, v | 2, NULL, NULL, 0);
63 * continue;
64 * }
65 * } while (atomic_cas_uint(&lock, v, v | 1) != v);
66 * membar_enter();
67 *
68 * ...
69 *
70 * // Release the lock. Optimistically assume there are
71 * // no waiters first until demonstrated otherwise.
72 * membar_exit();
73 * if (atomic_cas_uint(&lock, 1, 0) != 1) {
74 * // There may be waiters.
75 * v = atomic_swap_uint(&lock, 0);
76 * // If there are still waiters, wake one.
77 * if (v & 2)
78 * futex(FUTEX_WAKE, &lock, 1, NULL, NULL, 0);
79 * }
80 *
81 * The goal is to avoid the futex system call unless there is
82 * contention; then if there is contention, to guarantee no missed
83 * wakeups.
84 *
85 * For a simple implementation, futex(FUTEX_WAIT) could queue
86 * itself to be woken, double-check the lock word, and then sleep;
87 * spurious wakeups are generally a fact of life, so any
88 * FUTEX_WAKE could just wake every FUTEX_WAIT in the system.
89 *
90 * If this were all there is to it, we could then increase
91 * parallelism by refining the approximation: partition the
92 * waiters into buckets by hashing the lock addresses to reduce
93 * the incidence of spurious wakeups. But this is not all.
94 *
95 * The futex(FUTEX_CMP_REQUEUE, &lock, n, &lock2, m, val)
96 * operation not only wakes n waiters on lock if lock == val, but
97 * also _transfers_ m additional waiters to lock2. Unless wakeups
98 * on lock2 also trigger wakeups on lock, we cannot move waiters
99 * to lock2 if they merely share the same hash as waiters on lock.
100 * Thus, we can't approximately distribute waiters into queues by
101 * a hash function; we must distinguish futex queues exactly by
102 * lock address.
103 *
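 *	As an illustration of why exact transfer matters (a hypothetical
 *	userland sketch, not part of this implementation; cv->seq is an
 *	invented condition-variable sequence word, and lock is the mutex
 *	word from the example above): a broadcast can wake one waiter and
 *	requeue the rest onto the mutex word, so they are woken only as
 *	the mutex is handed off instead of all stampeding at once:
 *
 *		unsigned seq = atomic_inc_uint_nv(&cv->seq);
 *		futex(FUTEX_CMP_REQUEUE, &cv->seq, 1, &lock, INT_MAX, seq);
 *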
104 * For now, we use a global red/black tree to index futexes. This
105 * should be replaced by a lockless radix tree with a thread to
106 * free entries no longer in use once all lookups on all CPUs have
107 * completed.
108 *
109 * Specifically, we maintain two maps:
110 *
111 * futex_tab.va[vmspace, va] for private futexes
112 * futex_tab.oa[uvm_voaddr] for shared futexes
113 *
114 * This implementation does not support priority inheritance.
115 */
116
117 #include <sys/types.h>
118 #include <sys/atomic.h>
119 #include <sys/condvar.h>
120 #include <sys/futex.h>
121 #include <sys/mutex.h>
122 #include <sys/rbtree.h>
123 #include <sys/queue.h>
124
125 #include <sys/syscall.h>
126 #include <sys/syscallargs.h>
127 #include <sys/syscallvar.h>
128
129 #include <uvm/uvm_extern.h>
130
131 /*
132 * Lock order:
133 *
134 * futex_tab.lock
135 * futex::fx_qlock ordered by kva of struct futex
136 * -> futex_wait::fw_lock only one at a time
137 * futex_wait::fw_lock only one at a time
138 * -> futex::fx_abortlock only one at a time
139 */
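/*
 * As a concrete reading of the order above (a sketch that simply
 * mirrors what futex_func_wait() does below; no new names are
 * introduced):
 *
 *	futex_queue_lock(f);		(futex::fx_qlock)
 *	mutex_enter(&fw->fw_lock);	(futex_wait::fw_lock)
 *	futex_wait_enqueue(fw, f);
 *	mutex_exit(&fw->fw_lock);
 *	futex_queue_unlock(f);
 */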
140
141 /*
142 * union futex_key
143 *
144 * A futex is addressed either by a vmspace+va (private) or by
145 * a uvm_voaddr (shared).
146 */
147 union futex_key {
148 struct {
149 struct vmspace *vmspace;
150 vaddr_t va;
151 } fk_private;
152 struct uvm_voaddr fk_shared;
153 };
154
155 /*
156 * struct futex
157 *
158 * Kernel state for a futex located at a particular address in a
159 * particular virtual address space.
160 *
161 * N.B. fx_refcnt is an unsigned long because we need to be able
162 * to operate on it atomically on all systems while at the same
163 * time rendering practically impossible the chance of it reaching
164 * its max value. In practice, we're limited by the number of LWPs
165 * that can be present on the system at any given time, and the
166 * assumption is that limit will be good enough on a 32-bit platform.
167 * See futex_wake() for why overflow needs to be avoided.
168 */
169 struct futex {
170 union futex_key fx_key;
171 unsigned long fx_refcnt;
172 bool fx_shared;
173 bool fx_on_tree;
174 struct rb_node fx_node;
175
176 kmutex_t fx_qlock;
177 TAILQ_HEAD(, futex_wait) fx_queue;
178
179 kmutex_t fx_abortlock;
180 LIST_HEAD(, futex_wait) fx_abortlist;
181 kcondvar_t fx_abortcv;
182 };
183
184 /*
185 * struct futex_wait
186 *
187 * State for a thread to wait on a futex. Threads wait on fw_cv
188 * for fw_bitset to be set to zero. The thread may transition to
189 * a different futex queue at any time under the futex's lock.
190 */
191 struct futex_wait {
192 kmutex_t fw_lock;
193 kcondvar_t fw_cv;
194 struct futex *fw_futex;
195 TAILQ_ENTRY(futex_wait) fw_entry; /* queue lock */
196 LIST_ENTRY(futex_wait) fw_abort; /* queue abortlock */
197 int fw_bitset;
198 };
199
200 /*
201 * futex_tab
202 *
203 * Global trees of futexes by vmspace/va and VM object address.
204 *
205 * XXX This obviously doesn't scale in parallel. We could use a
206 * pserialize-safe data structure, but there may be a high cost to
207 * frequent deletion since we don't cache futexes after we're done
208 * with them. We could use hashed locks. But for now, just make
209 * sure userland can't DoS the serial performance, by using a
210 * balanced binary tree for lookup.
211 *
212 * XXX We could use a per-process tree for the table indexed by
213 * virtual address to reduce contention between processes.
214 */
215 static struct {
216 kmutex_t lock;
217 struct rb_tree va;
218 struct rb_tree oa;
219 } futex_tab __cacheline_aligned;
220
221 static int
222 compare_futex_key(void *cookie, const void *n, const void *k)
223 {
224 const struct futex *fa = n;
225 const union futex_key *fka = &fa->fx_key;
226 const union futex_key *fkb = k;
227
228 if ((uintptr_t)fka->fk_private.vmspace <
229 (uintptr_t)fkb->fk_private.vmspace)
230 return -1;
231 if ((uintptr_t)fka->fk_private.vmspace >
232 (uintptr_t)fkb->fk_private.vmspace)
233 return +1;
234 if (fka->fk_private.va < fkb->fk_private.va)
235 return -1;
236 if (fka->fk_private.va > fkb->fk_private.va)
237 return +1;
238 return 0;
239 }
240
241 static int
242 compare_futex(void *cookie, const void *na, const void *nb)
243 {
244 const struct futex *fa = na;
245 const struct futex *fb = nb;
246
247 return compare_futex_key(cookie, fa, &fb->fx_key);
248 }
249
250 static const rb_tree_ops_t futex_rb_ops = {
251 .rbto_compare_nodes = compare_futex,
252 .rbto_compare_key = compare_futex_key,
253 .rbto_node_offset = offsetof(struct futex, fx_node),
254 };
255
256 static int
257 compare_futex_shared_key(void *cookie, const void *n, const void *k)
258 {
259 const struct futex *fa = n;
260 const union futex_key *fka = &fa->fx_key;
261 const union futex_key *fkb = k;
262
263 return uvm_voaddr_compare(&fka->fk_shared, &fkb->fk_shared);
264 }
265
266 static int
267 compare_futex_shared(void *cookie, const void *na, const void *nb)
268 {
269 const struct futex *fa = na;
270 const struct futex *fb = nb;
271
272 return compare_futex_shared_key(cookie, fa, &fb->fx_key);
273 }
274
275 static const rb_tree_ops_t futex_shared_rb_ops = {
276 .rbto_compare_nodes = compare_futex_shared,
277 .rbto_compare_key = compare_futex_shared_key,
278 .rbto_node_offset = offsetof(struct futex, fx_node),
279 };
280
281 static void futex_wait_dequeue(struct futex_wait *, struct futex *);
282
283 /*
284 * futex_load(uaddr, kaddr)
285 *
286 * Perform a single atomic load to read *uaddr, and return the
287 * result in *kaddr. Return 0 on success, EFAULT if uaddr is not
288 * mapped.
289 */
290 static inline int
291 futex_load(int *uaddr, int *kaddr)
292 {
293 return ufetch_int((u_int *)uaddr, (u_int *)kaddr);
294 }
295
296 /*
297 * futex_test(uaddr, expected)
298 *
299 * True if *uaddr == expected. False if *uaddr != expected, or if
300 * uaddr is not mapped.
301 */
302 static bool
303 futex_test(int *uaddr, int expected)
304 {
305 int val;
306 int error;
307
308 error = futex_load(uaddr, &val);
309 if (error)
310 return false;
311 return val == expected;
312 }
313
314 /*
315 * futex_sys_init()
316 *
317 * Initialize the futex subsystem.
318 */
319 void
320 futex_sys_init(void)
321 {
322
323 mutex_init(&futex_tab.lock, MUTEX_DEFAULT, IPL_NONE);
324 rb_tree_init(&futex_tab.va, &futex_rb_ops);
325 rb_tree_init(&futex_tab.oa, &futex_shared_rb_ops);
326 }
327
328 /*
329 * futex_sys_fini()
330 *
331 * Finalize the futex subsystem.
332 */
333 void
334 futex_sys_fini(void)
335 {
336
337 KASSERT(RB_TREE_MIN(&futex_tab.oa) == NULL);
338 KASSERT(RB_TREE_MIN(&futex_tab.va) == NULL);
339 mutex_destroy(&futex_tab.lock);
340 }
341
342 /*
343 * futex_queue_init(f)
344 *
345 * Initialize the futex queue. Caller must call futex_queue_fini
346 * when done.
347 *
348 * Never sleeps.
349 */
350 static void
351 futex_queue_init(struct futex *f)
352 {
353
354 mutex_init(&f->fx_qlock, MUTEX_DEFAULT, IPL_NONE);
355 mutex_init(&f->fx_abortlock, MUTEX_DEFAULT, IPL_NONE);
356 cv_init(&f->fx_abortcv, "fqabort");
357 LIST_INIT(&f->fx_abortlist);
358 TAILQ_INIT(&f->fx_queue);
359 }
360
361 /*
362 * futex_queue_drain(f)
363 *
364 * Wait for any aborting waiters in f; then empty the queue of
365 * any stragglers and wake them. Caller must guarantee no new
366 * references to f.
367 *
368 * May sleep.
369 */
370 static void
371 futex_queue_drain(struct futex *f)
372 {
373 struct futex_wait *fw, *fw_next;
374
375 mutex_enter(&f->fx_abortlock);
376 while (!LIST_EMPTY(&f->fx_abortlist))
377 cv_wait(&f->fx_abortcv, &f->fx_abortlock);
378 mutex_exit(&f->fx_abortlock);
379
380 mutex_enter(&f->fx_qlock);
381 TAILQ_FOREACH_SAFE(fw, &f->fx_queue, fw_entry, fw_next) {
382 mutex_enter(&fw->fw_lock);
383 futex_wait_dequeue(fw, f);
384 cv_broadcast(&fw->fw_cv);
385 mutex_exit(&fw->fw_lock);
386 }
387 mutex_exit(&f->fx_qlock);
388 }
389
390 /*
391 * futex_queue_fini(fq)
392 *
393 * Finalize the futex queue initialized by futex_queue_init. Queue
394 * must be empty. Caller must not use f again until a subsequent
395 * futex_queue_init.
396 */
397 static void
398 futex_queue_fini(struct futex *f)
399 {
400
401 KASSERT(TAILQ_EMPTY(&f->fx_queue));
402 KASSERT(LIST_EMPTY(&f->fx_abortlist));
403 mutex_destroy(&f->fx_qlock);
404 mutex_destroy(&f->fx_abortlock);
405 cv_destroy(&f->fx_abortcv);
406 }
407
408 /*
409 * futex_key_init(key, vm, va, shared)
410 *
411 * Initialize a futex key for lookup, etc.
412 */
413 static int
414 futex_key_init(union futex_key *fk, struct vmspace *vm, vaddr_t va, bool shared)
415 {
416 int error = 0;
417
418 if (__predict_false(shared)) {
419 if (!uvm_voaddr_acquire(&vm->vm_map, va, &fk->fk_shared))
420 error = EFAULT;
421 } else {
422 fk->fk_private.vmspace = vm;
423 fk->fk_private.va = va;
424 }
425
426 return error;
427 }
428
429 /*
430 * futex_key_fini(key, shared)
431 *
432 * Release a futex key.
433 */
434 static void
435 futex_key_fini(union futex_key *fk, bool shared)
436 {
437 if (__predict_false(shared))
438 uvm_voaddr_release(&fk->fk_shared);
439 memset(fk, 0, sizeof(*fk));
440 }
441
442 /*
443 * futex_create(fk, shared)
444 *
445 * Create a futex. Initial reference count is 1, representing the
446 * caller. Returns NULL on failure. Always takes ownership of the
447 * key, either transferring it to the newly-created futex, or releasing
448 * the key if creation fails.
449 *
450 * Never sleeps for memory, but may sleep to acquire a lock.
451 */
452 static struct futex *
453 futex_create(union futex_key *fk, bool shared)
454 {
455 struct futex *f;
456
457 f = kmem_alloc(sizeof(*f), KM_NOSLEEP);
458 if (f == NULL) {
459 futex_key_fini(fk, shared);
460 return NULL;
461 }
462 f->fx_key = *fk;
463 f->fx_refcnt = 1;
464 f->fx_shared = shared;
465 f->fx_on_tree = false;
466 futex_queue_init(f);
467
468 return f;
469 }
470
471 /*
472 * futex_destroy(f)
473 *
474 * Destroy a futex created with futex_create. Reference count
475 * must be zero.
476 *
477 * May sleep.
478 */
479 static void
480 futex_destroy(struct futex *f)
481 {
482
483 ASSERT_SLEEPABLE();
484
485 KASSERT(atomic_load_relaxed(&f->fx_refcnt) == 0);
486 KASSERT(!f->fx_on_tree);
487
488 /* Drain and destroy the private queue. */
489 futex_queue_drain(f);
490 futex_queue_fini(f);
491
492 futex_key_fini(&f->fx_key, f->fx_shared);
493
494 kmem_free(f, sizeof(*f));
495 }
496
497 /*
498 * futex_hold(f)
499 *
500 * Attempt to acquire a reference to f. Return 0 on success,
501 * ENFILE on too many references.
502 *
503 * Never sleeps.
504 */
505 static int
506 futex_hold(struct futex *f)
507 {
508 unsigned long refcnt;
509
510 do {
511 refcnt = atomic_load_relaxed(&f->fx_refcnt);
512 if (refcnt == ULONG_MAX)
513 return ENFILE;
514 } while (atomic_cas_ulong(&f->fx_refcnt, refcnt, refcnt + 1) != refcnt);
515
516 return 0;
517 }
518
519 /*
520 * futex_rele(f)
521 *
522 * Release a reference to f acquired with futex_create or
523 * futex_hold.
524 *
525 * May sleep to free f.
526 */
527 static void
528 futex_rele(struct futex *f)
529 {
530 unsigned long refcnt;
531
532 ASSERT_SLEEPABLE();
533
534 do {
535 refcnt = atomic_load_relaxed(&f->fx_refcnt);
536 if (refcnt == 1)
537 goto trylast;
538 } while (atomic_cas_ulong(&f->fx_refcnt, refcnt, refcnt - 1) != refcnt);
539 return;
540
541 trylast:
542 mutex_enter(&futex_tab.lock);
543 if (atomic_dec_ulong_nv(&f->fx_refcnt) == 0) {
544 if (f->fx_on_tree) {
545 if (__predict_false(f->fx_shared))
546 rb_tree_remove_node(&futex_tab.oa, f);
547 else
548 rb_tree_remove_node(&futex_tab.va, f);
549 f->fx_on_tree = false;
550 }
551 } else {
552 /* References remain -- don't destroy it. */
553 f = NULL;
554 }
555 mutex_exit(&futex_tab.lock);
556 if (f != NULL)
557 futex_destroy(f);
558 }
559
560 /*
561 * futex_rele_not_last(f)
562 *
563 * Release a reference to f acquired with futex_create or
564 * futex_hold.
565 *
566 * This version asserts that we are not dropping the last
567 * reference to f.
568 */
569 static void
570 futex_rele_not_last(struct futex *f)
571 {
572 unsigned long refcnt;
573
574 do {
575 refcnt = atomic_load_relaxed(&f->fx_refcnt);
576 KASSERT(refcnt > 1);
577 } while (atomic_cas_ulong(&f->fx_refcnt, refcnt, refcnt - 1) != refcnt);
578 }
579
580 /*
581 * futex_lookup_by_key(key, shared, &f)
582 *
583 * Try to find an existing futex va reference in the specified key.
584 * On success, return 0, set f to the found futex or to NULL if not
585 * found, and increment f's reference count if found.
586 *
587 * Return ENFILE if reference count too high.
588 *
589 * Internal lookup routine shared by futex_lookup() and
590 * futex_get().
591 */
592 static int
593 futex_lookup_by_key(union futex_key *fk, bool shared, struct futex **fp)
594 {
595 struct futex *f;
596 int error = 0;
597
598 mutex_enter(&futex_tab.lock);
599 if (__predict_false(shared)) {
600 f = rb_tree_find_node(&futex_tab.oa, fk);
601 } else {
602 f = rb_tree_find_node(&futex_tab.va, fk);
603 }
604 if (f) {
605 error = futex_hold(f);
606 if (error)
607 f = NULL;
608 }
609 *fp = f;
610 mutex_exit(&futex_tab.lock);
611
612 return error;
613 }
614
615 /*
616 * futex_insert(f, fp)
617 *
618 * Try to insert the futex f into the tree by va. If there
619 * already is a futex for its va, acquire a reference to it, and
620 * store it in *fp; otherwise store f in *fp.
621 *
622 * Return 0 on success, ENFILE if there already is a futex but its
623 * reference count is too high.
624 */
625 static int
626 futex_insert(struct futex *f, struct futex **fp)
627 {
628 struct futex *f0;
629 int error;
630
631 KASSERT(atomic_load_relaxed(&f->fx_refcnt) != 0);
632 KASSERT(!f->fx_on_tree);
633
634 mutex_enter(&futex_tab.lock);
635 if (__predict_false(f->fx_shared))
636 f0 = rb_tree_insert_node(&futex_tab.oa, f);
637 else
638 f0 = rb_tree_insert_node(&futex_tab.va, f);
639 if (f0 == f) {
640 f->fx_on_tree = true;
641 error = 0;
642 } else {
643 KASSERT(atomic_load_relaxed(&f0->fx_refcnt) != 0);
644 KASSERT(f0->fx_on_tree);
645 error = futex_hold(f0);
646 if (error)
647 goto out;
648 }
649 *fp = f0;
650 out: mutex_exit(&futex_tab.lock);
651
652 return error;
653 }
654
655 /*
656 * futex_lookup(uaddr, shared, &f)
657 *
658 * Find a futex at the userland pointer uaddr in the current
659 * process's VM space. On success, return the futex in f and
660 * increment its reference count.
661 *
662 * Caller must call futex_put when done.
663 */
664 static int
665 futex_lookup(int *uaddr, bool shared, struct futex **fp)
666 {
667 union futex_key fk;
668 struct vmspace *vm = curproc->p_vmspace;
669 vaddr_t va = (vaddr_t)uaddr;
670 int error;
671
672 /*
673 * Reject unaligned user pointers so we don't cross page
674 * boundaries and so atomics will work.
675 */
676 if ((va & 3) != 0)
677 return EINVAL;
678
679 CTASSERT((PAGE_SIZE & 3) == 0);
680
681 /* Look it up. */
682 error = futex_key_init(&fk, vm, va, shared);
683 if (error)
684 return error;
685
686 error = futex_lookup_by_key(&fk, shared, fp);
687 futex_key_fini(&fk, shared);
688 if (error)
689 return error;
690
691 KASSERT(*fp == NULL || (*fp)->fx_shared == shared);
692 KASSERT(*fp == NULL || atomic_load_relaxed(&(*fp)->fx_refcnt) != 0);
693
694 /*
695 * Success! (Caller must still check whether we found
696 * anything, but nothing went _wrong_ like trying to use
697 * unmapped memory.)
698 */
699 KASSERT(error == 0);
700
701 return error;
702 }
703
704 /*
705 * futex_get(uaddr, shared, &f)
706 *
707 * Find or create a futex at the userland pointer uaddr in the
708 * current process's VM space. On success, return the futex in f
709 * and increment its reference count.
710 *
711 * Caller must call futex_put when done.
712 */
713 static int
714 futex_get(int *uaddr, bool shared, struct futex **fp)
715 {
716 union futex_key fk;
717 struct vmspace *vm = curproc->p_vmspace;
718 struct futex *f = NULL;
719 vaddr_t va = (vaddr_t)uaddr;
720 int error;
721
722 /*
723 * Reject unaligned user pointers so we don't cross page
724 * boundaries and so atomics will work.
725 */
726 if ((va & 3) != 0)
727 return EINVAL;
728
729 CTASSERT((PAGE_SIZE & 3) == 0);
730
731 error = futex_key_init(&fk, vm, va, shared);
732 if (error)
733 return error;
734
735 /*
736 * Optimistically assume there already is one, and try to find
737 * it.
738 */
739 error = futex_lookup_by_key(&fk, shared, fp);
740 if (error || *fp != NULL) {
741 /*
742 * We either found one, or there was an error.
743 * In either case, we are done with the key.
744 */
745 futex_key_fini(&fk, shared);
746 goto out;
747 }
748
749 /*
750 * Create a futex record. This transfers ownership of the key
751 * in all cases.
752 */
753 f = futex_create(&fk, shared);
754 if (f == NULL) {
755 error = ENOMEM;
756 goto out;
757 }
758
759 /*
760 * Insert our new futex, or use existing if someone else beat
761 * us to it.
762 */
763 error = futex_insert(f, fp);
764 if (error)
765 goto out;
766 if (*fp == f)
767 f = NULL; /* don't release on exit */
768
769 /* Success! */
770 KASSERT(error == 0);
771
772 out: if (f != NULL)
773 futex_rele(f);
774 KASSERT(error || *fp != NULL);
775 KASSERT(error || atomic_load_relaxed(&(*fp)->fx_refcnt) != 0);
776 return error;
777 }
778
779 /*
780 * futex_put(f)
781 *
782 * Release a futex acquired with futex_get or futex_lookup.
783 */
784 static void
785 futex_put(struct futex *f)
786 {
787
788 futex_rele(f);
789 }
790
791 /*
792 * futex_wait_init(fw, bitset)
793 *
794 * Initialize a record for a thread to wait on a futex matching
795 * the specified bit set. Should be passed to futex_wait_enqueue
796 * before futex_wait, and should be passed to futex_wait_fini when
797 * done.
798 */
799 static void
800 futex_wait_init(struct futex_wait *fw, int bitset)
801 {
802
803 mutex_init(&fw->fw_lock, MUTEX_DEFAULT, IPL_NONE);
804 cv_init(&fw->fw_cv, "futex");
805 fw->fw_futex = NULL;
806 fw->fw_bitset = bitset;
807 }
808
809 /*
810 * futex_wait_fini(fw)
811 *
812 * Finalize a record for a futex waiter. Must not be on any
813 * futex's queue.
814 */
815 static void
816 futex_wait_fini(struct futex_wait *fw)
817 {
818
819 cv_destroy(&fw->fw_cv);
820 mutex_destroy(&fw->fw_lock);
821 }
822
823 /*
824 * futex_wait_enqueue(fw, f)
825 *
826 * Put fw on the futex queue. Must be done before futex_wait.
827 * Caller must hold fw's lock and f's lock, and fw must not be on
828 * any existing futex's waiter list.
829 */
830 static void
831 futex_wait_enqueue(struct futex_wait *fw, struct futex *f)
832 {
833
834 KASSERT(mutex_owned(&f->fx_qlock));
835 KASSERT(mutex_owned(&fw->fw_lock));
836 KASSERT(fw->fw_futex == NULL);
837
838 fw->fw_futex = f;
839 TAILQ_INSERT_TAIL(&f->fx_queue, fw, fw_entry);
840 }
841
842 /*
843 * futex_wait_dequeue(fw, f)
844 *
845 * Remove fw from the futex queue. Precludes subsequent
846 * futex_wait until a futex_wait_enqueue. Caller must hold fw's
847 * lock and f's lock, and fw must be on f.
848 */
849 static void
850 futex_wait_dequeue(struct futex_wait *fw, struct futex *f)
851 {
852
853 KASSERT(mutex_owned(&f->fx_qlock));
854 KASSERT(mutex_owned(&fw->fw_lock));
855 KASSERT(fw->fw_futex == f);
856
857 TAILQ_REMOVE(&f->fx_queue, fw, fw_entry);
858 fw->fw_futex = NULL;
859 }
860
861 /*
862 * futex_wait_abort(fw)
863 *
864 * Caller is no longer waiting for fw. Remove it from any queue
865 * if it was on one.
866 */
867 static void
868 futex_wait_abort(struct futex_wait *fw)
869 {
870 struct futex *f;
871
872 /* Acquire fw_lock so that the content of fw won't change. */
873 mutex_enter(&fw->fw_lock);
874
875 /*
876 * Grab the futex queue. It can't go away as long as we hold
877 * fw_lock. However, we can't take the queue lock because
878 * that's a lock order reversal.
879 */
880 f = fw->fw_futex;
881
882 /* Put us on the abort list so that f won't go away. */
883 mutex_enter(&f->fx_abortlock);
884 LIST_INSERT_HEAD(&f->fx_abortlist, fw, fw_abort);
885 mutex_exit(&f->fx_abortlock);
886
887 /* f is now stable, so we can release fw_lock. */
888 mutex_exit(&fw->fw_lock);
889
890 /* Now we can remove fw under the queue lock. */
891 mutex_enter(&f->fx_qlock);
892 TAILQ_REMOVE(&f->fx_queue, fw, fw_entry);
893 mutex_exit(&f->fx_qlock);
894
895 /*
896 * Finally, remove us from the abort list and notify anyone
897 * waiting for the abort to complete if we were the last to go.
898 */
899 mutex_enter(&f->fx_abortlock);
900 LIST_REMOVE(fw, fw_abort);
901 if (LIST_EMPTY(&f->fx_abortlist))
902 cv_broadcast(&f->fx_abortcv);
903 mutex_exit(&f->fx_abortlock);
904 }
905
906 /*
907 * futex_wait(fw, deadline, clkid)
908 *
909 * fw must be a waiter on a futex's queue. Wait until deadline on
910 * the clock clkid, or forever if deadline is NULL, for a futex
911 * wakeup. Return 0 on explicit wakeup or destruction of futex,
912 * ETIMEDOUT on timeout, EINTR/ERESTART on signal.
913 */
914 static int
915 futex_wait(struct futex_wait *fw, const struct timespec *deadline,
916 clockid_t clkid)
917 {
918 int error = 0;
919
920 /* Test and wait under the wait lock. */
921 mutex_enter(&fw->fw_lock);
922 while (fw->fw_bitset && fw->fw_futex != NULL) {
923 /* Not done yet. Wait. */
924 if (deadline) {
925 struct timespec ts;
926
927 /* Check our watch. */
928 error = clock_gettime1(clkid, &ts);
929 if (error)
930 break;
931
932 /* If we're past the deadline, ETIMEDOUT. */
933 if (timespeccmp(deadline, &ts, <=)) {
934 error = ETIMEDOUT;
935 break;
936 }
937
938 /* Count how much time is left. */
939 timespecsub(deadline, &ts, &ts);
940
941 /* Wait for that much time, allowing signals. */
942 error = cv_timedwait_sig(&fw->fw_cv, &fw->fw_lock,
943 tstohz(&ts));
944 } else {
945 /* Wait indefinitely, allowing signals. */
946 error = cv_wait_sig(&fw->fw_cv, &fw->fw_lock);
947 }
948 if (error) {
949 /* Convert EWOULDBLOCK to ETIMEDOUT. */
950 if (error == EWOULDBLOCK)
951 error = ETIMEDOUT;
952 break;
953 }
954 }
955 mutex_exit(&fw->fw_lock);
956
957 return error;
958 }
959
960 /*
961 * futex_wake(f, nwake, f2, nrequeue, bitset)
962 *
963 * Wake up to nwake waiters on f matching bitset; then, if f2 is
964 * provided, move up to nrequeue remaining waiters on f matching
965 * bitset to f2. Return the number of waiters actually woken.
966 * Caller must hold the locks of f and f2, if provided.
967 */
968 static unsigned
969 futex_wake(struct futex *f, unsigned nwake, struct futex *f2,
970 unsigned nrequeue, int bitset)
971 {
972 struct futex_wait *fw, *fw_next;
973 unsigned nwoken = 0;
974 int hold_error;
975
976 KASSERT(mutex_owned(&f->fx_qlock));
977 KASSERT(f2 == NULL || mutex_owned(&f2->fx_qlock));
978
979 /* Wake up to nwake waiters, and count the number woken. */
980 TAILQ_FOREACH_SAFE(fw, &f->fx_queue, fw_entry, fw_next) {
981 if ((fw->fw_bitset & bitset) == 0)
982 continue;
983 if (nwake-- > 0) {
984 mutex_enter(&fw->fw_lock);
985 futex_wait_dequeue(fw, f);
986 fw->fw_bitset = 0;
987 cv_broadcast(&fw->fw_cv);
988 mutex_exit(&fw->fw_lock);
989 nwoken++;
990 /*
991 * Drop the futex reference on behalf of the
992 * waiter. We assert this is not the last
993 * reference on the futex (our caller should
994 * also have one).
995 */
996 futex_rele_not_last(f);
997 } else {
998 break;
999 }
1000 }
1001
1002 if (f2) {
1003 /* Move up to nrequeue waiters from f's queue to f2's queue. */
1004 TAILQ_FOREACH_SAFE(fw, &f->fx_queue, fw_entry, fw_next) {
1005 if ((fw->fw_bitset & bitset) == 0)
1006 continue;
1007 if (nrequeue-- > 0) {
1008 mutex_enter(&fw->fw_lock);
1009 futex_wait_dequeue(fw, f);
1010 futex_wait_enqueue(fw, f2);
1011 mutex_exit(&fw->fw_lock);
1012 /*
1013 * Transfer the reference from f to f2.
1014 * As above, we assert that we are not
1015 * dropping the last reference to f here.
1016 *
1017 * XXX futex_hold() could theoretically
1018 * XXX fail here.
1019 */
1020 futex_rele_not_last(f);
1021 hold_error = futex_hold(f2);
1022 KASSERT(hold_error == 0);
1023 } else {
1024 break;
1025 }
1026 }
1027 } else {
1028 KASSERT(nrequeue == 0);
1029 }
1030
1031 /* Return the number of waiters woken. */
1032 return nwoken;
1033 }
1034
1035 /*
1036 * futex_queue_lock(f)
1037 *
1038 * Acquire the queue lock of f. Pair with futex_queue_unlock. Do
1039 * not use if caller needs to acquire two locks; use
1040 * futex_queue_lock2 instead.
1041 */
1042 static void
1043 futex_queue_lock(struct futex *f)
1044 {
1045 mutex_enter(&f->fx_qlock);
1046 }
1047
1048 /*
1049 * futex_queue_unlock(f)
1050 *
1051 * Release the queue lock of f.
1052 */
1053 static void
1054 futex_queue_unlock(struct futex *f)
1055 {
1056 mutex_exit(&f->fx_qlock);
1057 }
1058
1059 /*
1060 * futex_queue_lock2(f, f2)
1061 *
1062 * Acquire the queue locks of both f and f2, which may be null, or
1063 * which may have the same underlying queue. If they are
1064 * distinct, an arbitrary total order is chosen on the locks.
1065 *
1066 * Callers should only ever acquire multiple queue locks
1067 * simultaneously using futex_queue_lock2.
1068 */
1069 static void
1070 futex_queue_lock2(struct futex *f, struct futex *f2)
1071 {
1072
1073 /*
1074 * If both are null, do nothing; if one is null and the other
1075 * is not, lock the other and be done with it.
1076 */
1077 if (f == NULL && f2 == NULL) {
1078 return;
1079 } else if (f == NULL) {
1080 mutex_enter(&f2->fx_qlock);
1081 return;
1082 } else if (f2 == NULL) {
1083 mutex_enter(&f->fx_qlock);
1084 return;
1085 }
1086
1087 /* If both futexes are the same, acquire only one. */
1088 if (f == f2) {
1089 mutex_enter(&f->fx_qlock);
1090 return;
1091 }
1092
1093 /* Otherwise, use the ordering on the kva of the futex pointer. */
1094 if ((uintptr_t)f < (uintptr_t)f2) {
1095 mutex_enter(&f->fx_qlock);
1096 mutex_enter(&f2->fx_qlock);
1097 } else {
1098 mutex_enter(&f2->fx_qlock);
1099 mutex_enter(&f->fx_qlock);
1100 }
1101 }
1102
1103 /*
1104 * futex_queue_unlock2(f, f2)
1105 *
1106 * Release the queue locks of both f and f2, which may be null, or
1107 * which may have the same underlying queue.
1108 */
1109 static void
1110 futex_queue_unlock2(struct futex *f, struct futex *f2)
1111 {
1112
1113 /*
1114 * If both are null, do nothing; if one is null and the other
1115 * is not, unlock the other and be done with it.
1116 */
1117 if (f == NULL && f2 == NULL) {
1118 return;
1119 } else if (f == NULL) {
1120 mutex_exit(&f2->fx_qlock);
1121 return;
1122 } else if (f2 == NULL) {
1123 mutex_exit(&f->fx_qlock);
1124 return;
1125 }
1126
1127 /* If both futexes are the same, release only one. */
1128 if (f == f2) {
1129 mutex_exit(&f->fx_qlock);
1130 return;
1131 }
1132
1133 /* Otherwise, use the ordering on the kva of the futex pointer. */
1134 if ((uintptr_t)f < (uintptr_t)f2) {
1135 mutex_exit(&f2->fx_qlock);
1136 mutex_exit(&f->fx_qlock);
1137 } else {
1138 mutex_exit(&f->fx_qlock);
1139 mutex_exit(&f2->fx_qlock);
1140 }
1141 }
1142
1143 /*
1144 * futex_func_wait(uaddr, val, val3, timeout, clkid, clkflags, retval)
1145 *
1146 * Implement futex(FUTEX_WAIT).
1147 */
1148 static int
1149 futex_func_wait(bool shared, int *uaddr, int val, int val3,
1150 const struct timespec *timeout, clockid_t clkid, int clkflags,
1151 register_t *retval)
1152 {
1153 struct futex *f;
1154 struct futex_wait wait, *fw = &wait;
1155 struct timespec ts;
1156 const struct timespec *deadline;
1157 int error;
1158
1159 /* Optimistically test before anything else. */
1160 if (!futex_test(uaddr, val))
1161 return EAGAIN;
1162
1163 /* Determine a deadline on the specified clock. */
1164 if (timeout == NULL || (clkflags & TIMER_ABSTIME) == TIMER_ABSTIME) {
1165 deadline = timeout;
1166 } else {
1167 error = clock_gettime1(clkid, &ts);
1168 if (error)
1169 return error;
1170 timespecadd(&ts, timeout, &ts);
1171 deadline = &ts;
1172 }
1173
1174 /* Get the futex, creating it if necessary. */
1175 error = futex_get(uaddr, shared, &f);
1176 if (error)
1177 return error;
1178 KASSERT(f);
1179
1180 /* Get ready to wait. */
1181 futex_wait_init(fw, val3);
1182
1183 /*
1184 * Under the queue lock, check the value again: if it has
1185 * already changed, EAGAIN; otherwise enqueue the waiter.
1186 * Since FUTEX_WAKE will use the same lock and be done after
1187 * modifying the value, the order in which we check and enqueue
1188 * is immaterial.
1189 */
1190 futex_queue_lock(f);
1191 if (!futex_test(uaddr, val)) {
1192 futex_queue_unlock(f);
1193 error = EAGAIN;
1194 goto out;
1195 }
1196 mutex_enter(&fw->fw_lock);
1197 futex_wait_enqueue(fw, f);
1198 mutex_exit(&fw->fw_lock);
1199 futex_queue_unlock(f);
1200
1201 /*
1202 * We cannot drop our reference to the futex here, because
1203 * we might be enqueued on a different one when we are awakened.
1204 * The references will be managed on our behalf in the requeue
1205 * and wake cases.
1206 */
1207 f = NULL;
1208
1209 /* Wait. */
1210 error = futex_wait(fw, deadline, clkid);
1211 if (error) {
1212 futex_wait_abort(fw);
1213 goto out;
1214 }
1215
1216 /* Return 0 on success, error on failure. */
1217 *retval = 0;
1218
1219 out: if (f != NULL)
1220 futex_put(f);
1221 futex_wait_fini(fw);
1222 return error;
1223 }
1224
1225 /*
1226 * futex_func_wake(uaddr, val, val3, retval)
1227 *
1228 * Implement futex(FUTEX_WAKE) and futex(FUTEX_WAKE_BITSET).
1229 */
1230 static int
1231 futex_func_wake(bool shared, int *uaddr, int val, int val3, register_t *retval)
1232 {
1233 struct futex *f;
1234 unsigned int nwoken = 0;
1235 int error = 0;
1236
1237 /* Reject negative number of wakeups. */
1238 if (val < 0) {
1239 error = EINVAL;
1240 goto out;
1241 }
1242
1243 /* Look up the futex, if any. */
1244 error = futex_lookup(uaddr, shared, &f);
1245 if (error)
1246 goto out;
1247
1248 /* If there's no futex, there are no waiters to wake. */
1249 if (f == NULL)
1250 goto out;
1251
1252 /*
1253 * Under f's queue lock, wake the waiters and remember the
1254 * number woken.
1255 */
1256 futex_queue_lock(f);
1257 nwoken = futex_wake(f, val, NULL, 0, val3);
1258 futex_queue_unlock(f);
1259
1260 /* Release the futex. */
1261 futex_put(f);
1262
1263 out:
1264 /* Return the number of waiters woken. */
1265 *retval = nwoken;
1266
1267 /* Success! */
1268 return error;
1269 }
1270
1271 /*
1272 * futex_func_requeue(op, uaddr, val, uaddr2, val2, val3, retval)
1273 *
1274 * Implement futex(FUTEX_REQUEUE) and futex(FUTEX_CMP_REQUEUE).
1275 */
1276 static int
1277 futex_func_requeue(bool shared, int op, int *uaddr, int val, int *uaddr2,
1278 int val2, int val3, register_t *retval)
1279 {
1280 struct futex *f = NULL, *f2 = NULL;
1281 unsigned nwoken = 0; /* default to zero woken on early return */
1282 int error;
1283
1284 /* Reject negative number of wakeups or requeues. */
1285 if (val < 0 || val2 < 0) {
1286 error = EINVAL;
1287 goto out;
1288 }
1289
1290 /* Look up the source futex, if any. */
1291 error = futex_lookup(uaddr, shared, &f);
1292 if (error)
1293 goto out;
1294
1295 /* If there is none, nothing to do. */
1296 if (f == NULL)
1297 goto out;
1298
1299 /*
1300 * We may need to create the destination futex because it's
1301 * entirely possible it does not currently have any waiters.
1302 */
1303 error = futex_get(uaddr2, shared, &f2);
1304 if (error)
1305 goto out;
1306
1307 /*
1308 * Under the futexes' queue locks, check the value; if
1309 * unchanged from val3, wake the waiters.
1310 */
1311 futex_queue_lock2(f, f2);
1312 if (op == FUTEX_CMP_REQUEUE && !futex_test(uaddr, val3)) {
1313 error = EAGAIN;
1314 } else {
1315 error = 0;
1316 nwoken = futex_wake(f, val, f2, val2, FUTEX_BITSET_MATCH_ANY);
1317 }
1318 futex_queue_unlock2(f, f2);
1319
1320 out:
1321 /* Return the number of waiters woken. */
1322 *retval = nwoken;
1323
1324 /* Release the futexes if we got them. */
1325 if (f2)
1326 futex_put(f2);
1327 if (f)
1328 futex_put(f);
1329 return error;
1330 }
1331
1332 /*
1333 * futex_validate_op_cmp(val3)
1334 *
1335 * Validate an op/cmp argument for FUTEX_WAKE_OP.
1336 */
1337 static int
1338 futex_validate_op_cmp(int val3)
1339 {
1340 int op = __SHIFTOUT(val3, FUTEX_OP_OP_MASK);
1341 int cmp = __SHIFTOUT(val3, FUTEX_OP_CMP_MASK);
1342
1343 if (op & FUTEX_OP_OPARG_SHIFT) {
1344 int oparg = __SHIFTOUT(val3, FUTEX_OP_OPARG_MASK);
1345 if (oparg < 0)
1346 return EINVAL;
1347 if (oparg >= 32)
1348 return EINVAL;
1349 op &= ~FUTEX_OP_OPARG_SHIFT;
1350 }
1351
1352 switch (op) {
1353 case FUTEX_OP_SET:
1354 case FUTEX_OP_ADD:
1355 case FUTEX_OP_OR:
1356 case FUTEX_OP_ANDN:
1357 case FUTEX_OP_XOR:
1358 break;
1359 default:
1360 return EINVAL;
1361 }
1362
1363 switch (cmp) {
1364 case FUTEX_OP_CMP_EQ:
1365 case FUTEX_OP_CMP_NE:
1366 case FUTEX_OP_CMP_LT:
1367 case FUTEX_OP_CMP_LE:
1368 case FUTEX_OP_CMP_GT:
1369 case FUTEX_OP_CMP_GE:
1370 break;
1371 default:
1372 return EINVAL;
1373 }
1374
1375 return 0;
1376 }
1377
1378 /*
1379 * futex_compute_op(oldval, val3)
1380 *
1381 * Apply a FUTEX_WAIT_OP operation to oldval.
1382 */
1383 static int
1384 futex_compute_op(int oldval, int val3)
1385 {
1386 int op = __SHIFTOUT(val3, FUTEX_OP_OP_MASK);
1387 int oparg = __SHIFTOUT(val3, FUTEX_OP_OPARG_MASK);
1388
1389 if (op & FUTEX_OP_OPARG_SHIFT) {
1390 KASSERT(oparg >= 0);
1391 KASSERT(oparg < 32);
1392 oparg = 1u << oparg;
1393 op &= ~FUTEX_OP_OPARG_SHIFT;
1394 }
1395
1396 switch (op) {
1397 case FUTEX_OP_SET:
1398 return oparg;
1399
1400 case FUTEX_OP_ADD:
1401 /*
1402 * Avoid signed arithmetic overflow by doing
1403 * arithmetic unsigned and converting back to signed
1404 * at the end.
1405 */
1406 return (int)((unsigned)oldval + (unsigned)oparg);
1407
1408 case FUTEX_OP_OR:
1409 return oldval | oparg;
1410
1411 case FUTEX_OP_ANDN:
1412 return oldval & ~oparg;
1413
1414 case FUTEX_OP_XOR:
1415 return oldval ^ oparg;
1416
1417 default:
1418 panic("invalid futex op");
1419 }
1420 }
1421
1422 /*
1423 * futex_compute_cmp(oldval, val3)
1424 *
1425 * Apply a FUTEX_WAKE_OP comparison to oldval.
1426 */
1427 static bool
1428 futex_compute_cmp(int oldval, int val3)
1429 {
1430 int cmp = __SHIFTOUT(val3, FUTEX_OP_CMP_MASK);
1431 int cmparg = __SHIFTOUT(val3, FUTEX_OP_CMPARG_MASK);
1432
1433 switch (cmp) {
1434 case FUTEX_OP_CMP_EQ:
1435 return (oldval == cmparg);
1436
1437 case FUTEX_OP_CMP_NE:
1438 return (oldval != cmparg);
1439
1440 case FUTEX_OP_CMP_LT:
1441 return (oldval < cmparg);
1442
1443 case FUTEX_OP_CMP_LE:
1444 return (oldval <= cmparg);
1445
1446 case FUTEX_OP_CMP_GT:
1447 return (oldval > cmparg);
1448
1449 case FUTEX_OP_CMP_GE:
1450 return (oldval >= cmparg);
1451
1452 default:
1453 panic("invalid futex cmp operation");
1454 }
1455 }
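/*
 * Worked example (a sketch; the kernel never builds val3 itself): a
 * val3 asking FUTEX_WAKE_OP to store 1 into *uaddr2 and to wake
 * uaddr2's waiters only if the old value was 0 can be composed from
 * the same masks decoded above:
 *
 *	int val3 = __SHIFTIN(FUTEX_OP_SET, FUTEX_OP_OP_MASK) |
 *	    __SHIFTIN(1, FUTEX_OP_OPARG_MASK) |
 *	    __SHIFTIN(FUTEX_OP_CMP_EQ, FUTEX_OP_CMP_MASK) |
 *	    __SHIFTIN(0, FUTEX_OP_CMPARG_MASK);
 *
 * For that encoding, futex_compute_op(oldval, val3) returns 1 and
 * futex_compute_cmp(oldval, val3) returns (oldval == 0).
 */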
1456
1457 /*
1458 * futex_func_wake_op(uaddr, val, uaddr2, val2, val3, retval)
1459 *
1460 * Implement futex(FUTEX_WAKE_OP).
1461 */
1462 static int
1463 futex_func_wake_op(bool shared, int *uaddr, int val, int *uaddr2, int val2,
1464 int val3, register_t *retval)
1465 {
1466 struct futex *f = NULL, *f2 = NULL;
1467 int oldval, newval, actual;
1468 unsigned nwoken = 0;
1469 int error;
1470
1471 /* Reject negative number of wakeups. */
1472 if (val < 0 || val2 < 0) {
1473 error = EINVAL;
1474 goto out;
1475 }
1476
1477 /* Reject invalid operations before we start doing things. */
1478 if ((error = futex_validate_op_cmp(val3)) != 0)
1479 goto out;
1480
1481 /* Look up the first futex, if any. */
1482 error = futex_lookup(uaddr, shared, &f);
1483 if (error)
1484 goto out;
1485
1486 /* Look up the second futex, if any. */
1487 error = futex_lookup(uaddr2, shared, &f2);
1488 if (error)
1489 goto out;
1490
1491 /*
1492 * Under the queue locks:
1493 *
1494 * 1. Read/modify/write: *uaddr2 op= oparg.
1495 * 2. Unconditionally wake uaddr.
1496 * 3. Conditionally wake uaddr2, if it previously matched val2.
1497 */
1498 futex_queue_lock2(f, f2);
1499 do {
1500 error = futex_load(uaddr2, &oldval);
1501 if (error)
1502 goto out_unlock;
1503 newval = futex_compute_op(oldval, val3);
1504 error = ucas_int(uaddr2, oldval, newval, &actual);
1505 if (error)
1506 goto out_unlock;
1507 } while (actual != oldval);
1508 nwoken = (f ? futex_wake(f, val, NULL, 0, FUTEX_BITSET_MATCH_ANY) : 0);
1509 if (f2 && futex_compute_cmp(oldval, val3))
1510 nwoken += futex_wake(f2, val2, NULL, 0,
1511 FUTEX_BITSET_MATCH_ANY);
1512
1513 /* Success! */
1514 error = 0;
1515 out_unlock:
1516 futex_queue_unlock2(f, f2);
1517
1518 out:
1519 /* Return the number of waiters woken. */
1520 *retval = nwoken;
1521
1522 /* Release the futexes, if we got them. */
1523 if (f2)
1524 futex_put(f2);
1525 if (f)
1526 futex_put(f);
1527 return error;
1528 }
1529
1530 /*
1531 * do_futex(uaddr, op, val, timeout, uaddr2, val2, val3)
1532 *
1533 * Implement the futex system call with all the parameters
1534 * parsed out.
1535 */
1536 int
1537 do_futex(int *uaddr, int op, int val, const struct timespec *timeout,
1538 int *uaddr2, int val2, int val3, register_t *retval)
1539 {
1540 const bool shared = (op & FUTEX_PRIVATE_FLAG) ? false : true;
1541 const clockid_t clkid = (op & FUTEX_CLOCK_REALTIME) ? CLOCK_REALTIME
1542 : CLOCK_MONOTONIC;
1543
1544 op &= FUTEX_CMD_MASK;
1545
1546 switch (op) {
1547 case FUTEX_WAIT:
1548 return futex_func_wait(shared, uaddr, val,
1549 FUTEX_BITSET_MATCH_ANY, timeout, clkid, TIMER_RELTIME,
1550 retval);
1551
1552 case FUTEX_WAKE:
1553 val3 = FUTEX_BITSET_MATCH_ANY;
1554 /* FALLTHROUGH */
1555 case FUTEX_WAKE_BITSET:
1556 return futex_func_wake(shared, uaddr, val, val3, retval);
1557
1558 case FUTEX_REQUEUE:
1559 case FUTEX_CMP_REQUEUE:
1560 return futex_func_requeue(shared, op, uaddr, val, uaddr2,
1561 val2, val3, retval);
1562
1563 case FUTEX_WAIT_BITSET:
1564 return futex_func_wait(shared, uaddr, val, val3, timeout,
1565 clkid, TIMER_ABSTIME, retval);
1566
1567 case FUTEX_WAKE_OP:
1568 return futex_func_wake_op(shared, uaddr, val, uaddr2, val2,
1569 val3, retval);
1570
1571 case FUTEX_FD:
1572 default:
1573 return ENOSYS;
1574 }
1575 }
1576
1577 /*
1578 * sys___futex(l, uap, retval)
1579 *
1580 * __futex(2) system call: generic futex operations.
1581 */
1582 int
1583 sys___futex(struct lwp *l, const struct sys___futex_args *uap,
1584 register_t *retval)
1585 {
1586 /* {
1587 syscallarg(int *) uaddr;
1588 syscallarg(int) op;
1589 syscallarg(int) val;
1590 syscallarg(const struct timespec *) timeout;
1591 syscallarg(int *) uaddr2;
1592 syscallarg(int) val2;
1593 syscallarg(int) val3;
1594 } */
1595 struct timespec ts, *tsp;
1596 int error;
1597
1598 /*
1599 * Copy in the timeout argument, if specified.
1600 */
1601 if (SCARG(uap, timeout)) {
1602 error = copyin(SCARG(uap, timeout), &ts, sizeof(ts));
1603 if (error)
1604 return error;
1605 tsp = &ts;
1606 } else {
1607 tsp = NULL;
1608 }
1609
1610 return do_futex(SCARG(uap, uaddr), SCARG(uap, op), SCARG(uap, val),
1611 tsp, SCARG(uap, uaddr2), SCARG(uap, val2), SCARG(uap, val3),
1612 retval);
1613 }
1614
1615 /*
1616 * sys___futex_set_robust_list(l, uap, retval)
1617 *
1618 * __futex_set_robust_list(2) system call for robust futexes.
1619 */
1620 int
1621 sys___futex_set_robust_list(struct lwp *l,
1622 const struct sys___futex_set_robust_list_args *uap, register_t *retval)
1623 {
1624 /* {
1625 syscallarg(void *) head;
1626 syscallarg(size_t) len;
1627 } */
1628 void *head = SCARG(uap, head);
1629
1630 if (SCARG(uap, len) != _FUTEX_ROBUST_HEAD_SIZE)
1631 return EINVAL;
1632 if ((uintptr_t)head % sizeof(u_long))
1633 return EINVAL;
1634
1635 l->l_robust_head = (uintptr_t)head;
1636
1637 return 0;
1638 }
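/*
 * For reference: the head registered above is _FUTEX_ROBUST_HEAD_NWORDS
 * words, which futex_release_all_lwp() below indexes as
 * _FUTEX_ROBUST_HEAD_LIST (first list entry; the list ends when an
 * entry points back at the head), _FUTEX_ROBUST_HEAD_OFFSET (signed
 * offset added to each entry address to reach its futex word), and
 * _FUTEX_ROBUST_HEAD_PENDING (entry currently being acquired or
 * released).  A hypothetical userland declaration, for illustration
 * only (the authoritative layout is the _FUTEX_ROBUST_HEAD_* ABI):
 *
 *	struct robust_head {
 *		void	*list;		// first entry, or &list if empty
 *		long	offset;		// entry address + offset = futex word
 *		void	*pending;	// entry being acquired or released
 *	};
 */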
1639
1640 /*
1641 * sys___futex_get_robust_list(l, uap, retval)
1642 *
1643 * __futex_get_robust_list(2) system call for robust futexes.
1644 */
1645 int
1646 sys___futex_get_robust_list(struct lwp *l,
1647 const struct sys___futex_get_robust_list_args *uap, register_t *retval)
1648 {
1649 /* {
1650 syscallarg(lwpid_t) lwpid;
1651 syscallarg(void **) headp;
1652 syscallarg(size_t *) lenp;
1653 } */
1654 void *head;
1655 const size_t len = _FUTEX_ROBUST_HEAD_SIZE;
1656 int error;
1657
1658 error = futex_robust_head_lookup(l, SCARG(uap, lwpid), &head);
1659 if (error)
1660 return error;
1661
1662 /* Copy out the head pointer and the head structure length. */
1663 error = copyout(&head, SCARG(uap, headp), sizeof(head));
1664 if (__predict_true(error == 0)) {
1665 error = copyout(&len, SCARG(uap, lenp), sizeof(len));
1666 }
1667
1668 return error;
1669 }
1670
1671 /*
1672 * release_futex(uva, tid)
1673 *
1674 * Try to release the robust futex at uva in the current process
1675 * on lwp exit. If anything goes wrong, silently fail. It is the
1676 * userland program's obligation to arrange correct behaviour.
1677 */
1678 static void
1679 release_futex(uintptr_t const uptr, lwpid_t const tid, bool const is_pi,
1680 bool const is_pending)
1681 {
1682 int *uaddr;
1683 struct futex *f;
1684 int oldval, newval, actual;
1685 int error;
1686
1687 /* If it's misaligned, tough. */
1688 if (__predict_false(uptr & 3))
1689 return;
1690 uaddr = (int *)uptr;
1691
1692 error = futex_load(uaddr, &oldval);
1693 if (__predict_false(error))
1694 return;
1695
1696 /*
1697 * There are two race conditions we need to handle here:
1698 *
1699 * 1. User space cleared the futex word but died before
1700 * being able to issue the wakeup. No wakeups will
1701 * ever be issued, oops!
1702 *
1703 * 2. Awakened waiter died before being able to acquire
1704 * the futex in user space. Any other waiters are
1705 * now stuck, oops!
1706 *
1707 * In both of these cases, the futex word will be 0 (because
1708 * it's updated before the wake is issued). The best we can
1709 * do is detect this situation if it's the pending futex and
1710 * issue a wake without modifying the futex word.
1711 *
1712 * XXX eventual PI handling?
1713 */
1714 if (__predict_false(is_pending && (oldval & ~FUTEX_WAITERS) == 0)) {
1715 register_t retval;
1716 (void) futex_func_wake(/*shared*/true, uaddr, 1,
1717 FUTEX_BITSET_MATCH_ANY, &retval);
1718 return;
1719 }
1720
1721 /* Optimistically test whether we need to do anything at all. */
1722 if ((oldval & FUTEX_TID_MASK) != tid)
1723 return;
1724
1725 /*
1726 * We need to handle the case where this thread owned the futex,
1727 * but it was uncontended. In this case, there won't be any
1728 * kernel state to look up. All we can do is mark the futex
1729 * as a zombie to be mopped up the next time another thread
1730 * attempts to acquire it.
1731 *
1732 * N.B. It's important to ensure we set FUTEX_OWNER_DIED in
1733 * this loop, even if waiters appear while we are doing
1734 * so. This is because FUTEX_WAITERS is set by user space
1735 * before calling __futex() to wait, and the futex needs
1736 * to be marked as a zombie when the new waiter gets into
1737 * the kernel.
1738 */
1739 if ((oldval & FUTEX_WAITERS) == 0) {
1740 do {
1741 error = futex_load(uaddr, &oldval);
1742 if (error)
1743 return;
1744 if ((oldval & FUTEX_TID_MASK) != tid)
1745 return;
1746 newval = oldval | FUTEX_OWNER_DIED;
1747 error = ucas_int(uaddr, oldval, newval, &actual);
1748 if (error)
1749 return;
1750 } while (actual != oldval);
1751
1752 /*
1753 * If there is still no indication of waiters, then there is
1754 * no more work for us to do.
1755 */
1756 if ((oldval & FUTEX_WAITERS) == 0)
1757 return;
1758 }
1759
1760 /*
1761 * Look for a shared futex since we have no positive indication
1762 * it is private. If we can't, tough.
1763 */
1764 error = futex_lookup(uaddr, /*shared*/true, &f);
1765 if (error)
1766 return;
1767
1768 /*
1769 * If there's no kernel state for this futex, there's nothing to
1770 * release.
1771 */
1772 if (f == NULL)
1773 return;
1774
1775 /* Work under the futex queue lock. */
1776 futex_queue_lock(f);
1777
1778 /*
1779 * Fetch the word: if the tid doesn't match ours, skip;
1780 * otherwise, set the owner-died bit, atomically.
1781 */
1782 do {
1783 error = futex_load(uaddr, &oldval);
1784 if (error)
1785 goto out;
1786 if ((oldval & FUTEX_TID_MASK) != tid)
1787 goto out;
1788 newval = oldval | FUTEX_OWNER_DIED;
1789 error = ucas_int(uaddr, oldval, newval, &actual);
1790 if (error)
1791 goto out;
1792 } while (actual != oldval);
1793
1794 /*
1795 * If there may be waiters, try to wake one. If anything goes
1796 * wrong, tough.
1797 *
1798 * XXX eventual PI handling?
1799 */
1800 if (oldval & FUTEX_WAITERS)
1801 (void)futex_wake(f, 1, NULL, 0, FUTEX_BITSET_MATCH_ANY);
1802
1803 /* Unlock the queue and release the futex. */
1804 out: futex_queue_unlock(f);
1805 futex_put(f);
1806 }
1807
1808 /*
1809 * futex_robust_head_lookup(l, lwpid)
1810 *
1811 * Helper function to look up a robust head by LWP ID.
1812 */
1813 int
1814 futex_robust_head_lookup(struct lwp *l, lwpid_t lwpid, void **headp)
1815 {
1816 struct proc *p = l->l_proc;
1817
1818 /* Find the other lwp, if requested; otherwise use our robust head. */
1819 if (lwpid) {
1820 mutex_enter(p->p_lock);
1821 l = lwp_find(p, lwpid);
1822 if (l == NULL) {
1823 mutex_exit(p->p_lock);
1824 return ESRCH;
1825 }
1826 *headp = (void *)l->l_robust_head;
1827 mutex_exit(p->p_lock);
1828 } else {
1829 *headp = (void *)l->l_robust_head;
1830 }
1831 return 0;
1832 }
1833
1834 /*
1835 * futex_fetch_robust_head(uaddr)
1836 *
1837 * Helper routine to fetch the futex robust list head that
1838 * handles 32-bit binaries running on 64-bit kernels.
1839 */
1840 static int
1841 futex_fetch_robust_head(uintptr_t uaddr, u_long *rhead)
1842 {
1843 #ifdef _LP64
1844 if (curproc->p_flag & PK_32) {
1845 uint32_t rhead32[_FUTEX_ROBUST_HEAD_NWORDS];
1846 int error;
1847
1848 error = copyin((void *)uaddr, rhead32, sizeof(rhead32));
1849 if (__predict_true(error == 0)) {
1850 for (int i = 0; i < _FUTEX_ROBUST_HEAD_NWORDS; i++) {
1851 if (i == _FUTEX_ROBUST_HEAD_OFFSET) {
1852 /*
1853 * Make sure the offset is sign-
1854 * extended.
1855 */
1856 rhead[i] = (int32_t)rhead32[i];
1857 } else {
1858 rhead[i] = rhead32[i];
1859 }
1860 }
1861 }
1862 return error;
1863 }
1864 #endif /* _LP64 */
1865
1866 return copyin((void *)uaddr, rhead,
1867 sizeof(*rhead) * _FUTEX_ROBUST_HEAD_NWORDS);
1868 }
1869
1870 /*
1871 * futex_decode_robust_word(word)
1872 *
1873 * Decode a robust futex list word into the entry and entry
1874 * properties.
1875 */
1876 static inline void
1877 futex_decode_robust_word(uintptr_t const word, uintptr_t * const entry,
1878 bool * const is_pi)
1879 {
1880 *is_pi = (word & _FUTEX_ROBUST_ENTRY_PI) ? true : false;
1881 *entry = word & ~_FUTEX_ROBUST_ENTRY_PI;
1882 }
1883
1884 /*
1885 * futex_fetch_robust_entry(uaddr)
1886 *
1887 * Helper routine to fetch and decode a robust futex entry
1888 * that handles 32-bit binaries running on 64-bit kernels.
1889 */
1890 static int
1891 futex_fetch_robust_entry(uintptr_t const uaddr, uintptr_t * const valp,
1892 bool * const is_pi)
1893 {
1894 uintptr_t val = 0;
1895 int error = 0;
1896
1897 #ifdef _LP64
1898 if (curproc->p_flag & PK_32) {
1899 uint32_t val32;
1900
1901 error = ufetch_32((uint32_t *)uaddr, &val32);
1902 if (__predict_true(error == 0))
1903 val = val32;
1904 } else
1905 #endif /* _LP64 */
1906 error = ufetch_long((u_long *)uaddr, (u_long *)&val);
1907 if (__predict_false(error))
1908 return error;
1909
1910 futex_decode_robust_word(val, valp, is_pi);
1911 return 0;
1912 }
1913
1914 /*
1915 * futex_release_all_lwp(l, tid)
1916 *
1917 * Release all l's robust futexes. If anything looks funny in
1918 * the process, give up -- it's userland's responsibility to dot
1919 * the i's and cross the t's.
1920 */
1921 void
1922 futex_release_all_lwp(struct lwp * const l, lwpid_t const tid)
1923 {
1924 u_long rhead[_FUTEX_ROBUST_HEAD_NWORDS];
1925 int limit = 1000000;
1926 int error;
1927
1928 /* If there's no robust list there's nothing to do. */
1929 if (l->l_robust_head == 0)
1930 return;
1931
1932 /* Read the final snapshot of the robust list head. */
1933 error = futex_fetch_robust_head(l->l_robust_head, rhead);
1934 if (error) {
1935 printf("WARNING: pid %jd (%s) lwp %jd tid %jd:"
1936 " unmapped robust futex list head\n",
1937 (uintmax_t)l->l_proc->p_pid, l->l_proc->p_comm,
1938 (uintmax_t)l->l_lid, (uintmax_t)tid);
1939 return;
1940 }
1941
1942 const long offset = (long)rhead[_FUTEX_ROBUST_HEAD_OFFSET];
1943
1944 uintptr_t next, pending;
1945 bool is_pi, pending_is_pi;
1946
1947 futex_decode_robust_word(rhead[_FUTEX_ROBUST_HEAD_LIST],
1948 &next, &is_pi);
1949 futex_decode_robust_word(rhead[_FUTEX_ROBUST_HEAD_PENDING],
1950 &pending, &pending_is_pi);
1951
1952 /*
1953 * Walk down the list of locked futexes and release them, up
1954 * to one million of them before we give up.
1955 */
1956
1957 while (next != l->l_robust_head && limit-- > 0) {
1958 /* pending handled below. */
1959 if (next != pending)
1960 release_futex(next + offset, tid, is_pi, false);
1961 error = futex_fetch_robust_entry(next, &next, &is_pi);
1962 if (error)
1963 break;
1964 preempt_point();
1965 }
1966 if (limit <= 0) {
1967 printf("WARNING: pid %jd (%s) lwp %jd tid %jd:"
1968 " exhausted robust futex limit\n",
1969 (uintmax_t)l->l_proc->p_pid, l->l_proc->p_comm,
1970 (uintmax_t)l->l_lid, (uintmax_t)tid);
1971 }
1972
1973 /* If there's a pending futex, it may need to be released too. */
1974 if (pending != 0) {
1975 release_futex(pending + offset, tid, pending_is_pi, true);
1976 }
1977 }
1978