1 /*	$NetBSD: sys_futex.c,v 1.11.2.2 2021/04/03 21:52:20 thorpej Exp $	*/
2
3 /*-
4 * Copyright (c) 2018, 2019, 2020 The NetBSD Foundation, Inc.
5 * All rights reserved.
6 *
7 * This code is derived from software contributed to The NetBSD Foundation
8 * by Taylor R. Campbell and Jason R. Thorpe.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 * POSSIBILITY OF SUCH DAMAGE.
30 */
31
32 #include <sys/cdefs.h>
33 __KERNEL_RCSID(0, "$NetBSD: sys_futex.c,v 1.11.2.2 2021/04/03 21:52:20 thorpej Exp $");
34
35 /*
36 * Futexes
37 *
38 * The futex system call coordinates notifying threads waiting for
39 * changes on a 32-bit word of memory. The word can be managed by
40 * CPU atomic operations in userland, without system calls, as long
41 * as there is no contention.
42 *
43 * The simplest use case demonstrating the utility is:
44 *
45 * // 32-bit word of memory shared among threads or
46 * // processes in userland. lock & 1 means owned;
47 * // lock & 2 means there are waiters waiting.
48 * volatile int lock = 0;
49 *
50 * int v;
51 *
52 * // Acquire a lock.
53 * do {
54 * v = lock;
55 * if (v & 1) {
56 * // Lock is held. Set a bit to say that
57 * // there are waiters, and wait for lock
58 * // to change to anything other than v;
59 * // then retry.
60 * if (atomic_cas_uint(&lock, v, v | 2) != v)
61 * continue;
62 * futex(FUTEX_WAIT, &lock, v | 2, NULL, NULL, 0);
63 * continue;
64 * }
65  *	} while (atomic_cas_uint(&lock, v, v | 1) != v);
66 * membar_enter();
67 *
68 * ...
69 *
70 * // Release the lock. Optimistically assume there are
71 * // no waiters first until demonstrated otherwise.
72 * membar_exit();
73 * if (atomic_cas_uint(&lock, 1, 0) != 1) {
74 * // There may be waiters.
75 * v = atomic_swap_uint(&lock, 0);
76 * // If there are still waiters, wake one.
77 * if (v & 2)
78 * futex(FUTEX_WAKE, &lock, 1, NULL, NULL, 0);
79 * }
80 *
81 * The goal is to avoid the futex system call unless there is
82 * contention; then if there is contention, to guarantee no missed
83 * wakeups.
84 *
85 * For a simple implementation, futex(FUTEX_WAIT) could queue
86 * itself to be woken, double-check the lock word, and then sleep;
87 * spurious wakeups are generally a fact of life, so any
88 * FUTEX_WAKE could just wake every FUTEX_WAIT in the system.
89 *
90 * If this were all there is to it, we could then increase
91 * parallelism by refining the approximation: partition the
92 * waiters into buckets by hashing the lock addresses to reduce
93 * the incidence of spurious wakeups. But this is not all.
94 *
95 * The futex(FUTEX_CMP_REQUEUE, &lock, n, &lock2, m, val)
96 * operation not only wakes n waiters on lock if lock == val, but
97 * also _transfers_ m additional waiters to lock2. Unless wakeups
98 * on lock2 also trigger wakeups on lock, we cannot move waiters
99 * to lock2 if they merely share the same hash as waiters on lock.
100 * Thus, we can't approximately distribute waiters into queues by
101 * a hash function; we must distinguish futex queues exactly by
102 * lock address.
103 *
104 * For now, we use a global red/black tree to index futexes. This
105 * should be replaced by a lockless radix tree with a thread to
106 * free entries no longer in use once all lookups on all CPUs have
107 * completed.
108 *
109 * Specifically, we maintain two maps:
110 *
111 * futex_tab.va[vmspace, va] for private futexes
112 * futex_tab.oa[uvm_voaddr] for shared futexes
113 *
114 * This implementation does not support priority inheritance.
115 */
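/*
 * A hedged illustration (userland pseudocode in the same loose notation
 * as the example above, not part of this kernel's API): the exact-address
 * requirement matters for patterns like a condition variable broadcast,
 * which wakes one waiter on the condvar word and requeues the rest onto
 * the mutex word so they are later woken by FUTEX_WAKE on &mtx rather
 * than stampeding now:
 *
 *	// cv and mtx are 32-bit words shared among threads; sequence
 *	// semantics and error handling are omitted for brevity.
 *	volatile int cv, mtx;
 *
 *	// Broadcast: bump the sequence number, then wake one waiter on
 *	// cv and transfer the rest to mtx if cv is still unchanged.
 *	atomic_inc_uint(&cv);
 *	futex(FUTEX_CMP_REQUEUE, &cv, 1, &mtx, INT_MAX, cv);
 *
 * A later FUTEX_WAKE on &mtx must find the transferred waiters, which
 * is only possible if the kernel indexes them by mtx's exact address.
 */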
116
117 #include <sys/types.h>
118 #include <sys/atomic.h>
119 #include <sys/condvar.h>
120 #include <sys/futex.h>
121 #include <sys/mutex.h>
122 #include <sys/rbtree.h>
123 #include <sys/sleepq.h>
124 #include <sys/queue.h>
125 #include <sys/sdt.h>
126
127 #include <sys/syscall.h>
128 #include <sys/syscallargs.h>
129 #include <sys/syscallvar.h>
130
131 #include <uvm/uvm_extern.h>
132
133 /*
134 * DTrace probes.
135 */
136 SDT_PROVIDER_DEFINE(futex);
137
138 /* entry: uaddr, val, bitset, timeout, clkflags, fflags */
139 /* exit: error */
140 SDT_PROBE_DEFINE6(futex, func, wait, entry, "int *", "int", "int",
141 "struct timespec *", "int", "int");
142 SDT_PROBE_DEFINE1(futex, func, wait, exit, "int");
143
144 /* entry: uaddr, nwake, bitset, fflags */
145 /* exit: error, nwoken */
146 SDT_PROBE_DEFINE4(futex, func, wake, entry, "int *", "int", "int", "int");
147 SDT_PROBE_DEFINE2(futex, func, wake, exit, "int", "int");
148
149 /* entry: uaddr, nwake, uaddr2, nrequeue, fflags */
150 /* exit: error, nwoken */
151 SDT_PROBE_DEFINE5(futex, func, requeue, entry, "int *", "int", "int *", "int",
152 "int");
153 SDT_PROBE_DEFINE2(futex, func, requeue, exit, "int", "int");
154
155 /* entry: uaddr, nwake, uaddr2, nrequeue, val3, fflags */
156 /* exit: error, nwoken */
157 SDT_PROBE_DEFINE6(futex, func, cmp_requeue, entry, "int *", "int", "int *",
158 "int", "int", "int");
159 SDT_PROBE_DEFINE2(futex, func, cmp_requeue, exit, "int", "int");
160
161 /* entry: uaddr, nwake, uaddr2, nwake2, wakeop, fflags */
162 /* exit: error, nwoken */
163 SDT_PROBE_DEFINE6(futex, func, wake_op, entry, "int *", "int", "int *", "int",
164 "int", "int");
165 SDT_PROBE_DEFINE2(futex, func, wake_op, exit, "int", "int");
166
167 /* entry: uaddr, val, r/w, abstime, fflags */
168 /* exit: error */
169 SDT_PROBE_DEFINE5(futex, func, rw_wait, entry, "int *", "int", "int",
170 "struct timespec *", "int");
171 SDT_PROBE_DEFINE1(futex, func, rw_wait, exit, "int");
172
173 /* entry: uaddr, val, fflags */
174 /* exit: error, nwoken */
175 SDT_PROBE_DEFINE3(futex, func, rw_handoff, entry, "int *", "int", "int");
176 SDT_PROBE_DEFINE2(futex, func, rw_handoff, exit, "int", "int");
177
178 SDT_PROBE_DEFINE0(futex, wait, finish, normally);
179 SDT_PROBE_DEFINE0(futex, wait, finish, wakerace);
180 SDT_PROBE_DEFINE0(futex, wait, finish, aborted);
181
182 /* entry: timo */
183 /* exit: error */
184 SDT_PROBE_DEFINE1(futex, wait, sleepq_block, entry, "int");
185 SDT_PROBE_DEFINE1(futex, wait, sleepq_block, exit, "int");
186
187 /*
188 * Lock order:
189 *
190 * futex_tab.lock
191 * futex::fx_op_lock ordered by kva of struct futex
192 * -> futex::fx_sq_lock ordered by kva of sleepq lock
193 *
194 * N.B. multiple futexes can share a single sleepq lock.
195 */
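/*
 * For example, a cross-futex wake/requeue below nests the locks in that
 * order (a sketch of the pattern only; the futex_op_lock2/futex_sq_lock2
 * helpers defined below handle NULL arguments and the kva ordering):
 *
 *	futex_op_lock2(f, f2);		// fx_op_lock(s), ordered by futex kva
 *	... check the futex word, decide whom to wake or requeue ...
 *	futex_sq_lock2(f, f2);		// fx_sq_lock(s), ordered by lock kva
 *	... move LWPs between sleepqs, wake them ...
 *	futex_sq_unlock2(f, f2);
 *	futex_op_unlock2(f, f2);
 */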
196
197 /*
198 * union futex_key
199 *
200 * A futex is addressed either by a vmspace+va (private) or by
201 * a uvm_voaddr (shared).
202 */
203 union futex_key {
204 struct {
205 struct vmspace *vmspace;
206 vaddr_t va;
207 } fk_private;
208 struct uvm_voaddr fk_shared;
209 };
210
211 /*
212 * struct futex
213 *
214 * Kernel state for a futex located at a particular address in a
215 * particular virtual address space.
216 *
217 * N.B. fx_refcnt is an unsigned long because we need to be able
218 * to operate on it atomically on all systems while at the same
219 * time rendering practically impossible the chance of it reaching
220 * its max value. In practice, we're limited by the number of LWPs
221 * that can be present on the system at any given time, and the
222 * assumption is that limit will be good enough on a 32-bit platform.
223 * See futex_wake() for why overflow needs to be avoided.
224 *
225 * XXX Since futex addresses must be 4-byte aligned, we could
226 * XXX squirrel away fx_shared and fx_on_tree bits in the "va"
227 * XXX field of the key. Worth it?
228 */
229 struct futex {
230 union futex_key fx_key;
231 struct rb_node fx_node;
232 unsigned long fx_refcnt;
233 bool fx_shared;
234 bool fx_on_tree;
235 uint8_t fx_class;
236
237 kmutex_t fx_op_lock; /* adaptive */
238 kmutex_t * fx_sq_lock; /* &sleepq_locks[...] */
239 sleepq_t fx_sqs[2]; /* 0=reader, 1=writer */
240 unsigned int fx_nwaiters[2];
241 };
242
243 /*
244 * futex classes: Some futex operations can only be carried out on
245 * futexes that are known to be abiding by a certain protocol. These
246 * classes are assigned to a futex when created due to a wait event,
247 * and when subsequent wake or requeue operations are issued, the
248 * class is checked at futex lookup time. If the class does not match,
249 * EINVAL is the result.
250 */
251 #define FUTEX_CLASS_ANY 0 /* match any class in lookup */
252 #define FUTEX_CLASS_NORMAL 1 /* normal futex */
253 #define FUTEX_CLASS_RWLOCK 2 /* rwlock futex */
254 #define FUTEX_CLASS_PI 3 /* for FUTEX_*_PI ops */
255
256 /* sleepq definitions */
257 #define FUTEX_READERQ 0
258 #define FUTEX_WRITERQ 1
259
260 CTASSERT(FUTEX_READERQ == FUTEX_RW_READER);
261 CTASSERT(FUTEX_WRITERQ == FUTEX_RW_WRITER);
262
263 static const char futex_wmesg[] = "futex";
264
265 static void futex_unsleep(lwp_t *, bool);
266
267 static syncobj_t futex_syncobj = {
268 .sobj_flag = SOBJ_SLEEPQ_SORTED,
269 .sobj_unsleep = futex_unsleep,
270 .sobj_changepri = sleepq_changepri,
271 .sobj_lendpri = sleepq_lendpri,
272 .sobj_owner = syncobj_noowner,
273 };
274
275 /*
276 * futex_tab
277 *
278 * Global trees of futexes by vmspace/va and VM object address.
279 *
280 * XXX This obviously doesn't scale in parallel. We could use a
281 * pserialize-safe data structure, but there may be a high cost to
282 * frequent deletion since we don't cache futexes after we're done
283 * with them. We could use hashed locks. But for now, just make
284 * sure userland can't DoS the serial performance, by using a
285 * balanced binary tree for lookup.
286 *
287 * XXX We could use a per-process tree for the table indexed by
288 * virtual address to reduce contention between processes.
289 */
290 static struct {
291 kmutex_t lock;
292 struct rb_tree va;
293 struct rb_tree oa;
294 } futex_tab __cacheline_aligned;
295
296 static int
297 compare_futex_key(void *cookie, const void *n, const void *k)
298 {
299 const struct futex *fa = n;
300 const union futex_key *fka = &fa->fx_key;
301 const union futex_key *fkb = k;
302
303 if ((uintptr_t)fka->fk_private.vmspace <
304 (uintptr_t)fkb->fk_private.vmspace)
305 return -1;
306 if ((uintptr_t)fka->fk_private.vmspace >
307 (uintptr_t)fkb->fk_private.vmspace)
308 return +1;
309 if (fka->fk_private.va < fkb->fk_private.va)
310 return -1;
311 	if (fka->fk_private.va > fkb->fk_private.va)
312 		return +1;
313 return 0;
314 }
315
316 static int
317 compare_futex(void *cookie, const void *na, const void *nb)
318 {
319 const struct futex *fa = na;
320 const struct futex *fb = nb;
321
322 return compare_futex_key(cookie, fa, &fb->fx_key);
323 }
324
325 static const rb_tree_ops_t futex_rb_ops = {
326 .rbto_compare_nodes = compare_futex,
327 .rbto_compare_key = compare_futex_key,
328 .rbto_node_offset = offsetof(struct futex, fx_node),
329 };
330
331 static int
332 compare_futex_shared_key(void *cookie, const void *n, const void *k)
333 {
334 const struct futex *fa = n;
335 const union futex_key *fka = &fa->fx_key;
336 const union futex_key *fkb = k;
337
338 return uvm_voaddr_compare(&fka->fk_shared, &fkb->fk_shared);
339 }
340
341 static int
342 compare_futex_shared(void *cookie, const void *na, const void *nb)
343 {
344 const struct futex *fa = na;
345 const struct futex *fb = nb;
346
347 return compare_futex_shared_key(cookie, fa, &fb->fx_key);
348 }
349
350 static const rb_tree_ops_t futex_shared_rb_ops = {
351 .rbto_compare_nodes = compare_futex_shared,
352 .rbto_compare_key = compare_futex_shared_key,
353 .rbto_node_offset = offsetof(struct futex, fx_node),
354 };
355
356 /*
357 * futex_sq_lock2(f, f2)
358 *
359 * Acquire the sleepq locks for f and f2, which may be null, or
360 * which may be the same. If they are distinct, an arbitrary total
361 * order is chosen on the locks.
362 *
363 * Callers should only ever acquire multiple sleepq locks
364 * simultaneously using futex_sq_lock2.
365 */
366 static void
367 futex_sq_lock2(struct futex * const f, struct futex * const f2)
368 {
369
370 /*
371 * If both are null, do nothing; if one is null and the other
372 * is not, lock the other and be done with it.
373 */
374 if (f == NULL && f2 == NULL) {
375 return;
376 } else if (f == NULL) {
377 mutex_spin_enter(f2->fx_sq_lock);
378 return;
379 } else if (f2 == NULL) {
380 mutex_spin_enter(f->fx_sq_lock);
381 return;
382 }
383
384 kmutex_t * const m = f->fx_sq_lock;
385 kmutex_t * const m2 = f2->fx_sq_lock;
386
387 /* If both locks are the same, acquire only one. */
388 if (m == m2) {
389 mutex_spin_enter(m);
390 return;
391 }
392
393 /* Otherwise, use the ordering on the kva of the lock pointer. */
394 if ((uintptr_t)m < (uintptr_t)m2) {
395 mutex_spin_enter(m);
396 mutex_spin_enter(m2);
397 } else {
398 mutex_spin_enter(m2);
399 mutex_spin_enter(m);
400 }
401 }
402
403 /*
404 * futex_sq_unlock2(f, f2)
405 *
406 * Release the sleep queue locks for f and f2, which may be null, or
407 * which may have the same underlying lock.
408 */
409 static void
410 futex_sq_unlock2(struct futex * const f, struct futex * const f2)
411 {
412
413 /*
414 * If both are null, do nothing; if one is null and the other
415 * is not, unlock the other and be done with it.
416 */
417 if (f == NULL && f2 == NULL) {
418 return;
419 } else if (f == NULL) {
420 mutex_spin_exit(f2->fx_sq_lock);
421 return;
422 } else if (f2 == NULL) {
423 mutex_spin_exit(f->fx_sq_lock);
424 return;
425 }
426
427 kmutex_t * const m = f->fx_sq_lock;
428 kmutex_t * const m2 = f2->fx_sq_lock;
429
430 /* If both locks are the same, release only one. */
431 if (m == m2) {
432 mutex_spin_exit(m);
433 return;
434 }
435
436 /* Otherwise, use the ordering on the kva of the lock pointer. */
437 if ((uintptr_t)m < (uintptr_t)m2) {
438 mutex_spin_exit(m2);
439 mutex_spin_exit(m);
440 } else {
441 mutex_spin_exit(m);
442 mutex_spin_exit(m2);
443 }
444 }
445
446 /*
447 * futex_load(uaddr, kaddr)
448 *
449 * Perform a single atomic load to read *uaddr, and return the
450 * result in *kaddr. Return 0 on success, EFAULT if uaddr is not
451 * mapped.
452 */
453 static inline int
454 futex_load(int *uaddr, int *kaddr)
455 {
456 return ufetch_int((u_int *)uaddr, (u_int *)kaddr);
457 }
458
459 /*
460 * futex_test(uaddr, expected)
461 *
462 * True if *uaddr == expected. False if *uaddr != expected, or if
463 * uaddr is not mapped.
464 */
465 static bool
466 futex_test(int *uaddr, int expected)
467 {
468 int val;
469 int error;
470
471 error = futex_load(uaddr, &val);
472 if (error)
473 return false;
474 return val == expected;
475 }
476
477 static pool_cache_t futex_cache __read_mostly;
478
479 static int futex_ctor(void *, void *, int);
480 static void futex_dtor(void *, void *);
481
482 /*
483 * futex_sys_init()
484 *
485 * Initialize the futex subsystem.
486 */
487 void
488 futex_sys_init(void)
489 {
490
491 mutex_init(&futex_tab.lock, MUTEX_DEFAULT, IPL_NONE);
492 rb_tree_init(&futex_tab.va, &futex_rb_ops);
493 rb_tree_init(&futex_tab.oa, &futex_shared_rb_ops);
494
495 futex_cache = pool_cache_init(sizeof(struct futex),
496 coherency_unit, 0, 0, "futex", NULL, IPL_NONE, futex_ctor,
497 futex_dtor, NULL);
498 }
499
500 /*
501 * futex_sys_fini()
502 *
503 * Finalize the futex subsystem.
504 */
505 void
506 futex_sys_fini(void)
507 {
508
509 KASSERT(RB_TREE_MIN(&futex_tab.oa) == NULL);
510 KASSERT(RB_TREE_MIN(&futex_tab.va) == NULL);
511 mutex_destroy(&futex_tab.lock);
512 }
513
514 /*
515 * futex_ctor()
516 *
517 * Futex object constructor.
518 */
519 static int
520 futex_ctor(void *arg __unused, void *obj, int flags __unused)
521 {
522 extern sleepqlock_t sleepq_locks[SLEEPTAB_HASH_SIZE];
523 struct futex * const f = obj;
524
525 mutex_init(&f->fx_op_lock, MUTEX_DEFAULT, IPL_NONE);
526 f->fx_sq_lock = &sleepq_locks[SLEEPTAB_HASH(f)].lock;
527
528 sleepq_init(&f->fx_sqs[FUTEX_READERQ]);
529 sleepq_init(&f->fx_sqs[FUTEX_WRITERQ]);
530 f->fx_nwaiters[FUTEX_READERQ] = f->fx_nwaiters[FUTEX_WRITERQ] = 0;
531
532 return 0;
533 }
534
535 /*
536 * futex_dtor()
537 *
538 * Futex object destructor.
539 */
540 static void
541 futex_dtor(void *arg __unused, void *obj)
542 {
543 struct futex * const f = obj;
544
545 mutex_destroy(&f->fx_op_lock);
546 f->fx_sq_lock = NULL;
547 }
548
549 /*
550 * futex_key_init(key, vm, va, shared)
551 *
552 * Initialize a futex key for lookup, etc.
553 */
554 static int
555 futex_key_init(union futex_key *fk, struct vmspace *vm, vaddr_t va, bool shared)
556 {
557 int error = 0;
558
559 if (__predict_false(shared)) {
560 if (!uvm_voaddr_acquire(&vm->vm_map, va, &fk->fk_shared))
561 error = EFAULT;
562 } else {
563 fk->fk_private.vmspace = vm;
564 fk->fk_private.va = va;
565 }
566
567 return error;
568 }
569
570 /*
571 * futex_key_fini(key, shared)
572 *
573 * Release a futex key.
574 */
575 static void
576 futex_key_fini(union futex_key *fk, bool shared)
577 {
578 if (__predict_false(shared))
579 uvm_voaddr_release(&fk->fk_shared);
580 memset(fk, 0, sizeof(*fk));
581 }
582
583 /*
584 * futex_create(fk, shared)
585 *
586 * Create a futex. Initial reference count is 1, representing the
587 * caller. Returns NULL on failure. Always takes ownership of the
588 * key, either transferring it to the newly-created futex, or releasing
589 * the key if creation fails.
590 *
591 * Never sleeps for memory, but may sleep to acquire a lock.
592 */
593 static struct futex *
594 futex_create(union futex_key *fk, bool shared, uint8_t class)
595 {
596 struct futex *f;
597
598 f = pool_cache_get(futex_cache, PR_NOWAIT);
599 if (f == NULL) {
600 futex_key_fini(fk, shared);
601 return NULL;
602 }
603 f->fx_key = *fk;
604 f->fx_refcnt = 1;
605 f->fx_shared = shared;
606 f->fx_on_tree = false;
607 f->fx_class = class;
608
609 return f;
610 }
611
612 /*
613 * futex_destroy(f)
614 *
615 * Destroy a futex created with futex_create. Reference count
616 * must be zero.
617 *
618 * May sleep.
619 */
620 static void
621 futex_destroy(struct futex *f)
622 {
623
624 ASSERT_SLEEPABLE();
625
626 KASSERT(atomic_load_relaxed(&f->fx_refcnt) == 0);
627 KASSERT(!f->fx_on_tree);
628 KASSERT(LIST_EMPTY(&f->fx_sqs[FUTEX_READERQ]));
629 KASSERT(LIST_EMPTY(&f->fx_sqs[FUTEX_WRITERQ]));
630 KASSERT(f->fx_nwaiters[FUTEX_READERQ] == 0);
631 KASSERT(f->fx_nwaiters[FUTEX_WRITERQ] == 0);
632
633 futex_key_fini(&f->fx_key, f->fx_shared);
634
635 pool_cache_put(futex_cache, f);
636 }
637
638 /*
639 * futex_hold_count(f, n)
640 *
641 * Attempt to acquire a reference to f. Return 0 on success,
642 * ENFILE on too many references.
643 *
644 * Never sleeps.
645 */
646 static int
647 futex_hold_count(struct futex *f, unsigned long const count)
648 {
649 unsigned long refcnt;
650
651 do {
652 refcnt = atomic_load_relaxed(&f->fx_refcnt);
653 if (ULONG_MAX - refcnt < count)
654 return ENFILE;
655 } while (atomic_cas_ulong(&f->fx_refcnt, refcnt,
656 refcnt + count) != refcnt);
657
658 return 0;
659 }
660 #define futex_hold(f) futex_hold_count(f, 1)
661
662 /*
663 * futex_rele_count(f, n)
664 *
665 * Release a reference to f acquired with futex_create or
666 * futex_hold.
667 *
668 * May sleep to free f.
669 */
670 static void
671 futex_rele_count(struct futex *f, unsigned long const count)
672 {
673 unsigned long refcnt;
674
675 ASSERT_SLEEPABLE();
676
677 do {
678 refcnt = atomic_load_relaxed(&f->fx_refcnt);
679 KASSERT(refcnt >= count);
680 if (refcnt - count == 0)
681 goto trylast;
682 } while (atomic_cas_ulong(&f->fx_refcnt, refcnt,
683 refcnt - count) != refcnt);
684 return;
685
686 trylast:
687 KASSERT(count <= LONG_MAX);
688 mutex_enter(&futex_tab.lock);
689 if (atomic_add_long_nv(&f->fx_refcnt, -(long)count) == 0) {
690 if (f->fx_on_tree) {
691 if (__predict_false(f->fx_shared))
692 rb_tree_remove_node(&futex_tab.oa, f);
693 else
694 rb_tree_remove_node(&futex_tab.va, f);
695 f->fx_on_tree = false;
696 }
697 } else {
698 /* References remain -- don't destroy it. */
699 f = NULL;
700 }
701 mutex_exit(&futex_tab.lock);
702 if (f != NULL)
703 futex_destroy(f);
704 }
705 #define futex_rele(f) futex_rele_count(f, 1)
706
707 /*
708 * futex_rele_count_not_last(f, n)
709 *
710 * Release a reference to f acquired with futex_create or
711 * futex_hold.
712 *
713 * This version asserts that we are not dropping the last
714 * reference to f.
715 */
716 static void
717 futex_rele_count_not_last(struct futex *f, unsigned long const count)
718 {
719 unsigned long refcnt;
720
721 do {
722 refcnt = atomic_load_relaxed(&f->fx_refcnt);
723 KASSERT(refcnt >= count);
724 } while (atomic_cas_ulong(&f->fx_refcnt, refcnt,
725 refcnt - count) != refcnt);
726 }
727
728 /*
729 * futex_lookup_by_key(key, shared, class, &f)
730 *
731  *	Try to find an existing futex va reference in the specified key.
732 * On success, return 0, set f to found futex or to NULL if not found,
733 * and increment f's reference count if found.
734 *
735 * Return ENFILE if reference count too high.
736 *
737 * Internal lookup routine shared by futex_lookup() and
738 * futex_lookup_create().
739 */
740 static int
741 futex_lookup_by_key(union futex_key *fk, bool shared, uint8_t class,
742 struct futex **fp)
743 {
744 struct futex *f;
745 int error = 0;
746
747 mutex_enter(&futex_tab.lock);
748 if (__predict_false(shared)) {
749 f = rb_tree_find_node(&futex_tab.oa, fk);
750 } else {
751 f = rb_tree_find_node(&futex_tab.va, fk);
752 }
753 if (f) {
754 if (__predict_true(f->fx_class == class ||
755 class == FUTEX_CLASS_ANY))
756 error = futex_hold(f);
757 else
758 error = EINVAL;
759 if (error)
760 f = NULL;
761 }
762 *fp = f;
763 mutex_exit(&futex_tab.lock);
764
765 return error;
766 }
767
768 /*
769 * futex_insert(f, fp)
770 *
771 * Try to insert the futex f into the tree by va. If there
772 * already is a futex for its va, acquire a reference to it, and
773 * store it in *fp; otherwise store f in *fp.
774 *
775 * Return 0 on success, ENFILE if there already is a futex but its
776 * reference count is too high.
777 */
778 static int
779 futex_insert(struct futex *f, struct futex **fp)
780 {
781 struct futex *f0;
782 int error;
783
784 KASSERT(atomic_load_relaxed(&f->fx_refcnt) != 0);
785 KASSERT(!f->fx_on_tree);
786
787 mutex_enter(&futex_tab.lock);
788 if (__predict_false(f->fx_shared))
789 f0 = rb_tree_insert_node(&futex_tab.oa, f);
790 else
791 f0 = rb_tree_insert_node(&futex_tab.va, f);
792 if (f0 == f) {
793 f->fx_on_tree = true;
794 error = 0;
795 } else {
796 KASSERT(atomic_load_relaxed(&f0->fx_refcnt) != 0);
797 KASSERT(f0->fx_on_tree);
798 error = futex_hold(f0);
799 if (error)
800 goto out;
801 }
802 *fp = f0;
803 out: mutex_exit(&futex_tab.lock);
804
805 return error;
806 }
807
808 /*
809 * futex_lookup(uaddr, shared, class, &f)
810 *
811 * Find a futex at the userland pointer uaddr in the current
812 * process's VM space. On success, return the futex in f and
813 * increment its reference count.
814 *
815 * Caller must call futex_rele when done.
816 */
817 static int
818 futex_lookup(int *uaddr, bool shared, uint8_t class, struct futex **fp)
819 {
820 union futex_key fk;
821 struct vmspace *vm = curproc->p_vmspace;
822 vaddr_t va = (vaddr_t)uaddr;
823 int error;
824
825 /*
826 * Reject unaligned user pointers so we don't cross page
827 * boundaries and so atomics will work.
828 */
829 if (__predict_false((va & 3) != 0))
830 return EINVAL;
831
832 /* Look it up. */
833 error = futex_key_init(&fk, vm, va, shared);
834 if (error)
835 return error;
836
837 error = futex_lookup_by_key(&fk, shared, class, fp);
838 futex_key_fini(&fk, shared);
839 if (error)
840 return error;
841
842 KASSERT(*fp == NULL || (*fp)->fx_shared == shared);
843 KASSERT(*fp == NULL || (*fp)->fx_class == class);
844 KASSERT(*fp == NULL || atomic_load_relaxed(&(*fp)->fx_refcnt) != 0);
845
846 /*
847 * Success! (Caller must still check whether we found
848 * anything, but nothing went _wrong_ like trying to use
849 * unmapped memory.)
850 */
851 KASSERT(error == 0);
852
853 return error;
854 }
855
856 /*
857 * futex_lookup_create(uaddr, shared, class, &f)
858 *
859 * Find or create a futex at the userland pointer uaddr in the
860 * current process's VM space. On success, return the futex in f
861 * and increment its reference count.
862 *
863 * Caller must call futex_rele when done.
864 */
865 static int
866 futex_lookup_create(int *uaddr, bool shared, uint8_t class, struct futex **fp)
867 {
868 union futex_key fk;
869 struct vmspace *vm = curproc->p_vmspace;
870 struct futex *f = NULL;
871 vaddr_t va = (vaddr_t)uaddr;
872 int error;
873
874 /*
875 * Reject unaligned user pointers so we don't cross page
876 * boundaries and so atomics will work.
877 */
878 if (__predict_false((va & 3) != 0))
879 return EINVAL;
880
881 if (__predict_false(class == FUTEX_CLASS_ANY))
882 return EINVAL;
883
884 error = futex_key_init(&fk, vm, va, shared);
885 if (error)
886 return error;
887
888 /*
889 * Optimistically assume there already is one, and try to find
890 * it.
891 */
892 error = futex_lookup_by_key(&fk, shared, class, fp);
893 if (error || *fp != NULL) {
894 /*
895 * We either found one, or there was an error.
896 * In either case, we are done with the key.
897 */
898 futex_key_fini(&fk, shared);
899 goto out;
900 }
901
902 /*
903 	 * Create a futex record.  This transfers ownership of the key
904 * in all cases.
905 */
906 f = futex_create(&fk, shared, class);
907 if (f == NULL) {
908 error = ENOMEM;
909 goto out;
910 }
911
912 /*
913 * Insert our new futex, or use existing if someone else beat
914 * us to it.
915 */
916 error = futex_insert(f, fp);
917 if (error)
918 goto out;
919 if (*fp == f)
920 f = NULL; /* don't release on exit */
921
922 /* Success! */
923 KASSERT(error == 0);
924
925 out: if (f != NULL)
926 futex_rele(f);
927 KASSERT(error || *fp != NULL);
928 KASSERT(error || atomic_load_relaxed(&(*fp)->fx_refcnt) != 0);
929 return error;
930 }
931
932 /*
933 * futex_unsleep:
934 *
935 * Remove an LWP from the futex and sleep queue. This is called when
936 * the LWP has not been awoken normally but instead interrupted: for
937 * example, when a signal is received. Must be called with the LWP
938 * locked. Will unlock if "unlock" is true.
939 */
940 static void
941 futex_unsleep(lwp_t *l, bool unlock)
942 {
943 struct futex *f __diagused;
944
945 f = (struct futex *)(uintptr_t)l->l_wchan;
946
947 KASSERT(mutex_owned(f->fx_sq_lock));
948
949 /* WRITERQ is more likely */
950 if (__predict_true(l->l_sleepq == &f->fx_sqs[FUTEX_WRITERQ])) {
951 KASSERT(f->fx_nwaiters[FUTEX_WRITERQ] > 0);
952 f->fx_nwaiters[FUTEX_WRITERQ]--;
953 } else {
954 KASSERT(l->l_sleepq == &f->fx_sqs[FUTEX_READERQ]);
955 KASSERT(f->fx_nwaiters[FUTEX_READERQ] > 0);
956 f->fx_nwaiters[FUTEX_READERQ]--;
957 }
958
959 sleepq_unsleep(l, unlock);
960 }
961
962 /*
963 * futex_wait(f, timeout, clkid, bitset)
964 *
965 * Wait until deadline on the clock clkid, or forever if deadline
966 * is NULL, for a futex wakeup. Return 0 on explicit wakeup,
967 * ETIMEDOUT on timeout, EINTR on signal.
968 */
969 static int
970 futex_wait(struct futex *f, int q, const struct timespec *deadline,
971 clockid_t clkid, int bitset)
972 {
973
974 /*
975 * Some things to note about how this function works:
976 *
977 * ==> When we enter futex_wait(), the futex's op lock is
978 * held, preventing us from missing wakes. Once we are on
979 * the futex's sleepq, it is safe to release the op lock.
980 *
981 * ==> We have a single early escape to avoid calling
982 * sleepq_block(): our deadline already having passed.
983 * If this is a no-timeout wait, or if the deadline has
984 * not already passed, we are guaranteed to call sleepq_block().
985 *
986 * ==> sleepq_block() contains all of the necessary logic
987 * for handling signals; we do not need to check for them
988 * ourselves.
989 *
990 * ==> Once we have blocked, other futex operations will
991 * be able to wake us or requeue us. Until we have blocked,
992 * those other futex operations will not be able to acquire
993 * the sleepq locks in order to perform those operations,
994 * even if we have dropped the op lock. Thus, we will not
995 * miss wakes. This fundamental invariant is relied upon by
996 * every other synchronization object in the kernel.
997 *
998 * ==> sleepq_block() returns for one of three reasons:
999 *
1000 * -- We were awakened.
1001 * -- We were signaled.
1002 * -- We timed out.
1003 *
1004 * ==> Once sleepq_block() returns, we will not call
1005 * sleepq_block() again.
1006 *
1007 * ==> If sleepq_block() returns due to being signaled,
1008 * we must check to see if we were also awakened. This
1009 * is to handle the following sequence:
1010 *
1011 * -- Futex wake takes us off sleepq, places us on runq.
1012 * -- We are signaled while sitting on the runq.
1013 * -- We run, and sleepq_block() notices the signal and
1014 * returns EINTR/ERESTART.
1015 *
1016 * In this situation, we want to indicate a successful wake
1017 * to the caller, because that wake has been reported to the
1018 * thread that issued it.
1019 *
1020 * Note that it is NOT possible for a wake to be issued after
1021 * a signal; the LWP will already have been taken off the sleepq
1022 * by the signal, so the wake will never happen. Note that for
1023 * this reason, we must *never* return ERESTART, because it could
1024 * result in us waiting again with no one to awaken us.
1025 */
1026
1027 struct lwp * const l = curlwp;
1028 int timo;
1029 int error;
1030
1031 /*
1032 * Compute our timeout before taking any locks.
1033 */
1034 if (deadline) {
1035 struct timespec ts;
1036
1037 /* Check our watch. */
1038 error = clock_gettime1(clkid, &ts);
1039 if (error) {
1040 mutex_exit(&f->fx_op_lock);
1041 return error;
1042 }
1043
1044 /*
1045 * If we're past the deadline, ETIMEDOUT.
1046 */
1047 if (timespeccmp(deadline, &ts, <=)) {
1048 mutex_exit(&f->fx_op_lock);
1049 return ETIMEDOUT;
1050 } else {
1051 /* Count how much time is left. */
1052 timespecsub(deadline, &ts, &ts);
1053 timo = tstohz(&ts);
1054 if (timo == 0) {
1055 /*
1056 * tstohz() already rounds up, and
1057 * a return value of 0 ticks means
1058 * "expired now or in the past".
1059 */
1060 mutex_exit(&f->fx_op_lock);
1061 return ETIMEDOUT;
1062 }
1063 }
1064 } else {
1065 timo = 0;
1066 }
1067
1068 /*
1069 * Acquire in sleepq -> lwp order. While we're at it, ensure
1070 * that there's room for us to block.
1071 */
1072 mutex_spin_enter(f->fx_sq_lock);
1073 if (__predict_false(q == FUTEX_READERQ &&
1074 f->fx_nwaiters[q] == FUTEX_TID_MASK)) {
1075 mutex_spin_exit(f->fx_sq_lock);
1076 mutex_exit(&f->fx_op_lock);
1077 return ENFILE;
1078 }
1079 lwp_lock(l);
1080
1081 /*
1082 * No need for locks here; until we're on a futex's sleepq, these
1083 * values are private to the LWP, and the locks needed to get onto
1084 * the sleepq provide the memory barriers needed to ensure visibility.
1085 */
1086 l->l_futex = f;
1087 l->l_futex_wakesel = bitset;
1088
1089 /*
1090 * This is an inlined bit of sleepq_enter();
1091 * we already hold the lwp lock.
1092 */
1093 lwp_unlock_to(l, f->fx_sq_lock);
1094 KERNEL_UNLOCK_ALL(NULL, &l->l_biglocks);
1095 KASSERT(lwp_locked(l, f->fx_sq_lock));
1096
1097 sleepq_enqueue(&f->fx_sqs[q], f, futex_wmesg, &futex_syncobj, true);
1098 f->fx_nwaiters[q]++;
1099
1100 /* We can now safely release the op lock. */
1101 mutex_exit(&f->fx_op_lock);
1102
1103 SDT_PROBE1(futex, wait, sleepq_block, entry, timo);
1104 error = sleepq_block(timo, true);
1105 SDT_PROBE1(futex, wait, sleepq_block, exit, error);
1106
1107 /*
1108 * f is no longer valid; we may have been moved to another
1109 * futex sleepq while we slept.
1110 */
1111
1112 /*
1113 * If something went wrong, we may still have a futex reference.
1114 * Check for that here and drop it if so. We can do this without
1115 * taking any locks because l->l_futex is private to the LWP when
1116 * not on any futex's sleepq, and we are not on any sleepq because
1117 * we are running.
1118 *
1119 * Convert EWOULDBLOCK to ETIMEDOUT in case sleepq_block() returned
1120 * EWOULDBLOCK, and ensure the only other error we return is EINTR.
1121 */
1122 if (error) {
1123 f = l->l_futex;
1124 if (f != NULL) {
1125 SDT_PROBE0(futex, wait, finish, aborted);
1126 l->l_futex = NULL;
1127 futex_rele(f);
1128 } else {
1129 /* Raced with wakeup; report success. */
1130 SDT_PROBE0(futex, wait, finish, wakerace);
1131 error = 0;
1132 }
1133 if (error == EWOULDBLOCK)
1134 error = ETIMEDOUT;
1135 		else if (error != 0 && error != ETIMEDOUT)
1136 error = EINTR;
1137 } else {
1138 KASSERT(l->l_futex == NULL);
1139 SDT_PROBE0(futex, wait, finish, normally);
1140 }
1141
1142 return error;
1143 }
1144
1145 /*
1146 * futex_wake(f, q, nwake, f2, q2, nrequeue, bitset)
1147 *
1148 * Wake up to nwake waiters on f matching bitset; then, if f2 is
1149 * provided, move up to nrequeue remaining waiters on f matching
1150 * bitset to f2. Return the number of waiters actually woken.
1151 * Caller must hold the locks of f and f2, if provided.
1152 */
1153 static unsigned
1154 futex_wake(struct futex *f, int q, unsigned int const nwake,
1155 struct futex *f2, int q2, unsigned int const nrequeue,
1156 int bitset)
1157 {
1158 struct lwp *l, *l_next;
1159 unsigned nwoken = 0;
1160 unsigned nrequeued = 0;
1161
1162 KASSERT(mutex_owned(&f->fx_op_lock));
1163 KASSERT(f2 == NULL || mutex_owned(&f2->fx_op_lock));
1164
1165 futex_sq_lock2(f, f2);
1166
1167 /* Wake up to nwake waiters, and count the number woken. */
1168 LIST_FOREACH_SAFE(l, &f->fx_sqs[q], l_sleepchain, l_next) {
1169 if (nwoken == nwake)
1170 break;
1171
1172 KASSERT(l->l_syncobj == &futex_syncobj);
1173 KASSERT(l->l_wchan == (wchan_t)f);
1174 KASSERT(l->l_futex == f);
1175 KASSERT(l->l_sleepq == &f->fx_sqs[q]);
1176 KASSERT(l->l_mutex == f->fx_sq_lock);
1177
1178 if ((l->l_futex_wakesel & bitset) == 0)
1179 continue;
1180
1181 l->l_futex_wakesel = 0;
1182 l->l_futex = NULL;
1183 sleepq_remove(&f->fx_sqs[q], l);
1184 f->fx_nwaiters[q]--;
1185 nwoken++;
1186 /* hold counts adjusted in bulk below */
1187 }
1188
1189 if (nrequeue) {
1190 KASSERT(f2 != NULL);
1191 /* Move up to nrequeue waiters from f's queue to f2's queue. */
1192 LIST_FOREACH_SAFE(l, &f->fx_sqs[q], l_sleepchain, l_next) {
1193 if (nrequeued == nrequeue)
1194 break;
1195
1196 KASSERT(l->l_syncobj == &futex_syncobj);
1197 KASSERT(l->l_wchan == (wchan_t)f);
1198 KASSERT(l->l_futex == f);
1199 KASSERT(l->l_sleepq == &f->fx_sqs[q]);
1200 KASSERT(l->l_mutex == f->fx_sq_lock);
1201
1202 if ((l->l_futex_wakesel & bitset) == 0)
1203 continue;
1204
1205 l->l_futex = f2;
1206 sleepq_transfer(l, &f->fx_sqs[q],
1207 &f2->fx_sqs[q2], f2, futex_wmesg,
1208 &futex_syncobj, f2->fx_sq_lock, true);
1209 f->fx_nwaiters[q]--;
1210 f2->fx_nwaiters[q2]++;
1211 nrequeued++;
1212 /* hold counts adjusted in bulk below */
1213 }
1214 if (nrequeued) {
1215 /* XXX futex_hold() could theoretically fail here. */
1216 int hold_error __diagused =
1217 futex_hold_count(f2, nrequeued);
1218 KASSERT(hold_error == 0);
1219 }
1220 }
1221
1222 /*
1223 * Adjust the futex reference counts for the wakes and
1224 * requeues.
1225 */
1226 KASSERT(nwoken + nrequeued <= LONG_MAX);
1227 futex_rele_count_not_last(f, nwoken + nrequeued);
1228
1229 futex_sq_unlock2(f, f2);
1230
1231 /* Return the number of waiters woken and requeued. */
1232 return nwoken + nrequeued;
1233 }
1234
1235 /*
1236 * futex_op_lock(f)
1237 *
1238 * Acquire the op lock of f. Pair with futex_op_unlock. Do
1239 * not use if caller needs to acquire two locks; use
1240 * futex_op_lock2 instead.
1241 */
1242 static void
1243 futex_op_lock(struct futex *f)
1244 {
1245 mutex_enter(&f->fx_op_lock);
1246 }
1247
1248 /*
1249 * futex_op_unlock(f)
1250 *
1251  *	Release the op lock of f.
1252 */
1253 static void
1254 futex_op_unlock(struct futex *f)
1255 {
1256 mutex_exit(&f->fx_op_lock);
1257 }
1258
1259 /*
1260 * futex_op_lock2(f, f2)
1261 *
1262 * Acquire the op locks of both f and f2, which may be null, or
1263 * which may be the same. If they are distinct, an arbitrary total
1264 * order is chosen on the locks.
1265 *
1266 * Callers should only ever acquire multiple op locks
1267 * simultaneously using futex_op_lock2.
1268 */
1269 static void
1270 futex_op_lock2(struct futex *f, struct futex *f2)
1271 {
1272
1273 /*
1274 * If both are null, do nothing; if one is null and the other
1275 * is not, lock the other and be done with it.
1276 */
1277 if (f == NULL && f2 == NULL) {
1278 return;
1279 } else if (f == NULL) {
1280 mutex_enter(&f2->fx_op_lock);
1281 return;
1282 } else if (f2 == NULL) {
1283 mutex_enter(&f->fx_op_lock);
1284 return;
1285 }
1286
1287 /* If both futexes are the same, acquire only one. */
1288 if (f == f2) {
1289 mutex_enter(&f->fx_op_lock);
1290 return;
1291 }
1292
1293 /* Otherwise, use the ordering on the kva of the futex pointer. */
1294 if ((uintptr_t)f < (uintptr_t)f2) {
1295 mutex_enter(&f->fx_op_lock);
1296 mutex_enter(&f2->fx_op_lock);
1297 } else {
1298 mutex_enter(&f2->fx_op_lock);
1299 mutex_enter(&f->fx_op_lock);
1300 }
1301 }
1302
1303 /*
1304 * futex_op_unlock2(f, f2)
1305 *
1306  *	Release the op locks of both f and f2, which may be null, or
1307  *	which may be the same futex.
1308 */
1309 static void
1310 futex_op_unlock2(struct futex *f, struct futex *f2)
1311 {
1312
1313 /*
1314 * If both are null, do nothing; if one is null and the other
1315 * is not, unlock the other and be done with it.
1316 */
1317 if (f == NULL && f2 == NULL) {
1318 return;
1319 } else if (f == NULL) {
1320 mutex_exit(&f2->fx_op_lock);
1321 return;
1322 } else if (f2 == NULL) {
1323 mutex_exit(&f->fx_op_lock);
1324 return;
1325 }
1326
1327 /* If both futexes are the same, release only one. */
1328 if (f == f2) {
1329 mutex_exit(&f->fx_op_lock);
1330 return;
1331 }
1332
1333 /* Otherwise, use the ordering on the kva of the futex pointer. */
1334 if ((uintptr_t)f < (uintptr_t)f2) {
1335 mutex_exit(&f2->fx_op_lock);
1336 mutex_exit(&f->fx_op_lock);
1337 } else {
1338 mutex_exit(&f->fx_op_lock);
1339 mutex_exit(&f2->fx_op_lock);
1340 }
1341 }
1342
1343 /*
1344 * futex_func_wait(uaddr, val, val3, timeout, clkid, clkflags, retval)
1345 *
1346 * Implement futex(FUTEX_WAIT) and futex(FUTEX_WAIT_BITSET).
1347 */
1348 static int
1349 futex_func_wait(bool shared, int *uaddr, int val, int val3,
1350 const struct timespec *timeout, clockid_t clkid, int clkflags,
1351 register_t *retval)
1352 {
1353 struct futex *f;
1354 struct timespec ts;
1355 const struct timespec *deadline;
1356 int error;
1357
1358 /*
1359 * If there's nothing to wait for, and nobody will ever wake
1360 * us, then don't set anything up to wait -- just stop here.
1361 */
1362 if (val3 == 0)
1363 return EINVAL;
1364
1365 /* Optimistically test before anything else. */
1366 if (!futex_test(uaddr, val))
1367 return EAGAIN;
1368
1369 /* Determine a deadline on the specified clock. */
1370 if (timeout == NULL) {
1371 deadline = timeout;
1372 } else if ((clkflags & TIMER_ABSTIME) == TIMER_ABSTIME) {
1373 /* Sanity-check the deadline. */
1374 if (timeout->tv_sec < 0 ||
1375 timeout->tv_nsec < 0 ||
1376 timeout->tv_nsec >= 1000000000L) {
1377 return EINVAL;
1378 }
1379 deadline = timeout;
1380 } else {
1381 struct timespec interval = *timeout;
1382
1383 error = itimespecfix(&interval);
1384 if (error)
1385 return error;
1386 error = clock_gettime1(clkid, &ts);
1387 if (error)
1388 return error;
1389 timespecadd(&ts, &interval, &ts);
1390 deadline = &ts;
1391 }
1392
1393 /* Get the futex, creating it if necessary. */
1394 error = futex_lookup_create(uaddr, shared, FUTEX_CLASS_NORMAL, &f);
1395 if (error)
1396 return error;
1397 KASSERT(f);
1398
1399 /*
1400 * Under the op lock, check the value again: if it has
1401 * already changed, EAGAIN; otherwise enqueue the waiter.
1402 * Since FUTEX_WAKE will use the same lock and be done after
1403 * modifying the value, the order in which we check and enqueue
1404 * is immaterial.
1405 */
1406 futex_op_lock(f);
1407 if (!futex_test(uaddr, val)) {
1408 futex_op_unlock(f);
1409 error = EAGAIN;
1410 goto out;
1411 }
1412
1413 /*
1414 * Now wait. futex_wait() will drop our op lock once we
1415 * are entered into the sleep queue, thus ensuring atomicity
1416 * of wakes with respect to waits.
1417 */
1418 error = futex_wait(f, FUTEX_WRITERQ, deadline, clkid, val3);
1419
1420 /*
1421 * We cannot drop our reference to the futex here, because
1422 * we might be enqueued on a different one when we are awakened.
1423 * The references will be managed on our behalf in the requeue,
1424 * wake, and error cases.
1425 */
1426 f = NULL;
1427
1428 if (__predict_true(error == 0)) {
1429 /* Return 0 on success, error on failure. */
1430 *retval = 0;
1431 }
1432
1433 out: if (f != NULL)
1434 futex_rele(f);
1435 return error;
1436 }
1437
1438 /*
1439 * futex_func_wake(uaddr, val, val3, retval)
1440 *
1441 * Implement futex(FUTEX_WAKE) and futex(FUTEX_WAKE_BITSET).
1442 */
1443 static int
1444 futex_func_wake(bool shared, int *uaddr, int val, int val3, register_t *retval)
1445 {
1446 struct futex *f;
1447 unsigned int nwoken = 0;
1448 int error = 0;
1449
1450 /* Reject negative number of wakeups. */
1451 if (val < 0) {
1452 error = EINVAL;
1453 goto out;
1454 }
1455
1456 /* Look up the futex, if any. */
1457 error = futex_lookup(uaddr, shared, FUTEX_CLASS_NORMAL, &f);
1458 if (error)
1459 goto out;
1460
1461 /* If there's no futex, there are no waiters to wake. */
1462 if (f == NULL)
1463 goto out;
1464
1465 /*
1466 * Under f's op lock, wake the waiters and remember the
1467 * number woken.
1468 */
1469 futex_op_lock(f);
1470 nwoken = futex_wake(f, FUTEX_WRITERQ, val,
1471 NULL, FUTEX_WRITERQ, 0, val3);
1472 futex_op_unlock(f);
1473
1474 /* Release the futex. */
1475 futex_rele(f);
1476
1477 out:
1478 /* Return the number of waiters woken. */
1479 *retval = nwoken;
1480
1481 /* Success! */
1482 return error;
1483 }
1484
1485 /*
1486 * futex_func_requeue(op, uaddr, val, uaddr2, val2, val3, retval)
1487 *
1488 * Implement futex(FUTEX_REQUEUE) and futex(FUTEX_CMP_REQUEUE).
1489 */
1490 static int
1491 futex_func_requeue(bool shared, int op, int *uaddr, int val, int *uaddr2,
1492 int val2, int val3, register_t *retval)
1493 {
1494 struct futex *f = NULL, *f2 = NULL;
1495 unsigned nwoken = 0; /* default to zero woken on early return */
1496 int error;
1497
1498 /* Reject negative number of wakeups or requeues. */
1499 if (val < 0 || val2 < 0) {
1500 error = EINVAL;
1501 goto out;
1502 }
1503
1504 /* Look up the source futex, if any. */
1505 error = futex_lookup(uaddr, shared, FUTEX_CLASS_NORMAL, &f);
1506 if (error)
1507 goto out;
1508
1509 /* If there is none, nothing to do. */
1510 if (f == NULL)
1511 goto out;
1512
1513 /*
1514 * We may need to create the destination futex because it's
1515 * entirely possible it does not currently have any waiters.
1516 */
1517 error = futex_lookup_create(uaddr2, shared, FUTEX_CLASS_NORMAL, &f2);
1518 if (error)
1519 goto out;
1520
1521 /*
1522 * Under the futexes' op locks, check the value; if
1523 * unchanged from val3, wake the waiters.
1524 */
1525 futex_op_lock2(f, f2);
1526 if (op == FUTEX_CMP_REQUEUE && !futex_test(uaddr, val3)) {
1527 error = EAGAIN;
1528 } else {
1529 error = 0;
1530 nwoken = futex_wake(f, FUTEX_WRITERQ, val,
1531 f2, FUTEX_WRITERQ, val2,
1532 FUTEX_BITSET_MATCH_ANY);
1533 }
1534 futex_op_unlock2(f, f2);
1535
1536 out:
1537 /* Return the number of waiters woken. */
1538 *retval = nwoken;
1539
1540 /* Release the futexes if we got them. */
1541 if (f2)
1542 futex_rele(f2);
1543 if (f)
1544 futex_rele(f);
1545 return error;
1546 }
1547
1548 /*
1549 * futex_validate_op_cmp(val3)
1550 *
1551 * Validate an op/cmp argument for FUTEX_WAKE_OP.
1552 */
1553 static int
1554 futex_validate_op_cmp(int val3)
1555 {
1556 int op = __SHIFTOUT(val3, FUTEX_OP_OP_MASK);
1557 int cmp = __SHIFTOUT(val3, FUTEX_OP_CMP_MASK);
1558
1559 if (op & FUTEX_OP_OPARG_SHIFT) {
1560 int oparg = __SHIFTOUT(val3, FUTEX_OP_OPARG_MASK);
1561 if (oparg < 0)
1562 return EINVAL;
1563 if (oparg >= 32)
1564 return EINVAL;
1565 op &= ~FUTEX_OP_OPARG_SHIFT;
1566 }
1567
1568 switch (op) {
1569 case FUTEX_OP_SET:
1570 case FUTEX_OP_ADD:
1571 case FUTEX_OP_OR:
1572 case FUTEX_OP_ANDN:
1573 case FUTEX_OP_XOR:
1574 break;
1575 default:
1576 return EINVAL;
1577 }
1578
1579 switch (cmp) {
1580 case FUTEX_OP_CMP_EQ:
1581 case FUTEX_OP_CMP_NE:
1582 case FUTEX_OP_CMP_LT:
1583 case FUTEX_OP_CMP_LE:
1584 case FUTEX_OP_CMP_GT:
1585 case FUTEX_OP_CMP_GE:
1586 break;
1587 default:
1588 return EINVAL;
1589 }
1590
1591 return 0;
1592 }
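/*
 * For reference (a hedged note; <sys/futex.h> is authoritative for the
 * local definitions): val3 packs the whole wake-op into one word, in the
 * same layout as Linux's FUTEX_OP(op, oparg, cmp, cmparg) macro -- 4 bits
 * each of op and cmp in the top byte, and two 12-bit arguments (oparg,
 * cmparg) below.  For example, the common "add 1 to *uaddr2, and also
 * wake uaddr2's waiters if the old value was greater than 0" would be
 * encoded as
 *
 *	val3 = FUTEX_OP(FUTEX_OP_ADD, 1, FUTEX_OP_CMP_GT, 0);
 *
 * (macro name as in the Linux API), which futex_compute_op() and
 * futex_compute_cmp() below decode using the FUTEX_OP_*_MASK fields.
 */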
1593
1594 /*
1595 * futex_compute_op(oldval, val3)
1596 *
1597  *	Apply a FUTEX_WAKE_OP operation to oldval.
1598 */
1599 static int
1600 futex_compute_op(int oldval, int val3)
1601 {
1602 int op = __SHIFTOUT(val3, FUTEX_OP_OP_MASK);
1603 int oparg = __SHIFTOUT(val3, FUTEX_OP_OPARG_MASK);
1604
1605 if (op & FUTEX_OP_OPARG_SHIFT) {
1606 KASSERT(oparg >= 0);
1607 KASSERT(oparg < 32);
1608 oparg = 1u << oparg;
1609 op &= ~FUTEX_OP_OPARG_SHIFT;
1610 }
1611
1612 switch (op) {
1613 case FUTEX_OP_SET:
1614 return oparg;
1615
1616 case FUTEX_OP_ADD:
1617 /*
1618 * Avoid signed arithmetic overflow by doing
1619 * arithmetic unsigned and converting back to signed
1620 * at the end.
1621 */
1622 return (int)((unsigned)oldval + (unsigned)oparg);
1623
1624 case FUTEX_OP_OR:
1625 return oldval | oparg;
1626
1627 case FUTEX_OP_ANDN:
1628 return oldval & ~oparg;
1629
1630 case FUTEX_OP_XOR:
1631 return oldval ^ oparg;
1632
1633 default:
1634 panic("invalid futex op");
1635 }
1636 }
1637
1638 /*
1639 * futex_compute_cmp(oldval, val3)
1640 *
1641  *	Apply a FUTEX_WAKE_OP comparison to oldval.
1642 */
1643 static bool
1644 futex_compute_cmp(int oldval, int val3)
1645 {
1646 int cmp = __SHIFTOUT(val3, FUTEX_OP_CMP_MASK);
1647 int cmparg = __SHIFTOUT(val3, FUTEX_OP_CMPARG_MASK);
1648
1649 switch (cmp) {
1650 case FUTEX_OP_CMP_EQ:
1651 return (oldval == cmparg);
1652
1653 case FUTEX_OP_CMP_NE:
1654 return (oldval != cmparg);
1655
1656 case FUTEX_OP_CMP_LT:
1657 return (oldval < cmparg);
1658
1659 case FUTEX_OP_CMP_LE:
1660 return (oldval <= cmparg);
1661
1662 case FUTEX_OP_CMP_GT:
1663 return (oldval > cmparg);
1664
1665 case FUTEX_OP_CMP_GE:
1666 return (oldval >= cmparg);
1667
1668 default:
1669 panic("invalid futex cmp operation");
1670 }
1671 }
1672
1673 /*
1674 * futex_func_wake_op(uaddr, val, uaddr2, val2, val3, retval)
1675 *
1676 * Implement futex(FUTEX_WAKE_OP).
1677 */
1678 static int
1679 futex_func_wake_op(bool shared, int *uaddr, int val, int *uaddr2, int val2,
1680 int val3, register_t *retval)
1681 {
1682 struct futex *f = NULL, *f2 = NULL;
1683 int oldval, newval, actual;
1684 unsigned nwoken = 0;
1685 int error;
1686
1687 /* Reject negative number of wakeups. */
1688 if (val < 0 || val2 < 0) {
1689 error = EINVAL;
1690 goto out;
1691 }
1692
1693 /* Reject invalid operations before we start doing things. */
1694 if ((error = futex_validate_op_cmp(val3)) != 0)
1695 goto out;
1696
1697 /* Look up the first futex, if any. */
1698 error = futex_lookup(uaddr, shared, FUTEX_CLASS_NORMAL, &f);
1699 if (error)
1700 goto out;
1701
1702 /* Look up the second futex, if any. */
1703 error = futex_lookup(uaddr2, shared, FUTEX_CLASS_NORMAL, &f2);
1704 if (error)
1705 goto out;
1706
1707 /*
1708 * Under the op locks:
1709 *
1710 * 1. Read/modify/write: *uaddr2 op= oparg.
1711 * 2. Unconditionally wake uaddr.
1712 * 3. Conditionally wake uaddr2, if it previously matched val3.
1713 */
1714 futex_op_lock2(f, f2);
1715 do {
1716 error = futex_load(uaddr2, &oldval);
1717 if (error)
1718 goto out_unlock;
1719 newval = futex_compute_op(oldval, val3);
1720 error = ucas_int(uaddr2, oldval, newval, &actual);
1721 if (error)
1722 goto out_unlock;
1723 } while (actual != oldval);
1724 nwoken = (f ? futex_wake(f, FUTEX_WRITERQ, val,
1725 NULL, FUTEX_WRITERQ, 0,
1726 FUTEX_BITSET_MATCH_ANY) : 0);
1727 if (f2 && futex_compute_cmp(oldval, val3))
1728 nwoken += futex_wake(f2, FUTEX_WRITERQ, val2,
1729 NULL, FUTEX_WRITERQ, 0,
1730 FUTEX_BITSET_MATCH_ANY);
1731
1732 /* Success! */
1733 error = 0;
1734 out_unlock:
1735 futex_op_unlock2(f, f2);
1736
1737 out:
1738 /* Return the number of waiters woken. */
1739 *retval = nwoken;
1740
1741 /* Release the futexes, if we got them. */
1742 if (f2)
1743 futex_rele(f2);
1744 if (f)
1745 futex_rele(f);
1746 return error;
1747 }
1748
1749 /*
1750 * futex_read_waiter_prio(l)
1751 *
1752 * Returns the read-waiter priority for purposes of comparing
1753 * against a write-waiter's priority. Read-waiters are only
1754 * prioritized if they are real-time threads.
1755 */
1756 static inline pri_t
1757 futex_read_waiter_prio(struct lwp * const l)
1758 {
1759 const pri_t pri = lwp_eprio(l);
1760 if (__predict_false(pri >= PRI_USER_RT))
1761 return pri;
1762 return PRI_NONE;
1763 }
1764
1765 #if 0 /* XXX */
1766 /*
1767  * futex_rw_handle_rt_reader(f, uaddr, val, pri_reader)
1768 *
1769 * Attempt to resolve the case of a real-time thread attempting
1770  *	to acquire a read lock.  Returns non-zero if the attempt is
1771  *	resolved and the wait should be elided.
1772 */
1773 static int __noinline
1774 futex_rw_handle_rt_reader(struct futex *f, int *uaddr, int val,
1775 pri_t pri_reader)
1776 {
1777 struct lwp *l_writer = NULL;
1778 int newval, next;
1779 int error;
1780
1781 KASSERT(mutex_owned(&f->fx_op_lock));
1782
1783 /*
1784 * If the lock is write-locked, there isn't anything we
1785 * can do but wait.
1786 */
1787 if (val & FUTEX_RW_WRITE_LOCKED) {
1788 return 0;
1789 }
1790
1791 /*
1792 * If we're maxed out on readers already, nothing we can do.
1793 */
1794 if ((val & FUTEX_TID_MASK) == FUTEX_TID_MASK) {
1795 		return ENFILE;	/* distinct from EAGAIN */
1796 }
1797
1798 /*
1799 * The assumption then is that we arrived here with WRITE_WANTED
1800 * set. We're not going to bother testing that, however. We
1801 * will preserve it if we see it.
1802 *
1803 	 * Because WRITE_WANTED is set, this will cause everyone to enter
1804 	 * the futex to rw_wait.  And we are holding the op lock so no
1805 	 * additional waiters will appear on the sleepq.  We are going
1806 * to do the same tricky dance as rw_handoff to let higher-priority
1807 * real-time waiters slip past the gate.
1808 */
1809
1810 /*
1811 * All we want to do here is check if there is a writer waiting.
1812 * If there is, and it is equal or greater priority, this reader
1813 	 * loses; otherwise we will just make note of it to ensure that
1814 * the WRITE_WANTED bit is set when we update the futex word.
1815 *
1816 * Since we are not going to actually wake someone from the
1817 * queue here, we're not really interested if the write-waiter
1818 * happens to leave based on a timeout or signal; we know that
1819 * any write-waiter *after* the first one is of equal or lower
1820 * priority, so we would still best it.
1821 */
1822 	mutex_spin_enter(f->fx_sq_lock);
1823 	l_writer = LIST_FIRST(&f->fx_sqs[FUTEX_WRITERQ]);
1824 	if (__predict_true(l_writer != NULL) &&
1825 	    pri_reader <= lwp_eprio(l_writer)) {
1826 		/* Don't leak the sleepq lock on this early return. */
1827 		mutex_spin_exit(f->fx_sq_lock);
1828 		return 0;
1829 	}
	mutex_spin_exit(f->fx_sq_lock);
1830
1831 for (;; val = next) {
1832 if (__predict_true(l_writer != NULL)) {
1833 newval = (val + 1) | FUTEX_RW_WRITE_WANTED;
1834 } else {
1835 /*
1836 * No writer waiting; increment the reader
1837 * count, preserve any WRITE_WANTED bit.
1838 */
1839 newval = val + 1;
1840 KASSERT(((newval ^ val) & FUTEX_RW_WRITE_WANTED) == 0);
1841 }
1842
1843 error = ucas_int(uaddr, val, newval, &next);
1844 if (__predict_false(error != 0)) {
1845 return error;
1846 }
1847 if (next == val) {
1848 /* Successfully acquired the read lock. */
1849 return EJUSTRETURN;
1850 }
1851 /*
1852 * Repeat the terminal checks from above on the new
1853 * value.
1854 */
1855 if (next & FUTEX_RW_WRITE_LOCKED) {
1856 return 0;
1857 }
1858 if ((next & FUTEX_TID_MASK) == FUTEX_TID_MASK) {
1859 			return ENFILE;	/* distinct from EAGAIN */
1860 }
1861 }
1862 }
1863 #endif /* XXX */
1864
1865 /*
1866 * futex_func_rw_wait(uaddr, val, val3, abstime, clkid, retval)
1867 *
1868 * Implement futex(FUTEX_NETBSD_RW_WAIT).
1869 */
1870 static int
1871 futex_func_rw_wait(bool shared, int *uaddr, int val, int val3,
1872 const struct timespec *abstime, clockid_t clkid, register_t *retval)
1873 {
1874 #if 1
1875 /* XXX NETBSD_RW ops are currently broken XXX */
1876 return ENOSYS;
1877 #else
1878 struct futex *f;
1879 int error;
1880
1881 /* Must specify READER or WRITER. */
1882 if (__predict_false(val3 != FUTEX_RW_READER &&
1883 val3 != FUTEX_RW_WRITER))
1884 return EINVAL;
1885
1886 /* Optimistically test before anything else. */
1887 if (!futex_test(uaddr, val))
1888 return EAGAIN;
1889
1890 /* Get the futex, creating it if necessary. */
1891 error = futex_lookup_create(uaddr, shared, FUTEX_CLASS_RWLOCK, &f);
1892 if (error)
1893 return error;
1894 KASSERT(f);
1895
1896 /*
1897 * Under the op lock, check the value again: if it has
1898 * already changed, EAGAIN; otherwise, enqueue the waiter
1899 * on the specified queue.
1900 */
1901 futex_op_lock(f);
1902 if (!futex_test(uaddr, val)) {
1903 futex_op_unlock(f);
1904 error = EAGAIN;
1905 goto out;
1906 }
1907
1908 /*
1909 * POSIX dictates that a real-time reader will be prioritized
1910 * over writers of lesser priority, when normally we would
1911 * prefer the writers.
1912 */
1913 if (__predict_false(val3 == FUTEX_RW_READER)) {
1914 struct lwp * const l = curlwp;
1915 const pri_t pri_reader = futex_read_waiter_prio(l);
1916 if (__predict_false(pri_reader > PRI_NONE)) {
1917 error = futex_rw_handle_rt_reader(f, uaddr, val,
1918 pri_reader);
1919 if (error) {
1920 if (__predict_true(error == EJUSTRETURN)) {
1921 /* RT reader acquired the lock. */
1922 error = 0;
1923 }
1924 futex_op_unlock(f);
1925 goto out;
1926 }
1927 }
1928 }
1929
1930 /*
1931 * Now wait. futex_wait() will drop our op lock once we
1932 * are entered into the sleep queue, thus ensuring atomicity
1933 * of wakes with respect to waits.
1934 *
1935 * Use a wake selector of 0 so that this waiter won't match on any
1936 * of the other operations in case someone makes that error; only
1937 * rw_handoff is allowed! This is critical because a waiter that
1938 * awakens from an rw_wait assumes it has been given ownership of
1939 * the lock.
1940 */
1941 error = futex_wait(f, val3, abstime, clkid, 0);
1942
1943 /*
1944 * Don't drop our reference here. We won't be requeued, but
1945 	 * it's best to maintain symmetry with other operations.
1946 */
1947 f = NULL;
1948
1949 out: if (__predict_true(error == 0)) {
1950 /* Return 0 on success, error on failure. */
1951 *retval = 0;
1952 }
1953
1954 if (f != NULL)
1955 futex_rele(f);
1956 return error;
1957 #endif
1958 }
1959
1960 /*
1961 * futex_func_rw_handoff(uaddr, val, retval)
1962 *
1963 * Implement futex(FUTEX_NETBSD_RW_HANDOFF).
1964 *
1965 * This implements direct hand-off to either a single writer
1966 * or all readers.
1967 */
1968 static int
1969 futex_func_rw_handoff(bool shared, int *uaddr, int val, register_t *retval)
1970 {
1971 #if 1
1972 /* XXX NETBSD_RW ops are currently broken XXX */
1973 return ENOSYS;
1974 #else
1975 struct lwp *l, *l_next, *l_writer, *l_reader;
1976 struct futex *f;
1977 sleepq_t wsq, *sq;
1978 int error = 0;
1979 int newval, next, nwake, nwoken = 0;
1980 int allreaders __diagused = 0;
1981 unsigned int *nwaitersp;
1982 pri_t pri_writer;
1983
1984 /* Look up the futex, if any. */
1985 error = futex_lookup(uaddr, shared, FUTEX_CLASS_RWLOCK, &f);
1986 if (error)
1987 goto out;
1988
1989 /*
1990 	 * There are a couple of different scenarios where a thread
1991 * releasing a RW lock would call rw_handoff, yet we find no
1992 * waiters:
1993 *
1994 * ==> There were waiters on the queue, but they left the queue
1995 * due to a signal.
1996 * ==> The waiting thread set the waiter bit(s), but decided to
1997 * try spinning before calling rw_wait.
1998 *
1999 * In both of these cases, we will ensure that the lock word
2000 * gets cleared.
2001 */
2002
2003 /* If there's no futex, there are no waiters to wake. */
2004 if (__predict_false(f == NULL)) {
2005 /*
2006 * If there are no waiters, ensure that the lock
2007 * word is cleared.
2008 */
2009 error = ucas_int(uaddr, val, 0, &next);
2010 if (__predict_true(error == 0)) {
2011 if (__predict_false(val != next))
2012 error = EAGAIN;
2013 }
2014 goto out;
2015 }
2016
2017 /*
2018 * Compute the new value and store it in the futex word.
2019 *
2020 * This is a little tricky because the ucas could cause
2021 * a page fault, and we can't let that happen while holding
2022 	 * the sleepq locks.  But we have to make sure that the choices
2023 	 * we make regarding which threads to wake are accurately
2024 * reflected in the futex word and that the futex word is
2025 * updated before the LWPs can run.
2026 *
2027 * This is easy enough ... we can transfer the LWPs to a private
2028 * sleepq to prevent changes in the original sleepq while we have
2029 * it unlocked from affecting the results, but we must also
2030 * consider that LWPs might be using timed-wait, so we have to
2031 * make sure that won't wake the LWP up out from under us if the
2032 * timer fires. To do this, we have to set the "wchan" to NULL
2033 * and use a dummy syncobj that takes no action on "unsleep".
2034 *
2035 * We thus perform the hand-off in three steps, all with the op
2036 * lock held:
2037 *
2038 * ==> Wake selection: sleepq locked, select LWPs to wake,
2039 	 *	    compute new futex word.  LWPs are transferred from the
2040 * futex sleepq to the private sleepq and further sedated.
2041 *
2042 * ==> Futex word update: sleepq unlocked, use a loop around ucas
2043 * to update the futex word. There are no scenarios where
2044 * user space code can update the futex in a way that would
2045 * impact the work we do here; because we've been asked to do
2046 * the hand-off, any new LWPs attempting to acquire the lock
2047 * would be entering rw_wait by definition (either because
2048 * they're read-lockers and the lock is write-wanted, or they're
2049 * write-lockers and the lock is read-held). Those new rw_wait
2050 * LWPs will serialize against the op lock. We DO, however,
2051 * need to preserve any newly-set WANTED bits, hence the ucas
2052 * loop. Those newly-set WANTED bits, however, will not impact
2053 * our decisions. Those LWPs have simply lost the race to
2054 * acquire the lock, and we don't consult those bits anyway;
2055 * we instead use the contents of the sleepqs as the truth
2056 	 *	    about who is waiting, and no new waiters will appear on
2057 * the sleepqs while we hold the op lock.
2058 *
2059 * ==> Wake waiters: sleepq locked, run down our private sleepq
2060 * and actually awaken all of the LWPs we selected earlier.
2061 *
2062 	 * If, for some reason, the ucas fails because the page backing it
2063 * was unmapped, all bets are off. We still awaken the waiters.
2064 * This is either a malicious attempt to screw things up, or a
2065 * programming error, and we don't care about either one.
2066 */
2067 sleepq_init(&wsq);
2068
2069 futex_op_lock(f);
2070 mutex_spin_enter(f->fx_sq_lock);
2071
2072 /*
2073 * STEP 1
2074 */
2075
2076 /*
2077 * POSIX dictates that any real-time waiters will acquire the
2078 * lock in priority order. This implies that a real-time
2079 * read-waiter has priority over a non-real-time write-waiter,
2080 * where we would otherwise prioritize waking the write-waiter.
2081 */
2082 l_writer = LIST_FIRST(&f->fx_sqs[FUTEX_WRITERQ]);
2083 l_reader = LIST_FIRST(&f->fx_sqs[FUTEX_READERQ]);
2084 if (__predict_true(l_writer == NULL)) {
2085 /* We will wake all the readers. */
2086 sq = &f->fx_sqs[FUTEX_READERQ];
2087 nwaitersp = &f->fx_nwaiters[FUTEX_READERQ];
2088 nwake = allreaders = f->fx_nwaiters[FUTEX_READERQ];
2089 KASSERT(nwake >= 0 && nwake <= FUTEX_TID_MASK);
2090 if (__predict_false(nwake == 0)) {
2091 KASSERT(l_reader == NULL);
2092 newval = 0;
2093 } else {
2094 KASSERT(l_reader != NULL);
2095 newval = nwake;
2096 }
2097 l = l_reader;
2098 } else if (__predict_false(l_reader != NULL &&
2099 futex_read_waiter_prio(l_reader) >
2100 (pri_writer = lwp_eprio(l_writer)))) {
2101 /*
2102 * Count the number of real-time read-waiters that
2103 * exceed the write-waiter's priority. We will
2104 		 * wake that many (we already know it's at least one).
2105 */
2106 sq = &f->fx_sqs[FUTEX_READERQ];
2107 nwaitersp = &f->fx_nwaiters[FUTEX_READERQ];
2108 for (nwake = 1, l = LIST_NEXT(l_reader, l_sleepchain);
2109 l != NULL && futex_read_waiter_prio(l) > pri_writer;
2110 l = LIST_NEXT(l, l_sleepchain)) {
2111 nwake++;
2112 }
2113 KASSERT(nwake >= 0 && nwake <= FUTEX_TID_MASK);
2114 /* We know there is at least one write-waiter. */
2115 newval = nwake | FUTEX_WAITERS | FUTEX_RW_WRITE_WANTED;
2116 l = l_reader;
2117 } else {
2118 /* We will wake one writer. */
2119 sq = &f->fx_sqs[FUTEX_WRITERQ];
2120 nwaitersp = &f->fx_nwaiters[FUTEX_WRITERQ];
2121 nwake = 1;
2122 newval = FUTEX_RW_WRITE_LOCKED | l_writer->l_lid;
2123 if (__predict_false(f->fx_nwaiters[FUTEX_WRITERQ] > 1)) {
2124 KASSERT(LIST_NEXT(l_writer, l_sleepchain) != NULL);
2125 newval |= FUTEX_WAITERS | FUTEX_RW_WRITE_WANTED;
2126 } else if (__predict_true(f->fx_nwaiters[FUTEX_READERQ] != 0)) {
2127 KASSERT(!LIST_EMPTY(&f->fx_sqs[FUTEX_READERQ]));
2128 newval |= FUTEX_WAITERS;
2129 } else {
2130 KASSERT(LIST_EMPTY(&f->fx_sqs[FUTEX_READERQ]));
2131 }
2132 l = l_writer;
2133 }
2134
2135 KASSERT(sq == &f->fx_sqs[FUTEX_READERQ] ||
2136 sq == &f->fx_sqs[FUTEX_WRITERQ]);
2137 while (nwake != 0 && l != NULL) {
2138 l_next = LIST_NEXT(l, l_sleepchain);
2139 KASSERT(l->l_syncobj == &futex_syncobj);
2140 KASSERT(l->l_wchan == (wchan_t)f);
2141 KASSERT(l->l_futex == f);
2142 KASSERT(l->l_sleepq == sq);
2143 KASSERT(l->l_mutex == f->fx_sq_lock);
2144
2145 /*
2146 * NULL wchan --> timeout will not wake LWP.
2147 * NULL lock --> keep existing lock.
2148 */
2149 sleepq_transfer(l, sq, &wsq, NULL/*wchan*/, futex_wmesg,
2150 &sched_syncobj, NULL/*lock*/, false);
2151
2152 KASSERT(*nwaitersp > 0);
2153 (*nwaitersp)--;
2154 nwoken++;
2155 nwake--;
2156 /* hold count adjusted below */
2157 l = l_next;
2158 }
2159
2160 mutex_spin_exit(f->fx_sq_lock);
2161
2162 /*
2163 * STEP 2
2164 */
2165 for (;; val = next) {
2166 error = ucas_int(uaddr, val, newval, &next);
2167 if (__predict_false(error != 0)) {
2168 /*
2169 * The futex word has become unmapped. All bets
2170 * are off. Break out of the loop and awaken the
2171 * waiters; this is easier than trying to stuff
2172 * them back into their previous sleepq, and the
2173 * application is screwed anyway.
2174 */
2175 break;
2176 }
2177 if (__predict_true(next == val)) {
2178 /*
2179 * Successfully updated the futex word!
2180 */
2181 break;
2182 }
2183 /*
2184 * The only thing that could have possibly happened
2185 * (barring some bug in the thread library) is that
2186 * additional waiter bits arrived. Those new waiters
2187 * have lost the race to acquire the lock, but we
2188 * need to preserve those bits.
2189 */
2190 newval |= next & (FUTEX_WAITERS | FUTEX_RW_WRITE_WANTED);
2191 }
2192
2193 mutex_spin_enter(f->fx_sq_lock);
2194
2195 /*
2196 * STEP 3
2197 */
2198
2199 LIST_FOREACH_SAFE(l, &wsq, l_sleepchain, l_next) {
2200 KASSERT(l->l_syncobj == &sched_syncobj);
2201 KASSERT(l->l_wchan == NULL);
2202 KASSERT(l->l_futex == f);
2203 KASSERT(l->l_sleepq == &wsq);
2204 KASSERT(l->l_mutex == f->fx_sq_lock);
2205
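		/*
		 * Detach the LWP from the futex before sleepq_remove()
		 * makes it runnable.
		 */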
2206 l->l_futex_wakesel = 0;
2207 l->l_futex = NULL;
2208 sleepq_remove(&wsq, l);
2209 }
2210 	/* If we took the all-readers path, ensure every reader was woken. */
2211 KASSERT(allreaders == 0 || allreaders == nwoken);
2212 KASSERT(allreaders == 0 || LIST_EMPTY(&f->fx_sqs[FUTEX_READERQ]));
2213 KASSERT(allreaders == 0 || f->fx_nwaiters[FUTEX_READERQ] == 0);
2214
2215 mutex_spin_exit(f->fx_sq_lock);
2216
2217 /* Adjust hold count. */
2218 futex_rele_count_not_last(f, nwoken);
2219
2220 futex_op_unlock(f);
2221
2222 /* Release the futex. */
2223 futex_rele(f);
2224
2225 out: if (__predict_true(error == 0)) {
2226 /* Return the number of waiters woken. */
2227 *retval = nwoken;
2228 }
2229
2230 	/* Return 0 on success, error on failure. */
2231 return error;
2232 #endif
2233 }
2234
2235 /*
2236 * do_futex(uaddr, op, val, timeout, uaddr2, val2, val3)
2237 *
2238 * Implement the futex system call with all the parameters
2239 * parsed out.
2240 */
2241 int
2242 do_futex(int *uaddr, int op, int val, const struct timespec *timeout,
2243 int *uaddr2, int val2, int val3, register_t *retval)
2244 {
2245 const bool shared = (op & FUTEX_PRIVATE_FLAG) ? false : true;
2246 const clockid_t clkid = (op & FUTEX_CLOCK_REALTIME) ? CLOCK_REALTIME
2247 : CLOCK_MONOTONIC;
2248 int error;
2249
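	/* The flag bits were consumed above; reduce op to the bare command. */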
2250 op &= FUTEX_CMD_MASK;
2251
2252 switch (op) {
2253 case FUTEX_WAIT:
2254 SDT_PROBE6(futex, func, wait, entry,
2255 uaddr, val, FUTEX_BITSET_MATCH_ANY, timeout,
2256 TIMER_RELTIME, op & ~FUTEX_CMD_MASK);
2257 error = futex_func_wait(shared, uaddr, val,
2258 FUTEX_BITSET_MATCH_ANY, timeout, clkid, TIMER_RELTIME,
2259 retval);
2260 SDT_PROBE1(futex, func, wait, exit, error);
2261 break;
2262
2263 case FUTEX_WAIT_BITSET:
2264 SDT_PROBE6(futex, func, wait, entry,
2265 uaddr, val, val3, timeout,
2266 TIMER_ABSTIME, op & ~FUTEX_CMD_MASK);
2267 error = futex_func_wait(shared, uaddr, val, val3, timeout,
2268 clkid, TIMER_ABSTIME, retval);
2269 SDT_PROBE1(futex, func, wait, exit, error);
2270 break;
2271
2272 case FUTEX_WAKE:
2273 SDT_PROBE4(futex, func, wake, entry,
2274 uaddr, val, FUTEX_BITSET_MATCH_ANY, op & ~FUTEX_CMD_MASK);
2275 error = futex_func_wake(shared, uaddr, val,
2276 FUTEX_BITSET_MATCH_ANY, retval);
2277 SDT_PROBE2(futex, func, wake, exit, error, *retval);
2278 break;
2279
2280 case FUTEX_WAKE_BITSET:
2281 SDT_PROBE4(futex, func, wake, entry,
2282 uaddr, val, val3, op & ~FUTEX_CMD_MASK);
2283 error = futex_func_wake(shared, uaddr, val, val3, retval);
2284 SDT_PROBE2(futex, func, wake, exit, error, *retval);
2285 break;
2286
2287 case FUTEX_REQUEUE:
2288 SDT_PROBE5(futex, func, requeue, entry,
2289 uaddr, val, uaddr2, val2, op & ~FUTEX_CMD_MASK);
2290 error = futex_func_requeue(shared, op, uaddr, val, uaddr2,
2291 val2, 0, retval);
2292 SDT_PROBE2(futex, func, requeue, exit, error, *retval);
2293 break;
2294
2295 case FUTEX_CMP_REQUEUE:
2296 SDT_PROBE6(futex, func, cmp_requeue, entry,
2297 uaddr, val, uaddr2, val2, val3, op & ~FUTEX_CMD_MASK);
2298 error = futex_func_requeue(shared, op, uaddr, val, uaddr2,
2299 val2, val3, retval);
2300 SDT_PROBE2(futex, func, cmp_requeue, exit, error, *retval);
2301 break;
2302
2303 case FUTEX_WAKE_OP:
2304 SDT_PROBE6(futex, func, wake_op, entry,
2305 uaddr, val, uaddr2, val2, val3, op & ~FUTEX_CMD_MASK);
2306 error = futex_func_wake_op(shared, uaddr, val, uaddr2, val2,
2307 val3, retval);
2308 SDT_PROBE2(futex, func, wake_op, exit, error, *retval);
2309 break;
2310
2311 case FUTEX_NETBSD_RW_WAIT:
2312 SDT_PROBE5(futex, func, rw_wait, entry,
2313 uaddr, val, val3, timeout, op & ~FUTEX_CMD_MASK);
2314 error = futex_func_rw_wait(shared, uaddr, val, val3,
2315 timeout, clkid, retval);
2316 SDT_PROBE1(futex, func, rw_wait, exit, error);
2317 break;
2318
2319 case FUTEX_NETBSD_RW_HANDOFF:
2320 SDT_PROBE3(futex, func, rw_handoff, entry,
2321 uaddr, val, op & ~FUTEX_CMD_MASK);
2322 error = futex_func_rw_handoff(shared, uaddr, val, retval);
2323 SDT_PROBE2(futex, func, rw_handoff, exit, error, *retval);
2324 break;
2325
2326 case FUTEX_FD:
2327 default:
2328 error = ENOSYS;
2329 break;
2330 }
2331
2332 return error;
2333 }
2334
2335 /*
2336 * sys___futex(l, uap, retval)
2337 *
2338 * __futex(2) system call: generic futex operations.
2339 */
2340 int
2341 sys___futex(struct lwp *l, const struct sys___futex_args *uap,
2342 register_t *retval)
2343 {
2344 /* {
2345 syscallarg(int *) uaddr;
2346 syscallarg(int) op;
2347 syscallarg(int) val;
2348 syscallarg(const struct timespec *) timeout;
2349 syscallarg(int *) uaddr2;
2350 syscallarg(int) val2;
2351 syscallarg(int) val3;
2352 } */
2353 struct timespec ts, *tsp;
2354 int error;
2355
2356 /*
2357 * Copy in the timeout argument, if specified.
2358 */
2359 if (SCARG(uap, timeout)) {
2360 error = copyin(SCARG(uap, timeout), &ts, sizeof(ts));
2361 if (error)
2362 return error;
2363 tsp = &ts;
2364 } else {
2365 tsp = NULL;
2366 }
2367
2368 return do_futex(SCARG(uap, uaddr), SCARG(uap, op), SCARG(uap, val),
2369 tsp, SCARG(uap, uaddr2), SCARG(uap, val2), SCARG(uap, val3),
2370 retval);
2371 }
2372
2373 /*
2374 * sys___futex_set_robust_list(l, uap, retval)
2375 *
2376 * __futex_set_robust_list(2) system call for robust futexes.
2377 */
2378 int
2379 sys___futex_set_robust_list(struct lwp *l,
2380 const struct sys___futex_set_robust_list_args *uap, register_t *retval)
2381 {
2382 /* {
2383 syscallarg(void *) head;
2384 syscallarg(size_t) len;
2385 } */
2386 void *head = SCARG(uap, head);
2387
2388 if (SCARG(uap, len) != _FUTEX_ROBUST_HEAD_SIZE)
2389 return EINVAL;
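	/* The head must be naturally aligned for a u_long. */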
2390 if ((uintptr_t)head % sizeof(u_long))
2391 return EINVAL;
2392
2393 l->l_robust_head = (uintptr_t)head;
2394
2395 return 0;
2396 }
2397
2398 /*
2399 * sys___futex_get_robust_list(l, uap, retval)
2400 *
2401 * __futex_get_robust_list(2) system call for robust futexes.
2402 */
2403 int
2404 sys___futex_get_robust_list(struct lwp *l,
2405 const struct sys___futex_get_robust_list_args *uap, register_t *retval)
2406 {
2407 /* {
2408 syscallarg(lwpid_t) lwpid;
2409 syscallarg(void **) headp;
2410 syscallarg(size_t *) lenp;
2411 } */
2412 void *head;
2413 const size_t len = _FUTEX_ROBUST_HEAD_SIZE;
2414 int error;
2415
2416 error = futex_robust_head_lookup(l, SCARG(uap, lwpid), &head);
2417 if (error)
2418 return error;
2419
2420 /* Copy out the head pointer and the head structure length. */
2421 error = copyout(&head, SCARG(uap, headp), sizeof(head));
2422 if (__predict_true(error == 0)) {
2423 error = copyout(&len, SCARG(uap, lenp), sizeof(len));
2424 }
2425
2426 return error;
2427 }
2428
2429 /*
2430 * release_futex(uva, tid)
2431 *
2432 * Try to release the robust futex at uva in the current process
2433 * on lwp exit. If anything goes wrong, silently fail. It is the
2434 * userland program's obligation to arrange correct behaviour.
2435 */
2436 static void
2437 release_futex(uintptr_t const uptr, lwpid_t const tid, bool const is_pi,
2438 bool const is_pending)
2439 {
2440 int *uaddr;
2441 struct futex *f;
2442 int oldval, newval, actual;
2443 int error;
2444 bool shared;
2445
2446 /* If it's misaligned, tough. */
2447 if (__predict_false(uptr & 3))
2448 return;
2449 uaddr = (int *)uptr;
2450
2451 error = futex_load(uaddr, &oldval);
2452 if (__predict_false(error))
2453 return;
2454
2455 /*
2456 * There are two race conditions we need to handle here:
2457 *
2458 * 1. User space cleared the futex word but died before
2459 * being able to issue the wakeup. No wakeups will
2460 * ever be issued, oops!
2461 *
2462 * 2. Awakened waiter died before being able to acquire
2463 * the futex in user space. Any other waiters are
2464 * now stuck, oops!
2465 *
2466 * In both of these cases, the futex word will be 0 (because
2467 * it's updated before the wake is issued). The best we can
2468 * do is detect this situation if it's the pending futex and
2469 * issue a wake without modifying the futex word.
2470 *
2471 * XXX eventual PI handling?
2472 */
2473 if (__predict_false(is_pending && (oldval & ~FUTEX_WAITERS) == 0)) {
2474 register_t retval;
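		/*
		 * We don't know whether the dead thread mapped this
		 * futex shared or private, so try the shared wake
		 * first and fall back to a private wake if nothing
		 * was woken.
		 */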
2475 error = futex_func_wake(/*shared*/true, uaddr, 1,
2476 FUTEX_BITSET_MATCH_ANY, &retval);
2477 if (error != 0 || retval == 0)
2478 (void) futex_func_wake(/*shared*/false, uaddr, 1,
2479 FUTEX_BITSET_MATCH_ANY, &retval);
2480 return;
2481 }
2482
2483 /* Optimistically test whether we need to do anything at all. */
2484 if ((oldval & FUTEX_TID_MASK) != tid)
2485 return;
2486
2487 /*
2488 * We need to handle the case where this thread owned the futex,
2489 * but it was uncontended. In this case, there won't be any
2490 * kernel state to look up. All we can do is mark the futex
2491 * as a zombie to be mopped up the next time another thread
2492 * attempts to acquire it.
2493 *
2494 	 * N.B. It's important to ensure FUTEX_OWNER_DIED gets set in
2495 	 * this loop, even if waiters appear while we are doing
2496 	 * so.  This is because FUTEX_WAITERS is set by user space
2497 * before calling __futex() to wait, and the futex needs
2498 * to be marked as a zombie when the new waiter gets into
2499 * the kernel.
2500 */
2501 if ((oldval & FUTEX_WAITERS) == 0) {
2502 do {
2503 error = futex_load(uaddr, &oldval);
2504 if (error)
2505 return;
2506 if ((oldval & FUTEX_TID_MASK) != tid)
2507 return;
2508 newval = oldval | FUTEX_OWNER_DIED;
2509 error = ucas_int(uaddr, oldval, newval, &actual);
2510 if (error)
2511 return;
2512 } while (actual != oldval);
2513
2514 /*
2515 		 * If there is still no indication of waiters, then there is
2516 * no more work for us to do.
2517 */
2518 if ((oldval & FUTEX_WAITERS) == 0)
2519 return;
2520 }
2521
2522 /*
2523 * Look for a futex. Try shared first, then private. If we
2524 	 * can't find one, tough.
2525 *
2526 * Note: the assumption here is that anyone placing a futex on
2527 * the robust list is adhering to the rules, regardless of the
2528 * futex class.
2529 */
2530 for (f = NULL, shared = true; f == NULL; shared = false) {
2531 error = futex_lookup(uaddr, shared, FUTEX_CLASS_ANY, &f);
2532 if (error)
2533 return;
2534 if (f == NULL && shared == false)
2535 return;
2536 }
2537
2538 /* Work under the futex op lock. */
2539 futex_op_lock(f);
2540
2541 /*
2542 * Fetch the word: if the tid doesn't match ours, skip;
2543 * otherwise, set the owner-died bit, atomically.
2544 */
2545 do {
2546 error = futex_load(uaddr, &oldval);
2547 if (error)
2548 goto out;
2549 if ((oldval & FUTEX_TID_MASK) != tid)
2550 goto out;
2551 newval = oldval | FUTEX_OWNER_DIED;
2552 error = ucas_int(uaddr, oldval, newval, &actual);
2553 if (error)
2554 goto out;
2555 } while (actual != oldval);
2556
2557 /*
2558 * If there may be waiters, try to wake one. If anything goes
2559 * wrong, tough.
2560 *
2561 * XXX eventual PI handling?
2562 */
2563 if (oldval & FUTEX_WAITERS) {
2564 (void)futex_wake(f, FUTEX_WRITERQ, 1,
2565 NULL, FUTEX_WRITERQ, 0,
2566 FUTEX_BITSET_MATCH_ANY);
2567 }
2568
2569 /* Unlock the queue and release the futex. */
2570 out: futex_op_unlock(f);
2571 futex_rele(f);
2572 }
2573
2574 /*
2575 * futex_robust_head_lookup(l, lwpid)
2576 *
2577 * Helper function to look up a robust head by LWP ID.
2578 */
2579 int
2580 futex_robust_head_lookup(struct lwp *l, lwpid_t lwpid, void **headp)
2581 {
2582 struct proc *p = l->l_proc;
2583
2584 /* Find the other lwp, if requested; otherwise use our robust head. */
2585 if (lwpid) {
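		/*
		 * Hold p_lock so the target LWP cannot exit while we
		 * read its robust head pointer.
		 */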
2586 mutex_enter(p->p_lock);
2587 l = lwp_find(p, lwpid);
2588 if (l == NULL) {
2589 mutex_exit(p->p_lock);
2590 return ESRCH;
2591 }
2592 *headp = (void *)l->l_robust_head;
2593 mutex_exit(p->p_lock);
2594 } else {
2595 *headp = (void *)l->l_robust_head;
2596 }
2597 return 0;
2598 }
2599
2600 /*
2601 * futex_fetch_robust_head(uaddr)
2602 *
2603 * Helper routine to fetch the futex robust list head that
2604 * handles 32-bit binaries running on 64-bit kernels.
2605 */
2606 static int
2607 futex_fetch_robust_head(uintptr_t uaddr, u_long *rhead)
2608 {
2609 #ifdef _LP64
2610 if (curproc->p_flag & PK_32) {
2611 uint32_t rhead32[_FUTEX_ROBUST_HEAD_NWORDS];
2612 int error;
2613
2614 error = copyin((void *)uaddr, rhead32, sizeof(rhead32));
2615 if (__predict_true(error == 0)) {
2616 for (int i = 0; i < _FUTEX_ROBUST_HEAD_NWORDS; i++) {
2617 if (i == _FUTEX_ROBUST_HEAD_OFFSET) {
2618 /*
2619 * Make sure the offset is sign-
2620 * extended.
2621 */
2622 rhead[i] = (int32_t)rhead32[i];
2623 } else {
2624 rhead[i] = rhead32[i];
2625 }
2626 }
2627 }
2628 return error;
2629 }
2630 #endif /* _LP64 */
2631
2632 return copyin((void *)uaddr, rhead,
2633 sizeof(*rhead) * _FUTEX_ROBUST_HEAD_NWORDS);
2634 }
2635
2636 /*
2637 * futex_decode_robust_word(word)
2638 *
2639 * Decode a robust futex list word into the entry and entry
2640 * properties.
2641 */
2642 static inline void
2643 futex_decode_robust_word(uintptr_t const word, uintptr_t * const entry,
2644 bool * const is_pi)
2645 {
2646 *is_pi = (word & _FUTEX_ROBUST_ENTRY_PI) ? true : false;
2647 *entry = word & ~_FUTEX_ROBUST_ENTRY_PI;
2648 }
2649
2650 /*
2651 * futex_fetch_robust_entry(uaddr)
2652 *
2653 * Helper routine to fetch and decode a robust futex entry
2654 * that handles 32-bit binaries running on 64-bit kernels.
2655 */
2656 static int
2657 futex_fetch_robust_entry(uintptr_t const uaddr, uintptr_t * const valp,
2658 bool * const is_pi)
2659 {
2660 uintptr_t val = 0;
2661 int error = 0;
2662
2663 #ifdef _LP64
2664 if (curproc->p_flag & PK_32) {
2665 uint32_t val32;
2666
2667 error = ufetch_32((uint32_t *)uaddr, &val32);
2668 if (__predict_true(error == 0))
2669 val = val32;
2670 } else
2671 #endif /* _LP64 */
2672 error = ufetch_long((u_long *)uaddr, (u_long *)&val);
2673 if (__predict_false(error))
2674 return error;
2675
2676 futex_decode_robust_word(val, valp, is_pi);
2677 return 0;
2678 }
2679
2680 /*
2681 * futex_release_all_lwp(l, tid)
2682 *
2683 * Release all l's robust futexes. If anything looks funny in
2684 * the process, give up -- it's userland's responsibility to dot
2685 * the i's and cross the t's.
2686 */
2687 void
2688 futex_release_all_lwp(struct lwp * const l, lwpid_t const tid)
2689 {
2690 u_long rhead[_FUTEX_ROBUST_HEAD_NWORDS];
2691 int limit = 1000000;
2692 int error;
2693
2694 /* If there's no robust list there's nothing to do. */
2695 if (l->l_robust_head == 0)
2696 return;
2697
2698 /* Read the final snapshot of the robust list head. */
2699 error = futex_fetch_robust_head(l->l_robust_head, rhead);
2700 if (error) {
2701 printf("WARNING: pid %jd (%s) lwp %jd tid %jd:"
2702 " unmapped robust futex list head\n",
2703 (uintmax_t)l->l_proc->p_pid, l->l_proc->p_comm,
2704 (uintmax_t)l->l_lid, (uintmax_t)tid);
2705 return;
2706 }
2707
2708 const long offset = (long)rhead[_FUTEX_ROBUST_HEAD_OFFSET];
2709
2710 uintptr_t next, pending;
2711 bool is_pi, pending_is_pi;
2712
2713 futex_decode_robust_word(rhead[_FUTEX_ROBUST_HEAD_LIST],
2714 &next, &is_pi);
2715 futex_decode_robust_word(rhead[_FUTEX_ROBUST_HEAD_PENDING],
2716 &pending, &pending_is_pi);
2717
2718 /*
2719 * Walk down the list of locked futexes and release them, up
2720 * to one million of them before we give up.
2721 */
2722
2723 while (next != l->l_robust_head && limit-- > 0) {
2724 /* pending handled below. */
2725 if (next != pending)
2726 release_futex(next + offset, tid, is_pi, false);
2727 error = futex_fetch_robust_entry(next, &next, &is_pi);
2728 if (error)
2729 break;
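		/* The list can be long; allow preemption between entries. */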
2730 preempt_point();
2731 }
2732 if (limit <= 0) {
2733 printf("WARNING: pid %jd (%s) lwp %jd tid %jd:"
2734 " exhausted robust futex limit\n",
2735 (uintmax_t)l->l_proc->p_pid, l->l_proc->p_comm,
2736 (uintmax_t)l->l_lid, (uintmax_t)tid);
2737 }
2738
2739 /* If there's a pending futex, it may need to be released too. */
2740 if (pending != 0) {
2741 release_futex(pending + offset, tid, pending_is_pi, true);
2742 }
2743 }
2744