1 /*	$NetBSD: sys_futex.c,v 1.11.2.1 2020/11/01 15:16:43 thorpej Exp $	*/
2
3 /*-
4 * Copyright (c) 2018, 2019, 2020 The NetBSD Foundation, Inc.
5 * All rights reserved.
6 *
7 * This code is derived from software contributed to The NetBSD Foundation
8 * by Taylor R. Campbell and Jason R. Thorpe.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 * POSSIBILITY OF SUCH DAMAGE.
30 */
31
32 #include <sys/cdefs.h>
33 __KERNEL_RCSID(0, "$NetBSD: sys_futex.c,v 1.11.2.1 2020/11/01 15:16:43 thorpej Exp $");
34
35 /*
36 * Futexes
37 *
38 * The futex system call coordinates notifying threads waiting for
39 * changes on a 32-bit word of memory. The word can be managed by
40 * CPU atomic operations in userland, without system calls, as long
41 * as there is no contention.
42 *
43 * The simplest use case demonstrating the utility is:
44 *
45 * // 32-bit word of memory shared among threads or
46 * // processes in userland. lock & 1 means owned;
47 * // lock & 2 means there are waiters waiting.
48 * volatile int lock = 0;
49 *
50 * int v;
51 *
52 * // Acquire a lock.
53 * do {
54 * v = lock;
55 * if (v & 1) {
56 * // Lock is held. Set a bit to say that
57 * // there are waiters, and wait for lock
58 * // to change to anything other than v;
59 * // then retry.
60 * if (atomic_cas_uint(&lock, v, v | 2) != v)
61 * continue;
62 * futex(FUTEX_WAIT, &lock, v | 2, NULL, NULL, 0);
63 * continue;
64 * }
65  *	} while (atomic_cas_uint(&lock, v, v | 1) != v);
66 * membar_enter();
67 *
68 * ...
69 *
70 * // Release the lock. Optimistically assume there are
71 * // no waiters first until demonstrated otherwise.
72 * membar_exit();
73 * if (atomic_cas_uint(&lock, 1, 0) != 1) {
74 * // There may be waiters.
75 * v = atomic_swap_uint(&lock, 0);
76 * // If there are still waiters, wake one.
77 * if (v & 2)
78 * futex(FUTEX_WAKE, &lock, 1, NULL, NULL, 0);
79 * }
80 *
81 * The goal is to avoid the futex system call unless there is
82 * contention; then if there is contention, to guarantee no missed
83 * wakeups.
84 *
85 * For a simple implementation, futex(FUTEX_WAIT) could queue
86 * itself to be woken, double-check the lock word, and then sleep;
87 * spurious wakeups are generally a fact of life, so any
88 * FUTEX_WAKE could just wake every FUTEX_WAIT in the system.
89 *
90 * If this were all there is to it, we could then increase
91 * parallelism by refining the approximation: partition the
92 * waiters into buckets by hashing the lock addresses to reduce
93 * the incidence of spurious wakeups. But this is not all.
94 *
95 * The futex(FUTEX_CMP_REQUEUE, &lock, n, &lock2, m, val)
96 * operation not only wakes n waiters on lock if lock == val, but
97 * also _transfers_ m additional waiters to lock2. Unless wakeups
98 * on lock2 also trigger wakeups on lock, we cannot move waiters
99 * to lock2 if they merely share the same hash as waiters on lock.
100 * Thus, we can't approximately distribute waiters into queues by
101 * a hash function; we must distinguish futex queues exactly by
102 * lock address.
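 *
 *	For illustration, a sketch (in the notation above, and only a
 *	sketch -- not how any particular library does it) of how a
 *	condition-variable broadcast might use this; "cv" and "mtx"
 *	are hypothetical 32-bit words for the condvar and its mutex:
 *
 *	// Wake one waiter on cv and, if cv still equals the value
 *	// we read, requeue the rest onto mtx so they serialize on
 *	// the mutex instead of stampeding.
 *	v = cv;
 *	futex(FUTEX_CMP_REQUEUE, &cv, 1, &mtx, INT_MAX, v);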
103 *
104 * For now, we use a global red/black tree to index futexes. This
105 * should be replaced by a lockless radix tree with a thread to
106 * free entries no longer in use once all lookups on all CPUs have
107 * completed.
108 *
109 * Specifically, we maintain two maps:
110 *
111 * futex_tab.va[vmspace, va] for private futexes
112 * futex_tab.oa[uvm_voaddr] for shared futexes
113 *
114 * This implementation does not support priority inheritance.
115 */
116
117 #include <sys/types.h>
118 #include <sys/atomic.h>
119 #include <sys/condvar.h>
120 #include <sys/futex.h>
121 #include <sys/mutex.h>
122 #include <sys/rbtree.h>
123 #include <sys/sleepq.h>
124 #include <sys/queue.h>
125 #include <sys/sdt.h>
126
127 #include <sys/syscall.h>
128 #include <sys/syscallargs.h>
129 #include <sys/syscallvar.h>
130
131 #include <uvm/uvm_extern.h>
132
133 /*
134 * DTrace probes.
135 */
136 SDT_PROVIDER_DEFINE(futex);
137
138 /* entry: uaddr, val, bitset, timeout, clkflags, fflags */
139 /* exit: error */
140 SDT_PROBE_DEFINE6(futex, func, wait, entry, "int *", "int", "int",
141 "struct timespec *", "int", "int");
142 SDT_PROBE_DEFINE1(futex, func, wait, exit, "int");
143
144 /* entry: uaddr, nwake, bitset, fflags */
145 /* exit: error, nwoken */
146 SDT_PROBE_DEFINE4(futex, func, wake, entry, "int *", "int", "int", "int");
147 SDT_PROBE_DEFINE2(futex, func, wake, exit, "int", "int");
148
149 /* entry: uaddr, nwake, uaddr2, nrequeue, fflags */
150 /* exit: error, nwoken */
151 SDT_PROBE_DEFINE5(futex, func, requeue, entry, "int *", "int", "int *", "int",
152 "int");
153 SDT_PROBE_DEFINE2(futex, func, requeue, exit, "int", "int");
154
155 /* entry: uaddr, nwake, uaddr2, nrequeue, val3, fflags */
156 /* exit: error, nwoken */
157 SDT_PROBE_DEFINE6(futex, func, cmp_requeue, entry, "int *", "int", "int *",
158 "int", "int", "int");
159 SDT_PROBE_DEFINE2(futex, func, cmp_requeue, exit, "int", "int");
160
161 /* entry: uaddr, nwake, uaddr2, nwake2, wakeop, fflags */
162 /* exit: error, nwoken */
163 SDT_PROBE_DEFINE6(futex, func, wake_op, entry, "int *", "int", "int *", "int",
164 "int", "int");
165 SDT_PROBE_DEFINE2(futex, func, wake_op, exit, "int", "int");
166
167 /* entry: uaddr, val, r/w, abstime, fflags */
168 /* exit: error */
169 SDT_PROBE_DEFINE5(futex, func, rw_wait, entry, "int *", "int", "int",
170 "struct timespec *", "int");
171 SDT_PROBE_DEFINE1(futex, func, rw_wait, exit, "int");
172
173 /* entry: uaddr, val, fflags */
174 /* exit: error, nwoken */
175 SDT_PROBE_DEFINE3(futex, func, rw_handoff, entry, "int *", "int", "int");
176 SDT_PROBE_DEFINE2(futex, func, rw_handoff, exit, "int", "int");
177
178 SDT_PROBE_DEFINE0(futex, wait, finish, normally);
179 SDT_PROBE_DEFINE0(futex, wait, finish, wakerace);
180 SDT_PROBE_DEFINE0(futex, wait, finish, aborted);
181
182 /* entry: timo */
183 /* exit: error */
184 SDT_PROBE_DEFINE1(futex, wait, sleepq_block, entry, "int");
185 SDT_PROBE_DEFINE1(futex, wait, sleepq_block, exit, "int");
186
187 /*
188 * Lock order:
189 *
190 * futex_tab.lock
191 * futex::fx_op_lock ordered by kva of struct futex
192 * -> futex::fx_sq_lock ordered by kva of sleepq lock
193 *
194 * N.B. multiple futexes can share a single sleepq lock.
195 */
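
/*
 * For example, a sketch (not normative) of how the wait path below
 * nests these locks, combining futex_func_wait() and futex_wait():
 *
 *	futex_lookup_create(...);		// takes/drops futex_tab.lock
 *	futex_op_lock(f);			// futex::fx_op_lock
 *	mutex_spin_enter(f->fx_sq_lock);	// futex::fx_sq_lock
 *	sleepq_enqueue(...);
 *	mutex_exit(&f->fx_op_lock);		// safe once enqueued
 *	sleepq_block(...);			// sleeps, drops fx_sq_lock
 */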
196
197 /*
198 * union futex_key
199 *
200 * A futex is addressed either by a vmspace+va (private) or by
201 * a uvm_voaddr (shared).
202 */
203 union futex_key {
204 struct {
205 struct vmspace *vmspace;
206 vaddr_t va;
207 } fk_private;
208 struct uvm_voaddr fk_shared;
209 };
210
211 /*
212 * struct futex
213 *
214 * Kernel state for a futex located at a particular address in a
215 * particular virtual address space.
216 *
217 * N.B. fx_refcnt is an unsigned long because we need to be able
218 * to operate on it atomically on all systems while at the same
219 * time rendering practically impossible the chance of it reaching
220 * its max value. In practice, we're limited by the number of LWPs
221 * that can be present on the system at any given time, and the
222 * assumption is that limit will be good enough on a 32-bit platform.
223 * See futex_wake() for why overflow needs to be avoided.
224 *
225 * XXX Since futex addresses must be 4-byte aligned, we could
226 * XXX squirrel away fx_shared and fx_on_tree bits in the "va"
227 * XXX field of the key. Worth it?
228 */
229 struct futex {
230 union futex_key fx_key;
231 struct rb_node fx_node;
232 unsigned long fx_refcnt;
233 bool fx_shared;
234 bool fx_on_tree;
235 uint8_t fx_class;
236
237 kmutex_t fx_op_lock; /* adaptive */
238 kmutex_t * fx_sq_lock; /* &sleepq_locks[...] */
239 sleepq_t fx_sqs[2]; /* 0=reader, 1=writer */
240 unsigned int fx_nwaiters[2];
241 };
242
243 /*
244 * futex classes: Some futex operations can only be carried out on
245 * futexes that are known to be abiding by a certain protocol. These
246 * classes are assigned to a futex when created due to a wait event,
247 * and when subsequent wake or requeue operations are issued, the
248 * class is checked at futex lookup time. If the class does not match,
249 * EINVAL is the result.
250 */
251 #define FUTEX_CLASS_ANY 0 /* match any class in lookup */
252 #define FUTEX_CLASS_NORMAL 1 /* normal futex */
253 #define FUTEX_CLASS_RWLOCK 2 /* rwlock futex */
254 #define FUTEX_CLASS_PI 3 /* for FUTEX_*_PI ops */
255
256 /* sleepq definitions */
257 #define FUTEX_READERQ 0
258 #define FUTEX_WRITERQ 1
259
260 CTASSERT(FUTEX_READERQ == FUTEX_RW_READER);
261 CTASSERT(FUTEX_WRITERQ == FUTEX_RW_WRITER);
262
263 static const char futex_wmesg[] = "futex";
264
265 static void futex_unsleep(lwp_t *, bool);
266
267 static syncobj_t futex_syncobj = {
268 .sobj_flag = SOBJ_SLEEPQ_SORTED,
269 .sobj_unsleep = futex_unsleep,
270 .sobj_changepri = sleepq_changepri,
271 .sobj_lendpri = sleepq_lendpri,
272 .sobj_owner = syncobj_noowner,
273 };
274
275 /*
276 * futex_tab
277 *
278 * Global trees of futexes by vmspace/va and VM object address.
279 *
280 * XXX This obviously doesn't scale in parallel. We could use a
281 * pserialize-safe data structure, but there may be a high cost to
282 * frequent deletion since we don't cache futexes after we're done
283 * with them. We could use hashed locks. But for now, just make
284 * sure userland can't DoS the serial performance, by using a
285 * balanced binary tree for lookup.
286 *
287 * XXX We could use a per-process tree for the table indexed by
288 * virtual address to reduce contention between processes.
289 */
290 static struct {
291 kmutex_t lock;
292 struct rb_tree va;
293 struct rb_tree oa;
294 } futex_tab __cacheline_aligned;
295
296 static int
297 compare_futex_key(void *cookie, const void *n, const void *k)
298 {
299 const struct futex *fa = n;
300 const union futex_key *fka = &fa->fx_key;
301 const union futex_key *fkb = k;
302
303 if ((uintptr_t)fka->fk_private.vmspace <
304 (uintptr_t)fkb->fk_private.vmspace)
305 return -1;
306 if ((uintptr_t)fka->fk_private.vmspace >
307 (uintptr_t)fkb->fk_private.vmspace)
308 return +1;
309 if (fka->fk_private.va < fkb->fk_private.va)
310 return -1;
311 if (fka->fk_private.va > fkb->fk_private.va)
312 		return +1;
313 return 0;
314 }
315
316 static int
317 compare_futex(void *cookie, const void *na, const void *nb)
318 {
319 const struct futex *fa = na;
320 const struct futex *fb = nb;
321
322 return compare_futex_key(cookie, fa, &fb->fx_key);
323 }
324
325 static const rb_tree_ops_t futex_rb_ops = {
326 .rbto_compare_nodes = compare_futex,
327 .rbto_compare_key = compare_futex_key,
328 .rbto_node_offset = offsetof(struct futex, fx_node),
329 };
330
331 static int
332 compare_futex_shared_key(void *cookie, const void *n, const void *k)
333 {
334 const struct futex *fa = n;
335 const union futex_key *fka = &fa->fx_key;
336 const union futex_key *fkb = k;
337
338 return uvm_voaddr_compare(&fka->fk_shared, &fkb->fk_shared);
339 }
340
341 static int
342 compare_futex_shared(void *cookie, const void *na, const void *nb)
343 {
344 const struct futex *fa = na;
345 const struct futex *fb = nb;
346
347 return compare_futex_shared_key(cookie, fa, &fb->fx_key);
348 }
349
350 static const rb_tree_ops_t futex_shared_rb_ops = {
351 .rbto_compare_nodes = compare_futex_shared,
352 .rbto_compare_key = compare_futex_shared_key,
353 .rbto_node_offset = offsetof(struct futex, fx_node),
354 };
355
356 /*
357 * futex_sq_lock2(f, f2)
358 *
359 * Acquire the sleepq locks for f and f2, which may be null, or
360 * which may be the same. If they are distinct, an arbitrary total
361 * order is chosen on the locks.
362 *
363 * Callers should only ever acquire multiple sleepq locks
364 * simultaneously using futex_sq_lock2.
365 */
366 static void
367 futex_sq_lock2(struct futex * const f, struct futex * const f2)
368 {
369
370 /*
371 * If both are null, do nothing; if one is null and the other
372 * is not, lock the other and be done with it.
373 */
374 if (f == NULL && f2 == NULL) {
375 return;
376 } else if (f == NULL) {
377 mutex_spin_enter(f2->fx_sq_lock);
378 return;
379 } else if (f2 == NULL) {
380 mutex_spin_enter(f->fx_sq_lock);
381 return;
382 }
383
384 kmutex_t * const m = f->fx_sq_lock;
385 kmutex_t * const m2 = f2->fx_sq_lock;
386
387 /* If both locks are the same, acquire only one. */
388 if (m == m2) {
389 mutex_spin_enter(m);
390 return;
391 }
392
393 /* Otherwise, use the ordering on the kva of the lock pointer. */
394 if ((uintptr_t)m < (uintptr_t)m2) {
395 mutex_spin_enter(m);
396 mutex_spin_enter(m2);
397 } else {
398 mutex_spin_enter(m2);
399 mutex_spin_enter(m);
400 }
401 }
402
403 /*
404 * futex_sq_unlock2(f, f2)
405 *
406 * Release the sleep queue locks for f and f2, which may be null, or
407 * which may have the same underlying lock.
408 */
409 static void
410 futex_sq_unlock2(struct futex * const f, struct futex * const f2)
411 {
412
413 /*
414 * If both are null, do nothing; if one is null and the other
415 * is not, unlock the other and be done with it.
416 */
417 if (f == NULL && f2 == NULL) {
418 return;
419 } else if (f == NULL) {
420 mutex_spin_exit(f2->fx_sq_lock);
421 return;
422 } else if (f2 == NULL) {
423 mutex_spin_exit(f->fx_sq_lock);
424 return;
425 }
426
427 kmutex_t * const m = f->fx_sq_lock;
428 kmutex_t * const m2 = f2->fx_sq_lock;
429
430 /* If both locks are the same, release only one. */
431 if (m == m2) {
432 mutex_spin_exit(m);
433 return;
434 }
435
436 /* Otherwise, use the ordering on the kva of the lock pointer. */
437 if ((uintptr_t)m < (uintptr_t)m2) {
438 mutex_spin_exit(m2);
439 mutex_spin_exit(m);
440 } else {
441 mutex_spin_exit(m);
442 mutex_spin_exit(m2);
443 }
444 }
445
446 /*
447 * futex_load(uaddr, kaddr)
448 *
449 * Perform a single atomic load to read *uaddr, and return the
450 * result in *kaddr. Return 0 on success, EFAULT if uaddr is not
451 * mapped.
452 */
453 static inline int
454 futex_load(int *uaddr, int *kaddr)
455 {
456 return ufetch_int((u_int *)uaddr, (u_int *)kaddr);
457 }
458
459 /*
460 * futex_test(uaddr, expected)
461 *
462 * True if *uaddr == expected. False if *uaddr != expected, or if
463 * uaddr is not mapped.
464 */
465 static bool
466 futex_test(int *uaddr, int expected)
467 {
468 int val;
469 int error;
470
471 error = futex_load(uaddr, &val);
472 if (error)
473 return false;
474 return val == expected;
475 }
476
477 static pool_cache_t futex_cache __read_mostly;
478
479 static int futex_ctor(void *, void *, int);
480 static void futex_dtor(void *, void *);
481
482 /*
483 * futex_sys_init()
484 *
485 * Initialize the futex subsystem.
486 */
487 void
488 futex_sys_init(void)
489 {
490
491 mutex_init(&futex_tab.lock, MUTEX_DEFAULT, IPL_NONE);
492 rb_tree_init(&futex_tab.va, &futex_rb_ops);
493 rb_tree_init(&futex_tab.oa, &futex_shared_rb_ops);
494
495 futex_cache = pool_cache_init(sizeof(struct futex),
496 coherency_unit, 0, 0, "futex", NULL, IPL_NONE, futex_ctor,
497 futex_dtor, NULL);
498 }
499
500 /*
501 * futex_sys_fini()
502 *
503 * Finalize the futex subsystem.
504 */
505 void
506 futex_sys_fini(void)
507 {
508
509 KASSERT(RB_TREE_MIN(&futex_tab.oa) == NULL);
510 KASSERT(RB_TREE_MIN(&futex_tab.va) == NULL);
511 mutex_destroy(&futex_tab.lock);
512 }
513
514 /*
515 * futex_ctor()
516 *
517 * Futex object constructor.
518 */
519 static int
520 futex_ctor(void *arg __unused, void *obj, int flags __unused)
521 {
522 extern sleepqlock_t sleepq_locks[SLEEPTAB_HASH_SIZE];
523 struct futex * const f = obj;
524
525 mutex_init(&f->fx_op_lock, MUTEX_DEFAULT, IPL_NONE);
526 f->fx_sq_lock = &sleepq_locks[SLEEPTAB_HASH(f)].lock;
527
528 sleepq_init(&f->fx_sqs[FUTEX_READERQ]);
529 sleepq_init(&f->fx_sqs[FUTEX_WRITERQ]);
530 f->fx_nwaiters[FUTEX_READERQ] = f->fx_nwaiters[FUTEX_WRITERQ] = 0;
531
532 return 0;
533 }
534
535 /*
536 * futex_dtor()
537 *
538 * Futex object destructor.
539 */
540 static void
541 futex_dtor(void *arg __unused, void *obj)
542 {
543 struct futex * const f = obj;
544
545 mutex_destroy(&f->fx_op_lock);
546 f->fx_sq_lock = NULL;
547 }
548
549 /*
550 * futex_key_init(key, vm, va, shared)
551 *
552 * Initialize a futex key for lookup, etc.
553 */
554 static int
555 futex_key_init(union futex_key *fk, struct vmspace *vm, vaddr_t va, bool shared)
556 {
557 int error = 0;
558
559 if (__predict_false(shared)) {
560 if (!uvm_voaddr_acquire(&vm->vm_map, va, &fk->fk_shared))
561 error = EFAULT;
562 } else {
563 fk->fk_private.vmspace = vm;
564 fk->fk_private.va = va;
565 }
566
567 return error;
568 }
569
570 /*
571 * futex_key_fini(key, shared)
572 *
573 * Release a futex key.
574 */
575 static void
576 futex_key_fini(union futex_key *fk, bool shared)
577 {
578 if (__predict_false(shared))
579 uvm_voaddr_release(&fk->fk_shared);
580 memset(fk, 0, sizeof(*fk));
581 }
582
583 /*
584 * futex_create(fk, shared)
585 *
586 * Create a futex. Initial reference count is 1, representing the
587 * caller. Returns NULL on failure. Always takes ownership of the
588 * key, either transferring it to the newly-created futex, or releasing
589 * the key if creation fails.
590 *
591 * Never sleeps for memory, but may sleep to acquire a lock.
592 */
593 static struct futex *
594 futex_create(union futex_key *fk, bool shared, uint8_t class)
595 {
596 struct futex *f;
597
598 f = pool_cache_get(futex_cache, PR_NOWAIT);
599 if (f == NULL) {
600 futex_key_fini(fk, shared);
601 return NULL;
602 }
603 f->fx_key = *fk;
604 f->fx_refcnt = 1;
605 f->fx_shared = shared;
606 f->fx_on_tree = false;
607 f->fx_class = class;
608
609 return f;
610 }
611
612 /*
613 * futex_destroy(f)
614 *
615 * Destroy a futex created with futex_create. Reference count
616 * must be zero.
617 *
618 * May sleep.
619 */
620 static void
621 futex_destroy(struct futex *f)
622 {
623
624 ASSERT_SLEEPABLE();
625
626 KASSERT(atomic_load_relaxed(&f->fx_refcnt) == 0);
627 KASSERT(!f->fx_on_tree);
628 KASSERT(LIST_EMPTY(&f->fx_sqs[FUTEX_READERQ]));
629 KASSERT(LIST_EMPTY(&f->fx_sqs[FUTEX_WRITERQ]));
630 KASSERT(f->fx_nwaiters[FUTEX_READERQ] == 0);
631 KASSERT(f->fx_nwaiters[FUTEX_WRITERQ] == 0);
632
633 futex_key_fini(&f->fx_key, f->fx_shared);
634
635 pool_cache_put(futex_cache, f);
636 }
637
638 /*
639 * futex_hold_count(f, n)
640 *
641 * Attempt to acquire a reference to f. Return 0 on success,
642 * ENFILE on too many references.
643 *
644 * Never sleeps.
645 */
646 static int
647 futex_hold_count(struct futex *f, unsigned long const count)
648 {
649 unsigned long refcnt;
650
651 do {
652 refcnt = atomic_load_relaxed(&f->fx_refcnt);
653 if (ULONG_MAX - refcnt < count)
654 return ENFILE;
655 } while (atomic_cas_ulong(&f->fx_refcnt, refcnt,
656 refcnt + count) != refcnt);
657
658 return 0;
659 }
660 #define futex_hold(f) futex_hold_count(f, 1)
661
662 /*
663 * futex_rele_count(f, n)
664 *
665 * Release a reference to f acquired with futex_create or
666 * futex_hold.
667 *
668 * May sleep to free f.
669 */
670 static void
671 futex_rele_count(struct futex *f, unsigned long const count)
672 {
673 unsigned long refcnt;
674
675 ASSERT_SLEEPABLE();
676
677 do {
678 refcnt = atomic_load_relaxed(&f->fx_refcnt);
679 KASSERT(refcnt >= count);
680 if (refcnt - count == 0)
681 goto trylast;
682 } while (atomic_cas_ulong(&f->fx_refcnt, refcnt,
683 refcnt - count) != refcnt);
684 return;
685
686 trylast:
687 KASSERT(count <= LONG_MAX);
688 mutex_enter(&futex_tab.lock);
689 if (atomic_add_long_nv(&f->fx_refcnt, -(long)count) == 0) {
690 if (f->fx_on_tree) {
691 if (__predict_false(f->fx_shared))
692 rb_tree_remove_node(&futex_tab.oa, f);
693 else
694 rb_tree_remove_node(&futex_tab.va, f);
695 f->fx_on_tree = false;
696 }
697 } else {
698 /* References remain -- don't destroy it. */
699 f = NULL;
700 }
701 mutex_exit(&futex_tab.lock);
702 if (f != NULL)
703 futex_destroy(f);
704 }
705 #define futex_rele(f) futex_rele_count(f, 1)
706
707 /*
708 * futex_rele_count_not_last(f, n)
709 *
710 * Release a reference to f acquired with futex_create or
711 * futex_hold.
712 *
713 * This version asserts that we are not dropping the last
714 * reference to f.
715 */
716 static void
717 futex_rele_count_not_last(struct futex *f, unsigned long const count)
718 {
719 unsigned long refcnt;
720
721 do {
722 refcnt = atomic_load_relaxed(&f->fx_refcnt);
723 KASSERT(refcnt >= count);
724 } while (atomic_cas_ulong(&f->fx_refcnt, refcnt,
725 refcnt - count) != refcnt);
726 }
727
728 /*
729 * futex_lookup_by_key(key, shared, class, &f)
730 *
731  *	Try to find an existing futex va reference for the specified key.
732  *	On success, return 0, set f to the found futex or to NULL if not found,
733 * and increment f's reference count if found.
734 *
735 * Return ENFILE if reference count too high.
736 *
737 * Internal lookup routine shared by futex_lookup() and
738 * futex_lookup_create().
739 */
740 static int
741 futex_lookup_by_key(union futex_key *fk, bool shared, uint8_t class,
742 struct futex **fp)
743 {
744 struct futex *f;
745 int error = 0;
746
747 mutex_enter(&futex_tab.lock);
748 if (__predict_false(shared)) {
749 f = rb_tree_find_node(&futex_tab.oa, fk);
750 } else {
751 f = rb_tree_find_node(&futex_tab.va, fk);
752 }
753 if (f) {
754 if (__predict_true(f->fx_class == class ||
755 class == FUTEX_CLASS_ANY))
756 error = futex_hold(f);
757 else
758 error = EINVAL;
759 if (error)
760 f = NULL;
761 }
762 *fp = f;
763 mutex_exit(&futex_tab.lock);
764
765 return error;
766 }
767
768 /*
769 * futex_insert(f, fp)
770 *
771 * Try to insert the futex f into the tree by va. If there
772 * already is a futex for its va, acquire a reference to it, and
773 * store it in *fp; otherwise store f in *fp.
774 *
775 * Return 0 on success, ENFILE if there already is a futex but its
776 * reference count is too high.
777 */
778 static int
779 futex_insert(struct futex *f, struct futex **fp)
780 {
781 struct futex *f0;
782 int error;
783
784 KASSERT(atomic_load_relaxed(&f->fx_refcnt) != 0);
785 KASSERT(!f->fx_on_tree);
786
787 mutex_enter(&futex_tab.lock);
788 if (__predict_false(f->fx_shared))
789 f0 = rb_tree_insert_node(&futex_tab.oa, f);
790 else
791 f0 = rb_tree_insert_node(&futex_tab.va, f);
792 if (f0 == f) {
793 f->fx_on_tree = true;
794 error = 0;
795 } else {
796 KASSERT(atomic_load_relaxed(&f0->fx_refcnt) != 0);
797 KASSERT(f0->fx_on_tree);
798 error = futex_hold(f0);
799 if (error)
800 goto out;
801 }
802 *fp = f0;
803 out: mutex_exit(&futex_tab.lock);
804
805 return error;
806 }
807
808 /*
809 * futex_lookup(uaddr, shared, class, &f)
810 *
811 * Find a futex at the userland pointer uaddr in the current
812 * process's VM space. On success, return the futex in f and
813 * increment its reference count.
814 *
815 * Caller must call futex_rele when done.
816 */
817 static int
818 futex_lookup(int *uaddr, bool shared, uint8_t class, struct futex **fp)
819 {
820 union futex_key fk;
821 struct vmspace *vm = curproc->p_vmspace;
822 vaddr_t va = (vaddr_t)uaddr;
823 int error;
824
825 /*
826 * Reject unaligned user pointers so we don't cross page
827 * boundaries and so atomics will work.
828 */
829 if (__predict_false((va & 3) != 0))
830 return EINVAL;
831
832 /* Look it up. */
833 error = futex_key_init(&fk, vm, va, shared);
834 if (error)
835 return error;
836
837 error = futex_lookup_by_key(&fk, shared, class, fp);
838 futex_key_fini(&fk, shared);
839 if (error)
840 return error;
841
842 KASSERT(*fp == NULL || (*fp)->fx_shared == shared);
843 KASSERT(*fp == NULL || (*fp)->fx_class == class);
844 KASSERT(*fp == NULL || atomic_load_relaxed(&(*fp)->fx_refcnt) != 0);
845
846 /*
847 * Success! (Caller must still check whether we found
848 * anything, but nothing went _wrong_ like trying to use
849 * unmapped memory.)
850 */
851 KASSERT(error == 0);
852
853 return error;
854 }
855
856 /*
857 * futex_lookup_create(uaddr, shared, class, &f)
858 *
859 * Find or create a futex at the userland pointer uaddr in the
860 * current process's VM space. On success, return the futex in f
861 * and increment its reference count.
862 *
863 * Caller must call futex_rele when done.
864 */
865 static int
866 futex_lookup_create(int *uaddr, bool shared, uint8_t class, struct futex **fp)
867 {
868 union futex_key fk;
869 struct vmspace *vm = curproc->p_vmspace;
870 struct futex *f = NULL;
871 vaddr_t va = (vaddr_t)uaddr;
872 int error;
873
874 /*
875 * Reject unaligned user pointers so we don't cross page
876 * boundaries and so atomics will work.
877 */
878 if (__predict_false((va & 3) != 0))
879 return EINVAL;
880
881 if (__predict_false(class == FUTEX_CLASS_ANY))
882 return EINVAL;
883
884 error = futex_key_init(&fk, vm, va, shared);
885 if (error)
886 return error;
887
888 /*
889 * Optimistically assume there already is one, and try to find
890 * it.
891 */
892 error = futex_lookup_by_key(&fk, shared, class, fp);
893 if (error || *fp != NULL) {
894 /*
895 * We either found one, or there was an error.
896 * In either case, we are done with the key.
897 */
898 futex_key_fini(&fk, shared);
899 goto out;
900 }
901
902 /*
903 	 * Create a futex record.  This transfers ownership of the key
904 * in all cases.
905 */
906 f = futex_create(&fk, shared, class);
907 if (f == NULL) {
908 error = ENOMEM;
909 goto out;
910 }
911
912 /*
913 * Insert our new futex, or use existing if someone else beat
914 * us to it.
915 */
916 error = futex_insert(f, fp);
917 if (error)
918 goto out;
919 if (*fp == f)
920 f = NULL; /* don't release on exit */
921
922 /* Success! */
923 KASSERT(error == 0);
924
925 out: if (f != NULL)
926 futex_rele(f);
927 KASSERT(error || *fp != NULL);
928 KASSERT(error || atomic_load_relaxed(&(*fp)->fx_refcnt) != 0);
929 return error;
930 }
931
932 /*
933 * futex_unsleep:
934 *
935 * Remove an LWP from the futex and sleep queue. This is called when
936 * the LWP has not been awoken normally but instead interrupted: for
937 * example, when a signal is received. Must be called with the LWP
938 * locked. Will unlock if "unlock" is true.
939 */
940 static void
941 futex_unsleep(lwp_t *l, bool unlock)
942 {
943 struct futex *f __diagused;
944
945 f = (struct futex *)(uintptr_t)l->l_wchan;
946
947 KASSERT(mutex_owned(f->fx_sq_lock));
948
949 /* WRITERQ is more likely */
950 if (__predict_true(l->l_sleepq == &f->fx_sqs[FUTEX_WRITERQ])) {
951 KASSERT(f->fx_nwaiters[FUTEX_WRITERQ] > 0);
952 f->fx_nwaiters[FUTEX_WRITERQ]--;
953 } else {
954 KASSERT(l->l_sleepq == &f->fx_sqs[FUTEX_READERQ]);
955 KASSERT(f->fx_nwaiters[FUTEX_READERQ] > 0);
956 f->fx_nwaiters[FUTEX_READERQ]--;
957 }
958
959 sleepq_unsleep(l, unlock);
960 }
961
962 /*
963  * futex_wait(f, q, deadline, clkid, bitset)
964 *
965 * Wait until deadline on the clock clkid, or forever if deadline
966 * is NULL, for a futex wakeup. Return 0 on explicit wakeup,
967 * ETIMEDOUT on timeout, EINTR on signal.
968 */
969 static int
970 futex_wait(struct futex *f, int q, const struct timespec *deadline,
971 clockid_t clkid, int bitset)
972 {
973
974 /*
975 * Some things to note about how this function works:
976 *
977 * ==> When we enter futex_wait(), the futex's op lock is
978 * held, preventing us from missing wakes. Once we are on
979 * the futex's sleepq, it is safe to release the op lock.
980 *
981 * ==> We have a single early escape to avoid calling
982 * sleepq_block(): our deadline already having passed.
983 * If this is a no-timeout wait, or if the deadline has
984 * not already passed, we are guaranteed to call sleepq_block().
985 *
986 * ==> sleepq_block() contains all of the necessary logic
987 * for handling signals; we do not need to check for them
988 * ourselves.
989 *
990 * ==> Once we have blocked, other futex operations will
991 * be able to wake us or requeue us. Until we have blocked,
992 * those other futex operations will not be able to acquire
993 * the sleepq locks in order to perform those operations,
994 * even if we have dropped the op lock. Thus, we will not
995 * miss wakes. This fundamental invariant is relied upon by
996 * every other synchronization object in the kernel.
997 *
998 * ==> sleepq_block() returns for one of three reasons:
999 *
1000 * -- We were awakened.
1001 * -- We were signaled.
1002 * -- We timed out.
1003 *
1004 * ==> Once sleepq_block() returns, we will not call
1005 * sleepq_block() again.
1006 *
1007 * ==> If sleepq_block() returns due to being signaled,
1008 * we must check to see if we were also awakened. This
1009 * is to handle the following sequence:
1010 *
1011 * -- Futex wake takes us off sleepq, places us on runq.
1012 * -- We are signaled while sitting on the runq.
1013 * -- We run, and sleepq_block() notices the signal and
1014 * returns EINTR/ERESTART.
1015 *
1016 * In this situation, we want to indicate a successful wake
1017 * to the caller, because that wake has been reported to the
1018 * thread that issued it.
1019 *
1020 * Note that it is NOT possible for a wake to be issued after
1021 * a signal; the LWP will already have been taken off the sleepq
1022 * by the signal, so the wake will never happen. Note that for
1023 * this reason, we must *never* return ERESTART, because it could
1024 * result in us waiting again with no one to awaken us.
1025 */
1026
1027 struct lwp * const l = curlwp;
1028 int timo;
1029 int error;
1030
1031 /*
1032 * Compute our timeout before taking any locks.
1033 */
1034 if (deadline) {
1035 struct timespec ts;
1036
1037 /* Check our watch. */
1038 error = clock_gettime1(clkid, &ts);
1039 if (error) {
1040 mutex_exit(&f->fx_op_lock);
1041 return error;
1042 }
1043
1044 /*
1045 * If we're past the deadline, ETIMEDOUT.
1046 */
1047 if (timespeccmp(deadline, &ts, <=)) {
1048 mutex_exit(&f->fx_op_lock);
1049 return ETIMEDOUT;
1050 } else {
1051 /* Count how much time is left. */
1052 timespecsub(deadline, &ts, &ts);
1053 timo = tstohz(&ts);
1054 if (timo == 0) {
1055 /*
1056 * tstohz() already rounds up, and
1057 * a return value of 0 ticks means
1058 * "expired now or in the past".
1059 */
1060 mutex_exit(&f->fx_op_lock);
1061 return ETIMEDOUT;
1062 }
1063 }
1064 } else {
1065 timo = 0;
1066 }
1067
1068 /*
1069 * Acquire in sleepq -> lwp order. While we're at it, ensure
1070 * that there's room for us to block.
1071 */
1072 mutex_spin_enter(f->fx_sq_lock);
1073 if (__predict_false(q == FUTEX_READERQ &&
1074 f->fx_nwaiters[q] == FUTEX_TID_MASK)) {
1075 mutex_spin_exit(f->fx_sq_lock);
1076 mutex_exit(&f->fx_op_lock);
1077 return ENFILE;
1078 }
1079 lwp_lock(l);
1080
1081 /*
1082 * No need for locks here; until we're on a futex's sleepq, these
1083 * values are private to the LWP, and the locks needed to get onto
1084 * the sleepq provide the memory barriers needed to ensure visibility.
1085 */
1086 l->l_futex = f;
1087 l->l_futex_wakesel = bitset;
1088
1089 /*
1090 * This is an inlined bit of sleepq_enter();
1091 * we already hold the lwp lock.
1092 */
1093 lwp_unlock_to(l, f->fx_sq_lock);
1094 KERNEL_UNLOCK_ALL(NULL, &l->l_biglocks);
1095 KASSERT(lwp_locked(l, f->fx_sq_lock));
1096
1097 sleepq_enqueue(&f->fx_sqs[q], f, futex_wmesg, &futex_syncobj, true);
1098 f->fx_nwaiters[q]++;
1099
1100 /* We can now safely release the op lock. */
1101 mutex_exit(&f->fx_op_lock);
1102
1103 SDT_PROBE1(futex, wait, sleepq_block, entry, timo);
1104 error = sleepq_block(timo, true);
1105 SDT_PROBE1(futex, wait, sleepq_block, exit, error);
1106
1107 /*
1108 * f is no longer valid; we may have been moved to another
1109 * futex sleepq while we slept.
1110 */
1111
1112 /*
1113 * If something went wrong, we may still have a futex reference.
1114 * Check for that here and drop it if so. We can do this without
1115 * taking any locks because l->l_futex is private to the LWP when
1116 * not on any futex's sleepq, and we are not on any sleepq because
1117 * we are running.
1118 *
1119 * Convert EWOULDBLOCK to ETIMEDOUT in case sleepq_block() returned
1120 * EWOULDBLOCK, and ensure the only other error we return is EINTR.
1121 */
1122 if (error) {
1123 f = l->l_futex;
1124 if (f != NULL) {
1125 SDT_PROBE0(futex, wait, finish, aborted);
1126 l->l_futex = NULL;
1127 futex_rele(f);
1128 } else {
1129 /* Raced with wakeup; report success. */
1130 SDT_PROBE0(futex, wait, finish, wakerace);
1131 error = 0;
1132 }
1133 if (error == EWOULDBLOCK)
1134 error = ETIMEDOUT;
1135 else if (error != ETIMEDOUT)
1136 error = EINTR;
1137 } else {
1138 KASSERT(l->l_futex == NULL);
1139 SDT_PROBE0(futex, wait, finish, normally);
1140 }
1141
1142 return error;
1143 }
1144
1145 /*
1146 * futex_wake(f, q, nwake, f2, q2, nrequeue, bitset)
1147 *
1148 * Wake up to nwake waiters on f matching bitset; then, if f2 is
1149 * provided, move up to nrequeue remaining waiters on f matching
1150  *	bitset to f2.  Return the number of waiters woken plus the number
1151  *	requeued.  Caller must hold the op locks of f and f2, if provided.
1152 */
1153 static unsigned
1154 futex_wake(struct futex *f, int q, unsigned int const nwake,
1155 struct futex *f2, int q2, unsigned int const nrequeue,
1156 int bitset)
1157 {
1158 struct lwp *l, *l_next;
1159 unsigned nwoken = 0;
1160 unsigned nrequeued = 0;
1161
1162 KASSERT(mutex_owned(&f->fx_op_lock));
1163 KASSERT(f2 == NULL || mutex_owned(&f2->fx_op_lock));
1164
1165 futex_sq_lock2(f, f2);
1166
1167 /* Wake up to nwake waiters, and count the number woken. */
1168 LIST_FOREACH_SAFE(l, &f->fx_sqs[q], l_sleepchain, l_next) {
1169 if (nwoken == nwake)
1170 break;
1171
1172 KASSERT(l->l_syncobj == &futex_syncobj);
1173 KASSERT(l->l_wchan == (wchan_t)f);
1174 KASSERT(l->l_futex == f);
1175 KASSERT(l->l_sleepq == &f->fx_sqs[q]);
1176 KASSERT(l->l_mutex == f->fx_sq_lock);
1177
1178 if ((l->l_futex_wakesel & bitset) == 0)
1179 continue;
1180
1181 l->l_futex_wakesel = 0;
1182 l->l_futex = NULL;
1183 sleepq_remove(&f->fx_sqs[q], l);
1184 f->fx_nwaiters[q]--;
1185 nwoken++;
1186 /* hold counts adjusted in bulk below */
1187 }
1188
1189 if (nrequeue) {
1190 KASSERT(f2 != NULL);
1191 /* Move up to nrequeue waiters from f's queue to f2's queue. */
1192 LIST_FOREACH_SAFE(l, &f->fx_sqs[q], l_sleepchain, l_next) {
1193 if (nrequeued == nrequeue)
1194 break;
1195
1196 KASSERT(l->l_syncobj == &futex_syncobj);
1197 KASSERT(l->l_wchan == (wchan_t)f);
1198 KASSERT(l->l_futex == f);
1199 KASSERT(l->l_sleepq == &f->fx_sqs[q]);
1200 KASSERT(l->l_mutex == f->fx_sq_lock);
1201
1202 if ((l->l_futex_wakesel & bitset) == 0)
1203 continue;
1204
1205 l->l_futex = f2;
1206 sleepq_transfer(l, &f->fx_sqs[q],
1207 &f2->fx_sqs[q2], f2, futex_wmesg,
1208 &futex_syncobj, f2->fx_sq_lock, true);
1209 f->fx_nwaiters[q]--;
1210 f2->fx_nwaiters[q2]++;
1211 nrequeued++;
1212 /* hold counts adjusted in bulk below */
1213 }
1214 if (nrequeued) {
1215 /* XXX futex_hold() could theoretically fail here. */
1216 int hold_error __diagused =
1217 futex_hold_count(f2, nrequeued);
1218 KASSERT(hold_error == 0);
1219 }
1220 }
1221
1222 /*
1223 * Adjust the futex reference counts for the wakes and
1224 * requeues.
1225 */
1226 KASSERT(nwoken + nrequeued <= LONG_MAX);
1227 futex_rele_count_not_last(f, nwoken + nrequeued);
1228
1229 futex_sq_unlock2(f, f2);
1230
1231 /* Return the number of waiters woken and requeued. */
1232 return nwoken + nrequeued;
1233 }
1234
1235 /*
1236 * futex_op_lock(f)
1237 *
1238 * Acquire the op lock of f. Pair with futex_op_unlock. Do
1239 * not use if caller needs to acquire two locks; use
1240 * futex_op_lock2 instead.
1241 */
1242 static void
1243 futex_op_lock(struct futex *f)
1244 {
1245 mutex_enter(&f->fx_op_lock);
1246 }
1247
1248 /*
1249 * futex_op_unlock(f)
1250 *
1251  *	Release the op lock of f.
1252 */
1253 static void
1254 futex_op_unlock(struct futex *f)
1255 {
1256 mutex_exit(&f->fx_op_lock);
1257 }
1258
1259 /*
1260 * futex_op_lock2(f, f2)
1261 *
1262 * Acquire the op locks of both f and f2, which may be null, or
1263 * which may be the same. If they are distinct, an arbitrary total
1264 * order is chosen on the locks.
1265 *
1266 * Callers should only ever acquire multiple op locks
1267 * simultaneously using futex_op_lock2.
1268 */
1269 static void
1270 futex_op_lock2(struct futex *f, struct futex *f2)
1271 {
1272
1273 /*
1274 * If both are null, do nothing; if one is null and the other
1275 * is not, lock the other and be done with it.
1276 */
1277 if (f == NULL && f2 == NULL) {
1278 return;
1279 } else if (f == NULL) {
1280 mutex_enter(&f2->fx_op_lock);
1281 return;
1282 } else if (f2 == NULL) {
1283 mutex_enter(&f->fx_op_lock);
1284 return;
1285 }
1286
1287 /* If both futexes are the same, acquire only one. */
1288 if (f == f2) {
1289 mutex_enter(&f->fx_op_lock);
1290 return;
1291 }
1292
1293 /* Otherwise, use the ordering on the kva of the futex pointer. */
1294 if ((uintptr_t)f < (uintptr_t)f2) {
1295 mutex_enter(&f->fx_op_lock);
1296 mutex_enter(&f2->fx_op_lock);
1297 } else {
1298 mutex_enter(&f2->fx_op_lock);
1299 mutex_enter(&f->fx_op_lock);
1300 }
1301 }
1302
1303 /*
1304 * futex_op_unlock2(f, f2)
1305 *
1306  *	Release the op locks of both f and f2, which may be null, or
1307  *	which may be the same.
1308 */
1309 static void
1310 futex_op_unlock2(struct futex *f, struct futex *f2)
1311 {
1312
1313 /*
1314 * If both are null, do nothing; if one is null and the other
1315 * is not, unlock the other and be done with it.
1316 */
1317 if (f == NULL && f2 == NULL) {
1318 return;
1319 } else if (f == NULL) {
1320 mutex_exit(&f2->fx_op_lock);
1321 return;
1322 } else if (f2 == NULL) {
1323 mutex_exit(&f->fx_op_lock);
1324 return;
1325 }
1326
1327 /* If both futexes are the same, release only one. */
1328 if (f == f2) {
1329 mutex_exit(&f->fx_op_lock);
1330 return;
1331 }
1332
1333 /* Otherwise, use the ordering on the kva of the futex pointer. */
1334 if ((uintptr_t)f < (uintptr_t)f2) {
1335 mutex_exit(&f2->fx_op_lock);
1336 mutex_exit(&f->fx_op_lock);
1337 } else {
1338 mutex_exit(&f->fx_op_lock);
1339 mutex_exit(&f2->fx_op_lock);
1340 }
1341 }
1342
1343 /*
1344 * futex_func_wait(uaddr, val, val3, timeout, clkid, clkflags, retval)
1345 *
1346 * Implement futex(FUTEX_WAIT) and futex(FUTEX_WAIT_BITSET).
1347 */
1348 static int
1349 futex_func_wait(bool shared, int *uaddr, int val, int val3,
1350 const struct timespec *timeout, clockid_t clkid, int clkflags,
1351 register_t *retval)
1352 {
1353 struct futex *f;
1354 struct timespec ts;
1355 const struct timespec *deadline;
1356 int error;
1357
1358 /*
1359 * If there's nothing to wait for, and nobody will ever wake
1360 * us, then don't set anything up to wait -- just stop here.
1361 */
1362 if (val3 == 0)
1363 return EINVAL;
1364
1365 /* Optimistically test before anything else. */
1366 if (!futex_test(uaddr, val))
1367 return EAGAIN;
1368
1369 /* Determine a deadline on the specified clock. */
1370 if (timeout == NULL || (clkflags & TIMER_ABSTIME) == TIMER_ABSTIME) {
1371 deadline = timeout;
1372 } else {
1373 struct timespec interval = *timeout;
1374
1375 error = itimespecfix(&interval);
1376 if (error)
1377 return error;
1378 error = clock_gettime1(clkid, &ts);
1379 if (error)
1380 return error;
1381 timespecadd(&ts, &interval, &ts);
1382 deadline = &ts;
1383 }
1384
1385 /* Get the futex, creating it if necessary. */
1386 error = futex_lookup_create(uaddr, shared, FUTEX_CLASS_NORMAL, &f);
1387 if (error)
1388 return error;
1389 KASSERT(f);
1390
1391 /*
1392 * Under the op lock, check the value again: if it has
1393 * already changed, EAGAIN; otherwise enqueue the waiter.
1394 * Since FUTEX_WAKE will use the same lock and be done after
1395 * modifying the value, the order in which we check and enqueue
1396 * is immaterial.
1397 */
1398 futex_op_lock(f);
1399 if (!futex_test(uaddr, val)) {
1400 futex_op_unlock(f);
1401 error = EAGAIN;
1402 goto out;
1403 }
1404
1405 /*
1406 * Now wait. futex_wait() will drop our op lock once we
1407 * are entered into the sleep queue, thus ensuring atomicity
1408 * of wakes with respect to waits.
1409 */
1410 error = futex_wait(f, FUTEX_WRITERQ, deadline, clkid, val3);
1411
1412 /*
1413 * We cannot drop our reference to the futex here, because
1414 * we might be enqueued on a different one when we are awakened.
1415 * The references will be managed on our behalf in the requeue,
1416 * wake, and error cases.
1417 */
1418 f = NULL;
1419
1420 if (__predict_true(error == 0)) {
1421 /* Return 0 on success, error on failure. */
1422 *retval = 0;
1423 }
1424
1425 out: if (f != NULL)
1426 futex_rele(f);
1427 return error;
1428 }
1429
1430 /*
1431 * futex_func_wake(uaddr, val, val3, retval)
1432 *
1433 * Implement futex(FUTEX_WAKE) and futex(FUTEX_WAKE_BITSET).
1434 */
1435 static int
1436 futex_func_wake(bool shared, int *uaddr, int val, int val3, register_t *retval)
1437 {
1438 struct futex *f;
1439 unsigned int nwoken = 0;
1440 int error = 0;
1441
1442 /* Reject negative number of wakeups. */
1443 if (val < 0) {
1444 error = EINVAL;
1445 goto out;
1446 }
1447
1448 /* Look up the futex, if any. */
1449 error = futex_lookup(uaddr, shared, FUTEX_CLASS_NORMAL, &f);
1450 if (error)
1451 goto out;
1452
1453 /* If there's no futex, there are no waiters to wake. */
1454 if (f == NULL)
1455 goto out;
1456
1457 /*
1458 * Under f's op lock, wake the waiters and remember the
1459 * number woken.
1460 */
1461 futex_op_lock(f);
1462 nwoken = futex_wake(f, FUTEX_WRITERQ, val,
1463 NULL, FUTEX_WRITERQ, 0, val3);
1464 futex_op_unlock(f);
1465
1466 /* Release the futex. */
1467 futex_rele(f);
1468
1469 out:
1470 /* Return the number of waiters woken. */
1471 *retval = nwoken;
1472
1473 /* Success! */
1474 return error;
1475 }
1476
1477 /*
1478 * futex_func_requeue(op, uaddr, val, uaddr2, val2, val3, retval)
1479 *
1480 * Implement futex(FUTEX_REQUEUE) and futex(FUTEX_CMP_REQUEUE).
1481 */
1482 static int
1483 futex_func_requeue(bool shared, int op, int *uaddr, int val, int *uaddr2,
1484 int val2, int val3, register_t *retval)
1485 {
1486 struct futex *f = NULL, *f2 = NULL;
1487 unsigned nwoken = 0; /* default to zero woken on early return */
1488 int error;
1489
1490 /* Reject negative number of wakeups or requeues. */
1491 if (val < 0 || val2 < 0) {
1492 error = EINVAL;
1493 goto out;
1494 }
1495
1496 /* Look up the source futex, if any. */
1497 error = futex_lookup(uaddr, shared, FUTEX_CLASS_NORMAL, &f);
1498 if (error)
1499 goto out;
1500
1501 /* If there is none, nothing to do. */
1502 if (f == NULL)
1503 goto out;
1504
1505 /*
1506 * We may need to create the destination futex because it's
1507 * entirely possible it does not currently have any waiters.
1508 */
1509 error = futex_lookup_create(uaddr2, shared, FUTEX_CLASS_NORMAL, &f2);
1510 if (error)
1511 goto out;
1512
1513 /*
1514 * Under the futexes' op locks, check the value; if
1515 * unchanged from val3, wake the waiters.
1516 */
1517 futex_op_lock2(f, f2);
1518 if (op == FUTEX_CMP_REQUEUE && !futex_test(uaddr, val3)) {
1519 error = EAGAIN;
1520 } else {
1521 error = 0;
1522 nwoken = futex_wake(f, FUTEX_WRITERQ, val,
1523 f2, FUTEX_WRITERQ, val2,
1524 FUTEX_BITSET_MATCH_ANY);
1525 }
1526 futex_op_unlock2(f, f2);
1527
1528 out:
1529 /* Return the number of waiters woken. */
1530 *retval = nwoken;
1531
1532 /* Release the futexes if we got them. */
1533 if (f2)
1534 futex_rele(f2);
1535 if (f)
1536 futex_rele(f);
1537 return error;
1538 }
1539
1540 /*
1541 * futex_validate_op_cmp(val3)
1542 *
1543 * Validate an op/cmp argument for FUTEX_WAKE_OP.
1544 */
1545 static int
1546 futex_validate_op_cmp(int val3)
1547 {
1548 int op = __SHIFTOUT(val3, FUTEX_OP_OP_MASK);
1549 int cmp = __SHIFTOUT(val3, FUTEX_OP_CMP_MASK);
1550
1551 if (op & FUTEX_OP_OPARG_SHIFT) {
1552 int oparg = __SHIFTOUT(val3, FUTEX_OP_OPARG_MASK);
1553 if (oparg < 0)
1554 return EINVAL;
1555 if (oparg >= 32)
1556 return EINVAL;
1557 op &= ~FUTEX_OP_OPARG_SHIFT;
1558 }
1559
1560 switch (op) {
1561 case FUTEX_OP_SET:
1562 case FUTEX_OP_ADD:
1563 case FUTEX_OP_OR:
1564 case FUTEX_OP_ANDN:
1565 case FUTEX_OP_XOR:
1566 break;
1567 default:
1568 return EINVAL;
1569 }
1570
1571 switch (cmp) {
1572 case FUTEX_OP_CMP_EQ:
1573 case FUTEX_OP_CMP_NE:
1574 case FUTEX_OP_CMP_LT:
1575 case FUTEX_OP_CMP_LE:
1576 case FUTEX_OP_CMP_GT:
1577 case FUTEX_OP_CMP_GE:
1578 break;
1579 default:
1580 return EINVAL;
1581 }
1582
1583 return 0;
1584 }
1585
1586 /*
1587 * futex_compute_op(oldval, val3)
1588 *
1589  *	Apply a FUTEX_WAKE_OP operation to oldval.
1590 */
1591 static int
1592 futex_compute_op(int oldval, int val3)
1593 {
1594 int op = __SHIFTOUT(val3, FUTEX_OP_OP_MASK);
1595 int oparg = __SHIFTOUT(val3, FUTEX_OP_OPARG_MASK);
1596
1597 if (op & FUTEX_OP_OPARG_SHIFT) {
1598 KASSERT(oparg >= 0);
1599 KASSERT(oparg < 32);
1600 oparg = 1u << oparg;
1601 op &= ~FUTEX_OP_OPARG_SHIFT;
1602 }
1603
1604 switch (op) {
1605 case FUTEX_OP_SET:
1606 return oparg;
1607
1608 case FUTEX_OP_ADD:
1609 /*
1610 * Avoid signed arithmetic overflow by doing
1611 * arithmetic unsigned and converting back to signed
1612 * at the end.
1613 */
1614 return (int)((unsigned)oldval + (unsigned)oparg);
1615
1616 case FUTEX_OP_OR:
1617 return oldval | oparg;
1618
1619 case FUTEX_OP_ANDN:
1620 return oldval & ~oparg;
1621
1622 case FUTEX_OP_XOR:
1623 return oldval ^ oparg;
1624
1625 default:
1626 panic("invalid futex op");
1627 }
1628 }
1629
1630 /*
1631 * futex_compute_cmp(oldval, val3)
1632 *
1633  *	Apply a FUTEX_WAKE_OP comparison to oldval.
1634 */
1635 static bool
1636 futex_compute_cmp(int oldval, int val3)
1637 {
1638 int cmp = __SHIFTOUT(val3, FUTEX_OP_CMP_MASK);
1639 int cmparg = __SHIFTOUT(val3, FUTEX_OP_CMPARG_MASK);
1640
1641 switch (cmp) {
1642 case FUTEX_OP_CMP_EQ:
1643 return (oldval == cmparg);
1644
1645 case FUTEX_OP_CMP_NE:
1646 return (oldval != cmparg);
1647
1648 case FUTEX_OP_CMP_LT:
1649 return (oldval < cmparg);
1650
1651 case FUTEX_OP_CMP_LE:
1652 return (oldval <= cmparg);
1653
1654 case FUTEX_OP_CMP_GT:
1655 return (oldval > cmparg);
1656
1657 case FUTEX_OP_CMP_GE:
1658 return (oldval >= cmparg);
1659
1660 default:
1661 panic("invalid futex cmp operation");
1662 }
1663 }
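
/*
 * For illustration, a val3 for these helpers could be built with the
 * __SHIFTIN() counterparts of the masks consumed above (a sketch only):
 *
 *	val3 = __SHIFTIN(FUTEX_OP_ADD, FUTEX_OP_OP_MASK) |
 *	    __SHIFTIN(1, FUTEX_OP_OPARG_MASK) |
 *	    __SHIFTIN(FUTEX_OP_CMP_GT, FUTEX_OP_CMP_MASK) |
 *	    __SHIFTIN(0, FUTEX_OP_CMPARG_MASK);
 *
 * Fed to futex_func_wake_op() below, this atomically performs
 * *uaddr2 += 1, wakes up to val waiters on uaddr, and, because the
 * comparison is oldval > 0, also wakes up to val2 waiters on uaddr2
 * if the old value of *uaddr2 was positive.
 */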
1664
1665 /*
1666 * futex_func_wake_op(uaddr, val, uaddr2, val2, val3, retval)
1667 *
1668 * Implement futex(FUTEX_WAKE_OP).
1669 */
1670 static int
1671 futex_func_wake_op(bool shared, int *uaddr, int val, int *uaddr2, int val2,
1672 int val3, register_t *retval)
1673 {
1674 struct futex *f = NULL, *f2 = NULL;
1675 int oldval, newval, actual;
1676 unsigned nwoken = 0;
1677 int error;
1678
1679 /* Reject negative number of wakeups. */
1680 if (val < 0 || val2 < 0) {
1681 error = EINVAL;
1682 goto out;
1683 }
1684
1685 /* Reject invalid operations before we start doing things. */
1686 if ((error = futex_validate_op_cmp(val3)) != 0)
1687 goto out;
1688
1689 /* Look up the first futex, if any. */
1690 error = futex_lookup(uaddr, shared, FUTEX_CLASS_NORMAL, &f);
1691 if (error)
1692 goto out;
1693
1694 /* Look up the second futex, if any. */
1695 error = futex_lookup(uaddr2, shared, FUTEX_CLASS_NORMAL, &f2);
1696 if (error)
1697 goto out;
1698
1699 /*
1700 * Under the op locks:
1701 *
1702 * 1. Read/modify/write: *uaddr2 op= oparg.
1703 * 2. Unconditionally wake uaddr.
1704 * 3. Conditionally wake uaddr2, if it previously matched val3.
1705 */
1706 futex_op_lock2(f, f2);
1707 do {
1708 error = futex_load(uaddr2, &oldval);
1709 if (error)
1710 goto out_unlock;
1711 newval = futex_compute_op(oldval, val3);
1712 error = ucas_int(uaddr2, oldval, newval, &actual);
1713 if (error)
1714 goto out_unlock;
1715 } while (actual != oldval);
1716 nwoken = (f ? futex_wake(f, FUTEX_WRITERQ, val,
1717 NULL, FUTEX_WRITERQ, 0,
1718 FUTEX_BITSET_MATCH_ANY) : 0);
1719 if (f2 && futex_compute_cmp(oldval, val3))
1720 nwoken += futex_wake(f2, FUTEX_WRITERQ, val2,
1721 NULL, FUTEX_WRITERQ, 0,
1722 FUTEX_BITSET_MATCH_ANY);
1723
1724 /* Success! */
1725 error = 0;
1726 out_unlock:
1727 futex_op_unlock2(f, f2);
1728
1729 out:
1730 /* Return the number of waiters woken. */
1731 *retval = nwoken;
1732
1733 /* Release the futexes, if we got them. */
1734 if (f2)
1735 futex_rele(f2);
1736 if (f)
1737 futex_rele(f);
1738 return error;
1739 }
1740
1741 /*
1742 * futex_read_waiter_prio(l)
1743 *
1744 * Returns the read-waiter priority for purposes of comparing
1745 * against a write-waiter's priority. Read-waiters are only
1746 * prioritized if they are real-time threads.
1747 */
1748 static inline pri_t
1749 futex_read_waiter_prio(struct lwp * const l)
1750 {
1751 const pri_t pri = lwp_eprio(l);
1752 if (__predict_false(pri >= PRI_USER_RT))
1753 return pri;
1754 return PRI_NONE;
1755 }
1756
1757 #if 0 /* XXX */
1758 /*
1759  * futex_rw_handle_rt_reader(f, uaddr, val, pri_reader)
1760  *
1761  *	Attempt to resolve the case of a real-time thread attempting
1762  *	to acquire a read lock.  Returns nonzero if the attempt is
1763  *	resolved and the wait should be elided.
1764 */
1765 static int __noinline
1766 futex_rw_handle_rt_reader(struct futex *f, int *uaddr, int val,
1767 pri_t pri_reader)
1768 {
1769 struct lwp *l_writer = NULL;
1770 int newval, next;
1771 int error;
1772
1773 KASSERT(mutex_owned(&f->fx_op_lock));
1774
1775 /*
1776 * If the lock is write-locked, there isn't anything we
1777 * can do but wait.
1778 */
1779 if (val & FUTEX_RW_WRITE_LOCKED) {
1780 return 0;
1781 }
1782
1783 /*
1784 * If we're maxed out on readers already, nothing we can do.
1785 */
1786 if ((val & FUTEX_TID_MASK) == FUTEX_TID_MASK) {
1787 		return ENFILE;		/* distinct from EAGAIN */
1788 }
1789
1790 /*
1791 * The assumption then is that we arrived here with WRITE_WANTED
1792 * set. We're not going to bother testing that, however. We
1793 * will preserve it if we see it.
1794 *
1795 	 * Because WRITE_WANTED is set, this will cause everyone to enter
1796 	 * the futex via rw_wait.  And we are holding the op lock, so no
1797 	 * additional waiters will appear on the sleepq.  We are going
1798 * to do the same tricky dance as rw_handoff to let higher-priority
1799 * real-time waiters slip past the gate.
1800 */
1801
1802 /*
1803 * All we want to do here is check if there is a writer waiting.
1804 	 * If there is, and it is of equal or greater priority, this reader
1805 	 * loses; otherwise we will just make note of it to ensure that
1806 	 * the WRITE_WANTED bit is set when we update the futex word.
1807 *
1808 * Since we are not going to actually wake someone from the
1809 * queue here, we're not really interested if the write-waiter
1810 * happens to leave based on a timeout or signal; we know that
1811 * any write-waiter *after* the first one is of equal or lower
1812 * priority, so we would still best it.
1813 */
1814 mutex_spin_enter(f->fx_sq_lock);
1815 l_writer = LIST_FIRST(&f->fx_sqs[FUTEX_WRITERQ]);
1816 	if (__predict_true(l_writer != NULL) &&
1817 	    pri_reader <= lwp_eprio(l_writer)) {
1818 		mutex_spin_exit(f->fx_sq_lock);	/* don't leak the sq lock */
1819 		return 0;
1820 	}
1821 	mutex_spin_exit(f->fx_sq_lock);
1822
1823 for (;; val = next) {
1824 if (__predict_true(l_writer != NULL)) {
1825 newval = (val + 1) | FUTEX_RW_WRITE_WANTED;
1826 } else {
1827 /*
1828 * No writer waiting; increment the reader
1829 * count, preserve any WRITE_WANTED bit.
1830 */
1831 newval = val + 1;
1832 KASSERT(((newval ^ val) & FUTEX_RW_WRITE_WANTED) == 0);
1833 }
1834
1835 error = ucas_int(uaddr, val, newval, &next);
1836 if (__predict_false(error != 0)) {
1837 return error;
1838 }
1839 if (next == val) {
1840 /* Successfully acquired the read lock. */
1841 return EJUSTRETURN;
1842 }
1843 /*
1844 * Repeat the terminal checks from above on the new
1845 * value.
1846 */
1847 if (next & FUTEX_RW_WRITE_LOCKED) {
1848 return 0;
1849 }
1850 if ((next & FUTEX_TID_MASK) == FUTEX_TID_MASK) {
1851 			return ENFILE;	/* distinct from EAGAIN */
1852 }
1853 }
1854 }
1855 #endif /* XXX */
1856
1857 /*
1858 * futex_func_rw_wait(uaddr, val, val3, abstime, clkid, retval)
1859 *
1860 * Implement futex(FUTEX_NETBSD_RW_WAIT).
1861 */
1862 static int
1863 futex_func_rw_wait(bool shared, int *uaddr, int val, int val3,
1864 const struct timespec *abstime, clockid_t clkid, register_t *retval)
1865 {
1866 #if 1
1867 /* XXX NETBSD_RW ops are currently broken XXX */
1868 return ENOSYS;
1869 #else
1870 struct futex *f;
1871 int error;
1872
1873 /* Must specify READER or WRITER. */
1874 if (__predict_false(val3 != FUTEX_RW_READER &&
1875 val3 != FUTEX_RW_WRITER))
1876 return EINVAL;
1877
1878 /* Optimistically test before anything else. */
1879 if (!futex_test(uaddr, val))
1880 return EAGAIN;
1881
1882 /* Get the futex, creating it if necessary. */
1883 error = futex_lookup_create(uaddr, shared, FUTEX_CLASS_RWLOCK, &f);
1884 if (error)
1885 return error;
1886 KASSERT(f);
1887
1888 /*
1889 * Under the op lock, check the value again: if it has
1890 * already changed, EAGAIN; otherwise, enqueue the waiter
1891 * on the specified queue.
1892 */
1893 futex_op_lock(f);
1894 if (!futex_test(uaddr, val)) {
1895 futex_op_unlock(f);
1896 error = EAGAIN;
1897 goto out;
1898 }
1899
1900 /*
1901 * POSIX dictates that a real-time reader will be prioritized
1902 * over writers of lesser priority, when normally we would
1903 * prefer the writers.
1904 */
1905 if (__predict_false(val3 == FUTEX_RW_READER)) {
1906 struct lwp * const l = curlwp;
1907 const pri_t pri_reader = futex_read_waiter_prio(l);
1908 if (__predict_false(pri_reader > PRI_NONE)) {
1909 error = futex_rw_handle_rt_reader(f, uaddr, val,
1910 pri_reader);
1911 if (error) {
1912 if (__predict_true(error == EJUSTRETURN)) {
1913 /* RT reader acquired the lock. */
1914 error = 0;
1915 }
1916 futex_op_unlock(f);
1917 goto out;
1918 }
1919 }
1920 }
1921
1922 /*
1923 	 * Now wait.  futex_wait() will drop our op lock once we
1924 * are entered into the sleep queue, thus ensuring atomicity
1925 * of wakes with respect to waits.
1926 *
1927 * Use a wake selector of 0 so that this waiter won't match on any
1928 * of the other operations in case someone makes that error; only
1929 * rw_handoff is allowed! This is critical because a waiter that
1930 * awakens from an rw_wait assumes it has been given ownership of
1931 * the lock.
1932 */
1933 error = futex_wait(f, val3, abstime, clkid, 0);
1934
1935 /*
1936 * Don't drop our reference here. We won't be requeued, but
1937 	 * it's best to maintain symmetry with other operations.
1938 */
1939 f = NULL;
1940
1941 out: if (__predict_true(error == 0)) {
1942 /* Return 0 on success, error on failure. */
1943 *retval = 0;
1944 }
1945
1946 if (f != NULL)
1947 futex_rele(f);
1948 return error;
1949 #endif
1950 }
1951
1952 /*
1953 * futex_func_rw_handoff(uaddr, val, retval)
1954 *
1955 * Implement futex(FUTEX_NETBSD_RW_HANDOFF).
1956 *
1957 * This implements direct hand-off to either a single writer
1958 * or all readers.
1959 */
1960 static int
1961 futex_func_rw_handoff(bool shared, int *uaddr, int val, register_t *retval)
1962 {
1963 #if 1
1964 /* XXX NETBSD_RW ops are currently broken XXX */
1965 return ENOSYS;
1966 #else
1967 struct lwp *l, *l_next, *l_writer, *l_reader;
1968 struct futex *f;
1969 sleepq_t wsq, *sq;
1970 int error = 0;
1971 int newval, next, nwake, nwoken = 0;
1972 int allreaders __diagused = 0;
1973 unsigned int *nwaitersp;
1974 pri_t pri_writer;
1975
1976 /* Look up the futex, if any. */
1977 error = futex_lookup(uaddr, shared, FUTEX_CLASS_RWLOCK, &f);
1978 if (error)
1979 goto out;
1980
1981 /*
1982 	 * There are a couple of different scenarios where a thread
1983 * releasing a RW lock would call rw_handoff, yet we find no
1984 * waiters:
1985 *
1986 * ==> There were waiters on the queue, but they left the queue
1987 * due to a signal.
1988 * ==> The waiting thread set the waiter bit(s), but decided to
1989 * try spinning before calling rw_wait.
1990 *
1991 * In both of these cases, we will ensure that the lock word
1992 * gets cleared.
1993 */
1994
1995 /* If there's no futex, there are no waiters to wake. */
1996 if (__predict_false(f == NULL)) {
1997 /*
1998 * If there are no waiters, ensure that the lock
1999 * word is cleared.
2000 */
2001 error = ucas_int(uaddr, val, 0, &next);
2002 if (__predict_true(error == 0)) {
2003 if (__predict_false(val != next))
2004 error = EAGAIN;
2005 }
2006 goto out;
2007 }
2008
2009 /*
2010 * Compute the new value and store it in the futex word.
2011 *
2012 * This is a little tricky because the ucas could cause
2013 * a page fault, and we can't let that happen while holding
2014 * the sleepq locks. But we have to make sure that the choices
2015 * we make regarding which threads to wake are accurately
2016 * reflected in the futex word and that the futex word is
2017 * updated before the LWPs can run.
2018 *
2019 * This is easy enough ... we can transfer the LWPs to a private
2020 * sleepq so that changes made to the original sleepq while we
2021 * have it unlocked cannot affect our results, but we must also
2022 * consider that LWPs might be using timed-wait, so we have to
2023 * make sure that won't wake the LWP up out from under us if the
2024 * timer fires. To do this, we have to set the "wchan" to NULL
2025 * and use a dummy syncobj that takes no action on "unsleep".
2026 *
2027 * We thus perform the hand-off in three steps, all with the op
2028 * lock held:
2029 *
2030 * ==> Wake selection: sleepq locked, select LWPs to wake,
2031 * compute the new futex word. LWPs are transferred from the
2032 * futex sleepq to the private sleepq and further sedated.
2033 *
2034 * ==> Futex word update: sleepq unlocked, use a loop around ucas
2035 * to update the futex word. There are no scenarios where
2036 * user space code can update the futex in a way that would
2037 * impact the work we do here; because we've been asked to do
2038 * the hand-off, any new LWPs attempting to acquire the lock
2039 * would be entering rw_wait by definition (either because
2040 * they're read-lockers and the lock is write-wanted, or they're
2041 * write-lockers and the lock is read-held). Those new rw_wait
2042 * LWPs will serialize against the op lock. We DO, however,
2043 * need to preserve any newly-set WANTED bits, hence the ucas
2044 * loop. Those newly-set WANTED bits, however, will not impact
2045 * our decisions. Those LWPs have simply lost the race to
2046 * acquire the lock, and we don't consult those bits anyway;
2047 * we instead use the contents of the sleepqs as the truth
2048 * about who is waiting, and no new waiters will appear on
2049 * the sleepqs while we hold the op lock.
2050 *
2051 * ==> Wake waiters: sleepq locked, run down our private sleepq
2052 * and actually awaken all of the LWPs we selected earlier.
2053 *
2054 * If, for some reason, the ucas fails because the page backing it
2055 * was unmapped, all bets are off. We still awaken the waiters.
2056 * This is either a malicious attempt to screw things up, or a
2057 * programming error, and we don't care about either one.
2058 */
2059 sleepq_init(&wsq);
2060
2061 futex_op_lock(f);
2062 mutex_spin_enter(f->fx_sq_lock);
2063
2064 /*
2065 * STEP 1
2066 */
2067
2068 /*
2069 * POSIX dictates that any real-time waiters will acquire the
2070 * lock in priority order. This implies that a real-time
2071 * read-waiter has priority over a non-real-time write-waiter,
2072 * where we would otherwise prioritize waking the write-waiter.
2073 */
2074 l_writer = LIST_FIRST(&f->fx_sqs[FUTEX_WRITERQ]);
2075 l_reader = LIST_FIRST(&f->fx_sqs[FUTEX_READERQ]);
2076 if (__predict_true(l_writer == NULL)) {
2077 /* We will wake all the readers. */
2078 sq = &f->fx_sqs[FUTEX_READERQ];
2079 nwaitersp = &f->fx_nwaiters[FUTEX_READERQ];
2080 nwake = allreaders = f->fx_nwaiters[FUTEX_READERQ];
2081 KASSERT(nwake >= 0 && nwake <= FUTEX_TID_MASK);
2082 if (__predict_false(nwake == 0)) {
2083 KASSERT(l_reader == NULL);
2084 newval = 0;
2085 } else {
2086 KASSERT(l_reader != NULL);
2087 newval = nwake;
2088 }
2089 l = l_reader;
2090 } else if (__predict_false(l_reader != NULL &&
2091 futex_read_waiter_prio(l_reader) >
2092 (pri_writer = lwp_eprio(l_writer)))) {
2093 /*
2094 * Count the number of real-time read-waiters that
2095 * exceed the write-waiter's priority. We will
2096 * wake that many (we already know it's at least one).
2097 */
2098 sq = &f->fx_sqs[FUTEX_READERQ];
2099 nwaitersp = &f->fx_nwaiters[FUTEX_READERQ];
2100 for (nwake = 1, l = LIST_NEXT(l_reader, l_sleepchain);
2101 l != NULL && futex_read_waiter_prio(l) > pri_writer;
2102 l = LIST_NEXT(l, l_sleepchain)) {
2103 nwake++;
2104 }
2105 KASSERT(nwake >= 0 && nwake <= FUTEX_TID_MASK);
2106 /* We know there is at least one write-waiter. */
2107 newval = nwake | FUTEX_WAITERS | FUTEX_RW_WRITE_WANTED;
2108 l = l_reader;
2109 } else {
2110 /* We will wake one writer. */
2111 sq = &f->fx_sqs[FUTEX_WRITERQ];
2112 nwaitersp = &f->fx_nwaiters[FUTEX_WRITERQ];
2113 nwake = 1;
2114 newval = FUTEX_RW_WRITE_LOCKED | l_writer->l_lid;
2115 if (__predict_false(f->fx_nwaiters[FUTEX_WRITERQ] > 1)) {
2116 KASSERT(LIST_NEXT(l_writer, l_sleepchain) != NULL);
2117 newval |= FUTEX_WAITERS | FUTEX_RW_WRITE_WANTED;
2118 } else if (__predict_true(f->fx_nwaiters[FUTEX_READERQ] != 0)) {
2119 KASSERT(!LIST_EMPTY(&f->fx_sqs[FUTEX_READERQ]));
2120 newval |= FUTEX_WAITERS;
2121 } else {
2122 KASSERT(LIST_EMPTY(&f->fx_sqs[FUTEX_READERQ]));
2123 }
2124 l = l_writer;
2125 }
2126
2127 KASSERT(sq == &f->fx_sqs[FUTEX_READERQ] ||
2128 sq == &f->fx_sqs[FUTEX_WRITERQ]);
2129 while (nwake != 0 && l != NULL) {
2130 l_next = LIST_NEXT(l, l_sleepchain);
2131 KASSERT(l->l_syncobj == &futex_syncobj);
2132 KASSERT(l->l_wchan == (wchan_t)f);
2133 KASSERT(l->l_futex == f);
2134 KASSERT(l->l_sleepq == sq);
2135 KASSERT(l->l_mutex == f->fx_sq_lock);
2136
2137 /*
2138 * NULL wchan --> timeout will not wake LWP.
2139 * NULL lock --> keep existing lock.
2140 */
2141 sleepq_transfer(l, sq, &wsq, NULL/*wchan*/, futex_wmesg,
2142 &sched_syncobj, NULL/*lock*/, false);
2143
2144 KASSERT(*nwaitersp > 0);
2145 (*nwaitersp)--;
2146 nwoken++;
2147 nwake--;
2148 /* hold count adjusted below */
2149 l = l_next;
2150 }
2151
2152 mutex_spin_exit(f->fx_sq_lock);
2153
2154 /*
2155 * STEP 2
2156 */
2157 for (;; val = next) {
2158 error = ucas_int(uaddr, val, newval, &next);
2159 if (__predict_false(error != 0)) {
2160 /*
2161 * The futex word has become unmapped. All bets
2162 * are off. Break out of the loop and awaken the
2163 * waiters; this is easier than trying to stuff
2164 * them back into their previous sleepq, and the
2165 * application is screwed anyway.
2166 */
2167 break;
2168 }
2169 if (__predict_true(next == val)) {
2170 /*
2171 * Successfully updated the futex word!
2172 */
2173 break;
2174 }
2175 /*
2176 * The only thing that could have possibly happened
2177 * (barring some bug in the thread library) is that
2178 * additional waiter bits arrived. Those new waiters
2179 * have lost the race to acquire the lock, but we
2180 * need to preserve those bits.
2181 */
2182 newval |= next & (FUTEX_WAITERS | FUTEX_RW_WRITE_WANTED);
2183 }
2184
2185 mutex_spin_enter(f->fx_sq_lock);
2186
2187 /*
2188 * STEP 3
2189 */
2190
2191 LIST_FOREACH_SAFE(l, &wsq, l_sleepchain, l_next) {
2192 KASSERT(l->l_syncobj == &sched_syncobj);
2193 KASSERT(l->l_wchan == NULL);
2194 KASSERT(l->l_futex == f);
2195 KASSERT(l->l_sleepq == &wsq);
2196 KASSERT(l->l_mutex == f->fx_sq_lock);
2197
2198 l->l_futex_wakesel = 0;
2199 l->l_futex = NULL;
2200 sleepq_remove(&wsq, l);
2201 }
2202 /* If we selected all readers, assert that we woke them all. */
2203 KASSERT(allreaders == 0 || allreaders == nwoken);
2204 KASSERT(allreaders == 0 || LIST_EMPTY(&f->fx_sqs[FUTEX_READERQ]));
2205 KASSERT(allreaders == 0 || f->fx_nwaiters[FUTEX_READERQ] == 0);
2206
2207 mutex_spin_exit(f->fx_sq_lock);
2208
2209 /* Adjust hold count. */
2210 futex_rele_count_not_last(f, nwoken);
2211
2212 futex_op_unlock(f);
2213
2214 /* Release the futex. */
2215 futex_rele(f);
2216
2217 out: if (__predict_true(error == 0)) {
2218 /* Return the number of waiters woken. */
2219 *retval = nwoken;
2220 }
2221
2222 /* Return 0 (with the wake count in *retval above) or the error. */
2223 return error;
2224 #endif
2225 }
2226
2227 /*
2228 * do_futex(uaddr, op, val, timeout, uaddr2, val2, val3)
2229 *
2230 * Implement the futex system call with all the parameters
2231 * parsed out.
2232 */
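/*
 * For example (purely illustrative), an op word of
 *
 *	FUTEX_WAIT_BITSET | FUTEX_PRIVATE_FLAG | FUTEX_CLOCK_REALTIME
 *
 * selects a process-private futex (shared == false), an absolute
 * CLOCK_REALTIME timeout, and the FUTEX_WAIT_BITSET command once
 * the flag bits are masked off below.
 */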
2233 int
2234 do_futex(int *uaddr, int op, int val, const struct timespec *timeout,
2235 int *uaddr2, int val2, int val3, register_t *retval)
2236 {
2237 const bool shared = (op & FUTEX_PRIVATE_FLAG) ? false : true;
2238 const clockid_t clkid = (op & FUTEX_CLOCK_REALTIME) ? CLOCK_REALTIME
2239 : CLOCK_MONOTONIC;
2240 int error;
2241
2242 op &= FUTEX_CMD_MASK;
2243
2244 switch (op) {
2245 case FUTEX_WAIT:
2246 SDT_PROBE6(futex, func, wait, entry,
2247 uaddr, val, FUTEX_BITSET_MATCH_ANY, timeout,
2248 TIMER_RELTIME, op & ~FUTEX_CMD_MASK);
2249 error = futex_func_wait(shared, uaddr, val,
2250 FUTEX_BITSET_MATCH_ANY, timeout, clkid, TIMER_RELTIME,
2251 retval);
2252 SDT_PROBE1(futex, func, wait, exit, error);
2253 break;
2254
2255 case FUTEX_WAIT_BITSET:
2256 SDT_PROBE6(futex, func, wait, entry,
2257 uaddr, val, val3, timeout,
2258 TIMER_ABSTIME, op & ~FUTEX_CMD_MASK);
2259 error = futex_func_wait(shared, uaddr, val, val3, timeout,
2260 clkid, TIMER_ABSTIME, retval);
2261 SDT_PROBE1(futex, func, wait, exit, error);
2262 break;
2263
2264 case FUTEX_WAKE:
2265 SDT_PROBE4(futex, func, wake, entry,
2266 uaddr, val, FUTEX_BITSET_MATCH_ANY, op & ~FUTEX_CMD_MASK);
2267 error = futex_func_wake(shared, uaddr, val,
2268 FUTEX_BITSET_MATCH_ANY, retval);
2269 SDT_PROBE2(futex, func, wake, exit, error, *retval);
2270 break;
2271
2272 case FUTEX_WAKE_BITSET:
2273 SDT_PROBE4(futex, func, wake, entry,
2274 uaddr, val, val3, op & ~FUTEX_CMD_MASK);
2275 error = futex_func_wake(shared, uaddr, val, val3, retval);
2276 SDT_PROBE2(futex, func, wake, exit, error, *retval);
2277 break;
2278
2279 case FUTEX_REQUEUE:
2280 SDT_PROBE5(futex, func, requeue, entry,
2281 uaddr, val, uaddr2, val2, op & ~FUTEX_CMD_MASK);
2282 error = futex_func_requeue(shared, op, uaddr, val, uaddr2,
2283 val2, 0, retval);
2284 SDT_PROBE2(futex, func, requeue, exit, error, *retval);
2285 break;
2286
2287 case FUTEX_CMP_REQUEUE:
2288 SDT_PROBE6(futex, func, cmp_requeue, entry,
2289 uaddr, val, uaddr2, val2, val3, op & ~FUTEX_CMD_MASK);
2290 error = futex_func_requeue(shared, op, uaddr, val, uaddr2,
2291 val2, val3, retval);
2292 SDT_PROBE2(futex, func, cmp_requeue, exit, error, *retval);
2293 break;
2294
2295 case FUTEX_WAKE_OP:
2296 SDT_PROBE6(futex, func, wake_op, entry,
2297 uaddr, val, uaddr2, val2, val3, op & ~FUTEX_CMD_MASK);
2298 error = futex_func_wake_op(shared, uaddr, val, uaddr2, val2,
2299 val3, retval);
2300 SDT_PROBE2(futex, func, wake_op, exit, error, *retval);
2301 break;
2302
2303 case FUTEX_NETBSD_RW_WAIT:
2304 SDT_PROBE5(futex, func, rw_wait, entry,
2305 uaddr, val, val3, timeout, op & ~FUTEX_CMD_MASK);
2306 error = futex_func_rw_wait(shared, uaddr, val, val3,
2307 timeout, clkid, retval);
2308 SDT_PROBE1(futex, func, rw_wait, exit, error);
2309 break;
2310
2311 case FUTEX_NETBSD_RW_HANDOFF:
2312 SDT_PROBE3(futex, func, rw_handoff, entry,
2313 uaddr, val, op & ~FUTEX_CMD_MASK);
2314 error = futex_func_rw_handoff(shared, uaddr, val, retval);
2315 SDT_PROBE2(futex, func, rw_handoff, exit, error, *retval);
2316 break;
2317
2318 case FUTEX_FD:
2319 default:
2320 error = ENOSYS;
2321 break;
2322 }
2323
2324 return error;
2325 }
2326
2327 /*
2328 * sys___futex(l, uap, retval)
2329 *
2330 * __futex(2) system call: generic futex operations.
2331 */
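/*
 * A minimal, hypothetical userland invocation (assuming the libc
 * stub passes its arguments straight through), waiting on "word"
 * with a one-second relative timeout:
 *
 *	struct timespec ts = { .tv_sec = 1, .tv_nsec = 0 };
 *	__futex(&word, FUTEX_WAIT | FUTEX_PRIVATE_FLAG, expected,
 *	    &ts, NULL, 0, 0);
 */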
2332 int
2333 sys___futex(struct lwp *l, const struct sys___futex_args *uap,
2334 register_t *retval)
2335 {
2336 /* {
2337 syscallarg(int *) uaddr;
2338 syscallarg(int) op;
2339 syscallarg(int) val;
2340 syscallarg(const struct timespec *) timeout;
2341 syscallarg(int *) uaddr2;
2342 syscallarg(int) val2;
2343 syscallarg(int) val3;
2344 } */
2345 struct timespec ts, *tsp;
2346 int error;
2347
2348 /*
2349 * Copy in the timeout argument, if specified.
2350 */
2351 if (SCARG(uap, timeout)) {
2352 error = copyin(SCARG(uap, timeout), &ts, sizeof(ts));
2353 if (error)
2354 return error;
2355 tsp = &ts;
2356 } else {
2357 tsp = NULL;
2358 }
2359
2360 return do_futex(SCARG(uap, uaddr), SCARG(uap, op), SCARG(uap, val),
2361 tsp, SCARG(uap, uaddr2), SCARG(uap, val2), SCARG(uap, val3),
2362 retval);
2363 }
2364
2365 /*
2366 * sys___futex_set_robust_list(l, uap, retval)
2367 *
2368 * __futex_set_robust_list(2) system call for robust futexes.
2369 */
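/*
 * A minimal sketch of the expected userland setup; the structure and
 * member names (and their order) are assumptions for illustration,
 * while the length and alignment requirements are the ones checked
 * below:
 *
 *	static struct {
 *		void	*list;		// circular list; points back at
 *					// this head when empty
 *		long	 offset;	// byte offset from a list entry
 *					// to its futex word
 *		void	*pending;	// entry being acquired/released
 *	} head = { &head, 0, NULL };
 *
 *	__futex_set_robust_list(&head, sizeof(head));
 */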
2370 int
2371 sys___futex_set_robust_list(struct lwp *l,
2372 const struct sys___futex_set_robust_list_args *uap, register_t *retval)
2373 {
2374 /* {
2375 syscallarg(void *) head;
2376 syscallarg(size_t) len;
2377 } */
2378 void *head = SCARG(uap, head);
2379
2380 if (SCARG(uap, len) != _FUTEX_ROBUST_HEAD_SIZE)
2381 return EINVAL;
2382 if ((uintptr_t)head % sizeof(u_long))
2383 return EINVAL;
2384
2385 l->l_robust_head = (uintptr_t)head;
2386
2387 return 0;
2388 }
2389
2390 /*
2391 * sys___futex_get_robust_list(l, uap, retval)
2392 *
2393 * __futex_get_robust_list(2) system call for robust futexes.
2394 */
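/*
 * Hypothetical usage, querying the calling LWP's own registration
 * (an lwpid of 0 means "this LWP", as handled by
 * futex_robust_head_lookup() below):
 *
 *	void *head;
 *	size_t len;
 *	__futex_get_robust_list(0, &head, &len);
 */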
2395 int
2396 sys___futex_get_robust_list(struct lwp *l,
2397 const struct sys___futex_get_robust_list_args *uap, register_t *retval)
2398 {
2399 /* {
2400 syscallarg(lwpid_t) lwpid;
2401 syscallarg(void **) headp;
2402 syscallarg(size_t *) lenp;
2403 } */
2404 void *head;
2405 const size_t len = _FUTEX_ROBUST_HEAD_SIZE;
2406 int error;
2407
2408 error = futex_robust_head_lookup(l, SCARG(uap, lwpid), &head);
2409 if (error)
2410 return error;
2411
2412 /* Copy out the head pointer and the head structure length. */
2413 error = copyout(&head, SCARG(uap, headp), sizeof(head));
2414 if (__predict_true(error == 0)) {
2415 error = copyout(&len, SCARG(uap, lenp), sizeof(len));
2416 }
2417
2418 return error;
2419 }
2420
2421 /*
2422 * release_futex(uva, tid)
2423 *
2424 * Try to release the robust futex at uva in the current process
2425 * on lwp exit. If anything goes wrong, silently fail. It is the
2426 * userland program's obligation to arrange correct behaviour.
2427 */
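/*
 * For example (illustrative values only): a lock word left behind as
 * (tid | FUTEX_WAITERS) by an exiting owner is rewritten to
 * (tid | FUTEX_WAITERS | FUTEX_OWNER_DIED), and one waiter is woken
 * so that it can observe the owner-died bit and recover the lock.
 */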
2428 static void
2429 release_futex(uintptr_t const uptr, lwpid_t const tid, bool const is_pi,
2430 bool const is_pending)
2431 {
2432 int *uaddr;
2433 struct futex *f;
2434 int oldval, newval, actual;
2435 int error;
2436 bool shared;
2437
2438 /* If it's misaligned, tough. */
2439 if (__predict_false(uptr & 3))
2440 return;
2441 uaddr = (int *)uptr;
2442
2443 error = futex_load(uaddr, &oldval);
2444 if (__predict_false(error))
2445 return;
2446
2447 /*
2448 * There are two race conditions we need to handle here:
2449 *
2450 * 1. User space cleared the futex word but died before
2451 * being able to issue the wakeup. No wakeups will
2452 * ever be issued, oops!
2453 *
2454 * 2. Awakened waiter died before being able to acquire
2455 * the futex in user space. Any other waiters are
2456 * now stuck, oops!
2457 *
2458 * In both of these cases, the futex word will be 0 (because
2459 * it's updated before the wake is issued). The best we can
2460 * do is detect this situation if it's the pending futex and
2461 * issue a wake without modifying the futex word.
2462 *
2463 * XXX eventual PI handling?
2464 */
2465 if (__predict_false(is_pending && (oldval & ~FUTEX_WAITERS) == 0)) {
2466 register_t retval;
2467 error = futex_func_wake(/*shared*/true, uaddr, 1,
2468 FUTEX_BITSET_MATCH_ANY, &retval);
2469 if (error != 0 || retval == 0)
2470 (void) futex_func_wake(/*shared*/false, uaddr, 1,
2471 FUTEX_BITSET_MATCH_ANY, &retval);
2472 return;
2473 }
2474
2475 /* Optimistically test whether we need to do anything at all. */
2476 if ((oldval & FUTEX_TID_MASK) != tid)
2477 return;
2478
2479 /*
2480 * We need to handle the case where this thread owned the futex,
2481 * but it was uncontended. In this case, there won't be any
2482 * kernel state to look up. All we can do is mark the futex
2483 * as a zombie to be mopped up the next time another thread
2484 * attempts to acquire it.
2485 *
2486 * N.B. It's important to ensure to set FUTEX_OWNER_DIED in
2487 * this loop, even if waiters appear while we are doing
2488 * so. This is because FUTEX_WAITERS is set by user space
2489 * before calling __futex() to wait, and the futex needs
2490 * to be marked as a zombie when the new waiter gets into
2491 * the kernel.
2492 */
2493 if ((oldval & FUTEX_WAITERS) == 0) {
2494 do {
2495 error = futex_load(uaddr, &oldval);
2496 if (error)
2497 return;
2498 if ((oldval & FUTEX_TID_MASK) != tid)
2499 return;
2500 newval = oldval | FUTEX_OWNER_DIED;
2501 error = ucas_int(uaddr, oldval, newval, &actual);
2502 if (error)
2503 return;
2504 } while (actual != oldval);
2505
2506 /*
2507 * If there is still no indication of waiters, then there is
2508 * no more work for us to do.
2509 */
2510 if ((oldval & FUTEX_WAITERS) == 0)
2511 return;
2512 }
2513
2514 /*
2515 * Look for a futex. Try shared first, then private. If we
2516 * can't find one, tough.
2517 *
2518 * Note: the assumption here is that anyone placing a futex on
2519 * the robust list is adhering to the rules, regardless of the
2520 * futex class.
2521 */
2522 for (f = NULL, shared = true; f == NULL; shared = false) {
2523 error = futex_lookup(uaddr, shared, FUTEX_CLASS_ANY, &f);
2524 if (error)
2525 return;
2526 if (f == NULL && shared == false)
2527 return;
2528 }
2529
2530 /* Work under the futex op lock. */
2531 futex_op_lock(f);
2532
2533 /*
2534 * Fetch the word: if the tid doesn't match ours, skip;
2535 * otherwise, set the owner-died bit, atomically.
2536 */
2537 do {
2538 error = futex_load(uaddr, &oldval);
2539 if (error)
2540 goto out;
2541 if ((oldval & FUTEX_TID_MASK) != tid)
2542 goto out;
2543 newval = oldval | FUTEX_OWNER_DIED;
2544 error = ucas_int(uaddr, oldval, newval, &actual);
2545 if (error)
2546 goto out;
2547 } while (actual != oldval);
2548
2549 /*
2550 * If there may be waiters, try to wake one. If anything goes
2551 * wrong, tough.
2552 *
2553 * XXX eventual PI handling?
2554 */
2555 if (oldval & FUTEX_WAITERS) {
2556 (void)futex_wake(f, FUTEX_WRITERQ, 1,
2557 NULL, FUTEX_WRITERQ, 0,
2558 FUTEX_BITSET_MATCH_ANY);
2559 }
2560
2561 /* Unlock the queue and release the futex. */
2562 out: futex_op_unlock(f);
2563 futex_rele(f);
2564 }
2565
2566 /*
2567 * futex_robust_head_lookup(l, lwpid)
2568 *
2569 * Helper function to look up a robust head by LWP ID.
2570 */
2571 int
2572 futex_robust_head_lookup(struct lwp *l, lwpid_t lwpid, void **headp)
2573 {
2574 struct proc *p = l->l_proc;
2575
2576 /* Find the other lwp, if requested; otherwise use our robust head. */
2577 if (lwpid) {
2578 mutex_enter(p->p_lock);
2579 l = lwp_find(p, lwpid);
2580 if (l == NULL) {
2581 mutex_exit(p->p_lock);
2582 return ESRCH;
2583 }
2584 *headp = (void *)l->l_robust_head;
2585 mutex_exit(p->p_lock);
2586 } else {
2587 *headp = (void *)l->l_robust_head;
2588 }
2589 return 0;
2590 }
2591
2592 /*
2593 * futex_fetch_robust_head(uaddr)
2594 *
2595 * Helper routine to fetch the futex robust list head that
2596 * handles 32-bit binaries running on 64-bit kernels.
2597 */
2598 static int
2599 futex_fetch_robust_head(uintptr_t uaddr, u_long *rhead)
2600 {
2601 #ifdef _LP64
2602 if (curproc->p_flag & PK_32) {
2603 uint32_t rhead32[_FUTEX_ROBUST_HEAD_NWORDS];
2604 int error;
2605
2606 error = copyin((void *)uaddr, rhead32, sizeof(rhead32));
2607 if (__predict_true(error == 0)) {
2608 for (int i = 0; i < _FUTEX_ROBUST_HEAD_NWORDS; i++) {
2609 if (i == _FUTEX_ROBUST_HEAD_OFFSET) {
2610 /*
2611 * Make sure the offset is sign-
2612 * extended.
2613 */
2614 rhead[i] = (int32_t)rhead32[i];
2615 } else {
2616 rhead[i] = rhead32[i];
2617 }
2618 }
2619 }
2620 return error;
2621 }
2622 #endif /* _LP64 */
2623
2624 return copyin((void *)uaddr, rhead,
2625 sizeof(*rhead) * _FUTEX_ROBUST_HEAD_NWORDS);
2626 }
2627
2628 /*
2629 * futex_decode_robust_word(word)
2630 *
2631 * Decode a robust futex list word into the entry and entry
2632 * properties.
2633 */
2634 static inline void
2635 futex_decode_robust_word(uintptr_t const word, uintptr_t * const entry,
2636 bool * const is_pi)
2637 {
2638 *is_pi = (word & _FUTEX_ROBUST_ENTRY_PI) ? true : false;
2639 *entry = word & ~_FUTEX_ROBUST_ENTRY_PI;
2640 }
2641
2642 /*
2643 * futex_fetch_robust_entry(uaddr)
2644 *
2645 * Helper routine to fetch and decode a robust futex entry
2646 * that handles 32-bit binaries running on 64-bit kernels.
2647 */
2648 static int
2649 futex_fetch_robust_entry(uintptr_t const uaddr, uintptr_t * const valp,
2650 bool * const is_pi)
2651 {
2652 uintptr_t val = 0;
2653 int error = 0;
2654
2655 #ifdef _LP64
2656 if (curproc->p_flag & PK_32) {
2657 uint32_t val32;
2658
2659 error = ufetch_32((uint32_t *)uaddr, &val32);
2660 if (__predict_true(error == 0))
2661 val = val32;
2662 } else
2663 #endif /* _LP64 */
2664 error = ufetch_long((u_long *)uaddr, (u_long *)&val);
2665 if (__predict_false(error))
2666 return error;
2667
2668 futex_decode_robust_word(val, valp, is_pi);
2669 return 0;
2670 }
2671
2672 /*
2673 * futex_release_all_lwp(l, tid)
2674 *
2675 * Release all l's robust futexes. If anything looks funny in
2676 * the process, give up -- it's userland's responsibility to dot
2677 * the i's and cross the t's.
2678 */
2679 void
2680 futex_release_all_lwp(struct lwp * const l, lwpid_t const tid)
2681 {
2682 u_long rhead[_FUTEX_ROBUST_HEAD_NWORDS];
2683 int limit = 1000000;
2684 int error;
2685
2686 /* If there's no robust list there's nothing to do. */
2687 if (l->l_robust_head == 0)
2688 return;
2689
2690 /* Read the final snapshot of the robust list head. */
2691 error = futex_fetch_robust_head(l->l_robust_head, rhead);
2692 if (error) {
2693 printf("WARNING: pid %jd (%s) lwp %jd tid %jd:"
2694 " unmapped robust futex list head\n",
2695 (uintmax_t)l->l_proc->p_pid, l->l_proc->p_comm,
2696 (uintmax_t)l->l_lid, (uintmax_t)tid);
2697 return;
2698 }
2699
2700 const long offset = (long)rhead[_FUTEX_ROBUST_HEAD_OFFSET];
2701
2702 uintptr_t next, pending;
2703 bool is_pi, pending_is_pi;
2704
2705 futex_decode_robust_word(rhead[_FUTEX_ROBUST_HEAD_LIST],
2706 &next, &is_pi);
2707 futex_decode_robust_word(rhead[_FUTEX_ROBUST_HEAD_PENDING],
2708 &pending, &pending_is_pi);
2709
2710 /*
2711 * Walk down the list of locked futexes and release them, up
2712 * to one million of them before we give up.
2713 */
2714
2715 while (next != l->l_robust_head && limit-- > 0) {
2716 /* pending handled below. */
2717 if (next != pending)
2718 release_futex(next + offset, tid, is_pi, false);
2719 error = futex_fetch_robust_entry(next, &next, &is_pi);
2720 if (error)
2721 break;
2722 preempt_point();
2723 }
2724 if (limit <= 0) {
2725 printf("WARNING: pid %jd (%s) lwp %jd tid %jd:"
2726 " exhausted robust futex limit\n",
2727 (uintmax_t)l->l_proc->p_pid, l->l_proc->p_comm,
2728 (uintmax_t)l->l_lid, (uintmax_t)tid);
2729 }
2730
2731 /* If there's a pending futex, it may need to be released too. */
2732 if (pending != 0) {
2733 release_futex(pending + offset, tid, pending_is_pi, true);
2734 }
2735 }
2736