1 /* $NetBSD: sys_futex.c,v 1.26 2025/03/05 14:01:55 riastradh Exp $ */ 2 3 /*- 4 * Copyright (c) 2018, 2019, 2020 The NetBSD Foundation, Inc. 5 * All rights reserved. 6 * 7 * This code is derived from software contributed to The NetBSD Foundation 8 * by Taylor R. Campbell and Jason R. Thorpe. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 29 * POSSIBILITY OF SUCH DAMAGE. 30 */ 31 32 #include <sys/cdefs.h> 33 __KERNEL_RCSID(0, "$NetBSD: sys_futex.c,v 1.26 2025/03/05 14:01:55 riastradh Exp $"); 34 35 /* 36 * Futexes 37 * 38 * The futex system call coordinates notifying threads waiting for 39 * changes on a 32-bit word of memory. The word can be managed by 40 * CPU atomic operations in userland, without system calls, as long 41 * as there is no contention. 42 * 43 * The simplest use case demonstrating the utility is: 44 * 45 * // 32-bit word of memory shared among threads or 46 * // processes in userland. lock & 1 means owned; 47 * // lock & 2 means there are waiters waiting. 48 * volatile int lock = 0; 49 * 50 * int v; 51 * 52 * // Acquire a lock. 53 * do { 54 * v = lock; 55 * if (v & 1) { 56 * // Lock is held. Set a bit to say that 57 * // there are waiters, and wait for lock 58 * // to change to anything other than v; 59 * // then retry. 60 * if (atomic_cas_uint(&lock, v, v | 2) != v) 61 * continue; 62 * futex(&lock, FUTEX_WAIT, v | 2, NULL, NULL, 0, 63 * 0); 64 * continue; 65 * } 66 * } while (atomic_cas_uint(&lock, v, v | 1) != v); 67 * membar_acquire(); 68 * 69 * ... 70 * 71 * // Release the lock. Optimistically assume there are 72 * // no waiters first until demonstrated otherwise. 73 * membar_release(); 74 * if (atomic_cas_uint(&lock, 1, 0) != 1) { 75 * // There may be waiters. 76 * v = atomic_swap_uint(&lock, 0); 77 * // If there are still waiters, wake one. 78 * if (v & 2) 79 * futex(&lock, FUTEX_WAKE, 1, NULL, NULL, 0, 0); 80 * } 81 * 82 * The goal is to avoid the futex system call unless there is 83 * contention; then if there is contention, to guarantee no missed 84 * wakeups. 85 * 86 * For a simple implementation, futex(FUTEX_WAIT) could queue 87 * itself to be woken, double-check the lock word, and then sleep; 88 * spurious wakeups are generally a fact of life, so any 89 * FUTEX_WAKE could just wake every FUTEX_WAIT in the system. 
90 * 91 * If this were all there is to it, we could then increase 92 * parallelism by refining the approximation: partition the 93 * waiters into buckets by hashing the lock addresses to reduce 94 * the incidence of spurious wakeups. But this is not all. 95 * 96 * The futex(&lock, FUTEX_CMP_REQUEUE, n, timeout, &lock2, m, val) 97 * operation not only wakes n waiters on lock if lock == val, but 98 * also _transfers_ m additional waiters to lock2. Unless wakeups 99 * on lock2 also trigger wakeups on lock, we cannot move waiters 100 * to lock2 if they merely share the same hash as waiters on lock. 101 * Thus, we can't approximately distribute waiters into queues by 102 * a hash function; we must distinguish futex queues exactly by 103 * lock address. 104 * 105 * For now, we use a global red/black tree to index futexes. This 106 * should be replaced by a lockless radix tree with a thread to 107 * free entries no longer in use once all lookups on all CPUs have 108 * completed. 109 * 110 * Specifically, we maintain two maps: 111 * 112 * futex_tab.va[vmspace, va] for private futexes 113 * futex_tab.oa[uvm_voaddr] for shared futexes 114 * 115 * This implementation does not support priority inheritance. 116 */ 117 118 #include <sys/param.h> 119 #include <sys/types.h> 120 #include <sys/atomic.h> 121 #include <sys/condvar.h> 122 #include <sys/futex.h> 123 #include <sys/mutex.h> 124 #include <sys/rbtree.h> 125 #include <sys/queue.h> 126 127 #include <sys/syscall.h> 128 #include <sys/syscallargs.h> 129 #include <sys/syscallvar.h> 130 131 #include <uvm/uvm_extern.h> 132 133 /* 134 * Lock order: 135 * 136 * futex_tab.lock 137 * futex::fx_qlock ordered by kva of struct futex 138 * -> futex_wait::fw_lock only one at a time 139 * futex_wait::fw_lock only one at a time 140 * -> futex::fx_abortlock only one at a time 141 */ 142 143 /* 144 * union futex_key 145 * 146 * A futex is addressed either by a vmspace+va (private) or by 147 * a uvm_voaddr (shared). 148 */ 149 union futex_key { 150 struct { 151 struct vmspace *vmspace; 152 vaddr_t va; 153 } fk_private; 154 struct uvm_voaddr fk_shared; 155 }; 156 157 /* 158 * struct futex 159 * 160 * Kernel state for a futex located at a particular address in a 161 * particular virtual address space. 162 * 163 * N.B. fx_refcnt is an unsigned long because we need to be able 164 * to operate on it atomically on all systems while at the same 165 * time rendering practically impossible the chance of it reaching 166 * its max value. In practice, we're limited by the number of LWPs 167 * that can be present on the system at any given time, and the 168 * assumption is that limit will be good enough on a 32-bit platform. 169 * See futex_wake() for why overflow needs to be avoided. 170 */ 171 struct futex { 172 union futex_key fx_key; 173 unsigned long fx_refcnt; 174 bool fx_shared; 175 bool fx_on_tree; 176 struct rb_node fx_node; 177 178 kmutex_t fx_qlock; 179 TAILQ_HEAD(, futex_wait) fx_queue; 180 181 kmutex_t fx_abortlock; 182 LIST_HEAD(, futex_wait) fx_abortlist; 183 kcondvar_t fx_abortcv; 184 }; 185 186 /* 187 * struct futex_wait 188 * 189 * State for a thread to wait on a futex. Threads wait on fw_cv 190 * for fw_bitset to be set to zero. The thread may transition to 191 * a different futex queue at any time under the futex's lock. 
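 *
 *	(Illustrative note, not from the original comment: fw_bitset
 *	records which wake bits this waiter cares about.  A waker only
 *	considers the waiter when the two bit sets intersect, e.g. a
 *	waiter with fw_bitset == 0x1 is woken by FUTEX_WAKE_BITSET with
 *	bitset 0x3 but not with bitset 0x2; plain FUTEX_WAKE uses
 *	FUTEX_BITSET_MATCH_ANY and so matches every waiter.)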
192 */ 193 struct futex_wait { 194 kmutex_t fw_lock; 195 kcondvar_t fw_cv; 196 struct futex *fw_futex; 197 TAILQ_ENTRY(futex_wait) fw_entry; /* queue lock */ 198 LIST_ENTRY(futex_wait) fw_abort; /* queue abortlock */ 199 int fw_bitset; 200 bool fw_aborting; /* fw_lock */ 201 }; 202 203 /* 204 * futex_tab 205 * 206 * Global trees of futexes by vmspace/va and VM object address. 207 * 208 * XXX This obviously doesn't scale in parallel. We could use a 209 * pserialize-safe data structure, but there may be a high cost to 210 * frequent deletion since we don't cache futexes after we're done 211 * with them. We could use hashed locks. But for now, just make 212 * sure userland can't DoS the serial performance, by using a 213 * balanced binary tree for lookup. 214 * 215 * XXX We could use a per-process tree for the table indexed by 216 * virtual address to reduce contention between processes. 217 */ 218 static struct { 219 kmutex_t lock; 220 struct rb_tree va; 221 struct rb_tree oa; 222 } futex_tab __cacheline_aligned; 223 224 static int 225 compare_futex_key(void *cookie, const void *n, const void *k) 226 { 227 const struct futex *fa = n; 228 const union futex_key *fka = &fa->fx_key; 229 const union futex_key *fkb = k; 230 231 if ((uintptr_t)fka->fk_private.vmspace < 232 (uintptr_t)fkb->fk_private.vmspace) 233 return -1; 234 if ((uintptr_t)fka->fk_private.vmspace > 235 (uintptr_t)fkb->fk_private.vmspace) 236 return +1; 237 if (fka->fk_private.va < fkb->fk_private.va) 238 return -1; 239 if (fka->fk_private.va > fkb->fk_private.va) 240 return +1; 241 return 0; 242 } 243 244 static int 245 compare_futex(void *cookie, const void *na, const void *nb) 246 { 247 const struct futex *fa = na; 248 const struct futex *fb = nb; 249 250 return compare_futex_key(cookie, fa, &fb->fx_key); 251 } 252 253 static const rb_tree_ops_t futex_rb_ops = { 254 .rbto_compare_nodes = compare_futex, 255 .rbto_compare_key = compare_futex_key, 256 .rbto_node_offset = offsetof(struct futex, fx_node), 257 }; 258 259 static int 260 compare_futex_shared_key(void *cookie, const void *n, const void *k) 261 { 262 const struct futex *fa = n; 263 const union futex_key *fka = &fa->fx_key; 264 const union futex_key *fkb = k; 265 266 return uvm_voaddr_compare(&fka->fk_shared, &fkb->fk_shared); 267 } 268 269 static int 270 compare_futex_shared(void *cookie, const void *na, const void *nb) 271 { 272 const struct futex *fa = na; 273 const struct futex *fb = nb; 274 275 return compare_futex_shared_key(cookie, fa, &fb->fx_key); 276 } 277 278 static const rb_tree_ops_t futex_shared_rb_ops = { 279 .rbto_compare_nodes = compare_futex_shared, 280 .rbto_compare_key = compare_futex_shared_key, 281 .rbto_node_offset = offsetof(struct futex, fx_node), 282 }; 283 284 static void futex_wait_dequeue(struct futex_wait *, struct futex *); 285 286 /* 287 * futex_load(uaddr, kaddr) 288 * 289 * Perform a single atomic load to read *uaddr, and return the 290 * result in *kaddr. Return 0 on success, EFAULT if uaddr is not 291 * mapped. 292 */ 293 static inline int 294 futex_load(int *uaddr, int *kaddr) 295 { 296 return ufetch_int((u_int *)uaddr, (u_int *)kaddr); 297 } 298 299 /* 300 * futex_test(uaddr, expected) 301 * 302 * True if *uaddr == expected. False if *uaddr != expected, or if 303 * uaddr is not mapped. 
 */
static bool
futex_test(int *uaddr, int expected)
{
	int val;
	int error;

	error = futex_load(uaddr, &val);
	if (error)
		return false;
	return val == expected;
}

/*
 * futex_sys_init()
 *
 *	Initialize the futex subsystem.
 */
void
futex_sys_init(void)
{

	mutex_init(&futex_tab.lock, MUTEX_DEFAULT, IPL_NONE);
	rb_tree_init(&futex_tab.va, &futex_rb_ops);
	rb_tree_init(&futex_tab.oa, &futex_shared_rb_ops);
}

/*
 * futex_sys_fini()
 *
 *	Finalize the futex subsystem.
 */
void
futex_sys_fini(void)
{

	KASSERT(RB_TREE_MIN(&futex_tab.oa) == NULL);
	KASSERT(RB_TREE_MIN(&futex_tab.va) == NULL);
	mutex_destroy(&futex_tab.lock);
}

/*
 * futex_queue_init(f)
 *
 *	Initialize the futex queue.  Caller must call futex_queue_fini
 *	when done.
 *
 *	Never sleeps.
 */
static void
futex_queue_init(struct futex *f)
{

	mutex_init(&f->fx_qlock, MUTEX_DEFAULT, IPL_NONE);
	mutex_init(&f->fx_abortlock, MUTEX_DEFAULT, IPL_NONE);
	cv_init(&f->fx_abortcv, "fqabort");
	LIST_INIT(&f->fx_abortlist);
	TAILQ_INIT(&f->fx_queue);
}

/*
 * futex_queue_drain(f)
 *
 *	Wait for any aborting waiters in f; then empty the queue of
 *	any stragglers and wake them.  Caller must guarantee no new
 *	references to f.
 *
 *	May sleep.
 */
static void
futex_queue_drain(struct futex *f)
{
	struct futex_wait *fw, *fw_next;

	mutex_enter(&f->fx_abortlock);
	while (!LIST_EMPTY(&f->fx_abortlist))
		cv_wait(&f->fx_abortcv, &f->fx_abortlock);
	mutex_exit(&f->fx_abortlock);

	mutex_enter(&f->fx_qlock);
	TAILQ_FOREACH_SAFE(fw, &f->fx_queue, fw_entry, fw_next) {
		mutex_enter(&fw->fw_lock);
		futex_wait_dequeue(fw, f);
		cv_broadcast(&fw->fw_cv);
		mutex_exit(&fw->fw_lock);
	}
	mutex_exit(&f->fx_qlock);
}

/*
 * futex_queue_fini(f)
 *
 *	Finalize the futex queue initialized by futex_queue_init.  Queue
 *	must be empty.  Caller must not use f again until a subsequent
 *	futex_queue_init.
 */
static void
futex_queue_fini(struct futex *f)
{

	KASSERT(TAILQ_EMPTY(&f->fx_queue));
	KASSERT(LIST_EMPTY(&f->fx_abortlist));
	mutex_destroy(&f->fx_qlock);
	mutex_destroy(&f->fx_abortlock);
	cv_destroy(&f->fx_abortcv);
}

/*
 * futex_key_init(key, vm, va, shared)
 *
 *	Initialize a futex key for lookup, etc.
 */
static int
futex_key_init(union futex_key *fk, struct vmspace *vm, vaddr_t va, bool shared)
{
	int error = 0;

	if (__predict_false(shared)) {
		if (!uvm_voaddr_acquire(&vm->vm_map, va, &fk->fk_shared))
			error = EFAULT;
	} else {
		fk->fk_private.vmspace = vm;
		fk->fk_private.va = va;
	}

	return error;
}

/*
 * futex_key_fini(key, shared)
 *
 *	Release a futex key.
 */
static void
futex_key_fini(union futex_key *fk, bool shared)
{
	if (__predict_false(shared))
		uvm_voaddr_release(&fk->fk_shared);
	memset(fk, 0, sizeof(*fk));
}

/*
 * futex_create(fk, shared)
 *
 *	Create a futex.  Initial reference count is 1, representing the
 *	caller.  Returns NULL on failure.  Always takes ownership of the
 *	key, either transferring it to the newly-created futex, or
 *	releasing the key if creation fails.
 *
 *	Never sleeps for memory, but may sleep to acquire a lock.
 */
static struct futex *
futex_create(union futex_key *fk, bool shared)
{
	struct futex *f;

	f = kmem_alloc(sizeof(*f), KM_NOSLEEP);
	if (f == NULL) {
		futex_key_fini(fk, shared);
		return NULL;
	}
	f->fx_key = *fk;
	f->fx_refcnt = 1;
	f->fx_shared = shared;
	f->fx_on_tree = false;
	futex_queue_init(f);

	return f;
}

/*
 * futex_destroy(f)
 *
 *	Destroy a futex created with futex_create.  Reference count
 *	must be zero.
 *
 *	May sleep.
 */
static void
futex_destroy(struct futex *f)
{

	ASSERT_SLEEPABLE();

	KASSERT(atomic_load_relaxed(&f->fx_refcnt) == 0);
	KASSERT(!f->fx_on_tree);

	/* Drain and destroy the private queue. */
	futex_queue_drain(f);
	futex_queue_fini(f);

	futex_key_fini(&f->fx_key, f->fx_shared);

	kmem_free(f, sizeof(*f));
}

/*
 * futex_hold(f)
 *
 *	Attempt to acquire a reference to f.  Return 0 on success,
 *	ENFILE on too many references.
 *
 *	Never sleeps.
 */
static int
futex_hold(struct futex *f)
{
	unsigned long refcnt;

	do {
		refcnt = atomic_load_relaxed(&f->fx_refcnt);
		if (refcnt == ULONG_MAX)
			return ENFILE;
	} while (atomic_cas_ulong(&f->fx_refcnt, refcnt, refcnt + 1) != refcnt);

	return 0;
}

/*
 * futex_rele(f)
 *
 *	Release a reference to f acquired with futex_create or
 *	futex_hold.
 *
 *	May sleep to free f.
 */
static void
futex_rele(struct futex *f)
{
	unsigned long refcnt;

	ASSERT_SLEEPABLE();

	do {
		refcnt = atomic_load_relaxed(&f->fx_refcnt);
		if (refcnt == 1)
			goto trylast;
		membar_release();
	} while (atomic_cas_ulong(&f->fx_refcnt, refcnt, refcnt - 1) != refcnt);
	return;

trylast:
	mutex_enter(&futex_tab.lock);
	if (atomic_dec_ulong_nv(&f->fx_refcnt) == 0) {
		membar_acquire();
		if (f->fx_on_tree) {
			if (__predict_false(f->fx_shared))
				rb_tree_remove_node(&futex_tab.oa, f);
			else
				rb_tree_remove_node(&futex_tab.va, f);
			f->fx_on_tree = false;
		}
	} else {
		/* References remain -- don't destroy it. */
		f = NULL;
	}
	mutex_exit(&futex_tab.lock);
	if (f != NULL)
		futex_destroy(f);
}

/*
 * futex_rele_not_last(f)
 *
 *	Release a reference to f acquired with futex_create or
 *	futex_hold.
 *
 *	This version asserts that we are not dropping the last
 *	reference to f.
 */
static void
futex_rele_not_last(struct futex *f)
{
	unsigned long refcnt;

	do {
		refcnt = atomic_load_relaxed(&f->fx_refcnt);
		KASSERT(refcnt > 1);
	} while (atomic_cas_ulong(&f->fx_refcnt, refcnt, refcnt - 1) != refcnt);
}

/*
 * futex_lookup_by_key(key, shared, &f)
 *
 *	Try to find an existing futex va reference for the specified key.
 *	On success, return 0, set f to the futex found or to NULL if not
 *	found, and increment f's reference count if found.
 *
 *	Return ENFILE if reference count too high.
 *
 *	Internal lookup routine shared by futex_lookup() and
 *	futex_lookup_create().
596 */ 597 static int 598 futex_lookup_by_key(union futex_key *fk, bool shared, struct futex **fp) 599 { 600 struct futex *f; 601 int error = 0; 602 603 mutex_enter(&futex_tab.lock); 604 if (__predict_false(shared)) { 605 f = rb_tree_find_node(&futex_tab.oa, fk); 606 } else { 607 f = rb_tree_find_node(&futex_tab.va, fk); 608 } 609 if (f) { 610 error = futex_hold(f); 611 if (error) 612 f = NULL; 613 } 614 *fp = f; 615 mutex_exit(&futex_tab.lock); 616 617 return error; 618 } 619 620 /* 621 * futex_insert(f, fp) 622 * 623 * Try to insert the futex f into the tree by va. If there 624 * already is a futex for its va, acquire a reference to it, and 625 * store it in *fp; otherwise store f in *fp. 626 * 627 * Return 0 on success, ENFILE if there already is a futex but its 628 * reference count is too high. 629 */ 630 static int 631 futex_insert(struct futex *f, struct futex **fp) 632 { 633 struct futex *f0; 634 int error; 635 636 KASSERT(atomic_load_relaxed(&f->fx_refcnt) != 0); 637 KASSERT(!f->fx_on_tree); 638 639 mutex_enter(&futex_tab.lock); 640 if (__predict_false(f->fx_shared)) 641 f0 = rb_tree_insert_node(&futex_tab.oa, f); 642 else 643 f0 = rb_tree_insert_node(&futex_tab.va, f); 644 if (f0 == f) { 645 f->fx_on_tree = true; 646 error = 0; 647 } else { 648 KASSERT(atomic_load_relaxed(&f0->fx_refcnt) != 0); 649 KASSERT(f0->fx_on_tree); 650 error = futex_hold(f0); 651 if (error) 652 goto out; 653 } 654 *fp = f0; 655 out: mutex_exit(&futex_tab.lock); 656 657 return error; 658 } 659 660 /* 661 * futex_lookup(uaddr, shared, &f) 662 * 663 * Find a futex at the userland pointer uaddr in the current 664 * process's VM space. On success, return the futex in f and 665 * increment its reference count. 666 * 667 * Caller must call futex_rele when done. 668 */ 669 static int 670 futex_lookup(int *uaddr, bool shared, struct futex **fp) 671 { 672 union futex_key fk; 673 struct vmspace *vm = curproc->p_vmspace; 674 vaddr_t va = (vaddr_t)uaddr; 675 int error; 676 677 /* 678 * Reject unaligned user pointers so we don't cross page 679 * boundaries and so atomics will work. 680 */ 681 if ((va & 3) != 0) 682 return EINVAL; 683 684 /* Look it up. */ 685 error = futex_key_init(&fk, vm, va, shared); 686 if (error) 687 return error; 688 689 error = futex_lookup_by_key(&fk, shared, fp); 690 futex_key_fini(&fk, shared); 691 if (error) 692 return error; 693 694 KASSERT(*fp == NULL || (*fp)->fx_shared == shared); 695 KASSERT(*fp == NULL || atomic_load_relaxed(&(*fp)->fx_refcnt) != 0); 696 697 /* 698 * Success! (Caller must still check whether we found 699 * anything, but nothing went _wrong_ like trying to use 700 * unmapped memory.) 701 */ 702 KASSERT(error == 0); 703 704 return error; 705 } 706 707 /* 708 * futex_lookup_create(uaddr, shared, &f) 709 * 710 * Find or create a futex at the userland pointer uaddr in the 711 * current process's VM space. On success, return the futex in f 712 * and increment its reference count. 713 * 714 * Caller must call futex_rele when done. 715 */ 716 static int 717 futex_lookup_create(int *uaddr, bool shared, struct futex **fp) 718 { 719 union futex_key fk; 720 struct vmspace *vm = curproc->p_vmspace; 721 struct futex *f = NULL; 722 vaddr_t va = (vaddr_t)uaddr; 723 int error; 724 725 /* 726 * Reject unaligned user pointers so we don't cross page 727 * boundaries and so atomics will work. 
728 */ 729 if ((va & 3) != 0) 730 return EINVAL; 731 732 error = futex_key_init(&fk, vm, va, shared); 733 if (error) 734 return error; 735 736 /* 737 * Optimistically assume there already is one, and try to find 738 * it. 739 */ 740 error = futex_lookup_by_key(&fk, shared, fp); 741 if (error || *fp != NULL) { 742 /* 743 * We either found one, or there was an error. 744 * In either case, we are done with the key. 745 */ 746 futex_key_fini(&fk, shared); 747 goto out; 748 } 749 750 /* 751 * Create a futex record. This transfers ownership of the key 752 * in all cases. 753 */ 754 f = futex_create(&fk, shared); 755 if (f == NULL) { 756 error = ENOMEM; 757 goto out; 758 } 759 760 /* 761 * Insert our new futex, or use existing if someone else beat 762 * us to it. 763 */ 764 error = futex_insert(f, fp); 765 if (error) 766 goto out; 767 if (*fp == f) 768 f = NULL; /* don't release on exit */ 769 770 /* Success! */ 771 KASSERT(error == 0); 772 773 out: if (f != NULL) 774 futex_rele(f); 775 KASSERT(error || *fp != NULL); 776 KASSERT(error || atomic_load_relaxed(&(*fp)->fx_refcnt) != 0); 777 return error; 778 } 779 780 /* 781 * futex_wait_init(fw, bitset) 782 * 783 * Initialize a record for a thread to wait on a futex matching 784 * the specified bit set. Should be passed to futex_wait_enqueue 785 * before futex_wait, and should be passed to futex_wait_fini when 786 * done. 787 */ 788 static void 789 futex_wait_init(struct futex_wait *fw, int bitset) 790 { 791 792 KASSERT(bitset); 793 794 mutex_init(&fw->fw_lock, MUTEX_DEFAULT, IPL_NONE); 795 cv_init(&fw->fw_cv, "futex"); 796 fw->fw_futex = NULL; 797 fw->fw_bitset = bitset; 798 fw->fw_aborting = false; 799 } 800 801 /* 802 * futex_wait_fini(fw) 803 * 804 * Finalize a record for a futex waiter. Must not be on any 805 * futex's queue. 806 */ 807 static void 808 futex_wait_fini(struct futex_wait *fw) 809 { 810 811 KASSERT(fw->fw_futex == NULL); 812 813 cv_destroy(&fw->fw_cv); 814 mutex_destroy(&fw->fw_lock); 815 } 816 817 /* 818 * futex_wait_enqueue(fw, f) 819 * 820 * Put fw on the futex queue. Must be done before futex_wait. 821 * Caller must hold fw's lock and f's lock, and fw must not be on 822 * any existing futex's waiter list. 823 */ 824 static void 825 futex_wait_enqueue(struct futex_wait *fw, struct futex *f) 826 { 827 828 KASSERT(mutex_owned(&f->fx_qlock)); 829 KASSERT(mutex_owned(&fw->fw_lock)); 830 KASSERT(fw->fw_futex == NULL); 831 KASSERT(!fw->fw_aborting); 832 833 fw->fw_futex = f; 834 TAILQ_INSERT_TAIL(&f->fx_queue, fw, fw_entry); 835 } 836 837 /* 838 * futex_wait_dequeue(fw, f) 839 * 840 * Remove fw from the futex queue. Precludes subsequent 841 * futex_wait until a futex_wait_enqueue. Caller must hold fw's 842 * lock and f's lock, and fw must be on f. 843 */ 844 static void 845 futex_wait_dequeue(struct futex_wait *fw, struct futex *f) 846 { 847 848 KASSERT(mutex_owned(&f->fx_qlock)); 849 KASSERT(mutex_owned(&fw->fw_lock)); 850 KASSERT(fw->fw_futex == f); 851 852 TAILQ_REMOVE(&f->fx_queue, fw, fw_entry); 853 fw->fw_futex = NULL; 854 } 855 856 /* 857 * futex_wait_abort(fw) 858 * 859 * Caller is no longer waiting for fw. Remove it from any queue 860 * if it was on one. Caller must hold fw->fw_lock. 861 */ 862 static void 863 futex_wait_abort(struct futex_wait *fw) 864 { 865 struct futex *f; 866 867 KASSERT(mutex_owned(&fw->fw_lock)); 868 869 /* 870 * Grab the futex queue. It can't go away as long as we hold 871 * fw_lock. However, we can't take the queue lock because 872 * that's a lock order reversal. 
	 */
	f = fw->fw_futex;

	/* Put us on the abort list so that f won't go away. */
	mutex_enter(&f->fx_abortlock);
	LIST_INSERT_HEAD(&f->fx_abortlist, fw, fw_abort);
	mutex_exit(&f->fx_abortlock);

	/*
	 * Mark fw as aborting so it won't lose wakeups and won't be
	 * transferred to any other queue.
	 */
	fw->fw_aborting = true;

	/* f is now stable, so we can release fw_lock. */
	mutex_exit(&fw->fw_lock);

	/* Now we can remove fw under the queue lock. */
	mutex_enter(&f->fx_qlock);
	mutex_enter(&fw->fw_lock);
	futex_wait_dequeue(fw, f);
	mutex_exit(&fw->fw_lock);
	mutex_exit(&f->fx_qlock);

	/*
	 * Finally, remove us from the abort list and notify anyone
	 * waiting for the abort to complete if we were the last to go.
	 */
	mutex_enter(&f->fx_abortlock);
	LIST_REMOVE(fw, fw_abort);
	if (LIST_EMPTY(&f->fx_abortlist))
		cv_broadcast(&f->fx_abortcv);
	mutex_exit(&f->fx_abortlock);

	/*
	 * Release our reference to the futex now that we are not
	 * waiting for it.
	 */
	futex_rele(f);

	/*
	 * Reacquire the fw lock as caller expects.  Verify that we're
	 * aborting and no longer associated with a futex.
	 */
	mutex_enter(&fw->fw_lock);
	KASSERT(fw->fw_aborting);
	KASSERT(fw->fw_futex == NULL);
}

/*
 * futex_wait(fw, deadline, clkid)
 *
 *	fw must be a waiter on a futex's queue.  Wait until deadline on
 *	the clock clkid, or forever if deadline is NULL, for a futex
 *	wakeup.  Return 0 on explicit wakeup or destruction of futex,
 *	ETIMEDOUT on timeout, EINTR/ERESTART on signal.  Either way, fw
 *	will no longer be on a futex queue on return.
 */
static int
futex_wait(struct futex_wait *fw, const struct timespec *deadline,
    clockid_t clkid)
{
	int error = 0;

	/* Test and wait under the wait lock. */
	mutex_enter(&fw->fw_lock);

	for (;;) {
		/* If we're done, stop and report success. */
		if (fw->fw_bitset == 0 || fw->fw_futex == NULL) {
			error = 0;
			break;
		}

		/* If anything went wrong in the last iteration, stop. */
		if (error)
			break;

		/* Not done yet.  Wait. */
		if (deadline) {
			struct timespec ts;

			/* Check our watch. */
			error = clock_gettime1(clkid, &ts);
			if (error)
				break;

			/* If we're past the deadline, ETIMEDOUT. */
			if (timespeccmp(deadline, &ts, <=)) {
				error = ETIMEDOUT;
				break;
			}

			/* Count how much time is left. */
			timespecsub(deadline, &ts, &ts);

			/*
			 * Wait for that much time, allowing signals.
			 * Ignore EWOULDBLOCK, however: we will detect
			 * timeout ourselves on the next iteration of
			 * the loop -- and the timeout may have been
			 * truncated by tstohz, anyway.
			 */
			error = cv_timedwait_sig(&fw->fw_cv, &fw->fw_lock,
			    MAX(1, tstohz(&ts)));
			if (error == EWOULDBLOCK)
				error = 0;
		} else {
			/* Wait indefinitely, allowing signals. */
			error = cv_wait_sig(&fw->fw_cv, &fw->fw_lock);
		}
	}

	/*
	 * If we were woken up, the waker will have removed fw from the
	 * queue.  But if anything went wrong, we must remove fw from
	 * the queue ourselves.
990 */ 991 if (error) 992 futex_wait_abort(fw); 993 994 mutex_exit(&fw->fw_lock); 995 996 return error; 997 } 998 999 /* 1000 * futex_wake(f, nwake, f2, nrequeue, bitset) 1001 * 1002 * Wake up to nwake waiters on f matching bitset; then, if f2 is 1003 * provided, move up to nrequeue remaining waiters on f matching 1004 * bitset to f2. Return the number of waiters actually woken or 1005 * requeued. Caller must hold the locks of f and f2, if provided. 1006 */ 1007 static unsigned 1008 futex_wake(struct futex *f, unsigned nwake, struct futex *f2, 1009 unsigned nrequeue, int bitset) 1010 { 1011 struct futex_wait *fw, *fw_next; 1012 unsigned nwoken_or_requeued = 0; 1013 int hold_error __diagused; 1014 1015 KASSERT(mutex_owned(&f->fx_qlock)); 1016 KASSERT(f2 == NULL || mutex_owned(&f2->fx_qlock)); 1017 1018 /* Wake up to nwake waiters, and count the number woken. */ 1019 TAILQ_FOREACH_SAFE(fw, &f->fx_queue, fw_entry, fw_next) { 1020 if ((fw->fw_bitset & bitset) == 0) 1021 continue; 1022 if (nwake > 0) { 1023 mutex_enter(&fw->fw_lock); 1024 if (__predict_false(fw->fw_aborting)) { 1025 mutex_exit(&fw->fw_lock); 1026 continue; 1027 } 1028 futex_wait_dequeue(fw, f); 1029 fw->fw_bitset = 0; 1030 cv_broadcast(&fw->fw_cv); 1031 mutex_exit(&fw->fw_lock); 1032 nwake--; 1033 nwoken_or_requeued++; 1034 /* 1035 * Drop the futex reference on behalf of the 1036 * waiter. We assert this is not the last 1037 * reference on the futex (our caller should 1038 * also have one). 1039 */ 1040 futex_rele_not_last(f); 1041 } else { 1042 break; 1043 } 1044 } 1045 1046 if (f2) { 1047 /* Move up to nrequeue waiters from f's queue to f2's queue. */ 1048 TAILQ_FOREACH_SAFE(fw, &f->fx_queue, fw_entry, fw_next) { 1049 if ((fw->fw_bitset & bitset) == 0) 1050 continue; 1051 if (nrequeue > 0) { 1052 mutex_enter(&fw->fw_lock); 1053 if (__predict_false(fw->fw_aborting)) { 1054 mutex_exit(&fw->fw_lock); 1055 continue; 1056 } 1057 futex_wait_dequeue(fw, f); 1058 futex_wait_enqueue(fw, f2); 1059 mutex_exit(&fw->fw_lock); 1060 nrequeue--; 1061 /* 1062 * PR kern/59004: Missing constant for upper 1063 * bound on systemwide number of lwps 1064 */ 1065 KASSERT(nwoken_or_requeued < 1066 MIN(PID_MAX*MAXMAXLWP, FUTEX_TID_MASK)); 1067 __CTASSERT(UINT_MAX >= 1068 MIN(PID_MAX*MAXMAXLWP, FUTEX_TID_MASK)); 1069 if (++nwoken_or_requeued == 0) /* paranoia */ 1070 nwoken_or_requeued = UINT_MAX; 1071 /* 1072 * Transfer the reference from f to f2. 1073 * As above, we assert that we are not 1074 * dropping the last reference to f here. 1075 * 1076 * XXX futex_hold() could theoretically 1077 * XXX fail here. 1078 */ 1079 futex_rele_not_last(f); 1080 hold_error = futex_hold(f2); 1081 KASSERT(hold_error == 0); 1082 } else { 1083 break; 1084 } 1085 } 1086 } else { 1087 KASSERT(nrequeue == 0); 1088 } 1089 1090 /* Return the number of waiters woken or requeued. */ 1091 return nwoken_or_requeued; 1092 } 1093 1094 /* 1095 * futex_queue_lock(f) 1096 * 1097 * Acquire the queue lock of f. Pair with futex_queue_unlock. Do 1098 * not use if caller needs to acquire two locks; use 1099 * futex_queue_lock2 instead. 1100 */ 1101 static void 1102 futex_queue_lock(struct futex *f) 1103 { 1104 mutex_enter(&f->fx_qlock); 1105 } 1106 1107 /* 1108 * futex_queue_unlock(f) 1109 * 1110 * Release the queue lock of f. 
1111 */ 1112 static void 1113 futex_queue_unlock(struct futex *f) 1114 { 1115 mutex_exit(&f->fx_qlock); 1116 } 1117 1118 /* 1119 * futex_queue_lock2(f, f2) 1120 * 1121 * Acquire the queue locks of both f and f2, which may be null, or 1122 * which may have the same underlying queue. If they are 1123 * distinct, an arbitrary total order is chosen on the locks. 1124 * 1125 * Callers should only ever acquire multiple queue locks 1126 * simultaneously using futex_queue_lock2. 1127 */ 1128 static void 1129 futex_queue_lock2(struct futex *f, struct futex *f2) 1130 { 1131 1132 /* 1133 * If both are null, do nothing; if one is null and the other 1134 * is not, lock the other and be done with it. 1135 */ 1136 if (f == NULL && f2 == NULL) { 1137 return; 1138 } else if (f == NULL) { 1139 mutex_enter(&f2->fx_qlock); 1140 return; 1141 } else if (f2 == NULL) { 1142 mutex_enter(&f->fx_qlock); 1143 return; 1144 } 1145 1146 /* If both futexes are the same, acquire only one. */ 1147 if (f == f2) { 1148 mutex_enter(&f->fx_qlock); 1149 return; 1150 } 1151 1152 /* Otherwise, use the ordering on the kva of the futex pointer. */ 1153 if ((uintptr_t)f < (uintptr_t)f2) { 1154 mutex_enter(&f->fx_qlock); 1155 mutex_enter(&f2->fx_qlock); 1156 } else { 1157 mutex_enter(&f2->fx_qlock); 1158 mutex_enter(&f->fx_qlock); 1159 } 1160 } 1161 1162 /* 1163 * futex_queue_unlock2(f, f2) 1164 * 1165 * Release the queue locks of both f and f2, which may be null, or 1166 * which may have the same underlying queue. 1167 */ 1168 static void 1169 futex_queue_unlock2(struct futex *f, struct futex *f2) 1170 { 1171 1172 /* 1173 * If both are null, do nothing; if one is null and the other 1174 * is not, unlock the other and be done with it. 1175 */ 1176 if (f == NULL && f2 == NULL) { 1177 return; 1178 } else if (f == NULL) { 1179 mutex_exit(&f2->fx_qlock); 1180 return; 1181 } else if (f2 == NULL) { 1182 mutex_exit(&f->fx_qlock); 1183 return; 1184 } 1185 1186 /* If both futexes are the same, release only one. */ 1187 if (f == f2) { 1188 mutex_exit(&f->fx_qlock); 1189 return; 1190 } 1191 1192 /* Otherwise, use the ordering on the kva of the futex pointer. */ 1193 if ((uintptr_t)f < (uintptr_t)f2) { 1194 mutex_exit(&f2->fx_qlock); 1195 mutex_exit(&f->fx_qlock); 1196 } else { 1197 mutex_exit(&f->fx_qlock); 1198 mutex_exit(&f2->fx_qlock); 1199 } 1200 } 1201 1202 /* 1203 * futex_func_wait(uaddr, cmpval@val, bitset@val3, timeout, clkid, clkflags, 1204 * retval) 1205 * 1206 * Implement futex(FUTEX_WAIT) and futex(FUTEX_WAIT_BITSET): If 1207 * *uaddr == cmpval, wait until futex-woken on any of the bits in 1208 * bitset. But if *uaddr != cmpval, fail with EAGAIN. 1209 * 1210 * For FUTEX_WAIT, bitset has all bits set and val3 is ignored. 1211 */ 1212 static int 1213 futex_func_wait(bool shared, int *uaddr, int cmpval, int bitset, 1214 const struct timespec *timeout, clockid_t clkid, int clkflags, 1215 register_t *retval) 1216 { 1217 struct futex *f; 1218 struct futex_wait wait, *fw = &wait; 1219 struct timespec ts; 1220 const struct timespec *deadline; 1221 int error; 1222 1223 /* 1224 * If there's nothing to wait for, and nobody will ever wake 1225 * us, then don't set anything up to wait -- just stop here. 1226 */ 1227 if (bitset == 0) 1228 return EINVAL; 1229 1230 /* Optimistically test before anything else. */ 1231 if (!futex_test(uaddr, cmpval)) 1232 return EAGAIN; 1233 1234 /* Determine a deadline on the specified clock. 
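	 * For FUTEX_WAIT the timeout is relative, so the deadline is
	 * now + timeout; for FUTEX_WAIT_BITSET the timeout is already an
	 * absolute deadline (do_futex() passes TIMER_RELTIME or
	 * TIMER_ABSTIME accordingly).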
*/ 1235 if (timeout == NULL || (clkflags & TIMER_ABSTIME) == TIMER_ABSTIME) { 1236 deadline = timeout; 1237 } else { 1238 error = clock_gettime1(clkid, &ts); 1239 if (error) 1240 return error; 1241 timespecadd(&ts, timeout, &ts); 1242 deadline = &ts; 1243 } 1244 1245 /* Get the futex, creating it if necessary. */ 1246 error = futex_lookup_create(uaddr, shared, &f); 1247 if (error) 1248 return error; 1249 KASSERT(f); 1250 1251 /* Get ready to wait. */ 1252 futex_wait_init(fw, bitset); 1253 1254 /* 1255 * Under the queue lock, check the value again: if it has 1256 * already changed, EAGAIN; otherwise enqueue the waiter. 1257 * Since FUTEX_WAKE will use the same lock and be done after 1258 * modifying the value, the order in which we check and enqueue 1259 * is immaterial. 1260 */ 1261 futex_queue_lock(f); 1262 if (!futex_test(uaddr, cmpval)) { 1263 futex_queue_unlock(f); 1264 error = EAGAIN; 1265 goto out; 1266 } 1267 mutex_enter(&fw->fw_lock); 1268 futex_wait_enqueue(fw, f); 1269 mutex_exit(&fw->fw_lock); 1270 futex_queue_unlock(f); 1271 1272 /* 1273 * We cannot drop our reference to the futex here, because 1274 * we might be enqueued on a different one when we are awakened. 1275 * The references will be managed on our behalf in the requeue 1276 * and wake cases. 1277 */ 1278 f = NULL; 1279 1280 /* Wait. */ 1281 error = futex_wait(fw, deadline, clkid); 1282 if (error) 1283 goto out; 1284 1285 /* Return 0 on success, error on failure. */ 1286 *retval = 0; 1287 1288 out: if (f != NULL) 1289 futex_rele(f); 1290 futex_wait_fini(fw); 1291 return error; 1292 } 1293 1294 /* 1295 * futex_func_wake(uaddr, nwake@val, bitset@val3, retval) 1296 * 1297 * Implement futex(FUTEX_WAKE) and futex(FUTEX_WAKE_BITSET): Wake 1298 * up to nwake waiters on uaddr waiting on any of the bits in 1299 * bitset. 1300 * 1301 * Return the number of waiters woken. 1302 * 1303 * For FUTEX_WAKE, bitset has all bits set and val3 is ignored. 1304 */ 1305 static int 1306 futex_func_wake(bool shared, int *uaddr, int nwake, int bitset, 1307 register_t *retval) 1308 { 1309 struct futex *f; 1310 unsigned int nwoken = 0; 1311 int error = 0; 1312 1313 /* Reject negative number of wakeups. */ 1314 if (nwake < 0) { 1315 error = EINVAL; 1316 goto out; 1317 } 1318 1319 /* Look up the futex, if any. */ 1320 error = futex_lookup(uaddr, shared, &f); 1321 if (error) 1322 goto out; 1323 1324 /* If there's no futex, there are no waiters to wake. */ 1325 if (f == NULL) 1326 goto out; 1327 1328 /* 1329 * Under f's queue lock, wake the waiters and remember the 1330 * number woken. 1331 */ 1332 futex_queue_lock(f); 1333 nwoken = futex_wake(f, nwake, NULL, /*nrequeue*/0, bitset); 1334 futex_queue_unlock(f); 1335 1336 /* Release the futex. */ 1337 futex_rele(f); 1338 1339 out: 1340 /* Return the number of waiters woken. */ 1341 *retval = nwoken; 1342 1343 /* Success! */ 1344 return error; 1345 } 1346 1347 /* 1348 * futex_func_requeue(op, uaddr, nwake@val, uaddr2, nrequeue@val2, 1349 * cmpval@val3, retval) 1350 * 1351 * Implement futex(FUTEX_REQUEUE) and futex(FUTEX_CMP_REQUEUE): If 1352 * *uaddr == cmpval or if op == FUTEX_REQUEUE, wake up to nwake 1353 * waiters at uaddr and then requeue up to nrequeue waiters from 1354 * uaddr to uaddr2. 1355 * 1356 * Return the number of waiters woken or requeued. 1357 * 1358 * For FUTEX_CMP_REQUEUE, if *uaddr != cmpval, fail with EAGAIN 1359 * and no wakeups. 
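 *
 *	Illustrative example (a sketch, not part of the original
 *	comment): a userland condition variable can "broadcast" by
 *	waking one waiter and requeueing the rest onto the associated
 *	mutex word, so they are woken one at a time as the mutex is
 *	released rather than stampeding:
 *
 *		// cv and mtx are userland futex words; seq is the
 *		// value of cv observed before blocking.
 *		futex(&cv, FUTEX_CMP_REQUEUE, 1, NULL, &mtx, INT_MAX,
 *		    seq);
 *
 *	This wakes up to one waiter on cv, moves the rest to mtx, and
 *	fails with EAGAIN if cv changed from seq in the meantime.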
1360 */ 1361 static int 1362 futex_func_requeue(bool shared, int op, int *uaddr, int nwake, int *uaddr2, 1363 int nrequeue, int cmpval, register_t *retval) 1364 { 1365 struct futex *f = NULL, *f2 = NULL; 1366 unsigned nwoken_or_requeued = 0; /* default to zero on early return */ 1367 int error; 1368 1369 /* Reject negative number of wakeups or requeues. */ 1370 if (nwake < 0 || nrequeue < 0) { 1371 error = EINVAL; 1372 goto out; 1373 } 1374 1375 /* 1376 * Look up or create the source futex. For FUTEX_CMP_REQUEUE, 1377 * we always create it, rather than bail if it has no waiters, 1378 * because FUTEX_CMP_REQUEUE always tests the futex word in 1379 * order to report EAGAIN. 1380 */ 1381 error = (op == FUTEX_CMP_REQUEUE 1382 ? futex_lookup_create(uaddr, shared, &f) 1383 : futex_lookup(uaddr, shared, &f)); 1384 if (error) 1385 goto out; 1386 1387 /* If there is none for FUTEX_REQUEUE, nothing to do. */ 1388 if (f == NULL) { 1389 KASSERT(op != FUTEX_CMP_REQUEUE); 1390 goto out; 1391 } 1392 1393 /* 1394 * We may need to create the destination futex because it's 1395 * entirely possible it does not currently have any waiters. 1396 */ 1397 error = futex_lookup_create(uaddr2, shared, &f2); 1398 if (error) 1399 goto out; 1400 1401 /* 1402 * Under the futexes' queue locks, check the value; if 1403 * unchanged from cmpval, or if this is the unconditional 1404 * FUTEX_REQUEUE operation, wake the waiters. 1405 */ 1406 futex_queue_lock2(f, f2); 1407 if (op == FUTEX_CMP_REQUEUE && !futex_test(uaddr, cmpval)) { 1408 error = EAGAIN; 1409 } else { 1410 error = 0; 1411 nwoken_or_requeued = futex_wake(f, nwake, f2, nrequeue, 1412 FUTEX_BITSET_MATCH_ANY); 1413 } 1414 futex_queue_unlock2(f, f2); 1415 1416 out: 1417 /* Return the number of waiters woken or requeued. */ 1418 *retval = nwoken_or_requeued; 1419 1420 /* Release the futexes if we got them. */ 1421 if (f2) 1422 futex_rele(f2); 1423 if (f) 1424 futex_rele(f); 1425 return error; 1426 } 1427 1428 /* 1429 * futex_opcmp_arg(arg) 1430 * 1431 * arg is either the oparg or cmparg field of a FUTEX_WAKE_OP 1432 * operation, a 12-bit string in either case. Map it to a numeric 1433 * argument value by sign-extending it in two's-complement 1434 * representation. 1435 */ 1436 static int 1437 futex_opcmp_arg(int arg) 1438 { 1439 1440 KASSERT(arg == (arg & __BITS(11,0))); 1441 return arg - 0x1000*__SHIFTOUT(arg, __BIT(11)); 1442 } 1443 1444 /* 1445 * futex_validate_op_cmp(opcmp) 1446 * 1447 * Validate an op/cmp argument for FUTEX_WAKE_OP. 1448 */ 1449 static int 1450 futex_validate_op_cmp(int opcmp) 1451 { 1452 int op = __SHIFTOUT(opcmp, FUTEX_OP_OP_MASK); 1453 int cmp = __SHIFTOUT(opcmp, FUTEX_OP_CMP_MASK); 1454 1455 if (op & FUTEX_OP_OPARG_SHIFT) { 1456 int oparg = 1457 futex_opcmp_arg(__SHIFTOUT(opcmp, FUTEX_OP_OPARG_MASK)); 1458 if (oparg < 0) 1459 return EINVAL; 1460 if (oparg >= 32) 1461 return EINVAL; 1462 op &= ~FUTEX_OP_OPARG_SHIFT; 1463 } 1464 1465 switch (op) { 1466 case FUTEX_OP_SET: 1467 case FUTEX_OP_ADD: 1468 case FUTEX_OP_OR: 1469 case FUTEX_OP_ANDN: 1470 case FUTEX_OP_XOR: 1471 break; 1472 default: 1473 return EINVAL; 1474 } 1475 1476 switch (cmp) { 1477 case FUTEX_OP_CMP_EQ: 1478 case FUTEX_OP_CMP_NE: 1479 case FUTEX_OP_CMP_LT: 1480 case FUTEX_OP_CMP_LE: 1481 case FUTEX_OP_CMP_GT: 1482 case FUTEX_OP_CMP_GE: 1483 break; 1484 default: 1485 return EINVAL; 1486 } 1487 1488 return 0; 1489 } 1490 1491 /* 1492 * futex_compute_op(oldval, opcmp) 1493 * 1494 * Apply a FUTEX_WAKE_OP operation to oldval. 
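 *
 *	Worked example (illustrative, using the field masks referenced
 *	below): an opcmp word encoding op = FUTEX_OP_ADD, oparg = 1,
 *	cmp = FUTEX_OP_CMP_GT, cmparg = 0 makes this return oldval + 1,
 *	and futex_compute_cmp() then reports whether the old value was
 *	> 0.  If FUTEX_OP_OPARG_SHIFT is set, oparg is first replaced by
 *	(1 << oparg), so an oparg field of 3 means "operate with 8".
 *	Because futex_opcmp_arg() sign-extends the 12-bit fields, an
 *	oparg or cmparg field of 0xfff decodes to -1.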
1495 */ 1496 static int 1497 futex_compute_op(int oldval, int opcmp) 1498 { 1499 int op = __SHIFTOUT(opcmp, FUTEX_OP_OP_MASK); 1500 int oparg = futex_opcmp_arg(__SHIFTOUT(opcmp, FUTEX_OP_OPARG_MASK)); 1501 1502 if (op & FUTEX_OP_OPARG_SHIFT) { 1503 KASSERT(oparg >= 0); 1504 KASSERT(oparg < 32); 1505 oparg = 1u << oparg; 1506 op &= ~FUTEX_OP_OPARG_SHIFT; 1507 } 1508 1509 switch (op) { 1510 case FUTEX_OP_SET: 1511 return oparg; 1512 1513 case FUTEX_OP_ADD: 1514 /* 1515 * Avoid signed arithmetic overflow by doing 1516 * arithmetic unsigned and converting back to signed 1517 * at the end. 1518 */ 1519 return (int)((unsigned)oldval + (unsigned)oparg); 1520 1521 case FUTEX_OP_OR: 1522 return oldval | oparg; 1523 1524 case FUTEX_OP_ANDN: 1525 return oldval & ~oparg; 1526 1527 case FUTEX_OP_XOR: 1528 return oldval ^ oparg; 1529 1530 default: 1531 panic("invalid futex op"); 1532 } 1533 } 1534 1535 /* 1536 * futex_compute_cmp(oldval, opcmp) 1537 * 1538 * Apply a FUTEX_WAKE_OP comparison to oldval. 1539 */ 1540 static bool 1541 futex_compute_cmp(int oldval, int opcmp) 1542 { 1543 int cmp = __SHIFTOUT(opcmp, FUTEX_OP_CMP_MASK); 1544 int cmparg = futex_opcmp_arg(__SHIFTOUT(opcmp, FUTEX_OP_CMPARG_MASK)); 1545 1546 switch (cmp) { 1547 case FUTEX_OP_CMP_EQ: 1548 return (oldval == cmparg); 1549 1550 case FUTEX_OP_CMP_NE: 1551 return (oldval != cmparg); 1552 1553 case FUTEX_OP_CMP_LT: 1554 return (oldval < cmparg); 1555 1556 case FUTEX_OP_CMP_LE: 1557 return (oldval <= cmparg); 1558 1559 case FUTEX_OP_CMP_GT: 1560 return (oldval > cmparg); 1561 1562 case FUTEX_OP_CMP_GE: 1563 return (oldval >= cmparg); 1564 1565 default: 1566 panic("invalid futex cmp operation"); 1567 } 1568 } 1569 1570 /* 1571 * futex_func_wake_op(uaddr, nwake@val, uaddr2, nwake2@val2, opcmp@val3, 1572 * retval) 1573 * 1574 * Implement futex(FUTEX_WAKE_OP): 1575 * 1576 * 1. Update *uaddr2 according to the r/m/w operation specified in 1577 * opcmp. 1578 * 1579 * 2. If uaddr is nonnull, wake up to nwake waiters at uaddr. 1580 * 1581 * 3. If what was previously at *uaddr2 matches the comparison 1582 * operation specified in opcmp, additionally wake up to nwake2 1583 * waiters at uaddr2. 1584 */ 1585 static int 1586 futex_func_wake_op(bool shared, int *uaddr, int nwake, int *uaddr2, int nwake2, 1587 int opcmp, register_t *retval) 1588 { 1589 struct futex *f = NULL, *f2 = NULL; 1590 int oldval, newval, actual; 1591 unsigned nwoken = 0; 1592 int error; 1593 1594 /* Reject negative number of wakeups. */ 1595 if (nwake < 0 || nwake2 < 0) { 1596 error = EINVAL; 1597 goto out; 1598 } 1599 1600 /* Reject invalid operations before we start doing things. */ 1601 if ((error = futex_validate_op_cmp(opcmp)) != 0) 1602 goto out; 1603 1604 /* Look up the first futex, if any. */ 1605 error = futex_lookup(uaddr, shared, &f); 1606 if (error) 1607 goto out; 1608 1609 /* Look up the second futex, if any. */ 1610 error = futex_lookup(uaddr2, shared, &f2); 1611 if (error) 1612 goto out; 1613 1614 /* 1615 * Under the queue locks: 1616 * 1617 * 1. Read/modify/write: *uaddr2 op= oparg, as in opcmp. 1618 * 2. Unconditionally wake uaddr. 1619 * 3. Conditionally wake uaddr2, if it previously matched the 1620 * comparison in opcmp. 
1621 */ 1622 futex_queue_lock2(f, f2); 1623 do { 1624 error = futex_load(uaddr2, &oldval); 1625 if (error) 1626 goto out_unlock; 1627 newval = futex_compute_op(oldval, opcmp); 1628 error = ucas_int(uaddr2, oldval, newval, &actual); 1629 if (error) 1630 goto out_unlock; 1631 } while (actual != oldval); 1632 if (f == NULL) { 1633 nwoken = 0; 1634 } else { 1635 nwoken = futex_wake(f, nwake, NULL, /*nrequeue*/0, 1636 FUTEX_BITSET_MATCH_ANY); 1637 } 1638 if (f2 && futex_compute_cmp(oldval, opcmp)) { 1639 nwoken += futex_wake(f2, nwake2, NULL, /*nrequeue*/0, 1640 FUTEX_BITSET_MATCH_ANY); 1641 } 1642 1643 /* Success! */ 1644 error = 0; 1645 out_unlock: 1646 futex_queue_unlock2(f, f2); 1647 1648 out: 1649 /* Return the number of waiters woken. */ 1650 *retval = nwoken; 1651 1652 /* Release the futexes, if we got them. */ 1653 if (f2) 1654 futex_rele(f2); 1655 if (f) 1656 futex_rele(f); 1657 return error; 1658 } 1659 1660 /* 1661 * do_futex(uaddr, op, val, timeout, uaddr2, val2, val3) 1662 * 1663 * Implement the futex system call with all the parameters 1664 * parsed out. 1665 */ 1666 int 1667 do_futex(int *uaddr, int op, int val, const struct timespec *timeout, 1668 int *uaddr2, int val2, int val3, register_t *retval) 1669 { 1670 const bool shared = (op & FUTEX_PRIVATE_FLAG) ? false : true; 1671 const clockid_t clkid = (op & FUTEX_CLOCK_REALTIME) ? CLOCK_REALTIME 1672 : CLOCK_MONOTONIC; 1673 1674 op &= FUTEX_CMD_MASK; 1675 1676 switch (op) { 1677 case FUTEX_WAIT: { 1678 const int cmpval = val; 1679 const int bitset = FUTEX_BITSET_MATCH_ANY; 1680 1681 return futex_func_wait(shared, uaddr, cmpval, bitset, timeout, 1682 clkid, TIMER_RELTIME, retval); 1683 } 1684 case FUTEX_WAKE: { 1685 const int nwake = val; 1686 const int bitset = FUTEX_BITSET_MATCH_ANY; 1687 1688 return futex_func_wake(shared, uaddr, nwake, bitset, retval); 1689 } 1690 case FUTEX_WAKE_BITSET: { 1691 const int nwake = val; 1692 const int bitset = val3; 1693 1694 return futex_func_wake(shared, uaddr, nwake, bitset, retval); 1695 } 1696 case FUTEX_REQUEUE: 1697 case FUTEX_CMP_REQUEUE: { 1698 const int nwake = val; 1699 const int nrequeue = val2; 1700 const int cmpval = val3; /* ignored if op=FUTEX_REQUEUE */ 1701 1702 return futex_func_requeue(shared, op, uaddr, nwake, uaddr2, 1703 nrequeue, cmpval, retval); 1704 } 1705 case FUTEX_WAIT_BITSET: { 1706 const int cmpval = val; 1707 const int bitset = val3; 1708 1709 return futex_func_wait(shared, uaddr, cmpval, bitset, timeout, 1710 clkid, TIMER_ABSTIME, retval); 1711 } 1712 case FUTEX_WAKE_OP: { 1713 const int nwake = val; 1714 const int nwake2 = val2; 1715 const int opcmp = val3; 1716 1717 return futex_func_wake_op(shared, uaddr, nwake, uaddr2, nwake2, 1718 opcmp, retval); 1719 } 1720 case FUTEX_FD: 1721 default: 1722 return ENOSYS; 1723 } 1724 } 1725 1726 /* 1727 * sys___futex(l, uap, retval) 1728 * 1729 * __futex(2) system call: generic futex operations. 1730 */ 1731 int 1732 sys___futex(struct lwp *l, const struct sys___futex_args *uap, 1733 register_t *retval) 1734 { 1735 /* { 1736 syscallarg(int *) uaddr; 1737 syscallarg(int) op; 1738 syscallarg(int) val; 1739 syscallarg(const struct timespec *) timeout; 1740 syscallarg(int *) uaddr2; 1741 syscallarg(int) val2; 1742 syscallarg(int) val3; 1743 } */ 1744 struct timespec ts, *tsp; 1745 int error; 1746 1747 /* 1748 * Copy in the timeout argument, if specified. 
1749 */ 1750 if (SCARG(uap, timeout)) { 1751 error = copyin(SCARG(uap, timeout), &ts, sizeof(ts)); 1752 if (error) 1753 return error; 1754 tsp = &ts; 1755 } else { 1756 tsp = NULL; 1757 } 1758 1759 return do_futex(SCARG(uap, uaddr), SCARG(uap, op), SCARG(uap, val), 1760 tsp, SCARG(uap, uaddr2), SCARG(uap, val2), SCARG(uap, val3), 1761 retval); 1762 } 1763 1764 /* 1765 * sys___futex_set_robust_list(l, uap, retval) 1766 * 1767 * __futex_set_robust_list(2) system call for robust futexes. 1768 */ 1769 int 1770 sys___futex_set_robust_list(struct lwp *l, 1771 const struct sys___futex_set_robust_list_args *uap, register_t *retval) 1772 { 1773 /* { 1774 syscallarg(void *) head; 1775 syscallarg(size_t) len; 1776 } */ 1777 void *head = SCARG(uap, head); 1778 1779 if (SCARG(uap, len) != _FUTEX_ROBUST_HEAD_SIZE) 1780 return EINVAL; 1781 if ((uintptr_t)head % sizeof(u_long)) 1782 return EINVAL; 1783 1784 l->l_robust_head = (uintptr_t)head; 1785 1786 return 0; 1787 } 1788 1789 /* 1790 * sys___futex_get_robust_list(l, uap, retval) 1791 * 1792 * __futex_get_robust_list(2) system call for robust futexes. 1793 */ 1794 int 1795 sys___futex_get_robust_list(struct lwp *l, 1796 const struct sys___futex_get_robust_list_args *uap, register_t *retval) 1797 { 1798 /* { 1799 syscallarg(lwpid_t) lwpid; 1800 syscallarg(void **) headp; 1801 syscallarg(size_t *) lenp; 1802 } */ 1803 void *head; 1804 const size_t len = _FUTEX_ROBUST_HEAD_SIZE; 1805 int error; 1806 1807 error = futex_robust_head_lookup(l, SCARG(uap, lwpid), &head); 1808 if (error) 1809 return error; 1810 1811 /* Copy out the head pointer and the head structure length. */ 1812 error = copyout(&head, SCARG(uap, headp), sizeof(head)); 1813 if (__predict_true(error == 0)) { 1814 error = copyout(&len, SCARG(uap, lenp), sizeof(len)); 1815 } 1816 1817 return error; 1818 } 1819 1820 /* 1821 * release_futex(uva, tid) 1822 * 1823 * Try to release the robust futex at uva in the current process 1824 * on lwp exit. If anything goes wrong, silently fail. It is the 1825 * userland program's obligation to arrange correct behaviour. 1826 */ 1827 static void 1828 release_futex(uintptr_t const uptr, lwpid_t const tid, bool const is_pi, 1829 bool const is_pending) 1830 { 1831 int *uaddr; 1832 struct futex *f; 1833 int oldval, newval, actual; 1834 int error; 1835 1836 /* If it's misaligned, tough. */ 1837 if (__predict_false(uptr & 3)) 1838 return; 1839 uaddr = (int *)uptr; 1840 1841 error = futex_load(uaddr, &oldval); 1842 if (__predict_false(error)) 1843 return; 1844 1845 /* 1846 * There are two race conditions we need to handle here: 1847 * 1848 * 1. User space cleared the futex word but died before 1849 * being able to issue the wakeup. No wakeups will 1850 * ever be issued, oops! 1851 * 1852 * 2. Awakened waiter died before being able to acquire 1853 * the futex in user space. Any other waiters are 1854 * now stuck, oops! 1855 * 1856 * In both of these cases, the futex word will be 0 (because 1857 * it's updated before the wake is issued). The best we can 1858 * do is detect this situation if it's the pending futex and 1859 * issue a wake without modifying the futex word. 1860 * 1861 * XXX eventual PI handling? 1862 */ 1863 if (__predict_false(is_pending && (oldval & ~FUTEX_WAITERS) == 0)) { 1864 register_t retval; 1865 (void) futex_func_wake(/*shared*/true, uaddr, 1, 1866 FUTEX_BITSET_MATCH_ANY, &retval); 1867 return; 1868 } 1869 1870 /* Optimistically test whether we need to do anything at all. 
	 */
	if ((oldval & FUTEX_TID_MASK) != tid)
		return;

	/*
	 * We need to handle the case where this thread owned the futex,
	 * but it was uncontended.  In this case, there won't be any
	 * kernel state to look up.  All we can do is mark the futex
	 * as a zombie to be mopped up the next time another thread
	 * attempts to acquire it.
	 *
	 * N.B. It's important to ensure that we set FUTEX_OWNER_DIED in
	 * this loop, even if waiters appear while we're doing
	 * so.  This is because FUTEX_WAITERS is set by user space
	 * before calling __futex() to wait, and the futex needs
	 * to be marked as a zombie when the new waiter gets into
	 * the kernel.
	 */
	if ((oldval & FUTEX_WAITERS) == 0) {
		do {
			error = futex_load(uaddr, &oldval);
			if (error)
				return;
			if ((oldval & FUTEX_TID_MASK) != tid)
				return;
			newval = oldval | FUTEX_OWNER_DIED;
			error = ucas_int(uaddr, oldval, newval, &actual);
			if (error)
				return;
		} while (actual != oldval);

		/*
		 * If there is still no indication of waiters, then there
		 * is no more work for us to do.
		 */
		if ((oldval & FUTEX_WAITERS) == 0)
			return;
	}

	/*
	 * Look for a shared futex since we have no positive indication
	 * it is private.  If we can't, tough.
	 */
	error = futex_lookup(uaddr, /*shared*/true, &f);
	if (error)
		return;

	/*
	 * If there's no kernel state for this futex, there's nothing to
	 * release.
	 */
	if (f == NULL)
		return;

	/* Work under the futex queue lock. */
	futex_queue_lock(f);

	/*
	 * Fetch the word: if the tid doesn't match ours, skip;
	 * otherwise, set the owner-died bit, atomically.
	 */
	do {
		error = futex_load(uaddr, &oldval);
		if (error)
			goto out;
		if ((oldval & FUTEX_TID_MASK) != tid)
			goto out;
		newval = oldval | FUTEX_OWNER_DIED;
		error = ucas_int(uaddr, oldval, newval, &actual);
		if (error)
			goto out;
	} while (actual != oldval);

	/*
	 * If there may be waiters, try to wake one.  If anything goes
	 * wrong, tough.
	 *
	 * XXX eventual PI handling?
	 */
	if (oldval & FUTEX_WAITERS) {
		(void)futex_wake(f, /*nwake*/1, NULL, /*nrequeue*/0,
		    FUTEX_BITSET_MATCH_ANY);
	}

	/* Unlock the queue and release the futex. */
out:	futex_queue_unlock(f);
	futex_rele(f);
}

/*
 * futex_robust_head_lookup(l, lwpid)
 *
 *	Helper function to look up a robust head by LWP ID.
 */
int
futex_robust_head_lookup(struct lwp *l, lwpid_t lwpid, void **headp)
{
	struct proc *p = l->l_proc;

	/* Find the other lwp, if requested; otherwise use our robust head. */
	if (lwpid) {
		mutex_enter(p->p_lock);
		l = lwp_find(p, lwpid);
		if (l == NULL) {
			mutex_exit(p->p_lock);
			return ESRCH;
		}
		*headp = (void *)l->l_robust_head;
		mutex_exit(p->p_lock);
	} else {
		*headp = (void *)l->l_robust_head;
	}
	return 0;
}

/*
 * futex_fetch_robust_head(uaddr)
 *
 *	Helper routine to fetch the futex robust list head that
 *	handles 32-bit binaries running on 64-bit kernels.
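 *
 *	The user-space head, as this code interprets it (a sketch of the
 *	layout assumed here, mirroring the Linux robust-list ABI), is an
 *	array of _FUTEX_ROBUST_HEAD_NWORDS pointer-sized words:
 *
 *		[_FUTEX_ROBUST_HEAD_LIST]	first entry in the circular
 *						list of held robust futexes
 *						(equals the head's own
 *						address when the list is
 *						empty)
 *		[_FUTEX_ROBUST_HEAD_OFFSET]	signed offset from each list
 *						entry to its futex word
 *		[_FUTEX_ROBUST_HEAD_PENDING]	entry being acquired or
 *						released when the lwp
 *						exited, or 0 if none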
 */
static int
futex_fetch_robust_head(uintptr_t uaddr, u_long *rhead)
{
#ifdef _LP64
	if (curproc->p_flag & PK_32) {
		uint32_t rhead32[_FUTEX_ROBUST_HEAD_NWORDS];
		int error;

		error = copyin((void *)uaddr, rhead32, sizeof(rhead32));
		if (__predict_true(error == 0)) {
			for (int i = 0; i < _FUTEX_ROBUST_HEAD_NWORDS; i++) {
				if (i == _FUTEX_ROBUST_HEAD_OFFSET) {
					/*
					 * Make sure the offset is sign-
					 * extended.
					 */
					rhead[i] = (int32_t)rhead32[i];
				} else {
					rhead[i] = rhead32[i];
				}
			}
		}
		return error;
	}
#endif /* _LP64 */

	return copyin((void *)uaddr, rhead,
	    sizeof(*rhead) * _FUTEX_ROBUST_HEAD_NWORDS);
}

/*
 * futex_decode_robust_word(word)
 *
 *	Decode a robust futex list word into the entry and entry
 *	properties.
 */
static inline void
futex_decode_robust_word(uintptr_t const word, uintptr_t * const entry,
    bool * const is_pi)
{
	*is_pi = (word & _FUTEX_ROBUST_ENTRY_PI) ? true : false;
	*entry = word & ~_FUTEX_ROBUST_ENTRY_PI;
}

/*
 * futex_fetch_robust_entry(uaddr)
 *
 *	Helper routine to fetch and decode a robust futex entry
 *	that handles 32-bit binaries running on 64-bit kernels.
 */
static int
futex_fetch_robust_entry(uintptr_t const uaddr, uintptr_t * const valp,
    bool * const is_pi)
{
	uintptr_t val = 0;
	int error = 0;

#ifdef _LP64
	if (curproc->p_flag & PK_32) {
		uint32_t val32;

		error = ufetch_32((uint32_t *)uaddr, &val32);
		if (__predict_true(error == 0))
			val = val32;
	} else
#endif /* _LP64 */
		error = ufetch_long((u_long *)uaddr, (u_long *)&val);
	if (__predict_false(error))
		return error;

	futex_decode_robust_word(val, valp, is_pi);
	return 0;
}

/*
 * futex_release_all_lwp(l)
 *
 *	Release all l's robust futexes.  If anything looks funny in
 *	the process, give up -- it's userland's responsibility to dot
 *	the i's and cross the t's.
 */
void
futex_release_all_lwp(struct lwp * const l)
{
	u_long rhead[_FUTEX_ROBUST_HEAD_NWORDS];
	int limit = 1000000;
	int error;

	/* If there's no robust list there's nothing to do. */
	if (l->l_robust_head == 0)
		return;

	KASSERT((l->l_lid & FUTEX_TID_MASK) == l->l_lid);

	/* Read the final snapshot of the robust list head. */
	error = futex_fetch_robust_head(l->l_robust_head, rhead);
	if (error) {
		printf("WARNING: pid %jd (%s) lwp %jd:"
		    " unmapped robust futex list head\n",
		    (uintmax_t)l->l_proc->p_pid, l->l_proc->p_comm,
		    (uintmax_t)l->l_lid);
		return;
	}

	const long offset = (long)rhead[_FUTEX_ROBUST_HEAD_OFFSET];

	uintptr_t next, pending;
	bool is_pi, pending_is_pi;

	futex_decode_robust_word(rhead[_FUTEX_ROBUST_HEAD_LIST],
	    &next, &is_pi);
	futex_decode_robust_word(rhead[_FUTEX_ROBUST_HEAD_PENDING],
	    &pending, &pending_is_pi);

	/*
	 * Walk down the list of locked futexes and release them, up
	 * to one million of them before we give up.
	 */

	while (next != l->l_robust_head && limit-- > 0) {
		/* pending handled below.
*/ 2112 if (next != pending) 2113 release_futex(next + offset, l->l_lid, is_pi, false); 2114 error = futex_fetch_robust_entry(next, &next, &is_pi); 2115 if (error) 2116 break; 2117 preempt_point(); 2118 } 2119 if (limit <= 0) { 2120 printf("WARNING: pid %jd (%s) lwp %jd:" 2121 " exhausted robust futex limit\n", 2122 (uintmax_t)l->l_proc->p_pid, l->l_proc->p_comm, 2123 (uintmax_t)l->l_lid); 2124 } 2125 2126 /* If there's a pending futex, it may need to be released too. */ 2127 if (pending != 0) { 2128 release_futex(pending + offset, l->l_lid, pending_is_pi, true); 2129 } 2130 } 2131