1 1.26 riastrad /* $NetBSD: sys_futex.c,v 1.26 2025/03/05 14:01:55 riastradh Exp $ */ 2 1.1 thorpej 3 1.1 thorpej /*- 4 1.1 thorpej * Copyright (c) 2018, 2019, 2020 The NetBSD Foundation, Inc. 5 1.1 thorpej * All rights reserved. 6 1.1 thorpej * 7 1.1 thorpej * This code is derived from software contributed to The NetBSD Foundation 8 1.1 thorpej * by Taylor R. Campbell and Jason R. Thorpe. 9 1.1 thorpej * 10 1.1 thorpej * Redistribution and use in source and binary forms, with or without 11 1.1 thorpej * modification, are permitted provided that the following conditions 12 1.1 thorpej * are met: 13 1.1 thorpej * 1. Redistributions of source code must retain the above copyright 14 1.1 thorpej * notice, this list of conditions and the following disclaimer. 15 1.1 thorpej * 2. Redistributions in binary form must reproduce the above copyright 16 1.1 thorpej * notice, this list of conditions and the following disclaimer in the 17 1.1 thorpej * documentation and/or other materials provided with the distribution. 18 1.1 thorpej * 19 1.1 thorpej * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 20 1.1 thorpej * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 21 1.1 thorpej * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 22 1.1 thorpej * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 23 1.1 thorpej * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 24 1.1 thorpej * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 25 1.1 thorpej * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 26 1.1 thorpej * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 27 1.1 thorpej * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 28 1.1 thorpej * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 29 1.1 thorpej * POSSIBILITY OF SUCH DAMAGE. 
30 1.1 thorpej */ 31 1.1 thorpej 32 1.1 thorpej #include <sys/cdefs.h> 33 1.26 riastrad __KERNEL_RCSID(0, "$NetBSD: sys_futex.c,v 1.26 2025/03/05 14:01:55 riastradh Exp $"); 34 1.1 thorpej 35 1.1 thorpej /* 36 1.1 thorpej * Futexes 37 1.1 thorpej * 38 1.1 thorpej * The futex system call coordinates notifying threads waiting for 39 1.1 thorpej * changes on a 32-bit word of memory. The word can be managed by 40 1.1 thorpej * CPU atomic operations in userland, without system calls, as long 41 1.1 thorpej * as there is no contention. 42 1.1 thorpej * 43 1.1 thorpej * The simplest use case demonstrating the utility is: 44 1.1 thorpej * 45 1.1 thorpej * // 32-bit word of memory shared among threads or 46 1.1 thorpej * // processes in userland. lock & 1 means owned; 47 1.1 thorpej * // lock & 2 means there are waiters waiting. 48 1.1 thorpej * volatile int lock = 0; 49 1.1 thorpej * 50 1.1 thorpej * int v; 51 1.1 thorpej * 52 1.1 thorpej * // Acquire a lock. 53 1.1 thorpej * do { 54 1.1 thorpej * v = lock; 55 1.1 thorpej * if (v & 1) { 56 1.1 thorpej * // Lock is held. Set a bit to say that 57 1.1 thorpej * // there are waiters, and wait for lock 58 1.1 thorpej * // to change to anything other than v; 59 1.1 thorpej * // then retry. 60 1.1 thorpej * if (atomic_cas_uint(&lock, v, v | 2) != v) 61 1.1 thorpej * continue; 62 1.24 riastrad * futex(&lock, FUTEX_WAIT, v | 2, NULL, NULL, 0, 63 1.24 riastrad * 0); 64 1.1 thorpej * continue; 65 1.1 thorpej * } 66 1.20 riastrad * } while (atomic_cas_uint(&lock, v, v | 1) != v); 67 1.18 riastrad * membar_acquire(); 68 1.1 thorpej * 69 1.1 thorpej * ... 70 1.1 thorpej * 71 1.1 thorpej * // Release the lock. Optimistically assume there are 72 1.1 thorpej * // no waiters first until demonstrated otherwise. 73 1.18 riastrad * membar_release(); 74 1.1 thorpej * if (atomic_cas_uint(&lock, 1, 0) != 1) { 75 1.1 thorpej * // There may be waiters. 
76 1.1 thorpej * v = atomic_swap_uint(&lock, 0); 77 1.1 thorpej * // If there are still waiters, wake one. 78 1.1 thorpej * if (v & 2) 79 1.24 riastrad * futex(&lock, FUTEX_WAKE, 1, NULL, NULL, 0, 0); 80 1.1 thorpej * } 81 1.1 thorpej * 82 1.1 thorpej * The goal is to avoid the futex system call unless there is 83 1.1 thorpej * contention; then if there is contention, to guarantee no missed 84 1.1 thorpej * wakeups. 85 1.1 thorpej * 86 1.1 thorpej * For a simple implementation, futex(FUTEX_WAIT) could queue 87 1.1 thorpej * itself to be woken, double-check the lock word, and then sleep; 88 1.1 thorpej * spurious wakeups are generally a fact of life, so any 89 1.1 thorpej * FUTEX_WAKE could just wake every FUTEX_WAIT in the system. 90 1.1 thorpej * 91 1.1 thorpej * If this were all there is to it, we could then increase 92 1.1 thorpej * parallelism by refining the approximation: partition the 93 1.1 thorpej * waiters into buckets by hashing the lock addresses to reduce 94 1.1 thorpej * the incidence of spurious wakeups. But this is not all. 95 1.1 thorpej * 96 1.24 riastrad * The futex(&lock, FUTEX_CMP_REQUEUE, n, timeout, &lock2, m, val) 97 1.1 thorpej * operation not only wakes n waiters on lock if lock == val, but 98 1.1 thorpej * also _transfers_ m additional waiters to lock2. Unless wakeups 99 1.1 thorpej * on lock2 also trigger wakeups on lock, we cannot move waiters 100 1.1 thorpej * to lock2 if they merely share the same hash as waiters on lock. 101 1.1 thorpej * Thus, we can't approximately distribute waiters into queues by 102 1.1 thorpej * a hash function; we must distinguish futex queues exactly by 103 1.1 thorpej * lock address. 104 1.1 thorpej * 105 1.1 thorpej * For now, we use a global red/black tree to index futexes. This 106 1.1 thorpej * should be replaced by a lockless radix tree with a thread to 107 1.1 thorpej * free entries no longer in use once all lookups on all CPUs have 108 1.1 thorpej * completed. 
 *
 * Specifically, we maintain two maps:
 *
 *	futex_tab.va[vmspace, va] for private futexes
 *	futex_tab.oa[uvm_voaddr] for shared futexes
 *
 * This implementation does not support priority inheritance.
 */

#include <sys/param.h>
#include <sys/types.h>
#include <sys/atomic.h>
#include <sys/condvar.h>
#include <sys/futex.h>
#include <sys/mutex.h>
#include <sys/rbtree.h>
#include <sys/queue.h>

#include <sys/syscall.h>
#include <sys/syscallargs.h>
#include <sys/syscallvar.h>

#include <uvm/uvm_extern.h>

/*
 * NOTE(review): futex_create()/futex_destroy() below use
 * kmem_alloc()/kmem_free(), and futex_key_fini() uses memset(), but
 * <sys/kmem.h> is not included directly here -- presumably it arrives
 * transitively via one of the headers above.  Consider including it
 * explicitly (include-what-you-use); TODO confirm against the build.
 */

/*
 * Lock order:
 *
 *	futex_tab.lock
 *	futex::fx_qlock			ordered by kva of struct futex
 *	 -> futex_wait::fw_lock		only one at a time
 *	futex_wait::fw_lock		only one at a time
 *	 -> futex::fx_abortlock		only one at a time
 */

/*
 * union futex_key
 *
 *	A futex is addressed either by a vmspace+va (private) or by
 *	a uvm_voaddr (shared).  Only one arm of the union is valid for
 *	any given futex; struct futex::fx_shared records which.
 */
union futex_key {
	struct {
		struct vmspace			*vmspace;
		vaddr_t				va;
	} fk_private;
	struct uvm_voaddr			fk_shared;
};

/*
 * struct futex
 *
 *	Kernel state for a futex located at a particular address in a
 *	particular virtual address space.
 *
 *	N.B.
 fx_refcnt is an unsigned long because we need to be able
 *	to operate on it atomically on all systems while at the same
 *	time rendering practically impossible the chance of it reaching
 *	its max value.  In practice, we're limited by the number of LWPs
 *	that can be present on the system at any given time, and the
 *	assumption is that limit will be good enough on a 32-bit platform.
 *	See futex_wake() for why overflow needs to be avoided.
 */
struct futex {
	union futex_key		fx_key;		/* identity: va or voaddr */
	unsigned long		fx_refcnt;	/* atomic; see N.B. above */
	bool			fx_shared;	/* selects fx_key arm */
	bool			fx_on_tree;	/* linked into futex_tab? */
	struct rb_node		fx_node;	/* futex_tab.va or .oa entry */

	kmutex_t		fx_qlock;	/* protects fx_queue */
	TAILQ_HEAD(, futex_wait) fx_queue;	/* waiters on this futex */

	kmutex_t		fx_abortlock;	/* protects fx_abortlist */
	LIST_HEAD(, futex_wait)	fx_abortlist;	/* waiters mid-abort */
	kcondvar_t		fx_abortcv;	/* signalled when list empties */
};

/*
 * struct futex_wait
 *
 *	State for a thread to wait on a futex.  Threads wait on fw_cv
 *	for fw_bitset to be set to zero.  The thread may transition to
 *	a different futex queue at any time under the futex's lock.
 */
struct futex_wait {
	kmutex_t		fw_lock;
	kcondvar_t		fw_cv;
	struct futex		*fw_futex;	/* futex we are queued on */
	TAILQ_ENTRY(futex_wait)	fw_entry;	/* queue lock */
	LIST_ENTRY(futex_wait)	fw_abort;	/* queue abortlock */
	int			fw_bitset;
	bool			fw_aborting;	/* fw_lock */
};

/*
 * futex_tab
 *
 *	Global trees of futexes by vmspace/va and VM object address.
207 1.1 thorpej * 208 1.1 thorpej * XXX This obviously doesn't scale in parallel. We could use a 209 1.1 thorpej * pserialize-safe data structure, but there may be a high cost to 210 1.1 thorpej * frequent deletion since we don't cache futexes after we're done 211 1.1 thorpej * with them. We could use hashed locks. But for now, just make 212 1.1 thorpej * sure userland can't DoS the serial performance, by using a 213 1.1 thorpej * balanced binary tree for lookup. 214 1.1 thorpej * 215 1.1 thorpej * XXX We could use a per-process tree for the table indexed by 216 1.1 thorpej * virtual address to reduce contention between processes. 217 1.1 thorpej */ 218 1.1 thorpej static struct { 219 1.1 thorpej kmutex_t lock; 220 1.1 thorpej struct rb_tree va; 221 1.1 thorpej struct rb_tree oa; 222 1.1 thorpej } futex_tab __cacheline_aligned; 223 1.1 thorpej 224 1.1 thorpej static int 225 1.1 thorpej compare_futex_key(void *cookie, const void *n, const void *k) 226 1.1 thorpej { 227 1.1 thorpej const struct futex *fa = n; 228 1.1 thorpej const union futex_key *fka = &fa->fx_key; 229 1.1 thorpej const union futex_key *fkb = k; 230 1.1 thorpej 231 1.1 thorpej if ((uintptr_t)fka->fk_private.vmspace < 232 1.1 thorpej (uintptr_t)fkb->fk_private.vmspace) 233 1.1 thorpej return -1; 234 1.1 thorpej if ((uintptr_t)fka->fk_private.vmspace > 235 1.1 thorpej (uintptr_t)fkb->fk_private.vmspace) 236 1.1 thorpej return +1; 237 1.1 thorpej if (fka->fk_private.va < fkb->fk_private.va) 238 1.1 thorpej return -1; 239 1.1 thorpej if (fka->fk_private.va > fkb->fk_private.va) 240 1.15 chs return +1; 241 1.1 thorpej return 0; 242 1.1 thorpej } 243 1.1 thorpej 244 1.1 thorpej static int 245 1.1 thorpej compare_futex(void *cookie, const void *na, const void *nb) 246 1.1 thorpej { 247 1.1 thorpej const struct futex *fa = na; 248 1.1 thorpej const struct futex *fb = nb; 249 1.1 thorpej 250 1.1 thorpej return compare_futex_key(cookie, fa, &fb->fx_key); 251 1.1 thorpej } 252 1.1 thorpej 253 1.1 thorpej 
/* rb_tree ops for the private (vmspace+va) futex table. */
static const rb_tree_ops_t futex_rb_ops = {
	.rbto_compare_nodes = compare_futex,
	.rbto_compare_key = compare_futex_key,
	.rbto_node_offset = offsetof(struct futex, fx_node),
};

/*
 * compare_futex_shared_key(cookie, n, k)
 *
 *	rb_tree key comparator for shared futexes: order by VM object
 *	address via uvm_voaddr_compare().
 */
static int
compare_futex_shared_key(void *cookie, const void *n, const void *k)
{
	const struct futex *fa = n;
	const union futex_key *fka = &fa->fx_key;
	const union futex_key *fkb = k;

	return uvm_voaddr_compare(&fka->fk_shared, &fkb->fk_shared);
}

/*
 * compare_futex_shared(cookie, na, nb)
 *
 *	rb_tree node comparator for shared futexes; defers to
 *	compare_futex_shared_key on nb's key.
 */
static int
compare_futex_shared(void *cookie, const void *na, const void *nb)
{
	const struct futex *fa = na;
	const struct futex *fb = nb;

	return compare_futex_shared_key(cookie, fa, &fb->fx_key);
}

/* rb_tree ops for the shared (uvm_voaddr) futex table. */
static const rb_tree_ops_t futex_shared_rb_ops = {
	.rbto_compare_nodes = compare_futex_shared,
	.rbto_compare_key = compare_futex_shared_key,
	.rbto_node_offset = offsetof(struct futex, fx_node),
};

/* Forward declaration: needed by futex_queue_drain() below. */
static void futex_wait_dequeue(struct futex_wait *, struct futex *);

/*
 * futex_load(uaddr, kaddr)
 *
 *	Perform a single atomic load to read *uaddr, and return the
 *	result in *kaddr.  Return 0 on success, EFAULT if uaddr is not
 *	mapped.
 */
static inline int
futex_load(int *uaddr, int *kaddr)
{
	/* ufetch_int does the (fault-catching) userspace access. */
	return ufetch_int((u_int *)uaddr, (u_int *)kaddr);
}

/*
 * futex_test(uaddr, expected)
 *
 *	True if *uaddr == expected.  False if *uaddr != expected, or if
 *	uaddr is not mapped (a load fault is folded into "not equal").
 */
static bool
futex_test(int *uaddr, int expected)
{
	int val;
	int error;

	error = futex_load(uaddr, &val);
	if (error)
		return false;
	return val == expected;
}

/*
 * futex_sys_init()
 *
 *	Initialize the futex subsystem: the table lock and both lookup
 *	trees (private by va, shared by VM object address).
 */
void
futex_sys_init(void)
{

	mutex_init(&futex_tab.lock, MUTEX_DEFAULT, IPL_NONE);
	rb_tree_init(&futex_tab.va, &futex_rb_ops);
	rb_tree_init(&futex_tab.oa, &futex_shared_rb_ops);
}

/*
 * futex_sys_fini()
 *
 *	Finalize the futex subsystem.  Both trees must already be
 *	empty -- every futex reference must have been released.
 */
void
futex_sys_fini(void)
{

	KASSERT(RB_TREE_MIN(&futex_tab.oa) == NULL);
	KASSERT(RB_TREE_MIN(&futex_tab.va) == NULL);
	mutex_destroy(&futex_tab.lock);
}

/*
 * futex_queue_init(f)
 *
 *	Initialize the futex queue.  Caller must call futex_queue_fini
 *	when done.
 *
 *	Never sleeps.
 */
static void
futex_queue_init(struct futex *f)
{

	mutex_init(&f->fx_qlock, MUTEX_DEFAULT, IPL_NONE);
	mutex_init(&f->fx_abortlock, MUTEX_DEFAULT, IPL_NONE);
	cv_init(&f->fx_abortcv, "fqabort");
	LIST_INIT(&f->fx_abortlist);
	TAILQ_INIT(&f->fx_queue);
}

/*
 * futex_queue_drain(f)
 *
 *	Wait for any aborting waiters in f; then empty the queue of
 *	any stragglers and wake them.  Caller must guarantee no new
 *	references to f.
 *
 *	May sleep.
 */
static void
futex_queue_drain(struct futex *f)
{
	struct futex_wait *fw, *fw_next;

	/* First wait out any in-flight aborts (see futex_wait_abort). */
	mutex_enter(&f->fx_abortlock);
	while (!LIST_EMPTY(&f->fx_abortlist))
		cv_wait(&f->fx_abortcv, &f->fx_abortlock);
	mutex_exit(&f->fx_abortlock);

	/*
	 * Then dequeue and wake every remaining waiter.  Lock order:
	 * fx_qlock -> fw_lock, per the lock-order comment up top.
	 */
	mutex_enter(&f->fx_qlock);
	TAILQ_FOREACH_SAFE(fw, &f->fx_queue, fw_entry, fw_next) {
		mutex_enter(&fw->fw_lock);
		futex_wait_dequeue(fw, f);
		cv_broadcast(&fw->fw_cv);
		mutex_exit(&fw->fw_lock);
	}
	mutex_exit(&f->fx_qlock);
}

/*
 * futex_queue_fini(fq)
 *
 *	Finalize the futex queue initialized by futex_queue_init.  Queue
 *	must be empty.  Caller must not use f again until a subsequent
 *	futex_queue_init.
 */
static void
futex_queue_fini(struct futex *f)
{

	KASSERT(TAILQ_EMPTY(&f->fx_queue));
	KASSERT(LIST_EMPTY(&f->fx_abortlist));
	mutex_destroy(&f->fx_qlock);
	mutex_destroy(&f->fx_abortlock);
	cv_destroy(&f->fx_abortcv);
}

/*
 * futex_key_init(key, vm, va, shared)
 *
 *	Initialize a futex key for lookup, etc.  For a shared futex
 *	this acquires a reference on the backing VM object address
 *	(released by futex_key_fini); for a private futex it simply
 *	records vm and va.  Returns 0 on success, EFAULT if the shared
 *	voaddr cannot be acquired.
 */
static int
futex_key_init(union futex_key *fk, struct vmspace *vm, vaddr_t va, bool shared)
{
	int error = 0;

	if (__predict_false(shared)) {
		if (!uvm_voaddr_acquire(&vm->vm_map, va, &fk->fk_shared))
			error = EFAULT;
	} else {
		fk->fk_private.vmspace = vm;
		fk->fk_private.va = va;
	}

	return error;
}

/*
 * futex_key_fini(key, shared)
 *
 *	Release a futex key.  Zeroes the key afterwards so stale
 *	contents cannot be reused accidentally.
 */
static void
futex_key_fini(union futex_key *fk, bool shared)
{
	if (__predict_false(shared))
		uvm_voaddr_release(&fk->fk_shared);
	memset(fk, 0, sizeof(*fk));
}

/*
 * futex_create(fk, shared)
 *
 *	Create a futex.  Initial reference count is 1, representing the
 *	caller.  Returns NULL on failure.  Always takes ownership of the
 *	key, either transferring it to the newly-created futex, or releasing
 *	the key if creation fails.
 *
 *	Never sleeps for memory, but may sleep to acquire a lock.
 */
static struct futex *
futex_create(union futex_key *fk, bool shared)
{
	struct futex *f;

	/* KM_NOSLEEP: allocation failure is reported, not waited out. */
	f = kmem_alloc(sizeof(*f), KM_NOSLEEP);
	if (f == NULL) {
		/* We still own the key on failure -- release it. */
		futex_key_fini(fk, shared);
		return NULL;
	}
	f->fx_key = *fk;	/* ownership of the key moves into f */
	f->fx_refcnt = 1;
	f->fx_shared = shared;
	f->fx_on_tree = false;
	futex_queue_init(f);

	return f;
}

/*
 * futex_destroy(f)
 *
 *	Destroy a futex created with futex_create.  Reference count
 *	must be zero.
 *
 *	May sleep.
 */
static void
futex_destroy(struct futex *f)
{

	ASSERT_SLEEPABLE();

	KASSERT(atomic_load_relaxed(&f->fx_refcnt) == 0);
	KASSERT(!f->fx_on_tree);

	/* Drain and destroy the private queue. */
	futex_queue_drain(f);
	futex_queue_fini(f);

	futex_key_fini(&f->fx_key, f->fx_shared);

	kmem_free(f, sizeof(*f));
}

/*
 * futex_hold(f)
 *
 *	Attempt to acquire a reference to f.  Return 0 on success,
 *	ENFILE on too many references.
 *
 *	Never sleeps.
 */
static int
futex_hold(struct futex *f)
{
	unsigned long refcnt;

	/* CAS loop: refuse to wrap the count past ULONG_MAX. */
	do {
		refcnt = atomic_load_relaxed(&f->fx_refcnt);
		if (refcnt == ULONG_MAX)
			return ENFILE;
	} while (atomic_cas_ulong(&f->fx_refcnt, refcnt, refcnt + 1) != refcnt);

	return 0;
}

/*
 * futex_rele(f)
 *
 *	Release a reference to f acquired with futex_create or
 *	futex_hold.
 *
 *	May sleep to free f.
 */
static void
futex_rele(struct futex *f)
{
	unsigned long refcnt;

	ASSERT_SLEEPABLE();

	do {
		refcnt = atomic_load_relaxed(&f->fx_refcnt);
		if (refcnt == 1)
			goto trylast;
		/*
		 * Publish our prior accesses before dropping the
		 * reference; pairs with the membar_acquire below when
		 * another thread takes the count to zero.
		 */
		membar_release();
	} while (atomic_cas_ulong(&f->fx_refcnt, refcnt, refcnt - 1) != refcnt);
	return;

trylast:
	/*
	 * Possibly the last reference.  Take the table lock first so
	 * that, if we do drop to zero, we can atomically unhook f from
	 * the lookup tree before anyone else can find it.
	 */
	mutex_enter(&futex_tab.lock);
	if (atomic_dec_ulong_nv(&f->fx_refcnt) == 0) {
		membar_acquire();
		if (f->fx_on_tree) {
			if (__predict_false(f->fx_shared))
				rb_tree_remove_node(&futex_tab.oa, f);
			else
				rb_tree_remove_node(&futex_tab.va, f);
			f->fx_on_tree = false;
		}
	} else {
		/* References remain -- don't destroy it. */
		f = NULL;
	}
	mutex_exit(&futex_tab.lock);
	if (f != NULL)
		futex_destroy(f);
}

/*
 * futex_rele_not_last(f)
 *
 *	Release a reference to f acquired with futex_create or
 *	futex_hold.
 *
 *	This version asserts that we are not dropping the last
 *	reference to f.
 */
static void
futex_rele_not_last(struct futex *f)
{
	unsigned long refcnt;

	do {
		refcnt = atomic_load_relaxed(&f->fx_refcnt);
		KASSERT(refcnt > 1);
	} while (atomic_cas_ulong(&f->fx_refcnt, refcnt, refcnt - 1) != refcnt);
}

/*
 * futex_lookup_by_key(key, shared, &f)
 *
 *	Try to find an existing futex va reference in the specified key
 *	On success, return 0, set f to found futex or to NULL if not found,
 *	and increment f's reference count if found.
 *
 *	Return ENFILE if reference count too high.
 *
 *	Internal lookup routine shared by futex_lookup() and
 *	futex_lookup_create().
 */
static int
futex_lookup_by_key(union futex_key *fk, bool shared, struct futex **fp)
{
	struct futex *f;
	int error = 0;

	mutex_enter(&futex_tab.lock);
	/* Shared futexes live in the oa tree, private ones in va. */
	if (__predict_false(shared)) {
		f = rb_tree_find_node(&futex_tab.oa, fk);
	} else {
		f = rb_tree_find_node(&futex_tab.va, fk);
	}
	if (f) {
		/* Take a reference under the table lock (may ENFILE). */
		error = futex_hold(f);
		if (error)
			f = NULL;
	}
	*fp = f;
	mutex_exit(&futex_tab.lock);

	return error;
}

/*
 * futex_insert(f, fp)
 *
 *	Try to insert the futex f into the tree by va.  If there
 *	already is a futex for its va, acquire a reference to it, and
 *	store it in *fp; otherwise store f in *fp.
 *
 *	Return 0 on success, ENFILE if there already is a futex but its
 *	reference count is too high.
 */
static int
futex_insert(struct futex *f, struct futex **fp)
{
	struct futex *f0;
	int error;

	KASSERT(atomic_load_relaxed(&f->fx_refcnt) != 0);
	KASSERT(!f->fx_on_tree);

	mutex_enter(&futex_tab.lock);
	if (__predict_false(f->fx_shared))
		f0 = rb_tree_insert_node(&futex_tab.oa, f);
	else
		f0 = rb_tree_insert_node(&futex_tab.va, f);
	if (f0 == f) {
		/* We won the insert; f is now visible in the tree. */
		f->fx_on_tree = true;
		error = 0;
	} else {
		/* Someone beat us to it; hand back a reference to theirs. */
		KASSERT(atomic_load_relaxed(&f0->fx_refcnt) != 0);
		KASSERT(f0->fx_on_tree);
		error = futex_hold(f0);
		if (error)
			goto out;
	}
	*fp = f0;
out:	mutex_exit(&futex_tab.lock);

	return error;
}

/*
 * futex_lookup(uaddr, shared, &f)
 *
 *	Find a futex at the userland pointer uaddr in the current
 *	process's VM space.  On success, return the futex in f and
 *	increment its reference count.
 *
 *	Caller must call futex_rele when done.
 */
static int
futex_lookup(int *uaddr, bool shared, struct futex **fp)
{
	union futex_key fk;
	struct vmspace *vm = curproc->p_vmspace;
	vaddr_t va = (vaddr_t)uaddr;
	int error;

	/*
	 * Reject unaligned user pointers so we don't cross page
	 * boundaries and so atomics will work.
	 */
	if ((va & 3) != 0)
		return EINVAL;

	/* Look it up. */
	error = futex_key_init(&fk, vm, va, shared);
	if (error)
		return error;

	error = futex_lookup_by_key(&fk, shared, fp);
	futex_key_fini(&fk, shared);
	if (error)
		return error;

	KASSERT(*fp == NULL || (*fp)->fx_shared == shared);
	KASSERT(*fp == NULL || atomic_load_relaxed(&(*fp)->fx_refcnt) != 0);

	/*
	 * Success!  (Caller must still check whether we found
	 * anything, but nothing went _wrong_ like trying to use
	 * unmapped memory.)
	 */
	KASSERT(error == 0);

	return error;
}

/*
 * futex_lookup_create(uaddr, shared, &f)
 *
 *	Find or create a futex at the userland pointer uaddr in the
 *	current process's VM space.  On success, return the futex in f
 *	and increment its reference count.
 *
 *	Caller must call futex_rele when done.
 */
static int
futex_lookup_create(int *uaddr, bool shared, struct futex **fp)
{
	union futex_key fk;
	struct vmspace *vm = curproc->p_vmspace;
	struct futex *f = NULL;
	vaddr_t va = (vaddr_t)uaddr;
	int error;

	/*
	 * Reject unaligned user pointers so we don't cross page
	 * boundaries and so atomics will work.
	 */
	if ((va & 3) != 0)
		return EINVAL;

	error = futex_key_init(&fk, vm, va, shared);
	if (error)
		return error;

	/*
	 * Optimistically assume there already is one, and try to find
	 * it.
	 */
	error = futex_lookup_by_key(&fk, shared, fp);
	if (error || *fp != NULL) {
		/*
		 * We either found one, or there was an error.
		 * In either case, we are done with the key.
		 */
		futex_key_fini(&fk, shared);
		goto out;
	}

	/*
	 * Create a futex record.  This transfers ownership of the key
	 * in all cases.
	 */
	f = futex_create(&fk, shared);
	if (f == NULL) {
		error = ENOMEM;
		goto out;
	}

	/*
	 * Insert our new futex, or use existing if someone else beat
	 * us to it.
	 */
	error = futex_insert(f, fp);
	if (error)
		goto out;
	if (*fp == f)
		f = NULL;	/* don't release on exit */

	/* Success! */
	KASSERT(error == 0);

out:	if (f != NULL)
		futex_rele(f);
	KASSERT(error || *fp != NULL);
	KASSERT(error || atomic_load_relaxed(&(*fp)->fx_refcnt) != 0);
	return error;
}

/*
 * futex_wait_init(fw, bitset)
 *
 *	Initialize a record for a thread to wait on a futex matching
 *	the specified bit set.
 *	Should be passed to futex_wait_enqueue
 *	before futex_wait, and should be passed to futex_wait_fini when
 *	done.
 */
static void
futex_wait_init(struct futex_wait *fw, int bitset)
{

	/* An empty bit set could never be woken -- forbid it. */
	KASSERT(bitset);

	mutex_init(&fw->fw_lock, MUTEX_DEFAULT, IPL_NONE);
	cv_init(&fw->fw_cv, "futex");
	fw->fw_futex = NULL;
	fw->fw_bitset = bitset;
	fw->fw_aborting = false;
}

/*
 * futex_wait_fini(fw)
 *
 *	Finalize a record for a futex waiter.  Must not be on any
 *	futex's queue.
 */
static void
futex_wait_fini(struct futex_wait *fw)
{

	KASSERT(fw->fw_futex == NULL);

	cv_destroy(&fw->fw_cv);
	mutex_destroy(&fw->fw_lock);
}

/*
 * futex_wait_enqueue(fw, f)
 *
 *	Put fw on the futex queue.  Must be done before futex_wait.
 *	Caller must hold fw's lock and f's lock, and fw must not be on
 *	any existing futex's waiter list.
 */
static void
futex_wait_enqueue(struct futex_wait *fw, struct futex *f)
{

	KASSERT(mutex_owned(&f->fx_qlock));
	KASSERT(mutex_owned(&fw->fw_lock));
	KASSERT(fw->fw_futex == NULL);
	KASSERT(!fw->fw_aborting);

	fw->fw_futex = f;
	TAILQ_INSERT_TAIL(&f->fx_queue, fw, fw_entry);
}

/*
 * futex_wait_dequeue(fw, f)
 *
 *	Remove fw from the futex queue.  Precludes subsequent
 *	futex_wait until a futex_wait_enqueue.  Caller must hold fw's
 *	lock and f's lock, and fw must be on f.
 */
static void
futex_wait_dequeue(struct futex_wait *fw, struct futex *f)
{

	KASSERT(mutex_owned(&f->fx_qlock));
	KASSERT(mutex_owned(&fw->fw_lock));
	KASSERT(fw->fw_futex == f);

	TAILQ_REMOVE(&f->fx_queue, fw, fw_entry);
	fw->fw_futex = NULL;
}

/*
 * futex_wait_abort(fw)
 *
 *	Caller is no longer waiting for fw.  Remove it from any queue
 *	if it was on one.  Caller must hold fw->fw_lock.
 *
 *	NOTE(review): fw_lock is released and reacquired internally;
 *	the lock order here is fx_qlock before fw_lock, so we cannot
 *	take fx_qlock while still holding fw_lock.
 */
static void
futex_wait_abort(struct futex_wait *fw)
{
	struct futex *f;

	KASSERT(mutex_owned(&fw->fw_lock));

	/*
	 * Grab the futex queue.  It can't go away as long as we hold
	 * fw_lock.  However, we can't take the queue lock because
	 * that's a lock order reversal.
	 */
	f = fw->fw_futex;

	/* Put us on the abort list so that fq won't go away. */
	mutex_enter(&f->fx_abortlock);
	LIST_INSERT_HEAD(&f->fx_abortlist, fw, fw_abort);
	mutex_exit(&f->fx_abortlock);

	/*
	 * Mark fw as aborting so it won't lose wakeups and won't be
	 * transferred to any other queue.
	 */
	fw->fw_aborting = true;

	/* f is now stable, so we can release fw_lock. */
	mutex_exit(&fw->fw_lock);

	/* Now we can remove fw under the queue lock. */
	mutex_enter(&f->fx_qlock);
	mutex_enter(&fw->fw_lock);
	futex_wait_dequeue(fw, f);
	mutex_exit(&fw->fw_lock);
	mutex_exit(&f->fx_qlock);

	/*
	 * Finally, remove us from the abort list and notify anyone
	 * waiting for the abort to complete if we were the last to go.
	 */
	mutex_enter(&f->fx_abortlock);
	LIST_REMOVE(fw, fw_abort);
	if (LIST_EMPTY(&f->fx_abortlist))
		cv_broadcast(&f->fx_abortcv);
	mutex_exit(&f->fx_abortlock);

	/*
	 * Release our reference to the futex now that we are not
	 * waiting for it.
	 */
	futex_rele(f);

	/*
	 * Reacquire the fw lock as caller expects.  Verify that we're
	 * aborting and no longer associated with a futex.
	 */
	mutex_enter(&fw->fw_lock);
	KASSERT(fw->fw_aborting);
	KASSERT(fw->fw_futex == NULL);
}

/*
 * futex_wait(fw, deadline, clkid)
 *
 *	fw must be a waiter on a futex's queue.  Wait until deadline on
 *	the clock clkid, or forever if deadline is NULL, for a futex
 *	wakeup.  Return 0 on explicit wakeup or destruction of futex,
 *	ETIMEDOUT on timeout, EINTR/ERESTART on signal.  Either way, fw
 *	will no longer be on a futex queue on return.
 */
static int
futex_wait(struct futex_wait *fw, const struct timespec *deadline,
    clockid_t clkid)
{
	int error = 0;

	/* Test and wait under the wait lock. */
	mutex_enter(&fw->fw_lock);

	for (;;) {
		/*
		 * If we're done yet, stop and report success.  A waker
		 * clears fw_bitset and/or detaches fw_futex under
		 * fw_lock, so this test is race-free here.
		 */
		if (fw->fw_bitset == 0 || fw->fw_futex == NULL) {
			error = 0;
			break;
		}

		/* If anything went wrong in the last iteration, stop. */
		if (error)
			break;

		/* Not done yet.  Wait. */
		if (deadline) {
			struct timespec ts;

			/* Check our watch. */
			error = clock_gettime1(clkid, &ts);
			if (error)
				break;

			/* If we're past the deadline, ETIMEDOUT. */
			if (timespeccmp(deadline, &ts, <=)) {
				error = ETIMEDOUT;
				break;
			}

			/* Count how much time is left. */
			timespecsub(deadline, &ts, &ts);

			/*
			 * Wait for that much time, allowing signals.
			 * Ignore EWOULDBLOCK, however: we will detect
			 * timeout ourselves on the next iteration of
			 * the loop -- and the timeout may have been
			 * truncated by tstohz, anyway.
			 */
			error = cv_timedwait_sig(&fw->fw_cv, &fw->fw_lock,
			    MAX(1, tstohz(&ts)));
			if (error == EWOULDBLOCK)
				error = 0;
		} else {
			/* Wait indefinitely, allowing signals. */
			error = cv_wait_sig(&fw->fw_cv, &fw->fw_lock);
		}
	}

	/*
	 * If we were woken up, the waker will have removed fw from the
	 * queue.  But if anything went wrong, we must remove fw from
	 * the queue ourselves.  (futex_wait_abort drops and reacquires
	 * fw_lock internally.)
	 */
	if (error)
		futex_wait_abort(fw);

	mutex_exit(&fw->fw_lock);

	return error;
}

/*
 * futex_wake(f, nwake, f2, nrequeue, bitset)
 *
 *	Wake up to nwake waiters on f matching bitset; then, if f2 is
 *	provided, move up to nrequeue remaining waiters on f matching
 *	bitset to f2.  Return the number of waiters actually woken or
 *	requeued.  Caller must hold the locks of f and f2, if provided.
 */
static unsigned
futex_wake(struct futex *f, unsigned nwake, struct futex *f2,
    unsigned nrequeue, int bitset)
{
	struct futex_wait *fw, *fw_next;
	unsigned nwoken_or_requeued = 0;
	int hold_error __diagused;

	KASSERT(mutex_owned(&f->fx_qlock));
	KASSERT(f2 == NULL || mutex_owned(&f2->fx_qlock));

	/* Wake up to nwake waiters, and count the number woken. */
	TAILQ_FOREACH_SAFE(fw, &f->fx_queue, fw_entry, fw_next) {
		if ((fw->fw_bitset & bitset) == 0)
			continue;
		if (nwake > 0) {
			mutex_enter(&fw->fw_lock);
			/*
			 * Skip aborting waiters: they are in the
			 * process of removing themselves and manage
			 * their own futex reference.
			 */
			if (__predict_false(fw->fw_aborting)) {
				mutex_exit(&fw->fw_lock);
				continue;
			}
			futex_wait_dequeue(fw, f);
			fw->fw_bitset = 0;
			cv_broadcast(&fw->fw_cv);
			mutex_exit(&fw->fw_lock);
			nwake--;
			nwoken_or_requeued++;
			/*
			 * Drop the futex reference on behalf of the
			 * waiter.  We assert this is not the last
			 * reference on the futex (our caller should
			 * also have one).
			 */
			futex_rele_not_last(f);
		} else {
			break;
		}
	}

	if (f2) {
		/* Move up to nrequeue waiters from f's queue to f2's queue. */
		TAILQ_FOREACH_SAFE(fw, &f->fx_queue, fw_entry, fw_next) {
			if ((fw->fw_bitset & bitset) == 0)
				continue;
			if (nrequeue > 0) {
				mutex_enter(&fw->fw_lock);
				/* As above: aborting waiters stay put. */
				if (__predict_false(fw->fw_aborting)) {
					mutex_exit(&fw->fw_lock);
					continue;
				}
				futex_wait_dequeue(fw, f);
				futex_wait_enqueue(fw, f2);
				mutex_exit(&fw->fw_lock);
				nrequeue--;
				/*
				 * PR kern/59004: Missing constant for upper
				 * bound on systemwide number of lwps
				 */
				KASSERT(nwoken_or_requeued <
				    MIN(PID_MAX*MAXMAXLWP, FUTEX_TID_MASK));
				__CTASSERT(UINT_MAX >=
				    MIN(PID_MAX*MAXMAXLWP, FUTEX_TID_MASK));
				if (++nwoken_or_requeued == 0) /* paranoia */
					nwoken_or_requeued = UINT_MAX;
				/*
				 * Transfer the reference from f to f2.
				 * As above, we assert that we are not
				 * dropping the last reference to f here.
				 *
				 * XXX futex_hold() could theoretically
				 * XXX fail here.
				 */
				futex_rele_not_last(f);
				hold_error = futex_hold(f2);
				KASSERT(hold_error == 0);
			} else {
				break;
			}
		}
	} else {
		KASSERT(nrequeue == 0);
	}

	/* Return the number of waiters woken or requeued. */
	return nwoken_or_requeued;
}

/*
 * futex_queue_lock(f)
 *
 *	Acquire the queue lock of f.  Pair with futex_queue_unlock.  Do
Do 1098 1.1 thorpej * not use if caller needs to acquire two locks; use 1099 1.1 thorpej * futex_queue_lock2 instead. 1100 1.1 thorpej */ 1101 1.1 thorpej static void 1102 1.1 thorpej futex_queue_lock(struct futex *f) 1103 1.1 thorpej { 1104 1.1 thorpej mutex_enter(&f->fx_qlock); 1105 1.1 thorpej } 1106 1.1 thorpej 1107 1.1 thorpej /* 1108 1.1 thorpej * futex_queue_unlock(f) 1109 1.1 thorpej * 1110 1.1 thorpej * Release the queue lock of f. 1111 1.1 thorpej */ 1112 1.1 thorpej static void 1113 1.1 thorpej futex_queue_unlock(struct futex *f) 1114 1.1 thorpej { 1115 1.1 thorpej mutex_exit(&f->fx_qlock); 1116 1.1 thorpej } 1117 1.1 thorpej 1118 1.1 thorpej /* 1119 1.1 thorpej * futex_queue_lock2(f, f2) 1120 1.1 thorpej * 1121 1.1 thorpej * Acquire the queue locks of both f and f2, which may be null, or 1122 1.1 thorpej * which may have the same underlying queue. If they are 1123 1.1 thorpej * distinct, an arbitrary total order is chosen on the locks. 1124 1.1 thorpej * 1125 1.1 thorpej * Callers should only ever acquire multiple queue locks 1126 1.1 thorpej * simultaneously using futex_queue_lock2. 1127 1.1 thorpej */ 1128 1.1 thorpej static void 1129 1.1 thorpej futex_queue_lock2(struct futex *f, struct futex *f2) 1130 1.1 thorpej { 1131 1.1 thorpej 1132 1.1 thorpej /* 1133 1.1 thorpej * If both are null, do nothing; if one is null and the other 1134 1.1 thorpej * is not, lock the other and be done with it. 1135 1.1 thorpej */ 1136 1.1 thorpej if (f == NULL && f2 == NULL) { 1137 1.1 thorpej return; 1138 1.1 thorpej } else if (f == NULL) { 1139 1.1 thorpej mutex_enter(&f2->fx_qlock); 1140 1.1 thorpej return; 1141 1.1 thorpej } else if (f2 == NULL) { 1142 1.1 thorpej mutex_enter(&f->fx_qlock); 1143 1.1 thorpej return; 1144 1.1 thorpej } 1145 1.1 thorpej 1146 1.1 thorpej /* If both futexes are the same, acquire only one. 
*/ 1147 1.1 thorpej if (f == f2) { 1148 1.1 thorpej mutex_enter(&f->fx_qlock); 1149 1.1 thorpej return; 1150 1.1 thorpej } 1151 1.1 thorpej 1152 1.1 thorpej /* Otherwise, use the ordering on the kva of the futex pointer. */ 1153 1.1 thorpej if ((uintptr_t)f < (uintptr_t)f2) { 1154 1.1 thorpej mutex_enter(&f->fx_qlock); 1155 1.1 thorpej mutex_enter(&f2->fx_qlock); 1156 1.1 thorpej } else { 1157 1.1 thorpej mutex_enter(&f2->fx_qlock); 1158 1.1 thorpej mutex_enter(&f->fx_qlock); 1159 1.1 thorpej } 1160 1.1 thorpej } 1161 1.1 thorpej 1162 1.1 thorpej /* 1163 1.1 thorpej * futex_queue_unlock2(f, f2) 1164 1.1 thorpej * 1165 1.1 thorpej * Release the queue locks of both f and f2, which may be null, or 1166 1.1 thorpej * which may have the same underlying queue. 1167 1.1 thorpej */ 1168 1.1 thorpej static void 1169 1.1 thorpej futex_queue_unlock2(struct futex *f, struct futex *f2) 1170 1.1 thorpej { 1171 1.1 thorpej 1172 1.1 thorpej /* 1173 1.1 thorpej * If both are null, do nothing; if one is null and the other 1174 1.1 thorpej * is not, unlock the other and be done with it. 1175 1.1 thorpej */ 1176 1.1 thorpej if (f == NULL && f2 == NULL) { 1177 1.1 thorpej return; 1178 1.1 thorpej } else if (f == NULL) { 1179 1.1 thorpej mutex_exit(&f2->fx_qlock); 1180 1.1 thorpej return; 1181 1.1 thorpej } else if (f2 == NULL) { 1182 1.1 thorpej mutex_exit(&f->fx_qlock); 1183 1.1 thorpej return; 1184 1.1 thorpej } 1185 1.1 thorpej 1186 1.1 thorpej /* If both futexes are the same, release only one. */ 1187 1.1 thorpej if (f == f2) { 1188 1.1 thorpej mutex_exit(&f->fx_qlock); 1189 1.1 thorpej return; 1190 1.1 thorpej } 1191 1.1 thorpej 1192 1.1 thorpej /* Otherwise, use the ordering on the kva of the futex pointer. 
*/ 1193 1.1 thorpej if ((uintptr_t)f < (uintptr_t)f2) { 1194 1.1 thorpej mutex_exit(&f2->fx_qlock); 1195 1.1 thorpej mutex_exit(&f->fx_qlock); 1196 1.1 thorpej } else { 1197 1.1 thorpej mutex_exit(&f->fx_qlock); 1198 1.1 thorpej mutex_exit(&f2->fx_qlock); 1199 1.1 thorpej } 1200 1.1 thorpej } 1201 1.1 thorpej 1202 1.1 thorpej /* 1203 1.25 riastrad * futex_func_wait(uaddr, cmpval@val, bitset@val3, timeout, clkid, clkflags, 1204 1.25 riastrad * retval) 1205 1.1 thorpej * 1206 1.25 riastrad * Implement futex(FUTEX_WAIT) and futex(FUTEX_WAIT_BITSET): If 1207 1.25 riastrad * *uaddr == cmpval, wait until futex-woken on any of the bits in 1208 1.25 riastrad * bitset. But if *uaddr != cmpval, fail with EAGAIN. 1209 1.25 riastrad * 1210 1.25 riastrad * For FUTEX_WAIT, bitset has all bits set and val3 is ignored. 1211 1.1 thorpej */ 1212 1.1 thorpej static int 1213 1.25 riastrad futex_func_wait(bool shared, int *uaddr, int cmpval, int bitset, 1214 1.11 riastrad const struct timespec *timeout, clockid_t clkid, int clkflags, 1215 1.1 thorpej register_t *retval) 1216 1.1 thorpej { 1217 1.1 thorpej struct futex *f; 1218 1.1 thorpej struct futex_wait wait, *fw = &wait; 1219 1.11 riastrad struct timespec ts; 1220 1.11 riastrad const struct timespec *deadline; 1221 1.1 thorpej int error; 1222 1.1 thorpej 1223 1.6 riastrad /* 1224 1.6 riastrad * If there's nothing to wait for, and nobody will ever wake 1225 1.6 riastrad * us, then don't set anything up to wait -- just stop here. 1226 1.6 riastrad */ 1227 1.25 riastrad if (bitset == 0) 1228 1.7 riastrad return EINVAL; 1229 1.6 riastrad 1230 1.1 thorpej /* Optimistically test before anything else. */ 1231 1.25 riastrad if (!futex_test(uaddr, cmpval)) 1232 1.1 thorpej return EAGAIN; 1233 1.1 thorpej 1234 1.11 riastrad /* Determine a deadline on the specified clock. 
*/ 1235 1.11 riastrad if (timeout == NULL || (clkflags & TIMER_ABSTIME) == TIMER_ABSTIME) { 1236 1.11 riastrad deadline = timeout; 1237 1.11 riastrad } else { 1238 1.11 riastrad error = clock_gettime1(clkid, &ts); 1239 1.11 riastrad if (error) 1240 1.11 riastrad return error; 1241 1.11 riastrad timespecadd(&ts, timeout, &ts); 1242 1.11 riastrad deadline = &ts; 1243 1.11 riastrad } 1244 1.11 riastrad 1245 1.1 thorpej /* Get the futex, creating it if necessary. */ 1246 1.5 riastrad error = futex_lookup_create(uaddr, shared, &f); 1247 1.1 thorpej if (error) 1248 1.1 thorpej return error; 1249 1.1 thorpej KASSERT(f); 1250 1.1 thorpej 1251 1.1 thorpej /* Get ready to wait. */ 1252 1.25 riastrad futex_wait_init(fw, bitset); 1253 1.1 thorpej 1254 1.1 thorpej /* 1255 1.1 thorpej * Under the queue lock, check the value again: if it has 1256 1.1 thorpej * already changed, EAGAIN; otherwise enqueue the waiter. 1257 1.1 thorpej * Since FUTEX_WAKE will use the same lock and be done after 1258 1.1 thorpej * modifying the value, the order in which we check and enqueue 1259 1.1 thorpej * is immaterial. 1260 1.1 thorpej */ 1261 1.1 thorpej futex_queue_lock(f); 1262 1.25 riastrad if (!futex_test(uaddr, cmpval)) { 1263 1.1 thorpej futex_queue_unlock(f); 1264 1.1 thorpej error = EAGAIN; 1265 1.1 thorpej goto out; 1266 1.1 thorpej } 1267 1.1 thorpej mutex_enter(&fw->fw_lock); 1268 1.1 thorpej futex_wait_enqueue(fw, f); 1269 1.1 thorpej mutex_exit(&fw->fw_lock); 1270 1.1 thorpej futex_queue_unlock(f); 1271 1.1 thorpej 1272 1.1 thorpej /* 1273 1.1 thorpej * We cannot drop our reference to the futex here, because 1274 1.1 thorpej * we might be enqueued on a different one when we are awakened. 1275 1.1 thorpej * The references will be managed on our behalf in the requeue 1276 1.1 thorpej * and wake cases. 1277 1.1 thorpej */ 1278 1.1 thorpej f = NULL; 1279 1.1 thorpej 1280 1.1 thorpej /* Wait. 
*/ 1281 1.11 riastrad error = futex_wait(fw, deadline, clkid); 1282 1.4 riastrad if (error) 1283 1.1 thorpej goto out; 1284 1.1 thorpej 1285 1.1 thorpej /* Return 0 on success, error on failure. */ 1286 1.1 thorpej *retval = 0; 1287 1.1 thorpej 1288 1.1 thorpej out: if (f != NULL) 1289 1.5 riastrad futex_rele(f); 1290 1.1 thorpej futex_wait_fini(fw); 1291 1.1 thorpej return error; 1292 1.1 thorpej } 1293 1.1 thorpej 1294 1.1 thorpej /* 1295 1.25 riastrad * futex_func_wake(uaddr, nwake@val, bitset@val3, retval) 1296 1.25 riastrad * 1297 1.25 riastrad * Implement futex(FUTEX_WAKE) and futex(FUTEX_WAKE_BITSET): Wake 1298 1.25 riastrad * up to nwake waiters on uaddr waiting on any of the bits in 1299 1.25 riastrad * bitset. 1300 1.25 riastrad * 1301 1.25 riastrad * Return the number of waiters woken. 1302 1.1 thorpej * 1303 1.25 riastrad * For FUTEX_WAKE, bitset has all bits set and val3 is ignored. 1304 1.1 thorpej */ 1305 1.1 thorpej static int 1306 1.25 riastrad futex_func_wake(bool shared, int *uaddr, int nwake, int bitset, 1307 1.25 riastrad register_t *retval) 1308 1.1 thorpej { 1309 1.1 thorpej struct futex *f; 1310 1.1 thorpej unsigned int nwoken = 0; 1311 1.1 thorpej int error = 0; 1312 1.1 thorpej 1313 1.1 thorpej /* Reject negative number of wakeups. */ 1314 1.25 riastrad if (nwake < 0) { 1315 1.1 thorpej error = EINVAL; 1316 1.1 thorpej goto out; 1317 1.1 thorpej } 1318 1.1 thorpej 1319 1.1 thorpej /* Look up the futex, if any. */ 1320 1.1 thorpej error = futex_lookup(uaddr, shared, &f); 1321 1.1 thorpej if (error) 1322 1.1 thorpej goto out; 1323 1.1 thorpej 1324 1.1 thorpej /* If there's no futex, there are no waiters to wake. */ 1325 1.1 thorpej if (f == NULL) 1326 1.1 thorpej goto out; 1327 1.1 thorpej 1328 1.1 thorpej /* 1329 1.1 thorpej * Under f's queue lock, wake the waiters and remember the 1330 1.1 thorpej * number woken. 
1331 1.1 thorpej */ 1332 1.1 thorpej futex_queue_lock(f); 1333 1.25 riastrad nwoken = futex_wake(f, nwake, NULL, /*nrequeue*/0, bitset); 1334 1.1 thorpej futex_queue_unlock(f); 1335 1.1 thorpej 1336 1.1 thorpej /* Release the futex. */ 1337 1.5 riastrad futex_rele(f); 1338 1.1 thorpej 1339 1.1 thorpej out: 1340 1.1 thorpej /* Return the number of waiters woken. */ 1341 1.1 thorpej *retval = nwoken; 1342 1.1 thorpej 1343 1.1 thorpej /* Success! */ 1344 1.1 thorpej return error; 1345 1.1 thorpej } 1346 1.1 thorpej 1347 1.1 thorpej /* 1348 1.25 riastrad * futex_func_requeue(op, uaddr, nwake@val, uaddr2, nrequeue@val2, 1349 1.25 riastrad * cmpval@val3, retval) 1350 1.25 riastrad * 1351 1.25 riastrad * Implement futex(FUTEX_REQUEUE) and futex(FUTEX_CMP_REQUEUE): If 1352 1.25 riastrad * *uaddr == cmpval or if op == FUTEX_REQUEUE, wake up to nwake 1353 1.25 riastrad * waiters at uaddr and then requeue up to nrequeue waiters from 1354 1.25 riastrad * uaddr to uaddr2. 1355 1.1 thorpej * 1356 1.25 riastrad * Return the number of waiters woken or requeued. 1357 1.25 riastrad * 1358 1.25 riastrad * For FUTEX_CMP_REQUEUE, if *uaddr != cmpval, fail with EAGAIN 1359 1.25 riastrad * and no wakeups. 1360 1.1 thorpej */ 1361 1.1 thorpej static int 1362 1.25 riastrad futex_func_requeue(bool shared, int op, int *uaddr, int nwake, int *uaddr2, 1363 1.25 riastrad int nrequeue, int cmpval, register_t *retval) 1364 1.1 thorpej { 1365 1.1 thorpej struct futex *f = NULL, *f2 = NULL; 1366 1.22 riastrad unsigned nwoken_or_requeued = 0; /* default to zero on early return */ 1367 1.1 thorpej int error; 1368 1.1 thorpej 1369 1.1 thorpej /* Reject negative number of wakeups or requeues. */ 1370 1.25 riastrad if (nwake < 0 || nrequeue < 0) { 1371 1.1 thorpej error = EINVAL; 1372 1.1 thorpej goto out; 1373 1.1 thorpej } 1374 1.1 thorpej 1375 1.21 riastrad /* 1376 1.21 riastrad * Look up or create the source futex. 
For FUTEX_CMP_REQUEUE, 1377 1.21 riastrad * we always create it, rather than bail if it has no waiters, 1378 1.21 riastrad * because FUTEX_CMP_REQUEUE always tests the futex word in 1379 1.21 riastrad * order to report EAGAIN. 1380 1.21 riastrad */ 1381 1.21 riastrad error = (op == FUTEX_CMP_REQUEUE 1382 1.21 riastrad ? futex_lookup_create(uaddr, shared, &f) 1383 1.21 riastrad : futex_lookup(uaddr, shared, &f)); 1384 1.1 thorpej if (error) 1385 1.1 thorpej goto out; 1386 1.1 thorpej 1387 1.21 riastrad /* If there is none for FUTEX_REQUEUE, nothing to do. */ 1388 1.21 riastrad if (f == NULL) { 1389 1.21 riastrad KASSERT(op != FUTEX_CMP_REQUEUE); 1390 1.1 thorpej goto out; 1391 1.21 riastrad } 1392 1.1 thorpej 1393 1.1 thorpej /* 1394 1.1 thorpej * We may need to create the destination futex because it's 1395 1.1 thorpej * entirely possible it does not currently have any waiters. 1396 1.1 thorpej */ 1397 1.5 riastrad error = futex_lookup_create(uaddr2, shared, &f2); 1398 1.1 thorpej if (error) 1399 1.1 thorpej goto out; 1400 1.1 thorpej 1401 1.1 thorpej /* 1402 1.1 thorpej * Under the futexes' queue locks, check the value; if 1403 1.25 riastrad * unchanged from cmpval, or if this is the unconditional 1404 1.25 riastrad * FUTEX_REQUEUE operation, wake the waiters. 1405 1.1 thorpej */ 1406 1.1 thorpej futex_queue_lock2(f, f2); 1407 1.25 riastrad if (op == FUTEX_CMP_REQUEUE && !futex_test(uaddr, cmpval)) { 1408 1.1 thorpej error = EAGAIN; 1409 1.1 thorpej } else { 1410 1.1 thorpej error = 0; 1411 1.25 riastrad nwoken_or_requeued = futex_wake(f, nwake, f2, nrequeue, 1412 1.22 riastrad FUTEX_BITSET_MATCH_ANY); 1413 1.1 thorpej } 1414 1.1 thorpej futex_queue_unlock2(f, f2); 1415 1.1 thorpej 1416 1.1 thorpej out: 1417 1.22 riastrad /* Return the number of waiters woken or requeued. */ 1418 1.22 riastrad *retval = nwoken_or_requeued; 1419 1.1 thorpej 1420 1.1 thorpej /* Release the futexes if we got them. 
*/ 1421 1.1 thorpej if (f2) 1422 1.5 riastrad futex_rele(f2); 1423 1.1 thorpej if (f) 1424 1.5 riastrad futex_rele(f); 1425 1.1 thorpej return error; 1426 1.1 thorpej } 1427 1.1 thorpej 1428 1.1 thorpej /* 1429 1.23 riastrad * futex_opcmp_arg(arg) 1430 1.23 riastrad * 1431 1.23 riastrad * arg is either the oparg or cmparg field of a FUTEX_WAKE_OP 1432 1.23 riastrad * operation, a 12-bit string in either case. Map it to a numeric 1433 1.23 riastrad * argument value by sign-extending it in two's-complement 1434 1.23 riastrad * representation. 1435 1.23 riastrad */ 1436 1.23 riastrad static int 1437 1.23 riastrad futex_opcmp_arg(int arg) 1438 1.23 riastrad { 1439 1.23 riastrad 1440 1.23 riastrad KASSERT(arg == (arg & __BITS(11,0))); 1441 1.23 riastrad return arg - 0x1000*__SHIFTOUT(arg, __BIT(11)); 1442 1.23 riastrad } 1443 1.23 riastrad 1444 1.23 riastrad /* 1445 1.25 riastrad * futex_validate_op_cmp(opcmp) 1446 1.1 thorpej * 1447 1.1 thorpej * Validate an op/cmp argument for FUTEX_WAKE_OP. 
1448 1.1 thorpej */ 1449 1.1 thorpej static int 1450 1.25 riastrad futex_validate_op_cmp(int opcmp) 1451 1.1 thorpej { 1452 1.25 riastrad int op = __SHIFTOUT(opcmp, FUTEX_OP_OP_MASK); 1453 1.25 riastrad int cmp = __SHIFTOUT(opcmp, FUTEX_OP_CMP_MASK); 1454 1.1 thorpej 1455 1.1 thorpej if (op & FUTEX_OP_OPARG_SHIFT) { 1456 1.23 riastrad int oparg = 1457 1.25 riastrad futex_opcmp_arg(__SHIFTOUT(opcmp, FUTEX_OP_OPARG_MASK)); 1458 1.1 thorpej if (oparg < 0) 1459 1.1 thorpej return EINVAL; 1460 1.1 thorpej if (oparg >= 32) 1461 1.1 thorpej return EINVAL; 1462 1.1 thorpej op &= ~FUTEX_OP_OPARG_SHIFT; 1463 1.1 thorpej } 1464 1.1 thorpej 1465 1.1 thorpej switch (op) { 1466 1.1 thorpej case FUTEX_OP_SET: 1467 1.1 thorpej case FUTEX_OP_ADD: 1468 1.1 thorpej case FUTEX_OP_OR: 1469 1.1 thorpej case FUTEX_OP_ANDN: 1470 1.1 thorpej case FUTEX_OP_XOR: 1471 1.1 thorpej break; 1472 1.1 thorpej default: 1473 1.1 thorpej return EINVAL; 1474 1.1 thorpej } 1475 1.1 thorpej 1476 1.1 thorpej switch (cmp) { 1477 1.1 thorpej case FUTEX_OP_CMP_EQ: 1478 1.1 thorpej case FUTEX_OP_CMP_NE: 1479 1.1 thorpej case FUTEX_OP_CMP_LT: 1480 1.1 thorpej case FUTEX_OP_CMP_LE: 1481 1.1 thorpej case FUTEX_OP_CMP_GT: 1482 1.1 thorpej case FUTEX_OP_CMP_GE: 1483 1.1 thorpej break; 1484 1.1 thorpej default: 1485 1.1 thorpej return EINVAL; 1486 1.1 thorpej } 1487 1.1 thorpej 1488 1.1 thorpej return 0; 1489 1.1 thorpej } 1490 1.1 thorpej 1491 1.1 thorpej /* 1492 1.25 riastrad * futex_compute_op(oldval, opcmp) 1493 1.1 thorpej * 1494 1.23 riastrad * Apply a FUTEX_WAKE_OP operation to oldval. 
1495 1.1 thorpej */ 1496 1.1 thorpej static int 1497 1.25 riastrad futex_compute_op(int oldval, int opcmp) 1498 1.1 thorpej { 1499 1.25 riastrad int op = __SHIFTOUT(opcmp, FUTEX_OP_OP_MASK); 1500 1.25 riastrad int oparg = futex_opcmp_arg(__SHIFTOUT(opcmp, FUTEX_OP_OPARG_MASK)); 1501 1.1 thorpej 1502 1.1 thorpej if (op & FUTEX_OP_OPARG_SHIFT) { 1503 1.1 thorpej KASSERT(oparg >= 0); 1504 1.1 thorpej KASSERT(oparg < 32); 1505 1.1 thorpej oparg = 1u << oparg; 1506 1.1 thorpej op &= ~FUTEX_OP_OPARG_SHIFT; 1507 1.1 thorpej } 1508 1.1 thorpej 1509 1.1 thorpej switch (op) { 1510 1.1 thorpej case FUTEX_OP_SET: 1511 1.1 thorpej return oparg; 1512 1.1 thorpej 1513 1.1 thorpej case FUTEX_OP_ADD: 1514 1.1 thorpej /* 1515 1.1 thorpej * Avoid signed arithmetic overflow by doing 1516 1.1 thorpej * arithmetic unsigned and converting back to signed 1517 1.1 thorpej * at the end. 1518 1.1 thorpej */ 1519 1.1 thorpej return (int)((unsigned)oldval + (unsigned)oparg); 1520 1.1 thorpej 1521 1.1 thorpej case FUTEX_OP_OR: 1522 1.1 thorpej return oldval | oparg; 1523 1.1 thorpej 1524 1.1 thorpej case FUTEX_OP_ANDN: 1525 1.1 thorpej return oldval & ~oparg; 1526 1.1 thorpej 1527 1.1 thorpej case FUTEX_OP_XOR: 1528 1.1 thorpej return oldval ^ oparg; 1529 1.1 thorpej 1530 1.1 thorpej default: 1531 1.1 thorpej panic("invalid futex op"); 1532 1.1 thorpej } 1533 1.1 thorpej } 1534 1.1 thorpej 1535 1.1 thorpej /* 1536 1.25 riastrad * futex_compute_cmp(oldval, opcmp) 1537 1.1 thorpej * 1538 1.23 riastrad * Apply a FUTEX_WAKE_OP comparison to oldval. 
1539 1.1 thorpej */ 1540 1.1 thorpej static bool 1541 1.25 riastrad futex_compute_cmp(int oldval, int opcmp) 1542 1.1 thorpej { 1543 1.25 riastrad int cmp = __SHIFTOUT(opcmp, FUTEX_OP_CMP_MASK); 1544 1.25 riastrad int cmparg = futex_opcmp_arg(__SHIFTOUT(opcmp, FUTEX_OP_CMPARG_MASK)); 1545 1.1 thorpej 1546 1.1 thorpej switch (cmp) { 1547 1.1 thorpej case FUTEX_OP_CMP_EQ: 1548 1.1 thorpej return (oldval == cmparg); 1549 1.1 thorpej 1550 1.1 thorpej case FUTEX_OP_CMP_NE: 1551 1.1 thorpej return (oldval != cmparg); 1552 1.1 thorpej 1553 1.1 thorpej case FUTEX_OP_CMP_LT: 1554 1.1 thorpej return (oldval < cmparg); 1555 1.1 thorpej 1556 1.1 thorpej case FUTEX_OP_CMP_LE: 1557 1.1 thorpej return (oldval <= cmparg); 1558 1.1 thorpej 1559 1.1 thorpej case FUTEX_OP_CMP_GT: 1560 1.1 thorpej return (oldval > cmparg); 1561 1.1 thorpej 1562 1.1 thorpej case FUTEX_OP_CMP_GE: 1563 1.1 thorpej return (oldval >= cmparg); 1564 1.1 thorpej 1565 1.1 thorpej default: 1566 1.1 thorpej panic("invalid futex cmp operation"); 1567 1.1 thorpej } 1568 1.1 thorpej } 1569 1.1 thorpej 1570 1.1 thorpej /* 1571 1.25 riastrad * futex_func_wake_op(uaddr, nwake@val, uaddr2, nwake2@val2, opcmp@val3, 1572 1.25 riastrad * retval) 1573 1.25 riastrad * 1574 1.25 riastrad * Implement futex(FUTEX_WAKE_OP): 1575 1.25 riastrad * 1576 1.25 riastrad * 1. Update *uaddr2 according to the r/m/w operation specified in 1577 1.25 riastrad * opcmp. 1578 1.25 riastrad * 1579 1.25 riastrad * 2. If uaddr is nonnull, wake up to nwake waiters at uaddr. 1580 1.1 thorpej * 1581 1.25 riastrad * 3. If what was previously at *uaddr2 matches the comparison 1582 1.25 riastrad * operation specified in opcmp, additionally wake up to nwake2 1583 1.25 riastrad * waiters at uaddr2. 
1584 1.1 thorpej */ 1585 1.1 thorpej static int 1586 1.25 riastrad futex_func_wake_op(bool shared, int *uaddr, int nwake, int *uaddr2, int nwake2, 1587 1.25 riastrad int opcmp, register_t *retval) 1588 1.1 thorpej { 1589 1.1 thorpej struct futex *f = NULL, *f2 = NULL; 1590 1.1 thorpej int oldval, newval, actual; 1591 1.1 thorpej unsigned nwoken = 0; 1592 1.1 thorpej int error; 1593 1.1 thorpej 1594 1.1 thorpej /* Reject negative number of wakeups. */ 1595 1.25 riastrad if (nwake < 0 || nwake2 < 0) { 1596 1.1 thorpej error = EINVAL; 1597 1.1 thorpej goto out; 1598 1.1 thorpej } 1599 1.1 thorpej 1600 1.1 thorpej /* Reject invalid operations before we start doing things. */ 1601 1.25 riastrad if ((error = futex_validate_op_cmp(opcmp)) != 0) 1602 1.1 thorpej goto out; 1603 1.1 thorpej 1604 1.1 thorpej /* Look up the first futex, if any. */ 1605 1.1 thorpej error = futex_lookup(uaddr, shared, &f); 1606 1.1 thorpej if (error) 1607 1.1 thorpej goto out; 1608 1.1 thorpej 1609 1.1 thorpej /* Look up the second futex, if any. */ 1610 1.1 thorpej error = futex_lookup(uaddr2, shared, &f2); 1611 1.1 thorpej if (error) 1612 1.1 thorpej goto out; 1613 1.1 thorpej 1614 1.1 thorpej /* 1615 1.1 thorpej * Under the queue locks: 1616 1.1 thorpej * 1617 1.25 riastrad * 1. Read/modify/write: *uaddr2 op= oparg, as in opcmp. 1618 1.1 thorpej * 2. Unconditionally wake uaddr. 1619 1.25 riastrad * 3. Conditionally wake uaddr2, if it previously matched the 1620 1.25 riastrad * comparison in opcmp. 
1621 1.1 thorpej */ 1622 1.1 thorpej futex_queue_lock2(f, f2); 1623 1.1 thorpej do { 1624 1.1 thorpej error = futex_load(uaddr2, &oldval); 1625 1.1 thorpej if (error) 1626 1.1 thorpej goto out_unlock; 1627 1.25 riastrad newval = futex_compute_op(oldval, opcmp); 1628 1.1 thorpej error = ucas_int(uaddr2, oldval, newval, &actual); 1629 1.1 thorpej if (error) 1630 1.1 thorpej goto out_unlock; 1631 1.1 thorpej } while (actual != oldval); 1632 1.22 riastrad if (f == NULL) { 1633 1.22 riastrad nwoken = 0; 1634 1.22 riastrad } else { 1635 1.25 riastrad nwoken = futex_wake(f, nwake, NULL, /*nrequeue*/0, 1636 1.22 riastrad FUTEX_BITSET_MATCH_ANY); 1637 1.22 riastrad } 1638 1.25 riastrad if (f2 && futex_compute_cmp(oldval, opcmp)) { 1639 1.25 riastrad nwoken += futex_wake(f2, nwake2, NULL, /*nrequeue*/0, 1640 1.1 thorpej FUTEX_BITSET_MATCH_ANY); 1641 1.22 riastrad } 1642 1.1 thorpej 1643 1.1 thorpej /* Success! */ 1644 1.1 thorpej error = 0; 1645 1.1 thorpej out_unlock: 1646 1.1 thorpej futex_queue_unlock2(f, f2); 1647 1.1 thorpej 1648 1.1 thorpej out: 1649 1.1 thorpej /* Return the number of waiters woken. */ 1650 1.1 thorpej *retval = nwoken; 1651 1.1 thorpej 1652 1.1 thorpej /* Release the futexes, if we got them. */ 1653 1.1 thorpej if (f2) 1654 1.5 riastrad futex_rele(f2); 1655 1.1 thorpej if (f) 1656 1.5 riastrad futex_rele(f); 1657 1.1 thorpej return error; 1658 1.1 thorpej } 1659 1.1 thorpej 1660 1.1 thorpej /* 1661 1.1 thorpej * do_futex(uaddr, op, val, timeout, uaddr2, val2, val3) 1662 1.1 thorpej * 1663 1.1 thorpej * Implement the futex system call with all the parameters 1664 1.1 thorpej * parsed out. 1665 1.1 thorpej */ 1666 1.1 thorpej int 1667 1.11 riastrad do_futex(int *uaddr, int op, int val, const struct timespec *timeout, 1668 1.11 riastrad int *uaddr2, int val2, int val3, register_t *retval) 1669 1.1 thorpej { 1670 1.1 thorpej const bool shared = (op & FUTEX_PRIVATE_FLAG) ? 
false : true; 1671 1.1 thorpej const clockid_t clkid = (op & FUTEX_CLOCK_REALTIME) ? CLOCK_REALTIME 1672 1.1 thorpej : CLOCK_MONOTONIC; 1673 1.1 thorpej 1674 1.1 thorpej op &= FUTEX_CMD_MASK; 1675 1.1 thorpej 1676 1.1 thorpej switch (op) { 1677 1.25 riastrad case FUTEX_WAIT: { 1678 1.25 riastrad const int cmpval = val; 1679 1.25 riastrad const int bitset = FUTEX_BITSET_MATCH_ANY; 1680 1.1 thorpej 1681 1.25 riastrad return futex_func_wait(shared, uaddr, cmpval, bitset, timeout, 1682 1.25 riastrad clkid, TIMER_RELTIME, retval); 1683 1.25 riastrad } 1684 1.25 riastrad case FUTEX_WAKE: { 1685 1.25 riastrad const int nwake = val; 1686 1.25 riastrad const int bitset = FUTEX_BITSET_MATCH_ANY; 1687 1.25 riastrad 1688 1.25 riastrad return futex_func_wake(shared, uaddr, nwake, bitset, retval); 1689 1.25 riastrad } 1690 1.25 riastrad case FUTEX_WAKE_BITSET: { 1691 1.25 riastrad const int nwake = val; 1692 1.25 riastrad const int bitset = val3; 1693 1.25 riastrad 1694 1.25 riastrad return futex_func_wake(shared, uaddr, nwake, bitset, retval); 1695 1.25 riastrad } 1696 1.1 thorpej case FUTEX_REQUEUE: 1697 1.25 riastrad case FUTEX_CMP_REQUEUE: { 1698 1.25 riastrad const int nwake = val; 1699 1.25 riastrad const int nrequeue = val2; 1700 1.25 riastrad const int cmpval = val3; /* ignored if op=FUTEX_REQUEUE */ 1701 1.25 riastrad 1702 1.25 riastrad return futex_func_requeue(shared, op, uaddr, nwake, uaddr2, 1703 1.25 riastrad nrequeue, cmpval, retval); 1704 1.25 riastrad } 1705 1.25 riastrad case FUTEX_WAIT_BITSET: { 1706 1.25 riastrad const int cmpval = val; 1707 1.25 riastrad const int bitset = val3; 1708 1.1 thorpej 1709 1.25 riastrad return futex_func_wait(shared, uaddr, cmpval, bitset, timeout, 1710 1.1 thorpej clkid, TIMER_ABSTIME, retval); 1711 1.25 riastrad } 1712 1.25 riastrad case FUTEX_WAKE_OP: { 1713 1.25 riastrad const int nwake = val; 1714 1.25 riastrad const int nwake2 = val2; 1715 1.25 riastrad const int opcmp = val3; 1716 1.1 thorpej 1717 1.25 riastrad return 
futex_func_wake_op(shared, uaddr, nwake, uaddr2, nwake2, 1718 1.25 riastrad opcmp, retval); 1719 1.25 riastrad } 1720 1.1 thorpej case FUTEX_FD: 1721 1.1 thorpej default: 1722 1.1 thorpej return ENOSYS; 1723 1.1 thorpej } 1724 1.1 thorpej } 1725 1.1 thorpej 1726 1.1 thorpej /* 1727 1.1 thorpej * sys___futex(l, uap, retval) 1728 1.1 thorpej * 1729 1.1 thorpej * __futex(2) system call: generic futex operations. 1730 1.1 thorpej */ 1731 1.1 thorpej int 1732 1.1 thorpej sys___futex(struct lwp *l, const struct sys___futex_args *uap, 1733 1.1 thorpej register_t *retval) 1734 1.1 thorpej { 1735 1.1 thorpej /* { 1736 1.1 thorpej syscallarg(int *) uaddr; 1737 1.1 thorpej syscallarg(int) op; 1738 1.1 thorpej syscallarg(int) val; 1739 1.1 thorpej syscallarg(const struct timespec *) timeout; 1740 1.1 thorpej syscallarg(int *) uaddr2; 1741 1.1 thorpej syscallarg(int) val2; 1742 1.1 thorpej syscallarg(int) val3; 1743 1.1 thorpej } */ 1744 1.1 thorpej struct timespec ts, *tsp; 1745 1.1 thorpej int error; 1746 1.1 thorpej 1747 1.1 thorpej /* 1748 1.1 thorpej * Copy in the timeout argument, if specified. 1749 1.1 thorpej */ 1750 1.1 thorpej if (SCARG(uap, timeout)) { 1751 1.1 thorpej error = copyin(SCARG(uap, timeout), &ts, sizeof(ts)); 1752 1.1 thorpej if (error) 1753 1.1 thorpej return error; 1754 1.1 thorpej tsp = &ts; 1755 1.1 thorpej } else { 1756 1.1 thorpej tsp = NULL; 1757 1.1 thorpej } 1758 1.1 thorpej 1759 1.1 thorpej return do_futex(SCARG(uap, uaddr), SCARG(uap, op), SCARG(uap, val), 1760 1.1 thorpej tsp, SCARG(uap, uaddr2), SCARG(uap, val2), SCARG(uap, val3), 1761 1.1 thorpej retval); 1762 1.1 thorpej } 1763 1.1 thorpej 1764 1.1 thorpej /* 1765 1.1 thorpej * sys___futex_set_robust_list(l, uap, retval) 1766 1.1 thorpej * 1767 1.1 thorpej * __futex_set_robust_list(2) system call for robust futexes. 
 */
int
sys___futex_set_robust_list(struct lwp *l,
    const struct sys___futex_set_robust_list_args *uap, register_t *retval)
{
	/* {
		syscallarg(void *) head;
		syscallarg(size_t) len;
	} */
	void *head = SCARG(uap, head);

	/* The head must be exactly the ABI-defined size... */
	if (SCARG(uap, len) != _FUTEX_ROBUST_HEAD_SIZE)
		return EINVAL;
	/* ...and word-aligned, so the kernel can fetch it atomically. */
	if ((uintptr_t)head % sizeof(u_long))
		return EINVAL;

	/* Record the user-space head pointer; it is read at LWP exit. */
	l->l_robust_head = (uintptr_t)head;

	return 0;
}

/*
 * sys___futex_get_robust_list(l, uap, retval)
 *
 *	__futex_get_robust_list(2) system call for robust futexes.
 */
int
sys___futex_get_robust_list(struct lwp *l,
    const struct sys___futex_get_robust_list_args *uap, register_t *retval)
{
	/* {
		syscallarg(lwpid_t) lwpid;
		syscallarg(void **) headp;
		syscallarg(size_t *) lenp;
	} */
	void *head;
	const size_t len = _FUTEX_ROBUST_HEAD_SIZE;
	int error;

	error = futex_robust_head_lookup(l, SCARG(uap, lwpid), &head);
	if (error)
		return error;

	/* Copy out the head pointer and the head structure length. */
	error = copyout(&head, SCARG(uap, headp), sizeof(head));
	if (__predict_true(error == 0)) {
		error = copyout(&len, SCARG(uap, lenp), sizeof(len));
	}

	return error;
}

/*
 * release_futex(uptr, tid, is_pi, is_pending)
 *
 *	Try to release the robust futex at uptr in the current process
 *	on lwp exit.  If anything goes wrong, silently fail.  It is the
 *	userland program's obligation to arrange correct behaviour.
 */
static void
release_futex(uintptr_t const uptr, lwpid_t const tid, bool const is_pi,
    bool const is_pending)
{
	int *uaddr;
	struct futex *f;
	int oldval, newval, actual;
	int error;

	/* If it's misaligned, tough. */
	if (__predict_false(uptr & 3))
		return;
	uaddr = (int *)uptr;

	error = futex_load(uaddr, &oldval);
	if (__predict_false(error))
		return;

	/*
	 * There are two race conditions we need to handle here:
	 *
	 * 1. User space cleared the futex word but died before
	 *    being able to issue the wakeup.  No wakeups will
	 *    ever be issued, oops!
	 *
	 * 2. Awakened waiter died before being able to acquire
	 *    the futex in user space.  Any other waiters are
	 *    now stuck, oops!
	 *
	 * In both of these cases, the futex word will be 0 (because
	 * it's updated before the wake is issued).  The best we can
	 * do is detect this situation if it's the pending futex and
	 * issue a wake without modifying the futex word.
	 *
	 * XXX eventual PI handling?
	 */
	if (__predict_false(is_pending && (oldval & ~FUTEX_WAITERS) == 0)) {
		register_t retval;
		(void) futex_func_wake(/*shared*/true, uaddr, 1,
		    FUTEX_BITSET_MATCH_ANY, &retval);
		return;
	}

	/* Optimistically test whether we need to do anything at all. */
	if ((oldval & FUTEX_TID_MASK) != tid)
		return;

	/*
	 * We need to handle the case where this thread owned the futex,
	 * but it was uncontended.  In this case, there won't be any
	 * kernel state to look up.  All we can do is mark the futex
	 * as a zombie to be mopped up the next time another thread
	 * attempts to acquire it.
	 *
	 * N.B. It's important to ensure to set FUTEX_OWNER_DIED in
	 * this loop, even if waiters appear while we're doing
	 * so.  This is because FUTEX_WAITERS is set by user space
	 * before calling __futex() to wait, and the futex needs
	 * to be marked as a zombie when the new waiter gets into
	 * the kernel.
	 */
	if ((oldval & FUTEX_WAITERS) == 0) {
		do {
			/* Re-check ownership on each CAS retry. */
			error = futex_load(uaddr, &oldval);
			if (error)
				return;
			if ((oldval & FUTEX_TID_MASK) != tid)
				return;
			newval = oldval | FUTEX_OWNER_DIED;
			error = ucas_int(uaddr, oldval, newval, &actual);
			if (error)
				return;
		} while (actual != oldval);

		/*
		 * If there is still no indication of waiters, then there is
		 * no more work for us to do.
		 */
		if ((oldval & FUTEX_WAITERS) == 0)
			return;
	}

	/*
	 * Look for a shared futex since we have no positive indication
	 * it is private.  If we can't, tough.
	 */
	error = futex_lookup(uaddr, /*shared*/true, &f);
	if (error)
		return;

	/*
	 * If there's no kernel state for this futex, there's nothing to
	 * release.
	 */
	if (f == NULL)
		return;

	/* Work under the futex queue lock. */
	futex_queue_lock(f);

	/*
	 * Fetch the word: if the tid doesn't match ours, skip;
	 * otherwise, set the owner-died bit, atomically.
	 */
	do {
		error = futex_load(uaddr, &oldval);
		if (error)
			goto out;
		if ((oldval & FUTEX_TID_MASK) != tid)
			goto out;
		newval = oldval | FUTEX_OWNER_DIED;
		error = ucas_int(uaddr, oldval, newval, &actual);
		if (error)
			goto out;
	} while (actual != oldval);

	/*
	 * If there may be waiters, try to wake one.  If anything goes
	 * wrong, tough.
	 *
	 * XXX eventual PI handling?
	 */
	if (oldval & FUTEX_WAITERS) {
		(void)futex_wake(f, /*nwake*/1, NULL, /*nrequeue*/0,
		    FUTEX_BITSET_MATCH_ANY);
	}

	/* Unlock the queue and release the futex. */
out:	futex_queue_unlock(f);
	futex_rele(f);
}

/*
 * futex_robust_head_lookup(l, lwpid, headp)
 *
 *	Helper function to look up a robust head by LWP ID.  With
 *	lwpid == 0 it returns the calling LWP's own head; otherwise it
 *	looks the LWP up in the current process under p_lock.
 */
int
futex_robust_head_lookup(struct lwp *l, lwpid_t lwpid, void **headp)
{
	struct proc *p = l->l_proc;

	/* Find the other lwp, if requested; otherwise use our robust head. */
	if (lwpid) {
		mutex_enter(p->p_lock);
		l = lwp_find(p, lwpid);
		if (l == NULL) {
			mutex_exit(p->p_lock);
			return ESRCH;
		}
		*headp = (void *)l->l_robust_head;
		mutex_exit(p->p_lock);
	} else {
		*headp = (void *)l->l_robust_head;
	}
	return 0;
}

/*
 * futex_fetch_robust_head(uaddr, rhead)
 *
 *	Helper routine to fetch the futex robust list head that
 *	handles 32-bit binaries running on 64-bit kernels.
 */
static int
futex_fetch_robust_head(uintptr_t uaddr, u_long *rhead)
{
#ifdef _LP64
	if (curproc->p_flag & PK_32) {
		/* 32-bit process: copy in 32-bit words and widen them. */
		uint32_t rhead32[_FUTEX_ROBUST_HEAD_NWORDS];
		int error;

		error = copyin((void *)uaddr, rhead32, sizeof(rhead32));
		if (__predict_true(error == 0)) {
			for (int i = 0; i < _FUTEX_ROBUST_HEAD_NWORDS; i++) {
				if (i == _FUTEX_ROBUST_HEAD_OFFSET) {
					/*
					 * Make sure the offset is sign-
					 * extended.
					 */
					rhead[i] = (int32_t)rhead32[i];
				} else {
					rhead[i] = rhead32[i];
				}
			}
		}
		return error;
	}
#endif /* _LP64 */

	/* Native word size: fetch the head in one copyin. */
	return copyin((void *)uaddr, rhead,
	    sizeof(*rhead) * _FUTEX_ROBUST_HEAD_NWORDS);
}

/*
 * futex_decode_robust_word(word, entry, is_pi)
 *
 *	Decode a robust futex list word into the entry and entry
 *	properties (the low bit flags a priority-inheritance entry).
 */
static inline void
futex_decode_robust_word(uintptr_t const word, uintptr_t * const entry,
    bool * const is_pi)
{
	*is_pi = (word & _FUTEX_ROBUST_ENTRY_PI) ? true : false;
	*entry = word & ~_FUTEX_ROBUST_ENTRY_PI;
}

/*
 * futex_fetch_robust_entry(uaddr, valp, is_pi)
 *
 *	Helper routine to fetch and decode a robust futex entry
 *	that handles 32-bit binaries running on 64-bit kernels.
2040 1.1 thorpej */ 2041 1.1 thorpej static int 2042 1.1 thorpej futex_fetch_robust_entry(uintptr_t const uaddr, uintptr_t * const valp, 2043 1.1 thorpej bool * const is_pi) 2044 1.1 thorpej { 2045 1.1 thorpej uintptr_t val = 0; 2046 1.1 thorpej int error = 0; 2047 1.1 thorpej 2048 1.1 thorpej #ifdef _LP64 2049 1.1 thorpej if (curproc->p_flag & PK_32) { 2050 1.1 thorpej uint32_t val32; 2051 1.1 thorpej 2052 1.1 thorpej error = ufetch_32((uint32_t *)uaddr, &val32); 2053 1.1 thorpej if (__predict_true(error == 0)) 2054 1.1 thorpej val = val32; 2055 1.1 thorpej } else 2056 1.1 thorpej #endif /* _LP64 */ 2057 1.1 thorpej error = ufetch_long((u_long *)uaddr, (u_long *)&val); 2058 1.1 thorpej if (__predict_false(error)) 2059 1.1 thorpej return error; 2060 1.1 thorpej 2061 1.1 thorpej futex_decode_robust_word(val, valp, is_pi); 2062 1.1 thorpej return 0; 2063 1.1 thorpej } 2064 1.1 thorpej 2065 1.1 thorpej /* 2066 1.1 thorpej * futex_release_all_lwp(l, tid) 2067 1.1 thorpej * 2068 1.1 thorpej * Release all l's robust futexes. If anything looks funny in 2069 1.1 thorpej * the process, give up -- it's userland's responsibility to dot 2070 1.1 thorpej * the i's and cross the t's. 2071 1.1 thorpej */ 2072 1.1 thorpej void 2073 1.13 thorpej futex_release_all_lwp(struct lwp * const l) 2074 1.1 thorpej { 2075 1.1 thorpej u_long rhead[_FUTEX_ROBUST_HEAD_NWORDS]; 2076 1.1 thorpej int limit = 1000000; 2077 1.1 thorpej int error; 2078 1.1 thorpej 2079 1.1 thorpej /* If there's no robust list there's nothing to do. */ 2080 1.1 thorpej if (l->l_robust_head == 0) 2081 1.1 thorpej return; 2082 1.1 thorpej 2083 1.13 thorpej KASSERT((l->l_lid & FUTEX_TID_MASK) == l->l_lid); 2084 1.13 thorpej 2085 1.1 thorpej /* Read the final snapshot of the robust list head. 
*/ 2086 1.1 thorpej error = futex_fetch_robust_head(l->l_robust_head, rhead); 2087 1.1 thorpej if (error) { 2088 1.13 thorpej printf("WARNING: pid %jd (%s) lwp %jd:" 2089 1.1 thorpej " unmapped robust futex list head\n", 2090 1.1 thorpej (uintmax_t)l->l_proc->p_pid, l->l_proc->p_comm, 2091 1.13 thorpej (uintmax_t)l->l_lid); 2092 1.1 thorpej return; 2093 1.1 thorpej } 2094 1.1 thorpej 2095 1.1 thorpej const long offset = (long)rhead[_FUTEX_ROBUST_HEAD_OFFSET]; 2096 1.1 thorpej 2097 1.1 thorpej uintptr_t next, pending; 2098 1.1 thorpej bool is_pi, pending_is_pi; 2099 1.1 thorpej 2100 1.1 thorpej futex_decode_robust_word(rhead[_FUTEX_ROBUST_HEAD_LIST], 2101 1.1 thorpej &next, &is_pi); 2102 1.1 thorpej futex_decode_robust_word(rhead[_FUTEX_ROBUST_HEAD_PENDING], 2103 1.1 thorpej &pending, &pending_is_pi); 2104 1.1 thorpej 2105 1.1 thorpej /* 2106 1.1 thorpej * Walk down the list of locked futexes and release them, up 2107 1.1 thorpej * to one million of them before we give up. 2108 1.1 thorpej */ 2109 1.1 thorpej 2110 1.1 thorpej while (next != l->l_robust_head && limit-- > 0) { 2111 1.1 thorpej /* pending handled below. */ 2112 1.1 thorpej if (next != pending) 2113 1.13 thorpej release_futex(next + offset, l->l_lid, is_pi, false); 2114 1.1 thorpej error = futex_fetch_robust_entry(next, &next, &is_pi); 2115 1.1 thorpej if (error) 2116 1.1 thorpej break; 2117 1.1 thorpej preempt_point(); 2118 1.1 thorpej } 2119 1.1 thorpej if (limit <= 0) { 2120 1.13 thorpej printf("WARNING: pid %jd (%s) lwp %jd:" 2121 1.1 thorpej " exhausted robust futex limit\n", 2122 1.1 thorpej (uintmax_t)l->l_proc->p_pid, l->l_proc->p_comm, 2123 1.13 thorpej (uintmax_t)l->l_lid); 2124 1.1 thorpej } 2125 1.1 thorpej 2126 1.1 thorpej /* If there's a pending futex, it may need to be released too. */ 2127 1.1 thorpej if (pending != 0) { 2128 1.13 thorpej release_futex(pending + offset, l->l_lid, pending_is_pi, true); 2129 1.1 thorpej } 2130 1.1 thorpej } 2131