/*	$NetBSD: kern_sleepq.c,v 1.88 2026/01/04 01:41:42 riastradh Exp $	*/

/*-
 * Copyright (c) 2006, 2007, 2008, 2009, 2019, 2020, 2023
 *     The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Andrew Doran.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Sleep queue implementation, used by turnstiles and general sleep/wakeup
 * interfaces.
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: kern_sleepq.c,v 1.88 2026/01/04 01:41:42 riastradh Exp $");

#include <sys/param.h>

#include <sys/cpu.h>
#include <sys/intr.h>
#include <sys/kernel.h>
#include <sys/ktrace.h>
#include <sys/pool.h>
#include <sys/proc.h>
#include <sys/resourcevar.h>
#include <sys/sched.h>
#include <sys/sdt.h>
#include <sys/sleepq.h>
#include <sys/syncobj.h>
#include <sys/systm.h>

/*
 * for sleepq_abort:
 * During autoconfiguration or after a panic, a sleep will simply lower the
 * priority briefly to allow interrupts, then return.  The priority to be
 * used (IPL_SAFEPRI) is machine-dependent, thus this value is initialized and
 * maintained in the machine-dependent layers.  This priority will typically
 * be 0, or the lowest priority that is safe for use on the interrupt stack;
 * it can be made higher to block network software interrupts after panics.
 */
#ifndef	IPL_SAFEPRI
#define	IPL_SAFEPRI	0
#endif

static int	sleepq_sigtoerror(lwp_t *, int);

/* General purpose sleep table, used by mtsleep() and condition variables. */
sleeptab_t	sleeptab __cacheline_aligned;
sleepqlock_t	sleepq_locks[SLEEPTAB_HASH_SIZE] __cacheline_aligned;

/*
 * sleeptab_init:
 *
 *	Initialize a sleep table.
 */
void
sleeptab_init(sleeptab_t *st)
{
	static bool again;
	int i;

	for (i = 0; i < SLEEPTAB_HASH_SIZE; i++) {
		if (!again) {
			mutex_init(&sleepq_locks[i].lock, MUTEX_DEFAULT,
			    IPL_SCHED);
		}
		sleepq_init(&st->st_queue[i]);
	}
	again = true;
}

/*
 * sleepq_init:
 *
 *	Prepare a sleep queue for use.
 */
void
sleepq_init(sleepq_t *sq)
{

	LIST_INIT(sq);
}
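
/*
 * Illustrative sketch, not part of this file's API: how a wait channel is
 * mapped onto the general purpose sleep table above.  The queue and its
 * spin lock share a hash index, here assumed to come from the
 * SLEEPTAB_HASH() macro in <sys/sleepq.h>:
 *
 *	u_int hash = SLEEPTAB_HASH(wchan);
 *	sleepq_t *sq = &sleeptab.st_queue[hash];
 *	kmutex_t *mp = &sleepq_locks[hash].lock;
 *
 *	mutex_spin_enter(mp);
 *	... enqueue on sq, or walk it looking for LWPs blocked on wchan ...
 *	mutex_spin_exit(mp);
 */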

/*
 * sleepq_remove:
 *
 *	Remove an LWP from a sleep queue and wake it up.  Distinguish
 *	between deliberate wakeups (which convey valuable information) and
 *	"unsleep" (where an out-of-band action must be taken).
 *
 *	For a wakeup, convert any interruptible wait into a
 *	non-interruptible one before waking the LWP.  Otherwise, if only
 *	one LWP is awoken, it could fail to do something useful with the
 *	wakeup due to an error return, which the caller of e.g. cv_signal()
 *	may not expect.
 */
void
sleepq_remove(sleepq_t *sq, lwp_t *l, bool wakeup)
{
	struct schedstate_percpu *spc;
	struct cpu_info *ci;

	KASSERT(lwp_locked(l, NULL));

	if ((l->l_syncobj->sobj_flag & SOBJ_SLEEPQ_NULL) == 0) {
		KASSERT(sq != NULL);
		LIST_REMOVE(l, l_sleepchain);
	} else {
		KASSERT(sq == NULL);
	}

	l->l_syncobj = &sched_syncobj;
	l->l_wchan = NULL;
	l->l_sleepq = NULL;
	l->l_flag &= wakeup ? ~(LW_SINTR|LW_CATCHINTR|LW_STIMO) : ~LW_SINTR;

	ci = l->l_cpu;
	spc = &ci->ci_schedstate;

	/*
	 * If not sleeping, the LWP must have been suspended.  Let whoever
	 * holds it stopped set it running again.
	 */
	if (l->l_stat != LSSLEEP) {
		KASSERT(l->l_stat == LSSTOP || l->l_stat == LSSUSPENDED);
		lwp_setlock(l, spc->spc_lwplock);
		return;
	}

	/*
	 * If the LWP is still on the CPU, mark it as LSONPROC.  It may be
	 * about to call mi_switch(), in which case it will yield.
	 */
	if ((l->l_pflag & LP_RUNNING) != 0) {
		l->l_stat = LSONPROC;
		l->l_slptime = 0;
		lwp_setlock(l, spc->spc_lwplock);
		return;
	}

	/* Update sleep time delta, call the wake-up handler of scheduler */
	l->l_slpticksum += (getticks() - l->l_slpticks);
	sched_wakeup(l);

	/* Look for a CPU to wake up */
	l->l_cpu = sched_takecpu(l);
	ci = l->l_cpu;
	spc = &ci->ci_schedstate;

	/*
	 * Set it running.
	 */
	spc_lock(ci);
	lwp_setlock(l, spc->spc_mutex);
	sched_setrunnable(l);
	l->l_stat = LSRUN;
	l->l_slptime = 0;
	sched_enqueue(l);
	sched_resched_lwp(l, true);
	/* LWP & SPC now unlocked, but we still hold sleep queue lock. */
}

/*
 * sleepq_insert:
 *
 *	Insert an LWP into the sleep queue, optionally sorting by priority.
 */
static void
sleepq_insert(sleepq_t *sq, lwp_t *l, syncobj_t *sobj)
{

	if ((sobj->sobj_flag & SOBJ_SLEEPQ_NULL) != 0) {
		KASSERT(sq == NULL);
		return;
	}
	KASSERT(sq != NULL);

	if ((sobj->sobj_flag & SOBJ_SLEEPQ_SORTED) != 0) {
		lwp_t *l2, *l_last = NULL;
		const pri_t pri = lwp_eprio(l);

		LIST_FOREACH(l2, sq, l_sleepchain) {
			l_last = l2;
			if (lwp_eprio(l2) < pri) {
				LIST_INSERT_BEFORE(l2, l, l_sleepchain);
				return;
			}
		}
		/*
		 * Ensure FIFO ordering if no waiters are of lower priority.
		 */
		if (l_last != NULL) {
			LIST_INSERT_AFTER(l_last, l, l_sleepchain);
			return;
		}
	}

	LIST_INSERT_HEAD(sq, l, l_sleepchain);
}

/*
 * sleepq_enter:
 *
 *	Prepare to block on a sleep queue, after which any interlock can be
 *	safely released.
 */
int
sleepq_enter(sleepq_t *sq, lwp_t *l, kmutex_t *mp)
{
	int nlocks;

	KASSERT((sq != NULL) == (mp != NULL));

	/*
	 * Acquire the per-LWP mutex and lend it our sleep queue lock.
	 * Once interlocked, we can release the kernel lock.
	 */
	lwp_lock(l);
	if (mp != NULL) {
		lwp_unlock_to(l, mp);
	}
	if (__predict_false((nlocks = l->l_blcnt) != 0)) {
		KERNEL_UNLOCK_ALL(NULL, NULL);
	}
	return nlocks;
}
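
/*
 * Illustrative sketch, not part of this file's API: the typical
 * sleeper-side sequence built from sleepq_enter(), sleepq_enqueue() and
 * sleepq_block() (the latter two are defined below).  "sq" and "mp" are
 * assumed to have been obtained as in the hashing sketch above,
 * "interlock" stands for whatever lock protects the wait condition, and
 * "sobj" is the caller's sync object.  Once the LWP is on the queue and
 * interlocked by mp, the caller's interlock can be dropped:
 *
 *	mutex_spin_enter(mp);
 *	nlocks = sleepq_enter(sq, curlwp, mp);
 *	sleepq_enqueue(sq, wchan, "example", sobj, true);
 *	mutex_exit(interlock);
 *	error = sleepq_block(timo, true, sobj, nlocks);
 *
 * sleepq_block() returns with the LWP and sleep queue unlocked and the
 * kernel lock re-acquired "nlocks" deep.
 */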

/*
 * sleepq_enqueue:
 *
 *	Enter an LWP into the sleep queue and prepare for sleep.  The sleep
 *	queue must already be locked, and any interlock (such as the kernel
 *	lock) must have been released (see sleeptab_lookup(), sleepq_enter()).
 */
void
sleepq_enqueue(sleepq_t *sq, wchan_t wchan, const char *wmesg, syncobj_t *sobj,
    bool catch_p)
{
	lwp_t *l = curlwp;

	KASSERT(lwp_locked(l, NULL));
	KASSERT(l->l_stat == LSONPROC);
	KASSERT(l->l_wchan == NULL);
	KASSERT(l->l_sleepq == NULL);
	KASSERT((l->l_flag & LW_SINTR) == 0);

	l->l_syncobj = sobj;
	l->l_wchan = wchan;
	l->l_sleepq = sq;
	l->l_wmesg = wmesg;
	l->l_slptime = 0;
	l->l_stat = LSSLEEP;
	if (catch_p)
		l->l_flag |= LW_SINTR;

	sleepq_insert(sq, l, sobj);

	/* Save the time when thread has slept */
	l->l_slpticks = getticks();
	sched_slept(l);
}

/*
 * sleepq_transfer:
 *
 *	Move an LWP from one sleep queue to another.  Both sleep queues
 *	must already be locked.
 *
 *	The LWP will be updated with the new sleepq, wchan, wmesg,
 *	sobj, and mutex.  The interruptible flag will also be updated.
 */
void
sleepq_transfer(lwp_t *l, sleepq_t *from_sq, sleepq_t *sq, wchan_t wchan,
    const char *wmesg, syncobj_t *sobj, kmutex_t *mp, bool catch_p)
{

	KASSERT(l->l_sleepq == from_sq);

	LIST_REMOVE(l, l_sleepchain);
	l->l_syncobj = sobj;
	l->l_wchan = wchan;
	l->l_sleepq = sq;
	l->l_wmesg = wmesg;

	if (catch_p)
		l->l_flag |= LW_SINTR | LW_CATCHINTR;
	else
		l->l_flag &= ~(LW_SINTR | LW_CATCHINTR);

	/*
	 * This allows the transfer from one sleepq to another where
	 * it is known that they're both protected by the same lock.
	 */
	if (mp != NULL)
		lwp_setlock(l, mp);

	sleepq_insert(sq, l, sobj);
}

/*
 * sleepq_uncatch:
 *
 *	Mark the LWP as no longer sleeping interruptibly.
 */
void
sleepq_uncatch(lwp_t *l)
{

	l->l_flag &= ~(LW_SINTR | LW_CATCHINTR | LW_STIMO);
}
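
/*
 * Illustrative sketch, not part of this file's API: how a caller typically
 * interprets the return value of sleepq_block(), defined below.  0 means a
 * normal wakeup, EINTR/ERESTART report a pending signal or exiting process
 * for interruptible sleeps, and EWOULDBLOCK reports an expired timeout when
 * timo != 0 (mstohz() converts milliseconds to ticks):
 *
 *	error = sleepq_block(mstohz(100), true, sobj, nlocks);
 *	if (error == EWOULDBLOCK) {
 *		... the 100ms timeout fired ...
 *	} else if (error == EINTR || error == ERESTART) {
 *		... interrupted by a signal or exit; bail out ...
 *	} else {
 *		... normal wakeup via sleepq_wake()/lwp_unsleep() ...
 *	}
 */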

/*
 * sleepq_block:
 *
 *	After any intermediate step such as releasing an interlock, switch.
 *	sleepq_block() may return early under exceptional conditions, for
 *	example if the LWP's containing process is exiting.
 *
 *	timo is a timeout in ticks.  timo = 0 specifies an infinite timeout.
 */
int
sleepq_block(int timo, bool catch_p, syncobj_t *syncobj, int nlocks)
{
	const int mask = LW_CANCELLED|LW_WEXIT|LW_WCORE|LW_PENDSIG;
	int error = 0, sig, flag;
	struct proc *p;
	lwp_t *l = curlwp;
	bool early = false;

	ktrcsw(1, 0, syncobj);

	/*
	 * If sleeping interruptibly, check for pending signals, exits or
	 * core dump events.
	 *
	 * Note the usage of LW_CATCHINTR.  This expresses our intent
	 * to catch or not catch sleep interruptions, which might change
	 * while we are sleeping.  It is independent of LW_SINTR because
	 * we don't want to leave LW_SINTR set when the LWP is not asleep.
	 */
	if (catch_p) {
		if ((l->l_flag & (LW_CANCELLED|LW_WEXIT|LW_WCORE)) != 0) {
			l->l_flag &= ~LW_CANCELLED;
			error = SET_ERROR(EINTR);
			early = true;
		} else if ((l->l_flag & LW_PENDSIG) != 0 && sigispending(l, 0))
			early = true;
		l->l_flag |= LW_CATCHINTR;
	} else
		l->l_flag &= ~LW_CATCHINTR;

	if (early) {
		/* lwp_unsleep() will release the lock */
		lwp_unsleep(l, true);
	} else {
		/*
		 * The LWP may have already been awoken if the caller
		 * dropped the sleep queue lock between sleepq_enqueue() and
		 * sleepq_block().  If that happens l_stat will be LSONPROC
		 * and mi_switch() will treat this as a preemption.  No need
		 * to do anything special here.
		 */
		if (timo) {
			l->l_flag &= ~LW_STIMO;
			callout_schedule(&l->l_timeout_ch, timo);
		}
		l->l_boostpri = l->l_syncobj->sobj_boostpri;
		spc_lock(l->l_cpu);
		mi_switch(l);

		/* The LWP and sleep queue are now unlocked. */
		if (timo) {
			/*
			 * Even if the callout appears to have fired, we
			 * need to stop it in order to synchronise with
			 * other CPUs.  It's important that we do this in
			 * this LWP's context, and not during wakeup, in
			 * order to keep the callout & its cache lines
			 * co-located on the CPU with the LWP.
			 */
			(void)callout_halt(&l->l_timeout_ch, NULL);
			error = (l->l_flag & LW_STIMO) ? SET_ERROR(EWOULDBLOCK)
			    : 0;
		}
	}

	/*
	 * LW_CATCHINTR is only modified in this function OR when we
	 * are asleep (with the sleepq locked).  We can therefore safely
	 * test it unlocked here as it is guaranteed to be stable by
	 * virtue of us running.
	 *
	 * We do not bother clearing it if set; that would require us
	 * to take the LWP lock, and it doesn't seem worth the hassle
	 * considering it is only meaningful here inside this function,
	 * and is set to reflect intent upon entry.
	 */
	flag = atomic_load_relaxed(&l->l_flag);
	if (__predict_false((flag & mask) != 0)) {
		if ((flag & LW_CATCHINTR) == 0 || error != 0)
			/* nothing */;
		else if ((flag & (LW_CANCELLED | LW_WEXIT | LW_WCORE)) != 0)
			error = SET_ERROR(EINTR);
		else if ((flag & LW_PENDSIG) != 0) {
			/*
			 * Acquiring p_lock may cause us to recurse
			 * through the sleep path and back into this
			 * routine, but is safe because LWPs sleeping
			 * on locks are non-interruptible and we will
			 * not recurse again.
			 */
			p = l->l_proc;
			mutex_enter(p->p_lock);
			if (((sig = sigispending(l, 0)) != 0 &&
			    (sigprop[sig] & SA_STOP) == 0) ||
			    (sig = issignal(l)) != 0)
				error = sleepq_sigtoerror(l, sig);
			mutex_exit(p->p_lock);
		}
	}

	ktrcsw(0, 0, syncobj);
	if (__predict_false(nlocks != 0)) {
		KERNEL_LOCK(nlocks, NULL);
	}
	return error;
}

/*
 * sleepq_wake:
 *
 *	Wake zero or more LWPs blocked on a single wait channel.
 */
void
sleepq_wake(sleepq_t *sq, wchan_t wchan, u_int expected, kmutex_t *mp)
{
	lwp_t *l, *next;

	KASSERT(mutex_owned(mp));

	for (l = LIST_FIRST(sq); l != NULL; l = next) {
		KASSERT(l->l_sleepq == sq);
		KASSERT(l->l_mutex == mp);
		next = LIST_NEXT(l, l_sleepchain);
		if (l->l_wchan != wchan)
			continue;
		sleepq_remove(sq, l, true);
		if (--expected == 0)
			break;
	}

	mutex_spin_exit(mp);
}
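
/*
 * Illustrative sketch, not part of this file's API: the matching waker
 * side.  The hashed sleep queue lock must be held on entry; sleepq_wake()
 * releases it before returning.  "expected" bounds how many waiters are
 * woken, so 1 gives signal-style semantics and (u_int)-1 wakes every LWP
 * sleeping on the channel:
 *
 *	u_int hash = SLEEPTAB_HASH(wchan);
 *	kmutex_t *mp = &sleepq_locks[hash].lock;
 *
 *	mutex_spin_enter(mp);
 *	sleepq_wake(&sleeptab.st_queue[hash], wchan, 1, mp);
 */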

/*
 * sleepq_unsleep:
 *
 *	Remove an LWP from its sleep queue and set it runnable again.
 *	sleepq_unsleep() is called with the LWP's mutex held, and will
 *	release it if "unlock" is true.
 */
void
sleepq_unsleep(lwp_t *l, bool unlock)
{
	sleepq_t *sq = l->l_sleepq;
	kmutex_t *mp = l->l_mutex;

	KASSERT(lwp_locked(l, mp));
	KASSERT(l->l_wchan != NULL);

	sleepq_remove(sq, l, false);
	if (unlock) {
		mutex_spin_exit(mp);
	}
}

/*
 * sleepq_timeout:
 *
 *	Entered via the callout(9) subsystem to time out an LWP that is on a
 *	sleep queue.
 */
void
sleepq_timeout(void *arg)
{
	lwp_t *l = arg;

	/*
	 * Lock the LWP.  Assuming it's still on the sleep queue, its
	 * current mutex will also be the sleep queue mutex.
	 */
	lwp_lock(l);

	if (l->l_wchan == NULL || l->l_syncobj == &callout_syncobj) {
		/*
		 * Somebody beat us to it, or the LWP is blocked in
		 * callout_halt() waiting for us to finish here.  In
		 * neither case should the LWP produce EWOULDBLOCK.
		 */
		lwp_unlock(l);
		return;
	}

	l->l_flag |= LW_STIMO;
	lwp_unsleep(l, true);
}

/*
 * sleepq_sigtoerror:
 *
 *	Given a signal number, interpret and return an error code.
 */
static int
sleepq_sigtoerror(lwp_t *l, int sig)
{
	struct proc *p = l->l_proc;
	int error;

	KASSERT(mutex_owned(p->p_lock));

	/*
	 * If this sleep was canceled, don't let the syscall restart.
	 */
	if ((SIGACTION(p, sig).sa_flags & SA_RESTART) == 0)
		error = SET_ERROR(EINTR);
	else
		error = SET_ERROR(ERESTART);

	return error;
}

/*
 * sleepq_abort:
 *
 *	After a panic or during autoconfiguration, lower the interrupt
 *	priority level to give pending interrupts a chance to run, and
 *	then return.  Called if sleepq_dontsleep() returns non-zero, and
 *	always returns zero.
 */
int
sleepq_abort(kmutex_t *mtx, int unlock)
{
	int s;

	s = splhigh();
	splx(IPL_SAFEPRI);
	splx(s);
	if (mtx != NULL && unlock != 0)
		mutex_exit(mtx);

	return 0;
}

/*
 * sleepq_reinsert:
 *
 *	Move the position of the LWP in the sleep queue after a possible
 *	change of the LWP's effective priority.
 */
static void
sleepq_reinsert(sleepq_t *sq, lwp_t *l)
{

	KASSERT(l->l_sleepq == sq);
	if ((l->l_syncobj->sobj_flag & SOBJ_SLEEPQ_SORTED) == 0) {
		return;
	}

	/*
	 * Don't let the sleep queue become empty, even briefly.
	 * cv_signal() and cv_broadcast() inspect it without the
	 * sleep queue lock held and need to see a non-empty queue
	 * head if there are waiters.
	 */
	if (LIST_FIRST(sq) == l && LIST_NEXT(l, l_sleepchain) == NULL) {
		return;
	}
	LIST_REMOVE(l, l_sleepchain);
	sleepq_insert(sq, l, l->l_syncobj);
}

/*
 * sleepq_changepri:
 *
 *	Adjust the priority of an LWP residing on a sleepq.
 */
void
sleepq_changepri(lwp_t *l, pri_t pri)
{
	sleepq_t *sq = l->l_sleepq;

	KASSERT(lwp_locked(l, NULL));

	l->l_priority = pri;
	sleepq_reinsert(sq, l);
}

/*
 * sleepq_lendpri:
 *
 *	Adjust the lent priority of an LWP residing on a sleepq.
 */
void
sleepq_lendpri(lwp_t *l, pri_t pri)
{
	sleepq_t *sq = l->l_sleepq;

	KASSERT(lwp_locked(l, NULL));

	l->l_inheritedprio = pri;
	l->l_auxprio = MAX(l->l_inheritedprio, l->l_protectprio);
	sleepq_reinsert(sq, l);
}
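
/*
 * Illustrative sketch, not part of this file's API: priority inheritance.
 * A turnstile-style caller that wants to lend its priority to a sleeping
 * lock owner does so with the owner's LWP lock held; on a priority-sorted
 * queue (SOBJ_SLEEPQ_SORTED) the owner is re-sorted as a side effect.
 * Real callers normally go through the owner's syncobj operations rather
 * than calling sleepq_lendpri() directly:
 *
 *	lwp_lock(owner);
 *	if (lwp_eprio(curlwp) > lwp_eprio(owner))
 *		sleepq_lendpri(owner, lwp_eprio(curlwp));
 *	lwp_unlock(owner);
 */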