/*	$NetBSD: kern_condvar.c,v 1.64 2026/01/03 23:58:32 riastradh Exp $	*/

/*-
 * Copyright (c) 2006, 2007, 2008, 2019, 2020, 2023
 *     The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Andrew Doran.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Kernel condition variable implementation.
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: kern_condvar.c,v 1.64 2026/01/03 23:58:32 riastradh Exp $");

#include <sys/param.h>

#include <sys/condvar.h>
#include <sys/cpu.h>
#include <sys/kernel.h>
#include <sys/lockdebug.h>
#include <sys/lwp.h>
#include <sys/sdt.h>
#include <sys/sleepq.h>
#include <sys/syncobj.h>
#include <sys/systm.h>

/*
 * Accessors for the private contents of the kcondvar_t data type.
 *
 *	cv_opaque[0]	sleepq_t
 *	cv_opaque[1]	description for ps(1)
 *
 * cv_opaque[0] is protected by the interlock passed to cv_wait() (enqueue
 * only), and the sleep queue lock acquired with sleepq_hashlock() (enqueue
 * and dequeue).
 *
 * cv_opaque[1] (the wmesg) is static and does not change throughout the life
 * of the CV.
 */
#define	CV_SLEEPQ(cv)		((sleepq_t *)(cv)->cv_opaque)
#define	CV_WMESG(cv)		((const char *)(cv)->cv_opaque[1])
#define	CV_SET_WMESG(cv, v)	(cv)->cv_opaque[1] = __UNCONST(v)

#define	CV_DEBUG_P(cv)	(CV_WMESG(cv) != nodebug)
#define	CV_RA		((uintptr_t)__builtin_return_address(0))

static void		cv_unsleep(lwp_t *, bool);
static __noinline void	cv_wakeup_one(kcondvar_t *);
static __noinline void	cv_wakeup_all(kcondvar_t *);

syncobj_t cv_syncobj = {
	.sobj_name	= "cv",
	.sobj_flag	= SOBJ_SLEEPQ_SORTED,
	.sobj_boostpri	= PRI_KERNEL,
	.sobj_unsleep	= cv_unsleep,
	.sobj_changepri	= sleepq_changepri,
	.sobj_lendpri	= sleepq_lendpri,
	.sobj_owner	= syncobj_noowner,
};

static const char deadcv[] = "deadcv";
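
/*
 * Typical consumer-side usage, as a rough sketch (the softc pointer sc
 * and its sc_lock, sc_cv, and sc_ready members are hypothetical names
 * used for illustration only):
 *
 *	mutex_enter(&sc->sc_lock);
 *	while (!sc->sc_ready)
 *		cv_wait(&sc->sc_cv, &sc->sc_lock);
 *	...consume the condition...
 *	mutex_exit(&sc->sc_lock);
 *
 * The wait must sit in a loop: wakeups may be spurious, and
 * cv_broadcast() can wake several LWPs of which only one will find the
 * condition still true.
 */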

/*
 * cv_init:
 *
 *	Initialize a condition variable for use.
 */
void
cv_init(kcondvar_t *cv, const char *wmesg)
{

	KASSERT(wmesg != NULL);
	CV_SET_WMESG(cv, wmesg);
	sleepq_init(CV_SLEEPQ(cv));
}

/*
 * cv_destroy:
 *
 *	Tear down a condition variable.
 */
void
cv_destroy(kcondvar_t *cv)
{

	sleepq_destroy(CV_SLEEPQ(cv));
#ifdef DIAGNOSTIC
	KASSERT(cv_is_valid(cv));
	KASSERT(!cv_has_waiters(cv));
	CV_SET_WMESG(cv, deadcv);
#endif
}

/*
 * cv_enter:
 *
 *	Look up and lock the sleep queue corresponding to the given
 *	condition variable, and increment the number of waiters.
 */
static inline int
cv_enter(kcondvar_t *cv, kmutex_t *mtx, lwp_t *l, bool catch_p)
{
	sleepq_t *sq;
	kmutex_t *mp;
	int nlocks;

	KASSERT(cv_is_valid(cv));
	KASSERT(!cpu_intr_p());
	KASSERT((l->l_pflag & LP_INTR) == 0 || panicstr != NULL);

	mp = sleepq_hashlock(cv);
	sq = CV_SLEEPQ(cv);
	nlocks = sleepq_enter(sq, l, mp);
	sleepq_enqueue(sq, cv, CV_WMESG(cv), &cv_syncobj, catch_p);
	mutex_exit(mtx);
	KASSERT(cv_has_waiters(cv));
	return nlocks;
}

/*
 * cv_unsleep:
 *
 *	Remove an LWP from the condition variable and sleep queue.  This
 *	is called when the LWP has not been awoken normally but instead
 *	interrupted: for example, when a signal is received.  Must be
 *	called with the LWP locked.  Will unlock if "unlock" is true.
 */
static void
cv_unsleep(lwp_t *l, bool unlock)
{
	kcondvar_t *cv __diagused;

	cv = (kcondvar_t *)(uintptr_t)l->l_wchan;

	KASSERT(l->l_wchan == (wchan_t)cv);
	KASSERT(l->l_sleepq == CV_SLEEPQ(cv));
	KASSERT(cv_is_valid(cv));
	KASSERT(cv_has_waiters(cv));

	sleepq_unsleep(l, unlock);
}

/*
 * cv_wait:
 *
 *	Wait non-interruptibly on a condition variable until awoken.
 */
void
cv_wait(kcondvar_t *cv, kmutex_t *mtx)
{
	lwp_t *l = curlwp;
	int nlocks;

	KASSERT(mutex_owned(mtx));

	nlocks = cv_enter(cv, mtx, l, false);
	(void)sleepq_block(0, false, &cv_syncobj, nlocks);
	mutex_enter(mtx);
}

/*
 * cv_wait_sig:
 *
 *	Wait on a condition variable until awoken or a signal is received.
 *	Will also return early if the process is exiting.  Returns zero if
 *	awoken normally, ERESTART if a signal was received and the system
 *	call is restartable, or EINTR otherwise.
 */
int
cv_wait_sig(kcondvar_t *cv, kmutex_t *mtx)
{
	lwp_t *l = curlwp;
	int error, nlocks;

	KASSERT(mutex_owned(mtx));

	nlocks = cv_enter(cv, mtx, l, true);
	error = sleepq_block(0, true, &cv_syncobj, nlocks);
	mutex_enter(mtx);
	return error;
}

/*
 * cv_timedwait:
 *
 *	Wait on a condition variable until awoken or the specified timeout
 *	expires.  Returns zero if awoken normally or EWOULDBLOCK if the
 *	timeout expired.
 *
 *	timo is a timeout in ticks.  timo = 0 specifies an infinite timeout.
 */
int
cv_timedwait(kcondvar_t *cv, kmutex_t *mtx, int timo)
{
	lwp_t *l = curlwp;
	int error, nlocks;

	KASSERT(mutex_owned(mtx));

	nlocks = cv_enter(cv, mtx, l, false);
	error = sleepq_block(timo, false, &cv_syncobj, nlocks);
	mutex_enter(mtx);
	return error;
}
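
/*
 * Sketch of a bounded wait built on cv_timedwait (sc and sc_done are
 * hypothetical illustration names; mstohz(9) converts milliseconds to
 * ticks):
 *
 *	int error = 0;
 *
 *	mutex_enter(&sc->sc_lock);
 *	while (!sc->sc_done && error != EWOULDBLOCK)
 *		error = cv_timedwait(&sc->sc_cv, &sc->sc_lock, mstohz(100));
 *	if (sc->sc_done)
 *		error = 0;
 *	mutex_exit(&sc->sc_lock);
 *
 * The condition is re-checked even after EWOULDBLOCK, because an
 * explicit wakeup can race with the timeout; treating the condition as
 * authoritative avoids losing a cv_signal.
 */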

/*
 * cv_timedwait_sig:
 *
 *	Wait on a condition variable until the timeout expires, until
 *	awoken, or until a signal is received.  Will also return early if
 *	the process is exiting.  Returns zero if awoken normally,
 *	EWOULDBLOCK if the timeout expires, ERESTART if a signal was
 *	received and the system call is restartable, or EINTR otherwise.
 *
 *	timo is a timeout in ticks.  timo = 0 specifies an infinite timeout.
 */
int
cv_timedwait_sig(kcondvar_t *cv, kmutex_t *mtx, int timo)
{
	lwp_t *l = curlwp;
	int error, nlocks;

	KASSERT(mutex_owned(mtx));

	nlocks = cv_enter(cv, mtx, l, true);
	error = sleepq_block(timo, true, &cv_syncobj, nlocks);
	mutex_enter(mtx);
	return error;
}

/*
 * Given a number of seconds, sec, and 2^64ths of a second, frac, we
 * want a number of ticks for a timeout:
 *
 *	timo = hz*(sec + frac/2^64)
 *	     = hz*sec + hz*frac/2^64
 *	     = hz*sec + hz*(frachi*2^32 + fraclo)/2^64
 *	     = hz*sec + hz*frachi/2^32 + hz*fraclo/2^64,
 *
 * where frachi is the high 32 bits of frac and fraclo is the
 * low 32 bits.
 *
 * We assume hz < INT_MAX/2 < UINT32_MAX, so
 *
 *	hz*fraclo/2^64 < fraclo*2^32/2^64 <= 1,
 *
 * since fraclo < 2^32.
 *
 * We clamp the result at INT_MAX/2 for a timeout in ticks, since we
 * can't represent timeouts higher than INT_MAX in cv_timedwait, and
 * spurious wakeup is OK.  Moreover, we don't want to wrap around,
 * because we compute end - start in ticks in order to compute the
 * remaining timeout, and that difference cannot wrap around, so we use
 * a timeout less than INT_MAX.  Using INT_MAX/2 provides plenty of
 * margin for paranoia and will exceed most waits in practice by far.
 */
static unsigned
bintime2timo(const struct bintime *bt)
{

	KASSERT(hz < INT_MAX/2);
	CTASSERT(INT_MAX/2 < UINT32_MAX);
	if (bt->sec > ((INT_MAX/2)/hz))
		return INT_MAX/2;
	if ((hz*(bt->frac >> 32) >> 32) > (INT_MAX/2 - hz*bt->sec))
		return INT_MAX/2;

	return hz*bt->sec + (hz*(bt->frac >> 32) >> 32);
}

/*
 * timo is in units of ticks.  We want units of seconds and 2^64ths of
 * a second.  There are hz ticks in 1 sec and 2^64 2^64ths of a second
 * in 1 sec, so there are 2^64/hz 2^64ths of a second in a tick.  So
 * for the fractional part, we compute
 *
 *	frac = rem * 2^64 / hz
 *	     = ((rem * 2^32) / hz) * 2^32
 *
 * Using truncating integer division instead of real division will
 * leave us with only about 32 bits of precision, which means about
 * 1/4-nanosecond resolution, which is good enough for our purposes.
 */
static struct bintime
timo2bintime(unsigned timo)
{

	return (struct bintime) {
		.sec = timo / hz,
		.frac = (((uint64_t)(timo % hz) << 32)/hz << 32),
	};
}
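
/*
 * Worked example of the two conversions, assuming hz = 100 purely for
 * illustration.  A timeout of 1.5 s is bt = { .sec = 1, .frac = 2^63 }.
 * bintime2timo computes hz*sec + (hz*(frac >> 32) >> 32)
 * = 100 + (100*2^31)/2^32 = 100 + 50 = 150 ticks.  Converting back,
 * timo2bintime(150) yields sec = 150/100 = 1 and
 * frac = ((50 << 32)/100) << 32 = 2^63, i.e. exactly 1.5 s again.  In
 * general the round trip is not exact: the truncating divisions can
 * lose up to about 1/4 ns, on top of the tick granularity.
 */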

/*
 * cv_timedwaitbt:
 *
 *	Wait on a condition variable until awoken or the specified
 *	timeout expires.  Returns zero if awoken normally or
 *	EWOULDBLOCK if the timeout expires.
 *
 *	On entry, bt is a timeout in bintime.  cv_timedwaitbt subtracts
 *	the time slept, so on exit, bt is the time remaining after
 *	sleeping, clamped to zero if the complete time has elapsed.
 *	No infinite timeout; use cv_wait instead.
 *
 *	epsilon is a requested maximum error in the timeout (excluding
 *	spurious wakeups).  It is currently unused; in the future it
 *	will be used to choose between low- and high-resolution timers.
 *	The actual wakeup time will be somewhere in [t, t + max(e, r) + s),
 *	where t is the requested timeout, e is epsilon, r is the finest
 *	resolution of the clock available, and s covers scheduling delays
 *	from scheduler overhead and competing threads.  Time is measured
 *	by the interrupt source implementing the timeout, not by another
 *	timecounter.
 */
int
cv_timedwaitbt(kcondvar_t *cv, kmutex_t *mtx, struct bintime *bt,
    const struct bintime *epsilon __diagused)
{
	struct bintime slept;
	unsigned start, end;
	int timo;
	int error;

	KASSERTMSG(bt->sec >= 0, "negative timeout");
	KASSERTMSG(epsilon != NULL, "specify maximum requested delay");

	/* If there's nothing left to wait for, time out.  */
	if (bt->sec == 0 && bt->frac == 0)
		return SET_ERROR(EWOULDBLOCK);

	/* Convert to ticks, but clamp to be >= 1.  */
	timo = bintime2timo(bt);
	KASSERTMSG(timo >= 0, "negative ticks: %d", timo);
	if (timo == 0)
		timo = 1;

	/*
	 * getticks() is technically int, but nothing special happens
	 * on overflow, so we assume two's-complement wraparound and
	 * just treat it as unsigned.
	 */
	start = getticks();
	error = cv_timedwait(cv, mtx, timo);
	end = getticks();

	/*
	 * Set bt to the time left, or zero, whichever is larger.  We
	 * do not fail with EWOULDBLOCK here because this may have been
	 * an explicit wakeup, so the caller needs to check the
	 * condition before giving up, or else a cv_signal would be
	 * lost.
	 */
	slept = timo2bintime(end - start);
	if (bintimecmp(bt, &slept, <=)) {
		bt->sec = 0;
		bt->frac = 0;
	} else {
		/* bt := bt - slept */
		bintime_sub(bt, &slept);
	}

	return error;
}
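
/*
 * Sketch of the intended calling pattern for cv_timedwaitbt (sc and
 * sc_done are hypothetical illustration names).  Because bt is
 * decremented in place, retrying after a spurious wakeup cannot
 * stretch the overall deadline, and the final re-check of the
 * condition follows the advice in the comment above, since an
 * explicit wakeup may race with the timeout:
 *
 *	struct bintime bt = { .sec = 1, .frac = 0 };
 *	const struct bintime epsilon = { .sec = 0, .frac = 0 };
 *	int error = 0;
 *
 *	mutex_enter(&sc->sc_lock);
 *	while (!sc->sc_done && error == 0)
 *		error = cv_timedwaitbt(&sc->sc_cv, &sc->sc_lock,
 *		    &bt, &epsilon);
 *	if (sc->sc_done)
 *		error = 0;
 *	mutex_exit(&sc->sc_lock);
 */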

/*
 * cv_timedwaitbt_sig:
 *
 *	Wait on a condition variable until awoken, until the specified
 *	timeout expires, or until interrupted by a signal.  Returns
 *	zero if awoken normally, EWOULDBLOCK if the timeout expires, or
 *	EINTR/ERESTART if interrupted by a signal.
 *
 *	On entry, bt is a timeout in bintime.  cv_timedwaitbt_sig
 *	subtracts the time slept, so on exit, bt is the time remaining
 *	after sleeping, clamped to zero if the complete time has
 *	elapsed.  No infinite timeout; use cv_wait_sig instead.
 *
 *	epsilon is a requested maximum error in the timeout (excluding
 *	spurious wakeups).  It is currently unused; in the future it
 *	will be used to choose between low- and high-resolution timers.
 */
int
cv_timedwaitbt_sig(kcondvar_t *cv, kmutex_t *mtx, struct bintime *bt,
    const struct bintime *epsilon __diagused)
{
	struct bintime slept;
	unsigned start, end;
	int timo;
	int error;

	KASSERTMSG(bt->sec >= 0, "negative timeout");
	KASSERTMSG(epsilon != NULL, "specify maximum requested delay");

	/* If there's nothing left to wait for, time out.  */
	if (bt->sec == 0 && bt->frac == 0)
		return SET_ERROR(EWOULDBLOCK);

	/* Convert to ticks, but clamp to be >= 1.  */
	timo = bintime2timo(bt);
	KASSERTMSG(timo >= 0, "negative ticks: %d", timo);
	if (timo == 0)
		timo = 1;

	/*
	 * getticks() is technically int, but nothing special happens
	 * on overflow, so we assume two's-complement wraparound and
	 * just treat it as unsigned.
	 */
	start = getticks();
	error = cv_timedwait_sig(cv, mtx, timo);
	end = getticks();

	/*
	 * Set bt to the time left, or zero, whichever is larger.  We
	 * do not fail with EWOULDBLOCK here because this may have been
	 * an explicit wakeup, so the caller needs to check the
	 * condition before giving up, or else a cv_signal would be
	 * lost.
	 */
	slept = timo2bintime(end - start);
	if (bintimecmp(bt, &slept, <=)) {
		bt->sec = 0;
		bt->frac = 0;
	} else {
		/* bt := bt - slept */
		bintime_sub(bt, &slept);
	}

	return error;
}

/*
 * cv_signal:
 *
 *	Wake the highest priority LWP waiting on a condition variable.  Must
 *	be called with the interlocking mutex held or just after it has been
 *	released (so the awoken LWP will see the changed condition).
 */
void
cv_signal(kcondvar_t *cv)
{

	KASSERT(cv_is_valid(cv));

	if (__predict_false(!LIST_EMPTY(CV_SLEEPQ(cv)))) {
		/*
		 * The compiler usually turns this into a tail call
		 * (i.e., a jmp), because the arguments are the same
		 * and there are no locals.
		 */
		cv_wakeup_one(cv);
	}
}

/*
 * cv_wakeup_one:
 *
 *	Slow path for cv_signal().  Deliberately marked __noinline to
 *	prevent the compiler from pulling it into cv_signal(), which
 *	would add extra prologue and epilogue code.
 */
static __noinline void
cv_wakeup_one(kcondvar_t *cv)
{
	sleepq_t *sq;
	kmutex_t *mp;
	lwp_t *l;

	mp = sleepq_hashlock(cv);
	sq = CV_SLEEPQ(cv);
	if (__predict_true((l = LIST_FIRST(sq)) != NULL)) {
		KASSERT(l->l_sleepq == sq);
		KASSERT(l->l_mutex == mp);
		KASSERT(l->l_wchan == cv);
		sleepq_remove(sq, l, true);
	}
	mutex_spin_exit(mp);
}

/*
 * cv_broadcast:
 *
 *	Wake all LWPs waiting on a condition variable.  Must be called with
 *	the interlocking mutex held or just after it has been released (so
 *	the awoken LWPs will see the changed condition).
 */
void
cv_broadcast(kcondvar_t *cv)
{

	KASSERT(cv_is_valid(cv));

	if (__predict_false(!LIST_EMPTY(CV_SLEEPQ(cv)))) {
		/*
		 * The compiler usually turns this into a tail call
		 * (i.e., a jmp), because the arguments are the same
		 * and there are no locals.
		 */
		cv_wakeup_all(cv);
	}
}

/*
 * cv_wakeup_all:
 *
 *	Slow path for cv_broadcast().  Deliberately marked __noinline to
 *	prevent the compiler from pulling it into cv_broadcast(), which
 *	would add extra prologue and epilogue code.
 */
static __noinline void
cv_wakeup_all(kcondvar_t *cv)
{
	sleepq_t *sq;
	kmutex_t *mp;
	lwp_t *l;

	mp = sleepq_hashlock(cv);
	sq = CV_SLEEPQ(cv);
	while ((l = LIST_FIRST(sq)) != NULL) {
		KASSERT(l->l_sleepq == sq);
		KASSERT(l->l_mutex == mp);
		KASSERT(l->l_wchan == cv);
		sleepq_remove(sq, l, true);
	}
	mutex_spin_exit(mp);
}

/*
 * cv_has_waiters:
 *
 *	For diagnostic assertions: return non-zero if a condition
 *	variable has waiters.
 */
bool
cv_has_waiters(kcondvar_t *cv)
{

	return !LIST_EMPTY(CV_SLEEPQ(cv));
}

/*
 * cv_is_valid:
 *
 *	For diagnostic assertions: return non-zero if a condition
 *	variable appears to be valid.  No locks need be held.
 */
bool
cv_is_valid(kcondvar_t *cv)
{

	return CV_WMESG(cv) != deadcv && CV_WMESG(cv) != NULL;
}
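
/*
 * Wake-side sketch, pairing with the wait loops illustrated earlier
 * (sc and sc_done are hypothetical illustration names).  Signalling
 * while the interlock is held is always safe; signalling just after
 * release is also permitted, as noted at cv_signal() above:
 *
 *	mutex_enter(&sc->sc_lock);
 *	sc->sc_done = true;
 *	cv_broadcast(&sc->sc_cv);
 *	mutex_exit(&sc->sc_lock);
 */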