/*	$NetBSD: kern_condvar.c,v 1.61 2023/10/15 10:27:11 riastradh Exp $	*/

/*-
 * Copyright (c) 2006, 2007, 2008, 2019, 2020, 2023
 *     The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Andrew Doran.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Kernel condition variable implementation.
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: kern_condvar.c,v 1.61 2023/10/15 10:27:11 riastradh Exp $");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/lwp.h>
#include <sys/condvar.h>
#include <sys/sleepq.h>
#include <sys/lockdebug.h>
#include <sys/cpu.h>
#include <sys/kernel.h>
#include <sys/syncobj.h>

/*
 * Accessors for the private contents of the kcondvar_t data type.
 *
 *	cv_opaque[0]	sleepq_t
 *	cv_opaque[1]	description for ps(1)
 *
 * cv_opaque[0] is protected by the interlock passed to cv_wait() (enqueue
 * only), and the sleep queue lock acquired with sleepq_hashlock() (enqueue
 * and dequeue).
 *
 * cv_opaque[1] (the wmesg) is static and does not change throughout the life
 * of the CV.
 */
#define	CV_SLEEPQ(cv)		((sleepq_t *)(cv)->cv_opaque)
#define	CV_WMESG(cv)		((const char *)(cv)->cv_opaque[1])
#define	CV_SET_WMESG(cv, v)	(cv)->cv_opaque[1] = __UNCONST(v)

#define	CV_DEBUG_P(cv)		(CV_WMESG(cv) != nodebug)
#define	CV_RA			((uintptr_t)__builtin_return_address(0))

static void		cv_unsleep(lwp_t *, bool);
static inline void	cv_wakeup_one(kcondvar_t *);
static inline void	cv_wakeup_all(kcondvar_t *);

syncobj_t cv_syncobj = {
	.sobj_name	= "cv",
	.sobj_flag	= SOBJ_SLEEPQ_SORTED,
	.sobj_boostpri	= PRI_KERNEL,
	.sobj_unsleep	= cv_unsleep,
	.sobj_changepri	= sleepq_changepri,
	.sobj_lendpri	= sleepq_lendpri,
	.sobj_owner	= syncobj_noowner,
};

static const char deadcv[] = "deadcv";

/*
 * cv_init:
 *
 *	Initialize a condition variable for use.
 */
void
cv_init(kcondvar_t *cv, const char *wmesg)
{

	KASSERT(wmesg != NULL);
	CV_SET_WMESG(cv, wmesg);
	sleepq_init(CV_SLEEPQ(cv));
}

/*
 * cv_destroy:
 *
 *	Tear down a condition variable.
 */
void
cv_destroy(kcondvar_t *cv)
{

	sleepq_destroy(CV_SLEEPQ(cv));
#ifdef DIAGNOSTIC
	KASSERT(cv_is_valid(cv));
	KASSERT(!cv_has_waiters(cv));
	CV_SET_WMESG(cv, deadcv);
#endif
}
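
/*
 * Illustrative sketch, not compiled into the kernel: the typical
 * lifetime of a condition variable paired with its interlock.  The
 * struct and field names (example_softc, sc_lock, sc_cv, sc_busy)
 * are invented for the example; the wmesg "exbusy" is what ps(1)
 * would show for sleeping waiters.
 */
#if 0
struct example_softc {
	kmutex_t	sc_lock;	/* interlock protecting sc_busy */
	kcondvar_t	sc_cv;		/* signalled when sc_busy clears */
	bool		sc_busy;
};

static void
example_attach(struct example_softc *sc)
{

	mutex_init(&sc->sc_lock, MUTEX_DEFAULT, IPL_NONE);
	cv_init(&sc->sc_cv, "exbusy");
	sc->sc_busy = false;
}

static void
example_detach(struct example_softc *sc)
{

	/* All waiters must be gone before the CV is torn down. */
	cv_destroy(&sc->sc_cv);
	mutex_destroy(&sc->sc_lock);
}
#endif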

/*
 * cv_enter:
 *
 *	Look up and lock the sleep queue corresponding to the given
 *	condition variable, and increment the number of waiters.
 */
static inline int
cv_enter(kcondvar_t *cv, kmutex_t *mtx, lwp_t *l, bool catch_p)
{
	sleepq_t *sq;
	kmutex_t *mp;
	int nlocks;

	KASSERT(cv_is_valid(cv));
	KASSERT(!cpu_intr_p());
	KASSERT((l->l_pflag & LP_INTR) == 0 || panicstr != NULL);

	mp = sleepq_hashlock(cv);
	sq = CV_SLEEPQ(cv);
	nlocks = sleepq_enter(sq, l, mp);
	sleepq_enqueue(sq, cv, CV_WMESG(cv), &cv_syncobj, catch_p);
	mutex_exit(mtx);
	KASSERT(cv_has_waiters(cv));
	return nlocks;
}

/*
 * cv_unsleep:
 *
 *	Remove an LWP from the condition variable and sleep queue.  This
 *	is called when the LWP has not been awoken normally but instead
 *	interrupted: for example, when a signal is received.  Must be
 *	called with the LWP locked.  Will unlock if "unlock" is true.
 */
static void
cv_unsleep(lwp_t *l, bool unlock)
{
	kcondvar_t *cv __diagused;

	cv = (kcondvar_t *)(uintptr_t)l->l_wchan;

	KASSERT(l->l_wchan == (wchan_t)cv);
	KASSERT(l->l_sleepq == CV_SLEEPQ(cv));
	KASSERT(cv_is_valid(cv));
	KASSERT(cv_has_waiters(cv));

	sleepq_unsleep(l, unlock);
}

/*
 * cv_wait:
 *
 *	Wait non-interruptibly on a condition variable until awoken.
 */
void
cv_wait(kcondvar_t *cv, kmutex_t *mtx)
{
	lwp_t *l = curlwp;
	int nlocks;

	KASSERT(mutex_owned(mtx));

	nlocks = cv_enter(cv, mtx, l, false);
	(void)sleepq_block(0, false, &cv_syncobj, nlocks);
	mutex_enter(mtx);
}
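
/*
 * Illustrative sketch, not compiled: the canonical wait loop around
 * cv_wait().  The condition is re-tested after every wakeup because
 * the wakeup may be spurious or another LWP may have consumed it
 * first.  Uses the invented example_softc from the sketch above.
 */
#if 0
static void
example_acquire(struct example_softc *sc)
{

	mutex_enter(&sc->sc_lock);
	while (sc->sc_busy)
		cv_wait(&sc->sc_cv, &sc->sc_lock);	/* drops sc_lock while asleep */
	sc->sc_busy = true;
	mutex_exit(&sc->sc_lock);
}
#endif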

/*
 * cv_wait_sig:
 *
 *	Wait on a condition variable until awoken or a signal is received.
 *	Will also return early if the process is exiting.  Returns zero if
 *	awoken normally, ERESTART if a signal was received and the system
 *	call is restartable, or EINTR otherwise.
 */
int
cv_wait_sig(kcondvar_t *cv, kmutex_t *mtx)
{
	lwp_t *l = curlwp;
	int error, nlocks;

	KASSERT(mutex_owned(mtx));

	nlocks = cv_enter(cv, mtx, l, true);
	error = sleepq_block(0, true, &cv_syncobj, nlocks);
	mutex_enter(mtx);
	return error;
}
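
/*
 * Illustrative sketch, not compiled: an interruptible wait.  A
 * non-zero return (ERESTART or EINTR) is propagated unchanged so
 * that the system call can be restarted or interrupted.  Uses the
 * invented example_softc from above.
 */
#if 0
static int
example_acquire_sig(struct example_softc *sc)
{
	int error;

	mutex_enter(&sc->sc_lock);
	while (sc->sc_busy) {
		error = cv_wait_sig(&sc->sc_cv, &sc->sc_lock);
		if (error != 0) {
			mutex_exit(&sc->sc_lock);
			return error;
		}
	}
	sc->sc_busy = true;
	mutex_exit(&sc->sc_lock);
	return 0;
}
#endif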

/*
 * cv_timedwait:
 *
 *	Wait on a condition variable until awoken or the specified timeout
 *	expires.  Returns zero if awoken normally or EWOULDBLOCK if the
 *	timeout expired.
 *
 *	timo is a timeout in ticks.  timo = 0 specifies an infinite timeout.
 */
int
cv_timedwait(kcondvar_t *cv, kmutex_t *mtx, int timo)
{
	lwp_t *l = curlwp;
	int error, nlocks;

	KASSERT(mutex_owned(mtx));

	nlocks = cv_enter(cv, mtx, l, false);
	error = sleepq_block(timo, false, &cv_syncobj, nlocks);
	mutex_enter(mtx);
	return error;
}
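
/*
 * Illustrative sketch, not compiled: bounding the wait with a tick
 * timeout.  mstohz() (from sys/param.h) converts milliseconds to
 * ticks.  EWOULDBLOCK only says the timeout fired, so the condition
 * is checked once more before giving up.  Note that this re-arms the
 * full timeout after every wakeup; cv_timedwaitbt() below keeps
 * track of the remaining time instead.  Uses the invented
 * example_softc from above.
 */
#if 0
static int
example_acquire_timed(struct example_softc *sc, u_int ms)
{
	int error;

	mutex_enter(&sc->sc_lock);
	while (sc->sc_busy) {
		error = cv_timedwait(&sc->sc_cv, &sc->sc_lock, mstohz(ms));
		if (error == EWOULDBLOCK && sc->sc_busy) {
			mutex_exit(&sc->sc_lock);
			return EWOULDBLOCK;
		}
	}
	sc->sc_busy = true;
	mutex_exit(&sc->sc_lock);
	return 0;
}
#endif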

/*
 * cv_timedwait_sig:
 *
 *	Wait on a condition variable until a timeout expires, the LWP
 *	is awoken, or a signal is received.  Will also return early if
 *	the process is exiting.  Returns zero if awoken normally,
 *	EWOULDBLOCK if the timeout expires, ERESTART if a signal was
 *	received and the system call is restartable, or EINTR otherwise.
 *
 *	timo is a timeout in ticks.  timo = 0 specifies an infinite timeout.
 */
int
cv_timedwait_sig(kcondvar_t *cv, kmutex_t *mtx, int timo)
{
	lwp_t *l = curlwp;
	int error, nlocks;

	KASSERT(mutex_owned(mtx));

	nlocks = cv_enter(cv, mtx, l, true);
	error = sleepq_block(timo, true, &cv_syncobj, nlocks);
	mutex_enter(mtx);
	return error;
}

/*
 * Given a number of seconds, sec, and 2^64ths of a second, frac, we
 * want a number of ticks for a timeout:
 *
 *	timo = hz*(sec + frac/2^64)
 *	     = hz*sec + hz*frac/2^64
 *	     = hz*sec + hz*(frachi*2^32 + fraclo)/2^64
 *	     = hz*sec + hz*frachi/2^32 + hz*fraclo/2^64,
 *
 * where frachi is the high 32 bits of frac and fraclo is the
 * low 32 bits.
 *
 * We assume hz < INT_MAX/2 < UINT32_MAX, so
 *
 *	hz*fraclo/2^64 < fraclo*2^32/2^64 <= 1,
 *
 * since fraclo < 2^32.
 *
 * We clamp the result at INT_MAX/2 for a timeout in ticks, since we
 * can't represent timeouts higher than INT_MAX in cv_timedwait, and
 * spurious wakeup is OK.  Moreover, we don't want to wrap around,
 * because we compute end - start in ticks in order to compute the
 * remaining timeout, and that difference cannot wrap around, so we use
 * a timeout less than INT_MAX.  Using INT_MAX/2 provides plenty of
 * margin for paranoia and will exceed most waits in practice by far.
 */
static unsigned
bintime2timo(const struct bintime *bt)
{

	KASSERT(hz < INT_MAX/2);
	CTASSERT(INT_MAX/2 < UINT32_MAX);
	if (bt->sec > ((INT_MAX/2)/hz))
		return INT_MAX/2;
	if ((hz*(bt->frac >> 32) >> 32) > (INT_MAX/2 - hz*bt->sec))
		return INT_MAX/2;

	return hz*bt->sec + (hz*(bt->frac >> 32) >> 32);
}

/*
 * timo is in units of ticks.  We want units of seconds and 2^64ths of
 * a second.  We know hz = 1 sec/tick, and 2^64 = 1 sec/(2^64th of a
 * second), from which we can conclude 2^64 / hz = 1 (2^64th of a
 * second)/tick.  So for the fractional part, we compute
 *
 *	frac = rem * 2^64 / hz
 *	     = ((rem * 2^32) / hz) * 2^32
 *
 * Using truncating integer division instead of real division will
 * leave us with only about 32 bits of precision, which means about
 * 1/4-nanosecond resolution, which is good enough for our purposes.
 */
static struct bintime
timo2bintime(unsigned timo)
{

	return (struct bintime) {
		.sec = timo / hz,
		.frac = (((uint64_t)(timo % hz) << 32)/hz << 32),
	};
}
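
/*
 * Worked example of the two conversions above, with hz = 100: a
 * timeout of 1.5 seconds is sec = 1, frac = 2^63, so frachi = 2^31
 * and
 *
 *	bintime2timo = 100*1 + (100*2^31 >> 32) = 100 + 50 = 150 ticks.
 *
 * Converting 150 ticks back, timo/hz = 1 and rem = 50, so
 *
 *	timo2bintime = { .sec = 1, .frac = ((50 << 32)/100) << 32 = 2^63 },
 *
 * i.e. 1.5 seconds again.  In general timo2bintime's truncating
 * division loses at most about 2^-32 of a second, matching the
 * roughly 1/4-nanosecond resolution noted above.
 */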

/*
 * cv_timedwaitbt:
 *
 *	Wait on a condition variable until awoken or the specified
 *	timeout expires.  Returns zero if awoken normally or
 *	EWOULDBLOCK if the timeout expires.
 *
 *	On entry, bt is a timeout in bintime.  cv_timedwaitbt subtracts
 *	the time slept, so on exit, bt is the time remaining after
 *	sleeping, possibly negative if the complete time has elapsed.
 *	No infinite timeout; use cv_wait instead.
 *
 *	epsilon is a requested maximum error in timeout (excluding
 *	spurious wakeups).  Currently not used, will be used in the
 *	future to choose between low- and high-resolution timers.
 *	Actual wakeup time will be somewhere in [t, t + max(e, r) + s)
 *	where r is the finest resolution of clock available and s is
 *	scheduling delays for scheduler overhead and competing threads.
 *	Time is measured by the interrupt source implementing the
 *	timeout, not by another timecounter.
 */
int
cv_timedwaitbt(kcondvar_t *cv, kmutex_t *mtx, struct bintime *bt,
    const struct bintime *epsilon __diagused)
{
	struct bintime slept;
	unsigned start, end;
	int timo;
	int error;

	KASSERTMSG(bt->sec >= 0, "negative timeout");
	KASSERTMSG(epsilon != NULL, "specify maximum requested delay");

	/* If there's nothing left to wait for, time out.  */
	if (bt->sec == 0 && bt->frac == 0)
		return EWOULDBLOCK;

	/* Convert to ticks, but clamp to be >=1.  */
	timo = bintime2timo(bt);
	KASSERTMSG(timo >= 0, "negative ticks: %d", timo);
	if (timo == 0)
		timo = 1;

	/*
	 * getticks() is technically int, but nothing special
	 * happens on overflow, so we assume two's-complement
	 * wraparound and just treat it as unsigned.
	 */
	start = getticks();
	error = cv_timedwait(cv, mtx, timo);
	end = getticks();

	/*
	 * Set it to the time left, or zero, whichever is larger.  We
	 * do not fail with EWOULDBLOCK here because this may have been
	 * an explicit wakeup, so the caller needs to check before they
	 * give up or else cv_signal would be lost.
	 */
	slept = timo2bintime(end - start);
	if (bintimecmp(bt, &slept, <=)) {
		bt->sec = 0;
		bt->frac = 0;
	} else {
		/* bt := bt - slept */
		bintime_sub(bt, &slept);
	}

	return error;
}
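
/*
 * Illustrative sketch, not compiled: waiting with an absolute time
 * budget.  Because cv_timedwaitbt() subtracts the time already
 * slept from bt, the loop carries the remaining budget across
 * spurious or early wakeups instead of re-arming the full timeout.
 * The 1.5 second budget, the roughly 1 ms epsilon and the
 * example_softc names are all invented for the example.
 */
#if 0
static int
example_acquire_bt(struct example_softc *sc)
{
	/* 1.5 seconds; frac is in units of 2^-64 of a second. */
	struct bintime bt = { .sec = 1, .frac = 1ULL << 63 };
	/* Accept roughly 1 ms (2^-10 s) of extra delay. */
	const struct bintime epsilon = { .sec = 0, .frac = 1ULL << 54 };
	int error = 0;

	mutex_enter(&sc->sc_lock);
	while (sc->sc_busy) {
		error = cv_timedwaitbt(&sc->sc_cv, &sc->sc_lock, &bt,
		    &epsilon);
		if (error == EWOULDBLOCK && sc->sc_busy)
			break;		/* budget exhausted, still busy */
	}
	if (!sc->sc_busy) {
		sc->sc_busy = true;
		error = 0;
	}
	mutex_exit(&sc->sc_lock);
	return error;
}
#endif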

/*
 * cv_timedwaitbt_sig:
 *
 *	Wait on a condition variable until awoken, the specified
 *	timeout expires, or interrupted by a signal.  Returns zero if
 *	awoken normally, EWOULDBLOCK if the timeout expires, or
 *	EINTR/ERESTART if interrupted by a signal.
 *
 *	On entry, bt is a timeout in bintime.  cv_timedwaitbt_sig
 *	subtracts the time slept, so on exit, bt is the time remaining
 *	after sleeping.  No infinite timeout; use cv_wait_sig instead.
 *
 *	epsilon is a requested maximum error in timeout (excluding
 *	spurious wakeups).  Currently not used, will be used in the
 *	future to choose between low- and high-resolution timers.
 */
int
cv_timedwaitbt_sig(kcondvar_t *cv, kmutex_t *mtx, struct bintime *bt,
    const struct bintime *epsilon __diagused)
{
	struct bintime slept;
	unsigned start, end;
	int timo;
	int error;

	KASSERTMSG(bt->sec >= 0, "negative timeout");
	KASSERTMSG(epsilon != NULL, "specify maximum requested delay");

	/* If there's nothing left to wait for, time out.  */
	if (bt->sec == 0 && bt->frac == 0)
		return EWOULDBLOCK;

	/* Convert to ticks, but clamp to be >=1.  */
	timo = bintime2timo(bt);
	KASSERTMSG(timo >= 0, "negative ticks: %d", timo);
	if (timo == 0)
		timo = 1;

	/*
	 * getticks() is technically int, but nothing special
	 * happens on overflow, so we assume two's-complement
	 * wraparound and just treat it as unsigned.
	 */
	start = getticks();
	error = cv_timedwait_sig(cv, mtx, timo);
	end = getticks();

	/*
	 * Set it to the time left, or zero, whichever is larger.  We
	 * do not fail with EWOULDBLOCK here because this may have been
	 * an explicit wakeup, so the caller needs to check before they
	 * give up or else cv_signal would be lost.
	 */
	slept = timo2bintime(end - start);
	if (bintimecmp(bt, &slept, <=)) {
		bt->sec = 0;
		bt->frac = 0;
	} else {
		/* bt := bt - slept */
		bintime_sub(bt, &slept);
	}

	return error;
}

/*
 * cv_signal:
 *
 *	Wake the highest priority LWP waiting on a condition variable.  Must
 *	be called with the interlocking mutex held or just after it has been
 *	released (so the awoken LWP will see the changed condition).
 */
void
cv_signal(kcondvar_t *cv)
{

	KASSERT(cv_is_valid(cv));

	if (__predict_false(!LIST_EMPTY(CV_SLEEPQ(cv)))) {
		/*
		 * The compiler usually turns this into a tail call (jmp),
		 * because the arguments are the same and there are no
		 * locals.
		 */
		cv_wakeup_one(cv);
	}
}
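
/*
 * Illustrative sketch, not compiled: the wakeup side.  The condition
 * is changed under the interlock and the CV signalled before the
 * lock is dropped, so a waiter cannot observe the old state after
 * missing the wakeup.  Uses the invented example_softc from above;
 * cv_broadcast() would be used instead if every waiter should
 * re-check the condition.
 */
#if 0
static void
example_release(struct example_softc *sc)
{

	mutex_enter(&sc->sc_lock);
	sc->sc_busy = false;
	cv_signal(&sc->sc_cv);
	mutex_exit(&sc->sc_lock);
}
#endif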

/*
 * cv_wakeup_one:
 *
 *	Slow path for cv_signal().  Deliberately marked __noinline to
 *	prevent the compiler pulling it in to cv_signal(), which adds
 *	extra prologue and epilogue code.
 */
static __noinline void
cv_wakeup_one(kcondvar_t *cv)
{
	sleepq_t *sq;
	kmutex_t *mp;
	lwp_t *l;

	mp = sleepq_hashlock(cv);
	sq = CV_SLEEPQ(cv);
	if (__predict_true((l = LIST_FIRST(sq)) != NULL)) {
		KASSERT(l->l_sleepq == sq);
		KASSERT(l->l_mutex == mp);
		KASSERT(l->l_wchan == cv);
		sleepq_remove(sq, l, true);
	}
	mutex_spin_exit(mp);
}

/*
 * cv_broadcast:
 *
 *	Wake all LWPs waiting on a condition variable.  Must be called with
 *	the interlocking mutex held or just after it has been released (so
 *	the awoken LWP will see the changed condition).
 */
void
cv_broadcast(kcondvar_t *cv)
{

	KASSERT(cv_is_valid(cv));

	if (__predict_false(!LIST_EMPTY(CV_SLEEPQ(cv)))) {
		/*
		 * The compiler usually turns this into a tail call (jmp),
		 * because the arguments are the same and there are no
		 * locals.
		 */
		cv_wakeup_all(cv);
	}
}

/*
 * cv_wakeup_all:
 *
 *	Slow path for cv_broadcast().  Deliberately marked __noinline to
 *	prevent the compiler pulling it in to cv_broadcast(), which adds
 *	extra prologue and epilogue code.
 */
static __noinline void
cv_wakeup_all(kcondvar_t *cv)
{
	sleepq_t *sq;
	kmutex_t *mp;
	lwp_t *l;

	mp = sleepq_hashlock(cv);
	sq = CV_SLEEPQ(cv);
	while ((l = LIST_FIRST(sq)) != NULL) {
		KASSERT(l->l_sleepq == sq);
		KASSERT(l->l_mutex == mp);
		KASSERT(l->l_wchan == cv);
		sleepq_remove(sq, l, true);
	}
	mutex_spin_exit(mp);
}

/*
 * cv_fdrestart:
 *
 *	Like cv_broadcast(), but make any LWPs that share the same file
 *	descriptor table as the caller return ERESTART when resuming.  Used
 *	to dislodge LWPs waiting for I/O that prevent a file descriptor from
 *	being closed, without upsetting access to the file (not descriptor)
 *	made from another direction.  Rarely used, so no fast path is
 *	provided.
 */
void
cv_fdrestart(kcondvar_t *cv)
{
	sleepq_t *sq;
	kmutex_t *mp;
	lwp_t *l;

	KASSERT(cv_is_valid(cv));

	if (LIST_EMPTY(CV_SLEEPQ(cv)))
		return;

	mp = sleepq_hashlock(cv);
	sq = CV_SLEEPQ(cv);
	while ((l = LIST_FIRST(sq)) != NULL) {
		KASSERT(l->l_sleepq == sq);
		KASSERT(l->l_mutex == mp);
		KASSERT(l->l_wchan == cv);
		/* l_fd stable at this point so no special locking needed.  */
		if (l->l_fd == curlwp->l_fd) {
			l->l_flag |= LW_RESTART;
			sleepq_remove(sq, l, false);
		}
	}
	mutex_spin_exit(mp);
}

/*
 * cv_has_waiters:
 *
 *	For diagnostic assertions: return non-zero if a condition
 *	variable has waiters.
 */
bool
cv_has_waiters(kcondvar_t *cv)
{

	return !LIST_EMPTY(CV_SLEEPQ(cv));
}

/*
 * cv_is_valid:
 *
 *	For diagnostic assertions: return non-zero if a condition
 *	variable appears to be valid.  No locks need be held.
 */
bool
cv_is_valid(kcondvar_t *cv)
{

	return CV_WMESG(cv) != deadcv && CV_WMESG(cv) != NULL;
}