/*	$NetBSD: kern_condvar.c,v 1.43 2020/02/15 17:09:24 ad Exp $	*/

/*-
 * Copyright (c) 2006, 2007, 2008, 2019, 2020 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Andrew Doran.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Kernel condition variable implementation.
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: kern_condvar.c,v 1.43 2020/02/15 17:09:24 ad Exp $");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/lwp.h>
#include <sys/condvar.h>
#include <sys/sleepq.h>
#include <sys/lockdebug.h>
#include <sys/cpu.h>
#include <sys/kernel.h>

/*
 * Accessors for the private contents of the kcondvar_t data type.
 *
 *	cv_opaque[0]	sleepq...
 *	cv_opaque[1]	...pointers
 *	cv_opaque[2]	description for ps(1)
 *
 * cv_opaque[0..1] is protected by the interlock passed to cv_wait() (enqueue
 * only), and the sleep queue lock acquired with sleepq_hashlock() (enqueue
 * and dequeue).
 *
 * cv_opaque[2] (the wmesg) is static and does not change throughout the life
 * of the CV.
 */
#define	CV_SLEEPQ(cv)		((sleepq_t *)(cv)->cv_opaque)
#define	CV_WMESG(cv)		((const char *)(cv)->cv_opaque[2])
#define	CV_SET_WMESG(cv, v) 	(cv)->cv_opaque[2] = __UNCONST(v)

#define	CV_DEBUG_P(cv)	(CV_WMESG(cv) != nodebug)
#define	CV_RA		((uintptr_t)__builtin_return_address(0))

static void		cv_unsleep(lwp_t *, bool);
static inline void	cv_wakeup_one(kcondvar_t *);
static inline void	cv_wakeup_all(kcondvar_t *);

syncobj_t cv_syncobj = {
	.sobj_flag	= SOBJ_SLEEPQ_SORTED,
	.sobj_unsleep	= cv_unsleep,
	.sobj_changepri	= sleepq_changepri,
	.sobj_lendpri	= sleepq_lendpri,
	.sobj_owner	= syncobj_noowner,
};

lockops_t cv_lockops = {
	.lo_name = "Condition variable",
	.lo_type = LOCKOPS_CV,
	.lo_dump = NULL,
};

static const char deadcv[] = "deadcv";
#ifdef LOCKDEBUG
static const char nodebug[] = "nodebug";

#define CV_LOCKDEBUG_HANDOFF(l, cv) cv_lockdebug_handoff(l, cv)
#define CV_LOCKDEBUG_PROCESS(l, cv) cv_lockdebug_process(l, cv)

static inline void
cv_lockdebug_handoff(lwp_t *l, kcondvar_t *cv)
{

	if (CV_DEBUG_P(cv))
		l->l_flag |= LW_CVLOCKDEBUG;
}

static inline void
cv_lockdebug_process(lwp_t *l, kcondvar_t *cv)
{

	if ((l->l_flag & LW_CVLOCKDEBUG) == 0)
		return;

	l->l_flag &= ~LW_CVLOCKDEBUG;
	LOCKDEBUG_UNLOCKED(true, cv, CV_RA, 0);
}
#else
#define CV_LOCKDEBUG_HANDOFF(l, cv) __nothing
#define CV_LOCKDEBUG_PROCESS(l, cv) __nothing
#endif

/*
 * cv_init:
 *
 *	Initialize a condition variable for use.
 */
void
cv_init(kcondvar_t *cv, const char *wmesg)
{
#ifdef LOCKDEBUG
	bool dodebug;

	dodebug = LOCKDEBUG_ALLOC(cv, &cv_lockops,
	    (uintptr_t)__builtin_return_address(0));
	if (!dodebug) {
		/* XXX This will break vfs_lockf. */
		wmesg = nodebug;
	}
#endif
	KASSERT(wmesg != NULL);
	CV_SET_WMESG(cv, wmesg);
	sleepq_init(CV_SLEEPQ(cv));
}

/*
 * cv_destroy:
 *
 *	Tear down a condition variable.
 */
void
cv_destroy(kcondvar_t *cv)
{

	LOCKDEBUG_FREE(CV_DEBUG_P(cv), cv);
#ifdef DIAGNOSTIC
	KASSERT(cv_is_valid(cv));
	CV_SET_WMESG(cv, deadcv);
#endif
}
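
/*
 * Illustrative sketch of the CV lifecycle; not part of this file.  The
 * softc and its members (sc, sc_lock, sc_cv) are hypothetical names used
 * only for the example.
 *
 *	attach:
 *		mutex_init(&sc->sc_lock, MUTEX_DEFAULT, IPL_NONE);
 *		cv_init(&sc->sc_cv, "examplecv");
 *
 *	detach, once no LWP can still be waiting on the CV:
 *		cv_destroy(&sc->sc_cv);
 *		mutex_destroy(&sc->sc_lock);
 */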

/*
 * cv_enter:
 *
 *	Look up and lock the sleep queue corresponding to the given
 *	condition variable, and increment the number of waiters.
 */
static inline void
cv_enter(kcondvar_t *cv, kmutex_t *mtx, lwp_t *l)
{
	sleepq_t *sq;
	kmutex_t *mp;

	KASSERT(cv_is_valid(cv));
	KASSERT(!cpu_intr_p());
	KASSERT((l->l_pflag & LP_INTR) == 0 || panicstr != NULL);

	LOCKDEBUG_LOCKED(CV_DEBUG_P(cv), cv, mtx, CV_RA, 0);

	l->l_kpriority = true;
	mp = sleepq_hashlock(cv);
	sq = CV_SLEEPQ(cv);
	sleepq_enter(sq, l, mp);
	sleepq_enqueue(sq, cv, CV_WMESG(cv), &cv_syncobj);
	mutex_exit(mtx);
	KASSERT(cv_has_waiters(cv));
}

/*
 * cv_exit:
 *
 *	After resuming execution, check to see if we have been restarted
 *	as a result of cv_signal().  If we have, but cannot take the
 *	wakeup (because of, e.g., a pending Unix signal or timeout), then
 *	try to ensure that another LWP sees it.  This is necessary because
 *	there may be multiple waiters, and at least one should take the
 *	wakeup if possible.
 */
static inline int
cv_exit(kcondvar_t *cv, kmutex_t *mtx, lwp_t *l, const int error)
{

	mutex_enter(mtx);
	if (__predict_false(error != 0))
		cv_signal(cv);

	LOCKDEBUG_UNLOCKED(CV_DEBUG_P(cv), cv, CV_RA, 0);
	KASSERT(cv_is_valid(cv));

	return error;
}

/*
 * cv_unsleep:
 *
 *	Remove an LWP from the condition variable and sleep queue.  This
 *	is called when the LWP has not been awoken normally but instead
 *	interrupted: for example, when a signal is received.  Must be
 *	called with the LWP locked.  Will unlock if "unlock" is true.
 */
static void
cv_unsleep(lwp_t *l, bool unlock)
{
	kcondvar_t *cv __diagused;

	cv = (kcondvar_t *)(uintptr_t)l->l_wchan;

	KASSERT(l->l_wchan == (wchan_t)cv);
	KASSERT(l->l_sleepq == CV_SLEEPQ(cv));
	KASSERT(cv_is_valid(cv));
	KASSERT(cv_has_waiters(cv));

	sleepq_unsleep(l, unlock);
}

/*
 * cv_wait:
 *
 *	Wait non-interruptibly on a condition variable until awoken.
 */
void
cv_wait(kcondvar_t *cv, kmutex_t *mtx)
{
	lwp_t *l = curlwp;

	KASSERT(mutex_owned(mtx));

	cv_enter(cv, mtx, l);

	/*
	 * We can't use cv_exit() here since the cv might be destroyed before
	 * this thread gets a chance to run.  Instead, hand off the lockdebug
	 * responsibility to the thread that wakes us up.
	 */

	CV_LOCKDEBUG_HANDOFF(l, cv);
	(void)sleepq_block(0, false);
	mutex_enter(mtx);
}
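
/*
 * Illustrative sketch of the canonical wait pattern; not part of this
 * file.  The condition must be re-tested in a loop because wakeups may
 * be spurious.  sc, sc_lock, sc_cv and sc_ready are hypothetical.
 *
 *	mutex_enter(&sc->sc_lock);
 *	while (!sc->sc_ready)
 *		cv_wait(&sc->sc_cv, &sc->sc_lock);
 *	... consume the now-true condition ...
 *	mutex_exit(&sc->sc_lock);
 */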

/*
 * cv_wait_sig:
 *
 *	Wait on a condition variable until awoken or a signal is received.
 *	Will also return early if the process is exiting.  Returns zero if
 *	awoken normally, ERESTART if a signal was received and the system
 *	call is restartable, or EINTR otherwise.
 */
int
cv_wait_sig(kcondvar_t *cv, kmutex_t *mtx)
{
	lwp_t *l = curlwp;
	int error;

	KASSERT(mutex_owned(mtx));

	cv_enter(cv, mtx, l);
	error = sleepq_block(0, true);
	return cv_exit(cv, mtx, l, error);
}
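
/*
 * Illustrative sketch of an interruptible wait; not part of this file.
 * The error from cv_wait_sig() (EINTR or ERESTART) is typically
 * propagated to the caller.  sc, sc_lock, sc_cv and sc_ready are
 * hypothetical.
 *
 *	mutex_enter(&sc->sc_lock);
 *	while (!sc->sc_ready) {
 *		error = cv_wait_sig(&sc->sc_cv, &sc->sc_lock);
 *		if (error != 0) {
 *			mutex_exit(&sc->sc_lock);
 *			return error;
 *		}
 *	}
 *	mutex_exit(&sc->sc_lock);
 */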

/*
 * cv_timedwait:
 *
 *	Wait on a condition variable until awoken or the specified timeout
 *	expires.  Returns zero if awoken normally or EWOULDBLOCK if the
 *	timeout expired.
 *
 *	timo is a timeout in ticks.  timo = 0 specifies an infinite timeout.
 */
int
cv_timedwait(kcondvar_t *cv, kmutex_t *mtx, int timo)
{
	lwp_t *l = curlwp;
	int error;

	KASSERT(mutex_owned(mtx));

	cv_enter(cv, mtx, l);
	error = sleepq_block(timo, false);
	return cv_exit(cv, mtx, l, error);
}
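
/*
 * Illustrative sketch of a bounded wait; not part of this file.  The
 * tick count is derived with mstohz(); zero means awoken, EWOULDBLOCK
 * means the timeout expired.  Names other than mstohz() are
 * hypothetical.
 *
 *	mutex_enter(&sc->sc_lock);
 *	while (!sc->sc_ready) {
 *		error = cv_timedwait(&sc->sc_cv, &sc->sc_lock, mstohz(100));
 *		if (error == EWOULDBLOCK)
 *			break;		timed out: re-check or give up
 *	}
 *	mutex_exit(&sc->sc_lock);
 */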

/*
 * cv_timedwait_sig:
 *
 *	Wait on a condition variable until awoken, a signal is received,
 *	or the specified timeout expires.  Will also return early if the
 *	process is exiting.  Returns zero if awoken normally, EWOULDBLOCK
 *	if the timeout expires, ERESTART if a signal was received and the
 *	system call is restartable, or EINTR otherwise.
 *
 *	timo is a timeout in ticks.  timo = 0 specifies an infinite timeout.
 */
int
cv_timedwait_sig(kcondvar_t *cv, kmutex_t *mtx, int timo)
{
	lwp_t *l = curlwp;
	int error;

	KASSERT(mutex_owned(mtx));

	cv_enter(cv, mtx, l);
	error = sleepq_block(timo, true);
	return cv_exit(cv, mtx, l, error);
}
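
/*
 * Illustrative sketch of the possible outcomes of cv_timedwait_sig()
 * and how a caller might triage them; not part of this file.  Names
 * are hypothetical.
 *
 *	error = cv_timedwait_sig(&sc->sc_cv, &sc->sc_lock, mstohz(100));
 *	switch (error) {
 *	case 0:			awoken normally: re-check the condition
 *		break;
 *	case EWOULDBLOCK:	timeout expired
 *		break;
 *	case EINTR:
 *	case ERESTART:		interrupted by a signal: unwind and return
 *		break;
 *	}
 */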

/*
 * Given a number of seconds, sec, and 2^64ths of a second, frac, we
 * want a number of ticks for a timeout:
 *
 *	timo = hz*(sec + frac/2^64)
 *	     = hz*sec + hz*frac/2^64
 *	     = hz*sec + hz*(frachi*2^32 + fraclo)/2^64
 *	     = hz*sec + hz*frachi/2^32 + hz*fraclo/2^64,
 *
 * where frachi is the high 32 bits of frac and fraclo is the
 * low 32 bits.
 *
 * We assume hz < INT_MAX/2 < UINT32_MAX, so
 *
 *	hz*fraclo/2^64 < fraclo*2^32/2^64 <= 1,
 *
 * since fraclo < 2^32.
 *
 * We clamp the result at INT_MAX/2 for a timeout in ticks, since we
 * can't represent timeouts higher than INT_MAX in cv_timedwait, and
 * spurious wakeup is OK.  Moreover, we don't want to wrap around,
 * because we compute end - start in ticks in order to compute the
 * remaining timeout, and that difference cannot wrap around, so we use
 * a timeout less than INT_MAX.  Using INT_MAX/2 provides plenty of
 * margin for paranoia and will exceed most waits in practice by far.
 */
static unsigned
bintime2timo(const struct bintime *bt)
{

	KASSERT(hz < INT_MAX/2);
	CTASSERT(INT_MAX/2 < UINT32_MAX);
	if (bt->sec > ((INT_MAX/2)/hz))
		return INT_MAX/2;
	if ((hz*(bt->frac >> 32) >> 32) > (INT_MAX/2 - hz*bt->sec))
		return INT_MAX/2;

	return hz*bt->sec + (hz*(bt->frac >> 32) >> 32);
}
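
/*
 * Worked example of the conversion above, for illustration only: with
 * hz = 100 and bt = 1.5 seconds (sec = 1, frac = 2^63),
 *
 *	timo = hz*sec + (hz*(frac >> 32) >> 32)
 *	     = 100*1 + ((100*2^31) >> 32)
 *	     = 100 + 50
 *	     = 150 ticks.
 */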

/*
 * timo is in units of ticks.  We want units of seconds and 2^64ths of
 * a second.  We know hz = 1 sec/tick, and 2^64 = 1 sec/(2^64th of a
 * second), from which we can conclude 2^64 / hz = 1 (2^64th of a
 * second)/tick.  So for the fractional part, we compute
 *
 *	frac = rem * 2^64 / hz
 *	     = ((rem * 2^32) / hz) * 2^32
 *
 * Using truncating integer division instead of real division will
 * leave us with only about 32 bits of precision, which means about
 * 1/4-nanosecond resolution, which is good enough for our purposes.
 */
static struct bintime
timo2bintime(unsigned timo)
{

	return (struct bintime) {
		.sec = timo / hz,
		.frac = (((uint64_t)(timo % hz) << 32)/hz << 32),
	};
}
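
/*
 * Worked example of the reverse conversion, for illustration only: with
 * hz = 100 and timo = 150 ticks,
 *
 *	sec  = 150 / 100 = 1
 *	frac = ((150 % 100) * 2^32 / 100) * 2^32
 *	     = (50 * 2^32 / 100) * 2^32
 *	     = 2^63 (half a second),
 *
 * so the result is 1.5 seconds, matching the example above.
 */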

/*
 * cv_timedwaitbt:
 *
 *	Wait on a condition variable until awoken or the specified
 *	timeout expires.  Returns zero if awoken normally or
 *	EWOULDBLOCK if the timeout expires.
 *
 *	On entry, bt is a timeout in bintime.  cv_timedwaitbt subtracts
 *	the time slept, so on exit, bt is the time remaining after
 *	sleeping, possibly negative if the complete time has elapsed.
 *	No infinite timeout; use cv_wait instead.
 *
 *	epsilon is a requested maximum error in timeout (excluding
 *	spurious wakeups).  Currently not used; it will be used in the
 *	future to choose between low- and high-resolution timers.
 *	Actual wakeup time will be somewhere in [t, t + max(e, r) + s)
 *	where r is the finest resolution of clock available and s is
 *	scheduling delays for scheduler overhead and competing threads.
 *	Time is measured by the interrupt source implementing the
 *	timeout, not by another timecounter.
 */
int
cv_timedwaitbt(kcondvar_t *cv, kmutex_t *mtx, struct bintime *bt,
    const struct bintime *epsilon __diagused)
{
	struct bintime slept;
	unsigned start, end;
	int error;

	KASSERTMSG(bt->sec >= 0, "negative timeout");
	KASSERTMSG(epsilon != NULL, "specify maximum requested delay");

	/*
	 * hardclock_ticks is technically int, but nothing special
	 * happens on overflow, so we assume two's-complement
	 * wraparound and just treat it as unsigned.
	 */
	start = hardclock_ticks;
	error = cv_timedwait(cv, mtx, bintime2timo(bt));
	end = hardclock_ticks;

	slept = timo2bintime(end - start);
	/* bt := bt - slept */
	bintime_sub(bt, &slept);

	return error;
}
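
/*
 * Illustrative sketch of a wait with a high-resolution timeout that is
 * decremented across spurious wakeups; not part of this file.  The use
 * of timespec2bintime() assumes the usual <sys/time.h> helper; the other
 * names are hypothetical.
 *
 *	struct timespec ts = { .tv_sec = 0, .tv_nsec = 100000000 };
 *	struct bintime bt, epsilon;
 *
 *	timespec2bintime(&ts, &bt);
 *	epsilon = bt;
 *	mutex_enter(&sc->sc_lock);
 *	while (!sc->sc_ready) {
 *		error = cv_timedwaitbt(&sc->sc_cv, &sc->sc_lock, &bt,
 *		    &epsilon);
 *		if (error == EWOULDBLOCK)
 *			break;
 *	}
 *	mutex_exit(&sc->sc_lock);
 */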

/*
 * cv_timedwaitbt_sig:
 *
 *	Wait on a condition variable until awoken, the specified
 *	timeout expires, or interrupted by a signal.  Returns zero if
 *	awoken normally, EWOULDBLOCK if the timeout expires, or
 *	EINTR/ERESTART if interrupted by a signal.
 *
 *	On entry, bt is a timeout in bintime.  cv_timedwaitbt_sig
 *	subtracts the time slept, so on exit, bt is the time remaining
 *	after sleeping.  No infinite timeout; use cv_wait_sig instead.
 *
 *	epsilon is a requested maximum error in timeout (excluding
 *	spurious wakeups).  Currently not used; it will be used in the
 *	future to choose between low- and high-resolution timers.
 */
int
cv_timedwaitbt_sig(kcondvar_t *cv, kmutex_t *mtx, struct bintime *bt,
    const struct bintime *epsilon __diagused)
{
	struct bintime slept;
	unsigned start, end;
	int error;

	KASSERTMSG(bt->sec >= 0, "negative timeout");
	KASSERTMSG(epsilon != NULL, "specify maximum requested delay");

	/*
	 * hardclock_ticks is technically int, but nothing special
	 * happens on overflow, so we assume two's-complement
	 * wraparound and just treat it as unsigned.
	 */
	start = hardclock_ticks;
	error = cv_timedwait_sig(cv, mtx, bintime2timo(bt));
	end = hardclock_ticks;

	slept = timo2bintime(end - start);
	/* bt := bt - slept */
	bintime_sub(bt, &slept);

	return error;
}

/*
 * cv_signal:
 *
 *	Wake the highest priority LWP waiting on a condition variable.
 *	Must be called with the interlocking mutex held.
 */
void
cv_signal(kcondvar_t *cv)
{

	/* LOCKDEBUG_WAKEUP(CV_DEBUG_P(cv), cv, CV_RA); */
	KASSERT(cv_is_valid(cv));

	if (__predict_false(!TAILQ_EMPTY(CV_SLEEPQ(cv))))
		cv_wakeup_one(cv);
}
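
/*
 * Illustrative sketch of the producer side matching the cv_wait()
 * example above; not part of this file.  The condition is changed and
 * the CV is signalled with the interlock held.  Names are hypothetical;
 * cv_broadcast() would be used instead to wake all waiters.
 *
 *	mutex_enter(&sc->sc_lock);
 *	sc->sc_ready = true;
 *	cv_signal(&sc->sc_cv);
 *	mutex_exit(&sc->sc_lock);
 */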

/*
 * cv_wakeup_one:
 *
 *	Slow path for cv_signal().  Deliberately marked __noinline to
 *	prevent the compiler pulling it in to cv_signal(), which adds
 *	extra prologue and epilogue code.
 */
static __noinline void
cv_wakeup_one(kcondvar_t *cv)
{
	sleepq_t *sq;
	kmutex_t *mp;
	lwp_t *l;

	KASSERT(cv_is_valid(cv));

	mp = sleepq_hashlock(cv);
	sq = CV_SLEEPQ(cv);
	l = TAILQ_FIRST(sq);
	if (__predict_false(l == NULL)) {
		mutex_spin_exit(mp);
		return;
	}
	KASSERT(l->l_sleepq == sq);
	KASSERT(l->l_mutex == mp);
	KASSERT(l->l_wchan == cv);
	CV_LOCKDEBUG_PROCESS(l, cv);
	sleepq_remove(sq, l);
	mutex_spin_exit(mp);

	KASSERT(cv_is_valid(cv));
}

/*
 * cv_broadcast:
 *
 *	Wake all LWPs waiting on a condition variable.  Must be called
 *	with the interlocking mutex held.
 */
void
cv_broadcast(kcondvar_t *cv)
{

	/* LOCKDEBUG_WAKEUP(CV_DEBUG_P(cv), cv, CV_RA); */
	KASSERT(cv_is_valid(cv));

	if (__predict_false(!TAILQ_EMPTY(CV_SLEEPQ(cv))))
		cv_wakeup_all(cv);
}

/*
 * cv_wakeup_all:
 *
 *	Slow path for cv_broadcast().  Deliberately marked __noinline to
 *	prevent the compiler pulling it in to cv_broadcast(), which adds
 *	extra prologue and epilogue code.
 */
static __noinline void
cv_wakeup_all(kcondvar_t *cv)
{
	sleepq_t *sq;
	kmutex_t *mp;
	lwp_t *l, *next;

	KASSERT(cv_is_valid(cv));

	mp = sleepq_hashlock(cv);
	sq = CV_SLEEPQ(cv);
	for (l = TAILQ_FIRST(sq); l != NULL; l = next) {
		KASSERT(l->l_sleepq == sq);
		KASSERT(l->l_mutex == mp);
		KASSERT(l->l_wchan == cv);
		next = TAILQ_NEXT(l, l_sleepchain);
		CV_LOCKDEBUG_PROCESS(l, cv);
		sleepq_remove(sq, l);
	}
	mutex_spin_exit(mp);

	KASSERT(cv_is_valid(cv));
}

/*
 * cv_has_waiters:
 *
 *	For diagnostic assertions: return non-zero if a condition
 *	variable has waiters.
 */
bool
cv_has_waiters(kcondvar_t *cv)
{

	return !TAILQ_EMPTY(CV_SLEEPQ(cv));
}

/*
 * cv_is_valid:
 *
 *	For diagnostic assertions: return non-zero if a condition
 *	variable appears to be valid.  No locks need be held.
 */
bool
cv_is_valid(kcondvar_t *cv)
{

	return CV_WMESG(cv) != deadcv && CV_WMESG(cv) != NULL;
}