      1 /*	$NetBSD: kern_synch.c,v 1.186.2.4 2007/04/05 21:38:37 ad Exp $	*/
      2 
      3 /*-
      4  * Copyright (c) 1999, 2000, 2004, 2006, 2007 The NetBSD Foundation, Inc.
      5  * All rights reserved.
      6  *
      7  * This code is derived from software contributed to The NetBSD Foundation
      8  * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
      9  * NASA Ames Research Center, by Charles M. Hannum, and by Andrew Doran.
     10  *
     11  * Redistribution and use in source and binary forms, with or without
     12  * modification, are permitted provided that the following conditions
     13  * are met:
     14  * 1. Redistributions of source code must retain the above copyright
     15  *    notice, this list of conditions and the following disclaimer.
     16  * 2. Redistributions in binary form must reproduce the above copyright
     17  *    notice, this list of conditions and the following disclaimer in the
     18  *    documentation and/or other materials provided with the distribution.
     19  * 3. All advertising materials mentioning features or use of this software
     20  *    must display the following acknowledgement:
     21  *	This product includes software developed by the NetBSD
     22  *	Foundation, Inc. and its contributors.
     23  * 4. Neither the name of The NetBSD Foundation nor the names of its
     24  *    contributors may be used to endorse or promote products derived
     25  *    from this software without specific prior written permission.
     26  *
     27  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
     28  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
     29  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     30  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
     31  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
     32  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
     33  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
     34  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
     35  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
     36  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
     37  * POSSIBILITY OF SUCH DAMAGE.
     38  */
     39 
     40 /*-
     41  * Copyright (c) 1982, 1986, 1990, 1991, 1993
     42  *	The Regents of the University of California.  All rights reserved.
     43  * (c) UNIX System Laboratories, Inc.
     44  * All or some portions of this file are derived from material licensed
     45  * to the University of California by American Telephone and Telegraph
     46  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
     47  * the permission of UNIX System Laboratories, Inc.
     48  *
     49  * Redistribution and use in source and binary forms, with or without
     50  * modification, are permitted provided that the following conditions
     51  * are met:
     52  * 1. Redistributions of source code must retain the above copyright
     53  *    notice, this list of conditions and the following disclaimer.
     54  * 2. Redistributions in binary form must reproduce the above copyright
     55  *    notice, this list of conditions and the following disclaimer in the
     56  *    documentation and/or other materials provided with the distribution.
     57  * 3. Neither the name of the University nor the names of its contributors
     58  *    may be used to endorse or promote products derived from this software
     59  *    without specific prior written permission.
     60  *
     61  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
     62  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     63  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     64  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
     65  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
     66  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
     67  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
     68  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
     69  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
     70  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
     71  * SUCH DAMAGE.
     72  *
     73  *	@(#)kern_synch.c	8.9 (Berkeley) 5/19/95
     74  */
     75 
     76 #include <sys/cdefs.h>
     77 __KERNEL_RCSID(0, "$NetBSD: kern_synch.c,v 1.186.2.4 2007/04/05 21:38:37 ad Exp $");
     78 
     79 #include "opt_ddb.h"
     80 #include "opt_kstack.h"
     81 #include "opt_lockdebug.h"
     82 #include "opt_multiprocessor.h"
     83 #include "opt_perfctrs.h"
     84 
     85 #define	__MUTEX_PRIVATE
     86 
     87 #include <sys/param.h>
     88 #include <sys/systm.h>
     89 #include <sys/callout.h>
     90 #include <sys/proc.h>
     91 #include <sys/kernel.h>
     92 #include <sys/buf.h>
     93 #if defined(PERFCTRS)
     94 #include <sys/pmc.h>
     95 #endif
     96 #include <sys/signalvar.h>
     97 #include <sys/resourcevar.h>
     98 #include <sys/sched.h>
     99 #include <sys/syscall_stats.h>
    100 #include <sys/kauth.h>
    101 #include <sys/sleepq.h>
    102 #include <sys/lockdebug.h>
    103 
    104 #include <uvm/uvm_extern.h>
    105 
    106 #include <machine/cpu.h>
    107 
    108 int	rrticks;		/* number of hardclock ticks per roundrobin() */
    109 
    110 /*
    111  * The global scheduler state.
    112  */
    113 kmutex_t	sched_mutex;		/* global sched state mutex */
    114 struct prochd	sched_qs[RUNQUE_NQS];	/* run queues */
    115 volatile uint32_t sched_whichqs;	/* bitmap of non-empty queues */
    116 
    117 void	schedcpu(void *);
    118 void	updatepri(struct lwp *);
    119 
    120 void	sched_unsleep(struct lwp *);
    121 void	sched_changepri(struct lwp *, pri_t);
    122 void	sched_lendpri(struct lwp *, pri_t);
    123 
    124 struct callout schedcpu_ch = CALLOUT_INITIALIZER_SETFUNC(schedcpu, NULL);
    125 static unsigned int schedcpu_ticks;
    126 
    127 syncobj_t sleep_syncobj = {
    128 	SOBJ_SLEEPQ_SORTED,
    129 	sleepq_unsleep,
    130 	sleepq_changepri,
    131 	sleepq_lendpri,
    132 	syncobj_noowner,
    133 };
    134 
    135 syncobj_t sched_syncobj = {
    136 	SOBJ_SLEEPQ_SORTED,
    137 	sched_unsleep,
    138 	sched_changepri,
    139 	sched_lendpri,
    140 	syncobj_noowner,
    141 };
    142 
    143 /*
    144  * Force switch among equal priority processes every 100ms.
    145  * Called from hardclock every hz/10 == rrticks hardclock ticks.
    146  */
    147 /* ARGSUSED */
    148 void
    149 roundrobin(struct cpu_info *ci)
    150 {
    151 	struct schedstate_percpu *spc = &ci->ci_schedstate;
    152 
    153 	spc->spc_rrticks = rrticks;
    154 
    155 	if (curlwp != NULL) {
    156 		if (spc->spc_flags & SPCF_SEENRR) {
    157 			/*
    158 			 * The process has already been through a roundrobin
    159 			 * without switching and may be hogging the CPU.
    160 			 * Indicate that the process should yield.
    161 			 */
    162 			spc->spc_flags |= SPCF_SHOULDYIELD;
    163 		} else
    164 			spc->spc_flags |= SPCF_SEENRR;
    165 	}
    166 	cpu_need_resched(curcpu());
    167 }
    168 
    169 #define	PPQ	(128 / RUNQUE_NQS)	/* priorities per queue */
    170 #define	NICE_WEIGHT 2			/* priorities per nice level */
    171 
    172 #define	ESTCPU_SHIFT	11
    173 #define	ESTCPU_MAX	((NICE_WEIGHT * PRIO_MAX - PPQ) << ESTCPU_SHIFT)
    174 #define	ESTCPULIM(e)	min((e), ESTCPU_MAX)
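        /*
         * Worked example (illustrative, assuming the traditional values
         * RUNQUE_NQS == 32 and PRIO_MAX == 20): PPQ == 128 / 32 == 4
         * priorities per queue, and ESTCPU_MAX == (2 * 20 - 4) << 11 ==
         * 36 << 11 == 73728.
         */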
    175 
    176 /*
    177  * Constants for digital decay and forget:
    178  *	90% of (p_estcpu) usage in 5 * loadav time
    179  *	95% of (p_pctcpu) usage in 60 seconds (load insensitive)
    180  *          Note that, as ps(1) mentions, this can let percentages
    181  *          total over 100% (I've seen 137.9% for 3 processes).
    182  *
    183  * Note that hardclock updates p_estcpu and p_cpticks independently.
    184  *
    185  * We wish to decay away 90% of p_estcpu in (5 * loadavg) seconds.
    186  * That is, the system wants to compute a value of decay such
    187  * that the following for loop:
    188  * 	for (i = 0; i < (5 * loadavg); i++)
    189  * 		p_estcpu *= decay;
    190  * will compute
    191  * 	p_estcpu *= 0.1;
    192  * for all values of loadavg:
    193  *
    194  * Mathematically this loop can be expressed by saying:
    195  * 	decay ** (5 * loadavg) ~= .1
    196  *
    197  * The system computes decay as:
    198  * 	decay = (2 * loadavg) / (2 * loadavg + 1)
    199  *
    200  * We wish to prove that the system's computation of decay
    201  * will always fulfill the equation:
    202  * 	decay ** (5 * loadavg) ~= .1
    203  *
    204  * If we compute b as:
    205  * 	b = 2 * loadavg
    206  * then
    207  * 	decay = b / (b + 1)
    208  *
    209  * We now need to prove two things:
    210  *	1) Given factor ** (5 * loadavg) ~= .1, prove factor == b/(b+1)
    211  *	2) Given b/(b+1) ** power ~= .1, prove power == (5 * loadavg)
    212  *
    213  * Facts:
    214  *         For x close to zero, exp(x) =~ 1 + x, since
    215  *              exp(x) = 0! + x**1/1! + x**2/2! + ... .
    216  *              therefore exp(-1/b) =~ 1 - (1/b) = (b-1)/b.
    217  *         For x close to zero, ln(1+x) =~ x, since
    218  *              ln(1+x) = x - x**2/2 + x**3/3 - ...     -1 < x < 1
    219  *              therefore ln(b/(b+1)) = ln(1 - 1/(b+1)) =~ -1/(b+1).
    220  *         ln(.1) =~ -2.30
    221  *
    222  * Proof of (1):
    223  *    Solve (factor)**(power) =~ .1 given power (5*loadav):
    224  *	solving for factor,
    225  *      ln(factor) =~ (-2.30/5*loadav), or
    226  *      factor =~ exp(-1/((5/2.30)*loadav)) =~ exp(-1/(2*loadav)) =
    227  *          exp(-1/b) =~ (b-1)/b =~ b/(b+1).                    QED
    228  *
    229  * Proof of (2):
    230  *    Solve (factor)**(power) =~ .1 given factor == (b/(b+1)):
    231  *	solving for power,
    232  *      power*ln(b/(b+1)) =~ -2.30, or
    233  *      power =~ 2.3 * (b + 1) = 4.6*loadav + 2.3 =~ 5*loadav.  QED
    234  *
    235  * Actual power values for the implemented algorithm are as follows:
    236  *      loadav: 1       2       3       4
    237  *      power:  5.68    10.32   14.94   19.55
    238  */
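        /*
         * A quick sanity check of the table above (illustrative only):
         * with loadav == 1 the decay factor is 2/3 and ln(.1)/ln(2/3) =~ 5.68;
         * with loadav == 4 the factor is 8/9 and ln(.1)/ln(8/9) =~ 19.55.
         */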
    239 
    240 /* calculations for digital decay to forget 90% of usage in 5*loadav sec */
    241 #define	loadfactor(loadav)	(2 * (loadav))
    242 
    243 static fixpt_t
    244 decay_cpu(fixpt_t loadfac, fixpt_t estcpu)
    245 {
    246 
    247 	if (estcpu == 0) {
    248 		return 0;
    249 	}
    250 
    251 #if !defined(_LP64)
    252 	/* avoid 64-bit arithmetic. */
    253 #define	FIXPT_MAX ((fixpt_t)((UINTMAX_C(1) << sizeof(fixpt_t) * CHAR_BIT) - 1))
    254 	if (__predict_true(loadfac <= FIXPT_MAX / ESTCPU_MAX)) {
    255 		return estcpu * loadfac / (loadfac + FSCALE);
    256 	}
    257 #endif /* !defined(_LP64) */
    258 
    259 	return (uint64_t)estcpu * loadfac / (loadfac + FSCALE);
    260 }
    261 
    262 /*
    263  * For all load averages >= 1 and max p_estcpu of (255 << ESTCPU_SHIFT),
    264  * sleeping for at least seven times the loadfactor will decay p_estcpu to
    265  * less than (1 << ESTCPU_SHIFT).
    266  *
    267  * note that our ESTCPU_MAX is actually much smaller than (255 << ESTCPU_SHIFT).
    268  */
    269 static fixpt_t
    270 decay_cpu_batch(fixpt_t loadfac, fixpt_t estcpu, unsigned int n)
    271 {
    272 
    273 	if ((n << FSHIFT) >= 7 * loadfac) {
    274 		return 0;
    275 	}
    276 
    277 	while (estcpu != 0 && n > 1) {
    278 		estcpu = decay_cpu(loadfac, estcpu);
    279 		n--;
    280 	}
    281 
    282 	return estcpu;
    283 }
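        /*
         * Illustrative check of the cutoff above: with loadav == 1,
         * loadfac == 2 << FSHIFT, so the early return fires once n >= 14.
         * Since 255 * (2/3)^14 =~ 0.87, an estcpu of (255 << ESTCPU_SHIFT)
         * would have decayed below (1 << ESTCPU_SHIFT) by then anyway,
         * which is what the comment before this function promises.
         */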
    284 
    285 /* decay 95% of `p_pctcpu' in 60 seconds; see CCPU_SHIFT before changing */
    286 fixpt_t	ccpu = 0.95122942450071400909 * FSCALE;		/* exp(-1/20) */
    287 
    288 /*
    289  * If `ccpu' is not equal to `exp(-1/20)' and you still want to use the
    290  * faster/more-accurate formula, you'll have to estimate CCPU_SHIFT below
    291  * and possibly adjust FSHIFT in "param.h" so that (FSHIFT >= CCPU_SHIFT).
    292  *
    293  * To estimate CCPU_SHIFT for exp(-1/20), the following formula was used:
    294  *	1 - exp(-1/20) ~= 0.0487 ~= 0.0488 == 1 (fixed pt, *11* bits).
    295  *
    296  * If you don't want to bother with the faster/more-accurate formula, you
    297  * can set CCPU_SHIFT to (FSHIFT + 1), which will use a slower/less-accurate
    298  * (more general) method of calculating the CPU percentage used by a process.
    299  */
    300 #define	CCPU_SHIFT	11
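        /*
         * Illustrative arithmetic for the constants above: schedcpu() runs
         * once per second and multiplies p_pctcpu by ccpu each time, so after
         * 60 seconds the old contribution has been scaled by
         * exp(-1/20)^60 == exp(-3) =~ 0.05, i.e. about 95% is forgotten, as
         * the comment before the definition of `ccpu' says.
         */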
    301 
    302 /*
    303  * schedcpu:
    304  *
    305  *	Recompute process priorities, every hz ticks.
    306  *
    307  *	XXXSMP This needs to be reorganised in order to reduce the locking
    308  *	burden.
    309  */
    310 /* ARGSUSED */
    311 void
    312 schedcpu(void *arg)
    313 {
    314 	fixpt_t loadfac = loadfactor(averunnable.ldavg[0]);
    315 	struct rlimit *rlim;
    316 	struct lwp *l;
    317 	struct proc *p;
    318 	int minslp, clkhz, sig;
    319 	long runtm;
    320 
    321 	schedcpu_ticks++;
    322 
    323 	mutex_enter(&proclist_mutex);
    324 	PROCLIST_FOREACH(p, &allproc) {
    325 		/*
    326 		 * Increment time in/out of memory and sleep time (if
    327 		 * sleeping).  We ignore overflow; with 16-bit int's
    328 		 * (remember them?) overflow takes 45 days.
    329 		 */
    330 		minslp = 2;
    331 		mutex_enter(&p->p_smutex);
    332 		runtm = p->p_rtime.tv_sec;
    333 		LIST_FOREACH(l, &p->p_lwps, l_sibling) {
    334 			lwp_lock(l);
    335 			runtm += l->l_rtime.tv_sec;
    336 			l->l_swtime++;
    337 			if (l->l_stat == LSSLEEP || l->l_stat == LSSTOP ||
    338 			    l->l_stat == LSSUSPENDED) {
    339 				l->l_slptime++;
    340 				minslp = min(minslp, l->l_slptime);
    341 			} else
    342 				minslp = 0;
    343 			lwp_unlock(l);
    344 		}
    345 		p->p_pctcpu = (p->p_pctcpu * ccpu) >> FSHIFT;
    346 
    347 		/*
    348 		 * Check if the process exceeds its CPU resource allocation.
    349 		 * If over max, kill it.
    350 		 */
    351 		rlim = &p->p_rlimit[RLIMIT_CPU];
    352 		sig = 0;
    353 		if (runtm >= rlim->rlim_cur) {
    354 			if (runtm >= rlim->rlim_max)
    355 				sig = SIGKILL;
    356 			else {
    357 				sig = SIGXCPU;
    358 				if (rlim->rlim_cur < rlim->rlim_max)
    359 					rlim->rlim_cur += 5;
    360 			}
    361 		}
    362 
    363 		/*
    364 		 * If the process has run for more than autonicetime, reduce
    365 		 * priority to give others a chance.
    366 		 */
    367 		if (autonicetime && runtm > autonicetime && p->p_nice == NZERO
    368 		    && kauth_cred_geteuid(p->p_cred)) {
    369 			mutex_spin_enter(&p->p_stmutex);
    370 			p->p_nice = autoniceval + NZERO;
    371 			resetprocpriority(p);
    372 			mutex_spin_exit(&p->p_stmutex);
    373 		}
    374 
    375 		/*
    376 		 * If the process has slept the entire second,
    377 		 * stop recalculating its priority until it wakes up.
    378 		 */
    379 		if (minslp <= 1) {
    380 			/*
    381 			 * p_pctcpu is only for ps.
    382 			 */
    383 			mutex_spin_enter(&p->p_stmutex);
    384 			clkhz = stathz != 0 ? stathz : hz;
    385 #if	(FSHIFT >= CCPU_SHIFT)
    386 			p->p_pctcpu += (clkhz == 100)?
    387 			    ((fixpt_t) p->p_cpticks) << (FSHIFT - CCPU_SHIFT):
    388 			    100 * (((fixpt_t) p->p_cpticks)
    389 			    << (FSHIFT - CCPU_SHIFT)) / clkhz;
    390 #else
    391 			p->p_pctcpu += ((FSCALE - ccpu) *
    392 			    (p->p_cpticks * FSCALE / clkhz)) >> FSHIFT;
    393 #endif
    394 			p->p_cpticks = 0;
    395 			p->p_estcpu = decay_cpu(loadfac, p->p_estcpu);
    396 
    397 			LIST_FOREACH(l, &p->p_lwps, l_sibling) {
    398 				lwp_lock(l);
    399 				if (l->l_slptime <= 1 &&
    400 				    l->l_priority >= PUSER)
    401 					resetpriority(l);
    402 				lwp_unlock(l);
    403 			}
    404 			mutex_spin_exit(&p->p_stmutex);
    405 		}
    406 
    407 		mutex_exit(&p->p_smutex);
    408 		if (sig) {
    409 			psignal(p, sig);
    410 		}
    411 	}
    412 	mutex_exit(&proclist_mutex);
    413 	uvm_meter();
    414 	cv_broadcast(&lbolt);
    415 	callout_schedule(&schedcpu_ch, hz);
    416 }
    417 
    418 /*
    419  * Recalculate the priority of a process after it has slept for a while.
    420  */
    421 void
    422 updatepri(struct lwp *l)
    423 {
    424 	struct proc *p = l->l_proc;
    425 	fixpt_t loadfac;
    426 
    427 	KASSERT(lwp_locked(l, NULL));
    428 	KASSERT(l->l_slptime > 1);
    429 
    430 	loadfac = loadfactor(averunnable.ldavg[0]);
    431 
    432 	l->l_slptime--; /* the first time was done in schedcpu */
    433 	/* XXX NJWLWP */
    434 	/* XXXSMP occasionally unlocked, should be per-LWP */
    435 	p->p_estcpu = decay_cpu_batch(loadfac, p->p_estcpu, l->l_slptime);
    436 	resetpriority(l);
    437 }
    438 
    439 /*
    440  * During autoconfiguration or after a panic, a sleep will simply lower the
    441  * priority briefly to allow interrupts, then return.  The priority to be
    442  * used (safepri) is machine-dependent, thus this value is initialized and
    443  * maintained in the machine-dependent layers.  This priority will typically
    444  * be 0, or the lowest priority that is safe for use on the interrupt stack;
    445  * it can be made higher to block network software interrupts after panics.
    446  */
    447 int	safepri;
    448 
    449 /*
    450  * OBSOLETE INTERFACE
    451  *
    452  * General sleep call.  Suspends the current process until a wakeup is
    453  * performed on the specified identifier.  The process will then be made
    454  * runnable with the specified priority.  Sleeps at most timo/hz seconds (0
    455  * means no timeout).  If pri includes PCATCH flag, signals are checked
    456  * before and after sleeping, else signals are not checked.  Returns 0 if
    457  * awakened, EWOULDBLOCK if the timeout expires.  If PCATCH is set and a
    458  * signal needs to be delivered, ERESTART is returned if the current system
    459  * call should be restarted if possible, and EINTR is returned if the system
    460  * call should be interrupted by the signal (return EINTR).
    461  * call should be interrupted by the signal.
    462  * The interlock is held until we are on a sleep queue. The interlock will
    463  * be locked before returning back to the caller unless the PNORELOCK flag
    464  * is specified, in which case the interlock will always be unlocked upon
    465  * return.
    466  */
    467 int
    468 ltsleep(wchan_t ident, pri_t priority, const char *wmesg, int timo,
    469 	volatile struct simplelock *interlock)
    470 {
    471 	struct lwp *l = curlwp;
    472 	sleepq_t *sq;
    473 	int error, catch;
    474 
    475 	if (sleepq_dontsleep(l)) {
    476 		(void)sleepq_abort(NULL, 0);
    477 		if ((priority & PNORELOCK) != 0)
    478 			simple_unlock(interlock);
    479 		return 0;
    480 	}
    481 
    482 	sq = sleeptab_lookup(&sleeptab, ident);
    483 	sleepq_enter(sq, l);
    484 
    485 	if (interlock != NULL)
    486 		simple_unlock(interlock);
    487 
    488 	catch = priority & PCATCH;
    489 	sleepq_block(sq, priority & PRIMASK, ident, wmesg, timo, catch,
    490 	    &sleep_syncobj);
    491 	error = sleepq_unblock(timo, catch);
    492 
    493 	if (interlock != NULL && (priority & PNORELOCK) == 0)
    494 		simple_lock(interlock);
    495 
    496 	return error;
    497 }
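        /*
         * Illustrative sketch of the classic ltsleep()/wakeup() pattern
         * (the names `sc', `sc_slock' and `sc_busy' are hypothetical and
         * not from this file):
         *
         *	simple_lock(&sc->sc_slock);
         *	while (sc->sc_busy) {
         *		error = ltsleep(&sc->sc_busy, PRIBIO | PCATCH,
         *		    "scbusy", 0, &sc->sc_slock);
         *		if (error)
         *			break;
         *	}
         *	simple_unlock(&sc->sc_slock);
         *
         * The interlock is dropped once the LWP is on the sleep queue and,
         * since PNORELOCK is not passed, reacquired before ltsleep()
         * returns, so the condition can be rechecked safely after each
         * wakeup(&sc->sc_busy).
         */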
    498 
    499 int
    500 mtsleep(wchan_t ident, pri_t priority, const char *wmesg, int timo,
    501 	kmutex_t *mtx)
    502 {
    503 	struct lwp *l = curlwp;
    504 	sleepq_t *sq;
    505 	int error, catch;
    506 
    507 	if (sleepq_dontsleep(l)) {
    508 		(void)sleepq_abort(mtx, (priority & PNORELOCK) != 0);
    509 		return 0;
    510 	}
    511 
    512 	sq = sleeptab_lookup(&sleeptab, ident);
    513 	sleepq_enter(sq, l);
    514 	mutex_exit(mtx);
    515 
    516 	catch = priority & PCATCH;
    517 	sleepq_block(sq, priority & PRIMASK, ident, wmesg, timo, catch,
    518 	    &sleep_syncobj);
    519 	error = sleepq_unblock(timo, catch);
    520 
    521 	if ((priority & PNORELOCK) == 0)
    522 		mutex_enter(mtx);
    523 
    524 	return error;
    525 }
    526 
    527 /*
    528  * General sleep call for situations where a wake-up is not expected.
    529  */
    530 int
    531 kpause(const char *wmesg, bool intr, int timo, kmutex_t *mtx)
    532 {
    533 	struct lwp *l = curlwp;
    534 	sleepq_t *sq;
    535 	int error;
    536 
    537 	if (sleepq_dontsleep(l))
    538 		return sleepq_abort(NULL, 0);
    539 
    540 	if (mtx != NULL)
    541 		mutex_exit(mtx);
    542 	sq = sleeptab_lookup(&sleeptab, l);
    543 	sleepq_enter(sq, l);
    544 	sleepq_block(sq, sched_kpri(l), l, wmesg, timo, intr, &sleep_syncobj);
    545 	error = sleepq_unblock(timo, intr);
    546 	if (mtx != NULL)
    547 		mutex_enter(mtx);
    548 
    549 	return error;
    550 }
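        /*
         * Illustrative use of kpause(): sleep for roughly 100ms with no
         * interlock and without catching signals, e.g.
         *
         *	(void)kpause("pause", false, hz / 10, NULL);
         *
         * The sleep is keyed on the calling LWP itself, so no wakeup() is
         * expected to end it early; it returns when the timeout expires.
         */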
    551 
    552 /*
    553  * OBSOLETE INTERFACE
    554  *
    555  * Make all processes sleeping on the specified identifier runnable.
    556  */
    557 void
    558 wakeup(wchan_t ident)
    559 {
    560 	sleepq_t *sq;
    561 
    562 	if (cold)
    563 		return;
    564 
    565 	sq = sleeptab_lookup(&sleeptab, ident);
    566 	sleepq_wake(sq, ident, (u_int)-1);
    567 }
    568 
    569 /*
    570  * OBSOLETE INTERFACE
    571  *
    572  * Make the highest priority process first in line on the specified
    573  * identifier runnable.
    574  */
    575 void
    576 wakeup_one(wchan_t ident)
    577 {
    578 	sleepq_t *sq;
    579 
    580 	if (cold)
    581 		return;
    582 
    583 	sq = sleeptab_lookup(&sleeptab, ident);
    584 	sleepq_wake(sq, ident, 1);
    585 }
    586 
    587 
    588 /*
    589  * General yield call.  Puts the current process back on its run queue and
    590  * performs a voluntary context switch.  Should only be called when the
    591  * current process explicitly requests it (e.g. sched_yield(2) in compat code).
    592  */
    593 void
    594 yield(void)
    595 {
    596 	struct lwp *l = curlwp;
    597 
    598 	KERNEL_UNLOCK_ALL(l, &l->l_biglocks);
    599 	lwp_lock(l);
    600 	if (l->l_stat == LSONPROC) {
    601 		KASSERT(lwp_locked(l, &sched_mutex));
    602 		l->l_priority = l->l_usrpri;
    603 	}
    604 	l->l_ncsw++;
    605 	mi_switch(l, NULL);
    606 	KERNEL_LOCK(l->l_biglocks, l);
    607 }
    608 
    609 /*
    610  * General preemption call.  Puts the current process back on its run queue
    611  * and performs an involuntary context switch.
    612  */
    613 void
    614 preempt(void)
    615 {
    616 	struct lwp *l = curlwp;
    617 
    618 	KERNEL_UNLOCK_ALL(l, &l->l_biglocks);
    619 	lwp_lock(l);
    620 	if (l->l_stat == LSONPROC) {
    621 		KASSERT(lwp_locked(l, &sched_mutex));
    622 		l->l_priority = l->l_usrpri;
    623 	}
    624 	l->l_nivcsw++;
    625 	(void)mi_switch(l, NULL);
    626 	KERNEL_LOCK(l->l_biglocks, l);
    627 }
    628 
    629 /*
    630  * The machine-independent parts of context switch.  Switch to "newl"
    631  * if non-NULL, otherwise let cpu_switch() choose the next LWP.
    632  *
    633  * Returns 1 if another process was actually run.
    634  */
    635 int
    636 mi_switch(struct lwp *l, struct lwp *newl)
    637 {
    638 	struct schedstate_percpu *spc;
    639 	struct timeval tv;
    640 	int retval, oldspl;
    641 	long s, u;
    642 
    643 	KASSERT(lwp_locked(l, NULL));
    644 
    645 #ifdef KSTACK_CHECK_MAGIC
    646 	kstack_check_magic(l);
    647 #endif
    648 
    649 	/*
    650 	 * It's safe to read the per-CPU schedstate unlocked here, as all we
    651 	 * are after is the run time, and that's guaranteed to have been last
    652 	 * updated by this CPU.
    653 	 */
    654 	KDASSERT(l->l_cpu == curcpu());
    655 	spc = &l->l_cpu->ci_schedstate;
    656 
    657 	/*
    658 	 * Compute the amount of time during which the current
    659 	 * process was running.
    660 	 */
    661 	microtime(&tv);
    662 	u = l->l_rtime.tv_usec +
    663 	    (tv.tv_usec - spc->spc_runtime.tv_usec);
    664 	s = l->l_rtime.tv_sec + (tv.tv_sec - spc->spc_runtime.tv_sec);
    665 	if (u < 0) {
    666 		u += 1000000;
    667 		s--;
    668 	} else if (u >= 1000000) {
    669 		u -= 1000000;
    670 		s++;
    671 	}
    672 	l->l_rtime.tv_usec = u;
    673 	l->l_rtime.tv_sec = s;
    674 
    675 	/* Count time spent in current system call */
    676 	SYSCALL_TIME_SLEEP(l);
    677 
    678 	/*
    679 	 * XXXSMP If we are using h/w performance counters, save context.
    680 	 */
    681 #if PERFCTRS
    682 	if (PMC_ENABLED(l->l_proc)) {
    683 		pmc_save_context(l->l_proc);
    684 	}
    685 #endif
    686 
    687 	/*
    688 	 * Acquire the sched_mutex if necessary.  It will be released by
    689 	 * cpu_switch once it has decided to idle, or picked another LWP
    690 	 * to run.
    691 	 */
    692 #if defined(MULTIPROCESSOR) || defined(LOCKDEBUG)
    693 	if (l->l_mutex != &sched_mutex) {
    694 		mutex_spin_enter(&sched_mutex);
    695 		lwp_unlock(l);
    696 	}
    697 #endif
    698 
    699 	/*
    700 	 * If on the CPU and we have gotten this far, then we must yield.
    701 	 */
    702 	KASSERT(l->l_stat != LSRUN);
    703 	if (l->l_stat == LSONPROC) {
    704 		KASSERT(lwp_locked(l, &sched_mutex));
    705 		l->l_stat = LSRUN;
    706 		setrunqueue(l);
    707 	}
    708 	uvmexp.swtch++;
    709 	l->l_ncsw++;
    710 
    711 	/*
    712 	 * Process is about to yield the CPU; clear the appropriate
    713 	 * scheduling flags.
    714 	 */
    715 	spc->spc_flags &= ~SPCF_SWITCHCLEAR;
    716 
    717 	LOCKDEBUG_BARRIER(&sched_mutex, 1);
    718 
    719 	/*
    720 	 * Switch to the new current LWP.  When we run again, we'll
    721 	 * return back here.
    722 	 */
    723 	oldspl = MUTEX_SPIN_OLDSPL(l->l_cpu);
    724 
    725 	if (newl == NULL || newl->l_back == NULL)
    726 		retval = cpu_switch(l, NULL);
    727 	else {
    728 		KASSERT(lwp_locked(newl, &sched_mutex));
    729 		remrunqueue(newl);
    730 		cpu_switchto(l, newl);
    731 		retval = 0;
    732 	}
    733 
    734 	/*
    735 	 * XXXSMP If we are using h/w performance counters, restore context.
    736 	 */
    737 #if PERFCTRS
    738 	if (PMC_ENABLED(l->l_proc)) {
    739 		pmc_restore_context(l->l_proc);
    740 	}
    741 #endif
    742 
    743 	/*
    744 	 * We're running again; record our new start time.  We might
    745 	 * be running on a new CPU now, so don't use the cached
    746 	 * schedstate_percpu pointer.
    747 	 */
    748 	SYSCALL_TIME_WAKEUP(l);
    749 	KDASSERT(l->l_cpu == curcpu());
    750 	microtime(&l->l_cpu->ci_schedstate.spc_runtime);
    751 	splx(oldspl);
    752 
    753 	return retval;
    754 }
    755 
    756 /*
    757  * Initialize the (doubly-linked) run queues
    758  * to be empty.
    759  */
    760 void
    761 rqinit(void)
    762 {
    763 	int i;
    764 
    765 	for (i = 0; i < RUNQUE_NQS; i++)
    766 		sched_qs[i].ph_link = sched_qs[i].ph_rlink =
    767 		    (struct lwp *)&sched_qs[i];
    768 
    769 	mutex_init(&sched_mutex, MUTEX_SPIN, IPL_SCHED);
    770 }
    771 
    772 static inline void
    773 resched_lwp(struct lwp *l)
    774 {
    775 	struct cpu_info *ci;
    776 	const pri_t pri = lwp_eprio(l);
    777 
    778 	/*
    779 	 * XXXSMP
    780 	 * Since l->l_cpu persists across a context switch,
    781 	 * this gives us *very weak* processor affinity, in
    782 	 * that we notify the CPU on which the process last
    783 	 * ran that it should try to switch.
    784 	 *
    785 	 * This does not guarantee that the process will run on
    786 	 * that processor next, because another processor might
    787 	 * grab it the next time it performs a context switch.
    788 	 *
    789 	 * This also does not handle the case where its last
    790 	 * CPU is running a higher-priority process, but every
    791 	 * other CPU is running a lower-priority process.  There
    792 	 * are ways to handle this situation, but they're not
    793 	 * currently very pretty, and we also need to weigh the
    794 	 * cost of moving a process from one CPU to another.
    795 	 *
    796 	 * XXXSMP
    797 	 * There is also the issue of locking the other CPU's
    798 	 * sched state, which we currently do not do.
    799 	 */
    800 	ci = (l->l_cpu != NULL) ? l->l_cpu : curcpu();
    801 	if (pri < ci->ci_schedstate.spc_curpriority)
    802 		cpu_need_resched(ci);
    803 }
    804 
    805 /*
    806  * Change process state to be runnable, placing it on the run queue if it is
    807  * in memory, and awakening the swapper if it isn't in memory.
    808  *
    809  * Call with the process and LWP locked.  Will return with the LWP unlocked.
    810  */
    811 void
    812 setrunnable(struct lwp *l)
    813 {
    814 	struct proc *p = l->l_proc;
    815 	sigset_t *ss;
    816 
    817 	KASSERT(mutex_owned(&p->p_smutex));
    818 	KASSERT(lwp_locked(l, NULL));
    819 
    820 	switch (l->l_stat) {
    821 	case LSSTOP:
    822 		/*
    823 		 * If we're being traced (possibly because someone attached us
    824 		 * while we were stopped), check for a signal from the debugger.
    825 		 */
    826 		if ((p->p_slflag & PSL_TRACED) != 0 && p->p_xstat != 0) {
    827 			if ((sigprop[p->p_xstat] & SA_TOLWP) != 0)
    828 				ss = &l->l_sigpend.sp_set;
    829 			else
    830 				ss = &p->p_sigpend.sp_set;
    831 			sigaddset(ss, p->p_xstat);
    832 			signotify(l);
    833 		}
    834 		p->p_nrlwps++;
    835 		break;
    836 	case LSSUSPENDED:
    837 		l->l_flag &= ~LW_WSUSPEND;
    838 		p->p_nrlwps++;
    839 		break;
    840 	case LSSLEEP:
    841 		KASSERT(l->l_wchan != NULL);
    842 		break;
    843 	default:
    844 		panic("setrunnable: lwp %p state was %d", l, l->l_stat);
    845 	}
    846 
    847 	/*
    848 	 * If the LWP was sleeping interruptibly, then it's OK to start it
    849 	 * again.  If not, mark it as still sleeping.
    850 	 */
    851 	if (l->l_wchan != NULL) {
    852 		l->l_stat = LSSLEEP;
    853 		/* lwp_unsleep() will release the lock. */
    854 		lwp_unsleep(l);
    855 		return;
    856 	}
    857 
    858 	KASSERT(lwp_locked(l, &sched_mutex));
    859 
    860 	/*
    861 	 * If the LWP is still on the CPU, mark it as LSONPROC.  It may be
    862 	 * about to call mi_switch(), in which case it will yield.
    863 	 *
    864 	 * XXXSMP Will need to change for preemption.
    865 	 */
    866 #ifdef MULTIPROCESSOR
    867 	if (l->l_cpu->ci_curlwp == l) {
    868 #else
    869 	if (l == curlwp) {
    870 #endif
    871 		l->l_stat = LSONPROC;
    872 		l->l_slptime = 0;
    873 		lwp_unlock(l);
    874 		return;
    875 	}
    876 
    877 	/*
    878 	 * Set the LWP runnable.  If it's swapped out, we need to wake the swapper
    879 	 * to bring it back in.  Otherwise, enter it into a run queue.
    880 	 */
    881 	if (l->l_slptime > 1)
    882 		updatepri(l);
    883 	l->l_stat = LSRUN;
    884 	l->l_slptime = 0;
    885 
    886 	if (l->l_flag & LW_INMEM) {
    887 		setrunqueue(l);
    888 		resched_lwp(l);
    889 		lwp_unlock(l);
    890 	} else {
    891 		lwp_unlock(l);
    892 		uvm_kick_scheduler();
    893 	}
    894 }
    895 
    896 /*
    897  * Compute the priority of a process when running in user mode.
    898  * Arrange to reschedule if the resulting priority is better
    899  * than that of the current process.
    900  */
    901 void
    902 resetpriority(struct lwp *l)
    903 {
    904 	pri_t newpriority;
    905 	struct proc *p = l->l_proc;
    906 
    907 	/* XXXSMP KASSERT(mutex_owned(&p->p_stmutex)); */
    908 	KASSERT(lwp_locked(l, NULL));
    909 
    910 	if ((l->l_flag & LW_SYSTEM) != 0)
    911 		return;
    912 
    913 	newpriority = PUSER + (p->p_estcpu >> ESTCPU_SHIFT) +
    914 	    NICE_WEIGHT * (p->p_nice - NZERO);
    915 	newpriority = min(newpriority, MAXPRI);
    916 	lwp_changepri(l, newpriority);
    917 }
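        /*
         * Worked example of the formula above, assuming PUSER == 50 and
         * MAXPRI == 127 (the user range quoted in sched_kpri() below): an
         * un-niced process (p_nice == NZERO) with p_estcpu == 16 << ESTCPU_SHIFT
         * gets newpriority == 50 + 16 + 0 == 66; larger values are worse,
         * and the result is clamped to 127.
         */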
    918 
    919 /*
    920  * Recompute priority for all LWPs in a process.
    921  */
    922 void
    923 resetprocpriority(struct proc *p)
    924 {
    925 	struct lwp *l;
    926 
    927 	KASSERT(mutex_owned(&p->p_stmutex));
    928 
    929 	LIST_FOREACH(l, &p->p_lwps, l_sibling) {
    930 		lwp_lock(l);
    931 		resetpriority(l);
    932 		lwp_unlock(l);
    933 	}
    934 }
    935 
    936 /*
    937  * We adjust the priority of the current process.  The priority of a process
    938  * gets worse as it accumulates CPU time.  The CPU usage estimator (p_estcpu)
    939  * is increased here.  The formula for computing priorities (in kern_synch.c)
    940  * will compute a different value each time p_estcpu increases. This can
    941  * cause a switch, but unless the priority crosses a PPQ boundary the actual
    942  * queue will not change.  The CPU usage estimator ramps up quite quickly
    943  * when the process is running (linearly), and decays away exponentially, at
    944  * a rate which is proportionally slower when the system is busy.  The basic
    945  * principle is that the system will 90% forget that the process used a lot
    946  * of CPU time in 5 * loadav seconds.  This causes the system to favor
    947  * processes which haven't run much recently, and to round-robin among other
    948  * processes.
    949  */
    950 
    951 void
    952 schedclock(struct lwp *l)
    953 {
    954 	struct proc *p = l->l_proc;
    955 
    956 	mutex_spin_enter(&p->p_stmutex);
    957 	p->p_estcpu = ESTCPULIM(p->p_estcpu + (1 << ESTCPU_SHIFT));
    958 	lwp_lock(l);
    959 	resetpriority(l);
    960 	mutex_spin_exit(&p->p_stmutex);
    961 	if ((l->l_flag & LW_SYSTEM) == 0 && l->l_priority >= PUSER)
    962 		l->l_priority = l->l_usrpri;
    963 	lwp_unlock(l);
    964 }
    965 
    966 /*
    967  * suspendsched:
    968  *
    969  *	Convert all LSSLEEP or LSRUN LWPs of non-PK_SYSTEM processes to LSSUSPENDED.
    970  */
    971 void
    972 suspendsched(void)
    973 {
    974 #ifdef MULTIPROCESSOR
    975 	CPU_INFO_ITERATOR cii;
    976 	struct cpu_info *ci;
    977 #endif
    978 	struct lwp *l;
    979 	struct proc *p;
    980 
    981 	/*
    982 	 * We do this process by process in order not to violate the locking rules.
    983 	 */
    984 	mutex_enter(&proclist_mutex);
    985 	PROCLIST_FOREACH(p, &allproc) {
    986 		mutex_enter(&p->p_smutex);
    987 
    988 		if ((p->p_flag & PK_SYSTEM) != 0) {
    989 			mutex_exit(&p->p_smutex);
    990 			continue;
    991 		}
    992 
    993 		p->p_stat = SSTOP;
    994 
    995 		LIST_FOREACH(l, &p->p_lwps, l_sibling) {
    996 			if (l == curlwp)
    997 				continue;
    998 
    999 			lwp_lock(l);
   1000 
   1001 			/*
   1002 			 * Set LW_WREBOOT so that the LWP will suspend itself
   1003 			 * when it tries to return to user mode.  We want to
   1004 			 * get as many LWPs as possible to the user/kernel
   1005 			 * boundary, so that they will release any locks that
   1006 			 * they hold.
   1007 			 */
   1008 			l->l_flag |= (LW_WREBOOT | LW_WSUSPEND);
   1009 
   1010 			if (l->l_stat == LSSLEEP &&
   1011 			    (l->l_flag & LW_SINTR) != 0) {
   1012 				/* setrunnable() will release the lock. */
   1013 				setrunnable(l);
   1014 				continue;
   1015 			}
   1016 
   1017 			lwp_unlock(l);
   1018 		}
   1019 
   1020 		mutex_exit(&p->p_smutex);
   1021 	}
   1022 	mutex_exit(&proclist_mutex);
   1023 
   1024 	/*
   1025 	 * Kick all CPUs to make them preempt any LWPs running in user mode.
   1026 	 * They'll trap into the kernel and suspend themselves in userret().
   1027 	 */
   1028 	sched_lock(0);
   1029 #ifdef MULTIPROCESSOR
   1030 	for (CPU_INFO_FOREACH(cii, ci))
   1031 		cpu_need_resched(ci);
   1032 #else
   1033 	cpu_need_resched(curcpu());
   1034 #endif
   1035 	sched_unlock(0);
   1036 }
   1037 
   1038 /*
   1039  * scheduler_fork_hook:
   1040  *
   1041  *	Inherit the parent's scheduler history.
   1042  */
   1043 void
   1044 scheduler_fork_hook(struct proc *parent, struct proc *child)
   1045 {
   1046 
   1047 	KASSERT(mutex_owned(&parent->p_smutex));
   1048 
   1049 	child->p_estcpu = child->p_estcpu_inherited = parent->p_estcpu;
   1050 	child->p_forktime = schedcpu_ticks;
   1051 }
   1052 
   1053 /*
   1054  * scheduler_wait_hook:
   1055  *
   1056  *	Chargeback parents for the sins of their children.
   1057  */
   1058 void
   1059 scheduler_wait_hook(struct proc *parent, struct proc *child)
   1060 {
   1061 	fixpt_t loadfac = loadfactor(averunnable.ldavg[0]);
   1062 	fixpt_t estcpu;
   1063 
   1064 	/* XXX Only if parent != init?? */
   1065 
   1066 	mutex_spin_enter(&parent->p_stmutex);
   1067 	estcpu = decay_cpu_batch(loadfac, child->p_estcpu_inherited,
   1068 	    schedcpu_ticks - child->p_forktime);
   1069 	if (child->p_estcpu > estcpu)
   1070 		parent->p_estcpu =
   1071 		    ESTCPULIM(parent->p_estcpu + child->p_estcpu - estcpu);
   1072 	mutex_spin_exit(&parent->p_stmutex);
   1073 }
   1074 
   1075 /*
   1076  * sched_kpri:
   1077  *
   1078  *	Scale a priority level to a kernel priority level, usually
   1079  *	for an LWP that is about to sleep.
   1080  */
   1081 pri_t
   1082 sched_kpri(struct lwp *l)
   1083 {
   1084 	/*
   1085 	 * Scale user priorities (127 -> 50) up to kernel priorities
   1086 	 * in the range (49 -> 8).  Reserve the top 8 kernel priorities
   1087 	 * for high priority kthreads.  Kernel priorities passed in
   1088 	 * are left "as is".  XXX This is somewhat arbitrary.
   1089 	 */
   1090 	static const uint8_t kpri_tab[] = {
   1091 		 0,   1,   2,   3,   4,   5,   6,   7,
   1092 		 8,   9,  10,  11,  12,  13,  14,  15,
   1093 		16,  17,  18,  19,  20,  21,  22,  23,
   1094 		24,  25,  26,  27,  28,  29,  30,  31,
   1095 		32,  33,  34,  35,  36,  37,  38,  39,
   1096 		40,  41,  42,  43,  44,  45,  46,  47,
   1097 		48,  49,   8,   8,   9,   9,  10,  10,
   1098 		11,  11,  12,  12,  13,  14,  14,  15,
   1099 		15,  16,  16,  17,  17,  18,  18,  19,
   1100 		20,  20,  21,  21,  22,  22,  23,  23,
   1101 		24,  24,  25,  26,  26,  27,  27,  28,
   1102 		28,  29,  29,  30,  30,  31,  32,  32,
   1103 		33,  33,  34,  34,  35,  35,  36,  36,
   1104 		37,  38,  38,  39,  39,  40,  40,  41,
   1105 		41,  42,  42,  43,  44,  44,  45,  45,
   1106 		46,  46,  47,  47,  48,  48,  49,  49,
   1107 	};
   1108 
   1109 	return (pri_t)kpri_tab[l->l_usrpri];
   1110 }
   1111 
   1112 /*
   1113  * sched_unsleep:
   1114  *
   1115  *	This is called when the LWP has not been awoken normally but instead
   1116  *	interrupted: for example, if the sleep timed out.  Because of this,
   1117  *	it's not a valid action for running or idle LWPs.
   1118  */
   1119 void
   1120 sched_unsleep(struct lwp *l)
   1121 {
   1122 
   1123 	lwp_unlock(l);
   1124 	panic("sched_unsleep");
   1125 }
   1126 
   1127 /*
   1128  * sched_changepri:
   1129  *
   1130  *	Adjust the priority of an LWP.
   1131  */
   1132 void
   1133 sched_changepri(struct lwp *l, pri_t pri)
   1134 {
   1135 
   1136 	KASSERT(lwp_locked(l, &sched_mutex));
   1137 
   1138 	l->l_usrpri = pri;
   1139 	if (l->l_priority < PUSER)
   1140 		return;
   1141 
   1142 	if (l->l_stat != LSRUN || (l->l_flag & LW_INMEM) == 0) {
   1143 		l->l_priority = pri;
   1144 		return;
   1145 	}
   1146 
   1147 	remrunqueue(l);
   1148 	l->l_priority = pri;
   1149 	setrunqueue(l);
   1150 	resched_lwp(l);
   1151 }
   1152 
   1153 void
   1154 sched_lendpri(struct lwp *l, pri_t pri)
   1155 {
   1156 
   1157 	KASSERT(lwp_locked(l, &sched_mutex));
   1158 
   1159 	if (l->l_stat != LSRUN || (l->l_flag & LW_INMEM) == 0) {
   1160 		l->l_inheritedprio = pri;
   1161 		return;
   1162 	}
   1163 
   1164 	remrunqueue(l);
   1165 	l->l_inheritedprio = pri;
   1166 	setrunqueue(l);
   1167 	resched_lwp(l);
   1168 }
   1169 
   1170 struct lwp *
   1171 syncobj_noowner(wchan_t wchan)
   1172 {
   1173 
   1174 	return NULL;
   1175 }
   1176 
   1177 /*
   1178  * Low-level routines to access the run queue.  Optimised assembler
   1179  * routines can override these.
   1180  */
   1181 
   1182 #ifndef __HAVE_MD_RUNQUEUE
   1183 
   1184 /*
   1185  * On some architectures, it's faster to use an MSB ordering for the priorities
   1186  * than the traditional LSB ordering.
   1187  */
   1188 #ifdef __HAVE_BIGENDIAN_BITOPS
   1189 #define	RQMASK(n) (0x80000000 >> (n))
   1190 #else
   1191 #define	RQMASK(n) (0x00000001 << (n))
   1192 #endif
   1193 
   1194 /*
   1195  * The primitives that manipulate the run queues.  whichqs tells which
   1196  * of the 32 queues qs have processes in them.  Setrunqueue puts processes
   1197  * into queues, remrunqueue removes them from queues.  The running process is
   1198  * on no queue; other processes are on a queue related to p->p_priority,
   1199  * divided by PPQ (4) to shrink the 0-127 range of priorities into the 32
   1200  * available queues.
   1201  */
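        /*
         * Worked example (illustrative): an LWP with effective priority 64
         * goes on queue 64 / PPQ == 16, and RQMASK(16) is the bit tested in
         * sched_whichqs for that queue (counted from the LSB, or from the
         * MSB when __HAVE_BIGENDIAN_BITOPS is defined).
         */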
   1202 #ifdef RQDEBUG
   1203 static void
   1204 checkrunqueue(int whichq, struct lwp *l)
   1205 {
   1206 	const struct prochd * const rq = &sched_qs[whichq];
   1207 	struct lwp *l2;
   1208 	int found = 0;
   1209 	int die = 0;
   1210 	int empty = 1;
   1211 	for (l2 = rq->ph_link; l2 != (const void*) rq; l2 = l2->l_forw) {
   1212 		if (l2->l_stat != LSRUN) {
   1213 			printf("checkrunqueue[%d]: lwp %p state (%d) "
   1214 			    " != LSRUN\n", whichq, l2, l2->l_stat);
   1215 		}
   1216 		if (l2->l_back->l_forw != l2) {
   1217 			printf("checkrunqueue[%d]: lwp %p back-qptr (%p) "
   1218 			    "corrupt %p\n", whichq, l2, l2->l_back,
   1219 			    l2->l_back->l_forw);
   1220 			die = 1;
   1221 		}
   1222 		if (l2->l_forw->l_back != l2) {
   1223 			printf("checkrunqueue[%d]: lwp %p forw-qptr (%p) "
   1224 			    "corrupt %p\n", whichq, l2, l2->l_forw,
   1225 			    l2->l_forw->l_back);
   1226 			die = 1;
   1227 		}
   1228 		if (l2 == l)
   1229 			found = 1;
   1230 		empty = 0;
   1231 	}
   1232 	if (empty && (sched_whichqs & RQMASK(whichq)) != 0) {
   1233 		printf("checkrunqueue[%d]: bit set for empty run-queue %p\n",
   1234 		    whichq, rq);
   1235 		die = 1;
   1236 	} else if (!empty && (sched_whichqs & RQMASK(whichq)) == 0) {
   1237 		printf("checkrunqueue[%d]: bit clear for non-empty "
   1238 		    "run-queue %p\n", whichq, rq);
   1239 		die = 1;
   1240 	}
   1241 	if (l != NULL && (sched_whichqs & RQMASK(whichq)) == 0) {
   1242 		printf("checkrunqueue[%d]: bit clear for active lwp %p\n",
   1243 		    whichq, l);
   1244 		die = 1;
   1245 	}
   1246 	if (l != NULL && empty) {
   1247 		printf("checkrunqueue[%d]: empty run-queue %p with "
   1248 		    "active lwp %p\n", whichq, rq, l);
   1249 		die = 1;
   1250 	}
   1251 	if (l != NULL && !found) {
   1252 		printf("checkrunqueue[%d]: lwp %p not in runqueue %p!\n",
   1253 		    whichq, l, rq);
   1254 		die = 1;
   1255 	}
   1256 	if (die)
   1257 		panic("checkrunqueue: inconsistency found");
   1258 }
   1259 #endif /* RQDEBUG */
   1260 
   1261 void
   1262 setrunqueue(struct lwp *l)
   1263 {
   1264 	struct prochd *rq;
   1265 	struct lwp *prev;
   1266 	const int whichq = lwp_eprio(l) / PPQ;
   1267 
   1268 	KASSERT(lwp_locked(l, &sched_mutex));
   1269 
   1270 #ifdef RQDEBUG
   1271 	checkrunqueue(whichq, NULL);
   1272 #endif
   1273 #ifdef DIAGNOSTIC
   1274 	if (l->l_back != NULL || l->l_stat != LSRUN)
   1275 		panic("setrunqueue");
   1276 #endif
   1277 	sched_whichqs |= RQMASK(whichq);
   1278 	rq = &sched_qs[whichq];
   1279 	prev = rq->ph_rlink;
   1280 	l->l_forw = (struct lwp *)rq;
   1281 	rq->ph_rlink = l;
   1282 	prev->l_forw = l;
   1283 	l->l_back = prev;
   1284 #ifdef RQDEBUG
   1285 	checkrunqueue(whichq, l);
   1286 #endif
   1287 }
   1288 
   1289 /*
   1290  * XXXSMP When LWP dispatch (cpu_switch()) is changed to use remrunqueue(),
   1291  * the drop of the effective priority level from kernel to user needs to be
   1292  * moved here from userret().  The assignment in userret() is currently
   1293  * done unlocked.
   1294  */
   1295 void
   1296 remrunqueue(struct lwp *l)
   1297 {
   1298 	struct lwp *prev, *next;
   1299 	const int whichq = lwp_eprio(l) / PPQ;
   1300 
   1301 	KASSERT(lwp_locked(l, &sched_mutex));
   1302 
   1303 #ifdef RQDEBUG
   1304 	checkrunqueue(whichq, l);
   1305 #endif
   1306 
   1307 #if defined(DIAGNOSTIC)
   1308 	if (((sched_whichqs & RQMASK(whichq)) == 0) || l->l_back == NULL) {
   1309 		/* Shouldn't happen - interrupts disabled. */
   1310 		panic("remrunqueue: bit %d not set", whichq);
   1311 	}
   1312 #endif
   1313 	prev = l->l_back;
   1314 	l->l_back = NULL;
   1315 	next = l->l_forw;
   1316 	prev->l_forw = next;
   1317 	next->l_back = prev;
   1318 	if (prev == next)
   1319 		sched_whichqs &= ~RQMASK(whichq);
   1320 #ifdef RQDEBUG
   1321 	checkrunqueue(whichq, NULL);
   1322 #endif
   1323 }
   1324 
   1325 #undef RQMASK
   1326 #endif /* !defined(__HAVE_MD_RUNQUEUE) */
   1327