      1 /*	$NetBSD: kern_synch.c,v 1.202 2007/10/29 12:16:54 yamt Exp $	*/
      2 
      3 /*-
      4  * Copyright (c) 1999, 2000, 2004, 2006, 2007 The NetBSD Foundation, Inc.
      5  * All rights reserved.
      6  *
      7  * This code is derived from software contributed to The NetBSD Foundation
      8  * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
      9  * NASA Ames Research Center, by Charles M. Hannum, Andrew Doran and
     10  * Daniel Sieger.
     11  *
     12  * Redistribution and use in source and binary forms, with or without
     13  * modification, are permitted provided that the following conditions
     14  * are met:
     15  * 1. Redistributions of source code must retain the above copyright
     16  *    notice, this list of conditions and the following disclaimer.
     17  * 2. Redistributions in binary form must reproduce the above copyright
     18  *    notice, this list of conditions and the following disclaimer in the
     19  *    documentation and/or other materials provided with the distribution.
     20  * 3. All advertising materials mentioning features or use of this software
     21  *    must display the following acknowledgement:
     22  *	This product includes software developed by the NetBSD
     23  *	Foundation, Inc. and its contributors.
     24  * 4. Neither the name of The NetBSD Foundation nor the names of its
     25  *    contributors may be used to endorse or promote products derived
     26  *    from this software without specific prior written permission.
     27  *
     28  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
     29  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
     30  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     31  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
     32  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
     33  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
     34  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
     35  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
     36  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
     37  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
     38  * POSSIBILITY OF SUCH DAMAGE.
     39  */
     40 
     41 /*-
     42  * Copyright (c) 1982, 1986, 1990, 1991, 1993
     43  *	The Regents of the University of California.  All rights reserved.
     44  * (c) UNIX System Laboratories, Inc.
     45  * All or some portions of this file are derived from material licensed
     46  * to the University of California by American Telephone and Telegraph
     47  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
     48  * the permission of UNIX System Laboratories, Inc.
     49  *
     50  * Redistribution and use in source and binary forms, with or without
     51  * modification, are permitted provided that the following conditions
     52  * are met:
     53  * 1. Redistributions of source code must retain the above copyright
     54  *    notice, this list of conditions and the following disclaimer.
     55  * 2. Redistributions in binary form must reproduce the above copyright
     56  *    notice, this list of conditions and the following disclaimer in the
     57  *    documentation and/or other materials provided with the distribution.
     58  * 3. Neither the name of the University nor the names of its contributors
     59  *    may be used to endorse or promote products derived from this software
     60  *    without specific prior written permission.
     61  *
     62  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
     63  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     64  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     65  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
     66  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
     67  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
     68  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
     69  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
     70  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
     71  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
     72  * SUCH DAMAGE.
     73  *
     74  *	@(#)kern_synch.c	8.9 (Berkeley) 5/19/95
     75  */
     76 
     77 #include <sys/cdefs.h>
     78 __KERNEL_RCSID(0, "$NetBSD: kern_synch.c,v 1.202 2007/10/29 12:16:54 yamt Exp $");
     79 
     80 #include "opt_kstack.h"
     81 #include "opt_lockdebug.h"
     82 #include "opt_multiprocessor.h"
     83 #include "opt_perfctrs.h"
     84 #include "opt_sched.h"
     85 
     86 #define	__MUTEX_PRIVATE
     87 
     88 #include <sys/param.h>
     89 #include <sys/systm.h>
     90 #include <sys/proc.h>
     91 #include <sys/kernel.h>
     92 #if defined(PERFCTRS)
     93 #include <sys/pmc.h>
     94 #endif
     95 #include <sys/cpu.h>
     96 #include <sys/resourcevar.h>
     97 #include <sys/sched.h>
     98 #include <sys/syscall_stats.h>
     99 #include <sys/sleepq.h>
    100 #include <sys/lockdebug.h>
    101 #include <sys/evcnt.h>
    102 #include <sys/intr.h>
    103 
    104 #include <uvm/uvm_extern.h>
    105 
    106 callout_t sched_pstats_ch;
    107 unsigned int sched_pstats_ticks;
    108 
    109 kcondvar_t	lbolt;			/* once a second sleep address */
    110 
    111 static void	sched_unsleep(struct lwp *);
    112 static void	sched_changepri(struct lwp *, pri_t);
    113 static void	sched_lendpri(struct lwp *, pri_t);
    114 
    115 syncobj_t sleep_syncobj = {
    116 	SOBJ_SLEEPQ_SORTED,
    117 	sleepq_unsleep,
    118 	sleepq_changepri,
    119 	sleepq_lendpri,
    120 	syncobj_noowner,
    121 };
    122 
    123 syncobj_t sched_syncobj = {
    124 	SOBJ_SLEEPQ_SORTED,
    125 	sched_unsleep,
    126 	sched_changepri,
    127 	sched_lendpri,
    128 	syncobj_noowner,
    129 };
    130 
    131 /*
    132  * During autoconfiguration or after a panic, a sleep will simply lower the
    133  * priority briefly to allow interrupts, then return.  The priority to be
    134  * used (safepri) is machine-dependent, thus this value is initialized and
    135  * maintained in the machine-dependent layers.  This priority will typically
    136  * be 0, or the lowest priority that is safe for use on the interrupt stack;
    137  * it can be made higher to block network software interrupts after panics.
    138  */
    139 int	safepri;
    140 
    141 /*
    142  * OBSOLETE INTERFACE
    143  *
    144  * General sleep call.  Suspends the current process until a wakeup is
    145  * performed on the specified identifier.  The process will then be made
    146  * runnable with the specified priority.  Sleeps at most timo/hz seconds (0
    147  * means no timeout).  If pri includes PCATCH flag, signals are checked
    148  * before and after sleeping, else signals are not checked.  Returns 0 if
    149  * awakened, EWOULDBLOCK if the timeout expires.  If PCATCH is set and a
    150  * signal needs to be delivered, ERESTART is returned if the current system
    151  * call should be restarted if possible, and EINTR is returned if the system
    152  * call should be interrupted by the signal (return EINTR).
    153  *
    154  * The interlock is held until we are on a sleep queue. The interlock will
    155  * be locked before returning to the caller unless the PNORELOCK flag
    156  * is specified, in which case the interlock will always be unlocked upon
    157  * return.
    158  */
    159 int
    160 ltsleep(wchan_t ident, pri_t priority, const char *wmesg, int timo,
    161 	volatile struct simplelock *interlock)
    162 {
    163 	struct lwp *l = curlwp;
    164 	sleepq_t *sq;
    165 	int error;
    166 
    167 	if (sleepq_dontsleep(l)) {
    168 		(void)sleepq_abort(NULL, 0);
    169 		if ((priority & PNORELOCK) != 0)
    170 			simple_unlock(interlock);
    171 		return 0;
    172 	}
    173 
    174 	sq = sleeptab_lookup(&sleeptab, ident);
    175 	sleepq_enter(sq, l);
    176 	sleepq_enqueue(sq, priority & PRIMASK, ident, wmesg, &sleep_syncobj);
    177 
    178 	if (interlock != NULL) {
    179 		LOCK_ASSERT(simple_lock_held(interlock));
    180 		simple_unlock(interlock);
    181 	}
    182 
    183 	error = sleepq_block(timo, priority & PCATCH);
    184 
    185 	if (interlock != NULL && (priority & PNORELOCK) == 0)
    186 		simple_lock(interlock);
    187 
    188 	return error;
    189 }
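
        /*
         * Illustrative sketch (not part of the original file): a typical
         * consumer of the obsolete ltsleep()/wakeup() pair.  The softc
         * structure, its simplelock and the sc_ready flag are hypothetical
         * names used only for this example; what is being demonstrated is
         * re-checking the condition in a loop and letting ltsleep() drop
         * and re-take the interlock around the sleep.
         */
        #if 0	/* example only, not compiled */
        struct example_softc {
        	struct simplelock sc_slock;	/* protects sc_ready */
        	int sc_ready;			/* condition being waited for */
        };

        /* Waiter: sleep until sc_ready becomes non-zero. */
        static int
        example_wait(struct example_softc *sc)
        {
        	int error = 0;

        	simple_lock(&sc->sc_slock);
        	while (sc->sc_ready == 0) {
        		/*
        		 * The interlock is released once the LWP is on the
        		 * sleep queue and re-taken before ltsleep() returns,
        		 * since PNORELOCK is not passed.
        		 */
        		error = ltsleep(&sc->sc_ready, PWAIT | PCATCH, "exwait",
        		    0, &sc->sc_slock);
        		if (error != 0)
        			break;
        	}
        	simple_unlock(&sc->sc_slock);
        	return error;
        }

        /* Waker: make the condition true and wake every LWP sleeping on it. */
        static void
        example_wake(struct example_softc *sc)
        {

        	simple_lock(&sc->sc_slock);
        	sc->sc_ready = 1;
        	simple_unlock(&sc->sc_slock);
        	wakeup(&sc->sc_ready);
        }
        #endif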
    190 
    191 int
    192 mtsleep(wchan_t ident, pri_t priority, const char *wmesg, int timo,
    193 	kmutex_t *mtx)
    194 {
    195 	struct lwp *l = curlwp;
    196 	sleepq_t *sq;
    197 	int error;
    198 
    199 	if (sleepq_dontsleep(l)) {
    200 		(void)sleepq_abort(mtx, (priority & PNORELOCK) != 0);
    201 		return 0;
    202 	}
    203 
    204 	sq = sleeptab_lookup(&sleeptab, ident);
    205 	sleepq_enter(sq, l);
    206 	sleepq_enqueue(sq, priority & PRIMASK, ident, wmesg, &sleep_syncobj);
    207 	mutex_exit(mtx);
    208 	error = sleepq_block(timo, priority & PCATCH);
    209 
    210 	if ((priority & PNORELOCK) == 0)
    211 		mutex_enter(mtx);
    212 
    213 	return error;
    214 }
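
        /*
         * Illustrative sketch (not part of the original file): the same
         * wait pattern using a kmutex interlock via mtsleep().  The 'lock'
         * and 'count' parameters are hypothetical; the mutex is dropped
         * once the LWP is enqueued and re-acquired before returning,
         * because PNORELOCK is not passed.
         */
        #if 0	/* example only, not compiled */
        static int
        example_mtwait(kmutex_t *lock, int *count)
        {
        	int error = 0;

        	mutex_enter(lock);
        	while (*count == 0) {
        		error = mtsleep(count, PWAIT | PCATCH, "exmtwt", 0, lock);
        		if (error != 0)
        			break;
        	}
        	mutex_exit(lock);
        	return error;
        }
        #endif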
    215 
    216 /*
    217  * General sleep call for situations where a wake-up is not expected.
    218  */
    219 int
    220 kpause(const char *wmesg, bool intr, int timo, kmutex_t *mtx)
    221 {
    222 	struct lwp *l = curlwp;
    223 	sleepq_t *sq;
    224 	int error;
    225 
    226 	if (sleepq_dontsleep(l))
    227 		return sleepq_abort(NULL, 0);
    228 
    229 	if (mtx != NULL)
    230 		mutex_exit(mtx);
    231 	sq = sleeptab_lookup(&sleeptab, l);
    232 	sleepq_enter(sq, l);
    233 	sleepq_enqueue(sq, sched_kpri(l), l, wmesg, &sleep_syncobj);
    234 	error = sleepq_block(timo, intr);
    235 	if (mtx != NULL)
    236 		mutex_enter(mtx);
    237 
    238 	return error;
    239 }
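
        /*
         * Illustrative sketch (not part of the original file): kpause()
         * sleeps on the calling LWP itself, so no wakeup(9) can end the
         * sleep early; only the timeout (or, when intr is true, a pending
         * signal) does.  Sleeping for hz ticks therefore pauses the caller
         * for roughly one second.
         */
        #if 0	/* example only, not compiled */
        static void
        example_pause(void)
        {

        	(void)kpause("expause", false, hz, NULL);
        }
        #endif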
    240 
    241 /*
    242  * OBSOLETE INTERFACE
    243  *
    244  * Make all processes sleeping on the specified identifier runnable.
    245  */
    246 void
    247 wakeup(wchan_t ident)
    248 {
    249 	sleepq_t *sq;
    250 
    251 	if (cold)
    252 		return;
    253 
    254 	sq = sleeptab_lookup(&sleeptab, ident);
    255 	sleepq_wake(sq, ident, (u_int)-1);
    256 }
    257 
    258 /*
    259  * OBSOLETE INTERFACE
    260  *
    261  * Make the highest priority process first in line on the specified
    262  * identifier runnable.
    263  */
    264 void
    265 wakeup_one(wchan_t ident)
    266 {
    267 	sleepq_t *sq;
    268 
    269 	if (cold)
    270 		return;
    271 
    272 	sq = sleeptab_lookup(&sleeptab, ident);
    273 	sleepq_wake(sq, ident, 1);
    274 }
    275 
    276 
    277 /*
    278  * General yield call.  Puts the current process back on its run queue and
    279  * performs a voluntary context switch.  Should only be called when the
    280  * current process explicitly requests it (e.g. sched_yield(2)).
    281  */
    282 void
    283 yield(void)
    284 {
    285 	struct lwp *l = curlwp;
    286 
    287 	KERNEL_UNLOCK_ALL(l, &l->l_biglocks);
    288 	lwp_lock(l);
    289 	KASSERT(lwp_locked(l, &l->l_cpu->ci_schedstate.spc_lwplock));
    290 	KASSERT(l->l_stat == LSONPROC);
    291 	/* XXX Only do this for timeshared threads. */
    292 	l->l_priority = MAXPRI;
    293 	(void)mi_switch(l);
    294 	KERNEL_LOCK(l->l_biglocks, l);
    295 }
    296 
    297 /*
    298  * General preemption call.  Puts the current process back on its run queue
    299  * and performs an involuntary context switch.
    300  */
    301 void
    302 preempt(void)
    303 {
    304 	struct lwp *l = curlwp;
    305 
    306 	KERNEL_UNLOCK_ALL(l, &l->l_biglocks);
    307 	lwp_lock(l);
    308 	KASSERT(lwp_locked(l, &l->l_cpu->ci_schedstate.spc_lwplock));
    309 	KASSERT(l->l_stat == LSONPROC);
    310 	l->l_priority = l->l_usrpri;
    311 	l->l_nivcsw++;
    312 	(void)mi_switch(l);
    313 	KERNEL_LOCK(l->l_biglocks, l);
    314 }
    315 
    316 /*
    317  * Compute the amount of time during which the current lwp was running.
    318  *
    319  * - update l_rtime unless it's an idle lwp.
    320  */
    321 
    322 void
    323 updatertime(lwp_t *l, const struct timeval *tv)
    324 {
    325 	long s, u;
    326 
    327 	if ((l->l_flag & LW_IDLE) != 0)
    328 		return;
    329 
    330 	u = l->l_rtime.tv_usec + (tv->tv_usec - l->l_stime.tv_usec);
    331 	s = l->l_rtime.tv_sec + (tv->tv_sec - l->l_stime.tv_sec);
    332 	if (u < 0) {
    333 		u += 1000000;
    334 		s--;
    335 	} else if (u >= 1000000) {
    336 		u -= 1000000;
    337 		s++;
    338 	}
    339 	l->l_rtime.tv_usec = u;
    340 	l->l_rtime.tv_sec = s;
    341 }
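
        /*
         * Worked example of the carry handling above (illustrative numbers):
         * with l_rtime zero, l_stime = 1.900000s and tv = 3.100000s, the raw
         * sums are s = 2 and u = -800000; since u < 0 one second is borrowed,
         * leaving l_rtime = 1.200000s, which is exactly the elapsed run time.
         */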
    342 
    343 /*
    344  * The machine independent parts of context switch.
    345  *
    346  * Returns 1 if another LWP was actually run.
    347  */
    348 int
    349 mi_switch(lwp_t *l)
    350 {
    351 	struct schedstate_percpu *spc;
    352 	struct lwp *newl;
    353 	int retval, oldspl;
    354 	struct cpu_info *ci;
    355 	struct timeval tv;
    356 	bool returning;
    357 
    358 	KASSERT(lwp_locked(l, NULL));
    359 	LOCKDEBUG_BARRIER(l->l_mutex, 1);
    360 
    361 #ifdef KSTACK_CHECK_MAGIC
    362 	kstack_check_magic(l);
    363 #endif
    364 
    365 	microtime(&tv);
    366 
    367 	/*
    368 	 * It's safe to read the per CPU schedstate unlocked here, as all we
    369 	 * are after is the run time and that's guaranteed to have been last
    370 	 * updated by this CPU.
    371 	 */
    372 	ci = l->l_cpu;
    373 	KDASSERT(ci == curcpu());
    374 
    375 	/*
    376 	 * Process is about to yield the CPU; clear the appropriate
    377 	 * scheduling flags.
    378 	 */
    379 	spc = &ci->ci_schedstate;
    380 	returning = false;
    381 	newl = NULL;
    382 
    383 	/*
    384 	 * If we have been asked to switch to a specific LWP, then there
    385 	 * is no need to inspect the run queues.  If a soft interrupt is
    386 	 * blocking, then return to the interrupted thread without adjusting
    387 	 * VM context or its start time: neither have been changed in order
    388 	 * to take the interrupt.
    389 	 */
    390 	if (l->l_switchto != NULL) {
    391 		if ((l->l_flag & LW_INTR) != 0) {
    392 			returning = true;
    393 			softint_block(l);
    394 			if ((l->l_flag & LW_TIMEINTR) != 0)
    395 				updatertime(l, &tv);
    396 		}
    397 		newl = l->l_switchto;
    398 		l->l_switchto = NULL;
    399 	}
    400 
    401 	/* Count time spent in current system call */
    402 	if (!returning) {
    403 		SYSCALL_TIME_SLEEP(l);
    404 
    405 		/*
    406 		 * XXXSMP If we are using h/w performance counters,
    407 		 * save context.
    408 		 */
    409 #if PERFCTRS
    410 		if (PMC_ENABLED(l->l_proc)) {
    411 			pmc_save_context(l->l_proc);
    412 		}
    413 #endif
    414 		updatertime(l, &tv);
    415 	}
    416 
    417 	/*
    418 	 * If we are on the CPU and have gotten this far, then we must yield.
    419 	 */
    420 	mutex_spin_enter(spc->spc_mutex);
    421 	KASSERT(l->l_stat != LSRUN);
    422 	if (l->l_stat == LSONPROC) {
    423 		KASSERT(lwp_locked(l, &spc->spc_lwplock));
    424 		if ((l->l_flag & LW_IDLE) == 0) {
    425 			l->l_stat = LSRUN;
    426 			lwp_setlock(l, spc->spc_mutex);
    427 			sched_enqueue(l, true);
    428 		} else
    429 			l->l_stat = LSIDL;
    430 	}
    431 
    432 	/*
    433 	 * Let sched_nextlwp() select the LWP to run on this CPU next.
    434 	 * If no LWP is runnable, switch to the idle LWP.
    435 	 * Note that spc_lwplock might not necessarily be held.
    436 	 */
    437 	if (newl == NULL) {
    438 		newl = sched_nextlwp();
    439 		if (newl != NULL) {
    440 			sched_dequeue(newl);
    441 			KASSERT(lwp_locked(newl, spc->spc_mutex));
    442 			newl->l_stat = LSONPROC;
    443 			newl->l_cpu = ci;
    444 			newl->l_flag |= LW_RUNNING;
    445 			lwp_setlock(newl, &spc->spc_lwplock);
    446 		} else {
    447 			newl = ci->ci_data.cpu_idlelwp;
    448 			newl->l_stat = LSONPROC;
    449 			newl->l_flag |= LW_RUNNING;
    450 		}
    451 		ci->ci_want_resched = 0;
    452 		spc->spc_flags &= ~SPCF_SWITCHCLEAR;
    453 	}
    454 
    455 	/* Update the new LWP's start time while it is still locked. */
    456 	if (!returning) {
    457 		newl->l_stime = tv;
    458 		/*
    459 		 * XXX The following may be done unlocked if newl != NULL
    460 		 * above.
    461 		 */
    462 		newl->l_priority = newl->l_usrpri;
    463 	}
    464 
    465 	spc->spc_curpriority = newl->l_usrpri;
    466 
    467 	if (l != newl) {
    468 		struct lwp *prevlwp;
    469 
    470 		/*
    471 		 * If the old LWP has been moved to a run queue above,
    472 		 * drop the general purpose LWP lock: it's now locked
    473 		 * by the scheduler lock.
    474 		 *
    475 		 * Otherwise, drop the scheduler lock.  We're done with
    476 		 * the run queues for now.
    477 		 */
    478 		if (l->l_mutex == spc->spc_mutex) {
    479 			mutex_spin_exit(&spc->spc_lwplock);
    480 		} else {
    481 			mutex_spin_exit(spc->spc_mutex);
    482 		}
    483 
    484 		/* Unlocked, but for statistics only. */
    485 		uvmexp.swtch++;
    486 
    487 		/*
    488 		 * Save old VM context, unless a soft interrupt
    489 		 * handler is blocking.
    490 		 */
    491 		if (!returning)
    492 			pmap_deactivate(l);
    493 
    494 		/* Switch to the new LWP.. */
    495 		l->l_ncsw++;
    496 		l->l_flag &= ~LW_RUNNING;
    497 		oldspl = MUTEX_SPIN_OLDSPL(ci);
    498 		prevlwp = cpu_switchto(l, newl);
    499 
    500 		/*
    501 		 * .. we have switched away and are now back so we must
    502 		 * be the new curlwp.  prevlwp is who we replaced.
    503 		 */
    504 		if (prevlwp != NULL) {
    505 			curcpu()->ci_mtx_oldspl = oldspl;
    506 			lwp_unlock(prevlwp);
    507 		} else {
    508 			splx(oldspl);
    509 		}
    510 
    511 		/* Restore VM context. */
    512 		pmap_activate(l);
    513 		retval = 1;
    514 	} else {
    515 		/* Nothing to do - just unlock and return. */
    516 		mutex_spin_exit(spc->spc_mutex);
    517 		lwp_unlock(l);
    518 		retval = 0;
    519 	}
    520 
    521 	KASSERT(l == curlwp);
    522 	KASSERT(l->l_stat == LSONPROC);
    523 	KASSERT(l->l_cpu == curcpu());
    524 
    525 	/*
    526 	 * XXXSMP If we are using h/w performance counters, restore context.
    527 	 */
    528 #if PERFCTRS
    529 	if (PMC_ENABLED(l->l_proc)) {
    530 		pmc_restore_context(l->l_proc);
    531 	}
    532 #endif
    533 
    534 	/*
    535 	 * We're running again; record our new start time.  We might
    536 	 * be running on a new CPU now, so don't use the cached
    537 	 * schedstate_percpu pointer.
    538 	 */
    539 	SYSCALL_TIME_WAKEUP(l);
    540 	KASSERT(curlwp == l);
    541 	KDASSERT(l->l_cpu == curcpu());
    542 	LOCKDEBUG_BARRIER(NULL, 1);
    543 
    544 	return retval;
    545 }
    546 
    547 /*
    548  * Change process state to be runnable, placing it on the run queue if it is
    549  * in memory, and awakening the swapper if it isn't in memory.
    550  *
    551  * Call with the process and LWP locked.  Will return with the LWP unlocked.
    552  */
    553 void
    554 setrunnable(struct lwp *l)
    555 {
    556 	struct proc *p = l->l_proc;
    557 	sigset_t *ss;
    558 
    559 	KASSERT((l->l_flag & LW_IDLE) == 0);
    560 	KASSERT(mutex_owned(&p->p_smutex));
    561 	KASSERT(lwp_locked(l, NULL));
    562 
    563 	switch (l->l_stat) {
    564 	case LSSTOP:
    565 		/*
    566 		 * If we're being traced (possibly because someone attached us
    567 		 * while we were stopped), check for a signal from the debugger.
    568 		 */
    569 		if ((p->p_slflag & PSL_TRACED) != 0 && p->p_xstat != 0) {
    570 			if ((sigprop[p->p_xstat] & SA_TOLWP) != 0)
    571 				ss = &l->l_sigpend.sp_set;
    572 			else
    573 				ss = &p->p_sigpend.sp_set;
    574 			sigaddset(ss, p->p_xstat);
    575 			signotify(l);
    576 		}
    577 		p->p_nrlwps++;
    578 		break;
    579 	case LSSUSPENDED:
    580 		l->l_flag &= ~LW_WSUSPEND;
    581 		p->p_nrlwps++;
    582 		cv_broadcast(&p->p_lwpcv);
    583 		break;
    584 	case LSSLEEP:
    585 		KASSERT(l->l_wchan != NULL);
    586 		break;
    587 	default:
    588 		panic("setrunnable: lwp %p state was %d", l, l->l_stat);
    589 	}
    590 
    591 	/*
    592 	 * If the LWP was sleeping interruptibly, then it's OK to start it
    593 	 * again.  If not, mark it as still sleeping.
    594 	 */
    595 	if (l->l_wchan != NULL) {
    596 		l->l_stat = LSSLEEP;
    597 		/* lwp_unsleep() will release the lock. */
    598 		lwp_unsleep(l);
    599 		return;
    600 	}
    601 
    602 	/*
    603 	 * If the LWP is still on the CPU, mark it as LSONPROC.  It may be
    604 	 * about to call mi_switch(), in which case it will yield.
    605 	 */
    606 	if ((l->l_flag & LW_RUNNING) != 0) {
    607 		l->l_stat = LSONPROC;
    608 		l->l_slptime = 0;
    609 		lwp_unlock(l);
    610 		return;
    611 	}
    612 
    613 	/*
    614 	 * Set the LWP runnable.  If it's swapped out, we need to wake the swapper
    615 	 * to bring it back in.  Otherwise, enter it into a run queue.
    616 	 */
    617 	if (l->l_mutex != l->l_cpu->ci_schedstate.spc_mutex) {
    618 		spc_lock(l->l_cpu);
    619 		lwp_unlock_to(l, l->l_cpu->ci_schedstate.spc_mutex);
    620 	}
    621 
    622 	sched_setrunnable(l);
    623 	l->l_stat = LSRUN;
    624 	l->l_slptime = 0;
    625 
    626 	if (l->l_flag & LW_INMEM) {
    627 		sched_enqueue(l, false);
    628 		resched_cpu(l);
    629 		lwp_unlock(l);
    630 	} else {
    631 		lwp_unlock(l);
    632 		uvm_kick_scheduler();
    633 	}
    634 }
    635 
    636 /*
    637  * suspendsched:
    638  *
    639  *	Convert all non-PK_SYSTEM LSSLEEP or LSRUN LWPs to LSSUSPENDED.
    640  */
    641 void
    642 suspendsched(void)
    643 {
    644 	CPU_INFO_ITERATOR cii;
    645 	struct cpu_info *ci;
    646 	struct lwp *l;
    647 	struct proc *p;
    648 
    649 	/*
    650 	 * We do this by process in order not to violate the locking rules.
    651 	 */
    652 	mutex_enter(&proclist_mutex);
    653 	PROCLIST_FOREACH(p, &allproc) {
    654 		mutex_enter(&p->p_smutex);
    655 
    656 		if ((p->p_flag & PK_SYSTEM) != 0) {
    657 			mutex_exit(&p->p_smutex);
    658 			continue;
    659 		}
    660 
    661 		p->p_stat = SSTOP;
    662 
    663 		LIST_FOREACH(l, &p->p_lwps, l_sibling) {
    664 			if (l == curlwp)
    665 				continue;
    666 
    667 			lwp_lock(l);
    668 
    669 			/*
    670 			 * Set LW_WREBOOT so that the LWP will suspend itself
    671 			 * when it tries to return to user mode.  We want to
    672 			 * try to get as many LWPs as possible to
    673 			 * the user / kernel boundary, so that they will
    674 			 * release any locks that they hold.
    675 			 */
    676 			l->l_flag |= (LW_WREBOOT | LW_WSUSPEND);
    677 
    678 			if (l->l_stat == LSSLEEP &&
    679 			    (l->l_flag & LW_SINTR) != 0) {
    680 				/* setrunnable() will release the lock. */
    681 				setrunnable(l);
    682 				continue;
    683 			}
    684 
    685 			lwp_unlock(l);
    686 		}
    687 
    688 		mutex_exit(&p->p_smutex);
    689 	}
    690 	mutex_exit(&proclist_mutex);
    691 
    692 	/*
    693 	 * Kick all CPUs to make them preempt any LWPs running in user mode.
    694 	 * They'll trap into the kernel and suspend themselves in userret().
    695 	 */
    696 	for (CPU_INFO_FOREACH(cii, ci))
    697 		cpu_need_resched(ci, 0);
    698 }
    699 
    700 /*
    701  * sched_kpri:
    702  *
    703  *	Scale a priority level to a kernel priority level, usually
    704  *	for an LWP that is about to sleep.
    705  */
    706 pri_t
    707 sched_kpri(struct lwp *l)
    708 {
    709 	/*
    710 	 * Scale user priorities (127 -> 50) up to kernel priorities
    711 	 * in the range (49 -> 8).  Reserve the top 8 kernel priorities
    712 	 * for high priority kthreads.  Kernel priorities passed in
    713 	 * are left "as is".  XXX This is somewhat arbitrary.
    714 	 */
    715 	static const uint8_t kpri_tab[] = {
    716 		 0,   1,   2,   3,   4,   5,   6,   7,
    717 		 8,   9,  10,  11,  12,  13,  14,  15,
    718 		16,  17,  18,  19,  20,  21,  22,  23,
    719 		24,  25,  26,  27,  28,  29,  30,  31,
    720 		32,  33,  34,  35,  36,  37,  38,  39,
    721 		40,  41,  42,  43,  44,  45,  46,  47,
    722 		48,  49,   8,   8,   9,   9,  10,  10,
    723 		11,  11,  12,  12,  13,  14,  14,  15,
    724 		15,  16,  16,  17,  17,  18,  18,  19,
    725 		20,  20,  21,  21,  22,  22,  23,  23,
    726 		24,  24,  25,  26,  26,  27,  27,  28,
    727 		28,  29,  29,  30,  30,  31,  32,  32,
    728 		33,  33,  34,  34,  35,  35,  36,  36,
    729 		37,  38,  38,  39,  39,  40,  40,  41,
    730 		41,  42,  42,  43,  44,  44,  45,  45,
    731 		46,  46,  47,  47,  48,  48,  49,  49,
    732 	};
    733 
    734 	return (pri_t)kpri_tab[l->l_usrpri];
    735 }
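
        /*
         * Worked example (values read from kpri_tab above): an LWP with a
         * user priority of l_usrpri == 80 is scaled to kernel priority 24,
         * while an LWP already at a kernel priority, say 30, maps to itself
         * (kpri_tab[30] == 30).
         */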
    736 
    737 /*
    738  * sched_unsleep:
    739  *
    740  *	This is called when the LWP has not been awoken normally but instead
    741  *	interrupted: for example, if the sleep timed out.  Because of this,
    742  *	it's not a valid action for running or idle LWPs.
    743  */
    744 static void
    745 sched_unsleep(struct lwp *l)
    746 {
    747 
    748 	lwp_unlock(l);
    749 	panic("sched_unsleep");
    750 }
    751 
    752 inline void
    753 resched_cpu(struct lwp *l)
    754 {
    755 	struct cpu_info *ci;
    756 	const pri_t pri = lwp_eprio(l);
    757 
    758 	/*
    759 	 * XXXSMP
    760 	 * Since l->l_cpu persists across a context switch,
    761 	 * this gives us *very weak* processor affinity, in
    762 	 * that we notify the CPU on which the process last
    763 	 * ran that it should try to switch.
    764 	 *
    765 	 * This does not guarantee that the process will run on
    766 	 * that processor next, because another processor might
    767 	 * grab it the next time it performs a context switch.
    768 	 *
    769 	 * This also does not handle the case where its last
    770 	 * CPU is running a higher-priority process, but every
    771 	 * other CPU is running a lower-priority process.  There
    772 	 * are ways to handle this situation, but they're not
    773 	 * currently very pretty, and we also need to weigh the
    774 	 * cost of moving a process from one CPU to another.
    775 	 */
    776 	ci = (l->l_cpu != NULL) ? l->l_cpu : curcpu();
    777 	if (pri < ci->ci_schedstate.spc_curpriority)
    778 		cpu_need_resched(ci, 0);
    779 }
    780 
    781 static void
    782 sched_changepri(struct lwp *l, pri_t pri)
    783 {
    784 
    785 	KASSERT(lwp_locked(l, NULL));
    786 
    787 	l->l_usrpri = pri;
    788 	if (l->l_priority < PUSER)
    789 		return;
    790 
    791 	if (l->l_stat != LSRUN || (l->l_flag & LW_INMEM) == 0) {
    792 		l->l_priority = pri;
    793 		return;
    794 	}
    795 
    796 	KASSERT(lwp_locked(l, l->l_cpu->ci_schedstate.spc_mutex));
    797 
    798 	sched_dequeue(l);
    799 	l->l_priority = pri;
    800 	sched_enqueue(l, false);
    801 	resched_cpu(l);
    802 }
    803 
    804 static void
    805 sched_lendpri(struct lwp *l, pri_t pri)
    806 {
    807 
    808 	KASSERT(lwp_locked(l, NULL));
    809 
    810 	if (l->l_stat != LSRUN || (l->l_flag & LW_INMEM) == 0) {
    811 		l->l_inheritedprio = pri;
    812 		return;
    813 	}
    814 
    815 	KASSERT(lwp_locked(l, l->l_cpu->ci_schedstate.spc_mutex));
    816 
    817 	sched_dequeue(l);
    818 	l->l_inheritedprio = pri;
    819 	sched_enqueue(l, false);
    820 	resched_cpu(l);
    821 }
    822 
    823 struct lwp *
    824 syncobj_noowner(wchan_t wchan)
    825 {
    826 
    827 	return NULL;
    828 }
    829 
    830 
    831 /* decay 95% of `p_pctcpu' in 60 seconds; see CCPU_SHIFT before changing */
    832 fixpt_t	ccpu = 0.95122942450071400909 * FSCALE;		/* exp(-1/20) */
    833 
    834 /*
    835  * If `ccpu' is not equal to `exp(-1/20)' and you still want to use the
    836  * faster/more-accurate formula, you'll have to estimate CCPU_SHIFT below
    837  * and possibly adjust FSHIFT in "param.h" so that (FSHIFT >= CCPU_SHIFT).
    838  *
    839  * To estimate CCPU_SHIFT for exp(-1/20), the following formula was used:
    840  *	1 - exp(-1/20) ~= 0.0487 ~= 0.0488 == 1 (fixed pt, *11* bits).
    841  *
    842  * If you don't want to bother with the faster/more-accurate formula, you
    843  * can set CCPU_SHIFT to (FSHIFT + 1) which will use a slower/less-accurate
    844  * (more general) method of calculating the percentage of CPU used by a process.
    845  */
    846 #define	CCPU_SHIFT	(FSHIFT + 1)
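
        /*
         * Worked example: sched_pstats() below is rescheduled every hz ticks,
         * i.e. once per second, so after 60 seconds the weight carried by old
         * ticks is ccpu^60 = exp(-1/20)^60 = exp(-3), roughly 0.0498.  That is,
         * about 95% of the old p_pctcpu value has decayed away, matching the
         * "decay 95% in 60 seconds" comment on ccpu above.
         */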
    847 
    848 /*
    849  * sched_pstats:
    850  *
    851  * Update process statistics and check CPU resource allocation.
    852  * Call the scheduler-specific hook to adjust process/LWP
    853  * priorities where necessary.
    854  */
    855 /* ARGSUSED */
    856 void
    857 sched_pstats(void *arg)
    858 {
    859 	struct rlimit *rlim;
    860 	struct lwp *l;
    861 	struct proc *p;
    862 	int minslp, sig, clkhz;
    863 	long runtm;
    864 
    865 	sched_pstats_ticks++;
    866 
    867 	mutex_enter(&proclist_mutex);
    868 	PROCLIST_FOREACH(p, &allproc) {
    869 		/*
    870 		 * Increment time in/out of memory and sleep time (if
    871 		 * sleeping).  We ignore overflow; with 16-bit int's
    872 		 * (remember them?) overflow takes 45 days.
    873 		 */
    874 		minslp = 2;
    875 		mutex_enter(&p->p_smutex);
    876 		mutex_spin_enter(&p->p_stmutex);
    877 		runtm = p->p_rtime.tv_sec;
    878 		LIST_FOREACH(l, &p->p_lwps, l_sibling) {
    879 			if ((l->l_flag & LW_IDLE) != 0)
    880 				continue;
    881 			lwp_lock(l);
    882 			runtm += l->l_rtime.tv_sec;
    883 			l->l_swtime++;
    884 			if (l->l_stat == LSSLEEP || l->l_stat == LSSTOP ||
    885 			    l->l_stat == LSSUSPENDED) {
    886 				l->l_slptime++;
    887 				minslp = min(minslp, l->l_slptime);
    888 			} else
    889 				minslp = 0;
    890 			sched_pstats_hook(l);
    891 			lwp_unlock(l);
    892 
    893 			/*
    894 			 * p_pctcpu is only for ps.
    895 			 */
    896 			l->l_pctcpu = (l->l_pctcpu * ccpu) >> FSHIFT;
    897 			if (l->l_slptime < 1) {
    898 				clkhz = stathz != 0 ? stathz : hz;
    899 #if	(FSHIFT >= CCPU_SHIFT)
    900 				l->l_pctcpu += (clkhz == 100) ?
    901 				    ((fixpt_t)l->l_cpticks) <<
    902 				        (FSHIFT - CCPU_SHIFT) :
    903 				    100 * (((fixpt_t) l->l_cpticks)
    904 				        << (FSHIFT - CCPU_SHIFT)) / clkhz;
    905 #else
    906 				l->l_pctcpu += ((FSCALE - ccpu) *
    907 				    (l->l_cpticks * FSCALE / clkhz)) >> FSHIFT;
    908 #endif
    909 				l->l_cpticks = 0;
    910 			}
    911 		}
    912 
    913 		p->p_pctcpu = (p->p_pctcpu * ccpu) >> FSHIFT;
    914 #ifdef SCHED_4BSD
    915 		/*
    916 		 * XXX: Workaround - belongs to sched_4bsd.c
    917 		 * If the process has slept the entire second,
    918 		 * stop recalculating its priority until it wakes up.
    919 		 */
    920 		if (minslp <= 1) {
    921 			extern fixpt_t decay_cpu(fixpt_t, fixpt_t);
    922 
    923 			fixpt_t loadfac = 2 * (averunnable.ldavg[0]);
    924 			p->p_estcpu = decay_cpu(loadfac, p->p_estcpu);
    925 		}
    926 #endif
    927 		mutex_spin_exit(&p->p_stmutex);
    928 
    929 		/*
    930 		 * Check if the process exceeds its CPU resource allocation.
    931 		 * If over max, kill it.
    932 		 */
    933 		rlim = &p->p_rlimit[RLIMIT_CPU];
    934 		sig = 0;
    935 		if (runtm >= rlim->rlim_cur) {
    936 			if (runtm >= rlim->rlim_max)
    937 				sig = SIGKILL;
    938 			else {
    939 				sig = SIGXCPU;
    940 				if (rlim->rlim_cur < rlim->rlim_max)
    941 					rlim->rlim_cur += 5;
    942 			}
    943 		}
    944 		mutex_exit(&p->p_smutex);
    945 		if (sig) {
    946 			psignal(p, sig);
    947 		}
    948 	}
    949 	mutex_exit(&proclist_mutex);
    950 	uvm_meter();
    951 	cv_wakeup(&lbolt);
    952 	callout_schedule(&sched_pstats_ch, hz);
    953 }
    954 
    955 void
    956 sched_init(void)
    957 {
    958 
    959 	cv_init(&lbolt, "lbolt");
    960 	callout_init(&sched_pstats_ch, 0);
    961 	callout_setfunc(&sched_pstats_ch, sched_pstats, NULL);
    962 	sched_setup();
    963 	sched_pstats(NULL);
    964 }
    965