      1 /*	$NetBSD: kern_synch.c,v 1.200 2007/10/09 19:00:14 rmind Exp $	*/
      2 
      3 /*-
      4  * Copyright (c) 1999, 2000, 2004, 2006, 2007 The NetBSD Foundation, Inc.
      5  * All rights reserved.
      6  *
      7  * This code is derived from software contributed to The NetBSD Foundation
      8  * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
      9  * NASA Ames Research Center, by Charles M. Hannum, Andrew Doran and
     10  * Daniel Sieger.
     11  *
     12  * Redistribution and use in source and binary forms, with or without
     13  * modification, are permitted provided that the following conditions
     14  * are met:
     15  * 1. Redistributions of source code must retain the above copyright
     16  *    notice, this list of conditions and the following disclaimer.
     17  * 2. Redistributions in binary form must reproduce the above copyright
     18  *    notice, this list of conditions and the following disclaimer in the
     19  *    documentation and/or other materials provided with the distribution.
     20  * 3. All advertising materials mentioning features or use of this software
     21  *    must display the following acknowledgement:
     22  *	This product includes software developed by the NetBSD
     23  *	Foundation, Inc. and its contributors.
     24  * 4. Neither the name of The NetBSD Foundation nor the names of its
     25  *    contributors may be used to endorse or promote products derived
     26  *    from this software without specific prior written permission.
     27  *
     28  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
     29  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
     30  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     31  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
     32  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
     33  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
     34  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
     35  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
     36  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
     37  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
     38  * POSSIBILITY OF SUCH DAMAGE.
     39  */
     40 
     41 /*-
     42  * Copyright (c) 1982, 1986, 1990, 1991, 1993
     43  *	The Regents of the University of California.  All rights reserved.
     44  * (c) UNIX System Laboratories, Inc.
     45  * All or some portions of this file are derived from material licensed
     46  * to the University of California by American Telephone and Telegraph
     47  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
     48  * the permission of UNIX System Laboratories, Inc.
     49  *
     50  * Redistribution and use in source and binary forms, with or without
     51  * modification, are permitted provided that the following conditions
     52  * are met:
     53  * 1. Redistributions of source code must retain the above copyright
     54  *    notice, this list of conditions and the following disclaimer.
     55  * 2. Redistributions in binary form must reproduce the above copyright
     56  *    notice, this list of conditions and the following disclaimer in the
     57  *    documentation and/or other materials provided with the distribution.
     58  * 3. Neither the name of the University nor the names of its contributors
     59  *    may be used to endorse or promote products derived from this software
     60  *    without specific prior written permission.
     61  *
     62  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
     63  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     64  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     65  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
     66  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
     67  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
     68  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
     69  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
     70  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
     71  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
     72  * SUCH DAMAGE.
     73  *
     74  *	@(#)kern_synch.c	8.9 (Berkeley) 5/19/95
     75  */
     76 
     77 #include <sys/cdefs.h>
     78 __KERNEL_RCSID(0, "$NetBSD: kern_synch.c,v 1.200 2007/10/09 19:00:14 rmind Exp $");
     79 
     80 #include "opt_kstack.h"
     81 #include "opt_lockdebug.h"
     82 #include "opt_multiprocessor.h"
     83 #include "opt_perfctrs.h"
     84 
     85 #define	__MUTEX_PRIVATE
     86 
     87 #include <sys/param.h>
     88 #include <sys/systm.h>
     89 #include <sys/proc.h>
     90 #include <sys/kernel.h>
     91 #if defined(PERFCTRS)
     92 #include <sys/pmc.h>
     93 #endif
     94 #include <sys/cpu.h>
     95 #include <sys/resourcevar.h>
     96 #include <sys/sched.h>
     97 #include <sys/syscall_stats.h>
     98 #include <sys/sleepq.h>
     99 #include <sys/lockdebug.h>
    100 #include <sys/evcnt.h>
    101 #include <sys/intr.h>
    102 
    103 #include <uvm/uvm_extern.h>
    104 
    105 callout_t sched_pstats_ch;
    106 unsigned int sched_pstats_ticks;
    107 
    108 kcondvar_t	lbolt;			/* once a second sleep address */
    109 
    110 static void	sched_unsleep(struct lwp *);
    111 static void	sched_changepri(struct lwp *, pri_t);
    112 static void	sched_lendpri(struct lwp *, pri_t);
    113 
    114 syncobj_t sleep_syncobj = {
    115 	SOBJ_SLEEPQ_SORTED,
    116 	sleepq_unsleep,
    117 	sleepq_changepri,
    118 	sleepq_lendpri,
    119 	syncobj_noowner,
    120 };
    121 
    122 syncobj_t sched_syncobj = {
    123 	SOBJ_SLEEPQ_SORTED,
    124 	sched_unsleep,
    125 	sched_changepri,
    126 	sched_lendpri,
    127 	syncobj_noowner,
    128 };
    129 
    130 /*
    131  * During autoconfiguration or after a panic, a sleep will simply lower the
    132  * priority briefly to allow interrupts, then return.  The priority to be
    133  * used (safepri) is machine-dependent, thus this value is initialized and
    134  * maintained in the machine-dependent layers.  This priority will typically
    135  * be 0, or the lowest priority that is safe for use on the interrupt stack;
    136  * it can be made higher to block network software interrupts after panics.
    137  */
    138 int	safepri;
    139 
    140 /*
    141  * OBSOLETE INTERFACE
    142  *
    143  * General sleep call.  Suspends the current process until a wakeup is
    144  * performed on the specified identifier.  The process will then be made
    145  * runnable with the specified priority.  Sleeps at most timo/hz seconds (0
    146  * means no timeout).  If pri includes PCATCH flag, signals are checked
    147  * before and after sleeping, else signals are not checked.  Returns 0 if
    148  * awakened, EWOULDBLOCK if the timeout expires.  If PCATCH is set and a
    149  * signal needs to be delivered, ERESTART is returned if the current system
     150  * call should be restarted if possible, and EINTR is returned if the system
     151  * call should be interrupted by the signal.
    152  *
    153  * The interlock is held until we are on a sleep queue. The interlock will
     154  * be re-acquired before returning to the caller unless the PNORELOCK flag
    155  * is specified, in which case the interlock will always be unlocked upon
    156  * return.
    157  */
    158 int
    159 ltsleep(wchan_t ident, pri_t priority, const char *wmesg, int timo,
    160 	volatile struct simplelock *interlock)
    161 {
    162 	struct lwp *l = curlwp;
    163 	sleepq_t *sq;
    164 	int error;
    165 
    166 	if (sleepq_dontsleep(l)) {
    167 		(void)sleepq_abort(NULL, 0);
    168 		if ((priority & PNORELOCK) != 0)
    169 			simple_unlock(interlock);
    170 		return 0;
    171 	}
    172 
    173 	sq = sleeptab_lookup(&sleeptab, ident);
    174 	sleepq_enter(sq, l);
    175 	sleepq_enqueue(sq, priority & PRIMASK, ident, wmesg, &sleep_syncobj);
    176 
    177 	if (interlock != NULL) {
    178 		LOCK_ASSERT(simple_lock_held(interlock));
    179 		simple_unlock(interlock);
    180 	}
    181 
    182 	error = sleepq_block(timo, priority & PCATCH);
    183 
    184 	if (interlock != NULL && (priority & PNORELOCK) == 0)
    185 		simple_lock(interlock);
    186 
    187 	return error;
    188 }
    189 
    190 int
    191 mtsleep(wchan_t ident, pri_t priority, const char *wmesg, int timo,
    192 	kmutex_t *mtx)
    193 {
    194 	struct lwp *l = curlwp;
    195 	sleepq_t *sq;
    196 	int error;
    197 
    198 	if (sleepq_dontsleep(l)) {
    199 		(void)sleepq_abort(mtx, (priority & PNORELOCK) != 0);
    200 		return 0;
    201 	}
    202 
    203 	sq = sleeptab_lookup(&sleeptab, ident);
    204 	sleepq_enter(sq, l);
    205 	sleepq_enqueue(sq, priority & PRIMASK, ident, wmesg, &sleep_syncobj);
    206 	mutex_exit(mtx);
    207 	error = sleepq_block(timo, priority & PCATCH);
    208 
    209 	if ((priority & PNORELOCK) == 0)
    210 		mutex_enter(mtx);
    211 
    212 	return error;
    213 }
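
/*
 * Illustrative only: a minimal sketch of how a caller typically pairs
 * mtsleep() with wakeup().  The "sc" softc with its sc_lock mutex and
 * sc_done flag is hypothetical and not part of this file.
 *
 *	mutex_enter(&sc->sc_lock);
 *	while (!sc->sc_done) {
 *		error = mtsleep(&sc->sc_done, PRIBIO | PCATCH, "scdone",
 *		    hz, &sc->sc_lock);
 *		if (error != 0)
 *			break;
 *	}
 *	mutex_exit(&sc->sc_lock);
 *
 * The waker sets sc->sc_done under sc_lock and calls wakeup(&sc->sc_done).
 * Unless PNORELOCK is passed, mtsleep() returns with sc_lock re-held, so
 * the flag can safely be re-tested by the loop.
 */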
    214 
    215 /*
    216  * General sleep call for situations where a wake-up is not expected.
    217  */
    218 int
    219 kpause(const char *wmesg, bool intr, int timo, kmutex_t *mtx)
    220 {
    221 	struct lwp *l = curlwp;
    222 	sleepq_t *sq;
    223 	int error;
    224 
    225 	if (sleepq_dontsleep(l))
    226 		return sleepq_abort(NULL, 0);
    227 
    228 	if (mtx != NULL)
    229 		mutex_exit(mtx);
    230 	sq = sleeptab_lookup(&sleeptab, l);
    231 	sleepq_enter(sq, l);
    232 	sleepq_enqueue(sq, sched_kpri(l), l, wmesg, &sleep_syncobj);
    233 	error = sleepq_block(timo, intr);
    234 	if (mtx != NULL)
    235 		mutex_enter(mtx);
    236 
    237 	return error;
    238 }
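
/*
 * Illustrative only: since no wakeup is expected, kpause() is normally
 * used as a simple in-kernel delay.  For example, to sleep for roughly
 * half a second, uninterruptible by signals and with no mutex to drop
 * (the "xyzpoll" wmesg is an arbitrary placeholder):
 *
 *	(void)kpause("xyzpoll", false, hz / 2, NULL);
 *
 * The wait channel is the calling LWP itself, so only the timeout (or a
 * signal, when intr is true) ends the sleep.
 */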
    239 
    240 /*
    241  * OBSOLETE INTERFACE
    242  *
    243  * Make all processes sleeping on the specified identifier runnable.
    244  */
    245 void
    246 wakeup(wchan_t ident)
    247 {
    248 	sleepq_t *sq;
    249 
    250 	if (cold)
    251 		return;
    252 
    253 	sq = sleeptab_lookup(&sleeptab, ident);
    254 	sleepq_wake(sq, ident, (u_int)-1);
    255 }
    256 
    257 /*
    258  * OBSOLETE INTERFACE
    259  *
     260  * Make runnable only the highest-priority process sleeping on the
     261  * specified identifier.
    262  */
    263 void
    264 wakeup_one(wchan_t ident)
    265 {
    266 	sleepq_t *sq;
    267 
    268 	if (cold)
    269 		return;
    270 
    271 	sq = sleeptab_lookup(&sleeptab, ident);
    272 	sleepq_wake(sq, ident, 1);
    273 }
    274 
    275 
    276 /*
    277  * General yield call.  Puts the current process back on its run queue and
    278  * performs a voluntary context switch.  Should only be called when the
     279  * current process explicitly requests it (e.g. sched_yield(2)).
    280  */
    281 void
    282 yield(void)
    283 {
    284 	struct lwp *l = curlwp;
    285 
    286 	KERNEL_UNLOCK_ALL(l, &l->l_biglocks);
    287 	lwp_lock(l);
    288 	KASSERT(lwp_locked(l, &l->l_cpu->ci_schedstate.spc_lwplock));
    289 	KASSERT(l->l_stat == LSONPROC);
    290 	/* XXX Only do this for timeshared threads. */
    291 	l->l_priority = MAXPRI;
    292 	(void)mi_switch(l);
    293 	KERNEL_LOCK(l->l_biglocks, l);
    294 }
    295 
    296 /*
    297  * General preemption call.  Puts the current process back on its run queue
    298  * and performs an involuntary context switch.
    299  */
    300 void
    301 preempt(void)
    302 {
    303 	struct lwp *l = curlwp;
    304 
    305 	KERNEL_UNLOCK_ALL(l, &l->l_biglocks);
    306 	lwp_lock(l);
    307 	KASSERT(lwp_locked(l, &l->l_cpu->ci_schedstate.spc_lwplock));
    308 	KASSERT(l->l_stat == LSONPROC);
    309 	l->l_priority = l->l_usrpri;
    310 	l->l_nivcsw++;
    311 	(void)mi_switch(l);
    312 	KERNEL_LOCK(l->l_biglocks, l);
    313 }
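
/*
 * Note on the counters used above: preempt() marks the switch as
 * involuntary by incrementing l_nivcsw, while mi_switch() below bumps
 * l_ncsw whenever an actual switch takes place, so the two counters
 * together distinguish voluntary from involuntary context switches.
 */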
    314 
    315 /*
    316  * Compute the amount of time during which the current lwp was running.
    317  *
    318  * - update l_rtime unless it's an idle lwp.
    319  */
    320 
    321 void
    322 updatertime(lwp_t *l, const struct timeval *tv)
    323 {
    324 	long s, u;
    325 
    326 	if ((l->l_flag & LW_IDLE) != 0)
    327 		return;
    328 
    329 	u = l->l_rtime.tv_usec + (tv->tv_usec - l->l_stime.tv_usec);
    330 	s = l->l_rtime.tv_sec + (tv->tv_sec - l->l_stime.tv_sec);
    331 	if (u < 0) {
    332 		u += 1000000;
    333 		s--;
    334 	} else if (u >= 1000000) {
    335 		u -= 1000000;
    336 		s++;
    337 	}
    338 	l->l_rtime.tv_usec = u;
    339 	l->l_rtime.tv_sec = s;
    340 }
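
/*
 * Worked example of the carry handling above: with l_rtime = { 5, 900000 }
 * and an elapsed interval of 0.3s since l_stime, the raw microsecond sum is
 * u = 1200000, so one second is carried into the seconds field and the
 * result is l_rtime = { 6, 200000 }.
 */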
    341 
    342 /*
    343  * The machine independent parts of context switch.
    344  *
    345  * Returns 1 if another LWP was actually run.
    346  */
    347 int
    348 mi_switch(lwp_t *l)
    349 {
    350 	struct schedstate_percpu *spc;
    351 	struct lwp *newl;
    352 	int retval, oldspl;
    353 	struct cpu_info *ci;
    354 	struct timeval tv;
    355 	bool returning;
    356 
    357 	KASSERT(lwp_locked(l, NULL));
    358 	LOCKDEBUG_BARRIER(l->l_mutex, 1);
    359 
    360 #ifdef KSTACK_CHECK_MAGIC
    361 	kstack_check_magic(l);
    362 #endif
    363 
    364 	microtime(&tv);
    365 
    366 	/*
    367 	 * It's safe to read the per CPU schedstate unlocked here, as all we
     368 	 * are after is the run time and that's guaranteed to have been last
    369 	 * updated by this CPU.
    370 	 */
    371 	ci = l->l_cpu;
    372 	KDASSERT(ci == curcpu());
    373 
    374 	/*
    375 	 * Process is about to yield the CPU; clear the appropriate
    376 	 * scheduling flags.
    377 	 */
    378 	spc = &ci->ci_schedstate;
    379 	returning = false;
    380 	newl = NULL;
    381 
    382 	/*
    383 	 * If we have been asked to switch to a specific LWP, then there
    384 	 * is no need to inspect the run queues.  If a soft interrupt is
    385 	 * blocking, then return to the interrupted thread without adjusting
    386 	 * VM context or its start time: neither have been changed in order
     387 	 * VM context or its start time: neither has been changed in order
    388 	 */
    389 	if (l->l_switchto != NULL) {
    390 		if ((l->l_flag & LW_INTR) != 0) {
    391 			returning = true;
    392 			softint_block(l);
    393 			if ((l->l_flag & LW_TIMEINTR) != 0)
    394 				updatertime(l, &tv);
    395 		}
    396 		newl = l->l_switchto;
    397 		l->l_switchto = NULL;
    398 	}
    399 
    400 	/* Count time spent in current system call */
    401 	if (!returning) {
    402 		SYSCALL_TIME_SLEEP(l);
    403 
    404 		/*
    405 		 * XXXSMP If we are using h/w performance counters,
    406 		 * save context.
    407 		 */
    408 #if PERFCTRS
    409 		if (PMC_ENABLED(l->l_proc)) {
    410 			pmc_save_context(l->l_proc);
    411 		}
    412 #endif
    413 		updatertime(l, &tv);
    414 	}
    415 
    416 	/*
     417 	 * If we are on the CPU and have gotten this far, then we must yield.
    418 	 */
    419 	mutex_spin_enter(spc->spc_mutex);
    420 	KASSERT(l->l_stat != LSRUN);
    421 	if (l->l_stat == LSONPROC) {
    422 		KASSERT(lwp_locked(l, &spc->spc_lwplock));
    423 		if ((l->l_flag & LW_IDLE) == 0) {
    424 			l->l_stat = LSRUN;
    425 			lwp_setlock(l, spc->spc_mutex);
    426 			sched_enqueue(l, true);
    427 		} else
    428 			l->l_stat = LSIDL;
    429 	}
    430 
    431 	/*
     432 	 * Let sched_nextlwp() select the LWP to run on this CPU next.
    433 	 * If no LWP is runnable, switch to the idle LWP.
    434 	 */
    435 	if (newl == NULL) {
    436 		newl = sched_nextlwp();
    437 		if (newl != NULL) {
    438 			sched_dequeue(newl);
    439 			KASSERT(lwp_locked(newl, spc->spc_mutex));
    440 			newl->l_stat = LSONPROC;
    441 			newl->l_cpu = ci;
    442 			newl->l_flag |= LW_RUNNING;
    443 			lwp_setlock(newl, &spc->spc_lwplock);
    444 		} else {
    445 			newl = ci->ci_data.cpu_idlelwp;
    446 			newl->l_stat = LSONPROC;
    447 			newl->l_flag |= LW_RUNNING;
    448 		}
    449 		ci->ci_want_resched = 0;
    450 		spc->spc_flags &= ~SPCF_SWITCHCLEAR;
    451 	}
    452 
    453 	/* Update the new LWP's start time while it is still locked. */
    454 	if (!returning) {
    455 		newl->l_stime = tv;
    456 		/*
    457 		 * XXX The following may be done unlocked if newl != NULL
    458 		 * above.
    459 		 */
    460 		newl->l_priority = newl->l_usrpri;
    461 	}
    462 
    463 	spc->spc_curpriority = newl->l_usrpri;
    464 
    465 	if (l != newl) {
    466 		struct lwp *prevlwp;
    467 
    468 		/*
    469 		 * If the old LWP has been moved to a run queue above,
    470 		 * drop the general purpose LWP lock: it's now locked
    471 		 * by the scheduler lock.
    472 		 *
    473 		 * Otherwise, drop the scheduler lock.  We're done with
    474 		 * the run queues for now.
    475 		 */
    476 		if (l->l_mutex == spc->spc_mutex) {
    477 			mutex_spin_exit(&spc->spc_lwplock);
    478 		} else {
    479 			mutex_spin_exit(spc->spc_mutex);
    480 		}
    481 
    482 		/* Unlocked, but for statistics only. */
    483 		uvmexp.swtch++;
    484 
    485 		/*
    486 		 * Save old VM context, unless a soft interrupt
    487 		 * handler is blocking.
    488 		 */
    489 		if (!returning)
    490 			pmap_deactivate(l);
    491 
    492 		/* Switch to the new LWP.. */
    493 		l->l_ncsw++;
    494 		l->l_flag &= ~LW_RUNNING;
    495 		oldspl = MUTEX_SPIN_OLDSPL(ci);
    496 		prevlwp = cpu_switchto(l, newl);
    497 
    498 		/*
    499 		 * .. we have switched away and are now back so we must
    500 		 * be the new curlwp.  prevlwp is who we replaced.
    501 		 */
    502 		if (prevlwp != NULL) {
    503 			curcpu()->ci_mtx_oldspl = oldspl;
    504 			lwp_unlock(prevlwp);
    505 		} else {
    506 			splx(oldspl);
    507 		}
    508 
    509 		/* Restore VM context. */
    510 		pmap_activate(l);
    511 		retval = 1;
    512 	} else {
    513 		/* Nothing to do - just unlock and return. */
    514 		mutex_spin_exit(spc->spc_mutex);
    515 		lwp_unlock(l);
    516 		retval = 0;
    517 	}
    518 
    519 	KASSERT(l == curlwp);
    520 	KASSERT(l->l_stat == LSONPROC);
    521 	KASSERT(l->l_cpu == curcpu());
    522 
    523 	/*
    524 	 * XXXSMP If we are using h/w performance counters, restore context.
    525 	 */
    526 #if PERFCTRS
    527 	if (PMC_ENABLED(l->l_proc)) {
    528 		pmc_restore_context(l->l_proc);
    529 	}
    530 #endif
    531 
    532 	/*
    533 	 * We're running again; record our new start time.  We might
    534 	 * be running on a new CPU now, so don't use the cached
    535 	 * schedstate_percpu pointer.
    536 	 */
    537 	SYSCALL_TIME_WAKEUP(l);
    538 	KASSERT(curlwp == l);
    539 	KDASSERT(l->l_cpu == curcpu());
    540 	LOCKDEBUG_BARRIER(NULL, 1);
    541 
    542 	return retval;
    543 }
    544 
    545 /*
    546  * Change process state to be runnable, placing it on the run queue if it is
    547  * in memory, and awakening the swapper if it isn't in memory.
    548  *
    549  * Call with the process and LWP locked.  Will return with the LWP unlocked.
    550  */
    551 void
    552 setrunnable(struct lwp *l)
    553 {
    554 	struct proc *p = l->l_proc;
    555 	sigset_t *ss;
    556 
    557 	KASSERT((l->l_flag & LW_IDLE) == 0);
    558 	KASSERT(mutex_owned(&p->p_smutex));
    559 	KASSERT(lwp_locked(l, NULL));
    560 
    561 	switch (l->l_stat) {
    562 	case LSSTOP:
    563 		/*
    564 		 * If we're being traced (possibly because someone attached us
    565 		 * while we were stopped), check for a signal from the debugger.
    566 		 */
    567 		if ((p->p_slflag & PSL_TRACED) != 0 && p->p_xstat != 0) {
    568 			if ((sigprop[p->p_xstat] & SA_TOLWP) != 0)
    569 				ss = &l->l_sigpend.sp_set;
    570 			else
    571 				ss = &p->p_sigpend.sp_set;
    572 			sigaddset(ss, p->p_xstat);
    573 			signotify(l);
    574 		}
    575 		p->p_nrlwps++;
    576 		break;
    577 	case LSSUSPENDED:
    578 		l->l_flag &= ~LW_WSUSPEND;
    579 		p->p_nrlwps++;
    580 		cv_broadcast(&p->p_lwpcv);
    581 		break;
    582 	case LSSLEEP:
    583 		KASSERT(l->l_wchan != NULL);
    584 		break;
    585 	default:
    586 		panic("setrunnable: lwp %p state was %d", l, l->l_stat);
    587 	}
    588 
    589 	/*
     590 	 * If the LWP was sleeping interruptibly, then it's OK to start it
    591 	 * again.  If not, mark it as still sleeping.
    592 	 */
    593 	if (l->l_wchan != NULL) {
    594 		l->l_stat = LSSLEEP;
    595 		/* lwp_unsleep() will release the lock. */
    596 		lwp_unsleep(l);
    597 		return;
    598 	}
    599 
    600 	/*
    601 	 * If the LWP is still on the CPU, mark it as LSONPROC.  It may be
    602 	 * about to call mi_switch(), in which case it will yield.
    603 	 */
    604 	if ((l->l_flag & LW_RUNNING) != 0) {
    605 		l->l_stat = LSONPROC;
    606 		l->l_slptime = 0;
    607 		lwp_unlock(l);
    608 		return;
    609 	}
    610 
    611 	/*
     612 	 * Set the LWP runnable.  If it's swapped out, we need to wake the
     613 	 * swapper to bring it back in.  Otherwise, enter it into a run queue.
    614 	 */
    615 	if (l->l_mutex != l->l_cpu->ci_schedstate.spc_mutex) {
    616 		spc_lock(l->l_cpu);
    617 		lwp_unlock_to(l, l->l_cpu->ci_schedstate.spc_mutex);
    618 	}
    619 
    620 	sched_setrunnable(l);
    621 	l->l_stat = LSRUN;
    622 	l->l_slptime = 0;
    623 
    624 	if (l->l_flag & LW_INMEM) {
    625 		sched_enqueue(l, false);
    626 		resched_cpu(l);
    627 		lwp_unlock(l);
    628 	} else {
    629 		lwp_unlock(l);
    630 		uvm_kick_scheduler();
    631 	}
    632 }
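
/*
 * Illustrative only: a typical caller observes the locking contract
 * described in the comment above, for example:
 *
 *	mutex_enter(&p->p_smutex);
 *	lwp_lock(l);
 *	setrunnable(l);
 *	mutex_exit(&p->p_smutex);
 *
 * setrunnable() itself drops the LWP lock, so the caller must not touch
 * the LWP's scheduling state afterwards without re-locking it.
 */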
    633 
    634 /*
    635  * suspendsched:
    636  *
     637  *	Convert all non-PK_SYSTEM LSSLEEP or LSRUN LWPs to LSSUSPENDED.
    638  */
    639 void
    640 suspendsched(void)
    641 {
    642 	CPU_INFO_ITERATOR cii;
    643 	struct cpu_info *ci;
    644 	struct lwp *l;
    645 	struct proc *p;
    646 
    647 	/*
    648 	 * We do this by process in order not to violate the locking rules.
    649 	 */
    650 	mutex_enter(&proclist_mutex);
    651 	PROCLIST_FOREACH(p, &allproc) {
    652 		mutex_enter(&p->p_smutex);
    653 
    654 		if ((p->p_flag & PK_SYSTEM) != 0) {
    655 			mutex_exit(&p->p_smutex);
    656 			continue;
    657 		}
    658 
    659 		p->p_stat = SSTOP;
    660 
    661 		LIST_FOREACH(l, &p->p_lwps, l_sibling) {
    662 			if (l == curlwp)
    663 				continue;
    664 
    665 			lwp_lock(l);
    666 
    667 			/*
    668 			 * Set L_WREBOOT so that the LWP will suspend itself
    669 			 * when it tries to return to user mode.  We want to
     670 			 * try to get as many LWPs as possible to
    671 			 * the user / kernel boundary, so that they will
    672 			 * release any locks that they hold.
    673 			 */
    674 			l->l_flag |= (LW_WREBOOT | LW_WSUSPEND);
    675 
    676 			if (l->l_stat == LSSLEEP &&
    677 			    (l->l_flag & LW_SINTR) != 0) {
    678 				/* setrunnable() will release the lock. */
    679 				setrunnable(l);
    680 				continue;
    681 			}
    682 
    683 			lwp_unlock(l);
    684 		}
    685 
    686 		mutex_exit(&p->p_smutex);
    687 	}
    688 	mutex_exit(&proclist_mutex);
    689 
    690 	/*
    691 	 * Kick all CPUs to make them preempt any LWPs running in user mode.
    692 	 * They'll trap into the kernel and suspend themselves in userret().
    693 	 */
    694 	for (CPU_INFO_FOREACH(cii, ci))
    695 		cpu_need_resched(ci, 0);
    696 }
    697 
    698 /*
    699  * sched_kpri:
    700  *
    701  *	Scale a priority level to a kernel priority level, usually
    702  *	for an LWP that is about to sleep.
    703  */
    704 pri_t
    705 sched_kpri(struct lwp *l)
    706 {
    707 	/*
    708 	 * Scale user priorities (127 -> 50) up to kernel priorities
    709 	 * in the range (49 -> 8).  Reserve the top 8 kernel priorities
    710 	 * for high priority kthreads.  Kernel priorities passed in
    711 	 * are left "as is".  XXX This is somewhat arbitrary.
    712 	 */
    713 	static const uint8_t kpri_tab[] = {
    714 		 0,   1,   2,   3,   4,   5,   6,   7,
    715 		 8,   9,  10,  11,  12,  13,  14,  15,
    716 		16,  17,  18,  19,  20,  21,  22,  23,
    717 		24,  25,  26,  27,  28,  29,  30,  31,
    718 		32,  33,  34,  35,  36,  37,  38,  39,
    719 		40,  41,  42,  43,  44,  45,  46,  47,
    720 		48,  49,   8,   8,   9,   9,  10,  10,
    721 		11,  11,  12,  12,  13,  14,  14,  15,
    722 		15,  16,  16,  17,  17,  18,  18,  19,
    723 		20,  20,  21,  21,  22,  22,  23,  23,
    724 		24,  24,  25,  26,  26,  27,  27,  28,
    725 		28,  29,  29,  30,  30,  31,  32,  32,
    726 		33,  33,  34,  34,  35,  35,  36,  36,
    727 		37,  38,  38,  39,  39,  40,  40,  41,
    728 		41,  42,  42,  43,  44,  44,  45,  45,
    729 		46,  46,  47,  47,  48,  48,  49,  49,
    730 	};
    731 
    732 	return (pri_t)kpri_tab[l->l_usrpri];
    733 }
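
/*
 * For reference, derived from the table above: indices 0..49 map to
 * themselves, so priorities already in the kernel range pass through
 * unchanged, while the user range is compressed, e.g. kpri_tab[50] == 8
 * and kpri_tab[127] == 49.
 */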
    734 
    735 /*
    736  * sched_unsleep:
    737  *
     738 	 *	This is called when the LWP has not been awoken normally but instead
    739  *	interrupted: for example, if the sleep timed out.  Because of this,
    740  *	it's not a valid action for running or idle LWPs.
    741  */
    742 static void
    743 sched_unsleep(struct lwp *l)
    744 {
    745 
    746 	lwp_unlock(l);
    747 	panic("sched_unsleep");
    748 }
    749 
    750 inline void
    751 resched_cpu(struct lwp *l)
    752 {
    753 	struct cpu_info *ci;
    754 	const pri_t pri = lwp_eprio(l);
    755 
    756 	/*
    757 	 * XXXSMP
    758 	 * Since l->l_cpu persists across a context switch,
    759 	 * this gives us *very weak* processor affinity, in
    760 	 * that we notify the CPU on which the process last
    761 	 * ran that it should try to switch.
    762 	 *
    763 	 * This does not guarantee that the process will run on
    764 	 * that processor next, because another processor might
    765 	 * grab it the next time it performs a context switch.
    766 	 *
    767 	 * This also does not handle the case where its last
    768 	 * CPU is running a higher-priority process, but every
    769 	 * other CPU is running a lower-priority process.  There
    770 	 * are ways to handle this situation, but they're not
    771 	 * currently very pretty, and we also need to weigh the
    772 	 * cost of moving a process from one CPU to another.
    773 	 */
    774 	ci = (l->l_cpu != NULL) ? l->l_cpu : curcpu();
    775 	if (pri < ci->ci_schedstate.spc_curpriority)
    776 		cpu_need_resched(ci, 0);
    777 }
    778 
    779 static void
    780 sched_changepri(struct lwp *l, pri_t pri)
    781 {
    782 
    783 	KASSERT(lwp_locked(l, NULL));
    784 
    785 	l->l_usrpri = pri;
    786 	if (l->l_priority < PUSER)
    787 		return;
    788 
    789 	if (l->l_stat != LSRUN || (l->l_flag & LW_INMEM) == 0) {
    790 		l->l_priority = pri;
    791 		return;
    792 	}
    793 
    794 	KASSERT(lwp_locked(l, l->l_cpu->ci_schedstate.spc_mutex));
    795 
    796 	sched_dequeue(l);
    797 	l->l_priority = pri;
    798 	sched_enqueue(l, false);
    799 	resched_cpu(l);
    800 }
    801 
    802 static void
    803 sched_lendpri(struct lwp *l, pri_t pri)
    804 {
    805 
    806 	KASSERT(lwp_locked(l, NULL));
    807 
    808 	if (l->l_stat != LSRUN || (l->l_flag & LW_INMEM) == 0) {
    809 		l->l_inheritedprio = pri;
    810 		return;
    811 	}
    812 
    813 	KASSERT(lwp_locked(l, l->l_cpu->ci_schedstate.spc_mutex));
    814 
    815 	sched_dequeue(l);
    816 	l->l_inheritedprio = pri;
    817 	sched_enqueue(l, false);
    818 	resched_cpu(l);
    819 }
    820 
    821 struct lwp *
    822 syncobj_noowner(wchan_t wchan)
    823 {
    824 
    825 	return NULL;
    826 }
    827 
    828 
    829 /* decay 95% of `p_pctcpu' in 60 seconds; see CCPU_SHIFT before changing */
    830 fixpt_t	ccpu = 0.95122942450071400909 * FSCALE;		/* exp(-1/20) */
    831 
    832 /*
    833  * If `ccpu' is not equal to `exp(-1/20)' and you still want to use the
    834  * faster/more-accurate formula, you'll have to estimate CCPU_SHIFT below
    835  * and possibly adjust FSHIFT in "param.h" so that (FSHIFT >= CCPU_SHIFT).
    836  *
    837  * To estimate CCPU_SHIFT for exp(-1/20), the following formula was used:
    838  *	1 - exp(-1/20) ~= 0.0487 ~= 0.0488 == 1 (fixed pt, *11* bits).
    839  *
     840  * If you don't want to bother with the faster/more-accurate formula, you
     841  * can set CCPU_SHIFT to (FSHIFT + 1), which will use a slower/less-accurate
     842  * (more general) method of calculating the percentage of CPU used by a process.
    843  */
    844 #define	CCPU_SHIFT	(FSHIFT + 1)
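
/*
 * Worked example of the decay above: each sched_pstats() pass (scheduled
 * every hz ticks, i.e. about once per second) computes
 *
 *	l_pctcpu = (l_pctcpu * ccpu) >> FSHIFT;
 *
 * which multiplies by exp(-1/20) ~= 0.95122.  After 60 passes the old
 * contribution is scaled by exp(-60/20) = exp(-3) ~= 0.0498, which is why
 * roughly 95% of p_pctcpu has decayed away after 60 seconds.
 */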
    845 
    846 /*
    847  * sched_pstats:
    848  *
     849  *	Update process statistics and check CPU resource allocation.
     850  *	Call the scheduler-specific hook to eventually adjust process/LWP
     851  *	priorities.
    852  */
    853 /* ARGSUSED */
    854 void
    855 sched_pstats(void *arg)
    856 {
    857 	struct rlimit *rlim;
    858 	struct lwp *l;
    859 	struct proc *p;
    860 	int minslp, sig, clkhz;
    861 	long runtm;
    862 
    863 	sched_pstats_ticks++;
    864 
    865 	mutex_enter(&proclist_mutex);
    866 	PROCLIST_FOREACH(p, &allproc) {
    867 		/*
    868 		 * Increment time in/out of memory and sleep time (if
    869 		 * sleeping).  We ignore overflow; with 16-bit int's
    870 		 * (remember them?) overflow takes 45 days.
    871 		 */
    872 		minslp = 2;
    873 		mutex_enter(&p->p_smutex);
    874 		mutex_spin_enter(&p->p_stmutex);
    875 		runtm = p->p_rtime.tv_sec;
    876 		LIST_FOREACH(l, &p->p_lwps, l_sibling) {
    877 			if ((l->l_flag & LW_IDLE) != 0)
    878 				continue;
    879 			lwp_lock(l);
    880 			runtm += l->l_rtime.tv_sec;
    881 			l->l_swtime++;
    882 			if (l->l_stat == LSSLEEP || l->l_stat == LSSTOP ||
    883 			    l->l_stat == LSSUSPENDED) {
    884 				l->l_slptime++;
    885 				minslp = min(minslp, l->l_slptime);
    886 			} else
    887 				minslp = 0;
    888 			sched_pstats_hook(l);
    889 			lwp_unlock(l);
    890 
    891 			/*
    892 			 * p_pctcpu is only for ps.
    893 			 */
    894 			l->l_pctcpu = (l->l_pctcpu * ccpu) >> FSHIFT;
    895 			if (l->l_slptime < 1) {
    896 				clkhz = stathz != 0 ? stathz : hz;
    897 #if	(FSHIFT >= CCPU_SHIFT)
    898 				l->l_pctcpu += (clkhz == 100) ?
    899 				    ((fixpt_t)l->l_cpticks) <<
    900 				        (FSHIFT - CCPU_SHIFT) :
    901 				    100 * (((fixpt_t) p->p_cpticks)
    902 				        << (FSHIFT - CCPU_SHIFT)) / clkhz;
    903 #else
    904 				l->l_pctcpu += ((FSCALE - ccpu) *
    905 				    (l->l_cpticks * FSCALE / clkhz)) >> FSHIFT;
    906 #endif
    907 				l->l_cpticks = 0;
    908 			}
    909 		}
    910 
    911 		p->p_pctcpu = (p->p_pctcpu * ccpu) >> FSHIFT;
    912 #ifdef SCHED_4BSD
    913 		/*
    914 		 * XXX: Workaround - belongs to sched_4bsd.c
    915 		 * If the process has slept the entire second,
    916 		 * stop recalculating its priority until it wakes up.
    917 		 */
    918 		if (minslp <= 1) {
    919 			extern fixpt_t decay_cpu(fixpt_t, fixpt_t);
    920 
    921 			fixpt_t loadfac = 2 * (averunnable.ldavg[0]);
    922 			p->p_estcpu = decay_cpu(loadfac, p->p_estcpu);
    923 		}
    924 #endif
    925 		mutex_spin_exit(&p->p_stmutex);
    926 
    927 		/*
    928 		 * Check if the process exceeds its CPU resource allocation.
    929 		 * If over max, kill it.
    930 		 */
    931 		rlim = &p->p_rlimit[RLIMIT_CPU];
    932 		sig = 0;
    933 		if (runtm >= rlim->rlim_cur) {
    934 			if (runtm >= rlim->rlim_max)
    935 				sig = SIGKILL;
    936 			else {
    937 				sig = SIGXCPU;
    938 				if (rlim->rlim_cur < rlim->rlim_max)
    939 					rlim->rlim_cur += 5;
    940 			}
    941 		}
    942 		mutex_exit(&p->p_smutex);
    943 		if (sig) {
    944 			psignal(p, sig);
    945 		}
    946 	}
    947 	mutex_exit(&proclist_mutex);
    948 	uvm_meter();
    949 	cv_wakeup(&lbolt);
    950 	callout_schedule(&sched_pstats_ch, hz);
    951 }
    952 
    953 void
    954 sched_init(void)
    955 {
    956 
    957 	cv_init(&lbolt, "lbolt");
    958 	callout_init(&sched_pstats_ch, 0);
    959 	callout_setfunc(&sched_pstats_ch, sched_pstats, NULL);
    960 	sched_setup();
    961 	sched_pstats(NULL);
    962 }
    963