kern_synch.c revision 1.203
      1 /*	$NetBSD: kern_synch.c,v 1.203 2007/11/04 11:43:07 rmind Exp $	*/
      2 
      3 /*-
      4  * Copyright (c) 1999, 2000, 2004, 2006, 2007 The NetBSD Foundation, Inc.
      5  * All rights reserved.
      6  *
      7  * This code is derived from software contributed to The NetBSD Foundation
      8  * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
      9  * NASA Ames Research Center, by Charles M. Hannum, Andrew Doran and
     10  * Daniel Sieger.
     11  *
     12  * Redistribution and use in source and binary forms, with or without
     13  * modification, are permitted provided that the following conditions
     14  * are met:
     15  * 1. Redistributions of source code must retain the above copyright
     16  *    notice, this list of conditions and the following disclaimer.
     17  * 2. Redistributions in binary form must reproduce the above copyright
     18  *    notice, this list of conditions and the following disclaimer in the
     19  *    documentation and/or other materials provided with the distribution.
     20  * 3. All advertising materials mentioning features or use of this software
     21  *    must display the following acknowledgement:
     22  *	This product includes software developed by the NetBSD
     23  *	Foundation, Inc. and its contributors.
     24  * 4. Neither the name of The NetBSD Foundation nor the names of its
     25  *    contributors may be used to endorse or promote products derived
     26  *    from this software without specific prior written permission.
     27  *
     28  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
     29  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
     30  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     31  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
     32  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
     33  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
     34  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
     35  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
     36  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
     37  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
     38  * POSSIBILITY OF SUCH DAMAGE.
     39  */
     40 
     41 /*-
     42  * Copyright (c) 1982, 1986, 1990, 1991, 1993
     43  *	The Regents of the University of California.  All rights reserved.
     44  * (c) UNIX System Laboratories, Inc.
     45  * All or some portions of this file are derived from material licensed
     46  * to the University of California by American Telephone and Telegraph
     47  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
     48  * the permission of UNIX System Laboratories, Inc.
     49  *
     50  * Redistribution and use in source and binary forms, with or without
     51  * modification, are permitted provided that the following conditions
     52  * are met:
     53  * 1. Redistributions of source code must retain the above copyright
     54  *    notice, this list of conditions and the following disclaimer.
     55  * 2. Redistributions in binary form must reproduce the above copyright
     56  *    notice, this list of conditions and the following disclaimer in the
     57  *    documentation and/or other materials provided with the distribution.
     58  * 3. Neither the name of the University nor the names of its contributors
     59  *    may be used to endorse or promote products derived from this software
     60  *    without specific prior written permission.
     61  *
     62  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
     63  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     64  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     65  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
     66  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
     67  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
     68  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
     69  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
     70  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
     71  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
     72  * SUCH DAMAGE.
     73  *
     74  *	@(#)kern_synch.c	8.9 (Berkeley) 5/19/95
     75  */
     76 
     77 #include <sys/cdefs.h>
     78 __KERNEL_RCSID(0, "$NetBSD: kern_synch.c,v 1.203 2007/11/04 11:43:07 rmind Exp $");
     79 
     80 #include "opt_kstack.h"
     81 #include "opt_lockdebug.h"
     82 #include "opt_multiprocessor.h"
     83 #include "opt_perfctrs.h"
     84 #include "opt_sched.h"
     85 
     86 #define	__MUTEX_PRIVATE
     87 
     88 #include <sys/param.h>
     89 #include <sys/systm.h>
     90 #include <sys/proc.h>
     91 #include <sys/kernel.h>
     92 #if defined(PERFCTRS)
     93 #include <sys/pmc.h>
     94 #endif
     95 #include <sys/cpu.h>
     96 #include <sys/resourcevar.h>
     97 #include <sys/sched.h>
     98 #include <sys/syscall_stats.h>
     99 #include <sys/sleepq.h>
    100 #include <sys/lockdebug.h>
    101 #include <sys/evcnt.h>
    102 #include <sys/intr.h>
    103 
    104 #include <uvm/uvm_extern.h>
    105 
    106 callout_t sched_pstats_ch;
    107 unsigned int sched_pstats_ticks;
    108 
    109 kcondvar_t	lbolt;			/* once a second sleep address */
    110 
    111 static void	sched_unsleep(struct lwp *);
    112 static void	sched_changepri(struct lwp *, pri_t);
    113 static void	sched_lendpri(struct lwp *, pri_t);
    114 
    115 syncobj_t sleep_syncobj = {
    116 	SOBJ_SLEEPQ_SORTED,
    117 	sleepq_unsleep,
    118 	sleepq_changepri,
    119 	sleepq_lendpri,
    120 	syncobj_noowner,
    121 };
    122 
    123 syncobj_t sched_syncobj = {
    124 	SOBJ_SLEEPQ_SORTED,
    125 	sched_unsleep,
    126 	sched_changepri,
    127 	sched_lendpri,
    128 	syncobj_noowner,
    129 };
    130 
    131 /*
    132  * During autoconfiguration or after a panic, a sleep will simply lower the
    133  * priority briefly to allow interrupts, then return.  The priority to be
    134  * used (safepri) is machine-dependent, thus this value is initialized and
    135  * maintained in the machine-dependent layers.  This priority will typically
    136  * be 0, or the lowest priority that is safe for use on the interrupt stack;
    137  * it can be made higher to block network software interrupts after panics.
    138  */
    139 int	safepri;
    140 
    141 /*
    142  * OBSOLETE INTERFACE
    143  *
    144  * General sleep call.  Suspends the current process until a wakeup is
    145  * performed on the specified identifier.  The process will then be made
    146  * runnable with the specified priority.  Sleeps at most timo/hz seconds (0
     147  * means no timeout).  If pri includes the PCATCH flag, signals are checked
    148  * before and after sleeping, else signals are not checked.  Returns 0 if
    149  * awakened, EWOULDBLOCK if the timeout expires.  If PCATCH is set and a
    150  * signal needs to be delivered, ERESTART is returned if the current system
    151  * call should be restarted if possible, and EINTR is returned if the system
     152  * call should be interrupted by the signal.
    153  *
    154  * The interlock is held until we are on a sleep queue. The interlock will
     155  * be locked before returning to the caller unless the PNORELOCK flag
    156  * is specified, in which case the interlock will always be unlocked upon
    157  * return.
    158  */
    159 int
    160 ltsleep(wchan_t ident, pri_t priority, const char *wmesg, int timo,
    161 	volatile struct simplelock *interlock)
    162 {
    163 	struct lwp *l = curlwp;
    164 	sleepq_t *sq;
    165 	int error;
    166 
    167 	if (sleepq_dontsleep(l)) {
    168 		(void)sleepq_abort(NULL, 0);
    169 		if ((priority & PNORELOCK) != 0)
    170 			simple_unlock(interlock);
    171 		return 0;
    172 	}
    173 
    174 	sq = sleeptab_lookup(&sleeptab, ident);
    175 	sleepq_enter(sq, l);
    176 	sleepq_enqueue(sq, priority & PRIMASK, ident, wmesg, &sleep_syncobj);
    177 
    178 	if (interlock != NULL) {
    179 		LOCK_ASSERT(simple_lock_held(interlock));
    180 		simple_unlock(interlock);
    181 	}
    182 
    183 	error = sleepq_block(timo, priority & PCATCH);
    184 
    185 	if (interlock != NULL && (priority & PNORELOCK) == 0)
    186 		simple_lock(interlock);
    187 
    188 	return error;
    189 }
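
         /*
          * Illustrative sketch only (not part of the original file): the
          * legacy ltsleep() pattern with a simplelock interlock.  The
          * simplelock foo_slock and the flag foo_ready are hypothetical.
          */
         #if 0
         	/* Hypothetical caller waiting for foo_ready to become true. */
         	simple_lock(&foo_slock);
         	while (!foo_ready)
         		(void)ltsleep(&foo_ready, PUSER, "foowait", 0, &foo_slock);
         	/* Without PNORELOCK the interlock is held again on return. */
         	simple_unlock(&foo_slock);
         #endif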
    190 
    191 int
    192 mtsleep(wchan_t ident, pri_t priority, const char *wmesg, int timo,
    193 	kmutex_t *mtx)
    194 {
    195 	struct lwp *l = curlwp;
    196 	sleepq_t *sq;
    197 	int error;
    198 
    199 	if (sleepq_dontsleep(l)) {
    200 		(void)sleepq_abort(mtx, (priority & PNORELOCK) != 0);
    201 		return 0;
    202 	}
    203 
    204 	sq = sleeptab_lookup(&sleeptab, ident);
    205 	sleepq_enter(sq, l);
    206 	sleepq_enqueue(sq, priority & PRIMASK, ident, wmesg, &sleep_syncobj);
    207 	mutex_exit(mtx);
    208 	error = sleepq_block(timo, priority & PCATCH);
    209 
    210 	if ((priority & PNORELOCK) == 0)
    211 		mutex_enter(mtx);
    212 
    213 	return error;
    214 }
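
         /*
          * Illustrative sketch only: the same kind of wait expressed with
          * mtsleep() and a kmutex, using PCATCH so that signals interrupt
          * the sleep, plus a five second timeout.  The softc "sc" and its
          * members sc_lock and sc_ready are hypothetical.
          */
         #if 0
         	int error;

         	mutex_enter(&sc->sc_lock);
         	while (!sc->sc_ready) {
         		error = mtsleep(&sc->sc_ready, PUSER | PCATCH, "scwait",
         		    5 * hz, &sc->sc_lock);
         		if (error != 0)
         			break;	/* EWOULDBLOCK, EINTR or ERESTART */
         	}
         	/* sc_lock is still held here since PNORELOCK was not passed. */
         	mutex_exit(&sc->sc_lock);
         #endif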
    215 
    216 /*
    217  * General sleep call for situations where a wake-up is not expected.
    218  */
    219 int
    220 kpause(const char *wmesg, bool intr, int timo, kmutex_t *mtx)
    221 {
    222 	struct lwp *l = curlwp;
    223 	sleepq_t *sq;
    224 	int error;
    225 
    226 	if (sleepq_dontsleep(l))
    227 		return sleepq_abort(NULL, 0);
    228 
    229 	if (mtx != NULL)
    230 		mutex_exit(mtx);
    231 	sq = sleeptab_lookup(&sleeptab, l);
    232 	sleepq_enter(sq, l);
    233 	sleepq_enqueue(sq, sched_kpri(l), l, wmesg, &sleep_syncobj);
    234 	error = sleepq_block(timo, intr);
    235 	if (mtx != NULL)
    236 		mutex_enter(mtx);
    237 
    238 	return error;
    239 }
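
         /*
          * Illustrative sketch only: kpause() used as an interruptible delay
          * of roughly a tenth of a second.  The usual outcome is EWOULDBLOCK
          * from the timeout; EINTR or ERESTART indicate a signal arrived
          * first.
          */
         #if 0
         	(void)kpause("pause", true, hz / 10, NULL);
         #endif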
    240 
    241 /*
    242  * OBSOLETE INTERFACE
    243  *
    244  * Make all processes sleeping on the specified identifier runnable.
    245  */
    246 void
    247 wakeup(wchan_t ident)
    248 {
    249 	sleepq_t *sq;
    250 
    251 	if (cold)
    252 		return;
    253 
    254 	sq = sleeptab_lookup(&sleeptab, ident);
    255 	sleepq_wake(sq, ident, (u_int)-1);
    256 }
    257 
    258 /*
    259  * OBSOLETE INTERFACE
    260  *
    261  * Make the highest priority process first in line on the specified
    262  * identifier runnable.
    263  */
    264 void
    265 wakeup_one(wchan_t ident)
    266 {
    267 	sleepq_t *sq;
    268 
    269 	if (cold)
    270 		return;
    271 
    272 	sq = sleeptab_lookup(&sleeptab, ident);
    273 	sleepq_wake(sq, ident, 1);
    274 }
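
         /*
          * Illustrative sketch only: the wakeup side of the hypothetical
          * mtsleep() example above.  The flag is updated under the same
          * kmutex the sleeper passes to mtsleep(), so the transition cannot
          * be missed; the wakeup itself may be issued after the mutex is
          * dropped, because any LWP that saw the old value is already on
          * the sleep queue by then.
          */
         #if 0
         	mutex_enter(&sc->sc_lock);
         	sc->sc_ready = true;
         	mutex_exit(&sc->sc_lock);
         	wakeup(&sc->sc_ready);	/* or wakeup_one() for a single waiter */
         #endif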
    275 
    276 
    277 /*
    278  * General yield call.  Puts the current process back on its run queue and
    279  * performs a voluntary context switch.  Should only be called when the
    280  * current process explicitly requests it (eg sched_yield(2)).
    281  */
    282 void
    283 yield(void)
    284 {
    285 	struct lwp *l = curlwp;
    286 
    287 	KERNEL_UNLOCK_ALL(l, &l->l_biglocks);
    288 	lwp_lock(l);
    289 	KASSERT(lwp_locked(l, &l->l_cpu->ci_schedstate.spc_lwplock));
    290 	KASSERT(l->l_stat == LSONPROC);
    291 	/* XXX Only do this for timeshared threads. */
    292 	l->l_priority = MAXPRI;
    293 	(void)mi_switch(l);
    294 	KERNEL_LOCK(l->l_biglocks, l);
    295 }
    296 
    297 /*
    298  * General preemption call.  Puts the current process back on its run queue
    299  * and performs an involuntary context switch.
    300  */
    301 void
    302 preempt(void)
    303 {
    304 	struct lwp *l = curlwp;
    305 
    306 	KERNEL_UNLOCK_ALL(l, &l->l_biglocks);
    307 	lwp_lock(l);
    308 	KASSERT(lwp_locked(l, &l->l_cpu->ci_schedstate.spc_lwplock));
    309 	KASSERT(l->l_stat == LSONPROC);
    310 	l->l_priority = l->l_usrpri;
    311 	l->l_nivcsw++;
    312 	(void)mi_switch(l);
    313 	KERNEL_LOCK(l->l_biglocks, l);
    314 }
    315 
    316 /*
    317  * Compute the amount of time during which the current lwp was running.
    318  *
    319  * - update l_rtime unless it's an idle lwp.
    320  */
    321 
    322 void
    323 updatertime(lwp_t *l, const struct timeval *tv)
    324 {
    325 	long s, u;
    326 
    327 	if ((l->l_flag & LW_IDLE) != 0)
    328 		return;
    329 
    330 	u = l->l_rtime.tv_usec + (tv->tv_usec - l->l_stime.tv_usec);
    331 	s = l->l_rtime.tv_sec + (tv->tv_sec - l->l_stime.tv_sec);
    332 	if (u < 0) {
    333 		u += 1000000;
    334 		s--;
    335 	} else if (u >= 1000000) {
    336 		u -= 1000000;
    337 		s++;
    338 	}
    339 	l->l_rtime.tv_usec = u;
    340 	l->l_rtime.tv_sec = s;
    341 }
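
         /*
          * Worked example of the microsecond carry above (illustrative
          * values): with l_rtime = { 5, 950000 }, l_stime = { 100, 400000 }
          * and tv = { 100, 480000 }, u becomes 950000 + 80000 = 1030000,
          * which is >= 1000000, so the stored result is { 6, 30000 }.
          */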
    342 
    343 /*
    344  * The machine independent parts of context switch.
    345  *
    346  * Returns 1 if another LWP was actually run.
    347  */
    348 int
    349 mi_switch(lwp_t *l)
    350 {
    351 	struct schedstate_percpu *spc;
    352 	struct lwp *newl;
    353 	int retval, oldspl;
    354 	struct cpu_info *ci;
    355 	struct timeval tv;
    356 	bool returning;
    357 
    358 	KASSERT(lwp_locked(l, NULL));
    359 	LOCKDEBUG_BARRIER(l->l_mutex, 1);
    360 
    361 #ifdef KSTACK_CHECK_MAGIC
    362 	kstack_check_magic(l);
    363 #endif
    364 
    365 	microtime(&tv);
    366 
    367 	/*
    368 	 * It's safe to read the per CPU schedstate unlocked here, as all we
     369 	 * are after is the run time and that's guaranteed to have been last
    370 	 * updated by this CPU.
    371 	 */
    372 	ci = l->l_cpu;
    373 	KDASSERT(ci == curcpu());
    374 
    375 	/*
    376 	 * Process is about to yield the CPU; clear the appropriate
    377 	 * scheduling flags.
    378 	 */
    379 	spc = &ci->ci_schedstate;
    380 	returning = false;
    381 	newl = NULL;
    382 
    383 	/*
    384 	 * If we have been asked to switch to a specific LWP, then there
    385 	 * is no need to inspect the run queues.  If a soft interrupt is
    386 	 * blocking, then return to the interrupted thread without adjusting
    387 	 * VM context or its start time: neither have been changed in order
    388 	 * to take the interrupt.
    389 	 */
    390 	if (l->l_switchto != NULL) {
    391 		if ((l->l_flag & LW_INTR) != 0) {
    392 			returning = true;
    393 			softint_block(l);
    394 			if ((l->l_flag & LW_TIMEINTR) != 0)
    395 				updatertime(l, &tv);
    396 		}
    397 		newl = l->l_switchto;
    398 		l->l_switchto = NULL;
    399 	}
    400 
    401 	/* Count time spent in current system call */
    402 	if (!returning) {
    403 		SYSCALL_TIME_SLEEP(l);
    404 
    405 		/*
    406 		 * XXXSMP If we are using h/w performance counters,
    407 		 * save context.
    408 		 */
    409 #if PERFCTRS
    410 		if (PMC_ENABLED(l->l_proc)) {
    411 			pmc_save_context(l->l_proc);
    412 		}
    413 #endif
    414 		updatertime(l, &tv);
    415 	}
    416 
    417 	/*
    418 	 * If on the CPU and we have gotten this far, then we must yield.
    419 	 */
    420 	mutex_spin_enter(spc->spc_mutex);
    421 	KASSERT(l->l_stat != LSRUN);
    422 	if (l->l_stat == LSONPROC) {
    423 		KASSERT(lwp_locked(l, &spc->spc_lwplock));
    424 		if ((l->l_flag & LW_IDLE) == 0) {
    425 			l->l_stat = LSRUN;
    426 			lwp_setlock(l, spc->spc_mutex);
    427 			sched_enqueue(l, true);
    428 		} else
    429 			l->l_stat = LSIDL;
    430 	}
    431 
    432 	/*
     433 	 * Let sched_nextlwp() select the LWP to run on the CPU next.
     434 	 * If no LWP is runnable, switch to the idle LWP.
     435 	 * Note that spc_lwplock might not necessarily be held.
    436 	 */
    437 	if (newl == NULL) {
    438 		newl = sched_nextlwp();
    439 		if (newl != NULL) {
    440 			sched_dequeue(newl);
    441 			KASSERT(lwp_locked(newl, spc->spc_mutex));
    442 			newl->l_stat = LSONPROC;
    443 			newl->l_cpu = ci;
    444 			newl->l_flag |= LW_RUNNING;
    445 			lwp_setlock(newl, &spc->spc_lwplock);
    446 		} else {
    447 			newl = ci->ci_data.cpu_idlelwp;
    448 			newl->l_stat = LSONPROC;
    449 			newl->l_flag |= LW_RUNNING;
    450 		}
    451 		ci->ci_want_resched = 0;
    452 		spc->spc_flags &= ~SPCF_SWITCHCLEAR;
    453 	}
    454 
    455 	/* Update the new LWP's start time while it is still locked. */
    456 	if (!returning) {
    457 		newl->l_stime = tv;
    458 		/*
    459 		 * XXX The following may be done unlocked if newl != NULL
    460 		 * above.
    461 		 */
    462 		newl->l_priority = newl->l_usrpri;
    463 	}
    464 
    465 	spc->spc_curpriority = newl->l_usrpri;
    466 
    467 	if (l != newl) {
    468 		struct lwp *prevlwp;
    469 
    470 		/*
    471 		 * If the old LWP has been moved to a run queue above,
    472 		 * drop the general purpose LWP lock: it's now locked
    473 		 * by the scheduler lock.
    474 		 *
    475 		 * Otherwise, drop the scheduler lock.  We're done with
    476 		 * the run queues for now.
    477 		 */
    478 		if (l->l_mutex == spc->spc_mutex) {
    479 			mutex_spin_exit(&spc->spc_lwplock);
    480 		} else {
    481 			mutex_spin_exit(spc->spc_mutex);
    482 		}
    483 
    484 		/* Unlocked, but for statistics only. */
    485 		uvmexp.swtch++;
    486 
    487 		/*
    488 		 * Save old VM context, unless a soft interrupt
    489 		 * handler is blocking.
    490 		 */
    491 		if (!returning)
    492 			pmap_deactivate(l);
    493 
    494 		/* Switch to the new LWP.. */
    495 		l->l_ncsw++;
    496 		l->l_flag &= ~LW_RUNNING;
    497 		oldspl = MUTEX_SPIN_OLDSPL(ci);
    498 		prevlwp = cpu_switchto(l, newl);
    499 
    500 		/*
    501 		 * .. we have switched away and are now back so we must
    502 		 * be the new curlwp.  prevlwp is who we replaced.
    503 		 */
    504 		if (prevlwp != NULL) {
    505 			curcpu()->ci_mtx_oldspl = oldspl;
    506 			lwp_unlock(prevlwp);
    507 		} else {
    508 			splx(oldspl);
    509 		}
    510 
    511 		/* Restore VM context. */
    512 		pmap_activate(l);
    513 		retval = 1;
    514 	} else {
    515 		/* Nothing to do - just unlock and return. */
    516 		mutex_spin_exit(spc->spc_mutex);
    517 		lwp_unlock(l);
    518 		retval = 0;
    519 	}
    520 
    521 	KASSERT(l == curlwp);
    522 	KASSERT(l->l_stat == LSONPROC);
    523 	KASSERT(l->l_cpu == curcpu());
    524 
    525 	/*
    526 	 * XXXSMP If we are using h/w performance counters, restore context.
    527 	 */
    528 #if PERFCTRS
    529 	if (PMC_ENABLED(l->l_proc)) {
    530 		pmc_restore_context(l->l_proc);
    531 	}
    532 #endif
    533 
    534 	/*
    535 	 * We're running again; record our new start time.  We might
    536 	 * be running on a new CPU now, so don't use the cached
    537 	 * schedstate_percpu pointer.
    538 	 */
    539 	SYSCALL_TIME_WAKEUP(l);
    540 	KASSERT(curlwp == l);
    541 	KDASSERT(l->l_cpu == curcpu());
    542 	LOCKDEBUG_BARRIER(NULL, 1);
    543 
    544 	return retval;
    545 }
    546 
    547 /*
    548  * Change process state to be runnable, placing it on the run queue if it is
    549  * in memory, and awakening the swapper if it isn't in memory.
    550  *
    551  * Call with the process and LWP locked.  Will return with the LWP unlocked.
    552  */
    553 void
    554 setrunnable(struct lwp *l)
    555 {
    556 	struct proc *p = l->l_proc;
    557 	struct cpu_info *ci;
    558 	sigset_t *ss;
    559 
    560 	KASSERT((l->l_flag & LW_IDLE) == 0);
    561 	KASSERT(mutex_owned(&p->p_smutex));
    562 	KASSERT(lwp_locked(l, NULL));
    563 	KASSERT(l->l_mutex != l->l_cpu->ci_schedstate.spc_mutex);
    564 
    565 	switch (l->l_stat) {
    566 	case LSSTOP:
    567 		/*
    568 		 * If we're being traced (possibly because someone attached us
    569 		 * while we were stopped), check for a signal from the debugger.
    570 		 */
    571 		if ((p->p_slflag & PSL_TRACED) != 0 && p->p_xstat != 0) {
    572 			if ((sigprop[p->p_xstat] & SA_TOLWP) != 0)
    573 				ss = &l->l_sigpend.sp_set;
    574 			else
    575 				ss = &p->p_sigpend.sp_set;
    576 			sigaddset(ss, p->p_xstat);
    577 			signotify(l);
    578 		}
    579 		p->p_nrlwps++;
    580 		break;
    581 	case LSSUSPENDED:
    582 		l->l_flag &= ~LW_WSUSPEND;
    583 		p->p_nrlwps++;
    584 		cv_broadcast(&p->p_lwpcv);
    585 		break;
    586 	case LSSLEEP:
    587 		KASSERT(l->l_wchan != NULL);
    588 		break;
    589 	default:
    590 		panic("setrunnable: lwp %p state was %d", l, l->l_stat);
    591 	}
    592 
    593 	/*
     594 	 * If the LWP was sleeping interruptibly, then it's OK to start it
    595 	 * again.  If not, mark it as still sleeping.
    596 	 */
    597 	if (l->l_wchan != NULL) {
    598 		l->l_stat = LSSLEEP;
    599 		/* lwp_unsleep() will release the lock. */
    600 		lwp_unsleep(l);
    601 		return;
    602 	}
    603 
    604 	/*
    605 	 * If the LWP is still on the CPU, mark it as LSONPROC.  It may be
    606 	 * about to call mi_switch(), in which case it will yield.
    607 	 */
    608 	if ((l->l_flag & LW_RUNNING) != 0) {
    609 		l->l_stat = LSONPROC;
    610 		l->l_slptime = 0;
    611 		lwp_unlock(l);
    612 		return;
    613 	}
    614 
    615 	/*
    616 	 * Look for a CPU to run.
    617 	 * Set the LWP runnable.
    618 	 */
    619 	ci = sched_takecpu(l);
    621 	spc_lock(ci);
    622 	l->l_cpu = ci;
    623 	lwp_unlock_to(l, ci->ci_schedstate.spc_mutex);
    624 
    625 	sched_setrunnable(l);
    626 	l->l_stat = LSRUN;
    627 	l->l_slptime = 0;
    628 
    629 	/*
     630 	 * If the thread is swapped out, wake the swapper to bring it back in.
    631 	 * Otherwise, enter it into a run queue.
    632 	 */
    633 	if (l->l_flag & LW_INMEM) {
    634 		sched_enqueue(l, false);
    635 		resched_cpu(l);
    636 		lwp_unlock(l);
    637 	} else {
    638 		lwp_unlock(l);
    639 		uvm_kick_scheduler();
    640 	}
    641 }
    642 
    643 /*
    644  * suspendsched:
    645  *
     646  *	Convert all LSSLEEP or LSRUN LWPs of non-PK_SYSTEM processes to LSSUSPENDED.
    647  */
    648 void
    649 suspendsched(void)
    650 {
    651 	CPU_INFO_ITERATOR cii;
    652 	struct cpu_info *ci;
    653 	struct lwp *l;
    654 	struct proc *p;
    655 
    656 	/*
    657 	 * We do this by process in order not to violate the locking rules.
    658 	 */
    659 	mutex_enter(&proclist_mutex);
    660 	PROCLIST_FOREACH(p, &allproc) {
    661 		mutex_enter(&p->p_smutex);
    662 
    663 		if ((p->p_flag & PK_SYSTEM) != 0) {
    664 			mutex_exit(&p->p_smutex);
    665 			continue;
    666 		}
    667 
    668 		p->p_stat = SSTOP;
    669 
    670 		LIST_FOREACH(l, &p->p_lwps, l_sibling) {
    671 			if (l == curlwp)
    672 				continue;
    673 
    674 			lwp_lock(l);
    675 
    676 			/*
     677 			 * Set LW_WREBOOT so that the LWP will suspend itself
     678 			 * when it tries to return to user mode.  We want to
     679 			 * get as many LWPs as possible to
    680 			 * the user / kernel boundary, so that they will
    681 			 * release any locks that they hold.
    682 			 */
    683 			l->l_flag |= (LW_WREBOOT | LW_WSUSPEND);
    684 
    685 			if (l->l_stat == LSSLEEP &&
    686 			    (l->l_flag & LW_SINTR) != 0) {
    687 				/* setrunnable() will release the lock. */
    688 				setrunnable(l);
    689 				continue;
    690 			}
    691 
    692 			lwp_unlock(l);
    693 		}
    694 
    695 		mutex_exit(&p->p_smutex);
    696 	}
    697 	mutex_exit(&proclist_mutex);
    698 
    699 	/*
    700 	 * Kick all CPUs to make them preempt any LWPs running in user mode.
    701 	 * They'll trap into the kernel and suspend themselves in userret().
    702 	 */
    703 	for (CPU_INFO_FOREACH(cii, ci))
    704 		cpu_need_resched(ci, 0);
    705 }
    706 
    707 /*
    708  * sched_kpri:
    709  *
    710  *	Scale a priority level to a kernel priority level, usually
    711  *	for an LWP that is about to sleep.
    712  */
    713 pri_t
    714 sched_kpri(struct lwp *l)
    715 {
    716 	/*
    717 	 * Scale user priorities (127 -> 50) up to kernel priorities
    718 	 * in the range (49 -> 8).  Reserve the top 8 kernel priorities
    719 	 * for high priority kthreads.  Kernel priorities passed in
    720 	 * are left "as is".  XXX This is somewhat arbitrary.
    721 	 */
    722 	static const uint8_t kpri_tab[] = {
    723 		 0,   1,   2,   3,   4,   5,   6,   7,
    724 		 8,   9,  10,  11,  12,  13,  14,  15,
    725 		16,  17,  18,  19,  20,  21,  22,  23,
    726 		24,  25,  26,  27,  28,  29,  30,  31,
    727 		32,  33,  34,  35,  36,  37,  38,  39,
    728 		40,  41,  42,  43,  44,  45,  46,  47,
    729 		48,  49,   8,   8,   9,   9,  10,  10,
    730 		11,  11,  12,  12,  13,  14,  14,  15,
    731 		15,  16,  16,  17,  17,  18,  18,  19,
    732 		20,  20,  21,  21,  22,  22,  23,  23,
    733 		24,  24,  25,  26,  26,  27,  27,  28,
    734 		28,  29,  29,  30,  30,  31,  32,  32,
    735 		33,  33,  34,  34,  35,  35,  36,  36,
    736 		37,  38,  38,  39,  39,  40,  40,  41,
    737 		41,  42,  42,  43,  44,  44,  45,  45,
    738 		46,  46,  47,  47,  48,  48,  49,  49,
    739 	};
    740 
    741 	return (pri_t)kpri_tab[l->l_usrpri];
    742 }
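
         /*
          * Worked examples of the table above: kpri_tab[10] == 10 (kernel
          * priorities 0..49 pass through unchanged), kpri_tab[50] == 8 (the
          * strongest user priority maps to the level just below the eight
          * priorities 0..7 reserved for high priority kthreads), and
          * kpri_tab[127] == 49 (the weakest user priority maps to the bottom
          * of the scaled kernel range).
          */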
    743 
    744 /*
    745  * sched_unsleep:
    746  *
     747  *	This is called when the LWP has not been awoken normally but instead
    748  *	interrupted: for example, if the sleep timed out.  Because of this,
    749  *	it's not a valid action for running or idle LWPs.
    750  */
    751 static void
    752 sched_unsleep(struct lwp *l)
    753 {
    754 
    755 	lwp_unlock(l);
    756 	panic("sched_unsleep");
    757 }
    758 
    759 inline void
    760 resched_cpu(struct lwp *l)
    761 {
    762 	struct cpu_info *ci;
    763 	const pri_t pri = lwp_eprio(l);
    764 
    765 	/*
    766 	 * XXXSMP
    767 	 * Since l->l_cpu persists across a context switch,
    768 	 * this gives us *very weak* processor affinity, in
    769 	 * that we notify the CPU on which the process last
    770 	 * ran that it should try to switch.
    771 	 *
    772 	 * This does not guarantee that the process will run on
    773 	 * that processor next, because another processor might
    774 	 * grab it the next time it performs a context switch.
    775 	 *
    776 	 * This also does not handle the case where its last
    777 	 * CPU is running a higher-priority process, but every
    778 	 * other CPU is running a lower-priority process.  There
    779 	 * are ways to handle this situation, but they're not
    780 	 * currently very pretty, and we also need to weigh the
    781 	 * cost of moving a process from one CPU to another.
    782 	 */
    783 	ci = (l->l_cpu != NULL) ? l->l_cpu : curcpu();
    784 	if (pri < ci->ci_schedstate.spc_curpriority)
    785 		cpu_need_resched(ci, 0);
    786 }
    787 
    788 static void
    789 sched_changepri(struct lwp *l, pri_t pri)
    790 {
    791 
    792 	KASSERT(lwp_locked(l, NULL));
    793 
    794 	l->l_usrpri = pri;
    795 	if (l->l_priority < PUSER)
    796 		return;
    797 
    798 	if (l->l_stat != LSRUN || (l->l_flag & LW_INMEM) == 0) {
    799 		l->l_priority = pri;
    800 		return;
    801 	}
    802 
    803 	KASSERT(lwp_locked(l, l->l_cpu->ci_schedstate.spc_mutex));
    804 
    805 	sched_dequeue(l);
    806 	l->l_priority = pri;
    807 	sched_enqueue(l, false);
    808 	resched_cpu(l);
    809 }
    810 
    811 static void
    812 sched_lendpri(struct lwp *l, pri_t pri)
    813 {
    814 
    815 	KASSERT(lwp_locked(l, NULL));
    816 
    817 	if (l->l_stat != LSRUN || (l->l_flag & LW_INMEM) == 0) {
    818 		l->l_inheritedprio = pri;
    819 		return;
    820 	}
    821 
    822 	KASSERT(lwp_locked(l, l->l_cpu->ci_schedstate.spc_mutex));
    823 
    824 	sched_dequeue(l);
    825 	l->l_inheritedprio = pri;
    826 	sched_enqueue(l, false);
    827 	resched_cpu(l);
    828 }
    829 
    830 struct lwp *
    831 syncobj_noowner(wchan_t wchan)
    832 {
    833 
    834 	return NULL;
    835 }
    836 
    837 
    838 /* decay 95% of `p_pctcpu' in 60 seconds; see CCPU_SHIFT before changing */
    839 fixpt_t	ccpu = 0.95122942450071400909 * FSCALE;		/* exp(-1/20) */
    840 
    841 /*
    842  * If `ccpu' is not equal to `exp(-1/20)' and you still want to use the
    843  * faster/more-accurate formula, you'll have to estimate CCPU_SHIFT below
    844  * and possibly adjust FSHIFT in "param.h" so that (FSHIFT >= CCPU_SHIFT).
    845  *
    846  * To estimate CCPU_SHIFT for exp(-1/20), the following formula was used:
    847  *	1 - exp(-1/20) ~= 0.0487 ~= 0.0488 == 1 (fixed pt, *11* bits).
    848  *
     849  * If you don't want to bother with the faster/more-accurate formula, you
    850  * can set CCPU_SHIFT to (FSHIFT + 1) which will use a slower/less-accurate
    851  * (more general) method of calculating the %age of CPU used by a process.
    852  */
    853 #define	CCPU_SHIFT	(FSHIFT + 1)
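
         /*
          * Worked example (illustrative, assuming FSHIFT == 11 and therefore
          * FSCALE == 2048): ccpu == 0.95122... * 2048 ~= 1948.  sched_pstats()
          * runs once per second, so over 60 seconds an otherwise idle LWP's
          * l_pctcpu is scaled by (1948/2048)^60 ~= exp(-3) ~= 0.05, i.e.
          * roughly 95% of the old value decays away, as stated above.
          */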
    854 
    855 /*
    856  * sched_pstats:
    857  *
    858  * Update process statistics and check CPU resource allocation.
    859  * Call scheduler-specific hook to eventually adjust process/LWP
    860  * priorities.
    861  */
    862 /* ARGSUSED */
    863 void
    864 sched_pstats(void *arg)
    865 {
    866 	struct rlimit *rlim;
    867 	struct lwp *l;
    868 	struct proc *p;
    869 	int minslp, sig, clkhz;
    870 	long runtm;
    871 
    872 	sched_pstats_ticks++;
    873 
    874 	mutex_enter(&proclist_mutex);
    875 	PROCLIST_FOREACH(p, &allproc) {
    876 		/*
    877 		 * Increment time in/out of memory and sleep time (if
    878 		 * sleeping).  We ignore overflow; with 16-bit int's
    879 		 * (remember them?) overflow takes 45 days.
    880 		 */
    881 		minslp = 2;
    882 		mutex_enter(&p->p_smutex);
    883 		mutex_spin_enter(&p->p_stmutex);
    884 		runtm = p->p_rtime.tv_sec;
    885 		LIST_FOREACH(l, &p->p_lwps, l_sibling) {
    886 			if ((l->l_flag & LW_IDLE) != 0)
    887 				continue;
    888 			lwp_lock(l);
    889 			runtm += l->l_rtime.tv_sec;
    890 			l->l_swtime++;
    891 			if (l->l_stat == LSSLEEP || l->l_stat == LSSTOP ||
    892 			    l->l_stat == LSSUSPENDED) {
    893 				l->l_slptime++;
    894 				minslp = min(minslp, l->l_slptime);
    895 			} else
    896 				minslp = 0;
    897 			sched_pstats_hook(l);
    898 			lwp_unlock(l);
    899 
    900 			/*
    901 			 * p_pctcpu is only for ps.
    902 			 */
    903 			l->l_pctcpu = (l->l_pctcpu * ccpu) >> FSHIFT;
    904 			if (l->l_slptime < 1) {
    905 				clkhz = stathz != 0 ? stathz : hz;
    906 #if	(FSHIFT >= CCPU_SHIFT)
    907 				l->l_pctcpu += (clkhz == 100) ?
    908 				    ((fixpt_t)l->l_cpticks) <<
    909 				        (FSHIFT - CCPU_SHIFT) :
     910 				    100 * (((fixpt_t) l->l_cpticks)
    911 				        << (FSHIFT - CCPU_SHIFT)) / clkhz;
    912 #else
    913 				l->l_pctcpu += ((FSCALE - ccpu) *
    914 				    (l->l_cpticks * FSCALE / clkhz)) >> FSHIFT;
    915 #endif
    916 				l->l_cpticks = 0;
    917 			}
    918 		}
    919 
    920 		p->p_pctcpu = (p->p_pctcpu * ccpu) >> FSHIFT;
    921 #ifdef SCHED_4BSD
    922 		/*
    923 		 * XXX: Workaround - belongs to sched_4bsd.c
    924 		 * If the process has slept the entire second,
    925 		 * stop recalculating its priority until it wakes up.
    926 		 */
    927 		if (minslp <= 1) {
    928 			extern fixpt_t decay_cpu(fixpt_t, fixpt_t);
    929 
    930 			fixpt_t loadfac = 2 * (averunnable.ldavg[0]);
    931 			p->p_estcpu = decay_cpu(loadfac, p->p_estcpu);
    932 		}
    933 #endif
    934 		mutex_spin_exit(&p->p_stmutex);
    935 
    936 		/*
    937 		 * Check if the process exceeds its CPU resource allocation.
    938 		 * If over max, kill it.
    939 		 */
    940 		rlim = &p->p_rlimit[RLIMIT_CPU];
    941 		sig = 0;
    942 		if (runtm >= rlim->rlim_cur) {
    943 			if (runtm >= rlim->rlim_max)
    944 				sig = SIGKILL;
    945 			else {
    946 				sig = SIGXCPU;
    947 				if (rlim->rlim_cur < rlim->rlim_max)
    948 					rlim->rlim_cur += 5;
    949 			}
    950 		}
    951 		mutex_exit(&p->p_smutex);
    952 		if (sig) {
    953 			psignal(p, sig);
    954 		}
    955 	}
    956 	mutex_exit(&proclist_mutex);
    957 	uvm_meter();
    958 	cv_wakeup(&lbolt);
    959 	callout_schedule(&sched_pstats_ch, hz);
    960 }
    961 
    962 void
    963 sched_init(void)
    964 {
    965 
    966 	cv_init(&lbolt, "lbolt");
    967 	callout_init(&sched_pstats_ch, 0);
    968 	callout_setfunc(&sched_pstats_ch, sched_pstats, NULL);
    969 	sched_setup();
    970 	sched_pstats(NULL);
    971 }
    972