      1 /*	$NetBSD: kern_synch.c,v 1.201 2007/10/13 00:30:26 rmind Exp $	*/
      2 
      3 /*-
      4  * Copyright (c) 1999, 2000, 2004, 2006, 2007 The NetBSD Foundation, Inc.
      5  * All rights reserved.
      6  *
      7  * This code is derived from software contributed to The NetBSD Foundation
      8  * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
      9  * NASA Ames Research Center, by Charles M. Hannum, Andrew Doran and
     10  * Daniel Sieger.
     11  *
     12  * Redistribution and use in source and binary forms, with or without
     13  * modification, are permitted provided that the following conditions
     14  * are met:
     15  * 1. Redistributions of source code must retain the above copyright
     16  *    notice, this list of conditions and the following disclaimer.
     17  * 2. Redistributions in binary form must reproduce the above copyright
     18  *    notice, this list of conditions and the following disclaimer in the
     19  *    documentation and/or other materials provided with the distribution.
     20  * 3. All advertising materials mentioning features or use of this software
     21  *    must display the following acknowledgement:
     22  *	This product includes software developed by the NetBSD
     23  *	Foundation, Inc. and its contributors.
     24  * 4. Neither the name of The NetBSD Foundation nor the names of its
     25  *    contributors may be used to endorse or promote products derived
     26  *    from this software without specific prior written permission.
     27  *
     28  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
     29  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
     30  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     31  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
     32  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
     33  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
     34  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
     35  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
     36  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
     37  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
     38  * POSSIBILITY OF SUCH DAMAGE.
     39  */
     40 
     41 /*-
     42  * Copyright (c) 1982, 1986, 1990, 1991, 1993
     43  *	The Regents of the University of California.  All rights reserved.
     44  * (c) UNIX System Laboratories, Inc.
     45  * All or some portions of this file are derived from material licensed
     46  * to the University of California by American Telephone and Telegraph
     47  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
     48  * the permission of UNIX System Laboratories, Inc.
     49  *
     50  * Redistribution and use in source and binary forms, with or without
     51  * modification, are permitted provided that the following conditions
     52  * are met:
     53  * 1. Redistributions of source code must retain the above copyright
     54  *    notice, this list of conditions and the following disclaimer.
     55  * 2. Redistributions in binary form must reproduce the above copyright
     56  *    notice, this list of conditions and the following disclaimer in the
     57  *    documentation and/or other materials provided with the distribution.
     58  * 3. Neither the name of the University nor the names of its contributors
     59  *    may be used to endorse or promote products derived from this software
     60  *    without specific prior written permission.
     61  *
     62  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
     63  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     64  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     65  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
     66  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
     67  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
     68  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
     69  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
     70  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
     71  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
     72  * SUCH DAMAGE.
     73  *
     74  *	@(#)kern_synch.c	8.9 (Berkeley) 5/19/95
     75  */
     76 
     77 #include <sys/cdefs.h>
     78 __KERNEL_RCSID(0, "$NetBSD: kern_synch.c,v 1.201 2007/10/13 00:30:26 rmind Exp $");
     79 
     80 #include "opt_kstack.h"
     81 #include "opt_lockdebug.h"
     82 #include "opt_multiprocessor.h"
     83 #include "opt_perfctrs.h"
     84 
     85 #define	__MUTEX_PRIVATE
     86 
     87 #include <sys/param.h>
     88 #include <sys/systm.h>
     89 #include <sys/proc.h>
     90 #include <sys/kernel.h>
     91 #if defined(PERFCTRS)
     92 #include <sys/pmc.h>
     93 #endif
     94 #include <sys/cpu.h>
     95 #include <sys/resourcevar.h>
     96 #include <sys/sched.h>
     97 #include <sys/syscall_stats.h>
     98 #include <sys/sleepq.h>
     99 #include <sys/lockdebug.h>
    100 #include <sys/evcnt.h>
    101 #include <sys/intr.h>
    102 
    103 #include <uvm/uvm_extern.h>
    104 
    105 callout_t sched_pstats_ch;
    106 unsigned int sched_pstats_ticks;
    107 
    108 kcondvar_t	lbolt;			/* once a second sleep address */
    109 
    110 static void	sched_unsleep(struct lwp *);
    111 static void	sched_changepri(struct lwp *, pri_t);
    112 static void	sched_lendpri(struct lwp *, pri_t);
    113 
    114 syncobj_t sleep_syncobj = {
    115 	SOBJ_SLEEPQ_SORTED,
    116 	sleepq_unsleep,
    117 	sleepq_changepri,
    118 	sleepq_lendpri,
    119 	syncobj_noowner,
    120 };
    121 
    122 syncobj_t sched_syncobj = {
    123 	SOBJ_SLEEPQ_SORTED,
    124 	sched_unsleep,
    125 	sched_changepri,
    126 	sched_lendpri,
    127 	syncobj_noowner,
    128 };
    129 
    130 /*
    131  * During autoconfiguration or after a panic, a sleep will simply lower the
    132  * priority briefly to allow interrupts, then return.  The priority to be
    133  * used (safepri) is machine-dependent, thus this value is initialized and
    134  * maintained in the machine-dependent layers.  This priority will typically
    135  * be 0, or the lowest priority that is safe for use on the interrupt stack;
    136  * it can be made higher to block network software interrupts after panics.
    137  */
    138 int	safepri;
    139 
    140 /*
    141  * OBSOLETE INTERFACE
    142  *
    143  * General sleep call.  Suspends the current process until a wakeup is
    144  * performed on the specified identifier.  The process will then be made
    145  * runnable with the specified priority.  Sleeps at most timo/hz seconds (0
    146  * means no timeout).  If pri includes the PCATCH flag, signals are checked
    147  * before and after sleeping, otherwise signals are not checked.  Returns 0
    148  * if awakened, or EWOULDBLOCK if the timeout expires.  If PCATCH is set
    149  * and a signal needs to be delivered, ERESTART is returned if the current
    150  * system call should be restarted if possible, and EINTR is returned if
    151  * the system call should instead be interrupted by the signal.
    152  *
    153  * The interlock is held until we are on a sleep queue.  It will be
    154  * re-locked before returning to the caller unless the PNORELOCK flag is
    155  * specified, in which case the interlock is always left unlocked upon
    156  * return.  An illustrative usage sketch follows the function body.
    157  */
    158 int
    159 ltsleep(wchan_t ident, pri_t priority, const char *wmesg, int timo,
    160 	volatile struct simplelock *interlock)
    161 {
    162 	struct lwp *l = curlwp;
    163 	sleepq_t *sq;
    164 	int error;
    165 
    166 	if (sleepq_dontsleep(l)) {
    167 		(void)sleepq_abort(NULL, 0);
    168 		if ((priority & PNORELOCK) != 0)
    169 			simple_unlock(interlock);
    170 		return 0;
    171 	}
    172 
    173 	sq = sleeptab_lookup(&sleeptab, ident);
    174 	sleepq_enter(sq, l);
    175 	sleepq_enqueue(sq, priority & PRIMASK, ident, wmesg, &sleep_syncobj);
    176 
    177 	if (interlock != NULL) {
    178 		LOCK_ASSERT(simple_lock_held(interlock));
    179 		simple_unlock(interlock);
    180 	}
    181 
    182 	error = sleepq_block(timo, priority & PCATCH);
    183 
    184 	if (interlock != NULL && (priority & PNORELOCK) == 0)
    185 		simple_lock(interlock);
    186 
    187 	return error;
    188 }
    189 
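        /*
         * Illustrative sketch, not part of the original file: a typical use
         * of the obsolete ltsleep()/wakeup() pair with a simplelock-protected
         * flag.  The names "example_lock" and "example_ready" and the wait
         * message "exwait" are hypothetical.
         *
         *	static struct simplelock example_lock = SIMPLELOCK_INITIALIZER;
         *	static int example_ready;
         *
         *	Waiter, sleeping at PWAIT and catching signals; the interlock
         *	is dropped while asleep and re-taken before ltsleep() returns:
         *
         *		simple_lock(&example_lock);
         *		while (example_ready == 0) {
         *			if (ltsleep(&example_ready, PWAIT | PCATCH,
         *			    "exwait", 0, &example_lock) != 0)
         *				break;
         *		}
         *		simple_unlock(&example_lock);
         *
         *	Waker, making all sleepers on &example_ready runnable:
         *
         *		simple_lock(&example_lock);
         *		example_ready = 1;
         *		simple_unlock(&example_lock);
         *		wakeup(&example_ready);
         */
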
    190 int
    191 mtsleep(wchan_t ident, pri_t priority, const char *wmesg, int timo,
    192 	kmutex_t *mtx)
    193 {
    194 	struct lwp *l = curlwp;
    195 	sleepq_t *sq;
    196 	int error;
    197 
    198 	if (sleepq_dontsleep(l)) {
    199 		(void)sleepq_abort(mtx, (priority & PNORELOCK) != 0);
    200 		return 0;
    201 	}
    202 
    203 	sq = sleeptab_lookup(&sleeptab, ident);
    204 	sleepq_enter(sq, l);
    205 	sleepq_enqueue(sq, priority & PRIMASK, ident, wmesg, &sleep_syncobj);
    206 	mutex_exit(mtx);
    207 	error = sleepq_block(timo, priority & PCATCH);
    208 
    209 	if ((priority & PNORELOCK) == 0)
    210 		mutex_enter(mtx);
    211 
    212 	return error;
    213 }
    214 
    215 /*
    216  * General sleep call for situations where a wake-up is not expected.
    217  */
    218 int
    219 kpause(const char *wmesg, bool intr, int timo, kmutex_t *mtx)
    220 {
    221 	struct lwp *l = curlwp;
    222 	sleepq_t *sq;
    223 	int error;
    224 
    225 	if (sleepq_dontsleep(l))
    226 		return sleepq_abort(NULL, 0);
    227 
    228 	if (mtx != NULL)
    229 		mutex_exit(mtx);
    230 	sq = sleeptab_lookup(&sleeptab, l);
    231 	sleepq_enter(sq, l);
    232 	sleepq_enqueue(sq, sched_kpri(l), l, wmesg, &sleep_syncobj);
    233 	error = sleepq_block(timo, intr);
    234 	if (mtx != NULL)
    235 		mutex_enter(mtx);
    236 
    237 	return error;
    238 }
    239 
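        /*
         * Illustrative sketch, not part of the original file: kpause() is
         * the usual way to sleep for a fixed interval when no wakeup is
         * expected.  The wait message and intervals are arbitrary, and
         * "example_mtx" is a hypothetical kmutex held by the caller.
         *
         *	Sleep for roughly one second, not interruptible by signals,
         *	with no mutex to drop:
         *
         *		(void)kpause("expause", false, hz, NULL);
         *
         *	Sleep for about a tenth of a second, interruptible, dropping
         *	example_mtx across the sleep and re-taking it before return:
         *
         *		int error = kpause("expause", true, hz / 10, &example_mtx);
         */
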
    240 /*
    241  * OBSOLETE INTERFACE
    242  *
    243  * Make all processes sleeping on the specified identifier runnable.
    244  */
    245 void
    246 wakeup(wchan_t ident)
    247 {
    248 	sleepq_t *sq;
    249 
    250 	if (cold)
    251 		return;
    252 
    253 	sq = sleeptab_lookup(&sleeptab, ident);
    254 	sleepq_wake(sq, ident, (u_int)-1);
    255 }
    256 
    257 /*
    258  * OBSOLETE INTERFACE
    259  *
    260  * Make the highest priority process first in line on the specified
    261  * identifier runnable.
    262  */
    263 void
    264 wakeup_one(wchan_t ident)
    265 {
    266 	sleepq_t *sq;
    267 
    268 	if (cold)
    269 		return;
    270 
    271 	sq = sleeptab_lookup(&sleeptab, ident);
    272 	sleepq_wake(sq, ident, 1);
    273 }
    274 
    275 
    276 /*
    277  * General yield call.  Puts the current process back on its run queue and
    278  * performs a voluntary context switch.  Should only be called when the
    279  * current process explicitly requests it (e.g. sched_yield(2)).
    280  */
    281 void
    282 yield(void)
    283 {
    284 	struct lwp *l = curlwp;
    285 
    286 	KERNEL_UNLOCK_ALL(l, &l->l_biglocks);
    287 	lwp_lock(l);
    288 	KASSERT(lwp_locked(l, &l->l_cpu->ci_schedstate.spc_lwplock));
    289 	KASSERT(l->l_stat == LSONPROC);
    290 	/* XXX Only do this for timeshared threads. */
    291 	l->l_priority = MAXPRI;
    292 	(void)mi_switch(l);
    293 	KERNEL_LOCK(l->l_biglocks, l);
    294 }
    295 
    296 /*
    297  * General preemption call.  Puts the current process back on its run queue
    298  * and performs an involuntary context switch.
    299  */
    300 void
    301 preempt(void)
    302 {
    303 	struct lwp *l = curlwp;
    304 
    305 	KERNEL_UNLOCK_ALL(l, &l->l_biglocks);
    306 	lwp_lock(l);
    307 	KASSERT(lwp_locked(l, &l->l_cpu->ci_schedstate.spc_lwplock));
    308 	KASSERT(l->l_stat == LSONPROC);
    309 	l->l_priority = l->l_usrpri;
    310 	l->l_nivcsw++;
    311 	(void)mi_switch(l);
    312 	KERNEL_LOCK(l->l_biglocks, l);
    313 }
    314 
    315 /*
    316  * Compute the amount of time during which the current lwp was running.
    317  *
    318  * - update l_rtime unless it's an idle lwp.
    319  */
    320 
    321 void
    322 updatertime(lwp_t *l, const struct timeval *tv)
    323 {
    324 	long s, u;
    325 
    326 	if ((l->l_flag & LW_IDLE) != 0)
    327 		return;
    328 
    329 	u = l->l_rtime.tv_usec + (tv->tv_usec - l->l_stime.tv_usec);
    330 	s = l->l_rtime.tv_sec + (tv->tv_sec - l->l_stime.tv_sec);
    331 	if (u < 0) {
    332 		u += 1000000;
    333 		s--;
    334 	} else if (u >= 1000000) {
    335 		u -= 1000000;
    336 		s++;
    337 	}
    338 	l->l_rtime.tv_usec = u;
    339 	l->l_rtime.tv_sec = s;
    340 }
    341 
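        /*
         * Worked example (illustrative): with accumulated run time
         * l_rtime = { 4 s, 900000 us }, start stamp l_stime = { 10 s, 700000 us }
         * and current time { 12 s, 100000 us }, the code above computes
         *
         *	u = 900000 + (100000 - 700000) = 300000
         *	s = 4 + (12 - 10) = 6
         *
         * u is already within [0, 1000000), so no carry is needed and the new
         * run time is { 6 s, 300000 us }.  Had u gone negative, 1000000 would
         * be added to it and one second borrowed from s; the opposite
         * adjustment is made when u reaches 1000000 or more.
         */
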
    342 /*
    343  * The machine independent parts of context switch.
    344  *
    345  * Returns 1 if another LWP was actually run.
    346  */
    347 int
    348 mi_switch(lwp_t *l)
    349 {
    350 	struct schedstate_percpu *spc;
    351 	struct lwp *newl;
    352 	int retval, oldspl;
    353 	struct cpu_info *ci;
    354 	struct timeval tv;
    355 	bool returning;
    356 
    357 	KASSERT(lwp_locked(l, NULL));
    358 	LOCKDEBUG_BARRIER(l->l_mutex, 1);
    359 
    360 #ifdef KSTACK_CHECK_MAGIC
    361 	kstack_check_magic(l);
    362 #endif
    363 
    364 	microtime(&tv);
    365 
    366 	/*
    367 	 * It's safe to read the per-CPU schedstate unlocked here, as all we
    368 	 * are after is the run time and that's guaranteed to have been last
    369 	 * updated by this CPU.
    370 	 */
    371 	ci = l->l_cpu;
    372 	KDASSERT(ci == curcpu());
    373 
    374 	/*
    375 	 * Process is about to yield the CPU; clear the appropriate
    376 	 * scheduling flags.
    377 	 */
    378 	spc = &ci->ci_schedstate;
    379 	returning = false;
    380 	newl = NULL;
    381 
    382 	/*
    383 	 * If we have been asked to switch to a specific LWP, then there
    384 	 * is no need to inspect the run queues.  If a soft interrupt is
    385 	 * blocking, then return to the interrupted thread without adjusting
    386 	 * VM context or its start time: neither has been changed in order
    387 	 * to take the interrupt.
    388 	 */
    389 	if (l->l_switchto != NULL) {
    390 		if ((l->l_flag & LW_INTR) != 0) {
    391 			returning = true;
    392 			softint_block(l);
    393 			if ((l->l_flag & LW_TIMEINTR) != 0)
    394 				updatertime(l, &tv);
    395 		}
    396 		newl = l->l_switchto;
    397 		l->l_switchto = NULL;
    398 	}
    399 
    400 	/* Count time spent in current system call */
    401 	if (!returning) {
    402 		SYSCALL_TIME_SLEEP(l);
    403 
    404 		/*
    405 		 * XXXSMP If we are using h/w performance counters,
    406 		 * save context.
    407 		 */
    408 #if PERFCTRS
    409 		if (PMC_ENABLED(l->l_proc)) {
    410 			pmc_save_context(l->l_proc);
    411 		}
    412 #endif
    413 		updatertime(l, &tv);
    414 	}
    415 
    416 	/*
    417 	 * If on the CPU and we have gotten this far, then we must yield.
    418 	 */
    419 	mutex_spin_enter(spc->spc_mutex);
    420 	KASSERT(l->l_stat != LSRUN);
    421 	if (l->l_stat == LSONPROC) {
    422 		KASSERT(lwp_locked(l, &spc->spc_lwplock));
    423 		if ((l->l_flag & LW_IDLE) == 0) {
    424 			l->l_stat = LSRUN;
    425 			lwp_setlock(l, spc->spc_mutex);
    426 			sched_enqueue(l, true);
    427 		} else
    428 			l->l_stat = LSIDL;
    429 	}
    430 
    431 	/*
    432 	 * Let sched_nextlwp() select the LWP to run on the CPU next.
    433 	 * If no LWP is runnable, switch to the idle LWP.
    434 	 * Note that spc_lwplock might not necessarily be held.
    435 	 */
    436 	if (newl == NULL) {
    437 		newl = sched_nextlwp();
    438 		if (newl != NULL) {
    439 			sched_dequeue(newl);
    440 			KASSERT(lwp_locked(newl, spc->spc_mutex));
    441 			newl->l_stat = LSONPROC;
    442 			newl->l_cpu = ci;
    443 			newl->l_flag |= LW_RUNNING;
    444 			lwp_setlock(newl, &spc->spc_lwplock);
    445 		} else {
    446 			newl = ci->ci_data.cpu_idlelwp;
    447 			newl->l_stat = LSONPROC;
    448 			newl->l_flag |= LW_RUNNING;
    449 		}
    450 		ci->ci_want_resched = 0;
    451 		spc->spc_flags &= ~SPCF_SWITCHCLEAR;
    452 	}
    453 
    454 	/* Update the new LWP's start time while it is still locked. */
    455 	if (!returning) {
    456 		newl->l_stime = tv;
    457 		/*
    458 		 * XXX The following may be done unlocked if newl != NULL
    459 		 * above.
    460 		 */
    461 		newl->l_priority = newl->l_usrpri;
    462 	}
    463 
    464 	spc->spc_curpriority = newl->l_usrpri;
    465 
    466 	if (l != newl) {
    467 		struct lwp *prevlwp;
    468 
    469 		/*
    470 		 * If the old LWP has been moved to a run queue above,
    471 		 * drop the general purpose LWP lock: it's now locked
    472 		 * by the scheduler lock.
    473 		 *
    474 		 * Otherwise, drop the scheduler lock.  We're done with
    475 		 * the run queues for now.
    476 		 */
    477 		if (l->l_mutex == spc->spc_mutex) {
    478 			mutex_spin_exit(&spc->spc_lwplock);
    479 		} else {
    480 			mutex_spin_exit(spc->spc_mutex);
    481 		}
    482 
    483 		/* Unlocked, but for statistics only. */
    484 		uvmexp.swtch++;
    485 
    486 		/*
    487 		 * Save old VM context, unless a soft interrupt
    488 		 * handler is blocking.
    489 		 */
    490 		if (!returning)
    491 			pmap_deactivate(l);
    492 
    493 		/* Switch to the new LWP.. */
    494 		l->l_ncsw++;
    495 		l->l_flag &= ~LW_RUNNING;
    496 		oldspl = MUTEX_SPIN_OLDSPL(ci);
    497 		prevlwp = cpu_switchto(l, newl);
    498 
    499 		/*
    500 		 * .. we have switched away and are now back so we must
    501 		 * be the new curlwp.  prevlwp is who we replaced.
    502 		 */
    503 		if (prevlwp != NULL) {
    504 			curcpu()->ci_mtx_oldspl = oldspl;
    505 			lwp_unlock(prevlwp);
    506 		} else {
    507 			splx(oldspl);
    508 		}
    509 
    510 		/* Restore VM context. */
    511 		pmap_activate(l);
    512 		retval = 1;
    513 	} else {
    514 		/* Nothing to do - just unlock and return. */
    515 		mutex_spin_exit(spc->spc_mutex);
    516 		lwp_unlock(l);
    517 		retval = 0;
    518 	}
    519 
    520 	KASSERT(l == curlwp);
    521 	KASSERT(l->l_stat == LSONPROC);
    522 	KASSERT(l->l_cpu == curcpu());
    523 
    524 	/*
    525 	 * XXXSMP If we are using h/w performance counters, restore context.
    526 	 */
    527 #if PERFCTRS
    528 	if (PMC_ENABLED(l->l_proc)) {
    529 		pmc_restore_context(l->l_proc);
    530 	}
    531 #endif
    532 
    533 	/*
    534 	 * We're running again; record our new start time.  We might
    535 	 * be running on a new CPU now, so don't use the cached
    536 	 * schedstate_percpu pointer.
    537 	 */
    538 	SYSCALL_TIME_WAKEUP(l);
    539 	KASSERT(curlwp == l);
    540 	KDASSERT(l->l_cpu == curcpu());
    541 	LOCKDEBUG_BARRIER(NULL, 1);
    542 
    543 	return retval;
    544 }
    545 
    546 /*
    547  * Change the LWP's state to runnable, placing it on the run queue if it is
    548  * in memory, and awakening the swapper if it isn't in memory.
    549  *
    550  * Call with the process and LWP locked.  Will return with the LWP unlocked.
    551  */
    552 void
    553 setrunnable(struct lwp *l)
    554 {
    555 	struct proc *p = l->l_proc;
    556 	sigset_t *ss;
    557 
    558 	KASSERT((l->l_flag & LW_IDLE) == 0);
    559 	KASSERT(mutex_owned(&p->p_smutex));
    560 	KASSERT(lwp_locked(l, NULL));
    561 
    562 	switch (l->l_stat) {
    563 	case LSSTOP:
    564 		/*
    565 		 * If we're being traced (possibly because someone attached us
    566 		 * while we were stopped), check for a signal from the debugger.
    567 		 */
    568 		if ((p->p_slflag & PSL_TRACED) != 0 && p->p_xstat != 0) {
    569 			if ((sigprop[p->p_xstat] & SA_TOLWP) != 0)
    570 				ss = &l->l_sigpend.sp_set;
    571 			else
    572 				ss = &p->p_sigpend.sp_set;
    573 			sigaddset(ss, p->p_xstat);
    574 			signotify(l);
    575 		}
    576 		p->p_nrlwps++;
    577 		break;
    578 	case LSSUSPENDED:
    579 		l->l_flag &= ~LW_WSUSPEND;
    580 		p->p_nrlwps++;
    581 		cv_broadcast(&p->p_lwpcv);
    582 		break;
    583 	case LSSLEEP:
    584 		KASSERT(l->l_wchan != NULL);
    585 		break;
    586 	default:
    587 		panic("setrunnable: lwp %p state was %d", l, l->l_stat);
    588 	}
    589 
    590 	/*
    591 	 * If the LWP was sleeping interruptibly, then it's OK to start it
    592 	 * again.  If not, mark it as still sleeping.
    593 	 */
    594 	if (l->l_wchan != NULL) {
    595 		l->l_stat = LSSLEEP;
    596 		/* lwp_unsleep() will release the lock. */
    597 		lwp_unsleep(l);
    598 		return;
    599 	}
    600 
    601 	/*
    602 	 * If the LWP is still on the CPU, mark it as LSONPROC.  It may be
    603 	 * about to call mi_switch(), in which case it will yield.
    604 	 */
    605 	if ((l->l_flag & LW_RUNNING) != 0) {
    606 		l->l_stat = LSONPROC;
    607 		l->l_slptime = 0;
    608 		lwp_unlock(l);
    609 		return;
    610 	}
    611 
    612 	/*
    613 	 * Set the LWP runnable.  If it's swapped out, we need to wake the swapper
    614 	 * to bring it back in.  Otherwise, enter it into a run queue.
    615 	 */
    616 	if (l->l_mutex != l->l_cpu->ci_schedstate.spc_mutex) {
    617 		spc_lock(l->l_cpu);
    618 		lwp_unlock_to(l, l->l_cpu->ci_schedstate.spc_mutex);
    619 	}
    620 
    621 	sched_setrunnable(l);
    622 	l->l_stat = LSRUN;
    623 	l->l_slptime = 0;
    624 
    625 	if (l->l_flag & LW_INMEM) {
    626 		sched_enqueue(l, false);
    627 		resched_cpu(l);
    628 		lwp_unlock(l);
    629 	} else {
    630 		lwp_unlock(l);
    631 		uvm_kick_scheduler();
    632 	}
    633 }
    634 
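        /*
         * Illustrative sketch, not part of the original file, of the locking
         * protocol that setrunnable() expects: the caller holds both the
         * process smutex and the LWP lock, and setrunnable() consumes the
         * LWP lock (suspendsched() below follows this pattern).
         *
         *	mutex_enter(&p->p_smutex);
         *	lwp_lock(l);
         *	setrunnable(l);
         *	mutex_exit(&p->p_smutex);
         *
         * On return from setrunnable() the LWP lock has been released, so
         * only p_smutex remains to be dropped.
         */
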
    635 /*
    636  * suspendsched:
    637  *
    638  *	Convert all non-PK_SYSTEM LSSLEEP or LSRUN LWPs to LSSUSPENDED.
    639  */
    640 void
    641 suspendsched(void)
    642 {
    643 	CPU_INFO_ITERATOR cii;
    644 	struct cpu_info *ci;
    645 	struct lwp *l;
    646 	struct proc *p;
    647 
    648 	/*
    649 	 * We do this by process in order not to violate the locking rules.
    650 	 */
    651 	mutex_enter(&proclist_mutex);
    652 	PROCLIST_FOREACH(p, &allproc) {
    653 		mutex_enter(&p->p_smutex);
    654 
    655 		if ((p->p_flag & PK_SYSTEM) != 0) {
    656 			mutex_exit(&p->p_smutex);
    657 			continue;
    658 		}
    659 
    660 		p->p_stat = SSTOP;
    661 
    662 		LIST_FOREACH(l, &p->p_lwps, l_sibling) {
    663 			if (l == curlwp)
    664 				continue;
    665 
    666 			lwp_lock(l);
    667 
    668 			/*
    669 			 * Set LW_WREBOOT so that the LWP will suspend itself
    670 			 * when it tries to return to user mode.  We want to
    671 			 * get as many LWPs as possible to the user/kernel
    672 			 * boundary, so that they will release any locks
    673 			 * that they hold.
    674 			 */
    675 			l->l_flag |= (LW_WREBOOT | LW_WSUSPEND);
    676 
    677 			if (l->l_stat == LSSLEEP &&
    678 			    (l->l_flag & LW_SINTR) != 0) {
    679 				/* setrunnable() will release the lock. */
    680 				setrunnable(l);
    681 				continue;
    682 			}
    683 
    684 			lwp_unlock(l);
    685 		}
    686 
    687 		mutex_exit(&p->p_smutex);
    688 	}
    689 	mutex_exit(&proclist_mutex);
    690 
    691 	/*
    692 	 * Kick all CPUs to make them preempt any LWPs running in user mode.
    693 	 * They'll trap into the kernel and suspend themselves in userret().
    694 	 */
    695 	for (CPU_INFO_FOREACH(cii, ci))
    696 		cpu_need_resched(ci, 0);
    697 }
    698 
    699 /*
    700  * sched_kpri:
    701  *
    702  *	Scale a priority level to a kernel priority level, usually
    703  *	for an LWP that is about to sleep.
    704  */
    705 pri_t
    706 sched_kpri(struct lwp *l)
    707 {
    708 	/*
    709 	 * Scale user priorities (127 -> 50) up to kernel priorities
    710 	 * in the range (49 -> 8).  Reserve the top 8 kernel priorities
    711 	 * for high priority kthreads.  Kernel priorities passed in
    712 	 * are left "as is".  XXX This is somewhat arbitrary.
    713 	 */
    714 	static const uint8_t kpri_tab[] = {
    715 		 0,   1,   2,   3,   4,   5,   6,   7,
    716 		 8,   9,  10,  11,  12,  13,  14,  15,
    717 		16,  17,  18,  19,  20,  21,  22,  23,
    718 		24,  25,  26,  27,  28,  29,  30,  31,
    719 		32,  33,  34,  35,  36,  37,  38,  39,
    720 		40,  41,  42,  43,  44,  45,  46,  47,
    721 		48,  49,   8,   8,   9,   9,  10,  10,
    722 		11,  11,  12,  12,  13,  14,  14,  15,
    723 		15,  16,  16,  17,  17,  18,  18,  19,
    724 		20,  20,  21,  21,  22,  22,  23,  23,
    725 		24,  24,  25,  26,  26,  27,  27,  28,
    726 		28,  29,  29,  30,  30,  31,  32,  32,
    727 		33,  33,  34,  34,  35,  35,  36,  36,
    728 		37,  38,  38,  39,  39,  40,  40,  41,
    729 		41,  42,  42,  43,  44,  44,  45,  45,
    730 		46,  46,  47,  47,  48,  48,  49,  49,
    731 	};
    732 
    733 	return (pri_t)kpri_tab[l->l_usrpri];
    734 }
    735 
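        /*
         * Illustrative readings of kpri_tab[] above (lower numeric value
         * means higher priority in this scheme): kernel priorities 0..49 map
         * to themselves, e.g. kpri_tab[20] == 20, so the reserved range 0..7
         * is only ever produced by passing it in.  The user range 50..127 is
         * compressed into 8..49: the strongest user priority, 50, becomes 8
         * and the weakest, 127, becomes 49.
         */
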
    736 /*
    737  * sched_unsleep:
    738  *
    739  *	This is called when the LWP has not been awoken normally but instead
    740  *	interrupted: for example, if the sleep timed out.  Because of this,
    741  *	it's not a valid action for running or idle LWPs.
    742  */
    743 static void
    744 sched_unsleep(struct lwp *l)
    745 {
    746 
    747 	lwp_unlock(l);
    748 	panic("sched_unsleep");
    749 }
    750 
    751 inline void
    752 resched_cpu(struct lwp *l)
    753 {
    754 	struct cpu_info *ci;
    755 	const pri_t pri = lwp_eprio(l);
    756 
    757 	/*
    758 	 * XXXSMP
    759 	 * Since l->l_cpu persists across a context switch,
    760 	 * this gives us *very weak* processor affinity, in
    761 	 * that we notify the CPU on which the process last
    762 	 * ran that it should try to switch.
    763 	 *
    764 	 * This does not guarantee that the process will run on
    765 	 * that processor next, because another processor might
    766 	 * grab it the next time it performs a context switch.
    767 	 *
    768 	 * This also does not handle the case where its last
    769 	 * CPU is running a higher-priority process, but every
    770 	 * other CPU is running a lower-priority process.  There
    771 	 * are ways to handle this situation, but they're not
    772 	 * currently very pretty, and we also need to weigh the
    773 	 * cost of moving a process from one CPU to another.
    774 	 */
    775 	ci = (l->l_cpu != NULL) ? l->l_cpu : curcpu();
    776 	if (pri < ci->ci_schedstate.spc_curpriority)
    777 		cpu_need_resched(ci, 0);
    778 }
    779 
    780 static void
    781 sched_changepri(struct lwp *l, pri_t pri)
    782 {
    783 
    784 	KASSERT(lwp_locked(l, NULL));
    785 
    786 	l->l_usrpri = pri;
    787 	if (l->l_priority < PUSER)
    788 		return;
    789 
    790 	if (l->l_stat != LSRUN || (l->l_flag & LW_INMEM) == 0) {
    791 		l->l_priority = pri;
    792 		return;
    793 	}
    794 
    795 	KASSERT(lwp_locked(l, l->l_cpu->ci_schedstate.spc_mutex));
    796 
    797 	sched_dequeue(l);
    798 	l->l_priority = pri;
    799 	sched_enqueue(l, false);
    800 	resched_cpu(l);
    801 }
    802 
    803 static void
    804 sched_lendpri(struct lwp *l, pri_t pri)
    805 {
    806 
    807 	KASSERT(lwp_locked(l, NULL));
    808 
    809 	if (l->l_stat != LSRUN || (l->l_flag & LW_INMEM) == 0) {
    810 		l->l_inheritedprio = pri;
    811 		return;
    812 	}
    813 
    814 	KASSERT(lwp_locked(l, l->l_cpu->ci_schedstate.spc_mutex));
    815 
    816 	sched_dequeue(l);
    817 	l->l_inheritedprio = pri;
    818 	sched_enqueue(l, false);
    819 	resched_cpu(l);
    820 }
    821 
    822 struct lwp *
    823 syncobj_noowner(wchan_t wchan)
    824 {
    825 
    826 	return NULL;
    827 }
    828 
    829 
    830 /* decay 95% of `p_pctcpu' in 60 seconds; see CCPU_SHIFT before changing */
    831 fixpt_t	ccpu = 0.95122942450071400909 * FSCALE;		/* exp(-1/20) */
    832 
    833 /*
    834  * If `ccpu' is not equal to `exp(-1/20)' and you still want to use the
    835  * faster/more-accurate formula, you'll have to estimate CCPU_SHIFT below
    836  * and possibly adjust FSHIFT in "param.h" so that (FSHIFT >= CCPU_SHIFT).
    837  *
    838  * To estimate CCPU_SHIFT for exp(-1/20), the following formula was used:
    839  *	1 - exp(-1/20) ~= 0.0487 ~= 0.0488 == 1 (fixed pt, *11* bits).
    840  *
    841  * If you don't want to bother with the faster/more-accurate formula, you
    842  * can set CCPU_SHIFT to (FSHIFT + 1) which will use a slower/less-accurate
    843  * (more general) method of calculating the %age of CPU used by a process.
    844  */
    845 #define	CCPU_SHIFT	(FSHIFT + 1)
    846 
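        /*
         * Worked example (illustrative): sched_pstats() below runs once per
         * second and scales each %CPU estimate by ccpu/FSCALE = exp(-1/20).
         * After 60 seconds with no new ticks the estimate has therefore been
         * scaled by exp(-60/20) = exp(-3) ~= 0.0498, i.e. about 95% of it has
         * decayed away, which is where the "decay 95% in 60 seconds" figure
         * above comes from.  Conversely, with the generic formula selected by
         * CCPU_SHIFT == FSHIFT + 1, an LWP that consumes every statclock tick
         * gains (FSCALE - ccpu) per second and so converges towards
         * l_pctcpu ~= FSCALE, i.e. 100%.
         */
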
    847 /*
    848  * sched_pstats:
    849  *
    850  * Update process statistics and check CPU resource allocation.
    851  * Call scheduler-specific hook to eventually adjust process/LWP
    852  * priorities.
    853  */
    854 /* ARGSUSED */
    855 void
    856 sched_pstats(void *arg)
    857 {
    858 	struct rlimit *rlim;
    859 	struct lwp *l;
    860 	struct proc *p;
    861 	int minslp, sig, clkhz;
    862 	long runtm;
    863 
    864 	sched_pstats_ticks++;
    865 
    866 	mutex_enter(&proclist_mutex);
    867 	PROCLIST_FOREACH(p, &allproc) {
    868 		/*
    869 		 * Increment time in/out of memory and sleep time (if
    870 		 * sleeping).  We ignore overflow; with 16-bit int's
    871 		 * (remember them?) overflow takes 45 days.
    872 		 */
    873 		minslp = 2;
    874 		mutex_enter(&p->p_smutex);
    875 		mutex_spin_enter(&p->p_stmutex);
    876 		runtm = p->p_rtime.tv_sec;
    877 		LIST_FOREACH(l, &p->p_lwps, l_sibling) {
    878 			if ((l->l_flag & LW_IDLE) != 0)
    879 				continue;
    880 			lwp_lock(l);
    881 			runtm += l->l_rtime.tv_sec;
    882 			l->l_swtime++;
    883 			if (l->l_stat == LSSLEEP || l->l_stat == LSSTOP ||
    884 			    l->l_stat == LSSUSPENDED) {
    885 				l->l_slptime++;
    886 				minslp = min(minslp, l->l_slptime);
    887 			} else
    888 				minslp = 0;
    889 			sched_pstats_hook(l);
    890 			lwp_unlock(l);
    891 
    892 			/*
    893 			 * p_pctcpu is only for ps.
    894 			 */
    895 			l->l_pctcpu = (l->l_pctcpu * ccpu) >> FSHIFT;
    896 			if (l->l_slptime < 1) {
    897 				clkhz = stathz != 0 ? stathz : hz;
    898 #if	(FSHIFT >= CCPU_SHIFT)
    899 				l->l_pctcpu += (clkhz == 100) ?
    900 				    ((fixpt_t)l->l_cpticks) <<
    901 				        (FSHIFT - CCPU_SHIFT) :
    902 				    100 * (((fixpt_t) l->l_cpticks)
    903 				        << (FSHIFT - CCPU_SHIFT)) / clkhz;
    904 #else
    905 				l->l_pctcpu += ((FSCALE - ccpu) *
    906 				    (l->l_cpticks * FSCALE / clkhz)) >> FSHIFT;
    907 #endif
    908 				l->l_cpticks = 0;
    909 			}
    910 		}
    911 
    912 		p->p_pctcpu = (p->p_pctcpu * ccpu) >> FSHIFT;
    913 #ifdef SCHED_4BSD
    914 		/*
    915 		 * XXX: Workaround - belongs to sched_4bsd.c
    916 		 * If the process has slept the entire second,
    917 		 * stop recalculating its priority until it wakes up.
    918 		 */
    919 		if (minslp <= 1) {
    920 			extern fixpt_t decay_cpu(fixpt_t, fixpt_t);
    921 
    922 			fixpt_t loadfac = 2 * (averunnable.ldavg[0]);
    923 			p->p_estcpu = decay_cpu(loadfac, p->p_estcpu);
    924 		}
    925 #endif
    926 		mutex_spin_exit(&p->p_stmutex);
    927 
    928 		/*
    929 		 * Check if the process exceeds its CPU resource allocation.
    930 		 * If over max, kill it.
    931 		 */
    932 		rlim = &p->p_rlimit[RLIMIT_CPU];
    933 		sig = 0;
    934 		if (runtm >= rlim->rlim_cur) {
    935 			if (runtm >= rlim->rlim_max)
    936 				sig = SIGKILL;
    937 			else {
    938 				sig = SIGXCPU;
    939 				if (rlim->rlim_cur < rlim->rlim_max)
    940 					rlim->rlim_cur += 5;
    941 			}
    942 		}
    943 		mutex_exit(&p->p_smutex);
    944 		if (sig) {
    945 			psignal(p, sig);
    946 		}
    947 	}
    948 	mutex_exit(&proclist_mutex);
    949 	uvm_meter();
    950 	cv_wakeup(&lbolt);
    951 	callout_schedule(&sched_pstats_ch, hz);
    952 }
    953 
    954 void
    955 sched_init(void)
    956 {
    957 
    958 	cv_init(&lbolt, "lbolt");
    959 	callout_init(&sched_pstats_ch, 0);
    960 	callout_setfunc(&sched_pstats_ch, sched_pstats, NULL);
    961 	sched_setup();
    962 	sched_pstats(NULL);
    963 }
    964