sys/kern/kern_synch.c

1.166.2.17        ad /*	$NetBSD: kern_synch.c,v 1.166.2.17 2007/01/30 13:51:41 ad Exp $	*/
      1.63   thorpej
      1.63   thorpej /*-
1.166.2.11        ad  * Copyright (c) 1999, 2000, 2004, 2006, 2007 The NetBSD Foundation, Inc.
      1.63   thorpej  * All rights reserved.
      1.63   thorpej  *
      1.63   thorpej  * This code is derived from software contributed to The NetBSD Foundation
      1.63   thorpej  * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
 1.166.2.2        ad  * NASA Ames Research Center, by Charles M. Hannum, and by Andrew Doran.
      1.63   thorpej  *
      1.63   thorpej  * Redistribution and use in source and binary forms, with or without
      1.63   thorpej  * modification, are permitted provided that the following conditions
      1.63   thorpej  * are met:
      1.63   thorpej  * 1. Redistributions of source code must retain the above copyright
      1.63   thorpej  *    notice, this list of conditions and the following disclaimer.
      1.63   thorpej  * 2. Redistributions in binary form must reproduce the above copyright
      1.63   thorpej  *    notice, this list of conditions and the following disclaimer in the
      1.63   thorpej  *    documentation and/or other materials provided with the distribution.
      1.63   thorpej  * 3. All advertising materials mentioning features or use of this software
      1.63   thorpej  *    must display the following acknowledgement:
      1.63   thorpej  *	This product includes software developed by the NetBSD
      1.63   thorpej  *	Foundation, Inc. and its contributors.
      1.63   thorpej  * 4. Neither the name of The NetBSD Foundation nor the names of its
      1.63   thorpej  *    contributors may be used to endorse or promote products derived
      1.63   thorpej  *    from this software without specific prior written permission.
      1.63   thorpej  *
      1.63   thorpej  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
      1.63   thorpej  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
      1.63   thorpej  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
      1.63   thorpej  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
      1.63   thorpej  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
      1.63   thorpej  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
      1.63   thorpej  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
      1.63   thorpej  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
      1.63   thorpej  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
      1.63   thorpej  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
      1.63   thorpej  * POSSIBILITY OF SUCH DAMAGE.
      1.63   thorpej  */
      1.26       cgd
      1.26       cgd /*-
      1.26       cgd  * Copyright (c) 1982, 1986, 1990, 1991, 1993
      1.26       cgd  *	The Regents of the University of California.  All rights reserved.
      1.26       cgd  * (c) UNIX System Laboratories, Inc.
      1.26       cgd  * All or some portions of this file are derived from material licensed
      1.26       cgd  * to the University of California by American Telephone and Telegraph
      1.26       cgd  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
      1.26       cgd  * the permission of UNIX System Laboratories, Inc.
      1.26       cgd  *
      1.26       cgd  * Redistribution and use in source and binary forms, with or without
      1.26       cgd  * modification, are permitted provided that the following conditions
      1.26       cgd  * are met:
      1.26       cgd  * 1. Redistributions of source code must retain the above copyright
      1.26       cgd  *    notice, this list of conditions and the following disclaimer.
      1.26       cgd  * 2. Redistributions in binary form must reproduce the above copyright
      1.26       cgd  *    notice, this list of conditions and the following disclaimer in the
      1.26       cgd  *    documentation and/or other materials provided with the distribution.
     1.136       agc  * 3. Neither the name of the University nor the names of its contributors
      1.26       cgd  *    may be used to endorse or promote products derived from this software
      1.26       cgd  *    without specific prior written permission.
      1.26       cgd  *
      1.26       cgd  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
      1.26       cgd  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
      1.26       cgd  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
      1.26       cgd  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
      1.26       cgd  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
      1.26       cgd  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
      1.26       cgd  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
      1.26       cgd  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
      1.26       cgd  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
      1.26       cgd  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
      1.26       cgd  * SUCH DAMAGE.
      1.26       cgd  *
      1.50      fvdl  *	@(#)kern_synch.c	8.9 (Berkeley) 5/19/95
      1.26       cgd  */
     1.106     lukem
     1.106     lukem #include <sys/cdefs.h>
1.166.2.17        ad __KERNEL_RCSID(0, "$NetBSD: kern_synch.c,v 1.166.2.17 2007/01/30 13:51:41 ad Exp $");
      1.48       mrg
      1.52  jonathan #include "opt_ddb.h"
     1.109      yamt #include "opt_kstack.h"
      1.82   thorpej #include "opt_lockdebug.h"
      1.83   thorpej #include "opt_multiprocessor.h"
     1.110    briggs #include "opt_perfctrs.h"
      1.26       cgd
 1.166.2.2        ad #define	__MUTEX_PRIVATE
 1.166.2.2        ad
      1.26       cgd #include <sys/param.h>
      1.26       cgd #include <sys/systm.h>
      1.68   thorpej #include <sys/callout.h>
      1.26       cgd #include <sys/proc.h>
      1.26       cgd #include <sys/kernel.h>
      1.26       cgd #include <sys/buf.h>
     1.111    briggs #if defined(PERFCTRS)
     1.110    briggs #include <sys/pmc.h>
     1.111    briggs #endif
      1.26       cgd #include <sys/signalvar.h>
      1.26       cgd #include <sys/resourcevar.h>
      1.55      ross #include <sys/sched.h>
     1.161      elad #include <sys/kauth.h>
 1.166.2.2        ad #include <sys/sleepq.h>
 1.166.2.2        ad #include <sys/lockdebug.h>
      1.47       mrg
      1.47       mrg #include <uvm/uvm_extern.h>
      1.47       mrg
      1.26       cgd #include <machine/cpu.h>
      1.34  christos
      1.26       cgd int	lbolt;			/* once a second sleep address; woken by schedcpu() */
      1.88  sommerfe int	rrticks;		/* number of hardclock ticks per roundrobin() */
      1.26       cgd
      1.73   thorpej /*
      1.73   thorpej  * The global scheduler state.
      1.73   thorpej  */
 1.166.2.4        ad kmutex_t	sched_mutex;		/* global sched state mutex */
 1.166.2.2        ad struct prochd	sched_qs[RUNQUE_NQS];	/* run queues */
     1.159     perry volatile uint32_t sched_whichqs;	/* bitmap of non-empty queues */
      1.34  christos
                     /* Prototypes for routines defined in this file or elsewhere. */
 1.166.2.2        ad void	schedcpu(void *);
 1.166.2.2        ad void	updatepri(struct lwp *);
 1.166.2.2        ad void	sa_awaken(struct lwp *);
      1.63   thorpej
 1.166.2.4        ad void	sched_unsleep(struct lwp *);
 1.166.2.4        ad void	sched_changepri(struct lwp *, int);
 1.166.2.4        ad
                     /* Callout that runs schedcpu(); schedcpu() re-arms it for hz ticks. */
     1.143      yamt struct callout schedcpu_ch = CALLOUT_INITIALIZER_SETFUNC(schedcpu, NULL);
     1.157      yamt static unsigned int schedcpu_ticks;	/* bumped once per schedcpu() call */
     1.122   thorpej
                     /*
                      * Sync object handed to sleepq_block() by ltsleep() and kpause().
                      * NOTE(review): the three members look like flags, an unsleep hook
                      * and a priority-change hook - confirm against syncobj_t.
                      */
 1.166.2.4        ad syncobj_t sleep_syncobj = {
 1.166.2.4        ad 	SOBJ_SLEEPQ_SORTED,
 1.166.2.4        ad 	sleepq_unsleep,
 1.166.2.4        ad 	sleepq_changepri
 1.166.2.4        ad };
 1.166.2.4        ad
                     /*
                      * Same layout as sleep_syncobj but using the sched_unsleep() and
                      * sched_changepri() routines declared above.
                      */
 1.166.2.4        ad syncobj_t sched_syncobj = {
 1.166.2.4        ad 	SOBJ_SLEEPQ_SORTED,
 1.166.2.4        ad 	sched_unsleep,
 1.166.2.4        ad 	sched_changepri
 1.166.2.4        ad };
 1.166.2.4        ad
      1.26       cgd /*
      1.26       cgd  * Force switch among equal priority processes every 100ms.
      1.88  sommerfe  * Called from hardclock every hz/10 == rrticks hardclock ticks.
                      *
                      * Reloads the spc_rrticks countdown in ci's schedstate from rrticks.
                      * When there is a current LWP, the first pass sets SPCF_SEENRR; a
                      * later pass that still finds SPCF_SEENRR set adds SPCF_SHOULDYIELD.
                      * A reschedule is requested unconditionally before returning.
                      *
                      * NOTE(review): the reschedule is requested on curcpu() rather than
                      * on ci - presumably the two are always the same CPU here; confirm
                      * against the caller.
      1.26       cgd  */
      1.26       cgd /* ARGSUSED */
      1.26       cgd void
      1.89  sommerfe roundrobin(struct cpu_info *ci)
      1.26       cgd {
      1.89  sommerfe 	struct schedstate_percpu *spc = &ci->ci_schedstate;
      1.26       cgd
      1.88  sommerfe 	spc->spc_rrticks = rrticks;	/* restart the per-CPU tick countdown */
     1.130   nathanw
     1.122   thorpej 	if (curlwp != NULL) {
      1.73   thorpej 		if (spc->spc_flags & SPCF_SEENRR) {
      1.69   thorpej 			/*
      1.69   thorpej 			 * The process has already been through a roundrobin
      1.69   thorpej 			 * without switching and may be hogging the CPU.
      1.69   thorpej 			 * Indicate that the process should yield.
      1.69   thorpej 			 */
      1.73   thorpej 			spc->spc_flags |= SPCF_SHOULDYIELD;
      1.69   thorpej 		} else
      1.73   thorpej 			spc->spc_flags |= SPCF_SEENRR;	/* first pass: just remember it */
      1.69   thorpej 	}
 1.166.2.2        ad 	cpu_need_resched(curcpu());
      1.26       cgd }
      1.26       cgd
     1.153      yamt #define	PPQ	(128 / RUNQUE_NQS)	/* priorities per queue */
     1.153      yamt #define	NICE_WEIGHT 2			/* priorities per nice level */
     1.153      yamt
     1.153      yamt #define	ESTCPU_SHIFT	11	/* left shift applied when forming ESTCPU_MAX */
                     /* Largest estcpu value that ESTCPULIM() lets through. */
     1.153      yamt #define	ESTCPU_MAX	((NICE_WEIGHT * PRIO_MAX - PPQ) << ESTCPU_SHIFT)
     1.153      yamt #define	ESTCPULIM(e)	min((e), ESTCPU_MAX)	/* clamp e to ESTCPU_MAX */
     1.153      yamt
      1.26       cgd /*
      1.26       cgd  * Constants for digital decay and forget:
      1.26       cgd  *	90% of (p_estcpu) usage in 5 * loadav time
      1.26       cgd  *	95% of (p_pctcpu) usage in 60 seconds (load insensitive)
      1.26       cgd  *          Note that, as ps(1) mentions, this can let percentages
      1.26       cgd  *          total over 100% (I've seen 137.9% for 3 processes).
      1.26       cgd  *
      1.26       cgd  * Note that hardclock updates p_estcpu and p_cpticks independently.
      1.26       cgd  *
      1.26       cgd  * We wish to decay away 90% of p_estcpu in (5 * loadavg) seconds.
      1.26       cgd  * That is, the system wants to compute a value of decay such
      1.26       cgd  * that the following for loop:
      1.26       cgd  * 	for (i = 0; i < (5 * loadavg); i++)
      1.26       cgd  * 		p_estcpu *= decay;
      1.26       cgd  * will compute
      1.26       cgd  * 	p_estcpu *= 0.1;
      1.26       cgd  * for all values of loadavg:
      1.26       cgd  *
      1.26       cgd  * Mathematically this loop can be expressed by saying:
      1.26       cgd  * 	decay ** (5 * loadavg) ~= .1
      1.26       cgd  *
      1.26       cgd  * The system computes decay as:
      1.26       cgd  * 	decay = (2 * loadavg) / (2 * loadavg + 1)
      1.26       cgd  *
      1.26       cgd  * We wish to prove that the system's computation of decay
      1.26       cgd  * will always fulfill the equation:
      1.26       cgd  * 	decay ** (5 * loadavg) ~= .1
      1.26       cgd  *
      1.26       cgd  * If we compute b as:
      1.26       cgd  * 	b = 2 * loadavg
      1.26       cgd  * then
      1.26       cgd  * 	decay = b / (b + 1)
      1.26       cgd  *
      1.26       cgd  * We now need to prove two things:
      1.26       cgd  *	1) Given factor ** (5 * loadavg) ~= .1, prove factor == b/(b+1)
      1.26       cgd  *	2) Given b/(b+1) ** power ~= .1, prove power == (5 * loadavg)
     1.130   nathanw  *
      1.26       cgd  * Facts:
      1.26       cgd  *         For x close to zero, exp(x) =~ 1 + x, since
      1.26       cgd  *              exp(x) = 0! + x**1/1! + x**2/2! + ... .
      1.26       cgd  *              therefore exp(-1/b) =~ 1 - (1/b) = (b-1)/b.
      1.26       cgd  *         For x close to zero, ln(1+x) =~ x, since
      1.26       cgd  *              ln(1+x) = x - x**2/2 + x**3/3 - ...     -1 < x < 1
      1.26       cgd  *              therefore ln(b/(b+1)) = ln(1 - 1/(b+1)) =~ -1/(b+1).
      1.26       cgd  *         ln(.1) =~ -2.30
      1.26       cgd  *
      1.26       cgd  * Proof of (1):
      1.26       cgd  *    Solve (factor)**(power) =~ .1 given power (5*loadav):
      1.26       cgd  *	solving for factor,
      1.26       cgd  *      ln(factor) =~ (-2.30/5*loadav), or
      1.26       cgd  *      factor =~ exp(-1/((5/2.30)*loadav)) =~ exp(-1/(2*loadav)) =
      1.26       cgd  *          exp(-1/b) =~ (b-1)/b =~ b/(b+1).                    QED
      1.26       cgd  *
      1.26       cgd  * Proof of (2):
      1.26       cgd  *    Solve (factor)**(power) =~ .1 given factor == (b/(b+1)):
      1.26       cgd  *	solving for power,
      1.26       cgd  *      power*ln(b/(b+1)) =~ -2.30, or
      1.26       cgd  *      power =~ 2.3 * (b + 1) = 4.6*loadav + 2.3 =~ 5*loadav.  QED
      1.26       cgd  *
      1.26       cgd  * Actual power values for the implemented algorithm are as follows:
      1.26       cgd  *      loadav: 1       2       3       4
      1.26       cgd  *      power:  5.68    10.32   14.94   19.55
      1.26       cgd  */
      1.26       cgd
      1.26       cgd /* calculations for digital decay to forget 90% of usage in 5*loadav sec */
                     /* loadfactor(loadav) is the "b = 2 * loadavg" term of the derivation above. */
      1.26       cgd #define	loadfactor(loadav)	(2 * (loadav))
     1.153      yamt
                     /*
                      * decay_cpu:
                      *
                      *	Apply one decay step to estcpu and return the result:
                      *	estcpu * loadfac / (loadfac + FSCALE), i.e. the b / (b + 1)
                      *	factor from the derivation above, in fixed point.  An estcpu
                      *	of zero is returned unchanged without any arithmetic.
                      */
     1.153      yamt static fixpt_t
     1.153      yamt decay_cpu(fixpt_t loadfac, fixpt_t estcpu)
     1.153      yamt {
     1.153      yamt
     1.153      yamt 	if (estcpu == 0) {
     1.153      yamt 		return 0;
     1.153      yamt 	}
     1.153      yamt
     1.153      yamt #if !defined(_LP64)
     1.153      yamt 	/* avoid 64bit arithmetic. */
     1.153      yamt #define	FIXPT_MAX ((fixpt_t)((UINTMAX_C(1) << sizeof(fixpt_t) * CHAR_BIT) - 1))
                     	/*
                     	 * With loadfac this small the product below fits in fixpt_t,
                     	 * assuming estcpu <= ESTCPU_MAX - TODO confirm callers clamp it.
                     	 */
     1.153      yamt 	if (__predict_true(loadfac <= FIXPT_MAX / ESTCPU_MAX)) {
     1.153      yamt 		return estcpu * loadfac / (loadfac + FSCALE);
     1.153      yamt 	}
     1.153      yamt #endif /* !defined(_LP64) */
     1.153      yamt
                     	/* General case: widen to 64 bits before multiplying. */
     1.153      yamt 	return (uint64_t)estcpu * loadfac / (loadfac + FSCALE);
     1.153      yamt }
      1.26       cgd
     1.157      yamt /*
     1.157      yamt  * For all load averages >= 1 and max p_estcpu of (255 << ESTCPU_SHIFT),
     1.157      yamt  * sleeping for at least seven times the loadfactor will decay p_estcpu to
     1.157      yamt  * less than (1 << ESTCPU_SHIFT).
     1.157      yamt  *
     1.157      yamt  * note that our ESTCPU_MAX is actually much smaller than (255 << ESTCPU_SHIFT).
                      *
                      * decay_cpu_batch() returns estcpu after repeated decay_cpu() steps
                      * for a count of n.  It returns 0 straight away when
                      * (n << FSHIFT) >= 7 * loadfac, and stops early once estcpu hits 0.
                      *
                      * NOTE(review): the loop condition is n > 1, so at most n - 1 decay
                      * steps are applied - confirm that this is the intended count.
     1.157      yamt  */
     1.157      yamt static fixpt_t
     1.157      yamt decay_cpu_batch(fixpt_t loadfac, fixpt_t estcpu, unsigned int n)
     1.157      yamt {
     1.157      yamt
     1.157      yamt 	if ((n << FSHIFT) >= 7 * loadfac) {
     1.157      yamt 		return 0;
     1.157      yamt 	}
     1.157      yamt
     1.157      yamt 	while (estcpu != 0 && n > 1) {
     1.157      yamt 		estcpu = decay_cpu(loadfac, estcpu);
     1.157      yamt 		n--;
     1.157      yamt 	}
     1.157      yamt
     1.157      yamt 	return estcpu;
     1.157      yamt }
     1.157      yamt
      1.26       cgd /* decay 95% of `p_pctcpu' in 60 seconds; see CCPU_SHIFT before changing */
                     /* schedcpu() multiplies p_pctcpu by ccpu and shifts right by FSHIFT. */
      1.26       cgd fixpt_t	ccpu = 0.95122942450071400909 * FSCALE;		/* exp(-1/20) */
      1.26       cgd
      1.26       cgd /*
      1.26       cgd  * If `ccpu' is not equal to `exp(-1/20)' and you still want to use the
      1.26       cgd  * faster/more-accurate formula, you'll have to estimate CCPU_SHIFT below
      1.26       cgd  * and possibly adjust FSHIFT in "param.h" so that (FSHIFT >= CCPU_SHIFT).
      1.26       cgd  *
      1.26       cgd  * To estimate CCPU_SHIFT for exp(-1/20), the following formula was used:
      1.26       cgd  *	1 - exp(-1/20) ~= 0.0487 ~= 0.0488 == 1 (fixed pt, *11* bits).
      1.26       cgd  *
      1.26       cgd  * If you don't want to bother with the faster/more-accurate formula, you
      1.26       cgd  * can set CCPU_SHIFT to (FSHIFT + 1) which will use a slower/less-accurate
      1.26       cgd  * (more general) method of calculating the %age of CPU used by a process.
                      *
                      * schedcpu() picks between the two formulas at compile time with
                      * "#if (FSHIFT >= CCPU_SHIFT)".
      1.26       cgd  */
      1.26       cgd #define	CCPU_SHIFT	11
      1.26       cgd
      1.26       cgd /*
 1.166.2.7        ad  * schedcpu:
 1.166.2.7        ad  *
 1.166.2.7        ad  *	Recompute process priorities, every hz ticks.
                      *
                      *	For every process on allproc, with proclist_mutex held and
                      *	the process's p_smutex taken for the duration of the visit:
                      *	  - age each LWP's l_swtime and, if it is sleeping, stopped
                      *	    or suspended, its l_slptime;
                      *	  - decay p_pctcpu by ccpu;
                      *	  - compare accumulated run time (whole seconds) with
                      *	    RLIMIT_CPU and pick SIGXCPU or SIGKILL;
                      *	  - apply autonice where configured;
                      *	  - unless every LWP has slept for more than a second, fold
                      *	    p_cpticks into p_pctcpu, decay p_estcpu and reset the
                      *	    priority of the recently active LWPs.
                      *	The chosen signal is posted after p_smutex is dropped.
                      *	Finally calls uvm_meter(), wakes sleepers on &lbolt and
                      *	re-arms schedcpu_ch for another hz ticks.  arg is unused.
 1.166.2.7        ad  *
 1.166.2.7        ad  *	XXXSMP This needs to be reorganised in order to reduce the locking
 1.166.2.7        ad  *	burden.
      1.26       cgd  */
      1.26       cgd /* ARGSUSED */
      1.26       cgd void
      1.77   thorpej schedcpu(void *arg)
      1.26       cgd {
      1.71  augustss 	fixpt_t loadfac = loadfactor(averunnable.ldavg[0]);
 1.166.2.2        ad 	struct rlimit *rlim;
     1.122   thorpej 	struct lwp *l;
      1.71  augustss 	struct proc *p;
1.166.2.11        ad 	int minslp, clkhz, sig;
 1.166.2.2        ad 	long runtm;
      1.26       cgd
     1.157      yamt 	schedcpu_ticks++;
     1.157      yamt
 1.166.2.1        ad 	mutex_enter(&proclist_mutex);
     1.145      yamt 	PROCLIST_FOREACH(p, &allproc) {
      1.26       cgd 		/*
 1.166.2.2        ad 		 * Increment time in/out of memory and sleep time (if
 1.166.2.2        ad 		 * sleeping).  We ignore overflow; with 16-bit int's
      1.26       cgd 		 * (remember them?) overflow takes 45 days.
      1.26       cgd 		 */
     1.122   thorpej 		minslp = 2;	/* becomes min l_slptime, or 0 if any LWP is active */
 1.166.2.2        ad 		mutex_enter(&p->p_smutex);
1.166.2.10      yamt 		runtm = p->p_rtime.tv_sec;	/* whole seconds only */
     1.122   thorpej 		LIST_FOREACH(l, &p->p_lwps, l_sibling) {
 1.166.2.2        ad 			lwp_lock(l);
 1.166.2.2        ad 			runtm += l->l_rtime.tv_sec;
     1.122   thorpej 			l->l_swtime++;
     1.130   nathanw 			if (l->l_stat == LSSLEEP || l->l_stat == LSSTOP ||
     1.122   thorpej 			    l->l_stat == LSSUSPENDED) {
     1.122   thorpej 				l->l_slptime++;
     1.122   thorpej 				minslp = min(minslp, l->l_slptime);
     1.122   thorpej 			} else
     1.122   thorpej 				minslp = 0;
 1.166.2.2        ad 			lwp_unlock(l);
     1.122   thorpej 		}
      1.26       cgd 		p->p_pctcpu = (p->p_pctcpu * ccpu) >> FSHIFT;
 1.166.2.2        ad
 1.166.2.2        ad 		/*
 1.166.2.2        ad 		 * Check if the process exceeds its CPU resource allocation.
 1.166.2.7        ad 		 * If over max, kill it.
 1.166.2.2        ad 		 */
 1.166.2.2        ad 		rlim = &p->p_rlimit[RLIMIT_CPU];
1.166.2.11        ad 		sig = 0;	/* signal to post once p_smutex is released */
 1.166.2.2        ad 		if (runtm >= rlim->rlim_cur) {
 1.166.2.2        ad 			if (runtm >= rlim->rlim_max)
 1.166.2.9      yamt 				sig = SIGKILL;
 1.166.2.2        ad 			else {
 1.166.2.9      yamt 				sig = SIGXCPU;
                     				/*
                     				 * Raise the soft limit by 5 so that the
                     				 * test above stays false until runtm
                     				 * catches up again.
                     				 */
 1.166.2.2        ad 				if (rlim->rlim_cur < rlim->rlim_max)
 1.166.2.2        ad 					rlim->rlim_cur += 5;
 1.166.2.2        ad 			}
 1.166.2.2        ad 		}
 1.166.2.7        ad
 1.166.2.7        ad 		/*
 1.166.2.7        ad 		 * If the process has run for more than autonicetime, reduce
 1.166.2.7        ad 		 * priority to give others a chance.
                     		 * Only applies while p_nice is still NZERO and the
                     		 * effective uid is non-zero.
 1.166.2.7        ad 		 */
 1.166.2.2        ad 		if (autonicetime && runtm > autonicetime && p->p_nice == NZERO
 1.166.2.2        ad 		    && kauth_cred_geteuid(p->p_cred)) {
1.166.2.13        ad 			mutex_enter(&p->p_stmutex);
 1.166.2.2        ad 			p->p_nice = autoniceval + NZERO;
 1.166.2.2        ad 			resetprocpriority(p);
1.166.2.13        ad 			mutex_exit(&p->p_stmutex);
 1.166.2.2        ad 		}
 1.166.2.2        ad
      1.26       cgd 		/*
      1.26       cgd 		 * If the process has slept the entire second,
      1.26       cgd 		 * stop recalculating its priority until it wakes up.
      1.26       cgd 		 */
1.166.2.11        ad 		if (minslp <= 1) {
1.166.2.11        ad 			/*
1.166.2.11        ad 			 * p_pctcpu is only for ps.
1.166.2.11        ad 			 */
1.166.2.11        ad 			mutex_enter(&p->p_stmutex);
1.166.2.11        ad 			clkhz = stathz != 0 ? stathz : hz;	/* prefer stathz when set */
      1.26       cgd #if	(FSHIFT >= CCPU_SHIFT)
1.166.2.11        ad 			p->p_pctcpu += (clkhz == 100)?
1.166.2.11        ad 			    ((fixpt_t) p->p_cpticks) << (FSHIFT - CCPU_SHIFT):
1.166.2.11        ad 			    100 * (((fixpt_t) p->p_cpticks)
1.166.2.11        ad 			    << (FSHIFT - CCPU_SHIFT)) / clkhz;
      1.26       cgd #else
1.166.2.11        ad 			p->p_pctcpu += ((FSCALE - ccpu) *
1.166.2.11        ad 			    (p->p_cpticks * FSCALE / clkhz)) >> FSHIFT;
      1.26       cgd #endif
1.166.2.11        ad 			p->p_cpticks = 0;	/* consumed; start the next interval */
1.166.2.11        ad 			p->p_estcpu = decay_cpu(loadfac, p->p_estcpu);
1.166.2.11        ad
                     			/* Reset priority only for LWPs that slept <= 1 pass. */
1.166.2.11        ad 			LIST_FOREACH(l, &p->p_lwps, l_sibling) {
1.166.2.11        ad 				lwp_lock(l);
1.166.2.11        ad 				if (l->l_slptime <= 1)
1.166.2.11        ad 					resetpriority(l);
1.166.2.11        ad 				lwp_unlock(l);
1.166.2.11        ad 			}
1.166.2.13        ad 			mutex_exit(&p->p_stmutex);
      1.26       cgd 		}
1.166.2.11        ad
 1.166.2.2        ad 		mutex_exit(&p->p_smutex);
 1.166.2.9      yamt 		if (sig) {
 1.166.2.9      yamt 			psignal(p, sig);
 1.166.2.9      yamt 		}
      1.26       cgd 	}
 1.166.2.1        ad 	mutex_exit(&proclist_mutex);
      1.47       mrg 	uvm_meter();
      1.67      fvdl 	wakeup((caddr_t)&lbolt);	/* the once-a-second wakeup */
     1.143      yamt 	callout_schedule(&schedcpu_ch, hz);	/* run again in hz ticks */
      1.26       cgd }
      1.26       cgd
      1.26       cgd /*
      1.26       cgd  * Recalculate the priority of a process after it has slept for a while.
                      *
                      * The caller must hold l's lock and l->l_slptime must be greater
                      * than 1; both are asserted.  l_slptime is decremented by one, the
                      * owning process's p_estcpu is batch-decayed for the remaining
                      * l_slptime using the load factor derived from averunnable.ldavg[0],
                      * and resetpriority(l) is then called.
      1.26       cgd  */
      1.26       cgd void
     1.122   thorpej updatepri(struct lwp *l)
      1.26       cgd {
     1.122   thorpej 	struct proc *p = l->l_proc;
      1.83   thorpej 	fixpt_t loadfac;
      1.83   thorpej
 1.166.2.2        ad 	LOCK_ASSERT(lwp_locked(l, NULL));
     1.157      yamt 	KASSERT(l->l_slptime > 1);
      1.83   thorpej
      1.83   thorpej 	loadfac = loadfactor(averunnable.ldavg[0]);
      1.26       cgd
     1.157      yamt 	l->l_slptime--; /* the first time was done in schedcpu */
     1.157      yamt 	/* XXX NJWLWP */
1.166.2.13        ad 	/* XXXSMP occasionally unlocked, should be per-LWP */
     1.157      yamt 	p->p_estcpu = decay_cpu_batch(loadfac, p->p_estcpu, l->l_slptime);
     1.122   thorpej 	resetpriority(l);
      1.26       cgd }
      1.26       cgd
      1.26       cgd /*
 1.166.2.2        ad  * During autoconfiguration or after a panic, a sleep will simply lower the
 1.166.2.2        ad  * priority briefly to allow interrupts, then return.  The priority to be
 1.166.2.2        ad  * used (safepri) is machine-dependent, thus this value is initialized and
 1.166.2.2        ad  * maintained in the machine-dependent layers.  This priority will typically
 1.166.2.2        ad  * be 0, or the lowest priority that is safe for use on the interrupt stack;
 1.166.2.2        ad  * it can be made higher to block network software interrupts after panics.
      1.26       cgd  */
 1.166.2.2        ad int	safepri;	/* only defined here; not assigned in this part of the file */
      1.26       cgd
      1.26       cgd /*
1.166.2.15        ad  * OBSOLETE INTERFACE
1.166.2.15        ad  *
 1.166.2.2        ad  * General sleep call.  Suspends the current process until a wakeup is
 1.166.2.2        ad  * performed on the specified identifier.  The process will then be made
 1.166.2.2        ad  * runnable with the specified priority.  Sleeps at most timo/hz seconds (0
 1.166.2.2        ad  * means no timeout).  If pri includes PCATCH flag, signals are checked
 1.166.2.2        ad  * before and after sleeping, else signals are not checked.  Returns 0 if
 1.166.2.2        ad  * awakened, EWOULDBLOCK if the timeout expires.  If PCATCH is set and a
 1.166.2.2        ad  * signal needs to be delivered, ERESTART is returned if the current system
 1.166.2.2        ad  * call should be restarted if possible, and EINTR is returned if the system
 1.166.2.2        ad  * call should be interrupted by the signal (return EINTR).
 1.166.2.2        ad  *
 1.166.2.2        ad  * The interlock is held until we are on a sleep queue. The interlock will
 1.166.2.2        ad  * be locked before returning back to the caller unless the PNORELOCK flag
 1.166.2.2        ad  * is specified, in which case the interlock will always be unlocked upon
 1.166.2.2        ad  * return.
                      *
                      * If sleepq_dontsleep() says the LWP must not sleep, no sleep takes
                      * place and 0 is returned after sleepq_abort(); the interlock is
                      * released on that path only when PNORELOCK is set.
                      *
                      * NOTE(review): that early path calls simple_unlock(interlock)
                      * without the NULL check used on the normal path - confirm that
                      * PNORELOCK is never passed together with a NULL interlock.
 1.166.2.2        ad  */
 1.166.2.1        ad int
1.166.2.15        ad ltsleep(wchan_t ident, int priority, const char *wmesg, int timo,
1.166.2.15        ad 	volatile struct simplelock *interlock)
 1.166.2.1        ad {
 1.166.2.1        ad 	struct lwp *l = curlwp;
 1.166.2.2        ad 	sleepq_t *sq;
 1.166.2.7        ad 	int error, catch;
 1.166.2.1        ad
1.166.2.15        ad 	if (sleepq_dontsleep(l)) {
1.166.2.15        ad 		(void)sleepq_abort(NULL, 0);
1.166.2.15        ad 		if ((priority & PNORELOCK) != 0)
1.166.2.15        ad 			simple_unlock(interlock);
1.166.2.15        ad 		return 0;
1.166.2.15        ad 	}
 1.166.2.1        ad
                     	/* Enter the sleep queue for ident before dropping the interlock. */
 1.166.2.4        ad 	sq = sleeptab_lookup(&sleeptab, ident);
 1.166.2.7        ad 	sleepq_enter(sq, l);
 1.166.2.1        ad
1.166.2.15        ad 	if (interlock != NULL) {
1.166.2.15        ad 		LOCK_ASSERT(simple_lock_held(interlock));
1.166.2.15        ad 		simple_unlock(interlock);
 1.166.2.1        ad 	}
 1.166.2.1        ad
                     	/* PCATCH is split off; only the PRIMASK bits are the priority. */
 1.166.2.7        ad 	catch = priority & PCATCH;
 1.166.2.7        ad 	sleepq_block(sq, priority & PRIMASK, ident, wmesg, timo, catch,
 1.166.2.7        ad 	    &sleep_syncobj);
 1.166.2.7        ad 	error = sleepq_unblock(timo, catch);
 1.166.2.1        ad
                     	/* Retake the interlock unless the caller asked not to. */
1.166.2.15        ad 	if (interlock != NULL && (priority & PNORELOCK) == 0)
1.166.2.15        ad 		simple_lock(interlock);
 1.166.2.2        ad
 1.166.2.2        ad 	return error;
 1.166.2.1        ad }
 1.166.2.1        ad
 1.166.2.7        ad /*
1.166.2.15        ad  * General sleep call for situations where a wake-up is not expected.
                      *
                      * The current LWP sleeps using its own lwp pointer as the wait
                      * identifier, at the priority given by sched_kpri(l).
                      *
                      *	wmesg	passed through to sleepq_block()
                      *	intr	passed to sleepq_block()/sleepq_unblock() in the same
                      *		position ltsleep() uses for its PCATCH flag
                      *	timo	timeout value passed to sleepq_block()/sleepq_unblock()
                      *	mtx	if not NULL, released before sleeping and re-entered
                      *		afterwards
                      *
                      * Returns the value of sleepq_unblock().  If sleepq_dontsleep()
                      * is true, returns the value of sleepq_abort() straight away and
                      * leaves mtx untouched.
 1.166.2.7        ad  */
 1.166.2.7        ad int
1.166.2.15        ad kpause(const char *wmesg, boolean_t intr, int timo, kmutex_t *mtx)
 1.166.2.7        ad {
 1.166.2.7        ad 	struct lwp *l = curlwp;
 1.166.2.7        ad 	sleepq_t *sq;
1.166.2.15        ad 	int error;
 1.166.2.7        ad
 1.166.2.7        ad 	if (sleepq_dontsleep(l))
 1.166.2.7        ad 		return sleepq_abort(NULL, 0);
 1.166.2.7        ad
1.166.2.16        ad 	if (mtx != NULL)
1.166.2.16        ad 		mutex_exit(mtx);
 1.166.2.7        ad 	sq = sleeptab_lookup(&sleeptab, l);	/* wait identifier is the LWP itself */
 1.166.2.7        ad 	sleepq_enter(sq, l);
 1.166.2.7        ad 	sleepq_block(sq, sched_kpri(l), l, wmesg, timo, intr, &sleep_syncobj);
1.166.2.15        ad 	error = sleepq_unblock(timo, intr);
1.166.2.16        ad 	if (mtx != NULL)
1.166.2.16        ad 		mutex_enter(mtx);
1.166.2.15        ad
1.166.2.15        ad 	return error;
 1.166.2.7        ad }
 1.166.2.7        ad
      1.26       cgd /*
1.166.2.15        ad  * OBSOLETE INTERFACE
1.166.2.15        ad  *
      1.26       cgd  * Make all processes sleeping on the specified identifier runnable.
                      *
                      * Does nothing while `cold' is set.  Otherwise the sleep queue for
                      * ident is looked up in sleeptab and sleepq_wake() is called with
                      * the largest possible u_int as the count.
      1.26       cgd  */
      1.26       cgd void
 1.166.2.2        ad wakeup(wchan_t ident)
      1.26       cgd {
 1.166.2.2        ad 	sleepq_t *sq;
      1.83   thorpej
 1.166.2.2        ad 	if (cold)
 1.166.2.2        ad 		return;
      1.83   thorpej
 1.166.2.4        ad 	sq = sleeptab_lookup(&sleeptab, ident);
 1.166.2.5        ad 	sleepq_wake(sq, ident, (u_int)-1);	/* count of UINT_MAX: no practical limit */
      1.63   thorpej }
      1.63   thorpej
      1.63   thorpej /*
1.166.2.15        ad  * OBSOLETE INTERFACE
1.166.2.15        ad  *
      1.63   thorpej  * Make the highest priority process first in line on the specified
      1.63   thorpej  * identifier runnable.
                      *
                      * Does nothing while `cold' is set.  Otherwise identical to wakeup()
                      * except that sleepq_wake() is called with a count of 1.
      1.63   thorpej  */
 1.166.2.2        ad void
 1.166.2.2        ad wakeup_one(wchan_t ident)
      1.63   thorpej {
 1.166.2.2        ad 	sleepq_t *sq;
      1.77   thorpej
 1.166.2.2        ad 	if (cold)
 1.166.2.2        ad 		return;
 1.166.2.2        ad
 1.166.2.4        ad 	sq = sleeptab_lookup(&sleeptab, ident);
 1.166.2.5        ad 	sleepq_wake(sq, ident, 1);	/* count of 1 */
     1.117  gmcgarry }
     1.117  gmcgarry
 1.166.2.2        ad
     1.117  gmcgarry /*
     1.117  gmcgarry  * General yield call.  Puts the current process back on its run queue and
     1.117  gmcgarry  * performs a voluntary context switch.  Should only be called when the
     1.117  gmcgarry  * current process explicitly requests it (eg sched_yield(2) in compat code).
                      *
                      * Locks curlwp; if it is LSONPROC its l_priority is reset to
                      * l_usrpri.  Counts the switch in l_nvcsw and calls mi_switch() with
                      * no preferred successor.  The LWP lock is not released here -
                      * presumably that happens on the mi_switch() path; confirm there.
     1.117  gmcgarry  */
     1.117  gmcgarry void
     1.117  gmcgarry yield(void)
     1.117  gmcgarry {
     1.122   thorpej 	struct lwp *l = curlwp;
     1.117  gmcgarry
 1.166.2.2        ad 	lwp_lock(l);
 1.166.2.2        ad 	if (l->l_stat == LSONPROC) {
 1.166.2.3        ad 		KASSERT(lwp_locked(l, &sched_mutex));
 1.166.2.2        ad 		l->l_priority = l->l_usrpri;	/* fall back to the user priority */
 1.166.2.2        ad 	}
 1.166.2.2        ad 	l->l_nvcsw++;	/* voluntary context switch count */
     1.122   thorpej 	mi_switch(l, NULL);	/* return value deliberately unused */
      1.69   thorpej }
      1.69   thorpej
      1.69   thorpej /*
      1.69   thorpej  * General preemption call.  Puts the current process back on its run queue
     1.156    rpaulo  * and performs an involuntary context switch.
                      *
                      * Same sequence as yield(), except that the switch is counted in
                      * l_nivcsw instead of l_nvcsw.  The LWP lock taken here is not
                      * released in this function - presumably that happens on the
                      * mi_switch() path; confirm there.
      1.69   thorpej  */
      1.69   thorpej void
1.166.2.17        ad preempt(void)
      1.69   thorpej {
     1.122   thorpej 	struct lwp *l = curlwp;
      1.69   thorpej
 1.166.2.2        ad 	lwp_lock(l);
 1.166.2.2        ad 	if (l->l_stat == LSONPROC) {
 1.166.2.3        ad 		KASSERT(lwp_locked(l, &sched_mutex));
 1.166.2.2        ad 		l->l_priority = l->l_usrpri;	/* fall back to the user priority */
 1.166.2.2        ad 	}
 1.166.2.2        ad 	l->l_nivcsw++;	/* involuntary context switch count */
1.166.2.17        ad 	(void)mi_switch(l, NULL);
      1.69   thorpej }
      1.69   thorpej
      1.69   thorpej /*
 1.166.2.2        ad  * The machine independent parts of context switch.  Switch to "new"
 1.166.2.2        ad  * if non-NULL, otherwise let cpu_switch choose the next lwp.
     1.130   nathanw  *
     1.122   thorpej  * Returns 1 if another process was actually run.
      1.26       cgd  */
     1.122   thorpej int
     1.122   thorpej mi_switch(struct lwp *l, struct lwp *newl)
      1.26       cgd {
      1.76   thorpej 	struct schedstate_percpu *spc;
      1.26       cgd 	struct timeval tv;
 1.166.2.8        ad #ifdef MULTIPROCESSOR
     1.144      yamt 	int hold_count;
 1.166.2.8        ad #endif
 1.166.2.2        ad 	int retval, oldspl;
 1.166.2.2        ad 	long s, u;
 1.166.2.2        ad #if PERFCTRS
     1.122   thorpej 	struct proc *p = l->l_proc;
 1.166.2.2        ad #endif
      1.26       cgd
 1.166.2.2        ad 	LOCK_ASSERT(lwp_locked(l, NULL));
      1.83   thorpej
      1.90  sommerfe 	/*
      1.90  sommerfe 	 * Release the kernel_lock, as we are about to yield the CPU.
      1.90  sommerfe 	 */
 1.166.2.8        ad 	KERNEL_UNLOCK_ALL(l, &hold_count);
      1.85  sommerfe
     1.160       chs #ifdef LOCKDEBUG
      1.82   thorpej 	spinlock_switchcheck();
      1.81   thorpej 	simple_lock_switchcheck();
      1.50      fvdl #endif
 1.166.2.2        ad #ifdef KSTACK_CHECK_MAGIC
 1.166.2.2        ad 	kstack_check_magic(l);
 1.166.2.2        ad #endif
 1.166.2.2        ad
 1.166.2.2        ad 	/*
 1.166.2.2        ad 	 * It's safe to read the per CPU schedstate unlocked here, as all we
 1.166.2.2        ad 	 * are after is the run time and that's guaranteed to have been last
 1.166.2.2        ad 	 * updated by this CPU.
 1.166.2.2        ad 	 */
 1.166.2.2        ad 	KDASSERT(l->l_cpu == curcpu());
 1.166.2.2        ad 	spc = &l->l_cpu->ci_schedstate;
      1.81   thorpej
      1.26       cgd 	/*
      1.26       cgd 	 * Compute the amount of time during which the current
     1.113  gmcgarry 	 * process was running.
      1.26       cgd 	 */
      1.26       cgd 	microtime(&tv);
 1.166.2.2        ad 	u = l->l_rtime.tv_usec +
     1.122   thorpej 	    (tv.tv_usec - spc->spc_runtime.tv_usec);
 1.166.2.2        ad 	s = l->l_rtime.tv_sec + (tv.tv_sec - spc->spc_runtime.tv_sec);
      1.26       cgd 	if (u < 0) {
      1.26       cgd 		u += 1000000;
      1.26       cgd 		s--;
      1.26       cgd 	} else if (u >= 1000000) {
      1.26       cgd 		u -= 1000000;
      1.26       cgd 		s++;
      1.26       cgd 	}
 1.166.2.2        ad 	l->l_rtime.tv_usec = u;
 1.166.2.2        ad 	l->l_rtime.tv_sec = s;
      1.26       cgd
      1.26       cgd 	/*
 1.166.2.2        ad 	 * XXXSMP If we are using h/w performance counters, save context.
      1.26       cgd 	 */
 1.166.2.2        ad #if PERFCTRS
 1.166.2.2        ad 	if (PMC_ENABLED(p)) {
 1.166.2.2        ad 		pmc_save_context(p);
      1.26       cgd 	}
 1.166.2.2        ad #endif
 1.166.2.2        ad
 1.166.2.2        ad 	/*
 1.166.2.2        ad 	 * Acquire the sched_mutex if necessary.  It will be released by
 1.166.2.2        ad 	 * cpu_switch once it has decided to idle, or picked another LWP
 1.166.2.2        ad 	 * to run.
 1.166.2.2        ad 	 */
 1.166.2.4        ad #if defined(MULTIPROCESSOR) || defined(LOCKDEBUG)
 1.166.2.4        ad 	if (l->l_mutex != &sched_mutex) {
1.166.2.14        ad 		mutex_spin_enter(&sched_mutex);
 1.166.2.4        ad 		lwp_unlock(l);
      1.26       cgd 	}
 1.166.2.3        ad #endif
 1.166.2.3        ad
 1.166.2.3        ad 	/*
 1.166.2.3        ad 	 * If on the CPU and we have gotten this far, then we must yield.
 1.166.2.3        ad 	 */
 1.166.2.3        ad 	KASSERT(l->l_stat != LSRUN);
 1.166.2.3        ad 	if (l->l_stat == LSONPROC) {
 1.166.2.4        ad 		KASSERT(lwp_locked(l, &sched_mutex));
 1.166.2.3        ad 		l->l_stat = LSRUN;
 1.166.2.3        ad 		setrunqueue(l);
 1.166.2.3        ad 	}
 1.166.2.2        ad 	uvmexp.swtch++;
      1.69   thorpej
      1.69   thorpej 	/*
      1.69   thorpej 	 * Process is about to yield the CPU; clear the appropriate
      1.69   thorpej 	 * scheduling flags.
      1.69   thorpej 	 */
      1.73   thorpej 	spc->spc_flags &= ~SPCF_SWITCHCLEAR;
     1.109      yamt
 1.166.2.2        ad 	LOCKDEBUG_BARRIER(&sched_mutex, 1);
     1.113  gmcgarry
     1.113  gmcgarry 	/*
 1.166.2.2        ad 	 * Switch to the new current LWP.  When we run again, we'll
 1.166.2.2        ad 	 * return back here.
     1.113  gmcgarry 	 */
 1.166.2.4        ad 	oldspl = MUTEX_SPIN_OLDSPL(l->l_cpu);
 1.166.2.4        ad
 1.166.2.4        ad 	if (newl == NULL || newl->l_back == NULL)
     1.122   thorpej 		retval = cpu_switch(l, NULL);
 1.166.2.2        ad 	else {
 1.166.2.4        ad 		KASSERT(lwp_locked(newl, &sched_mutex));
     1.122   thorpej 		remrunqueue(newl);
     1.122   thorpej 		cpu_switchto(l, newl);
     1.122   thorpej 		retval = 0;
     1.122   thorpej 	}
     1.110    briggs
     1.110    briggs 	/*
 1.166.2.2        ad 	 * XXXSMP If we are using h/w performance counters, restore context.
      1.26       cgd 	 */
     1.114  gmcgarry #if PERFCTRS
     1.166  christos 	if (PMC_ENABLED(p)) {
     1.114  gmcgarry 		pmc_restore_context(p);
     1.166  christos 	}
     1.114  gmcgarry #endif
     1.110    briggs
     1.110    briggs 	/*
      1.76   thorpej 	 * We're running again; record our new start time.  We might
 1.166.2.2        ad 	 * be running on a new CPU now, so don't use the cached
      1.76   thorpej 	 * schedstate_percpu pointer.
      1.76   thorpej 	 */
     1.122   thorpej 	KDASSERT(l->l_cpu == curcpu());
     1.122   thorpej 	microtime(&l->l_cpu->ci_schedstate.spc_runtime);
      1.85  sommerfe
      1.90  sommerfe 	/*
 1.166.2.4        ad 	 * Reacquire the kernel_lock.
      1.90  sommerfe 	 */
 1.166.2.2        ad 	splx(oldspl);
 1.166.2.4        ad 	KERNEL_LOCK(hold_count, l);
     1.122   thorpej
     1.122   thorpej 	return retval;
      1.26       cgd }
      1.26       cgd
      1.26       cgd /*
      1.26       cgd  * Initialize the (doubly-linked) run queues
      1.26       cgd  * to be empty.
                      *
                      * Each of the RUNQUE_NQS queue heads has its forward and reverse links
                      * pointed back at the head itself.  sched_mutex is also initialized
                      * here, as a spin mutex at IPL_SCHED.
      1.26       cgd  */
      1.26       cgd void
      1.26       cgd rqinit()
      1.26       cgd {
      1.71  augustss 	int i;
      1.26       cgd
      1.73   thorpej 	for (i = 0; i < RUNQUE_NQS; i++)
      1.73   thorpej 		sched_qs[i].ph_link = sched_qs[i].ph_rlink =
     1.122   thorpej 		    (struct lwp *)&sched_qs[i];
 1.166.2.2        ad
 1.166.2.2        ad 	mutex_init(&sched_mutex, MUTEX_SPIN, IPL_SCHED);
      1.26       cgd }
      1.26       cgd
     1.158     perry static inline void
 1.166.2.2        ad resched_lwp(struct lwp *l, u_char pri)
     1.119   thorpej {
     1.119   thorpej 	struct cpu_info *ci;
     1.119   thorpej
     1.119   thorpej 	/*
                     	 * Ask a CPU to reschedule if pri is numerically less than the
                     	 * current priority recorded in that CPU's schedstate.  The CPU
                     	 * consulted is l->l_cpu when it is set, otherwise the current CPU.
                     	 *
     1.119   thorpej 	 * XXXSMP
     1.122   thorpej 	 * Since l->l_cpu persists across a context switch,
     1.119   thorpej 	 * this gives us *very weak* processor affinity, in
     1.119   thorpej 	 * that we notify the CPU on which the process last
     1.119   thorpej 	 * ran that it should try to switch.
     1.119   thorpej 	 *
     1.119   thorpej 	 * This does not guarantee that the process will run on
     1.119   thorpej 	 * that processor next, because another processor might
     1.119   thorpej 	 * grab it the next time it performs a context switch.
     1.119   thorpej 	 *
     1.119   thorpej 	 * This also does not handle the case where its last
     1.119   thorpej 	 * CPU is running a higher-priority process, but every
     1.119   thorpej 	 * other CPU is running a lower-priority process.  There
     1.119   thorpej 	 * are ways to handle this situation, but they're not
     1.119   thorpej 	 * currently very pretty, and we also need to weigh the
     1.119   thorpej 	 * cost of moving a process from one CPU to another.
 1.166.2.7        ad 	 *
 1.166.2.7        ad 	 * XXXSMP
 1.166.2.7        ad 	 * There is also the issue of locking the other CPU's
 1.166.2.7        ad 	 * sched state, which we currently do not do.
     1.119   thorpej 	 */
     1.122   thorpej 	ci = (l->l_cpu != NULL) ? l->l_cpu : curcpu();
     1.121   thorpej 	if (pri < ci->ci_schedstate.spc_curpriority)
 1.166.2.2        ad 		cpu_need_resched(ci);
     1.119   thorpej }
     1.119   thorpej
      1.26       cgd /*
 1.166.2.2        ad  * Change process state to be runnable, placing it on the run queue if it is
 1.166.2.2        ad  * in memory, and awakening the swapper if it isn't in memory.
 1.166.2.2        ad  *
 1.166.2.2        ad  * Call with the process and LWP locked.  Will return with the LWP unlocked.
                      *
                      * The LWP must be in state LSSTOP, LSSUSPENDED or LSSLEEP; any other
                      * state causes a panic.
      1.26       cgd  */
      1.26       cgd void
     1.122   thorpej setrunnable(struct lwp *l)
      1.26       cgd {
     1.122   thorpej 	struct proc *p = l->l_proc;
      1.26       cgd
 1.166.2.2        ad 	LOCK_ASSERT(mutex_owned(&p->p_smutex));
 1.166.2.2        ad 	LOCK_ASSERT(lwp_locked(l, NULL));
      1.83   thorpej
     1.122   thorpej 	switch (l->l_stat) {
     1.122   thorpej 	case LSSTOP:
      1.33   mycroft 		/*
      1.33   mycroft 		 * If we're being traced (possibly because someone attached us
      1.33   mycroft 		 * while we were stopped), check for a signal from the debugger.
      1.33   mycroft 		 */
 1.166.2.4        ad 		if ((p->p_slflag & PSL_TRACED) != 0 && p->p_xstat != 0) {
 1.166.2.7        ad 			sigaddset(&l->l_sigpend.sp_set, p->p_xstat);
 1.166.2.2        ad 			signotify(l);
      1.53   mycroft 		}
 1.166.2.2        ad 		p->p_nrlwps++;
     1.122   thorpej 		break;
     1.122   thorpej 	case LSSUSPENDED:
 1.166.2.4        ad 		l->l_flag &= ~L_WSUSPEND;
 1.166.2.2        ad 		p->p_nrlwps++;
 1.166.2.2        ad 		break;
 1.166.2.2        ad 	case LSSLEEP:
 1.166.2.4        ad 		KASSERT(l->l_wchan != NULL);
 1.166.2.7        ad 		break;
 1.166.2.7        ad 	default:
 1.166.2.7        ad 		panic("setrunnable: lwp %p state was %d", l, l->l_stat);
 1.166.2.7        ad 	}
 1.166.2.2        ad
 1.166.2.7        ad 	/*
 1.166.2.7        ad 	 * If the LWP was sleeping interruptibly, then it's OK to start it
 1.166.2.7        ad 	 * again.  If not, mark it as still sleeping.
                     	 *
                     	 * NOTE(review): the L_SINTR path returns without an explicit
                     	 * lwp_unlock(), so lwp_unsleep() presumably releases the LWP
                     	 * lock - TODO confirm.
 1.166.2.7        ad 	 */
 1.166.2.7        ad 	if (l->l_wchan != NULL) {
 1.166.2.7        ad 		l->l_stat = LSSLEEP;
 1.166.2.7        ad 		if ((l->l_flag & L_SINTR) != 0)
 1.166.2.4        ad 			lwp_unsleep(l);
 1.166.2.7        ad 		else {
 1.166.2.4        ad 			lwp_unlock(l);
 1.166.2.4        ad #ifdef DIAGNOSTIC
 1.166.2.4        ad 			panic("setrunnable: !L_SINTR");
 1.166.2.4        ad #endif
 1.166.2.4        ad 		}
 1.166.2.2        ad 		return;
      1.26       cgd 	}
     1.139        cl
 1.166.2.7        ad 	LOCK_ASSERT(lwp_locked(l, &sched_mutex));
 1.166.2.7        ad
 1.166.2.3        ad 	/*
 1.166.2.7        ad 	 * If the LWP is still on the CPU, mark it as LSONPROC.  It may be
 1.166.2.7        ad 	 * about to call mi_switch(), in which case it will yield.
 1.166.2.7        ad 	 *
 1.166.2.7        ad 	 * XXXSMP Will need to change for preemption.
 1.166.2.3        ad 	 */
 1.166.2.7        ad #ifdef MULTIPROCESSOR
 1.166.2.7        ad 	if (l->l_cpu->ci_curlwp == l) {
 1.166.2.7        ad #else
 1.166.2.7        ad 	if (l == curlwp) {
 1.166.2.7        ad #endif
 1.166.2.3        ad 		l->l_stat = LSONPROC;
 1.166.2.3        ad 		l->l_slptime = 0;
 1.166.2.3        ad 		lwp_unlock(l);
 1.166.2.3        ad 		return;
 1.166.2.3        ad 	}
     1.122   thorpej
 1.166.2.3        ad 	/*
 1.166.2.3        ad 	 * Set the LWP runnable.  If it's swapped out, we need to wake the swapper
 1.166.2.3        ad 	 * to bring it back in.  Otherwise, enter it into a run queue.
                     	 * The priority is refreshed via updatepri() first when l_slptime
                     	 * is greater than 1.
 1.166.2.3        ad 	 */
     1.122   thorpej 	if (l->l_slptime > 1)
     1.122   thorpej 		updatepri(l);
 1.166.2.7        ad 	l->l_stat = LSRUN;
     1.122   thorpej 	l->l_slptime = 0;
 1.166.2.2        ad
 1.166.2.2        ad 	if (l->l_flag & L_INMEM) {
 1.166.2.2        ad 		setrunqueue(l);
 1.166.2.2        ad 		resched_lwp(l, l->l_priority);
 1.166.2.2        ad 		lwp_unlock(l);
 1.166.2.2        ad 	} else {
 1.166.2.2        ad 		lwp_unlock(l);
 1.166.2.2        ad 		wakeup(&proc0);
 1.166.2.2        ad 	}
      1.26       cgd }
      1.26       cgd
      1.26       cgd /*
      1.26       cgd  * Compute the priority of a process when running in user mode.
      1.26       cgd  * Arrange to reschedule if the resulting priority is better
      1.26       cgd  * than that of the current process.
                      *
                      * The LWP must be locked by the caller.  LWPs with L_SYSTEM set are
                      * left untouched.  The computed value is clamped to MAXPRI, stored in
                      * l_usrpri, and handed to lwp_changepri() only when it differs from
                      * the current l_priority.
      1.26       cgd  */
      1.26       cgd void
     1.122   thorpej resetpriority(struct lwp *l)
      1.26       cgd {
      1.71  augustss 	unsigned int newpriority;
     1.122   thorpej 	struct proc *p = l->l_proc;
      1.26       cgd
1.166.2.13        ad 	/* XXXSMP LOCK_ASSERT(mutex_owned(&p->p_stmutex)); */
 1.166.2.2        ad 	LOCK_ASSERT(lwp_locked(l, NULL));
      1.83   thorpej
1.166.2.11        ad 	if ((l->l_flag & L_SYSTEM) != 0)
1.166.2.11        ad 		return;
1.166.2.11        ad
     1.153      yamt 	newpriority = PUSER + (p->p_estcpu >> ESTCPU_SHIFT) +
1.166.2.11        ad 	    NICE_WEIGHT * (p->p_nice - NZERO);
      1.26       cgd 	newpriority = min(newpriority, MAXPRI);
     1.122   thorpej 	l->l_usrpri = newpriority;
1.166.2.12        ad 	if (l->l_priority != newpriority)
1.166.2.12        ad 		lwp_changepri(l, newpriority);
     1.122   thorpej }
     1.122   thorpej
     1.130   nathanw /*
     1.122   thorpej  * Recompute priority for all LWPs in a process.
                      *
                      * The caller must hold p->p_stmutex.  Each LWP on p->p_lwps is locked
                      * around its call to resetpriority().
     1.122   thorpej  */
     1.122   thorpej void
     1.122   thorpej resetprocpriority(struct proc *p)
     1.122   thorpej {
     1.122   thorpej 	struct lwp *l;
     1.122   thorpej
1.166.2.13        ad 	LOCK_ASSERT(mutex_owned(&p->p_stmutex));
 1.166.2.2        ad
 1.166.2.2        ad 	LIST_FOREACH(l, &p->p_lwps, l_sibling) {
 1.166.2.2        ad 		lwp_lock(l);
 1.166.2.2        ad 		resetpriority(l);
 1.166.2.2        ad 		lwp_unlock(l);
 1.166.2.2        ad 	}
      1.55      ross }
      1.55      ross
      1.55      ross /*
      1.56      ross  * We adjust the priority of the current process.  The priority of a process
     1.141       wiz  * gets worse as it accumulates CPU time.  The CPU usage estimator (p_estcpu)
      1.56      ross  * is increased here.  The formula for computing priorities (in kern_synch.c)
      1.56      ross  * will compute a different value each time p_estcpu increases. This can
      1.56      ross  * cause a switch, but unless the priority crosses a PPQ boundary the actual
     1.141       wiz  * queue will not change.  The CPU usage estimator ramps up quite quickly
      1.56      ross  * when the process is running (linearly), and decays away exponentially, at
      1.56      ross  * a rate which is proportionally slower when the system is busy.  The basic
      1.80   nathanw  * principle is that the system will 90% forget that the process used a lot
      1.56      ross  * of CPU time in 5 * loadav seconds.  This causes the system to favor
      1.56      ross  * processes which haven't run much recently, and to round-robin among other
      1.56      ross  * processes.
      1.55      ross  */
      1.55      ross
      1.55      ross void
     1.122   thorpej schedclock(struct lwp *l)
      1.55      ross {
     1.122   thorpej 	struct proc *p = l->l_proc;
 1.166.2.2        ad
                     	/*
                     	 * Bump p_estcpu by one unit (1 << ESTCPU_SHIFT), clamped by
                     	 * ESTCPULIM, and recompute this LWP's priority.  p_stmutex covers
                     	 * the estcpu update and resetpriority(); the LWP lock is kept
                     	 * until l_priority has been adjusted below.
                     	 */
1.166.2.13        ad 	mutex_enter(&p->p_stmutex);
     1.153      yamt 	p->p_estcpu = ESTCPULIM(p->p_estcpu + (1 << ESTCPU_SHIFT));
 1.166.2.2        ad 	lwp_lock(l);
 1.166.2.2        ad 	resetpriority(l);
1.166.2.13        ad 	mutex_exit(&p->p_stmutex);
                     	/*
                     	 * Non-system LWPs whose priority is numerically >= PUSER take on
                     	 * the user priority directly.
                     	 */
1.166.2.11        ad 	if ((l->l_flag & L_SYSTEM) == 0 && l->l_priority >= PUSER)
     1.122   thorpej 		l->l_priority = l->l_usrpri;
 1.166.2.2        ad 	lwp_unlock(l);
      1.26       cgd }
      1.94    bouyer
 1.166.2.2        ad /*
 1.166.2.2        ad  * suspendsched:
 1.166.2.2        ad  *
 1.166.2.4        ad  *	Convert all non-L_SYSTEM LSSLEEP or LSRUN LWPs to LSSUSPENDED.
                      *
                      *	This is done indirectly.  For every process without P_SYSTEM
                      *	set, the process is marked SSTOP and each of its LWPs other
                      *	than curlwp is flagged L_WREBOOT | L_WSUSPEND; LWPs in an
                      *	interruptible sleep are additionally passed to setrunnable().
                      *	Finally a reschedule is requested on every CPU.
 1.166.2.2        ad  */
      1.94    bouyer void
 1.166.2.4        ad suspendsched(void)
      1.94    bouyer {
 1.166.2.7        ad #ifdef MULTIPROCESSOR
 1.166.2.4        ad 	CPU_INFO_ITERATOR cii;
 1.166.2.4        ad 	struct cpu_info *ci;
 1.166.2.7        ad #endif
     1.122   thorpej 	struct lwp *l;
 1.166.2.2        ad 	struct proc *p;
      1.94    bouyer
 1.166.2.4        ad 	/*
 1.166.2.4        ad 	 * We do this by process in order not to violate the locking rules.
 1.166.2.4        ad 	 */
 1.166.2.7        ad 	mutex_enter(&proclist_mutex);
 1.166.2.4        ad 	PROCLIST_FOREACH(p, &allproc) {
 1.166.2.4        ad 		mutex_enter(&p->p_smutex);
 1.166.2.4        ad
 1.166.2.4        ad 		if ((p->p_flag & P_SYSTEM) != 0) {
 1.166.2.4        ad 			mutex_exit(&p->p_smutex);
      1.94    bouyer 			continue;
 1.166.2.4        ad 		}
     1.122   thorpej
 1.166.2.4        ad 		p->p_stat = SSTOP;
 1.166.2.2        ad
 1.166.2.4        ad 		LIST_FOREACH(l, &p->p_lwps, l_sibling) {
 1.166.2.4        ad 			if (l == curlwp)
 1.166.2.4        ad 				continue;
 1.166.2.4        ad
 1.166.2.4        ad 			lwp_lock(l);
 1.166.2.4        ad
 1.166.2.4        ad 			/*
 1.166.2.4        ad 			 * Set L_WREBOOT so that the LWP will suspend itself
 1.166.2.4        ad 			 * when it tries to return to user mode.  We want to
 1.166.2.4        ad 			 * try to get as many LWPs as possible to
 1.166.2.4        ad 			 * the user / kernel boundary, so that they will
 1.166.2.4        ad 			 * release any locks that they hold.
 1.166.2.4        ad 			 */
 1.166.2.4        ad 			l->l_flag |= (L_WREBOOT | L_WSUSPEND);
 1.166.2.4        ad
 1.166.2.4        ad 			if (l->l_stat == LSSLEEP &&
 1.166.2.4        ad 			    (l->l_flag & L_SINTR) != 0) {
 1.166.2.4        ad 				/* setrunnable() will release the lock. */
 1.166.2.4        ad 				setrunnable(l);
 1.166.2.4        ad 				continue;
 1.166.2.4        ad 			}
 1.166.2.4        ad
 1.166.2.4        ad 			lwp_unlock(l);
      1.94    bouyer 		}
 1.166.2.4        ad
 1.166.2.4        ad 		mutex_exit(&p->p_smutex);
      1.94    bouyer 	}
 1.166.2.7        ad 	mutex_exit(&proclist_mutex);
 1.166.2.4        ad
 1.166.2.4        ad 	/*
 1.166.2.4        ad 	 * Kick all CPUs to make them preempt any LWPs running in user mode.
 1.166.2.4        ad 	 * They'll trap into the kernel and suspend themselves in userret().
 1.166.2.4        ad 	 */
 1.166.2.4        ad 	sched_lock(0);
 1.166.2.7        ad #ifdef MULTIPROCESSOR
 1.166.2.4        ad 	for (CPU_INFO_FOREACH(cii, ci))
 1.166.2.4        ad 		cpu_need_resched(ci);
 1.166.2.7        ad #else
 1.166.2.7        ad 	cpu_need_resched(curcpu());
 1.166.2.7        ad #endif
 1.166.2.4        ad 	sched_unlock(0);
      1.94    bouyer }
     1.113  gmcgarry
     1.113  gmcgarry /*
     1.151      yamt  * scheduler_fork_hook:
     1.151      yamt  *
     1.151      yamt  *	Inherit the parent's scheduler history.
                      *
                      *	The caller must hold parent->p_smutex.  The child's p_estcpu
                      *	and p_estcpu_inherited both start at the parent's p_estcpu,
                      *	and p_forktime is set to the current schedcpu_ticks.
     1.151      yamt  */
     1.151      yamt void
     1.151      yamt scheduler_fork_hook(struct proc *parent, struct proc *child)
     1.151      yamt {
     1.151      yamt
 1.166.2.4        ad 	LOCK_ASSERT(mutex_owned(&parent->p_smutex));
 1.166.2.4        ad
     1.157      yamt 	child->p_estcpu = child->p_estcpu_inherited = parent->p_estcpu;
     1.157      yamt 	child->p_forktime = schedcpu_ticks;
     1.151      yamt }
     1.151      yamt
     1.151      yamt /*
     1.151      yamt  * scheduler_wait_hook:
     1.151      yamt  *
     1.151      yamt  *	Chargeback parents for the sins of their children.
                      *
                      *	The estcpu the child inherited at fork is passed through
                      *	decay_cpu_batch() for the schedcpu ticks elapsed since the
                      *	fork.  If the child's p_estcpu exceeds that result, the
                      *	difference is added to the parent's p_estcpu (clamped by
                      *	ESTCPULIM).  parent->p_stmutex is held around the update.
     1.151      yamt  */
     1.151      yamt void
     1.151      yamt scheduler_wait_hook(struct proc *parent, struct proc *child)
     1.151      yamt {
     1.157      yamt 	fixpt_t loadfac = loadfactor(averunnable.ldavg[0]);
     1.157      yamt 	fixpt_t estcpu;
     1.151      yamt
     1.151      yamt 	/* XXX Only if parent != init?? */
     1.157      yamt
1.166.2.13        ad 	mutex_enter(&parent->p_stmutex);
     1.157      yamt 	estcpu = decay_cpu_batch(loadfac, child->p_estcpu_inherited,
     1.157      yamt 	    schedcpu_ticks - child->p_forktime);
 1.166.2.2        ad 	if (child->p_estcpu > estcpu)
     1.157      yamt 		parent->p_estcpu =
     1.157      yamt 		    ESTCPULIM(parent->p_estcpu + child->p_estcpu - estcpu);
1.166.2.13        ad 	mutex_exit(&parent->p_stmutex);
     1.151      yamt }
     1.151      yamt
     1.151      yamt /*
 1.166.2.4        ad  * sched_kpri:
 1.166.2.4        ad  *
1.166.2.11        ad  *	Scale a priority level to a kernel priority level, usually
1.166.2.11        ad  *	for an LWP that is about to sleep.
                      *
                      *	The lookup table has 128 entries: priorities 0-49 map to
                      *	themselves and priorities 50-127 are compressed into 8-49.
                      *
                      *	NOTE(review): l_priority indexes the table without a range
                      *	check, so it is assumed to stay below 128 - confirm.
 1.166.2.1        ad  */
 1.166.2.1        ad int
 1.166.2.1        ad sched_kpri(struct lwp *l)
 1.166.2.1        ad {
 1.166.2.6        ad 	static const uint8_t kpri_tab[] = {
1.166.2.11        ad 		 0,   1,   2,   3,   4,   5,   6,   7,
1.166.2.11        ad 		 8,   9,  10,  11,  12,  13,  14,  15,
1.166.2.11        ad 		16,  17,  18,  19,  20,  21,  22,  23,
1.166.2.11        ad 		24,  25,  26,  27,  28,  29,  30,  31,
1.166.2.11        ad 		32,  33,  34,  35,  36,  37,  38,  39,
1.166.2.11        ad 		40,  41,  42,  43,  44,  45,  46,  47,
1.166.2.11        ad 		48,  49,   8,   8,   9,   9,  10,  10,
1.166.2.11        ad 		11,  11,  12,  12,  13,  14,  14,  15,
1.166.2.11        ad 		15,  16,  16,  17,  17,  18,  18,  19,
1.166.2.11        ad 		20,  20,  21,  21,  22,  22,  23,  23,
1.166.2.11        ad 		24,  24,  25,  26,  26,  27,  27,  28,
1.166.2.11        ad 		28,  29,  29,  30,  30,  31,  32,  32,
1.166.2.11        ad 		33,  33,  34,  34,  35,  35,  36,  36,
1.166.2.11        ad 		37,  38,  38,  39,  39,  40,  40,  41,
1.166.2.11        ad 		41,  42,  42,  43,  44,  44,  45,  45,
1.166.2.12        ad 		46,  46,  47,  47,  48,  48,  49,  49,
 1.166.2.4        ad 	};
 1.166.2.4        ad
 1.166.2.4        ad 	return kpri_tab[l->l_priority];
 1.166.2.1        ad }
 1.166.2.1        ad
 1.166.2.4        ad /*
 1.166.2.4        ad  * sched_unsleep:
 1.166.2.4        ad  *
 1.166.2.4        ad  *	This is called when the LWP has not been awoken normally but instead
 1.166.2.4        ad  *	interrupted: for example, if the sleep timed out.  Because of this,
 1.166.2.4        ad  *	it's not a valid action for running or idle LWPs.
                      *
                      *	This implementation therefore unlocks the LWP and panics
                      *	unconditionally.
 1.166.2.4        ad  */
 1.166.2.3        ad void
 1.166.2.4        ad sched_unsleep(struct lwp *l)
 1.166.2.3        ad {
 1.166.2.3        ad
 1.166.2.4        ad 	lwp_unlock(l);
 1.166.2.4        ad 	panic("sched_unsleep");
 1.166.2.3        ad }
 1.166.2.3        ad
 1.166.2.4        ad /*
 1.166.2.4        ad  * sched_changepri:
 1.166.2.4        ad  *
 1.166.2.4        ad  *	Adjust the priority of an LWP.
                      *
                      *	The LWP must be locked via sched_mutex.  If the LWP is not
                      *	LSRUN, is not in memory (L_INMEM clear), or the new priority
                      *	selects the same run queue (same value of priority / PPQ),
                      *	only l_priority is updated.  Otherwise the LWP is moved to
                      *	the run queue for its new priority and resched_lwp() is
                      *	called with that priority.
 1.166.2.4        ad  */
 1.166.2.3        ad void
 1.166.2.4        ad sched_changepri(struct lwp *l, int pri)
 1.166.2.3        ad {
 1.166.2.3        ad
 1.166.2.7        ad 	LOCK_ASSERT(lwp_locked(l, &sched_mutex));
 1.166.2.7        ad
 1.166.2.7        ad 	if (l->l_stat != LSRUN || (l->l_flag & L_INMEM) == 0 ||
1.166.2.12        ad 	    (l->l_priority / PPQ) == (pri / PPQ)) {
 1.166.2.4        ad 		l->l_priority = pri;
 1.166.2.4        ad 		return;
 1.166.2.4        ad 	}
 1.166.2.4        ad
 1.166.2.4        ad 	remrunqueue(l);
 1.166.2.4        ad 	l->l_priority = pri;
 1.166.2.4        ad 	setrunqueue(l);
 1.166.2.7        ad 	resched_lwp(l, pri);
 1.166.2.3        ad }
 1.166.2.3        ad
 1.166.2.1        ad /*
     1.113  gmcgarry  * Low-level routines to access the run queue.  Optimised assembler
     1.113  gmcgarry  * routines can override these.
     1.113  gmcgarry  */
     1.113  gmcgarry
     1.113  gmcgarry #ifndef __HAVE_MD_RUNQUEUE
     1.115  nisimura
     1.130   nathanw /*
     1.134      matt  * On some architectures, it's faster to use a MSB ordering for the priorities
     1.134      matt  * than the traditional LSB ordering.
                      *
                      * RQMASK(n) yields the sched_whichqs bit for run queue n.
                      *
                      * NOTE(review): in the LSB variant the literal 0x00000001 has type int,
                      * so RQMASK(31) shifts into the sign bit; an unsigned literal would
                      * avoid that - confirm whether queue 31 is reachable here.
     1.134      matt  */
     1.134      matt #ifdef __HAVE_BIGENDIAN_BITOPS
     1.134      matt #define	RQMASK(n) (0x80000000 >> (n))
     1.134      matt #else
     1.134      matt #define	RQMASK(n) (0x00000001 << (n))
     1.134      matt #endif
     1.134      matt
     1.134      matt /*
     1.115  nisimura  * The primitives that manipulate the run queues.  whichqs tells which
     1.115  nisimura  * of the 32 queues qs have processes in them.  Setrunqueue puts processes
     1.115  nisimura  * into queues, remrunqueue removes them from queues.  The running process is
     1.115  nisimura  * on no queue, other processes are on a queue related to l->l_priority,
     1.115  nisimura  * divided by 4 actually to shrink the 0-127 range of priorities into the 32
     1.115  nisimura  * available queues.
     1.130   nathanw  */
     1.146      matt #ifdef RQDEBUG
                     /*
                      * checkrunqueue:
                      *
                      *	Consistency check of run queue whichq, compiled only under
                      *	RQDEBUG.  Walks the queue and reports entries that are not
                      *	LSRUN (report only), broken forward/back links, a
                      *	sched_whichqs bit that disagrees with whether the queue is
                      *	empty, and - when l is non-NULL - l not being on the queue.
                      *	Any problem other than the state mismatch ends in a panic.
                      */
     1.146      matt static void
     1.146      matt checkrunqueue(int whichq, struct lwp *l)
     1.146      matt {
     1.146      matt 	const struct prochd * const rq = &sched_qs[whichq];
     1.146      matt 	struct lwp *l2;
     1.146      matt 	int found = 0;
     1.146      matt 	int die = 0;
     1.146      matt 	int empty = 1;
     1.164  christos 	for (l2 = rq->ph_link; l2 != (const void*) rq; l2 = l2->l_forw) {
     1.146      matt 		if (l2->l_stat != LSRUN) {
     1.146      matt 			printf("checkrunqueue[%d]: lwp %p state (%d) "
     1.146      matt 			    " != LSRUN\n", whichq, l2, l2->l_stat);
     1.146      matt 		}
     1.146      matt 		if (l2->l_back->l_forw != l2) {
     1.146      matt 			printf("checkrunqueue[%d]: lwp %p back-qptr (%p) "
     1.146      matt 			    "corrupt %p\n", whichq, l2, l2->l_back,
     1.146      matt 			    l2->l_back->l_forw);
     1.146      matt 			die = 1;
     1.146      matt 		}
     1.146      matt 		if (l2->l_forw->l_back != l2) {
     1.146      matt 			printf("checkrunqueue[%d]: lwp %p forw-qptr (%p) "
     1.146      matt 			    "corrupt %p\n", whichq, l2, l2->l_forw,
     1.146      matt 			    l2->l_forw->l_back);
     1.146      matt 			die = 1;
     1.146      matt 		}
     1.146      matt 		if (l2 == l)
     1.146      matt 			found = 1;
     1.146      matt 		empty = 0;
     1.146      matt 	}
     1.146      matt 	if (empty && (sched_whichqs & RQMASK(whichq)) != 0) {
     1.146      matt 		printf("checkrunqueue[%d]: bit set for empty run-queue %p\n",
     1.146      matt 		    whichq, rq);
     1.146      matt 		die = 1;
     1.146      matt 	} else if (!empty && (sched_whichqs & RQMASK(whichq)) == 0) {
     1.146      matt 		printf("checkrunqueue[%d]: bit clear for non-empty "
     1.146      matt 		    "run-queue %p\n", whichq, rq);
     1.146      matt 		die = 1;
     1.146      matt 	}
     1.146      matt 	if (l != NULL && (sched_whichqs & RQMASK(whichq)) == 0) {
     1.146      matt 		printf("checkrunqueue[%d]: bit clear for active lwp %p\n",
     1.146      matt 		    whichq, l);
     1.146      matt 		die = 1;
     1.146      matt 	}
     1.146      matt 	if (l != NULL && empty) {
     1.146      matt 		printf("checkrunqueue[%d]: empty run-queue %p with "
     1.146      matt 		    "active lwp %p\n", whichq, rq, l);
     1.146      matt 		die = 1;
     1.146      matt 	}
     1.146      matt 	if (l != NULL && !found) {
     1.146      matt 		printf("checkrunqueue[%d]: lwp %p not in runqueue %p!",
     1.146      matt 		    whichq, l, rq);
     1.146      matt 		die = 1;
     1.146      matt 	}
     1.146      matt 	if (die)
     1.146      matt 		panic("checkrunqueue: inconsistency found");
     1.146      matt }
     1.146      matt #endif /* RQDEBUG */
     1.146      matt
     1.113  gmcgarry void
     1.122   thorpej setrunqueue(struct lwp *l)
     1.113  gmcgarry {
     1.113  gmcgarry 	struct prochd *rq;
     1.122   thorpej 	struct lwp *prev;
     1.152      yamt 	const int whichq = l->l_priority / PPQ;
     1.113  gmcgarry
                     	/*
                     	 * Append l to the tail of the run queue selected by
                     	 * l_priority / PPQ and set that queue's bit in sched_whichqs.
                     	 * The LWP must be locked via sched_mutex.  Under DIAGNOSTIC, an
                     	 * LWP that already has a back link or is not LSRUN causes a panic.
                     	 */
 1.166.2.3        ad 	LOCK_ASSERT(lwp_locked(l, &sched_mutex));
 1.166.2.2        ad
     1.146      matt #ifdef RQDEBUG
     1.146      matt 	checkrunqueue(whichq, NULL);
     1.146      matt #endif
     1.113  gmcgarry #ifdef DIAGNOSTIC
 1.166.2.2        ad 	if (l->l_back != NULL || l->l_stat != LSRUN)
     1.113  gmcgarry 		panic("setrunqueue");
     1.113  gmcgarry #endif
     1.134      matt 	sched_whichqs |= RQMASK(whichq);
     1.113  gmcgarry 	rq = &sched_qs[whichq];
     1.113  gmcgarry 	prev = rq->ph_rlink;
     1.122   thorpej 	l->l_forw = (struct lwp *)rq;
     1.122   thorpej 	rq->ph_rlink = l;
     1.122   thorpej 	prev->l_forw = l;
     1.122   thorpej 	l->l_back = prev;
     1.146      matt #ifdef RQDEBUG
     1.146      matt 	checkrunqueue(whichq, l);
     1.146      matt #endif
     1.113  gmcgarry }
     1.113  gmcgarry
     1.113  gmcgarry void
     1.122   thorpej remrunqueue(struct lwp *l)
     1.113  gmcgarry {
     1.122   thorpej 	struct lwp *prev, *next;
     1.152      yamt 	const int whichq = l->l_priority / PPQ;
 1.166.2.2        ad
                     	/*
                     	 * Unlink l from the run queue selected by l_priority / PPQ and
                     	 * clear its l_back.  The LWP must be locked via sched_mutex.
                     	 * Under DIAGNOSTIC, a clear sched_whichqs bit or a NULL l_back
                     	 * causes a panic.
                     	 */
 1.166.2.2        ad 	LOCK_ASSERT(lwp_locked(l, &sched_mutex));
 1.166.2.2        ad
     1.146      matt #ifdef RQDEBUG
     1.146      matt 	checkrunqueue(whichq, l);
     1.146      matt #endif
 1.166.2.2        ad
 1.166.2.2        ad #if defined(DIAGNOSTIC)
 1.166.2.2        ad 	if (((sched_whichqs & RQMASK(whichq)) == 0) || l->l_back == NULL) {
 1.166.2.2        ad 		/* Shouldn't happen - interrupts disabled. */
     1.146      matt 		panic("remrunqueue: bit %d not set", whichq);
 1.166.2.2        ad 	}
     1.113  gmcgarry #endif
     1.122   thorpej 	prev = l->l_back;
     1.122   thorpej 	l->l_back = NULL;
     1.122   thorpej 	next = l->l_forw;
     1.122   thorpej 	prev->l_forw = next;
     1.122   thorpej 	next->l_back = prev;
                     	/*
                     	 * Both neighbours being the same node means only the queue head
                     	 * is left, so the queue is now empty: clear its bit.
                     	 */
     1.113  gmcgarry 	if (prev == next)
     1.134      matt 		sched_whichqs &= ~RQMASK(whichq);
     1.146      matt #ifdef RQDEBUG
     1.146      matt 	checkrunqueue(whichq, NULL);
     1.146      matt #endif
     1.113  gmcgarry }
     1.113  gmcgarry
     1.134      matt #undef RQMASK
     1.134      matt #endif /* !defined(__HAVE_MD_RUNQUEUE) */