sys/kern/kern_synch.c

1.166.2.6        ad /*	$NetBSD: kern_synch.c,v 1.166.2.6 2006/11/18 21:39:22 ad Exp $	*/
     1.63   thorpej
     1.63   thorpej /*-
1.166.2.2        ad  * Copyright (c) 1999, 2000, 2004, 2006 The NetBSD Foundation, Inc.
     1.63   thorpej  * All rights reserved.
     1.63   thorpej  *
     1.63   thorpej  * This code is derived from software contributed to The NetBSD Foundation
     1.63   thorpej  * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
1.166.2.2        ad  * NASA Ames Research Center, by Charles M. Hannum, and by Andrew Doran.
     1.63   thorpej  *
     1.63   thorpej  * Redistribution and use in source and binary forms, with or without
     1.63   thorpej  * modification, are permitted provided that the following conditions
     1.63   thorpej  * are met:
     1.63   thorpej  * 1. Redistributions of source code must retain the above copyright
     1.63   thorpej  *    notice, this list of conditions and the following disclaimer.
     1.63   thorpej  * 2. Redistributions in binary form must reproduce the above copyright
     1.63   thorpej  *    notice, this list of conditions and the following disclaimer in the
     1.63   thorpej  *    documentation and/or other materials provided with the distribution.
     1.63   thorpej  * 3. All advertising materials mentioning features or use of this software
     1.63   thorpej  *    must display the following acknowledgement:
     1.63   thorpej  *	This product includes software developed by the NetBSD
     1.63   thorpej  *	Foundation, Inc. and its contributors.
     1.63   thorpej  * 4. Neither the name of The NetBSD Foundation nor the names of its
     1.63   thorpej  *    contributors may be used to endorse or promote products derived
     1.63   thorpej  *    from this software without specific prior written permission.
     1.63   thorpej  *
     1.63   thorpej  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
     1.63   thorpej  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
     1.63   thorpej  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     1.63   thorpej  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
     1.63   thorpej  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
     1.63   thorpej  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
     1.63   thorpej  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
     1.63   thorpej  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
     1.63   thorpej  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
     1.63   thorpej  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
     1.63   thorpej  * POSSIBILITY OF SUCH DAMAGE.
     1.63   thorpej  */
     1.26       cgd
     1.26       cgd /*-
     1.26       cgd  * Copyright (c) 1982, 1986, 1990, 1991, 1993
     1.26       cgd  *	The Regents of the University of California.  All rights reserved.
     1.26       cgd  * (c) UNIX System Laboratories, Inc.
     1.26       cgd  * All or some portions of this file are derived from material licensed
     1.26       cgd  * to the University of California by American Telephone and Telegraph
     1.26       cgd  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
     1.26       cgd  * the permission of UNIX System Laboratories, Inc.
     1.26       cgd  *
     1.26       cgd  * Redistribution and use in source and binary forms, with or without
     1.26       cgd  * modification, are permitted provided that the following conditions
     1.26       cgd  * are met:
     1.26       cgd  * 1. Redistributions of source code must retain the above copyright
     1.26       cgd  *    notice, this list of conditions and the following disclaimer.
     1.26       cgd  * 2. Redistributions in binary form must reproduce the above copyright
     1.26       cgd  *    notice, this list of conditions and the following disclaimer in the
     1.26       cgd  *    documentation and/or other materials provided with the distribution.
    1.136       agc  * 3. Neither the name of the University nor the names of its contributors
     1.26       cgd  *    may be used to endorse or promote products derived from this software
     1.26       cgd  *    without specific prior written permission.
     1.26       cgd  *
     1.26       cgd  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
     1.26       cgd  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     1.26       cgd  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     1.26       cgd  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
     1.26       cgd  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
     1.26       cgd  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
     1.26       cgd  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
     1.26       cgd  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
     1.26       cgd  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
     1.26       cgd  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
     1.26       cgd  * SUCH DAMAGE.
     1.26       cgd  *
     1.50      fvdl  *	@(#)kern_synch.c	8.9 (Berkeley) 5/19/95
     1.26       cgd  */
    1.106     lukem
    1.106     lukem #include <sys/cdefs.h>
1.166.2.6        ad __KERNEL_RCSID(0, "$NetBSD: kern_synch.c,v 1.166.2.6 2006/11/18 21:39:22 ad Exp $");
     1.48       mrg
     1.52  jonathan #include "opt_ddb.h"
    1.109      yamt #include "opt_kstack.h"
     1.82   thorpej #include "opt_lockdebug.h"
     1.83   thorpej #include "opt_multiprocessor.h"
    1.110    briggs #include "opt_perfctrs.h"
     1.26       cgd
1.166.2.2        ad #define	__MUTEX_PRIVATE
1.166.2.2        ad
     1.26       cgd #include <sys/param.h>
     1.26       cgd #include <sys/systm.h>
     1.68   thorpej #include <sys/callout.h>
     1.26       cgd #include <sys/proc.h>
     1.26       cgd #include <sys/kernel.h>
     1.26       cgd #include <sys/buf.h>
    1.111    briggs #if defined(PERFCTRS)
    1.110    briggs #include <sys/pmc.h>
    1.111    briggs #endif
     1.26       cgd #include <sys/signalvar.h>
     1.26       cgd #include <sys/resourcevar.h>
     1.55      ross #include <sys/sched.h>
    1.122   thorpej #include <sys/sa.h>
    1.122   thorpej #include <sys/savar.h>
    1.161      elad #include <sys/kauth.h>
1.166.2.2        ad #include <sys/sleepq.h>
1.166.2.2        ad #include <sys/lockdebug.h>
     1.47       mrg
     1.47       mrg #include <uvm/uvm_extern.h>
     1.47       mrg
     1.26       cgd #include <machine/cpu.h>
     1.34  christos
     1.26       cgd int	lbolt;			/* sleep address; schedcpu() calls wakeup() on it every run */
     1.88  sommerfe int	rrticks;		/* hardclock ticks per roundrobin(); copied to spc_rrticks there */
     1.26       cgd
     1.73   thorpej /*
     1.73   thorpej  * The global scheduler state.
     1.73   thorpej  */
1.166.2.4        ad kmutex_t	sched_mutex;		/* global sched state mutex */
1.166.2.2        ad struct prochd	sched_qs[RUNQUE_NQS];	/* run queues */
    1.159     perry volatile uint32_t sched_whichqs;	/* bitmap of non-empty queues */
     1.34  christos
                    /* Forward declarations for routines referenced before their definition. */
1.166.2.2        ad void	schedcpu(void *);
1.166.2.2        ad void	updatepri(struct lwp *);
1.166.2.2        ad void	sa_awaken(struct lwp *);
     1.63   thorpej
                    /* Callbacks installed in sched_syncobj below. */
1.166.2.4        ad void	sched_unsleep(struct lwp *);
1.166.2.4        ad void	sched_changepri(struct lwp *, int);
1.166.2.4        ad
                    /* Callout that runs schedcpu(); schedcpu() re-arms it for hz ticks later. */
    1.143      yamt struct callout schedcpu_ch = CALLOUT_INITIALIZER_SETFUNC(schedcpu, NULL);
                    /* Number of times schedcpu() has run (incremented at its top). */
    1.157      yamt static unsigned int schedcpu_ticks;
    1.122   thorpej
                    /*
                     * Sync object handed to sleepq_enter() by ltsleep() and mtsleep():
                     * a sorted sleep queue using the generic sleepq callbacks.
                     */
1.166.2.4        ad syncobj_t sleep_syncobj = {
1.166.2.4        ad 	SOBJ_SLEEPQ_SORTED,
1.166.2.4        ad 	sleepq_unsleep,
1.166.2.4        ad 	sleepq_changepri
1.166.2.4        ad };
1.166.2.4        ad
                    /*
                     * Sync object that routes unsleep/changepri requests to the scheduler's
                     * own sched_unsleep() and sched_changepri().
                     * NOTE(review): its users are not visible in this part of the file.
                     */
1.166.2.4        ad syncobj_t sched_syncobj = {
1.166.2.4        ad 	SOBJ_SLEEPQ_SORTED,
1.166.2.4        ad 	sched_unsleep,
1.166.2.4        ad 	sched_changepri
1.166.2.4        ad };
1.166.2.4        ad
     1.26       cgd /*
     1.26       cgd  * Force switch among equal priority processes every 100ms.
     1.88  sommerfe  * Called from hardclock every hz/10 == rrticks hardclock ticks.
                     *
                     * Reloads the per-CPU countdown (spc_rrticks) from rrticks.  If an LWP
                     * is current, the first call sets SPCF_SEENRR; a later call that finds
                     * SPCF_SEENRR still set also sets SPCF_SHOULDYIELD.  A reschedule is
                     * then requested unconditionally.
     1.26       cgd  */
     1.26       cgd /* ARGSUSED */
     1.26       cgd void
     1.89  sommerfe roundrobin(struct cpu_info *ci)
     1.26       cgd {
     1.89  sommerfe 	struct schedstate_percpu *spc = &ci->ci_schedstate;
     1.26       cgd
     1.88  sommerfe 	spc->spc_rrticks = rrticks;
    1.130   nathanw
    1.122   thorpej 	if (curlwp != NULL) {
     1.73   thorpej 		if (spc->spc_flags & SPCF_SEENRR) {
     1.69   thorpej 			/*
     1.69   thorpej 			 * The process has already been through a roundrobin
     1.69   thorpej 			 * without switching and may be hogging the CPU.
     1.69   thorpej 			 * Indicate that the process should yield.
     1.69   thorpej 			 */
     1.73   thorpej 			spc->spc_flags |= SPCF_SHOULDYIELD;
     1.69   thorpej 		} else
     1.73   thorpej 			spc->spc_flags |= SPCF_SEENRR;
     1.69   thorpej 	}
                    	/*
                    	 * NOTE(review): the request targets curcpu() rather than the ci
                    	 * argument; presumably callers always pass the current CPU -- confirm.
                    	 */
1.166.2.2        ad 	cpu_need_resched(curcpu());
     1.26       cgd }
     1.26       cgd
    1.153      yamt #define	PPQ	(128 / RUNQUE_NQS)	/* priorities per queue */
    1.153      yamt #define	NICE_WEIGHT 2			/* priorities per nice level */
    1.153      yamt
                    /*
                     * Limits for p_estcpu.  ESTCPU_MAX is (NICE_WEIGHT * PRIO_MAX - PPQ)
                     * shifted left by ESTCPU_SHIFT; ESTCPULIM() clamps a value to it.
                     * NOTE(review): ESTCPU_SHIFT presumably is the number of fractional
                     * bits carried in p_estcpu -- confirm against resetpriority().
                     */
    1.153      yamt #define	ESTCPU_SHIFT	11
    1.153      yamt #define	ESTCPU_MAX	((NICE_WEIGHT * PRIO_MAX - PPQ) << ESTCPU_SHIFT)
    1.153      yamt #define	ESTCPULIM(e)	min((e), ESTCPU_MAX)
    1.153      yamt
     1.26       cgd /*
     1.26       cgd  * Constants for digital decay and forget:
     1.26       cgd  *	90% of (p_estcpu) usage in 5 * loadav time
     1.26       cgd  *	95% of (p_pctcpu) usage in 60 seconds (load insensitive)
     1.26       cgd  *          Note that, as ps(1) mentions, this can let percentages
     1.26       cgd  *          total over 100% (I've seen 137.9% for 3 processes).
     1.26       cgd  *
     1.26       cgd  * Note that hardclock updates p_estcpu and p_cpticks independently.
     1.26       cgd  *
     1.26       cgd  * We wish to decay away 90% of p_estcpu in (5 * loadavg) seconds.
     1.26       cgd  * That is, the system wants to compute a value of decay such
     1.26       cgd  * that the following for loop:
     1.26       cgd  * 	for (i = 0; i < (5 * loadavg); i++)
     1.26       cgd  * 		p_estcpu *= decay;
     1.26       cgd  * will compute
     1.26       cgd  * 	p_estcpu *= 0.1;
     1.26       cgd  * for all values of loadavg:
     1.26       cgd  *
     1.26       cgd  * Mathematically this loop can be expressed by saying:
     1.26       cgd  * 	decay ** (5 * loadavg) ~= .1
     1.26       cgd  *
     1.26       cgd  * The system computes decay as:
     1.26       cgd  * 	decay = (2 * loadavg) / (2 * loadavg + 1)
     1.26       cgd  *
     1.26       cgd  * We wish to prove that the system's computation of decay
     1.26       cgd  * will always fulfill the equation:
     1.26       cgd  * 	decay ** (5 * loadavg) ~= .1
     1.26       cgd  *
     1.26       cgd  * If we compute b as:
     1.26       cgd  * 	b = 2 * loadavg
     1.26       cgd  * then
     1.26       cgd  * 	decay = b / (b + 1)
     1.26       cgd  *
     1.26       cgd  * We now need to prove two things:
     1.26       cgd  *	1) Given factor ** (5 * loadavg) ~= .1, prove factor == b/(b+1)
     1.26       cgd  *	2) Given b/(b+1) ** power ~= .1, prove power == (5 * loadavg)
    1.130   nathanw  *
     1.26       cgd  * Facts:
     1.26       cgd  *         For x close to zero, exp(x) =~ 1 + x, since
     1.26       cgd  *              exp(x) = 0! + x**1/1! + x**2/2! + ... .
     1.26       cgd  *              therefore exp(-1/b) =~ 1 - (1/b) = (b-1)/b.
     1.26       cgd  *         For x close to zero, ln(1+x) =~ x, since
     1.26       cgd  *              ln(1+x) = x - x**2/2 + x**3/3 - ...     -1 < x < 1
     1.26       cgd  *              therefore ln(b/(b+1)) = ln(1 - 1/(b+1)) =~ -1/(b+1).
     1.26       cgd  *         ln(.1) =~ -2.30
     1.26       cgd  *
     1.26       cgd  * Proof of (1):
     1.26       cgd  *    Solve (factor)**(power) =~ .1 given power (5*loadav):
     1.26       cgd  *	solving for factor,
     1.26       cgd  *      ln(factor) =~ -2.30/(5*loadav), or
     1.26       cgd  *      factor =~ exp(-1/((5/2.30)*loadav)) =~ exp(-1/(2*loadav)) =
     1.26       cgd  *          exp(-1/b) =~ (b-1)/b =~ b/(b+1).                    QED
     1.26       cgd  *
     1.26       cgd  * Proof of (2):
     1.26       cgd  *    Solve (factor)**(power) =~ .1 given factor == (b/(b+1)):
     1.26       cgd  *	solving for power,
     1.26       cgd  *      power*ln(b/(b+1)) =~ -2.30, or
     1.26       cgd  *      power =~ 2.3 * (b + 1) = 4.6*loadav + 2.3 =~ 5*loadav.  QED
     1.26       cgd  *
     1.26       cgd  * Actual power values for the implemented algorithm are as follows:
     1.26       cgd  *      loadav: 1       2       3       4
     1.26       cgd  *      power:  5.68    10.32   14.94   19.55
     1.26       cgd  */
     1.26       cgd
     1.26       cgd /*
                     * calculations for digital decay to forget 90% of usage in 5*loadav sec.
                     * loadfactor() is the `b = 2 * loadavg' term of the derivation above; the
                     * result is passed as `loadfac' to decay_cpu() and decay_cpu_batch().
                     */
     1.26       cgd #define	loadfactor(loadav)	(2 * (loadav))
    1.153      yamt
                    /*
                     * Apply one decay step to an estcpu value:
                     *
                     *	estcpu * loadfac / (loadfac + FSCALE)
                     *
                     * Returns 0 immediately when estcpu is already 0.  On non-_LP64 builds
                     * the product is computed in fixpt_t width when loadfac is small enough
                     * (loadfac <= FIXPT_MAX / ESTCPU_MAX); otherwise, and always on _LP64,
                     * it is computed in 64 bits.
                     *
                     * NOTE(review): the narrow path bounds the product using ESTCPU_MAX, not
                     * the actual estcpu argument, so it relies on callers passing
                     * estcpu <= ESTCPU_MAX -- confirm.
                     */
    1.153      yamt static fixpt_t
    1.153      yamt decay_cpu(fixpt_t loadfac, fixpt_t estcpu)
    1.153      yamt {
    1.153      yamt
    1.153      yamt 	if (estcpu == 0) {
    1.153      yamt 		return 0;
    1.153      yamt 	}
    1.153      yamt
    1.153      yamt #if !defined(_LP64)
    1.153      yamt 	/* avoid 64bit arithmetics. */
    1.153      yamt #define	FIXPT_MAX ((fixpt_t)((UINTMAX_C(1) << sizeof(fixpt_t) * CHAR_BIT) - 1))
    1.153      yamt 	if (__predict_true(loadfac <= FIXPT_MAX / ESTCPU_MAX)) {
    1.153      yamt 		return estcpu * loadfac / (loadfac + FSCALE);
    1.153      yamt 	}
    1.153      yamt #endif /* !defined(_LP64) */
    1.153      yamt
                    	/* General case: widen to 64 bits so the product cannot overflow. */
    1.153      yamt 	return (uint64_t)estcpu * loadfac / (loadfac + FSCALE);
    1.153      yamt }
     1.26       cgd
    1.157      yamt /*
                     * Apply several decay_cpu() steps at once; updatepri() passes the number
                     * of seconds slept (l_slptime) as n.
                     *
    1.157      yamt  * For all load averages >= 1 and max p_estcpu of (255 << ESTCPU_SHIFT),
    1.157      yamt  * sleeping for at least seven times the loadfactor will decay p_estcpu to
    1.157      yamt  * less than (1 << ESTCPU_SHIFT).
    1.157      yamt  *
    1.157      yamt  * note that our ESTCPU_MAX is actually much smaller than (255 << ESTCPU_SHIFT).
                     *
                     * Hence 0 is returned outright when (n << FSHIFT) >= 7 * loadfac.
                     * Otherwise decay_cpu() is applied while estcpu is non-zero and n > 1,
                     * i.e. at most n - 1 times.
    1.157      yamt  */
    1.157      yamt static fixpt_t
    1.157      yamt decay_cpu_batch(fixpt_t loadfac, fixpt_t estcpu, unsigned int n)
    1.157      yamt {
    1.157      yamt
                    	/* n is scaled by FSHIFT to compare against the fixed-point loadfac. */
    1.157      yamt 	if ((n << FSHIFT) >= 7 * loadfac) {
    1.157      yamt 		return 0;
    1.157      yamt 	}
    1.157      yamt
    1.157      yamt 	while (estcpu != 0 && n > 1) {
    1.157      yamt 		estcpu = decay_cpu(loadfac, estcpu);
    1.157      yamt 		n--;
    1.157      yamt 	}
    1.157      yamt
    1.157      yamt 	return estcpu;
    1.157      yamt }
    1.157      yamt
     1.26       cgd /*
                     * decay 95% of `p_pctcpu' in 60 seconds; see CCPU_SHIFT before changing.
                     * schedcpu() multiplies p_pctcpu by ccpu (then shifts right by FSHIFT)
                     * on every run.
                     */
     1.26       cgd fixpt_t	ccpu = 0.95122942450071400909 * FSCALE;		/* exp(-1/20) */
     1.26       cgd
     1.26       cgd /*
     1.26       cgd  * If `ccpu' is not equal to `exp(-1/20)' and you still want to use the
     1.26       cgd  * faster/more-accurate formula, you'll have to estimate CCPU_SHIFT below
     1.26       cgd  * and possibly adjust FSHIFT in "param.h" so that (FSHIFT >= CCPU_SHIFT).
     1.26       cgd  *
     1.26       cgd  * To estimate CCPU_SHIFT for exp(-1/20), the following formula was used:
     1.26       cgd  *	1 - exp(-1/20) ~= 0.0487 ~= 0.0488 == 1 (fixed pt, *11* bits).
     1.26       cgd  *
     1.26       cgd  * If you don't want to bother with the faster/more-accurate formula, you
     1.26       cgd  * can set CCPU_SHIFT to (FSHIFT + 1) which will use a slower/less-accurate
     1.26       cgd  * (more general) method of calculating the %age of CPU used by a process.
                     * schedcpu() selects between the two methods at compile time with
                     * #if (FSHIFT >= CCPU_SHIFT).
     1.26       cgd  */
     1.26       cgd #define	CCPU_SHIFT	11
     1.26       cgd
     1.26       cgd /*
     1.26       cgd  * Recompute process priorities, every hz ticks.
                     *
                     * Runs from the schedcpu_ch callout and re-arms it for hz ticks later.
                     * With proclist_mutex held, for each process on allproc (under its
                     * p_smutex) it:
                     *  - sums l_rtime.tv_sec over the LWPs and bumps l_swtime, and bumps
                     *    l_slptime for LWPs that are sleeping, stopped or suspended;
                     *  - decays p_pctcpu by ccpu;
                     *  - enforces RLIMIT_CPU (SIGKILL at the hard limit, otherwise SIGXCPU)
                     *    and applies autonice;
                     *  - unless every LWP has been asleep for more than one pass, folds
                     *    p_cpticks into p_pctcpu, decays p_estcpu and recomputes the
                     *    priority of each LWP with l_slptime <= 1.
                     * Finally it calls uvm_meter() and wakes sleepers on &lbolt.
                     *
                     * arg is unused.
     1.26       cgd  */
     1.26       cgd /* ARGSUSED */
     1.26       cgd void
     1.77   thorpej schedcpu(void *arg)
     1.26       cgd {
     1.71  augustss 	fixpt_t loadfac = loadfactor(averunnable.ldavg[0]);
1.166.2.2        ad 	struct rlimit *rlim;
    1.122   thorpej 	struct lwp *l;
     1.71  augustss 	struct proc *p;
    1.122   thorpej 	int s, minslp;
     1.66      ross 	int clkhz;
1.166.2.2        ad 	long runtm;
     1.26       cgd
    1.157      yamt 	schedcpu_ticks++;
    1.157      yamt
1.166.2.1        ad 	mutex_enter(&proclist_mutex);
    1.145      yamt 	PROCLIST_FOREACH(p, &allproc) {
     1.26       cgd 		/*
1.166.2.2        ad 		 * Increment time in/out of memory and sleep time (if
1.166.2.2        ad 		 * sleeping).  We ignore overflow; with 16-bit int's
     1.26       cgd 		 * (remember them?) overflow takes 45 days.
1.166.2.2        ad 		 *
1.166.2.2        ad 		 * XXXSMP Should create an activeproc list so that we
1.166.2.2        ad 		 * don't touch every proc+LWP in the system on a regular
1.166.2.2        ad 		 * basis. l->l_swtime/l->l_slptime can become deltas.
     1.26       cgd 		 */
                    		/*
                    		 * minslp ends up > 1 only if every LWP is sleeping, stopped
                    		 * or suspended with l_slptime > 1; any other LWP forces it
                    		 * to 0.
                    		 */
    1.122   thorpej 		minslp = 2;
1.166.2.2        ad 		runtm = 0;
1.166.2.2        ad 		mutex_enter(&p->p_smutex);
    1.122   thorpej 		LIST_FOREACH(l, &p->p_lwps, l_sibling) {
1.166.2.2        ad 			lwp_lock(l);
1.166.2.2        ad 			runtm += l->l_rtime.tv_sec;
    1.122   thorpej 			l->l_swtime++;
    1.130   nathanw 			if (l->l_stat == LSSLEEP || l->l_stat == LSSTOP ||
    1.122   thorpej 			    l->l_stat == LSSUSPENDED) {
    1.122   thorpej 				l->l_slptime++;
    1.122   thorpej 				minslp = min(minslp, l->l_slptime);
    1.122   thorpej 			} else
    1.122   thorpej 				minslp = 0;
1.166.2.2        ad 			lwp_unlock(l);
    1.122   thorpej 		}
                    		/* Decay the %CPU figure by ccpu (fixed point, FSHIFT bits). */
     1.26       cgd 		p->p_pctcpu = (p->p_pctcpu * ccpu) >> FSHIFT;
1.166.2.2        ad
1.166.2.2        ad 		/*
1.166.2.2        ad 		 * Check if the process exceeds its CPU resource allocation.
1.166.2.2        ad 		 * If over max, kill it.  In any case, if it has run for more
1.166.2.2        ad 		 * than autonicetime, reduce priority to give others a chance.
1.166.2.2        ad 		 */
1.166.2.2        ad 		rlim = &p->p_rlimit[RLIMIT_CPU];
1.166.2.2        ad 		if (runtm >= rlim->rlim_cur) {
1.166.2.2        ad 			if (runtm >= rlim->rlim_max)
1.166.2.2        ad 				psignal(p, SIGKILL);
1.166.2.2        ad 			else {
1.166.2.2        ad 				psignal(p, SIGXCPU);
                    				/*
                    				 * Raise the soft limit so the next SIGXCPU
                    				 * waits for 5 more seconds of run time.
                    				 */
1.166.2.2        ad 				if (rlim->rlim_cur < rlim->rlim_max)
1.166.2.2        ad 					rlim->rlim_cur += 5;
1.166.2.2        ad 			}
1.166.2.2        ad 		}
                    		/*
                    		 * Autonice applies only when enabled, to processes still at
                    		 * the default nice value whose effective uid is non-zero.
                    		 */
1.166.2.2        ad 		if (autonicetime && runtm > autonicetime && p->p_nice == NZERO
1.166.2.2        ad 		    && kauth_cred_geteuid(p->p_cred)) {
1.166.2.2        ad 			p->p_nice = autoniceval + NZERO;
1.166.2.2        ad 			resetprocpriority(p);
1.166.2.2        ad 		}
1.166.2.2        ad
     1.26       cgd 		/*
     1.26       cgd 		 * If the process has slept the entire second,
     1.26       cgd 		 * stop recalculating its priority until it wakes up.
     1.26       cgd 		 */
1.166.2.2        ad 		if (minslp > 1) {
1.166.2.2        ad 			mutex_exit(&p->p_smutex);
     1.26       cgd 			continue;
1.166.2.2        ad 		}
1.166.2.4        ad 		s = splstatclock();	/* XXXSMP prevent state changes */
     1.26       cgd 		/*
     1.26       cgd 		 * p_pctcpu is only for ps.
     1.26       cgd 		 */
                    		/* Ticks are counted at stathz if it is set, else at hz. */
     1.66      ross 		clkhz = stathz != 0 ? stathz : hz;
     1.26       cgd #if	(FSHIFT >= CCPU_SHIFT)
     1.66      ross 		p->p_pctcpu += (clkhz == 100)?
     1.26       cgd 			((fixpt_t) p->p_cpticks) << (FSHIFT - CCPU_SHIFT):
     1.26       cgd                 	100 * (((fixpt_t) p->p_cpticks)
     1.66      ross 				<< (FSHIFT - CCPU_SHIFT)) / clkhz;
     1.26       cgd #else
     1.26       cgd 		p->p_pctcpu += ((FSCALE - ccpu) *
     1.66      ross 			(p->p_cpticks * FSCALE / clkhz)) >> FSHIFT;
     1.26       cgd #endif
     1.26       cgd 		p->p_cpticks = 0;
    1.153      yamt 		p->p_estcpu = decay_cpu(loadfac, p->p_estcpu);
    1.120        pk 		splx(s);	/* Done with the process CPU ticks update */
                    		/*
                    		 * Recompute the priority of every LWP that has not been
                    		 * asleep for more than one pass.
                    		 */
    1.122   thorpej 		LIST_FOREACH(l, &p->p_lwps, l_sibling) {
1.166.2.2        ad 			lwp_lock(l);
1.166.2.2        ad 			if (l->l_slptime > 1) {
1.166.2.2        ad 				lwp_unlock(l);
    1.122   thorpej 				continue;
1.166.2.2        ad 			}
    1.122   thorpej 			resetpriority(l);
    1.122   thorpej 			if (l->l_priority >= PUSER) {
                    				/*
                    				 * A runnable, in-memory LWP whose new priority
                    				 * maps to a different run queue goes through
                    				 * lwp_changepri(); otherwise the field is
                    				 * simply overwritten.
                    				 */
    1.122   thorpej 				if (l->l_stat == LSRUN &&
    1.122   thorpej 				    (l->l_flag & L_INMEM) &&
    1.122   thorpej 				    (l->l_priority / PPQ) != (l->l_usrpri / PPQ)) {
1.166.2.4        ad 				    	lwp_changepri(l, l->l_usrpri);
    1.122   thorpej 				} else
    1.122   thorpej 					l->l_priority = l->l_usrpri;
    1.122   thorpej 			}
1.166.2.2        ad 			lwp_unlock(l);
     1.26       cgd 		}
1.166.2.2        ad 		mutex_exit(&p->p_smutex);
     1.26       cgd 	}
1.166.2.1        ad 	mutex_exit(&proclist_mutex);
     1.47       mrg 	uvm_meter();
     1.67      fvdl 	wakeup((caddr_t)&lbolt);
    1.143      yamt 	callout_schedule(&schedcpu_ch, hz);
     1.26       cgd }
     1.26       cgd
     1.26       cgd /*
     1.26       cgd  * Recalculate the priority of a process after it has slept for a while.
                     *
                     * The caller must hold l's lock and l->l_slptime must be > 1 (both are
                     * asserted).  l_slptime is decremented by one, the owning process's
                     * p_estcpu is decayed in one batch for that many seconds at the current
                     * load factor, and resetpriority() is then applied to l.
     1.26       cgd  */
     1.26       cgd void
    1.122   thorpej updatepri(struct lwp *l)
     1.26       cgd {
    1.122   thorpej 	struct proc *p = l->l_proc;
     1.83   thorpej 	fixpt_t loadfac;
     1.83   thorpej
1.166.2.2        ad 	LOCK_ASSERT(lwp_locked(l, NULL));
    1.157      yamt 	KASSERT(l->l_slptime > 1);
     1.83   thorpej
     1.83   thorpej 	loadfac = loadfactor(averunnable.ldavg[0]);
     1.26       cgd
    1.157      yamt 	l->l_slptime--; /* the first time was done in schedcpu */
    1.157      yamt 	/* XXX NJWLWP */
1.166.2.2        ad 	/* XXXSMP occasionally unlocked. */
    1.157      yamt 	p->p_estcpu = decay_cpu_batch(loadfac, p->p_estcpu, l->l_slptime);
    1.122   thorpej 	resetpriority(l);
     1.26       cgd }
     1.26       cgd
     1.26       cgd /*
1.166.2.2        ad  * During autoconfiguration or after a panic, a sleep will simply lower the
1.166.2.2        ad  * priority briefly to allow interrupts, then return.  The priority to be
1.166.2.2        ad  * used (safepri) is machine-dependent, thus this value is initialized and
1.166.2.2        ad  * maintained in the machine-dependent layers.  This priority will typically
1.166.2.2        ad  * be 0, or the lowest priority that is safe for use on the interrupt stack;
1.166.2.2        ad  * it can be made higher to block network software interrupts after panics.
                     *
                     * NOTE(review): nothing in the visible part of this file reads safepri.
                     * ltsleep() and mtsleep() hand the "don't sleep" case to sleepq_abort(),
                     * which presumably is where it is consulted -- confirm.
     1.26       cgd  */
1.166.2.2        ad int	safepri;
     1.26       cgd
     1.26       cgd /*
1.166.2.2        ad  * ltsleep: see mtsleep() for comments.
                     *
                     * Same as mtsleep() except that the interlock is a simplelock.  The
                     * interlock may be NULL, in which case no lock is released or
                     * re-acquired on any path.
                     *
                     * ident	 wait channel to sleep on
                     * priority	 priority (PRIMASK bits), optionally ORed with PCATCH and
                     *		 PNORELOCK
                     * wmesg	 wait message passed to sleepq_enter()
                     * timo	 timeout in ticks, passed to sleepq_enter()/sleepq_block()
                     * interlock simplelock held by the caller, or NULL
                     *
                     * Returns 0 without sleeping when sleepq_dontsleep() says so; otherwise
                     * returns the result of sleepq_block().
     1.26       cgd  */
     1.26       cgd int
1.166.2.2        ad ltsleep(wchan_t ident, int priority, const char *wmesg, int timo,
1.166.2.2        ad 	volatile struct simplelock *interlock)
     1.26       cgd {
    1.122   thorpej 	struct lwp *l = curlwp;
1.166.2.2        ad 	sleepq_t *sq;
1.166.2.2        ad 	int error;
     1.26       cgd
1.166.2.2        ad 	if (sleepq_dontsleep(l)) {
1.166.2.2        ad 		(void)sleepq_abort(NULL, 0);
                    		/*
                    		 * Honour PNORELOCK on this path too, but only when there
                    		 * is an interlock to drop: the normal path below already
                    		 * tolerates a NULL interlock, so this one must as well.
                    		 */
1.166.2.2        ad 		if (interlock != NULL && (priority & PNORELOCK) != 0)
     1.77   thorpej 			simple_unlock(interlock);
1.166.2.2        ad 		return 0;
    1.122   thorpej 	}
     1.77   thorpej
1.166.2.4        ad 	sq = sleeptab_lookup(&sleeptab, ident);
1.166.2.4        ad
1.166.2.4        ad 	sleepq_enter(sq, priority & PRIMASK, ident, wmesg, timo,
1.166.2.4        ad 	    priority & PCATCH, &sleep_syncobj);
     1.77   thorpej
                    	/* The interlock is dropped only after we are on the sleep queue. */
1.166.2.2        ad 	if (interlock != NULL) {
1.166.2.2        ad 		LOCK_ASSERT(simple_lock_held(interlock));
     1.77   thorpej 		simple_unlock(interlock);
     1.26       cgd 	}
    1.126        pk
1.166.2.2        ad 	error = sleepq_block(sq, timo);
1.166.2.4        ad 	sleepq_unblock();
    1.126        pk
                    	/* Re-take the interlock for the caller unless PNORELOCK was given. */
1.166.2.2        ad 	if (interlock != NULL && (priority & PNORELOCK) == 0)
1.166.2.2        ad 		simple_lock(interlock);
1.166.2.2        ad
1.166.2.2        ad 	return error;
     1.26       cgd }
     1.26       cgd
1.166.2.2        ad /*
1.166.2.2        ad  * General sleep call.  Suspends the current process until a wakeup is
1.166.2.2        ad  * performed on the specified identifier.  The process will then be made
1.166.2.2        ad  * runnable with the specified priority.  Sleeps at most timo/hz seconds (0
1.166.2.2        ad  * means no timeout).  If pri includes PCATCH flag, signals are checked
1.166.2.2        ad  * before and after sleeping, else signals are not checked.  Returns 0 if
1.166.2.2        ad  * awakened, EWOULDBLOCK if the timeout expires.  If PCATCH is set and a
1.166.2.2        ad  * signal needs to be delivered, ERESTART is returned if the current system
1.166.2.2        ad  * call should be restarted if possible, and EINTR is returned if the system
1.166.2.2        ad  * call should be interrupted by the signal.
1.166.2.2        ad  *
1.166.2.2        ad  * The interlock is held until we are on a sleep queue. The interlock will
1.166.2.2        ad  * be locked before returning back to the caller unless the PNORELOCK flag
1.166.2.2        ad  * is specified, in which case the interlock will always be unlocked upon
1.166.2.2        ad  * return.
                     *
                     * The interlock (mtx) may be NULL, in which case no lock is released or
                     * re-acquired.  When sleepq_dontsleep() reports that the LWP must not
                     * sleep, the call returns whatever sleepq_abort(mtx, priority &
                     * PNORELOCK) returns and never goes onto a sleep queue.
1.166.2.2        ad  */
1.166.2.1        ad int
1.166.2.2        ad mtsleep(wchan_t ident, int priority, const char *wmesg, int timo,
1.166.2.2        ad 	kmutex_t *mtx)
1.166.2.1        ad {
1.166.2.1        ad 	struct lwp *l = curlwp;
1.166.2.2        ad 	sleepq_t *sq;
1.166.2.2        ad 	int error;
1.166.2.1        ad
1.166.2.2        ad 	if (sleepq_dontsleep(l))
1.166.2.2        ad 		return sleepq_abort(mtx, priority & PNORELOCK);
1.166.2.1        ad
                    	/* Find the sleep queue for this wait channel and enter it. */
1.166.2.4        ad 	sq = sleeptab_lookup(&sleeptab, ident);
1.166.2.4        ad
1.166.2.4        ad 	sleepq_enter(sq, priority & PRIMASK, ident, wmesg, timo,
1.166.2.4        ad 	    priority & PCATCH, &sleep_syncobj);
1.166.2.1        ad
                    	/* The interlock is dropped only after we are on the sleep queue. */
1.166.2.1        ad 	if (mtx != NULL) {
1.166.2.2        ad 		LOCK_ASSERT(mutex_owned(mtx));
1.166.2.4        ad 		mutex_exit(mtx);
1.166.2.1        ad 	}
1.166.2.1        ad
1.166.2.2        ad 	error = sleepq_block(sq, timo);
1.166.2.4        ad 	sleepq_unblock();
1.166.2.1        ad
                    	/* Re-take the interlock for the caller unless PNORELOCK was given. */
1.166.2.2        ad 	if (mtx != NULL && (priority & PNORELOCK) == 0)
1.166.2.1        ad 		mutex_enter(mtx);
1.166.2.2        ad
1.166.2.2        ad 	return error;
1.166.2.1        ad }
1.166.2.1        ad
                    /*
                     * Clear L_SA_IDLE on an LWP, but only when the LWP is the one recorded
                     * in its l_savp->savp_lwp and has L_SA_YIELD set.  The caller must hold
                     * l's lock (asserted).
                     */
     1.26       cgd void
    1.139        cl sa_awaken(struct lwp *l)
    1.139        cl {
    1.147     perry
1.166.2.2        ad 	LOCK_ASSERT(lwp_locked(l, NULL));
    1.139        cl
    1.142        cl 	if (l == l->l_savp->savp_lwp && l->l_flag & L_SA_YIELD)
    1.139        cl 		l->l_flag &= ~L_SA_IDLE;
    1.139        cl }
    1.139        cl
     1.26       cgd /*
     1.26       cgd  * Make all processes sleeping on the specified identifier runnable.
                     *
                     * Does nothing while `cold' is set.  Otherwise looks up the sleep queue
                     * for ident in sleeptab and calls sleepq_wake() on it with a count of
                     * (u_int)-1.
                     * NOTE(review): (u_int)-1 presumably means "no limit" -- confirm against
                     * sleepq_wake().
     1.26       cgd  */
     1.26       cgd void
1.166.2.2        ad wakeup(wchan_t ident)
     1.26       cgd {
1.166.2.2        ad 	sleepq_t *sq;
     1.83   thorpej
1.166.2.2        ad 	if (cold)
1.166.2.2        ad 		return;
     1.83   thorpej
1.166.2.4        ad 	sq = sleeptab_lookup(&sleeptab, ident);
1.166.2.5        ad 	sleepq_wake(sq, ident, (u_int)-1);
     1.63   thorpej }
     1.63   thorpej
     1.63   thorpej /*
     1.63   thorpej  * Make the highest priority process first in line on the specified
     1.63   thorpej  * identifier runnable.
                     *
                     * Does nothing while `cold' is set.  Otherwise looks up the sleep queue
                     * for ident in sleeptab and calls sleepq_wake() on it with a count of 1;
                     * which sleeper is chosen is decided by sleepq_wake().
     1.63   thorpej  */
1.166.2.2        ad void
1.166.2.2        ad wakeup_one(wchan_t ident)
     1.63   thorpej {
1.166.2.2        ad 	sleepq_t *sq;
     1.77   thorpej
1.166.2.2        ad 	if (cold)
1.166.2.2        ad 		return;
1.166.2.2        ad
1.166.2.4        ad 	sq = sleeptab_lookup(&sleeptab, ident);
1.166.2.5        ad 	sleepq_wake(sq, ident, 1);
    1.117  gmcgarry }
    1.117  gmcgarry
1.166.2.2        ad
    1.117  gmcgarry /*
    1.117  gmcgarry  * General yield call.  Puts the current process back on its run queue and
    1.117  gmcgarry  * performs a voluntary context switch.  Should only be called when the
    1.117  gmcgarry  * current process explicitly requests it (eg sched_yield(2) in compat code).
                     *
                     * Locks curlwp; if it is LSONPROC its priority is reset to l_usrpri.
                     * The voluntary context switch counter (l_nvcsw) is bumped and
                     * mi_switch() is called with no preferred successor.
                     * NOTE(review): the LWP lock taken here is not released in this
                     * function; presumably mi_switch() drops it -- confirm.
    1.117  gmcgarry  */
    1.117  gmcgarry void
    1.117  gmcgarry yield(void)
    1.117  gmcgarry {
    1.122   thorpej 	struct lwp *l = curlwp;
    1.117  gmcgarry
1.166.2.2        ad 	lwp_lock(l);
1.166.2.2        ad 	if (l->l_stat == LSONPROC) {
                    		/* An LWP that is on a CPU is expected to be covered by sched_mutex. */
1.166.2.3        ad 		KASSERT(lwp_locked(l, &sched_mutex));
1.166.2.2        ad 		l->l_priority = l->l_usrpri;
1.166.2.2        ad 	}
1.166.2.2        ad 	l->l_nvcsw++;
    1.122   thorpej 	mi_switch(l, NULL);
     1.69   thorpej }
     1.69   thorpej
     1.69   thorpej /*
     1.69   thorpej  * General preemption call.  Puts the current process back on its run queue
    1.156    rpaulo  * and performs an involuntary context switch.
    1.156    rpaulo  * The 'more' ("more work to do") argument is boolean. Returning to userspace
    1.156    rpaulo  * preempt() calls pass 0. "Voluntary" preemptions in e.g. uiomove() pass 1.
    1.156    rpaulo  * This will be used to indicate to the SA subsystem that the LWP is
    1.156    rpaulo  * not yet finished in the kernel.
                     *
                     * Locks curlwp; if it is LSONPROC its priority is reset to l_usrpri.
                     * The involuntary context switch counter (l_nivcsw) is bumped and
                     * mi_switch() is called with no preferred successor.  sa_preempt() is
                     * called afterwards only when the LWP has L_SA set, mi_switch() returned
                     * non-zero and 'more' is 0.
     1.69   thorpej  */
     1.69   thorpej void
    1.122   thorpej preempt(int more)
     1.69   thorpej {
    1.122   thorpej 	struct lwp *l = curlwp;
1.166.2.2        ad 	int r;
     1.69   thorpej
1.166.2.2        ad 	lwp_lock(l);
1.166.2.2        ad 	if (l->l_stat == LSONPROC) {
                    		/* An LWP that is on a CPU is expected to be covered by sched_mutex. */
1.166.2.3        ad 		KASSERT(lwp_locked(l, &sched_mutex));
1.166.2.2        ad 		l->l_priority = l->l_usrpri;
1.166.2.2        ad 	}
1.166.2.2        ad 	l->l_nivcsw++;
    1.122   thorpej 	r = mi_switch(l, NULL);
1.166.2.4        ad 	if ((l->l_flag & L_SA) != 0 && r != 0 && more == 0)
    1.122   thorpej 		sa_preempt(l);
     1.69   thorpej }
     1.69   thorpej
     1.69   thorpej /*
1.166.2.2        ad  * The machine independent parts of context switch.  Switch to "newl"
1.166.2.2        ad  * if non-NULL, otherwise let cpu_switch choose the next lwp.
    1.130   nathanw  *
    1.122   thorpej  * Returns 1 if another process was actually run.
                     *
                     * The caller must hold l's lock (asserted below).  "newl" is only
                     * honoured while it is still linked on a run queue (l_back != NULL);
                     * otherwise cpu_switch() picks the successor.  The return value is
                     * whatever cpu_switch() reports; a direct switch to "newl" always
                     * returns 0.
     1.26       cgd  */
    1.122   thorpej int
    1.122   thorpej mi_switch(struct lwp *l, struct lwp *newl)
     1.26       cgd {
     1.76   thorpej 	struct schedstate_percpu *spc;
     1.26       cgd 	struct timeval tv;
    1.144      yamt 	int hold_count;
1.166.2.2        ad 	int retval, oldspl;
1.166.2.2        ad 	long s, u;
1.166.2.2        ad #if PERFCTRS
    1.122   thorpej 	struct proc *p = l->l_proc;
1.166.2.2        ad #endif
     1.26       cgd
1.166.2.2        ad 	LOCK_ASSERT(lwp_locked(l, NULL));
     1.83   thorpej
     1.90  sommerfe 	/*
     1.90  sommerfe 	 * Release the kernel_lock, as we are about to yield the CPU.
                    	 * The value returned is handed back to KERNEL_LOCK() once we
                    	 * are running again.
     1.90  sommerfe 	 */
1.166.2.4        ad 	hold_count = KERNEL_UNLOCK(0, l);
     1.85  sommerfe
    1.160       chs #ifdef LOCKDEBUG
     1.82   thorpej 	spinlock_switchcheck();
     1.81   thorpej 	simple_lock_switchcheck();
     1.50      fvdl #endif
1.166.2.2        ad #ifdef KSTACK_CHECK_MAGIC
1.166.2.2        ad 	kstack_check_magic(l);
1.166.2.2        ad #endif
1.166.2.2        ad
1.166.2.2        ad 	/*
1.166.2.2        ad 	 * It's safe to read the per CPU schedstate unlocked here, as all we
1.166.2.2        ad 	 * are after is the run time and that's guaranteed to have been last
1.166.2.2        ad 	 * updated by this CPU.
1.166.2.2        ad 	 */
1.166.2.2        ad 	KDASSERT(l->l_cpu != NULL);
1.166.2.2        ad 	KDASSERT(l->l_cpu == curcpu());
1.166.2.2        ad 	spc = &l->l_cpu->ci_schedstate;
     1.81   thorpej
     1.26       cgd 	/*
     1.26       cgd 	 * Compute the amount of time during which the current
    1.113  gmcgarry 	 * process was running.
                    	 *
                    	 * The elapsed time since spc_runtime is added to l_rtime; the
                    	 * microsecond field is then brought back into [0, 1000000)
                    	 * with a single carry or borrow.
     1.26       cgd 	 */
     1.26       cgd 	microtime(&tv);
1.166.2.2        ad 	u = l->l_rtime.tv_usec +
    1.122   thorpej 	    (tv.tv_usec - spc->spc_runtime.tv_usec);
1.166.2.2        ad 	s = l->l_rtime.tv_sec + (tv.tv_sec - spc->spc_runtime.tv_sec);
     1.26       cgd 	if (u < 0) {
     1.26       cgd 		u += 1000000;
     1.26       cgd 		s--;
     1.26       cgd 	} else if (u >= 1000000) {
     1.26       cgd 		u -= 1000000;
     1.26       cgd 		s++;
     1.26       cgd 	}
1.166.2.2        ad 	l->l_rtime.tv_usec = u;
1.166.2.2        ad 	l->l_rtime.tv_sec = s;
     1.26       cgd
     1.26       cgd 	/*
1.166.2.2        ad 	 * XXXSMP If we are using h/w performance counters, save context.
     1.26       cgd 	 */
1.166.2.2        ad #if PERFCTRS
1.166.2.2        ad 	if (PMC_ENABLED(p)) {
1.166.2.2        ad 		pmc_save_context(p);
     1.26       cgd 	}
1.166.2.2        ad #endif
1.166.2.2        ad
1.166.2.2        ad 	/*
1.166.2.2        ad 	 * Acquire the sched_mutex if necessary.  It will be released by
1.166.2.2        ad 	 * cpu_switch once it has decided to idle, or picked another LWP
1.166.2.2        ad 	 * to run.
1.166.2.2        ad 	 */
1.166.2.4        ad #if defined(MULTIPROCESSOR) || defined(LOCKDEBUG)
1.166.2.4        ad 	if (l->l_mutex != &sched_mutex) {
1.166.2.2        ad 		mutex_enter(&sched_mutex);
1.166.2.4        ad 		lwp_unlock(l);
     1.26       cgd 	}
1.166.2.3        ad #endif
1.166.2.3        ad
1.166.2.3        ad 	/*
1.166.2.3        ad 	 * If on the CPU and we have gotten this far, then we must yield.
                    	 * The LWP becomes LSRUN and is put back on a run queue.
1.166.2.3        ad 	 */
1.166.2.3        ad 	KASSERT(l->l_stat != LSRUN);
1.166.2.3        ad 	if (l->l_stat == LSONPROC) {
1.166.2.4        ad 		KASSERT(lwp_locked(l, &sched_mutex));
1.166.2.3        ad 		l->l_stat = LSRUN;
1.166.2.3        ad 		setrunqueue(l);
1.166.2.3        ad 	}
1.166.2.2        ad 	uvmexp.swtch++;
     1.69   thorpej
     1.69   thorpej 	/*
     1.69   thorpej 	 * Process is about to yield the CPU; clear the appropriate
     1.69   thorpej 	 * scheduling flags.
     1.69   thorpej 	 */
     1.73   thorpej 	spc->spc_flags &= ~SPCF_SWITCHCLEAR;
    1.109      yamt
1.166.2.2        ad 	LOCKDEBUG_BARRIER(&sched_mutex, 1);
    1.113  gmcgarry
    1.113  gmcgarry 	/*
1.166.2.2        ad 	 * Switch to the new current LWP.  When we run again, we'll
1.166.2.2        ad 	 * return back here.
                    	 *
                    	 * The SPL value read here is restored with splx() after the
                    	 * switch.  NOTE(review): MUTEX_SPIN_OLDSPL() presumably yields
                    	 * the SPL saved when the spin mutex was taken - confirm.
    1.113  gmcgarry 	 */
1.166.2.4        ad 	oldspl = MUTEX_SPIN_OLDSPL(l->l_cpu);
1.166.2.4        ad
1.166.2.4        ad 	if (newl == NULL || newl->l_back == NULL)
    1.122   thorpej 		retval = cpu_switch(l, NULL);
1.166.2.2        ad 	else {
                    		/* newl is still queued: dequeue it and switch directly. */
1.166.2.4        ad 		KASSERT(lwp_locked(newl, &sched_mutex));
    1.122   thorpej 		remrunqueue(newl);
    1.122   thorpej 		cpu_switchto(l, newl);
    1.122   thorpej 		retval = 0;
    1.122   thorpej 	}
    1.110    briggs
    1.110    briggs 	/*
1.166.2.2        ad 	 * XXXSMP If we are using h/w performance counters, restore context.
     1.26       cgd 	 */
    1.114  gmcgarry #if PERFCTRS
    1.166  christos 	if (PMC_ENABLED(p)) {
    1.114  gmcgarry 		pmc_restore_context(p);
    1.166  christos 	}
    1.114  gmcgarry #endif
    1.110    briggs
    1.110    briggs 	/*
     1.76   thorpej 	 * We're running again; record our new start time.  We might
1.166.2.2        ad 	 * be running on a new CPU now, so don't use the cached
     1.76   thorpej 	 * schedstate_percpu pointer.
     1.76   thorpej 	 */
    1.122   thorpej 	KDASSERT(l->l_cpu != NULL);
    1.122   thorpej 	KDASSERT(l->l_cpu == curcpu());
    1.122   thorpej 	microtime(&l->l_cpu->ci_schedstate.spc_runtime);
     1.85  sommerfe
     1.90  sommerfe 	/*
1.166.2.4        ad 	 * Reacquire the kernel_lock.
                    	 * The SPL saved before the switch is restored first.
     1.90  sommerfe 	 */
1.166.2.2        ad 	splx(oldspl);
1.166.2.4        ad 	KERNEL_LOCK(hold_count, l);
    1.122   thorpej
    1.122   thorpej 	return retval;
     1.26       cgd }
     1.26       cgd
     1.26       cgd /*
     1.26       cgd  * Initialize the (doubly-linked) run queues
     1.26       cgd  * to be empty.
                     *
                     * An empty queue is represented by a head whose forward and reverse
                     * links both point back at the head itself.  sched_mutex is also
                     * initialized here, as a spin mutex at IPL_SCHED.
     1.26       cgd  */
     1.26       cgd void
     1.26       cgd rqinit(void)
     1.26       cgd {
     1.71  augustss 	int i;
     1.26       cgd
     1.73   thorpej 	for (i = 0; i < RUNQUE_NQS; i++)
     1.73   thorpej 		sched_qs[i].ph_link = sched_qs[i].ph_rlink =
    1.122   thorpej 		    (struct lwp *)&sched_qs[i];
1.166.2.2        ad
1.166.2.2        ad 	mutex_init(&sched_mutex, MUTEX_SPIN, IPL_SCHED);
     1.26       cgd }
     1.26       cgd
    1.158     perry static inline void
1.166.2.2        ad resched_lwp(struct lwp *l, u_char pri)
    1.119   thorpej {
                    	/*
                    	 * Ask a CPU to reschedule if "pri" is numerically lower than
                    	 * the priority that CPU is currently running at.  The CPU
                    	 * chosen is l->l_cpu, or the current CPU when l->l_cpu is
                    	 * NULL.  The caller must hold l's lock.
                    	 */
    1.119   thorpej 	struct cpu_info *ci;
    1.119   thorpej
1.166.2.2        ad 	LOCK_ASSERT(lwp_locked(l, NULL));
1.166.2.2        ad
    1.119   thorpej 	/*
    1.119   thorpej 	 * XXXSMP
    1.122   thorpej 	 * Since l->l_cpu persists across a context switch,
    1.119   thorpej 	 * this gives us *very weak* processor affinity, in
    1.119   thorpej 	 * that we notify the CPU on which the process last
    1.119   thorpej 	 * ran that it should try to switch.
    1.119   thorpej 	 *
    1.119   thorpej 	 * This does not guarantee that the process will run on
    1.119   thorpej 	 * that processor next, because another processor might
    1.119   thorpej 	 * grab it the next time it performs a context switch.
    1.119   thorpej 	 *
    1.119   thorpej 	 * This also does not handle the case where its last
    1.119   thorpej 	 * CPU is running a higher-priority process, but every
    1.119   thorpej 	 * other CPU is running a lower-priority process.  There
    1.119   thorpej 	 * are ways to handle this situation, but they're not
    1.119   thorpej 	 * currently very pretty, and we also need to weigh the
    1.119   thorpej 	 * cost of moving a process from one CPU to another.
    1.119   thorpej 	 */
    1.122   thorpej 	ci = (l->l_cpu != NULL) ? l->l_cpu : curcpu();
    1.121   thorpej 	if (pri < ci->ci_schedstate.spc_curpriority)
1.166.2.2        ad 		cpu_need_resched(ci);
    1.119   thorpej }
    1.119   thorpej
     1.26       cgd /*
1.166.2.2        ad  * Change process state to be runnable, placing it on the run queue if it is
1.166.2.2        ad  * in memory, and awakening the swapper if it isn't in memory.
1.166.2.2        ad  *
1.166.2.2        ad  * Call with the process and LWP locked.  Will return with the LWP unlocked.
                     *
                     * Accepted states are LSSTOP, LSSUSPENDED and LSSLEEP; any other state
                     * panics.  A sleeping LWP is handed to lwp_unsleep() and nothing else
                     * is done for it here.
     1.26       cgd  */
     1.26       cgd void
    1.122   thorpej setrunnable(struct lwp *l)
     1.26       cgd {
    1.122   thorpej 	struct proc *p = l->l_proc;
1.166.2.3        ad 	struct cpu_info *ci;
     1.26       cgd
1.166.2.2        ad 	LOCK_ASSERT(mutex_owned(&p->p_smutex));
1.166.2.2        ad 	LOCK_ASSERT(lwp_locked(l, NULL));
     1.83   thorpej
    1.122   thorpej 	switch (l->l_stat) {
    1.122   thorpej 	case LSSTOP:
     1.33   mycroft 		/*
     1.33   mycroft 		 * If we're being traced (possibly because someone attached us
     1.33   mycroft 		 * while we were stopped), check for a signal from the debugger.
     1.33   mycroft 		 */
1.166.2.4        ad 		if ((p->p_slflag & PSL_TRACED) != 0 && p->p_xstat != 0) {
1.166.2.3        ad 			sigaddset(&l->l_sigpend->sp_set, p->p_xstat);
1.166.2.2        ad 			signotify(l);
     1.53   mycroft 		}
1.166.2.2        ad 		p->p_nrlwps++;
    1.122   thorpej 		break;
    1.122   thorpej 	case LSSUSPENDED:
1.166.2.4        ad 		l->l_flag &= ~L_WSUSPEND;
1.166.2.2        ad 		p->p_nrlwps++;
1.166.2.2        ad 		break;
1.166.2.2        ad 	case LSSLEEP:
1.166.2.4        ad 		/*
1.166.2.4        ad 		 * If the LWP was sleeping interruptibly, then it's OK to
1.166.2.4        ad 		 * start it again.  If not, it is left sleeping: the lock
                    		 * is dropped, and DIAGNOSTIC kernels panic because the
                    		 * caller should not have asked for this.
1.166.2.4        ad 		 */
1.166.2.4        ad 		KASSERT(l->l_wchan != NULL);
1.166.2.2        ad
1.166.2.4        ad 		if ((l->l_flag & L_SINTR) != 0) {
1.166.2.4        ad 			/* lwp_unsleep() will release the lock. */
1.166.2.4        ad 			lwp_unsleep(l);
1.166.2.4        ad 		} else {
1.166.2.4        ad 			lwp_unlock(l);
1.166.2.4        ad #ifdef DIAGNOSTIC
1.166.2.4        ad 			panic("setrunnable: !L_SINTR");
1.166.2.4        ad #endif
1.166.2.4        ad 		}
1.166.2.2        ad 		return;
1.166.2.4        ad 	default:
1.166.2.4        ad 		panic("setrunnable: lwp %p state was %d", l, l->l_stat);
     1.26       cgd 	}
    1.139        cl
    1.139        cl 	if (l->l_proc->p_sa)
    1.139        cl 		sa_awaken(l);
    1.139        cl
1.166.2.3        ad 	/*
1.166.2.4        ad 	 * Set sched_mutex as the LWP's current mutex.  If the LWP is
1.166.2.4        ad 	 * still on the CPU, mark it as LSONPROC.  It may be about to call
1.166.2.4        ad 	 * mi_switch(), in which case it will yield.
1.166.2.3        ad 	 */
1.166.2.3        ad 	lwp_relock(l, &sched_mutex);
    1.122   thorpej
1.166.2.3        ad 	if ((ci = l->l_cpu) != NULL && ci->ci_curlwp == l) {
1.166.2.3        ad 		l->l_stat = LSONPROC;
1.166.2.3        ad 		l->l_slptime = 0;
1.166.2.3        ad 		lwp_unlock(l);
1.166.2.3        ad 		return;
1.166.2.3        ad 	}
    1.122   thorpej
1.166.2.3        ad 	/*
1.166.2.3        ad 	 * Set the LWP runnable.  If it's swapped out, we need to wake the swapper
1.166.2.3        ad 	 * to bring it back in.  Otherwise, enter it into a run queue.
                    	 * The priority is refreshed first if the LWP slept for more
                    	 * than one l_slptime unit.
1.166.2.3        ad 	 */
1.166.2.3        ad 	l->l_stat = LSRUN;
    1.122   thorpej 	if (l->l_slptime > 1)
    1.122   thorpej 		updatepri(l);
    1.122   thorpej 	l->l_slptime = 0;
1.166.2.2        ad
1.166.2.2        ad 	if (l->l_flag & L_INMEM) {
1.166.2.2        ad 		setrunqueue(l);
1.166.2.2        ad 		resched_lwp(l, l->l_priority);
1.166.2.2        ad 		lwp_unlock(l);
1.166.2.2        ad 	} else {
                    		/* NOTE(review): &proc0 is presumably the swapper's wait channel. */
1.166.2.2        ad 		lwp_unlock(l);
1.166.2.2        ad 		wakeup(&proc0);
1.166.2.2        ad 	}
     1.26       cgd }
     1.26       cgd
     1.26       cgd /*
     1.26       cgd  * Compute the priority of a process when running in user mode.
     1.26       cgd  * Arrange to reschedule if the resulting priority is better
     1.26       cgd  * than that of the current process.
                     *
                     * The result, PUSER plus the scaled p_estcpu plus the nice adjustment,
                     * capped at MAXPRI, is stored in l->l_usrpri; l->l_priority is not
                     * changed here.  The caller must hold l's lock.
     1.26       cgd  */
     1.26       cgd void
    1.122   thorpej resetpriority(struct lwp *l)
     1.26       cgd {
     1.71  augustss 	unsigned int newpriority;
    1.122   thorpej 	struct proc *p = l->l_proc;
     1.26       cgd
1.166.2.2        ad 	LOCK_ASSERT(lwp_locked(l, NULL));
     1.83   thorpej
1.166.2.2        ad 	/* XXXSMP proc values will be accessed unlocked */
    1.153      yamt 	newpriority = PUSER + (p->p_estcpu >> ESTCPU_SHIFT) +
    1.122   thorpej 			NICE_WEIGHT * (p->p_nice - NZERO);
     1.26       cgd 	newpriority = min(newpriority, MAXPRI);
    1.122   thorpej 	l->l_usrpri = newpriority;
1.166.2.2        ad 	resched_lwp(l, l->l_usrpri);
    1.122   thorpej }
    1.122   thorpej
    1.130   nathanw /*
    1.122   thorpej  * Recompute priority for all LWPs in a process.
                     *
                     * The caller must hold p->p_smutex.  Each LWP on p->p_lwps is locked
                     * in turn around its resetpriority() call.
    1.122   thorpej  */
    1.122   thorpej void
    1.122   thorpej resetprocpriority(struct proc *p)
    1.122   thorpej {
    1.122   thorpej 	struct lwp *l;
    1.122   thorpej
1.166.2.2        ad 	LOCK_ASSERT(mutex_owned(&p->p_smutex));
1.166.2.2        ad
1.166.2.2        ad 	LIST_FOREACH(l, &p->p_lwps, l_sibling) {
1.166.2.2        ad 		lwp_lock(l);
1.166.2.2        ad 		resetpriority(l);
1.166.2.2        ad 		lwp_unlock(l);
1.166.2.2        ad 	}
     1.55      ross }
     1.55      ross
     1.55      ross /*
     1.56      ross  * We adjust the priority of the current process.  The priority of a process
    1.141       wiz  * gets worse as it accumulates CPU time.  The CPU usage estimator (p_estcpu)
     1.56      ross  * is increased here.  The formula for computing priorities (in kern_synch.c)
     1.56      ross  * will compute a different value each time p_estcpu increases. This can
     1.56      ross  * cause a switch, but unless the priority crosses a PPQ boundary the actual
    1.141       wiz  * queue will not change.  The CPU usage estimator ramps up quite quickly
     1.56      ross  * when the process is running (linearly), and decays away exponentially, at
     1.56      ross  * a rate which is proportionally slower when the system is busy.  The basic
     1.80   nathanw  * principle is that the system will 90% forget that the process used a lot
     1.56      ross  * of CPU time in 5 * loadav seconds.  This causes the system to favor
     1.56      ross  * processes which haven't run much recently, and to round-robin among other
     1.56      ross  * processes.
                     *
                     * The caller must hold the p_smutex of l's process.
     1.55      ross  */
     1.55      ross
     1.55      ross void
    1.122   thorpej schedclock(struct lwp *l)
     1.55      ross {
    1.122   thorpej 	struct proc *p = l->l_proc;
1.166.2.2        ad
1.166.2.2        ad 	LOCK_ASSERT(mutex_owned(&p->p_smutex));
     1.77   thorpej
                    	/* One more unit of CPU usage, clamped by ESTCPULIM(). */
    1.153      yamt 	p->p_estcpu = ESTCPULIM(p->p_estcpu + (1 << ESTCPU_SHIFT));
    1.130   nathanw
1.166.2.2        ad 	lwp_lock(l);
1.166.2.2        ad 	resetpriority(l);
                    	/*
                    	 * Only an LWP currently at a user-level priority (>= PUSER)
                    	 * adopts the freshly computed l_usrpri.
                    	 */
    1.122   thorpej 	if (l->l_priority >= PUSER)
    1.122   thorpej 		l->l_priority = l->l_usrpri;
1.166.2.2        ad 	lwp_unlock(l);
     1.26       cgd }
     1.94    bouyer
1.166.2.2        ad /*
1.166.2.2        ad  * suspendsched:
1.166.2.2        ad  *
1.166.2.4        ad  *	Convert all non-L_SYSTEM LSSLEEP or LSRUN LWPs to LSSUSPENDED.
                     *
                     *	The conversion is not performed directly.  Every process without
                     *	P_SYSTEM is marked SSTOP, each of its LWPs other than curlwp is
                     *	flagged L_WREBOOT | L_WSUSPEND, interruptible sleepers are handed
                     *	to setrunnable(), and finally every CPU is asked to reschedule.
1.166.2.2        ad  */
     1.94    bouyer void
1.166.2.4        ad suspendsched(void)
     1.94    bouyer {
1.166.2.4        ad 	CPU_INFO_ITERATOR cii;
1.166.2.4        ad 	struct cpu_info *ci;
    1.122   thorpej 	struct lwp *l;
1.166.2.2        ad 	struct proc *p;
     1.94    bouyer
1.166.2.4        ad 	/*
1.166.2.4        ad 	 * We do this by process in order not to violate the locking rules.
1.166.2.4        ad 	 */
1.166.2.4        ad 	rw_enter(&proclist_lock, RW_READER);
1.166.2.4        ad 	PROCLIST_FOREACH(p, &allproc) {
1.166.2.4        ad 		mutex_enter(&p->p_smutex);
1.166.2.4        ad
1.166.2.4        ad 		if ((p->p_flag & P_SYSTEM) != 0) {
1.166.2.4        ad 			mutex_exit(&p->p_smutex);
     1.94    bouyer 			continue;
1.166.2.4        ad 		}
    1.122   thorpej
1.166.2.4        ad 		p->p_stat = SSTOP;
1.166.2.2        ad
1.166.2.4        ad 		LIST_FOREACH(l, &p->p_lwps, l_sibling) {
1.166.2.4        ad 			if (l == curlwp)
1.166.2.4        ad 				continue;
1.166.2.4        ad
1.166.2.4        ad 			lwp_lock(l);
1.166.2.4        ad
1.166.2.4        ad 			/*
1.166.2.4        ad 			 * Set L_WREBOOT so that the LWP will suspend itself
1.166.2.4        ad 			 * when it tries to return to user mode.  We want to
1.166.2.4        ad 			 * try to get as many LWPs as possible to
1.166.2.4        ad 			 * the user / kernel boundary, so that they will
1.166.2.4        ad 			 * release any locks that they hold.
1.166.2.4        ad 			 */
1.166.2.4        ad 			l->l_flag |= (L_WREBOOT | L_WSUSPEND);
1.166.2.4        ad
1.166.2.4        ad 			if (l->l_stat == LSSLEEP &&
1.166.2.4        ad 			    (l->l_flag & L_SINTR) != 0) {
1.166.2.4        ad 				/* setrunnable() will release the lock. */
1.166.2.4        ad 				setrunnable(l);
1.166.2.4        ad 				continue;
1.166.2.4        ad 			}
1.166.2.4        ad
1.166.2.4        ad 			lwp_unlock(l);
     1.94    bouyer 		}
1.166.2.4        ad
1.166.2.4        ad 		mutex_exit(&p->p_smutex);
     1.94    bouyer 	}
1.166.2.4        ad 	rw_exit(&proclist_lock);
1.166.2.4        ad
1.166.2.4        ad 	/*
1.166.2.4        ad 	 * Kick all CPUs to make them preempt any LWPs running in user mode.
1.166.2.4        ad 	 * They'll trap into the kernel and suspend themselves in userret().
1.166.2.4        ad 	 */
1.166.2.4        ad 	sched_lock(0);
1.166.2.4        ad 	for (CPU_INFO_FOREACH(cii, ci))
1.166.2.4        ad 		cpu_need_resched(ci);
1.166.2.4        ad 	sched_unlock(0);
     1.94    bouyer }
    1.113  gmcgarry
    1.113  gmcgarry /*
    1.151      yamt  * scheduler_fork_hook:
    1.151      yamt  *
    1.151      yamt  *	Inherit the parent's scheduler history.
                     *
                     *	The child's p_estcpu and p_estcpu_inherited both start at the
                     *	parent's p_estcpu, and p_forktime records the current
                     *	schedcpu_ticks.  The caller must hold parent->p_smutex.
    1.151      yamt  */
    1.151      yamt void
    1.151      yamt scheduler_fork_hook(struct proc *parent, struct proc *child)
    1.151      yamt {
    1.151      yamt
1.166.2.4        ad 	LOCK_ASSERT(mutex_owned(&parent->p_smutex));
1.166.2.4        ad
    1.157      yamt 	child->p_estcpu = child->p_estcpu_inherited = parent->p_estcpu;
    1.157      yamt 	child->p_forktime = schedcpu_ticks;
    1.151      yamt }
    1.151      yamt
    1.151      yamt /*
    1.151      yamt  * scheduler_wait_hook:
    1.151      yamt  *
    1.151      yamt  *	Chargeback parents for the sins of their children.
                     *
                     *	The estcpu the child inherited at fork is decayed over the ticks
                     *	elapsed since p_forktime.  Whatever the child's p_estcpu holds
                     *	above that decayed value is added to the parent's p_estcpu,
                     *	clamped by ESTCPULIM().  parent->p_smutex is taken and released
                     *	here.
    1.151      yamt  */
    1.151      yamt void
    1.151      yamt scheduler_wait_hook(struct proc *parent, struct proc *child)
    1.151      yamt {
    1.157      yamt 	fixpt_t loadfac = loadfactor(averunnable.ldavg[0]);
    1.157      yamt 	fixpt_t estcpu;
    1.151      yamt
    1.151      yamt 	/* XXX Only if parent != init?? */
    1.157      yamt
1.166.2.2        ad 	mutex_enter(&parent->p_smutex);
    1.157      yamt 	estcpu = decay_cpu_batch(loadfac, child->p_estcpu_inherited,
    1.157      yamt 	    schedcpu_ticks - child->p_forktime);
1.166.2.2        ad 	if (child->p_estcpu > estcpu)
    1.157      yamt 		parent->p_estcpu =
    1.157      yamt 		    ESTCPULIM(parent->p_estcpu + child->p_estcpu - estcpu);
1.166.2.2        ad 	mutex_exit(&parent->p_smutex);
    1.151      yamt }
    1.151      yamt
    1.151      yamt /*
1.166.2.4        ad  * sched_kpri:
1.166.2.4        ad  *
1.166.2.4        ad  *	Give an LWP a priority boost before it sleeps.  Currently we scale
1.166.2.4        ad  *	user priorities into the range 60 -> 40, and kernel priorities into
1.166.2.4        ad  *	40 -> 0.
                     *
                     *	The boosted value is looked up in a 128-entry table indexed by
                     *	l->l_priority and returned; the LWP itself is not modified.
1.166.2.1        ad  */
1.166.2.1        ad int
1.166.2.1        ad sched_kpri(struct lwp *l)
1.166.2.1        ad {
1.166.2.6        ad 	static const uint8_t kpri_tab[] = {
1.166.2.4        ad 		 0,   0,   1,   2,   3,   4,   4,   5,
1.166.2.4        ad 		 6,   7,   8,   8,   9,  10,  11,  12,
1.166.2.4        ad 		12,  13,  14,  15,  16,  16,  17,  18,
1.166.2.4        ad 		19,  20,  20,  21,  22,  23,  24,  24,
1.166.2.4        ad 		25,  26,  27,  28,  28,  29,  30,  31,
1.166.2.4        ad 		32,  32,  33,  34,  35,  36,  36,  37,
1.166.2.4        ad 		38,  39,  40,  40,  40,  40,  41,  41,
1.166.2.4        ad 		41,  41,  42,  42,  42,  42,  43,  43,
1.166.2.4        ad 		43,  43,  44,  44,  44,  44,  45,  45,
1.166.2.4        ad 		45,  45,  46,  46,  46,  47,  47,  47,
1.166.2.4        ad 		47,  48,  48,  48,  48,  49,  49,  49,
1.166.2.4        ad 		49,  50,  50,  50,  50,  51,  51,  51,
1.166.2.4        ad 		51,  52,  52,  52,  52,  53,  53,  53,
1.166.2.4        ad 		54,  54,  54,  54,  55,  55,  55,  55,
1.166.2.4        ad 		56,  56,  56,  56,  57,  57,  57,  57,
1.166.2.4        ad 		58,  58,  58,  58,  59,  59,  59,  60,
1.166.2.4        ad 	};
1.166.2.4        ad
                    	/* A priority outside the table would read past its end. */
                    	KASSERT(l->l_priority < sizeof(kpri_tab) / sizeof(kpri_tab[0]));
                    
1.166.2.4        ad 	return kpri_tab[l->l_priority];
1.166.2.1        ad }
1.166.2.1        ad
1.166.2.4        ad /*
1.166.2.4        ad  * sched_unsleep:
1.166.2.4        ad  *
1.166.2.4        ad  *	This is called when the LWP has not been awoken normally but instead
1.166.2.4        ad  *	interrupted: for example, if the sleep timed out.  Because of this,
1.166.2.4        ad  *	it's not a valid action for running or idle LWPs.
                     *
                     *	Reaching this function is therefore treated as fatal: the LWP's
                     *	lock is released and the kernel panics.
1.166.2.4        ad  */
1.166.2.3        ad void
1.166.2.4        ad sched_unsleep(struct lwp *l)
1.166.2.3        ad {
1.166.2.3        ad
1.166.2.4        ad 	lwp_unlock(l);
1.166.2.4        ad 	panic("sched_unsleep");
1.166.2.3        ad }
1.166.2.3        ad
1.166.2.4        ad /*
1.166.2.4        ad  * sched_changepri:
1.166.2.4        ad  *
1.166.2.4        ad  *	Adjust the priority of an LWP.
                     *
                     *	An LWP that is not LSRUN, or not L_INMEM, just has l_priority
                     *	overwritten.  Otherwise it is taken off its run queue, given the
                     *	new priority and queued again, and the CPU it last ran on (or the
                     *	current CPU) is asked to reschedule if "pri" is numerically lower
                     *	than that CPU's current priority.
1.166.2.4        ad  */
1.166.2.3        ad void
1.166.2.4        ad sched_changepri(struct lwp *l, int pri)
1.166.2.3        ad {
1.166.2.4        ad 	struct cpu_info *ci;
1.166.2.3        ad
1.166.2.4        ad 	/*
1.166.2.4        ad 	 * XXXSMP
1.166.2.4        ad 	 * Since l->l_cpu persists across a context switch,
1.166.2.4        ad 	 * this gives us *very weak* processor affinity, in
1.166.2.4        ad 	 * that we notify the CPU on which the process last
1.166.2.4        ad 	 * ran that it should try to switch.
1.166.2.4        ad 	 *
1.166.2.4        ad 	 * This does not guarantee that the process will run on
1.166.2.4        ad 	 * that processor next, because another processor might
1.166.2.4        ad 	 * grab it the next time it performs a context switch.
1.166.2.4        ad 	 *
1.166.2.4        ad 	 * This also does not handle the case where its last
1.166.2.4        ad 	 * CPU is running a higher-priority process, but every
1.166.2.4        ad 	 * other CPU is running a lower-priority process.  There
1.166.2.4        ad 	 * are ways to handle this situation, but they're not
1.166.2.4        ad 	 * currently very pretty, and we also need to weigh the
1.166.2.4        ad 	 * cost of moving a process from one CPU to another.
1.166.2.4        ad 	 */
1.166.2.4        ad 	if (l->l_stat != LSRUN || (l->l_flag & L_INMEM) == 0) {
1.166.2.4        ad 		l->l_priority = pri;
1.166.2.4        ad 		return;
1.166.2.4        ad 	}
1.166.2.4        ad
1.166.2.4        ad 	LOCK_ASSERT(lwp_locked(l, &sched_mutex));
1.166.2.4        ad
                    	/*
                    	 * The queue is derived from l_priority, so the LWP must be
                    	 * dequeued before the priority changes and requeued after.
                    	 */
1.166.2.4        ad 	remrunqueue(l);
1.166.2.4        ad 	l->l_priority = pri;
1.166.2.4        ad 	setrunqueue(l);
1.166.2.4        ad
1.166.2.4        ad 	ci = (l->l_cpu != NULL) ? l->l_cpu : curcpu();
1.166.2.4        ad 	if (pri < ci->ci_schedstate.spc_curpriority)
1.166.2.4        ad 		cpu_need_resched(ci);
1.166.2.3        ad }
1.166.2.3        ad
1.166.2.1        ad /*
    1.113  gmcgarry  * Low-level routines to access the run queue.  Optimised assembler
    1.113  gmcgarry  * routines can override these.
    1.113  gmcgarry  */
    1.113  gmcgarry
    1.113  gmcgarry #ifndef __HAVE_MD_RUNQUEUE
    1.115  nisimura
    1.130   nathanw /*
    1.134      matt  * On some architectures, it's faster to use a MSB ordering for the priorities
    1.134      matt  * than the traditional LSB ordering.
                     *
                     * RQMASK(n) is the bit for run queue n.  Both constants are unsigned
                     * so that the bit for queue 31 is formed without shifting a signed
                     * value into its sign bit.
    1.134      matt  */
    1.134      matt #ifdef __HAVE_BIGENDIAN_BITOPS
    1.134      matt #define	RQMASK(n) (0x80000000U >> (n))
    1.134      matt #else
    1.134      matt #define	RQMASK(n) (0x00000001U << (n))
    1.134      matt #endif
    1.134      matt
    1.134      matt /*
    1.115  nisimura  * The primitives that manipulate the run queues.  sched_whichqs tells which
    1.115  nisimura  * of the 32 queues in sched_qs have LWPs in them.  setrunqueue() puts LWPs
    1.115  nisimura  * into queues, remrunqueue() removes them from queues.  The running LWP is
    1.115  nisimura  * on no queue; other runnable LWPs are on a queue selected by l->l_priority
    1.115  nisimura  * divided by PPQ (4), which shrinks the 0-127 range of priorities into the 32
    1.115  nisimura  * available queues.
    1.130   nathanw  */
    1.146      matt #ifdef RQDEBUG
                    /*
                     * Consistency check for run queue "whichq" (RQDEBUG kernels only).
                     *
                     * The queue is walked from its head along l_forw, checking that each
                     * entry is LSRUN and that its neighbours link back to it.  The
                     * sched_whichqs bit must agree with whether the queue is empty.  If
                     * "l" is non-NULL it must also be found on the queue.  Every problem
                     * is printed; a structural inconsistency ends in a panic.
                     */
    1.146      matt static void
    1.146      matt checkrunqueue(int whichq, struct lwp *l)
    1.146      matt {
    1.146      matt 	const struct prochd * const rq = &sched_qs[whichq];
    1.146      matt 	struct lwp *l2;
    1.146      matt 	int found = 0;
    1.146      matt 	int die = 0;
    1.146      matt 	int empty = 1;
    1.164  christos 	for (l2 = rq->ph_link; l2 != (const void*) rq; l2 = l2->l_forw) {
                    		/* A wrong state is reported but is not fatal by itself. */
    1.146      matt 		if (l2->l_stat != LSRUN) {
    1.146      matt 			printf("checkrunqueue[%d]: lwp %p state (%d) "
    1.146      matt 			    " != LSRUN\n", whichq, l2, l2->l_stat);
    1.146      matt 		}
    1.146      matt 		if (l2->l_back->l_forw != l2) {
    1.146      matt 			printf("checkrunqueue[%d]: lwp %p back-qptr (%p) "
    1.146      matt 			    "corrupt %p\n", whichq, l2, l2->l_back,
    1.146      matt 			    l2->l_back->l_forw);
    1.146      matt 			die = 1;
    1.146      matt 		}
    1.146      matt 		if (l2->l_forw->l_back != l2) {
    1.146      matt 			printf("checkrunqueue[%d]: lwp %p forw-qptr (%p) "
    1.146      matt 			    "corrupt %p\n", whichq, l2, l2->l_forw,
    1.146      matt 			    l2->l_forw->l_back);
    1.146      matt 			die = 1;
    1.146      matt 		}
    1.146      matt 		if (l2 == l)
    1.146      matt 			found = 1;
    1.146      matt 		empty = 0;
    1.146      matt 	}
    1.146      matt 	if (empty && (sched_whichqs & RQMASK(whichq)) != 0) {
    1.146      matt 		printf("checkrunqueue[%d]: bit set for empty run-queue %p\n",
    1.146      matt 		    whichq, rq);
    1.146      matt 		die = 1;
    1.146      matt 	} else if (!empty && (sched_whichqs & RQMASK(whichq)) == 0) {
    1.146      matt 		printf("checkrunqueue[%d]: bit clear for non-empty "
    1.146      matt 		    "run-queue %p\n", whichq, rq);
    1.146      matt 		die = 1;
    1.146      matt 	}
    1.146      matt 	if (l != NULL && (sched_whichqs & RQMASK(whichq)) == 0) {
    1.146      matt 		printf("checkrunqueue[%d]: bit clear for active lwp %p\n",
    1.146      matt 		    whichq, l);
    1.146      matt 		die = 1;
    1.146      matt 	}
    1.146      matt 	if (l != NULL && empty) {
    1.146      matt 		printf("checkrunqueue[%d]: empty run-queue %p with "
    1.146      matt 		    "active lwp %p\n", whichq, rq, l);
    1.146      matt 		die = 1;
    1.146      matt 	}
    1.146      matt 	if (l != NULL && !found) {
                    		/* Terminate the line so the panic text starts on its own. */
    1.146      matt 		printf("checkrunqueue[%d]: lwp %p not in runqueue %p!\n",
    1.146      matt 		    whichq, l, rq);
    1.146      matt 		die = 1;
    1.146      matt 	}
    1.146      matt 	if (die)
    1.146      matt 		panic("checkrunqueue: inconsistency found");
    1.146      matt }
    1.146      matt #endif /* RQDEBUG */
    1.146      matt
    1.113  gmcgarry void
    1.122   thorpej setrunqueue(struct lwp *l)
    1.113  gmcgarry {
                    	/*
                    	 * Append l to the tail of the run queue selected by
                    	 * l->l_priority / PPQ and set that queue's bit in
                    	 * sched_whichqs.  l must be locked via sched_mutex; DIAGNOSTIC
                    	 * kernels panic if it is not LSRUN or already has l_back set.
                    	 */
    1.113  gmcgarry 	struct prochd *rq;
    1.122   thorpej 	struct lwp *prev;
    1.152      yamt 	const int whichq = l->l_priority / PPQ;
    1.113  gmcgarry
1.166.2.3        ad 	LOCK_ASSERT(lwp_locked(l, &sched_mutex));
1.166.2.2        ad
    1.146      matt #ifdef RQDEBUG
    1.146      matt 	checkrunqueue(whichq, NULL);
    1.146      matt #endif
    1.113  gmcgarry #ifdef DIAGNOSTIC
1.166.2.2        ad 	if (l->l_back != NULL || l->l_stat != LSRUN)
    1.113  gmcgarry 		panic("setrunqueue");
    1.113  gmcgarry #endif
    1.134      matt 	sched_whichqs |= RQMASK(whichq);
    1.113  gmcgarry 	rq = &sched_qs[whichq];
                    	/* Link l between the current tail (ph_rlink) and the head. */
    1.113  gmcgarry 	prev = rq->ph_rlink;
    1.122   thorpej 	l->l_forw = (struct lwp *)rq;
    1.122   thorpej 	rq->ph_rlink = l;
    1.122   thorpej 	prev->l_forw = l;
    1.122   thorpej 	l->l_back = prev;
    1.146      matt #ifdef RQDEBUG
    1.146      matt 	checkrunqueue(whichq, l);
    1.146      matt #endif
    1.113  gmcgarry }
    1.113  gmcgarry
    1.113  gmcgarry void
    1.122   thorpej remrunqueue(struct lwp *l)
    1.113  gmcgarry {
                    	/*
                    	 * Unlink l from the run queue selected by l->l_priority / PPQ
                    	 * and clear l_back, which marks it as no longer queued.  The
                    	 * queue's bit in sched_whichqs is cleared if the queue became
                    	 * empty.  l must be locked via sched_mutex.
                    	 */
    1.122   thorpej 	struct lwp *prev, *next;
    1.152      yamt 	const int whichq = l->l_priority / PPQ;
1.166.2.2        ad
1.166.2.2        ad 	LOCK_ASSERT(lwp_locked(l, &sched_mutex));
1.166.2.2        ad
    1.146      matt #ifdef RQDEBUG
    1.146      matt 	checkrunqueue(whichq, l);
    1.146      matt #endif
1.166.2.2        ad
1.166.2.2        ad #if defined(DIAGNOSTIC)
1.166.2.2        ad 	if ((sched_whichqs & RQMASK(whichq)) == 0) {
1.166.2.2        ad 		/* Shouldn't happen - interrupts disabled. */
    1.146      matt 		panic("remrunqueue: bit %d not set", whichq);
1.166.2.2        ad 	}
                    	/* Report an unqueued LWP as such, not as a missing bit. */
                    	if (l->l_back == NULL)
                    		panic("remrunqueue: lwp %p not on a run queue", l);
    1.113  gmcgarry #endif
    1.122   thorpej 	prev = l->l_back;
    1.122   thorpej 	l->l_back = NULL;
    1.122   thorpej 	next = l->l_forw;
    1.122   thorpej 	prev->l_forw = next;
    1.122   thorpej 	next->l_back = prev;
                    	/* Both neighbours being the same node means only the head is left. */
    1.113  gmcgarry 	if (prev == next)
    1.134      matt 		sched_whichqs &= ~RQMASK(whichq);
    1.146      matt #ifdef RQDEBUG
    1.146      matt 	checkrunqueue(whichq, NULL);
    1.146      matt #endif
    1.113  gmcgarry }
    1.113  gmcgarry
    1.134      matt #undef RQMASK
    1.134      matt #endif /* !defined(__HAVE_MD_RUNQUEUE) */