1 1.166.2.3 ad /* $NetBSD: kern_synch.c,v 1.166.2.3 2006/10/24 21:10:21 ad Exp $ */
2 1.63 thorpej
3 1.63 thorpej /*-
4 1.166.2.2 ad * Copyright (c) 1999, 2000, 2004, 2006 The NetBSD Foundation, Inc.
5 1.63 thorpej * All rights reserved.
6 1.63 thorpej *
7 1.63 thorpej * This code is derived from software contributed to The NetBSD Foundation
8 1.63 thorpej * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
9 1.166.2.2 ad * NASA Ames Research Center, by Charles M. Hannum, and by Andrew Doran.
10 1.63 thorpej *
11 1.63 thorpej * Redistribution and use in source and binary forms, with or without
12 1.63 thorpej * modification, are permitted provided that the following conditions
13 1.63 thorpej * are met:
14 1.63 thorpej * 1. Redistributions of source code must retain the above copyright
15 1.63 thorpej * notice, this list of conditions and the following disclaimer.
16 1.63 thorpej * 2. Redistributions in binary form must reproduce the above copyright
17 1.63 thorpej * notice, this list of conditions and the following disclaimer in the
18 1.63 thorpej * documentation and/or other materials provided with the distribution.
19 1.63 thorpej * 3. All advertising materials mentioning features or use of this software
20 1.63 thorpej * must display the following acknowledgement:
21 1.63 thorpej * This product includes software developed by the NetBSD
22 1.63 thorpej * Foundation, Inc. and its contributors.
23 1.63 thorpej * 4. Neither the name of The NetBSD Foundation nor the names of its
24 1.63 thorpej * contributors may be used to endorse or promote products derived
25 1.63 thorpej * from this software without specific prior written permission.
26 1.63 thorpej *
27 1.63 thorpej * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
28 1.63 thorpej * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
29 1.63 thorpej * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
30 1.63 thorpej * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
31 1.63 thorpej * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32 1.63 thorpej * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33 1.63 thorpej * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34 1.63 thorpej * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35 1.63 thorpej * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36 1.63 thorpej * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37 1.63 thorpej * POSSIBILITY OF SUCH DAMAGE.
38 1.63 thorpej */
39 1.26 cgd
40 1.26 cgd /*-
41 1.26 cgd * Copyright (c) 1982, 1986, 1990, 1991, 1993
42 1.26 cgd * The Regents of the University of California. All rights reserved.
43 1.26 cgd * (c) UNIX System Laboratories, Inc.
44 1.26 cgd * All or some portions of this file are derived from material licensed
45 1.26 cgd * to the University of California by American Telephone and Telegraph
46 1.26 cgd * Co. or Unix System Laboratories, Inc. and are reproduced herein with
47 1.26 cgd * the permission of UNIX System Laboratories, Inc.
48 1.26 cgd *
49 1.26 cgd * Redistribution and use in source and binary forms, with or without
50 1.26 cgd * modification, are permitted provided that the following conditions
51 1.26 cgd * are met:
52 1.26 cgd * 1. Redistributions of source code must retain the above copyright
53 1.26 cgd * notice, this list of conditions and the following disclaimer.
54 1.26 cgd * 2. Redistributions in binary form must reproduce the above copyright
55 1.26 cgd * notice, this list of conditions and the following disclaimer in the
56 1.26 cgd * documentation and/or other materials provided with the distribution.
57 1.136 agc * 3. Neither the name of the University nor the names of its contributors
58 1.26 cgd * may be used to endorse or promote products derived from this software
59 1.26 cgd * without specific prior written permission.
60 1.26 cgd *
61 1.26 cgd * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
62 1.26 cgd * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
63 1.26 cgd * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
64 1.26 cgd * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
65 1.26 cgd * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
66 1.26 cgd * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
67 1.26 cgd * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
68 1.26 cgd * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
69 1.26 cgd * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
70 1.26 cgd * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
71 1.26 cgd * SUCH DAMAGE.
72 1.26 cgd *
73 1.50 fvdl * @(#)kern_synch.c 8.9 (Berkeley) 5/19/95
74 1.26 cgd */
75 1.106 lukem
76 1.106 lukem #include <sys/cdefs.h>
77 1.166.2.3 ad __KERNEL_RCSID(0, "$NetBSD: kern_synch.c,v 1.166.2.3 2006/10/24 21:10:21 ad Exp $");
78 1.48 mrg
79 1.52 jonathan #include "opt_ddb.h"
80 1.51 thorpej #include "opt_ktrace.h"
81 1.109 yamt #include "opt_kstack.h"
82 1.82 thorpej #include "opt_lockdebug.h"
83 1.83 thorpej #include "opt_multiprocessor.h"
84 1.110 briggs #include "opt_perfctrs.h"
85 1.26 cgd
86 1.166.2.2 ad #define __MUTEX_PRIVATE
87 1.166.2.2 ad
88 1.26 cgd #include <sys/param.h>
89 1.26 cgd #include <sys/systm.h>
90 1.68 thorpej #include <sys/callout.h>
91 1.26 cgd #include <sys/proc.h>
92 1.26 cgd #include <sys/kernel.h>
93 1.26 cgd #include <sys/buf.h>
94 1.111 briggs #if defined(PERFCTRS)
95 1.110 briggs #include <sys/pmc.h>
96 1.111 briggs #endif
97 1.26 cgd #include <sys/signalvar.h>
98 1.26 cgd #include <sys/resourcevar.h>
99 1.55 ross #include <sys/sched.h>
100 1.122 thorpej #include <sys/sa.h>
101 1.122 thorpej #include <sys/savar.h>
102 1.161 elad #include <sys/kauth.h>
103 1.166.2.2 ad #include <sys/sleepq.h>
104 1.166.2.2 ad #include <sys/lockdebug.h>
105 1.47 mrg
106 1.47 mrg #include <uvm/uvm_extern.h>
107 1.47 mrg
108 1.26 cgd #ifdef KTRACE
109 1.26 cgd #include <sys/ktrace.h>
110 1.26 cgd #endif
111 1.26 cgd
112 1.26 cgd #include <machine/cpu.h>
113 1.34 christos
114 1.26 cgd int lbolt; /* once a second sleep address */
115 1.88 sommerfe int rrticks; /* number of hardclock ticks per roundrobin() */
116 1.26 cgd
117 1.73 thorpej /*
118 1.73 thorpej * The global scheduler state.
119 1.73 thorpej */
120 1.166.2.2 ad kmutex_t sched_mutex; /* global run queue mutex */
121 1.166.2.2 ad struct prochd sched_qs[RUNQUE_NQS]; /* run queues */
122 1.159 perry volatile uint32_t sched_whichqs; /* bitmap of non-empty queues */
123 1.34 christos
124 1.166.2.2 ad void schedcpu(void *);
125 1.166.2.2 ad void updatepri(struct lwp *);
126 1.166.2.2 ad void sa_awaken(struct lwp *);
127 1.63 thorpej
128 1.143 yamt struct callout schedcpu_ch = CALLOUT_INITIALIZER_SETFUNC(schedcpu, NULL);
129 1.157 yamt static unsigned int schedcpu_ticks;
130 1.122 thorpej
131 1.26 cgd /*
132 1.26 cgd * Force switch among equal priority processes every 100ms.
133 1.88 sommerfe * Called from hardclock every hz/10 == rrticks hardclock ticks.
134 1.26 cgd */
135 1.26 cgd /* ARGSUSED */
136 1.26 cgd void
137 1.89 sommerfe roundrobin(struct cpu_info *ci)
138 1.26 cgd {
139 1.89 sommerfe struct schedstate_percpu *spc = &ci->ci_schedstate;
140 1.26 cgd
141 1.88 sommerfe spc->spc_rrticks = rrticks;
142 1.130 nathanw
143 1.122 thorpej if (curlwp != NULL) {
144 1.73 thorpej if (spc->spc_flags & SPCF_SEENRR) {
145 1.69 thorpej /*
146 1.69 thorpej * The process has already been through a roundrobin
147 1.69 thorpej * without switching and may be hogging the CPU.
148 1.69 thorpej * Indicate that the process should yield.
149 1.69 thorpej */
150 1.73 thorpej spc->spc_flags |= SPCF_SHOULDYIELD;
151 1.69 thorpej } else
152 1.73 thorpej spc->spc_flags |= SPCF_SEENRR;
153 1.69 thorpej }
154 1.166.2.2 ad cpu_need_resched(curcpu());
155 1.26 cgd }
156 1.26 cgd
157 1.153 yamt #define PPQ (128 / RUNQUE_NQS) /* priorities per queue */
158 1.153 yamt #define NICE_WEIGHT 2 /* priorities per nice level */
159 1.153 yamt
160 1.153 yamt #define ESTCPU_SHIFT 11
161 1.153 yamt #define ESTCPU_MAX ((NICE_WEIGHT * PRIO_MAX - PPQ) << ESTCPU_SHIFT)
162 1.153 yamt #define ESTCPULIM(e) min((e), ESTCPU_MAX)
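
/*
 * Worked example (a sketch, assuming the stock RUNQUE_NQS of 32 and
 * PRIO_MAX of 20 from <sys/resource.h>): PPQ == 128 / 32 == 4, so
 * ESTCPU_MAX == (NICE_WEIGHT * PRIO_MAX - PPQ) << ESTCPU_SHIFT
 *            == (2 * 20 - 4) << 11 == 73728.
 * In other words, resetpriority() below can add at most 36 to PUSER on
 * account of accumulated CPU usage alone.
 */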
163 1.153 yamt
164 1.26 cgd /*
165 1.26 cgd * Constants for digital decay and forget:
166 1.26 cgd * 90% of (p_estcpu) usage in 5 * loadav time
167 1.26 cgd * 95% of (p_pctcpu) usage in 60 seconds (load insensitive)
168 1.26 cgd * Note that, as ps(1) mentions, this can let percentages
169 1.26 cgd * total over 100% (I've seen 137.9% for 3 processes).
170 1.26 cgd *
171 1.26 cgd * Note that hardclock updates p_estcpu and p_cpticks independently.
172 1.26 cgd *
173 1.26 cgd * We wish to decay away 90% of p_estcpu in (5 * loadavg) seconds.
174 1.26 cgd * That is, the system wants to compute a value of decay such
175 1.26 cgd * that the following for loop:
176 1.26 cgd * for (i = 0; i < (5 * loadavg); i++)
177 1.26 cgd * p_estcpu *= decay;
178 1.26 cgd * will compute
179 1.26 cgd * p_estcpu *= 0.1;
180 1.26 cgd * for all values of loadavg:
181 1.26 cgd *
182 1.26 cgd * Mathematically this loop can be expressed by saying:
183 1.26 cgd * decay ** (5 * loadavg) ~= .1
184 1.26 cgd *
185 1.26 cgd * The system computes decay as:
186 1.26 cgd * decay = (2 * loadavg) / (2 * loadavg + 1)
187 1.26 cgd *
188 1.26 cgd * We wish to prove that the system's computation of decay
189 1.26 cgd * will always fulfill the equation:
190 1.26 cgd * decay ** (5 * loadavg) ~= .1
191 1.26 cgd *
192 1.26 cgd * If we compute b as:
193 1.26 cgd * b = 2 * loadavg
194 1.26 cgd * then
195 1.26 cgd * decay = b / (b + 1)
196 1.26 cgd *
197 1.26 cgd * We now need to prove two things:
198 1.26 cgd * 1) Given factor ** (5 * loadavg) ~= .1, prove factor == b/(b+1)
199 1.26 cgd * 2) Given b/(b+1) ** power ~= .1, prove power == (5 * loadavg)
200 1.130 nathanw *
201 1.26 cgd * Facts:
202 1.26 cgd * For x close to zero, exp(x) =~ 1 + x, since
203 1.26 cgd * exp(x) = 0! + x**1/1! + x**2/2! + ... .
204 1.26 cgd * therefore exp(-1/b) =~ 1 - (1/b) = (b-1)/b.
205 1.26 cgd * For x close to zero, ln(1+x) =~ x, since
206 1.26 cgd * ln(1+x) = x - x**2/2 + x**3/3 - ... -1 < x < 1
207 1.26 cgd * therefore ln(b/(b+1)) = ln(1 - 1/(b+1)) =~ -1/(b+1).
208 1.26 cgd * ln(.1) =~ -2.30
209 1.26 cgd *
210 1.26 cgd * Proof of (1):
211 1.26 cgd * Solve (factor)**(power) =~ .1 given power (5*loadav):
212 1.26 cgd * solving for factor,
213 1.26 cgd * ln(factor) =~ -2.30/(5*loadav), or
214 1.26 cgd * factor =~ exp(-1/((5/2.30)*loadav)) =~ exp(-1/(2*loadav)) =
215 1.26 cgd * exp(-1/b) =~ (b-1)/b =~ b/(b+1). QED
216 1.26 cgd *
217 1.26 cgd * Proof of (2):
218 1.26 cgd * Solve (factor)**(power) =~ .1 given factor == (b/(b+1)):
219 1.26 cgd * solving for power,
220 1.26 cgd * power*ln(b/(b+1)) =~ -2.30, or
221 1.26 cgd * power =~ 2.3 * (b + 1) = 4.6*loadav + 2.3 =~ 5*loadav. QED
222 1.26 cgd *
223 1.26 cgd * Actual power values for the implemented algorithm are as follows:
224 1.26 cgd * loadav: 1 2 3 4
225 1.26 cgd * power: 5.68 10.32 14.94 19.55
226 1.26 cgd */
227 1.26 cgd
228 1.26 cgd /* calculations for digital decay to forget 90% of usage in 5*loadav sec */
229 1.26 cgd #define loadfactor(loadav) (2 * (loadav))
230 1.153 yamt
231 1.153 yamt static fixpt_t
232 1.153 yamt decay_cpu(fixpt_t loadfac, fixpt_t estcpu)
233 1.153 yamt {
234 1.153 yamt
235 1.153 yamt if (estcpu == 0) {
236 1.153 yamt return 0;
237 1.153 yamt }
238 1.153 yamt
239 1.153 yamt #if !defined(_LP64)
240 1.153 yamt /* avoid 64bit arithmetics. */
241 1.153 yamt #define FIXPT_MAX ((fixpt_t)((UINTMAX_C(1) << sizeof(fixpt_t) * CHAR_BIT) - 1))
242 1.153 yamt if (__predict_true(loadfac <= FIXPT_MAX / ESTCPU_MAX)) {
243 1.153 yamt return estcpu * loadfac / (loadfac + FSCALE);
244 1.153 yamt }
245 1.153 yamt #endif /* !defined(_LP64) */
246 1.153 yamt
247 1.153 yamt return (uint64_t)estcpu * loadfac / (loadfac + FSCALE);
248 1.153 yamt }
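
/*
 * Worked example (sketch): with a load average of 2, loadfac is 4 in
 * whole units, so decay == 4 / (4 + 1) == 0.8.  The table above gives
 * power =~ 10.32 for loadav == 2, and 0.8 ** 10.32 =~ 0.1; roughly 90%
 * of p_estcpu is indeed forgotten after about 5 * loadavg == 10
 * invocations of schedcpu().
 */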
249 1.26 cgd
250 1.157 yamt /*
251 1.157 yamt * For all load averages >= 1 and max p_estcpu of (255 << ESTCPU_SHIFT),
252 1.157 yamt * sleeping for at least seven times the loadfactor will decay p_estcpu to
253 1.157 yamt * less than (1 << ESTCPU_SHIFT).
254 1.157 yamt *
255 1.157 yamt * note that our ESTCPU_MAX is actually much smaller than (255 << ESTCPU_SHIFT).
256 1.157 yamt */
257 1.157 yamt static fixpt_t
258 1.157 yamt decay_cpu_batch(fixpt_t loadfac, fixpt_t estcpu, unsigned int n)
259 1.157 yamt {
260 1.157 yamt
261 1.157 yamt if ((n << FSHIFT) >= 7 * loadfac) {
262 1.157 yamt return 0;
263 1.157 yamt }
264 1.157 yamt
265 1.157 yamt while (estcpu != 0 && n > 1) {
266 1.157 yamt estcpu = decay_cpu(loadfac, estcpu);
267 1.157 yamt n--;
268 1.157 yamt }
269 1.157 yamt
270 1.157 yamt return estcpu;
271 1.157 yamt }
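
/*
 * For example (sketch): with a load average of 1, loadfac corresponds
 * to 2, so the early-exit test (n << FSHIFT) >= 7 * loadfac fires for
 * n >= 14; an LWP that has slept through 14 or more schedcpu() runs has
 * its p_estcpu decayed straight to zero instead of being iterated.
 */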
272 1.157 yamt
273 1.26 cgd /* decay 95% of `p_pctcpu' in 60 seconds; see CCPU_SHIFT before changing */
274 1.26 cgd fixpt_t ccpu = 0.95122942450071400909 * FSCALE; /* exp(-1/20) */
275 1.26 cgd
276 1.26 cgd /*
277 1.26 cgd * If `ccpu' is not equal to `exp(-1/20)' and you still want to use the
278 1.26 cgd * faster/more-accurate formula, you'll have to estimate CCPU_SHIFT below
279 1.26 cgd * and possibly adjust FSHIFT in "param.h" so that (FSHIFT >= CCPU_SHIFT).
280 1.26 cgd *
281 1.26 cgd * To estimate CCPU_SHIFT for exp(-1/20), the following formula was used:
282 1.26 cgd * 1 - exp(-1/20) ~= 0.0487 ~= 0.0488 == 1 (fixed pt, *11* bits).
283 1.26 cgd *
284 1.26 cgd * If you don't want to bother with the faster/more-accurate formula, you
285 1.26 cgd * can set CCPU_SHIFT to (FSHIFT + 1) which will use a slower/less-accurate
286 1.26 cgd * (more general) method of calculating the %age of CPU used by a process.
287 1.26 cgd */
288 1.26 cgd #define CCPU_SHIFT 11
289 1.26 cgd
290 1.26 cgd /*
291 1.26 cgd * Recompute process priorities, every hz ticks.
292 1.26 cgd */
293 1.26 cgd /* ARGSUSED */
294 1.26 cgd void
295 1.77 thorpej schedcpu(void *arg)
296 1.26 cgd {
297 1.71 augustss fixpt_t loadfac = loadfactor(averunnable.ldavg[0]);
298 1.166.2.2 ad struct rlimit *rlim;
299 1.122 thorpej struct lwp *l;
300 1.71 augustss struct proc *p;
301 1.122 thorpej int s, minslp;
302 1.66 ross int clkhz;
303 1.166.2.2 ad long runtm;
304 1.26 cgd
305 1.157 yamt schedcpu_ticks++;
306 1.157 yamt
307 1.166.2.1 ad mutex_enter(&proclist_mutex);
308 1.145 yamt PROCLIST_FOREACH(p, &allproc) {
309 1.26 cgd /*
310 1.166.2.2 ad * Increment time in/out of memory and sleep time (if
311 1.166.2.2 ad * sleeping). We ignore overflow; with 16-bit int's
312 1.26 cgd * (remember them?) overflow takes 45 days.
313 1.166.2.2 ad *
314 1.166.2.2 ad * XXXSMP Should create an activeproc list so that we
315 1.166.2.2 ad * don't touch every proc+LWP in the system on a regular
316 1.166.2.2 ad * basis. l->l_swtime/l->l_slptime can become deltas.
317 1.26 cgd */
318 1.122 thorpej minslp = 2;
319 1.166.2.2 ad runtm = 0;
320 1.166.2.2 ad mutex_enter(&p->p_smutex);
321 1.122 thorpej LIST_FOREACH(l, &p->p_lwps, l_sibling) {
322 1.166.2.2 ad lwp_lock(l);
323 1.166.2.2 ad runtm += l->l_rtime.tv_sec;
324 1.122 thorpej l->l_swtime++;
325 1.130 nathanw if (l->l_stat == LSSLEEP || l->l_stat == LSSTOP ||
326 1.122 thorpej l->l_stat == LSSUSPENDED) {
327 1.122 thorpej l->l_slptime++;
328 1.122 thorpej minslp = min(minslp, l->l_slptime);
329 1.122 thorpej } else
330 1.122 thorpej minslp = 0;
331 1.166.2.2 ad lwp_unlock(l);
332 1.122 thorpej }
333 1.26 cgd p->p_pctcpu = (p->p_pctcpu * ccpu) >> FSHIFT;
334 1.166.2.2 ad
335 1.166.2.2 ad /*
336 1.166.2.2 ad * Check if the process exceeds its CPU resource allocation.
337 1.166.2.2 ad * If over max, kill it. In any case, if it has run for more
338 1.166.2.2 ad * than autonicetime, reduce priority to give others a chance.
339 1.166.2.2 ad */
340 1.166.2.2 ad rlim = &p->p_rlimit[RLIMIT_CPU];
341 1.166.2.2 ad if (runtm >= rlim->rlim_cur) {
342 1.166.2.2 ad if (runtm >= rlim->rlim_max)
343 1.166.2.2 ad psignal(p, SIGKILL);
344 1.166.2.2 ad else {
345 1.166.2.2 ad psignal(p, SIGXCPU);
346 1.166.2.2 ad if (rlim->rlim_cur < rlim->rlim_max)
347 1.166.2.2 ad rlim->rlim_cur += 5;
348 1.166.2.2 ad }
349 1.166.2.2 ad }
350 1.166.2.2 ad if (autonicetime && runtm > autonicetime && p->p_nice == NZERO
351 1.166.2.2 ad && kauth_cred_geteuid(p->p_cred)) {
352 1.166.2.2 ad p->p_nice = autoniceval + NZERO;
353 1.166.2.2 ad resetprocpriority(p);
354 1.166.2.2 ad }
355 1.166.2.2 ad
356 1.26 cgd /*
357 1.26 cgd * If the process has slept the entire second,
358 1.26 cgd * stop recalculating its priority until it wakes up.
359 1.26 cgd */
360 1.166.2.2 ad if (minslp > 1) {
361 1.166.2.2 ad mutex_exit(&p->p_smutex);
362 1.26 cgd continue;
363 1.166.2.2 ad }
364 1.166.2.2 ad /* XXXAD lock */
365 1.26 cgd s = splstatclock(); /* prevent state changes */
366 1.26 cgd /*
367 1.26 cgd * p_pctcpu is only for ps.
368 1.26 cgd */
369 1.66 ross clkhz = stathz != 0 ? stathz : hz;
370 1.26 cgd #if (FSHIFT >= CCPU_SHIFT)
371 1.66 ross p->p_pctcpu += (clkhz == 100)?
372 1.26 cgd ((fixpt_t) p->p_cpticks) << (FSHIFT - CCPU_SHIFT):
373 1.26 cgd 100 * (((fixpt_t) p->p_cpticks)
374 1.66 ross << (FSHIFT - CCPU_SHIFT)) / clkhz;
375 1.26 cgd #else
376 1.26 cgd p->p_pctcpu += ((FSCALE - ccpu) *
377 1.66 ross (p->p_cpticks * FSCALE / clkhz)) >> FSHIFT;
378 1.26 cgd #endif
379 1.26 cgd p->p_cpticks = 0;
380 1.153 yamt p->p_estcpu = decay_cpu(loadfac, p->p_estcpu);
381 1.120 pk splx(s); /* Done with the process CPU ticks update */
382 1.122 thorpej LIST_FOREACH(l, &p->p_lwps, l_sibling) {
383 1.166.2.2 ad lwp_lock(l);
384 1.166.2.2 ad if (l->l_slptime > 1) {
385 1.166.2.2 ad lwp_unlock(l);
386 1.122 thorpej continue;
387 1.166.2.2 ad }
388 1.122 thorpej resetpriority(l);
389 1.122 thorpej if (l->l_priority >= PUSER) {
390 1.122 thorpej if (l->l_stat == LSRUN &&
391 1.122 thorpej (l->l_flag & L_INMEM) &&
392 1.122 thorpej (l->l_priority / PPQ) != (l->l_usrpri / PPQ)) {
393 1.122 thorpej remrunqueue(l);
394 1.122 thorpej l->l_priority = l->l_usrpri;
395 1.122 thorpej setrunqueue(l);
396 1.122 thorpej } else
397 1.122 thorpej l->l_priority = l->l_usrpri;
398 1.122 thorpej }
399 1.166.2.2 ad lwp_unlock(l);
400 1.26 cgd }
401 1.166.2.2 ad mutex_exit(&p->p_smutex);
402 1.26 cgd }
403 1.166.2.1 ad mutex_exit(&proclist_mutex);
404 1.47 mrg uvm_meter();
405 1.67 fvdl wakeup((caddr_t)&lbolt);
406 1.143 yamt callout_schedule(&schedcpu_ch, hz);
407 1.26 cgd }
408 1.26 cgd
409 1.26 cgd /*
410 1.26 cgd * Recalculate the priority of a process after it has slept for a while.
411 1.26 cgd */
412 1.26 cgd void
413 1.122 thorpej updatepri(struct lwp *l)
414 1.26 cgd {
415 1.122 thorpej struct proc *p = l->l_proc;
416 1.83 thorpej fixpt_t loadfac;
417 1.83 thorpej
418 1.166.2.2 ad LOCK_ASSERT(lwp_locked(l, NULL));
419 1.157 yamt KASSERT(l->l_slptime > 1);
420 1.83 thorpej
421 1.83 thorpej loadfac = loadfactor(averunnable.ldavg[0]);
422 1.26 cgd
423 1.157 yamt l->l_slptime--; /* the first time was done in schedcpu */
424 1.157 yamt /* XXX NJWLWP */
425 1.166.2.2 ad /* XXXSMP occasionally unlocked. */
426 1.157 yamt p->p_estcpu = decay_cpu_batch(loadfac, p->p_estcpu, l->l_slptime);
427 1.122 thorpej resetpriority(l);
428 1.26 cgd }
429 1.26 cgd
430 1.26 cgd /*
431 1.166.2.2 ad * During autoconfiguration or after a panic, a sleep will simply lower the
432 1.166.2.2 ad * priority briefly to allow interrupts, then return. The priority to be
433 1.166.2.2 ad * used (safepri) is machine-dependent, thus this value is initialized and
434 1.166.2.2 ad * maintained in the machine-dependent layers. This priority will typically
435 1.166.2.2 ad * be 0, or the lowest priority that is safe for use on the interrupt stack;
436 1.166.2.2 ad * it can be made higher to block network software interrupts after panics.
437 1.26 cgd */
438 1.166.2.2 ad int safepri;
439 1.26 cgd
440 1.26 cgd /*
441 1.166.2.2 ad * ltsleep: see mtsleep() for comments.
442 1.26 cgd */
443 1.26 cgd int
444 1.166.2.2 ad ltsleep(wchan_t ident, int priority, const char *wmesg, int timo,
445 1.166.2.2 ad volatile struct simplelock *interlock)
446 1.26 cgd {
447 1.122 thorpej struct lwp *l = curlwp;
448 1.166.2.2 ad sleepq_t *sq;
449 1.166.2.2 ad int error;
450 1.26 cgd
451 1.166.2.2 ad if (sleepq_dontsleep(l)) {
452 1.166.2.2 ad (void)sleepq_abort(NULL, 0);
453 1.166.2.2 ad if ((priority & PNORELOCK) != 0)
454 1.77 thorpej simple_unlock(interlock);
455 1.166.2.2 ad return 0;
456 1.122 thorpej }
457 1.77 thorpej
458 1.166.2.2 ad sq = sleeptab_lookup(ident);
459 1.166.2.2 ad sleepq_enter(sq, priority, ident, wmesg, timo, priority & PCATCH);
460 1.77 thorpej
461 1.166.2.2 ad if (interlock != NULL) {
462 1.166.2.2 ad LOCK_ASSERT(simple_lock_held(interlock));
463 1.77 thorpej simple_unlock(interlock);
464 1.26 cgd }
465 1.126 pk
466 1.166.2.2 ad error = sleepq_block(sq, timo);
467 1.126 pk
468 1.166.2.2 ad if (interlock != NULL && (priority & PNORELOCK) == 0)
469 1.166.2.2 ad simple_lock(interlock);
470 1.166.2.2 ad
471 1.166.2.2 ad return error;
472 1.26 cgd }
473 1.26 cgd
474 1.166.2.2 ad /*
475 1.166.2.2 ad * General sleep call. Suspends the current process until a wakeup is
476 1.166.2.2 ad * performed on the specified identifier. The process will then be made
477 1.166.2.2 ad * runnable with the specified priority. Sleeps at most timo/hz seconds (0
478 1.166.2.2 ad * means no timeout). If pri includes PCATCH flag, signals are checked
479 1.166.2.2 ad * before and after sleeping, else signals are not checked. Returns 0 if
480 1.166.2.2 ad * awakened, EWOULDBLOCK if the timeout expires. If PCATCH is set and a
481 1.166.2.2 ad * signal needs to be delivered, ERESTART is returned if the current system
482 1.166.2.2 ad * call should be restarted if possible, and EINTR is returned if the system
483 1.166.2.2 ad * call should be interrupted by the signal.
484 1.166.2.2 ad *
485 1.166.2.2 ad * The interlock is held until we are on a sleep queue. The interlock will
486 1.166.2.2 ad * be locked before returning back to the caller unless the PNORELOCK flag
487 1.166.2.2 ad * is specified, in which case the interlock will always be unlocked upon
488 1.166.2.2 ad * return.
489 1.166.2.2 ad */
490 1.166.2.1 ad int
491 1.166.2.2 ad mtsleep(wchan_t ident, int priority, const char *wmesg, int timo,
492 1.166.2.2 ad kmutex_t *mtx)
493 1.166.2.1 ad {
494 1.166.2.1 ad struct lwp *l = curlwp;
495 1.166.2.2 ad sleepq_t *sq;
496 1.166.2.2 ad int error;
497 1.166.2.1 ad
498 1.166.2.2 ad if (sleepq_dontsleep(l))
499 1.166.2.2 ad return sleepq_abort(mtx, priority & PNORELOCK);
500 1.166.2.1 ad
501 1.166.2.2 ad sq = sleeptab_lookup(ident);
502 1.166.2.2 ad sleepq_enter(sq, priority, ident, wmesg, timo, priority & PCATCH);
503 1.166.2.1 ad
504 1.166.2.1 ad if (mtx != NULL) {
505 1.166.2.2 ad LOCK_ASSERT(mutex_owned(mtx));
506 1.166.2.2 ad mutex_exit_linked(mtx, l->l_mutex);
507 1.166.2.1 ad }
508 1.166.2.1 ad
509 1.166.2.2 ad error = sleepq_block(sq, timo);
510 1.166.2.1 ad
511 1.166.2.2 ad if (mtx != NULL && (priority & PNORELOCK) == 0)
512 1.166.2.1 ad mutex_enter(mtx);
513 1.166.2.2 ad
514 1.166.2.2 ad return error;
515 1.166.2.1 ad }
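
/*
 * Typical usage of mtsleep() (a sketch only; "sc", "sc_lock" and
 * "sc_ready" are hypothetical driver names, not part of this file):
 *
 *	mutex_enter(&sc->sc_lock);
 *	while (!sc->sc_ready) {
 *		error = mtsleep(&sc->sc_ready, PRIBIO, "scwait", 0,
 *		    &sc->sc_lock);
 *		if (error != 0)
 *			break;
 *	}
 *	mutex_exit(&sc->sc_lock);
 *
 * The waker sets sc_ready while holding sc_lock and then calls
 * wakeup(&sc->sc_ready).  Since the interlock is only dropped once the
 * LWP is on the sleep queue, the wakeup cannot be lost between the test
 * of sc_ready and the sleep.
 */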
516 1.166.2.1 ad
517 1.26 cgd void
518 1.139 cl sa_awaken(struct lwp *l)
519 1.139 cl {
520 1.147 perry
521 1.166.2.2 ad LOCK_ASSERT(lwp_locked(l, NULL));
522 1.139 cl
523 1.142 cl if (l == l->l_savp->savp_lwp && l->l_flag & L_SA_YIELD)
524 1.139 cl l->l_flag &= ~L_SA_IDLE;
525 1.139 cl }
526 1.139 cl
527 1.26 cgd /*
528 1.26 cgd * Make all processes sleeping on the specified identifier runnable.
529 1.26 cgd */
530 1.26 cgd void
531 1.166.2.2 ad wakeup(wchan_t ident)
532 1.26 cgd {
533 1.166.2.2 ad sleepq_t *sq;
534 1.83 thorpej
535 1.166.2.2 ad if (cold)
536 1.166.2.2 ad return;
537 1.83 thorpej
538 1.166.2.2 ad sq = sleeptab_lookup(ident);
539 1.166.2.2 ad sleepq_wakeall(sq, ident, (u_int)-1);
540 1.63 thorpej }
541 1.63 thorpej
542 1.63 thorpej /*
543 1.63 thorpej * Make the highest priority process first in line on the specified
544 1.63 thorpej * identifier runnable.
545 1.63 thorpej */
546 1.166.2.2 ad void
547 1.166.2.2 ad wakeup_one(wchan_t ident)
548 1.63 thorpej {
549 1.166.2.2 ad sleepq_t *sq;
550 1.77 thorpej
551 1.166.2.2 ad if (cold)
552 1.166.2.2 ad return;
553 1.166.2.2 ad
554 1.166.2.2 ad sq = sleeptab_lookup(ident);
555 1.166.2.2 ad sleepq_wakeone(sq, ident);
556 1.117 gmcgarry }
557 1.117 gmcgarry
558 1.166.2.2 ad
559 1.117 gmcgarry /*
560 1.117 gmcgarry * General yield call. Puts the current process back on its run queue and
561 1.117 gmcgarry * performs a voluntary context switch. Should only be called when the
562 1.117 gmcgarry * current process explicitly requests it (e.g. sched_yield(2) in compat code).
563 1.117 gmcgarry */
564 1.117 gmcgarry void
565 1.117 gmcgarry yield(void)
566 1.117 gmcgarry {
567 1.122 thorpej struct lwp *l = curlwp;
568 1.117 gmcgarry
569 1.166.2.2 ad lwp_lock(l);
570 1.166.2.2 ad if (l->l_stat == LSONPROC) {
571 1.166.2.3 ad KASSERT(lwp_locked(l, &sched_mutex));
572 1.166.2.2 ad l->l_priority = l->l_usrpri;
573 1.166.2.2 ad }
574 1.166.2.2 ad l->l_nvcsw++;
575 1.122 thorpej mi_switch(l, NULL);
576 1.69 thorpej }
577 1.69 thorpej
578 1.69 thorpej /*
579 1.69 thorpej * General preemption call. Puts the current process back on its run queue
580 1.156 rpaulo * and performs an involuntary context switch.
581 1.156 rpaulo * The 'more' ("more work to do") argument is a boolean: preempt() calls made
582 1.156 rpaulo * when returning to userspace pass 0, while "voluntary" preemptions in e.g.
583 1.156 rpaulo * uiomove() pass 1, which tells the SA subsystem that the LWP is
584 1.156 rpaulo * not yet finished in the kernel.
585 1.69 thorpej */
586 1.69 thorpej void
587 1.122 thorpej preempt(int more)
588 1.69 thorpej {
589 1.122 thorpej struct lwp *l = curlwp;
590 1.166.2.2 ad int r;
591 1.69 thorpej
592 1.166.2.2 ad lwp_lock(l);
593 1.166.2.2 ad if (l->l_stat == LSONPROC) {
594 1.166.2.3 ad KASSERT(lwp_locked(l, &sched_mutex));
595 1.166.2.2 ad l->l_priority = l->l_usrpri;
596 1.166.2.2 ad }
597 1.166.2.2 ad l->l_nivcsw++;
598 1.122 thorpej r = mi_switch(l, NULL);
599 1.166.2.2 ad if ((l->l_flag & L_SA) != 0 && r != 0 && more == 0) /* XXXAD */
600 1.122 thorpej sa_preempt(l);
601 1.69 thorpej }
602 1.69 thorpej
603 1.69 thorpej /*
604 1.166.2.2 ad * The machine-independent parts of context switch. Switch to "newl"
605 1.166.2.2 ad * if non-NULL, otherwise let cpu_switch() choose the next LWP.
606 1.130 nathanw *
607 1.122 thorpej * Returns 1 if another process was actually run.
608 1.26 cgd */
609 1.122 thorpej int
610 1.122 thorpej mi_switch(struct lwp *l, struct lwp *newl)
611 1.26 cgd {
612 1.76 thorpej struct schedstate_percpu *spc;
613 1.26 cgd struct timeval tv;
614 1.144 yamt int hold_count;
615 1.166.2.2 ad int retval, oldspl;
616 1.166.2.2 ad long s, u;
617 1.166.2.2 ad #if PERFCTRS
618 1.122 thorpej struct proc *p = l->l_proc;
619 1.166.2.2 ad #endif
620 1.26 cgd
621 1.166.2.2 ad LOCK_ASSERT(lwp_locked(l, NULL));
622 1.83 thorpej
623 1.90 sommerfe /*
624 1.90 sommerfe * Release the kernel_lock, as we are about to yield the CPU.
625 1.90 sommerfe */
626 1.144 yamt hold_count = KERNEL_LOCK_RELEASE_ALL();
627 1.85 sommerfe
628 1.160 chs #ifdef LOCKDEBUG
629 1.82 thorpej spinlock_switchcheck();
630 1.81 thorpej simple_lock_switchcheck();
631 1.50 fvdl #endif
632 1.166.2.2 ad #ifdef KSTACK_CHECK_MAGIC
633 1.166.2.2 ad kstack_check_magic(l);
634 1.166.2.2 ad #endif
635 1.166.2.2 ad
636 1.166.2.2 ad /*
637 1.166.2.2 ad * It's safe to read the per CPU schedstate unlocked here, as all we
638 1.166.2.2 ad * are after is the run time and that's guaranteed to have been last
639 1.166.2.2 ad * updated by this CPU.
640 1.166.2.2 ad */
641 1.166.2.2 ad KDASSERT(l->l_cpu != NULL);
642 1.166.2.2 ad KDASSERT(l->l_cpu == curcpu());
643 1.166.2.2 ad spc = &l->l_cpu->ci_schedstate;
644 1.81 thorpej
645 1.26 cgd /*
646 1.26 cgd * Compute the amount of time during which the current
647 1.113 gmcgarry * process was running.
648 1.26 cgd */
649 1.26 cgd microtime(&tv);
650 1.166.2.2 ad u = l->l_rtime.tv_usec +
651 1.122 thorpej (tv.tv_usec - spc->spc_runtime.tv_usec);
652 1.166.2.2 ad s = l->l_rtime.tv_sec + (tv.tv_sec - spc->spc_runtime.tv_sec);
653 1.26 cgd if (u < 0) {
654 1.26 cgd u += 1000000;
655 1.26 cgd s--;
656 1.26 cgd } else if (u >= 1000000) {
657 1.26 cgd u -= 1000000;
658 1.26 cgd s++;
659 1.26 cgd }
660 1.166.2.2 ad l->l_rtime.tv_usec = u;
661 1.166.2.2 ad l->l_rtime.tv_sec = s;
662 1.26 cgd
663 1.26 cgd /*
664 1.166.2.2 ad * XXXSMP If we are using h/w performance counters, save context.
665 1.26 cgd */
666 1.166.2.2 ad #if PERFCTRS
667 1.166.2.2 ad if (PMC_ENABLED(p)) {
668 1.166.2.2 ad pmc_save_context(p);
669 1.26 cgd }
670 1.166.2.2 ad #endif
671 1.166.2.2 ad
672 1.166.2.2 ad /*
673 1.166.2.2 ad * Acquire the sched_mutex if necessary. It will be released by
674 1.166.2.2 ad * cpu_switch once it has decided to idle, or picked another LWP
675 1.166.2.2 ad * to run.
676 1.166.2.2 ad */
677 1.166.2.2 ad oldspl = mutex_getspl(l->l_mutex);
678 1.166.2.3 ad #ifdef MULTIPROCESSOR
679 1.166.2.3 ad if (l->l_mutex != &sched_mutex || l->l_omutex != NULL) {
680 1.166.2.2 ad lwp_unlock(l);
681 1.166.2.2 ad mutex_enter(&sched_mutex);
682 1.26 cgd }
683 1.166.2.3 ad #endif
684 1.166.2.3 ad
685 1.166.2.3 ad /*
686 1.166.2.3 ad * If on the CPU and we have gotten this far, then we must yield.
687 1.166.2.3 ad */
688 1.166.2.3 ad KASSERT(l->l_stat != LSRUN);
689 1.166.2.3 ad if (l->l_stat == LSONPROC) {
690 1.166.2.3 ad l->l_stat = LSRUN;
691 1.166.2.3 ad setrunqueue(l);
692 1.166.2.3 ad }
693 1.166.2.2 ad uvmexp.swtch++;
694 1.69 thorpej
695 1.69 thorpej /*
696 1.69 thorpej * Process is about to yield the CPU; clear the appropriate
697 1.69 thorpej * scheduling flags.
698 1.69 thorpej */
699 1.73 thorpej spc->spc_flags &= ~SPCF_SWITCHCLEAR;
700 1.109 yamt
701 1.166.2.2 ad LOCKDEBUG_BARRIER(&sched_mutex, 1);
702 1.113 gmcgarry
703 1.113 gmcgarry /*
704 1.166.2.2 ad * Switch to the new current LWP. When we run again, we'll
705 1.166.2.2 ad * return back here.
706 1.113 gmcgarry */
707 1.166.2.2 ad if (newl == NULL)
708 1.122 thorpej retval = cpu_switch(l, NULL);
709 1.166.2.2 ad else {
710 1.166.2.2 ad /* XXXAD ? */
711 1.122 thorpej remrunqueue(newl);
712 1.122 thorpej cpu_switchto(l, newl);
713 1.122 thorpej retval = 0;
714 1.122 thorpej }
715 1.110 briggs
716 1.110 briggs /*
717 1.166.2.2 ad * XXXSMP If we are using h/w performance counters, restore context.
718 1.26 cgd */
719 1.114 gmcgarry #if PERFCTRS
720 1.166 christos if (PMC_ENABLED(p)) {
721 1.114 gmcgarry pmc_restore_context(p);
722 1.166 christos }
723 1.114 gmcgarry #endif
724 1.110 briggs
725 1.110 briggs /*
726 1.76 thorpej * We're running again; record our new start time. We might
727 1.166.2.2 ad * be running on a new CPU now, so don't use the cached
728 1.76 thorpej * schedstate_percpu pointer.
729 1.76 thorpej */
730 1.122 thorpej KDASSERT(l->l_cpu != NULL);
731 1.122 thorpej KDASSERT(l->l_cpu == curcpu());
732 1.122 thorpej microtime(&l->l_cpu->ci_schedstate.spc_runtime);
733 1.85 sommerfe
734 1.90 sommerfe /*
735 1.166.2.2 ad * Reacquire the kernel_lock, and restore the old SPL.
736 1.166.2.3 ad * XXX We should do spl0() before acquiring the kernel
737 1.166.2.3 ad * lock but splx() may not raise the IPL.
738 1.90 sommerfe */
739 1.166.2.2 ad splx(oldspl);
740 1.166.2.3 ad KERNEL_LOCK_ACQUIRE_COUNT(hold_count);
741 1.122 thorpej
742 1.122 thorpej return retval;
743 1.26 cgd }
744 1.26 cgd
745 1.26 cgd /*
746 1.26 cgd * Initialize the (doubly-linked) run queues
747 1.26 cgd * to be empty.
748 1.26 cgd */
749 1.26 cgd void
750 1.26 cgd rqinit()
751 1.26 cgd {
752 1.71 augustss int i;
753 1.26 cgd
754 1.73 thorpej for (i = 0; i < RUNQUE_NQS; i++)
755 1.73 thorpej sched_qs[i].ph_link = sched_qs[i].ph_rlink =
756 1.122 thorpej (struct lwp *)&sched_qs[i];
757 1.166.2.2 ad
758 1.166.2.2 ad mutex_init(&sched_mutex, MUTEX_SPIN, IPL_SCHED);
759 1.26 cgd }
760 1.26 cgd
761 1.158 perry static inline void
762 1.166.2.2 ad resched_lwp(struct lwp *l, u_char pri)
763 1.119 thorpej {
764 1.119 thorpej struct cpu_info *ci;
765 1.119 thorpej
766 1.166.2.2 ad LOCK_ASSERT(lwp_locked(l, NULL));
767 1.166.2.2 ad
768 1.119 thorpej /*
769 1.119 thorpej * XXXSMP
770 1.122 thorpej * Since l->l_cpu persists across a context switch,
771 1.119 thorpej * this gives us *very weak* processor affinity, in
772 1.119 thorpej * that we notify the CPU on which the process last
773 1.119 thorpej * ran that it should try to switch.
774 1.119 thorpej *
775 1.119 thorpej * This does not guarantee that the process will run on
776 1.119 thorpej * that processor next, because another processor might
777 1.119 thorpej * grab it the next time it performs a context switch.
778 1.119 thorpej *
779 1.119 thorpej * This also does not handle the case where its last
780 1.119 thorpej * CPU is running a higher-priority process, but every
781 1.119 thorpej * other CPU is running a lower-priority process. There
782 1.119 thorpej * are ways to handle this situation, but they're not
783 1.119 thorpej * currently very pretty, and we also need to weigh the
784 1.119 thorpej * cost of moving a process from one CPU to another.
785 1.119 thorpej */
786 1.122 thorpej ci = (l->l_cpu != NULL) ? l->l_cpu : curcpu();
787 1.121 thorpej if (pri < ci->ci_schedstate.spc_curpriority)
788 1.166.2.2 ad cpu_need_resched(ci);
789 1.119 thorpej }
790 1.119 thorpej
791 1.26 cgd /*
792 1.166.2.2 ad * Change process state to be runnable, placing it on the run queue if it is
793 1.166.2.2 ad * in memory, and awakening the swapper if it isn't in memory.
794 1.166.2.2 ad *
795 1.166.2.2 ad * Call with the process and LWP locked. Will return with the LWP unlocked.
796 1.26 cgd */
797 1.26 cgd void
798 1.122 thorpej setrunnable(struct lwp *l)
799 1.26 cgd {
800 1.122 thorpej struct proc *p = l->l_proc;
801 1.166.2.3 ad struct cpu_info *ci;
802 1.26 cgd
803 1.166.2.2 ad LOCK_ASSERT(mutex_owned(&p->p_smutex));
804 1.166.2.2 ad LOCK_ASSERT(lwp_locked(l, NULL));
805 1.83 thorpej
806 1.122 thorpej switch (l->l_stat) {
807 1.122 thorpej case LSSTOP:
808 1.33 mycroft /*
809 1.33 mycroft * If we're being traced (possibly because someone attached us
810 1.33 mycroft * while we were stopped), check for a signal from the debugger.
811 1.33 mycroft */
812 1.53 mycroft if ((p->p_flag & P_TRACED) != 0 && p->p_xstat != 0) {
813 1.166.2.3 ad sigaddset(&l->l_sigpend->sp_set, p->p_xstat);
814 1.166.2.2 ad signotify(l);
815 1.53 mycroft }
816 1.166.2.2 ad p->p_nrlwps++;
817 1.122 thorpej break;
818 1.122 thorpej case LSSUSPENDED:
819 1.166.2.2 ad p->p_nrlwps++;
820 1.166.2.2 ad break;
821 1.166.2.2 ad case LSSLEEP:
822 1.26 cgd break;
823 1.166.2.2 ad default:
824 1.166.2.2 ad panic("setrunnable: lwp %p state was %d", l, l->l_stat);
825 1.166.2.2 ad }
826 1.166.2.2 ad
827 1.166.2.2 ad /*
828 1.166.2.2 ad * If the LWP was sleeping interruptably, then it's OK to start it
829 1.166.2.2 ad * again. If not, mark it as still sleeping.
830 1.166.2.2 ad */
831 1.166.2.2 ad if (l->l_wchan != NULL) {
832 1.166.2.2 ad l->l_stat = LSSLEEP;
833 1.166.2.2 ad if ((l->l_flag & L_SINTR) != 0)
834 1.166.2.2 ad sleepq_unsleep(l);
835 1.166.2.2 ad return;
836 1.26 cgd }
837 1.139 cl
838 1.139 cl if (l->l_proc->p_sa)
839 1.139 cl sa_awaken(l);
840 1.139 cl
841 1.166.2.3 ad /*
842 1.166.2.3 ad * Set sched_mutex as the LWP's current mutex.
843 1.166.2.3 ad */
844 1.166.2.3 ad lwp_relock(l, &sched_mutex);
845 1.122 thorpej
846 1.166.2.2 ad /*
847 1.166.2.3 ad * If the LWP is still on the CPU, mark it as LSONPROC. It may be
848 1.166.2.3 ad * about to call mi_switch(), in which case it will yield.
849 1.166.2.2 ad */
850 1.166.2.3 ad if ((ci = l->l_cpu) != NULL && ci->ci_curlwp == l) {
851 1.166.2.3 ad l->l_stat = LSONPROC;
852 1.166.2.3 ad l->l_slptime = 0;
853 1.166.2.3 ad lwp_unlock(l);
854 1.166.2.3 ad return;
855 1.166.2.3 ad }
856 1.122 thorpej
857 1.166.2.3 ad /*
858 1.166.2.3 ad * Set the LWP runnable. If it's swapped out, we need to wake the swapper
859 1.166.2.3 ad * to bring it back in. Otherwise, enter it into a run queue.
860 1.166.2.3 ad */
861 1.166.2.3 ad l->l_stat = LSRUN;
862 1.122 thorpej if (l->l_slptime > 1)
863 1.122 thorpej updatepri(l);
864 1.122 thorpej l->l_slptime = 0;
865 1.166.2.2 ad
866 1.166.2.2 ad if (l->l_flag & L_INMEM) {
867 1.166.2.2 ad setrunqueue(l);
868 1.166.2.2 ad resched_lwp(l, l->l_priority);
869 1.166.2.2 ad lwp_unlock(l);
870 1.166.2.2 ad } else {
871 1.166.2.2 ad lwp_unlock(l);
872 1.166.2.2 ad wakeup(&proc0);
873 1.166.2.2 ad }
874 1.26 cgd }
875 1.26 cgd
876 1.26 cgd /*
877 1.26 cgd * Compute the priority of a process when running in user mode.
878 1.26 cgd * Arrange to reschedule if the resulting priority is better
879 1.26 cgd * than that of the current process.
880 1.26 cgd */
881 1.26 cgd void
882 1.122 thorpej resetpriority(struct lwp *l)
883 1.26 cgd {
884 1.71 augustss unsigned int newpriority;
885 1.122 thorpej struct proc *p = l->l_proc;
886 1.26 cgd
887 1.166.2.2 ad LOCK_ASSERT(lwp_locked(l, NULL));
888 1.83 thorpej
889 1.166.2.2 ad /* XXXSMP proc values will be accessed unlocked */
890 1.153 yamt newpriority = PUSER + (p->p_estcpu >> ESTCPU_SHIFT) +
891 1.122 thorpej NICE_WEIGHT * (p->p_nice - NZERO);
892 1.26 cgd newpriority = min(newpriority, MAXPRI);
893 1.122 thorpej l->l_usrpri = newpriority;
894 1.166.2.2 ad resched_lwp(l, l->l_usrpri);
895 1.122 thorpej }
896 1.122 thorpej
897 1.130 nathanw /*
898 1.122 thorpej * Recompute priority for all LWPs in a process.
899 1.122 thorpej */
900 1.122 thorpej void
901 1.122 thorpej resetprocpriority(struct proc *p)
902 1.122 thorpej {
903 1.122 thorpej struct lwp *l;
904 1.122 thorpej
905 1.166.2.2 ad LOCK_ASSERT(mutex_owned(&p->p_smutex));
906 1.166.2.2 ad
907 1.166.2.2 ad LIST_FOREACH(l, &p->p_lwps, l_sibling) {
908 1.166.2.2 ad lwp_lock(l);
909 1.166.2.2 ad resetpriority(l);
910 1.166.2.2 ad lwp_unlock(l);
911 1.166.2.2 ad }
912 1.55 ross }
913 1.55 ross
914 1.55 ross /*
915 1.56 ross * We adjust the priority of the current process. The priority of a process
916 1.141 wiz * gets worse as it accumulates CPU time. The CPU usage estimator (p_estcpu)
917 1.56 ross * is increased here. The formula for computing priorities (in kern_synch.c)
918 1.56 ross * will compute a different value each time p_estcpu increases. This can
919 1.56 ross * cause a switch, but unless the priority crosses a PPQ boundary the actual
920 1.141 wiz * queue will not change. The CPU usage estimator ramps up quite quickly
921 1.56 ross * when the process is running (linearly), and decays away exponentially, at
922 1.56 ross * a rate which is proportionally slower when the system is busy. The basic
923 1.80 nathanw * principle is that the system will 90% forget that the process used a lot
924 1.56 ross * of CPU time in 5 * loadav seconds. This causes the system to favor
925 1.56 ross * processes which haven't run much recently, and to round-robin among other
926 1.56 ross * processes.
927 1.55 ross */
928 1.55 ross
929 1.55 ross void
930 1.122 thorpej schedclock(struct lwp *l)
931 1.55 ross {
932 1.122 thorpej struct proc *p = l->l_proc;
933 1.166.2.2 ad
934 1.166.2.2 ad LOCK_ASSERT(mutex_owned(&p->p_smutex));
935 1.77 thorpej
936 1.153 yamt p->p_estcpu = ESTCPULIM(p->p_estcpu + (1 << ESTCPU_SHIFT));
937 1.130 nathanw
938 1.166.2.2 ad lwp_lock(l);
939 1.166.2.2 ad resetpriority(l);
940 1.122 thorpej if (l->l_priority >= PUSER)
941 1.122 thorpej l->l_priority = l->l_usrpri;
942 1.166.2.2 ad lwp_unlock(l);
943 1.26 cgd }
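
/*
 * For example (sketch): each statclock tick charged here adds
 * 1 << ESTCPU_SHIFT to p_estcpu, and since resetpriority() uses
 * (p_estcpu >> ESTCPU_SHIFT), every tick of CPU time makes the computed
 * user priority exactly one step worse until schedcpu()'s decay catches
 * up.
 */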
944 1.94 bouyer
945 1.166.2.2 ad /*
946 1.166.2.2 ad * suspendsched:
947 1.166.2.2 ad *
948 1.166.2.2 ad * Convert all non-L_SYSTEM LSSLEEP or LSRUN LWPs to LSSUSPENDED.
949 1.166.2.2 ad * This violates locking conventions (p->p_smutex is not held),
950 1.166.2.2 ad * however since it is only used during panic or shutdown that is
951 1.166.2.2 ad * not a problem.
952 1.166.2.2 ad *
953 1.166.2.2 ad * XXXAD Do by process?
954 1.166.2.2 ad */
955 1.94 bouyer void
956 1.94 bouyer suspendsched()
957 1.94 bouyer {
958 1.122 thorpej struct lwp *l;
959 1.166.2.2 ad struct proc *p;
960 1.94 bouyer
961 1.166.2.1 ad mutex_enter(&alllwp_mutex);
962 1.122 thorpej LIST_FOREACH(l, &alllwp, l_list) {
963 1.166.2.2 ad if ((l->l_flag & L_SYSTEM) != 0)
964 1.94 bouyer continue;
965 1.122 thorpej
966 1.166.2.2 ad p = l->l_proc;
967 1.166.2.2 ad
968 1.166.2.2 ad lwp_lock(l);
969 1.122 thorpej switch (l->l_stat) {
970 1.122 thorpej case LSRUN:
971 1.166.2.2 ad p->p_nrlwps--;
972 1.122 thorpej l->l_stat = LSSUSPENDED;
973 1.166.2.2 ad remrunqueue(l);
974 1.97 enami break;
975 1.122 thorpej case LSONPROC:
976 1.166.2.2 ad p->p_nrlwps--;
977 1.166.2.2 ad l->l_stat = LSSUSPENDED;
978 1.166.2.2 ad break;
979 1.166.2.2 ad case LSSLEEP:
980 1.166.2.2 ad p->p_nrlwps--;
981 1.166.2.2 ad l->l_stat = LSSUSPENDED;
982 1.97 enami break;
983 1.97 enami default:
984 1.97 enami break;
985 1.94 bouyer }
986 1.166.2.3 ad lwp_setlock_unlock(l, &lwp_mutex);
987 1.94 bouyer }
988 1.166.2.1 ad mutex_exit(&alllwp_mutex);
989 1.94 bouyer }
990 1.113 gmcgarry
991 1.113 gmcgarry /*
992 1.151 yamt * scheduler_fork_hook:
993 1.151 yamt *
994 1.151 yamt * Inherit the parent's scheduler history.
995 1.151 yamt */
996 1.151 yamt void
997 1.151 yamt scheduler_fork_hook(struct proc *parent, struct proc *child)
998 1.151 yamt {
999 1.151 yamt
1000 1.166.2.2 ad mutex_enter(&parent->p_smutex);
1001 1.157 yamt child->p_estcpu = child->p_estcpu_inherited = parent->p_estcpu;
1002 1.157 yamt child->p_forktime = schedcpu_ticks;
1003 1.166.2.2 ad mutex_exit(&parent->p_smutex);
1004 1.151 yamt }
1005 1.151 yamt
1006 1.151 yamt /*
1007 1.151 yamt * scheduler_wait_hook:
1008 1.151 yamt *
1009 1.151 yamt * Chargeback parents for the sins of their children.
1010 1.151 yamt */
1011 1.151 yamt void
1012 1.151 yamt scheduler_wait_hook(struct proc *parent, struct proc *child)
1013 1.151 yamt {
1014 1.157 yamt fixpt_t loadfac = loadfactor(averunnable.ldavg[0]);
1015 1.157 yamt fixpt_t estcpu;
1016 1.151 yamt
1017 1.151 yamt /* XXX Only if parent != init?? */
1018 1.157 yamt
1019 1.166.2.2 ad mutex_enter(&parent->p_smutex);
1020 1.157 yamt estcpu = decay_cpu_batch(loadfac, child->p_estcpu_inherited,
1021 1.157 yamt schedcpu_ticks - child->p_forktime);
1022 1.166.2.2 ad if (child->p_estcpu > estcpu)
1023 1.157 yamt parent->p_estcpu =
1024 1.157 yamt ESTCPULIM(parent->p_estcpu + child->p_estcpu - estcpu);
1025 1.166.2.2 ad mutex_exit(&parent->p_smutex);
1026 1.151 yamt }
1027 1.151 yamt
1028 1.151 yamt /*
1029 1.166.2.1 ad * XXXAD Scale an LWP priority (possibly a user priority) to a kernel one.
1030 1.166.2.2 ad * This is a hack; think of something better. Do unlocked for now.
1031 1.166.2.1 ad */
1032 1.166.2.1 ad int
1033 1.166.2.1 ad sched_kpri(struct lwp *l)
1034 1.166.2.1 ad {
1035 1.166.2.1 ad const int obase = PUSER;
1036 1.166.2.1 ad const int ospan = MAXPRI - obase;
1037 1.166.2.1 ad const int nbase = PRIBIO;
1038 1.166.2.1 ad const int nspan = PUSER - nbase;
1039 1.166.2.1 ad
1040 1.166.2.1 ad if (l->l_priority < obase)
1041 1.166.2.1 ad return (l->l_priority);
1042 1.166.2.1 ad
1043 1.166.2.1 ad return (l->l_priority - obase) * nspan / ospan + nbase;
1044 1.166.2.1 ad }
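
/*
 * For example (sketch): a user priority exactly halfway between PUSER
 * and MAXPRI maps to roughly halfway between PRIBIO and PUSER, while
 * priorities already below PUSER (i.e. kernel priorities) are returned
 * unchanged.
 */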
1045 1.166.2.1 ad
1046 1.166.2.3 ad void
1047 1.166.2.3 ad sched_lock(void)
1048 1.166.2.3 ad {
1049 1.166.2.3 ad
1050 1.166.2.3 ad mutex_enter(&sched_mutex);
1051 1.166.2.3 ad }
1052 1.166.2.3 ad
1053 1.166.2.3 ad void
1054 1.166.2.3 ad sched_unlock(void)
1055 1.166.2.3 ad {
1056 1.166.2.3 ad
1057 1.166.2.3 ad mutex_exit(&sched_mutex);
1058 1.166.2.3 ad }
1059 1.166.2.3 ad
1060 1.166.2.1 ad /*
1061 1.113 gmcgarry * Low-level routines to access the run queue. Optimised assembler
1062 1.113 gmcgarry * routines can override these.
1063 1.113 gmcgarry */
1064 1.113 gmcgarry
1065 1.113 gmcgarry #ifndef __HAVE_MD_RUNQUEUE
1066 1.115 nisimura
1067 1.130 nathanw /*
1068 1.134 matt * On some architectures, it's faster to use an MSB ordering for the priorities
1069 1.134 matt * than the traditional LSB ordering.
1070 1.134 matt */
1071 1.134 matt #ifdef __HAVE_BIGENDIAN_BITOPS
1072 1.134 matt #define RQMASK(n) (0x80000000 >> (n))
1073 1.134 matt #else
1074 1.134 matt #define RQMASK(n) (0x00000001 << (n))
1075 1.134 matt #endif
1076 1.134 matt
1077 1.134 matt /*
1078 1.115 nisimura * The primitives that manipulate the run queues. whichqs tells which
1079 1.115 nisimura * of the 32 queues qs have processes in them. Setrunqueue puts processes
1080 1.115 nisimura * into queues, remrunqueue removes them from queues. The running process is
1081 1.115 nisimura * on no queue, other processes are on a queue related to l->l_priority,
1082 1.115 nisimura * divided by PPQ (4) to shrink the 0-127 range of priorities into the 32
1083 1.115 nisimura * available queues.
1084 1.130 nathanw */
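
/*
 * For example (sketch): with RUNQUE_NQS == 32 and hence PPQ == 4,
 * priorities 0-3 share sched_qs[0], priorities 4-7 share sched_qs[1],
 * and so on; the RQMASK(n) bit of sched_whichqs is set exactly when
 * sched_qs[n] is non-empty.
 */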
1085 1.146 matt #ifdef RQDEBUG
1086 1.146 matt static void
1087 1.146 matt checkrunqueue(int whichq, struct lwp *l)
1088 1.146 matt {
1089 1.146 matt const struct prochd * const rq = &sched_qs[whichq];
1090 1.146 matt struct lwp *l2;
1091 1.146 matt int found = 0;
1092 1.146 matt int die = 0;
1093 1.146 matt int empty = 1;
1094 1.164 christos for (l2 = rq->ph_link; l2 != (const void*) rq; l2 = l2->l_forw) {
1095 1.146 matt if (l2->l_stat != LSRUN) {
1096 1.146 matt printf("checkrunqueue[%d]: lwp %p state (%d) "
1097 1.146 matt " != LSRUN\n", whichq, l2, l2->l_stat);
1098 1.146 matt }
1099 1.146 matt if (l2->l_back->l_forw != l2) {
1100 1.146 matt printf("checkrunqueue[%d]: lwp %p back-qptr (%p) "
1101 1.146 matt "corrupt %p\n", whichq, l2, l2->l_back,
1102 1.146 matt l2->l_back->l_forw);
1103 1.146 matt die = 1;
1104 1.146 matt }
1105 1.146 matt if (l2->l_forw->l_back != l2) {
1106 1.146 matt printf("checkrunqueue[%d]: lwp %p forw-qptr (%p) "
1107 1.146 matt "corrupt %p\n", whichq, l2, l2->l_forw,
1108 1.146 matt l2->l_forw->l_back);
1109 1.146 matt die = 1;
1110 1.146 matt }
1111 1.146 matt if (l2 == l)
1112 1.146 matt found = 1;
1113 1.146 matt empty = 0;
1114 1.146 matt }
1115 1.146 matt if (empty && (sched_whichqs & RQMASK(whichq)) != 0) {
1116 1.146 matt printf("checkrunqueue[%d]: bit set for empty run-queue %p\n",
1117 1.146 matt whichq, rq);
1118 1.146 matt die = 1;
1119 1.146 matt } else if (!empty && (sched_whichqs & RQMASK(whichq)) == 0) {
1120 1.146 matt printf("checkrunqueue[%d]: bit clear for non-empty "
1121 1.146 matt "run-queue %p\n", whichq, rq);
1122 1.146 matt die = 1;
1123 1.146 matt }
1124 1.146 matt if (l != NULL && (sched_whichqs & RQMASK(whichq)) == 0) {
1125 1.146 matt printf("checkrunqueue[%d]: bit clear for active lwp %p\n",
1126 1.146 matt whichq, l);
1127 1.146 matt die = 1;
1128 1.146 matt }
1129 1.146 matt if (l != NULL && empty) {
1130 1.146 matt printf("checkrunqueue[%d]: empty run-queue %p with "
1131 1.146 matt "active lwp %p\n", whichq, rq, l);
1132 1.146 matt die = 1;
1133 1.146 matt }
1134 1.146 matt if (l != NULL && !found) {
1135 1.146 matt printf("checkrunqueue[%d]: lwp %p not in runqueue %p!",
1136 1.146 matt whichq, l, rq);
1137 1.146 matt die = 1;
1138 1.146 matt }
1139 1.146 matt if (die)
1140 1.146 matt panic("checkrunqueue: inconsistency found");
1141 1.146 matt }
1142 1.146 matt #endif /* RQDEBUG */
1143 1.146 matt
1144 1.113 gmcgarry void
1145 1.122 thorpej setrunqueue(struct lwp *l)
1146 1.113 gmcgarry {
1147 1.113 gmcgarry struct prochd *rq;
1148 1.122 thorpej struct lwp *prev;
1149 1.152 yamt const int whichq = l->l_priority / PPQ;
1150 1.113 gmcgarry
1151 1.166.2.3 ad LOCK_ASSERT(lwp_locked(l, &sched_mutex));
1152 1.166.2.2 ad
1153 1.146 matt #ifdef RQDEBUG
1154 1.146 matt checkrunqueue(whichq, NULL);
1155 1.146 matt #endif
1156 1.113 gmcgarry #ifdef DIAGNOSTIC
1157 1.166.2.2 ad if (l->l_back != NULL || l->l_stat != LSRUN)
1158 1.113 gmcgarry panic("setrunqueue");
1159 1.113 gmcgarry #endif
1160 1.134 matt sched_whichqs |= RQMASK(whichq);
1161 1.113 gmcgarry rq = &sched_qs[whichq];
1162 1.113 gmcgarry prev = rq->ph_rlink;
1163 1.122 thorpej l->l_forw = (struct lwp *)rq;
1164 1.122 thorpej rq->ph_rlink = l;
1165 1.122 thorpej prev->l_forw = l;
1166 1.122 thorpej l->l_back = prev;
1167 1.146 matt #ifdef RQDEBUG
1168 1.146 matt checkrunqueue(whichq, l);
1169 1.146 matt #endif
1170 1.113 gmcgarry }
1171 1.113 gmcgarry
1172 1.113 gmcgarry void
1173 1.122 thorpej remrunqueue(struct lwp *l)
1174 1.113 gmcgarry {
1175 1.122 thorpej struct lwp *prev, *next;
1176 1.152 yamt const int whichq = l->l_priority / PPQ;
1177 1.166.2.2 ad
1178 1.166.2.2 ad LOCK_ASSERT(lwp_locked(l, &sched_mutex));
1179 1.166.2.2 ad
1180 1.146 matt #ifdef RQDEBUG
1181 1.146 matt checkrunqueue(whichq, l);
1182 1.146 matt #endif
1183 1.166.2.2 ad
1184 1.166.2.2 ad #if defined(DIAGNOSTIC)
1185 1.166.2.2 ad if (((sched_whichqs & RQMASK(whichq)) == 0) || l->l_back == NULL) {
1186 1.166.2.2 ad /* Shouldn't happen - interrupts disabled. */
1187 1.146 matt panic("remrunqueue: bit %d not set", whichq);
1188 1.166.2.2 ad }
1189 1.113 gmcgarry #endif
1190 1.122 thorpej prev = l->l_back;
1191 1.122 thorpej l->l_back = NULL;
1192 1.122 thorpej next = l->l_forw;
1193 1.122 thorpej prev->l_forw = next;
1194 1.122 thorpej next->l_back = prev;
1195 1.113 gmcgarry if (prev == next)
1196 1.134 matt sched_whichqs &= ~RQMASK(whichq);
1197 1.146 matt #ifdef RQDEBUG
1198 1.146 matt checkrunqueue(whichq, NULL);
1199 1.146 matt #endif
1200 1.113 gmcgarry }
1201 1.113 gmcgarry
1202 1.134 matt #undef RQMASK
1203 1.134 matt #endif /* !defined(__HAVE_MD_RUNQUEUE) */