/*	$NetBSD: kern_synch.c,v 1.166.2.3 2006/10/24 21:10:21 ad Exp $	*/

/*-
 * Copyright (c) 1999, 2000, 2004, 2006 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
 * NASA Ames Research Center, by Charles M. Hannum, and by Andrew Doran.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *        This product includes software developed by the NetBSD
 *        Foundation, Inc. and its contributors.
 * 4. Neither the name of The NetBSD Foundation nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*-
 * Copyright (c) 1982, 1986, 1990, 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)kern_synch.c	8.9 (Berkeley) 5/19/95
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: kern_synch.c,v 1.166.2.3 2006/10/24 21:10:21 ad Exp $");

#include "opt_ddb.h"
#include "opt_ktrace.h"
#include "opt_kstack.h"
#include "opt_lockdebug.h"
#include "opt_multiprocessor.h"
#include "opt_perfctrs.h"

#define	__MUTEX_PRIVATE

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/callout.h>
#include <sys/proc.h>
#include <sys/kernel.h>
#include <sys/buf.h>
#if defined(PERFCTRS)
#include <sys/pmc.h>
#endif
#include <sys/signalvar.h>
#include <sys/resourcevar.h>
#include <sys/sched.h>
#include <sys/sa.h>
#include <sys/savar.h>
#include <sys/kauth.h>
#include <sys/sleepq.h>
#include <sys/lockdebug.h>

#include <uvm/uvm_extern.h>

#ifdef KTRACE
#include <sys/ktrace.h>
#endif

#include <machine/cpu.h>

int	lbolt;			/* once a second sleep address */
int	rrticks;		/* number of hardclock ticks per roundrobin() */

/*
 * The global scheduler state.
 */
kmutex_t	sched_mutex;		/* global run queue mutex */
struct prochd	sched_qs[RUNQUE_NQS];	/* run queues */
volatile uint32_t sched_whichqs;	/* bitmap of non-empty queues */

void	schedcpu(void *);
void	updatepri(struct lwp *);
void	sa_awaken(struct lwp *);

struct callout schedcpu_ch = CALLOUT_INITIALIZER_SETFUNC(schedcpu, NULL);
static unsigned int schedcpu_ticks;

/*
 * Force switch among equal priority processes every 100ms.
 * Called from hardclock every hz/10 == rrticks hardclock ticks.
 */
/* ARGSUSED */
void
roundrobin(struct cpu_info *ci)
{
	struct schedstate_percpu *spc = &ci->ci_schedstate;

	spc->spc_rrticks = rrticks;

	if (curlwp != NULL) {
		if (spc->spc_flags & SPCF_SEENRR) {
			/*
			 * The process has already been through a roundrobin
			 * without switching and may be hogging the CPU.
			 * Indicate that the process should yield.
			 */
			spc->spc_flags |= SPCF_SHOULDYIELD;
		} else
			spc->spc_flags |= SPCF_SEENRR;
	}
	cpu_need_resched(curcpu());
}
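
/*
 * For example, with the common hz of 100, rrticks is hz/10 == 10, so
 * roundrobin() fires every 10 hardclock ticks.  An LWP that never switches
 * voluntarily picks up SPCF_SEENRR during its first 100ms slice and
 * SPCF_SHOULDYIELD during the second, i.e. it is flagged as hogging the
 * CPU after at most roughly 200ms of uninterrupted running.
 */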

#define	PPQ	(128 / RUNQUE_NQS)	/* priorities per queue */
#define	NICE_WEIGHT	2		/* priorities per nice level */

#define	ESTCPU_SHIFT	11
#define	ESTCPU_MAX	((NICE_WEIGHT * PRIO_MAX - PPQ) << ESTCPU_SHIFT)
#define	ESTCPULIM(e)	min((e), ESTCPU_MAX)
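
/*
 * Worked expansion of the above (assuming the usual RUNQUE_NQS of 32 and
 * PRIO_MAX of 20): PPQ == 128/32 == 4 priorities per run queue, and
 * ESTCPU_MAX == (2 * 20 - 4) << 11 == 36 << 11 == 73728.  In other words
 * p_estcpu is clamped so that it can never push a priority more than 36
 * slots (nine run queues) below its nice-adjusted base.
 */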

/*
 * Constants for digital decay and forget:
 *	90% of (p_estcpu) usage in 5 * loadav time
 *	95% of (p_pctcpu) usage in 60 seconds (load insensitive)
 *          Note that, as ps(1) mentions, this can let percentages
 *          total over 100% (I've seen 137.9% for 3 processes).
 *
 * Note that hardclock updates p_estcpu and p_cpticks independently.
 *
 * We wish to decay away 90% of p_estcpu in (5 * loadavg) seconds.
 * That is, the system wants to compute a value of decay such
 * that the following for loop:
 * 	for (i = 0; i < (5 * loadavg); i++)
 * 		p_estcpu *= decay;
 * will compute
 * 	p_estcpu *= 0.1;
 * for all values of loadavg:
 *
 * Mathematically this loop can be expressed by saying:
 * 	decay ** (5 * loadavg) ~= .1
 *
 * The system computes decay as:
 * 	decay = (2 * loadavg) / (2 * loadavg + 1)
 *
 * We wish to prove that the system's computation of decay
 * will always fulfill the equation:
 * 	decay ** (5 * loadavg) ~= .1
 *
 * If we compute b as:
 * 	b = 2 * loadavg
 * then
 * 	decay = b / (b + 1)
 *
 * We now need to prove two things:
 *	1) Given factor ** (5 * loadavg) ~= .1, prove factor == b/(b+1)
 *	2) Given b/(b+1) ** power ~= .1, prove power == (5 * loadavg)
 *
 * Facts:
 *         For x close to zero, exp(x) =~ 1 + x, since
 *              exp(x) = 0! + x**1/1! + x**2/2! + ... .
 *              therefore exp(-1/b) =~ 1 - (1/b) = (b-1)/b.
 *         For x close to zero, ln(1+x) =~ x, since
 *              ln(1+x) = x - x**2/2 + x**3/3 - ...     -1 < x < 1
 *              therefore ln(b/(b+1)) = ln(1 - 1/(b+1)) =~ -1/(b+1).
 *         ln(.1) =~ -2.30
 *
 * Proof of (1):
 *    Solve (factor)**(power) =~ .1 given power (5*loadav):
 *	solving for factor,
 *      ln(factor) =~ -2.30 / (5 * loadav), or
 *      factor =~ exp(-1/((5/2.30)*loadav)) =~ exp(-1/(2*loadav)) =
 *          exp(-1/b) =~ (b-1)/b =~ b/(b+1).                    QED
 *
 * Proof of (2):
 *    Solve (factor)**(power) =~ .1 given factor == (b/(b+1)):
 *	solving for power,
 *      power*ln(b/(b+1)) =~ -2.30, or
 *      power =~ 2.3 * (b + 1) = 4.6*loadav + 2.3 =~ 5*loadav.  QED
 *
 * Actual power values for the implemented algorithm are as follows:
 *      loadav: 1       2       3       4
 *      power:  5.68    10.32   14.94   19.55
 */
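
/*
 * Worked example: with loadavg == 2, decay is 4/5 == 0.8, and
 * 0.8 ** 10.32 =~ 0.1, which is where the "power: 10.32" entry in the table
 * above comes from -- roughly ten invocations of schedcpu() (ten seconds)
 * forget 90% of the accumulated p_estcpu.
 */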

/* calculations for digital decay to forget 90% of usage in 5*loadav sec */
#define	loadfactor(loadav)	(2 * (loadav))

static fixpt_t
decay_cpu(fixpt_t loadfac, fixpt_t estcpu)
{

	if (estcpu == 0) {
		return 0;
	}

#if !defined(_LP64)
	/* avoid 64-bit arithmetic. */
#define	FIXPT_MAX ((fixpt_t)((UINTMAX_C(1) << sizeof(fixpt_t) * CHAR_BIT) - 1))
	if (__predict_true(loadfac <= FIXPT_MAX / ESTCPU_MAX)) {
		return estcpu * loadfac / (loadfac + FSCALE);
	}
#endif /* !defined(_LP64) */

	return (uint64_t)estcpu * loadfac / (loadfac + FSCALE);
}
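
/*
 * In fixed-point terms (assuming the usual FSHIFT of 11, so FSCALE == 2048):
 * with a load average of 1.0, loadfac == 2 * FSCALE == 4096 and the result
 * is estcpu * 4096 / 6144, i.e. estcpu scaled by 2/3 -- exactly the
 * decay == (2 * loadavg) / (2 * loadavg + 1) factor derived above.
 */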

/*
 * For all load averages >= 1 and max p_estcpu of (255 << ESTCPU_SHIFT),
 * sleeping for at least seven times the loadfactor will decay p_estcpu to
 * less than (1 << ESTCPU_SHIFT).
 *
 * note that our ESTCPU_MAX is actually much smaller than (255 << ESTCPU_SHIFT).
 */
static fixpt_t
decay_cpu_batch(fixpt_t loadfac, fixpt_t estcpu, unsigned int n)
{

	if ((n << FSHIFT) >= 7 * loadfac) {
		return 0;
	}

	while (estcpu != 0 && n > 1) {
		estcpu = decay_cpu(loadfac, estcpu);
		n--;
	}

	return estcpu;
}
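
/*
 * Example of the short cut above: loadfac is a fixpt_t, so the test
 * (n << FSHIFT) >= 7 * loadfac is equivalent to n >= 7 * 2 * loadavg.
 * With a load average of 1.0, that means 14 or more decay steps (14 seconds
 * asleep, since schedcpu() runs once per second) simply zero the estimate
 * instead of iterating.
 */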

/* decay 95% of `p_pctcpu' in 60 seconds; see CCPU_SHIFT before changing */
fixpt_t	ccpu = 0.95122942450071400909 * FSCALE;		/* exp(-1/20) */

/*
 * If `ccpu' is not equal to `exp(-1/20)' and you still want to use the
 * faster/more-accurate formula, you'll have to estimate CCPU_SHIFT below
 * and possibly adjust FSHIFT in "param.h" so that (FSHIFT >= CCPU_SHIFT).
 *
 * To estimate CCPU_SHIFT for exp(-1/20), the following formula was used:
 *	1 - exp(-1/20) ~= 0.0487 ~= 0.0488 == 1 (fixed pt, *11* bits).
 *
 * If you don't want to bother with the faster/more-accurate formula, you
 * can set CCPU_SHIFT to (FSHIFT + 1) which will use a slower/less-accurate
 * (more general) method of calculating the %age of CPU used by a process.
 */
#define	CCPU_SHIFT	11
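
/*
 * For reference: p_pctcpu is multiplied by ccpu once per second in
 * schedcpu(), and exp(-1/20) ** 60 == exp(-3) =~ 0.05, which is where the
 * "decay 95% in 60 seconds" figure above comes from.  With the usual FSHIFT
 * of 11, ccpu is stored as roughly 1948/2048.
 */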

/*
 * Recompute process priorities, every hz ticks.
 */
/* ARGSUSED */
void
schedcpu(void *arg)
{
	fixpt_t loadfac = loadfactor(averunnable.ldavg[0]);
	struct rlimit *rlim;
	struct lwp *l;
	struct proc *p;
	int s, minslp;
	int clkhz;
	long runtm;

	schedcpu_ticks++;

	mutex_enter(&proclist_mutex);
	PROCLIST_FOREACH(p, &allproc) {
		/*
		 * Increment time in/out of memory and sleep time (if
		 * sleeping).  We ignore overflow; with 16-bit int's
		 * (remember them?) overflow takes 45 days.
		 *
		 * XXXSMP Should create an activeproc list so that we
		 * don't touch every proc+LWP in the system on a regular
		 * basis.  l->l_swtime/l->l_slptime can become deltas.
		 */
		minslp = 2;
		runtm = 0;
		mutex_enter(&p->p_smutex);
		LIST_FOREACH(l, &p->p_lwps, l_sibling) {
			lwp_lock(l);
			runtm += l->l_rtime.tv_sec;
			l->l_swtime++;
			if (l->l_stat == LSSLEEP || l->l_stat == LSSTOP ||
			    l->l_stat == LSSUSPENDED) {
				l->l_slptime++;
				minslp = min(minslp, l->l_slptime);
			} else
				minslp = 0;
			lwp_unlock(l);
		}
		p->p_pctcpu = (p->p_pctcpu * ccpu) >> FSHIFT;

		/*
		 * Check if the process exceeds its CPU resource allocation.
		 * If over max, kill it.  In any case, if it has run for more
		 * than autonicetime, reduce priority to give others a chance.
		 */
		rlim = &p->p_rlimit[RLIMIT_CPU];
		if (runtm >= rlim->rlim_cur) {
			if (runtm >= rlim->rlim_max)
				psignal(p, SIGKILL);
			else {
				psignal(p, SIGXCPU);
				if (rlim->rlim_cur < rlim->rlim_max)
					rlim->rlim_cur += 5;
			}
		}
		if (autonicetime && runtm > autonicetime && p->p_nice == NZERO
		    && kauth_cred_geteuid(p->p_cred)) {
			p->p_nice = autoniceval + NZERO;
			resetprocpriority(p);
		}

		/*
		 * If the process has slept the entire second,
		 * stop recalculating its priority until it wakes up.
		 */
		if (minslp > 1) {
			mutex_exit(&p->p_smutex);
			continue;
		}
		/* XXXAD lock */
		s = splstatclock();	/* prevent state changes */
		/*
		 * p_pctcpu is only for ps.
		 */
		clkhz = stathz != 0 ? stathz : hz;
#if	(FSHIFT >= CCPU_SHIFT)
		p->p_pctcpu += (clkhz == 100)?
			((fixpt_t) p->p_cpticks) << (FSHIFT - CCPU_SHIFT):
			100 * (((fixpt_t) p->p_cpticks)
				<< (FSHIFT - CCPU_SHIFT)) / clkhz;
#else
		p->p_pctcpu += ((FSCALE - ccpu) *
			(p->p_cpticks * FSCALE / clkhz)) >> FSHIFT;
#endif
		p->p_cpticks = 0;
		p->p_estcpu = decay_cpu(loadfac, p->p_estcpu);
		splx(s);	/* Done with the process CPU ticks update */
		LIST_FOREACH(l, &p->p_lwps, l_sibling) {
			lwp_lock(l);
			if (l->l_slptime > 1) {
				lwp_unlock(l);
				continue;
			}
			resetpriority(l);
			if (l->l_priority >= PUSER) {
				if (l->l_stat == LSRUN &&
				    (l->l_flag & L_INMEM) &&
				    (l->l_priority / PPQ) != (l->l_usrpri / PPQ)) {
					remrunqueue(l);
					l->l_priority = l->l_usrpri;
					setrunqueue(l);
				} else
					l->l_priority = l->l_usrpri;
			}
			lwp_unlock(l);
		}
		mutex_exit(&p->p_smutex);
	}
	mutex_exit(&proclist_mutex);
	uvm_meter();
	wakeup((caddr_t)&lbolt);
	callout_schedule(&schedcpu_ch, hz);
}

/*
 * Recalculate the priority of a process after it has slept for a while.
 */
void
updatepri(struct lwp *l)
{
	struct proc *p = l->l_proc;
	fixpt_t loadfac;

	LOCK_ASSERT(lwp_locked(l, NULL));
	KASSERT(l->l_slptime > 1);

	loadfac = loadfactor(averunnable.ldavg[0]);

	l->l_slptime--;	/* the first time was done in schedcpu */
	/* XXX NJWLWP */
	/* XXXSMP occasionally unlocked. */
	p->p_estcpu = decay_cpu_batch(loadfac, p->p_estcpu, l->l_slptime);
	resetpriority(l);
}

/*
 * During autoconfiguration or after a panic, a sleep will simply lower the
 * priority briefly to allow interrupts, then return.  The priority to be
 * used (safepri) is machine-dependent, thus this value is initialized and
 * maintained in the machine-dependent layers.  This priority will typically
 * be 0, or the lowest priority that is safe for use on the interrupt stack;
 * it can be made higher to block network software interrupts after panics.
 */
int	safepri;

/*
 * ltsleep: see mtsleep() for comments.
 */
int
ltsleep(wchan_t ident, int priority, const char *wmesg, int timo,
	volatile struct simplelock *interlock)
{
	struct lwp *l = curlwp;
	sleepq_t *sq;
	int error;

	if (sleepq_dontsleep(l)) {
		(void)sleepq_abort(NULL, 0);
		if ((priority & PNORELOCK) != 0)
			simple_unlock(interlock);
		return 0;
	}

	sq = sleeptab_lookup(ident);
	sleepq_enter(sq, priority, ident, wmesg, timo, priority & PCATCH);

	if (interlock != NULL) {
		LOCK_ASSERT(simple_lock_held(interlock));
		simple_unlock(interlock);
	}

	error = sleepq_block(sq, timo);

	if (interlock != NULL && (priority & PNORELOCK) == 0)
		simple_lock(interlock);

	return error;
}

/*
 * General sleep call.  Suspends the current process until a wakeup is
 * performed on the specified identifier.  The process will then be made
 * runnable with the specified priority.  Sleeps at most timo/hz seconds (0
 * means no timeout).  If pri includes the PCATCH flag, signals are checked
 * before and after sleeping; otherwise signals are not checked.  Returns 0
 * if awakened, EWOULDBLOCK if the timeout expires.  If PCATCH is set and a
 * signal needs to be delivered, ERESTART is returned if the current system
 * call should be restarted if possible, and EINTR is returned if the system
 * call should be interrupted by the signal.
 *
 * The interlock is held until we are on a sleep queue.  The interlock will
 * be locked before returning back to the caller unless the PNORELOCK flag
 * is specified, in which case the interlock will always be unlocked upon
 * return.
 */
int
mtsleep(wchan_t ident, int priority, const char *wmesg, int timo,
	kmutex_t *mtx)
{
	struct lwp *l = curlwp;
	sleepq_t *sq;
	int error;

	if (sleepq_dontsleep(l))
		return sleepq_abort(mtx, priority & PNORELOCK);

	sq = sleeptab_lookup(ident);
	sleepq_enter(sq, priority, ident, wmesg, timo, priority & PCATCH);

	if (mtx != NULL) {
		LOCK_ASSERT(mutex_owned(mtx));
		mutex_exit_linked(mtx, l->l_mutex);
	}

	error = sleepq_block(sq, timo);

	if (mtx != NULL && (priority & PNORELOCK) == 0)
		mutex_enter(mtx);

	return error;
}
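
/*
 * Typical usage (a sketch only -- the softc, its mutex and the sc_ready
 * field below are hypothetical, not part of this file): callers normally
 * re-test their wakeup condition in a loop, since a wakeup does not
 * guarantee that the condition still holds by the time the sleeper runs:
 *
 *	mutex_enter(&sc->sc_mtx);
 *	while (!sc->sc_ready) {
 *		error = mtsleep(&sc->sc_ready, PRIBIO | PCATCH, "example",
 *		    0, &sc->sc_mtx);
 *		if (error != 0)
 *			break;
 *	}
 *	mutex_exit(&sc->sc_mtx);
 */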

void
sa_awaken(struct lwp *l)
{

	LOCK_ASSERT(lwp_locked(l, NULL));

	if (l == l->l_savp->savp_lwp && l->l_flag & L_SA_YIELD)
		l->l_flag &= ~L_SA_IDLE;
}

/*
 * Make all processes sleeping on the specified identifier runnable.
 */
void
wakeup(wchan_t ident)
{
	sleepq_t *sq;

	if (cold)
		return;

	sq = sleeptab_lookup(ident);
	sleepq_wakeall(sq, ident, (u_int)-1);
}

/*
 * Make the highest-priority process first in line on the specified
 * identifier runnable.
 */
void
wakeup_one(wchan_t ident)
{
	sleepq_t *sq;

	if (cold)
		return;

	sq = sleeptab_lookup(ident);
	sleepq_wakeone(sq, ident);
}


/*
 * General yield call.  Puts the current process back on its run queue and
 * performs a voluntary context switch.  Should only be called when the
 * current process explicitly requests it (e.g. sched_yield(2) in compat
 * code).
 */
void
yield(void)
{
	struct lwp *l = curlwp;

	lwp_lock(l);
	if (l->l_stat == LSONPROC) {
		KASSERT(lwp_locked(l, &sched_mutex));
		l->l_priority = l->l_usrpri;
	}
	l->l_nvcsw++;
	mi_switch(l, NULL);
}

/*
 * General preemption call.  Puts the current process back on its run queue
 * and performs an involuntary context switch.
 * The 'more' ("more work to do") argument is boolean.  preempt() calls made
 * on the return path to userspace pass 0; "voluntary" preemptions from
 * within the kernel, e.g. in uiomove(), pass 1.  This will be used to
 * indicate to the SA subsystem that the LWP is not yet finished in the
 * kernel.
 */
void
preempt(int more)
{
	struct lwp *l = curlwp;
	int r;

	lwp_lock(l);
	if (l->l_stat == LSONPROC) {
		KASSERT(lwp_locked(l, &sched_mutex));
		l->l_priority = l->l_usrpri;
	}
	l->l_nivcsw++;
	r = mi_switch(l, NULL);
	if ((l->l_flag & L_SA) != 0 && r != 0 && more == 0)	/* XXXAD */
		sa_preempt(l);
}

/*
 * The machine independent parts of context switch.  Switch to the LWP
 * "newl" if non-NULL, otherwise let cpu_switch() choose the next LWP.
 *
 * Returns 1 if another process was actually run.
 */
int
mi_switch(struct lwp *l, struct lwp *newl)
{
	struct schedstate_percpu *spc;
	struct timeval tv;
	int hold_count;
	int retval, oldspl;
	long s, u;
#if PERFCTRS
	struct proc *p = l->l_proc;
#endif

	LOCK_ASSERT(lwp_locked(l, NULL));

	/*
	 * Release the kernel_lock, as we are about to yield the CPU.
	 */
	hold_count = KERNEL_LOCK_RELEASE_ALL();

#ifdef LOCKDEBUG
	spinlock_switchcheck();
	simple_lock_switchcheck();
#endif
#ifdef KSTACK_CHECK_MAGIC
	kstack_check_magic(l);
#endif

	/*
	 * It's safe to read the per CPU schedstate unlocked here, as all we
	 * are after is the run time and that's guaranteed to have been last
	 * updated by this CPU.
	 */
	KDASSERT(l->l_cpu != NULL);
	KDASSERT(l->l_cpu == curcpu());
	spc = &l->l_cpu->ci_schedstate;

	/*
	 * Compute the amount of time during which the current
	 * process was running.
	 */
	microtime(&tv);
	u = l->l_rtime.tv_usec +
	    (tv.tv_usec - spc->spc_runtime.tv_usec);
	s = l->l_rtime.tv_sec + (tv.tv_sec - spc->spc_runtime.tv_sec);
	if (u < 0) {
		u += 1000000;
		s--;
	} else if (u >= 1000000) {
		u -= 1000000;
		s++;
	}
	l->l_rtime.tv_usec = u;
	l->l_rtime.tv_sec = s;

	/*
	 * XXXSMP If we are using h/w performance counters, save context.
	 */
#if PERFCTRS
	if (PMC_ENABLED(p)) {
		pmc_save_context(p);
	}
#endif

	/*
	 * Acquire the sched_mutex if necessary.  It will be released by
	 * cpu_switch once it has decided to idle, or picked another LWP
	 * to run.
	 */
	oldspl = mutex_getspl(l->l_mutex);
#ifdef MULTIPROCESSOR
	if (l->l_mutex != &sched_mutex || l->l_omutex != NULL) {
		lwp_unlock(l);
		mutex_enter(&sched_mutex);
	}
#endif

	/*
	 * If on the CPU and we have gotten this far, then we must yield.
	 */
	KASSERT(l->l_stat != LSRUN);
	if (l->l_stat == LSONPROC) {
		l->l_stat = LSRUN;
		setrunqueue(l);
	}
	uvmexp.swtch++;

	/*
	 * Process is about to yield the CPU; clear the appropriate
	 * scheduling flags.
	 */
	spc->spc_flags &= ~SPCF_SWITCHCLEAR;

	LOCKDEBUG_BARRIER(&sched_mutex, 1);

	/*
	 * Switch to the new current LWP.  When we run again, we'll
	 * return back here.
	 */
	if (newl == NULL)
		retval = cpu_switch(l, NULL);
	else {
		/* XXXAD ? */
		remrunqueue(newl);
		cpu_switchto(l, newl);
		retval = 0;
	}

	/*
	 * XXXSMP If we are using h/w performance counters, restore context.
	 */
#if PERFCTRS
	if (PMC_ENABLED(p)) {
		pmc_restore_context(p);
	}
#endif

	/*
	 * We're running again; record our new start time.  We might
	 * be running on a new CPU now, so don't use the cached
	 * schedstate_percpu pointer.
	 */
	KDASSERT(l->l_cpu != NULL);
	KDASSERT(l->l_cpu == curcpu());
	microtime(&l->l_cpu->ci_schedstate.spc_runtime);

	/*
	 * Reacquire the kernel_lock, and restore the old SPL.
	 * XXX We should do spl0() before acquiring the kernel
	 * lock but splx() may not raise the IPL.
	 */
	splx(oldspl);
	KERNEL_LOCK_ACQUIRE_COUNT(hold_count);

	return retval;
}

/*
 * Initialize the (doubly-linked) run queues
 * to be empty.
 */
void
rqinit()
{
	int i;

	for (i = 0; i < RUNQUE_NQS; i++)
		sched_qs[i].ph_link = sched_qs[i].ph_rlink =
		    (struct lwp *)&sched_qs[i];

	mutex_init(&sched_mutex, MUTEX_SPIN, IPL_SCHED);
}

static inline void
resched_lwp(struct lwp *l, u_char pri)
{
	struct cpu_info *ci;

	LOCK_ASSERT(lwp_locked(l, NULL));

	/*
	 * XXXSMP
	 * Since l->l_cpu persists across a context switch,
	 * this gives us *very weak* processor affinity, in
	 * that we notify the CPU on which the process last
	 * ran that it should try to switch.
	 *
	 * This does not guarantee that the process will run on
	 * that processor next, because another processor might
	 * grab it the next time it performs a context switch.
	 *
	 * This also does not handle the case where its last
	 * CPU is running a higher-priority process, but every
	 * other CPU is running a lower-priority process.  There
	 * are ways to handle this situation, but they're not
	 * currently very pretty, and we also need to weigh the
	 * cost of moving a process from one CPU to another.
	 */
	ci = (l->l_cpu != NULL) ? l->l_cpu : curcpu();
	if (pri < ci->ci_schedstate.spc_curpriority)
		cpu_need_resched(ci);
}

/*
 * Change process state to be runnable, placing it on the run queue if it is
 * in memory, and awakening the swapper if it isn't in memory.
 *
 * Call with the process and LWP locked.  Will return with the LWP unlocked.
 */
void
setrunnable(struct lwp *l)
{
	struct proc *p = l->l_proc;
	struct cpu_info *ci;

	LOCK_ASSERT(mutex_owned(&p->p_smutex));
	LOCK_ASSERT(lwp_locked(l, NULL));

	switch (l->l_stat) {
	case LSSTOP:
		/*
		 * If we're being traced (possibly because someone attached us
		 * while we were stopped), check for a signal from the debugger.
		 */
		if ((p->p_flag & P_TRACED) != 0 && p->p_xstat != 0) {
			sigaddset(&l->l_sigpend->sp_set, p->p_xstat);
			signotify(l);
		}
		p->p_nrlwps++;
		break;
	case LSSUSPENDED:
		p->p_nrlwps++;
		break;
	case LSSLEEP:
		break;
	default:
		panic("setrunnable: lwp %p state was %d", l, l->l_stat);
	}

	/*
	 * If the LWP was sleeping interruptibly, then it's OK to start it
	 * again.  If not, mark it as still sleeping.
	 */
	if (l->l_wchan != NULL) {
		l->l_stat = LSSLEEP;
		if ((l->l_flag & L_SINTR) != 0)
			sleepq_unsleep(l);
		return;
	}

	if (l->l_proc->p_sa)
		sa_awaken(l);

	/*
	 * Set sched_mutex as the LWP's current mutex.
	 */
	lwp_relock(l, &sched_mutex);

	/*
	 * If the LWP is still on the CPU, mark it as LSONPROC.  It may be
	 * about to call mi_switch(), in which case it will yield.
	 */
	if ((ci = l->l_cpu) != NULL && ci->ci_curlwp == l) {
		l->l_stat = LSONPROC;
		l->l_slptime = 0;
		lwp_unlock(l);
		return;
	}

	/*
	 * Set the LWP runnable.  If it's swapped out, we need to wake the
	 * swapper to bring it back in.  Otherwise, enter it into a run queue.
	 */
	l->l_stat = LSRUN;
	if (l->l_slptime > 1)
		updatepri(l);
	l->l_slptime = 0;

	if (l->l_flag & L_INMEM) {
		setrunqueue(l);
		resched_lwp(l, l->l_priority);
		lwp_unlock(l);
	} else {
		lwp_unlock(l);
		wakeup(&proc0);
	}
}

/*
 * Compute the priority of a process when running in user mode.
 * Arrange to reschedule if the resulting priority is better
 * than that of the current process.
 */
void
resetpriority(struct lwp *l)
{
	unsigned int newpriority;
	struct proc *p = l->l_proc;

	LOCK_ASSERT(lwp_locked(l, NULL));

	/* XXXSMP proc values will be accessed unlocked */
	newpriority = PUSER + (p->p_estcpu >> ESTCPU_SHIFT) +
	    NICE_WEIGHT * (p->p_nice - NZERO);
	newpriority = min(newpriority, MAXPRI);
	l->l_usrpri = newpriority;
	resched_lwp(l, l->l_usrpri);
}
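
/*
 * Worked example (assuming the usual constants PUSER == 50, NZERO == 20 and
 * MAXPRI == 127, with ESTCPU_SHIFT == 11 from above): a freshly started
 * process at nice 0 with p_estcpu == 0 gets l_usrpri == 50; the same process
 * after accumulating p_estcpu == 20 << 11 gets 50 + 20 == 70; and nice 19
 * alone adds NICE_WEIGHT * 19 == 38 to the base.  Larger numbers mean weaker
 * priority, and the result is clamped at MAXPRI.
 */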

/*
 * Recompute priority for all LWPs in a process.
 */
void
resetprocpriority(struct proc *p)
{
	struct lwp *l;

	LOCK_ASSERT(mutex_owned(&p->p_smutex));

	LIST_FOREACH(l, &p->p_lwps, l_sibling) {
		lwp_lock(l);
		resetpriority(l);
		lwp_unlock(l);
	}
}

/*
 * We adjust the priority of the current process.  The priority of a process
 * gets worse as it accumulates CPU time.  The CPU usage estimator (p_estcpu)
 * is increased here.  The formula for computing priorities (resetpriority()
 * above) will compute a different value each time p_estcpu increases.  This
 * can cause a switch, but unless the priority crosses a PPQ boundary the
 * actual queue will not change.  The CPU usage estimator ramps up quite
 * quickly when the process is running (linearly), and decays away
 * exponentially, at a rate which is proportionally slower when the system is
 * busy.  The basic principle is that the system will 90% forget that the
 * process used a lot of CPU time in 5 * loadav seconds.  This causes the
 * system to favor processes which haven't run much recently, and to
 * round-robin among other processes.
 */

void
schedclock(struct lwp *l)
{
	struct proc *p = l->l_proc;

	LOCK_ASSERT(mutex_owned(&p->p_smutex));

	p->p_estcpu = ESTCPULIM(p->p_estcpu + (1 << ESTCPU_SHIFT));

	lwp_lock(l);
	resetpriority(l);
	if (l->l_priority >= PUSER)
		l->l_priority = l->l_usrpri;
	lwp_unlock(l);
}
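
/*
 * Note that each call to schedclock() (it is driven from the clock interrupt
 * code, not from this file) bumps p_estcpu by exactly 1 << ESTCPU_SHIFT, so
 * the (p_estcpu >> ESTCPU_SHIFT) term in resetpriority() is effectively a
 * count of recent schedclock() calls less decay: every call makes the user
 * priority one slot worse until decay_cpu() and the ESTCPULIM() clamp catch
 * up.
 */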

/*
 * suspendsched:
 *
 *	Convert all non-L_SYSTEM LSSLEEP or LSRUN LWPs to LSSUSPENDED.
 *	This violates locking conventions (p->p_smutex is not held), but
 *	since it is only used during panic or shutdown that is not a
 *	problem.
 *
 *	XXXAD Do by process?
 */
void
suspendsched()
{
	struct lwp *l;
	struct proc *p;

	mutex_enter(&alllwp_mutex);
	LIST_FOREACH(l, &alllwp, l_list) {
		if ((l->l_flag & L_SYSTEM) != 0)
			continue;

		p = l->l_proc;

		lwp_lock(l);
		switch (l->l_stat) {
		case LSRUN:
			p->p_nrlwps--;
			l->l_stat = LSSUSPENDED;
			remrunqueue(l);
			break;
		case LSONPROC:
			p->p_nrlwps--;
			l->l_stat = LSSUSPENDED;
			break;
		case LSSLEEP:
			p->p_nrlwps--;
			l->l_stat = LSSUSPENDED;
			break;
		default:
			break;
		}
		lwp_setlock_unlock(l, &lwp_mutex);
	}
	mutex_exit(&alllwp_mutex);
}

/*
 * scheduler_fork_hook:
 *
 *	Inherit the parent's scheduler history.
 */
void
scheduler_fork_hook(struct proc *parent, struct proc *child)
{

	mutex_enter(&parent->p_smutex);
	child->p_estcpu = child->p_estcpu_inherited = parent->p_estcpu;
	child->p_forktime = schedcpu_ticks;
	mutex_exit(&parent->p_smutex);
}

/*
 * scheduler_wait_hook:
 *
 *	Chargeback parents for the sins of their children.
 */
void
scheduler_wait_hook(struct proc *parent, struct proc *child)
{
	fixpt_t loadfac = loadfactor(averunnable.ldavg[0]);
	fixpt_t estcpu;

	/* XXX Only if parent != init?? */

	mutex_enter(&parent->p_smutex);
	estcpu = decay_cpu_batch(loadfac, child->p_estcpu_inherited,
	    schedcpu_ticks - child->p_forktime);
	if (child->p_estcpu > estcpu)
		parent->p_estcpu =
		    ESTCPULIM(parent->p_estcpu + child->p_estcpu - estcpu);
	mutex_exit(&parent->p_smutex);
}
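
/*
 * In other words: at fork the child records how much p_estcpu it inherited
 * and when; at wait time the inherited share is decayed over the child's
 * lifetime (schedcpu_ticks - p_forktime seconds), and only the CPU usage the
 * child accumulated beyond that decayed inheritance is charged back to the
 * parent.
 */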

/*
 * XXXAD Scale an LWP priority (possibly a user priority) to a kernel one.
 * This is a hack; think of something better.  Do unlocked for now.
 */
int
sched_kpri(struct lwp *l)
{
	const int obase = PUSER;
	const int ospan = MAXPRI - obase;
	const int nbase = PRIBIO;
	const int nspan = PUSER - nbase;

	if (l->l_priority < obase)
		return (l->l_priority);

	return (l->l_priority - obase) * nspan / ospan + nbase;
}
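
/*
 * For illustration, assuming the usual values PUSER == 50, MAXPRI == 127 and
 * PRIBIO == 16: user priorities in [50, 127] are compressed linearly into
 * the kernel sleep-priority range [16, 50], e.g. a user priority of 88 maps
 * to (88 - 50) * 34 / 77 + 16 == 32.  Priorities already below PUSER are
 * returned unchanged.
 */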

void
sched_lock(void)
{

	mutex_enter(&sched_mutex);
}

void
sched_unlock(void)
{

	mutex_exit(&sched_mutex);
}

/*
 * Low-level routines to access the run queue.  Optimised assembler
 * routines can override these.
 */

#ifndef __HAVE_MD_RUNQUEUE

/*
 * On some architectures, it's faster to use a MSB ordering for the priorities
 * than the traditional LSB ordering.
 */
#ifdef __HAVE_BIGENDIAN_BITOPS
#define	RQMASK(n) (0x80000000 >> (n))
#else
#define	RQMASK(n) (0x00000001 << (n))
#endif

/*
 * The primitives that manipulate the run queues.  whichqs tells which
 * of the 32 queues qs have processes in them.  Setrunqueue puts processes
 * into queues, remrunqueue removes them from queues.  The running process is
 * on no queue, other processes are on a queue related to p->p_priority,
 * divided by 4 actually to shrink the 0-127 range of priorities into the 32
 * available queues.
 */
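
/*
 * Concretely: an LWP with priority p lives on queue p / PPQ (priorities 0-3
 * on queue 0, ..., priorities 124-127 on queue 31), and RQMASK(whichq) is
 * the bit in sched_whichqs that stays set while that queue is non-empty, so
 * the best non-empty queue can be found with a single find-first-bit
 * operation.
 */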
#ifdef RQDEBUG
static void
checkrunqueue(int whichq, struct lwp *l)
{
	const struct prochd * const rq = &sched_qs[whichq];
	struct lwp *l2;
	int found = 0;
	int die = 0;
	int empty = 1;
	for (l2 = rq->ph_link; l2 != (const void*) rq; l2 = l2->l_forw) {
		if (l2->l_stat != LSRUN) {
			printf("checkrunqueue[%d]: lwp %p state (%d) "
			    " != LSRUN\n", whichq, l2, l2->l_stat);
		}
		if (l2->l_back->l_forw != l2) {
			printf("checkrunqueue[%d]: lwp %p back-qptr (%p) "
			    "corrupt %p\n", whichq, l2, l2->l_back,
			    l2->l_back->l_forw);
			die = 1;
		}
		if (l2->l_forw->l_back != l2) {
			printf("checkrunqueue[%d]: lwp %p forw-qptr (%p) "
			    "corrupt %p\n", whichq, l2, l2->l_forw,
			    l2->l_forw->l_back);
			die = 1;
		}
		if (l2 == l)
			found = 1;
		empty = 0;
	}
	if (empty && (sched_whichqs & RQMASK(whichq)) != 0) {
		printf("checkrunqueue[%d]: bit set for empty run-queue %p\n",
		    whichq, rq);
		die = 1;
	} else if (!empty && (sched_whichqs & RQMASK(whichq)) == 0) {
		printf("checkrunqueue[%d]: bit clear for non-empty "
		    "run-queue %p\n", whichq, rq);
		die = 1;
	}
	if (l != NULL && (sched_whichqs & RQMASK(whichq)) == 0) {
		printf("checkrunqueue[%d]: bit clear for active lwp %p\n",
		    whichq, l);
		die = 1;
	}
	if (l != NULL && empty) {
		printf("checkrunqueue[%d]: empty run-queue %p with "
		    "active lwp %p\n", whichq, rq, l);
		die = 1;
	}
	if (l != NULL && !found) {
		printf("checkrunqueue[%d]: lwp %p not in runqueue %p!",
		    whichq, l, rq);
		die = 1;
	}
	if (die)
		panic("checkrunqueue: inconsistency found");
}
#endif	/* RQDEBUG */

void
setrunqueue(struct lwp *l)
{
	struct prochd *rq;
	struct lwp *prev;
	const int whichq = l->l_priority / PPQ;

	LOCK_ASSERT(lwp_locked(l, &sched_mutex));

#ifdef RQDEBUG
	checkrunqueue(whichq, NULL);
#endif
#ifdef DIAGNOSTIC
	if (l->l_back != NULL || l->l_stat != LSRUN)
		panic("setrunqueue");
#endif
	sched_whichqs |= RQMASK(whichq);
	rq = &sched_qs[whichq];
	prev = rq->ph_rlink;
	l->l_forw = (struct lwp *)rq;
	rq->ph_rlink = l;
	prev->l_forw = l;
	l->l_back = prev;
#ifdef RQDEBUG
	checkrunqueue(whichq, l);
#endif
}

void
remrunqueue(struct lwp *l)
{
	struct lwp *prev, *next;
	const int whichq = l->l_priority / PPQ;

	LOCK_ASSERT(lwp_locked(l, &sched_mutex));

#ifdef RQDEBUG
	checkrunqueue(whichq, l);
#endif

#if defined(DIAGNOSTIC)
	if (((sched_whichqs & RQMASK(whichq)) == 0) || l->l_back == NULL) {
		/* Shouldn't happen - interrupts disabled. */
		panic("remrunqueue: bit %d not set", whichq);
	}
#endif
	prev = l->l_back;
	l->l_back = NULL;
	next = l->l_forw;
	prev->l_forw = next;
	next->l_back = prev;
	if (prev == next)
		sched_whichqs &= ~RQMASK(whichq);
#ifdef RQDEBUG
	checkrunqueue(whichq, NULL);
#endif
}

#undef RQMASK
#endif /* !defined(__HAVE_MD_RUNQUEUE) */