/*	$NetBSD: kern_synch.c,v 1.177 2007/02/15 20:21:13 ad Exp $	*/

/*-
 * Copyright (c) 1999, 2000, 2004, 2006, 2007 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
 * NASA Ames Research Center, by Charles M. Hannum, and by Andrew Doran.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *        This product includes software developed by the NetBSD
 *        Foundation, Inc. and its contributors.
 * 4. Neither the name of The NetBSD Foundation nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*-
 * Copyright (c) 1982, 1986, 1990, 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)kern_synch.c	8.9 (Berkeley) 5/19/95
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: kern_synch.c,v 1.177 2007/02/15 20:21:13 ad Exp $");

#include "opt_ddb.h"
#include "opt_kstack.h"
#include "opt_lockdebug.h"
#include "opt_multiprocessor.h"
#include "opt_perfctrs.h"

#define	__MUTEX_PRIVATE

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/callout.h>
#include <sys/proc.h>
#include <sys/kernel.h>
#include <sys/buf.h>
#if defined(PERFCTRS)
#include <sys/pmc.h>
#endif
#include <sys/signalvar.h>
#include <sys/resourcevar.h>
#include <sys/sched.h>
#include <sys/kauth.h>
#include <sys/sleepq.h>
#include <sys/lockdebug.h>

#include <uvm/uvm_extern.h>

#include <machine/cpu.h>

int	lbolt;			/* once a second sleep address */
int	rrticks;		/* number of hardclock ticks per roundrobin() */

/*
 * The global scheduler state.
 */
kmutex_t	sched_mutex;		/* global sched state mutex */
struct prochd	sched_qs[RUNQUE_NQS];	/* run queues */
volatile uint32_t sched_whichqs;	/* bitmap of non-empty queues */

void	schedcpu(void *);
void	updatepri(struct lwp *);

void	sched_unsleep(struct lwp *);
void	sched_changepri(struct lwp *, int);

struct callout schedcpu_ch = CALLOUT_INITIALIZER_SETFUNC(schedcpu, NULL);
static unsigned int schedcpu_ticks;

syncobj_t sleep_syncobj = {
	SOBJ_SLEEPQ_SORTED,
	sleepq_unsleep,
	sleepq_changepri
};

syncobj_t sched_syncobj = {
	SOBJ_SLEEPQ_SORTED,
	sched_unsleep,
	sched_changepri
};

/*
 * Force switch among equal priority processes every 100ms.
 * Called from hardclock every hz/10 == rrticks hardclock ticks.
 */
/* ARGSUSED */
void
roundrobin(struct cpu_info *ci)
{
	struct schedstate_percpu *spc = &ci->ci_schedstate;

	spc->spc_rrticks = rrticks;

	if (curlwp != NULL) {
		if (spc->spc_flags & SPCF_SEENRR) {
			/*
			 * The process has already been through a roundrobin
			 * without switching and may be hogging the CPU.
			 * Indicate that the process should yield.
			 */
			spc->spc_flags |= SPCF_SHOULDYIELD;
		} else
			spc->spc_flags |= SPCF_SEENRR;
	}
	cpu_need_resched(curcpu());
}

#define	PPQ		(128 / RUNQUE_NQS)	/* priorities per queue */
#define	NICE_WEIGHT	2			/* priorities per nice level */

#define	ESTCPU_SHIFT	11
#define	ESTCPU_MAX	((NICE_WEIGHT * PRIO_MAX - PPQ) << ESTCPU_SHIFT)
#define	ESTCPULIM(e)	min((e), ESTCPU_MAX)
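
/*
 * Illustrative numbers for the macros above (added note; assumes the
 * usual RUNQUE_NQS of 32 and PRIO_MAX of 20 from other headers):
 *
 *	PPQ        = 128 / 32 = 4 priority levels per run queue
 *	ESTCPU_MAX = (2 * 20 - 4) << 11 = 36 << 11 = 73728
 *
 * i.e. p_estcpu is kept scaled by 2^ESTCPU_SHIFT and clamped so that it
 * can worsen a user priority by at most 36 levels in resetpriority().
 */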

/*
 * Constants for digital decay and forget:
 *	90% of (p_estcpu) usage in 5 * loadav time
 *	95% of (p_pctcpu) usage in 60 seconds (load insensitive)
 *          Note that, as ps(1) mentions, this can let percentages
 *          total over 100% (I've seen 137.9% for 3 processes).
 *
 * Note that hardclock updates p_estcpu and p_cpticks independently.
 *
 * We wish to decay away 90% of p_estcpu in (5 * loadavg) seconds.
 * That is, the system wants to compute a value of decay such
 * that the following for loop:
 * 	for (i = 0; i < (5 * loadavg); i++)
 * 		p_estcpu *= decay;
 * will compute
 * 	p_estcpu *= 0.1;
 * for all values of loadavg:
 *
 * Mathematically this loop can be expressed by saying:
 * 	decay ** (5 * loadavg) ~= .1
 *
 * The system computes decay as:
 * 	decay = (2 * loadavg) / (2 * loadavg + 1)
 *
 * We wish to prove that the system's computation of decay
 * will always fulfill the equation:
 * 	decay ** (5 * loadavg) ~= .1
 *
 * If we compute b as:
 * 	b = 2 * loadavg
 * then
 * 	decay = b / (b + 1)
 *
 * We now need to prove two things:
 *	1) Given factor ** (5 * loadavg) ~= .1, prove factor == b/(b+1)
 *	2) Given b/(b+1) ** power ~= .1, prove power == (5 * loadavg)
 *
 * Facts:
 *         For x close to zero, exp(x) =~ 1 + x, since
 *              exp(x) = 0! + x**1/1! + x**2/2! + ... .
 *              therefore exp(-1/b) =~ 1 - (1/b) = (b-1)/b.
 *         For x close to zero, ln(1+x) =~ x, since
 *              ln(1+x) = x - x**2/2 + x**3/3 - ...     -1 < x < 1
 *              therefore ln(b/(b+1)) = ln(1 - 1/(b+1)) =~ -1/(b+1).
 *         ln(.1) =~ -2.30
 *
 * Proof of (1):
 *    Solve (factor)**(power) =~ .1 given power (5*loadav):
 *	solving for factor,
 *      ln(factor) =~ (-2.30/5*loadav), or
 *      factor =~ exp(-1/((5/2.30)*loadav)) =~ exp(-1/(2*loadav)) =
 *          exp(-1/b) =~ (b-1)/b =~ b/(b+1).                    QED
 *
 * Proof of (2):
 *    Solve (factor)**(power) =~ .1 given factor == (b/(b+1)):
 *	solving for power,
 *      power*ln(b/(b+1)) =~ -2.30, or
 *      power =~ 2.3 * (b + 1) = 4.6*loadav + 2.3 =~ 5*loadav.  QED
 *
 * Actual power values for the implemented algorithm are as follows:
 *      loadav: 1       2       3       4
 *      power:  5.68    10.32   14.94   19.55
 */
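
/*
 * Worked example (added for clarity, not part of the original proof):
 * with loadavg = 2, decay = (2*2) / (2*2 + 1) = 4/5 = 0.8, and
 * 0.8 ** 10 =~ 0.107, i.e. roughly 90% of p_estcpu is forgotten after
 * about 5 * loadavg = 10 decay steps, matching the power value 10.32
 * in the table above.
 */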

/* calculations for digital decay to forget 90% of usage in 5*loadav sec */
#define	loadfactor(loadav)	(2 * (loadav))

static fixpt_t
decay_cpu(fixpt_t loadfac, fixpt_t estcpu)
{

	if (estcpu == 0) {
		return 0;
	}

#if !defined(_LP64)
	/* avoid 64bit arithmetics. */
#define	FIXPT_MAX ((fixpt_t)((UINTMAX_C(1) << sizeof(fixpt_t) * CHAR_BIT) - 1))
	if (__predict_true(loadfac <= FIXPT_MAX / ESTCPU_MAX)) {
		return estcpu * loadfac / (loadfac + FSCALE);
	}
#endif /* !defined(_LP64) */

	return (uint64_t)estcpu * loadfac / (loadfac + FSCALE);
}

/*
 * For all load averages >= 1 and max p_estcpu of (255 << ESTCPU_SHIFT),
 * sleeping for at least seven times the loadfactor will decay p_estcpu to
 * less than (1 << ESTCPU_SHIFT).
 *
 * note that our ESTCPU_MAX is actually much smaller than (255 << ESTCPU_SHIFT).
 */
static fixpt_t
decay_cpu_batch(fixpt_t loadfac, fixpt_t estcpu, unsigned int n)
{

	if ((n << FSHIFT) >= 7 * loadfac) {
		return 0;
	}

	while (estcpu != 0 && n > 1) {
		estcpu = decay_cpu(loadfac, estcpu);
		n--;
	}

	return estcpu;
}
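
/*
 * Added note: loadfac is in FSCALE units, so for a load average of 1.0,
 * loadfac == 2 * FSCALE and the early-exit test (n << FSHIFT) >= 7 * loadfac
 * reduces to n >= 14.  In other words, an LWP that has slept for 14 or
 * more seconds simply has its estimate zeroed rather than decayed step
 * by step.
 */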

/* decay 95% of `p_pctcpu' in 60 seconds; see CCPU_SHIFT before changing */
fixpt_t	ccpu = 0.95122942450071400909 * FSCALE;		/* exp(-1/20) */

/*
 * If `ccpu' is not equal to `exp(-1/20)' and you still want to use the
 * faster/more-accurate formula, you'll have to estimate CCPU_SHIFT below
 * and possibly adjust FSHIFT in "param.h" so that (FSHIFT >= CCPU_SHIFT).
 *
 * To estimate CCPU_SHIFT for exp(-1/20), the following formula was used:
 *	1 - exp(-1/20) ~= 0.0487 ~= 0.0488 == 1 (fixed pt, *11* bits).
 *
 * If you don't want to bother with the faster/more-accurate formula, you
 * can set CCPU_SHIFT to (FSHIFT + 1) which will use a slower/less-accurate
 * (more general) method of calculating the %age of CPU used by a process.
 */
#define	CCPU_SHIFT	11
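
/*
 * Added note for concreteness (assumes the usual FSHIFT of 11, i.e.
 * FSCALE == 2048): ccpu == 0.9512... * 2048 =~ 1948, so schedcpu() below
 * scales p_pctcpu by 1948/2048 once per second, and after 60 seconds
 * only about exp(-60/20) =~ 5% of the original %CPU figure remains.
 */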

/*
 * schedcpu:
 *
 * Recompute process priorities, every hz ticks.
 *
 * XXXSMP This needs to be reorganised in order to reduce the locking
 * burden.
 */
/* ARGSUSED */
void
schedcpu(void *arg)
{
	fixpt_t loadfac = loadfactor(averunnable.ldavg[0]);
	struct rlimit *rlim;
	struct lwp *l;
	struct proc *p;
	int minslp, clkhz, sig;
	long runtm;

	schedcpu_ticks++;

	mutex_enter(&proclist_mutex);
	PROCLIST_FOREACH(p, &allproc) {
		/*
		 * Increment time in/out of memory and sleep time (if
		 * sleeping).  We ignore overflow; with 16-bit int's
		 * (remember them?) overflow takes 45 days.
		 */
		minslp = 2;
		mutex_enter(&p->p_smutex);
		runtm = p->p_rtime.tv_sec;
		LIST_FOREACH(l, &p->p_lwps, l_sibling) {
			lwp_lock(l);
			runtm += l->l_rtime.tv_sec;
			l->l_swtime++;
			if (l->l_stat == LSSLEEP || l->l_stat == LSSTOP ||
			    l->l_stat == LSSUSPENDED) {
				l->l_slptime++;
				minslp = min(minslp, l->l_slptime);
			} else
				minslp = 0;
			lwp_unlock(l);
		}
		p->p_pctcpu = (p->p_pctcpu * ccpu) >> FSHIFT;

		/*
		 * Check if the process exceeds its CPU resource allocation.
		 * If over max, kill it.
		 */
		rlim = &p->p_rlimit[RLIMIT_CPU];
		sig = 0;
		if (runtm >= rlim->rlim_cur) {
			if (runtm >= rlim->rlim_max)
				sig = SIGKILL;
			else {
				sig = SIGXCPU;
				if (rlim->rlim_cur < rlim->rlim_max)
					rlim->rlim_cur += 5;
			}
		}

		/*
		 * If the process has run for more than autonicetime, reduce
		 * priority to give others a chance.
		 */
		if (autonicetime && runtm > autonicetime && p->p_nice == NZERO
		    && kauth_cred_geteuid(p->p_cred)) {
			mutex_spin_enter(&p->p_stmutex);
			p->p_nice = autoniceval + NZERO;
			resetprocpriority(p);
			mutex_spin_exit(&p->p_stmutex);
		}

		/*
		 * If the process has slept the entire second,
		 * stop recalculating its priority until it wakes up.
		 */
		if (minslp <= 1) {
			/*
			 * p_pctcpu is only for ps.
			 */
			mutex_spin_enter(&p->p_stmutex);
			clkhz = stathz != 0 ? stathz : hz;
#if	(FSHIFT >= CCPU_SHIFT)
			p->p_pctcpu += (clkhz == 100)?
			    ((fixpt_t) p->p_cpticks) << (FSHIFT - CCPU_SHIFT):
			    100 * (((fixpt_t) p->p_cpticks)
			    << (FSHIFT - CCPU_SHIFT)) / clkhz;
#else
			p->p_pctcpu += ((FSCALE - ccpu) *
			    (p->p_cpticks * FSCALE / clkhz)) >> FSHIFT;
#endif
			p->p_cpticks = 0;
			p->p_estcpu = decay_cpu(loadfac, p->p_estcpu);

			LIST_FOREACH(l, &p->p_lwps, l_sibling) {
				lwp_lock(l);
				if (l->l_slptime <= 1 &&
				    l->l_priority >= PUSER)
					resetpriority(l);
				lwp_unlock(l);
			}
			mutex_spin_exit(&p->p_stmutex);
		}

		mutex_exit(&p->p_smutex);
		if (sig) {
			psignal(p, sig);
		}
	}
	mutex_exit(&proclist_mutex);
	uvm_meter();
	wakeup((caddr_t)&lbolt);
	callout_schedule(&schedcpu_ch, hz);
}

/*
 * Recalculate the priority of a process after it has slept for a while.
 */
void
updatepri(struct lwp *l)
{
	struct proc *p = l->l_proc;
	fixpt_t loadfac;

	LOCK_ASSERT(lwp_locked(l, NULL));
	KASSERT(l->l_slptime > 1);

	loadfac = loadfactor(averunnable.ldavg[0]);

	l->l_slptime--; /* the first time was done in schedcpu */
	/* XXX NJWLWP */
	/* XXXSMP occasionally unlocked, should be per-LWP */
	p->p_estcpu = decay_cpu_batch(loadfac, p->p_estcpu, l->l_slptime);
	resetpriority(l);
}

/*
 * During autoconfiguration or after a panic, a sleep will simply lower the
 * priority briefly to allow interrupts, then return.  The priority to be
 * used (safepri) is machine-dependent, thus this value is initialized and
 * maintained in the machine-dependent layers.  This priority will typically
 * be 0, or the lowest priority that is safe for use on the interrupt stack;
 * it can be made higher to block network software interrupts after panics.
 */
int	safepri;

/*
 * OBSOLETE INTERFACE
 *
 * General sleep call.  Suspends the current process until a wakeup is
 * performed on the specified identifier.  The process will then be made
 * runnable with the specified priority.  Sleeps at most timo/hz seconds (0
 * means no timeout).  If pri includes PCATCH flag, signals are checked
 * before and after sleeping, else signals are not checked.  Returns 0 if
 * awakened, EWOULDBLOCK if the timeout expires.  If PCATCH is set and a
 * signal needs to be delivered, ERESTART is returned if the current system
 * call should be restarted if possible, and EINTR is returned if the system
 * call should be interrupted by the signal (return EINTR).
 *
 * The interlock is held until we are on a sleep queue.  The interlock will
 * be locked before returning back to the caller unless the PNORELOCK flag
 * is specified, in which case the interlock will always be unlocked upon
 * return.
 */
int
ltsleep(wchan_t ident, int priority, const char *wmesg, int timo,
	volatile struct simplelock *interlock)
{
	struct lwp *l = curlwp;
	sleepq_t *sq;
	int error, catch;

	if (sleepq_dontsleep(l)) {
		(void)sleepq_abort(NULL, 0);
		if ((priority & PNORELOCK) != 0)
			simple_unlock(interlock);
		return 0;
	}

	sq = sleeptab_lookup(&sleeptab, ident);
	sleepq_enter(sq, l);

	if (interlock != NULL) {
		LOCK_ASSERT(simple_lock_held(interlock));
		simple_unlock(interlock);
	}

	catch = priority & PCATCH;
	sleepq_block(sq, priority & PRIMASK, ident, wmesg, timo, catch,
	    &sleep_syncobj);
	error = sleepq_unblock(timo, catch);

	if (interlock != NULL && (priority & PNORELOCK) == 0)
		simple_lock(interlock);

	return error;
}
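
/*
 * Usage sketch (added for illustration only; "sc", "sc_slock" and "sc_busy"
 * are hypothetical driver fields, not part of this file):
 *
 *	simple_lock(&sc->sc_slock);
 *	while (sc->sc_busy) {
 *		error = ltsleep(&sc->sc_busy, PRIBIO | PCATCH, "busywait",
 *		    0, &sc->sc_slock);
 *		if (error != 0)
 *			break;
 *	}
 *	simple_unlock(&sc->sc_slock);
 *
 * The interlock is only dropped once the LWP is on the sleep queue, so a
 * wakeup(&sc->sc_busy) issued while sc_slock is held cannot be missed.
 */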

/*
 * General sleep call for situations where a wake-up is not expected.
 */
int
kpause(const char *wmesg, boolean_t intr, int timo, kmutex_t *mtx)
{
	struct lwp *l = curlwp;
	sleepq_t *sq;
	int error;

	if (sleepq_dontsleep(l))
		return sleepq_abort(NULL, 0);

	if (mtx != NULL)
		mutex_exit(mtx);
	sq = sleeptab_lookup(&sleeptab, l);
	sleepq_enter(sq, l);
	sleepq_block(sq, sched_kpri(l), l, wmesg, timo, intr, &sleep_syncobj);
	error = sleepq_unblock(timo, intr);
	if (mtx != NULL)
		mutex_enter(mtx);

	return error;
}
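
/*
 * Usage sketch (added for illustration; "sc_lock" is a hypothetical
 * caller-held kmutex): to pause the current LWP for roughly a tenth of
 * a second, dropping the mutex while asleep:
 *
 *	error = kpause("pause", FALSE, hz / 10, &sc->sc_lock);
 *
 * The LWP sleeps on its own lwp pointer, so no wakeup() address has to
 * be agreed upon with other code.
 */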

/*
 * OBSOLETE INTERFACE
 *
 * Make all processes sleeping on the specified identifier runnable.
 */
void
wakeup(wchan_t ident)
{
	sleepq_t *sq;

	if (cold)
		return;

	sq = sleeptab_lookup(&sleeptab, ident);
	sleepq_wake(sq, ident, (u_int)-1);
}

/*
 * OBSOLETE INTERFACE
 *
 * Make the highest priority process first in line on the specified
 * identifier runnable.
 */
void
wakeup_one(wchan_t ident)
{
	sleepq_t *sq;

	if (cold)
		return;

	sq = sleeptab_lookup(&sleeptab, ident);
	sleepq_wake(sq, ident, 1);
}


/*
 * General yield call.  Puts the current process back on its run queue and
 * performs a voluntary context switch.  Should only be called when the
 * current process explicitly requests it (eg sched_yield(2) in compat code).
 */
void
yield(void)
{
	struct lwp *l = curlwp;

	KERNEL_UNLOCK_ALL(l, &l->l_biglocks);
	lwp_lock(l);
	if (l->l_stat == LSONPROC) {
		KASSERT(lwp_locked(l, &sched_mutex));
		l->l_priority = l->l_usrpri;
	}
	l->l_nvcsw++;
	mi_switch(l, NULL);
	KERNEL_LOCK(l->l_biglocks, l);
}

/*
 * General preemption call.  Puts the current process back on its run queue
 * and performs an involuntary context switch.
 */
void
preempt(void)
{
	struct lwp *l = curlwp;

	KERNEL_UNLOCK_ALL(l, &l->l_biglocks);
	lwp_lock(l);
	if (l->l_stat == LSONPROC) {
		KASSERT(lwp_locked(l, &sched_mutex));
		l->l_priority = l->l_usrpri;
	}
	l->l_nivcsw++;
	(void)mi_switch(l, NULL);
	KERNEL_LOCK(l->l_biglocks, l);
}

/*
 * The machine independent parts of context switch.  Switch to "new"
 * if non-NULL, otherwise let cpu_switch choose the next lwp.
 *
 * Returns 1 if another process was actually run.
 */
int
mi_switch(struct lwp *l, struct lwp *newl)
{
	struct schedstate_percpu *spc;
	struct timeval tv;
	int retval, oldspl;
	long s, u;

	LOCK_ASSERT(lwp_locked(l, NULL));

#ifdef LOCKDEBUG
	spinlock_switchcheck();
	simple_lock_switchcheck();
#endif
#ifdef KSTACK_CHECK_MAGIC
	kstack_check_magic(l);
#endif

	/*
	 * It's safe to read the per CPU schedstate unlocked here, as all we
	 * are after is the run time and that's guaranteed to have been last
	 * updated by this CPU.
	 */
	KDASSERT(l->l_cpu == curcpu());
	spc = &l->l_cpu->ci_schedstate;

	/*
	 * Compute the amount of time during which the current
	 * process was running.
	 */
	microtime(&tv);
	u = l->l_rtime.tv_usec +
	    (tv.tv_usec - spc->spc_runtime.tv_usec);
	s = l->l_rtime.tv_sec + (tv.tv_sec - spc->spc_runtime.tv_sec);
	if (u < 0) {
		u += 1000000;
		s--;
	} else if (u >= 1000000) {
		u -= 1000000;
		s++;
	}
	l->l_rtime.tv_usec = u;
	l->l_rtime.tv_sec = s;

	/*
	 * XXXSMP If we are using h/w performance counters, save context.
	 */
#if PERFCTRS
	if (PMC_ENABLED(l->l_proc)) {
		pmc_save_context(l->l_proc);
	}
#endif

	/*
	 * Acquire the sched_mutex if necessary.  It will be released by
	 * cpu_switch once it has decided to idle, or picked another LWP
	 * to run.
	 */
#if defined(MULTIPROCESSOR) || defined(LOCKDEBUG)
	if (l->l_mutex != &sched_mutex) {
		mutex_spin_enter(&sched_mutex);
		lwp_unlock(l);
	}
#endif

	/*
	 * If on the CPU and we have gotten this far, then we must yield.
	 */
	KASSERT(l->l_stat != LSRUN);
	if (l->l_stat == LSONPROC) {
		KASSERT(lwp_locked(l, &sched_mutex));
		l->l_stat = LSRUN;
		setrunqueue(l);
	}
	uvmexp.swtch++;

	/*
	 * Process is about to yield the CPU; clear the appropriate
	 * scheduling flags.
	 */
	spc->spc_flags &= ~SPCF_SWITCHCLEAR;

	LOCKDEBUG_BARRIER(&sched_mutex, 1);

	/*
	 * Switch to the new current LWP.  When we run again, we'll
	 * return back here.
	 */
	oldspl = MUTEX_SPIN_OLDSPL(l->l_cpu);

	if (newl == NULL || newl->l_back == NULL)
		retval = cpu_switch(l, NULL);
	else {
		KASSERT(lwp_locked(newl, &sched_mutex));
		remrunqueue(newl);
		cpu_switchto(l, newl);
		retval = 0;
	}

	/*
	 * XXXSMP If we are using h/w performance counters, restore context.
	 */
#if PERFCTRS
	if (PMC_ENABLED(l->l_proc)) {
		pmc_restore_context(l->l_proc);
	}
#endif

	/*
	 * We're running again; record our new start time.  We might
	 * be running on a new CPU now, so don't use the cached
	 * schedstate_percpu pointer.
	 */
	KDASSERT(l->l_cpu == curcpu());
	microtime(&l->l_cpu->ci_schedstate.spc_runtime);
	splx(oldspl);

	return retval;
}

/*
 * Initialize the (doubly-linked) run queues
 * to be empty.
 */
void
rqinit()
{
	int i;

	for (i = 0; i < RUNQUE_NQS; i++)
		sched_qs[i].ph_link = sched_qs[i].ph_rlink =
		    (struct lwp *)&sched_qs[i];

	mutex_init(&sched_mutex, MUTEX_SPIN, IPL_SCHED);
}

static inline void
resched_lwp(struct lwp *l, u_char pri)
{
	struct cpu_info *ci;

	/*
	 * XXXSMP
	 * Since l->l_cpu persists across a context switch,
	 * this gives us *very weak* processor affinity, in
	 * that we notify the CPU on which the process last
	 * ran that it should try to switch.
	 *
	 * This does not guarantee that the process will run on
	 * that processor next, because another processor might
	 * grab it the next time it performs a context switch.
	 *
	 * This also does not handle the case where its last
	 * CPU is running a higher-priority process, but every
	 * other CPU is running a lower-priority process.  There
	 * are ways to handle this situation, but they're not
	 * currently very pretty, and we also need to weigh the
	 * cost of moving a process from one CPU to another.
	 *
	 * XXXSMP
	 * There is also the issue of locking the other CPU's
	 * sched state, which we currently do not do.
	 */
	ci = (l->l_cpu != NULL) ? l->l_cpu : curcpu();
	if (pri < ci->ci_schedstate.spc_curpriority)
		cpu_need_resched(ci);
}

/*
 * Change process state to be runnable, placing it on the run queue if it is
 * in memory, and awakening the swapper if it isn't in memory.
 *
 * Call with the process and LWP locked.  Will return with the LWP unlocked.
 */
void
setrunnable(struct lwp *l)
{
	struct proc *p = l->l_proc;
	sigset_t *ss;

	LOCK_ASSERT(mutex_owned(&p->p_smutex));
	LOCK_ASSERT(lwp_locked(l, NULL));

	switch (l->l_stat) {
	case LSSTOP:
		/*
		 * If we're being traced (possibly because someone attached us
		 * while we were stopped), check for a signal from the debugger.
		 */
		if ((p->p_slflag & PSL_TRACED) != 0 && p->p_xstat != 0) {
			if ((sigprop[p->p_xstat] & SA_TOLWP) != 0)
				ss = &l->l_sigpend.sp_set;
			else
				ss = &p->p_sigpend.sp_set;
			sigaddset(ss, p->p_xstat);
			signotify(l);
		}
		p->p_nrlwps++;
		break;
	case LSSUSPENDED:
		l->l_flag &= ~L_WSUSPEND;
		p->p_nrlwps++;
		break;
	case LSSLEEP:
		KASSERT(l->l_wchan != NULL);
		break;
	default:
		panic("setrunnable: lwp %p state was %d", l, l->l_stat);
	}

	/*
	 * If the LWP was sleeping interruptibly, then it's OK to start it
	 * again.  If not, mark it as still sleeping.
	 */
	if (l->l_wchan != NULL) {
		l->l_stat = LSSLEEP;
		if ((l->l_flag & L_SINTR) != 0)
			lwp_unsleep(l);
		else {
			lwp_unlock(l);
#ifdef DIAGNOSTIC
			panic("setrunnable: !L_SINTR");
#endif
		}
		return;
	}

	LOCK_ASSERT(lwp_locked(l, &sched_mutex));

	/*
	 * If the LWP is still on the CPU, mark it as LSONPROC.  It may be
	 * about to call mi_switch(), in which case it will yield.
	 *
	 * XXXSMP Will need to change for preemption.
	 */
#ifdef MULTIPROCESSOR
	if (l->l_cpu->ci_curlwp == l) {
#else
	if (l == curlwp) {
#endif
		l->l_stat = LSONPROC;
		l->l_slptime = 0;
		lwp_unlock(l);
		return;
	}

	/*
	 * Set the LWP runnable.  If it's swapped out, we need to wake the swapper
	 * to bring it back in.  Otherwise, enter it into a run queue.
	 */
	if (l->l_slptime > 1)
		updatepri(l);
	l->l_stat = LSRUN;
	l->l_slptime = 0;

	if (l->l_flag & L_INMEM) {
		setrunqueue(l);
		resched_lwp(l, l->l_priority);
		lwp_unlock(l);
	} else {
		lwp_unlock(l);
		uvm_kick_scheduler();
	}
}

/*
 * Compute the priority of a process when running in user mode.
 * Arrange to reschedule if the resulting priority is better
 * than that of the current process.
 */
void
resetpriority(struct lwp *l)
{
	unsigned int newpriority;
	struct proc *p = l->l_proc;

	/* XXXSMP LOCK_ASSERT(mutex_owned(&p->p_stmutex)); */
	LOCK_ASSERT(lwp_locked(l, NULL));

	if ((l->l_flag & L_SYSTEM) != 0)
		return;

	newpriority = PUSER + (p->p_estcpu >> ESTCPU_SHIFT) +
	    NICE_WEIGHT * (p->p_nice - NZERO);
	newpriority = min(newpriority, MAXPRI);
	lwp_changepri(l, newpriority);
}

/*
 * Recompute priority for all LWPs in a process.
 */
void
resetprocpriority(struct proc *p)
{
	struct lwp *l;

	LOCK_ASSERT(mutex_owned(&p->p_stmutex));

	LIST_FOREACH(l, &p->p_lwps, l_sibling) {
		lwp_lock(l);
		resetpriority(l);
		lwp_unlock(l);
	}
}

/*
 * We adjust the priority of the current process.  The priority of a process
 * gets worse as it accumulates CPU time.  The CPU usage estimator (p_estcpu)
 * is increased here.  The formula for computing priorities (in kern_synch.c)
 * will compute a different value each time p_estcpu increases.  This can
 * cause a switch, but unless the priority crosses a PPQ boundary the actual
 * queue will not change.  The CPU usage estimator ramps up quite quickly
 * when the process is running (linearly), and decays away exponentially, at
 * a rate which is proportionally slower when the system is busy.  The basic
 * principle is that the system will 90% forget that the process used a lot
 * of CPU time in 5 * loadav seconds.  This causes the system to favor
 * processes which haven't run much recently, and to round-robin among other
 * processes.
 */
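
/*
 * Worked example (added; assumes the usual PUSER of 50 and NZERO of 20):
 * a process at nice 0 (p_nice == NZERO) with p_estcpu == 8 << ESTCPU_SHIFT
 * gets newpriority = 50 + 8 + 0 = 58 from resetpriority() above, while
 * nice 5 would add NICE_WEIGHT * 5 = 10 more.  Each schedclock() call
 * below adds 1 << ESTCPU_SHIFT to p_estcpu, pushing the user priority
 * one level further from PUSER until the decay in schedcpu() catches up.
 */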

void
schedclock(struct lwp *l)
{
	struct proc *p = l->l_proc;

	mutex_spin_enter(&p->p_stmutex);
	p->p_estcpu = ESTCPULIM(p->p_estcpu + (1 << ESTCPU_SHIFT));
	lwp_lock(l);
	resetpriority(l);
	mutex_spin_exit(&p->p_stmutex);
	if ((l->l_flag & L_SYSTEM) == 0 && l->l_priority >= PUSER)
		l->l_priority = l->l_usrpri;
	lwp_unlock(l);
}

/*
 * suspendsched:
 *
 * Convert all non-L_SYSTEM LSSLEEP or LSRUN LWPs to LSSUSPENDED.
 */
void
suspendsched(void)
{
#ifdef MULTIPROCESSOR
	CPU_INFO_ITERATOR cii;
	struct cpu_info *ci;
#endif
	struct lwp *l;
	struct proc *p;

	/*
	 * We do this by process in order not to violate the locking rules.
	 */
	mutex_enter(&proclist_mutex);
	PROCLIST_FOREACH(p, &allproc) {
		mutex_enter(&p->p_smutex);

		if ((p->p_flag & P_SYSTEM) != 0) {
			mutex_exit(&p->p_smutex);
			continue;
		}

		p->p_stat = SSTOP;

		LIST_FOREACH(l, &p->p_lwps, l_sibling) {
			if (l == curlwp)
				continue;

			lwp_lock(l);

			/*
			 * Set L_WREBOOT so that the LWP will suspend itself
			 * when it tries to return to user mode.  We want to
			 * try to get as many LWPs as possible to
			 * the user / kernel boundary, so that they will
			 * release any locks that they hold.
			 */
			l->l_flag |= (L_WREBOOT | L_WSUSPEND);

			if (l->l_stat == LSSLEEP &&
			    (l->l_flag & L_SINTR) != 0) {
				/* setrunnable() will release the lock. */
				setrunnable(l);
				continue;
			}

			lwp_unlock(l);
		}

		mutex_exit(&p->p_smutex);
	}
	mutex_exit(&proclist_mutex);

	/*
	 * Kick all CPUs to make them preempt any LWPs running in user mode.
	 * They'll trap into the kernel and suspend themselves in userret().
	 */
	sched_lock(0);
#ifdef MULTIPROCESSOR
	for (CPU_INFO_FOREACH(cii, ci))
		cpu_need_resched(ci);
#else
	cpu_need_resched(curcpu());
#endif
	sched_unlock(0);
}

/*
 * scheduler_fork_hook:
 *
 * Inherit the parent's scheduler history.
 */
void
scheduler_fork_hook(struct proc *parent, struct proc *child)
{

	LOCK_ASSERT(mutex_owned(&parent->p_smutex));

	child->p_estcpu = child->p_estcpu_inherited = parent->p_estcpu;
	child->p_forktime = schedcpu_ticks;
}

/*
 * scheduler_wait_hook:
 *
 * Chargeback parents for the sins of their children.
 */
void
scheduler_wait_hook(struct proc *parent, struct proc *child)
{
	fixpt_t loadfac = loadfactor(averunnable.ldavg[0]);
	fixpt_t estcpu;

	/* XXX Only if parent != init?? */

	mutex_spin_enter(&parent->p_stmutex);
	estcpu = decay_cpu_batch(loadfac, child->p_estcpu_inherited,
	    schedcpu_ticks - child->p_forktime);
	if (child->p_estcpu > estcpu)
		parent->p_estcpu =
		    ESTCPULIM(parent->p_estcpu + child->p_estcpu - estcpu);
	mutex_spin_exit(&parent->p_stmutex);
}

/*
 * sched_kpri:
 *
 * Scale a priority level to a kernel priority level, usually
 * for an LWP that is about to sleep.
 */
int
sched_kpri(struct lwp *l)
{
	/*
	 * Scale user priorities (127 -> 50) up to kernel priorities
	 * in the range (49 -> 8).  Reserve the top 8 kernel priorities
	 * for high priority kthreads.  Kernel priorities passed in
	 * are left "as is".  XXX This is somewhat arbitrary.
	 */
	static const uint8_t kpri_tab[] = {
		 0,  1,  2,  3,  4,  5,  6,  7,
		 8,  9, 10, 11, 12, 13, 14, 15,
		16, 17, 18, 19, 20, 21, 22, 23,
		24, 25, 26, 27, 28, 29, 30, 31,
		32, 33, 34, 35, 36, 37, 38, 39,
		40, 41, 42, 43, 44, 45, 46, 47,
		48, 49,  8,  8,  9,  9, 10, 10,
		11, 11, 12, 12, 13, 14, 14, 15,
		15, 16, 16, 17, 17, 18, 18, 19,
		20, 20, 21, 21, 22, 22, 23, 23,
		24, 24, 25, 26, 26, 27, 27, 28,
		28, 29, 29, 30, 30, 31, 32, 32,
		33, 33, 34, 34, 35, 35, 36, 36,
		37, 38, 38, 39, 39, 40, 40, 41,
		41, 42, 42, 43, 44, 44, 45, 45,
		46, 46, 47, 47, 48, 48, 49, 49,
	};

	return kpri_tab[l->l_usrpri];
}
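
/*
 * Added note, readable directly off the table above: priorities 0..49 map
 * to themselves, while user priorities are compressed into the kernel
 * range, e.g. kpri_tab[50] == 8, kpri_tab[80] == 24 and kpri_tab[127] == 49.
 */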

/*
 * sched_unsleep:
 *
 * This is called when the LWP has not been awoken normally but instead
 * interrupted: for example, if the sleep timed out.  Because of this,
 * it's not a valid action for running or idle LWPs.
 */
void
sched_unsleep(struct lwp *l)
{

	lwp_unlock(l);
	panic("sched_unsleep");
}

/*
 * sched_changepri:
 *
 * Adjust the priority of an LWP.
 */
void
sched_changepri(struct lwp *l, int pri)
{

	LOCK_ASSERT(lwp_locked(l, &sched_mutex));

	l->l_usrpri = pri;

	if (l->l_priority < PUSER)
		return;
	if (l->l_stat != LSRUN || (l->l_flag & L_INMEM) == 0 ||
	    (l->l_priority / PPQ) == (pri / PPQ)) {
		l->l_priority = pri;
		return;
	}

	remrunqueue(l);
	l->l_priority = pri;
	setrunqueue(l);
	resched_lwp(l, pri);
}

/*
 * Low-level routines to access the run queue.  Optimised assembler
 * routines can override these.
 */

#ifndef __HAVE_MD_RUNQUEUE

/*
 * On some architectures, it's faster to use a MSB ordering for the priorities
 * than the traditional LSB ordering.
 */
#ifdef __HAVE_BIGENDIAN_BITOPS
#define	RQMASK(n) (0x80000000 >> (n))
#else
#define	RQMASK(n) (0x00000001 << (n))
#endif

/*
 * The primitives that manipulate the run queues.  whichqs tells which
 * of the 32 queues qs have processes in them.  Setrunqueue puts processes
 * into queues, remrunqueue removes them from queues.  The running process is
 * on no queue, other processes are on a queue related to p->p_priority,
 * divided by 4 actually to shrink the 0-127 range of priorities into the 32
1151 */
#ifdef RQDEBUG
static void
checkrunqueue(int whichq, struct lwp *l)
{
	const struct prochd * const rq = &sched_qs[whichq];
	struct lwp *l2;
	int found = 0;
	int die = 0;
	int empty = 1;
	for (l2 = rq->ph_link; l2 != (const void*) rq; l2 = l2->l_forw) {
		if (l2->l_stat != LSRUN) {
			printf("checkrunqueue[%d]: lwp %p state (%d) "
			    " != LSRUN\n", whichq, l2, l2->l_stat);
		}
		if (l2->l_back->l_forw != l2) {
			printf("checkrunqueue[%d]: lwp %p back-qptr (%p) "
			    "corrupt %p\n", whichq, l2, l2->l_back,
			    l2->l_back->l_forw);
			die = 1;
		}
		if (l2->l_forw->l_back != l2) {
			printf("checkrunqueue[%d]: lwp %p forw-qptr (%p) "
			    "corrupt %p\n", whichq, l2, l2->l_forw,
			    l2->l_forw->l_back);
			die = 1;
		}
		if (l2 == l)
			found = 1;
		empty = 0;
	}
	if (empty && (sched_whichqs & RQMASK(whichq)) != 0) {
		printf("checkrunqueue[%d]: bit set for empty run-queue %p\n",
		    whichq, rq);
		die = 1;
	} else if (!empty && (sched_whichqs & RQMASK(whichq)) == 0) {
		printf("checkrunqueue[%d]: bit clear for non-empty "
		    "run-queue %p\n", whichq, rq);
		die = 1;
	}
	if (l != NULL && (sched_whichqs & RQMASK(whichq)) == 0) {
		printf("checkrunqueue[%d]: bit clear for active lwp %p\n",
		    whichq, l);
		die = 1;
	}
	if (l != NULL && empty) {
		printf("checkrunqueue[%d]: empty run-queue %p with "
		    "active lwp %p\n", whichq, rq, l);
		die = 1;
	}
	if (l != NULL && !found) {
		printf("checkrunqueue[%d]: lwp %p not in runqueue %p!",
		    whichq, l, rq);
		die = 1;
	}
	if (die)
		panic("checkrunqueue: inconsistency found");
}
#endif /* RQDEBUG */

void
setrunqueue(struct lwp *l)
{
	struct prochd *rq;
	struct lwp *prev;
	const int whichq = l->l_priority / PPQ;

	LOCK_ASSERT(lwp_locked(l, &sched_mutex));

#ifdef RQDEBUG
	checkrunqueue(whichq, NULL);
#endif
#ifdef DIAGNOSTIC
	if (l->l_back != NULL || l->l_stat != LSRUN)
		panic("setrunqueue");
#endif
	sched_whichqs |= RQMASK(whichq);
	rq = &sched_qs[whichq];
	prev = rq->ph_rlink;
	l->l_forw = (struct lwp *)rq;
	rq->ph_rlink = l;
	prev->l_forw = l;
	l->l_back = prev;
#ifdef RQDEBUG
	checkrunqueue(whichq, l);
#endif
}

/*
 * XXXSMP When LWP dispatch (cpu_switch()) is changed to use remrunqueue(),
 * drop of the effective priority level from kernel to user needs to be
 * moved here from userret().  The assignment in userret() is currently
 * done unlocked.
 */
void
remrunqueue(struct lwp *l)
{
	struct lwp *prev, *next;
	const int whichq = l->l_priority / PPQ;

	LOCK_ASSERT(lwp_locked(l, &sched_mutex));

#ifdef RQDEBUG
	checkrunqueue(whichq, l);
#endif

#if defined(DIAGNOSTIC)
	if (((sched_whichqs & RQMASK(whichq)) == 0) || l->l_back == NULL) {
		/* Shouldn't happen - interrupts disabled. */
		panic("remrunqueue: bit %d not set", whichq);
	}
#endif
	prev = l->l_back;
	l->l_back = NULL;
	next = l->l_forw;
	prev->l_forw = next;
	next->l_back = prev;
	if (prev == next)
		sched_whichqs &= ~RQMASK(whichq);
#ifdef RQDEBUG
	checkrunqueue(whichq, NULL);
#endif
}

#undef RQMASK
#endif /* !defined(__HAVE_MD_RUNQUEUE) */