1 /* $NetBSD: kern_synch.c,v 1.186.2.4 2007/04/05 21:38:37 ad Exp $ */
2
3 /*-
4 * Copyright (c) 1999, 2000, 2004, 2006, 2007 The NetBSD Foundation, Inc.
5 * All rights reserved.
6 *
7 * This code is derived from software contributed to The NetBSD Foundation
8 * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
9 * NASA Ames Research Center, by Charles M. Hannum, and by Andrew Doran.
10 *
11 * Redistribution and use in source and binary forms, with or without
12 * modification, are permitted provided that the following conditions
13 * are met:
14 * 1. Redistributions of source code must retain the above copyright
15 * notice, this list of conditions and the following disclaimer.
16 * 2. Redistributions in binary form must reproduce the above copyright
17 * notice, this list of conditions and the following disclaimer in the
18 * documentation and/or other materials provided with the distribution.
19 * 3. All advertising materials mentioning features or use of this software
20 * must display the following acknowledgement:
21 * This product includes software developed by the NetBSD
22 * Foundation, Inc. and its contributors.
23 * 4. Neither the name of The NetBSD Foundation nor the names of its
24 * contributors may be used to endorse or promote products derived
25 * from this software without specific prior written permission.
26 *
27 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
28 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
29 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
30 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
31 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37 * POSSIBILITY OF SUCH DAMAGE.
38 */
39
40 /*-
41 * Copyright (c) 1982, 1986, 1990, 1991, 1993
42 * The Regents of the University of California. All rights reserved.
43 * (c) UNIX System Laboratories, Inc.
44 * All or some portions of this file are derived from material licensed
45 * to the University of California by American Telephone and Telegraph
46 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
47 * the permission of UNIX System Laboratories, Inc.
48 *
49 * Redistribution and use in source and binary forms, with or without
50 * modification, are permitted provided that the following conditions
51 * are met:
52 * 1. Redistributions of source code must retain the above copyright
53 * notice, this list of conditions and the following disclaimer.
54 * 2. Redistributions in binary form must reproduce the above copyright
55 * notice, this list of conditions and the following disclaimer in the
56 * documentation and/or other materials provided with the distribution.
57 * 3. Neither the name of the University nor the names of its contributors
58 * may be used to endorse or promote products derived from this software
59 * without specific prior written permission.
60 *
61 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
62 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
63 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
64 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
65 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
66 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
67 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
68 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
69 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
70 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
71 * SUCH DAMAGE.
72 *
73 * @(#)kern_synch.c 8.9 (Berkeley) 5/19/95
74 */
75
76 #include <sys/cdefs.h>
77 __KERNEL_RCSID(0, "$NetBSD: kern_synch.c,v 1.186.2.4 2007/04/05 21:38:37 ad Exp $");
78
79 #include "opt_ddb.h"
80 #include "opt_kstack.h"
81 #include "opt_lockdebug.h"
82 #include "opt_multiprocessor.h"
83 #include "opt_perfctrs.h"
84
85 #define __MUTEX_PRIVATE
86
87 #include <sys/param.h>
88 #include <sys/systm.h>
89 #include <sys/callout.h>
90 #include <sys/proc.h>
91 #include <sys/kernel.h>
92 #include <sys/buf.h>
93 #if defined(PERFCTRS)
94 #include <sys/pmc.h>
95 #endif
96 #include <sys/signalvar.h>
97 #include <sys/resourcevar.h>
98 #include <sys/sched.h>
99 #include <sys/syscall_stats.h>
100 #include <sys/kauth.h>
101 #include <sys/sleepq.h>
102 #include <sys/lockdebug.h>
103
104 #include <uvm/uvm_extern.h>
105
106 #include <machine/cpu.h>
107
108 int rrticks; /* number of hardclock ticks per roundrobin() */
109
110 /*
111 * The global scheduler state.
112 */
113 kmutex_t sched_mutex; /* global sched state mutex */
114 struct prochd sched_qs[RUNQUE_NQS]; /* run queues */
115 volatile uint32_t sched_whichqs; /* bitmap of non-empty queues */
116
117 void schedcpu(void *);
118 void updatepri(struct lwp *);
119
120 void sched_unsleep(struct lwp *);
121 void sched_changepri(struct lwp *, pri_t);
122 void sched_lendpri(struct lwp *, pri_t);
123
124 struct callout schedcpu_ch = CALLOUT_INITIALIZER_SETFUNC(schedcpu, NULL);
125 static unsigned int schedcpu_ticks;
126
127 syncobj_t sleep_syncobj = {
128 SOBJ_SLEEPQ_SORTED,
129 sleepq_unsleep,
130 sleepq_changepri,
131 sleepq_lendpri,
132 syncobj_noowner,
133 };
134
135 syncobj_t sched_syncobj = {
136 SOBJ_SLEEPQ_SORTED,
137 sched_unsleep,
138 sched_changepri,
139 sched_lendpri,
140 syncobj_noowner,
141 };
142
143 /*
144 * Force switch among equal priority processes every 100ms.
145 * Called from hardclock every hz/10 == rrticks hardclock ticks.
146 */
147 /* ARGSUSED */
148 void
149 roundrobin(struct cpu_info *ci)
150 {
151 struct schedstate_percpu *spc = &ci->ci_schedstate;
152
153 spc->spc_rrticks = rrticks;
154
155 if (curlwp != NULL) {
156 if (spc->spc_flags & SPCF_SEENRR) {
157 /*
158 * The process has already been through a roundrobin
159 * without switching and may be hogging the CPU.
160 * Indicate that the process should yield.
161 */
162 spc->spc_flags |= SPCF_SHOULDYIELD;
163 } else
164 spc->spc_flags |= SPCF_SEENRR;
165 }
166 cpu_need_resched(curcpu());
167 }
168
169 #define PPQ (128 / RUNQUE_NQS) /* priorities per queue */
170 #define NICE_WEIGHT 2 /* priorities per nice level */
171
172 #define ESTCPU_SHIFT 11
173 #define ESTCPU_MAX ((NICE_WEIGHT * PRIO_MAX - PPQ) << ESTCPU_SHIFT)
174 #define ESTCPULIM(e) min((e), ESTCPU_MAX)
175
176 /*
177 * Constants for digital decay and forget:
178 * 90% of (p_estcpu) usage in 5 * loadav time
179 * 95% of (p_pctcpu) usage in 60 seconds (load insensitive)
180 * Note that, as ps(1) mentions, this can let percentages
181 * total over 100% (I've seen 137.9% for 3 processes).
182 *
183 * Note that hardclock updates p_estcpu and p_cpticks independently.
184 *
185 * We wish to decay away 90% of p_estcpu in (5 * loadavg) seconds.
186 * That is, the system wants to compute a value of decay such
187 * that the following for loop:
188 * for (i = 0; i < (5 * loadavg); i++)
189 * p_estcpu *= decay;
190 * will compute
191 * p_estcpu *= 0.1;
192 * for all values of loadavg:
193 *
194 * Mathematically this loop can be expressed by saying:
195 * decay ** (5 * loadavg) ~= .1
196 *
197 * The system computes decay as:
198 * decay = (2 * loadavg) / (2 * loadavg + 1)
199 *
200 * We wish to prove that the system's computation of decay
201 * will always fulfill the equation:
202 * decay ** (5 * loadavg) ~= .1
203 *
204 * If we compute b as:
205 * b = 2 * loadavg
206 * then
207 * decay = b / (b + 1)
208 *
209 * We now need to prove two things:
210 * 1) Given factor ** (5 * loadavg) ~= .1, prove factor == b/(b+1)
211 * 2) Given b/(b+1) ** power ~= .1, prove power == (5 * loadavg)
212 *
213 * Facts:
214 * For x close to zero, exp(x) =~ 1 + x, since
215 * exp(x) = 0! + x**1/1! + x**2/2! + ... .
216 * therefore exp(-1/b) =~ 1 - (1/b) = (b-1)/b.
217 * For x close to zero, ln(1+x) =~ x, since
218 * ln(1+x) = x - x**2/2 + x**3/3 - ... -1 < x < 1
219 * therefore ln(b/(b+1)) = ln(1 - 1/(b+1)) =~ -1/(b+1).
220 * ln(.1) =~ -2.30
221 *
222 * Proof of (1):
223 * Solve (factor)**(power) =~ .1 given power (5*loadav):
224 * solving for factor,
225 * ln(factor) =~ (-2.30/5*loadav), or
226 * factor =~ exp(-1/((5/2.30)*loadav)) =~ exp(-1/(2*loadav)) =
227 * exp(-1/b) =~ (b-1)/b =~ b/(b+1). QED
228 *
229 * Proof of (2):
230 * Solve (factor)**(power) =~ .1 given factor == (b/(b+1)):
231 * solving for power,
232 * power*ln(b/(b+1)) =~ -2.30, or
233 * power =~ 2.3 * (b + 1) = 4.6*loadav + 2.3 =~ 5*loadav. QED
234 *
235 * Actual power values for the implemented algorithm are as follows:
236 * loadav: 1 2 3 4
237 * power: 5.68 10.32 14.94 19.55
238 */
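/*
 * Editor's note -- a worked check of the above (illustrative, not part of
 * the original derivation):  with loadav == 1, b == 2 and decay == 2/3;
 * (2/3)**5 =~ 0.13, close to the target of 0.1 (the exact power needed is
 * the 5.68 shown in the table).  With loadav == 2, decay == 4/5 and
 * (4/5)**10 =~ 0.107, consistent with the table entry of 10.32.
 */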
239
240 /* calculations for digital decay to forget 90% of usage in 5*loadav sec */
241 #define loadfactor(loadav) (2 * (loadav))
242
243 static fixpt_t
244 decay_cpu(fixpt_t loadfac, fixpt_t estcpu)
245 {
246
247 if (estcpu == 0) {
248 return 0;
249 }
250
251 #if !defined(_LP64)
252 /* avoid 64bit arithmetics. */
253 #define FIXPT_MAX ((fixpt_t)((UINTMAX_C(1) << sizeof(fixpt_t) * CHAR_BIT) - 1))
254 if (__predict_true(loadfac <= FIXPT_MAX / ESTCPU_MAX)) {
255 return estcpu * loadfac / (loadfac + FSCALE);
256 }
257 #endif /* !defined(_LP64) */
258
259 return (uint64_t)estcpu * loadfac / (loadfac + FSCALE);
260 }
261
262 /*
263 * For all load averages >= 1 and max p_estcpu of (255 << ESTCPU_SHIFT),
264 * sleeping for at least seven times the loadfactor will decay p_estcpu to
265 * less than (1 << ESTCPU_SHIFT).
266 *
267 * note that our ESTCPU_MAX is actually much smaller than (255 << ESTCPU_SHIFT).
268 */
269 static fixpt_t
270 decay_cpu_batch(fixpt_t loadfac, fixpt_t estcpu, unsigned int n)
271 {
272
273 if ((n << FSHIFT) >= 7 * loadfac) {
274 return 0;
275 }
276
277 while (estcpu != 0 && n > 1) {
278 estcpu = decay_cpu(loadfac, estcpu);
279 n--;
280 }
281
282 return estcpu;
283 }
284
285 /* decay 95% of `p_pctcpu' in 60 seconds; see CCPU_SHIFT before changing */
286 fixpt_t ccpu = 0.95122942450071400909 * FSCALE; /* exp(-1/20) */
287
288 /*
289 * If `ccpu' is not equal to `exp(-1/20)' and you still want to use the
290 * faster/more-accurate formula, you'll have to estimate CCPU_SHIFT below
291 * and possibly adjust FSHIFT in "param.h" so that (FSHIFT >= CCPU_SHIFT).
292 *
293 * To estimate CCPU_SHIFT for exp(-1/20), the following formula was used:
294 * 1 - exp(-1/20) ~= 0.0487 ~= 0.0488 == 1 (fixed pt, *11* bits).
295 *
296  * If you don't want to bother with the faster/more-accurate formula, you
297 * can set CCPU_SHIFT to (FSHIFT + 1) which will use a slower/less-accurate
298  * (more general) method of calculating the percentage of CPU used by a process.
299 */
300 #define CCPU_SHIFT 11
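/*
 * Editor's note (illustrative, not part of the original comment):  ccpu is
 * exp(-1/20) =~ 0.95123 in fixed point, and schedcpu() multiplies p_pctcpu
 * by it once per second, so after 60 seconds an otherwise idle process
 * retains exp(-60/20) == exp(-3) =~ 5% of its old p_pctcpu -- the "decay
 * 95% in 60 seconds" promised above.  The shift trick works because
 * 1 - exp(-1/20) =~ 0.0488 =~ 100 / 2^11, which is where CCPU_SHIFT == 11
 * comes from in the faster formula used by schedcpu().
 */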
301
302 /*
303 * schedcpu:
304 *
305 * Recompute process priorities, every hz ticks.
306 *
307 * XXXSMP This needs to be reorganised in order to reduce the locking
308 * burden.
309 */
310 /* ARGSUSED */
311 void
312 schedcpu(void *arg)
313 {
314 fixpt_t loadfac = loadfactor(averunnable.ldavg[0]);
315 struct rlimit *rlim;
316 struct lwp *l;
317 struct proc *p;
318 int minslp, clkhz, sig;
319 long runtm;
320
321 schedcpu_ticks++;
322
323 mutex_enter(&proclist_mutex);
324 PROCLIST_FOREACH(p, &allproc) {
325 /*
326 * Increment time in/out of memory and sleep time (if
327 * sleeping). We ignore overflow; with 16-bit int's
328 * (remember them?) overflow takes 45 days.
329 */
330 minslp = 2;
331 mutex_enter(&p->p_smutex);
332 runtm = p->p_rtime.tv_sec;
333 LIST_FOREACH(l, &p->p_lwps, l_sibling) {
334 lwp_lock(l);
335 runtm += l->l_rtime.tv_sec;
336 l->l_swtime++;
337 if (l->l_stat == LSSLEEP || l->l_stat == LSSTOP ||
338 l->l_stat == LSSUSPENDED) {
339 l->l_slptime++;
340 minslp = min(minslp, l->l_slptime);
341 } else
342 minslp = 0;
343 lwp_unlock(l);
344 }
345 p->p_pctcpu = (p->p_pctcpu * ccpu) >> FSHIFT;
346
347 /*
348 * Check if the process exceeds its CPU resource allocation.
349 * If over max, kill it.
350 */
351 rlim = &p->p_rlimit[RLIMIT_CPU];
352 sig = 0;
353 if (runtm >= rlim->rlim_cur) {
354 if (runtm >= rlim->rlim_max)
355 sig = SIGKILL;
356 else {
357 sig = SIGXCPU;
358 if (rlim->rlim_cur < rlim->rlim_max)
359 rlim->rlim_cur += 5;
360 }
361 }
362
363 /*
364 * If the process has run for more than autonicetime, reduce
365 * priority to give others a chance.
366 */
367 if (autonicetime && runtm > autonicetime && p->p_nice == NZERO
368 && kauth_cred_geteuid(p->p_cred)) {
369 mutex_spin_enter(&p->p_stmutex);
370 p->p_nice = autoniceval + NZERO;
371 resetprocpriority(p);
372 mutex_spin_exit(&p->p_stmutex);
373 }
374
375 /*
376 * If the process has slept the entire second,
377 * stop recalculating its priority until it wakes up.
378 */
379 if (minslp <= 1) {
380 /*
381 * p_pctcpu is only for ps.
382 */
383 mutex_spin_enter(&p->p_stmutex);
384 clkhz = stathz != 0 ? stathz : hz;
385 #if (FSHIFT >= CCPU_SHIFT)
386 p->p_pctcpu += (clkhz == 100)?
387 ((fixpt_t) p->p_cpticks) << (FSHIFT - CCPU_SHIFT):
388 100 * (((fixpt_t) p->p_cpticks)
389 << (FSHIFT - CCPU_SHIFT)) / clkhz;
390 #else
391 p->p_pctcpu += ((FSCALE - ccpu) *
392 (p->p_cpticks * FSCALE / clkhz)) >> FSHIFT;
393 #endif
394 p->p_cpticks = 0;
395 p->p_estcpu = decay_cpu(loadfac, p->p_estcpu);
396
397 LIST_FOREACH(l, &p->p_lwps, l_sibling) {
398 lwp_lock(l);
399 if (l->l_slptime <= 1 &&
400 l->l_priority >= PUSER)
401 resetpriority(l);
402 lwp_unlock(l);
403 }
404 mutex_spin_exit(&p->p_stmutex);
405 }
406
407 mutex_exit(&p->p_smutex);
408 if (sig) {
409 psignal(p, sig);
410 }
411 }
412 mutex_exit(&proclist_mutex);
413 uvm_meter();
414 cv_broadcast(&lbolt);
415 callout_schedule(&schedcpu_ch, hz);
416 }
417
418 /*
419 * Recalculate the priority of a process after it has slept for a while.
420 */
421 void
422 updatepri(struct lwp *l)
423 {
424 struct proc *p = l->l_proc;
425 fixpt_t loadfac;
426
427 KASSERT(lwp_locked(l, NULL));
428 KASSERT(l->l_slptime > 1);
429
430 loadfac = loadfactor(averunnable.ldavg[0]);
431
432 l->l_slptime--; /* the first time was done in schedcpu */
433 /* XXX NJWLWP */
434 /* XXXSMP occasionally unlocked, should be per-LWP */
435 p->p_estcpu = decay_cpu_batch(loadfac, p->p_estcpu, l->l_slptime);
436 resetpriority(l);
437 }
438
439 /*
440 * During autoconfiguration or after a panic, a sleep will simply lower the
441 * priority briefly to allow interrupts, then return. The priority to be
442 * used (safepri) is machine-dependent, thus this value is initialized and
443 * maintained in the machine-dependent layers. This priority will typically
444 * be 0, or the lowest priority that is safe for use on the interrupt stack;
445 * it can be made higher to block network software interrupts after panics.
446 */
447 int safepri;
448
449 /*
450 * OBSOLETE INTERFACE
451 *
452 * General sleep call. Suspends the current process until a wakeup is
453 * performed on the specified identifier. The process will then be made
454 * runnable with the specified priority. Sleeps at most timo/hz seconds (0
455  * means no timeout). If pri includes the PCATCH flag, signals are checked
456 * before and after sleeping, else signals are not checked. Returns 0 if
457 * awakened, EWOULDBLOCK if the timeout expires. If PCATCH is set and a
458 * signal needs to be delivered, ERESTART is returned if the current system
459 * call should be restarted if possible, and EINTR is returned if the system
460  * call should be interrupted by the signal.
461 *
462 * The interlock is held until we are on a sleep queue. The interlock will
463  * be locked before returning to the caller unless the PNORELOCK flag
464 * is specified, in which case the interlock will always be unlocked upon
465 * return.
466 */
467 int
468 ltsleep(wchan_t ident, pri_t priority, const char *wmesg, int timo,
469 volatile struct simplelock *interlock)
470 {
471 struct lwp *l = curlwp;
472 sleepq_t *sq;
473 int error, catch;
474
475 if (sleepq_dontsleep(l)) {
476 (void)sleepq_abort(NULL, 0);
477 if ((priority & PNORELOCK) != 0)
478 simple_unlock(interlock);
479 return 0;
480 }
481
482 sq = sleeptab_lookup(&sleeptab, ident);
483 sleepq_enter(sq, l);
484
485 if (interlock != NULL)
486 simple_unlock(interlock);
487
488 catch = priority & PCATCH;
489 sleepq_block(sq, priority & PRIMASK, ident, wmesg, timo, catch,
490 &sleep_syncobj);
491 error = sleepq_unblock(timo, catch);
492
493 if (interlock != NULL && (priority & PNORELOCK) == 0)
494 simple_lock(interlock);
495
496 return error;
497 }
498
499 int
500 mtsleep(wchan_t ident, pri_t priority, const char *wmesg, int timo,
501 kmutex_t *mtx)
502 {
503 struct lwp *l = curlwp;
504 sleepq_t *sq;
505 int error, catch;
506
507 if (sleepq_dontsleep(l)) {
508 (void)sleepq_abort(mtx, (priority & PNORELOCK) != 0);
509 return 0;
510 }
511
512 sq = sleeptab_lookup(&sleeptab, ident);
513 sleepq_enter(sq, l);
514 mutex_exit(mtx);
515
516 catch = priority & PCATCH;
517 sleepq_block(sq, priority & PRIMASK, ident, wmesg, timo, catch,
518 &sleep_syncobj);
519 error = sleepq_unblock(timo, catch);
520
521 if ((priority & PNORELOCK) == 0)
522 mutex_enter(mtx);
523
524 return error;
525 }
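/*
 * Editor's note -- an illustrative, hypothetical condition-wait built on
 * mtsleep()/wakeup(); not part of this file.  "example_lock" and
 * "example_flag" are invented names, and any kernel address serves as the
 * wait channel.
 */
#if 0
	/* Waiter: sleep until example_flag becomes non-zero. */
	mutex_enter(&example_lock);
	while (example_flag == 0) {
		/* example_lock is dropped while asleep, re-taken on return. */
		(void)mtsleep(&example_flag, PUSER, "exflag", 0,
		    &example_lock);
	}
	mutex_exit(&example_lock);

	/* Waker: set the flag and wake everyone sleeping on the channel. */
	mutex_enter(&example_lock);
	example_flag = 1;
	wakeup(&example_flag);
	mutex_exit(&example_lock);
#endif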
526
527 /*
528 * General sleep call for situations where a wake-up is not expected.
529 */
530 int
531 kpause(const char *wmesg, bool intr, int timo, kmutex_t *mtx)
532 {
533 struct lwp *l = curlwp;
534 sleepq_t *sq;
535 int error;
536
537 if (sleepq_dontsleep(l))
538 return sleepq_abort(NULL, 0);
539
540 if (mtx != NULL)
541 mutex_exit(mtx);
542 sq = sleeptab_lookup(&sleeptab, l);
543 sleepq_enter(sq, l);
544 sleepq_block(sq, sched_kpri(l), l, wmesg, timo, intr, &sleep_syncobj);
545 error = sleepq_unblock(timo, intr);
546 if (mtx != NULL)
547 mutex_enter(mtx);
548
549 return error;
550 }
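/*
 * Editor's note (illustrative, not part of this file): a caller that just
 * wants to pause for roughly a tenth of a second, with no interlock and
 * without being interruptible by signals, could use
 *
 *	(void)kpause("pause", false, hz / 10, NULL);
 *
 * assuming hz >= 10 so that the tick count is non-zero.
 */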
551
552 /*
553 * OBSOLETE INTERFACE
554 *
555 * Make all processes sleeping on the specified identifier runnable.
556 */
557 void
558 wakeup(wchan_t ident)
559 {
560 sleepq_t *sq;
561
562 if (cold)
563 return;
564
565 sq = sleeptab_lookup(&sleeptab, ident);
566 sleepq_wake(sq, ident, (u_int)-1);
567 }
568
569 /*
570 * OBSOLETE INTERFACE
571 *
572 * Make the highest priority process first in line on the specified
573 * identifier runnable.
574 */
575 void
576 wakeup_one(wchan_t ident)
577 {
578 sleepq_t *sq;
579
580 if (cold)
581 return;
582
583 sq = sleeptab_lookup(&sleeptab, ident);
584 sleepq_wake(sq, ident, 1);
585 }
586
587
588 /*
589 * General yield call. Puts the current process back on its run queue and
590 * performs a voluntary context switch. Should only be called when the
591  * current process explicitly requests it (e.g. sched_yield(2) in compat code).
592 */
593 void
594 yield(void)
595 {
596 struct lwp *l = curlwp;
597
598 KERNEL_UNLOCK_ALL(l, &l->l_biglocks);
599 lwp_lock(l);
600 if (l->l_stat == LSONPROC) {
601 KASSERT(lwp_locked(l, &sched_mutex));
602 l->l_priority = l->l_usrpri;
603 }
604 l->l_ncsw++;
605 mi_switch(l, NULL);
606 KERNEL_LOCK(l->l_biglocks, l);
607 }
608
609 /*
610 * General preemption call. Puts the current process back on its run queue
611 * and performs an involuntary context switch.
612 */
613 void
614 preempt(void)
615 {
616 struct lwp *l = curlwp;
617
618 KERNEL_UNLOCK_ALL(l, &l->l_biglocks);
619 lwp_lock(l);
620 if (l->l_stat == LSONPROC) {
621 KASSERT(lwp_locked(l, &sched_mutex));
622 l->l_priority = l->l_usrpri;
623 }
624 l->l_nivcsw++;
625 (void)mi_switch(l, NULL);
626 KERNEL_LOCK(l->l_biglocks, l);
627 }
628
629 /*
630 * The machine independent parts of context switch. Switch to "new"
631 * if non-NULL, otherwise let cpu_switch choose the next lwp.
632 *
633 * Returns 1 if another process was actually run.
634 */
635 int
636 mi_switch(struct lwp *l, struct lwp *newl)
637 {
638 struct schedstate_percpu *spc;
639 struct timeval tv;
640 int retval, oldspl;
641 long s, u;
642
643 KASSERT(lwp_locked(l, NULL));
644
645 #ifdef KSTACK_CHECK_MAGIC
646 kstack_check_magic(l);
647 #endif
648
649 /*
650 * It's safe to read the per CPU schedstate unlocked here, as all we
651  * are after is the run time and that's guaranteed to have been last
652 * updated by this CPU.
653 */
654 KDASSERT(l->l_cpu == curcpu());
655 spc = &l->l_cpu->ci_schedstate;
656
657 /*
658 * Compute the amount of time during which the current
659 * process was running.
660 */
661 microtime(&tv);
662 u = l->l_rtime.tv_usec +
663 (tv.tv_usec - spc->spc_runtime.tv_usec);
664 s = l->l_rtime.tv_sec + (tv.tv_sec - spc->spc_runtime.tv_sec);
665 if (u < 0) {
666 u += 1000000;
667 s--;
668 } else if (u >= 1000000) {
669 u -= 1000000;
670 s++;
671 }
672 l->l_rtime.tv_usec = u;
673 l->l_rtime.tv_sec = s;
674
675 /* Count time spent in current system call */
676 SYSCALL_TIME_SLEEP(l);
677
678 /*
679 * XXXSMP If we are using h/w performance counters, save context.
680 */
681 #if PERFCTRS
682 if (PMC_ENABLED(l->l_proc)) {
683 pmc_save_context(l->l_proc);
684 }
685 #endif
686
687 /*
688 * Acquire the sched_mutex if necessary. It will be released by
689 * cpu_switch once it has decided to idle, or picked another LWP
690 * to run.
691 */
692 #if defined(MULTIPROCESSOR) || defined(LOCKDEBUG)
693 if (l->l_mutex != &sched_mutex) {
694 mutex_spin_enter(&sched_mutex);
695 lwp_unlock(l);
696 }
697 #endif
698
699 /*
700  * If we are still on the CPU and have gotten this far, then we must yield.
701 */
702 KASSERT(l->l_stat != LSRUN);
703 if (l->l_stat == LSONPROC) {
704 KASSERT(lwp_locked(l, &sched_mutex));
705 l->l_stat = LSRUN;
706 setrunqueue(l);
707 }
708 uvmexp.swtch++;
709 l->l_ncsw++;
710
711 /*
712 * Process is about to yield the CPU; clear the appropriate
713 * scheduling flags.
714 */
715 spc->spc_flags &= ~SPCF_SWITCHCLEAR;
716
717 LOCKDEBUG_BARRIER(&sched_mutex, 1);
718
719 /*
720 * Switch to the new current LWP. When we run again, we'll
721 * return back here.
722 */
723 oldspl = MUTEX_SPIN_OLDSPL(l->l_cpu);
724
725 if (newl == NULL || newl->l_back == NULL)
726 retval = cpu_switch(l, NULL);
727 else {
728 KASSERT(lwp_locked(newl, &sched_mutex));
729 remrunqueue(newl);
730 cpu_switchto(l, newl);
731 retval = 0;
732 }
733
734 /*
735 * XXXSMP If we are using h/w performance counters, restore context.
736 */
737 #if PERFCTRS
738 if (PMC_ENABLED(l->l_proc)) {
739 pmc_restore_context(l->l_proc);
740 }
741 #endif
742
743 /*
744 * We're running again; record our new start time. We might
745 * be running on a new CPU now, so don't use the cached
746 * schedstate_percpu pointer.
747 */
748 SYSCALL_TIME_WAKEUP(l);
749 KDASSERT(l->l_cpu == curcpu());
750 microtime(&l->l_cpu->ci_schedstate.spc_runtime);
751 splx(oldspl);
752
753 return retval;
754 }
755
756 /*
757 * Initialize the (doubly-linked) run queues
758 * to be empty.
759 */
760 void
761 rqinit()
762 {
763 int i;
764
765 for (i = 0; i < RUNQUE_NQS; i++)
766 sched_qs[i].ph_link = sched_qs[i].ph_rlink =
767 (struct lwp *)&sched_qs[i];
768
769 mutex_init(&sched_mutex, MUTEX_SPIN, IPL_SCHED);
770 }
771
772 static inline void
773 resched_lwp(struct lwp *l)
774 {
775 struct cpu_info *ci;
776 const pri_t pri = lwp_eprio(l);
777
778 /*
779 * XXXSMP
780 * Since l->l_cpu persists across a context switch,
781 * this gives us *very weak* processor affinity, in
782 * that we notify the CPU on which the process last
783 * ran that it should try to switch.
784 *
785 * This does not guarantee that the process will run on
786 * that processor next, because another processor might
787 * grab it the next time it performs a context switch.
788 *
789 * This also does not handle the case where its last
790 * CPU is running a higher-priority process, but every
791 * other CPU is running a lower-priority process. There
792 * are ways to handle this situation, but they're not
793 * currently very pretty, and we also need to weigh the
794 * cost of moving a process from one CPU to another.
795 *
796 * XXXSMP
797 * There is also the issue of locking the other CPU's
798 * sched state, which we currently do not do.
799 */
800 ci = (l->l_cpu != NULL) ? l->l_cpu : curcpu();
801 if (pri < ci->ci_schedstate.spc_curpriority)
802 cpu_need_resched(ci);
803 }
804
805 /*
806 * Change process state to be runnable, placing it on the run queue if it is
807 * in memory, and awakening the swapper if it isn't in memory.
808 *
809 * Call with the process and LWP locked. Will return with the LWP unlocked.
810 */
811 void
812 setrunnable(struct lwp *l)
813 {
814 struct proc *p = l->l_proc;
815 sigset_t *ss;
816
817 KASSERT(mutex_owned(&p->p_smutex));
818 KASSERT(lwp_locked(l, NULL));
819
820 switch (l->l_stat) {
821 case LSSTOP:
822 /*
823 * If we're being traced (possibly because someone attached us
824 * while we were stopped), check for a signal from the debugger.
825 */
826 if ((p->p_slflag & PSL_TRACED) != 0 && p->p_xstat != 0) {
827 if ((sigprop[p->p_xstat] & SA_TOLWP) != 0)
828 ss = &l->l_sigpend.sp_set;
829 else
830 ss = &p->p_sigpend.sp_set;
831 sigaddset(ss, p->p_xstat);
832 signotify(l);
833 }
834 p->p_nrlwps++;
835 break;
836 case LSSUSPENDED:
837 l->l_flag &= ~LW_WSUSPEND;
838 p->p_nrlwps++;
839 break;
840 case LSSLEEP:
841 KASSERT(l->l_wchan != NULL);
842 break;
843 default:
844 panic("setrunnable: lwp %p state was %d", l, l->l_stat);
845 }
846
847 /*
848  * If the LWP was sleeping interruptibly, then it's OK to start it
849 * again. If not, mark it as still sleeping.
850 */
851 if (l->l_wchan != NULL) {
852 l->l_stat = LSSLEEP;
853 /* lwp_unsleep() will release the lock. */
854 lwp_unsleep(l);
855 return;
856 }
857
858 KASSERT(lwp_locked(l, &sched_mutex));
859
860 /*
861 * If the LWP is still on the CPU, mark it as LSONPROC. It may be
862 * about to call mi_switch(), in which case it will yield.
863 *
864 * XXXSMP Will need to change for preemption.
865 */
866 #ifdef MULTIPROCESSOR
867 if (l->l_cpu->ci_curlwp == l) {
868 #else
869 if (l == curlwp) {
870 #endif
871 l->l_stat = LSONPROC;
872 l->l_slptime = 0;
873 lwp_unlock(l);
874 return;
875 }
876
877 /*
878 * Set the LWP runnable. If it's swapped out, we need to wake the swapper
879 * to bring it back in. Otherwise, enter it into a run queue.
880 */
881 if (l->l_slptime > 1)
882 updatepri(l);
883 l->l_stat = LSRUN;
884 l->l_slptime = 0;
885
886 if (l->l_flag & LW_INMEM) {
887 setrunqueue(l);
888 resched_lwp(l);
889 lwp_unlock(l);
890 } else {
891 lwp_unlock(l);
892 uvm_kick_scheduler();
893 }
894 }
895
896 /*
897 * Compute the priority of a process when running in user mode.
898 * Arrange to reschedule if the resulting priority is better
899 * than that of the current process.
900 */
901 void
902 resetpriority(struct lwp *l)
903 {
904 pri_t newpriority;
905 struct proc *p = l->l_proc;
906
907 /* XXXSMP KASSERT(mutex_owned(&p->p_stmutex)); */
908 KASSERT(lwp_locked(l, NULL));
909
910 if ((l->l_flag & LW_SYSTEM) != 0)
911 return;
912
913 newpriority = PUSER + (p->p_estcpu >> ESTCPU_SHIFT) +
914 NICE_WEIGHT * (p->p_nice - NZERO);
915 newpriority = min(newpriority, MAXPRI);
916 lwp_changepri(l, newpriority);
917 }
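/*
 * Editor's note -- worked numbers for the formula above (illustrative;
 * assumes the usual PUSER == 50 and NZERO == 20, which are defined
 * elsewhere):  a process with p_estcpu == (8 << ESTCPU_SHIFT) and
 * p_nice == NZERO + 5 gets
 *
 *	newpriority = 50 + 8 + NICE_WEIGHT * 5 = 68
 *
 * well under MAXPRI, so no clipping; larger priority values are worse.
 */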
918
919 /*
920 * Recompute priority for all LWPs in a process.
921 */
922 void
923 resetprocpriority(struct proc *p)
924 {
925 struct lwp *l;
926
927 KASSERT(mutex_owned(&p->p_stmutex));
928
929 LIST_FOREACH(l, &p->p_lwps, l_sibling) {
930 lwp_lock(l);
931 resetpriority(l);
932 lwp_unlock(l);
933 }
934 }
935
936 /*
937 * We adjust the priority of the current process. The priority of a process
938 * gets worse as it accumulates CPU time. The CPU usage estimator (p_estcpu)
939 * is increased here. The formula for computing priorities (in kern_synch.c)
940 * will compute a different value each time p_estcpu increases. This can
941 * cause a switch, but unless the priority crosses a PPQ boundary the actual
942 * queue will not change. The CPU usage estimator ramps up quite quickly
943 * when the process is running (linearly), and decays away exponentially, at
944 * a rate which is proportionally slower when the system is busy. The basic
945 * principle is that the system will 90% forget that the process used a lot
946 * of CPU time in 5 * loadav seconds. This causes the system to favor
947 * processes which haven't run much recently, and to round-robin among other
948 * processes.
949 */
950
951 void
952 schedclock(struct lwp *l)
953 {
954 struct proc *p = l->l_proc;
955
956 mutex_spin_enter(&p->p_stmutex);
957 p->p_estcpu = ESTCPULIM(p->p_estcpu + (1 << ESTCPU_SHIFT));
958 lwp_lock(l);
959 resetpriority(l);
960 mutex_spin_exit(&p->p_stmutex);
961 if ((l->l_flag & LW_SYSTEM) == 0 && l->l_priority >= PUSER)
962 l->l_priority = l->l_usrpri;
963 lwp_unlock(l);
964 }
965
966 /*
967 * suspendsched:
968 *
969 * Convert all non-L_SYSTEM LSSLEEP or LSRUN LWPs to LSSUSPENDED.
970 */
971 void
972 suspendsched(void)
973 {
974 #ifdef MULTIPROCESSOR
975 CPU_INFO_ITERATOR cii;
976 struct cpu_info *ci;
977 #endif
978 struct lwp *l;
979 struct proc *p;
980
981 /*
982 * We do this by process in order not to violate the locking rules.
983 */
984 mutex_enter(&proclist_mutex);
985 PROCLIST_FOREACH(p, &allproc) {
986 mutex_enter(&p->p_smutex);
987
988 if ((p->p_flag & PK_SYSTEM) != 0) {
989 mutex_exit(&p->p_smutex);
990 continue;
991 }
992
993 p->p_stat = SSTOP;
994
995 LIST_FOREACH(l, &p->p_lwps, l_sibling) {
996 if (l == curlwp)
997 continue;
998
999 lwp_lock(l);
1000
1001 /*
1002 			 * Set LW_WREBOOT so that the LWP will suspend itself
1003 			 * when it tries to return to user mode. We want to
1004 			 * try to get as many LWPs as possible to
1005 			 * the user / kernel boundary, so that they will
1006 			 * release any locks that they hold.
1007 */
1008 l->l_flag |= (LW_WREBOOT | LW_WSUSPEND);
1009
1010 if (l->l_stat == LSSLEEP &&
1011 (l->l_flag & LW_SINTR) != 0) {
1012 /* setrunnable() will release the lock. */
1013 setrunnable(l);
1014 continue;
1015 }
1016
1017 lwp_unlock(l);
1018 }
1019
1020 mutex_exit(&p->p_smutex);
1021 }
1022 mutex_exit(&proclist_mutex);
1023
1024 /*
1025 * Kick all CPUs to make them preempt any LWPs running in user mode.
1026 * They'll trap into the kernel and suspend themselves in userret().
1027 */
1028 sched_lock(0);
1029 #ifdef MULTIPROCESSOR
1030 for (CPU_INFO_FOREACH(cii, ci))
1031 cpu_need_resched(ci);
1032 #else
1033 cpu_need_resched(curcpu());
1034 #endif
1035 sched_unlock(0);
1036 }
1037
1038 /*
1039 * scheduler_fork_hook:
1040 *
1041 * Inherit the parent's scheduler history.
1042 */
1043 void
1044 scheduler_fork_hook(struct proc *parent, struct proc *child)
1045 {
1046
1047 KASSERT(mutex_owned(&parent->p_smutex));
1048
1049 child->p_estcpu = child->p_estcpu_inherited = parent->p_estcpu;
1050 child->p_forktime = schedcpu_ticks;
1051 }
1052
1053 /*
1054 * scheduler_wait_hook:
1055 *
1056 * Chargeback parents for the sins of their children.
1057 */
1058 void
1059 scheduler_wait_hook(struct proc *parent, struct proc *child)
1060 {
1061 fixpt_t loadfac = loadfactor(averunnable.ldavg[0]);
1062 fixpt_t estcpu;
1063
1064 /* XXX Only if parent != init?? */
1065
1066 mutex_spin_enter(&parent->p_stmutex);
1067 estcpu = decay_cpu_batch(loadfac, child->p_estcpu_inherited,
1068 schedcpu_ticks - child->p_forktime);
1069 if (child->p_estcpu > estcpu)
1070 parent->p_estcpu =
1071 ESTCPULIM(parent->p_estcpu + child->p_estcpu - estcpu);
1072 mutex_spin_exit(&parent->p_stmutex);
1073 }
1074
1075 /*
1076 * sched_kpri:
1077 *
1078 * Scale a priority level to a kernel priority level, usually
1079 * for an LWP that is about to sleep.
1080 */
1081 pri_t
1082 sched_kpri(struct lwp *l)
1083 {
1084 /*
1085 * Scale user priorities (127 -> 50) up to kernel priorities
1086 * in the range (49 -> 8). Reserve the top 8 kernel priorities
1087 * for high priority kthreads. Kernel priorities passed in
1088 * are left "as is". XXX This is somewhat arbitrary.
1089 */
1090 static const uint8_t kpri_tab[] = {
1091 0, 1, 2, 3, 4, 5, 6, 7,
1092 8, 9, 10, 11, 12, 13, 14, 15,
1093 16, 17, 18, 19, 20, 21, 22, 23,
1094 24, 25, 26, 27, 28, 29, 30, 31,
1095 32, 33, 34, 35, 36, 37, 38, 39,
1096 40, 41, 42, 43, 44, 45, 46, 47,
1097 48, 49, 8, 8, 9, 9, 10, 10,
1098 11, 11, 12, 12, 13, 14, 14, 15,
1099 15, 16, 16, 17, 17, 18, 18, 19,
1100 20, 20, 21, 21, 22, 22, 23, 23,
1101 24, 24, 25, 26, 26, 27, 27, 28,
1102 28, 29, 29, 30, 30, 31, 32, 32,
1103 33, 33, 34, 34, 35, 35, 36, 36,
1104 37, 38, 38, 39, 39, 40, 40, 41,
1105 41, 42, 42, 43, 44, 44, 45, 45,
1106 46, 46, 47, 47, 48, 48, 49, 49,
1107 };
1108
1109 return (pri_t)kpri_tab[l->l_usrpri];
1110 }
1111
1112 /*
1113 * sched_unsleep:
1114 *
1115  * This is called when the LWP has not been awoken normally but instead
1116 * interrupted: for example, if the sleep timed out. Because of this,
1117 * it's not a valid action for running or idle LWPs.
1118 */
1119 void
1120 sched_unsleep(struct lwp *l)
1121 {
1122
1123 lwp_unlock(l);
1124 panic("sched_unsleep");
1125 }
1126
1127 /*
1128 * sched_changepri:
1129 *
1130 * Adjust the priority of an LWP.
1131 */
1132 void
1133 sched_changepri(struct lwp *l, pri_t pri)
1134 {
1135
1136 KASSERT(lwp_locked(l, &sched_mutex));
1137
1138 l->l_usrpri = pri;
1139 if (l->l_priority < PUSER)
1140 return;
1141
1142 if (l->l_stat != LSRUN || (l->l_flag & LW_INMEM) == 0) {
1143 l->l_priority = pri;
1144 return;
1145 }
1146
1147 remrunqueue(l);
1148 l->l_priority = pri;
1149 setrunqueue(l);
1150 resched_lwp(l);
1151 }
1152
1153 void
1154 sched_lendpri(struct lwp *l, pri_t pri)
1155 {
1156
1157 KASSERT(lwp_locked(l, &sched_mutex));
1158
1159 if (l->l_stat != LSRUN || (l->l_flag & LW_INMEM) == 0) {
1160 l->l_inheritedprio = pri;
1161 return;
1162 }
1163
1164 remrunqueue(l);
1165 l->l_inheritedprio = pri;
1166 setrunqueue(l);
1167 resched_lwp(l);
1168 }
1169
1170 struct lwp *
1171 syncobj_noowner(wchan_t wchan)
1172 {
1173
1174 return NULL;
1175 }
1176
1177 /*
1178 * Low-level routines to access the run queue. Optimised assembler
1179 * routines can override these.
1180 */
1181
1182 #ifndef __HAVE_MD_RUNQUEUE
1183
1184 /*
1185  * On some architectures, it's faster to use an MSB ordering for the priorities
1186 * than the traditional LSB ordering.
1187 */
1188 #ifdef __HAVE_BIGENDIAN_BITOPS
1189 #define RQMASK(n) (0x80000000 >> (n))
1190 #else
1191 #define RQMASK(n) (0x00000001 << (n))
1192 #endif
1193
1194 /*
1195  * The primitives that manipulate the run queues. whichqs tells which
1196  * of the 32 queues qs have processes in them. setrunqueue() puts processes
1197  * into queues, remrunqueue() removes them from queues. The running process
1198  * is on no queue; other processes are on a queue related to p->p_priority,
1199  * divided by 4 to shrink the 0-127 range of priorities into the 32
1200  * available queues.
1201 */
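/*
 * Editor's note (illustrative, not part of the original comment): with the
 * 32 queues assumed above, PPQ == 128 / 32 == 4, so an LWP with an
 * effective priority of 68 hashes to queue 68 / 4 == 17, and RQMASK(17)
 * is the bit that setrunqueue()/remrunqueue() maintain for that queue in
 * sched_whichqs.
 */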
1202 #ifdef RQDEBUG
1203 static void
1204 checkrunqueue(int whichq, struct lwp *l)
1205 {
1206 const struct prochd * const rq = &sched_qs[whichq];
1207 struct lwp *l2;
1208 int found = 0;
1209 int die = 0;
1210 int empty = 1;
1211 for (l2 = rq->ph_link; l2 != (const void*) rq; l2 = l2->l_forw) {
1212 if (l2->l_stat != LSRUN) {
1213 printf("checkrunqueue[%d]: lwp %p state (%d) "
1214 " != LSRUN\n", whichq, l2, l2->l_stat);
1215 }
1216 if (l2->l_back->l_forw != l2) {
1217 printf("checkrunqueue[%d]: lwp %p back-qptr (%p) "
1218 "corrupt %p\n", whichq, l2, l2->l_back,
1219 l2->l_back->l_forw);
1220 die = 1;
1221 }
1222 if (l2->l_forw->l_back != l2) {
1223 printf("checkrunqueue[%d]: lwp %p forw-qptr (%p) "
1224 "corrupt %p\n", whichq, l2, l2->l_forw,
1225 l2->l_forw->l_back);
1226 die = 1;
1227 }
1228 if (l2 == l)
1229 found = 1;
1230 empty = 0;
1231 }
1232 if (empty && (sched_whichqs & RQMASK(whichq)) != 0) {
1233 printf("checkrunqueue[%d]: bit set for empty run-queue %p\n",
1234 whichq, rq);
1235 die = 1;
1236 } else if (!empty && (sched_whichqs & RQMASK(whichq)) == 0) {
1237 printf("checkrunqueue[%d]: bit clear for non-empty "
1238 "run-queue %p\n", whichq, rq);
1239 die = 1;
1240 }
1241 if (l != NULL && (sched_whichqs & RQMASK(whichq)) == 0) {
1242 printf("checkrunqueue[%d]: bit clear for active lwp %p\n",
1243 whichq, l);
1244 die = 1;
1245 }
1246 if (l != NULL && empty) {
1247 printf("checkrunqueue[%d]: empty run-queue %p with "
1248 "active lwp %p\n", whichq, rq, l);
1249 die = 1;
1250 }
1251 if (l != NULL && !found) {
1252 		printf("checkrunqueue[%d]: lwp %p not in runqueue %p!\n",
1253 whichq, l, rq);
1254 die = 1;
1255 }
1256 if (die)
1257 panic("checkrunqueue: inconsistency found");
1258 }
1259 #endif /* RQDEBUG */
1260
1261 void
1262 setrunqueue(struct lwp *l)
1263 {
1264 struct prochd *rq;
1265 struct lwp *prev;
1266 const int whichq = lwp_eprio(l) / PPQ;
1267
1268 KASSERT(lwp_locked(l, &sched_mutex));
1269
1270 #ifdef RQDEBUG
1271 checkrunqueue(whichq, NULL);
1272 #endif
1273 #ifdef DIAGNOSTIC
1274 if (l->l_back != NULL || l->l_stat != LSRUN)
1275 panic("setrunqueue");
1276 #endif
1277 sched_whichqs |= RQMASK(whichq);
1278 rq = &sched_qs[whichq];
1279 prev = rq->ph_rlink;
1280 l->l_forw = (struct lwp *)rq;
1281 rq->ph_rlink = l;
1282 prev->l_forw = l;
1283 l->l_back = prev;
1284 #ifdef RQDEBUG
1285 checkrunqueue(whichq, l);
1286 #endif
1287 }
1288
1289 /*
1290 * XXXSMP When LWP dispatch (cpu_switch()) is changed to use remrunqueue(),
1291 * drop of the effective priority level from kernel to user needs to be
1292 * moved here from userret(). The assignment in userret() is currently
1293 * done unlocked.
1294 */
1295 void
1296 remrunqueue(struct lwp *l)
1297 {
1298 struct lwp *prev, *next;
1299 const int whichq = lwp_eprio(l) / PPQ;
1300
1301 KASSERT(lwp_locked(l, &sched_mutex));
1302
1303 #ifdef RQDEBUG
1304 checkrunqueue(whichq, l);
1305 #endif
1306
1307 #if defined(DIAGNOSTIC)
1308 if (((sched_whichqs & RQMASK(whichq)) == 0) || l->l_back == NULL) {
1309 /* Shouldn't happen - interrupts disabled. */
1310 panic("remrunqueue: bit %d not set", whichq);
1311 }
1312 #endif
1313 prev = l->l_back;
1314 l->l_back = NULL;
1315 next = l->l_forw;
1316 prev->l_forw = next;
1317 next->l_back = prev;
1318 if (prev == next)
1319 sched_whichqs &= ~RQMASK(whichq);
1320 #ifdef RQDEBUG
1321 checkrunqueue(whichq, NULL);
1322 #endif
1323 }
1324
1325 #undef RQMASK
1326 #endif /* !defined(__HAVE_MD_RUNQUEUE) */
1327