    1 /*	$NetBSD: kern_synch.c,v 1.186.2.3 2007/03/21 20:16:32 ad Exp $	*/
2
3 /*-
4 * Copyright (c) 1999, 2000, 2004, 2006, 2007 The NetBSD Foundation, Inc.
5 * All rights reserved.
6 *
7 * This code is derived from software contributed to The NetBSD Foundation
8 * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
9 * NASA Ames Research Center, by Charles M. Hannum, and by Andrew Doran.
10 *
11 * Redistribution and use in source and binary forms, with or without
12 * modification, are permitted provided that the following conditions
13 * are met:
14 * 1. Redistributions of source code must retain the above copyright
15 * notice, this list of conditions and the following disclaimer.
16 * 2. Redistributions in binary form must reproduce the above copyright
17 * notice, this list of conditions and the following disclaimer in the
18 * documentation and/or other materials provided with the distribution.
19 * 3. All advertising materials mentioning features or use of this software
20 * must display the following acknowledgement:
21 * This product includes software developed by the NetBSD
22 * Foundation, Inc. and its contributors.
23 * 4. Neither the name of The NetBSD Foundation nor the names of its
24 * contributors may be used to endorse or promote products derived
25 * from this software without specific prior written permission.
26 *
27 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
28 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
29 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
30 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
31 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37 * POSSIBILITY OF SUCH DAMAGE.
38 */
39
40 /*-
41 * Copyright (c) 1982, 1986, 1990, 1991, 1993
42 * The Regents of the University of California. All rights reserved.
43 * (c) UNIX System Laboratories, Inc.
44 * All or some portions of this file are derived from material licensed
45 * to the University of California by American Telephone and Telegraph
46 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
47 * the permission of UNIX System Laboratories, Inc.
48 *
49 * Redistribution and use in source and binary forms, with or without
50 * modification, are permitted provided that the following conditions
51 * are met:
52 * 1. Redistributions of source code must retain the above copyright
53 * notice, this list of conditions and the following disclaimer.
54 * 2. Redistributions in binary form must reproduce the above copyright
55 * notice, this list of conditions and the following disclaimer in the
56 * documentation and/or other materials provided with the distribution.
57 * 3. Neither the name of the University nor the names of its contributors
58 * may be used to endorse or promote products derived from this software
59 * without specific prior written permission.
60 *
61 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
62 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
63 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
64 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
65 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
66 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
67 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
68 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
69 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
70 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
71 * SUCH DAMAGE.
72 *
73 * @(#)kern_synch.c 8.9 (Berkeley) 5/19/95
74 */
75
76 #include <sys/cdefs.h>
77 __KERNEL_RCSID(0, "$NetBSD: kern_synch.c,v 1.186.2.3 2007/03/21 20:16:32 ad Exp $");
78
79 #include "opt_ddb.h"
80 #include "opt_kstack.h"
81 #include "opt_lockdebug.h"
82 #include "opt_multiprocessor.h"
83 #include "opt_perfctrs.h"
84
85 #define __MUTEX_PRIVATE
86
87 #include <sys/param.h>
88 #include <sys/systm.h>
89 #include <sys/callout.h>
90 #include <sys/proc.h>
91 #include <sys/kernel.h>
92 #include <sys/buf.h>
93 #if defined(PERFCTRS)
94 #include <sys/pmc.h>
95 #endif
96 #include <sys/signalvar.h>
97 #include <sys/resourcevar.h>
98 #include <sys/sched.h>
99 #include <sys/syscall_stats.h>
100 #include <sys/kauth.h>
101 #include <sys/sleepq.h>
102 #include <sys/lockdebug.h>
103
104 #include <uvm/uvm_extern.h>
105
106 #include <machine/cpu.h>
107
108 int rrticks; /* number of hardclock ticks per roundrobin() */
109
110 /*
111 * The global scheduler state.
112 */
113 kmutex_t sched_mutex; /* global sched state mutex */
114 struct prochd sched_qs[RUNQUE_NQS]; /* run queues */
115 volatile uint32_t sched_whichqs; /* bitmap of non-empty queues */
116
117 void schedcpu(void *);
118 void updatepri(struct lwp *);
119
120 void sched_unsleep(struct lwp *);
121 void sched_changepri(struct lwp *, pri_t);
122 void sched_lendpri(struct lwp *, pri_t);
123
124 struct callout schedcpu_ch = CALLOUT_INITIALIZER_SETFUNC(schedcpu, NULL);
125 static unsigned int schedcpu_ticks;
126
127 syncobj_t sleep_syncobj = {
128 SOBJ_SLEEPQ_SORTED,
129 sleepq_unsleep,
130 sleepq_changepri,
131 sleepq_lendpri,
132 syncobj_noowner,
133 };
134
135 syncobj_t sched_syncobj = {
136 SOBJ_SLEEPQ_SORTED,
137 sched_unsleep,
138 sched_changepri,
139 sched_lendpri,
140 syncobj_noowner,
141 };
142
143 /*
144 * Force switch among equal priority processes every 100ms.
145 * Called from hardclock every hz/10 == rrticks hardclock ticks.
146 */
147 /* ARGSUSED */
148 void
149 roundrobin(struct cpu_info *ci)
150 {
151 struct schedstate_percpu *spc = &ci->ci_schedstate;
152
153 spc->spc_rrticks = rrticks;
154
155 if (curlwp != NULL) {
156 if (spc->spc_flags & SPCF_SEENRR) {
157 /*
158 * The process has already been through a roundrobin
159 * without switching and may be hogging the CPU.
160 * Indicate that the process should yield.
161 */
162 spc->spc_flags |= SPCF_SHOULDYIELD;
163 } else
164 spc->spc_flags |= SPCF_SEENRR;
165 }
166 cpu_need_resched(curcpu());
167 }
168
169 #define PPQ (128 / RUNQUE_NQS) /* priorities per queue */
170 #define NICE_WEIGHT 2 /* priorities per nice level */
171
172 #define ESTCPU_SHIFT 11
173 #define ESTCPU_MAX ((NICE_WEIGHT * PRIO_MAX - PPQ) << ESTCPU_SHIFT)
174 #define ESTCPULIM(e) min((e), ESTCPU_MAX)
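
/*
 * Editor's note, a worked example assuming RUNQUE_NQS == 32 and
 * PRIO_MAX == 20 (the values in the standard headers): PPQ == 128 / 32
 * == 4 priorities per run queue, and ESTCPU_MAX == (2 * 20 - 4) << 11
 * == 73728, i.e. the estimator saturates at 36 priority steps' worth
 * of accumulated CPU time.
 */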
175
176 /*
177 * Constants for digital decay and forget:
178 * 90% of (p_estcpu) usage in 5 * loadav time
179 * 95% of (p_pctcpu) usage in 60 seconds (load insensitive)
180 * Note that, as ps(1) mentions, this can let percentages
181 * total over 100% (I've seen 137.9% for 3 processes).
182 *
183 * Note that hardclock updates p_estcpu and p_cpticks independently.
184 *
185 * We wish to decay away 90% of p_estcpu in (5 * loadavg) seconds.
186 * That is, the system wants to compute a value of decay such
187 * that the following for loop:
188 * for (i = 0; i < (5 * loadavg); i++)
189 * p_estcpu *= decay;
190 * will compute
191 * p_estcpu *= 0.1;
192 * for all values of loadavg:
193 *
194 * Mathematically this loop can be expressed by saying:
195 * decay ** (5 * loadavg) ~= .1
196 *
197 * The system computes decay as:
198 * decay = (2 * loadavg) / (2 * loadavg + 1)
199 *
200 * We wish to prove that the system's computation of decay
201 * will always fulfill the equation:
202 * decay ** (5 * loadavg) ~= .1
203 *
204 * If we compute b as:
205 * b = 2 * loadavg
206 * then
207 * decay = b / (b + 1)
208 *
209 * We now need to prove two things:
210 * 1) Given factor ** (5 * loadavg) ~= .1, prove factor == b/(b+1)
211 * 2) Given b/(b+1) ** power ~= .1, prove power == (5 * loadavg)
212 *
213 * Facts:
214 * For x close to zero, exp(x) =~ 1 + x, since
215 * exp(x) = 0! + x**1/1! + x**2/2! + ... .
216 * therefore exp(-1/b) =~ 1 - (1/b) = (b-1)/b.
217 * For x close to zero, ln(1+x) =~ x, since
218 * ln(1+x) = x - x**2/2 + x**3/3 - ... -1 < x < 1
219 * therefore ln(b/(b+1)) = ln(1 - 1/(b+1)) =~ -1/(b+1).
220 * ln(.1) =~ -2.30
221 *
222 * Proof of (1):
223 * Solve (factor)**(power) =~ .1 given power (5*loadav):
224 * solving for factor,
225 * ln(factor) =~ (-2.30/5*loadav), or
226 * factor =~ exp(-1/((5/2.30)*loadav)) =~ exp(-1/(2*loadav)) =
227 * exp(-1/b) =~ (b-1)/b =~ b/(b+1). QED
228 *
229 * Proof of (2):
230 * Solve (factor)**(power) =~ .1 given factor == (b/(b+1)):
231 * solving for power,
232 * power*ln(b/(b+1)) =~ -2.30, or
233 * power =~ 2.3 * (b + 1) = 4.6*loadav + 2.3 =~ 5*loadav. QED
234 *
235 * Actual power values for the implemented algorithm are as follows:
236 * loadav: 1 2 3 4
237 * power: 5.68 10.32 14.94 19.55
238 */
239
240 /* calculations for digital decay to forget 90% of usage in 5*loadav sec */
241 #define loadfactor(loadav) (2 * (loadav))
242
243 static fixpt_t
244 decay_cpu(fixpt_t loadfac, fixpt_t estcpu)
245 {
246
247 if (estcpu == 0) {
248 return 0;
249 }
250
251 #if !defined(_LP64)
252 	/* avoid 64-bit arithmetic. */
253 #define FIXPT_MAX ((fixpt_t)((UINTMAX_C(1) << sizeof(fixpt_t) * CHAR_BIT) - 1))
254 if (__predict_true(loadfac <= FIXPT_MAX / ESTCPU_MAX)) {
255 return estcpu * loadfac / (loadfac + FSCALE);
256 }
257 #endif /* !defined(_LP64) */
258
259 return (uint64_t)estcpu * loadfac / (loadfac + FSCALE);
260 }
261
262 /*
263 * For all load averages >= 1 and max p_estcpu of (255 << ESTCPU_SHIFT),
264 * sleeping for at least seven times the loadfactor will decay p_estcpu to
265 * less than (1 << ESTCPU_SHIFT).
266 *
267 * note that our ESTCPU_MAX is actually much smaller than (255 << ESTCPU_SHIFT).
268 */
269 static fixpt_t
270 decay_cpu_batch(fixpt_t loadfac, fixpt_t estcpu, unsigned int n)
271 {
272
273 if ((n << FSHIFT) >= 7 * loadfac) {
274 return 0;
275 }
276
277 while (estcpu != 0 && n > 1) {
278 estcpu = decay_cpu(loadfac, estcpu);
279 n--;
280 }
281
282 return estcpu;
283 }
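
/*
 * Editor's note, a worked example: for a load average of 1.0,
 * ldavg[0] == FSCALE, so loadfac == 2 * FSCALE and decay_cpu()
 * multiplies estcpu by loadfac / (loadfac + FSCALE) == 2/3 each
 * second.  Since (2/3)**5.68 ~= 0.1, this matches the "power: 5.68"
 * entry in the table above.  decay_cpu_batch() short-circuits to
 * zero once the sleep time reaches 7 * loadfactor seconds, i.e.
 * 14 seconds at a load average of 1.
 */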
284
285 /* decay 95% of `p_pctcpu' in 60 seconds; see CCPU_SHIFT before changing */
286 fixpt_t ccpu = 0.95122942450071400909 * FSCALE; /* exp(-1/20) */
287
288 /*
289 * If `ccpu' is not equal to `exp(-1/20)' and you still want to use the
290 * faster/more-accurate formula, you'll have to estimate CCPU_SHIFT below
291 * and possibly adjust FSHIFT in "param.h" so that (FSHIFT >= CCPU_SHIFT).
292 *
293 * To estimate CCPU_SHIFT for exp(-1/20), the following formula was used:
294 * 1 - exp(-1/20) ~= 0.0487 ~= 0.0488 == 1 (fixed pt, *11* bits).
295 *
296  * If you don't want to bother with the faster/more-accurate formula, you
297 * can set CCPU_SHIFT to (FSHIFT + 1) which will use a slower/less-accurate
298 * (more general) method of calculating the %age of CPU used by a process.
299 */
300 #define CCPU_SHIFT 11
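
/*
 * Editor's note: in the usual configuration where FSHIFT == CCPU_SHIFT
 * (both 11 bits), the (FSHIFT - CCPU_SHIFT) shift in schedcpu() below
 * is zero, so with clkhz == 100 the tick count p_cpticks is simply
 * added to p_pctcpu unscaled once per second before being reset.
 */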
301
302 /*
303 * schedcpu:
304 *
305  *	Recompute process priorities every hz ticks (i.e. once per second).
306 *
307 * XXXSMP This needs to be reorganised in order to reduce the locking
308 * burden.
309 */
310 /* ARGSUSED */
311 void
312 schedcpu(void *arg)
313 {
314 fixpt_t loadfac = loadfactor(averunnable.ldavg[0]);
315 struct rlimit *rlim;
316 struct lwp *l;
317 struct proc *p;
318 int minslp, clkhz, sig;
319 long runtm;
320
321 schedcpu_ticks++;
322
323 mutex_enter(&proclist_mutex);
324 PROCLIST_FOREACH(p, &allproc) {
325 /*
326 * Increment time in/out of memory and sleep time (if
327 * sleeping). We ignore overflow; with 16-bit int's
328 * (remember them?) overflow takes 45 days.
329 */
330 minslp = 2;
331 mutex_enter(&p->p_smutex);
332 runtm = p->p_rtime.tv_sec;
333 LIST_FOREACH(l, &p->p_lwps, l_sibling) {
334 lwp_lock(l);
335 runtm += l->l_rtime.tv_sec;
336 l->l_swtime++;
337 if (l->l_stat == LSSLEEP || l->l_stat == LSSTOP ||
338 l->l_stat == LSSUSPENDED) {
339 l->l_slptime++;
340 minslp = min(minslp, l->l_slptime);
341 } else
342 minslp = 0;
343 lwp_unlock(l);
344 }
345 p->p_pctcpu = (p->p_pctcpu * ccpu) >> FSHIFT;
346
347 /*
348 * Check if the process exceeds its CPU resource allocation.
349 * If over max, kill it.
350 */
351 rlim = &p->p_rlimit[RLIMIT_CPU];
352 sig = 0;
353 if (runtm >= rlim->rlim_cur) {
354 if (runtm >= rlim->rlim_max)
355 sig = SIGKILL;
356 else {
357 sig = SIGXCPU;
358 if (rlim->rlim_cur < rlim->rlim_max)
359 rlim->rlim_cur += 5;
360 }
361 }
362
363 /*
364 * If the process has run for more than autonicetime, reduce
365 * priority to give others a chance.
366 */
367 if (autonicetime && runtm > autonicetime && p->p_nice == NZERO
368 && kauth_cred_geteuid(p->p_cred)) {
369 mutex_spin_enter(&p->p_stmutex);
370 p->p_nice = autoniceval + NZERO;
371 resetprocpriority(p);
372 mutex_spin_exit(&p->p_stmutex);
373 }
374
375 /*
376 * If the process has slept the entire second,
377 * stop recalculating its priority until it wakes up.
378 */
379 if (minslp <= 1) {
380 /*
381 * p_pctcpu is only for ps.
382 */
383 mutex_spin_enter(&p->p_stmutex);
384 clkhz = stathz != 0 ? stathz : hz;
385 #if (FSHIFT >= CCPU_SHIFT)
386 p->p_pctcpu += (clkhz == 100)?
387 ((fixpt_t) p->p_cpticks) << (FSHIFT - CCPU_SHIFT):
388 100 * (((fixpt_t) p->p_cpticks)
389 << (FSHIFT - CCPU_SHIFT)) / clkhz;
390 #else
391 p->p_pctcpu += ((FSCALE - ccpu) *
392 (p->p_cpticks * FSCALE / clkhz)) >> FSHIFT;
393 #endif
394 p->p_cpticks = 0;
395 p->p_estcpu = decay_cpu(loadfac, p->p_estcpu);
396
397 LIST_FOREACH(l, &p->p_lwps, l_sibling) {
398 lwp_lock(l);
399 if (l->l_slptime <= 1 &&
400 l->l_priority >= PUSER)
401 resetpriority(l);
402 lwp_unlock(l);
403 }
404 mutex_spin_exit(&p->p_stmutex);
405 }
406
407 mutex_exit(&p->p_smutex);
408 if (sig) {
409 psignal(p, sig);
410 }
411 }
412 mutex_exit(&proclist_mutex);
413 uvm_meter();
414 cv_broadcast(&lbolt);
415 callout_schedule(&schedcpu_ch, hz);
416 }
417
418 /*
419 * Recalculate the priority of a process after it has slept for a while.
420 */
421 void
422 updatepri(struct lwp *l)
423 {
424 struct proc *p = l->l_proc;
425 fixpt_t loadfac;
426
427 KASSERT(lwp_locked(l, NULL));
428 KASSERT(l->l_slptime > 1);
429
430 loadfac = loadfactor(averunnable.ldavg[0]);
431
432 l->l_slptime--; /* the first time was done in schedcpu */
433 /* XXX NJWLWP */
434 /* XXXSMP occasionally unlocked, should be per-LWP */
435 p->p_estcpu = decay_cpu_batch(loadfac, p->p_estcpu, l->l_slptime);
436 resetpriority(l);
437 }
438
439 /*
440 * During autoconfiguration or after a panic, a sleep will simply lower the
441 * priority briefly to allow interrupts, then return. The priority to be
442 * used (safepri) is machine-dependent, thus this value is initialized and
443 * maintained in the machine-dependent layers. This priority will typically
444 * be 0, or the lowest priority that is safe for use on the interrupt stack;
445 * it can be made higher to block network software interrupts after panics.
446 */
447 int safepri;
448
449 /*
450 * OBSOLETE INTERFACE
451 *
452 * General sleep call. Suspends the current process until a wakeup is
453 * performed on the specified identifier. The process will then be made
454 * runnable with the specified priority. Sleeps at most timo/hz seconds (0
455 * means no timeout). If pri includes PCATCH flag, signals are checked
456 * before and after sleeping, else signals are not checked. Returns 0 if
457 * awakened, EWOULDBLOCK if the timeout expires. If PCATCH is set and a
458 * signal needs to be delivered, ERESTART is returned if the current system
459 * call should be restarted if possible, and EINTR is returned if the system
460  * call should be interrupted by the signal.
461 *
462 * The interlock is held until we are on a sleep queue. The interlock will
463 * be locked before returning back to the caller unless the PNORELOCK flag
464 * is specified, in which case the interlock will always be unlocked upon
465 * return.
466 */
467 int
468 ltsleep(wchan_t ident, pri_t priority, const char *wmesg, int timo,
469 volatile struct simplelock *interlock)
470 {
471 struct lwp *l = curlwp;
472 sleepq_t *sq;
473 int error, catch;
474
475 if (sleepq_dontsleep(l)) {
476 (void)sleepq_abort(NULL, 0);
477 if ((priority & PNORELOCK) != 0)
478 simple_unlock(interlock);
479 return 0;
480 }
481
482 sq = sleeptab_lookup(&sleeptab, ident);
483 sleepq_enter(sq, l);
484
485 if (interlock != NULL)
486 simple_unlock(interlock);
487
488 catch = priority & PCATCH;
489 sleepq_block(sq, priority & PRIMASK, ident, wmesg, timo, catch,
490 &sleep_syncobj);
491 error = sleepq_unblock(timo, catch);
492
493 if (interlock != NULL && (priority & PNORELOCK) == 0)
494 simple_lock(interlock);
495
496 return error;
497 }
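
/*
 * Editor's note: a minimal caller sketch, not part of the build, showing
 * the traditional pattern described in the comment above.  All identifiers
 * here (example_lock, example_ready, example_wait, example_signal) are
 * hypothetical; example_lock is assumed to have been set up elsewhere
 * with simple_lock_init().
 */
#if 0
static struct simplelock example_lock;
static int example_ready;

static int
example_wait(void)
{
	int error = 0;

	simple_lock(&example_lock);
	while (!example_ready && error == 0) {
		/* The interlock is dropped while asleep and re-taken here. */
		error = ltsleep(&example_ready, PWAIT | PCATCH, "exwait", 0,
		    &example_lock);
	}
	simple_unlock(&example_lock);
	return error;
}

static void
example_signal(void)
{
	simple_lock(&example_lock);
	example_ready = 1;
	simple_unlock(&example_lock);
	wakeup(&example_ready);
}
#endif	/* 0 */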
498
499 int
500 mtsleep(wchan_t ident, pri_t priority, const char *wmesg, int timo,
501 kmutex_t *mtx)
502 {
503 struct lwp *l = curlwp;
504 sleepq_t *sq;
505 int error, catch;
506
507 if (sleepq_dontsleep(l)) {
508 (void)sleepq_abort(mtx, (priority & PNORELOCK) != 0);
509 return 0;
510 }
511
512 sq = sleeptab_lookup(&sleeptab, ident);
513 sleepq_enter(sq, l);
514 mutex_exit(mtx);
515
516 catch = priority & PCATCH;
517 sleepq_block(sq, priority & PRIMASK, ident, wmesg, timo, catch,
518 &sleep_syncobj);
519 error = sleepq_unblock(timo, catch);
520
521 if ((priority & PNORELOCK) == 0)
522 mutex_enter(mtx);
523
524 return error;
525 }
526
527 /*
528 * General sleep call for situations where a wake-up is not expected.
529 */
530 int
531 kpause(const char *wmesg, bool intr, int timo, kmutex_t *mtx)
532 {
533 struct lwp *l = curlwp;
534 sleepq_t *sq;
535 int error;
536
537 if (sleepq_dontsleep(l))
538 return sleepq_abort(NULL, 0);
539
540 if (mtx != NULL)
541 mutex_exit(mtx);
542 sq = sleeptab_lookup(&sleeptab, l);
543 sleepq_enter(sq, l);
544 sleepq_block(sq, sched_kpri(l), l, wmesg, timo, intr, &sleep_syncobj);
545 error = sleepq_unblock(timo, intr);
546 if (mtx != NULL)
547 mutex_enter(mtx);
548
549 return error;
550 }
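
/*
 * Editor's note: a typical (hypothetical) use of kpause() is a plain
 * delay where no wakeup() is expected, for example a roughly 100ms
 * pause:
 *
 *	(void)kpause("exdelay", false, hz / 10, NULL);
 *
 * The timeout must be non-zero, since nothing will ever wake the
 * sleep otherwise; passing a mutex makes kpause() drop it while
 * sleeping and re-take it before returning, as the code above shows.
 */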
551
552 /*
553 * OBSOLETE INTERFACE
554 *
555 * Make all processes sleeping on the specified identifier runnable.
556 */
557 void
558 wakeup(wchan_t ident)
559 {
560 sleepq_t *sq;
561
562 if (cold)
563 return;
564
565 sq = sleeptab_lookup(&sleeptab, ident);
566 sleepq_wake(sq, ident, (u_int)-1);
567 }
568
569 /*
570 * OBSOLETE INTERFACE
571 *
572 * Make the highest priority process first in line on the specified
573 * identifier runnable.
574 */
575 void
576 wakeup_one(wchan_t ident)
577 {
578 sleepq_t *sq;
579
580 if (cold)
581 return;
582
583 sq = sleeptab_lookup(&sleeptab, ident);
584 sleepq_wake(sq, ident, 1);
585 }
586
587
588 /*
589 * General yield call. Puts the current process back on its run queue and
590 * performs a voluntary context switch. Should only be called when the
591  * current process explicitly requests it (e.g. sched_yield(2) in compat code).
592 */
593 void
594 yield(void)
595 {
596 struct lwp *l = curlwp;
597
598 KERNEL_UNLOCK_ALL(l, &l->l_biglocks);
599 lwp_lock(l);
600 if (l->l_stat == LSONPROC) {
601 KASSERT(lwp_locked(l, &sched_mutex));
602 l->l_priority = l->l_usrpri;
603 }
604 l->l_nvcsw++;
605 mi_switch(l, NULL);
606 KERNEL_LOCK(l->l_biglocks, l);
607 }
608
609 /*
610 * General preemption call. Puts the current process back on its run queue
611 * and performs an involuntary context switch.
612 */
613 void
614 preempt(void)
615 {
616 struct lwp *l = curlwp;
617
618 KERNEL_UNLOCK_ALL(l, &l->l_biglocks);
619 lwp_lock(l);
620 if (l->l_stat == LSONPROC) {
621 KASSERT(lwp_locked(l, &sched_mutex));
622 l->l_priority = l->l_usrpri;
623 }
624 l->l_nivcsw++;
625 (void)mi_switch(l, NULL);
626 KERNEL_LOCK(l->l_biglocks, l);
627 }
628
629 /*
630 * The machine independent parts of context switch. Switch to "new"
631 * if non-NULL, otherwise let cpu_switch choose the next lwp.
632 *
633 * Returns 1 if another process was actually run.
634 */
635 int
636 mi_switch(struct lwp *l, struct lwp *newl)
637 {
638 struct schedstate_percpu *spc;
639 struct timeval tv;
640 int retval, oldspl;
641 long s, u;
642
643 KASSERT(lwp_locked(l, NULL));
644
645 #ifdef KSTACK_CHECK_MAGIC
646 kstack_check_magic(l);
647 #endif
648
649 /*
650 * It's safe to read the per CPU schedstate unlocked here, as all we
651 	 * are after is the run time and that's guaranteed to have been last
652 * updated by this CPU.
653 */
654 KDASSERT(l->l_cpu == curcpu());
655 spc = &l->l_cpu->ci_schedstate;
656
657 /*
658 * Compute the amount of time during which the current
659 * process was running.
660 */
661 microtime(&tv);
662 u = l->l_rtime.tv_usec +
663 (tv.tv_usec - spc->spc_runtime.tv_usec);
664 s = l->l_rtime.tv_sec + (tv.tv_sec - spc->spc_runtime.tv_sec);
665 if (u < 0) {
666 u += 1000000;
667 s--;
668 } else if (u >= 1000000) {
669 u -= 1000000;
670 s++;
671 }
672 l->l_rtime.tv_usec = u;
673 l->l_rtime.tv_sec = s;
674
675 /* Count time spent in current system call */
676 SYSCALL_TIME_SLEEP(l);
677
678 /*
679 * XXXSMP If we are using h/w performance counters, save context.
680 */
681 #if PERFCTRS
682 if (PMC_ENABLED(l->l_proc)) {
683 pmc_save_context(l->l_proc);
684 }
685 #endif
686
687 /*
688 * Acquire the sched_mutex if necessary. It will be released by
689 * cpu_switch once it has decided to idle, or picked another LWP
690 * to run.
691 */
692 #if defined(MULTIPROCESSOR) || defined(LOCKDEBUG)
693 if (l->l_mutex != &sched_mutex) {
694 mutex_spin_enter(&sched_mutex);
695 lwp_unlock(l);
696 }
697 #endif
698
699 /*
700 * If on the CPU and we have gotten this far, then we must yield.
701 */
702 KASSERT(l->l_stat != LSRUN);
703 if (l->l_stat == LSONPROC) {
704 KASSERT(lwp_locked(l, &sched_mutex));
705 l->l_stat = LSRUN;
706 setrunqueue(l);
707 }
708 uvmexp.swtch++;
709
710 /*
711 * Process is about to yield the CPU; clear the appropriate
712 * scheduling flags.
713 */
714 spc->spc_flags &= ~SPCF_SWITCHCLEAR;
715
716 LOCKDEBUG_BARRIER(&sched_mutex, 1);
717
718 /*
719 * Switch to the new current LWP. When we run again, we'll
720 * return back here.
721 */
722 oldspl = MUTEX_SPIN_OLDSPL(l->l_cpu);
723
724 if (newl == NULL || newl->l_back == NULL)
725 retval = cpu_switch(l, NULL);
726 else {
727 KASSERT(lwp_locked(newl, &sched_mutex));
728 remrunqueue(newl);
729 cpu_switchto(l, newl);
730 retval = 0;
731 }
732
733 /*
734 * XXXSMP If we are using h/w performance counters, restore context.
735 */
736 #if PERFCTRS
737 if (PMC_ENABLED(l->l_proc)) {
738 pmc_restore_context(l->l_proc);
739 }
740 #endif
741
742 /*
743 * We're running again; record our new start time. We might
744 * be running on a new CPU now, so don't use the cached
745 * schedstate_percpu pointer.
746 */
747 SYSCALL_TIME_WAKEUP(l);
748 KDASSERT(l->l_cpu == curcpu());
749 microtime(&l->l_cpu->ci_schedstate.spc_runtime);
750 splx(oldspl);
751
752 return retval;
753 }
754
755 /*
756 * Initialize the (doubly-linked) run queues
757 * to be empty.
758 */
759 void
760 rqinit()
761 {
762 int i;
763
764 for (i = 0; i < RUNQUE_NQS; i++)
765 sched_qs[i].ph_link = sched_qs[i].ph_rlink =
766 (struct lwp *)&sched_qs[i];
767
768 mutex_init(&sched_mutex, MUTEX_SPIN, IPL_SCHED);
769 }
770
771 static inline void
772 resched_lwp(struct lwp *l)
773 {
774 struct cpu_info *ci;
775 const pri_t pri = lwp_eprio(l);
776
777 /*
778 * XXXSMP
779 * Since l->l_cpu persists across a context switch,
780 * this gives us *very weak* processor affinity, in
781 * that we notify the CPU on which the process last
782 * ran that it should try to switch.
783 *
784 * This does not guarantee that the process will run on
785 * that processor next, because another processor might
786 * grab it the next time it performs a context switch.
787 *
788 * This also does not handle the case where its last
789 * CPU is running a higher-priority process, but every
790 * other CPU is running a lower-priority process. There
791 * are ways to handle this situation, but they're not
792 * currently very pretty, and we also need to weigh the
793 * cost of moving a process from one CPU to another.
794 *
795 * XXXSMP
796 * There is also the issue of locking the other CPU's
797 * sched state, which we currently do not do.
798 */
799 ci = (l->l_cpu != NULL) ? l->l_cpu : curcpu();
800 if (pri < ci->ci_schedstate.spc_curpriority)
801 cpu_need_resched(ci);
802 }
803
804 /*
805 * Change process state to be runnable, placing it on the run queue if it is
806 * in memory, and awakening the swapper if it isn't in memory.
807 *
808 * Call with the process and LWP locked. Will return with the LWP unlocked.
809 */
810 void
811 setrunnable(struct lwp *l)
812 {
813 struct proc *p = l->l_proc;
814 sigset_t *ss;
815
816 KASSERT(mutex_owned(&p->p_smutex));
817 KASSERT(lwp_locked(l, NULL));
818
819 switch (l->l_stat) {
820 case LSSTOP:
821 /*
822 * If we're being traced (possibly because someone attached us
823 * while we were stopped), check for a signal from the debugger.
824 */
825 if ((p->p_slflag & PSL_TRACED) != 0 && p->p_xstat != 0) {
826 if ((sigprop[p->p_xstat] & SA_TOLWP) != 0)
827 ss = &l->l_sigpend.sp_set;
828 else
829 ss = &p->p_sigpend.sp_set;
830 sigaddset(ss, p->p_xstat);
831 signotify(l);
832 }
833 p->p_nrlwps++;
834 break;
835 case LSSUSPENDED:
836 l->l_flag &= ~LW_WSUSPEND;
837 p->p_nrlwps++;
838 break;
839 case LSSLEEP:
840 KASSERT(l->l_wchan != NULL);
841 break;
842 default:
843 panic("setrunnable: lwp %p state was %d", l, l->l_stat);
844 }
845
846 /*
847 	 * If the LWP was sleeping interruptibly, then it's OK to start it
848 * again. If not, mark it as still sleeping.
849 */
850 if (l->l_wchan != NULL) {
851 l->l_stat = LSSLEEP;
852 /* lwp_unsleep() will release the lock. */
853 lwp_unsleep(l);
854 return;
855 }
856
857 KASSERT(lwp_locked(l, &sched_mutex));
858
859 /*
860 * If the LWP is still on the CPU, mark it as LSONPROC. It may be
861 * about to call mi_switch(), in which case it will yield.
862 *
863 * XXXSMP Will need to change for preemption.
864 */
865 #ifdef MULTIPROCESSOR
866 if (l->l_cpu->ci_curlwp == l) {
867 #else
868 if (l == curlwp) {
869 #endif
870 l->l_stat = LSONPROC;
871 l->l_slptime = 0;
872 lwp_unlock(l);
873 return;
874 }
875
876 /*
877 * Set the LWP runnable. If it's swapped out, we need to wake the swapper
878 * to bring it back in. Otherwise, enter it into a run queue.
879 */
880 if (l->l_slptime > 1)
881 updatepri(l);
882 l->l_stat = LSRUN;
883 l->l_slptime = 0;
884
885 if (l->l_flag & LW_INMEM) {
886 setrunqueue(l);
887 resched_lwp(l);
888 lwp_unlock(l);
889 } else {
890 lwp_unlock(l);
891 uvm_kick_scheduler();
892 }
893 }
894
895 /*
896 * Compute the priority of a process when running in user mode.
897 * Arrange to reschedule if the resulting priority is better
898 * than that of the current process.
899 */
900 void
901 resetpriority(struct lwp *l)
902 {
903 pri_t newpriority;
904 struct proc *p = l->l_proc;
905
906 /* XXXSMP KASSERT(mutex_owned(&p->p_stmutex)); */
907 KASSERT(lwp_locked(l, NULL));
908
909 if ((l->l_flag & LW_SYSTEM) != 0)
910 return;
911
912 newpriority = PUSER + (p->p_estcpu >> ESTCPU_SHIFT) +
913 NICE_WEIGHT * (p->p_nice - NZERO);
914 newpriority = min(newpriority, MAXPRI);
915 lwp_changepri(l, newpriority);
916 }
917
918 /*
919 * Recompute priority for all LWPs in a process.
920 */
921 void
922 resetprocpriority(struct proc *p)
923 {
924 struct lwp *l;
925
926 KASSERT(mutex_owned(&p->p_stmutex));
927
928 LIST_FOREACH(l, &p->p_lwps, l_sibling) {
929 lwp_lock(l);
930 resetpriority(l);
931 lwp_unlock(l);
932 }
933 }
934
935 /*
936 * We adjust the priority of the current process. The priority of a process
937 * gets worse as it accumulates CPU time. The CPU usage estimator (p_estcpu)
938 * is increased here. The formula for computing priorities (in kern_synch.c)
939 * will compute a different value each time p_estcpu increases. This can
940 * cause a switch, but unless the priority crosses a PPQ boundary the actual
941 * queue will not change. The CPU usage estimator ramps up quite quickly
942 * when the process is running (linearly), and decays away exponentially, at
943 * a rate which is proportionally slower when the system is busy. The basic
944 * principle is that the system will 90% forget that the process used a lot
945 * of CPU time in 5 * loadav seconds. This causes the system to favor
946 * processes which haven't run much recently, and to round-robin among other
947 * processes.
948 */
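
/*
 * Editor's note, a worked example under the usual constants
 * (PUSER == 50, NZERO == 20): resetpriority() computes
 *
 *	l_usrpri = 50 + (p_estcpu >> ESTCPU_SHIFT) + 2 * (p_nice - 20)
 *
 * so an un-niced process that has accumulated p_estcpu == 8 << ESTCPU_SHIFT
 * runs at user priority 58; with PPQ == 4 that places it two run queues
 * away from a freshly woken process at the base user priority of 50.
 */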
949
950 void
951 schedclock(struct lwp *l)
952 {
953 struct proc *p = l->l_proc;
954
955 mutex_spin_enter(&p->p_stmutex);
956 p->p_estcpu = ESTCPULIM(p->p_estcpu + (1 << ESTCPU_SHIFT));
957 lwp_lock(l);
958 resetpriority(l);
959 mutex_spin_exit(&p->p_stmutex);
960 if ((l->l_flag & LW_SYSTEM) == 0 && l->l_priority >= PUSER)
961 l->l_priority = l->l_usrpri;
962 lwp_unlock(l);
963 }
964
965 /*
966 * suspendsched:
967 *
968 * Convert all non-L_SYSTEM LSSLEEP or LSRUN LWPs to LSSUSPENDED.
969 */
970 void
971 suspendsched(void)
972 {
973 #ifdef MULTIPROCESSOR
974 CPU_INFO_ITERATOR cii;
975 struct cpu_info *ci;
976 #endif
977 struct lwp *l;
978 struct proc *p;
979
980 /*
981 * We do this by process in order not to violate the locking rules.
982 */
983 mutex_enter(&proclist_mutex);
984 PROCLIST_FOREACH(p, &allproc) {
985 mutex_enter(&p->p_smutex);
986
987 if ((p->p_flag & PK_SYSTEM) != 0) {
988 mutex_exit(&p->p_smutex);
989 continue;
990 }
991
992 p->p_stat = SSTOP;
993
994 LIST_FOREACH(l, &p->p_lwps, l_sibling) {
995 if (l == curlwp)
996 continue;
997
998 lwp_lock(l);
999
1000 /*
1001 * Set L_WREBOOT so that the LWP will suspend itself
1002 * when it tries to return to user mode. We want to
1003 			 * try to get as many LWPs as possible to
1004 * the user / kernel boundary, so that they will
1005 * release any locks that they hold.
1006 */
1007 l->l_flag |= (LW_WREBOOT | LW_WSUSPEND);
1008
1009 if (l->l_stat == LSSLEEP &&
1010 (l->l_flag & LW_SINTR) != 0) {
1011 /* setrunnable() will release the lock. */
1012 setrunnable(l);
1013 continue;
1014 }
1015
1016 lwp_unlock(l);
1017 }
1018
1019 mutex_exit(&p->p_smutex);
1020 }
1021 mutex_exit(&proclist_mutex);
1022
1023 /*
1024 * Kick all CPUs to make them preempt any LWPs running in user mode.
1025 * They'll trap into the kernel and suspend themselves in userret().
1026 */
1027 sched_lock(0);
1028 #ifdef MULTIPROCESSOR
1029 for (CPU_INFO_FOREACH(cii, ci))
1030 cpu_need_resched(ci);
1031 #else
1032 cpu_need_resched(curcpu());
1033 #endif
1034 sched_unlock(0);
1035 }
1036
1037 /*
1038 * scheduler_fork_hook:
1039 *
1040 * Inherit the parent's scheduler history.
1041 */
1042 void
1043 scheduler_fork_hook(struct proc *parent, struct proc *child)
1044 {
1045
1046 KASSERT(mutex_owned(&parent->p_smutex));
1047
1048 child->p_estcpu = child->p_estcpu_inherited = parent->p_estcpu;
1049 child->p_forktime = schedcpu_ticks;
1050 }
1051
1052 /*
1053 * scheduler_wait_hook:
1054 *
1055 * Chargeback parents for the sins of their children.
1056 */
1057 void
1058 scheduler_wait_hook(struct proc *parent, struct proc *child)
1059 {
1060 fixpt_t loadfac = loadfactor(averunnable.ldavg[0]);
1061 fixpt_t estcpu;
1062
1063 /* XXX Only if parent != init?? */
1064
1065 mutex_spin_enter(&parent->p_stmutex);
1066 estcpu = decay_cpu_batch(loadfac, child->p_estcpu_inherited,
1067 schedcpu_ticks - child->p_forktime);
1068 if (child->p_estcpu > estcpu)
1069 parent->p_estcpu =
1070 ESTCPULIM(parent->p_estcpu + child->p_estcpu - estcpu);
1071 mutex_spin_exit(&parent->p_stmutex);
1072 }
1073
1074 /*
1075 * sched_kpri:
1076 *
1077 * Scale a priority level to a kernel priority level, usually
1078 * for an LWP that is about to sleep.
1079 */
1080 pri_t
1081 sched_kpri(struct lwp *l)
1082 {
1083 /*
1084 * Scale user priorities (127 -> 50) up to kernel priorities
1085 * in the range (49 -> 8). Reserve the top 8 kernel priorities
1086 * for high priority kthreads. Kernel priorities passed in
1087 * are left "as is". XXX This is somewhat arbitrary.
1088 */
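	/*
	 * For example (editor's note): an LWP at the lowest user priority
	 * (l_usrpri == 127) maps to kernel priority 49, one at PUSER (50)
	 * maps to 8, and a kernel priority such as 25 passes through
	 * unchanged.
	 */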
1089 static const uint8_t kpri_tab[] = {
1090 0, 1, 2, 3, 4, 5, 6, 7,
1091 8, 9, 10, 11, 12, 13, 14, 15,
1092 16, 17, 18, 19, 20, 21, 22, 23,
1093 24, 25, 26, 27, 28, 29, 30, 31,
1094 32, 33, 34, 35, 36, 37, 38, 39,
1095 40, 41, 42, 43, 44, 45, 46, 47,
1096 48, 49, 8, 8, 9, 9, 10, 10,
1097 11, 11, 12, 12, 13, 14, 14, 15,
1098 15, 16, 16, 17, 17, 18, 18, 19,
1099 20, 20, 21, 21, 22, 22, 23, 23,
1100 24, 24, 25, 26, 26, 27, 27, 28,
1101 28, 29, 29, 30, 30, 31, 32, 32,
1102 33, 33, 34, 34, 35, 35, 36, 36,
1103 37, 38, 38, 39, 39, 40, 40, 41,
1104 41, 42, 42, 43, 44, 44, 45, 45,
1105 46, 46, 47, 47, 48, 48, 49, 49,
1106 };
1107
1108 return (pri_t)kpri_tab[l->l_usrpri];
1109 }
1110
1111 /*
1112 * sched_unsleep:
1113 *
1114  *	This is called when the LWP has not been awoken normally but instead
1115 * interrupted: for example, if the sleep timed out. Because of this,
1116 * it's not a valid action for running or idle LWPs.
1117 */
1118 void
1119 sched_unsleep(struct lwp *l)
1120 {
1121
1122 lwp_unlock(l);
1123 panic("sched_unsleep");
1124 }
1125
1126 /*
1127 * sched_changepri:
1128 *
1129 * Adjust the priority of an LWP.
1130 */
1131 void
1132 sched_changepri(struct lwp *l, pri_t pri)
1133 {
1134
1135 KASSERT(lwp_locked(l, &sched_mutex));
1136
1137 l->l_usrpri = pri;
1138 if (l->l_priority < PUSER)
1139 return;
1140
1141 if (l->l_stat != LSRUN || (l->l_flag & LW_INMEM) == 0) {
1142 l->l_priority = pri;
1143 return;
1144 }
1145
1146 remrunqueue(l);
1147 l->l_priority = pri;
1148 setrunqueue(l);
1149 resched_lwp(l);
1150 }
1151
1152 void
1153 sched_lendpri(struct lwp *l, pri_t pri)
1154 {
1155
1156 KASSERT(lwp_locked(l, &sched_mutex));
1157
1158 if (l->l_stat != LSRUN || (l->l_flag & LW_INMEM) == 0) {
1159 l->l_inheritedprio = pri;
1160 return;
1161 }
1162
1163 remrunqueue(l);
1164 l->l_inheritedprio = pri;
1165 setrunqueue(l);
1166 resched_lwp(l);
1167 }
1168
1169 struct lwp *
1170 syncobj_noowner(wchan_t wchan)
1171 {
1172
1173 return NULL;
1174 }
1175
1176 /*
1177 * Low-level routines to access the run queue. Optimised assembler
1178 * routines can override these.
1179 */
1180
1181 #ifndef __HAVE_MD_RUNQUEUE
1182
1183 /*
1184  * On some architectures, it's faster to use an MSB ordering for the priorities
1185 * than the traditional LSB ordering.
1186 */
1187 #ifdef __HAVE_BIGENDIAN_BITOPS
1188 #define RQMASK(n) (0x80000000 >> (n))
1189 #else
1190 #define RQMASK(n) (0x00000001 << (n))
1191 #endif
1192
1193 /*
1194 * The primitives that manipulate the run queues. whichqs tells which
1195 * of the 32 queues qs have processes in them. Setrunqueue puts processes
1196 * into queues, remrunqueue removes them from queues. The running process is
1197 * on no queue, other processes are on a queue related to p->p_priority,
1198 * divided by 4 actually to shrink the 0-127 range of priorities into the 32
1199 * available queues.
1200 */
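
/*
 * Editor's note, a worked example: with PPQ == 4, an LWP whose effective
 * priority is 58 lands on sched_qs[58 / 4] == sched_qs[14], and
 * setrunqueue() marks that queue non-empty by setting RQMASK(14) in
 * sched_whichqs.
 */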
1201 #ifdef RQDEBUG
1202 static void
1203 checkrunqueue(int whichq, struct lwp *l)
1204 {
1205 const struct prochd * const rq = &sched_qs[whichq];
1206 struct lwp *l2;
1207 int found = 0;
1208 int die = 0;
1209 int empty = 1;
1210 for (l2 = rq->ph_link; l2 != (const void*) rq; l2 = l2->l_forw) {
1211 if (l2->l_stat != LSRUN) {
1212 printf("checkrunqueue[%d]: lwp %p state (%d) "
1213 " != LSRUN\n", whichq, l2, l2->l_stat);
1214 }
1215 if (l2->l_back->l_forw != l2) {
1216 printf("checkrunqueue[%d]: lwp %p back-qptr (%p) "
1217 "corrupt %p\n", whichq, l2, l2->l_back,
1218 l2->l_back->l_forw);
1219 die = 1;
1220 }
1221 if (l2->l_forw->l_back != l2) {
1222 printf("checkrunqueue[%d]: lwp %p forw-qptr (%p) "
1223 "corrupt %p\n", whichq, l2, l2->l_forw,
1224 l2->l_forw->l_back);
1225 die = 1;
1226 }
1227 if (l2 == l)
1228 found = 1;
1229 empty = 0;
1230 }
1231 if (empty && (sched_whichqs & RQMASK(whichq)) != 0) {
1232 printf("checkrunqueue[%d]: bit set for empty run-queue %p\n",
1233 whichq, rq);
1234 die = 1;
1235 } else if (!empty && (sched_whichqs & RQMASK(whichq)) == 0) {
1236 printf("checkrunqueue[%d]: bit clear for non-empty "
1237 "run-queue %p\n", whichq, rq);
1238 die = 1;
1239 }
1240 if (l != NULL && (sched_whichqs & RQMASK(whichq)) == 0) {
1241 printf("checkrunqueue[%d]: bit clear for active lwp %p\n",
1242 whichq, l);
1243 die = 1;
1244 }
1245 if (l != NULL && empty) {
1246 printf("checkrunqueue[%d]: empty run-queue %p with "
1247 "active lwp %p\n", whichq, rq, l);
1248 die = 1;
1249 }
1250 if (l != NULL && !found) {
1251 		printf("checkrunqueue[%d]: lwp %p not in runqueue %p!\n",
1252 whichq, l, rq);
1253 die = 1;
1254 }
1255 if (die)
1256 panic("checkrunqueue: inconsistency found");
1257 }
1258 #endif /* RQDEBUG */
1259
1260 void
1261 setrunqueue(struct lwp *l)
1262 {
1263 struct prochd *rq;
1264 struct lwp *prev;
1265 const int whichq = lwp_eprio(l) / PPQ;
1266
1267 KASSERT(lwp_locked(l, &sched_mutex));
1268
1269 #ifdef RQDEBUG
1270 checkrunqueue(whichq, NULL);
1271 #endif
1272 #ifdef DIAGNOSTIC
1273 if (l->l_back != NULL || l->l_stat != LSRUN)
1274 panic("setrunqueue");
1275 #endif
1276 sched_whichqs |= RQMASK(whichq);
1277 rq = &sched_qs[whichq];
1278 prev = rq->ph_rlink;
1279 l->l_forw = (struct lwp *)rq;
1280 rq->ph_rlink = l;
1281 prev->l_forw = l;
1282 l->l_back = prev;
1283 #ifdef RQDEBUG
1284 checkrunqueue(whichq, l);
1285 #endif
1286 }
1287
1288 /*
1289 * XXXSMP When LWP dispatch (cpu_switch()) is changed to use remrunqueue(),
1290  * the drop of the effective priority level from kernel to user needs to be
1291 * moved here from userret(). The assignment in userret() is currently
1292 * done unlocked.
1293 */
1294 void
1295 remrunqueue(struct lwp *l)
1296 {
1297 struct lwp *prev, *next;
1298 const int whichq = lwp_eprio(l) / PPQ;
1299
1300 KASSERT(lwp_locked(l, &sched_mutex));
1301
1302 #ifdef RQDEBUG
1303 checkrunqueue(whichq, l);
1304 #endif
1305
1306 #if defined(DIAGNOSTIC)
1307 if (((sched_whichqs & RQMASK(whichq)) == 0) || l->l_back == NULL) {
1308 /* Shouldn't happen - interrupts disabled. */
1309 panic("remrunqueue: bit %d not set", whichq);
1310 }
1311 #endif
1312 prev = l->l_back;
1313 l->l_back = NULL;
1314 next = l->l_forw;
1315 prev->l_forw = next;
1316 next->l_back = prev;
1317 if (prev == next)
1318 sched_whichqs &= ~RQMASK(whichq);
1319 #ifdef RQDEBUG
1320 checkrunqueue(whichq, NULL);
1321 #endif
1322 }
1323
1324 #undef RQMASK
1325 #endif /* !defined(__HAVE_MD_RUNQUEUE) */
1326