/*	$NetBSD: kern_synch.c,v 1.186.2.5 2007/04/10 18:34:05 ad Exp $	*/

/*-
 * Copyright (c) 1999, 2000, 2004, 2006, 2007 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
 * NASA Ames Research Center, by Charles M. Hannum, and by Andrew Doran.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *        This product includes software developed by the NetBSD
 *        Foundation, Inc. and its contributors.
 * 4. Neither the name of The NetBSD Foundation nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*-
 * Copyright (c) 1982, 1986, 1990, 1991, 1993
 *	The Regents of the University of California. All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)kern_synch.c	8.9 (Berkeley) 5/19/95
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: kern_synch.c,v 1.186.2.5 2007/04/10 18:34:05 ad Exp $");

#include "opt_ddb.h"
#include "opt_kstack.h"
#include "opt_lockdebug.h"
#include "opt_multiprocessor.h"
#include "opt_perfctrs.h"

#define __MUTEX_PRIVATE

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/callout.h>
#include <sys/proc.h>
#include <sys/kernel.h>
#include <sys/buf.h>
#if defined(PERFCTRS)
#include <sys/pmc.h>
#endif
#include <sys/signalvar.h>
#include <sys/resourcevar.h>
#include <sys/sched.h>
#include <sys/syscall_stats.h>
#include <sys/kauth.h>
#include <sys/sleepq.h>
#include <sys/lockdebug.h>

#include <uvm/uvm_extern.h>

#include <machine/cpu.h>

int	rrticks;	/* number of hardclock ticks per roundrobin() */

/*
 * The global scheduler state.
 */
kmutex_t	sched_mutex;		/* global sched state mutex */
struct prochd	sched_qs[RUNQUE_NQS];	/* run queues */
volatile uint32_t sched_whichqs;	/* bitmap of non-empty queues */

void	schedcpu(void *);
void	updatepri(struct lwp *);

void	sched_unsleep(struct lwp *);
void	sched_changepri(struct lwp *, pri_t);
void	sched_lendpri(struct lwp *, pri_t);

struct callout schedcpu_ch = CALLOUT_INITIALIZER_SETFUNC(schedcpu, NULL);
static unsigned int schedcpu_ticks;

syncobj_t sleep_syncobj = {
	SOBJ_SLEEPQ_SORTED,
	sleepq_unsleep,
	sleepq_changepri,
	sleepq_lendpri,
	syncobj_noowner,
};

syncobj_t sched_syncobj = {
	SOBJ_SLEEPQ_SORTED,
	sched_unsleep,
	sched_changepri,
	sched_lendpri,
	syncobj_noowner,
};

/*
 * Force switch among equal priority processes every 100ms.
 * Called from hardclock every hz/10 == rrticks hardclock ticks.
 */
/* ARGSUSED */
void
roundrobin(struct cpu_info *ci)
{
	struct schedstate_percpu *spc = &ci->ci_schedstate;

	spc->spc_rrticks = rrticks;

	if (curlwp != NULL) {
		if (spc->spc_flags & SPCF_SEENRR) {
			/*
			 * The process has already been through a roundrobin
			 * without switching and may be hogging the CPU.
			 * Indicate that the process should yield.
			 */
			spc->spc_flags |= SPCF_SHOULDYIELD;
		} else
			spc->spc_flags |= SPCF_SEENRR;
	}
	cpu_need_resched(curcpu());
}
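
/*
 * Worked example (illustrative only): with the traditional hz == 100,
 * hz / 10 == 10, so roundrobin() is invoked every 10 hardclock ticks,
 * i.e. roughly every 100ms, assuming rrticks has been initialized to
 * hz / 10 as the comment above describes.
 */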

#define PPQ		(128 / RUNQUE_NQS)	/* priorities per queue */
#define NICE_WEIGHT	2			/* priorities per nice level */

#define ESTCPU_SHIFT	11
#define ESTCPU_MAX	((NICE_WEIGHT * PRIO_MAX - PPQ) << ESTCPU_SHIFT)
#define ESTCPULIM(e)	min((e), ESTCPU_MAX)

/*
 * Constants for digital decay and forget:
 *	90% of (p_estcpu) usage in 5 * loadav time
 *	95% of (p_pctcpu) usage in 60 seconds (load insensitive)
 *	Note that, as ps(1) mentions, this can let percentages
 *	total over 100% (I've seen 137.9% for 3 processes).
 *
 * Note that hardclock updates p_estcpu and p_cpticks independently.
 *
 * We wish to decay away 90% of p_estcpu in (5 * loadavg) seconds.
 * That is, the system wants to compute a value of decay such
 * that the following for loop:
 *	for (i = 0; i < (5 * loadavg); i++)
 *		p_estcpu *= decay;
 * will compute
 *	p_estcpu *= 0.1;
 * for all values of loadavg:
 *
 * Mathematically this loop can be expressed by saying:
 *	decay ** (5 * loadavg) ~= .1
 *
 * The system computes decay as:
 *	decay = (2 * loadavg) / (2 * loadavg + 1)
 *
 * We wish to prove that the system's computation of decay
 * will always fulfill the equation:
 *	decay ** (5 * loadavg) ~= .1
 *
 * If we compute b as:
 *	b = 2 * loadavg
 * then
 *	decay = b / (b + 1)
 *
 * We now need to prove two things:
 *	1) Given factor ** (5 * loadavg) ~= .1, prove factor == b/(b+1)
 *	2) Given b/(b+1) ** power ~= .1, prove power == (5 * loadavg)
 *
 * Facts:
 *	For x close to zero, exp(x) =~ 1 + x, since
 *		exp(x) = 0! + x**1/1! + x**2/2! + ... .
 *		therefore exp(-1/b) =~ 1 - (1/b) = (b-1)/b.
 *	For x close to zero, ln(1+x) =~ x, since
 *		ln(1+x) = x - x**2/2 + x**3/3 - ...	-1 < x < 1
 *		therefore ln(b/(b+1)) = ln(1 - 1/(b+1)) =~ -1/(b+1).
 *	ln(.1) =~ -2.30
 *
 * Proof of (1):
 *	Solve (factor)**(power) =~ .1 given power (5*loadav):
 *	solving for factor,
 *		ln(factor) =~ (-2.30/5*loadav), or
 *		factor =~ exp(-1/((5/2.30)*loadav)) =~ exp(-1/(2*loadav)) =
 *		exp(-1/b) =~ (b-1)/b =~ b/(b+1). QED
 *
 * Proof of (2):
 *	Solve (factor)**(power) =~ .1 given factor == (b/(b+1)):
 *	solving for power,
 *		power*ln(b/(b+1)) =~ -2.30, or
 *		power =~ 2.3 * (b + 1) = 4.6*loadav + 2.3 =~ 5*loadav. QED
 *
 * Actual power values for the implemented algorithm are as follows:
 *	loadav:	1	2	3	4
 *	power:	5.68	10.32	14.94	19.55
 */

/* calculations for digital decay to forget 90% of usage in 5*loadav sec */
#define loadfactor(loadav)	(2 * (loadav))

static fixpt_t
decay_cpu(fixpt_t loadfac, fixpt_t estcpu)
{

	if (estcpu == 0) {
		return 0;
	}

#if !defined(_LP64)
	/* avoid 64-bit arithmetic. */
#define FIXPT_MAX ((fixpt_t)((UINTMAX_C(1) << sizeof(fixpt_t) * CHAR_BIT) - 1))
	if (__predict_true(loadfac <= FIXPT_MAX / ESTCPU_MAX)) {
		return estcpu * loadfac / (loadfac + FSCALE);
	}
#endif /* !defined(_LP64) */

	return (uint64_t)estcpu * loadfac / (loadfac + FSCALE);
}
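
/*
 * Worked example (illustrative only): with a load average of 1.0,
 * loadfac == 2 * FSCALE, so decay_cpu() scales estcpu by
 * 2*FSCALE / (2*FSCALE + FSCALE) == 2/3 each time it is applied.
 * Applied once per second for 5 seconds, estcpu is multiplied by
 * (2/3)^5 ~= 0.13, which is the "90% forgotten in 5 * loadavg
 * seconds" behaviour described above.
 */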

/*
 * For all load averages >= 1 and max p_estcpu of (255 << ESTCPU_SHIFT),
 * sleeping for at least seven times the loadfactor will decay p_estcpu to
 * less than (1 << ESTCPU_SHIFT).
 *
 * note that our ESTCPU_MAX is actually much smaller than (255 << ESTCPU_SHIFT).
 */
static fixpt_t
decay_cpu_batch(fixpt_t loadfac, fixpt_t estcpu, unsigned int n)
{

	if ((n << FSHIFT) >= 7 * loadfac) {
		return 0;
	}

	while (estcpu != 0 && n > 1) {
		estcpu = decay_cpu(loadfac, estcpu);
		n--;
	}

	return estcpu;
}
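
/*
 * Illustrative only: the early return above is the comment's "seven
 * times the loadfactor" short cut. With a load average of 1.0,
 * loadfac == 2 * FSCALE, so 14 or more seconds of sleep decays even a
 * maximal estimate to below (1 << ESTCPU_SHIFT), and the batch decay
 * can simply return 0 instead of iterating.
 */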

/* decay 95% of `p_pctcpu' in 60 seconds; see CCPU_SHIFT before changing */
fixpt_t	ccpu = 0.95122942450071400909 * FSCALE;		/* exp(-1/20) */

/*
 * If `ccpu' is not equal to `exp(-1/20)' and you still want to use the
 * faster/more-accurate formula, you'll have to estimate CCPU_SHIFT below
 * and possibly adjust FSHIFT in "param.h" so that (FSHIFT >= CCPU_SHIFT).
 *
 * To estimate CCPU_SHIFT for exp(-1/20), the following formula was used:
 *	1 - exp(-1/20) ~= 0.0487 ~= 0.0488 == 1 (fixed pt, *11* bits).
 *
 * If you don't want to bother with the faster/more-accurate formula, you
 * can set CCPU_SHIFT to (FSHIFT + 1) which will use a slower/less-accurate
 * (more general) method of calculating the %age of CPU used by a process.
 */
#define CCPU_SHIFT	11
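
/*
 * Worked example (illustrative only): schedcpu() multiplies p_pctcpu
 * by ccpu / FSCALE == exp(-1/20) once per second, so after 60 seconds
 * of inactivity p_pctcpu has been scaled by exp(-60/20) == exp(-3)
 * ~= 0.05, i.e. about 95% of the old value has been forgotten, as the
 * comment above `ccpu' states.
 */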

/*
 * schedcpu:
 *
 *	Recompute process priorities every hz ticks.
 *
 *	XXXSMP This needs to be reorganised in order to reduce the locking
 *	burden.
 */
/* ARGSUSED */
void
schedcpu(void *arg)
{
	fixpt_t loadfac = loadfactor(averunnable.ldavg[0]);
	struct rlimit *rlim;
	struct lwp *l;
	struct proc *p;
	int minslp, clkhz, sig;
	long runtm;

	schedcpu_ticks++;

	mutex_enter(&proclist_mutex);
	PROCLIST_FOREACH(p, &allproc) {
		/*
		 * Increment time in/out of memory and sleep time (if
		 * sleeping). We ignore overflow; with 16-bit int's
		 * (remember them?) overflow takes 45 days.
		 */
		minslp = 2;
		mutex_enter(&p->p_smutex);
		runtm = p->p_rtime.tv_sec;
		LIST_FOREACH(l, &p->p_lwps, l_sibling) {
			lwp_lock(l);
			runtm += l->l_rtime.tv_sec;
			l->l_swtime++;
			if (l->l_stat == LSSLEEP || l->l_stat == LSSTOP ||
			    l->l_stat == LSSUSPENDED) {
				l->l_slptime++;
				minslp = min(minslp, l->l_slptime);
			} else
				minslp = 0;
			lwp_unlock(l);
		}
		p->p_pctcpu = (p->p_pctcpu * ccpu) >> FSHIFT;

		/*
		 * Check if the process exceeds its CPU resource allocation.
		 * If over max, kill it.
		 */
		rlim = &p->p_rlimit[RLIMIT_CPU];
		sig = 0;
		if (runtm >= rlim->rlim_cur) {
			if (runtm >= rlim->rlim_max)
				sig = SIGKILL;
			else {
				sig = SIGXCPU;
				if (rlim->rlim_cur < rlim->rlim_max)
					rlim->rlim_cur += 5;
			}
		}

		/*
		 * If the process has run for more than autonicetime, reduce
		 * priority to give others a chance.
		 */
		if (autonicetime && runtm > autonicetime && p->p_nice == NZERO
		    && kauth_cred_geteuid(p->p_cred)) {
			mutex_spin_enter(&p->p_stmutex);
			p->p_nice = autoniceval + NZERO;
			resetprocpriority(p);
			mutex_spin_exit(&p->p_stmutex);
		}

		/*
		 * If the process has slept the entire second,
		 * stop recalculating its priority until it wakes up.
		 */
		if (minslp <= 1) {
			/*
			 * p_pctcpu is only for ps.
			 */
			mutex_spin_enter(&p->p_stmutex);
			clkhz = stathz != 0 ? stathz : hz;
#if (FSHIFT >= CCPU_SHIFT)
			p->p_pctcpu += (clkhz == 100)?
			    ((fixpt_t) p->p_cpticks) << (FSHIFT - CCPU_SHIFT):
			    100 * (((fixpt_t) p->p_cpticks)
			    << (FSHIFT - CCPU_SHIFT)) / clkhz;
#else
			p->p_pctcpu += ((FSCALE - ccpu) *
			    (p->p_cpticks * FSCALE / clkhz)) >> FSHIFT;
#endif
			p->p_cpticks = 0;
			p->p_estcpu = decay_cpu(loadfac, p->p_estcpu);

			LIST_FOREACH(l, &p->p_lwps, l_sibling) {
				lwp_lock(l);
				if (l->l_slptime <= 1 &&
				    l->l_priority >= PUSER)
					resetpriority(l);
				lwp_unlock(l);
			}
			mutex_spin_exit(&p->p_stmutex);
		}

		mutex_exit(&p->p_smutex);
		if (sig) {
			psignal(p, sig);
		}
	}
	mutex_exit(&proclist_mutex);
	uvm_meter();
	cv_broadcast(&lbolt);
	callout_schedule(&schedcpu_ch, hz);
}

/*
 * Recalculate the priority of a process after it has slept for a while.
 */
void
updatepri(struct lwp *l)
{
	struct proc *p = l->l_proc;
	fixpt_t loadfac;

	KASSERT(lwp_locked(l, NULL));
	KASSERT(l->l_slptime > 1);

	loadfac = loadfactor(averunnable.ldavg[0]);

	l->l_slptime--; /* the first time was done in schedcpu */
	/* XXX NJWLWP */
	/* XXXSMP occasionally unlocked, should be per-LWP */
	p->p_estcpu = decay_cpu_batch(loadfac, p->p_estcpu, l->l_slptime);
	resetpriority(l);
}

/*
 * During autoconfiguration or after a panic, a sleep will simply lower the
 * priority briefly to allow interrupts, then return. The priority to be
 * used (safepri) is machine-dependent, thus this value is initialized and
 * maintained in the machine-dependent layers. This priority will typically
 * be 0, or the lowest priority that is safe for use on the interrupt stack;
 * it can be made higher to block network software interrupts after panics.
 */
int	safepri;

/*
 * OBSOLETE INTERFACE
 *
 * General sleep call. Suspends the current process until a wakeup is
 * performed on the specified identifier. The process will then be made
 * runnable with the specified priority. Sleeps at most timo/hz seconds (0
 * means no timeout). If pri includes PCATCH flag, signals are checked
 * before and after sleeping, else signals are not checked. Returns 0 if
 * awakened, EWOULDBLOCK if the timeout expires. If PCATCH is set and a
 * signal needs to be delivered, ERESTART is returned if the current system
 * call should be restarted if possible, and EINTR is returned if the
 * system call should be interrupted by the signal.
 *
 * The interlock is held until we are on a sleep queue. The interlock will
 * be locked before returning back to the caller unless the PNORELOCK flag
 * is specified, in which case the interlock will always be unlocked upon
 * return.
 */
int
ltsleep(wchan_t ident, pri_t priority, const char *wmesg, int timo,
	volatile struct simplelock *interlock)
{
	struct lwp *l = curlwp;
	sleepq_t *sq;
	int error;

	if (sleepq_dontsleep(l)) {
		(void)sleepq_abort(NULL, 0);
		if ((priority & PNORELOCK) != 0)
			simple_unlock(interlock);
		return 0;
	}

	sq = sleeptab_lookup(&sleeptab, ident);
	sleepq_enter(sq, l);
	sleepq_enqueue(sq, priority & PRIMASK, ident, wmesg, &sleep_syncobj);
	if (interlock != NULL)
		simple_unlock(interlock);
	error = sleepq_block(timo, priority & PCATCH);

	if (interlock != NULL && (priority & PNORELOCK) == 0)
		simple_lock(interlock);

	return error;
}

int
mtsleep(wchan_t ident, pri_t priority, const char *wmesg, int timo,
	kmutex_t *mtx)
{
	struct lwp *l = curlwp;
	sleepq_t *sq;
	int error;

	if (sleepq_dontsleep(l)) {
		(void)sleepq_abort(mtx, (priority & PNORELOCK) != 0);
		return 0;
	}

	sq = sleeptab_lookup(&sleeptab, ident);
	sleepq_enter(sq, l);
	sleepq_enqueue(sq, priority & PRIMASK, ident, wmesg, &sleep_syncobj);
	mutex_exit(mtx);
	error = sleepq_block(timo, priority & PCATCH);

	if ((priority & PNORELOCK) == 0)
		mutex_enter(mtx);

	return error;
}
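
/*
 * Illustrative only: a typical wait loop built on mtsleep()/wakeup()
 * might look like the following sketch (the "sc" softc and its fields
 * are hypothetical, not part of this file):
 *
 *	mutex_enter(&sc->sc_lock);
 *	while (!sc->sc_done) {
 *		error = mtsleep(&sc->sc_done, PWAIT | PCATCH, "example",
 *		    hz, &sc->sc_lock);
 *		if (error != 0 && error != EWOULDBLOCK)
 *			break;
 *	}
 *	mutex_exit(&sc->sc_lock);
 *
 * The waker sets sc_done under sc_lock and calls wakeup(&sc->sc_done).
 * mtsleep() drops the mutex while asleep and, unless PNORELOCK is
 * given, reacquires it before returning.
 */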

/*
 * General sleep call for situations where a wake-up is not expected.
 */
int
kpause(const char *wmesg, bool intr, int timo, kmutex_t *mtx)
{
	struct lwp *l = curlwp;
	sleepq_t *sq;
	int error;

	if (sleepq_dontsleep(l))
		return sleepq_abort(NULL, 0);

	if (mtx != NULL)
		mutex_exit(mtx);
	sq = sleeptab_lookup(&sleeptab, l);
	sleepq_enter(sq, l);
	sleepq_enqueue(sq, sched_kpri(l), l, wmesg, &sleep_syncobj);
	error = sleepq_block(timo, intr);
	if (mtx != NULL)
		mutex_enter(mtx);

	return error;
}
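
/*
 * Illustrative only: a caller that just wants to pause might use
 * kpause("pause", false, hz / 10, NULL) to sleep for roughly 100ms
 * without expecting a wakeup; if a mutex is passed, kpause() drops it
 * for the duration of the sleep and reacquires it before returning.
 */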

/*
 * OBSOLETE INTERFACE
 *
 * Make all processes sleeping on the specified identifier runnable.
 */
void
wakeup(wchan_t ident)
{
	sleepq_t *sq;

	if (cold)
		return;

	sq = sleeptab_lookup(&sleeptab, ident);
	sleepq_wake(sq, ident, (u_int)-1);
}

/*
 * OBSOLETE INTERFACE
 *
 * Make the highest priority process first in line on the specified
 * identifier runnable.
 */
void
wakeup_one(wchan_t ident)
{
	sleepq_t *sq;

	if (cold)
		return;

	sq = sleeptab_lookup(&sleeptab, ident);
	sleepq_wake(sq, ident, 1);
}


/*
 * General yield call. Puts the current process back on its run queue and
 * performs a voluntary context switch. Should only be called when the
 * current process explicitly requests it (eg sched_yield(2) in compat code).
 */
void
yield(void)
{
	struct lwp *l = curlwp;

	KERNEL_UNLOCK_ALL(l, &l->l_biglocks);
	lwp_lock(l);
	if (l->l_stat == LSONPROC) {
		KASSERT(lwp_locked(l, &sched_mutex));
		l->l_priority = l->l_usrpri;
	}
	l->l_ncsw++;
	mi_switch(l, NULL);
	KERNEL_LOCK(l->l_biglocks, l);
}

/*
 * General preemption call. Puts the current process back on its run queue
 * and performs an involuntary context switch.
 */
void
preempt(void)
{
	struct lwp *l = curlwp;

	KERNEL_UNLOCK_ALL(l, &l->l_biglocks);
	lwp_lock(l);
	if (l->l_stat == LSONPROC) {
		KASSERT(lwp_locked(l, &sched_mutex));
		l->l_priority = l->l_usrpri;
	}
	l->l_nivcsw++;
	(void)mi_switch(l, NULL);
	KERNEL_LOCK(l->l_biglocks, l);
}

/*
 * The machine independent parts of context switch. Switch to "new"
 * if non-NULL, otherwise let cpu_switch choose the next lwp.
 *
 * Returns 1 if another process was actually run.
 */
int
mi_switch(struct lwp *l, struct lwp *newl)
{
	struct schedstate_percpu *spc;
	struct timeval tv;
	int retval, oldspl;
	long s, u;

	KASSERT(lwp_locked(l, NULL));

#ifdef KSTACK_CHECK_MAGIC
	kstack_check_magic(l);
#endif

	/*
	 * It's safe to read the per-CPU schedstate unlocked here, as all we
	 * are after is the run time and that's guaranteed to have been last
	 * updated by this CPU.
	 */
	KDASSERT(l->l_cpu == curcpu());
	spc = &l->l_cpu->ci_schedstate;

	/*
	 * Compute the amount of time during which the current
	 * process was running.
	 */
	microtime(&tv);
	u = l->l_rtime.tv_usec +
	    (tv.tv_usec - spc->spc_runtime.tv_usec);
	s = l->l_rtime.tv_sec + (tv.tv_sec - spc->spc_runtime.tv_sec);
	if (u < 0) {
		u += 1000000;
		s--;
	} else if (u >= 1000000) {
		u -= 1000000;
		s++;
	}
	l->l_rtime.tv_usec = u;
	l->l_rtime.tv_sec = s;

	/* Count time spent in current system call */
	SYSCALL_TIME_SLEEP(l);

	/*
	 * XXXSMP If we are using h/w performance counters, save context.
	 */
#if PERFCTRS
	if (PMC_ENABLED(l->l_proc)) {
		pmc_save_context(l->l_proc);
	}
#endif

	/*
	 * Acquire the sched_mutex if necessary. It will be released by
	 * cpu_switch once it has decided to idle, or picked another LWP
	 * to run.
	 */
#if defined(MULTIPROCESSOR) || defined(LOCKDEBUG)
	if (l->l_mutex != &sched_mutex) {
		mutex_spin_enter(&sched_mutex);
		lwp_unlock(l);
	}
#endif

	/*
	 * If on the CPU and we have gotten this far, then we must yield.
	 */
	KASSERT(l->l_stat != LSRUN);
	if (l->l_stat == LSONPROC) {
		KASSERT(lwp_locked(l, &sched_mutex));
		l->l_stat = LSRUN;
		setrunqueue(l);
	}
	uvmexp.swtch++;
	l->l_ncsw++;

	/*
	 * Process is about to yield the CPU; clear the appropriate
	 * scheduling flags.
	 */
	spc->spc_flags &= ~SPCF_SWITCHCLEAR;

	LOCKDEBUG_BARRIER(&sched_mutex, 1);

	/*
	 * Switch to the new current LWP. When we run again, we'll
	 * return back here.
	 */
	oldspl = MUTEX_SPIN_OLDSPL(l->l_cpu);

	if (newl == NULL || newl->l_back == NULL)
		retval = cpu_switch(l, NULL);
	else {
		KASSERT(lwp_locked(newl, &sched_mutex));
		remrunqueue(newl);
		cpu_switchto(l, newl);
		retval = 0;
	}

	/*
	 * XXXSMP If we are using h/w performance counters, restore context.
	 */
#if PERFCTRS
	if (PMC_ENABLED(l->l_proc)) {
		pmc_restore_context(l->l_proc);
	}
#endif

	/*
	 * We're running again; record our new start time. We might
	 * be running on a new CPU now, so don't use the cached
	 * schedstate_percpu pointer.
	 */
	SYSCALL_TIME_WAKEUP(l);
	KDASSERT(l->l_cpu == curcpu());
	microtime(&l->l_cpu->ci_schedstate.spc_runtime);
	splx(oldspl);

	return retval;
}

/*
 * Initialize the (doubly-linked) run queues
 * to be empty.
 */
void
rqinit()
{
	int i;

	for (i = 0; i < RUNQUE_NQS; i++)
		sched_qs[i].ph_link = sched_qs[i].ph_rlink =
		    (struct lwp *)&sched_qs[i];

	mutex_init(&sched_mutex, MUTEX_SPIN, IPL_SCHED);
}

static inline void
resched_lwp(struct lwp *l)
{
	struct cpu_info *ci;
	const pri_t pri = lwp_eprio(l);

	/*
	 * XXXSMP
	 * Since l->l_cpu persists across a context switch,
	 * this gives us *very weak* processor affinity, in
	 * that we notify the CPU on which the process last
	 * ran that it should try to switch.
	 *
	 * This does not guarantee that the process will run on
	 * that processor next, because another processor might
	 * grab it the next time it performs a context switch.
	 *
	 * This also does not handle the case where its last
	 * CPU is running a higher-priority process, but every
	 * other CPU is running a lower-priority process. There
	 * are ways to handle this situation, but they're not
	 * currently very pretty, and we also need to weigh the
	 * cost of moving a process from one CPU to another.
	 *
	 * XXXSMP
	 * There is also the issue of locking the other CPU's
	 * sched state, which we currently do not do.
	 */
	ci = (l->l_cpu != NULL) ? l->l_cpu : curcpu();
	if (pri < ci->ci_schedstate.spc_curpriority)
		cpu_need_resched(ci);
}

/*
 * Change process state to be runnable, placing it on the run queue if it is
 * in memory, and awakening the swapper if it isn't in memory.
 *
 * Call with the process and LWP locked. Will return with the LWP unlocked.
 */
void
setrunnable(struct lwp *l)
{
	struct proc *p = l->l_proc;
	sigset_t *ss;

	KASSERT(mutex_owned(&p->p_smutex));
	KASSERT(lwp_locked(l, NULL));

	switch (l->l_stat) {
	case LSSTOP:
		/*
		 * If we're being traced (possibly because someone attached us
		 * while we were stopped), check for a signal from the debugger.
		 */
		if ((p->p_slflag & PSL_TRACED) != 0 && p->p_xstat != 0) {
			if ((sigprop[p->p_xstat] & SA_TOLWP) != 0)
				ss = &l->l_sigpend.sp_set;
			else
				ss = &p->p_sigpend.sp_set;
			sigaddset(ss, p->p_xstat);
			signotify(l);
		}
		p->p_nrlwps++;
		break;
	case LSSUSPENDED:
		l->l_flag &= ~LW_WSUSPEND;
		p->p_nrlwps++;
		break;
	case LSSLEEP:
		KASSERT(l->l_wchan != NULL);
		break;
	default:
		panic("setrunnable: lwp %p state was %d", l, l->l_stat);
	}

	/*
	 * If the LWP was sleeping interruptibly, then it's OK to start it
	 * again. If not, mark it as still sleeping.
	 */
	if (l->l_wchan != NULL) {
		l->l_stat = LSSLEEP;
		/* lwp_unsleep() will release the lock. */
		lwp_unsleep(l);
		return;
	}

	KASSERT(lwp_locked(l, &sched_mutex));

	/*
	 * If the LWP is still on the CPU, mark it as LSONPROC. It may be
	 * about to call mi_switch(), in which case it will yield.
	 *
	 * XXXSMP Will need to change for preemption.
	 */
#ifdef MULTIPROCESSOR
	if (l->l_cpu->ci_curlwp == l) {
#else
	if (l == curlwp) {
#endif
		l->l_stat = LSONPROC;
		l->l_slptime = 0;
		lwp_unlock(l);
		return;
	}

	/*
	 * Set the LWP runnable. If it's swapped out, we need to wake the swapper
	 * to bring it back in. Otherwise, enter it into a run queue.
	 */
	if (l->l_slptime > 1)
		updatepri(l);
	l->l_stat = LSRUN;
	l->l_slptime = 0;

	if (l->l_flag & LW_INMEM) {
		setrunqueue(l);
		resched_lwp(l);
		lwp_unlock(l);
	} else {
		lwp_unlock(l);
		uvm_kick_scheduler();
	}
}

/*
 * Compute the priority of a process when running in user mode.
 * Arrange to reschedule if the resulting priority is better
 * than that of the current process.
 */
void
resetpriority(struct lwp *l)
{
	pri_t newpriority;
	struct proc *p = l->l_proc;

	/* XXXSMP KASSERT(mutex_owned(&p->p_stmutex)); */
	KASSERT(lwp_locked(l, NULL));

	if ((l->l_flag & LW_SYSTEM) != 0)
		return;

	newpriority = PUSER + (p->p_estcpu >> ESTCPU_SHIFT) +
	    NICE_WEIGHT * (p->p_nice - NZERO);
	newpriority = min(newpriority, MAXPRI);
	lwp_changepri(l, newpriority);
}
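
/*
 * Worked example (illustrative only): with ESTCPU_SHIFT == 11 and
 * NICE_WEIGHT == 2, an LWP whose process has p_estcpu == (8 << 11)
 * and p_nice == NZERO + 4 is given
 * newpriority = PUSER + 8 + 2 * 4 == PUSER + 16, clamped to MAXPRI.
 */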

/*
 * Recompute priority for all LWPs in a process.
 */
void
resetprocpriority(struct proc *p)
{
	struct lwp *l;

	KASSERT(mutex_owned(&p->p_stmutex));

	LIST_FOREACH(l, &p->p_lwps, l_sibling) {
		lwp_lock(l);
		resetpriority(l);
		lwp_unlock(l);
	}
}

/*
 * We adjust the priority of the current process. The priority of a process
 * gets worse as it accumulates CPU time. The CPU usage estimator (p_estcpu)
 * is increased here. The formula for computing priorities (in kern_synch.c)
 * will compute a different value each time p_estcpu increases. This can
 * cause a switch, but unless the priority crosses a PPQ boundary the actual
 * queue will not change. The CPU usage estimator ramps up quite quickly
 * when the process is running (linearly), and decays away exponentially, at
 * a rate which is proportionally slower when the system is busy. The basic
 * principle is that the system will 90% forget that the process used a lot
 * of CPU time in 5 * loadav seconds. This causes the system to favor
 * processes which haven't run much recently, and to round-robin among other
 * processes.
 */

void
schedclock(struct lwp *l)
{
	struct proc *p = l->l_proc;

	mutex_spin_enter(&p->p_stmutex);
	p->p_estcpu = ESTCPULIM(p->p_estcpu + (1 << ESTCPU_SHIFT));
	lwp_lock(l);
	resetpriority(l);
	mutex_spin_exit(&p->p_stmutex);
	if ((l->l_flag & LW_SYSTEM) == 0 && l->l_priority >= PUSER)
		l->l_priority = l->l_usrpri;
	lwp_unlock(l);
}

/*
 * suspendsched:
 *
 *	Convert all non-L_SYSTEM LSSLEEP or LSRUN LWPs to LSSUSPENDED.
 */
void
suspendsched(void)
{
#ifdef MULTIPROCESSOR
	CPU_INFO_ITERATOR cii;
	struct cpu_info *ci;
#endif
	struct lwp *l;
	struct proc *p;

	/*
	 * We do this by process in order not to violate the locking rules.
	 */
	mutex_enter(&proclist_mutex);
	PROCLIST_FOREACH(p, &allproc) {
		mutex_enter(&p->p_smutex);

		if ((p->p_flag & PK_SYSTEM) != 0) {
			mutex_exit(&p->p_smutex);
			continue;
		}

		p->p_stat = SSTOP;

		LIST_FOREACH(l, &p->p_lwps, l_sibling) {
			if (l == curlwp)
				continue;

			lwp_lock(l);

			/*
			 * Set LW_WREBOOT so that the LWP will suspend itself
			 * when it tries to return to user mode. We want to
			 * get as many LWPs as possible to the user/kernel
			 * boundary, so that they will
			 * release any locks that they hold.
			 */
			l->l_flag |= (LW_WREBOOT | LW_WSUSPEND);

			if (l->l_stat == LSSLEEP &&
			    (l->l_flag & LW_SINTR) != 0) {
				/* setrunnable() will release the lock. */
				setrunnable(l);
				continue;
			}

			lwp_unlock(l);
		}

		mutex_exit(&p->p_smutex);
	}
	mutex_exit(&proclist_mutex);

	/*
	 * Kick all CPUs to make them preempt any LWPs running in user mode.
	 * They'll trap into the kernel and suspend themselves in userret().
	 */
	sched_lock(0);
#ifdef MULTIPROCESSOR
	for (CPU_INFO_FOREACH(cii, ci))
		cpu_need_resched(ci);
#else
	cpu_need_resched(curcpu());
#endif
	sched_unlock(0);
}

/*
 * scheduler_fork_hook:
 *
 *	Inherit the parent's scheduler history.
 */
void
scheduler_fork_hook(struct proc *parent, struct proc *child)
{

	KASSERT(mutex_owned(&parent->p_smutex));

	child->p_estcpu = child->p_estcpu_inherited = parent->p_estcpu;
	child->p_forktime = schedcpu_ticks;
}

/*
 * scheduler_wait_hook:
 *
 *	Chargeback parents for the sins of their children.
 */
void
scheduler_wait_hook(struct proc *parent, struct proc *child)
{
	fixpt_t loadfac = loadfactor(averunnable.ldavg[0]);
	fixpt_t estcpu;

	/* XXX Only if parent != init?? */

	mutex_spin_enter(&parent->p_stmutex);
	estcpu = decay_cpu_batch(loadfac, child->p_estcpu_inherited,
	    schedcpu_ticks - child->p_forktime);
	if (child->p_estcpu > estcpu)
		parent->p_estcpu =
		    ESTCPULIM(parent->p_estcpu + child->p_estcpu - estcpu);
	mutex_spin_exit(&parent->p_stmutex);
}

/*
 * sched_kpri:
 *
 *	Scale a priority level to a kernel priority level, usually
 *	for an LWP that is about to sleep.
 */
pri_t
sched_kpri(struct lwp *l)
{
	/*
	 * Scale user priorities (127 -> 50) up to kernel priorities
	 * in the range (49 -> 8). Reserve the top 8 kernel priorities
	 * for high priority kthreads. Kernel priorities passed in
	 * are left "as is". XXX This is somewhat arbitrary.
	 */
	static const uint8_t kpri_tab[] = {
		0, 1, 2, 3, 4, 5, 6, 7,
		8, 9, 10, 11, 12, 13, 14, 15,
		16, 17, 18, 19, 20, 21, 22, 23,
		24, 25, 26, 27, 28, 29, 30, 31,
		32, 33, 34, 35, 36, 37, 38, 39,
		40, 41, 42, 43, 44, 45, 46, 47,
		48, 49, 8, 8, 9, 9, 10, 10,
		11, 11, 12, 12, 13, 14, 14, 15,
		15, 16, 16, 17, 17, 18, 18, 19,
		20, 20, 21, 21, 22, 22, 23, 23,
		24, 24, 25, 26, 26, 27, 27, 28,
		28, 29, 29, 30, 30, 31, 32, 32,
		33, 33, 34, 34, 35, 35, 36, 36,
		37, 38, 38, 39, 39, 40, 40, 41,
		41, 42, 42, 43, 44, 44, 45, 45,
		46, 46, 47, 47, 48, 48, 49, 49,
	};

	return (pri_t)kpri_tab[l->l_usrpri];
}
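
/*
 * Illustrative only: per the table above, priorities 0..49 map to
 * themselves, while user priorities are compressed into the kernel
 * range, e.g. kpri_tab[50] == 8 and kpri_tab[127] == 49.
 */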

/*
 * sched_unsleep:
 *
 *	This is called when the LWP has not been awoken normally but instead
 *	interrupted: for example, if the sleep timed out. Because of this,
 *	it's not a valid action for running or idle LWPs.
 */
void
sched_unsleep(struct lwp *l)
{

	lwp_unlock(l);
	panic("sched_unsleep");
}

/*
 * sched_changepri:
 *
 *	Adjust the priority of an LWP.
 */
void
sched_changepri(struct lwp *l, pri_t pri)
{

	KASSERT(lwp_locked(l, &sched_mutex));

	l->l_usrpri = pri;
	if (l->l_priority < PUSER)
		return;

	if (l->l_stat != LSRUN || (l->l_flag & LW_INMEM) == 0) {
		l->l_priority = pri;
		return;
	}

	remrunqueue(l);
	l->l_priority = pri;
	setrunqueue(l);
	resched_lwp(l);
}

void
sched_lendpri(struct lwp *l, pri_t pri)
{

	KASSERT(lwp_locked(l, &sched_mutex));

	if (l->l_stat != LSRUN || (l->l_flag & LW_INMEM) == 0) {
		l->l_inheritedprio = pri;
		return;
	}

	remrunqueue(l);
	l->l_inheritedprio = pri;
	setrunqueue(l);
	resched_lwp(l);
}

struct lwp *
syncobj_noowner(wchan_t wchan)
{

	return NULL;
}

/*
 * Low-level routines to access the run queue. Optimised assembler
 * routines can override these.
 */

#ifndef __HAVE_MD_RUNQUEUE

/*
 * On some architectures, it's faster to use an MSB ordering for the priorities
 * than the traditional LSB ordering.
 */
#ifdef __HAVE_BIGENDIAN_BITOPS
#define RQMASK(n)	(0x80000000 >> (n))
#else
#define RQMASK(n)	(0x00000001 << (n))
#endif

/*
 * The primitives that manipulate the run queues. whichqs tells which
 * of the 32 queues qs have processes in them. Setrunqueue puts processes
 * into queues, remrunqueue removes them from queues. The running process is
 * on no queue; other processes are on a queue related to p->p_priority,
 * divided by 4 (PPQ) to shrink the 0-127 range of priorities into the 32
 * available queues.
 */
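
/*
 * Illustrative only: with 32 run queues, PPQ == 128 / 32 == 4, so an
 * LWP with an effective priority of 50 is placed on sched_qs[50 / 4]
 * == sched_qs[12], and RQMASK(12) is set in sched_whichqs to mark
 * that queue non-empty.
 */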
#ifdef RQDEBUG
static void
checkrunqueue(int whichq, struct lwp *l)
{
	const struct prochd * const rq = &sched_qs[whichq];
	struct lwp *l2;
	int found = 0;
	int die = 0;
	int empty = 1;
	for (l2 = rq->ph_link; l2 != (const void*) rq; l2 = l2->l_forw) {
		if (l2->l_stat != LSRUN) {
			printf("checkrunqueue[%d]: lwp %p state (%d) "
			    " != LSRUN\n", whichq, l2, l2->l_stat);
		}
		if (l2->l_back->l_forw != l2) {
			printf("checkrunqueue[%d]: lwp %p back-qptr (%p) "
			    "corrupt %p\n", whichq, l2, l2->l_back,
			    l2->l_back->l_forw);
			die = 1;
		}
		if (l2->l_forw->l_back != l2) {
			printf("checkrunqueue[%d]: lwp %p forw-qptr (%p) "
			    "corrupt %p\n", whichq, l2, l2->l_forw,
			    l2->l_forw->l_back);
			die = 1;
		}
		if (l2 == l)
			found = 1;
		empty = 0;
	}
	if (empty && (sched_whichqs & RQMASK(whichq)) != 0) {
		printf("checkrunqueue[%d]: bit set for empty run-queue %p\n",
		    whichq, rq);
		die = 1;
	} else if (!empty && (sched_whichqs & RQMASK(whichq)) == 0) {
		printf("checkrunqueue[%d]: bit clear for non-empty "
		    "run-queue %p\n", whichq, rq);
		die = 1;
	}
	if (l != NULL && (sched_whichqs & RQMASK(whichq)) == 0) {
		printf("checkrunqueue[%d]: bit clear for active lwp %p\n",
		    whichq, l);
		die = 1;
	}
	if (l != NULL && empty) {
		printf("checkrunqueue[%d]: empty run-queue %p with "
		    "active lwp %p\n", whichq, rq, l);
		die = 1;
	}
	if (l != NULL && !found) {
1245 printf("checkrunqueue[%d]: lwp %p not in runqueue %p!",
		    whichq, l, rq);
		die = 1;
	}
	if (die)
		panic("checkrunqueue: inconsistency found");
}
#endif /* RQDEBUG */

void
setrunqueue(struct lwp *l)
{
	struct prochd *rq;
	struct lwp *prev;
	const int whichq = lwp_eprio(l) / PPQ;

	KASSERT(lwp_locked(l, &sched_mutex));

#ifdef RQDEBUG
	checkrunqueue(whichq, NULL);
#endif
#ifdef DIAGNOSTIC
	if (l->l_back != NULL || l->l_stat != LSRUN)
		panic("setrunqueue");
#endif
	sched_whichqs |= RQMASK(whichq);
	rq = &sched_qs[whichq];
	prev = rq->ph_rlink;
	l->l_forw = (struct lwp *)rq;
	rq->ph_rlink = l;
	prev->l_forw = l;
	l->l_back = prev;
#ifdef RQDEBUG
	checkrunqueue(whichq, l);
#endif
}

/*
 * XXXSMP When LWP dispatch (cpu_switch()) is changed to use remrunqueue(),
 * drop of the effective priority level from kernel to user needs to be
 * moved here from userret(). The assignment in userret() is currently
 * done unlocked.
 */
void
remrunqueue(struct lwp *l)
{
	struct lwp *prev, *next;
	const int whichq = lwp_eprio(l) / PPQ;

	KASSERT(lwp_locked(l, &sched_mutex));

#ifdef RQDEBUG
	checkrunqueue(whichq, l);
#endif

#if defined(DIAGNOSTIC)
	if (((sched_whichqs & RQMASK(whichq)) == 0) || l->l_back == NULL) {
		/* Shouldn't happen - interrupts disabled. */
		panic("remrunqueue: bit %d not set", whichq);
	}
#endif
	prev = l->l_back;
	l->l_back = NULL;
	next = l->l_forw;
	prev->l_forw = next;
	next->l_back = prev;
	if (prev == next)
		sched_whichqs &= ~RQMASK(whichq);
#ifdef RQDEBUG
	checkrunqueue(whichq, NULL);
#endif
}

#undef RQMASK
#endif /* !defined(__HAVE_MD_RUNQUEUE) */
