/*	$NetBSD: kern_synch.c,v 1.200 2007/10/09 19:00:14 rmind Exp $	*/

/*-
 * Copyright (c) 1999, 2000, 2004, 2006, 2007 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
 * NASA Ames Research Center, by Charles M. Hannum, Andrew Doran and
 * Daniel Sieger.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *        This product includes software developed by the NetBSD
 *        Foundation, Inc. and its contributors.
 * 4. Neither the name of The NetBSD Foundation nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*-
 * Copyright (c) 1982, 1986, 1990, 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)kern_synch.c	8.9 (Berkeley) 5/19/95
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: kern_synch.c,v 1.200 2007/10/09 19:00:14 rmind Exp $");

#include "opt_kstack.h"
#include "opt_lockdebug.h"
#include "opt_multiprocessor.h"
#include "opt_perfctrs.h"

#define	__MUTEX_PRIVATE

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/kernel.h>
#if defined(PERFCTRS)
#include <sys/pmc.h>
#endif
#include <sys/cpu.h>
#include <sys/resourcevar.h>
#include <sys/sched.h>
#include <sys/syscall_stats.h>
#include <sys/sleepq.h>
#include <sys/lockdebug.h>
#include <sys/evcnt.h>
#include <sys/intr.h>

#include <uvm/uvm_extern.h>

callout_t sched_pstats_ch;
unsigned int sched_pstats_ticks;

kcondvar_t lbolt;			/* once a second sleep address */

static void sched_unsleep(struct lwp *);
static void sched_changepri(struct lwp *, pri_t);
static void sched_lendpri(struct lwp *, pri_t);

syncobj_t sleep_syncobj = {
	SOBJ_SLEEPQ_SORTED,
	sleepq_unsleep,
	sleepq_changepri,
	sleepq_lendpri,
	syncobj_noowner,
};

syncobj_t sched_syncobj = {
	SOBJ_SLEEPQ_SORTED,
	sched_unsleep,
	sched_changepri,
	sched_lendpri,
	syncobj_noowner,
};

/*
 * During autoconfiguration or after a panic, a sleep will simply lower the
 * priority briefly to allow interrupts, then return.  The priority to be
 * used (safepri) is machine-dependent, thus this value is initialized and
 * maintained in the machine-dependent layers.  This priority will typically
 * be 0, or the lowest priority that is safe for use on the interrupt stack;
 * it can be made higher to block network software interrupts after panics.
 */
int safepri;

/*
 * OBSOLETE INTERFACE
 *
 * General sleep call.  Suspends the current process until a wakeup is
 * performed on the specified identifier.  The process will then be made
 * runnable with the specified priority.  Sleeps at most timo/hz seconds (0
 * means no timeout).  If pri includes PCATCH flag, signals are checked
 * before and after sleeping, else signals are not checked.  Returns 0 if
 * awakened, EWOULDBLOCK if the timeout expires.  If PCATCH is set and a
 * signal needs to be delivered, ERESTART is returned if the current system
 * call should be restarted if possible, and EINTR is returned if the system
 * call should be interrupted by the signal (return EINTR).
 *
 * The interlock is held until we are on a sleep queue.  The interlock will
 * be locked before returning back to the caller unless the PNORELOCK flag
 * is specified, in which case the interlock will always be unlocked upon
 * return.
 */
int
ltsleep(wchan_t ident, pri_t priority, const char *wmesg, int timo,
	volatile struct simplelock *interlock)
{
	struct lwp *l = curlwp;
	sleepq_t *sq;
	int error;

	if (sleepq_dontsleep(l)) {
		(void)sleepq_abort(NULL, 0);
		if ((priority & PNORELOCK) != 0)
			simple_unlock(interlock);
		return 0;
	}

	sq = sleeptab_lookup(&sleeptab, ident);
	sleepq_enter(sq, l);
	sleepq_enqueue(sq, priority & PRIMASK, ident, wmesg, &sleep_syncobj);

	if (interlock != NULL) {
		LOCK_ASSERT(simple_lock_held(interlock));
		simple_unlock(interlock);
	}

	error = sleepq_block(timo, priority & PCATCH);

	if (interlock != NULL && (priority & PNORELOCK) == 0)
		simple_lock(interlock);

	return error;
}

int
mtsleep(wchan_t ident, pri_t priority, const char *wmesg, int timo,
	kmutex_t *mtx)
{
	struct lwp *l = curlwp;
	sleepq_t *sq;
	int error;

	if (sleepq_dontsleep(l)) {
		(void)sleepq_abort(mtx, (priority & PNORELOCK) != 0);
		return 0;
	}

	sq = sleeptab_lookup(&sleeptab, ident);
	sleepq_enter(sq, l);
	sleepq_enqueue(sq, priority & PRIMASK, ident, wmesg, &sleep_syncobj);
	mutex_exit(mtx);
	error = sleepq_block(timo, priority & PCATCH);

	if ((priority & PNORELOCK) == 0)
		mutex_enter(mtx);

	return error;
}
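
/*
 * Illustrative sketch (not part of this file and not compiled): a typical
 * sleep/wakeup pairing built on mtsleep() and wakeup().  The structure
 * "example_softc", its fields and the "exwait" wmesg are hypothetical,
 * and newer code would normally use condition variables instead of this
 * tsleep-style interface.
 */
#if 0
struct example_softc {
	kmutex_t	sc_lock;
	bool		sc_ready;
};

static int
example_wait(struct example_softc *sc)
{
	int error = 0;

	mutex_enter(&sc->sc_lock);
	while (!sc->sc_ready && error == 0) {
		/*
		 * sc_lock is dropped while asleep and, since PNORELOCK
		 * is not set, reacquired before mtsleep() returns.
		 * PCATCH lets signals interrupt the sleep; the timeout
		 * of hz ticks bounds the wait to roughly one second,
		 * returning EWOULDBLOCK if it fires.
		 */
		error = mtsleep(&sc->sc_ready, PRIBIO | PCATCH, "exwait",
		    hz, &sc->sc_lock);
	}
	mutex_exit(&sc->sc_lock);
	return error;
}

static void
example_notify(struct example_softc *sc)
{
	mutex_enter(&sc->sc_lock);
	sc->sc_ready = true;
	/* Make every LWP sleeping on &sc->sc_ready runnable. */
	wakeup(&sc->sc_ready);
	mutex_exit(&sc->sc_lock);
}
#endif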

/*
 * General sleep call for situations where a wake-up is not expected.
 */
int
kpause(const char *wmesg, bool intr, int timo, kmutex_t *mtx)
{
	struct lwp *l = curlwp;
	sleepq_t *sq;
	int error;

	if (sleepq_dontsleep(l))
		return sleepq_abort(NULL, 0);

	if (mtx != NULL)
		mutex_exit(mtx);
	sq = sleeptab_lookup(&sleeptab, l);
	sleepq_enter(sq, l);
	sleepq_enqueue(sq, sched_kpri(l), l, wmesg, &sleep_syncobj);
	error = sleepq_block(timo, intr);
	if (mtx != NULL)
		mutex_enter(mtx);

	return error;
}
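
/*
 * Illustrative sketch (not part of this file and not compiled): polling a
 * hypothetical device with kpause(), sleeping roughly 10ms per attempt
 * without expecting a wakeup.  example_device_ready() and the "expoll"
 * wmesg exist only for the example.
 */
#if 0
static int
example_poll_ready(void)
{
	int tries, error;

	for (tries = 0; tries < 100; tries++) {
		if (example_device_ready())
			return 0;
		/*
		 * Sleep interruptibly with no mutex held; since no wakeup
		 * is expected, the usual return value is EWOULDBLOCK when
		 * the timeout fires.
		 */
		error = kpause("expoll", true, max(1, hz / 100), NULL);
		if (error != 0 && error != EWOULDBLOCK)
			return error;	/* interrupted by a signal */
	}
	return ETIMEDOUT;
}
#endif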

/*
 * OBSOLETE INTERFACE
 *
 * Make all processes sleeping on the specified identifier runnable.
 */
void
wakeup(wchan_t ident)
{
	sleepq_t *sq;

	if (cold)
		return;

	sq = sleeptab_lookup(&sleeptab, ident);
	sleepq_wake(sq, ident, (u_int)-1);
}

/*
 * OBSOLETE INTERFACE
 *
 * Make the highest priority process first in line on the specified
 * identifier runnable.
 */
void
wakeup_one(wchan_t ident)
{
	sleepq_t *sq;

	if (cold)
		return;

	sq = sleeptab_lookup(&sleeptab, ident);
	sleepq_wake(sq, ident, 1);
}


/*
 * General yield call.  Puts the current process back on its run queue and
 * performs a voluntary context switch.  Should only be called when the
 * current process explicitly requests it (e.g. sched_yield(2)).
 */
void
yield(void)
{
	struct lwp *l = curlwp;

	KERNEL_UNLOCK_ALL(l, &l->l_biglocks);
	lwp_lock(l);
	KASSERT(lwp_locked(l, &l->l_cpu->ci_schedstate.spc_lwplock));
	KASSERT(l->l_stat == LSONPROC);
	/* XXX Only do this for timeshared threads. */
	l->l_priority = MAXPRI;
	(void)mi_switch(l);
	KERNEL_LOCK(l->l_biglocks, l);
}

/*
 * General preemption call.  Puts the current process back on its run queue
 * and performs an involuntary context switch.
 */
void
preempt(void)
{
	struct lwp *l = curlwp;

	KERNEL_UNLOCK_ALL(l, &l->l_biglocks);
	lwp_lock(l);
	KASSERT(lwp_locked(l, &l->l_cpu->ci_schedstate.spc_lwplock));
	KASSERT(l->l_stat == LSONPROC);
	l->l_priority = l->l_usrpri;
	l->l_nivcsw++;
	(void)mi_switch(l);
	KERNEL_LOCK(l->l_biglocks, l);
}

/*
 * Compute the amount of time during which the current lwp was running.
 *
 * - update l_rtime unless it's an idle lwp.
 */

void
updatertime(lwp_t *l, const struct timeval *tv)
{
	long s, u;

	if ((l->l_flag & LW_IDLE) != 0)
		return;

	u = l->l_rtime.tv_usec + (tv->tv_usec - l->l_stime.tv_usec);
	s = l->l_rtime.tv_sec + (tv->tv_sec - l->l_stime.tv_sec);
	if (u < 0) {
		u += 1000000;
		s--;
	} else if (u >= 1000000) {
		u -= 1000000;
		s++;
	}
	l->l_rtime.tv_usec = u;
	l->l_rtime.tv_sec = s;
}

/*
 * The machine independent parts of context switch.
 *
 * Returns 1 if another LWP was actually run.
 */
int
mi_switch(lwp_t *l)
{
	struct schedstate_percpu *spc;
	struct lwp *newl;
	int retval, oldspl;
	struct cpu_info *ci;
	struct timeval tv;
	bool returning;

	KASSERT(lwp_locked(l, NULL));
	LOCKDEBUG_BARRIER(l->l_mutex, 1);

#ifdef KSTACK_CHECK_MAGIC
	kstack_check_magic(l);
#endif

	microtime(&tv);

	/*
	 * It's safe to read the per-CPU schedstate unlocked here, as all we
	 * are after is the run time and that's guaranteed to have been last
	 * updated by this CPU.
	 */
	ci = l->l_cpu;
	KDASSERT(ci == curcpu());

	/*
	 * Process is about to yield the CPU; clear the appropriate
	 * scheduling flags.
	 */
	spc = &ci->ci_schedstate;
	returning = false;
	newl = NULL;

	/*
	 * If we have been asked to switch to a specific LWP, then there
	 * is no need to inspect the run queues.  If a soft interrupt is
	 * blocking, then return to the interrupted thread without adjusting
	 * VM context or its start time: neither have been changed in order
	 * to take the interrupt.
	 */
	if (l->l_switchto != NULL) {
		if ((l->l_flag & LW_INTR) != 0) {
			returning = true;
			softint_block(l);
			if ((l->l_flag & LW_TIMEINTR) != 0)
				updatertime(l, &tv);
		}
		newl = l->l_switchto;
		l->l_switchto = NULL;
	}

	/* Count time spent in current system call */
	if (!returning) {
		SYSCALL_TIME_SLEEP(l);

		/*
		 * XXXSMP If we are using h/w performance counters,
		 * save context.
		 */
#if PERFCTRS
		if (PMC_ENABLED(l->l_proc)) {
			pmc_save_context(l->l_proc);
		}
#endif
		updatertime(l, &tv);
	}

	/*
	 * If on the CPU and we have gotten this far, then we must yield.
	 */
	mutex_spin_enter(spc->spc_mutex);
	KASSERT(l->l_stat != LSRUN);
	if (l->l_stat == LSONPROC) {
		KASSERT(lwp_locked(l, &spc->spc_lwplock));
		if ((l->l_flag & LW_IDLE) == 0) {
			l->l_stat = LSRUN;
			lwp_setlock(l, spc->spc_mutex);
			sched_enqueue(l, true);
		} else
			l->l_stat = LSIDL;
	}

	/*
	 * Let sched_nextlwp() select the LWP to run on the CPU next.
	 * If no LWP is runnable, switch to the idle LWP.
	 */
	if (newl == NULL) {
		newl = sched_nextlwp();
		if (newl != NULL) {
			sched_dequeue(newl);
			KASSERT(lwp_locked(newl, spc->spc_mutex));
			newl->l_stat = LSONPROC;
			newl->l_cpu = ci;
			newl->l_flag |= LW_RUNNING;
			lwp_setlock(newl, &spc->spc_lwplock);
		} else {
			newl = ci->ci_data.cpu_idlelwp;
			newl->l_stat = LSONPROC;
			newl->l_flag |= LW_RUNNING;
		}
		ci->ci_want_resched = 0;
		spc->spc_flags &= ~SPCF_SWITCHCLEAR;
	}

	/* Update the new LWP's start time while it is still locked. */
	if (!returning) {
		newl->l_stime = tv;
		/*
		 * XXX The following may be done unlocked if newl != NULL
		 * above.
		 */
		newl->l_priority = newl->l_usrpri;
	}

	spc->spc_curpriority = newl->l_usrpri;

	if (l != newl) {
		struct lwp *prevlwp;

		/*
		 * If the old LWP has been moved to a run queue above,
		 * drop the general purpose LWP lock: it's now locked
		 * by the scheduler lock.
		 *
		 * Otherwise, drop the scheduler lock.  We're done with
		 * the run queues for now.
		 */
		if (l->l_mutex == spc->spc_mutex) {
			mutex_spin_exit(&spc->spc_lwplock);
		} else {
			mutex_spin_exit(spc->spc_mutex);
		}

		/* Unlocked, but for statistics only. */
		uvmexp.swtch++;

		/*
		 * Save old VM context, unless a soft interrupt
		 * handler is blocking.
		 */
		if (!returning)
			pmap_deactivate(l);

		/* Switch to the new LWP.. */
		l->l_ncsw++;
		l->l_flag &= ~LW_RUNNING;
		oldspl = MUTEX_SPIN_OLDSPL(ci);
		prevlwp = cpu_switchto(l, newl);

		/*
		 * .. we have switched away and are now back so we must
		 * be the new curlwp.  prevlwp is who we replaced.
		 */
		if (prevlwp != NULL) {
			curcpu()->ci_mtx_oldspl = oldspl;
			lwp_unlock(prevlwp);
		} else {
			splx(oldspl);
		}

		/* Restore VM context. */
		pmap_activate(l);
		retval = 1;
	} else {
		/* Nothing to do - just unlock and return. */
		mutex_spin_exit(spc->spc_mutex);
		lwp_unlock(l);
		retval = 0;
	}

	KASSERT(l == curlwp);
	KASSERT(l->l_stat == LSONPROC);
	KASSERT(l->l_cpu == curcpu());

	/*
	 * XXXSMP If we are using h/w performance counters, restore context.
	 */
#if PERFCTRS
	if (PMC_ENABLED(l->l_proc)) {
		pmc_restore_context(l->l_proc);
	}
#endif

	/*
	 * We're running again; record our new start time.  We might
	 * be running on a new CPU now, so don't use the cached
	 * schedstate_percpu pointer.
	 */
	SYSCALL_TIME_WAKEUP(l);
	KASSERT(curlwp == l);
	KDASSERT(l->l_cpu == curcpu());
	LOCKDEBUG_BARRIER(NULL, 1);

	return retval;
}

/*
 * Change process state to be runnable, placing it on the run queue if it is
 * in memory, and awakening the swapper if it isn't in memory.
 *
 * Call with the process and LWP locked.  Will return with the LWP unlocked.
 */
void
setrunnable(struct lwp *l)
{
	struct proc *p = l->l_proc;
	sigset_t *ss;

	KASSERT((l->l_flag & LW_IDLE) == 0);
	KASSERT(mutex_owned(&p->p_smutex));
	KASSERT(lwp_locked(l, NULL));

	switch (l->l_stat) {
	case LSSTOP:
		/*
		 * If we're being traced (possibly because someone attached us
		 * while we were stopped), check for a signal from the debugger.
		 */
		if ((p->p_slflag & PSL_TRACED) != 0 && p->p_xstat != 0) {
			if ((sigprop[p->p_xstat] & SA_TOLWP) != 0)
				ss = &l->l_sigpend.sp_set;
			else
				ss = &p->p_sigpend.sp_set;
			sigaddset(ss, p->p_xstat);
			signotify(l);
		}
		p->p_nrlwps++;
		break;
	case LSSUSPENDED:
		l->l_flag &= ~LW_WSUSPEND;
		p->p_nrlwps++;
		cv_broadcast(&p->p_lwpcv);
		break;
	case LSSLEEP:
		KASSERT(l->l_wchan != NULL);
		break;
	default:
		panic("setrunnable: lwp %p state was %d", l, l->l_stat);
	}

	/*
	 * If the LWP was sleeping interruptibly, then it's OK to start it
	 * again.  If not, mark it as still sleeping.
	 */
	if (l->l_wchan != NULL) {
		l->l_stat = LSSLEEP;
		/* lwp_unsleep() will release the lock. */
		lwp_unsleep(l);
		return;
	}

	/*
	 * If the LWP is still on the CPU, mark it as LSONPROC.  It may be
	 * about to call mi_switch(), in which case it will yield.
	 */
	if ((l->l_flag & LW_RUNNING) != 0) {
		l->l_stat = LSONPROC;
		l->l_slptime = 0;
		lwp_unlock(l);
		return;
	}

	/*
	 * Set the LWP runnable.  If it's swapped out, we need to wake the
	 * swapper to bring it back in.  Otherwise, enter it into a run queue.
	 */
	if (l->l_mutex != l->l_cpu->ci_schedstate.spc_mutex) {
		spc_lock(l->l_cpu);
		lwp_unlock_to(l, l->l_cpu->ci_schedstate.spc_mutex);
	}

	sched_setrunnable(l);
	l->l_stat = LSRUN;
	l->l_slptime = 0;

	if (l->l_flag & LW_INMEM) {
		sched_enqueue(l, false);
		resched_cpu(l);
		lwp_unlock(l);
	} else {
		lwp_unlock(l);
		uvm_kick_scheduler();
	}
}

/*
 * suspendsched:
 *
 *	Convert all non-PK_SYSTEM LSSLEEP or LSRUN LWPs to LSSUSPENDED.
 */
void
suspendsched(void)
{
	CPU_INFO_ITERATOR cii;
	struct cpu_info *ci;
	struct lwp *l;
	struct proc *p;

	/*
	 * We do this by process in order not to violate the locking rules.
	 */
	mutex_enter(&proclist_mutex);
	PROCLIST_FOREACH(p, &allproc) {
		mutex_enter(&p->p_smutex);

		if ((p->p_flag & PK_SYSTEM) != 0) {
			mutex_exit(&p->p_smutex);
			continue;
		}

		p->p_stat = SSTOP;

		LIST_FOREACH(l, &p->p_lwps, l_sibling) {
			if (l == curlwp)
				continue;

			lwp_lock(l);

			/*
			 * Set LW_WREBOOT so that the LWP will suspend itself
			 * when it tries to return to user mode.  We want to
			 * get as many LWPs as possible to the user/kernel
			 * boundary, so that they will release any locks
			 * that they hold.
			 */
			l->l_flag |= (LW_WREBOOT | LW_WSUSPEND);

			if (l->l_stat == LSSLEEP &&
			    (l->l_flag & LW_SINTR) != 0) {
				/* setrunnable() will release the lock. */
				setrunnable(l);
				continue;
			}

			lwp_unlock(l);
		}

		mutex_exit(&p->p_smutex);
	}
	mutex_exit(&proclist_mutex);

	/*
	 * Kick all CPUs to make them preempt any LWPs running in user mode.
	 * They'll trap into the kernel and suspend themselves in userret().
	 */
	for (CPU_INFO_FOREACH(cii, ci))
		cpu_need_resched(ci, 0);
}

/*
 * sched_kpri:
 *
 *	Scale a priority level to a kernel priority level, usually
 *	for an LWP that is about to sleep.
 */
pri_t
sched_kpri(struct lwp *l)
{
	/*
	 * Scale user priorities (127 -> 50) up to kernel priorities
	 * in the range (49 -> 8).  Reserve the top 8 kernel priorities
	 * for high priority kthreads.  Kernel priorities passed in
	 * are left "as is".  XXX This is somewhat arbitrary.
	 */
	static const uint8_t kpri_tab[] = {
		 0,  1,  2,  3,  4,  5,  6,  7,
		 8,  9, 10, 11, 12, 13, 14, 15,
		16, 17, 18, 19, 20, 21, 22, 23,
		24, 25, 26, 27, 28, 29, 30, 31,
		32, 33, 34, 35, 36, 37, 38, 39,
		40, 41, 42, 43, 44, 45, 46, 47,
		48, 49,  8,  8,  9,  9, 10, 10,
		11, 11, 12, 12, 13, 14, 14, 15,
		15, 16, 16, 17, 17, 18, 18, 19,
		20, 20, 21, 21, 22, 22, 23, 23,
		24, 24, 25, 26, 26, 27, 27, 28,
		28, 29, 29, 30, 30, 31, 32, 32,
		33, 33, 34, 34, 35, 35, 36, 36,
		37, 38, 38, 39, 39, 40, 40, 41,
		41, 42, 42, 43, 44, 44, 45, 45,
		46, 46, 47, 47, 48, 48, 49, 49,
	};
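
	/*
	 * For example, kpri_tab[50] is 8, placing the strongest user
	 * priority just after the eight reserved kernel priorities
	 * (0 through 7), and kpri_tab[127] is 49; indices 0 through 49,
	 * which are already kernel priorities, map to themselves.
	 */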

	return (pri_t)kpri_tab[l->l_usrpri];
}

/*
 * sched_unsleep:
 *
 *	This is called when the LWP has not been awoken normally but instead
 *	interrupted: for example, if the sleep timed out.  Because of this,
 *	it's not a valid action for running or idle LWPs.
 */
static void
sched_unsleep(struct lwp *l)
{

	lwp_unlock(l);
	panic("sched_unsleep");
}

inline void
resched_cpu(struct lwp *l)
{
	struct cpu_info *ci;
	const pri_t pri = lwp_eprio(l);

	/*
	 * XXXSMP
	 * Since l->l_cpu persists across a context switch,
	 * this gives us *very weak* processor affinity, in
	 * that we notify the CPU on which the process last
	 * ran that it should try to switch.
	 *
	 * This does not guarantee that the process will run on
	 * that processor next, because another processor might
	 * grab it the next time it performs a context switch.
	 *
	 * This also does not handle the case where its last
	 * CPU is running a higher-priority process, but every
	 * other CPU is running a lower-priority process.  There
	 * are ways to handle this situation, but they're not
	 * currently very pretty, and we also need to weigh the
	 * cost of moving a process from one CPU to another.
	 */
	ci = (l->l_cpu != NULL) ? l->l_cpu : curcpu();
	if (pri < ci->ci_schedstate.spc_curpriority)
		cpu_need_resched(ci, 0);
}

static void
sched_changepri(struct lwp *l, pri_t pri)
{

	KASSERT(lwp_locked(l, NULL));

	l->l_usrpri = pri;
	if (l->l_priority < PUSER)
		return;

	if (l->l_stat != LSRUN || (l->l_flag & LW_INMEM) == 0) {
		l->l_priority = pri;
		return;
	}

	KASSERT(lwp_locked(l, l->l_cpu->ci_schedstate.spc_mutex));

	sched_dequeue(l);
	l->l_priority = pri;
	sched_enqueue(l, false);
	resched_cpu(l);
}

static void
sched_lendpri(struct lwp *l, pri_t pri)
{

	KASSERT(lwp_locked(l, NULL));

	if (l->l_stat != LSRUN || (l->l_flag & LW_INMEM) == 0) {
		l->l_inheritedprio = pri;
		return;
	}

	KASSERT(lwp_locked(l, l->l_cpu->ci_schedstate.spc_mutex));

	sched_dequeue(l);
	l->l_inheritedprio = pri;
	sched_enqueue(l, false);
	resched_cpu(l);
}

struct lwp *
syncobj_noowner(wchan_t wchan)
{

	return NULL;
}


/* decay 95% of `p_pctcpu' in 60 seconds; see CCPU_SHIFT before changing */
fixpt_t	ccpu = 0.95122942450071400909 * FSCALE;		/* exp(-1/20) */

/*
 * If `ccpu' is not equal to `exp(-1/20)' and you still want to use the
 * faster/more-accurate formula, you'll have to estimate CCPU_SHIFT below
 * and possibly adjust FSHIFT in "param.h" so that (FSHIFT >= CCPU_SHIFT).
 *
 * To estimate CCPU_SHIFT for exp(-1/20), the following formula was used:
 *	1 - exp(-1/20) ~= 0.0487 ~= 0.0488 == 1 (fixed pt, *11* bits).
 *
 * If you don't want to bother with the faster/more-accurate formula, you
 * can set CCPU_SHIFT to (FSHIFT + 1), which will use a slower/less-accurate
 * (more general) method of calculating the percentage of CPU used by a
 * process.
 */
#define	CCPU_SHIFT	(FSHIFT + 1)
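
/*
 * Worked example (informational): with FSHIFT at its usual value of 11,
 * FSCALE is 2048 and ccpu is about 1948, i.e. exp(-1/20) in fixed point.
 * sched_pstats() runs once per second, so over 60 seconds an otherwise
 * idle LWP's l_pctcpu is multiplied by exp(-1/20)^60 = exp(-3), roughly
 * 0.05, which is the "decay 95% in 60 seconds" promised above.
 */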

/*
 * sched_pstats:
 *
 *	Update process statistics and check CPU resource allocation.
 *	Call scheduler-specific hook to eventually adjust process/LWP
 *	priorities.
 */
/* ARGSUSED */
void
sched_pstats(void *arg)
{
	struct rlimit *rlim;
	struct lwp *l;
	struct proc *p;
	int minslp, sig, clkhz;
	long runtm;

	sched_pstats_ticks++;

	mutex_enter(&proclist_mutex);
	PROCLIST_FOREACH(p, &allproc) {
		/*
		 * Increment time in/out of memory and sleep time (if
		 * sleeping).  We ignore overflow; with 16-bit int's
		 * (remember them?) overflow takes 45 days.
		 */
		minslp = 2;
		mutex_enter(&p->p_smutex);
		mutex_spin_enter(&p->p_stmutex);
		runtm = p->p_rtime.tv_sec;
		LIST_FOREACH(l, &p->p_lwps, l_sibling) {
			if ((l->l_flag & LW_IDLE) != 0)
				continue;
			lwp_lock(l);
			runtm += l->l_rtime.tv_sec;
			l->l_swtime++;
			if (l->l_stat == LSSLEEP || l->l_stat == LSSTOP ||
			    l->l_stat == LSSUSPENDED) {
				l->l_slptime++;
				minslp = min(minslp, l->l_slptime);
			} else
				minslp = 0;
			sched_pstats_hook(l);
			lwp_unlock(l);

			/*
			 * p_pctcpu is only for ps.
			 */
			l->l_pctcpu = (l->l_pctcpu * ccpu) >> FSHIFT;
			if (l->l_slptime < 1) {
				clkhz = stathz != 0 ? stathz : hz;
#if	(FSHIFT >= CCPU_SHIFT)
				l->l_pctcpu += (clkhz == 100) ?
				    ((fixpt_t)l->l_cpticks) <<
				    (FSHIFT - CCPU_SHIFT) :
				    100 * (((fixpt_t) p->p_cpticks)
				    << (FSHIFT - CCPU_SHIFT)) / clkhz;
#else
				l->l_pctcpu += ((FSCALE - ccpu) *
				    (l->l_cpticks * FSCALE / clkhz)) >> FSHIFT;
#endif
				l->l_cpticks = 0;
			}
		}

		p->p_pctcpu = (p->p_pctcpu * ccpu) >> FSHIFT;
#ifdef SCHED_4BSD
		/*
		 * XXX: Workaround - belongs to sched_4bsd.c
		 * If the process has slept the entire second,
		 * stop recalculating its priority until it wakes up.
		 */
		if (minslp <= 1) {
			extern fixpt_t decay_cpu(fixpt_t, fixpt_t);

			fixpt_t loadfac = 2 * (averunnable.ldavg[0]);
			p->p_estcpu = decay_cpu(loadfac, p->p_estcpu);
		}
#endif
		mutex_spin_exit(&p->p_stmutex);

		/*
		 * Check if the process exceeds its CPU resource allocation.
		 * If over max, kill it.
		 */
		rlim = &p->p_rlimit[RLIMIT_CPU];
		sig = 0;
		if (runtm >= rlim->rlim_cur) {
			if (runtm >= rlim->rlim_max)
				sig = SIGKILL;
			else {
				sig = SIGXCPU;
				if (rlim->rlim_cur < rlim->rlim_max)
					rlim->rlim_cur += 5;
			}
		}
		mutex_exit(&p->p_smutex);
		if (sig) {
			psignal(p, sig);
		}
	}
	mutex_exit(&proclist_mutex);
	uvm_meter();
	cv_wakeup(&lbolt);
	callout_schedule(&sched_pstats_ch, hz);
}

void
sched_init(void)
{

	cv_init(&lbolt, "lbolt");
	callout_init(&sched_pstats_ch, 0);
	callout_setfunc(&sched_pstats_ch, sched_pstats, NULL);
	sched_setup();
	sched_pstats(NULL);
}