/*	$NetBSD: kern_synch.c,v 1.203 2007/11/04 11:43:07 rmind Exp $	*/

/*-
 * Copyright (c) 1999, 2000, 2004, 2006, 2007 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
 * NASA Ames Research Center, by Charles M. Hannum, Andrew Doran and
 * Daniel Sieger.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *        This product includes software developed by the NetBSD
 *        Foundation, Inc. and its contributors.
 * 4. Neither the name of The NetBSD Foundation nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*-
 * Copyright (c) 1982, 1986, 1990, 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)kern_synch.c	8.9 (Berkeley) 5/19/95
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: kern_synch.c,v 1.203 2007/11/04 11:43:07 rmind Exp $");

#include "opt_kstack.h"
#include "opt_lockdebug.h"
#include "opt_multiprocessor.h"
#include "opt_perfctrs.h"
#include "opt_sched.h"

#define	__MUTEX_PRIVATE

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/kernel.h>
#if defined(PERFCTRS)
#include <sys/pmc.h>
#endif
#include <sys/cpu.h>
#include <sys/resourcevar.h>
#include <sys/sched.h>
#include <sys/syscall_stats.h>
#include <sys/sleepq.h>
#include <sys/lockdebug.h>
#include <sys/evcnt.h>
#include <sys/intr.h>

#include <uvm/uvm_extern.h>

callout_t	sched_pstats_ch;
unsigned int	sched_pstats_ticks;

kcondvar_t	lbolt;			/* once a second sleep address */

static void	sched_unsleep(struct lwp *);
static void	sched_changepri(struct lwp *, pri_t);
static void	sched_lendpri(struct lwp *, pri_t);

syncobj_t sleep_syncobj = {
	SOBJ_SLEEPQ_SORTED,
	sleepq_unsleep,
	sleepq_changepri,
	sleepq_lendpri,
	syncobj_noowner,
};

syncobj_t sched_syncobj = {
	SOBJ_SLEEPQ_SORTED,
	sched_unsleep,
	sched_changepri,
	sched_lendpri,
	syncobj_noowner,
};

/*
 * During autoconfiguration or after a panic, a sleep will simply lower the
 * priority briefly to allow interrupts, then return.  The priority to be
 * used (safepri) is machine-dependent, thus this value is initialized and
 * maintained in the machine-dependent layers.  This priority will typically
 * be 0, or the lowest priority that is safe for use on the interrupt stack;
 * it can be made higher to block network software interrupts after panics.
 */
int	safepri;

/*
 * OBSOLETE INTERFACE
 *
 * General sleep call.  Suspends the current process until a wakeup is
 * performed on the specified identifier.  The process will then be made
 * runnable with the specified priority.  Sleeps at most timo/hz seconds (0
 * means no timeout).  If pri includes the PCATCH flag, signals are checked
 * before and after sleeping, otherwise signals are not checked.  Returns 0
 * if awakened, or EWOULDBLOCK if the timeout expires.  If PCATCH is set and
 * a signal needs to be delivered, ERESTART is returned if the current system
 * call should be restarted if possible, and EINTR is returned if the system
 * call should be interrupted by the signal.
 *
 * The interlock is held until we are on a sleep queue.  The interlock will
 * be locked before returning back to the caller unless the PNORELOCK flag
 * is specified, in which case the interlock will always be unlocked upon
 * return.
 */
int
ltsleep(wchan_t ident, pri_t priority, const char *wmesg, int timo,
	volatile struct simplelock *interlock)
{
	struct lwp *l = curlwp;
	sleepq_t *sq;
	int error;

	if (sleepq_dontsleep(l)) {
		(void)sleepq_abort(NULL, 0);
		if ((priority & PNORELOCK) != 0)
			simple_unlock(interlock);
		return 0;
	}

	sq = sleeptab_lookup(&sleeptab, ident);
	sleepq_enter(sq, l);
	sleepq_enqueue(sq, priority & PRIMASK, ident, wmesg, &sleep_syncobj);

	if (interlock != NULL) {
		LOCK_ASSERT(simple_lock_held(interlock));
		simple_unlock(interlock);
	}

	error = sleepq_block(timo, priority & PCATCH);

	if (interlock != NULL && (priority & PNORELOCK) == 0)
		simple_lock(interlock);

	return error;
}
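
/*
 * Illustrative usage sketch (not part of the original file): how a caller
 * that protects a condition with a simplelock might wait on it with
 * ltsleep().  The sc, sc_slock and sc_ready names are assumptions made up
 * for this example only.
 *
 *	simple_lock(&sc->sc_slock);
 *	while (!sc->sc_ready) {
 *		error = ltsleep(&sc->sc_ready, PWAIT | PCATCH, "exwait",
 *		    hz, &sc->sc_slock);
 *		if (error != 0)
 *			break;
 *	}
 *	simple_unlock(&sc->sc_slock);
 *
 * Because PNORELOCK is not passed, the interlock is re-acquired before
 * ltsleep() returns, so the final simple_unlock() is always valid.
 */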

int
mtsleep(wchan_t ident, pri_t priority, const char *wmesg, int timo,
	kmutex_t *mtx)
{
	struct lwp *l = curlwp;
	sleepq_t *sq;
	int error;

	if (sleepq_dontsleep(l)) {
		(void)sleepq_abort(mtx, (priority & PNORELOCK) != 0);
		return 0;
	}

	sq = sleeptab_lookup(&sleeptab, ident);
	sleepq_enter(sq, l);
	sleepq_enqueue(sq, priority & PRIMASK, ident, wmesg, &sleep_syncobj);
	mutex_exit(mtx);
	error = sleepq_block(timo, priority & PCATCH);

	if ((priority & PNORELOCK) == 0)
		mutex_enter(mtx);

	return error;
}
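
/*
 * Illustrative usage sketch (not part of the original file): the kmutex_t
 * variant of the same pattern.  The sc, sc_lock and sc_count names are
 * assumptions for this example only.
 *
 *	mutex_enter(&sc->sc_lock);
 *	while (sc->sc_count == 0) {
 *		error = mtsleep(&sc->sc_count, PWAIT | PCATCH, "exwait",
 *		    0, &sc->sc_lock);
 *		if (error != 0)
 *			break;
 *	}
 *	mutex_exit(&sc->sc_lock);
 */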

/*
 * General sleep call for situations where a wake-up is not expected.
 */
int
kpause(const char *wmesg, bool intr, int timo, kmutex_t *mtx)
{
	struct lwp *l = curlwp;
	sleepq_t *sq;
	int error;

	if (sleepq_dontsleep(l))
		return sleepq_abort(NULL, 0);

	if (mtx != NULL)
		mutex_exit(mtx);
	sq = sleeptab_lookup(&sleeptab, l);
	sleepq_enter(sq, l);
	sleepq_enqueue(sq, sched_kpri(l), l, wmesg, &sleep_syncobj);
	error = sleepq_block(timo, intr);
	if (mtx != NULL)
		mutex_enter(mtx);

	return error;
}
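
/*
 * Illustrative usage sketch (not part of the original file): kpause() is
 * typically used to pause for a fixed interval, e.g. while polling for a
 * resource.  The one-second delay and wmesg string are arbitrary example
 * values.
 *
 *	(void)kpause("expause", false, hz, NULL);
 *
 * With intr set to false the pause is not interruptible by signals.  If a
 * held kmutex_t is passed as the last argument, it is dropped for the
 * duration of the sleep and re-acquired before kpause() returns.
 */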

/*
 * OBSOLETE INTERFACE
 *
 * Make all processes sleeping on the specified identifier runnable.
 */
void
wakeup(wchan_t ident)
{
	sleepq_t *sq;

	if (cold)
		return;

	sq = sleeptab_lookup(&sleeptab, ident);
	sleepq_wake(sq, ident, (u_int)-1);
}

/*
 * OBSOLETE INTERFACE
 *
 * Make the highest priority process first in line on the specified
 * identifier runnable.
 */
void
wakeup_one(wchan_t ident)
{
	sleepq_t *sq;

	if (cold)
		return;

	sq = sleeptab_lookup(&sleeptab, ident);
	sleepq_wake(sq, ident, 1);
}
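
/*
 * Illustrative sketch (not part of the original file): wakeup() pairs with
 * the sleep calls above through the shared wait channel.  Continuing the
 * made-up sc_ready example from the ltsleep() sketch:
 *
 *	simple_lock(&sc->sc_slock);
 *	sc->sc_ready = 1;
 *	simple_unlock(&sc->sc_slock);
 *	wakeup(&sc->sc_ready);
 *
 * wakeup_one() would instead release only the highest priority sleeper,
 * which is useful when each wakeup can satisfy exactly one waiter.
 */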


/*
 * General yield call.  Puts the current process back on its run queue and
 * performs a voluntary context switch.  Should only be called when the
 * current process explicitly requests it (e.g. sched_yield(2)).
 */
void
yield(void)
{
	struct lwp *l = curlwp;

	KERNEL_UNLOCK_ALL(l, &l->l_biglocks);
	lwp_lock(l);
	KASSERT(lwp_locked(l, &l->l_cpu->ci_schedstate.spc_lwplock));
	KASSERT(l->l_stat == LSONPROC);
	/* XXX Only do this for timeshared threads. */
	l->l_priority = MAXPRI;
	(void)mi_switch(l);
	KERNEL_LOCK(l->l_biglocks, l);
}

/*
 * General preemption call.  Puts the current process back on its run queue
 * and performs an involuntary context switch.
 */
void
preempt(void)
{
	struct lwp *l = curlwp;

	KERNEL_UNLOCK_ALL(l, &l->l_biglocks);
	lwp_lock(l);
	KASSERT(lwp_locked(l, &l->l_cpu->ci_schedstate.spc_lwplock));
	KASSERT(l->l_stat == LSONPROC);
	l->l_priority = l->l_usrpri;
	l->l_nivcsw++;
	(void)mi_switch(l);
	KERNEL_LOCK(l->l_biglocks, l);
}

/*
 * Compute the amount of time during which the current lwp was running.
 *
 * - update l_rtime unless it's an idle lwp.
 */

void
updatertime(lwp_t *l, const struct timeval *tv)
{
	long s, u;

	if ((l->l_flag & LW_IDLE) != 0)
		return;

	u = l->l_rtime.tv_usec + (tv->tv_usec - l->l_stime.tv_usec);
	s = l->l_rtime.tv_sec + (tv->tv_sec - l->l_stime.tv_sec);
	if (u < 0) {
		u += 1000000;
		s--;
	} else if (u >= 1000000) {
		u -= 1000000;
		s++;
	}
	l->l_rtime.tv_usec = u;
	l->l_rtime.tv_sec = s;
}
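
/*
 * Worked example (not part of the original file): if l_rtime is currently
 * { 2 s, 900000 us } and the interval since l_stime adds 300000 us, then
 * u becomes 1200000, which the carry above normalizes to { 3 s, 200000 us }.
 */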

/*
 * The machine independent parts of context switch.
 *
 * Returns 1 if another LWP was actually run.
 */
int
mi_switch(lwp_t *l)
{
	struct schedstate_percpu *spc;
	struct lwp *newl;
	int retval, oldspl;
	struct cpu_info *ci;
	struct timeval tv;
	bool returning;

	KASSERT(lwp_locked(l, NULL));
	LOCKDEBUG_BARRIER(l->l_mutex, 1);

#ifdef KSTACK_CHECK_MAGIC
	kstack_check_magic(l);
#endif

	microtime(&tv);

	/*
	 * It's safe to read the per CPU schedstate unlocked here, as all we
	 * are after is the run time and that's guaranteed to have been last
	 * updated by this CPU.
	 */
	ci = l->l_cpu;
	KDASSERT(ci == curcpu());

	/*
	 * Process is about to yield the CPU; clear the appropriate
	 * scheduling flags.
	 */
	spc = &ci->ci_schedstate;
	returning = false;
	newl = NULL;

	/*
	 * If we have been asked to switch to a specific LWP, then there
	 * is no need to inspect the run queues.  If a soft interrupt is
	 * blocking, then return to the interrupted thread without adjusting
	 * VM context or its start time: neither have been changed in order
	 * to take the interrupt.
	 */
	if (l->l_switchto != NULL) {
		if ((l->l_flag & LW_INTR) != 0) {
			returning = true;
			softint_block(l);
			if ((l->l_flag & LW_TIMEINTR) != 0)
				updatertime(l, &tv);
		}
		newl = l->l_switchto;
		l->l_switchto = NULL;
	}

	/* Count time spent in current system call */
	if (!returning) {
		SYSCALL_TIME_SLEEP(l);

		/*
		 * XXXSMP If we are using h/w performance counters,
		 * save context.
		 */
#if PERFCTRS
		if (PMC_ENABLED(l->l_proc)) {
			pmc_save_context(l->l_proc);
		}
#endif
		updatertime(l, &tv);
	}

	/*
	 * If on the CPU and we have gotten this far, then we must yield.
	 */
	mutex_spin_enter(spc->spc_mutex);
	KASSERT(l->l_stat != LSRUN);
	if (l->l_stat == LSONPROC) {
		KASSERT(lwp_locked(l, &spc->spc_lwplock));
		if ((l->l_flag & LW_IDLE) == 0) {
			l->l_stat = LSRUN;
			lwp_setlock(l, spc->spc_mutex);
			sched_enqueue(l, true);
		} else
			l->l_stat = LSIDL;
	}

	/*
	 * Let sched_nextlwp() select the LWP to run on the CPU next.
	 * If no LWP is runnable, switch to the idle LWP.
	 * Note that spc_lwplock might not necessarily be held.
	 */
	if (newl == NULL) {
		newl = sched_nextlwp();
		if (newl != NULL) {
			sched_dequeue(newl);
			KASSERT(lwp_locked(newl, spc->spc_mutex));
			newl->l_stat = LSONPROC;
			newl->l_cpu = ci;
			newl->l_flag |= LW_RUNNING;
			lwp_setlock(newl, &spc->spc_lwplock);
		} else {
			newl = ci->ci_data.cpu_idlelwp;
			newl->l_stat = LSONPROC;
			newl->l_flag |= LW_RUNNING;
		}
		ci->ci_want_resched = 0;
		spc->spc_flags &= ~SPCF_SWITCHCLEAR;
	}

	/* Update the new LWP's start time while it is still locked. */
	if (!returning) {
		newl->l_stime = tv;
		/*
		 * XXX The following may be done unlocked if newl != NULL
		 * above.
		 */
		newl->l_priority = newl->l_usrpri;
	}

	spc->spc_curpriority = newl->l_usrpri;

	if (l != newl) {
		struct lwp *prevlwp;

		/*
		 * If the old LWP has been moved to a run queue above,
		 * drop the general purpose LWP lock: it's now locked
		 * by the scheduler lock.
		 *
		 * Otherwise, drop the scheduler lock.  We're done with
		 * the run queues for now.
		 */
		if (l->l_mutex == spc->spc_mutex) {
			mutex_spin_exit(&spc->spc_lwplock);
		} else {
			mutex_spin_exit(spc->spc_mutex);
		}

		/* Unlocked, but for statistics only. */
		uvmexp.swtch++;

		/*
		 * Save old VM context, unless a soft interrupt
		 * handler is blocking.
		 */
		if (!returning)
			pmap_deactivate(l);

		/* Switch to the new LWP.. */
		l->l_ncsw++;
		l->l_flag &= ~LW_RUNNING;
		oldspl = MUTEX_SPIN_OLDSPL(ci);
		prevlwp = cpu_switchto(l, newl);

		/*
		 * .. we have switched away and are now back so we must
		 * be the new curlwp.  prevlwp is who we replaced.
		 */
		if (prevlwp != NULL) {
			curcpu()->ci_mtx_oldspl = oldspl;
			lwp_unlock(prevlwp);
		} else {
			splx(oldspl);
		}

		/* Restore VM context. */
		pmap_activate(l);
		retval = 1;
	} else {
		/* Nothing to do - just unlock and return. */
		mutex_spin_exit(spc->spc_mutex);
		lwp_unlock(l);
		retval = 0;
	}

	KASSERT(l == curlwp);
	KASSERT(l->l_stat == LSONPROC);
	KASSERT(l->l_cpu == curcpu());

	/*
	 * XXXSMP If we are using h/w performance counters, restore context.
	 */
#if PERFCTRS
	if (PMC_ENABLED(l->l_proc)) {
		pmc_restore_context(l->l_proc);
	}
#endif

	/*
	 * We're running again; record our new start time.  We might
	 * be running on a new CPU now, so don't use the cached
	 * schedstate_percpu pointer.
	 */
	SYSCALL_TIME_WAKEUP(l);
	KASSERT(curlwp == l);
	KDASSERT(l->l_cpu == curcpu());
	LOCKDEBUG_BARRIER(NULL, 1);

	return retval;
}

/*
 * Change process state to be runnable, placing it on the run queue if it is
 * in memory, and awakening the swapper if it isn't in memory.
 *
 * Call with the process and LWP locked.  Will return with the LWP unlocked.
 */
void
setrunnable(struct lwp *l)
{
	struct proc *p = l->l_proc;
	struct cpu_info *ci;
	sigset_t *ss;

	KASSERT((l->l_flag & LW_IDLE) == 0);
	KASSERT(mutex_owned(&p->p_smutex));
	KASSERT(lwp_locked(l, NULL));
	KASSERT(l->l_mutex != l->l_cpu->ci_schedstate.spc_mutex);

	switch (l->l_stat) {
	case LSSTOP:
		/*
		 * If we're being traced (possibly because someone attached us
		 * while we were stopped), check for a signal from the debugger.
		 */
		if ((p->p_slflag & PSL_TRACED) != 0 && p->p_xstat != 0) {
			if ((sigprop[p->p_xstat] & SA_TOLWP) != 0)
				ss = &l->l_sigpend.sp_set;
			else
				ss = &p->p_sigpend.sp_set;
			sigaddset(ss, p->p_xstat);
			signotify(l);
		}
		p->p_nrlwps++;
		break;
	case LSSUSPENDED:
		l->l_flag &= ~LW_WSUSPEND;
		p->p_nrlwps++;
		cv_broadcast(&p->p_lwpcv);
		break;
	case LSSLEEP:
		KASSERT(l->l_wchan != NULL);
		break;
	default:
		panic("setrunnable: lwp %p state was %d", l, l->l_stat);
	}

	/*
	 * If the LWP was sleeping interruptibly, then it's OK to start it
	 * again.  If not, mark it as still sleeping.
	 */
	if (l->l_wchan != NULL) {
		l->l_stat = LSSLEEP;
		/* lwp_unsleep() will release the lock. */
		lwp_unsleep(l);
		return;
	}

	/*
	 * If the LWP is still on the CPU, mark it as LSONPROC.  It may be
	 * about to call mi_switch(), in which case it will yield.
	 */
	if ((l->l_flag & LW_RUNNING) != 0) {
		l->l_stat = LSONPROC;
		l->l_slptime = 0;
		lwp_unlock(l);
		return;
	}

	/*
	 * Look for a CPU to run.
	 * Set the LWP runnable.
	 */
	ci = sched_takecpu(l);
	ci = l->l_cpu;
	spc_lock(ci);
	l->l_cpu = ci;
	lwp_unlock_to(l, ci->ci_schedstate.spc_mutex);

	sched_setrunnable(l);
	l->l_stat = LSRUN;
	l->l_slptime = 0;

	/*
	 * If thread is swapped out - wake the swapper to bring it back in.
	 * Otherwise, enter it into a run queue.
	 */
	if (l->l_flag & LW_INMEM) {
		sched_enqueue(l, false);
		resched_cpu(l);
		lwp_unlock(l);
	} else {
		lwp_unlock(l);
		uvm_kick_scheduler();
	}
}

/*
 * suspendsched:
 *
 *	Convert all non-L_SYSTEM LSSLEEP or LSRUN LWPs to LSSUSPENDED.
 */
void
suspendsched(void)
{
	CPU_INFO_ITERATOR cii;
	struct cpu_info *ci;
	struct lwp *l;
	struct proc *p;

	/*
	 * We do this by process in order not to violate the locking rules.
	 */
	mutex_enter(&proclist_mutex);
	PROCLIST_FOREACH(p, &allproc) {
		mutex_enter(&p->p_smutex);

		if ((p->p_flag & PK_SYSTEM) != 0) {
			mutex_exit(&p->p_smutex);
			continue;
		}

		p->p_stat = SSTOP;

		LIST_FOREACH(l, &p->p_lwps, l_sibling) {
			if (l == curlwp)
				continue;

			lwp_lock(l);

			/*
			 * Set L_WREBOOT so that the LWP will suspend itself
			 * when it tries to return to user mode.  We want to
			 * get as many LWPs as possible to the user/kernel
			 * boundary, so that they will release any locks
			 * that they hold.
			 */
			l->l_flag |= (LW_WREBOOT | LW_WSUSPEND);

			if (l->l_stat == LSSLEEP &&
			    (l->l_flag & LW_SINTR) != 0) {
				/* setrunnable() will release the lock. */
				setrunnable(l);
				continue;
			}

			lwp_unlock(l);
		}

		mutex_exit(&p->p_smutex);
	}
	mutex_exit(&proclist_mutex);

	/*
	 * Kick all CPUs to make them preempt any LWPs running in user mode.
	 * They'll trap into the kernel and suspend themselves in userret().
	 */
	for (CPU_INFO_FOREACH(cii, ci))
		cpu_need_resched(ci, 0);
}

/*
 * sched_kpri:
 *
 *	Scale a priority level to a kernel priority level, usually
 *	for an LWP that is about to sleep.
 */
pri_t
sched_kpri(struct lwp *l)
{
	/*
	 * Scale user priorities (127 -> 50) up to kernel priorities
	 * in the range (49 -> 8).  Reserve the top 8 kernel priorities
	 * for high priority kthreads.  Kernel priorities passed in
	 * are left "as is".  XXX This is somewhat arbitrary.
	 */
	static const uint8_t kpri_tab[] = {
		 0,   1,   2,   3,   4,   5,   6,   7,
		 8,   9,  10,  11,  12,  13,  14,  15,
		16,  17,  18,  19,  20,  21,  22,  23,
		24,  25,  26,  27,  28,  29,  30,  31,
		32,  33,  34,  35,  36,  37,  38,  39,
		40,  41,  42,  43,  44,  45,  46,  47,
		48,  49,   8,   8,   9,   9,  10,  10,
		11,  11,  12,  12,  13,  14,  14,  15,
		15,  16,  16,  17,  17,  18,  18,  19,
		20,  20,  21,  21,  22,  22,  23,  23,
		24,  24,  25,  26,  26,  27,  27,  28,
		28,  29,  29,  30,  30,  31,  32,  32,
		33,  33,  34,  34,  35,  35,  36,  36,
		37,  38,  38,  39,  39,  40,  40,  41,
		41,  42,  42,  43,  44,  44,  45,  45,
		46,  46,  47,  47,  48,  48,  49,  49,
	};

	return (pri_t)kpri_tab[l->l_usrpri];
}
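
/*
 * Worked example of the table above (not part of the original file): an
 * LWP with l_usrpri == 127 (the weakest user priority) maps to kernel
 * priority 49, l_usrpri == 50 maps to 8, and a kernel-range value such as
 * 30 is returned unchanged.  kpause() above passes sched_kpri(l) to
 * sleepq_enqueue(), so that sleep is queued at a kernel-range priority.
 */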

/*
 * sched_unsleep:
 *
 *	This is called when the LWP has not been awoken normally but instead
 *	interrupted: for example, if the sleep timed out.  Because of this,
 *	it's not a valid action for running or idle LWPs.
 */
static void
sched_unsleep(struct lwp *l)
{

	lwp_unlock(l);
	panic("sched_unsleep");
}

inline void
resched_cpu(struct lwp *l)
{
	struct cpu_info *ci;
	const pri_t pri = lwp_eprio(l);

	/*
	 * XXXSMP
	 * Since l->l_cpu persists across a context switch,
	 * this gives us *very weak* processor affinity, in
	 * that we notify the CPU on which the process last
	 * ran that it should try to switch.
	 *
	 * This does not guarantee that the process will run on
	 * that processor next, because another processor might
	 * grab it the next time it performs a context switch.
	 *
	 * This also does not handle the case where its last
	 * CPU is running a higher-priority process, but every
	 * other CPU is running a lower-priority process.  There
	 * are ways to handle this situation, but they're not
	 * currently very pretty, and we also need to weigh the
	 * cost of moving a process from one CPU to another.
	 */
	ci = (l->l_cpu != NULL) ? l->l_cpu : curcpu();
	if (pri < ci->ci_schedstate.spc_curpriority)
		cpu_need_resched(ci, 0);
}

static void
sched_changepri(struct lwp *l, pri_t pri)
{

	KASSERT(lwp_locked(l, NULL));

	l->l_usrpri = pri;
	if (l->l_priority < PUSER)
		return;

	if (l->l_stat != LSRUN || (l->l_flag & LW_INMEM) == 0) {
		l->l_priority = pri;
		return;
	}

	KASSERT(lwp_locked(l, l->l_cpu->ci_schedstate.spc_mutex));

	sched_dequeue(l);
	l->l_priority = pri;
	sched_enqueue(l, false);
	resched_cpu(l);
}

static void
sched_lendpri(struct lwp *l, pri_t pri)
{

	KASSERT(lwp_locked(l, NULL));

	if (l->l_stat != LSRUN || (l->l_flag & LW_INMEM) == 0) {
		l->l_inheritedprio = pri;
		return;
	}

	KASSERT(lwp_locked(l, l->l_cpu->ci_schedstate.spc_mutex));

	sched_dequeue(l);
	l->l_inheritedprio = pri;
	sched_enqueue(l, false);
	resched_cpu(l);
}

struct lwp *
syncobj_noowner(wchan_t wchan)
{

	return NULL;
}

/* decay 95% of `p_pctcpu' in 60 seconds; see CCPU_SHIFT before changing */
fixpt_t	ccpu = 0.95122942450071400909 * FSCALE;		/* exp(-1/20) */

/*
 * If `ccpu' is not equal to `exp(-1/20)' and you still want to use the
 * faster/more-accurate formula, you'll have to estimate CCPU_SHIFT below
 * and possibly adjust FSHIFT in "param.h" so that (FSHIFT >= CCPU_SHIFT).
 *
 * To estimate CCPU_SHIFT for exp(-1/20), the following formula was used:
 *	1 - exp(-1/20) ~= 0.0487 ~= 0.0488 == 1 (fixed pt, *11* bits).
 *
 * If you don't want to bother with the faster/more-accurate formula, you
 * can set CCPU_SHIFT to (FSHIFT + 1) which will use a slower/less-accurate
 * (more general) method of calculating the %age of CPU used by a process.
 */
#define	CCPU_SHIFT	(FSHIFT + 1)
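
/*
 * Worked example of the decay above (not part of the original file): with
 * ccpu == exp(-1/20) applied once per second by sched_pstats(), an idle
 * LWP's pctcpu after 60 seconds has been multiplied by exp(-60/20) ==
 * exp(-3) ~= 0.0498, i.e. roughly 95% of the old value has decayed away,
 * which is where the "decay 95% in 60 seconds" comment comes from.
 */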

/*
 * sched_pstats:
 *
 *	Update process statistics and check CPU resource allocation.
 *	Call scheduler-specific hook to eventually adjust process/LWP
 *	priorities.
 */
/* ARGSUSED */
void
sched_pstats(void *arg)
{
	struct rlimit *rlim;
	struct lwp *l;
	struct proc *p;
	int minslp, sig, clkhz;
	long runtm;

	sched_pstats_ticks++;

	mutex_enter(&proclist_mutex);
	PROCLIST_FOREACH(p, &allproc) {
		/*
		 * Increment time in/out of memory and sleep time (if
		 * sleeping).  We ignore overflow; with 16-bit int's
		 * (remember them?) overflow takes 45 days.
		 */
		minslp = 2;
		mutex_enter(&p->p_smutex);
		mutex_spin_enter(&p->p_stmutex);
		runtm = p->p_rtime.tv_sec;
		LIST_FOREACH(l, &p->p_lwps, l_sibling) {
			if ((l->l_flag & LW_IDLE) != 0)
				continue;
			lwp_lock(l);
			runtm += l->l_rtime.tv_sec;
			l->l_swtime++;
			if (l->l_stat == LSSLEEP || l->l_stat == LSSTOP ||
			    l->l_stat == LSSUSPENDED) {
				l->l_slptime++;
				minslp = min(minslp, l->l_slptime);
			} else
				minslp = 0;
			sched_pstats_hook(l);
			lwp_unlock(l);

			/*
			 * p_pctcpu is only for ps.
			 */
			l->l_pctcpu = (l->l_pctcpu * ccpu) >> FSHIFT;
			if (l->l_slptime < 1) {
				clkhz = stathz != 0 ? stathz : hz;
#if	(FSHIFT >= CCPU_SHIFT)
				l->l_pctcpu += (clkhz == 100) ?
				    ((fixpt_t)l->l_cpticks) <<
				    (FSHIFT - CCPU_SHIFT) :
				    100 * (((fixpt_t) p->p_cpticks)
				    << (FSHIFT - CCPU_SHIFT)) / clkhz;
#else
				l->l_pctcpu += ((FSCALE - ccpu) *
				    (l->l_cpticks * FSCALE / clkhz)) >> FSHIFT;
#endif
				l->l_cpticks = 0;
			}
		}

		p->p_pctcpu = (p->p_pctcpu * ccpu) >> FSHIFT;
#ifdef SCHED_4BSD
		/*
		 * XXX: Workaround - belongs to sched_4bsd.c
		 * If the process has slept the entire second,
		 * stop recalculating its priority until it wakes up.
		 */
		if (minslp <= 1) {
			extern fixpt_t decay_cpu(fixpt_t, fixpt_t);

			fixpt_t loadfac = 2 * (averunnable.ldavg[0]);
			p->p_estcpu = decay_cpu(loadfac, p->p_estcpu);
		}
#endif
		mutex_spin_exit(&p->p_stmutex);

		/*
		 * Check if the process exceeds its CPU resource allocation.
		 * If over max, kill it.
		 */
		rlim = &p->p_rlimit[RLIMIT_CPU];
		sig = 0;
		if (runtm >= rlim->rlim_cur) {
			if (runtm >= rlim->rlim_max)
				sig = SIGKILL;
			else {
				sig = SIGXCPU;
				if (rlim->rlim_cur < rlim->rlim_max)
					rlim->rlim_cur += 5;
			}
		}
		mutex_exit(&p->p_smutex);
		if (sig) {
			psignal(p, sig);
		}
	}
	mutex_exit(&proclist_mutex);
	uvm_meter();
	cv_wakeup(&lbolt);
	callout_schedule(&sched_pstats_ch, hz);
}

void
sched_init(void)
{

	cv_init(&lbolt, "lbolt");
	callout_init(&sched_pstats_ch, 0);
	callout_setfunc(&sched_pstats_ch, sched_pstats, NULL);
	sched_setup();
	sched_pstats(NULL);
}