/*	$NetBSD: kern_synch.c,v 1.201 2007/10/13 00:30:26 rmind Exp $	*/

/*-
 * Copyright (c) 1999, 2000, 2004, 2006, 2007 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
 * NASA Ames Research Center, by Charles M. Hannum, Andrew Doran and
 * Daniel Sieger.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the NetBSD
 *	Foundation, Inc. and its contributors.
 * 4. Neither the name of The NetBSD Foundation nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*-
 * Copyright (c) 1982, 1986, 1990, 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)kern_synch.c	8.9 (Berkeley) 5/19/95
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: kern_synch.c,v 1.201 2007/10/13 00:30:26 rmind Exp $");

#include "opt_kstack.h"
#include "opt_lockdebug.h"
#include "opt_multiprocessor.h"
#include "opt_perfctrs.h"

#define	__MUTEX_PRIVATE

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/kernel.h>
#if defined(PERFCTRS)
#include <sys/pmc.h>
#endif
#include <sys/cpu.h>
#include <sys/resourcevar.h>
#include <sys/sched.h>
#include <sys/syscall_stats.h>
#include <sys/sleepq.h>
#include <sys/lockdebug.h>
#include <sys/evcnt.h>
#include <sys/intr.h>

#include <uvm/uvm_extern.h>

callout_t	sched_pstats_ch;
unsigned int	sched_pstats_ticks;

kcondvar_t	lbolt;			/* once a second sleep address */

static void	sched_unsleep(struct lwp *);
static void	sched_changepri(struct lwp *, pri_t);
static void	sched_lendpri(struct lwp *, pri_t);

syncobj_t sleep_syncobj = {
	SOBJ_SLEEPQ_SORTED,
	sleepq_unsleep,
	sleepq_changepri,
	sleepq_lendpri,
	syncobj_noowner,
};

syncobj_t sched_syncobj = {
	SOBJ_SLEEPQ_SORTED,
	sched_unsleep,
	sched_changepri,
	sched_lendpri,
	syncobj_noowner,
};

/*
 * During autoconfiguration or after a panic, a sleep will simply lower the
 * priority briefly to allow interrupts, then return.  The priority to be
 * used (safepri) is machine-dependent, thus this value is initialized and
 * maintained in the machine-dependent layers.  This priority will typically
 * be 0, or the lowest priority that is safe for use on the interrupt stack;
 * it can be made higher to block network software interrupts after panics.
 */
int	safepri;

/*
 * OBSOLETE INTERFACE
 *
 * General sleep call.  Suspends the current process until a wakeup is
 * performed on the specified identifier.  The process will then be made
 * runnable with the specified priority.  Sleeps at most timo/hz seconds (0
 * means no timeout).  If pri includes the PCATCH flag, signals are checked
 * before and after sleeping, else signals are not checked.  Returns 0 if
 * awakened, EWOULDBLOCK if the timeout expires.  If PCATCH is set and a
 * signal needs to be delivered, ERESTART is returned if the current system
 * call should be restarted when possible, and EINTR is returned if the
 * system call should be interrupted by the signal.
 *
 * The interlock is held until we are on a sleep queue.  The interlock will
 * be locked before returning to the caller unless the PNORELOCK flag is
 * specified, in which case the interlock will always be unlocked upon
 * return.
 */
int
ltsleep(wchan_t ident, pri_t priority, const char *wmesg, int timo,
	volatile struct simplelock *interlock)
{
	struct lwp *l = curlwp;
	sleepq_t *sq;
	int error;

	if (sleepq_dontsleep(l)) {
		(void)sleepq_abort(NULL, 0);
		if ((priority & PNORELOCK) != 0)
			simple_unlock(interlock);
		return 0;
	}

	sq = sleeptab_lookup(&sleeptab, ident);
	sleepq_enter(sq, l);
	sleepq_enqueue(sq, priority & PRIMASK, ident, wmesg, &sleep_syncobj);

	if (interlock != NULL) {
		LOCK_ASSERT(simple_lock_held(interlock));
		simple_unlock(interlock);
	}

	error = sleepq_block(timo, priority & PCATCH);

	if (interlock != NULL && (priority & PNORELOCK) == 0)
		simple_lock(interlock);

	return error;
}

int
mtsleep(wchan_t ident, pri_t priority, const char *wmesg, int timo,
	kmutex_t *mtx)
{
	struct lwp *l = curlwp;
	sleepq_t *sq;
	int error;

	if (sleepq_dontsleep(l)) {
		(void)sleepq_abort(mtx, (priority & PNORELOCK) != 0);
		return 0;
	}

	sq = sleeptab_lookup(&sleeptab, ident);
	sleepq_enter(sq, l);
	sleepq_enqueue(sq, priority & PRIMASK, ident, wmesg, &sleep_syncobj);
	mutex_exit(mtx);
	error = sleepq_block(timo, priority & PCATCH);

	if ((priority & PNORELOCK) == 0)
		mutex_enter(mtx);

	return error;
}
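
/*
 * Editorial example (not part of the original source): a minimal sketch
 * of the usual condition-wait pattern built on mtsleep(), assuming a
 * hypothetical driver softc with a kmutex_t sc_lock protecting an
 * sc_busy flag.  PUSER and PCATCH are the standard priority/flag
 * values; the matching waker appears after wakeup_one() below.
 */
#if 0
struct example_softc {
	kmutex_t	sc_lock;	/* protects sc_busy */
	int		sc_busy;	/* nonzero while the device is in use */
};

static int
example_acquire(struct example_softc *sc)
{
	int error = 0;

	mutex_enter(&sc->sc_lock);
	while (sc->sc_busy != 0) {
		/* Drops sc_lock, sleeps on &sc->sc_busy, relocks on return. */
		error = mtsleep(&sc->sc_busy, PUSER | PCATCH, "exbusy", 0,
		    &sc->sc_lock);
		if (error != 0)		/* EINTR or ERESTART if signalled */
			break;
	}
	if (error == 0)
		sc->sc_busy = 1;
	mutex_exit(&sc->sc_lock);
	return error;
}
#endif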

/*
 * General sleep call for situations where a wake-up is not expected.
 */
int
kpause(const char *wmesg, bool intr, int timo, kmutex_t *mtx)
{
	struct lwp *l = curlwp;
	sleepq_t *sq;
	int error;

	if (sleepq_dontsleep(l))
		return sleepq_abort(NULL, 0);

	if (mtx != NULL)
		mutex_exit(mtx);
	sq = sleeptab_lookup(&sleeptab, l);
	sleepq_enter(sq, l);
	sleepq_enqueue(sq, sched_kpri(l), l, wmesg, &sleep_syncobj);
	error = sleepq_block(timo, intr);
	if (mtx != NULL)
		mutex_enter(mtx);

	return error;
}
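
/*
 * Editorial example (not part of the original source): a sketch of a
 * timed poll loop using kpause(), where no wakeup() is ever expected.
 * example_hw_ready() is a hypothetical register check; mstohz()
 * converts milliseconds to clock ticks.
 */
#if 0
static int
example_poll_ready(struct example_softc *sc)
{
	int i, error;

	for (i = 0; i < 100; i++) {
		if (example_hw_ready(sc))
			return 0;
		/* Sleep roughly 10ms; not interruptible, no lock to drop. */
		error = kpause("expoll", false, mstohz(10), NULL);
		if (error != 0)
			return error;
	}
	return ETIMEDOUT;
}
#endif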

/*
 * OBSOLETE INTERFACE
 *
 * Make all processes sleeping on the specified identifier runnable.
 */
void
wakeup(wchan_t ident)
{
	sleepq_t *sq;

	if (cold)
		return;

	sq = sleeptab_lookup(&sleeptab, ident);
	sleepq_wake(sq, ident, (u_int)-1);
}

/*
 * OBSOLETE INTERFACE
 *
 * Make only the highest-priority process sleeping on the specified
 * identifier runnable.
 */
void
wakeup_one(wchan_t ident)
{
	sleepq_t *sq;

	if (cold)
		return;

	sq = sleeptab_lookup(&sleeptab, ident);
	sleepq_wake(sq, ident, 1);
}
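
/*
 * Editorial example (not part of the original source): the waker side
 * of the mtsleep() sketch above.  Clearing sc_busy while holding
 * sc_lock before calling wakeup() ensures the sleeper cannot miss the
 * state change.
 */
#if 0
static void
example_release(struct example_softc *sc)
{

	mutex_enter(&sc->sc_lock);
	sc->sc_busy = 0;
	/*
	 * wakeup() makes every LWP sleeping on this identifier runnable;
	 * wakeup_one() would wake only the highest-priority sleeper.
	 */
	wakeup(&sc->sc_busy);
	mutex_exit(&sc->sc_lock);
}
#endif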


/*
 * General yield call.  Puts the current process back on its run queue and
 * performs a voluntary context switch.  Should only be called when the
 * current process explicitly requests it (eg sched_yield(2)).
 */
void
yield(void)
{
	struct lwp *l = curlwp;

	KERNEL_UNLOCK_ALL(l, &l->l_biglocks);
	lwp_lock(l);
	KASSERT(lwp_locked(l, &l->l_cpu->ci_schedstate.spc_lwplock));
	KASSERT(l->l_stat == LSONPROC);
	/* XXX Only do this for timeshared threads. */
	l->l_priority = MAXPRI;
	(void)mi_switch(l);
	KERNEL_LOCK(l->l_biglocks, l);
}

/*
 * General preemption call.  Puts the current process back on its run queue
 * and performs an involuntary context switch.
 */
void
preempt(void)
{
	struct lwp *l = curlwp;

	KERNEL_UNLOCK_ALL(l, &l->l_biglocks);
	lwp_lock(l);
	KASSERT(lwp_locked(l, &l->l_cpu->ci_schedstate.spc_lwplock));
	KASSERT(l->l_stat == LSONPROC);
	l->l_priority = l->l_usrpri;
	l->l_nivcsw++;
	(void)mi_switch(l);
	KERNEL_LOCK(l->l_biglocks, l);
}

/*
 * Compute the amount of time during which the current lwp was running.
 *
 * - update l_rtime unless it's an idle lwp.
 */

void
updatertime(lwp_t *l, const struct timeval *tv)
{
	long s, u;

	if ((l->l_flag & LW_IDLE) != 0)
		return;

	u = l->l_rtime.tv_usec + (tv->tv_usec - l->l_stime.tv_usec);
	s = l->l_rtime.tv_sec + (tv->tv_sec - l->l_stime.tv_sec);
	if (u < 0) {
		u += 1000000;
		s--;
	} else if (u >= 1000000) {
		u -= 1000000;
		s++;
	}
	l->l_rtime.tv_usec = u;
	l->l_rtime.tv_sec = s;
}

/*
 * The machine independent parts of context switch.
 *
 * Returns 1 if another LWP was actually run.
 */
int
mi_switch(lwp_t *l)
{
	struct schedstate_percpu *spc;
	struct lwp *newl;
	int retval, oldspl;
	struct cpu_info *ci;
	struct timeval tv;
	bool returning;

	KASSERT(lwp_locked(l, NULL));
	LOCKDEBUG_BARRIER(l->l_mutex, 1);

#ifdef KSTACK_CHECK_MAGIC
	kstack_check_magic(l);
#endif

	microtime(&tv);

	/*
	 * It's safe to read the per-CPU schedstate unlocked here, as all we
	 * are after is the run time and that's guaranteed to have been last
	 * updated by this CPU.
	 */
	ci = l->l_cpu;
	KDASSERT(ci == curcpu());

	/*
	 * Process is about to yield the CPU; clear the appropriate
	 * scheduling flags.
	 */
	spc = &ci->ci_schedstate;
	returning = false;
	newl = NULL;

	/*
	 * If we have been asked to switch to a specific LWP, then there
	 * is no need to inspect the run queues.  If a soft interrupt is
	 * blocking, then return to the interrupted thread without adjusting
	 * VM context or its start time: neither has been changed in order
	 * to take the interrupt.
	 */
	if (l->l_switchto != NULL) {
		if ((l->l_flag & LW_INTR) != 0) {
			returning = true;
			softint_block(l);
			if ((l->l_flag & LW_TIMEINTR) != 0)
				updatertime(l, &tv);
		}
		newl = l->l_switchto;
		l->l_switchto = NULL;
	}

	/* Count time spent in current system call */
	if (!returning) {
		SYSCALL_TIME_SLEEP(l);

		/*
		 * XXXSMP If we are using h/w performance counters,
		 * save context.
		 */
#if PERFCTRS
		if (PMC_ENABLED(l->l_proc)) {
			pmc_save_context(l->l_proc);
		}
#endif
		updatertime(l, &tv);
	}

	/*
	 * If on the CPU and we have gotten this far, then we must yield.
	 */
	mutex_spin_enter(spc->spc_mutex);
	KASSERT(l->l_stat != LSRUN);
	if (l->l_stat == LSONPROC) {
		KASSERT(lwp_locked(l, &spc->spc_lwplock));
		if ((l->l_flag & LW_IDLE) == 0) {
			l->l_stat = LSRUN;
			lwp_setlock(l, spc->spc_mutex);
			sched_enqueue(l, true);
		} else
			l->l_stat = LSIDL;
	}

	/*
	 * Let sched_nextlwp() select the LWP to run on the CPU next.
	 * If no LWP is runnable, switch to the idle LWP.
	 * Note that spc_lwplock might not necessarily be held.
	 */
	if (newl == NULL) {
		newl = sched_nextlwp();
		if (newl != NULL) {
			sched_dequeue(newl);
			KASSERT(lwp_locked(newl, spc->spc_mutex));
			newl->l_stat = LSONPROC;
			newl->l_cpu = ci;
			newl->l_flag |= LW_RUNNING;
			lwp_setlock(newl, &spc->spc_lwplock);
		} else {
			newl = ci->ci_data.cpu_idlelwp;
			newl->l_stat = LSONPROC;
			newl->l_flag |= LW_RUNNING;
		}
		ci->ci_want_resched = 0;
		spc->spc_flags &= ~SPCF_SWITCHCLEAR;
	}

	/* Update the new LWP's start time while it is still locked. */
	if (!returning) {
		newl->l_stime = tv;
		/*
		 * XXX The following may be done unlocked if newl != NULL
		 * above.
		 */
		newl->l_priority = newl->l_usrpri;
	}

	spc->spc_curpriority = newl->l_usrpri;

	if (l != newl) {
		struct lwp *prevlwp;

		/*
		 * If the old LWP has been moved to a run queue above,
		 * drop the general purpose LWP lock: it's now locked
		 * by the scheduler lock.
		 *
		 * Otherwise, drop the scheduler lock.  We're done with
		 * the run queues for now.
		 */
		if (l->l_mutex == spc->spc_mutex) {
			mutex_spin_exit(&spc->spc_lwplock);
		} else {
			mutex_spin_exit(spc->spc_mutex);
		}

		/* Unlocked, but for statistics only. */
		uvmexp.swtch++;

		/*
		 * Save old VM context, unless a soft interrupt
		 * handler is blocking.
		 */
		if (!returning)
			pmap_deactivate(l);

		/* Switch to the new LWP.. */
		l->l_ncsw++;
		l->l_flag &= ~LW_RUNNING;
		oldspl = MUTEX_SPIN_OLDSPL(ci);
		prevlwp = cpu_switchto(l, newl);

		/*
		 * .. we have switched away and are now back so we must
		 * be the new curlwp.  prevlwp is who we replaced.
		 */
		if (prevlwp != NULL) {
			curcpu()->ci_mtx_oldspl = oldspl;
			lwp_unlock(prevlwp);
		} else {
			splx(oldspl);
		}

		/* Restore VM context. */
		pmap_activate(l);
		retval = 1;
	} else {
		/* Nothing to do - just unlock and return. */
		mutex_spin_exit(spc->spc_mutex);
		lwp_unlock(l);
		retval = 0;
	}

	KASSERT(l == curlwp);
	KASSERT(l->l_stat == LSONPROC);
	KASSERT(l->l_cpu == curcpu());

	/*
	 * XXXSMP If we are using h/w performance counters, restore context.
	 */
#if PERFCTRS
	if (PMC_ENABLED(l->l_proc)) {
		pmc_restore_context(l->l_proc);
	}
#endif

	/*
	 * We're running again; record our new start time.  We might
	 * be running on a new CPU now, so don't use the cached
	 * schedstate_percpu pointer.
	 */
	SYSCALL_TIME_WAKEUP(l);
	KASSERT(curlwp == l);
	KDASSERT(l->l_cpu == curcpu());
	LOCKDEBUG_BARRIER(NULL, 1);

	return retval;
}

/*
 * Change process state to be runnable, placing it on the run queue if it is
 * in memory, and awakening the swapper if it isn't in memory.
 *
 * Call with the process and LWP locked.  Will return with the LWP unlocked.
 */
void
setrunnable(struct lwp *l)
{
	struct proc *p = l->l_proc;
	sigset_t *ss;

	KASSERT((l->l_flag & LW_IDLE) == 0);
	KASSERT(mutex_owned(&p->p_smutex));
	KASSERT(lwp_locked(l, NULL));

	switch (l->l_stat) {
	case LSSTOP:
		/*
		 * If we're being traced (possibly because someone attached us
		 * while we were stopped), check for a signal from the debugger.
		 */
		if ((p->p_slflag & PSL_TRACED) != 0 && p->p_xstat != 0) {
			if ((sigprop[p->p_xstat] & SA_TOLWP) != 0)
				ss = &l->l_sigpend.sp_set;
			else
				ss = &p->p_sigpend.sp_set;
			sigaddset(ss, p->p_xstat);
			signotify(l);
		}
		p->p_nrlwps++;
		break;
	case LSSUSPENDED:
		l->l_flag &= ~LW_WSUSPEND;
		p->p_nrlwps++;
		cv_broadcast(&p->p_lwpcv);
		break;
	case LSSLEEP:
		KASSERT(l->l_wchan != NULL);
		break;
	default:
		panic("setrunnable: lwp %p state was %d", l, l->l_stat);
	}

	/*
	 * If the LWP was sleeping interruptibly, then it's OK to start it
	 * again.  If not, mark it as still sleeping.
	 */
	if (l->l_wchan != NULL) {
		l->l_stat = LSSLEEP;
		/* lwp_unsleep() will release the lock. */
		lwp_unsleep(l);
		return;
	}

	/*
	 * If the LWP is still on the CPU, mark it as LSONPROC.  It may be
	 * about to call mi_switch(), in which case it will yield.
	 */
	if ((l->l_flag & LW_RUNNING) != 0) {
		l->l_stat = LSONPROC;
		l->l_slptime = 0;
		lwp_unlock(l);
		return;
	}

	/*
	 * Set the LWP runnable.  If it's swapped out, we need to wake the
	 * swapper to bring it back in.  Otherwise, enter it into a run queue.
	 */
	if (l->l_mutex != l->l_cpu->ci_schedstate.spc_mutex) {
		spc_lock(l->l_cpu);
		lwp_unlock_to(l, l->l_cpu->ci_schedstate.spc_mutex);
	}

	sched_setrunnable(l);
	l->l_stat = LSRUN;
	l->l_slptime = 0;

	if (l->l_flag & LW_INMEM) {
		sched_enqueue(l, false);
		resched_cpu(l);
		lwp_unlock(l);
	} else {
		lwp_unlock(l);
		uvm_kick_scheduler();
	}
}

/*
 * suspendsched:
 *
 *	Convert all non-PK_SYSTEM LSSLEEP or LSRUN LWPs to LSSUSPENDED.
 */
void
suspendsched(void)
{
	CPU_INFO_ITERATOR cii;
	struct cpu_info *ci;
	struct lwp *l;
	struct proc *p;

	/*
	 * We do this by process in order not to violate the locking rules.
	 */
	mutex_enter(&proclist_mutex);
	PROCLIST_FOREACH(p, &allproc) {
		mutex_enter(&p->p_smutex);

		if ((p->p_flag & PK_SYSTEM) != 0) {
			mutex_exit(&p->p_smutex);
			continue;
		}

		p->p_stat = SSTOP;

		LIST_FOREACH(l, &p->p_lwps, l_sibling) {
			if (l == curlwp)
				continue;

			lwp_lock(l);

			/*
			 * Set LW_WREBOOT so that the LWP will suspend itself
			 * when it tries to return to user mode.  We want to
			 * get as many LWPs as possible to the user / kernel
			 * boundary, so that they will release any locks
			 * that they hold.
			 */
			l->l_flag |= (LW_WREBOOT | LW_WSUSPEND);

			if (l->l_stat == LSSLEEP &&
			    (l->l_flag & LW_SINTR) != 0) {
				/* setrunnable() will release the lock. */
				setrunnable(l);
				continue;
			}

			lwp_unlock(l);
		}

		mutex_exit(&p->p_smutex);
	}
	mutex_exit(&proclist_mutex);

	/*
	 * Kick all CPUs to make them preempt any LWPs running in user mode.
	 * They'll trap into the kernel and suspend themselves in userret().
	 */
	for (CPU_INFO_FOREACH(cii, ci))
		cpu_need_resched(ci, 0);
}

/*
 * sched_kpri:
 *
 *	Scale a priority level to a kernel priority level, usually
 *	for an LWP that is about to sleep.
 */
pri_t
sched_kpri(struct lwp *l)
{
	/*
	 * Scale user priorities (127 -> 50) up to kernel priorities
	 * in the range (49 -> 8).  Reserve the top 8 kernel priorities
	 * for high priority kthreads.  Kernel priorities passed in
	 * are left "as is".  XXX This is somewhat arbitrary.
	 */
	static const uint8_t kpri_tab[] = {
		0, 1, 2, 3, 4, 5, 6, 7,
		8, 9, 10, 11, 12, 13, 14, 15,
		16, 17, 18, 19, 20, 21, 22, 23,
		24, 25, 26, 27, 28, 29, 30, 31,
		32, 33, 34, 35, 36, 37, 38, 39,
		40, 41, 42, 43, 44, 45, 46, 47,
		48, 49, 8, 8, 9, 9, 10, 10,
		11, 11, 12, 12, 13, 14, 14, 15,
		15, 16, 16, 17, 17, 18, 18, 19,
		20, 20, 21, 21, 22, 22, 23, 23,
		24, 24, 25, 26, 26, 27, 27, 28,
		28, 29, 29, 30, 30, 31, 32, 32,
		33, 33, 34, 34, 35, 35, 36, 36,
		37, 38, 38, 39, 39, 40, 40, 41,
		41, 42, 42, 43, 44, 44, 45, 45,
		46, 46, 47, 47, 48, 48, 49, 49,
	};

	return (pri_t)kpri_tab[l->l_usrpri];
}
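
/*
 * Editorial note: worked examples of the mapping above.  A kernel
 * priority passed in is returned unchanged (kpri_tab[20] == 20), the
 * strongest user priority 50 maps to 8, the strongest kernel priority
 * not reserved for kthreads, and the weakest user priority 127 maps
 * to 49 (smaller numbers mean higher priority in this scheme).
 */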

/*
 * sched_unsleep:
 *
 *	This is called when the LWP has not been awoken normally but instead
 *	interrupted: for example, if the sleep timed out.  Because of this,
 *	it's not a valid action for running or idle LWPs.
 */
static void
sched_unsleep(struct lwp *l)
{

	lwp_unlock(l);
	panic("sched_unsleep");
}

inline void
resched_cpu(struct lwp *l)
{
	struct cpu_info *ci;
	const pri_t pri = lwp_eprio(l);

	/*
	 * XXXSMP
	 * Since l->l_cpu persists across a context switch,
	 * this gives us *very weak* processor affinity, in
	 * that we notify the CPU on which the process last
	 * ran that it should try to switch.
	 *
	 * This does not guarantee that the process will run on
	 * that processor next, because another processor might
	 * grab it the next time it performs a context switch.
	 *
	 * This also does not handle the case where its last
	 * CPU is running a higher-priority process, but every
	 * other CPU is running a lower-priority process.  There
	 * are ways to handle this situation, but they're not
	 * currently very pretty, and we also need to weigh the
	 * cost of moving a process from one CPU to another.
	 */
	ci = (l->l_cpu != NULL) ? l->l_cpu : curcpu();
	if (pri < ci->ci_schedstate.spc_curpriority)
		cpu_need_resched(ci, 0);
}

static void
sched_changepri(struct lwp *l, pri_t pri)
{

	KASSERT(lwp_locked(l, NULL));

	l->l_usrpri = pri;
	if (l->l_priority < PUSER)
		return;

	if (l->l_stat != LSRUN || (l->l_flag & LW_INMEM) == 0) {
		l->l_priority = pri;
		return;
	}

	KASSERT(lwp_locked(l, l->l_cpu->ci_schedstate.spc_mutex));

	sched_dequeue(l);
	l->l_priority = pri;
	sched_enqueue(l, false);
	resched_cpu(l);
}

static void
sched_lendpri(struct lwp *l, pri_t pri)
{

	KASSERT(lwp_locked(l, NULL));

	if (l->l_stat != LSRUN || (l->l_flag & LW_INMEM) == 0) {
		l->l_inheritedprio = pri;
		return;
	}

	KASSERT(lwp_locked(l, l->l_cpu->ci_schedstate.spc_mutex));

	sched_dequeue(l);
	l->l_inheritedprio = pri;
	sched_enqueue(l, false);
	resched_cpu(l);
}

struct lwp *
syncobj_noowner(wchan_t wchan)
{

	return NULL;
}


/* decay 95% of `p_pctcpu' in 60 seconds; see CCPU_SHIFT before changing */
fixpt_t	ccpu = 0.95122942450071400909 * FSCALE;		/* exp(-1/20) */

/*
 * If `ccpu' is not equal to `exp(-1/20)' and you still want to use the
 * faster/more-accurate formula, you'll have to estimate CCPU_SHIFT below
 * and possibly adjust FSHIFT in "param.h" so that (FSHIFT >= CCPU_SHIFT).
 *
 * To estimate CCPU_SHIFT for exp(-1/20), the following formula was used:
 *	1 - exp(-1/20) ~= 0.0487 ~= 0.0488 == 1 (fixed pt, *11* bits).
 *
 * If you don't want to bother with the faster/more-accurate formula, you
 * can set CCPU_SHIFT to (FSHIFT + 1) which will use a slower/less-accurate
 * (more general) method of calculating the percentage of CPU used by a
 * process.
 */
#define	CCPU_SHIFT	(FSHIFT + 1)
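
/*
 * Editorial note, assuming the usual FSHIFT of 11 (FSCALE == 2048):
 * ccpu == 2048 * exp(-1/20) ~= 1948.  Once per second sched_pstats()
 * computes pctcpu = (pctcpu * ccpu) >> FSHIFT, i.e. it multiplies by
 * roughly 0.9512, so after 60 updates an idle LWP's %CPU has decayed
 * by a factor of exp(-60/20) == exp(-3) ~= 0.05, which is the
 * "decay 95% in 60 seconds" behaviour noted above.
 */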

/*
 * sched_pstats:
 *
 *	Update process statistics and check CPU resource allocation.
 *	Call scheduler-specific hook to eventually adjust process/LWP
 *	priorities.
 */
/* ARGSUSED */
void
sched_pstats(void *arg)
{
	struct rlimit *rlim;
	struct lwp *l;
	struct proc *p;
	int minslp, sig, clkhz;
	long runtm;

	sched_pstats_ticks++;

	mutex_enter(&proclist_mutex);
	PROCLIST_FOREACH(p, &allproc) {
		/*
		 * Increment time in/out of memory and sleep time (if
		 * sleeping).  We ignore overflow; with 16-bit int's
		 * (remember them?) overflow takes 45 days.
		 */
		minslp = 2;
		mutex_enter(&p->p_smutex);
		mutex_spin_enter(&p->p_stmutex);
		runtm = p->p_rtime.tv_sec;
		LIST_FOREACH(l, &p->p_lwps, l_sibling) {
			if ((l->l_flag & LW_IDLE) != 0)
				continue;
			lwp_lock(l);
			runtm += l->l_rtime.tv_sec;
			l->l_swtime++;
			if (l->l_stat == LSSLEEP || l->l_stat == LSSTOP ||
			    l->l_stat == LSSUSPENDED) {
				l->l_slptime++;
				minslp = min(minslp, l->l_slptime);
			} else
				minslp = 0;
			sched_pstats_hook(l);
			lwp_unlock(l);

			/*
			 * p_pctcpu is only for ps.
			 */
			l->l_pctcpu = (l->l_pctcpu * ccpu) >> FSHIFT;
			if (l->l_slptime < 1) {
				clkhz = stathz != 0 ? stathz : hz;
#if (FSHIFT >= CCPU_SHIFT)
				l->l_pctcpu += (clkhz == 100) ?
				    ((fixpt_t)l->l_cpticks) <<
				    (FSHIFT - CCPU_SHIFT) :
				    100 * (((fixpt_t) p->p_cpticks)
				    << (FSHIFT - CCPU_SHIFT)) / clkhz;
#else
				l->l_pctcpu += ((FSCALE - ccpu) *
				    (l->l_cpticks * FSCALE / clkhz)) >> FSHIFT;
#endif
				l->l_cpticks = 0;
			}
		}

		p->p_pctcpu = (p->p_pctcpu * ccpu) >> FSHIFT;
#ifdef SCHED_4BSD
		/*
		 * XXX: Workaround - belongs to sched_4bsd.c
		 * If the process has slept the entire second,
		 * stop recalculating its priority until it wakes up.
		 */
		if (minslp <= 1) {
			extern fixpt_t decay_cpu(fixpt_t, fixpt_t);

			fixpt_t loadfac = 2 * (averunnable.ldavg[0]);
			p->p_estcpu = decay_cpu(loadfac, p->p_estcpu);
		}
#endif
		mutex_spin_exit(&p->p_stmutex);

		/*
		 * Check if the process exceeds its CPU resource allocation.
		 * If over max, kill it.
		 */
		rlim = &p->p_rlimit[RLIMIT_CPU];
		sig = 0;
		if (runtm >= rlim->rlim_cur) {
			if (runtm >= rlim->rlim_max)
				sig = SIGKILL;
			else {
				sig = SIGXCPU;
				if (rlim->rlim_cur < rlim->rlim_max)
					rlim->rlim_cur += 5;
			}
		}
		mutex_exit(&p->p_smutex);
		if (sig) {
			psignal(p, sig);
		}
	}
	mutex_exit(&proclist_mutex);
	uvm_meter();
	cv_wakeup(&lbolt);
	callout_schedule(&sched_pstats_ch, hz);
}

void
sched_init(void)
{

	cv_init(&lbolt, "lbolt");
	callout_init(&sched_pstats_ch, 0);
	callout_setfunc(&sched_pstats_ch, sched_pstats, NULL);
	sched_setup();
	sched_pstats(NULL);
}