/*	$NetBSD: kern_synch.c,v 1.202 2007/10/29 12:16:54 yamt Exp $	*/

/*-
 * Copyright (c) 1999, 2000, 2004, 2006, 2007 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
 * NASA Ames Research Center, by Charles M. Hannum, Andrew Doran and
 * Daniel Sieger.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the NetBSD
 *	Foundation, Inc. and its contributors.
 * 4. Neither the name of The NetBSD Foundation nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*-
 * Copyright (c) 1982, 1986, 1990, 1991, 1993
 *	The Regents of the University of California. All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)kern_synch.c	8.9 (Berkeley) 5/19/95
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: kern_synch.c,v 1.202 2007/10/29 12:16:54 yamt Exp $");

#include "opt_kstack.h"
#include "opt_lockdebug.h"
#include "opt_multiprocessor.h"
#include "opt_perfctrs.h"
#include "opt_sched.h"

#define	__MUTEX_PRIVATE

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/kernel.h>
#if defined(PERFCTRS)
#include <sys/pmc.h>
#endif
#include <sys/cpu.h>
#include <sys/resourcevar.h>
#include <sys/sched.h>
#include <sys/syscall_stats.h>
#include <sys/sleepq.h>
#include <sys/lockdebug.h>
#include <sys/evcnt.h>
#include <sys/intr.h>

#include <uvm/uvm_extern.h>

callout_t sched_pstats_ch;
unsigned int sched_pstats_ticks;

kcondvar_t	lbolt;			/* once a second sleep address */

static void sched_unsleep(struct lwp *);
static void sched_changepri(struct lwp *, pri_t);
static void sched_lendpri(struct lwp *, pri_t);

syncobj_t sleep_syncobj = {
	SOBJ_SLEEPQ_SORTED,
	sleepq_unsleep,
	sleepq_changepri,
	sleepq_lendpri,
	syncobj_noowner,
};

syncobj_t sched_syncobj = {
	SOBJ_SLEEPQ_SORTED,
	sched_unsleep,
	sched_changepri,
	sched_lendpri,
	syncobj_noowner,
};

/*
 * During autoconfiguration or after a panic, a sleep will simply lower the
 * priority briefly to allow interrupts, then return. The priority to be
 * used (safepri) is machine-dependent, thus this value is initialized and
 * maintained in the machine-dependent layers. This priority will typically
 * be 0, or the lowest priority that is safe for use on the interrupt stack;
 * it can be made higher to block network software interrupts after panics.
 */
int	safepri;

/*
 * OBSOLETE INTERFACE
 *
 * General sleep call. Suspends the current process until a wakeup is
 * performed on the specified identifier. The process will then be made
 * runnable with the specified priority. Sleeps at most timo/hz seconds (0
 * means no timeout). If pri includes PCATCH flag, signals are checked
 * before and after sleeping, else signals are not checked. Returns 0 if
 * awakened, EWOULDBLOCK if the timeout expires. If PCATCH is set and a
 * signal needs to be delivered, ERESTART is returned if the current system
 * call should be restarted if possible, and EINTR is returned if the system
 * call should be interrupted by the signal.
 *
 * The interlock is held until we are on a sleep queue. The interlock will
 * be locked before returning back to the caller unless the PNORELOCK flag
 * is specified, in which case the interlock will always be unlocked upon
 * return.
 */
int
ltsleep(wchan_t ident, pri_t priority, const char *wmesg, int timo,
    volatile struct simplelock *interlock)
{
	struct lwp *l = curlwp;
	sleepq_t *sq;
	int error;

	if (sleepq_dontsleep(l)) {
		(void)sleepq_abort(NULL, 0);
		if ((priority & PNORELOCK) != 0)
			simple_unlock(interlock);
		return 0;
	}

	sq = sleeptab_lookup(&sleeptab, ident);
	sleepq_enter(sq, l);
	sleepq_enqueue(sq, priority & PRIMASK, ident, wmesg, &sleep_syncobj);

	if (interlock != NULL) {
		LOCK_ASSERT(simple_lock_held(interlock));
		simple_unlock(interlock);
	}

	error = sleepq_block(timo, priority & PCATCH);

	if (interlock != NULL && (priority & PNORELOCK) == 0)
		simple_lock(interlock);

	return error;
}
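
/*
 * Illustrative sketch (not compiled): the classic sleep/wakeup pattern
 * built on the obsolete ltsleep() interface. The "example_softc"
 * structure, its sc_lock/sc_ready fields and the example_* functions
 * are hypothetical; they only show how the interlock, PCATCH and
 * wakeup() fit together. New code should use condition variables.
 */
#if 0
struct example_softc {
	struct simplelock	sc_lock;	/* protects sc_ready */
	int			sc_ready;	/* condition being waited on */
};

static int
example_wait(struct example_softc *sc)
{
	int error = 0;

	simple_lock(&sc->sc_lock);
	while (sc->sc_ready == 0) {
		/* Drops sc_lock while asleep, reacquires it on wakeup. */
		error = ltsleep(&sc->sc_ready, PWAIT | PCATCH, "exwait", 0,
		    &sc->sc_lock);
		if (error != 0)
			break;
	}
	simple_unlock(&sc->sc_lock);
	return error;
}

static void
example_post(struct example_softc *sc)
{

	simple_lock(&sc->sc_lock);
	sc->sc_ready = 1;
	simple_unlock(&sc->sc_lock);
	wakeup(&sc->sc_ready);	/* wake every LWP sleeping on &sc->sc_ready */
}
#endif	/* illustrative sketch */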

int
mtsleep(wchan_t ident, pri_t priority, const char *wmesg, int timo,
    kmutex_t *mtx)
{
	struct lwp *l = curlwp;
	sleepq_t *sq;
	int error;

	if (sleepq_dontsleep(l)) {
		(void)sleepq_abort(mtx, (priority & PNORELOCK) != 0);
		return 0;
	}

	sq = sleeptab_lookup(&sleeptab, ident);
	sleepq_enter(sq, l);
	sleepq_enqueue(sq, priority & PRIMASK, ident, wmesg, &sleep_syncobj);
	mutex_exit(mtx);
	error = sleepq_block(timo, priority & PCATCH);

	if ((priority & PNORELOCK) == 0)
		mutex_enter(mtx);

	return error;
}

/*
 * General sleep call for situations where a wake-up is not expected.
 */
int
kpause(const char *wmesg, bool intr, int timo, kmutex_t *mtx)
{
	struct lwp *l = curlwp;
	sleepq_t *sq;
	int error;

	if (sleepq_dontsleep(l))
		return sleepq_abort(NULL, 0);

	if (mtx != NULL)
		mutex_exit(mtx);
	sq = sleeptab_lookup(&sleeptab, l);
	sleepq_enter(sq, l);
	sleepq_enqueue(sq, sched_kpri(l), l, wmesg, &sleep_syncobj);
	error = sleepq_block(timo, intr);
	if (mtx != NULL)
		mutex_enter(mtx);

	return error;
}
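
/*
 * Illustrative sketch (not compiled): kpause() used as a simple in-kernel
 * delay. The example_poll_settle() function and the one-second delay are
 * hypothetical; they only show the wmesg/intr/timo/mtx arguments in use.
 */
#if 0
static void
example_poll_settle(void)
{

	/*
	 * Sleep for roughly one second without expecting a wakeup.
	 * No mutex is dropped and reacquired (mtx == NULL), and the
	 * sleep is not interruptible by signals (intr == false).
	 */
	(void)kpause("exsettle", false, hz, NULL);
}
#endif	/* illustrative sketch */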

/*
 * OBSOLETE INTERFACE
 *
 * Make all processes sleeping on the specified identifier runnable.
 */
void
wakeup(wchan_t ident)
{
	sleepq_t *sq;

	if (cold)
		return;

	sq = sleeptab_lookup(&sleeptab, ident);
	sleepq_wake(sq, ident, (u_int)-1);
}

/*
 * OBSOLETE INTERFACE
 *
 * Make the highest priority process first in line on the specified
 * identifier runnable.
 */
void
wakeup_one(wchan_t ident)
{
	sleepq_t *sq;

	if (cold)
		return;

	sq = sleeptab_lookup(&sleeptab, ident);
	sleepq_wake(sq, ident, 1);
}


/*
 * General yield call. Puts the current process back on its run queue and
 * performs a voluntary context switch. Should only be called when the
 * current process explicitly requests it (eg sched_yield(2)).
 */
void
yield(void)
{
	struct lwp *l = curlwp;

	KERNEL_UNLOCK_ALL(l, &l->l_biglocks);
	lwp_lock(l);
	KASSERT(lwp_locked(l, &l->l_cpu->ci_schedstate.spc_lwplock));
	KASSERT(l->l_stat == LSONPROC);
	/* XXX Only do this for timeshared threads. */
	l->l_priority = MAXPRI;
	(void)mi_switch(l);
	KERNEL_LOCK(l->l_biglocks, l);
}

/*
 * General preemption call. Puts the current process back on its run queue
 * and performs an involuntary context switch.
 */
void
preempt(void)
{
	struct lwp *l = curlwp;

	KERNEL_UNLOCK_ALL(l, &l->l_biglocks);
	lwp_lock(l);
	KASSERT(lwp_locked(l, &l->l_cpu->ci_schedstate.spc_lwplock));
	KASSERT(l->l_stat == LSONPROC);
	l->l_priority = l->l_usrpri;
	l->l_nivcsw++;
	(void)mi_switch(l);
	KERNEL_LOCK(l->l_biglocks, l);
}

/*
 * Compute the amount of time during which the current lwp was running.
 *
 * - update l_rtime unless it's an idle lwp.
 */

void
updatertime(lwp_t *l, const struct timeval *tv)
{
	long s, u;

	if ((l->l_flag & LW_IDLE) != 0)
		return;

	u = l->l_rtime.tv_usec + (tv->tv_usec - l->l_stime.tv_usec);
	s = l->l_rtime.tv_sec + (tv->tv_sec - l->l_stime.tv_sec);
	if (u < 0) {
		u += 1000000;
		s--;
	} else if (u >= 1000000) {
		u -= 1000000;
		s++;
	}
	l->l_rtime.tv_usec = u;
	l->l_rtime.tv_sec = s;
}

/*
 * The machine independent parts of context switch.
 *
 * Returns 1 if another LWP was actually run.
 */
int
mi_switch(lwp_t *l)
{
	struct schedstate_percpu *spc;
	struct lwp *newl;
	int retval, oldspl;
	struct cpu_info *ci;
	struct timeval tv;
	bool returning;

	KASSERT(lwp_locked(l, NULL));
	LOCKDEBUG_BARRIER(l->l_mutex, 1);

#ifdef KSTACK_CHECK_MAGIC
	kstack_check_magic(l);
#endif

	microtime(&tv);

	/*
	 * It's safe to read the per CPU schedstate unlocked here, as all we
	 * are after is the run time and that's guaranteed to have been last
	 * updated by this CPU.
	 */
	ci = l->l_cpu;
	KDASSERT(ci == curcpu());

	/*
	 * Process is about to yield the CPU; clear the appropriate
	 * scheduling flags.
	 */
	spc = &ci->ci_schedstate;
	returning = false;
	newl = NULL;

	/*
	 * If we have been asked to switch to a specific LWP, then there
	 * is no need to inspect the run queues. If a soft interrupt is
	 * blocking, then return to the interrupted thread without adjusting
	 * VM context or its start time: neither have been changed in order
	 * to take the interrupt.
	 */
	if (l->l_switchto != NULL) {
		if ((l->l_flag & LW_INTR) != 0) {
			returning = true;
			softint_block(l);
			if ((l->l_flag & LW_TIMEINTR) != 0)
				updatertime(l, &tv);
		}
		newl = l->l_switchto;
		l->l_switchto = NULL;
	}

	/* Count time spent in current system call */
	if (!returning) {
		SYSCALL_TIME_SLEEP(l);

		/*
		 * XXXSMP If we are using h/w performance counters,
		 * save context.
		 */
#if PERFCTRS
		if (PMC_ENABLED(l->l_proc)) {
			pmc_save_context(l->l_proc);
		}
#endif
		updatertime(l, &tv);
	}

	/*
	 * If on the CPU and we have gotten this far, then we must yield.
	 */
	mutex_spin_enter(spc->spc_mutex);
	KASSERT(l->l_stat != LSRUN);
	if (l->l_stat == LSONPROC) {
		KASSERT(lwp_locked(l, &spc->spc_lwplock));
		if ((l->l_flag & LW_IDLE) == 0) {
			l->l_stat = LSRUN;
			lwp_setlock(l, spc->spc_mutex);
			sched_enqueue(l, true);
		} else
			l->l_stat = LSIDL;
	}

	/*
	 * Let sched_nextlwp() select the LWP to run on the CPU next.
	 * If no LWP is runnable, switch to the idle LWP.
	 * Note that spc_lwplock might not necessarily be held.
	 */
	if (newl == NULL) {
		newl = sched_nextlwp();
		if (newl != NULL) {
			sched_dequeue(newl);
			KASSERT(lwp_locked(newl, spc->spc_mutex));
			newl->l_stat = LSONPROC;
			newl->l_cpu = ci;
			newl->l_flag |= LW_RUNNING;
			lwp_setlock(newl, &spc->spc_lwplock);
		} else {
			newl = ci->ci_data.cpu_idlelwp;
			newl->l_stat = LSONPROC;
			newl->l_flag |= LW_RUNNING;
		}
		ci->ci_want_resched = 0;
		spc->spc_flags &= ~SPCF_SWITCHCLEAR;
	}

	/* Update the new LWP's start time while it is still locked. */
	if (!returning) {
		newl->l_stime = tv;
		/*
		 * XXX The following may be done unlocked if newl != NULL
		 * above.
		 */
		newl->l_priority = newl->l_usrpri;
	}

	spc->spc_curpriority = newl->l_usrpri;

	if (l != newl) {
		struct lwp *prevlwp;

		/*
		 * If the old LWP has been moved to a run queue above,
		 * drop the general purpose LWP lock: it's now locked
		 * by the scheduler lock.
		 *
		 * Otherwise, drop the scheduler lock. We're done with
		 * the run queues for now.
		 */
		if (l->l_mutex == spc->spc_mutex) {
			mutex_spin_exit(&spc->spc_lwplock);
		} else {
			mutex_spin_exit(spc->spc_mutex);
		}

		/* Unlocked, but for statistics only. */
		uvmexp.swtch++;

		/*
		 * Save old VM context, unless a soft interrupt
		 * handler is blocking.
		 */
		if (!returning)
			pmap_deactivate(l);

		/* Switch to the new LWP.. */
		l->l_ncsw++;
		l->l_flag &= ~LW_RUNNING;
		oldspl = MUTEX_SPIN_OLDSPL(ci);
		prevlwp = cpu_switchto(l, newl);

		/*
		 * .. we have switched away and are now back so we must
		 * be the new curlwp. prevlwp is who we replaced.
		 */
		if (prevlwp != NULL) {
			curcpu()->ci_mtx_oldspl = oldspl;
			lwp_unlock(prevlwp);
		} else {
			splx(oldspl);
		}

		/* Restore VM context. */
		pmap_activate(l);
		retval = 1;
	} else {
		/* Nothing to do - just unlock and return. */
		mutex_spin_exit(spc->spc_mutex);
		lwp_unlock(l);
		retval = 0;
	}

	KASSERT(l == curlwp);
	KASSERT(l->l_stat == LSONPROC);
	KASSERT(l->l_cpu == curcpu());

	/*
	 * XXXSMP If we are using h/w performance counters, restore context.
	 */
#if PERFCTRS
	if (PMC_ENABLED(l->l_proc)) {
		pmc_restore_context(l->l_proc);
	}
#endif

	/*
	 * We're running again; record our new start time. We might
	 * be running on a new CPU now, so don't use the cached
	 * schedstate_percpu pointer.
	 */
	SYSCALL_TIME_WAKEUP(l);
	KASSERT(curlwp == l);
	KDASSERT(l->l_cpu == curcpu());
	LOCKDEBUG_BARRIER(NULL, 1);

	return retval;
}

/*
 * Change process state to be runnable, placing it on the run queue if it is
 * in memory, and awakening the swapper if it isn't in memory.
 *
 * Call with the process and LWP locked. Will return with the LWP unlocked.
 */
void
setrunnable(struct lwp *l)
{
	struct proc *p = l->l_proc;
	sigset_t *ss;

	KASSERT((l->l_flag & LW_IDLE) == 0);
	KASSERT(mutex_owned(&p->p_smutex));
	KASSERT(lwp_locked(l, NULL));

	switch (l->l_stat) {
	case LSSTOP:
		/*
		 * If we're being traced (possibly because someone attached us
		 * while we were stopped), check for a signal from the debugger.
		 */
		if ((p->p_slflag & PSL_TRACED) != 0 && p->p_xstat != 0) {
			if ((sigprop[p->p_xstat] & SA_TOLWP) != 0)
				ss = &l->l_sigpend.sp_set;
			else
				ss = &p->p_sigpend.sp_set;
			sigaddset(ss, p->p_xstat);
			signotify(l);
		}
		p->p_nrlwps++;
		break;
	case LSSUSPENDED:
		l->l_flag &= ~LW_WSUSPEND;
		p->p_nrlwps++;
		cv_broadcast(&p->p_lwpcv);
		break;
	case LSSLEEP:
		KASSERT(l->l_wchan != NULL);
		break;
	default:
		panic("setrunnable: lwp %p state was %d", l, l->l_stat);
	}

	/*
	 * If the LWP was sleeping interruptibly, then it's OK to start it
	 * again. If not, mark it as still sleeping.
	 */
	if (l->l_wchan != NULL) {
		l->l_stat = LSSLEEP;
		/* lwp_unsleep() will release the lock. */
		lwp_unsleep(l);
		return;
	}

	/*
	 * If the LWP is still on the CPU, mark it as LSONPROC. It may be
	 * about to call mi_switch(), in which case it will yield.
	 */
	if ((l->l_flag & LW_RUNNING) != 0) {
		l->l_stat = LSONPROC;
		l->l_slptime = 0;
		lwp_unlock(l);
		return;
	}

	/*
	 * Set the LWP runnable. If it's swapped out, we need to wake the swapper
	 * to bring it back in. Otherwise, enter it into a run queue.
	 */
	if (l->l_mutex != l->l_cpu->ci_schedstate.spc_mutex) {
		spc_lock(l->l_cpu);
		lwp_unlock_to(l, l->l_cpu->ci_schedstate.spc_mutex);
	}

	sched_setrunnable(l);
	l->l_stat = LSRUN;
	l->l_slptime = 0;

	if (l->l_flag & LW_INMEM) {
		sched_enqueue(l, false);
		resched_cpu(l);
		lwp_unlock(l);
	} else {
		lwp_unlock(l);
		uvm_kick_scheduler();
	}
}

/*
 * suspendsched:
 *
 *	Convert all non-L_SYSTEM LSSLEEP or LSRUN LWPs to LSSUSPENDED.
 */
void
suspendsched(void)
{
	CPU_INFO_ITERATOR cii;
	struct cpu_info *ci;
	struct lwp *l;
	struct proc *p;

	/*
	 * We do this by process in order not to violate the locking rules.
	 */
	mutex_enter(&proclist_mutex);
	PROCLIST_FOREACH(p, &allproc) {
		mutex_enter(&p->p_smutex);

		if ((p->p_flag & PK_SYSTEM) != 0) {
			mutex_exit(&p->p_smutex);
			continue;
		}

		p->p_stat = SSTOP;

		LIST_FOREACH(l, &p->p_lwps, l_sibling) {
			if (l == curlwp)
				continue;

			lwp_lock(l);

			/*
			 * Set L_WREBOOT so that the LWP will suspend itself
			 * when it tries to return to user mode. We want to
			 * try to get as many LWPs as possible to the
			 * user / kernel boundary, so that they will
			 * release any locks that they hold.
			 */
			l->l_flag |= (LW_WREBOOT | LW_WSUSPEND);

			if (l->l_stat == LSSLEEP &&
			    (l->l_flag & LW_SINTR) != 0) {
				/* setrunnable() will release the lock. */
				setrunnable(l);
				continue;
			}

			lwp_unlock(l);
		}

		mutex_exit(&p->p_smutex);
	}
	mutex_exit(&proclist_mutex);

	/*
	 * Kick all CPUs to make them preempt any LWPs running in user mode.
	 * They'll trap into the kernel and suspend themselves in userret().
	 */
	for (CPU_INFO_FOREACH(cii, ci))
		cpu_need_resched(ci, 0);
}

/*
 * sched_kpri:
 *
 *	Scale a priority level to a kernel priority level, usually
 *	for an LWP that is about to sleep.
 */
pri_t
sched_kpri(struct lwp *l)
{
	/*
	 * Scale user priorities (127 -> 50) up to kernel priorities
	 * in the range (49 -> 8). Reserve the top 8 kernel priorities
	 * for high priority kthreads. Kernel priorities passed in
	 * are left "as is". XXX This is somewhat arbitrary.
	 */
	static const uint8_t kpri_tab[] = {
		0, 1, 2, 3, 4, 5, 6, 7,
		8, 9, 10, 11, 12, 13, 14, 15,
		16, 17, 18, 19, 20, 21, 22, 23,
		24, 25, 26, 27, 28, 29, 30, 31,
		32, 33, 34, 35, 36, 37, 38, 39,
		40, 41, 42, 43, 44, 45, 46, 47,
		48, 49, 8, 8, 9, 9, 10, 10,
		11, 11, 12, 12, 13, 14, 14, 15,
		15, 16, 16, 17, 17, 18, 18, 19,
		20, 20, 21, 21, 22, 22, 23, 23,
		24, 24, 25, 26, 26, 27, 27, 28,
		28, 29, 29, 30, 30, 31, 32, 32,
		33, 33, 34, 34, 35, 35, 36, 36,
		37, 38, 38, 39, 39, 40, 40, 41,
		41, 42, 42, 43, 44, 44, 45, 45,
		46, 46, 47, 47, 48, 48, 49, 49,
	};

	return (pri_t)kpri_tab[l->l_usrpri];
}
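
/*
 * Worked examples of the mapping above, read directly from kpri_tab[]:
 * a kernel priority passed in, such as 20, comes back unchanged, while
 * the user range 50..127 is compressed into kernel priorities 8..49, so
 * sched_kpri() returns 8 for an LWP with l_usrpri == 50 and 49 for an
 * LWP with l_usrpri == 127.
 */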

/*
 * sched_unsleep:
 *
 *	This is called when the LWP has not been awoken normally but instead
 *	interrupted: for example, if the sleep timed out. Because of this,
 *	it's not a valid action for running or idle LWPs.
 */
static void
sched_unsleep(struct lwp *l)
{

	lwp_unlock(l);
	panic("sched_unsleep");
}

inline void
resched_cpu(struct lwp *l)
{
	struct cpu_info *ci;
	const pri_t pri = lwp_eprio(l);

	/*
	 * XXXSMP
	 * Since l->l_cpu persists across a context switch,
	 * this gives us *very weak* processor affinity, in
	 * that we notify the CPU on which the process last
	 * ran that it should try to switch.
	 *
	 * This does not guarantee that the process will run on
	 * that processor next, because another processor might
	 * grab it the next time it performs a context switch.
	 *
	 * This also does not handle the case where its last
	 * CPU is running a higher-priority process, but every
	 * other CPU is running a lower-priority process. There
	 * are ways to handle this situation, but they're not
	 * currently very pretty, and we also need to weigh the
	 * cost of moving a process from one CPU to another.
	 */
	ci = (l->l_cpu != NULL) ? l->l_cpu : curcpu();
	if (pri < ci->ci_schedstate.spc_curpriority)
		cpu_need_resched(ci, 0);
}

static void
sched_changepri(struct lwp *l, pri_t pri)
{

	KASSERT(lwp_locked(l, NULL));

	l->l_usrpri = pri;
	if (l->l_priority < PUSER)
		return;

	if (l->l_stat != LSRUN || (l->l_flag & LW_INMEM) == 0) {
		l->l_priority = pri;
		return;
	}

	KASSERT(lwp_locked(l, l->l_cpu->ci_schedstate.spc_mutex));

	sched_dequeue(l);
	l->l_priority = pri;
	sched_enqueue(l, false);
	resched_cpu(l);
}

static void
sched_lendpri(struct lwp *l, pri_t pri)
{

	KASSERT(lwp_locked(l, NULL));

	if (l->l_stat != LSRUN || (l->l_flag & LW_INMEM) == 0) {
		l->l_inheritedprio = pri;
		return;
	}

	KASSERT(lwp_locked(l, l->l_cpu->ci_schedstate.spc_mutex));

	sched_dequeue(l);
	l->l_inheritedprio = pri;
	sched_enqueue(l, false);
	resched_cpu(l);
}

struct lwp *
syncobj_noowner(wchan_t wchan)
{

	return NULL;
}


/* decay 95% of `p_pctcpu' in 60 seconds; see CCPU_SHIFT before changing */
fixpt_t	ccpu = 0.95122942450071400909 * FSCALE;	/* exp(-1/20) */

/*
 * If `ccpu' is not equal to `exp(-1/20)' and you still want to use the
 * faster/more-accurate formula, you'll have to estimate CCPU_SHIFT below
 * and possibly adjust FSHIFT in "param.h" so that (FSHIFT >= CCPU_SHIFT).
 *
 * To estimate CCPU_SHIFT for exp(-1/20), the following formula was used:
 *	1 - exp(-1/20) ~= 0.0487 ~= 0.0488 == 1 (fixed pt, *11* bits).
 *
 * If you don't want to bother with the faster/more-accurate formula, you
 * can set CCPU_SHIFT to (FSHIFT + 1) which will use a slower/less-accurate
 * (more general) method of calculating the percentage of CPU used by a
 * process.
 */
#define	CCPU_SHIFT	(FSHIFT + 1)
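
/*
 * Worked example of the decay: sched_pstats() below runs once per second
 * (it reschedules itself with callout_schedule(&sched_pstats_ch, hz)) and
 * multiplies each l_pctcpu and p_pctcpu by ccpu = exp(-1/20) on every pass.
 * After 60 passes the accumulated factor is exp(-60/20) = exp(-3) ~= 0.0498,
 * i.e. roughly 95% of the old %cpu value has decayed away, matching the
 * comment at the definition of `ccpu' above.
 */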

/*
 * sched_pstats:
 *
 *	Update process statistics and check CPU resource allocation.
 *	Call scheduler-specific hook to eventually adjust process/LWP
 *	priorities.
 */
/* ARGSUSED */
void
sched_pstats(void *arg)
{
	struct rlimit *rlim;
	struct lwp *l;
	struct proc *p;
	int minslp, sig, clkhz;
	long runtm;

	sched_pstats_ticks++;

	mutex_enter(&proclist_mutex);
	PROCLIST_FOREACH(p, &allproc) {
		/*
		 * Increment time in/out of memory and sleep time (if
		 * sleeping). We ignore overflow; with 16-bit int's
		 * (remember them?) overflow takes 45 days.
		 */
		minslp = 2;
		mutex_enter(&p->p_smutex);
		mutex_spin_enter(&p->p_stmutex);
		runtm = p->p_rtime.tv_sec;
		LIST_FOREACH(l, &p->p_lwps, l_sibling) {
			if ((l->l_flag & LW_IDLE) != 0)
				continue;
			lwp_lock(l);
			runtm += l->l_rtime.tv_sec;
			l->l_swtime++;
			if (l->l_stat == LSSLEEP || l->l_stat == LSSTOP ||
			    l->l_stat == LSSUSPENDED) {
				l->l_slptime++;
				minslp = min(minslp, l->l_slptime);
			} else
				minslp = 0;
			sched_pstats_hook(l);
			lwp_unlock(l);

			/*
			 * p_pctcpu is only for ps.
			 */
			l->l_pctcpu = (l->l_pctcpu * ccpu) >> FSHIFT;
			if (l->l_slptime < 1) {
				clkhz = stathz != 0 ? stathz : hz;
#if	(FSHIFT >= CCPU_SHIFT)
				l->l_pctcpu += (clkhz == 100) ?
				    ((fixpt_t)l->l_cpticks) <<
				    (FSHIFT - CCPU_SHIFT) :
				    100 * (((fixpt_t) p->p_cpticks)
				    << (FSHIFT - CCPU_SHIFT)) / clkhz;
#else
				l->l_pctcpu += ((FSCALE - ccpu) *
				    (l->l_cpticks * FSCALE / clkhz)) >> FSHIFT;
#endif
				l->l_cpticks = 0;
			}
		}

		p->p_pctcpu = (p->p_pctcpu * ccpu) >> FSHIFT;
#ifdef SCHED_4BSD
		/*
		 * XXX: Workaround - belongs to sched_4bsd.c
		 * If the process has slept the entire second,
		 * stop recalculating its priority until it wakes up.
		 */
		if (minslp <= 1) {
			extern fixpt_t decay_cpu(fixpt_t, fixpt_t);

			fixpt_t loadfac = 2 * (averunnable.ldavg[0]);
			p->p_estcpu = decay_cpu(loadfac, p->p_estcpu);
		}
#endif
		mutex_spin_exit(&p->p_stmutex);

		/*
		 * Check if the process exceeds its CPU resource allocation.
		 * If over max, kill it.
		 */
		rlim = &p->p_rlimit[RLIMIT_CPU];
		sig = 0;
		if (runtm >= rlim->rlim_cur) {
			if (runtm >= rlim->rlim_max)
				sig = SIGKILL;
			else {
				sig = SIGXCPU;
				if (rlim->rlim_cur < rlim->rlim_max)
					rlim->rlim_cur += 5;
			}
		}
		mutex_exit(&p->p_smutex);
		if (sig) {
			psignal(p, sig);
		}
	}
	mutex_exit(&proclist_mutex);
	uvm_meter();
	cv_wakeup(&lbolt);
	callout_schedule(&sched_pstats_ch, hz);
}

void
sched_init(void)
{

	cv_init(&lbolt, "lbolt");
	callout_init(&sched_pstats_ch, 0);
	callout_setfunc(&sched_pstats_ch, sched_pstats, NULL);
	sched_setup();
	sched_pstats(NULL);
}