/*	$NetBSD: kern_synch.c,v 1.239 2008/04/30 00:30:56 ad Exp $	*/

/*-
 * Copyright (c) 1999, 2000, 2004, 2006, 2007, 2008 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
 * NASA Ames Research Center, by Charles M. Hannum, Andrew Doran and
 * Daniel Sieger.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*-
 * Copyright (c) 1982, 1986, 1990, 1991, 1993
 *	The Regents of the University of California. All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)kern_synch.c	8.9 (Berkeley) 5/19/95
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: kern_synch.c,v 1.239 2008/04/30 00:30:56 ad Exp $");

#include "opt_kstack.h"
#include "opt_lockdebug.h"
#include "opt_multiprocessor.h"
#include "opt_perfctrs.h"

#define	__MUTEX_PRIVATE

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/kernel.h>
#if defined(PERFCTRS)
#include <sys/pmc.h>
#endif
#include <sys/cpu.h>
#include <sys/resourcevar.h>
#include <sys/sched.h>
#include <sys/syscall_stats.h>
#include <sys/sleepq.h>
#include <sys/lockdebug.h>
#include <sys/evcnt.h>
#include <sys/intr.h>
#include <sys/lwpctl.h>
#include <sys/atomic.h>
#include <sys/simplelock.h>

#include <uvm/uvm_extern.h>

#include <dev/lockstat.h>

static u_int	sched_unsleep(struct lwp *, bool);
static void	sched_changepri(struct lwp *, pri_t);
static void	sched_lendpri(struct lwp *, pri_t);

syncobj_t sleep_syncobj = {
	SOBJ_SLEEPQ_SORTED,
	sleepq_unsleep,
	sleepq_changepri,
	sleepq_lendpri,
	syncobj_noowner,
};

syncobj_t sched_syncobj = {
	SOBJ_SLEEPQ_SORTED,
	sched_unsleep,
	sched_changepri,
	sched_lendpri,
	syncobj_noowner,
};

callout_t	sched_pstats_ch;
unsigned	sched_pstats_ticks;
kcondvar_t	lbolt;			/* once a second sleep address */

/* Preemption event counters */
static struct evcnt kpreempt_ev_crit;
static struct evcnt kpreempt_ev_klock;
static struct evcnt kpreempt_ev_ipl;
static struct evcnt kpreempt_ev_immed;

/*
 * During autoconfiguration or after a panic, a sleep will simply lower the
 * priority briefly to allow interrupts, then return. The priority to be
 * used (safepri) is machine-dependent, thus this value is initialized and
 * maintained in the machine-dependent layers. This priority will typically
 * be 0, or the lowest priority that is safe for use on the interrupt stack;
 * it can be made higher to block network software interrupts after panics.
 */
int	safepri;

void
sched_init(void)
{

	cv_init(&lbolt, "lbolt");
	callout_init(&sched_pstats_ch, CALLOUT_MPSAFE);
	callout_setfunc(&sched_pstats_ch, sched_pstats, NULL);

	evcnt_attach_dynamic(&kpreempt_ev_crit, EVCNT_TYPE_MISC, NULL,
	    "kpreempt", "defer: critical section");
	evcnt_attach_dynamic(&kpreempt_ev_klock, EVCNT_TYPE_MISC, NULL,
	    "kpreempt", "defer: kernel_lock");
	evcnt_attach_dynamic(&kpreempt_ev_ipl, EVCNT_TYPE_MISC, NULL,
	    "kpreempt", "defer: IPL");
	evcnt_attach_dynamic(&kpreempt_ev_immed, EVCNT_TYPE_MISC, NULL,
	    "kpreempt", "immediate");

	sched_pstats(NULL);
}

/*
 * OBSOLETE INTERFACE
 *
 * General sleep call. Suspends the current process until a wakeup is
 * performed on the specified identifier. The process will then be made
 * runnable with the specified priority. Sleeps at most timo/hz seconds (0
 * means no timeout). If pri includes the PCATCH flag, signals are checked
 * before and after sleeping, otherwise signals are not checked. Returns 0
 * if awakened, EWOULDBLOCK if the timeout expires. If PCATCH is set and a
 * signal needs to be delivered, ERESTART is returned if the current system
 * call should be restarted if possible, and EINTR is returned if the system
 * call should be interrupted by the signal.
 *
 * The interlock is held until we are on a sleep queue. The interlock will
 * be locked before returning back to the caller unless the PNORELOCK flag
 * is specified, in which case the interlock will always be unlocked upon
 * return.
 */
int
ltsleep(wchan_t ident, pri_t priority, const char *wmesg, int timo,
	volatile struct simplelock *interlock)
{
	struct lwp *l = curlwp;
	sleepq_t *sq;
	int error;

	KASSERT((l->l_pflag & LP_INTR) == 0);

	if (sleepq_dontsleep(l)) {
		(void)sleepq_abort(NULL, 0);
		if ((priority & PNORELOCK) != 0)
			simple_unlock(interlock);
		return 0;
	}

	l->l_kpriority = true;
	sq = sleeptab_lookup(&sleeptab, ident);
	sleepq_enter(sq, l);
	sleepq_enqueue(sq, ident, wmesg, &sleep_syncobj);

	if (interlock != NULL) {
		KASSERT(simple_lock_held(interlock));
		simple_unlock(interlock);
	}

	error = sleepq_block(timo, priority & PCATCH);

	if (interlock != NULL && (priority & PNORELOCK) == 0)
		simple_lock(interlock);

	return error;
}
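
/*
 * Illustrative sketch only (not from the original sources): a typical
 * ltsleep() consumer, assuming a hypothetical condition "resource_busy"
 * protected by the simplelock "resource_slock".
 *
 *	simple_lock(&resource_slock);
 *	while (resource_busy) {
 *		error = ltsleep(&resource_busy, PWAIT | PCATCH, "reswait",
 *		    0, &resource_slock);
 *		if (error != 0)
 *			break;
 *	}
 *	simple_unlock(&resource_slock);
 *
 * The interlock is only dropped once the caller is on the sleep queue, so
 * a wakeup(&resource_busy) issued by another thread cannot be lost between
 * the check of the condition and the sleep.
 */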

int
mtsleep(wchan_t ident, pri_t priority, const char *wmesg, int timo,
	kmutex_t *mtx)
{
	struct lwp *l = curlwp;
	sleepq_t *sq;
	int error;

	KASSERT((l->l_pflag & LP_INTR) == 0);

	if (sleepq_dontsleep(l)) {
		(void)sleepq_abort(mtx, (priority & PNORELOCK) != 0);
		return 0;
	}

	l->l_kpriority = true;
	sq = sleeptab_lookup(&sleeptab, ident);
	sleepq_enter(sq, l);
	sleepq_enqueue(sq, ident, wmesg, &sleep_syncobj);
	mutex_exit(mtx);
	error = sleepq_block(timo, priority & PCATCH);

	if ((priority & PNORELOCK) == 0)
		mutex_enter(mtx);

	return error;
}
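
/*
 * A minimal sketch (assumed names, not from this file) of the equivalent
 * mtsleep()/wakeup() pairing on a kmutex-protected condition:
 *
 *	mutex_enter(&sc_lock);
 *	while (!sc_ready)
 *		(void)mtsleep(&sc_ready, PWAIT, "scwait", 0, &sc_lock);
 *	mutex_exit(&sc_lock);
 *
 * The waking side sets sc_ready while holding sc_lock and then calls
 * wakeup(&sc_ready); new code would normally use a condvar instead of
 * this obsolete interface.
 */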

/*
 * General sleep call for situations where a wake-up is not expected.
 */
int
kpause(const char *wmesg, bool intr, int timo, kmutex_t *mtx)
{
	struct lwp *l = curlwp;
	sleepq_t *sq;
	int error;

	if (sleepq_dontsleep(l))
		return sleepq_abort(NULL, 0);

	if (mtx != NULL)
		mutex_exit(mtx);
	l->l_kpriority = true;
	sq = sleeptab_lookup(&sleeptab, l);
	sleepq_enter(sq, l);
	sleepq_enqueue(sq, l, wmesg, &sleep_syncobj);
	error = sleepq_block(timo, intr);
	if (mtx != NULL)
		mutex_enter(mtx);

	return error;
}
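
/*
 * For example (sketch only, assuming no lock needs to be dropped), code
 * that simply wants to back off for about a second could do:
 *
 *	(void)kpause("backoff", false, hz, NULL);
 *
 * i.e. sleep for roughly one second, not interruptible by signals, with
 * no mutex released around the sleep.
 */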

/*
 * OBSOLETE INTERFACE
 *
 * Make all processes sleeping on the specified identifier runnable.
 */
void
wakeup(wchan_t ident)
{
	sleepq_t *sq;

	if (cold)
		return;

	sq = sleeptab_lookup(&sleeptab, ident);
	sleepq_wake(sq, ident, (u_int)-1);
}

/*
 * OBSOLETE INTERFACE
 *
 * Make the highest priority process first in line on the specified
 * identifier runnable.
 */
void
wakeup_one(wchan_t ident)
{
	sleepq_t *sq;

	if (cold)
		return;

	sq = sleeptab_lookup(&sleeptab, ident);
	sleepq_wake(sq, ident, 1);
}


/*
 * General yield call. Puts the current process back on its run queue and
 * performs a voluntary context switch. Should only be called when the
 * current process explicitly requests it (e.g. sched_yield(2)).
 */
void
yield(void)
{
	struct lwp *l = curlwp;

	KERNEL_UNLOCK_ALL(l, &l->l_biglocks);
	lwp_lock(l);
	KASSERT(lwp_locked(l, l->l_cpu->ci_schedstate.spc_lwplock));
	KASSERT(l->l_stat == LSONPROC);
	l->l_kpriority = false;
	(void)mi_switch(l);
	KERNEL_LOCK(l->l_biglocks, l);
}

/*
 * General preemption call. Puts the current process back on its run queue
 * and performs an involuntary context switch.
 */
void
preempt(void)
{
	struct lwp *l = curlwp;

	KERNEL_UNLOCK_ALL(l, &l->l_biglocks);
	lwp_lock(l);
	KASSERT(lwp_locked(l, l->l_cpu->ci_schedstate.spc_lwplock));
	KASSERT(l->l_stat == LSONPROC);
	l->l_kpriority = false;
	l->l_nivcsw++;
	(void)mi_switch(l);
	KERNEL_LOCK(l->l_biglocks, l);
}

/*
 * Handle a request made by another agent to preempt the current LWP
 * in-kernel. Usually called when l_dopreempt may be non-zero.
 *
 * Character addresses for lockstat only.
 */
static char	in_critical_section;
static char	kernel_lock_held;
static char	spl_raised;
static char	is_softint;

bool
kpreempt(uintptr_t where)
{
	uintptr_t failed;
	lwp_t *l;
	int s, dop;

	l = curlwp;
	failed = 0;
	while ((dop = l->l_dopreempt) != 0) {
		if (l->l_stat != LSONPROC) {
			/*
			 * About to block (or die), let it happen.
			 * Doesn't really count as "preemption has
			 * been blocked", since we're going to
			 * context switch.
			 */
			l->l_dopreempt = 0;
			return true;
		}
		if (__predict_false((l->l_flag & LW_IDLE) != 0)) {
			/* Can't preempt idle loop, don't count as failure. */
			l->l_dopreempt = 0;
			return true;
		}
		if (__predict_false(l->l_nopreempt != 0)) {
			/* LWP holds preemption disabled, explicitly. */
			if ((dop & DOPREEMPT_COUNTED) == 0) {
				kpreempt_ev_crit.ev_count++;
			}
			failed = (uintptr_t)&in_critical_section;
			break;
		}
		if (__predict_false((l->l_pflag & LP_INTR) != 0)) {
			/* Can't preempt soft interrupts yet. */
			l->l_dopreempt = 0;
			failed = (uintptr_t)&is_softint;
			break;
		}
		s = splsched();
		if (__predict_false(l->l_blcnt != 0 ||
		    curcpu()->ci_biglock_wanted != NULL)) {
			/* Hold or want kernel_lock, code is not MT safe. */
			splx(s);
			if ((dop & DOPREEMPT_COUNTED) == 0) {
				kpreempt_ev_klock.ev_count++;
			}
			failed = (uintptr_t)&kernel_lock_held;
			break;
		}
		if (__predict_false(!cpu_kpreempt_enter(where, s))) {
			/*
			 * It may be that the IPL is too high.
			 * kpreempt_enter() can schedule an
			 * interrupt to retry later.
			 */
			splx(s);
			if ((dop & DOPREEMPT_COUNTED) == 0) {
				kpreempt_ev_ipl.ev_count++;
			}
			failed = (uintptr_t)&spl_raised;
			break;
		}
		/* Do it! */
		if (__predict_true((dop & DOPREEMPT_COUNTED) == 0)) {
			kpreempt_ev_immed.ev_count++;
		}
		lwp_lock(l);
		mi_switch(l);
		l->l_nopreempt++;
		splx(s);

		/* Take care of any MD cleanup. */
		cpu_kpreempt_exit(where);
		l->l_nopreempt--;
	}

	/* Record preemption failure for reporting via lockstat. */
	if (__predict_false(failed)) {
		atomic_or_uint(&l->l_dopreempt, DOPREEMPT_COUNTED);
		int lsflag = 0;
		LOCKSTAT_ENTER(lsflag);
		/* Might recurse, make it atomic. */
		if (__predict_false(lsflag)) {
			if (where == 0) {
				where = (uintptr_t)__builtin_return_address(0);
			}
			if (atomic_cas_ptr_ni((void *)&l->l_pfailaddr,
			    NULL, (void *)where) == NULL) {
				LOCKSTAT_START_TIMER(lsflag, l->l_pfailtime);
				l->l_pfaillock = failed;
			}
		}
		LOCKSTAT_EXIT(lsflag);
	}

	return failed;
}

/*
 * Return true if preemption is explicitly disabled.
 */
bool
kpreempt_disabled(void)
{
	lwp_t *l;

	l = curlwp;

	return l->l_nopreempt != 0 || l->l_stat == LSZOMB ||
	    (l->l_flag & LW_IDLE) != 0 || cpu_kpreempt_disabled();
}

/*
 * Disable kernel preemption.
 */
void
kpreempt_disable(void)
{

	KPREEMPT_DISABLE(curlwp);
}

/*
 * Reenable kernel preemption.
 */
void
kpreempt_enable(void)
{

	KPREEMPT_ENABLE(curlwp);
}
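
/*
 * A minimal usage sketch (not from this file): code that needs curcpu()
 * to stay stable across a short sequence brackets it with the calls above:
 *
 *	kpreempt_disable();
 *	ci = curcpu();
 *	... short per-CPU work, ci remains valid ...
 *	kpreempt_enable();
 *
 * The bracketed region should be kept short; it only prevents involuntary
 * in-kernel preemption of the current LWP.
 */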

/*
 * Compute the amount of time during which the current lwp was running.
 *
 * - update l_rtime unless it's an idle lwp.
 */

void
updatertime(lwp_t *l, const struct bintime *now)
{

	if ((l->l_flag & LW_IDLE) != 0)
		return;

	/* rtime += now - stime */
	bintime_add(&l->l_rtime, now);
	bintime_sub(&l->l_rtime, &l->l_stime);
}

/*
 * The machine independent parts of context switch.
 *
 * Returns 1 if another LWP was actually run.
 */
int
mi_switch(lwp_t *l)
{
	struct cpu_info *ci, *tci = NULL;
	struct schedstate_percpu *spc;
	struct lwp *newl;
	int retval, oldspl;
	struct bintime bt;
	bool returning;

	KASSERT(lwp_locked(l, NULL));
	KASSERT(kpreempt_disabled());
	LOCKDEBUG_BARRIER(l->l_mutex, 1);

#ifdef KSTACK_CHECK_MAGIC
	kstack_check_magic(l);
#endif

	binuptime(&bt);

	KASSERT(l->l_cpu == curcpu());
	ci = l->l_cpu;
	spc = &ci->ci_schedstate;
	returning = false;
	newl = NULL;

	/*
	 * If we have been asked to switch to a specific LWP, then there
	 * is no need to inspect the run queues. If a soft interrupt is
	 * blocking, then return to the interrupted thread without adjusting
	 * VM context or its start time: neither have been changed in order
	 * to take the interrupt.
	 */
	if (l->l_switchto != NULL) {
		if ((l->l_pflag & LP_INTR) != 0) {
			returning = true;
			softint_block(l);
			if ((l->l_flag & LW_TIMEINTR) != 0)
				updatertime(l, &bt);
		}
		newl = l->l_switchto;
		l->l_switchto = NULL;
	}
#ifndef __HAVE_FAST_SOFTINTS
	else if (ci->ci_data.cpu_softints != 0) {
		/* There are pending soft interrupts, so pick one. */
		newl = softint_picklwp();
		newl->l_stat = LSONPROC;
		newl->l_flag |= LW_RUNNING;
	}
#endif	/* !__HAVE_FAST_SOFTINTS */

	/* Count time spent in current system call */
	if (!returning) {
		SYSCALL_TIME_SLEEP(l);

		/*
		 * XXXSMP If we are using h/w performance counters,
		 * save context.
		 */
#if PERFCTRS
		if (PMC_ENABLED(l->l_proc)) {
			pmc_save_context(l->l_proc);
		}
#endif
		updatertime(l, &bt);
	}

	/*
	 * If on the CPU and we have gotten this far, then we must yield.
	 */
	KASSERT(l->l_stat != LSRUN);
	if (l->l_stat == LSONPROC && (l->l_target_cpu || l != newl)) {
		KASSERT(lwp_locked(l, spc->spc_lwplock));

		if (l->l_target_cpu == l->l_cpu) {
			l->l_target_cpu = NULL;
		} else {
			tci = l->l_target_cpu;
		}

		if (__predict_false(tci != NULL)) {
			/* Double-lock the runqueues */
			spc_dlock(ci, tci);
		} else {
			/* Lock the runqueue */
			spc_lock(ci);
		}

		if ((l->l_flag & LW_IDLE) == 0) {
			l->l_stat = LSRUN;
			if (__predict_false(tci != NULL)) {
				/*
				 * Set the new CPU, lock and unset
				 * l_target_cpu - the thread will be enqueued
				 * on the runqueue of the target CPU.
				 */
				l->l_cpu = tci;
				lwp_setlock(l, tci->ci_schedstate.spc_mutex);
				l->l_target_cpu = NULL;
			} else {
				lwp_setlock(l, spc->spc_mutex);
			}
			sched_enqueue(l, true);
		} else {
			KASSERT(tci == NULL);
			l->l_stat = LSIDL;
		}
	} else {
		/* Lock the runqueue */
		spc_lock(ci);
	}

	/*
	 * Let sched_nextlwp() select the LWP to run on the CPU next.
	 * If no LWP is runnable, select the idle LWP.
	 *
	 * Note that spc_lwplock might not necessarily be held, and
	 * the new thread would be unlocked after setting the LWP-lock.
	 */
	if (newl == NULL) {
		newl = sched_nextlwp();
		if (newl != NULL) {
			sched_dequeue(newl);
			KASSERT(lwp_locked(newl, spc->spc_mutex));
			newl->l_stat = LSONPROC;
			newl->l_cpu = ci;
			newl->l_flag |= LW_RUNNING;
			lwp_setlock(newl, spc->spc_lwplock);
		} else {
			newl = ci->ci_data.cpu_idlelwp;
			newl->l_stat = LSONPROC;
			newl->l_flag |= LW_RUNNING;
		}
		/*
		 * Only clear want_resched if there are no
		 * pending (slow) software interrupts.
		 */
		ci->ci_want_resched = ci->ci_data.cpu_softints;
		spc->spc_flags &= ~SPCF_SWITCHCLEAR;
		spc->spc_curpriority = lwp_eprio(newl);
	}

	/* Items that must be updated with the CPU locked. */
	if (!returning) {
		/* Update the new LWP's start time. */
		newl->l_stime = bt;

		/*
		 * ci_curlwp changes when a fast soft interrupt occurs.
		 * We use cpu_onproc to keep track of which kernel or
		 * user thread is running 'underneath' the software
		 * interrupt. This is important for time accounting,
		 * itimers and forcing user threads to preempt (aston).
		 */
		ci->ci_data.cpu_onproc = newl;
	}

	/* Kernel preemption related tasks. */
	l->l_dopreempt = 0;
	if (__predict_false(l->l_pfailaddr != 0)) {
		LOCKSTAT_FLAG(lsflag);
		LOCKSTAT_ENTER(lsflag);
		LOCKSTAT_STOP_TIMER(lsflag, l->l_pfailtime);
		LOCKSTAT_EVENT_RA(lsflag, l->l_pfaillock, LB_NOPREEMPT|LB_SPIN,
		    1, l->l_pfailtime, l->l_pfailaddr);
		LOCKSTAT_EXIT(lsflag);
		l->l_pfailtime = 0;
		l->l_pfaillock = 0;
		l->l_pfailaddr = 0;
	}

	if (l != newl) {
		struct lwp *prevlwp;

		/* Release all locks, but leave the current LWP locked */
		if (l->l_mutex == l->l_cpu->ci_schedstate.spc_mutex) {
			/*
			 * In case of migration, drop the local runqueue
			 * lock, the thread is on the other runqueue now.
			 */
			if (__predict_false(tci != NULL))
				spc_unlock(ci);
			/*
			 * Drop spc_lwplock, if the current LWP has been moved
			 * to the run queue (it is now locked by spc_mutex).
			 */
			mutex_spin_exit(spc->spc_lwplock);
		} else {
			/*
			 * Otherwise, drop the spc_mutex, we are done with the
			 * run queues.
			 */
			mutex_spin_exit(spc->spc_mutex);
			KASSERT(tci == NULL);
		}

		/*
		 * Mark that a context switch is going to be performed
		 * for this LWP, to protect it from being switched
		 * to on another CPU.
		 */
		KASSERT(l->l_ctxswtch == 0);
		l->l_ctxswtch = 1;
		l->l_ncsw++;
		l->l_flag &= ~LW_RUNNING;

		/*
		 * Increase the count of spin-mutexes before the release
		 * of the last lock - we must remain at IPL_SCHED during
		 * the context switch.
		 */
		oldspl = MUTEX_SPIN_OLDSPL(ci);
		ci->ci_mtx_count--;
		lwp_unlock(l);
728 ci->ci_data.cpu_nswtch++;
729
730 /* Update status for lwpctl, if present. */
731 if (l->l_lwpctl != NULL)
732 l->l_lwpctl->lc_curcpu = LWPCTL_CPU_NONE;
733
734 /*
735 * Save old VM context, unless a soft interrupt
736 * handler is blocking.
737 */
738 if (!returning)
739 pmap_deactivate(l);
740
741 /*
742 * We may need to spin-wait for if 'newl' is still
743 * context switching on another CPU.
744 */
745 if (newl->l_ctxswtch != 0) {
746 u_int count;
747 count = SPINLOCK_BACKOFF_MIN;
748 while (newl->l_ctxswtch)
749 SPINLOCK_BACKOFF(count);
750 }
751
752 /* Switch to the new LWP.. */
753 prevlwp = cpu_switchto(l, newl, returning);
754 ci = curcpu();
755
756 /*
757 * Switched away - we have new curlwp.
758 * Restore VM context and IPL.
759 */
760 pmap_activate(l);
761 if (prevlwp != NULL) {
762 /* Normalize the count of the spin-mutexes */
763 ci->ci_mtx_count++;
764 /* Unmark the state of context switch */
765 membar_exit();
766 prevlwp->l_ctxswtch = 0;
767 }
768
769 /* Update status for lwpctl, if present. */
770 if (l->l_lwpctl != NULL) {
771 l->l_lwpctl->lc_curcpu = (int)cpu_index(ci);
772 l->l_lwpctl->lc_pctr++;
773 }
774
775 KASSERT(l->l_cpu == ci);
776 splx(oldspl);
777 retval = 1;
778 } else {
779 /* Nothing to do - just unlock and return. */
780 KASSERT(tci == NULL);
781 spc_unlock(ci);
782 lwp_unlock(l);
783 retval = 0;
784 }
785
786 KASSERT(l == curlwp);
787 KASSERT(l->l_stat == LSONPROC);
788
789 /*
790 * XXXSMP If we are using h/w performance counters, restore context.
791 * XXXSMP preemption problem.
792 */
793 #if PERFCTRS
794 if (PMC_ENABLED(l->l_proc)) {
795 pmc_restore_context(l->l_proc);
796 }
797 #endif
798 SYSCALL_TIME_WAKEUP(l);
799 LOCKDEBUG_BARRIER(NULL, 1);
800
801 return retval;
802 }

/*
 * Change process state to be runnable, placing it on the run queue if it is
 * in memory, and awakening the swapper if it isn't in memory.
 *
 * Call with the process and LWP locked. Will return with the LWP unlocked.
 */
void
setrunnable(struct lwp *l)
{
	struct proc *p = l->l_proc;
	struct cpu_info *ci;
	sigset_t *ss;

	KASSERT((l->l_flag & LW_IDLE) == 0);
	KASSERT(mutex_owned(p->p_lock));
	KASSERT(lwp_locked(l, NULL));
	KASSERT(l->l_mutex != l->l_cpu->ci_schedstate.spc_mutex);

	switch (l->l_stat) {
	case LSSTOP:
		/*
		 * If we're being traced (possibly because someone attached us
		 * while we were stopped), check for a signal from the debugger.
		 */
		if ((p->p_slflag & PSL_TRACED) != 0 && p->p_xstat != 0) {
			if ((sigprop[p->p_xstat] & SA_TOLWP) != 0)
				ss = &l->l_sigpend.sp_set;
			else
				ss = &p->p_sigpend.sp_set;
			sigaddset(ss, p->p_xstat);
			signotify(l);
		}
		p->p_nrlwps++;
		break;
	case LSSUSPENDED:
		l->l_flag &= ~LW_WSUSPEND;
		p->p_nrlwps++;
		cv_broadcast(&p->p_lwpcv);
		break;
	case LSSLEEP:
		KASSERT(l->l_wchan != NULL);
		break;
	default:
		panic("setrunnable: lwp %p state was %d", l, l->l_stat);
	}

	/*
	 * If the LWP was sleeping interruptibly, then it's OK to start it
	 * again. If not, mark it as still sleeping.
	 */
	if (l->l_wchan != NULL) {
		l->l_stat = LSSLEEP;
		/* lwp_unsleep() will release the lock. */
		lwp_unsleep(l, true);
		return;
	}

	/*
	 * If the LWP is still on the CPU, mark it as LSONPROC. It may be
	 * about to call mi_switch(), in which case it will yield.
	 */
	if ((l->l_flag & LW_RUNNING) != 0) {
		l->l_stat = LSONPROC;
		l->l_slptime = 0;
		lwp_unlock(l);
		return;
	}

	/*
	 * Look for a CPU to run.
	 * Set the LWP runnable.
	 */
	ci = sched_takecpu(l);
	l->l_cpu = ci;
	spc_lock(ci);
	lwp_unlock_to(l, ci->ci_schedstate.spc_mutex);
	sched_setrunnable(l);
	l->l_stat = LSRUN;
	l->l_slptime = 0;

	/*
	 * If the thread is swapped out - wake the swapper to bring it back
	 * in. Otherwise, enter it into a run queue.
	 */
	if (l->l_flag & LW_INMEM) {
		sched_enqueue(l, false);
		resched_cpu(l);
		lwp_unlock(l);
	} else {
		lwp_unlock(l);
		uvm_kick_scheduler();
	}
}

/*
 * suspendsched:
 *
 *	Convert all non-L_SYSTEM LSSLEEP or LSRUN LWPs to LSSUSPENDED.
 */
void
suspendsched(void)
{
	CPU_INFO_ITERATOR cii;
	struct cpu_info *ci;
	struct lwp *l;
	struct proc *p;

	/*
	 * We do this by process in order not to violate the locking rules.
	 */
	mutex_enter(proc_lock);
	PROCLIST_FOREACH(p, &allproc) {
		if ((p->p_flag & PK_MARKER) != 0)
			continue;

		mutex_enter(p->p_lock);
		if ((p->p_flag & PK_SYSTEM) != 0) {
			mutex_exit(p->p_lock);
			continue;
		}

		p->p_stat = SSTOP;

		LIST_FOREACH(l, &p->p_lwps, l_sibling) {
			if (l == curlwp)
				continue;

			lwp_lock(l);

			/*
			 * Set L_WREBOOT so that the LWP will suspend itself
			 * when it tries to return to user mode. We want to
			 * try to get as many LWPs as possible to
			 * the user / kernel boundary, so that they will
			 * release any locks that they hold.
			 */
			l->l_flag |= (LW_WREBOOT | LW_WSUSPEND);

			if (l->l_stat == LSSLEEP &&
			    (l->l_flag & LW_SINTR) != 0) {
				/* setrunnable() will release the lock. */
				setrunnable(l);
				continue;
			}

			lwp_unlock(l);
		}

		mutex_exit(p->p_lock);
	}
	mutex_exit(proc_lock);

	/*
	 * Kick all CPUs to make them preempt any LWPs running in user mode.
	 * They'll trap into the kernel and suspend themselves in userret().
	 */
	for (CPU_INFO_FOREACH(cii, ci)) {
		spc_lock(ci);
		cpu_need_resched(ci, RESCHED_IMMED);
		spc_unlock(ci);
	}
}

/*
 * sched_unsleep:
 *
 *	This is called when the LWP has not been awoken normally but instead
 *	interrupted: for example, if the sleep timed out. Because of this,
 *	it's not a valid action for running or idle LWPs.
 */
static u_int
sched_unsleep(struct lwp *l, bool cleanup)
{

	lwp_unlock(l);
	panic("sched_unsleep");
}

void
resched_cpu(struct lwp *l)
{
	struct cpu_info *ci;

	/*
	 * XXXSMP
	 * Since l->l_cpu persists across a context switch,
	 * this gives us *very weak* processor affinity, in
	 * that we notify the CPU on which the process last
	 * ran that it should try to switch.
	 *
	 * This does not guarantee that the process will run on
	 * that processor next, because another processor might
	 * grab it the next time it performs a context switch.
	 *
	 * This also does not handle the case where its last
	 * CPU is running a higher-priority process, but every
	 * other CPU is running a lower-priority process. There
	 * are ways to handle this situation, but they're not
	 * currently very pretty, and we also need to weigh the
	 * cost of moving a process from one CPU to another.
	 */
	ci = l->l_cpu;
	if (lwp_eprio(l) > ci->ci_schedstate.spc_curpriority)
		cpu_need_resched(ci, 0);
}

static void
sched_changepri(struct lwp *l, pri_t pri)
{

	KASSERT(lwp_locked(l, NULL));

	if (l->l_stat == LSRUN && (l->l_flag & LW_INMEM) != 0) {
		KASSERT(lwp_locked(l, l->l_cpu->ci_schedstate.spc_mutex));
		sched_dequeue(l);
		l->l_priority = pri;
		sched_enqueue(l, false);
	} else {
		l->l_priority = pri;
	}
	resched_cpu(l);
}

static void
sched_lendpri(struct lwp *l, pri_t pri)
{

	KASSERT(lwp_locked(l, NULL));

	if (l->l_stat == LSRUN && (l->l_flag & LW_INMEM) != 0) {
		KASSERT(lwp_locked(l, l->l_cpu->ci_schedstate.spc_mutex));
		sched_dequeue(l);
		l->l_inheritedprio = pri;
		sched_enqueue(l, false);
	} else {
		l->l_inheritedprio = pri;
	}
	resched_cpu(l);
}

struct lwp *
syncobj_noowner(wchan_t wchan)
{

	return NULL;
}

/* decay 95% of `p_pctcpu' in 60 seconds; see CCPU_SHIFT before changing */
fixpt_t	ccpu = 0.95122942450071400909 * FSCALE;		/* exp(-1/20) */

/*
 * If `ccpu' is not equal to `exp(-1/20)' and you still want to use the
 * faster/more-accurate formula, you'll have to estimate CCPU_SHIFT below
 * and possibly adjust FSHIFT in "param.h" so that (FSHIFT >= CCPU_SHIFT).
 *
 * To estimate CCPU_SHIFT for exp(-1/20), the following formula was used:
 *	1 - exp(-1/20) ~= 0.0487 ~= 0.0488 == 1 (fixed pt, *11* bits).
 *
 * If you don't want to bother with the faster/more-accurate formula, you
 * can set CCPU_SHIFT to (FSHIFT + 1) which will use a slower/less-accurate
 * (more general) method of calculating the %age of CPU used by a process.
 */
#define	CCPU_SHIFT	(FSHIFT + 1)
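
/*
 * Worked example of the decay, assuming sched_pstats() runs once per
 * second (it reschedules itself with callout_schedule(..., hz) below):
 * each pass multiplies the old value by ccpu = exp(-1/20), so after 60
 * passes the original contribution has been scaled by exp(-60/20) =
 * exp(-3) ~= 0.05, which is where the "decay 95% in 60 seconds" figure
 * above comes from.
 */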

/*
 * sched_pstats:
 *
 *	Update process statistics and check CPU resource allocation.
 *	Call scheduler-specific hook to eventually adjust process/LWP
 *	priorities.
 */
/* ARGSUSED */
void
sched_pstats(void *arg)
{
	struct rlimit *rlim;
	struct lwp *l;
	struct proc *p;
	int sig, clkhz;
	long runtm;

	sched_pstats_ticks++;

	mutex_enter(proc_lock);
	PROCLIST_FOREACH(p, &allproc) {
		if ((p->p_flag & PK_MARKER) != 0)
			continue;

		/*
		 * Increment time in/out of memory and sleep time (if
		 * sleeping). We ignore overflow; with 16-bit int's
		 * (remember them?) overflow takes 45 days.
		 */
		mutex_enter(p->p_lock);
		mutex_spin_enter(&p->p_stmutex);
		runtm = p->p_rtime.sec;
		LIST_FOREACH(l, &p->p_lwps, l_sibling) {
			if ((l->l_flag & LW_IDLE) != 0)
				continue;
			lwp_lock(l);
			runtm += l->l_rtime.sec;
			l->l_swtime++;
			sched_pstats_hook(l);
			lwp_unlock(l);

			/*
			 * l_pctcpu and p_pctcpu are only for ps.
			 */
			l->l_pctcpu = (l->l_pctcpu * ccpu) >> FSHIFT;
			if (l->l_slptime < 1) {
				clkhz = stathz != 0 ? stathz : hz;
#if	(FSHIFT >= CCPU_SHIFT)
				l->l_pctcpu += (clkhz == 100) ?
				    ((fixpt_t)l->l_cpticks) <<
				    (FSHIFT - CCPU_SHIFT) :
				    100 * (((fixpt_t)l->l_cpticks)
				    << (FSHIFT - CCPU_SHIFT)) / clkhz;
#else
				l->l_pctcpu += ((FSCALE - ccpu) *
				    (l->l_cpticks * FSCALE / clkhz)) >> FSHIFT;
#endif
				l->l_cpticks = 0;
			}
		}
		p->p_pctcpu = (p->p_pctcpu * ccpu) >> FSHIFT;
		mutex_spin_exit(&p->p_stmutex);

		/*
		 * Check if the process exceeds its CPU resource allocation.
		 * If over max, kill it.
		 */
		rlim = &p->p_rlimit[RLIMIT_CPU];
		sig = 0;
		if (runtm >= rlim->rlim_cur) {
			if (runtm >= rlim->rlim_max)
				sig = SIGKILL;
			else {
				sig = SIGXCPU;
				if (rlim->rlim_cur < rlim->rlim_max)
					rlim->rlim_cur += 5;
			}
		}
		mutex_exit(p->p_lock);
		if (sig)
			psignal(p, sig);
	}
	mutex_exit(proc_lock);
	uvm_meter();
	cv_wakeup(&lbolt);
	callout_schedule(&sched_pstats_ch, hz);
}