/*	$NetBSD: kern_synch.c,v 1.241 2008/04/30 12:44:27 ad Exp $	*/

/*-
 * Copyright (c) 1999, 2000, 2004, 2006, 2007, 2008 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
 * NASA Ames Research Center, by Charles M. Hannum, Andrew Doran and
 * Daniel Sieger.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*-
 * Copyright (c) 1982, 1986, 1990, 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)kern_synch.c	8.9 (Berkeley) 5/19/95
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: kern_synch.c,v 1.241 2008/04/30 12:44:27 ad Exp $");

#include "opt_kstack.h"
#include "opt_lockdebug.h"
#include "opt_multiprocessor.h"
#include "opt_perfctrs.h"

#define	__MUTEX_PRIVATE

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/kernel.h>
#if defined(PERFCTRS)
#include <sys/pmc.h>
#endif
#include <sys/cpu.h>
#include <sys/resourcevar.h>
#include <sys/sched.h>
#include <sys/syscall_stats.h>
#include <sys/sleepq.h>
#include <sys/lockdebug.h>
#include <sys/evcnt.h>
#include <sys/intr.h>
#include <sys/lwpctl.h>
#include <sys/atomic.h>
#include <sys/simplelock.h>

#include <uvm/uvm_extern.h>

#include <dev/lockstat.h>

static u_int	sched_unsleep(struct lwp *, bool);
static void	sched_changepri(struct lwp *, pri_t);
static void	sched_lendpri(struct lwp *, pri_t);

syncobj_t sleep_syncobj = {
	SOBJ_SLEEPQ_SORTED,
	sleepq_unsleep,
	sleepq_changepri,
	sleepq_lendpri,
	syncobj_noowner,
};

syncobj_t sched_syncobj = {
	SOBJ_SLEEPQ_SORTED,
	sched_unsleep,
	sched_changepri,
	sched_lendpri,
	syncobj_noowner,
};

callout_t	sched_pstats_ch;
unsigned	sched_pstats_ticks;
kcondvar_t	lbolt;			/* once a second sleep address */

/* Preemption event counters */
static struct evcnt kpreempt_ev_crit;
static struct evcnt kpreempt_ev_klock;
static struct evcnt kpreempt_ev_ipl;
static struct evcnt kpreempt_ev_immed;

/*
 * During autoconfiguration or after a panic, a sleep will simply lower the
 * priority briefly to allow interrupts, then return.  The priority to be
 * used (safepri) is machine-dependent, thus this value is initialized and
 * maintained in the machine-dependent layers.  This priority will typically
 * be 0, or the lowest priority that is safe for use on the interrupt stack;
 * it can be made higher to block network software interrupts after panics.
 */
int	safepri;

void
sched_init(void)
{

	cv_init(&lbolt, "lbolt");
	callout_init(&sched_pstats_ch, CALLOUT_MPSAFE);
	callout_setfunc(&sched_pstats_ch, sched_pstats, NULL);

	evcnt_attach_dynamic(&kpreempt_ev_crit, EVCNT_TYPE_MISC, NULL,
	    "kpreempt", "defer: critical section");
	evcnt_attach_dynamic(&kpreempt_ev_klock, EVCNT_TYPE_MISC, NULL,
	    "kpreempt", "defer: kernel_lock");
	evcnt_attach_dynamic(&kpreempt_ev_ipl, EVCNT_TYPE_MISC, NULL,
	    "kpreempt", "defer: IPL");
	evcnt_attach_dynamic(&kpreempt_ev_immed, EVCNT_TYPE_MISC, NULL,
	    "kpreempt", "immediate");

	sched_pstats(NULL);
}

/*
 * OBSOLETE INTERFACE
 *
 * General sleep call.  Suspends the current process until a wakeup is
 * performed on the specified identifier.  The process will then be made
 * runnable with the specified priority.  Sleeps at most timo/hz seconds (0
 * means no timeout).  If pri includes the PCATCH flag, signals are checked
 * before and after sleeping, otherwise signals are not checked.  Returns 0
 * if awakened, EWOULDBLOCK if the timeout expires.  If PCATCH is set and a
 * signal needs to be delivered, ERESTART is returned if the current system
 * call should be restarted if possible, and EINTR is returned if the system
 * call should be interrupted by the signal.
 *
 * The interlock is held until we are on a sleep queue.  The interlock will
 * be locked before returning to the caller unless the PNORELOCK flag is
 * specified, in which case the interlock will always be unlocked upon
 * return.
 */
int
ltsleep(wchan_t ident, pri_t priority, const char *wmesg, int timo,
	volatile struct simplelock *interlock)
{
	struct lwp *l = curlwp;
	sleepq_t *sq;
	int error;

	KASSERT((l->l_pflag & LP_INTR) == 0);

	if (sleepq_dontsleep(l)) {
		(void)sleepq_abort(NULL, 0);
		if ((priority & PNORELOCK) != 0)
			simple_unlock(interlock);
		return 0;
	}

	l->l_kpriority = true;
	sq = sleeptab_lookup(&sleeptab, ident);
	sleepq_enter(sq, l);
	sleepq_enqueue(sq, ident, wmesg, &sleep_syncobj);

	if (interlock != NULL) {
		KASSERT(simple_lock_held(interlock));
		simple_unlock(interlock);
	}

	error = sleepq_block(timo, priority & PCATCH);

	if (interlock != NULL && (priority & PNORELOCK) == 0)
		simple_lock(interlock);

	return error;
}
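
/*
 * Illustrative sketch of how a caller might use the obsolete ltsleep()
 * interface above to wait for a condition protected by a simplelock.
 * The softc 'sc' and its sc_slock/sc_busy members are assumed purely
 * for the example; they are not defined in this file.
 *
 *	simple_lock(&sc->sc_slock);
 *	while (sc->sc_busy) {
 *		error = ltsleep(&sc->sc_busy, PWAIT | PCATCH, "scbusy", 0,
 *		    &sc->sc_slock);
 *		if (error)
 *			break;
 *	}
 *	simple_unlock(&sc->sc_slock);
 */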

int
mtsleep(wchan_t ident, pri_t priority, const char *wmesg, int timo,
	kmutex_t *mtx)
{
	struct lwp *l = curlwp;
	sleepq_t *sq;
	int error;

	KASSERT((l->l_pflag & LP_INTR) == 0);

	if (sleepq_dontsleep(l)) {
		(void)sleepq_abort(mtx, (priority & PNORELOCK) != 0);
		return 0;
	}

	l->l_kpriority = true;
	sq = sleeptab_lookup(&sleeptab, ident);
	sleepq_enter(sq, l);
	sleepq_enqueue(sq, ident, wmesg, &sleep_syncobj);
	mutex_exit(mtx);
	error = sleepq_block(timo, priority & PCATCH);

	if ((priority & PNORELOCK) == 0)
		mutex_enter(mtx);

	return error;
}
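
/*
 * Illustrative sketch of a typical condition-wait loop built on mtsleep()
 * with a kmutex.  The softc 'sc' and its sc_lock/sc_ready members are
 * assumed for the example.  The mutex is dropped while the LWP sleeps and
 * re-acquired before mtsleep() returns, unless PNORELOCK is passed.
 *
 *	mutex_enter(&sc->sc_lock);
 *	while (!sc->sc_ready) {
 *		error = mtsleep(&sc->sc_ready, PWAIT, "scwait", hz,
 *		    &sc->sc_lock);
 *		if (error == EWOULDBLOCK)
 *			break;			(timed out after ~1 second)
 *	}
 *	mutex_exit(&sc->sc_lock);
 */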

/*
 * General sleep call for situations where a wake-up is not expected.
 */
int
kpause(const char *wmesg, bool intr, int timo, kmutex_t *mtx)
{
	struct lwp *l = curlwp;
	sleepq_t *sq;
	int error;

	if (sleepq_dontsleep(l))
		return sleepq_abort(NULL, 0);

	if (mtx != NULL)
		mutex_exit(mtx);
	l->l_kpriority = true;
	sq = sleeptab_lookup(&sleeptab, l);
	sleepq_enter(sq, l);
	sleepq_enqueue(sq, l, wmesg, &sleep_syncobj);
	error = sleepq_block(timo, intr);
	if (mtx != NULL)
		mutex_enter(mtx);

	return error;
}
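
/*
 * Illustrative sketch of kpause() usage: pause for roughly 10ms without
 * expecting a wakeup(), optionally releasing a held kmutex across the
 * sleep.  'sc->sc_lock' is an assumed example lock; mstohz() converts
 * milliseconds to clock ticks.
 *
 *	(void)kpause("poll", false, mstohz(10), NULL);
 *
 *	error = kpause("retry", true, mstohz(10), &sc->sc_lock);
 */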

/*
 * OBSOLETE INTERFACE
 *
 * Make all processes sleeping on the specified identifier runnable.
 */
void
wakeup(wchan_t ident)
{
	sleepq_t *sq;

	if (cold)
		return;

	sq = sleeptab_lookup(&sleeptab, ident);
	sleepq_wake(sq, ident, (u_int)-1);
}

/*
 * OBSOLETE INTERFACE
 *
 * Make the highest priority process first in line on the specified
 * identifier runnable.
 */
void
wakeup_one(wchan_t ident)
{
	sleepq_t *sq;

	if (cold)
		return;

	sq = sleeptab_lookup(&sleeptab, ident);
	sleepq_wake(sq, ident, 1);
}
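
/*
 * Illustrative sketch of the wakeup side matching a *tsleep() wait on an
 * assumed 'sc->sc_ready' channel: change the condition while holding the
 * same lock the sleeper uses, then wake the sleepers.  wakeup_one() would
 * instead wake only the first (highest-priority) waiter.
 *
 *	mutex_enter(&sc->sc_lock);
 *	sc->sc_ready = true;
 *	wakeup(&sc->sc_ready);
 *	mutex_exit(&sc->sc_lock);
 */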


/*
 * General yield call.  Puts the current process back on its run queue and
 * performs a voluntary context switch.  Should only be called when the
 * current process explicitly requests it (e.g. sched_yield(2)).
 */
void
yield(void)
{
	struct lwp *l = curlwp;

	KERNEL_UNLOCK_ALL(l, &l->l_biglocks);
	lwp_lock(l);
	KASSERT(lwp_locked(l, l->l_cpu->ci_schedstate.spc_lwplock));
	KASSERT(l->l_stat == LSONPROC);
	l->l_kpriority = false;
	(void)mi_switch(l);
	KERNEL_LOCK(l->l_biglocks, l);
}

/*
 * General preemption call.  Puts the current process back on its run queue
 * and performs an involuntary context switch.
 */
void
preempt(void)
{
	struct lwp *l = curlwp;

	KERNEL_UNLOCK_ALL(l, &l->l_biglocks);
	lwp_lock(l);
	KASSERT(lwp_locked(l, l->l_cpu->ci_schedstate.spc_lwplock));
	KASSERT(l->l_stat == LSONPROC);
	l->l_kpriority = false;
	l->l_nivcsw++;
	(void)mi_switch(l);
	KERNEL_LOCK(l->l_biglocks, l);
}

/*
 * Handle a request made by another agent to preempt the current LWP
 * in-kernel.  Usually called when l_dopreempt may be non-zero.
 *
 * Character addresses for lockstat only.
 */
static char	in_critical_section;
static char	kernel_lock_held;
static char	spl_raised;
static char	is_softint;

bool
kpreempt(uintptr_t where)
{
	uintptr_t failed;
	lwp_t *l;
	int s, dop;

	l = curlwp;
	failed = 0;
	while ((dop = l->l_dopreempt) != 0) {
		if (l->l_stat != LSONPROC) {
			/*
			 * About to block (or die), let it happen.
			 * Doesn't really count as "preemption has
			 * been blocked", since we're going to
			 * context switch.
			 */
			l->l_dopreempt = 0;
			return true;
		}
		if (__predict_false((l->l_flag & LW_IDLE) != 0)) {
			/* Can't preempt idle loop, don't count as failure. */
			l->l_dopreempt = 0;
			return true;
		}
		if (__predict_false(l->l_nopreempt != 0)) {
			/* LWP holds preemption disabled, explicitly. */
			if ((dop & DOPREEMPT_COUNTED) == 0) {
				kpreempt_ev_crit.ev_count++;
			}
			failed = (uintptr_t)&in_critical_section;
			break;
		}
		if (__predict_false((l->l_pflag & LP_INTR) != 0)) {
			/* Can't preempt soft interrupts yet. */
			l->l_dopreempt = 0;
			failed = (uintptr_t)&is_softint;
			break;
		}
		s = splsched();
		if (__predict_false(l->l_blcnt != 0 ||
		    curcpu()->ci_biglock_wanted != NULL)) {
			/* Hold or want kernel_lock, code is not MT safe. */
			splx(s);
			if ((dop & DOPREEMPT_COUNTED) == 0) {
				kpreempt_ev_klock.ev_count++;
			}
			failed = (uintptr_t)&kernel_lock_held;
			break;
		}
		if (__predict_false(!cpu_kpreempt_enter(where, s))) {
			/*
			 * It may be that the IPL is too high.
			 * cpu_kpreempt_enter() can schedule an
			 * interrupt to retry later.
			 */
			splx(s);
			if ((dop & DOPREEMPT_COUNTED) == 0) {
				kpreempt_ev_ipl.ev_count++;
			}
			failed = (uintptr_t)&spl_raised;
			break;
		}
		/* Do it! */
		if (__predict_true((dop & DOPREEMPT_COUNTED) == 0)) {
			kpreempt_ev_immed.ev_count++;
		}
		lwp_lock(l);
		mi_switch(l);
		l->l_nopreempt++;
		splx(s);

		/* Take care of any MD cleanup. */
		cpu_kpreempt_exit(where);
		l->l_nopreempt--;
	}

	/* Record preemption failure for reporting via lockstat. */
	if (__predict_false(failed)) {
		int lsflag = 0;
		atomic_or_uint(&l->l_dopreempt, DOPREEMPT_COUNTED);
		LOCKSTAT_ENTER(lsflag);
		/* Might recurse, make it atomic. */
		if (__predict_false(lsflag)) {
			if (where == 0) {
				where = (uintptr_t)__builtin_return_address(0);
			}
			if (atomic_cas_ptr_ni((void *)&l->l_pfailaddr,
			    NULL, (void *)where) == NULL) {
				LOCKSTAT_START_TIMER(lsflag, l->l_pfailtime);
				l->l_pfaillock = failed;
			}
		}
		LOCKSTAT_EXIT(lsflag);
	}

	return failed;
}

/*
 * Return true if preemption is explicitly disabled.
 */
bool
kpreempt_disabled(void)
{
	lwp_t *l;

	l = curlwp;

	return l->l_nopreempt != 0 || l->l_stat == LSZOMB ||
	    (l->l_flag & LW_IDLE) != 0 || cpu_kpreempt_disabled();
}

/*
 * Disable kernel preemption.
 */
void
kpreempt_disable(void)
{

	KPREEMPT_DISABLE(curlwp);
}

/*
 * Reenable kernel preemption.
 */
void
kpreempt_enable(void)
{

	KPREEMPT_ENABLE(curlwp);
}
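
/*
 * Illustrative sketch of bracketing a short section with the functions
 * above so the LWP can neither migrate nor be preempted in-kernel while
 * it touches curcpu()-local state.  'sc->sc_count' is an assumed per-CPU
 * counter array used only for the example.
 *
 *	kpreempt_disable();
 *	sc->sc_count[cpu_index(curcpu())]++;
 *	kpreempt_enable();
 */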

/*
 * Compute the amount of time during which the current lwp was running.
 *
 * - update l_rtime unless it's an idle lwp.
 */

void
updatertime(lwp_t *l, const struct bintime *now)
{

	if ((l->l_flag & LW_IDLE) != 0)
		return;

	/* rtime += now - stime */
	bintime_add(&l->l_rtime, now);
	bintime_sub(&l->l_rtime, &l->l_stime);
}

/*
 * The machine independent parts of context switch.
 *
 * Returns 1 if another LWP was actually run.
 */
int
mi_switch(lwp_t *l)
{
	struct cpu_info *ci, *tci = NULL;
	struct schedstate_percpu *spc;
	struct lwp *newl;
	int retval, oldspl;
	struct bintime bt;
	bool returning;

	KASSERT(lwp_locked(l, NULL));
	KASSERT(kpreempt_disabled());
	LOCKDEBUG_BARRIER(l->l_mutex, 1);

#ifdef KSTACK_CHECK_MAGIC
	kstack_check_magic(l);
#endif

	binuptime(&bt);

	KASSERT(l->l_cpu == curcpu());
	ci = l->l_cpu;
	spc = &ci->ci_schedstate;
	returning = false;
	newl = NULL;

	/*
	 * If we have been asked to switch to a specific LWP, then there
	 * is no need to inspect the run queues.  If a soft interrupt is
	 * blocking, then return to the interrupted thread without adjusting
	 * VM context or its start time: neither have been changed in order
	 * to take the interrupt.
	 */
	if (l->l_switchto != NULL) {
		if ((l->l_pflag & LP_INTR) != 0) {
			returning = true;
			softint_block(l);
			if ((l->l_flag & LW_TIMEINTR) != 0)
				updatertime(l, &bt);
		}
		newl = l->l_switchto;
		l->l_switchto = NULL;
	}
#ifndef __HAVE_FAST_SOFTINTS
	else if (ci->ci_data.cpu_softints != 0) {
		/* There are pending soft interrupts, so pick one. */
		newl = softint_picklwp();
		newl->l_stat = LSONPROC;
		newl->l_flag |= LW_RUNNING;
	}
#endif	/* !__HAVE_FAST_SOFTINTS */

	/* Count time spent in current system call */
	if (!returning) {
		SYSCALL_TIME_SLEEP(l);

		/*
		 * XXXSMP If we are using h/w performance counters,
		 * save context.
		 */
#if PERFCTRS
		if (PMC_ENABLED(l->l_proc)) {
			pmc_save_context(l->l_proc);
		}
#endif
		updatertime(l, &bt);
	}

	/*
	 * If on the CPU and we have gotten this far, then we must yield.
	 */
	KASSERT(l->l_stat != LSRUN);
	if (l->l_stat == LSONPROC && (l->l_target_cpu || l != newl)) {
		KASSERT(lwp_locked(l, spc->spc_lwplock));

		if (l->l_target_cpu == l->l_cpu) {
			l->l_target_cpu = NULL;
		} else {
			tci = l->l_target_cpu;
		}

		if (__predict_false(tci != NULL)) {
			/* Double-lock the runqueues */
			spc_dlock(ci, tci);
		} else {
			/* Lock the runqueue */
			spc_lock(ci);
		}

		if ((l->l_flag & LW_IDLE) == 0) {
			l->l_stat = LSRUN;
			if (__predict_false(tci != NULL)) {
				/*
				 * Set the new CPU and its lock, and clear
				 * l_target_cpu - the thread will be enqueued
				 * on the run queue of the target CPU.
				 */
				l->l_cpu = tci;
				lwp_setlock(l, tci->ci_schedstate.spc_mutex);
				l->l_target_cpu = NULL;
			} else {
				lwp_setlock(l, spc->spc_mutex);
			}
			sched_enqueue(l, true);
		} else {
			KASSERT(tci == NULL);
			l->l_stat = LSIDL;
		}
	} else {
		/* Lock the runqueue */
		spc_lock(ci);
	}

	/*
	 * Let sched_nextlwp() select the LWP to run on this CPU next.
	 * If no LWP is runnable, select the idle LWP.
	 *
	 * Note that spc_lwplock is not necessarily held, and the new
	 * thread will be unlocked after its LWP lock has been set.
	 */
	if (newl == NULL) {
		newl = sched_nextlwp();
		if (newl != NULL) {
			sched_dequeue(newl);
			KASSERT(lwp_locked(newl, spc->spc_mutex));
			newl->l_stat = LSONPROC;
			newl->l_cpu = ci;
			newl->l_flag |= LW_RUNNING;
			lwp_setlock(newl, spc->spc_lwplock);
		} else {
			newl = ci->ci_data.cpu_idlelwp;
			newl->l_stat = LSONPROC;
			newl->l_flag |= LW_RUNNING;
		}
		/*
		 * Only clear want_resched if there are no
		 * pending (slow) software interrupts.
		 */
		ci->ci_want_resched = ci->ci_data.cpu_softints;
		spc->spc_flags &= ~SPCF_SWITCHCLEAR;
		spc->spc_curpriority = lwp_eprio(newl);
	}

	/* Items that must be updated with the CPU locked. */
	if (!returning) {
		/* Update the new LWP's start time. */
		newl->l_stime = bt;

		/*
		 * ci_curlwp changes when a fast soft interrupt occurs.
		 * We use cpu_onproc to keep track of which kernel or
		 * user thread is running 'underneath' the software
		 * interrupt.  This is important for time accounting,
		 * itimers and forcing user threads to preempt (aston).
		 */
		ci->ci_data.cpu_onproc = newl;
	}

	/*
	 * Preemption related tasks.  Must be done with the current
	 * CPU locked.
	 */
	cpu_did_resched(l);
	l->l_dopreempt = 0;
	if (__predict_false(l->l_pfailaddr != 0)) {
		LOCKSTAT_FLAG(lsflag);
		LOCKSTAT_ENTER(lsflag);
		LOCKSTAT_STOP_TIMER(lsflag, l->l_pfailtime);
		LOCKSTAT_EVENT_RA(lsflag, l->l_pfaillock, LB_NOPREEMPT|LB_SPIN,
		    1, l->l_pfailtime, l->l_pfailaddr);
		LOCKSTAT_EXIT(lsflag);
		l->l_pfailtime = 0;
		l->l_pfaillock = 0;
		l->l_pfailaddr = 0;
	}

	if (l != newl) {
		struct lwp *prevlwp;

		/* Release all locks, but leave the current LWP locked */
		if (l->l_mutex == l->l_cpu->ci_schedstate.spc_mutex) {
			/*
			 * In the case of migration, drop the local run queue
			 * lock; the thread is now on another CPU's run queue.
			 */
			if (__predict_false(tci != NULL))
				spc_unlock(ci);
			/*
			 * Drop spc_lwplock, if the current LWP has been moved
			 * to the run queue (it is now locked by spc_mutex).
			 */
			mutex_spin_exit(spc->spc_lwplock);
		} else {
			/*
			 * Otherwise, drop the spc_mutex, we are done with the
			 * run queues.
			 */
			mutex_spin_exit(spc->spc_mutex);
			KASSERT(tci == NULL);
		}

		/*
		 * Mark that a context switch is going to be performed
		 * for this LWP, to protect it from being switched to
		 * on another CPU.
		 */
		KASSERT(l->l_ctxswtch == 0);
		l->l_ctxswtch = 1;
		l->l_ncsw++;
		l->l_flag &= ~LW_RUNNING;

		/*
		 * Increase the count of spin-mutexes before the release
		 * of the last lock - we must remain at IPL_SCHED during
		 * the context switch.
		 */
		oldspl = MUTEX_SPIN_OLDSPL(ci);
		ci->ci_mtx_count--;
		lwp_unlock(l);

		/* Count the context switch on this CPU. */
		ci->ci_data.cpu_nswtch++;

		/* Update status for lwpctl, if present. */
		if (l->l_lwpctl != NULL)
			l->l_lwpctl->lc_curcpu = LWPCTL_CPU_NONE;

		/*
		 * Save old VM context, unless a soft interrupt
		 * handler is blocking.
		 */
		if (!returning)
			pmap_deactivate(l);

		/*
		 * We may need to spin-wait if 'newl' is still
		 * context switching on another CPU.
		 */
		if (newl->l_ctxswtch != 0) {
			u_int count;
			count = SPINLOCK_BACKOFF_MIN;
			while (newl->l_ctxswtch)
				SPINLOCK_BACKOFF(count);
		}

		/* Switch to the new LWP. */
		prevlwp = cpu_switchto(l, newl, returning);
		ci = curcpu();

		/*
		 * Switched away - we have new curlwp.
		 * Restore VM context and IPL.
		 */
		pmap_activate(l);
		if (prevlwp != NULL) {
			/* Normalize the count of the spin-mutexes */
			ci->ci_mtx_count++;
			/* Unmark the state of context switch */
			membar_exit();
			prevlwp->l_ctxswtch = 0;
		}

		/* Update status for lwpctl, if present. */
		if (l->l_lwpctl != NULL) {
			l->l_lwpctl->lc_curcpu = (int)cpu_index(ci);
			l->l_lwpctl->lc_pctr++;
		}

		KASSERT(l->l_cpu == ci);
		splx(oldspl);
		retval = 1;
	} else {
		/* Nothing to do - just unlock and return. */
		KASSERT(tci == NULL);
		spc_unlock(ci);
		lwp_unlock(l);
		retval = 0;
	}

	KASSERT(l == curlwp);
	KASSERT(l->l_stat == LSONPROC);

	/*
	 * XXXSMP If we are using h/w performance counters, restore context.
	 * XXXSMP preemption problem.
	 */
#if PERFCTRS
	if (PMC_ENABLED(l->l_proc)) {
		pmc_restore_context(l->l_proc);
	}
#endif
	SYSCALL_TIME_WAKEUP(l);
	LOCKDEBUG_BARRIER(NULL, 1);

	return retval;
}

/*
 * Change process state to be runnable, placing it on the run queue if it is
 * in memory, and awakening the swapper if it isn't in memory.
 *
 * Call with the process and LWP locked.  Will return with the LWP unlocked.
 */
void
setrunnable(struct lwp *l)
{
	struct proc *p = l->l_proc;
	struct cpu_info *ci;
	sigset_t *ss;

	KASSERT((l->l_flag & LW_IDLE) == 0);
	KASSERT(mutex_owned(p->p_lock));
	KASSERT(lwp_locked(l, NULL));
	KASSERT(l->l_mutex != l->l_cpu->ci_schedstate.spc_mutex);

	switch (l->l_stat) {
	case LSSTOP:
		/*
		 * If we're being traced (possibly because someone attached us
		 * while we were stopped), check for a signal from the debugger.
		 */
		if ((p->p_slflag & PSL_TRACED) != 0 && p->p_xstat != 0) {
			if ((sigprop[p->p_xstat] & SA_TOLWP) != 0)
				ss = &l->l_sigpend.sp_set;
			else
				ss = &p->p_sigpend.sp_set;
			sigaddset(ss, p->p_xstat);
			signotify(l);
		}
		p->p_nrlwps++;
		break;
	case LSSUSPENDED:
		l->l_flag &= ~LW_WSUSPEND;
		p->p_nrlwps++;
		cv_broadcast(&p->p_lwpcv);
		break;
	case LSSLEEP:
		KASSERT(l->l_wchan != NULL);
		break;
	default:
		panic("setrunnable: lwp %p state was %d", l, l->l_stat);
	}

	/*
	 * If the LWP was sleeping interruptibly, then it's OK to start it
	 * again.  If not, mark it as still sleeping.
	 */
	if (l->l_wchan != NULL) {
		l->l_stat = LSSLEEP;
		/* lwp_unsleep() will release the lock. */
		lwp_unsleep(l, true);
		return;
	}

	/*
	 * If the LWP is still on the CPU, mark it as LSONPROC.  It may be
	 * about to call mi_switch(), in which case it will yield.
	 */
	if ((l->l_flag & LW_RUNNING) != 0) {
		l->l_stat = LSONPROC;
		l->l_slptime = 0;
		lwp_unlock(l);
		return;
	}

	/*
	 * Look for a CPU to run.
	 * Set the LWP runnable.
	 */
	ci = sched_takecpu(l);
	l->l_cpu = ci;
	spc_lock(ci);
	lwp_unlock_to(l, ci->ci_schedstate.spc_mutex);
	sched_setrunnable(l);
	l->l_stat = LSRUN;
	l->l_slptime = 0;

	/*
	 * If thread is swapped out - wake the swapper to bring it back in.
	 * Otherwise, enter it into a run queue.
	 */
	if (l->l_flag & LW_INMEM) {
		sched_enqueue(l, false);
		resched_cpu(l);
		lwp_unlock(l);
	} else {
		lwp_unlock(l);
		uvm_kick_scheduler();
	}
}

/*
 * suspendsched:
 *
 *	Convert all non-L_SYSTEM LSSLEEP or LSRUN LWPs to LSSUSPENDED.
 */
void
suspendsched(void)
{
	CPU_INFO_ITERATOR cii;
	struct cpu_info *ci;
	struct lwp *l;
	struct proc *p;

	/*
	 * We do this by process in order not to violate the locking rules.
	 */
	mutex_enter(proc_lock);
	PROCLIST_FOREACH(p, &allproc) {
		if ((p->p_flag & PK_MARKER) != 0)
			continue;

		mutex_enter(p->p_lock);
		if ((p->p_flag & PK_SYSTEM) != 0) {
			mutex_exit(p->p_lock);
			continue;
		}

		p->p_stat = SSTOP;

		LIST_FOREACH(l, &p->p_lwps, l_sibling) {
			if (l == curlwp)
				continue;

			lwp_lock(l);

			/*
			 * Set LW_WREBOOT so that the LWP will suspend itself
			 * when it tries to return to user mode.  We want to
			 * get as many LWPs as possible to the user/kernel
			 * boundary, so that they will release any locks
			 * that they hold.
			 */
			l->l_flag |= (LW_WREBOOT | LW_WSUSPEND);

			if (l->l_stat == LSSLEEP &&
			    (l->l_flag & LW_SINTR) != 0) {
				/* setrunnable() will release the lock. */
				setrunnable(l);
				continue;
			}

			lwp_unlock(l);
		}

		mutex_exit(p->p_lock);
	}
	mutex_exit(proc_lock);

	/*
	 * Kick all CPUs to make them preempt any LWPs running in user mode.
	 * They'll trap into the kernel and suspend themselves in userret().
	 */
	for (CPU_INFO_FOREACH(cii, ci)) {
		spc_lock(ci);
		cpu_need_resched(ci, RESCHED_IMMED);
		spc_unlock(ci);
	}
}

/*
 * sched_unsleep:
 *
 *	This is called when the LWP has not been awoken normally but instead
 *	interrupted: for example, if the sleep timed out.  Because of this,
 *	it's not a valid action for running or idle LWPs.
 */
static u_int
sched_unsleep(struct lwp *l, bool cleanup)
{

	lwp_unlock(l);
	panic("sched_unsleep");
}

void
resched_cpu(struct lwp *l)
{
	struct cpu_info *ci;

	/*
	 * XXXSMP
	 * Since l->l_cpu persists across a context switch,
	 * this gives us *very weak* processor affinity, in
	 * that we notify the CPU on which the process last
	 * ran that it should try to switch.
	 *
	 * This does not guarantee that the process will run on
	 * that processor next, because another processor might
	 * grab it the next time it performs a context switch.
	 *
	 * This also does not handle the case where its last
	 * CPU is running a higher-priority process, but every
	 * other CPU is running a lower-priority process.  There
	 * are ways to handle this situation, but they're not
	 * currently very pretty, and we also need to weigh the
	 * cost of moving a process from one CPU to another.
	 */
	ci = l->l_cpu;
	if (lwp_eprio(l) > ci->ci_schedstate.spc_curpriority)
		cpu_need_resched(ci, 0);
}

static void
sched_changepri(struct lwp *l, pri_t pri)
{

	KASSERT(lwp_locked(l, NULL));

	if (l->l_stat == LSRUN && (l->l_flag & LW_INMEM) != 0) {
		KASSERT(lwp_locked(l, l->l_cpu->ci_schedstate.spc_mutex));
		sched_dequeue(l);
		l->l_priority = pri;
		sched_enqueue(l, false);
	} else {
		l->l_priority = pri;
	}
	resched_cpu(l);
}

static void
sched_lendpri(struct lwp *l, pri_t pri)
{

	KASSERT(lwp_locked(l, NULL));

	if (l->l_stat == LSRUN && (l->l_flag & LW_INMEM) != 0) {
		KASSERT(lwp_locked(l, l->l_cpu->ci_schedstate.spc_mutex));
		sched_dequeue(l);
		l->l_inheritedprio = pri;
		sched_enqueue(l, false);
	} else {
		l->l_inheritedprio = pri;
	}
	resched_cpu(l);
}

struct lwp *
syncobj_noowner(wchan_t wchan)
{

	return NULL;
}

/* decay 95% of `p_pctcpu' in 60 seconds; see CCPU_SHIFT before changing */
fixpt_t	ccpu = 0.95122942450071400909 * FSCALE;		/* exp(-1/20) */

/*
 * If `ccpu' is not equal to `exp(-1/20)' and you still want to use the
 * faster/more-accurate formula, you'll have to estimate CCPU_SHIFT below
 * and possibly adjust FSHIFT in "param.h" so that (FSHIFT >= CCPU_SHIFT).
 *
 * To estimate CCPU_SHIFT for exp(-1/20), the following formula was used:
 *	1 - exp(-1/20) ~= 0.0487 ~= 0.0488 == 1 (fixed pt, *11* bits).
 *
 * If you don't want to bother with the faster/more-accurate formula, you
 * can set CCPU_SHIFT to (FSHIFT + 1) which will use a slower/less-accurate
 * (more general) method of calculating the percentage of CPU used by a
 * process.
 */
#define	CCPU_SHIFT	(FSHIFT + 1)
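
/*
 * Worked example of the decay above: sched_pstats() runs once per second
 * and applies "l_pctcpu = (l_pctcpu * ccpu) >> FSHIFT", i.e. it multiplies
 * the estimate by exp(-1/20) each second.  After 60 seconds the old value
 * has been scaled by exp(-60/20) = exp(-3) ~= 0.05, so roughly 95% of the
 * previous %CPU has decayed away, matching the comment above.
 */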

/*
 * sched_pstats:
 *
 *	Update process statistics and check CPU resource allocation.
 *	Call scheduler-specific hook to eventually adjust process/LWP
 *	priorities.
 */
/* ARGSUSED */
void
sched_pstats(void *arg)
{
	struct rlimit *rlim;
	struct lwp *l;
	struct proc *p;
	int sig, clkhz;
	long runtm;

	sched_pstats_ticks++;

	mutex_enter(proc_lock);
	PROCLIST_FOREACH(p, &allproc) {
		if ((p->p_flag & PK_MARKER) != 0)
			continue;

		/*
		 * Increment time in/out of memory and sleep time (if
		 * sleeping).  We ignore overflow; with 16-bit int's
		 * (remember them?) overflow takes 45 days.
		 */
		mutex_enter(p->p_lock);
		mutex_spin_enter(&p->p_stmutex);
		runtm = p->p_rtime.sec;
		LIST_FOREACH(l, &p->p_lwps, l_sibling) {
			if ((l->l_flag & LW_IDLE) != 0)
				continue;
			lwp_lock(l);
			runtm += l->l_rtime.sec;
			l->l_swtime++;
			sched_pstats_hook(l);
			lwp_unlock(l);

			/*
			 * p_pctcpu is only for ps.
			 */
			l->l_pctcpu = (l->l_pctcpu * ccpu) >> FSHIFT;
			if (l->l_slptime < 1) {
				clkhz = stathz != 0 ? stathz : hz;
#if	(FSHIFT >= CCPU_SHIFT)
				l->l_pctcpu += (clkhz == 100) ?
				    ((fixpt_t)l->l_cpticks) <<
				    (FSHIFT - CCPU_SHIFT) :
				    100 * (((fixpt_t) p->p_cpticks)
				    << (FSHIFT - CCPU_SHIFT)) / clkhz;
#else
				l->l_pctcpu += ((FSCALE - ccpu) *
				    (l->l_cpticks * FSCALE / clkhz)) >> FSHIFT;
#endif
				l->l_cpticks = 0;
			}
		}
		p->p_pctcpu = (p->p_pctcpu * ccpu) >> FSHIFT;
		mutex_spin_exit(&p->p_stmutex);

		/*
		 * Check if the process exceeds its CPU resource allocation.
		 * If over max, kill it.
		 */
		rlim = &p->p_rlimit[RLIMIT_CPU];
		sig = 0;
		if (runtm >= rlim->rlim_cur) {
			if (runtm >= rlim->rlim_max)
				sig = SIGKILL;
			else {
				sig = SIGXCPU;
				if (rlim->rlim_cur < rlim->rlim_max)
					rlim->rlim_cur += 5;
			}
		}
		mutex_exit(p->p_lock);
		if (sig)
			psignal(p, sig);
	}
	mutex_exit(proc_lock);
	uvm_meter();
	cv_wakeup(&lbolt);
	callout_schedule(&sched_pstats_ch, hz);
}