/*	$NetBSD: kern_synch.c,v 1.241.2.1 2008/05/10 23:49:04 wrstuden Exp $	*/

/*-
 * Copyright (c) 1999, 2000, 2004, 2006, 2007, 2008 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
 * NASA Ames Research Center, by Charles M. Hannum, Andrew Doran and
 * Daniel Sieger.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*-
 * Copyright (c) 1982, 1986, 1990, 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)kern_synch.c	8.9 (Berkeley) 5/19/95
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: kern_synch.c,v 1.241.2.1 2008/05/10 23:49:04 wrstuden Exp $");

#include "opt_kstack.h"
#include "opt_lockdebug.h"
#include "opt_multiprocessor.h"
#include "opt_perfctrs.h"

#define __MUTEX_PRIVATE

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/kernel.h>
#if defined(PERFCTRS)
#include <sys/pmc.h>
#endif
#include <sys/cpu.h>
#include <sys/resourcevar.h>
#include <sys/sched.h>
#include <sys/sa.h>
#include <sys/savar.h>
#include <sys/syscall_stats.h>
#include <sys/sleepq.h>
#include <sys/lockdebug.h>
#include <sys/evcnt.h>
#include <sys/intr.h>
#include <sys/lwpctl.h>
#include <sys/atomic.h>
#include <sys/simplelock.h>

#include <uvm/uvm_extern.h>

#include <dev/lockstat.h>

static u_int	sched_unsleep(struct lwp *, bool);
static void	sched_changepri(struct lwp *, pri_t);
static void	sched_lendpri(struct lwp *, pri_t);

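/*
 * Sync object methods.  sleep_syncobj is used for LWPs blocked via the
 * sleep calls in this file (ltsleep/mtsleep/kpause); sched_syncobj
 * provides the corresponding scheduler methods (sched_unsleep(),
 * sched_changepri() and sched_lendpri() below).
 */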
syncobj_t sleep_syncobj = {
        SOBJ_SLEEPQ_SORTED,
        sleepq_unsleep,
        sleepq_changepri,
        sleepq_lendpri,
        syncobj_noowner,
};

syncobj_t sched_syncobj = {
        SOBJ_SLEEPQ_SORTED,
        sched_unsleep,
        sched_changepri,
        sched_lendpri,
        syncobj_noowner,
};

callout_t	sched_pstats_ch;
unsigned	sched_pstats_ticks;
kcondvar_t	lbolt;			/* once a second sleep address */

/* Preemption event counters */
static struct evcnt kpreempt_ev_crit;
static struct evcnt kpreempt_ev_klock;
static struct evcnt kpreempt_ev_ipl;
static struct evcnt kpreempt_ev_immed;

/*
 * During autoconfiguration or after a panic, a sleep will simply lower the
 * priority briefly to allow interrupts, then return.  The priority to be
 * used (safepri) is machine-dependent, thus this value is initialized and
 * maintained in the machine-dependent layers.  This priority will typically
 * be 0, or the lowest priority that is safe for use on the interrupt stack;
 * it can be made higher to block network software interrupts after panics.
 */
int	safepri;

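/*
 * sched_init:
 *
 *	Set up the global pieces used by this file: the lbolt condition
 *	variable, the periodic sched_pstats() callout and the kernel
 *	preemption event counters.
 */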
void
sched_init(void)
{

        cv_init(&lbolt, "lbolt");
        callout_init(&sched_pstats_ch, CALLOUT_MPSAFE);
        callout_setfunc(&sched_pstats_ch, sched_pstats, NULL);

        evcnt_attach_dynamic(&kpreempt_ev_crit, EVCNT_TYPE_MISC, NULL,
            "kpreempt", "defer: critical section");
        evcnt_attach_dynamic(&kpreempt_ev_klock, EVCNT_TYPE_MISC, NULL,
            "kpreempt", "defer: kernel_lock");
        evcnt_attach_dynamic(&kpreempt_ev_ipl, EVCNT_TYPE_MISC, NULL,
            "kpreempt", "defer: IPL");
        evcnt_attach_dynamic(&kpreempt_ev_immed, EVCNT_TYPE_MISC, NULL,
            "kpreempt", "immediate");

        sched_pstats(NULL);
}

/*
 * OBSOLETE INTERFACE
 *
 * General sleep call.  Suspends the current process until a wakeup is
 * performed on the specified identifier.  The process will then be made
 * runnable with the specified priority.  Sleeps at most timo/hz seconds (0
 * means no timeout).  If pri includes the PCATCH flag, signals are checked
 * before and after sleeping, else signals are not checked.  Returns 0 if
 * awakened, EWOULDBLOCK if the timeout expires.  If PCATCH is set and a
 * signal needs to be delivered, ERESTART is returned if the current system
 * call should be restarted if possible, and EINTR is returned if the system
 * call should be interrupted by the signal.
 *
 * The interlock is held until we are on a sleep queue.  The interlock will
 * be locked before returning back to the caller unless the PNORELOCK flag
 * is specified, in which case the interlock will always be unlocked upon
 * return.
 */
int
ltsleep(wchan_t ident, pri_t priority, const char *wmesg, int timo,
        volatile struct simplelock *interlock)
{
        struct lwp *l = curlwp;
        sleepq_t *sq;
        int error;

        KASSERT((l->l_pflag & LP_INTR) == 0);

        if (sleepq_dontsleep(l)) {
                (void)sleepq_abort(NULL, 0);
                if ((priority & PNORELOCK) != 0)
                        simple_unlock(interlock);
                return 0;
        }

        l->l_kpriority = true;
        sq = sleeptab_lookup(&sleeptab, ident);
        sleepq_enter(sq, l);
        sleepq_enqueue(sq, ident, wmesg, &sleep_syncobj);

        if (interlock != NULL) {
                KASSERT(simple_lock_held(interlock));
                simple_unlock(interlock);
        }

        error = sleepq_block(timo, priority & PCATCH);

        if (interlock != NULL && (priority & PNORELOCK) == 0)
                simple_lock(interlock);

        return error;
}

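/*
 * As ltsleep(), but the interlock is a kmutex_t: it is released with
 * mutex_exit() once the LWP is on the sleep queue and, unless PNORELOCK
 * is set, reacquired before returning.
 */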
int
mtsleep(wchan_t ident, pri_t priority, const char *wmesg, int timo,
        kmutex_t *mtx)
{
        struct lwp *l = curlwp;
        sleepq_t *sq;
        int error;

        KASSERT((l->l_pflag & LP_INTR) == 0);

        if (sleepq_dontsleep(l)) {
                (void)sleepq_abort(mtx, (priority & PNORELOCK) != 0);
                return 0;
        }

        l->l_kpriority = true;
        sq = sleeptab_lookup(&sleeptab, ident);
        sleepq_enter(sq, l);
        sleepq_enqueue(sq, ident, wmesg, &sleep_syncobj);
        mutex_exit(mtx);
        error = sleepq_block(timo, priority & PCATCH);

        if ((priority & PNORELOCK) == 0)
                mutex_enter(mtx);

        return error;
}

/*
 * General sleep call for situations where a wake-up is not expected.
 */
int
kpause(const char *wmesg, bool intr, int timo, kmutex_t *mtx)
{
        struct lwp *l = curlwp;
        sleepq_t *sq;
        int error;

        if (sleepq_dontsleep(l))
                return sleepq_abort(NULL, 0);

        if (mtx != NULL)
                mutex_exit(mtx);
        l->l_kpriority = true;
        sq = sleeptab_lookup(&sleeptab, l);
        sleepq_enter(sq, l);
        sleepq_enqueue(sq, l, wmesg, &sleep_syncobj);
        error = sleepq_block(timo, intr);
        if (mtx != NULL)
                mutex_enter(mtx);

        return error;
}

/*
 * OBSOLETE INTERFACE
 *
 * Make all processes sleeping on the specified identifier runnable.
 */
void
wakeup(wchan_t ident)
{
        sleepq_t *sq;

        if (cold)
                return;

        sq = sleeptab_lookup(&sleeptab, ident);
        sleepq_wake(sq, ident, (u_int)-1);
}

/*
 * OBSOLETE INTERFACE
 *
 * Make the highest priority process first in line on the specified
 * identifier runnable.
 */
void
wakeup_one(wchan_t ident)
{
        sleepq_t *sq;

        if (cold)
                return;

        sq = sleeptab_lookup(&sleeptab, ident);
        sleepq_wake(sq, ident, 1);
}

/*
 * General yield call.  Puts the current process back on its run queue and
 * performs a voluntary context switch.  Should only be called when the
 * current process explicitly requests it (e.g. sched_yield(2)).
 */
void
yield(void)
{
        struct lwp *l = curlwp;

        KERNEL_UNLOCK_ALL(l, &l->l_biglocks);
        lwp_lock(l);
        KASSERT(lwp_locked(l, l->l_cpu->ci_schedstate.spc_lwplock));
        KASSERT(l->l_stat == LSONPROC);
        l->l_kpriority = false;
        (void)mi_switch(l);
        KERNEL_LOCK(l->l_biglocks, l);
}

/*
 * General preemption call.  Puts the current process back on its run queue
 * and performs an involuntary context switch.
 */
void
preempt(void)
{
        struct lwp *l = curlwp;

        KERNEL_UNLOCK_ALL(l, &l->l_biglocks);
        lwp_lock(l);
        KASSERT(lwp_locked(l, l->l_cpu->ci_schedstate.spc_lwplock));
        KASSERT(l->l_stat == LSONPROC);
        l->l_kpriority = false;
        l->l_nivcsw++;
        (void)mi_switch(l);
        KERNEL_LOCK(l->l_biglocks, l);
}

/*
 * Handle a request made by another agent to preempt the current LWP
 * in-kernel.  Usually called when l_dopreempt may be non-zero.
 *
 * Character addresses for lockstat only.
 */
static char	in_critical_section;
static char	kernel_lock_held;
static char	spl_raised;
static char	is_softint;

bool
kpreempt(uintptr_t where)
{
        uintptr_t failed;
        lwp_t *l;
        int s, dop;

        l = curlwp;
        failed = 0;
        while ((dop = l->l_dopreempt) != 0) {
                if (l->l_stat != LSONPROC) {
                        /*
                         * About to block (or die), let it happen.
                         * Doesn't really count as "preemption has
                         * been blocked", since we're going to
                         * context switch.
                         */
                        l->l_dopreempt = 0;
                        return true;
                }
                if (__predict_false((l->l_flag & LW_IDLE) != 0)) {
                        /* Can't preempt idle loop, don't count as failure. */
                        l->l_dopreempt = 0;
                        return true;
                }
                if (__predict_false(l->l_nopreempt != 0)) {
                        /* LWP holds preemption disabled, explicitly. */
                        if ((dop & DOPREEMPT_COUNTED) == 0) {
                                kpreempt_ev_crit.ev_count++;
                        }
                        failed = (uintptr_t)&in_critical_section;
                        break;
                }
                if (__predict_false((l->l_pflag & LP_INTR) != 0)) {
                        /* Can't preempt soft interrupts yet. */
                        l->l_dopreempt = 0;
                        failed = (uintptr_t)&is_softint;
                        break;
                }
                s = splsched();
                if (__predict_false(l->l_blcnt != 0 ||
                    curcpu()->ci_biglock_wanted != NULL)) {
                        /* Hold or want kernel_lock, code is not MT safe. */
                        splx(s);
                        if ((dop & DOPREEMPT_COUNTED) == 0) {
                                kpreempt_ev_klock.ev_count++;
                        }
                        failed = (uintptr_t)&kernel_lock_held;
                        break;
                }
                if (__predict_false(!cpu_kpreempt_enter(where, s))) {
                        /*
                         * It may be that the IPL is too high.
                         * cpu_kpreempt_enter() can schedule an
                         * interrupt to retry later.
                         */
                        splx(s);
                        if ((dop & DOPREEMPT_COUNTED) == 0) {
                                kpreempt_ev_ipl.ev_count++;
                        }
                        failed = (uintptr_t)&spl_raised;
                        break;
                }
                /* Do it! */
                if (__predict_true((dop & DOPREEMPT_COUNTED) == 0)) {
                        kpreempt_ev_immed.ev_count++;
                }
                lwp_lock(l);
                mi_switch(l);
                l->l_nopreempt++;
                splx(s);

                /* Take care of any MD cleanup. */
                cpu_kpreempt_exit(where);
                l->l_nopreempt--;
        }

        /* Record preemption failure for reporting via lockstat. */
        if (__predict_false(failed)) {
                int lsflag = 0;
                atomic_or_uint(&l->l_dopreempt, DOPREEMPT_COUNTED);
                LOCKSTAT_ENTER(lsflag);
                /* Might recurse, make it atomic. */
                if (__predict_false(lsflag)) {
                        if (where == 0) {
                                where = (uintptr_t)__builtin_return_address(0);
                        }
                        if (atomic_cas_ptr_ni((void *)&l->l_pfailaddr,
                            NULL, (void *)where) == NULL) {
                                LOCKSTAT_START_TIMER(lsflag, l->l_pfailtime);
                                l->l_pfaillock = failed;
                        }
                }
                LOCKSTAT_EXIT(lsflag);
        }

        return failed;
}

/*
 * Return true if preemption is explicitly disabled.
 */
bool
kpreempt_disabled(void)
{
        lwp_t *l;

        l = curlwp;

        return l->l_nopreempt != 0 || l->l_stat == LSZOMB ||
            (l->l_flag & LW_IDLE) != 0 || cpu_kpreempt_disabled();
}

/*
 * Disable kernel preemption.
 */
void
kpreempt_disable(void)
{

        KPREEMPT_DISABLE(curlwp);
}

/*
 * Reenable kernel preemption.
 */
void
kpreempt_enable(void)
{

        KPREEMPT_ENABLE(curlwp);
}

/*
 * Compute the amount of time during which the current lwp was running.
 *
 * - update l_rtime unless it's an idle lwp.
 */

void
updatertime(lwp_t *l, const struct bintime *now)
{

        if ((l->l_flag & LW_IDLE) != 0)
                return;

        /* rtime += now - stime */
        bintime_add(&l->l_rtime, now);
        bintime_sub(&l->l_rtime, &l->l_stime);
}

/*
 * The machine independent parts of context switch.
 *
 * Returns 1 if another LWP was actually run.
 */
int
mi_switch(lwp_t *l)
{
        struct cpu_info *ci, *tci = NULL;
        struct schedstate_percpu *spc;
        struct lwp *newl;
        int retval, oldspl;
        struct bintime bt;
        bool returning;

        KASSERT(lwp_locked(l, NULL));
        KASSERT(kpreempt_disabled());
        LOCKDEBUG_BARRIER(l->l_mutex, 1);

#ifdef KSTACK_CHECK_MAGIC
        kstack_check_magic(l);
#endif

        binuptime(&bt);

        KASSERT(l->l_cpu == curcpu());
        ci = l->l_cpu;
        spc = &ci->ci_schedstate;
        returning = false;
        newl = NULL;

        /*
         * If we have been asked to switch to a specific LWP, then there
         * is no need to inspect the run queues.  If a soft interrupt is
         * blocking, then return to the interrupted thread without adjusting
         * VM context or its start time: neither has been changed in order
         * to take the interrupt.
         */
        if (l->l_switchto != NULL) {
                if ((l->l_pflag & LP_INTR) != 0) {
                        returning = true;
                        softint_block(l);
                        if ((l->l_flag & LW_TIMEINTR) != 0)
                                updatertime(l, &bt);
                }
                newl = l->l_switchto;
                l->l_switchto = NULL;
        }
#ifndef __HAVE_FAST_SOFTINTS
        else if (ci->ci_data.cpu_softints != 0) {
                /* There are pending soft interrupts, so pick one. */
                newl = softint_picklwp();
                newl->l_stat = LSONPROC;
                newl->l_flag |= LW_RUNNING;
        }
#endif	/* !__HAVE_FAST_SOFTINTS */

        /* Count time spent in current system call */
        if (!returning) {
                SYSCALL_TIME_SLEEP(l);

                /*
                 * XXXSMP If we are using h/w performance counters,
                 * save context.
                 */
#if PERFCTRS
                if (PMC_ENABLED(l->l_proc)) {
                        pmc_save_context(l->l_proc);
                }
#endif
                updatertime(l, &bt);
        }

        /*
         * If on the CPU and we have gotten this far, then we must yield.
         */
        KASSERT(l->l_stat != LSRUN);
        if (l->l_stat == LSONPROC && (l->l_target_cpu || l != newl)) {
                KASSERT(lwp_locked(l, spc->spc_lwplock));

                if (l->l_target_cpu == l->l_cpu) {
                        l->l_target_cpu = NULL;
                } else {
                        tci = l->l_target_cpu;
                }

                if (__predict_false(tci != NULL)) {
                        /* Double-lock the runqueues */
                        spc_dlock(ci, tci);
                } else {
                        /* Lock the runqueue */
                        spc_lock(ci);
                }

                if ((l->l_flag & LW_IDLE) == 0) {
                        l->l_stat = LSRUN;
                        if (__predict_false(tci != NULL)) {
                                /*
                                 * Set the new CPU and lock, and clear
                                 * l_target_cpu: the thread will be enqueued
                                 * on the run queue of the target CPU.
                                 */
                                l->l_cpu = tci;
                                lwp_setlock(l, tci->ci_schedstate.spc_mutex);
                                l->l_target_cpu = NULL;
                        } else {
                                lwp_setlock(l, spc->spc_mutex);
                        }
                        sched_enqueue(l, true);
                } else {
                        KASSERT(tci == NULL);
                        l->l_stat = LSIDL;
                }
        } else {
                /* Lock the runqueue */
                spc_lock(ci);
        }

625
626 /*
627 * Let sched_nextlwp() select the LWP to run the CPU next.
628 * If no LWP is runnable, select the idle LWP.
629 *
630 * Note that spc_lwplock might not necessary be held, and
631 * new thread would be unlocked after setting the LWP-lock.
632 */
633 if (newl == NULL) {
634 newl = sched_nextlwp();
635 if (newl != NULL) {
636 sched_dequeue(newl);
637 KASSERT(lwp_locked(newl, spc->spc_mutex));
638 newl->l_stat = LSONPROC;
639 newl->l_cpu = ci;
640 newl->l_flag |= LW_RUNNING;
641 lwp_setlock(newl, spc->spc_lwplock);
642 } else {
643 newl = ci->ci_data.cpu_idlelwp;
644 newl->l_stat = LSONPROC;
645 newl->l_flag |= LW_RUNNING;
646 }
647 /*
648 * Only clear want_resched if there are no
649 * pending (slow) software interrupts.
650 */
651 ci->ci_want_resched = ci->ci_data.cpu_softints;
652 spc->spc_flags &= ~SPCF_SWITCHCLEAR;
653 spc->spc_curpriority = lwp_eprio(newl);
654 }
655
        /* Items that must be updated with the CPU locked. */
        if (!returning) {
                /* Update the new LWP's start time. */
                newl->l_stime = bt;

                /*
                 * ci_curlwp changes when a fast soft interrupt occurs.
                 * We use cpu_onproc to keep track of which kernel or
                 * user thread is running 'underneath' the software
                 * interrupt.  This is important for time accounting,
                 * itimers and forcing user threads to preempt (aston).
                 */
                ci->ci_data.cpu_onproc = newl;
        }

        /*
         * Preemption related tasks.  Must be done with the current
         * CPU locked.
         */
        cpu_did_resched(l);
        l->l_dopreempt = 0;
        if (__predict_false(l->l_pfailaddr != 0)) {
                LOCKSTAT_FLAG(lsflag);
                LOCKSTAT_ENTER(lsflag);
                LOCKSTAT_STOP_TIMER(lsflag, l->l_pfailtime);
                LOCKSTAT_EVENT_RA(lsflag, l->l_pfaillock, LB_NOPREEMPT|LB_SPIN,
                    1, l->l_pfailtime, l->l_pfailaddr);
                LOCKSTAT_EXIT(lsflag);
                l->l_pfailtime = 0;
                l->l_pfaillock = 0;
                l->l_pfailaddr = 0;
        }

        if (l != newl) {
                struct lwp *prevlwp;

                /* Release all locks, but leave the current LWP locked */
                if (l->l_mutex == l->l_cpu->ci_schedstate.spc_mutex) {
                        /*
                         * In case of migration, drop the local runqueue
                         * lock, thread is on other runqueue now.
                         */
                        if (__predict_false(tci != NULL))
                                spc_unlock(ci);
                        /*
                         * Drop spc_lwplock, if the current LWP has been moved
                         * to the run queue (it is now locked by spc_mutex).
                         */
                        mutex_spin_exit(spc->spc_lwplock);
                } else {
                        /*
                         * Otherwise, drop the spc_mutex, we are done with the
                         * run queues.
                         */
                        mutex_spin_exit(spc->spc_mutex);
                        KASSERT(tci == NULL);
                }

                /*
                 * Mark that a context switch is going to be performed
                 * for this LWP, to protect it from being switched
                 * to on another CPU.
                 */
                KASSERT(l->l_ctxswtch == 0);
                l->l_ctxswtch = 1;
                l->l_ncsw++;
                l->l_flag &= ~LW_RUNNING;

                /*
                 * Increase the count of spin-mutexes before the release
                 * of the last lock - we must remain at IPL_SCHED during
                 * the context switch.
                 */
                oldspl = MUTEX_SPIN_OLDSPL(ci);
                ci->ci_mtx_count--;
                lwp_unlock(l);

                /* Count the context switch on this CPU. */
                ci->ci_data.cpu_nswtch++;

                /* Update status for lwpctl, if present. */
                if (l->l_lwpctl != NULL)
                        l->l_lwpctl->lc_curcpu = LWPCTL_CPU_NONE;

                /*
                 * Save old VM context, unless a soft interrupt
                 * handler is blocking.
                 */
                if (!returning)
                        pmap_deactivate(l);

                /*
                 * We may need to spin-wait if 'newl' is still
                 * context switching on another CPU.
                 */
                if (newl->l_ctxswtch != 0) {
                        u_int count;
                        count = SPINLOCK_BACKOFF_MIN;
                        while (newl->l_ctxswtch)
                                SPINLOCK_BACKOFF(count);
                }

                /* Switch to the new LWP. */
                prevlwp = cpu_switchto(l, newl, returning);
                ci = curcpu();

                /*
                 * Switched away - we have new curlwp.
                 * Restore VM context and IPL.
                 */
                pmap_activate(l);
                if (prevlwp != NULL) {
                        /* Normalize the count of the spin-mutexes */
                        ci->ci_mtx_count++;
                        /* Unmark the state of context switch */
                        membar_exit();
                        prevlwp->l_ctxswtch = 0;
                }

                /* Update status for lwpctl, if present. */
                if (l->l_lwpctl != NULL) {
                        l->l_lwpctl->lc_curcpu = (int)cpu_index(ci);
                        l->l_lwpctl->lc_pctr++;
                }

                KASSERT(l->l_cpu == ci);
                splx(oldspl);
                retval = 1;
        } else {
                /* Nothing to do - just unlock and return. */
                KASSERT(tci == NULL);
                spc_unlock(ci);
                lwp_unlock(l);
                retval = 0;
        }

        KASSERT(l == curlwp);
        KASSERT(l->l_stat == LSONPROC);

        /*
         * XXXSMP If we are using h/w performance counters, restore context.
         * XXXSMP preemption problem.
         */
#if PERFCTRS
        if (PMC_ENABLED(l->l_proc)) {
                pmc_restore_context(l->l_proc);
        }
#endif
        SYSCALL_TIME_WAKEUP(l);
        LOCKDEBUG_BARRIER(NULL, 1);

        return retval;
}

/*
 * Change process state to be runnable, placing it on the run queue if it is
 * in memory, and awakening the swapper if it isn't in memory.
 *
 * Call with the process and LWP locked.  Will return with the LWP unlocked.
 */
void
setrunnable(struct lwp *l)
{
        struct proc *p = l->l_proc;
        struct cpu_info *ci;
        sigset_t *ss;

        KASSERT((l->l_flag & LW_IDLE) == 0);
        KASSERT(mutex_owned(p->p_lock));
        KASSERT(lwp_locked(l, NULL));
        KASSERT(l->l_mutex != l->l_cpu->ci_schedstate.spc_mutex);

        switch (l->l_stat) {
        case LSSTOP:
                /*
                 * If we're being traced (possibly because someone attached us
                 * while we were stopped), check for a signal from the debugger.
                 */
                if ((p->p_slflag & PSL_TRACED) != 0 && p->p_xstat != 0) {
                        if ((sigprop[p->p_xstat] & SA_TOLWP) != 0)
                                ss = &l->l_sigpend.sp_set;
                        else
                                ss = &p->p_sigpend.sp_set;
                        sigaddset(ss, p->p_xstat);
                        signotify(l);
                }
                p->p_nrlwps++;
                break;
        case LSSUSPENDED:
                l->l_flag &= ~LW_WSUSPEND;
                p->p_nrlwps++;
                cv_broadcast(&p->p_lwpcv);
                break;
        case LSSLEEP:
                KASSERT(l->l_wchan != NULL);
                break;
        default:
                panic("setrunnable: lwp %p state was %d", l, l->l_stat);
        }

        /*
         * If the LWP was sleeping interruptibly, then it's OK to start it
         * again.  If not, mark it as still sleeping.
         */
        if (l->l_wchan != NULL) {
                l->l_stat = LSSLEEP;
                /* lwp_unsleep() will release the lock. */
                lwp_unsleep(l, true);
                return;
        }

        /*
         * If the LWP is still on the CPU, mark it as LSONPROC.  It may be
         * about to call mi_switch(), in which case it will yield.
         */
        if ((l->l_flag & LW_RUNNING) != 0) {
                l->l_stat = LSONPROC;
                l->l_slptime = 0;
                lwp_unlock(l);
                return;
        }

        /*
         * Look for a CPU to run.
         * Set the LWP runnable.
         */
        ci = sched_takecpu(l);
        l->l_cpu = ci;
        spc_lock(ci);
        lwp_unlock_to(l, ci->ci_schedstate.spc_mutex);
        sched_setrunnable(l);
        l->l_stat = LSRUN;
        l->l_slptime = 0;

        /*
         * If the thread is swapped out, wake the swapper to bring it back in.
         * Otherwise, enter it into a run queue.
         */
        if (l->l_flag & LW_INMEM) {
                sched_enqueue(l, false);
                resched_cpu(l);
                lwp_unlock(l);
        } else {
                lwp_unlock(l);
                uvm_kick_scheduler();
        }
}

/*
 * suspendsched:
 *
 *	Convert all non-L_SYSTEM LSSLEEP or LSRUN LWPs to LSSUSPENDED.
 */
void
suspendsched(void)
{
        CPU_INFO_ITERATOR cii;
        struct cpu_info *ci;
        struct lwp *l;
        struct proc *p;

        /*
         * We do this by process in order not to violate the locking rules.
         */
        mutex_enter(proc_lock);
        PROCLIST_FOREACH(p, &allproc) {
                if ((p->p_flag & PK_MARKER) != 0)
                        continue;

                mutex_enter(p->p_lock);
                if ((p->p_flag & PK_SYSTEM) != 0) {
                        mutex_exit(p->p_lock);
                        continue;
                }

                p->p_stat = SSTOP;

                LIST_FOREACH(l, &p->p_lwps, l_sibling) {
                        if (l == curlwp)
                                continue;

                        lwp_lock(l);

                        /*
                         * Set L_WREBOOT so that the LWP will suspend itself
                         * when it tries to return to user mode.  We want to
                         * try to get as many LWPs as possible to the
                         * user / kernel boundary, so that they will
                         * release any locks that they hold.
                         */
                        l->l_flag |= (LW_WREBOOT | LW_WSUSPEND);

                        if (l->l_stat == LSSLEEP &&
                            (l->l_flag & LW_SINTR) != 0) {
                                /* setrunnable() will release the lock. */
                                setrunnable(l);
                                continue;
                        }

                        lwp_unlock(l);
                }

                mutex_exit(p->p_lock);
        }
        mutex_exit(proc_lock);

        /*
         * Kick all CPUs to make them preempt any LWPs running in user mode.
         * They'll trap into the kernel and suspend themselves in userret().
         */
        for (CPU_INFO_FOREACH(cii, ci)) {
                spc_lock(ci);
                cpu_need_resched(ci, RESCHED_IMMED);
                spc_unlock(ci);
        }
}

/*
 * sched_unsleep:
 *
 *	This is called when the LWP has not been awoken normally but instead
 *	interrupted: for example, if the sleep timed out.  Because of this,
 *	it's not a valid action for running or idle LWPs.
 */
static u_int
sched_unsleep(struct lwp *l, bool cleanup)
{

        lwp_unlock(l);
        panic("sched_unsleep");
}

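/*
 * resched_cpu:
 *
 *	Request a reschedule on the LWP's CPU if the LWP's effective
 *	priority is higher than that of the LWP currently running there.
 */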
void
resched_cpu(struct lwp *l)
{
        struct cpu_info *ci;

        /*
         * XXXSMP
         * Since l->l_cpu persists across a context switch,
         * this gives us *very weak* processor affinity, in
         * that we notify the CPU on which the process last
         * ran that it should try to switch.
         *
         * This does not guarantee that the process will run on
         * that processor next, because another processor might
         * grab it the next time it performs a context switch.
         *
         * This also does not handle the case where its last
         * CPU is running a higher-priority process, but every
         * other CPU is running a lower-priority process.  There
         * are ways to handle this situation, but they're not
         * currently very pretty, and we also need to weigh the
         * cost of moving a process from one CPU to another.
         */
        ci = l->l_cpu;
        if (lwp_eprio(l) > ci->ci_schedstate.spc_curpriority)
                cpu_need_resched(ci, 0);
}

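/*
 * sched_changepri:
 *
 *	Change the priority of an LWP, requeueing it on its run queue
 *	if it is runnable and in memory.
 */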
static void
sched_changepri(struct lwp *l, pri_t pri)
{

        KASSERT(lwp_locked(l, NULL));

        if (l->l_stat == LSRUN && (l->l_flag & LW_INMEM) != 0) {
                KASSERT(lwp_locked(l, l->l_cpu->ci_schedstate.spc_mutex));
                sched_dequeue(l);
                l->l_priority = pri;
                sched_enqueue(l, false);
        } else {
                l->l_priority = pri;
        }
        resched_cpu(l);
}

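/*
 * sched_lendpri:
 *
 *	As sched_changepri(), but adjusts the lent (inherited) priority
 *	rather than the LWP's own priority.
 */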
static void
sched_lendpri(struct lwp *l, pri_t pri)
{

        KASSERT(lwp_locked(l, NULL));

        if (l->l_stat == LSRUN && (l->l_flag & LW_INMEM) != 0) {
                KASSERT(lwp_locked(l, l->l_cpu->ci_schedstate.spc_mutex));
                sched_dequeue(l);
                l->l_inheritedprio = pri;
                sched_enqueue(l, false);
        } else {
                l->l_inheritedprio = pri;
        }
        resched_cpu(l);
}

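/*
 * syncobj_noowner:
 *
 *	Owner method for sync objects that have no owning LWP;
 *	always returns NULL.
 */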
struct lwp *
syncobj_noowner(wchan_t wchan)
{

        return NULL;
}

/* decay 95% of `p_pctcpu' in 60 seconds; see CCPU_SHIFT before changing */
fixpt_t ccpu = 0.95122942450071400909 * FSCALE;		/* exp(-1/20) */

/*
 * If `ccpu' is not equal to `exp(-1/20)' and you still want to use the
 * faster/more-accurate formula, you'll have to estimate CCPU_SHIFT below
 * and possibly adjust FSHIFT in "param.h" so that (FSHIFT >= CCPU_SHIFT).
 *
 * To estimate CCPU_SHIFT for exp(-1/20), the following formula was used:
 *	1 - exp(-1/20) ~= 0.0487 ~= 0.0488 == 1 (fixed pt, *11* bits).
 *
 * If you don't want to bother with the faster/more-accurate formula, you
 * can set CCPU_SHIFT to (FSHIFT + 1) which will use a slower/less-accurate
 * (more general) method of calculating the %age of CPU used by a process.
 */
#define	CCPU_SHIFT	(FSHIFT + 1)

/*
 * sched_pstats:
 *
 *	Update process statistics and check CPU resource allocation.
 *	Call scheduler-specific hook to eventually adjust process/LWP
 *	priorities.
 */
/* ARGSUSED */
void
sched_pstats(void *arg)
{
        struct rlimit *rlim;
        struct lwp *l;
        struct proc *p;
        int sig, clkhz;
        long runtm;

        sched_pstats_ticks++;

        mutex_enter(proc_lock);
        PROCLIST_FOREACH(p, &allproc) {
                if ((p->p_flag & PK_MARKER) != 0)
                        continue;

                /*
                 * Increment time in/out of memory and sleep time (if
                 * sleeping).  We ignore overflow; with 16-bit int's
                 * (remember them?) overflow takes 45 days.
                 */
                mutex_enter(p->p_lock);
                mutex_spin_enter(&p->p_stmutex);
                runtm = p->p_rtime.sec;
                LIST_FOREACH(l, &p->p_lwps, l_sibling) {
                        if ((l->l_flag & LW_IDLE) != 0)
                                continue;
                        lwp_lock(l);
                        runtm += l->l_rtime.sec;
                        l->l_swtime++;
                        sched_pstats_hook(l);
                        lwp_unlock(l);

                        /*
                         * p_pctcpu is only for ps.
                         */
                        l->l_pctcpu = (l->l_pctcpu * ccpu) >> FSHIFT;
                        if (l->l_slptime < 1) {
                                clkhz = stathz != 0 ? stathz : hz;
#if	(FSHIFT >= CCPU_SHIFT)
                                l->l_pctcpu += (clkhz == 100) ?
                                    ((fixpt_t)l->l_cpticks) <<
                                    (FSHIFT - CCPU_SHIFT) :
                                    100 * (((fixpt_t) p->p_cpticks)
                                    << (FSHIFT - CCPU_SHIFT)) / clkhz;
#else
                                l->l_pctcpu += ((FSCALE - ccpu) *
                                    (l->l_cpticks * FSCALE / clkhz)) >> FSHIFT;
#endif
                                l->l_cpticks = 0;
                        }
                }
                p->p_pctcpu = (p->p_pctcpu * ccpu) >> FSHIFT;
                mutex_spin_exit(&p->p_stmutex);

                /*
                 * Check if the process exceeds its CPU resource allocation.
                 * If over max, kill it.
                 */
                rlim = &p->p_rlimit[RLIMIT_CPU];
                sig = 0;
                if (runtm >= rlim->rlim_cur) {
                        if (runtm >= rlim->rlim_max)
                                sig = SIGKILL;
                        else {
                                sig = SIGXCPU;
                                if (rlim->rlim_cur < rlim->rlim_max)
                                        rlim->rlim_cur += 5;
                        }
                }
                mutex_exit(p->p_lock);
                if (sig)
                        psignal(p, sig);
        }
        mutex_exit(proc_lock);
        uvm_meter();
        cv_wakeup(&lbolt);
        callout_schedule(&sched_pstats_ch, hz);
}