1 /* $NetBSD: kern_synch.c,v 1.234 2008/04/28 21:17:16 ad Exp $ */
2
3 /*-
4 * Copyright (c) 1999, 2000, 2004, 2006, 2007, 2008 The NetBSD Foundation, Inc.
5 * All rights reserved.
6 *
7 * This code is derived from software contributed to The NetBSD Foundation
8 * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
9 * NASA Ames Research Center, by Charles M. Hannum, Andrew Doran and
10 * Daniel Sieger.
11 *
12 * Redistribution and use in source and binary forms, with or without
13 * modification, are permitted provided that the following conditions
14 * are met:
15 * 1. Redistributions of source code must retain the above copyright
16 * notice, this list of conditions and the following disclaimer.
17 * 2. Redistributions in binary form must reproduce the above copyright
18 * notice, this list of conditions and the following disclaimer in the
19 * documentation and/or other materials provided with the distribution.
20 *
21 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
23 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
24 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
25 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
26 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
27 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
28 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
29 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
30 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
31 * POSSIBILITY OF SUCH DAMAGE.
32 */
33
34 /*
35 * Copyright (c) 2007, 2008 Mindaugas Rasiukevicius <rmind at NetBSD org>
36 * All rights reserved.
37 *
38 * Redistribution and use in source and binary forms, with or without
39 * modification, are permitted provided that the following conditions
40 * are met:
41 * 1. Redistributions of source code must retain the above copyright
42 * notice, this list of conditions and the following disclaimer.
43 * 2. Redistributions in binary form must reproduce the above copyright
44 * notice, this list of conditions and the following disclaimer in the
45 * documentation and/or other materials provided with the distribution.
46 *
47 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
48 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
49 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
50 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
51 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
52 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
53 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
54 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
55 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
56 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
57 * SUCH DAMAGE.
58 */
59
60 /*-
61 * Copyright (c) 1982, 1986, 1990, 1991, 1993
62 * The Regents of the University of California. All rights reserved.
63 * (c) UNIX System Laboratories, Inc.
64 * All or some portions of this file are derived from material licensed
65 * to the University of California by American Telephone and Telegraph
66 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
67 * the permission of UNIX System Laboratories, Inc.
68 *
69 * Redistribution and use in source and binary forms, with or without
70 * modification, are permitted provided that the following conditions
71 * are met:
72 * 1. Redistributions of source code must retain the above copyright
73 * notice, this list of conditions and the following disclaimer.
74 * 2. Redistributions in binary form must reproduce the above copyright
75 * notice, this list of conditions and the following disclaimer in the
76 * documentation and/or other materials provided with the distribution.
77 * 3. Neither the name of the University nor the names of its contributors
78 * may be used to endorse or promote products derived from this software
79 * without specific prior written permission.
80 *
81 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
82 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
83 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
84 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
85 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
86 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
87 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
88 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
89 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
90 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
91 * SUCH DAMAGE.
92 *
93 * @(#)kern_synch.c 8.9 (Berkeley) 5/19/95
94 */
95
96 #include <sys/cdefs.h>
97 __KERNEL_RCSID(0, "$NetBSD: kern_synch.c,v 1.234 2008/04/28 21:17:16 ad Exp $");
98
99 #include "opt_kstack.h"
100 #include "opt_lockdebug.h"
101 #include "opt_multiprocessor.h"
102 #include "opt_perfctrs.h"
103
104 #define __MUTEX_PRIVATE
105
106 #include <sys/param.h>
107 #include <sys/systm.h>
108 #include <sys/proc.h>
109 #include <sys/kernel.h>
110 #if defined(PERFCTRS)
111 #include <sys/pmc.h>
112 #endif
113 #include <sys/cpu.h>
114 #include <sys/resourcevar.h>
115 #include <sys/sched.h>
116 #include <sys/syscall_stats.h>
117 #include <sys/sleepq.h>
118 #include <sys/lockdebug.h>
119 #include <sys/evcnt.h>
120 #include <sys/intr.h>
121 #include <sys/lwpctl.h>
122 #include <sys/atomic.h>
123 #include <sys/simplelock.h>
124 #include <sys/bitops.h>
125 #include <sys/kmem.h>
126 #include <sys/sysctl.h>
127 #include <sys/idle.h>
128
129 #include <uvm/uvm_extern.h>
130
131 #include <dev/lockstat.h>
132
133 /*
134 * Priority related definitions.
135 */
136 #define PRI_TS_COUNT (NPRI_USER)
137 #define PRI_RT_COUNT (PRI_COUNT - PRI_TS_COUNT)
138 #define PRI_HTS_RANGE (PRI_TS_COUNT / 10)
139
140 #define PRI_HIGHEST_TS (MAXPRI_USER)
141
142 /*
143 * Bits per map.
144 */
145 #define BITMAP_BITS (32)
146 #define BITMAP_SHIFT (5)
147 #define BITMAP_MSB (0x80000000U)
148 #define BITMAP_MASK (BITMAP_BITS - 1)
149
150 /*
151 * Structures, runqueue.
152 */
153
154 typedef struct {
155 TAILQ_HEAD(, lwp) q_head;
156 } queue_t;
157
158 typedef struct {
159 /* Lock and bitmap */
160 uint32_t r_bitmap[PRI_COUNT >> BITMAP_SHIFT];
161 /* Counters */
162 u_int r_count; /* Count of the threads */
163 u_int r_avgcount; /* Average count of threads */
164 u_int r_mcount; /* Count of migratable threads */
165 /* Runqueues */
166 queue_t r_rt_queue[PRI_RT_COUNT];
167 queue_t r_ts_queue[PRI_TS_COUNT];
168 } runqueue_t;
169
170 static u_int sched_unsleep(struct lwp *, bool);
171 static void sched_changepri(struct lwp *, pri_t);
172 static void sched_lendpri(struct lwp *, pri_t);
173 static void *sched_getrq(runqueue_t *, const pri_t);
174 #ifdef MULTIPROCESSOR
175 static lwp_t *sched_catchlwp(void);
176 static void sched_balance(void *);
177 #endif
178
179 syncobj_t sleep_syncobj = {
180 SOBJ_SLEEPQ_SORTED,
181 sleepq_unsleep,
182 sleepq_changepri,
183 sleepq_lendpri,
184 syncobj_noowner,
185 };
186
187 syncobj_t sched_syncobj = {
188 SOBJ_SLEEPQ_SORTED,
189 sched_unsleep,
190 sched_changepri,
191 sched_lendpri,
192 syncobj_noowner,
193 };
194
195 const int schedppq = 1;
196 callout_t sched_pstats_ch;
197 unsigned sched_pstats_ticks;
198 kcondvar_t lbolt; /* once a second sleep address */
199
200 /*
201 * Preemption control.
202 */
203 int sched_upreempt_pri = PRI_KERNEL;
204 #if 0
205 int sched_kpreempt_pri = PRI_USER_RT;
206 #else
207 /* XXX disable for now until any bugs are worked out. */
208 int sched_kpreempt_pri = 1000;
209 #endif
210
211 static struct evcnt kpreempt_ev_crit;
212 static struct evcnt kpreempt_ev_klock;
213 static struct evcnt kpreempt_ev_ipl;
214 static struct evcnt kpreempt_ev_immed;
215
216 /*
217 * Migration and balancing.
218 */
219 static u_int cacheht_time; /* Cache hotness time */
220 static u_int min_catch; /* Minimal LWP count for catching */
221 static u_int balance_period; /* Balance period */
222 static struct cpu_info *worker_ci; /* Victim CPU */
223 #ifdef MULTIPROCESSOR
224 static struct callout balance_ch; /* Callout of balancer */
225 #endif
226
227 /*
228 * During autoconfiguration or after a panic, a sleep will simply lower the
229 * priority briefly to allow interrupts, then return. The priority to be
230 * used (safepri) is machine-dependent, thus this value is initialized and
231 * maintained in the machine-dependent layers. This priority will typically
232 * be 0, or the lowest priority that is safe for use on the interrupt stack;
233 * it can be made higher to block network software interrupts after panics.
234 */
235 int safepri;
236
237 /*
238 * OBSOLETE INTERFACE
239 *
240 * General sleep call. Suspends the current process until a wakeup is
241 * performed on the specified identifier. The process will then be made
242 * runnable with the specified priority. Sleeps at most timo/hz seconds (0
243 * means no timeout). If pri includes PCATCH flag, signals are checked
244 * before and after sleeping, else signals are not checked. Returns 0 if
245 * awakened, EWOULDBLOCK if the timeout expires. If PCATCH is set and a
246 * signal needs to be delivered, ERESTART is returned if the current system
247 * call should be restarted if possible, and EINTR is returned if the system
248 * call should be interrupted by the signal.
249 *
250 * The interlock is held until we are on a sleep queue. The interlock will
251 * be locked before returning to the caller unless the PNORELOCK flag
252 * is specified, in which case the interlock will always be unlocked upon
253 * return.
254 */
255 int
256 ltsleep(wchan_t ident, pri_t priority, const char *wmesg, int timo,
257 volatile struct simplelock *interlock)
258 {
259 struct lwp *l = curlwp;
260 sleepq_t *sq;
261 int error;
262
263 KASSERT((l->l_pflag & LP_INTR) == 0);
264
265 if (sleepq_dontsleep(l)) {
266 (void)sleepq_abort(NULL, 0);
267 if ((priority & PNORELOCK) != 0)
268 simple_unlock(interlock);
269 return 0;
270 }
271
272 l->l_kpriority = true;
273 sq = sleeptab_lookup(&sleeptab, ident);
274 sleepq_enter(sq, l);
275 sleepq_enqueue(sq, ident, wmesg, &sleep_syncobj);
276
277 if (interlock != NULL) {
278 KASSERT(simple_lock_held(interlock));
279 simple_unlock(interlock);
280 }
281
282 error = sleepq_block(timo, priority & PCATCH);
283
284 if (interlock != NULL && (priority & PNORELOCK) == 0)
285 simple_lock(interlock);
286
287 return error;
288 }
289
290 int
291 mtsleep(wchan_t ident, pri_t priority, const char *wmesg, int timo,
292 kmutex_t *mtx)
293 {
294 struct lwp *l = curlwp;
295 sleepq_t *sq;
296 int error;
297
298 KASSERT((l->l_pflag & LP_INTR) == 0);
299
300 if (sleepq_dontsleep(l)) {
301 (void)sleepq_abort(mtx, (priority & PNORELOCK) != 0);
302 return 0;
303 }
304
305 l->l_kpriority = true;
306 sq = sleeptab_lookup(&sleeptab, ident);
307 sleepq_enter(sq, l);
308 sleepq_enqueue(sq, ident, wmesg, &sleep_syncobj);
309 mutex_exit(mtx);
310 error = sleepq_block(timo, priority & PCATCH);
311
312 if ((priority & PNORELOCK) == 0)
313 mutex_enter(mtx);
314
315 return error;
316 }
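/*
 * Illustrative sketch, not part of the original file: a typical caller
 * pattern for mtsleep()/wakeup() on a condition protected by a kmutex.
 * The identifiers example_lock, example_ready and example_wchan are
 * hypothetical; new code would normally use condition variables
 * (cv_wait()/cv_broadcast()) rather than this obsolete interface.
 */
#if 0
static kmutex_t example_lock;
static bool example_ready;
static int example_wchan;		/* any unique kernel address */

static int
example_wait(void)
{
	int error = 0;

	mutex_enter(&example_lock);
	while (!example_ready && error == 0) {
		/* Drops example_lock while asleep; PCATCH allows signals. */
		error = mtsleep(&example_wchan, PCATCH, "exwait", 0,
		    &example_lock);
	}
	mutex_exit(&example_lock);
	return error;
}

static void
example_post(void)
{

	mutex_enter(&example_lock);
	example_ready = true;
	mutex_exit(&example_lock);
	wakeup(&example_wchan);
}
#endif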
317
318 /*
319 * General sleep call for situations where a wake-up is not expected.
320 */
321 int
322 kpause(const char *wmesg, bool intr, int timo, kmutex_t *mtx)
323 {
324 struct lwp *l = curlwp;
325 sleepq_t *sq;
326 int error;
327
328 if (sleepq_dontsleep(l))
329 return sleepq_abort(NULL, 0);
330
331 if (mtx != NULL)
332 mutex_exit(mtx);
333 l->l_kpriority = true;
334 sq = sleeptab_lookup(&sleeptab, l);
335 sleepq_enter(sq, l);
336 sleepq_enqueue(sq, l, wmesg, &sleep_syncobj);
337 error = sleepq_block(timo, intr);
338 if (mtx != NULL)
339 mutex_enter(mtx);
340
341 return error;
342 }
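/*
 * Illustrative sketch, not part of the original file: kpause() used as a
 * plain timed delay while polling.  example_device_ready() is a
 * hypothetical helper; the ~10ms interval is arbitrary.
 */
#if 0
static bool example_device_ready(void);	/* hypothetical */

static int
example_poll_until_ready(void)
{
	int tries;

	for (tries = 0; tries < 100; tries++) {
		if (example_device_ready())
			return 0;
		/* Sleep ~10ms; no wakeup expected, not interruptible. */
		(void)kpause("expoll", false, mstohz(10), NULL);
	}
	return ETIMEDOUT;
}
#endif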
343
344 /*
345 * OBSOLETE INTERFACE
346 *
347 * Make all processes sleeping on the specified identifier runnable.
348 */
349 void
350 wakeup(wchan_t ident)
351 {
352 sleepq_t *sq;
353
354 if (cold)
355 return;
356
357 sq = sleeptab_lookup(&sleeptab, ident);
358 sleepq_wake(sq, ident, (u_int)-1);
359 }
360
361 /*
362 * OBSOLETE INTERFACE
363 *
364 * Make runnable the highest priority process first in line on the
365 * specified identifier.
366 */
367 void
368 wakeup_one(wchan_t ident)
369 {
370 sleepq_t *sq;
371
372 if (cold)
373 return;
374
375 sq = sleeptab_lookup(&sleeptab, ident);
376 sleepq_wake(sq, ident, 1);
377 }
378
379
380 /*
381 * General yield call. Puts the current process back on its run queue and
382 * performs a voluntary context switch. Should only be called when the
383 * current process explicitly requests it (e.g. sched_yield(2)).
384 */
385 void
386 yield(void)
387 {
388 struct lwp *l = curlwp;
389
390 KERNEL_UNLOCK_ALL(l, &l->l_biglocks);
391 lwp_lock(l);
392 KASSERT(lwp_locked(l, l->l_cpu->ci_schedstate.spc_lwplock));
393 KASSERT(l->l_stat == LSONPROC);
394 l->l_kpriority = false;
395 (void)mi_switch(l);
396 KERNEL_LOCK(l->l_biglocks, l);
397 }
398
399 /*
400 * General preemption call. Puts the current process back on its run queue
401 * and performs an involuntary context switch.
402 */
403 void
404 preempt(void)
405 {
406 struct lwp *l = curlwp;
407
408 KERNEL_UNLOCK_ALL(l, &l->l_biglocks);
409 lwp_lock(l);
410 KASSERT(lwp_locked(l, l->l_cpu->ci_schedstate.spc_lwplock));
411 KASSERT(l->l_stat == LSONPROC);
412 l->l_kpriority = false;
413 l->l_nivcsw++;
414 (void)mi_switch(l);
415 KERNEL_LOCK(l->l_biglocks, l);
416 }
417
418 /*
419 * Handle a request made by another agent to preempt the current LWP
420 * in-kernel. Usually called when l_dopreempt may be non-zero.
421 *
422 * Character addresses for lockstat only.
423 */
424 static char in_critical_section;
425 static char kernel_lock_held;
426 static char spl_raised;
427 static char is_softint;
428
429 bool
430 kpreempt(uintptr_t where)
431 {
432 uintptr_t failed;
433 lwp_t *l;
434 int s, dop;
435
436 l = curlwp;
437 failed = 0;
438 while ((dop = l->l_dopreempt) != 0) {
439 if (l->l_stat != LSONPROC) {
440 /*
441 * About to block (or die), let it happen.
442 * Doesn't really count as "preemption has
443 * been blocked", since we're going to
444 * context switch.
445 */
446 l->l_dopreempt = 0;
447 return true;
448 }
449 if (__predict_false((l->l_flag & LW_IDLE) != 0)) {
450 /* Can't preempt idle loop, don't count as failure. */
451 l->l_dopreempt = 0;
452 return true;
453 }
454 if (__predict_false(l->l_nopreempt != 0)) {
455 /* LWP holds preemption disabled, explicitly. */
456 if ((dop & DOPREEMPT_COUNTED) == 0) {
457 kpreempt_ev_crit.ev_count++;
458 }
459 failed = (uintptr_t)&in_critical_section;
460 break;
461 }
462 if (__predict_false((l->l_pflag & LP_INTR) != 0)) {
463 /* Can't preempt soft interrupts yet. */
464 l->l_dopreempt = 0;
465 failed = (uintptr_t)&is_softint;
466 break;
467 }
468 s = splsched();
469 if (__predict_false(l->l_blcnt != 0 ||
470 curcpu()->ci_biglock_wanted != NULL)) {
471 /* Hold or want kernel_lock, code is not MT safe. */
472 splx(s);
473 if ((dop & DOPREEMPT_COUNTED) == 0) {
474 kpreempt_ev_klock.ev_count++;
475 }
476 failed = (uintptr_t)&kernel_lock_held;
477 break;
478 }
479 if (__predict_false(!cpu_kpreempt_enter(where, s))) {
480 /*
481 * It may be that the IPL is too high.
482 * cpu_kpreempt_enter() can schedule an
483 * interrupt to retry later.
484 */
485 splx(s);
486 if ((dop & DOPREEMPT_COUNTED) == 0) {
487 kpreempt_ev_ipl.ev_count++;
488 }
489 failed = (uintptr_t)&spl_raised;
490 break;
491 }
492 /* Do it! */
493 if (__predict_true((dop & DOPREEMPT_COUNTED) == 0)) {
494 kpreempt_ev_immed.ev_count++;
495 }
496 lwp_lock(l);
497 mi_switch(l);
498 l->l_nopreempt++;
499 splx(s);
500
501 /* Take care of any MD cleanup. */
502 cpu_kpreempt_exit(where);
503 l->l_nopreempt--;
504 }
505
506 /* Record preemption failure for reporting via lockstat. */
507 if (__predict_false(failed)) {
508 atomic_or_uint(&l->l_dopreempt, DOPREEMPT_COUNTED);
509 int lsflag = 0;
510 LOCKSTAT_ENTER(lsflag);
511 /* Might recurse, make it atomic. */
512 if (__predict_false(lsflag)) {
513 if (where == 0) {
514 where = (uintptr_t)__builtin_return_address(0);
515 }
516 if (atomic_cas_ptr_ni((void *)&l->l_pfailaddr,
517 NULL, (void *)where) == NULL) {
518 LOCKSTAT_START_TIMER(lsflag, l->l_pfailtime);
519 l->l_pfaillock = failed;
520 }
521 }
522 LOCKSTAT_EXIT(lsflag);
523 }
524
525 return failed;
526 }
527
528 /*
529 * Return true if preemption is explicitly disabled.
530 */
531 bool
532 kpreempt_disabled(void)
533 {
534 lwp_t *l;
535
536 l = curlwp;
537
538 return l->l_nopreempt != 0 || l->l_stat == LSZOMB ||
539 (l->l_flag & LW_IDLE) != 0 || cpu_kpreempt_disabled();
540 }
541
542 /*
543 * Disable kernel preemption.
544 */
545 void
546 kpreempt_disable(void)
547 {
548
549 KPREEMPT_DISABLE(curlwp);
550 }
551
552 /*
553 * Reenable kernel preemption.
554 */
555 void
556 kpreempt_enable(void)
557 {
558
559 KPREEMPT_ENABLE(curlwp);
560 }
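/*
 * Illustrative sketch, not part of the original file: bracketing a short
 * curcpu()-relative operation with kpreempt_disable()/kpreempt_enable()
 * so the LWP cannot migrate between the two references.  The
 * ci_data.cpu_nswtch counter is used here only as an example of per-CPU
 * state.
 */
#if 0
static uint64_t
example_read_local_nswtch(void)
{
	uint64_t n;

	kpreempt_disable();
	n = curcpu()->ci_data.cpu_nswtch;
	kpreempt_enable();
	return n;
}
#endif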
561
562 /*
563 * Compute the amount of time during which the current lwp was running.
564 *
565 * - update l_rtime unless it's an idle lwp.
566 */
567
568 void
569 updatertime(lwp_t *l, const struct bintime *now)
570 {
571
572 if ((l->l_flag & LW_IDLE) != 0)
573 return;
574
575 /* rtime += now - stime */
576 bintime_add(&l->l_rtime, now);
577 bintime_sub(&l->l_rtime, &l->l_stime);
578 }
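/*
 * Illustrative sketch, not part of the original file: the l_rtime update
 * above written as "rtime += (now - stime)" with an explicit delta, which
 * is arithmetically equivalent to adding 'now' and then subtracting
 * 'stime'.
 */
#if 0
static void
example_accumulate(struct bintime *rtime, struct bintime *stime,
    struct bintime *now)
{
	struct bintime delta = *now;

	bintime_sub(&delta, stime);	/* delta = now - stime */
	bintime_add(rtime, &delta);	/* rtime += delta */
}
#endif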
579
580 /*
581 * The machine independent parts of context switch.
582 *
583 * Returns 1 if another LWP was actually run.
584 */
585 int
586 mi_switch(lwp_t *l)
587 {
588 struct cpu_info *ci, *tci = NULL;
589 struct schedstate_percpu *spc;
590 struct lwp *newl;
591 int retval, oldspl;
592 struct bintime bt;
593 bool returning;
594
595 KASSERT(lwp_locked(l, NULL));
596 KASSERT(kpreempt_disabled());
597 LOCKDEBUG_BARRIER(l->l_mutex, 1);
598
599 #ifdef KSTACK_CHECK_MAGIC
600 kstack_check_magic(l);
601 #endif
602
603 binuptime(&bt);
604
605 KASSERT(l->l_cpu == curcpu());
606 ci = l->l_cpu;
607 spc = &ci->ci_schedstate;
608 returning = false;
609 newl = NULL;
610
611 /*
612 * If we have been asked to switch to a specific LWP, then there
613 * is no need to inspect the run queues. If a soft interrupt is
614 * blocking, then return to the interrupted thread without adjusting
615 * VM context or its start time: neither has been changed in order
616 * to take the interrupt.
617 */
618 if (l->l_switchto != NULL) {
619 if ((l->l_pflag & LP_INTR) != 0) {
620 returning = true;
621 softint_block(l);
622 if ((l->l_flag & LW_TIMEINTR) != 0)
623 updatertime(l, &bt);
624 }
625 newl = l->l_switchto;
626 l->l_switchto = NULL;
627 }
628 #ifndef __HAVE_FAST_SOFTINTS
629 else if (ci->ci_data.cpu_softints != 0) {
630 /* There are pending soft interrupts, so pick one. */
631 newl = softint_picklwp();
632 newl->l_stat = LSONPROC;
633 newl->l_flag |= LW_RUNNING;
634 }
635 #endif /* !__HAVE_FAST_SOFTINTS */
636
637 /* Count time spent in current system call */
638 if (!returning) {
639 SYSCALL_TIME_SLEEP(l);
640
641 /*
642 * XXXSMP If we are using h/w performance counters,
643 * save context.
644 */
645 #if PERFCTRS
646 if (PMC_ENABLED(l->l_proc)) {
647 pmc_save_context(l->l_proc);
648 }
649 #endif
650 updatertime(l, &bt);
651 }
652
653 /*
654 * If on the CPU and we have gotten this far, then we must yield.
655 */
656 KASSERT(l->l_stat != LSRUN);
657 if (l->l_stat == LSONPROC && (l->l_target_cpu || l != newl)) {
658 KASSERT(lwp_locked(l, spc->spc_lwplock));
659
660 if (l->l_target_cpu == l->l_cpu) {
661 l->l_target_cpu = NULL;
662 } else {
663 tci = l->l_target_cpu;
664 }
665
666 if (__predict_false(tci != NULL)) {
667 /* Double-lock the runqueues */
668 spc_dlock(ci, tci);
669 } else {
670 /* Lock the runqueue */
671 spc_lock(ci);
672 }
673
674 if ((l->l_flag & LW_IDLE) == 0) {
675 l->l_stat = LSRUN;
676 if (__predict_false(tci != NULL)) {
677 /*
678 * Set the new CPU, lock and unset the
679 * l_target_cpu - thread will be enqueued
680 * to the runqueue of target CPU.
681 */
682 l->l_cpu = tci;
683 lwp_setlock(l, tci->ci_schedstate.spc_mutex);
684 l->l_target_cpu = NULL;
685 } else {
686 lwp_setlock(l, spc->spc_mutex);
687 }
688 sched_enqueue(l, true);
689 } else {
690 KASSERT(tci == NULL);
691 l->l_stat = LSIDL;
692 }
693 } else {
694 /* Lock the runqueue */
695 spc_lock(ci);
696 }
697
698 /*
699 * Let sched_nextlwp() select the LWP to run on the CPU next.
700 * If no LWP is runnable, select the idle LWP.
701 *
702 * Note that spc_lwplock might not necessarily be held, and
703 * the new thread will be unlocked after its LWP-lock is set.
704 */
705 if (newl == NULL) {
706 newl = sched_nextlwp();
707 if (newl != NULL) {
708 sched_dequeue(newl);
709 KASSERT(lwp_locked(newl, spc->spc_mutex));
710 newl->l_stat = LSONPROC;
711 newl->l_cpu = ci;
712 newl->l_flag |= LW_RUNNING;
713 lwp_setlock(newl, spc->spc_lwplock);
714 } else {
715 newl = ci->ci_data.cpu_idlelwp;
716 newl->l_stat = LSONPROC;
717 newl->l_flag |= LW_RUNNING;
718 }
719 /*
720 * Only clear want_resched if there are no
721 * pending (slow) software interrupts.
722 */
723 ci->ci_want_resched = ci->ci_data.cpu_softints;
724 spc->spc_flags &= ~SPCF_SWITCHCLEAR;
725 spc->spc_curpriority = lwp_eprio(newl);
726 }
727
728 /* Items that must be updated with the CPU locked. */
729 if (!returning) {
730 /* Update the new LWP's start time. */
731 newl->l_stime = bt;
732
733 /*
734 * ci_curlwp changes when a fast soft interrupt occurs.
735 * We use cpu_onproc to keep track of which kernel or
736 * user thread is running 'underneath' the software
737 * interrupt. This is important for time accounting,
738 * itimers and forcing user threads to preempt (aston).
739 */
740 ci->ci_data.cpu_onproc = newl;
741 }
742
743 /* Kernel preemption related tasks. */
744 l->l_dopreempt = 0;
745 if (__predict_false(l->l_pfailaddr != 0)) {
746 LOCKSTAT_FLAG(lsflag);
747 LOCKSTAT_ENTER(lsflag);
748 LOCKSTAT_STOP_TIMER(lsflag, l->l_pfailtime);
749 LOCKSTAT_EVENT_RA(lsflag, l->l_pfaillock, LB_NOPREEMPT|LB_SPIN,
750 1, l->l_pfailtime, l->l_pfailaddr);
751 LOCKSTAT_EXIT(lsflag);
752 l->l_pfailtime = 0;
753 l->l_pfaillock = 0;
754 l->l_pfailaddr = 0;
755 }
756
757 if (l != newl) {
758 struct lwp *prevlwp;
759
760 /* Release all locks, but leave the current LWP locked */
761 if (l->l_mutex == l->l_cpu->ci_schedstate.spc_mutex) {
762 /*
763 * In case of migration, drop the local runqueue
764 * lock; the thread is on the other runqueue now.
765 */
766 if (__predict_false(tci != NULL))
767 spc_unlock(ci);
768 /*
769 * Drop spc_lwplock, if the current LWP has been moved
770 * to the run queue (it is now locked by spc_mutex).
771 */
772 mutex_spin_exit(spc->spc_lwplock);
773 } else {
774 /*
775 * Otherwise, drop the spc_mutex, we are done with the
776 * run queues.
777 */
778 mutex_spin_exit(spc->spc_mutex);
779 KASSERT(tci == NULL);
780 }
781
782 /*
783 * Mark that a context switch is going to be performed
784 * for this LWP, to protect it from being switched
785 * to on another CPU.
786 */
787 KASSERT(l->l_ctxswtch == 0);
788 l->l_ctxswtch = 1;
789 l->l_ncsw++;
790 l->l_flag &= ~LW_RUNNING;
791
792 /*
793 * Increase the count of spin-mutexes before the release
794 * of the last lock - we must remain at IPL_SCHED during
795 * the context switch.
796 */
797 oldspl = MUTEX_SPIN_OLDSPL(ci);
798 ci->ci_mtx_count--;
799 lwp_unlock(l);
800
801 /* Count the context switch on this CPU. */
802 ci->ci_data.cpu_nswtch++;
803
804 /* Update status for lwpctl, if present. */
805 if (l->l_lwpctl != NULL)
806 l->l_lwpctl->lc_curcpu = LWPCTL_CPU_NONE;
807
808 /*
809 * Save old VM context, unless a soft interrupt
810 * handler is blocking.
811 */
812 if (!returning)
813 pmap_deactivate(l);
814
815 /*
816 * We may need to spin-wait if 'newl' is still
817 * context switching on another CPU.
818 */
819 if (newl->l_ctxswtch != 0) {
820 u_int count;
821 count = SPINLOCK_BACKOFF_MIN;
822 while (newl->l_ctxswtch)
823 SPINLOCK_BACKOFF(count);
824 }
825
826 /* Switch to the new LWP. */
827 prevlwp = cpu_switchto(l, newl, returning);
828 ci = curcpu();
829
830 /*
831 * Switched away - we have new curlwp.
832 * Restore VM context and IPL.
833 */
834 pmap_activate(l);
835 if (prevlwp != NULL) {
836 /* Normalize the count of the spin-mutexes */
837 ci->ci_mtx_count++;
838 /* Unmark the state of context switch */
839 membar_exit();
840 prevlwp->l_ctxswtch = 0;
841 }
842
843 /* Update status for lwpctl, if present. */
844 if (l->l_lwpctl != NULL) {
845 l->l_lwpctl->lc_curcpu = (int)cpu_index(ci);
846 l->l_lwpctl->lc_pctr++;
847 }
848
849 KASSERT(l->l_cpu == ci);
850 splx(oldspl);
851 retval = 1;
852 } else {
853 /* Nothing to do - just unlock and return. */
854 KASSERT(tci == NULL);
855 spc_unlock(ci);
856 lwp_unlock(l);
857 retval = 0;
858 }
859
860 KASSERT(l == curlwp);
861 KASSERT(l->l_stat == LSONPROC);
862
863 /*
864 * XXXSMP If we are using h/w performance counters, restore context.
865 * XXXSMP preemption problem.
866 */
867 #if PERFCTRS
868 if (PMC_ENABLED(l->l_proc)) {
869 pmc_restore_context(l->l_proc);
870 }
871 #endif
872 SYSCALL_TIME_WAKEUP(l);
873 LOCKDEBUG_BARRIER(NULL, 1);
874
875 return retval;
876 }
877
878 /*
879 * Change process state to be runnable, placing it on the run queue if it is
880 * in memory, and awakening the swapper if it isn't in memory.
881 *
882 * Call with the process and LWP locked. Will return with the LWP unlocked.
883 */
884 void
885 setrunnable(struct lwp *l)
886 {
887 struct proc *p = l->l_proc;
888 struct cpu_info *ci;
889 sigset_t *ss;
890
891 KASSERT((l->l_flag & LW_IDLE) == 0);
892 KASSERT(mutex_owned(p->p_lock));
893 KASSERT(lwp_locked(l, NULL));
894 KASSERT(l->l_mutex != l->l_cpu->ci_schedstate.spc_mutex);
895
896 switch (l->l_stat) {
897 case LSSTOP:
898 /*
899 * If we're being traced (possibly because someone attached us
900 * while we were stopped), check for a signal from the debugger.
901 */
902 if ((p->p_slflag & PSL_TRACED) != 0 && p->p_xstat != 0) {
903 if ((sigprop[p->p_xstat] & SA_TOLWP) != 0)
904 ss = &l->l_sigpend.sp_set;
905 else
906 ss = &p->p_sigpend.sp_set;
907 sigaddset(ss, p->p_xstat);
908 signotify(l);
909 }
910 p->p_nrlwps++;
911 break;
912 case LSSUSPENDED:
913 l->l_flag &= ~LW_WSUSPEND;
914 p->p_nrlwps++;
915 cv_broadcast(&p->p_lwpcv);
916 break;
917 case LSSLEEP:
918 KASSERT(l->l_wchan != NULL);
919 break;
920 default:
921 panic("setrunnable: lwp %p state was %d", l, l->l_stat);
922 }
923
924 /*
925 * If the LWP was sleeping interruptibly, then it's OK to start it
926 * again. If not, mark it as still sleeping.
927 */
928 if (l->l_wchan != NULL) {
929 l->l_stat = LSSLEEP;
930 /* lwp_unsleep() will release the lock. */
931 lwp_unsleep(l, true);
932 return;
933 }
934
935 /*
936 * If the LWP is still on the CPU, mark it as LSONPROC. It may be
937 * about to call mi_switch(), in which case it will yield.
938 */
939 if ((l->l_flag & LW_RUNNING) != 0) {
940 l->l_stat = LSONPROC;
941 l->l_slptime = 0;
942 lwp_unlock(l);
943 return;
944 }
945
946 /*
947 * Look for a CPU to run.
948 * Set the LWP runnable.
949 */
950 ci = sched_takecpu(l);
951 l->l_cpu = ci;
952 if (l->l_mutex != l->l_cpu->ci_schedstate.spc_mutex) {
953 lwp_unlock_to(l, ci->ci_schedstate.spc_mutex);
954 lwp_lock(l);
955 }
956 sched_setrunnable(l);
957 l->l_stat = LSRUN;
958 l->l_slptime = 0;
959
960 /*
961 * If the thread is swapped out, wake the swapper to bring it back in.
962 * Otherwise, enter it into a run queue.
963 */
964 if (l->l_flag & LW_INMEM) {
965 sched_enqueue(l, false);
966 resched_cpu(l);
967 lwp_unlock(l);
968 } else {
969 lwp_unlock(l);
970 uvm_kick_scheduler();
971 }
972 }
973
974 /*
975 * suspendsched:
976 *
977 * Convert all non-L_SYSTEM LSSLEEP or LSRUN LWPs to LSSUSPENDED.
978 */
979 void
980 suspendsched(void)
981 {
982 CPU_INFO_ITERATOR cii;
983 struct cpu_info *ci;
984 struct lwp *l;
985 struct proc *p;
986
987 /*
988 * We do this by process in order not to violate the locking rules.
989 */
990 mutex_enter(proc_lock);
991 PROCLIST_FOREACH(p, &allproc) {
992 mutex_enter(p->p_lock);
993
994 if ((p->p_flag & PK_SYSTEM) != 0) {
995 mutex_exit(p->p_lock);
996 continue;
997 }
998
999 p->p_stat = SSTOP;
1000
1001 LIST_FOREACH(l, &p->p_lwps, l_sibling) {
1002 if (l == curlwp)
1003 continue;
1004
1005 lwp_lock(l);
1006
1007 /*
1008 * Set LW_WREBOOT so that the LWP will suspend itself
1009 * when it tries to return to user mode. We want to
1010 * get as many LWPs as possible to the user / kernel
1011 * boundary, so that they will release any locks
1012 * that they hold.
1013 */
1014 l->l_flag |= (LW_WREBOOT | LW_WSUSPEND);
1015
1016 if (l->l_stat == LSSLEEP &&
1017 (l->l_flag & LW_SINTR) != 0) {
1018 /* setrunnable() will release the lock. */
1019 setrunnable(l);
1020 continue;
1021 }
1022
1023 lwp_unlock(l);
1024 }
1025
1026 mutex_exit(p->p_lock);
1027 }
1028 mutex_exit(proc_lock);
1029
1030 /*
1031 * Kick all CPUs to make them preempt any LWPs running in user mode.
1032 * They'll trap into the kernel and suspend themselves in userret().
1033 */
1034 for (CPU_INFO_FOREACH(cii, ci)) {
1035 spc_lock(ci);
1036 cpu_need_resched(ci, RESCHED_IMMED);
1037 spc_unlock(ci);
1038 }
1039 }
1040
1041 /*
1042 * sched_unsleep:
1043 *
1044 * This is called when the LWP has not been awoken normally but instead
1045 * interrupted: for example, if the sleep timed out. Because of this,
1046 * it's not a valid action for running or idle LWPs.
1047 */
1048 static u_int
1049 sched_unsleep(struct lwp *l, bool cleanup)
1050 {
1051
1052 lwp_unlock(l);
1053 panic("sched_unsleep");
1054 }
1055
1056 void
1057 resched_cpu(struct lwp *l)
1058 {
1059 struct cpu_info *ci;
1060
1061 /*
1062 * XXXSMP
1063 * Since l->l_cpu persists across a context switch,
1064 * this gives us *very weak* processor affinity, in
1065 * that we notify the CPU on which the process last
1066 * ran that it should try to switch.
1067 *
1068 * This does not guarantee that the process will run on
1069 * that processor next, because another processor might
1070 * grab it the next time it performs a context switch.
1071 *
1072 * This also does not handle the case where its last
1073 * CPU is running a higher-priority process, but every
1074 * other CPU is running a lower-priority process. There
1075 * are ways to handle this situation, but they're not
1076 * currently very pretty, and we also need to weigh the
1077 * cost of moving a process from one CPU to another.
1078 */
1079 ci = l->l_cpu;
1080 if (lwp_eprio(l) > ci->ci_schedstate.spc_curpriority)
1081 cpu_need_resched(ci, 0);
1082 }
1083
1084 static void
1085 sched_changepri(struct lwp *l, pri_t pri)
1086 {
1087
1088 KASSERT(lwp_locked(l, NULL));
1089
1090 if (l->l_stat == LSRUN && (l->l_flag & LW_INMEM) != 0) {
1091 KASSERT(lwp_locked(l, l->l_cpu->ci_schedstate.spc_mutex));
1092 sched_dequeue(l);
1093 l->l_priority = pri;
1094 sched_enqueue(l, false);
1095 } else {
1096 l->l_priority = pri;
1097 }
1098 resched_cpu(l);
1099 }
1100
1101 static void
1102 sched_lendpri(struct lwp *l, pri_t pri)
1103 {
1104
1105 KASSERT(lwp_locked(l, NULL));
1106
1107 if (l->l_stat == LSRUN && (l->l_flag & LW_INMEM) != 0) {
1108 KASSERT(lwp_locked(l, l->l_cpu->ci_schedstate.spc_mutex));
1109 sched_dequeue(l);
1110 l->l_inheritedprio = pri;
1111 sched_enqueue(l, false);
1112 } else {
1113 l->l_inheritedprio = pri;
1114 }
1115 resched_cpu(l);
1116 }
1117
1118 struct lwp *
1119 syncobj_noowner(wchan_t wchan)
1120 {
1121
1122 return NULL;
1123 }
1124
1125 /* decay 95% of `p_pctcpu' in 60 seconds; see CCPU_SHIFT before changing */
1126 fixpt_t ccpu = 0.95122942450071400909 * FSCALE; /* exp(-1/20) */
1127
1128 /*
1129 * If `ccpu' is not equal to `exp(-1/20)' and you still want to use the
1130 * faster/more-accurate formula, you'll have to estimate CCPU_SHIFT below
1131 * and possibly adjust FSHIFT in "param.h" so that (FSHIFT >= CCPU_SHIFT).
1132 *
1133 * To estimate CCPU_SHIFT for exp(-1/20), the following formula was used:
1134 * 1 - exp(-1/20) ~= 0.0487 ~= 0.0488 == 1 (fixed pt, *11* bits).
1135 *
1136 * If you don't want to bother with the faster/more-accurate formula, you
1137 * can set CCPU_SHIFT to (FSHIFT + 1) which will use a slower/less-accurate
1138 * (more general) method of calculating the percentage of CPU used by a process.
1139 */
1140 #define CCPU_SHIFT (FSHIFT + 1)
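/*
 * Illustrative sketch, not part of the original file: the decay step in
 * isolation.  With FSHIFT = 11 (FSCALE = 2048), ccpu ~= 0.95122 * 2048
 * ~= 1948, so one sched_pstats() pass turns a pctcpu of 2048 (100%) into
 * (2048 * 1948) >> 11 = 1948, about 95.1%; after 60 one-second passes
 * roughly exp(-3) ~= 5% of the original value remains.
 */
#if 0
static fixpt_t
example_decay_once(fixpt_t pctcpu)
{

	return (pctcpu * ccpu) >> FSHIFT;
}
#endif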
1141
1142 /*
1143 * sched_pstats:
1144 *
1145 * Update process statistics and check CPU resource allocation.
1146 * Call scheduler-specific hook to eventually adjust process/LWP
1147 * priorities.
1148 */
1149 /* ARGSUSED */
1150 void
1151 sched_pstats(void *arg)
1152 {
1153 struct rlimit *rlim;
1154 struct lwp *l;
1155 struct proc *p;
1156 int sig, clkhz;
1157 long runtm;
1158
1159 sched_pstats_ticks++;
1160
1161 mutex_enter(proc_lock);
1162 PROCLIST_FOREACH(p, &allproc) {
1163 /*
1164 * Increment time in/out of memory and sleep time (if
1165 * sleeping). We ignore overflow; with 16-bit int's
1166 * (remember them?) overflow takes 45 days.
1167 */
1168 mutex_enter(p->p_lock);
1169 mutex_spin_enter(&p->p_stmutex);
1170 runtm = p->p_rtime.sec;
1171 LIST_FOREACH(l, &p->p_lwps, l_sibling) {
1172 if ((l->l_flag & LW_IDLE) != 0)
1173 continue;
1174 lwp_lock(l);
1175 runtm += l->l_rtime.sec;
1176 l->l_swtime++;
1177 sched_pstats_hook(l);
1178 lwp_unlock(l);
1179
1180 /*
1181 * p_pctcpu is only for ps.
1182 */
1183 l->l_pctcpu = (l->l_pctcpu * ccpu) >> FSHIFT;
1184 if (l->l_slptime < 1) {
1185 clkhz = stathz != 0 ? stathz : hz;
1186 #if (FSHIFT >= CCPU_SHIFT)
1187 l->l_pctcpu += (clkhz == 100) ?
1188 ((fixpt_t)l->l_cpticks) <<
1189 (FSHIFT - CCPU_SHIFT) :
1190 100 * (((fixpt_t) p->p_cpticks)
1191 << (FSHIFT - CCPU_SHIFT)) / clkhz;
1192 #else
1193 l->l_pctcpu += ((FSCALE - ccpu) *
1194 (l->l_cpticks * FSCALE / clkhz)) >> FSHIFT;
1195 #endif
1196 l->l_cpticks = 0;
1197 }
1198 }
1199 p->p_pctcpu = (p->p_pctcpu * ccpu) >> FSHIFT;
1200 mutex_spin_exit(&p->p_stmutex);
1201
1202 /*
1203 * Check if the process exceeds its CPU resource allocation.
1204 * If over max, kill it.
1205 */
1206 rlim = &p->p_rlimit[RLIMIT_CPU];
1207 sig = 0;
1208 if (runtm >= rlim->rlim_cur) {
1209 if (runtm >= rlim->rlim_max)
1210 sig = SIGKILL;
1211 else {
1212 sig = SIGXCPU;
1213 if (rlim->rlim_cur < rlim->rlim_max)
1214 rlim->rlim_cur += 5;
1215 }
1216 }
1217 mutex_exit(p->p_lock);
1218 if (sig)
1219 psignal(p, sig);
1220 }
1221 mutex_exit(proc_lock);
1222 uvm_meter();
1223 cv_wakeup(&lbolt);
1224 callout_schedule(&sched_pstats_ch, hz);
1225 }
1226
1227 void
1228 sched_init(void)
1229 {
1230
1231 cv_init(&lbolt, "lbolt");
1232 callout_init(&sched_pstats_ch, CALLOUT_MPSAFE);
1233 callout_setfunc(&sched_pstats_ch, sched_pstats, NULL);
1234
1235 /* Balancing */
1236 worker_ci = curcpu();
1237 cacheht_time = mstohz(5); /* ~5 ms */
1238 balance_period = mstohz(300); /* ~300ms */
1239
1240 /* Minimal count of LWPs for catching: log2(count of CPUs), at most 4 */
1241 min_catch = min(ilog2(ncpu), 4);
1242
1243 evcnt_attach_dynamic(&kpreempt_ev_crit, EVCNT_TYPE_INTR, NULL,
1244 "kpreempt", "defer: critical section");
1245 evcnt_attach_dynamic(&kpreempt_ev_klock, EVCNT_TYPE_INTR, NULL,
1246 "kpreempt", "defer: kernel_lock");
1247 evcnt_attach_dynamic(&kpreempt_ev_ipl, EVCNT_TYPE_INTR, NULL,
1248 "kpreempt", "defer: IPL");
1249 evcnt_attach_dynamic(&kpreempt_ev_immed, EVCNT_TYPE_INTR, NULL,
1250 "kpreempt", "immediate");
1251
1252 /* Initialize balancing callout and run it */
1253 #ifdef MULTIPROCESSOR
1254 callout_init(&balance_ch, CALLOUT_MPSAFE);
1255 callout_setfunc(&balance_ch, sched_balance, NULL);
1256 callout_schedule(&balance_ch, balance_period);
1257 #endif
1258 sched_pstats(NULL);
1259 }
1260
1261 SYSCTL_SETUP(sysctl_sched_setup, "sysctl sched setup")
1262 {
1263 const struct sysctlnode *node = NULL;
1264
1265 sysctl_createv(clog, 0, NULL, NULL,
1266 CTLFLAG_PERMANENT,
1267 CTLTYPE_NODE, "kern", NULL,
1268 NULL, 0, NULL, 0,
1269 CTL_KERN, CTL_EOL);
1270 sysctl_createv(clog, 0, NULL, &node,
1271 CTLFLAG_PERMANENT,
1272 CTLTYPE_NODE, "sched",
1273 SYSCTL_DESCR("Scheduler options"),
1274 NULL, 0, NULL, 0,
1275 CTL_KERN, CTL_CREATE, CTL_EOL);
1276
1277 if (node == NULL)
1278 return;
1279
1280 sysctl_createv(clog, 0, &node, NULL,
1281 CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
1282 CTLTYPE_INT, "cacheht_time",
1283 SYSCTL_DESCR("Cache hotness time (in ticks)"),
1284 NULL, 0, &cacheht_time, 0,
1285 CTL_CREATE, CTL_EOL);
1286 sysctl_createv(clog, 0, &node, NULL,
1287 CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
1288 CTLTYPE_INT, "balance_period",
1289 SYSCTL_DESCR("Balance period (in ticks)"),
1290 NULL, 0, &balance_period, 0,
1291 CTL_CREATE, CTL_EOL);
1292 sysctl_createv(clog, 0, &node, NULL,
1293 CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
1294 CTLTYPE_INT, "min_catch",
1295 SYSCTL_DESCR("Minimal count of threads for catching"),
1296 NULL, 0, &min_catch, 0,
1297 CTL_CREATE, CTL_EOL);
1298 sysctl_createv(clog, 0, &node, NULL,
1299 CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
1300 CTLTYPE_INT, "timesoftints",
1301 SYSCTL_DESCR("Track CPU time for soft interrupts"),
1302 NULL, 0, &softint_timing, 0,
1303 CTL_CREATE, CTL_EOL);
1304 sysctl_createv(clog, 0, &node, NULL,
1305 CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
1306 CTLTYPE_INT, "kpreempt_pri",
1307 SYSCTL_DESCR("Minimum priority to trigger kernel preemption"),
1308 NULL, 0, &sched_kpreempt_pri, 0,
1309 CTL_CREATE, CTL_EOL);
1310 sysctl_createv(clog, 0, &node, NULL,
1311 CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
1312 CTLTYPE_INT, "upreempt_pri",
1313 SYSCTL_DESCR("Minimum priority to trigger user preemption"),
1314 NULL, 0, &sched_upreempt_pri, 0,
1315 CTL_CREATE, CTL_EOL);
1316 }
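/*
 * Illustrative sketch, not part of the original file: reading one of the
 * nodes created above from userland with sysctlbyname(3).  Whether
 * kern.sched.* exists depends on the scheduler compiled into the kernel.
 */
#if 0
#include <sys/sysctl.h>
#include <stdio.h>

int
main(void)
{
	int period;
	size_t len = sizeof(period);

	if (sysctlbyname("kern.sched.balance_period", &period, &len,
	    NULL, 0) == -1) {
		perror("sysctlbyname");
		return 1;
	}
	printf("balance period: %d ticks\n", period);
	return 0;
}
#endif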
1317
1318 void
1319 sched_cpuattach(struct cpu_info *ci)
1320 {
1321 runqueue_t *ci_rq;
1322 void *rq_ptr;
1323 u_int i, size;
1324
1325 if (ci->ci_schedstate.spc_lwplock == NULL) {
1326 ci->ci_schedstate.spc_lwplock =
1327 mutex_obj_alloc(MUTEX_DEFAULT, IPL_SCHED);
1328 }
1329 if (ci == lwp0.l_cpu) {
1330 /* Initialize the scheduler structure of the primary LWP */
1331 lwp0.l_mutex = ci->ci_schedstate.spc_lwplock;
1332 }
1333 if (ci->ci_schedstate.spc_mutex != NULL) {
1334 /* Already initialized. */
1335 return;
1336 }
1337
1338 /* Allocate the run queue */
1339 size = roundup2(sizeof(runqueue_t), coherency_unit) + coherency_unit;
1340 rq_ptr = kmem_zalloc(size, KM_SLEEP);
1341 if (rq_ptr == NULL) {
1342 panic("sched_cpuattach: could not allocate the runqueue");
1343 }
1344 ci_rq = (void *)(roundup2((uintptr_t)(rq_ptr), coherency_unit));
1345
1346 /* Initialize run queues */
1347 ci->ci_schedstate.spc_mutex =
1348 mutex_obj_alloc(MUTEX_DEFAULT, IPL_SCHED);
1349 for (i = 0; i < PRI_RT_COUNT; i++)
1350 TAILQ_INIT(&ci_rq->r_rt_queue[i].q_head);
1351 for (i = 0; i < PRI_TS_COUNT; i++)
1352 TAILQ_INIT(&ci_rq->r_ts_queue[i].q_head);
1353
1354 ci->ci_schedstate.spc_sched_info = ci_rq;
1355 }
1356
1357 /*
1358 * Control of the runqueue.
1359 */
1360
1361 static void *
1362 sched_getrq(runqueue_t *ci_rq, const pri_t prio)
1363 {
1364
1365 KASSERT(prio < PRI_COUNT);
1366 return (prio <= PRI_HIGHEST_TS) ?
1367 &ci_rq->r_ts_queue[prio].q_head :
1368 &ci_rq->r_rt_queue[prio - PRI_HIGHEST_TS - 1].q_head;
1369 }
1370
1371 void
1372 sched_enqueue(struct lwp *l, bool swtch)
1373 {
1374 runqueue_t *ci_rq;
1375 struct schedstate_percpu *spc;
1376 TAILQ_HEAD(, lwp) *q_head;
1377 const pri_t eprio = lwp_eprio(l);
1378 struct cpu_info *ci;
1379 int type;
1380
1381 ci = l->l_cpu;
1382 spc = &ci->ci_schedstate;
1383 ci_rq = spc->spc_sched_info;
1384 KASSERT(lwp_locked(l, l->l_cpu->ci_schedstate.spc_mutex));
1385
1386 /* Update the last run time on switch */
1387 if (__predict_true(swtch == true)) {
1388 l->l_rticksum += (hardclock_ticks - l->l_rticks);
1389 l->l_rticks = hardclock_ticks;
1390 } else if (l->l_rticks == 0)
1391 l->l_rticks = hardclock_ticks;
1392
1393 /* Enqueue the thread */
1394 q_head = sched_getrq(ci_rq, eprio);
1395 if (TAILQ_EMPTY(q_head)) {
1396 u_int i;
1397 uint32_t q;
1398
1399 /* Mark bit */
1400 i = eprio >> BITMAP_SHIFT;
1401 q = BITMAP_MSB >> (eprio & BITMAP_MASK);
1402 KASSERT((ci_rq->r_bitmap[i] & q) == 0);
1403 ci_rq->r_bitmap[i] |= q;
1404 }
1405 TAILQ_INSERT_TAIL(q_head, l, l_runq);
1406 ci_rq->r_count++;
1407 if ((l->l_pflag & LP_BOUND) == 0)
1408 ci_rq->r_mcount++;
1409
1410 /*
1411 * Update the value of highest priority in the runqueue,
1412 * if the priority of this thread is higher.
1413 */
1414 if (eprio > spc->spc_maxpriority)
1415 spc->spc_maxpriority = eprio;
1416
1417 sched_newts(l);
1418
1419 /*
1420 * Wake the chosen CPU or cause a preemption if the newly
1421 * enqueued thread has higher priority. Don't cause a
1422 * preemption if the thread is yielding (swtch).
1423 */
1424 if (!swtch && eprio > spc->spc_curpriority) {
1425 if (eprio >= sched_kpreempt_pri)
1426 type = RESCHED_KPREEMPT;
1427 else if (eprio >= sched_upreempt_pri)
1428 type = RESCHED_IMMED;
1429 else
1430 type = 0;
1431 cpu_need_resched(ci, type);
1432 }
1433 }
1434
1435 void
1436 sched_dequeue(struct lwp *l)
1437 {
1438 runqueue_t *ci_rq;
1439 TAILQ_HEAD(, lwp) *q_head;
1440 struct schedstate_percpu *spc;
1441 const pri_t eprio = lwp_eprio(l);
1442
1443 spc = &l->l_cpu->ci_schedstate;
1444 ci_rq = spc->spc_sched_info;
1445 KASSERT(lwp_locked(l, spc->spc_mutex));
1446
1447 KASSERT(eprio <= spc->spc_maxpriority);
1448 KASSERT(ci_rq->r_bitmap[eprio >> BITMAP_SHIFT] != 0);
1449 KASSERT(ci_rq->r_count > 0);
1450
1451 ci_rq->r_count--;
1452 if ((l->l_pflag & LP_BOUND) == 0)
1453 ci_rq->r_mcount--;
1454
1455 q_head = sched_getrq(ci_rq, eprio);
1456 TAILQ_REMOVE(q_head, l, l_runq);
1457 if (TAILQ_EMPTY(q_head)) {
1458 u_int i;
1459 uint32_t q;
1460
1461 /* Unmark bit */
1462 i = eprio >> BITMAP_SHIFT;
1463 q = BITMAP_MSB >> (eprio & BITMAP_MASK);
1464 KASSERT((ci_rq->r_bitmap[i] & q) != 0);
1465 ci_rq->r_bitmap[i] &= ~q;
1466
1467 /*
1468 * Update the value of highest priority in the runqueue, in
1469 * case it was the last thread in the queue of highest priority.
1470 */
1471 if (eprio != spc->spc_maxpriority)
1472 return;
1473
1474 do {
1475 if (ci_rq->r_bitmap[i] != 0) {
1476 q = ffs(ci_rq->r_bitmap[i]);
1477 spc->spc_maxpriority =
1478 (i << BITMAP_SHIFT) + (BITMAP_BITS - q);
1479 return;
1480 }
1481 } while (i--);
1482
1483 /* If not found - set the lowest value */
1484 spc->spc_maxpriority = 0;
1485 }
1486 }
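/*
 * Illustrative sketch, not part of the original file: how the priority
 * bitmap is searched.  Priority 'prio' is represented by bit
 * BITMAP_MSB >> (prio & BITMAP_MASK) in word prio >> BITMAP_SHIFT, so the
 * least significant set bit of a word corresponds to the highest priority
 * stored in it, which is what the ffs()-based scan above relies on.
 */
#if 0
static pri_t
example_highest_pri(const uint32_t *bitmap)
{
	int i;

	for (i = (PRI_COUNT >> BITMAP_SHIFT) - 1; i >= 0; i--) {
		if (bitmap[i] != 0) {
			const u_int q = ffs(bitmap[i]);
			return (i << BITMAP_SHIFT) + (BITMAP_BITS - q);
		}
	}
	return 0;	/* empty bitmap: lowest priority */
}
#endif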
1487
1488 /*
1489 * Migration and balancing.
1490 */
1491
1492 #ifdef MULTIPROCESSOR
1493
1494 /* Estimate if LWP is cache-hot */
1495 static inline bool
1496 lwp_cache_hot(const struct lwp *l)
1497 {
1498
1499 if (l->l_slptime || l->l_rticks == 0)
1500 return false;
1501
1502 return (hardclock_ticks - l->l_rticks <= cacheht_time);
1503 }
1504
1505 /* Check if LWP can migrate to the chosen CPU */
1506 static inline bool
1507 sched_migratable(const struct lwp *l, struct cpu_info *ci)
1508 {
1509 const struct schedstate_percpu *spc = &ci->ci_schedstate;
1510
1511 /* CPU is offline */
1512 if (__predict_false(spc->spc_flags & SPCF_OFFLINE))
1513 return false;
1514
1515 /* Affinity bind */
1516 if (__predict_false(l->l_flag & LW_AFFINITY))
1517 return CPU_ISSET(cpu_index(ci), &l->l_affinity);
1518
1519 /* Processor-set */
1520 return (spc->spc_psid == l->l_psid);
1521 }
1522
1523 /*
1524 * Estimate whether the LWP should migrate to another CPU.
1525 * Choose and return the CPU, if migration is needed.
1526 */
1527 struct cpu_info *
1528 sched_takecpu(struct lwp *l)
1529 {
1530 struct cpu_info *ci, *tci, *first, *next;
1531 struct schedstate_percpu *spc;
1532 runqueue_t *ci_rq, *ici_rq;
1533 pri_t eprio, lpri, pri;
1534
1535 KASSERT(lwp_locked(l, NULL));
1536
1537 ci = l->l_cpu;
1538 spc = &ci->ci_schedstate;
1539 ci_rq = spc->spc_sched_info;
1540
1541 /* If thread is strictly bound, do not estimate other CPUs */
1542 if (l->l_pflag & LP_BOUND)
1543 return ci;
1544
1545 /* CPU of this thread is idling - run there */
1546 if (ci_rq->r_count == 0)
1547 return ci;
1548
1549 eprio = lwp_eprio(l);
1550
1551 /* Stay if thread is cache-hot */
1552 if (__predict_true(l->l_stat != LSIDL) &&
1553 lwp_cache_hot(l) && eprio >= spc->spc_curpriority)
1554 return ci;
1555
1556 /* Run on current CPU if priority of thread is higher */
1557 ci = curcpu();
1558 spc = &ci->ci_schedstate;
1559 if (eprio > spc->spc_curpriority && sched_migratable(l, ci))
1560 return ci;
1561
1562 /*
1563 * Look for the CPU with the lowest priority thread. In case of
1564 * equal priority, choose the CPU with the fewest threads.
1565 */
1566 first = l->l_cpu;
1567 ci = first;
1568 tci = first;
1569 lpri = PRI_COUNT;
1570 do {
1571 next = CIRCLEQ_LOOP_NEXT(&cpu_queue, ci, ci_data.cpu_qchain);
1572 spc = &ci->ci_schedstate;
1573 ici_rq = spc->spc_sched_info;
1574 pri = max(spc->spc_curpriority, spc->spc_maxpriority);
1575 if (pri > lpri)
1576 continue;
1577
1578 if (pri == lpri && ci_rq->r_count < ici_rq->r_count)
1579 continue;
1580
1581 if (!sched_migratable(l, ci))
1582 continue;
1583
1584 lpri = pri;
1585 tci = ci;
1586 ci_rq = ici_rq;
1587 } while (ci = next, ci != first);
1588
1589 return tci;
1590 }
1591
1592 /*
1593 * Try to catch an LWP from the runqueue of another CPU.
1594 */
1595 static struct lwp *
1596 sched_catchlwp(void)
1597 {
1598 struct cpu_info *curci = curcpu(), *ci = worker_ci;
1599 struct schedstate_percpu *spc;
1600 TAILQ_HEAD(, lwp) *q_head;
1601 runqueue_t *ci_rq;
1602 struct lwp *l;
1603
1604 if (curci == ci)
1605 return NULL;
1606
1607 /* Lockless check */
1608 spc = &ci->ci_schedstate;
1609 ci_rq = spc->spc_sched_info;
1610 if (ci_rq->r_mcount < min_catch)
1611 return NULL;
1612
1613 /*
1614 * Double-lock the runqueues.
1615 */
1616 if (curci < ci) {
1617 spc_lock(ci);
1618 } else if (!mutex_tryenter(ci->ci_schedstate.spc_mutex)) {
1619 const runqueue_t *cur_rq = curci->ci_schedstate.spc_sched_info;
1620
1621 spc_unlock(curci);
1622 spc_lock(ci);
1623 spc_lock(curci);
1624
1625 if (cur_rq->r_count) {
1626 spc_unlock(ci);
1627 return NULL;
1628 }
1629 }
1630
1631 if (ci_rq->r_mcount < min_catch) {
1632 spc_unlock(ci);
1633 return NULL;
1634 }
1635
1636 /* Take the highest priority thread */
1637 q_head = sched_getrq(ci_rq, spc->spc_maxpriority);
1638 l = TAILQ_FIRST(q_head);
1639
1640 for (;;) {
1641 /* Check the first and next result from the queue */
1642 if (l == NULL)
1643 break;
1644 KASSERT(l->l_stat == LSRUN);
1645 KASSERT(l->l_flag & LW_INMEM);
1646
1647 /* Look for threads that are allowed to migrate */
1648 if ((l->l_pflag & LP_BOUND) || lwp_cache_hot(l) ||
1649 !sched_migratable(l, curci)) {
1650 l = TAILQ_NEXT(l, l_runq);
1651 continue;
1652 }
1653
1654 /* Grab the thread, and move to the local run queue */
1655 sched_dequeue(l);
1656 l->l_cpu = curci;
1657 lwp_unlock_to(l, curci->ci_schedstate.spc_mutex);
1658 sched_enqueue(l, false);
1659 return l;
1660 }
1661 spc_unlock(ci);
1662
1663 return l;
1664 }
1665
1666 /*
1667 * Periodic calculations for balancing.
1668 */
1669 static void
1670 sched_balance(void *nocallout)
1671 {
1672 struct cpu_info *ci, *hci;
1673 runqueue_t *ci_rq;
1674 CPU_INFO_ITERATOR cii;
1675 u_int highest;
1676
1677 hci = curcpu();
1678 highest = 0;
1679
1680 /* Do the counting locklessly */
1681 for (CPU_INFO_FOREACH(cii, ci)) {
1682 ci_rq = ci->ci_schedstate.spc_sched_info;
1683
1684 /* Average count of the threads */
1685 ci_rq->r_avgcount = (ci_rq->r_avgcount + ci_rq->r_mcount) >> 1;
1686
1687 /* Look for CPU with the highest average */
1688 if (ci_rq->r_avgcount > highest) {
1689 hci = ci;
1690 highest = ci_rq->r_avgcount;
1691 }
1692 }
1693
1694 /* Update the worker */
1695 worker_ci = hci;
1696
1697 if (nocallout == NULL)
1698 callout_schedule(&balance_ch, balance_period);
1699 }
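/*
 * Illustrative sketch, not part of the original file: the r_avgcount
 * update above is an exponential moving average with weight 1/2, i.e.
 * avg' = (avg + sample) / 2, so each balance_period halves the influence
 * of older samples.
 */
#if 0
static u_int
example_average_half(u_int avg, u_int sample)
{

	return (avg + sample) >> 1;
}
#endif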
1700
1701 #else
1702
1703 struct cpu_info *
1704 sched_takecpu(struct lwp *l)
1705 {
1706
1707 return l->l_cpu;
1708 }
1709
1710 #endif /* MULTIPROCESSOR */
1711
1712 /*
1713 * Scheduler mill.
1714 */
1715 struct lwp *
1716 sched_nextlwp(void)
1717 {
1718 struct cpu_info *ci = curcpu();
1719 struct schedstate_percpu *spc;
1720 TAILQ_HEAD(, lwp) *q_head;
1721 runqueue_t *ci_rq;
1722 struct lwp *l;
1723
1724 spc = &ci->ci_schedstate;
1725 ci_rq = spc->spc_sched_info;
1726
1727 #ifdef MULTIPROCESSOR
1728 /* If the runqueue is empty, try to catch a thread from another CPU */
1729 if (__predict_false(spc->spc_flags & SPCF_OFFLINE)) {
1730 if ((ci_rq->r_count - ci_rq->r_mcount) == 0)
1731 return NULL;
1732 } else if (ci_rq->r_count == 0) {
1733 /* Reset the counter, and call the balancer */
1734 ci_rq->r_avgcount = 0;
1735 sched_balance(ci);
1736
1737 /* The re-locking will be done inside */
1738 return sched_catchlwp();
1739 }
1740 #else
1741 if (ci_rq->r_count == 0)
1742 return NULL;
1743 #endif
1744
1745 /* Take the highest priority thread */
1746 KASSERT(ci_rq->r_bitmap[spc->spc_maxpriority >> BITMAP_SHIFT]);
1747 q_head = sched_getrq(ci_rq, spc->spc_maxpriority);
1748 l = TAILQ_FIRST(q_head);
1749 KASSERT(l != NULL);
1750
1751 sched_oncpu(l);
1752 l->l_rticks = hardclock_ticks;
1753
1754 return l;
1755 }
1756
1757 bool
1758 sched_curcpu_runnable_p(void)
1759 {
1760 const struct cpu_info *ci;
1761 const runqueue_t *ci_rq;
1762 bool rv;
1763
1764 kpreempt_disable();
1765 ci = curcpu();
1766 ci_rq = ci->ci_schedstate.spc_sched_info;
1767
1768 #ifndef __HAVE_FAST_SOFTINTS
1769 if (ci->ci_data.cpu_softints) {
1770 kpreempt_enable();
1771 return true;
1772 }
1773 #endif
1774
1775 if (ci->ci_schedstate.spc_flags & SPCF_OFFLINE)
1776 rv = (ci_rq->r_count - ci_rq->r_mcount);
1777 else
1778 rv = ci_rq->r_count != 0;
1779 kpreempt_enable();
1780
1781 return rv;
1782 }
1783
1784 /*
1785 * Debugging.
1786 */
1787
1788 #ifdef DDB
1789
1790 void
1791 sched_print_runqueue(void (*pr)(const char *, ...)
1792 __attribute__((__format__(__printf__,1,2))))
1793 {
1794 runqueue_t *ci_rq;
1795 struct schedstate_percpu *spc;
1796 struct lwp *l;
1797 struct proc *p;
1798 int i;
1799 struct cpu_info *ci;
1800 CPU_INFO_ITERATOR cii;
1801
1802 for (CPU_INFO_FOREACH(cii, ci)) {
1803 spc = &ci->ci_schedstate;
1804 ci_rq = spc->spc_sched_info;
1805
1806 (*pr)("Run-queue (CPU = %u):\n", ci->ci_index);
1807 (*pr)(" pid.lid = %d.%d, threads count = %u, "
1808 "avgcount = %u, highest pri = %d\n",
1809 #ifdef MULTIPROCESSOR
1810 ci->ci_curlwp->l_proc->p_pid, ci->ci_curlwp->l_lid,
1811 #else
1812 curlwp->l_proc->p_pid, curlwp->l_lid,
1813 #endif
1814 ci_rq->r_count, ci_rq->r_avgcount, spc->spc_maxpriority);
1815 i = (PRI_COUNT >> BITMAP_SHIFT) - 1;
1816 do {
1817 uint32_t q;
1818 q = ci_rq->r_bitmap[i];
1819 (*pr)(" bitmap[%d] => [ %d (0x%x) ]\n", i, ffs(q), q);
1820 } while (i--);
1821 }
1822
1823 (*pr)(" %5s %4s %4s %10s %3s %18s %4s %s\n",
1824 "LID", "PRI", "EPRI", "FL", "ST", "LWP", "CPU", "LRTIME");
1825
1826 PROCLIST_FOREACH(p, &allproc) {
1827 (*pr)(" /- %d (%s)\n", (int)p->p_pid, p->p_comm);
1828 LIST_FOREACH(l, &p->p_lwps, l_sibling) {
1829 ci = l->l_cpu;
1830 (*pr)(" | %5d %4u %4u 0x%8.8x %3s %18p %4u %u\n",
1831 (int)l->l_lid, l->l_priority, lwp_eprio(l),
1832 l->l_flag, l->l_stat == LSRUN ? "RQ" :
1833 (l->l_stat == LSSLEEP ? "SQ" : "-"),
1834 l, ci->ci_index,
1835 (u_int)(hardclock_ticks - l->l_rticks));
1836 }
1837 }
1838 }
1839
1840 #endif
1841