    1	/*	$NetBSD: kern_synch.c,v 1.233 2008/04/28 20:24:03 martin Exp $	*/
2
3 /*-
4 * Copyright (c) 1999, 2000, 2004, 2006, 2007, 2008 The NetBSD Foundation, Inc.
5 * All rights reserved.
6 *
7 * This code is derived from software contributed to The NetBSD Foundation
8 * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
9 * NASA Ames Research Center, by Charles M. Hannum, Andrew Doran and
10 * Daniel Sieger.
11 *
12 * Redistribution and use in source and binary forms, with or without
13 * modification, are permitted provided that the following conditions
14 * are met:
15 * 1. Redistributions of source code must retain the above copyright
16 * notice, this list of conditions and the following disclaimer.
17 * 2. Redistributions in binary form must reproduce the above copyright
18 * notice, this list of conditions and the following disclaimer in the
19 * documentation and/or other materials provided with the distribution.
20 *
21 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
23 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
24 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
25 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
26 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
27 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
28 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
29 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
30 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
31 * POSSIBILITY OF SUCH DAMAGE.
32 */
33
34 /*
35 * Copyright (c) 2007, 2008 Mindaugas Rasiukevicius <rmind at NetBSD org>
36 * All rights reserved.
37 *
38 * Redistribution and use in source and binary forms, with or without
39 * modification, are permitted provided that the following conditions
40 * are met:
41 * 1. Redistributions of source code must retain the above copyright
42 * notice, this list of conditions and the following disclaimer.
43 * 2. Redistributions in binary form must reproduce the above copyright
44 * notice, this list of conditions and the following disclaimer in the
45 * documentation and/or other materials provided with the distribution.
46 *
47 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
48 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
49 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
50 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
51 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
52 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
53 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
54 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
55 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
56 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
57 * SUCH DAMAGE.
58 */
59
60 /*-
61 * Copyright (c) 1982, 1986, 1990, 1991, 1993
62 * The Regents of the University of California. All rights reserved.
63 * (c) UNIX System Laboratories, Inc.
64 * All or some portions of this file are derived from material licensed
65 * to the University of California by American Telephone and Telegraph
66 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
67 * the permission of UNIX System Laboratories, Inc.
68 *
69 * Redistribution and use in source and binary forms, with or without
70 * modification, are permitted provided that the following conditions
71 * are met:
72 * 1. Redistributions of source code must retain the above copyright
73 * notice, this list of conditions and the following disclaimer.
74 * 2. Redistributions in binary form must reproduce the above copyright
75 * notice, this list of conditions and the following disclaimer in the
76 * documentation and/or other materials provided with the distribution.
77 * 3. Neither the name of the University nor the names of its contributors
78 * may be used to endorse or promote products derived from this software
79 * without specific prior written permission.
80 *
81 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
82 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
83 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
84 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
85 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
86 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
87 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
88 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
89 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
90 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
91 * SUCH DAMAGE.
92 *
93 * @(#)kern_synch.c 8.9 (Berkeley) 5/19/95
94 */
95
96 #include <sys/cdefs.h>
97 __KERNEL_RCSID(0, "$NetBSD: kern_synch.c,v 1.233 2008/04/28 20:24:03 martin Exp $");
98
99 #include "opt_kstack.h"
100 #include "opt_lockdebug.h"
101 #include "opt_multiprocessor.h"
102 #include "opt_perfctrs.h"
103 #include "opt_preemption.h"
104
105 #define __MUTEX_PRIVATE
106
107 #include <sys/param.h>
108 #include <sys/systm.h>
109 #include <sys/proc.h>
110 #include <sys/kernel.h>
111 #if defined(PERFCTRS)
112 #include <sys/pmc.h>
113 #endif
114 #include <sys/cpu.h>
115 #include <sys/resourcevar.h>
116 #include <sys/sched.h>
117 #include <sys/syscall_stats.h>
118 #include <sys/sleepq.h>
119 #include <sys/lockdebug.h>
120 #include <sys/evcnt.h>
121 #include <sys/intr.h>
122 #include <sys/lwpctl.h>
123 #include <sys/atomic.h>
124 #include <sys/simplelock.h>
125 #include <sys/bitops.h>
126 #include <sys/kmem.h>
127 #include <sys/sysctl.h>
128 #include <sys/idle.h>
129
130 #include <uvm/uvm_extern.h>
131
132 #include <dev/lockstat.h>
133
134 /*
  135	 * Priority-related definitions.
136 */
137 #define PRI_TS_COUNT (NPRI_USER)
138 #define PRI_RT_COUNT (PRI_COUNT - PRI_TS_COUNT)
139 #define PRI_HTS_RANGE (PRI_TS_COUNT / 10)
140
141 #define PRI_HIGHEST_TS (MAXPRI_USER)
142
143 /*
144 * Bits per map.
145 */
146 #define BITMAP_BITS (32)
147 #define BITMAP_SHIFT (5)
148 #define BITMAP_MSB (0x80000000U)
149 #define BITMAP_MASK (BITMAP_BITS - 1)
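/*
 * Each set bit in r_bitmap (below) marks a non-empty per-priority run
 * queue.  Within each 32-bit word the MSB corresponds to the lowest
 * priority covered by that word, so ffs() on a word finds its highest
 * occupied priority.
 */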
150
151 /*
152 * Structures, runqueue.
153 */
154
155 typedef struct {
156 TAILQ_HEAD(, lwp) q_head;
157 } queue_t;
158
159 typedef struct {
  160		/* Bitmap of non-empty queues */
161 uint32_t r_bitmap[PRI_COUNT >> BITMAP_SHIFT];
162 /* Counters */
163 u_int r_count; /* Count of the threads */
164 u_int r_avgcount; /* Average count of threads */
165 u_int r_mcount; /* Count of migratable threads */
166 /* Runqueues */
167 queue_t r_rt_queue[PRI_RT_COUNT];
168 queue_t r_ts_queue[PRI_TS_COUNT];
169 } runqueue_t;
170
171 static u_int sched_unsleep(struct lwp *, bool);
172 static void sched_changepri(struct lwp *, pri_t);
173 static void sched_lendpri(struct lwp *, pri_t);
174 static void *sched_getrq(runqueue_t *, const pri_t);
175 #ifdef MULTIPROCESSOR
176 static lwp_t *sched_catchlwp(void);
177 static void sched_balance(void *);
178 #endif
179
180 syncobj_t sleep_syncobj = {
181 SOBJ_SLEEPQ_SORTED,
182 sleepq_unsleep,
183 sleepq_changepri,
184 sleepq_lendpri,
185 syncobj_noowner,
186 };
187
188 syncobj_t sched_syncobj = {
189 SOBJ_SLEEPQ_SORTED,
190 sched_unsleep,
191 sched_changepri,
192 sched_lendpri,
193 syncobj_noowner,
194 };
195
196 const int schedppq = 1;
197 callout_t sched_pstats_ch;
198 unsigned sched_pstats_ticks;
199 kcondvar_t lbolt; /* once a second sleep address */
200
201 /*
202 * Kernel preemption.
203 */
204 #ifdef PREEMPTION
205 #if 0
206 int sched_kpreempt_pri = PRI_USER_RT;
207 #else
208 /* XXX disable for now until any bugs are worked out. */
209 int sched_kpreempt_pri = 1000;
210 #endif
211
212 static struct evcnt kpreempt_ev_crit;
213 static struct evcnt kpreempt_ev_klock;
214 static struct evcnt kpreempt_ev_ipl;
215 static struct evcnt kpreempt_ev_immed;
216 #else
217 int sched_kpreempt_pri = INT_MAX;
218 #endif
219 int sched_upreempt_pri = PRI_KERNEL;
220
221 /*
222 * Migration and balancing.
223 */
224 static u_int cacheht_time; /* Cache hotness time */
225 static u_int min_catch; /* Minimal LWP count for catching */
226 static u_int balance_period; /* Balance period */
227 static struct cpu_info *worker_ci; /* Victim CPU */
228 #ifdef MULTIPROCESSOR
229 static struct callout balance_ch; /* Callout of balancer */
230 #endif
231
232 /*
233 * During autoconfiguration or after a panic, a sleep will simply lower the
234 * priority briefly to allow interrupts, then return. The priority to be
235 * used (safepri) is machine-dependent, thus this value is initialized and
236 * maintained in the machine-dependent layers. This priority will typically
237 * be 0, or the lowest priority that is safe for use on the interrupt stack;
238 * it can be made higher to block network software interrupts after panics.
239 */
240 int safepri;
241
242 /*
243 * OBSOLETE INTERFACE
244 *
245 * General sleep call. Suspends the current process until a wakeup is
246 * performed on the specified identifier. The process will then be made
247 * runnable with the specified priority. Sleeps at most timo/hz seconds (0
248 * means no timeout). If pri includes PCATCH flag, signals are checked
249 * before and after sleeping, else signals are not checked. Returns 0 if
250 * awakened, EWOULDBLOCK if the timeout expires. If PCATCH is set and a
251 * signal needs to be delivered, ERESTART is returned if the current system
252 * call should be restarted if possible, and EINTR is returned if the system
  253	 * call should be interrupted by the signal.
254 *
255 * The interlock is held until we are on a sleep queue. The interlock will
  256	 * be locked before returning to the caller unless the PNORELOCK flag
257 * is specified, in which case the interlock will always be unlocked upon
258 * return.
259 */
260 int
261 ltsleep(wchan_t ident, pri_t priority, const char *wmesg, int timo,
262 volatile struct simplelock *interlock)
263 {
264 struct lwp *l = curlwp;
265 sleepq_t *sq;
266 int error;
267
268 KASSERT((l->l_pflag & LP_INTR) == 0);
269
270 if (sleepq_dontsleep(l)) {
271 (void)sleepq_abort(NULL, 0);
272 if ((priority & PNORELOCK) != 0)
273 simple_unlock(interlock);
274 return 0;
275 }
276
277 l->l_kpriority = true;
278 sq = sleeptab_lookup(&sleeptab, ident);
279 sleepq_enter(sq, l);
280 sleepq_enqueue(sq, ident, wmesg, &sleep_syncobj);
281
282 if (interlock != NULL) {
283 KASSERT(simple_lock_held(interlock));
284 simple_unlock(interlock);
285 }
286
287 error = sleepq_block(timo, priority & PCATCH);
288
289 if (interlock != NULL && (priority & PNORELOCK) == 0)
290 simple_lock(interlock);
291
292 return error;
293 }
294
295 int
296 mtsleep(wchan_t ident, pri_t priority, const char *wmesg, int timo,
297 kmutex_t *mtx)
298 {
299 struct lwp *l = curlwp;
300 sleepq_t *sq;
301 int error;
302
303 KASSERT((l->l_pflag & LP_INTR) == 0);
304
305 if (sleepq_dontsleep(l)) {
306 (void)sleepq_abort(mtx, (priority & PNORELOCK) != 0);
307 return 0;
308 }
309
310 l->l_kpriority = true;
311 sq = sleeptab_lookup(&sleeptab, ident);
312 sleepq_enter(sq, l);
313 sleepq_enqueue(sq, ident, wmesg, &sleep_syncobj);
314 mutex_exit(mtx);
315 error = sleepq_block(timo, priority & PCATCH);
316
317 if ((priority & PNORELOCK) == 0)
318 mutex_enter(mtx);
319
320 return error;
321 }
322
323 /*
324 * General sleep call for situations where a wake-up is not expected.
325 */
326 int
327 kpause(const char *wmesg, bool intr, int timo, kmutex_t *mtx)
328 {
329 struct lwp *l = curlwp;
330 sleepq_t *sq;
331 int error;
332
333 if (sleepq_dontsleep(l))
334 return sleepq_abort(NULL, 0);
335
336 if (mtx != NULL)
337 mutex_exit(mtx);
338 l->l_kpriority = true;
339 sq = sleeptab_lookup(&sleeptab, l);
340 sleepq_enter(sq, l);
341 sleepq_enqueue(sq, l, wmesg, &sleep_syncobj);
342 error = sleepq_block(timo, intr);
343 if (mtx != NULL)
344 mutex_enter(mtx);
345
346 return error;
347 }
348
349 /*
350 * OBSOLETE INTERFACE
351 *
352 * Make all processes sleeping on the specified identifier runnable.
353 */
354 void
355 wakeup(wchan_t ident)
356 {
357 sleepq_t *sq;
358
359 if (cold)
360 return;
361
362 sq = sleeptab_lookup(&sleeptab, ident);
363 sleepq_wake(sq, ident, (u_int)-1);
364 }
365
366 /*
367 * OBSOLETE INTERFACE
368 *
369 * Make the highest priority process first in line on the specified
370 * identifier runnable.
371 */
372 void
373 wakeup_one(wchan_t ident)
374 {
375 sleepq_t *sq;
376
377 if (cold)
378 return;
379
380 sq = sleeptab_lookup(&sleeptab, ident);
381 sleepq_wake(sq, ident, 1);
382 }
383
384
385 /*
386 * General yield call. Puts the current process back on its run queue and
387 * performs a voluntary context switch. Should only be called when the
  388	 * current process explicitly requests it (e.g. sched_yield(2)).
389 */
390 void
391 yield(void)
392 {
393 struct lwp *l = curlwp;
394
395 KERNEL_UNLOCK_ALL(l, &l->l_biglocks);
396 lwp_lock(l);
397 KASSERT(lwp_locked(l, l->l_cpu->ci_schedstate.spc_lwplock));
398 KASSERT(l->l_stat == LSONPROC);
399 l->l_kpriority = false;
400 (void)mi_switch(l);
401 KERNEL_LOCK(l->l_biglocks, l);
402 }
403
404 /*
405 * General preemption call. Puts the current process back on its run queue
406 * and performs an involuntary context switch.
407 */
408 void
409 preempt(void)
410 {
411 struct lwp *l = curlwp;
412
413 KERNEL_UNLOCK_ALL(l, &l->l_biglocks);
414 lwp_lock(l);
415 KASSERT(lwp_locked(l, l->l_cpu->ci_schedstate.spc_lwplock));
416 KASSERT(l->l_stat == LSONPROC);
417 l->l_kpriority = false;
418 l->l_nivcsw++;
419 (void)mi_switch(l);
420 KERNEL_LOCK(l->l_biglocks, l);
421 }
422
423 #ifdef PREEMPTION
  424	/* XXX Yuck, for lockstat: only the addresses of these dummies are used. */
425 static char in_critical_section;
426 static char kernel_lock_held;
427 static char spl_raised;
428 static char is_softint;
429
430 /*
431 * Handle a request made by another agent to preempt the current LWP
432 * in-kernel. Usually called when l_dopreempt may be non-zero.
433 */
434 bool
435 kpreempt(uintptr_t where)
436 {
437 uintptr_t failed;
438 lwp_t *l;
439 int s, dop;
440
441 l = curlwp;
442 failed = 0;
443 while ((dop = l->l_dopreempt) != 0) {
444 if (l->l_stat != LSONPROC) {
445 /*
446 * About to block (or die), let it happen.
447 * Doesn't really count as "preemption has
448 * been blocked", since we're going to
449 * context switch.
450 */
451 l->l_dopreempt = 0;
452 return true;
453 }
454 if (__predict_false((l->l_flag & LW_IDLE) != 0)) {
455 /* Can't preempt idle loop, don't count as failure. */
456 l->l_dopreempt = 0;
457 return true;
458 }
459 if (__predict_false(l->l_nopreempt != 0)) {
460 /* LWP holds preemption disabled, explicitly. */
461 if ((dop & DOPREEMPT_COUNTED) == 0) {
462 atomic_inc_64(&kpreempt_ev_crit.ev_count);
463 }
464 failed = (uintptr_t)&in_critical_section;
465 break;
466 }
467 if (__predict_false((l->l_pflag & LP_INTR) != 0)) {
468 /* Can't preempt soft interrupts yet. */
469 l->l_dopreempt = 0;
470 failed = (uintptr_t)&is_softint;
471 break;
472 }
473 s = splsched();
474 if (__predict_false(l->l_blcnt != 0 ||
475 curcpu()->ci_biglock_wanted != NULL)) {
476 /* Hold or want kernel_lock, code is not MT safe. */
477 splx(s);
478 if ((dop & DOPREEMPT_COUNTED) == 0) {
479 atomic_inc_64(&kpreempt_ev_klock.ev_count);
480 }
481 failed = (uintptr_t)&kernel_lock_held;
482 break;
483 }
484 if (__predict_false(!cpu_kpreempt_enter(where, s))) {
485 /*
486 * It may be that the IPL is too high.
  487				 * cpu_kpreempt_enter() can schedule an
488 * interrupt to retry later.
489 */
490 splx(s);
491 if ((dop & DOPREEMPT_COUNTED) == 0) {
492 atomic_inc_64(&kpreempt_ev_ipl.ev_count);
493 }
494 failed = (uintptr_t)&spl_raised;
495 break;
496 }
497 /* Do it! */
498 if (__predict_true((dop & DOPREEMPT_COUNTED) == 0)) {
499 atomic_inc_64(&kpreempt_ev_immed.ev_count);
500 }
501 lwp_lock(l);
502 mi_switch(l);
503 l->l_nopreempt++;
504 splx(s);
505
506 /* Take care of any MD cleanup. */
507 cpu_kpreempt_exit(where);
508 l->l_nopreempt--;
509 }
510
511 /* Record preemption failure for reporting via lockstat. */
512 if (__predict_false(failed)) {
513 atomic_or_uint(&l->l_dopreempt, DOPREEMPT_COUNTED);
514 int lsflag = 0;
515 LOCKSTAT_ENTER(lsflag);
516 /* Might recurse, make it atomic. */
517 if (__predict_false(lsflag)) {
518 if (where == 0) {
519 where = (uintptr_t)__builtin_return_address(0);
520 }
521 if (atomic_cas_ptr_ni((void *)&l->l_pfailaddr,
522 NULL, (void *)where) == NULL) {
523 LOCKSTAT_START_TIMER(lsflag, l->l_pfailtime);
524 l->l_pfaillock = failed;
525 }
526 }
527 LOCKSTAT_EXIT(lsflag);
528 }
529
530 return failed;
531 }
532
533 /*
534 * Return true if preemption is explicitly disabled.
535 */
536 bool
537 kpreempt_disabled(void)
538 {
539 lwp_t *l;
540
541 l = curlwp;
542
543 return l->l_nopreempt != 0 || l->l_stat == LSZOMB ||
544 (l->l_flag & LW_IDLE) != 0 || cpu_kpreempt_disabled();
545 }
546 #else
547 bool
548 kpreempt(uintptr_t where)
549 {
550
551 panic("kpreempt");
552 return true;
553 }
554
555 bool
556 kpreempt_disabled(void)
557 {
558
559 return true;
560 }
561 #endif
562
563 /*
564 * Disable kernel preemption.
565 */
566 void
567 kpreempt_disable(void)
568 {
569
570 KPREEMPT_DISABLE(curlwp);
571 }
572
573 /*
574 * Reenable kernel preemption.
575 */
576 void
577 kpreempt_enable(void)
578 {
579
580 KPREEMPT_ENABLE(curlwp);
581 }
582
583 /*
584 * Compute the amount of time during which the current lwp was running.
585 *
586 * - update l_rtime unless it's an idle lwp.
587 */
588
589 void
590 updatertime(lwp_t *l, const struct bintime *now)
591 {
592
593 if ((l->l_flag & LW_IDLE) != 0)
594 return;
595
596 /* rtime += now - stime */
597 bintime_add(&l->l_rtime, now);
598 bintime_sub(&l->l_rtime, &l->l_stime);
599 }
600
601 /*
  602	 * The machine-independent parts of a context switch.
603 *
604 * Returns 1 if another LWP was actually run.
605 */
606 int
607 mi_switch(lwp_t *l)
608 {
609 struct cpu_info *ci, *tci = NULL;
610 struct schedstate_percpu *spc;
611 struct lwp *newl;
612 int retval, oldspl;
613 struct bintime bt;
614 bool returning;
615
616 KASSERT(lwp_locked(l, NULL));
617 KASSERT(kpreempt_disabled());
618 LOCKDEBUG_BARRIER(l->l_mutex, 1);
619
620 #ifdef KSTACK_CHECK_MAGIC
621 kstack_check_magic(l);
622 #endif
623
624 binuptime(&bt);
625
626 KASSERT(l->l_cpu == curcpu());
627 ci = l->l_cpu;
628 spc = &ci->ci_schedstate;
629 returning = false;
630 newl = NULL;
631
632 /*
633 * If we have been asked to switch to a specific LWP, then there
634 * is no need to inspect the run queues. If a soft interrupt is
635 * blocking, then return to the interrupted thread without adjusting
  636	 * VM context or its start time: neither has been changed in order
637 * to take the interrupt.
638 */
639 if (l->l_switchto != NULL) {
640 if ((l->l_pflag & LP_INTR) != 0) {
641 returning = true;
642 softint_block(l);
643 if ((l->l_flag & LW_TIMEINTR) != 0)
644 updatertime(l, &bt);
645 }
646 newl = l->l_switchto;
647 l->l_switchto = NULL;
648 }
649 #ifndef __HAVE_FAST_SOFTINTS
650 else if (ci->ci_data.cpu_softints != 0) {
651 /* There are pending soft interrupts, so pick one. */
652 newl = softint_picklwp();
653 newl->l_stat = LSONPROC;
654 newl->l_flag |= LW_RUNNING;
655 }
656 #endif /* !__HAVE_FAST_SOFTINTS */
657
658 /* Count time spent in current system call */
659 if (!returning) {
660 SYSCALL_TIME_SLEEP(l);
661
662 /*
663 * XXXSMP If we are using h/w performance counters,
664 * save context.
665 */
666 #if PERFCTRS
667 if (PMC_ENABLED(l->l_proc)) {
668 pmc_save_context(l->l_proc);
669 }
670 #endif
671 updatertime(l, &bt);
672 }
673
674 /*
  675	 * If the LWP is on the CPU and we have gotten this far, then it must yield.
676 */
677 KASSERT(l->l_stat != LSRUN);
678 if (l->l_stat == LSONPROC && (l->l_target_cpu || l != newl)) {
679 KASSERT(lwp_locked(l, spc->spc_lwplock));
680
681 if (l->l_target_cpu == l->l_cpu) {
682 l->l_target_cpu = NULL;
683 } else {
684 tci = l->l_target_cpu;
685 }
686
687 if (__predict_false(tci != NULL)) {
688 /* Double-lock the runqueues */
689 spc_dlock(ci, tci);
690 } else {
691 /* Lock the runqueue */
692 spc_lock(ci);
693 }
694
695 if ((l->l_flag & LW_IDLE) == 0) {
696 l->l_stat = LSRUN;
697 if (__predict_false(tci != NULL)) {
698 /*
699 * Set the new CPU, lock and unset the
700 * l_target_cpu - thread will be enqueued
701 * to the runqueue of target CPU.
702 */
703 l->l_cpu = tci;
704 lwp_setlock(l, tci->ci_schedstate.spc_mutex);
705 l->l_target_cpu = NULL;
706 } else {
707 lwp_setlock(l, spc->spc_mutex);
708 }
709 sched_enqueue(l, true);
710 } else {
711 KASSERT(tci == NULL);
712 l->l_stat = LSIDL;
713 }
714 } else {
715 /* Lock the runqueue */
716 spc_lock(ci);
717 }
718
719 /*
  720	 * Let sched_nextlwp() select the LWP to run on the CPU next.
  721	 * If no LWP is runnable, select the idle LWP.
  722	 *
  723	 * Note that spc_lwplock might not necessarily be held, and that the
  724	 * new thread will be unlocked after setting the LWP lock.
725 */
726 if (newl == NULL) {
727 newl = sched_nextlwp();
728 if (newl != NULL) {
729 sched_dequeue(newl);
730 KASSERT(lwp_locked(newl, spc->spc_mutex));
731 newl->l_stat = LSONPROC;
732 newl->l_cpu = ci;
733 newl->l_flag |= LW_RUNNING;
734 lwp_setlock(newl, spc->spc_lwplock);
735 } else {
736 newl = ci->ci_data.cpu_idlelwp;
737 newl->l_stat = LSONPROC;
738 newl->l_flag |= LW_RUNNING;
739 }
740 /*
741 * Only clear want_resched if there are no
742 * pending (slow) software interrupts.
743 */
744 ci->ci_want_resched = ci->ci_data.cpu_softints;
745 spc->spc_flags &= ~SPCF_SWITCHCLEAR;
746 spc->spc_curpriority = lwp_eprio(newl);
747 }
748
749 /* Items that must be updated with the CPU locked. */
750 if (!returning) {
751 /* Update the new LWP's start time. */
752 newl->l_stime = bt;
753
754 /*
755 * ci_curlwp changes when a fast soft interrupt occurs.
756 * We use cpu_onproc to keep track of which kernel or
757 * user thread is running 'underneath' the software
758 * interrupt. This is important for time accounting,
759 * itimers and forcing user threads to preempt (aston).
760 */
761 ci->ci_data.cpu_onproc = newl;
762 }
763
764 /* Kernel preemption related tasks. */
765 l->l_dopreempt = 0;
766 if (__predict_false(l->l_pfailaddr != 0)) {
767 LOCKSTAT_FLAG(lsflag);
768 LOCKSTAT_ENTER(lsflag);
769 LOCKSTAT_STOP_TIMER(lsflag, l->l_pfailtime);
770 LOCKSTAT_EVENT_RA(lsflag, l->l_pfaillock, LB_NOPREEMPT|LB_SPIN,
771 1, l->l_pfailtime, l->l_pfailaddr);
772 LOCKSTAT_EXIT(lsflag);
773 l->l_pfailtime = 0;
774 l->l_pfaillock = 0;
775 l->l_pfailaddr = 0;
776 }
777
778 if (l != newl) {
779 struct lwp *prevlwp;
780
781 /* Release all locks, but leave the current LWP locked */
782 if (l->l_mutex == l->l_cpu->ci_schedstate.spc_mutex) {
783 /*
784 * In case of migration, drop the local runqueue
  785				 * lock; the thread is now on the other runqueue.
786 */
787 if (__predict_false(tci != NULL))
788 spc_unlock(ci);
789 /*
790 * Drop spc_lwplock, if the current LWP has been moved
791 * to the run queue (it is now locked by spc_mutex).
792 */
793 mutex_spin_exit(spc->spc_lwplock);
794 } else {
795 /*
796 * Otherwise, drop the spc_mutex, we are done with the
797 * run queues.
798 */
799 mutex_spin_exit(spc->spc_mutex);
800 KASSERT(tci == NULL);
801 }
802
803 /*
  804			 * Mark that a context switch is going to be performed
805 * for this LWP, to protect it from being switched
806 * to on another CPU.
807 */
808 KASSERT(l->l_ctxswtch == 0);
809 l->l_ctxswtch = 1;
810 l->l_ncsw++;
811 l->l_flag &= ~LW_RUNNING;
812
813 /*
814 * Increase the count of spin-mutexes before the release
815 * of the last lock - we must remain at IPL_SCHED during
816 * the context switch.
817 */
818 oldspl = MUTEX_SPIN_OLDSPL(ci);
819 ci->ci_mtx_count--;
820 lwp_unlock(l);
821
822 /* Count the context switch on this CPU. */
823 ci->ci_data.cpu_nswtch++;
824
825 /* Update status for lwpctl, if present. */
826 if (l->l_lwpctl != NULL)
827 l->l_lwpctl->lc_curcpu = LWPCTL_CPU_NONE;
828
829 /*
830 * Save old VM context, unless a soft interrupt
831 * handler is blocking.
832 */
833 if (!returning)
834 pmap_deactivate(l);
835
836 /*
  837			 * We may need to spin-wait if 'newl' is still
838 * context switching on another CPU.
839 */
840 if (newl->l_ctxswtch != 0) {
841 u_int count;
842 count = SPINLOCK_BACKOFF_MIN;
843 while (newl->l_ctxswtch)
844 SPINLOCK_BACKOFF(count);
845 }
846
  847			/* Switch to the new LWP. */
848 prevlwp = cpu_switchto(l, newl, returning);
849 ci = curcpu();
850
851 /*
852 * Switched away - we have new curlwp.
853 * Restore VM context and IPL.
854 */
855 pmap_activate(l);
856 if (prevlwp != NULL) {
857 /* Normalize the count of the spin-mutexes */
858 ci->ci_mtx_count++;
859 /* Unmark the state of context switch */
860 membar_exit();
861 prevlwp->l_ctxswtch = 0;
862 }
863
864 /* Update status for lwpctl, if present. */
865 if (l->l_lwpctl != NULL) {
866 l->l_lwpctl->lc_curcpu = (int)cpu_index(ci);
867 l->l_lwpctl->lc_pctr++;
868 }
869
870 KASSERT(l->l_cpu == ci);
871 splx(oldspl);
872 retval = 1;
873 } else {
874 /* Nothing to do - just unlock and return. */
875 KASSERT(tci == NULL);
876 spc_unlock(ci);
877 lwp_unlock(l);
878 retval = 0;
879 }
880
881 KASSERT(l == curlwp);
882 KASSERT(l->l_stat == LSONPROC);
883
884 /*
885 * XXXSMP If we are using h/w performance counters, restore context.
886 * XXXSMP preemption problem.
887 */
888 #if PERFCTRS
889 if (PMC_ENABLED(l->l_proc)) {
890 pmc_restore_context(l->l_proc);
891 }
892 #endif
893 SYSCALL_TIME_WAKEUP(l);
894 LOCKDEBUG_BARRIER(NULL, 1);
895
896 return retval;
897 }
898
899 /*
900 * Change process state to be runnable, placing it on the run queue if it is
901 * in memory, and awakening the swapper if it isn't in memory.
902 *
903 * Call with the process and LWP locked. Will return with the LWP unlocked.
904 */
905 void
906 setrunnable(struct lwp *l)
907 {
908 struct proc *p = l->l_proc;
909 struct cpu_info *ci;
910 sigset_t *ss;
911
912 KASSERT((l->l_flag & LW_IDLE) == 0);
913 KASSERT(mutex_owned(p->p_lock));
914 KASSERT(lwp_locked(l, NULL));
915 KASSERT(l->l_mutex != l->l_cpu->ci_schedstate.spc_mutex);
916
917 switch (l->l_stat) {
918 case LSSTOP:
919 /*
920 * If we're being traced (possibly because someone attached us
921 * while we were stopped), check for a signal from the debugger.
922 */
923 if ((p->p_slflag & PSL_TRACED) != 0 && p->p_xstat != 0) {
924 if ((sigprop[p->p_xstat] & SA_TOLWP) != 0)
925 ss = &l->l_sigpend.sp_set;
926 else
927 ss = &p->p_sigpend.sp_set;
928 sigaddset(ss, p->p_xstat);
929 signotify(l);
930 }
931 p->p_nrlwps++;
932 break;
933 case LSSUSPENDED:
934 l->l_flag &= ~LW_WSUSPEND;
935 p->p_nrlwps++;
936 cv_broadcast(&p->p_lwpcv);
937 break;
938 case LSSLEEP:
939 KASSERT(l->l_wchan != NULL);
940 break;
941 default:
942 panic("setrunnable: lwp %p state was %d", l, l->l_stat);
943 }
944
945 /*
  946	 * If the LWP was sleeping interruptibly, then it's OK to start it
947 * again. If not, mark it as still sleeping.
948 */
949 if (l->l_wchan != NULL) {
950 l->l_stat = LSSLEEP;
951 /* lwp_unsleep() will release the lock. */
952 lwp_unsleep(l, true);
953 return;
954 }
955
956 /*
957 * If the LWP is still on the CPU, mark it as LSONPROC. It may be
958 * about to call mi_switch(), in which case it will yield.
959 */
960 if ((l->l_flag & LW_RUNNING) != 0) {
961 l->l_stat = LSONPROC;
962 l->l_slptime = 0;
963 lwp_unlock(l);
964 return;
965 }
966
967 /*
  968	 * Look for a CPU to run on.
969 * Set the LWP runnable.
970 */
971 ci = sched_takecpu(l);
972 l->l_cpu = ci;
973 if (l->l_mutex != l->l_cpu->ci_schedstate.spc_mutex) {
974 lwp_unlock_to(l, ci->ci_schedstate.spc_mutex);
975 lwp_lock(l);
976 }
977 sched_setrunnable(l);
978 l->l_stat = LSRUN;
979 l->l_slptime = 0;
980
981 /*
  982	 * If the thread is swapped out, wake the swapper to bring it back in.
983 * Otherwise, enter it into a run queue.
984 */
985 if (l->l_flag & LW_INMEM) {
986 sched_enqueue(l, false);
987 resched_cpu(l);
988 lwp_unlock(l);
989 } else {
990 lwp_unlock(l);
991 uvm_kick_scheduler();
992 }
993 }
994
995 /*
996 * suspendsched:
997 *
  998	 *	Convert all non-PK_SYSTEM LSSLEEP or LSRUN LWPs to LSSUSPENDED.
999 */
1000 void
1001 suspendsched(void)
1002 {
1003 CPU_INFO_ITERATOR cii;
1004 struct cpu_info *ci;
1005 struct lwp *l;
1006 struct proc *p;
1007
1008 /*
1009 * We do this by process in order not to violate the locking rules.
1010 */
1011 mutex_enter(proc_lock);
1012 PROCLIST_FOREACH(p, &allproc) {
1013 mutex_enter(p->p_lock);
1014
1015 if ((p->p_flag & PK_SYSTEM) != 0) {
1016 mutex_exit(p->p_lock);
1017 continue;
1018 }
1019
1020 p->p_stat = SSTOP;
1021
1022 LIST_FOREACH(l, &p->p_lwps, l_sibling) {
1023 if (l == curlwp)
1024 continue;
1025
1026 lwp_lock(l);
1027
1028 /*
 1029			 * Set LW_WREBOOT so that the LWP will suspend itself
 1030			 * when it tries to return to user mode. We want to
 1031			 * get as many LWPs as possible to
1032 * the user / kernel boundary, so that they will
1033 * release any locks that they hold.
1034 */
1035 l->l_flag |= (LW_WREBOOT | LW_WSUSPEND);
1036
1037 if (l->l_stat == LSSLEEP &&
1038 (l->l_flag & LW_SINTR) != 0) {
1039 /* setrunnable() will release the lock. */
1040 setrunnable(l);
1041 continue;
1042 }
1043
1044 lwp_unlock(l);
1045 }
1046
1047 mutex_exit(p->p_lock);
1048 }
1049 mutex_exit(proc_lock);
1050
1051 /*
1052 * Kick all CPUs to make them preempt any LWPs running in user mode.
1053 * They'll trap into the kernel and suspend themselves in userret().
1054 */
1055 for (CPU_INFO_FOREACH(cii, ci)) {
1056 spc_lock(ci);
1057 cpu_need_resched(ci, RESCHED_IMMED);
1058 spc_unlock(ci);
1059 }
1060 }
1061
1062 /*
1063 * sched_unsleep:
1064 *
 1065	 *	This is called when the LWP has not been awoken normally but instead
1066 * interrupted: for example, if the sleep timed out. Because of this,
1067 * it's not a valid action for running or idle LWPs.
1068 */
1069 static u_int
1070 sched_unsleep(struct lwp *l, bool cleanup)
1071 {
1072
1073 lwp_unlock(l);
1074 panic("sched_unsleep");
1075 }
1076
1077 void
1078 resched_cpu(struct lwp *l)
1079 {
1080 struct cpu_info *ci;
1081
1082 /*
1083 * XXXSMP
1084 * Since l->l_cpu persists across a context switch,
1085 * this gives us *very weak* processor affinity, in
1086 * that we notify the CPU on which the process last
1087 * ran that it should try to switch.
1088 *
1089 * This does not guarantee that the process will run on
1090 * that processor next, because another processor might
1091 * grab it the next time it performs a context switch.
1092 *
1093 * This also does not handle the case where its last
1094 * CPU is running a higher-priority process, but every
1095 * other CPU is running a lower-priority process. There
1096 * are ways to handle this situation, but they're not
1097 * currently very pretty, and we also need to weigh the
1098 * cost of moving a process from one CPU to another.
1099 */
1100 ci = l->l_cpu;
1101 if (lwp_eprio(l) > ci->ci_schedstate.spc_curpriority)
1102 cpu_need_resched(ci, 0);
1103 }
1104
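/*
 * sched_changepri:
 *
 *	Change the priority of an LWP, re-enqueueing it on its run queue
 *	if it is currently runnable and in memory.
 */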
1105 static void
1106 sched_changepri(struct lwp *l, pri_t pri)
1107 {
1108
1109 KASSERT(lwp_locked(l, NULL));
1110
1111 if (l->l_stat == LSRUN && (l->l_flag & LW_INMEM) != 0) {
1112 KASSERT(lwp_locked(l, l->l_cpu->ci_schedstate.spc_mutex));
1113 sched_dequeue(l);
1114 l->l_priority = pri;
1115 sched_enqueue(l, false);
1116 } else {
1117 l->l_priority = pri;
1118 }
1119 resched_cpu(l);
1120 }
1121
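/*
 * sched_lendpri:
 *
 *	Lend a priority to an LWP (set l_inheritedprio), re-enqueueing it
 *	on its run queue if it is currently runnable and in memory.
 */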
1122 static void
1123 sched_lendpri(struct lwp *l, pri_t pri)
1124 {
1125
1126 KASSERT(lwp_locked(l, NULL));
1127
1128 if (l->l_stat == LSRUN && (l->l_flag & LW_INMEM) != 0) {
1129 KASSERT(lwp_locked(l, l->l_cpu->ci_schedstate.spc_mutex));
1130 sched_dequeue(l);
1131 l->l_inheritedprio = pri;
1132 sched_enqueue(l, false);
1133 } else {
1134 l->l_inheritedprio = pri;
1135 }
1136 resched_cpu(l);
1137 }
1138
1139 struct lwp *
1140 syncobj_noowner(wchan_t wchan)
1141 {
1142
1143 return NULL;
1144 }
1145
1146 /* decay 95% of `p_pctcpu' in 60 seconds; see CCPU_SHIFT before changing */
1147 fixpt_t ccpu = 0.95122942450071400909 * FSCALE; /* exp(-1/20) */
1148
1149 /*
1150 * If `ccpu' is not equal to `exp(-1/20)' and you still want to use the
1151 * faster/more-accurate formula, you'll have to estimate CCPU_SHIFT below
1152 * and possibly adjust FSHIFT in "param.h" so that (FSHIFT >= CCPU_SHIFT).
1153 *
1154 * To estimate CCPU_SHIFT for exp(-1/20), the following formula was used:
1155 * 1 - exp(-1/20) ~= 0.0487 ~= 0.0488 == 1 (fixed pt, *11* bits).
1156 *
 1157	 * If you don't want to bother with the faster/more-accurate formula, you
 1158	 * can set CCPU_SHIFT to (FSHIFT + 1) which will use a slower/less-accurate
 1159	 * (more general) method of calculating the percentage of CPU used by a process.
1160 */
1161 #define CCPU_SHIFT (FSHIFT + 1)
1162
1163 /*
1164 * sched_pstats:
1165 *
1166 * Update process statistics and check CPU resource allocation.
1167 * Call scheduler-specific hook to eventually adjust process/LWP
1168 * priorities.
1169 */
1170 /* ARGSUSED */
1171 void
1172 sched_pstats(void *arg)
1173 {
1174 struct rlimit *rlim;
1175 struct lwp *l;
1176 struct proc *p;
1177 int sig, clkhz;
1178 long runtm;
1179
1180 sched_pstats_ticks++;
1181
1182 mutex_enter(proc_lock);
1183 PROCLIST_FOREACH(p, &allproc) {
1184 /*
1185 * Increment time in/out of memory and sleep time (if
1186 * sleeping). We ignore overflow; with 16-bit int's
1187 * (remember them?) overflow takes 45 days.
1188 */
1189 mutex_enter(p->p_lock);
1190 mutex_spin_enter(&p->p_stmutex);
1191 runtm = p->p_rtime.sec;
1192 LIST_FOREACH(l, &p->p_lwps, l_sibling) {
1193 if ((l->l_flag & LW_IDLE) != 0)
1194 continue;
1195 lwp_lock(l);
1196 runtm += l->l_rtime.sec;
1197 l->l_swtime++;
1198 sched_pstats_hook(l);
1199 lwp_unlock(l);
1200
1201 /*
1202 * p_pctcpu is only for ps.
1203 */
1204 l->l_pctcpu = (l->l_pctcpu * ccpu) >> FSHIFT;
1205 if (l->l_slptime < 1) {
1206 clkhz = stathz != 0 ? stathz : hz;
1207 #if (FSHIFT >= CCPU_SHIFT)
1208 l->l_pctcpu += (clkhz == 100) ?
1209 ((fixpt_t)l->l_cpticks) <<
1210 (FSHIFT - CCPU_SHIFT) :
 1211				    100 * (((fixpt_t) l->l_cpticks)
1212 << (FSHIFT - CCPU_SHIFT)) / clkhz;
1213 #else
1214 l->l_pctcpu += ((FSCALE - ccpu) *
1215 (l->l_cpticks * FSCALE / clkhz)) >> FSHIFT;
1216 #endif
1217 l->l_cpticks = 0;
1218 }
1219 }
1220 p->p_pctcpu = (p->p_pctcpu * ccpu) >> FSHIFT;
1221 mutex_spin_exit(&p->p_stmutex);
1222
1223 /*
1224 * Check if the process exceeds its CPU resource allocation.
1225 * If over max, kill it.
1226 */
1227 rlim = &p->p_rlimit[RLIMIT_CPU];
1228 sig = 0;
1229 if (runtm >= rlim->rlim_cur) {
1230 if (runtm >= rlim->rlim_max)
1231 sig = SIGKILL;
1232 else {
1233 sig = SIGXCPU;
1234 if (rlim->rlim_cur < rlim->rlim_max)
1235 rlim->rlim_cur += 5;
1236 }
1237 }
1238 mutex_exit(p->p_lock);
1239 if (sig)
1240 psignal(p, sig);
1241 }
1242 mutex_exit(proc_lock);
1243 uvm_meter();
1244 cv_wakeup(&lbolt);
1245 callout_schedule(&sched_pstats_ch, hz);
1246 }
1247
1248 void
1249 sched_init(void)
1250 {
1251
1252 cv_init(&lbolt, "lbolt");
1253 callout_init(&sched_pstats_ch, CALLOUT_MPSAFE);
1254 callout_setfunc(&sched_pstats_ch, sched_pstats, NULL);
1255
1256 /* Balancing */
1257 worker_ci = curcpu();
1258 cacheht_time = mstohz(5); /* ~5 ms */
1259 balance_period = mstohz(300); /* ~300ms */
1260
1261 /* Minimal count of LWPs for catching: log2(count of CPUs) */
1262 min_catch = min(ilog2(ncpu), 4);
1263
1264 #ifdef PREEMPTION
1265 evcnt_attach_dynamic(&kpreempt_ev_crit, EVCNT_TYPE_INTR, NULL,
1266 "kpreempt", "defer: critical section");
1267 evcnt_attach_dynamic(&kpreempt_ev_klock, EVCNT_TYPE_INTR, NULL,
1268 "kpreempt", "defer: kernel_lock");
1269 evcnt_attach_dynamic(&kpreempt_ev_ipl, EVCNT_TYPE_INTR, NULL,
1270 "kpreempt", "defer: IPL");
1271 evcnt_attach_dynamic(&kpreempt_ev_immed, EVCNT_TYPE_INTR, NULL,
1272 "kpreempt", "immediate");
1273 #endif
1274
1275 /* Initialize balancing callout and run it */
1276 #ifdef MULTIPROCESSOR
1277 callout_init(&balance_ch, CALLOUT_MPSAFE);
1278 callout_setfunc(&balance_ch, sched_balance, NULL);
1279 callout_schedule(&balance_ch, balance_period);
1280 #endif
1281 sched_pstats(NULL);
1282 }
1283
1284 SYSCTL_SETUP(sysctl_sched_setup, "sysctl sched setup")
1285 {
1286 const struct sysctlnode *node = NULL;
1287
1288 sysctl_createv(clog, 0, NULL, NULL,
1289 CTLFLAG_PERMANENT,
1290 CTLTYPE_NODE, "kern", NULL,
1291 NULL, 0, NULL, 0,
1292 CTL_KERN, CTL_EOL);
1293 sysctl_createv(clog, 0, NULL, &node,
1294 CTLFLAG_PERMANENT,
1295 CTLTYPE_NODE, "sched",
1296 SYSCTL_DESCR("Scheduler options"),
1297 NULL, 0, NULL, 0,
1298 CTL_KERN, CTL_CREATE, CTL_EOL);
1299
1300 if (node == NULL)
1301 return;
1302
1303 sysctl_createv(clog, 0, &node, NULL,
1304 CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
1305 CTLTYPE_INT, "cacheht_time",
1306 SYSCTL_DESCR("Cache hotness time (in ticks)"),
1307 NULL, 0, &cacheht_time, 0,
1308 CTL_CREATE, CTL_EOL);
1309 sysctl_createv(clog, 0, &node, NULL,
1310 CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
1311 CTLTYPE_INT, "balance_period",
1312 SYSCTL_DESCR("Balance period (in ticks)"),
1313 NULL, 0, &balance_period, 0,
1314 CTL_CREATE, CTL_EOL);
1315 sysctl_createv(clog, 0, &node, NULL,
1316 CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
1317 CTLTYPE_INT, "min_catch",
1318 SYSCTL_DESCR("Minimal count of threads for catching"),
1319 NULL, 0, &min_catch, 0,
1320 CTL_CREATE, CTL_EOL);
1321 sysctl_createv(clog, 0, &node, NULL,
1322 CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
1323 CTLTYPE_INT, "timesoftints",
1324 SYSCTL_DESCR("Track CPU time for soft interrupts"),
1325 NULL, 0, &softint_timing, 0,
1326 CTL_CREATE, CTL_EOL);
1327 sysctl_createv(clog, 0, &node, NULL,
1328 #ifdef PREEMPTION
1329 CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
1330 #else
1331 CTLFLAG_PERMANENT,
1332 #endif
1333 CTLTYPE_INT, "kpreempt_pri",
1334 SYSCTL_DESCR("Minimum priority to trigger kernel preemption"),
1335 NULL, 0, &sched_kpreempt_pri, 0,
1336 CTL_CREATE, CTL_EOL);
1337 sysctl_createv(clog, 0, &node, NULL,
1338 CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
1339 CTLTYPE_INT, "upreempt_pri",
1340 SYSCTL_DESCR("Minimum priority to trigger user preemption"),
1341 NULL, 0, &sched_upreempt_pri, 0,
1342 CTL_CREATE, CTL_EOL);
1343 }
1344
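/*
 * sched_cpuattach:
 *
 *	Per-CPU scheduler initialization: allocate the run queue, aligned
 *	to a cache line (coherency_unit), and the per-CPU scheduler locks.
 */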
1345 void
1346 sched_cpuattach(struct cpu_info *ci)
1347 {
1348 runqueue_t *ci_rq;
1349 void *rq_ptr;
1350 u_int i, size;
1351
1352 if (ci->ci_schedstate.spc_lwplock == NULL) {
1353 ci->ci_schedstate.spc_lwplock =
1354 mutex_obj_alloc(MUTEX_DEFAULT, IPL_SCHED);
1355 }
1356 if (ci == lwp0.l_cpu) {
1357 /* Initialize the scheduler structure of the primary LWP */
1358 lwp0.l_mutex = ci->ci_schedstate.spc_lwplock;
1359 }
1360 if (ci->ci_schedstate.spc_mutex != NULL) {
1361 /* Already initialized. */
1362 return;
1363 }
1364
1365 /* Allocate the run queue */
1366 size = roundup2(sizeof(runqueue_t), coherency_unit) + coherency_unit;
1367 rq_ptr = kmem_zalloc(size, KM_SLEEP);
1368 if (rq_ptr == NULL) {
1369 panic("sched_cpuattach: could not allocate the runqueue");
1370 }
1371 ci_rq = (void *)(roundup2((uintptr_t)(rq_ptr), coherency_unit));
1372
1373 /* Initialize run queues */
1374 ci->ci_schedstate.spc_mutex =
1375 mutex_obj_alloc(MUTEX_DEFAULT, IPL_SCHED);
1376 for (i = 0; i < PRI_RT_COUNT; i++)
1377 TAILQ_INIT(&ci_rq->r_rt_queue[i].q_head);
1378 for (i = 0; i < PRI_TS_COUNT; i++)
1379 TAILQ_INIT(&ci_rq->r_ts_queue[i].q_head);
1380
1381 ci->ci_schedstate.spc_sched_info = ci_rq;
1382 }
1383
1384 /*
1385 * Control of the runqueue.
1386 */
1387
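/*
 * sched_getrq:
 *
 *	Map a priority to its queue: time-sharing priorities
 *	(0 .. PRI_HIGHEST_TS) index r_ts_queue, higher (real-time)
 *	priorities index r_rt_queue.
 */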
1388 static void *
1389 sched_getrq(runqueue_t *ci_rq, const pri_t prio)
1390 {
1391
1392 KASSERT(prio < PRI_COUNT);
1393 return (prio <= PRI_HIGHEST_TS) ?
1394 &ci_rq->r_ts_queue[prio].q_head :
1395 &ci_rq->r_rt_queue[prio - PRI_HIGHEST_TS - 1].q_head;
1396 }
1397
1398 void
1399 sched_enqueue(struct lwp *l, bool swtch)
1400 {
1401 runqueue_t *ci_rq;
1402 struct schedstate_percpu *spc;
1403 TAILQ_HEAD(, lwp) *q_head;
1404 const pri_t eprio = lwp_eprio(l);
1405 struct cpu_info *ci;
1406 int type;
1407
1408 ci = l->l_cpu;
1409 spc = &ci->ci_schedstate;
1410 ci_rq = spc->spc_sched_info;
1411 KASSERT(lwp_locked(l, l->l_cpu->ci_schedstate.spc_mutex));
1412
1413 /* Update the last run time on switch */
1414 if (__predict_true(swtch == true)) {
 1415			l->l_rticksum += (hardclock_ticks - l->l_rticks);
 1416			l->l_rticks = hardclock_ticks;
1417 } else if (l->l_rticks == 0)
1418 l->l_rticks = hardclock_ticks;
1419
1420 /* Enqueue the thread */
1421 q_head = sched_getrq(ci_rq, eprio);
1422 if (TAILQ_EMPTY(q_head)) {
1423 u_int i;
1424 uint32_t q;
1425
1426 /* Mark bit */
1427 i = eprio >> BITMAP_SHIFT;
1428 q = BITMAP_MSB >> (eprio & BITMAP_MASK);
1429 KASSERT((ci_rq->r_bitmap[i] & q) == 0);
1430 ci_rq->r_bitmap[i] |= q;
1431 }
1432 TAILQ_INSERT_TAIL(q_head, l, l_runq);
1433 ci_rq->r_count++;
1434 if ((l->l_pflag & LP_BOUND) == 0)
1435 ci_rq->r_mcount++;
1436
1437 /*
 1438	 * Update the value of the highest priority in the runqueue,
 1439	 * if the priority of this thread is higher.
1440 */
1441 if (eprio > spc->spc_maxpriority)
1442 spc->spc_maxpriority = eprio;
1443
1444 sched_newts(l);
1445
1446 /*
1447 * Wake the chosen CPU or cause a preemption if the newly
1448 * enqueued thread has higher priority. Don't cause a
1449 * preemption if the thread is yielding (swtch).
1450 */
1451 if (!swtch && eprio > spc->spc_curpriority) {
1452 if (eprio >= sched_kpreempt_pri)
1453 type = RESCHED_KPREEMPT;
1454 else if (eprio >= sched_upreempt_pri)
1455 type = RESCHED_IMMED;
1456 else
1457 type = 0;
1458 cpu_need_resched(ci, type);
1459 }
1460 }
1461
1462 void
1463 sched_dequeue(struct lwp *l)
1464 {
1465 runqueue_t *ci_rq;
1466 TAILQ_HEAD(, lwp) *q_head;
1467 struct schedstate_percpu *spc;
1468 const pri_t eprio = lwp_eprio(l);
1469
 1470		spc = &l->l_cpu->ci_schedstate;
1471 ci_rq = spc->spc_sched_info;
1472 KASSERT(lwp_locked(l, spc->spc_mutex));
1473
1474 KASSERT(eprio <= spc->spc_maxpriority);
1475 KASSERT(ci_rq->r_bitmap[eprio >> BITMAP_SHIFT] != 0);
1476 KASSERT(ci_rq->r_count > 0);
1477
1478 ci_rq->r_count--;
1479 if ((l->l_pflag & LP_BOUND) == 0)
1480 ci_rq->r_mcount--;
1481
1482 q_head = sched_getrq(ci_rq, eprio);
1483 TAILQ_REMOVE(q_head, l, l_runq);
1484 if (TAILQ_EMPTY(q_head)) {
1485 u_int i;
1486 uint32_t q;
1487
1488 /* Unmark bit */
1489 i = eprio >> BITMAP_SHIFT;
1490 q = BITMAP_MSB >> (eprio & BITMAP_MASK);
1491 KASSERT((ci_rq->r_bitmap[i] & q) != 0);
1492 ci_rq->r_bitmap[i] &= ~q;
1493
1494 /*
 1495			 * Update the value of the highest priority in the runqueue, in
 1496			 * case this was the last thread in the queue of that priority.
1497 */
1498 if (eprio != spc->spc_maxpriority)
1499 return;
1500
1501 do {
1502 if (ci_rq->r_bitmap[i] != 0) {
1503 q = ffs(ci_rq->r_bitmap[i]);
1504 spc->spc_maxpriority =
1505 (i << BITMAP_SHIFT) + (BITMAP_BITS - q);
1506 return;
1507 }
1508 } while (i--);
1509
1510 /* If not found - set the lowest value */
1511 spc->spc_maxpriority = 0;
1512 }
1513 }
1514
1515 /*
1516 * Migration and balancing.
1517 */
1518
1519 #ifdef MULTIPROCESSOR
1520
1521 /* Estimate if LWP is cache-hot */
1522 static inline bool
1523 lwp_cache_hot(const struct lwp *l)
1524 {
1525
1526 if (l->l_slptime || l->l_rticks == 0)
1527 return false;
1528
1529 return (hardclock_ticks - l->l_rticks <= cacheht_time);
1530 }
1531
1532 /* Check if LWP can migrate to the chosen CPU */
1533 static inline bool
1534 sched_migratable(const struct lwp *l, struct cpu_info *ci)
1535 {
1536 const struct schedstate_percpu *spc = &ci->ci_schedstate;
1537
1538 /* CPU is offline */
1539 if (__predict_false(spc->spc_flags & SPCF_OFFLINE))
1540 return false;
1541
1542 /* Affinity bind */
1543 if (__predict_false(l->l_flag & LW_AFFINITY))
1544 return CPU_ISSET(cpu_index(ci), &l->l_affinity);
1545
1546 /* Processor-set */
1547 return (spc->spc_psid == l->l_psid);
1548 }
1549
1550 /*
 1551	 * Estimate whether the LWP should migrate to another CPU.
 1552	 * Pick and return the CPU to run on.
1553 */
1554 struct cpu_info *
1555 sched_takecpu(struct lwp *l)
1556 {
1557 struct cpu_info *ci, *tci, *first, *next;
1558 struct schedstate_percpu *spc;
1559 runqueue_t *ci_rq, *ici_rq;
1560 pri_t eprio, lpri, pri;
1561
1562 KASSERT(lwp_locked(l, NULL));
1563
1564 ci = l->l_cpu;
1565 spc = &ci->ci_schedstate;
1566 ci_rq = spc->spc_sched_info;
1567
1568 /* If thread is strictly bound, do not estimate other CPUs */
1569 if (l->l_pflag & LP_BOUND)
1570 return ci;
1571
1572 /* CPU of this thread is idling - run there */
1573 if (ci_rq->r_count == 0)
1574 return ci;
1575
1576 eprio = lwp_eprio(l);
1577
1578 /* Stay if thread is cache-hot */
1579 if (__predict_true(l->l_stat != LSIDL) &&
1580 lwp_cache_hot(l) && eprio >= spc->spc_curpriority)
1581 return ci;
1582
1583 /* Run on current CPU if priority of thread is higher */
1584 ci = curcpu();
1585 spc = &ci->ci_schedstate;
1586 if (eprio > spc->spc_curpriority && sched_migratable(l, ci))
1587 return ci;
1588
1589 /*
1590 * Look for the CPU with the lowest priority thread. In case of
 1591		 * equal priority, choose the CPU with the fewest threads.
1592 */
1593 first = l->l_cpu;
1594 ci = first;
1595 tci = first;
1596 lpri = PRI_COUNT;
1597 do {
1598 next = CIRCLEQ_LOOP_NEXT(&cpu_queue, ci, ci_data.cpu_qchain);
1599 spc = &ci->ci_schedstate;
1600 ici_rq = spc->spc_sched_info;
1601 pri = max(spc->spc_curpriority, spc->spc_maxpriority);
1602 if (pri > lpri)
1603 continue;
1604
1605 if (pri == lpri && ci_rq->r_count < ici_rq->r_count)
1606 continue;
1607
1608 if (!sched_migratable(l, ci))
1609 continue;
1610
1611 lpri = pri;
1612 tci = ci;
1613 ci_rq = ici_rq;
1614 } while (ci = next, ci != first);
1615
1616 return tci;
1617 }
1618
1619 /*
 1620	 * Tries to catch an LWP from the runqueue of another CPU.
1621 */
1622 static struct lwp *
1623 sched_catchlwp(void)
1624 {
1625 struct cpu_info *curci = curcpu(), *ci = worker_ci;
1626 struct schedstate_percpu *spc;
1627 TAILQ_HEAD(, lwp) *q_head;
1628 runqueue_t *ci_rq;
1629 struct lwp *l;
1630
1631 if (curci == ci)
1632 return NULL;
1633
1634 /* Lockless check */
1635 spc = &ci->ci_schedstate;
1636 ci_rq = spc->spc_sched_info;
1637 if (ci_rq->r_mcount < min_catch)
1638 return NULL;
1639
1640 /*
1641 * Double-lock the runqueues.
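	 * The caller already holds the current CPU's run queue lock; take
	 * both locks in order of ascending CPU address to avoid deadlock,
	 * dropping and re-taking our own lock if necessary.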
1642 */
1643 if (curci < ci) {
1644 spc_lock(ci);
1645 } else if (!mutex_tryenter(ci->ci_schedstate.spc_mutex)) {
1646 const runqueue_t *cur_rq = curci->ci_schedstate.spc_sched_info;
1647
1648 spc_unlock(curci);
1649 spc_lock(ci);
1650 spc_lock(curci);
1651
1652 if (cur_rq->r_count) {
1653 spc_unlock(ci);
1654 return NULL;
1655 }
1656 }
1657
1658 if (ci_rq->r_mcount < min_catch) {
1659 spc_unlock(ci);
1660 return NULL;
1661 }
1662
1663 /* Take the highest priority thread */
1664 q_head = sched_getrq(ci_rq, spc->spc_maxpriority);
1665 l = TAILQ_FIRST(q_head);
1666
1667 for (;;) {
1668 /* Check the first and next result from the queue */
1669 if (l == NULL)
1670 break;
1671 KASSERT(l->l_stat == LSRUN);
1672 KASSERT(l->l_flag & LW_INMEM);
1673
 1674			/* Look for threads that are allowed to migrate */
1675 if ((l->l_pflag & LP_BOUND) || lwp_cache_hot(l) ||
1676 !sched_migratable(l, curci)) {
1677 l = TAILQ_NEXT(l, l_runq);
1678 continue;
1679 }
1680
1681 /* Grab the thread, and move to the local run queue */
1682 sched_dequeue(l);
1683 l->l_cpu = curci;
1684 lwp_unlock_to(l, curci->ci_schedstate.spc_mutex);
1685 sched_enqueue(l, false);
1686 return l;
1687 }
1688 spc_unlock(ci);
1689
1690 return l;
1691 }
1692
1693 /*
1694 * Periodical calculations for balancing.
1695 */
1696 static void
1697 sched_balance(void *nocallout)
1698 {
1699 struct cpu_info *ci, *hci;
1700 runqueue_t *ci_rq;
1701 CPU_INFO_ITERATOR cii;
1702 u_int highest;
1703
1704 hci = curcpu();
1705 highest = 0;
1706
 1707		/* Do a lockless counting pass over all CPUs */
1708 for (CPU_INFO_FOREACH(cii, ci)) {
1709 ci_rq = ci->ci_schedstate.spc_sched_info;
1710
1711 /* Average count of the threads */
1712 ci_rq->r_avgcount = (ci_rq->r_avgcount + ci_rq->r_mcount) >> 1;
1713
1714 /* Look for CPU with the highest average */
1715 if (ci_rq->r_avgcount > highest) {
1716 hci = ci;
1717 highest = ci_rq->r_avgcount;
1718 }
1719 }
1720
1721 /* Update the worker */
1722 worker_ci = hci;
1723
1724 if (nocallout == NULL)
1725 callout_schedule(&balance_ch, balance_period);
1726 }
1727
1728 #else
1729
1730 struct cpu_info *
1731 sched_takecpu(struct lwp *l)
1732 {
1733
1734 return l->l_cpu;
1735 }
1736
1737 #endif /* MULTIPROCESSOR */
1738
1739 /*
 1740	 * Scheduler mill: pick the next LWP for the current CPU to run.
1741 */
1742 struct lwp *
1743 sched_nextlwp(void)
1744 {
1745 struct cpu_info *ci = curcpu();
1746 struct schedstate_percpu *spc;
1747 TAILQ_HEAD(, lwp) *q_head;
1748 runqueue_t *ci_rq;
1749 struct lwp *l;
1750
1751 spc = &ci->ci_schedstate;
1752 ci_rq = spc->spc_sched_info;
1753
1754 #ifdef MULTIPROCESSOR
 1755		/* If the runqueue is empty, try to catch a thread from another CPU */
1756 if (__predict_false(spc->spc_flags & SPCF_OFFLINE)) {
1757 if ((ci_rq->r_count - ci_rq->r_mcount) == 0)
1758 return NULL;
1759 } else if (ci_rq->r_count == 0) {
1760 /* Reset the counter, and call the balancer */
1761 ci_rq->r_avgcount = 0;
1762 sched_balance(ci);
1763
1764 /* The re-locking will be done inside */
1765 return sched_catchlwp();
1766 }
1767 #else
1768 if (ci_rq->r_count == 0)
1769 return NULL;
1770 #endif
1771
1772 /* Take the highest priority thread */
1773 KASSERT(ci_rq->r_bitmap[spc->spc_maxpriority >> BITMAP_SHIFT]);
1774 q_head = sched_getrq(ci_rq, spc->spc_maxpriority);
1775 l = TAILQ_FIRST(q_head);
1776 KASSERT(l != NULL);
1777
1778 sched_oncpu(l);
1779 l->l_rticks = hardclock_ticks;
1780
1781 return l;
1782 }
1783
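/*
 * sched_curcpu_runnable_p:
 *
 *	Return true if the current CPU has work to do: pending (slow) soft
 *	interrupts, or runnable LWPs on its queue (only LWPs bound to the
 *	CPU are counted if it is offline).
 */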
1784 bool
1785 sched_curcpu_runnable_p(void)
1786 {
1787 const struct cpu_info *ci;
1788 const runqueue_t *ci_rq;
1789 bool rv;
1790
1791 kpreempt_disable();
1792 ci = curcpu();
1793 ci_rq = ci->ci_schedstate.spc_sched_info;
1794
1795 #ifndef __HAVE_FAST_SOFTINTS
1796 if (ci->ci_data.cpu_softints) {
1797 kpreempt_enable();
1798 return true;
1799 }
1800 #endif
1801
1802 if (ci->ci_schedstate.spc_flags & SPCF_OFFLINE)
1803 rv = (ci_rq->r_count - ci_rq->r_mcount);
1804 else
1805 rv = ci_rq->r_count != 0;
1806 kpreempt_enable();
1807
1808 return rv;
1809 }
1810
1811 /*
1812 * Debugging.
1813 */
1814
1815 #ifdef DDB
1816
1817 void
1818 sched_print_runqueue(void (*pr)(const char *, ...)
1819 __attribute__((__format__(__printf__,1,2))))
1820 {
1821 runqueue_t *ci_rq;
1822 struct schedstate_percpu *spc;
1823 struct lwp *l;
1824 struct proc *p;
1825 int i;
1826 struct cpu_info *ci;
1827 CPU_INFO_ITERATOR cii;
1828
1829 for (CPU_INFO_FOREACH(cii, ci)) {
1830 spc = &ci->ci_schedstate;
1831 ci_rq = spc->spc_sched_info;
1832
1833 (*pr)("Run-queue (CPU = %u):\n", ci->ci_index);
1834 (*pr)(" pid.lid = %d.%d, threads count = %u, "
1835 "avgcount = %u, highest pri = %d\n",
1836 #ifdef MULTIPROCESSOR
1837 ci->ci_curlwp->l_proc->p_pid, ci->ci_curlwp->l_lid,
1838 #else
1839 curlwp->l_proc->p_pid, curlwp->l_lid,
1840 #endif
1841 ci_rq->r_count, ci_rq->r_avgcount, spc->spc_maxpriority);
1842 i = (PRI_COUNT >> BITMAP_SHIFT) - 1;
1843 do {
1844 uint32_t q;
1845 q = ci_rq->r_bitmap[i];
1846 (*pr)(" bitmap[%d] => [ %d (0x%x) ]\n", i, ffs(q), q);
1847 } while (i--);
1848 }
1849
1850 (*pr)(" %5s %4s %4s %10s %3s %18s %4s %s\n",
1851 "LID", "PRI", "EPRI", "FL", "ST", "LWP", "CPU", "LRTIME");
1852
1853 PROCLIST_FOREACH(p, &allproc) {
1854 (*pr)(" /- %d (%s)\n", (int)p->p_pid, p->p_comm);
1855 LIST_FOREACH(l, &p->p_lwps, l_sibling) {
1856 ci = l->l_cpu;
1857 (*pr)(" | %5d %4u %4u 0x%8.8x %3s %18p %4u %u\n",
1858 (int)l->l_lid, l->l_priority, lwp_eprio(l),
1859 l->l_flag, l->l_stat == LSRUN ? "RQ" :
1860 (l->l_stat == LSSLEEP ? "SQ" : "-"),
1861 l, ci->ci_index,
1862 (u_int)(hardclock_ticks - l->l_rticks));
1863 }
1864 }
1865 }
1866
1867 #endif
1868