      1 /*	$NetBSD: kern_synch.c,v 1.233 2008/04/28 20:24:03 martin Exp $	*/
      2 
      3 /*-
      4  * Copyright (c) 1999, 2000, 2004, 2006, 2007, 2008 The NetBSD Foundation, Inc.
      5  * All rights reserved.
      6  *
      7  * This code is derived from software contributed to The NetBSD Foundation
      8  * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
      9  * NASA Ames Research Center, by Charles M. Hannum, Andrew Doran and
     10  * Daniel Sieger.
     11  *
     12  * Redistribution and use in source and binary forms, with or without
     13  * modification, are permitted provided that the following conditions
     14  * are met:
     15  * 1. Redistributions of source code must retain the above copyright
     16  *    notice, this list of conditions and the following disclaimer.
     17  * 2. Redistributions in binary form must reproduce the above copyright
     18  *    notice, this list of conditions and the following disclaimer in the
     19  *    documentation and/or other materials provided with the distribution.
     20  *
     21  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
     22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
     23  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     24  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
     25  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
     26  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
     27  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
     28  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
     29  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
     30  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
     31  * POSSIBILITY OF SUCH DAMAGE.
     32  */
     33 
     34 /*
     35  * Copyright (c) 2007, 2008 Mindaugas Rasiukevicius <rmind at NetBSD org>
     36  * All rights reserved.
     37  *
     38  * Redistribution and use in source and binary forms, with or without
     39  * modification, are permitted provided that the following conditions
     40  * are met:
     41  * 1. Redistributions of source code must retain the above copyright
     42  *    notice, this list of conditions and the following disclaimer.
     43  * 2. Redistributions in binary form must reproduce the above copyright
     44  *    notice, this list of conditions and the following disclaimer in the
     45  *    documentation and/or other materials provided with the distribution.
     46  *
     47  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
     48  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     49  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     50  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
     51  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
     52  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
     53  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
     54  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
     55  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
     56  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
     57  * SUCH DAMAGE.
     58  */
     59 
     60 /*-
     61  * Copyright (c) 1982, 1986, 1990, 1991, 1993
     62  *	The Regents of the University of California.  All rights reserved.
     63  * (c) UNIX System Laboratories, Inc.
     64  * All or some portions of this file are derived from material licensed
     65  * to the University of California by American Telephone and Telegraph
     66  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
     67  * the permission of UNIX System Laboratories, Inc.
     68  *
     69  * Redistribution and use in source and binary forms, with or without
     70  * modification, are permitted provided that the following conditions
     71  * are met:
     72  * 1. Redistributions of source code must retain the above copyright
     73  *    notice, this list of conditions and the following disclaimer.
     74  * 2. Redistributions in binary form must reproduce the above copyright
     75  *    notice, this list of conditions and the following disclaimer in the
     76  *    documentation and/or other materials provided with the distribution.
     77  * 3. Neither the name of the University nor the names of its contributors
     78  *    may be used to endorse or promote products derived from this software
     79  *    without specific prior written permission.
     80  *
     81  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
     82  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     83  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     84  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
     85  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
     86  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
     87  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
     88  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
     89  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
     90  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
     91  * SUCH DAMAGE.
     92  *
     93  *	@(#)kern_synch.c	8.9 (Berkeley) 5/19/95
     94  */
     95 
     96 #include <sys/cdefs.h>
     97 __KERNEL_RCSID(0, "$NetBSD: kern_synch.c,v 1.233 2008/04/28 20:24:03 martin Exp $");
     98 
     99 #include "opt_kstack.h"
    100 #include "opt_lockdebug.h"
    101 #include "opt_multiprocessor.h"
    102 #include "opt_perfctrs.h"
    103 #include "opt_preemption.h"
    104 
    105 #define	__MUTEX_PRIVATE
    106 
    107 #include <sys/param.h>
    108 #include <sys/systm.h>
    109 #include <sys/proc.h>
    110 #include <sys/kernel.h>
    111 #if defined(PERFCTRS)
    112 #include <sys/pmc.h>
    113 #endif
    114 #include <sys/cpu.h>
    115 #include <sys/resourcevar.h>
    116 #include <sys/sched.h>
    117 #include <sys/syscall_stats.h>
    118 #include <sys/sleepq.h>
    119 #include <sys/lockdebug.h>
    120 #include <sys/evcnt.h>
    121 #include <sys/intr.h>
    122 #include <sys/lwpctl.h>
    123 #include <sys/atomic.h>
    124 #include <sys/simplelock.h>
    125 #include <sys/bitops.h>
    126 #include <sys/kmem.h>
    127 #include <sys/sysctl.h>
    128 #include <sys/idle.h>
    129 
    130 #include <uvm/uvm_extern.h>
    131 
    132 #include <dev/lockstat.h>
    133 
    134 /*
     135  * Priority related definitions.
    136  */
    137 #define	PRI_TS_COUNT	(NPRI_USER)
    138 #define	PRI_RT_COUNT	(PRI_COUNT - PRI_TS_COUNT)
    139 #define	PRI_HTS_RANGE	(PRI_TS_COUNT / 10)
    140 
    141 #define	PRI_HIGHEST_TS	(MAXPRI_USER)
    142 
    143 /*
    144  * Bits per map.
    145  */
    146 #define	BITMAP_BITS	(32)
    147 #define	BITMAP_SHIFT	(5)
    148 #define	BITMAP_MSB	(0x80000000U)
    149 #define	BITMAP_MASK	(BITMAP_BITS - 1)
    150 
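/*
 * Worked example (editorial illustration, not part of the original code):
 * with the values above, a priority of 70 lives in r_bitmap word
 * 70 >> BITMAP_SHIFT == 2, at bit BITMAP_MSB >> (70 & BITMAP_MASK) ==
 * 0x80000000 >> 6 == 0x02000000.  Higher priorities thus occupy higher
 * word indices and less significant bits, so the highest runnable
 * priority can be recovered as
 *	(i << BITMAP_SHIFT) + (BITMAP_BITS - ffs(r_bitmap[i]))
 * scanning i downwards; here ffs(0x02000000) == 26 and
 * (2 << 5) + (32 - 26) == 70.  See sched_enqueue() and sched_dequeue().
 */
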
    151 /*
    152  * Structures, runqueue.
    153  */
    154 
    155 typedef struct {
    156 	TAILQ_HEAD(, lwp) q_head;
    157 } queue_t;
    158 
    159 typedef struct {
    160 	/* Lock and bitmap */
    161 	uint32_t	r_bitmap[PRI_COUNT >> BITMAP_SHIFT];
    162 	/* Counters */
    163 	u_int		r_count;	/* Count of the threads */
    164 	u_int		r_avgcount;	/* Average count of threads */
    165 	u_int		r_mcount;	/* Count of migratable threads */
    166 	/* Runqueues */
    167 	queue_t		r_rt_queue[PRI_RT_COUNT];
    168 	queue_t		r_ts_queue[PRI_TS_COUNT];
    169 } runqueue_t;
    170 
    171 static u_int	sched_unsleep(struct lwp *, bool);
    172 static void	sched_changepri(struct lwp *, pri_t);
    173 static void	sched_lendpri(struct lwp *, pri_t);
    174 static void	*sched_getrq(runqueue_t *, const pri_t);
    175 #ifdef MULTIPROCESSOR
    176 static lwp_t	*sched_catchlwp(void);
    177 static void	sched_balance(void *);
    178 #endif
    179 
    180 syncobj_t sleep_syncobj = {
    181 	SOBJ_SLEEPQ_SORTED,
    182 	sleepq_unsleep,
    183 	sleepq_changepri,
    184 	sleepq_lendpri,
    185 	syncobj_noowner,
    186 };
    187 
    188 syncobj_t sched_syncobj = {
    189 	SOBJ_SLEEPQ_SORTED,
    190 	sched_unsleep,
    191 	sched_changepri,
    192 	sched_lendpri,
    193 	syncobj_noowner,
    194 };
    195 
    196 const int 	schedppq = 1;
    197 callout_t 	sched_pstats_ch;
    198 unsigned	sched_pstats_ticks;
    199 kcondvar_t	lbolt;			/* once a second sleep address */
    200 
    201 /*
    202  * Kernel preemption.
    203  */
    204 #ifdef PREEMPTION
    205 #if 0
    206 int		sched_kpreempt_pri = PRI_USER_RT;
    207 #else
    208 /* XXX disable for now until any bugs are worked out. */
    209 int		sched_kpreempt_pri = 1000;
    210 #endif
    211 
    212 static struct evcnt kpreempt_ev_crit;
    213 static struct evcnt kpreempt_ev_klock;
    214 static struct evcnt kpreempt_ev_ipl;
    215 static struct evcnt kpreempt_ev_immed;
    216 #else
    217 int		sched_kpreempt_pri = INT_MAX;
    218 #endif
    219 int		sched_upreempt_pri = PRI_KERNEL;
    220 
    221 /*
    222  * Migration and balancing.
    223  */
    224 static u_int	cacheht_time;		/* Cache hotness time */
    225 static u_int	min_catch;		/* Minimal LWP count for catching */
    226 static u_int	balance_period;		/* Balance period */
    227 static struct cpu_info *worker_ci;	/* Victim CPU */
    228 #ifdef MULTIPROCESSOR
    229 static struct callout balance_ch;	/* Callout of balancer */
    230 #endif
    231 
    232 /*
    233  * During autoconfiguration or after a panic, a sleep will simply lower the
    234  * priority briefly to allow interrupts, then return.  The priority to be
    235  * used (safepri) is machine-dependent, thus this value is initialized and
    236  * maintained in the machine-dependent layers.  This priority will typically
    237  * be 0, or the lowest priority that is safe for use on the interrupt stack;
    238  * it can be made higher to block network software interrupts after panics.
    239  */
    240 int	safepri;
    241 
    242 /*
    243  * OBSOLETE INTERFACE
    244  *
    245  * General sleep call.  Suspends the current process until a wakeup is
    246  * performed on the specified identifier.  The process will then be made
    247  * runnable with the specified priority.  Sleeps at most timo/hz seconds (0
    248  * means no timeout).  If pri includes PCATCH flag, signals are checked
    249  * before and after sleeping, else signals are not checked.  Returns 0 if
    250  * awakened, EWOULDBLOCK if the timeout expires.  If PCATCH is set and a
    251  * signal needs to be delivered, ERESTART is returned if the current system
    252  * call should be restarted if possible, and EINTR is returned if the system
     253  * call should be interrupted by the signal.
    254  *
    255  * The interlock is held until we are on a sleep queue. The interlock will
    256  * be locked before returning back to the caller unless the PNORELOCK flag
    257  * is specified, in which case the interlock will always be unlocked upon
    258  * return.
    259  */
    260 int
    261 ltsleep(wchan_t ident, pri_t priority, const char *wmesg, int timo,
    262 	volatile struct simplelock *interlock)
    263 {
    264 	struct lwp *l = curlwp;
    265 	sleepq_t *sq;
    266 	int error;
    267 
    268 	KASSERT((l->l_pflag & LP_INTR) == 0);
    269 
    270 	if (sleepq_dontsleep(l)) {
    271 		(void)sleepq_abort(NULL, 0);
    272 		if ((priority & PNORELOCK) != 0)
    273 			simple_unlock(interlock);
    274 		return 0;
    275 	}
    276 
    277 	l->l_kpriority = true;
    278 	sq = sleeptab_lookup(&sleeptab, ident);
    279 	sleepq_enter(sq, l);
    280 	sleepq_enqueue(sq, ident, wmesg, &sleep_syncobj);
    281 
    282 	if (interlock != NULL) {
    283 		KASSERT(simple_lock_held(interlock));
    284 		simple_unlock(interlock);
    285 	}
    286 
    287 	error = sleepq_block(timo, priority & PCATCH);
    288 
    289 	if (interlock != NULL && (priority & PNORELOCK) == 0)
    290 		simple_lock(interlock);
    291 
    292 	return error;
    293 }
    294 
    295 int
    296 mtsleep(wchan_t ident, pri_t priority, const char *wmesg, int timo,
    297 	kmutex_t *mtx)
    298 {
    299 	struct lwp *l = curlwp;
    300 	sleepq_t *sq;
    301 	int error;
    302 
    303 	KASSERT((l->l_pflag & LP_INTR) == 0);
    304 
    305 	if (sleepq_dontsleep(l)) {
    306 		(void)sleepq_abort(mtx, (priority & PNORELOCK) != 0);
    307 		return 0;
    308 	}
    309 
    310 	l->l_kpriority = true;
    311 	sq = sleeptab_lookup(&sleeptab, ident);
    312 	sleepq_enter(sq, l);
    313 	sleepq_enqueue(sq, ident, wmesg, &sleep_syncobj);
    314 	mutex_exit(mtx);
    315 	error = sleepq_block(timo, priority & PCATCH);
    316 
    317 	if ((priority & PNORELOCK) == 0)
    318 		mutex_enter(mtx);
    319 
    320 	return error;
    321 }
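
/*
 * Illustrative sketch (not part of the original source): a typical
 * mtsleep() caller re-checks its condition in a loop while holding the
 * kmutex that protects it; the "example_softc" structure and its members
 * below are hypothetical.  The waker would set sc_ready and call
 * wakeup(&sc->sc_ready) under the same lock.
 */
#if 0
static int
example_wait_ready(struct example_softc *sc)
{
	int error = 0;

	mutex_enter(&sc->sc_lock);
	while (!sc->sc_ready) {
		/* Drops sc_lock while asleep, reacquires it on wakeup. */
		error = mtsleep(&sc->sc_ready, PCATCH, "exwait", hz,
		    &sc->sc_lock);
		if (error != 0)
			break;	/* EWOULDBLOCK, EINTR or ERESTART */
	}
	mutex_exit(&sc->sc_lock);
	return error;
}
#endif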
    322 
    323 /*
    324  * General sleep call for situations where a wake-up is not expected.
    325  */
    326 int
    327 kpause(const char *wmesg, bool intr, int timo, kmutex_t *mtx)
    328 {
    329 	struct lwp *l = curlwp;
    330 	sleepq_t *sq;
    331 	int error;
    332 
    333 	if (sleepq_dontsleep(l))
    334 		return sleepq_abort(NULL, 0);
    335 
    336 	if (mtx != NULL)
    337 		mutex_exit(mtx);
    338 	l->l_kpriority = true;
    339 	sq = sleeptab_lookup(&sleeptab, l);
    340 	sleepq_enter(sq, l);
    341 	sleepq_enqueue(sq, l, wmesg, &sleep_syncobj);
    342 	error = sleepq_block(timo, intr);
    343 	if (mtx != NULL)
    344 		mutex_enter(mtx);
    345 
    346 	return error;
    347 }
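
/*
 * Illustrative sketch (not part of the original source): kpause() is the
 * simple way to wait for a period of time when no wakeup() is expected,
 * e.g. backing off for roughly 100ms without being interruptible by
 * signals:
 *
 *	(void)kpause("backoff", false, mstohz(100), NULL);
 *
 * The "backoff" wmesg is arbitrary; mstohz() converts milliseconds to
 * clock ticks.
 */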
    348 
    349 /*
    350  * OBSOLETE INTERFACE
    351  *
    352  * Make all processes sleeping on the specified identifier runnable.
    353  */
    354 void
    355 wakeup(wchan_t ident)
    356 {
    357 	sleepq_t *sq;
    358 
    359 	if (cold)
    360 		return;
    361 
    362 	sq = sleeptab_lookup(&sleeptab, ident);
    363 	sleepq_wake(sq, ident, (u_int)-1);
    364 }
    365 
    366 /*
    367  * OBSOLETE INTERFACE
    368  *
    369  * Make the highest priority process first in line on the specified
    370  * identifier runnable.
    371  */
    372 void
    373 wakeup_one(wchan_t ident)
    374 {
    375 	sleepq_t *sq;
    376 
    377 	if (cold)
    378 		return;
    379 
    380 	sq = sleeptab_lookup(&sleeptab, ident);
    381 	sleepq_wake(sq, ident, 1);
    382 }
    383 
    384 
    385 /*
    386  * General yield call.  Puts the current process back on its run queue and
    387  * performs a voluntary context switch.  Should only be called when the
    388  * current process explicitly requests it (eg sched_yield(2)).
    389  */
    390 void
    391 yield(void)
    392 {
    393 	struct lwp *l = curlwp;
    394 
    395 	KERNEL_UNLOCK_ALL(l, &l->l_biglocks);
    396 	lwp_lock(l);
    397 	KASSERT(lwp_locked(l, l->l_cpu->ci_schedstate.spc_lwplock));
    398 	KASSERT(l->l_stat == LSONPROC);
    399 	l->l_kpriority = false;
    400 	(void)mi_switch(l);
    401 	KERNEL_LOCK(l->l_biglocks, l);
    402 }
    403 
    404 /*
    405  * General preemption call.  Puts the current process back on its run queue
    406  * and performs an involuntary context switch.
    407  */
    408 void
    409 preempt(void)
    410 {
    411 	struct lwp *l = curlwp;
    412 
    413 	KERNEL_UNLOCK_ALL(l, &l->l_biglocks);
    414 	lwp_lock(l);
    415 	KASSERT(lwp_locked(l, l->l_cpu->ci_schedstate.spc_lwplock));
    416 	KASSERT(l->l_stat == LSONPROC);
    417 	l->l_kpriority = false;
    418 	l->l_nivcsw++;
    419 	(void)mi_switch(l);
    420 	KERNEL_LOCK(l->l_biglocks, l);
    421 }
    422 
    423 #ifdef PREEMPTION
    424 /* XXX Yuck, for lockstat. */
    425 static char	in_critical_section;
    426 static char	kernel_lock_held;
    427 static char	spl_raised;
    428 static char	is_softint;
    429 
    430 /*
    431  * Handle a request made by another agent to preempt the current LWP
    432  * in-kernel.  Usually called when l_dopreempt may be non-zero.
    433  */
    434 bool
    435 kpreempt(uintptr_t where)
    436 {
    437 	uintptr_t failed;
    438 	lwp_t *l;
    439 	int s, dop;
    440 
    441 	l = curlwp;
    442 	failed = 0;
    443 	while ((dop = l->l_dopreempt) != 0) {
    444 		if (l->l_stat != LSONPROC) {
    445 			/*
    446 			 * About to block (or die), let it happen.
    447 			 * Doesn't really count as "preemption has
    448 			 * been blocked", since we're going to
    449 			 * context switch.
    450 			 */
    451 			l->l_dopreempt = 0;
    452 			return true;
    453 		}
    454 		if (__predict_false((l->l_flag & LW_IDLE) != 0)) {
    455 			/* Can't preempt idle loop, don't count as failure. */
    456 		    	l->l_dopreempt = 0;
    457 		    	return true;
    458 		}
    459 		if (__predict_false(l->l_nopreempt != 0)) {
    460 			/* LWP holds preemption disabled, explicitly. */
    461 			if ((dop & DOPREEMPT_COUNTED) == 0) {
    462 				atomic_inc_64(&kpreempt_ev_crit.ev_count);
    463 			}
    464 			failed = (uintptr_t)&in_critical_section;
    465 			break;
    466 		}
    467 		if (__predict_false((l->l_pflag & LP_INTR) != 0)) {
    468 		    	/* Can't preempt soft interrupts yet. */
    469 		    	l->l_dopreempt = 0;
    470 		    	failed = (uintptr_t)&is_softint;
    471 		    	break;
    472 		}
    473 		s = splsched();
    474 		if (__predict_false(l->l_blcnt != 0 ||
    475 		    curcpu()->ci_biglock_wanted != NULL)) {
    476 			/* Hold or want kernel_lock, code is not MT safe. */
    477 			splx(s);
    478 			if ((dop & DOPREEMPT_COUNTED) == 0) {
    479 				atomic_inc_64(&kpreempt_ev_klock.ev_count);
    480 			}
    481 			failed = (uintptr_t)&kernel_lock_held;
    482 			break;
    483 		}
    484 		if (__predict_false(!cpu_kpreempt_enter(where, s))) {
    485 			/*
    486 			 * It may be that the IPL is too high.
     487 			 * cpu_kpreempt_enter() can schedule an
    488 			 * interrupt to retry later.
    489 			 */
    490 			splx(s);
    491 			if ((dop & DOPREEMPT_COUNTED) == 0) {
    492 				atomic_inc_64(&kpreempt_ev_ipl.ev_count);
    493 			}
    494 			failed = (uintptr_t)&spl_raised;
    495 			break;
    496 		}
    497 		/* Do it! */
    498 		if (__predict_true((dop & DOPREEMPT_COUNTED) == 0)) {
    499 			atomic_inc_64(&kpreempt_ev_immed.ev_count);
    500 		}
    501 		lwp_lock(l);
    502 		mi_switch(l);
    503 		l->l_nopreempt++;
    504 		splx(s);
    505 
    506 		/* Take care of any MD cleanup. */
    507 		cpu_kpreempt_exit(where);
    508 		l->l_nopreempt--;
    509 	}
    510 
    511 	/* Record preemption failure for reporting via lockstat. */
    512 	if (__predict_false(failed)) {
    513 		atomic_or_uint(&l->l_dopreempt, DOPREEMPT_COUNTED);
    514 		int lsflag = 0;
    515 		LOCKSTAT_ENTER(lsflag);
    516 		/* Might recurse, make it atomic. */
    517 		if (__predict_false(lsflag)) {
    518 			if (where == 0) {
    519 				where = (uintptr_t)__builtin_return_address(0);
    520 			}
    521 			if (atomic_cas_ptr_ni((void *)&l->l_pfailaddr,
    522 			    NULL, (void *)where) == NULL) {
    523 				LOCKSTAT_START_TIMER(lsflag, l->l_pfailtime);
    524 				l->l_pfaillock = failed;
    525 			}
    526 		}
    527 		LOCKSTAT_EXIT(lsflag);
    528 	}
    529 
    530 	return failed;
    531 }
    532 
    533 /*
    534  * Return true if preemption is explicitly disabled.
    535  */
    536 bool
    537 kpreempt_disabled(void)
    538 {
    539 	lwp_t *l;
    540 
    541 	l = curlwp;
    542 
    543 	return l->l_nopreempt != 0 || l->l_stat == LSZOMB ||
    544 	    (l->l_flag & LW_IDLE) != 0 || cpu_kpreempt_disabled();
    545 }
    546 #else
    547 bool
    548 kpreempt(uintptr_t where)
    549 {
    550 
    551 	panic("kpreempt");
    552 	return true;
    553 }
    554 
    555 bool
    556 kpreempt_disabled(void)
    557 {
    558 
    559 	return true;
    560 }
    561 #endif
    562 
    563 /*
    564  * Disable kernel preemption.
    565  */
    566 void
    567 kpreempt_disable(void)
    568 {
    569 
    570 	KPREEMPT_DISABLE(curlwp);
    571 }
    572 
    573 /*
    574  * Reenable kernel preemption.
    575  */
    576 void
    577 kpreempt_enable(void)
    578 {
    579 
    580 	KPREEMPT_ENABLE(curlwp);
    581 }
    582 
    583 /*
    584  * Compute the amount of time during which the current lwp was running.
    585  *
    586  * - update l_rtime unless it's an idle lwp.
    587  */
    588 
    589 void
    590 updatertime(lwp_t *l, const struct bintime *now)
    591 {
    592 
    593 	if ((l->l_flag & LW_IDLE) != 0)
    594 		return;
    595 
    596 	/* rtime += now - stime */
    597 	bintime_add(&l->l_rtime, now);
    598 	bintime_sub(&l->l_rtime, &l->l_stime);
    599 }
    600 
    601 /*
    602  * The machine independent parts of context switch.
    603  *
    604  * Returns 1 if another LWP was actually run.
    605  */
    606 int
    607 mi_switch(lwp_t *l)
    608 {
    609 	struct cpu_info *ci, *tci = NULL;
    610 	struct schedstate_percpu *spc;
    611 	struct lwp *newl;
    612 	int retval, oldspl;
    613 	struct bintime bt;
    614 	bool returning;
    615 
    616 	KASSERT(lwp_locked(l, NULL));
    617 	KASSERT(kpreempt_disabled());
    618 	LOCKDEBUG_BARRIER(l->l_mutex, 1);
    619 
    620 #ifdef KSTACK_CHECK_MAGIC
    621 	kstack_check_magic(l);
    622 #endif
    623 
    624 	binuptime(&bt);
    625 
    626 	KASSERT(l->l_cpu == curcpu());
    627 	ci = l->l_cpu;
    628 	spc = &ci->ci_schedstate;
    629 	returning = false;
    630 	newl = NULL;
    631 
    632 	/*
    633 	 * If we have been asked to switch to a specific LWP, then there
    634 	 * is no need to inspect the run queues.  If a soft interrupt is
    635 	 * blocking, then return to the interrupted thread without adjusting
     636 	 * VM context or its start time: neither has been changed in order
    637 	 * to take the interrupt.
    638 	 */
    639 	if (l->l_switchto != NULL) {
    640 		if ((l->l_pflag & LP_INTR) != 0) {
    641 			returning = true;
    642 			softint_block(l);
    643 			if ((l->l_flag & LW_TIMEINTR) != 0)
    644 				updatertime(l, &bt);
    645 		}
    646 		newl = l->l_switchto;
    647 		l->l_switchto = NULL;
    648 	}
    649 #ifndef __HAVE_FAST_SOFTINTS
    650 	else if (ci->ci_data.cpu_softints != 0) {
    651 		/* There are pending soft interrupts, so pick one. */
    652 		newl = softint_picklwp();
    653 		newl->l_stat = LSONPROC;
    654 		newl->l_flag |= LW_RUNNING;
    655 	}
    656 #endif	/* !__HAVE_FAST_SOFTINTS */
    657 
    658 	/* Count time spent in current system call */
    659 	if (!returning) {
    660 		SYSCALL_TIME_SLEEP(l);
    661 
    662 		/*
    663 		 * XXXSMP If we are using h/w performance counters,
    664 		 * save context.
    665 		 */
    666 #if PERFCTRS
    667 		if (PMC_ENABLED(l->l_proc)) {
    668 			pmc_save_context(l->l_proc);
    669 		}
    670 #endif
    671 		updatertime(l, &bt);
    672 	}
    673 
    674 	/*
    675 	 * If on the CPU and we have gotten this far, then we must yield.
    676 	 */
    677 	KASSERT(l->l_stat != LSRUN);
    678 	if (l->l_stat == LSONPROC && (l->l_target_cpu || l != newl)) {
    679 		KASSERT(lwp_locked(l, spc->spc_lwplock));
    680 
    681 		if (l->l_target_cpu == l->l_cpu) {
    682 			l->l_target_cpu = NULL;
    683 		} else {
    684 			tci = l->l_target_cpu;
    685 		}
    686 
    687 		if (__predict_false(tci != NULL)) {
    688 			/* Double-lock the runqueues */
    689 			spc_dlock(ci, tci);
    690 		} else {
    691 			/* Lock the runqueue */
    692 			spc_lock(ci);
    693 		}
    694 
    695 		if ((l->l_flag & LW_IDLE) == 0) {
    696 			l->l_stat = LSRUN;
    697 			if (__predict_false(tci != NULL)) {
    698 				/*
    699 				 * Set the new CPU, lock and unset the
    700 				 * l_target_cpu - thread will be enqueued
    701 				 * to the runqueue of target CPU.
    702 				 */
    703 				l->l_cpu = tci;
    704 				lwp_setlock(l, tci->ci_schedstate.spc_mutex);
    705 				l->l_target_cpu = NULL;
    706 			} else {
    707 				lwp_setlock(l, spc->spc_mutex);
    708 			}
    709 			sched_enqueue(l, true);
    710 		} else {
    711 			KASSERT(tci == NULL);
    712 			l->l_stat = LSIDL;
    713 		}
    714 	} else {
    715 		/* Lock the runqueue */
    716 		spc_lock(ci);
    717 	}
    718 
    719 	/*
     720 	 * Let sched_nextlwp() select the LWP to run on the CPU next.
    721 	 * If no LWP is runnable, select the idle LWP.
    722 	 *
     723 	 * Note that spc_lwplock might not necessarily be held, and
     724 	 * the new thread will be unlocked after its LWP lock is set.
    725 	 */
    726 	if (newl == NULL) {
    727 		newl = sched_nextlwp();
    728 		if (newl != NULL) {
    729 			sched_dequeue(newl);
    730 			KASSERT(lwp_locked(newl, spc->spc_mutex));
    731 			newl->l_stat = LSONPROC;
    732 			newl->l_cpu = ci;
    733 			newl->l_flag |= LW_RUNNING;
    734 			lwp_setlock(newl, spc->spc_lwplock);
    735 		} else {
    736 			newl = ci->ci_data.cpu_idlelwp;
    737 			newl->l_stat = LSONPROC;
    738 			newl->l_flag |= LW_RUNNING;
    739 		}
    740 		/*
    741 		 * Only clear want_resched if there are no
    742 		 * pending (slow) software interrupts.
    743 		 */
    744 		ci->ci_want_resched = ci->ci_data.cpu_softints;
    745 		spc->spc_flags &= ~SPCF_SWITCHCLEAR;
    746 		spc->spc_curpriority = lwp_eprio(newl);
    747 	}
    748 
    749 	/* Items that must be updated with the CPU locked. */
    750 	if (!returning) {
    751 		/* Update the new LWP's start time. */
    752 		newl->l_stime = bt;
    753 
    754 		/*
    755 		 * ci_curlwp changes when a fast soft interrupt occurs.
    756 		 * We use cpu_onproc to keep track of which kernel or
    757 		 * user thread is running 'underneath' the software
    758 		 * interrupt.  This is important for time accounting,
    759 		 * itimers and forcing user threads to preempt (aston).
    760 		 */
    761 		ci->ci_data.cpu_onproc = newl;
    762 	}
    763 
    764 	/* Kernel preemption related tasks. */
    765 	l->l_dopreempt = 0;
    766 	if (__predict_false(l->l_pfailaddr != 0)) {
    767 		LOCKSTAT_FLAG(lsflag);
    768 		LOCKSTAT_ENTER(lsflag);
    769 		LOCKSTAT_STOP_TIMER(lsflag, l->l_pfailtime);
    770 		LOCKSTAT_EVENT_RA(lsflag, l->l_pfaillock, LB_NOPREEMPT|LB_SPIN,
    771 		    1, l->l_pfailtime, l->l_pfailaddr);
    772 		LOCKSTAT_EXIT(lsflag);
    773 		l->l_pfailtime = 0;
    774 		l->l_pfaillock = 0;
    775 		l->l_pfailaddr = 0;
    776 	}
    777 
    778 	if (l != newl) {
    779 		struct lwp *prevlwp;
    780 
    781 		/* Release all locks, but leave the current LWP locked */
    782 		if (l->l_mutex == l->l_cpu->ci_schedstate.spc_mutex) {
    783 			/*
    784 			 * In case of migration, drop the local runqueue
     785 			 * lock; the thread is on another runqueue now.
    786 			 */
    787 			if (__predict_false(tci != NULL))
    788 				spc_unlock(ci);
    789 			/*
    790 			 * Drop spc_lwplock, if the current LWP has been moved
    791 			 * to the run queue (it is now locked by spc_mutex).
    792 			 */
    793 			mutex_spin_exit(spc->spc_lwplock);
    794 		} else {
    795 			/*
    796 			 * Otherwise, drop the spc_mutex, we are done with the
    797 			 * run queues.
    798 			 */
    799 			mutex_spin_exit(spc->spc_mutex);
    800 			KASSERT(tci == NULL);
    801 		}
    802 
    803 		/*
     804 		 * Mark that a context switch is going to be performed
     805 		 * for this LWP, to protect it from being switched to
     806 		 * on another CPU.
    807 		 */
    808 		KASSERT(l->l_ctxswtch == 0);
    809 		l->l_ctxswtch = 1;
    810 		l->l_ncsw++;
    811 		l->l_flag &= ~LW_RUNNING;
    812 
    813 		/*
    814 		 * Increase the count of spin-mutexes before the release
    815 		 * of the last lock - we must remain at IPL_SCHED during
    816 		 * the context switch.
    817 		 */
    818 		oldspl = MUTEX_SPIN_OLDSPL(ci);
    819 		ci->ci_mtx_count--;
    820 		lwp_unlock(l);
    821 
    822 		/* Count the context switch on this CPU. */
    823 		ci->ci_data.cpu_nswtch++;
    824 
    825 		/* Update status for lwpctl, if present. */
    826 		if (l->l_lwpctl != NULL)
    827 			l->l_lwpctl->lc_curcpu = LWPCTL_CPU_NONE;
    828 
    829 		/*
    830 		 * Save old VM context, unless a soft interrupt
    831 		 * handler is blocking.
    832 		 */
    833 		if (!returning)
    834 			pmap_deactivate(l);
    835 
    836 		/*
     837 		 * We may need to spin-wait if 'newl' is still
    838 		 * context switching on another CPU.
    839 		 */
    840 		if (newl->l_ctxswtch != 0) {
    841 			u_int count;
    842 			count = SPINLOCK_BACKOFF_MIN;
    843 			while (newl->l_ctxswtch)
    844 				SPINLOCK_BACKOFF(count);
    845 		}
    846 
    847 		/* Switch to the new LWP.. */
    848 		prevlwp = cpu_switchto(l, newl, returning);
    849 		ci = curcpu();
    850 
    851 		/*
    852 		 * Switched away - we have new curlwp.
    853 		 * Restore VM context and IPL.
    854 		 */
    855 		pmap_activate(l);
    856 		if (prevlwp != NULL) {
    857 			/* Normalize the count of the spin-mutexes */
    858 			ci->ci_mtx_count++;
    859 			/* Unmark the state of context switch */
    860 			membar_exit();
    861 			prevlwp->l_ctxswtch = 0;
    862 		}
    863 
    864 		/* Update status for lwpctl, if present. */
    865 		if (l->l_lwpctl != NULL) {
    866 			l->l_lwpctl->lc_curcpu = (int)cpu_index(ci);
    867 			l->l_lwpctl->lc_pctr++;
    868 		}
    869 
    870 		KASSERT(l->l_cpu == ci);
    871 		splx(oldspl);
    872 		retval = 1;
    873 	} else {
    874 		/* Nothing to do - just unlock and return. */
    875 		KASSERT(tci == NULL);
    876 		spc_unlock(ci);
    877 		lwp_unlock(l);
    878 		retval = 0;
    879 	}
    880 
    881 	KASSERT(l == curlwp);
    882 	KASSERT(l->l_stat == LSONPROC);
    883 
    884 	/*
    885 	 * XXXSMP If we are using h/w performance counters, restore context.
    886 	 * XXXSMP preemption problem.
    887 	 */
    888 #if PERFCTRS
    889 	if (PMC_ENABLED(l->l_proc)) {
    890 		pmc_restore_context(l->l_proc);
    891 	}
    892 #endif
    893 	SYSCALL_TIME_WAKEUP(l);
    894 	LOCKDEBUG_BARRIER(NULL, 1);
    895 
    896 	return retval;
    897 }
    898 
    899 /*
    900  * Change process state to be runnable, placing it on the run queue if it is
    901  * in memory, and awakening the swapper if it isn't in memory.
    902  *
    903  * Call with the process and LWP locked.  Will return with the LWP unlocked.
    904  */
    905 void
    906 setrunnable(struct lwp *l)
    907 {
    908 	struct proc *p = l->l_proc;
    909 	struct cpu_info *ci;
    910 	sigset_t *ss;
    911 
    912 	KASSERT((l->l_flag & LW_IDLE) == 0);
    913 	KASSERT(mutex_owned(p->p_lock));
    914 	KASSERT(lwp_locked(l, NULL));
    915 	KASSERT(l->l_mutex != l->l_cpu->ci_schedstate.spc_mutex);
    916 
    917 	switch (l->l_stat) {
    918 	case LSSTOP:
    919 		/*
    920 		 * If we're being traced (possibly because someone attached us
    921 		 * while we were stopped), check for a signal from the debugger.
    922 		 */
    923 		if ((p->p_slflag & PSL_TRACED) != 0 && p->p_xstat != 0) {
    924 			if ((sigprop[p->p_xstat] & SA_TOLWP) != 0)
    925 				ss = &l->l_sigpend.sp_set;
    926 			else
    927 				ss = &p->p_sigpend.sp_set;
    928 			sigaddset(ss, p->p_xstat);
    929 			signotify(l);
    930 		}
    931 		p->p_nrlwps++;
    932 		break;
    933 	case LSSUSPENDED:
    934 		l->l_flag &= ~LW_WSUSPEND;
    935 		p->p_nrlwps++;
    936 		cv_broadcast(&p->p_lwpcv);
    937 		break;
    938 	case LSSLEEP:
    939 		KASSERT(l->l_wchan != NULL);
    940 		break;
    941 	default:
    942 		panic("setrunnable: lwp %p state was %d", l, l->l_stat);
    943 	}
    944 
    945 	/*
     946 	 * If the LWP was sleeping interruptibly, then it's OK to start it
    947 	 * again.  If not, mark it as still sleeping.
    948 	 */
    949 	if (l->l_wchan != NULL) {
    950 		l->l_stat = LSSLEEP;
    951 		/* lwp_unsleep() will release the lock. */
    952 		lwp_unsleep(l, true);
    953 		return;
    954 	}
    955 
    956 	/*
    957 	 * If the LWP is still on the CPU, mark it as LSONPROC.  It may be
    958 	 * about to call mi_switch(), in which case it will yield.
    959 	 */
    960 	if ((l->l_flag & LW_RUNNING) != 0) {
    961 		l->l_stat = LSONPROC;
    962 		l->l_slptime = 0;
    963 		lwp_unlock(l);
    964 		return;
    965 	}
    966 
    967 	/*
    968 	 * Look for a CPU to run.
    969 	 * Set the LWP runnable.
    970 	 */
    971 	ci = sched_takecpu(l);
    972 	l->l_cpu = ci;
    973 	if (l->l_mutex != l->l_cpu->ci_schedstate.spc_mutex) {
    974 		lwp_unlock_to(l, ci->ci_schedstate.spc_mutex);
    975 		lwp_lock(l);
    976 	}
    977 	sched_setrunnable(l);
    978 	l->l_stat = LSRUN;
    979 	l->l_slptime = 0;
    980 
    981 	/*
    982 	 * If thread is swapped out - wake the swapper to bring it back in.
    983 	 * Otherwise, enter it into a run queue.
    984 	 */
    985 	if (l->l_flag & LW_INMEM) {
    986 		sched_enqueue(l, false);
    987 		resched_cpu(l);
    988 		lwp_unlock(l);
    989 	} else {
    990 		lwp_unlock(l);
    991 		uvm_kick_scheduler();
    992 	}
    993 }
    994 
    995 /*
    996  * suspendsched:
    997  *
    998  *	Convert all non-L_SYSTEM LSSLEEP or LSRUN LWPs to LSSUSPENDED.
    999  */
   1000 void
   1001 suspendsched(void)
   1002 {
   1003 	CPU_INFO_ITERATOR cii;
   1004 	struct cpu_info *ci;
   1005 	struct lwp *l;
   1006 	struct proc *p;
   1007 
   1008 	/*
   1009 	 * We do this by process in order not to violate the locking rules.
   1010 	 */
   1011 	mutex_enter(proc_lock);
   1012 	PROCLIST_FOREACH(p, &allproc) {
   1013 		mutex_enter(p->p_lock);
   1014 
   1015 		if ((p->p_flag & PK_SYSTEM) != 0) {
   1016 			mutex_exit(p->p_lock);
   1017 			continue;
   1018 		}
   1019 
   1020 		p->p_stat = SSTOP;
   1021 
   1022 		LIST_FOREACH(l, &p->p_lwps, l_sibling) {
   1023 			if (l == curlwp)
   1024 				continue;
   1025 
   1026 			lwp_lock(l);
   1027 
   1028 			/*
    1029 			 * Set LW_WREBOOT so that the LWP will suspend itself
    1030 			 * when it tries to return to user mode.  We want to
    1031 			 * try to get as many LWPs as possible to
   1032 			 * the user / kernel boundary, so that they will
   1033 			 * release any locks that they hold.
   1034 			 */
   1035 			l->l_flag |= (LW_WREBOOT | LW_WSUSPEND);
   1036 
   1037 			if (l->l_stat == LSSLEEP &&
   1038 			    (l->l_flag & LW_SINTR) != 0) {
   1039 				/* setrunnable() will release the lock. */
   1040 				setrunnable(l);
   1041 				continue;
   1042 			}
   1043 
   1044 			lwp_unlock(l);
   1045 		}
   1046 
   1047 		mutex_exit(p->p_lock);
   1048 	}
   1049 	mutex_exit(proc_lock);
   1050 
   1051 	/*
   1052 	 * Kick all CPUs to make them preempt any LWPs running in user mode.
   1053 	 * They'll trap into the kernel and suspend themselves in userret().
   1054 	 */
   1055 	for (CPU_INFO_FOREACH(cii, ci)) {
   1056 		spc_lock(ci);
   1057 		cpu_need_resched(ci, RESCHED_IMMED);
   1058 		spc_unlock(ci);
   1059 	}
   1060 }
   1061 
   1062 /*
   1063  * sched_unsleep:
   1064  *
    1065  *	This is called when the LWP has not been awoken normally but instead
   1066  *	interrupted: for example, if the sleep timed out.  Because of this,
   1067  *	it's not a valid action for running or idle LWPs.
   1068  */
   1069 static u_int
   1070 sched_unsleep(struct lwp *l, bool cleanup)
   1071 {
   1072 
   1073 	lwp_unlock(l);
   1074 	panic("sched_unsleep");
   1075 }
   1076 
   1077 void
   1078 resched_cpu(struct lwp *l)
   1079 {
   1080 	struct cpu_info *ci;
   1081 
   1082 	/*
   1083 	 * XXXSMP
   1084 	 * Since l->l_cpu persists across a context switch,
   1085 	 * this gives us *very weak* processor affinity, in
   1086 	 * that we notify the CPU on which the process last
   1087 	 * ran that it should try to switch.
   1088 	 *
   1089 	 * This does not guarantee that the process will run on
   1090 	 * that processor next, because another processor might
   1091 	 * grab it the next time it performs a context switch.
   1092 	 *
   1093 	 * This also does not handle the case where its last
   1094 	 * CPU is running a higher-priority process, but every
   1095 	 * other CPU is running a lower-priority process.  There
   1096 	 * are ways to handle this situation, but they're not
   1097 	 * currently very pretty, and we also need to weigh the
   1098 	 * cost of moving a process from one CPU to another.
   1099 	 */
   1100 	ci = l->l_cpu;
   1101 	if (lwp_eprio(l) > ci->ci_schedstate.spc_curpriority)
   1102 		cpu_need_resched(ci, 0);
   1103 }
   1104 
   1105 static void
   1106 sched_changepri(struct lwp *l, pri_t pri)
   1107 {
   1108 
   1109 	KASSERT(lwp_locked(l, NULL));
   1110 
   1111 	if (l->l_stat == LSRUN && (l->l_flag & LW_INMEM) != 0) {
   1112 		KASSERT(lwp_locked(l, l->l_cpu->ci_schedstate.spc_mutex));
   1113 		sched_dequeue(l);
   1114 		l->l_priority = pri;
   1115 		sched_enqueue(l, false);
   1116 	} else {
   1117 		l->l_priority = pri;
   1118 	}
   1119 	resched_cpu(l);
   1120 }
   1121 
   1122 static void
   1123 sched_lendpri(struct lwp *l, pri_t pri)
   1124 {
   1125 
   1126 	KASSERT(lwp_locked(l, NULL));
   1127 
   1128 	if (l->l_stat == LSRUN && (l->l_flag & LW_INMEM) != 0) {
   1129 		KASSERT(lwp_locked(l, l->l_cpu->ci_schedstate.spc_mutex));
   1130 		sched_dequeue(l);
   1131 		l->l_inheritedprio = pri;
   1132 		sched_enqueue(l, false);
   1133 	} else {
   1134 		l->l_inheritedprio = pri;
   1135 	}
   1136 	resched_cpu(l);
   1137 }
   1138 
   1139 struct lwp *
   1140 syncobj_noowner(wchan_t wchan)
   1141 {
   1142 
   1143 	return NULL;
   1144 }
   1145 
   1146 /* decay 95% of `p_pctcpu' in 60 seconds; see CCPU_SHIFT before changing */
   1147 fixpt_t	ccpu = 0.95122942450071400909 * FSCALE;		/* exp(-1/20) */
   1148 
   1149 /*
   1150  * If `ccpu' is not equal to `exp(-1/20)' and you still want to use the
   1151  * faster/more-accurate formula, you'll have to estimate CCPU_SHIFT below
   1152  * and possibly adjust FSHIFT in "param.h" so that (FSHIFT >= CCPU_SHIFT).
   1153  *
   1154  * To estimate CCPU_SHIFT for exp(-1/20), the following formula was used:
   1155  *	1 - exp(-1/20) ~= 0.0487 ~= 0.0488 == 1 (fixed pt, *11* bits).
   1156  *
    1157  * If you don't want to bother with the faster/more-accurate formula, you
   1158  * can set CCPU_SHIFT to (FSHIFT + 1) which will use a slower/less-accurate
   1159  * (more general) method of calculating the %age of CPU used by a process.
   1160  */
   1161 #define	CCPU_SHIFT	(FSHIFT + 1)
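
/*
 * Worked example (editorial illustration): sched_pstats() below runs once
 * per second and applies
 *	l_pctcpu = (l_pctcpu * ccpu) >> FSHIFT;
 * i.e. multiplies by exp(-1/20) in fixed point.  After 60 such updates a
 * value has decayed by a factor of exp(-60/20) = exp(-3) ~= 0.0498, which
 * is where the "decay 95% in 60 seconds" figure above comes from.
 */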
   1162 
   1163 /*
   1164  * sched_pstats:
   1165  *
   1166  * Update process statistics and check CPU resource allocation.
   1167  * Call scheduler-specific hook to eventually adjust process/LWP
   1168  * priorities.
   1169  */
   1170 /* ARGSUSED */
   1171 void
   1172 sched_pstats(void *arg)
   1173 {
   1174 	struct rlimit *rlim;
   1175 	struct lwp *l;
   1176 	struct proc *p;
   1177 	int sig, clkhz;
   1178 	long runtm;
   1179 
   1180 	sched_pstats_ticks++;
   1181 
   1182 	mutex_enter(proc_lock);
   1183 	PROCLIST_FOREACH(p, &allproc) {
   1184 		/*
   1185 		 * Increment time in/out of memory and sleep time (if
   1186 		 * sleeping).  We ignore overflow; with 16-bit int's
   1187 		 * (remember them?) overflow takes 45 days.
   1188 		 */
   1189 		mutex_enter(p->p_lock);
   1190 		mutex_spin_enter(&p->p_stmutex);
   1191 		runtm = p->p_rtime.sec;
   1192 		LIST_FOREACH(l, &p->p_lwps, l_sibling) {
   1193 			if ((l->l_flag & LW_IDLE) != 0)
   1194 				continue;
   1195 			lwp_lock(l);
   1196 			runtm += l->l_rtime.sec;
   1197 			l->l_swtime++;
   1198 			sched_pstats_hook(l);
   1199 			lwp_unlock(l);
   1200 
   1201 			/*
   1202 			 * p_pctcpu is only for ps.
   1203 			 */
   1204 			l->l_pctcpu = (l->l_pctcpu * ccpu) >> FSHIFT;
   1205 			if (l->l_slptime < 1) {
   1206 				clkhz = stathz != 0 ? stathz : hz;
   1207 #if	(FSHIFT >= CCPU_SHIFT)
   1208 				l->l_pctcpu += (clkhz == 100) ?
   1209 				    ((fixpt_t)l->l_cpticks) <<
   1210 				        (FSHIFT - CCPU_SHIFT) :
   1211 				    100 * (((fixpt_t) p->p_cpticks)
   1212 				        << (FSHIFT - CCPU_SHIFT)) / clkhz;
   1213 #else
   1214 				l->l_pctcpu += ((FSCALE - ccpu) *
   1215 				    (l->l_cpticks * FSCALE / clkhz)) >> FSHIFT;
   1216 #endif
   1217 				l->l_cpticks = 0;
   1218 			}
   1219 		}
   1220 		p->p_pctcpu = (p->p_pctcpu * ccpu) >> FSHIFT;
   1221 		mutex_spin_exit(&p->p_stmutex);
   1222 
   1223 		/*
   1224 		 * Check if the process exceeds its CPU resource allocation.
   1225 		 * If over max, kill it.
   1226 		 */
   1227 		rlim = &p->p_rlimit[RLIMIT_CPU];
   1228 		sig = 0;
   1229 		if (runtm >= rlim->rlim_cur) {
   1230 			if (runtm >= rlim->rlim_max)
   1231 				sig = SIGKILL;
   1232 			else {
   1233 				sig = SIGXCPU;
   1234 				if (rlim->rlim_cur < rlim->rlim_max)
   1235 					rlim->rlim_cur += 5;
   1236 			}
   1237 		}
   1238 		mutex_exit(p->p_lock);
   1239 		if (sig)
   1240 			psignal(p, sig);
   1241 	}
   1242 	mutex_exit(proc_lock);
   1243 	uvm_meter();
   1244 	cv_wakeup(&lbolt);
   1245 	callout_schedule(&sched_pstats_ch, hz);
   1246 }
   1247 
   1248 void
   1249 sched_init(void)
   1250 {
   1251 
   1252 	cv_init(&lbolt, "lbolt");
   1253 	callout_init(&sched_pstats_ch, CALLOUT_MPSAFE);
   1254 	callout_setfunc(&sched_pstats_ch, sched_pstats, NULL);
   1255 
   1256 	/* Balancing */
   1257 	worker_ci = curcpu();
   1258 	cacheht_time = mstohz(5);		/* ~5 ms  */
   1259 	balance_period = mstohz(300);		/* ~300ms */
   1260 
    1261 	/* Minimal count of LWPs for catching: log2(count of CPUs), at most 4 */
   1262 	min_catch = min(ilog2(ncpu), 4);
   1263 
   1264 #ifdef PREEMPTION
   1265 	evcnt_attach_dynamic(&kpreempt_ev_crit, EVCNT_TYPE_INTR, NULL,
   1266 	   "kpreempt", "defer: critical section");
   1267 	evcnt_attach_dynamic(&kpreempt_ev_klock, EVCNT_TYPE_INTR, NULL,
   1268 	   "kpreempt", "defer: kernel_lock");
   1269 	evcnt_attach_dynamic(&kpreempt_ev_ipl, EVCNT_TYPE_INTR, NULL,
   1270 	   "kpreempt", "defer: IPL");
   1271 	evcnt_attach_dynamic(&kpreempt_ev_immed, EVCNT_TYPE_INTR, NULL,
   1272 	   "kpreempt", "immediate");
   1273 #endif
   1274 
   1275 	/* Initialize balancing callout and run it */
   1276 #ifdef MULTIPROCESSOR
   1277 	callout_init(&balance_ch, CALLOUT_MPSAFE);
   1278 	callout_setfunc(&balance_ch, sched_balance, NULL);
   1279 	callout_schedule(&balance_ch, balance_period);
   1280 #endif
   1281 	sched_pstats(NULL);
   1282 }
   1283 
   1284 SYSCTL_SETUP(sysctl_sched_setup, "sysctl sched setup")
   1285 {
   1286 	const struct sysctlnode *node = NULL;
   1287 
   1288 	sysctl_createv(clog, 0, NULL, NULL,
   1289 		CTLFLAG_PERMANENT,
   1290 		CTLTYPE_NODE, "kern", NULL,
   1291 		NULL, 0, NULL, 0,
   1292 		CTL_KERN, CTL_EOL);
   1293 	sysctl_createv(clog, 0, NULL, &node,
   1294 		CTLFLAG_PERMANENT,
   1295 		CTLTYPE_NODE, "sched",
   1296 		SYSCTL_DESCR("Scheduler options"),
   1297 		NULL, 0, NULL, 0,
   1298 		CTL_KERN, CTL_CREATE, CTL_EOL);
   1299 
   1300 	if (node == NULL)
   1301 		return;
   1302 
   1303 	sysctl_createv(clog, 0, &node, NULL,
   1304 		CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
   1305 		CTLTYPE_INT, "cacheht_time",
   1306 		SYSCTL_DESCR("Cache hotness time (in ticks)"),
   1307 		NULL, 0, &cacheht_time, 0,
   1308 		CTL_CREATE, CTL_EOL);
   1309 	sysctl_createv(clog, 0, &node, NULL,
   1310 		CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
   1311 		CTLTYPE_INT, "balance_period",
   1312 		SYSCTL_DESCR("Balance period (in ticks)"),
   1313 		NULL, 0, &balance_period, 0,
   1314 		CTL_CREATE, CTL_EOL);
   1315 	sysctl_createv(clog, 0, &node, NULL,
   1316 		CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
   1317 		CTLTYPE_INT, "min_catch",
   1318 		SYSCTL_DESCR("Minimal count of threads for catching"),
   1319 		NULL, 0, &min_catch, 0,
   1320 		CTL_CREATE, CTL_EOL);
   1321 	sysctl_createv(clog, 0, &node, NULL,
   1322 		CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
   1323 		CTLTYPE_INT, "timesoftints",
   1324 		SYSCTL_DESCR("Track CPU time for soft interrupts"),
   1325 		NULL, 0, &softint_timing, 0,
   1326 		CTL_CREATE, CTL_EOL);
   1327 	sysctl_createv(clog, 0, &node, NULL,
   1328 #ifdef PREEMPTION
   1329 		CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
   1330 #else
   1331 		CTLFLAG_PERMANENT,
   1332 #endif
   1333 		CTLTYPE_INT, "kpreempt_pri",
   1334 		SYSCTL_DESCR("Minimum priority to trigger kernel preemption"),
   1335 		NULL, 0, &sched_kpreempt_pri, 0,
   1336 		CTL_CREATE, CTL_EOL);
   1337 	sysctl_createv(clog, 0, &node, NULL,
   1338 		CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
   1339 		CTLTYPE_INT, "upreempt_pri",
   1340 		SYSCTL_DESCR("Minimum priority to trigger user preemption"),
   1341 		NULL, 0, &sched_upreempt_pri, 0,
   1342 		CTL_CREATE, CTL_EOL);
   1343 }
   1344 
   1345 void
   1346 sched_cpuattach(struct cpu_info *ci)
   1347 {
   1348 	runqueue_t *ci_rq;
   1349 	void *rq_ptr;
   1350 	u_int i, size;
   1351 
   1352 	if (ci->ci_schedstate.spc_lwplock == NULL) {
   1353 		ci->ci_schedstate.spc_lwplock =
   1354 		    mutex_obj_alloc(MUTEX_DEFAULT, IPL_SCHED);
   1355 	}
   1356 	if (ci == lwp0.l_cpu) {
   1357 		/* Initialize the scheduler structure of the primary LWP */
   1358 		lwp0.l_mutex = ci->ci_schedstate.spc_lwplock;
   1359 	}
   1360 	if (ci->ci_schedstate.spc_mutex != NULL) {
   1361 		/* Already initialized. */
   1362 		return;
   1363 	}
   1364 
   1365 	/* Allocate the run queue */
   1366 	size = roundup2(sizeof(runqueue_t), coherency_unit) + coherency_unit;
   1367 	rq_ptr = kmem_zalloc(size, KM_SLEEP);
   1368 	if (rq_ptr == NULL) {
   1369 		panic("sched_cpuattach: could not allocate the runqueue");
   1370 	}
   1371 	ci_rq = (void *)(roundup2((uintptr_t)(rq_ptr), coherency_unit));
   1372 
   1373 	/* Initialize run queues */
   1374 	ci->ci_schedstate.spc_mutex =
   1375 	    mutex_obj_alloc(MUTEX_DEFAULT, IPL_SCHED);
   1376 	for (i = 0; i < PRI_RT_COUNT; i++)
   1377 		TAILQ_INIT(&ci_rq->r_rt_queue[i].q_head);
   1378 	for (i = 0; i < PRI_TS_COUNT; i++)
   1379 		TAILQ_INIT(&ci_rq->r_ts_queue[i].q_head);
   1380 
   1381 	ci->ci_schedstate.spc_sched_info = ci_rq;
   1382 }
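
/*
 * Editorial note on the allocation above (illustrative numbers): since
 * kmem_zalloc() does not guarantee cache-line alignment, one extra
 * coherency_unit is allocated and the runqueue_t pointer is rounded up.
 * For example, with a 64-byte coherency_unit and rq_ptr == 0x1234, the
 * runqueue would start at roundup2(0x1234, 64) == 0x1240, keeping it
 * from sharing cache lines with unrelated data (false sharing).
 */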
   1383 
   1384 /*
   1385  * Control of the runqueue.
   1386  */
   1387 
   1388 static void *
   1389 sched_getrq(runqueue_t *ci_rq, const pri_t prio)
   1390 {
   1391 
   1392 	KASSERT(prio < PRI_COUNT);
   1393 	return (prio <= PRI_HIGHEST_TS) ?
   1394 	    &ci_rq->r_ts_queue[prio].q_head :
   1395 	    &ci_rq->r_rt_queue[prio - PRI_HIGHEST_TS - 1].q_head;
   1396 }
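
/*
 * Illustrative mapping (assuming the usual MAXPRI_USER == PRI_HIGHEST_TS
 * value of 63): a time-sharing priority such as 40 selects r_ts_queue[40],
 * while higher priorities index the real-time array relative to that
 * boundary, e.g. 64 selects r_rt_queue[0] and 70 selects r_rt_queue[6].
 */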
   1397 
   1398 void
   1399 sched_enqueue(struct lwp *l, bool swtch)
   1400 {
   1401 	runqueue_t *ci_rq;
   1402 	struct schedstate_percpu *spc;
   1403 	TAILQ_HEAD(, lwp) *q_head;
   1404 	const pri_t eprio = lwp_eprio(l);
   1405 	struct cpu_info *ci;
   1406 	int type;
   1407 
   1408 	ci = l->l_cpu;
   1409 	spc = &ci->ci_schedstate;
   1410 	ci_rq = spc->spc_sched_info;
   1411 	KASSERT(lwp_locked(l, l->l_cpu->ci_schedstate.spc_mutex));
   1412 
   1413 	/* Update the last run time on switch */
   1414 	if (__predict_true(swtch == true)) {
    1415 		l->l_rticksum += (hardclock_ticks - l->l_rticks);
    1416 		l->l_rticks = hardclock_ticks;
   1417 	} else if (l->l_rticks == 0)
   1418 		l->l_rticks = hardclock_ticks;
   1419 
   1420 	/* Enqueue the thread */
   1421 	q_head = sched_getrq(ci_rq, eprio);
   1422 	if (TAILQ_EMPTY(q_head)) {
   1423 		u_int i;
   1424 		uint32_t q;
   1425 
   1426 		/* Mark bit */
   1427 		i = eprio >> BITMAP_SHIFT;
   1428 		q = BITMAP_MSB >> (eprio & BITMAP_MASK);
   1429 		KASSERT((ci_rq->r_bitmap[i] & q) == 0);
   1430 		ci_rq->r_bitmap[i] |= q;
   1431 	}
   1432 	TAILQ_INSERT_TAIL(q_head, l, l_runq);
   1433 	ci_rq->r_count++;
   1434 	if ((l->l_pflag & LP_BOUND) == 0)
   1435 		ci_rq->r_mcount++;
   1436 
   1437 	/*
    1438 	 * Update the value of the highest priority in the runqueue
    1439 	 * if the priority of this thread is higher.
   1440 	 */
   1441 	if (eprio > spc->spc_maxpriority)
   1442 		spc->spc_maxpriority = eprio;
   1443 
   1444 	sched_newts(l);
   1445 
   1446 	/*
   1447 	 * Wake the chosen CPU or cause a preemption if the newly
   1448 	 * enqueued thread has higher priority.  Don't cause a
   1449 	 * preemption if the thread is yielding (swtch).
   1450 	 */
   1451 	if (!swtch && eprio > spc->spc_curpriority) {
   1452 		if (eprio >= sched_kpreempt_pri)
   1453 			type = RESCHED_KPREEMPT;
   1454 		else if (eprio >= sched_upreempt_pri)
   1455 			type = RESCHED_IMMED;
   1456 		else
   1457 			type = 0;
   1458 		cpu_need_resched(ci, type);
   1459 	}
   1460 }
   1461 
   1462 void
   1463 sched_dequeue(struct lwp *l)
   1464 {
   1465 	runqueue_t *ci_rq;
   1466 	TAILQ_HEAD(, lwp) *q_head;
   1467 	struct schedstate_percpu *spc;
   1468 	const pri_t eprio = lwp_eprio(l);
   1469 
   1470 	spc = & l->l_cpu->ci_schedstate;
   1471 	ci_rq = spc->spc_sched_info;
   1472 	KASSERT(lwp_locked(l, spc->spc_mutex));
   1473 
   1474 	KASSERT(eprio <= spc->spc_maxpriority);
   1475 	KASSERT(ci_rq->r_bitmap[eprio >> BITMAP_SHIFT] != 0);
   1476 	KASSERT(ci_rq->r_count > 0);
   1477 
   1478 	ci_rq->r_count--;
   1479 	if ((l->l_pflag & LP_BOUND) == 0)
   1480 		ci_rq->r_mcount--;
   1481 
   1482 	q_head = sched_getrq(ci_rq, eprio);
   1483 	TAILQ_REMOVE(q_head, l, l_runq);
   1484 	if (TAILQ_EMPTY(q_head)) {
   1485 		u_int i;
   1486 		uint32_t q;
   1487 
   1488 		/* Unmark bit */
   1489 		i = eprio >> BITMAP_SHIFT;
   1490 		q = BITMAP_MSB >> (eprio & BITMAP_MASK);
   1491 		KASSERT((ci_rq->r_bitmap[i] & q) != 0);
   1492 		ci_rq->r_bitmap[i] &= ~q;
   1493 
   1494 		/*
    1495 		 * Update the value of the highest priority in the runqueue,
    1496 		 * in case this was the last thread in the queue of that priority.
   1497 		 */
   1498 		if (eprio != spc->spc_maxpriority)
   1499 			return;
   1500 
   1501 		do {
   1502 			if (ci_rq->r_bitmap[i] != 0) {
   1503 				q = ffs(ci_rq->r_bitmap[i]);
   1504 				spc->spc_maxpriority =
   1505 				    (i << BITMAP_SHIFT) + (BITMAP_BITS - q);
   1506 				return;
   1507 			}
   1508 		} while (i--);
   1509 
   1510 		/* If not found - set the lowest value */
   1511 		spc->spc_maxpriority = 0;
   1512 	}
   1513 }
   1514 
   1515 /*
   1516  * Migration and balancing.
   1517  */
   1518 
   1519 #ifdef MULTIPROCESSOR
   1520 
   1521 /* Estimate if LWP is cache-hot */
   1522 static inline bool
   1523 lwp_cache_hot(const struct lwp *l)
   1524 {
   1525 
   1526 	if (l->l_slptime || l->l_rticks == 0)
   1527 		return false;
   1528 
   1529 	return (hardclock_ticks - l->l_rticks <= cacheht_time);
   1530 }
   1531 
   1532 /* Check if LWP can migrate to the chosen CPU */
   1533 static inline bool
   1534 sched_migratable(const struct lwp *l, struct cpu_info *ci)
   1535 {
   1536 	const struct schedstate_percpu *spc = &ci->ci_schedstate;
   1537 
   1538 	/* CPU is offline */
   1539 	if (__predict_false(spc->spc_flags & SPCF_OFFLINE))
   1540 		return false;
   1541 
   1542 	/* Affinity bind */
   1543 	if (__predict_false(l->l_flag & LW_AFFINITY))
   1544 		return CPU_ISSET(cpu_index(ci), &l->l_affinity);
   1545 
   1546 	/* Processor-set */
   1547 	return (spc->spc_psid == l->l_psid);
   1548 }
   1549 
   1550 /*
    1551  * Estimate whether the LWP should migrate to another CPU.  Choose and
    1552  * return the target CPU, which may be the current one.
   1553  */
   1554 struct cpu_info *
   1555 sched_takecpu(struct lwp *l)
   1556 {
   1557 	struct cpu_info *ci, *tci, *first, *next;
   1558 	struct schedstate_percpu *spc;
   1559 	runqueue_t *ci_rq, *ici_rq;
   1560 	pri_t eprio, lpri, pri;
   1561 
   1562 	KASSERT(lwp_locked(l, NULL));
   1563 
   1564 	ci = l->l_cpu;
   1565 	spc = &ci->ci_schedstate;
   1566 	ci_rq = spc->spc_sched_info;
   1567 
   1568 	/* If thread is strictly bound, do not estimate other CPUs */
   1569 	if (l->l_pflag & LP_BOUND)
   1570 		return ci;
   1571 
    1572 	/* The runqueue of this thread's CPU is empty - stay on it */
   1573 	if (ci_rq->r_count == 0)
   1574 		return ci;
   1575 
   1576 	eprio = lwp_eprio(l);
   1577 
   1578 	/* Stay if thread is cache-hot */
   1579 	if (__predict_true(l->l_stat != LSIDL) &&
   1580 	    lwp_cache_hot(l) && eprio >= spc->spc_curpriority)
   1581 		return ci;
   1582 
   1583 	/* Run on current CPU if priority of thread is higher */
   1584 	ci = curcpu();
   1585 	spc = &ci->ci_schedstate;
   1586 	if (eprio > spc->spc_curpriority && sched_migratable(l, ci))
   1587 		return ci;
   1588 
   1589 	/*
   1590 	 * Look for the CPU with the lowest priority thread.  In case of
    1591 	 * equal priority, choose the CPU with the fewest threads.
   1592 	 */
   1593 	first = l->l_cpu;
   1594 	ci = first;
   1595 	tci = first;
   1596 	lpri = PRI_COUNT;
   1597 	do {
   1598 		next = CIRCLEQ_LOOP_NEXT(&cpu_queue, ci, ci_data.cpu_qchain);
   1599 		spc = &ci->ci_schedstate;
   1600 		ici_rq = spc->spc_sched_info;
   1601 		pri = max(spc->spc_curpriority, spc->spc_maxpriority);
   1602 		if (pri > lpri)
   1603 			continue;
   1604 
   1605 		if (pri == lpri && ci_rq->r_count < ici_rq->r_count)
   1606 			continue;
   1607 
   1608 		if (!sched_migratable(l, ci))
   1609 			continue;
   1610 
   1611 		lpri = pri;
   1612 		tci = ci;
   1613 		ci_rq = ici_rq;
   1614 	} while (ci = next, ci != first);
   1615 
   1616 	return tci;
   1617 }
   1618 
   1619 /*
    1620  * Try to catch an LWP from the runqueue of another CPU.
   1621  */
   1622 static struct lwp *
   1623 sched_catchlwp(void)
   1624 {
   1625 	struct cpu_info *curci = curcpu(), *ci = worker_ci;
   1626 	struct schedstate_percpu *spc;
   1627 	TAILQ_HEAD(, lwp) *q_head;
   1628 	runqueue_t *ci_rq;
   1629 	struct lwp *l;
   1630 
   1631 	if (curci == ci)
   1632 		return NULL;
   1633 
   1634 	/* Lockless check */
   1635 	spc = &ci->ci_schedstate;
   1636 	ci_rq = spc->spc_sched_info;
   1637 	if (ci_rq->r_mcount < min_catch)
   1638 		return NULL;
   1639 
   1640 	/*
   1641 	 * Double-lock the runqueues.
   1642 	 */
   1643 	if (curci < ci) {
   1644 		spc_lock(ci);
   1645 	} else if (!mutex_tryenter(ci->ci_schedstate.spc_mutex)) {
   1646 		const runqueue_t *cur_rq = curci->ci_schedstate.spc_sched_info;
   1647 
   1648 		spc_unlock(curci);
   1649 		spc_lock(ci);
   1650 		spc_lock(curci);
   1651 
   1652 		if (cur_rq->r_count) {
   1653 			spc_unlock(ci);
   1654 			return NULL;
   1655 		}
   1656 	}
   1657 
   1658 	if (ci_rq->r_mcount < min_catch) {
   1659 		spc_unlock(ci);
   1660 		return NULL;
   1661 	}
   1662 
   1663 	/* Take the highest priority thread */
   1664 	q_head = sched_getrq(ci_rq, spc->spc_maxpriority);
   1665 	l = TAILQ_FIRST(q_head);
   1666 
   1667 	for (;;) {
   1668 		/* Check the first and next result from the queue */
   1669 		if (l == NULL)
   1670 			break;
   1671 		KASSERT(l->l_stat == LSRUN);
   1672 		KASSERT(l->l_flag & LW_INMEM);
   1673 
    1674 		/* Look only at threads that are allowed to migrate */
   1675 		if ((l->l_pflag & LP_BOUND) || lwp_cache_hot(l) ||
   1676 		    !sched_migratable(l, curci)) {
   1677 			l = TAILQ_NEXT(l, l_runq);
   1678 			continue;
   1679 		}
   1680 
   1681 		/* Grab the thread, and move to the local run queue */
   1682 		sched_dequeue(l);
   1683 		l->l_cpu = curci;
   1684 		lwp_unlock_to(l, curci->ci_schedstate.spc_mutex);
   1685 		sched_enqueue(l, false);
   1686 		return l;
   1687 	}
   1688 	spc_unlock(ci);
   1689 
   1690 	return l;
   1691 }
   1692 
   1693 /*
    1694  * Periodic calculations for balancing.
   1695  */
   1696 static void
   1697 sched_balance(void *nocallout)
   1698 {
   1699 	struct cpu_info *ci, *hci;
   1700 	runqueue_t *ci_rq;
   1701 	CPU_INFO_ITERATOR cii;
   1702 	u_int highest;
   1703 
   1704 	hci = curcpu();
   1705 	highest = 0;
   1706 
    1707 	/* Do the counting locklessly */
   1708 	for (CPU_INFO_FOREACH(cii, ci)) {
   1709 		ci_rq = ci->ci_schedstate.spc_sched_info;
   1710 
   1711 		/* Average count of the threads */
   1712 		ci_rq->r_avgcount = (ci_rq->r_avgcount + ci_rq->r_mcount) >> 1;
   1713 
   1714 		/* Look for CPU with the highest average */
   1715 		if (ci_rq->r_avgcount > highest) {
   1716 			hci = ci;
   1717 			highest = ci_rq->r_avgcount;
   1718 		}
   1719 	}
   1720 
   1721 	/* Update the worker */
   1722 	worker_ci = hci;
   1723 
   1724 	if (nocallout == NULL)
   1725 		callout_schedule(&balance_ch, balance_period);
   1726 }
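
/*
 * Illustrative arithmetic (editorial): r_avgcount is an exponential moving
 * average with weight 1/2 per balance_period.  With a constant r_mcount of
 * 8 and r_avgcount starting at 0, successive periods give 4, 6, 7, 7, ...,
 * so a brief burst of migratable threads does not immediately make a CPU
 * the balancing victim, while a sustained load quickly does.
 */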
   1727 
   1728 #else
   1729 
   1730 struct cpu_info *
   1731 sched_takecpu(struct lwp *l)
   1732 {
   1733 
   1734 	return l->l_cpu;
   1735 }
   1736 
   1737 #endif	/* MULTIPROCESSOR */
   1738 
   1739 /*
   1740  * Scheduler mill.
   1741  */
   1742 struct lwp *
   1743 sched_nextlwp(void)
   1744 {
   1745 	struct cpu_info *ci = curcpu();
   1746 	struct schedstate_percpu *spc;
   1747 	TAILQ_HEAD(, lwp) *q_head;
   1748 	runqueue_t *ci_rq;
   1749 	struct lwp *l;
   1750 
   1751 	spc = &ci->ci_schedstate;
   1752 	ci_rq = spc->spc_sched_info;
   1753 
   1754 #ifdef MULTIPROCESSOR
    1755 	/* If the runqueue is empty, try to catch a thread from another CPU */
   1756 	if (__predict_false(spc->spc_flags & SPCF_OFFLINE)) {
   1757 		if ((ci_rq->r_count - ci_rq->r_mcount) == 0)
   1758 			return NULL;
   1759 	} else if (ci_rq->r_count == 0) {
   1760 		/* Reset the counter, and call the balancer */
   1761 		ci_rq->r_avgcount = 0;
   1762 		sched_balance(ci);
   1763 
   1764 		/* The re-locking will be done inside */
   1765 		return sched_catchlwp();
   1766 	}
   1767 #else
   1768 	if (ci_rq->r_count == 0)
   1769 		return NULL;
   1770 #endif
   1771 
   1772 	/* Take the highest priority thread */
   1773 	KASSERT(ci_rq->r_bitmap[spc->spc_maxpriority >> BITMAP_SHIFT]);
   1774 	q_head = sched_getrq(ci_rq, spc->spc_maxpriority);
   1775 	l = TAILQ_FIRST(q_head);
   1776 	KASSERT(l != NULL);
   1777 
   1778 	sched_oncpu(l);
   1779 	l->l_rticks = hardclock_ticks;
   1780 
   1781 	return l;
   1782 }
   1783 
   1784 bool
   1785 sched_curcpu_runnable_p(void)
   1786 {
   1787 	const struct cpu_info *ci;
   1788 	const runqueue_t *ci_rq;
   1789 	bool rv;
   1790 
   1791 	kpreempt_disable();
   1792 	ci = curcpu();
   1793 	ci_rq = ci->ci_schedstate.spc_sched_info;
   1794 
   1795 #ifndef __HAVE_FAST_SOFTINTS
   1796 	if (ci->ci_data.cpu_softints) {
   1797 		kpreempt_enable();
   1798 		return true;
   1799 	}
   1800 #endif
   1801 
   1802 	if (ci->ci_schedstate.spc_flags & SPCF_OFFLINE)
   1803 		rv = (ci_rq->r_count - ci_rq->r_mcount);
   1804 	else
   1805 		rv = ci_rq->r_count != 0;
   1806 	kpreempt_enable();
   1807 
   1808 	return rv;
   1809 }
   1810 
   1811 /*
   1812  * Debugging.
   1813  */
   1814 
   1815 #ifdef DDB
   1816 
   1817 void
   1818 sched_print_runqueue(void (*pr)(const char *, ...)
   1819     __attribute__((__format__(__printf__,1,2))))
   1820 {
   1821 	runqueue_t *ci_rq;
   1822 	struct schedstate_percpu *spc;
   1823 	struct lwp *l;
   1824 	struct proc *p;
   1825 	int i;
   1826 	struct cpu_info *ci;
   1827 	CPU_INFO_ITERATOR cii;
   1828 
   1829 	for (CPU_INFO_FOREACH(cii, ci)) {
   1830 		spc = &ci->ci_schedstate;
   1831 		ci_rq = spc->spc_sched_info;
   1832 
   1833 		(*pr)("Run-queue (CPU = %u):\n", ci->ci_index);
   1834 		(*pr)(" pid.lid = %d.%d, threads count = %u, "
   1835 		    "avgcount = %u, highest pri = %d\n",
   1836 #ifdef MULTIPROCESSOR
   1837 		    ci->ci_curlwp->l_proc->p_pid, ci->ci_curlwp->l_lid,
   1838 #else
   1839 		    curlwp->l_proc->p_pid, curlwp->l_lid,
   1840 #endif
   1841 		    ci_rq->r_count, ci_rq->r_avgcount, spc->spc_maxpriority);
   1842 		i = (PRI_COUNT >> BITMAP_SHIFT) - 1;
   1843 		do {
   1844 			uint32_t q;
   1845 			q = ci_rq->r_bitmap[i];
   1846 			(*pr)(" bitmap[%d] => [ %d (0x%x) ]\n", i, ffs(q), q);
   1847 		} while (i--);
   1848 	}
   1849 
   1850 	(*pr)("   %5s %4s %4s %10s %3s %18s %4s %s\n",
   1851 	    "LID", "PRI", "EPRI", "FL", "ST", "LWP", "CPU", "LRTIME");
   1852 
   1853 	PROCLIST_FOREACH(p, &allproc) {
   1854 		(*pr)(" /- %d (%s)\n", (int)p->p_pid, p->p_comm);
   1855 		LIST_FOREACH(l, &p->p_lwps, l_sibling) {
   1856 			ci = l->l_cpu;
   1857 			(*pr)(" | %5d %4u %4u 0x%8.8x %3s %18p %4u %u\n",
   1858 			    (int)l->l_lid, l->l_priority, lwp_eprio(l),
   1859 			    l->l_flag, l->l_stat == LSRUN ? "RQ" :
   1860 			    (l->l_stat == LSSLEEP ? "SQ" : "-"),
   1861 			    l, ci->ci_index,
   1862 			    (u_int)(hardclock_ticks - l->l_rticks));
   1863 		}
   1864 	}
   1865 }
   1866 
   1867 #endif
   1868