      1 /*	$NetBSD: kern_synch.c,v 1.234 2008/04/28 21:17:16 ad Exp $	*/
      2 
      3 /*-
      4  * Copyright (c) 1999, 2000, 2004, 2006, 2007, 2008 The NetBSD Foundation, Inc.
      5  * All rights reserved.
      6  *
      7  * This code is derived from software contributed to The NetBSD Foundation
      8  * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
      9  * NASA Ames Research Center, by Charles M. Hannum, Andrew Doran and
     10  * Daniel Sieger.
     11  *
     12  * Redistribution and use in source and binary forms, with or without
     13  * modification, are permitted provided that the following conditions
     14  * are met:
     15  * 1. Redistributions of source code must retain the above copyright
     16  *    notice, this list of conditions and the following disclaimer.
     17  * 2. Redistributions in binary form must reproduce the above copyright
     18  *    notice, this list of conditions and the following disclaimer in the
     19  *    documentation and/or other materials provided with the distribution.
     20  *
     21  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
     22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
     23  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     24  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
     25  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
     26  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
     27  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
     28  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
     29  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
     30  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
     31  * POSSIBILITY OF SUCH DAMAGE.
     32  */
     33 
     34 /*
     35  * Copyright (c) 2007, 2008 Mindaugas Rasiukevicius <rmind at NetBSD org>
     36  * All rights reserved.
     37  *
     38  * Redistribution and use in source and binary forms, with or without
     39  * modification, are permitted provided that the following conditions
     40  * are met:
     41  * 1. Redistributions of source code must retain the above copyright
     42  *    notice, this list of conditions and the following disclaimer.
     43  * 2. Redistributions in binary form must reproduce the above copyright
     44  *    notice, this list of conditions and the following disclaimer in the
     45  *    documentation and/or other materials provided with the distribution.
     46  *
     47  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
     48  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     49  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     50  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
     51  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
     52  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
     53  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
     54  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
     55  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
     56  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
     57  * SUCH DAMAGE.
     58  */
     59 
     60 /*-
     61  * Copyright (c) 1982, 1986, 1990, 1991, 1993
     62  *	The Regents of the University of California.  All rights reserved.
     63  * (c) UNIX System Laboratories, Inc.
     64  * All or some portions of this file are derived from material licensed
     65  * to the University of California by American Telephone and Telegraph
     66  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
     67  * the permission of UNIX System Laboratories, Inc.
     68  *
     69  * Redistribution and use in source and binary forms, with or without
     70  * modification, are permitted provided that the following conditions
     71  * are met:
     72  * 1. Redistributions of source code must retain the above copyright
     73  *    notice, this list of conditions and the following disclaimer.
     74  * 2. Redistributions in binary form must reproduce the above copyright
     75  *    notice, this list of conditions and the following disclaimer in the
     76  *    documentation and/or other materials provided with the distribution.
     77  * 3. Neither the name of the University nor the names of its contributors
     78  *    may be used to endorse or promote products derived from this software
     79  *    without specific prior written permission.
     80  *
     81  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
     82  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     83  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     84  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
     85  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
     86  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
     87  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
     88  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
     89  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
     90  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
     91  * SUCH DAMAGE.
     92  *
     93  *	@(#)kern_synch.c	8.9 (Berkeley) 5/19/95
     94  */
     95 
     96 #include <sys/cdefs.h>
     97 __KERNEL_RCSID(0, "$NetBSD: kern_synch.c,v 1.234 2008/04/28 21:17:16 ad Exp $");
     98 
     99 #include "opt_kstack.h"
    100 #include "opt_lockdebug.h"
    101 #include "opt_multiprocessor.h"
    102 #include "opt_perfctrs.h"
    103 
    104 #define	__MUTEX_PRIVATE
    105 
    106 #include <sys/param.h>
    107 #include <sys/systm.h>
    108 #include <sys/proc.h>
    109 #include <sys/kernel.h>
    110 #if defined(PERFCTRS)
    111 #include <sys/pmc.h>
    112 #endif
    113 #include <sys/cpu.h>
    114 #include <sys/resourcevar.h>
    115 #include <sys/sched.h>
    116 #include <sys/syscall_stats.h>
    117 #include <sys/sleepq.h>
    118 #include <sys/lockdebug.h>
    119 #include <sys/evcnt.h>
    120 #include <sys/intr.h>
    121 #include <sys/lwpctl.h>
    122 #include <sys/atomic.h>
    123 #include <sys/simplelock.h>
    124 #include <sys/bitops.h>
    125 #include <sys/kmem.h>
    126 #include <sys/sysctl.h>
    127 #include <sys/idle.h>
    128 
    129 #include <uvm/uvm_extern.h>
    130 
    131 #include <dev/lockstat.h>
    132 
    133 /*
    134  * Priority-related definitions.
    135  */
    136 #define	PRI_TS_COUNT	(NPRI_USER)
    137 #define	PRI_RT_COUNT	(PRI_COUNT - PRI_TS_COUNT)
    138 #define	PRI_HTS_RANGE	(PRI_TS_COUNT / 10)
    139 
    140 #define	PRI_HIGHEST_TS	(MAXPRI_USER)
    141 
    142 /*
    143  * Bits per map.
    144  */
    145 #define	BITMAP_BITS	(32)
    146 #define	BITMAP_SHIFT	(5)
    147 #define	BITMAP_MSB	(0x80000000U)
    148 #define	BITMAP_MASK	(BITMAP_BITS - 1)
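
        /*
         * Worked example: a priority of 45 lives in word 45 >> BITMAP_SHIFT
         * == 1 of r_bitmap[], at bit BITMAP_MSB >> (45 & BITMAP_MASK), i.e.
         * 0x80000000 >> 13 == 0x00040000.  Higher priorities occupy less
         * significant bits within a word, which is why sched_dequeue() can
         * recover the highest remaining priority with ffs().
         */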
    149 
    150 /*
    151  * Structures, runqueue.
    152  */
    153 
    154 typedef struct {
    155 	TAILQ_HEAD(, lwp) q_head;
    156 } queue_t;
    157 
    158 typedef struct {
    159 	/* Lock and bitmap */
    160 	uint32_t	r_bitmap[PRI_COUNT >> BITMAP_SHIFT];
    161 	/* Counters */
    162 	u_int		r_count;	/* Count of the threads */
    163 	u_int		r_avgcount;	/* Average count of threads */
    164 	u_int		r_mcount;	/* Count of migratable threads */
    165 	/* Runqueues */
    166 	queue_t		r_rt_queue[PRI_RT_COUNT];
    167 	queue_t		r_ts_queue[PRI_TS_COUNT];
    168 } runqueue_t;
    169 
    170 static u_int	sched_unsleep(struct lwp *, bool);
    171 static void	sched_changepri(struct lwp *, pri_t);
    172 static void	sched_lendpri(struct lwp *, pri_t);
    173 static void	*sched_getrq(runqueue_t *, const pri_t);
    174 #ifdef MULTIPROCESSOR
    175 static lwp_t	*sched_catchlwp(void);
    176 static void	sched_balance(void *);
    177 #endif
    178 
    179 syncobj_t sleep_syncobj = {
    180 	SOBJ_SLEEPQ_SORTED,
    181 	sleepq_unsleep,
    182 	sleepq_changepri,
    183 	sleepq_lendpri,
    184 	syncobj_noowner,
    185 };
    186 
    187 syncobj_t sched_syncobj = {
    188 	SOBJ_SLEEPQ_SORTED,
    189 	sched_unsleep,
    190 	sched_changepri,
    191 	sched_lendpri,
    192 	syncobj_noowner,
    193 };
    194 
    195 const int 	schedppq = 1;
    196 callout_t 	sched_pstats_ch;
    197 unsigned	sched_pstats_ticks;
    198 kcondvar_t	lbolt;			/* once a second sleep address */
    199 
    200 /*
    201  * Preemption control.
    202  */
    203 int		sched_upreempt_pri = PRI_KERNEL;
    204 #if 0
    205 int		sched_kpreempt_pri = PRI_USER_RT;
    206 #else
    207 /* XXX disable for now until any bugs are worked out. */
    208 int		sched_kpreempt_pri = 1000;
    209 #endif
    210 
    211 static struct evcnt kpreempt_ev_crit;
    212 static struct evcnt kpreempt_ev_klock;
    213 static struct evcnt kpreempt_ev_ipl;
    214 static struct evcnt kpreempt_ev_immed;
    215 
    216 /*
    217  * Migration and balancing.
    218  */
    219 static u_int	cacheht_time;		/* Cache hotness time */
    220 static u_int	min_catch;		/* Minimal LWP count for catching */
    221 static u_int	balance_period;		/* Balance period */
    222 static struct cpu_info *worker_ci;	/* Victim CPU */
    223 #ifdef MULTIPROCESSOR
    224 static struct callout balance_ch;	/* Callout of balancer */
    225 #endif
    226 
    227 /*
    228  * During autoconfiguration or after a panic, a sleep will simply lower the
    229  * priority briefly to allow interrupts, then return.  The priority to be
    230  * used (safepri) is machine-dependent, thus this value is initialized and
    231  * maintained in the machine-dependent layers.  This priority will typically
    232  * be 0, or the lowest priority that is safe for use on the interrupt stack;
    233  * it can be made higher to block network software interrupts after panics.
    234  */
    235 int	safepri;
    236 
    237 /*
    238  * OBSOLETE INTERFACE
    239  *
    240  * General sleep call.  Suspends the current process until a wakeup is
    241  * performed on the specified identifier.  The process will then be made
    242  * runnable with the specified priority.  Sleeps at most timo/hz seconds (0
    243  * means no timeout).  If pri includes PCATCH flag, signals are checked
    244  * before and after sleeping, else signals are not checked.  Returns 0 if
    245  * awakened, EWOULDBLOCK if the timeout expires.  If PCATCH is set and a
    246  * signal needs to be delivered, ERESTART is returned if the current system
    247  * call should be restarted if possible, and EINTR is returned if the system
    248  * call should be interrupted by the signal (return EINTR).
    249  *
    250  * The interlock is held until we are on a sleep queue. The interlock will
    251  * be locked before returning back to the caller unless the PNORELOCK flag
    252  * is specified, in which case the interlock will always be unlocked upon
    253  * return.
    254  */
    255 int
    256 ltsleep(wchan_t ident, pri_t priority, const char *wmesg, int timo,
    257 	volatile struct simplelock *interlock)
    258 {
    259 	struct lwp *l = curlwp;
    260 	sleepq_t *sq;
    261 	int error;
    262 
    263 	KASSERT((l->l_pflag & LP_INTR) == 0);
    264 
    265 	if (sleepq_dontsleep(l)) {
    266 		(void)sleepq_abort(NULL, 0);
    267 		if ((priority & PNORELOCK) != 0)
    268 			simple_unlock(interlock);
    269 		return 0;
    270 	}
    271 
    272 	l->l_kpriority = true;
    273 	sq = sleeptab_lookup(&sleeptab, ident);
    274 	sleepq_enter(sq, l);
    275 	sleepq_enqueue(sq, ident, wmesg, &sleep_syncobj);
    276 
    277 	if (interlock != NULL) {
    278 		KASSERT(simple_lock_held(interlock));
    279 		simple_unlock(interlock);
    280 	}
    281 
    282 	error = sleepq_block(timo, priority & PCATCH);
    283 
    284 	if (interlock != NULL && (priority & PNORELOCK) == 0)
    285 		simple_lock(interlock);
    286 
    287 	return error;
    288 }
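
        /*
         * Illustrative sketch of the interface described above; the "sc"
         * softc and its sc_slock/sc_busy fields are hypothetical.  Without
         * PNORELOCK, ltsleep() re-acquires the interlock before returning,
         * so the condition can simply be re-checked in a loop:
         *
         *	int error = 0;
         *
         *	simple_lock(&sc->sc_slock);
         *	while (sc->sc_busy && error == 0)
         *		error = ltsleep(&sc->sc_busy, PCATCH, "scbusy", 0,
         *		    &sc->sc_slock);
         *	simple_unlock(&sc->sc_slock);
         *
         * PCATCH makes the wait interruptible by signals, and a non-zero timo
         * would bound the sleep to timo/hz seconds.  Whatever code clears
         * sc_busy is expected to call wakeup(&sc->sc_busy).
         */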
    289 
    290 int
    291 mtsleep(wchan_t ident, pri_t priority, const char *wmesg, int timo,
    292 	kmutex_t *mtx)
    293 {
    294 	struct lwp *l = curlwp;
    295 	sleepq_t *sq;
    296 	int error;
    297 
    298 	KASSERT((l->l_pflag & LP_INTR) == 0);
    299 
    300 	if (sleepq_dontsleep(l)) {
    301 		(void)sleepq_abort(mtx, (priority & PNORELOCK) != 0);
    302 		return 0;
    303 	}
    304 
    305 	l->l_kpriority = true;
    306 	sq = sleeptab_lookup(&sleeptab, ident);
    307 	sleepq_enter(sq, l);
    308 	sleepq_enqueue(sq, ident, wmesg, &sleep_syncobj);
    309 	mutex_exit(mtx);
    310 	error = sleepq_block(timo, priority & PCATCH);
    311 
    312 	if ((priority & PNORELOCK) == 0)
    313 		mutex_enter(mtx);
    314 
    315 	return error;
    316 }
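
        /*
         * Illustrative sketch of a typical mtsleep()/wakeup() pairing; the
         * "sc" softc, its kmutex_t sc_lock and its sc_ready flag are
         * hypothetical.  The waiter sleeps on the address of the condition;
         * the waker sets the condition and wakes the same address:
         *
         *	mutex_enter(&sc->sc_lock);
         *	while (!sc->sc_ready)
         *		(void)mtsleep(&sc->sc_ready, 0, "scready", 0,
         *		    &sc->sc_lock);
         *	mutex_exit(&sc->sc_lock);
         *
         *	mutex_enter(&sc->sc_lock);
         *	sc->sc_ready = true;
         *	wakeup(&sc->sc_ready);
         *	mutex_exit(&sc->sc_lock);
         *
         * Newer code would typically use a kcondvar_t instead; this path
         * exists for tsleep()-style callers.
         */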
    317 
    318 /*
    319  * General sleep call for situations where a wake-up is not expected.
    320  */
    321 int
    322 kpause(const char *wmesg, bool intr, int timo, kmutex_t *mtx)
    323 {
    324 	struct lwp *l = curlwp;
    325 	sleepq_t *sq;
    326 	int error;
    327 
    328 	if (sleepq_dontsleep(l))
    329 		return sleepq_abort(NULL, 0);
    330 
    331 	if (mtx != NULL)
    332 		mutex_exit(mtx);
    333 	l->l_kpriority = true;
    334 	sq = sleeptab_lookup(&sleeptab, l);
    335 	sleepq_enter(sq, l);
    336 	sleepq_enqueue(sq, l, wmesg, &sleep_syncobj);
    337 	error = sleepq_block(timo, intr);
    338 	if (mtx != NULL)
    339 		mutex_enter(mtx);
    340 
    341 	return error;
    342 }
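
        /*
         * Illustrative sketch: an uninterruptible delay of roughly 10ms,
         * e.g. while waiting for hypothetical device hardware to settle.
         * kpause() sleeps on the LWP itself, so only the timeout ends the
         * sleep; max() guards against mstohz() rounding down to zero, since
         * a timo of 0 would mean "no timeout".
         *
         *	(void)kpause("hwsettle", false, max(1, mstohz(10)), NULL);
         */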
    343 
    344 /*
    345  * OBSOLETE INTERFACE
    346  *
    347  * Make all processes sleeping on the specified identifier runnable.
    348  */
    349 void
    350 wakeup(wchan_t ident)
    351 {
    352 	sleepq_t *sq;
    353 
    354 	if (cold)
    355 		return;
    356 
    357 	sq = sleeptab_lookup(&sleeptab, ident);
    358 	sleepq_wake(sq, ident, (u_int)-1);
    359 }
    360 
    361 /*
    362  * OBSOLETE INTERFACE
    363  *
    364  * Make the highest priority process first in line on the specified
    365  * identifier runnable.
    366  */
    367 void
    368 wakeup_one(wchan_t ident)
    369 {
    370 	sleepq_t *sq;
    371 
    372 	if (cold)
    373 		return;
    374 
    375 	sq = sleeptab_lookup(&sleeptab, ident);
    376 	sleepq_wake(sq, ident, 1);
    377 }
    378 
    379 
    380 /*
    381  * General yield call.  Puts the current process back on its run queue and
    382  * performs a voluntary context switch.  Should only be called when the
    383  * current process explicitly requests it (e.g. sched_yield(2)).
    384  */
    385 void
    386 yield(void)
    387 {
    388 	struct lwp *l = curlwp;
    389 
    390 	KERNEL_UNLOCK_ALL(l, &l->l_biglocks);
    391 	lwp_lock(l);
    392 	KASSERT(lwp_locked(l, l->l_cpu->ci_schedstate.spc_lwplock));
    393 	KASSERT(l->l_stat == LSONPROC);
    394 	l->l_kpriority = false;
    395 	(void)mi_switch(l);
    396 	KERNEL_LOCK(l->l_biglocks, l);
    397 }
    398 
    399 /*
    400  * General preemption call.  Puts the current process back on its run queue
    401  * and performs an involuntary context switch.
    402  */
    403 void
    404 preempt(void)
    405 {
    406 	struct lwp *l = curlwp;
    407 
    408 	KERNEL_UNLOCK_ALL(l, &l->l_biglocks);
    409 	lwp_lock(l);
    410 	KASSERT(lwp_locked(l, l->l_cpu->ci_schedstate.spc_lwplock));
    411 	KASSERT(l->l_stat == LSONPROC);
    412 	l->l_kpriority = false;
    413 	l->l_nivcsw++;
    414 	(void)mi_switch(l);
    415 	KERNEL_LOCK(l->l_biglocks, l);
    416 }
    417 
    418 /*
    419  * Handle a request made by another agent to preempt the current LWP
    420  * in-kernel.  Usually called when l_dopreempt may be non-zero.
    421  *
    422  * Character addresses for lockstat only.
    423  */
    424 static char	in_critical_section;
    425 static char	kernel_lock_held;
    426 static char	spl_raised;
    427 static char	is_softint;
    428 
    429 bool
    430 kpreempt(uintptr_t where)
    431 {
    432 	uintptr_t failed;
    433 	lwp_t *l;
    434 	int s, dop;
    435 
    436 	l = curlwp;
    437 	failed = 0;
    438 	while ((dop = l->l_dopreempt) != 0) {
    439 		if (l->l_stat != LSONPROC) {
    440 			/*
    441 			 * About to block (or die), let it happen.
    442 			 * Doesn't really count as "preemption has
    443 			 * been blocked", since we're going to
    444 			 * context switch.
    445 			 */
    446 			l->l_dopreempt = 0;
    447 			return true;
    448 		}
    449 		if (__predict_false((l->l_flag & LW_IDLE) != 0)) {
    450 			/* Can't preempt idle loop, don't count as failure. */
    451 		    	l->l_dopreempt = 0;
    452 		    	return true;
    453 		}
    454 		if (__predict_false(l->l_nopreempt != 0)) {
    455 			/* LWP holds preemption disabled, explicitly. */
    456 			if ((dop & DOPREEMPT_COUNTED) == 0) {
    457 				kpreempt_ev_crit.ev_count++;
    458 			}
    459 			failed = (uintptr_t)&in_critical_section;
    460 			break;
    461 		}
    462 		if (__predict_false((l->l_pflag & LP_INTR) != 0)) {
    463 		    	/* Can't preempt soft interrupts yet. */
    464 		    	l->l_dopreempt = 0;
    465 		    	failed = (uintptr_t)&is_softint;
    466 		    	break;
    467 		}
    468 		s = splsched();
    469 		if (__predict_false(l->l_blcnt != 0 ||
    470 		    curcpu()->ci_biglock_wanted != NULL)) {
    471 			/* Hold or want kernel_lock, code is not MT safe. */
    472 			splx(s);
    473 			if ((dop & DOPREEMPT_COUNTED) == 0) {
    474 				kpreempt_ev_klock.ev_count++;
    475 			}
    476 			failed = (uintptr_t)&kernel_lock_held;
    477 			break;
    478 		}
    479 		if (__predict_false(!cpu_kpreempt_enter(where, s))) {
    480 			/*
    481 			 * It may be that the IPL is too high.
    482 			 * kpreempt_enter() can schedule an
    483 			 * cpu_kpreempt_enter() can schedule an
    484 			 */
    485 			splx(s);
    486 			if ((dop & DOPREEMPT_COUNTED) == 0) {
    487 				kpreempt_ev_ipl.ev_count++;
    488 			}
    489 			failed = (uintptr_t)&spl_raised;
    490 			break;
    491 		}
    492 		/* Do it! */
    493 		if (__predict_true((dop & DOPREEMPT_COUNTED) == 0)) {
    494 			kpreempt_ev_immed.ev_count++;
    495 		}
    496 		lwp_lock(l);
    497 		mi_switch(l);
    498 		l->l_nopreempt++;
    499 		splx(s);
    500 
    501 		/* Take care of any MD cleanup. */
    502 		cpu_kpreempt_exit(where);
    503 		l->l_nopreempt--;
    504 	}
    505 
    506 	/* Record preemption failure for reporting via lockstat. */
    507 	if (__predict_false(failed)) {
    508 		atomic_or_uint(&l->l_dopreempt, DOPREEMPT_COUNTED);
    509 		int lsflag = 0;
    510 		LOCKSTAT_ENTER(lsflag);
    511 		/* Might recurse, make it atomic. */
    512 		if (__predict_false(lsflag)) {
    513 			if (where == 0) {
    514 				where = (uintptr_t)__builtin_return_address(0);
    515 			}
    516 			if (atomic_cas_ptr_ni((void *)&l->l_pfailaddr,
    517 			    NULL, (void *)where) == NULL) {
    518 				LOCKSTAT_START_TIMER(lsflag, l->l_pfailtime);
    519 				l->l_pfaillock = failed;
    520 			}
    521 		}
    522 		LOCKSTAT_EXIT(lsflag);
    523 	}
    524 
    525 	return failed;
    526 }
    527 
    528 /*
    529  * Return true if preemption is explicitly disabled.
    530  */
    531 bool
    532 kpreempt_disabled(void)
    533 {
    534 	lwp_t *l;
    535 
    536 	l = curlwp;
    537 
    538 	return l->l_nopreempt != 0 || l->l_stat == LSZOMB ||
    539 	    (l->l_flag & LW_IDLE) != 0 || cpu_kpreempt_disabled();
    540 }
    541 
    542 /*
    543  * Disable kernel preemption.
    544  */
    545 void
    546 kpreempt_disable(void)
    547 {
    548 
    549 	KPREEMPT_DISABLE(curlwp);
    550 }
    551 
    552 /*
    553  * Reenable kernel preemption.
    554  */
    555 void
    556 kpreempt_enable(void)
    557 {
    558 
    559 	KPREEMPT_ENABLE(curlwp);
    560 }
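
        /*
         * Illustrative sketch: bracketing access to the current CPU's private
         * state so that the thread cannot be preempted and migrated to
         * another CPU in between.  Reading the context switch counter is just
         * an example, and its 64-bit width is assumed here:
         *
         *	uint64_t snap;
         *
         *	kpreempt_disable();
         *	snap = curcpu()->ci_data.cpu_nswtch;
         *	kpreempt_enable();
         */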
    561 
    562 /*
    563  * Compute the amount of time during which the current lwp was running.
    564  *
    565  * - update l_rtime unless it's an idle lwp.
    566  */
    567 
    568 void
    569 updatertime(lwp_t *l, const struct bintime *now)
    570 {
    571 
    572 	if ((l->l_flag & LW_IDLE) != 0)
    573 		return;
    574 
    575 	/* rtime += now - stime */
    576 	bintime_add(&l->l_rtime, now);
    577 	bintime_sub(&l->l_rtime, &l->l_stime);
    578 }
    579 
    580 /*
    581  * The machine independent parts of context switch.
    582  *
    583  * Returns 1 if another LWP was actually run.
    584  */
    585 int
    586 mi_switch(lwp_t *l)
    587 {
    588 	struct cpu_info *ci, *tci = NULL;
    589 	struct schedstate_percpu *spc;
    590 	struct lwp *newl;
    591 	int retval, oldspl;
    592 	struct bintime bt;
    593 	bool returning;
    594 
    595 	KASSERT(lwp_locked(l, NULL));
    596 	KASSERT(kpreempt_disabled());
    597 	LOCKDEBUG_BARRIER(l->l_mutex, 1);
    598 
    599 #ifdef KSTACK_CHECK_MAGIC
    600 	kstack_check_magic(l);
    601 #endif
    602 
    603 	binuptime(&bt);
    604 
    605 	KASSERT(l->l_cpu == curcpu());
    606 	ci = l->l_cpu;
    607 	spc = &ci->ci_schedstate;
    608 	returning = false;
    609 	newl = NULL;
    610 
    611 	/*
    612 	 * If we have been asked to switch to a specific LWP, then there
    613 	 * is no need to inspect the run queues.  If a soft interrupt is
    614 	 * blocking, then return to the interrupted thread without adjusting
    615 	 * VM context or its start time: neither have been changed in order
    616 	 * to take the interrupt.
    617 	 */
    618 	if (l->l_switchto != NULL) {
    619 		if ((l->l_pflag & LP_INTR) != 0) {
    620 			returning = true;
    621 			softint_block(l);
    622 			if ((l->l_flag & LW_TIMEINTR) != 0)
    623 				updatertime(l, &bt);
    624 		}
    625 		newl = l->l_switchto;
    626 		l->l_switchto = NULL;
    627 	}
    628 #ifndef __HAVE_FAST_SOFTINTS
    629 	else if (ci->ci_data.cpu_softints != 0) {
    630 		/* There are pending soft interrupts, so pick one. */
    631 		newl = softint_picklwp();
    632 		newl->l_stat = LSONPROC;
    633 		newl->l_flag |= LW_RUNNING;
    634 	}
    635 #endif	/* !__HAVE_FAST_SOFTINTS */
    636 
    637 	/* Count time spent in current system call */
    638 	if (!returning) {
    639 		SYSCALL_TIME_SLEEP(l);
    640 
    641 		/*
    642 		 * XXXSMP If we are using h/w performance counters,
    643 		 * save context.
    644 		 */
    645 #if PERFCTRS
    646 		if (PMC_ENABLED(l->l_proc)) {
    647 			pmc_save_context(l->l_proc);
    648 		}
    649 #endif
    650 		updatertime(l, &bt);
    651 	}
    652 
    653 	/*
    654 	 * If on the CPU and we have gotten this far, then we must yield.
    655 	 */
    656 	KASSERT(l->l_stat != LSRUN);
    657 	if (l->l_stat == LSONPROC && (l->l_target_cpu || l != newl)) {
    658 		KASSERT(lwp_locked(l, spc->spc_lwplock));
    659 
    660 		if (l->l_target_cpu == l->l_cpu) {
    661 			l->l_target_cpu = NULL;
    662 		} else {
    663 			tci = l->l_target_cpu;
    664 		}
    665 
    666 		if (__predict_false(tci != NULL)) {
    667 			/* Double-lock the runqueues */
    668 			spc_dlock(ci, tci);
    669 		} else {
    670 			/* Lock the runqueue */
    671 			spc_lock(ci);
    672 		}
    673 
    674 		if ((l->l_flag & LW_IDLE) == 0) {
    675 			l->l_stat = LSRUN;
    676 			if (__predict_false(tci != NULL)) {
    677 				/*
    678 				 * Set the new CPU, lock and unset the
    679 				 * l_target_cpu - thread will be enqueued
    680 				 * to the runqueue of target CPU.
    681 				 */
    682 				l->l_cpu = tci;
    683 				lwp_setlock(l, tci->ci_schedstate.spc_mutex);
    684 				l->l_target_cpu = NULL;
    685 			} else {
    686 				lwp_setlock(l, spc->spc_mutex);
    687 			}
    688 			sched_enqueue(l, true);
    689 		} else {
    690 			KASSERT(tci == NULL);
    691 			l->l_stat = LSIDL;
    692 		}
    693 	} else {
    694 		/* Lock the runqueue */
    695 		spc_lock(ci);
    696 	}
    697 
    698 	/*
    699 	 * Let sched_nextlwp() select the LWP to run on the CPU next.
    700 	 * If no LWP is runnable, select the idle LWP.
    701 	 *
    702 	 * Note that spc_lwplock might not necessarily be held, and the
    703 	 * new thread will be unlocked after its LWP lock is set.
    704 	 */
    705 	if (newl == NULL) {
    706 		newl = sched_nextlwp();
    707 		if (newl != NULL) {
    708 			sched_dequeue(newl);
    709 			KASSERT(lwp_locked(newl, spc->spc_mutex));
    710 			newl->l_stat = LSONPROC;
    711 			newl->l_cpu = ci;
    712 			newl->l_flag |= LW_RUNNING;
    713 			lwp_setlock(newl, spc->spc_lwplock);
    714 		} else {
    715 			newl = ci->ci_data.cpu_idlelwp;
    716 			newl->l_stat = LSONPROC;
    717 			newl->l_flag |= LW_RUNNING;
    718 		}
    719 		/*
    720 		 * Only clear want_resched if there are no
    721 		 * pending (slow) software interrupts.
    722 		 */
    723 		ci->ci_want_resched = ci->ci_data.cpu_softints;
    724 		spc->spc_flags &= ~SPCF_SWITCHCLEAR;
    725 		spc->spc_curpriority = lwp_eprio(newl);
    726 	}
    727 
    728 	/* Items that must be updated with the CPU locked. */
    729 	if (!returning) {
    730 		/* Update the new LWP's start time. */
    731 		newl->l_stime = bt;
    732 
    733 		/*
    734 		 * ci_curlwp changes when a fast soft interrupt occurs.
    735 		 * We use cpu_onproc to keep track of which kernel or
    736 		 * user thread is running 'underneath' the software
    737 		 * interrupt.  This is important for time accounting,
    738 		 * itimers and forcing user threads to preempt (aston).
    739 		 */
    740 		ci->ci_data.cpu_onproc = newl;
    741 	}
    742 
    743 	/* Kernel preemption related tasks. */
    744 	l->l_dopreempt = 0;
    745 	if (__predict_false(l->l_pfailaddr != 0)) {
    746 		LOCKSTAT_FLAG(lsflag);
    747 		LOCKSTAT_ENTER(lsflag);
    748 		LOCKSTAT_STOP_TIMER(lsflag, l->l_pfailtime);
    749 		LOCKSTAT_EVENT_RA(lsflag, l->l_pfaillock, LB_NOPREEMPT|LB_SPIN,
    750 		    1, l->l_pfailtime, l->l_pfailaddr);
    751 		LOCKSTAT_EXIT(lsflag);
    752 		l->l_pfailtime = 0;
    753 		l->l_pfaillock = 0;
    754 		l->l_pfailaddr = 0;
    755 	}
    756 
    757 	if (l != newl) {
    758 		struct lwp *prevlwp;
    759 
    760 		/* Release all locks, but leave the current LWP locked */
    761 		if (l->l_mutex == l->l_cpu->ci_schedstate.spc_mutex) {
    762 			/*
    763 			 * In case of migration, drop the local runqueue
    764 			 * lock, thread is on other runqueue now.
    765 			 */
    766 			if (__predict_false(tci != NULL))
    767 				spc_unlock(ci);
    768 			/*
    769 			 * Drop spc_lwplock, if the current LWP has been moved
    770 			 * to the run queue (it is now locked by spc_mutex).
    771 			 */
    772 			mutex_spin_exit(spc->spc_lwplock);
    773 		} else {
    774 			/*
    775 			 * Otherwise, drop the spc_mutex, we are done with the
    776 			 * run queues.
    777 			 */
    778 			mutex_spin_exit(spc->spc_mutex);
    779 			KASSERT(tci == NULL);
    780 		}
    781 
    782 		/*
    783 		 * Mark that a context switch is going to be performed
    784 		 * for this LWP, to protect it from being switched
    785 		 * to on another CPU.
    786 		 */
    787 		KASSERT(l->l_ctxswtch == 0);
    788 		l->l_ctxswtch = 1;
    789 		l->l_ncsw++;
    790 		l->l_flag &= ~LW_RUNNING;
    791 
    792 		/*
    793 		 * Increase the count of spin-mutexes before the release
    794 		 * of the last lock - we must remain at IPL_SCHED during
    795 		 * the context switch.
    796 		 */
    797 		oldspl = MUTEX_SPIN_OLDSPL(ci);
    798 		ci->ci_mtx_count--;
    799 		lwp_unlock(l);
    800 
    801 		/* Count the context switch on this CPU. */
    802 		ci->ci_data.cpu_nswtch++;
    803 
    804 		/* Update status for lwpctl, if present. */
    805 		if (l->l_lwpctl != NULL)
    806 			l->l_lwpctl->lc_curcpu = LWPCTL_CPU_NONE;
    807 
    808 		/*
    809 		 * Save old VM context, unless a soft interrupt
    810 		 * handler is blocking.
    811 		 */
    812 		if (!returning)
    813 			pmap_deactivate(l);
    814 
    815 		/*
    816 		 * We may need to spin-wait if 'newl' is still
    817 		 * context switching on another CPU.
    818 		 */
    819 		if (newl->l_ctxswtch != 0) {
    820 			u_int count;
    821 			count = SPINLOCK_BACKOFF_MIN;
    822 			while (newl->l_ctxswtch)
    823 				SPINLOCK_BACKOFF(count);
    824 		}
    825 
    826 		/* Switch to the new LWP. */
    827 		prevlwp = cpu_switchto(l, newl, returning);
    828 		ci = curcpu();
    829 
    830 		/*
    831 		 * Switched away - we have new curlwp.
    832 		 * Restore VM context and IPL.
    833 		 */
    834 		pmap_activate(l);
    835 		if (prevlwp != NULL) {
    836 			/* Normalize the count of the spin-mutexes */
    837 			ci->ci_mtx_count++;
    838 			/* Unmark the state of context switch */
    839 			membar_exit();
    840 			prevlwp->l_ctxswtch = 0;
    841 		}
    842 
    843 		/* Update status for lwpctl, if present. */
    844 		if (l->l_lwpctl != NULL) {
    845 			l->l_lwpctl->lc_curcpu = (int)cpu_index(ci);
    846 			l->l_lwpctl->lc_pctr++;
    847 		}
    848 
    849 		KASSERT(l->l_cpu == ci);
    850 		splx(oldspl);
    851 		retval = 1;
    852 	} else {
    853 		/* Nothing to do - just unlock and return. */
    854 		KASSERT(tci == NULL);
    855 		spc_unlock(ci);
    856 		lwp_unlock(l);
    857 		retval = 0;
    858 	}
    859 
    860 	KASSERT(l == curlwp);
    861 	KASSERT(l->l_stat == LSONPROC);
    862 
    863 	/*
    864 	 * XXXSMP If we are using h/w performance counters, restore context.
    865 	 * XXXSMP preemption problem.
    866 	 */
    867 #if PERFCTRS
    868 	if (PMC_ENABLED(l->l_proc)) {
    869 		pmc_restore_context(l->l_proc);
    870 	}
    871 #endif
    872 	SYSCALL_TIME_WAKEUP(l);
    873 	LOCKDEBUG_BARRIER(NULL, 1);
    874 
    875 	return retval;
    876 }
    877 
    878 /*
    879  * Change process state to be runnable, placing it on the run queue if it is
    880  * in memory, and awakening the swapper if it isn't in memory.
    881  *
    882  * Call with the process and LWP locked.  Will return with the LWP unlocked.
    883  */
    884 void
    885 setrunnable(struct lwp *l)
    886 {
    887 	struct proc *p = l->l_proc;
    888 	struct cpu_info *ci;
    889 	sigset_t *ss;
    890 
    891 	KASSERT((l->l_flag & LW_IDLE) == 0);
    892 	KASSERT(mutex_owned(p->p_lock));
    893 	KASSERT(lwp_locked(l, NULL));
    894 	KASSERT(l->l_mutex != l->l_cpu->ci_schedstate.spc_mutex);
    895 
    896 	switch (l->l_stat) {
    897 	case LSSTOP:
    898 		/*
    899 		 * If we're being traced (possibly because someone attached us
    900 		 * while we were stopped), check for a signal from the debugger.
    901 		 */
    902 		if ((p->p_slflag & PSL_TRACED) != 0 && p->p_xstat != 0) {
    903 			if ((sigprop[p->p_xstat] & SA_TOLWP) != 0)
    904 				ss = &l->l_sigpend.sp_set;
    905 			else
    906 				ss = &p->p_sigpend.sp_set;
    907 			sigaddset(ss, p->p_xstat);
    908 			signotify(l);
    909 		}
    910 		p->p_nrlwps++;
    911 		break;
    912 	case LSSUSPENDED:
    913 		l->l_flag &= ~LW_WSUSPEND;
    914 		p->p_nrlwps++;
    915 		cv_broadcast(&p->p_lwpcv);
    916 		break;
    917 	case LSSLEEP:
    918 		KASSERT(l->l_wchan != NULL);
    919 		break;
    920 	default:
    921 		panic("setrunnable: lwp %p state was %d", l, l->l_stat);
    922 	}
    923 
    924 	/*
    925 	 * If the LWP was sleeping interruptibly, then it's OK to start it
    926 	 * again.  If not, mark it as still sleeping.
    927 	 */
    928 	if (l->l_wchan != NULL) {
    929 		l->l_stat = LSSLEEP;
    930 		/* lwp_unsleep() will release the lock. */
    931 		lwp_unsleep(l, true);
    932 		return;
    933 	}
    934 
    935 	/*
    936 	 * If the LWP is still on the CPU, mark it as LSONPROC.  It may be
    937 	 * about to call mi_switch(), in which case it will yield.
    938 	 */
    939 	if ((l->l_flag & LW_RUNNING) != 0) {
    940 		l->l_stat = LSONPROC;
    941 		l->l_slptime = 0;
    942 		lwp_unlock(l);
    943 		return;
    944 	}
    945 
    946 	/*
    947 	 * Look for a CPU to run.
    948 	 * Set the LWP runnable.
    949 	 */
    950 	ci = sched_takecpu(l);
    951 	l->l_cpu = ci;
    952 	if (l->l_mutex != l->l_cpu->ci_schedstate.spc_mutex) {
    953 		lwp_unlock_to(l, ci->ci_schedstate.spc_mutex);
    954 		lwp_lock(l);
    955 	}
    956 	sched_setrunnable(l);
    957 	l->l_stat = LSRUN;
    958 	l->l_slptime = 0;
    959 
    960 	/*
    961 	 * If the thread is in memory, enter it into a run queue.  Otherwise,
    962 	 * wake the swapper to bring it back in.
    963 	 */
    964 	if (l->l_flag & LW_INMEM) {
    965 		sched_enqueue(l, false);
    966 		resched_cpu(l);
    967 		lwp_unlock(l);
    968 	} else {
    969 		lwp_unlock(l);
    970 		uvm_kick_scheduler();
    971 	}
    972 }
    973 
    974 /*
    975  * suspendsched:
    976  *
    977  *	Convert all non-PK_SYSTEM LSSLEEP or LSRUN LWPs to LSSUSPENDED.
    978  */
    979 void
    980 suspendsched(void)
    981 {
    982 	CPU_INFO_ITERATOR cii;
    983 	struct cpu_info *ci;
    984 	struct lwp *l;
    985 	struct proc *p;
    986 
    987 	/*
    988 	 * We do this by process in order not to violate the locking rules.
    989 	 */
    990 	mutex_enter(proc_lock);
    991 	PROCLIST_FOREACH(p, &allproc) {
    992 		mutex_enter(p->p_lock);
    993 
    994 		if ((p->p_flag & PK_SYSTEM) != 0) {
    995 			mutex_exit(p->p_lock);
    996 			continue;
    997 		}
    998 
    999 		p->p_stat = SSTOP;
   1000 
   1001 		LIST_FOREACH(l, &p->p_lwps, l_sibling) {
   1002 			if (l == curlwp)
   1003 				continue;
   1004 
   1005 			lwp_lock(l);
   1006 
   1007 			/*
   1008 			 * Set LW_WREBOOT so that the LWP will suspend itself
   1009 			 * when it tries to return to user mode.  We want to
   1010 			 * try to get as many LWPs as possible to
   1011 			 * the user / kernel boundary, so that they will
   1012 			 * release any locks that they hold.
   1013 			 */
   1014 			l->l_flag |= (LW_WREBOOT | LW_WSUSPEND);
   1015 
   1016 			if (l->l_stat == LSSLEEP &&
   1017 			    (l->l_flag & LW_SINTR) != 0) {
   1018 				/* setrunnable() will release the lock. */
   1019 				setrunnable(l);
   1020 				continue;
   1021 			}
   1022 
   1023 			lwp_unlock(l);
   1024 		}
   1025 
   1026 		mutex_exit(p->p_lock);
   1027 	}
   1028 	mutex_exit(proc_lock);
   1029 
   1030 	/*
   1031 	 * Kick all CPUs to make them preempt any LWPs running in user mode.
   1032 	 * They'll trap into the kernel and suspend themselves in userret().
   1033 	 */
   1034 	for (CPU_INFO_FOREACH(cii, ci)) {
   1035 		spc_lock(ci);
   1036 		cpu_need_resched(ci, RESCHED_IMMED);
   1037 		spc_unlock(ci);
   1038 	}
   1039 }
   1040 
   1041 /*
   1042  * sched_unsleep:
   1043  *
   1044  *	This is called when the LWP has not been awoken normally but instead
   1045  *	interrupted: for example, if the sleep timed out.  Because of this,
   1046  *	it's not a valid action for running or idle LWPs.
   1047  */
   1048 static u_int
   1049 sched_unsleep(struct lwp *l, bool cleanup)
   1050 {
   1051 
   1052 	lwp_unlock(l);
   1053 	panic("sched_unsleep");
   1054 }
   1055 
   1056 void
   1057 resched_cpu(struct lwp *l)
   1058 {
   1059 	struct cpu_info *ci;
   1060 
   1061 	/*
   1062 	 * XXXSMP
   1063 	 * Since l->l_cpu persists across a context switch,
   1064 	 * this gives us *very weak* processor affinity, in
   1065 	 * that we notify the CPU on which the process last
   1066 	 * ran that it should try to switch.
   1067 	 *
   1068 	 * This does not guarantee that the process will run on
   1069 	 * that processor next, because another processor might
   1070 	 * grab it the next time it performs a context switch.
   1071 	 *
   1072 	 * This also does not handle the case where its last
   1073 	 * CPU is running a higher-priority process, but every
   1074 	 * other CPU is running a lower-priority process.  There
   1075 	 * are ways to handle this situation, but they're not
   1076 	 * currently very pretty, and we also need to weigh the
   1077 	 * cost of moving a process from one CPU to another.
   1078 	 */
   1079 	ci = l->l_cpu;
   1080 	if (lwp_eprio(l) > ci->ci_schedstate.spc_curpriority)
   1081 		cpu_need_resched(ci, 0);
   1082 }
   1083 
   1084 static void
   1085 sched_changepri(struct lwp *l, pri_t pri)
   1086 {
   1087 
   1088 	KASSERT(lwp_locked(l, NULL));
   1089 
   1090 	if (l->l_stat == LSRUN && (l->l_flag & LW_INMEM) != 0) {
   1091 		KASSERT(lwp_locked(l, l->l_cpu->ci_schedstate.spc_mutex));
   1092 		sched_dequeue(l);
   1093 		l->l_priority = pri;
   1094 		sched_enqueue(l, false);
   1095 	} else {
   1096 		l->l_priority = pri;
   1097 	}
   1098 	resched_cpu(l);
   1099 }
   1100 
   1101 static void
   1102 sched_lendpri(struct lwp *l, pri_t pri)
   1103 {
   1104 
   1105 	KASSERT(lwp_locked(l, NULL));
   1106 
   1107 	if (l->l_stat == LSRUN && (l->l_flag & LW_INMEM) != 0) {
   1108 		KASSERT(lwp_locked(l, l->l_cpu->ci_schedstate.spc_mutex));
   1109 		sched_dequeue(l);
   1110 		l->l_inheritedprio = pri;
   1111 		sched_enqueue(l, false);
   1112 	} else {
   1113 		l->l_inheritedprio = pri;
   1114 	}
   1115 	resched_cpu(l);
   1116 }
   1117 
   1118 struct lwp *
   1119 syncobj_noowner(wchan_t wchan)
   1120 {
   1121 
   1122 	return NULL;
   1123 }
   1124 
   1125 /* decay 95% of `p_pctcpu' in 60 seconds; see CCPU_SHIFT before changing */
   1126 fixpt_t	ccpu = 0.95122942450071400909 * FSCALE;		/* exp(-1/20) */
   1127 
   1128 /*
   1129  * If `ccpu' is not equal to `exp(-1/20)' and you still want to use the
   1130  * faster/more-accurate formula, you'll have to estimate CCPU_SHIFT below
   1131  * and possibly adjust FSHIFT in "param.h" so that (FSHIFT >= CCPU_SHIFT).
   1132  *
   1133  * To estimate CCPU_SHIFT for exp(-1/20), the following formula was used:
   1134  *	1 - exp(-1/20) ~= 0.0487 ~= 0.0488 == 1 (fixed pt, *11* bits).
   1135  *
   1136  * If you don't want to bother with the faster/more-accurate formula, you
   1137  * can set CCPU_SHIFT to (FSHIFT + 1) which will use a slower/less-accurate
   1138  * (more general) method of calculating the percentage of CPU used by a process.
   1139  */
   1140 #define	CCPU_SHIFT	(FSHIFT + 1)
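
        /*
         * Worked example: sched_pstats() below reschedules itself every
         * second (callout_schedule(..., hz)), so after 60 seconds l_pctcpu
         * has been multiplied by ccpu/FSCALE sixty times, i.e. by
         * exp(-1/20)^60 == exp(-3) ~= 0.050: about 95% of the old value has
         * decayed away, as the comment above states.
         */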
   1141 
   1142 /*
   1143  * sched_pstats:
   1144  *
   1145  * Update process statistics and check CPU resource allocation.
   1146  * Call scheduler-specific hook to eventually adjust process/LWP
   1147  * priorities.
   1148  */
   1149 /* ARGSUSED */
   1150 void
   1151 sched_pstats(void *arg)
   1152 {
   1153 	struct rlimit *rlim;
   1154 	struct lwp *l;
   1155 	struct proc *p;
   1156 	int sig, clkhz;
   1157 	long runtm;
   1158 
   1159 	sched_pstats_ticks++;
   1160 
   1161 	mutex_enter(proc_lock);
   1162 	PROCLIST_FOREACH(p, &allproc) {
   1163 		/*
   1164 		 * Increment time in/out of memory and sleep time (if
   1165 		 * sleeping).  We ignore overflow; with 16-bit int's
   1166 		 * (remember them?) overflow takes 45 days.
   1167 		 */
   1168 		mutex_enter(p->p_lock);
   1169 		mutex_spin_enter(&p->p_stmutex);
   1170 		runtm = p->p_rtime.sec;
   1171 		LIST_FOREACH(l, &p->p_lwps, l_sibling) {
   1172 			if ((l->l_flag & LW_IDLE) != 0)
   1173 				continue;
   1174 			lwp_lock(l);
   1175 			runtm += l->l_rtime.sec;
   1176 			l->l_swtime++;
   1177 			sched_pstats_hook(l);
   1178 			lwp_unlock(l);
   1179 
   1180 			/*
   1181 			 * p_pctcpu is only for ps.
   1182 			 */
   1183 			l->l_pctcpu = (l->l_pctcpu * ccpu) >> FSHIFT;
   1184 			if (l->l_slptime < 1) {
   1185 				clkhz = stathz != 0 ? stathz : hz;
   1186 #if	(FSHIFT >= CCPU_SHIFT)
   1187 				l->l_pctcpu += (clkhz == 100) ?
   1188 				    ((fixpt_t)l->l_cpticks) <<
   1189 				        (FSHIFT - CCPU_SHIFT) :
   1190 				    100 * (((fixpt_t) p->p_cpticks)
   1191 				        << (FSHIFT - CCPU_SHIFT)) / clkhz;
   1192 #else
   1193 				l->l_pctcpu += ((FSCALE - ccpu) *
   1194 				    (l->l_cpticks * FSCALE / clkhz)) >> FSHIFT;
   1195 #endif
   1196 				l->l_cpticks = 0;
   1197 			}
   1198 		}
   1199 		p->p_pctcpu = (p->p_pctcpu * ccpu) >> FSHIFT;
   1200 		mutex_spin_exit(&p->p_stmutex);
   1201 
   1202 		/*
   1203 		 * Check if the process exceeds its CPU resource allocation.
   1204 		 * If over max, kill it.
   1205 		 */
   1206 		rlim = &p->p_rlimit[RLIMIT_CPU];
   1207 		sig = 0;
   1208 		if (runtm >= rlim->rlim_cur) {
   1209 			if (runtm >= rlim->rlim_max)
   1210 				sig = SIGKILL;
   1211 			else {
   1212 				sig = SIGXCPU;
   1213 				if (rlim->rlim_cur < rlim->rlim_max)
   1214 					rlim->rlim_cur += 5;
   1215 			}
   1216 		}
   1217 		mutex_exit(p->p_lock);
   1218 		if (sig)
   1219 			psignal(p, sig);
   1220 	}
   1221 	mutex_exit(proc_lock);
   1222 	uvm_meter();
   1223 	cv_wakeup(&lbolt);
   1224 	callout_schedule(&sched_pstats_ch, hz);
   1225 }
   1226 
   1227 void
   1228 sched_init(void)
   1229 {
   1230 
   1231 	cv_init(&lbolt, "lbolt");
   1232 	callout_init(&sched_pstats_ch, CALLOUT_MPSAFE);
   1233 	callout_setfunc(&sched_pstats_ch, sched_pstats, NULL);
   1234 
   1235 	/* Balancing */
   1236 	worker_ci = curcpu();
   1237 	cacheht_time = mstohz(5);		/* ~5 ms  */
   1238 	balance_period = mstohz(300);		/* ~300ms */
   1239 
   1240 	/* Minimal count of LWPs for catching: log2(count of CPUs) */
   1241 	min_catch = min(ilog2(ncpu), 4);
   1242 
   1243 	evcnt_attach_dynamic(&kpreempt_ev_crit, EVCNT_TYPE_INTR, NULL,
   1244 	   "kpreempt", "defer: critical section");
   1245 	evcnt_attach_dynamic(&kpreempt_ev_klock, EVCNT_TYPE_INTR, NULL,
   1246 	   "kpreempt", "defer: kernel_lock");
   1247 	evcnt_attach_dynamic(&kpreempt_ev_ipl, EVCNT_TYPE_INTR, NULL,
   1248 	   "kpreempt", "defer: IPL");
   1249 	evcnt_attach_dynamic(&kpreempt_ev_immed, EVCNT_TYPE_INTR, NULL,
   1250 	   "kpreempt", "immediate");
   1251 
   1252 	/* Initialize balancing callout and run it */
   1253 #ifdef MULTIPROCESSOR
   1254 	callout_init(&balance_ch, CALLOUT_MPSAFE);
   1255 	callout_setfunc(&balance_ch, sched_balance, NULL);
   1256 	callout_schedule(&balance_ch, balance_period);
   1257 #endif
   1258 	sched_pstats(NULL);
   1259 }
   1260 
   1261 SYSCTL_SETUP(sysctl_sched_setup, "sysctl sched setup")
   1262 {
   1263 	const struct sysctlnode *node = NULL;
   1264 
   1265 	sysctl_createv(clog, 0, NULL, NULL,
   1266 		CTLFLAG_PERMANENT,
   1267 		CTLTYPE_NODE, "kern", NULL,
   1268 		NULL, 0, NULL, 0,
   1269 		CTL_KERN, CTL_EOL);
   1270 	sysctl_createv(clog, 0, NULL, &node,
   1271 		CTLFLAG_PERMANENT,
   1272 		CTLTYPE_NODE, "sched",
   1273 		SYSCTL_DESCR("Scheduler options"),
   1274 		NULL, 0, NULL, 0,
   1275 		CTL_KERN, CTL_CREATE, CTL_EOL);
   1276 
   1277 	if (node == NULL)
   1278 		return;
   1279 
   1280 	sysctl_createv(clog, 0, &node, NULL,
   1281 		CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
   1282 		CTLTYPE_INT, "cacheht_time",
   1283 		SYSCTL_DESCR("Cache hotness time (in ticks)"),
   1284 		NULL, 0, &cacheht_time, 0,
   1285 		CTL_CREATE, CTL_EOL);
   1286 	sysctl_createv(clog, 0, &node, NULL,
   1287 		CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
   1288 		CTLTYPE_INT, "balance_period",
   1289 		SYSCTL_DESCR("Balance period (in ticks)"),
   1290 		NULL, 0, &balance_period, 0,
   1291 		CTL_CREATE, CTL_EOL);
   1292 	sysctl_createv(clog, 0, &node, NULL,
   1293 		CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
   1294 		CTLTYPE_INT, "min_catch",
   1295 		SYSCTL_DESCR("Minimal count of threads for catching"),
   1296 		NULL, 0, &min_catch, 0,
   1297 		CTL_CREATE, CTL_EOL);
   1298 	sysctl_createv(clog, 0, &node, NULL,
   1299 		CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
   1300 		CTLTYPE_INT, "timesoftints",
   1301 		SYSCTL_DESCR("Track CPU time for soft interrupts"),
   1302 		NULL, 0, &softint_timing, 0,
   1303 		CTL_CREATE, CTL_EOL);
   1304 	sysctl_createv(clog, 0, &node, NULL,
   1305 		CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
   1306 		CTLTYPE_INT, "kpreempt_pri",
   1307 		SYSCTL_DESCR("Minimum priority to trigger kernel preemption"),
   1308 		NULL, 0, &sched_kpreempt_pri, 0,
   1309 		CTL_CREATE, CTL_EOL);
   1310 	sysctl_createv(clog, 0, &node, NULL,
   1311 		CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
   1312 		CTLTYPE_INT, "upreempt_pri",
   1313 		SYSCTL_DESCR("Minimum priority to trigger user preemption"),
   1314 		NULL, 0, &sched_upreempt_pri, 0,
   1315 		CTL_CREATE, CTL_EOL);
   1316 }
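
        /*
         * The nodes created above land under kern.sched and can be read or
         * tuned from userland with sysctl(8), for example (values shown are
         * arbitrary):
         *
         *	sysctl kern.sched.balance_period
         *	sysctl -w kern.sched.cacheht_time=10
         */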
   1317 
   1318 void
   1319 sched_cpuattach(struct cpu_info *ci)
   1320 {
   1321 	runqueue_t *ci_rq;
   1322 	void *rq_ptr;
   1323 	u_int i, size;
   1324 
   1325 	if (ci->ci_schedstate.spc_lwplock == NULL) {
   1326 		ci->ci_schedstate.spc_lwplock =
   1327 		    mutex_obj_alloc(MUTEX_DEFAULT, IPL_SCHED);
   1328 	}
   1329 	if (ci == lwp0.l_cpu) {
   1330 		/* Initialize the scheduler structure of the primary LWP */
   1331 		lwp0.l_mutex = ci->ci_schedstate.spc_lwplock;
   1332 	}
   1333 	if (ci->ci_schedstate.spc_mutex != NULL) {
   1334 		/* Already initialized. */
   1335 		return;
   1336 	}
   1337 
   1338 	/* Allocate the run queue */
   1339 	size = roundup2(sizeof(runqueue_t), coherency_unit) + coherency_unit;
   1340 	rq_ptr = kmem_zalloc(size, KM_SLEEP);
   1341 	if (rq_ptr == NULL) {
   1342 		panic("sched_cpuattach: could not allocate the runqueue");
   1343 	}
   1344 	ci_rq = (void *)(roundup2((uintptr_t)(rq_ptr), coherency_unit));
   1345 
   1346 	/* Initialize run queues */
   1347 	ci->ci_schedstate.spc_mutex =
   1348 	    mutex_obj_alloc(MUTEX_DEFAULT, IPL_SCHED);
   1349 	for (i = 0; i < PRI_RT_COUNT; i++)
   1350 		TAILQ_INIT(&ci_rq->r_rt_queue[i].q_head);
   1351 	for (i = 0; i < PRI_TS_COUNT; i++)
   1352 		TAILQ_INIT(&ci_rq->r_ts_queue[i].q_head);
   1353 
   1354 	ci->ci_schedstate.spc_sched_info = ci_rq;
   1355 }
   1356 
   1357 /*
   1358  * Control of the runqueue.
   1359  */
   1360 
   1361 static void *
   1362 sched_getrq(runqueue_t *ci_rq, const pri_t prio)
   1363 {
   1364 
   1365 	KASSERT(prio < PRI_COUNT);
   1366 	return (prio <= PRI_HIGHEST_TS) ?
   1367 	    &ci_rq->r_ts_queue[prio].q_head :
   1368 	    &ci_rq->r_rt_queue[prio - PRI_HIGHEST_TS - 1].q_head;
   1369 }
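
        /*
         * Worked example: with PRI_HIGHEST_TS == MAXPRI_USER, a time-sharing
         * priority p <= PRI_HIGHEST_TS selects r_ts_queue[p], while a
         * real-time priority selects r_rt_queue[p - PRI_HIGHEST_TS - 1], so
         * the lowest real-time priority lands in r_rt_queue[0].
         */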
   1370 
   1371 void
   1372 sched_enqueue(struct lwp *l, bool swtch)
   1373 {
   1374 	runqueue_t *ci_rq;
   1375 	struct schedstate_percpu *spc;
   1376 	TAILQ_HEAD(, lwp) *q_head;
   1377 	const pri_t eprio = lwp_eprio(l);
   1378 	struct cpu_info *ci;
   1379 	int type;
   1380 
   1381 	ci = l->l_cpu;
   1382 	spc = &ci->ci_schedstate;
   1383 	ci_rq = spc->spc_sched_info;
   1384 	KASSERT(lwp_locked(l, l->l_cpu->ci_schedstate.spc_mutex));
   1385 
   1386 	/* Update the last run time on switch */
   1387 	if (__predict_true(swtch == true)) {
   1388 		l->l_rticksum += (hardclock_ticks - l->l_rticks);
   1389 		l->l_rticks = hardclock_ticks;
   1390 	} else if (l->l_rticks == 0)
   1391 		l->l_rticks = hardclock_ticks;
   1392 
   1393 	/* Enqueue the thread */
   1394 	q_head = sched_getrq(ci_rq, eprio);
   1395 	if (TAILQ_EMPTY(q_head)) {
   1396 		u_int i;
   1397 		uint32_t q;
   1398 
   1399 		/* Mark bit */
   1400 		i = eprio >> BITMAP_SHIFT;
   1401 		q = BITMAP_MSB >> (eprio & BITMAP_MASK);
   1402 		KASSERT((ci_rq->r_bitmap[i] & q) == 0);
   1403 		ci_rq->r_bitmap[i] |= q;
   1404 	}
   1405 	TAILQ_INSERT_TAIL(q_head, l, l_runq);
   1406 	ci_rq->r_count++;
   1407 	if ((l->l_pflag & LP_BOUND) == 0)
   1408 		ci_rq->r_mcount++;
   1409 
   1410 	/*
   1411 	 * Update the value of highest priority in the runqueue,
   1412 	 * if priority of this thread is higher.
   1413 	 */
   1414 	if (eprio > spc->spc_maxpriority)
   1415 		spc->spc_maxpriority = eprio;
   1416 
   1417 	sched_newts(l);
   1418 
   1419 	/*
   1420 	 * Wake the chosen CPU or cause a preemption if the newly
   1421 	 * enqueued thread has higher priority.  Don't cause a
   1422 	 * preemption if the thread is yielding (swtch).
   1423 	 */
   1424 	if (!swtch && eprio > spc->spc_curpriority) {
   1425 		if (eprio >= sched_kpreempt_pri)
   1426 			type = RESCHED_KPREEMPT;
   1427 		else if (eprio >= sched_upreempt_pri)
   1428 			type = RESCHED_IMMED;
   1429 		else
   1430 			type = 0;
   1431 		cpu_need_resched(ci, type);
   1432 	}
   1433 }
   1434 
   1435 void
   1436 sched_dequeue(struct lwp *l)
   1437 {
   1438 	runqueue_t *ci_rq;
   1439 	TAILQ_HEAD(, lwp) *q_head;
   1440 	struct schedstate_percpu *spc;
   1441 	const pri_t eprio = lwp_eprio(l);
   1442 
   1443 	spc = & l->l_cpu->ci_schedstate;
   1444 	ci_rq = spc->spc_sched_info;
   1445 	KASSERT(lwp_locked(l, spc->spc_mutex));
   1446 
   1447 	KASSERT(eprio <= spc->spc_maxpriority);
   1448 	KASSERT(ci_rq->r_bitmap[eprio >> BITMAP_SHIFT] != 0);
   1449 	KASSERT(ci_rq->r_count > 0);
   1450 
   1451 	ci_rq->r_count--;
   1452 	if ((l->l_pflag & LP_BOUND) == 0)
   1453 		ci_rq->r_mcount--;
   1454 
   1455 	q_head = sched_getrq(ci_rq, eprio);
   1456 	TAILQ_REMOVE(q_head, l, l_runq);
   1457 	if (TAILQ_EMPTY(q_head)) {
   1458 		u_int i;
   1459 		uint32_t q;
   1460 
   1461 		/* Unmark bit */
   1462 		i = eprio >> BITMAP_SHIFT;
   1463 		q = BITMAP_MSB >> (eprio & BITMAP_MASK);
   1464 		KASSERT((ci_rq->r_bitmap[i] & q) != 0);
   1465 		ci_rq->r_bitmap[i] &= ~q;
   1466 
   1467 		/*
   1468 		 * Update the value of the highest priority in the runqueue, in
   1469 		 * case it was the last thread in the queue of highest priority.
   1470 		 */
   1471 		if (eprio != spc->spc_maxpriority)
   1472 			return;
   1473 
   1474 		do {
   1475 			if (ci_rq->r_bitmap[i] != 0) {
   1476 				q = ffs(ci_rq->r_bitmap[i]);
   1477 				spc->spc_maxpriority =
   1478 				    (i << BITMAP_SHIFT) + (BITMAP_BITS - q);
   1479 				return;
   1480 			}
   1481 		} while (i--);
   1482 
   1483 		/* If not found - set the lowest value */
   1484 		spc->spc_maxpriority = 0;
   1485 	}
   1486 }
   1487 
   1488 /*
   1489  * Migration and balancing.
   1490  */
   1491 
   1492 #ifdef MULTIPROCESSOR
   1493 
   1494 /* Estimate if LWP is cache-hot */
   1495 static inline bool
   1496 lwp_cache_hot(const struct lwp *l)
   1497 {
   1498 
   1499 	if (l->l_slptime || l->l_rticks == 0)
   1500 		return false;
   1501 
   1502 	return (hardclock_ticks - l->l_rticks <= cacheht_time);
   1503 }
   1504 
   1505 /* Check if LWP can migrate to the chosen CPU */
   1506 static inline bool
   1507 sched_migratable(const struct lwp *l, struct cpu_info *ci)
   1508 {
   1509 	const struct schedstate_percpu *spc = &ci->ci_schedstate;
   1510 
   1511 	/* CPU is offline */
   1512 	if (__predict_false(spc->spc_flags & SPCF_OFFLINE))
   1513 		return false;
   1514 
   1515 	/* Affinity bind */
   1516 	if (__predict_false(l->l_flag & LW_AFFINITY))
   1517 		return CPU_ISSET(cpu_index(ci), &l->l_affinity);
   1518 
   1519 	/* Processor-set */
   1520 	return (spc->spc_psid == l->l_psid);
   1521 }
   1522 
   1523 /*
   1524  * Estimate whether the LWP should migrate to another CPU.
   1525  * Return the CPU to run on: the current one if no migration is needed.
   1526  */
   1527 struct cpu_info *
   1528 sched_takecpu(struct lwp *l)
   1529 {
   1530 	struct cpu_info *ci, *tci, *first, *next;
   1531 	struct schedstate_percpu *spc;
   1532 	runqueue_t *ci_rq, *ici_rq;
   1533 	pri_t eprio, lpri, pri;
   1534 
   1535 	KASSERT(lwp_locked(l, NULL));
   1536 
   1537 	ci = l->l_cpu;
   1538 	spc = &ci->ci_schedstate;
   1539 	ci_rq = spc->spc_sched_info;
   1540 
   1541 	/* If thread is strictly bound, do not estimate other CPUs */
   1542 	if (l->l_pflag & LP_BOUND)
   1543 		return ci;
   1544 
   1545 	/* CPU of this thread is idling - run there */
   1546 	if (ci_rq->r_count == 0)
   1547 		return ci;
   1548 
   1549 	eprio = lwp_eprio(l);
   1550 
   1551 	/* Stay if thread is cache-hot */
   1552 	if (__predict_true(l->l_stat != LSIDL) &&
   1553 	    lwp_cache_hot(l) && eprio >= spc->spc_curpriority)
   1554 		return ci;
   1555 
   1556 	/* Run on current CPU if priority of thread is higher */
   1557 	ci = curcpu();
   1558 	spc = &ci->ci_schedstate;
   1559 	if (eprio > spc->spc_curpriority && sched_migratable(l, ci))
   1560 		return ci;
   1561 
   1562 	/*
   1563 	 * Look for the CPU with the lowest priority thread.  In case of
   1564 	 * equal priority, choose the CPU with the fewest threads.
   1565 	 */
   1566 	first = l->l_cpu;
   1567 	ci = first;
   1568 	tci = first;
   1569 	lpri = PRI_COUNT;
   1570 	do {
   1571 		next = CIRCLEQ_LOOP_NEXT(&cpu_queue, ci, ci_data.cpu_qchain);
   1572 		spc = &ci->ci_schedstate;
   1573 		ici_rq = spc->spc_sched_info;
   1574 		pri = max(spc->spc_curpriority, spc->spc_maxpriority);
   1575 		if (pri > lpri)
   1576 			continue;
   1577 
   1578 		if (pri == lpri && ci_rq->r_count < ici_rq->r_count)
   1579 			continue;
   1580 
   1581 		if (!sched_migratable(l, ci))
   1582 			continue;
   1583 
   1584 		lpri = pri;
   1585 		tci = ci;
   1586 		ci_rq = ici_rq;
   1587 	} while (ci = next, ci != first);
   1588 
   1589 	return tci;
   1590 }
   1591 
   1592 /*
   1593  * Try to catch an LWP from the runqueue of another CPU.
   1594  */
   1595 static struct lwp *
   1596 sched_catchlwp(void)
   1597 {
   1598 	struct cpu_info *curci = curcpu(), *ci = worker_ci;
   1599 	struct schedstate_percpu *spc;
   1600 	TAILQ_HEAD(, lwp) *q_head;
   1601 	runqueue_t *ci_rq;
   1602 	struct lwp *l;
   1603 
   1604 	if (curci == ci)
   1605 		return NULL;
   1606 
   1607 	/* Lockless check */
   1608 	spc = &ci->ci_schedstate;
   1609 	ci_rq = spc->spc_sched_info;
   1610 	if (ci_rq->r_mcount < min_catch)
   1611 		return NULL;
   1612 
   1613 	/*
   1614 	 * Double-lock the runqueues.
   1615 	 */
   1616 	if (curci < ci) {
   1617 		spc_lock(ci);
   1618 	} else if (!mutex_tryenter(ci->ci_schedstate.spc_mutex)) {
   1619 		const runqueue_t *cur_rq = curci->ci_schedstate.spc_sched_info;
   1620 
   1621 		spc_unlock(curci);
   1622 		spc_lock(ci);
   1623 		spc_lock(curci);
   1624 
   1625 		if (cur_rq->r_count) {
   1626 			spc_unlock(ci);
   1627 			return NULL;
   1628 		}
   1629 	}
   1630 
   1631 	if (ci_rq->r_mcount < min_catch) {
   1632 		spc_unlock(ci);
   1633 		return NULL;
   1634 	}
   1635 
   1636 	/* Take the highest priority thread */
   1637 	q_head = sched_getrq(ci_rq, spc->spc_maxpriority);
   1638 	l = TAILQ_FIRST(q_head);
   1639 
   1640 	for (;;) {
   1641 		/* Check the first and next result from the queue */
   1642 		if (l == NULL)
   1643 			break;
   1644 		KASSERT(l->l_stat == LSRUN);
   1645 		KASSERT(l->l_flag & LW_INMEM);
   1646 
   1647 		/* Look for threads that are allowed to migrate */
   1648 		if ((l->l_pflag & LP_BOUND) || lwp_cache_hot(l) ||
   1649 		    !sched_migratable(l, curci)) {
   1650 			l = TAILQ_NEXT(l, l_runq);
   1651 			continue;
   1652 		}
   1653 
   1654 		/* Grab the thread, and move to the local run queue */
   1655 		sched_dequeue(l);
   1656 		l->l_cpu = curci;
   1657 		lwp_unlock_to(l, curci->ci_schedstate.spc_mutex);
   1658 		sched_enqueue(l, false);
   1659 		return l;
   1660 	}
   1661 	spc_unlock(ci);
   1662 
   1663 	return l;
   1664 }
   1665 
   1666 /*
   1667  * Periodic calculations for balancing.
   1668  */
   1669 static void
   1670 sched_balance(void *nocallout)
   1671 {
   1672 	struct cpu_info *ci, *hci;
   1673 	runqueue_t *ci_rq;
   1674 	CPU_INFO_ITERATOR cii;
   1675 	u_int highest;
   1676 
   1677 	hci = curcpu();
   1678 	highest = 0;
   1679 
   1680 	/* Do the counting locklessly */
   1681 	for (CPU_INFO_FOREACH(cii, ci)) {
   1682 		ci_rq = ci->ci_schedstate.spc_sched_info;
   1683 
   1684 		/* Average count of the threads */
   1685 		ci_rq->r_avgcount = (ci_rq->r_avgcount + ci_rq->r_mcount) >> 1;
   1686 
   1687 		/* Look for CPU with the highest average */
   1688 		if (ci_rq->r_avgcount > highest) {
   1689 			hci = ci;
   1690 			highest = ci_rq->r_avgcount;
   1691 		}
   1692 	}
   1693 
   1694 	/* Update the worker */
   1695 	worker_ci = hci;
   1696 
   1697 	if (nocallout == NULL)
   1698 		callout_schedule(&balance_ch, balance_period);
   1699 }
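
        /*
         * Worked example of the averaging above: r_avgcount is an integer
         * exponential moving average with weight 1/2, so successive r_mcount
         * samples of 4, 4 and 0 yield averages of 2, 3 and 1 (the shift
         * truncates).  The CPU with the highest average becomes worker_ci,
         * the preferred victim for sched_catchlwp().
         */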
   1700 
   1701 #else
   1702 
   1703 struct cpu_info *
   1704 sched_takecpu(struct lwp *l)
   1705 {
   1706 
   1707 	return l->l_cpu;
   1708 }
   1709 
   1710 #endif	/* MULTIPROCESSOR */
   1711 
   1712 /*
   1713  * Scheduler mill.
   1714  */
   1715 struct lwp *
   1716 sched_nextlwp(void)
   1717 {
   1718 	struct cpu_info *ci = curcpu();
   1719 	struct schedstate_percpu *spc;
   1720 	TAILQ_HEAD(, lwp) *q_head;
   1721 	runqueue_t *ci_rq;
   1722 	struct lwp *l;
   1723 
   1724 	spc = &ci->ci_schedstate;
   1725 	ci_rq = spc->spc_sched_info;
   1726 
   1727 #ifdef MULTIPROCESSOR
   1728 	/* If the runqueue is empty, try to catch a thread from another CPU */
   1729 	if (__predict_false(spc->spc_flags & SPCF_OFFLINE)) {
   1730 		if ((ci_rq->r_count - ci_rq->r_mcount) == 0)
   1731 			return NULL;
   1732 	} else if (ci_rq->r_count == 0) {
   1733 		/* Reset the counter, and call the balancer */
   1734 		ci_rq->r_avgcount = 0;
   1735 		sched_balance(ci);
   1736 
   1737 		/* The re-locking will be done inside */
   1738 		return sched_catchlwp();
   1739 	}
   1740 #else
   1741 	if (ci_rq->r_count == 0)
   1742 		return NULL;
   1743 #endif
   1744 
   1745 	/* Take the highest priority thread */
   1746 	KASSERT(ci_rq->r_bitmap[spc->spc_maxpriority >> BITMAP_SHIFT]);
   1747 	q_head = sched_getrq(ci_rq, spc->spc_maxpriority);
   1748 	l = TAILQ_FIRST(q_head);
   1749 	KASSERT(l != NULL);
   1750 
   1751 	sched_oncpu(l);
   1752 	l->l_rticks = hardclock_ticks;
   1753 
   1754 	return l;
   1755 }
   1756 
   1757 bool
   1758 sched_curcpu_runnable_p(void)
   1759 {
   1760 	const struct cpu_info *ci;
   1761 	const runqueue_t *ci_rq;
   1762 	bool rv;
   1763 
   1764 	kpreempt_disable();
   1765 	ci = curcpu();
   1766 	ci_rq = ci->ci_schedstate.spc_sched_info;
   1767 
   1768 #ifndef __HAVE_FAST_SOFTINTS
   1769 	if (ci->ci_data.cpu_softints) {
   1770 		kpreempt_enable();
   1771 		return true;
   1772 	}
   1773 #endif
   1774 
   1775 	if (ci->ci_schedstate.spc_flags & SPCF_OFFLINE)
   1776 		rv = (ci_rq->r_count - ci_rq->r_mcount);
   1777 	else
   1778 		rv = ci_rq->r_count != 0;
   1779 	kpreempt_enable();
   1780 
   1781 	return rv;
   1782 }
   1783 
   1784 /*
   1785  * Debugging.
   1786  */
   1787 
   1788 #ifdef DDB
   1789 
   1790 void
   1791 sched_print_runqueue(void (*pr)(const char *, ...)
   1792     __attribute__((__format__(__printf__,1,2))))
   1793 {
   1794 	runqueue_t *ci_rq;
   1795 	struct schedstate_percpu *spc;
   1796 	struct lwp *l;
   1797 	struct proc *p;
   1798 	int i;
   1799 	struct cpu_info *ci;
   1800 	CPU_INFO_ITERATOR cii;
   1801 
   1802 	for (CPU_INFO_FOREACH(cii, ci)) {
   1803 		spc = &ci->ci_schedstate;
   1804 		ci_rq = spc->spc_sched_info;
   1805 
   1806 		(*pr)("Run-queue (CPU = %u):\n", ci->ci_index);
   1807 		(*pr)(" pid.lid = %d.%d, threads count = %u, "
   1808 		    "avgcount = %u, highest pri = %d\n",
   1809 #ifdef MULTIPROCESSOR
   1810 		    ci->ci_curlwp->l_proc->p_pid, ci->ci_curlwp->l_lid,
   1811 #else
   1812 		    curlwp->l_proc->p_pid, curlwp->l_lid,
   1813 #endif
   1814 		    ci_rq->r_count, ci_rq->r_avgcount, spc->spc_maxpriority);
   1815 		i = (PRI_COUNT >> BITMAP_SHIFT) - 1;
   1816 		do {
   1817 			uint32_t q;
   1818 			q = ci_rq->r_bitmap[i];
   1819 			(*pr)(" bitmap[%d] => [ %d (0x%x) ]\n", i, ffs(q), q);
   1820 		} while (i--);
   1821 	}
   1822 
   1823 	(*pr)("   %5s %4s %4s %10s %3s %18s %4s %s\n",
   1824 	    "LID", "PRI", "EPRI", "FL", "ST", "LWP", "CPU", "LRTIME");
   1825 
   1826 	PROCLIST_FOREACH(p, &allproc) {
   1827 		(*pr)(" /- %d (%s)\n", (int)p->p_pid, p->p_comm);
   1828 		LIST_FOREACH(l, &p->p_lwps, l_sibling) {
   1829 			ci = l->l_cpu;
   1830 			(*pr)(" | %5d %4u %4u 0x%8.8x %3s %18p %4u %u\n",
   1831 			    (int)l->l_lid, l->l_priority, lwp_eprio(l),
   1832 			    l->l_flag, l->l_stat == LSRUN ? "RQ" :
   1833 			    (l->l_stat == LSSLEEP ? "SQ" : "-"),
   1834 			    l, ci->ci_index,
   1835 			    (u_int)(hardclock_ticks - l->l_rticks));
   1836 		}
   1837 	}
   1838 }
   1839 
   1840 #endif
   1841