      1 /*	$NetBSD: kern_runq.c,v 1.71 2025/01/17 04:11:33 mrg Exp $	*/
      2 
      3 /*-
      4  * Copyright (c) 2019, 2020 The NetBSD Foundation, Inc.
      5  * All rights reserved.
      6  *
      7  * This code is derived from software contributed to The NetBSD Foundation
      8  * by Andrew Doran.
      9  *
     10  * Redistribution and use in source and binary forms, with or without
     11  * modification, are permitted provided that the following conditions
     12  * are met:
     13  * 1. Redistributions of source code must retain the above copyright
     14  *    notice, this list of conditions and the following disclaimer.
     15  * 2. Redistributions in binary form must reproduce the above copyright
     16  *    notice, this list of conditions and the following disclaimer in the
     17  *    documentation and/or other materials provided with the distribution.
     18  *
     19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
     20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
     21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
     23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
     24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
     25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
     26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
     27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
     28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
     29  * POSSIBILITY OF SUCH DAMAGE.
     30  */
     31 
     32 /*
     33  * Copyright (c) 2007, 2008 Mindaugas Rasiukevicius <rmind at NetBSD org>
     34  * All rights reserved.
     35  *
     36  * Redistribution and use in source and binary forms, with or without
     37  * modification, are permitted provided that the following conditions
     38  * are met:
     39  * 1. Redistributions of source code must retain the above copyright
     40  *    notice, this list of conditions and the following disclaimer.
     41  * 2. Redistributions in binary form must reproduce the above copyright
     42  *    notice, this list of conditions and the following disclaimer in the
     43  *    documentation and/or other materials provided with the distribution.
     44  *
     45  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
     46  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     47  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     48  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
     49  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
     50  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
     51  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
     52  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
     53  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
     54  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
     55  * SUCH DAMAGE.
     56  */
     57 
     58 #include <sys/cdefs.h>
     59 __KERNEL_RCSID(0, "$NetBSD: kern_runq.c,v 1.71 2025/01/17 04:11:33 mrg Exp $");
     60 
     61 #include "opt_dtrace.h"
     62 
     63 #include <sys/param.h>
     64 #include <sys/kernel.h>
     65 #include <sys/bitops.h>
     66 #include <sys/cpu.h>
     67 #include <sys/idle.h>
     68 #include <sys/intr.h>
     69 #include <sys/kmem.h>
     70 #include <sys/lwp.h>
     71 #include <sys/mutex.h>
     72 #include <sys/proc.h>
     73 #include <sys/pset.h>
     74 #include <sys/sched.h>
     75 #include <sys/syscallargs.h>
     76 #include <sys/sysctl.h>
     77 #include <sys/systm.h>
     78 #include <sys/types.h>
     79 #include <sys/evcnt.h>
     80 #include <sys/atomic.h>
     81 
     82 /*
      83  * Bits per bitmap word.
     84  */
     85 #define	BITMAP_BITS	(32)
     86 #define	BITMAP_SHIFT	(5)
     87 #define	BITMAP_MSB	(0x80000000U)
     88 #define	BITMAP_MASK	(BITMAP_BITS - 1)
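         /*
          * Worked example of the encoding used by sched_enqueue() and
          * sched_dequeue(): for eprio == 37 the word index is
          * 37 >> BITMAP_SHIFT == 1 and the bit is
          * BITMAP_MSB >> (37 & BITMAP_MASK) == 0x04000000.  Within a word
          * the least significant set bit is the highest priority present,
          * so the reverse mapping (i << BITMAP_SHIFT) + (BITMAP_BITS -
          * ffs(word)) gives (1 << 5) + (32 - 27) == 37 again.
          */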
     89 
     90 const int	schedppq = 1;
     91 
     92 static void	*sched_getrq(struct schedstate_percpu *, const pri_t);
     93 #ifdef MULTIPROCESSOR
     94 static lwp_t *	sched_catchlwp(struct cpu_info *);
     95 #endif
     96 
     97 /*
     98  * Preemption control.
     99  */
    100 #ifdef __HAVE_PREEMPTION
    101 # ifdef DEBUG
    102 int		sched_kpreempt_pri = 0;
    103 # else
    104 int		sched_kpreempt_pri = PRI_USER_RT;
    105 # endif
    106 #else
    107 int		sched_kpreempt_pri = 1000;
    108 #endif
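         /*
          * Note: 1000 is above any valid priority, so without
          * __HAVE_PREEMPTION the threshold is simply never reached; DEBUG
          * kernels use 0, presumably to exercise kernel preemption as often
          * as possible.
          */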
    109 
    110 /*
    111  * Migration and balancing.
    112  */
    113 static u_int	cacheht_time;	/* Cache hotness time */
    114 static u_int	min_catch;	/* Minimal LWP count for catching */
    115 static u_int	skim_interval;	/* Rate limit for stealing LWPs */
    116 
    117 #ifdef KDTRACE_HOOKS
    118 struct lwp *curthread;
    119 #endif
    120 
    121 void
    122 runq_init(void)
    123 {
    124 
     125 	/* An LWP pulled from a remote package must not have run for 10ms. */
    126 	cacheht_time = 10;
    127 
    128 	/* Minimal count of LWPs for catching */
    129 	min_catch = 1;
    130 
    131 	/* Steal from other CPUs at most every 10ms. */
    132 	skim_interval = 10;
    133 }
    134 
    135 void
    136 sched_cpuattach(struct cpu_info *ci)
    137 {
    138 	struct schedstate_percpu *spc;
    139 	size_t size;
    140 	void *p;
    141 	u_int i;
    142 
    143 	spc = &ci->ci_schedstate;
    144 	spc->spc_nextpkg = ci;
    145 
    146 	if (spc->spc_lwplock == NULL) {
    147 		spc->spc_lwplock = mutex_obj_alloc(MUTEX_DEFAULT, IPL_SCHED);
    148 	}
    149 	if (ci == lwp0.l_cpu) {
    150 		/* Initialize the scheduler structure of the primary LWP */
    151 		lwp0.l_mutex = spc->spc_lwplock;
    152 	}
    153 	if (spc->spc_mutex != NULL) {
    154 		/* Already initialized. */
    155 		return;
    156 	}
    157 
    158 	/* Allocate the run queue */
    159 	size = roundup2(sizeof(spc->spc_queue[0]) * PRI_COUNT, coherency_unit) +
    160 	    coherency_unit;
    161 	p = kmem_alloc(size, KM_SLEEP);
    162 	spc->spc_queue = (void *)roundup2((uintptr_t)p, coherency_unit);
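         	/*
         	 * The roundup2() to coherency_unit gives the queue array cache
         	 * line alignment, presumably to avoid false sharing with
         	 * unrelated data.
         	 */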
    163 
    164 	/* Initialize run queues */
    165 	spc->spc_mutex = mutex_obj_alloc(MUTEX_DEFAULT, IPL_SCHED);
    166 	for (i = 0; i < PRI_COUNT; i++)
    167 		TAILQ_INIT(&spc->spc_queue[i]);
    168 }
    169 
    170 /*
    171  * Control of the runqueue.
    172  */
    173 static inline void *
    174 sched_getrq(struct schedstate_percpu *spc, const pri_t prio)
    175 {
    176 
    177 	KASSERT(prio < PRI_COUNT);
    178 	return &spc->spc_queue[prio];
    179 }
    180 
    181 /*
    182  * Put an LWP onto a run queue.  The LWP must be locked by spc_mutex for
    183  * l_cpu.
    184  */
    185 void
    186 sched_enqueue(struct lwp *l)
    187 {
    188 	struct schedstate_percpu *spc;
    189 	TAILQ_HEAD(, lwp) *q_head;
    190 	const pri_t eprio = lwp_eprio(l);
    191 	struct cpu_info *ci;
    192 
    193 	ci = l->l_cpu;
    194 	spc = &ci->ci_schedstate;
    195 	KASSERT(lwp_locked(l, l->l_cpu->ci_schedstate.spc_mutex));
    196 
    197 	/* Enqueue the thread */
    198 	q_head = sched_getrq(spc, eprio);
    199 	if (TAILQ_EMPTY(q_head)) {
    200 		u_int i;
    201 		uint32_t q;
    202 
    203 		/* Mark bit */
    204 		i = eprio >> BITMAP_SHIFT;
    205 		q = BITMAP_MSB >> (eprio & BITMAP_MASK);
    206 		KASSERT((spc->spc_bitmap[i] & q) == 0);
    207 		spc->spc_bitmap[i] |= q;
    208 	}
    209 
    210 	/*
    211 	 * Determine run queue position according to POSIX.  XXX Explicitly
    212 	 * lowering a thread's priority with pthread_setschedparam() is not
    213 	 * handled.
    214 	 */
    215 	if ((l->l_pflag & LP_PREEMPTING) != 0) {
    216 		switch (l->l_class) {
    217 		case SCHED_OTHER:
    218 			TAILQ_INSERT_TAIL(q_head, l, l_runq);
    219 			break;
    220 		case SCHED_FIFO:
    221 			TAILQ_INSERT_HEAD(q_head, l, l_runq);
    222 			break;
    223 		case SCHED_RR:
    224 			if (getticks() - l->l_rticks >= sched_rrticks) {
    225 				TAILQ_INSERT_TAIL(q_head, l, l_runq);
    226 			} else {
    227 				TAILQ_INSERT_HEAD(q_head, l, l_runq);
    228 			}
    229 			break;
    230 		default:
    231 			panic("sched_enqueue: LWP %p has class %d\n",
    232 			    l, l->l_class);
    233 		}
    234 	} else {
    235 		TAILQ_INSERT_TAIL(q_head, l, l_runq);
    236 	}
    237 	spc->spc_flags &= ~SPCF_IDLE;
    238 	spc->spc_count++;
    239 	if ((l->l_pflag & LP_BOUND) == 0) {
    240 		atomic_store_relaxed(&spc->spc_mcount,
    241 		    atomic_load_relaxed(&spc->spc_mcount) + 1);
    242 	}
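         	/*
         	 * spc_mcount counts only migratable (unbound) LWPs; other CPUs
         	 * read it lock-free when deciding whether to steal, hence the
         	 * relaxed atomics.
         	 */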
    243 
    244 	/*
     245 	 * Update the value of the highest priority in the runqueue,
     246 	 * if the priority of this thread is higher.
    247 	 */
    248 	if (eprio > spc->spc_maxpriority)
    249 		spc->spc_maxpriority = eprio;
    250 
    251 	sched_newts(l);
    252 }
    253 
    254 /*
     255  * Remove an LWP from the run queue it's on.  The LWP must be in state
    256  * LSRUN.
    257  */
    258 void
    259 sched_dequeue(struct lwp *l)
    260 {
    261 	TAILQ_HEAD(, lwp) *q_head;
    262 	struct schedstate_percpu *spc;
    263 	const pri_t eprio = lwp_eprio(l);
    264 
    265 	spc = &l->l_cpu->ci_schedstate;
    266 
    267 	KASSERT(lwp_locked(l, spc->spc_mutex));
    268 	KASSERT(eprio <= spc->spc_maxpriority);
    269 	KASSERT(spc->spc_bitmap[eprio >> BITMAP_SHIFT] != 0);
    270 	KASSERT(spc->spc_count > 0);
    271 
    272 	if (spc->spc_migrating == l)
    273 		spc->spc_migrating = NULL;
    274 
    275 	spc->spc_count--;
    276 	if ((l->l_pflag & LP_BOUND) == 0) {
    277 		atomic_store_relaxed(&spc->spc_mcount,
    278 		    atomic_load_relaxed(&spc->spc_mcount) - 1);
    279 	}
    280 
    281 	q_head = sched_getrq(spc, eprio);
    282 	TAILQ_REMOVE(q_head, l, l_runq);
    283 	if (TAILQ_EMPTY(q_head)) {
    284 		u_int i;
    285 		uint32_t q;
    286 
    287 		/* Unmark bit */
    288 		i = eprio >> BITMAP_SHIFT;
    289 		q = BITMAP_MSB >> (eprio & BITMAP_MASK);
    290 		KASSERT((spc->spc_bitmap[i] & q) != 0);
    291 		spc->spc_bitmap[i] &= ~q;
    292 
    293 		/*
     294 		 * Update the value of the highest priority in the runqueue, in
     295 		 * case this was the last thread in the queue of that priority.
    296 		 */
    297 		if (eprio != spc->spc_maxpriority)
    298 			return;
    299 
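         		/*
         		 * Rescan the bitmap words downward for the new maximum;
         		 * within a word the least significant set bit corresponds
         		 * to the highest priority present (see the example above).
         		 */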
    300 		do {
    301 			if (spc->spc_bitmap[i] != 0) {
    302 				q = ffs(spc->spc_bitmap[i]);
    303 				spc->spc_maxpriority =
    304 				    (i << BITMAP_SHIFT) + (BITMAP_BITS - q);
    305 				return;
    306 			}
    307 		} while (i--);
    308 
    309 		/* If not found - set the lowest value */
    310 		spc->spc_maxpriority = 0;
    311 	}
    312 }
    313 
    314 /*
     315  * Cause a preemption on the given CPU if the priority "pri" is higher than
     316  * that of the running LWP.  If "unlock" is specified (and ideally it will
     317  * be, for concurrency reasons), spc_mutex will be dropped before return.
    318  */
    319 void
    320 sched_resched_cpu(struct cpu_info *ci, pri_t pri, bool unlock)
    321 {
    322 	struct schedstate_percpu *spc;
    323 	u_int o, n, f;
    324 	lwp_t *l;
    325 
    326 	spc = &ci->ci_schedstate;
    327 
    328 	KASSERT(mutex_owned(spc->spc_mutex));
    329 
    330 	/*
    331 	 * If the priority level we're evaluating wouldn't cause a new LWP
    332 	 * to be run on the CPU, then we have nothing to do.
    333 	 */
    334 	if (pri <= spc->spc_curpriority || !mp_online) {
    335 		if (__predict_true(unlock)) {
    336 			spc_unlock(ci);
    337 		}
    338 		return;
    339 	}
    340 
    341 	/*
    342 	 * Figure out what kind of preemption we should do.
    343 	 */
    344 	l = ci->ci_onproc;
    345 	if ((l->l_flag & LW_IDLE) != 0) {
    346 		f = RESCHED_IDLE | RESCHED_UPREEMPT;
    347 	} else if (pri >= sched_kpreempt_pri && (l->l_pflag & LP_INTR) == 0) {
    348 		/* We can't currently preempt softints - should be able to. */
    349 #ifdef __HAVE_PREEMPTION
    350 		f = RESCHED_KPREEMPT;
    351 #else
    352 		/* Leave door open for test: set kpreempt_pri with sysctl. */
    353 		f = RESCHED_UPREEMPT;
    354 #endif
    355 		/*
    356 		 * l_dopreempt must be set with the CPU locked to sync with
    357 		 * mi_switch().  It must also be set with an atomic to sync
    358 		 * with kpreempt().
    359 		 */
    360 		atomic_or_uint(&l->l_dopreempt, DOPREEMPT_ACTIVE);
    361 	} else {
    362 		f = RESCHED_UPREEMPT;
    363 	}
    364 	if (ci != curcpu()) {
    365 		f |= RESCHED_REMOTE;
    366 	}
    367 
    368 	/*
    369 	 * Things can start as soon as ci_want_resched is touched: x86 has
    370 	 * an instruction that monitors the memory cell it's in.  Drop the
    371 	 * schedstate lock in advance, otherwise the remote CPU can awaken
    372 	 * and immediately block on the lock.
    373 	 */
    374 	if (__predict_true(unlock)) {
    375 		spc_unlock(ci);
    376 	}
    377 
    378 	/*
    379 	 * The caller almost always has a second scheduler lock held: either
    380 	 * the running LWP lock (spc_lwplock), or a sleep queue lock.  That
    381 	 * keeps preemption disabled, which among other things ensures all
    382 	 * LWPs involved won't be freed while we're here (see lwp_dtor()).
    383 	 */
     384 	KASSERT(kpreempt_disabled());
    385 
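         	/*
         	 * Post the resched flags with a CAS loop so that concurrent
         	 * requests are combined rather than lost; if a request at least
         	 * as strong is already pending there is nothing more to do.
         	 */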
    386 	for (o = 0;; o = n) {
    387 		n = atomic_cas_uint(&ci->ci_want_resched, o, o | f);
    388 		if (__predict_true(o == n)) {
    389 			/*
    390 			 * We're the first to set a resched on the CPU.  Try
    391 			 * to avoid causing a needless trip through trap()
    392 			 * to handle an AST fault, if it's known the LWP
    393 			 * will either block or go through userret() soon.
    394 			 */
    395 			if (l != curlwp || cpu_intr_p()) {
    396 				cpu_need_resched(ci, l, f);
    397 			}
    398 			break;
    399 		}
    400 		if (__predict_true(
    401 		    (n & (RESCHED_KPREEMPT|RESCHED_UPREEMPT)) >=
    402 		    (f & (RESCHED_KPREEMPT|RESCHED_UPREEMPT)))) {
    403 			/* Already in progress, nothing to do. */
    404 			break;
    405 		}
    406 	}
    407 }
    408 
    409 /*
     410  * Cause a preemption on the given CPU if the priority of LWP "l", which is
     411  * in state LSRUN, is higher than that of the running LWP.  If "unlock" is
     412  * specified (and ideally it will be, for concurrency reasons), spc_mutex
     413  * will be dropped before return.
    414  */
    415 void
    416 sched_resched_lwp(struct lwp *l, bool unlock)
    417 {
    418 	struct cpu_info *ci = l->l_cpu;
    419 
    420 	KASSERT(lwp_locked(l, ci->ci_schedstate.spc_mutex));
    421 	KASSERT(l->l_stat == LSRUN);
    422 
    423 	sched_resched_cpu(ci, lwp_eprio(l), unlock);
    424 }
    425 
    426 /*
    427  * Migration and balancing.
    428  */
    429 
    430 #ifdef MULTIPROCESSOR
    431 
    432 /*
    433  * Estimate if LWP is cache-hot.
    434  */
    435 static inline bool
    436 lwp_cache_hot(const struct lwp *l)
    437 {
    438 
     439 	/* Leave new LWPs in peace; the determination has already been made. */
    440 	if (l->l_stat == LSIDL)
    441 		return true;
    442 
    443 	if (__predict_false(l->l_slptime != 0 || l->l_rticks == 0))
    444 		return false;
    445 
    446 	return (getticks() - l->l_rticks < mstohz(cacheht_time));
    447 }
    448 
    449 /*
    450  * Check if LWP can migrate to the chosen CPU.
    451  */
    452 static inline bool
    453 sched_migratable(const struct lwp *l, struct cpu_info *ci)
    454 {
    455 	const struct schedstate_percpu *spc = &ci->ci_schedstate;
    456 	KASSERT(lwp_locked(__UNCONST(l), NULL));
    457 
    458 	/* Is CPU offline? */
    459 	if (__predict_false(spc->spc_flags & SPCF_OFFLINE))
    460 		return false;
    461 
    462 	/* Is affinity set? */
    463 	if (__predict_false(l->l_affinity))
    464 		return kcpuset_isset(l->l_affinity, cpu_index(ci));
    465 
    466 	/* Is there a processor-set? */
    467 	return (spc->spc_psid == l->l_psid);
    468 }
    469 
    470 /*
    471  * A small helper to do round robin through CPU packages.
    472  */
    473 static struct cpu_info *
    474 sched_nextpkg(void)
    475 {
    476 	struct schedstate_percpu *spc = &curcpu()->ci_schedstate;
    477 
    478 	spc->spc_nextpkg =
    479 	    spc->spc_nextpkg->ci_sibling[CPUREL_PACKAGE1ST];
    480 
    481 	return spc->spc_nextpkg;
    482 }
    483 
    484 /*
    485  * Find a CPU to run LWP "l".  Look for the CPU with the lowest priority
    486  * thread.  In case of equal priority, prefer first class CPUs, and amongst
    487  * the remainder choose the CPU with the fewest runqueue entries.
    488  *
    489  * Begin the search in the CPU package which "pivot" is a member of.
    490  */
    491 static struct cpu_info * __noinline
    492 sched_bestcpu(struct lwp *l, struct cpu_info *pivot)
    493 {
    494 	struct cpu_info *bestci, *curci, *outer;
    495 	struct schedstate_percpu *bestspc, *curspc;
    496 	pri_t bestpri, curpri;
    497 
    498 	/*
    499 	 * If this fails (it shouldn't), run on the given CPU.  This also
    500 	 * gives us a weak preference for "pivot" to begin with.
    501 	 */
    502 	bestci = pivot;
    503 	bestspc = &bestci->ci_schedstate;
    504 	if (sched_migratable(l, bestci)) {
    505 		bestpri = MAX(bestspc->spc_curpriority,
    506 		    bestspc->spc_maxpriority);
    507 	} else {
    508 		/* Invalidate the priority. */
    509 		bestpri = PRI_COUNT;
    510 	}
    511 
    512 	/* In the outer loop scroll through all CPU packages. */
    513 	pivot = pivot->ci_package1st;
    514 	outer = pivot;
    515 	do {
    516 		/* In the inner loop scroll through all CPUs in package. */
    517 		curci = outer;
    518 		do {
    519 			if (!sched_migratable(l, curci)) {
    520 				continue;
    521 			}
    522 
    523 			curspc = &curci->ci_schedstate;
    524 
    525 			/* If this CPU is idle and 1st class, we're done. */
    526 			if (cpu_is_idle_1stclass(curci)) {
    527 				return curci;
    528 			}
    529 
    530 			curpri = MAX(curspc->spc_curpriority,
    531 			    curspc->spc_maxpriority);
    532 
    533 			if (curpri > bestpri) {
    534 				continue;
    535 			}
    536 			if (curpri == bestpri) {
    537 				/* Prefer first class CPUs over others. */
    538 				if (cpu_is_better(bestci, curci)) {
     539 					continue;
    540 				}
    541 				/*
    542 				 * Pick the least busy CPU.  Make sure this is not
    543 				 * <=, otherwise it defeats the above preference.
    544 				 */
    545 				if (bestspc->spc_count < curspc->spc_count) {
    546 					continue;
    547 				}
    548 			}
    549 
    550 			bestpri = curpri;
    551 			bestci = curci;
    552 			bestspc = curspc;
    553 
    554 		} while (curci = curci->ci_sibling[CPUREL_PACKAGE],
    555 		    curci != outer);
    556 	} while (outer = outer->ci_sibling[CPUREL_PACKAGE1ST],
    557 	    outer != pivot);
    558 
    559 	return bestci;
    560 }
    561 
    562 /*
    563  * Estimate the migration of LWP to the other CPU.
    564  * Take and return the CPU, if migration is needed.
    565  */
    566 struct cpu_info *
    567 sched_takecpu(struct lwp *l)
    568 {
    569 	struct schedstate_percpu *spc;
    570 	struct cpu_info *ci, *curci, *tci;
    571 	pri_t eprio;
    572 	int flags;
    573 
    574 	KASSERT(lwp_locked(l, NULL));
    575 
     576 	/* If the thread is strictly bound, do not consider other CPUs */
    577 	ci = l->l_cpu;
    578 	if (l->l_pflag & LP_BOUND)
    579 		return ci;
    580 
    581 	spc = &ci->ci_schedstate;
    582 	eprio = lwp_eprio(l);
    583 
    584 	/*
    585 	 * Handle new LWPs.  For vfork() with a timeshared child, make it
    586 	 * run on the same CPU as the parent if no other LWPs in queue.
    587 	 * Otherwise scatter far and wide - try for an even distribution
    588 	 * across all CPU packages and CPUs.
    589 	 */
    590 	if (l->l_stat == LSIDL) {
    591 		if (curlwp->l_vforkwaiting && l->l_class == SCHED_OTHER) {
    592 			if (sched_migratable(l, curlwp->l_cpu) && eprio >
    593 			    curlwp->l_cpu->ci_schedstate.spc_maxpriority) {
    594 				return curlwp->l_cpu;
    595 			}
    596 		} else {
    597 			return sched_bestcpu(l, sched_nextpkg());
    598 		}
    599 		flags = SPCF_IDLE;
    600 	} else {
    601 		flags = SPCF_IDLE | SPCF_1STCLASS;
    602 	}
    603 
    604 	/*
    605 	 * Try to send the LWP back to the first CPU in the same core if
    606 	 * idle.  This keeps LWPs clustered in the run queues of 1st class
    607 	 * CPUs.  This implies stickiness.  If we didn't find a home for
    608 	 * a vfork() child above, try to use any SMT sibling to help out.
    609 	 */
    610 	tci = ci;
    611 	do {
    612 		if (cpu_is_type(tci, flags) && sched_migratable(l, tci)) {
    613 			return tci;
    614 		}
    615 		tci = tci->ci_sibling[CPUREL_CORE];
    616 	} while (tci != ci);
    617 
    618 	/*
    619 	 * Otherwise the LWP is "sticky", i.e.  generally preferring to stay
    620 	 * on the same CPU.
    621 	 */
    622 	if (sched_migratable(l, ci) && (eprio > spc->spc_curpriority ||
    623 	    (lwp_cache_hot(l) && l->l_class == SCHED_OTHER))) {
    624 		return ci;
    625 	}
    626 
    627 	/*
    628 	 * If the current CPU core is idle, run there and avoid the
    629 	 * expensive scan of CPUs below.
    630 	 */
    631 	curci = curcpu();
    632 	tci = curci;
    633 	do {
    634 		if (cpu_is_type(tci, flags) && sched_migratable(l, tci)) {
    635 			return tci;
    636 		}
    637 		tci = tci->ci_sibling[CPUREL_CORE];
    638 	} while (tci != curci);
    639 
    640 	/*
    641 	 * Didn't find a new home above - happens infrequently.  Start the
    642 	 * search in last CPU package that the LWP ran in, but expand to
    643 	 * include the whole system if needed.
    644 	 */
    645 	return sched_bestcpu(l, l->l_cpu);
    646 }
    647 
    648 /*
     649  * Try to catch an LWP from the run queue of another CPU.
    650  */
    651 static struct lwp *
    652 sched_catchlwp(struct cpu_info *ci)
    653 {
    654 	struct cpu_info *curci = curcpu();
    655 	struct schedstate_percpu *spc, *curspc;
    656 	TAILQ_HEAD(, lwp) *q_head;
    657 	struct lwp *l;
    658 	bool gentle;
    659 
    660 	curspc = &curci->ci_schedstate;
    661 	spc = &ci->ci_schedstate;
    662 
    663 	/*
    664 	 * Be more aggressive if this CPU is first class, and the other
    665 	 * is not.
    666 	 */
    667 	gentle = cpu_is_better(curci, ci);
    668 
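         	/*
         	 * When being gentle, require at least min_catch migratable LWPs
         	 * on the remote run queue before stealing; otherwise even a
         	 * single LWP is fair game.
         	 */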
    669 	if (atomic_load_relaxed(&spc->spc_mcount) < (gentle ? min_catch : 1) ||
    670 	    curspc->spc_psid != spc->spc_psid) {
    671 		spc_unlock(ci);
    672 		return NULL;
    673 	}
    674 
    675 	/* Take the highest priority thread */
    676 	q_head = sched_getrq(spc, spc->spc_maxpriority);
    677 	l = TAILQ_FIRST(q_head);
    678 
    679 	for (;;) {
    680 		/* Check the first and next result from the queue */
    681 		if (l == NULL) {
    682 			break;
    683 		}
    684 		KASSERTMSG(l->l_stat == LSRUN, "%s l %p (%s) l_stat %d",
    685 		    ci->ci_data.cpu_name,
    686 		    l, (l->l_name ? l->l_name : l->l_proc->p_comm), l->l_stat);
    687 
     688 		/* Look for threads that are allowed to migrate */
    689 		if ((l->l_pflag & LP_BOUND) ||
    690 		    (gentle && lwp_cache_hot(l)) ||
    691 		    !sched_migratable(l, curci)) {
    692 			l = TAILQ_NEXT(l, l_runq);
    693 			/* XXX Gap: could walk down priority list. */
    694 			continue;
    695 		}
    696 
    697 		/* Grab the thread, and move to the local run queue */
    698 		sched_dequeue(l);
    699 		l->l_cpu = curci;
    700 		lwp_unlock_to(l, curspc->spc_mutex);
    701 		sched_enqueue(l);
    702 		return l;
    703 	}
    704 	spc_unlock(ci);
    705 
    706 	return l;
    707 }
    708 
    709 /*
    710  * Called from sched_idle() to handle migration.  Return the CPU that we
    711  * pushed the LWP to (may be NULL).
    712  */
    713 static struct cpu_info *
    714 sched_idle_migrate(void)
    715 {
    716 	struct cpu_info *ci = curcpu(), *tci = NULL;
    717 	struct schedstate_percpu *spc, *tspc;
    718 	bool dlock = false;
    719 
    720 	spc = &ci->ci_schedstate;
    721 	spc_lock(ci);
    722 	for (;;) {
    723 		struct lwp *l;
    724 
    725 		l = spc->spc_migrating;
    726 		if (l == NULL)
    727 			break;
    728 
    729 		/*
    730 		 * If second attempt, and target CPU has changed,
    731 		 * drop the old lock.
    732 		 */
    733 		if (dlock == true && tci != l->l_target_cpu) {
    734 			KASSERT(tci != NULL);
    735 			spc_unlock(tci);
    736 			dlock = false;
    737 		}
    738 
    739 		/*
    740 		 * Nothing to do if destination has changed to the
    741 		 * local CPU, or migration was done by other CPU.
    742 		 */
    743 		tci = l->l_target_cpu;
    744 		if (tci == NULL || tci == ci) {
    745 			spc->spc_migrating = NULL;
    746 			l->l_target_cpu = NULL;
    747 			break;
    748 		}
    749 		tspc = &tci->ci_schedstate;
    750 
    751 		/*
     752 		 * Double-lock the run queues, only once.  Blocking lock
     753 		 * acquisitions use a fixed order (lower CPU address first)
         		 * to avoid deadlock; otherwise we only try-lock and back off.
    754 		 */
    755 		if (dlock == false) {
    756 			dlock = true;
    757 			if (ci < tci) {
    758 				spc_lock(tci);
    759 			} else if (!mutex_tryenter(tspc->spc_mutex)) {
    760 				spc_unlock(ci);
    761 				spc_lock(tci);
    762 				spc_lock(ci);
     763 				/* Check the situation again. */
    764 				continue;
    765 			}
    766 		}
    767 
    768 		/* Migrate the thread */
    769 		KASSERT(l->l_stat == LSRUN);
    770 		spc->spc_migrating = NULL;
    771 		l->l_target_cpu = NULL;
    772 		sched_dequeue(l);
    773 		l->l_cpu = tci;
    774 		lwp_setlock(l, tspc->spc_mutex);
    775 		sched_enqueue(l);
    776 		sched_resched_lwp(l, true);
    777 		/* tci now unlocked */
    778 		spc_unlock(ci);
    779 		return tci;
    780 	}
    781 	if (dlock == true) {
    782 		KASSERT(tci != NULL);
    783 		spc_unlock(tci);
    784 	}
    785 	spc_unlock(ci);
    786 	return NULL;
    787 }
    788 
    789 /*
    790  * Try to steal an LWP from "tci".
    791  */
    792 static bool
    793 sched_steal(struct cpu_info *ci, struct cpu_info *tci)
    794 {
    795 	struct schedstate_percpu *spc, *tspc;
    796 	lwp_t *l;
    797 
    798 	spc = &ci->ci_schedstate;
    799 	tspc = &tci->ci_schedstate;
    800 	if (atomic_load_relaxed(&tspc->spc_mcount) != 0 &&
    801 	    spc->spc_psid == tspc->spc_psid) {
    802 		spc_dlock(ci, tci);
    803 		l = sched_catchlwp(tci);
    804 		spc_unlock(ci);
    805 		if (l != NULL) {
    806 			return true;
    807 		}
    808 	}
    809 	return false;
    810 }
    811 
    812 /*
    813  * Called from each CPU's idle loop.
    814  */
    815 void
    816 sched_idle(void)
    817 {
    818 	struct cpu_info *ci, *inner, *outer, *first, *tci, *mci;
    819 	struct schedstate_percpu *spc, *tspc;
    820 	struct lwp *l;
    821 
    822 	ci = curcpu();
    823 	spc = &ci->ci_schedstate;
    824 	tci = NULL;
    825 	mci = NULL;
    826 
    827 	/*
     828 	 * Handle LWP migrations off this CPU to another.  If there is a
    829 	 * migration to do then remember the CPU the LWP was sent to, and
    830 	 * don't steal the LWP back from that CPU below.
    831 	 */
    832 	if (spc->spc_migrating != NULL) {
    833 		mci = sched_idle_migrate();
    834 	}
    835 
    836 	/* If this CPU is offline, or we have an LWP to run, we're done. */
    837 	if ((spc->spc_flags & SPCF_OFFLINE) != 0 || spc->spc_count != 0) {
    838 		return;
    839 	}
    840 
    841 	/* Deal with SMT. */
    842 	if (ci->ci_nsibling[CPUREL_CORE] > 1) {
    843 		/* Try to help our siblings out. */
    844 		tci = ci->ci_sibling[CPUREL_CORE];
    845 		while (tci != ci) {
    846 			if (tci != mci && sched_steal(ci, tci)) {
    847 				return;
    848 			}
    849 			tci = tci->ci_sibling[CPUREL_CORE];
    850 		}
    851 		/*
    852 		 * If not the first SMT in the core, and in the default
    853 		 * processor set, the search ends here.
    854 		 */
    855 		if ((spc->spc_flags & SPCF_1STCLASS) == 0 &&
    856 		    spc->spc_psid == PS_NONE) {
    857 			return;
    858 		}
    859 	}
    860 
    861 	/*
    862 	 * Find something to run, unless this CPU exceeded the rate limit.
    863 	 * Start looking on the current package to maximise L2/L3 cache
    864 	 * locality.  Then expand to looking at the rest of the system.
    865 	 *
    866 	 * XXX Should probably look at 2nd class CPUs first, but they will
    867 	 * shed jobs via preempt() anyway.
    868 	 */
    869 	if (spc->spc_nextskim > getticks()) {
    870 		return;
    871 	}
    872 	spc->spc_nextskim = getticks() + mstohz(skim_interval);
    873 
    874 	/* In the outer loop scroll through all CPU packages, starting here. */
    875 	first = ci->ci_package1st;
    876 	outer = first;
    877 	do {
    878 		/* In the inner loop scroll through all CPUs in package. */
    879 		inner = outer;
    880 		do {
    881 			/* Don't hit the locks unless needed. */
    882 			tspc = &inner->ci_schedstate;
    883 			if (ci == inner || ci == mci ||
    884 			    spc->spc_psid != tspc->spc_psid ||
    885 			    atomic_load_relaxed(&tspc->spc_mcount) < min_catch) {
    886 				continue;
    887 			}
    888 			spc_dlock(ci, inner);
    889 			l = sched_catchlwp(inner);
    890 			spc_unlock(ci);
    891 			if (l != NULL) {
    892 				/* Got it! */
    893 				return;
    894 			}
    895 		} while (inner = inner->ci_sibling[CPUREL_PACKAGE],
    896 		    inner != outer);
    897 	} while (outer = outer->ci_sibling[CPUREL_PACKAGE1ST],
    898 	    outer != first);
    899 }
    900 
    901 /*
    902  * Called from mi_switch() when an LWP has been preempted / has yielded.
    903  * The LWP is presently in the CPU's run queue.  Here we look for a better
    904  * CPU to teleport the LWP to; there may not be one.
    905  */
    906 void
    907 sched_preempted(struct lwp *l)
    908 {
    909 	struct schedstate_percpu *tspc;
    910 	struct cpu_info *ci, *tci;
    911 
    912 	ci = l->l_cpu;
    913 	tspc = &ci->ci_schedstate;
    914 
    915 	KASSERT(tspc->spc_count >= 1);
    916 
    917 	/*
    918 	 * Try to select another CPU if:
    919 	 *
    920 	 * - there is no migration pending already
    921 	 * - and this LWP is running on a 2nd class CPU
    922 	 * - or this LWP is a child of vfork() that has just done execve()
    923 	 */
    924 	if (l->l_target_cpu != NULL ||
    925 	    (cpu_is_1stclass(ci) &&
    926 	     (l->l_pflag & LP_TELEPORT) == 0)) {
    927 		return;
    928 	}
    929 
    930 	/*
    931 	 * Fast path: if the first SMT in the core is idle, send it back
    932 	 * there, because the cache is shared (cheap) and we want all LWPs
    933 	 * to be clustered on 1st class CPUs (either running there or on
    934 	 * their runqueues).
    935 	 */
    936 	tci = ci->ci_sibling[CPUREL_CORE];
    937 	while (tci != ci) {
    938 		tspc = &tci->ci_schedstate;
    939 		if (cpu_is_idle_1stclass(tci) && sched_migratable(l, tci)) {
     940 			l->l_target_cpu = tci;
     941 			l->l_pflag &= ~LP_TELEPORT;
     942 			return;
    943 		}
    944 		tci = tci->ci_sibling[CPUREL_CORE];
    945 	}
    946 
    947 	if ((l->l_pflag & LP_TELEPORT) != 0) {
    948 		/*
    949 		 * A child of vfork(): now that the parent is released,
    950 		 * scatter far and wide, to match the LSIDL distribution
    951 		 * done in sched_takecpu().
    952 		 */
    953 		l->l_pflag &= ~LP_TELEPORT;
    954 		tci = sched_bestcpu(l, sched_nextpkg());
    955 		if (tci != ci) {
    956 			l->l_target_cpu = tci;
    957 		}
    958 	} else {
    959 		/*
    960 		 * Try to find a better CPU to take it, but don't move to
    961 		 * another 2nd class CPU, and don't move to a non-idle CPU,
    962 		 * because that would prevent SMT being used to maximise
    963 		 * throughput.
    964 		 *
    965 		 * Search in the current CPU package in order to try and
    966 		 * keep L2/L3 cache locality, but expand to include the
    967 		 * whole system if needed.
    968 		 */
    969 		tci = sched_bestcpu(l, l->l_cpu);
    970 		if (tci != ci && cpu_is_idle_1stclass(tci)) {
    971 			l->l_target_cpu = tci;
    972 		}
    973 	}
    974 }
    975 
    976 /*
    977  * Called during execve() by a child of vfork().  Does two things:
    978  *
    979  * - If the parent has been awoken and put back on curcpu then give the
    980  *   CPU back to the parent.
    981  *
    982  * - If curlwp is not on a 1st class CPU then find somewhere else to run,
    983  *   since it dodged the distribution in sched_takecpu() when first set
    984  *   runnable.
    985  */
    986 void
    987 sched_vforkexec(struct lwp *l, bool samecpu)
    988 {
    989 
    990 	KASSERT(l == curlwp);
    991 	if ((samecpu && ncpu > 1) || !cpu_is_1stclass(l->l_cpu)) {
    992 		l->l_pflag |= LP_TELEPORT;
    993 		preempt();
    994 	}
    995 }
    996 
    997 #else
    998 
    999 /*
   1000  * stubs for !MULTIPROCESSOR
   1001  */
   1002 
   1003 struct cpu_info *
   1004 sched_takecpu(struct lwp *l)
   1005 {
   1006 
   1007 	return l->l_cpu;
   1008 }
   1009 
   1010 void
   1011 sched_idle(void)
   1012 {
   1013 
   1014 }
   1015 
   1016 void
   1017 sched_preempted(struct lwp *l)
   1018 {
   1019 
   1020 }
   1021 
   1022 void
   1023 sched_vforkexec(struct lwp *l, bool samecpu)
   1024 {
   1025 
   1026 	KASSERT(l == curlwp);
   1027 }
   1028 
   1029 #endif	/* MULTIPROCESSOR */
   1030 
   1031 /*
   1032  * Scheduling statistics and balancing.
   1033  */
   1034 void
   1035 sched_lwp_stats(struct lwp *l)
   1036 {
   1037 	int batch;
   1038 
   1039 	KASSERT(lwp_locked(l, NULL));
   1040 
   1041 	/* Update sleep time */
   1042 	if (l->l_stat == LSSLEEP || l->l_stat == LSSTOP ||
   1043 	    l->l_stat == LSSUSPENDED)
   1044 		l->l_slptime++;
   1045 
   1046 	/*
    1047 	 * Mark the thread as CPU-bound if its run time exceeds its sleep
    1048 	 * time, but only report it as batch if it was already so marked.
   1049 	 */
   1050 	batch = (l->l_rticksum > l->l_slpticksum);
   1051 	if (batch != 0) {
   1052 		if ((l->l_flag & LW_BATCH) == 0)
   1053 			batch = 0;
   1054 		l->l_flag |= LW_BATCH;
   1055 	} else
   1056 		l->l_flag &= ~LW_BATCH;
   1057 
   1058 	/* Reset the time sums */
   1059 	l->l_slpticksum = 0;
   1060 	l->l_rticksum = 0;
   1061 
   1062 	/* Scheduler-specific hook */
   1063 	sched_pstats_hook(l, batch);
   1064 #ifdef KDTRACE_HOOKS
   1065 	curthread = l;
   1066 #endif
   1067 }
   1068 
   1069 /*
   1070  * Scheduler mill.
   1071  */
   1072 struct lwp *
   1073 sched_nextlwp(void)
   1074 {
   1075 	struct cpu_info *ci = curcpu();
   1076 	struct schedstate_percpu *spc;
   1077 	TAILQ_HEAD(, lwp) *q_head;
   1078 	struct lwp *l;
   1079 
   1080 	/* Update the last run time on switch */
   1081 	l = curlwp;
   1082 	l->l_rticksum += (getticks() - l->l_rticks);
   1083 
   1084 	/* Return to idle LWP if there is a migrating thread */
   1085 	spc = &ci->ci_schedstate;
   1086 	if (__predict_false(spc->spc_migrating != NULL))
   1087 		return NULL;
   1088 
   1089 	/* Return to idle LWP if there is no runnable job */
   1090 	if (__predict_false(spc->spc_count == 0))
   1091 		return NULL;
   1092 
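         	/*
         	 * spc_maxpriority is kept current by sched_enqueue() and
         	 * sched_dequeue(), so the pick below is O(1).
         	 */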
   1093 	/* Take the highest priority thread */
   1094 	KASSERT(spc->spc_bitmap[spc->spc_maxpriority >> BITMAP_SHIFT]);
   1095 	q_head = sched_getrq(spc, spc->spc_maxpriority);
   1096 	l = TAILQ_FIRST(q_head);
   1097 	KASSERT(l != NULL);
   1098 
   1099 	sched_oncpu(l);
   1100 	l->l_rticks = getticks();
   1101 
   1102 	return l;
   1103 }
   1104 
   1105 /*
    1106  * sched_curcpu_runnable_p: return whether curcpu() should exit the idle loop.
   1107  */
   1108 
   1109 bool
   1110 sched_curcpu_runnable_p(void)
   1111 {
   1112 	const struct cpu_info *ci;
   1113 	const struct schedstate_percpu *spc;
   1114 	bool rv;
   1115 
   1116 	kpreempt_disable();
   1117 	ci = curcpu();
   1118 	spc = &ci->ci_schedstate;
   1119 	rv = (spc->spc_count != 0);
   1120 #ifndef __HAVE_FAST_SOFTINTS
   1121 	rv |= (ci->ci_data.cpu_softints != 0);
   1122 #endif
   1123 	kpreempt_enable();
   1124 
   1125 	return rv;
   1126 }
   1127 
   1128 /*
   1129  * Sysctl nodes and initialization.
   1130  */
   1131 
   1132 SYSCTL_SETUP(sysctl_sched_setup, "sysctl sched setup")
   1133 {
   1134 	const struct sysctlnode *node = NULL;
   1135 
   1136 	sysctl_createv(clog, 0, NULL, &node,
   1137 		CTLFLAG_PERMANENT,
   1138 		CTLTYPE_NODE, "sched",
   1139 		SYSCTL_DESCR("Scheduler options"),
   1140 		NULL, 0, NULL, 0,
   1141 		CTL_KERN, CTL_CREATE, CTL_EOL);
   1142 
   1143 	if (node == NULL)
   1144 		return;
   1145 
   1146 	sysctl_createv(clog, 0, &node, NULL,
   1147 		CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
   1148 		CTLTYPE_INT, "cacheht_time",
   1149 		SYSCTL_DESCR("Cache hotness time (in ms)"),
   1150 		NULL, 0, &cacheht_time, 0,
   1151 		CTL_CREATE, CTL_EOL);
   1152 	sysctl_createv(clog, 0, &node, NULL,
   1153 		CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
   1154 		CTLTYPE_INT, "skim_interval",
   1155 		SYSCTL_DESCR("Rate limit for stealing from other CPUs (in ms)"),
   1156 		NULL, 0, &skim_interval, 0,
   1157 		CTL_CREATE, CTL_EOL);
   1158 	sysctl_createv(clog, 0, &node, NULL,
   1159 		CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
   1160 		CTLTYPE_INT, "min_catch",
   1161 		SYSCTL_DESCR("Minimal count of threads for catching"),
   1162 		NULL, 0, &min_catch, 0,
   1163 		CTL_CREATE, CTL_EOL);
   1164 	sysctl_createv(clog, 0, &node, NULL,
   1165 		CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
   1166 		CTLTYPE_INT, "timesoftints",
   1167 		SYSCTL_DESCR("Track CPU time for soft interrupts"),
   1168 		NULL, 0, &softint_timing, 0,
   1169 		CTL_CREATE, CTL_EOL);
   1170 	sysctl_createv(clog, 0, &node, NULL,
   1171 		CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
   1172 		CTLTYPE_INT, "kpreempt_pri",
   1173 		SYSCTL_DESCR("Minimum priority to trigger kernel preemption"),
   1174 		NULL, 0, &sched_kpreempt_pri, 0,
   1175 		CTL_CREATE, CTL_EOL);
   1176 }
   1177 
   1178 /*
   1179  * Debugging.
   1180  */
   1181 
   1182 #ifdef DDB
   1183 
   1184 void
   1185 sched_print_runqueue(void (*pr)(const char *, ...))
   1186 {
   1187 	struct cpu_info *ci, *tci;
   1188 	struct schedstate_percpu *spc;
   1189 	struct lwp *l;
   1190 	struct proc *p;
   1191 	CPU_INFO_ITERATOR cii;
   1192 
   1193 	for (CPU_INFO_FOREACH(cii, ci)) {
   1194 		int i;
   1195 
   1196 		spc = &ci->ci_schedstate;
   1197 
   1198 		(*pr)("Run-queue (CPU = %u):\n", ci->ci_index);
   1199 		(*pr)(" pid.lid = %d.%d, r_count = %u, "
   1200 		    "maxpri = %d, mlwp = %p\n",
   1201 #ifdef MULTIPROCESSOR
   1202 		    ci->ci_curlwp->l_proc->p_pid, ci->ci_curlwp->l_lid,
   1203 #else
   1204 		    curlwp->l_proc->p_pid, curlwp->l_lid,
   1205 #endif
   1206 		    spc->spc_count, spc->spc_maxpriority,
   1207 		    spc->spc_migrating);
   1208 		i = (PRI_COUNT >> BITMAP_SHIFT) - 1;
   1209 		do {
   1210 			uint32_t q;
   1211 			q = spc->spc_bitmap[i];
   1212 			(*pr)(" bitmap[%d] => [ %d (0x%x) ]\n", i, ffs(q), q);
   1213 		} while (i--);
   1214 	}
   1215 
   1216 	(*pr)("   %5s %4s %4s %10s %3s %18s %4s %4s %s\n",
   1217 	    "LID", "PRI", "EPRI", "FL", "ST", "LWP", "CPU", "TCI", "LRTICKS");
   1218 
   1219 	PROCLIST_FOREACH(p, &allproc) {
   1220 		(*pr)(" /- %d (%s)\n", (int)p->p_pid, p->p_comm);
   1221 		LIST_FOREACH(l, &p->p_lwps, l_sibling) {
   1222 			ci = l->l_cpu;
   1223 			tci = l->l_target_cpu;
   1224 			(*pr)(" | %5d %4u %4u 0x%8.8x %3s %18p %4u %4d %u\n",
   1225 			    (int)l->l_lid, l->l_priority, lwp_eprio(l),
   1226 			    l->l_flag, l->l_stat == LSRUN ? "RQ" :
   1227 			    (l->l_stat == LSSLEEP ? "SQ" : "-"),
   1228 			    l, ci->ci_index, (tci ? tci->ci_index : -1),
   1229 			    (u_int)(getticks() - l->l_rticks));
   1230 		}
   1231 	}
   1232 }
   1233 
   1234 #endif
   1235