Home | History | Annotate | Line # | Download | only in kern
sched_m2.c revision 1.6.4.6
      1 /*	$NetBSD: sched_m2.c,v 1.6.4.6 2008/02/04 09:24:15 yamt Exp $	*/
      2 
      3 /*
      4  * Copyright (c) 2007, 2008 Mindaugas Rasiukevicius <rmind at NetBSD org>
      5  * All rights reserved.
      6  *
      7  * Redistribution and use in source and binary forms, with or without
      8  * modification, are permitted provided that the following conditions
      9  * are met:
     10  * 1. Redistributions of source code must retain the above copyright
     11  *    notice, this list of conditions and the following disclaimer.
     12  * 2. Redistributions in binary form must reproduce the above copyright
     13  *    notice, this list of conditions and the following disclaimer in the
     14  *    documentation and/or other materials provided with the distribution.
     15  *
     16  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
     17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     18  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     19  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
     20  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
     21  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
     22  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
     23  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
     24  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
     25  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
     26  * SUCH DAMAGE.
     27  */
     28 
     29 /*
     30  * TODO:
     31  *  - Implementation of fair share queue;
     32  *  - Support for NUMA;
     33  */
     34 
     35 #include <sys/cdefs.h>
     36 __KERNEL_RCSID(0, "$NetBSD: sched_m2.c,v 1.6.4.6 2008/02/04 09:24:15 yamt Exp $");
     37 
     38 #include <sys/param.h>
     39 
     40 #include <sys/bitops.h>
     41 #include <sys/cpu.h>
     42 #include <sys/callout.h>
     43 #include <sys/errno.h>
     44 #include <sys/kernel.h>
     45 #include <sys/kmem.h>
     46 #include <sys/lwp.h>
     47 #include <sys/mutex.h>
     48 #include <sys/pool.h>
     49 #include <sys/proc.h>
     50 #include <sys/pset.h>
     51 #include <sys/resource.h>
     52 #include <sys/resourcevar.h>
     53 #include <sys/sched.h>
     54 #include <sys/syscallargs.h>
     55 #include <sys/sysctl.h>
     56 #include <sys/types.h>
     57 
     58 /*
     59  * Priority related defintions.
     60  */
     61 #define	PRI_TS_COUNT	(NPRI_USER)
     62 #define	PRI_RT_COUNT	(PRI_COUNT - PRI_TS_COUNT)
     63 #define	PRI_HTS_RANGE	(PRI_TS_COUNT / 10)
     64 
     65 #define	PRI_HIGHEST_TS	(MAXPRI_USER)
     66 
     67 const int schedppq = 1;
     68 
     69 /*
     70  * Bits per map.
     71  */
     72 #define	BITMAP_BITS	(32)
     73 #define	BITMAP_SHIFT	(5)
     74 #define	BITMAP_MSB	(0x80000000U)
     75 #define	BITMAP_MASK	(BITMAP_BITS - 1)
     76 
     77 /*
     78  * Time-slices and priorities.
     79  */
     80 static u_int	min_ts;			/* Minimal time-slice */
     81 static u_int	max_ts;			/* Maximal time-slice */
     82 static u_int	rt_ts;			/* Real-time time-slice */
     83 static u_int	ts_map[PRI_COUNT];	/* Map of time-slices */
     84 static pri_t	high_pri[PRI_COUNT];	/* Map for priority increase */
     85 
     86 /*
     87  * Migration and balancing.
     88  */
     89 #ifdef MULTIPROCESSOR
     90 
     91 static u_int	cacheht_time;		/* Cache hotness time */
     92 static u_int	min_catch;		/* Minimal LWP count for catching */
     93 
     94 static u_int		balance_period;	/* Balance period */
     95 static struct callout	balance_ch;	/* Callout of balancer */
     96 
     97 static struct cpu_info * volatile worker_ci;
     98 
     99 #endif
    100 
    101 /*
    102  * Structures, runqueue.
    103  */
    104 
    105 typedef struct {
    106 	TAILQ_HEAD(, lwp) q_head;
    107 } queue_t;
    108 
    109 typedef struct {
    110 	/* Lock and bitmap */
    111 	kmutex_t	r_rq_mutex;
    112 	uint32_t	r_bitmap[PRI_COUNT >> BITMAP_SHIFT];
    113 	/* Counters */
    114 	u_int		r_count;	/* Count of the threads */
    115 	pri_t		r_highest_pri;	/* Highest priority */
    116 	u_int		r_avgcount;	/* Average count of threads */
    117 	u_int		r_mcount;	/* Count of migratable threads */
    118 	/* Runqueues */
    119 	queue_t		r_rt_queue[PRI_RT_COUNT];
    120 	queue_t		r_ts_queue[PRI_TS_COUNT];
    121 } runqueue_t;
    122 
    123 typedef struct {
    124 	u_int		sl_flags;
    125 	u_int		sl_timeslice;	/* Time-slice of thread */
    126 	u_int		sl_slept;	/* Saved sleep time for sleep sum */
    127 	u_int		sl_slpsum;	/* Sum of sleep time */
    128 	u_int		sl_rtime;	/* Saved start time of run */
    129 	u_int		sl_rtsum;	/* Sum of the run time */
    130 	u_int		sl_lrtime;	/* Last run time */
    131 } sched_info_lwp_t;
    132 
    133 /* Flags */
    134 #define	SL_BATCH	0x01
    135 
    136 /* Pool of the scheduler-specific structures for threads */
    137 static pool_cache_t	sil_pool;
    138 
    139 /*
    140  * Prototypes.
    141  */
    142 
    143 static inline void *	sched_getrq(runqueue_t *, const pri_t);
    144 static inline void	sched_newts(struct lwp *);
    145 static void		sched_precalcts(void);
    146 
    147 #ifdef MULTIPROCESSOR
    148 static struct lwp *	sched_catchlwp(void);
    149 static void		sched_balance(void *);
    150 #endif
    151 
    152 /*
    153  * Initialization and setup.
    154  */
    155 
    156 void
    157 sched_rqinit(void)
    158 {
    159 	struct cpu_info *ci = curcpu();
    160 
    161 	if (hz < 100) {
    162 		panic("sched_rqinit: value of HZ is too low\n");
    163 	}
    164 
    165 	/* Default timing ranges */
    166 	min_ts = mstohz(50);			/* ~50ms  */
    167 	max_ts = mstohz(150);			/* ~150ms */
    168 	rt_ts = mstohz(100);			/* ~100ms */
    169 	sched_precalcts();
    170 
    171 #ifdef MULTIPROCESSOR
    172 	/* Balancing */
    173 	worker_ci = ci;
    174 	cacheht_time = mstohz(5);		/* ~5 ms  */
    175 	balance_period = mstohz(300);		/* ~300ms */
    176 	min_catch = ~0;
    177 #endif
    178 
    179 	/* Pool of the scheduler-specific structures */
    180 	sil_pool = pool_cache_init(sizeof(sched_info_lwp_t), 0, 0, 0,
    181 	    "lwpsd", NULL, IPL_NONE, NULL, NULL, NULL);
    182 
    183 	/* Attach the primary CPU here */
    184 	sched_cpuattach(ci);
    185 
    186 	/* Initialize the scheduler structure of the primary LWP */
    187 	lwp0.l_mutex = &ci->ci_schedstate.spc_lwplock;
    188 	sched_lwp_fork(NULL, &lwp0);
    189 	sched_newts(&lwp0);
    190 }
    191 
    192 void
    193 sched_setup(void)
    194 {
    195 
    196 #ifdef MULTIPROCESSOR
    197 	/* Minimal count of LWPs for catching: log2(count of CPUs) */
    198 	min_catch = min(ilog2(ncpu), 4);
    199 
    200 	/* Initialize balancing callout and run it */
    201 	callout_init(&balance_ch, CALLOUT_MPSAFE);
    202 	callout_setfunc(&balance_ch, sched_balance, NULL);
    203 	callout_schedule(&balance_ch, balance_period);
    204 #endif
    205 }
    206 
    207 void
    208 sched_cpuattach(struct cpu_info *ci)
    209 {
    210 	runqueue_t *ci_rq;
    211 	void *rq_ptr;
    212 	u_int i, size;
    213 
    214 	/* Allocate the run queue */
    215 	size = roundup2(sizeof(runqueue_t), CACHE_LINE_SIZE) + CACHE_LINE_SIZE;
    216 	rq_ptr = kmem_zalloc(size, KM_SLEEP);
    217 	if (rq_ptr == NULL) {
    218 		panic("sched_cpuattach: could not allocate the runqueue");
    219 	}
    220 	ci_rq = (void *)(roundup2((uintptr_t)(rq_ptr), CACHE_LINE_SIZE));
    221 
    222 	/* Initialize run queues */
    223 	mutex_init(&ci_rq->r_rq_mutex, MUTEX_DEFAULT, IPL_SCHED);
    224 	for (i = 0; i < PRI_RT_COUNT; i++)
    225 		TAILQ_INIT(&ci_rq->r_rt_queue[i].q_head);
    226 	for (i = 0; i < PRI_TS_COUNT; i++)
    227 		TAILQ_INIT(&ci_rq->r_ts_queue[i].q_head);
    228 	ci_rq->r_highest_pri = 0;
    229 
    230 	ci->ci_schedstate.spc_sched_info = ci_rq;
    231 	ci->ci_schedstate.spc_mutex = &ci_rq->r_rq_mutex;
    232 }
    233 
    234 /* Pre-calculate the time-slices for the priorities */
    235 static void
    236 sched_precalcts(void)
    237 {
    238 	pri_t p;
    239 
    240 	/* Time-sharing range */
    241 	for (p = 0; p <= PRI_HIGHEST_TS; p++) {
    242 		ts_map[p] = max_ts -
    243 		    (p * 100 / (PRI_TS_COUNT - 1) * (max_ts - min_ts) / 100);
    244 		high_pri[p] = (PRI_HIGHEST_TS - PRI_HTS_RANGE) +
    245 		    ((p * PRI_HTS_RANGE) / (PRI_TS_COUNT - 1));
    246 	}
    247 
    248 	/* Real-time range */
    249 	for (p = (PRI_HIGHEST_TS + 1); p < PRI_COUNT; p++) {
    250 		ts_map[p] = rt_ts;
    251 		high_pri[p] = p;
    252 	}
    253 }
    254 
    255 /*
    256  * Hooks.
    257  */
    258 
    259 void
    260 sched_proc_fork(struct proc *parent, struct proc *child)
    261 {
    262 	struct lwp *l;
    263 
    264 	LIST_FOREACH(l, &child->p_lwps, l_sibling) {
    265 		lwp_lock(l);
    266 		sched_newts(l);
    267 		lwp_unlock(l);
    268 	}
    269 }
    270 
    271 void
    272 sched_proc_exit(struct proc *child, struct proc *parent)
    273 {
    274 
    275 	/* Dummy */
    276 }
    277 
    278 void
    279 sched_lwp_fork(struct lwp *l1, struct lwp *l2)
    280 {
    281 
    282 	KASSERT(l2->l_sched_info == NULL);
    283 	l2->l_sched_info = pool_cache_get(sil_pool, PR_WAITOK);
    284 	memset(l2->l_sched_info, 0, sizeof(sched_info_lwp_t));
    285 }
    286 
    287 void
    288 sched_lwp_exit(struct lwp *l)
    289 {
    290 
    291 	KASSERT(l->l_sched_info != NULL);
    292 	pool_cache_put(sil_pool, l->l_sched_info);
    293 	l->l_sched_info = NULL;
    294 }
    295 
    296 void
    297 sched_lwp_collect(struct lwp *l)
    298 {
    299 
    300 }
    301 
    302 void
    303 sched_setrunnable(struct lwp *l)
    304 {
    305 
    306 	/* Dummy */
    307 }
    308 
    309 void
    310 sched_schedclock(struct lwp *l)
    311 {
    312 
    313 	/* Dummy */
    314 }
    315 
    316 /*
    317  * Priorities and time-slice.
    318  */
    319 
    320 void
    321 sched_nice(struct proc *p, int prio)
    322 {
    323 
    324 	/* TODO: implement as SCHED_IA */
    325 }
    326 
    327 /* Recalculate the time-slice */
    328 static inline void
    329 sched_newts(struct lwp *l)
    330 {
    331 	sched_info_lwp_t *sil = l->l_sched_info;
    332 
    333 	sil->sl_timeslice = ts_map[lwp_eprio(l)];
    334 }
    335 
    336 /*
    337  * Control of the runqueue.
    338  */
    339 
    340 static inline void *
    341 sched_getrq(runqueue_t *ci_rq, const pri_t prio)
    342 {
    343 
    344 	KASSERT(prio < PRI_COUNT);
    345 	return (prio <= PRI_HIGHEST_TS) ?
    346 	    &ci_rq->r_ts_queue[prio].q_head :
    347 	    &ci_rq->r_rt_queue[prio - PRI_HIGHEST_TS - 1].q_head;
    348 }
    349 
    350 void
    351 sched_enqueue(struct lwp *l, bool swtch)
    352 {
    353 	runqueue_t *ci_rq;
    354 	sched_info_lwp_t *sil = l->l_sched_info;
    355 	TAILQ_HEAD(, lwp) *q_head;
    356 	const pri_t eprio = lwp_eprio(l);
    357 
    358 	ci_rq = l->l_cpu->ci_schedstate.spc_sched_info;
    359 	KASSERT(lwp_locked(l, l->l_cpu->ci_schedstate.spc_mutex));
    360 
    361 	/* Update the last run time on switch */
    362 	if (__predict_true(swtch == true)) {
    363 		sil->sl_lrtime = hardclock_ticks;
    364 		sil->sl_rtsum += (hardclock_ticks - sil->sl_rtime);
    365 	} else if (sil->sl_lrtime == 0)
    366 		sil->sl_lrtime = hardclock_ticks;
    367 
    368 	/* Enqueue the thread */
    369 	q_head = sched_getrq(ci_rq, eprio);
    370 	if (TAILQ_EMPTY(q_head)) {
    371 		u_int i;
    372 		uint32_t q;
    373 
    374 		/* Mark bit */
    375 		i = eprio >> BITMAP_SHIFT;
    376 		q = BITMAP_MSB >> (eprio & BITMAP_MASK);
    377 		KASSERT((ci_rq->r_bitmap[i] & q) == 0);
    378 		ci_rq->r_bitmap[i] |= q;
    379 	}
    380 	TAILQ_INSERT_TAIL(q_head, l, l_runq);
    381 	ci_rq->r_count++;
    382 	if ((l->l_flag & LW_BOUND) == 0)
    383 		ci_rq->r_mcount++;
    384 
    385 	/*
    386 	 * Update the value of highest priority in the runqueue,
    387 	 * if priority of this thread is higher.
    388 	 */
    389 	if (eprio > ci_rq->r_highest_pri)
    390 		ci_rq->r_highest_pri = eprio;
    391 
    392 	sched_newts(l);
    393 }
    394 
    395 void
    396 sched_dequeue(struct lwp *l)
    397 {
    398 	runqueue_t *ci_rq;
    399 	TAILQ_HEAD(, lwp) *q_head;
    400 	const pri_t eprio = lwp_eprio(l);
    401 
    402 	ci_rq = l->l_cpu->ci_schedstate.spc_sched_info;
    403 	KASSERT(lwp_locked(l, l->l_cpu->ci_schedstate.spc_mutex));
    404 
    405 	KASSERT(eprio <= ci_rq->r_highest_pri);
    406 	KASSERT(ci_rq->r_bitmap[eprio >> BITMAP_SHIFT] != 0);
    407 	KASSERT(ci_rq->r_count > 0);
    408 
    409 	ci_rq->r_count--;
    410 	if ((l->l_flag & LW_BOUND) == 0)
    411 		ci_rq->r_mcount--;
    412 
    413 	q_head = sched_getrq(ci_rq, eprio);
    414 	TAILQ_REMOVE(q_head, l, l_runq);
    415 	if (TAILQ_EMPTY(q_head)) {
    416 		u_int i;
    417 		uint32_t q;
    418 
    419 		/* Unmark bit */
    420 		i = eprio >> BITMAP_SHIFT;
    421 		q = BITMAP_MSB >> (eprio & BITMAP_MASK);
    422 		KASSERT((ci_rq->r_bitmap[i] & q) != 0);
    423 		ci_rq->r_bitmap[i] &= ~q;
    424 
    425 		/*
    426 		 * Update the value of highest priority in the runqueue, in a
    427 		 * case it was a last thread in the queue of highest priority.
    428 		 */
    429 		if (eprio != ci_rq->r_highest_pri)
    430 			return;
    431 
    432 		do {
    433 			q = ffs(ci_rq->r_bitmap[i]);
    434 			if (q) {
    435 				ci_rq->r_highest_pri =
    436 				    (i << BITMAP_SHIFT) + (BITMAP_BITS - q);
    437 				return;
    438 			}
    439 		} while (i--);
    440 
    441 		/* If not found - set the lowest value */
    442 		ci_rq->r_highest_pri = 0;
    443 	}
    444 }
    445 
    446 void
    447 sched_slept(struct lwp *l)
    448 {
    449 	sched_info_lwp_t *sil = l->l_sched_info;
    450 
    451 	/* Save the time when thread has slept */
    452 	sil->sl_slept = hardclock_ticks;
    453 
    454 	/*
    455 	 * If thread is in time-sharing queue and batch flag is not marked,
    456 	 * increase the the priority, and run with the lower time-quantum.
    457 	 */
    458 	if (l->l_priority < PRI_HIGHEST_TS &&
    459 	    (sil->sl_flags & SL_BATCH) == 0) {
    460 		KASSERT(l->l_class == SCHED_OTHER);
    461 		l->l_priority++;
    462 	}
    463 }
    464 
    465 void
    466 sched_wakeup(struct lwp *l)
    467 {
    468 	sched_info_lwp_t *sil = l->l_sched_info;
    469 
    470 	/* Update sleep time delta */
    471 	sil->sl_slpsum += (l->l_slptime == 0) ?
    472 	    (hardclock_ticks - sil->sl_slept) : hz;
    473 
    474 	/* If thread was sleeping a second or more - set a high priority */
    475 	if (l->l_slptime > 1 || (hardclock_ticks - sil->sl_slept) >= hz)
    476 		l->l_priority = high_pri[l->l_priority];
    477 
    478 	/* Also, consider looking for a better CPU to wake up */
    479 	if ((l->l_flag & (LW_BOUND | LW_SYSTEM)) == 0)
    480 		l->l_cpu = sched_takecpu(l);
    481 }
    482 
    483 void
    484 sched_pstats_hook(struct lwp *l)
    485 {
    486 	sched_info_lwp_t *sil = l->l_sched_info;
    487 	pri_t prio;
    488 	bool batch;
    489 
    490 	if (l->l_stat == LSSLEEP || l->l_stat == LSSTOP ||
    491 	    l->l_stat == LSSUSPENDED)
    492 		l->l_slptime++;
    493 
    494 	/*
    495 	 * Set that thread is more CPU-bound, if sum of run time exceeds the
    496 	 * sum of sleep time.  Check if thread is CPU-bound a first time.
    497 	 */
    498 	batch = (sil->sl_rtsum > sil->sl_slpsum);
    499 	if (batch) {
    500 		if ((sil->sl_flags & SL_BATCH) == 0)
    501 			batch = false;
    502 		sil->sl_flags |= SL_BATCH;
    503 	} else
    504 		sil->sl_flags &= ~SL_BATCH;
    505 
    506 	/* Reset the time sums */
    507 	sil->sl_slpsum = 0;
    508 	sil->sl_rtsum = 0;
    509 
    510 	/* Estimate threads on time-sharing queue only */
    511 	if (l->l_priority >= PRI_HIGHEST_TS)
    512 		return;
    513 	KASSERT(l->l_class == SCHED_OTHER);
    514 
    515 	/* If it is CPU-bound not a first time - decrease the priority */
    516 	prio = l->l_priority;
    517 	if (batch && prio != 0)
    518 		prio--;
    519 
    520 	/* If thread was not ran a second or more - set a high priority */
    521 	if (l->l_stat == LSRUN) {
    522 		if (sil->sl_lrtime && (hardclock_ticks - sil->sl_lrtime >= hz))
    523 			prio = high_pri[prio];
    524 		/* Re-enqueue the thread if priority has changed */
    525 		if (prio != l->l_priority)
    526 			lwp_changepri(l, prio);
    527 	} else {
    528 		/* In other states, change the priority directly */
    529 		l->l_priority = prio;
    530 	}
    531 }
    532 
    533 /*
    534  * Migration and balancing.
    535  */
    536 
    537 #ifdef MULTIPROCESSOR
    538 
    539 /* Estimate if LWP is cache-hot */
    540 static inline bool
    541 lwp_cache_hot(const struct lwp *l)
    542 {
    543 	const sched_info_lwp_t *sil = l->l_sched_info;
    544 
    545 	if (l->l_slptime || sil->sl_lrtime == 0)
    546 		return false;
    547 
    548 	return (hardclock_ticks - sil->sl_lrtime < cacheht_time);
    549 }
    550 
    551 /* Check if LWP can migrate to the chosen CPU */
    552 static inline bool
    553 sched_migratable(const struct lwp *l, struct cpu_info *ci)
    554 {
    555 	const struct schedstate_percpu *spc = &ci->ci_schedstate;
    556 
    557 	/* CPU is offline */
    558 	if (__predict_false(spc->spc_flags & SPCF_OFFLINE))
    559 		return false;
    560 
    561 	/* Affinity bind */
    562 	if (__predict_false(l->l_flag & LW_AFFINITY))
    563 		return CPU_ISSET(cpu_index(ci), &l->l_affinity);
    564 
    565 	/* Processor-set */
    566 	return (spc->spc_psid == l->l_psid);
    567 }
    568 
    569 /*
    570  * Estimate the migration of LWP to the other CPU.
    571  * Take and return the CPU, if migration is needed.
    572  */
    573 struct cpu_info *
    574 sched_takecpu(struct lwp *l)
    575 {
    576 	struct cpu_info *ci, *tci;
    577 	struct schedstate_percpu *spc;
    578 	runqueue_t *ci_rq;
    579 	CPU_INFO_ITERATOR cii;
    580 	pri_t eprio, lpri;
    581 
    582 	KASSERT(lwp_locked(l, NULL));
    583 
    584 	ci = l->l_cpu;
    585 	spc = &ci->ci_schedstate;
    586 	ci_rq = spc->spc_sched_info;
    587 
    588 	/* If thread is strictly bound, do not estimate other CPUs */
    589 	if (l->l_flag & LW_BOUND)
    590 		return ci;
    591 
    592 	/* CPU of this thread is idling - run there */
    593 	if (ci_rq->r_count == 0)
    594 		return ci;
    595 
    596 	eprio = lwp_eprio(l);
    597 
    598 	/* Stay if thread is cache-hot */
    599 	if (__predict_true(l->l_stat != LSIDL) &&
    600 	    lwp_cache_hot(l) && eprio >= spc->spc_curpriority)
    601 		return ci;
    602 
    603 	/* Run on current CPU if priority of thread is higher */
    604 	ci = curcpu();
    605 	spc = &ci->ci_schedstate;
    606 	if (eprio > spc->spc_curpriority && sched_migratable(l, ci))
    607 		return ci;
    608 
    609 	/*
    610 	 * Look for the CPU with the lowest priority thread.  In case of
    611 	 * equal the priority - check the lower count of the threads.
    612 	 */
    613 	tci = l->l_cpu;
    614 	lpri = PRI_COUNT;
    615 	for (CPU_INFO_FOREACH(cii, ci)) {
    616 		runqueue_t *ici_rq;
    617 		pri_t pri;
    618 
    619 		spc = &ci->ci_schedstate;
    620 		ici_rq = spc->spc_sched_info;
    621 		pri = max(spc->spc_curpriority, ici_rq->r_highest_pri);
    622 		if (pri > lpri)
    623 			continue;
    624 
    625 		if (pri == lpri && ci_rq->r_count < ici_rq->r_count)
    626 			continue;
    627 
    628 		if (!sched_migratable(l, ci))
    629 			continue;
    630 
    631 		lpri = pri;
    632 		tci = ci;
    633 		ci_rq = ici_rq;
    634 	}
    635 	return tci;
    636 }
    637 
    638 /*
    639  * Tries to catch an LWP from the runqueue of other CPU.
    640  */
    641 static struct lwp *
    642 sched_catchlwp(void)
    643 {
    644 	struct cpu_info *curci = curcpu(), *ci = worker_ci;
    645 	TAILQ_HEAD(, lwp) *q_head;
    646 	runqueue_t *ci_rq;
    647 	struct lwp *l;
    648 
    649 	if (curci == ci)
    650 		return NULL;
    651 
    652 	/* Lockless check */
    653 	ci_rq = ci->ci_schedstate.spc_sched_info;
    654 	if (ci_rq->r_count < min_catch)
    655 		return NULL;
    656 
    657 	/*
    658 	 * Double-lock the runqueues.
    659 	 */
    660 	if (curci < ci) {
    661 		spc_lock(ci);
    662 	} else if (!mutex_tryenter(ci->ci_schedstate.spc_mutex)) {
    663 		const runqueue_t *cur_rq = curci->ci_schedstate.spc_sched_info;
    664 
    665 		spc_unlock(curci);
    666 		spc_lock(ci);
    667 		spc_lock(curci);
    668 
    669 		if (cur_rq->r_count) {
    670 			spc_unlock(ci);
    671 			return NULL;
    672 		}
    673 	}
    674 
    675 	if (ci_rq->r_count < min_catch) {
    676 		spc_unlock(ci);
    677 		return NULL;
    678 	}
    679 
    680 	/* Take the highest priority thread */
    681 	q_head = sched_getrq(ci_rq, ci_rq->r_highest_pri);
    682 	l = TAILQ_FIRST(q_head);
    683 
    684 	for (;;) {
    685 		/* Check the first and next result from the queue */
    686 		if (l == NULL)
    687 			break;
    688 
    689 		/* Look for threads, whose are allowed to migrate */
    690 		if ((l->l_flag & LW_SYSTEM) || lwp_cache_hot(l) ||
    691 		    !sched_migratable(l, curci)) {
    692 			l = TAILQ_NEXT(l, l_runq);
    693 			continue;
    694 		}
    695 		/* Recheck if chosen thread is still on the runqueue */
    696 		if (l->l_stat == LSRUN && (l->l_flag & LW_INMEM)) {
    697 			sched_dequeue(l);
    698 			l->l_cpu = curci;
    699 			lwp_setlock(l, curci->ci_schedstate.spc_mutex);
    700 			sched_enqueue(l, false);
    701 			break;
    702 		}
    703 		l = TAILQ_NEXT(l, l_runq);
    704 	}
    705 	spc_unlock(ci);
    706 
    707 	return l;
    708 }
    709 
    710 /*
    711  * Periodical calculations for balancing.
    712  */
    713 static void
    714 sched_balance(void *nocallout)
    715 {
    716 	struct cpu_info *ci, *hci;
    717 	runqueue_t *ci_rq;
    718 	CPU_INFO_ITERATOR cii;
    719 	u_int highest;
    720 
    721 	hci = curcpu();
    722 	highest = 0;
    723 
    724 	/* Make lockless countings */
    725 	for (CPU_INFO_FOREACH(cii, ci)) {
    726 		ci_rq = ci->ci_schedstate.spc_sched_info;
    727 
    728 		/* Average count of the threads */
    729 		ci_rq->r_avgcount = (ci_rq->r_avgcount + ci_rq->r_mcount) >> 1;
    730 
    731 		/* Look for CPU with the highest average */
    732 		if (ci_rq->r_avgcount > highest) {
    733 			hci = ci;
    734 			highest = ci_rq->r_avgcount;
    735 		}
    736 	}
    737 
    738 	/* Update the worker */
    739 	worker_ci = hci;
    740 
    741 	if (nocallout == NULL)
    742 		callout_schedule(&balance_ch, balance_period);
    743 }
    744 
    745 #else
    746 
    747 struct cpu_info *
    748 sched_takecpu(struct lwp *l)
    749 {
    750 
    751 	return l->l_cpu;
    752 }
    753 
    754 #endif	/* MULTIPROCESSOR */
    755 
    756 /*
    757  * Scheduler mill.
    758  */
    759 struct lwp *
    760 sched_nextlwp(void)
    761 {
    762 	struct cpu_info *ci = curcpu();
    763 	struct schedstate_percpu *spc;
    764 	TAILQ_HEAD(, lwp) *q_head;
    765 	sched_info_lwp_t *sil;
    766 	runqueue_t *ci_rq;
    767 	struct lwp *l;
    768 
    769 	spc = &ci->ci_schedstate;
    770 	ci_rq = ci->ci_schedstate.spc_sched_info;
    771 
    772 #ifdef MULTIPROCESSOR
    773 	/* If runqueue is empty, try to catch some thread from other CPU */
    774 	if (__predict_false(spc->spc_flags & SPCF_OFFLINE)) {
    775 		if ((ci_rq->r_count - ci_rq->r_mcount) == 0)
    776 			return NULL;
    777 	} else if (ci_rq->r_count == 0) {
    778 		/* Reset the counter, and call the balancer */
    779 		ci_rq->r_avgcount = 0;
    780 		sched_balance(ci);
    781 
    782 		/* The re-locking will be done inside */
    783 		return sched_catchlwp();
    784 	}
    785 #else
    786 	if (ci_rq->r_count == 0)
    787 		return NULL;
    788 #endif
    789 
    790 	/* Take the highest priority thread */
    791 	KASSERT(ci_rq->r_bitmap[ci_rq->r_highest_pri >> BITMAP_SHIFT]);
    792 	q_head = sched_getrq(ci_rq, ci_rq->r_highest_pri);
    793 	l = TAILQ_FIRST(q_head);
    794 	KASSERT(l != NULL);
    795 
    796 	/* Update the counters */
    797 	sil = l->l_sched_info;
    798 	KASSERT(sil->sl_timeslice >= min_ts);
    799 	KASSERT(sil->sl_timeslice <= max_ts);
    800 	spc->spc_ticks = sil->sl_timeslice;
    801 	sil->sl_rtime = hardclock_ticks;
    802 
    803 	return l;
    804 }
    805 
    806 bool
    807 sched_curcpu_runnable_p(void)
    808 {
    809 	const struct cpu_info *ci = curcpu();
    810 	const runqueue_t *ci_rq = ci->ci_schedstate.spc_sched_info;
    811 
    812 #ifndef __HAVE_FAST_SOFTINTS
    813 	if (ci->ci_data.cpu_softints)
    814 		return true;
    815 #endif
    816 
    817 	if (ci->ci_schedstate.spc_flags & SPCF_OFFLINE)
    818 		return (ci_rq->r_count - ci_rq->r_mcount);
    819 
    820 	return ci_rq->r_count;
    821 }
    822 
    823 /*
    824  * Time-driven events.
    825  */
    826 
    827 /*
    828  * Called once per time-quantum.  This routine is CPU-local and runs at
    829  * IPL_SCHED, thus the locking is not needed.
    830  */
    831 void
    832 sched_tick(struct cpu_info *ci)
    833 {
    834 	const runqueue_t *ci_rq = ci->ci_schedstate.spc_sched_info;
    835 	struct schedstate_percpu *spc = &ci->ci_schedstate;
    836 	struct lwp *l = curlwp;
    837 	const sched_info_lwp_t *sil = l->l_sched_info;
    838 
    839 	if (CURCPU_IDLE_P())
    840 		return;
    841 
    842 	switch (l->l_class) {
    843 	case SCHED_FIFO:
    844 		/*
    845 		 * Update the time-quantum, and continue running,
    846 		 * if thread runs on FIFO real-time policy.
    847 		 */
    848 		KASSERT(l->l_priority > PRI_HIGHEST_TS);
    849 		spc->spc_ticks = sil->sl_timeslice;
    850 		return;
    851 	case SCHED_OTHER:
    852 		/*
    853 		 * If thread is in time-sharing queue, decrease the priority,
    854 		 * and run with a higher time-quantum.
    855 		 */
    856 		KASSERT(l->l_priority <= PRI_HIGHEST_TS);
    857 		if (l->l_priority != 0)
    858 			l->l_priority--;
    859 		break;
    860 	}
    861 
    862 	/*
    863 	 * If there are higher priority threads or threads in the same queue,
    864 	 * mark that thread should yield, otherwise, continue running.
    865 	 */
    866 	if (lwp_eprio(l) <= ci_rq->r_highest_pri || l->l_target_cpu) {
    867 		spc->spc_flags |= SPCF_SHOULDYIELD;
    868 		cpu_need_resched(ci, 0);
    869 	} else
    870 		spc->spc_ticks = sil->sl_timeslice;
    871 }
    872 
    873 /*
    874  * Sysctl nodes and initialization.
    875  */
    876 
    877 static int
    878 sysctl_sched_rtts(SYSCTLFN_ARGS)
    879 {
    880 	struct sysctlnode node;
    881 	int rttsms = hztoms(rt_ts);
    882 
    883 	node = *rnode;
    884 	node.sysctl_data = &rttsms;
    885 	return sysctl_lookup(SYSCTLFN_CALL(&node));
    886 }
    887 
    888 static int
    889 sysctl_sched_mints(SYSCTLFN_ARGS)
    890 {
    891 	struct sysctlnode node;
    892 	struct cpu_info *ci;
    893 	int error, newsize;
    894 	CPU_INFO_ITERATOR cii;
    895 
    896 	node = *rnode;
    897 	node.sysctl_data = &newsize;
    898 
    899 	newsize = hztoms(min_ts);
    900 	error = sysctl_lookup(SYSCTLFN_CALL(&node));
    901 	if (error || newp == NULL)
    902 		return error;
    903 
    904 	newsize = mstohz(newsize);
    905 	if (newsize < 1 || newsize > hz || newsize >= max_ts)
    906 		return EINVAL;
    907 
    908 	/* It is safe to do this in such order */
    909 	for (CPU_INFO_FOREACH(cii, ci))
    910 		spc_lock(ci);
    911 
    912 	min_ts = newsize;
    913 	sched_precalcts();
    914 
    915 	for (CPU_INFO_FOREACH(cii, ci))
    916 		spc_unlock(ci);
    917 
    918 	return 0;
    919 }
    920 
    921 static int
    922 sysctl_sched_maxts(SYSCTLFN_ARGS)
    923 {
    924 	struct sysctlnode node;
    925 	struct cpu_info *ci;
    926 	int error, newsize;
    927 	CPU_INFO_ITERATOR cii;
    928 
    929 	node = *rnode;
    930 	node.sysctl_data = &newsize;
    931 
    932 	newsize = hztoms(max_ts);
    933 	error = sysctl_lookup(SYSCTLFN_CALL(&node));
    934 	if (error || newp == NULL)
    935 		return error;
    936 
    937 	newsize = mstohz(newsize);
    938 	if (newsize < 10 || newsize > hz || newsize <= min_ts)
    939 		return EINVAL;
    940 
    941 	/* It is safe to do this in such order */
    942 	for (CPU_INFO_FOREACH(cii, ci))
    943 		spc_lock(ci);
    944 
    945 	max_ts = newsize;
    946 	sched_precalcts();
    947 
    948 	for (CPU_INFO_FOREACH(cii, ci))
    949 		spc_unlock(ci);
    950 
    951 	return 0;
    952 }
    953 
    954 SYSCTL_SETUP(sysctl_sched_setup, "sysctl kern.sched subtree setup")
    955 {
    956 	const struct sysctlnode *node = NULL;
    957 
    958 	sysctl_createv(clog, 0, NULL, NULL,
    959 		CTLFLAG_PERMANENT,
    960 		CTLTYPE_NODE, "kern", NULL,
    961 		NULL, 0, NULL, 0,
    962 		CTL_KERN, CTL_EOL);
    963 	sysctl_createv(clog, 0, NULL, &node,
    964 		CTLFLAG_PERMANENT,
    965 		CTLTYPE_NODE, "sched",
    966 		SYSCTL_DESCR("Scheduler options"),
    967 		NULL, 0, NULL, 0,
    968 		CTL_KERN, CTL_CREATE, CTL_EOL);
    969 
    970 	if (node == NULL)
    971 		return;
    972 
    973 	sysctl_createv(clog, 0, &node, NULL,
    974 		CTLFLAG_PERMANENT,
    975 		CTLTYPE_STRING, "name", NULL,
    976 		NULL, 0, __UNCONST("M2"), 0,
    977 		CTL_CREATE, CTL_EOL);
    978 	sysctl_createv(clog, 0, &node, NULL,
    979 		CTLFLAG_PERMANENT,
    980 		CTLTYPE_INT, "rtts",
    981 		SYSCTL_DESCR("Round-robin time quantum (in miliseconds)"),
    982 		sysctl_sched_rtts, 0, NULL, 0,
    983 		CTL_CREATE, CTL_EOL);
    984 	sysctl_createv(clog, 0, &node, NULL,
    985 		CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
    986 		CTLTYPE_INT, "maxts",
    987 		SYSCTL_DESCR("Maximal time quantum (in miliseconds)"),
    988 		sysctl_sched_maxts, 0, &max_ts, 0,
    989 		CTL_CREATE, CTL_EOL);
    990 	sysctl_createv(clog, 0, &node, NULL,
    991 		CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
    992 		CTLTYPE_INT, "mints",
    993 		SYSCTL_DESCR("Minimal time quantum (in miliseconds)"),
    994 		sysctl_sched_mints, 0, &min_ts, 0,
    995 		CTL_CREATE, CTL_EOL);
    996 
    997 #ifdef MULTIPROCESSOR
    998 	sysctl_createv(clog, 0, &node, NULL,
    999 		CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
   1000 		CTLTYPE_INT, "cacheht_time",
   1001 		SYSCTL_DESCR("Cache hotness time (in ticks)"),
   1002 		NULL, 0, &cacheht_time, 0,
   1003 		CTL_CREATE, CTL_EOL);
   1004 	sysctl_createv(clog, 0, &node, NULL,
   1005 		CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
   1006 		CTLTYPE_INT, "balance_period",
   1007 		SYSCTL_DESCR("Balance period (in ticks)"),
   1008 		NULL, 0, &balance_period, 0,
   1009 		CTL_CREATE, CTL_EOL);
   1010 	sysctl_createv(clog, 0, &node, NULL,
   1011 		CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
   1012 		CTLTYPE_INT, "min_catch",
   1013 		SYSCTL_DESCR("Minimal count of the threads for catching"),
   1014 		NULL, 0, &min_catch, 0,
   1015 		CTL_CREATE, CTL_EOL);
   1016 #endif
   1017 }
   1018 
   1019 /*
   1020  * Debugging.
   1021  */
   1022 
   1023 #ifdef DDB
   1024 
   1025 void
   1026 sched_print_runqueue(void (*pr)(const char *, ...))
   1027 {
   1028 	runqueue_t *ci_rq;
   1029 	sched_info_lwp_t *sil;
   1030 	struct lwp *l;
   1031 	struct proc *p;
   1032 	int i;
   1033 
   1034 	struct cpu_info *ci;
   1035 	CPU_INFO_ITERATOR cii;
   1036 
   1037 	for (CPU_INFO_FOREACH(cii, ci)) {
   1038 		ci_rq = ci->ci_schedstate.spc_sched_info;
   1039 
   1040 		(*pr)("Run-queue (CPU = %d):\n", ci->ci_cpuid);
   1041 		(*pr)(" pid.lid = %d.%d, threads count = %u, "
   1042 		    "avgcount = %u, highest pri = %d\n",
   1043 		    ci->ci_curlwp->l_proc->p_pid, ci->ci_curlwp->l_lid,
   1044 		    ci_rq->r_count, ci_rq->r_avgcount, ci_rq->r_highest_pri);
   1045 		i = (PRI_COUNT >> BITMAP_SHIFT) - 1;
   1046 		do {
   1047 			uint32_t q;
   1048 			q = ci_rq->r_bitmap[i];
   1049 			(*pr)(" bitmap[%d] => [ %d (0x%x) ]\n", i, ffs(q), q);
   1050 		} while (i--);
   1051 	}
   1052 
   1053 	(*pr)("   %5s %4s %4s %10s %3s %4s %11s %3s %s\n",
   1054 	    "LID", "PRI", "EPRI", "FL", "ST", "TS", "LWP", "CPU", "LRTIME");
   1055 
   1056 	PROCLIST_FOREACH(p, &allproc) {
   1057 		(*pr)(" /- %d (%s)\n", (int)p->p_pid, p->p_comm);
   1058 		LIST_FOREACH(l, &p->p_lwps, l_sibling) {
   1059 			sil = l->l_sched_info;
   1060 			ci = l->l_cpu;
   1061 			(*pr)(" | %5d %4u %4u 0x%8.8x %3s %4u %11p %3d "
   1062 			    "%u ST=%d RT=%d %d\n",
   1063 			    (int)l->l_lid, l->l_priority, lwp_eprio(l),
   1064 			    l->l_flag, l->l_stat == LSRUN ? "RQ" :
   1065 			    (l->l_stat == LSSLEEP ? "SQ" : "-"),
   1066 			    sil->sl_timeslice, l, ci->ci_cpuid,
   1067 			    (u_int)(hardclock_ticks - sil->sl_lrtime),
   1068 			    sil->sl_slpsum, sil->sl_rtsum, sil->sl_flags);
   1069 		}
   1070 	}
   1071 }
   1072 
   1073 #endif
   1074