sched_m2.c revision 1.15
      1 /*	$NetBSD: sched_m2.c,v 1.15 2008/01/15 03:37:11 rmind Exp $	*/
      2 
      3 /*
      4  * Copyright (c) 2007, 2008 Mindaugas Rasiukevicius <rmind at NetBSD org>
      5  * All rights reserved.
      6  *
      7  * Redistribution and use in source and binary forms, with or without
      8  * modification, are permitted provided that the following conditions
      9  * are met:
     10  * 1. Redistributions of source code must retain the above copyright
     11  *    notice, this list of conditions and the following disclaimer.
     12  * 2. Redistributions in binary form must reproduce the above copyright
     13  *    notice, this list of conditions and the following disclaimer in the
     14  *    documentation and/or other materials provided with the distribution.
     15  *
     16  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
     17  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
     18  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     19  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
     20  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
     21  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
     22  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
     23  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
     24  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
     25  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
     26  * POSSIBILITY OF SUCH DAMAGE.
     27  */
     28 
     29 /*
     30  * TODO:
     31  *  - Implementation of fair share queue;
     32  *  - Support for NUMA;
     33  */
     34 
     35 #include <sys/cdefs.h>
     36 __KERNEL_RCSID(0, "$NetBSD: sched_m2.c,v 1.15 2008/01/15 03:37:11 rmind Exp $");
     37 
     38 #include <sys/param.h>
     39 
     40 #include <sys/bitops.h>
     41 #include <sys/cpu.h>
     42 #include <sys/callout.h>
     43 #include <sys/errno.h>
     44 #include <sys/kernel.h>
     45 #include <sys/kmem.h>
     46 #include <sys/lwp.h>
     47 #include <sys/mutex.h>
     48 #include <sys/pool.h>
     49 #include <sys/proc.h>
     50 #include <sys/pset.h>
     51 #include <sys/resource.h>
     52 #include <sys/resourcevar.h>
     53 #include <sys/sched.h>
     54 #include <sys/syscallargs.h>
     55 #include <sys/sysctl.h>
     56 #include <sys/types.h>
     57 
     58 /*
     59  * Priority-related definitions.
     60  */
     61 #define	PRI_TS_COUNT	(NPRI_USER)
     62 #define	PRI_RT_COUNT	(PRI_COUNT - PRI_TS_COUNT)
     63 #define	PRI_HTS_RANGE	(PRI_TS_COUNT / 10)
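        /*
         * PRI_HTS_RANGE is the top tenth of the time-sharing range; high_pri[]
         * maps priorities into it to boost threads that slept, or sat on a
         * runqueue, for a second or more (see sched_precalcts()).
         */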
     64 
     65 #define	PRI_HIGHEST_TS	(MAXPRI_USER)
     66 #define	PRI_DEFAULT	(NPRI_USER >> 1)
     67 
     68 const int schedppq = 1;
     69 
     70 /*
     71  * Bits per map.
     72  */
     73 #define	BITMAP_BITS	(32)
     74 #define	BITMAP_SHIFT	(5)
     75 #define	BITMAP_MSB	(0x80000000U)
     76 #define	BITMAP_MASK	(BITMAP_BITS - 1)
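        /*
         * Priority `p' is tracked in r_bitmap[p >> BITMAP_SHIFT] by the bit
         * BITMAP_MSB >> (p & BITMAP_MASK), i.e. the maps are filled starting
         * from the most significant bit (see sched_enqueue(), sched_dequeue()).
         */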
     77 
     78 /*
     79  * Time-slices and priorities.
     80  */
     81 static u_int	min_ts;			/* Minimal time-slice */
     82 static u_int	max_ts;			/* Maximal time-slice */
     83 static u_int	rt_ts;			/* Real-time time-slice */
     84 static u_int	ts_map[PRI_COUNT];	/* Map of time-slices */
     85 static pri_t	high_pri[PRI_COUNT];	/* Map for priority increase */
     86 
     87 /*
     88  * Migration and balancing.
     89  */
     90 #ifdef MULTIPROCESSOR
     91 
     92 static u_int	cacheht_time;		/* Cache hotness time */
     93 static u_int	min_catch;		/* Minimal LWP count for catching */
     94 
     95 static u_int		balance_period;	/* Balance period */
     96 static struct callout	balance_ch;	/* Callout of balancer */
     97 
     98 static struct cpu_info * volatile worker_ci;
     99 
    100 #define CACHE_HOT(sil)		(sil->sl_lrtime && \
    101     (hardclock_ticks - sil->sl_lrtime < cacheht_time))
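        /*
         * A thread is cache-hot if it has run at least once and last ran
         * less than cacheht_time ticks ago.
         */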
    102 
    103 #endif
    104 
    105 /*
    106  * Structures, runqueue.
    107  */
    108 
    109 typedef struct {
    110 	TAILQ_HEAD(, lwp) q_head;
    111 } queue_t;
    112 
    113 typedef struct {
    114 	/* Lock and bitmap */
    115 	kmutex_t	r_rq_mutex;
    116 	uint32_t	r_bitmap[PRI_COUNT >> BITMAP_SHIFT];
    117 	/* Counters */
    118 	u_int		r_count;	/* Count of the threads */
    119 	pri_t		r_highest_pri;	/* Highest priority */
    120 	u_int		r_avgcount;	/* Average count of threads */
    121 	u_int		r_mcount;	/* Count of migratable threads */
    122 	/* Runqueues */
    123 	queue_t		r_rt_queue[PRI_RT_COUNT];
    124 	queue_t		r_ts_queue[PRI_TS_COUNT];
    125 } runqueue_t;
    126 
    127 typedef struct {
    128 	u_int		sl_flags;
    129 	u_int		sl_timeslice;	/* Time-slice of thread */
    130 	u_int		sl_slept;	/* Saved sleep time for sleep sum */
    131 	u_int		sl_slpsum;	/* Sum of sleep time */
    132 	u_int		sl_rtime;	/* Saved start time of run */
    133 	u_int		sl_rtsum;	/* Sum of the run time */
    134 	u_int		sl_lrtime;	/* Last run time */
    135 } sched_info_lwp_t;
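        /* All sl_* time fields above are measured in hardclock ticks. */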
    136 
    137 /* Flags */
    138 #define	SL_BATCH	0x01
    139 
    140 /* Pool of the scheduler-specific structures for threads */
    141 static struct pool	sil_pool;
    142 
    143 /*
    144  * Prototypes.
    145  */
    146 
    147 static inline void *	sched_getrq(runqueue_t *, const pri_t);
    148 static inline void	sched_newts(struct lwp *);
    149 static void		sched_precalcts(void);
    150 
    151 #ifdef MULTIPROCESSOR
    152 static struct lwp *	sched_catchlwp(void);
    153 static void		sched_balance(void *);
    154 #endif
    155 
    156 /*
    157  * Initialization and setup.
    158  */
    159 
    160 void
    161 sched_rqinit(void)
    162 {
    163 	struct cpu_info *ci = curcpu();
    164 
    165 	if (hz < 100) {
    166 		panic("sched_rqinit: value of HZ is too low\n");
    167 	}
    168 
    169 	/* Default timing ranges */
    170 	min_ts = mstohz(50);			/* ~50ms  */
    171 	max_ts = mstohz(150);			/* ~150ms */
    172 	rt_ts = mstohz(100);			/* ~100ms */
    173 	sched_precalcts();
    174 
    175 #ifdef MULTIPROCESSOR
    176 	/* Balancing */
    177 	worker_ci = ci;
    178 	cacheht_time = mstohz(5);		/* ~5 ms  */
    179 	balance_period = mstohz(300);		/* ~300ms */
    180 	min_catch = ~0;
    181 #endif
    182 
    183 	/* Pool of the scheduler-specific structures */
    184 	pool_init(&sil_pool, sizeof(sched_info_lwp_t), 0, 0, 0,
    185 	    "lwpsd", &pool_allocator_nointr, IPL_NONE);
    186 
    187 	/* Attach the primary CPU here */
    188 	sched_cpuattach(ci);
    189 
    190 	/* Initialize the scheduler structure of the primary LWP */
    191 	lwp0.l_mutex = &ci->ci_schedstate.spc_lwplock;
    192 	sched_lwp_fork(NULL, &lwp0);
    193 	sched_newts(&lwp0);
    194 }
    195 
    196 void
    197 sched_setup(void)
    198 {
    199 
    200 #ifdef MULTIPROCESSOR
    201 	/* Minimal count of LWPs for catching: log2(count of CPUs), capped at 4 */
    202 	min_catch = min(ilog2(ncpu), 4);
    203 
    204 	/* Initialize balancing callout and run it */
    205 	callout_init(&balance_ch, CALLOUT_MPSAFE);
    206 	callout_setfunc(&balance_ch, sched_balance, NULL);
    207 	callout_schedule(&balance_ch, balance_period);
    208 #endif
    209 }
    210 
    211 void
    212 sched_cpuattach(struct cpu_info *ci)
    213 {
    214 	runqueue_t *ci_rq;
    215 	void *rq_ptr;
    216 	u_int i, size;
    217 
    218 	/*
    219 	 * Allocate the run queue.
    220 	 * XXX: Estimate cache behaviour more..
    221 	 */
    222 	size = roundup(sizeof(runqueue_t), CACHE_LINE_SIZE) + CACHE_LINE_SIZE;
    223 	rq_ptr = kmem_zalloc(size, KM_SLEEP);
    224 	if (rq_ptr == NULL) {
    225 		panic("scheduler: could not allocate the runqueue");
    226 	}
    227 	/* XXX: Save the original pointer for future.. */
    228 	ci_rq = (void *)(roundup((intptr_t)(rq_ptr), CACHE_LINE_SIZE));
    229 
    230 	/* Initialize run queues */
    231 	mutex_init(&ci_rq->r_rq_mutex, MUTEX_DEFAULT, IPL_SCHED);
    232 	for (i = 0; i < PRI_RT_COUNT; i++)
    233 		TAILQ_INIT(&ci_rq->r_rt_queue[i].q_head);
    234 	for (i = 0; i < PRI_TS_COUNT; i++)
    235 		TAILQ_INIT(&ci_rq->r_ts_queue[i].q_head);
    236 	ci_rq->r_highest_pri = 0;
    237 
    238 	ci->ci_schedstate.spc_sched_info = ci_rq;
    239 	ci->ci_schedstate.spc_mutex = &ci_rq->r_rq_mutex;
    240 }
    241 
    242 /* Pre-calculate the time-slices for the priorities */
    243 static void
    244 sched_precalcts(void)
    245 {
    246 	pri_t p;
    247 
    248 	/* Time-sharing range */
    249 	for (p = 0; p <= PRI_HIGHEST_TS; p++) {
    250 		ts_map[p] = max_ts -
    251 		    (p * 100 / (PRI_TS_COUNT - 1) * (max_ts - min_ts) / 100);
    252 		high_pri[p] = (PRI_HIGHEST_TS - PRI_HTS_RANGE) +
    253 		    ((p * PRI_HTS_RANGE) / (PRI_TS_COUNT - 1));
    254 	}
    255 
    256 	/* Real-time range */
    257 	for (p = (PRI_HIGHEST_TS + 1); p < PRI_COUNT; p++) {
    258 		ts_map[p] = rt_ts;
    259 		high_pri[p] = p;
    260 	}
    261 }
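        /*
         * Illustrative example (assuming hz = 100): the defaults set in
         * sched_rqinit() yield min_ts = 5, max_ts = 15 and rt_ts = 10 ticks,
         * so ts_map[] falls linearly from 15 ticks at priority 0 down to
         * 5 ticks at PRI_HIGHEST_TS, while every real-time priority gets a
         * fixed 10-tick slice.
         */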
    262 
    263 /*
    264  * Hooks.
    265  */
    266 
    267 void
    268 sched_proc_fork(struct proc *parent, struct proc *child)
    269 {
    270 	struct lwp *l;
    271 
    272 	LIST_FOREACH(l, &child->p_lwps, l_sibling) {
    273 		lwp_lock(l);
    274 		sched_newts(l);
    275 		lwp_unlock(l);
    276 	}
    277 }
    278 
    279 void
    280 sched_proc_exit(struct proc *child, struct proc *parent)
    281 {
    282 
    283 	/* Dummy */
    284 }
    285 
    286 void
    287 sched_lwp_fork(struct lwp *l1, struct lwp *l2)
    288 {
    289 
    290 	KASSERT(l2->l_sched_info == NULL);
    291 	l2->l_sched_info = pool_get(&sil_pool, PR_WAITOK);
    292 	memset(l2->l_sched_info, 0, sizeof(sched_info_lwp_t));
    293 	if (l2->l_priority <= PRI_HIGHEST_TS) /* XXX: For now only.. */
    294 		l2->l_priority = PRI_DEFAULT;
    295 }
    296 
    297 void
    298 sched_lwp_exit(struct lwp *l)
    299 {
    300 
    301 	KASSERT(l->l_sched_info != NULL);
    302 	pool_put(&sil_pool, l->l_sched_info);
    303 	l->l_sched_info = NULL;
    304 }
    305 
    306 void
    307 sched_lwp_collect(struct lwp *l)
    308 {
    309 
    310 }
    311 
    312 void
    313 sched_setrunnable(struct lwp *l)
    314 {
    315 
    316 	/* Dummy */
    317 }
    318 
    319 void
    320 sched_schedclock(struct lwp *l)
    321 {
    322 
    323 	/* Dummy */
    324 }
    325 
    326 /*
    327  * Priorities and time-slice.
    328  */
    329 
    330 void
    331 sched_nice(struct proc *p, int prio)
    332 {
    333 	int nprio;
    334 	struct lwp *l;
    335 
    336 	KASSERT(mutex_owned(&p->p_smutex));
    337 
    338 	p->p_nice = prio;
    339 	nprio = max(min(PRI_DEFAULT + p->p_nice, PRI_HIGHEST_TS), 0);
    340 
    341 	LIST_FOREACH(l, &p->p_lwps, l_sibling) {
    342 		lwp_lock(l);
    343 		lwp_changepri(l, nprio);
    344 		lwp_unlock(l);
    345 	}
    346 }
    347 
    348 /* Recalculate the time-slice */
    349 static inline void
    350 sched_newts(struct lwp *l)
    351 {
    352 	sched_info_lwp_t *sil = l->l_sched_info;
    353 
    354 	sil->sl_timeslice = ts_map[lwp_eprio(l)];
    355 }
    356 
    357 /*
    358  * Control of the runqueue.
    359  */
    360 
    361 static inline void *
    362 sched_getrq(runqueue_t *ci_rq, const pri_t prio)
    363 {
    364 
    365 	KASSERT(prio < PRI_COUNT);
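        	/*
        	 * Time-sharing priorities (0 .. PRI_HIGHEST_TS) index the TS
        	 * queues; higher (real-time) priorities index the RT queues.
        	 */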
    366 	return (prio <= PRI_HIGHEST_TS) ?
    367 	    &ci_rq->r_ts_queue[prio].q_head :
    368 	    &ci_rq->r_rt_queue[prio - PRI_HIGHEST_TS - 1].q_head;
    369 }
    370 
    371 void
    372 sched_enqueue(struct lwp *l, bool swtch)
    373 {
    374 	runqueue_t *ci_rq;
    375 	sched_info_lwp_t *sil = l->l_sched_info;
    376 	TAILQ_HEAD(, lwp) *q_head;
    377 	const pri_t eprio = lwp_eprio(l);
    378 
    379 	ci_rq = l->l_cpu->ci_schedstate.spc_sched_info;
    380 	KASSERT(lwp_locked(l, l->l_cpu->ci_schedstate.spc_mutex));
    381 
    382 	/* Update the last run time on switch */
    383 	if (__predict_true(swtch == true)) {
    384 		sil->sl_lrtime = hardclock_ticks;
    385 		sil->sl_rtsum += (hardclock_ticks - sil->sl_rtime);
    386 	} else if (sil->sl_lrtime == 0)
    387 		sil->sl_lrtime = hardclock_ticks;
    388 
    389 	/* Enqueue the thread */
    390 	q_head = sched_getrq(ci_rq, eprio);
    391 	if (TAILQ_EMPTY(q_head)) {
    392 		u_int i;
    393 		uint32_t q;
    394 
    395 		/* Mark bit */
    396 		i = eprio >> BITMAP_SHIFT;
    397 		q = BITMAP_MSB >> (eprio & BITMAP_MASK);
    398 		KASSERT((ci_rq->r_bitmap[i] & q) == 0);
    399 		ci_rq->r_bitmap[i] |= q;
    400 	}
    401 	TAILQ_INSERT_TAIL(q_head, l, l_runq);
    402 	ci_rq->r_count++;
    403 	if ((l->l_flag & LW_BOUND) == 0)
    404 		ci_rq->r_mcount++;
    405 
    406 	/*
    407 	 * Update the value of highest priority in the runqueue,
    408 	 * if priority of this thread is higher.
    409 	 */
    410 	if (eprio > ci_rq->r_highest_pri)
    411 		ci_rq->r_highest_pri = eprio;
    412 
    413 	sched_newts(l);
    414 }
    415 
    416 void
    417 sched_dequeue(struct lwp *l)
    418 {
    419 	runqueue_t *ci_rq;
    420 	TAILQ_HEAD(, lwp) *q_head;
    421 	const pri_t eprio = lwp_eprio(l);
    422 
    423 	ci_rq = l->l_cpu->ci_schedstate.spc_sched_info;
    424 	KASSERT(lwp_locked(l, l->l_cpu->ci_schedstate.spc_mutex));
    425 
    426 	KASSERT(eprio <= ci_rq->r_highest_pri);
    427 	KASSERT(ci_rq->r_bitmap[eprio >> BITMAP_SHIFT] != 0);
    428 	KASSERT(ci_rq->r_count > 0);
    429 
    430 	ci_rq->r_count--;
    431 	if ((l->l_flag & LW_BOUND) == 0)
    432 		ci_rq->r_mcount--;
    433 
    434 	q_head = sched_getrq(ci_rq, eprio);
    435 	TAILQ_REMOVE(q_head, l, l_runq);
    436 	if (TAILQ_EMPTY(q_head)) {
    437 		u_int i;
    438 		uint32_t q;
    439 
    440 		/* Unmark bit */
    441 		i = eprio >> BITMAP_SHIFT;
    442 		q = BITMAP_MSB >> (eprio & BITMAP_MASK);
    443 		KASSERT((ci_rq->r_bitmap[i] & q) != 0);
    444 		ci_rq->r_bitmap[i] &= ~q;
    445 
    446 		/*
    447 		 * Update the highest priority in the runqueue, in case this
    448 		 * was the last thread in the highest-priority queue.
    449 		 */
    450 		if (eprio != ci_rq->r_highest_pri)
    451 			return;
    452 
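        		/*
        		 * ffs() returns the 1-based index of the least significant
        		 * set bit, and the bitmap is filled MSB-first, so bit `q'
        		 * of word `i' corresponds to priority
        		 * (i << BITMAP_SHIFT) + (BITMAP_BITS - q).
        		 */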
    453 		do {
    454 			q = ffs(ci_rq->r_bitmap[i]);
    455 			if (q) {
    456 				ci_rq->r_highest_pri =
    457 				    (i << BITMAP_SHIFT) + (BITMAP_BITS - q);
    458 				return;
    459 			}
    460 		} while (i--);
    461 
    462 		/* If not found - set the lowest value */
    463 		ci_rq->r_highest_pri = 0;
    464 	}
    465 }
    466 
    467 void
    468 sched_slept(struct lwp *l)
    469 {
    470 	sched_info_lwp_t *sil = l->l_sched_info;
    471 
    472 	/* Save the time when the thread went to sleep */
    473 	sil->sl_slept = hardclock_ticks;
    474 
    475 	/*
    476 	 * If the thread is in the time-sharing queue and the batch flag is
    477 	 * not set, increase the priority and run with a lower time-quantum.
    478 	 */
    479 	if (l->l_priority < PRI_HIGHEST_TS && (sil->sl_flags & SL_BATCH) == 0) {
    480 		KASSERT(l->l_class == SCHED_OTHER);
    481 		l->l_priority++;
    482 	}
    483 }
    484 
    485 void
    486 sched_wakeup(struct lwp *l)
    487 {
    488 	sched_info_lwp_t *sil = l->l_sched_info;
    489 
    490 	/* Update the sleep-time sum: exact delta, or hz if asleep >= 1s */
    491 	sil->sl_slpsum += (l->l_slptime == 0) ?
    492 	    (hardclock_ticks - sil->sl_slept) : hz;
    493 
    494 	/* If the thread slept for a second or more, give it a high priority */
    495 	if (l->l_slptime > 1 || (hardclock_ticks - sil->sl_slept) >= hz)
    496 		l->l_priority = high_pri[l->l_priority];
    497 
    498 	/* Also, consider looking for a better CPU to wake up on */
    499 	if ((l->l_flag & (LW_BOUND | LW_SYSTEM)) == 0)
    500 		l->l_cpu = sched_takecpu(l);
    501 }
    502 
    503 void
    504 sched_pstats_hook(struct lwp *l)
    505 {
    506 	sched_info_lwp_t *sil = l->l_sched_info;
    507 	pri_t prio;
    508 	bool batch;
    509 
    510 	if (l->l_stat == LSSLEEP || l->l_stat == LSSTOP ||
    511 	    l->l_stat == LSSUSPENDED)
    512 		l->l_slptime++;
    513 
    514 	/*
    515 	 * Mark the thread as CPU-bound (batch) if its run-time sum exceeds
    516 	 * its sleep-time sum.  The priority penalty below is skipped the
        	 * first time the thread turns CPU-bound.
    517 	 */
    518 	batch = (sil->sl_rtsum > sil->sl_slpsum);
    519 	if (batch) {
    520 		if ((sil->sl_flags & SL_BATCH) == 0)
    521 			batch = false;
    522 		sil->sl_flags |= SL_BATCH;
    523 	} else
    524 		sil->sl_flags &= ~SL_BATCH;
    525 
    526 	/* Reset the time sums */
    527 	sil->sl_slpsum = 0;
    528 	sil->sl_rtsum = 0;
    529 
    530 	/* Only adjust threads in the time-sharing range */
    531 	if (l->l_priority >= PRI_HIGHEST_TS)
    532 		return;
    533 
    534 	/* If CPU-bound for more than one period, decrease the priority */
    535 	prio = l->l_priority;
    536 	if (batch && prio != 0)
    537 		prio--;
    538 
    539 	/* If the thread has not run for a second or more, set a high priority */
    540 	if (l->l_stat == LSRUN) {
    541 		if (sil->sl_lrtime && (hardclock_ticks - sil->sl_lrtime >= hz))
    542 			prio = high_pri[prio];
    543 		/* Re-enqueue the thread if priority has changed */
    544 		if (prio != l->l_priority)
    545 			lwp_changepri(l, prio);
    546 	} else {
    547 		/* In other states, change the priority directly */
    548 		l->l_priority = prio;
    549 	}
    550 }
    551 
    552 /*
    553  * Migration and balancing.
    554  */
    555 
    556 #ifdef MULTIPROCESSOR
    557 
    558 /* Check if LWP can migrate to the chosen CPU */
    559 static inline bool
    560 sched_migratable(const struct lwp *l, struct cpu_info *ci)
    561 {
    562 	const struct schedstate_percpu *spc = &ci->ci_schedstate;
    563 
    564 	/* CPU is offline */
    565 	if (__predict_false(spc->spc_flags & SPCF_OFFLINE))
    566 		return false;
    567 
    568 	/* Affinity bind */
    569 	if (__predict_false(l->l_flag & LW_AFFINITY))
    570 		return CPU_ISSET(cpu_index(ci), &l->l_affinity);
    571 
    572 	/* Processor-set */
    573 	return (spc->spc_psid == l->l_psid);
    574 }
    575 
    576 /*
    577  * Estimate whether the LWP should migrate to another CPU.
    578  * Pick and return the target CPU, if migration is needed.
    579  */
    580 struct cpu_info *
    581 sched_takecpu(struct lwp *l)
    582 {
    583 	struct cpu_info *ci, *tci;
    584 	struct schedstate_percpu *spc;
    585 	runqueue_t *ci_rq;
    586 	sched_info_lwp_t *sil;
    587 	CPU_INFO_ITERATOR cii;
    588 	pri_t eprio, lpri;
    589 
    590 	KASSERT(lwp_locked(l, NULL));
    591 
    592 	ci = l->l_cpu;
    593 	spc = &ci->ci_schedstate;
    594 	ci_rq = spc->spc_sched_info;
    595 
    596 	/* If the thread is strictly bound, do not consider other CPUs */
    597 	if (l->l_flag & LW_BOUND)
    598 		return ci;
    599 
    600 	/* The runqueue of this thread's CPU is empty - run there */
    601 	if (ci_rq->r_count == 0)
    602 		return ci;
    603 
    604 	eprio = lwp_eprio(l);
    605 	sil = l->l_sched_info;
    606 
    607 	/* Stay if thread is cache-hot */
    608 	if (l->l_stat == LSSLEEP && l->l_slptime <= 1 &&
    609 	    CACHE_HOT(sil) && eprio >= spc->spc_curpriority)
    610 		return ci;
    611 
    612 	/* Run on current CPU if priority of thread is higher */
    613 	ci = curcpu();
    614 	spc = &ci->ci_schedstate;
    615 	if (eprio > spc->spc_curpriority && sched_migratable(l, ci))
    616 		return ci;
    617 
    618 	/*
    619 	 * Look for the CPU with the lowest-priority thread.  If the
    620 	 * priorities are equal, prefer the CPU with fewer threads.
    621 	 */
    622 	tci = l->l_cpu;
    623 	lpri = PRI_COUNT;
    624 	for (CPU_INFO_FOREACH(cii, ci)) {
    625 		runqueue_t *ici_rq;
    626 		pri_t pri;
    627 
    628 		spc = &ci->ci_schedstate;
    629 		ici_rq = spc->spc_sched_info;
    630 		pri = max(spc->spc_curpriority, ici_rq->r_highest_pri);
    631 		if (pri > lpri)
    632 			continue;
    633 
    634 		if (pri == lpri && ci_rq->r_count < ici_rq->r_count)
    635 			continue;
    636 
    637 		if (!sched_migratable(l, ci))
    638 			continue;
    639 
    640 		lpri = pri;
    641 		tci = ci;
    642 		ci_rq = ici_rq;
    643 	}
    644 	return tci;
    645 }
    646 
    647 /*
    648  * Try to catch (steal) an LWP from the run queue of another CPU.
    649  */
    650 static struct lwp *
    651 sched_catchlwp(void)
    652 {
    653 	struct cpu_info *curci = curcpu(), *ci = worker_ci;
    654 	TAILQ_HEAD(, lwp) *q_head;
    655 	runqueue_t *ci_rq;
    656 	struct lwp *l;
    657 
    658 	if (curci == ci)
    659 		return NULL;
    660 
    661 	/* Lockless check */
    662 	ci_rq = ci->ci_schedstate.spc_sched_info;
    663 	if (ci_rq->r_count < min_catch)
    664 		return NULL;
    665 
    666 	/*
    667 	 * Double-lock the runqueues.  Locks are taken in a fixed order (by
    668 	 * CPU address): if ours orders first, block on the remote lock;
        	 * otherwise try-lock it and, on failure, drop our own lock, take
        	 * both in the proper order, and give up if our runqueue is no
        	 * longer empty by then.
        	 */
    669 	if (curci < ci) {
    670 		spc_lock(ci);
    671 	} else if (!mutex_tryenter(ci->ci_schedstate.spc_mutex)) {
    672 		const runqueue_t *cur_rq = curci->ci_schedstate.spc_sched_info;
    673 
    674 		spc_unlock(curci);
    675 		spc_lock(ci);
    676 		spc_lock(curci);
    677 
    678 		if (cur_rq->r_count) {
    679 			spc_unlock(ci);
    680 			return NULL;
    681 		}
    682 	}
    683 
    684 	if (ci_rq->r_count < min_catch) {
    685 		spc_unlock(ci);
    686 		return NULL;
    687 	}
    688 
    689 	/* Take the highest priority thread */
    690 	q_head = sched_getrq(ci_rq, ci_rq->r_highest_pri);
    691 	l = TAILQ_FIRST(q_head);
    692 
    693 	for (;;) {
    694 		sched_info_lwp_t *sil;
    695 
    696 		/* Check the first and next result from the queue */
    697 		if (l == NULL)
    698 			break;
    699 
    700 		/* Look for threads that are allowed to migrate */
    701 		sil = l->l_sched_info;
    702 		if ((l->l_flag & LW_SYSTEM) || CACHE_HOT(sil) ||
    703 		    !sched_migratable(l, curci)) {
    704 			l = TAILQ_NEXT(l, l_runq);
    705 			continue;
    706 		}
    707 		/* Recheck if chosen thread is still on the runqueue */
    708 		if (l->l_stat == LSRUN && (l->l_flag & LW_INMEM)) {
    709 			sched_dequeue(l);
    710 			l->l_cpu = curci;
    711 			lwp_setlock(l, curci->ci_schedstate.spc_mutex);
    712 			sched_enqueue(l, false);
    713 			break;
    714 		}
    715 		l = TAILQ_NEXT(l, l_runq);
    716 	}
    717 	spc_unlock(ci);
    718 
    719 	return l;
    720 }
    721 
    722 /*
    723  * Periodic calculations for balancing.
    724  */
    725 static void
    726 sched_balance(void *nocallout)
    727 {
    728 	struct cpu_info *ci, *hci;
    729 	runqueue_t *ci_rq;
    730 	CPU_INFO_ITERATOR cii;
    731 	u_int highest;
    732 
    733 	hci = curcpu();
    734 	highest = 0;
    735 
    736 	/* Do the counting locklessly */
    737 	for (CPU_INFO_FOREACH(cii, ci)) {
    738 		ci_rq = ci->ci_schedstate.spc_sched_info;
    739 
    740 		/* Decaying average of the migratable thread count */
    741 		ci_rq->r_avgcount = (ci_rq->r_avgcount + ci_rq->r_mcount) >> 1;
    742 
    743 		/* Look for CPU with the highest average */
    744 		if (ci_rq->r_avgcount > highest) {
    745 			hci = ci;
    746 			highest = ci_rq->r_avgcount;
    747 		}
    748 	}
    749 
    750 	/* Update the worker */
    751 	worker_ci = hci;
    752 
    753 	if (nocallout == NULL)
    754 		callout_schedule(&balance_ch, balance_period);
    755 }
    756 
    757 #else
    758 
    759 struct cpu_info *
    760 sched_takecpu(struct lwp *l)
    761 {
    762 
    763 	return l->l_cpu;
    764 }
    765 
    766 #endif	/* MULTIPROCESSOR */
    767 
    768 /*
    769  * Scheduler mill.
    770  */
    771 struct lwp *
    772 sched_nextlwp(void)
    773 {
    774 	struct cpu_info *ci = curcpu();
    775 	struct schedstate_percpu *spc;
    776 	TAILQ_HEAD(, lwp) *q_head;
    777 	sched_info_lwp_t *sil;
    778 	runqueue_t *ci_rq;
    779 	struct lwp *l;
    780 
    781 	spc = &ci->ci_schedstate;
    782 	ci_rq = ci->ci_schedstate.spc_sched_info;
    783 
    784 #ifdef MULTIPROCESSOR
    785 	/* If the runqueue is empty, try to catch a thread from another CPU */
    786 	if (__predict_false(spc->spc_flags & SPCF_OFFLINE)) {
    787 		if ((ci_rq->r_count - ci_rq->r_mcount) == 0)
    788 			return NULL;
    789 	} else if (ci_rq->r_count == 0) {
    790 		/* Reset the counter, and call the balancer */
    791 		ci_rq->r_avgcount = 0;
    792 		sched_balance(ci);
    793 
    794 		/* The re-locking will be done inside */
    795 		return sched_catchlwp();
    796 	}
    797 #else
    798 	if (ci_rq->r_count == 0)
    799 		return NULL;
    800 #endif
    801 
    802 	/* Take the highest priority thread */
    803 	KASSERT(ci_rq->r_bitmap[ci_rq->r_highest_pri >> BITMAP_SHIFT]);
    804 	q_head = sched_getrq(ci_rq, ci_rq->r_highest_pri);
    805 	l = TAILQ_FIRST(q_head);
    806 	KASSERT(l != NULL);
    807 
    808 	/* Update the counters */
    809 	sil = l->l_sched_info;
    810 	KASSERT(sil->sl_timeslice >= min_ts);
    811 	KASSERT(sil->sl_timeslice <= max_ts);
    812 	spc->spc_ticks = sil->sl_timeslice;
    813 	sil->sl_rtime = hardclock_ticks;
    814 
    815 	return l;
    816 }
    817 
    818 bool
    819 sched_curcpu_runnable_p(void)
    820 {
    821 	const struct cpu_info *ci = curcpu();
    822 	const runqueue_t *ci_rq = ci->ci_schedstate.spc_sched_info;
    823 
    824 #ifndef __HAVE_FAST_SOFTINTS
    825 	if (ci->ci_data.cpu_softints)
    826 		return true;
    827 #endif
    828 
    829 	if (ci->ci_schedstate.spc_flags & SPCF_OFFLINE)
    830 		return (ci_rq->r_count - ci_rq->r_mcount);
    831 
    832 	return ci_rq->r_count;
    833 }
    834 
    835 /*
    836  * Time-driven events.
    837  */
    838 
    839 /*
    840  * Called once per time-quantum.  This routine is CPU-local and runs at
    841  * IPL_SCHED, thus no locking is needed.
    842  */
    843 void
    844 sched_tick(struct cpu_info *ci)
    845 {
    846 	const runqueue_t *ci_rq = ci->ci_schedstate.spc_sched_info;
    847 	struct schedstate_percpu *spc = &ci->ci_schedstate;
    848 	struct lwp *l = curlwp;
    849 	sched_info_lwp_t *sil = l->l_sched_info;
    850 
    851 	if (CURCPU_IDLE_P())
    852 		return;
    853 
    854 	switch (l->l_class) {
    855 	case SCHED_FIFO:
    856 		/*
    857 		 * Update the time-quantum and continue running, since the
    858 		 * thread runs under the FIFO real-time policy.
    859 		 */
    860 		spc->spc_ticks = sil->sl_timeslice;
    861 		return;
    862 	case SCHED_OTHER:
    863 		/*
    864 		 * If the thread is in the time-sharing queue, decrease its
    865 		 * priority and run with a higher time-quantum.
    866 		 */
    867 		if (l->l_priority > PRI_HIGHEST_TS)
    868 			break;
    869 		if (l->l_priority != 0)
    870 			l->l_priority--;
    871 		break;
    872 	}
    873 
    874 	/*
    875 	 * If there are higher-priority threads or threads in the same queue,
    876 	 * mark that the thread should yield; otherwise, continue running.
    877 	 */
    878 	if (lwp_eprio(l) <= ci_rq->r_highest_pri || l->l_target_cpu) {
    879 		spc->spc_flags |= SPCF_SHOULDYIELD;
    880 		cpu_need_resched(ci, 0);
    881 	} else
    882 		spc->spc_ticks = sil->sl_timeslice;
    883 }
    884 
    885 /*
    886  * Sysctl nodes and initialization.
    887  */
    888 
    889 static int
    890 sysctl_sched_rtts(SYSCTLFN_ARGS)
    891 {
    892 	struct sysctlnode node;
    893 	int rttsms = hztoms(rt_ts);
    894 
    895 	node = *rnode;
    896 	node.sysctl_data = &rttsms;
    897 	return sysctl_lookup(SYSCTLFN_CALL(&node));
    898 }
    899 
    900 static int
    901 sysctl_sched_mints(SYSCTLFN_ARGS)
    902 {
    903 	struct sysctlnode node;
    904 	struct cpu_info *ci;
    905 	int error, newsize;
    906 	CPU_INFO_ITERATOR cii;
    907 
    908 	node = *rnode;
    909 	node.sysctl_data = &newsize;
    910 
    911 	newsize = hztoms(min_ts);
    912 	error = sysctl_lookup(SYSCTLFN_CALL(&node));
    913 	if (error || newp == NULL)
    914 		return error;
    915 
    916 	newsize = mstohz(newsize);
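        	/*
        	 * The new minimal time-slice must be at least one tick, at most
        	 * one second (hz ticks) and strictly less than the maximal one.
        	 */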
    917 	if (newsize < 1 || newsize > hz || newsize >= max_ts)
    918 		return EINVAL;
    919 
    920 	/* It is safe to do this in this order */
    921 	for (CPU_INFO_FOREACH(cii, ci))
    922 		spc_lock(ci);
    923 
    924 	min_ts = newsize;
    925 	sched_precalcts();
    926 
    927 	for (CPU_INFO_FOREACH(cii, ci))
    928 		spc_unlock(ci);
    929 
    930 	return 0;
    931 }
    932 
    933 static int
    934 sysctl_sched_maxts(SYSCTLFN_ARGS)
    935 {
    936 	struct sysctlnode node;
    937 	struct cpu_info *ci;
    938 	int error, newsize;
    939 	CPU_INFO_ITERATOR cii;
    940 
    941 	node = *rnode;
    942 	node.sysctl_data = &newsize;
    943 
    944 	newsize = hztoms(max_ts);
    945 	error = sysctl_lookup(SYSCTLFN_CALL(&node));
    946 	if (error || newp == NULL)
    947 		return error;
    948 
    949 	newsize = mstohz(newsize);
    950 	if (newsize < 10 || newsize > hz || newsize <= min_ts)
    951 		return EINVAL;
    952 
    953 	/* It is safe to do this in this order */
    954 	for (CPU_INFO_FOREACH(cii, ci))
    955 		spc_lock(ci);
    956 
    957 	max_ts = newsize;
    958 	sched_precalcts();
    959 
    960 	for (CPU_INFO_FOREACH(cii, ci))
    961 		spc_unlock(ci);
    962 
    963 	return 0;
    964 }
    965 
    966 SYSCTL_SETUP(sysctl_sched_setup, "sysctl kern.sched subtree setup")
    967 {
    968 	const struct sysctlnode *node = NULL;
    969 
    970 	sysctl_createv(clog, 0, NULL, NULL,
    971 		CTLFLAG_PERMANENT,
    972 		CTLTYPE_NODE, "kern", NULL,
    973 		NULL, 0, NULL, 0,
    974 		CTL_KERN, CTL_EOL);
    975 	sysctl_createv(clog, 0, NULL, &node,
    976 		CTLFLAG_PERMANENT,
    977 		CTLTYPE_NODE, "sched",
    978 		SYSCTL_DESCR("Scheduler options"),
    979 		NULL, 0, NULL, 0,
    980 		CTL_KERN, CTL_CREATE, CTL_EOL);
    981 
    982 	if (node == NULL)
    983 		return;
    984 
    985 	sysctl_createv(clog, 0, &node, NULL,
    986 		CTLFLAG_PERMANENT,
    987 		CTLTYPE_STRING, "name", NULL,
    988 		NULL, 0, __UNCONST("M2"), 0,
    989 		CTL_CREATE, CTL_EOL);
    990 	sysctl_createv(clog, 0, &node, NULL,
    991 		CTLFLAG_PERMANENT,
    992 		CTLTYPE_INT, "rtts",
    993 		SYSCTL_DESCR("Round-robin time quantum (in milliseconds)"),
    994 		sysctl_sched_rtts, 0, NULL, 0,
    995 		CTL_CREATE, CTL_EOL);
    996 	sysctl_createv(clog, 0, &node, NULL,
    997 		CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
    998 		CTLTYPE_INT, "maxts",
    999 		SYSCTL_DESCR("Maximal time quantum (in milliseconds)"),
   1000 		sysctl_sched_maxts, 0, &max_ts, 0,
   1001 		CTL_CREATE, CTL_EOL);
   1002 	sysctl_createv(clog, 0, &node, NULL,
   1003 		CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
   1004 		CTLTYPE_INT, "mints",
   1005 		SYSCTL_DESCR("Minimal time quantum (in milliseconds)"),
   1006 		sysctl_sched_mints, 0, &min_ts, 0,
   1007 		CTL_CREATE, CTL_EOL);
   1008 
   1009 #ifdef MULTIPROCESSOR
   1010 	sysctl_createv(clog, 0, &node, NULL,
   1011 		CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
   1012 		CTLTYPE_INT, "cacheht_time",
   1013 		SYSCTL_DESCR("Cache hotness time (in ticks)"),
   1014 		NULL, 0, &cacheht_time, 0,
   1015 		CTL_CREATE, CTL_EOL);
   1016 	sysctl_createv(clog, 0, &node, NULL,
   1017 		CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
   1018 		CTLTYPE_INT, "balance_period",
   1019 		SYSCTL_DESCR("Balance period (in ticks)"),
   1020 		NULL, 0, &balance_period, 0,
   1021 		CTL_CREATE, CTL_EOL);
   1022 	sysctl_createv(clog, 0, &node, NULL,
   1023 		CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
   1024 		CTLTYPE_INT, "min_catch",
   1025 		SYSCTL_DESCR("Minimal count of the threads for catching"),
   1026 		NULL, 0, &min_catch, 0,
   1027 		CTL_CREATE, CTL_EOL);
   1028 #endif
   1029 }
   1030 
   1031 /*
   1032  * Debugging.
   1033  */
   1034 
   1035 #ifdef DDB
   1036 
   1037 void
   1038 sched_print_runqueue(void (*pr)(const char *, ...))
   1039 {
   1040 	runqueue_t *ci_rq;
   1041 	sched_info_lwp_t *sil;
   1042 	struct lwp *l;
   1043 	struct proc *p;
   1044 	int i;
   1045 
   1046 	struct cpu_info *ci;
   1047 	CPU_INFO_ITERATOR cii;
   1048 
   1049 	for (CPU_INFO_FOREACH(cii, ci)) {
   1050 		ci_rq = ci->ci_schedstate.spc_sched_info;
   1051 
   1052 		(*pr)("Run-queue (CPU = %d):\n", ci->ci_cpuid);
   1053 		(*pr)(" pid.lid = %d.%d, threads count = %u, "
   1054 		    "avgcount = %u, highest pri = %d\n",
   1055 		    ci->ci_curlwp->l_proc->p_pid, ci->ci_curlwp->l_lid,
   1056 		    ci_rq->r_count, ci_rq->r_avgcount, ci_rq->r_highest_pri);
   1057 		i = (PRI_COUNT >> BITMAP_SHIFT) - 1;
   1058 		do {
   1059 			uint32_t q;
   1060 			q = ci_rq->r_bitmap[i];
   1061 			(*pr)(" bitmap[%d] => [ %d (0x%x) ]\n", i, ffs(q), q);
   1062 		} while (i--);
   1063 	}
   1064 
   1065 	(*pr)("   %5s %4s %4s %10s %3s %4s %11s %3s %s\n",
   1066 	    "LID", "PRI", "EPRI", "FL", "ST", "TS", "LWP", "CPU", "LRTIME");
   1067 
   1068 	PROCLIST_FOREACH(p, &allproc) {
   1069 		(*pr)(" /- %d (%s)\n", (int)p->p_pid, p->p_comm);
   1070 		LIST_FOREACH(l, &p->p_lwps, l_sibling) {
   1071 			sil = l->l_sched_info;
   1072 			ci = l->l_cpu;
   1073 			(*pr)(" | %5d %4u %4u 0x%8.8x %3s %4u %11p %3d "
   1074 			    "%u ST=%d RT=%d %d\n",
   1075 			    (int)l->l_lid, l->l_priority, lwp_eprio(l),
   1076 			    l->l_flag, l->l_stat == LSRUN ? "RQ" :
   1077 			    (l->l_stat == LSSLEEP ? "SQ" : "-"),
   1078 			    sil->sl_timeslice, l, ci->ci_cpuid,
   1079 			    (u_int)(hardclock_ticks - sil->sl_lrtime),
   1080 			    sil->sl_slpsum, sil->sl_rtsum, sil->sl_flags);
   1081 		}
   1082 	}
   1083 }
   1084 
   1085 #endif /* defined(DDB) */
   1086