/*	$NetBSD: sched_m2.c,v 1.24.4.2 2009/07/18 14:53:23 yamt Exp $	*/

/*
 * Copyright (c) 2007, 2008 Mindaugas Rasiukevicius <rmind at NetBSD org>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * TODO:
 *  - Implementation of fair share queue;
 *  - Support for NUMA;
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: sched_m2.c,v 1.24.4.2 2009/07/18 14:53:23 yamt Exp $");

#include <sys/param.h>

#include <sys/cpu.h>
#include <sys/callout.h>
#include <sys/errno.h>
#include <sys/kernel.h>
#include <sys/kmem.h>
#include <sys/lwp.h>
#include <sys/mutex.h>
#include <sys/pool.h>
#include <sys/proc.h>
#include <sys/pset.h>
#include <sys/resource.h>
#include <sys/resourcevar.h>
#include <sys/sched.h>
#include <sys/syscallargs.h>
#include <sys/sysctl.h>
#include <sys/types.h>

/*
 * Priority-related definitions.
 */
#define	PRI_TS_COUNT	(NPRI_USER)
#define	PRI_RT_COUNT	(PRI_COUNT - PRI_TS_COUNT)
#define	PRI_HTS_RANGE	(PRI_TS_COUNT / 10)

#define	PRI_HIGHEST_TS	(MAXPRI_USER)

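/*
 * For illustration only (the concrete values depend on the kernel
 * configuration): with the usual NetBSD NPRI_USER = 64 and
 * PRI_COUNT = 224, the time-sharing class occupies priorities 0..63
 * (PRI_HIGHEST_TS = 63), the real-time classes 64..223, and
 * PRI_HTS_RANGE = 64 / 10 = 6, i.e. the "high" band used for the
 * sleep/starvation boost below is priorities 57..63.
 */
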
/*
 * Time-slices and priorities.
 */
static u_int	min_ts;			/* Minimal time-slice */
static u_int	max_ts;			/* Maximal time-slice */
static u_int	rt_ts;			/* Real-time time-slice */
static u_int	ts_map[PRI_COUNT];	/* Map of time-slices */
static pri_t	high_pri[PRI_COUNT];	/* Map for priority increase */

static void	sched_precalcts(void);

/*
 * Initialization and setup.
 */

void
sched_rqinit(void)
{
	struct cpu_info *ci = curcpu();

	if (hz < 100) {
		panic("sched_rqinit: value of HZ is too low\n");
	}

	/* Default timing ranges */
	min_ts = mstohz(20);			/*  ~20 ms */
	max_ts = mstohz(150);			/* ~150 ms */
	rt_ts = mstohz(100);			/* ~100 ms */
	sched_precalcts();

	/* Attach the primary CPU here */
	sched_cpuattach(ci);

	sched_lwp_fork(NULL, &lwp0);
	sched_newts(&lwp0);
}

/* Pre-calculate the time-slices for the priorities */
static void
sched_precalcts(void)
{
	pri_t p;

	/* Time-sharing range */
	for (p = 0; p <= PRI_HIGHEST_TS; p++) {
		ts_map[p] = max_ts -
		    (p * 100 / (PRI_TS_COUNT - 1) * (max_ts - min_ts) / 100);
		high_pri[p] = (PRI_HIGHEST_TS - PRI_HTS_RANGE) +
		    ((p * PRI_HTS_RANGE) / (PRI_TS_COUNT - 1));
	}

	/* Real-time range */
	for (p = (PRI_HIGHEST_TS + 1); p < PRI_COUNT; p++) {
		ts_map[p] = rt_ts;
		high_pri[p] = p;
	}
}
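
/*
 * A worked example (illustrative, assuming hz = 100 and NPRI_USER = 64):
 * min_ts ~ 2 ticks, max_ts ~ 15 ticks, rt_ts ~ 10 ticks.  The map
 * interpolates linearly, so the lowest time-sharing priority gets the
 * longest slice (ts_map[0] = max_ts) and the highest the shortest
 * (ts_map[63] = min_ts), while every real-time priority gets the fixed
 * rt_ts.  high_pri[] maps any time-sharing priority into the top
 * PRI_HTS_RANGE band (57..63 in this example).
 */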

/*
 * Hooks.  Most of these are empty: this scheduler keeps no per-LWP
 * history to inherit or collect; sched_proc_fork() only resets the
 * time-slices of the child's LWPs.
 */

void
sched_proc_fork(struct proc *parent, struct proc *child)
{
	struct lwp *l;

	LIST_FOREACH(l, &child->p_lwps, l_sibling) {
		lwp_lock(l);
		sched_newts(l);
		lwp_unlock(l);
	}
}

void
sched_proc_exit(struct proc *child, struct proc *parent)
{

}

void
sched_lwp_fork(struct lwp *l1, struct lwp *l2)
{

}

void
sched_lwp_collect(struct lwp *l)
{

}

void
sched_setrunnable(struct lwp *l)
{

}

void
sched_schedclock(struct lwp *l)
{

}

/*
 * Priorities and time-slice.
 */

void
sched_nice(struct proc *p, int prio)
{
	struct lwp *l;
	int n;

	KASSERT(mutex_owned(p->p_lock));

	p->p_nice = prio;
	n = (prio - NZERO) >> 2;
	if (n == 0)
		return;

	LIST_FOREACH(l, &p->p_lwps, l_sibling) {
		lwp_lock(l);
		if (l->l_class == SCHED_OTHER) {
			pri_t pri = l->l_priority - n;
			pri = (n < 0) ? min(pri, PRI_HIGHEST_TS) : imax(pri, 0);
			lwp_changepri(l, pri);
		}
		lwp_unlock(l);
	}
}
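
/*
 * For illustration (assuming the usual NZERO = 20): nice(1) values are
 * stored as p_nice = nice + NZERO, so "nice 8" gives p_nice = 28 and
 * n = (28 - 20) >> 2 = 2, lowering every SCHED_OTHER LWP by two
 * priority levels.  Nice values 0..3 leave n = 0 (no change), while
 * any negative nice raises priority by at least one level, since the
 * arithmetic right shift rounds toward minus infinity.
 */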

/* Recalculate the time-slice */
void
sched_newts(struct lwp *l)
{

	l->l_sched.timeslice = ts_map[lwp_eprio(l)];
}

void
sched_slept(struct lwp *l)
{

	/*
	 * If the thread is in the time-sharing queue and the batch flag
	 * is not set, increase its priority; it will then run with a
	 * lower (shorter) time-quantum.
	 */
	if (l->l_priority < PRI_HIGHEST_TS && (l->l_flag & LW_BATCH) == 0) {
		struct proc *p = l->l_proc;

		KASSERT(l->l_class == SCHED_OTHER);
		if (__predict_false(p->p_nice < NZERO)) {
			const int n = max((NZERO - p->p_nice) >> 2, 1);
			l->l_priority = min(l->l_priority + n, PRI_HIGHEST_TS);
		} else {
			l->l_priority++;
		}
	}
}
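
/*
 * Illustration (assuming NZERO = 20): for a thread niced to -10
 * (p_nice = 10), each sleep raises its priority by
 * max((20 - 10) >> 2, 1) = 2 levels instead of the usual 1, so
 * negatively-niced interactive threads climb toward PRI_HIGHEST_TS
 * faster.
 */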

void
sched_wakeup(struct lwp *l)
{

	/* If the thread slept for a second or more, boost it to a high priority */
	if (l->l_slptime >= 1)
		l->l_priority = high_pri[l->l_priority];
}

void
sched_pstats_hook(struct lwp *l, int batch)
{
	pri_t prio;

	/*
	 * Re-estimate only threads on the time-sharing queue; for
	 * performance, skip those already at the highest priority.
	 */
	KASSERT(lwp_locked(l, NULL));
	if (l->l_priority >= PRI_HIGHEST_TS)
		return;
	KASSERT(l->l_class == SCHED_OTHER);

	/* If it has stayed CPU-bound (batch), decrease the priority */
	prio = l->l_priority;
	if (batch && prio != 0)
		prio--;

	/* If the runnable thread has not run for a second or more, boost it */
	if (l->l_stat == LSRUN) {
		if (l->l_rticks && (hardclock_ticks - l->l_rticks >= hz))
			prio = high_pri[prio];
		/* Re-enqueue the thread if the priority has changed */
		if (prio != l->l_priority)
			lwp_changepri(l, prio);
	} else {
		/* In other states, change the priority directly */
		l->l_priority = prio;
	}
}
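
/*
 * Illustration: sched_pstats() invokes this hook roughly once per
 * second.  A CPU-bound (batch) thread at priority 10 thus decays to
 * 9, 8, ... on successive passes, while a runnable thread starved for
 * hz ticks or more jumps straight into the high band, high_pri[prio].
 */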

void
sched_oncpu(lwp_t *l)
{
	struct schedstate_percpu *spc = &l->l_cpu->ci_schedstate;

	/* Update the counters */
	KASSERT(l->l_sched.timeslice >= min_ts);
	KASSERT(l->l_sched.timeslice <= max_ts);
	spc->spc_ticks = l->l_sched.timeslice;
}
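
/*
 * Note: spc_ticks is the per-CPU countdown, in hardclock ticks, that
 * brings sched_tick() below around when it runs out ("once per
 * time-quantum"); loading it here hands the incoming LWP a full
 * time-slice.
 */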

/*
 * Time-driven events.
 */

/*
 * Called once per time-quantum.  This routine is CPU-local and runs at
 * IPL_SCHED, thus the locking is not needed.
 */
void
sched_tick(struct cpu_info *ci)
{
	struct schedstate_percpu *spc = &ci->ci_schedstate;
	struct lwp *l = curlwp;
	struct proc *p;

	if (__predict_false(CURCPU_IDLE_P()))
		return;

	switch (l->l_class) {
	case SCHED_FIFO:
		/*
		 * If the thread runs under the FIFO real-time policy,
		 * refresh the time-quantum and continue running.
		 */
		KASSERT(l->l_priority > PRI_HIGHEST_TS);
		spc->spc_ticks = l->l_sched.timeslice;
		return;
	case SCHED_OTHER:
		/*
		 * If the thread is in the time-sharing queue, decrease
		 * its priority; it will then run with a higher (longer)
		 * time-quantum.
		 */
		KASSERT(l->l_priority <= PRI_HIGHEST_TS);
		if (l->l_priority == 0)
			break;

		p = l->l_proc;
		if (__predict_false(p->p_nice > NZERO)) {
			const int n = max((p->p_nice - NZERO) >> 2, 1);
			l->l_priority = imax(l->l_priority - n, 0);
		} else
			l->l_priority--;
		break;
	}

	/*
	 * If there are higher-priority threads, or threads in the same
	 * queue, mark that the thread should yield; otherwise, let it
	 * continue running.
	 */
	if (lwp_eprio(l) <= spc->spc_maxpriority || l->l_target_cpu) {
		spc->spc_flags |= SPCF_SHOULDYIELD;
		cpu_need_resched(ci, 0);
	} else
		spc->spc_ticks = l->l_sched.timeslice;
}
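
/*
 * Putting the pieces together (illustrative, hz = 100): a SCHED_OTHER
 * thread that burns its whole quantum drops one priority level each
 * time the quantum expires, so its slices grow from ts_map[p] toward
 * max_ts (~15 ticks) as it sinks, while sched_slept() pushes sleepers
 * the other way, toward min_ts (~2 ticks) slices with more frequent
 * dispatch.  That opposing motion is the M2 feedback loop.
 */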

/*
 * Sysctl nodes and initialization.
 */

static int
sysctl_sched_rtts(SYSCTLFN_ARGS)
{
	struct sysctlnode node;
	int rttsms = hztoms(rt_ts);

	node = *rnode;
	node.sysctl_data = &rttsms;
	return sysctl_lookup(SYSCTLFN_CALL(&node));
}

static int
sysctl_sched_mints(SYSCTLFN_ARGS)
{
	struct sysctlnode node;
	struct cpu_info *ci;
	int error, newsize;
	CPU_INFO_ITERATOR cii;

	node = *rnode;
	node.sysctl_data = &newsize;

	newsize = hztoms(min_ts);
	error = sysctl_lookup(SYSCTLFN_CALL(&node));
	if (error || newp == NULL)
		return error;

	newsize = mstohz(newsize);
	if (newsize < 1 || newsize > hz || newsize >= max_ts)
		return EINVAL;

	/* Holding all run-queue locks makes the update and recalculation safe */
	for (CPU_INFO_FOREACH(cii, ci))
		spc_lock(ci);

	min_ts = newsize;
	sched_precalcts();

	for (CPU_INFO_FOREACH(cii, ci))
		spc_unlock(ci);

	return 0;
}

static int
sysctl_sched_maxts(SYSCTLFN_ARGS)
{
	struct sysctlnode node;
	struct cpu_info *ci;
	int error, newsize;
	CPU_INFO_ITERATOR cii;

	node = *rnode;
	node.sysctl_data = &newsize;

	newsize = hztoms(max_ts);
	error = sysctl_lookup(SYSCTLFN_CALL(&node));
	if (error || newp == NULL)
		return error;

	newsize = mstohz(newsize);
	if (newsize < 10 || newsize > hz || newsize <= min_ts)
		return EINVAL;

	/* Holding all run-queue locks makes the update and recalculation safe */
	for (CPU_INFO_FOREACH(cii, ci))
		spc_lock(ci);

	max_ts = newsize;
	sched_precalcts();

	for (CPU_INFO_FOREACH(cii, ci))
		spc_unlock(ci);

	return 0;
}
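
/*
 * For illustration: both handlers take values in milliseconds, e.g.
 * "sysctl -w kern.sched.maxts=200"; the input is converted with
 * mstohz() and validated against hz and the opposite bound before the
 * time-slice tables are recalculated under the run-queue locks.
 */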

SYSCTL_SETUP(sysctl_sched_m2_setup, "sysctl sched setup")
{
	const struct sysctlnode *node = NULL;

	sysctl_createv(clog, 0, NULL, NULL,
		CTLFLAG_PERMANENT,
		CTLTYPE_NODE, "kern", NULL,
		NULL, 0, NULL, 0,
		CTL_KERN, CTL_EOL);
	sysctl_createv(clog, 0, NULL, &node,
		CTLFLAG_PERMANENT,
		CTLTYPE_NODE, "sched",
		SYSCTL_DESCR("Scheduler options"),
		NULL, 0, NULL, 0,
		CTL_KERN, CTL_CREATE, CTL_EOL);

	if (node == NULL)
		return;

	sysctl_createv(NULL, 0, &node, NULL,
		CTLFLAG_PERMANENT,
		CTLTYPE_STRING, "name", NULL,
		NULL, 0, __UNCONST("M2"), 0,
		CTL_CREATE, CTL_EOL);
	sysctl_createv(NULL, 0, &node, NULL,
		CTLFLAG_PERMANENT,
		CTLTYPE_INT, "rtts",
		SYSCTL_DESCR("Round-robin time quantum (in milliseconds)"),
		sysctl_sched_rtts, 0, NULL, 0,
		CTL_CREATE, CTL_EOL);
	sysctl_createv(NULL, 0, &node, NULL,
		CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
		CTLTYPE_INT, "maxts",
		SYSCTL_DESCR("Maximal time quantum (in milliseconds)"),
		sysctl_sched_maxts, 0, &max_ts, 0,
		CTL_CREATE, CTL_EOL);
	sysctl_createv(NULL, 0, &node, NULL,
		CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
		CTLTYPE_INT, "mints",
		SYSCTL_DESCR("Minimal time quantum (in milliseconds)"),
		sysctl_sched_mints, 0, &min_ts, 0,
		CTL_CREATE, CTL_EOL);
}