/*	$NetBSD: sched_m2.c,v 1.25 2008/05/19 12:48:54 rmind Exp $	*/

/*
 * Copyright (c) 2007, 2008 Mindaugas Rasiukevicius <rmind at NetBSD org>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * TODO:
 *  - Implementation of fair share queue;
 *  - Support for NUMA;
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: sched_m2.c,v 1.25 2008/05/19 12:48:54 rmind Exp $");

#include <sys/param.h>

#include <sys/bitops.h>
#include <sys/cpu.h>
#include <sys/callout.h>
#include <sys/errno.h>
#include <sys/kernel.h>
#include <sys/kmem.h>
#include <sys/lwp.h>
#include <sys/mutex.h>
#include <sys/pool.h>
#include <sys/proc.h>
#include <sys/pset.h>
#include <sys/resource.h>
#include <sys/resourcevar.h>
#include <sys/sched.h>
#include <sys/syscallargs.h>
#include <sys/sysctl.h>
#include <sys/types.h>

/*
 * Priority related definitions.
 */
#define	PRI_TS_COUNT	(NPRI_USER)
#define	PRI_RT_COUNT	(PRI_COUNT - PRI_TS_COUNT)
#define	PRI_HTS_RANGE	(PRI_TS_COUNT / 10)

#define	PRI_HIGHEST_TS	(MAXPRI_USER)

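/*
 * Illustrative layout (a sketch assuming NPRI_USER == 64 and
 * PRI_COUNT == 224; <sys/param.h> is authoritative):
 *
 *	PRI_TS_COUNT	= 64	time-sharing priorities 0 .. 63
 *	PRI_RT_COUNT	= 160	priorities above the TS range, 64 .. 223
 *	PRI_HTS_RANGE	= 6	top TS band used for priority boosts
 *	PRI_HIGHEST_TS	= 63	highest time-sharing priority
 */
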
/*
 * Time-slices and priorities.
 */
static u_int	min_ts;			/* Minimal time-slice */
static u_int	max_ts;			/* Maximal time-slice */
static u_int	rt_ts;			/* Real-time time-slice */
static u_int	ts_map[PRI_COUNT];	/* Map of time-slices */
static pri_t	high_pri[PRI_COUNT];	/* Map for priority increase */

static void	sched_precalcts(void);

typedef struct {
	u_int		sl_timeslice;	/* Time-slice of thread */
} sched_info_lwp_t;

static pool_cache_t	sil_pool;

/*
 * Initialization and setup.
 */

void
sched_rqinit(void)
{
	struct cpu_info *ci = curcpu();

	if (hz < 100) {
		panic("sched_rqinit: value of HZ is too low\n");
	}

	/* Default timing ranges */
	min_ts = mstohz(50);			/* ~50ms  */
	max_ts = mstohz(150);			/* ~150ms */
	rt_ts = mstohz(100);			/* ~100ms */
	sched_precalcts();

	/* Pool of the scheduler-specific structures */
	sil_pool = pool_cache_init(sizeof(sched_info_lwp_t), coherency_unit,
	    0, 0, "lwpsd", NULL, IPL_NONE, NULL, NULL, NULL);

	/* Attach the primary CPU here */
	sched_cpuattach(ci);

	sched_lwp_fork(NULL, &lwp0);
	sched_newts(&lwp0);
}

/* Pre-calculate the time-slices for the priorities */
static void
sched_precalcts(void)
{
	pri_t p;

	/* Time-sharing range */
	for (p = 0; p <= PRI_HIGHEST_TS; p++) {
		ts_map[p] = max_ts -
		    (p * 100 / (PRI_TS_COUNT - 1) * (max_ts - min_ts) / 100);
		high_pri[p] = (PRI_HIGHEST_TS - PRI_HTS_RANGE) +
		    ((p * PRI_HTS_RANGE) / (PRI_TS_COUNT - 1));
	}

	/* Real-time range */
	for (p = (PRI_HIGHEST_TS + 1); p < PRI_COUNT; p++) {
		ts_map[p] = rt_ts;
		high_pri[p] = p;
	}
}
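
/*
 * Worked example (a sketch assuming hz == 1000, so min_ts == 50 and
 * max_ts == 150 ticks, with the illustrative PRI_TS_COUNT == 64):
 *
 *	ts_map[0]  == 150	lowest priority, longest slice (~150 ms)
 *	ts_map[31] == 101	31 * 100 / 63 == 49; 150 - 49 == 101
 *	ts_map[63] == 50	highest TS priority, shortest slice (~50 ms)
 *
 *	high_pri[0]  == 57	(63 - 6) + 0
 *	high_pri[63] == 63	already at the top of the TS range
 *
 * The time-slice thus shrinks linearly from max_ts at priority 0 to
 * min_ts at PRI_HIGHEST_TS, and high_pri[] compresses any TS priority
 * into the topmost PRI_HTS_RANGE band.
 */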

/*
 * Hooks.
 */

void
sched_proc_fork(struct proc *parent, struct proc *child)
{
	struct lwp *l;

	LIST_FOREACH(l, &child->p_lwps, l_sibling) {
		lwp_lock(l);
		sched_newts(l);
		lwp_unlock(l);
	}
}

void
sched_proc_exit(struct proc *child, struct proc *parent)
{

	/* Dummy */
}

void
sched_lwp_fork(struct lwp *l1, struct lwp *l2)
{

	KASSERT(l2->l_sched_info == NULL);
	l2->l_sched_info = pool_cache_get(sil_pool, PR_WAITOK);
	memset(l2->l_sched_info, 0, sizeof(sched_info_lwp_t));
}

void
sched_lwp_exit(struct lwp *l)
{

	KASSERT(l->l_sched_info != NULL);
	pool_cache_put(sil_pool, l->l_sched_info);
	l->l_sched_info = NULL;
}

void
sched_lwp_collect(struct lwp *l)
{

}

void
sched_setrunnable(struct lwp *l)
{

	/* Dummy */
}

void
sched_schedclock(struct lwp *l)
{

	/* Dummy */
}

/*
 * Priorities and time-slice.
 */

void
sched_nice(struct proc *p, int prio)
{

	/* TODO: implement as SCHED_IA */
}

/* Recalculate the time-slice */
void
sched_newts(struct lwp *l)
{
	sched_info_lwp_t *sil = l->l_sched_info;

	sil->sl_timeslice = ts_map[lwp_eprio(l)];
}

void
sched_slept(struct lwp *l)
{

	/*
	 * If the thread is in the time-sharing queue and the batch flag
	 * is not set, increase the priority so that it runs with a lower
	 * time-quantum.
	 */
	if (l->l_priority < PRI_HIGHEST_TS && (l->l_flag & LW_BATCH) == 0) {
		KASSERT(l->l_class == SCHED_OTHER);
		l->l_priority++;
	}
}

void
sched_wakeup(struct lwp *l)
{

	/* If the thread was sleeping for a second or more, set a high priority */
	if (l->l_slptime >= 1)
		l->l_priority = high_pri[l->l_priority];
}
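
/*
 * Together with sched_slept() above, these form the interactivity
 * heuristic: short sleeps nudge the priority up one step at a time,
 * while a sleep of a second or more jumps straight into the
 * high-priority band.  For instance, with the illustrative tables
 * above, a thread at priority 30 that slept for over a second wakes
 * up at high_pri[30] == 59.
 */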

void
sched_pstats_hook(struct lwp *l, int batch)
{
	pri_t prio;

	/*
	 * Estimate only threads on the time-sharing queue; exclude the
	 * highest priority for performance purposes.
	 */
	if (l->l_priority >= PRI_HIGHEST_TS)
		return;
	KASSERT(l->l_class == SCHED_OTHER);

	/* If it is CPU-bound and not for the first time, decrease the priority */
	prio = l->l_priority;
	if (batch && prio != 0)
		prio--;

	/* If the thread has not run for a second or more, set a high priority */
	if (l->l_stat == LSRUN) {
		if (l->l_rticks && (hardclock_ticks - l->l_rticks >= hz))
			prio = high_pri[prio];
		/* Re-enqueue the thread if the priority has changed */
		if (prio != l->l_priority)
			lwp_changepri(l, prio);
	} else {
		/* In other states, change the priority directly */
		l->l_priority = prio;
	}
}

void
sched_oncpu(lwp_t *l)
{
	sched_info_lwp_t *sil = l->l_sched_info;

	/* Update the counters */
	KASSERT(sil->sl_timeslice >= min_ts);
	KASSERT(sil->sl_timeslice <= max_ts);
	l->l_cpu->ci_schedstate.spc_ticks = sil->sl_timeslice;
}

/*
 * Time-driven events.
 */

/*
 * Called once per time-quantum.  This routine is CPU-local and runs at
 * IPL_SCHED, thus the locking is not needed.
 */
void
sched_tick(struct cpu_info *ci)
{
	struct schedstate_percpu *spc = &ci->ci_schedstate;
	struct lwp *l = curlwp;
	const sched_info_lwp_t *sil = l->l_sched_info;

	if (CURCPU_IDLE_P())
		return;

	switch (l->l_class) {
	case SCHED_FIFO:
		/*
		 * Update the time-quantum, and continue running,
		 * if the thread runs on the FIFO real-time policy.
		 */
		KASSERT(l->l_priority > PRI_HIGHEST_TS);
		spc->spc_ticks = sil->sl_timeslice;
		return;
	case SCHED_OTHER:
		/*
		 * If the thread is in the time-sharing queue, decrease
		 * the priority, and run with a higher time-quantum.
		 */
		KASSERT(l->l_priority <= PRI_HIGHEST_TS);
		if (l->l_priority != 0)
			l->l_priority--;
		break;
	}

	/*
	 * If there are higher priority threads or threads in the same queue,
	 * mark that the thread should yield; otherwise, continue running.
	 */
	if (lwp_eprio(l) <= spc->spc_maxpriority || l->l_target_cpu) {
		spc->spc_flags |= SPCF_SHOULDYIELD;
		cpu_need_resched(ci, 0);
	} else
		spc->spc_ticks = sil->sl_timeslice;
}
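
/*
 * A sample trace (a sketch): a SCHED_OTHER thread at priority 10
 * exhausts its quantum and decays to priority 9.  It keeps the CPU,
 * with its slice refilled from sl_timeslice, only if no runnable
 * thread of equal or higher priority is waiting and no migration to
 * l->l_target_cpu is pending; otherwise SPCF_SHOULDYIELD is set and
 * a reschedule is requested.
 */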

/*
 * Sysctl nodes and initialization.
 */

static int
sysctl_sched_rtts(SYSCTLFN_ARGS)
{
	struct sysctlnode node;
	int rttsms = hztoms(rt_ts);

	node = *rnode;
	node.sysctl_data = &rttsms;
	return sysctl_lookup(SYSCTLFN_CALL(&node));
}

static int
sysctl_sched_mints(SYSCTLFN_ARGS)
{
	struct sysctlnode node;
	struct cpu_info *ci;
	int error, newsize;
	CPU_INFO_ITERATOR cii;

	node = *rnode;
	node.sysctl_data = &newsize;

	newsize = hztoms(min_ts);
	error = sysctl_lookup(SYSCTLFN_CALL(&node));
	if (error || newp == NULL)
		return error;

	newsize = mstohz(newsize);
	if (newsize < 1 || newsize > hz || newsize >= max_ts)
		return EINVAL;

	/* It is safe to update these while all run-queues are locked */
	for (CPU_INFO_FOREACH(cii, ci))
		spc_lock(ci);

	min_ts = newsize;
	sched_precalcts();

	for (CPU_INFO_FOREACH(cii, ci))
		spc_unlock(ci);

	return 0;
}

static int
sysctl_sched_maxts(SYSCTLFN_ARGS)
{
	struct sysctlnode node;
	struct cpu_info *ci;
	int error, newsize;
	CPU_INFO_ITERATOR cii;

	node = *rnode;
	node.sysctl_data = &newsize;

	newsize = hztoms(max_ts);
	error = sysctl_lookup(SYSCTLFN_CALL(&node));
	if (error || newp == NULL)
		return error;

	newsize = mstohz(newsize);
	if (newsize < 10 || newsize > hz || newsize <= min_ts)
		return EINVAL;

	/* It is safe to update these while all run-queues are locked */
	for (CPU_INFO_FOREACH(cii, ci))
		spc_lock(ci);

	max_ts = newsize;
	sched_precalcts();

	for (CPU_INFO_FOREACH(cii, ci))
		spc_unlock(ci);

	return 0;
}

SYSCTL_SETUP(sysctl_sched_m2_setup, "sysctl sched setup")
{
	const struct sysctlnode *node = NULL;

	sysctl_createv(clog, 0, NULL, NULL,
		CTLFLAG_PERMANENT,
		CTLTYPE_NODE, "kern", NULL,
		NULL, 0, NULL, 0,
		CTL_KERN, CTL_EOL);
	sysctl_createv(clog, 0, NULL, &node,
		CTLFLAG_PERMANENT,
		CTLTYPE_NODE, "sched",
		SYSCTL_DESCR("Scheduler options"),
		NULL, 0, NULL, 0,
		CTL_KERN, CTL_CREATE, CTL_EOL);

	if (node == NULL)
		return;

	sysctl_createv(NULL, 0, &node, NULL,
		CTLFLAG_PERMANENT,
		CTLTYPE_STRING, "name", NULL,
		NULL, 0, __UNCONST("M2"), 0,
		CTL_CREATE, CTL_EOL);
	sysctl_createv(NULL, 0, &node, NULL,
		CTLFLAG_PERMANENT,
		CTLTYPE_INT, "rtts",
		SYSCTL_DESCR("Round-robin time quantum (in milliseconds)"),
		sysctl_sched_rtts, 0, NULL, 0,
		CTL_CREATE, CTL_EOL);
	sysctl_createv(NULL, 0, &node, NULL,
		CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
		CTLTYPE_INT, "maxts",
		SYSCTL_DESCR("Maximal time quantum (in milliseconds)"),
		sysctl_sched_maxts, 0, &max_ts, 0,
		CTL_CREATE, CTL_EOL);
	sysctl_createv(NULL, 0, &node, NULL,
		CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
		CTLTYPE_INT, "mints",
		SYSCTL_DESCR("Minimal time quantum (in milliseconds)"),
		sysctl_sched_mints, 0, &min_ts, 0,
		CTL_CREATE, CTL_EOL);
}
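
/*
 * Usage sketch from userland (assuming this scheduler is configured in):
 *
 *	$ sysctl kern.sched.name
 *	kern.sched.name = M2
 *	$ sysctl -w kern.sched.maxts=200
 *
 * Values are exchanged in milliseconds and converted with mstohz()/
 * hztoms(); the handlers above reject a mints/maxts pair that would
 * invert the range or exceed one second's worth of ticks.
 */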