/* sched_m2.c, revision 1.24.6.2 (NetBSD, kern) */
      1 /*	$NetBSD: sched_m2.c,v 1.24.6.2 2008/10/10 22:34:14 skrll Exp $	*/
      2 
      3 /*
      4  * Copyright (c) 2007, 2008 Mindaugas Rasiukevicius <rmind at NetBSD org>
      5  * All rights reserved.
      6  *
      7  * Redistribution and use in source and binary forms, with or without
      8  * modification, are permitted provided that the following conditions
      9  * are met:
     10  * 1. Redistributions of source code must retain the above copyright
     11  *    notice, this list of conditions and the following disclaimer.
     12  * 2. Redistributions in binary form must reproduce the above copyright
     13  *    notice, this list of conditions and the following disclaimer in the
     14  *    documentation and/or other materials provided with the distribution.
     15  *
     16  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
     17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     18  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     19  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
     20  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
     21  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
     22  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
     23  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
     24  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
     25  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
     26  * SUCH DAMAGE.
     27  */
     28 
     29 /*
     30  * TODO:
     31  *  - Implementation of fair share queue;
     32  *  - Support for NUMA;
     33  */
     34 
     35 #include <sys/cdefs.h>
     36 __KERNEL_RCSID(0, "$NetBSD: sched_m2.c,v 1.24.6.2 2008/10/10 22:34:14 skrll Exp $");
     37 
     38 #include <sys/param.h>
     39 
     40 #include <sys/bitops.h>
     41 #include <sys/cpu.h>
     42 #include <sys/callout.h>
     43 #include <sys/errno.h>
     44 #include <sys/kernel.h>
     45 #include <sys/kmem.h>
     46 #include <sys/lwp.h>
     47 #include <sys/mutex.h>
     48 #include <sys/pool.h>
     49 #include <sys/proc.h>
     50 #include <sys/pset.h>
     51 #include <sys/resource.h>
     52 #include <sys/resourcevar.h>
     53 #include <sys/sched.h>
     54 #include <sys/syscallargs.h>
     55 #include <sys/sysctl.h>
     56 #include <sys/types.h>
     57 
/*
 * Priority related definitions.
 */
     61 #define	PRI_TS_COUNT	(NPRI_USER)
     62 #define	PRI_RT_COUNT	(PRI_COUNT - PRI_TS_COUNT)
     63 #define	PRI_HTS_RANGE	(PRI_TS_COUNT / 10)
     64 
     65 #define	PRI_HIGHEST_TS	(MAXPRI_USER)
     66 
     67 /*
     68  * Time-slices and priorities.
     69  */
     70 static u_int	min_ts;			/* Minimal time-slice */
     71 static u_int	max_ts;			/* Maximal time-slice */
     72 static u_int	rt_ts;			/* Real-time time-slice */
     73 static u_int	ts_map[PRI_COUNT];	/* Map of time-slices */
     74 static pri_t	high_pri[PRI_COUNT];	/* Map for priority increase */
     75 
     76 static void	sched_precalcts(void);
     77 
     78 /*
     79  * Initialization and setup.
     80  */
     81 
     82 void
     83 sched_rqinit(void)
     84 {
     85 	struct cpu_info *ci = curcpu();
     86 
     87 	if (hz < 100) {
     88 		panic("sched_rqinit: value of HZ is too low\n");
     89 	}
     90 
     91 	/* Default timing ranges */
     92 	min_ts = mstohz(20);			/*  ~20 ms */
     93 	max_ts = mstohz(150);			/* ~150 ms */
     94 	rt_ts = mstohz(100);			/* ~100 ms */
     95 	sched_precalcts();
     96 
     97 	/* Attach the primary CPU here */
     98 	sched_cpuattach(ci);
     99 
    100 	sched_lwp_fork(NULL, &lwp0);
    101 	sched_newts(&lwp0);
    102 }
    103 
    104 /* Pre-calculate the time-slices for the priorities */
    105 static void
    106 sched_precalcts(void)
    107 {
    108 	pri_t p;
    109 
    110 	/* Time-sharing range */
    111 	for (p = 0; p <= PRI_HIGHEST_TS; p++) {
    112 		ts_map[p] = max_ts -
    113 		    (p * 100 / (PRI_TS_COUNT - 1) * (max_ts - min_ts) / 100);
    114 		high_pri[p] = (PRI_HIGHEST_TS - PRI_HTS_RANGE) +
    115 		    ((p * PRI_HTS_RANGE) / (PRI_TS_COUNT - 1));
    116 	}
    117 
    118 	/* Real-time range */
    119 	for (p = (PRI_HIGHEST_TS + 1); p < PRI_COUNT; p++) {
    120 		ts_map[p] = rt_ts;
    121 		high_pri[p] = p;
    122 	}
    123 }
    124 
    125 /*
    126  * Hooks.
    127  */
    128 
    129 void
    130 sched_proc_fork(struct proc *parent, struct proc *child)
    131 {
    132 	struct lwp *l;
    133 
    134 	LIST_FOREACH(l, &child->p_lwps, l_sibling) {
    135 		lwp_lock(l);
    136 		sched_newts(l);
    137 		lwp_unlock(l);
    138 	}
    139 }
    140 
/*
 * Process exit hook.  The M2 scheduler keeps no per-process state,
 * so there is nothing to release here.
 */
void
sched_proc_exit(struct proc *child, struct proc *parent)
{

}
    146 
/*
 * LWP fork hook: no per-LWP scheduler state needs initialization here;
 * the time-slice is set separately via sched_newts() by the callers.
 */
void
sched_lwp_fork(struct lwp *l1, struct lwp *l2)
{

}
    152 
/*
 * LWP exit-collection hook: nothing to reclaim for this scheduler.
 */
void
sched_lwp_collect(struct lwp *l)
{

}
    158 
/*
 * Runnable-transition hook: a no-op here; priority adjustments are
 * done in sched_slept()/sched_wakeup() and sched_pstats_hook() instead.
 */
void
sched_setrunnable(struct lwp *l)
{

}
    164 
/*
 * Schedclock hook: unused; quantum accounting happens in sched_tick().
 */
void
sched_schedclock(struct lwp *l)
{

}
    170 
    171 /*
    172  * Priorities and time-slice.
    173  */
    174 
/*
 * Set the nice value for the process: a no-op in this revision
 * (nice values have no effect on the M2 scheduler here).
 */
void
sched_nice(struct proc *p, int prio)
{

}
    180 
    181 /* Recalculate the time-slice */
    182 void
    183 sched_newts(struct lwp *l)
    184 {
    185 
    186 	l->l_sched.timeslice = ts_map[lwp_eprio(l)];
    187 }
    188 
    189 void
    190 sched_slept(struct lwp *l)
    191 {
    192 
    193 	/*
    194 	 * If thread is in time-sharing queue and batch flag is not marked,
    195 	 * increase the the priority, and run with the lower time-quantum.
    196 	 */
    197 	if (l->l_priority < PRI_HIGHEST_TS && (l->l_flag & LW_BATCH) == 0) {
    198 		KASSERT(l->l_class == SCHED_OTHER);
    199 		l->l_priority++;
    200 	}
    201 }
    202 
    203 void
    204 sched_wakeup(struct lwp *l)
    205 {
    206 
    207 	/* If thread was sleeping a second or more - set a high priority */
    208 	if (l->l_slptime >= 1)
    209 		l->l_priority = high_pri[l->l_priority];
    210 }
    211 
    212 void
    213 sched_pstats_hook(struct lwp *l, int batch)
    214 {
    215 	pri_t prio;
    216 
    217 	/*
    218 	 * Estimate threads on time-sharing queue only, however,
    219 	 * exclude the highest priority for performance purposes.
    220 	 */
    221 	KASSERT(lwp_locked(l, NULL));
    222 	if (l->l_priority >= PRI_HIGHEST_TS)
    223 		return;
    224 	KASSERT(l->l_class == SCHED_OTHER);
    225 
    226 	/* If it is CPU-bound not a first time - decrease the priority */
    227 	prio = l->l_priority;
    228 	if (batch && prio != 0)
    229 		prio--;
    230 
    231 	/* If thread was not ran a second or more - set a high priority */
    232 	if (l->l_stat == LSRUN) {
    233 		if (l->l_rticks && (hardclock_ticks - l->l_rticks >= hz))
    234 			prio = high_pri[prio];
    235 		/* Re-enqueue the thread if priority has changed */
    236 		if (prio != l->l_priority)
    237 			lwp_changepri(l, prio);
    238 	} else {
    239 		/* In other states, change the priority directly */
    240 		l->l_priority = prio;
    241 	}
    242 }
    243 
    244 void
    245 sched_oncpu(lwp_t *l)
    246 {
    247 	struct schedstate_percpu *spc = &l->l_cpu->ci_schedstate;
    248 
    249 	/* Update the counters */
    250 	KASSERT(l->l_sched.timeslice >= min_ts);
    251 	KASSERT(l->l_sched.timeslice <= max_ts);
    252 	spc->spc_ticks = l->l_sched.timeslice;
    253 }
    254 
    255 /*
    256  * Time-driven events.
    257  */
    258 
    259 /*
    260  * Called once per time-quantum.  This routine is CPU-local and runs at
    261  * IPL_SCHED, thus the locking is not needed.
    262  */
    263 void
    264 sched_tick(struct cpu_info *ci)
    265 {
    266 	struct schedstate_percpu *spc = &ci->ci_schedstate;
    267 	struct lwp *l = curlwp;
    268 
    269 	if (__predict_false(CURCPU_IDLE_P()))
    270 		return;
    271 
    272 	switch (l->l_class) {
    273 	case SCHED_FIFO:
    274 		/*
    275 		 * Update the time-quantum, and continue running,
    276 		 * if thread runs on FIFO real-time policy.
    277 		 */
    278 		KASSERT(l->l_priority > PRI_HIGHEST_TS);
    279 		spc->spc_ticks = l->l_sched.timeslice;
    280 		return;
    281 	case SCHED_OTHER:
    282 		/*
    283 		 * If thread is in time-sharing queue, decrease the priority,
    284 		 * and run with a higher time-quantum.
    285 		 */
    286 		KASSERT(l->l_priority <= PRI_HIGHEST_TS);
    287 		if (l->l_priority != 0)
    288 			l->l_priority--;
    289 		break;
    290 	}
    291 
    292 	/*
    293 	 * If there are higher priority threads or threads in the same queue,
    294 	 * mark that thread should yield, otherwise, continue running.
    295 	 */
    296 	if (lwp_eprio(l) <= spc->spc_maxpriority || l->l_target_cpu) {
    297 		spc->spc_flags |= SPCF_SHOULDYIELD;
    298 		cpu_need_resched(ci, 0);
    299 	} else
    300 		spc->spc_ticks = l->l_sched.timeslice;
    301 }
    302 
    303 /*
    304  * Sysctl nodes and initialization.
    305  */
    306 
    307 static int
    308 sysctl_sched_rtts(SYSCTLFN_ARGS)
    309 {
    310 	struct sysctlnode node;
    311 	int rttsms = hztoms(rt_ts);
    312 
    313 	node = *rnode;
    314 	node.sysctl_data = &rttsms;
    315 	return sysctl_lookup(SYSCTLFN_CALL(&node));
    316 }
    317 
    318 static int
    319 sysctl_sched_mints(SYSCTLFN_ARGS)
    320 {
    321 	struct sysctlnode node;
    322 	struct cpu_info *ci;
    323 	int error, newsize;
    324 	CPU_INFO_ITERATOR cii;
    325 
    326 	node = *rnode;
    327 	node.sysctl_data = &newsize;
    328 
    329 	newsize = hztoms(min_ts);
    330 	error = sysctl_lookup(SYSCTLFN_CALL(&node));
    331 	if (error || newp == NULL)
    332 		return error;
    333 
    334 	newsize = mstohz(newsize);
    335 	if (newsize < 1 || newsize > hz || newsize >= max_ts)
    336 		return EINVAL;
    337 
    338 	/* It is safe to do this in such order */
    339 	for (CPU_INFO_FOREACH(cii, ci))
    340 		spc_lock(ci);
    341 
    342 	min_ts = newsize;
    343 	sched_precalcts();
    344 
    345 	for (CPU_INFO_FOREACH(cii, ci))
    346 		spc_unlock(ci);
    347 
    348 	return 0;
    349 }
    350 
    351 static int
    352 sysctl_sched_maxts(SYSCTLFN_ARGS)
    353 {
    354 	struct sysctlnode node;
    355 	struct cpu_info *ci;
    356 	int error, newsize;
    357 	CPU_INFO_ITERATOR cii;
    358 
    359 	node = *rnode;
    360 	node.sysctl_data = &newsize;
    361 
    362 	newsize = hztoms(max_ts);
    363 	error = sysctl_lookup(SYSCTLFN_CALL(&node));
    364 	if (error || newp == NULL)
    365 		return error;
    366 
    367 	newsize = mstohz(newsize);
    368 	if (newsize < 10 || newsize > hz || newsize <= min_ts)
    369 		return EINVAL;
    370 
    371 	/* It is safe to do this in such order */
    372 	for (CPU_INFO_FOREACH(cii, ci))
    373 		spc_lock(ci);
    374 
    375 	max_ts = newsize;
    376 	sched_precalcts();
    377 
    378 	for (CPU_INFO_FOREACH(cii, ci))
    379 		spc_unlock(ci);
    380 
    381 	return 0;
    382 }
    383 
    384 SYSCTL_SETUP(sysctl_sched_m2_setup, "sysctl sched setup")
    385 {
    386 	const struct sysctlnode *node = NULL;
    387 
    388 	sysctl_createv(clog, 0, NULL, NULL,
    389 		CTLFLAG_PERMANENT,
    390 		CTLTYPE_NODE, "kern", NULL,
    391 		NULL, 0, NULL, 0,
    392 		CTL_KERN, CTL_EOL);
    393 	sysctl_createv(clog, 0, NULL, &node,
    394 		CTLFLAG_PERMANENT,
    395 		CTLTYPE_NODE, "sched",
    396 		SYSCTL_DESCR("Scheduler options"),
    397 		NULL, 0, NULL, 0,
    398 		CTL_KERN, CTL_CREATE, CTL_EOL);
    399 
    400 	if (node == NULL)
    401 		return;
    402 
    403 	sysctl_createv(NULL, 0, &node, NULL,
    404 		CTLFLAG_PERMANENT,
    405 		CTLTYPE_STRING, "name", NULL,
    406 		NULL, 0, __UNCONST("M2"), 0,
    407 		CTL_CREATE, CTL_EOL);
    408 	sysctl_createv(NULL, 0, &node, NULL,
    409 		CTLFLAG_PERMANENT,
    410 		CTLTYPE_INT, "rtts",
    411 		SYSCTL_DESCR("Round-robin time quantum (in miliseconds)"),
    412 		sysctl_sched_rtts, 0, NULL, 0,
    413 		CTL_CREATE, CTL_EOL);
    414 	sysctl_createv(NULL, 0, &node, NULL,
    415 		CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
    416 		CTLTYPE_INT, "maxts",
    417 		SYSCTL_DESCR("Maximal time quantum (in miliseconds)"),
    418 		sysctl_sched_maxts, 0, &max_ts, 0,
    419 		CTL_CREATE, CTL_EOL);
    420 	sysctl_createv(NULL, 0, &node, NULL,
    421 		CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
    422 		CTLTYPE_INT, "mints",
    423 		SYSCTL_DESCR("Minimal time quantum (in miliseconds)"),
    424 		sysctl_sched_mints, 0, &min_ts, 0,
    425 		CTL_CREATE, CTL_EOL);
    426 }
    427