sched_m2.c revision 1.24
      1 /*	$NetBSD: sched_m2.c,v 1.24 2008/04/12 17:02:08 ad Exp $	*/
      2 
      3 /*
      4  * Copyright (c) 2007, 2008 Mindaugas Rasiukevicius <rmind at NetBSD org>
      5  * All rights reserved.
      6  *
      7  * Redistribution and use in source and binary forms, with or without
      8  * modification, are permitted provided that the following conditions
      9  * are met:
     10  * 1. Redistributions of source code must retain the above copyright
     11  *    notice, this list of conditions and the following disclaimer.
     12  * 2. Redistributions in binary form must reproduce the above copyright
     13  *    notice, this list of conditions and the following disclaimer in the
     14  *    documentation and/or other materials provided with the distribution.
     15  *
     16  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
     17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     18  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     19  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
     20  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
     21  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
     22  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
     23  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
     24  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
     25  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
     26  * SUCH DAMAGE.
     27  */
     28 
     29 /*
     30  * TODO:
     31  *  - Implementation of fair share queue;
     32  *  - Support for NUMA;
     33  */
     34 
     35 #include <sys/cdefs.h>
     36 __KERNEL_RCSID(0, "$NetBSD: sched_m2.c,v 1.24 2008/04/12 17:02:08 ad Exp $");
     37 
     38 #include <sys/param.h>
     39 
     40 #include <sys/bitops.h>
     41 #include <sys/cpu.h>
     42 #include <sys/callout.h>
     43 #include <sys/errno.h>
     44 #include <sys/kernel.h>
     45 #include <sys/kmem.h>
     46 #include <sys/lwp.h>
     47 #include <sys/mutex.h>
     48 #include <sys/pool.h>
     49 #include <sys/proc.h>
     50 #include <sys/pset.h>
     51 #include <sys/resource.h>
     52 #include <sys/resourcevar.h>
     53 #include <sys/sched.h>
     54 #include <sys/syscallargs.h>
     55 #include <sys/sysctl.h>
     56 #include <sys/types.h>
     57 
     58 /*
      59  * Priority-related definitions.
     60  */
     61 #define	PRI_TS_COUNT	(NPRI_USER)
     62 #define	PRI_RT_COUNT	(PRI_COUNT - PRI_TS_COUNT)
     63 #define	PRI_HTS_RANGE	(PRI_TS_COUNT / 10)
     64 
     65 #define	PRI_HIGHEST_TS	(MAXPRI_USER)
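
         /*
          * Priorities up to PRI_HIGHEST_TS are used by time-sharing
          * (SCHED_OTHER) threads; real-time threads run at priorities
          * above this range.
          */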
     66 
     67 /*
     68  * Time-slices and priorities.
     69  */
     70 static u_int	min_ts;			/* Minimal time-slice */
     71 static u_int	max_ts;			/* Maximal time-slice */
     72 static u_int	rt_ts;			/* Real-time time-slice */
     73 static u_int	ts_map[PRI_COUNT];	/* Map of time-slices */
     74 static pri_t	high_pri[PRI_COUNT];	/* Map for priority increase */
     75 
     76 typedef struct {
      77 	u_int		sl_flags;	/* Flags, see SL_* below */
     78 	u_int		sl_timeslice;	/* Time-slice of thread */
     79 	u_int		sl_slept;	/* Saved sleep time for sleep sum */
     80 	u_int		sl_slpsum;	/* Sum of sleep time */
     81 	u_int		sl_lrtime;	/* Last run time */
     82 } sched_info_lwp_t;
     83 
     84 /* Flags */
      85 #define	SL_BATCH	0x01	/* Thread is considered CPU-bound */
     86 
     87 /* Pool of the scheduler-specific structures for threads */
     88 static pool_cache_t	sil_pool;
     89 
     90 /*
     91  * Prototypes.
     92  */
     93 
     94 static void		sched_precalcts(void);
     95 
     96 /*
     97  * Initialization and setup.
     98  */
     99 
    100 void
    101 sched_rqinit(void)
    102 {
    103 	struct cpu_info *ci = curcpu();
    104 
    105 	if (hz < 100) {
    106 		panic("sched_rqinit: value of HZ is too low\n");
    107 	}
    108 
    109 	/* Default timing ranges */
    110 	min_ts = mstohz(50);			/* ~50ms  */
    111 	max_ts = mstohz(150);			/* ~150ms */
    112 	rt_ts = mstohz(100);			/* ~100ms */
    113 	sched_precalcts();
    114 
    115 	/* Pool of the scheduler-specific structures */
    116 	sil_pool = pool_cache_init(sizeof(sched_info_lwp_t), coherency_unit,
    117 	    0, 0, "lwpsd", NULL, IPL_NONE, NULL, NULL, NULL);
    118 
    119 	/* Attach the primary CPU here */
    120 	sched_cpuattach(ci);
    121 
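         	/* Initialize the scheduling parameters of lwp0 */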
    122 	sched_lwp_fork(NULL, &lwp0);
    123 	sched_newts(&lwp0);
    124 }
    125 
    126 /* Pre-calculate the time-slices for the priorities */
    127 static void
    128 sched_precalcts(void)
    129 {
    130 	pri_t p;
    131 
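         	/*
         	 * The time-slice shrinks linearly as the priority rises: priority
         	 * zero gets max_ts and PRI_HIGHEST_TS gets min_ts.  high_pri[] maps
         	 * each priority into the upper PRI_HTS_RANGE band and is used when
         	 * boosting threads that have slept or waited for a long time.
         	 */
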
    132 	/* Time-sharing range */
    133 	for (p = 0; p <= PRI_HIGHEST_TS; p++) {
    134 		ts_map[p] = max_ts -
    135 		    (p * 100 / (PRI_TS_COUNT - 1) * (max_ts - min_ts) / 100);
    136 		high_pri[p] = (PRI_HIGHEST_TS - PRI_HTS_RANGE) +
    137 		    ((p * PRI_HTS_RANGE) / (PRI_TS_COUNT - 1));
    138 	}
    139 
    140 	/* Real-time range */
    141 	for (p = (PRI_HIGHEST_TS + 1); p < PRI_COUNT; p++) {
    142 		ts_map[p] = rt_ts;
    143 		high_pri[p] = p;
    144 	}
    145 }
    146 
    147 /*
    148  * Hooks.
    149  */
    150 
    151 void
    152 sched_proc_fork(struct proc *parent, struct proc *child)
    153 {
    154 	struct lwp *l;
    155 
    156 	LIST_FOREACH(l, &child->p_lwps, l_sibling) {
    157 		lwp_lock(l);
    158 		sched_newts(l);
    159 		lwp_unlock(l);
    160 	}
    161 }
    162 
    163 void
    164 sched_proc_exit(struct proc *child, struct proc *parent)
    165 {
    166 
    167 	/* Dummy */
    168 }
    169 
    170 void
    171 sched_lwp_fork(struct lwp *l1, struct lwp *l2)
    172 {
    173 
    174 	KASSERT(l2->l_sched_info == NULL);
    175 	l2->l_sched_info = pool_cache_get(sil_pool, PR_WAITOK);
    176 	memset(l2->l_sched_info, 0, sizeof(sched_info_lwp_t));
    177 }
    178 
    179 void
    180 sched_lwp_exit(struct lwp *l)
    181 {
    182 
    183 	KASSERT(l->l_sched_info != NULL);
    184 	pool_cache_put(sil_pool, l->l_sched_info);
    185 	l->l_sched_info = NULL;
    186 }
    187 
    188 void
    189 sched_lwp_collect(struct lwp *l)
    190 {
    191 
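         	/* Dummy */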
    192 }
    193 
    194 void
    195 sched_setrunnable(struct lwp *l)
    196 {
    197 
    198 	/* Dummy */
    199 }
    200 
    201 void
    202 sched_schedclock(struct lwp *l)
    203 {
    204 
    205 	/* Dummy */
    206 }
    207 
    208 /*
    209  * Priorities and time-slice.
    210  */
    211 
    212 void
    213 sched_nice(struct proc *p, int prio)
    214 {
    215 
    216 	/* TODO: implement as SCHED_IA */
    217 }
    218 
    219 /* Recalculate the time-slice */
    220 void
    221 sched_newts(struct lwp *l)
    222 {
    223 	sched_info_lwp_t *sil = l->l_sched_info;
    224 
    225 	sil->sl_timeslice = ts_map[lwp_eprio(l)];
    226 }
    227 
    228 void
    229 sched_slept(struct lwp *l)
    230 {
    231 	sched_info_lwp_t *sil = l->l_sched_info;
    232 
     233 	/* Record the time at which the thread went to sleep */
    234 	sil->sl_slept = hardclock_ticks;
    235 
    236 	/*
     237 	 * If the thread is in the time-sharing queue and is not marked as
     238 	 * batch, increase its priority so that it runs with a lower time-quantum.
    239 	 */
    240 	if (l->l_priority < PRI_HIGHEST_TS &&
    241 	    (sil->sl_flags & SL_BATCH) == 0) {
    242 		KASSERT(l->l_class == SCHED_OTHER);
    243 		l->l_priority++;
    244 	}
    245 }
    246 
    247 void
    248 sched_wakeup(struct lwp *l)
    249 {
    250 	sched_info_lwp_t *sil = l->l_sched_info;
    251 	const u_int slptime = hardclock_ticks - sil->sl_slept;
    252 
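         	/*
         	 * l_slptime is incremented once per second while the thread is
         	 * blocked (see sched_pstats_hook() below); if it is still zero,
         	 * the thread slept for less than a second, so credit the exact
         	 * tick count, otherwise cap this interval's share at hz ticks.
         	 */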
    253 	/* Update sleep time delta */
    254 	sil->sl_slpsum += (l->l_slptime == 0) ? slptime : hz;
    255 
     256 	/* If the thread slept for a second or more, boost its priority */
    257 	if (l->l_slptime > 1 || slptime >= hz)
    258 		l->l_priority = high_pri[l->l_priority];
    259 
     260 	/* Also, look for a better CPU to wake up on */
    261 	l->l_cpu = sched_takecpu(l);
    262 }
    263 
    264 void
    265 sched_pstats_hook(struct lwp *l)
    266 {
    267 	sched_info_lwp_t *sil = l->l_sched_info;
    268 	pri_t prio;
    269 	bool batch;
    270 
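         	/* Account another second of sleep for blocked threads */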
    271 	if (l->l_stat == LSSLEEP || l->l_stat == LSSTOP ||
    272 	    l->l_stat == LSSUSPENDED)
    273 		l->l_slptime++;
    274 
    275 	/*
     276 	 * Consider the thread CPU-bound if its run time sum exceeds its sleep
     277 	 * time sum.  Penalize it only if it was already flagged as CPU-bound.
    278 	 */
    279 	batch = (l->l_rticksum > sil->sl_slpsum);
    280 	if (batch) {
    281 		if ((sil->sl_flags & SL_BATCH) == 0)
    282 			batch = false;
    283 		sil->sl_flags |= SL_BATCH;
    284 	} else
    285 		sil->sl_flags &= ~SL_BATCH;
    286 
    287 	/*
     288 	 * If the thread is CPU-bound and never sleeps, it would monopolize the
     289 	 * CPU.  In that case reset the last sleep time and check it later: if
     290 	 * it is still zero, migrate the thread and clear the batch flag.
    291 	 */
    292 	if (batch && (l->l_slptime + sil->sl_slpsum) == 0) {
    293 		if (l->l_stat != LSONPROC && sil->sl_slept == 0) {
    294 			struct cpu_info *ci = sched_takecpu(l);
    295 
    296 			if (l->l_cpu != ci)
    297 				l->l_target_cpu = ci;
    298 			sil->sl_flags &= ~SL_BATCH;
    299 		} else {
    300 			sil->sl_slept = 0;
    301 		}
    302 	}
    303 
    304 	/* Reset the time sums */
    305 	sil->sl_slpsum = 0;
    306 	l->l_rticksum = 0;
    307 
    308 	/*
     309 	 * Only re-estimate threads in the time-sharing range, and skip
     310 	 * the highest priority for performance reasons.
    311 	 */
    312 	if (l->l_priority >= PRI_HIGHEST_TS)
    313 		return;
    314 	KASSERT(l->l_class == SCHED_OTHER);
    315 
     316 	/* If the thread has repeatedly been CPU-bound, decrease its priority */
    317 	prio = l->l_priority;
    318 	if (batch && prio != 0)
    319 		prio--;
    320 
     321 	/* If the thread has not run for a second or more, boost its priority */
    322 	if (l->l_stat == LSRUN) {
    323 		if (sil->sl_lrtime && (hardclock_ticks - sil->sl_lrtime >= hz))
    324 			prio = high_pri[prio];
    325 		/* Re-enqueue the thread if priority has changed */
    326 		if (prio != l->l_priority)
    327 			lwp_changepri(l, prio);
    328 	} else {
    329 		/* In other states, change the priority directly */
    330 		l->l_priority = prio;
    331 	}
    332 }
    333 
    334 void
    335 sched_oncpu(lwp_t *l)
    336 {
    337 	sched_info_lwp_t *sil = l->l_sched_info;
    338 
     339 	/* Update the counters */
    341 	KASSERT(sil->sl_timeslice >= min_ts);
    342 	KASSERT(sil->sl_timeslice <= max_ts);
    343 	l->l_cpu->ci_schedstate.spc_ticks = sil->sl_timeslice;
    344 }
    345 
    346 /*
    347  * Time-driven events.
    348  */
    349 
    350 /*
    351  * Called once per time-quantum.  This routine is CPU-local and runs at
     352  * IPL_SCHED, so no locking is needed.
    353  */
    354 void
    355 sched_tick(struct cpu_info *ci)
    356 {
    357 	struct schedstate_percpu *spc = &ci->ci_schedstate;
    358 	struct lwp *l = curlwp;
    359 	const sched_info_lwp_t *sil = l->l_sched_info;
    360 
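         	/* Nothing to do while this CPU runs its idle LWP */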
    361 	if (CURCPU_IDLE_P())
    362 		return;
    363 
    364 	switch (l->l_class) {
    365 	case SCHED_FIFO:
    366 		/*
     367 		 * If the thread runs under the FIFO real-time policy,
     368 		 * refresh its time-quantum and let it continue running.
    369 		 */
    370 		KASSERT(l->l_priority > PRI_HIGHEST_TS);
    371 		spc->spc_ticks = sil->sl_timeslice;
    372 		return;
    373 	case SCHED_OTHER:
    374 		/*
     375 		 * If the thread is in the time-sharing queue, decrease its
     376 		 * priority so that it runs with a larger time-quantum.
    377 		 */
    378 		KASSERT(l->l_priority <= PRI_HIGHEST_TS);
    379 		if (l->l_priority != 0)
    380 			l->l_priority--;
    381 		break;
    382 	}
    383 
    384 	/*
     385 	 * If there are higher-priority threads, or other threads at the same
     386 	 * priority, mark that the thread should yield; otherwise keep running.
    387 	 */
    388 	if (lwp_eprio(l) <= spc->spc_maxpriority || l->l_target_cpu) {
    389 		spc->spc_flags |= SPCF_SHOULDYIELD;
    390 		cpu_need_resched(ci, 0);
    391 	} else
    392 		spc->spc_ticks = sil->sl_timeslice;
    393 }
    394 
    395 /*
    396  * Sysctl nodes and initialization.
    397  */
    398 
    399 static int
    400 sysctl_sched_rtts(SYSCTLFN_ARGS)
    401 {
    402 	struct sysctlnode node;
    403 	int rttsms = hztoms(rt_ts);
    404 
    405 	node = *rnode;
    406 	node.sysctl_data = &rttsms;
    407 	return sysctl_lookup(SYSCTLFN_CALL(&node));
    408 }
    409 
    410 static int
    411 sysctl_sched_mints(SYSCTLFN_ARGS)
    412 {
    413 	struct sysctlnode node;
    414 	struct cpu_info *ci;
    415 	int error, newsize;
    416 	CPU_INFO_ITERATOR cii;
    417 
    418 	node = *rnode;
    419 	node.sysctl_data = &newsize;
    420 
    421 	newsize = hztoms(min_ts);
    422 	error = sysctl_lookup(SYSCTLFN_CALL(&node));
    423 	if (error || newp == NULL)
    424 		return error;
    425 
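         	/* Must be at least one tick, at most one second, and below max_ts */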
    426 	newsize = mstohz(newsize);
    427 	if (newsize < 1 || newsize > hz || newsize >= max_ts)
    428 		return EINVAL;
    429 
     430 	/* It is safe to take the locks in this order */
    431 	for (CPU_INFO_FOREACH(cii, ci))
    432 		spc_lock(ci);
    433 
    434 	min_ts = newsize;
    435 	sched_precalcts();
    436 
    437 	for (CPU_INFO_FOREACH(cii, ci))
    438 		spc_unlock(ci);
    439 
    440 	return 0;
    441 }
    442 
    443 static int
    444 sysctl_sched_maxts(SYSCTLFN_ARGS)
    445 {
    446 	struct sysctlnode node;
    447 	struct cpu_info *ci;
    448 	int error, newsize;
    449 	CPU_INFO_ITERATOR cii;
    450 
    451 	node = *rnode;
    452 	node.sysctl_data = &newsize;
    453 
    454 	newsize = hztoms(max_ts);
    455 	error = sysctl_lookup(SYSCTLFN_CALL(&node));
    456 	if (error || newp == NULL)
    457 		return error;
    458 
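         	/* Must be at least 10 ticks, at most one second, and above min_ts */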
    459 	newsize = mstohz(newsize);
    460 	if (newsize < 10 || newsize > hz || newsize <= min_ts)
    461 		return EINVAL;
    462 
     463 	/* It is safe to take the locks in this order */
    464 	for (CPU_INFO_FOREACH(cii, ci))
    465 		spc_lock(ci);
    466 
    467 	max_ts = newsize;
    468 	sched_precalcts();
    469 
    470 	for (CPU_INFO_FOREACH(cii, ci))
    471 		spc_unlock(ci);
    472 
    473 	return 0;
    474 }
    475 
    476 SYSCTL_SETUP(sysctl_sched_m2_setup, "sysctl sched setup")
    477 {
    478 	const struct sysctlnode *node = NULL;
    479 
    480 	sysctl_createv(clog, 0, NULL, NULL,
    481 		CTLFLAG_PERMANENT,
    482 		CTLTYPE_NODE, "kern", NULL,
    483 		NULL, 0, NULL, 0,
    484 		CTL_KERN, CTL_EOL);
    485 	sysctl_createv(clog, 0, NULL, &node,
    486 		CTLFLAG_PERMANENT,
    487 		CTLTYPE_NODE, "sched",
    488 		SYSCTL_DESCR("Scheduler options"),
    489 		NULL, 0, NULL, 0,
    490 		CTL_KERN, CTL_CREATE, CTL_EOL);
    491 
    492 	if (node == NULL)
    493 		return;
    494 
    495 	sysctl_createv(NULL, 0, &node, NULL,
    496 		CTLFLAG_PERMANENT,
    497 		CTLTYPE_STRING, "name", NULL,
    498 		NULL, 0, __UNCONST("M2"), 0,
    499 		CTL_CREATE, CTL_EOL);
    500 	sysctl_createv(NULL, 0, &node, NULL,
    501 		CTLFLAG_PERMANENT,
    502 		CTLTYPE_INT, "rtts",
     503 		SYSCTL_DESCR("Round-robin time quantum (in milliseconds)"),
    504 		sysctl_sched_rtts, 0, NULL, 0,
    505 		CTL_CREATE, CTL_EOL);
    506 	sysctl_createv(NULL, 0, &node, NULL,
    507 		CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
    508 		CTLTYPE_INT, "maxts",
     509 		SYSCTL_DESCR("Maximal time quantum (in milliseconds)"),
    510 		sysctl_sched_maxts, 0, &max_ts, 0,
    511 		CTL_CREATE, CTL_EOL);
    512 	sysctl_createv(NULL, 0, &node, NULL,
    513 		CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
    514 		CTLTYPE_INT, "mints",
     515 		SYSCTL_DESCR("Minimal time quantum (in milliseconds)"),
    516 		sysctl_sched_mints, 0, &min_ts, 0,
    517 		CTL_CREATE, CTL_EOL);
    518 }
    519