sched_m2.c revision 1.24.6.2
/*	$NetBSD: sched_m2.c,v 1.24.6.2 2008/10/10 22:34:14 skrll Exp $	*/

/*
 * Copyright (c) 2007, 2008 Mindaugas Rasiukevicius <rmind at NetBSD org>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * TODO:
 *  - Implementation of fair share queue;
 *  - Support for NUMA;
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: sched_m2.c,v 1.24.6.2 2008/10/10 22:34:14 skrll Exp $");

#include <sys/param.h>

#include <sys/bitops.h>
#include <sys/cpu.h>
#include <sys/callout.h>
#include <sys/errno.h>
#include <sys/kernel.h>
#include <sys/kmem.h>
#include <sys/lwp.h>
#include <sys/mutex.h>
#include <sys/pool.h>
#include <sys/proc.h>
#include <sys/pset.h>
#include <sys/resource.h>
#include <sys/resourcevar.h>
#include <sys/sched.h>
#include <sys/syscallargs.h>
#include <sys/sysctl.h>
#include <sys/types.h>

/*
 * Priority related definitions.
 */
#define	PRI_TS_COUNT	(NPRI_USER)
#define	PRI_RT_COUNT	(PRI_COUNT - PRI_TS_COUNT)
#define	PRI_HTS_RANGE	(PRI_TS_COUNT / 10)

#define	PRI_HIGHEST_TS	(MAXPRI_USER)
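
/*
 * Editorial note (not part of the original source): assuming the usual
 * NetBSD priority constants (NPRI_USER = 64, MAXPRI_USER = 63), the
 * macros above split the priority space so that time-sharing threads
 * use priorities 0..PRI_HIGHEST_TS (0..63) and real-time threads use
 * everything above that.  PRI_HTS_RANGE then works out to 6, a small
 * "high" band near the top of the time-sharing range that is used for
 * wakeup boosts via high_pri[] below.
 */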

/*
 * Time-slices and priorities.
 */
static u_int	min_ts;			/* Minimal time-slice */
static u_int	max_ts;			/* Maximal time-slice */
static u_int	rt_ts;			/* Real-time time-slice */
static u_int	ts_map[PRI_COUNT];	/* Map of time-slices */
static pri_t	high_pri[PRI_COUNT];	/* Map for priority increase */

static void	sched_precalcts(void);

/*
 * Initialization and setup.
 */

void
sched_rqinit(void)
{
	struct cpu_info *ci = curcpu();

	if (hz < 100) {
		panic("sched_rqinit: value of HZ is too low\n");
	}

	/* Default timing ranges */
	min_ts = mstohz(20);			/*  ~20 ms */
	max_ts = mstohz(150);			/* ~150 ms */
	rt_ts = mstohz(100);			/* ~100 ms */
	sched_precalcts();

	/* Attach the primary CPU here */
	sched_cpuattach(ci);

	sched_lwp_fork(NULL, &lwp0);
	sched_newts(&lwp0);
}
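
/*
 * Editorial note (not part of the original source): min_ts, max_ts and
 * rt_ts are stored in clock ticks.  Assuming hz = 100 (the check above
 * panics on anything lower), the defaults work out to roughly 2, 15
 * and 10 ticks; with a higher hz the tick counts scale up while the
 * wall-clock lengths stay around 20, 150 and 100 ms.
 */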

/* Pre-calculate the time-slices for the priorities */
static void
sched_precalcts(void)
{
	pri_t p;

	/* Time-sharing range */
	for (p = 0; p <= PRI_HIGHEST_TS; p++) {
		ts_map[p] = max_ts -
		    (p * 100 / (PRI_TS_COUNT - 1) * (max_ts - min_ts) / 100);
		high_pri[p] = (PRI_HIGHEST_TS - PRI_HTS_RANGE) +
		    ((p * PRI_HTS_RANGE) / (PRI_TS_COUNT - 1));
	}

	/* Real-time range */
	for (p = (PRI_HIGHEST_TS + 1); p < PRI_COUNT; p++) {
		ts_map[p] = rt_ts;
		high_pri[p] = p;
	}
}
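
/*
 * Editorial note (not part of the original source): in the time-sharing
 * range, ts_map[] interpolates linearly from max_ts at priority 0 down
 * to min_ts at PRI_HIGHEST_TS, so lower-priority (more CPU-bound)
 * threads get longer slices, and high_pri[] maps every time-sharing
 * priority into the top PRI_HTS_RANGE levels.  Assuming NPRI_USER = 64
 * and the default ranges set in sched_rqinit():
 *
 *	p = 0	->	ts_map[p] = max_ts (~150 ms),	high_pri[p] = 57
 *	p = 63	->	ts_map[p] = min_ts  (~20 ms),	high_pri[p] = 63
 *
 * Real-time priorities always get the fixed rt_ts slice and are never
 * remapped above their own level.
 */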

/*
 * Hooks.
 */

void
sched_proc_fork(struct proc *parent, struct proc *child)
{
	struct lwp *l;

	LIST_FOREACH(l, &child->p_lwps, l_sibling) {
		lwp_lock(l);
		sched_newts(l);
		lwp_unlock(l);
	}
}

void
sched_proc_exit(struct proc *child, struct proc *parent)
{

}

void
sched_lwp_fork(struct lwp *l1, struct lwp *l2)
{

}

void
sched_lwp_collect(struct lwp *l)
{

}

void
sched_setrunnable(struct lwp *l)
{

}

void
sched_schedclock(struct lwp *l)
{

}

/*
 * Priorities and time-slice.
 */

void
sched_nice(struct proc *p, int prio)
{

}

/* Recalculate the time-slice */
void
sched_newts(struct lwp *l)
{

	l->l_sched.timeslice = ts_map[lwp_eprio(l)];
}

void
sched_slept(struct lwp *l)
{

	/*
	 * If the thread is in the time-sharing queue and the batch flag is
	 * not set, increase its priority and run with a lower time-quantum.
	 */
	if (l->l_priority < PRI_HIGHEST_TS && (l->l_flag & LW_BATCH) == 0) {
		KASSERT(l->l_class == SCHED_OTHER);
		l->l_priority++;
	}
}

void
sched_wakeup(struct lwp *l)
{

	/* If the thread was sleeping for a second or more, set a high priority */
	if (l->l_slptime >= 1)
		l->l_priority = high_pri[l->l_priority];
}
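
/*
 * Editorial note (not part of the original source): sched_slept() and
 * sched_wakeup() together form the interactivity boost.  Each sleep of
 * a non-batch SCHED_OTHER thread nudges its priority up by one, and a
 * thread that slept for a second or more has its priority remapped
 * through high_pri[], which for time-sharing threads means a jump into
 * the top band so it gets the CPU quickly after waking up.
 */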

void
sched_pstats_hook(struct lwp *l, int batch)
{
	pri_t prio;

	/*
	 * Estimate only threads in the time-sharing queue; for performance,
	 * skip those already at the highest time-sharing priority.
	 */
	KASSERT(lwp_locked(l, NULL));
	if (l->l_priority >= PRI_HIGHEST_TS)
		return;
	KASSERT(l->l_class == SCHED_OTHER);

	/* If it is CPU-bound, and not for the first time, decrease the priority */
	prio = l->l_priority;
	if (batch && prio != 0)
		prio--;

	/* If the thread has not run for a second or more, set a high priority */
	if (l->l_stat == LSRUN) {
		if (l->l_rticks && (hardclock_ticks - l->l_rticks >= hz))
			prio = high_pri[prio];
		/* Re-enqueue the thread if priority has changed */
		if (prio != l->l_priority)
			lwp_changepri(l, prio);
	} else {
		/* In other states, change the priority directly */
		l->l_priority = prio;
	}
}
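
/*
 * Editorial note (not part of the original source): this hook is run by
 * the periodic sched_pstats() scan (presumably once per second, which
 * matches the hz comparison above).  A batch/CPU-bound thread therefore
 * drifts down one time-sharing priority level per pass, while a
 * runnable thread that has not run for a second or more is boosted into
 * the high_pri[] band and re-enqueued at its new priority.
 */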

void
sched_oncpu(lwp_t *l)
{
	struct schedstate_percpu *spc = &l->l_cpu->ci_schedstate;

	/* Update the counters */
	KASSERT(l->l_sched.timeslice >= min_ts);
	KASSERT(l->l_sched.timeslice <= max_ts);
	spc->spc_ticks = l->l_sched.timeslice;
}

/*
 * Time-driven events.
 */

/*
 * Called once per time-quantum.  This routine is CPU-local and runs at
 * IPL_SCHED, so no locking is needed.
 */
void
sched_tick(struct cpu_info *ci)
{
	struct schedstate_percpu *spc = &ci->ci_schedstate;
	struct lwp *l = curlwp;

	if (__predict_false(CURCPU_IDLE_P()))
		return;

	switch (l->l_class) {
	case SCHED_FIFO:
		/*
		 * If the thread runs under the FIFO real-time policy,
		 * update its time-quantum and let it continue running.
		 */
		KASSERT(l->l_priority > PRI_HIGHEST_TS);
		spc->spc_ticks = l->l_sched.timeslice;
		return;
	case SCHED_OTHER:
		/*
		 * If the thread is in the time-sharing queue, decrease its
		 * priority and run with a higher time-quantum.
		 */
		KASSERT(l->l_priority <= PRI_HIGHEST_TS);
		if (l->l_priority != 0)
			l->l_priority--;
		break;
	}

	/*
	 * If there are runnable threads of higher or equal priority, mark
	 * that the thread should yield; otherwise, let it continue running.
	 */
	if (lwp_eprio(l) <= spc->spc_maxpriority || l->l_target_cpu) {
		spc->spc_flags |= SPCF_SHOULDYIELD;
		cpu_need_resched(ci, 0);
	} else
		spc->spc_ticks = l->l_sched.timeslice;
}

/*
 * Sysctl nodes and initialization.
 */

static int
sysctl_sched_rtts(SYSCTLFN_ARGS)
{
	struct sysctlnode node;
	int rttsms = hztoms(rt_ts);

	node = *rnode;
	node.sysctl_data = &rttsms;
	return sysctl_lookup(SYSCTLFN_CALL(&node));
}

static int
sysctl_sched_mints(SYSCTLFN_ARGS)
{
	struct sysctlnode node;
	struct cpu_info *ci;
	int error, newsize;
	CPU_INFO_ITERATOR cii;

	node = *rnode;
	node.sysctl_data = &newsize;

	newsize = hztoms(min_ts);
	error = sysctl_lookup(SYSCTLFN_CALL(&node));
	if (error || newp == NULL)
		return error;

	newsize = mstohz(newsize);
	if (newsize < 1 || newsize > hz || newsize >= max_ts)
		return EINVAL;

	/* It is safe to do this in this order */
	for (CPU_INFO_FOREACH(cii, ci))
		spc_lock(ci);

	min_ts = newsize;
	sched_precalcts();

	for (CPU_INFO_FOREACH(cii, ci))
		spc_unlock(ci);

	return 0;
}

static int
sysctl_sched_maxts(SYSCTLFN_ARGS)
{
	struct sysctlnode node;
	struct cpu_info *ci;
	int error, newsize;
	CPU_INFO_ITERATOR cii;

	node = *rnode;
	node.sysctl_data = &newsize;

	newsize = hztoms(max_ts);
	error = sysctl_lookup(SYSCTLFN_CALL(&node));
	if (error || newp == NULL)
		return error;

	newsize = mstohz(newsize);
	if (newsize < 10 || newsize > hz || newsize <= min_ts)
		return EINVAL;

	/* It is safe to do this in this order */
	for (CPU_INFO_FOREACH(cii, ci))
		spc_lock(ci);

	max_ts = newsize;
	sched_precalcts();

	for (CPU_INFO_FOREACH(cii, ci))
		spc_unlock(ci);

	return 0;
}

SYSCTL_SETUP(sysctl_sched_m2_setup, "sysctl sched setup")
{
	const struct sysctlnode *node = NULL;

	sysctl_createv(clog, 0, NULL, NULL,
		CTLFLAG_PERMANENT,
		CTLTYPE_NODE, "kern", NULL,
		NULL, 0, NULL, 0,
		CTL_KERN, CTL_EOL);
	sysctl_createv(clog, 0, NULL, &node,
		CTLFLAG_PERMANENT,
		CTLTYPE_NODE, "sched",
		SYSCTL_DESCR("Scheduler options"),
		NULL, 0, NULL, 0,
		CTL_KERN, CTL_CREATE, CTL_EOL);

	if (node == NULL)
		return;

	sysctl_createv(NULL, 0, &node, NULL,
		CTLFLAG_PERMANENT,
		CTLTYPE_STRING, "name", NULL,
		NULL, 0, __UNCONST("M2"), 0,
		CTL_CREATE, CTL_EOL);
	sysctl_createv(NULL, 0, &node, NULL,
		CTLFLAG_PERMANENT,
		CTLTYPE_INT, "rtts",
		SYSCTL_DESCR("Round-robin time quantum (in milliseconds)"),
		sysctl_sched_rtts, 0, NULL, 0,
		CTL_CREATE, CTL_EOL);
	sysctl_createv(NULL, 0, &node, NULL,
		CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
		CTLTYPE_INT, "maxts",
		SYSCTL_DESCR("Maximal time quantum (in milliseconds)"),
		sysctl_sched_maxts, 0, &max_ts, 0,
		CTL_CREATE, CTL_EOL);
	sysctl_createv(NULL, 0, &node, NULL,
		CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
		CTLTYPE_INT, "mints",
		SYSCTL_DESCR("Minimal time quantum (in milliseconds)"),
		sysctl_sched_mints, 0, &min_ts, 0,
		CTL_CREATE, CTL_EOL);
}
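
/*
 * Editorial note (not part of the original source): the nodes created
 * above live under kern.sched.  A minimal usage sketch with the
 * standard sysctl(8) utility might look like:
 *
 *	sysctl kern.sched.name		# reports "M2"
 *	sysctl kern.sched.rtts		# read-only round-robin quantum
 *	sysctl -w kern.sched.maxts=200	# set the maximal quantum (ms)
 *	sysctl -w kern.sched.mints=10	# set the minimal quantum (ms)
 *
 * The handlers convert the millisecond value to ticks and reject it
 * with EINVAL unless it stays within 1..hz ticks (10..hz for maxts)
 * and keeps mints below maxts.
 */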