Home | History | Annotate | Line # | Download | only in kern
sched_m2.c revision 1.26
      1  1.26  rmind /*	$NetBSD: sched_m2.c,v 1.26 2008/10/07 09:48:27 rmind Exp $	*/
      2   1.1  rmind 
      3   1.1  rmind /*
      4  1.15  rmind  * Copyright (c) 2007, 2008 Mindaugas Rasiukevicius <rmind at NetBSD org>
      5  1.12  rmind  * All rights reserved.
      6   1.1  rmind  *
      7   1.1  rmind  * Redistribution and use in source and binary forms, with or without
      8   1.1  rmind  * modification, are permitted provided that the following conditions
      9   1.1  rmind  * are met:
     10   1.1  rmind  * 1. Redistributions of source code must retain the above copyright
     11   1.1  rmind  *    notice, this list of conditions and the following disclaimer.
     12   1.1  rmind  * 2. Redistributions in binary form must reproduce the above copyright
     13   1.1  rmind  *    notice, this list of conditions and the following disclaimer in the
     14   1.1  rmind  *    documentation and/or other materials provided with the distribution.
     15   1.1  rmind  *
     16  1.19  rmind  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
     17  1.19  rmind  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     18  1.19  rmind  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     19  1.19  rmind  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
     20  1.19  rmind  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
     21  1.19  rmind  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
     22  1.19  rmind  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
     23  1.19  rmind  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
     24  1.19  rmind  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
     25  1.19  rmind  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
     26  1.19  rmind  * SUCH DAMAGE.
     27   1.1  rmind  */
     28   1.1  rmind 
     29   1.1  rmind /*
     30   1.1  rmind  * TODO:
     31   1.1  rmind  *  - Implementation of fair share queue;
     32   1.1  rmind  *  - Support for NUMA;
     33   1.1  rmind  */
     34   1.1  rmind 
     35   1.1  rmind #include <sys/cdefs.h>
     36  1.26  rmind __KERNEL_RCSID(0, "$NetBSD: sched_m2.c,v 1.26 2008/10/07 09:48:27 rmind Exp $");
     37   1.1  rmind 
     38   1.1  rmind #include <sys/param.h>
     39   1.1  rmind 
     40   1.8  rmind #include <sys/bitops.h>
     41   1.1  rmind #include <sys/cpu.h>
     42   1.1  rmind #include <sys/callout.h>
     43   1.1  rmind #include <sys/errno.h>
     44   1.1  rmind #include <sys/kernel.h>
     45   1.1  rmind #include <sys/kmem.h>
     46   1.1  rmind #include <sys/lwp.h>
     47   1.1  rmind #include <sys/mutex.h>
     48   1.1  rmind #include <sys/pool.h>
     49   1.1  rmind #include <sys/proc.h>
     50  1.15  rmind #include <sys/pset.h>
     51   1.1  rmind #include <sys/resource.h>
     52   1.1  rmind #include <sys/resourcevar.h>
     53   1.1  rmind #include <sys/sched.h>
     54   1.1  rmind #include <sys/syscallargs.h>
     55   1.1  rmind #include <sys/sysctl.h>
     56   1.1  rmind #include <sys/types.h>
     57   1.1  rmind 
     58   1.1  rmind /*
     59  1.10     ad  * Priority related defintions.
     60   1.1  rmind  */
     61  1.10     ad #define	PRI_TS_COUNT	(NPRI_USER)
     62  1.10     ad #define	PRI_RT_COUNT	(PRI_COUNT - PRI_TS_COUNT)
     63  1.10     ad #define	PRI_HTS_RANGE	(PRI_TS_COUNT / 10)
     64  1.10     ad 
     65  1.11  rmind #define	PRI_HIGHEST_TS	(MAXPRI_USER)
     66  1.10     ad 
     67   1.1  rmind /*
     68   1.1  rmind  * Time-slices and priorities.
     69   1.1  rmind  */
     70   1.1  rmind static u_int	min_ts;			/* Minimal time-slice */
     71   1.1  rmind static u_int	max_ts;			/* Maximal time-slice */
     72   1.1  rmind static u_int	rt_ts;			/* Real-time time-slice */
     73   1.1  rmind static u_int	ts_map[PRI_COUNT];	/* Map of time-slices */
     74   1.1  rmind static pri_t	high_pri[PRI_COUNT];	/* Map for priority increase */
     75   1.1  rmind 
     76  1.25  rmind static void	sched_precalcts(void);
     77  1.25  rmind 
     78   1.1  rmind /*
     79   1.1  rmind  * Initialization and setup.
     80   1.1  rmind  */
     81   1.1  rmind 
     82   1.1  rmind void
     83   1.1  rmind sched_rqinit(void)
     84   1.1  rmind {
     85   1.1  rmind 	struct cpu_info *ci = curcpu();
     86   1.1  rmind 
     87   1.1  rmind 	if (hz < 100) {
     88   1.1  rmind 		panic("sched_rqinit: value of HZ is too low\n");
     89   1.1  rmind 	}
     90   1.1  rmind 
     91   1.1  rmind 	/* Default timing ranges */
     92  1.26  rmind 	min_ts = mstohz(20);			/*  ~20 ms */
     93  1.26  rmind 	max_ts = mstohz(150);			/* ~150 ms */
     94  1.26  rmind 	rt_ts = mstohz(100);			/* ~100 ms */
     95   1.1  rmind 	sched_precalcts();
     96   1.1  rmind 
     97   1.1  rmind 	/* Attach the primary CPU here */
     98   1.1  rmind 	sched_cpuattach(ci);
     99   1.1  rmind 
    100  1.10     ad 	sched_lwp_fork(NULL, &lwp0);
    101   1.1  rmind 	sched_newts(&lwp0);
    102   1.1  rmind }
    103   1.1  rmind 
    104   1.1  rmind /* Pre-calculate the time-slices for the priorities */
    105   1.1  rmind static void
    106   1.1  rmind sched_precalcts(void)
    107   1.1  rmind {
    108   1.1  rmind 	pri_t p;
    109   1.1  rmind 
    110  1.10     ad 	/* Time-sharing range */
    111  1.10     ad 	for (p = 0; p <= PRI_HIGHEST_TS; p++) {
    112  1.10     ad 		ts_map[p] = max_ts -
    113  1.10     ad 		    (p * 100 / (PRI_TS_COUNT - 1) * (max_ts - min_ts) / 100);
    114  1.10     ad 		high_pri[p] = (PRI_HIGHEST_TS - PRI_HTS_RANGE) +
    115  1.10     ad 		    ((p * PRI_HTS_RANGE) / (PRI_TS_COUNT - 1));
    116  1.10     ad 	}
    117  1.10     ad 
    118  1.10     ad 	/* Real-time range */
    119  1.10     ad 	for (p = (PRI_HIGHEST_TS + 1); p < PRI_COUNT; p++) {
    120   1.1  rmind 		ts_map[p] = rt_ts;
    121   1.1  rmind 		high_pri[p] = p;
    122   1.1  rmind 	}
    123   1.1  rmind }
    124   1.1  rmind 
    125   1.1  rmind /*
    126   1.1  rmind  * Hooks.
    127   1.1  rmind  */
    128   1.1  rmind 
    129   1.1  rmind void
    130   1.1  rmind sched_proc_fork(struct proc *parent, struct proc *child)
    131   1.1  rmind {
    132   1.1  rmind 	struct lwp *l;
    133   1.1  rmind 
    134   1.1  rmind 	LIST_FOREACH(l, &child->p_lwps, l_sibling) {
    135   1.1  rmind 		lwp_lock(l);
    136   1.1  rmind 		sched_newts(l);
    137   1.1  rmind 		lwp_unlock(l);
    138   1.1  rmind 	}
    139   1.1  rmind }
    140   1.1  rmind 
    141   1.1  rmind void
    142   1.1  rmind sched_proc_exit(struct proc *child, struct proc *parent)
    143   1.1  rmind {
    144   1.1  rmind 
    145   1.1  rmind }
    146   1.1  rmind 
    147   1.1  rmind void
    148  1.10     ad sched_lwp_fork(struct lwp *l1, struct lwp *l2)
    149   1.1  rmind {
    150   1.1  rmind 
    151   1.1  rmind }
    152   1.1  rmind 
    153   1.1  rmind void
    154  1.10     ad sched_lwp_collect(struct lwp *l)
    155  1.10     ad {
    156  1.10     ad 
    157  1.10     ad }
    158  1.10     ad 
    159  1.10     ad void
    160   1.1  rmind sched_setrunnable(struct lwp *l)
    161   1.1  rmind {
    162   1.1  rmind 
    163   1.1  rmind }
    164   1.1  rmind 
    165   1.1  rmind void
    166   1.1  rmind sched_schedclock(struct lwp *l)
    167   1.1  rmind {
    168   1.1  rmind 
    169   1.1  rmind }
    170   1.1  rmind 
    171   1.1  rmind /*
    172   1.1  rmind  * Priorities and time-slice.
    173   1.1  rmind  */
    174   1.1  rmind 
    175   1.1  rmind void
    176   1.1  rmind sched_nice(struct proc *p, int prio)
    177   1.1  rmind {
    178   1.1  rmind 
    179   1.1  rmind }
    180   1.1  rmind 
    181   1.1  rmind /* Recalculate the time-slice */
    182  1.24     ad void
    183   1.1  rmind sched_newts(struct lwp *l)
    184   1.1  rmind {
    185   1.1  rmind 
    186  1.26  rmind 	l->l_sched.timeslice = ts_map[lwp_eprio(l)];
    187   1.1  rmind }
    188   1.1  rmind 
    189   1.1  rmind void
    190   1.1  rmind sched_slept(struct lwp *l)
    191   1.1  rmind {
    192   1.1  rmind 
    193   1.1  rmind 	/*
    194  1.10     ad 	 * If thread is in time-sharing queue and batch flag is not marked,
    195  1.10     ad 	 * increase the the priority, and run with the lower time-quantum.
    196   1.1  rmind 	 */
    197  1.25  rmind 	if (l->l_priority < PRI_HIGHEST_TS && (l->l_flag & LW_BATCH) == 0) {
    198  1.10     ad 		KASSERT(l->l_class == SCHED_OTHER);
    199  1.10     ad 		l->l_priority++;
    200  1.10     ad 	}
    201   1.1  rmind }
    202   1.1  rmind 
    203   1.1  rmind void
    204   1.1  rmind sched_wakeup(struct lwp *l)
    205   1.1  rmind {
    206   1.1  rmind 
    207   1.1  rmind 	/* If thread was sleeping a second or more - set a high priority */
    208  1.25  rmind 	if (l->l_slptime >= 1)
    209  1.10     ad 		l->l_priority = high_pri[l->l_priority];
    210   1.1  rmind }
    211   1.1  rmind 
    212   1.1  rmind void
    213  1.25  rmind sched_pstats_hook(struct lwp *l, int batch)
    214   1.1  rmind {
    215  1.11  rmind 	pri_t prio;
    216   1.1  rmind 
    217  1.22  rmind 	/*
    218  1.22  rmind 	 * Estimate threads on time-sharing queue only, however,
    219  1.22  rmind 	 * exclude the highest priority for performance purposes.
    220  1.22  rmind 	 */
    221  1.26  rmind 	KASSERT(lwp_locked(l, NULL));
    222  1.10     ad 	if (l->l_priority >= PRI_HIGHEST_TS)
    223   1.1  rmind 		return;
    224  1.16  rmind 	KASSERT(l->l_class == SCHED_OTHER);
    225   1.1  rmind 
    226  1.10     ad 	/* If it is CPU-bound not a first time - decrease the priority */
    227  1.11  rmind 	prio = l->l_priority;
    228  1.11  rmind 	if (batch && prio != 0)
    229  1.11  rmind 		prio--;
    230  1.10     ad 
    231   1.1  rmind 	/* If thread was not ran a second or more - set a high priority */
    232  1.11  rmind 	if (l->l_stat == LSRUN) {
    233  1.25  rmind 		if (l->l_rticks && (hardclock_ticks - l->l_rticks >= hz))
    234  1.11  rmind 			prio = high_pri[prio];
    235  1.11  rmind 		/* Re-enqueue the thread if priority has changed */
    236  1.11  rmind 		if (prio != l->l_priority)
    237  1.11  rmind 			lwp_changepri(l, prio);
    238  1.11  rmind 	} else {
    239  1.11  rmind 		/* In other states, change the priority directly */
    240  1.11  rmind 		l->l_priority = prio;
    241  1.11  rmind 	}
    242   1.1  rmind }
    243   1.1  rmind 
    244  1.24     ad void
    245  1.24     ad sched_oncpu(lwp_t *l)
    246  1.16  rmind {
    247  1.26  rmind 	struct schedstate_percpu *spc = &l->l_cpu->ci_schedstate;
    248   1.1  rmind 
    249   1.1  rmind 	/* Update the counters */
    250  1.26  rmind 	KASSERT(l->l_sched.timeslice >= min_ts);
    251  1.26  rmind 	KASSERT(l->l_sched.timeslice <= max_ts);
    252  1.26  rmind 	spc->spc_ticks = l->l_sched.timeslice;
    253   1.1  rmind }
    254   1.1  rmind 
    255   1.1  rmind /*
    256   1.1  rmind  * Time-driven events.
    257   1.1  rmind  */
    258   1.1  rmind 
    259   1.1  rmind /*
    260   1.1  rmind  * Called once per time-quantum.  This routine is CPU-local and runs at
    261   1.1  rmind  * IPL_SCHED, thus the locking is not needed.
    262   1.1  rmind  */
    263   1.1  rmind void
    264   1.1  rmind sched_tick(struct cpu_info *ci)
    265   1.1  rmind {
    266   1.1  rmind 	struct schedstate_percpu *spc = &ci->ci_schedstate;
    267   1.1  rmind 	struct lwp *l = curlwp;
    268   1.1  rmind 
    269  1.26  rmind 	if (__predict_false(CURCPU_IDLE_P()))
    270   1.2  rmind 		return;
    271   1.1  rmind 
    272  1.10     ad 	switch (l->l_class) {
    273   1.2  rmind 	case SCHED_FIFO:
    274   1.2  rmind 		/*
    275   1.2  rmind 		 * Update the time-quantum, and continue running,
    276   1.2  rmind 		 * if thread runs on FIFO real-time policy.
    277   1.2  rmind 		 */
    278  1.16  rmind 		KASSERT(l->l_priority > PRI_HIGHEST_TS);
    279  1.26  rmind 		spc->spc_ticks = l->l_sched.timeslice;
    280   1.1  rmind 		return;
    281   1.2  rmind 	case SCHED_OTHER:
    282  1.10     ad 		/*
    283  1.10     ad 		 * If thread is in time-sharing queue, decrease the priority,
    284  1.10     ad 		 * and run with a higher time-quantum.
    285  1.10     ad 		 */
    286  1.16  rmind 		KASSERT(l->l_priority <= PRI_HIGHEST_TS);
    287  1.10     ad 		if (l->l_priority != 0)
    288  1.10     ad 			l->l_priority--;
    289   1.2  rmind 		break;
    290   1.1  rmind 	}
    291   1.1  rmind 
    292   1.1  rmind 	/*
    293   1.2  rmind 	 * If there are higher priority threads or threads in the same queue,
    294   1.2  rmind 	 * mark that thread should yield, otherwise, continue running.
    295   1.1  rmind 	 */
    296  1.24     ad 	if (lwp_eprio(l) <= spc->spc_maxpriority || l->l_target_cpu) {
    297   1.1  rmind 		spc->spc_flags |= SPCF_SHOULDYIELD;
    298   1.1  rmind 		cpu_need_resched(ci, 0);
    299   1.1  rmind 	} else
    300  1.26  rmind 		spc->spc_ticks = l->l_sched.timeslice;
    301   1.1  rmind }
    302   1.1  rmind 
    303   1.1  rmind /*
    304   1.1  rmind  * Sysctl nodes and initialization.
    305   1.1  rmind  */
    306   1.1  rmind 
    307   1.1  rmind static int
    308  1.15  rmind sysctl_sched_rtts(SYSCTLFN_ARGS)
    309  1.15  rmind {
    310  1.15  rmind 	struct sysctlnode node;
    311  1.15  rmind 	int rttsms = hztoms(rt_ts);
    312  1.15  rmind 
    313  1.15  rmind 	node = *rnode;
    314  1.15  rmind 	node.sysctl_data = &rttsms;
    315  1.15  rmind 	return sysctl_lookup(SYSCTLFN_CALL(&node));
    316  1.15  rmind }
    317  1.15  rmind 
    318  1.15  rmind static int
    319   1.1  rmind sysctl_sched_mints(SYSCTLFN_ARGS)
    320   1.1  rmind {
    321   1.1  rmind 	struct sysctlnode node;
    322   1.1  rmind 	struct cpu_info *ci;
    323   1.1  rmind 	int error, newsize;
    324   1.1  rmind 	CPU_INFO_ITERATOR cii;
    325   1.1  rmind 
    326   1.1  rmind 	node = *rnode;
    327   1.1  rmind 	node.sysctl_data = &newsize;
    328   1.1  rmind 
    329   1.1  rmind 	newsize = hztoms(min_ts);
    330   1.1  rmind 	error = sysctl_lookup(SYSCTLFN_CALL(&node));
    331   1.1  rmind 	if (error || newp == NULL)
    332   1.1  rmind 		return error;
    333   1.1  rmind 
    334   1.8  rmind 	newsize = mstohz(newsize);
    335   1.1  rmind 	if (newsize < 1 || newsize > hz || newsize >= max_ts)
    336   1.1  rmind 		return EINVAL;
    337   1.1  rmind 
    338   1.1  rmind 	/* It is safe to do this in such order */
    339   1.1  rmind 	for (CPU_INFO_FOREACH(cii, ci))
    340   1.1  rmind 		spc_lock(ci);
    341   1.1  rmind 
    342   1.8  rmind 	min_ts = newsize;
    343   1.1  rmind 	sched_precalcts();
    344   1.1  rmind 
    345   1.1  rmind 	for (CPU_INFO_FOREACH(cii, ci))
    346   1.1  rmind 		spc_unlock(ci);
    347   1.1  rmind 
    348   1.1  rmind 	return 0;
    349   1.1  rmind }
    350   1.1  rmind 
    351   1.1  rmind static int
    352   1.1  rmind sysctl_sched_maxts(SYSCTLFN_ARGS)
    353   1.1  rmind {
    354   1.1  rmind 	struct sysctlnode node;
    355   1.1  rmind 	struct cpu_info *ci;
    356   1.1  rmind 	int error, newsize;
    357   1.1  rmind 	CPU_INFO_ITERATOR cii;
    358   1.1  rmind 
    359   1.1  rmind 	node = *rnode;
    360   1.1  rmind 	node.sysctl_data = &newsize;
    361   1.1  rmind 
    362   1.1  rmind 	newsize = hztoms(max_ts);
    363   1.1  rmind 	error = sysctl_lookup(SYSCTLFN_CALL(&node));
    364   1.1  rmind 	if (error || newp == NULL)
    365   1.1  rmind 		return error;
    366   1.1  rmind 
    367   1.8  rmind 	newsize = mstohz(newsize);
    368   1.1  rmind 	if (newsize < 10 || newsize > hz || newsize <= min_ts)
    369   1.1  rmind 		return EINVAL;
    370   1.1  rmind 
    371   1.1  rmind 	/* It is safe to do this in such order */
    372   1.1  rmind 	for (CPU_INFO_FOREACH(cii, ci))
    373   1.1  rmind 		spc_lock(ci);
    374   1.1  rmind 
    375   1.8  rmind 	max_ts = newsize;
    376   1.1  rmind 	sched_precalcts();
    377   1.1  rmind 
    378   1.1  rmind 	for (CPU_INFO_FOREACH(cii, ci))
    379   1.1  rmind 		spc_unlock(ci);
    380   1.1  rmind 
    381   1.1  rmind 	return 0;
    382   1.1  rmind }
    383   1.1  rmind 
    384  1.24     ad SYSCTL_SETUP(sysctl_sched_m2_setup, "sysctl sched setup")
    385   1.1  rmind {
    386   1.1  rmind 	const struct sysctlnode *node = NULL;
    387   1.1  rmind 
    388   1.1  rmind 	sysctl_createv(clog, 0, NULL, NULL,
    389   1.1  rmind 		CTLFLAG_PERMANENT,
    390   1.1  rmind 		CTLTYPE_NODE, "kern", NULL,
    391   1.1  rmind 		NULL, 0, NULL, 0,
    392   1.1  rmind 		CTL_KERN, CTL_EOL);
    393   1.1  rmind 	sysctl_createv(clog, 0, NULL, &node,
    394   1.1  rmind 		CTLFLAG_PERMANENT,
    395   1.1  rmind 		CTLTYPE_NODE, "sched",
    396   1.1  rmind 		SYSCTL_DESCR("Scheduler options"),
    397   1.1  rmind 		NULL, 0, NULL, 0,
    398   1.1  rmind 		CTL_KERN, CTL_CREATE, CTL_EOL);
    399   1.1  rmind 
    400   1.1  rmind 	if (node == NULL)
    401   1.1  rmind 		return;
    402   1.1  rmind 
    403  1.24     ad 	sysctl_createv(NULL, 0, &node, NULL,
    404   1.1  rmind 		CTLFLAG_PERMANENT,
    405   1.1  rmind 		CTLTYPE_STRING, "name", NULL,
    406   1.1  rmind 		NULL, 0, __UNCONST("M2"), 0,
    407   1.1  rmind 		CTL_CREATE, CTL_EOL);
    408  1.24     ad 	sysctl_createv(NULL, 0, &node, NULL,
    409  1.15  rmind 		CTLFLAG_PERMANENT,
    410  1.15  rmind 		CTLTYPE_INT, "rtts",
    411  1.15  rmind 		SYSCTL_DESCR("Round-robin time quantum (in miliseconds)"),
    412  1.15  rmind 		sysctl_sched_rtts, 0, NULL, 0,
    413  1.15  rmind 		CTL_CREATE, CTL_EOL);
    414  1.24     ad 	sysctl_createv(NULL, 0, &node, NULL,
    415   1.1  rmind 		CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
    416   1.1  rmind 		CTLTYPE_INT, "maxts",
    417   1.8  rmind 		SYSCTL_DESCR("Maximal time quantum (in miliseconds)"),
    418   1.1  rmind 		sysctl_sched_maxts, 0, &max_ts, 0,
    419   1.1  rmind 		CTL_CREATE, CTL_EOL);
    420  1.24     ad 	sysctl_createv(NULL, 0, &node, NULL,
    421   1.1  rmind 		CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
    422   1.1  rmind 		CTLTYPE_INT, "mints",
    423   1.8  rmind 		SYSCTL_DESCR("Minimal time quantum (in miliseconds)"),
    424   1.1  rmind 		sysctl_sched_mints, 0, &min_ts, 0,
    425   1.1  rmind 		CTL_CREATE, CTL_EOL);
    426   1.1  rmind }
    427