Home | History | Annotate | Line # | Download | only in kern
      1 /*	$NetBSD: sched_m2.c,v 1.41 2026/01/04 02:10:01 riastradh Exp $	*/
      2 
      3 /*
      4  * Copyright (c) 2007, 2008 Mindaugas Rasiukevicius <rmind at NetBSD org>
      5  * All rights reserved.
      6  *
      7  * Redistribution and use in source and binary forms, with or without
      8  * modification, are permitted provided that the following conditions
      9  * are met:
     10  * 1. Redistributions of source code must retain the above copyright
     11  *    notice, this list of conditions and the following disclaimer.
     12  * 2. Redistributions in binary form must reproduce the above copyright
     13  *    notice, this list of conditions and the following disclaimer in the
     14  *    documentation and/or other materials provided with the distribution.
     15  *
     16  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
     17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     18  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     19  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
     20  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
     21  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
     22  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
     23  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
     24  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
     25  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
     26  * SUCH DAMAGE.
     27  */
     28 
     29 /*
     30  * TODO:
     31  *  - Implementation of fair share queue;
     32  *  - Support for NUMA;
     33  */
     34 
     35 #include <sys/cdefs.h>
     36 __KERNEL_RCSID(0, "$NetBSD: sched_m2.c,v 1.41 2026/01/04 02:10:01 riastradh Exp $");
     37 
     38 #include <sys/param.h>
     39 
     40 #include <sys/cpu.h>
     41 #include <sys/callout.h>
     42 #include <sys/errno.h>
     43 #include <sys/kernel.h>
     44 #include <sys/kmem.h>
     45 #include <sys/lwp.h>
     46 #include <sys/mutex.h>
     47 #include <sys/pool.h>
     48 #include <sys/proc.h>
     49 #include <sys/pset.h>
     50 #include <sys/resource.h>
     51 #include <sys/resourcevar.h>
     52 #include <sys/sched.h>
     53 #include <sys/sdt.h>
     54 #include <sys/syscallargs.h>
     55 #include <sys/sysctl.h>
     56 #include <sys/types.h>
     57 
     58 /*
     59  * Priority related definitions.
     60  */
     61 #define	PRI_TS_COUNT	(NPRI_USER)
     62 #define	PRI_RT_COUNT	(PRI_COUNT - PRI_TS_COUNT)
     63 #define	PRI_HTS_RANGE	(PRI_TS_COUNT / 10)
     64 
     65 #define	PRI_HIGHEST_TS	(MAXPRI_USER)
     66 
     67 /*
     68  * Time-slices and priorities.
     69  */
     70 static u_int	min_ts;			/* Minimal time-slice */
     71 static u_int	max_ts;			/* Maximal time-slice */
     72 static u_int	ts_map[PRI_COUNT];	/* Map of time-slices */
     73 static pri_t	high_pri[PRI_COUNT];	/* Map for priority increase */
     74 u_int		sched_rrticks;		/* Real-time time-slice */
     75 
     76 static void	sched_precalcts(void);
     77 
     78 /*
     79  * Initialization and setup.
     80  */
     81 
/*
 * sched_rqinit: one-time scheduler start-up.  Establishes the default
 * time-slice ranges, pre-computes the per-priority time-slice map, and
 * gives lwp0 its scheduling state.  Runs before CPU attachment (see the
 * "notyet" blocks below), so lwp0's l_mutex cannot yet be taken.
 */
void
sched_rqinit(void)
{
	/* The millisecond-based defaults below need >= 100 ticks/sec. */
	if (hz < 100) {
		panic("sched_rqinit: value of HZ is too low\n");
	}

	/* Default timing ranges */
	min_ts = mstohz(20);			/*  ~20 ms */
	max_ts = mstohz(150);			/* ~150 ms */
	sched_rrticks = mstohz(100);			/* ~100 ms */
	sched_precalcts();

#ifdef notyet
	/* Need to set the name etc. This does not belong here */
	/* Attach the primary CPU here */
	sched_cpuattach(curcpu());
#endif

	sched_lwp_fork(NULL, &lwp0);
#ifdef notyet
	/* without attaching the primary CPU l_mutex does not get initialized */
	lwp_lock(&lwp0);
	sched_newts(&lwp0);
	lwp_unlock(&lwp0);
#else
	/*
	 * gross: assign lwp0's initial time-slice directly because
	 * sched_newts() cannot be called without a valid l_mutex here.
	 */
	lwp0.l_sched.timeslice = ts_map[lwp0.l_auxprio];
#endif
}
    112 
    113 /* Pre-calculate the time-slices for the priorities */
    114 static void
    115 sched_precalcts(void)
    116 {
    117 	pri_t p;
    118 
    119 	/* Time-sharing range */
    120 	for (p = 0; p <= PRI_HIGHEST_TS; p++) {
    121 		ts_map[p] = max_ts -
    122 		    (p * 100 / (PRI_TS_COUNT - 1) * (max_ts - min_ts) / 100);
    123 		high_pri[p] = (PRI_HIGHEST_TS - PRI_HTS_RANGE) +
    124 		    ((p * PRI_HTS_RANGE) / (PRI_TS_COUNT - 1));
    125 	}
    126 
    127 	/* Real-time range */
    128 	for (p = (PRI_HIGHEST_TS + 1); p < PRI_COUNT; p++) {
    129 		ts_map[p] = sched_rrticks;
    130 		high_pri[p] = p;
    131 	}
    132 }
    133 
    134 /*
    135  * Hooks.
    136  */
    137 
    138 void
    139 sched_proc_fork(struct proc *parent, struct proc *child)
    140 {
    141 	struct lwp *l;
    142 
    143 	LIST_FOREACH(l, &child->p_lwps, l_sibling) {
    144 		lwp_lock(l);
    145 		sched_newts(l);
    146 		lwp_unlock(l);
    147 	}
    148 }
    149 
/* Process-exit hook: nothing to do in the M2 scheduler. */
void
sched_proc_exit(struct proc *child, struct proc *parent)
{

}
    155 
/* LWP-fork hook: no per-LWP scheduler state to set up here. */
void
sched_lwp_fork(struct lwp *l1, struct lwp *l2)
{

}
    161 
/* LWP-reap hook: no per-LWP scheduler state to release. */
void
sched_lwp_collect(struct lwp *l)
{

}
    167 
/* Runnable-transition hook: nothing to do in the M2 scheduler. */
void
sched_setrunnable(struct lwp *l)
{

}
    173 
/* Scheduler-clock hook: unused; quantum accounting is in sched_tick(). */
void
sched_schedclock(struct lwp *l)
{

}
    179 
    180 /*
    181  * Priorities and time-slice.
    182  */
    183 
    184 void
    185 sched_nice(struct proc *p, int prio)
    186 {
    187 	struct lwp *l;
    188 	int n;
    189 
    190 	KASSERT(mutex_owned(p->p_lock));
    191 
    192 	p->p_nice = prio;
    193 	n = (prio - NZERO) >> 2;
    194 	if (n == 0)
    195 		return;
    196 
    197 	LIST_FOREACH(l, &p->p_lwps, l_sibling) {
    198 		lwp_lock(l);
    199 		if (l->l_class == SCHED_OTHER) {
    200 			pri_t pri = l->l_priority - n;
    201 			pri = (n < 0) ? uimin(pri, PRI_HIGHEST_TS) : imax(pri, 0);
    202 			lwp_changepri(l, pri);
    203 		}
    204 		lwp_unlock(l);
    205 	}
    206 }
    207 
    208 /* Recalculate the time-slice */
    209 void
    210 sched_newts(struct lwp *l)
    211 {
    212 
    213 	l->l_sched.timeslice = ts_map[lwp_eprio(l)];
    214 }
    215 
    216 void
    217 sched_slept(struct lwp *l)
    218 {
    219 
    220 	/*
    221 	 * If thread is in time-sharing queue and batch flag is not marked,
    222 	 * increase the priority, and run with the lower time-quantum.
    223 	 */
    224 	if (l->l_priority < PRI_HIGHEST_TS && (l->l_flag & LW_BATCH) == 0) {
    225 		struct proc *p = l->l_proc;
    226 
    227 		KASSERT(l->l_class == SCHED_OTHER);
    228 		if (__predict_false(p->p_nice < NZERO)) {
    229 			const int n = uimax((NZERO - p->p_nice) >> 2, 1);
    230 			l->l_priority = uimin(l->l_priority + n, PRI_HIGHEST_TS);
    231 		} else {
    232 			l->l_priority++;
    233 		}
    234 	}
    235 }
    236 
    237 void
    238 sched_wakeup(struct lwp *l)
    239 {
    240 
    241 	/* If thread was sleeping a second or more - set a high priority */
    242 	if (l->l_slptime >= 1)
    243 		l->l_priority = high_pri[l->l_priority];
    244 }
    245 
/*
 * Periodic statistics hook (called about once per second per LWP).
 * Decays the priority of CPU-bound time-sharing threads and boosts
 * runnable threads that have not run for a second or more.
 *
 * => l is locked; "batch" is non-zero when the LWP looks CPU-bound.
 */
void
sched_pstats_hook(struct lwp *l, int batch)
{
	pri_t prio;

	/*
	 * Estimate threads on time-sharing queue only, however,
	 * exclude the highest priority for performance purposes.
	 */
	KASSERT(lwp_locked(l, NULL));
	if (l->l_priority >= PRI_HIGHEST_TS)
		return;
	KASSERT(l->l_class == SCHED_OTHER);

	/* If it is CPU-bound not a first time - decrease the priority */
	prio = l->l_priority;
	if (batch && prio != 0)
		prio--;

	/* If thread was not ran a second or more - set a high priority */
	if (l->l_stat == LSRUN) {
		/* l_rticks records the last tick it was seen running. */
		if (l->l_rticks && (getticks() - l->l_rticks >= hz))
			prio = high_pri[prio];
		/* Re-enqueue the thread if priority has changed */
		if (prio != l->l_priority)
			lwp_changepri(l, prio);
	} else {
		/* In other states, change the priority directly */
		l->l_priority = prio;
	}
}
    277 
    278 void
    279 sched_oncpu(lwp_t *l)
    280 {
    281 	struct schedstate_percpu *spc = &l->l_cpu->ci_schedstate;
    282 
    283 	/* Update the counters */
    284 	KASSERT(l->l_sched.timeslice >= min_ts);
    285 	KASSERT(l->l_sched.timeslice <= max_ts);
    286 	spc->spc_ticks = l->l_sched.timeslice;
    287 }
    288 
    289 /*
    290  * Time-driven events.
    291  */
    292 
/*
 * Called once per time-quantum, with the running LWP lock held (spc_lwplock).
 *
 * Accounts the expired quantum: SCHED_FIFO threads simply receive a new
 * quantum; SCHED_OTHER threads have their priority decayed (faster with
 * a positive nice value).  If a thread of equal or higher priority is
 * runnable, or the LWP is migrating, the CPU is marked so the running
 * LWP yields; otherwise it is granted another quantum.
 */
void
sched_tick(struct cpu_info *ci)
{
	struct schedstate_percpu *spc = &ci->ci_schedstate;
	struct lwp *l = ci->ci_onproc;
	struct proc *p;

	/* The idle LWP is never charged for its quantum. */
	if (__predict_false(CURCPU_IDLE_P()))
		return;

	lwp_lock(l);
	KASSERT(l->l_mutex != spc->spc_mutex);
	switch (l->l_class) {
	case SCHED_FIFO:
		/*
		 * Update the time-quantum, and continue running,
		 * if thread runs on FIFO real-time policy.
		 */
		KASSERT(l->l_priority > PRI_HIGHEST_TS);
		spc->spc_ticks = l->l_sched.timeslice;
		lwp_unlock(l);
		return;
	case SCHED_OTHER:
		/*
		 * If thread is in time-sharing queue, decrease the priority,
		 * and run with a higher time-quantum.
		 */
		KASSERT(l->l_priority <= PRI_HIGHEST_TS);
		if (l->l_priority == 0)
			break;

		p = l->l_proc;
		if (__predict_false(p->p_nice > NZERO)) {
			/* Positive nice: decay faster, at least one step. */
			const int n = uimax((p->p_nice - NZERO) >> 2, 1);
			l->l_priority = imax(l->l_priority - n, 0);
		} else
			l->l_priority--;
		break;
	}

	/*
	 * If there are higher priority threads or threads in the same queue,
	 * mark that thread should yield, otherwise, continue running.
	 */
	if (lwp_eprio(l) <= spc->spc_maxpriority || l->l_target_cpu) {
		spc->spc_flags |= SPCF_SHOULDYIELD;
		spc_lock(ci);
		sched_resched_cpu(ci, MAXPRI_KTHREAD, true);
		/* spc now unlocked */
	} else
		spc->spc_ticks = l->l_sched.timeslice;
	lwp_unlock(l);
}
    349 
    350 /*
    351  * Sysctl nodes and initialization.
    352  */
    353 
    354 static int
    355 sysctl_sched_rtts(SYSCTLFN_ARGS)
    356 {
    357 	struct sysctlnode node;
    358 	int rttsms = hztoms(sched_rrticks);
    359 
    360 	node = *rnode;
    361 	node.sysctl_data = &rttsms;
    362 	return sysctl_lookup(SYSCTLFN_CALL(&node));
    363 }
    364 
    365 static int
    366 sysctl_sched_mints(SYSCTLFN_ARGS)
    367 {
    368 	struct sysctlnode node;
    369 	struct cpu_info *ci;
    370 	int error, newsize;
    371 	CPU_INFO_ITERATOR cii;
    372 
    373 	node = *rnode;
    374 	node.sysctl_data = &newsize;
    375 
    376 	newsize = hztoms(min_ts);
    377 	error = sysctl_lookup(SYSCTLFN_CALL(&node));
    378 	if (error || newp == NULL)
    379 		return error;
    380 
    381 	newsize = mstohz(newsize);
    382 	if (newsize < 1 || newsize > hz || newsize >= max_ts)
    383 		return SET_ERROR(EINVAL);
    384 
    385 	/* It is safe to do this in such order */
    386 	for (CPU_INFO_FOREACH(cii, ci))
    387 		spc_lock(ci);
    388 
    389 	min_ts = newsize;
    390 	sched_precalcts();
    391 
    392 	for (CPU_INFO_FOREACH(cii, ci))
    393 		spc_unlock(ci);
    394 
    395 	return 0;
    396 }
    397 
    398 static int
    399 sysctl_sched_maxts(SYSCTLFN_ARGS)
    400 {
    401 	struct sysctlnode node;
    402 	struct cpu_info *ci;
    403 	int error, newsize;
    404 	CPU_INFO_ITERATOR cii;
    405 
    406 	node = *rnode;
    407 	node.sysctl_data = &newsize;
    408 
    409 	newsize = hztoms(max_ts);
    410 	error = sysctl_lookup(SYSCTLFN_CALL(&node));
    411 	if (error || newp == NULL)
    412 		return error;
    413 
    414 	newsize = mstohz(newsize);
    415 	if (newsize < 10 || newsize > hz || newsize <= min_ts)
    416 		return SET_ERROR(EINVAL);
    417 
    418 	/* It is safe to do this in such order */
    419 	for (CPU_INFO_FOREACH(cii, ci))
    420 		spc_lock(ci);
    421 
    422 	max_ts = newsize;
    423 	sched_precalcts();
    424 
    425 	for (CPU_INFO_FOREACH(cii, ci))
    426 		spc_unlock(ci);
    427 
    428 	return 0;
    429 }
    430 
/*
 * Attach the kern.sched sysctl subtree: the scheduler name plus the
 * round-robin/min/max time-quantum knobs handled above.
 */
SYSCTL_SETUP(sysctl_sched_m2_setup, "sysctl sched setup")
{
	const struct sysctlnode *node = NULL;

	sysctl_createv(clog, 0, NULL, &node,
		CTLFLAG_PERMANENT,
		CTLTYPE_NODE, "sched",
		SYSCTL_DESCR("Scheduler options"),
		NULL, 0, NULL, 0,
		CTL_KERN, CTL_CREATE, CTL_EOL);

	/* Without the parent node the leaves cannot be attached. */
	if (node == NULL)
		return;

	sysctl_createv(NULL, 0, &node, NULL,
		CTLFLAG_PERMANENT,
		CTLTYPE_STRING, "name", NULL,
		NULL, 0, __UNCONST("M2"), 0,
		CTL_CREATE, CTL_EOL);
	sysctl_createv(NULL, 0, &node, NULL,
		CTLFLAG_PERMANENT,
		CTLTYPE_INT, "rtts",
		SYSCTL_DESCR("Round-robin time quantum (in milliseconds)"),
		sysctl_sched_rtts, 0, NULL, 0,
		CTL_CREATE, CTL_EOL);
	sysctl_createv(NULL, 0, &node, NULL,
		CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
		CTLTYPE_INT, "maxts",
		SYSCTL_DESCR("Maximal time quantum (in milliseconds)"),
		sysctl_sched_maxts, 0, &max_ts, 0,
		CTL_CREATE, CTL_EOL);
	sysctl_createv(NULL, 0, &node, NULL,
		CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
		CTLTYPE_INT, "mints",
		SYSCTL_DESCR("Minimal time quantum (in milliseconds)"),
		sysctl_sched_mints, 0, &min_ts, 0,
		CTL_CREATE, CTL_EOL);
}
    469