/*	$NetBSD: sched_m2.c,v 1.41 2026/01/04 02:10:01 riastradh Exp $	*/

/*
 * Copyright (c) 2007, 2008 Mindaugas Rasiukevicius <rmind at NetBSD org>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * TODO:
 * - Implementation of fair share queue;
 * - Support for NUMA;
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: sched_m2.c,v 1.41 2026/01/04 02:10:01 riastradh Exp $");

#include <sys/param.h>

#include <sys/cpu.h>
#include <sys/callout.h>
#include <sys/errno.h>
#include <sys/kernel.h>
#include <sys/kmem.h>
#include <sys/lwp.h>
#include <sys/mutex.h>
#include <sys/pool.h>
#include <sys/proc.h>
#include <sys/pset.h>
#include <sys/resource.h>
#include <sys/resourcevar.h>
#include <sys/sched.h>
#include <sys/sdt.h>
#include <sys/syscallargs.h>
#include <sys/sysctl.h>
#include <sys/types.h>

/*
 * Priority related definitions.
 */
#define PRI_TS_COUNT	(NPRI_USER)
#define PRI_RT_COUNT	(PRI_COUNT - PRI_TS_COUNT)
#define PRI_HTS_RANGE	(PRI_TS_COUNT / 10)

#define PRI_HIGHEST_TS	(MAXPRI_USER)

/*
 * Time-slices and priorities.
 */
static u_int	min_ts;			/* Minimal time-slice */
static u_int	max_ts;			/* Maximal time-slice */
static u_int	ts_map[PRI_COUNT];	/* Map of time-slices */
static pri_t	high_pri[PRI_COUNT];	/* Map for priority increase */
u_int		sched_rrticks;		/* Real-time time-slice */

static void	sched_precalcts(void);

/*
 * Initialization and setup.
 */

void
sched_rqinit(void)
{
	if (hz < 100) {
		panic("sched_rqinit: value of HZ is too low\n");
	}

	/* Default timing ranges */
	min_ts = mstohz(20);			/* ~20 ms */
	max_ts = mstohz(150);			/* ~150 ms */
	sched_rrticks = mstohz(100);		/* ~100 ms */
	sched_precalcts();

#ifdef notyet
	/* Need to set the name etc.  This does not belong here */
	/* Attach the primary CPU here */
	sched_cpuattach(curcpu());
#endif

	sched_lwp_fork(NULL, &lwp0);
#ifdef notyet
	/* Without attaching the primary CPU, l_mutex does not get initialized */
	lwp_lock(&lwp0);
	sched_newts(&lwp0);
	lwp_unlock(&lwp0);
#else
	/* gross */
	lwp0.l_sched.timeslice = ts_map[lwp0.l_auxprio];
#endif
}
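
/*
 * Worked example of the tables below (illustrative only; assumes
 * hz = 100 and NPRI_USER = 64, so PRI_HIGHEST_TS = 63 and
 * PRI_HTS_RANGE = 6).  Then min_ts = mstohz(20) = 2 ticks and
 * max_ts = mstohz(150) = 15 ticks, and sched_precalcts() fills
 * ts_map[] by linear interpolation from 15 ticks at priority 0 down
 * to 2 ticks at priority 63, while high_pri[] maps every
 * time-sharing priority into the top band [57..63], where threads
 * woken from a long sleep are placed.
 */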

/* Pre-calculate the time-slices for the priorities */
static void
sched_precalcts(void)
{
	pri_t p;

	/* Time-sharing range */
	for (p = 0; p <= PRI_HIGHEST_TS; p++) {
		ts_map[p] = max_ts -
		    (p * 100 / (PRI_TS_COUNT - 1) * (max_ts - min_ts) / 100);
		high_pri[p] = (PRI_HIGHEST_TS - PRI_HTS_RANGE) +
		    ((p * PRI_HTS_RANGE) / (PRI_TS_COUNT - 1));
	}

	/* Real-time range */
	for (p = (PRI_HIGHEST_TS + 1); p < PRI_COUNT; p++) {
		ts_map[p] = sched_rrticks;
		high_pri[p] = p;
	}
}

/*
 * Hooks.
 */

void
sched_proc_fork(struct proc *parent, struct proc *child)
{
	struct lwp *l;

	LIST_FOREACH(l, &child->p_lwps, l_sibling) {
		lwp_lock(l);
		sched_newts(l);
		lwp_unlock(l);
	}
}

void
sched_proc_exit(struct proc *child, struct proc *parent)
{

}

void
sched_lwp_fork(struct lwp *l1, struct lwp *l2)
{

}

void
sched_lwp_collect(struct lwp *l)
{

}

void
sched_setrunnable(struct lwp *l)
{

}

void
sched_schedclock(struct lwp *l)
{

}

/*
 * Priorities and time-slice.
 */

void
sched_nice(struct proc *p, int prio)
{
	struct lwp *l;
	int n;

	KASSERT(mutex_owned(p->p_lock));

	p->p_nice = prio;
	n = (prio - NZERO) >> 2;
	if (n == 0)
		return;

	LIST_FOREACH(l, &p->p_lwps, l_sibling) {
		lwp_lock(l);
		if (l->l_class == SCHED_OTHER) {
			pri_t pri = l->l_priority - n;
			pri = (n < 0) ? uimin(pri, PRI_HIGHEST_TS) : imax(pri, 0);
			lwp_changepri(l, pri);
		}
		lwp_unlock(l);
	}
}

/* Recalculate the time-slice */
void
sched_newts(struct lwp *l)
{

	l->l_sched.timeslice = ts_map[lwp_eprio(l)];
}

void
sched_slept(struct lwp *l)
{

	/*
	 * If the thread is in the time-sharing queue and the batch flag
	 * is not set, increase its priority and let it run with a
	 * shorter time-quantum.
	 */
	if (l->l_priority < PRI_HIGHEST_TS && (l->l_flag & LW_BATCH) == 0) {
		struct proc *p = l->l_proc;

		KASSERT(l->l_class == SCHED_OTHER);
		if (__predict_false(p->p_nice < NZERO)) {
			const int n = uimax((NZERO - p->p_nice) >> 2, 1);
			l->l_priority = uimin(l->l_priority + n, PRI_HIGHEST_TS);
		} else {
			l->l_priority++;
		}
	}
}

void
sched_wakeup(struct lwp *l)
{

	/* If the thread slept for a second or more, boost it to a high priority */
	if (l->l_slptime >= 1)
		l->l_priority = high_pri[l->l_priority];
}
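
/*
 * Example of the boost arithmetic above (illustrative; assumes the
 * usual NZERO = 20): a process running at nice -20 has p_nice = 0,
 * so every sleep boosts its LWPs by uimax((20 - 0) >> 2, 1) = 5
 * priority steps, whereas a thread at the default nice level gains
 * one step per sleep and loses one per expired time-slice in
 * sched_tick().  Interactive threads thus drift toward higher
 * priorities and shorter quanta; CPU hogs drift the opposite way.
 */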

void
sched_pstats_hook(struct lwp *l, int batch)
{
	pri_t prio;

	/*
	 * Estimate only threads on the time-sharing queue and, for
	 * performance reasons, skip the highest priority.
	 */
	KASSERT(lwp_locked(l, NULL));
	if (l->l_priority >= PRI_HIGHEST_TS)
		return;
	KASSERT(l->l_class == SCHED_OTHER);

	/* If the thread is CPU-bound, and not for the first time, decrease its priority */
	prio = l->l_priority;
	if (batch && prio != 0)
		prio--;

	/* If the thread has not run for a second or more, boost it to a high priority */
	if (l->l_stat == LSRUN) {
		if (l->l_rticks && (getticks() - l->l_rticks >= hz))
			prio = high_pri[prio];
		/* Re-enqueue the thread if its priority has changed */
		if (prio != l->l_priority)
			lwp_changepri(l, prio);
	} else {
		/* In other states, change the priority directly */
		l->l_priority = prio;
	}
}

void
sched_oncpu(lwp_t *l)
{
	struct schedstate_percpu *spc = &l->l_cpu->ci_schedstate;

	/* Update the counters */
	KASSERT(l->l_sched.timeslice >= min_ts);
	KASSERT(l->l_sched.timeslice <= max_ts);
	spc->spc_ticks = l->l_sched.timeslice;
}

/*
 * Time-driven events.
 */

/*
 * Called once per time-quantum, with the running LWP lock held (spc_lwplock).
 */
void
sched_tick(struct cpu_info *ci)
{
	struct schedstate_percpu *spc = &ci->ci_schedstate;
	struct lwp *l = ci->ci_onproc;
	struct proc *p;

	if (__predict_false(CURCPU_IDLE_P()))
		return;

	lwp_lock(l);
	KASSERT(l->l_mutex != spc->spc_mutex);
	switch (l->l_class) {
	case SCHED_FIFO:
		/*
		 * If the thread runs under the FIFO real-time policy,
		 * refresh its time-quantum and continue running.
		 */
		KASSERT(l->l_priority > PRI_HIGHEST_TS);
		spc->spc_ticks = l->l_sched.timeslice;
		lwp_unlock(l);
		return;
	case SCHED_OTHER:
		/*
		 * If the thread is in the time-sharing queue, decrease
		 * its priority and let it run with a longer time-quantum.
		 */
		KASSERT(l->l_priority <= PRI_HIGHEST_TS);
		if (l->l_priority == 0)
			break;

		p = l->l_proc;
		if (__predict_false(p->p_nice > NZERO)) {
			const int n = uimax((p->p_nice - NZERO) >> 2, 1);
			l->l_priority = imax(l->l_priority - n, 0);
		} else
			l->l_priority--;
		break;
	}

	/*
	 * If there are higher priority threads, or threads in the same
	 * queue, mark that the thread should yield; otherwise, continue
	 * running.
	 */
	if (lwp_eprio(l) <= spc->spc_maxpriority || l->l_target_cpu) {
		spc->spc_flags |= SPCF_SHOULDYIELD;
		spc_lock(ci);
		sched_resched_cpu(ci, MAXPRI_KTHREAD, true);
		/* spc now unlocked */
	} else
		spc->spc_ticks = l->l_sched.timeslice;
	lwp_unlock(l);
}

/*
 * Sysctl nodes and initialization.
 */
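
/*
 * The handlers below export the quanta under kern.sched, converting
 * between ticks and milliseconds with hztoms()/mstohz() at the
 * boundary.  A hypothetical tuning session from userland:
 *
 *	sysctl kern.sched.rtts			# round-robin quantum (read-only)
 *	sysctl -w kern.sched.maxts=200		# raise the maximal quantum
 *	sysctl -w kern.sched.mints=10		# lower the minimal quantum
 *
 * Writes are rejected with EINVAL unless, in ticks, mints >= 1,
 * maxts >= 10, both are <= hz, and mints < maxts; a successful
 * update takes every CPU's spc lock and re-runs sched_precalcts().
 */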

static int
sysctl_sched_rtts(SYSCTLFN_ARGS)
{
	struct sysctlnode node;
	int rttsms = hztoms(sched_rrticks);

	node = *rnode;
	node.sysctl_data = &rttsms;
	return sysctl_lookup(SYSCTLFN_CALL(&node));
}

static int
sysctl_sched_mints(SYSCTLFN_ARGS)
{
	struct sysctlnode node;
	struct cpu_info *ci;
	int error, newsize;
	CPU_INFO_ITERATOR cii;

	node = *rnode;
	node.sysctl_data = &newsize;

	newsize = hztoms(min_ts);
	error = sysctl_lookup(SYSCTLFN_CALL(&node));
	if (error || newp == NULL)
		return error;

	newsize = mstohz(newsize);
	if (newsize < 1 || newsize > hz || newsize >= max_ts)
		return SET_ERROR(EINVAL);

	/* It is safe to do this in this order */
	for (CPU_INFO_FOREACH(cii, ci))
		spc_lock(ci);

	min_ts = newsize;
	sched_precalcts();

	for (CPU_INFO_FOREACH(cii, ci))
		spc_unlock(ci);

	return 0;
}

static int
sysctl_sched_maxts(SYSCTLFN_ARGS)
{
	struct sysctlnode node;
	struct cpu_info *ci;
	int error, newsize;
	CPU_INFO_ITERATOR cii;

	node = *rnode;
	node.sysctl_data = &newsize;

	newsize = hztoms(max_ts);
	error = sysctl_lookup(SYSCTLFN_CALL(&node));
	if (error || newp == NULL)
		return error;

	newsize = mstohz(newsize);
	if (newsize < 10 || newsize > hz || newsize <= min_ts)
		return SET_ERROR(EINVAL);

	/* It is safe to do this in this order */
	for (CPU_INFO_FOREACH(cii, ci))
		spc_lock(ci);

	max_ts = newsize;
	sched_precalcts();

	for (CPU_INFO_FOREACH(cii, ci))
		spc_unlock(ci);

	return 0;
}

SYSCTL_SETUP(sysctl_sched_m2_setup, "sysctl sched setup")
{
	const struct sysctlnode *node = NULL;

	sysctl_createv(clog, 0, NULL, &node,
	    CTLFLAG_PERMANENT,
	    CTLTYPE_NODE, "sched",
	    SYSCTL_DESCR("Scheduler options"),
	    NULL, 0, NULL, 0,
	    CTL_KERN, CTL_CREATE, CTL_EOL);

	if (node == NULL)
		return;

	sysctl_createv(NULL, 0, &node, NULL,
	    CTLFLAG_PERMANENT,
	    CTLTYPE_STRING, "name", NULL,
	    NULL, 0, __UNCONST("M2"), 0,
	    CTL_CREATE, CTL_EOL);
	sysctl_createv(NULL, 0, &node, NULL,
	    CTLFLAG_PERMANENT,
	    CTLTYPE_INT, "rtts",
	    SYSCTL_DESCR("Round-robin time quantum (in milliseconds)"),
	    sysctl_sched_rtts, 0, NULL, 0,
	    CTL_CREATE, CTL_EOL);
	sysctl_createv(NULL, 0, &node, NULL,
	    CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
	    CTLTYPE_INT, "maxts",
	    SYSCTL_DESCR("Maximal time quantum (in milliseconds)"),
	    sysctl_sched_maxts, 0, &max_ts, 0,
	    CTL_CREATE, CTL_EOL);
	sysctl_createv(NULL, 0, &node, NULL,
	    CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
	    CTLTYPE_INT, "mints",
	    SYSCTL_DESCR("Minimal time quantum (in milliseconds)"),
	    sysctl_sched_mints, 0, &min_ts, 0,
	    CTL_CREATE, CTL_EOL);
}
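
/*
 * Userland sketch (an illustration, not part of this scheduler): the
 * same nodes can also be read programmatically via sysctlbyname(3),
 * e.g.:
 *
 *	int ms;
 *	size_t len = sizeof(ms);
 *	if (sysctlbyname("kern.sched.mints", &ms, &len, NULL, 0) == 0)
 *		printf("minimal time quantum: %d ms\n", ms);
 */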