1 1.40 christos /* $NetBSD: sched_m2.c,v 1.40 2024/01/24 16:11:48 christos Exp $ */ 2 1.1 rmind 3 1.1 rmind /* 4 1.15 rmind * Copyright (c) 2007, 2008 Mindaugas Rasiukevicius <rmind at NetBSD org> 5 1.12 rmind * All rights reserved. 6 1.1 rmind * 7 1.1 rmind * Redistribution and use in source and binary forms, with or without 8 1.1 rmind * modification, are permitted provided that the following conditions 9 1.1 rmind * are met: 10 1.1 rmind * 1. Redistributions of source code must retain the above copyright 11 1.1 rmind * notice, this list of conditions and the following disclaimer. 12 1.1 rmind * 2. Redistributions in binary form must reproduce the above copyright 13 1.1 rmind * notice, this list of conditions and the following disclaimer in the 14 1.1 rmind * documentation and/or other materials provided with the distribution. 15 1.1 rmind * 16 1.19 rmind * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 17 1.19 rmind * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 1.19 rmind * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 19 1.19 rmind * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 20 1.19 rmind * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 21 1.19 rmind * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 22 1.19 rmind * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 23 1.19 rmind * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 24 1.19 rmind * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 25 1.19 rmind * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 26 1.19 rmind * SUCH DAMAGE. 27 1.1 rmind */ 28 1.1 rmind 29 1.1 rmind /* 30 1.1 rmind * TODO: 31 1.1 rmind * - Implementation of fair share queue; 32 1.1 rmind * - Support for NUMA; 33 1.1 rmind */ 34 1.1 rmind 35 1.1 rmind #include <sys/cdefs.h> 36 1.40 christos __KERNEL_RCSID(0, "$NetBSD: sched_m2.c,v 1.40 2024/01/24 16:11:48 christos Exp $"); 37 1.1 rmind 38 1.1 rmind #include <sys/param.h> 39 1.1 rmind 40 1.1 rmind #include <sys/cpu.h> 41 1.1 rmind #include <sys/callout.h> 42 1.1 rmind #include <sys/errno.h> 43 1.1 rmind #include <sys/kernel.h> 44 1.1 rmind #include <sys/kmem.h> 45 1.1 rmind #include <sys/lwp.h> 46 1.1 rmind #include <sys/mutex.h> 47 1.1 rmind #include <sys/pool.h> 48 1.1 rmind #include <sys/proc.h> 49 1.15 rmind #include <sys/pset.h> 50 1.1 rmind #include <sys/resource.h> 51 1.1 rmind #include <sys/resourcevar.h> 52 1.1 rmind #include <sys/sched.h> 53 1.1 rmind #include <sys/syscallargs.h> 54 1.1 rmind #include <sys/sysctl.h> 55 1.1 rmind #include <sys/types.h> 56 1.1 rmind 57 1.1 rmind /* 58 1.32 maxv * Priority related definitions. 59 1.1 rmind */ 60 1.10 ad #define PRI_TS_COUNT (NPRI_USER) 61 1.10 ad #define PRI_RT_COUNT (PRI_COUNT - PRI_TS_COUNT) 62 1.10 ad #define PRI_HTS_RANGE (PRI_TS_COUNT / 10) 63 1.10 ad 64 1.11 rmind #define PRI_HIGHEST_TS (MAXPRI_USER) 65 1.10 ad 66 1.1 rmind /* 67 1.1 rmind * Time-slices and priorities. 68 1.1 rmind */ 69 1.1 rmind static u_int min_ts; /* Minimal time-slice */ 70 1.1 rmind static u_int max_ts; /* Maximal time-slice */ 71 1.1 rmind static u_int ts_map[PRI_COUNT]; /* Map of time-slices */ 72 1.1 rmind static pri_t high_pri[PRI_COUNT]; /* Map for priority increase */ 73 1.39 ad u_int sched_rrticks; /* Real-time time-slice */ 74 1.1 rmind 75 1.25 rmind static void sched_precalcts(void); 76 1.25 rmind 77 1.1 rmind /* 78 1.1 rmind * Initialization and setup. 79 1.1 rmind */ 80 1.1 rmind 81 1.1 rmind void 82 1.1 rmind sched_rqinit(void) 83 1.1 rmind { 84 1.1 rmind if (hz < 100) { 85 1.1 rmind panic("sched_rqinit: value of HZ is too low\n"); 86 1.1 rmind } 87 1.1 rmind 88 1.1 rmind /* Default timing ranges */ 89 1.26 rmind min_ts = mstohz(20); /* ~20 ms */ 90 1.26 rmind max_ts = mstohz(150); /* ~150 ms */ 91 1.39 ad sched_rrticks = mstohz(100); /* ~100 ms */ 92 1.1 rmind sched_precalcts(); 93 1.1 rmind 94 1.40 christos #ifdef notyet 95 1.30 christos /* Need to set the name etc. This does not belong here */ 96 1.1 rmind /* Attach the primary CPU here */ 97 1.30 christos sched_cpuattach(curcpu()); 98 1.30 christos #endif 99 1.1 rmind 100 1.10 ad sched_lwp_fork(NULL, &lwp0); 101 1.40 christos #ifdef notyet 102 1.40 christos /* without attaching the primary CPU l_mutex does not get initialized */ 103 1.40 christos lwp_lock(&lwp0); 104 1.1 rmind sched_newts(&lwp0); 105 1.40 christos lwp_unlock(&lwp0); 106 1.40 christos #else 107 1.40 christos /* gross */ 108 1.40 christos lwp0.l_sched.timeslice = ts_map[lwp0.l_auxprio]; 109 1.40 christos #endif 110 1.1 rmind } 111 1.1 rmind 112 1.1 rmind /* Pre-calculate the time-slices for the priorities */ 113 1.1 rmind static void 114 1.1 rmind sched_precalcts(void) 115 1.1 rmind { 116 1.1 rmind pri_t p; 117 1.1 rmind 118 1.10 ad /* Time-sharing range */ 119 1.10 ad for (p = 0; p <= PRI_HIGHEST_TS; p++) { 120 1.10 ad ts_map[p] = max_ts - 121 1.10 ad (p * 100 / (PRI_TS_COUNT - 1) * (max_ts - min_ts) / 100); 122 1.10 ad high_pri[p] = (PRI_HIGHEST_TS - PRI_HTS_RANGE) + 123 1.10 ad ((p * PRI_HTS_RANGE) / (PRI_TS_COUNT - 1)); 124 1.10 ad } 125 1.10 ad 126 1.10 ad /* Real-time range */ 127 1.10 ad for (p = (PRI_HIGHEST_TS + 1); p < PRI_COUNT; p++) { 128 1.39 ad ts_map[p] = sched_rrticks; 129 1.1 rmind high_pri[p] = p; 130 1.1 rmind } 131 1.1 rmind } 132 1.1 rmind 133 1.1 rmind /* 134 1.1 rmind * Hooks. 135 1.1 rmind */ 136 1.1 rmind 137 1.1 rmind void 138 1.1 rmind sched_proc_fork(struct proc *parent, struct proc *child) 139 1.1 rmind { 140 1.1 rmind struct lwp *l; 141 1.1 rmind 142 1.1 rmind LIST_FOREACH(l, &child->p_lwps, l_sibling) { 143 1.1 rmind lwp_lock(l); 144 1.1 rmind sched_newts(l); 145 1.1 rmind lwp_unlock(l); 146 1.1 rmind } 147 1.1 rmind } 148 1.1 rmind 149 1.1 rmind void 150 1.1 rmind sched_proc_exit(struct proc *child, struct proc *parent) 151 1.1 rmind { 152 1.1 rmind 153 1.1 rmind } 154 1.1 rmind 155 1.1 rmind void 156 1.10 ad sched_lwp_fork(struct lwp *l1, struct lwp *l2) 157 1.1 rmind { 158 1.1 rmind 159 1.1 rmind } 160 1.1 rmind 161 1.1 rmind void 162 1.10 ad sched_lwp_collect(struct lwp *l) 163 1.10 ad { 164 1.10 ad 165 1.10 ad } 166 1.10 ad 167 1.10 ad void 168 1.1 rmind sched_setrunnable(struct lwp *l) 169 1.1 rmind { 170 1.1 rmind 171 1.1 rmind } 172 1.1 rmind 173 1.1 rmind void 174 1.1 rmind sched_schedclock(struct lwp *l) 175 1.1 rmind { 176 1.1 rmind 177 1.1 rmind } 178 1.1 rmind 179 1.1 rmind /* 180 1.1 rmind * Priorities and time-slice. 181 1.1 rmind */ 182 1.1 rmind 183 1.1 rmind void 184 1.1 rmind sched_nice(struct proc *p, int prio) 185 1.1 rmind { 186 1.27 rmind struct lwp *l; 187 1.27 rmind int n; 188 1.27 rmind 189 1.27 rmind KASSERT(mutex_owned(p->p_lock)); 190 1.1 rmind 191 1.27 rmind p->p_nice = prio; 192 1.27 rmind n = (prio - NZERO) >> 2; 193 1.27 rmind if (n == 0) 194 1.27 rmind return; 195 1.27 rmind 196 1.27 rmind LIST_FOREACH(l, &p->p_lwps, l_sibling) { 197 1.27 rmind lwp_lock(l); 198 1.27 rmind if (l->l_class == SCHED_OTHER) { 199 1.27 rmind pri_t pri = l->l_priority - n; 200 1.33 riastrad pri = (n < 0) ? uimin(pri, PRI_HIGHEST_TS) : imax(pri, 0); 201 1.27 rmind lwp_changepri(l, pri); 202 1.27 rmind } 203 1.27 rmind lwp_unlock(l); 204 1.27 rmind } 205 1.1 rmind } 206 1.1 rmind 207 1.1 rmind /* Recalculate the time-slice */ 208 1.24 ad void 209 1.1 rmind sched_newts(struct lwp *l) 210 1.1 rmind { 211 1.1 rmind 212 1.26 rmind l->l_sched.timeslice = ts_map[lwp_eprio(l)]; 213 1.1 rmind } 214 1.1 rmind 215 1.1 rmind void 216 1.1 rmind sched_slept(struct lwp *l) 217 1.1 rmind { 218 1.1 rmind 219 1.1 rmind /* 220 1.10 ad * If thread is in time-sharing queue and batch flag is not marked, 221 1.29 mbalmer * increase the priority, and run with the lower time-quantum. 222 1.1 rmind */ 223 1.25 rmind if (l->l_priority < PRI_HIGHEST_TS && (l->l_flag & LW_BATCH) == 0) { 224 1.27 rmind struct proc *p = l->l_proc; 225 1.27 rmind 226 1.10 ad KASSERT(l->l_class == SCHED_OTHER); 227 1.27 rmind if (__predict_false(p->p_nice < NZERO)) { 228 1.33 riastrad const int n = uimax((NZERO - p->p_nice) >> 2, 1); 229 1.33 riastrad l->l_priority = uimin(l->l_priority + n, PRI_HIGHEST_TS); 230 1.27 rmind } else { 231 1.27 rmind l->l_priority++; 232 1.27 rmind } 233 1.10 ad } 234 1.1 rmind } 235 1.1 rmind 236 1.1 rmind void 237 1.1 rmind sched_wakeup(struct lwp *l) 238 1.1 rmind { 239 1.1 rmind 240 1.1 rmind /* If thread was sleeping a second or more - set a high priority */ 241 1.25 rmind if (l->l_slptime >= 1) 242 1.10 ad l->l_priority = high_pri[l->l_priority]; 243 1.1 rmind } 244 1.1 rmind 245 1.1 rmind void 246 1.25 rmind sched_pstats_hook(struct lwp *l, int batch) 247 1.1 rmind { 248 1.11 rmind pri_t prio; 249 1.1 rmind 250 1.22 rmind /* 251 1.22 rmind * Estimate threads on time-sharing queue only, however, 252 1.22 rmind * exclude the highest priority for performance purposes. 253 1.22 rmind */ 254 1.26 rmind KASSERT(lwp_locked(l, NULL)); 255 1.10 ad if (l->l_priority >= PRI_HIGHEST_TS) 256 1.1 rmind return; 257 1.16 rmind KASSERT(l->l_class == SCHED_OTHER); 258 1.1 rmind 259 1.10 ad /* If it is CPU-bound not a first time - decrease the priority */ 260 1.11 rmind prio = l->l_priority; 261 1.11 rmind if (batch && prio != 0) 262 1.11 rmind prio--; 263 1.10 ad 264 1.1 rmind /* If thread was not ran a second or more - set a high priority */ 265 1.11 rmind if (l->l_stat == LSRUN) { 266 1.38 maxv if (l->l_rticks && (getticks() - l->l_rticks >= hz)) 267 1.11 rmind prio = high_pri[prio]; 268 1.11 rmind /* Re-enqueue the thread if priority has changed */ 269 1.11 rmind if (prio != l->l_priority) 270 1.11 rmind lwp_changepri(l, prio); 271 1.11 rmind } else { 272 1.11 rmind /* In other states, change the priority directly */ 273 1.11 rmind l->l_priority = prio; 274 1.11 rmind } 275 1.1 rmind } 276 1.1 rmind 277 1.24 ad void 278 1.24 ad sched_oncpu(lwp_t *l) 279 1.16 rmind { 280 1.26 rmind struct schedstate_percpu *spc = &l->l_cpu->ci_schedstate; 281 1.1 rmind 282 1.1 rmind /* Update the counters */ 283 1.26 rmind KASSERT(l->l_sched.timeslice >= min_ts); 284 1.26 rmind KASSERT(l->l_sched.timeslice <= max_ts); 285 1.26 rmind spc->spc_ticks = l->l_sched.timeslice; 286 1.1 rmind } 287 1.1 rmind 288 1.1 rmind /* 289 1.1 rmind * Time-driven events. 290 1.1 rmind */ 291 1.1 rmind 292 1.1 rmind /* 293 1.34 ad * Called once per time-quantum, with the running LWP lock held (spc_lwplock). 294 1.1 rmind */ 295 1.1 rmind void 296 1.1 rmind sched_tick(struct cpu_info *ci) 297 1.1 rmind { 298 1.1 rmind struct schedstate_percpu *spc = &ci->ci_schedstate; 299 1.36 ad struct lwp *l = ci->ci_onproc; 300 1.27 rmind struct proc *p; 301 1.1 rmind 302 1.26 rmind if (__predict_false(CURCPU_IDLE_P())) 303 1.2 rmind return; 304 1.1 rmind 305 1.34 ad lwp_lock(l); 306 1.34 ad KASSERT(l->l_mutex != spc->spc_mutex); 307 1.10 ad switch (l->l_class) { 308 1.2 rmind case SCHED_FIFO: 309 1.2 rmind /* 310 1.2 rmind * Update the time-quantum, and continue running, 311 1.2 rmind * if thread runs on FIFO real-time policy. 312 1.2 rmind */ 313 1.16 rmind KASSERT(l->l_priority > PRI_HIGHEST_TS); 314 1.26 rmind spc->spc_ticks = l->l_sched.timeslice; 315 1.34 ad lwp_unlock(l); 316 1.1 rmind return; 317 1.2 rmind case SCHED_OTHER: 318 1.10 ad /* 319 1.10 ad * If thread is in time-sharing queue, decrease the priority, 320 1.10 ad * and run with a higher time-quantum. 321 1.10 ad */ 322 1.16 rmind KASSERT(l->l_priority <= PRI_HIGHEST_TS); 323 1.27 rmind if (l->l_priority == 0) 324 1.27 rmind break; 325 1.27 rmind 326 1.27 rmind p = l->l_proc; 327 1.27 rmind if (__predict_false(p->p_nice > NZERO)) { 328 1.33 riastrad const int n = uimax((p->p_nice - NZERO) >> 2, 1); 329 1.27 rmind l->l_priority = imax(l->l_priority - n, 0); 330 1.27 rmind } else 331 1.10 ad l->l_priority--; 332 1.2 rmind break; 333 1.1 rmind } 334 1.1 rmind 335 1.1 rmind /* 336 1.2 rmind * If there are higher priority threads or threads in the same queue, 337 1.2 rmind * mark that thread should yield, otherwise, continue running. 338 1.1 rmind */ 339 1.24 ad if (lwp_eprio(l) <= spc->spc_maxpriority || l->l_target_cpu) { 340 1.1 rmind spc->spc_flags |= SPCF_SHOULDYIELD; 341 1.37 ad spc_lock(ci); 342 1.37 ad sched_resched_cpu(ci, MAXPRI_KTHREAD, true); 343 1.37 ad /* spc now unlocked */ 344 1.1 rmind } else 345 1.34 ad spc->spc_ticks = l->l_sched.timeslice; 346 1.34 ad lwp_unlock(l); 347 1.1 rmind } 348 1.1 rmind 349 1.1 rmind /* 350 1.1 rmind * Sysctl nodes and initialization. 351 1.1 rmind */ 352 1.1 rmind 353 1.1 rmind static int 354 1.15 rmind sysctl_sched_rtts(SYSCTLFN_ARGS) 355 1.15 rmind { 356 1.15 rmind struct sysctlnode node; 357 1.39 ad int rttsms = hztoms(sched_rrticks); 358 1.15 rmind 359 1.15 rmind node = *rnode; 360 1.15 rmind node.sysctl_data = &rttsms; 361 1.15 rmind return sysctl_lookup(SYSCTLFN_CALL(&node)); 362 1.15 rmind } 363 1.15 rmind 364 1.15 rmind static int 365 1.1 rmind sysctl_sched_mints(SYSCTLFN_ARGS) 366 1.1 rmind { 367 1.1 rmind struct sysctlnode node; 368 1.1 rmind struct cpu_info *ci; 369 1.1 rmind int error, newsize; 370 1.1 rmind CPU_INFO_ITERATOR cii; 371 1.1 rmind 372 1.1 rmind node = *rnode; 373 1.1 rmind node.sysctl_data = &newsize; 374 1.1 rmind 375 1.1 rmind newsize = hztoms(min_ts); 376 1.1 rmind error = sysctl_lookup(SYSCTLFN_CALL(&node)); 377 1.1 rmind if (error || newp == NULL) 378 1.1 rmind return error; 379 1.1 rmind 380 1.8 rmind newsize = mstohz(newsize); 381 1.1 rmind if (newsize < 1 || newsize > hz || newsize >= max_ts) 382 1.1 rmind return EINVAL; 383 1.1 rmind 384 1.1 rmind /* It is safe to do this in such order */ 385 1.1 rmind for (CPU_INFO_FOREACH(cii, ci)) 386 1.1 rmind spc_lock(ci); 387 1.1 rmind 388 1.8 rmind min_ts = newsize; 389 1.1 rmind sched_precalcts(); 390 1.1 rmind 391 1.1 rmind for (CPU_INFO_FOREACH(cii, ci)) 392 1.1 rmind spc_unlock(ci); 393 1.1 rmind 394 1.1 rmind return 0; 395 1.1 rmind } 396 1.1 rmind 397 1.1 rmind static int 398 1.1 rmind sysctl_sched_maxts(SYSCTLFN_ARGS) 399 1.1 rmind { 400 1.1 rmind struct sysctlnode node; 401 1.1 rmind struct cpu_info *ci; 402 1.1 rmind int error, newsize; 403 1.1 rmind CPU_INFO_ITERATOR cii; 404 1.1 rmind 405 1.1 rmind node = *rnode; 406 1.1 rmind node.sysctl_data = &newsize; 407 1.1 rmind 408 1.1 rmind newsize = hztoms(max_ts); 409 1.1 rmind error = sysctl_lookup(SYSCTLFN_CALL(&node)); 410 1.1 rmind if (error || newp == NULL) 411 1.1 rmind return error; 412 1.1 rmind 413 1.8 rmind newsize = mstohz(newsize); 414 1.1 rmind if (newsize < 10 || newsize > hz || newsize <= min_ts) 415 1.1 rmind return EINVAL; 416 1.1 rmind 417 1.1 rmind /* It is safe to do this in such order */ 418 1.1 rmind for (CPU_INFO_FOREACH(cii, ci)) 419 1.1 rmind spc_lock(ci); 420 1.1 rmind 421 1.8 rmind max_ts = newsize; 422 1.1 rmind sched_precalcts(); 423 1.1 rmind 424 1.1 rmind for (CPU_INFO_FOREACH(cii, ci)) 425 1.1 rmind spc_unlock(ci); 426 1.1 rmind 427 1.1 rmind return 0; 428 1.1 rmind } 429 1.1 rmind 430 1.24 ad SYSCTL_SETUP(sysctl_sched_m2_setup, "sysctl sched setup") 431 1.1 rmind { 432 1.1 rmind const struct sysctlnode *node = NULL; 433 1.1 rmind 434 1.1 rmind sysctl_createv(clog, 0, NULL, &node, 435 1.1 rmind CTLFLAG_PERMANENT, 436 1.1 rmind CTLTYPE_NODE, "sched", 437 1.1 rmind SYSCTL_DESCR("Scheduler options"), 438 1.1 rmind NULL, 0, NULL, 0, 439 1.1 rmind CTL_KERN, CTL_CREATE, CTL_EOL); 440 1.1 rmind 441 1.1 rmind if (node == NULL) 442 1.1 rmind return; 443 1.1 rmind 444 1.24 ad sysctl_createv(NULL, 0, &node, NULL, 445 1.1 rmind CTLFLAG_PERMANENT, 446 1.1 rmind CTLTYPE_STRING, "name", NULL, 447 1.1 rmind NULL, 0, __UNCONST("M2"), 0, 448 1.1 rmind CTL_CREATE, CTL_EOL); 449 1.24 ad sysctl_createv(NULL, 0, &node, NULL, 450 1.15 rmind CTLFLAG_PERMANENT, 451 1.15 rmind CTLTYPE_INT, "rtts", 452 1.32 maxv SYSCTL_DESCR("Round-robin time quantum (in milliseconds)"), 453 1.15 rmind sysctl_sched_rtts, 0, NULL, 0, 454 1.15 rmind CTL_CREATE, CTL_EOL); 455 1.24 ad sysctl_createv(NULL, 0, &node, NULL, 456 1.1 rmind CTLFLAG_PERMANENT | CTLFLAG_READWRITE, 457 1.1 rmind CTLTYPE_INT, "maxts", 458 1.32 maxv SYSCTL_DESCR("Maximal time quantum (in milliseconds)"), 459 1.1 rmind sysctl_sched_maxts, 0, &max_ts, 0, 460 1.1 rmind CTL_CREATE, CTL_EOL); 461 1.24 ad sysctl_createv(NULL, 0, &node, NULL, 462 1.1 rmind CTLFLAG_PERMANENT | CTLFLAG_READWRITE, 463 1.1 rmind CTLTYPE_INT, "mints", 464 1.32 maxv SYSCTL_DESCR("Minimal time quantum (in milliseconds)"), 465 1.1 rmind sysctl_sched_mints, 0, &min_ts, 0, 466 1.1 rmind CTL_CREATE, CTL_EOL); 467 1.1 rmind } 468