/*	$NetBSD: sched_m2.c,v 1.24 2008/04/12 17:02:08 ad Exp $	*/

/*
 * Copyright (c) 2007, 2008 Mindaugas Rasiukevicius <rmind at NetBSD org>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * TODO:
 *  - Implementation of fair share queue;
 *  - Support for NUMA;
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: sched_m2.c,v 1.24 2008/04/12 17:02:08 ad Exp $");

#include <sys/param.h>

#include <sys/bitops.h>
#include <sys/cpu.h>
#include <sys/callout.h>
#include <sys/errno.h>
#include <sys/kernel.h>
#include <sys/kmem.h>
#include <sys/lwp.h>
#include <sys/mutex.h>
#include <sys/pool.h>
#include <sys/proc.h>
#include <sys/pset.h>
#include <sys/resource.h>
#include <sys/resourcevar.h>
#include <sys/sched.h>
#include <sys/syscallargs.h>
#include <sys/sysctl.h>
#include <sys/types.h>
/*
 * Priority related definitions.
 */
#define	PRI_TS_COUNT	(NPRI_USER)
#define	PRI_RT_COUNT	(PRI_COUNT - PRI_TS_COUNT)
#define	PRI_HTS_RANGE	(PRI_TS_COUNT / 10)

#define	PRI_HIGHEST_TS	(MAXPRI_USER)
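
/*
 * Priorities up to and including PRI_HIGHEST_TS form the time-sharing
 * range; everything above it is handled as the real-time range.
 */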

/*
 * Time-slices and priorities.
 */
static u_int	min_ts;			/* Minimal time-slice */
static u_int	max_ts;			/* Maximal time-slice */
static u_int	rt_ts;			/* Real-time time-slice */
static u_int	ts_map[PRI_COUNT];	/* Map of time-slices */
static pri_t	high_pri[PRI_COUNT];	/* Map for priority increase */

typedef struct {
	u_int		sl_flags;
	u_int		sl_timeslice;	/* Time-slice of thread */
	u_int		sl_slept;	/* Saved sleep time for sleep sum */
	u_int		sl_slpsum;	/* Sum of sleep time */
	u_int		sl_lrtime;	/* Last run time */
} sched_info_lwp_t;

/* Flags */
#define	SL_BATCH	0x01

/* Pool of the scheduler-specific structures for threads */
static pool_cache_t	sil_pool;

/*
 * Prototypes.
 */

static void	sched_precalcts(void);

/*
 * Initialization and setup.
 */

void
sched_rqinit(void)
{
	struct cpu_info *ci = curcpu();

	if (hz < 100) {
		panic("sched_rqinit: value of HZ is too low\n");
	}

	/* Default timing ranges */
	min_ts = mstohz(50);			/* ~50ms */
	max_ts = mstohz(150);			/* ~150ms */
	rt_ts = mstohz(100);			/* ~100ms */
	sched_precalcts();

	/* Pool of the scheduler-specific structures */
	sil_pool = pool_cache_init(sizeof(sched_info_lwp_t), coherency_unit,
	    0, 0, "lwpsd", NULL, IPL_NONE, NULL, NULL, NULL);

	/* Attach the primary CPU here */
	sched_cpuattach(ci);

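	/* Initialize the scheduler-specific state of lwp0 */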
	sched_lwp_fork(NULL, &lwp0);
	sched_newts(&lwp0);
}

/* Pre-calculate the time-slices for the priorities */
static void
sched_precalcts(void)
{
	pri_t p;

	/* Time-sharing range */
	for (p = 0; p <= PRI_HIGHEST_TS; p++) {
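		/*
		 * The time-slice shrinks linearly from max_ts at priority
		 * zero down to min_ts at the highest time-sharing priority;
		 * high_pri[] gives the boosted priority within the top
		 * PRI_HTS_RANGE band.
		 */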
		ts_map[p] = max_ts -
		    (p * 100 / (PRI_TS_COUNT - 1) * (max_ts - min_ts) / 100);
		high_pri[p] = (PRI_HIGHEST_TS - PRI_HTS_RANGE) +
		    ((p * PRI_HTS_RANGE) / (PRI_TS_COUNT - 1));
	}

	/* Real-time range */
	for (p = (PRI_HIGHEST_TS + 1); p < PRI_COUNT; p++) {
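		/* Fixed time-slice; real-time priorities are never boosted */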
		ts_map[p] = rt_ts;
		high_pri[p] = p;
	}
}

/*
 * Hooks.
 */

void
sched_proc_fork(struct proc *parent, struct proc *child)
{
	struct lwp *l;

	LIST_FOREACH(l, &child->p_lwps, l_sibling) {
		lwp_lock(l);
		sched_newts(l);
		lwp_unlock(l);
	}
}

void
sched_proc_exit(struct proc *child, struct proc *parent)
{

	/* Dummy */
}

void
sched_lwp_fork(struct lwp *l1, struct lwp *l2)
{

	KASSERT(l2->l_sched_info == NULL);
	l2->l_sched_info = pool_cache_get(sil_pool, PR_WAITOK);
	memset(l2->l_sched_info, 0, sizeof(sched_info_lwp_t));
}

void
sched_lwp_exit(struct lwp *l)
{

	KASSERT(l->l_sched_info != NULL);
	pool_cache_put(sil_pool, l->l_sched_info);
	l->l_sched_info = NULL;
}

void
sched_lwp_collect(struct lwp *l)
{

}

void
sched_setrunnable(struct lwp *l)
{

	/* Dummy */
}

void
sched_schedclock(struct lwp *l)
{

	/* Dummy */
}

/*
 * Priorities and time-slice.
 */

void
sched_nice(struct proc *p, int prio)
{

	/* TODO: implement as SCHED_IA */
}

/* Recalculate the time-slice */
void
sched_newts(struct lwp *l)
{
	sched_info_lwp_t *sil = l->l_sched_info;

	sil->sl_timeslice = ts_map[lwp_eprio(l)];
}

void
sched_slept(struct lwp *l)
{
	sched_info_lwp_t *sil = l->l_sched_info;

	/* Record the time at which the thread went to sleep */
	sil->sl_slept = hardclock_ticks;

	/*
	 * If the thread is in the time-sharing queue and the batch flag
	 * is not set, increase its priority, so that it runs with a lower
	 * time-quantum.
	 */
	if (l->l_priority < PRI_HIGHEST_TS &&
	    (sil->sl_flags & SL_BATCH) == 0) {
		KASSERT(l->l_class == SCHED_OTHER);
		l->l_priority++;
	}
}

void
sched_wakeup(struct lwp *l)
{
	sched_info_lwp_t *sil = l->l_sched_info;
	const u_int slptime = hardclock_ticks - sil->sl_slept;

	/*
	 * Update the sleep time sum: if l_slptime is still zero, credit
	 * the measured number of ticks, otherwise cap the contribution
	 * at one second (hz ticks).
	 */
	sil->sl_slpsum += (l->l_slptime == 0) ? slptime : hz;

	/* If the thread slept for a second or more, boost its priority */
	if (l->l_slptime > 1 || slptime >= hz)
		l->l_priority = high_pri[l->l_priority];

	/* Also, consider looking for a better CPU to wake up on */
	l->l_cpu = sched_takecpu(l);
}

void
sched_pstats_hook(struct lwp *l)
{
	sched_info_lwp_t *sil = l->l_sched_info;
	pri_t prio;
	bool batch;

	if (l->l_stat == LSSLEEP || l->l_stat == LSSTOP ||
	    l->l_stat == LSSUSPENDED)
		l->l_slptime++;

	/*
	 * Consider the thread CPU-bound if its run time sum exceeds its
	 * sleep time sum.  It is treated as a batch thread only once it
	 * has been CPU-bound for more than one interval in a row.
	 */
	batch = (l->l_rticksum > sil->sl_slpsum);
	if (batch) {
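		/* First time CPU-bound: set the flag, but not batch yet */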
		if ((sil->sl_flags & SL_BATCH) == 0)
			batch = false;
		sil->sl_flags |= SL_BATCH;
	} else
		sil->sl_flags &= ~SL_BATCH;

	/*
	 * A CPU-bound thread which never sleeps would occupy the CPU
	 * indefinitely.  In that case reset the saved sleep time and
	 * check it again later: if it is still zero, look for another
	 * CPU for the thread and clear the batch flag.
	 */
	if (batch && (l->l_slptime + sil->sl_slpsum) == 0) {
		if (l->l_stat != LSONPROC && sil->sl_slept == 0) {
			struct cpu_info *ci = sched_takecpu(l);

			if (l->l_cpu != ci)
				l->l_target_cpu = ci;
			sil->sl_flags &= ~SL_BATCH;
		} else {
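			/* Reset and re-check on the next pstats pass */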
			sil->sl_slept = 0;
		}
	}

	/* Reset the time sums */
	sil->sl_slpsum = 0;
	l->l_rticksum = 0;

	/*
	 * Re-estimate only threads in the time-sharing range, and for
	 * performance reasons skip those already at the highest
	 * time-sharing priority.
	 */
	if (l->l_priority >= PRI_HIGHEST_TS)
		return;
	KASSERT(l->l_class == SCHED_OTHER);

	/* If the thread has been CPU-bound repeatedly, decrease its priority */
	prio = l->l_priority;
	if (batch && prio != 0)
		prio--;

	/* If the thread has not run for a second or more, boost its priority */
	if (l->l_stat == LSRUN) {
		if (sil->sl_lrtime && (hardclock_ticks - sil->sl_lrtime >= hz))
			prio = high_pri[prio];
		/* Re-enqueue the thread if the priority has changed */
		if (prio != l->l_priority)
			lwp_changepri(l, prio);
	} else {
		/* In other states, change the priority directly */
		l->l_priority = prio;
	}
}

void
sched_oncpu(lwp_t *l)
{
	sched_info_lwp_t *sil = l->l_sched_info;

	/* Update the counters */
	KASSERT(sil->sl_timeslice >= min_ts);
	KASSERT(sil->sl_timeslice <= max_ts);
	l->l_cpu->ci_schedstate.spc_ticks = sil->sl_timeslice;
}

/*
 * Time-driven events.
 */

/*
 * Called once per time-quantum.  This routine is CPU-local and runs at
 * IPL_SCHED, thus no locking is needed.
 */
void
sched_tick(struct cpu_info *ci)
{
	struct schedstate_percpu *spc = &ci->ci_schedstate;
	struct lwp *l = curlwp;
	const sched_info_lwp_t *sil = l->l_sched_info;

	if (CURCPU_IDLE_P())
		return;

	switch (l->l_class) {
	case SCHED_FIFO:
		/*
		 * FIFO real-time threads are not preempted on quantum
		 * expiry; just re-arm the time-quantum and keep running.
		 */
		KASSERT(l->l_priority > PRI_HIGHEST_TS);
		spc->spc_ticks = sil->sl_timeslice;
		return;
	case SCHED_OTHER:
		/*
		 * A time-sharing thread gets its priority decreased, so
		 * that it runs with a larger time-quantum next time.
		 */
		KASSERT(l->l_priority <= PRI_HIGHEST_TS);
		if (l->l_priority != 0)
			l->l_priority--;
		break;
	}
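
	/*
	 * Note: SCHED_RR threads skip the switch above and proceed
	 * straight to the yield check below.
	 */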

	/*
	 * If there are threads of higher or equal priority waiting on the
	 * run queue (or the thread is due to migrate to another CPU), mark
	 * that it should yield; otherwise, continue running with a new
	 * time-quantum.
	 */
	if (lwp_eprio(l) <= spc->spc_maxpriority || l->l_target_cpu) {
		spc->spc_flags |= SPCF_SHOULDYIELD;
		cpu_need_resched(ci, 0);
	} else
		spc->spc_ticks = sil->sl_timeslice;
}

/*
 * Sysctl nodes and initialization.
 */

static int
sysctl_sched_rtts(SYSCTLFN_ARGS)
{
	struct sysctlnode node;
	int rttsms = hztoms(rt_ts);

	node = *rnode;
	node.sysctl_data = &rttsms;
	return sysctl_lookup(SYSCTLFN_CALL(&node));
}

static int
sysctl_sched_mints(SYSCTLFN_ARGS)
{
	struct sysctlnode node;
	struct cpu_info *ci;
	int error, newsize;
	CPU_INFO_ITERATOR cii;

	node = *rnode;
	node.sysctl_data = &newsize;

	newsize = hztoms(min_ts);
	error = sysctl_lookup(SYSCTLFN_CALL(&node));
	if (error || newp == NULL)
		return error;

	newsize = mstohz(newsize);
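	/* The new value must be between one tick and hz, and below max_ts */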
	if (newsize < 1 || newsize > hz || newsize >= max_ts)
		return EINVAL;

	/* Lock all run queues while the time-slice map is recomputed */
	for (CPU_INFO_FOREACH(cii, ci))
		spc_lock(ci);

	min_ts = newsize;
	sched_precalcts();

	for (CPU_INFO_FOREACH(cii, ci))
		spc_unlock(ci);

	return 0;
}

static int
sysctl_sched_maxts(SYSCTLFN_ARGS)
{
	struct sysctlnode node;
	struct cpu_info *ci;
	int error, newsize;
	CPU_INFO_ITERATOR cii;

	node = *rnode;
	node.sysctl_data = &newsize;

	newsize = hztoms(max_ts);
	error = sysctl_lookup(SYSCTLFN_CALL(&node));
	if (error || newp == NULL)
		return error;

	newsize = mstohz(newsize);
	if (newsize < 10 || newsize > hz || newsize <= min_ts)
		return EINVAL;

	/* Lock all run queues while the time-slice map is recomputed */
	for (CPU_INFO_FOREACH(cii, ci))
		spc_lock(ci);

	max_ts = newsize;
	sched_precalcts();

	for (CPU_INFO_FOREACH(cii, ci))
		spc_unlock(ci);

	return 0;
}

SYSCTL_SETUP(sysctl_sched_m2_setup, "sysctl sched setup")
{
	const struct sysctlnode *node = NULL;

	sysctl_createv(clog, 0, NULL, NULL,
	    CTLFLAG_PERMANENT,
	    CTLTYPE_NODE, "kern", NULL,
	    NULL, 0, NULL, 0,
	    CTL_KERN, CTL_EOL);
	sysctl_createv(clog, 0, NULL, &node,
	    CTLFLAG_PERMANENT,
	    CTLTYPE_NODE, "sched",
	    SYSCTL_DESCR("Scheduler options"),
	    NULL, 0, NULL, 0,
	    CTL_KERN, CTL_CREATE, CTL_EOL);

	if (node == NULL)
		return;

	sysctl_createv(NULL, 0, &node, NULL,
	    CTLFLAG_PERMANENT,
	    CTLTYPE_STRING, "name", NULL,
	    NULL, 0, __UNCONST("M2"), 0,
	    CTL_CREATE, CTL_EOL);
	sysctl_createv(NULL, 0, &node, NULL,
	    CTLFLAG_PERMANENT,
	    CTLTYPE_INT, "rtts",
	    SYSCTL_DESCR("Round-robin time quantum (in milliseconds)"),
	    sysctl_sched_rtts, 0, NULL, 0,
	    CTL_CREATE, CTL_EOL);
	sysctl_createv(NULL, 0, &node, NULL,
	    CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
	    CTLTYPE_INT, "maxts",
	    SYSCTL_DESCR("Maximal time quantum (in milliseconds)"),
	    sysctl_sched_maxts, 0, &max_ts, 0,
	    CTL_CREATE, CTL_EOL);
	sysctl_createv(NULL, 0, &node, NULL,
	    CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
	    CTLTYPE_INT, "mints",
	    SYSCTL_DESCR("Minimal time quantum (in milliseconds)"),
	    sysctl_sched_mints, 0, &min_ts, 0,
	    CTL_CREATE, CTL_EOL);
}
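
/*
 * Example (from userland, assuming the knobs appear under kern.sched as
 * created above):
 *
 *	sysctl kern.sched.name		# scheduler name, "M2"
 *	sysctl -w kern.sched.maxts=200	# set the maximal quantum to 200 ms
 */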