/*	$NetBSD: linux_sched.c,v 1.71.2.1 2020/04/13 08:04:15 martin Exp $	*/

/*-
 * Copyright (c) 1999, 2019 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
 * NASA Ames Research Center; by Matthias Scheler.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Linux compatibility module.  Try to deal with scheduler related syscalls.
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: linux_sched.c,v 1.71.2.1 2020/04/13 08:04:15 martin Exp $");

#include <sys/param.h>
#include <sys/mount.h>
#include <sys/proc.h>
#include <sys/systm.h>
#include <sys/sysctl.h>
#include <sys/syscallargs.h>
#include <sys/wait.h>
#include <sys/kauth.h>
#include <sys/ptrace.h>
#include <sys/atomic.h>

#include <sys/cpu.h>

#include <compat/linux/common/linux_types.h>
#include <compat/linux/common/linux_signal.h>
#include <compat/linux/common/linux_emuldata.h>
#include <compat/linux/common/linux_ipc.h>
#include <compat/linux/common/linux_sem.h>
#include <compat/linux/common/linux_exec.h>
#include <compat/linux/common/linux_machdep.h>

#include <compat/linux/linux_syscallargs.h>

#include <compat/linux/common/linux_sched.h>

static int linux_clone_nptl(struct lwp *, const struct linux_sys_clone_args *,
    register_t *);

/* Unlike Linux, dynamically calculate CPU mask size */
#define	LINUX_CPU_MASK_SIZE (sizeof(long) * ((ncpu + LONG_BIT - 1) / LONG_BIT))
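/*
 * Worked example (illustrative, not from the original source): on a
 * 64-bit port where sizeof(long) == 8 and LONG_BIT == 64, ncpu == 4
 * gives (4 + 63) / 64 == 1 long, i.e. an 8-byte mask, and ncpu == 65
 * grows it to 2 longs (16 bytes).  Linux userland instead uses a
 * cpu_set_t with a fixed CPU_SETSIZE (1024) bits.
 */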

#if DEBUG_LINUX
#define DPRINTF(x) uprintf x
#else
#define DPRINTF(x)
#endif

static void
linux_child_return(void *arg)
{
	struct lwp *l = arg;
	struct proc *p = l->l_proc;
	struct linux_emuldata *led = l->l_emuldata;
	void *ctp = led->led_child_tidptr;
	int error;

	if (ctp) {
		if ((error = copyout(&p->p_pid, ctp, sizeof(p->p_pid))) != 0)
			printf("%s: LINUX_CLONE_CHILD_SETTID "
			    "failed (child_tidptr = %p, tid = %d error=%d)\n",
			    __func__, ctp, p->p_pid, error);
	}
	child_return(arg);
}

int
linux_sys_clone(struct lwp *l, const struct linux_sys_clone_args *uap,
    register_t *retval)
{
	/* {
		syscallarg(int) flags;
		syscallarg(void *) stack;
		syscallarg(void *) parent_tidptr;
		syscallarg(void *) tls;
		syscallarg(void *) child_tidptr;
	} */
	struct linux_emuldata *led;
	int flags, sig, error;

	/*
	 * We don't support the Linux CLONE_PID or CLONE_PTRACE flags.
	 */
	if (SCARG(uap, flags) & (LINUX_CLONE_PID|LINUX_CLONE_PTRACE))
		return EINVAL;

	/*
	 * A thread group implies shared signals, and shared signals
	 * imply a shared VM.  This matches what the Linux kernel does.
	 */
	if (SCARG(uap, flags) & LINUX_CLONE_THREAD
	    && (SCARG(uap, flags) & LINUX_CLONE_SIGHAND) == 0)
		return EINVAL;
	if (SCARG(uap, flags) & LINUX_CLONE_SIGHAND
	    && (SCARG(uap, flags) & LINUX_CLONE_VM) == 0)
		return EINVAL;

	/*
	 * The thread group flavor is implemented totally differently.
	 */
	if (SCARG(uap, flags) & LINUX_CLONE_THREAD)
		return linux_clone_nptl(l, uap, retval);

	flags = 0;
	if (SCARG(uap, flags) & LINUX_CLONE_VM)
		flags |= FORK_SHAREVM;
	if (SCARG(uap, flags) & LINUX_CLONE_FS)
		flags |= FORK_SHARECWD;
	if (SCARG(uap, flags) & LINUX_CLONE_FILES)
		flags |= FORK_SHAREFILES;
	if (SCARG(uap, flags) & LINUX_CLONE_SIGHAND)
		flags |= FORK_SHARESIGS;
	if (SCARG(uap, flags) & LINUX_CLONE_VFORK)
		flags |= FORK_PPWAIT;

	sig = SCARG(uap, flags) & LINUX_CLONE_CSIGNAL;
	if (sig < 0 || sig >= LINUX__NSIG)
		return EINVAL;
	sig = linux_to_native_signo[sig];

	if (SCARG(uap, flags) & LINUX_CLONE_CHILD_SETTID) {
		led = l->l_emuldata;
		led->led_child_tidptr = SCARG(uap, child_tidptr);
	}

	/*
	 * Note that Linux does not provide a portable way of specifying
	 * the stack area; the caller must know whether the stack grows up
	 * or down.  We therefore pass a stack size of 0 so that the code
	 * that makes this adjustment is a no-op.
	 */
	if ((error = fork1(l, flags, sig, SCARG(uap, stack), 0,
	    linux_child_return, NULL, retval)) != 0) {
		DPRINTF(("%s: fork1: error %d\n", __func__, error));
		return error;
	}

	return 0;
}

static int
linux_clone_nptl(struct lwp *l, const struct linux_sys_clone_args *uap, register_t *retval)
{
	/* {
		syscallarg(int) flags;
		syscallarg(void *) stack;
		syscallarg(void *) parent_tidptr;
		syscallarg(void *) tls;
		syscallarg(void *) child_tidptr;
	} */
	struct proc *p;
	struct lwp *l2;
	struct linux_emuldata *led;
	void *parent_tidptr, *tls, *child_tidptr;
	vaddr_t uaddr;
	lwpid_t lid;
	int flags, tnprocs, error;

	p = l->l_proc;
	flags = SCARG(uap, flags);
	parent_tidptr = SCARG(uap, parent_tidptr);
	tls = SCARG(uap, tls);
	child_tidptr = SCARG(uap, child_tidptr);

	tnprocs = atomic_inc_uint_nv(&nprocs);
	if (__predict_false(tnprocs >= maxproc) ||
	    kauth_authorize_process(l->l_cred, KAUTH_PROCESS_FORK, p,
	    KAUTH_ARG(tnprocs), NULL, NULL) != 0) {
		atomic_dec_uint(&nprocs);
		return EAGAIN;
	}

	uaddr = uvm_uarea_alloc();
	if (__predict_false(uaddr == 0)) {
		atomic_dec_uint(&nprocs);
		return ENOMEM;
	}

	error = lwp_create(l, p, uaddr, LWP_DETACHED | LWP_PIDLID,
	    SCARG(uap, stack), 0, child_return, NULL, &l2, l->l_class,
	    &l->l_sigmask, &l->l_sigstk);
	if (__predict_false(error)) {
		DPRINTF(("%s: lwp_create error=%d\n", __func__, error));
		atomic_dec_uint(&nprocs);
		uvm_uarea_free(uaddr);
		return error;
	}
	lid = l2->l_lid;

	/* LINUX_CLONE_CHILD_CLEARTID: clear TID in child's memory on exit() */
	if (flags & LINUX_CLONE_CHILD_CLEARTID) {
		led = l2->l_emuldata;
		led->led_clear_tid = child_tidptr;
	}

	/* LINUX_CLONE_PARENT_SETTID: store child's TID in parent's memory */
	if (flags & LINUX_CLONE_PARENT_SETTID) {
		if ((error = copyout(&lid, parent_tidptr, sizeof(lid))) != 0)
			printf("%s: LINUX_CLONE_PARENT_SETTID "
			    "failed (parent_tidptr = %p tid = %d error=%d)\n",
			    __func__, parent_tidptr, lid, error);
	}

	/* LINUX_CLONE_CHILD_SETTID: store child's TID in child's memory */
	if (flags & LINUX_CLONE_CHILD_SETTID) {
		if ((error = copyout(&lid, child_tidptr, sizeof(lid))) != 0)
			printf("%s: LINUX_CLONE_CHILD_SETTID "
			    "failed (child_tidptr = %p, tid = %d error=%d)\n",
			    __func__, child_tidptr, lid, error);
	}

	if (flags & LINUX_CLONE_SETTLS) {
		error = LINUX_LWP_SETPRIVATE(l2, tls);
		if (error) {
			DPRINTF(("%s: LINUX_LWP_SETPRIVATE %d\n", __func__,
			    error));
			lwp_exit(l2);
			return error;
		}
	}

	/* Set the new LWP running. */
	lwp_start(l2, 0);

	retval[0] = lid;
	retval[1] = 0;
	return 0;
}

/*
 * Linux realtime priority
 *
 * - SCHED_RR and SCHED_FIFO tasks have priorities [1,99].
 *
 * - SCHED_OTHER tasks don't have realtime priorities.
 *   In particular, sched_param::sched_priority is always 0.
 */

#define	LINUX_SCHED_RTPRIO_MIN	1
#define	LINUX_SCHED_RTPRIO_MAX	99
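/*
 * Illustrative mapping (not from the original source): the conversion
 * functions below scale linearly between the two priority ranges.
 * Assuming the NetBSD values SCHED_PRI_MIN == 0 and SCHED_PRI_MAX == 63,
 * Linux priority 1 maps to 0, 99 maps to 63, and 50 maps to
 * (50 - 1) * 63 / 98 == 31 (integer division).
 */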

static int
sched_linux2native(int linux_policy, struct linux_sched_param *linux_params,
    int *native_policy, struct sched_param *native_params)
{

	switch (linux_policy) {
	case LINUX_SCHED_OTHER:
		if (native_policy != NULL) {
			*native_policy = SCHED_OTHER;
		}
		break;

	case LINUX_SCHED_FIFO:
		if (native_policy != NULL) {
			*native_policy = SCHED_FIFO;
		}
		break;

	case LINUX_SCHED_RR:
		if (native_policy != NULL) {
			*native_policy = SCHED_RR;
		}
		break;

	default:
		return EINVAL;
	}

	if (linux_params != NULL) {
		int prio = linux_params->sched_priority;

		KASSERT(native_params != NULL);

		if (linux_policy == LINUX_SCHED_OTHER) {
			if (prio != 0) {
				return EINVAL;
			}
			native_params->sched_priority = PRI_NONE; /* XXX */
		} else {
			if (prio < LINUX_SCHED_RTPRIO_MIN ||
			    prio > LINUX_SCHED_RTPRIO_MAX) {
				return EINVAL;
			}
			native_params->sched_priority =
			    (prio - LINUX_SCHED_RTPRIO_MIN)
			    * (SCHED_PRI_MAX - SCHED_PRI_MIN)
			    / (LINUX_SCHED_RTPRIO_MAX - LINUX_SCHED_RTPRIO_MIN)
			    + SCHED_PRI_MIN;
		}
	}

	return 0;
}

static int
sched_native2linux(int native_policy, struct sched_param *native_params,
    int *linux_policy, struct linux_sched_param *linux_params)
{

	switch (native_policy) {
	case SCHED_OTHER:
		if (linux_policy != NULL) {
			*linux_policy = LINUX_SCHED_OTHER;
		}
		break;

	case SCHED_FIFO:
		if (linux_policy != NULL) {
			*linux_policy = LINUX_SCHED_FIFO;
		}
		break;

	case SCHED_RR:
		if (linux_policy != NULL) {
			*linux_policy = LINUX_SCHED_RR;
		}
		break;

	default:
		panic("%s: unknown policy %d\n", __func__, native_policy);
	}

	if (native_params != NULL) {
		int prio = native_params->sched_priority;

		KASSERT(prio >= SCHED_PRI_MIN);
		KASSERT(prio <= SCHED_PRI_MAX);
		KASSERT(linux_params != NULL);

		DPRINTF(("%s: native: policy %d, priority %d\n",
		    __func__, native_policy, prio));

		if (native_policy == SCHED_OTHER) {
			linux_params->sched_priority = 0;
		} else {
			linux_params->sched_priority =
			    (prio - SCHED_PRI_MIN)
			    * (LINUX_SCHED_RTPRIO_MAX - LINUX_SCHED_RTPRIO_MIN)
			    / (SCHED_PRI_MAX - SCHED_PRI_MIN)
			    + LINUX_SCHED_RTPRIO_MIN;
		}
		DPRINTF(("%s: linux: policy %d, priority %d\n",
		    __func__, -1, linux_params->sched_priority));
	}

	return 0;
}

int
linux_sys_sched_setparam(struct lwp *l, const struct linux_sys_sched_setparam_args *uap, register_t *retval)
{
	/* {
		syscallarg(linux_pid_t) pid;
		syscallarg(const struct linux_sched_param *) sp;
	} */
	int error, policy;
	struct linux_sched_param lp;
	struct sched_param sp;

	if (SCARG(uap, pid) < 0 || SCARG(uap, sp) == NULL) {
		error = EINVAL;
		goto out;
	}

	error = copyin(SCARG(uap, sp), &lp, sizeof(lp));
	if (error)
		goto out;

	/* We need the current policy in Linux terms. */
	error = do_sched_getparam(SCARG(uap, pid), 0, &policy, NULL);
	if (error)
		goto out;
	error = sched_native2linux(policy, NULL, &policy, NULL);
	if (error)
		goto out;

	error = sched_linux2native(policy, &lp, &policy, &sp);
	if (error)
		goto out;

	error = do_sched_setparam(SCARG(uap, pid), 0, policy, &sp);
	if (error)
		goto out;

 out:
	return error;
}

int
linux_sys_sched_getparam(struct lwp *l, const struct linux_sys_sched_getparam_args *uap, register_t *retval)
{
	/* {
		syscallarg(linux_pid_t) pid;
		syscallarg(struct linux_sched_param *) sp;
	} */
	struct linux_sched_param lp;
	struct sched_param sp;
	int error, policy;

	if (SCARG(uap, pid) < 0 || SCARG(uap, sp) == NULL) {
		error = EINVAL;
		goto out;
	}

	error = do_sched_getparam(SCARG(uap, pid), 0, &policy, &sp);
	if (error)
		goto out;
	DPRINTF(("%s: native: policy %d, priority %d\n",
	    __func__, policy, sp.sched_priority));

	error = sched_native2linux(policy, &sp, NULL, &lp);
	if (error)
		goto out;
	DPRINTF(("%s: linux: policy %d, priority %d\n",
	    __func__, policy, lp.sched_priority));

	error = copyout(&lp, SCARG(uap, sp), sizeof(lp));
	if (error)
		goto out;

 out:
	return error;
}

int
linux_sys_sched_setscheduler(struct lwp *l, const struct linux_sys_sched_setscheduler_args *uap, register_t *retval)
{
	/* {
		syscallarg(linux_pid_t) pid;
		syscallarg(int) policy;
		syscallarg(const struct linux_sched_param *) sp;
	} */
	int error, policy;
	struct linux_sched_param lp;
	struct sched_param sp;

	if (SCARG(uap, pid) < 0 || SCARG(uap, sp) == NULL) {
		error = EINVAL;
		goto out;
	}

	error = copyin(SCARG(uap, sp), &lp, sizeof(lp));
	if (error)
		goto out;
	DPRINTF(("%s: linux: policy %d, priority %d\n",
	    __func__, SCARG(uap, policy), lp.sched_priority));

	error = sched_linux2native(SCARG(uap, policy), &lp, &policy, &sp);
	if (error)
		goto out;
	DPRINTF(("%s: native: policy %d, priority %d\n",
	    __func__, policy, sp.sched_priority));

	error = do_sched_setparam(SCARG(uap, pid), 0, policy, &sp);
	if (error)
		goto out;

 out:
	return error;
}

int
linux_sys_sched_getscheduler(struct lwp *l, const struct linux_sys_sched_getscheduler_args *uap, register_t *retval)
{
	/* {
		syscallarg(linux_pid_t) pid;
	} */
	int error, policy;

	*retval = -1;

	error = do_sched_getparam(SCARG(uap, pid), 0, &policy, NULL);
	if (error)
		goto out;

	error = sched_native2linux(policy, NULL, &policy, NULL);
	if (error)
		goto out;

	*retval = policy;

 out:
	return error;
}

int
linux_sys_sched_yield(struct lwp *l, const void *v, register_t *retval)
{

	yield();
	return 0;
}

int
linux_sys_sched_get_priority_max(struct lwp *l, const struct linux_sys_sched_get_priority_max_args *uap, register_t *retval)
{
	/* {
		syscallarg(int) policy;
	} */

	switch (SCARG(uap, policy)) {
	case LINUX_SCHED_OTHER:
		*retval = 0;
		break;
	case LINUX_SCHED_FIFO:
	case LINUX_SCHED_RR:
		*retval = LINUX_SCHED_RTPRIO_MAX;
		break;
	default:
		return EINVAL;
	}

	return 0;
}

int
linux_sys_sched_get_priority_min(struct lwp *l, const struct linux_sys_sched_get_priority_min_args *uap, register_t *retval)
{
	/* {
		syscallarg(int) policy;
	} */

	switch (SCARG(uap, policy)) {
	case LINUX_SCHED_OTHER:
		*retval = 0;
		break;
	case LINUX_SCHED_FIFO:
	case LINUX_SCHED_RR:
		*retval = LINUX_SCHED_RTPRIO_MIN;
		break;
	default:
		return EINVAL;
	}

	return 0;
}

int
linux_sys_exit(struct lwp *l, const struct linux_sys_exit_args *uap, register_t *retval)
{

	lwp_exit(l);
	return 0;
}

#ifndef __m68k__
/* Present on everything but m68k */
int
linux_sys_exit_group(struct lwp *l, const struct linux_sys_exit_group_args *uap, register_t *retval)
{

	return sys_exit(l, (const void *)uap, retval);
}
#endif /* !__m68k__ */

int
linux_sys_set_tid_address(struct lwp *l, const struct linux_sys_set_tid_address_args *uap, register_t *retval)
{
	/* {
		syscallarg(int *) tid;
	} */
	struct linux_emuldata *led;

	led = (struct linux_emuldata *)l->l_emuldata;
	led->led_clear_tid = SCARG(uap, tid);
	*retval = l->l_lid;

	return 0;
}

/* ARGSUSED1 */
int
linux_sys_gettid(struct lwp *l, const void *v, register_t *retval)
{

	*retval = l->l_lid;
	return 0;
}

/*
 * The affinity syscalls assume that the layout of our cpu kcpuset is
 * the same as Linux's: a linear bitmask.
 */
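/*
 * Illustrative layout (not from the original source): in this linear
 * format a mask with CPUs 0 and 2 set occupies the low bits of the
 * first word, i.e. mask[0] == 0x5, and on a 64-bit platform CPU 64
 * would set bit 0 of mask[1].
 */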
int
linux_sys_sched_getaffinity(struct lwp *l, const struct linux_sys_sched_getaffinity_args *uap, register_t *retval)
{
	/* {
		syscallarg(linux_pid_t) pid;
		syscallarg(unsigned int) len;
		syscallarg(unsigned long *) mask;
	} */
	struct lwp *t;
	kcpuset_t *kcset;
	size_t size;
	cpuid_t i;
	int error;

	size = LINUX_CPU_MASK_SIZE;
	if (SCARG(uap, len) < size)
		return EINVAL;

	/* Lock the LWP */
	t = lwp_find2(SCARG(uap, pid), l->l_lid);
	if (t == NULL)
		return ESRCH;

	/* Check the permission */
	if (kauth_authorize_process(l->l_cred,
	    KAUTH_PROCESS_SCHEDULER_GETAFFINITY, t->l_proc, NULL, NULL, NULL)) {
		mutex_exit(t->l_proc->p_lock);
		return EPERM;
	}

	kcpuset_create(&kcset, true);
	lwp_lock(t);
	if (t->l_affinity != NULL)
		kcpuset_copy(kcset, t->l_affinity);
	else {
		/*
		 * All available CPUs should be masked when affinity has not
		 * been set.
		 */
		kcpuset_zero(kcset);
		for (i = 0; i < ncpu; i++)
			kcpuset_set(kcset, i);
	}
	lwp_unlock(t);
	mutex_exit(t->l_proc->p_lock);
	error = kcpuset_copyout(kcset, (cpuset_t *)SCARG(uap, mask), size);
	kcpuset_unuse(kcset, NULL);
	*retval = size;
	return error;
}

int
linux_sys_sched_setaffinity(struct lwp *l, const struct linux_sys_sched_setaffinity_args *uap, register_t *retval)
{
	/* {
		syscallarg(linux_pid_t) pid;
		syscallarg(unsigned int) len;
		syscallarg(unsigned long *) mask;
	} */
	struct sys__sched_setaffinity_args ssa;
	size_t size;

	size = LINUX_CPU_MASK_SIZE;
	if (SCARG(uap, len) < size)
		return EINVAL;

	SCARG(&ssa, pid) = SCARG(uap, pid);
	SCARG(&ssa, lid) = l->l_lid;
	SCARG(&ssa, size) = size;
	SCARG(&ssa, cpuset) = (cpuset_t *)SCARG(uap, mask);

	return sys__sched_setaffinity(l, &ssa, retval);
}