/*	$NetBSD: linux_sched.c,v 1.67 2014/11/09 17:48:08 maxv Exp $	*/

/*-
 * Copyright (c) 1999 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
 * NASA Ames Research Center; by Matthias Scheler.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Linux compatibility module. Try to deal with scheduler related syscalls.
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: linux_sched.c,v 1.67 2014/11/09 17:48:08 maxv Exp $");

#include <sys/param.h>
#include <sys/mount.h>
#include <sys/proc.h>
#include <sys/systm.h>
#include <sys/sysctl.h>
#include <sys/syscallargs.h>
#include <sys/wait.h>
#include <sys/kauth.h>
#include <sys/ptrace.h>
#include <sys/atomic.h>

#include <sys/cpu.h>

#include <compat/linux/common/linux_types.h>
#include <compat/linux/common/linux_signal.h>
#include <compat/linux/common/linux_emuldata.h>
#include <compat/linux/common/linux_ipc.h>
#include <compat/linux/common/linux_sem.h>
#include <compat/linux/common/linux_exec.h>
#include <compat/linux/common/linux_machdep.h>

#include <compat/linux/linux_syscallargs.h>

#include <compat/linux/common/linux_sched.h>

static int linux_clone_nptl(struct lwp *, const struct linux_sys_clone_args *,
    register_t *);

#if DEBUG_LINUX
#define DPRINTF(x) uprintf x
#else
#define DPRINTF(x)
#endif

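/*
 * Child-return hook used by linux_sys_clone(): if the parent requested
 * LINUX_CLONE_CHILD_SETTID, write the new process's PID to the user
 * address recorded in the emulation data, then take the normal
 * child_return() path.
 */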
static void
linux_child_return(void *arg)
{
        struct lwp *l = arg;
        struct proc *p = l->l_proc;
        struct linux_emuldata *led = l->l_emuldata;
        void *ctp = led->led_child_tidptr;
        int error;

        if (ctp) {
                if ((error = copyout(&p->p_pid, ctp, sizeof(p->p_pid))) != 0)
                        printf("%s: LINUX_CLONE_CHILD_SETTID "
                            "failed (child_tidptr = %p, tid = %d error=%d)\n",
                            __func__, ctp, p->p_pid, error);
        }
        child_return(arg);
}

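/*
 * Linux clone(2).  The thread-group flavor (CLONE_THREAD) is handled
 * separately by linux_clone_nptl(); everything else is translated into
 * the corresponding fork1() sharing flags.
 */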
int
linux_sys_clone(struct lwp *l, const struct linux_sys_clone_args *uap,
    register_t *retval)
{
        /* {
                syscallarg(int) flags;
                syscallarg(void *) stack;
                syscallarg(void *) parent_tidptr;
                syscallarg(void *) tls;
                syscallarg(void *) child_tidptr;
        } */
        struct proc *p;
        struct linux_emuldata *led;
        int flags, sig, error;

        /*
         * We don't support the Linux CLONE_PID or CLONE_PTRACE flags.
         */
        if (SCARG(uap, flags) & (LINUX_CLONE_PID|LINUX_CLONE_PTRACE))
                return EINVAL;

        /*
         * Thread group implies shared signals.  Shared signals
         * imply shared VM.  This matches what the Linux kernel does.
         */
        if (SCARG(uap, flags) & LINUX_CLONE_THREAD
            && (SCARG(uap, flags) & LINUX_CLONE_SIGHAND) == 0)
                return EINVAL;
        if (SCARG(uap, flags) & LINUX_CLONE_SIGHAND
            && (SCARG(uap, flags) & LINUX_CLONE_VM) == 0)
                return EINVAL;

        /*
         * The thread group flavor is implemented totally differently.
         */
        if (SCARG(uap, flags) & LINUX_CLONE_THREAD)
                return linux_clone_nptl(l, uap, retval);

        flags = 0;
        if (SCARG(uap, flags) & LINUX_CLONE_VM)
                flags |= FORK_SHAREVM;
        if (SCARG(uap, flags) & LINUX_CLONE_FS)
                flags |= FORK_SHARECWD;
        if (SCARG(uap, flags) & LINUX_CLONE_FILES)
                flags |= FORK_SHAREFILES;
        if (SCARG(uap, flags) & LINUX_CLONE_SIGHAND)
                flags |= FORK_SHARESIGS;
        if (SCARG(uap, flags) & LINUX_CLONE_VFORK)
                flags |= FORK_PPWAIT;

        sig = SCARG(uap, flags) & LINUX_CLONE_CSIGNAL;
        if (sig < 0 || sig >= LINUX__NSIG)
                return EINVAL;
        sig = linux_to_native_signo[sig];

        if (SCARG(uap, flags) & LINUX_CLONE_CHILD_SETTID) {
                led = l->l_emuldata;
                led->led_child_tidptr = SCARG(uap, child_tidptr);
        }

        /*
         * Note that Linux does not provide a portable way of specifying
         * the stack area; the caller must know if the stack grows up
         * or down.  So, we pass a stack size of 0, so that the code
         * that makes this adjustment is a no-op.
         */
        if ((error = fork1(l, flags, sig, SCARG(uap, stack), 0,
            linux_child_return, NULL, retval, &p)) != 0) {
                DPRINTF(("%s: fork1: error %d\n", __func__, error));
                return error;
        }

        return 0;
}

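/*
 * Handle the CLONE_THREAD flavor of clone(2): create a new LWP inside
 * the calling process rather than forking a new process, honoring the
 * *_SETTID/*_CLEARTID and CLONE_SETTLS requests, and start it running
 * (or stopped, if the process is currently stopping).
 */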
static int
linux_clone_nptl(struct lwp *l, const struct linux_sys_clone_args *uap, register_t *retval)
{
        /* {
                syscallarg(int) flags;
                syscallarg(void *) stack;
                syscallarg(void *) parent_tidptr;
                syscallarg(void *) tls;
                syscallarg(void *) child_tidptr;
        } */
        struct proc *p;
        struct lwp *l2;
        struct linux_emuldata *led;
        void *parent_tidptr, *tls, *child_tidptr;
        struct schedstate_percpu *spc;
        vaddr_t uaddr;
        lwpid_t lid;
        int flags, tnprocs, error;

        p = l->l_proc;
        flags = SCARG(uap, flags);
        parent_tidptr = SCARG(uap, parent_tidptr);
        tls = SCARG(uap, tls);
        child_tidptr = SCARG(uap, child_tidptr);

        tnprocs = atomic_inc_uint_nv(&nprocs);
        if (__predict_false(tnprocs >= maxproc) ||
            kauth_authorize_process(l->l_cred, KAUTH_PROCESS_FORK, p,
            KAUTH_ARG(tnprocs), NULL, NULL) != 0) {
                atomic_dec_uint(&nprocs);
                return EAGAIN;
        }

        uaddr = uvm_uarea_alloc();
        if (__predict_false(uaddr == 0)) {
                atomic_dec_uint(&nprocs);
                return ENOMEM;
        }

        error = lwp_create(l, p, uaddr, LWP_DETACHED | LWP_PIDLID,
            SCARG(uap, stack), 0, child_return, NULL, &l2, l->l_class);
        if (__predict_false(error)) {
                DPRINTF(("%s: lwp_create error=%d\n", __func__, error));
                atomic_dec_uint(&nprocs);
                uvm_uarea_free(uaddr);
                return error;
        }
        lid = l2->l_lid;

        /* LINUX_CLONE_CHILD_CLEARTID: clear TID in child's memory on exit() */
        if (flags & LINUX_CLONE_CHILD_CLEARTID) {
                led = l2->l_emuldata;
                led->led_clear_tid = child_tidptr;
        }

        /* LINUX_CLONE_PARENT_SETTID: store child's TID in parent's memory */
        if (flags & LINUX_CLONE_PARENT_SETTID) {
                if ((error = copyout(&lid, parent_tidptr, sizeof(lid))) != 0)
                        printf("%s: LINUX_CLONE_PARENT_SETTID "
                            "failed (parent_tidptr = %p tid = %d error=%d)\n",
                            __func__, parent_tidptr, lid, error);
        }

        /* LINUX_CLONE_CHILD_SETTID: store child's TID in child's memory */
        if (flags & LINUX_CLONE_CHILD_SETTID) {
                if ((error = copyout(&lid, child_tidptr, sizeof(lid))) != 0)
                        printf("%s: LINUX_CLONE_CHILD_SETTID "
                            "failed (child_tidptr = %p, tid = %d error=%d)\n",
                            __func__, child_tidptr, lid, error);
        }

        if (flags & LINUX_CLONE_SETTLS) {
                error = LINUX_LWP_SETPRIVATE(l2, tls);
                if (error) {
                        DPRINTF(("%s: LINUX_LWP_SETPRIVATE %d\n", __func__,
                            error));
                        lwp_exit(l2);
                        return error;
                }
        }

        /*
         * Set the new LWP running, unless the process is stopping;
         * in that case the LWP is created stopped.
         */
        mutex_enter(p->p_lock);
        lwp_lock(l2);
        spc = &l2->l_cpu->ci_schedstate;
        if ((l->l_flag & (LW_WREBOOT | LW_WSUSPEND | LW_WEXIT)) == 0) {
                if (p->p_stat == SSTOP || (p->p_sflag & PS_STOPPING) != 0) {
                        KASSERT(l2->l_wchan == NULL);
                        l2->l_stat = LSSTOP;
                        p->p_nrlwps--;
                        lwp_unlock_to(l2, spc->spc_lwplock);
                } else {
                        KASSERT(lwp_locked(l2, spc->spc_mutex));
                        l2->l_stat = LSRUN;
                        sched_enqueue(l2, false);
                        lwp_unlock(l2);
                }
        } else {
                l2->l_stat = LSSUSPENDED;
                p->p_nrlwps--;
                lwp_unlock_to(l2, spc->spc_lwplock);
        }
        mutex_exit(p->p_lock);

        retval[0] = lid;
        retval[1] = 0;
        return 0;
}

/*
 * linux realtime priority
 *
 * - SCHED_RR and SCHED_FIFO tasks have priorities [1,99].
 *
 * - SCHED_OTHER tasks don't have realtime priorities.
 *   in particular, sched_param::sched_priority is always 0.
 */

#define LINUX_SCHED_RTPRIO_MIN 1
#define LINUX_SCHED_RTPRIO_MAX 99

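/*
 * The conversions below map the Linux realtime range [1,99] linearly onto
 * the native range [SCHED_PRI_MIN, SCHED_PRI_MAX].  For example, assuming
 * the usual NetBSD values SCHED_PRI_MIN == 0 and SCHED_PRI_MAX == 63,
 * Linux priority 1 maps to 0, 99 maps to 63, and 50 maps to
 * (50 - 1) * 63 / 98 == 31 (integer division).
 */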
static int
sched_linux2native(int linux_policy, struct linux_sched_param *linux_params,
    int *native_policy, struct sched_param *native_params)
{

        switch (linux_policy) {
        case LINUX_SCHED_OTHER:
                if (native_policy != NULL) {
                        *native_policy = SCHED_OTHER;
                }
                break;

        case LINUX_SCHED_FIFO:
                if (native_policy != NULL) {
                        *native_policy = SCHED_FIFO;
                }
                break;

        case LINUX_SCHED_RR:
                if (native_policy != NULL) {
                        *native_policy = SCHED_RR;
                }
                break;

        default:
                return EINVAL;
        }

        if (linux_params != NULL) {
                int prio = linux_params->sched_priority;

                KASSERT(native_params != NULL);

                if (linux_policy == LINUX_SCHED_OTHER) {
                        if (prio != 0) {
                                return EINVAL;
                        }
                        native_params->sched_priority = PRI_NONE; /* XXX */
                } else {
                        if (prio < LINUX_SCHED_RTPRIO_MIN ||
                            prio > LINUX_SCHED_RTPRIO_MAX) {
                                return EINVAL;
                        }
                        native_params->sched_priority =
                            (prio - LINUX_SCHED_RTPRIO_MIN)
                            * (SCHED_PRI_MAX - SCHED_PRI_MIN)
                            / (LINUX_SCHED_RTPRIO_MAX - LINUX_SCHED_RTPRIO_MIN)
                            + SCHED_PRI_MIN;
                }
        }

        return 0;
}

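/*
 * Inverse of sched_linux2native(): translate a native scheduling policy
 * and priority into their Linux equivalents.  An unknown native policy is
 * a programming error and causes a panic.
 */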
static int
sched_native2linux(int native_policy, struct sched_param *native_params,
    int *linux_policy, struct linux_sched_param *linux_params)
{

        switch (native_policy) {
        case SCHED_OTHER:
                if (linux_policy != NULL) {
                        *linux_policy = LINUX_SCHED_OTHER;
                }
                break;

        case SCHED_FIFO:
                if (linux_policy != NULL) {
                        *linux_policy = LINUX_SCHED_FIFO;
                }
                break;

        case SCHED_RR:
                if (linux_policy != NULL) {
                        *linux_policy = LINUX_SCHED_RR;
                }
                break;

        default:
                panic("%s: unknown policy %d\n", __func__, native_policy);
        }

        if (native_params != NULL) {
                int prio = native_params->sched_priority;

                KASSERT(prio >= SCHED_PRI_MIN);
                KASSERT(prio <= SCHED_PRI_MAX);
                KASSERT(linux_params != NULL);

                DPRINTF(("%s: native: policy %d, priority %d\n",
                    __func__, native_policy, prio));

                if (native_policy == SCHED_OTHER) {
                        linux_params->sched_priority = 0;
                } else {
                        linux_params->sched_priority =
                            (prio - SCHED_PRI_MIN)
                            * (LINUX_SCHED_RTPRIO_MAX - LINUX_SCHED_RTPRIO_MIN)
                            / (SCHED_PRI_MAX - SCHED_PRI_MIN)
                            + LINUX_SCHED_RTPRIO_MIN;
                }
                DPRINTF(("%s: linux: policy %d, priority %d\n",
                    __func__, -1, linux_params->sched_priority));
        }

        return 0;
}

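/*
 * sched_setparam(2): change the priority of a process while keeping its
 * current scheduling policy.  The current policy is fetched first so the
 * Linux parameters can be validated against it.
 */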
int
linux_sys_sched_setparam(struct lwp *l, const struct linux_sys_sched_setparam_args *uap, register_t *retval)
{
        /* {
                syscallarg(linux_pid_t) pid;
                syscallarg(const struct linux_sched_param *) sp;
        } */
        int error, policy;
        struct linux_sched_param lp;
        struct sched_param sp;

        if (SCARG(uap, pid) < 0 || SCARG(uap, sp) == NULL) {
                error = EINVAL;
                goto out;
        }

        error = copyin(SCARG(uap, sp), &lp, sizeof(lp));
        if (error)
                goto out;

        /* We need the current policy in Linux terms. */
        error = do_sched_getparam(SCARG(uap, pid), 0, &policy, NULL);
        if (error)
                goto out;
        error = sched_native2linux(policy, NULL, &policy, NULL);
        if (error)
                goto out;

        error = sched_linux2native(policy, &lp, &policy, &sp);
        if (error)
                goto out;

        error = do_sched_setparam(SCARG(uap, pid), 0, policy, &sp);
        if (error)
                goto out;

out:
        return error;
}

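/*
 * sched_getparam(2): report the scheduling priority of a process,
 * converted to the Linux representation.
 */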
int
linux_sys_sched_getparam(struct lwp *l, const struct linux_sys_sched_getparam_args *uap, register_t *retval)
{
        /* {
                syscallarg(linux_pid_t) pid;
                syscallarg(struct linux_sched_param *) sp;
        } */
        struct linux_sched_param lp;
        struct sched_param sp;
        int error, policy;

        if (SCARG(uap, pid) < 0 || SCARG(uap, sp) == NULL) {
                error = EINVAL;
                goto out;
        }

        error = do_sched_getparam(SCARG(uap, pid), 0, &policy, &sp);
        if (error)
                goto out;
        DPRINTF(("%s: native: policy %d, priority %d\n",
            __func__, policy, sp.sched_priority));

        error = sched_native2linux(policy, &sp, NULL, &lp);
        if (error)
                goto out;
        DPRINTF(("%s: linux: policy %d, priority %d\n",
            __func__, policy, lp.sched_priority));

        error = copyout(&lp, SCARG(uap, sp), sizeof(lp));
        if (error)
                goto out;

out:
        return error;
}

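/*
 * sched_setscheduler(2): set both the scheduling policy and the priority
 * of a process.
 */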
int
linux_sys_sched_setscheduler(struct lwp *l, const struct linux_sys_sched_setscheduler_args *uap, register_t *retval)
{
        /* {
                syscallarg(linux_pid_t) pid;
                syscallarg(int) policy;
                syscallarg(const struct linux_sched_param *) sp;
        } */
        int error, policy;
        struct linux_sched_param lp;
        struct sched_param sp;

        if (SCARG(uap, pid) < 0 || SCARG(uap, sp) == NULL) {
                error = EINVAL;
                goto out;
        }

        error = copyin(SCARG(uap, sp), &lp, sizeof(lp));
        if (error)
                goto out;
        DPRINTF(("%s: linux: policy %d, priority %d\n",
            __func__, SCARG(uap, policy), lp.sched_priority));

        error = sched_linux2native(SCARG(uap, policy), &lp, &policy, &sp);
        if (error)
                goto out;
        DPRINTF(("%s: native: policy %d, priority %d\n",
            __func__, policy, sp.sched_priority));

        error = do_sched_setparam(SCARG(uap, pid), 0, policy, &sp);
        if (error)
                goto out;

out:
        return error;
}

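/*
 * sched_getscheduler(2): return the current scheduling policy of a
 * process as a Linux policy value.
 */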
int
linux_sys_sched_getscheduler(struct lwp *l, const struct linux_sys_sched_getscheduler_args *uap, register_t *retval)
{
        /* {
                syscallarg(linux_pid_t) pid;
        } */
        int error, policy;

        *retval = -1;

        error = do_sched_getparam(SCARG(uap, pid), 0, &policy, NULL);
        if (error)
                goto out;

        error = sched_native2linux(policy, NULL, &policy, NULL);
        if (error)
                goto out;

        *retval = policy;

out:
        return error;
}

int
linux_sys_sched_yield(struct lwp *l, const void *v, register_t *retval)
{

        yield();
        return 0;
}

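/*
 * sched_get_priority_max(2)/sched_get_priority_min(2): report the static
 * priority range Linux uses for each policy; SCHED_OTHER has no realtime
 * priorities, so both bounds are 0.
 */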
int
linux_sys_sched_get_priority_max(struct lwp *l, const struct linux_sys_sched_get_priority_max_args *uap, register_t *retval)
{
        /* {
                syscallarg(int) policy;
        } */

        switch (SCARG(uap, policy)) {
        case LINUX_SCHED_OTHER:
                *retval = 0;
                break;
        case LINUX_SCHED_FIFO:
        case LINUX_SCHED_RR:
                *retval = LINUX_SCHED_RTPRIO_MAX;
                break;
        default:
                return EINVAL;
        }

        return 0;
}

int
linux_sys_sched_get_priority_min(struct lwp *l, const struct linux_sys_sched_get_priority_min_args *uap, register_t *retval)
{
        /* {
                syscallarg(int) policy;
        } */

        switch (SCARG(uap, policy)) {
        case LINUX_SCHED_OTHER:
                *retval = 0;
                break;
        case LINUX_SCHED_FIFO:
        case LINUX_SCHED_RR:
                *retval = LINUX_SCHED_RTPRIO_MIN;
                break;
        default:
                return EINVAL;
        }

        return 0;
}

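/*
 * Linux exit(2) terminates only the calling thread, so it maps to
 * lwp_exit() rather than a full process exit; exit_group(2) below is the
 * call that terminates the whole process.
 */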
int
linux_sys_exit(struct lwp *l, const struct linux_sys_exit_args *uap, register_t *retval)
{

        lwp_exit(l);
        return 0;
}

#ifndef __m68k__
/* Present on everything but m68k */
int
linux_sys_exit_group(struct lwp *l, const struct linux_sys_exit_group_args *uap, register_t *retval)
{

        return sys_exit(l, (const void *)uap, retval);
}
#endif /* !__m68k__ */

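/*
 * set_tid_address(2): record the address to be cleared when this thread
 * exits (LINUX_CLONE_CHILD_CLEARTID semantics) and return the caller's
 * thread ID.
 */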
int
linux_sys_set_tid_address(struct lwp *l, const struct linux_sys_set_tid_address_args *uap, register_t *retval)
{
        /* {
                syscallarg(int *) tid;
        } */
        struct linux_emuldata *led;

        led = (struct linux_emuldata *)l->l_emuldata;
        led->led_clear_tid = SCARG(uap, tid);
        *retval = l->l_lid;

        return 0;
}

/* ARGSUSED1 */
int
linux_sys_gettid(struct lwp *l, const void *v, register_t *retval)
{

        *retval = l->l_lid;
        return 0;
}

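/*
 * sched_getaffinity(2): per-process affinity is not tracked; report a
 * mask with every CPU set, sized to the number of CPUs actually present.
 */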
int
linux_sys_sched_getaffinity(struct lwp *l, const struct linux_sys_sched_getaffinity_args *uap, register_t *retval)
{
        /* {
                syscallarg(linux_pid_t) pid;
                syscallarg(unsigned int) len;
                syscallarg(unsigned long *) mask;
        } */
        proc_t *p;
        unsigned long *lp, *data;
        int error, size, nb = ncpu;

        /* Unlike Linux, dynamically calculate cpu mask size */
        size = sizeof(long) * ((ncpu + LONG_BIT - 1) / LONG_BIT);
        if (SCARG(uap, len) < size)
                return EINVAL;

        /* XXX: Pointless check. TODO: Actually implement this. */
        mutex_enter(proc_lock);
        p = proc_find(SCARG(uap, pid));
        mutex_exit(proc_lock);
        if (p == NULL) {
                return ESRCH;
        }

        /*
         * Return the actual number of CPUs and tag all of them as
         * available.  The result is a mask, the first CPU being in the
         * least significant bit.
         */
        data = kmem_zalloc(size, KM_SLEEP);
        lp = data;
        while (nb >= LONG_BIT) {
                *lp++ = ~0UL;
                nb -= LONG_BIT;
        }
        if (nb)
                *lp = (1UL << nb) - 1;

        error = copyout(data, SCARG(uap, mask), size);
        kmem_free(data, size);
        *retval = size;
        return error;
}

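/*
 * sched_setaffinity(2): CPU affinity requests are accepted but ignored;
 * only the pid is validated.
 */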
int
linux_sys_sched_setaffinity(struct lwp *l, const struct linux_sys_sched_setaffinity_args *uap, register_t *retval)
{
        /* {
                syscallarg(linux_pid_t) pid;
                syscallarg(unsigned int) len;
                syscallarg(unsigned long *) mask;
        } */
        proc_t *p;

        /* XXX: Pointless check. TODO: Actually implement this. */
        mutex_enter(proc_lock);
        p = proc_find(SCARG(uap, pid));
        mutex_exit(proc_lock);
        if (p == NULL) {
                return ESRCH;
        }

        /* Let's ignore it */
        DPRINTF(("%s\n", __func__));
        return 0;
}