/*	$NetBSD: linux_sched.c,v 1.61.4.1 2010/07/03 01:19:31 rmind Exp $	*/

/*-
 * Copyright (c) 1999 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
 * NASA Ames Research Center; by Matthias Scheler.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Linux compatibility module.  Try to deal with scheduler-related syscalls.
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: linux_sched.c,v 1.61.4.1 2010/07/03 01:19:31 rmind Exp $");
#include <sys/param.h>
#include <sys/mount.h>
#include <sys/proc.h>
#include <sys/systm.h>
#include <sys/sysctl.h>
#include <sys/malloc.h>
#include <sys/kmem.h>	/* kmem_zalloc()/kmem_free() for the affinity mask */
#include <sys/syscallargs.h>
#include <sys/wait.h>
#include <sys/kauth.h>
#include <sys/ptrace.h>
#include <sys/types.h>

#include <sys/cpu.h>

#include <compat/linux/common/linux_types.h>
#include <compat/linux/common/linux_signal.h>
#include <compat/linux/common/linux_machdep.h>	/* For LINUX_NPTL */
#include <compat/linux/common/linux_emuldata.h>
#include <compat/linux/common/linux_ipc.h>
#include <compat/linux/common/linux_sem.h>
#include <compat/linux/common/linux_exec.h>

#include <compat/linux/linux_syscallargs.h>

#include <compat/linux/common/linux_sched.h>

int
linux_sys_clone(struct lwp *l, const struct linux_sys_clone_args *uap, register_t *retval)
{
	/* {
		syscallarg(int) flags;
		syscallarg(void *) stack;
#ifdef LINUX_NPTL
		syscallarg(void *) parent_tidptr;
		syscallarg(void *) child_tidptr;
#endif
	} */
	int flags, sig;
	int error;
	struct proc *p;
#ifdef LINUX_NPTL
	struct linux_emuldata *led;
#endif

	/*
	 * We don't support the Linux CLONE_PID or CLONE_PTRACE flags.
	 */
	if (SCARG(uap, flags) & (LINUX_CLONE_PID|LINUX_CLONE_PTRACE))
		return (EINVAL);

	/*
	 * A thread group implies shared signals, and shared signals
	 * imply a shared VM.  This matches what the Linux kernel does.
	 */
	if (SCARG(uap, flags) & LINUX_CLONE_THREAD
	    && (SCARG(uap, flags) & LINUX_CLONE_SIGHAND) == 0)
		return (EINVAL);
	if (SCARG(uap, flags) & LINUX_CLONE_SIGHAND
	    && (SCARG(uap, flags) & LINUX_CLONE_VM) == 0)
		return (EINVAL);
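	/*
	 * For example, CLONE_VM|CLONE_SIGHAND|CLONE_THREAD passes both
	 * checks above, while CLONE_THREAD alone, or CLONE_SIGHAND
	 * without CLONE_VM, is rejected with EINVAL.
	 */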

	flags = 0;

	if (SCARG(uap, flags) & LINUX_CLONE_VM)
		flags |= FORK_SHAREVM;
	if (SCARG(uap, flags) & LINUX_CLONE_FS)
		flags |= FORK_SHARECWD;
	if (SCARG(uap, flags) & LINUX_CLONE_FILES)
		flags |= FORK_SHAREFILES;
	if (SCARG(uap, flags) & LINUX_CLONE_SIGHAND)
		flags |= FORK_SHARESIGS;
	if (SCARG(uap, flags) & LINUX_CLONE_VFORK)
		flags |= FORK_PPWAIT;
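	/*
	 * For reference (a sketch of typical userland behavior, not part
	 * of this code): glibc's NPTL pthread_create() usually calls
	 * clone(2) with CLONE_VM|CLONE_FS|CLONE_FILES|CLONE_SIGHAND|
	 * CLONE_THREAD|CLONE_SYSVSEM|CLONE_SETTLS|CLONE_PARENT_SETTID|
	 * CLONE_CHILD_CLEARTID, which the mapping above turns into
	 * FORK_SHAREVM|FORK_SHARECWD|FORK_SHAREFILES|FORK_SHARESIGS.
	 */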

	sig = SCARG(uap, flags) & LINUX_CLONE_CSIGNAL;
	if (sig < 0 || sig >= LINUX__NSIG)
		return (EINVAL);
	sig = linux_to_native_signo[sig];

#ifdef LINUX_NPTL
	led = (struct linux_emuldata *)l->l_proc->p_emuldata;

	led->parent_tidptr = SCARG(uap, parent_tidptr);
	led->child_tidptr = SCARG(uap, child_tidptr);
	led->clone_flags = SCARG(uap, flags);
#endif /* LINUX_NPTL */

	/*
	 * Note that Linux does not provide a portable way of specifying
	 * the stack area; the caller must know if the stack grows up
	 * or down.  So we pass a stack size of 0 to make the code that
	 * performs this adjustment a no-op.
	 */
	if ((error = fork1(l, flags, sig, SCARG(uap, stack), 0,
	    NULL, NULL, retval, &p)) != 0)
		return error;

#ifdef LINUX_NPTL
	if ((SCARG(uap, flags) & LINUX_CLONE_SETTLS) != 0)
		return linux_init_thread_area(l, LIST_FIRST(&p->p_lwps));
#endif /* LINUX_NPTL */

	return 0;
}

/*
 * Linux realtime priority
 *
 * - SCHED_RR and SCHED_FIFO tasks have priorities [1,99].
 *
 * - SCHED_OTHER tasks don't have realtime priorities.
 *   In particular, sched_param::sched_priority is always 0.
 */

#define	LINUX_SCHED_RTPRIO_MIN	1
#define	LINUX_SCHED_RTPRIO_MAX	99
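
/*
 * The conversions below are a linear rescaling between the Linux range
 * [1, 99] and the native range [SCHED_PRI_MIN, SCHED_PRI_MAX].  A worked
 * example (a sketch, assuming the usual NetBSD values SCHED_PRI_MIN == 0
 * and SCHED_PRI_MAX == 63): Linux priority 1 maps to 0, 50 maps to
 * (49 * 63) / 98 == 31, and 99 maps to 63.
 */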

static int
sched_linux2native(int linux_policy, struct linux_sched_param *linux_params,
    int *native_policy, struct sched_param *native_params)
{

	switch (linux_policy) {
	case LINUX_SCHED_OTHER:
		if (native_policy != NULL) {
			*native_policy = SCHED_OTHER;
		}
		break;

	case LINUX_SCHED_FIFO:
		if (native_policy != NULL) {
			*native_policy = SCHED_FIFO;
		}
		break;

	case LINUX_SCHED_RR:
		if (native_policy != NULL) {
			*native_policy = SCHED_RR;
		}
		break;

	default:
		return EINVAL;
	}

	if (linux_params != NULL) {
		int prio = linux_params->sched_priority;

		KASSERT(native_params != NULL);

		if (linux_policy == LINUX_SCHED_OTHER) {
			if (prio != 0) {
				return EINVAL;
			}
			native_params->sched_priority = PRI_NONE; /* XXX */
		} else {
			if (prio < LINUX_SCHED_RTPRIO_MIN ||
			    prio > LINUX_SCHED_RTPRIO_MAX) {
				return EINVAL;
			}
			native_params->sched_priority =
			    (prio - LINUX_SCHED_RTPRIO_MIN)
			    * (SCHED_PRI_MAX - SCHED_PRI_MIN)
			    / (LINUX_SCHED_RTPRIO_MAX - LINUX_SCHED_RTPRIO_MIN)
			    + SCHED_PRI_MIN;
		}
	}

	return 0;
}

static int
sched_native2linux(int native_policy, struct sched_param *native_params,
    int *linux_policy, struct linux_sched_param *linux_params)
{

	switch (native_policy) {
	case SCHED_OTHER:
		if (linux_policy != NULL) {
			*linux_policy = LINUX_SCHED_OTHER;
		}
		break;

	case SCHED_FIFO:
		if (linux_policy != NULL) {
			*linux_policy = LINUX_SCHED_FIFO;
		}
		break;

	case SCHED_RR:
		if (linux_policy != NULL) {
			*linux_policy = LINUX_SCHED_RR;
		}
		break;

	default:
		panic("%s: unknown policy %d\n", __func__, native_policy);
	}

	if (native_params != NULL) {
		int prio = native_params->sched_priority;

		KASSERT(prio >= SCHED_PRI_MIN);
		KASSERT(prio <= SCHED_PRI_MAX);
		KASSERT(linux_params != NULL);

#ifdef DEBUG_LINUX
		printf("native2linux: native: policy %d, priority %d\n",
		    native_policy, prio);
#endif

		if (native_policy == SCHED_OTHER) {
			linux_params->sched_priority = 0;
		} else {
			linux_params->sched_priority =
			    (prio - SCHED_PRI_MIN)
			    * (LINUX_SCHED_RTPRIO_MAX - LINUX_SCHED_RTPRIO_MIN)
			    / (SCHED_PRI_MAX - SCHED_PRI_MIN)
			    + LINUX_SCHED_RTPRIO_MIN;
		}
#ifdef DEBUG_LINUX
		printf("native2linux: linux: policy %d, priority %d\n",
		    -1, linux_params->sched_priority);
#endif
	}

	return 0;
}
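
/*
 * Note that the integer division makes the two conversions above lossy:
 * with the assumed bounds from the example above, Linux priority 50
 * becomes native 31, which converts back to (31 * 98) / 63 + 1 == 49,
 * not 50.  A set/get round trip can therefore return a slightly
 * different priority than the one that was set.
 */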

int
linux_sys_sched_setparam(struct lwp *l, const struct linux_sys_sched_setparam_args *uap, register_t *retval)
{
	/* {
		syscallarg(linux_pid_t) pid;
		syscallarg(const struct linux_sched_param *) sp;
	} */
	int error, policy;
	struct linux_sched_param lp;
	struct sched_param sp;

	if (SCARG(uap, pid) < 0 || SCARG(uap, sp) == NULL) {
		error = EINVAL;
		goto out;
	}

	error = copyin(SCARG(uap, sp), &lp, sizeof(lp));
	if (error)
		goto out;

	/* We need the current policy in Linux terms. */
	error = do_sched_getparam(SCARG(uap, pid), 0, &policy, NULL);
	if (error)
		goto out;
	error = sched_native2linux(policy, NULL, &policy, NULL);
	if (error)
		goto out;

	error = sched_linux2native(policy, &lp, &policy, &sp);
	if (error)
		goto out;

	error = do_sched_setparam(SCARG(uap, pid), 0, policy, &sp);
	if (error)
		goto out;

out:
	return error;
}

int
linux_sys_sched_getparam(struct lwp *l, const struct linux_sys_sched_getparam_args *uap, register_t *retval)
{
	/* {
		syscallarg(linux_pid_t) pid;
		syscallarg(struct linux_sched_param *) sp;
	} */
	struct linux_sched_param lp;
	struct sched_param sp;
	int error, policy;

	if (SCARG(uap, pid) < 0 || SCARG(uap, sp) == NULL) {
		error = EINVAL;
		goto out;
	}

	error = do_sched_getparam(SCARG(uap, pid), 0, &policy, &sp);
	if (error)
		goto out;
#ifdef DEBUG_LINUX
	printf("getparam: native: policy %d, priority %d\n",
	    policy, sp.sched_priority);
#endif

	error = sched_native2linux(policy, &sp, NULL, &lp);
	if (error)
		goto out;
#ifdef DEBUG_LINUX
	printf("getparam: linux: policy %d, priority %d\n",
	    policy, lp.sched_priority);
#endif

	error = copyout(&lp, SCARG(uap, sp), sizeof(lp));
	if (error)
		goto out;

out:
	return error;
}

int
linux_sys_sched_setscheduler(struct lwp *l, const struct linux_sys_sched_setscheduler_args *uap, register_t *retval)
{
	/* {
		syscallarg(linux_pid_t) pid;
		syscallarg(int) policy;
		syscallarg(const struct linux_sched_param *) sp;
	} */
	int error, policy;
	struct linux_sched_param lp;
	struct sched_param sp;

	if (SCARG(uap, pid) < 0 || SCARG(uap, sp) == NULL) {
		error = EINVAL;
		goto out;
	}

	error = copyin(SCARG(uap, sp), &lp, sizeof(lp));
	if (error)
		goto out;
#ifdef DEBUG_LINUX
	printf("setscheduler: linux: policy %d, priority %d\n",
	    SCARG(uap, policy), lp.sched_priority);
#endif

	error = sched_linux2native(SCARG(uap, policy), &lp, &policy, &sp);
	if (error)
		goto out;
#ifdef DEBUG_LINUX
	printf("setscheduler: native: policy %d, priority %d\n",
	    policy, sp.sched_priority);
#endif

	error = do_sched_setparam(SCARG(uap, pid), 0, policy, &sp);
	if (error)
		goto out;

out:
	return error;
}

int
linux_sys_sched_getscheduler(struct lwp *l, const struct linux_sys_sched_getscheduler_args *uap, register_t *retval)
{
	/* {
		syscallarg(linux_pid_t) pid;
	} */
	int error, policy;

	*retval = -1;

	error = do_sched_getparam(SCARG(uap, pid), 0, &policy, NULL);
	if (error)
		goto out;

	error = sched_native2linux(policy, NULL, &policy, NULL);
	if (error)
		goto out;

	*retval = policy;

out:
	return error;
}

int
linux_sys_sched_yield(struct lwp *l, const void *v, register_t *retval)
{

	yield();
	return 0;
}

int
linux_sys_sched_get_priority_max(struct lwp *l, const struct linux_sys_sched_get_priority_max_args *uap, register_t *retval)
{
	/* {
		syscallarg(int) policy;
	} */

	switch (SCARG(uap, policy)) {
	case LINUX_SCHED_OTHER:
		*retval = 0;
		break;
	case LINUX_SCHED_FIFO:
	case LINUX_SCHED_RR:
		*retval = LINUX_SCHED_RTPRIO_MAX;
		break;
	default:
		return EINVAL;
	}

	return 0;
}

int
linux_sys_sched_get_priority_min(struct lwp *l, const struct linux_sys_sched_get_priority_min_args *uap, register_t *retval)
{
	/* {
		syscallarg(int) policy;
	} */

	switch (SCARG(uap, policy)) {
	case LINUX_SCHED_OTHER:
		*retval = 0;
		break;
	case LINUX_SCHED_FIFO:
	case LINUX_SCHED_RR:
		*retval = LINUX_SCHED_RTPRIO_MIN;
		break;
	default:
		return EINVAL;
	}

	return 0;
}

#ifndef __m68k__
/* Present on everything but m68k */
int
linux_sys_exit_group(struct lwp *l, const struct linux_sys_exit_group_args *uap, register_t *retval)
{
#ifdef LINUX_NPTL
	/* {
		syscallarg(int) error_code;
	} */
	struct proc *p = l->l_proc;
	struct linux_emuldata *led = p->p_emuldata;
	struct linux_emuldata *e;

	if (led->s->flags & LINUX_LES_USE_NPTL) {

#ifdef DEBUG_LINUX
		printf("%s:%d, led->s->refs = %d\n", __func__, __LINE__,
		    led->s->refs);
#endif

		/*
		 * The calling thread is supposed to kill all threads
		 * in the same thread group (i.e. all threads created
		 * via clone(2) with the CLONE_THREAD flag set).
		 *
		 * If there is only one thread, things are quite simple.
		 */
		if (led->s->refs == 1)
			return sys_exit(l, (const void *)uap, retval);

#ifdef DEBUG_LINUX
		printf("%s:%d\n", __func__, __LINE__);
#endif

		mutex_enter(proc_lock);
		led->s->flags |= LINUX_LES_INEXITGROUP;
		led->s->xstat = W_EXITCODE(SCARG(uap, error_code), 0);

		/*
		 * Kill all threads in the group.  The emulation exit hook
		 * takes care of hiding the zombies and reporting the exit
		 * code properly.
		 */
		LIST_FOREACH(e, &led->s->threads, threads) {
			if (e->proc == p)
				continue;

#ifdef DEBUG_LINUX
			printf("%s: kill PID %d\n", __func__, e->proc->p_pid);
#endif
			psignal(e->proc, SIGKILL);
		}

		/* Now, kill ourselves. */
		psignal(p, SIGKILL);
		mutex_exit(proc_lock);

		return 0;
	}
#endif /* LINUX_NPTL */

	return sys_exit(l, (const void *)uap, retval);
}
#endif /* !__m68k__ */

#ifdef LINUX_NPTL
int
linux_sys_set_tid_address(struct lwp *l, const struct linux_sys_set_tid_address_args *uap, register_t *retval)
{
	/* {
		syscallarg(int *) tid;
	} */
	struct linux_emuldata *led;

	led = (struct linux_emuldata *)l->l_proc->p_emuldata;
	led->clear_tid = SCARG(uap, tid);

	led->s->flags |= LINUX_LES_USE_NPTL;

	*retval = l->l_proc->p_pid;

	return 0;
}
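
/*
 * A note on semantics (background, not code from this file): under
 * Linux's CLONE_CHILD_CLEARTID contract, the kernel writes zero to the
 * registered address when the thread exits and wakes any futex waiters
 * on it.  Here the pointer is only recorded in the emuldata; the
 * emulation's exit path is expected to perform the clear-and-wake.
 */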

/* ARGSUSED */
int
linux_sys_gettid(struct lwp *l, const void *v, register_t *retval)
{
	/* The Linux kernel does it exactly this way. */
	*retval = l->l_proc->p_pid;
	return 0;
}

#ifdef LINUX_NPTL
/* ARGSUSED */
int
linux_sys_getpid(struct lwp *l, const void *v, register_t *retval)
{
	struct linux_emuldata *led = l->l_proc->p_emuldata;

	if (led->s->flags & LINUX_LES_USE_NPTL) {
		/* The Linux kernel does it exactly this way. */
		*retval = led->s->group_pid;
	} else {
		*retval = l->l_proc->p_pid;
	}

	return 0;
}

/* ARGSUSED */
int
linux_sys_getppid(struct lwp *l, const void *v, register_t *retval)
{
	struct proc *p = l->l_proc;
	struct linux_emuldata *led = p->p_emuldata;
	struct proc *glp;
	struct proc *pp;

	mutex_enter(proc_lock);
	if (led->s->flags & LINUX_LES_USE_NPTL) {

		/* Find the thread group leader's parent. */
		glp = proc_find(led->s->group_pid);
		if (glp == NULL) {
			/* Maybe panic... */
			printf("linux_sys_getppid: missing group leader PID"
			    " %d\n", led->s->group_pid);
			mutex_exit(proc_lock);
			return ESRCH;
		}
		pp = glp->p_pptr;

		/* If this is a Linux process too, return thread group PID */
		if (pp->p_emul == p->p_emul) {
			struct linux_emuldata *pled;

			pled = pp->p_emuldata;
			*retval = pled->s->group_pid;
		} else {
			*retval = pp->p_pid;
		}

	} else {
		*retval = p->p_pptr->p_pid;
	}
	mutex_exit(proc_lock);

	return 0;
}
#endif /* LINUX_NPTL */

int
linux_sys_sched_getaffinity(struct lwp *l, const struct linux_sys_sched_getaffinity_args *uap, register_t *retval)
{
	/* {
		syscallarg(pid_t) pid;
		syscallarg(unsigned int) len;
		syscallarg(unsigned long *) mask;
	} */
	int error, size, nb = ncpu;
	unsigned long *c, *data;
	proc_t *p;

	/* Unlike Linux, dynamically calculate cpu mask size */
	size = sizeof(long) * ((ncpu + LONG_BIT - 1) / LONG_BIT);
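
	/*
	 * For example, with LONG_BIT == 64, any ncpu up to 64 yields
	 * size == 8, so the caller must supply at least an 8-byte
	 * buffer; ncpu == 65 would require 16 bytes.
	 */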
	if (SCARG(uap, len) < size)
		return EINVAL;

	/* XXX: Pointless check.  TODO: Actually implement this. */
	mutex_enter(proc_lock);
	p = proc_find(SCARG(uap, pid));
	mutex_exit(proc_lock);
	if (p == NULL) {
		return ESRCH;
	}

	/*
	 * Return the actual number of CPUs, tagging all of them as
	 * available.  The result is a mask with the first CPU in the
	 * least significant bit.
	 */
	data = kmem_zalloc(size, KM_SLEEP);
	c = data;
	while (nb >= LONG_BIT) {
		*c++ = ~0UL;
		nb -= LONG_BIT;
	}
	if (nb)
		*c = (1UL << nb) - 1;
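
	/*
	 * For example, with ncpu == 4 the loop above is skipped and the
	 * first word of the mask becomes (1UL << 4) - 1 == 0xf, i.e.
	 * CPUs 0-3 available.
	 */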

	error = copyout(data, SCARG(uap, mask), size);
	kmem_free(data, size);

	*retval = size;
	return error;
}

int
linux_sys_sched_setaffinity(struct lwp *l, const struct linux_sys_sched_setaffinity_args *uap, register_t *retval)
{
	/* {
		syscallarg(pid_t) pid;
		syscallarg(unsigned int) len;
		syscallarg(unsigned long *) mask;
	} */
	proc_t *p;

	/* XXX: Pointless check.  TODO: Actually implement this. */
	mutex_enter(proc_lock);
	p = proc_find(SCARG(uap, pid));
	mutex_exit(proc_lock);
	if (p == NULL) {
		return ESRCH;
	}

	/* Let's ignore it */
#ifdef DEBUG_LINUX
	printf("linux_sys_sched_setaffinity\n");
#endif
	return 0;
}
#endif /* LINUX_NPTL */