linux_machdep.c revision 1.151 1 /* $NetBSD: linux_machdep.c,v 1.151 2011/11/18 04:07:44 christos Exp $ */
2
3 /*-
4 * Copyright (c) 1995, 2000, 2008, 2009 The NetBSD Foundation, Inc.
5 * All rights reserved.
6 *
7 * This code is derived from software contributed to The NetBSD Foundation
8 * by Frank van der Linden, and by Andrew Doran.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 * POSSIBILITY OF SUCH DAMAGE.
30 */
31
32 #include <sys/cdefs.h>
33 __KERNEL_RCSID(0, "$NetBSD: linux_machdep.c,v 1.151 2011/11/18 04:07:44 christos Exp $");
34
35 #if defined(_KERNEL_OPT)
36 #include "opt_vm86.h"
37 #include "opt_user_ldt.h"
38 #endif
39
40 #include <sys/param.h>
41 #include <sys/systm.h>
42 #include <sys/signalvar.h>
43 #include <sys/kernel.h>
44 #include <sys/proc.h>
45 #include <sys/buf.h>
46 #include <sys/reboot.h>
47 #include <sys/conf.h>
48 #include <sys/exec.h>
49 #include <sys/file.h>
50 #include <sys/callout.h>
51 #include <sys/malloc.h>
52 #include <sys/mbuf.h>
53 #include <sys/msgbuf.h>
54 #include <sys/mount.h>
55 #include <sys/vnode.h>
56 #include <sys/device.h>
57 #include <sys/syscallargs.h>
58 #include <sys/filedesc.h>
59 #include <sys/exec_elf.h>
60 #include <sys/disklabel.h>
61 #include <sys/ioctl.h>
62 #include <sys/wait.h>
63 #include <sys/kauth.h>
64 #include <sys/kmem.h>
65
66 #include <miscfs/specfs/specdev.h>
67
68 #include <compat/linux/common/linux_types.h>
69 #include <compat/linux/common/linux_signal.h>
70 #include <compat/linux/common/linux_util.h>
71 #include <compat/linux/common/linux_ioctl.h>
72 #include <compat/linux/common/linux_hdio.h>
73 #include <compat/linux/common/linux_exec.h>
74 #include <compat/linux/common/linux_machdep.h>
75 #include <compat/linux/common/linux_errno.h>
76
77 #include <compat/linux/linux_syscallargs.h>
78
79 #include <sys/cpu.h>
80 #include <machine/cpufunc.h>
81 #include <machine/psl.h>
82 #include <machine/reg.h>
83 #include <machine/segments.h>
84 #include <machine/specialreg.h>
85 #include <machine/sysarch.h>
86 #include <machine/vm86.h>
87 #include <machine/vmparam.h>
88
89 /*
90 * To see whether wscons is configured (for virtual console ioctl calls).
91 */
92 #if defined(_KERNEL_OPT)
93 #include "wsdisplay.h"
94 #endif
95 #if (NWSDISPLAY > 0)
96 #include <dev/wscons/wsconsio.h>
97 #include <dev/wscons/wsdisplay_usl_io.h>
98 #if defined(_KERNEL_OPT)
99 #include "opt_xserver.h"
100 #endif
101 #endif
102
/*
 * Debug tracing.  With DEBUG_LINUX defined, DPRINTF((fmt, ...)) expands to
 * a uprintf call; otherwise it expands to nothing.  Because the macro takes
 * a single argument, callers wrap the whole argument list in an extra pair
 * of parentheses.
 */
#ifdef DEBUG_LINUX
#define DPRINTF(a) uprintf a
#else
#define DPRINTF(a)
#endif
108
/* Forward declarations for the helpers that are private to this file. */
static struct biosdisk_info *fd2biosinfo(struct proc *, struct file *);
/* Disk list defined elsewhere; fd2biosinfo() searches it. */
extern struct disklist *x86_alldisks;
static void linux_save_ucontext(struct lwp *, struct trapframe *,
    const sigset_t *, struct sigaltstack *, struct linux_ucontext *);
static void linux_save_sigcontext(struct lwp *, struct trapframe *,
    const sigset_t *, struct linux_sigcontext *);
static int linux_restore_sigcontext(struct lwp *,
    struct linux_sigcontext *, register_t *);
static void linux_rt_sendsig(const ksiginfo_t *, const sigset_t *);
static void linux_old_sendsig(const ksiginfo_t *, const sigset_t *);

/*
 * Signal trampoline symbols defined elsewhere.  linux_rt_sendsig() uses the
 * distance between the two to locate the rt trampoline.
 */
extern char linux_sigcode[], linux_rt_sigcode[];
121
122 /*
123 * Deal with some i386-specific things in the Linux emulation code.
124 */
125
126 void
127 linux_setregs(struct lwp *l, struct exec_package *epp, vaddr_t stack)
128 {
129 struct pcb *pcb = lwp_getpcb(l);
130 struct trapframe *tf;
131
132 #if NNPX > 0
133 /* If we were using the FPU, forget about it. */
134 if (npxproc == l)
135 npxdrop();
136 #endif
137
138 #ifdef USER_LDT
139 pmap_ldt_cleanup(l);
140 #endif
141
142 l->l_md.md_flags &= ~MDL_USEDFPU;
143
144 if (i386_use_fxsave) {
145 pcb->pcb_savefpu.sv_xmm.sv_env.en_cw = __Linux_NPXCW__;
146 pcb->pcb_savefpu.sv_xmm.sv_env.en_mxcsr = __INITIAL_MXCSR__;
147 } else
148 pcb->pcb_savefpu.sv_87.sv_env.en_cw = __Linux_NPXCW__;
149
150 tf = l->l_md.md_regs;
151 tf->tf_gs = 0;
152 tf->tf_fs = GSEL(GUDATA_SEL, SEL_UPL);
153 tf->tf_es = GSEL(GUDATA_SEL, SEL_UPL);
154 tf->tf_ds = GSEL(GUDATA_SEL, SEL_UPL);
155 tf->tf_edi = 0;
156 tf->tf_esi = 0;
157 tf->tf_ebp = 0;
158 tf->tf_ebx = l->l_proc->p_psstrp;
159 tf->tf_edx = 0;
160 tf->tf_ecx = 0;
161 tf->tf_eax = 0;
162 tf->tf_eip = epp->ep_entry;
163 tf->tf_cs = GSEL(GUCODEBIG_SEL, SEL_UPL);
164 tf->tf_eflags = PSL_USERSET;
165 tf->tf_esp = stack;
166 tf->tf_ss = GSEL(GUDATA_SEL, SEL_UPL);
167 }
168
/*
 * Send a signal to the current process.
 *
 * The user stack is set up so that the signal trampoline (sigcode)
 * calls the handler and then makes a system call to the sigreturn
 * routine below.  After sigreturn has restored the signal mask, the
 * stack, and the frame pointer, it returns to the user-specified
 * pc and psl.
 */
179
180 void
181 linux_sendsig(const ksiginfo_t *ksi, const sigset_t *mask)
182 {
183 if (SIGACTION(curproc, ksi->ksi_signo).sa_flags & SA_SIGINFO)
184 linux_rt_sendsig(ksi, mask);
185 else
186 linux_old_sendsig(ksi, mask);
187 }
188
189
190 static void
191 linux_save_ucontext(struct lwp *l, struct trapframe *tf, const sigset_t *mask, struct sigaltstack *sas, struct linux_ucontext *uc)
192 {
193 uc->uc_flags = 0;
194 uc->uc_link = NULL;
195 native_to_linux_sigaltstack(&uc->uc_stack, sas);
196 linux_save_sigcontext(l, tf, mask, &uc->uc_mcontext);
197 native_to_linux_sigset(&uc->uc_sigmask, mask);
198 (void)memset(&uc->uc_fpregs_mem, 0, sizeof(uc->uc_fpregs_mem));
199 }
200
201 static void
202 linux_save_sigcontext(struct lwp *l, struct trapframe *tf,
203 const sigset_t *mask, struct linux_sigcontext *sc)
204 {
205 struct pcb *pcb = lwp_getpcb(l);
206
207 /* Save register context. */
208 #ifdef VM86
209 if (tf->tf_eflags & PSL_VM) {
210 sc->sc_gs = tf->tf_vm86_gs;
211 sc->sc_fs = tf->tf_vm86_fs;
212 sc->sc_es = tf->tf_vm86_es;
213 sc->sc_ds = tf->tf_vm86_ds;
214 sc->sc_eflags = get_vflags(l);
215 } else
216 #endif
217 {
218 sc->sc_gs = tf->tf_gs;
219 sc->sc_fs = tf->tf_fs;
220 sc->sc_es = tf->tf_es;
221 sc->sc_ds = tf->tf_ds;
222 sc->sc_eflags = tf->tf_eflags;
223 }
224 sc->sc_edi = tf->tf_edi;
225 sc->sc_esi = tf->tf_esi;
226 sc->sc_esp = tf->tf_esp;
227 sc->sc_ebp = tf->tf_ebp;
228 sc->sc_ebx = tf->tf_ebx;
229 sc->sc_edx = tf->tf_edx;
230 sc->sc_ecx = tf->tf_ecx;
231 sc->sc_eax = tf->tf_eax;
232 sc->sc_eip = tf->tf_eip;
233 sc->sc_cs = tf->tf_cs;
234 sc->sc_esp_at_signal = tf->tf_esp;
235 sc->sc_ss = tf->tf_ss;
236 sc->sc_err = tf->tf_err;
237 sc->sc_trapno = tf->tf_trapno;
238 sc->sc_cr2 = pcb->pcb_cr2;
239 sc->sc_387 = NULL;
240
241 /* Save signal stack. */
242 /* Linux doesn't save the onstack flag in sigframe */
243
244 /* Save signal mask. */
245 native_to_linux_old_sigset(&sc->sc_mask, mask);
246 }
247
248 static void
249 linux_rt_sendsig(const ksiginfo_t *ksi, const sigset_t *mask)
250 {
251 struct lwp *l = curlwp;
252 struct proc *p = l->l_proc;
253 struct trapframe *tf;
254 struct linux_rt_sigframe *fp, frame;
255 int onstack, error;
256 int sig = ksi->ksi_signo;
257 sig_t catcher = SIGACTION(p, sig).sa_handler;
258 struct sigaltstack *sas = &l->l_sigstk;
259
260 tf = l->l_md.md_regs;
261 /* Do we need to jump onto the signal stack? */
262 onstack = (sas->ss_flags & (SS_DISABLE | SS_ONSTACK)) == 0 &&
263 (SIGACTION(p, sig).sa_flags & SA_ONSTACK) != 0;
264
265
266 /* Allocate space for the signal handler context. */
267 if (onstack)
268 fp = (struct linux_rt_sigframe *)((char *)sas->ss_sp +
269 sas->ss_size);
270 else
271 fp = (struct linux_rt_sigframe *)tf->tf_esp;
272 fp--;
273
274 DPRINTF(("rt: onstack = %d, fp = %p sig = %d eip = 0x%x cr2 = 0x%x\n",
275 onstack, fp, sig, tf->tf_eip,
276 ((struct pcb *)lwp_getpcb(l))->pcb_cr2));
277
278 /* Build stack frame for signal trampoline. */
279 frame.sf_handler = catcher;
280 frame.sf_sig = native_to_linux_signo[sig];
281 frame.sf_sip = &fp->sf_si;
282 frame.sf_ucp = &fp->sf_uc;
283
284 /*
285 * XXX: the following code assumes that the constants for
286 * siginfo are the same between linux and NetBSD.
287 */
288 native_to_linux_siginfo(&frame.sf_si, &ksi->ksi_info);
289
290 /* Save register context. */
291 linux_save_ucontext(l, tf, mask, sas, &frame.sf_uc);
292 sendsig_reset(l, sig);
293
294 mutex_exit(p->p_lock);
295 error = copyout(&frame, fp, sizeof(frame));
296 mutex_enter(p->p_lock);
297
298 if (error != 0) {
299 /*
300 * Process has trashed its stack; give it an illegal
301 * instruction to halt it in its tracks.
302 */
303 sigexit(l, SIGILL);
304 /* NOTREACHED */
305 }
306
307 /*
308 * Build context to run handler in.
309 */
310 tf->tf_fs = GSEL(GUDATA_SEL, SEL_UPL);
311 tf->tf_es = GSEL(GUDATA_SEL, SEL_UPL);
312 tf->tf_ds = GSEL(GUDATA_SEL, SEL_UPL);
313 tf->tf_eip = ((int)p->p_sigctx.ps_sigcode) +
314 (linux_rt_sigcode - linux_sigcode);
315 tf->tf_cs = GSEL(GUCODE_SEL, SEL_UPL);
316 tf->tf_eflags &= ~PSL_CLEARSIG;
317 tf->tf_esp = (int)fp;
318 tf->tf_ss = GSEL(GUDATA_SEL, SEL_UPL);
319
320 /* Remember that we're now on the signal stack. */
321 if (onstack)
322 sas->ss_flags |= SS_ONSTACK;
323 }
324
325 static void
326 linux_old_sendsig(const ksiginfo_t *ksi, const sigset_t *mask)
327 {
328 struct lwp *l = curlwp;
329 struct proc *p = l->l_proc;
330 struct trapframe *tf;
331 struct linux_sigframe *fp, frame;
332 int onstack, error;
333 int sig = ksi->ksi_signo;
334 sig_t catcher = SIGACTION(p, sig).sa_handler;
335 struct sigaltstack *sas = &l->l_sigstk;
336
337 tf = l->l_md.md_regs;
338
339 /* Do we need to jump onto the signal stack? */
340 onstack = (sas->ss_flags & (SS_DISABLE | SS_ONSTACK)) == 0 &&
341 (SIGACTION(p, sig).sa_flags & SA_ONSTACK) != 0;
342
343 /* Allocate space for the signal handler context. */
344 if (onstack)
345 fp = (struct linux_sigframe *) ((char *)sas->ss_sp +
346 sas->ss_size);
347 else
348 fp = (struct linux_sigframe *)tf->tf_esp;
349 fp--;
350
351 DPRINTF(("old: onstack = %d, fp = %p sig = %d eip = 0x%x cr2 = 0x%x\n",
352 onstack, fp, sig, tf->tf_eip,
353 ((struct pcb *)lwp_getpcb(l))->pcb_cr2));
354
355 /* Build stack frame for signal trampoline. */
356 frame.sf_handler = catcher;
357 frame.sf_sig = native_to_linux_signo[sig];
358
359 linux_save_sigcontext(l, tf, mask, &frame.sf_sc);
360 sendsig_reset(l, sig);
361
362 mutex_exit(p->p_lock);
363 error = copyout(&frame, fp, sizeof(frame));
364 mutex_enter(p->p_lock);
365
366 if (error != 0) {
367 /*
368 * Process has trashed its stack; give it an illegal
369 * instruction to halt it in its tracks.
370 */
371 sigexit(l, SIGILL);
372 /* NOTREACHED */
373 }
374
375 /*
376 * Build context to run handler in.
377 */
378 tf->tf_fs = GSEL(GUDATA_SEL, SEL_UPL);
379 tf->tf_es = GSEL(GUDATA_SEL, SEL_UPL);
380 tf->tf_ds = GSEL(GUDATA_SEL, SEL_UPL);
381 tf->tf_eip = (int)p->p_sigctx.ps_sigcode;
382 tf->tf_cs = GSEL(GUCODEBIG_SEL, SEL_UPL);
383 tf->tf_eflags &= ~PSL_CLEARSIG;
384 tf->tf_esp = (int)fp;
385 tf->tf_ss = GSEL(GUDATA_SEL, SEL_UPL);
386
387 /* Remember that we're now on the signal stack. */
388 if (onstack)
389 sas->ss_flags |= SS_ONSTACK;
390 }
391
392 /*
393 * System call to cleanup state after a signal
394 * has been taken. Reset signal mask and
395 * stack state from context left by sendsig (above).
396 * Return to previous pc and psl as specified by
397 * context left by sendsig. Check carefully to
398 * make sure that the user has not modified the
399 * psl to gain improper privileges or to cause
400 * a machine fault.
401 */
402 int
403 linux_sys_rt_sigreturn(struct lwp *l, const struct linux_sys_rt_sigreturn_args *uap, register_t *retval)
404 {
405 /* {
406 syscallarg(struct linux_ucontext *) ucp;
407 } */
408 struct linux_ucontext context, *ucp = SCARG(uap, ucp);
409 int error;
410
411 /*
412 * The trampoline code hands us the context.
413 * It is unsafe to keep track of it ourselves, in the event that a
414 * program jumps out of a signal handler.
415 */
416 if ((error = copyin(ucp, &context, sizeof(*ucp))) != 0)
417 return error;
418
419 /* XXX XAX we can do better here by using more of the ucontext */
420 return linux_restore_sigcontext(l, &context.uc_mcontext, retval);
421 }
422
423 int
424 linux_sys_sigreturn(struct lwp *l, const struct linux_sys_sigreturn_args *uap, register_t *retval)
425 {
426 /* {
427 syscallarg(struct linux_sigcontext *) scp;
428 } */
429 struct linux_sigcontext context, *scp = SCARG(uap, scp);
430 int error;
431
432 /*
433 * The trampoline code hands us the context.
434 * It is unsafe to keep track of it ourselves, in the event that a
435 * program jumps out of a signal handler.
436 */
437 if ((error = copyin((void *)scp, &context, sizeof(*scp))) != 0)
438 return error;
439 return linux_restore_sigcontext(l, &context, retval);
440 }
441
442 static int
443 linux_restore_sigcontext(struct lwp *l, struct linux_sigcontext *scp,
444 register_t *retval)
445 {
446 struct proc *p = l->l_proc;
447 struct sigaltstack *sas = &l->l_sigstk;
448 struct trapframe *tf;
449 sigset_t mask;
450 ssize_t ss_gap;
451
452 /* Restore register context. */
453 tf = l->l_md.md_regs;
454 DPRINTF(("sigreturn enter esp=0x%x eip=0x%x\n", tf->tf_esp, tf->tf_eip));
455
456 #ifdef VM86
457 if (scp->sc_eflags & PSL_VM) {
458 void syscall_vm86(struct trapframe *);
459
460 tf->tf_vm86_gs = scp->sc_gs;
461 tf->tf_vm86_fs = scp->sc_fs;
462 tf->tf_vm86_es = scp->sc_es;
463 tf->tf_vm86_ds = scp->sc_ds;
464 set_vflags(l, scp->sc_eflags);
465 p->p_md.md_syscall = syscall_vm86;
466 } else
467 #endif
468 {
469 /*
470 * Check for security violations. If we're returning to
471 * protected mode, the CPU will validate the segment registers
472 * automatically and generate a trap on violations. We handle
473 * the trap, rather than doing all of the checking here.
474 */
475 if (((scp->sc_eflags ^ tf->tf_eflags) & PSL_USERSTATIC) != 0 ||
476 !USERMODE(scp->sc_cs, scp->sc_eflags))
477 return EINVAL;
478
479 tf->tf_gs = scp->sc_gs;
480 tf->tf_fs = scp->sc_fs;
481 tf->tf_es = scp->sc_es;
482 tf->tf_ds = scp->sc_ds;
483 #ifdef VM86
484 if (tf->tf_eflags & PSL_VM)
485 (*p->p_emul->e_syscall_intern)(p);
486 #endif
487 tf->tf_eflags = scp->sc_eflags;
488 }
489 tf->tf_edi = scp->sc_edi;
490 tf->tf_esi = scp->sc_esi;
491 tf->tf_ebp = scp->sc_ebp;
492 tf->tf_ebx = scp->sc_ebx;
493 tf->tf_edx = scp->sc_edx;
494 tf->tf_ecx = scp->sc_ecx;
495 tf->tf_eax = scp->sc_eax;
496 tf->tf_eip = scp->sc_eip;
497 tf->tf_cs = scp->sc_cs;
498 tf->tf_esp = scp->sc_esp_at_signal;
499 tf->tf_ss = scp->sc_ss;
500
501 /* Restore signal stack. */
502 /*
503 * Linux really does it this way; it doesn't have space in sigframe
504 * to save the onstack flag.
505 */
506 mutex_enter(p->p_lock);
507 ss_gap = (ssize_t)((char *)scp->sc_esp_at_signal - (char *)sas->ss_sp);
508 if (ss_gap >= 0 && ss_gap < sas->ss_size)
509 sas->ss_flags |= SS_ONSTACK;
510 else
511 sas->ss_flags &= ~SS_ONSTACK;
512
513 /* Restore signal mask. */
514 linux_old_to_native_sigset(&mask, &scp->sc_mask);
515 (void) sigprocmask1(l, SIG_SETMASK, &mask, 0);
516 mutex_exit(p->p_lock);
517
518 DPRINTF(("sigreturn exit esp=0x%x eip=0x%x\n", tf->tf_esp, tf->tf_eip));
519 return EJUSTRETURN;
520 }
521
522 #ifdef USER_LDT
523
524 static int
525 linux_read_ldt(struct lwp *l, const struct linux_sys_modify_ldt_args *uap,
526 register_t *retval)
527 {
528 struct x86_get_ldt_args gl;
529 int error;
530 union descriptor *ldt_buf;
531 size_t sz;
532
533 /*
534 * I've checked the linux code - this function is asymetric with
535 * linux_write_ldt, and returns raw ldt entries.
536 * NB, the code I saw zerod the spare parts of the user buffer.
537 */
538
539 DPRINTF(("linux_read_ldt!"));
540
541 sz = 8192 * sizeof(*ldt_buf);
542 ldt_buf = kmem_zalloc(sz, KM_SLEEP);
543 gl.start = 0;
544 gl.desc = NULL;
545 gl.num = SCARG(uap, bytecount) / sizeof(union descriptor);
546 error = x86_get_ldt1(l, &gl, ldt_buf);
547 /* NB gl.num might have changed */
548 if (error == 0) {
549 *retval = gl.num * sizeof *ldt;
550 error = copyout(ldt_buf, SCARG(uap, ptr),
551 gl.num * sizeof *ldt_buf);
552 }
553 kmem_free(ldt_buf, sz);
554
555 return error;
556 }
557
/*
 * Argument block for the modify_ldt write operations.
 * linux_write_ldt() copies exactly sizeof(struct linux_ldt_info) bytes
 * of it in from user space and turns it into a segment descriptor.
 */
struct linux_ldt_info {
	u_int entry_number;		/* LDT slot to write */
	u_long base_addr;		/* segment base */
	u_int limit;			/* segment limit */
	u_int seg_32bit:1;		/* becomes sd_def32 */
	u_int contents:2;		/* shifted into bits 2-3 of sd_type */
	u_int read_exec_only:1;		/* inverted into bit 1 of sd_type */
	u_int limit_in_pages:1;		/* becomes sd_gran */
	u_int seg_not_present:1;	/* inverted into sd_p */
	u_int useable:1;		/* becomes sd_xx (new mode only) */
};
569
570 static int
571 linux_write_ldt(struct lwp *l, const struct linux_sys_modify_ldt_args *uap,
572 int oldmode)
573 {
574 struct linux_ldt_info ldt_info;
575 union descriptor d;
576 struct x86_set_ldt_args sl;
577 int error;
578
579 DPRINTF(("linux_write_ldt %d\n", oldmode));
580 if (SCARG(uap, bytecount) != sizeof(ldt_info))
581 return (EINVAL);
582 if ((error = copyin(SCARG(uap, ptr), &ldt_info, sizeof(ldt_info))) != 0)
583 return error;
584 if (ldt_info.entry_number >= 8192)
585 return (EINVAL);
586 if (ldt_info.contents == 3) {
587 if (oldmode)
588 return (EINVAL);
589 if (ldt_info.seg_not_present)
590 return (EINVAL);
591 }
592
593 if (ldt_info.base_addr == 0 && ldt_info.limit == 0 &&
594 (oldmode || (ldt_info.contents == 0 &&
595 ldt_info.read_exec_only == 1 && ldt_info.seg_32bit == 0 &&
596 ldt_info.limit_in_pages == 0 && ldt_info.seg_not_present == 1 &&
597 ldt_info.useable == 0))) {
598 /* this means you should zero the ldt */
599 (void)memset(&d, 0, sizeof(d));
600 } else {
601 d.sd.sd_lobase = ldt_info.base_addr & 0xffffff;
602 d.sd.sd_hibase = (ldt_info.base_addr >> 24) & 0xff;
603 d.sd.sd_lolimit = ldt_info.limit & 0xffff;
604 d.sd.sd_hilimit = (ldt_info.limit >> 16) & 0xf;
605 d.sd.sd_type = 16 | (ldt_info.contents << 2) |
606 (!ldt_info.read_exec_only << 1);
607 d.sd.sd_dpl = SEL_UPL;
608 d.sd.sd_p = !ldt_info.seg_not_present;
609 d.sd.sd_def32 = ldt_info.seg_32bit;
610 d.sd.sd_gran = ldt_info.limit_in_pages;
611 if (!oldmode)
612 d.sd.sd_xx = ldt_info.useable;
613 else
614 d.sd.sd_xx = 0;
615 }
616 sl.start = ldt_info.entry_number;
617 sl.desc = NULL;
618 sl.num = 1;
619
620 DPRINTF(("linux_write_ldt: idx=%d, base=0x%lx, limit=0x%x\n",
621 ldt_info.entry_number, ldt_info.base_addr, ldt_info.limit));
622
623 return x86_set_ldt1(l, &sl, &d);
624 }
625
626 #endif /* USER_LDT */
627
628 int
629 linux_sys_modify_ldt(struct lwp *l, const struct linux_sys_modify_ldt_args *uap, register_t *retval)
630 {
631 /* {
632 syscallarg(int) func;
633 syscallarg(void *) ptr;
634 syscallarg(size_t) bytecount;
635 } */
636
637 switch (SCARG(uap, func)) {
638 #ifdef USER_LDT
639 case 0:
640 return linux_read_ldt(l, (const void *)uap, retval);
641 case 1:
642 return linux_write_ldt(l, (const void *)uap, 1);
643 case 2:
644 #ifdef notyet
645 return linux_read_default_ldt(l, (const void *)uap, retval);
646 #else
647 return (ENOSYS);
648 #endif
649 case 0x11:
650 return linux_write_ldt(l, (const void *)uap, 0);
651 #endif /* USER_LDT */
652
653 default:
654 return (ENOSYS);
655 }
656 }
657
658 /*
659 * XXX Pathetic hack to make svgalib work. This will fake the major
660 * device number of an opened VT so that svgalib likes it. grmbl.
661 * Should probably do it 'wrong the right way' and use a mapping
662 * array for all major device numbers, and map linux_mknod too.
663 */
664 dev_t
665 linux_fakedev(dev_t dev, int raw)
666 {
667 extern const struct cdevsw ptc_cdevsw, pts_cdevsw;
668 const struct cdevsw *cd = cdevsw_lookup(dev);
669
670 if (raw) {
671 #if (NWSDISPLAY > 0)
672 extern const struct cdevsw wsdisplay_cdevsw;
673 if (cd == &wsdisplay_cdevsw)
674 return makedev(LINUX_CONS_MAJOR, (minor(dev) + 1));
675 #endif
676 }
677
678 if (cd == &ptc_cdevsw)
679 return makedev(LINUX_PTC_MAJOR, minor(dev));
680 if (cd == &pts_cdevsw)
681 return makedev(LINUX_PTS_MAJOR, minor(dev));
682
683 return dev;
684 }
685
686 #if (NWSDISPLAY > 0)
687 /*
688 * That's not complete, but enough to get an X server running.
689 */
/* Number of entries in each of the key maps below. */
#define NR_KEYS 128
/*
 * Fixed key maps returned by the LINUX_KDGKBENT ioctl: one u_short per
 * key index for the plain, shift, altgr and ctrl tables.
 * NOTE(review): the values look like Linux keymap entries (type in the
 * high byte, value in the low byte) - confirm against the Linux keymap
 * definitions before editing.
 */
static const u_short plain_map[NR_KEYS] = {
	0x0200, 0x001b, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036,
	0x0037, 0x0038, 0x0039, 0x0030, 0x002d, 0x003d, 0x007f, 0x0009,
	0x0b71, 0x0b77, 0x0b65, 0x0b72, 0x0b74, 0x0b79, 0x0b75, 0x0b69,
	0x0b6f, 0x0b70, 0x005b, 0x005d, 0x0201, 0x0702, 0x0b61, 0x0b73,
	0x0b64, 0x0b66, 0x0b67, 0x0b68, 0x0b6a, 0x0b6b, 0x0b6c, 0x003b,
	0x0027, 0x0060, 0x0700, 0x005c, 0x0b7a, 0x0b78, 0x0b63, 0x0b76,
	0x0b62, 0x0b6e, 0x0b6d, 0x002c, 0x002e, 0x002f, 0x0700, 0x030c,
	0x0703, 0x0020, 0x0207, 0x0100, 0x0101, 0x0102, 0x0103, 0x0104,
	0x0105, 0x0106, 0x0107, 0x0108, 0x0109, 0x0208, 0x0209, 0x0307,
	0x0308, 0x0309, 0x030b, 0x0304, 0x0305, 0x0306, 0x030a, 0x0301,
	0x0302, 0x0303, 0x0300, 0x0310, 0x0206, 0x0200, 0x003c, 0x010a,
	0x010b, 0x0200, 0x0200, 0x0200, 0x0200, 0x0200, 0x0200, 0x0200,
	0x030e, 0x0702, 0x030d, 0x001c, 0x0701, 0x0205, 0x0114, 0x0603,
	0x0118, 0x0601, 0x0602, 0x0117, 0x0600, 0x0119, 0x0115, 0x0116,
	0x011a, 0x010c, 0x010d, 0x011b, 0x011c, 0x0110, 0x0311, 0x011d,
	0x0200, 0x0200, 0x0200, 0x0200, 0x0200, 0x0200, 0x0200, 0x0200,
}, shift_map[NR_KEYS] = {
	0x0200, 0x001b, 0x0021, 0x0040, 0x0023, 0x0024, 0x0025, 0x005e,
	0x0026, 0x002a, 0x0028, 0x0029, 0x005f, 0x002b, 0x007f, 0x0009,
	0x0b51, 0x0b57, 0x0b45, 0x0b52, 0x0b54, 0x0b59, 0x0b55, 0x0b49,
	0x0b4f, 0x0b50, 0x007b, 0x007d, 0x0201, 0x0702, 0x0b41, 0x0b53,
	0x0b44, 0x0b46, 0x0b47, 0x0b48, 0x0b4a, 0x0b4b, 0x0b4c, 0x003a,
	0x0022, 0x007e, 0x0700, 0x007c, 0x0b5a, 0x0b58, 0x0b43, 0x0b56,
	0x0b42, 0x0b4e, 0x0b4d, 0x003c, 0x003e, 0x003f, 0x0700, 0x030c,
	0x0703, 0x0020, 0x0207, 0x010a, 0x010b, 0x010c, 0x010d, 0x010e,
	0x010f, 0x0110, 0x0111, 0x0112, 0x0113, 0x0213, 0x0203, 0x0307,
	0x0308, 0x0309, 0x030b, 0x0304, 0x0305, 0x0306, 0x030a, 0x0301,
	0x0302, 0x0303, 0x0300, 0x0310, 0x0206, 0x0200, 0x003e, 0x010a,
	0x010b, 0x0200, 0x0200, 0x0200, 0x0200, 0x0200, 0x0200, 0x0200,
	0x030e, 0x0702, 0x030d, 0x0200, 0x0701, 0x0205, 0x0114, 0x0603,
	0x020b, 0x0601, 0x0602, 0x0117, 0x0600, 0x020a, 0x0115, 0x0116,
	0x011a, 0x010c, 0x010d, 0x011b, 0x011c, 0x0110, 0x0311, 0x011d,
	0x0200, 0x0200, 0x0200, 0x0200, 0x0200, 0x0200, 0x0200, 0x0200,
}, altgr_map[NR_KEYS] = {
	0x0200, 0x0200, 0x0200, 0x0040, 0x0200, 0x0024, 0x0200, 0x0200,
	0x007b, 0x005b, 0x005d, 0x007d, 0x005c, 0x0200, 0x0200, 0x0200,
	0x0b71, 0x0b77, 0x0918, 0x0b72, 0x0b74, 0x0b79, 0x0b75, 0x0b69,
	0x0b6f, 0x0b70, 0x0200, 0x007e, 0x0201, 0x0702, 0x0914, 0x0b73,
	0x0917, 0x0919, 0x0b67, 0x0b68, 0x0b6a, 0x0b6b, 0x0b6c, 0x0200,
	0x0200, 0x0200, 0x0700, 0x0200, 0x0b7a, 0x0b78, 0x0916, 0x0b76,
	0x0915, 0x0b6e, 0x0b6d, 0x0200, 0x0200, 0x0200, 0x0700, 0x030c,
	0x0703, 0x0200, 0x0207, 0x050c, 0x050d, 0x050e, 0x050f, 0x0510,
	0x0511, 0x0512, 0x0513, 0x0514, 0x0515, 0x0208, 0x0202, 0x0911,
	0x0912, 0x0913, 0x030b, 0x090e, 0x090f, 0x0910, 0x030a, 0x090b,
	0x090c, 0x090d, 0x090a, 0x0310, 0x0206, 0x0200, 0x007c, 0x0516,
	0x0517, 0x0200, 0x0200, 0x0200, 0x0200, 0x0200, 0x0200, 0x0200,
	0x030e, 0x0702, 0x030d, 0x0200, 0x0701, 0x0205, 0x0114, 0x0603,
	0x0118, 0x0601, 0x0602, 0x0117, 0x0600, 0x0119, 0x0115, 0x0116,
	0x011a, 0x010c, 0x010d, 0x011b, 0x011c, 0x0110, 0x0311, 0x011d,
	0x0200, 0x0200, 0x0200, 0x0200, 0x0200, 0x0200, 0x0200, 0x0200,
}, ctrl_map[NR_KEYS] = {
	0x0200, 0x0200, 0x0200, 0x0000, 0x001b, 0x001c, 0x001d, 0x001e,
	0x001f, 0x007f, 0x0200, 0x0200, 0x001f, 0x0200, 0x0008, 0x0200,
	0x0011, 0x0017, 0x0005, 0x0012, 0x0014, 0x0019, 0x0015, 0x0009,
	0x000f, 0x0010, 0x001b, 0x001d, 0x0201, 0x0702, 0x0001, 0x0013,
	0x0004, 0x0006, 0x0007, 0x0008, 0x000a, 0x000b, 0x000c, 0x0200,
	0x0007, 0x0000, 0x0700, 0x001c, 0x001a, 0x0018, 0x0003, 0x0016,
	0x0002, 0x000e, 0x000d, 0x0200, 0x020e, 0x007f, 0x0700, 0x030c,
	0x0703, 0x0000, 0x0207, 0x0100, 0x0101, 0x0102, 0x0103, 0x0104,
	0x0105, 0x0106, 0x0107, 0x0108, 0x0109, 0x0208, 0x0204, 0x0307,
	0x0308, 0x0309, 0x030b, 0x0304, 0x0305, 0x0306, 0x030a, 0x0301,
	0x0302, 0x0303, 0x0300, 0x0310, 0x0206, 0x0200, 0x0200, 0x010a,
	0x010b, 0x0200, 0x0200, 0x0200, 0x0200, 0x0200, 0x0200, 0x0200,
	0x030e, 0x0702, 0x030d, 0x001c, 0x0701, 0x0205, 0x0114, 0x0603,
	0x0118, 0x0601, 0x0602, 0x0117, 0x0600, 0x0119, 0x0115, 0x0116,
	0x011a, 0x010c, 0x010d, 0x011b, 0x011c, 0x0110, 0x0311, 0x011d,
	0x0200, 0x0200, 0x0200, 0x0200, 0x0200, 0x0200, 0x0200, 0x0200,
};

/*
 * Key maps indexed by the kb_table value of a LINUX_KDGKBENT request.
 * altgr_map is listed at both index 2 and index 3.
 */
const u_short * const linux_keytabs[] = {
	plain_map, shift_map, altgr_map, altgr_map, ctrl_map
};
764 #endif
765
766 static struct biosdisk_info *
767 fd2biosinfo(struct proc *p, struct file *fp)
768 {
769 struct vnode *vp;
770 const char *blkname;
771 char diskname[16];
772 int i;
773 struct nativedisk_info *nip;
774 struct disklist *dl = x86_alldisks;
775
776 if (fp->f_type != DTYPE_VNODE)
777 return NULL;
778 vp = (struct vnode *)fp->f_data;
779
780 if (vp->v_type != VBLK)
781 return NULL;
782
783 blkname = devsw_blk2name(major(vp->v_rdev));
784 snprintf(diskname, sizeof diskname, "%s%llu", blkname,
785 (unsigned long long)DISKUNIT(vp->v_rdev));
786
787 for (i = 0; i < dl->dl_nnativedisks; i++) {
788 nip = &dl->dl_nativedisks[i];
789 if (strcmp(diskname, nip->ni_devname))
790 continue;
791 if (nip->ni_nmatches != 0)
792 return &dl->dl_biosdisks[nip->ni_biosmatches[0]];
793 }
794
795 return NULL;
796 }
797
798
799 /*
800 * We come here in a last attempt to satisfy a Linux ioctl() call
801 */
802 int
803 linux_machdepioctl(struct lwp *l, const struct linux_sys_ioctl_args *uap, register_t *retval)
804 {
805 /* {
806 syscallarg(int) fd;
807 syscallarg(u_long) com;
808 syscallarg(void *) data;
809 } */
810 struct sys_ioctl_args bia;
811 u_long com;
812 int error, error1;
813 #if (NWSDISPLAY > 0)
814 struct vt_mode lvt;
815 struct kbentry kbe;
816 #endif
817 struct linux_hd_geometry hdg;
818 struct linux_hd_big_geometry hdg_big;
819 struct biosdisk_info *bip;
820 file_t *fp;
821 int fd;
822 struct disklabel label, *labp;
823 struct partinfo partp;
824 int (*ioctlf)(struct file *, u_long, void *);
825 u_long start, biostotal, realtotal;
826 u_char heads, sectors;
827 u_int cylinders;
828 struct ioctl_pt pt;
829
830 fd = SCARG(uap, fd);
831 SCARG(&bia, fd) = fd;
832 SCARG(&bia, data) = SCARG(uap, data);
833 com = SCARG(uap, com);
834
835 if ((fp = fd_getfile(fd)) == NULL)
836 return (EBADF);
837
838 switch (com) {
839 #if (NWSDISPLAY > 0)
840 case LINUX_KDGKBMODE:
841 com = KDGKBMODE;
842 break;
843 case LINUX_KDSKBMODE:
844 com = KDSKBMODE;
845 if ((unsigned)SCARG(uap, data) == LINUX_K_MEDIUMRAW)
846 SCARG(&bia, data) = (void *)K_RAW;
847 break;
848 case LINUX_KIOCSOUND:
849 SCARG(&bia, data) =
850 (void *)(((unsigned long)SCARG(&bia, data)) & 0xffff);
851 /* fall through */
852 case LINUX_KDMKTONE:
853 com = KDMKTONE;
854 break;
855 case LINUX_KDSETMODE:
856 com = KDSETMODE;
857 break;
858 case LINUX_KDGETMODE:
859 /* KD_* values are equal to the wscons numbers */
860 com = WSDISPLAYIO_GMODE;
861 break;
862 case LINUX_KDENABIO:
863 com = KDENABIO;
864 break;
865 case LINUX_KDDISABIO:
866 com = KDDISABIO;
867 break;
868 case LINUX_KDGETLED:
869 com = KDGETLED;
870 break;
871 case LINUX_KDSETLED:
872 com = KDSETLED;
873 break;
874 case LINUX_VT_OPENQRY:
875 com = VT_OPENQRY;
876 break;
877 case LINUX_VT_GETMODE:
878 error = fp->f_ops->fo_ioctl(fp, VT_GETMODE, &lvt);
879 if (error != 0)
880 goto out;
881 lvt.relsig = native_to_linux_signo[lvt.relsig];
882 lvt.acqsig = native_to_linux_signo[lvt.acqsig];
883 lvt.frsig = native_to_linux_signo[lvt.frsig];
884 error = copyout(&lvt, SCARG(uap, data), sizeof (lvt));
885 goto out;
886 case LINUX_VT_SETMODE:
887 error = copyin(SCARG(uap, data), &lvt, sizeof (lvt));
888 if (error != 0)
889 goto out;
890 lvt.relsig = linux_to_native_signo[lvt.relsig];
891 lvt.acqsig = linux_to_native_signo[lvt.acqsig];
892 lvt.frsig = linux_to_native_signo[lvt.frsig];
893 error = fp->f_ops->fo_ioctl(fp, VT_SETMODE, &lvt);
894 goto out;
895 case LINUX_VT_DISALLOCATE:
896 /* XXX should use WSDISPLAYIO_DELSCREEN */
897 error = 0;
898 goto out;
899 case LINUX_VT_RELDISP:
900 com = VT_RELDISP;
901 break;
902 case LINUX_VT_ACTIVATE:
903 com = VT_ACTIVATE;
904 break;
905 case LINUX_VT_WAITACTIVE:
906 com = VT_WAITACTIVE;
907 break;
908 case LINUX_VT_GETSTATE:
909 com = VT_GETSTATE;
910 break;
911 case LINUX_KDGKBTYPE:
912 {
913 static const u_int8_t kb101 = KB_101;
914
915 /* This is what Linux does. */
916 error = copyout(&kb101, SCARG(uap, data), 1);
917 goto out;
918 }
919 case LINUX_KDGKBENT:
920 /*
921 * The Linux KDGKBENT ioctl is different from the
922 * SYSV original. So we handle it in machdep code.
923 * XXX We should use keyboard mapping information
924 * from wsdisplay, but this would be expensive.
925 */
926 if ((error = copyin(SCARG(uap, data), &kbe,
927 sizeof(struct kbentry))))
928 goto out;
929 if (kbe.kb_table >= sizeof(linux_keytabs) / sizeof(u_short *)
930 || kbe.kb_index >= NR_KEYS) {
931 error = EINVAL;
932 goto out;
933 }
934 kbe.kb_value = linux_keytabs[kbe.kb_table][kbe.kb_index];
935 error = copyout(&kbe, SCARG(uap, data),
936 sizeof(struct kbentry));
937 goto out;
938 #endif
939 case LINUX_HDIO_GETGEO:
940 case LINUX_HDIO_GETGEO_BIG:
941 /*
942 * Try to mimic Linux behaviour: return the BIOS geometry
943 * if possible (extending its # of cylinders if it's beyond
944 * the 1023 limit), fall back to the MI geometry (i.e.
945 * the real geometry) if not found, by returning an
946 * error. See common/linux_hdio.c
947 */
948 bip = fd2biosinfo(curproc, fp);
949 ioctlf = fp->f_ops->fo_ioctl;
950 error = ioctlf(fp, DIOCGDEFLABEL, (void *)&label);
951 error1 = ioctlf(fp, DIOCGPART, (void *)&partp);
952 if (error != 0 && error1 != 0) {
953 error = error1;
954 goto out;
955 }
956 labp = error != 0 ? &label : partp.disklab;
957 start = error1 != 0 ? partp.part->p_offset : 0;
958 if (bip != NULL && bip->bi_head != 0 && bip->bi_sec != 0
959 && bip->bi_cyl != 0) {
960 heads = bip->bi_head;
961 sectors = bip->bi_sec;
962 cylinders = bip->bi_cyl;
963 biostotal = heads * sectors * cylinders;
964 realtotal = labp->d_ntracks * labp->d_nsectors *
965 labp->d_ncylinders;
966 if (realtotal > biostotal)
967 cylinders = realtotal / (heads * sectors);
968 } else {
969 heads = labp->d_ntracks;
970 cylinders = labp->d_ncylinders;
971 sectors = labp->d_nsectors;
972 }
973 if (com == LINUX_HDIO_GETGEO) {
974 hdg.start = start;
975 hdg.heads = heads;
976 hdg.cylinders = cylinders;
977 hdg.sectors = sectors;
978 error = copyout(&hdg, SCARG(uap, data), sizeof hdg);
979 goto out;
980 } else {
981 hdg_big.start = start;
982 hdg_big.heads = heads;
983 hdg_big.cylinders = cylinders;
984 hdg_big.sectors = sectors;
985 error = copyout(&hdg_big, SCARG(uap, data),
986 sizeof hdg_big);
987 goto out;
988 }
989
990 default:
991 /*
992 * Unknown to us. If it's on a device, just pass it through
993 * using PTIOCLINUX, the device itself might be able to
994 * make some sense of it.
995 * XXX hack: if the function returns EJUSTRETURN,
996 * it has stuffed a sysctl return value in pt.data.
997 */
998 ioctlf = fp->f_ops->fo_ioctl;
999 pt.com = SCARG(uap, com);
1000 pt.data = SCARG(uap, data);
1001 error = ioctlf(fp, PTIOCLINUX, &pt);
1002 if (error == EJUSTRETURN) {
1003 retval[0] = (register_t)pt.data;
1004 error = 0;
1005 }
1006
1007 if (error == ENOTTY) {
1008 DPRINTF(("linux_machdepioctl: invalid ioctl %08lx\n",
1009 com));
1010 }
1011 goto out;
1012 }
1013 SCARG(&bia, com) = com;
1014 error = sys_ioctl(curlwp, &bia, retval);
1015 out:
1016 fd_putfile(fd);
1017 return error;
1018 }
1019
1020 /*
1021 * Set I/O permissions for a process. Just set the maximum level
1022 * right away (ignoring the argument), otherwise we would have
1023 * to rely on I/O permission maps, which are not implemented.
1024 */
1025 int
1026 linux_sys_iopl(struct lwp *l, const struct linux_sys_iopl_args *uap, register_t *retval)
1027 {
1028 /* {
1029 syscallarg(int) level;
1030 } */
1031 struct trapframe *fp = l->l_md.md_regs;
1032
1033 if (kauth_authorize_machdep(l->l_cred, KAUTH_MACHDEP_IOPL,
1034 NULL, NULL, NULL, NULL) != 0)
1035 return EPERM;
1036 fp->tf_eflags |= PSL_IOPL;
1037 *retval = 0;
1038 return 0;
1039 }
1040
1041 /*
1042 * See above. If a root process tries to set access to an I/O port,
1043 * just let it have the whole range.
1044 */
1045 int
1046 linux_sys_ioperm(struct lwp *l, const struct linux_sys_ioperm_args *uap, register_t *retval)
1047 {
1048 /* {
1049 syscallarg(unsigned int) lo;
1050 syscallarg(unsigned int) hi;
1051 syscallarg(int) val;
1052 } */
1053 struct trapframe *fp = l->l_md.md_regs;
1054
1055 if (kauth_authorize_machdep(l->l_cred, SCARG(uap, val) ?
1056 KAUTH_MACHDEP_IOPERM_SET : KAUTH_MACHDEP_IOPERM_GET, NULL, NULL,
1057 NULL, NULL) != 0)
1058 return EPERM;
1059 if (SCARG(uap, val))
1060 fp->tf_eflags |= PSL_IOPL;
1061 *retval = 0;
1062 return 0;
1063 }
1064
1065 int
1066 linux_usertrap(struct lwp *l, vaddr_t trapaddr,
1067 void *arg)
1068 {
1069 return 0;
1070 }
1071
1072 const char *
1073 linux_get_uname_arch(void)
1074 {
1075 static char uname_arch[5] = "i386";
1076
1077 if (uname_arch[1] == '3')
1078 uname_arch[1] += cpu_class;
1079 return uname_arch;
1080 }
1081