1 /* $NetBSD: machdep.c,v 1.376 2025/04/30 15:30:53 imil Exp $ */ 2 3 /* 4 * Copyright (c) 1996, 1997, 1998, 2000, 2006, 2007, 2008, 2011 5 * The NetBSD Foundation, Inc. 6 * All rights reserved. 7 * 8 * This code is derived from software contributed to The NetBSD Foundation 9 * by Charles M. Hannum and by Jason R. Thorpe of the Numerical Aerospace 10 * Simulation Facility, NASA Ames Research Center. 11 * 12 * This code is derived from software contributed to The NetBSD Foundation 13 * by Coyote Point Systems, Inc. which was written under contract to Coyote 14 * Point by Jed Davis and Devon O'Dell. 15 * 16 * Redistribution and use in source and binary forms, with or without 17 * modification, are permitted provided that the following conditions 18 * are met: 19 * 1. Redistributions of source code must retain the above copyright 20 * notice, this list of conditions and the following disclaimer. 21 * 2. Redistributions in binary form must reproduce the above copyright 22 * notice, this list of conditions and the following disclaimer in the 23 * documentation and/or other materials provided with the distribution. 24 * 25 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 26 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 27 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 28 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 29 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 30 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 31 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 32 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 33 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 34 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 35 * POSSIBILITY OF SUCH DAMAGE. 36 */ 37 38 /* 39 * Copyright (c) 2006 Mathieu Ropert <mro (at) adviseo.fr> 40 * 41 * Permission to use, copy, modify, and distribute this software for any 42 * purpose with or without fee is hereby granted, provided that the above 43 * copyright notice and this permission notice appear in all copies. 44 * 45 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 46 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 47 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR 48 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 49 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 50 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 51 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 52 */ 53 54 /* 55 * Copyright (c) 2007 Manuel Bouyer. 56 * 57 * Redistribution and use in source and binary forms, with or without 58 * modification, are permitted provided that the following conditions 59 * are met: 60 * 1. Redistributions of source code must retain the above copyright 61 * notice, this list of conditions and the following disclaimer. 62 * 2. Redistributions in binary form must reproduce the above copyright 63 * notice, this list of conditions and the following disclaimer in the 64 * documentation and/or other materials provided with the distribution. 
65 * 66 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 67 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 68 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 69 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 70 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 71 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 72 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 73 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 74 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 75 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 76 */ 77 78 /* 79 * Copyright (c) 1982, 1987, 1990 The Regents of the University of California. 80 * All rights reserved. 81 * 82 * This code is derived from software contributed to Berkeley by 83 * William Jolitz. 84 * 85 * Redistribution and use in source and binary forms, with or without 86 * modification, are permitted provided that the following conditions 87 * are met: 88 * 1. Redistributions of source code must retain the above copyright 89 * notice, this list of conditions and the following disclaimer. 90 * 2. Redistributions in binary form must reproduce the above copyright 91 * notice, this list of conditions and the following disclaimer in the 92 * documentation and/or other materials provided with the distribution. 93 * 3. Neither the name of the University nor the names of its contributors 94 * may be used to endorse or promote products derived from this software 95 * without specific prior written permission. 96 * 97 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 98 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 99 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 100 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 101 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 102 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 103 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 104 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 105 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 106 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 107 * SUCH DAMAGE. 
108 * 109 * @(#)machdep.c 7.4 (Berkeley) 6/3/91 110 */ 111 112 #include <sys/cdefs.h> 113 __KERNEL_RCSID(0, "$NetBSD: machdep.c,v 1.376 2025/04/30 15:30:53 imil Exp $"); 114 115 #include "opt_modular.h" 116 #include "opt_user_ldt.h" 117 #include "opt_ddb.h" 118 #include "opt_kgdb.h" 119 #include "opt_cpureset_delay.h" 120 #include "opt_mtrr.h" 121 #include "opt_realmem.h" 122 #include "opt_xen.h" 123 #include "opt_svs.h" 124 #include "opt_kaslr.h" 125 #ifndef XENPV 126 #include "opt_physmem.h" 127 #endif 128 #include "isa.h" 129 #include "pci.h" 130 131 #include <sys/param.h> 132 #include <sys/systm.h> 133 #include <sys/signal.h> 134 #include <sys/signalvar.h> 135 #include <sys/kernel.h> 136 #include <sys/cpu.h> 137 #include <sys/exec.h> 138 #include <sys/exec_aout.h> /* for MID_* */ 139 #include <sys/reboot.h> 140 #include <sys/conf.h> 141 #include <sys/msgbuf.h> 142 #include <sys/mount.h> 143 #include <sys/core.h> 144 #include <sys/kcore.h> 145 #include <sys/ucontext.h> 146 #include <machine/kcore.h> 147 #include <sys/ras.h> 148 #include <sys/syscallargs.h> 149 #include <sys/ksyms.h> 150 #include <sys/device.h> 151 #include <sys/lwp.h> 152 #include <sys/proc.h> 153 #include <sys/asan.h> 154 #include <sys/csan.h> 155 #include <sys/msan.h> 156 #include <sys/module.h> 157 #include <sys/timevar.h> 158 159 #ifdef KGDB 160 #include <sys/kgdb.h> 161 #endif 162 163 #include <lib/libkern/entpool.h> /* XXX */ 164 165 #include <dev/cons.h> 166 #include <dev/mm.h> 167 168 #include <uvm/uvm.h> 169 #include <uvm/uvm_page.h> 170 171 #include <sys/sysctl.h> 172 173 #include <machine/cpu.h> 174 #include <machine/cpu_rng.h> 175 #include <machine/cpufunc.h> 176 #include <machine/gdt.h> 177 #include <machine/intr.h> 178 #include <machine/pio.h> 179 #include <machine/psl.h> 180 #include <machine/reg.h> 181 #include <machine/specialreg.h> 182 #include <machine/bootinfo.h> 183 #include <x86/fpu.h> 184 #include <x86/dbregs.h> 185 #include <machine/mtrr.h> 186 #include <machine/mpbiosvar.h> 187 #include <machine/pmap_private.h> 188 189 #include <x86/bootspace.h> 190 #include <x86/cputypes.h> 191 #include <x86/cpuvar.h> 192 #include <x86/machdep.h> 193 #include <x86/x86/tsc.h> 194 195 #include <dev/isa/isareg.h> 196 #include <machine/isa_machdep.h> 197 #include <dev/ic/i8042reg.h> 198 199 #ifdef XEN 200 #include <xen/xen.h> 201 #include <xen/hypervisor.h> 202 #include <xen/evtchn.h> 203 #include <xen/include/public/version.h> 204 #include <xen/include/public/vcpu.h> 205 #endif /* XEN */ 206 207 #include <ddb/db_active.h> 208 209 #ifdef DDB 210 #include <machine/db_machdep.h> 211 #include <ddb/db_extern.h> 212 #include <ddb/db_output.h> 213 #include <ddb/db_interface.h> 214 #endif 215 216 #include "acpica.h" 217 218 #if NACPICA > 0 219 #include <dev/acpi/acpivar.h> 220 #define ACPI_MACHDEP_PRIVATE 221 #include <machine/acpi_machdep.h> 222 #else 223 #include <machine/i82489var.h> 224 #endif 225 226 #include "isa.h" 227 #include "isadma.h" 228 #include "ksyms.h" 229 230 /* the following is used externally (sysctl_hw) */ 231 char machine[] = "amd64"; /* CPU "architecture" */ 232 char machine_arch[] = "x86_64"; /* machine == machine_arch */ 233 234 #ifdef CPURESET_DELAY 235 int cpureset_delay = CPURESET_DELAY; 236 #else 237 int cpureset_delay = 2000; /* default to 2s */ 238 #endif 239 240 int cpu_class = CPUCLASS_686; 241 242 #ifdef MTRR 243 const struct mtrr_funcs *mtrr_funcs; 244 #endif 245 246 int cpu_class; 247 int use_pae; 248 249 #ifndef NO_SPARSE_DUMP 250 int sparse_dump = 1; 251 252 paddr_t max_paddr = 0; 253 
unsigned char *sparse_dump_physmap; 254 #endif 255 256 char *dump_headerbuf, *dump_headerbuf_ptr; 257 #define dump_headerbuf_size PAGE_SIZE 258 #define dump_headerbuf_end (dump_headerbuf + dump_headerbuf_size) 259 #define dump_headerbuf_avail (dump_headerbuf_end - dump_headerbuf_ptr) 260 daddr_t dump_header_blkno; 261 262 size_t dump_nmemsegs; 263 size_t dump_npages; 264 size_t dump_header_size; 265 size_t dump_totalbytesleft; 266 267 vaddr_t idt_vaddr; 268 paddr_t idt_paddr; 269 vaddr_t gdt_vaddr; 270 paddr_t gdt_paddr; 271 vaddr_t ldt_vaddr; 272 paddr_t ldt_paddr; 273 274 static struct vm_map module_map_store; 275 extern struct bootspace bootspace; 276 extern struct slotspace slotspace; 277 278 vaddr_t vm_min_kernel_address __read_mostly = VM_MIN_KERNEL_ADDRESS_DEFAULT; 279 vaddr_t vm_max_kernel_address __read_mostly = VM_MAX_KERNEL_ADDRESS_DEFAULT; 280 pd_entry_t *pte_base __read_mostly; 281 282 struct vm_map *phys_map = NULL; 283 284 extern paddr_t lowmem_rsvd; 285 extern paddr_t avail_start, avail_end; 286 #ifdef XENPV 287 extern paddr_t pmap_pa_start, pmap_pa_end; 288 #endif 289 290 struct nmistore { 291 uint64_t cr3; 292 uint64_t scratch; 293 } __packed; 294 295 /* 296 * Size of memory segments, before any memory is stolen. 297 */ 298 phys_ram_seg_t mem_clusters[VM_PHYSSEG_MAX]; 299 int mem_cluster_cnt; 300 301 int cpu_dump(void); 302 int cpu_dumpsize(void); 303 u_long cpu_dump_mempagecnt(void); 304 void dodumpsys(void); 305 void dumpsys(void); 306 307 static void x86_64_proc0_pcb_ldt_init(void); 308 309 void dump_misc_init(void); 310 void dump_seg_prep(void); 311 int dump_seg_iter(int (*)(paddr_t, paddr_t)); 312 313 #ifndef NO_SPARSE_DUMP 314 void sparse_dump_reset(void); 315 void sparse_dump_mark(void); 316 void cpu_dump_prep_sparse(void); 317 #endif 318 319 void dump_header_start(void); 320 int dump_header_flush(void); 321 int dump_header_addbytes(const void*, size_t); 322 int dump_header_addseg(paddr_t, paddr_t); 323 int dump_header_finish(void); 324 325 int dump_seg_count_range(paddr_t, paddr_t); 326 int dumpsys_seg(paddr_t, paddr_t); 327 328 void init_bootspace(void); 329 void init_slotspace(void); 330 void init_x86_64(paddr_t); 331 332 /* 333 * Machine-dependent startup code 334 */ 335 void 336 cpu_startup(void) 337 { 338 int x, y; 339 vaddr_t minaddr, maxaddr; 340 psize_t sz; 341 342 /* 343 * For console drivers that require uvm and pmap to be initialized, 344 * we'll give them one more chance here... 345 */ 346 consinit(); 347 348 /* 349 * Initialize error message buffer (at end of core). 350 */ 351 if (msgbuf_p_cnt == 0) 352 panic("msgbuf paddr map has not been set up"); 353 for (x = 0, sz = 0; x < msgbuf_p_cnt; sz += msgbuf_p_seg[x++].sz) 354 continue; 355 356 msgbuf_vaddr = uvm_km_alloc(kernel_map, sz, 0, UVM_KMF_VAONLY); 357 if (msgbuf_vaddr == 0) 358 panic("failed to valloc msgbuf_vaddr"); 359 360 for (y = 0, sz = 0; y < msgbuf_p_cnt; y++) { 361 for (x = 0; x < btoc(msgbuf_p_seg[y].sz); x++, sz += PAGE_SIZE) 362 pmap_kenter_pa((vaddr_t)msgbuf_vaddr + sz, 363 msgbuf_p_seg[y].paddr + x * PAGE_SIZE, 364 VM_PROT_READ|VM_PROT_WRITE, 0); 365 } 366 367 pmap_update(pmap_kernel()); 368 369 initmsgbuf((void *)msgbuf_vaddr, round_page(sz)); 370 371 minaddr = 0; 372 373 /* 374 * Allocate a submap for physio. 375 */ 376 phys_map = uvm_km_suballoc(kernel_map, &minaddr, &maxaddr, 377 VM_PHYS_SIZE, 0, false, NULL); 378 379 /* 380 * Create the module map. 381 * 382 * The kernel uses RIP-relative addressing with a maximum offset of 383 * 2GB. 
Because of that, we can't put the kernel modules in kernel_map 384 * (like i386 does), since kernel_map is too far away in memory from 385 * the kernel sections. So we have to create a special module_map. 386 * 387 * The module map is taken as what is left of the bootstrap memory 388 * created in locore/prekern. 389 */ 390 uvm_map_setup(&module_map_store, bootspace.smodule, 391 bootspace.emodule, 0); 392 module_map_store.pmap = pmap_kernel(); 393 module_map = &module_map_store; 394 395 /* Say hello. */ 396 banner(); 397 398 #if NISA > 0 || NPCI > 0 399 /* Safe for i/o port / memory space allocation to use malloc now. */ 400 x86_bus_space_mallocok(); 401 #endif 402 403 #ifdef __HAVE_PCPU_AREA 404 cpu_pcpuarea_init(&cpu_info_primary); 405 #endif 406 gdt_init(); 407 x86_64_proc0_pcb_ldt_init(); 408 409 cpu_init_tss(&cpu_info_primary); 410 #if !defined(XENPV) 411 ltr(cpu_info_primary.ci_tss_sel); 412 #endif 413 414 x86_startup(); 415 } 416 417 #ifdef XENPV 418 /* used in assembly */ 419 void hypervisor_callback(void); 420 void failsafe_callback(void); 421 void x86_64_switch_context(struct pcb *); 422 void x86_64_tls_switch(struct lwp *); 423 424 void 425 x86_64_switch_context(struct pcb *new) 426 { 427 HYPERVISOR_stack_switch(GSEL(GDATA_SEL, SEL_KPL), new->pcb_rsp0); 428 struct physdev_set_iopl set_iopl; 429 set_iopl.iopl = new->pcb_iopl; 430 HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl); 431 } 432 433 void 434 x86_64_tls_switch(struct lwp *l) 435 { 436 struct cpu_info *ci = curcpu(); 437 struct pcb *pcb = lwp_getpcb(l); 438 struct trapframe *tf = l->l_md.md_regs; 439 uint64_t zero = 0; 440 441 /* 442 * Raise the IPL to IPL_HIGH. XXX Still needed? 443 */ 444 (void)splhigh(); 445 446 /* Update segment registers */ 447 if (pcb->pcb_flags & PCB_COMPAT32) { 448 update_descriptor(&ci->ci_gdt[GUFS_SEL], &pcb->pcb_fs); 449 update_descriptor(&ci->ci_gdt[GUGS_SEL], &pcb->pcb_gs); 450 setds(GSEL(GUDATA32_SEL, SEL_UPL)); 451 setes(GSEL(GUDATA32_SEL, SEL_UPL)); 452 setfs(GSEL(GUDATA32_SEL, SEL_UPL)); 453 HYPERVISOR_set_segment_base(SEGBASE_GS_USER_SEL, tf->tf_gs); 454 } else { 455 update_descriptor(&ci->ci_gdt[GUFS_SEL], &zero); 456 update_descriptor(&ci->ci_gdt[GUGS_SEL], &zero); 457 setds(GSEL(GUDATA_SEL, SEL_UPL)); 458 setes(GSEL(GUDATA_SEL, SEL_UPL)); 459 setfs(0); 460 HYPERVISOR_set_segment_base(SEGBASE_GS_USER_SEL, 0); 461 HYPERVISOR_set_segment_base(SEGBASE_FS, pcb->pcb_fs); 462 HYPERVISOR_set_segment_base(SEGBASE_GS_USER, pcb->pcb_gs); 463 } 464 } 465 #endif /* XENPV */ 466 467 /* 468 * Set up proc0's PCB and LDT. 469 */ 470 static void 471 x86_64_proc0_pcb_ldt_init(void) 472 { 473 struct lwp *l = &lwp0; 474 struct pcb *pcb = lwp_getpcb(l); 475 476 pcb->pcb_flags = 0; 477 pcb->pcb_fs = 0; 478 pcb->pcb_gs = 0; 479 pcb->pcb_rsp0 = (uvm_lwp_getuarea(l) + USPACE - 16) & ~0xf; 480 pcb->pcb_iopl = IOPL_KPL; 481 pcb->pcb_dbregs = NULL; 482 pcb->pcb_cr0 = rcr0() & ~CR0_TS; 483 l->l_md.md_regs = (struct trapframe *)pcb->pcb_rsp0 - 1; 484 485 #if !defined(XENPV) 486 lldt(GSYSSEL(GLDT_SEL, SEL_KPL)); 487 #else 488 xen_set_ldt((vaddr_t)ldtstore, LDT_SIZE >> 3); 489 /* Reset TS bit and set kernel stack for interrupt handlers */ 490 HYPERVISOR_fpu_taskswitch(1); 491 HYPERVISOR_stack_switch(GSEL(GDATA_SEL, SEL_KPL), pcb->pcb_rsp0); 492 struct physdev_set_iopl set_iopl; 493 set_iopl.iopl = pcb->pcb_iopl; 494 HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl); 495 #endif 496 } 497 498 /* 499 * Set up TSS and I/O bitmap. 
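 *
 * Besides the TSS itself this sets up four per-CPU IST stacks:
 * ist[0] for DDB, ist[1] for double faults, ist[2] for NMIs (with a
 * struct nmistore holding the kernel %cr3 at the top), and ist[3]
 * for DB exceptions.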
500 */ 501 void 502 cpu_init_tss(struct cpu_info *ci) 503 { 504 #ifdef __HAVE_PCPU_AREA 505 const cpuid_t cid = cpu_index(ci); 506 #endif 507 struct cpu_tss *cputss; 508 struct nmistore *store; 509 uintptr_t p; 510 511 #ifdef __HAVE_PCPU_AREA 512 cputss = (struct cpu_tss *)&pcpuarea->ent[cid].tss; 513 #else 514 cputss = (struct cpu_tss *)uvm_km_alloc(kernel_map, 515 sizeof(struct cpu_tss), 0, UVM_KMF_WIRED|UVM_KMF_ZERO); 516 #endif 517 518 cputss->tss.tss_iobase = IOMAP_INVALOFF << 16; 519 520 /* DDB stack */ 521 #ifdef __HAVE_PCPU_AREA 522 p = (vaddr_t)&pcpuarea->ent[cid].ist0; 523 #else 524 p = uvm_km_alloc(kernel_map, PAGE_SIZE, 0, UVM_KMF_WIRED|UVM_KMF_ZERO); 525 #endif 526 cputss->tss.tss_ist[0] = p + PAGE_SIZE - 16; 527 528 /* double fault */ 529 #ifdef __HAVE_PCPU_AREA 530 p = (vaddr_t)&pcpuarea->ent[cid].ist1; 531 #else 532 p = uvm_km_alloc(kernel_map, PAGE_SIZE, 0, UVM_KMF_WIRED|UVM_KMF_ZERO); 533 #endif 534 cputss->tss.tss_ist[1] = p + PAGE_SIZE - 16; 535 536 /* NMI - store a structure at the top of the stack */ 537 #ifdef __HAVE_PCPU_AREA 538 p = (vaddr_t)&pcpuarea->ent[cid].ist2; 539 #else 540 p = uvm_km_alloc(kernel_map, PAGE_SIZE, 0, UVM_KMF_WIRED|UVM_KMF_ZERO); 541 #endif 542 cputss->tss.tss_ist[2] = p + PAGE_SIZE - sizeof(struct nmistore); 543 store = (struct nmistore *)(p + PAGE_SIZE - sizeof(struct nmistore)); 544 store->cr3 = pmap_pdirpa(pmap_kernel(), 0); 545 546 /* DB */ 547 #ifdef __HAVE_PCPU_AREA 548 p = (vaddr_t)&pcpuarea->ent[cid].ist3; 549 #else 550 p = uvm_km_alloc(kernel_map, PAGE_SIZE, 0, UVM_KMF_WIRED|UVM_KMF_ZERO); 551 #endif 552 cputss->tss.tss_ist[3] = p + PAGE_SIZE - 16; 553 554 ci->ci_tss = cputss; 555 ci->ci_tss_sel = tss_alloc(&cputss->tss); 556 } 557 558 void 559 buildcontext(struct lwp *l, void *catcher, void *f) 560 { 561 struct trapframe *tf = l->l_md.md_regs; 562 563 tf->tf_ds = GSEL(GUDATA_SEL, SEL_UPL); 564 tf->tf_es = GSEL(GUDATA_SEL, SEL_UPL); 565 tf->tf_fs = GSEL(GUDATA_SEL, SEL_UPL); 566 tf->tf_gs = GSEL(GUDATA_SEL, SEL_UPL); 567 568 tf->tf_rip = (uint64_t)catcher; 569 tf->tf_cs = GSEL(GUCODE_SEL, SEL_UPL); 570 tf->tf_rflags &= ~PSL_CLEARSIG; 571 tf->tf_rsp = (uint64_t)f; 572 tf->tf_ss = GSEL(GUDATA_SEL, SEL_UPL); 573 574 /* Ensure FP state is sane */ 575 fpu_sigreset(l); 576 } 577 578 void 579 sendsig_sigcontext(const ksiginfo_t *ksi, const sigset_t *mask) 580 { 581 582 printf("sendsig_sigcontext: illegal\n"); 583 sigexit(curlwp, SIGILL); 584 } 585 586 void 587 sendsig_siginfo(const ksiginfo_t *ksi, const sigset_t *mask) 588 { 589 struct lwp *l = curlwp; 590 struct proc *p = l->l_proc; 591 struct sigacts *ps = p->p_sigacts; 592 int onstack, error; 593 int sig = ksi->ksi_signo; 594 struct sigframe_siginfo *fp, frame; 595 sig_t catcher = SIGACTION(p, sig).sa_handler; 596 struct trapframe *tf = l->l_md.md_regs; 597 char *sp; 598 599 KASSERT(mutex_owned(p->p_lock)); 600 601 /* Do we need to jump onto the signal stack? */ 602 onstack = 603 (l->l_sigstk.ss_flags & (SS_DISABLE | SS_ONSTACK)) == 0 && 604 (SIGACTION(p, sig).sa_flags & SA_ONSTACK) != 0; 605 606 /* Allocate space for the signal handler context. */ 607 if (onstack) 608 sp = ((char *)l->l_sigstk.ss_sp + l->l_sigstk.ss_size); 609 else 610 /* AMD64 ABI 128-bytes "red zone". */ 611 sp = (char *)tf->tf_rsp - 128; 612 613 sp -= sizeof(struct sigframe_siginfo); 614 /* Round down the stackpointer to a multiple of 16 for the ABI. 
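	 * The extra -8 leaves the frame, and thus %rsp on entry to the
	 * handler, at 8 modulo 16: exactly what a call instruction would
	 * produce after pushing the return address (here sf_ra), which is
	 * the alignment the ABI guarantees at function entry.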
*/ 615 fp = (struct sigframe_siginfo *)(((unsigned long)sp & 616 ~STACK_ALIGNBYTES) - 8); 617 618 memset(&frame, 0, sizeof(frame)); 619 frame.sf_ra = (uint64_t)ps->sa_sigdesc[sig].sd_tramp; 620 frame.sf_si._info = ksi->ksi_info; 621 frame.sf_uc.uc_flags = _UC_SIGMASK; 622 frame.sf_uc.uc_sigmask = *mask; 623 frame.sf_uc.uc_link = l->l_ctxlink; 624 frame.sf_uc.uc_flags |= (l->l_sigstk.ss_flags & SS_ONSTACK) 625 ? _UC_SETSTACK : _UC_CLRSTACK; 626 sendsig_reset(l, sig); 627 628 mutex_exit(p->p_lock); 629 cpu_getmcontext(l, &frame.sf_uc.uc_mcontext, &frame.sf_uc.uc_flags); 630 /* Copyout all the fp regs, the signal handler might expect them. */ 631 error = copyout(&frame, fp, sizeof frame); 632 mutex_enter(p->p_lock); 633 634 if (error != 0) { 635 /* 636 * Process has trashed its stack; give it an illegal 637 * instruction to halt it in its tracks. 638 */ 639 sigexit(l, SIGILL); 640 /* NOTREACHED */ 641 } 642 643 buildcontext(l, catcher, fp); 644 645 tf->tf_rdi = sig; 646 tf->tf_rsi = (uint64_t)&fp->sf_si; 647 tf->tf_rdx = tf->tf_r15 = (uint64_t)&fp->sf_uc; 648 649 /* Remember that we're now on the signal stack. */ 650 if (onstack) 651 l->l_sigstk.ss_flags |= SS_ONSTACK; 652 653 if ((vaddr_t)catcher >= VM_MAXUSER_ADDRESS) { 654 /* 655 * process has given an invalid address for the 656 * handler. Stop it, but do not do it before so 657 * we can return the right info to userland (or in core dump) 658 */ 659 sigexit(l, SIGILL); 660 /* NOTREACHED */ 661 } 662 } 663 664 struct pcb dumppcb; 665 666 void 667 cpu_reboot(int howto, char *bootstr) 668 { 669 static bool syncdone = false; 670 int s = IPL_NONE; 671 __USE(s); /* ugly otherwise */ 672 673 if (cold) { 674 howto |= RB_HALT; 675 goto haltsys; 676 } 677 678 boothowto = howto; 679 680 /* i386 maybe_dump() */ 681 682 /* 683 * If we've panic'd, don't make the situation potentially 684 * worse by syncing or unmounting the file systems. 685 */ 686 if ((howto & RB_NOSYNC) == 0 && panicstr == NULL) { 687 if (!syncdone) { 688 syncdone = true; 689 /* XXX used to force unmount as well, here */ 690 vfs_sync_all(curlwp); 691 } 692 693 while (vfs_unmountall1(curlwp, false, false) || 694 config_detach_all(boothowto) || 695 vfs_unmount_forceone(curlwp)) 696 ; /* do nothing */ 697 } else { 698 if (!db_active) 699 suspendsched(); 700 } 701 702 pmf_system_shutdown(boothowto); 703 704 /* Disable interrupts. */ 705 s = splhigh(); 706 707 /* Do a dump if requested. 
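	 * (only when RB_DUMP is set and RB_HALT is not)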
	 */
	if ((howto & (RB_DUMP | RB_HALT)) == RB_DUMP)
		dumpsys();

haltsys:
	doshutdownhooks();

	if ((howto & RB_POWERDOWN) == RB_POWERDOWN) {
#if NACPICA > 0
		if (s != IPL_NONE)
			splx(s);

		acpi_enter_sleep_state(ACPI_STATE_S5);
#endif
#ifdef XEN
		if (vm_guest == VM_GUEST_XENPV ||
		    vm_guest == VM_GUEST_XENPVH ||
		    vm_guest == VM_GUEST_XENPVHVM)
			HYPERVISOR_shutdown();
#endif /* XEN */
	}

	cpu_broadcast_halt();

	if (howto & RB_HALT) {
#if NACPICA > 0
		acpi_disable();
#endif

		printf("\n");
		printf("The operating system has halted.\n");
		printf("Please press any key to reboot.\n\n");
		cnpollc(1);	/* for proper keyboard command handling */
		if (cngetc() == 0) {
			/* no console attached, so just hlt */
			printf("No keyboard - cannot reboot after all.\n");
			for(;;) {
				x86_hlt();
			}
		}
		cnpollc(0);
	}

	printf("rebooting...\n");
	if (cpureset_delay > 0)
		delay(cpureset_delay * 1000);
	cpu_reset();
	for(;;) ;
	/*NOTREACHED*/
}

/*
 * XXXfvdl share dumpcode.
 */

/*
 * Perform assorted dump-related initialization tasks. Assumes that
 * the maximum physical memory address will not increase afterwards.
 */
void
dump_misc_init(void)
{
#ifndef NO_SPARSE_DUMP
	int i;
#endif

	if (dump_headerbuf != NULL)
		return; /* already called */

#ifndef NO_SPARSE_DUMP
	for (i = 0; i < mem_cluster_cnt; ++i) {
		paddr_t top = mem_clusters[i].start + mem_clusters[i].size;
		if (max_paddr < top)
			max_paddr = top;
	}
#ifdef DEBUG
	printf("dump_misc_init: max_paddr = 0x%lx\n",
	    (unsigned long)max_paddr);
#endif
	if (max_paddr == 0) {
		printf("Your machine does not initialize mem_clusters; "
		    "sparse_dumps disabled\n");
		sparse_dump = 0;
	} else {
		sparse_dump_physmap = (void *)uvm_km_alloc(kernel_map,
		    roundup(max_paddr / (PAGE_SIZE * NBBY), PAGE_SIZE),
		    PAGE_SIZE, UVM_KMF_WIRED|UVM_KMF_ZERO);
	}
#endif
	dump_headerbuf = (void *)uvm_km_alloc(kernel_map,
	    dump_headerbuf_size,
	    PAGE_SIZE, UVM_KMF_WIRED|UVM_KMF_ZERO);
	/* XXXjld should check for failure here, disable dumps if so. */
}

#ifndef NO_SPARSE_DUMP
/*
 * Clear the set of pages to include in a sparse dump.
 */
void
sparse_dump_reset(void)
{
	memset(sparse_dump_physmap, 0,
	    roundup(max_paddr / (PAGE_SIZE * NBBY), PAGE_SIZE));
}

/*
 * Include or exclude pages in a sparse dump.
 */
void
sparse_dump_mark(void)
{
	paddr_t p, pstart, pend;
	struct vm_page *pg;
	int i;
	uvm_physseg_t upm;

	/*
	 * Mark all memory pages, then unmark pages that are uninteresting.
	 * Dereferencing pg->uobject might crash again if another CPU
	 * frees the object out from under us, but we can't lock anything
	 * so it's a risk we have to take.
829 */ 830 831 for (i = 0; i < mem_cluster_cnt; ++i) { 832 pstart = mem_clusters[i].start / PAGE_SIZE; 833 pend = pstart + mem_clusters[i].size / PAGE_SIZE; 834 835 for (p = pstart; p < pend; p++) { 836 setbit(sparse_dump_physmap, p); 837 } 838 } 839 for (upm = uvm_physseg_get_first(); 840 uvm_physseg_valid_p(upm); 841 upm = uvm_physseg_get_next(upm)) { 842 paddr_t pfn; 843 844 /* 845 * We assume that seg->start to seg->end are 846 * uvm_page_physload()ed 847 */ 848 for (pfn = uvm_physseg_get_start(upm); 849 pfn < uvm_physseg_get_end(upm); 850 pfn++) { 851 pg = PHYS_TO_VM_PAGE(ptoa(pfn)); 852 853 if (pg->uanon || (pg->flags & PG_FREE) || 854 (pg->uobject && pg->uobject->pgops)) { 855 p = VM_PAGE_TO_PHYS(pg) / PAGE_SIZE; 856 clrbit(sparse_dump_physmap, p); 857 } 858 } 859 } 860 } 861 862 /* 863 * Machine-dependently decides on the contents of a sparse dump, using 864 * the above. 865 */ 866 void 867 cpu_dump_prep_sparse(void) 868 { 869 sparse_dump_reset(); 870 /* XXX could the alternate recursive page table be skipped? */ 871 sparse_dump_mark(); 872 /* Memory for I/O buffers could be unmarked here, for example. */ 873 /* The kernel text could also be unmarked, but gdb would be upset. */ 874 } 875 #endif 876 877 /* 878 * Abstractly iterate over the collection of memory segments to be 879 * dumped; the callback lacks the customary environment-pointer 880 * argument because none of the current users really need one. 881 * 882 * To be used only after dump_seg_prep is called to set things up. 883 */ 884 int 885 dump_seg_iter(int (*callback)(paddr_t, paddr_t)) 886 { 887 int error, i; 888 889 #define CALLBACK(start,size) do { \ 890 error = callback(start,size); \ 891 if (error) \ 892 return error; \ 893 } while(0) 894 895 for (i = 0; i < mem_cluster_cnt; ++i) { 896 #ifndef NO_SPARSE_DUMP 897 /* 898 * The bitmap is scanned within each memory segment, 899 * rather than over its entire domain, in case any 900 * pages outside of the memory proper have been mapped 901 * into kva; they might be devices that wouldn't 902 * appreciate being arbitrarily read, and including 903 * them could also break the assumption that a sparse 904 * dump will always be smaller than a full one. 905 */ 906 if (sparse_dump && sparse_dump_physmap) { 907 paddr_t p, sp_start, sp_end; 908 int lastset; 909 910 sp_start = mem_clusters[i].start; 911 sp_end = sp_start + mem_clusters[i].size; 912 sp_start = rounddown(sp_start, PAGE_SIZE); /* unnecessary? */ 913 lastset = 0; 914 for (p = sp_start; p < sp_end; p += PAGE_SIZE) { 915 int thisset = isset(sparse_dump_physmap, 916 p/PAGE_SIZE); 917 918 if (!lastset && thisset) 919 sp_start = p; 920 if (lastset && !thisset) 921 CALLBACK(sp_start, p - sp_start); 922 lastset = thisset; 923 } 924 if (lastset) 925 CALLBACK(sp_start, p - sp_start); 926 } else 927 #endif 928 CALLBACK(mem_clusters[i].start, mem_clusters[i].size); 929 } 930 return 0; 931 #undef CALLBACK 932 } 933 934 /* 935 * Prepare for an impending core dump: decide what's being dumped and 936 * how much space it will take up. 
937 */ 938 void 939 dump_seg_prep(void) 940 { 941 #ifndef NO_SPARSE_DUMP 942 if (sparse_dump && sparse_dump_physmap) 943 cpu_dump_prep_sparse(); 944 #endif 945 946 dump_nmemsegs = 0; 947 dump_npages = 0; 948 dump_seg_iter(dump_seg_count_range); 949 950 dump_header_size = ALIGN(sizeof(kcore_seg_t)) + 951 ALIGN(sizeof(cpu_kcore_hdr_t)) + 952 ALIGN(dump_nmemsegs * sizeof(phys_ram_seg_t)); 953 dump_header_size = roundup(dump_header_size, dbtob(1)); 954 955 /* 956 * savecore(8) will read this to decide how many pages to 957 * copy, and cpu_dumpconf has already used the pessimistic 958 * value to set dumplo, so it's time to tell the truth. 959 */ 960 dumpsize = dump_npages; /* XXX could these just be one variable? */ 961 } 962 963 int 964 dump_seg_count_range(paddr_t start, paddr_t size) 965 { 966 ++dump_nmemsegs; 967 dump_npages += size / PAGE_SIZE; 968 return 0; 969 } 970 971 /* 972 * A sparse dump's header may be rather large, due to the number of 973 * "segments" emitted. These routines manage a simple output buffer, 974 * so that the header can be written to disk incrementally. 975 */ 976 void 977 dump_header_start(void) 978 { 979 dump_headerbuf_ptr = dump_headerbuf; 980 dump_header_blkno = dumplo; 981 } 982 983 int 984 dump_header_flush(void) 985 { 986 const struct bdevsw *bdev; 987 size_t to_write; 988 int error; 989 990 bdev = bdevsw_lookup(dumpdev); 991 to_write = roundup(dump_headerbuf_ptr - dump_headerbuf, dbtob(1)); 992 error = bdev->d_dump(dumpdev, dump_header_blkno, 993 dump_headerbuf, to_write); 994 dump_header_blkno += btodb(to_write); 995 dump_headerbuf_ptr = dump_headerbuf; 996 return error; 997 } 998 999 int 1000 dump_header_addbytes(const void* vptr, size_t n) 1001 { 1002 const char* ptr = vptr; 1003 int error; 1004 1005 while (n > dump_headerbuf_avail) { 1006 memcpy(dump_headerbuf_ptr, ptr, dump_headerbuf_avail); 1007 ptr += dump_headerbuf_avail; 1008 n -= dump_headerbuf_avail; 1009 dump_headerbuf_ptr = dump_headerbuf_end; 1010 error = dump_header_flush(); 1011 if (error) 1012 return error; 1013 } 1014 memcpy(dump_headerbuf_ptr, ptr, n); 1015 dump_headerbuf_ptr += n; 1016 1017 return 0; 1018 } 1019 1020 int 1021 dump_header_addseg(paddr_t start, paddr_t size) 1022 { 1023 phys_ram_seg_t seg = { start, size }; 1024 int error; 1025 1026 error = dump_header_addbytes(&seg, sizeof(seg)); 1027 if (error) { 1028 printf("[seg 0x%"PRIxPADDR" bytes 0x%"PRIxPSIZE" failed," 1029 " error=%d] ", start, size, error); 1030 } 1031 return error; 1032 } 1033 1034 int 1035 dump_header_finish(void) 1036 { 1037 int error; 1038 1039 memset(dump_headerbuf_ptr, 0, dump_headerbuf_avail); 1040 error = dump_header_flush(); 1041 if (error) 1042 printf("[finish failed, error=%d] ", error); 1043 return error; 1044 } 1045 1046 1047 /* 1048 * These variables are needed by /sbin/savecore 1049 */ 1050 uint32_t dumpmag = 0x8fca0101; /* magic number */ 1051 int dumpsize = 0; /* pages */ 1052 long dumplo = 0; /* blocks */ 1053 1054 /* 1055 * cpu_dumpsize: calculate size of machine-dependent kernel core dump headers 1056 * for a full (non-sparse) dump. 1057 */ 1058 int 1059 cpu_dumpsize(void) 1060 { 1061 int size; 1062 1063 size = ALIGN(sizeof(kcore_seg_t)) + ALIGN(sizeof(cpu_kcore_hdr_t)) + 1064 ALIGN(mem_cluster_cnt * sizeof(phys_ram_seg_t)); 1065 if (roundup(size, dbtob(1)) != dbtob(1)) 1066 return (-1); 1067 1068 return (1); 1069 } 1070 1071 /* 1072 * cpu_dump_mempagecnt: calculate the size of RAM (in pages) to be dumped 1073 * for a full (non-sparse) dump. 
1074 */ 1075 u_long 1076 cpu_dump_mempagecnt(void) 1077 { 1078 u_long i, n; 1079 1080 n = 0; 1081 for (i = 0; i < mem_cluster_cnt; i++) 1082 n += atop(mem_clusters[i].size); 1083 return (n); 1084 } 1085 1086 /* 1087 * cpu_dump: dump the machine-dependent kernel core dump headers. 1088 */ 1089 int 1090 cpu_dump(void) 1091 { 1092 kcore_seg_t seg; 1093 cpu_kcore_hdr_t cpuhdr; 1094 const struct bdevsw *bdev; 1095 int error; 1096 1097 bdev = bdevsw_lookup(dumpdev); 1098 if (bdev == NULL) { 1099 printf("[device 0x%llx ENXIO] ", (unsigned long long)dumpdev); 1100 return ENXIO; 1101 } 1102 1103 /* 1104 * Generate a segment header. 1105 */ 1106 CORE_SETMAGIC(seg, KCORE_MAGIC, MID_MACHINE, CORE_CPU); 1107 seg.c_size = dump_header_size - ALIGN(sizeof(seg)); 1108 error = dump_header_addbytes(&seg, ALIGN(sizeof(seg))); 1109 if (error) { 1110 printf("[segment header %zu bytes failed, error=%d] ", 1111 ALIGN(sizeof(seg)), error); 1112 /* blithely proceed (can't fail?) */ 1113 } 1114 1115 /* 1116 * Add the machine-dependent header info. 1117 */ 1118 cpuhdr.ptdpaddr = PDPpaddr; 1119 cpuhdr.nmemsegs = dump_nmemsegs; 1120 error = dump_header_addbytes(&cpuhdr, ALIGN(sizeof(cpuhdr))); 1121 if (error) { 1122 printf("[MD header %zu bytes failed, error=%d] ", 1123 ALIGN(sizeof(cpuhdr)), error); 1124 /* blithely proceed (can't fail?) */ 1125 } 1126 1127 /* 1128 * Write out the memory segment descriptors. 1129 */ 1130 return dump_seg_iter(dump_header_addseg); 1131 } 1132 1133 /* 1134 * Doadump comes here after turning off memory management and 1135 * getting on the dump stack, either when called above, or by 1136 * the auto-restart code. 1137 */ 1138 #define BYTES_PER_DUMP PAGE_SIZE /* must be a multiple of pagesize XXX small */ 1139 static vaddr_t dumpspace; 1140 1141 vaddr_t 1142 reserve_dumppages(vaddr_t p) 1143 { 1144 1145 dumpspace = p; 1146 return (p + BYTES_PER_DUMP); 1147 } 1148 1149 int 1150 dumpsys_seg(paddr_t maddr, paddr_t bytes) 1151 { 1152 u_long i, m, n; 1153 daddr_t blkno; 1154 const struct bdevsw *bdev; 1155 int (*dump)(dev_t, daddr_t, void *, size_t); 1156 int error; 1157 1158 if (dumpdev == NODEV) 1159 return ENODEV; 1160 bdev = bdevsw_lookup(dumpdev); 1161 if (bdev == NULL || bdev->d_psize == NULL) 1162 return ENODEV; 1163 1164 dump = bdev->d_dump; 1165 1166 blkno = dump_header_blkno; 1167 for (i = 0; i < bytes; i += n, dump_totalbytesleft -= n) { 1168 /* Print out how many MBs we have left to go. */ 1169 if ((dump_totalbytesleft % (1024*1024)) == 0) 1170 printf_nolog("%lu ", (unsigned long) 1171 (dump_totalbytesleft / (1024 * 1024))); 1172 1173 /* Limit size for next transfer. */ 1174 n = bytes - i; 1175 if (n > BYTES_PER_DUMP) 1176 n = BYTES_PER_DUMP; 1177 1178 for (m = 0; m < n; m += NBPG) 1179 pmap_kenter_pa(dumpspace + m, maddr + m, 1180 VM_PROT_READ, 0); 1181 pmap_update(pmap_kernel()); 1182 1183 error = (*dump)(dumpdev, blkno, (void *)dumpspace, n); 1184 pmap_kremove_local(dumpspace, n); 1185 if (error) 1186 return error; 1187 maddr += n; 1188 blkno += btodb(n); /* XXX? */ 1189 1190 #if 0 /* XXX this doesn't work. grr. */ 1191 /* operator aborting dump? 
		 */
		if (sget() != NULL)
			return EINTR;
#endif
	}
	dump_header_blkno = blkno;

	return 0;
}

void
dodumpsys(void)
{
	const struct bdevsw *bdev;
	int dumpend, psize;
	int error;

	if (dumpdev == NODEV)
		return;

	bdev = bdevsw_lookup(dumpdev);
	if (bdev == NULL || bdev->d_psize == NULL)
		return;
	/*
	 * For dumps during autoconfiguration,
	 * if the dump device has already been configured...
	 */
	if (dumpsize == 0)
		cpu_dumpconf();

	printf("\ndumping to dev %llu,%llu (offset=%ld, size=%d):",
	    (unsigned long long)major(dumpdev),
	    (unsigned long long)minor(dumpdev), dumplo, dumpsize);

	if (dumplo <= 0 || dumpsize <= 0) {
		printf(" not possible\n");
		return;
	}

	psize = bdev_size(dumpdev);
	printf("\ndump ");
	if (psize == -1) {
		printf("area unavailable\n");
		return;
	}

#if 0 /* XXX this doesn't work. grr. */
	/* toss any characters present prior to dump */
	while (sget() != NULL); /*syscons and pccons differ */
#endif

	dump_seg_prep();
	dumpend = dumplo + btodb(dump_header_size) + ctod(dump_npages);
	if (dumpend > psize) {
		printf("failed: insufficient space (%d < %d)\n",
		    psize, dumpend);
		goto failed;
	}

	dump_header_start();
	if ((error = cpu_dump()) != 0)
		goto err;
	if ((error = dump_header_finish()) != 0)
		goto err;

	if (dump_header_blkno != dumplo + btodb(dump_header_size)) {
		printf("BAD header size (%ld [written] != %ld [expected])\n",
		    (long)(dump_header_blkno - dumplo),
		    (long)btodb(dump_header_size));
		goto failed;
	}

	dump_totalbytesleft = roundup(ptoa(dump_npages), BYTES_PER_DUMP);
	error = dump_seg_iter(dumpsys_seg);

	if (error == 0 && dump_header_blkno != dumpend) {
		printf("BAD dump size (%ld [written] != %ld [expected])\n",
		    (long)(dumpend - dumplo),
		    (long)(dump_header_blkno - dumplo));
		goto failed;
	}

err:
	switch (error) {

	case ENXIO:
		printf("device bad\n");
		break;

	case EFAULT:
		printf("device not ready\n");
		break;

	case EINVAL:
		printf("area improper\n");
		break;

	case EIO:
		printf("i/o error\n");
		break;

	case EINTR:
		printf("aborted from console\n");
		break;

	case 0:
		printf("succeeded\n");
		break;

	default:
		printf("error %d\n", error);
		break;
	}
failed:
	printf("\n\n");
	delay(5000000);		/* 5 seconds */
}

/*
 * This is called by main to set dumplo and dumpsize.
 * Dumps always skip the first PAGE_SIZE of disk space
 * in case there might be a disk label stored there.
 * If there is extra space, put dump at the end to
 * reduce the chance that swapping trashes it.
 *
 * Sparse dumps can't be placed as close to the end as possible, because
 * savecore(8) has to know where to start reading in the dump device
 * before it has access to any of the crashed system's state.
 *
 * Note also that a sparse dump will never be larger than a full one:
 * in order to add a phys_ram_seg_t to the header, at least one page
 * must be removed.
1323 */ 1324 void 1325 cpu_dumpconf(void) 1326 { 1327 int nblks, dumpblks; /* size of dump area */ 1328 1329 if (dumpdev == NODEV) 1330 goto bad; 1331 nblks = bdev_size(dumpdev); 1332 if (nblks <= ctod(1)) 1333 goto bad; 1334 1335 dumpblks = cpu_dumpsize(); 1336 if (dumpblks < 0) 1337 goto bad; 1338 1339 /* dumpsize is in page units, and doesn't include headers. */ 1340 dumpsize = cpu_dump_mempagecnt(); 1341 1342 dumpblks += ctod(dumpsize); 1343 1344 /* If dump won't fit (incl. room for possible label), punt. */ 1345 if (dumpblks > (nblks - ctod(1))) { 1346 #ifndef NO_SPARSE_DUMP 1347 /* A sparse dump might (and hopefully will) fit. */ 1348 dumplo = ctod(1); 1349 #else 1350 /* But if we're not configured for that, punt. */ 1351 goto bad; 1352 #endif 1353 } else { 1354 /* Put dump at end of partition */ 1355 dumplo = nblks - dumpblks; 1356 } 1357 1358 1359 /* Now that we've decided this will work, init ancillary stuff. */ 1360 dump_misc_init(); 1361 return; 1362 1363 bad: 1364 dumpsize = 0; 1365 } 1366 1367 /* 1368 * Clear registers on exec 1369 */ 1370 void 1371 setregs(struct lwp *l, struct exec_package *pack, vaddr_t stack) 1372 { 1373 struct pcb *pcb = lwp_getpcb(l); 1374 struct trapframe *tf; 1375 1376 #ifdef USER_LDT 1377 pmap_ldt_cleanup(l); 1378 #endif 1379 1380 fpu_clear(l, pack->ep_osversion >= 699002600 1381 ? __NetBSD_NPXCW__ : __NetBSD_COMPAT_NPXCW__); 1382 x86_dbregs_clear(l); 1383 1384 kpreempt_disable(); 1385 pcb->pcb_flags = 0; 1386 l->l_proc->p_flag &= ~PK_32; 1387 l->l_md.md_flags = MDL_IRET; 1388 cpu_segregs64_zero(l); 1389 kpreempt_enable(); 1390 1391 tf = l->l_md.md_regs; 1392 memset(tf, 0, sizeof(*tf)); 1393 1394 tf->tf_trapno = T_ASTFLT; 1395 tf->tf_ds = GSEL(GUDATA_SEL, SEL_UPL); 1396 tf->tf_es = GSEL(GUDATA_SEL, SEL_UPL); 1397 tf->tf_rdi = 0; 1398 tf->tf_rsi = 0; 1399 tf->tf_rbp = 0; 1400 tf->tf_rbx = l->l_proc->p_psstrp; 1401 tf->tf_rdx = 0; 1402 tf->tf_rcx = 0; 1403 tf->tf_rax = 0; 1404 tf->tf_rip = pack->ep_entry; 1405 tf->tf_cs = LSEL(LUCODE_SEL, SEL_UPL); 1406 tf->tf_rflags = PSL_USERSET; 1407 tf->tf_rsp = stack; 1408 tf->tf_ss = LSEL(LUDATA_SEL, SEL_UPL); 1409 } 1410 1411 /* 1412 * Initialize segments and descriptor tables 1413 */ 1414 char *ldtstore; 1415 char *gdtstore; 1416 1417 void 1418 setgate(struct gate_descriptor *gd, void *func, 1419 int ist, int type, int dpl, int sel) 1420 { 1421 vaddr_t vaddr; 1422 1423 vaddr = ((vaddr_t)gd) & ~PAGE_MASK; 1424 1425 kpreempt_disable(); 1426 pmap_changeprot_local(vaddr, VM_PROT_READ|VM_PROT_WRITE); 1427 1428 gd->gd_looffset = (uint64_t)func & 0xffff; 1429 gd->gd_selector = sel; 1430 gd->gd_ist = ist; 1431 gd->gd_type = type; 1432 gd->gd_dpl = dpl; 1433 gd->gd_p = 1; 1434 gd->gd_hioffset = (uint64_t)func >> 16; 1435 gd->gd_zero = 0; 1436 gd->gd_xx1 = 0; 1437 gd->gd_xx2 = 0; 1438 gd->gd_xx3 = 0; 1439 1440 pmap_changeprot_local(vaddr, VM_PROT_READ); 1441 kpreempt_enable(); 1442 } 1443 1444 void 1445 unsetgate(struct gate_descriptor *gd) 1446 { 1447 vaddr_t vaddr; 1448 1449 vaddr = ((vaddr_t)gd) & ~PAGE_MASK; 1450 1451 kpreempt_disable(); 1452 pmap_changeprot_local(vaddr, VM_PROT_READ|VM_PROT_WRITE); 1453 1454 memset(gd, 0, sizeof (*gd)); 1455 1456 pmap_changeprot_local(vaddr, VM_PROT_READ); 1457 kpreempt_enable(); 1458 } 1459 1460 void 1461 setregion(struct region_descriptor *rd, void *base, uint16_t limit) 1462 { 1463 rd->rd_limit = limit; 1464 rd->rd_base = (uint64_t)base; 1465 } 1466 1467 /* 1468 * Note that the base and limit fields are ignored in long mode. 
1469 */ 1470 void 1471 set_mem_segment(struct mem_segment_descriptor *sd, void *base, size_t limit, 1472 int type, int dpl, int gran, int def32, int is64) 1473 { 1474 sd->sd_lolimit = (unsigned)limit; 1475 sd->sd_lobase = (unsigned long)base; 1476 sd->sd_type = type; 1477 sd->sd_dpl = dpl; 1478 sd->sd_p = 1; 1479 sd->sd_hilimit = (unsigned)limit >> 16; 1480 sd->sd_avl = 0; 1481 sd->sd_long = is64; 1482 sd->sd_def32 = def32; 1483 sd->sd_gran = gran; 1484 sd->sd_hibase = (unsigned long)base >> 24; 1485 } 1486 1487 void 1488 set_sys_segment(struct sys_segment_descriptor *sd, void *base, size_t limit, 1489 int type, int dpl, int gran) 1490 { 1491 memset(sd, 0, sizeof *sd); 1492 sd->sd_lolimit = (unsigned)limit; 1493 sd->sd_lobase = (uint64_t)base; 1494 sd->sd_type = type; 1495 sd->sd_dpl = dpl; 1496 sd->sd_p = 1; 1497 sd->sd_hilimit = (unsigned)limit >> 16; 1498 sd->sd_gran = gran; 1499 sd->sd_hibase = (uint64_t)base >> 24; 1500 } 1501 1502 void 1503 cpu_init_idt(struct cpu_info *ci) 1504 { 1505 struct region_descriptor region; 1506 idt_descriptor_t *idt; 1507 1508 idt = ci->ci_idtvec.iv_idt; 1509 setregion(®ion, idt, NIDT * sizeof(idt[0]) - 1); 1510 lidt(®ion); 1511 } 1512 1513 #define IDTVEC(name) __CONCAT(X, name) 1514 typedef void (vector)(void); 1515 extern vector IDTVEC(syscall); 1516 extern vector IDTVEC(syscall32); 1517 extern vector IDTVEC(osyscall); 1518 extern vector *x86_exceptions[]; 1519 1520 #ifndef XENPV 1521 static void 1522 init_x86_64_ksyms(void) 1523 { 1524 #if NKSYMS || defined(DDB) || defined(MODULAR) 1525 extern int end; 1526 extern int *esym; 1527 struct btinfo_symtab *symtab; 1528 vaddr_t tssym, tesym; 1529 1530 #ifdef DDB 1531 db_machine_init(); 1532 #endif 1533 1534 symtab = lookup_bootinfo(BTINFO_SYMTAB); 1535 if (symtab) { 1536 #ifdef KASLR 1537 tssym = bootspace.head.va; 1538 tesym = bootspace.head.va; /* (unused...) 
*/ 1539 #else 1540 tssym = (vaddr_t)symtab->ssym + KERNBASE; 1541 tesym = (vaddr_t)symtab->esym + KERNBASE; 1542 #endif 1543 ksyms_addsyms_elf(symtab->nsym, (void *)tssym, (void *)tesym); 1544 } else { 1545 uintptr_t endp = (uintptr_t)(void *)&end; 1546 #ifdef XEN 1547 /* 1548 * cpu_probe() / identify_hypervisor() overrides VM_GUEST_GENPVH, 1549 * we can't rely on vm_guest == VM_GUEST_GENPVH 1550 */ 1551 if (pvh_boot && vm_guest != VM_GUEST_XENPVH) 1552 ksyms_addsyms_elf(0, ((long *)endp) + 1, esym); 1553 else 1554 #endif 1555 ksyms_addsyms_elf(*(long *)endp, ((long *)endp) + 1, esym); 1556 } 1557 #endif 1558 } 1559 #endif /* XENPV */ 1560 1561 void __noasan 1562 init_bootspace(void) 1563 { 1564 extern char __rodata_start; 1565 extern char __data_start; 1566 extern char __kernel_end; 1567 size_t i = 0; 1568 1569 memset(&bootspace, 0, sizeof(bootspace)); 1570 1571 bootspace.head.va = KERNTEXTOFF; 1572 bootspace.head.pa = KERNTEXTOFF - KERNBASE; 1573 bootspace.head.sz = 0; 1574 1575 bootspace.segs[i].type = BTSEG_TEXT; 1576 bootspace.segs[i].va = KERNTEXTOFF; 1577 bootspace.segs[i].pa = KERNTEXTOFF - KERNBASE; 1578 bootspace.segs[i].sz = (size_t)&__rodata_start - KERNTEXTOFF; 1579 i++; 1580 1581 bootspace.segs[i].type = BTSEG_RODATA; 1582 bootspace.segs[i].va = (vaddr_t)&__rodata_start; 1583 bootspace.segs[i].pa = (paddr_t)&__rodata_start - KERNBASE; 1584 bootspace.segs[i].sz = (size_t)&__data_start - (size_t)&__rodata_start; 1585 i++; 1586 1587 bootspace.segs[i].type = BTSEG_DATA; 1588 bootspace.segs[i].va = (vaddr_t)&__data_start; 1589 bootspace.segs[i].pa = (paddr_t)&__data_start - KERNBASE; 1590 bootspace.segs[i].sz = (size_t)&__kernel_end - (size_t)&__data_start; 1591 i++; 1592 1593 bootspace.boot.va = (vaddr_t)&__kernel_end; 1594 bootspace.boot.pa = (paddr_t)&__kernel_end - KERNBASE; 1595 bootspace.boot.sz = (size_t)(atdevbase + IOM_SIZE) - 1596 (size_t)&__kernel_end; 1597 1598 /* In locore.S, we allocated a tmp va. We will use it now. */ 1599 bootspace.spareva = KERNBASE + NKL2_KIMG_ENTRIES * NBPD_L2; 1600 1601 /* Virtual address of the L4 page. */ 1602 bootspace.pdir = (vaddr_t)(PDPpaddr + KERNBASE); 1603 1604 /* Kernel module map. */ 1605 bootspace.smodule = (vaddr_t)atdevbase + IOM_SIZE; 1606 bootspace.emodule = KERNBASE + NKL2_KIMG_ENTRIES * NBPD_L2; 1607 } 1608 1609 static void 1610 init_pte(void) 1611 { 1612 #ifndef XENPV 1613 extern uint32_t nox_flag; 1614 pd_entry_t *pdir = (pd_entry_t *)bootspace.pdir; 1615 pdir[L4_SLOT_PTE] = PDPpaddr | PTE_W | ((uint64_t)nox_flag << 32) | 1616 PTE_P; 1617 #endif 1618 1619 extern pd_entry_t *normal_pdes[3]; 1620 normal_pdes[0] = L2_BASE; 1621 normal_pdes[1] = L3_BASE; 1622 normal_pdes[2] = L4_BASE; 1623 } 1624 1625 void 1626 init_slotspace(void) 1627 { 1628 /* 1629 * XXX Too early to use cprng(9), or even entropy_extract. 1630 */ 1631 struct entpool pool; 1632 size_t randhole; 1633 vaddr_t randva; 1634 uint64_t sample; 1635 vaddr_t va; 1636 1637 memset(&pool, 0, sizeof pool); 1638 cpu_rng_early_sample(&sample); 1639 entpool_enter(&pool, &sample, sizeof sample); 1640 1641 memset(&slotspace, 0, sizeof(slotspace)); 1642 1643 /* User. [256, because we want to land in >= 256] */ 1644 slotspace.area[SLAREA_USER].sslot = 0; 1645 slotspace.area[SLAREA_USER].nslot = PDIR_SLOT_USERLIM+1; 1646 slotspace.area[SLAREA_USER].active = true; 1647 1648 #ifdef XENPV 1649 /* PTE. 
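	 * Under XENPV the recursive PTE slot stays at the fixed
	 * PDIR_SLOT_PTE; native kernels randomize it further down with
	 * slotspace_rand().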
*/ 1650 slotspace.area[SLAREA_PTE].sslot = PDIR_SLOT_PTE; 1651 slotspace.area[SLAREA_PTE].nslot = 1; 1652 slotspace.area[SLAREA_PTE].active = true; 1653 #endif 1654 1655 #ifdef __HAVE_PCPU_AREA 1656 /* Per-CPU. */ 1657 slotspace.area[SLAREA_PCPU].sslot = PDIR_SLOT_PCPU; 1658 slotspace.area[SLAREA_PCPU].nslot = 1; 1659 slotspace.area[SLAREA_PCPU].active = true; 1660 #endif 1661 1662 #ifdef __HAVE_DIRECT_MAP 1663 /* Direct Map. [Randomized later] */ 1664 slotspace.area[SLAREA_DMAP].active = false; 1665 #endif 1666 1667 #ifdef XENPV 1668 /* Hypervisor. */ 1669 slotspace.area[SLAREA_HYPV].sslot = 256; 1670 slotspace.area[SLAREA_HYPV].nslot = 17; 1671 slotspace.area[SLAREA_HYPV].active = true; 1672 #endif 1673 1674 #ifdef KASAN 1675 /* ASAN. */ 1676 slotspace.area[SLAREA_ASAN].sslot = L4_SLOT_KASAN; 1677 slotspace.area[SLAREA_ASAN].nslot = NL4_SLOT_KASAN; 1678 slotspace.area[SLAREA_ASAN].active = true; 1679 #endif 1680 1681 #ifdef KMSAN 1682 /* MSAN. */ 1683 slotspace.area[SLAREA_MSAN].sslot = L4_SLOT_KMSAN; 1684 slotspace.area[SLAREA_MSAN].nslot = NL4_SLOT_KMSAN; 1685 slotspace.area[SLAREA_MSAN].active = true; 1686 #endif 1687 1688 /* Kernel. */ 1689 slotspace.area[SLAREA_KERN].sslot = L4_SLOT_KERNBASE; 1690 slotspace.area[SLAREA_KERN].nslot = 1; 1691 slotspace.area[SLAREA_KERN].active = true; 1692 1693 /* Main. */ 1694 cpu_rng_early_sample(&sample); 1695 entpool_enter(&pool, &sample, sizeof sample); 1696 entpool_extract(&pool, &randhole, sizeof randhole); 1697 entpool_extract(&pool, &randva, sizeof randva); 1698 va = slotspace_rand(SLAREA_MAIN, NKL4_MAX_ENTRIES * NBPD_L4, 1699 NBPD_L4, randhole, randva); /* TODO: NBPD_L1 */ 1700 vm_min_kernel_address = va; 1701 vm_max_kernel_address = va + NKL4_MAX_ENTRIES * NBPD_L4; 1702 1703 #ifndef XENPV 1704 /* PTE. */ 1705 cpu_rng_early_sample(&sample); 1706 entpool_enter(&pool, &sample, sizeof sample); 1707 entpool_extract(&pool, &randhole, sizeof randhole); 1708 entpool_extract(&pool, &randva, sizeof randva); 1709 va = slotspace_rand(SLAREA_PTE, NBPD_L4, NBPD_L4, randhole, randva); 1710 pte_base = (pd_entry_t *)va; 1711 #endif 1712 1713 explicit_memset(&pool, 0, sizeof pool); 1714 } 1715 1716 void 1717 init_x86_64(paddr_t first_avail) 1718 { 1719 extern void consinit(void); 1720 struct region_descriptor region; 1721 struct mem_segment_descriptor *ldt_segp; 1722 struct idt_vec *iv; 1723 idt_descriptor_t *idt; 1724 int x; 1725 struct pcb *pcb; 1726 extern vaddr_t lwp0uarea; 1727 #ifndef XENPV 1728 extern paddr_t local_apic_pa; 1729 #endif 1730 1731 KASSERT(first_avail % PAGE_SIZE == 0); 1732 1733 #ifdef XENPV 1734 KASSERT(HYPERVISOR_shared_info != NULL); 1735 cpu_info_primary.ci_vcpu = &HYPERVISOR_shared_info->vcpu_info[0]; 1736 #endif 1737 1738 #ifdef XEN 1739 if (pvh_boot) 1740 xen_parse_cmdline(XEN_PARSE_BOOTFLAGS, NULL); 1741 #endif 1742 init_pte(); 1743 1744 uvm_lwp_setuarea(&lwp0, lwp0uarea); 1745 1746 cpu_probe(&cpu_info_primary); 1747 #ifdef SVS 1748 svs_init(); 1749 #endif 1750 1751 /* 1752 * Initialize MSRs on cpu0: 1753 * 1754 * - Enables SYSCALL/SYSRET. 1755 * 1756 * - Sets up %fs and %gs so that %gs points to the current 1757 * struct cpu_info as needed for CPUVAR(...), curcpu(), and 1758 * curlwp. 1759 * 1760 * - Enables the no-execute bit if supported. 1761 * 1762 * Thus, after this point, CPUVAR(...), curcpu(), and curlwp 1763 * will work on cpu0. 1764 * 1765 * Note: The call to cpu_init_msrs for secondary CPUs happens 1766 * in cpu_hatch. 
1767 */ 1768 cpu_init_msrs(&cpu_info_primary, true); 1769 1770 #ifndef XENPV 1771 cpu_speculation_init(&cpu_info_primary); 1772 #endif 1773 1774 use_pae = 1; /* PAE always enabled in long mode */ 1775 1776 pcb = lwp_getpcb(&lwp0); 1777 #ifdef XENPV 1778 mutex_init(&pte_lock, MUTEX_DEFAULT, IPL_VM); 1779 pcb->pcb_cr3 = xen_start_info.pt_base - KERNBASE; 1780 #else 1781 pcb->pcb_cr3 = PDPpaddr; 1782 #endif 1783 1784 #if NISA > 0 || NPCI > 0 1785 x86_bus_space_init(); 1786 #endif 1787 1788 pat_init(&cpu_info_primary); 1789 1790 consinit(); /* XXX SHOULD NOT BE DONE HERE */ 1791 1792 /* 1793 * Initialize PAGE_SIZE-dependent variables. 1794 */ 1795 uvm_md_init(); 1796 1797 uvmexp.ncolors = 2; 1798 1799 avail_start = first_avail; 1800 1801 #ifndef XENPV 1802 /* 1803 * Low memory reservations: 1804 * Page 0: BIOS data 1805 * Page 1: BIOS callback (not used yet, for symmetry with i386) 1806 * Page 2: MP bootstrap code (MP_TRAMPOLINE) 1807 * Page 3: ACPI wakeup code (ACPI_WAKEUP_ADDR) 1808 * Page 4: Temporary page table for 0MB-4MB 1809 * Page 5: Temporary page directory 1810 * Page 6: Temporary page map level 3 1811 * Page 7: Temporary page map level 4 1812 */ 1813 lowmem_rsvd = 8 * PAGE_SIZE; 1814 1815 /* Initialize the memory clusters (needed in pmap_bootstrap). */ 1816 init_x86_clusters(); 1817 #else 1818 /* Parse Xen command line (replace bootinfo) */ 1819 xen_parse_cmdline(XEN_PARSE_BOOTFLAGS, NULL); 1820 1821 avail_end = ctob(xen_start_info.nr_pages); 1822 pmap_pa_start = (KERNTEXTOFF - KERNBASE); 1823 pmap_pa_end = avail_end; 1824 #endif 1825 1826 /* 1827 * Call pmap initialization to make new kernel address space. 1828 * We must do this before loading pages into the VM system. 1829 */ 1830 pmap_bootstrap(VM_MIN_KERNEL_ADDRESS); 1831 1832 /* 1833 * Initialize RNG to get entropy ASAP either from CPU 1834 * RDRAND/RDSEED or from seed on disk. Constraints: 1835 * 1836 * - Must happen after cpu_init_msrs so that curcpu() and 1837 * curlwp work. 1838 * 1839 * - Must happen after consinit so we have the opportunity to 1840 * print useful feedback. 1841 * 1842 * - On KASLR kernels, must happen after pmap_bootstrap because 1843 * x86_rndseed requires access to the direct map. 1844 */ 1845 cpu_rng_init(); 1846 x86_rndseed(); 1847 1848 #ifndef XENPV 1849 /* Internalize the physical pages into the VM system. 
*/ 1850 init_x86_vm(avail_start); 1851 #else 1852 physmem = xen_start_info.nr_pages; 1853 uvm_page_physload(atop(avail_start), atop(avail_end), 1854 atop(avail_start), atop(avail_end), VM_FREELIST_DEFAULT); 1855 #endif 1856 1857 init_x86_msgbuf(); 1858 1859 kasan_init(); 1860 kcsan_init(); 1861 kmsan_init((void *)lwp0uarea); 1862 1863 pmap_growkernel(VM_MIN_KERNEL_ADDRESS + 32 * 1024 * 1024); 1864 1865 kpreempt_disable(); 1866 1867 #ifndef XENPV 1868 pmap_kenter_pa(local_apic_va, local_apic_pa, 1869 VM_PROT_READ|VM_PROT_WRITE, 0); 1870 pmap_update(pmap_kernel()); 1871 memset((void *)local_apic_va, 0, PAGE_SIZE); 1872 #endif 1873 1874 pmap_kenter_pa(idt_vaddr, idt_paddr, VM_PROT_READ|VM_PROT_WRITE, 0); 1875 pmap_kenter_pa(gdt_vaddr, gdt_paddr, VM_PROT_READ|VM_PROT_WRITE, 0); 1876 pmap_kenter_pa(ldt_vaddr, ldt_paddr, VM_PROT_READ|VM_PROT_WRITE, 0); 1877 pmap_update(pmap_kernel()); 1878 memset((void *)idt_vaddr, 0, PAGE_SIZE); 1879 memset((void *)gdt_vaddr, 0, PAGE_SIZE); 1880 memset((void *)ldt_vaddr, 0, PAGE_SIZE); 1881 1882 #ifndef XENPV 1883 pmap_changeprot_local(idt_vaddr, VM_PROT_READ); 1884 #endif 1885 1886 pmap_update(pmap_kernel()); 1887 1888 iv = &(cpu_info_primary.ci_idtvec); 1889 idt_vec_init_cpu_md(iv, cpu_index(&cpu_info_primary)); 1890 idt = iv->iv_idt; 1891 gdtstore = (char *)gdt_vaddr; 1892 ldtstore = (char *)ldt_vaddr; 1893 1894 /* 1895 * Make GDT gates and memory segments. 1896 */ 1897 set_mem_segment(GDT_ADDR_MEM(gdtstore, GCODE_SEL), 0, 1898 0xfffff, SDT_MEMERA, SEL_KPL, 1, 0, 1); 1899 1900 set_mem_segment(GDT_ADDR_MEM(gdtstore, GDATA_SEL), 0, 1901 0xfffff, SDT_MEMRWA, SEL_KPL, 1, 0, 1); 1902 1903 set_mem_segment(GDT_ADDR_MEM(gdtstore, GUCODE_SEL), 0, 1904 x86_btop(VM_MAXUSER_ADDRESS) - 1, SDT_MEMERA, SEL_UPL, 1, 0, 1); 1905 1906 set_mem_segment(GDT_ADDR_MEM(gdtstore, GUDATA_SEL), 0, 1907 x86_btop(VM_MAXUSER_ADDRESS) - 1, SDT_MEMRWA, SEL_UPL, 1, 0, 1); 1908 1909 #ifndef XENPV 1910 set_sys_segment(GDT_ADDR_SYS(gdtstore, GLDT_SEL), ldtstore, 1911 LDT_SIZE - 1, SDT_SYSLDT, SEL_KPL, 0); 1912 #endif 1913 1914 /* 1915 * Make LDT memory segments. 1916 */ 1917 *(struct mem_segment_descriptor *)(ldtstore + LUCODE_SEL) = 1918 *GDT_ADDR_MEM(gdtstore, GUCODE_SEL); 1919 *(struct mem_segment_descriptor *)(ldtstore + LUDATA_SEL) = 1920 *GDT_ADDR_MEM(gdtstore, GUDATA_SEL); 1921 1922 /* 1923 * 32 bit GDT entries. 1924 */ 1925 set_mem_segment(GDT_ADDR_MEM(gdtstore, GUCODE32_SEL), 0, 1926 x86_btop(VM_MAXUSER_ADDRESS32) - 1, SDT_MEMERA, SEL_UPL, 1, 1, 0); 1927 1928 set_mem_segment(GDT_ADDR_MEM(gdtstore, GUDATA32_SEL), 0, 1929 x86_btop(VM_MAXUSER_ADDRESS32) - 1, SDT_MEMRWA, SEL_UPL, 1, 1, 0); 1930 1931 set_mem_segment(GDT_ADDR_MEM(gdtstore, GUFS_SEL), 0, 1932 x86_btop(VM_MAXUSER_ADDRESS32) - 1, SDT_MEMRWA, SEL_UPL, 1, 1, 0); 1933 1934 set_mem_segment(GDT_ADDR_MEM(gdtstore, GUGS_SEL), 0, 1935 x86_btop(VM_MAXUSER_ADDRESS32) - 1, SDT_MEMRWA, SEL_UPL, 1, 1, 0); 1936 1937 /* 1938 * 32 bit LDT entries. 1939 */ 1940 ldt_segp = (struct mem_segment_descriptor *)(ldtstore + LUCODE32_SEL); 1941 set_mem_segment(ldt_segp, 0, x86_btop(VM_MAXUSER_ADDRESS32) - 1, 1942 SDT_MEMERA, SEL_UPL, 1, 1, 0); 1943 ldt_segp = (struct mem_segment_descriptor *)(ldtstore + LUDATA32_SEL); 1944 set_mem_segment(ldt_segp, 0, x86_btop(VM_MAXUSER_ADDRESS32) - 1, 1945 SDT_MEMRWA, SEL_UPL, 1, 1, 0); 1946 1947 /* CPU-specific IDT exceptions. */ 1948 for (x = 0; x < NCPUIDT; x++) { 1949 int sel, ist; 1950 1951 /* Reset to default. 
Special cases below */ 1952 sel = SEL_KPL; 1953 ist = 0; 1954 1955 idt_vec_reserve(iv, x); 1956 1957 switch (x) { 1958 case 1: /* DB */ 1959 ist = 4; 1960 break; 1961 case 2: /* NMI */ 1962 ist = 3; 1963 break; 1964 case 3: 1965 case 4: 1966 sel = SEL_UPL; 1967 break; 1968 case 8: /* double fault */ 1969 ist = 2; 1970 break; 1971 #ifdef XENPV 1972 case 18: /* MCA */ 1973 sel |= 0x4; /* Auto EOI/mask */ 1974 break; 1975 #endif /* XENPV */ 1976 default: 1977 break; 1978 } 1979 1980 set_idtgate(&idt[x], x86_exceptions[x], ist, SDT_SYS386IGT, 1981 sel, GSEL(GCODE_SEL, SEL_KPL)); 1982 } 1983 1984 /* new-style interrupt gate for syscalls */ 1985 idt_vec_reserve(iv, 128); 1986 set_idtgate(&idt[128], &IDTVEC(osyscall), 0, SDT_SYS386IGT, SEL_UPL, 1987 GSEL(GCODE_SEL, SEL_KPL)); 1988 1989 kpreempt_enable(); 1990 1991 setregion(®ion, gdtstore, DYNSEL_START - 1); 1992 lgdt(®ion); 1993 1994 #ifdef XENPV 1995 /* Init Xen callbacks and syscall handlers */ 1996 if (HYPERVISOR_set_callbacks( 1997 (unsigned long) hypervisor_callback, 1998 (unsigned long) failsafe_callback, 1999 (unsigned long) Xsyscall)) 2000 panic("HYPERVISOR_set_callbacks() failed"); 2001 #endif /* XENPV */ 2002 2003 cpu_init_idt(&cpu_info_primary); 2004 2005 #ifdef XENPV 2006 xen_init_ksyms(); 2007 #else /* XENPV */ 2008 #ifdef XEN 2009 if (vm_guest == VM_GUEST_XENPVH) 2010 xen_init_ksyms(); 2011 else 2012 #endif /* XEN */ 2013 init_x86_64_ksyms(); 2014 #endif /* XENPV */ 2015 2016 #ifndef XENPV 2017 intr_default_setup(); 2018 #else 2019 events_default_setup(); 2020 #endif 2021 2022 splraise(IPL_HIGH); 2023 x86_enable_intr(); 2024 2025 #ifdef DDB 2026 if (boothowto & RB_KDB) 2027 Debugger(); 2028 #endif 2029 #ifdef KGDB 2030 kgdb_port_init(); 2031 if (boothowto & RB_KDB) { 2032 kgdb_debug_init = 1; 2033 kgdb_connect(1); 2034 } 2035 #endif 2036 2037 pcb->pcb_dbregs = NULL; 2038 x86_dbregs_init(); 2039 } 2040 2041 void 2042 cpu_reset(void) 2043 { 2044 #ifndef XENPV 2045 idt_descriptor_t *idt; 2046 vaddr_t vaddr; 2047 2048 idt = cpu_info_primary.ci_idtvec.iv_idt; 2049 vaddr = (vaddr_t)idt; 2050 #endif 2051 2052 x86_disable_intr(); 2053 2054 #ifdef XENPV 2055 HYPERVISOR_reboot(); 2056 #else 2057 2058 x86_reset(); 2059 2060 /* 2061 * Try to cause a triple fault and watchdog reset by making the IDT 2062 * invalid and causing a fault. 2063 */ 2064 kpreempt_disable(); 2065 pmap_changeprot_local(vaddr, VM_PROT_READ|VM_PROT_WRITE); 2066 memset((void *)idt, 0, NIDT * sizeof(idt[0])); 2067 kpreempt_enable(); 2068 breakpoint(); 2069 2070 #if 0 2071 /* 2072 * Try to cause a triple fault and watchdog reset by unmapping the 2073 * entire address space and doing a TLB flush. 
void
cpu_reset(void)
{
#ifndef XENPV
	idt_descriptor_t *idt;
	vaddr_t vaddr;

	idt = cpu_info_primary.ci_idtvec.iv_idt;
	vaddr = (vaddr_t)idt;
#endif

	x86_disable_intr();

#ifdef XENPV
	HYPERVISOR_reboot();
#else

	x86_reset();

	/*
	 * Try to cause a triple fault and watchdog reset by making the IDT
	 * invalid and causing a fault.
	 */
	kpreempt_disable();
	pmap_changeprot_local(vaddr, VM_PROT_READ|VM_PROT_WRITE);
	memset((void *)idt, 0, NIDT * sizeof(idt[0]));
	kpreempt_enable();
	breakpoint();

#if 0
	/*
	 * Try to cause a triple fault and watchdog reset by unmapping the
	 * entire address space and doing a TLB flush.
	 */
	memset((void *)PTD, 0, PAGE_SIZE);
	tlbflush();
#endif
#endif /* XENPV */

	for (;;);
}

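/*
 * Capture the LWP's user register state into an mcontext: the general
 * registers from the trapframe (with the segment registers forced to the
 * standard user selectors), the TLS base and the FPU state.  If the LWP
 * is inside a restartable atomic sequence, %rip is rewound to the
 * sequence's restart address.
 */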
void
cpu_getmcontext(struct lwp *l, mcontext_t *mcp, unsigned int *flags)
{
	const struct trapframe *tf = l->l_md.md_regs;
	__greg_t ras_rip;

	mcp->__gregs[_REG_RDI] = tf->tf_rdi;
	mcp->__gregs[_REG_RSI] = tf->tf_rsi;
	mcp->__gregs[_REG_RDX] = tf->tf_rdx;
	mcp->__gregs[_REG_R10] = tf->tf_r10;
	mcp->__gregs[_REG_R8] = tf->tf_r8;
	mcp->__gregs[_REG_R9] = tf->tf_r9;
	/* argX not touched */
	mcp->__gregs[_REG_RCX] = tf->tf_rcx;
	mcp->__gregs[_REG_R11] = tf->tf_r11;
	mcp->__gregs[_REG_R12] = tf->tf_r12;
	mcp->__gregs[_REG_R13] = tf->tf_r13;
	mcp->__gregs[_REG_R14] = tf->tf_r14;
	mcp->__gregs[_REG_R15] = tf->tf_r15;
	mcp->__gregs[_REG_RBP] = tf->tf_rbp;
	mcp->__gregs[_REG_RBX] = tf->tf_rbx;
	mcp->__gregs[_REG_RAX] = tf->tf_rax;
	mcp->__gregs[_REG_GS] = 0;
	mcp->__gregs[_REG_FS] = 0;
	mcp->__gregs[_REG_ES] = GSEL(GUDATA_SEL, SEL_UPL);
	mcp->__gregs[_REG_DS] = GSEL(GUDATA_SEL, SEL_UPL);
	mcp->__gregs[_REG_TRAPNO] = tf->tf_trapno;
	mcp->__gregs[_REG_ERR] = tf->tf_err;
	mcp->__gregs[_REG_RIP] = tf->tf_rip;
	mcp->__gregs[_REG_CS] = LSEL(LUCODE_SEL, SEL_UPL);
	mcp->__gregs[_REG_RFLAGS] = tf->tf_rflags;
	mcp->__gregs[_REG_RSP] = tf->tf_rsp;
	mcp->__gregs[_REG_SS] = LSEL(LUDATA_SEL, SEL_UPL);

	if ((ras_rip = (__greg_t)ras_lookup(l->l_proc,
	    (void *) mcp->__gregs[_REG_RIP])) != -1)
		mcp->__gregs[_REG_RIP] = ras_rip;

	*flags |= _UC_CPU;

	mcp->_mc_tlsbase = (uintptr_t)l->l_private;
	*flags |= _UC_TLSBASE;

	process_read_fpregs_xmm(l, (struct fxsave *)&mcp->__fpregs);
	*flags |= _UC_FPU;
}

int
cpu_setmcontext(struct lwp *l, const mcontext_t *mcp, unsigned int flags)
{
	struct trapframe *tf = l->l_md.md_regs;
	const __greg_t *gr = mcp->__gregs;
	struct proc *p = l->l_proc;
	int error;
	int64_t rflags;

	CTASSERT(sizeof (mcontext_t) == 26 * 8 + 8 + 512);

	if ((flags & _UC_CPU) != 0) {
		error = cpu_mcontext_validate(l, mcp);
		if (error != 0)
			return error;

		tf->tf_rdi = gr[_REG_RDI];
		tf->tf_rsi = gr[_REG_RSI];
		tf->tf_rdx = gr[_REG_RDX];
		tf->tf_r10 = gr[_REG_R10];
		tf->tf_r8 = gr[_REG_R8];
		tf->tf_r9 = gr[_REG_R9];
		/* argX not touched */
		tf->tf_rcx = gr[_REG_RCX];
		tf->tf_r11 = gr[_REG_R11];
		tf->tf_r12 = gr[_REG_R12];
		tf->tf_r13 = gr[_REG_R13];
		tf->tf_r14 = gr[_REG_R14];
		tf->tf_r15 = gr[_REG_R15];
		tf->tf_rbp = gr[_REG_RBP];
		tf->tf_rbx = gr[_REG_RBX];
		tf->tf_rax = gr[_REG_RAX];
		tf->tf_gs = 0;
		tf->tf_fs = 0;
		tf->tf_es = GSEL(GUDATA_SEL, SEL_UPL);
		tf->tf_ds = GSEL(GUDATA_SEL, SEL_UPL);
		/* trapno, err not touched */
		tf->tf_rip = gr[_REG_RIP];
		tf->tf_cs = LSEL(LUCODE_SEL, SEL_UPL);
		rflags = tf->tf_rflags;
		rflags &= ~PSL_USER;
		tf->tf_rflags = rflags | (gr[_REG_RFLAGS] & PSL_USER);
		tf->tf_rsp = gr[_REG_RSP];
		tf->tf_ss = LSEL(LUDATA_SEL, SEL_UPL);

		l->l_md.md_flags |= MDL_IRET;
	}

	if ((flags & _UC_FPU) != 0)
		process_write_fpregs_xmm(l, (const struct fxsave *)&mcp->__fpregs);

	if ((flags & _UC_TLSBASE) != 0)
		lwp_setprivate(l, (void *)(uintptr_t)mcp->_mc_tlsbase);

	mutex_enter(p->p_lock);
	if (flags & _UC_SETSTACK)
		l->l_sigstk.ss_flags |= SS_ONSTACK;
	if (flags & _UC_CLRSTACK)
		l->l_sigstk.ss_flags &= ~SS_ONSTACK;
	mutex_exit(p->p_lock);

	return 0;
}

int
cpu_mcontext_validate(struct lwp *l, const mcontext_t *mcp)
{
	struct proc *p __diagused = l->l_proc;
	struct trapframe *tf = l->l_md.md_regs;
	const __greg_t *gr;
	uint16_t sel;

	KASSERT((p->p_flag & PK_32) == 0);
	gr = mcp->__gregs;

	if (((gr[_REG_RFLAGS] ^ tf->tf_rflags) & PSL_USERSTATIC) != 0)
		return EINVAL;

	sel = gr[_REG_ES] & 0xffff;
	if (sel != 0 && !VALID_USER_DSEL(sel))
		return EINVAL;

	sel = gr[_REG_FS] & 0xffff;
	if (sel != 0 && !VALID_USER_DSEL(sel))
		return EINVAL;

	sel = gr[_REG_GS] & 0xffff;
	if (sel != 0 && !VALID_USER_DSEL(sel))
		return EINVAL;

	sel = gr[_REG_DS] & 0xffff;
	if (!VALID_USER_DSEL(sel))
		return EINVAL;

#ifndef XENPV
	sel = gr[_REG_SS] & 0xffff;
	if (!VALID_USER_DSEL(sel))
		return EINVAL;

	sel = gr[_REG_CS] & 0xffff;
	if (!VALID_USER_CSEL(sel))
		return EINVAL;
#endif

	if (gr[_REG_RIP] >= VM_MAXUSER_ADDRESS)
		return EINVAL;

	return 0;
}

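/*
 * Decide whether an access to a kernel virtual address is handled here:
 * addresses inside the bootstrap segments (kernel image, boot area) and
 * the module map are checked against the requested protection, and writes
 * to text and rodata are rejected.  Other addresses are left to the
 * generic code, with *handled set to false.
 */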
int
mm_md_kernacc(void *ptr, vm_prot_t prot, bool *handled)
{
	const vaddr_t v = (vaddr_t)ptr;
	vaddr_t kva, kva_end;
	size_t i;

	kva = bootspace.head.va;
	kva_end = kva + bootspace.head.sz;
	if (v >= kva && v < kva_end) {
		*handled = true;
		return 0;
	}

	for (i = 0; i < BTSPACE_NSEGS; i++) {
		kva = bootspace.segs[i].va;
		kva_end = kva + bootspace.segs[i].sz;
		if (v < kva || v >= kva_end)
			continue;
		*handled = true;
		if (bootspace.segs[i].type == BTSEG_TEXT ||
		    bootspace.segs[i].type == BTSEG_RODATA) {
			if (prot & VM_PROT_WRITE) {
				return EFAULT;
			}
		}
		return 0;
	}

	kva = bootspace.boot.va;
	kva_end = kva + bootspace.boot.sz;
	if (v >= kva && v < kva_end) {
		*handled = true;
		return 0;
	}

	if (v >= bootspace.smodule && v < bootspace.emodule) {
		*handled = true;
		if (!uvm_map_checkprot(module_map, v, v + 1, prot)) {
			return EFAULT;
		}
	} else {
		*handled = false;
	}
	return 0;
}

/*
 * Zero out a 64bit LWP's segments registers. Used when exec'ing a new
 * 64bit program.
 */
void
cpu_segregs64_zero(struct lwp *l)
{
	struct trapframe * const tf = l->l_md.md_regs;
	struct pcb *pcb;
	uint64_t zero = 0;

	KASSERT(kpreempt_disabled());
	KASSERT((l->l_proc->p_flag & PK_32) == 0);
	KASSERT(l == curlwp);

	pcb = lwp_getpcb(l);

	tf->tf_fs = 0;
	tf->tf_gs = 0;
	setds(GSEL(GUDATA_SEL, SEL_UPL));
	setes(GSEL(GUDATA_SEL, SEL_UPL));
	setfs(0);
	setusergs(0);

#ifndef XENPV
	wrmsr(MSR_FSBASE, 0);
	wrmsr(MSR_KERNELGSBASE, 0);
#else
	HYPERVISOR_set_segment_base(SEGBASE_FS, 0);
	HYPERVISOR_set_segment_base(SEGBASE_GS_USER, 0);
#endif

	pcb->pcb_fs = 0;
	pcb->pcb_gs = 0;
	update_descriptor(&curcpu()->ci_gdt[GUFS_SEL], &zero);
	update_descriptor(&curcpu()->ci_gdt[GUGS_SEL], &zero);
}

/*
 * Zero out a 32bit LWP's segments registers. Used when exec'ing a new
 * 32bit program.
 */
void
cpu_segregs32_zero(struct lwp *l)
{
	struct trapframe * const tf = l->l_md.md_regs;
	struct pcb *pcb;
	uint64_t zero = 0;

	KASSERT(kpreempt_disabled());
	KASSERT(l->l_proc->p_flag & PK_32);
	KASSERT(l == curlwp);

	pcb = lwp_getpcb(l);

	tf->tf_fs = 0;
	tf->tf_gs = 0;
	setds(GSEL(GUDATA32_SEL, SEL_UPL));
	setes(GSEL(GUDATA32_SEL, SEL_UPL));
	setfs(0);
	setusergs(0);
	pcb->pcb_fs = 0;
	pcb->pcb_gs = 0;
	update_descriptor(&curcpu()->ci_gdt[GUFS_SEL], &zero);
	update_descriptor(&curcpu()->ci_gdt[GUGS_SEL], &zero);
}

/*
 * Load an LWP's TLS context, possibly changing the %fs and %gs selectors.
 * Used only for 32-bit processes.
 */
void
cpu_fsgs_reload(struct lwp *l, int fssel, int gssel)
{
	struct trapframe *tf;
	struct pcb *pcb;

	KASSERT(l->l_proc->p_flag & PK_32);
	KASSERT(l == curlwp);

	tf = l->l_md.md_regs;
	fssel &= 0xFFFF;
	gssel &= 0xFFFF;

	pcb = lwp_getpcb(l);
	kpreempt_disable();
	update_descriptor(&curcpu()->ci_gdt[GUFS_SEL], &pcb->pcb_fs);
	update_descriptor(&curcpu()->ci_gdt[GUGS_SEL], &pcb->pcb_gs);

#ifdef XENPV
	setusergs(gssel);
#endif

	tf->tf_fs = fssel;
	tf->tf_gs = gssel;
	kpreempt_enable();
}

bool
mm_md_direct_mapped_io(void *addr, paddr_t *paddr)
{
	vaddr_t va = (vaddr_t)addr;

#ifdef __HAVE_DIRECT_MAP
	if (va >= PMAP_DIRECT_BASE && va < PMAP_DIRECT_END) {
		*paddr = PMAP_DIRECT_UNMAP(va);
		return true;
	}
#else
	__USE(va);
#endif

	return false;
}

bool
mm_md_direct_mapped_phys(paddr_t paddr, vaddr_t *vaddr)
{
#ifdef __HAVE_DIRECT_MAP
	*vaddr = PMAP_DIRECT_MAP(paddr);
	return true;
#else
	return false;
#endif
}

static void
idt_vec_copy(struct idt_vec *dst, struct idt_vec *src)
{
	idt_descriptor_t *idt_dst;

	idt_dst = dst->iv_idt;

	kpreempt_disable();
	pmap_changeprot_local((vaddr_t)idt_dst, VM_PROT_READ|VM_PROT_WRITE);

	memcpy(idt_dst, src->iv_idt, PAGE_SIZE);
	memcpy(dst->iv_allocmap, src->iv_allocmap, sizeof(dst->iv_allocmap));

	pmap_changeprot_local((vaddr_t)idt_dst, VM_PROT_READ);
	kpreempt_enable();
}

void
idt_vec_init_cpu_md(struct idt_vec *iv, cpuid_t cid)
{
	vaddr_t va;

	if (cid != cpu_index(&cpu_info_primary) &&
	    idt_vec_is_pcpu()) {
#ifdef __HAVE_PCPU_AREA
		va = (vaddr_t)&pcpuarea->ent[cid].idt;
#else
		struct vm_page *pg;

		va = uvm_km_alloc(kernel_map, PAGE_SIZE, 0,
		    UVM_KMF_VAONLY);
		pg = uvm_pagealloc(NULL, 0, NULL, UVM_PGA_ZERO);
		if (pg == NULL) {
			panic("failed to allocate a page for IDT");
		}
		pmap_kenter_pa(va, VM_PAGE_TO_PHYS(pg),
		    VM_PROT_READ|VM_PROT_WRITE, 0);
		pmap_update(pmap_kernel());
#endif

		memset((void *)va, 0, PAGE_SIZE);
#ifndef XENPV
		pmap_changeprot_local(va, VM_PROT_READ);
#endif
		pmap_update(pmap_kernel());

		iv->iv_idt = (void *)va;
		idt_vec_copy(iv, &(cpu_info_primary.ci_idtvec));
	} else {
		iv->iv_idt = (void *)idt_vaddr;
	}
}