      1 /*	$NetBSD: machdep.c,v 1.376 2025/04/30 15:30:53 imil Exp $	*/
      2 
      3 /*
      4  * Copyright (c) 1996, 1997, 1998, 2000, 2006, 2007, 2008, 2011
      5  *     The NetBSD Foundation, Inc.
      6  * All rights reserved.
      7  *
      8  * This code is derived from software contributed to The NetBSD Foundation
      9  * by Charles M. Hannum and by Jason R. Thorpe of the Numerical Aerospace
     10  * Simulation Facility, NASA Ames Research Center.
     11  *
     12  * This code is derived from software contributed to The NetBSD Foundation
     13  * by Coyote Point Systems, Inc. which was written under contract to Coyote
     14  * Point by Jed Davis and Devon O'Dell.
     15  *
     16  * Redistribution and use in source and binary forms, with or without
     17  * modification, are permitted provided that the following conditions
     18  * are met:
     19  * 1. Redistributions of source code must retain the above copyright
     20  *    notice, this list of conditions and the following disclaimer.
     21  * 2. Redistributions in binary form must reproduce the above copyright
     22  *    notice, this list of conditions and the following disclaimer in the
     23  *    documentation and/or other materials provided with the distribution.
     24  *
     25  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
     26  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
     27  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     28  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
     29  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
     30  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
     31  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
     32  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
     33  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
     34  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
     35  * POSSIBILITY OF SUCH DAMAGE.
     36  */
     37 
     38 /*
     39  * Copyright (c) 2006 Mathieu Ropert <mro (at) adviseo.fr>
     40  *
     41  * Permission to use, copy, modify, and distribute this software for any
     42  * purpose with or without fee is hereby granted, provided that the above
     43  * copyright notice and this permission notice appear in all copies.
     44  *
     45  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
     46  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
     47  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
     48  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
     49  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
     50  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
     51  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
     52  */
     53 
     54 /*
     55  * Copyright (c) 2007 Manuel Bouyer.
     56  *
     57  * Redistribution and use in source and binary forms, with or without
     58  * modification, are permitted provided that the following conditions
     59  * are met:
     60  * 1. Redistributions of source code must retain the above copyright
     61  *    notice, this list of conditions and the following disclaimer.
     62  * 2. Redistributions in binary form must reproduce the above copyright
     63  *    notice, this list of conditions and the following disclaimer in the
     64  *    documentation and/or other materials provided with the distribution.
     65  *
     66  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
     67  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
     68  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
     69  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
     70  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
     71  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
     72  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
     73  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     74  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
     75  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     76  */
     77 
     78 /*
     79  * Copyright (c) 1982, 1987, 1990 The Regents of the University of California.
     80  * All rights reserved.
     81  *
     82  * This code is derived from software contributed to Berkeley by
     83  * William Jolitz.
     84  *
     85  * Redistribution and use in source and binary forms, with or without
     86  * modification, are permitted provided that the following conditions
     87  * are met:
     88  * 1. Redistributions of source code must retain the above copyright
     89  *    notice, this list of conditions and the following disclaimer.
     90  * 2. Redistributions in binary form must reproduce the above copyright
     91  *    notice, this list of conditions and the following disclaimer in the
     92  *    documentation and/or other materials provided with the distribution.
     93  * 3. Neither the name of the University nor the names of its contributors
     94  *    may be used to endorse or promote products derived from this software
     95  *    without specific prior written permission.
     96  *
     97  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
     98  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     99  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
    100  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
    101  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
    102  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
    103  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
    104  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
    105  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
    106  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
    107  * SUCH DAMAGE.
    108  *
    109  *	@(#)machdep.c	7.4 (Berkeley) 6/3/91
    110  */
    111 
    112 #include <sys/cdefs.h>
    113 __KERNEL_RCSID(0, "$NetBSD: machdep.c,v 1.376 2025/04/30 15:30:53 imil Exp $");
    114 
    115 #include "opt_modular.h"
    116 #include "opt_user_ldt.h"
    117 #include "opt_ddb.h"
    118 #include "opt_kgdb.h"
    119 #include "opt_cpureset_delay.h"
    120 #include "opt_mtrr.h"
    121 #include "opt_realmem.h"
    122 #include "opt_xen.h"
    123 #include "opt_svs.h"
    124 #include "opt_kaslr.h"
    125 #ifndef XENPV
    126 #include "opt_physmem.h"
    127 #endif
    128 #include "isa.h"
    129 #include "pci.h"
    130 
    131 #include <sys/param.h>
    132 #include <sys/systm.h>
    133 #include <sys/signal.h>
    134 #include <sys/signalvar.h>
    135 #include <sys/kernel.h>
    136 #include <sys/cpu.h>
    137 #include <sys/exec.h>
    138 #include <sys/exec_aout.h>	/* for MID_* */
    139 #include <sys/reboot.h>
    140 #include <sys/conf.h>
    141 #include <sys/msgbuf.h>
    142 #include <sys/mount.h>
    143 #include <sys/core.h>
    144 #include <sys/kcore.h>
    145 #include <sys/ucontext.h>
    146 #include <machine/kcore.h>
    147 #include <sys/ras.h>
    148 #include <sys/syscallargs.h>
    149 #include <sys/ksyms.h>
    150 #include <sys/device.h>
    151 #include <sys/lwp.h>
    152 #include <sys/proc.h>
    153 #include <sys/asan.h>
    154 #include <sys/csan.h>
    155 #include <sys/msan.h>
    156 #include <sys/module.h>
    157 #include <sys/timevar.h>
    158 
    159 #ifdef KGDB
    160 #include <sys/kgdb.h>
    161 #endif
    162 
    163 #include <lib/libkern/entpool.h> /* XXX */
    164 
    165 #include <dev/cons.h>
    166 #include <dev/mm.h>
    167 
    168 #include <uvm/uvm.h>
    169 #include <uvm/uvm_page.h>
    170 
    171 #include <sys/sysctl.h>
    172 
    173 #include <machine/cpu.h>
    174 #include <machine/cpu_rng.h>
    175 #include <machine/cpufunc.h>
    176 #include <machine/gdt.h>
    177 #include <machine/intr.h>
    178 #include <machine/pio.h>
    179 #include <machine/psl.h>
    180 #include <machine/reg.h>
    181 #include <machine/specialreg.h>
    182 #include <machine/bootinfo.h>
    183 #include <x86/fpu.h>
    184 #include <x86/dbregs.h>
    185 #include <machine/mtrr.h>
    186 #include <machine/mpbiosvar.h>
    187 #include <machine/pmap_private.h>
    188 
    189 #include <x86/bootspace.h>
    190 #include <x86/cputypes.h>
    191 #include <x86/cpuvar.h>
    192 #include <x86/machdep.h>
    193 #include <x86/x86/tsc.h>
    194 
    195 #include <dev/isa/isareg.h>
    196 #include <machine/isa_machdep.h>
    197 #include <dev/ic/i8042reg.h>
    198 
    199 #ifdef XEN
    200 #include <xen/xen.h>
    201 #include <xen/hypervisor.h>
    202 #include <xen/evtchn.h>
    203 #include <xen/include/public/version.h>
    204 #include <xen/include/public/vcpu.h>
    205 #endif /* XEN */
    206 
    207 #include <ddb/db_active.h>
    208 
    209 #ifdef DDB
    210 #include <machine/db_machdep.h>
    211 #include <ddb/db_extern.h>
    212 #include <ddb/db_output.h>
    213 #include <ddb/db_interface.h>
    214 #endif
    215 
    216 #include "acpica.h"
    217 
    218 #if NACPICA > 0
    219 #include <dev/acpi/acpivar.h>
    220 #define ACPI_MACHDEP_PRIVATE
    221 #include <machine/acpi_machdep.h>
    222 #else
    223 #include <machine/i82489var.h>
    224 #endif
    225 
    226 #include "isa.h"
    227 #include "isadma.h"
    228 #include "ksyms.h"
    229 
    230 /* the following is used externally (sysctl_hw) */
    231 char machine[] = "amd64";		/* CPU "architecture" */
char machine_arch[] = "x86_64";		/* MACHINE_ARCH; differs from machine on amd64 */
    233 
    234 #ifdef CPURESET_DELAY
    235 int cpureset_delay = CPURESET_DELAY;
    236 #else
    237 int cpureset_delay = 2000; /* default to 2s */
    238 #endif
    239 
    240 int cpu_class = CPUCLASS_686;
    241 
    242 #ifdef MTRR
    243 const struct mtrr_funcs *mtrr_funcs;
    244 #endif
    245 
    247 int use_pae;
    248 
    249 #ifndef NO_SPARSE_DUMP
    250 int sparse_dump = 1;
    251 
    252 paddr_t max_paddr = 0;
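/* One bit per physical page; a set bit means the page is included in the dump. */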
    253 unsigned char *sparse_dump_physmap;
    254 #endif
    255 
    256 char *dump_headerbuf, *dump_headerbuf_ptr;
    257 #define dump_headerbuf_size PAGE_SIZE
    258 #define dump_headerbuf_end (dump_headerbuf + dump_headerbuf_size)
    259 #define dump_headerbuf_avail (dump_headerbuf_end - dump_headerbuf_ptr)
    260 daddr_t dump_header_blkno;
    261 
    262 size_t dump_nmemsegs;
    263 size_t dump_npages;
    264 size_t dump_header_size;
    265 size_t dump_totalbytesleft;
    266 
    267 vaddr_t idt_vaddr;
    268 paddr_t idt_paddr;
    269 vaddr_t gdt_vaddr;
    270 paddr_t gdt_paddr;
    271 vaddr_t ldt_vaddr;
    272 paddr_t ldt_paddr;
    273 
    274 static struct vm_map module_map_store;
    275 extern struct bootspace bootspace;
    276 extern struct slotspace slotspace;
    277 
    278 vaddr_t vm_min_kernel_address __read_mostly = VM_MIN_KERNEL_ADDRESS_DEFAULT;
    279 vaddr_t vm_max_kernel_address __read_mostly = VM_MAX_KERNEL_ADDRESS_DEFAULT;
    280 pd_entry_t *pte_base __read_mostly;
    281 
    282 struct vm_map *phys_map = NULL;
    283 
    284 extern paddr_t lowmem_rsvd;
    285 extern paddr_t avail_start, avail_end;
    286 #ifdef XENPV
    287 extern paddr_t pmap_pa_start, pmap_pa_end;
    288 #endif
    289 
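/*
 * Kept at the top of the NMI IST stack: cpu_init_tss() records the kernel's
 * %cr3 here so the NMI entry code can find the kernel page tables no matter
 * which page tables were live when the NMI arrived.
 */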
    290 struct nmistore {
    291 	uint64_t cr3;
    292 	uint64_t scratch;
    293 } __packed;
    294 
    295 /*
    296  * Size of memory segments, before any memory is stolen.
    297  */
    298 phys_ram_seg_t mem_clusters[VM_PHYSSEG_MAX];
    299 int mem_cluster_cnt;
    300 
    301 int cpu_dump(void);
    302 int cpu_dumpsize(void);
    303 u_long cpu_dump_mempagecnt(void);
    304 void dodumpsys(void);
    305 void dumpsys(void);
    306 
    307 static void x86_64_proc0_pcb_ldt_init(void);
    308 
    309 void dump_misc_init(void);
    310 void dump_seg_prep(void);
    311 int dump_seg_iter(int (*)(paddr_t, paddr_t));
    312 
    313 #ifndef NO_SPARSE_DUMP
    314 void sparse_dump_reset(void);
    315 void sparse_dump_mark(void);
    316 void cpu_dump_prep_sparse(void);
    317 #endif
    318 
    319 void dump_header_start(void);
    320 int dump_header_flush(void);
    321 int dump_header_addbytes(const void*, size_t);
    322 int dump_header_addseg(paddr_t, paddr_t);
    323 int dump_header_finish(void);
    324 
    325 int dump_seg_count_range(paddr_t, paddr_t);
    326 int dumpsys_seg(paddr_t, paddr_t);
    327 
    328 void init_bootspace(void);
    329 void init_slotspace(void);
    330 void init_x86_64(paddr_t);
    331 
    332 /*
    333  * Machine-dependent startup code
    334  */
    335 void
    336 cpu_startup(void)
    337 {
    338 	int x, y;
    339 	vaddr_t minaddr, maxaddr;
    340 	psize_t sz;
    341 
    342 	/*
    343 	 * For console drivers that require uvm and pmap to be initialized,
    344 	 * we'll give them one more chance here...
    345 	 */
    346 	consinit();
    347 
    348 	/*
    349 	 * Initialize error message buffer (at end of core).
    350 	 */
    351 	if (msgbuf_p_cnt == 0)
    352 		panic("msgbuf paddr map has not been set up");
    353 	for (x = 0, sz = 0; x < msgbuf_p_cnt; sz += msgbuf_p_seg[x++].sz)
    354 		continue;
    355 
    356 	msgbuf_vaddr = uvm_km_alloc(kernel_map, sz, 0, UVM_KMF_VAONLY);
    357 	if (msgbuf_vaddr == 0)
    358 		panic("failed to valloc msgbuf_vaddr");
    359 
    360 	for (y = 0, sz = 0; y < msgbuf_p_cnt; y++) {
    361 		for (x = 0; x < btoc(msgbuf_p_seg[y].sz); x++, sz += PAGE_SIZE)
    362 			pmap_kenter_pa((vaddr_t)msgbuf_vaddr + sz,
    363 			    msgbuf_p_seg[y].paddr + x * PAGE_SIZE,
    364 			    VM_PROT_READ|VM_PROT_WRITE, 0);
    365 	}
    366 
    367 	pmap_update(pmap_kernel());
    368 
    369 	initmsgbuf((void *)msgbuf_vaddr, round_page(sz));
    370 
    371 	minaddr = 0;
    372 
    373 	/*
    374 	 * Allocate a submap for physio.
    375 	 */
    376 	phys_map = uvm_km_suballoc(kernel_map, &minaddr, &maxaddr,
    377 	    VM_PHYS_SIZE, 0, false, NULL);
    378 
    379 	/*
    380 	 * Create the module map.
    381 	 *
    382 	 * The kernel uses RIP-relative addressing with a maximum offset of
    383 	 * 2GB. Because of that, we can't put the kernel modules in kernel_map
    384 	 * (like i386 does), since kernel_map is too far away in memory from
    385 	 * the kernel sections. So we have to create a special module_map.
    386 	 *
    387 	 * The module map is taken as what is left of the bootstrap memory
    388 	 * created in locore/prekern.
    389 	 */
    390 	uvm_map_setup(&module_map_store, bootspace.smodule,
    391 	    bootspace.emodule, 0);
    392 	module_map_store.pmap = pmap_kernel();
    393 	module_map = &module_map_store;
    394 
    395 	/* Say hello. */
    396 	banner();
    397 
    398 #if NISA > 0 || NPCI > 0
    399 	/* Safe for i/o port / memory space allocation to use malloc now. */
    400 	x86_bus_space_mallocok();
    401 #endif
    402 
    403 #ifdef __HAVE_PCPU_AREA
    404 	cpu_pcpuarea_init(&cpu_info_primary);
    405 #endif
    406 	gdt_init();
    407 	x86_64_proc0_pcb_ldt_init();
    408 
    409 	cpu_init_tss(&cpu_info_primary);
    410 #if !defined(XENPV)
    411 	ltr(cpu_info_primary.ci_tss_sel);
    412 #endif
    413 
    414 	x86_startup();
    415 }
    416 
    417 #ifdef XENPV
    418 /* used in assembly */
    419 void hypervisor_callback(void);
    420 void failsafe_callback(void);
    421 void x86_64_switch_context(struct pcb *);
    422 void x86_64_tls_switch(struct lwp *);
    423 
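/*
 * Context-switch helpers for Xen PV: the new lwp's kernel stack pointer and
 * I/O privilege level are handed to the hypervisor, since a PV guest does
 * not load a hardware TSS itself.
 */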
    424 void
    425 x86_64_switch_context(struct pcb *new)
    426 {
    427 	HYPERVISOR_stack_switch(GSEL(GDATA_SEL, SEL_KPL), new->pcb_rsp0);
    428 	struct physdev_set_iopl set_iopl;
    429 	set_iopl.iopl = new->pcb_iopl;
    430 	HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl);
    431 }
    432 
    433 void
    434 x86_64_tls_switch(struct lwp *l)
    435 {
    436 	struct cpu_info *ci = curcpu();
    437 	struct pcb *pcb = lwp_getpcb(l);
    438 	struct trapframe *tf = l->l_md.md_regs;
    439 	uint64_t zero = 0;
    440 
    441 	/*
    442 	 * Raise the IPL to IPL_HIGH. XXX Still needed?
    443 	 */
    444 	(void)splhigh();
    445 
    446 	/* Update segment registers */
    447 	if (pcb->pcb_flags & PCB_COMPAT32) {
    448 		update_descriptor(&ci->ci_gdt[GUFS_SEL], &pcb->pcb_fs);
    449 		update_descriptor(&ci->ci_gdt[GUGS_SEL], &pcb->pcb_gs);
    450 		setds(GSEL(GUDATA32_SEL, SEL_UPL));
    451 		setes(GSEL(GUDATA32_SEL, SEL_UPL));
    452 		setfs(GSEL(GUDATA32_SEL, SEL_UPL));
    453 		HYPERVISOR_set_segment_base(SEGBASE_GS_USER_SEL, tf->tf_gs);
    454 	} else {
    455 		update_descriptor(&ci->ci_gdt[GUFS_SEL], &zero);
    456 		update_descriptor(&ci->ci_gdt[GUGS_SEL], &zero);
    457 		setds(GSEL(GUDATA_SEL, SEL_UPL));
    458 		setes(GSEL(GUDATA_SEL, SEL_UPL));
    459 		setfs(0);
    460 		HYPERVISOR_set_segment_base(SEGBASE_GS_USER_SEL, 0);
    461 		HYPERVISOR_set_segment_base(SEGBASE_FS, pcb->pcb_fs);
    462 		HYPERVISOR_set_segment_base(SEGBASE_GS_USER, pcb->pcb_gs);
    463 	}
    464 }
    465 #endif /* XENPV */
    466 
    467 /*
    468  * Set up proc0's PCB and LDT.
    469  */
    470 static void
    471 x86_64_proc0_pcb_ldt_init(void)
    472 {
    473 	struct lwp *l = &lwp0;
    474 	struct pcb *pcb = lwp_getpcb(l);
    475 
    476 	pcb->pcb_flags = 0;
    477 	pcb->pcb_fs = 0;
    478 	pcb->pcb_gs = 0;
    479 	pcb->pcb_rsp0 = (uvm_lwp_getuarea(l) + USPACE - 16) & ~0xf;
    480 	pcb->pcb_iopl = IOPL_KPL;
    481 	pcb->pcb_dbregs = NULL;
    482 	pcb->pcb_cr0 = rcr0() & ~CR0_TS;
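	/* Place proc0's initial trapframe just below the top of its kernel stack. */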
    483 	l->l_md.md_regs = (struct trapframe *)pcb->pcb_rsp0 - 1;
    484 
    485 #if !defined(XENPV)
    486 	lldt(GSYSSEL(GLDT_SEL, SEL_KPL));
    487 #else
    488 	xen_set_ldt((vaddr_t)ldtstore, LDT_SIZE >> 3);
    489 	/* Reset TS bit and set kernel stack for interrupt handlers */
    490 	HYPERVISOR_fpu_taskswitch(1);
    491 	HYPERVISOR_stack_switch(GSEL(GDATA_SEL, SEL_KPL), pcb->pcb_rsp0);
    492 	struct physdev_set_iopl set_iopl;
    493 	set_iopl.iopl = pcb->pcb_iopl;
    494 	HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl);
    495 #endif
    496 }
    497 
    498 /*
    499  * Set up TSS and I/O bitmap.
    500  */
    501 void
    502 cpu_init_tss(struct cpu_info *ci)
    503 {
    504 #ifdef __HAVE_PCPU_AREA
    505 	const cpuid_t cid = cpu_index(ci);
    506 #endif
    507 	struct cpu_tss *cputss;
    508 	struct nmistore *store;
    509 	uintptr_t p;
    510 
    511 #ifdef __HAVE_PCPU_AREA
    512 	cputss = (struct cpu_tss *)&pcpuarea->ent[cid].tss;
    513 #else
    514 	cputss = (struct cpu_tss *)uvm_km_alloc(kernel_map,
    515 	    sizeof(struct cpu_tss), 0, UVM_KMF_WIRED|UVM_KMF_ZERO);
    516 #endif
    517 
    518 	cputss->tss.tss_iobase = IOMAP_INVALOFF << 16;
    519 
    520 	/* DDB stack */
    521 #ifdef __HAVE_PCPU_AREA
    522 	p = (vaddr_t)&pcpuarea->ent[cid].ist0;
    523 #else
    524 	p = uvm_km_alloc(kernel_map, PAGE_SIZE, 0, UVM_KMF_WIRED|UVM_KMF_ZERO);
    525 #endif
    526 	cputss->tss.tss_ist[0] = p + PAGE_SIZE - 16;
    527 
    528 	/* double fault */
    529 #ifdef __HAVE_PCPU_AREA
    530 	p = (vaddr_t)&pcpuarea->ent[cid].ist1;
    531 #else
    532 	p = uvm_km_alloc(kernel_map, PAGE_SIZE, 0, UVM_KMF_WIRED|UVM_KMF_ZERO);
    533 #endif
    534 	cputss->tss.tss_ist[1] = p + PAGE_SIZE - 16;
    535 
    536 	/* NMI - store a structure at the top of the stack */
    537 #ifdef __HAVE_PCPU_AREA
    538 	p = (vaddr_t)&pcpuarea->ent[cid].ist2;
    539 #else
    540 	p = uvm_km_alloc(kernel_map, PAGE_SIZE, 0, UVM_KMF_WIRED|UVM_KMF_ZERO);
    541 #endif
    542 	cputss->tss.tss_ist[2] = p + PAGE_SIZE - sizeof(struct nmistore);
    543 	store = (struct nmistore *)(p + PAGE_SIZE - sizeof(struct nmistore));
    544 	store->cr3 = pmap_pdirpa(pmap_kernel(), 0);
    545 
    546 	/* DB */
    547 #ifdef __HAVE_PCPU_AREA
    548 	p = (vaddr_t)&pcpuarea->ent[cid].ist3;
    549 #else
    550 	p = uvm_km_alloc(kernel_map, PAGE_SIZE, 0, UVM_KMF_WIRED|UVM_KMF_ZERO);
    551 #endif
    552 	cputss->tss.tss_ist[3] = p + PAGE_SIZE - 16;
    553 
    554 	ci->ci_tss = cputss;
    555 	ci->ci_tss_sel = tss_alloc(&cputss->tss);
    556 }
    557 
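/*
 * Rewrite the user trapframe so that, on return to userland, the lwp enters
 * the signal catcher on the given stack, with fresh user segment registers
 * and sane FPU state.
 */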
    558 void
    559 buildcontext(struct lwp *l, void *catcher, void *f)
    560 {
    561 	struct trapframe *tf = l->l_md.md_regs;
    562 
    563 	tf->tf_ds = GSEL(GUDATA_SEL, SEL_UPL);
    564 	tf->tf_es = GSEL(GUDATA_SEL, SEL_UPL);
    565 	tf->tf_fs = GSEL(GUDATA_SEL, SEL_UPL);
    566 	tf->tf_gs = GSEL(GUDATA_SEL, SEL_UPL);
    567 
    568 	tf->tf_rip = (uint64_t)catcher;
    569 	tf->tf_cs = GSEL(GUCODE_SEL, SEL_UPL);
    570 	tf->tf_rflags &= ~PSL_CLEARSIG;
    571 	tf->tf_rsp = (uint64_t)f;
    572 	tf->tf_ss = GSEL(GUDATA_SEL, SEL_UPL);
    573 
    574 	/* Ensure FP state is sane */
    575 	fpu_sigreset(l);
    576 }
    577 
    578 void
    579 sendsig_sigcontext(const ksiginfo_t *ksi, const sigset_t *mask)
    580 {
    581 
    582 	printf("sendsig_sigcontext: illegal\n");
    583 	sigexit(curlwp, SIGILL);
    584 }
    585 
    586 void
    587 sendsig_siginfo(const ksiginfo_t *ksi, const sigset_t *mask)
    588 {
    589 	struct lwp *l = curlwp;
    590 	struct proc *p = l->l_proc;
    591 	struct sigacts *ps = p->p_sigacts;
    592 	int onstack, error;
    593 	int sig = ksi->ksi_signo;
    594 	struct sigframe_siginfo *fp, frame;
    595 	sig_t catcher = SIGACTION(p, sig).sa_handler;
    596 	struct trapframe *tf = l->l_md.md_regs;
    597 	char *sp;
    598 
    599 	KASSERT(mutex_owned(p->p_lock));
    600 
    601 	/* Do we need to jump onto the signal stack? */
    602 	onstack =
    603 	    (l->l_sigstk.ss_flags & (SS_DISABLE | SS_ONSTACK)) == 0 &&
    604 	    (SIGACTION(p, sig).sa_flags & SA_ONSTACK) != 0;
    605 
    606 	/* Allocate space for the signal handler context. */
    607 	if (onstack)
    608 		sp = ((char *)l->l_sigstk.ss_sp + l->l_sigstk.ss_size);
    609 	else
		/* AMD64 ABI 128-byte "red zone". */
    611 		sp = (char *)tf->tf_rsp - 128;
    612 
    613 	sp -= sizeof(struct sigframe_siginfo);
	/*
	 * Round the stack pointer down to a multiple of 16 for the ABI; the
	 * extra 8-byte bias leaves room for the return address slot (sf_ra),
	 * so the handler starts with the alignment it would see after a call.
	 */
    615 	fp = (struct sigframe_siginfo *)(((unsigned long)sp &
    616 		~STACK_ALIGNBYTES) - 8);
    617 
    618 	memset(&frame, 0, sizeof(frame));
    619 	frame.sf_ra = (uint64_t)ps->sa_sigdesc[sig].sd_tramp;
    620 	frame.sf_si._info = ksi->ksi_info;
    621 	frame.sf_uc.uc_flags = _UC_SIGMASK;
    622 	frame.sf_uc.uc_sigmask = *mask;
    623 	frame.sf_uc.uc_link = l->l_ctxlink;
    624 	frame.sf_uc.uc_flags |= (l->l_sigstk.ss_flags & SS_ONSTACK)
    625 	    ? _UC_SETSTACK : _UC_CLRSTACK;
    626 	sendsig_reset(l, sig);
    627 
    628 	mutex_exit(p->p_lock);
    629 	cpu_getmcontext(l, &frame.sf_uc.uc_mcontext, &frame.sf_uc.uc_flags);
    630 	/* Copyout all the fp regs, the signal handler might expect them. */
    631 	error = copyout(&frame, fp, sizeof frame);
    632 	mutex_enter(p->p_lock);
    633 
    634 	if (error != 0) {
    635 		/*
    636 		 * Process has trashed its stack; give it an illegal
    637 		 * instruction to halt it in its tracks.
    638 		 */
    639 		sigexit(l, SIGILL);
    640 		/* NOTREACHED */
    641 	}
    642 
    643 	buildcontext(l, catcher, fp);
    644 
    645 	tf->tf_rdi = sig;
    646 	tf->tf_rsi = (uint64_t)&fp->sf_si;
    647 	tf->tf_rdx = tf->tf_r15 = (uint64_t)&fp->sf_uc;
    648 
    649 	/* Remember that we're now on the signal stack. */
    650 	if (onstack)
    651 		l->l_sigstk.ss_flags |= SS_ONSTACK;
    652 
    653 	if ((vaddr_t)catcher >= VM_MAXUSER_ADDRESS) {
    654 		/*
		 * The process has given an invalid address for the handler.
		 * Kill it, but only after the frame has been set up, so that
		 * the correct information reaches userland (or the core dump).
    658 		 */
    659 		sigexit(l, SIGILL);
    660 		/* NOTREACHED */
    661 	}
    662 }
    663 
    664 struct pcb dumppcb;
    665 
    666 void
    667 cpu_reboot(int howto, char *bootstr)
    668 {
    669 	static bool syncdone = false;
    670 	int s = IPL_NONE;
    671 	__USE(s);	/* ugly otherwise */
    672 
    673 	if (cold) {
    674 		howto |= RB_HALT;
    675 		goto haltsys;
    676 	}
    677 
    678 	boothowto = howto;
    679 
    680 	/* i386 maybe_dump() */
    681 
    682 	/*
    683 	 * If we've panic'd, don't make the situation potentially
    684 	 * worse by syncing or unmounting the file systems.
    685 	 */
    686 	if ((howto & RB_NOSYNC) == 0 && panicstr == NULL) {
    687 		if (!syncdone) {
    688 			syncdone = true;
    689 			/* XXX used to force unmount as well, here */
    690 			vfs_sync_all(curlwp);
    691 		}
    692 
    693 		while (vfs_unmountall1(curlwp, false, false) ||
    694 		       config_detach_all(boothowto) ||
    695 		       vfs_unmount_forceone(curlwp))
    696 			;	/* do nothing */
    697 	} else {
    698 		if (!db_active)
    699 			suspendsched();
    700 	}
    701 
    702 	pmf_system_shutdown(boothowto);
    703 
    704 	/* Disable interrupts. */
    705 	s = splhigh();
    706 
    707 	/* Do a dump if requested. */
    708 	if ((howto & (RB_DUMP | RB_HALT)) == RB_DUMP)
    709 		dumpsys();
    710 
    711 haltsys:
    712 	doshutdownhooks();
    713 
    714         if ((howto & RB_POWERDOWN) == RB_POWERDOWN) {
    715 #if NACPICA > 0
    716 		if (s != IPL_NONE)
    717 			splx(s);
    718 
    719 		acpi_enter_sleep_state(ACPI_STATE_S5);
    720 #endif
    721 #ifdef XEN
    722 		if (vm_guest == VM_GUEST_XENPV ||
    723 		    vm_guest == VM_GUEST_XENPVH ||
    724 		    vm_guest == VM_GUEST_XENPVHVM)
    725 			HYPERVISOR_shutdown();
    726 #endif /* XEN */
    727 	}
    728 
    729 	cpu_broadcast_halt();
    730 
    731 	if (howto & RB_HALT) {
    732 #if NACPICA > 0
    733 		acpi_disable();
    734 #endif
    735 
    736 		printf("\n");
    737 		printf("The operating system has halted.\n");
    738 		printf("Please press any key to reboot.\n\n");
    739 		cnpollc(1);	/* for proper keyboard command handling */
    740 		if (cngetc() == 0) {
    741 			/* no console attached, so just hlt */
    742 			printf("No keyboard - cannot reboot after all.\n");
    743 			for(;;) {
    744 				x86_hlt();
    745 			}
    746 		}
    747 		cnpollc(0);
    748 	}
    749 
    750 	printf("rebooting...\n");
    751 	if (cpureset_delay > 0)
    752 		delay(cpureset_delay * 1000);
    753 	cpu_reset();
    754 	for(;;) ;
    755 	/*NOTREACHED*/
    756 }
    757 
    758 /*
    759  * XXXfvdl share dumpcode.
    760  */
    761 
    762 /*
    763  * Perform assorted dump-related initialization tasks.  Assumes that
    764  * the maximum physical memory address will not increase afterwards.
    765  */
    766 void
    767 dump_misc_init(void)
    768 {
    769 #ifndef NO_SPARSE_DUMP
    770 	int i;
    771 #endif
    772 
    773 	if (dump_headerbuf != NULL)
    774 		return; /* already called */
    775 
    776 #ifndef NO_SPARSE_DUMP
    777 	for (i = 0; i < mem_cluster_cnt; ++i) {
    778 		paddr_t top = mem_clusters[i].start + mem_clusters[i].size;
    779 		if (max_paddr < top)
    780 			max_paddr = top;
    781 	}
    782 #ifdef DEBUG
    783 	printf("dump_misc_init: max_paddr = 0x%lx\n",
    784 	    (unsigned long)max_paddr);
    785 #endif
    786 	if (max_paddr == 0) {
    787 		printf("Your machine does not initialize mem_clusters; "
    788 		    "sparse_dumps disabled\n");
    789 		sparse_dump = 0;
    790 	} else {
    791 		sparse_dump_physmap = (void *)uvm_km_alloc(kernel_map,
    792 		    roundup(max_paddr / (PAGE_SIZE * NBBY), PAGE_SIZE),
    793 		    PAGE_SIZE, UVM_KMF_WIRED|UVM_KMF_ZERO);
    794 	}
    795 #endif
    796 	dump_headerbuf = (void *)uvm_km_alloc(kernel_map,
    797 	    dump_headerbuf_size,
    798 	    PAGE_SIZE, UVM_KMF_WIRED|UVM_KMF_ZERO);
    799 	/* XXXjld should check for failure here, disable dumps if so. */
    800 }
    801 
    802 #ifndef NO_SPARSE_DUMP
    803 /*
    804  * Clear the set of pages to include in a sparse dump.
    805  */
    806 void
    807 sparse_dump_reset(void)
    808 {
    809 	memset(sparse_dump_physmap, 0,
    810 	    roundup(max_paddr / (PAGE_SIZE * NBBY), PAGE_SIZE));
    811 }
    812 
    813 /*
    814  * Include or exclude pages in a sparse dump.
    815  */
    816 void
    817 sparse_dump_mark(void)
    818 {
    819 	paddr_t p, pstart, pend;
    820 	struct vm_page *pg;
    821 	int i;
    822 	uvm_physseg_t upm;
    823 
    824 	/*
    825 	 * Mark all memory pages, then unmark pages that are uninteresting.
	 * Dereferencing pg->uobject might crash again if another CPU
    827 	 * frees the object out from under us, but we can't lock anything
    828 	 * so it's a risk we have to take.
    829 	 */
    830 
    831 	for (i = 0; i < mem_cluster_cnt; ++i) {
    832 		pstart = mem_clusters[i].start / PAGE_SIZE;
    833 		pend = pstart + mem_clusters[i].size / PAGE_SIZE;
    834 
    835 		for (p = pstart; p < pend; p++) {
    836 			setbit(sparse_dump_physmap, p);
    837 		}
    838 	}
    839         for (upm = uvm_physseg_get_first();
    840 	     uvm_physseg_valid_p(upm);
    841 	     upm = uvm_physseg_get_next(upm)) {
    842 		paddr_t pfn;
    843 
    844 		/*
    845 		 * We assume that seg->start to seg->end are
    846 		 * uvm_page_physload()ed
    847 		 */
    848 		for (pfn = uvm_physseg_get_start(upm);
    849 		     pfn < uvm_physseg_get_end(upm);
    850 		     pfn++) {
    851 			pg = PHYS_TO_VM_PAGE(ptoa(pfn));
    852 
    853 			if (pg->uanon || (pg->flags & PG_FREE) ||
    854 			    (pg->uobject && pg->uobject->pgops)) {
    855 				p = VM_PAGE_TO_PHYS(pg) / PAGE_SIZE;
    856 				clrbit(sparse_dump_physmap, p);
    857 			}
    858 		}
    859 	}
    860 }
    861 
    862 /*
    863  * Machine-dependently decides on the contents of a sparse dump, using
    864  * the above.
    865  */
    866 void
    867 cpu_dump_prep_sparse(void)
    868 {
    869 	sparse_dump_reset();
    870 	/* XXX could the alternate recursive page table be skipped? */
    871 	sparse_dump_mark();
    872 	/* Memory for I/O buffers could be unmarked here, for example. */
    873 	/* The kernel text could also be unmarked, but gdb would be upset. */
    874 }
    875 #endif
    876 
    877 /*
    878  * Abstractly iterate over the collection of memory segments to be
    879  * dumped; the callback lacks the customary environment-pointer
    880  * argument because none of the current users really need one.
    881  *
    882  * To be used only after dump_seg_prep is called to set things up.
    883  */
    884 int
    885 dump_seg_iter(int (*callback)(paddr_t, paddr_t))
    886 {
    887 	int error, i;
    888 
    889 #define CALLBACK(start,size) do {     \
    890 	error = callback(start,size); \
    891 	if (error)                    \
    892 		return error;         \
    893 } while(0)
    894 
    895 	for (i = 0; i < mem_cluster_cnt; ++i) {
    896 #ifndef NO_SPARSE_DUMP
    897 		/*
    898 		 * The bitmap is scanned within each memory segment,
    899 		 * rather than over its entire domain, in case any
    900 		 * pages outside of the memory proper have been mapped
    901 		 * into kva; they might be devices that wouldn't
    902 		 * appreciate being arbitrarily read, and including
    903 		 * them could also break the assumption that a sparse
    904 		 * dump will always be smaller than a full one.
    905 		 */
    906 		if (sparse_dump && sparse_dump_physmap) {
    907 			paddr_t p, sp_start, sp_end;
    908 			int lastset;
    909 
    910 			sp_start = mem_clusters[i].start;
    911 			sp_end = sp_start + mem_clusters[i].size;
    912 			sp_start = rounddown(sp_start, PAGE_SIZE); /* unnecessary? */
    913 			lastset = 0;
    914 			for (p = sp_start; p < sp_end; p += PAGE_SIZE) {
    915 				int thisset = isset(sparse_dump_physmap,
    916 				    p/PAGE_SIZE);
    917 
    918 				if (!lastset && thisset)
    919 					sp_start = p;
    920 				if (lastset && !thisset)
    921 					CALLBACK(sp_start, p - sp_start);
    922 				lastset = thisset;
    923 			}
    924 			if (lastset)
    925 				CALLBACK(sp_start, p - sp_start);
    926 		} else
    927 #endif
    928 			CALLBACK(mem_clusters[i].start, mem_clusters[i].size);
    929 	}
    930 	return 0;
    931 #undef CALLBACK
    932 }
    933 
    934 /*
    935  * Prepare for an impending core dump: decide what's being dumped and
    936  * how much space it will take up.
    937  */
    938 void
    939 dump_seg_prep(void)
    940 {
    941 #ifndef NO_SPARSE_DUMP
    942 	if (sparse_dump && sparse_dump_physmap)
    943 		cpu_dump_prep_sparse();
    944 #endif
    945 
    946 	dump_nmemsegs = 0;
    947 	dump_npages = 0;
    948 	dump_seg_iter(dump_seg_count_range);
    949 
    950 	dump_header_size = ALIGN(sizeof(kcore_seg_t)) +
    951 	    ALIGN(sizeof(cpu_kcore_hdr_t)) +
    952 	    ALIGN(dump_nmemsegs * sizeof(phys_ram_seg_t));
    953 	dump_header_size = roundup(dump_header_size, dbtob(1));
    954 
    955 	/*
    956 	 * savecore(8) will read this to decide how many pages to
    957 	 * copy, and cpu_dumpconf has already used the pessimistic
    958 	 * value to set dumplo, so it's time to tell the truth.
    959 	 */
    960 	dumpsize = dump_npages; /* XXX could these just be one variable? */
    961 }
    962 
    963 int
    964 dump_seg_count_range(paddr_t start, paddr_t size)
    965 {
    966 	++dump_nmemsegs;
    967 	dump_npages += size / PAGE_SIZE;
    968 	return 0;
    969 }
    970 
    971 /*
    972  * A sparse dump's header may be rather large, due to the number of
    973  * "segments" emitted.  These routines manage a simple output buffer,
    974  * so that the header can be written to disk incrementally.
    975  */
    976 void
    977 dump_header_start(void)
    978 {
    979 	dump_headerbuf_ptr = dump_headerbuf;
    980 	dump_header_blkno = dumplo;
    981 }
    982 
    983 int
    984 dump_header_flush(void)
    985 {
    986 	const struct bdevsw *bdev;
    987 	size_t to_write;
    988 	int error;
    989 
    990 	bdev = bdevsw_lookup(dumpdev);
    991 	to_write = roundup(dump_headerbuf_ptr - dump_headerbuf, dbtob(1));
    992 	error = bdev->d_dump(dumpdev, dump_header_blkno,
    993 	    dump_headerbuf, to_write);
    994 	dump_header_blkno += btodb(to_write);
    995 	dump_headerbuf_ptr = dump_headerbuf;
    996 	return error;
    997 }
    998 
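/*
 * Append bytes to the dump header, flushing the staging buffer to the dump
 * device whenever it fills.
 */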
    999 int
   1000 dump_header_addbytes(const void* vptr, size_t n)
   1001 {
   1002 	const char* ptr = vptr;
   1003 	int error;
   1004 
   1005 	while (n > dump_headerbuf_avail) {
   1006 		memcpy(dump_headerbuf_ptr, ptr, dump_headerbuf_avail);
   1007 		ptr += dump_headerbuf_avail;
   1008 		n -= dump_headerbuf_avail;
   1009 		dump_headerbuf_ptr = dump_headerbuf_end;
   1010 		error = dump_header_flush();
   1011 		if (error)
   1012 			return error;
   1013 	}
   1014 	memcpy(dump_headerbuf_ptr, ptr, n);
   1015 	dump_headerbuf_ptr += n;
   1016 
   1017 	return 0;
   1018 }
   1019 
   1020 int
   1021 dump_header_addseg(paddr_t start, paddr_t size)
   1022 {
   1023 	phys_ram_seg_t seg = { start, size };
   1024 	int error;
   1025 
   1026 	error = dump_header_addbytes(&seg, sizeof(seg));
   1027 	if (error) {
   1028 		printf("[seg 0x%"PRIxPADDR" bytes 0x%"PRIxPSIZE" failed,"
   1029 		    " error=%d] ", start, size, error);
   1030 	}
   1031 	return error;
   1032 }
   1033 
   1034 int
   1035 dump_header_finish(void)
   1036 {
   1037 	int error;
   1038 
   1039 	memset(dump_headerbuf_ptr, 0, dump_headerbuf_avail);
   1040 	error = dump_header_flush();
   1041 	if (error)
   1042 		printf("[finish failed, error=%d] ", error);
   1043 	return error;
   1044 }
   1045 
   1046 
   1047 /*
   1048  * These variables are needed by /sbin/savecore
   1049  */
   1050 uint32_t	dumpmag = 0x8fca0101;	/* magic number */
   1051 int 	dumpsize = 0;		/* pages */
   1052 long	dumplo = 0; 		/* blocks */
   1053 
   1054 /*
   1055  * cpu_dumpsize: calculate size of machine-dependent kernel core dump headers
   1056  * for a full (non-sparse) dump.
   1057  */
   1058 int
   1059 cpu_dumpsize(void)
   1060 {
   1061 	int size;
   1062 
   1063 	size = ALIGN(sizeof(kcore_seg_t)) + ALIGN(sizeof(cpu_kcore_hdr_t)) +
   1064 	    ALIGN(mem_cluster_cnt * sizeof(phys_ram_seg_t));
   1065 	if (roundup(size, dbtob(1)) != dbtob(1))
   1066 		return (-1);
   1067 
   1068 	return (1);
   1069 }
   1070 
   1071 /*
   1072  * cpu_dump_mempagecnt: calculate the size of RAM (in pages) to be dumped
   1073  * for a full (non-sparse) dump.
   1074  */
   1075 u_long
   1076 cpu_dump_mempagecnt(void)
   1077 {
   1078 	u_long i, n;
   1079 
   1080 	n = 0;
   1081 	for (i = 0; i < mem_cluster_cnt; i++)
   1082 		n += atop(mem_clusters[i].size);
   1083 	return (n);
   1084 }
   1085 
   1086 /*
   1087  * cpu_dump: dump the machine-dependent kernel core dump headers.
   1088  */
   1089 int
   1090 cpu_dump(void)
   1091 {
   1092 	kcore_seg_t seg;
   1093 	cpu_kcore_hdr_t cpuhdr;
   1094 	const struct bdevsw *bdev;
   1095 	int error;
   1096 
   1097 	bdev = bdevsw_lookup(dumpdev);
   1098 	if (bdev == NULL) {
   1099 		printf("[device 0x%llx ENXIO] ", (unsigned long long)dumpdev);
   1100 		return ENXIO;
   1101 	}
   1102 
   1103 	/*
   1104 	 * Generate a segment header.
   1105 	 */
   1106 	CORE_SETMAGIC(seg, KCORE_MAGIC, MID_MACHINE, CORE_CPU);
   1107 	seg.c_size = dump_header_size - ALIGN(sizeof(seg));
   1108 	error = dump_header_addbytes(&seg, ALIGN(sizeof(seg)));
   1109 	if (error) {
   1110 		printf("[segment header %zu bytes failed, error=%d] ",
   1111 		    ALIGN(sizeof(seg)), error);
   1112 		/* blithely proceed (can't fail?) */
   1113 	}
   1114 
   1115 	/*
   1116 	 * Add the machine-dependent header info.
   1117 	 */
   1118 	cpuhdr.ptdpaddr = PDPpaddr;
   1119 	cpuhdr.nmemsegs = dump_nmemsegs;
   1120 	error = dump_header_addbytes(&cpuhdr, ALIGN(sizeof(cpuhdr)));
   1121 	if (error) {
   1122 		printf("[MD header %zu bytes failed, error=%d] ",
   1123 		    ALIGN(sizeof(cpuhdr)), error);
   1124 		/* blithely proceed (can't fail?) */
   1125 	}
   1126 
   1127 	/*
   1128 	 * Write out the memory segment descriptors.
   1129 	 */
   1130 	return dump_seg_iter(dump_header_addseg);
   1131 }
   1132 
   1133 /*
   1134  * Doadump comes here after turning off memory management and
   1135  * getting on the dump stack, either when called above, or by
   1136  * the auto-restart code.
   1137  */
   1138 #define BYTES_PER_DUMP  PAGE_SIZE /* must be a multiple of pagesize XXX small */
   1139 static vaddr_t dumpspace;
   1140 
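/*
 * Set aside BYTES_PER_DUMP bytes of KVA for dumpsys_seg() to map each chunk
 * of physical memory while dumping; returns the first address past the
 * reserved range.
 */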
   1141 vaddr_t
   1142 reserve_dumppages(vaddr_t p)
   1143 {
   1144 
   1145 	dumpspace = p;
   1146 	return (p + BYTES_PER_DUMP);
   1147 }
   1148 
   1149 int
   1150 dumpsys_seg(paddr_t maddr, paddr_t bytes)
   1151 {
   1152 	u_long i, m, n;
   1153 	daddr_t blkno;
   1154 	const struct bdevsw *bdev;
   1155 	int (*dump)(dev_t, daddr_t, void *, size_t);
   1156 	int error;
   1157 
   1158 	if (dumpdev == NODEV)
   1159 		return ENODEV;
   1160 	bdev = bdevsw_lookup(dumpdev);
   1161 	if (bdev == NULL || bdev->d_psize == NULL)
   1162 		return ENODEV;
   1163 
   1164 	dump = bdev->d_dump;
   1165 
   1166 	blkno = dump_header_blkno;
   1167 	for (i = 0; i < bytes; i += n, dump_totalbytesleft -= n) {
   1168 		/* Print out how many MBs we have left to go. */
   1169 		if ((dump_totalbytesleft % (1024*1024)) == 0)
   1170 			printf_nolog("%lu ", (unsigned long)
   1171 			    (dump_totalbytesleft / (1024 * 1024)));
   1172 
   1173 		/* Limit size for next transfer. */
   1174 		n = bytes - i;
   1175 		if (n > BYTES_PER_DUMP)
   1176 			n = BYTES_PER_DUMP;
   1177 
   1178 		for (m = 0; m < n; m += NBPG)
   1179 			pmap_kenter_pa(dumpspace + m, maddr + m,
   1180 			    VM_PROT_READ, 0);
   1181 		pmap_update(pmap_kernel());
   1182 
   1183 		error = (*dump)(dumpdev, blkno, (void *)dumpspace, n);
   1184 		pmap_kremove_local(dumpspace, n);
   1185 		if (error)
   1186 			return error;
   1187 		maddr += n;
   1188 		blkno += btodb(n);		/* XXX? */
   1189 
   1190 #if 0	/* XXX this doesn't work.  grr. */
   1191 		/* operator aborting dump? */
   1192 		if (sget() != NULL)
   1193 			return EINTR;
   1194 #endif
   1195 	}
   1196 	dump_header_blkno = blkno;
   1197 
   1198 	return 0;
   1199 }
   1200 
   1201 void
   1202 dodumpsys(void)
   1203 {
   1204 	const struct bdevsw *bdev;
   1205 	int dumpend, psize;
   1206 	int error;
   1207 
   1208 	if (dumpdev == NODEV)
   1209 		return;
   1210 
   1211 	bdev = bdevsw_lookup(dumpdev);
   1212 	if (bdev == NULL || bdev->d_psize == NULL)
   1213 		return;
   1214 	/*
	 * For dumps during autoconfiguration: if the dump device has not
	 * been configured yet, do it now.
   1217 	 */
   1218 	if (dumpsize == 0)
   1219 		cpu_dumpconf();
   1220 
   1221 	printf("\ndumping to dev %llu,%llu (offset=%ld, size=%d):",
   1222 	    (unsigned long long)major(dumpdev),
   1223 	    (unsigned long long)minor(dumpdev), dumplo, dumpsize);
   1224 
   1225 	if (dumplo <= 0 || dumpsize <= 0) {
   1226 		printf(" not possible\n");
   1227 		return;
   1228 	}
   1229 
   1230 	psize = bdev_size(dumpdev);
   1231 	printf("\ndump ");
   1232 	if (psize == -1) {
   1233 		printf("area unavailable\n");
   1234 		return;
   1235 	}
   1236 
   1237 #if 0	/* XXX this doesn't work.  grr. */
   1238 	/* toss any characters present prior to dump */
   1239 	while (sget() != NULL); /*syscons and pccons differ */
   1240 #endif
   1241 
   1242 	dump_seg_prep();
   1243 	dumpend = dumplo + btodb(dump_header_size) + ctod(dump_npages);
   1244 	if (dumpend > psize) {
   1245 		printf("failed: insufficient space (%d < %d)\n",
   1246 		    psize, dumpend);
   1247 		goto failed;
   1248 	}
   1249 
   1250 	dump_header_start();
   1251 	if ((error = cpu_dump()) != 0)
   1252 		goto err;
   1253 	if ((error = dump_header_finish()) != 0)
   1254 		goto err;
   1255 
   1256 	if (dump_header_blkno != dumplo + btodb(dump_header_size)) {
   1257 		printf("BAD header size (%ld [written] != %ld [expected])\n",
   1258 		    (long)(dump_header_blkno - dumplo),
   1259 		    (long)btodb(dump_header_size));
   1260 		goto failed;
   1261 	}
   1262 
   1263 	dump_totalbytesleft = roundup(ptoa(dump_npages), BYTES_PER_DUMP);
   1264 	error = dump_seg_iter(dumpsys_seg);
   1265 
   1266 	if (error == 0 && dump_header_blkno != dumpend) {
   1267 		printf("BAD dump size (%ld [written] != %ld [expected])\n",
		    (long)(dump_header_blkno - dumplo),
		    (long)(dumpend - dumplo));
   1270 		goto failed;
   1271 	}
   1272 
   1273 err:
   1274 	switch (error) {
   1275 
   1276 	case ENXIO:
   1277 		printf("device bad\n");
   1278 		break;
   1279 
   1280 	case EFAULT:
   1281 		printf("device not ready\n");
   1282 		break;
   1283 
   1284 	case EINVAL:
   1285 		printf("area improper\n");
   1286 		break;
   1287 
   1288 	case EIO:
   1289 		printf("i/o error\n");
   1290 		break;
   1291 
   1292 	case EINTR:
   1293 		printf("aborted from console\n");
   1294 		break;
   1295 
   1296 	case 0:
   1297 		printf("succeeded\n");
   1298 		break;
   1299 
   1300 	default:
   1301 		printf("error %d\n", error);
   1302 		break;
   1303 	}
   1304 failed:
   1305 	printf("\n\n");
   1306 	delay(5000000);		/* 5 seconds */
   1307 }
   1308 
   1309 /*
   1310  * This is called by main to set dumplo and dumpsize.
   1311  * Dumps always skip the first PAGE_SIZE of disk space
   1312  * in case there might be a disk label stored there.
   1313  * If there is extra space, put dump at the end to
   1314  * reduce the chance that swapping trashes it.
   1315  *
 * Sparse dumps can't be placed as close to the end as possible, because
   1317  * savecore(8) has to know where to start reading in the dump device
   1318  * before it has access to any of the crashed system's state.
   1319  *
   1320  * Note also that a sparse dump will never be larger than a full one:
   1321  * in order to add a phys_ram_seg_t to the header, at least one page
   1322  * must be removed.
   1323  */
   1324 void
   1325 cpu_dumpconf(void)
   1326 {
   1327 	int nblks, dumpblks;	/* size of dump area */
   1328 
   1329 	if (dumpdev == NODEV)
   1330 		goto bad;
   1331 	nblks = bdev_size(dumpdev);
   1332 	if (nblks <= ctod(1))
   1333 		goto bad;
   1334 
   1335 	dumpblks = cpu_dumpsize();
   1336 	if (dumpblks < 0)
   1337 		goto bad;
   1338 
   1339 	/* dumpsize is in page units, and doesn't include headers. */
   1340 	dumpsize = cpu_dump_mempagecnt();
   1341 
   1342 	dumpblks += ctod(dumpsize);
   1343 
   1344 	/* If dump won't fit (incl. room for possible label), punt. */
   1345 	if (dumpblks > (nblks - ctod(1))) {
   1346 #ifndef NO_SPARSE_DUMP
   1347 		/* A sparse dump might (and hopefully will) fit. */
   1348 		dumplo = ctod(1);
   1349 #else
   1350 		/* But if we're not configured for that, punt. */
   1351 		goto bad;
   1352 #endif
   1353 	} else {
   1354 		/* Put dump at end of partition */
   1355 		dumplo = nblks - dumpblks;
   1356 	}
   1357 
   1358 
   1359 	/* Now that we've decided this will work, init ancillary stuff. */
   1360 	dump_misc_init();
   1361 	return;
   1362 
   1363  bad:
   1364 	dumpsize = 0;
   1365 }
   1366 
   1367 /*
   1368  * Clear registers on exec
   1369  */
   1370 void
   1371 setregs(struct lwp *l, struct exec_package *pack, vaddr_t stack)
   1372 {
   1373 	struct pcb *pcb = lwp_getpcb(l);
   1374 	struct trapframe *tf;
   1375 
   1376 #ifdef USER_LDT
   1377 	pmap_ldt_cleanup(l);
   1378 #endif
   1379 
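	/*
	 * Executables built for releases before 6.99.26 expect the old
	 * (compat) x87 control word; newer binaries get the current default.
	 */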
   1380 	fpu_clear(l, pack->ep_osversion >= 699002600
   1381 	    ? __NetBSD_NPXCW__ : __NetBSD_COMPAT_NPXCW__);
   1382 	x86_dbregs_clear(l);
   1383 
   1384 	kpreempt_disable();
   1385 	pcb->pcb_flags = 0;
   1386 	l->l_proc->p_flag &= ~PK_32;
   1387 	l->l_md.md_flags = MDL_IRET;
   1388 	cpu_segregs64_zero(l);
   1389 	kpreempt_enable();
   1390 
   1391 	tf = l->l_md.md_regs;
   1392 	memset(tf, 0, sizeof(*tf));
   1393 
   1394 	tf->tf_trapno = T_ASTFLT;
   1395 	tf->tf_ds = GSEL(GUDATA_SEL, SEL_UPL);
   1396 	tf->tf_es = GSEL(GUDATA_SEL, SEL_UPL);
   1397 	tf->tf_rdi = 0;
   1398 	tf->tf_rsi = 0;
   1399 	tf->tf_rbp = 0;
   1400 	tf->tf_rbx = l->l_proc->p_psstrp;
   1401 	tf->tf_rdx = 0;
   1402 	tf->tf_rcx = 0;
   1403 	tf->tf_rax = 0;
   1404 	tf->tf_rip = pack->ep_entry;
   1405 	tf->tf_cs = LSEL(LUCODE_SEL, SEL_UPL);
   1406 	tf->tf_rflags = PSL_USERSET;
   1407 	tf->tf_rsp = stack;
   1408 	tf->tf_ss = LSEL(LUDATA_SEL, SEL_UPL);
   1409 }
   1410 
   1411 /*
   1412  * Initialize segments and descriptor tables
   1413  */
   1414 char *ldtstore;
   1415 char *gdtstore;
   1416 
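/*
 * Fill in (or, below, clear) a gate descriptor.  The page holding the
 * descriptor is normally mapped read-only, so it is temporarily switched
 * to read/write around the update.
 */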
   1417 void
   1418 setgate(struct gate_descriptor *gd, void *func,
   1419     int ist, int type, int dpl, int sel)
   1420 {
   1421 	vaddr_t vaddr;
   1422 
   1423 	vaddr = ((vaddr_t)gd) & ~PAGE_MASK;
   1424 
   1425 	kpreempt_disable();
   1426 	pmap_changeprot_local(vaddr, VM_PROT_READ|VM_PROT_WRITE);
   1427 
   1428 	gd->gd_looffset = (uint64_t)func & 0xffff;
   1429 	gd->gd_selector = sel;
   1430 	gd->gd_ist = ist;
   1431 	gd->gd_type = type;
   1432 	gd->gd_dpl = dpl;
   1433 	gd->gd_p = 1;
   1434 	gd->gd_hioffset = (uint64_t)func >> 16;
   1435 	gd->gd_zero = 0;
   1436 	gd->gd_xx1 = 0;
   1437 	gd->gd_xx2 = 0;
   1438 	gd->gd_xx3 = 0;
   1439 
   1440 	pmap_changeprot_local(vaddr, VM_PROT_READ);
   1441 	kpreempt_enable();
   1442 }
   1443 
   1444 void
   1445 unsetgate(struct gate_descriptor *gd)
   1446 {
   1447 	vaddr_t vaddr;
   1448 
   1449 	vaddr = ((vaddr_t)gd) & ~PAGE_MASK;
   1450 
   1451 	kpreempt_disable();
   1452 	pmap_changeprot_local(vaddr, VM_PROT_READ|VM_PROT_WRITE);
   1453 
   1454 	memset(gd, 0, sizeof (*gd));
   1455 
   1456 	pmap_changeprot_local(vaddr, VM_PROT_READ);
   1457 	kpreempt_enable();
   1458 }
   1459 
   1460 void
   1461 setregion(struct region_descriptor *rd, void *base, uint16_t limit)
   1462 {
   1463 	rd->rd_limit = limit;
   1464 	rd->rd_base = (uint64_t)base;
   1465 }
   1466 
   1467 /*
   1468  * Note that the base and limit fields are ignored in long mode.
   1469  */
   1470 void
   1471 set_mem_segment(struct mem_segment_descriptor *sd, void *base, size_t limit,
   1472 	int type, int dpl, int gran, int def32, int is64)
   1473 {
   1474 	sd->sd_lolimit = (unsigned)limit;
   1475 	sd->sd_lobase = (unsigned long)base;
   1476 	sd->sd_type = type;
   1477 	sd->sd_dpl = dpl;
   1478 	sd->sd_p = 1;
   1479 	sd->sd_hilimit = (unsigned)limit >> 16;
   1480 	sd->sd_avl = 0;
   1481 	sd->sd_long = is64;
   1482 	sd->sd_def32 = def32;
   1483 	sd->sd_gran = gran;
   1484 	sd->sd_hibase = (unsigned long)base >> 24;
   1485 }
   1486 
   1487 void
   1488 set_sys_segment(struct sys_segment_descriptor *sd, void *base, size_t limit,
   1489 	int type, int dpl, int gran)
   1490 {
   1491 	memset(sd, 0, sizeof *sd);
   1492 	sd->sd_lolimit = (unsigned)limit;
   1493 	sd->sd_lobase = (uint64_t)base;
   1494 	sd->sd_type = type;
   1495 	sd->sd_dpl = dpl;
   1496 	sd->sd_p = 1;
   1497 	sd->sd_hilimit = (unsigned)limit >> 16;
   1498 	sd->sd_gran = gran;
   1499 	sd->sd_hibase = (uint64_t)base >> 24;
   1500 }
   1501 
   1502 void
   1503 cpu_init_idt(struct cpu_info *ci)
   1504 {
   1505 	struct region_descriptor region;
   1506 	idt_descriptor_t *idt;
   1507 
   1508 	idt = ci->ci_idtvec.iv_idt;
   1509 	setregion(&region, idt, NIDT * sizeof(idt[0]) - 1);
   1510 	lidt(&region);
   1511 }
   1512 
   1513 #define	IDTVEC(name)	__CONCAT(X, name)
   1514 typedef void (vector)(void);
   1515 extern vector IDTVEC(syscall);
   1516 extern vector IDTVEC(syscall32);
   1517 extern vector IDTVEC(osyscall);
   1518 extern vector *x86_exceptions[];
   1519 
   1520 #ifndef XENPV
   1521 static void
   1522 init_x86_64_ksyms(void)
   1523 {
   1524 #if NKSYMS || defined(DDB) || defined(MODULAR)
   1525 	extern int end;
   1526 	extern int *esym;
   1527 	struct btinfo_symtab *symtab;
   1528 	vaddr_t tssym, tesym;
   1529 
   1530 #ifdef DDB
   1531 	db_machine_init();
   1532 #endif
   1533 
   1534 	symtab = lookup_bootinfo(BTINFO_SYMTAB);
   1535 	if (symtab) {
   1536 #ifdef KASLR
   1537 		tssym = bootspace.head.va;
   1538 		tesym = bootspace.head.va; /* (unused...) */
   1539 #else
   1540 		tssym = (vaddr_t)symtab->ssym + KERNBASE;
   1541 		tesym = (vaddr_t)symtab->esym + KERNBASE;
   1542 #endif
   1543 		ksyms_addsyms_elf(symtab->nsym, (void *)tssym, (void *)tesym);
   1544 	} else {
   1545 		uintptr_t endp = (uintptr_t)(void *)&end;
   1546 #ifdef XEN
   1547 		/*
   1548 		 * cpu_probe() / identify_hypervisor() overrides VM_GUEST_GENPVH,
		 * so we can't rely on vm_guest == VM_GUEST_GENPVH here
   1550 		 */
   1551 		if (pvh_boot && vm_guest != VM_GUEST_XENPVH)
   1552 			ksyms_addsyms_elf(0, ((long *)endp) + 1, esym);
   1553 		else
   1554 #endif
   1555 			ksyms_addsyms_elf(*(long *)endp, ((long *)endp) + 1, esym);
   1556 	}
   1557 #endif
   1558 }
   1559 #endif /* XENPV */
   1560 
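/*
 * Describe the kernel image (text, rodata, data), the bootstrap area and
 * the module map bounds in the bootspace structure; cpu_startup() and the
 * pmap bootstrap code consume it.
 */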
   1561 void __noasan
   1562 init_bootspace(void)
   1563 {
   1564 	extern char __rodata_start;
   1565 	extern char __data_start;
   1566 	extern char __kernel_end;
   1567 	size_t i = 0;
   1568 
   1569 	memset(&bootspace, 0, sizeof(bootspace));
   1570 
   1571 	bootspace.head.va = KERNTEXTOFF;
   1572 	bootspace.head.pa = KERNTEXTOFF - KERNBASE;
   1573 	bootspace.head.sz = 0;
   1574 
   1575 	bootspace.segs[i].type = BTSEG_TEXT;
   1576 	bootspace.segs[i].va = KERNTEXTOFF;
   1577 	bootspace.segs[i].pa = KERNTEXTOFF - KERNBASE;
   1578 	bootspace.segs[i].sz = (size_t)&__rodata_start - KERNTEXTOFF;
   1579 	i++;
   1580 
   1581 	bootspace.segs[i].type = BTSEG_RODATA;
   1582 	bootspace.segs[i].va = (vaddr_t)&__rodata_start;
   1583 	bootspace.segs[i].pa = (paddr_t)&__rodata_start - KERNBASE;
   1584 	bootspace.segs[i].sz = (size_t)&__data_start - (size_t)&__rodata_start;
   1585 	i++;
   1586 
   1587 	bootspace.segs[i].type = BTSEG_DATA;
   1588 	bootspace.segs[i].va = (vaddr_t)&__data_start;
   1589 	bootspace.segs[i].pa = (paddr_t)&__data_start - KERNBASE;
   1590 	bootspace.segs[i].sz = (size_t)&__kernel_end - (size_t)&__data_start;
   1591 	i++;
   1592 
   1593 	bootspace.boot.va = (vaddr_t)&__kernel_end;
   1594 	bootspace.boot.pa = (paddr_t)&__kernel_end - KERNBASE;
   1595 	bootspace.boot.sz = (size_t)(atdevbase + IOM_SIZE) -
   1596 	    (size_t)&__kernel_end;
   1597 
   1598 	/* In locore.S, we allocated a tmp va. We will use it now. */
   1599 	bootspace.spareva = KERNBASE + NKL2_KIMG_ENTRIES * NBPD_L2;
   1600 
   1601 	/* Virtual address of the L4 page. */
   1602 	bootspace.pdir = (vaddr_t)(PDPpaddr + KERNBASE);
   1603 
   1604 	/* Kernel module map. */
   1605 	bootspace.smodule = (vaddr_t)atdevbase + IOM_SIZE;
   1606 	bootspace.emodule = KERNBASE + NKL2_KIMG_ENTRIES * NBPD_L2;
   1607 }
   1608 
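/*
 * Hook the recursive PTE slot into the boot L4 page (native only) and set
 * up the normal_pdes[] pointers used to walk the page-table hierarchy.
 */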
   1609 static void
   1610 init_pte(void)
   1611 {
   1612 #ifndef XENPV
   1613 	extern uint32_t nox_flag;
   1614 	pd_entry_t *pdir = (pd_entry_t *)bootspace.pdir;
   1615 	pdir[L4_SLOT_PTE] = PDPpaddr | PTE_W | ((uint64_t)nox_flag << 32) |
   1616 	    PTE_P;
   1617 #endif
   1618 
   1619 	extern pd_entry_t *normal_pdes[3];
   1620 	normal_pdes[0] = L2_BASE;
   1621 	normal_pdes[1] = L3_BASE;
   1622 	normal_pdes[2] = L4_BASE;
   1623 }
   1624 
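/*
 * Carve up the top-level (L4) page-table slots: fixed areas for the user
 * range, the kernel and (when configured) Xen, the per-CPU area and the
 * sanitizers, plus randomized positions for the main kernel VA range and
 * the recursive PTE area, seeded from early RNG samples.
 */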
   1625 void
   1626 init_slotspace(void)
   1627 {
   1628 	/*
   1629 	 * XXX Too early to use cprng(9), or even entropy_extract.
   1630 	 */
   1631 	struct entpool pool;
   1632 	size_t randhole;
   1633 	vaddr_t randva;
   1634 	uint64_t sample;
   1635 	vaddr_t va;
   1636 
   1637 	memset(&pool, 0, sizeof pool);
   1638 	cpu_rng_early_sample(&sample);
   1639 	entpool_enter(&pool, &sample, sizeof sample);
   1640 
   1641 	memset(&slotspace, 0, sizeof(slotspace));
   1642 
   1643 	/* User. [256, because we want to land in >= 256] */
   1644 	slotspace.area[SLAREA_USER].sslot = 0;
   1645 	slotspace.area[SLAREA_USER].nslot = PDIR_SLOT_USERLIM+1;
   1646 	slotspace.area[SLAREA_USER].active = true;
   1647 
   1648 #ifdef XENPV
   1649 	/* PTE. */
   1650 	slotspace.area[SLAREA_PTE].sslot = PDIR_SLOT_PTE;
   1651 	slotspace.area[SLAREA_PTE].nslot = 1;
   1652 	slotspace.area[SLAREA_PTE].active = true;
   1653 #endif
   1654 
   1655 #ifdef __HAVE_PCPU_AREA
   1656 	/* Per-CPU. */
   1657 	slotspace.area[SLAREA_PCPU].sslot = PDIR_SLOT_PCPU;
   1658 	slotspace.area[SLAREA_PCPU].nslot = 1;
   1659 	slotspace.area[SLAREA_PCPU].active = true;
   1660 #endif
   1661 
   1662 #ifdef __HAVE_DIRECT_MAP
   1663 	/* Direct Map. [Randomized later] */
   1664 	slotspace.area[SLAREA_DMAP].active = false;
   1665 #endif
   1666 
   1667 #ifdef XENPV
   1668 	/* Hypervisor. */
   1669 	slotspace.area[SLAREA_HYPV].sslot = 256;
   1670 	slotspace.area[SLAREA_HYPV].nslot = 17;
   1671 	slotspace.area[SLAREA_HYPV].active = true;
   1672 #endif
   1673 
   1674 #ifdef KASAN
   1675 	/* ASAN. */
   1676 	slotspace.area[SLAREA_ASAN].sslot = L4_SLOT_KASAN;
   1677 	slotspace.area[SLAREA_ASAN].nslot = NL4_SLOT_KASAN;
   1678 	slotspace.area[SLAREA_ASAN].active = true;
   1679 #endif
   1680 
   1681 #ifdef KMSAN
   1682 	/* MSAN. */
   1683 	slotspace.area[SLAREA_MSAN].sslot = L4_SLOT_KMSAN;
   1684 	slotspace.area[SLAREA_MSAN].nslot = NL4_SLOT_KMSAN;
   1685 	slotspace.area[SLAREA_MSAN].active = true;
   1686 #endif
   1687 
   1688 	/* Kernel. */
   1689 	slotspace.area[SLAREA_KERN].sslot = L4_SLOT_KERNBASE;
   1690 	slotspace.area[SLAREA_KERN].nslot = 1;
   1691 	slotspace.area[SLAREA_KERN].active = true;
   1692 
   1693 	/* Main. */
   1694 	cpu_rng_early_sample(&sample);
   1695 	entpool_enter(&pool, &sample, sizeof sample);
   1696 	entpool_extract(&pool, &randhole, sizeof randhole);
   1697 	entpool_extract(&pool, &randva, sizeof randva);
   1698 	va = slotspace_rand(SLAREA_MAIN, NKL4_MAX_ENTRIES * NBPD_L4,
   1699 	    NBPD_L4, randhole, randva); /* TODO: NBPD_L1 */
   1700 	vm_min_kernel_address = va;
   1701 	vm_max_kernel_address = va + NKL4_MAX_ENTRIES * NBPD_L4;
   1702 
   1703 #ifndef XENPV
   1704 	/* PTE. */
   1705 	cpu_rng_early_sample(&sample);
   1706 	entpool_enter(&pool, &sample, sizeof sample);
   1707 	entpool_extract(&pool, &randhole, sizeof randhole);
   1708 	entpool_extract(&pool, &randva, sizeof randva);
   1709 	va = slotspace_rand(SLAREA_PTE, NBPD_L4, NBPD_L4, randhole, randva);
   1710 	pte_base = (pd_entry_t *)va;
   1711 #endif
   1712 
   1713 	explicit_memset(&pool, 0, sizeof pool);
   1714 }
   1715 
   1716 void
   1717 init_x86_64(paddr_t first_avail)
   1718 {
   1719 	extern void consinit(void);
   1720 	struct region_descriptor region;
   1721 	struct mem_segment_descriptor *ldt_segp;
   1722 	struct idt_vec *iv;
   1723 	idt_descriptor_t *idt;
   1724 	int x;
   1725 	struct pcb *pcb;
   1726 	extern vaddr_t lwp0uarea;
   1727 #ifndef XENPV
   1728 	extern paddr_t local_apic_pa;
   1729 #endif
   1730 
   1731 	KASSERT(first_avail % PAGE_SIZE == 0);
   1732 
   1733 #ifdef XENPV
   1734 	KASSERT(HYPERVISOR_shared_info != NULL);
   1735 	cpu_info_primary.ci_vcpu = &HYPERVISOR_shared_info->vcpu_info[0];
   1736 #endif
   1737 
   1738 #ifdef XEN
   1739 	if (pvh_boot)
   1740 		xen_parse_cmdline(XEN_PARSE_BOOTFLAGS, NULL);
   1741 #endif
   1742 	init_pte();
   1743 
   1744 	uvm_lwp_setuarea(&lwp0, lwp0uarea);
   1745 
   1746 	cpu_probe(&cpu_info_primary);
   1747 #ifdef SVS
   1748 	svs_init();
   1749 #endif
   1750 
   1751 	/*
   1752 	 * Initialize MSRs on cpu0:
   1753 	 *
   1754 	 * - Enables SYSCALL/SYSRET.
   1755 	 *
   1756 	 * - Sets up %fs and %gs so that %gs points to the current
   1757 	 *   struct cpu_info as needed for CPUVAR(...), curcpu(), and
   1758 	 *   curlwp.
   1759 	 *
   1760 	 * - Enables the no-execute bit if supported.
   1761 	 *
   1762 	 * Thus, after this point, CPUVAR(...), curcpu(), and curlwp
   1763 	 * will work on cpu0.
   1764 	 *
   1765 	 * Note: The call to cpu_init_msrs for secondary CPUs happens
   1766 	 * in cpu_hatch.
   1767 	 */
   1768 	cpu_init_msrs(&cpu_info_primary, true);
   1769 
   1770 #ifndef XENPV
   1771 	cpu_speculation_init(&cpu_info_primary);
   1772 #endif
   1773 
   1774 	use_pae = 1; /* PAE always enabled in long mode */
   1775 
   1776 	pcb = lwp_getpcb(&lwp0);
   1777 #ifdef XENPV
   1778 	mutex_init(&pte_lock, MUTEX_DEFAULT, IPL_VM);
   1779 	pcb->pcb_cr3 = xen_start_info.pt_base - KERNBASE;
   1780 #else
   1781 	pcb->pcb_cr3 = PDPpaddr;
   1782 #endif
   1783 
   1784 #if NISA > 0 || NPCI > 0
   1785 	x86_bus_space_init();
   1786 #endif
   1787 
   1788 	pat_init(&cpu_info_primary);
   1789 
   1790 	consinit();	/* XXX SHOULD NOT BE DONE HERE */
   1791 
   1792 	/*
   1793 	 * Initialize PAGE_SIZE-dependent variables.
   1794 	 */
   1795 	uvm_md_init();
   1796 
   1797 	uvmexp.ncolors = 2;
   1798 
   1799 	avail_start = first_avail;
   1800 
   1801 #ifndef XENPV
   1802 	/*
   1803 	 * Low memory reservations:
   1804 	 * Page 0:	BIOS data
   1805 	 * Page 1:	BIOS callback (not used yet, for symmetry with i386)
   1806 	 * Page 2:	MP bootstrap code (MP_TRAMPOLINE)
   1807 	 * Page 3:	ACPI wakeup code (ACPI_WAKEUP_ADDR)
   1808 	 * Page 4:	Temporary page table for 0MB-4MB
   1809 	 * Page 5:	Temporary page directory
   1810 	 * Page 6:	Temporary page map level 3
   1811 	 * Page 7:	Temporary page map level 4
   1812 	 */
   1813 	lowmem_rsvd = 8 * PAGE_SIZE;
   1814 
   1815 	/* Initialize the memory clusters (needed in pmap_bootstrap). */
   1816 	init_x86_clusters();
   1817 #else
    1818 	/* Parse Xen command line (replaces bootinfo) */
   1819 	xen_parse_cmdline(XEN_PARSE_BOOTFLAGS, NULL);
   1820 
   1821 	avail_end = ctob(xen_start_info.nr_pages);
   1822 	pmap_pa_start = (KERNTEXTOFF - KERNBASE);
   1823 	pmap_pa_end = avail_end;
   1824 #endif
   1825 
   1826 	/*
    1827 	 * Call pmap initialization to set up the new kernel address space.
   1828 	 * We must do this before loading pages into the VM system.
   1829 	 */
   1830 	pmap_bootstrap(VM_MIN_KERNEL_ADDRESS);
   1831 
   1832 	/*
   1833 	 * Initialize RNG to get entropy ASAP either from CPU
   1834 	 * RDRAND/RDSEED or from seed on disk.  Constraints:
   1835 	 *
   1836 	 * - Must happen after cpu_init_msrs so that curcpu() and
   1837 	 *   curlwp work.
   1838 	 *
   1839 	 * - Must happen after consinit so we have the opportunity to
   1840 	 *   print useful feedback.
   1841 	 *
   1842 	 * - On KASLR kernels, must happen after pmap_bootstrap because
   1843 	 *   x86_rndseed requires access to the direct map.
   1844 	 */
   1845 	cpu_rng_init();
   1846 	x86_rndseed();
   1847 
   1848 #ifndef XENPV
   1849 	/* Internalize the physical pages into the VM system. */
   1850 	init_x86_vm(avail_start);
   1851 #else
   1852 	physmem = xen_start_info.nr_pages;
   1853 	uvm_page_physload(atop(avail_start), atop(avail_end),
   1854 	    atop(avail_start), atop(avail_end), VM_FREELIST_DEFAULT);
   1855 #endif
   1856 
   1857 	init_x86_msgbuf();
   1858 
   1859 	kasan_init();
   1860 	kcsan_init();
   1861 	kmsan_init((void *)lwp0uarea);
   1862 
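         	/*
         	 * Make sure the kernel page tables cover the first 32MB of
         	 * kernel virtual address space before it starts being used.
         	 */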
   1863 	pmap_growkernel(VM_MIN_KERNEL_ADDRESS + 32 * 1024 * 1024);
   1864 
   1865 	kpreempt_disable();
   1866 
   1867 #ifndef XENPV
   1868 	pmap_kenter_pa(local_apic_va, local_apic_pa,
   1869 	    VM_PROT_READ|VM_PROT_WRITE, 0);
   1870 	pmap_update(pmap_kernel());
   1871 	memset((void *)local_apic_va, 0, PAGE_SIZE);
   1872 #endif
   1873 
   1874 	pmap_kenter_pa(idt_vaddr, idt_paddr, VM_PROT_READ|VM_PROT_WRITE, 0);
   1875 	pmap_kenter_pa(gdt_vaddr, gdt_paddr, VM_PROT_READ|VM_PROT_WRITE, 0);
   1876 	pmap_kenter_pa(ldt_vaddr, ldt_paddr, VM_PROT_READ|VM_PROT_WRITE, 0);
   1877 	pmap_update(pmap_kernel());
   1878 	memset((void *)idt_vaddr, 0, PAGE_SIZE);
   1879 	memset((void *)gdt_vaddr, 0, PAGE_SIZE);
   1880 	memset((void *)ldt_vaddr, 0, PAGE_SIZE);
   1881 
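         	/*
         	 * Keep the boot CPU's IDT read-only on native kernels; code
         	 * that must modify it temporarily restores write access.
         	 */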
   1882 #ifndef XENPV
   1883 	pmap_changeprot_local(idt_vaddr, VM_PROT_READ);
   1884 #endif
   1885 
   1886 	pmap_update(pmap_kernel());
   1887 
   1888 	iv = &(cpu_info_primary.ci_idtvec);
   1889 	idt_vec_init_cpu_md(iv, cpu_index(&cpu_info_primary));
   1890 	idt = iv->iv_idt;
   1891 	gdtstore = (char *)gdt_vaddr;
   1892 	ldtstore = (char *)ldt_vaddr;
   1893 
   1894 	/*
   1895 	 * Make GDT gates and memory segments.
   1896 	 */
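         	/*
         	 * For the 64-bit segments, long mode ignores base and limit;
         	 * what matters is the type, the DPL and the L bit.  The 32-bit
         	 * compat segments further down keep meaningful limits and set
         	 * D instead of L.
         	 */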
   1897 	set_mem_segment(GDT_ADDR_MEM(gdtstore, GCODE_SEL), 0,
   1898 	    0xfffff, SDT_MEMERA, SEL_KPL, 1, 0, 1);
   1899 
   1900 	set_mem_segment(GDT_ADDR_MEM(gdtstore, GDATA_SEL), 0,
   1901 	    0xfffff, SDT_MEMRWA, SEL_KPL, 1, 0, 1);
   1902 
   1903 	set_mem_segment(GDT_ADDR_MEM(gdtstore, GUCODE_SEL), 0,
   1904 	    x86_btop(VM_MAXUSER_ADDRESS) - 1, SDT_MEMERA, SEL_UPL, 1, 0, 1);
   1905 
   1906 	set_mem_segment(GDT_ADDR_MEM(gdtstore, GUDATA_SEL), 0,
   1907 	    x86_btop(VM_MAXUSER_ADDRESS) - 1, SDT_MEMRWA, SEL_UPL, 1, 0, 1);
   1908 
   1909 #ifndef XENPV
   1910 	set_sys_segment(GDT_ADDR_SYS(gdtstore, GLDT_SEL), ldtstore,
   1911 	    LDT_SIZE - 1, SDT_SYSLDT, SEL_KPL, 0);
   1912 #endif
   1913 
   1914 	/*
   1915 	 * Make LDT memory segments.
   1916 	 */
   1917 	*(struct mem_segment_descriptor *)(ldtstore + LUCODE_SEL) =
   1918 	    *GDT_ADDR_MEM(gdtstore, GUCODE_SEL);
   1919 	*(struct mem_segment_descriptor *)(ldtstore + LUDATA_SEL) =
   1920 	    *GDT_ADDR_MEM(gdtstore, GUDATA_SEL);
   1921 
   1922 	/*
    1923 	 * 32-bit GDT entries.
   1924 	 */
   1925 	set_mem_segment(GDT_ADDR_MEM(gdtstore, GUCODE32_SEL), 0,
   1926 	    x86_btop(VM_MAXUSER_ADDRESS32) - 1, SDT_MEMERA, SEL_UPL, 1, 1, 0);
   1927 
   1928 	set_mem_segment(GDT_ADDR_MEM(gdtstore, GUDATA32_SEL), 0,
   1929 	    x86_btop(VM_MAXUSER_ADDRESS32) - 1, SDT_MEMRWA, SEL_UPL, 1, 1, 0);
   1930 
   1931 	set_mem_segment(GDT_ADDR_MEM(gdtstore, GUFS_SEL), 0,
   1932 	    x86_btop(VM_MAXUSER_ADDRESS32) - 1, SDT_MEMRWA, SEL_UPL, 1, 1, 0);
   1933 
   1934 	set_mem_segment(GDT_ADDR_MEM(gdtstore, GUGS_SEL), 0,
   1935 	    x86_btop(VM_MAXUSER_ADDRESS32) - 1, SDT_MEMRWA, SEL_UPL, 1, 1, 0);
   1936 
   1937 	/*
    1938 	 * 32-bit LDT entries.
   1939 	 */
   1940 	ldt_segp = (struct mem_segment_descriptor *)(ldtstore + LUCODE32_SEL);
   1941 	set_mem_segment(ldt_segp, 0, x86_btop(VM_MAXUSER_ADDRESS32) - 1,
   1942 	    SDT_MEMERA, SEL_UPL, 1, 1, 0);
   1943 	ldt_segp = (struct mem_segment_descriptor *)(ldtstore + LUDATA32_SEL);
   1944 	set_mem_segment(ldt_segp, 0, x86_btop(VM_MAXUSER_ADDRESS32) - 1,
   1945 	    SDT_MEMRWA, SEL_UPL, 1, 1, 0);
   1946 
   1947 	/* CPU-specific IDT exceptions. */
   1948 	for (x = 0; x < NCPUIDT; x++) {
   1949 		int sel, ist;
   1950 
    1951 		/* Reset to default. Special cases below. */
   1952 		sel = SEL_KPL;
   1953 		ist = 0;
   1954 
   1955 		idt_vec_reserve(iv, x);
   1956 
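         		/*
         		 * A few exceptions run on dedicated IST stacks so they
         		 * can be taken even when the kernel stack is unusable;
         		 * #BP (3) and #OF (4) get DPL 3 so they can be raised
         		 * from user mode.
         		 */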
   1957 		switch (x) {
   1958 		case 1:	/* DB */
   1959 			ist = 4;
   1960 			break;
   1961 		case 2:	/* NMI */
   1962 			ist = 3;
   1963 			break;
   1964 		case 3:
   1965 		case 4:
   1966 			sel = SEL_UPL;
   1967 			break;
   1968 		case 8:	/* double fault */
   1969 			ist = 2;
   1970 			break;
   1971 #ifdef XENPV
   1972 		case 18: /* MCA */
   1973 			sel |= 0x4; /* Auto EOI/mask */
   1974 			break;
   1975 #endif /* XENPV */
   1976 		default:
   1977 			break;
   1978 		}
   1979 
   1980 		set_idtgate(&idt[x], x86_exceptions[x], ist, SDT_SYS386IGT,
   1981 		    sel, GSEL(GCODE_SEL, SEL_KPL));
   1982 	}
   1983 
   1984 	/* new-style interrupt gate for syscalls */
   1985 	idt_vec_reserve(iv, 128);
   1986 	set_idtgate(&idt[128], &IDTVEC(osyscall), 0, SDT_SYS386IGT, SEL_UPL,
   1987 	    GSEL(GCODE_SEL, SEL_KPL));
   1988 
   1989 	kpreempt_enable();
   1990 
   1991 	setregion(&region, gdtstore, DYNSEL_START - 1);
   1992 	lgdt(&region);
   1993 
   1994 #ifdef XENPV
   1995 	/* Init Xen callbacks and syscall handlers */
   1996 	if (HYPERVISOR_set_callbacks(
   1997 	    (unsigned long) hypervisor_callback,
   1998 	    (unsigned long) failsafe_callback,
   1999 	    (unsigned long) Xsyscall))
   2000 		panic("HYPERVISOR_set_callbacks() failed");
   2001 #endif /* XENPV */
   2002 
   2003 	cpu_init_idt(&cpu_info_primary);
   2004 
   2005 #ifdef XENPV
   2006 	xen_init_ksyms();
   2007 #else /* XENPV */
   2008 #ifdef XEN
   2009 	if (vm_guest == VM_GUEST_XENPVH)
   2010 		xen_init_ksyms();
   2011 	else
   2012 #endif /* XEN */
   2013 		init_x86_64_ksyms();
   2014 #endif /* XENPV */
   2015 
   2016 #ifndef XENPV
   2017 	intr_default_setup();
   2018 #else
   2019 	events_default_setup();
   2020 #endif
   2021 
   2022 	splraise(IPL_HIGH);
   2023 	x86_enable_intr();
   2024 
   2025 #ifdef DDB
   2026 	if (boothowto & RB_KDB)
   2027 		Debugger();
   2028 #endif
   2029 #ifdef KGDB
   2030 	kgdb_port_init();
   2031 	if (boothowto & RB_KDB) {
   2032 		kgdb_debug_init = 1;
   2033 		kgdb_connect(1);
   2034 	}
   2035 #endif
   2036 
   2037 	pcb->pcb_dbregs = NULL;
   2038 	x86_dbregs_init();
   2039 }
   2040 
   2041 void
   2042 cpu_reset(void)
   2043 {
   2044 #ifndef XENPV
   2045 	idt_descriptor_t *idt;
   2046 	vaddr_t vaddr;
   2047 
   2048 	idt = cpu_info_primary.ci_idtvec.iv_idt;
   2049 	vaddr = (vaddr_t)idt;
   2050 #endif
   2051 
   2052 	x86_disable_intr();
   2053 
   2054 #ifdef XENPV
   2055 	HYPERVISOR_reboot();
   2056 #else
   2057 
   2058 	x86_reset();
   2059 
   2060 	/*
   2061 	 * Try to cause a triple fault and watchdog reset by making the IDT
   2062 	 * invalid and causing a fault.
   2063 	 */
   2064 	kpreempt_disable();
   2065 	pmap_changeprot_local(vaddr, VM_PROT_READ|VM_PROT_WRITE);
   2066 	memset((void *)idt, 0, NIDT * sizeof(idt[0]));
   2067 	kpreempt_enable();
   2068 	breakpoint();
   2069 
   2070 #if 0
   2071 	/*
   2072 	 * Try to cause a triple fault and watchdog reset by unmapping the
   2073 	 * entire address space and doing a TLB flush.
   2074 	 */
   2075 	memset((void *)PTD, 0, PAGE_SIZE);
   2076 	tlbflush();
   2077 #endif
   2078 #endif	/* XENPV */
   2079 
   2080 	for (;;);
   2081 }
   2082 
   2083 void
   2084 cpu_getmcontext(struct lwp *l, mcontext_t *mcp, unsigned int *flags)
   2085 {
   2086 	const struct trapframe *tf = l->l_md.md_regs;
   2087 	__greg_t ras_rip;
   2088 
   2089 	mcp->__gregs[_REG_RDI] = tf->tf_rdi;
   2090 	mcp->__gregs[_REG_RSI] = tf->tf_rsi;
   2091 	mcp->__gregs[_REG_RDX] = tf->tf_rdx;
   2092 	mcp->__gregs[_REG_R10] = tf->tf_r10;
   2093 	mcp->__gregs[_REG_R8]  = tf->tf_r8;
   2094 	mcp->__gregs[_REG_R9]  = tf->tf_r9;
   2095 	/* argX not touched */
   2096 	mcp->__gregs[_REG_RCX] = tf->tf_rcx;
   2097 	mcp->__gregs[_REG_R11] = tf->tf_r11;
   2098 	mcp->__gregs[_REG_R12] = tf->tf_r12;
   2099 	mcp->__gregs[_REG_R13] = tf->tf_r13;
   2100 	mcp->__gregs[_REG_R14] = tf->tf_r14;
   2101 	mcp->__gregs[_REG_R15] = tf->tf_r15;
   2102 	mcp->__gregs[_REG_RBP] = tf->tf_rbp;
   2103 	mcp->__gregs[_REG_RBX] = tf->tf_rbx;
   2104 	mcp->__gregs[_REG_RAX] = tf->tf_rax;
   2105 	mcp->__gregs[_REG_GS]  = 0;
   2106 	mcp->__gregs[_REG_FS]  = 0;
   2107 	mcp->__gregs[_REG_ES]  = GSEL(GUDATA_SEL, SEL_UPL);
   2108 	mcp->__gregs[_REG_DS]  = GSEL(GUDATA_SEL, SEL_UPL);
   2109 	mcp->__gregs[_REG_TRAPNO] = tf->tf_trapno;
   2110 	mcp->__gregs[_REG_ERR] = tf->tf_err;
   2111 	mcp->__gregs[_REG_RIP] = tf->tf_rip;
   2112 	mcp->__gregs[_REG_CS]  = LSEL(LUCODE_SEL, SEL_UPL);
   2113 	mcp->__gregs[_REG_RFLAGS] = tf->tf_rflags;
   2114 	mcp->__gregs[_REG_RSP] = tf->tf_rsp;
   2115 	mcp->__gregs[_REG_SS]  = LSEL(LUDATA_SEL, SEL_UPL);
   2116 
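         	/*
         	 * If the LWP was interrupted inside a restartable atomic
         	 * sequence, report the sequence's restart address rather than
         	 * the raw PC.
         	 */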
   2117 	if ((ras_rip = (__greg_t)ras_lookup(l->l_proc,
   2118 	    (void *) mcp->__gregs[_REG_RIP])) != -1)
   2119 		mcp->__gregs[_REG_RIP] = ras_rip;
   2120 
   2121 	*flags |= _UC_CPU;
   2122 
   2123 	mcp->_mc_tlsbase = (uintptr_t)l->l_private;
   2124 	*flags |= _UC_TLSBASE;
   2125 
   2126 	process_read_fpregs_xmm(l, (struct fxsave *)&mcp->__fpregs);
   2127 	*flags |= _UC_FPU;
   2128 }
   2129 
   2130 int
   2131 cpu_setmcontext(struct lwp *l, const mcontext_t *mcp, unsigned int flags)
   2132 {
   2133 	struct trapframe *tf = l->l_md.md_regs;
   2134 	const __greg_t *gr = mcp->__gregs;
   2135 	struct proc *p = l->l_proc;
   2136 	int error;
   2137 	int64_t rflags;
   2138 
   2139 	CTASSERT(sizeof (mcontext_t) == 26 * 8 + 8 + 512);
   2140 
   2141 	if ((flags & _UC_CPU) != 0) {
   2142 		error = cpu_mcontext_validate(l, mcp);
   2143 		if (error != 0)
   2144 			return error;
   2145 
   2146 		tf->tf_rdi  = gr[_REG_RDI];
   2147 		tf->tf_rsi  = gr[_REG_RSI];
   2148 		tf->tf_rdx  = gr[_REG_RDX];
   2149 		tf->tf_r10  = gr[_REG_R10];
   2150 		tf->tf_r8   = gr[_REG_R8];
   2151 		tf->tf_r9   = gr[_REG_R9];
   2152 		/* argX not touched */
   2153 		tf->tf_rcx  = gr[_REG_RCX];
   2154 		tf->tf_r11  = gr[_REG_R11];
   2155 		tf->tf_r12  = gr[_REG_R12];
   2156 		tf->tf_r13  = gr[_REG_R13];
   2157 		tf->tf_r14  = gr[_REG_R14];
   2158 		tf->tf_r15  = gr[_REG_R15];
   2159 		tf->tf_rbp  = gr[_REG_RBP];
   2160 		tf->tf_rbx  = gr[_REG_RBX];
   2161 		tf->tf_rax  = gr[_REG_RAX];
   2162 		tf->tf_gs   = 0;
   2163 		tf->tf_fs   = 0;
   2164 		tf->tf_es   = GSEL(GUDATA_SEL, SEL_UPL);
   2165 		tf->tf_ds   = GSEL(GUDATA_SEL, SEL_UPL);
   2166 		/* trapno, err not touched */
   2167 		tf->tf_rip  = gr[_REG_RIP];
   2168 		tf->tf_cs   = LSEL(LUCODE_SEL, SEL_UPL);
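         		/*
         		 * Only the user-modifiable rflags bits may be taken
         		 * from the supplied context; the rest are preserved.
         		 */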
   2169 		rflags = tf->tf_rflags;
   2170 		rflags &= ~PSL_USER;
   2171 		tf->tf_rflags = rflags | (gr[_REG_RFLAGS] & PSL_USER);
   2172 		tf->tf_rsp  = gr[_REG_RSP];
   2173 		tf->tf_ss   = LSEL(LUDATA_SEL, SEL_UPL);
   2174 
   2175 		l->l_md.md_flags |= MDL_IRET;
   2176 	}
   2177 
   2178 	if ((flags & _UC_FPU) != 0)
   2179 		process_write_fpregs_xmm(l, (const struct fxsave *)&mcp->__fpregs);
   2180 
   2181 	if ((flags & _UC_TLSBASE) != 0)
   2182 		lwp_setprivate(l, (void *)(uintptr_t)mcp->_mc_tlsbase);
   2183 
   2184 	mutex_enter(p->p_lock);
   2185 	if (flags & _UC_SETSTACK)
   2186 		l->l_sigstk.ss_flags |= SS_ONSTACK;
   2187 	if (flags & _UC_CLRSTACK)
   2188 		l->l_sigstk.ss_flags &= ~SS_ONSTACK;
   2189 	mutex_exit(p->p_lock);
   2190 
   2191 	return 0;
   2192 }
   2193 
   2194 int
   2195 cpu_mcontext_validate(struct lwp *l, const mcontext_t *mcp)
   2196 {
   2197 	struct proc *p __diagused = l->l_proc;
   2198 	struct trapframe *tf = l->l_md.md_regs;
   2199 	const __greg_t *gr;
   2200 	uint16_t sel;
   2201 
   2202 	KASSERT((p->p_flag & PK_32) == 0);
   2203 	gr = mcp->__gregs;
   2204 
   2205 	if (((gr[_REG_RFLAGS] ^ tf->tf_rflags) & PSL_USERSTATIC) != 0)
   2206 		return EINVAL;
   2207 
   2208 	sel = gr[_REG_ES] & 0xffff;
   2209 	if (sel != 0 && !VALID_USER_DSEL(sel))
   2210 		return EINVAL;
   2211 
   2212 	sel = gr[_REG_FS] & 0xffff;
   2213 	if (sel != 0 && !VALID_USER_DSEL(sel))
   2214 		return EINVAL;
   2215 
   2216 	sel = gr[_REG_GS] & 0xffff;
   2217 	if (sel != 0 && !VALID_USER_DSEL(sel))
   2218 		return EINVAL;
   2219 
   2220 	sel = gr[_REG_DS] & 0xffff;
   2221 	if (!VALID_USER_DSEL(sel))
   2222 		return EINVAL;
   2223 
   2224 #ifndef XENPV
   2225 	sel = gr[_REG_SS] & 0xffff;
   2226 	if (!VALID_USER_DSEL(sel))
   2227 		return EINVAL;
   2228 
   2229 	sel = gr[_REG_CS] & 0xffff;
   2230 	if (!VALID_USER_CSEL(sel))
   2231 		return EINVAL;
   2232 #endif
   2233 
   2234 	if (gr[_REG_RIP] >= VM_MAXUSER_ADDRESS)
   2235 		return EINVAL;
   2236 
   2237 	return 0;
   2238 }
   2239 
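         /*
          * Machine-dependent access check for mm(4) (/dev/mem, /dev/kmem):
          * addresses inside the bootstrap segments are handled here (text and
          * rodata are never writable), module map addresses are checked against
          * their mapping protection, and anything else is left to the generic
          * code.
          */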
   2240 int
   2241 mm_md_kernacc(void *ptr, vm_prot_t prot, bool *handled)
   2242 {
   2243 	const vaddr_t v = (vaddr_t)ptr;
   2244 	vaddr_t kva, kva_end;
   2245 	size_t i;
   2246 
   2247 	kva = bootspace.head.va;
   2248 	kva_end = kva + bootspace.head.sz;
   2249 	if (v >= kva && v < kva_end) {
   2250 		*handled = true;
   2251 		return 0;
   2252 	}
   2253 
   2254 	for (i = 0; i < BTSPACE_NSEGS; i++) {
   2255 		kva = bootspace.segs[i].va;
   2256 		kva_end = kva + bootspace.segs[i].sz;
   2257 		if (v < kva || v >= kva_end)
   2258 			continue;
   2259 		*handled = true;
   2260 		if (bootspace.segs[i].type == BTSEG_TEXT ||
   2261 		    bootspace.segs[i].type == BTSEG_RODATA) {
   2262 			if (prot & VM_PROT_WRITE) {
   2263 				return EFAULT;
   2264 			}
   2265 		}
   2266 		return 0;
   2267 	}
   2268 
   2269 	kva = bootspace.boot.va;
   2270 	kva_end = kva + bootspace.boot.sz;
   2271 	if (v >= kva && v < kva_end) {
   2272 		*handled = true;
   2273 		return 0;
   2274 	}
   2275 
   2276 	if (v >= bootspace.smodule && v < bootspace.emodule) {
   2277 		*handled = true;
   2278 		if (!uvm_map_checkprot(module_map, v, v + 1, prot)) {
   2279 			return EFAULT;
   2280 		}
   2281 	} else {
   2282 		*handled = false;
   2283 	}
   2284 	return 0;
   2285 }
   2286 
   2287 /*
    2288  * Zero out a 64-bit LWP's segment registers. Used when exec'ing a new
    2289  * 64-bit program.
   2290  */
   2291 void
   2292 cpu_segregs64_zero(struct lwp *l)
   2293 {
   2294 	struct trapframe * const tf = l->l_md.md_regs;
   2295 	struct pcb *pcb;
   2296 	uint64_t zero = 0;
   2297 
   2298 	KASSERT(kpreempt_disabled());
   2299 	KASSERT((l->l_proc->p_flag & PK_32) == 0);
   2300 	KASSERT(l == curlwp);
   2301 
   2302 	pcb = lwp_getpcb(l);
   2303 
   2304 	tf->tf_fs = 0;
   2305 	tf->tf_gs = 0;
   2306 	setds(GSEL(GUDATA_SEL, SEL_UPL));
   2307 	setes(GSEL(GUDATA_SEL, SEL_UPL));
   2308 	setfs(0);
   2309 	setusergs(0);
   2310 
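         	/*
         	 * Also clear the base MSRs; while in the kernel, KERNELGSBASE
         	 * holds the user %gs base that swapgs restores on return to
         	 * user mode.
         	 */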
   2311 #ifndef XENPV
   2312 	wrmsr(MSR_FSBASE, 0);
   2313 	wrmsr(MSR_KERNELGSBASE, 0);
   2314 #else
   2315 	HYPERVISOR_set_segment_base(SEGBASE_FS, 0);
   2316 	HYPERVISOR_set_segment_base(SEGBASE_GS_USER, 0);
   2317 #endif
   2318 
   2319 	pcb->pcb_fs = 0;
   2320 	pcb->pcb_gs = 0;
   2321 	update_descriptor(&curcpu()->ci_gdt[GUFS_SEL], &zero);
   2322 	update_descriptor(&curcpu()->ci_gdt[GUGS_SEL], &zero);
   2323 }
   2324 
   2325 /*
    2326  * Zero out a 32-bit LWP's segment registers. Used when exec'ing a new
    2327  * 32-bit program.
   2328  */
   2329 void
   2330 cpu_segregs32_zero(struct lwp *l)
   2331 {
   2332 	struct trapframe * const tf = l->l_md.md_regs;
   2333 	struct pcb *pcb;
   2334 	uint64_t zero = 0;
   2335 
   2336 	KASSERT(kpreempt_disabled());
   2337 	KASSERT(l->l_proc->p_flag & PK_32);
   2338 	KASSERT(l == curlwp);
   2339 
   2340 	pcb = lwp_getpcb(l);
   2341 
   2342 	tf->tf_fs = 0;
   2343 	tf->tf_gs = 0;
   2344 	setds(GSEL(GUDATA32_SEL, SEL_UPL));
   2345 	setes(GSEL(GUDATA32_SEL, SEL_UPL));
   2346 	setfs(0);
   2347 	setusergs(0);
   2348 	pcb->pcb_fs = 0;
   2349 	pcb->pcb_gs = 0;
   2350 	update_descriptor(&curcpu()->ci_gdt[GUFS_SEL], &zero);
   2351 	update_descriptor(&curcpu()->ci_gdt[GUGS_SEL], &zero);
   2352 }
   2353 
   2354 /*
   2355  * Load an LWP's TLS context, possibly changing the %fs and %gs selectors.
   2356  * Used only for 32-bit processes.
   2357  */
   2358 void
   2359 cpu_fsgs_reload(struct lwp *l, int fssel, int gssel)
   2360 {
   2361 	struct trapframe *tf;
   2362 	struct pcb *pcb;
   2363 
   2364 	KASSERT(l->l_proc->p_flag & PK_32);
   2365 	KASSERT(l == curlwp);
   2366 
   2367 	tf = l->l_md.md_regs;
   2368 	fssel &= 0xFFFF;
   2369 	gssel &= 0xFFFF;
   2370 
   2371 	pcb = lwp_getpcb(l);
   2372 	kpreempt_disable();
   2373 	update_descriptor(&curcpu()->ci_gdt[GUFS_SEL], &pcb->pcb_fs);
   2374 	update_descriptor(&curcpu()->ci_gdt[GUGS_SEL], &pcb->pcb_gs);
   2375 
   2376 #ifdef XENPV
   2377 	setusergs(gssel);
   2378 #endif
   2379 
   2380 	tf->tf_fs = fssel;
   2381 	tf->tf_gs = gssel;
   2382 	kpreempt_enable();
   2383 }
   2384 
   2385 bool
   2386 mm_md_direct_mapped_io(void *addr, paddr_t *paddr)
   2387 {
   2388 	vaddr_t va = (vaddr_t)addr;
   2389 
   2390 #ifdef __HAVE_DIRECT_MAP
   2391 	if (va >= PMAP_DIRECT_BASE && va < PMAP_DIRECT_END) {
   2392 		*paddr = PMAP_DIRECT_UNMAP(va);
   2393 		return true;
   2394 	}
   2395 #else
   2396 	__USE(va);
   2397 #endif
   2398 
   2399 	return false;
   2400 }
   2401 
   2402 bool
   2403 mm_md_direct_mapped_phys(paddr_t paddr, vaddr_t *vaddr)
   2404 {
   2405 #ifdef __HAVE_DIRECT_MAP
   2406 	*vaddr = PMAP_DIRECT_MAP(paddr);
   2407 	return true;
   2408 #else
   2409 	return false;
   2410 #endif
   2411 }
   2412 
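         /*
          * With per-CPU IDTs, each secondary CPU gets its own read-only IDT
          * page seeded from the boot CPU's tables; idt_vec_copy() duplicates
          * both the gate descriptors and the vector allocation bitmap.
          */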
   2413 static void
   2414 idt_vec_copy(struct idt_vec *dst, struct idt_vec *src)
   2415 {
   2416 	idt_descriptor_t *idt_dst;
   2417 
   2418 	idt_dst = dst->iv_idt;
   2419 
   2420 	kpreempt_disable();
   2421 	pmap_changeprot_local((vaddr_t)idt_dst, VM_PROT_READ|VM_PROT_WRITE);
   2422 
   2423 	memcpy(idt_dst, src->iv_idt, PAGE_SIZE);
   2424 	memcpy(dst->iv_allocmap, src->iv_allocmap, sizeof(dst->iv_allocmap));
   2425 
   2426 	pmap_changeprot_local((vaddr_t)idt_dst, VM_PROT_READ);
   2427 	kpreempt_enable();
   2428 }
   2429 
   2430 void
   2431 idt_vec_init_cpu_md(struct idt_vec *iv, cpuid_t cid)
   2432 {
   2433 	vaddr_t va;
   2434 
   2435 	if (cid != cpu_index(&cpu_info_primary) &&
   2436 	    idt_vec_is_pcpu()) {
   2437 #ifdef __HAVE_PCPU_AREA
   2438 		va = (vaddr_t)&pcpuarea->ent[cid].idt;
   2439 #else
   2440 		struct vm_page *pg;
   2441 
   2442 		va = uvm_km_alloc(kernel_map, PAGE_SIZE, 0,
   2443 		    UVM_KMF_VAONLY);
   2444 		pg = uvm_pagealloc(NULL, 0, NULL, UVM_PGA_ZERO);
   2445 		if (pg == NULL) {
   2446 			panic("failed to allocate a page for IDT");
   2447 		}
   2448 		pmap_kenter_pa(va, VM_PAGE_TO_PHYS(pg),
   2449 		    VM_PROT_READ|VM_PROT_WRITE, 0);
   2450 		pmap_update(pmap_kernel());
   2451 #endif
   2452 
   2453 		memset((void *)va, 0, PAGE_SIZE);
   2454 #ifndef XENPV
   2455 		pmap_changeprot_local(va, VM_PROT_READ);
   2456 #endif
   2457 		pmap_update(pmap_kernel());
   2458 
   2459 		iv->iv_idt = (void *)va;
   2460 		idt_vec_copy(iv, &(cpu_info_primary.ci_idtvec));
   2461 	} else {
   2462 		iv->iv_idt = (void *)idt_vaddr;
   2463 	}
   2464 }
   2465