/*	$NetBSD: machdep.c,v 1.849 2025/05/05 16:57:41 imil Exp $	*/

/*
 * Copyright (c) 1996, 1997, 1998, 2000, 2004, 2006, 2008, 2009, 2017
 *     The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Charles M. Hannum, by Jason R. Thorpe of the Numerical Aerospace
 * Simulation Facility NASA Ames Research Center, by Julio M. Merino Vidal,
 * by Andrew Doran, and by Maxime Villard.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Copyright (c) 1982, 1987, 1990 The Regents of the University of California.
 * All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * William Jolitz.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)machdep.c	7.4 (Berkeley) 6/3/91
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: machdep.c,v 1.849 2025/05/05 16:57:41 imil Exp $");

#include "opt_beep.h"
#include "opt_compat_freebsd.h"
#include "opt_compat_netbsd.h"
#include "opt_cpureset_delay.h"
#include "opt_ddb.h"
#include "opt_kgdb.h"
#include "opt_mtrr.h"
#include "opt_modular.h"
#include "opt_multiboot.h"
#include "opt_multiprocessor.h"
#include "opt_physmem.h"
#include "opt_realmem.h"
#include "opt_user_ldt.h"
#include "opt_xen.h"
#include "isa.h"
#include "pci.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/signal.h>
#include <sys/signalvar.h>
#include <sys/kernel.h>
#include <sys/cpu.h>
#include <sys/exec.h>
#include <sys/fcntl.h>
#include <sys/reboot.h>
#include <sys/conf.h>
#include <sys/kauth.h>
#include <sys/msgbuf.h>
#include <sys/mount.h>
#include <sys/syscallargs.h>
#include <sys/core.h>
#include <sys/kcore.h>
#include <sys/ucontext.h>
#include <sys/ras.h>
#include <sys/ksyms.h>
#include <sys/device.h>
#include <sys/timevar.h>

#ifdef KGDB
#include <sys/kgdb.h>
#endif

#include <dev/cons.h>
#include <dev/mm.h>

#include <uvm/uvm.h>
#include <uvm/uvm_page.h>

#include <sys/sysctl.h>

#include <x86/efi.h>

#include <machine/cpu.h>
#include <machine/cpu_rng.h>
#include <machine/cpufunc.h>
#include <machine/cpuvar.h>
#include <machine/gdt.h>
#include <machine/intr.h>
#include <machine/kcore.h>
#include <machine/pio.h>
#include <machine/psl.h>
#include <machine/reg.h>
#include <machine/specialreg.h>
#include <machine/bootinfo.h>
#include <machine/mtrr.h>
#include <machine/pmap_private.h>
#include <x86/x86/tsc.h>

#include <x86/bootspace.h>
#include <x86/fpu.h>
#include <x86/dbregs.h>
#include <x86/machdep.h>

#include <machine/multiboot.h>

#ifdef XEN
#include <xen/evtchn.h>
#include <xen/xen.h>
#include <xen/hypervisor.h>
#endif

#include <dev/isa/isareg.h>
#include <machine/isa_machdep.h>
#include <dev/ic/i8042reg.h>

#include <ddb/db_active.h>

#ifdef DDB
#include <machine/db_machdep.h>
#include <ddb/db_extern.h>
#endif

#include "acpica.h"
#include "bioscall.h"

#if NBIOSCALL > 0
#include <machine/bioscall.h>
#endif

#if NACPICA > 0
#include <dev/acpi/acpivar.h>
#define ACPI_MACHDEP_PRIVATE
#include <machine/acpi_machdep.h>
#else
#include <machine/i82489var.h>
#endif

#include "isa.h"
#include "isadma.h"
#include "ksyms.h"

#include "cardbus.h"
#if NCARDBUS > 0
/* For rbus_min_start hint. */
#include <sys/bus.h>
#include <dev/cardbus/rbus.h>
#include <machine/rbus_machdep.h>
#endif

#include "mca.h"
#if NMCA > 0
#include <machine/mca_machdep.h>	/* for mca_busprobe() */
#endif

#ifdef MULTIPROCESSOR		/* XXX */
#include <machine/mpbiosvar.h>	/* XXX */
#endif				/* XXX */

/* the following is used externally (sysctl_hw) */
char machine[] = "i386";		/* CPU "architecture" */
char machine_arch[] = "i386";		/* machine == machine_arch */

#ifdef CPURESET_DELAY
int cpureset_delay = CPURESET_DELAY;
#else
int cpureset_delay = 2000; /* default to 2s */
#endif

#ifdef MTRR
const struct mtrr_funcs *mtrr_funcs;
#endif

int cpu_class;
int use_pae;
int i386_fpu_fdivbug;

int i386_use_fxsave;
int i386_has_sse;
int i386_has_sse2;

vaddr_t idt_vaddr;
paddr_t idt_paddr;
vaddr_t gdt_vaddr;
paddr_t gdt_paddr;
vaddr_t ldt_vaddr;
paddr_t ldt_paddr;

vaddr_t pentium_idt_vaddr;

struct vm_map *phys_map = NULL;

extern struct bootspace bootspace;

extern paddr_t lowmem_rsvd;
extern paddr_t avail_start, avail_end;
#ifdef XENPV
extern paddr_t pmap_pa_start, pmap_pa_end;
void hypervisor_callback(void);
void failsafe_callback(void);
#endif

/*
 * Size of memory segments, before any memory is stolen.
 */
phys_ram_seg_t mem_clusters[VM_PHYSSEG_MAX];
int mem_cluster_cnt = 0;

void init_bootspace(void);
void init386(paddr_t);
void initgdt(union descriptor *);

static void i386_proc0_pcb_ldt_init(void);

int *esym;
int *eblob;
extern int boothowto;

#ifndef XENPV

/* Base memory reported by BIOS. */
#ifndef REALBASEMEM
int biosbasemem = 0;
#else
int biosbasemem = REALBASEMEM;
#endif

/* Extended memory reported by BIOS. */
#ifndef REALEXTMEM
int biosextmem = 0;
#else
int biosextmem = REALEXTMEM;
#endif

/* Set if any boot-loader set biosbasemem/biosextmem. */
int biosmem_implicit;

/*
 * Representation of the bootinfo structure constructed by a NetBSD native
    282  */
    283 struct bootinfo_source {
    284 	uint32_t bs_naddrs;
    285 	void *bs_addrs[1]; /* Actually longer. */
    286 };
    287 
    288 /* Only called by locore.S; no need to be in a header file. */
    289 void native_loader(int, int, struct bootinfo_source *, paddr_t, int, int);
    290 
    291 /*
    292  * Called as one of the very first things during system startup (just after
    293  * the boot loader gave control to the kernel image), this routine is in
    294  * charge of retrieving the parameters passed in by the boot loader and
    295  * storing them in the appropriate kernel variables.
    296  *
    297  * WARNING: Because the kernel has not yet relocated itself to KERNBASE,
    298  * special care has to be taken when accessing memory because absolute
    299  * addresses (referring to kernel symbols) do not work.  So:
    300  *
    301  *     1) Avoid jumps to absolute addresses (such as gotos and switches).
    302  *     2) To access global variables use their physical address, which
    303  *        can be obtained using the RELOC macro.
    304  */
    305 void
    306 native_loader(int bl_boothowto, int bl_bootdev,
    307     struct bootinfo_source *bl_bootinfo, paddr_t bl_esym,
    308     int bl_biosextmem, int bl_biosbasemem)
    309 {
    310 #define RELOC(type, x) ((type)((vaddr_t)(x) - KERNBASE))
    311 
    312 	*RELOC(int *, &boothowto) = bl_boothowto;
    313 
    314 	/*
    315 	 * The boot loader provides a physical, non-relocated address
    316 	 * for the symbols table's end.  We need to convert it to a
    317 	 * virtual address.
    318 	 */
    319 	if (bl_esym != 0)
    320 		*RELOC(int **, &esym) = (int *)((vaddr_t)bl_esym + KERNBASE);
    321 	else
    322 		*RELOC(int **, &esym) = 0;
    323 
    324 	/*
    325 	 * Copy bootinfo entries (if any) from the boot loader's
    326 	 * representation to the kernel's bootinfo space.
    327 	 */
    328 	if (bl_bootinfo != NULL) {
    329 		size_t i;
    330 		uint8_t *data;
    331 		struct bootinfo *bidest;
    332 		struct btinfo_modulelist *bi;
    333 
    334 		bidest = RELOC(struct bootinfo *, &bootinfo);
    335 
    336 		data = &bidest->bi_data[0];
    337 
    338 		for (i = 0; i < bl_bootinfo->bs_naddrs; i++) {
    339 			struct btinfo_common *bc;
    340 
    341 			bc = bl_bootinfo->bs_addrs[i];
    342 
    343 			if ((data + bc->len) >
    344 			    (&bidest->bi_data[0] + BOOTINFO_MAXSIZE))
    345 				break;
    346 
    347 			memcpy(data, bc, bc->len);
    348 			/*
    349 			 * If any modules were loaded, record where they
    350 			 * end.  We'll need to skip over them.
    351 			 */
    352 			bi = (struct btinfo_modulelist *)data;
    353 			if (bi->common.type == BTINFO_MODULELIST) {
    354 				*RELOC(int **, &eblob) =
    355 				    (int *)(bi->endpa + KERNBASE);
    356 			}
    357 			data += bc->len;
    358 		}
    359 		bidest->bi_nentries = i;
    360 	}
    361 
    362 	/*
    363 	 * Configure biosbasemem and biosextmem only if they were not
    364 	 * explicitly given during the kernel's build.
    365 	 */
    366 	if (*RELOC(int *, &biosbasemem) == 0) {
    367 		*RELOC(int *, &biosbasemem) = bl_biosbasemem;
    368 		*RELOC(int *, &biosmem_implicit) = 1;
    369 	}
    370 	if (*RELOC(int *, &biosextmem) == 0) {
    371 		*RELOC(int *, &biosextmem) = bl_biosextmem;
    372 		*RELOC(int *, &biosmem_implicit) = 1;
    373 	}
    374 #undef RELOC
    375 }
    376 
    377 #endif /* XENPV */
    378 
    379 /*
    380  * Machine-dependent startup code
    381  */
    382 void
    383 cpu_startup(void)
    384 {
    385 	int x, y;
    386 	vaddr_t minaddr, maxaddr;
    387 	psize_t sz;
    388 
    389 	/*
    390 	 * For console drivers that require uvm and pmap to be initialized,
    391 	 * we'll give them one more chance here...
    392 	 */
    393 	consinit();
    394 
    395 	/*
	 * Initialize error message buffer (at end of core).
	 */
	if (msgbuf_p_cnt == 0)
		panic("msgbuf paddr map has not been set up");
	for (x = 0, sz = 0; x < msgbuf_p_cnt; sz += msgbuf_p_seg[x++].sz)
		continue;

	msgbuf_vaddr = uvm_km_alloc(kernel_map, sz, 0, UVM_KMF_VAONLY);
	if (msgbuf_vaddr == 0)
		panic("failed to valloc msgbuf_vaddr");

	for (y = 0, sz = 0; y < msgbuf_p_cnt; y++) {
		for (x = 0; x < btoc(msgbuf_p_seg[y].sz); x++, sz += PAGE_SIZE)
			pmap_kenter_pa((vaddr_t)msgbuf_vaddr + sz,
			    msgbuf_p_seg[y].paddr + x * PAGE_SIZE,
			    VM_PROT_READ|VM_PROT_WRITE, 0);
	}

	pmap_update(pmap_kernel());

	initmsgbuf((void *)msgbuf_vaddr, sz);

#ifdef MULTIBOOT
	multiboot1_print_info();
	multiboot2_print_info();
#endif

#if NCARDBUS > 0
	/* Tell RBUS how much RAM we have, so it can use heuristics. */
	rbus_min_start_hint(ctob((psize_t)physmem));
#endif

	minaddr = 0;

	/*
	 * Allocate a submap for physio
	 */
	phys_map = uvm_km_suballoc(kernel_map, &minaddr, &maxaddr,
	    VM_PHYS_SIZE, 0, false, NULL);

	/* Say hello. */
	banner();

	/* Safe for i/o port / memory space allocation to use malloc now. */
#if NISA > 0 || NPCI > 0
	x86_bus_space_mallocok();
#endif

	gdt_init();
	i386_proc0_pcb_ldt_init();

	cpu_init_tss(&cpu_info_primary);
#ifndef XENPV
	ltr(cpu_info_primary.ci_tss_sel);
#endif

	x86_startup();
}

/*
 * Set up proc0's PCB and LDT.
 */
static void
i386_proc0_pcb_ldt_init(void)
{
	struct lwp *l = &lwp0;
	struct pcb *pcb = lwp_getpcb(l);

	pcb->pcb_cr0 = rcr0() & ~CR0_TS;
	pcb->pcb_esp0 = uvm_lwp_getuarea(l) + USPACE - 16;
	pcb->pcb_iopl = IOPL_KPL;
	l->l_md.md_regs = (struct trapframe *)pcb->pcb_esp0 - 1;
	memcpy(&pcb->pcb_fsd, &gdtstore[GUDATA_SEL], sizeof(pcb->pcb_fsd));
	memcpy(&pcb->pcb_gsd, &gdtstore[GUDATA_SEL], sizeof(pcb->pcb_gsd));
	pcb->pcb_dbregs = NULL;

#ifndef XENPV
	lldt(GSEL(GLDT_SEL, SEL_KPL));
#else
	HYPERVISOR_fpu_taskswitch(1);
	HYPERVISOR_stack_switch(GSEL(GDATA_SEL, SEL_KPL), pcb->pcb_esp0);
#endif
}

#ifdef XENPV
/* used in assembly */
void i386_switch_context(lwp_t *);
void i386_tls_switch(lwp_t *);

/*
 * Switch context:
 * - switch stack pointer for user->kernel transition
 */
void
i386_switch_context(lwp_t *l)
{
	struct pcb *pcb;

	pcb = lwp_getpcb(l);

	HYPERVISOR_stack_switch(GSEL(GDATA_SEL, SEL_KPL), pcb->pcb_esp0);

	struct physdev_set_iopl set_iopl;
	set_iopl.iopl = pcb->pcb_iopl;
	HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl);
}

void
i386_tls_switch(lwp_t *l)
{
	struct cpu_info *ci = curcpu();
	struct pcb *pcb = lwp_getpcb(l);

	/*
	 * Raise the IPL to IPL_HIGH. XXX Still needed?
	 */
	(void)splhigh();

	/* Update TLS segment pointers */
	update_descriptor(&ci->ci_gdt[GUFS_SEL],
	    (union descriptor *)&pcb->pcb_fsd);
	update_descriptor(&ci->ci_gdt[GUGS_SEL],
	    (union descriptor *)&pcb->pcb_gsd);
}
#endif /* XENPV */

/* XXX */
#define IDTVEC(name)	__CONCAT(X, name)
typedef void (vector)(void);

#ifndef XENPV
static void	tss_init(struct i386tss *, void *, void *);

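/*
 * Set up a TSS for use by a hardware task gate: arrange for it to run
 * func in kernel mode, on the given stack, with the kernel pmap.
 */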
static void
tss_init(struct i386tss *tss, void *stack, void *func)
{
	KASSERT(curcpu()->ci_pmap == pmap_kernel());

	memset(tss, 0, sizeof *tss);
	tss->tss_esp0 = tss->tss_esp = (int)((char *)stack + USPACE - 16);
	tss->tss_ss0 = GSEL(GDATA_SEL, SEL_KPL);
	tss->__tss_cs = GSEL(GCODE_SEL, SEL_KPL);
	tss->tss_fs = GSEL(GCPU_SEL, SEL_KPL);
	tss->tss_gs = tss->__tss_es = tss->__tss_ds =
	    tss->__tss_ss = GSEL(GDATA_SEL, SEL_KPL);
	/* %cr3 contains the value associated with pmap_kernel() */
	tss->tss_cr3 = rcr3();
	tss->tss_esp = (int)((char *)stack + USPACE - 16);
	tss->tss_ldt = GSEL(GLDT_SEL, SEL_KPL);
	tss->__tss_eflags = PSL_MBO | PSL_NT;	/* XXX not needed? */
	tss->__tss_eip = (int)func;
}

extern vector IDTVEC(tss_trap08);
#if defined(DDB) && defined(MULTIPROCESSOR)
extern vector Xintr_ddbipi, Xintr_x2apic_ddbipi;
extern int ddb_vec;
#endif

void
cpu_set_tss_gates(struct cpu_info *ci)
{
	struct segment_descriptor sd;
	void *doubleflt_stack;
	idt_descriptor_t *idt;

	doubleflt_stack = (void *)uvm_km_alloc(kernel_map, USPACE, 0,
	    UVM_KMF_WIRED);
	tss_init(&ci->ci_tss->dblflt_tss, doubleflt_stack, IDTVEC(tss_trap08));

	setsegment(&sd, &ci->ci_tss->dblflt_tss, sizeof(struct i386tss) - 1,
	    SDT_SYS386TSS, SEL_KPL, 0, 0);
	ci->ci_gdt[GTRAPTSS_SEL].sd = sd;

	idt = cpu_info_primary.ci_idtvec.iv_idt;
	set_idtgate(&idt[8], NULL, 0, SDT_SYSTASKGT, SEL_KPL,
	    GSEL(GTRAPTSS_SEL, SEL_KPL));

#if defined(DDB) && defined(MULTIPROCESSOR)
	/*
	 * Set up separate handler for the DDB IPI, so that it doesn't
	 * stomp on a possibly corrupted stack.
	 *
	 * XXX overwriting the gate set in db_machine_init.
	 * Should rearrange the code so that it's set only once.
	 */
	void *ddbipi_stack;

	ddbipi_stack = (void *)uvm_km_alloc(kernel_map, USPACE, 0,
	    UVM_KMF_WIRED);
	tss_init(&ci->ci_tss->ddbipi_tss, ddbipi_stack,
	    x2apic_mode ? Xintr_x2apic_ddbipi : Xintr_ddbipi);

	setsegment(&sd, &ci->ci_tss->ddbipi_tss, sizeof(struct i386tss) - 1,
	    SDT_SYS386TSS, SEL_KPL, 0, 0);
	ci->ci_gdt[GIPITSS_SEL].sd = sd;

	set_idtgate(&idt[ddb_vec], NULL, 0, SDT_SYSTASKGT, SEL_KPL,
	    GSEL(GIPITSS_SEL, SEL_KPL));
#endif
}
#endif /* XENPV */

/*
 * Set up TSS and I/O bitmap.
 */
void
cpu_init_tss(struct cpu_info *ci)
{
	struct cpu_tss *cputss;

	cputss = (struct cpu_tss *)uvm_km_alloc(kernel_map,
	    sizeof(struct cpu_tss), 0, UVM_KMF_WIRED|UVM_KMF_ZERO);

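	/*
	 * Point the I/O bitmap base past the TSS limit: with no usable
	 * I/O permission bitmap, user I/O port access is governed by
	 * IOPL alone and normally faults.
	 */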
	cputss->tss.tss_iobase = IOMAP_INVALOFF << 16;
#ifndef XENPV
	cputss->tss.tss_ss0 = GSEL(GDATA_SEL, SEL_KPL);
	cputss->tss.tss_ldt = GSEL(GLDT_SEL, SEL_KPL);
	cputss->tss.tss_cr3 = rcr3();
#endif

	ci->ci_tss = cputss;
#ifndef XENPV
	ci->ci_tss_sel = tss_alloc(&cputss->tss);
#endif
}

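/*
 * Return the base of the stack the signal handler should run on: the
 * alternate signal stack if one was requested and is not already in
 * use, otherwise the current user stack.  *onstack reports which one
 * was chosen.
 */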
void *
getframe(struct lwp *l, int sig, int *onstack)
{
	struct proc *p = l->l_proc;
	struct trapframe *tf = l->l_md.md_regs;

	/* Do we need to jump onto the signal stack? */
	*onstack = (l->l_sigstk.ss_flags & (SS_DISABLE | SS_ONSTACK)) == 0
	    && (SIGACTION(p, sig).sa_flags & SA_ONSTACK) != 0;
	if (*onstack)
		return (char *)l->l_sigstk.ss_sp + l->l_sigstk.ss_size;
	return (void *)tf->tf_esp;
}

/*
 * Build context to run handler in.  We invoke the handler
 * directly, only returning via the trampoline.  Note the
 * trampoline version numbers are coordinated with machine-
 * dependent code in libc.
 */
void
buildcontext(struct lwp *l, int sel, void *catcher, void *fp)
{
	struct trapframe *tf = l->l_md.md_regs;

	tf->tf_gs = GSEL(GUGS_SEL, SEL_UPL);
	tf->tf_fs = GSEL(GUFS_SEL, SEL_UPL);
	tf->tf_es = GSEL(GUDATA_SEL, SEL_UPL);
	tf->tf_ds = GSEL(GUDATA_SEL, SEL_UPL);
	tf->tf_eip = (int)catcher;
	tf->tf_cs = GSEL(sel, SEL_UPL);
	tf->tf_eflags &= ~PSL_CLEARSIG;
	tf->tf_esp = (int)fp;
	tf->tf_ss = GSEL(GUDATA_SEL, SEL_UPL);

	/* Ensure FP state is reset. */
	fpu_sigreset(l);
}

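/*
 * A sketch of the frame that sendsig_siginfo() below copies out to
 * the user stack (member names as used in the code).  On handler
 * entry %esp points at sf_ra, so the handler sees sf_signum, sf_sip
 * and sf_ucp as its arguments and returns through the libc
 * trampoline:
 *
 *	sf_ra		ps->sa_sigdesc[sig].sd_tramp
 *	sf_signum	signal number (first handler argument)
 *	sf_sip		-> &fp->sf_si
 *	sf_ucp		-> &fp->sf_uc
 *	sf_si		siginfo_t
 *	sf_uc		ucontext_t (sigmask, stack flags, mcontext)
 */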
void
sendsig_siginfo(const ksiginfo_t *ksi, const sigset_t *mask)
{
	struct lwp *l = curlwp;
	struct proc *p = l->l_proc;
	struct pmap *pmap = vm_map_pmap(&p->p_vmspace->vm_map);
	int sel = pmap->pm_hiexec > I386_MAX_EXE_ADDR ?
	    GUCODEBIG_SEL : GUCODE_SEL;
	struct sigacts *ps = p->p_sigacts;
	int onstack, error;
	int sig = ksi->ksi_signo;
	struct sigframe_siginfo *fp = getframe(l, sig, &onstack), frame;
	sig_t catcher = SIGACTION(p, sig).sa_handler;

	KASSERT(mutex_owned(p->p_lock));

	fp--;
	fp = (struct sigframe_siginfo *)((uintptr_t)fp & ~STACK_ALIGNBYTES);

	memset(&frame, 0, sizeof(frame));
	frame.sf_ra = (int)ps->sa_sigdesc[sig].sd_tramp;
	frame.sf_signum = sig;
	frame.sf_sip = &fp->sf_si;
	frame.sf_ucp = &fp->sf_uc;
	frame.sf_si._info = ksi->ksi_info;
	frame.sf_uc.uc_flags = _UC_SIGMASK|_UC_VM;
	frame.sf_uc.uc_sigmask = *mask;
	frame.sf_uc.uc_link = l->l_ctxlink;
	frame.sf_uc.uc_flags |= (l->l_sigstk.ss_flags & SS_ONSTACK)
	    ? _UC_SETSTACK : _UC_CLRSTACK;

	sendsig_reset(l, sig);

	mutex_exit(p->p_lock);
	cpu_getmcontext(l, &frame.sf_uc.uc_mcontext, &frame.sf_uc.uc_flags);
	error = copyout(&frame, fp, sizeof(frame));
	mutex_enter(p->p_lock);

	if (error != 0) {
		/*
		 * Process has trashed its stack; give it an illegal
		 * instruction to halt it in its tracks.
		 */
		sigexit(l, SIGILL);
		/* NOTREACHED */
	}

	buildcontext(l, sel, catcher, fp);

	/* Remember that we're now on the signal stack. */
	if (onstack)
		l->l_sigstk.ss_flags |= SS_ONSTACK;
}

static void
maybe_dump(int howto)
{
	int s;

	/* Disable interrupts. */
	s = splhigh();

	/* Do a dump if requested. */
	if ((howto & (RB_DUMP | RB_HALT)) == RB_DUMP)
		dumpsys();

	splx(s);
}

void
cpu_reboot(int howto, char *bootstr)
{
	static bool syncdone = false;
	int s = IPL_NONE;

	if (cold) {
		howto |= RB_HALT;
		goto haltsys;
	}

	boothowto = howto;

	/* XXX used to dump after vfs_shutdown() and before
	 * detaching devices / shutdown hooks / pmf_system_shutdown().
	 */
	maybe_dump(howto);

	/*
	 * If we've panic'd, don't make the situation potentially
	 * worse by syncing or unmounting the file systems.
	 */
	if ((howto & RB_NOSYNC) == 0 && panicstr == NULL) {
		if (!syncdone) {
			syncdone = true;
			/* XXX used to force unmount as well, here */
			vfs_sync_all(curlwp);
		}

		while (vfs_unmountall1(curlwp, false, false) ||
		       config_detach_all(boothowto) ||
		       vfs_unmount_forceone(curlwp))
			;	/* do nothing */
	} else {
		if (!db_active)
			suspendsched();
	}

	pmf_system_shutdown(boothowto);

	s = splhigh();

	/* amd64 maybe_dump() */

haltsys:
	doshutdownhooks();

	if ((howto & RB_POWERDOWN) == RB_POWERDOWN) {
#if NACPICA > 0
		if (s != IPL_NONE)
			splx(s);

		acpi_enter_sleep_state(ACPI_STATE_S5);
#else
		__USE(s);
#endif
#ifdef XEN
		if (vm_guest == VM_GUEST_XENPV ||
		    vm_guest == VM_GUEST_XENPVH ||
		    vm_guest == VM_GUEST_XENPVHVM)
			HYPERVISOR_shutdown();
#endif /* XEN */
	}

#ifdef MULTIPROCESSOR
	cpu_broadcast_halt();
#endif /* MULTIPROCESSOR */

	if (howto & RB_HALT) {
#if NACPICA > 0
		acpi_disable();
#endif

		printf("\n");
		printf("The operating system has halted.\n");
		printf("Please press any key to reboot.\n\n");

#ifdef BEEP_ONHALT
		{
			int c;
			for (c = BEEP_ONHALT_COUNT; c > 0; c--) {
				sysbeep(BEEP_ONHALT_PITCH,
					BEEP_ONHALT_PERIOD * hz / 1000);
				delay(BEEP_ONHALT_PERIOD * 1000);
				sysbeep(0, BEEP_ONHALT_PERIOD * hz / 1000);
				delay(BEEP_ONHALT_PERIOD * 1000);
			}
		}
#endif

		cnpollc(1);	/* for proper keyboard command handling */
		if (cngetc() == 0) {
			/* no console attached, so just hlt */
			printf("No keyboard - cannot reboot after all.\n");
			for(;;) {
				x86_hlt();
			}
		}
		cnpollc(0);
	}

	printf("rebooting...\n");
	if (cpureset_delay > 0)
		delay(cpureset_delay * 1000);
	cpu_reset();
	for(;;) ;
	/*NOTREACHED*/
}

/*
 * Clear registers on exec
 */
void
setregs(struct lwp *l, struct exec_package *pack, vaddr_t stack)
{
	struct pmap *pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map);
	struct pcb *pcb = lwp_getpcb(l);
	struct trapframe *tf;

#ifdef USER_LDT
	pmap_ldt_cleanup(l);
#endif

	fpu_clear(l, pack->ep_osversion >= 699002600
	    ? __INITIAL_NPXCW__ : __NetBSD_COMPAT_NPXCW__);

	memcpy(&pcb->pcb_fsd, &gdtstore[GUDATA_SEL], sizeof(pcb->pcb_fsd));
	memcpy(&pcb->pcb_gsd, &gdtstore[GUDATA_SEL], sizeof(pcb->pcb_gsd));

	x86_dbregs_clear(l);

	tf = l->l_md.md_regs;
	memset(tf, 0, sizeof(*tf));

	tf->tf_trapno = T_ASTFLT;
	tf->tf_gs = GSEL(GUGS_SEL, SEL_UPL);
	tf->tf_fs = GSEL(GUFS_SEL, SEL_UPL);
	tf->tf_es = LSEL(LUDATA_SEL, SEL_UPL);
	tf->tf_ds = LSEL(LUDATA_SEL, SEL_UPL);
	tf->tf_edi = 0;
	tf->tf_esi = 0;
	tf->tf_ebp = 0;
	tf->tf_ebx = l->l_proc->p_psstrp;
	tf->tf_edx = 0;
	tf->tf_ecx = 0;
	tf->tf_eax = 0;
	tf->tf_eip = pack->ep_entry;
	tf->tf_cs = pmap->pm_hiexec > I386_MAX_EXE_ADDR ?
	    LSEL(LUCODEBIG_SEL, SEL_UPL) : LSEL(LUCODE_SEL, SEL_UPL);
	tf->tf_eflags = PSL_USERSET;
	tf->tf_esp = stack;
	tf->tf_ss = LSEL(LUDATA_SEL, SEL_UPL);
}

/*
 * Initialize segments and descriptor tables
 */

union descriptor *gdtstore, *ldtstore;
union descriptor *pentium_idt;
extern vaddr_t lwp0uarea;

void
setgate(struct gate_descriptor *gd, void *func, int args, int type, int dpl,
    int sel)
{

	gd->gd_looffset = (int)func;
	gd->gd_selector = sel;
	gd->gd_stkcpy = args;
	gd->gd_xx = 0;
	gd->gd_type = type;
	gd->gd_dpl = dpl;
	gd->gd_p = 1;
	gd->gd_hioffset = (int)func >> 16;
}

void
unsetgate(struct gate_descriptor *gd)
{

	gd->gd_p = 0;
	gd->gd_hioffset = 0;
	gd->gd_looffset = 0;
	gd->gd_selector = 0;
	gd->gd_xx = 0;
	gd->gd_stkcpy = 0;
	gd->gd_type = 0;
	gd->gd_dpl = 0;
}

void
setregion(struct region_descriptor *rd, void *base, size_t limit)
{

	rd->rd_limit = (int)limit;
	rd->rd_base = (int)base;
}

void
setsegment(struct segment_descriptor *sd, const void *base, size_t limit,
    int type, int dpl, int def32, int gran)
{

	sd->sd_lolimit = (int)limit;
	sd->sd_lobase = (int)base;
	sd->sd_type = type;
	sd->sd_dpl = dpl;
	sd->sd_p = 1;
	sd->sd_hilimit = (int)limit >> 16;
	sd->sd_xx = 0;
	sd->sd_def32 = def32;
	sd->sd_gran = gran;
	sd->sd_hibase = (int)base >> 24;
}
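
/*
 * For example, the flat 4GB kernel text segment built by initgdt()
 * below is
 *
 *	setsegment(&gdtstore[GCODE_SEL].sd, 0, 0xfffff,
 *	    SDT_MEMERA, SEL_KPL, 1, 1);
 *
 * i.e. base 0 with a page-granular limit of 0xfffff (covering the
 * whole 4GB address space) and a 32-bit default operand size.
 */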

/* XXX */
extern vector IDTVEC(syscall);
extern vector *IDTVEC(exceptions)[];
#ifdef XENPV
extern union descriptor tmpgdt[];
#endif

void
cpu_init_idt(struct cpu_info *ci)
{
	struct region_descriptor region;
	struct idt_vec *iv;
	idt_descriptor_t *idt;

	iv = &ci->ci_idtvec;
	idt = iv->iv_idt_pentium;
	setregion(&region, idt, NIDT * sizeof(idt[0]) - 1);
	lidt(&region);
}

/*
 * initgdt(tgdt)
 *
 *	Initialize a temporary Global Descriptor Table (GDT) using
 *	storage space at tgdt.
 *
 *	1. Set up segment descriptors for our purposes, including a
 *	   CPU-local segment descriptor pointing at &cpu_info_primary.
 *
 *	2. Load the address into the Global Descriptor Table Register.
 *
 *	3. Set up segment selectors for all the segment registers using
 *	   it so that %fs-relative addressing works for the CPU-local
 *	   data.
 *
 *	After this point, CPUVAR(...), curcpu(), and curlwp will work.
 *
 *	Eventually the kernel will switch to a second temporary GDT
 *	allocated with pmap_bootstrap_valloc in pmap_bootstrap, and
 *	then to a permanent GDT allocated with uvm_km(9) in gdt_init.
 *	But the first temporary GDT is needed now to get us going with
 *	early access to curcpu() and curlwp before we enter kernel
 *	main.
 *
 *	XXX The purpose of each of the segment descriptors should be
 *	written down somewhere in a single place that can be cross-
 *	referenced.
 *
 *	References:
 *
 *	- Intel 64 and IA-32 Architectures Software Developer's Manual,
 *	  Volume 3: System Programming Guide, Order Number 325384,
 *	  April 2022, Sec. 3.5.1 `Segment Descriptor Tables',
 *	  pp. 3-14 through 3-16.
 */
void
initgdt(union descriptor *tgdt)
{
	KASSERT(tgdt != NULL);

	gdtstore = tgdt;
#ifdef XENPV
	u_long	frames[16];
#else
	struct region_descriptor region;
	memset(gdtstore, 0, NGDT * sizeof(*gdtstore));
#endif

	/* make gdt gates and memory segments */
	setsegment(&gdtstore[GCODE_SEL].sd, 0, 0xfffff,
	    SDT_MEMERA, SEL_KPL, 1, 1);
	setsegment(&gdtstore[GDATA_SEL].sd, 0, 0xfffff,
	    SDT_MEMRWA, SEL_KPL, 1, 1);
	setsegment(&gdtstore[GUCODE_SEL].sd, 0, x86_btop(I386_MAX_EXE_ADDR) - 1,
	    SDT_MEMERA, SEL_UPL, 1, 1);
	setsegment(&gdtstore[GUCODEBIG_SEL].sd, 0, 0xfffff,
	    SDT_MEMERA, SEL_UPL, 1, 1);
	setsegment(&gdtstore[GUDATA_SEL].sd, 0, 0xfffff,
	    SDT_MEMRWA, SEL_UPL, 1, 1);
#if NBIOSCALL > 0 && !defined(XENPV)
	/* bios trampoline GDT entries */
	setsegment(&gdtstore[GBIOSCODE_SEL].sd, 0, 0xfffff,
	    SDT_MEMERA, SEL_KPL, 0, 0);
	setsegment(&gdtstore[GBIOSDATA_SEL].sd, 0, 0xfffff,
	    SDT_MEMRWA, SEL_KPL, 0, 0);
#endif
	setsegment(&gdtstore[GCPU_SEL].sd, &cpu_info_primary,
	    sizeof(struct cpu_info) - 1, SDT_MEMRWA, SEL_KPL, 1, 0);

#ifndef XENPV
	setregion(&region, gdtstore, NGDT * sizeof(gdtstore[0]) - 1);
	lgdt(&region);
#else /* !XENPV */
	/*
	 * We jumpstart the bootstrap process a bit so we can update
	 * page permissions. This is done redundantly later from
	 * x86_xpmap.c:xen_locore() - harmless.
	 */
	xpmap_phys_to_machine_mapping =
	    (unsigned long *)xen_start_info.mfn_list;

	frames[0] = xpmap_ptom((uint32_t)gdtstore - KERNBASE) >> PAGE_SHIFT;
	{	/*
		 * Enter the gdt page RO into the kernel map. We can't
		 * use pmap_kenter_pa() here, because %fs is not
		 * usable until the gdt is loaded, and %fs is used as
		 * the base pointer for curcpu() and curlwp(), both of
		 * which are in the callpath of pmap_kenter_pa().
		 * So we mash up our own - this is MD code anyway.
		 */
		extern pt_entry_t xpmap_pg_nx;
		pt_entry_t pte;

		pte = pmap_pa2pte((vaddr_t)gdtstore - KERNBASE);
		pte |= xpmap_pg_nx | PTE_P;

		if (HYPERVISOR_update_va_mapping((vaddr_t)gdtstore, pte,
		    UVMF_INVLPG) < 0) {
			panic("gdt page RO update failed.\n");
		}
	}

	if (HYPERVISOR_set_gdt(frames, NGDT /* XXX is it right ? */))
		panic("HYPERVISOR_set_gdt failed!\n");

	lgdt_finish();
#endif /* !XENPV */
}
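
/*
 * Aside (a sketch; GSEL is defined in <machine/segments.h>): a
 * selector for entry n of this GDT is GSEL(n, rpl), i.e.
 * (n << 3) | rpl, so GSEL(GCODE_SEL, SEL_KPL) names the flat kernel
 * code segment built above.
 */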

#if !defined(XENPV) && NBIOSCALL > 0
static void
init386_pte0(void)
{
	paddr_t paddr;
	vaddr_t vaddr;

	paddr = 4 * PAGE_SIZE;
	vaddr = (vaddr_t)vtopte(0);
	pmap_kenter_pa(vaddr, paddr, VM_PROT_ALL, 0);
	pmap_update(pmap_kernel());
	/* make sure it is clean before using */
	memset((void *)vaddr, 0, PAGE_SIZE);
}
#endif /* !XENPV && NBIOSCALL > 0 */

#ifndef XENPV
static void
init386_ksyms(void)
{
#if NKSYMS || defined(DDB) || defined(MODULAR)
	extern int end;
	struct btinfo_symtab *symtab;

#ifdef DDB
	db_machine_init();
#endif

#if defined(MULTIBOOT)
	if (multiboot1_ksyms_addsyms_elf())
		return;

	if (multiboot2_ksyms_addsyms_elf())
		return;
#endif

#ifdef XEN
	if (pvh_boot && vm_guest != VM_GUEST_XENPVH) {
		ksyms_addsyms_elf(0, ((int *)&end) + 1, esym);
		return;
	}
#endif

	if ((symtab = lookup_bootinfo(BTINFO_SYMTAB)) == NULL) {
		ksyms_addsyms_elf(*(int *)&end, ((int *)&end) + 1, esym);
		return;
	}

	symtab->ssym += KERNBASE;
	symtab->esym += KERNBASE;
	ksyms_addsyms_elf(symtab->nsym, (int *)symtab->ssym, (int *)symtab->esym);
#endif
}
#endif /* XENPV */

void
init_bootspace(void)
{
	extern char __rodata_start;
	extern char __data_start;
	extern char __kernel_end;
	size_t i = 0;

	memset(&bootspace, 0, sizeof(bootspace));

	bootspace.head.va = KERNTEXTOFF;
	bootspace.head.pa = KERNTEXTOFF - KERNBASE;
	bootspace.head.sz = 0;

	bootspace.segs[i].type = BTSEG_TEXT;
	bootspace.segs[i].va = KERNTEXTOFF;
	bootspace.segs[i].pa = KERNTEXTOFF - KERNBASE;
	bootspace.segs[i].sz = (size_t)&__rodata_start - KERNTEXTOFF;
	i++;

	bootspace.segs[i].type = BTSEG_RODATA;
	bootspace.segs[i].va = (vaddr_t)&__rodata_start;
	bootspace.segs[i].pa = (paddr_t)(vaddr_t)&__rodata_start - KERNBASE;
	bootspace.segs[i].sz = (size_t)&__data_start - (size_t)&__rodata_start;
	i++;

	bootspace.segs[i].type = BTSEG_DATA;
	bootspace.segs[i].va = (vaddr_t)&__data_start;
	bootspace.segs[i].pa = (paddr_t)(vaddr_t)&__data_start - KERNBASE;
	bootspace.segs[i].sz = (size_t)&__kernel_end - (size_t)&__data_start;
	i++;

	bootspace.boot.va = (vaddr_t)&__kernel_end;
	bootspace.boot.pa = (paddr_t)(vaddr_t)&__kernel_end - KERNBASE;
	bootspace.boot.sz = (size_t)(atdevbase + IOM_SIZE) -
	    (size_t)&__kernel_end;

	/* Virtual address of the top level page */
	bootspace.pdir = (vaddr_t)(PDPpaddr + KERNBASE);
}
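
/*
 * The resulting layout, as derived from the assignments above:
 *
 *	head:	KERNTEXTOFF, empty
 *	text:	[KERNTEXTOFF, __rodata_start)
 *	rodata:	[__rodata_start, __data_start)
 *	data:	[__data_start, __kernel_end)
 *	boot:	[__kernel_end, atdevbase + IOM_SIZE)
 */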

void
init386(paddr_t first_avail)
{
	extern void consinit(void);
	int x;
#ifndef XENPV
	extern paddr_t local_apic_pa;
	union descriptor *tgdt;
	struct region_descriptor region;
#if NBIOSCALL > 0
	extern int biostramp_image_size;
	extern u_char biostramp_image[];
#endif
#endif /* !XENPV */
	struct pcb *pcb;
	struct idt_vec *iv;
	idt_descriptor_t *idt;

	KASSERT(first_avail % PAGE_SIZE == 0);

#ifdef XENPV
	KASSERT(HYPERVISOR_shared_info != NULL);
	cpu_info_primary.ci_vcpu = &HYPERVISOR_shared_info->vcpu_info[0];
#endif

#ifdef XEN
	if (pvh_boot)
		xen_parse_cmdline(XEN_PARSE_BOOTFLAGS, NULL);
#endif

	uvm_lwp_setuarea(&lwp0, lwp0uarea);

	cpu_probe(&cpu_info_primary);

	/*
	 * Initialize the no-execute bit on cpu0, if supported.
	 *
	 * Note: The call to cpu_init_msrs for secondary CPUs happens
	 * in cpu_hatch.
	 */
	cpu_init_msrs(&cpu_info_primary, true);

#ifndef XENPV
	cpu_speculation_init(&cpu_info_primary);
#endif

#ifdef PAE
	use_pae = 1;
#else
	use_pae = 0;
#endif

	pcb = lwp_getpcb(&lwp0);
#ifdef XENPV
	pcb->pcb_cr3 = PDPpaddr;
#endif

#if defined(PAE) && !defined(XENPV)
	/*
	 * Save VA and PA of L3 PD of boot processor (for Xen, this is done
	 * in xen_locore())
	 */
	cpu_info_primary.ci_pae_l3_pdirpa = rcr3();
	cpu_info_primary.ci_pae_l3_pdir = (pd_entry_t *)(rcr3() + KERNBASE);
#endif

	uvm_md_init();

	/*
	 * Start with 2 color bins -- this is just a guess to get us
	 * started.  We'll recolor when we determine the largest cache
	 * sizes on the system.
	 */
	uvmexp.ncolors = 2;

	avail_start = first_avail;

#ifndef XENPV
	/*
	 * Low memory reservations:
	 * Page 0:	BIOS data
	 * Page 1:	BIOS callback
	 * Page 2:	MP bootstrap code (MP_TRAMPOLINE)
	 * Page 3:	ACPI wakeup code (ACPI_WAKEUP_ADDR)
	 * Page 4:	Temporary page table for 0MB-4MB
	 * Page 5:	Temporary page directory
	 */
	lowmem_rsvd = 6 * PAGE_SIZE;
#else /* !XENPV */
	/* Parse Xen command line (replace bootinfo) */
	xen_parse_cmdline(XEN_PARSE_BOOTFLAGS, NULL);

	/* Use the dummy page as a gdt */
	extern vaddr_t xen_dummy_page;
	gdtstore = (void *)xen_dummy_page;

	/* Determine physical address space */
	avail_end = ctob((paddr_t)xen_start_info.nr_pages);
	pmap_pa_start = (KERNTEXTOFF - KERNBASE);
	pmap_pa_end = pmap_pa_start + ctob((paddr_t)xen_start_info.nr_pages);
	mem_clusters[0].start = avail_start;
	mem_clusters[0].size = avail_end - avail_start;
	mem_cluster_cnt++;
	physmem += xen_start_info.nr_pages;
	uvmexp.wired += atop(avail_start);

	/*
	 * initgdt() has to be done before consinit(), so that %fs is properly
	 * initialised. initgdt() uses pmap_kenter_pa so it can't be called
	 * before the above variables are set.
	 */
	initgdt(gdtstore);

	mutex_init(&pte_lock, MUTEX_DEFAULT, IPL_VM);
#endif /* XENPV */

#if NISA > 0 || NPCI > 0
	x86_bus_space_init();
#endif

	consinit();	/* XXX SHOULD NOT BE DONE HERE */

#ifdef DEBUG_MEMLOAD
	printf("mem_cluster_count: %d\n", mem_cluster_cnt);
#endif

	/*
	 * Call pmap initialization to make new kernel address space.
	 * We must do this before loading pages into the VM system.
	 */
	pmap_bootstrap((vaddr_t)atdevbase + IOM_SIZE);

	/*
	 * Initialize RNG to get entropy ASAP either from CPU
	 * RDRAND/RDSEED or from seed on disk.  Constraints:
	 *
	 * - Must happen after cpu_init_msrs so that curcpu() and
	 *   curlwp work.
	 *
	 * - Must happen after consinit so we have the opportunity to
	 *   print useful feedback.
	 *
	 * - On KASLR kernels, must happen after pmap_bootstrap because
	 *   x86_rndseed requires access to the direct map.
	 */
	cpu_rng_init();
	x86_rndseed();

#ifndef XENPV
	/* Initialize the memory clusters. */
	init_x86_clusters();

	/* Internalize the physical pages into the VM system. */
	init_x86_vm(avail_start);
#else /* !XENPV */
	uvm_page_physload(atop(avail_start), atop(avail_end),
	    atop(avail_start), atop(avail_end),
	    VM_FREELIST_DEFAULT);

	/* Reclaim the boot gdt page - see locore.s */
	{
		extern pt_entry_t xpmap_pg_nx;
		pt_entry_t pte;

		pte = pmap_pa2pte((vaddr_t)tmpgdt - KERNBASE);
		pte |= PTE_W | xpmap_pg_nx | PTE_P;

		if (HYPERVISOR_update_va_mapping((vaddr_t)tmpgdt, pte, UVMF_INVLPG) < 0) {
			panic("tmpgdt page reclaim RW update failed.\n");
		}
	}
#endif /* !XENPV */

	init_x86_msgbuf();

#if !defined(XENPV) && NBIOSCALL > 0
	/*
	 * XXX Remove this
	 *
	 * Setup a temporary Page Table Entry to allow identity mappings of
	 * the real mode address. This is required by bioscall.
	 */
	init386_pte0();

	KASSERT(biostramp_image_size <= PAGE_SIZE);
	pmap_kenter_pa((vaddr_t)BIOSTRAMP_BASE, (paddr_t)BIOSTRAMP_BASE,
	    VM_PROT_ALL, 0);
	pmap_update(pmap_kernel());
	memcpy((void *)BIOSTRAMP_BASE, biostramp_image, biostramp_image_size);

	/* Needed early, for bioscall() */
	cpu_info_primary.ci_pmap = pmap_kernel();
#endif

#ifndef XENPV
	pmap_kenter_pa(local_apic_va, local_apic_pa,
	    VM_PROT_READ|VM_PROT_WRITE, 0);
	pmap_update(pmap_kernel());
	memset((void *)local_apic_va, 0, PAGE_SIZE);
#endif

	pmap_kenter_pa(idt_vaddr, idt_paddr, VM_PROT_READ|VM_PROT_WRITE, 0);
	pmap_kenter_pa(gdt_vaddr, gdt_paddr, VM_PROT_READ|VM_PROT_WRITE, 0);
	pmap_kenter_pa(ldt_vaddr, ldt_paddr, VM_PROT_READ|VM_PROT_WRITE, 0);
	pmap_update(pmap_kernel());
	memset((void *)idt_vaddr, 0, PAGE_SIZE);
	memset((void *)gdt_vaddr, 0, PAGE_SIZE);
	memset((void *)ldt_vaddr, 0, PAGE_SIZE);

	pmap_kenter_pa(pentium_idt_vaddr, idt_paddr, VM_PROT_READ, 0);
	pmap_update(pmap_kernel());
	iv = &(cpu_info_primary.ci_idtvec);
	idt_vec_init_cpu_md(iv, cpu_index(&cpu_info_primary));
	idt = (idt_descriptor_t *)iv->iv_idt;

#ifndef XENPV
	/*
	 * Switch from the initial temporary GDT that was allocated on
	 * the stack by our caller, start.  That temporary GDT will be
	 * popped off the stack when init386 returns before start calls
	 * main, so we need to use a second temporary GDT allocated in
	 * pmap_bootstrap with pmap_bootstrap_valloc/palloc to make
	 * sure at least the CPU-local data area, used by CPUVAR(...),
	 * curcpu(), and curlwp via %fs-relative addressing, will
	 * continue to work.
	 *
	 * Later, in gdt_init via cpu_startup, we will finally allocate
	 * a permanent GDT with uvm_km(9).
	 *
	 * The content of the second temporary GDT is the same as the
	 * content of the initial GDT, initialized in initgdt, except
	 * for the address of the LDT, since we are also switching to
	 * a new temporary LDT at a new address.
	 */
	tgdt = gdtstore;
	gdtstore = (union descriptor *)gdt_vaddr;
	ldtstore = (union descriptor *)ldt_vaddr;

	memcpy(gdtstore, tgdt, NGDT * sizeof(*gdtstore));

	setsegment(&gdtstore[GLDT_SEL].sd, ldtstore,
	    NLDT * sizeof(ldtstore[0]) - 1, SDT_SYSLDT, SEL_KPL, 0, 0);
#else
	HYPERVISOR_set_callbacks(
	    GSEL(GCODE_SEL, SEL_KPL), (unsigned long)hypervisor_callback,
	    GSEL(GCODE_SEL, SEL_KPL), (unsigned long)failsafe_callback);

	ldtstore = (union descriptor *)ldt_vaddr;
#endif /* XENPV */

	/* make ldt gates and memory segments */
	ldtstore[LUCODE_SEL] = gdtstore[GUCODE_SEL];
	ldtstore[LUCODEBIG_SEL] = gdtstore[GUCODEBIG_SEL];
	ldtstore[LUDATA_SEL] = gdtstore[GUDATA_SEL];

	/* exceptions */
	for (x = 0; x < 32; x++) {
		/* Reset to default. Special cases below */
		int sel;
#ifdef XENPV
		sel = SEL_XEN;
#else
		sel = SEL_KPL;
#endif /* XENPV */

		idt_vec_reserve(iv, x);

		switch (x) {
#ifdef XENPV
		case 2:  /* NMI */
		case 18: /* MCA */
			sel |= 0x4; /* Auto EOI/mask */
			break;
#endif /* XENPV */
		case 3:
		case 4:
			sel = SEL_UPL;
			break;
		default:
			break;
		}
		set_idtgate(&idt[x], IDTVEC(exceptions)[x], 0, SDT_SYS386IGT,
		    sel, GSEL(GCODE_SEL, SEL_KPL));
	}

	/* new-style interrupt gate for syscalls */
	idt_vec_reserve(iv, 128);
	set_idtgate(&idt[128], &IDTVEC(syscall), 0, SDT_SYS386IGT, SEL_UPL,
	    GSEL(GCODE_SEL, SEL_KPL));
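
	/*
	 * Vector 128 is 0x80: user programs enter here with the system
	 * call number in %eax.  A sketch of the userland side (not code
	 * from this file):
	 *
	 *	movl	$SYS_getpid, %eax
	 *	int	$0x80
	 */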

#ifndef XENPV
	/*
	 * Activate the second temporary GDT, allocated in
	 * pmap_bootstrap with pmap_bootstrap_valloc/palloc, and
	 * initialized with the content of the initial temporary GDT in
	 * initgdt, plus an updated LDT.
	 *
	 * This ensures the %fs-relative addressing for the CPU-local
	 * area used by CPUVAR(...), curcpu(), and curlwp will continue
	 * to work after init386 returns and the initial temporary GDT
	 * is popped off, before we call main and later create a
	 * permanent GDT in gdt_init via cpu_startup.
	 */
	setregion(&region, gdtstore, NGDT * sizeof(gdtstore[0]) - 1);
	lgdt(&region);
#endif

	lldt(GSEL(GLDT_SEL, SEL_KPL));
	cpu_init_idt(&cpu_info_primary);

#ifdef XENPV
	xen_init_ksyms();
#else /* XENPV */
#ifdef XEN
	if (vm_guest == VM_GUEST_XENPVH)
		xen_init_ksyms();
	else
#endif /* XEN */
		init386_ksyms();
#endif /* XENPV */

#if NMCA > 0
	/*
	 * Check for MCA bus; this has to be done before the ISA code,
	 * because if MCA is detected, ISA needs to use level-triggered
	 * interrupts by default.  We do not search for MCA using
	 * bioscall() on EFI systems that lack it (they lack MCA too,
	 * anyway).
	 */
	if (lookup_bootinfo(BTINFO_EFI) == NULL && vm_guest != VM_GUEST_XENPVH)
		mca_busprobe();
#endif

#ifdef XENPV
	extern int tmpstk;
	cpu_info_primary.ci_intrstack = &tmpstk;
	events_default_setup();
#else
	intr_default_setup();
#endif

	splraise(IPL_HIGH);
	x86_enable_intr();

#ifdef DDB
	if (boothowto & RB_KDB)
		Debugger();
#endif
#ifdef KGDB
	kgdb_port_init();
	if (boothowto & RB_KDB) {
		kgdb_debug_init = 1;
		kgdb_connect(1);
	}
#endif

	if (physmem < btoc(2 * 1024 * 1024)) {
		printf("warning: too little memory available; "
		       "have %lu bytes, want %lu bytes\n"
		       "running in degraded mode\n"
		       "press a key to confirm\n\n",
		       (unsigned long)ptoa(physmem), 2*1024*1024UL);
		cngetc();
	}

	pcb->pcb_dbregs = NULL;
	x86_dbregs_init();
}

#include <dev/ic/mc146818reg.h>		/* for NVRAM POST */
#include <i386/isa/nvram.h>		/* for NVRAM POST */

void
cpu_reset(void)
{
#ifdef XENPV
	HYPERVISOR_reboot();
	for (;;);
#else /* XENPV */
	struct region_descriptor region;
	idt_descriptor_t *idt;

	idt = (idt_descriptor_t *)cpu_info_primary.ci_idtvec.iv_idt;
	x86_disable_intr();

	/*
	 * Ensure the NVRAM reset byte contains something vaguely sane.
	 */

	outb(IO_RTC, NVRAM_RESET);
	outb(IO_RTC+1, NVRAM_RESET_RST);

	/*
	 * Reset AMD Geode SC1100.
	 *
	 * 1) Write PCI Configuration Address Register (0xcf8) to
	 *    select Function 0, Register 0x44: Bridge Configuration,
	 *    GPIO and LPC Configuration Register Space, Reset
	 *    Control Register.
	 *
	 * 2) Write 0xf to PCI Configuration Data Register (0xcfc)
	 *    to reset IDE controller, IDE bus, and PCI bus, and
	 *    to trigger a system-wide reset.
	 *
	 * See AMD Geode SC1100 Processor Data Book, Revision 2.0,
	 * sections 6.3.1, 6.3.2, and 6.4.1.
	 */
	if (cpu_info_primary.ci_signature == 0x540) {
		outl(0xcf8, 0x80009044);
		outl(0xcfc, 0xf);
	}

	x86_reset();

	/*
	 * Try to cause a triple fault and watchdog reset by making the IDT
	 * invalid and causing a fault.
	 */
	memset((void *)idt, 0, NIDT * sizeof(idt[0]));
	setregion(&region, idt, NIDT * sizeof(idt[0]) - 1);
	lidt(&region);
	breakpoint();

#if 0
	/*
	 * Try to cause a triple fault and watchdog reset by unmapping the
	 * entire address space and doing a TLB flush.
	 */
	memset((void *)PTD, 0, PAGE_SIZE);
	tlbflush();
#endif

	for (;;);
#endif /* XENPV */
}

void
cpu_getmcontext(struct lwp *l, mcontext_t *mcp, unsigned int *flags)
{
	const struct trapframe *tf = l->l_md.md_regs;
	__greg_t *gr = mcp->__gregs;
	__greg_t ras_eip;

	/* Save register context. */
	gr[_REG_GS]  = tf->tf_gs;
	gr[_REG_FS]  = tf->tf_fs;
	gr[_REG_ES]  = tf->tf_es;
	gr[_REG_DS]  = tf->tf_ds;
	gr[_REG_EFL] = tf->tf_eflags;

	gr[_REG_EDI]    = tf->tf_edi;
	gr[_REG_ESI]    = tf->tf_esi;
	gr[_REG_EBP]    = tf->tf_ebp;
	gr[_REG_EBX]    = tf->tf_ebx;
	gr[_REG_EDX]    = tf->tf_edx;
	gr[_REG_ECX]    = tf->tf_ecx;
	gr[_REG_EAX]    = tf->tf_eax;
	gr[_REG_EIP]    = tf->tf_eip;
	gr[_REG_CS]     = tf->tf_cs;
	gr[_REG_ESP]    = tf->tf_esp;
	gr[_REG_UESP]   = tf->tf_esp;
	gr[_REG_SS]     = tf->tf_ss;
	gr[_REG_TRAPNO] = tf->tf_trapno;
	gr[_REG_ERR]    = tf->tf_err;

	if ((ras_eip = (__greg_t)ras_lookup(l->l_proc,
	    (void *) gr[_REG_EIP])) != -1)
		gr[_REG_EIP] = ras_eip;

	*flags |= _UC_CPU;

	mcp->_mc_tlsbase = (uintptr_t)l->l_private;
	*flags |= _UC_TLSBASE;

	/*
	 * Save floating point register context.
	 *
	 * If the cpu doesn't support fxsave we must still write to
	 * the entire 512 byte area - otherwise we leak kernel memory
	 * contents to userspace.
	 * It wouldn't matter if we were doing the copyout here.
	 * So we might as well convert to fxsave format.
	 */
	__CTASSERT(sizeof (struct fxsave) ==
	    sizeof mcp->__fpregs.__fp_reg_set.__fp_xmm_state);
	process_read_fpregs_xmm(l, (struct fxsave *)
	    &mcp->__fpregs.__fp_reg_set.__fp_xmm_state);
	memset(&mcp->__fpregs.__fp_pad, 0, sizeof mcp->__fpregs.__fp_pad);
	*flags |= _UC_FXSAVE | _UC_FPU;
}

int
cpu_mcontext_validate(struct lwp *l, const mcontext_t *mcp)
{
	const __greg_t *gr = mcp->__gregs;
	struct trapframe *tf = l->l_md.md_regs;

	/*
	 * Check for security violations.  If we're returning
	 * to protected mode, the CPU will validate the segment
	 * registers automatically and generate a trap on
	 * violations.  We handle the trap, rather than doing
	 * all of the checking here.
	 */
	if (((gr[_REG_EFL] ^ tf->tf_eflags) & PSL_USERSTATIC) ||
	    !USERMODE(gr[_REG_CS]))
		return EINVAL;

	return 0;
}

int
cpu_setmcontext(struct lwp *l, const mcontext_t *mcp, unsigned int flags)
{
	struct trapframe *tf = l->l_md.md_regs;
	const __greg_t *gr = mcp->__gregs;
	struct proc *p = l->l_proc;
	int error;

	/* Restore register context, if any. */
	if ((flags & _UC_CPU) != 0) {
		error = cpu_mcontext_validate(l, mcp);
		if (error)
			return error;

		tf->tf_gs = gr[_REG_GS];
		tf->tf_fs = gr[_REG_FS];
		tf->tf_es = gr[_REG_ES];
		tf->tf_ds = gr[_REG_DS];
		/* Only change the user-alterable part of eflags */
		tf->tf_eflags &= ~PSL_USER;
		tf->tf_eflags |= (gr[_REG_EFL] & PSL_USER);

		tf->tf_edi    = gr[_REG_EDI];
		tf->tf_esi    = gr[_REG_ESI];
		tf->tf_ebp    = gr[_REG_EBP];
		tf->tf_ebx    = gr[_REG_EBX];
		tf->tf_edx    = gr[_REG_EDX];
		tf->tf_ecx    = gr[_REG_ECX];
		tf->tf_eax    = gr[_REG_EAX];
		tf->tf_eip    = gr[_REG_EIP];
		tf->tf_cs     = gr[_REG_CS];
		tf->tf_esp    = gr[_REG_UESP];
		tf->tf_ss     = gr[_REG_SS];
	}

	if ((flags & _UC_TLSBASE) != 0)
		lwp_setprivate(l, (void *)(uintptr_t)mcp->_mc_tlsbase);

	/* Restore floating point register context, if given. */
	if ((flags & _UC_FPU) != 0) {
		__CTASSERT(sizeof (struct fxsave) ==
		    sizeof mcp->__fpregs.__fp_reg_set.__fp_xmm_state);
		__CTASSERT(sizeof (struct save87) ==
		    sizeof mcp->__fpregs.__fp_reg_set.__fpchip_state);

		if (flags & _UC_FXSAVE) {
			process_write_fpregs_xmm(l, (const struct fxsave *)
				    &mcp->__fpregs.__fp_reg_set.__fp_xmm_state);
		} else {
			process_write_fpregs_s87(l, (const struct save87 *)
				    &mcp->__fpregs.__fp_reg_set.__fpchip_state);
		}
	}

	mutex_enter(p->p_lock);
	if (flags & _UC_SETSTACK)
		l->l_sigstk.ss_flags |= SS_ONSTACK;
	if (flags & _UC_CLRSTACK)
		l->l_sigstk.ss_flags &= ~SS_ONSTACK;
	mutex_exit(p->p_lock);
	return (0);
}

#define	DEV_IO 14		/* iopl for compat_10 */

int
mm_md_open(dev_t dev, int flag, int mode, struct lwp *l)
{

	switch (minor(dev)) {
	case DEV_IO:
		/*
		 * This is done by i386_iopl(3) now.
		 *
		 * #if defined(COMPAT_10) || defined(COMPAT_FREEBSD)
		 */
		if (flag & FWRITE) {
			struct trapframe *fp;
			int error;

			error = kauth_authorize_machdep(l->l_cred,
			    KAUTH_MACHDEP_IOPL, NULL, NULL, NULL, NULL);
			if (error)
				return (error);
			fp = curlwp->l_md.md_regs;
			fp->tf_eflags |= PSL_IOPL;
		}
		break;
	default:
		break;
	}
	return 0;
}

#ifdef PAE
void
cpu_alloc_l3_page(struct cpu_info *ci)
{
	int ret;
	struct pglist pg;
	struct vm_page *vmap;

	KASSERT(ci != NULL);
	/*
	 * Allocate a page for the per-CPU L3 PD.  cr3 being 32 bits, the
	 * PA must reside below the 4GB boundary.
	 */
	ret = uvm_pglistalloc(PAGE_SIZE, 0, 0x100000000ULL, 32, 0, &pg, 1, 0);
	vmap = TAILQ_FIRST(&pg);

	if (ret != 0 || vmap == NULL)
		panic("%s: failed to allocate L3 pglist for CPU %d (ret %d)\n",
			__func__, cpu_index(ci), ret);

	ci->ci_pae_l3_pdirpa = VM_PAGE_TO_PHYS(vmap);

	ci->ci_pae_l3_pdir = (paddr_t *)uvm_km_alloc(kernel_map, PAGE_SIZE, 0,
		UVM_KMF_VAONLY | UVM_KMF_NOWAIT);
	if (ci->ci_pae_l3_pdir == NULL)
		panic("%s: failed to allocate L3 PD for CPU %d\n",
			__func__, cpu_index(ci));

	pmap_kenter_pa((vaddr_t)ci->ci_pae_l3_pdir, ci->ci_pae_l3_pdirpa,
		VM_PROT_READ | VM_PROT_WRITE, 0);

	pmap_update(pmap_kernel());
}
#endif /* PAE */

static void
idt_vec_copy(struct idt_vec *dst, struct idt_vec *src)
{
	idt_descriptor_t *idt_dst;

	idt_dst = dst->iv_idt;
	memcpy(idt_dst, src->iv_idt, PAGE_SIZE);
	memcpy(dst->iv_allocmap, src->iv_allocmap, sizeof(dst->iv_allocmap));
}

void
idt_vec_init_cpu_md(struct idt_vec *iv, cpuid_t cid)
{
	vaddr_t va_idt, va_pentium_idt;
	struct vm_page *pg;

	if (idt_vec_is_pcpu() &&
	    cid != cpu_index(&cpu_info_primary)) {
		va_idt = uvm_km_alloc(kernel_map, PAGE_SIZE,
		    0, UVM_KMF_VAONLY);
		pg = uvm_pagealloc(NULL, 0, NULL, UVM_PGA_ZERO);
		if (pg == NULL) {
			panic("failed to allocate pcpu idt PA");
		}
		pmap_kenter_pa(va_idt, VM_PAGE_TO_PHYS(pg),
		    VM_PROT_READ|VM_PROT_WRITE, 0);
		pmap_update(pmap_kernel());

		memset((void *)va_idt, 0, PAGE_SIZE);

		/* pentium f00f bug stuff */
		va_pentium_idt = uvm_km_alloc(kernel_map, PAGE_SIZE,
		    0, UVM_KMF_VAONLY);
		pmap_kenter_pa(va_pentium_idt, VM_PAGE_TO_PHYS(pg),
		    VM_PROT_READ, 0);
		pmap_update(pmap_kernel());

		iv->iv_idt = (void *)va_idt;
		iv->iv_idt_pentium = (void *)va_pentium_idt;

		idt_vec_copy(iv, &(cpu_info_primary.ci_idtvec));
	} else {
		iv->iv_idt = (void *)idt_vaddr;
		iv->iv_idt_pentium = (void *)pentium_idt_vaddr;
	}
}
   1858