      1 /*	$NetBSD: x86_machdep.c,v 1.160 2025/12/05 17:58:12 khorben Exp $	*/
      2 
      3 /*-
      4  * Copyright (c) 2002, 2006, 2007 YAMAMOTO Takashi,
      5  * Copyright (c) 2005, 2008, 2009, 2019, 2023 The NetBSD Foundation, Inc.
      6  * All rights reserved.
      7  *
      8  * This code is derived from software contributed to The NetBSD Foundation
      9  * by Julio M. Merino Vidal, and Andrew Doran.
     10  *
     11  * Redistribution and use in source and binary forms, with or without
     12  * modification, are permitted provided that the following conditions
     13  * are met:
     14  * 1. Redistributions of source code must retain the above copyright
     15  *    notice, this list of conditions and the following disclaimer.
     16  * 2. Redistributions in binary form must reproduce the above copyright
     17  *    notice, this list of conditions and the following disclaimer in the
     18  *    documentation and/or other materials provided with the distribution.
     19  *
     20  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
     21  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
     22  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     23  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
     24  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
     25  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
     26  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
     27  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
     28  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
     29  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
     30  * POSSIBILITY OF SUCH DAMAGE.
     31  */
     32 
     33 #include <sys/cdefs.h>
     34 __KERNEL_RCSID(0, "$NetBSD: x86_machdep.c,v 1.160 2025/12/05 17:58:12 khorben Exp $");
     35 
     36 #include "opt_modular.h"
     37 #include "opt_physmem.h"
     38 #include "opt_splash.h"
     39 #include "opt_kaslr.h"
     40 #include "opt_svs.h"
     41 #include "opt_xen.h"
     42 
     43 #include <sys/types.h>
     44 #include <sys/param.h>
     45 #include <sys/systm.h>
     46 #include <sys/kcore.h>
     47 #include <sys/errno.h>
     48 #include <sys/kauth.h>
     49 #include <sys/mutex.h>
     50 #include <sys/cpu.h>
     51 #include <sys/intr.h>
     52 #include <sys/atomic.h>
     53 #include <sys/module.h>
     54 #include <sys/sysctl.h>
     55 #include <sys/extent.h>
     56 #include <sys/rnd.h>
     57 
     58 #include <x86/bootspace.h>
     59 #include <x86/cpuvar.h>
     60 #include <x86/cputypes.h>
     61 #include <x86/efi.h>
     62 #include <x86/machdep.h>
     63 #include <x86/nmi.h>
     64 #include <x86/pio.h>
     65 
     66 #include <dev/splash/splash.h>
     67 #include <dev/isa/isareg.h>
     68 #include <dev/ic/i8042reg.h>
     69 #include <dev/mm.h>
     70 
     71 #include <machine/bootinfo.h>
     72 #include <machine/pmap_private.h>
     73 #include <machine/vmparam.h>
     74 
     75 #include <uvm/uvm_extern.h>
     76 
     77 #include "tsc.h"
     78 
     79 #include "acpica.h"
     80 #include "ioapic.h"
     81 #include "lapic.h"
     82 
     83 #if NACPICA > 0
     84 #include <dev/acpi/acpivar.h>
     85 #endif
     86 
     87 #if NIOAPIC > 0 || NACPICA > 0
     88 #include <machine/i82093var.h>
     89 #endif
     90 
     91 #include "opt_md.h"
     92 #if defined(MEMORY_DISK_HOOKS) && defined(MEMORY_DISK_DYNAMIC)
     93 #include <dev/md.h>
     94 #endif
     95 
     96 void (*x86_cpu_idle)(void);
     97 static bool x86_cpu_idle_ipi;
     98 static char x86_cpu_idle_text[16];
     99 
    100 static bool x86_user_ldt_enabled __read_mostly = false;
    101 
    102 #ifdef XEN
    103 
    104 #include <xen/xen.h>
    105 #include <xen/hypervisor.h>
    106 #endif
    107 
    108 #ifndef XENPV
    109 void (*delay_func)(unsigned int) = i8254_delay;
    110 void (*x86_initclock_func)(void) = i8254_initclocks;
    111 #else /* XENPV */
    112 void (*delay_func)(unsigned int) = xen_delay;
    113 void (*x86_initclock_func)(void) = xen_initclocks;
    114 #endif
    115 
    116 
    117 /* --------------------------------------------------------------------- */
    118 
    119 /*
    120  * Main bootinfo structure.  This is filled in by the bootstrap process
    121  * done in locore.S based on the information passed by the boot loader.
    122  */
    123 struct bootinfo bootinfo;
    124 
    125 /* --------------------------------------------------------------------- */
    126 
    127 bool bootmethod_efi;
    128 
    129 static kauth_listener_t x86_listener;
    130 
    131 extern paddr_t lowmem_rsvd, avail_start, avail_end;
    132 
    133 vaddr_t msgbuf_vaddr;
    134 
    135 struct msgbuf_p_seg msgbuf_p_seg[VM_PHYSSEG_MAX];
    136 
    137 unsigned int msgbuf_p_cnt = 0;
    138 
    139 void init_x86_msgbuf(void);
    140 
    141 /*
    142  * Given the type of a bootinfo entry, looks for a matching item inside
    143  * the bootinfo structure.  If found, returns a pointer to it (which must
    144  * then be cast to the appropriate bootinfo_* type); otherwise, returns
    145  * NULL.
    146  */
    147 void *
    148 lookup_bootinfo(int type)
    149 {
    150 	bool found;
    151 	int i;
    152 	struct btinfo_common *bic;
    153 
    154 	bic = (struct btinfo_common *)(bootinfo.bi_data);
    155 	found = FALSE;
    156 	for (i = 0; i < bootinfo.bi_nentries && !found; i++) {
    157 		if (bic->type == type)
    158 			found = TRUE;
    159 		else
    160 			bic = (struct btinfo_common *)
    161 			    ((uint8_t *)bic + bic->len);
    162 	}
    163 
    164 	return found ? bic : NULL;
    165 }
    166 
    167 #ifdef notyet
    168 /*
    169  * List the available bootinfo entries.
    170  */
    171 static const char *btinfo_str[] = {
    172 	BTINFO_STR
    173 };
    174 
    175 void
    176 aprint_bootinfo(void)
    177 {
    178 	int i;
    179 	struct btinfo_common *bic;
    180 
    181 	aprint_normal("bootinfo:");
    182 	bic = (struct btinfo_common *)(bootinfo.bi_data);
    183 	for (i = 0; i < bootinfo.bi_nentries; i++) {
    184 		if (bic->type >= 0 && bic->type < __arraycount(btinfo_str))
    185 			aprint_normal(" %s", btinfo_str[bic->type]);
    186 		else
    187 			aprint_normal(" %d", bic->type);
    188 		bic = (struct btinfo_common *)
    189 		    ((uint8_t *)bic + bic->len);
    190 	}
    191 	aprint_normal("\n");
    192 }
    193 #endif
    194 
    195 /*
    196  * mm_md_physacc: check if given pa is accessible.
    197  */
    198 int
    199 mm_md_physacc(paddr_t pa, vm_prot_t prot)
    200 {
    201 	extern phys_ram_seg_t mem_clusters[VM_PHYSSEG_MAX];
    202 	extern int mem_cluster_cnt;
    203 	int i;
    204 
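        	/*
        	 * Physical addresses inside a known RAM cluster are always
        	 * accessible; anything outside (device memory, holes) requires
        	 * the KAUTH_MACHDEP_UNMANAGEDMEM privilege.
        	 */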
    205 	for (i = 0; i < mem_cluster_cnt; i++) {
    206 		const phys_ram_seg_t *seg = &mem_clusters[i];
    207 		paddr_t lstart = seg->start;
    208 
    209 		if (lstart <= pa && pa - lstart <= seg->size) {
    210 			return 0;
    211 		}
    212 	}
    213 	return kauth_authorize_machdep(kauth_cred_get(),
    214 	    KAUTH_MACHDEP_UNMANAGEDMEM, NULL, NULL, NULL, NULL);
    215 }
    216 
    217 #ifdef MODULAR
    218 #ifdef XEN
    219 void x86_add_xen_modules(void);
    220 void
    221 x86_add_xen_modules(void)
    222 {
    223 #if defined(XENPVHVM) || defined(XENPVH)
    224 	uint32_t i;
    225 	struct hvm_modlist_entry *modlist;
    226 
    227 	if (hvm_start_info->nr_modules == 0) {
    228 		aprint_verbose("No Xen module info at boot\n");
    229 		return;
    230 	}
    231 	aprint_debug("%d Xen module(s) at boot\n", hvm_start_info->nr_modules);
    232 	modlist = (void *)((uintptr_t)hvm_start_info->modlist_paddr + KERNBASE);
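        	/*
        	 * Classify each module by its magic bytes: ELF objects are
        	 * primed as kernel modules, PNG/JPEG data becomes the splash
        	 * image, and anything else is treated as a file-system image.
        	 */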
    233 	for (i = 0; i < hvm_start_info->nr_modules; i++) {
    234 		if (memcmp(
    235 			    (char *)((uintptr_t)modlist[i].paddr + KERNBASE),
    236 			    "\177ELF", 4) == 0) {
    237 			aprint_debug("Prep module path=%s len=%"PRIu64" pa=%p\n",
    238 			    "pvh-module",
    239 			    modlist[i].size,
    240 			    (void *)((uintptr_t)modlist[i].paddr + KERNBASE));
    241 			module_prime(
    242 			    "pvh-module",
    243 			    (void *)((uintptr_t)modlist[i].paddr + KERNBASE),
    244 			    modlist[i].size);
    245 #ifdef SPLASHSCREEN
    246 		} else if (memcmp(
    247 			    (char *)((uintptr_t)modlist[i].paddr + KERNBASE),
    248 			    "\211PNG\r\n\032\n", 8) == 0 ||
    249 			   memcmp(
    250 			    (char *)((uintptr_t)modlist[i].paddr + KERNBASE),
    251 			    "\377\330\377", 3) == 0) {
    252 			aprint_debug("Splash image path=%s len=%"PRIu64" pa=%p\n",
    253 			    "pvh-image", modlist[i].size,
    254 			    (void *)((uintptr_t)modlist[i].paddr + KERNBASE));
    255 			splash_setimage(
    256 			    (void *)((uintptr_t)modlist[i].paddr + KERNBASE),
    257 			    modlist[i].size);
    258 #endif
    259 #if defined(MEMORY_DISK_HOOKS) && defined(MEMORY_DISK_DYNAMIC)
    260 		} else {
    261 			aprint_debug("File-system image path=%s len=%"PRIu64" pa=%p\n",
    262 			    "pvh-filesystem",
    263 			    modlist[i].size,
    264 			    (void *)((uintptr_t)modlist[i].paddr + KERNBASE));
    265 			md_root_setconf(
    266 			    (void *)((uintptr_t)modlist[i].paddr + KERNBASE),
    267 			    modlist[i].size);
    268 #endif
    269 		}
    270 	}
    271 #endif
    272 }
    273 #endif	/* XEN */
    274 /*
    275  * Push any modules loaded by the boot loader.
    276  */
    277 void
    278 module_init_md(void)
    279 {
    280 	struct btinfo_modulelist *biml;
    281 	struct bi_modulelist_entry *bi, *bimax;
    282 
    283 	biml = lookup_bootinfo(BTINFO_MODULELIST);
    284 	if (biml == NULL) {
    285 		aprint_debug("No module info at boot\n");
    286 		return;
    287 	}
    288 
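        	/*
        	 * Entries carry physical addresses; convert each to a kernel VA
        	 * via the direct map (KASLR) or the KERNBASE offset before use.
        	 */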
    289 	bi = (struct bi_modulelist_entry *)((uint8_t *)biml + sizeof(*biml));
    290 	bimax = bi + biml->num;
    291 	for (; bi < bimax; bi++) {
    292 		switch (bi->type) {
    293 		case BI_MODULE_ELF:
    294 			aprint_debug("Prep module path=%s len=%d pa=%x\n",
    295 			    bi->path, bi->len, bi->base);
    296 			KASSERT(trunc_page(bi->base) == bi->base);
    297 			module_prime(bi->path,
    298 #ifdef KASLR
    299 			    (void *)PMAP_DIRECT_MAP((uintptr_t)bi->base),
    300 #else
    301 			    (void *)((uintptr_t)bi->base + KERNBASE),
    302 #endif
    303 			    bi->len);
    304 			break;
    305 		case BI_MODULE_IMAGE:
    306 #ifdef SPLASHSCREEN
    307 			aprint_debug("Splash image path=%s len=%d pa=%x\n",
    308 			    bi->path, bi->len, bi->base);
    309 			KASSERT(trunc_page(bi->base) == bi->base);
    310 			splash_setimage(
    311 #ifdef KASLR
    312 			    (void *)PMAP_DIRECT_MAP((uintptr_t)bi->base),
    313 #else
    314 			    (void *)((uintptr_t)bi->base + KERNBASE),
    315 #endif
    316 			    bi->len);
    317 #endif
    318 			break;
    319 		case BI_MODULE_RND:
    320 			/* handled in x86_rndseed */
    321 			break;
    322 		case BI_MODULE_FS:
    323 			aprint_debug("File-system image path=%s len=%d pa=%x\n",
    324 			    bi->path, bi->len, bi->base);
    325 			KASSERT(trunc_page(bi->base) == bi->base);
    326 #if defined(MEMORY_DISK_HOOKS) && defined(MEMORY_DISK_DYNAMIC)
    327 			md_root_setconf(
    328 #ifdef KASLR
    329 			    (void *)PMAP_DIRECT_MAP((uintptr_t)bi->base),
    330 #else
    331 			    (void *)((uintptr_t)bi->base + KERNBASE),
    332 #endif
    333 			    bi->len);
    334 #endif
    335 			break;
    336 		default:
    337 			aprint_debug("Skipping non-ELF module\n");
    338 			break;
    339 		}
    340 	}
    341 }
    342 #endif	/* MODULAR */
    343 
    344 void
    345 x86_rndseed(void)
    346 {
    347 	struct btinfo_modulelist *biml;
    348 	struct bi_modulelist_entry *bi, *bimax;
    349 
    350 	biml = lookup_bootinfo(BTINFO_MODULELIST);
    351 	if (biml == NULL) {
    352 		aprint_debug("No module info at boot\n");
    353 		return;
    354 	}
    355 
    356 	bi = (struct bi_modulelist_entry *)((uint8_t *)biml + sizeof(*biml));
    357 	bimax = bi + biml->num;
    358 	for (; bi < bimax; bi++) {
    359 		switch (bi->type) {
    360 		case BI_MODULE_RND:
    361 			aprint_debug("Random seed data path=%s len=%d pa=%x\n",
    362 				     bi->path, bi->len, bi->base);
    363 			KASSERT(trunc_page(bi->base) == bi->base);
    364 			rnd_seed(
    365 #ifdef KASLR
    366 			    (void *)PMAP_DIRECT_MAP((uintptr_t)bi->base),
    367 #else
    368 			    (void *)((uintptr_t)bi->base + KERNBASE),
    369 #endif
    370 			     bi->len);
    371 		}
    372 	}
    373 }
    374 
    375 void
    376 cpu_need_resched(struct cpu_info *ci, struct lwp *l, int flags)
    377 {
    378 
    379 	KASSERT(kpreempt_disabled());
    380 
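        	/*
        	 * The target CPU is idle: there is nothing to preempt.  Kick it
        	 * only if its idle loop (e.g. HLT) needs an IPI to wake up;
        	 * MWAIT-based idle notices the resched request without one.
        	 */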
    381 	if ((flags & RESCHED_IDLE) != 0) {
    382 		if ((flags & RESCHED_REMOTE) != 0 &&
    383 		    x86_cpu_idle_ipi != false) {
    384 			cpu_kick(ci);
    385 		}
    386 		return;
    387 	}
    388 
    389 #ifdef __HAVE_PREEMPTION
    390 	if ((flags & RESCHED_KPREEMPT) != 0) {
    391 		if ((flags & RESCHED_REMOTE) != 0) {
    392 #ifdef XENPV
    393 			xen_send_ipi(ci, XEN_IPI_KPREEMPT);
    394 #else
    395 			x86_send_ipi(ci, X86_IPI_KPREEMPT);
    396 #endif
    397 		} else {
    398 			softint_trigger(1 << SIR_PREEMPT);
    399 		}
    400 		return;
    401 	}
    402 #endif
    403 
    404 	KASSERT((flags & RESCHED_UPREEMPT) != 0);
    405 	if ((flags & RESCHED_REMOTE) != 0) {
    406 		cpu_kick(ci);
    407 	} else {
    408 		aston(l);
    409 	}
    410 }
    411 
    412 void
    413 cpu_signotify(struct lwp *l)
    414 {
    415 
    416 	KASSERT(kpreempt_disabled());
    417 
    418 	if (l->l_cpu != curcpu()) {
    419 		cpu_kick(l->l_cpu);
    420 	} else {
    421 		aston(l);
    422 	}
    423 }
    424 
    425 void
    426 cpu_need_proftick(struct lwp *l)
    427 {
    428 
    429 	KASSERT(kpreempt_disabled());
    430 	KASSERT(l->l_cpu == curcpu());
    431 
    432 	l->l_pflag |= LP_OWEUPC;
    433 	aston(l);
    434 }
    435 
    436 bool
    437 cpu_intr_p(void)
    438 {
    439 	int idepth;
    440 	long pctr;
    441 	lwp_t *l;
    442 
    443 	l = curlwp;
    444 	if (__predict_false(l->l_cpu == NULL)) {
    445 		KASSERT(l == &lwp0);
    446 		return false;
    447 	}
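        	/*
        	 * lwp_pctr() changes whenever this LWP is switched out, so a
        	 * stable value means ci_idepth was read on the CPU we are
        	 * still running on.
        	 */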
    448 	do {
    449 		pctr = lwp_pctr();
    450 		idepth = l->l_cpu->ci_idepth;
    451 	} while (__predict_false(pctr != lwp_pctr()));
    452 
    453 	return idepth >= 0;
    454 }
    455 
    456 #ifdef __HAVE_PREEMPTION
    457 /*
    458  * Called to check MD conditions that would prevent preemption, and to
    459  * arrange for those conditions to be rechecked later.
    460  */
    461 bool
    462 cpu_kpreempt_enter(uintptr_t where, int s)
    463 {
    464 	struct pcb *pcb;
    465 	lwp_t *l;
    466 
    467 	KASSERT(kpreempt_disabled());
    468 	l = curlwp;
    469 
    470 	/*
    471 	 * If SPL raised, can't go.  Note this implies that spin
    472 	 * mutexes at IPL_NONE are _not_ valid to use.
    473 	 */
    474 	if (s > IPL_PREEMPT) {
    475 		softint_trigger(1 << SIR_PREEMPT);
    476 		return false;
    477 	}
    478 
    479 	/* Must save cr2 or it could be clobbered. */
    480 	pcb = lwp_getpcb(l);
    481 	pcb->pcb_cr2 = rcr2();
    482 
    483 	return true;
    484 }
    485 
    486 /*
    487  * Called after returning from a kernel preemption, and called with
    488  * preemption disabled.
    489  */
    490 void
    491 cpu_kpreempt_exit(uintptr_t where)
    492 {
    493 	extern char x86_copyfunc_start, x86_copyfunc_end;
    494 #if defined(XENPV) && defined(i386)
    495 	extern char i386_calltrap_start, i386_calltrap_end;
    496 #endif
    497 	struct pcb *pcb;
    498 
    499 	KASSERT(kpreempt_disabled());
    500 
    501 	/*
    502 	 * If we interrupted any of the copy functions we must reload
    503 	 * the pmap when resuming, as they cannot tolerate it being
    504 	 * swapped out.
    505 	 */
    506 	if (where >= (uintptr_t)&x86_copyfunc_start &&
    507 	    where < (uintptr_t)&x86_copyfunc_end) {
    508 		pmap_load();
    509 	}
    510 #if defined(XENPV) && defined(i386)
    511 	else if (where >= (uintptr_t)&i386_calltrap_start &&
    512 	    where < (uintptr_t)&i386_calltrap_end) {
    513 		pmap_load();
    514 	}
    515 #endif
    516 
    517 	/* Restore cr2 only after the pmap, as pmap_load can block. */
    518 	pcb = lwp_getpcb(curlwp);
    519 	lcr2(pcb->pcb_cr2);
    520 }
    521 
    522 /*
    523  * Return true if preemption is disabled for MD reasons.  Must be called
    524  * with preemption disabled, and thus is only for diagnostic checks.
    525  */
    526 bool
    527 cpu_kpreempt_disabled(void)
    528 {
    529 
    530 	return curcpu()->ci_ilevel > IPL_NONE;
    531 }
    532 #endif	/* __HAVE_PREEMPTION */
    533 
    534 SYSCTL_SETUP(sysctl_machdep_cpu_idle, "sysctl machdep cpu_idle")
    535 {
    536 	const struct sysctlnode	*mnode, *node;
    537 
    538 	sysctl_createv(NULL, 0, NULL, &mnode,
    539 	    CTLFLAG_PERMANENT, CTLTYPE_NODE, "machdep", NULL,
    540 	    NULL, 0, NULL, 0, CTL_MACHDEP, CTL_EOL);
    541 
    542 	sysctl_createv(NULL, 0, &mnode, &node,
    543 		       CTLFLAG_PERMANENT, CTLTYPE_STRING, "idle-mechanism",
    544 		       SYSCTL_DESCR("Mechanism used for the idle loop."),
    545 		       NULL, 0, x86_cpu_idle_text, 0,
    546 		       CTL_CREATE, CTL_EOL);
    547 }
    548 
    549 void
    550 x86_cpu_idle_init(void)
    551 {
    552 
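        	/*
        	 * Prefer MWAIT when available: a remote wakeup is just a store
        	 * to the monitored cache line, so no IPI is needed.  HLT-based
        	 * idle (and the Xen idle loop) must be woken with cpu_kick().
        	 */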
    553 #ifndef XENPV
    554 	if ((cpu_feature[1] & CPUID2_MONITOR) == 0)
    555 		x86_cpu_idle_set(x86_cpu_idle_halt, "halt", true);
    556 	else
    557 		x86_cpu_idle_set(x86_cpu_idle_mwait, "mwait", false);
    558 #else
    559 	x86_cpu_idle_set(x86_cpu_idle_xen, "xen", true);
    560 #endif
    561 }
    562 
    563 void
    564 x86_cpu_idle_get(void (**func)(void), char *text, size_t len)
    565 {
    566 
    567 	*func = x86_cpu_idle;
    568 
    569 	(void)strlcpy(text, x86_cpu_idle_text, len);
    570 }
    571 
    572 void
    573 x86_cpu_idle_set(void (*func)(void), const char *text, bool ipi)
    574 {
    575 
    576 	x86_cpu_idle = func;
    577 	x86_cpu_idle_ipi = ipi;
    578 
    579 	(void)strlcpy(x86_cpu_idle_text, text, sizeof(x86_cpu_idle_text));
    580 }
    581 
    582 #ifndef XENPV
    583 
    584 #define KBTOB(x)	((size_t)(x) * 1024UL)
    585 #define MBTOB(x)	((size_t)(x) * 1024UL * 1024UL)
    586 
    587 static struct {
    588 	int freelist;
    589 	uint64_t limit;
    590 } x86_freelists[VM_NFREELIST] = {
    591 	{ VM_FREELIST_DEFAULT, 0 },
    592 #ifdef VM_FREELIST_FIRST1T
    593 	/* 40-bit addresses needed for modern graphics. */
    594 	{ VM_FREELIST_FIRST1T,	1ULL * 1024 * 1024 * 1024 * 1024 },
    595 #endif
    596 #ifdef VM_FREELIST_FIRST64G
    597 	/* 36-bit addresses needed for oldish graphics. */
    598 	{ VM_FREELIST_FIRST64G, 64ULL * 1024 * 1024 * 1024 },
    599 #endif
    600 #ifdef VM_FREELIST_FIRST4G
    601 	/* 32-bit addresses needed for PCI 32-bit DMA and old graphics. */
    602 	{ VM_FREELIST_FIRST4G,  4ULL * 1024 * 1024 * 1024 },
    603 #endif
    604 	/* 30-bit addresses needed for ancient graphics. */
    605 	{ VM_FREELIST_FIRST1G,	1ULL * 1024 * 1024 * 1024 },
    606 	/* 24-bit addresses needed for ISA DMA. */
    607 	{ VM_FREELIST_FIRST16,	16 * 1024 * 1024 },
    608 };
    609 
    610 int
    611 x86_select_freelist(uint64_t maxaddr)
    612 {
    613 	unsigned int i;
    614 
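        	/*
        	 * If all of physical memory already sits below maxaddr, any
        	 * freelist will do.  Otherwise return the least restrictive
        	 * freelist whose limit still keeps its pages below maxaddr.
        	 */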
    615 	if (avail_end <= maxaddr)
    616 		return VM_NFREELIST;
    617 
    618 	for (i = 0; i < __arraycount(x86_freelists); i++) {
    619 		if ((x86_freelists[i].limit - 1) <= maxaddr)
    620 			return x86_freelists[i].freelist;
    621 	}
    622 
    623 	panic("no freelist for maximum address %"PRIx64, maxaddr);
    624 }
    625 
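        /*
         * x86_add_cluster: record one memory map segment in mem_clusters[].  The
         * range is also reserved in the iomem extent (when applicable), and for
         * BIM_Memory segments physmem/avail_end are updated.  Returns -1 only
         * when mem_clusters[] is full.
         */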
    626 static int
    627 x86_add_cluster(uint64_t seg_start, uint64_t seg_end, uint32_t type)
    628 {
    629 	extern struct extent *iomem_ex;
    630 	const uint64_t endext = MAXIOMEM + 1;
    631 	uint64_t new_physmem = 0;
    632 	phys_ram_seg_t *cluster;
    633 	int i;
    634 
    635 	if (seg_end > MAXPHYSMEM) {
    636 		aprint_verbose("WARNING: skipping large memory map entry: "
    637 		    "0x%"PRIx64"/0x%"PRIx64"/0x%x\n",
    638 		    seg_start, (seg_end - seg_start), type);
    639 		return 0;
    640 	}
    641 
    642 	/*
    643 	 * XXX: Chop the last page off the size so that it can fit in avail_end.
    644 	 */
    645 	if (seg_end == MAXPHYSMEM)
    646 		seg_end -= PAGE_SIZE;
    647 
    648 	if (seg_end <= seg_start)
    649 		return 0;
    650 
    651 	for (i = 0; i < mem_cluster_cnt; i++) {
    652 		cluster = &mem_clusters[i];
    653 		if ((cluster->start == round_page(seg_start)) &&
    654 		    (cluster->size == trunc_page(seg_end) - cluster->start)) {
    655 #ifdef DEBUG_MEMLOAD
    656 			printf("WARNING: skipping duplicate segment entry\n");
    657 #endif
    658 			return 0;
    659 		}
    660 	}
    661 
    662 	/*
    663 	 * This cluster is used by RAM. If it is included in the iomem extent,
    664 	 * allocate it from there, so that we won't unintentionally reuse it
    665  * later with extent_alloc_region. This avoids collisions (with UVM,
    666  * for example).
    667 	 *
    668 	 * This is done before the addresses are page rounded just to make
    669 	 * sure we get them all.
    670 	 */
    671 	if (seg_start < endext) {
    672 		uint64_t io_end;
    673 
    674 		if (seg_end > endext)
    675 			io_end = endext;
    676 		else
    677 			io_end = seg_end;
    678 
    679 		if (iomem_ex != NULL && extent_alloc_region(iomem_ex, seg_start,
    680 		    io_end - seg_start, EX_NOWAIT)) {
    681 			/* XXX What should we do? */
    682 			printf("WARNING: CAN'T ALLOCATE MEMORY SEGMENT "
    683 			    "(0x%"PRIx64"/0x%"PRIx64"/0x%x) FROM "
    684 			    "IOMEM EXTENT MAP!\n",
    685 			    seg_start, seg_end - seg_start, type);
    686 			return 0;
    687 		}
    688 	}
    689 
    690 	/* If it's not free memory, skip it. */
    691 	if (type != BIM_Memory)
    692 		return 0;
    693 
    694 	if (mem_cluster_cnt >= VM_PHYSSEG_MAX) {
    695 		printf("WARNING: too many memory segments "
    696 		    "(increase VM_PHYSSEG_MAX)");
    697 		return -1;
    698 	}
    699 
    700 #ifdef PHYSMEM_MAX_ADDR
    701 	if (seg_start >= MBTOB(PHYSMEM_MAX_ADDR))
    702 		return 0;
    703 	if (seg_end > MBTOB(PHYSMEM_MAX_ADDR))
    704 		seg_end = MBTOB(PHYSMEM_MAX_ADDR);
    705 #endif
    706 
    707 	seg_start = round_page(seg_start);
    708 	seg_end = trunc_page(seg_end);
    709 
    710 	if (seg_start == seg_end)
    711 		return 0;
    712 
    713 	cluster = &mem_clusters[mem_cluster_cnt];
    714 	cluster->start = seg_start;
    715 	if (iomem_ex != NULL)
    716 		new_physmem = physmem + atop(seg_end - seg_start);
    717 
    718 #ifdef PHYSMEM_MAX_SIZE
    719 	if (iomem_ex != NULL) {
    720 		if (physmem >= atop(MBTOB(PHYSMEM_MAX_SIZE)))
    721 			return 0;
    722 		if (new_physmem > atop(MBTOB(PHYSMEM_MAX_SIZE))) {
    723 			seg_end = seg_start + MBTOB(PHYSMEM_MAX_SIZE) - ptoa(physmem);
    724 			new_physmem = atop(MBTOB(PHYSMEM_MAX_SIZE));
    725 		}
    726 	}
    727 #endif
    728 
    729 	cluster->size = seg_end - seg_start;
    730 
    731 	if (iomem_ex != NULL) {
    732 		if (avail_end < seg_end)
    733 			avail_end = seg_end;
    734 		physmem = new_physmem;
    735 	}
    736 	mem_cluster_cnt++;
    737 
    738 	return 0;
    739 }
    740 
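        /*
         * x86_parse_clusters: walk a BTINFO_MEMMAP-style memory map, feeding the
         * usable (Memory/ACPI/NVS) entries to x86_add_cluster and splitting any
         * entry that overlaps the ISA I/O memory hole.
         */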
    741 static int
    742 x86_parse_clusters(struct btinfo_memmap *bim)
    743 {
    744 	uint64_t seg_start, seg_end;
    745 	uint64_t addr, size;
    746 	uint32_t type;
    747 	int x;
    748 
    749 	KASSERT(bim != NULL);
    750 	KASSERT(bim->num > 0);
    751 
    752 #ifdef DEBUG_MEMLOAD
    753 	printf("MEMMAP: %s MEMORY MAP (%d ENTRIES):\n",
    754 	    lookup_bootinfo(BTINFO_EFIMEMMAP) != NULL ? "UEFI" : "BIOS",
    755 	    bim->num);
    756 #endif
    757 
    758 	for (x = 0; x < bim->num; x++) {
    759 		addr = bim->entry[x].addr;
    760 		size = bim->entry[x].size;
    761 		type = bim->entry[x].type;
    762 #ifdef DEBUG_MEMLOAD
    763 		printf("MEMMAP: 0x%016" PRIx64 "-0x%016" PRIx64
    764 		    "\n\tsize=0x%016" PRIx64 ", type=%d(%s)\n",
    765 		    addr, addr + size - 1, size, type,
    766 		    (type == BIM_Memory) ?  "Memory" :
    767 		    (type == BIM_Reserved) ?  "Reserved" :
    768 		    (type == BIM_ACPI) ? "ACPI" :
    769 		    (type == BIM_NVS) ? "NVS" :
    770 		    (type == BIM_PMEM) ? "Persistent" :
    771 		    (type == BIM_PRAM) ? "Persistent (Legacy)" :
    772 		    "unknown");
    773 #endif
    774 
    775 		/* If the segment is not memory, skip it. */
    776 		switch (type) {
    777 		case BIM_Memory:
    778 		case BIM_ACPI:
    779 		case BIM_NVS:
    780 			break;
    781 		default:
    782 			continue;
    783 		}
    784 
    785 		/* If the segment is smaller than a page, skip it. */
    786 		if (size < PAGE_SIZE)
    787 			continue;
    788 
    789 		seg_start = addr;
    790 		seg_end = addr + size;
    791 
    792 		/*
    793 		 * XXX XXX: Avoid the ISA I/O MEM.
    794 		 *
    795 		 * Some laptops (for example, Toshiba Satellite2550X) report
    796 		 * this area as valid.
    797 		 */
    798 		if (seg_start < IOM_END && seg_end > IOM_BEGIN) {
    799 			printf("WARNING: memory map entry overlaps "
    800 			    "with ``Compatibility Holes'': "
    801 			    "0x%"PRIx64"/0x%"PRIx64"/0x%x\n", seg_start,
    802 			    seg_end - seg_start, type);
    803 
    804 			if (x86_add_cluster(seg_start, IOM_BEGIN, type) == -1)
    805 				break;
    806 			if (x86_add_cluster(IOM_END, seg_end, type) == -1)
    807 				break;
    808 		} else {
    809 			if (x86_add_cluster(seg_start, seg_end, type) == -1)
    810 				break;
    811 		}
    812 	}
    813 
    814 	return 0;
    815 }
    816 
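        /*
         * x86_fake_clusters: no usable memory map was provided, so synthesize two
         * clusters (base and extended memory) from the BIOS biosbasemem/biosextmem
         * counts.
         */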
    817 static int
    818 x86_fake_clusters(void)
    819 {
    820 	extern struct extent *iomem_ex;
    821 	phys_ram_seg_t *cluster;
    822 	KASSERT(mem_cluster_cnt == 0);
    823 
    824 	/*
    825 	 * Allocate the physical addresses used by RAM from the iomem extent
    826 	 * map. This is done before the addresses are page rounded just to make
    827 	 * sure we get them all.
    828 	 */
    829 	if (extent_alloc_region(iomem_ex, 0, KBTOB(biosbasemem), EX_NOWAIT)) {
    830 		/* XXX What should we do? */
    831 		printf("WARNING: CAN'T ALLOCATE BASE MEMORY FROM "
    832 		    "IOMEM EXTENT MAP!\n");
    833 	}
    834 
    835 	cluster = &mem_clusters[0];
    836 	cluster->start = 0;
    837 	cluster->size = trunc_page(KBTOB(biosbasemem));
    838 	physmem += atop(cluster->size);
    839 
    840 	if (extent_alloc_region(iomem_ex, IOM_END, KBTOB(biosextmem),
    841 	    EX_NOWAIT)) {
    842 		/* XXX What should we do? */
    843 		printf("WARNING: CAN'T ALLOCATE EXTENDED MEMORY FROM "
    844 		    "IOMEM EXTENT MAP!\n");
    845 	}
    846 
    847 #if NISADMA > 0
    848 	/*
    849 	 * Some motherboards/BIOSes remap the 384K of RAM that would
    850 	 * normally be covered by the ISA hole to the end of memory
    851 	 * so that it can be used.  However, on a 16M system, this
    852 	 * would cause bounce buffers to be allocated and used.
    853 	 * This is not desirable behaviour, as more than 384K of
    854 	 * bounce buffers might be allocated.  As a work-around,
    855 	 * we round memory down to the nearest 1M boundary if
    856 	 * we're using any isadma devices and the remapped memory
    857 	 * is what puts us over 16M.
    858 	 */
    859 	if (biosextmem > (15*1024) && biosextmem < (16*1024)) {
    860 		char pbuf[9];
    861 
    862 		format_bytes(pbuf, sizeof(pbuf), biosextmem - (15*1024));
    863 		printf("Warning: ignoring %s of remapped memory\n", pbuf);
    864 		biosextmem = (15*1024);
    865 	}
    866 #endif
    867 
    868 	cluster = &mem_clusters[1];
    869 	cluster->start = IOM_END;
    870 	cluster->size = trunc_page(KBTOB(biosextmem));
    871 	physmem += atop(cluster->size);
    872 
    873 	mem_cluster_cnt = 2;
    874 
    875 	avail_end = IOM_END + trunc_page(KBTOB(biosextmem));
    876 
    877 	return 0;
    878 }
    879 
    880 /*
    881  * x86_load_region: load the physical memory region from seg_start to seg_end
    882  * into the VM system.
    883  */
    884 static void
    885 x86_load_region(uint64_t seg_start, uint64_t seg_end)
    886 {
    887 	unsigned int i;
    888 	uint64_t tmp;
    889 
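        	/*
        	 * Walk the freelists from the most restrictive upward, assigning
        	 * each low sub-range of the segment to the tightest freelist that
        	 * covers it; whatever remains goes to VM_FREELIST_DEFAULT below.
        	 */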
    890 	i = __arraycount(x86_freelists);
    891 	while (i--) {
    892 		if (x86_freelists[i].limit <= seg_start)
    893 			continue;
    894 		if (x86_freelists[i].freelist == VM_FREELIST_DEFAULT)
    895 			continue;
    896 		tmp = MIN(x86_freelists[i].limit, seg_end);
    897 		if (tmp == seg_start)
    898 			continue;
    899 
    900 #ifdef DEBUG_MEMLOAD
    901 		printf("loading freelist %d 0x%"PRIx64"-0x%"PRIx64
    902 		    " (0x%"PRIx64"-0x%"PRIx64")\n", x86_freelists[i].freelist,
    903 		    seg_start, tmp, (uint64_t)atop(seg_start),
    904 		    (uint64_t)atop(tmp));
    905 #endif
    906 
    907 		uvm_page_physload(atop(seg_start), atop(tmp), atop(seg_start),
    908 		    atop(tmp), x86_freelists[i].freelist);
    909 		seg_start = tmp;
    910 	}
    911 
    912 	if (seg_start != seg_end) {
    913 #ifdef DEBUG_MEMLOAD
    914 		printf("loading default 0x%"PRIx64"-0x%"PRIx64
    915 		    " (0x%"PRIx64"-0x%"PRIx64")\n", seg_start, seg_end,
    916 		    (uint64_t)atop(seg_start), (uint64_t)atop(seg_end));
    917 #endif
    918 		uvm_page_physload(atop(seg_start), atop(seg_end),
    919 		    atop(seg_start), atop(seg_end), VM_FREELIST_DEFAULT);
    920 	}
    921 }
    922 
    923 #ifdef XEN
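        /*
         * x86_add_xen_clusters: on a PVH boot, take the memory map either directly
         * from hvm_start_info or, if none was supplied, from a XENMEM_memory_map
         * hypercall.
         */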
    924 static void
    925 x86_add_xen_clusters(void)
    926 {
    927 	if (hvm_start_info->memmap_entries > 0) {
    928 		struct hvm_memmap_table_entry *map_entry;
    929 		map_entry = (void *)((uintptr_t)hvm_start_info->memmap_paddr + KERNBASE);
    930 		for (int i = 0; i < hvm_start_info->memmap_entries; i++) {
    931 			if (map_entry[i].size < PAGE_SIZE)
    932 				continue;
    933 			switch (map_entry[i].type) {
    934 			case XEN_HVM_MEMMAP_TYPE_RAM:
    935 				x86_add_cluster(map_entry[i].addr,
    936 				    map_entry[i].addr + map_entry[i].size,
    937 				    BIM_Memory);
    938 				break;
    939 			case XEN_HVM_MEMMAP_TYPE_ACPI:
    940 				x86_add_cluster(map_entry[i].addr,
    941 				    map_entry[i].addr + map_entry[i].size,
    942 				    BIM_ACPI);
    943 				break;
    944 			}
    945 		}
    946 	} else {
    947 		struct xen_memory_map memmap;
    948 		static struct _xen_mmap {
    949 			struct btinfo_memmap bim;
    950 			struct bi_memmap_entry map[128]; /* same as FreeBSD */
    951 		} __packed xen_mmap;
    952 		int err;
    953 
    954 		memmap.nr_entries = 128;
    955 		set_xen_guest_handle(memmap.buffer, &xen_mmap.bim.entry[0]);
    956 		if ((err = HYPERVISOR_memory_op(XENMEM_memory_map, &memmap))
    957 		    < 0)
    958 			panic("XENMEM_memory_map %d", err);
    959 		xen_mmap.bim.num = memmap.nr_entries;
    960 		x86_parse_clusters(&xen_mmap.bim);
    961 	}
    962 }
    963 #endif /* XEN */
    964 /*
    965  * init_x86_clusters: retrieve the memory clusters provided by the BIOS, and
    966  * initialize mem_clusters.
    967  */
    968 void
    969 init_x86_clusters(void)
    970 {
    971 	struct btinfo_memmap *bim;
    972 	struct btinfo_efimemmap *biem;
    973 
    974 	/*
    975 	 * Check to see if we have a memory map from the BIOS (passed to us by
    976 	 * the boot program).
    977 	 */
    978 #ifdef XEN
    979 	if (pvh_boot) {
    980 		x86_add_xen_clusters();
    981 #ifdef MODULAR
    982 		x86_add_xen_modules();
    983 #endif
    984 	}
    985 #endif /* XEN */
    986 
    987 #ifdef i386
    988 	extern int biosmem_implicit;
    989 	biem = lookup_bootinfo(BTINFO_EFIMEMMAP);
    990 	if (biem != NULL)
    991 		bim = efi_get_e820memmap();
    992 	else
    993 		bim = lookup_bootinfo(BTINFO_MEMMAP);
    994 	if ((biosmem_implicit || (biosbasemem == 0 && biosextmem == 0)) &&
    995 	    bim != NULL && bim->num > 0)
    996 		x86_parse_clusters(bim);
    997 #else
    998 #if !defined(REALBASEMEM) && !defined(REALEXTMEM)
    999 	biem = lookup_bootinfo(BTINFO_EFIMEMMAP);
   1000 	if (biem != NULL)
   1001 		bim = efi_get_e820memmap();
   1002 	else
   1003 		bim = lookup_bootinfo(BTINFO_MEMMAP);
   1004 	if (bim != NULL && bim->num > 0)
   1005 		x86_parse_clusters(bim);
   1006 #else
   1007 	(void)bim, (void)biem;
   1008 #endif
   1009 #endif
   1010 
   1011 	if (mem_cluster_cnt == 0) {
   1012 		/*
   1013 		 * If x86_parse_clusters didn't find any valid segment, create
   1014 		 * fake clusters.
   1015 		 */
   1016 		x86_fake_clusters();
   1017 	}
   1018 }
   1019 
   1020 /*
   1021  * init_x86_vm: initialize the VM system on x86. We basically internalize as
   1022  * many physical pages as we can, starting at lowmem_rsvd, but we don't
   1023  * internalize the kernel physical pages (from pa_kstart to pa_kend).
   1024  */
   1025 int
   1026 init_x86_vm(paddr_t pa_kend)
   1027 {
   1028 	extern struct bootspace bootspace;
   1029 	paddr_t pa_kstart = bootspace.head.pa;
   1030 	uint64_t seg_start, seg_end;
   1031 	uint64_t seg_start1, seg_end1;
   1032 	int x;
   1033 	unsigned i;
   1034 
   1035 	for (i = 0; i < __arraycount(x86_freelists); i++) {
   1036 		if (avail_end < x86_freelists[i].limit)
   1037 			x86_freelists[i].freelist = VM_FREELIST_DEFAULT;
   1038 	}
   1039 
   1040 	/*
   1041 	 * Now, load the memory clusters (which have already been rounded and
   1042 	 * truncated) into the VM system.
   1043 	 *
   1044 	 * NOTE: we assume that memory starts at 0.
   1045 	 */
   1046 	for (x = 0; x < mem_cluster_cnt; x++) {
   1047 		const phys_ram_seg_t *cluster = &mem_clusters[x];
   1048 
   1049 		seg_start = cluster->start;
   1050 		seg_end = cluster->start + cluster->size;
   1051 		seg_start1 = 0;
   1052 		seg_end1 = 0;
   1053 
   1054 #ifdef DEBUG_MEMLOAD
   1055 		printf("segment %" PRIx64 " - %" PRIx64 "\n",
   1056 		    seg_start, seg_end);
   1057 #endif
   1058 
   1059 		/* Skip memory before our available starting point. */
   1060 		if (seg_end <= lowmem_rsvd) {
   1061 #ifdef DEBUG_MEMLOAD
   1062 			printf("discard segment below starting point "
   1063 			    "%" PRIx64 " - %" PRIx64 "\n", seg_start, seg_end);
   1064 #endif
   1065 			continue;
   1066 		}
   1067 
   1068 		if (seg_start <= lowmem_rsvd && lowmem_rsvd < seg_end) {
   1069 			seg_start = lowmem_rsvd;
   1070 			if (seg_start == seg_end) {
   1071 #ifdef DEBUG_MEMLOAD
   1072 				printf("discard segment below starting point "
   1073 				    "%" PRIx64 " - %" PRIx64 "\n",
   1074 				    seg_start, seg_end);
   1075 
   1076 
   1077 #endif
   1078 				continue;
   1079 			}
   1080 		}
   1081 
   1082 		/*
   1083 		 * If this segment contains the kernel, split it in two, around
   1084 		 * the kernel.
   1085 		 *  [seg_start                       seg_end]
   1086 		 *             [pa_kstart  pa_kend]
   1087 		 */
   1088 		if (seg_start <= pa_kstart && pa_kend <= seg_end) {
   1089 #ifdef DEBUG_MEMLOAD
   1090 			printf("split kernel overlapping to "
   1091 			    "%" PRIx64 " - %" PRIxPADDR " and "
   1092 			    "%" PRIxPADDR " - %" PRIx64 "\n",
   1093 			    seg_start, pa_kstart, pa_kend, seg_end);
   1094 #endif
   1095 			seg_start1 = pa_kend;
   1096 			seg_end1 = seg_end;
   1097 			seg_end = pa_kstart;
   1098 			KASSERT(seg_end < seg_end1);
   1099 		}
   1100 
   1101 		/*
   1102 		 * Discard a segment inside the kernel
   1103 		 *  [pa_kstart                       pa_kend]
   1104 		 *             [seg_start  seg_end]
   1105 		 */
   1106 		if (pa_kstart < seg_start && seg_end < pa_kend) {
   1107 #ifdef DEBUG_MEMLOAD
   1108 			printf("discard complete kernel overlap "
   1109 			    "%" PRIx64 " - %" PRIx64 "\n", seg_start, seg_end);
   1110 #endif
   1111 			continue;
   1112 		}
   1113 
   1114 		/*
   1115 		 * Discard leading hunk that overlaps the kernel
   1116 		 *  [pa_kstart             pa_kend]
   1117 		 *            [seg_start            seg_end]
   1118 		 */
   1119 		if (pa_kstart < seg_start &&
   1120 		    seg_start < pa_kend &&
   1121 		    pa_kend < seg_end) {
   1122 #ifdef DEBUG_MEMLOAD
   1123 			printf("discard leading kernel overlap "
   1124 			    "%" PRIx64 " - %" PRIxPADDR "\n",
   1125 			    seg_start, pa_kend);
   1126 #endif
   1127 			seg_start = pa_kend;
   1128 		}
   1129 
   1130 		/*
   1131 		 * Discard trailing hunk that overlaps the kernel
   1132 		 *             [pa_kstart            pa_kend]
   1133 		 *  [seg_start              seg_end]
   1134 		 */
   1135 		if (seg_start < pa_kstart &&
   1136 		    pa_kstart < seg_end &&
   1137 		    seg_end < pa_kend) {
   1138 #ifdef DEBUG_MEMLOAD
   1139 			printf("discard trailing kernel overlap "
   1140 			    "%" PRIxPADDR " - %" PRIx64 "\n",
   1141 			    pa_kstart, seg_end);
   1142 #endif
   1143 			seg_end = pa_kstart;
   1144 		}
   1145 
   1146 		/* First hunk */
   1147 		if (seg_start != seg_end) {
   1148 			x86_load_region(seg_start, seg_end);
   1149 		}
   1150 
   1151 		/* Second hunk */
   1152 		if (seg_start1 != seg_end1) {
   1153 			x86_load_region(seg_start1, seg_end1);
   1154 		}
   1155 	}
   1156 
   1157 	return 0;
   1158 }
   1159 
   1160 #endif /* !XENPV */
   1161 
   1162 void
   1163 init_x86_msgbuf(void)
   1164 {
   1165 	/* Message buffer is located at end of core. */
   1166 	psize_t sz = round_page(MSGBUFSIZE);
   1167 	psize_t reqsz = sz;
   1168 	uvm_physseg_t x;
   1169 
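        	/*
        	 * Carve the message buffer out of the top of the physical segment
        	 * that ends at avail_end.  If that segment is too small, loop and
        	 * take the remainder from the next highest segment.
        	 */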
   1170 search_again:
   1171 	for (x = uvm_physseg_get_first();
   1172 	     uvm_physseg_valid_p(x);
   1173 	     x = uvm_physseg_get_next(x)) {
   1174 
   1175 		if (ctob(uvm_physseg_get_avail_end(x)) == avail_end)
   1176 			break;
   1177 	}
   1178 
   1179 	if (uvm_physseg_valid_p(x) == false)
   1180 		panic("init_x86_msgbuf: can't find end of memory");
   1181 
   1182 	/* Shrink so it'll fit in the last segment. */
   1183 	if (uvm_physseg_get_avail_end(x) - uvm_physseg_get_avail_start(x) < atop(sz))
   1184 		sz = ctob(uvm_physseg_get_avail_end(x) - uvm_physseg_get_avail_start(x));
   1185 
   1186 	msgbuf_p_seg[msgbuf_p_cnt].sz = sz;
   1187 	msgbuf_p_seg[msgbuf_p_cnt++].paddr = ctob(uvm_physseg_get_avail_end(x)) - sz;
   1188 	uvm_physseg_unplug(uvm_physseg_get_end(x) - atop(sz), atop(sz));
   1189 
   1190 	/* Now find where the new avail_end is. */
   1191 	avail_end = ctob(uvm_physseg_get_highest_frame());
   1192 
   1193 	if (sz == reqsz)
   1194 		return;
   1195 
   1196 	reqsz -= sz;
   1197 	if (msgbuf_p_cnt == VM_PHYSSEG_MAX) {
   1198 		/* No more segments available, bail out. */
   1199 		printf("WARNING: MSGBUFSIZE (%zu) too large, using %zu.\n",
   1200 		    (size_t)MSGBUFSIZE, (size_t)(MSGBUFSIZE - reqsz));
   1201 		return;
   1202 	}
   1203 
   1204 	sz = reqsz;
   1205 	goto search_again;
   1206 }
   1207 
   1208 void
   1209 x86_reset(void)
   1210 {
   1211 	uint8_t b;
   1212 
   1213 #if NACPICA > 0
   1214 	/*
   1215 	 * If ACPI is active, try to reset using the reset register
   1216 	 * defined in the FADT.
   1217 	 */
   1218 	if (acpi_active) {
   1219 		if (acpi_reset() == 0) {
   1220 			delay(500000); /* wait 0.5 sec to see if that did it */
   1221 		}
   1222 	}
   1223 #endif
   1224 
   1225 	/*
   1226 	 * The keyboard controller has 4 random output pins, one of which is
   1227 	 * connected to the RESET pin on the CPU in many PCs.  We tell the
   1228 	 * keyboard controller to pulse this line a couple of times.
   1229 	 */
   1230 	outb(IO_KBD + KBCMDP, KBC_PULSE0);
   1231 	delay(100000);
   1232 	outb(IO_KBD + KBCMDP, KBC_PULSE0);
   1233 	delay(100000);
   1234 
   1235 	/*
   1236 	 * Attempt to force a reset via the Reset Control register at
   1237 	 * I/O port 0xcf9.  Bit 2 forces a system reset when it
   1238 	 * transitions from 0 to 1.  Bit 1 selects the type of reset
   1239 	 * to attempt: 0 selects a "soft" reset, and 1 selects a
   1240 	 * "hard" reset.  We try a "hard" reset.  The first write sets
   1241 	 * bit 1 to select a "hard" reset and clears bit 2.  The
   1242 	 * second write forces a 0 -> 1 transition in bit 2 to trigger
   1243 	 * a reset.
   1244 	 */
   1245 	outb(0xcf9, 0x2);
   1246 	outb(0xcf9, 0x6);
   1247 	DELAY(500000);	/* wait 0.5 sec to see if that did it */
   1248 
   1249 	/*
   1250 	 * Attempt to force a reset via the Fast A20 and Init register
   1251 	 * at I/O port 0x92. Bit 1 serves as an alternate A20 gate.
   1252 	 * Bit 0 asserts INIT# when set to 1. We are careful to only
   1253 	 * preserve bit 1 while setting bit 0. We also must clear bit
   1254 	 * 0 before setting it if it isn't already clear.
   1255 	 */
   1256 	b = inb(0x92);
   1257 	if (b != 0xff) {
   1258 		if ((b & 0x1) != 0)
   1259 			outb(0x92, b & 0xfe);
   1260 		outb(0x92, b | 0x1);
   1261 		DELAY(500000);	/* wait 0.5 sec to see if that did it */
   1262 	}
   1263 }
   1264 
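        /*
         * machdep kauth listener: I/O permission reads are always allowed; LDT
         * get/set is allowed only once machdep.user_ldt has been enabled.  All
         * other requests are deferred to other listeners.
         */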
   1265 static int
   1266 x86_listener_cb(kauth_cred_t cred, kauth_action_t action, void *cookie,
   1267     void *arg0, void *arg1, void *arg2, void *arg3)
   1268 {
   1269 	int result;
   1270 
   1271 	result = KAUTH_RESULT_DEFER;
   1272 
   1273 	switch (action) {
   1274 	case KAUTH_MACHDEP_IOPERM_GET:
   1275 		result = KAUTH_RESULT_ALLOW;
   1276 		break;
   1277 
   1278 	case KAUTH_MACHDEP_LDT_GET:
   1279 	case KAUTH_MACHDEP_LDT_SET:
   1280 		if (x86_user_ldt_enabled) {
   1281 			result = KAUTH_RESULT_ALLOW;
   1282 		}
   1283 		break;
   1284 
   1285 	default:
   1286 		break;
   1287 	}
   1288 
   1289 	return result;
   1290 }
   1291 
   1292 void
   1293 machdep_init(void)
   1294 {
   1295 
   1296 	x86_listener = kauth_listen_scope(KAUTH_SCOPE_MACHDEP,
   1297 	    x86_listener_cb, NULL);
   1298 }
   1299 
   1300 /*
   1301  * x86_startup: x86 common startup routine
   1302  *
   1303  * called by cpu_startup.
   1304  */
   1305 
   1306 void
   1307 x86_startup(void)
   1308 {
   1309 #if !defined(XENPV)
   1310 	nmi_init();
   1311 #endif
   1312 }
   1313 
   1314 const char *
   1315 get_booted_kernel(void)
   1316 {
   1317 	const struct btinfo_bootpath *bibp = lookup_bootinfo(BTINFO_BOOTPATH);
   1318 	return bibp ? bibp->bootpath : NULL;
   1319 }
   1320 
   1321 /*
   1322  * machine dependent system variables.
   1323  */
   1324 static int
   1325 sysctl_machdep_booted_kernel(SYSCTLFN_ARGS)
   1326 {
   1327 	struct btinfo_bootpath *bibp;
   1328 	struct sysctlnode node;
   1329 
   1330 	bibp = lookup_bootinfo(BTINFO_BOOTPATH);
   1331 	if (!bibp)
   1332 		return ENOENT; /* ??? */
   1333 
   1334 	node = *rnode;
   1335 	node.sysctl_data = bibp->bootpath;
   1336 	node.sysctl_size = sizeof(bibp->bootpath);
   1337 	return sysctl_lookup(SYSCTLFN_CALL(&node));
   1338 }
   1339 
   1340 static int
   1341 sysctl_machdep_bootmethod(SYSCTLFN_ARGS)
   1342 {
   1343 	struct sysctlnode node;
   1344 	char buf[5];
   1345 
   1346 	node = *rnode;
   1347 	node.sysctl_data = buf;
   1348 	if (bootmethod_efi)
   1349 		memcpy(node.sysctl_data, "UEFI", 5);
   1350 	else
   1351 		memcpy(node.sysctl_data, "BIOS", 5);
   1352 
   1353 	return sysctl_lookup(SYSCTLFN_CALL(&node));
   1354 }
   1355 
   1356 
   1357 static int
   1358 sysctl_machdep_diskinfo(SYSCTLFN_ARGS)
   1359 {
   1360 	struct sysctlnode node;
   1361 	extern struct bi_devmatch *x86_alldisks;
   1362 	extern int x86_ndisks;
   1363 
   1364 	if (x86_alldisks == NULL)
   1365 		return EOPNOTSUPP;
   1366 
   1367 	node = *rnode;
   1368 	node.sysctl_data = x86_alldisks;
   1369 	node.sysctl_size = sizeof(struct disklist) +
   1370 	    (x86_ndisks - 1) * sizeof(struct nativedisk_info);
   1371 	return sysctl_lookup(SYSCTLFN_CALL(&node));
   1372 }
   1373 
   1374 #ifndef XENPV
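        /*
         * Handler for machdep.tsc_user_enable: validate the new value and turn
         * the RDTSC instruction on or off for user programs.
         */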
   1375 static int
   1376 sysctl_machdep_tsc_enable(SYSCTLFN_ARGS)
   1377 {
   1378 	struct sysctlnode node;
   1379 	int error, val;
   1380 
   1381 	val = *(int *)rnode->sysctl_data;
   1382 
   1383 	node = *rnode;
   1384 	node.sysctl_data = &val;
   1385 
   1386 	error = sysctl_lookup(SYSCTLFN_CALL(&node));
   1387 	if (error != 0 || newp == NULL)
   1388 		return error;
   1389 
   1390 	if (val == 1) {
   1391 		tsc_user_enable();
   1392 	} else if (val == 0) {
   1393 		tsc_user_disable();
   1394 	} else {
   1395 		error = EINVAL;
   1396 	}
   1397 	if (error)
   1398 		return error;
   1399 
   1400 	*(int *)rnode->sysctl_data = val;
   1401 
   1402 	return 0;
   1403 }
   1404 #endif
   1405 
   1406 static const char * const vm_guest_name[VM_LAST] = {
   1407 	[VM_GUEST_NO] =		"none",
   1408 	[VM_GUEST_VM] =		"generic",
   1409 	[VM_GUEST_XENPV] =	"XenPV",
   1410 	[VM_GUEST_XENPVH] =	"XenPVH",
   1411 	[VM_GUEST_XENHVM] =	"XenHVM",
   1412 	[VM_GUEST_XENPVHVM] =	"XenPVHVM",
   1413 	[VM_GUEST_GENPVH] =	"GenPVH",
   1414 	[VM_GUEST_HV] =		"Hyper-V",
   1415 	[VM_GUEST_VMWARE] =	"VMware",
   1416 	[VM_GUEST_KVM] =	"KVM",
   1417 	[VM_GUEST_VIRTUALBOX] =	"VirtualBox",
   1418 	[VM_GUEST_NVMM] =	"NVMM",
   1419 };
   1420 
   1421 static int
   1422 sysctl_machdep_hypervisor(SYSCTLFN_ARGS)
   1423 {
   1424 	struct sysctlnode node;
   1425 	const char *t = NULL;
   1426 	char buf[64];
   1427 
   1428 	node = *rnode;
   1429 	node.sysctl_data = buf;
   1430 	if (vm_guest >= VM_GUEST_NO && vm_guest < VM_LAST)
   1431 		t = vm_guest_name[vm_guest];
   1432 	if (t == NULL)
   1433 		t = "unknown";
   1434 	strlcpy(buf, t, sizeof(buf));
   1435 	return sysctl_lookup(SYSCTLFN_CALL(&node));
   1436 }
   1437 
   1438 static void
   1439 const_sysctl(struct sysctllog **clog, const char *name, int type,
   1440     u_quad_t value, int tag)
   1441 {
   1442 	(sysctl_createv)(clog, 0, NULL, NULL,
   1443 		       CTLFLAG_PERMANENT | CTLFLAG_IMMEDIATE,
   1444 		       type, name, NULL, NULL, value, NULL, 0,
   1445 		       CTL_MACHDEP, tag, CTL_EOL);
   1446 }
   1447 
   1448 SYSCTL_SETUP(sysctl_machdep_setup, "sysctl machdep subtree setup")
   1449 {
   1450 	extern uint64_t tsc_freq;
   1451 #ifndef XENPV
   1452 	extern int tsc_user_enabled;
   1453 #endif
   1454 	extern int sparse_dump;
   1455 
   1456 	sysctl_createv(clog, 0, NULL, NULL,
   1457 		       CTLFLAG_PERMANENT,
   1458 		       CTLTYPE_NODE, "machdep", NULL,
   1459 		       NULL, 0, NULL, 0,
   1460 		       CTL_MACHDEP, CTL_EOL);
   1461 
   1462 	sysctl_createv(clog, 0, NULL, NULL,
   1463 		       CTLFLAG_PERMANENT,
   1464 		       CTLTYPE_STRUCT, "console_device", NULL,
   1465 		       sysctl_consdev, 0, NULL, sizeof(dev_t),
   1466 		       CTL_MACHDEP, CPU_CONSDEV, CTL_EOL);
   1467 	sysctl_createv(clog, 0, NULL, NULL,
   1468 		       CTLFLAG_PERMANENT,
   1469 		       CTLTYPE_STRING, "booted_kernel", NULL,
   1470 		       sysctl_machdep_booted_kernel, 0, NULL, 0,
   1471 		       CTL_MACHDEP, CPU_BOOTED_KERNEL, CTL_EOL);
   1472 	sysctl_createv(clog, 0, NULL, NULL,
   1473 		       CTLFLAG_PERMANENT,
   1474 		       CTLTYPE_STRING, "bootmethod", NULL,
   1475 		       sysctl_machdep_bootmethod, 0, NULL, 0,
   1476 		       CTL_MACHDEP, CTL_CREATE, CTL_EOL);
   1477 	sysctl_createv(clog, 0, NULL, NULL,
   1478 		       CTLFLAG_PERMANENT,
   1479 		       CTLTYPE_STRUCT, "diskinfo", NULL,
   1480 		       sysctl_machdep_diskinfo, 0, NULL, 0,
   1481 		       CTL_MACHDEP, CPU_DISKINFO, CTL_EOL);
   1482 	sysctl_createv(clog, 0, NULL, NULL,
   1483 		       CTLFLAG_PERMANENT,
   1484 		       CTLTYPE_STRING, "cpu_brand", NULL,
   1485 		       NULL, 0, cpu_brand_string, 0,
   1486 		       CTL_MACHDEP, CTL_CREATE, CTL_EOL);
   1487 	sysctl_createv(clog, 0, NULL, NULL,
   1488 		       CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
   1489 		       CTLTYPE_INT, "sparse_dump", NULL,
   1490 		       NULL, 0, &sparse_dump, 0,
   1491 		       CTL_MACHDEP, CTL_CREATE, CTL_EOL);
   1492 	sysctl_createv(clog, 0, NULL, NULL,
   1493 		       CTLFLAG_PERMANENT,
   1494 		       CTLTYPE_QUAD, "tsc_freq", NULL,
   1495 		       NULL, 0, &tsc_freq, 0,
   1496 		       CTL_MACHDEP, CTL_CREATE, CTL_EOL);
   1497 	sysctl_createv(clog, 0, NULL, NULL,
   1498 		       CTLFLAG_PERMANENT,
   1499 		       CTLTYPE_INT, "pae",
   1500 		       SYSCTL_DESCR("Whether the kernel uses PAE"),
   1501 		       NULL, 0, &use_pae, 0,
   1502 		       CTL_MACHDEP, CTL_CREATE, CTL_EOL);
   1503 #ifndef XENPV
   1504 	sysctl_createv(clog, 0, NULL, NULL,
   1505 		       CTLFLAG_READWRITE,
   1506 		       CTLTYPE_INT, "tsc_user_enable",
   1507 		       SYSCTL_DESCR("RDTSC instruction enabled in usermode"),
   1508 		       sysctl_machdep_tsc_enable, 0, &tsc_user_enabled, 0,
   1509 		       CTL_MACHDEP, CTL_CREATE, CTL_EOL);
   1510 #endif
   1511 	sysctl_createv(clog, 0, NULL, NULL,
   1512 		       CTLFLAG_PERMANENT,
   1513 		       CTLTYPE_STRING, "hypervisor", NULL,
   1514 		       sysctl_machdep_hypervisor, 0, NULL, 0,
   1515 		       CTL_MACHDEP, CTL_CREATE, CTL_EOL);
   1516 #ifdef SVS
   1517 	const struct sysctlnode *svs_rnode = NULL;
   1518 	sysctl_createv(clog, 0, NULL, &svs_rnode,
   1519 		       CTLFLAG_PERMANENT,
   1520 		       CTLTYPE_NODE, "svs", NULL,
   1521 		       NULL, 0, NULL, 0,
   1522 		       CTL_MACHDEP, CTL_CREATE, CTL_EOL);
   1523 	sysctl_createv(clog, 0, &svs_rnode, NULL,
   1524 		       CTLFLAG_PERMANENT,
   1525 		       CTLTYPE_BOOL, "enabled",
   1526 		       SYSCTL_DESCR("Whether the kernel uses SVS"),
   1527 		       NULL, 0, &svs_enabled, 0,
   1528 		       CTL_CREATE, CTL_EOL);
   1529 	sysctl_createv(clog, 0, &svs_rnode, NULL,
   1530 		       CTLFLAG_PERMANENT,
   1531 		       CTLTYPE_BOOL, "pcid",
   1532 		       SYSCTL_DESCR("Whether SVS uses PCID"),
   1533 		       NULL, 0, &svs_pcid, 0,
   1534 		       CTL_CREATE, CTL_EOL);
   1535 #endif
   1536 
   1537 	sysctl_createv(clog, 0, NULL, NULL,
   1538 		       CTLFLAG_READWRITE,
   1539 		       CTLTYPE_BOOL, "user_ldt",
   1540 		       SYSCTL_DESCR("Whether USER_LDT is enabled"),
   1541 		       NULL, 0, &x86_user_ldt_enabled, 0,
   1542 		       CTL_MACHDEP, CTL_CREATE, CTL_EOL);
   1543 
   1544 #ifndef XENPV
   1545 	void sysctl_speculation_init(struct sysctllog **);
   1546 	sysctl_speculation_init(clog);
   1547 #endif
   1548 
   1549 	/* None of these can ever change once the system has booted */
   1550 	const_sysctl(clog, "fpu_present", CTLTYPE_INT, i386_fpu_present,
   1551 	    CPU_FPU_PRESENT);
   1552 	const_sysctl(clog, "osfxsr", CTLTYPE_INT, i386_use_fxsave,
   1553 	    CPU_OSFXSR);
   1554 	const_sysctl(clog, "sse", CTLTYPE_INT, i386_has_sse,
   1555 	    CPU_SSE);
   1556 	const_sysctl(clog, "sse2", CTLTYPE_INT, i386_has_sse2,
   1557 	    CPU_SSE2);
   1558 
   1559 	const_sysctl(clog, "fpu_save", CTLTYPE_INT, x86_fpu_save,
   1560 	    CPU_FPU_SAVE);
   1561 	const_sysctl(clog, "fpu_save_size", CTLTYPE_INT, x86_fpu_save_size,
   1562 	    CPU_FPU_SAVE_SIZE);
   1563 	const_sysctl(clog, "xsave_features", CTLTYPE_QUAD, x86_xsave_features,
   1564 	    CPU_XSAVE_FEATURES);
   1565 
   1566 #ifndef XENPV
   1567 	const_sysctl(clog, "biosbasemem", CTLTYPE_INT, biosbasemem,
   1568 	    CPU_BIOSBASEMEM);
   1569 	const_sysctl(clog, "biosextmem", CTLTYPE_INT, biosextmem,
   1570 	    CPU_BIOSEXTMEM);
   1571 #endif
   1572 }
   1573 
   1574 /* Here for want of a better place */
   1575 #if defined(DOM0OPS) || !defined(XENPV)
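        /*
         * intr_findpic: return the PIC serving interrupt source 'num': the I/O
         * APIC mapping that pin if one exists, the i8259 for legacy IRQs, or NULL.
         */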
   1576 struct pic *
   1577 intr_findpic(int num)
   1578 {
   1579 #if NIOAPIC > 0
   1580 	struct ioapic_softc *pic;
   1581 
   1582 	pic = ioapic_find_bybase(num);
   1583 	if (pic != NULL)
   1584 		return &pic->sc_pic;
   1585 #endif
   1586 	if (num < NUM_LEGACY_IRQS)
   1587 		return &i8259_pic;
   1588 
   1589 	return NULL;
   1590 }
   1591 #endif
   1592 
   1593 void
   1594 cpu_initclocks(void)
   1595 {
   1596 
   1597 	/*
   1598 	 * Re-calibrate TSC on boot CPU using most accurate time source,
   1599 	 * thus making accurate TSC available for x86_initclock_func().
   1600 	 */
   1601 	cpu_get_tsc_freq(curcpu());
   1602 
   1603 	/* Now start the clocks on this CPU (the boot CPU). */
   1604 	(*x86_initclock_func)();
   1605 }
   1606 
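        /*
         * x86_cpu_is_lcall: check whether the instruction at 'ip' is the
         * historical "lcall $7,$0" (NetBSD) or "lcall $0x87,$0" (BSD/OS) system
         * call sequence.  Returns 0 if so, EINVAL otherwise, or a copyin error.
         */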
   1607 int
   1608 x86_cpu_is_lcall(const void *ip)
   1609 {
   1610 	static const uint8_t lcall[] = { 0x9a, 0, 0, 0, 0 };
   1611 	int error;
   1612 	const size_t sz = sizeof(lcall) + 2;
   1613 	uint8_t tmp[sizeof(lcall) + 2];
   1614 
   1615 	if ((error = copyin(ip, tmp, sz)) != 0)
   1616 		return error;
   1617 
   1618 	if (memcmp(tmp, lcall, sizeof(lcall)) != 0 || tmp[sz - 1] != 0)
   1619 		return EINVAL;
   1620 
   1621 	switch (tmp[sz - 2]) {
   1622 	case (uint8_t)0x07: /* NetBSD */
   1623 	case (uint8_t)0x87: /* BSD/OS */
   1624 		return 0;
   1625 	default:
   1626 		return EINVAL;
   1627 	}
   1628 }
   1629