/*	$NetBSD: x86_machdep.c,v 1.160 2025/12/05 17:58:12 khorben Exp $	*/

/*-
 * Copyright (c) 2002, 2006, 2007 YAMAMOTO Takashi,
 * Copyright (c) 2005, 2008, 2009, 2019, 2023 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Julio M. Merino Vidal, and Andrew Doran.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: x86_machdep.c,v 1.160 2025/12/05 17:58:12 khorben Exp $");

#include "opt_modular.h"
#include "opt_physmem.h"
#include "opt_splash.h"
#include "opt_kaslr.h"
#include "opt_svs.h"
#include "opt_xen.h"

#include <sys/types.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kcore.h>
#include <sys/errno.h>
#include <sys/kauth.h>
#include <sys/mutex.h>
#include <sys/cpu.h>
#include <sys/intr.h>
#include <sys/atomic.h>
#include <sys/module.h>
#include <sys/sysctl.h>
#include <sys/extent.h>
#include <sys/rnd.h>

#include <x86/bootspace.h>
#include <x86/cpuvar.h>
#include <x86/cputypes.h>
#include <x86/efi.h>
#include <x86/machdep.h>
#include <x86/nmi.h>
#include <x86/pio.h>

#include <dev/splash/splash.h>
#include <dev/isa/isareg.h>
#include <dev/ic/i8042reg.h>
#include <dev/mm.h>

#include <machine/bootinfo.h>
#include <machine/pmap_private.h>
#include <machine/vmparam.h>

#include <uvm/uvm_extern.h>

#include "tsc.h"

#include "acpica.h"
#include "ioapic.h"
#include "lapic.h"

#if NACPICA > 0
#include <dev/acpi/acpivar.h>
#endif

#if NIOAPIC > 0 || NACPICA > 0
#include <machine/i82093var.h>
#endif

#include "opt_md.h"
#if defined(MEMORY_DISK_HOOKS) && defined(MEMORY_DISK_DYNAMIC)
#include <dev/md.h>
#endif

void (*x86_cpu_idle)(void);
static bool x86_cpu_idle_ipi;
static char x86_cpu_idle_text[16];

static bool x86_user_ldt_enabled __read_mostly = false;

#ifdef XEN

#include <xen/xen.h>
#include <xen/hypervisor.h>
#endif

#ifndef XENPV
void (*delay_func)(unsigned int) = i8254_delay;
void (*x86_initclock_func)(void) = i8254_initclocks;
#else /* XENPV */
void (*delay_func)(unsigned int) = xen_delay;
void (*x86_initclock_func)(void) = xen_initclocks;
#endif


/* --------------------------------------------------------------------- */

/*
 * Main bootinfo structure.  This is filled in by the bootstrap process
 * done in locore.S based on the information passed by the boot loader.
 */
struct bootinfo bootinfo;

/* --------------------------------------------------------------------- */

bool bootmethod_efi;

static kauth_listener_t x86_listener;

extern paddr_t lowmem_rsvd, avail_start, avail_end;

vaddr_t msgbuf_vaddr;

struct msgbuf_p_seg msgbuf_p_seg[VM_PHYSSEG_MAX];

unsigned int msgbuf_p_cnt = 0;

void init_x86_msgbuf(void);

/*
 * Given the type of a bootinfo entry, looks for a matching item inside
 * the bootinfo structure.  If found, returns a pointer to it (which must
 * then be cast to the appropriate bootinfo_* type); otherwise, returns
 * NULL.
 */
void *
lookup_bootinfo(int type)
{
	bool found;
	int i;
	struct btinfo_common *bic;

	bic = (struct btinfo_common *)(bootinfo.bi_data);
	found = FALSE;
	for (i = 0; i < bootinfo.bi_nentries && !found; i++) {
		if (bic->type == type)
			found = TRUE;
		else
			bic = (struct btinfo_common *)
			    ((uint8_t *)bic + bic->len);
	}

	return found ? bic : NULL;
}

#ifdef notyet
/*
 * List the available bootinfo entries.
 */
static const char *btinfo_str[] = {
	BTINFO_STR
};

void
aprint_bootinfo(void)
{
	int i;
	struct btinfo_common *bic;

	aprint_normal("bootinfo:");
	bic = (struct btinfo_common *)(bootinfo.bi_data);
	for (i = 0; i < bootinfo.bi_nentries; i++) {
		if (bic->type >= 0 && bic->type < __arraycount(btinfo_str))
			aprint_normal(" %s", btinfo_str[bic->type]);
		else
			aprint_normal(" %d", bic->type);
		bic = (struct btinfo_common *)
		    ((uint8_t *)bic + bic->len);
	}
	aprint_normal("\n");
}
#endif

/*
 * mm_md_physacc: check if given pa is accessible.
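 *
 * Addresses that fall inside a known RAM cluster are always allowed;
 * anything else is deferred to the machdep kauth policy
 * (KAUTH_MACHDEP_UNMANAGEDMEM).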
 */
int
mm_md_physacc(paddr_t pa, vm_prot_t prot)
{
	extern phys_ram_seg_t mem_clusters[VM_PHYSSEG_MAX];
	extern int mem_cluster_cnt;
	int i;

	for (i = 0; i < mem_cluster_cnt; i++) {
		const phys_ram_seg_t *seg = &mem_clusters[i];
		paddr_t lstart = seg->start;

		if (lstart <= pa && pa - lstart <= seg->size) {
			return 0;
		}
	}
	return kauth_authorize_machdep(kauth_cred_get(),
	    KAUTH_MACHDEP_UNMANAGEDMEM, NULL, NULL, NULL, NULL);
}

#ifdef MODULAR
#ifdef XEN
void x86_add_xen_modules(void);
void
x86_add_xen_modules(void)
{
#if defined(XENPVHVM) || defined(XENPVH)
	uint32_t i;
	struct hvm_modlist_entry *modlist;

	if (hvm_start_info->nr_modules == 0) {
		aprint_verbose("No Xen module info at boot\n");
		return;
	}
	aprint_debug("%d Xen module(s) at boot\n", hvm_start_info->nr_modules);
	modlist = (void *)((uintptr_t)hvm_start_info->modlist_paddr + KERNBASE);
	for (i = 0; i < hvm_start_info->nr_modules; i++) {
		if (memcmp(
		    (char *)((uintptr_t)modlist[i].paddr + KERNBASE),
		    "\177ELF", 4) == 0) {
			aprint_debug("Prep module path=%s len=%"PRIu64" pa=%p\n",
			    "pvh-module",
			    modlist[i].size,
			    (void *)((uintptr_t)modlist[i].paddr + KERNBASE));
			module_prime(
			    "pvh-module",
			    (void *)((uintptr_t)modlist[i].paddr + KERNBASE),
			    modlist[i].size);
#ifdef SPLASHSCREEN
		} else if (memcmp(
		    (char *)((uintptr_t)modlist[i].paddr + KERNBASE),
		    "\211PNG\r\n\032\n", 8) == 0 ||
		    memcmp(
		    (char *)((uintptr_t)modlist[i].paddr + KERNBASE),
		    "\377\330\377", 3) == 0) {
			aprint_debug("Splash image path=%s len=%"PRIu64" pa=%p\n",
			    "pvh-image", modlist[i].size,
			    (void *)((uintptr_t)modlist[i].paddr + KERNBASE));
			splash_setimage(
			    (void *)((uintptr_t)modlist[i].paddr + KERNBASE),
			    modlist[i].size);
#endif
#if defined(MEMORY_DISK_HOOKS) && defined(MEMORY_DISK_DYNAMIC)
		} else {
			aprint_debug("File-system image path=%s len=%"PRIu64" pa=%p\n",
			    "pvh-filesystem",
			    modlist[i].size,
			    (void *)((uintptr_t)modlist[i].paddr + KERNBASE));
			md_root_setconf(
			    (void *)((uintptr_t)modlist[i].paddr + KERNBASE),
			    modlist[i].size);
#endif
		}
	}
#endif
}
#endif /* XEN */
/*
 * Push any modules loaded by the boot loader.
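 *
 * Walk the BTINFO_MODULELIST handed over by the boot loader and route each
 * entry to its consumer: ELF modules are primed into the module subsystem,
 * splash images go to the splash driver, and file-system images become the
 * memory disk root.  Random seed entries are handled separately in
 * x86_rndseed().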
 */
void
module_init_md(void)
{
	struct btinfo_modulelist *biml;
	struct bi_modulelist_entry *bi, *bimax;

	biml = lookup_bootinfo(BTINFO_MODULELIST);
	if (biml == NULL) {
		aprint_debug("No module info at boot\n");
		return;
	}

	bi = (struct bi_modulelist_entry *)((uint8_t *)biml + sizeof(*biml));
	bimax = bi + biml->num;
	for (; bi < bimax; bi++) {
		switch (bi->type) {
		case BI_MODULE_ELF:
			aprint_debug("Prep module path=%s len=%d pa=%x\n",
			    bi->path, bi->len, bi->base);
			KASSERT(trunc_page(bi->base) == bi->base);
			module_prime(bi->path,
#ifdef KASLR
			    (void *)PMAP_DIRECT_MAP((uintptr_t)bi->base),
#else
			    (void *)((uintptr_t)bi->base + KERNBASE),
#endif
			    bi->len);
			break;
		case BI_MODULE_IMAGE:
#ifdef SPLASHSCREEN
			aprint_debug("Splash image path=%s len=%d pa=%x\n",
			    bi->path, bi->len, bi->base);
			KASSERT(trunc_page(bi->base) == bi->base);
			splash_setimage(
#ifdef KASLR
			    (void *)PMAP_DIRECT_MAP((uintptr_t)bi->base),
#else
			    (void *)((uintptr_t)bi->base + KERNBASE),
#endif
			    bi->len);
#endif
			break;
		case BI_MODULE_RND:
			/* handled in x86_rndseed */
			break;
		case BI_MODULE_FS:
			aprint_debug("File-system image path=%s len=%d pa=%x\n",
			    bi->path, bi->len, bi->base);
			KASSERT(trunc_page(bi->base) == bi->base);
#if defined(MEMORY_DISK_HOOKS) && defined(MEMORY_DISK_DYNAMIC)
			md_root_setconf(
#ifdef KASLR
			    (void *)PMAP_DIRECT_MAP((uintptr_t)bi->base),
#else
			    (void *)((uintptr_t)bi->base + KERNBASE),
#endif
			    bi->len);
#endif
			break;
		default:
			aprint_debug("Skipping non-ELF module\n");
			break;
		}
	}
}
#endif /* MODULAR */

void
x86_rndseed(void)
{
	struct btinfo_modulelist *biml;
	struct bi_modulelist_entry *bi, *bimax;

	biml = lookup_bootinfo(BTINFO_MODULELIST);
	if (biml == NULL) {
		aprint_debug("No module info at boot\n");
		return;
	}

	bi = (struct bi_modulelist_entry *)((uint8_t *)biml + sizeof(*biml));
	bimax = bi + biml->num;
	for (; bi < bimax; bi++) {
		switch (bi->type) {
		case BI_MODULE_RND:
			aprint_debug("Random seed data path=%s len=%d pa=%x\n",
			    bi->path, bi->len, bi->base);
			KASSERT(trunc_page(bi->base) == bi->base);
			rnd_seed(
#ifdef KASLR
			    (void *)PMAP_DIRECT_MAP((uintptr_t)bi->base),
#else
			    (void *)((uintptr_t)bi->base + KERNBASE),
#endif
			    bi->len);
		}
	}
}

void
cpu_need_resched(struct cpu_info *ci, struct lwp *l, int flags)
{

	KASSERT(kpreempt_disabled());

	if ((flags & RESCHED_IDLE) != 0) {
		if ((flags & RESCHED_REMOTE) != 0 &&
		    x86_cpu_idle_ipi != false) {
			cpu_kick(ci);
		}
		return;
	}

#ifdef __HAVE_PREEMPTION
	if ((flags & RESCHED_KPREEMPT) != 0) {
		if ((flags & RESCHED_REMOTE) != 0) {
#ifdef XENPV
			xen_send_ipi(ci, XEN_IPI_KPREEMPT);
#else
			x86_send_ipi(ci, X86_IPI_KPREEMPT);
#endif
		} else {
			softint_trigger(1 << SIR_PREEMPT);
		}
		return;
	}
#endif

	KASSERT((flags & RESCHED_UPREEMPT) != 0);
	if ((flags & RESCHED_REMOTE) != 0) {
		cpu_kick(ci);
	} else {
		aston(l);
	}
}

void
cpu_signotify(struct lwp *l)
{

	KASSERT(kpreempt_disabled());

	if (l->l_cpu != curcpu()) {
		cpu_kick(l->l_cpu);
	} else {
		aston(l);
	}
}

void
cpu_need_proftick(struct lwp *l)
{

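	/* Only called for an LWP running on the current CPU. */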
	KASSERT(kpreempt_disabled());
	KASSERT(l->l_cpu == curcpu());

	l->l_pflag |= LP_OWEUPC;
	aston(l);
}

bool
cpu_intr_p(void)
{
	int idepth;
	long pctr;
	lwp_t *l;

	l = curlwp;
	if (__predict_false(l->l_cpu == NULL)) {
		KASSERT(l == &lwp0);
		return false;
	}
	do {
		pctr = lwp_pctr();
		idepth = l->l_cpu->ci_idepth;
	} while (__predict_false(pctr != lwp_pctr()));

	return idepth >= 0;
}

#ifdef __HAVE_PREEMPTION
/*
 * Called to check MD conditions that would prevent preemption, and to
 * arrange for those conditions to be rechecked later.
 */
bool
cpu_kpreempt_enter(uintptr_t where, int s)
{
	struct pcb *pcb;
	lwp_t *l;

	KASSERT(kpreempt_disabled());
	l = curlwp;

	/*
	 * If SPL raised, can't go.  Note this implies that spin
	 * mutexes at IPL_NONE are _not_ valid to use.
	 */
	if (s > IPL_PREEMPT) {
		softint_trigger(1 << SIR_PREEMPT);
		return false;
	}

	/* Must save cr2 or it could be clobbered. */
	pcb = lwp_getpcb(l);
	pcb->pcb_cr2 = rcr2();

	return true;
}

/*
 * Called after returning from a kernel preemption, and called with
 * preemption disabled.
 */
void
cpu_kpreempt_exit(uintptr_t where)
{
	extern char x86_copyfunc_start, x86_copyfunc_end;
#if defined(XENPV) && defined(i386)
	extern char i386_calltrap_start, i386_calltrap_end;
#endif
	struct pcb *pcb;

	KASSERT(kpreempt_disabled());

	/*
	 * If we interrupted any of the copy functions we must reload
	 * the pmap when resuming, as they cannot tolerate it being
	 * swapped out.
	 */
	if (where >= (uintptr_t)&x86_copyfunc_start &&
	    where < (uintptr_t)&x86_copyfunc_end) {
		pmap_load();
	}
#if defined(XENPV) && defined(i386)
	else if (where >= (uintptr_t)&i386_calltrap_start &&
	    where < (uintptr_t)&i386_calltrap_end) {
		pmap_load();
	}
#endif

	/* Restore cr2 only after the pmap, as pmap_load can block. */
	pcb = lwp_getpcb(curlwp);
	lcr2(pcb->pcb_cr2);
}

/*
 * Return true if preemption is disabled for MD reasons.  Must be called
 * with preemption disabled, and thus is only for diagnostic checks.
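 *
 * On x86 the only such MD condition is an elevated SPL.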
 */
bool
cpu_kpreempt_disabled(void)
{

	return curcpu()->ci_ilevel > IPL_NONE;
}
#endif /* __HAVE_PREEMPTION */

SYSCTL_SETUP(sysctl_machdep_cpu_idle, "sysctl machdep cpu_idle")
{
	const struct sysctlnode *mnode, *node;

	sysctl_createv(NULL, 0, NULL, &mnode,
	    CTLFLAG_PERMANENT, CTLTYPE_NODE, "machdep", NULL,
	    NULL, 0, NULL, 0, CTL_MACHDEP, CTL_EOL);

	sysctl_createv(NULL, 0, &mnode, &node,
	    CTLFLAG_PERMANENT, CTLTYPE_STRING, "idle-mechanism",
	    SYSCTL_DESCR("Mechanism used for the idle loop."),
	    NULL, 0, x86_cpu_idle_text, 0,
	    CTL_CREATE, CTL_EOL);
}

void
x86_cpu_idle_init(void)
{

#ifndef XENPV
	if ((cpu_feature[1] & CPUID2_MONITOR) == 0)
		x86_cpu_idle_set(x86_cpu_idle_halt, "halt", true);
	else
		x86_cpu_idle_set(x86_cpu_idle_mwait, "mwait", false);
#else
	x86_cpu_idle_set(x86_cpu_idle_xen, "xen", true);
#endif
}

void
x86_cpu_idle_get(void (**func)(void), char *text, size_t len)
{

	*func = x86_cpu_idle;

	(void)strlcpy(text, x86_cpu_idle_text, len);
}

void
x86_cpu_idle_set(void (*func)(void), const char *text, bool ipi)
{

	x86_cpu_idle = func;
	x86_cpu_idle_ipi = ipi;

	(void)strlcpy(x86_cpu_idle_text, text, sizeof(x86_cpu_idle_text));
}

#ifndef XENPV

#define KBTOB(x)	((size_t)(x) * 1024UL)
#define MBTOB(x)	((size_t)(x) * 1024UL * 1024UL)

static struct {
	int freelist;
	uint64_t limit;
} x86_freelists[VM_NFREELIST] = {
	{ VM_FREELIST_DEFAULT, 0 },
#ifdef VM_FREELIST_FIRST1T
	/* 40-bit addresses needed for modern graphics. */
	{ VM_FREELIST_FIRST1T, 1ULL * 1024 * 1024 * 1024 * 1024 },
#endif
#ifdef VM_FREELIST_FIRST64G
	/* 36-bit addresses needed for oldish graphics. */
	{ VM_FREELIST_FIRST64G, 64ULL * 1024 * 1024 * 1024 },
#endif
#ifdef VM_FREELIST_FIRST4G
	/* 32-bit addresses needed for PCI 32-bit DMA and old graphics. */
	{ VM_FREELIST_FIRST4G, 4ULL * 1024 * 1024 * 1024 },
#endif
	/* 30-bit addresses needed for ancient graphics. */
	{ VM_FREELIST_FIRST1G, 1ULL * 1024 * 1024 * 1024 },
	/* 24-bit addresses needed for ISA DMA. */
	{ VM_FREELIST_FIRST16, 16 * 1024 * 1024 },
};

int
x86_select_freelist(uint64_t maxaddr)
{
	unsigned int i;

	if (avail_end <= maxaddr)
		return VM_NFREELIST;

	for (i = 0; i < __arraycount(x86_freelists); i++) {
		if ((x86_freelists[i].limit - 1) <= maxaddr)
			return x86_freelists[i].freelist;
	}

	panic("no freelist for maximum address %"PRIx64, maxaddr);
}

static int
x86_add_cluster(uint64_t seg_start, uint64_t seg_end, uint32_t type)
{
	extern struct extent *iomem_ex;
	const uint64_t endext = MAXIOMEM + 1;
	uint64_t new_physmem = 0;
	phys_ram_seg_t *cluster;
	int i;

	if (seg_end > MAXPHYSMEM) {
		aprint_verbose("WARNING: skipping large memory map entry: "
		    "0x%"PRIx64"/0x%"PRIx64"/0x%x\n",
		    seg_start, (seg_end - seg_start), type);
		return 0;
	}

	/*
	 * XXX: Chop the last page off the size so that it can fit in
	 * avail_end.
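	 * (A segment ending exactly at MAXPHYSMEM is truncated by one page
	 * rather than being skipped outright.)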
	 */
	if (seg_end == MAXPHYSMEM)
		seg_end -= PAGE_SIZE;

	if (seg_end <= seg_start)
		return 0;

	for (i = 0; i < mem_cluster_cnt; i++) {
		cluster = &mem_clusters[i];
		if ((cluster->start == round_page(seg_start)) &&
		    (cluster->size == trunc_page(seg_end) - cluster->start)) {
#ifdef DEBUG_MEMLOAD
			printf("WARNING: skipping duplicate segment entry\n");
#endif
			return 0;
		}
	}

	/*
	 * This cluster is used by RAM.  If it is included in the iomem
	 * extent, allocate it from there, so that we won't unintentionally
	 * reuse it later with extent_alloc_region.  This avoids collisions
	 * (with UVM, for example).
	 *
	 * This is done before the addresses are page rounded just to make
	 * sure we get them all.
	 */
	if (seg_start < endext) {
		uint64_t io_end;

		if (seg_end > endext)
			io_end = endext;
		else
			io_end = seg_end;

		if (iomem_ex != NULL && extent_alloc_region(iomem_ex, seg_start,
		    io_end - seg_start, EX_NOWAIT)) {
			/* XXX What should we do? */
			printf("WARNING: CAN'T ALLOCATE MEMORY SEGMENT "
			    "(0x%"PRIx64"/0x%"PRIx64"/0x%x) FROM "
			    "IOMEM EXTENT MAP!\n",
			    seg_start, seg_end - seg_start, type);
			return 0;
		}
	}

	/* If it's not free memory, skip it. */
	if (type != BIM_Memory)
		return 0;

	if (mem_cluster_cnt >= VM_PHYSSEG_MAX) {
		printf("WARNING: too many memory segments "
		    "(increase VM_PHYSSEG_MAX)");
		return -1;
	}

#ifdef PHYSMEM_MAX_ADDR
	if (seg_start >= MBTOB(PHYSMEM_MAX_ADDR))
		return 0;
	if (seg_end > MBTOB(PHYSMEM_MAX_ADDR))
		seg_end = MBTOB(PHYSMEM_MAX_ADDR);
#endif

	seg_start = round_page(seg_start);
	seg_end = trunc_page(seg_end);

	if (seg_start == seg_end)
		return 0;

	cluster = &mem_clusters[mem_cluster_cnt];
	cluster->start = seg_start;
	if (iomem_ex != NULL)
		new_physmem = physmem + atop(seg_end - seg_start);

#ifdef PHYSMEM_MAX_SIZE
	if (iomem_ex != NULL) {
		if (physmem >= atop(MBTOB(PHYSMEM_MAX_SIZE)))
			return 0;
		if (new_physmem > atop(MBTOB(PHYSMEM_MAX_SIZE))) {
			seg_end = seg_start + MBTOB(PHYSMEM_MAX_SIZE) - ptoa(physmem);
			new_physmem = atop(MBTOB(PHYSMEM_MAX_SIZE));
		}
	}
#endif

	cluster->size = seg_end - seg_start;

	if (iomem_ex != NULL) {
		if (avail_end < seg_end)
			avail_end = seg_end;
		physmem = new_physmem;
	}
	mem_cluster_cnt++;

	return 0;
}

static int
x86_parse_clusters(struct btinfo_memmap *bim)
{
	uint64_t seg_start, seg_end;
	uint64_t addr, size;
	uint32_t type;
	int x;

	KASSERT(bim != NULL);
	KASSERT(bim->num > 0);

#ifdef DEBUG_MEMLOAD
	printf("MEMMAP: %s MEMORY MAP (%d ENTRIES):\n",
	    lookup_bootinfo(BTINFO_EFIMEMMAP) != NULL ? "UEFI" : "BIOS",
	    bim->num);
#endif

	for (x = 0; x < bim->num; x++) {
		addr = bim->entry[x].addr;
		size = bim->entry[x].size;
		type = bim->entry[x].type;
#ifdef DEBUG_MEMLOAD
		printf("MEMMAP: 0x%016" PRIx64 "-0x%016" PRIx64
		    "\n\tsize=0x%016" PRIx64 ", type=%d(%s)\n",
		    addr, addr + size - 1, size, type,
		    (type == BIM_Memory) ? "Memory" :
		    (type == BIM_Reserved) ? "Reserved" :
		    (type == BIM_ACPI) ? "ACPI" :
		    (type == BIM_NVS) ? "NVS" :
		    (type == BIM_PMEM) ? "Persistent" :
		    (type == BIM_PRAM) ? "Persistent (Legacy)" :
		    "unknown");
#endif

		/* If the segment is not memory, skip it. */
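		/*
		 * BIM_ACPI and BIM_NVS ranges are still passed to
		 * x86_add_cluster() so that they get reserved in the iomem
		 * extent, but only BIM_Memory ends up on the VM freelists.
		 */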
		switch (type) {
		case BIM_Memory:
		case BIM_ACPI:
		case BIM_NVS:
			break;
		default:
			continue;
		}

		/* If the segment is smaller than a page, skip it. */
		if (size < PAGE_SIZE)
			continue;

		seg_start = addr;
		seg_end = addr + size;

		/*
		 * XXX XXX: Avoid the ISA I/O MEM.
		 *
		 * Some laptops (for example, Toshiba Satellite2550X) report
		 * this area as valid.
		 */
		if (seg_start < IOM_END && seg_end > IOM_BEGIN) {
			printf("WARNING: memory map entry overlaps "
			    "with ``Compatibility Holes'': "
			    "0x%"PRIx64"/0x%"PRIx64"/0x%x\n", seg_start,
			    seg_end - seg_start, type);

			if (x86_add_cluster(seg_start, IOM_BEGIN, type) == -1)
				break;
			if (x86_add_cluster(IOM_END, seg_end, type) == -1)
				break;
		} else {
			if (x86_add_cluster(seg_start, seg_end, type) == -1)
				break;
		}
	}

	return 0;
}

static int
x86_fake_clusters(void)
{
	extern struct extent *iomem_ex;
	phys_ram_seg_t *cluster;

	KASSERT(mem_cluster_cnt == 0);

	/*
	 * Allocate the physical addresses used by RAM from the iomem extent
	 * map.  This is done before the addresses are page rounded just to
	 * make sure we get them all.
	 */
	if (extent_alloc_region(iomem_ex, 0, KBTOB(biosbasemem), EX_NOWAIT)) {
		/* XXX What should we do? */
		printf("WARNING: CAN'T ALLOCATE BASE MEMORY FROM "
		    "IOMEM EXTENT MAP!\n");
	}

	cluster = &mem_clusters[0];
	cluster->start = 0;
	cluster->size = trunc_page(KBTOB(biosbasemem));
	physmem += atop(cluster->size);

	if (extent_alloc_region(iomem_ex, IOM_END, KBTOB(biosextmem),
	    EX_NOWAIT)) {
		/* XXX What should we do? */
		printf("WARNING: CAN'T ALLOCATE EXTENDED MEMORY FROM "
		    "IOMEM EXTENT MAP!\n");
	}

#if NISADMA > 0
	/*
	 * Some motherboards/BIOSes remap the 384K of RAM that would
	 * normally be covered by the ISA hole to the end of memory
	 * so that it can be used.  However, on a 16M system, this
	 * would cause bounce buffers to be allocated and used.
	 * This is not desirable behaviour, as more than 384K of
	 * bounce buffers might be allocated.  As a work-around,
	 * we round memory down to the nearest 1M boundary if
	 * we're using any isadma devices and the remapped memory
	 * is what puts us over 16M.
	 */
	if (biosextmem > (15*1024) && biosextmem < (16*1024)) {
		char pbuf[9];

		format_bytes(pbuf, sizeof(pbuf), biosextmem - (15*1024));
		printf("Warning: ignoring %s of remapped memory\n", pbuf);
		biosextmem = (15*1024);
	}
#endif

	cluster = &mem_clusters[1];
	cluster->start = IOM_END;
	cluster->size = trunc_page(KBTOB(biosextmem));
	physmem += atop(cluster->size);

	mem_cluster_cnt = 2;

	avail_end = IOM_END + trunc_page(KBTOB(biosextmem));

	return 0;
}

/*
 * x86_load_region: load the physical memory region from seg_start to seg_end
 * into the VM system.
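 *
 * The region is carved up along the x86_freelists limits so that pages below
 * each boundary land on the matching freelist (ISA DMA, 32-bit PCI DMA, old
 * graphics, ...); whatever remains goes to VM_FREELIST_DEFAULT.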
 */
static void
x86_load_region(uint64_t seg_start, uint64_t seg_end)
{
	unsigned int i;
	uint64_t tmp;

	i = __arraycount(x86_freelists);
	while (i--) {
		if (x86_freelists[i].limit <= seg_start)
			continue;
		if (x86_freelists[i].freelist == VM_FREELIST_DEFAULT)
			continue;
		tmp = MIN(x86_freelists[i].limit, seg_end);
		if (tmp == seg_start)
			continue;

#ifdef DEBUG_MEMLOAD
		printf("loading freelist %d 0x%"PRIx64"-0x%"PRIx64
		    " (0x%"PRIx64"-0x%"PRIx64")\n", x86_freelists[i].freelist,
		    seg_start, tmp, (uint64_t)atop(seg_start),
		    (uint64_t)atop(tmp));
#endif

		uvm_page_physload(atop(seg_start), atop(tmp), atop(seg_start),
		    atop(tmp), x86_freelists[i].freelist);
		seg_start = tmp;
	}

	if (seg_start != seg_end) {
#ifdef DEBUG_MEMLOAD
		printf("loading default 0x%"PRIx64"-0x%"PRIx64
		    " (0x%"PRIx64"-0x%"PRIx64")\n", seg_start, seg_end,
		    (uint64_t)atop(seg_start), (uint64_t)atop(seg_end));
#endif
		uvm_page_physload(atop(seg_start), atop(seg_end),
		    atop(seg_start), atop(seg_end), VM_FREELIST_DEFAULT);
	}
}

#ifdef XEN
static void
x86_add_xen_clusters(void)
{
	if (hvm_start_info->memmap_entries > 0) {
		struct hvm_memmap_table_entry *map_entry;
		map_entry = (void *)((uintptr_t)hvm_start_info->memmap_paddr + KERNBASE);
		for (int i = 0; i < hvm_start_info->memmap_entries; i++) {
			if (map_entry[i].size < PAGE_SIZE)
				continue;
			switch (map_entry[i].type) {
			case XEN_HVM_MEMMAP_TYPE_RAM:
				x86_add_cluster(map_entry[i].addr,
				    map_entry[i].addr + map_entry[i].size,
				    BIM_Memory);
				break;
			case XEN_HVM_MEMMAP_TYPE_ACPI:
				x86_add_cluster(map_entry[i].addr,
				    map_entry[i].addr + map_entry[i].size,
				    BIM_ACPI);
				break;
			}
		}
	} else {
		struct xen_memory_map memmap;
		static struct _xen_mmap {
			struct btinfo_memmap bim;
			struct bi_memmap_entry map[128]; /* same as FreeBSD */
		} __packed xen_mmap;
		int err;

		memmap.nr_entries = 128;
		set_xen_guest_handle(memmap.buffer, &xen_mmap.bim.entry[0]);
		if ((err = HYPERVISOR_memory_op(XENMEM_memory_map, &memmap))
		    < 0)
			panic("XENMEM_memory_map %d", err);
		xen_mmap.bim.num = memmap.nr_entries;
		x86_parse_clusters(&xen_mmap.bim);
	}
}
#endif /* XEN */
/*
 * init_x86_clusters: retrieve the memory clusters provided by the BIOS, and
 * initialize mem_clusters.
 */
void
init_x86_clusters(void)
{
	struct btinfo_memmap *bim;
	struct btinfo_efimemmap *biem;

	/*
	 * Check to see if we have a memory map from the BIOS (passed to us by
	 * the boot program).
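	 *
	 * Preference order: a Xen PVH-provided map, then a UEFI or BIOS map
	 * from the boot program, and finally fake clusters derived from
	 * biosbasemem/biosextmem if nothing else yields a valid segment.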
	 */
#ifdef XEN
	if (pvh_boot) {
		x86_add_xen_clusters();
#ifdef MODULAR
		x86_add_xen_modules();
#endif
	}
#endif /* XEN */

#ifdef i386
	extern int biosmem_implicit;
	biem = lookup_bootinfo(BTINFO_EFIMEMMAP);
	if (biem != NULL)
		bim = efi_get_e820memmap();
	else
		bim = lookup_bootinfo(BTINFO_MEMMAP);
	if ((biosmem_implicit || (biosbasemem == 0 && biosextmem == 0)) &&
	    bim != NULL && bim->num > 0)
		x86_parse_clusters(bim);
#else
#if !defined(REALBASEMEM) && !defined(REALEXTMEM)
	biem = lookup_bootinfo(BTINFO_EFIMEMMAP);
	if (biem != NULL)
		bim = efi_get_e820memmap();
	else
		bim = lookup_bootinfo(BTINFO_MEMMAP);
	if (bim != NULL && bim->num > 0)
		x86_parse_clusters(bim);
#else
	(void)bim, (void)biem;
#endif
#endif

	if (mem_cluster_cnt == 0) {
		/*
		 * If x86_parse_clusters didn't find any valid segment, create
		 * fake clusters.
		 */
		x86_fake_clusters();
	}
}

/*
 * init_x86_vm: initialize the VM system on x86.  We basically internalize as
 * many physical pages as we can, starting at lowmem_rsvd, but we don't
 * internalize the kernel physical pages (from pa_kstart to pa_kend).
 */
int
init_x86_vm(paddr_t pa_kend)
{
	extern struct bootspace bootspace;
	paddr_t pa_kstart = bootspace.head.pa;
	uint64_t seg_start, seg_end;
	uint64_t seg_start1, seg_end1;
	int x;
	unsigned i;

	for (i = 0; i < __arraycount(x86_freelists); i++) {
		if (avail_end < x86_freelists[i].limit)
			x86_freelists[i].freelist = VM_FREELIST_DEFAULT;
	}

	/*
	 * Now, load the memory clusters (which have already been rounded and
	 * truncated) into the VM system.
	 *
	 * NOTE: we assume that memory starts at 0.
	 */
	for (x = 0; x < mem_cluster_cnt; x++) {
		const phys_ram_seg_t *cluster = &mem_clusters[x];

		seg_start = cluster->start;
		seg_end = cluster->start + cluster->size;
		seg_start1 = 0;
		seg_end1 = 0;

#ifdef DEBUG_MEMLOAD
		printf("segment %" PRIx64 " - %" PRIx64 "\n",
		    seg_start, seg_end);
#endif

		/* Skip memory before our available starting point. */
		if (seg_end <= lowmem_rsvd) {
#ifdef DEBUG_MEMLOAD
			printf("discard segment below starting point "
			    "%" PRIx64 " - %" PRIx64 "\n", seg_start, seg_end);
#endif
			continue;
		}

		if (seg_start <= lowmem_rsvd && lowmem_rsvd < seg_end) {
			seg_start = lowmem_rsvd;
			if (seg_start == seg_end) {
#ifdef DEBUG_MEMLOAD
				printf("discard segment below starting point "
				    "%" PRIx64 " - %" PRIx64 "\n",
				    seg_start, seg_end);
#endif
				continue;
			}
		}

		/*
		 * If this segment contains the kernel, split it in two, around
		 * the kernel.
		 *   [seg_start                        seg_end]
		 *              [pa_kstart  pa_kend]
		 */
		if (seg_start <= pa_kstart && pa_kend <= seg_end) {
#ifdef DEBUG_MEMLOAD
			printf("split kernel overlapping to "
			    "%" PRIx64 " - %" PRIxPADDR " and "
			    "%" PRIxPADDR " - %" PRIx64 "\n",
			    seg_start, pa_kstart, pa_kend, seg_end);
#endif
			seg_start1 = pa_kend;
			seg_end1 = seg_end;
			seg_end = pa_kstart;
			KASSERT(seg_end < seg_end1);
		}

		/*
		 * Discard a segment inside the kernel
		 *   [pa_kstart                        pa_kend]
		 *              [seg_start  seg_end]
		 */
		if (pa_kstart < seg_start && seg_end < pa_kend) {
#ifdef DEBUG_MEMLOAD
			printf("discard complete kernel overlap "
			    "%" PRIx64 " - %" PRIx64 "\n", seg_start, seg_end);
#endif
			continue;
		}

		/*
		 * Discard leading hunk that overlaps the kernel
		 *   [pa_kstart             pa_kend]
		 *              [seg_start            seg_end]
		 */
		if (pa_kstart < seg_start &&
		    seg_start < pa_kend &&
		    pa_kend < seg_end) {
#ifdef DEBUG_MEMLOAD
			printf("discard leading kernel overlap "
			    "%" PRIx64 " - %" PRIxPADDR "\n",
			    seg_start, pa_kend);
#endif
			seg_start = pa_kend;
		}

		/*
		 * Discard trailing hunk that overlaps the kernel
		 *              [pa_kstart            pa_kend]
		 *   [seg_start            seg_end]
		 */
		if (seg_start < pa_kstart &&
		    pa_kstart < seg_end &&
		    seg_end < pa_kend) {
#ifdef DEBUG_MEMLOAD
			printf("discard trailing kernel overlap "
			    "%" PRIxPADDR " - %" PRIx64 "\n",
			    pa_kstart, seg_end);
#endif
			seg_end = pa_kstart;
		}

		/* First hunk */
		if (seg_start != seg_end) {
			x86_load_region(seg_start, seg_end);
		}

		/* Second hunk */
		if (seg_start1 != seg_end1) {
			x86_load_region(seg_start1, seg_end1);
		}
	}

	return 0;
}

#endif /* !XENPV */

void
init_x86_msgbuf(void)
{
	/* Message buffer is located at end of core. */
	psize_t sz = round_page(MSGBUFSIZE);
	psize_t reqsz = sz;
	uvm_physseg_t x;

search_again:
	for (x = uvm_physseg_get_first();
	     uvm_physseg_valid_p(x);
	     x = uvm_physseg_get_next(x)) {

		if (ctob(uvm_physseg_get_avail_end(x)) == avail_end)
			break;
	}

	if (uvm_physseg_valid_p(x) == false)
		panic("init_x86_msgbuf: can't find end of memory");

	/* Shrink so it'll fit in the last segment. */
	if (uvm_physseg_get_avail_end(x) - uvm_physseg_get_avail_start(x) < atop(sz))
		sz = ctob(uvm_physseg_get_avail_end(x) - uvm_physseg_get_avail_start(x));

	msgbuf_p_seg[msgbuf_p_cnt].sz = sz;
	msgbuf_p_seg[msgbuf_p_cnt++].paddr = ctob(uvm_physseg_get_avail_end(x)) - sz;
	uvm_physseg_unplug(uvm_physseg_get_end(x) - atop(sz), atop(sz));

	/* Now find where the new avail_end is. */
	avail_end = ctob(uvm_physseg_get_highest_frame());

	if (sz == reqsz)
		return;

	reqsz -= sz;
	if (msgbuf_p_cnt == VM_PHYSSEG_MAX) {
		/* No more segments available, bail out. */
		printf("WARNING: MSGBUFSIZE (%zu) too large, using %zu.\n",
		    (size_t)MSGBUFSIZE, (size_t)(MSGBUFSIZE - reqsz));
		return;
	}

	sz = reqsz;
	goto search_again;
}

void
x86_reset(void)
{
	uint8_t b;

#if NACPICA > 0
	/*
	 * If ACPI is active, try to reset using the reset register
	 * defined in the FADT.
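	 *
	 * If that does not work (or ACPI is not active), fall through to the
	 * legacy methods below: the keyboard controller pulse, the Reset
	 * Control register at 0xcf9, and the Fast A20/Init register at 0x92.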
	 */
	if (acpi_active) {
		if (acpi_reset() == 0) {
			delay(500000); /* wait 0.5 sec to see if that did it */
		}
	}
#endif

	/*
	 * The keyboard controller has 4 random output pins, one of which is
	 * connected to the RESET pin on the CPU in many PCs.  We tell the
	 * keyboard controller to pulse this line a couple of times.
	 */
	outb(IO_KBD + KBCMDP, KBC_PULSE0);
	delay(100000);
	outb(IO_KBD + KBCMDP, KBC_PULSE0);
	delay(100000);

	/*
	 * Attempt to force a reset via the Reset Control register at
	 * I/O port 0xcf9.  Bit 2 forces a system reset when it
	 * transitions from 0 to 1.  Bit 1 selects the type of reset
	 * to attempt: 0 selects a "soft" reset, and 1 selects a
	 * "hard" reset.  We try a "hard" reset.  The first write sets
	 * bit 1 to select a "hard" reset and clears bit 2.  The
	 * second write forces a 0 -> 1 transition in bit 2 to trigger
	 * a reset.
	 */
	outb(0xcf9, 0x2);
	outb(0xcf9, 0x6);
	DELAY(500000);	/* wait 0.5 sec to see if that did it */

	/*
	 * Attempt to force a reset via the Fast A20 and Init register
	 * at I/O port 0x92.  Bit 1 serves as an alternate A20 gate.
	 * Bit 0 asserts INIT# when set to 1.  We are careful to only
	 * preserve bit 1 while setting bit 0.  We also must clear bit
	 * 0 before setting it if it isn't already clear.
	 */
	b = inb(0x92);
	if (b != 0xff) {
		if ((b & 0x1) != 0)
			outb(0x92, b & 0xfe);
		outb(0x92, b | 0x1);
		DELAY(500000);	/* wait 0.5 sec to see if that did it */
	}
}

static int
x86_listener_cb(kauth_cred_t cred, kauth_action_t action, void *cookie,
    void *arg0, void *arg1, void *arg2, void *arg3)
{
	int result;

	result = KAUTH_RESULT_DEFER;

	switch (action) {
	case KAUTH_MACHDEP_IOPERM_GET:
		result = KAUTH_RESULT_ALLOW;
		break;

	case KAUTH_MACHDEP_LDT_GET:
	case KAUTH_MACHDEP_LDT_SET:
		if (x86_user_ldt_enabled) {
			result = KAUTH_RESULT_ALLOW;
		}
		break;

	default:
		break;
	}

	return result;
}

void
machdep_init(void)
{

	x86_listener = kauth_listen_scope(KAUTH_SCOPE_MACHDEP,
	    x86_listener_cb, NULL);
}

/*
 * x86_startup: x86 common startup routine
 *
 * called by cpu_startup.
 */

void
x86_startup(void)
{
#if !defined(XENPV)
	nmi_init();
#endif
}

const char *
get_booted_kernel(void)
{
	const struct btinfo_bootpath *bibp = lookup_bootinfo(BTINFO_BOOTPATH);
	return bibp ? bibp->bootpath : NULL;
}

/*
 * machine dependent system variables.
 */
static int
sysctl_machdep_booted_kernel(SYSCTLFN_ARGS)
{
	struct btinfo_bootpath *bibp;
	struct sysctlnode node;

	bibp = lookup_bootinfo(BTINFO_BOOTPATH);
	if (!bibp)
		return ENOENT;	/* ??? */

	node = *rnode;
	node.sysctl_data = bibp->bootpath;
	node.sysctl_size = sizeof(bibp->bootpath);
	return sysctl_lookup(SYSCTLFN_CALL(&node));
}

static int
sysctl_machdep_bootmethod(SYSCTLFN_ARGS)
{
	struct sysctlnode node;
	char buf[5];

	node = *rnode;
	node.sysctl_data = buf;
	if (bootmethod_efi)
		memcpy(node.sysctl_data, "UEFI", 5);
	else
		memcpy(node.sysctl_data, "BIOS", 5);

	return sysctl_lookup(SYSCTLFN_CALL(&node));
}


static int
sysctl_machdep_diskinfo(SYSCTLFN_ARGS)
{
	struct sysctlnode node;
	extern struct bi_devmatch *x86_alldisks;
	extern int x86_ndisks;

	if (x86_alldisks == NULL)
		return EOPNOTSUPP;

	node = *rnode;
	node.sysctl_data = x86_alldisks;
	node.sysctl_size = sizeof(struct disklist) +
	    (x86_ndisks - 1) * sizeof(struct nativedisk_info);
	return sysctl_lookup(SYSCTLFN_CALL(&node));
}

#ifndef XENPV
static int
sysctl_machdep_tsc_enable(SYSCTLFN_ARGS)
{
	struct sysctlnode node;
	int error, val;

	val = *(int *)rnode->sysctl_data;

	node = *rnode;
	node.sysctl_data = &val;

	error = sysctl_lookup(SYSCTLFN_CALL(&node));
	if (error != 0 || newp == NULL)
		return error;

	if (val == 1) {
		tsc_user_enable();
	} else if (val == 0) {
		tsc_user_disable();
	} else {
		error = EINVAL;
	}
	if (error)
		return error;

	*(int *)rnode->sysctl_data = val;

	return 0;
}
#endif

static const char * const vm_guest_name[VM_LAST] = {
	[VM_GUEST_NO] = "none",
	[VM_GUEST_VM] = "generic",
	[VM_GUEST_XENPV] = "XenPV",
	[VM_GUEST_XENPVH] = "XenPVH",
	[VM_GUEST_XENHVM] = "XenHVM",
	[VM_GUEST_XENPVHVM] = "XenPVHVM",
	[VM_GUEST_GENPVH] = "GenPVH",
	[VM_GUEST_HV] = "Hyper-V",
	[VM_GUEST_VMWARE] = "VMware",
	[VM_GUEST_KVM] = "KVM",
	[VM_GUEST_VIRTUALBOX] = "VirtualBox",
	[VM_GUEST_NVMM] = "NVMM",
};

static int
sysctl_machdep_hypervisor(SYSCTLFN_ARGS)
{
	struct sysctlnode node;
	const char *t = NULL;
	char buf[64];

	node = *rnode;
	node.sysctl_data = buf;
	if (vm_guest >= VM_GUEST_NO && vm_guest < VM_LAST)
		t = vm_guest_name[vm_guest];
	if (t == NULL)
		t = "unknown";
	strlcpy(buf, t, sizeof(buf));
	return sysctl_lookup(SYSCTLFN_CALL(&node));
}

static void
const_sysctl(struct sysctllog **clog, const char *name, int type,
    u_quad_t value, int tag)
{
	(sysctl_createv)(clog, 0, NULL, NULL,
	    CTLFLAG_PERMANENT | CTLFLAG_IMMEDIATE,
	    type, name, NULL, NULL, value, NULL, 0,
	    CTL_MACHDEP, tag, CTL_EOL);
}

SYSCTL_SETUP(sysctl_machdep_setup, "sysctl machdep subtree setup")
{
	extern uint64_t tsc_freq;
#ifndef XENPV
	extern int tsc_user_enabled;
#endif
	extern int sparse_dump;

	sysctl_createv(clog, 0, NULL, NULL,
	    CTLFLAG_PERMANENT,
	    CTLTYPE_NODE, "machdep", NULL,
	    NULL, 0, NULL, 0,
	    CTL_MACHDEP, CTL_EOL);

	sysctl_createv(clog, 0, NULL, NULL,
	    CTLFLAG_PERMANENT,
	    CTLTYPE_STRUCT, "console_device", NULL,
	    sysctl_consdev, 0, NULL, sizeof(dev_t),
	    CTL_MACHDEP, CPU_CONSDEV, CTL_EOL);
	sysctl_createv(clog, 0, NULL, NULL,
	    CTLFLAG_PERMANENT,
	    CTLTYPE_STRING, "booted_kernel", NULL,
	    sysctl_machdep_booted_kernel, 0, NULL, 0,
	    CTL_MACHDEP,
	    CPU_BOOTED_KERNEL, CTL_EOL);
	sysctl_createv(clog, 0, NULL, NULL,
	    CTLFLAG_PERMANENT,
	    CTLTYPE_STRING, "bootmethod", NULL,
	    sysctl_machdep_bootmethod, 0, NULL, 0,
	    CTL_MACHDEP, CTL_CREATE, CTL_EOL);
	sysctl_createv(clog, 0, NULL, NULL,
	    CTLFLAG_PERMANENT,
	    CTLTYPE_STRUCT, "diskinfo", NULL,
	    sysctl_machdep_diskinfo, 0, NULL, 0,
	    CTL_MACHDEP, CPU_DISKINFO, CTL_EOL);
	sysctl_createv(clog, 0, NULL, NULL,
	    CTLFLAG_PERMANENT,
	    CTLTYPE_STRING, "cpu_brand", NULL,
	    NULL, 0, cpu_brand_string, 0,
	    CTL_MACHDEP, CTL_CREATE, CTL_EOL);
	sysctl_createv(clog, 0, NULL, NULL,
	    CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
	    CTLTYPE_INT, "sparse_dump", NULL,
	    NULL, 0, &sparse_dump, 0,
	    CTL_MACHDEP, CTL_CREATE, CTL_EOL);
	sysctl_createv(clog, 0, NULL, NULL,
	    CTLFLAG_PERMANENT,
	    CTLTYPE_QUAD, "tsc_freq", NULL,
	    NULL, 0, &tsc_freq, 0,
	    CTL_MACHDEP, CTL_CREATE, CTL_EOL);
	sysctl_createv(clog, 0, NULL, NULL,
	    CTLFLAG_PERMANENT,
	    CTLTYPE_INT, "pae",
	    SYSCTL_DESCR("Whether the kernel uses PAE"),
	    NULL, 0, &use_pae, 0,
	    CTL_MACHDEP, CTL_CREATE, CTL_EOL);
#ifndef XENPV
	sysctl_createv(clog, 0, NULL, NULL,
	    CTLFLAG_READWRITE,
	    CTLTYPE_INT, "tsc_user_enable",
	    SYSCTL_DESCR("RDTSC instruction enabled in usermode"),
	    sysctl_machdep_tsc_enable, 0, &tsc_user_enabled, 0,
	    CTL_MACHDEP, CTL_CREATE, CTL_EOL);
#endif
	sysctl_createv(clog, 0, NULL, NULL,
	    CTLFLAG_PERMANENT,
	    CTLTYPE_STRING, "hypervisor", NULL,
	    sysctl_machdep_hypervisor, 0, NULL, 0,
	    CTL_MACHDEP, CTL_CREATE, CTL_EOL);
#ifdef SVS
	const struct sysctlnode *svs_rnode = NULL;
	sysctl_createv(clog, 0, NULL, &svs_rnode,
	    CTLFLAG_PERMANENT,
	    CTLTYPE_NODE, "svs", NULL,
	    NULL, 0, NULL, 0,
	    CTL_MACHDEP, CTL_CREATE, CTL_EOL);
	sysctl_createv(clog, 0, &svs_rnode, NULL,
	    CTLFLAG_PERMANENT,
	    CTLTYPE_BOOL, "enabled",
	    SYSCTL_DESCR("Whether the kernel uses SVS"),
	    NULL, 0, &svs_enabled, 0,
	    CTL_CREATE, CTL_EOL);
	sysctl_createv(clog, 0, &svs_rnode, NULL,
	    CTLFLAG_PERMANENT,
	    CTLTYPE_BOOL, "pcid",
	    SYSCTL_DESCR("Whether SVS uses PCID"),
	    NULL, 0, &svs_pcid, 0,
	    CTL_CREATE, CTL_EOL);
#endif

	sysctl_createv(clog, 0, NULL, NULL,
	    CTLFLAG_READWRITE,
	    CTLTYPE_BOOL, "user_ldt",
	    SYSCTL_DESCR("Whether USER_LDT is enabled"),
	    NULL, 0, &x86_user_ldt_enabled, 0,
	    CTL_MACHDEP, CTL_CREATE, CTL_EOL);

#ifndef XENPV
	void sysctl_speculation_init(struct sysctllog **);
	sysctl_speculation_init(clog);
#endif

	/* None of these can ever change once the system has booted */
	const_sysctl(clog, "fpu_present", CTLTYPE_INT, i386_fpu_present,
	    CPU_FPU_PRESENT);
	const_sysctl(clog, "osfxsr", CTLTYPE_INT, i386_use_fxsave,
	    CPU_OSFXSR);
	const_sysctl(clog, "sse", CTLTYPE_INT, i386_has_sse,
	    CPU_SSE);
	const_sysctl(clog, "sse2", CTLTYPE_INT, i386_has_sse2,
	    CPU_SSE2);

	const_sysctl(clog, "fpu_save", CTLTYPE_INT, x86_fpu_save,
	    CPU_FPU_SAVE);
	const_sysctl(clog, "fpu_save_size", CTLTYPE_INT, x86_fpu_save_size,
	    CPU_FPU_SAVE_SIZE);
	const_sysctl(clog, "xsave_features", CTLTYPE_QUAD, x86_xsave_features,
	    CPU_XSAVE_FEATURES);

#ifndef XENPV
	const_sysctl(clog, "biosbasemem", CTLTYPE_INT, biosbasemem,
	    CPU_BIOSBASEMEM);
	const_sysctl(clog, "biosextmem", CTLTYPE_INT, biosextmem,
	    CPU_BIOSEXTMEM);
#endif
}

/* Here for want of a better place */
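/*
 * intr_findpic: return the PIC that owns the given global interrupt number:
 * an I/O APIC if one claims it, otherwise the legacy i8259 for ISA IRQs.
 */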
#if defined(DOM0OPS) || !defined(XENPV)
struct pic *
intr_findpic(int num)
{
#if NIOAPIC > 0
	struct ioapic_softc *pic;

	pic = ioapic_find_bybase(num);
	if (pic != NULL)
		return &pic->sc_pic;
#endif
	if (num < NUM_LEGACY_IRQS)
		return &i8259_pic;

	return NULL;
}
#endif

void
cpu_initclocks(void)
{

	/*
	 * Re-calibrate TSC on boot CPU using most accurate time source,
	 * thus making accurate TSC available for x86_initclock_func().
	 */
	cpu_get_tsc_freq(curcpu());

	/* Now start the clocks on this CPU (the boot CPU). */
	(*x86_initclock_func)();
}

/*
 * x86_cpu_is_lcall: check whether the instruction at ip is an
 * "lcall $7,$0" (NetBSD) or "lcall $0x87,$0" (BSD/OS) system call gate.
 */
int
x86_cpu_is_lcall(const void *ip)
{
	static const uint8_t lcall[] = { 0x9a, 0, 0, 0, 0 };
	int error;
	const size_t sz = sizeof(lcall) + 2;
	uint8_t tmp[sizeof(lcall) + 2];

	if ((error = copyin(ip, tmp, sz)) != 0)
		return error;

	if (memcmp(tmp, lcall, sizeof(lcall)) != 0 || tmp[sz - 1] != 0)
		return EINVAL;

	switch (tmp[sz - 2]) {
	case (uint8_t)0x07: /* NetBSD */
	case (uint8_t)0x87: /* BSD/OS */
		return 0;
	default:
		return EINVAL;
	}
}