/* $NetBSD: svs.c,v 1.42 2022/09/24 11:05:18 riastradh Exp $ */

/*
 * Copyright (c) 2018-2020 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Maxime Villard.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: svs.c,v 1.42 2022/09/24 11:05:18 riastradh Exp $");

#include "opt_svs.h"
#include "opt_user_ldt.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/cpu.h>
#include <sys/kauth.h>
#include <sys/sysctl.h>
#include <sys/xcall.h>
#include <sys/reboot.h>

#include <x86/cputypes.h>

#include <machine/cpuvar.h>
#include <machine/frameasm.h>
#include <machine/gdt.h>
#include <machine/pmap_private.h>

#include <uvm/uvm.h>
#include <uvm/uvm_page.h>

/*
 * Separate Virtual Space
 *
 * A per-cpu L4 page is maintained in ci_svs_updirpa. During each context
 * switch to a user pmap, the lower half of updirpa is populated with the
 * entries containing the userland pages.
 *
 * ~~~~~~~~~~ The UTLS Page ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 *
 * We use a special per-cpu page that we call UTLS, for User Thread Local
 * Storage. Each CPU has one UTLS page. This page has two VAs:
 *
 *  o When the user page tables are loaded in CR3, the VA to access this
 *    page is &pcpuarea->utls, defined as SVS_UTLS in assembly. This VA is
 *    _constant_ across CPUs, but in the user page tables this VA points to
 *    the physical page of the UTLS that is _local_ to the CPU.
 *
 *  o When the kernel page tables are loaded in CR3, the VA to access this
 *    page is ci->ci_svs_utls.
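 *
 * As a purely illustrative sketch (not part of the build), accessing the
 * page through its two aliases looks roughly as follows; the first form is
 * what the assembly entry code does via SVS_UTLS, the second is the
 * kernel-mode alias installed by svs_utls_init() below:
 *
 *        struct svs_utls *utls;
 *
 *        utls = (struct svs_utls *)&pcpuarea->utls;    (user page tables in CR3)
 *        utls = (struct svs_utls *)ci->ci_svs_utls;    (kernel page tables in CR3)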
 *
 * +------------------------------------------------------------------------+
 * | CPU0 Local Data (Physical Page)                                         |
 * |    +------------------+                                +-------------+  |
 * |    | User Page Tables | SVS_UTLS --------------------> | cpu0's UTLS |  |
 * |    +------------------+                                +-------------+  |
 * +----------------------------------------------------------------^-------+
 *                                                                   |
 *                                                                   +----------+
 *                                                                              |
 * +------------------------------------------------------------------------+  |
 * | CPU1 Local Data (Physical Page)                                         |  |
 * |    +------------------+                                +-------------+  |  |
 * |    | User Page Tables | SVS_UTLS --------------------> | cpu1's UTLS |  |  |
 * |    +------------------+                                +-------------+  |  |
 * +----------------------------------------------------------------^-------+  |
 *                                                                   |          |
 * +------------------+                     /-----------------------+          |
 * | Kern Page Tables |  ci->ci_svs_utls                                       |
 * +------------------+                     \----------------------------------+
 *
 * The goal of the UTLS page is to provide an area where we can store
 * whatever we want, in a way that is accessible both when the kernel and
 * when the user page tables are loaded in CR3.
 *
 * We store three 64-bit values in the UTLS page:
 *
 *  o UTLS_KPDIRPA: the value we must put in CR3 in order to load the kernel
 *    page tables.
 *
 *  o UTLS_SCRATCH: a dummy place where we temporarily store a value during
 *    the syscall entry procedure.
 *
 *  o UTLS_RSP0: the value we must put in RSP in order to have a stack where
 *    we can push the register states. This is used only during the syscall
 *    entry procedure, because there the CPU does not automatically switch
 *    RSP (it does not use the TSS.rsp0 mechanism described below).
 *
 * ~~~~~~~~~~ The Stack Switching Mechanism Without SVS ~~~~~~~~~~~~~~~~~~~~~
 *
 * The kernel stack is per-lwp (pcb_rsp0). When doing a context switch between
 * two user LWPs, the kernel updates TSS.rsp0 (which is per-cpu) to point to
 * the stack of the new LWP. Then the execution continues. At some point, the
 * user LWP we context-switched to will perform a syscall or will receive an
 * interrupt. There, the CPU will automatically read TSS.rsp0 and use it as a
 * stack. The kernel then pushes the register states on this stack, and
 * executes in kernel mode normally.
 *
 * TSS.rsp0 is used by the CPU only during ring3->ring0 transitions. Therefore,
 * when an interrupt is received while we were in kernel mode, the CPU does not
 * read TSS.rsp0. Instead, it just uses the current stack.
 *
 * ~~~~~~~~~~ The Stack Switching Mechanism With SVS ~~~~~~~~~~~~~~~~~~~~~~~~
 *
 * In the pcpu_area structure, pointed to by the "pcpuarea" variable, each CPU
 * has a two-page rsp0 entry (pcpuarea->ent[cid].rsp0). These two pages do
 * _not_ have associated physical addresses. They are only two VAs.
 *
 * The first page is unmapped and acts as a redzone. The second page is
 * dynamically kentered into the highest page of the real per-lwp kernel
 * stack; but pay close attention, it is kentered _only_ in the user page
 * tables. That is to say, the VA of this second page is mapped when the user
 * page tables are loaded, but not mapped when the kernel page tables are
 * loaded.
 *
 * During a context switch, svs_lwp_switch() gets called first. This function
 * does the kenter job described above, not in the kernel page tables (that
 * are currently loaded), but in the user page tables (that are not loaded).
 *
 *        VIRTUAL ADDRESSES                        PHYSICAL ADDRESSES
 *
 * +-----------------------------+
 * | KERNEL PAGE TABLES          |
 * |    +-------------------+    |                +-------------------+
 * |    | pcb_rsp0 (page 0) | ------------------> | pcb_rsp0 (page 0) |
 * |    +-------------------+    |                +-------------------+
 * |    | pcb_rsp0 (page 1) | ------------------> | pcb_rsp0 (page 1) |
 * |    +-------------------+    |                +-------------------+
 * |    | pcb_rsp0 (page 2) | ------------------> | pcb_rsp0 (page 2) |
 * |    +-------------------+    |                +-------------------+
 * |    | pcb_rsp0 (page 3) | ------------------> | pcb_rsp0 (page 3) |
 * |    +-------------------+    |            +-> +-------------------+
 * +-----------------------------+            |
 *                                            |
 * +--------------------------------------+   |
 * | USER PAGE TABLES                     |   |
 * | +----------------------------------+ |   |
 * | | pcpuarea->ent[cid].rsp0 (page 0) | |   |
 * | +----------------------------------+ |   |
 * | | pcpuarea->ent[cid].rsp0 (page 1) | ----+
 * | +----------------------------------+ |
 * +--------------------------------------+
 *
 * After svs_lwp_switch() gets called, we set pcpuarea->ent[cid].rsp0 (page 1)
 * in TSS.rsp0. Later, when returning to userland on the lwp we context-
 * switched to, we will load the user page tables and execute in userland
 * normally.
 *
 * Next time an interrupt or syscall is received, the CPU will automatically
 * use TSS.rsp0 as a stack. Here it is executing with the user page tables
 * loaded, and therefore TSS.rsp0 is _mapped_.
 *
 * As part of the kernel entry procedure, we now switch CR3 to load the kernel
 * page tables. Here, we are still using the stack pointer we set in TSS.rsp0.
 *
 * Remember that it was only one page of stack which was mapped only in the
 * user page tables. We just switched to the kernel page tables, so we must
 * update RSP to be the real per-lwp kernel stack (pcb_rsp0). And we do so,
 * without touching the stack (since it is now unmapped, touching it would
 * fault).
 *
 * After we updated RSP, we can continue execution exactly as in the non-SVS
 * case. We don't need to copy the values the CPU pushed on TSS.rsp0: even if
 * we updated RSP to a totally different VA, this VA points to the same
 * physical page as TSS.rsp0. So in the end, the values the CPU pushed are
 * still here even with the new RSP.
 *
 * Thanks to this double-kenter optimization, we don't need to copy the
 * trapframe during each user<->kernel transition.
 *
 * ~~~~~~~~~~ Notes On Locking And Synchronization ~~~~~~~~~~~~~~~~~~~~~~~~~~
 *
 *  o Touching ci_svs_updir without holding ci_svs_mtx first is *not*
 *    allowed.
 *
 *  o pm_kernel_cpus contains the set of CPUs that have the pmap loaded
 *    in their CR3 register. It must *not* be replaced by pm_cpus.
 *
 *  o When a context switch on the current CPU is made from a user LWP
 *    towards a kernel LWP, CR3 is not updated. Therefore, the pmap's
 *    pm_kernel_cpus still contains the current CPU. It implies that the
 *    remote CPUs that execute other threads of the user process we just
 *    left will keep synchronizing us against their changes.
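 *
 * As a minimal illustrative sketch of the first rule above (the real code
 * lives in svs_pmap_sync() and svs_pdir_switch() below), every update of a
 * CPU's shadow L4 page (ci_svs_updir) takes the form:
 *
 *        mutex_enter(&ci->ci_svs_mtx);
 *        ci->ci_svs_updir[index] = pmap->pm_pdir[index];
 *        mutex_exit(&ci->ci_svs_mtx);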
 *
 * ~~~~~~~~~~ List Of Areas That Are Removed From Userland ~~~~~~~~~~~~~~~~~~
 *
 *  o PTE Space
 *  o Direct Map
 *  o Remote PCPU Areas
 *  o Kernel Heap
 *  o Kernel Image
 *
 * ~~~~~~~~~~ Todo List ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 *
 * Ordered from highest priority to lowest:
 *
 *  o The NMI stack is not double-entered. Therefore if we ever receive an NMI
 *    and leave it, the content of the stack will be visible to userland (via
 *    Meltdown). Normally we never leave NMIs, unless a privileged user
 *    launched PMCs. That's unlikely to happen: our PMC support is pretty
 *    minimal, and privileged-only.
 *
 *  o Narrow down the entry points: hide the 'jmp handler' instructions. This
 *    makes sense on GENERIC_KASLR kernels.
 */

/* -------------------------------------------------------------------------- */

/* SVS_ENTER. */
extern uint8_t svs_enter, svs_enter_end;
static const struct x86_hotpatch_source hp_svs_enter_source = {
        .saddr = &svs_enter,
        .eaddr = &svs_enter_end
};
static const struct x86_hotpatch_descriptor hp_svs_enter_desc = {
        .name = HP_NAME_SVS_ENTER,
        .nsrc = 1,
        .srcs = { &hp_svs_enter_source }
};
__link_set_add_rodata(x86_hotpatch_descriptors, hp_svs_enter_desc);

/* SVS_ENTER_ALT. */
extern uint8_t svs_enter_altstack, svs_enter_altstack_end;
static const struct x86_hotpatch_source hp_svs_enter_altstack_source = {
        .saddr = &svs_enter_altstack,
        .eaddr = &svs_enter_altstack_end
};
static const struct x86_hotpatch_descriptor hp_svs_enter_altstack_desc = {
        .name = HP_NAME_SVS_ENTER_ALT,
        .nsrc = 1,
        .srcs = { &hp_svs_enter_altstack_source }
};
__link_set_add_rodata(x86_hotpatch_descriptors, hp_svs_enter_altstack_desc);

/* SVS_ENTER_NMI. */
extern uint8_t svs_enter_nmi, svs_enter_nmi_end;
static const struct x86_hotpatch_source hp_svs_enter_nmi_source = {
        .saddr = &svs_enter_nmi,
        .eaddr = &svs_enter_nmi_end
};
static const struct x86_hotpatch_descriptor hp_svs_enter_nmi_desc = {
        .name = HP_NAME_SVS_ENTER_NMI,
        .nsrc = 1,
        .srcs = { &hp_svs_enter_nmi_source }
};
__link_set_add_rodata(x86_hotpatch_descriptors, hp_svs_enter_nmi_desc);

/* SVS_LEAVE. */
extern uint8_t svs_leave, svs_leave_end;
static const struct x86_hotpatch_source hp_svs_leave_source = {
        .saddr = &svs_leave,
        .eaddr = &svs_leave_end
};
static const struct x86_hotpatch_descriptor hp_svs_leave_desc = {
        .name = HP_NAME_SVS_LEAVE,
        .nsrc = 1,
        .srcs = { &hp_svs_leave_source }
};
__link_set_add_rodata(x86_hotpatch_descriptors, hp_svs_leave_desc);

/* SVS_LEAVE_ALT. */
extern uint8_t svs_leave_altstack, svs_leave_altstack_end;
static const struct x86_hotpatch_source hp_svs_leave_altstack_source = {
        .saddr = &svs_leave_altstack,
        .eaddr = &svs_leave_altstack_end
};
static const struct x86_hotpatch_descriptor hp_svs_leave_altstack_desc = {
        .name = HP_NAME_SVS_LEAVE_ALT,
        .nsrc = 1,
        .srcs = { &hp_svs_leave_altstack_source }
};
__link_set_add_rodata(x86_hotpatch_descriptors, hp_svs_leave_altstack_desc);

/* SVS_LEAVE_NMI. */
extern uint8_t svs_leave_nmi, svs_leave_nmi_end;
static const struct x86_hotpatch_source hp_svs_leave_nmi_source = {
        .saddr = &svs_leave_nmi,
        .eaddr = &svs_leave_nmi_end
};
static const struct x86_hotpatch_descriptor hp_svs_leave_nmi_desc = {
        .name = HP_NAME_SVS_LEAVE_NMI,
        .nsrc = 1,
        .srcs = { &hp_svs_leave_nmi_source }
};
__link_set_add_rodata(x86_hotpatch_descriptors, hp_svs_leave_nmi_desc);

/* -------------------------------------------------------------------------- */

bool svs_enabled __read_mostly = false;
bool svs_pcid __read_mostly = false;

static uint64_t svs_pcid_kcr3 __read_mostly;
static uint64_t svs_pcid_ucr3 __read_mostly;

struct svs_utls {
        paddr_t kpdirpa;
        uint64_t scratch;
        vaddr_t rsp0;
};

static pd_entry_t *
svs_tree_add(struct cpu_info *ci, vaddr_t va)
{
        extern const vaddr_t ptp_masks[];
        extern const int ptp_shifts[];
        pd_entry_t *dstpde;
        struct vm_page *pg;
        size_t i, pidx;
        paddr_t pa;

        dstpde = ci->ci_svs_updir;

        for (i = PTP_LEVELS; i > 1; i--) {
                pidx = pl_pi(va, i);

                if (!pmap_valid_entry(dstpde[pidx])) {
                        pg = uvm_pagealloc(NULL, 0, NULL, UVM_PGA_ZERO);
                        if (pg == 0)
                                panic("%s: failed to allocate PA for CPU %d\n",
                                    __func__, cpu_index(ci));
                        pa = VM_PAGE_TO_PHYS(pg);

                        dstpde[pidx] = PTE_P | PTE_W | pa;
                }

                pa = (paddr_t)(dstpde[pidx] & PTE_FRAME);
                dstpde = (pd_entry_t *)PMAP_DIRECT_MAP(pa);
        }

        return dstpde;
}

static void
svs_page_add(struct cpu_info *ci, vaddr_t va, bool global)
{
        pd_entry_t *srcpde, *dstpde, pde;
        size_t idx, pidx;
        paddr_t pa;

        /* Create levels L4, L3 and L2. */
        dstpde = svs_tree_add(ci, va);

        pidx = pl1_pi(va);

        /*
         * If 'va' is in a large page, we need to compute its physical
         * address manually.
         */
        idx = pl2_i(va);
        srcpde = L2_BASE;
        if (!pmap_valid_entry(srcpde[idx])) {
                panic("%s: L2 page not mapped", __func__);
        }
        if (srcpde[idx] & PTE_PS) {
                KASSERT(!global);
                pa = srcpde[idx] & PTE_2MFRAME;
                pa += (paddr_t)(va % NBPD_L2);
                pde = (srcpde[idx] & ~(PTE_PS|PTE_2MFRAME)) | pa;

                if (pmap_valid_entry(dstpde[pidx])) {
                        panic("%s: L1 page already mapped", __func__);
                }
                dstpde[pidx] = pde;
                return;
        }

        /*
         * Normal page, just copy the PDE.
         */
        idx = pl1_i(va);
        srcpde = L1_BASE;
        if (!pmap_valid_entry(srcpde[idx])) {
                panic("%s: L1 page not mapped", __func__);
        }
        if (pmap_valid_entry(dstpde[pidx])) {
                panic("%s: L1 page already mapped", __func__);
        }
        dstpde[pidx] = srcpde[idx];

        /*
         * If we want a global translation, mark both the src and dst with
         * PTE_G.
         */
        if (global) {
                srcpde[idx] |= PTE_G;
                dstpde[pidx] |= PTE_G;
                tlbflushg();
        }
}

static void
svs_rsp0_init(struct cpu_info *ci)
{
        const cpuid_t cid = cpu_index(ci);
        vaddr_t va, rsp0;
        pd_entry_t *pd;
        size_t pidx;

        rsp0 = (vaddr_t)&pcpuarea->ent[cid].rsp0;

        /* The first page is a redzone. */
        va = rsp0 + PAGE_SIZE;

        /* Create levels L4, L3 and L2. */
        pd = svs_tree_add(ci, va);

        /* Get the info for L1. */
        pidx = pl1_i(va % NBPD_L2);
        if (pmap_valid_entry(pd[pidx])) {
                panic("%s: rsp0 page already mapped", __func__);
        }

        ci->ci_svs_rsp0_pte = (pt_entry_t *)&pd[pidx];
        ci->ci_svs_rsp0 = rsp0 + PAGE_SIZE + sizeof(struct trapframe);
        ci->ci_svs_ursp0 = ci->ci_svs_rsp0 - sizeof(struct trapframe);
        ci->ci_svs_krsp0 = 0;
}

static void
svs_utls_init(struct cpu_info *ci)
{
        const vaddr_t utlsva = (vaddr_t)&pcpuarea->utls;
        struct svs_utls *utls;
        struct vm_page *pg;
        pd_entry_t *pd;
        size_t pidx;
        paddr_t pa;
        vaddr_t va;

        /* Create levels L4, L3 and L2 of the UTLS page. */
        pd = svs_tree_add(ci, utlsva);

        /* Allocate L1. */
        pg = uvm_pagealloc(NULL, 0, NULL, UVM_PGA_ZERO);
        if (pg == 0)
                panic("%s: failed to allocate PA for CPU %d\n", __func__,
                    cpu_index(ci));
        pa = VM_PAGE_TO_PHYS(pg);

        /* Enter L1. */
        if (pmap_valid_entry(L1_BASE[pl1_i(utlsva)])) {
                panic("%s: local page already mapped", __func__);
        }
        pidx = pl1_pi(utlsva);
        if (pmap_valid_entry(pd[pidx])) {
                panic("%s: L1 page already mapped", __func__);
        }
        pd[pidx] = PTE_P | PTE_W | pmap_pg_nx | pa;

        /*
         * Now, allocate a VA in the kernel map, that points to the UTLS
         * page. After that, the UTLS page will be accessible in kernel
         * mode via ci_svs_utls.
         */
        va = uvm_km_alloc(kernel_map, PAGE_SIZE, 0,
            UVM_KMF_VAONLY|UVM_KMF_NOWAIT);
        if (va == 0) {
                panic("%s: unable to allocate VA\n", __func__);
        }
        pmap_kenter_pa(va, pa, VM_PROT_READ|VM_PROT_WRITE, 0);
        pmap_update(pmap_kernel());

        ci->ci_svs_utls = va;

        /* Initialize the constant fields of the UTLS page */
        utls = (struct svs_utls *)ci->ci_svs_utls;
        utls->rsp0 = ci->ci_svs_rsp0;
}

static void
svs_pcid_init(struct cpu_info *ci)
{
        if (!svs_pcid) {
                return;
        }

        svs_pcid_ucr3 = __SHIFTIN(PMAP_PCID_USER, CR3_PCID) | CR3_NO_TLB_FLUSH;
        svs_pcid_kcr3 = __SHIFTIN(PMAP_PCID_KERN, CR3_PCID) | CR3_NO_TLB_FLUSH;

        ci->ci_svs_updirpa |= svs_pcid_ucr3;
}

static void
svs_range_add(struct cpu_info *ci, vaddr_t va, size_t size, bool global)
{
        size_t i, n;

        KASSERT(size % PAGE_SIZE == 0);
        n = size / PAGE_SIZE;
        for (i = 0; i < n; i++) {
                svs_page_add(ci, va + i * PAGE_SIZE, global);
        }
}

void
cpu_svs_init(struct cpu_info *ci)
{
        extern char __text_user_start;
        extern char __text_user_end;
        extern vaddr_t idt_vaddr;
        const cpuid_t cid = cpu_index(ci);
        struct vm_page *pg;

        KASSERT(ci != NULL);

        pg = uvm_pagealloc(NULL, 0, NULL, UVM_PGA_ZERO);
        if (pg == 0)
                panic("%s: failed to allocate L4 PA for CPU %d\n",
                    __func__, cpu_index(ci));
        ci->ci_svs_updirpa = VM_PAGE_TO_PHYS(pg);

        ci->ci_svs_updir = (pt_entry_t *)uvm_km_alloc(kernel_map, PAGE_SIZE, 0,
            UVM_KMF_VAONLY | UVM_KMF_NOWAIT);
        if (ci->ci_svs_updir == NULL)
                panic("%s: failed to allocate L4 VA for CPU %d\n",
                    __func__, cpu_index(ci));

        pmap_kenter_pa((vaddr_t)ci->ci_svs_updir, ci->ci_svs_updirpa,
            VM_PROT_READ | VM_PROT_WRITE, 0);

        pmap_update(pmap_kernel());

        mutex_init(&ci->ci_svs_mtx, MUTEX_DEFAULT, IPL_VM);

        if (cid == cpu_index(&cpu_info_primary) || !idt_vec_is_pcpu())
                svs_page_add(ci, idt_vaddr, true);
        svs_page_add(ci, (vaddr_t)&pcpuarea->ldt, true);
        svs_range_add(ci, (vaddr_t)&pcpuarea->ent[cid],
            offsetof(struct pcpu_entry, rsp0), true);
        svs_range_add(ci, (vaddr_t)&__text_user_start,
            (vaddr_t)&__text_user_end - (vaddr_t)&__text_user_start, false);

        svs_rsp0_init(ci);
        svs_utls_init(ci);
        svs_pcid_init(ci);

#ifdef USER_LDT
        mutex_enter(&cpu_lock);
        ci->ci_svs_ldt_sel = ldt_alloc(&pcpuarea->ent[cid].ldt,
            MAX_USERLDT_SIZE);
        mutex_exit(&cpu_lock);
#endif
}

void
svs_pmap_sync(struct pmap *pmap, int index)
{
        CPU_INFO_ITERATOR cii;
        struct cpu_info *ci;
        cpuid_t cid;

        KASSERT(pmap != NULL);
        KASSERT(pmap != pmap_kernel());
        KASSERT(pmap_is_user(pmap));
        KASSERT(mutex_owned(&pmap->pm_lock));
        KASSERT(kpreempt_disabled());
        KASSERT(index < PDIR_SLOT_USERLIM);

        ci = curcpu();
        cid = cpu_index(ci);

        mutex_enter(&ci->ci_svs_mtx);
        KASSERT(kcpuset_isset(pmap->pm_kernel_cpus, cid));
        ci->ci_svs_updir[index] = pmap->pm_pdir[index];
        mutex_exit(&ci->ci_svs_mtx);

        if (!kcpuset_isotherset(pmap->pm_kernel_cpus, cid)) {
                return;
        }

        for (CPU_INFO_FOREACH(cii, ci)) {
                cid = cpu_index(ci);

                if (!kcpuset_isset(pmap->pm_kernel_cpus, cid)) {
                        continue;
                }

                /* take the lock and check again */
                mutex_enter(&ci->ci_svs_mtx);
                if (kcpuset_isset(pmap->pm_kernel_cpus, cid)) {
                        ci->ci_svs_updir[index] = pmap->pm_pdir[index];
                }
                mutex_exit(&ci->ci_svs_mtx);
        }
}

void
svs_ldt_sync(struct pmap *pmap)
{
        struct cpu_info *ci = curcpu();
        void *ldt;
        int sel;

        KASSERT(kpreempt_disabled());

        /*
         * Another LWP could concurrently modify the LDT via x86_set_ldt1().
         * The LWP will wait for pmap_ldt_sync() to finish before destroying
         * the outdated LDT.
         *
         * We have preemption disabled here, so it is guaranteed that even
         * if the LDT we are syncing is the outdated one, it is still valid.
         *
         * pmap_ldt_sync() will execute later once we have preemption enabled,
         * and will install the new LDT.
         */
        sel = atomic_load_relaxed(&pmap->pm_ldt_sel);
        if (__predict_false(sel != GSYSSEL(GLDT_SEL, SEL_KPL))) {
                ldt = atomic_load_relaxed(&pmap->pm_ldt);
                memcpy(&pcpuarea->ent[cpu_index(ci)].ldt, ldt,
                    MAX_USERLDT_SIZE);
                sel = ci->ci_svs_ldt_sel;
        }

        lldt(sel);
}

void
svs_lwp_switch(struct lwp *oldlwp, struct lwp *newlwp)
{
        struct cpu_info *ci = curcpu();
        struct svs_utls *utls;
        struct pcb *pcb;
        pt_entry_t *pte;
        uintptr_t rsp0;
        vaddr_t va;

        if (newlwp->l_flag & LW_SYSTEM) {
                return;
        }

#ifdef DIAGNOSTIC
        if (!(oldlwp->l_flag & LW_SYSTEM)) {
                pcb = lwp_getpcb(oldlwp);
                rsp0 = pcb->pcb_rsp0;
                va = rounddown(rsp0, PAGE_SIZE);
                KASSERT(ci->ci_svs_krsp0 == rsp0 - sizeof(struct trapframe));
                pte = ci->ci_svs_rsp0_pte;
                KASSERT(*pte == L1_BASE[pl1_i(va)]);
        }
#endif

        pcb = lwp_getpcb(newlwp);
        rsp0 = pcb->pcb_rsp0;
        va = rounddown(rsp0, PAGE_SIZE);

        /* Update the kernel rsp0 in cpu_info */
        ci->ci_svs_krsp0 = rsp0 - sizeof(struct trapframe);
        KASSERT((ci->ci_svs_krsp0 % PAGE_SIZE) ==
            (ci->ci_svs_ursp0 % PAGE_SIZE));

        utls = (struct svs_utls *)ci->ci_svs_utls;
        utls->scratch = 0;

        /*
         * Enter the user rsp0. If we're using PCID we must flush the user
         * VA; if we aren't, it will be flushed during the next CR3 reload.
         */
        pte = ci->ci_svs_rsp0_pte;
        *pte = L1_BASE[pl1_i(va)];
        if (svs_pcid) {
                invpcid(INVPCID_ADDRESS, PMAP_PCID_USER, ci->ci_svs_rsp0);
        }
}

/*
 * We may come here with the pmap unlocked. If a remote CPU is updating its
 * entries at the same time, it's not a problem: the remote CPU will call
 * svs_pmap_sync afterwards, and our updirpa will be synchronized properly.
 */
void
svs_pdir_switch(struct pmap *pmap)
{
        struct cpu_info *ci = curcpu();
        struct svs_utls *utls;

        KASSERT(kpreempt_disabled());
        KASSERT(pmap != pmap_kernel());
        KASSERT(pmap_is_user(pmap));

        /* Update the info in the UTLS page */
        utls = (struct svs_utls *)ci->ci_svs_utls;
        utls->kpdirpa = pmap_pdirpa(pmap, 0) | svs_pcid_kcr3;

        /* Copy user slots. */
        mutex_enter(&ci->ci_svs_mtx);
        svs_quad_copy(ci->ci_svs_updir, pmap->pm_pdir, PDIR_SLOT_USERLIM);
        mutex_exit(&ci->ci_svs_mtx);

        if (svs_pcid) {
                invpcid(INVPCID_CONTEXT, PMAP_PCID_USER, 0);
        }
}

static void
svs_enable(void)
{
        svs_enabled = true;

        x86_hotpatch(HP_NAME_SVS_ENTER, 0);
        x86_hotpatch(HP_NAME_SVS_ENTER_ALT, 0);
        x86_hotpatch(HP_NAME_SVS_ENTER_NMI, 0);

        x86_hotpatch(HP_NAME_SVS_LEAVE, 0);
        x86_hotpatch(HP_NAME_SVS_LEAVE_ALT, 0);
        x86_hotpatch(HP_NAME_SVS_LEAVE_NMI, 0);
}

void
svs_init(void)
{
        uint64_t msr;

        if (cpu_vendor != CPUVENDOR_INTEL) {
                return;
        }
        if (boothowto & RB_MD3) {
                return;
        }
        if (cpu_info_primary.ci_feat_val[7] & CPUID_SEF_ARCH_CAP) {
                msr = rdmsr(MSR_IA32_ARCH_CAPABILITIES);
                if (msr & IA32_ARCH_RDCL_NO) {
                        /*
                         * The processor indicates it is not vulnerable to the
                         * Rogue Data Cache Load (Meltdown) flaw.
                         */
                        return;
                }
        }

        if ((cpu_info_primary.ci_feat_val[1] & CPUID2_PCID) &&
            (cpu_info_primary.ci_feat_val[5] & CPUID_SEF_INVPCID)) {
                svs_pcid = true;
                lcr4(rcr4() | CR4_PCIDE);
        }

        svs_enable();
}