      1 /*	$NetBSD: svs.c,v 1.42 2022/09/24 11:05:18 riastradh Exp $	*/
      2 
      3 /*
      4  * Copyright (c) 2018-2020 The NetBSD Foundation, Inc.
      5  * All rights reserved.
      6  *
      7  * This code is derived from software contributed to The NetBSD Foundation
      8  * by Maxime Villard.
      9  *
     10  * Redistribution and use in source and binary forms, with or without
     11  * modification, are permitted provided that the following conditions
     12  * are met:
     13  * 1. Redistributions of source code must retain the above copyright
     14  *    notice, this list of conditions and the following disclaimer.
     15  * 2. Redistributions in binary form must reproduce the above copyright
     16  *    notice, this list of conditions and the following disclaimer in the
     17  *    documentation and/or other materials provided with the distribution.
     18  *
     19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
     20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
     21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
     23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
     24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
     25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
     26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
     27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
     28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
     29  * POSSIBILITY OF SUCH DAMAGE.
     30  */
     31 
     32 #include <sys/cdefs.h>
     33 __KERNEL_RCSID(0, "$NetBSD: svs.c,v 1.42 2022/09/24 11:05:18 riastradh Exp $");
     34 
     35 #include "opt_svs.h"
     36 #include "opt_user_ldt.h"
     37 
     38 #include <sys/param.h>
     39 #include <sys/systm.h>
     40 #include <sys/proc.h>
     41 #include <sys/cpu.h>
     42 #include <sys/kauth.h>
     43 #include <sys/sysctl.h>
     44 #include <sys/xcall.h>
     45 #include <sys/reboot.h>
     46 
     47 #include <x86/cputypes.h>
     48 
     49 #include <machine/cpuvar.h>
     50 #include <machine/frameasm.h>
     51 #include <machine/gdt.h>
     52 #include <machine/pmap_private.h>
     53 
     54 #include <uvm/uvm.h>
     55 #include <uvm/uvm_page.h>
     56 
     57 /*
     58  * Separate Virtual Space
     59  *
      60  * A per-cpu L4 page is maintained, with its PA in ci_svs_updirpa. During
      61  * each context switch to a user pmap, the lower half of this page is
      62  * populated with the entries mapping the userland pages.
     63  *
     64  * ~~~~~~~~~~ The UTLS Page ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
     65  *
     66  * We use a special per-cpu page that we call UTLS, for User Thread Local
     67  * Storage. Each CPU has one UTLS page. This page has two VAs:
     68  *
     69  *  o When the user page tables are loaded in CR3, the VA to access this
     70  *    page is &pcpuarea->utls, defined as SVS_UTLS in assembly. This VA is
     71  *    _constant_ across CPUs, but in the user page tables this VA points to
     72  *    the physical page of the UTLS that is _local_ to the CPU.
     73  *
     74  *  o When the kernel page tables are loaded in CR3, the VA to access this
     75  *    page is ci->ci_svs_utls.
     76  *
     77  * +----------------------------------------------------------------------+
     78  * | CPU0 Local Data                                      (Physical Page) |
     79  * | +------------------+                                 +-------------+ |
     80  * | | User Page Tables | SVS_UTLS ---------------------> | cpu0's UTLS | |
     81  * | +------------------+                                 +-------------+ |
     82  * +-------------------------------------------------------------^--------+
     83  *                                                               |
     84  *                                                               +----------+
     85  *                                                                          |
     86  * +----------------------------------------------------------------------+ |
     87  * | CPU1 Local Data                                      (Physical Page) | |
     88  * | +------------------+                                 +-------------+ | |
     89  * | | User Page Tables | SVS_UTLS ---------------------> | cpu1's UTLS | | |
     90  * | +------------------+                                 +-------------+ | |
     91  * +-------------------------------------------------------------^--------+ |
     92  *                                                               |          |
     93  *   +------------------+                 /----------------------+          |
     94  *   | Kern Page Tables | ci->ci_svs_utls                                   |
     95  *   +------------------+                 \---------------------------------+
     96  *
     97  * The goal of the UTLS page is to provide an area where we can store whatever
     98  * we want, in a way that it is accessible both when the Kernel and when the
     99  * User page tables are loaded in CR3.
    100  *
     101  * We store three 64-bit values in the UTLS page (sketch below):
    102  *
    103  *  o UTLS_KPDIRPA: the value we must put in CR3 in order to load the kernel
    104  *    page tables.
    105  *
    106  *  o UTLS_SCRATCH: a dummy place where we temporarily store a value during
    107  *    the syscall entry procedure.
    108  *
    109  *  o UTLS_RSP0: the value we must put in RSP in order to have a stack where
    110  *    we can push the register states. This is used only during the syscall
    111  *    entry procedure, because there the CPU does not automatically switch
    112  *    RSP (it does not use the TSS.rsp0 mechanism described below).
    113  *
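          * For illustration, a syscall entry can use these three fields roughly
          * as follows. This is only a sketch of the idea; the exact registers
          * and instruction ordering in the real assembly entry code differ:
          *
          *	movq	%rax,SVS_UTLS+UTLS_SCRATCH	# no stack yet: stash a reg
          *	movq	SVS_UTLS+UTLS_RSP0,%rax		# stack mapped in the user
          *	xchgq	%rax,%rsp			#   page tables, switch to it
          *	...					# build the trapframe, recover
          *						#   the stashed register
          *	movq	SVS_UTLS+UTLS_KPDIRPA,%rax	# then load the kernel
          *	movq	%rax,%cr3			#   page tables
          *
          * (%rsp is then re-pointed at the kernel VA of the same stack page,
          * see the sections below.)
          *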
    114  * ~~~~~~~~~~ The Stack Switching Mechanism Without SVS ~~~~~~~~~~~~~~~~~~~~~~
    115  *
    116  * The kernel stack is per-lwp (pcb_rsp0). When doing a context switch between
    117  * two user LWPs, the kernel updates TSS.rsp0 (which is per-cpu) to point to
    118  * the stack of the new LWP. Then the execution continues. At some point, the
    119  * user LWP we context-switched to will perform a syscall or will receive an
    120  * interrupt. There, the CPU will automatically read TSS.rsp0 and use it as a
    121  * stack. The kernel then pushes the register states on this stack, and
    122  * executes in kernel mode normally.
    123  *
    124  * TSS.rsp0 is used by the CPU only during ring3->ring0 transitions. Therefore,
    125  * when an interrupt is received while we were in kernel mode, the CPU does not
    126  * read TSS.rsp0. Instead, it just uses the current stack.
    127  *
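          * Schematically (illustrative, not the literal code):
          *
          *	context switch:        TSS.rsp0 = pcb_rsp0 of the new LWP
          *	interrupt from ring3:  the CPU loads %rsp from TSS.rsp0, then
          *	                       pushes %ss/%rsp/%rflags/%cs/%rip
          *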
    128  * ~~~~~~~~~~ The Stack Switching Mechanism With SVS ~~~~~~~~~~~~~~~~~~~~~~~~~
    129  *
    130  * In the pcpu_area structure, pointed to by the "pcpuarea" variable, each CPU
    131  * has a two-page rsp0 entry (pcpuarea->ent[cid].rsp0). These two pages do
    132  * _not_ have associated physical addresses. They are only two VAs.
    133  *
    134  * The first page is unmapped and acts as a redzone. The second page is
    135  * dynamically kentered into the highest page of the real per-lwp kernel stack;
    136  * but pay close attention, it is kentered _only_ in the user page tables.
    137  * That is to say, the VA of this second page is mapped when the user page
    138  * tables are loaded, but not mapped when the kernel page tables are loaded.
    139  *
    140  * During a context switch, svs_lwp_switch() gets called first. This function
    141  * does the kenter job described above, not in the kernel page tables (that
    142  * are currently loaded), but in the user page tables (that are not loaded).
    143  *
    144  *           VIRTUAL ADDRESSES                     PHYSICAL ADDRESSES
    145  *
    146  * +-----------------------------+
    147  * |      KERNEL PAGE TABLES     |
    148  * |    +-------------------+    |                +-------------------+
    149  * |    | pcb_rsp0 (page 0) | ------------------> | pcb_rsp0 (page 0) |
    150  * |    +-------------------+    |                +-------------------+
    151  * |    | pcb_rsp0 (page 1) | ------------------> | pcb_rsp0 (page 1) |
    152  * |    +-------------------+    |                +-------------------+
    153  * |    | pcb_rsp0 (page 2) | ------------------> | pcb_rsp0 (page 2) |
    154  * |    +-------------------+    |                +-------------------+
    155  * |    | pcb_rsp0 (page 3) | ------------------> | pcb_rsp0 (page 3) |
    156  * |    +-------------------+    |            +-> +-------------------+
    157  * +-----------------------------+            |
    158  *                                            |
    159  * +---------------------------------------+  |
    160  * |           USER PAGE TABLES            |  |
    161  * | +----------------------------------+  |  |
    162  * | | pcpuarea->ent[cid].rsp0 (page 0) |  |  |
    163  * | +----------------------------------+  |  |
    164  * | | pcpuarea->ent[cid].rsp0 (page 1) | ----+
    165  * | +----------------------------------+  |
    166  * +---------------------------------------+
    167  *
    168  * After svs_lwp_switch() gets called, we set pcpuarea->ent[cid].rsp0 (page 1)
    169  * in TSS.rsp0. Later, when returning to userland on the lwp we context-
    170  * switched to, we will load the user page tables and execute in userland
    171  * normally.
    172  *
    173  * Next time an interrupt or syscall is received, the CPU will automatically
    174  * use TSS.rsp0 as a stack. Here it is executing with the user page tables
    175  * loaded, and therefore TSS.rsp0 is _mapped_.
    176  *
    177  * As part of the kernel entry procedure, we now switch CR3 to load the kernel
    178  * page tables. Here, we are still using the stack pointer we set in TSS.rsp0.
    179  *
     180  * Remember that only one page of stack was mapped, and only in the user
     181  * page tables. We just switched to the kernel page tables, so we must
    182  * update RSP to be the real per-lwp kernel stack (pcb_rsp0). And we do so,
    183  * without touching the stack (since it is now unmapped, touching it would
    184  * fault).
    185  *
    186  * After we updated RSP, we can continue execution exactly as in the non-SVS
    187  * case. We don't need to copy the values the CPU pushed on TSS.rsp0: even if
    188  * we updated RSP to a totally different VA, this VA points to the same
    189  * physical page as TSS.rsp0. So in the end, the values the CPU pushed are
    190  * still here even with the new RSP.
    191  *
    192  * Thanks to this double-kenter optimization, we don't need to copy the
    193  * trapframe during each user<->kernel transition.
    194  *
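          * In other words, the kernel entry fixup boils down to the following
          * (illustrative sketch; the per-cpu values are the ones set up in
          * svs_rsp0_init() and svs_lwp_switch() below):
          *
          *	<load utls->kpdirpa into %cr3>	# kernel page tables loaded: the
          *					#   rsp0 VA is now unmapped
          *	<load ci_svs_krsp0 into %rsp>	# same physical page, kernel VA,
          *					#   done without touching the stack
          *
          * ci_svs_krsp0 and ci_svs_ursp0 are kept congruent modulo PAGE_SIZE
          * (see the KASSERT in svs_lwp_switch()), so the reloaded %rsp lands on
          * the very same trapframe that was just pushed.
          *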
    195  * ~~~~~~~~~~ Notes On Locking And Synchronization ~~~~~~~~~~~~~~~~~~~~~~~~~~~
    196  *
    197  *  o Touching ci_svs_updir without holding ci_svs_mtx first is *not*
    198  *    allowed.
    199  *
    200  *  o pm_kernel_cpus contains the set of CPUs that have the pmap loaded
    201  *    in their CR3 register. It must *not* be replaced by pm_cpus.
    202  *
    203  *  o When a context switch on the current CPU is made from a user LWP
    204  *    towards a kernel LWP, CR3 is not updated. Therefore, the pmap's
     205  *    pm_kernel_cpus still contains the current CPU. This implies that the
     206  *    remote CPUs executing other threads of the user process we just
     207  *    left will keep propagating their page table changes to us.
    208  *
    209  * ~~~~~~~~~~ List Of Areas That Are Removed From Userland ~~~~~~~~~~~~~~~~~~~
    210  *
    211  *  o PTE Space
    212  *  o Direct Map
    213  *  o Remote PCPU Areas
    214  *  o Kernel Heap
    215  *  o Kernel Image
    216  *
    217  * ~~~~~~~~~~ Todo List ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    218  *
    219  * Ordered from highest priority to lowest:
    220  *
    221  *  o The NMI stack is not double-entered. Therefore if we ever receive an NMI
    222  *    and leave it, the content of the stack will be visible to userland (via
    223  *    Meltdown). Normally we never leave NMIs, unless a privileged user
     224  *    launched PMCs. That's unlikely to happen: our PMC support is pretty
     225  *    minimal, and privileged-only.
    226  *
    227  *  o Narrow down the entry points: hide the 'jmp handler' instructions. This
    228  *    makes sense on GENERIC_KASLR kernels.
    229  */
    230 
    231 /* -------------------------------------------------------------------------- */
    232 
    233 /* SVS_ENTER. */
    234 extern uint8_t svs_enter, svs_enter_end;
    235 static const struct x86_hotpatch_source hp_svs_enter_source = {
    236 	.saddr = &svs_enter,
    237 	.eaddr = &svs_enter_end
    238 };
    239 static const struct x86_hotpatch_descriptor hp_svs_enter_desc = {
    240 	.name = HP_NAME_SVS_ENTER,
    241 	.nsrc = 1,
    242 	.srcs = { &hp_svs_enter_source }
    243 };
    244 __link_set_add_rodata(x86_hotpatch_descriptors, hp_svs_enter_desc);
    245 
    246 /* SVS_ENTER_ALT. */
    247 extern uint8_t svs_enter_altstack, svs_enter_altstack_end;
    248 static const struct x86_hotpatch_source hp_svs_enter_altstack_source = {
    249 	.saddr = &svs_enter_altstack,
    250 	.eaddr = &svs_enter_altstack_end
    251 };
    252 static const struct x86_hotpatch_descriptor hp_svs_enter_altstack_desc = {
    253 	.name = HP_NAME_SVS_ENTER_ALT,
    254 	.nsrc = 1,
    255 	.srcs = { &hp_svs_enter_altstack_source }
    256 };
    257 __link_set_add_rodata(x86_hotpatch_descriptors, hp_svs_enter_altstack_desc);
    258 
    259 /* SVS_ENTER_NMI. */
    260 extern uint8_t svs_enter_nmi, svs_enter_nmi_end;
    261 static const struct x86_hotpatch_source hp_svs_enter_nmi_source = {
    262 	.saddr = &svs_enter_nmi,
    263 	.eaddr = &svs_enter_nmi_end
    264 };
    265 static const struct x86_hotpatch_descriptor hp_svs_enter_nmi_desc = {
    266 	.name = HP_NAME_SVS_ENTER_NMI,
    267 	.nsrc = 1,
    268 	.srcs = { &hp_svs_enter_nmi_source }
    269 };
    270 __link_set_add_rodata(x86_hotpatch_descriptors, hp_svs_enter_nmi_desc);
    271 
    272 /* SVS_LEAVE. */
    273 extern uint8_t svs_leave, svs_leave_end;
    274 static const struct x86_hotpatch_source hp_svs_leave_source = {
    275 	.saddr = &svs_leave,
    276 	.eaddr = &svs_leave_end
    277 };
    278 static const struct x86_hotpatch_descriptor hp_svs_leave_desc = {
    279 	.name = HP_NAME_SVS_LEAVE,
    280 	.nsrc = 1,
    281 	.srcs = { &hp_svs_leave_source }
    282 };
    283 __link_set_add_rodata(x86_hotpatch_descriptors, hp_svs_leave_desc);
    284 
    285 /* SVS_LEAVE_ALT. */
    286 extern uint8_t svs_leave_altstack, svs_leave_altstack_end;
    287 static const struct x86_hotpatch_source hp_svs_leave_altstack_source = {
    288 	.saddr = &svs_leave_altstack,
    289 	.eaddr = &svs_leave_altstack_end
    290 };
    291 static const struct x86_hotpatch_descriptor hp_svs_leave_altstack_desc = {
    292 	.name = HP_NAME_SVS_LEAVE_ALT,
    293 	.nsrc = 1,
    294 	.srcs = { &hp_svs_leave_altstack_source }
    295 };
    296 __link_set_add_rodata(x86_hotpatch_descriptors, hp_svs_leave_altstack_desc);
    297 
    298 /* SVS_LEAVE_NMI. */
    299 extern uint8_t svs_leave_nmi, svs_leave_nmi_end;
    300 static const struct x86_hotpatch_source hp_svs_leave_nmi_source = {
    301 	.saddr = &svs_leave_nmi,
    302 	.eaddr = &svs_leave_nmi_end
    303 };
    304 static const struct x86_hotpatch_descriptor hp_svs_leave_nmi_desc = {
    305 	.name = HP_NAME_SVS_LEAVE_NMI,
    306 	.nsrc = 1,
    307 	.srcs = { &hp_svs_leave_nmi_source }
    308 };
    309 __link_set_add_rodata(x86_hotpatch_descriptors, hp_svs_leave_nmi_desc);
    310 
    311 /* -------------------------------------------------------------------------- */
    312 
    313 bool svs_enabled __read_mostly = false;
    314 bool svs_pcid __read_mostly = false;
    315 
    316 static uint64_t svs_pcid_kcr3 __read_mostly;
    317 static uint64_t svs_pcid_ucr3 __read_mostly;
    318 
    319 struct svs_utls {
    320 	paddr_t kpdirpa;
    321 	uint64_t scratch;
    322 	vaddr_t rsp0;
    323 };
    324 
    325 static pd_entry_t *
    326 svs_tree_add(struct cpu_info *ci, vaddr_t va)
    327 {
    328 	extern const vaddr_t ptp_masks[];
    329 	extern const int ptp_shifts[];
    330 	pd_entry_t *dstpde;
    331 	struct vm_page *pg;
    332 	size_t i, pidx;
    333 	paddr_t pa;
    334 
    335 	dstpde = ci->ci_svs_updir;
    336 
    337 	for (i = PTP_LEVELS; i > 1; i--) {
    338 		pidx = pl_pi(va, i);
    339 
    340 		if (!pmap_valid_entry(dstpde[pidx])) {
    341 			pg = uvm_pagealloc(NULL, 0, NULL, UVM_PGA_ZERO);
    342 			if (pg == 0)
    343 				panic("%s: failed to allocate PA for CPU %d\n",
    344 					__func__, cpu_index(ci));
    345 			pa = VM_PAGE_TO_PHYS(pg);
    346 
    347 			dstpde[pidx] = PTE_P | PTE_W | pa;
    348 		}
    349 
    350 		pa = (paddr_t)(dstpde[pidx] & PTE_FRAME);
    351 		dstpde = (pd_entry_t *)PMAP_DIRECT_MAP(pa);
    352 	}
    353 
    354 	return dstpde;
    355 }
    356 
    357 static void
    358 svs_page_add(struct cpu_info *ci, vaddr_t va, bool global)
    359 {
    360 	pd_entry_t *srcpde, *dstpde, pde;
    361 	size_t idx, pidx;
    362 	paddr_t pa;
    363 
    364 	/* Create levels L4, L3 and L2. */
    365 	dstpde = svs_tree_add(ci, va);
    366 
    367 	pidx = pl1_pi(va);
    368 
    369 	/*
    370 	 * If 'va' is in a large page, we need to compute its physical
    371 	 * address manually.
    372 	 */
    373 	idx = pl2_i(va);
    374 	srcpde = L2_BASE;
    375 	if (!pmap_valid_entry(srcpde[idx])) {
    376 		panic("%s: L2 page not mapped", __func__);
    377 	}
    378 	if (srcpde[idx] & PTE_PS) {
    379 		KASSERT(!global);
    380 		pa = srcpde[idx] & PTE_2MFRAME;
    381 		pa += (paddr_t)(va % NBPD_L2);
    382 		pde = (srcpde[idx] & ~(PTE_PS|PTE_2MFRAME)) | pa;
    383 
    384 		if (pmap_valid_entry(dstpde[pidx])) {
    385 			panic("%s: L1 page already mapped", __func__);
    386 		}
    387 		dstpde[pidx] = pde;
    388 		return;
    389 	}
    390 
    391 	/*
    392 	 * Normal page, just copy the PDE.
    393 	 */
    394 	idx = pl1_i(va);
    395 	srcpde = L1_BASE;
    396 	if (!pmap_valid_entry(srcpde[idx])) {
    397 		panic("%s: L1 page not mapped", __func__);
    398 	}
    399 	if (pmap_valid_entry(dstpde[pidx])) {
    400 		panic("%s: L1 page already mapped", __func__);
    401 	}
    402 	dstpde[pidx] = srcpde[idx];
    403 
    404 	/*
    405 	 * If we want a global translation, mark both the src and dst with
    406 	 * PTE_G.
    407 	 */
    408 	if (global) {
    409 		srcpde[idx] |= PTE_G;
    410 		dstpde[pidx] |= PTE_G;
    411 		tlbflushg();
    412 	}
    413 }
    414 
    415 static void
    416 svs_rsp0_init(struct cpu_info *ci)
    417 {
    418 	const cpuid_t cid = cpu_index(ci);
    419 	vaddr_t va, rsp0;
    420 	pd_entry_t *pd;
    421 	size_t pidx;
    422 
    423 	rsp0 = (vaddr_t)&pcpuarea->ent[cid].rsp0;
    424 
    425 	/* The first page is a redzone. */
    426 	va = rsp0 + PAGE_SIZE;
    427 
    428 	/* Create levels L4, L3 and L2. */
    429 	pd = svs_tree_add(ci, va);
    430 
    431 	/* Get the info for L1. */
    432 	pidx = pl1_i(va % NBPD_L2);
    433 	if (pmap_valid_entry(pd[pidx])) {
    434 		panic("%s: rsp0 page already mapped", __func__);
    435 	}
    436 
    437 	ci->ci_svs_rsp0_pte = (pt_entry_t *)&pd[pidx];
    438 	ci->ci_svs_rsp0 = rsp0 + PAGE_SIZE + sizeof(struct trapframe);
    439 	ci->ci_svs_ursp0 = ci->ci_svs_rsp0 - sizeof(struct trapframe);
    440 	ci->ci_svs_krsp0 = 0;
    441 }
    442 
    443 static void
    444 svs_utls_init(struct cpu_info *ci)
    445 {
    446 	const vaddr_t utlsva = (vaddr_t)&pcpuarea->utls;
    447 	struct svs_utls *utls;
    448 	struct vm_page *pg;
    449 	pd_entry_t *pd;
    450 	size_t pidx;
    451 	paddr_t pa;
    452 	vaddr_t va;
    453 
    454 	/* Create levels L4, L3 and L2 of the UTLS page. */
    455 	pd = svs_tree_add(ci, utlsva);
    456 
    457 	/* Allocate L1. */
    458 	pg = uvm_pagealloc(NULL, 0, NULL, UVM_PGA_ZERO);
    459 	if (pg == 0)
    460 		panic("%s: failed to allocate PA for CPU %d\n", __func__,
    461 		    cpu_index(ci));
    462 	pa = VM_PAGE_TO_PHYS(pg);
    463 
    464 	/* Enter L1. */
    465 	if (pmap_valid_entry(L1_BASE[pl1_i(utlsva)])) {
    466 		panic("%s: local page already mapped", __func__);
    467 	}
    468 	pidx = pl1_pi(utlsva);
    469 	if (pmap_valid_entry(pd[pidx])) {
    470 		panic("%s: L1 page already mapped", __func__);
    471 	}
    472 	pd[pidx] = PTE_P | PTE_W | pmap_pg_nx | pa;
    473 
    474 	/*
    475 	 * Now, allocate a VA in the kernel map, that points to the UTLS
    476 	 * page. After that, the UTLS page will be accessible in kernel
    477 	 * mode via ci_svs_utls.
    478 	 */
    479 	va = uvm_km_alloc(kernel_map, PAGE_SIZE, 0,
    480 	    UVM_KMF_VAONLY|UVM_KMF_NOWAIT);
    481 	if (va == 0) {
    482 		panic("%s: unable to allocate VA\n", __func__);
    483 	}
    484 	pmap_kenter_pa(va, pa, VM_PROT_READ|VM_PROT_WRITE, 0);
    485 	pmap_update(pmap_kernel());
    486 
    487 	ci->ci_svs_utls = va;
    488 
    489 	/* Initialize the constant fields of the UTLS page */
    490 	utls = (struct svs_utls *)ci->ci_svs_utls;
    491 	utls->rsp0 = ci->ci_svs_rsp0;
    492 }
    493 
    494 static void
    495 svs_pcid_init(struct cpu_info *ci)
    496 {
    497 	if (!svs_pcid) {
    498 		return;
    499 	}
    500 
    501 	svs_pcid_ucr3 = __SHIFTIN(PMAP_PCID_USER, CR3_PCID) | CR3_NO_TLB_FLUSH;
    502 	svs_pcid_kcr3 = __SHIFTIN(PMAP_PCID_KERN, CR3_PCID) | CR3_NO_TLB_FLUSH;
    503 
    504 	ci->ci_svs_updirpa |= svs_pcid_ucr3;
    505 }
    506 
    507 static void
    508 svs_range_add(struct cpu_info *ci, vaddr_t va, size_t size, bool global)
    509 {
    510 	size_t i, n;
    511 
    512 	KASSERT(size % PAGE_SIZE == 0);
    513 	n = size / PAGE_SIZE;
    514 	for (i = 0; i < n; i++) {
    515 		svs_page_add(ci, va + i * PAGE_SIZE, global);
    516 	}
    517 }
    518 
    519 void
    520 cpu_svs_init(struct cpu_info *ci)
    521 {
    522 	extern char __text_user_start;
    523 	extern char __text_user_end;
    524 	extern vaddr_t idt_vaddr;
    525 	const cpuid_t cid = cpu_index(ci);
    526 	struct vm_page *pg;
    527 
    528 	KASSERT(ci != NULL);
    529 
    530 	pg = uvm_pagealloc(NULL, 0, NULL, UVM_PGA_ZERO);
    531 	if (pg == 0)
    532 		panic("%s: failed to allocate L4 PA for CPU %d\n",
    533 			__func__, cpu_index(ci));
    534 	ci->ci_svs_updirpa = VM_PAGE_TO_PHYS(pg);
    535 
    536 	ci->ci_svs_updir = (pt_entry_t *)uvm_km_alloc(kernel_map, PAGE_SIZE, 0,
    537 		UVM_KMF_VAONLY | UVM_KMF_NOWAIT);
    538 	if (ci->ci_svs_updir == NULL)
    539 		panic("%s: failed to allocate L4 VA for CPU %d\n",
    540 			__func__, cpu_index(ci));
    541 
    542 	pmap_kenter_pa((vaddr_t)ci->ci_svs_updir, ci->ci_svs_updirpa,
    543 		VM_PROT_READ | VM_PROT_WRITE, 0);
    544 
    545 	pmap_update(pmap_kernel());
    546 
    547 	mutex_init(&ci->ci_svs_mtx, MUTEX_DEFAULT, IPL_VM);
    548 
    549 	if (cid == cpu_index(&cpu_info_primary) || !idt_vec_is_pcpu())
    550 		svs_page_add(ci, idt_vaddr, true);
    551 	svs_page_add(ci, (vaddr_t)&pcpuarea->ldt, true);
    552 	svs_range_add(ci, (vaddr_t)&pcpuarea->ent[cid],
    553 	    offsetof(struct pcpu_entry, rsp0), true);
    554 	svs_range_add(ci, (vaddr_t)&__text_user_start,
    555 	    (vaddr_t)&__text_user_end - (vaddr_t)&__text_user_start, false);
    556 
    557 	svs_rsp0_init(ci);
    558 	svs_utls_init(ci);
    559 	svs_pcid_init(ci);
    560 
    561 #ifdef USER_LDT
    562 	mutex_enter(&cpu_lock);
    563 	ci->ci_svs_ldt_sel = ldt_alloc(&pcpuarea->ent[cid].ldt,
    564 	    MAX_USERLDT_SIZE);
    565 	mutex_exit(&cpu_lock);
    566 #endif
    567 }
    568 
    569 void
    570 svs_pmap_sync(struct pmap *pmap, int index)
    571 {
    572 	CPU_INFO_ITERATOR cii;
    573 	struct cpu_info *ci;
    574 	cpuid_t cid;
    575 
    576 	KASSERT(pmap != NULL);
    577 	KASSERT(pmap != pmap_kernel());
    578 	KASSERT(pmap_is_user(pmap));
    579 	KASSERT(mutex_owned(&pmap->pm_lock));
    580 	KASSERT(kpreempt_disabled());
    581 	KASSERT(index < PDIR_SLOT_USERLIM);
    582 
    583 	ci = curcpu();
    584 	cid = cpu_index(ci);
    585 
    586 	mutex_enter(&ci->ci_svs_mtx);
    587 	KASSERT(kcpuset_isset(pmap->pm_kernel_cpus, cid));
    588 	ci->ci_svs_updir[index] = pmap->pm_pdir[index];
    589 	mutex_exit(&ci->ci_svs_mtx);
    590 
    591 	if (!kcpuset_isotherset(pmap->pm_kernel_cpus, cid)) {
    592 		return;
    593 	}
    594 
    595 	for (CPU_INFO_FOREACH(cii, ci)) {
    596 		cid = cpu_index(ci);
    597 
    598 		if (!kcpuset_isset(pmap->pm_kernel_cpus, cid)) {
    599 			continue;
    600 		}
    601 
    602 		/* take the lock and check again */
    603 		mutex_enter(&ci->ci_svs_mtx);
    604 		if (kcpuset_isset(pmap->pm_kernel_cpus, cid)) {
    605 			ci->ci_svs_updir[index] = pmap->pm_pdir[index];
    606 		}
    607 		mutex_exit(&ci->ci_svs_mtx);
    608 	}
    609 }
    610 
    611 void
    612 svs_ldt_sync(struct pmap *pmap)
    613 {
    614 	struct cpu_info *ci = curcpu();
    615 	void *ldt;
    616 	int sel;
    617 
    618 	KASSERT(kpreempt_disabled());
    619 
    620 	/*
    621 	 * Another LWP could concurrently modify the LDT via x86_set_ldt1().
    622 	 * The LWP will wait for pmap_ldt_sync() to finish before destroying
    623 	 * the outdated LDT.
    624 	 *
    625 	 * We have preemption disabled here, so it is guaranteed that even
    626 	 * if the LDT we are syncing is the outdated one, it is still valid.
    627 	 *
    628 	 * pmap_ldt_sync() will execute later once we have preemption enabled,
    629 	 * and will install the new LDT.
    630 	 */
    631 	sel = atomic_load_relaxed(&pmap->pm_ldt_sel);
    632 	if (__predict_false(sel != GSYSSEL(GLDT_SEL, SEL_KPL))) {
    633 		ldt = atomic_load_relaxed(&pmap->pm_ldt);
    634 		memcpy(&pcpuarea->ent[cpu_index(ci)].ldt, ldt,
    635 		    MAX_USERLDT_SIZE);
    636 		sel = ci->ci_svs_ldt_sel;
    637 	}
    638 
    639 	lldt(sel);
    640 }
    641 
    642 void
    643 svs_lwp_switch(struct lwp *oldlwp, struct lwp *newlwp)
    644 {
    645 	struct cpu_info *ci = curcpu();
    646 	struct svs_utls *utls;
    647 	struct pcb *pcb;
    648 	pt_entry_t *pte;
    649 	uintptr_t rsp0;
    650 	vaddr_t va;
    651 
    652 	if (newlwp->l_flag & LW_SYSTEM) {
    653 		return;
    654 	}
    655 
    656 #ifdef DIAGNOSTIC
    657 	if (!(oldlwp->l_flag & LW_SYSTEM)) {
    658 		pcb = lwp_getpcb(oldlwp);
    659 		rsp0 = pcb->pcb_rsp0;
    660 		va = rounddown(rsp0, PAGE_SIZE);
    661 		KASSERT(ci->ci_svs_krsp0 == rsp0 - sizeof(struct trapframe));
    662 		pte = ci->ci_svs_rsp0_pte;
    663 		KASSERT(*pte == L1_BASE[pl1_i(va)]);
    664 	}
    665 #endif
    666 
    667 	pcb = lwp_getpcb(newlwp);
    668 	rsp0 = pcb->pcb_rsp0;
    669 	va = rounddown(rsp0, PAGE_SIZE);
    670 
    671 	/* Update the kernel rsp0 in cpu_info */
    672 	ci->ci_svs_krsp0 = rsp0 - sizeof(struct trapframe);
    673 	KASSERT((ci->ci_svs_krsp0 % PAGE_SIZE) ==
    674 	    (ci->ci_svs_ursp0 % PAGE_SIZE));
    675 
    676 	utls = (struct svs_utls *)ci->ci_svs_utls;
    677 	utls->scratch = 0;
    678 
    679 	/*
     680 	 * Enter the user rsp0. If we're using PCID we must flush the user VA;
     681 	 * if we aren't, it will be flushed during the next CR3 reload.
    682 	 */
    683 	pte = ci->ci_svs_rsp0_pte;
    684 	*pte = L1_BASE[pl1_i(va)];
    685 	if (svs_pcid) {
    686 		invpcid(INVPCID_ADDRESS, PMAP_PCID_USER, ci->ci_svs_rsp0);
    687 	}
    688 }
    689 
    690 /*
    691  * We may come here with the pmap unlocked.  If a remote CPU is updating
     692  * the pmap at the same time, it's not a problem: the remote CPU will call
    693  * svs_pmap_sync afterwards, and our updirpa will be synchronized properly.
    694  */
    695 void
    696 svs_pdir_switch(struct pmap *pmap)
    697 {
    698 	struct cpu_info *ci = curcpu();
    699 	struct svs_utls *utls;
    700 
    701 	KASSERT(kpreempt_disabled());
    702 	KASSERT(pmap != pmap_kernel());
    703 	KASSERT(pmap_is_user(pmap));
    704 
    705 	/* Update the info in the UTLS page */
    706 	utls = (struct svs_utls *)ci->ci_svs_utls;
    707 	utls->kpdirpa = pmap_pdirpa(pmap, 0) | svs_pcid_kcr3;
    708 
    709 	/* Copy user slots. */
    710 	mutex_enter(&ci->ci_svs_mtx);
    711 	svs_quad_copy(ci->ci_svs_updir, pmap->pm_pdir, PDIR_SLOT_USERLIM);
    712 	mutex_exit(&ci->ci_svs_mtx);
    713 
    714 	if (svs_pcid) {
    715 		invpcid(INVPCID_CONTEXT, PMAP_PCID_USER, 0);
    716 	}
    717 }
    718 
    719 static void
    720 svs_enable(void)
    721 {
    722 	svs_enabled = true;
    723 
    724 	x86_hotpatch(HP_NAME_SVS_ENTER, 0);
    725 	x86_hotpatch(HP_NAME_SVS_ENTER_ALT, 0);
    726 	x86_hotpatch(HP_NAME_SVS_ENTER_NMI, 0);
    727 
    728 	x86_hotpatch(HP_NAME_SVS_LEAVE, 0);
    729 	x86_hotpatch(HP_NAME_SVS_LEAVE_ALT, 0);
    730 	x86_hotpatch(HP_NAME_SVS_LEAVE_NMI, 0);
    731 }
    732 
    733 void
    734 svs_init(void)
    735 {
    736 	uint64_t msr;
    737 
    738 	if (cpu_vendor != CPUVENDOR_INTEL) {
    739 		return;
    740 	}
    741 	if (boothowto & RB_MD3) {
    742 		return;
    743 	}
    744 	if (cpu_info_primary.ci_feat_val[7] & CPUID_SEF_ARCH_CAP) {
    745 		msr = rdmsr(MSR_IA32_ARCH_CAPABILITIES);
    746 		if (msr & IA32_ARCH_RDCL_NO) {
    747 			/*
    748 			 * The processor indicates it is not vulnerable to the
    749 			 * Rogue Data Cache Load (Meltdown) flaw.
    750 			 */
    751 			return;
    752 		}
    753 	}
    754 
    755 	if ((cpu_info_primary.ci_feat_val[1] & CPUID2_PCID) &&
    756 	    (cpu_info_primary.ci_feat_val[5] & CPUID_SEF_INVPCID)) {
    757 		svs_pcid = true;
    758 		lcr4(rcr4() | CR4_PCIDE);
    759 	}
    760 
    761 	svs_enable();
    762 }
    763