      1 /*	$NetBSD: fpu.c,v 1.93 2025/05/14 23:39:54 riastradh Exp $	*/
      2 
      3 /*
      4  * Copyright (c) 2008, 2019 The NetBSD Foundation, Inc.  All
      5  * rights reserved.
      6  *
      7  * This code is derived from software developed for The NetBSD Foundation
      8  * by Andrew Doran and Maxime Villard.
      9  *
     10  * Redistribution and use in source and binary forms, with or without
     11  * modification, are permitted provided that the following conditions
     12  * are met:
     13  * 1. Redistributions of source code must retain the above copyright
     14  *    notice, this list of conditions and the following disclaimer.
     15  * 2. Redistributions in binary form must reproduce the above copyright
     16  *    notice, this list of conditions and the following disclaimer in the
     17  *    documentation and/or other materials provided with the distribution.
     18  *
     19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
     20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
     21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
     23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
     24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
     25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
     26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
     27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
     28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
     29  * POSSIBILITY OF SUCH DAMAGE.
     30  */
     31 
     32 /*
     33  * Copyright (c) 1991 The Regents of the University of California.
     34  * All rights reserved.
     35  *
     36  * Redistribution and use in source and binary forms, with or without
     37  * modification, are permitted provided that the following conditions
     38  * are met:
     39  * 1. Redistributions of source code must retain the above copyright
     40  *    notice, this list of conditions and the following disclaimer.
     41  * 2. Redistributions in binary form must reproduce the above copyright
     42  *    notice, this list of conditions and the following disclaimer in the
     43  *    documentation and/or other materials provided with the distribution.
     44  * 3. Neither the name of the University nor the names of its contributors
     45  *    may be used to endorse or promote products derived from this software
     46  *    without specific prior written permission.
     47  *
     48  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
     49  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     50  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     51  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
     52  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
     53  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
     54  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
     55  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
     56  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
     57  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
     58  * SUCH DAMAGE.
     59  *
     60  *	@(#)npx.c	7.2 (Berkeley) 5/12/91
     61  */
     62 
     63 /*
     64  * Copyright (c) 1994, 1995, 1998 Charles M. Hannum.  All rights reserved.
     65  * Copyright (c) 1990 William Jolitz.
     66  *
     67  * Redistribution and use in source and binary forms, with or without
     68  * modification, are permitted provided that the following conditions
     69  * are met:
     70  * 1. Redistributions of source code must retain the above copyright
     71  *    notice, this list of conditions and the following disclaimer.
     72  * 2. Redistributions in binary form must reproduce the above copyright
     73  *    notice, this list of conditions and the following disclaimer in the
     74  *    documentation and/or other materials provided with the distribution.
     75  * 3. All advertising materials mentioning features or use of this software
     76  *    must display the following acknowledgement:
     77  *	This product includes software developed by the University of
     78  *	California, Berkeley and its contributors.
     79  * 4. Neither the name of the University nor the names of its contributors
     80  *    may be used to endorse or promote products derived from this software
     81  *    without specific prior written permission.
     82  *
     83  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
     84  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     85  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     86  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
     87  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
     88  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
     89  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
     90  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
     91  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
     92  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
     93  * SUCH DAMAGE.
     94  *
     95  *	@(#)npx.c	7.2 (Berkeley) 5/12/91
     96  */
     97 
     98 #include <sys/cdefs.h>
     99 __KERNEL_RCSID(0, "$NetBSD: fpu.c,v 1.93 2025/05/14 23:39:54 riastradh Exp $");
    100 
    101 #include "opt_ddb.h"
    102 #include "opt_multiprocessor.h"
    103 
    104 #include <sys/param.h>
    105 #include <sys/systm.h>
    106 #include <sys/conf.h>
    107 #include <sys/cpu.h>
    108 #include <sys/file.h>
    109 #include <sys/proc.h>
    110 #include <sys/kernel.h>
    111 #include <sys/sysctl.h>
    112 #include <sys/xcall.h>
    113 
    114 #include <machine/cpu.h>
    115 #include <machine/cpuvar.h>
    116 #include <machine/cputypes.h>
    117 #include <machine/intr.h>
    118 #include <machine/cpufunc.h>
    119 #include <machine/pcb.h>
    120 #include <machine/trap.h>
    121 #include <machine/specialreg.h>
    122 #include <x86/cpu.h>
    123 #include <x86/fpu.h>
    124 
    125 #ifdef DDB
    126 #include <ddb/ddb.h>
    127 #endif
    128 
    129 #ifdef XENPV
    130 #define clts() HYPERVISOR_fpu_taskswitch(0)
    131 #define stts() HYPERVISOR_fpu_taskswitch(1)
    132 #endif
    133 
    134 void fpu_handle_deferred(void);
    135 void fpu_switch(struct lwp *, struct lwp *);
    136 
    137 uint32_t x86_fpu_mxcsr_mask __read_mostly = 0;
    138 
    139 static const union savefpu safe_fpu_storage __aligned(64) = {
    140 	.sv_xmm = {
    141 		.fx_mxcsr = __SAFE_MXCSR__,
    142 	},
    143 };
    144 static const union savefpu zero_fpu_storage __aligned(64);
    145 
    146 static const void *safe_fpu __read_mostly = &safe_fpu_storage;
    147 static const void *zero_fpu __read_mostly = &zero_fpu_storage;
    148 
    149 /*
    150  * x86_fpu_save_separate_p()
    151  *
    152  *	True if we allocate the FPU save space separately, outside the
    153  *	struct pcb itself, because it doesn't fit in a single page.
    154  */
    155 bool
    156 x86_fpu_save_separate_p(void)
    157 {
    158 
    159 	return x86_fpu_save_size >
    160 	    PAGE_SIZE - offsetof(struct pcb, pcb_savefpusmall);
    161 }
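        /*
         * Illustrative example (editor's note, not in the original source):
         * with the usual 4096-byte x86 page and pcb_savefpusmall placed near
         * the start of struct pcb, roughly 4 KB remains inside the pcb.  A
         * 512-byte FXSAVE image fits easily, but an XSAVE area that includes
         * the 8192-byte AMX TILEDATA component does not, so this predicate
         * becomes true and the save area must live in a separate allocation.
         */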
    162 
    163 static inline union savefpu *
    164 fpu_lwp_area(struct lwp *l)
    165 {
    166 	struct pcb *pcb = lwp_getpcb(l);
    167 	union savefpu *area = pcb->pcb_savefpu;
    168 
    169 	KASSERT((l->l_flag & LW_SYSTEM) == 0);
    170 	if (l == curlwp) {
    171 		fpu_save();
    172 	}
    173 	KASSERT(!(l->l_md.md_flags & MDL_FPU_IN_CPU));
    174 
    175 	return area;
    176 }
    177 
    178 static inline void
    179 fpu_save_lwp(struct lwp *l)
    180 {
    181 	struct pcb *pcb = lwp_getpcb(l);
    182 	union savefpu *area = pcb->pcb_savefpu;
    183 	int s;
    184 
    185 	s = splvm();
    186 	if (l->l_md.md_flags & MDL_FPU_IN_CPU) {
    187 		KASSERT((l->l_flag & LW_SYSTEM) == 0);
    188 		fpu_area_save(area, x86_xsave_features, !(l->l_proc->p_flag & PK_32));
    189 		l->l_md.md_flags &= ~MDL_FPU_IN_CPU;
    190 	}
    191 	splx(s);
    192 }
    193 
    194 /*
    195  * Bring curlwp's FPU state into memory. It will be installed back in the CPU
    196  * when returning to userland.
    197  */
    198 void
    199 fpu_save(void)
    200 {
    201 	fpu_save_lwp(curlwp);
    202 }
    203 
    204 void
    205 fpuinit(struct cpu_info *ci)
    206 {
    207 	/*
    208 	 * This might not be strictly necessary since it will be initialized
    209 	 * for each process. However, it does no harm.
    210 	 */
    211 	clts();
    212 	fninit();
    213 	stts();
    214 }
    215 
    216 /*
    217  * fpuinit_mxcsr_mask()
    218  *
    219  *	Called once by cpu_init on the primary CPU.  Initializes
    220  *	x86_fpu_mxcsr_mask based on the initial FPU state, and
    221  *	initializes save_fpu and zero_fpu if necessary when the
    222  *	initializes safe_fpu and zero_fpu if necessary when the
    223  *
    224  *	XXX Rename this function!
    225  */
    226 void
    227 fpuinit_mxcsr_mask(void)
    228 {
    229 	/*
    230 	 * If the CPU's x86 fpu save size is larger than union savefpu,
    231 	 * we have to allocate larger buffers for the safe and zero FPU
    232 	 * states used here and by fpu_kern_enter/leave.
    233 	 *
    234 	 * Note: This is NOT the same as x86_fpu_save_separate_p(),
    235 	 * which may have a little more space than union savefpu.
    236 	 */
    237 	const bool allocfpusave = x86_fpu_save_size > sizeof(union savefpu);
    238 	vaddr_t va;
    239 
    240 #if defined XENPV
    241 	if (x86_fpu_save_separate_p()) {
    242 		/*
    243 		 * XXX Temporary workaround for PR kern/59371 until we
    244 		 * work out the implications.
    245 		 */
    246 		panic("NetBSD/xen does not support fpu save size %u",
    247 		    x86_fpu_save_size);
    248 	}
    249 #elif defined __i386__
    250 	if (x86_fpu_save_separate_p()) {
    251 		/*
    252 		 * XXX Need to teach cpu_uarea_alloc/free to allocate a
    253 		 * separate fpu save space, and make pcb_savefpu a
    254 		 * pointer indirection -- currently only done on amd64,
    255 		 * not on i386.
    256 		 *
    257 		 * But the primary motivation on amd64 is the 8192-byte
    258 		 * TILEDATA state for Intel AMX (Advanced Matrix
    259 		 * Extensions), which doesn't work in 32-bit mode
    260 		 * anyway, so on such machines we ought to just disable
    261 		 * it in the first place and keep x86_fpu_save_size
    262 		 * down:
    263 		 *
    264 		 *	While Intel AMX instructions can be executed
    265 		 *	only in 64-bit mode, instructions of the XSAVE
    266 		 *	feature set can operate on TILECFG and TILEDATA
    267 		 *	in any mode.  It is recommended that only
    268 		 *	64-bit operating systems enable Intel AMX by
    269 		 *	setting XCR0[18:17].
    270 		 *
    271 		 *	--Intel 64 and IA-32 Architectures Software
    272 		 *	Developer's Manual, Volume 1: Basic
    273 		 *	Architecture, Order Number: 253665-087US, March
    274 		 *	2025, Sec. 13.3 `Enabling the XSAVE feature set
    275 		 *	and XSAVE-enabled features', p. 13-6.
    276 		 *	https://cdrdv2.intel.com/v1/dl/getContent/671436
    277 		 *	https://web.archive.org/web/20250404141850/https://cdrdv2-public.intel.com/851056/253665-087-sdm-vol-1.pdf
    278 		 *	https://web.archive.org/web/20250404141850if_/https://cdrdv2-public.intel.com/851056/253665-087-sdm-vol-1.pdf#page=324
    279 		 */
    280 		panic("NetBSD/i386 does not support fpu save size %u",
    281 		    x86_fpu_save_size);
    282 	}
    283 #endif
    284 
    285 #ifndef XENPV
    286 	struct fxsave fpusave __aligned(64);
    287 	u_long psl;
    288 
    289 	memset(&fpusave, 0, sizeof(fpusave));
    290 
    291 	/* Disable interrupts, and enable FPU */
    292 	psl = x86_read_psl();
    293 	x86_disable_intr();
    294 	clts();
    295 
    296 	/* Fill in the FPU area */
    297 	fxsave(&fpusave);
    298 
    299 	/* Restore previous state */
    300 	stts();
    301 	x86_write_psl(psl);
    302 
    303 	if (fpusave.fx_mxcsr_mask == 0) {
    304 		x86_fpu_mxcsr_mask = __INITIAL_MXCSR_MASK__;
    305 	} else {
    306 		x86_fpu_mxcsr_mask = fpusave.fx_mxcsr_mask;
    307 	}
    308 #else
    309 	/*
    310 	 * XXX XXX XXX: On Xen the FXSAVE above faults. That's because
    311 	 * &fpusave is not 16-byte aligned. Stack alignment problem
    312 	 * somewhere, it seems.
    313 	 */
    314 	x86_fpu_mxcsr_mask = __INITIAL_MXCSR_MASK__;
    315 #endif
    316 
    317 	/*
    318 	 * If necessary, allocate FPU save spaces for safe or zero FPU
    319 	 * state, for fpu_kern_enter/leave.
    320 	 */
    321 	if (allocfpusave) {
    322 		__CTASSERT(PAGE_SIZE >= 64);
    323 
    324 		va = uvm_km_alloc(kernel_map, x86_fpu_save_size, PAGE_SIZE,
    325 		    UVM_KMF_WIRED|UVM_KMF_ZERO|UVM_KMF_WAITVA);
    326 		memcpy((void *)va, &safe_fpu_storage,
    327 		    sizeof(safe_fpu_storage));
    328 		uvm_km_protect(kernel_map, va, x86_fpu_save_size,
    329 		    VM_PROT_READ);
    330 		safe_fpu = (void *)va;
    331 
    332 		va = uvm_km_alloc(kernel_map, x86_fpu_save_size, PAGE_SIZE,
    333 		    UVM_KMF_WIRED|UVM_KMF_ZERO|UVM_KMF_WAITVA);
    334 		/*
    335 		 * No initialization -- just want zeroes!  In fact we
    336 		 * could share this with other all-zero pages.
    337 		 */
    338 		uvm_km_protect(kernel_map, va, x86_fpu_save_size,
    339 		    VM_PROT_READ);
    340 		zero_fpu = (void *)va;
    341 	}
    342 }
    343 
    344 static inline void
    345 fpu_errata_amd(void)
    346 {
    347 	uint16_t sw;
    348 
    349 	/*
    350 	 * AMD FPUs do not restore FIP, FDP, and FOP on fxrstor and xrstor
    351 	 * when FSW.ES=0, leaking other threads' execution history.
    352 	 *
    353 	 * Clear them manually by loading a zero (fldummy). We do this
    354 	 * unconditionally, regardless of FSW.ES.
    355 	 *
    356 	 * Before that, clear the ES bit in the x87 status word if it is
    357 	 * currently set, in order to avoid causing a fault in the
    358 	 * upcoming load.
    359 	 *
    360 	 * Newer generations of AMD CPUs have CPUID_Fn80000008_EBX[2],
    361 	 * which indicates that FIP/FDP/FOP are restored (same behavior
    362 	 * as Intel). We're not using it though.
    363 	 */
    364 	fnstsw(&sw);
    365 	if (sw & 0x80)
    366 		fnclex();
    367 	fldummy();
    368 }
    369 
    370 #ifdef __x86_64__
    371 #define XS64(x) (is_64bit ? x##64 : x)
    372 #else
    373 #define XS64(x) x
    374 #endif
    375 
    376 void
    377 fpu_area_save(void *area, uint64_t xsave_features, bool is_64bit)
    378 {
    379 	switch (x86_fpu_save) {
    380 	case FPU_SAVE_FSAVE:
    381 		fnsave(area);
    382 		break;
    383 	case FPU_SAVE_FXSAVE:
    384 		XS64(fxsave)(area);
    385 		break;
    386 	case FPU_SAVE_XSAVE:
    387 		XS64(xsave)(area, xsave_features);
    388 		break;
    389 	case FPU_SAVE_XSAVEOPT:
    390 		XS64(xsaveopt)(area, xsave_features);
    391 		break;
    392 	}
    393 
    394 	stts();
    395 }
    396 
    397 void
    398 fpu_area_restore(const void *area, uint64_t xsave_features, bool is_64bit)
    399 {
    400 	clts();
    401 
    402 	switch (x86_fpu_save) {
    403 	case FPU_SAVE_FSAVE:
    404 		frstor(area);
    405 		break;
    406 	case FPU_SAVE_FXSAVE:
    407 		if (cpu_vendor == CPUVENDOR_AMD)
    408 			fpu_errata_amd();
    409 		XS64(fxrstor)(area);
    410 		break;
    411 	case FPU_SAVE_XSAVE:
    412 	case FPU_SAVE_XSAVEOPT:
    413 		if (cpu_vendor == CPUVENDOR_AMD)
    414 			fpu_errata_amd();
    415 		XS64(xrstor)(area, xsave_features);
    416 		break;
    417 	}
    418 }
    419 
    420 void
    421 fpu_handle_deferred(void)
    422 {
    423 	struct pcb *pcb = lwp_getpcb(curlwp);
    424 	fpu_area_restore(pcb->pcb_savefpu, x86_xsave_features,
    425 	    !(curlwp->l_proc->p_flag & PK_32));
    426 }
    427 
    428 void
    429 fpu_switch(struct lwp *oldlwp, struct lwp *newlwp)
    430 {
    431 	struct cpu_info *ci __diagused = curcpu();
    432 	struct pcb *pcb;
    433 
    434 	KASSERTMSG(ci->ci_ilevel >= IPL_SCHED, "cpu%d ilevel=%d",
    435 	    cpu_index(ci), ci->ci_ilevel);
    436 
    437 	if (oldlwp->l_md.md_flags & MDL_FPU_IN_CPU) {
    438 		KASSERT(!(oldlwp->l_flag & LW_SYSTEM));
    439 		pcb = lwp_getpcb(oldlwp);
    440 		fpu_area_save(pcb->pcb_savefpu, x86_xsave_features,
    441 		    !(oldlwp->l_proc->p_flag & PK_32));
    442 		oldlwp->l_md.md_flags &= ~MDL_FPU_IN_CPU;
    443 	}
    444 	KASSERT(!(newlwp->l_md.md_flags & MDL_FPU_IN_CPU));
    445 }
    446 
    447 void
    448 fpu_lwp_fork(struct lwp *l1, struct lwp *l2)
    449 {
    450 	struct pcb *pcb2 = lwp_getpcb(l2);
    451 	union savefpu *fpu_save;
    452 
    453 	/* Kernel threads have no FPU. */
    454 	if (__predict_false(l2->l_flag & LW_SYSTEM)) {
    455 		return;
    456 	}
    457 
    458 	/* For init(8). */
    459 	if (__predict_false(l1->l_flag & LW_SYSTEM)) {
    460 		memset(pcb2->pcb_savefpu, 0, x86_fpu_save_size);
    461 		return;
    462 	}
    463 
    464 	fpu_save = fpu_lwp_area(l1);
    465 	memcpy(pcb2->pcb_savefpu, fpu_save, x86_fpu_save_size);
    466 	l2->l_md.md_flags &= ~MDL_FPU_IN_CPU;
    467 }
    468 
    469 void
    470 fpu_lwp_abandon(struct lwp *l)
    471 {
    472 	int s;
    473 
    474 	KASSERT(l == curlwp);
    475 	s = splvm();
    476 	l->l_md.md_flags &= ~MDL_FPU_IN_CPU;
    477 	stts();
    478 	splx(s);
    479 }
    480 
    481 /* -------------------------------------------------------------------------- */
    482 
    483 /*
    484  * fpu_kern_enter()
    485  *
    486  *	Begin using the FPU.  Raises to splvm, disabling most
    487  *	interrupts and rendering the thread non-preemptible; caller
    488  *	should not use this for long periods of time, and must call
    489  *	fpu_kern_leave() afterward.  Non-recursive -- you cannot call
    490  *	fpu_kern_enter() again without calling fpu_kern_leave() first.
    491  *
    492  *	Must be used only at IPL_VM or below -- never in IPL_SCHED or
    493  *	IPL_HIGH interrupt handlers.
    494  */
    495 void
    496 fpu_kern_enter(void)
    497 {
    498 	struct lwp *l = curlwp;
    499 	struct cpu_info *ci;
    500 	int s;
    501 
    502 	s = splvm();
    503 
    504 	ci = curcpu();
    505 #if 0
    506 	/*
    507 	 * Can't assert this because if the caller holds a spin lock at
    508 	 * IPL_VM, and previously held and released a spin lock at
    509 	 * higher IPL, the IPL remains raised above IPL_VM.
    510 	 */
    511 	KASSERTMSG(ci->ci_ilevel <= IPL_VM || cold, "ilevel=%d",
    512 	    ci->ci_ilevel);
    513 #endif
    514 	KASSERT(ci->ci_kfpu_spl == -1);
    515 	ci->ci_kfpu_spl = s;
    516 
    517 	/*
    518 	 * If we are in a softint and have a pinned lwp, the fpu state is that
    519 	 * of the pinned lwp, so save it there.
    520 	 */
    521 	while ((l->l_pflag & LP_INTR) && (l->l_switchto != NULL))
    522 		l = l->l_switchto;
    523 	fpu_save_lwp(l);
    524 
    525 	/*
    526 	 * Clear CR0_TS, which fpu_save_lwp set if it saved anything --
    527 	 * otherwise the CPU will trap if we try to use the FPU under
    528 	 * the false impression that there has been a task switch since
    529 	 * the last FPU usage requiring that we save the FPU state.
    530 	 */
    531 	clts();
    532 
    533 	/*
    534 	 * Zero the FPU registers and install safe control words.
    535 	 */
    536 	fpu_area_restore(safe_fpu, x86_xsave_features, /*is_64bit*/false);
    537 }
    538 
    539 /*
    540  * fpu_kern_leave()
    541  *
    542  *	End using the FPU after fpu_kern_enter().
    543  */
    544 void
    545 fpu_kern_leave(void)
    546 {
    547 	struct cpu_info *ci = curcpu();
    548 	int s;
    549 
    550 #if 0
    551 	/*
    552 	 * Can't assert this because if the caller holds a spin lock at
    553 	 * IPL_VM, and previously held and released a spin lock at
    554 	 * higher IPL, the IPL remains raised above IPL_VM.
    555 	 */
    556 	KASSERT(ci->ci_ilevel == IPL_VM || cold);
    557 #endif
    558 	KASSERT(ci->ci_kfpu_spl != -1);
    559 
    560 	/*
    561 	 * Zero the fpu registers; otherwise we might leak secrets
    562 	 * through Spectre-class attacks to userland, even if there are
    563 	 * no bugs in fpu state management.
    564 	 */
    565 	fpu_area_restore(zero_fpu, x86_xsave_features, /*is_64bit*/false);
    566 
    567 	/*
    568 	 * Set CR0_TS again so that the kernel can't accidentally use
    569 	 * the FPU.
    570 	 */
    571 	stts();
    572 
    573 	s = ci->ci_kfpu_spl;
    574 	ci->ci_kfpu_spl = -1;
    575 	splx(s);
    576 }
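        /*
         * Illustrative usage sketch (editor's addition): a kernel routine
         * that wants to use the FPU -- say an SSE-optimized memory or
         * crypto loop -- brackets the FPU-using region, keeps it short,
         * and does not sleep in between:
         *
         *	fpu_kern_enter();
         *	...code that uses XMM/YMM registers...
         *	fpu_kern_leave();
         */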
    577 
    578 /* -------------------------------------------------------------------------- */
    579 
    580 /*
    581  * The following table is used to ensure that the FPE_... value
    582  * that is passed as a trapcode to the signal handler of the user
    583  * process does not have more than one bit set.
    584  *
    585  * Multiple bits may be set if SSE simd instructions generate errors
    586  * on more than one value or if the user process modifies the control
    587  * word while a status word bit is already set (which is a sign
    588  * of bad coding).
    589  * We have no choice but to narrow them down to one bit, since we must
    590  * not send a trapcode that is not exactly one of the FPE_ macros.
    591  *
    592  * The mechanism has a static table with 127 entries.  Each combination
    593  * of the 7 FPU status word exception bits directly translates to a
    594  * position in this table, where a single FPE_... value is stored.
    595  * This FPE_... value stored there is considered the "most important"
    596  * of the exception bits and will be sent as the signal code.  The
    597  * precedence of the bits is based upon Intel Document "Numerical
    598  * Applications", Chapter "Special Computational Situations".
    599  *
    600  * The code to choose one of these values does these steps:
    601  * 1) Throw away status word bits that cannot be masked.
    602  * 2) Throw away the bits currently masked in the control word,
    603  *    assuming the user isn't interested in them anymore.
    604  * 3) Reinsert status word bit 7 (stack fault) if it is set, which
    605  *    cannot be masked but must be preserved.
    606  *    'Stack fault' is a sub-class of 'invalid operation'.
    607  * 4) Use the remaining bits to point into the trapcode table.
    608  *
    609  * The 6 maskable bits in order of their preference, as stated in the
    610  * above referenced Intel manual:
    611  * 1  Invalid operation (FP_X_INV)
    612  * 1a   Stack underflow
    613  * 1b   Stack overflow
    614  * 1c   Operand of unsupported format
    615  * 1d   SNaN operand.
    616  * 2  QNaN operand (not an exception, irrelevant here)
    617  * 3  Any other invalid-operation not mentioned above or zero divide
    618  *      (FP_X_INV, FP_X_DZ)
    619  * 4  Denormal operand (FP_X_DNML)
    620  * 5  Numeric over/underflow (FP_X_OFL, FP_X_UFL)
    621  * 6  Inexact result (FP_X_IMP)
    622  *
    623  * NB: the above seems to mix up the mxcsr error bits and the x87 ones.
    624  * They are in the same order, but there is no EN_SW_STACK_FAULT in the
    625  * mxcsr status.
    626  *
    627  * The table is nearly, but not quite, in bit order (ZERODIV and DENORM
    628  * are swapped).
    629  *
    630  * This table assumes that any stack fault is cleared - so that an INVOP
    631  * fault will only be reported as FLTSUB once.
    632  * This might not happen if the mask is being changed.
    633  */
    634 #define FPE_xxx1(f) (f & EN_SW_INVOP \
    635 		? (f & EN_SW_STACK_FAULT ? FPE_FLTSUB : FPE_FLTINV) \
    636 	: f & EN_SW_ZERODIV ? FPE_FLTDIV \
    637 	: f & EN_SW_DENORM ? FPE_FLTUND \
    638 	: f & EN_SW_OVERFLOW ? FPE_FLTOVF \
    639 	: f & EN_SW_UNDERFLOW ? FPE_FLTUND \
    640 	: f & EN_SW_PRECLOSS ? FPE_FLTRES \
    641 	: f & EN_SW_STACK_FAULT ? FPE_FLTSUB : 0)
    642 #define	FPE_xxx2(f)	FPE_xxx1(f),	FPE_xxx1((f + 1))
    643 #define	FPE_xxx4(f)	FPE_xxx2(f),	FPE_xxx2((f + 2))
    644 #define	FPE_xxx8(f)	FPE_xxx4(f),	FPE_xxx4((f + 4))
    645 #define	FPE_xxx16(f)	FPE_xxx8(f),	FPE_xxx8((f + 8))
    646 #define	FPE_xxx32(f)	FPE_xxx16(f),	FPE_xxx16((f + 16))
    647 static const uint8_t fpetable[128] = {
    648 	FPE_xxx32(0), FPE_xxx32(32), FPE_xxx32(64), FPE_xxx32(96)
    649 };
    650 #undef FPE_xxx1
    651 #undef FPE_xxx2
    652 #undef FPE_xxx4
    653 #undef FPE_xxx8
    654 #undef FPE_xxx16
    655 #undef FPE_xxx32
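        /*
         * Worked example (editor's addition): if the unmasked bits left in
         * "statbits & 0x7f" are EN_SW_INVOP|EN_SW_PRECLOSS (invalid
         * operation plus inexact result), FPE_xxx1() tests EN_SW_INVOP
         * first and, with EN_SW_STACK_FAULT clear, the table entry for
         * that index is FPE_FLTINV: the higher-priority invalid-operation
         * exception wins over the inexact result.
         */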
    656 
    657 /*
    658  * This is a synchronous trap on either an x87 instruction (due to an unmasked
    659  * error on the previous x87 instruction) or on an SSE/SSE2/etc instruction due
    660  * to an error on the instruction itself.
    661  *
    662  * If the trap actually generates a signal, the fpu state is saved and
    663  * copied onto the lwp's user stack, then recovered from there when the
    664  * signal returns.
    665  *
    666  * All this code needs to do is save the reason for the trap. For x87 traps the
    667  * status word bits need clearing to stop the trap from recurring. For SSE traps
    668  * the mxcsr bits are 'sticky' and need clearing to not confuse a later trap.
    669  *
    670  * We come here with interrupts disabled.
    671  */
    672 void
    673 fputrap(struct trapframe *frame)
    674 {
    675 	uint32_t statbits;
    676 	ksiginfo_t ksi;
    677 
    678 	if (__predict_false(!USERMODE(frame->tf_cs))) {
    679 		register_t ip = X86_TF_RIP(frame);
    680 		char where[128];
    681 
    682 #ifdef DDB
    683 		db_symstr(where, sizeof(where), (db_expr_t)ip, DB_STGY_PROC);
    684 #else
    685 		snprintf(where, sizeof(where), "%p", (void *)ip);
    686 #endif
    687 		panic("fpu trap from kernel at %s, trapframe %p\n", where,
    688 		    frame);
    689 	}
    690 
    691 	KASSERT(curlwp->l_md.md_flags & MDL_FPU_IN_CPU);
    692 
    693 	if (frame->tf_trapno == T_XMM) {
    694 		uint32_t mxcsr;
    695 		x86_stmxcsr(&mxcsr);
    696 		statbits = mxcsr;
    697 		/* Clear the sticky status bits */
    698 		mxcsr &= ~0x3f;
    699 		x86_ldmxcsr(&mxcsr);
    700 
    701 		/* Remove masked interrupts and non-status bits */
    702 		statbits &= ~(statbits >> 7) & 0x3f;
    703 		/* Mark this is an XMM status */
    704 		statbits |= 0x10000;
    705 	} else {
    706 		uint16_t cw, sw;
    707 		/* Get current control and status words */
    708 		fnstcw(&cw);
    709 		fnstsw(&sw);
    710 		/* Clear any pending exceptions from status word */
    711 		fnclex();
    712 
    713 		/* Remove masked interrupts */
    714 		statbits = sw & ~(cw & 0x3f);
    715 	}
    716 
    717 	/* Doesn't matter now if we get pre-empted */
    718 	x86_enable_intr();
    719 
    720 	KSI_INIT_TRAP(&ksi);
    721 	ksi.ksi_signo = SIGFPE;
    722 	ksi.ksi_addr = (void *)X86_TF_RIP(frame);
    723 	ksi.ksi_code = fpetable[statbits & 0x7f];
    724 	ksi.ksi_trap = statbits;
    725 	(*curlwp->l_proc->p_emul->e_trapsignal)(curlwp, &ksi);
    726 }
    727 
    728 void
    729 fpudna(struct trapframe *frame)
    730 {
    731 #ifdef XENPV
    732 	/*
    733 	 * Xen produces spurious fpudna traps; just do nothing.
    734 	 */
    735 	if (USERMODE(frame->tf_cs)) {
    736 		clts();
    737 		return;
    738 	}
    739 #endif
    740 	panic("fpudna from %s, ip %p, trapframe %p",
    741 	    USERMODE(frame->tf_cs) ? "userland" : "kernel",
    742 	    (void *)X86_TF_RIP(frame), frame);
    743 }
    744 
    745 /* -------------------------------------------------------------------------- */
    746 
    747 static inline void
    748 fpu_xstate_reload(union savefpu *fpu_save, uint64_t xstate)
    749 {
    750 	/*
    751 	 * Force a reload of the given xstate during the next XRSTOR.
    752 	 */
    753 	if (x86_fpu_save >= FPU_SAVE_XSAVE) {
    754 		fpu_save->sv_xsave_hdr.xsh_xstate_bv |= xstate;
    755 	}
    756 }
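        /*
         * Editor's note (illustrative): setting a bit in xsh_xstate_bv marks
         * the corresponding component as present in the save area, so the
         * next XRSTOR loads it from memory instead of resetting it to its
         * initial configuration.  fpu_set_default_cw() and fpu_clear() rely
         * on this when they install a non-default x87 control word.
         */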
    757 
    758 void
    759 fpu_set_default_cw(struct lwp *l, unsigned int x87_cw)
    760 {
    761 	union savefpu *fpu_save = fpu_lwp_area(l);
    762 	struct pcb *pcb = lwp_getpcb(l);
    763 
    764 	if (i386_use_fxsave) {
    765 		fpu_save->sv_xmm.fx_cw = x87_cw;
    766 		if (x87_cw != __INITIAL_NPXCW__) {
    767 			fpu_xstate_reload(fpu_save, XCR0_X87);
    768 		}
    769 	} else {
    770 		fpu_save->sv_87.s87_cw = x87_cw;
    771 	}
    772 	pcb->pcb_fpu_dflt_cw = x87_cw;
    773 }
    774 
    775 void
    776 fpu_clear(struct lwp *l, unsigned int x87_cw)
    777 {
    778 	union savefpu *fpu_save;
    779 	struct pcb *pcb;
    780 
    781 	KASSERT(l == curlwp);
    782 	fpu_save = fpu_lwp_area(l);
    783 
    784 	switch (x86_fpu_save) {
    785 	case FPU_SAVE_FSAVE:
    786 		memset(&fpu_save->sv_87, 0, x86_fpu_save_size);
    787 		fpu_save->sv_87.s87_tw = 0xffff;
    788 		fpu_save->sv_87.s87_cw = x87_cw;
    789 		break;
    790 	case FPU_SAVE_FXSAVE:
    791 		memset(&fpu_save->sv_xmm, 0, x86_fpu_save_size);
    792 		fpu_save->sv_xmm.fx_mxcsr = __INITIAL_MXCSR__;
    793 		fpu_save->sv_xmm.fx_mxcsr_mask = x86_fpu_mxcsr_mask;
    794 		fpu_save->sv_xmm.fx_cw = x87_cw;
    795 		break;
    796 	case FPU_SAVE_XSAVE:
    797 	case FPU_SAVE_XSAVEOPT:
    798 		memset(&fpu_save->sv_xmm, 0, x86_fpu_save_size);
    799 		fpu_save->sv_xmm.fx_mxcsr = __INITIAL_MXCSR__;
    800 		fpu_save->sv_xmm.fx_mxcsr_mask = x86_fpu_mxcsr_mask;
    801 		fpu_save->sv_xmm.fx_cw = x87_cw;
    802 		if (__predict_false(x87_cw != __INITIAL_NPXCW__)) {
    803 			fpu_xstate_reload(fpu_save, XCR0_X87);
    804 		}
    805 		break;
    806 	}
    807 
    808 	pcb = lwp_getpcb(l);
    809 	pcb->pcb_fpu_dflt_cw = x87_cw;
    810 }
    811 
    812 void
    813 fpu_sigreset(struct lwp *l)
    814 {
    815 	union savefpu *fpu_save = fpu_lwp_area(l);
    816 	struct pcb *pcb = lwp_getpcb(l);
    817 
    818 	/*
    819 	 * For signal handlers the register values don't matter. Just reset
    820 	 * a few fields.
    821 	 */
    822 	if (i386_use_fxsave) {
    823 		fpu_save->sv_xmm.fx_mxcsr = __INITIAL_MXCSR__;
    824 		fpu_save->sv_xmm.fx_mxcsr_mask = x86_fpu_mxcsr_mask;
    825 		fpu_save->sv_xmm.fx_tw = 0;
    826 		fpu_save->sv_xmm.fx_cw = pcb->pcb_fpu_dflt_cw;
    827 	} else {
    828 		fpu_save->sv_87.s87_tw = 0xffff;
    829 		fpu_save->sv_87.s87_cw = pcb->pcb_fpu_dflt_cw;
    830 	}
    831 }
    832 
    833 void
    834 process_write_fpregs_xmm(struct lwp *l, const struct fxsave *fpregs)
    835 {
    836 	union savefpu *fpu_save = fpu_lwp_area(l);
    837 
    838 	if (i386_use_fxsave) {
    839 		memcpy(&fpu_save->sv_xmm, fpregs, sizeof(fpu_save->sv_xmm));
    840 
    841 		/*
    842 		 * Invalid bits in mxcsr or mxcsr_mask will cause faults.
    843 		 */
    844 		fpu_save->sv_xmm.fx_mxcsr_mask &= x86_fpu_mxcsr_mask;
    845 		fpu_save->sv_xmm.fx_mxcsr &= fpu_save->sv_xmm.fx_mxcsr_mask;
    846 
    847 		fpu_xstate_reload(fpu_save, XCR0_X87 | XCR0_SSE);
    848 	} else {
    849 		process_xmm_to_s87(fpregs, &fpu_save->sv_87);
    850 	}
    851 }
    852 
    853 void
    854 process_write_fpregs_s87(struct lwp *l, const struct save87 *fpregs)
    855 {
    856 	union savefpu *fpu_save = fpu_lwp_area(l);
    857 
    858 	if (i386_use_fxsave) {
    859 		process_s87_to_xmm(fpregs, &fpu_save->sv_xmm);
    860 		fpu_xstate_reload(fpu_save, XCR0_X87 | XCR0_SSE);
    861 	} else {
    862 		memcpy(&fpu_save->sv_87, fpregs, sizeof(fpu_save->sv_87));
    863 	}
    864 }
    865 
    866 void
    867 process_read_fpregs_xmm(struct lwp *l, struct fxsave *fpregs)
    868 {
    869 	union savefpu *fpu_save = fpu_lwp_area(l);
    870 
    871 	if (i386_use_fxsave) {
    872 		memcpy(fpregs, &fpu_save->sv_xmm, sizeof(fpu_save->sv_xmm));
    873 	} else {
    874 		memset(fpregs, 0, sizeof(*fpregs));
    875 		process_s87_to_xmm(&fpu_save->sv_87, fpregs);
    876 	}
    877 }
    878 
    879 void
    880 process_read_fpregs_s87(struct lwp *l, struct save87 *fpregs)
    881 {
    882 	union savefpu *fpu_save = fpu_lwp_area(l);
    883 
    884 	if (i386_use_fxsave) {
    885 		memset(fpregs, 0, sizeof(*fpregs));
    886 		process_xmm_to_s87(&fpu_save->sv_xmm, fpregs);
    887 	} else {
    888 		memcpy(fpregs, &fpu_save->sv_87, sizeof(fpu_save->sv_87));
    889 	}
    890 }
    891 
    892 int
    893 process_read_xstate(struct lwp *l, struct xstate *xstate)
    894 {
    895 	union savefpu *fpu_save = fpu_lwp_area(l);
    896 
    897 	if (x86_fpu_save == FPU_SAVE_FSAVE) {
    898 		/* Convert from legacy FSAVE format. */
    899 		memset(&xstate->xs_fxsave, 0, sizeof(xstate->xs_fxsave));
    900 		process_s87_to_xmm(&fpu_save->sv_87, &xstate->xs_fxsave);
    901 
    902 		/* We only got x87 data. */
    903 		xstate->xs_rfbm = XCR0_X87;
    904 		xstate->xs_xstate_bv = XCR0_X87;
    905 		return 0;
    906 	}
    907 
    908 	/* Copy the legacy area. */
    909 	memcpy(&xstate->xs_fxsave, fpu_save->sv_xsave_hdr.xsh_fxsave,
    910 	    sizeof(xstate->xs_fxsave));
    911 
    912 	if (x86_fpu_save == FPU_SAVE_FXSAVE) {
    913 		/* FXSAVE means we've got x87 + SSE data. */
    914 		xstate->xs_rfbm = XCR0_X87 | XCR0_SSE;
    915 		xstate->xs_xstate_bv = XCR0_X87 | XCR0_SSE;
    916 		return 0;
    917 	}
    918 
    919 	/* Copy the bitmap indicating which states are available. */
    920 	xstate->xs_rfbm = x86_xsave_features & XCR0_FPU;
    921 	xstate->xs_xstate_bv = fpu_save->sv_xsave_hdr.xsh_xstate_bv;
    922 	KASSERT(!(xstate->xs_xstate_bv & ~xstate->xs_rfbm));
    923 
    924 #define COPY_COMPONENT(xcr0_val, xsave_val, field)			\
    925 	if (xstate->xs_xstate_bv & xcr0_val) {				\
    926 		KASSERT(x86_xsave_offsets[xsave_val]			\
    927 		    >= sizeof(struct xsave_header));			\
    928 		KASSERT(x86_xsave_sizes[xsave_val]			\
    929 		    >= sizeof(xstate->field));				\
    930 		memcpy(&xstate->field,					\
    931 		    (char*)fpu_save + x86_xsave_offsets[xsave_val],	\
    932 		    sizeof(xstate->field));				\
    933 	}
    934 
    935 	COPY_COMPONENT(XCR0_YMM_Hi128, XSAVE_YMM_Hi128, xs_ymm_hi128);
    936 	COPY_COMPONENT(XCR0_Opmask, XSAVE_Opmask, xs_opmask);
    937 	COPY_COMPONENT(XCR0_ZMM_Hi256, XSAVE_ZMM_Hi256, xs_zmm_hi256);
    938 	COPY_COMPONENT(XCR0_Hi16_ZMM, XSAVE_Hi16_ZMM, xs_hi16_zmm);
    939 
    940 #undef COPY_COMPONENT
    941 
    942 	return 0;
    943 }
    944 
    945 int
    946 process_verify_xstate(const struct xstate *xstate)
    947 {
    948 	/* xstate_bv must be a subset of RFBM */
    949 	if (xstate->xs_xstate_bv & ~xstate->xs_rfbm)
    950 		return EINVAL;
    951 
    952 	switch (x86_fpu_save) {
    953 	case FPU_SAVE_FSAVE:
    954 		if ((xstate->xs_rfbm & ~XCR0_X87))
    955 			return EINVAL;
    956 		break;
    957 	case FPU_SAVE_FXSAVE:
    958 		if ((xstate->xs_rfbm & ~(XCR0_X87 | XCR0_SSE)))
    959 			return EINVAL;
    960 		break;
    961 	default:
    962 		/* Verify that no unsupported features are enabled */
    963 		if ((xstate->xs_rfbm & ~(x86_xsave_features & XCR0_FPU)) != 0)
    964 			return EINVAL;
    965 	}
    966 
    967 	return 0;
    968 }
    969 
    970 int
    971 process_write_xstate(struct lwp *l, const struct xstate *xstate)
    972 {
    973 	union savefpu *fpu_save = fpu_lwp_area(l);
    974 
    975 	/* Convert data into legacy FSAVE format. */
    976 	if (x86_fpu_save == FPU_SAVE_FSAVE) {
    977 		if (xstate->xs_xstate_bv & XCR0_X87)
    978 			process_xmm_to_s87(&xstate->xs_fxsave, &fpu_save->sv_87);
    979 		return 0;
    980 	}
    981 
    982 	/* If XSAVE is supported, make sure that xstate_bv is set correctly. */
    983 	if (x86_fpu_save >= FPU_SAVE_XSAVE) {
    984 		/*
    985 		 * Bit-wise "xstate->xs_rfbm ? xstate->xs_xstate_bv :
    986 		 *           fpu_save->sv_xsave_hdr.xsh_xstate_bv"
    987 		 */
    988 		fpu_save->sv_xsave_hdr.xsh_xstate_bv =
    989 		    (fpu_save->sv_xsave_hdr.xsh_xstate_bv & ~xstate->xs_rfbm) |
    990 		    xstate->xs_xstate_bv;
    991 	}
    992 
    993 	if (xstate->xs_xstate_bv & XCR0_X87) {
    994 		/*
    995 		 * X87 state is split into two areas, interspersed with SSE
    996 		 * data.
    997 		 */
    998 		memcpy(&fpu_save->sv_xmm, &xstate->xs_fxsave, 24);
    999 		memcpy(fpu_save->sv_xmm.fx_87_ac, xstate->xs_fxsave.fx_87_ac,
   1000 		    sizeof(xstate->xs_fxsave.fx_87_ac));
   1001 	}
   1002 
   1003 	/*
   1004 	 * Copy MXCSR if either SSE or AVX state is requested, to match the
   1005 	 * XSAVE behavior for those flags.
   1006 	 */
   1007 	if (xstate->xs_xstate_bv & (XCR0_SSE|XCR0_YMM_Hi128)) {
   1008 		/*
   1009 		 * Invalid bits in mxcsr or mxcsr_mask will cause faults.
   1010 		 */
   1011 		fpu_save->sv_xmm.fx_mxcsr_mask = xstate->xs_fxsave.fx_mxcsr_mask
   1012 		    & x86_fpu_mxcsr_mask;
   1013 		fpu_save->sv_xmm.fx_mxcsr = xstate->xs_fxsave.fx_mxcsr &
   1014 		    fpu_save->sv_xmm.fx_mxcsr_mask;
   1015 	}
   1016 
   1017 	if (xstate->xs_xstate_bv & XCR0_SSE) {
   1018 		memcpy(&fpu_save->sv_xsave_hdr.xsh_fxsave[160],
   1019 		    xstate->xs_fxsave.fx_xmm, sizeof(xstate->xs_fxsave.fx_xmm));
   1020 	}
   1021 
   1022 #define COPY_COMPONENT(xcr0_val, xsave_val, field)			\
   1023 	if (xstate->xs_xstate_bv & xcr0_val) {				\
   1024 		KASSERT(x86_xsave_offsets[xsave_val]			\
   1025 		    >= sizeof(struct xsave_header));			\
   1026 		KASSERT(x86_xsave_sizes[xsave_val]			\
   1027 		    >= sizeof(xstate->field));				\
   1028 		memcpy((char *)fpu_save + x86_xsave_offsets[xsave_val],	\
   1029 		    &xstate->field, sizeof(xstate->field));		\
   1030 	}
   1031 
   1032 	COPY_COMPONENT(XCR0_YMM_Hi128, XSAVE_YMM_Hi128, xs_ymm_hi128);
   1033 	COPY_COMPONENT(XCR0_Opmask, XSAVE_Opmask, xs_opmask);
   1034 	COPY_COMPONENT(XCR0_ZMM_Hi256, XSAVE_ZMM_Hi256, xs_zmm_hi256);
   1035 	COPY_COMPONENT(XCR0_Hi16_ZMM, XSAVE_Hi16_ZMM, xs_hi16_zmm);
   1036 
   1037 #undef COPY_COMPONENT
   1038 
   1039 	return 0;
   1040 }
   1041