      1 /*	$NetBSD: locore.S,v 1.233 2025/09/09 11:34:25 bouyer Exp $	*/
      2 
      3 /*
      4  * Copyright-o-rama!
      5  */
      6 
      7 /*
      8  * Copyright (c) 1998, 2000, 2007, 2008, 2016 The NetBSD Foundation, Inc.
      9  * All rights reserved.
     10  *
     11  * This code is derived from software contributed to The NetBSD Foundation
     12  * by Charles M. Hannum and by Maxime Villard.
     13  *
     14  * Redistribution and use in source and binary forms, with or without
     15  * modification, are permitted provided that the following conditions
     16  * are met:
     17  * 1. Redistributions of source code must retain the above copyright
     18  *    notice, this list of conditions and the following disclaimer.
     19  * 2. Redistributions in binary form must reproduce the above copyright
     20  *    notice, this list of conditions and the following disclaimer in the
     21  *    documentation and/or other materials provided with the distribution.
     22  *
     23  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
     24  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
     25  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     26  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
     27  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
     28  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
     29  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
     30  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
     31  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
     32  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
     33  * POSSIBILITY OF SUCH DAMAGE.
     34  */
     35 
     36 /*
     37  * Copyright (c) 2007 Manuel Bouyer.
     38  *
     39  * Redistribution and use in source and binary forms, with or without
     40  * modification, are permitted provided that the following conditions
     41  * are met:
     42  * 1. Redistributions of source code must retain the above copyright
     43  *    notice, this list of conditions and the following disclaimer.
     44  * 2. Redistributions in binary form must reproduce the above copyright
     45  *    notice, this list of conditions and the following disclaimer in the
     46  *    documentation and/or other materials provided with the distribution.
     47  *
     48  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
     49  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
     50  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
     51  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
     52  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
     53  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
     54  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
     55  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     56  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
     57  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     58  *
     59  */
     60 
     61 /*
     62  * Copyright (c) 2006 Mathieu Ropert <mro (at) adviseo.fr>
     63  *
     64  * Permission to use, copy, modify, and distribute this software for any
     65  * purpose with or without fee is hereby granted, provided that the above
     66  * copyright notice and this permission notice appear in all copies.
     67  *
     68  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
     69  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
     70  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
     71  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
     72  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
     73  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
     74  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
     75  */
     76 
     77 /*
     78  * Copyright (c) 2001 Wasabi Systems, Inc.
     79  * All rights reserved.
     80  *
     81  * Written by Frank van der Linden for Wasabi Systems, Inc.
     82  *
     83  * Redistribution and use in source and binary forms, with or without
     84  * modification, are permitted provided that the following conditions
     85  * are met:
     86  * 1. Redistributions of source code must retain the above copyright
     87  *    notice, this list of conditions and the following disclaimer.
     88  * 2. Redistributions in binary form must reproduce the above copyright
     89  *    notice, this list of conditions and the following disclaimer in the
     90  *    documentation and/or other materials provided with the distribution.
     91  * 3. All advertising materials mentioning features or use of this software
     92  *    must display the following acknowledgement:
     93  *      This product includes software developed for the NetBSD Project by
     94  *      Wasabi Systems, Inc.
     95  * 4. The name of Wasabi Systems, Inc. may not be used to endorse
     96  *    or promote products derived from this software without specific prior
     97  *    written permission.
     98  *
     99  * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND
    100  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
    101  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
    102  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL WASABI SYSTEMS, INC
    103  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
    104  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
    105  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
    106  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
    107  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
    108  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
    109  * POSSIBILITY OF SUCH DAMAGE.
    110  */
    111 
    112 /*-
    113  * Copyright (c) 1990 The Regents of the University of California.
    114  * All rights reserved.
    115  *
    116  * This code is derived from software contributed to Berkeley by
    117  * William Jolitz.
    118  *
    119  * Redistribution and use in source and binary forms, with or without
    120  * modification, are permitted provided that the following conditions
    121  * are met:
    122  * 1. Redistributions of source code must retain the above copyright
    123  *    notice, this list of conditions and the following disclaimer.
    124  * 2. Redistributions in binary form must reproduce the above copyright
    125  *    notice, this list of conditions and the following disclaimer in the
    126  *    documentation and/or other materials provided with the distribution.
    127  * 3. Neither the name of the University nor the names of its contributors
    128  *    may be used to endorse or promote products derived from this software
    129  *    without specific prior written permission.
    130  *
    131  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
    132  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
    133  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
    134  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
    135  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
    136  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
    137  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
    138  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
    139  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
    140  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
    141  * SUCH DAMAGE.
    142  *
    143  *	@(#)locore.s	7.3 (Berkeley) 5/13/91
    144  */
    145 
    146 /* Override user-land alignment before including asm.h */
    147 #define	ALIGN_DATA	.align	8
    148 #define ALIGN_TEXT	.align 16,0x90
    149 #define _ALIGN_TEXT	ALIGN_TEXT
    150 
    151 #include <machine/asm.h>
    152 
    153 #include "opt_kasan.h"
    154 #include "opt_copy_symtab.h"
    155 #include "opt_ddb.h"
    156 #include "opt_ddbparam.h"
    157 #include "opt_modular.h"
    158 #include "opt_realmem.h"
    159 #include "opt_selfreloc.h"
    160 
    161 #include "opt_compat_netbsd.h"
    162 #include "opt_compat_netbsd32.h"
    163 #include "opt_xen.h"
    164 #include "opt_svs.h"
    165 
    166 #include "assym.h"
    167 #include "lapic.h"
    168 #include "ioapic.h"
    169 #include "ksyms.h"
    170 
    171 #include <sys/errno.h>
    172 #include <sys/syscall.h>
    173 
    174 #include <machine/pte.h>
    175 #include <machine/segments.h>
    176 #include <machine/specialreg.h>
    177 #include <machine/trap.h>
    178 #include <machine/bootinfo.h>
    179 #include <machine/frameasm.h>
    180 #include <machine/cputypes.h>
    181 
    182 #if NLAPIC > 0
    183 #include <machine/i82489reg.h>
    184 #endif
    185 
    186 /* Get definitions for IOM_BEGIN, IOM_END, and IOM_SIZE */
    187 #include <dev/isa/isareg.h>
    188 
    189 #define	_RELOC(x)	((x) - KERNBASE)
    190 #define	RELOC(x)	_RELOC(_C_LABEL(x))
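/*
 * Illustrative note (not part of the build): while the code below still
 * runs from the low physical load address with paging disabled, a kernel
 * symbol linked at KERNBASE+offset must be accessed through its physical
 * address; RELOC() performs that translation, i.e. RELOC(sym) is the
 * physical address of 'sym', given that the kernel is loaded at its link
 * offset.
 */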
    191 
    192 /* 32bit version of PTE_NX */
    193 #define PTE_NX32	0x80000000
    194 
    195 #if L2_SLOT_KERNBASE > 0
    196 #define TABLE_L2_ENTRIES (2 * (NKL2_KIMG_ENTRIES + 1))
    197 #else
    198 #define TABLE_L2_ENTRIES (NKL2_KIMG_ENTRIES + 1)
    199 #endif
    200 
    201 #if L3_SLOT_KERNBASE > 0
    202 #define TABLE_L3_ENTRIES (2 * NKL3_KIMG_ENTRIES)
    203 #else
    204 #define TABLE_L3_ENTRIES NKL3_KIMG_ENTRIES
    205 #endif
    206 
    207 #define PROC0_PML4_OFF	0
    208 #define PROC0_STK_OFF	(PROC0_PML4_OFF + 1 * PAGE_SIZE)
    209 #define PROC0_PTP3_OFF	(PROC0_STK_OFF + UPAGES * PAGE_SIZE)
    210 #define PROC0_PTP2_OFF	(PROC0_PTP3_OFF + NKL4_KIMG_ENTRIES * PAGE_SIZE)
    211 #define PROC0_PTP1_OFF	(PROC0_PTP2_OFF + TABLE_L3_ENTRIES * PAGE_SIZE)
    212 #define TABLESIZE \
    213   ((NKL4_KIMG_ENTRIES + TABLE_L3_ENTRIES + TABLE_L2_ENTRIES + 1 + UPAGES) \
    214     * PAGE_SIZE)
    215 
    216 /* Amount of VA used to map the kernel, the syms and the preloaded modules */
    217 #define BOOTMAP_VA_SIZE \
    218 	(NKL2_KIMG_ENTRIES * (1 << L2_SHIFT) - TABLESIZE - IOM_SIZE)
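/*
 * Illustrative sketch of the bootstrap area (page counts depend on the
 * kernel configuration; see the PROC0_*_OFF definitions above):
 *
 *	+PROC0_PML4_OFF	L4 (PML4)		1 page
 *	+PROC0_STK_OFF	lwp0 stack/uarea	UPAGES pages
 *	+PROC0_PTP3_OFF	L3 pages		NKL4_KIMG_ENTRIES pages
 *	+PROC0_PTP2_OFF	L2 pages		TABLE_L3_ENTRIES pages
 *	+PROC0_PTP1_OFF	L1 pages		TABLE_L2_ENTRIES pages
 *
 * TABLESIZE is the sum of all of the above.  BOOTMAP_VA_SIZE is the VA
 * covered by the bootstrap L1 pages (NKL2_KIMG_ENTRIES L2 slots of
 * 1 << L2_SHIFT bytes each) minus the tables themselves and the ISA I/O
 * window, i.e. the VA budget left for the kernel image, the symbols and
 * any preloaded modules.
 */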
    219 
    220 /*
    221  * fillkpt - Fill in a kernel page table
    222  *	eax = pte (page frame | control | status)
    223  *	ebx = page table address
    224  *	ecx = number of pages to map
    225  *
    226  * Each entry is 8 (PDE_SIZE) bytes long: we must set the 4 upper bytes to 0.
    227  */
    228 #define fillkpt	\
    229 	cmpl	$0,%ecx			;	/* zero-sized? */	\
    230 	je 	2f			; \
    231 1:	movl	$0,(PDE_SIZE-4)(%ebx)	;	/* upper 32 bits: 0 */	\
    232 	movl	%eax,(%ebx)		;	/* store phys addr */	\
    233 	addl	$PDE_SIZE,%ebx		;	/* next PTE/PDE */	\
    234 	addl	$PAGE_SIZE,%eax		;	/* next phys page */	\
    235 	loop	1b			; \
    236 2:					;
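/*
 * Roughly equivalent C, for reference only (the real code runs in 32bit
 * mode before paging is enabled, hence the two 32bit stores per 64bit
 * entry; the names below are illustrative, not kernel symbols):
 *
 *	void
 *	fillkpt(uint64_t *ptp, uint32_t pte, uint32_t npages)
 *	{
 *		while (npages-- > 0) {
 *			*ptp++ = pte;		// upper 32 bits stay zero
 *			pte += PAGE_SIZE;	// next physical page
 *		}
 *	}
 *
 * fillkpt_nox below is the same except that the upper 32 bits are set to
 * nox_flag (the 32bit PTE_NX), and fillkpt_blank stores zero entries.
 */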
    237 
    238 /*
    239  * fillkpt_nox - Same as fillkpt, but sets the NX/XD bit.
    240  */
    241 #define fillkpt_nox \
    242 	cmpl	$0,%ecx			;	/* zero-sized? */	\
    243 	je 	2f			; \
    244 	pushl	%ebp			; \
    245 	movl	RELOC(nox_flag),%ebp	; \
    246 1:	movl	%ebp,(PDE_SIZE-4)(%ebx)	;	/* upper 32 bits: NX */ \
    247 	movl	%eax,(%ebx)		;	/* store phys addr */	\
    248 	addl	$PDE_SIZE,%ebx		;	/* next PTE/PDE */	\
    249 	addl	$PAGE_SIZE,%eax		;	/* next phys page */	\
    250 	loop	1b			; \
    251 	popl	%ebp			; \
    252 2:					;
    253 
    254 /*
    255  * fillkpt_blank - Fill in a kernel page table with blank entries
    256  *	ebx = page table address
    257  *	ecx = number of pages to map
    258  */
    259 #define fillkpt_blank	\
    260 	cmpl	$0,%ecx			;	/* zero-sized? */	\
    261 	je 	2f			; \
    262 1:	movl	$0,(PDE_SIZE-4)(%ebx)	;	/* upper 32 bits: 0 */	\
    263 	movl	$0,(%ebx)		;	/* lower 32 bits: 0 */	\
    264 	addl	$PDE_SIZE,%ebx		;	/* next PTE/PDE */	\
    265 	loop	1b			; \
    266 2:					;
    267 
    268 /*
    269  * killkpt - Destroy a kernel page table (long mode)
    270  *	rbx = page table address
    271  *	rcx = number of pages to destroy
    272  */
    273 #define killkpt \
    274 1:	movq	$0,(%rbx)	; \
    275 	addq	$PDE_SIZE,%rbx	; \
    276 	loop	1b		;
    277 
    278 /* record boot start cycle count */
    279 #define getstarttsc \
    280 	rdtsc					; \
    281 	movl	%eax, RELOC(starttsc_lo)	; \
    282 	movl	%edx, RELOC(starttsc_hi)	;
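/*
 * starttsc_lo/starttsc_hi hold the two halves of the 64bit time stamp
 * counter sampled at entry (rdtsc returns it in %edx:%eax); the full
 * value is ((uint64_t)starttsc_hi << 32) | starttsc_lo.
 */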
    283 
    284 #ifdef XEN
    285 #define __ASSEMBLY__
    286 #include <xen/include/public/arch-x86/cpuid.h>
    287 #include <xen/include/public/elfnote.h>
    288 #include <xen/include/public/xen.h>
    289 
    290 #define ELFNOTE(name, type, desctype, descdata...) \
    291 .pushsection .note.name, "a", @note	;	\
    292   .align 4				;	\
    293   .long 2f - 1f		/* namesz */	;	\
    294   .long 4f - 3f		/* descsz */	;	\
    295   .long type				;	\
    296 1:.asciz #name				;	\
    297 2:.align 4				;	\
    298 3:desctype descdata			;	\
    299 4:.align 4				;	\
    300 .popsection
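/*
 * For reference, each ELFNOTE() above emits a standard ELF note into the
 * .note.Xen section:
 *
 *	namesz, descsz, type	(three 32bit words)
 *	name			("Xen\0", padded to a 4-byte boundary)
 *	desc			(the .asciz/.long/.quad payload, padded)
 */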
    301 
    302 /*
    303  * Xen guest identifier and loader selection
    304  */
    305 .section __xen_guest
    306 	ELFNOTE(Xen, XEN_ELFNOTE_GUEST_OS,       .asciz, "NetBSD")
    307 	ELFNOTE(Xen, XEN_ELFNOTE_GUEST_VERSION,  .asciz, "4.99")
    308 	ELFNOTE(Xen, XEN_ELFNOTE_XEN_VERSION,    .asciz, "xen-3.0")
    309 	ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE,      .quad,  KERNBASE)
    310 #ifdef XENPV
    311 	ELFNOTE(Xen, XEN_ELFNOTE_PADDR_OFFSET,   .quad,  KERNBASE)
    312 	ELFNOTE(Xen, XEN_ELFNOTE_ENTRY,          .quad,  start)
    313 #else
    314 	ELFNOTE(Xen, XEN_ELFNOTE_PADDR_OFFSET,   .quad,  0)
    315 	ELFNOTE(Xen, XEN_ELFNOTE_PHYS32_ENTRY,   .long,  RELOC(start_pvh))
    316 #endif /* XENPV */
    317 	ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, .quad,  hypercall_page)
    318 	ELFNOTE(Xen, XEN_ELFNOTE_HV_START_LOW,   .quad,  HYPERVISOR_VIRT_START)
    319 	ELFNOTE(Xen, XEN_ELFNOTE_FEATURES,       .asciz, "writable_descriptor_tables|auto_translated_physmap|supervisor_mode_kernel|hvm_callback_vector")
    320 	ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE,       .asciz, "yes")
    321 	ELFNOTE(Xen, XEN_ELFNOTE_L1_MFN_VALID,   .long,  PTE_P, PTE_P)
    322 	ELFNOTE(Xen, XEN_ELFNOTE_LOADER,         .asciz, "generic")
    323 	ELFNOTE(Xen, XEN_ELFNOTE_SUSPEND_CANCEL, .long,  0)
    324 #if NKSYMS > 0 || defined(DDB) || defined(MODULAR)
    325 	ELFNOTE(Xen, XEN_ELFNOTE_BSD_SYMTAB,     .asciz, "yes")
    326 #endif
    327 #endif  /* XEN */
    328 
    329 /*
    330  * Initialization
    331  */
    332 	.data
    333 
    334 	.globl	_C_LABEL(tablesize)
    335 	.globl	_C_LABEL(nox_flag)
    336 	.globl	_C_LABEL(cputype)
    337 	.globl	_C_LABEL(cpuid_level)
    338 	.globl	_C_LABEL(esym)
    339 	.globl	_C_LABEL(eblob)
    340 	.globl	_C_LABEL(atdevbase)
    341 	.globl	_C_LABEL(PDPpaddr)
    342 	.globl	_C_LABEL(boothowto)
    343 	.globl	_C_LABEL(bootinfo)
    344 	.globl	_C_LABEL(biosbasemem)
    345 	.globl	_C_LABEL(biosextmem)
    346 	.globl	_C_LABEL(lwp0uarea)
    347 	.globl	do_mov_es
    348 	.globl	do_mov_ds
    349 	.globl	do_mov_fs
    350 	.globl	do_mov_gs
    351 	.globl	do_iret
    352 
    353 	.type	_C_LABEL(tablesize), @object
    354 _C_LABEL(tablesize):	.long	TABLESIZE
    355 END(tablesize)
    356 	.type	_C_LABEL(nox_flag), @object
    357 LABEL(nox_flag)		.long	0	/* 32bit NOX flag, set if supported */
    358 END(nox_flag)
    359 	.type	_C_LABEL(cputype), @object
    360 LABEL(cputype)		.long	0	/* are we 80486, Pentium, or.. */
    361 END(cputype)
    362 	.type	_C_LABEL(cpuid_level), @object
    363 LABEL(cpuid_level)	.long	-1	/* max. level accepted by cpuid instr */
    364 END(cpuid_level)
    365 	.type	_C_LABEL(esym), @object
    366 LABEL(esym)		.quad	0	/* ptr to end of syms */
    367 END(esym)
    368 	.type	_C_LABEL(eblob), @object
    369 LABEL(eblob)		.quad	0	/* ptr to end of modules */
    370 END(eblob)
    371 	.type	_C_LABEL(atdevbase), @object
    372 LABEL(atdevbase)	.quad	0	/* location of start of iomem in virt */
    373 END(atdevbase)
    374 	.type	_C_LABEL(PDPpaddr), @object
    375 LABEL(PDPpaddr)		.quad	0	/* paddr of PTD, for libkvm */
    376 END(PDPpaddr)
    377 	.type	_C_LABEL(biosbasemem), @object
    378 #ifndef REALBASEMEM
    379 LABEL(biosbasemem)	.long	0	/* base memory reported by BIOS */
    380 #else
    381 LABEL(biosbasemem)	.long	REALBASEMEM
    382 #endif
    383 END(biosbasemem)
    384 	.type	_C_LABEL(biosextmem), @object
    385 #ifndef REALEXTMEM
    386 LABEL(biosextmem)	.long	0	/* extended memory reported by BIOS */
    387 #else
    388 LABEL(biosextmem)	.long	REALEXTMEM
    389 #endif
    390 END(biosextmem)
    391 	.type	_C_LABEL(lwp0uarea), @object
    392 LABEL(lwp0uarea)	.quad	0
    393 END(lwp0uarea)
    394 	.type	_C_LABEL(starttsc_lo), @object
    395 LABEL(starttsc_lo)		.long	0	/* low part of rdtsc */
    396 END(starttsc_lo)
    397 	.type	_C_LABEL(starttsc_hi), @object
    398 LABEL(starttsc_hi)		.long	0	/* high part of rdtsc */
    399 END(starttsc_hi)
    400 
    401 #ifndef XENPV
    402 	.globl	gdt64_lo
    403 	.globl	gdt64_hi
    404 
    405 #define GDT64_LIMIT gdt64_end-gdt64_start-1
    406 /* Temporary gdt64, with base address in low memory */
    407 	.type	_C_LABEL(gdt64_lo), @object
    408 LABEL(gdt64_lo)
    409 	.word	GDT64_LIMIT
    410 	.quad	_RELOC(gdt64_start)
    411 END(gdt64_lo)
    412 .align 64
    413 
    414 /* Temporary gdt64, with base address in high memory */
    415 	.type	_C_LABEL(gdt64_hi), @object
    416 LABEL(gdt64_hi)
    417 	.word	GDT64_LIMIT
    418 	.quad	gdt64_start
    419 END(gdt64_hi)
    420 .align 64
    421 #undef GDT64_LIMIT
    422 
    423 	.type	_C_LABEL(gdt64_start), @object
    424 _C_LABEL(gdt64_start):
    425 	.quad 0x0000000000000000	/* always empty */
    426 	.quad 0x00af9a000000ffff	/* kernel CS */
    427 	.quad 0x00cf92000000ffff	/* kernel DS */
    428 END(gdt64_start)
    429 gdt64_end:
    430 
    431 	.type	_C_LABEL(farjmp64), @object
    432 _C_LABEL(farjmp64):
    433 	.long	_RELOC(longmode)
    434 	.word	GSEL(GCODE_SEL, SEL_KPL)
    435 END(farjmp64)
    436 
    437 #ifdef XEN
    438 /* 32bit GDT */
    439 gdtdesc32:
    440 	.word	gdt32end - gdt32
    441 	.long	RELOC(gdt32)
    442 	.long	0
    443 gdt32:
    444 	.long	0			# null descriptor
    445 	.long	0
    446 	.long	0x0000ffff		# %cs
    447 	.long	0x00cf9a00
    448 	.long	0x0000ffff		# %ds, %es, %ss
    449 	.long	0x00cf9200
    450 gdt32end:
    451 #endif /* XEN */
    452 #endif	/* !XENPV */
    453 
    454 	/* Space for the temporary stack */
    455 	.size	tmpstk, tmpstk - .
    456 	.space	512
    457 tmpstk:
    458 
    459 /*
    460  * Some hackage to deal with 64bit symbols in 32 bit mode.
    461  * This may not be needed if things are cleaned up a little.
    462  */
    463 
    464 	.text
    465 	.globl	_C_LABEL(kernel_text)
    466 	.set	_C_LABEL(kernel_text),KERNTEXTOFF
    467 
    468 ENTRY(start)
    469 #ifndef XENPV
    470 	.code32
    471 #ifdef BOOT_DURATION
    472 	getstarttsc
    473 #endif
    474 
    475 #ifdef SELFRELOC
    476 	call	next
    477 next:	pop	%edi
    478 	sub     $(next - kernel_text), %edi
    479 
    480 	/* If not KERNBASE, reloc ourselves to KERNBASE */
    481 	cmpl	$(KERNTEXTOFF_LO - KERNBASE_LO), %edi
    482 	jne	selfreloc_start
    483 #endif /* SELFRELOC */
    484 
    485 	/* Warm boot */
    486 	movw	$0x1234,0x472
    487 
    488 	/*
    489 	 * Load parameters from the stack (32 bits):
    490 	 *     boothowto, [bootdev], bootinfo, esym, biosextmem, biosbasemem
    491 	 * We are not interested in 'bootdev'.
    492 	 */
    493 
    494 	/* Load 'boothowto' */
    495 	movl	4(%esp),%eax
    496 	movl	%eax,RELOC(boothowto)
    497 
    498 	/* Load 'bootinfo' */
    499 	movl	12(%esp),%eax
    500 	testl	%eax,%eax		/* bootinfo = NULL? */
    501 	jz	.Lbootinfo_finished
    502 
    503 	movl	(%eax),%ebx		/* bootinfo::bi_nentries */
    504 	movl	$RELOC(bootinfo),%ebp
    505 	movl	%ebp,%edx
    506 	addl	$BOOTINFO_MAXSIZE,%ebp
    507 	movl	%ebx,(%edx)
    508 	addl	$4,%edx
    509 
    510 .Lbootinfo_entryloop:
    511 	testl	%ebx,%ebx		/* no remaining entries? */
    512 	jz	.Lbootinfo_finished
    513 
    514 	addl	$4,%eax
    515 	movl	(%eax),%ecx		/* address of entry */
    516 	pushl	%edi
    517 	pushl	%esi
    518 	pushl	%eax
    519 
    520 	movl	(%ecx),%eax		/* btinfo_common::len (size of entry) */
    521 	movl	%edx,%edi
    522 	addl	%eax,%edx		/* update dest pointer */
    523 	cmpl	%ebp,%edx		/* beyond bootinfo+BOOTINFO_MAXSIZE? */
    524 	jg	.Lbootinfo_overflow
    525 
    526 	movl	%ecx,%esi
    527 	movl	%eax,%ecx
    528 
    529 	/*
    530 	 * If any modules were loaded, record where they end. 'eblob' is used
    531 	 * later to compute the initial bootstrap tables.
    532 	 */
    533 	cmpl	$BTINFO_MODULELIST,4(%esi) /* btinfo_common::type */
    534 	jne	.Lbootinfo_copy
    535 
    536 	/* Skip the modules if we won't have enough VA to map them */
    537 	movl	12(%esi),%eax		/* btinfo_modulelist::endpa */
    538 	addl	$PGOFSET,%eax		/* roundup to a page */
    539 	andl	$~PGOFSET,%eax
    540 	cmpl	$BOOTMAP_VA_SIZE,%eax
    541 	jg	.Lbootinfo_skip
    542 	movl	%eax,RELOC(eblob)
    543 	addl	$KERNBASE_LO,RELOC(eblob)
    544 	adcl	$KERNBASE_HI,RELOC(eblob)+4
    545 
    546 .Lbootinfo_copy:
    547 	rep
    548 	movsb				/* copy esi -> edi */
    549 	jmp	.Lbootinfo_next
    550 
    551 .Lbootinfo_skip:
    552 	subl	%ecx,%edx		/* revert dest pointer */
    553 
    554 .Lbootinfo_next:
    555 	popl	%eax
    556 	popl	%esi
    557 	popl	%edi
    558 	subl	$1,%ebx			/* decrement the # of entries */
    559 	jmp	.Lbootinfo_entryloop
    560 
    561 .Lbootinfo_overflow:
    562 	/*
    563 	 * Cleanup for overflow case. Pop the registers, and correct the number
    564 	 * of entries.
    565 	 */
    566 	popl	%eax
    567 	popl	%esi
    568 	popl	%edi
    569 	movl	$RELOC(bootinfo),%ebp
    570 	movl	%ebp,%edx
    571 	subl	%ebx,(%edx)		/* correct the number of entries */
    572 .Lbootinfo_finished:
    573 
    574 	/* Load 'esym' */
    575 	movl	16(%esp),%eax
    576 	testl	%eax,%eax		/* esym = NULL? */
    577 	jz	1f
    578 
    579 	addl	$KERNBASE_LO,%eax
    580 
    581 1:
    582 	movl	$RELOC(esym),%ebp
    583 	movl	%eax,(%ebp)
    584 	movl	$KERNBASE_HI,4(%ebp)
    585 
    586 	/* Load 'biosextmem' */
    587 	movl	$RELOC(biosextmem),%ebp
    588 	movl	(%ebp),%eax
    589 	testl	%eax,%eax		/* already set? */
    590 	jnz	.Lbiosextmem_finished
    591 
    592 	movl	20(%esp),%eax
    593 	movl	%eax,(%ebp)
    594 
    595 .Lbiosextmem_finished:
    596 	/* Load 'biosbasemem' */
    597 	movl	$RELOC(biosbasemem),%ebp
    598 	movl	(%ebp),%eax
    599 	testl	%eax,%eax		/* already set? */
    600 	jnz	.Lbiosbasemem_finished
    601 
    602 	movl	24(%esp),%eax
    603 	movl	%eax,(%ebp)
    604 
    605 .Lbiosbasemem_finished:
    606 	/*
    607 	 * Done with the parameters!
    608 	 */
    609 
    610 	/* First, reset the PSL. */
    611 	pushl	$PSL_MBO
    612 	popfl
    613 
    614 	xorl	%eax,%eax
    615 	cpuid
    616 	movl	%eax,RELOC(cpuid_level)
    617 
    618 	/*
    619 	 * Finished with old stack; load new %esp now instead of later so we
    620 	 * can trace this code without having to worry about the trace trap
    621 	 * clobbering the memory test or the zeroing of the bss+bootstrap page
    622 	 * tables.
    623 	 *
    624 	 * The boot program should check:
    625 	 *	text+data <= &stack_variable - more_space_for_stack
    626 	 *	text+data+bss+pad+space_for_page_tables <= end_of_memory
    627 	 *
    628 	 * XXX: the gdt is in the carcass of the boot program so clearing
    629 	 * the rest of memory is still not possible.
    630 	 */
    631 	movl	$RELOC(tmpstk),%esp
    632 
    633 	/*
    634 	 * Retrieve the NX/XD flag. We use the 32bit version of PTE_NX.
    635 	 */
    636 	movl	$0x80000001,%eax
    637 	cpuid
    638 	andl	$CPUID_NOX,%edx
    639 	jz	.Lno_NOX
    640 	movl	$PTE_NX32,RELOC(nox_flag)
    641 .Lno_NOX:
    642 
    643 /*
    644  * There are four levels of pages in amd64: PML4 -> PDP -> PD -> PT. They will
    645  * be referred to as: L4 -> L3 -> L2 -> L1.
    646  *
    647  * Virtual address space of the kernel:
    648  * +------+--------+------+-----+--------+---------------------+----------
    649  * | TEXT | RODATA | DATA | BSS | [SYMS] | [PRELOADED MODULES] | L4 ->
    650  * +------+--------+------+-----+--------+---------------------+----------
    651  *                             (1)      (2)                   (3)
    652  *
    653  * --------------+-----+-----+----+-------------+
    654  * -> PROC0 STK -> L3 -> L2 -> L1 | ISA I/O MEM |
    655  * --------------+-----+-----+----+-------------+
    656  *                               (4)
    657  *
    658  * PROC0 STK is obviously not linked as a page level. It just happens to be
    659  * caught between L4 and L3.
    660  *
    661  * (PROC0 STK + L4 + L3 + L2 + L1) is later referred to as BOOTSTRAP TABLES.
    662  *
    663  * ISA I/O MEM has no physical page allocated here, just virtual addresses.
    664  *
    665  * Important note: the kernel segments are properly 4k-aligned
    666  * (see kern.ldscript), so there's no need to enforce alignment.
    667  */
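/*
 * (Descriptive note.)  In terms of the variables recorded earlier:
 * __kernel_end is (1), esym (if symbols were loaded) is (2), eblob (if
 * modules were preloaded) is (3), and the BOOTSTRAP TABLES start at (3)
 * rounded up to a page.  The code below walks exactly these steps.
 */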
    668 
    669 	/* Find end of kernel image; brings us on (1). */
    670 	movl	$RELOC(__kernel_end),%edi
    671 
    672 #if (NKSYMS || defined(DDB) || defined(MODULAR)) && !defined(makeoptions_COPY_SYMTAB)
    673 	/* Save the symbols (if loaded); brings us on (2). */
    674 	movl	RELOC(esym),%eax
    675 	testl	%eax,%eax
    676 	jz	1f
    677 	subl	$KERNBASE_LO,%eax	/* XXX */
    678 	movl	%eax,%edi
    679 1:
    680 #endif
    681 	/* Skip over any modules/blobs; brings us on (3). */
    682 	movl	RELOC(eblob),%eax
    683 	testl	%eax,%eax
    684 	jz	1f
    685 	subl	$KERNBASE_LO,%eax	/* XXX */
    686 	movl	%eax,%edi
    687 1:
    688 
    689 	/* We are on (3). Align up for BOOTSTRAP TABLES. */
    690 	movl	%edi,%esi
    691 	addl	$PGOFSET,%esi
    692 	andl	$~PGOFSET,%esi
    693 
    694 	/* We are on the BOOTSTRAP TABLES. Save L4's physical address. */
    695 	movl	$RELOC(PDPpaddr),%ebp
    696 	movl	%esi,(%ebp)
    697 	movl	$0,4(%ebp)
    698 
    699 	/* Now, zero out the BOOTSTRAP TABLES (before filling them in). */
    700 	movl	%esi,%edi
    701 	xorl	%eax,%eax
    702 	cld
    703 	movl	$TABLESIZE,%ecx
    704 	shrl	$2,%ecx
    705 	rep
    706 	stosl				/* copy eax -> edi */
    707 
    708 /*
    709  * Build the page tables and levels. We go from L1 to L4, and link the levels
    710  * together. Note: RELOC computes &addr - KERNBASE in 32 bits; the value can't
    711  * be > 4G, or we can't deal with it anyway, since we are in 32bit mode.
    712  */
    713 	/*
    714 	 * Build L1.
    715 	 */
    716 	leal	(PROC0_PTP1_OFF)(%esi),%ebx
    717 
    718 	/* Skip the area below the kernel text. */
    719 	movl	$(KERNTEXTOFF_LO - KERNBASE_LO),%ecx
    720 	shrl	$PGSHIFT,%ecx
    721 	fillkpt_blank
    722 
    723 	/* Map the kernel text RX. */
    724 	movl	$(KERNTEXTOFF_LO - KERNBASE_LO),%eax	/* start of TEXT */
    725 	movl	$RELOC(__rodata_start),%ecx
    726 	subl	%eax,%ecx
    727 	shrl	$PGSHIFT,%ecx
    728 	orl	$(PTE_P),%eax
    729 	fillkpt
    730 
    731 	/* Map the kernel rodata R. */
    732 	movl	$RELOC(__rodata_start),%eax
    733 	movl	$RELOC(__data_start),%ecx
    734 	subl	%eax,%ecx
    735 	shrl	$PGSHIFT,%ecx
    736 	orl	$(PTE_P),%eax
    737 	fillkpt_nox
    738 
    739 	/* Map the kernel data+bss RW. */
    740 	movl	$RELOC(__data_start),%eax
    741 	movl	$RELOC(__kernel_end),%ecx
    742 	subl	%eax,%ecx
    743 	shrl	$PGSHIFT,%ecx
    744 	orl	$(PTE_P|PTE_W),%eax
    745 	fillkpt_nox
    746 
    747 	/* Map [SYMS]+[PRELOADED MODULES] RW. */
    748 	movl	$RELOC(__kernel_end),%eax
    749 	movl	%esi,%ecx		/* start of BOOTSTRAP TABLES */
    750 	subl	%eax,%ecx
    751 	shrl	$PGSHIFT,%ecx
    752 	orl	$(PTE_P|PTE_W),%eax
    753 	fillkpt_nox
    754 
    755 	/* Map the BOOTSTRAP TABLES RW. */
    756 	movl	%esi,%eax		/* start of BOOTSTRAP TABLES */
    757 	movl	$TABLESIZE,%ecx		/* length of BOOTSTRAP TABLES */
    758 	shrl	$PGSHIFT,%ecx
    759 	orl	$(PTE_P|PTE_W),%eax
    760 	fillkpt_nox
    761 
    762 	/* We are on (4). Map ISA I/O MEM RW. */
    763 	movl	$IOM_BEGIN,%eax
    764 	movl	$IOM_SIZE,%ecx	/* size of ISA I/O MEM */
    765 	shrl	$PGSHIFT,%ecx
    766 	orl	$(PTE_P|PTE_W/*|PTE_PCD*/),%eax
    767 	fillkpt_nox
    768 
    769 	/*
    770 	 * Build L2. Linked to L1.
    771 	 */
    772 	leal	(PROC0_PTP2_OFF)(%esi),%ebx
    773 	leal	(PROC0_PTP1_OFF)(%esi),%eax
    774 	orl	$(PTE_P|PTE_W),%eax
    775 	movl	$(NKL2_KIMG_ENTRIES+1),%ecx
    776 	fillkpt
    777 
    778 #if L2_SLOT_KERNBASE > 0
    779 	/* If needed, set up level 2 entries for actual kernel mapping */
    780 	leal	(PROC0_PTP2_OFF + L2_SLOT_KERNBASE * PDE_SIZE)(%esi),%ebx
    781 	leal	(PROC0_PTP1_OFF)(%esi),%eax
    782 	orl	$(PTE_P|PTE_W),%eax
    783 	movl	$(NKL2_KIMG_ENTRIES+1),%ecx
    784 	fillkpt
    785 #endif
    786 
    787 	/*
    788 	 * Build L3. Linked to L2.
    789 	 */
    790 	leal	(PROC0_PTP3_OFF)(%esi),%ebx
    791 	leal	(PROC0_PTP2_OFF)(%esi),%eax
    792 	orl	$(PTE_P|PTE_W),%eax
    793 	movl	$NKL3_KIMG_ENTRIES,%ecx
    794 	fillkpt
    795 
    796 #if L3_SLOT_KERNBASE > 0
    797 	/* If needed, set up level 3 entries for actual kernel mapping */
    798 	leal	(PROC0_PTP3_OFF + L3_SLOT_KERNBASE * PDE_SIZE)(%esi),%ebx
    799 	leal	(PROC0_PTP2_OFF)(%esi),%eax
    800 	orl	$(PTE_P|PTE_W),%eax
    801 	movl	$NKL3_KIMG_ENTRIES,%ecx
    802 	fillkpt
    803 #endif
    804 
    805 	/*
    806 	 * Build L4 for identity mapping. Linked to L3.
    807 	 */
    808 	leal	(PROC0_PML4_OFF)(%esi),%ebx
    809 	leal	(PROC0_PTP3_OFF)(%esi),%eax
    810 	orl	$(PTE_P|PTE_W),%eax
    811 	movl	$NKL4_KIMG_ENTRIES,%ecx
    812 	fillkpt
    813 
    814 	/* Set up L4 entries for actual kernel mapping */
    815 	leal	(PROC0_PML4_OFF + L4_SLOT_KERNBASE * PDE_SIZE)(%esi),%ebx
    816 	leal	(PROC0_PTP3_OFF)(%esi),%eax
    817 	orl	$(PTE_P|PTE_W),%eax
    818 	movl	$NKL4_KIMG_ENTRIES,%ecx
    819 	fillkpt
    820 
    821 	/*
    822 	 * Startup checklist:
    823 	 * 1. Enable PAE (and SSE while here).
    824 	 */
    825 	movl	%cr4,%eax
    826 	orl	$(CR4_PAE|CR4_OSFXSR|CR4_OSXMMEXCPT),%eax
    827 	movl	%eax,%cr4
    828 
    829 	/*
    830 	 * 2. Set Long Mode Enable in EFER. Also enable the syscall extensions,
    831 	 *    and NOX if available.
    832 	 */
    833 	movl	$MSR_EFER,%ecx
    834 	rdmsr
    835 	xorl	%eax,%eax	/* XXX */
    836 	orl	$(EFER_LME|EFER_SCE),%eax
    837 	movl	RELOC(nox_flag),%ebx
    838 	cmpl	$0,%ebx
    839 	je 	.Lskip_NOX
    840 	orl	$(EFER_NXE),%eax
    841 .Lskip_NOX:
    842 	wrmsr
    843 
    844 	/*
    845 	 * 3. Load %cr3 with pointer to PML4.
    846 	 */
    847 	movl	%esi,%eax
    848 	movl	%eax,%cr3
    849 
    850 	/*
    851 	 * 4. Enable paging and the rest of it.
    852 	 */
    853 	movl	%cr0,%eax
    854 	orl	$(CR0_PE|CR0_PG|CR0_NE|CR0_TS|CR0_MP|CR0_WP|CR0_AM),%eax
    855 	movl	%eax,%cr0
    856 	jmp	compat
    857 compat:
    858 
    859 	movl	$RELOC(tmpstk),%esp
    860 	/*
    861 	 * 5. Not quite done yet, we're now in a compatibility segment, in
    862 	 *    legacy mode. We must jump to a long mode segment. Need to set up
    863 	 *    a temporary GDT with a long mode segment in it to do that.
    864 	 */
    865 	movl	$RELOC(gdt64_lo),%eax
    866 	lgdt	(%eax)
    867 	movl	$RELOC(farjmp64),%eax
    868 	ljmp	*(%eax)
    869 
    870 	.code64
    871 longmode:
    872 	/*
    873 	 * 6. Finally, we're in long mode. However, we're still in the identity
    874 	 *    mapped area (could not jump out of that earlier because it would
    875 	 *    have been a > 32bit jump). We can do that now, so here we go.
    876 	 */
    877 	movabsq	$longmode_hi,%rax
    878 	jmp	*%rax
    879 
    880 longmode_hi:
    881 
    882 	/*
    883 	 * We left the identity mapped area. Base address of
    884 	 * the temporary gdt64 should now be in high memory.
    885 	 */
    886 	movq	$RELOC(gdt64_hi),%rax
    887 	lgdt	(%rax)
    888 
    889 	/*
    890 	 * We have arrived. There's no need anymore for the identity mapping in
    891 	 * low memory, remove it.
    892 	 */
    893 	movq	$KERNBASE,%r8
    894 
    895 #if L2_SLOT_KERNBASE > 0
    896 	movq	$(NKL2_KIMG_ENTRIES+1),%rcx
    897 	leaq	(PROC0_PTP2_OFF)(%rsi),%rbx	/* old, phys address */
    898 	addq	%r8,%rbx			/* new, virt address */
    899 	killkpt
    900 #endif
    901 
    902 #if L3_SLOT_KERNBASE > 0
    903 	movq	$NKL3_KIMG_ENTRIES,%rcx
    904 	leaq	(PROC0_PTP3_OFF)(%rsi),%rbx	/* old, phys address */
    905 	addq	%r8,%rbx			/* new, virt address */
    906 	killkpt
    907 #endif
    908 
    909 	movq	$NKL4_KIMG_ENTRIES,%rcx
    910 	leaq	(PROC0_PML4_OFF)(%rsi),%rbx	/* old, phys address of PML4 */
    911 	addq	%r8,%rbx			/* new, virt address of PML4 */
    912 	killkpt
    913 
    914 	/* Relocate atdevbase. */
    915 	movq	$(TABLESIZE+KERNBASE),%rdx
    916 	addq	%rsi,%rdx
    917 	movq	%rdx,_C_LABEL(atdevbase)(%rip)
    918 
    919 	/* Set up bootstrap stack. */
    920 	leaq	(PROC0_STK_OFF)(%rsi),%rax
    921 	addq	%r8,%rax
    922 	movq	%rax,_C_LABEL(lwp0uarea)(%rip)
    923 	leaq	(USPACE-FRAMESIZE)(%rax),%rsp
    924 	xorq	%rbp,%rbp			/* mark end of frames */
    925 
    926 	xorw	%ax,%ax
    927 	movw	%ax,%gs
    928 	movw	%ax,%fs
    929 
    930 	/* The first physical page available. */
    931 	leaq	(TABLESIZE)(%rsi),%rdi
    932 
    933 #else	/* XENPV */
    934 	/* First, reset the PSL. */
    935 	pushq	$2
    936 	popfq
    937 
    938 	cld
    939 
    940 	/*
    941 	 * Xen info:
    942 	 * - %rsi -> start_info struct
    943 	 * - %rsp -> stack, *theoretically* the last page used by the Xen bootstrap
    944 	 */
    945 	movq	%rsi,%rbx
    946 
    947 	/* Clear BSS. */
    948 	xorq	%rax,%rax
    949 	movq	$_C_LABEL(__bss_start),%rdi
    950 	movq	$_C_LABEL(_end),%rcx
    951 	subq	%rdi,%rcx
    952 	rep
    953 	stosb
    954 
    955 	/* Copy start_info to a safe place. */
    956 	movq	%rbx,%rsi
    957 	movq	$_C_LABEL(start_info_union),%rdi
    958 	movq	$(PAGE_SIZE / 8),%rcx
    959 	rep
    960 	movsq
    961 
    962 	/*
    963 	 * Memory layout at start of the day:
    964 	 * - Kernel image
    965 	 * - Page frames list
    966 	 * - start_info struct. we copied it, so it can be recycled.
    967 	 * - xenstore
    968 	 * - console
    969 	 * - Xen bootstrap page tables
    970 	 * - kernel stack. provided by Xen
    971 	 * - guaranteed 512kB padding
    972 	 *
    973 	 * As we want to rebuild our page tables and place our stack
    974 	 * in proc0 struct, all data starting from after console can be
    975 	 * discarded after we've done a little setup.
    976 	 */
    977 
    978 	/*
    979 	 * We want our own page tables, and will rebuild them. We will reclaim
    980 	 * the Xen space later, INCLUDING the stack. So we need to switch to a
    981 	 * temporary one now.
    982 	 */
    983 	movq	$tmpstk,%rax
    984 	subq	$8,%rax
    985 	movq	%rax,%rsp
    986 
    987 	xorl	%eax,%eax
    988 	cpuid
    989 	movl	%eax,_C_LABEL(cpuid_level)
    990 
    991 	movl	$VM_GUEST_XENPV, _C_LABEL(vm_guest)
    992 
    993 	/*
    994 	 * Initialize cpu_info_primary.ci_self := &cpu_info_primary,
    995 	 * and initialize some MSRs with
    996 	 * cpu_init_msrs(&cpu_info_primary, full=true).  This sets up
    997 	 * SYSCALL/SYSRET (XXX why?) and %fs/%gs, which is needed for
    998 	 * the %gs-relative addressing used by CPUVAR(...), curcpu(),
    999 	 * and curlwp.
   1000 	 *
   1001 	 * XXX Is it necessary to set cpu_info_primary.ci_self here?
   1002 	 * Isn't it statically initialized in x86/cpu.c?
   1003 	 *
   1004 	 * XXX Why do we immediately clear the segment registers just
   1005 	 * afterward?
   1006 	 */
   1007 	movq	$cpu_info_primary,%rdi
   1008 	movq	%rdi,CPU_INFO_SELF(%rdi) /* ci->ci_self = ci */
   1009 	movq	$1,%rsi
   1010 	call	cpu_init_msrs	/* cpu_init_msrs(ci, true); */
   1011 
   1012 	call	xen_locore
   1013 
   1014 	/*
   1015 	 * The first VA available is returned by xen_locore in %rax. We
   1016 	 * use it as the UAREA, and set up the stack here.
   1017 	 */
   1018 	movq	%rax,%rsi
   1019 	movq	%rsi,_C_LABEL(lwp0uarea)(%rip)
   1020 	leaq	(USPACE-FRAMESIZE)(%rsi),%rsp
   1021 	xorq	%rbp,%rbp
   1022 
   1023 	/* Clear segment registers. */
   1024 	xorw	%ax,%ax
   1025 	movw	%ax,%gs
   1026 	movw	%ax,%fs
   1027 
   1028 	/* Set first_avail after the DUMMY PAGE (see xen_locore). */
   1029 	movq	%rsi,%rdi
   1030 	addq	$(USPACE+PAGE_SIZE),%rdi
   1031 	subq	$KERNBASE,%rdi	/* init_x86_64 wants a physical address */
   1032 #endif	/* XENPV */
   1033 
   1034 	pushq	%rdi
   1035 	call	_C_LABEL(init_bootspace)
   1036 #ifdef KASAN
   1037 	movq	_C_LABEL(lwp0uarea)(%rip),%rdi
   1038 	call	_C_LABEL(kasan_early_init)
   1039 #endif
   1040 	/* <-- DO NOT INSERT C CALLS BEFORE THIS POINT --> */
   1041 #if defined(XEN) && !defined(XENPV)
   1042 	call	_C_LABEL(init_xen_early)
   1043 #endif
   1044 	call	_C_LABEL(init_slotspace)
   1045 	popq	%rdi
   1046 	call	_C_LABEL(init_x86_64)
   1047 	call 	_C_LABEL(main)
   1048 END(start)
   1049 
   1050 #if defined(XEN)
   1051 # if !defined(XENPV)
   1052 /* entry point for Xen PVH */
   1053 	.code32
   1054 ENTRY(start_pvh)
   1055 #ifdef BOOT_DURATION
   1056 	getstarttsc
   1057 #endif
   1058 	/* Xen doesn't start us with a valid gdt */
   1059 	movl    $RELOC(gdtdesc32), %eax
   1060 	lgdt    (%eax)
   1061 	jmp     $GSEL(GCODE_SEL, SEL_KPL), $RELOC(.Lreload_cs)
   1062 
   1063 .Lreload_cs:
   1064 	movw    $GSEL(GDATA_SEL, SEL_KPL), %ax
   1065 	movw    %ax, %ds
   1066 	movw    %ax, %es
   1067 	movw    %ax, %ss
   1068 
   1069 	/* we need a valid stack */
   1070 	movl	$RELOC(tmpstk),%esp
   1071 
   1072 	/* clear BSS */
   1073         xorl    %eax,%eax
   1074 	movl    $RELOC(__bss_start),%edi
   1075 	movl    $RELOC(_end),%ecx
   1076 	subl    %edi,%ecx
   1077 	rep
   1078 	stosb
   1079 
   1080 	/*
   1081 	 * Here, we have 2 cases:
   1082 	 *
   1083 	 *  1) We have been started by Xen
   1084 	 *  2) We have been started by another VMM (Qemu, Firecracker, ...)
   1085 	 *
   1086 	 * The main difference is that, when we are started by Xen,
   1087 	 * %ebx (addr of the hvm_start_info structure) is pointing to a
   1088 	 * location that will be mapped correctly later.
   1089 	 *
   1090 	 * In the second case, we have to copy this structure (and all
   1091 	 * the information contained in it) to a location that will be
   1092 	 * mapped later: __kernel_end
   1093 	 *
   1094 	 * To distinguish between the 2 cases, we'll use the 'cpuid' instruction
   1095 	 */
   1096 	push %ebx
   1097 	xorl %eax, %eax
   1098 	cpuid
   1099 	cmpl $0x1, %eax		/* Check if we can call CPUID with eax=1 */
   1100 	jb .start_genpvh
   1101 	xorl %eax, %eax
   1102 	inc %eax
   1103 	cpuid
   1104 	shr $31, %ecx
   1105 	testb $1, %cl		/* Check if bit 31 of ECX (hypervisor) is set */
   1106 	jz .start_genpvh
   1107 	xorl %eax, %eax
   1108 	inc %eax
   1109 	shl $30, %eax
   1110 	cpuid			/* Calling cpuid with eax=0x40000000 */
   1111 	cmp $XEN_CPUID_SIGNATURE_EBX, %ebx	/* "VneX" */
   1112 	je .start_xen32
   1113 
   1114 	/* We have been started by a VMM that is *not* Xen */
   1115 
   1116 .start_genpvh:
   1117 
   1118 	/* announce ourself */
   1119 	movl $VM_GUEST_GENPVH, RELOC(vm_guest)
   1120 
   1121 	pop %ebx
   1122 	movl $RELOC(__kernel_end), %eax
   1123 	movl %eax, %ecx
   1124 	addl $KERNBASE_LO,%ecx
   1125 	movl $RELOC(esym),%ebp
   1126 	movl %ecx,(%ebp)
   1127 	movl $KERNBASE_HI,4(%ebp)
   1128 
   1129 	jmp .copy_hvm_info
   1130 
   1131 .start_xen32:
   1132 	movl $VM_GUEST_XENPVH, RELOC(vm_guest)
   1133 	/*
   1134 	 * Read the size of the symbol table, sanity-check and compute the end
   1135 	 * We have:
   1136 	 * |   kernel   |
   1137 	 * -------------- kernel_end
   1138 	 *     alignment
   1139 	 * -------------- bsd_symtab
   1140 	 * | size (int) |
   1141 	 * | elf_header |
   1142 	 *
   1143 	 */
   1144 	movl $RELOC(__kernel_end), %ebp
   1145 	addl $3, %ebp
   1146 	andl $~3, %ebp
   1147 	movl 0(%ebp), %eax /* read size */
   1148 	testl $~0x00ffffff, %eax /* more than 16MB ? */
   1149 	jnz .bad_esym
   1150 	addl %ebp, %eax /* compute esym */
   1151 	/* check if start_info is within symbol table */
   1152 	movl 0(%esp), %ebx
   1153 	cmp %ebp, %ebx
   1154 	jb .save_esym /* %ebx < __kernel_end */
   1155 	cmp %eax, %ebx
   1156 	jae .save_esym /* %ebx >= esym */
   1157 
   1158 .bad_esym:
   1159 	movl $RELOC(__kernel_end), %eax
   1160 .save_esym:
   1161 	movl %eax, %ebx
   1162 	addl $KERNBASE_LO,%ebx
   1163 	movl $RELOC(esym),%ebp
   1164 	movl %ebx,(%ebp)
   1165 	movl $KERNBASE_HI,4(%ebp)
   1166 	/* advance to next page boundary, this will be our hvm_start_info */
   1167 	addl $PGOFSET,%eax
   1168 	andl $~PGOFSET,%eax
   1169 	pop %ebx
   1170 
   1171 .copy_hvm_info:
   1172  	/*
   1173 	 * %ebx points to physical address provided by Xen
   1174 	 * %eax points to where we want it to be copied to
   1175 	 */
   1176 	/* check if %ebx and %eax are in the same page */
   1177 	movl %ebx, %esi
   1178 	addl $PGOFSET,%esi
   1179 	andl $~PGOFSET,%esi
   1180 	cmp %esi, %eax
   1181 	je .same_hvm_info
   1182 
   1183 	/* First, copy the hvm_start_info structure to %eax */
   1184 	movl %ebx, %esi
   1185 	movl %eax, %edi
   1186 	movl $HVM_START_INFO_SIZE, %ecx
   1187 	shrl $2, %ecx
   1188 	rep movsl
   1189 
   1190 	/* Copy cmdline_paddr after hvm_start_info */
   1191 	movl CMDLINE_PADDR(%ebx), %esi
   1192 	movl %edi, CMDLINE_PADDR(%eax)	/* Set new cmdline_paddr in hvm_start_info */
   1193 .cmdline_copy:
   1194 	movb (%esi), %cl
   1195 	movsb
   1196 	cmp $0, %cl
   1197 	jne .cmdline_copy
   1198 
   1199 	/* Copy memmap_paddr after cmdline (only if hvm_start_info->version != 0) */
   1200 	xorl %ecx, %ecx
   1201 	cmpl START_INFO_VERSION(%ebx), %ecx
   1202 	je .save_hvm_info
   1203 	pushl %eax
   1204 	movl MMAP_PADDR(%ebx), %esi
   1205 	movl %edi, MMAP_PADDR(%eax)	/* Set new memmap_paddr in hvm_start_info */
   1206 	movl MMAP_ENTRIES(%ebx), %eax	/* Get memmap_entries */
   1207 	movl $MMAP_ENTRY_SIZE, %ebx
   1208 	mull %ebx			/* eax * ebx => edx:eax */
   1209 	movl %eax, %ecx
   1210 	shrl $2, %ecx
   1211 	rep movsl
   1212 	popl %eax
   1213 
   1214 .save_hvm_info:
   1215 	/*
   1216 	 * %eax points to the start of hvm_start_info
   1217 	 * %edi points to the end
   1218 	 */
   1219 	addl 	$KERNBASE_LO,%eax
   1220 	movl	$RELOC(hvm_start_info),%ebp
   1221 	movl	%eax,(%ebp)
   1222 	movl	$KERNBASE_HI,4(%ebp)
   1223 
   1224 	/* round end to next page boundary */
   1225 	addl	$PGOFSET,%edi
   1226 	andl	$~PGOFSET,%edi
   1227 
   1228 	/* get a page for HYPERVISOR_shared_info */
   1229 	/* this is only needed if we are running on Xen */
   1230 	cmpl	$VM_GUEST_XENPVH, RELOC(vm_guest)
   1231 	jne	.save_eblob
   1232 	movl	$RELOC(HYPERVISOR_shared_info_pa),%ebp
   1233 	movl	%edi,(%ebp)
   1234 	movl	$0,4(%ebp)
   1235 	addl	$PAGE_SIZE, %edi
   1236 .save_eblob:
   1237 	addl	$KERNBASE_LO,%edi
   1238 	movl	$RELOC(eblob),%ebp
   1239 	movl	%edi,(%ebp)
   1240 	movl	$KERNBASE_HI,4(%ebp)
   1241 
   1242 	jmp .Lbiosbasemem_finished
   1243 
   1244 .same_hvm_info:
   1245 	/* just use the provided %ebx */
   1246 	/* XXX assume hvm_start_info + dependent structures fit in a single page */
   1247 	movl %ebx, %eax
   1248 	movl %ebx, %edi
   1249 	addl	$PAGE_SIZE, %edi
   1250 	jmp .save_hvm_info
   1251 END(start_pvh)
   1252 	.code64
   1253 # endif /* !XENPV */
   1254 /* space for the hypercall call page */
   1255 #define HYPERCALL_PAGE_OFFSET 0x1000
   1256 .align HYPERCALL_PAGE_OFFSET
   1257 ENTRY(hypercall_page) /* Returns -1, on HYPERVISOR_xen_version() */
   1258 .skip	(__HYPERVISOR_xen_version*32), 0x90
   1259 	movq	$-1, %rax
   1260 	retq
   1261 .align HYPERCALL_PAGE_OFFSET, 0x90
   1262 END(hypercall_page)
   1263 #endif /* XEN */
   1264 
   1265 /*
   1266  * int setjmp(label_t *)
   1267  *
   1268  * Used primarily by DDB.
   1269  */
   1270 ENTRY(setjmp)
   1271 	/*
   1272 	 * Only save registers that must be preserved across function
   1273 	 * calls according to the ABI (%rbx, %rsp, %rbp, %r12-%r15)
   1274 	 * and %rip.
   1275 	 */
   1276 	movq	%rdi,%rax
   1277 	movq	%rbx,(%rax)
   1278 	movq	%rsp,8(%rax)
   1279 	movq	%rbp,16(%rax)
   1280 	movq	%r12,24(%rax)
   1281 	movq	%r13,32(%rax)
   1282 	movq	%r14,40(%rax)
   1283 	movq	%r15,48(%rax)
   1284 	movq	(%rsp),%rdx
   1285 	movq	%rdx,56(%rax)
   1286 	xorl	%eax,%eax
   1287 	ret
   1288 END(setjmp)
   1289 
   1290 /*
   1291  * int longjmp(label_t *)
   1292  *
   1293  * Used primarily by DDB.
   1294  */
   1295 ENTRY(longjmp)
   1296 	movq	%rdi,%rax
   1297 	movq	(%rax),%rbx
   1298 	movq	8(%rax),%rsp
   1299 	movq	16(%rax),%rbp
   1300 	movq	24(%rax),%r12
   1301 	movq	32(%rax),%r13
   1302 	movq	40(%rax),%r14
   1303 	movq	48(%rax),%r15
   1304 	movq	56(%rax),%rdx
   1305 	movq	%rdx,(%rsp)
   1306 	movl	$1,%eax
   1307 	ret
   1308 END(longjmp)
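/*
 * Illustrative use from C (label_t holds the callee-saved registers and
 * %rip saved above); DDB's fault-recovery path is the typical consumer:
 *
 *	label_t jb;
 *
 *	if (setjmp(&jb) == 0) {
 *		// normal path; a later longjmp(&jb) resumes here by
 *		// making setjmp() appear to return 1
 *	} else {
 *		// reached via longjmp()
 *	}
 */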
   1309 
   1310 /*
   1311  * void dumpsys(void)
   1312  *
   1313  * Mimic cpu_switchto() for postmortem debugging.
   1314  */
   1315 ENTRY(dumpsys)
   1316 	/* Build a fake switch frame. */
   1317 	pushq	%rbx
   1318 	pushq	%r12
   1319 	pushq	%r13
   1320 	pushq	%r14
   1321 	pushq	%r15
   1322 
   1323 	/* Save a context. */
   1324 	movq	$dumppcb, %rax
   1325 	movq	%rsp, PCB_RSP(%rax)
   1326 	movq	%rbp, PCB_RBP(%rax)
   1327 
   1328 	call	_C_LABEL(dodumpsys)
   1329 
   1330 	addq	$(5*8), %rsp	/* sizeof(switchframe) - sizeof(%rip) */
   1331 	ret
   1332 END(dumpsys)
   1333 
   1334 /*
   1335  * struct lwp *cpu_switchto(struct lwp *oldlwp, struct lwp *newlwp,
   1336  *     bool returning)
   1337  *
   1338  *	1. save context of oldlwp.
   1339  *	2. restore context of newlwp.
   1340  *
   1341  * Note that the stack frame layout is known to "struct switchframe" in
   1342  * <machine/frame.h> and to the code in cpu_lwp_fork() which initializes
   1343  * it for a new lwp.
   1344  */
   1345 ENTRY(cpu_switchto)
   1346 	pushq	%rbx
   1347 	pushq	%r12
   1348 	pushq	%r13
   1349 	pushq	%r14
   1350 	pushq	%r15
   1351 
   1352 	movq	%rdi,%r13	/* oldlwp */
   1353 	movq	%rsi,%r12	/* newlwp */
   1354 
   1355 	/* Save old context. */
   1356 	movq	L_PCB(%r13),%rax
   1357 	movq	%rsp,PCB_RSP(%rax)
   1358 	movq	%rbp,PCB_RBP(%rax)
   1359 
   1360 	/* Switch to newlwp's stack. */
   1361 	movq	L_PCB(%r12),%r14
   1362 	movq	PCB_RSP(%r14),%rsp
   1363 	movq	PCB_RBP(%r14),%rbp
   1364 
   1365 	/*
   1366 	 * Issue XCHG, rather than MOV, to set ci_curlwp := newlwp in
   1367 	 * order to coordinate mutex_exit on this CPU with
   1368 	 * mutex_vector_enter on another CPU.
   1369 	 *
   1370 	 * 1. Any prior mutex_exit by oldlwp must be visible to other
   1371 	 *    CPUs before we set ci_curlwp := newlwp on this one,
   1372 	 *    requiring a store-before-store barrier.
   1373 	 *
   1374 	 *    (This is always guaranteed by the x86 memory model, TSO,
   1375 	 *    but other architectures require an explicit barrier before
   1376 	 *    the store to ci->ci_curlwp.)
   1377 	 *
   1378 	 * 2. ci_curlwp := newlwp must be visible on all other CPUs
   1379 	 *    before any subsequent mutex_exit by newlwp can even test
   1380 	 *    whether there might be waiters, requiring a
   1381 	 *    store-before-load barrier.
   1382 	 *
   1383 	 *    (This is the only ordering x86 TSO ever requires any kind
   1384 	 *    of barrier for -- in this case, we take advantage of the
   1385 	 *    sequential consistency implied by XCHG to obviate the
   1386 	 *    need for MFENCE or something.)
   1387 	 *
   1388 	 * See kern_mutex.c for details -- this is necessary for
   1389 	 * adaptive mutexes to detect whether the lwp is on the CPU in
   1390 	 * order to safely block without requiring atomic r/m/w in
   1391 	 * mutex_exit.
   1392 	 */
   1393 	movq	%r12,%rcx
   1394 	xchgq	%rcx,CPUVAR(CURLWP)
   1395 
   1396 #ifdef XENPV
   1397 	/* If we get here, we're obviously not in user context.
   1398 	 * Reset ci_xen_clockf_* in case the splx() at the end of mi_switch()
   1399 	 * triggers a deferred call to xen_timer_handler().
   1400 	 */
   1401 	movb    $0, CPUVAR(XEN_CLOCKF_USERMODE)
   1402 	movq    $_C_LABEL(cpu_switchto), CPUVAR(XEN_CLOCKF_PC)
   1403 #endif
   1404 
   1405 
   1406 	/* Skip the rest if returning to a pinned LWP. */
   1407 	testb	%dl,%dl		/* returning = true ? */
   1408 	jnz	.Lswitch_return
   1409 
   1410 #ifdef SVS
   1411 	movb	_C_LABEL(svs_enabled),%dl
   1412 	testb	%dl,%dl
   1413 	jz	.Lskip_svs
   1414 	callq	_C_LABEL(svs_lwp_switch)
   1415 .Lskip_svs:
   1416 #endif
   1417 
   1418 #ifndef XENPV
   1419 	movq	%r13,%rdi
   1420 	movq	%r12,%rsi
   1421 	callq	_C_LABEL(speculation_barrier)
   1422 #endif
   1423 
   1424 	/* Switch ring0 stack */
   1425 #ifdef SVS
   1426 	movb	_C_LABEL(svs_enabled),%al
   1427 	testb	%al,%al
   1428 	jz	.Lno_svs_switch
   1429 
   1430 	movq	CPUVAR(RSP0),%rax
   1431 	movq	CPUVAR(TSS),%rdi
   1432 	movq	%rax,TSS_RSP0(%rdi)
   1433 	jmp	.Lring0_switched
   1434 
   1435 .Lno_svs_switch:
   1436 #endif
   1437 
   1438 #if !defined(XENPV)
   1439 	movq	PCB_RSP0(%r14),%rax
   1440 	movq	CPUVAR(TSS),%rdi
   1441 	movq	%rax,TSS_RSP0(%rdi)
   1442 #else
   1443 	movq	%r14,%rdi
   1444 	callq	_C_LABEL(x86_64_switch_context)
   1445 #endif
   1446 .Lring0_switched:
   1447 
   1448 	/* Switch the dbregs. */
   1449 	movq	%r13,%rdi
   1450 	movq	%r12,%rsi
   1451 	callq	_C_LABEL(x86_dbregs_switch)
   1452 
   1453 	/* Switch the FPU. */
   1454 	movq	%r13,%rdi
   1455 	movq	%r12,%rsi
   1456 	callq	_C_LABEL(fpu_switch)
   1457 
   1458 	/* Don't bother with the rest if switching to a system process. */
   1459 	testl	$LW_SYSTEM,L_FLAG(%r12)
   1460 	jnz	.Lswitch_return
   1461 
   1462 	/* Is this process using RAS (restartable atomic sequences)? */
   1463 	movq	L_PROC(%r12),%rdi
   1464 	cmpq	$0,P_RASLIST(%rdi)
   1465 	je	.Lno_RAS
   1466 
   1467 	/* Handle restartable atomic sequences (RAS). */
   1468 	movq	L_MD_REGS(%r12),%rbx
   1469 	movq	TF_RIP(%rbx),%rsi
   1470 	call	_C_LABEL(ras_lookup)
   1471 	cmpq	$-1,%rax
   1472 	je	.Lno_RAS
   1473 	movq	%rax,TF_RIP(%rbx)
   1474 .Lno_RAS:
   1475 
   1476 #ifndef XENPV
   1477 	/* Raise the IPL to IPL_HIGH. Dropping the priority is deferred until
   1478 	 * mi_switch(), when cpu_switchto() returns. XXX Still needed? */
   1479 	movb	$IPL_HIGH,CPUVAR(ILEVEL)
   1480 
   1481 	/* The 32bit LWPs are handled differently. */
   1482 	testl	$PCB_COMPAT32,PCB_FLAGS(%r14)
   1483 	jnz	.Llwp_32bit
   1484 
   1485 .Llwp_64bit:
   1486 	/* Set default 64bit values in %ds, %es, %fs and %gs. */
   1487 	movq	$GSEL(GUDATA_SEL, SEL_UPL),%rax
   1488 	movw	%ax,%ds
   1489 	movw	%ax,%es
   1490 	xorq	%rax,%rax
   1491 	movw	%ax,%fs
   1492 	CLI(cx)
   1493 	SWAPGS
   1494 	movw	%ax,%gs
   1495 	SWAPGS
   1496 	STI(cx)
   1497 
   1498 	/* Zero out GDT descriptors. */
   1499 	movq	CPUVAR(GDT),%rcx
   1500 	movq	%rax,(GUFS_SEL*8)(%rcx)
   1501 	movq	%rax,(GUGS_SEL*8)(%rcx)
   1502 
   1503 	/* Reload 64-bit %fs/%gs MSRs. */
   1504 	movl	$MSR_FSBASE,%ecx
   1505 	movl	PCB_FS(%r14),%eax
   1506 	movl	4+PCB_FS(%r14),%edx
   1507 	wrmsr
   1508 	movl	$MSR_KERNELGSBASE,%ecx
   1509 	movl	PCB_GS(%r14),%eax
   1510 	movl	4+PCB_GS(%r14),%edx
   1511 	wrmsr
   1512 
   1513 	jmp	.Lswitch_return
   1514 
   1515 .Llwp_32bit:
   1516 	/* Reload %fs/%gs GDT descriptors. */
   1517 	movq	CPUVAR(GDT),%rcx
   1518 	movq	PCB_FS(%r14),%rax
   1519 	movq	%rax,(GUFS_SEL*8)(%rcx)
   1520 	movq	PCB_GS(%r14),%rax
   1521 	movq	%rax,(GUGS_SEL*8)(%rcx)
   1522 
   1523 	/* Set default 32bit values in %ds, %es, %fs and %gs. */
   1524 	movq	L_MD_REGS(%r12),%rbx
   1525 	movq	$GSEL(GUDATA32_SEL, SEL_UPL),%rax
   1526 	movw	%ax,%ds
   1527 	movw	%ax,%es
   1528 	movw	%ax,%fs
   1529 	CLI(ax)
   1530 	SWAPGS
   1531 	movw	%ax,%gs
   1532 	SWAPGS
   1533 	STI(ax)
   1534 #else
   1535 	movq	%r12,%rdi
   1536 	callq	_C_LABEL(x86_64_tls_switch)
   1537 #endif
   1538 
   1539 .Lswitch_return:
   1540 	/* Return to the new LWP, returning 'oldlwp' in %rax. */
   1541 	KMSAN_INIT_RET(8)
   1542 	movq	%r13,%rax
   1543 	popq	%r15
   1544 	popq	%r14
   1545 	popq	%r13
   1546 	popq	%r12
   1547 	popq	%rbx
   1548 	ret
   1549 END(cpu_switchto)
   1550 
   1551 /*
   1552  * void savectx(struct pcb *pcb);
   1553  *
   1554  * Update pcb, saving current processor state.
   1555  */
   1556 ENTRY(savectx)
   1557 	/* Save stack pointers. */
   1558 	movq	%rsp,PCB_RSP(%rdi)
   1559 	movq	%rbp,PCB_RBP(%rdi)
   1560 	ret
   1561 END(savectx)
   1562 
   1563 /*
   1564  * Syscall handler.
   1565  */
   1566 ENTRY(handle_syscall)
   1567 	STI(si)
   1568 
   1569 	movq	CPUVAR(CURLWP),%r14
   1570 	incq	CPUVAR(NSYSCALL)	/* count it atomically */
   1571 	movq	%rsp,L_MD_REGS(%r14)	/* save pointer to frame */
   1572 	movq	L_PROC(%r14),%r15
   1573 	andl	$~MDL_IRET,L_MD_FLAGS(%r14)   /* Allow sysret return */
   1574 	movq	%rsp,%rdi		/* Pass frame as arg0 */
   1575 	call	*P_MD_SYSCALL(%r15)
   1576 .Lsyscall_checkast:
   1577 	/*
   1578 	 * Disable interrupts to avoid new ASTs (etc) being added and
   1579 	 * to ensure we don't take an interrupt with some of the user
   1580 	 * registers loaded.
   1581 	 */
   1582 	CLI(si)
   1583 	/* Check for ASTs on exit to user mode. */
   1584 	movl	L_MD_ASTPENDING(%r14),%eax
   1585 	orl	CPUVAR(WANT_PMAPLOAD),%eax
   1586 	jnz	9f
   1587 
   1588 #ifdef DIAGNOSTIC
   1589 	cmpb	$IPL_NONE,CPUVAR(ILEVEL)
   1590 	jne	.Lspl_error
   1591 #endif
   1592 
   1593 	HANDLE_DEFERRED_FPU
   1594 
   1595 	/*
   1596 	 * Decide if we need to take a slow path. That's the case when we
   1597 	 * want to reload %cs and %ss on a 64bit LWP (MDL_IRET set), or when
   1598 	 * we're returning to a 32bit LWP (MDL_COMPAT32 set).
   1599 	 *
   1600 	 * In either case, we jump into intrfastexit and return to userland
   1601 	 * with the iret instruction.
   1602 	 */
   1603 	testl	$(MDL_IRET|MDL_COMPAT32),L_MD_FLAGS(%r14)
   1604 	jnz	intrfastexit
   1605 
   1606 	jmp	syscall_sysret
   1607 
   1608 #ifdef DIAGNOSTIC
   1609 .Lspl_error:
   1610 	movabsq	$4f,%rdi
   1611 	movzbl	CPUVAR(ILEVEL),%esi
   1612 	call	_C_LABEL(panic)
   1613 4:	.asciz	"spl not lowered on syscall, ilevel=%x"
   1614 #endif
   1615 
   1616 /* AST pending or pmap load needed */
   1617 9:
   1618 	cmpl	$0,CPUVAR(WANT_PMAPLOAD)
   1619 	jz	10f
   1620 	STI(si)
   1621 	call	_C_LABEL(do_pmap_load)
   1622 	jmp	.Lsyscall_checkast	/* re-check ASTs */
   1623 10:
   1624 	CLEAR_ASTPENDING(%r14)
   1625 	STI(si)
   1626 	/* Pushed T_ASTFLT into tf_trapno on entry. */
   1627 	movq	%rsp,%rdi
   1628 	KMSAN_INIT_ARG(8)
   1629 	call	_C_LABEL(trap)
   1630 	jmp	.Lsyscall_checkast	/* re-check ASTs */
   1631 END(handle_syscall)
   1632 
   1633 /*
   1634  * void lwp_trampoline(void);
   1635  *
   1636  * This is a trampoline function run by newly created LWPs
   1637  * in order to do additional setup in their context.
   1638  */
   1639 ENTRY(lwp_trampoline)
   1640 	movq	%rbp,%rsi
   1641 	movq	%rbp,%r14	/* for .Lsyscall_checkast */
   1642 	movq	%rax,%rdi
   1643 	xorq	%rbp,%rbp
   1644 	KMSAN_INIT_ARG(16)
   1645 	call	_C_LABEL(lwp_startup)
   1646 	movq	%r13,%rdi
   1647 	KMSAN_INIT_ARG(8)
   1648 	call	*%r12
   1649 	jmp	.Lsyscall_checkast
   1650 END(lwp_trampoline)
   1651 
   1652 /*
   1653  * Entry points of the 'syscall' instruction, 64bit and 32bit mode.
   1654  */
   1655 
   1656 #define SP(x)	(x)-(TF_SS+8)(%rax)
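/*
 * Note on SP(): on entry %rax is loaded with the top of the LWP's kernel
 * stack (its ring0 %rsp), and a hardware 'int nn' frame would end with
 * tf_ss right at that top.  SP(x) therefore addresses field 'x' of the
 * trapframe being built, relative to %rax: SP(TF_SS) is the slot just
 * below the stack top, SP(TF_RIP) lies further down, and the later
 * "leaq SP(0),%rsp" points %rsp at the base of the frame so that the
 * TF_*(%rsp) offsets used afterwards are valid.
 */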
   1657 
   1658 .macro	SYSCALL_ENTRY	name,is_svs
   1659 IDTVEC(\name)
   1660 #ifndef XENPV
   1661 	/*
   1662 	 * The user %rip is in %rcx and the user %rflags in %r11. The kernel %cs
   1663 	 * and %ss are loaded, but nothing else is.
   1664 	 *
   1665 	 * The 'swapgs' instruction gives us access to cpu-specific memory where
   1666 	 * we can save a user register and then read the LWP's kernel stack
   1667 	 * pointer.
   1668 	 *
	 * This code does not reload %ds; that should not matter, since %ds is
	 * ignored in 64bit mode.  OTOH the syscall instruction does set %ss,
	 * and that is ignored as well.
   1672 	 */
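	/*
	 * swapgs exchanges the current %gs base with the value in the
	 * IA32_KERNEL_GS_BASE MSR, making the kernel's %gs-relative
	 * per-CPU data (CPUVAR()) reachable from here on.
	 */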
   1673 	swapgs
   1674 
   1675 	/* Get the LWP's kernel stack pointer in %rax */
   1676 	.if	\is_svs
   1677 		movabs	%rax,SVS_UTLS+UTLS_SCRATCH
   1678 		movabs	SVS_UTLS+UTLS_RSP0,%rax
   1679 	.else
   1680 		movq	%rax,CPUVAR(SCRATCH)
   1681 		movq	CPUVAR(CURLWP),%rax
   1682 		movq	L_PCB(%rax),%rax
   1683 		movq	PCB_RSP0(%rax),%rax
   1684 	.endif
   1685 
   1686 	/* Make stack look like an 'int nn' frame */
   1687 	movq	$(LSEL(LUDATA_SEL, SEL_UPL)),SP(TF_SS)	/* user %ss */
   1688 	movq	%rsp,SP(TF_RSP)				/* user %rsp */
   1689 	movq	%r11,SP(TF_RFLAGS)			/* user %rflags */
   1690 	movq	$(LSEL(LUCODE_SEL, SEL_UPL)),SP(TF_CS)	/* user %cs */
   1691 	movq	%rcx,SP(TF_RIP)				/* user %rip */
   1692 	leaq	SP(0),%rsp		/* %rsp now valid after frame */
   1693 
   1694 	/* Restore %rax */
   1695 	.if	\is_svs
   1696 		movabs	SVS_UTLS+UTLS_SCRATCH,%rax
   1697 	.else
   1698 		movq	CPUVAR(SCRATCH),%rax
   1699 	.endif
   1700 
   1701 	movq	$2,TF_ERR(%rsp)		/* syscall instruction size */
   1702 	movq	$T_ASTFLT,TF_TRAPNO(%rsp)
   1703 #else
   1704 	/*
	 * Xen has already switched us to the kernel stack,
	 * but it has not disabled event delivery.
   1707 	 */
   1708 	pushq	%rsi
   1709 	CLI(si)
   1710 	popq	%rsi
   1711 	addq	$0x10,%rsp	/* gap to match cs:rip */
   1712 	pushq	$2		/* error code */
   1713 	pushq	$T_ASTFLT
   1714 	subq	$TF_REGSIZE,%rsp
   1715 	cld
   1716 #endif
   1717 	INTR_SAVE_GPRS
   1718 	IBRS_ENTER
   1719 	movw	$GSEL(GUDATA_SEL, SEL_UPL),TF_DS(%rsp)
   1720 	movw	$GSEL(GUDATA_SEL, SEL_UPL),TF_ES(%rsp)
   1721 	movw	$0,TF_FS(%rsp)
   1722 	movw	$0,TF_GS(%rsp)
   1723 	.if	\is_svs
   1724 		SVS_ENTER
   1725 	.endif
   1726 	KMSAN_ENTER
   1727 	jmp	handle_syscall
   1728 IDTVEC_END(\name)
   1729 .endm
   1730 
   1731 SYSCALL_ENTRY	syscall,is_svs=0
   1732 
   1733 	TEXT_USER_BEGIN
   1734 
   1735 #ifdef SVS
   1736 SYSCALL_ENTRY	syscall_svs,is_svs=1
   1737 #endif
   1738 
   1739 IDTVEC(syscall32)
   1740 	sysretl		/* go away please */
   1741 IDTVEC_END(syscall32)
   1742 
   1743 	TEXT_USER_END
   1744 
   1745 /*
   1746  * osyscall()
   1747  *
 * Trap gate entry for the int $0x80 syscall, also used by sigreturn.
   1749  */
   1750 	TEXT_USER_BEGIN
   1751 IDTVEC(osyscall)
   1752 #ifdef XENPV
   1753 	pushq	%rsi
   1754 	CLI(si)
   1755 	popq	%rsi
   1756 	movq (%rsp),%rcx
   1757 	movq 8(%rsp),%r11
   1758 	addq $0x10,%rsp
   1759 #endif
   1760 	pushq	$2		/* size of instruction for restart */
   1761 	pushq	$T_ASTFLT	/* trap # for doing ASTs */
   1762 	INTRENTRY
   1763 	jmp	handle_syscall
   1764 IDTVEC_END(osyscall)
   1765 	TEXT_USER_END
   1766 
   1767 /*
   1768  * Return to userland via 'sysret'.
   1769  */
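/*
 * sysretq reloads %rip from %rcx and %rflags from %r11, and derives %cs/%ss
 * from the STAR MSR rather than from the trapframe; that is why this fast
 * path is only taken when the frame's %cs/%ss need not be honoured
 * (neither MDL_IRET nor MDL_COMPAT32 set, see handle_syscall).
 */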
   1770 	TEXT_USER_BEGIN
   1771 	_ALIGN_TEXT
   1772 LABEL(syscall_sysret)
   1773 	KMSAN_LEAVE
   1774 	MDS_LEAVE
   1775 	SVS_LEAVE
   1776 	IBRS_LEAVE
   1777 	INTR_RESTORE_GPRS
   1778 	SWAPGS
   1779 #ifndef XENPV
   1780 	movq	TF_RIP(%rsp),%rcx	/* %rip for sysret */
   1781 	movq	TF_RFLAGS(%rsp),%r11	/* %flags for sysret */
   1782 	movq	TF_RSP(%rsp),%rsp
   1783 	sysretq
   1784 #else
   1785 	addq	$TF_RIP,%rsp
   1786 	pushq	$256	/* VGCF_IN_SYSCALL */
   1787 	jmp	HYPERVISOR_iret
   1788 #endif
   1789 END(syscall_sysret)
   1790 	TEXT_USER_END
   1791 
   1792 	TEXT_USER_BEGIN
   1793 
   1794 /*
 * In intrfastexit, %rsp is advanced past the saved registers before the
 * segment registers are reloaded, so the segment slots of the trapframe are
 * reached with TF_BACKW (negative, backwards offsets).  See the
 * documentation in amd64_trap.S for an explanation.
   1798  */
   1799 
   1800 #define TF_BACKW(val, reg)	(val - (TF_REGSIZE+16))(reg)
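/*
 * For example, once the "addq $(TF_REGSIZE+16),%rsp" below has skipped the
 * saved general registers plus trapno/err, the %cs slot that used to be at
 * TF_CS(%rsp) is reached as TF_BACKW(TF_CS, %rsp), i.e. at the negative
 * offset TF_CS - (TF_REGSIZE+16) from the adjusted %rsp.
 */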
   1801 
   1802 	_ALIGN_TEXT
   1803 	.type intrfastexit,@function
   1804 LABEL(intrfastexit)
   1805 	NOT_XEN(cli;)
   1806 	KMSAN_LEAVE
   1807 
   1808 	testb	$SEL_UPL,TF_CS(%rsp)
   1809 	jz	.Lkexit
   1810 
   1811 	MDS_LEAVE
   1812 	SVS_LEAVE
   1813 	IBRS_LEAVE
   1814 	INTR_RESTORE_GPRS
   1815 	addq	$(TF_REGSIZE+16),%rsp	/* iret frame */
   1816 	SWAPGS
   1817 
   1818 	cmpw	$LSEL(LUCODE_SEL, SEL_UPL),TF_BACKW(TF_CS, %rsp)
   1819 	je	do_iret
   1820 	cmpw	$GSEL(GUCODE_SEL, SEL_UPL),TF_BACKW(TF_CS, %rsp)
   1821 	je	do_iret
   1822 #ifdef XENPV
   1823 	cmpw	$FLAT_RING3_CS64,TF_BACKW(TF_CS, %rsp)
   1824 	je	do_iret
   1825 #endif
   1826 
   1827 do_mov_es:
   1828 	movw	TF_BACKW(TF_ES, %rsp),%es
   1829 do_mov_ds:
   1830 	movw	TF_BACKW(TF_DS, %rsp),%ds
   1831 do_mov_fs:
   1832 	movw	TF_BACKW(TF_FS, %rsp),%fs
   1833 #ifndef XENPV
   1834 do_mov_gs:
   1835 	movw	TF_BACKW(TF_GS, %rsp),%gs
   1836 #endif
   1837 
   1838 do_iret:
   1839 	iretq
   1840 
   1841 .Lkexit:
   1842 	INTR_RESTORE_GPRS
   1843 	addq	$(TF_REGSIZE+16),%rsp	/* iret frame */
   1844 	iretq
   1845 END(intrfastexit)
   1846 
   1847 	TEXT_USER_END
   1848 
   1849 	.section .rodata
   1850 
   1851 	/*
   1852 	 * Hotpatch templates.
   1853 	 */
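	/*
	 * Each LABEL(x)/LABEL(x_end) pair below brackets a replacement
	 * instruction sequence; the hotpatch code computes its length from
	 * the two labels and copies the bytes over the matching patch site
	 * at runtime (the x86 hotpatch machinery, see x86/patch.c).
	 */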
   1854 
   1855 LABEL(hp_nolock)
   1856 	nop
   1857 LABEL(hp_nolock_end)
   1858 
   1859 LABEL(hp_retfence)
   1860 	lfence
   1861 LABEL(hp_retfence_end)
   1862 
   1863 LABEL(hp_clac)
   1864 	clac
   1865 LABEL(hp_clac_end)
   1866 
   1867 LABEL(hp_stac)
   1868 	stac
   1869 LABEL(hp_stac_end)
   1870 
   1871 #ifdef SVS
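	/*
	 * SVS (Separate Virtual Space) templates.  On kernel entry the user
	 * page directory is replaced by the kernel one (a %cr3 load, with
	 * the kernel pdir PA fetched from the UTLS page) and, in svs_enter,
	 * %rsp is moved to the kernel stack; on exit the user page directory
	 * is restored and, in svs_leave, %rsp is switched back to the URSP0
	 * stack.  The NMI variants save and restore the live %cr3 through
	 * the NMI store area instead, since an NMI may arrive with either
	 * page table active.
	 */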
   1872 LABEL(svs_enter)
   1873 	movabs	SVS_UTLS+UTLS_KPDIRPA,%rax
   1874 	movq	%rax,%cr3
   1875 	movq	CPUVAR(KRSP0),%rsp
   1876 LABEL(svs_enter_end)
   1877 
   1878 LABEL(svs_enter_altstack)
   1879 	testb	$SEL_UPL,TF_CS(%rsp)
   1880 	jz	1234f
   1881 	movabs	SVS_UTLS+UTLS_KPDIRPA,%rax
   1882 	movq	%rax,%cr3
   1883 1234:
   1884 LABEL(svs_enter_altstack_end)
   1885 
   1886 LABEL(svs_enter_nmi)
   1887 	movq	%cr3,%rax
   1888 	movq	%rax,(FRAMESIZE+1*8)(%rsp)	/* nmistore->scratch */
   1889 	movq	(FRAMESIZE+0*8)(%rsp),%rax	/* nmistore->cr3 */
   1890 	movq	%rax,%cr3
   1891 LABEL(svs_enter_nmi_end)
   1892 
   1893 LABEL(svs_leave)
   1894 	movq	CPUVAR(URSP0),%rsp
   1895 	movq	CPUVAR(UPDIRPA),%rax
   1896 	movq	%rax,%cr3
   1897 LABEL(svs_leave_end)
   1898 
   1899 LABEL(svs_leave_altstack)
   1900 	testb	$SEL_UPL,TF_CS(%rsp)
   1901 	jz	1234f
   1902 	movq	CPUVAR(UPDIRPA),%rax
   1903 	movq	%rax,%cr3
   1904 1234:
   1905 LABEL(svs_leave_altstack_end)
   1906 
   1907 LABEL(svs_leave_nmi)
   1908 	movq	(FRAMESIZE+1*8)(%rsp),%rax	/* nmistore->scratch */
   1909 	movq	%rax,%cr3
   1910 LABEL(svs_leave_nmi_end)
   1911 #endif
   1912 
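	/*
	 * IBRS templates: ibrs_enter/ibrs_leave set and clear the IBRS bit
	 * in MSR_IA32_SPEC_CTRL with a read-modify-write; the noibrs_*
	 * templates are the alternatives patched in when IBRS is not used.
	 */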
   1913 	/* IBRS <- 1 */
   1914 LABEL(ibrs_enter)
   1915 	movl	$MSR_IA32_SPEC_CTRL,%ecx
   1916 	rdmsr
   1917 	orl	$IA32_SPEC_CTRL_IBRS,%eax
   1918 	wrmsr
   1919 LABEL(ibrs_enter_end)
   1920 
   1921 	/* IBRS <- 0 */
   1922 LABEL(ibrs_leave)
   1923 	movl	$MSR_IA32_SPEC_CTRL,%ecx
   1924 	rdmsr
   1925 	andl	$~IA32_SPEC_CTRL_IBRS,%eax
   1926 	wrmsr
   1927 LABEL(ibrs_leave_end)
   1928 
   1929 LABEL(noibrs_enter)
   1930 	NOIBRS_ENTER
   1931 LABEL(noibrs_enter_end)
   1932 
   1933 LABEL(noibrs_leave)
   1934 	NOIBRS_LEAVE
   1935 LABEL(noibrs_leave_end)
   1936 
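	/*
	 * MDS mitigation on return to userland: on CPUs with the updated
	 * microcode, verw also flushes the microarchitectural buffers.  A
	 * kernel data selector is pushed purely to give verw a valid
	 * operand, then popped again.
	 */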
   1937 LABEL(mds_leave)
   1938 	pushq	$GSEL(GDATA_SEL, SEL_KPL)
   1939 	verw	(%rsp)
   1940 	addq	$8,%rsp
   1941 LABEL(mds_leave_end)
   1942 
   1943 LABEL(nomds_leave)
   1944 	NOMDS_LEAVE
   1945 LABEL(nomds_leave_end)
   1946 
   1947 #ifdef SELFRELOC
   1948 /*
 * selfreloc_start(loadaddr in %edi)
   1950  * This is adapted from sys/arch/i386/i386/locore.S
   1951  */
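/*
 * Rough C sketch of what follows (memmove() here only names the semantics
 * of the hand-rolled copy loops; 'esym' is taken from 16(%esp) and marks
 * the end of the loaded image):
 *
 *	size = esym - _RELOC(kernel_text);
 *	memmove((void *)_RELOC(kernel_text), (void *)loadaddr, size);
 *	install a flat 32-bit GDT and reload %cs and the data segments;
 *	clear CR0_PG and CR4_PAE;
 *	jump to _RELOC(start);
 */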
   1952 	.code32
   1953 ENTRY(selfreloc_start)
   1954 	movl	%edi, %ebx		/* loadaddr saved in ebx */
   1955 	movl	%edi, %esi				/* src */
   1956 	movl	$_RELOC(kernel_text), %edi		/* dest */
   1957 	movl	16(%esp),%ecx				/* esym */
   1958 	subl	$_RELOC(kernel_text), %ecx		/* size */
   1959 
   1960 #if defined(NO_OVERLAP)
   1961         movl    %ecx, %eax
   1962 #else
   1963         movl    %edi, %eax
   1964         subl    %esi, %eax
   1965         cmpl    %ecx, %eax      /* overlapping? */
   1966         movl    %ecx, %eax
   1967         jb      .Lbackwards
   1968 #endif
   1969         /* nope, copy forwards. */
   1970         shrl    $2, %ecx        /* copy by words */
   1971         rep
   1972         movsl
   1973         and     $3, %eax        /* any bytes left? */
   1974         jnz     .Ltrailing
   1975         jmp     .Lcopy_done
   1976 
   1977 .Ltrailing:
   1978         cmp     $2, %eax
   1979         jb      11f
   1980         movw    (%esi), %ax
   1981         movw    %ax, (%edi)
   1982         je      .Lcopy_done
   1983         movb    2(%esi), %al
   1984         movb    %al, 2(%edi)
   1985         jmp     .Lcopy_done
   1986 11:     movb    (%esi), %al
   1987         movb    %al, (%edi)
   1988         jmp     .Lcopy_done
   1989 
   1990 #if !defined(NO_OVERLAP)
   1991 .Lbackwards:
   1992         addl    %ecx, %edi      /* copy backwards. */
   1993         addl    %ecx, %esi
   1994         and     $3, %eax        /* any fractional bytes? */
   1995         jnz     .Lback_align
   1996 .Lback_aligned:
   1997         shrl    $2, %ecx
   1998         subl    $4, %esi
   1999         subl    $4, %edi
   2000         std
   2001         rep
   2002         movsl
   2003         cld
   2004         jmp     .Lcopy_done
   2005 
   2006 .Lback_align:
   2007         sub     %eax, %esi
   2008         sub     %eax, %edi
   2009         cmp     $2, %eax
   2010         jb      11f
   2011         je      12f
   2012         movb    2(%esi), %al
   2013         movb    %al, 2(%edi)
   2014 12:     movw    (%esi), %ax
   2015         movw    %ax, (%edi)
   2016         jmp     .Lback_aligned
   2017 11:     movb    (%esi), %al
   2018         movb    %al, (%edi)
   2019         jmp     .Lback_aligned
   2020 #endif
        /* End of kernel copy */
   2022 .Lcopy_done:
   2023 	cld			/* LynxOS depends on it */
   2024 
	/* load the current selfreloc_start address into %edi */
   2026 	movl	%ebx, %edi	/* loadaddr was saved in ebx */
   2027 	addl	$(selfreloc_start - kernel_text), %edi
   2028 
   2029 	/* Prepare jump address */
   2030 	lea	(selfreloc_start32a - selfreloc_start)(%edi), %eax
   2031 	movl	%eax, (selfreloc_start32r - selfreloc_start)(%edi)
   2032 
   2033 	/* Setup GDT */
   2034 	lea	(gdt - selfreloc_start)(%edi), %eax
   2035 	mov	%eax, (gdtrr - selfreloc_start)(%edi)
   2036 	lgdt	(gdtr - selfreloc_start)(%edi)
   2037 
   2038 	/* Jump to set %cs */
   2039 	ljmp	*(selfreloc_start32r - selfreloc_start)(%edi)
   2040 
   2041 	.align	4
   2042 selfreloc_start32a:
   2043 	movl	$0x10, %eax	/* #define DATA_SEGMENT 0x10 */
   2044 	movw	%ax, %ds
   2045 	movw	%ax, %es
   2046 	movw	%ax, %fs
   2047 	movw	%ax, %gs
   2048 	movw	%ax, %ss
   2049 
   2050 	/* Disable Paging in CR0 */
   2051 	movl	%cr0, %eax
   2052 	andl	$(~CR0_PG), %eax
   2053 	movl	%eax, %cr0
   2054 
   2055 	/* Disable PAE in CR4 */
   2056 	movl	%cr4, %eax
   2057 	andl	$(~CR4_PAE), %eax
   2058 	movl	%eax, %cr4
   2059 
   2060 	jmp	selfreloc_start32b
   2061 
   2062 	.align	4
   2063 selfreloc_start32b:
   2064 	xor	%eax, %eax
   2065 	movl	$_RELOC(start), %esi
   2066 	jmp	*%esi
   2067 
   2068 	.align	16
   2069 selfreloc_start32r:
   2070 	.long	0
   2071 	.long	0x08	/* #define	CODE_SEGMENT	0x08 */
   2072 	.align	16
   2073 gdt:
   2074 	.long	0, 0
   2075 	.byte	0xff, 0xff, 0x00, 0x00, 0x00, 0x9f, 0xcf, 0x00
   2076 	.byte	0xff, 0xff, 0x00, 0x00, 0x00, 0x93, 0xcf, 0x00
   2077 gdtr:
   2078 	.word	gdtr - gdt
   2079 gdtrr:
	.quad	0
   2081 END(selfreloc_start)
   2082 #endif /* SELFRELOC */
   2083