/*	$NetBSD: mm.c,v 1.28 2021/05/04 21:09:16 khorben Exp $	*/

/*
 * Copyright (c) 2017-2020 The NetBSD Foundation, Inc. All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Maxime Villard.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include "prekern.h"

#define ELFROUND	64

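/*
 * Byte used to pad the unused head/tail of each mapped segment. Text
 * segments are padded with 0xCC (the x86 'int3' breakpoint opcode), so that
 * a stray jump into the padding traps instead of executing.
 */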
static const uint8_t pads[4] = {
	[BTSEG_NONE] = 0x00,
	[BTSEG_TEXT] = 0xCC,
	[BTSEG_RODATA] = 0x00,
	[BTSEG_DATA] = 0x00
};

#define MM_PROT_READ	0x00
#define MM_PROT_WRITE	0x01
#define MM_PROT_EXECUTE	0x02

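/*
 * Translation from MM_PROT_* values to x86 PTE bits. The MM_PROT_* values
 * are OR'ed together by the callers and used directly as an index: R=0,
 * R|W=1, R|X=2. W|X (RWX) is never requested, hence only three entries.
 */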
static const pt_entry_t protection_codes[3] = {
	[MM_PROT_READ] = PTE_NX,
	[MM_PROT_WRITE] = PTE_W | PTE_NX,
	[MM_PROT_EXECUTE] = 0,
	/* RWX does not exist */
};

struct bootspace bootspace;

extern paddr_t kernpa_start, kernpa_end;
vaddr_t iom_base;

paddr_t pa_avail = 0;
static const vaddr_t tmpva = (PREKERNBASE + NKL2_KIMG_ENTRIES * NBPD_L2);

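/*
 * Initialize the physical page allocator: pa_avail is the first free
 * physical address, and grows linearly as mm_palloc() consumes pages.
 */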
void
mm_init(paddr_t first_pa)
{
	pa_avail = first_pa;
}

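/*
 * Enter a mapping of 'pa' at 'va' in the recursive page tables. The L1
 * slot must be empty: overwriting an existing mapping here is a bug, and
 * is caught as fatal.
 */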
static void
mm_enter_pa(paddr_t pa, vaddr_t va, pte_prot_t prot)
{
	if (PTE_BASE[pl1_i(va)] & PTE_P) {
		fatal("mm_enter_pa: mapping already present");
	}
	PTE_BASE[pl1_i(va)] = pa | PTE_P | protection_codes[prot];
}

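/*
 * Same as mm_enter_pa(), but silently overwrite any existing mapping. The
 * caller is responsible for flushing the stale TLB entry.
 */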
static void
mm_reenter_pa(paddr_t pa, vaddr_t va, pte_prot_t prot)
{
	PTE_BASE[pl1_i(va)] = pa | PTE_P | protection_codes[prot];
}

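/* Invalidate the TLB entry for 'va'. */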
static void
mm_flush_va(vaddr_t va)
{
	asm volatile("invlpg (%0)" ::"r" (va) : "memory");
}

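/*
 * Allocate 'npages' contiguous physical pages by bumping pa_avail. Each
 * page is zeroed out by temporarily mapping it at tmpva.
 */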
static paddr_t
mm_palloc(size_t npages)
{
	paddr_t pa;
	size_t i;

	/* Allocate the physical pages */
	pa = pa_avail;
	pa_avail += npages * PAGE_SIZE;

	/* Zero them out */
	for (i = 0; i < npages; i++) {
		mm_reenter_pa(pa + i * PAGE_SIZE, tmpva,
		    MM_PROT_READ|MM_PROT_WRITE);
		mm_flush_va(tmpva);
		memset((void *)tmpva, 0, PAGE_SIZE);
	}

	return pa;
}

static bool
mm_pte_is_valid(pt_entry_t pte)
{
	return ((pte & PTE_P) != 0);
}

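/*
 * Change the protection of the already-mapped range [startva,
 * startva + size): re-enter each page with the new PTE bits and flush its
 * TLB entry.
 */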
static void
mm_mprotect(vaddr_t startva, size_t size, pte_prot_t prot)
{
	size_t i, npages;
	vaddr_t va;
	paddr_t pa;

	ASSERT(size % PAGE_SIZE == 0);
	npages = size / PAGE_SIZE;

	for (i = 0; i < npages; i++) {
		va = startva + i * PAGE_SIZE;
		pa = (PTE_BASE[pl1_i(va)] & PTE_FRAME);
		mm_reenter_pa(pa, va, prot);
		mm_flush_va(va);
	}
}

void
mm_bootspace_mprotect(void)
{
	pte_prot_t prot;
	size_t i;

	/* Remap the kernel segments with proper permissions. */
	for (i = 0; i < BTSPACE_NSEGS; i++) {
		if (bootspace.segs[i].type == BTSEG_TEXT) {
			prot = MM_PROT_READ|MM_PROT_EXECUTE;
		} else if (bootspace.segs[i].type == BTSEG_RODATA) {
			prot = MM_PROT_READ;
		} else {
			continue;
		}
		mm_mprotect(bootspace.segs[i].va, bootspace.segs[i].sz, prot);
	}

	print_state(STATE_NORMAL, "Segments protection updated");
}

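/*
 * Return the number of page-table entries of granularity 'pgsz' (e.g.
 * NBPD_L2 for L2 entries) needed to cover [startva, endva). For example,
 * a small range that crosses a 2MB boundary needs two L2 entries.
 */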
static size_t
mm_nentries_range(vaddr_t startva, vaddr_t endva, size_t pgsz)
{
	size_t npages;

	npages = roundup((endva / PAGE_SIZE), (pgsz / PAGE_SIZE)) -
	    rounddown((startva / PAGE_SIZE), (pgsz / PAGE_SIZE));
	return (npages / (pgsz / PAGE_SIZE));
}

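/*
 * Allocate and hook up any missing L4/L3/L2 page-table pages needed to
 * map the range [startva, endva). The L1 entries themselves get filled
 * later, via mm_enter_pa(). The whole range is expected to fit in the
 * topmost L4 slot (index 511), as the ASSERTs below enforce.
 */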
static void
mm_map_tree(vaddr_t startva, vaddr_t endva)
{
	size_t i, nL4e, nL3e, nL2e;
	size_t L4e_idx, L3e_idx, L2e_idx;
	paddr_t pa;

	/* Build L4. */
	L4e_idx = pl4_i(startva);
	nL4e = mm_nentries_range(startva, endva, NBPD_L4);
	ASSERT(L4e_idx == 511);
	ASSERT(nL4e == 1);
	if (!mm_pte_is_valid(L4_BASE[L4e_idx])) {
		pa = mm_palloc(1);
		L4_BASE[L4e_idx] = pa | PTE_P | PTE_W;
	}

	/* Build L3. */
	L3e_idx = pl3_i(startva);
	nL3e = mm_nentries_range(startva, endva, NBPD_L3);
	for (i = 0; i < nL3e; i++) {
		if (mm_pte_is_valid(L3_BASE[L3e_idx+i])) {
			continue;
		}
		pa = mm_palloc(1);
		L3_BASE[L3e_idx+i] = pa | PTE_P | PTE_W;
	}

	/* Build L2. */
	L2e_idx = pl2_i(startva);
	nL2e = mm_nentries_range(startva, endva, NBPD_L2);
	for (i = 0; i < nL2e; i++) {
		if (mm_pte_is_valid(L2_BASE[L2e_idx+i])) {
			continue;
		}
		pa = mm_palloc(1);
		L2_BASE[L2e_idx+i] = pa | PTE_P | PTE_W;
	}
}

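/*
 * Pick a random VA range of 'size' bytes in the KASLR window, aligned to
 * 'pagesz' and not overlapping any already-registered segment, and build
 * the page tree for it. The three collision checks respectively catch:
 * the start of the new range falling inside a segment, the end falling
 * inside a segment, and a segment fully contained in the new range.
 */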
static vaddr_t
mm_randva_kregion(size_t size, size_t pagesz)
{
	vaddr_t sva, eva;
	vaddr_t randva;
	uint64_t rnd;
	size_t i;
	bool ok;

	while (1) {
		prng_get_rand(&rnd, sizeof(rnd));
		randva = rounddown(KASLR_WINDOW_BASE +
		    rnd % (KASLR_WINDOW_SIZE - size), pagesz);

		/* Detect collisions */
		ok = true;
		for (i = 0; i < BTSPACE_NSEGS; i++) {
			if (bootspace.segs[i].type == BTSEG_NONE) {
				continue;
			}
			sva = bootspace.segs[i].va;
			eva = sva + bootspace.segs[i].sz;

			if ((sva <= randva) && (randva < eva)) {
				ok = false;
				break;
			}
			if ((sva < randva + size) && (randva + size <= eva)) {
				ok = false;
				break;
			}
			if (randva < sva && eva < (randva + size)) {
				ok = false;
				break;
			}
		}
		if (ok) {
			break;
		}
	}

	mm_map_tree(randva, randva + size);

	return randva;
}

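/*
 * Return the highest physical address covered by the kernel segments
 * registered so far; this is where the "boot" region begins.
 */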
static paddr_t
bootspace_get_kern_segs_end_pa(void)
{
	paddr_t pa, max = 0;
	size_t i;

	for (i = 0; i < BTSPACE_NSEGS; i++) {
		if (bootspace.segs[i].type == BTSEG_NONE) {
			continue;
		}
		pa = bootspace.segs[i].pa + bootspace.segs[i].sz;
		if (pa > max)
			max = pa;
	}

	return max;
}

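/* Register a segment in the first free slot of bootspace.segs[]. */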
static void
bootspace_addseg(int type, vaddr_t va, paddr_t pa, size_t sz)
{
	size_t i;

	for (i = 0; i < BTSPACE_NSEGS; i++) {
		if (bootspace.segs[i].type == BTSEG_NONE) {
			bootspace.segs[i].type = type;
			bootspace.segs[i].va = va;
			bootspace.segs[i].pa = pa;
			bootspace.segs[i].sz = sz;
			return;
		}
	}

	fatal("bootspace_addseg: segments full");
}

static size_t
mm_shift_segment(vaddr_t va, size_t pagesz, size_t elfsz, size_t elfalign)
{
	size_t shiftsize, offset;
	uint64_t rnd;

	/*
	 * If possible, shift the segment in memory using a random offset.
	 * Once shifted, the segment still fits within the same pagesz-rounded
	 * area. Make sure to respect the ELF alignment constraint.
	 */

	if (elfalign == 0) {
		elfalign = ELFROUND;
	}

	ASSERT(pagesz >= elfalign);
	ASSERT(pagesz % elfalign == 0);
	shiftsize = roundup(elfsz, pagesz) - roundup(elfsz, elfalign);
	if (shiftsize == 0) {
		return 0;
	}

	prng_get_rand(&rnd, sizeof(rnd));
	offset = roundup(rnd % shiftsize, elfalign);
	ASSERT((va + offset) % elfalign == 0);

	memmove((void *)(va + offset), (void *)va, elfsz);

	return offset;
}

static void
mm_map_head(void)
{
	size_t i, npages, size;
	uint64_t rnd;
	vaddr_t randva;

	/*
	 * The HEAD window is 1GB below the main KASLR window. This ensures
	 * that head always comes first in virtual memory, which is needed
	 * because we later compute (headva + sh_offset), and sh_offset is
	 * unsigned.
	 */

	/*
	 * To get the size of the head, we look at the read-only mapping of
	 * the kernel we created in locore. We're identity mapped, so
	 * kernpa = kernva.
	 */
	size = elf_get_head_size((vaddr_t)kernpa_start);
	npages = size / PAGE_SIZE;

	/*
	 * Choose a random range of VAs in the HEAD window, and create the page
	 * tree for it.
	 */
	prng_get_rand(&rnd, sizeof(rnd));
	randva = rounddown(HEAD_WINDOW_BASE + rnd % (HEAD_WINDOW_SIZE - size),
	    PAGE_SIZE);
	mm_map_tree(randva, randva + size);

	/* Enter the area and build the ELF info */
	for (i = 0; i < npages; i++) {
		mm_enter_pa(kernpa_start + i * PAGE_SIZE,
		    randva + i * PAGE_SIZE, MM_PROT_READ|MM_PROT_WRITE);
	}
	elf_build_head(randva);

	/* Register the values in bootspace */
	bootspace.head.va = randva;
	bootspace.head.pa = kernpa_start;
	bootspace.head.sz = size;
}

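/*
 * Map one kernel ELF segment of 'elfsz' bytes, starting at physical
 * address 'pa', at a random VA. Segments that fit in one page get a
 * 4KB-rounded window, larger ones a 2MB-rounded window. The segment is
 * then randomly shifted inside its window, the unused head/tail is filled
 * with the segment-specific pad byte, and the segment is registered in
 * bootspace. Returns the (shifted) VA of the segment.
 */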
vaddr_t
mm_map_segment(int segtype, paddr_t pa, size_t elfsz, size_t elfalign)
{
	size_t i, npages, size, pagesz, offset;
	vaddr_t randva;
	char pad;

	if (elfsz <= PAGE_SIZE) {
		pagesz = NBPD_L1;
	} else {
		pagesz = NBPD_L2;
	}

	/* Create the page tree */
	size = roundup(elfsz, pagesz);
	randva = mm_randva_kregion(size, pagesz);

	/* Enter the segment */
	npages = size / PAGE_SIZE;
	for (i = 0; i < npages; i++) {
		mm_enter_pa(pa + i * PAGE_SIZE,
		    randva + i * PAGE_SIZE, MM_PROT_READ|MM_PROT_WRITE);
	}

	/* Shift the segment in memory */
	offset = mm_shift_segment(randva, pagesz, elfsz, elfalign);
	ASSERT(offset + elfsz <= size);

	/* Fill the paddings */
	pad = pads[segtype];
	memset((void *)randva, pad, offset);
	memset((void *)(randva + offset + elfsz), pad, size - elfsz - offset);

	/* Register the bootspace information */
	bootspace_addseg(segtype, randva, pa, size);

	return (randva + offset);
}

static void
mm_map_boot(void)
{
	size_t i, npages, size;
	vaddr_t randva;
	paddr_t bootpa;

	/*
	 * The "boot" region is special: its page tree has a fixed size, but
	 * the number of pages entered is lower.
	 */

	/* Create the page tree, starting at a random VA */
	size = (NKL2_KIMG_ENTRIES + 1) * NBPD_L2;
	randva = mm_randva_kregion(size, PAGE_SIZE);

	/* The "boot" region begins right after the kernel segments */
	bootpa = bootspace_get_kern_segs_end_pa();

	/*
	 * The prekern consumed some EXTRA memory up until pa_avail; this
	 * covers the REL/RELA/SYM/STR sections and EXTRA.
	 */
	size = (pa_avail - bootpa);
	npages = size / PAGE_SIZE;

	/* Enter the whole area linearly */
	for (i = 0; i < npages; i++) {
		mm_enter_pa(bootpa + i * PAGE_SIZE,
		    randva + i * PAGE_SIZE, MM_PROT_READ|MM_PROT_WRITE);
	}

	/* Fix up the ELF sections located in the "boot" region */
	elf_fixup_boot(randva, bootpa);

	/* Map the ISA I/O MEM right after EXTRA, in pure VA */
	iom_base = randva + npages * PAGE_SIZE;
	npages = IOM_SIZE / PAGE_SIZE;
	for (i = 0; i < npages; i++) {
		mm_enter_pa(IOM_BEGIN + i * PAGE_SIZE,
		    iom_base + i * PAGE_SIZE, MM_PROT_READ|MM_PROT_WRITE);
	}

	/* Register the values in bootspace */
	bootspace.boot.va = randva;
	bootspace.boot.pa = bootpa;
	bootspace.boot.sz = (size_t)(iom_base + IOM_SIZE) -
	    (size_t)bootspace.boot.va;

	/* Initialize the values that are located in the "boot" region */
	extern uint64_t PDPpaddr;
	bootspace.spareva = bootspace.boot.va + NKL2_KIMG_ENTRIES * NBPD_L2;
	bootspace.pdir = bootspace.boot.va + (PDPpaddr - bootspace.boot.pa);
	bootspace.smodule = (vaddr_t)iom_base + IOM_SIZE;
	bootspace.emodule = bootspace.boot.va + NKL2_KIMG_ENTRIES * NBPD_L2;
}

/*
 * The bootloader has set up the following layout of physical memory:
 * +------------+--------------+------------+------------------------+-------+
 * | ELF HEADER | SECT HEADERS | KERN SECTS | REL/RELA/SYM/STR SECTS | EXTRA |
 * +------------+--------------+------------+------------------------+-------+
 * This was done in the loadfile_elf32.c:loadfile_dynamic() function.
 *
 * We abstract this layout into several "regions":
 * +---------------------------+------------+--------------------------------+
 * |         Head region       | Kern segs  |          Boot region           |
 * +---------------------------+------------+--------------------------------+
 *
 * We create a variable number of independent regions: one head, several
 * kernel segments, one boot. They are all mapped at random VAs.
 *
 * "Head" contains the ELF Header and ELF Section Headers, and we use them to
 * map the rest of the regions. Head must be placed *before* the other
 * regions, in both virtual memory and physical memory.
 *
 * The "Kernel Segments" contain the kernel SHT_NOBITS and SHT_PROGBITS
 * sections, in a 1:1 manner (one segment is associated with one section).
 * The segments are mapped at random VAs and referenced in bootspace.segs[].
 *
 * "Boot" contains miscellaneous information:
 *  - The ELF Rel/Rela/Sym/Str sections of the kernel
 *  - Some extra memory the prekern has consumed so far
 *  - The ISA I/O MEM, in pure VA
 *  - Later, the module_map, in pure VA (the kernel uses the available VA
 *    at the end of "boot")
 * Boot is placed *after* the other regions in physical memory. In virtual
 * memory however there is no constraint, so its VA is randomly selected in
 * the main KASLR window.
 *
 * At the end of this function, the bootspace structure is fully constructed.
 */
void
mm_map_kernel(void)
{
	memset(&bootspace, 0, sizeof(bootspace));
	mm_map_head();
	print_state(STATE_NORMAL, "Head region mapped");
	elf_map_sections();
	print_state(STATE_NORMAL, "Segments mapped");
	mm_map_boot();
	print_state(STATE_NORMAL, "Boot region mapped");
}