      1 /*	$NetBSD: pmap_68k.c,v 1.49 2025/12/17 07:05:50 thorpej Exp $	*/
      2 
      3 /*-
      4  * Copyright (c) 2025 The NetBSD Foundation, Inc.
      5  * All rights reserved.
      6  *
      7  * This code is derived from software contributed to The NetBSD Foundation
      8  * by Jason R. Thorpe.
      9  *
     10  * Redistribution and use in source and binary forms, with or without
     11  * modification, are permitted provided that the following conditions
     12  * are met:
     13  * 1. Redistributions of source code must retain the above copyright
     14  *    notice, this list of conditions and the following disclaimer.
     15  * 2. Redistributions in binary form must reproduce the above copyright
     16  *    notice, this list of conditions and the following disclaimer in the
     17  *    documentation and/or other materials provided with the distribution.
     18  *
     19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
     20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
     21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
     23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
     24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
     25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
     26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
     27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
     28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
     29  * POSSIBILITY OF SUCH DAMAGE.
     30  */
     31 
     32 /*
     33  * Pmap module for the Motorola 68851 / 68030 / 68040 / 68060 MMUs.
     34  * (...and HP 68851-like MMU.)
     35  *
     36  * This implementation supports both 2-level and 3-level page table
     37  * layouts.  The 3-level is mandated by 68040 / 68060, and the 2-level
     38  * is mandated by the HP MMU.  The 68851 and 68030 can do either, and
     39  * for now, the 2-level arrangement is retained for those MMUs, although
     40  * eventually we will switch them to the 3-level configuration.
     41  *
     42  * To support both configurations, page tables are abstracted away from
      43  * the page table pages that contain them.  The pmap interface operations
     44  * operate on "leaf" (page) tables, and only when one of those tables needs
     45  * to be allocated or freed, do the differences between the two configurations
     46  * need to be dealt with.  All of the tables are kept in a red-black tree
     47  * that's indexed by their "segment" number (where "segment" is defined as
     48  * "the amount of space mapped by a single leaf table").  This avoids having
     49  * to burn large amounts of kernel address space to access tables which are
     50  * expected to be sparsely-populated.
     51  *
     52  * In order to reduce the number of tree lookups, the most recently used
     53  * leaf table is cached, and the interface contract is such that bulk
     54  * operations are allowed to access subsequent PTEs within a given table
     55  * (segment) without having to perform another PTE lookup.
     56  *
     57  * This illustrates the initial table layout for a simple program
     58  * (/usr/bin/yes) using the standard m68k address space layout (based
     59  * on the historical 4.3BSD-on-hp300 layout, which was itself based on
     60  * HP-UX in order to facilitate HP-UX binary compatibility back when
     61  * that was considered to be important).  This example uses a 4K page
     62  * size.
     63  *
     64  * TEXTADDR is $0000.2000 (not always strictly true, but close enough)
     65  * USRSTACK is $FFF0.0000 (grows down, first used page VA is $FFEF.F000)
     66  *
     67  * (TEXTADDR is $0000.2000 because the linker uses 8K page size for
     68  * broader compatibility and keeps the 0-page unmapped so that NULL
     69  * pointer dereferences blow up.)
     70  *
     71  * This is to say: the text / data / heap of this program are in the
     72  * bottom 1MB of the address space, and the stack is in the second-from-
     73  * the-top 1MB of the address space.
     74  *
     75  * In the 2-level layout, the level-1 table is 4KB in size, and has 1024
     76  * entries.  Those 1024 entries together represent the 4GB user address
     77  * space, and each entry thus maps a 4MB "segment" by itself pointing to
      78  * a level-2 table, which is itself 4KB in size and has 1024 entries
     79  * (4MB / 1024 -> 4KB, which is the page size ... convenient!)  So, when
     80  * our very simple program is loaded, we have a table structure that looks
     81  * like this:
     82  *
     83  *                             (4KB)
     84  *                    +----------------------+
     85  *                    |       Level-1        |
     86  *                    |0                 1023|
     87  *                    +----------------------+
     88  *                     |                    |
     89  *                     |                    |
     90  *           +---------+                    +---------+
     91  *           |                                        |
     92  *           v                                        v
     93  *         (4KB)                                    (4KB)
     94  * +----------------------+                 +----------------------+
     95  * |       Level-2        |                 |       Level-2        |
     96  * | 2 4                  |                 |             767      |
     97  * +----------------------+                 +----------------------+
     98  *   | |                                                   |
     99  *   | +-+                                                 |
    100  *   v   v                                                 v
    101  * TEXT DATA/bss/heap                                    stack
    102  *
    103  * As you can see, this requires 3 tables (1 level-1 and 2 level-2).  Each
    104  * table consumes a full 4KB page, so mapping this address space requires
    105  * 3 total pages.
    106  *
    107  * In the 3-level layout, the level-1 and level-2 tables each contain 128
    108  * entries, making them 512 bytes in size.  When using 4KB pages, the level-3
    109  * tables contain 64 entries, making them 256 bytes in size.
    110  *
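 * As a worked example (using the 68040 4K-page VA split: 7-bit level-1
 * index, 7-bit level-2 index, 6-bit level-3 index), the first stack
 * page at VA $FFEF.F000 decomposes as:
 *
 *	level-1 index = VA bits 31-25 = 127
 *	level-2 index = VA bits 24-18 = 123
 *	level-3 index = VA bits 17-12 = 63
 *
 * which is where the arrows land in the diagram below.
 *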
    111  * So, assuming the same address space layout, the 3-level structure looks
    112  * like this:
    113  *
    114  *                              (512B)
    115  *                         +--------------+
    116  *                         |   Level-1    |
    117  *                         |0          127|
    118  *                         +--------------+
    119  *                          |           |
    120  *                      +---+           +---+
    121  *                      v                   v
    122  *                    (512B)              (512B)
    123  *               +--------------+    +--------------+
    124  *               |   Level-2    |    |   Level-2    |
    125  *               |0             |    |          123 |
    126  *               +--------------+    +--------------+
    127  *                |                              |
    128  *      +---------+                              +-----+
    129  *      v                                              v
    130  *    (256B)                                         (256B)
    131  * +------------+                                 +------------+
    132  * |  Level-3   |                                 |  Level-3   |
    133  * | 2 4        |                                 |          63|
    134  * +------------+                                 +------------+
    135  *   | |                                                      |
    136  *   | +-+                                                    |
    137  *   v   v                                                    v
    138  * TEXT DATA/bss/heap                                       stack
    139  *
    140  * The table allocator has two pools of memory for tables in the 3-level
    141  * configuration: one for "segment" tables (always 512 bytes) and one for
    142  * "page" or "leaf" tables (256 bytes in size for 4K pages).  Pages are
    143  * allocated to the pools one at a time, and then the tables are allocated
    144  * from the pages.  Because of this, we only need two pages, 33% less (!),
    145  * than the 2-level configuration to map the same address space.
    146  *
    147  * There is a cost, however: each access that misses the Address Translation
    148  * Cache costs one extra memory cycle in the 3-level configuration.
    149  *
    150  * LOCKING IN THIS PMAP MODULE:
    151  *
    152  * MULTIPROCESSING IS NOT SUPPORTED IN THIS PMAP MODULE.  Adding support
    153  * for it would not be terribly difficult, but there is little value in
    154  * doing that work until such time as a multiprocessor m68k machine exists
    155  * that NetBSD runs on.
    156  *
    157  * As such, there is **no** locking performed of any data structures here.
    158  * We do actually reap a benefit from this perceived laziness: we do not
    159  * have to worry about lock ordering, which means we can take some shortcuts
    160  * in some places (especially around pv_entry manipulation).
    161  *
    162  * THERE IS A CAVEAT, HOWEVER!  Because there are no guard rails, we cannot,
    163  * under any circumstances, yield the CPU during the critical section of a
    164  * pmap operation, as doing so could cause the world to change beneath our
    165  * feet, possibly rendering our work, for lack of a better term, "crashy".
    166  * Specifically, this means:
    167  *
     168  *	- Adaptive mutexes must not be acquired (e.g. when calling into
     169  *	  other code, such as UVM, to get a VA or a page).
    170  *	- Waiting for memory is not allowed.
    171  *	- The current thread may not be preempted.
    172  *
    173  * If any of those things are required, they must be performed outside of
    174  * a critical section.  If we discover that this is required while inside
    175  * a critical section, then we must exit the critical section, perform the
    176  * blocking work, re-enter the critical section and re-evaluate everything.
    177  * Macros are provided to mark the boundaries of critical sections:
    178  *
    179  *	- PMAP_CRIT_ENTER()
    180  *	- PMAP_CRIT_EXIT()
    181  *
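 * For illustration, the shape such code takes (a hedged, hypothetical
 * pseudo-code sketch only; pmap_pte_alloc() below shows the real pattern):
 *
 *	PMAP_CRIT_ENTER();
 * retry:
 *	if (needed_resource_is_missing()) {
 *		PMAP_CRIT_EXIT();
 *		allocate_the_resource();	(may sleep)
 *		PMAP_CRIT_ENTER();
 *		goto retry;			(re-validate everything)
 *	}
 *	... the actual critical work ...
 *	PMAP_CRIT_EXIT();
 *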
     182  * XXX Alas, there doesn't seem to be a way for us to hook into
     183  * XXX ASSERT_SLEEPABLE() when inside a critical section.  We should explore
     184  * XXX that for a future enhancement.
    185  */
    186 
    187 /*
    188  * Current status:
    189  * - Very stable multi-user on virt68k (qemu 68040; does not accurately
    190  *   model cache or ATC, but suitable for exercising large memory configs).
    191  *
    192  * - Single-user mode on 68030 w/ no external cache (luna68k).
    193  *
    194  * - Single-user mode on 68040 (hp425t).
    195  *
    196  * - Ports that have been adapted: hp300, luna68k, mvme68k (not tested),
    197  *   news68k (see below), next68k (not tested), virt68k, x68k.
    198  *
    199  * XXX TODO XXX
    200  *
    201  * - Adapt amiga (hard), atari (hard), cesfic (easy), mac68k (moderate).
    202  * - Test on 68020.
    203  * - Test on 68060.
    204  * - More rigorous 68040 testing.
    205  * - More rigorous 68030 testing.
    206  * - Test on machines above listed as "not tested".
    207  * - More rigorous testing in various emulators (Nono, UAE?)
    208  * - Fix problems observed on news68k (external cache related?)
    209  * - Finish HP MMU support and test on real HP MMU.
    210  * - Convert '851 / '030 to 3-level.
    211  * - Optimize ATC / cache manipulation.
    212  * - Add some more instrumentation.
    213  * - Eventually disable instrumentation by default.
    214  * - ...
    215  * - PROFIT!
    216  */
    217 
    218 #include "opt_m68k_arch.h"
    219 
    220 #include <sys/cdefs.h>
    221 __KERNEL_RCSID(0, "$NetBSD: pmap_68k.c,v 1.49 2025/12/17 07:05:50 thorpej Exp $");
    222 
    223 #include <sys/param.h>
    224 #include <sys/systm.h>
    225 #include <sys/evcnt.h>
    226 #include <sys/proc.h>
    227 #include <sys/pool.h>
    228 #include <sys/cpu.h>
    229 #include <sys/atomic.h>
    230 #include <sys/kmem.h>
    231 
    232 #include <machine/pcb.h>
    233 
    234 #include <uvm/uvm.h>
    235 #include <uvm/uvm_physseg.h>
    236 
    237 #include <m68k/cacheops.h>
    238 
    239 #if !defined(M68K_MMU_MOTOROLA) && !defined(M68K_MMU_HP)
    240 #error Hit the road, Jack...
    241 #endif
    242 
    243 /****************************** SERIALIZATION ********************************/
    244 
    245 /*
    246  * XXX Would like to make these do something lightweight-ish in
    247  * XXX DIAGNOSTIC kernels (and also make ASSERT_SLEEPABLE() trip
    248  * XXX if we're in a critical section).
    249  */
    250 
    251 #define	PMAP_CRIT_ENTER()	__nothing
    252 #define	PMAP_CRIT_EXIT()	__nothing
    253 #define	PMAP_CRIT_ASSERT()	__nothing
    254 
    255 /**************************** MMU CONFIGURATION ******************************/
    256 
    257 #include "opt_m68k_arch.h"
    258 
    259 #if defined(M68K_MMU_68030)
    260 #include <m68k/mmu_30.h>	/* for cpu_kcore_hdr_t */
    261 #endif
    262 
    263 /*
    264  * We consider 3 different MMU classes:
    265  * - 68851 (includes 68030)
    266  * - 68040 (includes 68060)
    267  * - HP MMU for 68020 (68851-like, 2-level 4K only, external VAC)
    268  */
    269 
    270 #define	MMU_CLASS_68851		0
    271 #define	MMU_CLASS_68040		1
    272 #define	MMU_CLASS_HP		3
    273 
    274 static int	pmap_mmuclass __read_mostly;
    275 
    276 #if defined(M68K_MMU_68851) || defined(M68K_MMU_68030)
    277 #define	MMU_CONFIG_68851_CLASS	1
    278 #else
    279 #define	MMU_CONFIG_68851_CLASS	0
    280 #endif
    281 
    282 #if defined(M68K_MMU_68040) || defined(M68K_MMU_68060)
    283 #define	MMU_CONFIG_68040_CLASS	1
    284 #else
    285 #define	MMU_CONFIG_68040_CLASS	0
    286 #endif
    287 
    288 #if defined(M68K_MMU_HP)
    289 #define	MMU_CONFIG_HP_CLASS	1
    290 #else
    291 #define	MMU_CONFIG_HP_CLASS	0
    292 #endif
    293 
    294 #define	MMU_CONFIG_NCLASSES	(MMU_CONFIG_68851_CLASS + \
    295 				 MMU_CONFIG_68040_CLASS + \
    296 				 MMU_CONFIG_HP_CLASS)
    297 
    298 #if MMU_CONFIG_NCLASSES == 1
    299 
    300 #if MMU_CONFIG_68851_CLASS
    301 #define	MMU_IS_68851_CLASS	1
    302 #elif MMU_CONFIG_68040_CLASS
    303 #define	MMU_IS_68040_CLASS	1
    304 #elif MMU_CONFIG_HP_CLASS
    305 #define	MMU_IS_HP_CLASS		1
    306 #else
    307 #error Single MMU config predicate error.
    308 #endif
    309 
    310 #else /* MMU_CONFIG_NCLASSES != 1 */
    311 
    312 #if MMU_CONFIG_68851_CLASS
    313 #define	MMU_IS_68851_CLASS	(pmap_mmuclass == MMU_CLASS_68851)
    314 #endif
    315 
    316 #if MMU_CONFIG_68040_CLASS
    317 #define	MMU_IS_68040_CLASS	(pmap_mmuclass == MMU_CLASS_68040)
    318 #endif
    319 
    320 #if MMU_CONFIG_HP_CLASS
    321 #define	MMU_IS_HP_CLASS		(pmap_mmuclass == MMU_CLASS_HP)
    322 #endif
    323 
    324 #endif /* MMU_CONFIG_NCLASSES == 1 */
    325 
    326 #ifndef MMU_IS_68851_CLASS
    327 #define	MMU_IS_68851_CLASS	0
    328 #endif
    329 
    330 #ifndef MMU_IS_68040_CLASS
    331 #define	MMU_IS_68040_CLASS	0
    332 #endif
    333 
    334 #ifndef MMU_IS_HP_CLASS
    335 #define	MMU_IS_HP_CLASS		0
    336 #endif
    337 
    338 /*
    339  * 68040 must always use 3-level.  Eventually, we will switch the '851
     340  * type over to 3-level as well, but for now, it gets 2-level.  The
    341  * HP MMU is stuck there for all eternity.
    342  */
    343 #define	MMU_USE_3L		(MMU_IS_68040_CLASS)
    344 #define	MMU_USE_2L		(!MMU_USE_3L)
    345 
    346 /***************************** INSTRUMENTATION *******************************/
    347 
    348 #define	PMAP_EVENT_COUNTERS
    349 
    350 static struct evcnt pmap_nkptpages_initial_ev =
    351     EVCNT_INITIALIZER(EVCNT_TYPE_MISC, NULL, "pmap nkptpages", "initial");
    352 static struct evcnt pmap_nkptpages_current_ev =
    353     EVCNT_INITIALIZER(EVCNT_TYPE_MISC, NULL, "pmap nkptpages", "current");
    354 EVCNT_ATTACH_STATIC(pmap_nkptpages_initial_ev);
    355 EVCNT_ATTACH_STATIC(pmap_nkptpages_current_ev);
    356 
    357 static struct evcnt pmap_nkstpages_initial_ev =
    358     EVCNT_INITIALIZER(EVCNT_TYPE_MISC, NULL, "pmap nkstpages", "initial");
    359 static struct evcnt pmap_nkstpages_current_ev =
    360     EVCNT_INITIALIZER(EVCNT_TYPE_MISC, NULL, "pmap nkstpages", "current");
    361 EVCNT_ATTACH_STATIC(pmap_nkstpages_initial_ev);
    362 EVCNT_ATTACH_STATIC(pmap_nkstpages_current_ev);
    363 
    364 static struct evcnt pmap_maxkva_ev =
    365     EVCNT_INITIALIZER(EVCNT_TYPE_MISC, NULL, "pmap", "maxkva");
    366 EVCNT_ATTACH_STATIC(pmap_maxkva_ev);
    367 
    368 static struct evcnt pmap_kvalimit_ev =
    369     EVCNT_INITIALIZER(EVCNT_TYPE_MISC, NULL, "pmap", "kvalimit");
    370 EVCNT_ATTACH_STATIC(pmap_kvalimit_ev);
    371 
    372 #ifdef PMAP_EVENT_COUNTERS
    373 static struct evcnt pmap_pv_alloc_wait_ev =
    374     EVCNT_INITIALIZER(EVCNT_TYPE_MISC, NULL, "pmap pv_alloc", "wait");
    375 EVCNT_ATTACH_STATIC(pmap_pv_alloc_wait_ev);
    376 
    377 static struct evcnt pmap_pv_alloc_nowait_ev =
    378     EVCNT_INITIALIZER(EVCNT_TYPE_MISC, NULL, "pmap pv_alloc", "nowait");
    379 EVCNT_ATTACH_STATIC(pmap_pv_alloc_nowait_ev);
    380 
    381 static struct evcnt pmap_pv_enter_called_ev =
    382     EVCNT_INITIALIZER(EVCNT_TYPE_MISC, NULL, "pmap pv_enter", "called");
    383 EVCNT_ATTACH_STATIC(pmap_pv_enter_called_ev);
    384 
    385 static struct evcnt pmap_pv_enter_usr_ci_ev =
    386     EVCNT_INITIALIZER(EVCNT_TYPE_MISC, NULL, "pmap pv_enter", "usr_ci");
    387 EVCNT_ATTACH_STATIC(pmap_pv_enter_usr_ci_ev);
    388 
    389 #if MMU_CONFIG_HP_CLASS
    390 static struct evcnt pmap_pv_enter_vac_ci_ev =
    391     EVCNT_INITIALIZER(EVCNT_TYPE_MISC, NULL, "pmap pv_enter", "vac_ci");
    392 EVCNT_ATTACH_STATIC(pmap_pv_enter_vac_ci_ev);
    393 #endif
    394 
    395 static struct evcnt pmap_pv_enter_ci_multi_ev =
    396     EVCNT_INITIALIZER(EVCNT_TYPE_MISC, NULL, "pmap pv_enter", "ci_multi");
    397 EVCNT_ATTACH_STATIC(pmap_pv_enter_ci_multi_ev);
    398 
    399 static struct evcnt pmap_pv_remove_called_ev =
    400     EVCNT_INITIALIZER(EVCNT_TYPE_MISC, NULL, "pmap pv_remove", "called");
    401 EVCNT_ATTACH_STATIC(pmap_pv_remove_called_ev);
    402 
    403 static struct evcnt pmap_pv_remove_ci_ev =
    404     EVCNT_INITIALIZER(EVCNT_TYPE_MISC, NULL, "pmap pv_remove", "ci");
    405 EVCNT_ATTACH_STATIC(pmap_pv_remove_ci_ev);
    406 
    407 static struct evcnt pmap_pt_cache_hit_ev =
    408     EVCNT_INITIALIZER(EVCNT_TYPE_MISC, NULL, "pmap pt_cache", "hit");
    409 EVCNT_ATTACH_STATIC(pmap_pt_cache_hit_ev);
    410 
    411 static struct evcnt pmap_pt_cache_miss_ev =
    412     EVCNT_INITIALIZER(EVCNT_TYPE_MISC, NULL, "pmap pt_cache", "miss");
    413 EVCNT_ATTACH_STATIC(pmap_pt_cache_miss_ev);
    414 
    415 static struct evcnt pmap_enter_nowait_ev =
    416     EVCNT_INITIALIZER(EVCNT_TYPE_MISC, NULL, "pmap enter", "nowait");
    417 EVCNT_ATTACH_STATIC(pmap_enter_nowait_ev);
    418 
    419 static struct evcnt pmap_enter_yeswait_ev =
    420     EVCNT_INITIALIZER(EVCNT_TYPE_MISC, NULL, "pmap enter", "yeswait");
    421 EVCNT_ATTACH_STATIC(pmap_enter_yeswait_ev);
    422 
    423 static struct evcnt pmap_enter_pte_alloc_fail_ev =
    424     EVCNT_INITIALIZER(EVCNT_TYPE_MISC, NULL, "pmap enter", "pte alloc failed");
    425 EVCNT_ATTACH_STATIC(pmap_enter_pte_alloc_fail_ev);
    426 
    427 static struct evcnt pmap_enter_pv_alloc_fail_ev =
    428     EVCNT_INITIALIZER(EVCNT_TYPE_MISC, NULL, "pmap enter", "pv alloc failed");
    429 EVCNT_ATTACH_STATIC(pmap_enter_pv_alloc_fail_ev);
    430 
    431 static struct evcnt pmap_enter_valid_ev =
    432     EVCNT_INITIALIZER(EVCNT_TYPE_MISC, NULL, "pmap enter", "valid");
    433 EVCNT_ATTACH_STATIC(pmap_enter_valid_ev);
    434 
    435 static struct evcnt pmap_enter_wire_change_ev =
    436     EVCNT_INITIALIZER(EVCNT_TYPE_MISC, NULL, "pmap enter", "wire change");
    437 EVCNT_ATTACH_STATIC(pmap_enter_wire_change_ev);
    438 
    439 static struct evcnt pmap_enter_prot_change_ev =
    440     EVCNT_INITIALIZER(EVCNT_TYPE_MISC, NULL, "pmap enter", "prot change");
    441 EVCNT_ATTACH_STATIC(pmap_enter_prot_change_ev);
    442 
    443 static struct evcnt pmap_enter_pa_change_ev =
    444     EVCNT_INITIALIZER(EVCNT_TYPE_MISC, NULL, "pmap enter", "pa change");
    445 EVCNT_ATTACH_STATIC(pmap_enter_pa_change_ev);
    446 
    447 static struct evcnt pmap_enter_pv_recycle_ev =
    448     EVCNT_INITIALIZER(EVCNT_TYPE_MISC, NULL, "pmap enter", "pv recycle");
    449 EVCNT_ATTACH_STATIC(pmap_enter_pv_recycle_ev);
    450 
    451 #define	pmap_evcnt(e)		pmap_ ## e ## _ev.ev_count++
    452 #else
    453 #define	pmap_evcnt(e)		__nothing
    454 #endif
    455 
    456 static void (*pmap_load_urp_func)(paddr_t) __read_mostly;
    457 
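/*
 * pmap_mmuclass_init:
 *
 *	Examine the mmutype / cputype globals set up during early
 *	bootstrap and select the MMU class and the routine used to
 *	load the user root pointer into the MMU.
 */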
    458 static void
    459 pmap_mmuclass_init(void)
    460 {
    461 	switch (mmutype) {
    462 #if MMU_CONFIG_68040_CLASS
    463 	case MMU_68040:
    464 	case MMU_68060:
    465 		pmap_mmuclass = MMU_CLASS_68040;
    466 		/*
    467 		 * XXX This is messy because 68060 frequently gets
    468 		 * XXX initialize to MMU_68040.  Should be cleaned
    469 		 * XXX up once the Hibler pmap is obsoleted.
    470 		 */
    471 #if defined(M68040)
    472 		if (cputype == CPU_68040) {
    473 			pmap_load_urp_func = mmu_load_urp40;
    474 		}
    475 #endif
    476 #if defined(M68060)
    477 		if (cputype == CPU_68060) {
    478 			pmap_load_urp_func = mmu_load_urp60;
    479 		}
    480 #endif
    481 		break;
    482 #endif
    483 #if MMU_CONFIG_68851_CLASS
    484 	case MMU_68851:
    485 	case MMU_68030:
    486 		pmap_mmuclass = MMU_CLASS_68851;
    487 		protorp[0] = MMU51_CRP_BITS;
    488 		pmap_load_urp_func = mmu_load_urp51;
    489 		break;
    490 #endif
    491 #if MMU_CONFIG_HP_CLASS
    492 	case MMU_HP:
    493 		pmap_mmuclass = MMU_CLASS_HP;
    494 		pmap_load_urp_func = mmu_load_urp20hp;
    495 		break;
    496 #endif
    497 	default:
    498 		panic("%s: mmutype=%d not configured?", __func__, mmutype);
    499 	}
    500 
    501 	if (pmap_load_urp_func == NULL) {
    502 		panic("%s: No mmu_load_*() for cputype=%d mmutype=%d",
    503 		    __func__, cputype, mmutype);
    504 	}
    505 }
    506 
    507 /*
    508  * pmap_load_urp:
    509  *
    510  *	Load the user root table into the MMU.
    511  */
    512 static inline void
    513 pmap_load_urp(paddr_t urp)
    514 {
    515 	(*pmap_load_urp_func)(urp);
    516 }
    517 
    518 #if MMU_CONFIG_HP_CLASS
    519 static vaddr_t	pmap_aliasmask __read_mostly;
    520 #endif
    521 
    522 /*
    523  * pmap_init_vac:
    524  *
    525  *	Set up virtually-addressed cache information.  Only relevant
    526  *	for the HP MMU.
    527  */
    528 void
    529 pmap_init_vac(size_t vacsize)
    530 {
    531 #if MMU_CONFIG_HP_CLASS
    532 	KASSERT(pmap_aliasmask == 0);
    533 	KASSERT(powerof2(vacsize));
    534 	pmap_aliasmask = vacsize - 1;
    535 #endif
    536 }
    537 
    538 /***************************** PHYS <-> VM PAGE ******************************/
    539 
    540 static bool pmap_initialized_p;
    541 
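/*
 * pmap_pa_to_pg:
 *
 *	Return the vm_page for the specified physical address, or
 *	NULL if the pmap module is not yet fully initialized.
 */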
    542 static inline struct vm_page *
    543 pmap_pa_to_pg(paddr_t pa)
    544 {
    545 	return pmap_initialized_p ? PHYS_TO_VM_PAGE(pa) : NULL;
    546 }
    547 
    548 /*************************** RESOURCE MANAGEMENT *****************************/
    549 
    550 static struct pmap kernel_pmap_store;
    551 struct pmap * const kernel_pmap_ptr = &kernel_pmap_store;
    552 
    553 /*
    554  * Physical address of kernel level 1 table.  This name is compatible
    555  * with the Hibler pmap's name.
    556  */
    557 paddr_t		Sysseg_pa;
    558 
    559 /*
    560  * Avoid a memory load when doing comparisons against pmap_kernel()
    561  * within this compilation unit.
    562  */
    563 #undef pmap_kernel
    564 #define	pmap_kernel()	(&kernel_pmap_store)
    565 
    566 static inline bool
    567 active_pmap(pmap_t pmap)
    568 {
    569 	return pmap == pmap_kernel() ||
    570 	       pmap == curproc->p_vmspace->vm_map.pmap;
    571 }
    572 
    573 static inline bool
    574 active_user_pmap(pmap_t pmap)
    575 {
    576 	return curproc != NULL &&
    577 	       pmap != pmap_kernel() &&
    578 	       pmap == curproc->p_vmspace->vm_map.pmap;
    579 }
    580 
    581 /*
    582  * Number of tables per page table page:
    583  * 0 - number of leaf page tables per page
    584  * 1 - number of segment tables per page
    585  */
    586 static unsigned int pmap_ptpage_table_counts[2];
    587 
    588 __CTASSERT(LA40_L1_COUNT == LA40_L2_COUNT);
    589 
    590 static void
    591 pmap_ptpage_init(void)
    592 {
    593 	if (MMU_USE_3L) {
    594 		pmap_ptpage_table_counts[0] = PAGE_SIZE / TBL40_L3_SIZE;
    595 		pmap_ptpage_table_counts[1] = PAGE_SIZE / TBL40_L2_SIZE;
    596 	} else {
    597 		pmap_ptpage_table_counts[0] = 1;
    598 		pmap_ptpage_table_counts[1] = 1;
    599 	}
    600 }
    601 
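/*
 * pmap_page_alloc:
 *
 *	Allocate a physical page for pmap use.  If nowait, dip into
 *	the reserve and return NULL on failure; otherwise, sleep until
 *	a page becomes available.  The returned page is never busy.
 */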
    602 static struct vm_page *
    603 pmap_page_alloc(bool nowait)
    604 {
    605 	struct vm_page *pg;
    606 	const int flags = nowait ? UVM_PGA_USERESERVE : 0;
    607 
    608 	while ((pg = uvm_pagealloc(NULL, 0, NULL, flags)) == NULL) {
    609 		if (nowait) {
    610 			return NULL;
    611 		}
    612 		uvm_wait("pmappg");
    613 	}
    614 	pg->flags &= ~PG_BUSY;	/* never busy */
    615 
    616 	return pg;
    617 }
    618 
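/*
 * pmap_ptpage_alloc:
 *
 *	Allocate and initialize a page table page: the descriptor, a
 *	kernel VA, the backing physical page (mapped cache-inhibited
 *	and zeroed), and the free list of tables carved out of it.
 */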
    619 static struct pmap_ptpage *
    620 pmap_ptpage_alloc(bool segtab, bool nowait)
    621 {
    622 	const unsigned int tabcnt = pmap_ptpage_table_counts[segtab];
    623 	const size_t size = sizeof(struct pmap_ptpage) +
    624 	    (sizeof(struct pmap_table) * tabcnt);
    625 	const size_t tabsize = PAGE_SIZE / tabcnt;
    626 	struct pmap_ptpage *ptp;
    627 	struct pmap_table *pt;
    628 	struct vm_page *pg;
    629 	const int uvm_f_nowait = nowait ? UVM_KMF_NOWAIT : 0;
    630 	vaddr_t ptpva;
    631 
    632 	ptp = kmem_zalloc(size, nowait ? KM_NOSLEEP : KM_SLEEP);
    633 	if (__predict_false(ptp == NULL)) {
    634 		return NULL;
    635 	}
    636 
    637 	/* Allocate a VA for the PT page. */
    638 	ptpva = uvm_km_alloc(kernel_map, PAGE_SIZE, 0,
    639 			     UVM_KMF_VAONLY | uvm_f_nowait);
    640 	if (__predict_false(ptpva == 0)) {
    641 		kmem_free(ptp, size);
    642 		return NULL;
    643 	}
    644 
    645 	/* Get a page. */
    646 	pg = pmap_page_alloc(nowait);
    647 	if (__predict_false(pg == NULL)) {
    648 		uvm_km_free(kernel_map, ptpva, PAGE_SIZE, UVM_KMF_VAONLY);
    649 		kmem_free(ptp, size);
    650 		return NULL;
    651 	}
    652 
    653 	/* Map the page cache-inhibited and zero it out. */
    654 	pmap_kenter_pa(ptpva, VM_PAGE_TO_PHYS(pg),
    655 	    UVM_PROT_READ | UVM_PROT_WRITE, PMAP_NOCACHE);
    656 	zeropage((void *)ptpva);
    657 
    658 	/*
    659 	 * All resources for the PT page have been allocated.
    660 	 * Now initialize it and the individual table descriptors.
    661 	 */
    662 	LIST_INIT(&ptp->ptp_freelist);
    663 	ptp->ptp_pg = pg;
    664 	ptp->ptp_vpagenum = m68k_btop(ptpva);
    665 	ptp->ptp_freecnt = tabcnt;
    666 	ptp->ptp_segtab = segtab;
    667 
    668 	for (unsigned int i = 0; i < tabcnt; ptpva += tabsize, i++) {
    669 		pt = &ptp->ptp_tables[i];
    670 		pt->pt_ptpage = ptp;
    671 		pt->pt_entries = (pt_entry_t *)ptpva;
    672 		LIST_INSERT_HEAD(&ptp->ptp_freelist, pt, pt_freelist);
    673 	}
    674 
    675 	return ptp;
    676 }
    677 
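/*
 * pmap_ptpage_free:
 *
 *	Release a page table page: free its kernel mapping and backing
 *	page, then free the descriptor itself.
 */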
    678 static void
    679 pmap_ptpage_free(struct pmap_ptpage *ptp)
    680 {
    681 	const unsigned int tabcnt = pmap_ptpage_table_counts[ptp->ptp_segtab];
    682 	const size_t size = sizeof(struct pmap_ptpage) +
    683 	    (sizeof(struct pmap_table) * tabcnt);
    684 
    685 	uvm_km_free(kernel_map, m68k_ptob(ptp->ptp_vpagenum), PAGE_SIZE,
    686 		    UVM_KMF_WIRED);
    687 	kmem_free(ptp, size);
    688 }
    689 
    690 static struct pool pmap_pool;
    691 static struct pool pmap_pv_pool;
    692 
    693 #define	PMAP_PV_LOWAT		16
    694 
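/*
 * pmap_alloc_init:
 *
 *	Initialize the pools used for pmap and pv_entry structures,
 *	and set a low water mark on the pv_entry pool.
 */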
    695 static void
    696 pmap_alloc_init(void)
    697 {
    698 	pool_init(&pmap_pv_pool, sizeof(struct pv_entry),
    699 	    PVH_ATTR_MASK + 1,		/* align */
    700 	    0,				/* ioff */
    701 	    0,				/* flags */
    702 	    "pmappv",			/* wchan */
    703 	    &pool_allocator_meta,	/* palloc */
    704 	    IPL_VM);			/* ipl */
    705 
    706 	/*
    707 	 * Set a low water mark on the pv_entry pool, so that we are
    708 	 * more likely to have these around even in extreme memory
    709 	 * starvation.
    710 	 */
    711 	pool_setlowat(&pmap_pv_pool, PMAP_PV_LOWAT);
    712 
    713 	pool_init(&pmap_pool, sizeof(struct pmap),
    714 	    0,				/* align */
    715 	    0,				/* ioff */
    716 	    0,				/* flags */
    717 	    "pmappl",			/* wchan */
    718 	    &pool_allocator_kmem,	/* palloc */
    719 	    IPL_NONE);			/* ipl */
    720 }
    721 
    722 static inline pmap_t
    723 pmap_alloc(void)
    724 {
    725 	pmap_t pmap = pool_get(&pmap_pool, PR_WAITOK);
    726 	memset(pmap, 0, sizeof(*pmap));
    727 	return pmap;
    728 }
    729 
    730 static inline void
    731 pmap_free(pmap_t pmap)
    732 {
    733 	pool_put(&pmap_pool, pmap);
    734 }
    735 
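/*
 * pmap_pv_alloc:
 *
 *	Allocate a pv_entry.  The pool's alignment guarantees that the
 *	low PVH_ATTR_MASK bits of the returned pointer are clear.
 */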
    736 static struct pv_entry *
    737 pmap_pv_alloc(bool nowait)
    738 {
    739 	struct pv_entry *pv;
    740 
    741 #ifdef PMAP_EVENT_COUNTERS
    742 	if (nowait) {
    743 		pmap_evcnt(pv_alloc_nowait);
    744 	} else {
    745 		pmap_evcnt(pv_alloc_wait);
    746 	}
    747 #endif
    748 
    749 	pv = pool_get(&pmap_pv_pool, nowait ? PR_NOWAIT : 0);
    750 	if (__predict_true(pv != NULL)) {
    751 		KASSERT((((uintptr_t)pv) & PVH_ATTR_MASK) == 0);
    752 	}
    753 	return pv;
    754 }
    755 
    756 static void
    757 pmap_pv_free(struct pv_entry *pv)
    758 {
    759 	pool_put(&pmap_pv_pool, pv);
    760 }
    761 
    762 /*
    763  * Whenever we need to free resources back to the system, we want to
    764  * do it in a batch with any locks released.  So, we have this around
    765  * to collect the garbage, as needed.
    766  */
    767 struct pmap_completion {
    768 	struct pmap_ptpage_list pc_ptpages;
    769 	struct pmap_pv_list pc_pvlist;
    770 };
    771 
    772 static inline void
    773 pmap_completion_init(struct pmap_completion *pc)
    774 {
    775 	TAILQ_INIT(&pc->pc_ptpages);
    776 	LIST_INIT(&pc->pc_pvlist);
    777 }
    778 
    779 static void
    780 pmap_completion_fini(struct pmap_completion *pc)
    781 {
    782 	struct pmap_ptpage *ptp;
    783 	struct pv_entry *pv;
    784 
    785 	while ((ptp = TAILQ_FIRST(&pc->pc_ptpages)) != NULL) {
    786 		TAILQ_REMOVE(&pc->pc_ptpages, ptp, ptp_list);
    787 		/*
    788 		 * Can't assert ptp_freecnt here; it won't match up
    789 		 * in the pmap_remove_all() case.
    790 		 *
    791 		 * KASSERT(ptp->ptp_freecnt ==
    792 		 *     pmap_ptpage_table_counts[ptp->ptp_segtab]);
    793 		 */
    794 		pmap_ptpage_free(ptp);
    795 	}
    796 
    797 	while ((pv = LIST_FIRST(&pc->pc_pvlist)) != NULL) {
    798 		LIST_REMOVE(pv, pv_pmlist);
    799 		pmap_pv_free(pv);
    800 	}
    801 }
    802 
    803 /************************ PTE MANIPULATION HELPERS ***************************/
    804 
    805 /* Assert assumptions made in <machine/pmap.h>. */
    806 __CTASSERT(DT51_PAGE == PTE40_RESIDENT);
    807 __CTASSERT(PTE51_WP == PTE40_W);
    808 __CTASSERT(PTE51_U == PTE40_U);
    809 __CTASSERT(PTE51_M == PTE40_M);
    810 __CTASSERT(PTE51_CI == PTE40_CM_NC_SER);
    811 
    812 static pt_entry_t	pmap_pte_proto[UVM_PROT_ALL + 1];
    813 static pt_entry_t	pmap_pte_proto_ci[UVM_PROT_ALL + 1];
    814 static pt_entry_t	pmap_pte_proto_um[UVM_PROT_ALL + 1];
    815 static pt_entry_t	pmap_ste_proto;
    816 
    817 static inline paddr_t
    818 pte_pa(pt_entry_t pte)
    819 {
    820 	return pte & PTE40_PGA;
    821 }
    822 
    823 /*
    824  * These predicate inlines compile down into BFEXTU, so are quite fast.
    825  */
    826 
    827 static inline bool
    828 pte_valid_p(pt_entry_t pte)
    829 {
    830 	return !!(pte & PTE_VALID);
    831 }
    832 
    833 static inline bool
    834 pte_wired_p(pt_entry_t pte)
    835 {
    836 	return !!(pte & PTE_WIRED);
    837 }
    838 
    839 static inline bool
    840 pte_managed_p(pt_entry_t pte)
    841 {
    842 	return !!(pte & PTE_PVLIST);
    843 }
    844 
    845 static inline bool
    846 pte_ci_p(pt_entry_t pte)
    847 {
    848 	/*
    849 	 * Happily, PTE51_CI is bit 6, which is set for both of the
    850 	 * cache-inhibited modes on 68040, so we can just check for
    851 	 * that.
    852 	 */
    853 	return !!(pte & PTE51_CI);
    854 }
    855 
    856 #define	PTE_PROT_CHANGE_BITS	(PTE_WP | PTE_CMASK)
    857 
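/*
 * pte_change_prot:
 *
 *	Return a copy of the given PTE with its protection (and the
 *	cache mode bits tied to it) replaced according to the new
 *	protection, preserving everything else.
 */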
    858 static inline pt_entry_t
    859 pte_change_prot(pt_entry_t opte, vm_prot_t prot)
    860 {
    861 	pt_entry_t *pte_proto = pte_ci_p(opte) ? pmap_pte_proto_ci
    862 					       : pmap_pte_proto;
    863 
    864 	return (opte & ~PTE_PROT_CHANGE_BITS) | pte_proto[prot];
    865 }
    866 
    867 static inline pt_entry_t
    868 pte_load(pt_entry_t *ptep)
    869 {
    870 	return atomic_load_relaxed(ptep);
    871 }
    872 
    873 static inline void
    874 pte_store(pt_entry_t *ptep, pt_entry_t npte)
    875 {
    876 	atomic_store_relaxed(ptep, npte);
    877 }
    878 
    879 /*
    880  * Don't inline the CAS.L instruction; some systems have non-working
    881  * READ-MODIFY-WRITE cycle logic.  This will ensure that we'll use
    882  * restartable atomic sequence, if required.
    883  *
    884  * AND.L and OR.L don't use the RMC signal, so they aren't subject
    885  * to the same constraints.
    886  */
    887 static inline bool
    888 pte_update(pt_entry_t *ptep, pt_entry_t opte, pt_entry_t npte)
    889 {
    890 	/*
    891 	 * Use compare-and-swap to update the PTE.  This ensures there's
    892 	 * no possibility of losing any hardware-maintained bits when
    893 	 * updating the PTE.
    894 	 *
    895 	 * XXX Should turn this into a single instruction when possible
    896 	 * XXX to deduce at compile time.
    897 	 */
    898 	return atomic_cas_uint(ptep, opte, npte) == opte;
    899 }
    900 
    901 #if MMU_CONFIG_HP_CLASS
    902 /*
    903  * These are only used for HP MMU VAC shenanigans.  There is no need
    904  * for these to be truly atomic, and systems with an HP MMU can't do
    905  * truly atomic operations anyway.
    906  */
    907 static inline void
    908 pte_set(pt_entry_t *ptep, pt_entry_t bits)
    909 {
    910 	*ptep |= bits;
    911 }
    912 
    913 static inline void
    914 pte_mask(pt_entry_t *ptep, pt_entry_t mask)
    915 {
    916 	*ptep &= mask;
    917 }
    918 #endif /* MMU_CONFIG_HP_CLASS */
    919 
    920 static inline pt_entry_t
    921 pte_set_ci(pt_entry_t pte)
    922 {
    923 	return (pte & ~PTE_CMASK) | (MMU_IS_68040_CLASS ? PTE40_CM_NC_SER
    924 							: PTE51_CI);
    925 }
    926 
    927 static inline pt_entry_t
    928 pte_clr_ci(pt_entry_t pte)
    929 {
    930 	pte &= ~PTE_CMASK;
    931 	if (MMU_IS_68040_CLASS) {
    932 		pte |= (pte & PTE_WP) ? PTE40_CM_WT
    933 				      : PTE40_CM_CB;
    934 	}
    935 	return pte;
    936 }
    937 
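/*
 * pmap_pte_proto_init:
 *
 *	Pre-compute the prototype PTE bits for each protection value
 *	(cached and cache-inhibited flavors), the U/M bits pre-set
 *	according to the access type passed to pmap_enter(), and the
 *	prototype segment table entry.
 */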
    938 static void
    939 pmap_pte_proto_init(void)
    940 {
    941 	pt_entry_t c_bits, ro_c_bits, rw_c_bits, ci_bits, prot_bits, um_bits;
    942 	int prot;
    943 
    944 	if (MMU_IS_68040_CLASS) {
    945 		ro_c_bits = PTE40_CM_WT; /* this is what the Hibler pmap did */
    946 		rw_c_bits = PTE40_CM_CB;
    947 		ci_bits = PTE40_CM_NC_SER;
    948 	} else {
    949 		ro_c_bits = rw_c_bits = 0;
    950 		ci_bits = PTE51_CI;
    951 	}
    952 
    953 	for (prot = 1; prot <= UVM_PROT_ALL; prot++) {
    954 		prot_bits = um_bits = 0;
    955 		if (prot & UVM_PROT_WRITE) {
    956 			um_bits = PTE_U | PTE_M;
    957 		} else if (prot & (UVM_PROT_READ|UVM_PROT_EXEC)) {
    958 			prot_bits = PTE_WP;
    959 			um_bits = PTE_U;
    960 		}
    961 		c_bits = (prot & UVM_PROT_WRITE) ? rw_c_bits : ro_c_bits;
    962 		pmap_pte_proto[prot]    = PTE_VALID | prot_bits | c_bits;
    963 		pmap_pte_proto_ci[prot] = PTE_VALID | prot_bits | ci_bits;
    964 		pmap_pte_proto_um[prot] = um_bits;
    965 	}
    966 
    967 	/*
    968 	 * from hp300/DOC/HPMMU.notes:
    969 	 *
    970 	 * Segment table entries:
    971 	 *
    972 	 * bits 31-12:	Physical page frame number of PT page
    973 	 * bits 11-4:	Reserved at zero (can software use them?)
    974 	 * bit 3:	Reserved at one
    975 	 * bits 1-0:	Valid bits (hardware uses bit 1)
    976 	 *
    977 	 * This is all roughly compatible with 68851 and 68040:
    978 	 *
    979 	 * bit 3:	DTE51_U / UTE40_U (used)
    980 	 * bits 1-0:	DT51_SHORT / UTE40_RESIDENT
    981 	 *
    982 	 * The Hibler pmap set "SG_U" in the 68040 case, but not in
    983 	 * any others (??), which seems at odds with HPMMU.notes, but
    984 	 * whatever.  It does not seem to cause any harm to set the
    985 	 * "used" bit in all cases, so that's what we'll do.  If it
    986 	 * does prove to be problematic, we can make adjustments.
    987 	 */
    988 	pmap_ste_proto = DTE51_U | DT51_SHORT;
    989 }
    990 
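/*
 * pmap_make_pte:
 *
 *	Construct a PTE for the given physical address, protection,
 *	and pmap_enter() flags using the prototype tables above.
 */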
    991 static inline pt_entry_t
    992 pmap_make_pte(paddr_t pa, vm_prot_t prot, u_int flags)
    993 {
    994 	pt_entry_t *pte_proto = (flags & PMAP_NOCACHE) ? pmap_pte_proto_ci
    995 						       : pmap_pte_proto;
    996 
    997 	prot &= UVM_PROT_ALL;
    998 	KASSERT(prot != 0);
    999 
   1000 	pt_entry_t npte = pa | pte_proto[prot] |
   1001 	    pmap_pte_proto_um[flags & UVM_PROT_ALL];
   1002 
   1003 	if (flags & PMAP_WIRED) {
   1004 		npte |= PTE_WIRED;
   1005 	}
   1006 
   1007 	return npte;
   1008 }
   1009 
   1010 /************************** PAGE TABLE MANAGEMENT ****************************/
   1011 
   1012 /*
   1013  * Kernel page table management works differently from user page table
   1014  * management.  An initial set of kernel PTs are allocated during early
   1015  * bootstrap (enough to map the virtual addresses set up at that time,
   1016  * plus a little extra to give the kernel some breathing room while
   1017  * UVM gets initialized -- see pmap_bootstrap1()).  If more PTs are
   1018  * needed in order to expand the kernel address space, pmap_growkernel()
   1019  * is called to allocate some more.  We always allocate kernel PTs in
   1020  * chunks of one page, allocating more inner segment tables as needed
   1021  * to link them into the MMU tree (3-level), or just poking them in
   1022  * directly to the level-1 table (2-level).
   1023  *
    1024  * The kernel PTs are mapped into a single linear array, which makes
    1025  * it possible to simply index by virtual page number to find
   1026  * the PTE that maps that virtual address.
   1027  */
   1028 #define	PTPAGEVASZ	((PAGE_SIZE / sizeof(pt_entry_t)) * PAGE_SIZE)
   1029 #define	PTPAGEVAOFS	(PTPAGEVASZ - 1)
   1030 
   1031 #define	pmap_round_ptpage(va)	(((va) + PTPAGEVAOFS) & ~PTPAGEVAOFS)
   1032 
   1033 /*
   1034  * kernel_virtual_start marks the first kernel virtual address that
   1035  * is handed off to UVM to manage.  kernel_virtual_end marks the end
   1036  * of the kernel address space that is currently mappable with the
   1037  * number of pages allocated to kernel PTs.
   1038  *
   1039  * kernel_virtual_start is fixed once pmap_bootstrap1() completes.
   1040  * kernel_virtual_end can be extended by calling pmap_growkernel().
   1041  *
   1042  * kernel_virtual_max represents the absolute maximum.  It starts at
   1043  * KERNEL_MAX_ADDRESS, but may get clamped by fixed mappings that
   1044  * start beyond the end of kernel virtual address space.
   1045  *
   1046  * kernel_virtual_max is exported to the rest of the kernel via
   1047  * pmap_virtual_space() and VM_MAX_KERNEL_ADDRESS.
   1048  */
   1049 #define	KERNEL_MAX_ADDRESS	((vaddr_t)0 - PAGE_SIZE)
   1050 static vaddr_t kernel_virtual_start, kernel_virtual_end;
   1051        vaddr_t kernel_virtual_max = KERNEL_MAX_ADDRESS;
   1052 
   1053 /*
   1054  * kernel_stnext_pa and kernel_stnext_endpa together implement a
   1055  * simple allocator for inner segment tables used in the 3-level
    1056  * configuration.  When the initial level-1 table is allocated,
    1057  * kernel_stnext_pa is set to the remainder of that page, and
   1058  * kernel_stnext_endpa is set to the next page boundary.  When
   1059  * a segment table is needed, kernel_stnext_pa is the address
   1060  * of the next free table and is advanced by the L2 table size
   1061  * (512 bytes).  If that allocation attempt finds that kernel_stnext_pa
   1062  * is equal to kernel_stnext_endpa, a new page is allocated and
    1063  * kernel_stnext_pa and kernel_stnext_endpa are updated to reflect
   1064  * the newly-allocated page before the table is taken from it.
   1065  */
   1066 static paddr_t kernel_stnext_pa, kernel_stnext_endpa;
   1067 
   1068 /*
   1069  * Null segment table that every pmap gets as its initial level 1
   1070  * map.  This is a single page allocated in pmap_bootstrap1(), and
   1071  * we zero it out in pmap_init().
   1072  */
   1073 static paddr_t null_segtab_pa __read_mostly;
   1074 
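/*
 * pmap_set_lev1map:
 *
 *	Install a new level 1 table in the pmap.  If the pmap is the
 *	currently active user pmap, also load the new root pointer
 *	into the MMU and invalidate the user ATC entries and the
 *	instruction cache.
 */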
   1075 static inline void
   1076 pmap_set_lev1map(pmap_t pmap, struct pmap_table *pt, paddr_t pa)
   1077 {
   1078 	pmap->pm_lev1map = pt;
   1079 	pmap->pm_lev1pa = pa;
   1080 	if (active_user_pmap(pmap)) {
   1081 #if MMU_CONFIG_HP_CLASS
   1082 		/*
   1083 		 * N.B. re-loading the user segment table pointer also
   1084 		 * invalidates the user side of the VAC, so no additional
   1085 		 * work is necessary.
   1086 		 */
   1087 #endif
   1088 		pmap_load_urp(pmap->pm_lev1pa);
   1089 		TBIAU();		/* XXX optimize? */
   1090 		ICIA();			/* XXX optimize? */
   1091 	}
   1092 }
   1093 
   1094 /*
   1095  * Table accessors.
   1096  */
   1097 static inline unsigned int
   1098 pmap_pagenum(vaddr_t va)
   1099 {
   1100 	return ((va) >> PGSHIFT);
   1101 }
   1102 
   1103 static inline unsigned int
   1104 pmap_segnum(vaddr_t va)
   1105 {
   1106 	return MMU_USE_3L ? ((va) >> SEGSHIFT3L) : ((va) >> SEGSHIFT2L);
   1107 }
   1108 
   1109 static inline unsigned int
   1110 pmap_st1_index(vaddr_t va)
   1111 {
   1112 	return MMU_USE_3L ? LA40_RI(va) : LA2L_RI(va);
   1113 }
   1114 
   1115 static inline unsigned int
   1116 pmap_st_index(vaddr_t va)
   1117 {
   1118 	return MMU_USE_3L ? LA40_PI(va) : LA2L_RI(va);
   1119 }
   1120 
   1121 static inline unsigned int
   1122 pmap_pt_index(vaddr_t va)
   1123 {
   1124 	return MMU_USE_3L ? LA40_PGI(va) : LA2L_PGI(va);
   1125 }
   1126 
   1127 static inline vaddr_t
   1128 pmap_trunc_seg(vaddr_t va)
   1129 {
   1130 	return MMU_USE_3L ? pmap_trunc_seg_3L(va) : pmap_trunc_seg_2L(va);
   1131 }
   1132 
   1133 static inline vaddr_t
   1134 pmap_trunc_seg1(vaddr_t va)
   1135 {
   1136 	KASSERT(MMU_USE_3L);
   1137 	return pmap_trunc_seg1_3L(va);
   1138 }
   1139 
   1140 static inline vaddr_t
   1141 pmap_round_seg(vaddr_t va)
   1142 {
   1143 	return MMU_USE_3L ? pmap_round_seg_3L(va) : pmap_round_seg_2L(va);
   1144 }
   1145 
   1146 static inline vaddr_t
   1147 pmap_next_seg(vaddr_t va)
   1148 {
   1149 	return pmap_round_seg(va + PAGE_SIZE);
   1150 }
   1151 
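/*
 * pmap_table_pa:
 *
 *	Return the physical address of the given table, computed from
 *	its PT page's physical address plus the table's offset within
 *	the page.
 */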
   1152 static paddr_t
   1153 pmap_table_pa(const struct pmap_table * const pt)
   1154 {
   1155 	const struct pmap_ptpage * const ptp = pt->pt_ptpage;
   1156 	const vaddr_t ptpva = m68k_ptob(ptp->ptp_vpagenum);
   1157 	const vaddr_t ptva = (vaddr_t)pt->pt_entries;
   1158 
   1159 	return VM_PAGE_TO_PHYS(ptp->ptp_pg) + (ptva - ptpva);
   1160 }
   1161 
   1162 static inline unsigned int
   1163 pmap_table_make_key(unsigned int segnum, bool segtab)
   1164 {
   1165 	KASSERT((segnum & 0x80000000) == 0);
   1166 	return (segnum << 1) | (unsigned int)segtab;
   1167 }
   1168 
   1169 static int
   1170 pmap_table_rb_compare_key(void *v __unused, const void *n, const void *k)
   1171 {
   1172 	const struct pmap_table * const pt1 = n;
   1173 	const unsigned int k1 = pt1->pt_key;
   1174 	const unsigned int k2 = *(const unsigned int *)k;
   1175 
   1176 	return (int)(k1 - k2);
   1177 }
   1178 
   1179 static int
   1180 pmap_table_rb_compare_nodes(void *v, const void *n1, const void *n2)
   1181 {
   1182 	const struct pmap_table * const pt2 = n2;
   1183 
   1184 	return pmap_table_rb_compare_key(v, n1, &pt2->pt_key);
   1185 }
   1186 
   1187 static const rb_tree_ops_t pmap_table_rb_ops = {
   1188 	.rbto_compare_nodes = pmap_table_rb_compare_nodes,
   1189 	.rbto_compare_key   = pmap_table_rb_compare_key,
   1190 	.rbto_node_offset   = offsetof(struct pmap_table, pt_node),
   1191 };
   1192 
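/*
 * pmap_table_alloc:
 *
 *	Allocate a table (segment or leaf) for the specified pmap,
 *	allocating a new PT page if none of the pmap's PT pages has
 *	a free table.  The table is returned with a hold count of 1;
 *	NULL is returned only in the nowait case.
 */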
   1193 static struct pmap_table *
   1194 pmap_table_alloc(pmap_t pmap, bool segtab, bool nowait,
   1195     struct pmap_completion *pc)
   1196 {
   1197 	struct pmap_ptpage_list *pmlist = &pmap->pm_ptpages[segtab];
   1198 	struct pmap_ptpage *ptp, *newptp = NULL;
   1199 	struct pmap_table *pt;
   1200 
   1201 	KASSERT(pc != NULL);
   1202 
   1203  try_again:
   1204 	if ((ptp = TAILQ_FIRST(pmlist)) == NULL || ptp->ptp_freecnt == 0) {
   1205 		/*
   1206 		 * No PT pages with free tables (empty PT pages are moved
   1207 		 * to the tail of the list).  Allocate a new PT page and
   1208 		 * try again.  If someone else successfully allocates one
   1209 		 * while we're sleeping, then we'll use it and free what
   1210 		 * we allocated back to the system.
   1211 		 */
   1212 		KASSERT(ptp == NULL || LIST_FIRST(&ptp->ptp_freelist) == NULL);
   1213 		if (newptp == NULL) {
   1214 			newptp = pmap_ptpage_alloc(segtab, nowait);
   1215 			if (newptp == NULL) {
   1216 				/*
   1217 				 * If we didn't wait, then no one would
   1218 				 * have allocted one behind our back.
   1219 				 */
   1220 				KASSERT(nowait);
   1221 				return NULL;
   1222 			}
   1223 			goto try_again;
   1224 		}
   1225 		ptp = newptp;
   1226 		TAILQ_INSERT_HEAD(pmlist, newptp, ptp_list);
   1227 	}
   1228 	if (__predict_false(newptp != NULL && ptp != newptp)) {
   1229 		/* Not using newly-allocated PT page; free it back. */
   1230 		TAILQ_INSERT_TAIL(&pc->pc_ptpages, newptp, ptp_list);
   1231 	}
   1232 	pt = LIST_FIRST(&ptp->ptp_freelist);
   1233 	KASSERT(pt != NULL);
   1234 	LIST_REMOVE(pt, pt_freelist);
   1235 	ptp->ptp_freecnt--;
   1236 	if (ptp->ptp_freecnt == 0 &&
   1237 	    TAILQ_NEXT(ptp, ptp_list) != NULL) {
   1238 		TAILQ_REMOVE(pmlist, ptp, ptp_list);
   1239 		TAILQ_INSERT_TAIL(pmlist, ptp, ptp_list);
   1240 	}
   1241 	KASSERT(pt->pt_st == NULL);
   1242 	pt->pt_holdcnt = 1;
   1243 
   1244 	return pt;
   1245 }
   1246 
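/*
 * pmap_table_free:
 *
 *	Return a table to its PT page's free list and adjust the PT
 *	page's position on the pmap's list, queueing the PT page for
 *	release to the system if it is now completely free.
 */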
   1247 static void
   1248 pmap_table_free(pmap_t pmap, struct pmap_table *pt,
   1249 		struct pmap_completion *pc)
   1250 {
   1251 	struct pmap_ptpage *ptp = pt->pt_ptpage;
   1252 	struct pmap_ptpage_list *pmlist = &pmap->pm_ptpages[ptp->ptp_segtab];
   1253 
   1254 	KASSERT(pt->pt_st == NULL);
   1255 
   1256 	LIST_INSERT_HEAD(&ptp->ptp_freelist, pt, pt_freelist);
   1257 	KASSERT(ptp->ptp_freecnt < pmap_ptpage_table_counts[ptp->ptp_segtab]);
   1258 	ptp->ptp_freecnt++;
   1259 
   1260 	/*
   1261 	 * If the PT page no longer has any active tables, then
   1262 	 * remove it from the pmap and queue it up to be given
   1263 	 * back to the system.
   1264 	 */
   1265 	if (ptp->ptp_freecnt == pmap_ptpage_table_counts[ptp->ptp_segtab]) {
   1266 		TAILQ_REMOVE(pmlist, ptp, ptp_list);
   1267 		TAILQ_INSERT_TAIL(&pc->pc_ptpages, ptp, ptp_list);
   1268 	}
   1269 	/*
   1270 	 * If the PT page now has exactly one free table, then
   1271 	 * put it at the head of its list so that it is allocated
   1272 	 * from first the next time a table is needed.
   1273 	 */
   1274 	else if (ptp->ptp_freecnt == 1) {
   1275 		TAILQ_REMOVE(pmlist, ptp, ptp_list);
   1276 		TAILQ_INSERT_HEAD(pmlist, ptp, ptp_list);
   1277 	}
   1278 	/*
   1279 	 * Push this PT page down the list if it has more free tables
   1280 	 * than the ones that come after.  The goal is to keep PT pages
   1281 	 * with the fewest free tables at the head of the list so that
   1282 	 * they're allocated from first.  This is an effort to keep
   1283 	 * fragmentation at bay so as to increase the likelihood that
   1284 	 * we can free PT pages back to the system.
   1285 	 */
   1286 	else {
   1287 		struct pmap_ptpage *next_ptp;
   1288 		for (next_ptp = TAILQ_NEXT(ptp, ptp_list);
   1289 		     next_ptp != NULL;
   1290 		     next_ptp = TAILQ_NEXT(next_ptp, ptp_list)) {
   1291 			if (next_ptp->ptp_freecnt < ptp->ptp_freecnt) {
   1292 				break;
   1293 			}
   1294 		}
   1295 		if (next_ptp != NULL &&
   1296 		    next_ptp != TAILQ_NEXT(ptp, ptp_list) &&
   1297 		    next_ptp->ptp_freecnt != 0) {
   1298 			TAILQ_REMOVE(pmlist, ptp, ptp_list);
   1299 			TAILQ_INSERT_AFTER(pmlist, next_ptp, ptp, ptp_list);
   1300 		}
   1301 	}
   1302 }
   1303 
   1304 /*
   1305  * pmap_table_retain:
   1306  *
   1307  *	Take a retain count on the specified table.  Retain counts
   1308  *	are used to ensure the table remains stable while working
   1309  *	on it, and each mapping placed into the table also gets
   1310  *	a retain count.
   1311  */
   1312 static inline void
   1313 pmap_table_retain(struct pmap_table *pt)
   1314 {
   1315 	if (__predict_true(pt != NULL)) {
   1316 		pt->pt_holdcnt++;
   1317 		KASSERT(pt->pt_holdcnt != 0);
   1318 	}
   1319 }
   1320 
   1321 /*
   1322  * pmap_table_release:
   1323  *
   1324  *	Release a previously-taken retain count on the specified
   1325  *	table.  If the retain count drops to zero, the table is
   1326  *	unlinked from the lookup tree and the MMU tree and freed.
   1327  */
   1328 static __noinline void
   1329 pmap_table_release_slow(pmap_t pmap, struct pmap_table *pt,
   1330 			struct pmap_completion *pc)
   1331 {
   1332 	KASSERT(pt != NULL);
   1333 	KASSERT(pt->pt_holdcnt != 0);
   1334 	pt->pt_holdcnt--;
   1335 	if (__predict_false(pt->pt_holdcnt != 0)) {
   1336 		return;
   1337 	}
   1338 
   1339 	/*
   1340 	 * If the caller doesn't expect the count to go to zero,
   1341 	 * they won't have bothered with a completion context.
   1342 	 * Going to zero is unexpected in this case, so blow up
   1343 	 * if it happens.
   1344 	 */
   1345 	KASSERT(pc != NULL);
   1346 	if (__predict_true(pt == pmap->pm_pt_cache)) {
   1347 		pmap->pm_pt_cache = NULL;
   1348 	}
   1349 	if (__predict_true(pt->pt_st != NULL)) {
   1350 		/*
   1351 		 * This table needs to be unlinked from the lookup
   1352 		 * tree and the MMU tree.
   1353 		 */
   1354 		pte_store(&pt->pt_st->pt_entries[pt->pt_stidx], 0);
   1355 		rb_tree_remove_node(&pmap->pm_tables, pt);
   1356 		pmap_table_release_slow(pmap, pt->pt_st, pc);
   1357 		pt->pt_st = NULL;
   1358 	} else if (pt == pmap->pm_lev1map) {
   1359 		pmap_set_lev1map(pmap, NULL, null_segtab_pa);
   1360 	}
   1361 	pmap_table_free(pmap, pt, pc);
   1362 }
   1363 
   1364 static inline void
   1365 pmap_table_release(pmap_t pmap, struct pmap_table *pt,
   1366 		   struct pmap_completion *pc)
   1367 {
   1368 	if (__predict_true(pt != NULL)) {
   1369 		if (__predict_true(pt->pt_holdcnt > 1)) {
   1370 			pt->pt_holdcnt--;
   1371 			return;
   1372 		}
   1373 		pmap_table_release_slow(pmap, pt, pc);
   1374 	}
   1375 }
   1376 
   1377 /*
   1378  * pmap_table_lookup:
   1379  *
   1380  *	Lookup the table corresponding to the specified segment.
   1381  */
   1382 static struct pmap_table *
   1383 pmap_table_lookup(pmap_t pmap, unsigned int segnum, bool segtab)
   1384 {
   1385 	const unsigned int key = pmap_table_make_key(segnum, segtab);
   1386 	struct pmap_table *pt;
   1387 
   1388 	if ((pt = pmap->pm_pt_cache) == NULL || pt->pt_key != key) {
   1389 		pmap_evcnt(pt_cache_miss);
   1390 		pt = rb_tree_find_node(&pmap->pm_tables, &key);
   1391 		if (__predict_true(!segtab)) {
   1392 			pmap->pm_pt_cache = pt;
   1393 		}
   1394 	} else {
   1395 		pmap_evcnt(pt_cache_hit);
   1396 	}
   1397 	if (pt != NULL) {
   1398 		pmap_table_retain(pt);
   1399 	}
   1400 	return pt;
   1401 }
   1402 
   1403 /*
   1404  * pmap_table_insert:
   1405  *
   1406  *	Allocate and insert a table into the tree at the specified
   1407  *	location.
   1408  */
   1409 static struct pmap_table *
   1410 pmap_table_insert(pmap_t pmap, struct pmap_table *t1, unsigned int stidx,
   1411     unsigned int segnum, bool segtab, bool nowait, struct pmap_completion *pc)
   1412 {
   1413 	struct pmap_table *t2, *ret_t;
   1414 
   1415 	t2 = pmap_table_lookup(pmap, segnum, segtab);
   1416 	if (t2 != NULL) {
   1417 		/*
   1418 		 * Table at this level already exists, and looking
   1419 		 * it up gave us a retain count, so we no longer need
   1420 		 * the retain count on the upper level table (it is
   1421 		 * retained-by-proxy by the table we just found).
   1422 		 * We pass NULL for the completion context because
   1423 		 * we don't expect the upper level table's retain count
   1424 		 * to drop to zero, and we want things to blow up
   1425 		 * loudly if it does!
   1426 		 */
   1427 		pmap_table_release(pmap, t1, NULL);
   1428 		return t2;
   1429 	}
   1430 
   1431 	/* Allocate the new table. */
   1432 	PMAP_CRIT_EXIT();
   1433 	t2 = pmap_table_alloc(pmap, segtab, nowait, pc);
   1434 	PMAP_CRIT_ENTER();
   1435 	if (__predict_false(t2 == NULL)) {
   1436 		pmap_table_release(pmap, t1, pc);
   1437 		return NULL;
   1438 	}
   1439 	t2->pt_key = pmap_table_make_key(segnum, segtab);
   1440 
   1441 	/*
   1442 	 * Now that we have the new table, we need to insert it into the
   1443 	 * table lookup tree.  If we blocked while allocating, it's possible
   1444 	 * someone raced with us and inserted one behind our back, so we need
   1445 	 * to check for that.
   1446 	 */
   1447 	ret_t = rb_tree_insert_node(&pmap->pm_tables, t2);
   1448 	if (__predict_false(ret_t != t2)) {
   1449 		/*
   1450 		 * Someone beat us to the punch.  If this happens,
   1451 		 * then we also need to drop the retain count on
   1452 		 * t1 because the table we just found already has
   1453 		 * a retain count on it.
   1454 		 */
   1455 		pmap_table_retain(ret_t);
   1456 		pmap_table_release(pmap, t2, pc);
   1457 		pmap_table_release(pmap, t1, NULL);
   1458 		return ret_t;
   1459 	}
   1460 
   1461 	/*
   1462 	 * Table has been successfully inserted into the lookup
   1463 	 * tree, now link it into the MMU's tree.  The new table
   1464 	 * takes ownership of the retain count that was taken on
   1465 	 * the upper level table while working.
   1466 	 */
   1467 	t2->pt_st = t1;
   1468 	t2->pt_stidx = (unsigned short)stidx;
   1469 	pte_store(&t1->pt_entries[stidx], pmap_ste_proto | pmap_table_pa(t2));
   1470 
   1471 	return t2;
   1472 }
   1473 
   1474 /*************************** PTE LOOKUP FUNCTIONS ****************************/
   1475 
   1476 static pt_entry_t *kernel_ptes;
   1477 
   1478 /*
   1479  * pmap_kernel_pte:
   1480  *
   1481  *	Get the PTE that maps the specified kernel virtual address.
   1482  *
    1483  *	Take note: the caller *may assume* that they can linearly
   1484  *	access adjacent PTEs up until the address indicated by
   1485  *	virtual_end!  That means, "pte++" is totally fine until you
   1486  *	get to the current limit of the kernel virtual address space!
   1487  *
   1488  *	XXX This is exported because db_memrw.c needs it.
   1489  */
   1490 pt_entry_t *
   1491 pmap_kernel_pte(vaddr_t va)
   1492 {
   1493 	/*
   1494 	 * The kernel PTEs are mapped as a linear array, whose entries
   1495 	 * represent the entire possible 4GB supervisor address space.
   1496 	 *
   1497 	 * Kernel PT pages are pre-allocated and mapped into this linear
   1498 	 * space (via pmap_growkernel(), as needed) and never freed back.
   1499 	 * So, as long as the VA is below virtual_end, we know that a PTE
   1500 	 * exists to back it.
   1501 	 *
   1502 	 * We don't assert that the VA < virtual_end, however; there may
   1503 	 * be special cases where we need to get a PTE that has been
   1504 	 * statically-allocated out beyond where virtual space is allowed
   1505 	 * to grow.  We'll find out soon enough if a PT page doesn't back
   1506 	 * it, because a fault will occur when the PTE is accessed.
   1507 	 */
   1508 	KASSERT(va >= VM_MIN_KERNEL_ADDRESS);
   1509 	return &kernel_ptes[m68k_btop(va - VM_MIN_KERNEL_ADDRESS)];
   1510 }
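
         /*
          * Illustrative sketch (not compiled): the linear-access contract
          * described above is what lets bulk operations walk kernel PTEs
          * with a simple pointer increment, in the style of
          * pmap_remove_internal() and pmap_kremove():
          *
          *	pt_entry_t *ptep;
          *	vaddr_t va;
          *
          *	for (ptep = pmap_kernel_pte(sva), va = sva; va < eva;
          *	     ptep++, va += PAGE_SIZE) {
          *		if (pte_valid_p(pte_load(ptep))) {
          *			... operate on the mapping at va ...
          *		}
          *	}
          *
          * This only holds while va stays below virtual_end (or within a
          * statically-allocated region known to be backed by a PT page).
          */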
   1511 
   1512 /*
   1513  * pmap_pte_lookup:
   1514  *
   1515  *	Lookup the PTE for the given address, returning a retained
   1516  *	reference to the table containing the PTE.
   1517  *
    1518 	 *	Take note: the caller *may assume* that they can linearly
   1519  *	access adjacent PTEs that map addresses within the same
   1520  *	segment!  That means, "pte++" is totally fine until you
   1521  *	get to the next segment boundary!
   1522  */
   1523 static pt_entry_t *
   1524 pmap_pte_lookup(pmap_t pmap, vaddr_t va, struct pmap_table **out_pt)
   1525 {
   1526 	if (pmap == pmap_kernel()) {
   1527 		*out_pt = NULL;
   1528 		return pmap_kernel_pte(va);
   1529 	}
   1530 
   1531 	const unsigned int segnum = pmap_segnum(va);
   1532 
   1533 	struct pmap_table *pt = pmap_table_lookup(pmap, segnum, false);
   1534 	if (__predict_true(pt != NULL)) {
   1535 		*out_pt = pt;	/* already retained */
   1536 		return &pt->pt_entries[pmap_pt_index(va)];
   1537 	}
   1538 
   1539 	*out_pt = NULL;
   1540 	return NULL;
   1541 }
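
         /*
          * Illustrative sketch (not compiled): for user pmaps the
          * linear-access guarantee only extends to the end of the current
          * segment, so bulk walkers bound each inner loop with
          * pmap_next_seg() and drop the table's retain count before moving
          * on, in the style of pmap_remove_internal() and pmap_protect():
          *
          *	while (sva < eva) {
          *		vaddr_t nextseg = pmap_next_seg(sva);
          *		if (nextseg == 0 || nextseg > eva)
          *			nextseg = eva;
          *		ptep = pmap_pte_lookup(pmap, sva, &pt);
          *		if (ptep == NULL) {
          *			sva = nextseg;	(whole segment is empty)
          *			continue;
          *		}
          *		for (; sva < nextseg; ptep++, sva += PAGE_SIZE) {
          *			... operate on *ptep ...
          *		}
          *		pmap_table_release(pmap, pt, ...);
          *	}
          */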
   1542 
   1543 /*
   1544  * pmap_pte_alloc:
   1545  *
   1546  *	Like pmap_pte_lookup(), but allocates tables as necessary.
   1547  *
    1548  *	We are entered in a critical section, but may drop it along
    1549  *	the way and re-validate our own assumptions.  Callers
    1550  *	(pmap_enter(), basically) should be aware of this.
   1551  */
   1552 static pt_entry_t *
   1553 pmap_pte_alloc(pmap_t pmap, vaddr_t va, struct pmap_table **out_pt,
   1554     bool nowait, struct pmap_completion *pc)
   1555 {
   1556 	struct pmap_table *st, *pt;
   1557 	pt_entry_t *ptep;
   1558 
   1559 	PMAP_CRIT_ASSERT();
   1560 
   1561 	ptep = pmap_pte_lookup(pmap, va, out_pt);
   1562 	if (__predict_true(ptep != NULL)) {
   1563 		return ptep;
   1564 	}
   1565 
   1566 	/*
   1567 	 * First get a reference on the top-level segment table and
   1568 	 * retain it so that it's stable while we work.
   1569 	 */
   1570 	if (__predict_true((st = pmap->pm_lev1map) != NULL)) {
   1571 		pmap_table_retain(st);
   1572 	} else {
   1573 		/*
   1574 		 * Oh look!  Baby pmap's first mapping!  Allocate
   1575 		 * a segment table.
   1576 		 */
   1577 		PMAP_CRIT_EXIT();
   1578 		st = pmap_table_alloc(pmap, true/*segtab*/, nowait, pc);
   1579 		PMAP_CRIT_ENTER();
   1580 		if (__predict_false(st == NULL)) {
   1581 			return NULL;
   1582 		}
   1583 
   1584 		/* Re-validate that we still need the segment table. */
   1585 		if (__predict_false(pmap->pm_lev1map != NULL)) {
   1586 			/* Raced and lost. */
   1587 			pmap_table_release(pmap, st, pc);
   1588 			st = pmap->pm_lev1map;
   1589 			pmap_table_retain(st);
   1590 		} else {
   1591 			/* New table is returned to us retained. */
   1592 			pmap_set_lev1map(pmap, st, pmap_table_pa(st));
   1593 		}
   1594 	}
   1595 
   1596 	/*
   1597 	 * Now we know that st points to a valid segment table with a
   1598 	 * retain count that lets us safely reference it.
   1599 	 */
   1600 
   1601 	if (MMU_USE_3L) {
   1602 		/* Get the inner segment table for this virtual address. */
   1603 		struct pmap_table * const st1 = st;
   1604 		st = pmap_table_insert(pmap, st1, pmap_st1_index(va),
   1605 		    pmap_st1_index(va), true/*segtab*/, nowait, pc);
   1606 		if (__predict_false(st == NULL)) {
   1607 			pmap_table_release(pmap, st1, pc);
   1608 			return NULL;
   1609 		}
   1610 	}
   1611 
   1612 	/* We can now allocate and insert the leaf page table. */
   1613 	pt = pmap_table_insert(pmap, st, pmap_st_index(va), pmap_segnum(va),
   1614 	    false/*segtab*/, nowait, pc);
   1615 	if (__predict_false(pt == NULL)) {
   1616 		pmap_table_release(pmap, st, pc);
   1617 		return NULL;
   1618 	}
   1619 
   1620 	*out_pt = pt;
   1621 	return &pt->pt_entries[pmap_pt_index(va)];
   1622 }
   1623 
   1624 /************************** P->V ENTRY MANAGEMENT ****************************/
   1625 
   1626 static inline pt_entry_t *
   1627 pmap_pv_pte(struct pv_entry * const pv)
   1628 {
   1629 	const vaddr_t va = PV_VA(pv);
   1630 
   1631 	if (__predict_true(pv->pv_pmap != pmap_kernel())) {
   1632 		KASSERT(pv->pv_pt != NULL);
   1633 		return &pv->pv_pt->pt_entries[pmap_pt_index(va)];
   1634 	}
   1635 	return pmap_kernel_pte(va);
   1636 }
   1637 
   1638 #define	MATCHING_PMAP(p1, p2)			\
   1639 	((p1) == (p2) ||			\
   1640 	 (p1) == pmap_kernel() || (p2) == pmap_kernel())
   1641 
   1642 #define	CONFLICTING_ALIAS(va1, va2)		\
   1643 	(((va1) & pmap_aliasmask) != ((va2) & pmap_aliasmask))
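
         /*
          * Worked example (hypothetical numbers): suppose pmap_aliasmask
          * covers the VAC index bits above the page offset, say 0x7000.
          * Two mappings of the same page at VAs 0x00001000 and 0x00005000
          * then conflict (0x1000 != 0x5000 after masking) and must be
          * cache-inhibited, while 0x00001000 and 0x00009000 do not (both
          * mask to 0x1000) and may remain cacheable.  Only mappings in
          * "matching" pmaps (the same pmap, or one of them the kernel) are
          * compared, per MATCHING_PMAP above.
          */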
   1644 
   1645 /*
   1646  * pmap_pv_enter:
   1647  *
   1648  *	Add a physical->virtual entry to the pv table.  Caller must provide
   1649  *	the storage for the new PV entry.
   1650  *
   1651  *	We are responsible for storing the new PTE into the destination
   1652  *	table.  We are also guaranteed that no mapping exists there, but
   1653  *	the MMU has a negative cache in the ATC (see 68030UM Figure 9-8.
   1654  *	Address Translation General Flowchart, ATC hit, B==1 case, as well
   1655  *	as 68040UM Figure 3-21. ATC Entry and Tag Fields, R bit and the
   1656  *	associated descriptive text), so we still have to handle ATC entry
   1657  *	invalidation.
   1658  */
   1659 static void
   1660 pmap_pv_enter(pmap_t pmap, struct vm_page *pg, vaddr_t va, vm_prot_t prot,
   1661     struct pmap_table *pt, pt_entry_t npte, struct pv_entry *newpv)
   1662 {
   1663 	const bool usr_ci = pte_ci_p(npte);
   1664 	struct pv_entry *pv;
   1665 	pt_entry_t opte;
   1666 
   1667 	pmap_evcnt(pv_enter_called);
   1668 
   1669 	PMAP_CRIT_ASSERT();
   1670 	KASSERT(newpv != NULL);
   1671 
   1672 	npte |= PTE_PVLIST;
   1673 
   1674 	newpv->pv_pmap = pmap;
   1675 	newpv->pv_vf = va;
   1676 	newpv->pv_pt = pt;
   1677 
   1678 	pt_entry_t *ptep = pmap_pv_pte(newpv);
   1679 
   1680 #ifdef DEBUG
   1681 	/*
   1682 	 * Make sure the entry doesn't already exist.
   1683 	 */
   1684 	for (pv = VM_MDPAGE_PVS(pg); pv != NULL; pv = pv->pv_next) {
   1685 		if (pmap == pv->pv_pmap && va == PV_VA(pv)) {
   1686 			panic("%s: pmap=%p va=0x%08lx already in PV table",
   1687 			    __func__, pmap, va);
   1688 		}
   1689 	}
   1690 #endif
   1691 
   1692 	if (__predict_false(usr_ci)) {
   1693 		newpv->pv_vf |= PV_F_CI_USR;
   1694 	}
   1695 
   1696 	newpv->pv_next = VM_MDPAGE_PVS(pg);
   1697 	VM_MDPAGE_SETPVP(VM_MDPAGE_HEAD_PVP(pg), newpv);
   1698 	LIST_INSERT_HEAD(&pmap->pm_pvlist, newpv, pv_pmlist);
   1699 
   1700 	/*
   1701 	 * If this is an EXEC mapping, then we have to ensure that
   1702 	 * the I$ doesn't load stale data.
   1703 	 *
   1704 	 * XXX Should have a soft-PTE bit for this.
   1705 	 */
   1706 	if (prot & UVM_PROT_EXEC) {
   1707 #if MMU_CONFIG_68040_CLASS
   1708 		if (MMU_IS_68040_CLASS) {
   1709 			/*
   1710 			 * XXX Potential future optimization: is only
   1711 			 * XXX the DCFP() needed here to deal with
   1712 			 * XXX write-back?  Should we track EXEC-ness
   1713 			 * XXX in the VM_MDPAGE?
   1714 			 */
   1715 			const paddr_t pa = VM_PAGE_TO_PHYS(pg);
   1716 			DCFP(pa);
   1717 			ICPP(pa);
   1718 		}
   1719 #endif
   1720 	}
   1721 
   1722 #if MMU_CONFIG_HP_CLASS
   1723 	if (MMU_IS_HP_CLASS) {
   1724 		/* Go handle the HP MMU's VAC. */
   1725 		goto hp_mmu_vac_shenanigans;
   1726 	}
   1727 #endif
   1728 
   1729 	/*
   1730 	 * If the page is marked as being cache-inhibited, it means
   1731 	 * there is at least one user-requested CI mapping already
   1732 	 * (and that all of the extant mappings are thus CI).
   1733 	 *
   1734 	 * In this case, we need to make sure that the one we're
   1735 	 * establishing now is CI as well.
   1736 	 */
   1737 	if (__predict_false(VM_MDPAGE_CI_P(pg))) {
   1738 		npte = pte_set_ci(npte);
   1739 		pte_store(ptep, npte);
   1740 		/* See below. */
   1741 		if (active_pmap(pmap)) {
   1742 			TBIS(va);
   1743 		}
   1744 		return;
   1745 	}
   1746 
   1747 	/* Set the PTE for the new mapping. */
   1748 	pte_store(ptep, npte);
   1749 
   1750 	/*
   1751 	 * Invalidate the ATC entry **after** storing the PTE so that
   1752 	 * there is no window where another MMU table walk finds the
   1753 	 * stale invalid entry.
   1754 	 */
   1755 	if (active_pmap(pmap)) {
   1756 		TBIS(va);
   1757 	}
   1758 
   1759 	/*
   1760 	 * If this is a user-requested CI mapping, we need to make
   1761 	 * sure the page is purged from the cache and mark any other
   1762 	 * mappings of this page CI as well.
   1763 	 */
   1764 	if (__predict_false(usr_ci)) {
   1765 		VM_MDPAGE_SET_CI(pg);
   1766 
   1767 		pmap_evcnt(pv_enter_usr_ci);
   1768 
   1769 		/*
   1770 		 * There shouldn't be very many of these; CI mappings
   1771 		 * of managed pages are typically only for coherent DMA
   1772 		 * purposes, and multiple mappings of the same page are
   1773 		 * extremely uncommon in that scenario.
   1774 		 */
   1775 		for (pv = newpv->pv_next; pv != NULL; pv = pv->pv_next) {
   1776 			pmap_evcnt(pv_enter_ci_multi);
   1777 			ptep = pmap_pv_pte(pv);
   1778 			for (;;) {
   1779 				opte = pte_load(ptep);
   1780 				npte = pte_set_ci(opte);
   1781 				if (pte_update(ptep, opte, npte)) {
   1782 					if (active_pmap(pv->pv_pmap)) {
   1783 						TBIS(PV_VA(pv));
   1784 					}
   1785 					break;
   1786 				}
   1787 			}
   1788 		}
   1789 #if MMU_CONFIG_68040_CLASS
   1790 		if (MMU_IS_68040_CLASS) {
   1791 			const paddr_t pa = VM_PAGE_TO_PHYS(pg);
   1792 			DCFP(pa);
   1793 			ICPP(pa);
   1794 		}
   1795 #endif
   1796 	}
   1797 	return;
   1798 
   1799 #if MMU_CONFIG_HP_CLASS
   1800  hp_mmu_vac_shenanigans:
   1801 	/*
   1802 	 * We have ourselves a VAC, so in addition to checking for
   1803 	 * user-requested-CI mappings, we have to check for cache
   1804 	 * aliases and cache-inhibit all mappings for a page that
   1805 	 * have a cache alias conflict.
   1806 	 *
   1807 	 * - All mappings of a given page within the same pmap must
   1808 	 *   not collide.  (The VAC is flushed when switching pmaps
   1809 	 *   by virtue of a new segment table pointer being loaded
   1810 	 *   into the user segment table register.)
   1811 	 *
    1812 	 * - The Hibler pmap checks to see that the kernel doesn't have
   1813 	 *   conflicting mappings with any user pmap.  We'll do the same,
   1814 	 *   which seems reasonable on the surface if you think about it
   1815 	 *   for a couple of minutes.
   1816 	 *
    1817 	 * - The Hibler pmap also just punts and cache-inhibits all of a
    1818 	 *   page's mappings once it has more than two, but we do NOT do
    1819 	 *   that because it will severely penalize shared libraries.
   1820 	 *
   1821 	 * N.B. The method used here will not universally render all
   1822 	 * mappings for a given page uncached; only address spaces with
   1823 	 * conflicts are penalized.
   1824 	 *
   1825 	 * XXX This probably only matters if one of the mappings is
   1826 	 * XXX writable, as this is the only situation where data
   1827 	 * XXX inconsistency could arise.  There is probably room
   1828 	 * XXX for further optimization if someone with one of these
   1829 	 * XXX machines cares to take it up.
   1830 	 */
   1831 	bool flush_s_vac = false;
   1832 	bool flush_u_vac = false;
   1833 
   1834 	/* Set the PTE for the new mapping. */
   1835 	pte_store(ptep, npte);
   1836 
   1837 	/*
   1838 	 * Invalidate the ATC entry **after** storing the PTE so that
   1839 	 * there is no window where another MMU table walk finds the
   1840 	 * stale invalid entry.
   1841 	 *
   1842 	 * XXX I don't know that this is strictly necessary with the
   1843 	 * XXX HP MMU, but there is basically zero documentation available
   1844 	 * XXX for it, so we err on the side of caution.
   1845 	 */
   1846 	if (active_pmap(pmap)) {
   1847 		TBIS(va);
   1848 	}
   1849 
   1850 	vaddr_t pv_flags = newpv->pv_vf & PV_F_CI_USR;
   1851 	if (usr_ci) {
   1852 		pmap_evcnt(pv_enter_usr_ci);
   1853 	}
   1854 
   1855 	for (pv = newpv->pv_next; pv != NULL; pv = pv->pv_next) {
   1856 		if (MATCHING_PMAP(pmap, pv->pv_pmap) &&
   1857 		    CONFLICTING_ALIAS(va, PV_VA(pv))) {
   1858 			pmap_evcnt(pv_enter_vac_ci);
   1859 			pv_flags |= PV_F_CI_VAC;
   1860 			break;
   1861 		}
   1862 	}
   1863 
   1864 	if (__predict_true(pv_flags == 0)) {
   1865 		/* No new inhibitions! */
   1866 		return;
   1867 	}
   1868 
   1869 	VM_MDPAGE_SET_CI(pg);
   1870 	for (pv = VM_MDPAGE_PVS(pg); pv != NULL; pv = pv->pv_next) {
   1871 		if (MATCHING_PMAP(pmap, pv->pv_pmap)) {
   1872 			pmap_evcnt(pv_enter_ci_multi);
   1873 			pv->pv_vf |= pv_flags;
   1874 			pte_set(pmap_pv_pte(pv), PTE51_CI);
   1875 			if (active_pmap(pv->pv_pmap)) {
   1876 				TBIS(PV_VA(pv));
   1877 				if (pv->pv_pmap == pmap_kernel()) {
   1878 					flush_s_vac = true;
   1879 				} else {
   1880 					flush_u_vac = true;
   1881 				}
   1882 			}
   1883 		}
   1884 	}
   1885 	if (flush_u_vac && flush_s_vac) {
   1886 		DCIA();
   1887 	} else if (flush_u_vac) {
   1888 		DCIU();
   1889 	} else if (flush_s_vac) {
   1890 		DCIS();
   1891 	}
   1892 #endif /* MMU_CONFIG_HP_CLASS */
   1893 }
   1894 
   1895 /*
   1896  * pmap_pv_remove:
   1897  *
   1898  *	Remove a physical->virtual entry from the pv table.
   1899  */
   1900 static void
   1901 pmap_pv_remove(pmap_t pmap, struct vm_page *pg, vaddr_t va,
   1902     struct pmap_completion *pc)
   1903 {
   1904 	struct pv_entry **pvp, *pv;
   1905 	pt_entry_t *ptep, opte, npte;
   1906 
   1907 	pmap_evcnt(pv_remove_called);
   1908 
   1909 	PMAP_CRIT_ASSERT();
   1910 
   1911 	for (pvp = VM_MDPAGE_HEAD_PVP(pg), pv = VM_MDPAGE_PVS(pg);
   1912 	     pv != NULL;
   1913 	     pvp = &pv->pv_next, pv = *pvp) {
   1914 		if (pmap == pv->pv_pmap && va == PV_VA(pv)) {
   1915 			break;
   1916 		}
   1917 	}
   1918 
   1919 	KASSERT(pv != NULL);
   1920 	VM_MDPAGE_SETPVP(pvp, pv->pv_next);
   1921 	LIST_REMOVE(pv, pv_pmlist);
   1922 
   1923 	KASSERT(pc != NULL);
   1924 	LIST_INSERT_HEAD(&pc->pc_pvlist, pv, pv_pmlist);
   1925 
   1926 #if MMU_CONFIG_HP_CLASS
   1927 	if (MMU_IS_HP_CLASS) {
   1928 		/* Go handle the HP MMU's VAC. */
   1929 		goto hp_mmu_vac_shenanigans;
   1930 	}
   1931 #endif
   1932 
   1933 	/*
   1934 	 * If the page is marked as being cache-inhibited, then it
   1935 	 * means there was at least one user-requested CI mapping
   1936 	 * for the page.  In that case, we need to scan the P->V
   1937 	 * list to see if any remain, and if not, clear the CI
   1938 	 * status for the page.
   1939 	 *
   1940 	 * N.B. This requires traversing the list twice: once to
   1941 	 * check if any of the mappings are user-requested-CI,
    1942 	 * and once again to fix them up.  But we're making a
   1943 	 * classical space-vs-time trade-off here: Assuming that
   1944 	 * this is a rare situation, it's better to pay the cpu
   1945 	 * cost on the rare edge transitions rather than always pay
   1946 	 * the memory cost of having a counter to track something
   1947 	 * that almost never happens (and, when it does, the list
   1948 	 * will be very short).
   1949 	 */
   1950 	if (__predict_false(VM_MDPAGE_CI_P(pg))) {
   1951 		pmap_evcnt(pv_remove_ci);
   1952 		for (pv = VM_MDPAGE_PVS(pg); pv != NULL; pv = pv->pv_next) {
   1953 			if (pv->pv_vf & PV_F_CI_USR) {
   1954 				/*
   1955 				 * There is still at least one user-requested
   1956 				 * CI mapping, so we can't change the page's CI
   1957 				 * status.
   1958 				 */
   1959 				return;
   1960 			}
   1961 		}
   1962 		KASSERT(pv == NULL);
   1963 		VM_MDPAGE_CLR_CI(pg);
   1964 		for (pv = VM_MDPAGE_PVS(pg); pv != NULL; pv = pv->pv_next) {
   1965 			ptep = pmap_pv_pte(pv);
   1966 			for (;;) {
   1967 				opte = pte_load(ptep);
   1968 				npte = pte_clr_ci(opte);
   1969 				if (pte_update(ptep, opte, npte)) {
   1970 					if (active_pmap(pv->pv_pmap)) {
   1971 						TBIS(PV_VA(pv));
   1972 					}
   1973 					break;
   1974 				}
   1975 			}
   1976 		}
   1977 	}
   1978 	return;
   1979 
   1980 #if MMU_CONFIG_HP_CLASS
   1981  hp_mmu_vac_shenanigans:
   1982 	/*
   1983 	 * If we have a VAC and the page was cache-inhibited due to
   1984 	 * a cache alias conflict, we can re-enable the cache if there
   1985 	 * is just one such mapping left.
   1986 	 */
   1987 	if (__predict_false(VM_MDPAGE_CI_P(pg))) {
   1988 		vaddr_t all_ci_flags = PV_F_CI_USR;
   1989 
   1990 		pmap_evcnt(pv_remove_ci);
   1991 
   1992 		for (pv = VM_MDPAGE_PVS(pg); pv != NULL; pv = pv->pv_next) {
   1993 			if (! MATCHING_PMAP(pmap, pv->pv_pmap)) {
   1994 				continue;
   1995 			}
   1996 			if (pv->pv_vf & all_ci_flags) {
   1997 				/*
   1998 				 * There is at least one CI_USR mapping
   1999 				 * or more than one CI_VAC mapping, so
   2000 				 * the CI status of the page remains
   2001 				 * unchanged.
   2002 				 */
   2003 				return;
   2004 			}
   2005 			all_ci_flags |= pv->pv_vf & PV_F_CI_VAC;
   2006 		}
   2007 		KASSERT(pv == NULL);
   2008 		/*
   2009 		 * We now know we can remove CI from the page mappings
   2010 		 * in the matching address space.  If no CI mappings
   2011 		 * remain, then we can clear the CI indicator on the
   2012 		 * page.
   2013 		 */
   2014 		all_ci_flags = 0;
   2015 		for (pv = VM_MDPAGE_PVS(pg); pv != NULL; pv = pv->pv_next) {
   2016 			if (! MATCHING_PMAP(pmap, pv->pv_pmap)) {
   2017 				all_ci_flags |= pv->pv_vf;
   2018 				continue;
   2019 			}
   2020 			pte_mask(pmap_pv_pte(pv), ~((uint32_t)PTE51_CI));
   2021 			if (active_pmap(pv->pv_pmap)) {
   2022 				TBIS(PV_VA(pv));
   2023 			}
   2024 		}
   2025 		all_ci_flags &= PV_F_CI_USR | PV_F_CI_VAC;
   2026 		if (__predict_true(all_ci_flags == 0)) {
   2027 			VM_MDPAGE_CLR_CI(pg);
   2028 		}
   2029 	}
   2030 #endif /* MMU_CONFIG_HP_CLASS */
   2031 }
   2032 
   2033 #undef CONFLICTING_ALIAS
   2034 #undef MATCHING_PMAP
   2035 
   2036 /***************** PMAP INTERFACE (AND ADJACENT) FUNCTIONS *******************/
   2037 
   2038 static inline void
   2039 pmap_stat_update_impl(long *valp, int val)
   2040 {
   2041 	*valp += val;
   2042 }
   2043 
   2044 #define	pmap_stat_update(pm, stat, delta)		\
   2045 	pmap_stat_update_impl(&(pm)->pm_stats.stat, (delta))
   2046 
   2047 static inline void
   2048 pmap_stat_set_impl(long *valp, int val)
   2049 {
   2050 	atomic_store_relaxed(valp, val);
   2051 }
   2052 
   2053 #define	pmap_stat_set(pm, stat, val)			\
   2054 	pmap_stat_set_impl(&(pm)->pm_stats.stat, (val))
   2055 
   2056 /*
   2057  * pmap_pinit:
   2058  *
   2059  *	Common bits of pmap structure initialization shared between
   2060  *	the kernel pmap and user pmaps.
   2061  */
   2062 static void
   2063 pmap_pinit(pmap_t pmap, paddr_t lev1pa)
   2064 {
   2065 	pmap->pm_lev1pa = lev1pa;
   2066 	rb_tree_init(&pmap->pm_tables, &pmap_table_rb_ops);
   2067 	TAILQ_INIT(&pmap->pm_ptpages[0]);
   2068 	TAILQ_INIT(&pmap->pm_ptpages[1]);
   2069 	LIST_INIT(&pmap->pm_pvlist);
   2070 
   2071 	atomic_store_relaxed(&pmap->pm_refcnt, 1);
   2072 }
   2073 
   2074 /*
   2075  * pmap_virtual_space:		[ INTERFACE ]
   2076  *
   2077  *	Define the initial bounds of the kernel virtual address space.
   2078  *
   2079  *	In this implementation, the start address we return marks the
   2080  *	end of the statically allocated special kernel virtual addresses
   2081  *	set up in pmap_bootstrap1().  We return kernel_virtual_max as
   2082  *	the end because we can grow the kernel address space using
   2083  *	pmap_growkernel().
   2084  */
   2085 void
   2086 pmap_virtual_space(vaddr_t *vstartp, vaddr_t *vendp)
   2087 {
   2088 	*vstartp = kernel_virtual_start;
   2089 	*vendp = kernel_virtual_max;
   2090 }
   2091 
   2092 /*
   2093  * pmap_init:			[ INTERFACE ]
   2094  *
   2095  *	Initialize the pmap module.  Called by vm_init(), to initialize any
   2096  *	structures that the pmap system needs to map virtual memory.
   2097  */
   2098 void
   2099 pmap_init(void)
   2100 {
   2101 	/* Initialize the pmap / pv_entry allocators. */
   2102 	pmap_alloc_init();
   2103 
   2104 	/* Initialize the PT page allocator. */
   2105 	pmap_ptpage_init();
   2106 
   2107 	/* Now it's safe to do P->V entry recording! */
   2108 	pmap_initialized_p = true;
   2109 }
   2110 
   2111 /*
   2112  * pmap_create:			[ INTERFACE ]
   2113  *
   2114  *	Create and return a physical map.
   2115  */
   2116 pmap_t
   2117 pmap_create(void)
   2118 {
   2119 	pmap_t pmap;
   2120 
   2121 	/*
    2122 	 * We reference the null segment table and have a NULL
   2123 	 * lev1map pointer until the first mapping is entered.
   2124 	 */
   2125 	pmap = pmap_alloc();
   2126 	pmap_pinit(pmap, null_segtab_pa);
   2127 
   2128 	return pmap;
   2129 }
   2130 
   2131 /*
   2132  * pmap_destroy:		[ INTERFACE ]
   2133  *
   2134  *	Drop the reference count on the specified pmap, releasing
   2135  *	all resources if the reference count drops to zero.
   2136  */
   2137 void
   2138 pmap_destroy(pmap_t pmap)
   2139 {
   2140 	unsigned int newval;
   2141 
   2142 	PMAP_CRIT_ENTER();
   2143 	KASSERT(pmap->pm_refcnt > 0);
   2144 	newval = --pmap->pm_refcnt;
   2145 	PMAP_CRIT_EXIT();
   2146 
   2147 	if (newval) {
   2148 		return;
   2149 	}
   2150 
   2151 	/* We assume all mappings have been removed. */
   2152 	KASSERT(pmap->pm_lev1map == NULL);
   2153 	KASSERT(pmap->pm_lev1pa == null_segtab_pa);
   2154 
   2155 	pmap_free(pmap);
   2156 }
   2157 
   2158 /*
   2159  * pmap_reference:		[ INTERFACE ]
   2160  *
   2161  *	Add a reference to the specified pmap.
   2162  */
   2163 void
   2164 pmap_reference(pmap_t pmap)
   2165 {
   2166 	PMAP_CRIT_ENTER();
   2167 	pmap->pm_refcnt++;
   2168 	KASSERT(pmap->pm_refcnt > 0);
   2169 	PMAP_CRIT_EXIT();
   2170 }
   2171 
   2172 /*
   2173  * pmap_remove_mapping:
   2174  *
   2175  *	Invalidate a single page denoted by pmap/va.
   2176  *
   2177  *	If (ptep != NULL), it is the already computed PTE for the mapping.
   2178  *
   2179  *	If (flags & PRM_TFLUSH), we must invalidate any TLB information.
   2180  *
   2181  *	If (flags & PRM_CFLUSH), we must flush/invalidate any cache
   2182  *	information.
   2183  *
   2184  *	If the caller wishes to prevent the page table from being freed,
   2185  *	they should perform an extra retain.
   2186  */
   2187 #define	PRM_TFLUSH	__BIT(0)
   2188 #define	PRM_CFLUSH	__BIT(1)
   2189 static void
   2190 pmap_remove_mapping(pmap_t pmap, vaddr_t va, pt_entry_t *ptep,
   2191     struct pmap_table *pt, int flags, struct pmap_completion *pc)
   2192 {
   2193 	KASSERT(ptep != NULL);
   2194 
    2195 	const pt_entry_t opte = pte_load(ptep);
   2196 	if (! pte_valid_p(opte)) {
   2197 		return;
   2198 	}
   2199 
   2200 	const paddr_t pa = pte_pa(opte);
   2201 
   2202 	/* Update statistics. */
   2203 	if (pte_wired_p(opte)) {
   2204 		pmap_stat_update(pmap, wired_count, -1);
   2205 	}
   2206 	pmap_stat_update(pmap, resident_count, -1);
   2207 
   2208 	if (flags & PRM_CFLUSH) {
   2209 #if MMU_CONFIG_68040_CLASS
   2210 		if (MMU_IS_68040_CLASS) {
   2211 			DCFP(pa);
   2212 			ICPP(pa);
   2213 		}
   2214 #endif
   2215 #if MMU_CONFIG_HP_CLASS
   2216 		if (MMU_IS_HP_CLASS) {
   2217 			if (pmap == pmap_kernel()) {
   2218 				DCIS();
   2219 			} else if (active_user_pmap(pmap)) {
   2220 				DCIU();
   2221 			}
   2222 		}
   2223 #endif
   2224 	}
   2225 
   2226 	/*
   2227 	 * Zap the PTE and drop the retain count that the mapping
   2228 	 * had on the table.
   2229 	 */
   2230 	pte_store(ptep, 0);
   2231 	pmap_table_release(pmap, pt, pc);
   2232 
   2233 	/*
   2234 	 * Now that the ATC can't be reloaded from the PTE, invalidate
   2235 	 * the ATC entry.
   2236 	 */
   2237 	if (__predict_true((flags & PRM_TFLUSH) != 0 && active_pmap(pmap))) {
   2238 		TBIS(va);
   2239 	}
   2240 
   2241 	struct vm_page * const pg = pmap_pa_to_pg(pa);
   2242 	if (__predict_true(pg != NULL)) {
   2243 		KASSERT(pte_managed_p(opte));
   2244 		/* Update cached U/M bits from mapping that's going away. */
   2245 		VM_MDPAGE_ADD_UM(pg, opte);
   2246 		pmap_pv_remove(pmap, pg, va, pc);
   2247 	} else {
   2248 		KASSERT(! pte_managed_p(opte));
   2249 	}
   2250 }
   2251 
   2252 /*
   2253  * pmap_remove:			[ INTERFACE ]
   2254  *
   2255  *	Remove the given range of addresses from the specified map.
   2256  *
   2257  *	It is assumed that the start and end are properly rounded
   2258  *	to the page size.
   2259  *
   2260  *	N.B. Callers of pmap_remove_internal() are expected to
   2261  *	provide an initialized completion context, which we
   2262  *	will finalize.
   2263  */
   2264 static void
   2265 pmap_remove_internal(pmap_t pmap, vaddr_t sva, vaddr_t eva,
   2266     struct pmap_completion *pc)
   2267 {
   2268 	pt_entry_t opte, *ptep;
   2269 	struct pmap_table *pt;
   2270 	vaddr_t nextseg;
   2271 	int prm_flags;
   2272 #if MMU_CONFIG_HP_CLASS
   2273 	pt_entry_t all_ci = PTE51_CI;
   2274 #endif
   2275 
   2276 	/*
   2277 	 * If this is the kernel pmap, we can use a faster method
   2278 	 * for accessing the PTEs (since the PT pages are always
   2279 	 * resident).
   2280 	 *
   2281 	 * Note that this routine should NEVER be called from an
   2282 	 * interrupt context; pmap_kremove() is used for that.
   2283 	 */
   2284 	prm_flags = active_pmap(pmap) ? PRM_TFLUSH : 0;
   2285 	if (pmap == pmap_kernel()) {
   2286 		PMAP_CRIT_ENTER();
   2287 
   2288 		for (ptep = pmap_kernel_pte(sva); sva < eva;
   2289 		     ptep++, sva += PAGE_SIZE) {
   2290 			opte = pte_load(ptep);
   2291 			if (pte_valid_p(opte)) {
   2292 #if MMU_CONFIG_HP_CLASS
   2293 				/*
   2294 				 * If all of the PTEs we're zapping have the
    2295 				 * cache-inhibit bit set, all_ci will remain
   2296 				 * non-zero and we'll be able to skip flushing
   2297 				 * the VAC when we're done.
   2298 				 */
   2299 				all_ci &= opte;
   2300 #endif
   2301 				pmap_remove_mapping(pmap, sva, ptep, NULL,
   2302 				    prm_flags, pc);
   2303 			}
   2304 		}
   2305 #if MMU_CONFIG_HP_CLASS
   2306 		if (MMU_IS_HP_CLASS && !all_ci) {
   2307 			/*
   2308 			 * Cacheable mappings were removed, so invalidate
   2309 			 * the cache.
   2310 			 */
   2311 			DCIS();
   2312 		}
   2313 #endif
   2314 		PMAP_CRIT_EXIT();
   2315 
   2316 		/* kernel PT pages are never freed. */
   2317 		KASSERT(TAILQ_EMPTY(&pc->pc_ptpages));
   2318 
   2319 		/* ...but we might have freed PV entries. */
   2320 		pmap_completion_fini(pc);
   2321 
   2322 		return;
   2323 	}
   2324 
   2325 	PMAP_CRIT_ENTER();
   2326 
   2327 	while (sva < eva) {
   2328 		nextseg = pmap_next_seg(sva);
   2329 		if (nextseg == 0 || nextseg > eva) {
   2330 			nextseg = eva;
   2331 		}
   2332 
   2333 		ptep = pmap_pte_lookup(pmap, sva, &pt);
   2334 		if (ptep == NULL) {
   2335 			/*
   2336 			 * No table for this address, meaning nothing
   2337 			 * within this segment; advance to the next
   2338 			 * one.
   2339 			 */
   2340 			sva = nextseg;
   2341 			continue;
   2342 		}
   2343 
   2344 		for (; sva < nextseg; ptep++, sva += PAGE_SIZE) {
   2345 			opte = pte_load(ptep);
   2346 			if (! pte_valid_p(opte)) {
   2347 				continue;
   2348 			}
   2349 #if MMU_CONFIG_HP_CLASS
   2350 			/*
   2351 			 * If all of the PTEs we're zapping have the
    2352 			 * cache-inhibit bit set, all_ci will remain
   2353 			 * non-zero and we'll be able to skip flushing
   2354 			 * the VAC when we're done.
   2355 			 */
   2356 			all_ci &= opte;
   2357 #endif
   2358 			pmap_remove_mapping(pmap, sva, ptep, pt, prm_flags, pc);
   2359 		}
   2360 		pmap_table_release(pmap, pt, pc);
   2361 	}
   2362 #if MMU_CONFIG_HP_CLASS
   2363 	if (MMU_IS_HP_CLASS && !all_ci) {
   2364 		/*
   2365 		 * Cacheable mappings were removed, so invalidate
   2366 		 * the cache.
   2367 		 */
   2368 		if (pmap == pmap_kernel()) {
   2369 			DCIS();
   2370 		} else if (active_user_pmap(pmap)) {
   2371 			DCIU();
   2372 		}
   2373 	}
   2374 #endif
   2375 	PMAP_CRIT_EXIT();
   2376 
   2377 	pmap_completion_fini(pc);
   2378 }
   2379 
   2380 void
   2381 pmap_remove(pmap_t pmap, vaddr_t sva, vaddr_t eva)
   2382 {
   2383 	struct pmap_completion pc;
   2384 	pmap_completion_init(&pc);
   2385 	pmap_remove_internal(pmap, sva, eva, &pc);
   2386 	/* pmap_remove_internal() calls pmap_completion_fini(). */
   2387 }
   2388 
   2389 /*
   2390  * pmap_remove_all:		[ INTERFACE ]
   2391  *
   2392  *	Remove all mappings from a pmap in bulk.  This is only called
   2393  *	when it's known that the address space is no longer visible to
   2394  *	any user process (e.g. during exit or exec).
   2395  */
   2396 bool
   2397 pmap_remove_all(pmap_t pmap)
   2398 {
   2399 	struct pmap_completion pc;
   2400 	struct pv_entry *pv;
   2401 
   2402 	KASSERT(pmap != pmap_kernel());
   2403 
   2404 	/*
   2405 	 * This process is pretty simple:
   2406 	 *
   2407 	 * ==> (1) Set the segment table pointer to the NULL segment table.
   2408 	 *
    2409 	 * ==> (2) Copy the PT page list to a temporary list and re-init.
   2410 	 *
   2411 	 * ==> (3) Walk the PV entry list and remove each entry.
   2412 	 *
   2413 	 * ==> (4) Zero the wired and resident count.
   2414 	 *
   2415 	 * Once we've done that, we just need to free everything
   2416 	 * back to the system.
   2417 	 */
   2418 
   2419 	pmap_completion_init(&pc);
   2420 
   2421 	PMAP_CRIT_ENTER();
   2422 
   2423 	/* Step 1. */
   2424 	pmap_set_lev1map(pmap, NULL, null_segtab_pa);
   2425 
   2426 	/* Step 2. */
   2427 	pmap->pm_pt_cache = NULL;
   2428 	TAILQ_CONCAT(&pc.pc_ptpages, &pmap->pm_ptpages[0], ptp_list);
   2429 	TAILQ_CONCAT(&pc.pc_ptpages, &pmap->pm_ptpages[1], ptp_list);
   2430 	memset(&pmap->pm_tables, 0, sizeof(pmap->pm_tables));
   2431 	rb_tree_init(&pmap->pm_tables, &pmap_table_rb_ops);
   2432 	KASSERT(RB_TREE_MIN(&pmap->pm_tables) == NULL);
   2433 
   2434 	/* Step 3. */
   2435 	while ((pv = LIST_FIRST(&pmap->pm_pvlist)) != NULL) {
   2436 		KASSERT(pv->pv_pmap == pmap);
   2437 		pmap_pv_remove(pmap,
   2438 		    pmap_pa_to_pg(pte_pa(pte_load(pmap_pv_pte(pv)))),
   2439 		    PV_VA(pv), &pc);
   2440 	}
   2441 
   2442 	/* Step 4. */
   2443 	pmap_stat_set(pmap, wired_count, 0);
   2444 	pmap_stat_set(pmap, resident_count, 0);
   2445 
   2446 	PMAP_CRIT_EXIT();
   2447 
   2448 	pmap_completion_fini(&pc);
   2449 
   2450 	return true;
   2451 }
   2452 
   2453 /*
   2454  * pmap_page_protect:		[ INTERFACE ]
   2455  *
    2456  *	Lower the permissions for all mappings to a given page to
    2457  *	those specified.
   2458  */
   2459 void
   2460 pmap_page_protect(struct vm_page *pg, vm_prot_t prot)
   2461 {
   2462 	struct pmap_completion pc;
   2463 	struct pv_entry *pv;
   2464 
   2465 	if (prot & UVM_PROT_WRITE) {
   2466 		/* No protection to revoke. */
   2467 		return;
   2468 	}
   2469 
   2470 	if (prot & UVM_PROT_READ) {
   2471 		/* Making page copy-on-write. */
   2472 		pmap_changebit(pg, PTE_WP, ~0U);
   2473 		return;
   2474 	}
   2475 
   2476 	/* Removing all mappings for a page. */
   2477 	pmap_completion_init(&pc);
   2478 
   2479 	PMAP_CRIT_ENTER();
   2480 
   2481 	while ((pv = VM_MDPAGE_PVS(pg)) != NULL) {
   2482 		pmap_remove_mapping(pv->pv_pmap, PV_VA(pv), pmap_pv_pte(pv),
   2483 		    pv->pv_pt, PRM_TFLUSH|PRM_CFLUSH, &pc);
   2484 	}
   2485 
   2486 	PMAP_CRIT_EXIT();
   2487 
   2488 	pmap_completion_fini(&pc);
   2489 }
   2490 
   2491 /*
   2492  * pmap_protect:		[ INTERFACE ]
   2493  *
   2494  *	Set the physical protection on the specified range of this map
   2495  *	as requested.
   2496  */
   2497 void
   2498 pmap_protect(pmap_t pmap, vaddr_t sva, vaddr_t eva, vm_prot_t prot)
   2499 {
   2500 	pt_entry_t *ptep, opte, npte;
   2501 	struct pmap_table *pt;
   2502 	vaddr_t nextseg;
   2503 #if MMU_CONFIG_68040_CLASS
   2504 	bool removing_write;
   2505 #endif
   2506 	bool need_tflush;
   2507 
   2508 	if ((prot & UVM_PROT_READ) == 0) {
   2509 		struct pmap_completion pc;
   2510 		pmap_completion_init(&pc);
   2511 		pmap_remove_internal(pmap, sva, eva, &pc);
   2512 		/* pmap_remove_internal() calls pmap_completion_fini(). */
   2513 		return;
   2514 	}
   2515 
   2516 	PMAP_CRIT_ENTER();
   2517 
   2518 #if MMU_CONFIG_68040_CLASS
   2519 	removing_write = (prot & UVM_PROT_WRITE) == 0;
   2520 #endif
   2521 	need_tflush = active_pmap(pmap);
   2522 	while (sva < eva) {
   2523 		nextseg = pmap_next_seg(sva);
   2524 		if (nextseg == 0 || nextseg > eva) {
   2525 			nextseg = eva;
   2526 		}
   2527 
   2528 		ptep = pmap_pte_lookup(pmap, sva, &pt);
   2529 		if (ptep == NULL) {
   2530 			/*
   2531 			 * No table for this address, meaning nothing
   2532 			 * within this segment; advance to the next
   2533 			 * one.
   2534 			 */
   2535 			sva = nextseg;
   2536 			continue;
   2537 		}
   2538 
   2539 		/*
   2540 		 * Change protection on mapping if it is valid and doesn't
   2541 		 * already have the correct protection.
   2542 		 */
   2543 		for (; sva < nextseg; ptep++, sva += PAGE_SIZE) {
   2544  try_again:
   2545 			opte = pte_load(ptep);
   2546 			if (! pte_valid_p(opte)) {
   2547 				continue;
   2548 			}
   2549 			npte = pte_change_prot(opte, prot);
   2550 			if (npte == opte) {
   2551 				continue;
   2552 			}
   2553 #if MMU_CONFIG_68040_CLASS
   2554 			if (MMU_IS_68040_CLASS && removing_write) {
   2555 				/*
   2556 				 * Clear caches if making RO (see section
   2557 				 * "7.3 Cache Coherency" in the manual).
   2558 				 */
   2559 				paddr_t pa = pte_pa(opte);
   2560 				DCFP(pa);
   2561 				ICPP(pa);
   2562 			}
   2563 #endif
   2564 			if (! pte_update(ptep, opte, npte)) {
   2565 				/* Lost race updating PTE; try again. */
   2566 				goto try_again;
   2567 			}
   2568 			if (need_tflush) {
   2569 				TBIS(sva);
   2570 			}
   2571 		}
   2572 		pmap_table_release(pmap, pt, NULL);
   2573 	}
   2574 
   2575 	PMAP_CRIT_EXIT();
   2576 }
   2577 
   2578 /*
   2579  * pmap_enter:			[ INTERFACE ]
   2580  *
   2581  *	Insert the given physical address (pa) at the specified
   2582  *	virtual address (va) in the target physical map with the
   2583  *	protection requested.
   2584  *
    2585  *	If specified, the page will be wired down, meaning that the
    2586  *	related PTE cannot be reclaimed.
   2587  *
   2588  *	Note:  This is the only routine which MAY NOT lazy-evaluate
   2589  *	or lose information.  That is, this routine must actually
   2590  *	insert this page into the given map NOW.
   2591  */
   2592 int
   2593 pmap_enter(pmap_t pmap, vaddr_t va, paddr_t pa, vm_prot_t prot, u_int flags)
   2594 {
   2595 	struct pmap_table *pt;
   2596 	pt_entry_t *ptep, npte, opte;
   2597 	struct pv_entry *newpv;
   2598 	struct pmap_completion pc;
   2599 	int error = 0;
   2600 	const bool nowait = !!(flags & PMAP_CANFAIL);
   2601 
   2602 	pmap_completion_init(&pc);
   2603 
   2604 	struct vm_page * const pg = pmap_pa_to_pg(pa);
   2605 	if (__predict_false(pg == NULL)) {
   2606 		/*
   2607 		 * PA is not part of managed memory.  Make the mapping
   2608 		 * cache-inhibited on the assumption that it's a device.
   2609 		 */
   2610 		flags |= PMAP_NOCACHE;
   2611 	}
   2612 
   2613 	PMAP_CRIT_ENTER();
   2614 
   2615 	if (nowait) {
   2616 		pmap_evcnt(enter_nowait);
   2617 	} else {
   2618 		pmap_evcnt(enter_yeswait);
   2619 	}
   2620 
   2621 	/* Get the destination table. */
   2622 	ptep = pmap_pte_alloc(pmap, va, &pt, nowait, &pc);
   2623 	if (__predict_false(ptep == NULL)) {
   2624 		pmap_evcnt(enter_pte_alloc_fail);
   2625 		error = ENOMEM;
   2626 		goto out;
   2627 	}
   2628 
   2629 	/* Compute the new PTE. */
   2630 	npte = pmap_make_pte(pa, prot, flags);
   2631 
   2632 	/* Fetch old PTE. */
   2633 	opte = pte_load(ptep);
   2634 
   2635 	/*
   2636 	 * Check to see if there is a valid mapping at this address.
   2637 	 * It might simply be a wiring or protection change.
   2638 	 */
   2639 	if (pte_valid_p(opte)) {
   2640  		pmap_evcnt(enter_valid);
   2641  restart:
   2642 		if (pte_pa(opte) == pa) {
   2643 			/*
   2644 			 * Just a protection or wiring change.
   2645 			 *
   2646 			 * Since the old PTE is handy, go ahead and update
   2647 			 * the cached U/M attributes now.  Normally we would
   2648 			 * do this in pmap_remove_mapping(), but we're not
   2649 			 * taking that path in this case.  We also add in
   2650 			 * any U/M attributes hinted by the access type
   2651 			 * that brought us to pmap_enter() in the first
   2652 			 * place (a write-fault on a writable page mapped
   2653 			 * read-only during a page-out, for example).
   2654 			 *
   2655 			 * Also ensure that the PV list status of the mapping
   2656 			 * is consistent.
   2657 			 */
   2658 			if (__predict_true(pg != NULL)) {
   2659 				VM_MDPAGE_ADD_UM(pg, opte | npte);
   2660 				KASSERT(pte_managed_p(opte));
   2661 				npte |= PTE_PVLIST;
   2662 			}
   2663 
   2664 			/* Preserve cache-inhibited status. */
   2665 			if (__predict_false(pte_ci_p(opte))) {
   2666 				npte =
   2667 				    (npte & ~PTE_CMASK) | (opte & PTE_CMASK);
   2668 			}
   2669 
   2670 			/* Set the new PTE. */
   2671 			pte_store(ptep, npte);
   2672 
   2673 			const pt_entry_t diff = opte ^ npte;
   2674 
   2675 #ifdef PMAP_EVENT_COUNTERS
   2676 			if (diff & PTE_WIRED) {
   2677 				pmap_evcnt(enter_wire_change);
   2678 			}
   2679 			if (diff & PTE_WP) {
   2680 				pmap_evcnt(enter_prot_change);
   2681 			}
   2682 #endif
   2683 
   2684 			if (pte_wired_p(diff)) {
   2685 				pmap_stat_update(pmap, wired_count,
   2686 				    pte_wired_p(npte) ? 1 : -1);
   2687 			}
   2688 			if (diff & PTE_CRIT_BITS) {
   2689 #if MMU_CONFIG_68040_CLASS
   2690 				/*
   2691 				 * Protection or caching status is changing;
   2692 				 * flush the page from the cache.
   2693 				 */
   2694 				if (MMU_IS_68040_CLASS) {
   2695 					DCFP(pa);
   2696 					ICPP(pa);
   2697 				}
   2698 #endif
   2699 				if (active_pmap(pmap)) {
   2700 					TBIS(va);
   2701 #if MMU_CONFIG_HP_CLASS
   2702 					/*
   2703 					 * If the new mapping is CI and the old
   2704 					 * one is not, then flush the VAC.
   2705 					 */
   2706 					if (__predict_false(MMU_IS_HP_CLASS &&
   2707 							    pte_ci_p(diff) &&
   2708 							    pte_ci_p(npte))) {
   2709 						DCIA();
   2710 					}
   2711 #endif
   2712 				}
   2713 			}
   2714 
   2715 			/* All done! */
   2716 			goto out_release;
   2717 		}
   2718 
   2719 		/*
   2720 		 * The mapping has completely changed.  Need to remove
   2721 		 * the old one first.
   2722 		 *
   2723 		 * This drops the retain count on the PT owned by the
   2724 		 * previous mapping, but the newly-entered mapping will
   2725 		 * inherit the retain count taken when we looked up the
   2726 		 * PTE.
   2727 		 *
   2728 		 * XXX Can we elide the ATC flush here?  We're going to
   2729 		 * XXX hit the ATC after setting the new PTE anyway.
   2730 		 */
   2731 		pmap_evcnt(enter_pa_change);
   2732 		pmap_remove_mapping(pmap, va, ptep, pt,
   2733 		    PRM_TFLUSH|PRM_CFLUSH, &pc);
   2734 	}
   2735 
   2736 	/*
   2737 	 * By the time we get here, we should be assured that the
   2738 	 * PTE at ptep is invalid.
   2739 	 */
   2740 	KASSERT(! pte_valid_p(pte_load(ptep)));
   2741 
   2742 	/* Update pmap stats now. */
   2743 	pmap_stat_update(pmap, resident_count, 1);
   2744 	if (__predict_false(pte_wired_p(npte))) {
   2745 		pmap_stat_update(pmap, wired_count, 1);
   2746 	}
   2747 
   2748 	if (__predict_true(pg != NULL)) {
   2749 		/*
   2750 		 * Managed pages also go on the PV list, so we are
   2751 		 * going to need a PV entry.
   2752 		 */
   2753 		newpv = LIST_FIRST(&pc.pc_pvlist);
   2754 		if (__predict_true(newpv == NULL)) {
   2755 			/*
   2756 			 * No PV entry to recycle; allocate a new one.
   2757 			 * Because this is an extremely common case, we
   2758 			 * are first going to attempt allocation while
   2759 			 * still in the critical section.  If that fails
   2760 			 * and waiting is allowed, we'll leave the critical
   2761 			 * section and try a blocking allocation.
   2762 			 */
   2763 			newpv = pmap_pv_alloc(true/*nowait flag*/);
   2764 			if (__predict_false(newpv == NULL)) {
   2765 				if (nowait) {
   2766 					pmap_evcnt(enter_pv_alloc_fail);
   2767 					error = ENOMEM;
   2768 					goto out_release;
   2769 				}
   2770 				PMAP_CRIT_EXIT();
   2771 				newpv = pmap_pv_alloc(false/*nowait flag*/);
   2772 				KASSERT(newpv != NULL);
   2773 				PMAP_CRIT_ENTER();
   2774 				/*
   2775 				 * Because we may have blocked while allocating
   2776 				 * the PV entry, we have to re-validate our
   2777 				 * environment, as another thread could have
   2778 				 * inserted a mapping here behind our back.
   2779 				 */
   2780 				opte = pte_load(ptep);
   2781 				if (__predict_false(pte_valid_p(opte))) {
   2782 					pmap_stat_update(pmap,
   2783 					    resident_count, -1);
   2784 					if (pte_wired_p(npte)) {
   2785 						pmap_stat_update(pmap,
   2786 						    wired_count, -1);
   2787 					}
   2788 					LIST_INSERT_HEAD(&pc.pc_pvlist,
   2789 					    newpv, pv_pmlist);
   2790 					goto restart;
   2791 				}
   2792 			}
   2793 		} else {
   2794 			pmap_evcnt(enter_pv_recycle);
   2795 			LIST_REMOVE(newpv, pv_pmlist);
   2796 		}
   2797 
   2798 		/*
   2799 		 * Enter the mapping into the PV list.  pmap_pv_enter()
   2800 		 * will also set the PTE in the table.
   2801 		 */
   2802 		pmap_pv_enter(pmap, pg, va, prot, pt, npte, newpv);
   2803 
   2804 		/*
   2805 		 * The new mapping takes ownership of the PT
   2806 		 * retain count we took while looking up the PTE.
   2807 		 */
   2808 		goto out_crit_exit;
   2809 	}
   2810 
   2811 	/*
   2812 	 * Not a managed mapping, so set the new PTE.  As with managed
   2813 	 * mappings, the new mapping takes ownership of the PT retain
   2814 	 * count we took while looking up the PTE.
   2815 	 */
   2816 	pte_store(ptep, npte);
   2817 
   2818 	/*
   2819 	 * See comments in pmap_pv_enter() for why we have to hit
   2820 	 * the ATC here.
   2821 	 */
   2822 	if (active_pmap(pmap)) {
   2823 		TBIS(va);
   2824 	}
   2825 	goto out_crit_exit;
   2826 
   2827  out_release:
   2828 	pmap_table_release(pmap, pt, &pc);
   2829  out_crit_exit:
   2830 	PMAP_CRIT_EXIT();
   2831  out:
   2832 	pmap_completion_fini(&pc);
   2833 	return error;
   2834 }
   2835 
   2836 /*
   2837  * pmap_kenter_pa:		[ INTERFACE ]
   2838  *
   2839  *	Enter a va -> pa mapping into the kernel pmap without any
   2840  *	physical->virtual tracking.
   2841  */
   2842 void
   2843 pmap_kenter_pa(vaddr_t va, paddr_t pa, vm_prot_t prot, u_int flags)
   2844 {
   2845 	pmap_t const pmap = pmap_kernel();
   2846 
   2847 	KASSERT(va >= VM_MIN_KERNEL_ADDRESS);
   2848 
   2849 	pt_entry_t * const ptep = pmap_kernel_pte(va);
   2850 
   2851 	/* Build the new PTE. */
   2852 	const pt_entry_t npte = pmap_make_pte(pa, prot, flags | PMAP_WIRED);
   2853 
   2854 	/*
   2855 	 * If this is an EXEC mapping, then we have to ensure that
   2856 	 * the I$ doesn't load stale data.
   2857 	 */
   2858 	if (__predict_false(prot & UVM_PROT_EXEC)) {
   2859 #if MMU_CONFIG_68040_CLASS
   2860 		if (MMU_IS_68040_CLASS) {
   2861 			/*
   2862 			 * XXX Potential future optimization: is only
   2863 			 * XXX the DCFP() needed here to deal with
   2864 			 * XXX write-back?
   2865 			 */
   2866 			DCFP(pa);
   2867 			ICPP(pa);
   2868 		}
   2869 #endif
   2870 	}
   2871 
   2872 	/* Set the new PTE. */
   2873 	const pt_entry_t opte = pte_load(ptep);
   2874 	pte_store(ptep, npte);
   2875 
   2876 	/*
    2877 	 * See comments in pmap_pv_enter() as to why we hit the ATC here.
   2878 	 * This *should* be unnecessary because this is a wired kernel
   2879 	 * mapping and no demand-page-ins should have happened at this
   2880 	 * VA, but we're erring on the side of caution for now.
   2881 	 */
   2882 	TBIS(va);
   2883 
    2884 	/*
    2885 	 * There should not have been anything mapped here previously;
    2886 	 * if there was, fix up the statistics for the old mapping.
    2887 	 */
   2888 	if (__predict_false(pte_valid_p(opte))) {
   2889 		if (__predict_false(pte_managed_p(opte))) {
   2890 			/*
   2891 			 * Can't handle this case and it's a legitimate
   2892 			 * error if it happens.
   2893 			 */
   2894 			panic("%s: old mapping was managed", __func__);
   2895 		}
   2896 		if (__predict_false(! pte_wired_p(opte))) {
   2897 			pmap_stat_update(pmap, wired_count, 1);
   2898 		}
   2899 	} else {
   2900 		pmap_stat_update(pmap, resident_count, 1);
   2901 		pmap_stat_update(pmap, wired_count, 1);
   2902 	}
   2903 }
   2904 
   2905 /*
   2906  * pmap_kremove:		[ INTERFACE ]
   2907  *
   2908  *	Remove a mapping entered with pmap_kenter_pa() starting at va,
   2909  *	for size bytes (assumed to be page rounded).
   2910  */
   2911 void
   2912 pmap_kremove(vaddr_t va, vsize_t size)
   2913 {
   2914 	pt_entry_t *ptep, opte;
   2915 	pmap_t const pmap = pmap_kernel();
   2916 	int count = 0;
   2917 #if MMU_CONFIG_HP_CLASS
   2918 	pt_entry_t all_ci = PTE51_CI;
   2919 #endif
   2920 
   2921 	KASSERT(va >= VM_MIN_KERNEL_ADDRESS);
   2922 
   2923 	for (ptep = pmap_kernel_pte(va); size != 0;
   2924 	     ptep++, size -= PAGE_SIZE, va += PAGE_SIZE) {
   2925 		opte = pte_load(ptep);
   2926 		if (pte_valid_p(opte)) {
   2927 			KASSERT(! pte_managed_p(opte));
   2928 			KASSERT(pte_wired_p(opte));
   2929 #if MMU_CONFIG_HP_CLASS
   2930 			/*
   2931 			 * If all of the PTEs we're zapping have the
    2932 			 * cache-inhibit bit set, all_ci will remain
   2933 			 * non-zero and we'll be able to skip flushing
   2934 			 * the VAC when we're done.
   2935 			 */
   2936 			all_ci &= opte;
   2937 #endif
   2938 			/* Zap the mapping. */
   2939 			pte_store(ptep, 0);
   2940 			TBIS(va);
   2941 			count++;
   2942 		}
   2943 	}
   2944 #if MMU_CONFIG_HP_CLASS
   2945 	if (MMU_IS_HP_CLASS && !all_ci) {
   2946 		/*
   2947 		 * Cacheable mappings were removed, so invalidate
   2948 		 * the cache.
   2949 		 */
   2950 		DCIS();
   2951 	}
   2952 #endif
   2953 	/* Update stats. */
   2954 	if (__predict_true(count != 0)) {
   2955 		pmap_stat_update(pmap, resident_count, -count);
   2956 		pmap_stat_update(pmap, wired_count, -count);
   2957 	}
   2958 }
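
         /*
          * Illustrative usage sketch (assumed caller, not part of this
          * file): a temporary kernel mapping of an unmanaged (e.g. device)
          * page would pair the two interfaces like so:
          *
          *	pmap_kenter_pa(va, pa, UVM_PROT_READ | UVM_PROT_WRITE,
          *	    PMAP_NOCACHE);
          *	pmap_update(pmap_kernel());
          *	... access the mapping at va ...
          *	pmap_kremove(va, PAGE_SIZE);
          *	pmap_update(pmap_kernel());
          *
          * No P->V tracking is performed for such mappings (note the
          * KASSERTs in pmap_kremove() above).
          */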
   2959 
   2960 /*
   2961  * pmap_unwire:			[ INTERFACE ]
   2962  *
   2963  *	Clear the wired attribute for a map/virtual-address pair.
   2964  *
   2965  *	The mapping must already exist in the pmap.
   2966  */
   2967 void
   2968 pmap_unwire(pmap_t pmap, vaddr_t va)
   2969 {
   2970 	struct pmap_table *pt;
   2971 	pt_entry_t opte, npte, *ptep;
   2972 
   2973 	PMAP_CRIT_ENTER();
   2974 
   2975 	ptep = pmap_pte_lookup(pmap, va, &pt);
   2976 	KASSERT(ptep != NULL);
   2977 
   2978 	for (;;) {
   2979 		opte = pte_load(ptep);
   2980 		KASSERT(pte_valid_p(opte));
   2981 
   2982 		/*
   2983 		 * If the wiring actually changed (always?), clear the wire
   2984 		 * bit and update the wire count.  Note that the wiring is
   2985 		 * not a hardware characteristic so there is no need to
   2986 		 * invalidate the ATC.
   2987 		 */
   2988 		if (! pte_wired_p(opte)) {
   2989 			break;
   2990 		}
   2991 		npte = opte & ~PTE_WIRED;
   2992 		if (pte_update(ptep, opte, npte)) {
   2993 			pmap_stat_update(pmap, wired_count, -1);
   2994 			break;
   2995 		}
   2996 	}
   2997 
   2998 	pmap_table_release(pmap, pt, NULL);
   2999 
   3000 	PMAP_CRIT_EXIT();
   3001 }
   3002 
   3003 /*
   3004  * pmap_extract:		[ INTERFACE ]
   3005  *
   3006  *	Extract the physical address associated with the given
   3007  *	pmap/virtual address pair.
   3008  *
   3009  * pmap_extract_info:
   3010  *
   3011  *	Like pmap_extract(), but also returns information
   3012  *	about the mapping (wired, cache-inhibited, etc.)
   3013  */
   3014 bool
   3015 pmap_extract_info(pmap_t pmap, vaddr_t va, paddr_t *pap, int *flagsp)
   3016 {
   3017 	struct pmap_table *pt;
   3018 	pt_entry_t pte, *ptep;
   3019 	bool rv = false;
   3020 
   3021 	if (__predict_false(pmap == pmap_kernel() &&
   3022 			    va >= kernel_virtual_end)) {
   3023 		return false;
   3024 	}
   3025 
   3026 	PMAP_CRIT_ENTER();
   3027 
   3028 	ptep = pmap_pte_lookup(pmap, va, &pt);
   3029 	if (__predict_true(ptep != NULL)) {
   3030 		pte = pte_load(ptep);
   3031 		if (__predict_true(pte_valid_p(pte))) {
   3032 			if (__predict_true(pap != NULL)) {
   3033 				*pap = pte_pa(pte) | (va & PGOFSET);
   3034 			}
   3035 			if (__predict_false(flagsp != NULL)) {
   3036 				*flagsp =
   3037 				    (pte_wired_p(pte) ? PMAP_WIRED : 0) |
   3038 				    (pte_ci_p(pte) ? PMAP_NOCACHE : 0);
   3039 			}
   3040 			rv = true;
   3041 		}
   3042 		pmap_table_release(pmap, pt, NULL);
   3043 	}
   3044 
   3045 	PMAP_CRIT_EXIT();
   3046 
   3047 	return rv;
   3048 }
   3049 
   3050 bool
   3051 pmap_extract(pmap_t pmap, vaddr_t va, paddr_t *pap)
   3052 {
   3053 	return pmap_extract_info(pmap, va, pap, NULL);
   3054 }
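
         /*
          * Illustrative sketch (assumed caller): pmap_extract_info() lets a
          * caller recover both the physical address and the mapping
          * attributes in a single lookup:
          *
          *	paddr_t pa;
          *	int flags;
          *
          *	if (pmap_extract_info(pmap, va, &pa, &flags)) {
          *		if (flags & PMAP_NOCACHE) {
          *			... mapping is cache-inhibited ...
          *		}
          *		if (flags & PMAP_WIRED) {
          *			... mapping is wired ...
          *		}
          *	}
          *
          * vtophys() below is simply the address-only flavor of this for
          * the kernel pmap.
          */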
   3055 
   3056 /*
   3057  * vtophys:
   3058  *
   3059  *	Dumber version of pmap_extract(pmap_kernel(), ...)
   3060  */
   3061 paddr_t
   3062 vtophys(vaddr_t va)
   3063 {
   3064 	paddr_t pa;
   3065 	bool rv __diagused;
   3066 
   3067 	rv = pmap_extract_info(pmap_kernel(), va, &pa, NULL);
   3068 	KASSERT(rv);
   3069 	return rv ? pa : -1;
   3070 }
   3071 
   3072 /*
   3073  * kvtop:
   3074  *
   3075  *	Sigh.
   3076  */
   3077 int
   3078 kvtop(void *v)
   3079 {
   3080 	return (int)vtophys((vaddr_t)v);
   3081 }
   3082 
   3083 /*
   3084  * pmap_copy:			[ INTERFACE ]
   3085  *
   3086  *	Copy the mapping range specified by src_addr/len
   3087  *	from the source map to the range dst_addr/len
   3088  *	in the destination map.
   3089  *
   3090  *	This routine is only advisory and need not do anything.
   3091  */
   3092 /* call deleted in <machine/pmap.h> */
   3093 
   3094 /*
   3095  * pmap_update:			[ INTERFACE ]
   3096  *
   3097  *	Require that all active physical maps contain no
   3098  *	incorrect entries NOW, by processing any deferred
   3099  *	pmap operations.
   3100  */
   3101 /* call deleted in <machine/pmap.h> */
   3102 
   3103 /*
   3104  * pmap_activate:		[ INTERFACE ]
   3105  *
   3106  *	Activate the pmap used by the specified process.  This includes
   3107  *	reloading the MMU context of the current process, and marking
   3108  *	the pmap in use by the processor.
   3109  */
   3110 void
   3111 pmap_activate(struct lwp *l)
   3112 {
   3113 	pmap_t pmap = l->l_proc->p_vmspace->vm_map.pmap;
   3114 
   3115 	KASSERT(l == curlwp);
   3116 
   3117 	/*
   3118 	 * Because the kernel has a separate root pointer, we don't
   3119 	 * need to activate the kernel pmap.
   3120 	 */
   3121 	if (pmap != pmap_kernel()) {
   3122 		PMAP_CRIT_ENTER();
   3123 		pmap_load_urp(pmap->pm_lev1pa);
   3124 		PMAP_CRIT_EXIT();
   3125 	}
   3126 }
   3127 
   3128 /*
   3129  * pmap_deactivate:		[ INTERFACE ]
   3130  *
   3131  *	Mark that the pmap used by the specified process is no longer
   3132  *	in use by the processor.
   3133  */
   3134 void
   3135 pmap_deactivate(struct lwp *l)
   3136 {
   3137 	/* No action necessary in this pmap implementation. */
   3138 }
   3139 
   3140 static vaddr_t pmap_tmpmap_srcva;
   3141 static vaddr_t pmap_tmpmap_dstva;
   3142 
   3143 /*
   3144  * pmap_zero_page:		[ INTERFACE ]
   3145  *
   3146  *	Zero the specified VM page by mapping the page into the kernel
   3147  *	and using memset() (or equivalent) to clear its contents.
   3148  */
   3149 void
   3150 pmap_zero_page(paddr_t pa)
   3151 {
   3152 	const int flags = MMU_IS_HP_CLASS ? PMAP_NOCACHE|PMAP_WIRED
   3153 					  : PMAP_WIRED;
   3154 	pt_entry_t * const dst_ptep = pmap_kernel_pte(pmap_tmpmap_dstva);
   3155 
   3156 	/* Build the new PTE. */
   3157 	const pt_entry_t dst_pte =
   3158 	    pmap_make_pte(pa, UVM_PROT_READ | UVM_PROT_WRITE, flags);
   3159 
   3160 	/* Set the new PTE. */
   3161 	KASSERT(! pte_valid_p(pte_load(dst_ptep)));
   3162 	pte_store(dst_ptep, dst_pte);
   3163 	/* XXX Possibly being over-cautious here; see pmap_kenter_pa(). */
   3164 	TBIS(pmap_tmpmap_dstva);
   3165 
   3166 	/* Zero the page. */
   3167 	zeropage((void *)pmap_tmpmap_dstva);
   3168 
   3169 	/* Invalidate the PTEs. */
   3170 	pte_store(dst_ptep, 0);
   3171 	TBIS(pmap_tmpmap_dstva);
   3172 }
   3173 
   3174 /*
   3175  * pmap_copy_page:		[ INTERFACE ]
   3176  *
   3177  *	Copy the specified VM page by mapping the page(s) into the kernel
   3178  *	and using memcpy() (or equivalent).
   3179  */
   3180 void
   3181 pmap_copy_page(paddr_t src, paddr_t dst)
   3182 {
   3183 	const int flags = MMU_IS_HP_CLASS ? PMAP_NOCACHE|PMAP_WIRED
   3184 					  : PMAP_WIRED;
   3185 	pt_entry_t * const src_ptep = pmap_kernel_pte(pmap_tmpmap_srcva);
   3186 	pt_entry_t * const dst_ptep = pmap_kernel_pte(pmap_tmpmap_dstva);
   3187 
   3188 	/* Build the new PTEs. */
   3189 	const pt_entry_t src_pte =
   3190 	    pmap_make_pte(src, UVM_PROT_READ, flags);
   3191 	const pt_entry_t dst_pte =
   3192 	    pmap_make_pte(dst, UVM_PROT_READ | UVM_PROT_WRITE, flags);
   3193 
   3194 	/* Set the new PTEs. */
   3195 	KASSERT(! pte_valid_p(pte_load(src_ptep)));
   3196 	pte_store(src_ptep, src_pte);
   3197 	/* XXX Possibly being over-cautious here; see pmap_kenter_pa(). */
   3198 	TBIS(pmap_tmpmap_srcva);
   3199 
   3200 	KASSERT(! pte_valid_p(pte_load(dst_ptep)));
   3201 	pte_store(dst_ptep, dst_pte);
   3202 	/* XXX Possibly being over-cautious here; see pmap_kenter_pa(). */
   3203 	TBIS(pmap_tmpmap_dstva);
   3204 
   3205 	/* Copy the page. */
   3206 	copypage((void *)pmap_tmpmap_srcva, (void *)pmap_tmpmap_dstva);
   3207 
   3208 	/* Invalidate the PTEs. */
   3209 	pte_store(src_ptep, 0);
   3210 	TBIS(pmap_tmpmap_srcva);
   3211 
   3212 	pte_store(dst_ptep, 0);
   3213 	TBIS(pmap_tmpmap_dstva);
   3214 }
   3215 
   3216 /*
   3217  * pmap_clear_modify:		[ INTERFACE ]
   3218  *
   3219  *	Clear the modify bits on the specified physical page.
   3220  */
   3221 /* See <machine/pmap.h> */
   3222 
   3223 /*
   3224  * pmap_clear_reference:	[ INTERFACE ]
   3225  *
   3226  *	Clear the reference bit on the specified physical page.
   3227  */
   3228 /* See <machine/pmap.h> */
   3229 
   3230 /*
   3231  * pmap_is_referenced:		[ INTERFACE ]
   3232  *
   3233  *	Return whether or not the specified physical page has been referenced
   3234  *	by any physical maps.
   3235  */
   3236 /* See <machine/pmap.h> */
   3237 
   3238 /*
   3239  * pmap_is_modified:		[ INTERFACE ]
   3240  *
   3241  *	Return whether or not the specified physical page has been modified
   3242  *	by any physical maps.
   3243  */
   3244 /* See <machine/pmap.h> */
   3245 
   3246 /*
   3247  * pmap_testbit:
   3248  *
   3249  *	Test the modified / referenced bits of a physical page.
   3250  */
   3251 bool
   3252 pmap_testbit(struct vm_page *pg, pt_entry_t bit)
   3253 {
   3254 	struct pv_entry *pv;
   3255 	pt_entry_t pte = 0;
   3256 	bool rv = false;
   3257 
   3258 	PMAP_CRIT_ENTER();
   3259 
   3260 	if (VM_MDPAGE_UM(pg) & bit) {
   3261 		rv = true;
   3262 		goto out;
   3263 	}
   3264 
   3265 	for (pv = VM_MDPAGE_PVS(pg); pv != NULL; pv = pv->pv_next) {
   3266 		pte |= pte_load(pmap_pv_pte(pv));
   3267 		if (pte & bit) {
   3268 			rv = true;
   3269 			break;
   3270 		}
   3271 	}
   3272 	VM_MDPAGE_ADD_UM(pg, pte);
   3273  out:
   3274 	PMAP_CRIT_EXIT();
   3275 
   3276 	return rv;
   3277 }
   3278 
   3279 /*
   3280  * pmap_changebit:
   3281  *
   3282  *	Test-and-change various bits (including mod/ref bits).
   3283  */
   3284 bool
   3285 pmap_changebit(struct vm_page *pg, pt_entry_t set, pt_entry_t mask)
   3286 {
   3287 	struct pv_entry *pv;
   3288 	pt_entry_t *ptep, combined_pte = 0, diff, opte, npte;
   3289 	bool rv = false;
   3290 
   3291 #if MMU_CONFIG_68040_CLASS
   3292 	/*
   3293 	 * If we're making the page read-only or changing the caching
   3294 	 * status of the page, we need to flush it the first time we
   3295 	 * change a mapping.
   3296 	 */
   3297 	bool cflush_040;
   3298 	if (MMU_IS_68040_CLASS &&
   3299 	    ((set  & PTE_CRIT_BITS) != 0 ||
   3300 	     (mask & PTE_CRIT_BITS) == 0)) {
   3301 		cflush_040 = true;
   3302 	} else {
   3303 		cflush_040 = false;
   3304 	}
   3305 #endif
   3306 
   3307 	PMAP_CRIT_ENTER();
   3308 
   3309 	/*
   3310 	 * Since we're running over every mapping for the page anyway,
   3311 	 * we might as well synchronize any attribute bits that we're
   3312 	 * not clearing.
   3313 	 */
   3314 	for (pv = VM_MDPAGE_PVS(pg); pv != NULL; pv = pv->pv_next) {
   3315 		for (;;) {
   3316 			ptep = pmap_pv_pte(pv);
   3317 			opte = pte_load(ptep);
   3318 			npte = (opte | set) & mask;
   3319 			if ((diff = (opte ^ npte)) == 0) {
   3320 				break;
   3321 			}
   3322 #if MMU_CONFIG_68040_CLASS
   3323 			if (__predict_false(cflush_040)) {
   3324 				paddr_t pa = VM_PAGE_TO_PHYS(pg);
   3325 				DCFP(pa);
   3326 				ICPP(pa);
   3327 				cflush_040 = false;
   3328 			}
   3329 #endif
   3330 			if (pte_update(ptep, opte, npte)) {
   3331 				rv = true;
   3332 				break;
   3333 			}
   3334 			/* Lost race, try again. */
   3335 		}
   3336 		combined_pte |= opte;
   3337 		if ((diff & PTE_CRIT_BITS) != 0 && active_pmap(pv->pv_pmap)) {
   3338 			TBIS(PV_VA(pv));
   3339 		}
   3340 	}
   3341 
   3342 	/*
   3343 	 * Update any attributes we looked at, clear the ones we're clearing.
   3344 	 */
   3345 	VM_MDPAGE_SET_UM(pg,
   3346 	    (VM_MDPAGE_UM(pg) | combined_pte | set) & mask);
   3347 
   3348 	PMAP_CRIT_EXIT();
   3349 
   3350 	return rv;
   3351 }
   3352 
   3353 /*
   3354  * pmap_phys_address:		[ INTERFACE ]
   3355  *
   3356  *	Return the physical address corresponding to the specified
   3357  *	cookie.  Used by the device pager to decode a device driver's
   3358  *	mmap entry point return value.
   3359  */
   3360 paddr_t
   3361 pmap_phys_address(paddr_t cookie)
   3362 {
   3363 	return m68k_ptob(cookie);
   3364 }
   3365 
   3366 static pt_entry_t *kernel_lev1map;
   3367 
   3368 /*
   3369  * pmap_growkernel_alloc_page:
   3370  *
   3371  *	Helper for pmap_growkernel().
   3372  */
   3373 static paddr_t
   3374 pmap_growkernel_alloc_page(void)
   3375 {
   3376 	/*
   3377 	 * XXX Needs more work if we're going to do this during
   3378 	 * XXX early bootstrap.
   3379 	 */
   3380 	if (! uvm.page_init_done) {
   3381 		panic("%s: called before UVM initialized", __func__);
   3382 	}
   3383 
   3384 	struct vm_page *pg = pmap_page_alloc(true/*nowait*/);
   3385 	if (pg == NULL) {
   3386 		panic("%s: out of memory", __func__);
   3387 	}
   3388 
   3389 	paddr_t pa = VM_PAGE_TO_PHYS(pg);
   3390 	pmap_zero_page(pa);
   3391 #if MMU_CONFIG_68040_CLASS
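        	/*
        	 * Push the freshly-zeroed page out of the data cache: it is
        	 * about to become a kernel PT page, and the 68040/68060
        	 * hardware table walk fetches descriptors from memory without
        	 * consulting the on-chip (copyback) data cache.
        	 */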
   3392 	if (MMU_IS_68040_CLASS) {
   3393 		DCFP(pa);
   3394 	}
   3395 #endif
   3396 	return pa;
   3397 }
   3398 
   3399 /*
   3400  * pmap_growkernel_link_kptpage:
   3401  *
   3402  *	Helper for pmap_growkernel().
   3403  */
   3404 static void
   3405 pmap_growkernel_link_kptpage(vaddr_t va, paddr_t ptp_pa)
   3406 {
   3407 	/*
   3408 	 * This is trivial for the 2-level MMU configuration.
   3409 	 */
   3410 	if (MMU_USE_2L) {
   3411 		KASSERT((kernel_lev1map[LA2L_RI(va)] & DT51_SHORT) == 0);
   3412 		kernel_lev1map[LA2L_RI(va)] = pmap_ste_proto | ptp_pa;
   3413 		return;
   3414 	}
   3415 
   3416 	/*
   3417 	 * N.B. pmap_zero_page() is used in this process, which
   3418 	 * uses pmap_tmpmap_dstva.  pmap_tmpmap_srcva is available
   3419 	 * for our use, however, so that's what we use to temporarily
   3420 	 * map inner segment table pages.
   3421 	 */
   3422 	const vaddr_t stpg_va = pmap_tmpmap_srcva;
   3423 
   3424 	paddr_t stpa, stpg_pa, stpgoff, last_stpg_pa = (paddr_t)-1;
   3425 	paddr_t pa = ptp_pa, end_pa = ptp_pa + PAGE_SIZE;
   3426 	pt_entry_t *stes;
   3427 
   3428 	for (; pa < end_pa; va += NBSEG3L, pa += TBL40_L3_SIZE) {
   3429 		if ((kernel_lev1map[LA40_RI(va)] & UTE40_RESIDENT) == 0) {
   3430 			/* Level-2 table for this segment needed. */
   3431 			if (kernel_stnext_pa == kernel_stnext_endpa) {
   3432 				/*
   3433 				 * No more slots left in the last page
   3434 				 * we allocated for segment tables.  Grab
   3435 				 * another one.
   3436 				 */
   3437 				kernel_stnext_pa = pmap_growkernel_alloc_page();
   3438 				kernel_stnext_endpa =
   3439 				    kernel_stnext_pa + PAGE_SIZE;
   3440 				pmap_nkstpages_current_ev.ev_count++;
   3441 			}
   3442 			kernel_lev1map[LA40_RI(va)] =
   3443 			    pmap_ste_proto | kernel_stnext_pa;
   3444 			kernel_stnext_pa += TBL40_L2_SIZE;
   3445 		}
   3446 		stpa = kernel_lev1map[LA40_RI(va)] & UTE40_PTA;
   3447 		stpg_pa = m68k_trunc_page(stpa);
   3448 		if (stpg_pa != last_stpg_pa) {
   3449 			if (last_stpg_pa != (paddr_t)-1) {
   3450 				pmap_kremove(stpg_va, PAGE_SIZE);
   3451 			}
   3452 			pmap_kenter_pa(stpg_va, stpg_pa,
   3453 			    UVM_PROT_READ | UVM_PROT_WRITE,
   3454 			    PMAP_WIRED | PMAP_NOCACHE);
   3455 			last_stpg_pa = stpg_pa;
   3456 		}
   3457 		stpgoff = stpa - stpg_pa;
   3458 		stes = (pt_entry_t *)(stpg_va + stpgoff);
   3459 		stes[LA40_PI(va)] = pmap_ste_proto | pa;
   3460 	}
   3461 	if (last_stpg_pa != (paddr_t)-1) {
   3462 		pmap_kremove(stpg_va, PAGE_SIZE);
   3463 	}
   3464 }
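
        /*
         * For reference (M68040 User's Manual), the 68040/68060 table walk
         * splits a logical address into a root index (bits 31-25), a pointer
         * index (bits 24-18), and a page index (bits 17-12 for 4KB pages,
         * 17-13 for 8KB pages).  The root and pointer tables each hold 128
         * 4-byte descriptors, and a leaf table holds 64 (4KB pages) or 32
         * (8KB pages), so either way one leaf table maps a 256KB "segment",
         * which is the NBSEG3L stride used above.  The LA40_RI() / LA40_PI()
         * macros presumably extract the root and pointer index fields,
         * respectively.
         */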
   3465 
   3466 /*
   3467  * pmap_growkernel:		[ INTERFACE ]
   3468  *
   3469  *	Grow the kernel address space.  This is a hint from the
   3470  *	upper layer to pre-allocate more kernel PT pages.
   3471  */
   3472 vaddr_t
   3473 pmap_growkernel(vaddr_t maxkvaddr)
   3474 {
   3475 	PMAP_CRIT_ENTER();
   3476 
   3477 	KASSERT((kernel_virtual_end & PTPAGEVAOFS) == 0);
   3478 
   3479 	vaddr_t new_maxkva = pmap_round_ptpage(maxkvaddr);
   3480 	if (new_maxkva < kernel_virtual_end) {
   3481 		/*
   3482 		 * Great news!  We already have what we need to map
   3483 		 * the requested max address.  This happens only during
   3484 		 * early bootstrap before UVM's notion of "maxkvaddr"
   3485 		 * has been initialized.
   3486 		 */
   3487 		new_maxkva = kernel_virtual_end;
   3488 		goto done;
   3489 	}
   3490 
   3491 	if (new_maxkva > kernel_virtual_max) {
   3492 		panic("%s: out of kernel VA space (req=0x%08lx limit=0x%08lx)",
   3493 		    __func__, maxkvaddr, kernel_virtual_max);
   3494 	}
   3495 
   3496 	/*
   3497 	 * Allocate PT pages and link them into the MMU tree as we
   3498 	 * go.
   3499 	 */
   3500 	vaddr_t va, ptp_pa;
   3501 	for (va = kernel_virtual_end; va < new_maxkva; va += PTPAGEVASZ) {
   3502 		/* Allocate page and link it into the MMU tree. */
   3503 		ptp_pa = pmap_growkernel_alloc_page();
   3504 		pmap_growkernel_link_kptpage(va, ptp_pa);
   3505 		pmap_nkptpages_current_ev.ev_count++;
   3506 
   3507 		/* Map the PT page into the kernel PTE array. */
   3508 		pmap_kenter_pa((vaddr_t)pmap_kernel_pte(va),
   3509 		    ptp_pa, UVM_PROT_READ | UVM_PROT_WRITE,
   3510 		    PMAP_WIRED | PMAP_NOCACHE);
   3511 	}
   3512 	kernel_virtual_end = new_maxkva;
   3513  done:
   3514 	pmap_maxkva_ev.ev_count32 = new_maxkva;
   3515 	pmap_kvalimit_ev.ev_count32 = kernel_virtual_max;
   3516 	PMAP_CRIT_EXIT();
   3517 	return new_maxkva;
   3518 }
   3519 
   3520 /*
   3521  * pmap_prefer:			[ INTERFACE ]
   3522  *
   3523  *	Attempt to arrange for pages at a given VM object offset
   3524  *	to occupy the same virtually-addressed cache footprint
   3525  *	in order to avoid cache aliases.
   3526  */
   3527 #if MMU_CONFIG_HP_CLASS
   3528 static struct evcnt pmap_prefer_nochange_ev =
   3529     EVCNT_INITIALIZER(EVCNT_TYPE_MISC, NULL, "pmap prefer", "nochange");
   3530 static struct evcnt pmap_prefer_change_ev =
   3531     EVCNT_INITIALIZER(EVCNT_TYPE_MISC, NULL, "pmap prefer", "change");
   3532 
   3533 EVCNT_ATTACH_STATIC(pmap_prefer_change_ev);
   3534 EVCNT_ATTACH_STATIC(pmap_prefer_nochange_ev);
   3535 #endif
   3536 void
   3537 pmap_prefer(vaddr_t hint, vaddr_t *vap, int td)
   3538 {
   3539 #if MMU_CONFIG_HP_CLASS
   3540 	if (MMU_IS_HP_CLASS) {
   3541 		vaddr_t va = *vap;
   3542 		ptrdiff_t diff = (hint - va) & pmap_aliasmask;
   3543 
   3544 		if (diff == 0) {
   3545 			pmap_prefer_nochange_ev.ev_count++;
   3546 		} else {
   3547 			pmap_prefer_change_ev.ev_count++;
   3548 			if (__predict_false(td)) {
   3549 				va -= pmap_aliasmask + 1;
   3550 			}
   3551 			*vap = va + diff;
   3552 		}
   3553 	}
   3554 #endif
   3555 }
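
        /*
         * A worked example of the above, assuming a hypothetical 16KB
         * virtually-addressed cache (pmap_aliasmask == 0x3fff):
         *
         *	hint = 0x00025000, *vap = 0x00030000
         *	diff = (0x00025000 - 0x00030000) & 0x3fff = 0x1000
         *
         * Bottom-up: *vap becomes 0x00031000, which shares a cache footprint
         * with the hint (both are 0x1000 modulo the cache size).  Top-down
         * (td != 0): the VA is first backed off by one full cache size, so
         * *vap becomes 0x0002d000, still alias-free but no higher than the
         * caller's original choice.
         */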
   3556 
   3557 /*
   3558  * pmap_procwr:			[ INTERFACE ]
   3559  *
   3560  *	Perform any cache synchronization required after writing
   3561  *	to a process's address space.
   3562  */
   3563 void
   3564 pmap_procwr(struct proc *p, vaddr_t va, size_t len)
   3565 {
   3566 	/*
   3567 	 * This is just a wrapper around the "cachectl" machdep
   3568 	 * system call.
   3569 	 *
   3570 	 * XXX This is kind of gross, to be honest.
   3571 	 */
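        	/*
        	 * The 0x80000004 argument presumably corresponds to
        	 * CC_EXTPURGE|CC_IPURGE, i.e. purge the instruction cache
        	 * (and any external cache) for the range just written.
        	 */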
   3572 	(void)cachectl1(0x80000004, va, len, p);
   3573 }
   3574 
   3575 static paddr_t kernel_reloc_offset;
   3576 static vaddr_t kernel_reloc_end;
   3577 
   3578 /*
   3579  * pmap_init_kcore_hdr:
   3580  *
   3581  *	Initialize the m68k kernel crash dump header with information
   3582  *	necessary to perform KVA -> phys translations.
   3583  *
   3584  *	Returns a pointer to the crash dump RAM segment entries for
   3585  *	machine-specific code to initialize.
   3586  */
   3587 phys_ram_seg_t *
   3588 pmap_init_kcore_hdr(cpu_kcore_hdr_t *h)
   3589 {
   3590 	struct gen68k_kcore_hdr *m = &h->un._gen68k;
   3591 
   3592 	memset(h, 0, sizeof(*h));
   3593 
   3594 	/*
   3595 	 * Initialize the `dispatcher' portion of the header.
   3596 	 */
   3597 	strcpy(h->name, "gen68k");
   3598 	h->page_size = PAGE_SIZE;
   3599 	h->kernbase = VM_MIN_KERNEL_ADDRESS;
   3600 
   3601 	/* Fixed relocation information. */
   3602 	m->reloc    = kernel_reloc_offset;
   3603 	m->relocend = kernel_reloc_end;
   3604 
   3605 	/*
   3606 	 * Fill in information about our MMU configuration.
   3607 	 *
   3608 	 * We essentially pretend to be a 68851 as far as table-
   3609 	 * walks are concerned.
   3610 	 *
   3611 	 * We provide the kernel's MMU_* constant so that the TT
   3612 	 * registers can be interpreted correctly.
   3613 	 */
   3614 	m->mmutype = mmutype;
   3615 	m->tcr = MMU_USE_3L ? MMU51_3L_TCR_BITS : MMU51_TCR_BITS;
   3616 	m->srp[0] = MMU51_SRP_BITS;
   3617 	m->srp[1] = Sysseg_pa;
   3618 
   3619 #if MMU_CONFIG_68040_CLASS
   3620 	if (MMU_IS_68040_CLASS) {
   3621 		m->itt0 = mmu_tt40[MMU_TTREG_ITT0];
   3622 		m->itt1 = mmu_tt40[MMU_TTREG_ITT1];
   3623 		m->tt0  = mmu_tt40[MMU_TTREG_DTT0];
   3624 		m->tt1  = mmu_tt40[MMU_TTREG_DTT1];
   3625 	}
   3626 #endif
   3627 #if defined(M68K_MMU_68030)
   3628 	if (mmutype == MMU_68030) {
   3629 		m->tt0  = mmu_tt30[MMU_TTREG_TT0];
   3630 		m->tt1  = mmu_tt30[MMU_TTREG_TT1];
   3631 	}
   3632 #endif
   3633 
   3634 	return m->ram_segs;
   3635 }
   3636 
   3637 /***************************** PMAP BOOTSTRAP ********************************/
   3638 
   3639 /*
   3640  * The kernel virtual address space layout that this implementation is tuned
   3641  * for assumes that KVA space begins at $0000.0000, that the static kernel
   3642  * image (text/data/bss, etc.) resides at or near the bottom of this space,
   3643  * and that all additional KVA that's mapped by PTEs grows upwards from there.
   3644  *
   3645  * Regions mapped by Transparent Translation registers (68030 and up)
   3646  * are assumed to lie beyond where the KVA space is expected to grow.  When
   3647  * we encounter these regions in the machine_bootmap[] (represented by a
   3648  * KEEPOUT entry), we clamp the maximum KVA to prevent its growth into that
   3649  * region.  The TT mechanism is not terribly precise, and only supports
   3650  * VA==PA mappings, so it's only really suitable for device regions that
   3651  * are in the upper reaches of the physical address space (at or beyond 1GB
   3652  * or so).
   3653  *
   3654  * This implementation certainly could be adjusted to work with other address
   3655  * space layouts, but the assumption asserted here is a bit baked-in.
   3656  */
   3657 __CTASSERT(VM_MIN_KERNEL_ADDRESS == 0);
   3658 
   3659 /*
   3660  * The virtual kernel PTE array covers the entire 4GB kernel supervisor
   3661  * address space, but is sparsely populated.  The amount of VA space required
   3662  * for this linear array is:
   3663  *
   3664  *	(4GB / PAGE_SIZE) * sizeof(pt_entry_t)
   3665  * -or-
   3666  *	4KB: 4MB (1024 pages)
   3667  *	8KB: 2MB (512 pages)
   3668  *	8KB: 2MB (256 pages)
   3669  * To avoid doing 64-bit math, we calculate it like so:
   3670  *
   3671  *	((0xffffffff >> PGSHIFT) + 1) * sizeof(pt_entry_t)
   3672  *
   3673  * The traditional name for this virtual array is "Sysmap".
   3674  */
   3675 #define	SYSMAP_VA_SIZE	(((0xffffffffU >> PGSHIFT) + 1) * sizeof(pt_entry_t))
   3676 
   3677 /*
   3678  * In the Hibler/Utah pmap, the kernel PTE array was placed right near
   3679  * the very top of the kernel virtual address space.  This was because
   3680  * of the hp300's unique physical memory arrangement: the last page of
   3681  * memory is always located at PA $FFFF.F000 and the physical address
   3682  * of the beginning of RAM varied based on the RAM size.  This meant that
   3683  * VA $FFFF.F000 is a convenient place to map the RAM VA==PA, making
   3684  * VA $FFFF.F000 was a convenient place to map the RAM VA==PA, making
   3685  * Since VA $FFFF.F000 was already going to be mapped, it made sense to
   3686  * put something else alongside it in order to minimize waste in
   3687  * PT pages.
   3688  *
   3689  * As noted above, this implementation is tuned for a growing-from-0
   3690  * virtual space layout.  However, we have a special case for this
   3691  * particular requirement: if a platform defines SYSMAP_VA, then we
   3692  * will assume it is a high address, place the kernel PTE array at
   3693  * that KVA, and ensure sufficient page tables to map from that VA until
   3694  * the very end of the 4GB supervisor address space.  These tables will
   3695  * be allocated before the machine_bootmap[] is processed, thus
   3696  * allowing the machine_bootmap[] to map physical addresses into
   3697  * one of these high virtual addresses if necessary.  The
   3698  * beginning of this region will also serve to clamp the maximum kernel
   3699  * virtual address, in the same way as a KEEPOUT region in machine_bootmap[].
   3700  *
   3701  * For reference, the traditional hp300 definition is:
   3702  *
   3703  *	#define	SYSMAP_VA	((vaddr_t)(0-PAGE_SIZE*NPTEPG*2))
   3704  *
   3705  * ...and because the hp300 always used a 4KB page size (restriction
   3706  * of HP MMU), this is: 0 - 4096*1024*2
   3707  *                   -> 0 - 8388608 (8MB)
   3708  *                   -> $FF80.0000
   3709  *
   3710  * Unfortunately (for the hp300), this means 2 PT pages for the top of
   3711  * the address space (in the 2-level case), but that's unavoidable anyway
   3712  * because the last page is a separate mapping and the kernel PTE
   3713  * array needs 4MB of space on its own.
   3714  */
   3715 
   3716 static vaddr_t	lwp0uarea;
   3717        char *	vmmap;
   3718        void *	msgbufaddr;
   3719 
   3720 /* XXX Doesn't belong here. */
   3721 paddr_t		avail_start;	/* PA of first available physical page */
   3722 paddr_t		avail_end;	/* PA of last available physical page */
   3723 
   3724 extern char *	kernel_text;
   3725 extern char *	etext;
   3726 
   3727 /*
   3728  * pmap_bootstrap1:
   3729  *
   3730  *	Phase 1 of bootstrapping virtual memory.  This is called before
   3731  *	the MMU is enabled to set up the initial kernel MMU tables and
   3732  *	allocate other important data structures.
   3733  *
   3734  *	Because the MMU has not yet been turned on, and we don't know if
   3735  *	we're running VA==PA, we have to manually relocate all global
   3736  *	symbol references.
   3737  *
   3738  *	Arguments:	nextpa		Physical address immediately
   3739  *					following the kernel / symbols /
   3740  *					etc.  This will be page-rounded
   3741  *					before use.
   3742  *
   3743  *			reloff		VA<->PA relocation offset
   3744  *
   3745  *	Returns:	nextpa		Updated value after all of the
   3746  *					allocations performed.
   3747  */
   3748 paddr_t __attribute__((no_instrument_function))
   3749 pmap_bootstrap1(paddr_t nextpa, paddr_t reloff)
   3750 {
   3751 	paddr_t lwp0upa, stnext_endpa, stnext_pa;
   3752 	paddr_t pa, kernimg_endpa, kern_lev1pa;
   3753 	vaddr_t va, nextva, kern_lev1va;
   3754 	pt_entry_t *pte, *epte;
   3755 	int entry_count = 0;
   3756 
   3757 #ifdef SYSMAP_VA
   3758 #define	VA_RANGE_DEFAULT	0
   3759 #define	VA_RANGE_KPTES		1
   3760 #define	NRANGES			2
   3761 #else
   3762 #define	VA_RANGE_DEFAULT	0
   3763 #define	VA_RANGE_KPTES		0
   3764 #define	NRANGES			1
   3765 #endif
   3766 
   3767 	struct va_range {
   3768 		vaddr_t start_va;
   3769 		vaddr_t end_va;
   3770 		paddr_t start_ptp;
   3771 		paddr_t end_ptp;
   3772 	} va_ranges[NRANGES], *var;
   3773 	int r;
   3774 
   3775 #define	VA_IN_RANGE(va, var)				\
   3776 	((va) >= (var)->start_va &&			\
   3777 	 ((va) < (var)->end_va || (var)->end_va == 0))
   3778 
   3779 #define	VA_PTE_BASE(va, var)				\
   3780 	(&((pt_entry_t *)				\
   3781 	    PMAP_BOOTSTRAP_RELOC_PA((var)->start_ptp))[	\
   3782 	    m68k_btop((va) - (var)->start_va)])
   3783 
   3784 #define	RELOC(v, t)	*((t *)PMAP_BOOTSTRAP_RELOC_GLOB(&(v)))
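
        	/*
        	 * Reader's note on the helpers above: VA_IN_RANGE() tests
        	 * whether a VA falls in one of the bootstrap VA ranges
        	 * (end_va == 0 means "to the end of the address space");
        	 * VA_PTE_BASE() yields a pointer to the PTE slot for that VA
        	 * within the range's block of PT pages, addressed through its
        	 * physical location; and RELOC() accesses a global through its
        	 * currently-addressable location, since we may not yet be
        	 * running at the kernel's linked virtual addresses.
        	 */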
   3785 
   3786 	/* Record the relocation offset for kernel crash dumps. */
   3787 	RELOC(kernel_reloc_offset, paddr_t) = reloff;
   3788 
   3789 	/*
   3790 	 * The first determination we have to make is our configuration:
   3791 	 * Are we using a 2-level or 3-level table?  For the purposes
   3792 	 * of bootstrapping the kernel, it's "68040-class" and "other",
   3793 	 * the former getting the 3-level table.
   3794 	 */
   3795 	const bool is_68040_class = RELOC(mmutype, int) == MMU_68040 ||
   3796 				    RELOC(mmutype, int) == MMU_68060;
   3797 	const bool use_3l = is_68040_class;
   3798 
   3799 	/*
   3800 	 * Based on MMU class, figure out what the constant values of
   3801 	 * segment / page table entries look like.
   3802 	 *
   3803 	 * See pmap_pte_proto_init().
   3804 	 */
   3805 	pt_entry_t proto_ro_pte;	/* read-only */
   3806 	pt_entry_t proto_rw_pte;	/* read-write */
   3807 	pt_entry_t proto_rw_ci_pte;	/* read-write, cache-inhibited */
   3808 	pt_entry_t proto_ste;
   3809 
   3810 	if (is_68040_class) {
   3811 		proto_ro_pte    = PTE_VALID|PTE_WIRED|PTE_WP|PTE40_CM_WT;
   3812 		proto_rw_pte    = PTE_VALID|PTE_WIRED       |PTE40_CM_CB;
   3813 		proto_rw_ci_pte = PTE_VALID|PTE_WIRED       |PTE40_CM_NC_SER;
   3814 	} else {
   3815 		proto_ro_pte    = PTE_VALID|PTE_WIRED|PTE_WP;
   3816 		proto_rw_pte    = PTE_VALID|PTE_WIRED;
   3817 		proto_rw_ci_pte = PTE_VALID|PTE_WIRED       |PTE51_CI;
   3818 	}
   3819 	proto_ste = DTE51_U | DT51_SHORT;
   3820 
   3821 	/*
   3822 	 * Allocate some important fixed virtual (and physical) addresses.
   3823 	 * We use the sum total of this initial mapped kernel space to
   3824 	 * determine how many initial kernel PT pages to allocate.  The
   3825 	 * things that consume physical space will come first, and the
   3826 	 * virtual-space-{only,mostly} things come at the end.
   3827 	 *
   3828 	 *	lwp0upa		lwp0 u-area	USPACE	(p)
   3829 	 *	lwp0uarea				(v)
   3830 	 *
   3831 	 *	Sysseg_pa	kernel lev1map	PAGE_SIZE (p)
   3832 	 *	kernel_lev1map			PAGE_SIZE (v, ci)
   3833 	 *
   3834 	 *	^^^^ end of simple relocation region ^^^^
   3835 	 *
   3836 	 *	null_segtab_pa	null segtab	PAGE_SIZE (p)
   3837 	 *
   3838 	 *	tmpmap_srcva	temp map, src	PAGE_SIZE (v)
   3839 	 *	tmpmap_dstva	temp map, dst	PAGE_SIZE (v)
   3840 	 *
   3841 	 *	vmmap		ya tmp map	PAGE_SIZE (v)
   3842 	 *
   3843 	 *	msgbufaddr	kernel msg buf	round_page(MSGBUFSIZE) (v)
   3844 	 *
   3845 	 *	kernel_ptes	kernel PTEs	SYSMAP_VA_SIZE (v, ci)
   3846 	 *					(see comments above)
   3847 	 *
   3848 	 * When we allocate the kernel lev1map, for the 2-level
   3849 	 * configuration, there are no inner segment tables to allocate;
   3850 	 * the leaf PT pages get poked directly into the level-1 table.
   3851 	 *
   3852 	 * In the 3-level configuration, to map all of the leaf tables,
   3853 	 * inner segment table pages are allocated as needed.  We
   3854 	 * first take those tables from the page containing the level-1
   3855 	 * table, and allocate additional pages as necessary.
   3856 	 */
   3857 
   3858 	nextpa = m68k_round_page(nextpa);
   3859 	nextva = PMAP_BOOTSTRAP_PA_TO_VA(nextpa);
   3860 
   3861 	/*
   3862 	 * nextpa now represents the end of the loaded kernel image.
   3863 	 * This includes the .data + .bss segments, the debugger symbols,
   3864 	 * and any other ancillary data loaded after the kernel.
   3865 	 *
   3866 	 * N.B. This represents the start of our dynamic memory allocation,
   3867 	 * which will be referenced below when we zero the memory we've
   3868 	 * allocated.
   3869 	 */
   3870 	kernimg_endpa = nextpa;
   3871 
   3872 	/*
   3873 	 * lwp0 u-area.  We allocate it here, and finish setting it
   3874 	 * up in pmap_bootstrap2().
   3875 	 */
   3876 	lwp0upa = nextpa;
   3877 	nextpa += USPACE;
   3878 	RELOC(lwp0uarea, vaddr_t) = nextva;
   3879 	nextva += USPACE;
   3880 
   3881 	size_t nstpages = 0;
   3882 
   3883 	/* kernel level-1 map */
   3884 	RELOC(Sysseg_pa, paddr_t) = kern_lev1pa = nextpa;
   3885 	nextpa += PAGE_SIZE;
   3886 	RELOC(kernel_lev1map, vaddr_t) = kern_lev1va = nextva;
   3887 	nextva += PAGE_SIZE;
   3888 	nstpages++;
   3889 
   3890 	/* This is the end of the simple relocation region. */
   3891 	RELOC(kernel_reloc_end, vaddr_t) = nextva;
   3892 
   3893 	/*
   3894 	 * For 3-level configs, we now have space to allocate
   3895 	 * inner segment tables.
   3896 	 */
   3897 	stnext_pa = kern_lev1pa + TBL40_L1_SIZE;
   3898 	stnext_endpa = m68k_round_page(stnext_pa);
   3899 
   3900 	/* null segment table */
   3901 #ifdef NULL_SEGTAB_PA
   3902 	RELOC(null_segtab_pa, paddr_t) = (paddr_t)NULL_SEGTAB_PA;
   3903 #else
   3904 	RELOC(null_segtab_pa, paddr_t) = nextpa;
   3905 	nextpa += PAGE_SIZE;
   3906 #endif
   3907 
   3908 	/* pmap temporary map addresses */
   3909 	RELOC(pmap_tmpmap_srcva, vaddr_t) = nextva;
   3910 	nextva += PAGE_SIZE;
   3911 	RELOC(pmap_tmpmap_dstva, vaddr_t) = nextva;
   3912 	nextva += PAGE_SIZE;
   3913 
   3914 	/* vmmap temporary map address */
   3915 	RELOC(vmmap, vaddr_t) = nextva;
   3916 	nextva += PAGE_SIZE;
   3917 
   3918 	/* kernel message buffer */
   3919 	RELOC(msgbufaddr, vaddr_t) = nextva;
   3920 	nextva += m68k_round_page(MSGBUFSIZE);
   3921 
   3922 	/* Kernel PTE array. */
   3923 #ifdef SYSMAP_VA
   3924 	if ((vaddr_t)SYSMAP_VA < RELOC(kernel_virtual_max, vaddr_t)) {
   3925 		RELOC(kernel_virtual_max, vaddr_t) = (vaddr_t)SYSMAP_VA;
   3926 	}
   3927 	RELOC(kernel_ptes, vaddr_t) = (vaddr_t)SYSMAP_VA;
   3928 	va_ranges[VA_RANGE_KPTES].start_va = (vaddr_t)SYSMAP_VA;
   3929 	va_ranges[VA_RANGE_KPTES].end_va = 0; /* end of the address space */
   3930 #else
   3931 	RELOC(kernel_ptes, vaddr_t) = nextva;
   3932 	nextva += SYSMAP_VA_SIZE;
   3933 #endif /* SYSMAP_VA */
   3934 
   3935 	/*
   3936 	 * Allocate machine-specific VAs.
   3937 	 */
   3938 	extern const struct pmap_bootmap machine_bootmap[];
   3939 	const struct pmap_bootmap *pmbm = (const struct pmap_bootmap *)
   3940 	    PMAP_BOOTSTRAP_RELOC_GLOB(machine_bootmap);
   3941 	for (; pmbm->pmbm_vaddr != (vaddr_t)-1; pmbm++) {
   3942 		if (pmbm->pmbm_size == 0) {
   3943 			continue;
   3944 		}
   3945 		if (pmbm->pmbm_flags & (PMBM_F_FIXEDVA | PMBM_F_KEEPOUT)) {
   3946 			va = m68k_trunc_page(pmbm->pmbm_vaddr);
   3947 			if (va < RELOC(kernel_virtual_max, vaddr_t)) {
   3948 				RELOC(kernel_virtual_max, vaddr_t) = va;
   3949 			}
   3950 		} else {
   3951 			*(vaddr_t *)
   3952 			    PMAP_BOOTSTRAP_RELOC_GLOB(pmbm->pmbm_vaddr_ptr) =
   3953 			    nextva;
   3954 			nextva += m68k_round_page(pmbm->pmbm_size);
   3955 		}
   3956 	}
   3957 
   3958 	/* UVM-managed kernel virtual starts here. */
   3959 	RELOC(kernel_virtual_start, vaddr_t) = nextva;
   3960 
   3961 	/*
   3962 	 * Allocate enough PT pages to map all of physical memory.
   3963 	 * This should be sufficient to prevent pmap_growkernel()
   3964 	 * from having to do any work before the VM system is set
   3965 	 * up.
   3966 	 *
   3967 	 * XXX mac68k also relies on being able to map the last page
   3968 	 * XXX of RAM VA==PA for the mmu-switchoff dance.  Unlike hp300,
   3969 	 * XXX this is not at a fixed location.  However, RAM generally
   3970 	 * XXX starts at $0000.0000 on Macs, so this calculation should
   3971 	 * XXX be sufficient to ensure there is a PTE available for this
   3972 	 * XXX purpose.
   3973 	 * XXX TODO: Provide a way for cpu_startup() on mac68k to assert
   3974 	 * XXX this (export kernel_virtual_end?).
   3975 	 */
   3976 	nextva += RELOC(physmem, psize_t) << PGSHIFT;
   3977 	nextva = pmap_round_ptpage(nextva);
   3978 	if (nextva > RELOC(kernel_virtual_max, vaddr_t) ||
   3979 	    nextva < RELOC(kernel_virtual_start, vaddr_t)) {
   3980 		/* clamp it. */
   3981 		nextva = RELOC(kernel_virtual_max, vaddr_t);
   3982 	}
   3983 
   3984 	/*
   3985 	 * This marks the end of UVM-managed kernel virtual space,
   3986 	 * until such time as pmap_growkernel() is called to expand
   3987 	 * it.
   3988 	 */
   3989 	va_ranges[VA_RANGE_DEFAULT].start_va = VM_MIN_KERNEL_ADDRESS;
   3990 	va_ranges[VA_RANGE_DEFAULT].end_va = nextva;
   3991 	RELOC(kernel_virtual_end, vaddr_t) = nextva;
   3992 
   3993 	/*
   3994 	 * Now, compute the number of PT pages required to map the
   3995 	 * required VA ranges and allocate them.
   3996 	 */
   3997 	size_t nptpages, total_ptpages = 0;
   3998 	for (r = 0; r < NRANGES; r++) {
   3999 		var = &va_ranges[r];
   4000 		nptpages = (var->end_va - var->start_va) / PTPAGEVASZ;
   4001 		var->start_ptp = nextpa;
   4002 		nextpa += nptpages * PAGE_SIZE;
   4003 		var->end_ptp = nextpa;
   4004 		total_ptpages += nptpages;
   4005 	}
   4006 
   4007 #ifdef PMAP_MACHINE_CHECK_BOOTSTRAP_ALLOCATIONS
   4008 	/*
   4009 	 * Right here, the old mac68k Utah pmap_bootstrap1() has a
   4010 	 * check to see if the kernel + bootstrap allocations fit
   4011 	 * within one of the memory segments mapped by the loader.
   4012 	 * This is a hook to accommodate that requirement.
   4013 	 */
   4014 	void (*alloc_checkfn)(paddr_t, paddr_t) = (void *)
   4015 	    PMAP_BOOTSTRAP_RELOC_GLOB(pmap_machine_check_bootstrap_allocations);
   4016 	(*alloc_checkfn)(nextpa, reloff);
   4017 #endif
   4018 
   4019 	/*
   4020 	 * The bulk of the dynamic memory allocation is done (there
   4021 	 * may be more below if we have to allocate more inner segment
   4022 	 * table pages, but we'll burn that bridge when we come to it).
   4023 	 *
   4024 	 * Zero out all of these freshly-allocated pages.
   4025 	 */
   4026 	pte = (pt_entry_t *)PMAP_BOOTSTRAP_RELOC_PA(kernimg_endpa);
   4027 	epte = (pt_entry_t *)PMAP_BOOTSTRAP_RELOC_PA(nextpa);
   4028 	while (pte < epte) {
   4029 		*pte++ = 0;
   4030 	}
   4031 
   4032 	/*
   4033 	 * Ok, let's get to mapping stuff!  Almost everything is in
   4034 	 * the default VA range.
   4035 	 */
   4036 	var = &va_ranges[VA_RANGE_DEFAULT];
   4037 
   4038 	/*
   4039 	 * Kernel text - read-only.
   4040 	 *
   4041 	 * ...that is, unless, a platform as some quirky requirement
   4042 	 * ...that is, unless a platform has some quirky requirement
   4043 	 * alternate proto PTE for the kernel text (in the mac68k case,
   4044 	 * it will be read/write write-though-cacheable).  Once the
   4045 	 * it will be read/write write-through-cacheable).  Once the
   4046 	 * code can perform any fixups as necessary.
   4047 	 */
   4048 #ifndef PMAP_BOOTSTRAP_TEXT_PROTO_PTE
   4049 #define	PMAP_BOOTSTRAP_TEXT_PROTO_PTE	proto_ro_pte
   4050 #endif
   4051 	pa = PMAP_BOOTSTRAP_VA_TO_PA(m68k_trunc_page(&kernel_text));
   4052 	pte = VA_PTE_BASE(&kernel_text, var);
   4053 	epte = VA_PTE_BASE(&etext, var);
   4054 	while (pte < epte) {
   4055 		*pte++ = PMAP_BOOTSTRAP_TEXT_PROTO_PTE | pa;
   4056 		pa += PAGE_SIZE;
   4057 		entry_count++;
   4058 	}
   4059 
   4060 	/* Remainder of kernel image - read-write. */
   4061 	epte = VA_PTE_BASE(PMAP_BOOTSTRAP_PA_TO_VA(kernimg_endpa), var);
   4062 	while (pte < epte) {
   4063 		*pte++ = proto_rw_pte | pa;
   4064 		pa += PAGE_SIZE;
   4065 		entry_count++;
   4066 	}
   4067 
   4068 	/* lwp0 u-area - read-write. */
   4069 	pa = lwp0upa;
   4070 	pte = VA_PTE_BASE(RELOC(lwp0uarea, vaddr_t), var);
   4071 	epte = VA_PTE_BASE(RELOC(lwp0uarea, vaddr_t) + USPACE, var);
   4072 	while (pte < epte) {
   4073 		*pte++ = proto_rw_pte | pa;
   4074 		pa += PAGE_SIZE;
   4075 		entry_count++;
   4076 	}
   4077 
   4078 	/* Kernel lev1map - read-write, cache-inhibited. */
   4079 	pte = VA_PTE_BASE(kern_lev1va, var);
   4080 	*pte = proto_rw_ci_pte | kern_lev1pa;
   4081 	entry_count++;
   4082 
   4083 	/*
   4084 	 * Kernel leaf PT pages - read-write, cache-inhibited.
   4085 	 *
   4086 	 * These will be in a different VA range if the machine
   4087 	 * defines SYSMAP_VA.
   4088 	 */
   4089 	va = RELOC(kernel_ptes, vaddr_t);
   4090 	pt_entry_t *kptes = (pt_entry_t *)va;
   4091 	struct va_range *kpte_var = &va_ranges[VA_RANGE_KPTES];
   4092 
   4093 	for (r = 0; r < NRANGES; r++) {
   4094 		var = &va_ranges[r];
   4095 		va = (vaddr_t)(&kptes[m68k_btop(var->start_va)]);
   4096 		pte = VA_PTE_BASE(va, kpte_var);
   4097 		for (pa = var->start_ptp; pa < var->end_ptp; pa += PAGE_SIZE) {
   4098 			*pte++ = proto_rw_ci_pte | pa;
   4099 			entry_count++;
   4100 		}
   4101 	}
   4102 
   4103 	/*
   4104 	 * Now perform any machine-specific mappings at VAs
   4105 	 * allocated earlier.
   4106 	 */
   4107 	pmbm = (const struct pmap_bootmap *)
   4108 	    PMAP_BOOTSTRAP_RELOC_GLOB(machine_bootmap);
   4109 	for (; pmbm->pmbm_vaddr != (vaddr_t)-1; pmbm++) {
   4110 		pt_entry_t proto;
   4111 
   4112 		if (pmbm->pmbm_size == 0 ||
   4113 		    (pmbm->pmbm_flags & (PMBM_F_VAONLY | PMBM_F_KEEPOUT))) {
   4114 			continue;
   4115 		}
   4116 		if (pmbm->pmbm_flags & PMBM_F_FIXEDVA) {
   4117 			va = pmbm->pmbm_vaddr;
   4118 		} else {
   4119 			va = *(vaddr_t *)
   4120 			    PMAP_BOOTSTRAP_RELOC_GLOB(pmbm->pmbm_vaddr_ptr);
   4121 		}
   4122 		for (r = 0; r < NRANGES; r++) {
   4123 			var = &va_ranges[r];
   4124 			if (VA_IN_RANGE(va, var)) {
   4125 				break;
   4126 			}
   4127 		}
   4128 		pa = pmbm->pmbm_paddr;
   4129 		pte = VA_PTE_BASE(va, var);
   4130 		switch (pmbm->pmbm_flags & (PMBM_F_CI|PMBM_F_RO)) {
   4131 		case PMBM_F_CI|PMBM_F_RO:
   4132 			proto = proto_rw_ci_pte | PTE_WP;
   4133 			break;
   4134 		case PMBM_F_CI:
   4135 			proto = proto_rw_ci_pte;
   4136 			break;
   4137 		case PMBM_F_RO:
   4138 			proto = proto_ro_pte;
   4139 			break;
   4140 		default:
   4141 			proto = proto_rw_pte;
   4142 			break;
   4143 		}
   4144 		for (vsize_t size = m68k_round_page(pmbm->pmbm_size);
   4145 		     size != 0;
   4146 		     va += PAGE_SIZE, pa += PAGE_SIZE, size -= PAGE_SIZE) {
   4147 			*pte++ = proto | pa;
   4148 			entry_count++;
   4149 		}
   4150 	}
   4151 
   4152 	/*
   4153 	 * Now that all of the individual VAs are mapped in the leaf
   4154 	 * tables, it's time to link those tables into the segment
   4155 	 * table.
   4156 	 *
   4157 	 * For the 2-level case, this is trivial.  For the 3-level
   4158 	 * case, we will have to allocate inner segment tables.
   4159 	 */
   4160 	for (r = 0; r < NRANGES; r++) {
   4161 		var = &va_ranges[r];
   4162 		if (use_3l) {
   4163 			pt_entry_t *stes, *stes1 = (pt_entry_t *)
   4164 			    PMAP_BOOTSTRAP_RELOC_PA(kern_lev1pa);
   4165 			for (va = var->start_va, pa = var->start_ptp;
   4166 			     pa < var->end_ptp;
   4167 			     va += NBSEG3L, pa += TBL40_L3_SIZE) {
   4168 				unsigned int ri = LA40_RI(va);
   4169 				if ((stes1[ri] & UTE40_RESIDENT) == 0) {
   4170 					/*
   4171 					 * Level-2 table for this segment
   4172 					 * needed.
   4173 					 */
   4174 					if (stnext_pa == stnext_endpa) {
   4175 						/*
   4176 						 * No more slots left in the
   4177 						 * last page we allocated for
   4178 						 * segment tables.  Grab
   4179 						 * another one.
   4180 						 */
   4181 						stnext_pa = nextpa;
   4182 						nextpa += PAGE_SIZE;
   4183 						stnext_endpa = nextpa;
   4184 						nstpages++;
   4185 #ifdef PMAP_MACHINE_CHECK_BOOTSTRAP_ALLOCATIONS
   4186 						(*alloc_checkfn)(nextpa,
   4187 						    reloff);
   4188 #endif
   4189 						/*
   4190 						 * Zero out the new inner
   4191 						 * segment table page.
   4192 						 */
   4193 						pte = (pt_entry_t *)
   4194 						    PMAP_BOOTSTRAP_RELOC_PA(
   4195 						    stnext_pa);
   4196 						epte = (pt_entry_t *)
   4197 						    PMAP_BOOTSTRAP_RELOC_PA(
   4198 						    stnext_endpa);
   4199 						while (pte < epte) {
   4200 							*pte++ = 0;
   4201 						}
   4202 					}
   4203 					stes1[ri] = proto_ste | stnext_pa;
   4204 					stnext_pa += TBL40_L2_SIZE;
   4205 				}
   4206 				stes = (pt_entry_t *)
   4207 				    PMAP_BOOTSTRAP_RELOC_PA(
   4208 				    stes1[ri] & UTE40_PTA);
   4209 				stes[LA40_PI(va)] = proto_ste | pa;
   4210 			}
   4211 		} else {
   4212 			pt_entry_t *stes = (pt_entry_t *)
   4213 			    PMAP_BOOTSTRAP_RELOC_PA(kern_lev1pa);
   4214 			for (va = var->start_va, pa = var->start_ptp;
   4215 			     pa < var->end_ptp;
   4216 			     va += NBSEG2L, pa += PAGE_SIZE) {
   4217 				stes[LA2L_RI(va)] = proto_ste | pa;
   4218 			}
   4219 		}
   4220 	}
   4221 
   4222 	/* Instrumentation. */
   4223 	RELOC(pmap_nkptpages_initial_ev.ev_count32, uint32_t) =
   4224 	RELOC(pmap_nkptpages_current_ev.ev_count32, uint32_t) = total_ptpages;
   4225 	RELOC(pmap_nkstpages_initial_ev.ev_count32, uint32_t) =
   4226 	RELOC(pmap_nkstpages_current_ev.ev_count32, uint32_t) = nstpages;
   4227 
   4228 	/*
   4229 	 * Record the number of wired mappings we created above
   4230 	 * in the kernel pmap stats.
   4231 	 */
   4232 	RELOC(kernel_pmap_store.pm_stats.resident_count, long) = entry_count;
   4233 	RELOC(kernel_pmap_store.pm_stats.wired_count, long) = entry_count;
   4234 
   4235 	/*
   4236 	 * Stash any left-over segment table space for use by
   4237 	 * pmap_growkernel() later.
   4238 	 */
   4239 	RELOC(kernel_stnext_pa, paddr_t) = stnext_pa;
   4240 	RELOC(kernel_stnext_endpa, paddr_t) = stnext_endpa;
   4241 
   4242 	return nextpa;
   4243 }
   4244 
   4245 /*
   4246  * pmap_bootstrap2:
   4247  *
   4248  *	Phase 2 of bootstrapping virtual memory.  This is called after
   4249  *	the MMU has been enabled to finish setting up run-time-computed
   4250  *	global pmap data, plus the lwp0 u-area, curlwp, and curpcb.
   4251  */
   4252 void *
   4253 pmap_bootstrap2(void)
   4254 {
   4255 	/* Setup the MMU class; needed before anything else. */
   4256 	pmap_mmuclass_init();
   4257 
   4258 	/* Early low-level UVM initialization. */
   4259 	uvmexp.pagesize = NBPG;				/* XXX ick, NBPG */
   4260 	uvm_md_init();
   4261 
   4262 	/* Initialize prototype PTEs; needed before anything else is mapped. */
   4263 	pmap_pte_proto_init();
   4264 
   4265 	/* Initialize the kernel pmap. */
   4266 	pmap_pinit(pmap_kernel(), Sysseg_pa);
   4267 
   4268 	/* Initialize lwp0 u-area, curlwp, and curpcb. */
   4269 	memset((void *)lwp0uarea, 0, USPACE);
   4270 	uvm_lwp_setuarea(&lwp0, lwp0uarea);
   4271 	curlwp = &lwp0;
   4272 	curpcb = lwp_getpcb(&lwp0);
   4273 
   4274 	/*
   4275 	 * Initialize the source/destination control registers for
   4276 	 * movs.
   4277 	 */
   4278 	setsfc(FC_USERD);
   4279 	setdfc(FC_USERD);
   4280 
   4281 	return (void *)lwp0uarea;
   4282 }
   4283