/*	$NetBSD: xen_pmap.c,v 1.15 2012/01/22 18:16:34 cherry Exp $	*/

/*
 * Copyright (c) 2007 Manuel Bouyer.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 */

/*
 * Copyright (c) 2006 Mathieu Ropert <mro (at) adviseo.fr>
 *
 * Permission to use, copy, modify, and distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 */

/*
 * Copyright (c) 1997 Charles D. Cranor and Washington University.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Copyright 2001 (c) Wasabi Systems, Inc.
 * All rights reserved.
 *
 * Written by Frank van der Linden for Wasabi Systems, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *      This product includes software developed for the NetBSD Project by
 *      Wasabi Systems, Inc.
 * 4. The name of Wasabi Systems, Inc. may not be used to endorse
 *    or promote products derived from this software without specific prior
 *    written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL WASABI SYSTEMS, INC
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: xen_pmap.c,v 1.15 2012/01/22 18:16:34 cherry Exp $");

#include "opt_user_ldt.h"
#include "opt_lockdebug.h"
#include "opt_multiprocessor.h"
#include "opt_xen.h"
#if !defined(__x86_64__)
#include "opt_kstack_dr0.h"
#endif /* !defined(__x86_64__) */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/pool.h>
#include <sys/kernel.h>
#include <sys/atomic.h>
#include <sys/cpu.h>
#include <sys/intr.h>
#include <sys/xcall.h>

#include <uvm/uvm.h>

#include <dev/isa/isareg.h>

#include <machine/specialreg.h>
#include <machine/gdt.h>
#include <machine/isa_machdep.h>
#include <machine/cpuvar.h>

#include <x86/pmap.h>
#include <x86/pmap_pv.h>

#include <x86/i82489reg.h>
#include <x86/i82489var.h>

#include <xen/xen-public/xen.h>
#include <xen/hypervisor.h>
#include <xen/xenpmap.h>

#define COUNT(x)	/* nothing */

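/*
 * Recursive PDE lookup tables: normal_pdes indexes the regular recursive
 * page-table mapping, alternate_pdes the alternate (APTE) mapping handed
 * out by pmap_map_ptes() for pmaps that are not currently loaded.
 */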
static pd_entry_t * const alternate_pdes[] = APDES_INITIALIZER;
extern pd_entry_t * const normal_pdes[];

extern paddr_t pmap_pa_start; /* PA of first physical page for this domain */
extern paddr_t pmap_pa_end;   /* PA of last physical page for this domain */

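/*
 * pmap_apte_flush: shoot down the APTE mappings of "pmap" on every CPU
 * currently using them, after the alternate recursive slot has changed.
 */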
void
pmap_apte_flush(struct pmap *pmap)
{

	KASSERT(kpreempt_disabled());

	/*
	 * Flush the APTE mapping from all other CPUs that
	 * are using the pmap we are using (whose APTE space
	 * is the one we've just modified).
	 *
	 * XXXthorpej -- find a way to defer the IPI.
	 */
	pmap_tlb_shootdown(pmap, (vaddr_t)-1LL, 0, TLBSHOOT_APTE);
	pmap_tlb_shootnow();
}

/*
 * Unmap the content of the APDP PDEs
 */
void
pmap_unmap_apdp(void)
{
	int i;

	for (i = 0; i < PDP_SIZE; i++) {
		pmap_pte_set(APDP_PDE+i, 0);
#if defined (PAE)
		/*
		 * For PAE, there are two places where alternative recursive
		 * mappings could be found with Xen:
		 * - in the L2 shadow pages
		 * - the "real" L2 kernel page (pmap_kl2pd), which is unique
		 * and static.
		 * We first clear the APDP for the current pmap. As the L2
		 * kernel page is unique, we only need to do it once for
		 * all pmaps.
		 */
		pmap_pte_set(APDP_PDE_SHADOW+i, 0);
#endif
	}
}

/*
 * pmap_map_ptes: map a pmap's PTEs into KVM and lock them in
 *
 * => we lock enough pmaps to keep things locked in
 * => must be undone with pmap_unmap_ptes before returning
 */

void
pmap_map_ptes(struct pmap *pmap, struct pmap **pmap2,
	      pd_entry_t **ptepp, pd_entry_t * const **pdeppp)
{
	pd_entry_t opde, npde;
	struct pmap *ourpmap;
	struct cpu_info *ci;
	struct lwp *l;
	bool iscurrent;
	uint64_t ncsw;
	int s;

	/* the kernel's pmap is always accessible */
	if (pmap == pmap_kernel()) {
		*pmap2 = NULL;
		*ptepp = PTE_BASE;
		*pdeppp = normal_pdes;
		return;
	}
	KASSERT(kpreempt_disabled());

 retry:
	l = curlwp;
	ncsw = l->l_ncsw;
	ourpmap = NULL;
	ci = curcpu();
#if defined(__x86_64__)
	/*
	 * curpmap can only be pmap_kernel, so at this point
	 * pmap_is_curpmap is always false
	 */
	iscurrent = 0;
	ourpmap = pmap_kernel();
#else /* __x86_64__ */
	if (ci->ci_want_pmapload &&
	    vm_map_pmap(&l->l_proc->p_vmspace->vm_map) == pmap) {
		pmap_load();
		if (l->l_ncsw != ncsw)
			goto retry;
	}
	iscurrent = pmap_is_curpmap(pmap);
	/* if curpmap then we are always mapped */
	if (iscurrent) {
		mutex_enter(pmap->pm_lock);
		*pmap2 = NULL;
		*ptepp = PTE_BASE;
		*pdeppp = normal_pdes;
		goto out;
	}
	ourpmap = ci->ci_pmap;
#endif /* __x86_64__ */

	/* need to lock both curpmap and pmap: use ordered locking */
	pmap_reference(ourpmap);
	if ((uintptr_t) pmap < (uintptr_t) ourpmap) {
		mutex_enter(pmap->pm_lock);
		mutex_enter(ourpmap->pm_lock);
	} else {
		mutex_enter(ourpmap->pm_lock);
		mutex_enter(pmap->pm_lock);
	}

	if (l->l_ncsw != ncsw)
		goto unlock_and_retry;

	/* need to load a new alternate pt space into curpmap? */
	COUNT(apdp_pde_map);
	opde = *APDP_PDE;
	if (!pmap_valid_entry(opde) ||
	    pmap_pte2pa(opde) != pmap_pdirpa(pmap, 0)) {
		int i;
		s = splvm();
		/* Make recursive entry usable in user PGD */
		for (i = 0; i < PDP_SIZE; i++) {
			npde = pmap_pa2pte(
			    pmap_pdirpa(pmap, i * NPDPG)) | PG_k | PG_V;
			xpq_queue_pte_update(
			    xpmap_ptom(pmap_pdirpa(pmap, PDIR_SLOT_PTE + i)),
			    npde);
			xpq_queue_pte_update(xpmap_ptetomach(&APDP_PDE[i]),
			    npde);
#ifdef PAE
			/* update shadow entry too */
			xpq_queue_pte_update(
			    xpmap_ptetomach(&APDP_PDE_SHADOW[i]), npde);
#endif /* PAE */
			xpq_queue_invlpg(
			    (vaddr_t)&pmap->pm_pdir[PDIR_SLOT_PTE + i]);
		}
		if (pmap_valid_entry(opde))
			pmap_apte_flush(ourpmap);
		splx(s);
	}
	*pmap2 = ourpmap;
	*ptepp = APTE_BASE;
	*pdeppp = alternate_pdes;
	KASSERT(l->l_ncsw == ncsw);
#if !defined(__x86_64__)
 out:
#endif
	/*
	 * might have blocked, need to retry?
	 */
	if (l->l_ncsw != ncsw) {
 unlock_and_retry:
		if (ourpmap != NULL) {
			mutex_exit(ourpmap->pm_lock);
			pmap_destroy(ourpmap);
		}
		mutex_exit(pmap->pm_lock);
		goto retry;
	}
}

/*
 * pmap_unmap_ptes: unlock the PTE mapping of "pmap"
 */

void
pmap_unmap_ptes(struct pmap *pmap, struct pmap *pmap2)
{

	if (pmap == pmap_kernel()) {
		return;
	}
	KASSERT(kpreempt_disabled());
	if (pmap2 == NULL) {
		mutex_exit(pmap->pm_lock);
	} else {
#if defined(__x86_64__)
		KASSERT(pmap2 == pmap_kernel());
#else
		KASSERT(curcpu()->ci_pmap == pmap2);
#endif
#if defined(MULTIPROCESSOR)
		pmap_unmap_apdp();
		pmap_pte_flush();
		pmap_apte_flush(pmap2);
#endif /* MULTIPROCESSOR */
		COUNT(apdp_pde_unmap);
		mutex_exit(pmap->pm_lock);
		mutex_exit(pmap2->pm_lock);
		pmap_destroy(pmap2);
	}
}

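/*
 * pmap_enter: enter a VA -> PA mapping for this domain
 *
 * => translates the physical address into a machine address via
 *    xpmap_ptom() (addresses outside this domain's physical range are
 *    passed through unchanged) and defers to pmap_enter_ma()
 */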
int
pmap_enter(struct pmap *pmap, vaddr_t va, paddr_t pa, vm_prot_t prot, u_int flags)
{
	paddr_t ma;

	if (__predict_false(pa < pmap_pa_start || pmap_pa_end <= pa)) {
		ma = pa; /* XXX hack */
	} else {
		ma = xpmap_ptom(pa);
	}

	return pmap_enter_ma(pmap, va, ma, pa, prot, flags, DOMID_SELF);
}

/*
 * pmap_kenter_ma: enter a kernel mapping without R/M (pv_entry) tracking
 *
 * => no need to lock anything, assume va is already allocated
 * => should be faster than normal pmap enter function
 * => we expect a MACHINE address
 */

void
pmap_kenter_ma(vaddr_t va, paddr_t ma, vm_prot_t prot, u_int flags)
{
	pt_entry_t *pte, opte, npte;

	if (va < VM_MIN_KERNEL_ADDRESS)
		pte = vtopte(va);
	else
		pte = kvtopte(va);

	npte = ma | ((prot & VM_PROT_WRITE) ? PG_RW : PG_RO) |
	     PG_V | PG_k;
	if (flags & PMAP_NOCACHE)
		npte |= PG_N;

	if ((cpu_feature[2] & CPUID_NOX) && !(prot & VM_PROT_EXECUTE))
		npte |= PG_NX;

	opte = pmap_pte_testset(pte, npte); /* zap! */

	if (pmap_valid_entry(opte)) {
#if defined(MULTIPROCESSOR)
		kpreempt_disable();
		pmap_tlb_shootdown(pmap_kernel(), va, opte, TLBSHOOT_KENTER);
		kpreempt_enable();
#else
		/* Don't bother deferring in the single CPU case. */
		pmap_update_pg(va);
#endif
	}
}

/*
 * pmap_extract_ma: extract a MA for the given VA
 */

bool
pmap_extract_ma(struct pmap *pmap, vaddr_t va, paddr_t *pap)
{
	pt_entry_t *ptes, pte;
	pd_entry_t pde;
	pd_entry_t * const *pdes;
	struct pmap *pmap2;

	kpreempt_disable();
	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
	if (!pmap_pdes_valid(va, pdes, &pde)) {
		pmap_unmap_ptes(pmap, pmap2);
		kpreempt_enable();
		return false;
	}

	pte = ptes[pl1_i(va)];
	pmap_unmap_ptes(pmap, pmap2);
	kpreempt_enable();

	if (__predict_true((pte & PG_V) != 0)) {
		if (pap != NULL)
			*pap = (pte & PG_FRAME) | (va & (NBPD_L1 - 1));
		return true;
	}

	return false;
}

/*
 * Flush all APDP entries found in pmaps.
 * Required during Xen save/restore operations, as Xen does not
 * handle alternative recursive mappings properly.
 */
void
pmap_xen_suspend(void)
{
	int i;
	int s;
	struct pmap *pm;

	s = splvm();

	pmap_unmap_apdp();

	mutex_enter(&pmaps_lock);
	/*
	 * Set APDP entries to 0 in all pmaps.
	 * Note that for PAE kernels, this only clears the APDP entries
	 * found in the L2 shadow pages, as pmap_pdirpa() is used to obtain
	 * the PA of the pmap->pm_pdir[] pages (forming the 4 contiguous
	 * pages of the PAE PD: 3 for user space, 1 for the L2 kernel
	 * shadow page).
	 */
	LIST_FOREACH(pm, &pmaps, pm_list) {
		for (i = 0; i < PDP_SIZE; i++) {
			xpq_queue_pte_update(
			    xpmap_ptom(pmap_pdirpa(pm, PDIR_SLOT_APTE + i)),
			    0);
		}
	}
	mutex_exit(&pmaps_lock);

	xpq_flush_queue();

	splx(s);

#ifdef PAE
	pmap_unmap_recursive_entries();
#endif
}

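/*
 * pmap_xen_resume: counterpart of pmap_xen_suspend(); re-establishes the
 * recursive mapping entries (PAE only) after the domain has been restored.
 */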
void
pmap_xen_resume(void)
{
#ifdef PAE
	pmap_map_recursive_entries();
#endif
}

#ifdef PAE
/*
 * NetBSD uses L2 shadow pages to support PAE with Xen. However, Xen does not
 * handle them correctly during save/restore, leading to incorrect page
 * tracking and pinning during restore.
 * For save/restore to succeed, two functions are introduced:
 * - pmap_map_recursive_entries(), used by resume code to set the recursive
 *   mapping entries to their correct value
 * - pmap_unmap_recursive_entries(), used by suspend code to clear all
 *   PDIR_SLOT_PTE entries
 */
void
pmap_map_recursive_entries(void)
{
	int i;
	struct pmap *pm;

	mutex_enter(&pmaps_lock);
	LIST_FOREACH(pm, &pmaps, pm_list) {
		for (i = 0; i < PDP_SIZE; i++) {
			xpq_queue_pte_update(
			    xpmap_ptom(pmap_pdirpa(pm, PDIR_SLOT_PTE + i)),
			    xpmap_ptom((pm)->pm_pdirpa[i]) | PG_V);
		}
	}
	mutex_exit(&pmaps_lock);

	for (i = 0; i < PDP_SIZE; i++) {
		xpq_queue_pte_update(
		    xpmap_ptom(pmap_pdirpa(pmap_kernel(), PDIR_SLOT_PTE + i)),
		    xpmap_ptom(pmap_kernel()->pm_pdirpa[i]) | PG_V);
	}

	xpq_flush_queue();
}

void
pmap_unmap_recursive_entries(void)
{
	int i;
	struct pmap *pm;

	/*
	 * Invalidate pmap_pdp_cache as it contains L2-pinned objects with
	 * recursive entries.
	 * XXX jym@ : find a way to drain per-CPU caches too;
	 * pool_cache_invalidate() does not do that.
	 */
	pool_cache_invalidate(&pmap_pdp_cache);

	mutex_enter(&pmaps_lock);
	LIST_FOREACH(pm, &pmaps, pm_list) {
		for (i = 0; i < PDP_SIZE; i++) {
			xpq_queue_pte_update(
			    xpmap_ptom(pmap_pdirpa(pm, PDIR_SLOT_PTE + i)), 0);
		}
	}
	mutex_exit(&pmaps_lock);

	/* do it for pmap_kernel() too! */
	for (i = 0; i < PDP_SIZE; i++)
		xpq_queue_pte_update(
		    xpmap_ptom(pmap_pdirpa(pmap_kernel(), PDIR_SLOT_PTE + i)),
		    0);

	xpq_flush_queue();

}
#endif /* PAE */

#if defined(PAE) || defined(__x86_64__)

extern struct cpu_info	* (*xpq_cpu)(void);
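/*
 * pmap_kpm_setpte: propagate one PD entry of "pmap" into the given CPU's
 * per-CPU shadow page directory (ci_kpm_pdir).
 */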
static __inline void
pmap_kpm_setpte(struct cpu_info *ci, struct pmap *pmap, int index)
{
	if (pmap == pmap_kernel()) {
		KASSERT(index >= PDIR_SLOT_KERN);
	}
#ifdef PAE
	xpq_queue_pte_update(
		xpmap_ptetomach(&ci->ci_kpm_pdir[l2tol2(index)]),
		pmap->pm_pdir[index]);
#elif defined(__x86_64__)
	xpq_queue_pte_update(
		xpmap_ptetomach(&ci->ci_kpm_pdir[index]),
		pmap->pm_pdir[index]);
#endif /* PAE */
}

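/*
 * pmap_kpm_sync_xcall: cross-call handler run on each target CPU by
 * xen_kpm_sync(); updates that CPU's shadow PD entry unless the CPU has
 * since switched to a different user pmap.
 */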
static void
pmap_kpm_sync_xcall(void *arg1, void *arg2)
{
	KASSERT(arg1 != NULL);
	KASSERT(arg2 != NULL);

	struct pmap *pmap = arg1;
	int index = *(int *)arg2;
	KASSERT(pmap == pmap_kernel() || index < PDIR_SLOT_PTE);

	struct cpu_info *ci = xpq_cpu();

#ifdef PAE
	KASSERTMSG(pmap == pmap_kernel(), "%s not allowed for PAE user pmaps", __func__);
#endif /* PAE */

	if (__predict_true(pmap != pmap_kernel()) &&
	    pmap != ci->ci_pmap) {
		/* User pmap changed. Nothing to do. */
		return;
	}

	/* Update per-cpu kpm */
	pmap_kpm_setpte(ci, pmap, index);
	pmap_pte_flush();
	return;
}

/*
 * Synchronise shadow pdir with the pmap on all cpus on which it is
 * loaded.
 */
void
xen_kpm_sync(struct pmap *pmap, int index)
{
	uint64_t where;

	KASSERT(pmap != NULL);

	pmap_pte_flush();

	if (__predict_false(xpq_cpu != &x86_curcpu)) { /* Too early to xcall */
		CPU_INFO_ITERATOR cii;
		struct cpu_info *ci;
		int s = splvm();
		for (CPU_INFO_FOREACH(cii, ci)) {
			if (ci == NULL) {
				continue;
			}
			if (pmap == pmap_kernel() ||
			    ci->ci_cpumask & pmap->pm_cpus) {
				pmap_kpm_setpte(ci, pmap, index);
			}
		}
		pmap_pte_flush();
		splx(s);
		return;
	}

	if (pmap == pmap_kernel()) {
		where = xc_broadcast(XC_HIGHPRI,
		    pmap_kpm_sync_xcall, pmap, &index);
		xc_wait(where);
	} else {
		KASSERT(mutex_owned(pmap->pm_lock));
		KASSERT(kpreempt_disabled());

		CPU_INFO_ITERATOR cii;
		struct cpu_info *ci;
		for (CPU_INFO_FOREACH(cii, ci)) {
			if (ci == NULL) {
				continue;
			}
			while (ci->ci_cpumask & pmap->pm_cpus) {
#ifdef MULTIPROCESSOR
#define CPU_IS_CURCPU(ci) __predict_false((ci) == curcpu())
#else /* MULTIPROCESSOR */
#define CPU_IS_CURCPU(ci) __predict_true((ci) == curcpu())
#endif /* MULTIPROCESSOR */
#if 0 /* XXX: Race with remote pmap_load() */
				if (ci->ci_want_pmapload &&
				    !CPU_IS_CURCPU(ci)) {
					/*
					 * XXX: make this more cpu
					 *  cycle friendly/co-operate
					 *  with pmap_load()
					 */
					continue;
				}
#endif /* 0 */
				where = xc_unicast(XC_HIGHPRI, pmap_kpm_sync_xcall,
				    pmap, &index, ci);
				xc_wait(where);
				break;
			}
		}
	}
}

#endif /* PAE || __x86_64__ */