/*	$NetBSD: xen_pmap.c,v 1.9 2011/11/20 19:41:27 jym Exp $	*/

/*
 * Copyright (c) 2007 Manuel Bouyer.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 */

/*
 * Copyright (c) 2006 Mathieu Ropert <mro (at) adviseo.fr>
 *
 * Permission to use, copy, modify, and distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 */

/*
 * Copyright (c) 1997 Charles D. Cranor and Washington University.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Copyright 2001 (c) Wasabi Systems, Inc.
 * All rights reserved.
 *
 * Written by Frank van der Linden for Wasabi Systems, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *      This product includes software developed for the NetBSD Project by
 *      Wasabi Systems, Inc.
 * 4. The name of Wasabi Systems, Inc. may not be used to endorse
 *    or promote products derived from this software without specific prior
 *    written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL WASABI SYSTEMS, INC
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: xen_pmap.c,v 1.9 2011/11/20 19:41:27 jym Exp $");

#include "opt_user_ldt.h"
#include "opt_lockdebug.h"
#include "opt_multiprocessor.h"
#include "opt_xen.h"
#if !defined(__x86_64__)
#include "opt_kstack_dr0.h"
#endif /* !defined(__x86_64__) */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/pool.h>
#include <sys/kernel.h>
#include <sys/atomic.h>
#include <sys/cpu.h>
#include <sys/intr.h>
#include <sys/xcall.h>

#include <uvm/uvm.h>

#include <dev/isa/isareg.h>

#include <machine/specialreg.h>
#include <machine/gdt.h>
#include <machine/isa_machdep.h>
#include <machine/cpuvar.h>

#include <x86/pmap.h>
#include <x86/pmap_pv.h>

#include <x86/i82489reg.h>
#include <x86/i82489var.h>

#ifdef XEN
#include <xen/xen3-public/xen.h>
#include <xen/hypervisor.h>
#endif

#define COUNT(x)        /* nothing */

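/*
 * Page directory lookup tables handed out by pmap_map_ptes() below:
 * normal_pdes goes with the regular recursive PTE window (PTE_BASE),
 * alternate_pdes with the alternate one (APTE_BASE).
 */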
static pd_entry_t * const alternate_pdes[] = APDES_INITIALIZER;
extern pd_entry_t * const normal_pdes[];

extern paddr_t pmap_pa_start; /* PA of first physical page for this domain */
extern paddr_t pmap_pa_end;   /* PA of last physical page for this domain */

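/*
 * pmap_apte_flush: flush the alternate PTE (APTE) mapping of the given
 * pmap from every CPU that is currently using it (see the comment in the
 * body below).
 */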
void
pmap_apte_flush(struct pmap *pmap)
{

        KASSERT(kpreempt_disabled());

        /*
         * Flush the APTE mapping from all other CPUs that
         * are using the pmap we are using (whose APTE space
         * is the one we've just modified).
         *
         * XXXthorpej -- find a way to defer the IPI.
         */
        pmap_tlb_shootdown(pmap, (vaddr_t)-1LL, 0, TLBSHOOT_APTE);
        pmap_tlb_shootnow();
}

/*
 * Unmap the content of APDP PDEs
 */
void
pmap_unmap_apdp(void)
{
        int i;

        for (i = 0; i < PDP_SIZE; i++) {
                pmap_pte_set(APDP_PDE+i, 0);
#if defined (PAE)
                /*
                 * For PAE, there are two places where alternative recursive
                 * mappings could be found with Xen:
                 * - in the L2 shadow pages
                 * - the "real" L2 kernel page (pmap_kl2pd), which is unique
                 *   and static.
                 * We first clear the APDP for the current pmap. As the L2
                 * kernel page is unique, we only need to do it once for all
                 * pmaps.
                 */
                pmap_pte_set(APDP_PDE_SHADOW+i, 0);
#endif
        }
}

/*
 * pmap_map_ptes: map a pmap's PTEs into KVM and lock them in
 *
 * => we lock enough pmaps to keep things locked in
 * => must be undone with pmap_unmap_ptes before returning
 */
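
/*
 * Typical calling sequence, shown only as a sketch (it mirrors
 * pmap_extract_ma() below; "va" here is a hypothetical virtual address
 * of interest):
 *
 *      struct pmap *pmap2;
 *      pt_entry_t *ptes;
 *      pd_entry_t * const *pdes;
 *
 *      kpreempt_disable();
 *      pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
 *      ... use ptes[pl1_i(va)] and pdes ...
 *      pmap_unmap_ptes(pmap, pmap2);
 *      kpreempt_enable();
 */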

void
pmap_map_ptes(struct pmap *pmap, struct pmap **pmap2,
    pd_entry_t **ptepp, pd_entry_t * const **pdeppp)
{
        pd_entry_t opde, npde;
        struct pmap *ourpmap;
        struct cpu_info *ci;
        struct lwp *l;
        bool iscurrent;
        uint64_t ncsw;
        int s;

        /* the kernel's pmap is always accessible */
        if (pmap == pmap_kernel()) {
                *pmap2 = NULL;
                *ptepp = PTE_BASE;
                *pdeppp = normal_pdes;
                return;
        }
        KASSERT(kpreempt_disabled());

 retry:
        l = curlwp;
        ncsw = l->l_ncsw;
        ourpmap = NULL;
        ci = curcpu();
#if defined(__x86_64__)
        /*
         * the current pmap can only be pmap_kernel(), so at this point
         * pmap_is_curpmap() is always false
         */
        iscurrent = 0;
        ourpmap = pmap_kernel();
#else /* __x86_64__*/
        if (ci->ci_want_pmapload &&
            vm_map_pmap(&l->l_proc->p_vmspace->vm_map) == pmap) {
                pmap_load();
                if (l->l_ncsw != ncsw)
                        goto retry;
        }
        iscurrent = pmap_is_curpmap(pmap);
        /* if curpmap then we are always mapped */
        if (iscurrent) {
                mutex_enter(pmap->pm_lock);
                *pmap2 = NULL;
                *ptepp = PTE_BASE;
                *pdeppp = normal_pdes;
                goto out;
        }
        ourpmap = ci->ci_pmap;
#endif /* __x86_64__ */

        /* need to lock both curpmap and pmap: use ordered locking */
        pmap_reference(ourpmap);
        if ((uintptr_t) pmap < (uintptr_t) ourpmap) {
                mutex_enter(pmap->pm_lock);
                mutex_enter(ourpmap->pm_lock);
        } else {
                mutex_enter(ourpmap->pm_lock);
                mutex_enter(pmap->pm_lock);
        }

        if (l->l_ncsw != ncsw)
                goto unlock_and_retry;

        /* need to load a new alternate pt space into curpmap? */
        COUNT(apdp_pde_map);
        opde = *APDP_PDE;
        if (!pmap_valid_entry(opde) ||
            pmap_pte2pa(opde) != pmap_pdirpa(pmap, 0)) {
                int i;
                s = splvm();
                /* Make recursive entry usable in user PGD */
                for (i = 0; i < PDP_SIZE; i++) {
                        npde = pmap_pa2pte(
                            pmap_pdirpa(pmap, i * NPDPG)) | PG_k | PG_V;
                        xpq_queue_pte_update(
                            xpmap_ptom(pmap_pdirpa(pmap, PDIR_SLOT_PTE + i)),
                            npde);
                        xpq_queue_pte_update(xpmap_ptetomach(&APDP_PDE[i]),
                            npde);
#ifdef PAE
                        /* update shadow entry too */
                        xpq_queue_pte_update(
                            xpmap_ptetomach(&APDP_PDE_SHADOW[i]), npde);
#endif /* PAE */
                        xpq_queue_invlpg(
                            (vaddr_t)&pmap->pm_pdir[PDIR_SLOT_PTE + i]);
                }
                if (pmap_valid_entry(opde))
                        pmap_apte_flush(ourpmap);
                splx(s);
        }
        *pmap2 = ourpmap;
        *ptepp = APTE_BASE;
        *pdeppp = alternate_pdes;
        KASSERT(l->l_ncsw == ncsw);
#if !defined(__x86_64__)
 out:
#endif
        /*
         * might have blocked, need to retry?
         */
        if (l->l_ncsw != ncsw) {
 unlock_and_retry:
                if (ourpmap != NULL) {
                        mutex_exit(ourpmap->pm_lock);
                        pmap_destroy(ourpmap);
                }
                mutex_exit(pmap->pm_lock);
                goto retry;
        }
}

/*
 * pmap_unmap_ptes: unlock the PTE mapping of "pmap"
 */

void
pmap_unmap_ptes(struct pmap *pmap, struct pmap *pmap2)
{

        if (pmap == pmap_kernel()) {
                return;
        }
        KASSERT(kpreempt_disabled());
        if (pmap2 == NULL) {
                mutex_exit(pmap->pm_lock);
        } else {
#if defined(__x86_64__)
                KASSERT(pmap2 == pmap_kernel());
#else
                KASSERT(curcpu()->ci_pmap == pmap2);
#endif
#if defined(MULTIPROCESSOR)
                pmap_unmap_apdp();
                pmap_pte_flush();
                pmap_apte_flush(pmap2);
#endif /* MULTIPROCESSOR */
                COUNT(apdp_pde_unmap);
                mutex_exit(pmap->pm_lock);
                mutex_exit(pmap2->pm_lock);
                pmap_destroy(pmap2);
        }
}

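/*
 * pmap_enter: enter a mapping given a physical address
 *
 * => the PA is translated to its machine address with xpmap_ptom() when it
 *    lies within this domain's RAM (pmap_pa_start .. pmap_pa_end); anything
 *    else (presumably device or foreign memory) is passed through unchanged
 * => the actual work is delegated to pmap_enter_ma()
 */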
int
pmap_enter(struct pmap *pmap, vaddr_t va, paddr_t pa, vm_prot_t prot, u_int flags)
{
        paddr_t ma;

        if (__predict_false(pa < pmap_pa_start || pmap_pa_end <= pa)) {
                ma = pa; /* XXX hack */
        } else {
                ma = xpmap_ptom(pa);
        }

        return pmap_enter_ma(pmap, va, ma, pa, prot, flags, DOMID_SELF);
}

/*
 * pmap_kenter_ma: enter a kernel mapping without R/M (pv_entry) tracking
 *
 * => no need to lock anything, assume va is already allocated
 * => should be faster than the normal pmap_enter() function
 * => we expect a MACHINE address
 */
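
/*
 * A minimal usage sketch ("va" and "ma" are assumed to be an already
 * allocated kernel virtual address and a machine address, respectively):
 *
 *      pmap_kenter_ma(va, ma, VM_PROT_READ | VM_PROT_WRITE, PMAP_NOCACHE);
 */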

void
pmap_kenter_ma(vaddr_t va, paddr_t ma, vm_prot_t prot, u_int flags)
{
        pt_entry_t *pte, opte, npte;

        if (va < VM_MIN_KERNEL_ADDRESS)
                pte = vtopte(va);
        else
                pte = kvtopte(va);

        npte = ma | ((prot & VM_PROT_WRITE) ? PG_RW : PG_RO) |
             PG_V | PG_k;
        if (flags & PMAP_NOCACHE)
                npte |= PG_N;

        if ((cpu_feature[2] & CPUID_NOX) && !(prot & VM_PROT_EXECUTE))
                npte |= PG_NX;

        opte = pmap_pte_testset(pte, npte); /* zap! */

        if (pmap_valid_entry(opte)) {
#if defined(MULTIPROCESSOR)
                kpreempt_disable();
                pmap_tlb_shootdown(pmap_kernel(), va, opte, TLBSHOOT_KENTER);
                kpreempt_enable();
#else
                /* Don't bother deferring in the single CPU case. */
                pmap_update_pg(va);
#endif
        }
}

/*
 * pmap_extract_ma: extract a MA for the given VA
 */
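
/*
 * A minimal usage sketch ("kva" is a hypothetical kernel virtual address):
 *
 *      paddr_t ma;
 *
 *      if (pmap_extract_ma(pmap_kernel(), kva, &ma))
 *              ... "ma" now holds the machine address backing "kva" ...
 */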

bool
pmap_extract_ma(struct pmap *pmap, vaddr_t va, paddr_t *pap)
{
        pt_entry_t *ptes, pte;
        pd_entry_t pde;
        pd_entry_t * const *pdes;
        struct pmap *pmap2;

        kpreempt_disable();
        pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
        if (!pmap_pdes_valid(va, pdes, &pde)) {
                pmap_unmap_ptes(pmap, pmap2);
                kpreempt_enable();
                return false;
        }

        pte = ptes[pl1_i(va)];
        pmap_unmap_ptes(pmap, pmap2);
        kpreempt_enable();

        if (__predict_true((pte & PG_V) != 0)) {
                if (pap != NULL)
                        *pap = (pte & PG_FRAME) | (va & (NBPD_L1 - 1));
                return true;
        }

        return false;
}

/*
 * Flush all APDP entries found in pmaps.
 * Required during Xen save/restore operations, as Xen does not
 * handle alternative recursive mappings properly.
 */
void
pmap_xen_suspend(void)
{
        int i;
        int s;
        struct pmap *pm;

        s = splvm();

        pmap_unmap_apdp();

        mutex_enter(&pmaps_lock);
        /*
         * Set APDP entries to 0 in all pmaps.
         * Note that for PAE kernels, this only clears the APDP entries
         * found in the L2 shadow pages, as pmap_pdirpa() is used to obtain
         * the PA of the pmap->pm_pdir[] pages (forming the 4 contiguous
         * pages of PAE PD: 3 for user space, 1 for the L2 kernel shadow page)
         */
        LIST_FOREACH(pm, &pmaps, pm_list) {
                for (i = 0; i < PDP_SIZE; i++) {
                        xpq_queue_pte_update(
                            xpmap_ptom(pmap_pdirpa(pm, PDIR_SLOT_APTE + i)),
                            0);
                }
        }
        mutex_exit(&pmaps_lock);

        xpq_flush_queue();

        splx(s);

#ifdef PAE
        pmap_unmap_recursive_entries();
#endif
}

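/*
 * pmap_xen_resume: re-establish the recursive mapping entries cleared by
 * pmap_xen_suspend() (PAE only; see pmap_map_recursive_entries() below).
 */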
void
pmap_xen_resume(void)
{
#ifdef PAE
        pmap_map_recursive_entries();
#endif
}

#ifdef PAE
/*
 * NetBSD uses L2 shadow pages to support PAE with Xen. However, Xen does not
 * handle them correctly during save/restore, leading to incorrect page
 * tracking and pinning during restore.
 * For save/restore to succeed, two functions are introduced:
 * - pmap_map_recursive_entries(), used by resume code to set the recursive
 *   mapping entries to their correct value
 * - pmap_unmap_recursive_entries(), used by suspend code to clear all
 *   PDIR_SLOT_PTE entries
 */
void
pmap_map_recursive_entries(void)
{
        int i;
        struct pmap *pm;

        mutex_enter(&pmaps_lock);
        LIST_FOREACH(pm, &pmaps, pm_list) {
                for (i = 0; i < PDP_SIZE; i++) {
                        xpq_queue_pte_update(
                            xpmap_ptom(pmap_pdirpa(pm, PDIR_SLOT_PTE + i)),
                            xpmap_ptom((pm)->pm_pdirpa[i]) | PG_V);
                }
        }
        mutex_exit(&pmaps_lock);

        for (i = 0; i < PDP_SIZE; i++) {
                xpq_queue_pte_update(
                    xpmap_ptom(pmap_pdirpa(pmap_kernel(), PDIR_SLOT_PTE + i)),
                    xpmap_ptom(pmap_kernel()->pm_pdirpa[i]) | PG_V);
        }

        xpq_flush_queue();
}

void
pmap_unmap_recursive_entries(void)
{
        int i;
        struct pmap *pm;

        /*
         * Invalidate pmap_pdp_cache as it contains L2-pinned objects with
         * recursive entries.
         * XXX jym@ : find a way to drain per-CPU caches too;
         * pool_cache_invalidate() does not do that.
         */
        pool_cache_invalidate(&pmap_pdp_cache);

        mutex_enter(&pmaps_lock);
        LIST_FOREACH(pm, &pmaps, pm_list) {
                for (i = 0; i < PDP_SIZE; i++) {
                        xpq_queue_pte_update(
                            xpmap_ptom(pmap_pdirpa(pm, PDIR_SLOT_PTE + i)), 0);
                }
        }
        mutex_exit(&pmaps_lock);

        /* do it for pmap_kernel() too! */
        for (i = 0; i < PDP_SIZE; i++)
                xpq_queue_pte_update(
                    xpmap_ptom(pmap_pdirpa(pmap_kernel(), PDIR_SLOT_PTE + i)),
                    0);

        xpq_flush_queue();

}
#endif /* PAE */