/* $NetBSD: x86_xpmap.c,v 1.38.2.5 2012/06/12 19:00:25 riz Exp $ */

/*
 * Copyright (c) 2006 Mathieu Ropert <mro (at) adviseo.fr>
 *
 * Permission to use, copy, modify, and distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 */

/*
 * Copyright (c) 2006, 2007 Manuel Bouyer.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 */

/*
 *
 * Copyright (c) 2004 Christian Limpach.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */


#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: x86_xpmap.c,v 1.38.2.5 2012/06/12 19:00:25 riz Exp $");

#include "opt_xen.h"
#include "opt_ddb.h"
#include "ksyms.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/mutex.h>
#include <sys/cpu.h>

#include <uvm/uvm.h>

#include <x86/pmap.h>
#include <machine/gdt.h>
#include <xen/xenfunc.h>

#include <dev/isa/isareg.h>
#include <machine/isa_machdep.h>

#undef XENDEBUG
/* #define XENDEBUG_SYNC */
/* #define XENDEBUG_LOW */

#ifdef XENDEBUG
#define XENPRINTF(x) printf x
#define XENPRINTK(x) printk x
#define XENPRINTK2(x) /* printk x */

static char XBUF[256];
#else
#define XENPRINTF(x)
#define XENPRINTK(x)
#define XENPRINTK2(x)
#endif
#define PRINTF(x) printf x
#define PRINTK(x) printk x

volatile shared_info_t *HYPERVISOR_shared_info;
/* Xen requires the start_info struct to be page aligned */
union start_info_union start_info_union __aligned(PAGE_SIZE);
unsigned long *xpmap_phys_to_machine_mapping;
kmutex_t pte_lock;

void xen_failsafe_handler(void);

#define HYPERVISOR_mmu_update_self(req, count, success_count) \
	HYPERVISOR_mmu_update((req), (count), (success_count), DOMID_SELF)

void
xen_failsafe_handler(void)
{

	panic("xen_failsafe_handler called!\n");
}


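/*
 * Remap the pages backing the LDT read-only (Xen will not accept an LDT
 * whose pages are writable), then queue the set-LDT operation.
 */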
void
xen_set_ldt(vaddr_t base, uint32_t entries)
{
	vaddr_t va;
	vaddr_t end;
	pt_entry_t *ptp;
	int s;

#ifdef __x86_64__
	end = base + (entries << 3);
#else
	end = base + entries * sizeof(union descriptor);
#endif

	for (va = base; va < end; va += PAGE_SIZE) {
		KASSERT(va >= VM_MIN_KERNEL_ADDRESS);
		ptp = kvtopte(va);
		XENPRINTF(("xen_set_ldt %#" PRIxVADDR " %d %p\n",
		    base, entries, ptp));
		pmap_pte_clearbits(ptp, PG_RW);
	}
	s = splvm();
	xpq_queue_set_ldt(base, entries);
	splx(s);
}

#ifdef XENDEBUG
void xpq_debug_dump(void);
#endif

#define XPQUEUE_SIZE 2048
static mmu_update_t xpq_queue_array[MAXCPUS][XPQUEUE_SIZE];
static int xpq_idx_array[MAXCPUS];

extern struct cpu_info * (*xpq_cpu)(void);

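/*
 * Submit all pending mmu_update requests queued on this CPU to the
 * hypervisor in one batch.  On failure, retry past the entries that
 * succeeded; if the hypervisor still rejects the batch, dump the per-CPU
 * queues and panic.
 */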
void
xpq_flush_queue(void)
{
	int i, ok = 0, ret;

	mmu_update_t *xpq_queue = xpq_queue_array[xpq_cpu()->ci_cpuid];
	int xpq_idx = xpq_idx_array[xpq_cpu()->ci_cpuid];

	XENPRINTK2(("flush queue %p entries %d\n", xpq_queue, xpq_idx));
	for (i = 0; i < xpq_idx; i++)
		XENPRINTK2(("%d: 0x%08" PRIx64 " 0x%08" PRIx64 "\n", i,
		    xpq_queue[i].ptr, xpq_queue[i].val));

retry:
	ret = HYPERVISOR_mmu_update_self(xpq_queue, xpq_idx, &ok);

	if (xpq_idx != 0 && ret < 0) {
		struct cpu_info *ci;
		CPU_INFO_ITERATOR cii;

		printf("xpq_flush_queue: %d entries (%d successful) on "
		    "cpu%d (%ld)\n",
		    xpq_idx, ok, xpq_cpu()->ci_index, xpq_cpu()->ci_cpuid);

		if (ok != 0) {
			xpq_queue += ok;
			xpq_idx -= ok;
			ok = 0;
			goto retry;
		}

		for (CPU_INFO_FOREACH(cii, ci)) {
			xpq_queue = xpq_queue_array[ci->ci_cpuid];
			xpq_idx = xpq_idx_array[ci->ci_cpuid];
			printf("cpu%d (%ld):\n", ci->ci_index, ci->ci_cpuid);
			for (i = 0; i < xpq_idx; i++) {
				printf(" 0x%016" PRIx64 ": 0x%016" PRIx64 "\n",
				    xpq_queue[i].ptr, xpq_queue[i].val);
			}
#ifdef __x86_64__
			for (i = 0; i < PDIR_SLOT_PTE; i++) {
				if (ci->ci_kpm_pdir[i] == 0)
					continue;
				printf(" kpm_pdir[%d]: 0x%" PRIx64 "\n",
				    i, ci->ci_kpm_pdir[i]);
			}
#endif
		}
		panic("HYPERVISOR_mmu_update failed, ret: %d\n", ret);
	}
	xpq_idx_array[xpq_cpu()->ci_cpuid] = 0;
}

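/* Bump this CPU's queue index; flush the queue once it fills up. */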
static inline void
xpq_increment_idx(void)
{

	if (__predict_false(++xpq_idx_array[xpq_cpu()->ci_cpuid] == XPQUEUE_SIZE))
		xpq_flush_queue();
}

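/* Queue an update of the machine-to-physical translation: ma maps to pa. */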
void
xpq_queue_machphys_update(paddr_t ma, paddr_t pa)
{

	mmu_update_t *xpq_queue = xpq_queue_array[xpq_cpu()->ci_cpuid];
	int xpq_idx = xpq_idx_array[xpq_cpu()->ci_cpuid];

	XENPRINTK2(("xpq_queue_machphys_update ma=0x%" PRIx64 " pa=0x%" PRIx64
	    "\n", (int64_t)ma, (int64_t)pa));

	xpq_queue[xpq_idx].ptr = ma | MMU_MACHPHYS_UPDATE;
	xpq_queue[xpq_idx].val = (pa - XPMAP_OFFSET) >> PAGE_SHIFT;
	xpq_increment_idx();
#ifdef XENDEBUG_SYNC
	xpq_flush_queue();
#endif
}

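/* Queue an update of the page-table entry at machine address ptr to val. */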
void
xpq_queue_pte_update(paddr_t ptr, pt_entry_t val)
{

	mmu_update_t *xpq_queue = xpq_queue_array[xpq_cpu()->ci_cpuid];
	int xpq_idx = xpq_idx_array[xpq_cpu()->ci_cpuid];

	KASSERT((ptr & 3) == 0);
	xpq_queue[xpq_idx].ptr = (paddr_t)ptr | MMU_NORMAL_PT_UPDATE;
	xpq_queue[xpq_idx].val = val;
	xpq_increment_idx();
#ifdef XENDEBUG_SYNC
	xpq_flush_queue();
#endif
}

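/* Switch the page-table base to the table at pa; flushes the queue first. */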
void
xpq_queue_pt_switch(paddr_t pa)
{
	struct mmuext_op op;
	xpq_flush_queue();

	XENPRINTK2(("xpq_queue_pt_switch: 0x%" PRIx64 " 0x%" PRIx64 "\n",
	    (int64_t)pa, (int64_t)pa));
	op.cmd = MMUEXT_NEW_BASEPTR;
	op.arg1.mfn = pa >> PAGE_SHIFT;
	if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0)
		panic("xpq_queue_pt_switch");
}

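/*
 * Pin the page at pa as a page-table page; lvl is the MMUEXT pin command
 * for the desired level.
 */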
void
xpq_queue_pin_table(paddr_t pa, int lvl)
{
	struct mmuext_op op;

	xpq_flush_queue();

	XENPRINTK2(("xpq_queue_pin_l%d_table: %#" PRIxPADDR "\n",
	    lvl + 1, pa));

	op.arg1.mfn = pa >> PAGE_SHIFT;
	op.cmd = lvl;

	if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0)
		panic("xpq_queue_pin_table");
}

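/* Unpin a previously pinned page-table page. */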
void
xpq_queue_unpin_table(paddr_t pa)
{
	struct mmuext_op op;

	xpq_flush_queue();

	XENPRINTK2(("xpq_queue_unpin_table: %#" PRIxPADDR "\n", pa));
	op.arg1.mfn = pa >> PAGE_SHIFT;
	op.cmd = MMUEXT_UNPIN_TABLE;
	if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0)
		panic("xpq_queue_unpin_table");
}

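/* Tell the hypervisor to use the LDT at va (page aligned), with `entries' entries. */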
void
xpq_queue_set_ldt(vaddr_t va, uint32_t entries)
{
	struct mmuext_op op;

	xpq_flush_queue();

	XENPRINTK2(("xpq_queue_set_ldt\n"));
	KASSERT(va == (va & ~PAGE_MASK));
	op.cmd = MMUEXT_SET_LDT;
	op.arg1.linear_addr = va;
	op.arg2.nr_ents = entries;
	if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0)
		panic("xpq_queue_set_ldt");
}

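/* Flush the local TLB. */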
void
xpq_queue_tlb_flush(void)
{
	struct mmuext_op op;

	xpq_flush_queue();

	XENPRINTK2(("xpq_queue_tlb_flush\n"));
	op.cmd = MMUEXT_TLB_FLUSH_LOCAL;
	if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0)
		panic("xpq_queue_tlb_flush");
}

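/* Flush the cache via an MMUEXT_FLUSH_CACHE hypervisor operation. */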
void
xpq_flush_cache(void)
{
	struct mmuext_op op;
	int s = splvm(), err;

	xpq_flush_queue();

	XENPRINTK2(("xpq_queue_flush_cache\n"));
	op.cmd = MMUEXT_FLUSH_CACHE;
	if ((err = HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF)) < 0) {
		panic("xpq_flush_cache, err %d", err);
	}
	splx(s); /* XXX: removeme */
}

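/* Invalidate a single page in the local TLB. */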
void
xpq_queue_invlpg(vaddr_t va)
{
	struct mmuext_op op;
	xpq_flush_queue();

	XENPRINTK2(("xpq_queue_invlpg %#" PRIxVADDR "\n", va));
	op.cmd = MMUEXT_INVLPG_LOCAL;
	op.arg1.linear_addr = (va & ~PAGE_MASK);
	if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0)
		panic("xpq_queue_invlpg");
}

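/* Invalidate a single page in the TLBs of the VCPUs named in kc. */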
void
xen_mcast_invlpg(vaddr_t va, kcpuset_t *kc)
{
	u_long xcpumask = 0;
	mmuext_op_t op;

	kcpuset_copybits(kc, &xcpumask, sizeof(xcpumask));

	/* Flush pending page updates */
	xpq_flush_queue();

	op.cmd = MMUEXT_INVLPG_MULTI;
	op.arg1.linear_addr = va;
	op.arg2.vcpumask = &xcpumask;

	if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0) {
		panic("xpq_queue_invlpg_all");
	}

	return;
}

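/* Invalidate a single page in the TLBs of all VCPUs. */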
void
xen_bcast_invlpg(vaddr_t va)
{
	mmuext_op_t op;

	/* Flush pending page updates */
	xpq_flush_queue();

	op.cmd = MMUEXT_INVLPG_ALL;
	op.arg1.linear_addr = va;

	if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0) {
		panic("xpq_queue_invlpg_all");
	}

	return;
}

/* This is a synchronous call. */
void
xen_mcast_tlbflush(kcpuset_t *kc)
{
	u_long xcpumask = 0;
	mmuext_op_t op;

	kcpuset_copybits(kc, &xcpumask, sizeof(xcpumask));

	/* Flush pending page updates */
	xpq_flush_queue();

	op.cmd = MMUEXT_TLB_FLUSH_MULTI;
	op.arg2.vcpumask = &xcpumask;

	if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0) {
		panic("xpq_queue_invlpg_all");
	}

	return;
}

/* This is a synchronous call. */
void
xen_bcast_tlbflush(void)
{
	mmuext_op_t op;

	/* Flush pending page updates */
	xpq_flush_queue();

	op.cmd = MMUEXT_TLB_FLUSH_ALL;

	if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0) {
		panic("xpq_queue_invlpg_all");
	}

	return;
}

/* This is a synchronous call. */
void
xen_vcpu_mcast_invlpg(vaddr_t sva, vaddr_t eva, kcpuset_t *kc)
{
	KASSERT(eva > sva);

	/* Flush pending page updates */
	xpq_flush_queue();

	/* Align to nearest page boundary */
	sva &= ~PAGE_MASK;
	eva &= ~PAGE_MASK;

	for ( ; sva <= eva; sva += PAGE_SIZE) {
		xen_mcast_invlpg(sva, kc);
	}

	return;
}

/* This is a synchronous call. */
void
xen_vcpu_bcast_invlpg(vaddr_t sva, vaddr_t eva)
{
	KASSERT(eva > sva);

	/* Flush pending page updates */
	xpq_flush_queue();

	/* Align to nearest page boundary */
	sva &= ~PAGE_MASK;
	eva &= ~PAGE_MASK;

	for ( ; sva <= eva; sva += PAGE_SIZE) {
		xen_bcast_invlpg(sva);
	}

	return;
}

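/*
 * Update a PTE on behalf of another domain (synchronous).  Returns 0 on
 * success, EFAULT if the hypervisor rejects the update.
 */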
int
xpq_update_foreign(paddr_t ptr, pt_entry_t val, int dom)
{
	mmu_update_t op;
	int ok;

	xpq_flush_queue();

	op.ptr = ptr;
	op.val = val;
	if (HYPERVISOR_mmu_update(&op, 1, &ok, dom) < 0)
		return EFAULT;
	return (0);
}

#ifdef XENDEBUG
void
xpq_debug_dump(void)
{
	int i;

	mmu_update_t *xpq_queue = xpq_queue_array[xpq_cpu()->ci_cpuid];
	int xpq_idx = xpq_idx_array[xpq_cpu()->ci_cpuid];

	XENPRINTK2(("idx: %d\n", xpq_idx));
	for (i = 0; i < xpq_idx; i++) {
		snprintf(XBUF, sizeof(XBUF), "%" PRIx64 " %08" PRIx64,
		    xpq_queue[i].ptr, xpq_queue[i].val);
		if (++i < xpq_idx)
			snprintf(XBUF + strlen(XBUF),
			    sizeof(XBUF) - strlen(XBUF),
			    "%" PRIx64 " %08" PRIx64,
			    xpq_queue[i].ptr, xpq_queue[i].val);
		if (++i < xpq_idx)
			snprintf(XBUF + strlen(XBUF),
			    sizeof(XBUF) - strlen(XBUF),
			    "%" PRIx64 " %08" PRIx64,
			    xpq_queue[i].ptr, xpq_queue[i].val);
		if (++i < xpq_idx)
			snprintf(XBUF + strlen(XBUF),
			    sizeof(XBUF) - strlen(XBUF),
			    "%" PRIx64 " %08" PRIx64,
			    xpq_queue[i].ptr, xpq_queue[i].val);
		XENPRINTK2(("%d: %s\n", xpq_idx, XBUF));
	}
}
#endif


extern volatile struct xencons_interface *xencons_interface; /* XXX */
extern struct xenstore_domain_interface *xenstore_interface; /* XXX */

static void xen_bt_set_readonly (vaddr_t);
static void xen_bootstrap_tables (vaddr_t, vaddr_t, int, int, int);

/* How many PDEs? */
#if L2_SLOT_KERNBASE > 0
#define TABLE_L2_ENTRIES (2 * (NKL2_KIMG_ENTRIES + 1))
#else
#define TABLE_L2_ENTRIES (NKL2_KIMG_ENTRIES + 1)
#endif

/*
 * Construct and switch to new pagetables.
 * first_avail is the first vaddr we can use after
 * we get rid of the Xen pagetables.
 */

vaddr_t	xen_pmap_bootstrap (void);

/*
 * Function to get rid of Xen bootstrap tables
 */

/* How many PDPs do we need: */
#ifdef PAE
/*
 * For PAE, we consider a single contiguous L2 "superpage" of 4 pages,
 * all of them mapped by the L3 page. We also need a shadow page
 * for L3[3].
 */
static const int l2_4_count = 6;
#elif defined(__x86_64__)
static const int l2_4_count = PTP_LEVELS;
#else
static const int l2_4_count = PTP_LEVELS - 1;
#endif

vaddr_t
xen_pmap_bootstrap(void)
{
	int count, oldcount;
	long mapsize;
	vaddr_t bootstrap_tables, init_tables;

	memset(xpq_idx_array, 0, sizeof xpq_idx_array);

	xpmap_phys_to_machine_mapping =
	    (unsigned long *)xen_start_info.mfn_list;
	init_tables = xen_start_info.pt_base;
	__PRINTK(("xen_arch_pmap_bootstrap init_tables=0x%lx\n", init_tables));

	/* Space after the Xen bootstrap tables should be free */
	bootstrap_tables = xen_start_info.pt_base +
	    (xen_start_info.nr_pt_frames * PAGE_SIZE);

	/*
	 * Calculate how much space we need:
	 * first, everything mapped before the Xen bootstrap tables.
	 */
	mapsize = init_tables - KERNTEXTOFF;
	/* after the tables we'll have:
	 * - UAREA
	 * - dummy user PGD (x86_64)
	 * - HYPERVISOR_shared_info
	 * - early_zerop
	 * - ISA I/O mem (if needed)
	 */
	mapsize += UPAGES * NBPG;
#ifdef __x86_64__
	mapsize += NBPG;
#endif
	mapsize += NBPG;
	mapsize += NBPG;

#ifdef DOM0OPS
	if (xendomain_is_dom0()) {
		/* space for ISA I/O mem */
		mapsize += IOM_SIZE;
	}
#endif
	/* at this point mapsize doesn't include the table size */

#ifdef __x86_64__
	count = TABLE_L2_ENTRIES;
#else
	count = (mapsize + (NBPD_L2 -1)) >> L2_SHIFT;
#endif /* __x86_64__ */

	/* now compute how many L2 pages we need exactly */
	XENPRINTK(("bootstrap_final mapsize 0x%lx count %d\n", mapsize, count));
	while (mapsize + (count + l2_4_count) * PAGE_SIZE + KERNTEXTOFF >
	    ((long)count << L2_SHIFT) + KERNBASE) {
		count++;
	}
#ifndef __x86_64__
	/*
	 * one more L2 page: we'll allocate several pages after kva_start
	 * in pmap_bootstrap() before pmap_growkernel(), which have not been
	 * counted here. It's not a big issue to allocate one more L2 as
	 * pmap_growkernel() will be called anyway.
	 */
	count++;
	nkptp[1] = count;
#endif

	/*
	 * Install the bootstrap tables. They may need more L2 pages than
	 * the final tables will, as they are installed after the final
	 * tables.
	 */
	oldcount = count;

bootstrap_again:
	XENPRINTK(("bootstrap_again oldcount %d\n", oldcount));
	/*
	 * The Xen space we'll reclaim may not be enough for our new page
	 * tables; move the bootstrap tables if necessary.
	 */
	if (bootstrap_tables < init_tables + ((count + l2_4_count) * PAGE_SIZE))
		bootstrap_tables = init_tables +
		    ((count + l2_4_count) * PAGE_SIZE);
	/* make sure we have enough to map the bootstrap_tables */
	if (bootstrap_tables + ((oldcount + l2_4_count) * PAGE_SIZE) >
	    ((long)oldcount << L2_SHIFT) + KERNBASE) {
		oldcount++;
		goto bootstrap_again;
	}

	/* Create temporary tables */
	xen_bootstrap_tables(xen_start_info.pt_base, bootstrap_tables,
	    xen_start_info.nr_pt_frames, oldcount, 0);

	/* Create final tables */
	xen_bootstrap_tables(bootstrap_tables, init_tables,
	    oldcount + l2_4_count, count, 1);

	/* zero out free space after tables */
	memset((void *)(init_tables + ((count + l2_4_count) * PAGE_SIZE)), 0,
	    (UPAGES + 1) * NBPG);

	/* Finally, flush TLB. */
	xpq_queue_tlb_flush();

	return (init_tables + ((count + l2_4_count) * PAGE_SIZE));
}


/*
 * Build a new table and switch to it.
 * old_count is the # of old tables (including PGD, PDTPE and PDE);
 * new_count is the # of new tables (PTE only).
 * We assume the areas don't overlap.
 */
static void
xen_bootstrap_tables (vaddr_t old_pgd, vaddr_t new_pgd,
	int old_count, int new_count, int final)
{
	pd_entry_t *pdtpe, *pde, *pte;
	pd_entry_t *cur_pgd, *bt_pgd;
	paddr_t addr;
	vaddr_t page, avail, text_end, map_end;
	int i;
	extern char __data_start;
	extern char *early_zerop; /* from pmap.c */

	__PRINTK(("xen_bootstrap_tables(%#" PRIxVADDR ", %#" PRIxVADDR ","
	    " %d, %d)\n",
	    old_pgd, new_pgd, old_count, new_count));
	text_end = ((vaddr_t)&__data_start) & ~PAGE_MASK;
	/*
	 * size of R/W area after kernel text:
	 *  xencons_interface (if present)
	 *  xenstore_interface (if present)
	 *  table pages (new_count + l2_4_count entries)
	 * extra mappings (only when final is true):
	 *  UAREA
	 *  dummy user PGD (x86_64 only)/gdt page (i386 only)
	 *  HYPERVISOR_shared_info
	 *  early_zerop
	 *  ISA I/O mem (if needed)
	 */
	map_end = new_pgd + ((new_count + l2_4_count) * NBPG);
	if (final) {
		map_end += (UPAGES + 1) * NBPG;
		HYPERVISOR_shared_info = (shared_info_t *)map_end;
		map_end += NBPG;
		early_zerop = (char *)map_end;
		map_end += NBPG;
	}
	/*
	 * We always set atdevbase, as it's used by init386 to find the first
	 * available VA. map_end is updated only if we are dom0, so
	 * atdevbase -> atdevbase + IOM_SIZE will be mapped only in
	 * this case.
	 */
	if (final)
		atdevbase = map_end;
#ifdef DOM0OPS
	if (final && xendomain_is_dom0()) {
		/* ISA I/O mem */
		map_end += IOM_SIZE;
	}
#endif /* DOM0OPS */

	__PRINTK(("xen_bootstrap_tables text_end 0x%lx map_end 0x%lx\n",
	    text_end, map_end));
	__PRINTK(("console %#lx ", xen_start_info.console_mfn));
	__PRINTK(("xenstore %#" PRIx32 "\n", xen_start_info.store_mfn));

	/*
	 * Create the bootstrap page tables. What we need:
	 * - a PGD (level 4)
	 * - a PDTPE (level 3)
	 * - a PDE (level 2)
	 * - some PTEs (level 1)
	 */

	cur_pgd = (pd_entry_t *) old_pgd;
	bt_pgd = (pd_entry_t *) new_pgd;
	memset (bt_pgd, 0, PAGE_SIZE);
	avail = new_pgd + PAGE_SIZE;
#if PTP_LEVELS > 3
	/* per-cpu L4 PD */
	pd_entry_t *bt_cpu_pgd = bt_pgd;
	/* pmap_kernel() "shadow" L4 PD */
	bt_pgd = (pd_entry_t *) avail;
	memset(bt_pgd, 0, PAGE_SIZE);
	avail += PAGE_SIZE;

	/* Install level 3 */
	pdtpe = (pd_entry_t *) avail;
	memset (pdtpe, 0, PAGE_SIZE);
	avail += PAGE_SIZE;

	addr = ((u_long) pdtpe) - KERNBASE;
	bt_pgd[pl4_pi(KERNTEXTOFF)] = bt_cpu_pgd[pl4_pi(KERNTEXTOFF)] =
	    xpmap_ptom_masked(addr) | PG_k | PG_RW | PG_V;

	__PRINTK(("L3 va %#lx pa %#" PRIxPADDR " entry %#" PRIxPADDR
	    " -> L4[%#x]\n",
	    pdtpe, addr, bt_pgd[pl4_pi(KERNTEXTOFF)], pl4_pi(KERNTEXTOFF)));
#else
	pdtpe = bt_pgd;
#endif /* PTP_LEVELS > 3 */

#if PTP_LEVELS > 2
	/* Level 2 */
	pde = (pd_entry_t *) avail;
	memset(pde, 0, PAGE_SIZE);
	avail += PAGE_SIZE;

	addr = ((u_long) pde) - KERNBASE;
	pdtpe[pl3_pi(KERNTEXTOFF)] =
	    xpmap_ptom_masked(addr) | PG_k | PG_V | PG_RW;
	__PRINTK(("L2 va %#lx pa %#" PRIxPADDR " entry %#" PRIxPADDR
	    " -> L3[%#x]\n",
	    pde, addr, pdtpe[pl3_pi(KERNTEXTOFF)], pl3_pi(KERNTEXTOFF)));
#elif defined(PAE)
	/* our PAE-style level 2: 5 contiguous pages (4 L2 + 1 shadow) */
	pde = (pd_entry_t *) avail;
	memset(pde, 0, PAGE_SIZE * 5);
	avail += PAGE_SIZE * 5;
	addr = ((u_long) pde) - KERNBASE;
	/*
	 * enter L2 pages in the L3.
	 * The real L2 kernel PD will be the last one (so that
	 * pde[L2_SLOT_KERN] always points to the shadow).
	 */
	for (i = 0; i < 3; i++, addr += PAGE_SIZE) {
		/*
		 * Xen doesn't want R/W mappings in L3 entries, it'll add
		 * them itself.
		 */
		pdtpe[i] = xpmap_ptom_masked(addr) | PG_k | PG_V;
		__PRINTK(("L2 va %#lx pa %#" PRIxPADDR " entry %#" PRIxPADDR
		    " -> L3[%#x]\n",
		    (vaddr_t)pde + PAGE_SIZE * i, addr, pdtpe[i], i));
	}
	addr += PAGE_SIZE;
	pdtpe[3] = xpmap_ptom_masked(addr) | PG_k | PG_V;
	__PRINTK(("L2 va %#lx pa %#" PRIxPADDR " entry %#" PRIxPADDR
	    " -> L3[%#x]\n",
	    (vaddr_t)pde + PAGE_SIZE * 4, addr, pdtpe[3], 3));

#else /* PAE */
	pde = bt_pgd;
#endif /* PTP_LEVELS > 2 */

	/* Level 1 */
	page = KERNTEXTOFF;
	for (i = 0; i < new_count; i ++) {
		vaddr_t cur_page = page;

		pte = (pd_entry_t *) avail;
		avail += PAGE_SIZE;

		memset(pte, 0, PAGE_SIZE);
		while (pl2_pi(page) == pl2_pi (cur_page)) {
			if (page >= map_end) {
				/* not mapped at all */
				pte[pl1_pi(page)] = 0;
				page += PAGE_SIZE;
				continue;
			}
			pte[pl1_pi(page)] = xpmap_ptom_masked(page - KERNBASE);
			if (page == (vaddr_t)HYPERVISOR_shared_info) {
				pte[pl1_pi(page)] = xen_start_info.shared_info;
				__PRINTK(("HYPERVISOR_shared_info "
				    "va %#lx pte %#" PRIxPADDR "\n",
				    HYPERVISOR_shared_info, pte[pl1_pi(page)]));
			}
			if ((xpmap_ptom_masked(page - KERNBASE) >> PAGE_SHIFT)
			    == xen_start_info.console.domU.mfn) {
				xencons_interface = (void *)page;
				pte[pl1_pi(page)] = xen_start_info.console_mfn;
				pte[pl1_pi(page)] <<= PAGE_SHIFT;
				__PRINTK(("xencons_interface "
				    "va %#lx pte %#" PRIxPADDR "\n",
				    xencons_interface, pte[pl1_pi(page)]));
			}
			if ((xpmap_ptom_masked(page - KERNBASE) >> PAGE_SHIFT)
			    == xen_start_info.store_mfn) {
				xenstore_interface = (void *)page;
				pte[pl1_pi(page)] = xen_start_info.store_mfn;
				pte[pl1_pi(page)] <<= PAGE_SHIFT;
				__PRINTK(("xenstore_interface "
				    "va %#lx pte %#" PRIxPADDR "\n",
				    xenstore_interface, pte[pl1_pi(page)]));
			}
#ifdef DOM0OPS
			if (page >= (vaddr_t)atdevbase &&
			    page < (vaddr_t)atdevbase + IOM_SIZE) {
				pte[pl1_pi(page)] =
				    IOM_BEGIN + (page - (vaddr_t)atdevbase);
			}
#endif
			pte[pl1_pi(page)] |= PG_k | PG_V;
			if (page < text_end) {
				/* map kernel text RO */
				pte[pl1_pi(page)] |= 0;
			} else if (page >= old_pgd
			    && page < old_pgd + (old_count * PAGE_SIZE)) {
				/* map old page tables RO */
				pte[pl1_pi(page)] |= 0;
			} else if (page >= new_pgd &&
			    page < new_pgd + ((new_count + l2_4_count) * PAGE_SIZE)) {
				/* map new page tables RO */
				pte[pl1_pi(page)] |= 0;
			} else {
				/* map page RW */
				pte[pl1_pi(page)] |= PG_RW;
			}

			if ((page >= old_pgd && page < old_pgd + (old_count * PAGE_SIZE))
			    || page >= new_pgd) {
				__PRINTK(("va %#lx pa %#lx "
				    "entry 0x%" PRIxPADDR " -> L1[%#x]\n",
				    page, page - KERNBASE,
				    pte[pl1_pi(page)], pl1_pi(page)));
			}
			page += PAGE_SIZE;
		}

		addr = ((u_long) pte) - KERNBASE;
		pde[pl2_pi(cur_page)] =
		    xpmap_ptom_masked(addr) | PG_k | PG_RW | PG_V;
		__PRINTK(("L1 va %#lx pa %#" PRIxPADDR " entry %#" PRIxPADDR
		    " -> L2[%#x]\n",
		    pte, addr, pde[pl2_pi(cur_page)], pl2_pi(cur_page)));
		/* Mark readonly */
		xen_bt_set_readonly((vaddr_t) pte);
	}

	/* Install recursive page tables mapping */
#ifdef PAE
	/*
	 * We need a shadow page for the kernel's L2 page.
	 * The real L2 kernel PD will be the last one (so that
	 * pde[L2_SLOT_KERN] always points to the shadow).
	 */
	memcpy(&pde[L2_SLOT_KERN + NPDPG], &pde[L2_SLOT_KERN], PAGE_SIZE);
	cpu_info_primary.ci_kpm_pdir = &pde[L2_SLOT_KERN + NPDPG];
	cpu_info_primary.ci_kpm_pdirpa =
	    (vaddr_t) cpu_info_primary.ci_kpm_pdir - KERNBASE;

	/*
	 * We don't enter a recursive entry from the L3 PD. Instead,
	 * we enter the first 4 L2 pages, which include the kernel's L2
	 * shadow. But we have to enter the shadow after switching
	 * %cr3, or Xen will refcount some PTE with the wrong type.
	 */
	addr = (u_long)pde - KERNBASE;
	for (i = 0; i < 3; i++, addr += PAGE_SIZE) {
		pde[PDIR_SLOT_PTE + i] = xpmap_ptom_masked(addr) | PG_k | PG_V;
		__PRINTK(("pde[%d] va %#" PRIxVADDR " pa %#" PRIxPADDR
		    " entry %#" PRIxPADDR "\n",
		    (int)(PDIR_SLOT_PTE + i), pde + PAGE_SIZE * i,
		    addr, pde[PDIR_SLOT_PTE + i]));
	}
#if 0
	addr += PAGE_SIZE; /* point to shadow L2 */
	pde[PDIR_SLOT_PTE + 3] = xpmap_ptom_masked(addr) | PG_k | PG_V;
	__PRINTK(("pde[%d] va 0x%lx pa 0x%lx entry 0x%" PRIx64 "\n",
	    (int)(PDIR_SLOT_PTE + 3), pde + PAGE_SIZE * 4, (long)addr,
	    (int64_t)pde[PDIR_SLOT_PTE + 3]));
#endif
	/* Mark tables RO, and pin the kernel's shadow as L2 */
	addr = (u_long)pde - KERNBASE;
	for (i = 0; i < 5; i++, addr += PAGE_SIZE) {
		xen_bt_set_readonly(((vaddr_t)pde) + PAGE_SIZE * i);
		if (i == 2 || i == 3)
			continue;
#if 0
		__PRINTK(("pin L2 %d addr 0x%" PRIx64 "\n", i, (int64_t)addr));
		xpq_queue_pin_l2_table(xpmap_ptom_masked(addr));
#endif
	}
	if (final) {
		addr = (u_long)pde - KERNBASE + 3 * PAGE_SIZE;
		__PRINTK(("pin L2 %d addr %#" PRIxPADDR "\n", 2, addr));
		xpq_queue_pin_l2_table(xpmap_ptom_masked(addr));
	}
#if 0
	addr = (u_long)pde - KERNBASE + 2 * PAGE_SIZE;
	__PRINTK(("pin L2 %d addr 0x%" PRIx64 "\n", 2, (int64_t)addr));
	xpq_queue_pin_l2_table(xpmap_ptom_masked(addr));
#endif
#else /* PAE */
	/* recursive entry in higher-level per-cpu PD and pmap_kernel() */
	bt_pgd[PDIR_SLOT_PTE] = xpmap_ptom_masked((paddr_t)bt_pgd - KERNBASE) | PG_k | PG_V;
#ifdef __x86_64__
	bt_cpu_pgd[PDIR_SLOT_PTE] =
	    xpmap_ptom_masked((paddr_t)bt_cpu_pgd - KERNBASE) | PG_k | PG_V;
#endif /* __x86_64__ */
	__PRINTK(("bt_pgd[PDIR_SLOT_PTE] va %#" PRIxVADDR " pa %#" PRIxPADDR
	    " entry %#" PRIxPADDR "\n", new_pgd, (paddr_t)new_pgd - KERNBASE,
	    bt_pgd[PDIR_SLOT_PTE]));
	/* Mark tables RO */
	xen_bt_set_readonly((vaddr_t) pde);
#endif
#if PTP_LEVELS > 2 || defined(PAE)
	xen_bt_set_readonly((vaddr_t) pdtpe);
#endif
#if PTP_LEVELS > 3
	xen_bt_set_readonly(new_pgd);
#endif
	/* Pin the PGD */
	__PRINTK(("pin PGD: %"PRIxVADDR"\n", new_pgd - KERNBASE));
#ifdef __x86_64__
	xpq_queue_pin_l4_table(xpmap_ptom_masked(new_pgd - KERNBASE));
#elif PAE
	xpq_queue_pin_l3_table(xpmap_ptom_masked(new_pgd - KERNBASE));
#else
	xpq_queue_pin_l2_table(xpmap_ptom_masked(new_pgd - KERNBASE));
#endif

	/* Save phys. addr of PDP, for libkvm. */
#ifdef PAE
	PDPpaddr = (u_long)pde - KERNBASE; /* PDP is the L2 with PAE */
#else
	PDPpaddr = (u_long)bt_pgd - KERNBASE;
#endif

	/* Switch to new tables */
	__PRINTK(("switch to PGD\n"));
	xpq_queue_pt_switch(xpmap_ptom_masked(new_pgd - KERNBASE));
	__PRINTK(("bt_pgd[PDIR_SLOT_PTE] now entry %#" PRIxPADDR "\n",
	    bt_pgd[PDIR_SLOT_PTE]));

#ifdef PAE
	if (final) {
		/* save the address of the L3 page */
		cpu_info_primary.ci_pae_l3_pdir = pdtpe;
		cpu_info_primary.ci_pae_l3_pdirpa = (new_pgd - KERNBASE);

		/* now enter kernel's PTE mappings */
		addr = (u_long)pde - KERNBASE + PAGE_SIZE * 3;
		xpq_queue_pte_update(
		    xpmap_ptom(((vaddr_t)&pde[PDIR_SLOT_PTE + 3]) - KERNBASE),
		    xpmap_ptom_masked(addr) | PG_k | PG_V);
		xpq_flush_queue();
	}
#elif defined(__x86_64__)
	if (final) {
		/* save the address of the real per-cpu L4 pgd page */
		cpu_info_primary.ci_kpm_pdir = bt_cpu_pgd;
		cpu_info_primary.ci_kpm_pdirpa = ((paddr_t) bt_cpu_pgd - KERNBASE);
	}
#endif

	/* Now we can safely reclaim space taken by old tables */

	__PRINTK(("unpin old PGD\n"));
	/* Unpin old PGD */
	xpq_queue_unpin_table(xpmap_ptom_masked(old_pgd - KERNBASE));
	/* Mark old tables RW */
	page = old_pgd;
	addr = (paddr_t) pde[pl2_pi(page)] & PG_FRAME;
	addr = xpmap_mtop(addr);
	pte = (pd_entry_t *) ((u_long)addr + KERNBASE);
	pte += pl1_pi(page);
	__PRINTK(("*pde %#" PRIxPADDR " addr %#" PRIxPADDR " pte %#lx\n",
	    pde[pl2_pi(page)], addr, (long)pte));
	while (page < old_pgd + (old_count * PAGE_SIZE) && page < map_end) {
		addr = xpmap_ptom(((u_long) pte) - KERNBASE);
		XENPRINTK(("addr %#" PRIxPADDR " pte %#lx "
		    "*pte %#" PRIxPADDR "\n",
		    addr, (long)pte, *pte));
		xpq_queue_pte_update(addr, *pte | PG_RW);
		page += PAGE_SIZE;
		/*
		 * Our ptes are contiguous
		 * so it's safe to just "++" here
		 */
		pte++;
	}
	xpq_flush_queue();
}


/*
 * Bootstrap helper functions
 */

/*
 * Mark a page readonly
 * XXX: assuming vaddr = paddr + KERNBASE
 */

static void
xen_bt_set_readonly (vaddr_t page)
{
	pt_entry_t entry;

	entry = xpmap_ptom_masked(page - KERNBASE);
	entry |= PG_k | PG_V;

	HYPERVISOR_update_va_mapping (page, entry, UVMF_INVLPG);
}

#ifdef __x86_64__
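/*
 * Install a new user page directory (MMUEXT_NEW_USER_BASEPTR) on the
 * current VCPU.
 */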
void
xen_set_user_pgd(paddr_t page)
{
	struct mmuext_op op;
	int s = splvm();

	xpq_flush_queue();
	op.cmd = MMUEXT_NEW_USER_BASEPTR;
	op.arg1.mfn = pfn_to_mfn(page >> PAGE_SHIFT);
	if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0)
		panic("xen_set_user_pgd: failed to install new user page"
		    " directory %#" PRIxPADDR, page);
	splx(s);
}
#endif /* __x86_64__ */
