/*	$NetBSD: x86_xpmap.c,v 1.12.4.1 2009/02/09 00:03:55 jym Exp $	*/
2
3 /*
4 * Copyright (c) 2006 Mathieu Ropert <mro (at) adviseo.fr>
5 *
6 * Permission to use, copy, modify, and distribute this software for any
7 * purpose with or without fee is hereby granted, provided that the above
8 * copyright notice and this permission notice appear in all copies.
9 *
10 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
11 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
13 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
15 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
16 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17 */
18
19 /*
20 * Copyright (c) 2006, 2007 Manuel Bouyer.
21 *
22 * Redistribution and use in source and binary forms, with or without
23 * modification, are permitted provided that the following conditions
24 * are met:
25 * 1. Redistributions of source code must retain the above copyright
26 * notice, this list of conditions and the following disclaimer.
27 * 2. Redistributions in binary form must reproduce the above copyright
28 * notice, this list of conditions and the following disclaimer in the
29 * documentation and/or other materials provided with the distribution.
30 * 3. All advertising materials mentioning features or use of this software
31 * must display the following acknowledgement:
32 * This product includes software developed by Manuel Bouyer.
33 * 4. The name of the author may not be used to endorse or promote products
34 * derived from this software without specific prior written permission.
35 *
36 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
37 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
38 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
39 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
40 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
41 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
42 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
43 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
44 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
45 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
46 *
47 */
48
49 /*
50 *
51 * Copyright (c) 2004 Christian Limpach.
52 * All rights reserved.
53 *
54 * Redistribution and use in source and binary forms, with or without
55 * modification, are permitted provided that the following conditions
56 * are met:
57 * 1. Redistributions of source code must retain the above copyright
58 * notice, this list of conditions and the following disclaimer.
59 * 2. Redistributions in binary form must reproduce the above copyright
60 * notice, this list of conditions and the following disclaimer in the
61 * documentation and/or other materials provided with the distribution.
62 * 3. All advertising materials mentioning features or use of this software
63 * must display the following acknowledgement:
64 * This product includes software developed by Christian Limpach.
65 * 4. The name of the author may not be used to endorse or promote products
66 * derived from this software without specific prior written permission.
67 *
68 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
69 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
70 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
71 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
72 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
73 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
74 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
75 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
76 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
77 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
78 */
79
80
81 #include <sys/cdefs.h>
82 __KERNEL_RCSID(0, "$NetBSD: x86_xpmap.c,v 1.12.4.1 2009/02/09 00:03:55 jym Exp $");
83
84 #include "opt_xen.h"
85 #include "opt_ddb.h"
86 #include "ksyms.h"
87
88 #include <sys/param.h>
89 #include <sys/systm.h>
90 #include <sys/rwlock.h>
91
92 #include <uvm/uvm.h>
93
94 #include <machine/pmap.h>
95 #include <machine/gdt.h>
96 #include <xen/xenfunc.h>
97
98 #include <dev/isa/isareg.h>
99 #include <machine/isa_machdep.h>
100
101 #undef XENDEBUG
102 /* #define XENDEBUG_SYNC */
103 /* #define XENDEBUG_LOW */
104
105 #ifdef XENDEBUG
106 #define XENPRINTF(x) printf x
107 #define XENPRINTK(x) printk x
108 #define XENPRINTK2(x) /* printk x */
109
110 static char XBUF[256];
111 #else
112 #define XENPRINTF(x)
113 #define XENPRINTK(x)
114 #define XENPRINTK2(x)
115 #endif
116 #define PRINTF(x) printf x
117 #define PRINTK(x) printk x
118
119 /* on x86_64 kernel runs in ring 3 */
120 #ifdef __x86_64__
121 #define PG_k PG_u
122 #else
123 #define PG_k 0
124 #endif
125
126 volatile shared_info_t *HYPERVISOR_shared_info;
127 /* Xen requires the start_info struct to be page aligned */
128 union start_info_union start_info_union __aligned(PAGE_SIZE);
129 unsigned long *xpmap_phys_to_machine_mapping;
130
/*
 * The domU must not manipulate MFNs while it is suspending or
 * migrating, as they could become invalid once the domU resumes
 * operation.
 *
 * We use a read/write lock for that: a thread that needs to
 * manipulate MFNs first acquires the reader lock, performs its MFN
 * manipulations, and releases the reader lock once it is done.
 *
 * The thread responsible for the domU suspension acquires an
 * exclusive (writer) lock.
 */
143 static krwlock_t xen_ptom_lock;
144
void
xen_init_ptom_lock(void)
{
	xen_suspending = 0;
	rw_init(&xen_ptom_lock);
}

void
xen_release_ptom_lock(void)
{
	rw_exit(&xen_ptom_lock);
}

void
xen_acquire_reader_ptom_lock(void)
{
	rw_enter(&xen_ptom_lock, RW_READER);
}

void
xen_acquire_writer_ptom_lock(void)
{
	rw_enter(&xen_ptom_lock, RW_WRITER);
}
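
/*
 * Usage sketch (illustrative; not a function from this file): code
 * that translates PFNs to MFNs and hands the MFNs to the hypervisor
 * should hold the reader lock across both steps, e.g.:
 *
 *	xen_acquire_reader_ptom_lock();
 *	mfn = pfn_to_mfn(pfn);
 *	... pass mfn to a hypercall ...
 *	xen_release_ptom_lock();
 *
 * pfn and mfn are hypothetical local variables of the caller.
 */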
165
166 void xen_failsafe_handler(void);
167
168 #ifdef XEN3
169 #define HYPERVISOR_mmu_update_self(req, count, success_count) \
170 HYPERVISOR_mmu_update((req), (count), (success_count), DOMID_SELF)
171 #else
172 #define HYPERVISOR_mmu_update_self(req, count, success_count) \
173 HYPERVISOR_mmu_update((req), (count), (success_count))
174 #endif
175
176 void
177 xen_failsafe_handler(void)
178 {
179
180 panic("xen_failsafe_handler called!\n");
181 }
182
183
184 void
185 xen_set_ldt(vaddr_t base, uint32_t entries)
186 {
187 vaddr_t va;
188 vaddr_t end;
189 pt_entry_t *ptp;
190 int s;
191
192 #ifdef __x86_64__
193 end = base + (entries << 3);
194 #else
195 end = base + entries * sizeof(union descriptor);
196 #endif
197
198 #ifdef XEN3
199 xen_acquire_reader_ptom_lock();
200 #endif
201
202 for (va = base; va < end; va += PAGE_SIZE) {
203 KASSERT(va >= VM_MIN_KERNEL_ADDRESS);
204 ptp = kvtopte(va);
205 XENPRINTF(("xen_set_ldt %p %d %p\n", (void *)base,
206 entries, ptp));
207 pmap_pte_clearbits(ptp, PG_RW);
208 }
209 s = splvm();
210 xpq_queue_set_ldt(base, entries);
211 xpq_flush_queue();
212
213 #ifdef XEN3
214 xen_release_ptom_lock();
215 #endif
216
217 splx(s);
218 }
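
/*
 * Usage sketch (hypothetical caller): LDT-switching code is expected
 * to pass the kernel VA of the LDT and its number of entries, e.g.:
 *
 *	xen_set_ldt((vaddr_t)ldt_base, ldt_entry_count);
 *
 * ldt_base and ldt_entry_count are illustrative names; the function
 * above makes the backing pages read-only, which Xen requires before
 * it accepts the MMUEXT_SET_LDT operation.
 */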
219
220 #ifdef XENDEBUG
221 void xpq_debug_dump(void);
222 #endif
223
224 #define XPQUEUE_SIZE 2048
225 static mmu_update_t xpq_queue[XPQUEUE_SIZE];
226 static int xpq_idx = 0;
227
228 void
229 xpq_flush_queue(void)
230 {
231 int i, ok;
232
233 XENPRINTK2(("flush queue %p entries %d\n", xpq_queue, xpq_idx));
234 for (i = 0; i < xpq_idx; i++)
		XENPRINTK2(("%d: 0x%" PRIx64 " 0x%08" PRIx64 "\n", i,
		    (uint64_t)xpq_queue[i].ptr, (uint64_t)xpq_queue[i].val));
237 if (xpq_idx != 0 &&
238 HYPERVISOR_mmu_update_self(xpq_queue, xpq_idx, &ok) < 0) {
		printf("xpq_flush_queue: %d entries\n", xpq_idx);
240 for (i = 0; i < xpq_idx; i++)
241 printf("0x%016" PRIx64 ": 0x%016" PRIx64 "\n",
242 (uint64_t)xpq_queue[i].ptr,
243 (uint64_t)xpq_queue[i].val);
244 panic("HYPERVISOR_mmu_update failed\n");
245 }
246 xpq_idx = 0;
247 }
248
249 static inline void
250 xpq_increment_idx(void)
251 {
252
253 xpq_idx++;
254 if (__predict_false(xpq_idx == XPQUEUE_SIZE))
255 xpq_flush_queue();
256 }
257
258 void
259 xpq_queue_machphys_update(paddr_t ma, paddr_t pa)
260 {
261 XENPRINTK2(("xpq_queue_machphys_update ma=0x%" PRIx64 " pa=0x%" PRIx64
262 "\n", (int64_t)ma, (int64_t)pa));
263 xpq_queue[xpq_idx].ptr = ma | MMU_MACHPHYS_UPDATE;
264 xpq_queue[xpq_idx].val = (pa - XPMAP_OFFSET) >> PAGE_SHIFT;
265 xpq_increment_idx();
266 #ifdef XENDEBUG_SYNC
267 xpq_flush_queue();
268 #endif
269 }
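
/*
 * Illustrative note (assumption, based on the code above): when a
 * page's machine frame changes, both translation directions need
 * refreshing. The domain-local P->M array is written directly, while
 * Xen's M->P table goes through the update queue:
 *
 *	xpmap_phys_to_machine_mapping[(pa - XPMAP_OFFSET) >> PAGE_SHIFT] =
 *	    ma >> PAGE_SHIFT;
 *	xpq_queue_machphys_update(ma, pa);
 *	xpq_flush_queue();
 */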
270
271 void
272 xpq_queue_pte_update(paddr_t ptr, pt_entry_t val)
273 {
274
275 KASSERT((ptr & 3) == 0);
276 xpq_queue[xpq_idx].ptr = (paddr_t)ptr | MMU_NORMAL_PT_UPDATE;
277 xpq_queue[xpq_idx].val = val;
278 xpq_increment_idx();
279 #ifdef XENDEBUG_SYNC
280 xpq_flush_queue();
281 #endif
282 }
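
/*
 * Usage sketch (hypothetical caller): several updates can be queued
 * back to back and submitted in a single HYPERVISOR_mmu_update
 * hypercall, e.g.:
 *
 *	xpq_queue_pte_update(ptep_ma0, npte0);
 *	xpq_queue_pte_update(ptep_ma1, npte1);
 *	xpq_flush_queue();
 *
 * ptep_maN/npteN are illustrative names for PTE machine addresses and
 * new PTE values. The queue is also flushed automatically when it
 * fills up (see xpq_increment_idx() above).
 */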
283
284 #ifdef XEN3
285 void
286 xpq_queue_pt_switch(paddr_t pa)
287 {
288 struct mmuext_op op;
289 xpq_flush_queue();
290
291 XENPRINTK2(("xpq_queue_pt_switch: 0x%" PRIx64 " 0x%" PRIx64 "\n",
292 (int64_t)pa, (int64_t)pa));
293 op.cmd = MMUEXT_NEW_BASEPTR;
294 op.arg1.mfn = pa >> PAGE_SHIFT;
295 if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0)
296 panic("xpq_queue_pt_switch");
297 }
298
299 void
300 xpq_queue_pin_table(paddr_t pa)
301 {
302 struct mmuext_op op;
303 xpq_flush_queue();
304
305 XENPRINTK2(("xpq_queue_pin_table: 0x%" PRIx64 " 0x%" PRIx64 "\n",
306 (int64_t)pa, (int64_t)pa));
307 op.arg1.mfn = pa >> PAGE_SHIFT;
308
309 #if defined(__x86_64__)
310 op.cmd = MMUEXT_PIN_L4_TABLE;
311 #else
312 op.cmd = MMUEXT_PIN_L2_TABLE;
313 #endif
314 if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0)
315 panic("xpq_queue_pin_table");
316 }
317
318 #ifdef PAE
319 static void
320 xpq_queue_pin_l3_table(paddr_t pa)
321 {
322 struct mmuext_op op;
323 xpq_flush_queue();
324
	XENPRINTK2(("xpq_queue_pin_l3_table: 0x%" PRIx64 " 0x%" PRIx64 "\n",
	    (int64_t)pa, (int64_t)pa));
327 op.arg1.mfn = pa >> PAGE_SHIFT;
328
329 op.cmd = MMUEXT_PIN_L3_TABLE;
330 if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0)
		panic("xpq_queue_pin_l3_table");
332 }
333 #endif
334
335 void
336 xpq_queue_unpin_table(paddr_t pa)
337 {
338 struct mmuext_op op;
339 xpq_flush_queue();
340
341 XENPRINTK2(("xpq_queue_unpin_table: 0x%" PRIx64 " 0x%" PRIx64 "\n",
342 (int64_t)pa, (int64_t)pa));
343 op.arg1.mfn = pa >> PAGE_SHIFT;
344 op.cmd = MMUEXT_UNPIN_TABLE;
345 if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0)
346 panic("xpq_queue_unpin_table");
347 }
348
349 void
350 xpq_queue_set_ldt(vaddr_t va, uint32_t entries)
351 {
352 struct mmuext_op op;
353 xpq_flush_queue();
354
355 XENPRINTK2(("xpq_queue_set_ldt\n"));
356 KASSERT(va == (va & ~PAGE_MASK));
357 op.cmd = MMUEXT_SET_LDT;
358 op.arg1.linear_addr = va;
359 op.arg2.nr_ents = entries;
360 if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0)
361 panic("xpq_queue_set_ldt");
362 }
363
364 void
365 xpq_queue_tlb_flush(void)
366 {
367 struct mmuext_op op;
368 xpq_flush_queue();
369
370 XENPRINTK2(("xpq_queue_tlb_flush\n"));
371 op.cmd = MMUEXT_TLB_FLUSH_LOCAL;
372 if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0)
373 panic("xpq_queue_tlb_flush");
374 }
375
376 void
377 xpq_flush_cache(void)
378 {
379 struct mmuext_op op;
380 int s = splvm();
381 xpq_flush_queue();
382
383 XENPRINTK2(("xpq_queue_flush_cache\n"));
384 op.cmd = MMUEXT_FLUSH_CACHE;
385 if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0)
386 panic("xpq_flush_cache");
387 splx(s);
388 }
389
390 void
391 xpq_queue_invlpg(vaddr_t va)
392 {
393 struct mmuext_op op;
394 xpq_flush_queue();
395
396 XENPRINTK2(("xpq_queue_invlpg %p\n", (void *)va));
397 op.cmd = MMUEXT_INVLPG_LOCAL;
398 op.arg1.linear_addr = (va & ~PAGE_MASK);
399 if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0)
400 panic("xpq_queue_invlpg");
401 }
402
403 int
404 xpq_update_foreign(paddr_t ptr, pt_entry_t val, int dom)
405 {
406 mmu_update_t op;
407 int ok;
408 xpq_flush_queue();
409
410 op.ptr = ptr;
411 op.val = val;
412 if (HYPERVISOR_mmu_update(&op, 1, &ok, dom) < 0)
413 return EFAULT;
414 return (0);
415 }
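
/*
 * Usage sketch (hypothetical): privileged code mapping another
 * domain's page would pass the machine address of the PTE slot, the
 * new PTE value and the remote domain id, e.g.:
 *
 *	error = xpq_update_foreign(ptep_ma, npte, domid);
 *
 * ptep_ma, npte and domid are illustrative names; EFAULT is returned
 * on failure, as above.
 */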
416 #else /* XEN3 */
417 void
418 xpq_queue_pt_switch(paddr_t pa)
419 {
420
421 XENPRINTK2(("xpq_queue_pt_switch: %p %p\n", (void *)pa, (void *)pa));
422 xpq_queue[xpq_idx].ptr = pa | MMU_EXTENDED_COMMAND;
423 xpq_queue[xpq_idx].val = MMUEXT_NEW_BASEPTR;
424 xpq_increment_idx();
425 }
426
427 void
428 xpq_queue_pin_table(paddr_t pa)
429 {
430
431 XENPRINTK2(("xpq_queue_pin_table: %p %p\n", (void *)pa, (void *)pa));
432 xpq_queue[xpq_idx].ptr = pa | MMU_EXTENDED_COMMAND;
433 xpq_queue[xpq_idx].val = MMUEXT_PIN_L2_TABLE;
434 xpq_increment_idx();
435 }
436
437 void
438 xpq_queue_unpin_table(paddr_t pa)
439 {
440
441 XENPRINTK2(("xpq_queue_unpin_table: %p %p\n", (void *)pa, (void *)pa));
442 xpq_queue[xpq_idx].ptr = pa | MMU_EXTENDED_COMMAND;
443 xpq_queue[xpq_idx].val = MMUEXT_UNPIN_TABLE;
444 xpq_increment_idx();
445 }
446
447 void
448 xpq_queue_set_ldt(vaddr_t va, uint32_t entries)
449 {
450
451 XENPRINTK2(("xpq_queue_set_ldt\n"));
452 KASSERT(va == (va & ~PAGE_MASK));
453 xpq_queue[xpq_idx].ptr = MMU_EXTENDED_COMMAND | va;
454 xpq_queue[xpq_idx].val = MMUEXT_SET_LDT | (entries << MMUEXT_CMD_SHIFT);
455 xpq_increment_idx();
456 }
457
458 void
459 xpq_queue_tlb_flush(void)
460 {
461
462 XENPRINTK2(("xpq_queue_tlb_flush\n"));
463 xpq_queue[xpq_idx].ptr = MMU_EXTENDED_COMMAND;
464 xpq_queue[xpq_idx].val = MMUEXT_TLB_FLUSH;
465 xpq_increment_idx();
466 }
467
468 void
469 xpq_flush_cache(void)
470 {
471 int s = splvm();
472
473 XENPRINTK2(("xpq_queue_flush_cache\n"));
474 xpq_queue[xpq_idx].ptr = MMU_EXTENDED_COMMAND;
475 xpq_queue[xpq_idx].val = MMUEXT_FLUSH_CACHE;
476 xpq_increment_idx();
477 xpq_flush_queue();
478 splx(s);
479 }
480
481 void
482 xpq_queue_invlpg(vaddr_t va)
483 {
484
485 XENPRINTK2(("xpq_queue_invlpg %p\n", (void *)va));
486 xpq_queue[xpq_idx].ptr = (va & ~PAGE_MASK) | MMU_EXTENDED_COMMAND;
487 xpq_queue[xpq_idx].val = MMUEXT_INVLPG;
488 xpq_increment_idx();
489 }
490
491 int
492 xpq_update_foreign(paddr_t ptr, pt_entry_t val, int dom)
493 {
494 mmu_update_t xpq_up[3];
495
496 xpq_up[0].ptr = MMU_EXTENDED_COMMAND;
497 xpq_up[0].val = MMUEXT_SET_FOREIGNDOM | (dom << 16);
498 xpq_up[1].ptr = ptr;
499 xpq_up[1].val = val;
500 if (HYPERVISOR_mmu_update_self(xpq_up, 2, NULL) < 0)
501 return EFAULT;
502 return (0);
503 }
504 #endif /* XEN3 */
505
506 #ifdef XENDEBUG
507 void
508 xpq_debug_dump(void)
509 {
510 int i;
511
512 XENPRINTK2(("idx: %d\n", xpq_idx));
513 for (i = 0; i < xpq_idx; i++) {
514 sprintf(XBUF, "%" PRIx64 " %08" PRIx64,
515 (uint64_t)xpq_queue[i].ptr, (uint64_t)xpq_queue[i].val);
516 if (++i < xpq_idx)
517 sprintf(XBUF + strlen(XBUF), "%" PRIx64 " %08" PRIx64,
518 (uint64_t)xpq_queue[i].ptr, (uint64_t)xpq_queue[i].val);
519 if (++i < xpq_idx)
520 sprintf(XBUF + strlen(XBUF), "%" PRIx64 " %08" PRIx64,
521 (uint64_t)xpq_queue[i].ptr, (uint64_t)xpq_queue[i].val);
522 if (++i < xpq_idx)
523 sprintf(XBUF + strlen(XBUF), "%" PRIx64 " %08" PRIx64,
524 (uint64_t)xpq_queue[i].ptr, (uint64_t)xpq_queue[i].val);
525 XENPRINTK2(("%d: %s\n", xpq_idx, XBUF));
526 }
527 }
528 #endif
529
530
531 extern volatile struct xencons_interface *xencons_interface; /* XXX */
532 extern struct xenstore_domain_interface *xenstore_interface; /* XXX */
533
534 static void xen_bt_set_readonly (vaddr_t);
535 static void xen_bootstrap_tables (vaddr_t, vaddr_t, int, int, int);
536
/* How many PDEs? */
538 #if L2_SLOT_KERNBASE > 0
539 #define TABLE_L2_ENTRIES (2 * (NKL2_KIMG_ENTRIES + 1))
540 #else
541 #define TABLE_L2_ENTRIES (NKL2_KIMG_ENTRIES + 1)
542 #endif
543
/*
 * Construct and switch to new page tables.
 * The return value is the first vaddr we can use once the Xen
 * bootstrap tables have been reclaimed.
 */
549
550 vaddr_t xen_pmap_bootstrap (void);
551
552 /*
553 * Function to get rid of Xen bootstrap tables
554 */
555
/* How many PDPs do we need? */
#ifdef PAE
/*
 * For PAE, we consider a single contiguous L2 "superpage" of 4 pages,
 * all of them mapped by the L3 page. We also need a shadow page
 * for L3[3].
 */
563 static const int l2_4_count = 6;
564 #else
565 static const int l2_4_count = PTP_LEVELS - 1;
566 #endif
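
/*
 * For reference (assumption, derived from PTP_LEVELS): the non-PAE
 * value is 3 on amd64 (one page each for the L4, L3 and L2) and 1 on
 * i386 (the single PGD/L2 page).
 */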
567
568 vaddr_t
569 xen_pmap_bootstrap(void)
570 {
571 int count, oldcount;
572 long mapsize;
573 vaddr_t bootstrap_tables, init_tables;
574
575 xpmap_phys_to_machine_mapping =
576 (unsigned long *)xen_start_info.mfn_list;
577 init_tables = xen_start_info.pt_base;
	__PRINTK(("xen_pmap_bootstrap init_tables=0x%lx\n", init_tables));
579
	/* Space after the Xen bootstrap tables should be free */
581 bootstrap_tables = xen_start_info.pt_base +
582 (xen_start_info.nr_pt_frames * PAGE_SIZE);
583
	/*
	 * Calculate how much space we need:
	 * first, everything mapped before the Xen bootstrap tables.
	 */
588 mapsize = init_tables - KERNTEXTOFF;
589 /* after the tables we'll have:
590 * - UAREA
591 * - dummy user PGD (x86_64)
592 * - HYPERVISOR_shared_info
593 * - ISA I/O mem (if needed)
594 */
595 mapsize += UPAGES * NBPG;
596 #ifdef __x86_64__
597 mapsize += NBPG;
598 #endif
599 mapsize += NBPG;
600
601 #ifdef DOM0OPS
602 if (xendomain_is_dom0()) {
603 /* space for ISA I/O mem */
604 mapsize += IOM_SIZE;
605 }
606 #endif
	/* at this point mapsize doesn't include the table size */
608
609 #ifdef __x86_64__
610 count = TABLE_L2_ENTRIES;
611 #else
612 count = (mapsize + (NBPD_L2 -1)) >> L2_SHIFT;
613 #endif /* __x86_64__ */
614
615 /* now compute how many L2 pages we need exactly */
616 XENPRINTK(("bootstrap_final mapsize 0x%lx count %d\n", mapsize, count));
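	/*
	 * Grow count until the VA range covered by count L2 entries
	 * starting at KERNBASE is large enough for both the mapped area
	 * and the new table pages themselves.
	 */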
617 while (mapsize + (count + l2_4_count) * PAGE_SIZE + KERNTEXTOFF >
618 ((long)count << L2_SHIFT) + KERNBASE) {
619 count++;
620 }
621 #ifndef __x86_64__
	/*
	 * One more L2 page: we'll allocate several pages after kva_start
	 * in pmap_bootstrap() before pmap_growkernel(), which have not been
	 * counted here. It's not a big issue to allocate one more L2 as
	 * pmap_growkernel() will be called anyway.
	 */
628 count++;
629 nkptp[1] = count;
630 #endif
631
	/*
	 * Install the bootstrap page tables. They may need more L2 pages
	 * than the final tables do, as they are installed above (after)
	 * the final tables in virtual address space.
	 */
636 oldcount = count;
637
638 bootstrap_again:
639 XENPRINTK(("bootstrap_again oldcount %d\n", oldcount));
	/*
	 * The Xen space we'll reclaim may not be enough for our new page
	 * tables; move the bootstrap tables if necessary.
	 */
644 if (bootstrap_tables < init_tables + ((count + l2_4_count) * PAGE_SIZE))
645 bootstrap_tables = init_tables +
646 ((count + l2_4_count) * PAGE_SIZE);
647 /* make sure we have enough to map the bootstrap_tables */
648 if (bootstrap_tables + ((oldcount + l2_4_count) * PAGE_SIZE) >
649 ((long)oldcount << L2_SHIFT) + KERNBASE) {
650 oldcount++;
651 goto bootstrap_again;
652 }
653
654 /* Create temporary tables */
655 xen_bootstrap_tables(xen_start_info.pt_base, bootstrap_tables,
656 xen_start_info.nr_pt_frames, oldcount, 0);
657
658 /* Create final tables */
659 xen_bootstrap_tables(bootstrap_tables, init_tables,
660 oldcount + l2_4_count, count, 1);
661
662 /* zero out free space after tables */
663 memset((void *)(init_tables + ((count + l2_4_count) * PAGE_SIZE)), 0,
664 (UPAGES + 1) * NBPG);
665 return (init_tables + ((count + l2_4_count) * PAGE_SIZE));
666 }
667
668
/*
 * Build a new set of page tables and switch to them.
 * old_count is the # of old table pages (including PGD, PDTPE and PDE);
 * new_count is the # of new table pages (PTE only).
 * We assume the areas don't overlap.
 */
675
676
677 static void
678 xen_bootstrap_tables (vaddr_t old_pgd, vaddr_t new_pgd,
679 int old_count, int new_count, int final)
680 {
681 pd_entry_t *pdtpe, *pde, *pte;
682 pd_entry_t *cur_pgd, *bt_pgd;
683 paddr_t addr;
684 vaddr_t page, avail, text_end, map_end;
685 int i;
686 extern char __data_start;
687
688 __PRINTK(("xen_bootstrap_tables(0x%lx, 0x%lx, %d, %d)\n",
689 old_pgd, new_pgd, old_count, new_count));
690 text_end = ((vaddr_t)&__data_start) & ~PAGE_MASK;
691 /*
692 * size of R/W area after kernel text:
693 * xencons_interface (if present)
694 * xenstore_interface (if present)
695 * table pages (new_count + l2_4_count entries)
696 * extra mappings (only when final is true):
697 * UAREA
698 * dummy user PGD (x86_64 only)/gdt page (i386 only)
699 * HYPERVISOR_shared_info
700 * ISA I/O mem (if needed)
701 */
702 map_end = new_pgd + ((new_count + l2_4_count) * NBPG);
703 if (final) {
704 map_end += (UPAGES + 1) * NBPG;
705 HYPERVISOR_shared_info = (shared_info_t *)map_end;
706 map_end += NBPG;
707 }
708 /*
709 * we always set atdevbase, as it's used by init386 to find the first
710 * available VA. map_end is updated only if we are dom0, so
711 * atdevbase -> atdevbase + IOM_SIZE will be mapped only in
712 * this case.
713 */
714 if (final)
715 atdevbase = map_end;
716 #ifdef DOM0OPS
717 if (final && xendomain_is_dom0()) {
718 /* ISA I/O mem */
719 map_end += IOM_SIZE;
720 }
721 #endif /* DOM0OPS */
722
723 __PRINTK(("xen_bootstrap_tables text_end 0x%lx map_end 0x%lx\n",
724 text_end, map_end));
725 __PRINTK(("console 0x%lx ", xen_start_info.console.domU.mfn));
726 __PRINTK(("xenstore 0x%lx\n", xen_start_info.store_mfn));
727
728 /*
729 * Create bootstrap page tables
730 * What we need:
731 * - a PGD (level 4)
732 * - a PDTPE (level 3)
733 * - a PDE (level2)
734 * - some PTEs (level 1)
735 */
736
737 cur_pgd = (pd_entry_t *) old_pgd;
738 bt_pgd = (pd_entry_t *) new_pgd;
739 memset (bt_pgd, 0, PAGE_SIZE);
740 avail = new_pgd + PAGE_SIZE;
741 #if PTP_LEVELS > 3
742 /* Install level 3 */
743 pdtpe = (pd_entry_t *) avail;
744 memset (pdtpe, 0, PAGE_SIZE);
745 avail += PAGE_SIZE;
746
747 addr = ((u_long) pdtpe) - KERNBASE;
748 bt_pgd[pl4_pi(KERNTEXTOFF)] =
749 xpmap_ptom_masked(addr) | PG_k | PG_RW | PG_V;
750
751 __PRINTK(("L3 va 0x%lx pa 0x%" PRIx64 " entry 0x%" PRIx64 " -> L4[0x%x]\n",
752 pdtpe, (uint64_t)addr, (uint64_t)bt_pgd[pl4_pi(KERNTEXTOFF)],
753 pl4_pi(KERNTEXTOFF)));
754 #else
755 pdtpe = bt_pgd;
756 #endif /* PTP_LEVELS > 3 */
757
758 #if PTP_LEVELS > 2
759 /* Level 2 */
760 pde = (pd_entry_t *) avail;
761 memset(pde, 0, PAGE_SIZE);
762 avail += PAGE_SIZE;
763
764 addr = ((u_long) pde) - KERNBASE;
765 pdtpe[pl3_pi(KERNTEXTOFF)] =
766 xpmap_ptom_masked(addr) | PG_k | PG_V | PG_RW;
767 __PRINTK(("L2 va 0x%lx pa 0x%" PRIx64 " entry 0x%" PRIx64 " -> L3[0x%x]\n",
768 pde, (int64_t)addr, (int64_t)pdtpe[pl3_pi(KERNTEXTOFF)],
769 pl3_pi(KERNTEXTOFF)));
770 #elif defined(PAE)
	/* our PAE-style level 2: 5 contiguous pages (4 L2 + 1 shadow) */
772 pde = (pd_entry_t *) avail;
773 memset(pde, 0, PAGE_SIZE * 5);
774 avail += PAGE_SIZE * 5;
775 addr = ((u_long) pde) - KERNBASE;
	/*
	 * Enter the L2 pages into the L3.
	 * The real kernel L2 PD will be the last one (so that
	 * pde[L2_SLOT_KERN] always points to the shadow).
	 */
781 for (i = 0; i < 3; i++, addr += PAGE_SIZE) {
		/*
		 * Xen doesn't want R/W mappings in L3 entries; it will
		 * add them itself.
		 */
786 pdtpe[i] = xpmap_ptom_masked(addr) | PG_k | PG_V;
787 __PRINTK(("L2 va 0x%lx pa 0x%" PRIx64 " entry 0x%" PRIx64
788 " -> L3[0x%x]\n", (vaddr_t)pde + PAGE_SIZE * i,
789 (int64_t)addr, (int64_t)pdtpe[i], i));
790 }
791 addr += PAGE_SIZE;
792 pdtpe[3] = xpmap_ptom_masked(addr) | PG_k | PG_V;
793 __PRINTK(("L2 va 0x%lx pa 0x%" PRIx64 " entry 0x%" PRIx64
794 " -> L3[0x%x]\n", (vaddr_t)pde + PAGE_SIZE * 4,
795 (int64_t)addr, (int64_t)pdtpe[3], 3));
796
797 #else /* PAE */
798 pde = bt_pgd;
799 #endif /* PTP_LEVELS > 2 */
800
801 /* Level 1 */
802 page = KERNTEXTOFF;
803 for (i = 0; i < new_count; i ++) {
804 vaddr_t cur_page = page;
805
806 pte = (pd_entry_t *) avail;
807 avail += PAGE_SIZE;
808
809 memset(pte, 0, PAGE_SIZE);
810 while (pl2_pi(page) == pl2_pi (cur_page)) {
811 if (page >= map_end) {
812 /* not mapped at all */
813 pte[pl1_pi(page)] = 0;
814 page += PAGE_SIZE;
815 continue;
816 }
817 pte[pl1_pi(page)] = xpmap_ptom_masked(page - KERNBASE);
818 if (page == (vaddr_t)HYPERVISOR_shared_info) {
819 pte[pl1_pi(page)] = xen_start_info.shared_info;
820 __PRINTK(("HYPERVISOR_shared_info "
821 "va 0x%lx pte 0x%" PRIx64 "\n",
822 HYPERVISOR_shared_info, (int64_t)pte[pl1_pi(page)]));
823 }
824 #ifdef XEN3
825 if ((xpmap_ptom_masked(page - KERNBASE) >> PAGE_SHIFT)
826 == xen_start_info.console.domU.mfn) {
827 xencons_interface = (void *)page;
828 pte[pl1_pi(page)] = xen_start_info.console.domU.mfn;
829 pte[pl1_pi(page)] <<= PAGE_SHIFT;
830 __PRINTK(("xencons_interface "
831 "va 0x%lx pte 0x%" PRIx64 "\n",
832 xencons_interface, (int64_t)pte[pl1_pi(page)]));
833 }
834 if ((xpmap_ptom_masked(page - KERNBASE) >> PAGE_SHIFT)
835 == xen_start_info.store_mfn) {
836 xenstore_interface = (void *)page;
837 pte[pl1_pi(page)] = xen_start_info.store_mfn;
838 pte[pl1_pi(page)] <<= PAGE_SHIFT;
839 __PRINTK(("xenstore_interface "
840 "va 0x%lx pte 0x%" PRIx64 "\n",
841 xenstore_interface, (int64_t)pte[pl1_pi(page)]));
842 }
843 #endif /* XEN3 */
844 #ifdef DOM0OPS
845 if (page >= (vaddr_t)atdevbase &&
846 page < (vaddr_t)atdevbase + IOM_SIZE) {
847 pte[pl1_pi(page)] =
848 IOM_BEGIN + (page - (vaddr_t)atdevbase);
849 }
850 #endif
851 pte[pl1_pi(page)] |= PG_k | PG_V;
852 if (page < text_end) {
853 /* map kernel text RO */
854 pte[pl1_pi(page)] |= 0;
855 } else if (page >= old_pgd
856 && page < old_pgd + (old_count * PAGE_SIZE)) {
857 /* map old page tables RO */
858 pte[pl1_pi(page)] |= 0;
859 } else if (page >= new_pgd &&
860 page < new_pgd + ((new_count + l2_4_count) * PAGE_SIZE)) {
861 /* map new page tables RO */
862 pte[pl1_pi(page)] |= 0;
863 } else {
864 /* map page RW */
865 pte[pl1_pi(page)] |= PG_RW;
866 }
867
868 if ((page >= old_pgd && page < old_pgd + (old_count * PAGE_SIZE))
869 || page >= new_pgd) {
870 __PRINTK(("va 0x%lx pa 0x%lx "
871 "entry 0x%" PRIx64 " -> L1[0x%x]\n",
872 page, page - KERNBASE,
873 (int64_t)pte[pl1_pi(page)], pl1_pi(page)));
874 }
875 page += PAGE_SIZE;
876 }
877
878 addr = ((u_long) pte) - KERNBASE;
879 pde[pl2_pi(cur_page)] =
880 xpmap_ptom_masked(addr) | PG_k | PG_RW | PG_V;
881 __PRINTK(("L1 va 0x%lx pa 0x%" PRIx64 " entry 0x%" PRIx64
882 " -> L2[0x%x]\n", pte, (int64_t)addr,
883 (int64_t)pde[pl2_pi(cur_page)], pl2_pi(cur_page)));
884 /* Mark readonly */
885 xen_bt_set_readonly((vaddr_t) pte);
886 }
887
888 /* Install recursive page tables mapping */
889 #ifdef PAE
	/*
	 * We need a shadow page for the kernel's L2 page.
	 * The real kernel L2 PD will be the last one (so that
	 * pde[L2_SLOT_KERN] always points to the shadow).
	 */
895 memcpy(&pde[L2_SLOT_KERN + NPDPG], &pde[L2_SLOT_KERN], PAGE_SIZE);
896 pmap_kl2pd = &pde[L2_SLOT_KERN + NPDPG];
897 pmap_kl2paddr = (u_long)pmap_kl2pd - KERNBASE;
898
	/*
	 * We don't enter a recursive entry from the L3 PD. Instead,
	 * we enter the first 4 L2 pages, which include the kernel's L2
	 * shadow. But we have to enter the shadow after switching
	 * %cr3, or Xen will refcount some PTEs with the wrong type.
	 */
905 addr = (u_long)pde - KERNBASE;
906 for (i = 0; i < 3; i++, addr += PAGE_SIZE) {
907 pde[PDIR_SLOT_PTE + i] = xpmap_ptom_masked(addr) | PG_k | PG_V;
908 __PRINTK(("pde[%d] va 0x%lx pa 0x%lx entry 0x%" PRIx64 "\n",
909 (int)(PDIR_SLOT_PTE + i), pde + PAGE_SIZE * i, (long)addr,
910 (int64_t)pde[PDIR_SLOT_PTE + i]));
911 }
912 #if 0
913 addr += PAGE_SIZE; /* point to shadow L2 */
914 pde[PDIR_SLOT_PTE + 3] = xpmap_ptom_masked(addr) | PG_k | PG_V;
915 __PRINTK(("pde[%d] va 0x%lx pa 0x%lx entry 0x%" PRIx64 "\n",
916 (int)(PDIR_SLOT_PTE + 3), pde + PAGE_SIZE * 4, (long)addr,
917 (int64_t)pde[PDIR_SLOT_PTE + 3]));
918 #endif
	/* Mark tables RO, and pin the kernel's shadow as L2 */
920 addr = (u_long)pde - KERNBASE;
921 for (i = 0; i < 5; i++, addr += PAGE_SIZE) {
922 xen_bt_set_readonly(((vaddr_t)pde) + PAGE_SIZE * i);
923 if (i == 2 || i == 3)
924 continue;
925 #if 0
926 __PRINTK(("pin L2 %d addr 0x%" PRIx64 "\n", i, (int64_t)addr));
927 xpq_queue_pin_table(xpmap_ptom_masked(addr));
928 #endif
929 }
930 if (final) {
931 addr = (u_long)pde - KERNBASE + 3 * PAGE_SIZE;
932 __PRINTK(("pin L2 %d addr 0x%" PRIx64 "\n", 2, (int64_t)addr));
933 xpq_queue_pin_table(xpmap_ptom_masked(addr));
934 }
935 #if 0
936 addr = (u_long)pde - KERNBASE + 2 * PAGE_SIZE;
937 __PRINTK(("pin L2 %d addr 0x%" PRIx64 "\n", 2, (int64_t)addr));
938 xpq_queue_pin_table(xpmap_ptom_masked(addr));
939 #endif
940 #else /* PAE */
941 /* recursive entry in higher-level PD */
942 bt_pgd[PDIR_SLOT_PTE] =
943 xpmap_ptom_masked(new_pgd - KERNBASE) | PG_k | PG_V;
944 __PRINTK(("bt_pgd[PDIR_SLOT_PTE] va 0x%lx pa 0x%" PRIx64
945 " entry 0x%" PRIx64 "\n", new_pgd, (int64_t)new_pgd - KERNBASE,
946 (int64_t)bt_pgd[PDIR_SLOT_PTE]));
947 /* Mark tables RO */
948 xen_bt_set_readonly((vaddr_t) pde);
949 #endif
950 #if PTP_LEVELS > 2 || defined(PAE)
951 xen_bt_set_readonly((vaddr_t) pdtpe);
952 #endif
953 #if PTP_LEVELS > 3
954 xen_bt_set_readonly(new_pgd);
955 #endif
956 /* Pin the PGD */
	__PRINTK(("pin PGD\n"));
958 #ifdef PAE
959 xpq_queue_pin_l3_table(xpmap_ptom_masked(new_pgd - KERNBASE));
960 #else
961 xpq_queue_pin_table(xpmap_ptom_masked(new_pgd - KERNBASE));
962 #endif
963 #ifdef __i386__
964 /* Save phys. addr of PDP, for libkvm. */
965 PDPpaddr = (long)pde;
966 #ifdef PAE
967 /* also save the address of the L3 page */
968 pmap_l3pd = pdtpe;
969 pmap_l3paddr = (new_pgd - KERNBASE);
970 #endif /* PAE */
971 #endif /* i386 */
972 /* Switch to new tables */
	__PRINTK(("switch to PGD\n"));
974 xpq_queue_pt_switch(xpmap_ptom_masked(new_pgd - KERNBASE));
975 __PRINTK(("bt_pgd[PDIR_SLOT_PTE] now entry 0x%" PRIx64 "\n",
976 (int64_t)bt_pgd[PDIR_SLOT_PTE]));
977 #ifdef PAE
978 if (final) {
979 /* now enter kernel's PTE mappings */
980 addr = (u_long)pde - KERNBASE + PAGE_SIZE * 3;
981 xpq_queue_pte_update(
982 xpmap_ptom(((vaddr_t)&pde[PDIR_SLOT_PTE + 3]) - KERNBASE),
983 xpmap_ptom_masked(addr) | PG_k | PG_V);
984 xpq_flush_queue();
985 }
986 #endif
987
988
989
990 /* Now we can safely reclaim space taken by old tables */
991
	__PRINTK(("unpin old PGD\n"));
993 /* Unpin old PGD */
994 xpq_queue_unpin_table(xpmap_ptom_masked(old_pgd - KERNBASE));
995 /* Mark old tables RW */
996 page = old_pgd;
997 addr = (paddr_t) pde[pl2_pi(page)] & PG_FRAME;
998 addr = xpmap_mtop(addr);
999 pte = (pd_entry_t *) ((u_long)addr + KERNBASE);
1000 pte += pl1_pi(page);
1001 __PRINTK(("*pde 0x%" PRIx64 " addr 0x%" PRIx64 " pte 0x%lx\n",
1002 (int64_t)pde[pl2_pi(page)], (int64_t)addr, (long)pte));
1003 while (page < old_pgd + (old_count * PAGE_SIZE) && page < map_end) {
1004 addr = xpmap_ptom(((u_long) pte) - KERNBASE);
1005 XENPRINTK(("addr 0x%" PRIx64 " pte 0x%lx *pte 0x%" PRIx64 "\n",
1006 (int64_t)addr, (long)pte, (int64_t)*pte));
1007 xpq_queue_pte_update(addr, *pte | PG_RW);
1008 page += PAGE_SIZE;
1009 /*
1010 * Our ptes are contiguous
1011 * so it's safe to just "++" here
1012 */
1013 pte++;
1014 }
1015 xpq_flush_queue();
1016 }
1017
1018
1019 /*
1020 * Bootstrap helper functions
1021 */
1022
1023 /*
1024 * Mark a page readonly
1025 * XXX: assuming vaddr = paddr + KERNBASE
1026 */
1027
1028 static void
1029 xen_bt_set_readonly (vaddr_t page)
1030 {
1031 pt_entry_t entry;
1032
1033 xen_acquire_reader_ptom_lock();
1034
1035 entry = xpmap_ptom_masked(page - KERNBASE);
1036 entry |= PG_k | PG_V;
1037
1038 HYPERVISOR_update_va_mapping (page, entry, UVMF_INVLPG);
1039
1040 xen_release_ptom_lock();
1041 }
1042
1043 #ifdef __x86_64__
1044 void
1045 xen_set_user_pgd(paddr_t page)
1046 {
1047 struct mmuext_op op;
1048 int s = splvm();
1049
1050 xpq_flush_queue();
1051 op.cmd = MMUEXT_NEW_USER_BASEPTR;
1052
1053 xen_acquire_reader_ptom_lock();
1054
1055 op.arg1.mfn = pfn_to_mfn(page >> PAGE_SHIFT);
1056 if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0)
1057 panic("xen_set_user_pgd: failed to install new user page"
1058 " directory %lx", page);
1059
1060 xen_release_ptom_lock();
1061
1062 splx(s);
1063 }
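
/*
 * Usage sketch (hypothetical caller): since the amd64 kernel runs in
 * ring 3 under Xen, the context-switch path is expected to install
 * the new pmap's user L4 as well, e.g.:
 *
 *	xen_set_user_pgd(pmap_pdirpa(pmap, 0));
 *
 * pmap_pdirpa() is used here for illustration only.
 */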
1064 #endif /* __x86_64__ */
1065