/*	$NetBSD: x86_xpmap.c,v 1.70 2017/01/22 19:24:51 maxv Exp $	*/

/*
 * Copyright (c) 2006 Mathieu Ropert <mro (at) adviseo.fr>
 *
 * Permission to use, copy, modify, and distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 */

/*
 * Copyright (c) 2006, 2007 Manuel Bouyer.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Copyright (c) 2004 Christian Limpach.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: x86_xpmap.c,v 1.70 2017/01/22 19:24:51 maxv Exp $");

#include "opt_xen.h"
#include "opt_ddb.h"
#include "ksyms.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/mutex.h>
#include <sys/cpu.h>

#include <uvm/uvm.h>

#include <x86/pmap.h>
#include <machine/gdt.h>
#include <xen/xenfunc.h>

#include <dev/isa/isareg.h>
#include <machine/isa_machdep.h>

#undef XENDEBUG
/* #define XENDEBUG_SYNC */

#ifdef XENDEBUG
#define XENPRINTF(x) printf x
#define XENPRINTK2(x) /* printk x */
static char XBUF[256];
#else
#define XENPRINTF(x)
#define XENPRINTK2(x)
#endif

volatile shared_info_t *HYPERVISOR_shared_info;
/* Xen requires the start_info struct to be page aligned */
union start_info_union start_info_union __aligned(PAGE_SIZE);
unsigned long *xpmap_phys_to_machine_mapping;
kmutex_t pte_lock;
vaddr_t xen_dummy_page;

pt_entry_t xpmap_pg_nx;

void xen_failsafe_handler(void);

#define HYPERVISOR_mmu_update_self(req, count, success_count) \
	HYPERVISOR_mmu_update((req), (count), (success_count), DOMID_SELF)

extern volatile struct xencons_interface *xencons_interface; /* XXX */
extern struct xenstore_domain_interface *xenstore_interface; /* XXX */

static void xen_bt_set_readonly(vaddr_t);
static void xen_bootstrap_tables(vaddr_t, vaddr_t, size_t, size_t, bool);

vaddr_t xen_locore(void);

/*
 * kcpuset internally uses an array of uint32_t while xen uses an array of
 * u_long. As we're little-endian we can cast one to the other.
 */
typedef union {
#ifdef _LP64
	uint32_t xcpum_km[2];
#else
	uint32_t xcpum_km[1];
#endif
	u_long xcpum_xm;
} xcpumask_t;

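/*
 * xen_failsafe_handler: presumably registered elsewhere in the port as the
 * domain's failsafe callback. Nothing useful can be done here, so panic.
 */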
void
xen_failsafe_handler(void)
{

	panic("xen_failsafe_handler called!\n");
}

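/*
 * xen_set_ldt: make the pages backing the LDT read-only (Xen requires
 * descriptor-table pages not to be writable), then queue a set_ldt
 * operation with the hypervisor.
 */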
void
xen_set_ldt(vaddr_t base, uint32_t entries)
{
	vaddr_t va;
	vaddr_t end;
	pt_entry_t *ptp;
	int s;

#ifdef __x86_64__
	end = base + (entries << 3);
#else
	end = base + entries * sizeof(union descriptor);
#endif

	for (va = base; va < end; va += PAGE_SIZE) {
		KASSERT(va >= VM_MIN_KERNEL_ADDRESS);
		ptp = kvtopte(va);
		XENPRINTF(("xen_set_ldt %#" PRIxVADDR " %d %p\n",
		    base, entries, ptp));
		pmap_pte_clearbits(ptp, PG_RW);
	}
	s = splvm();
	xpq_queue_set_ldt(base, entries);
	splx(s);
}

#ifdef XENDEBUG
void xpq_debug_dump(void);
#endif

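/*
 * Per-CPU queue of pending mmu_update requests. Entries are batched here
 * and handed to the hypervisor in a single HYPERVISOR_mmu_update call by
 * xpq_flush_queue(), which keeps the number of hypercalls down.
 */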
#define XPQUEUE_SIZE 2048
static mmu_update_t xpq_queue_array[MAXCPUS][XPQUEUE_SIZE];
static int xpq_idx_array[MAXCPUS];

#ifdef i386
extern union descriptor tmpgdt[];
#endif

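/*
 * xpq_flush_queue: submit this CPU's pending mmu_update entries to the
 * hypervisor. On failure, retry the entries that were not applied; if no
 * progress can be made, dump the queues of all CPUs and panic.
 */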
void
xpq_flush_queue(void)
{
	int i, ok = 0, ret;

	mmu_update_t *xpq_queue = xpq_queue_array[curcpu()->ci_cpuid];
	int xpq_idx = xpq_idx_array[curcpu()->ci_cpuid];

	XENPRINTK2(("flush queue %p entries %d\n", xpq_queue, xpq_idx));
	for (i = 0; i < xpq_idx; i++)
		XENPRINTK2(("%d: 0x%08" PRIx64 " 0x%08" PRIx64 "\n", i,
		    xpq_queue[i].ptr, xpq_queue[i].val));

retry:
	ret = HYPERVISOR_mmu_update_self(xpq_queue, xpq_idx, &ok);

	if (xpq_idx != 0 && ret < 0) {
		struct cpu_info *ci;
		CPU_INFO_ITERATOR cii;

		printf("xpq_flush_queue: %d entries (%d successful) on "
		    "cpu%d (%ld)\n",
		    xpq_idx, ok, curcpu()->ci_index, curcpu()->ci_cpuid);

		if (ok != 0) {
			xpq_queue += ok;
			xpq_idx -= ok;
			ok = 0;
			goto retry;
		}

		for (CPU_INFO_FOREACH(cii, ci)) {
			xpq_queue = xpq_queue_array[ci->ci_cpuid];
			xpq_idx = xpq_idx_array[ci->ci_cpuid];
			printf("cpu%d (%ld):\n", ci->ci_index, ci->ci_cpuid);
			for (i = 0; i < xpq_idx; i++) {
				printf(" 0x%016" PRIx64 ": 0x%016" PRIx64 "\n",
				    xpq_queue[i].ptr, xpq_queue[i].val);
			}
#ifdef __x86_64__
			for (i = 0; i < PDIR_SLOT_PTE; i++) {
				if (ci->ci_kpm_pdir[i] == 0)
					continue;
				printf(" kpm_pdir[%d]: 0x%" PRIx64 "\n",
				    i, ci->ci_kpm_pdir[i]);
			}
#endif
		}
		panic("HYPERVISOR_mmu_update failed, ret: %d\n", ret);
	}
	xpq_idx_array[curcpu()->ci_cpuid] = 0;
}

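/*
 * xpq_increment_idx: bump this CPU's queue index, flushing the queue to
 * the hypervisor once it becomes full.
 */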
static inline void
xpq_increment_idx(void)
{

	if (__predict_false(++xpq_idx_array[curcpu()->ci_cpuid] == XPQUEUE_SIZE))
		xpq_flush_queue();
}

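/*
 * xpq_queue_machphys_update: queue an update of the machine-to-physical
 * table, mapping the machine address 'ma' to the physical address 'pa'.
 */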
void
xpq_queue_machphys_update(paddr_t ma, paddr_t pa)
{

	mmu_update_t *xpq_queue = xpq_queue_array[curcpu()->ci_cpuid];
	int xpq_idx = xpq_idx_array[curcpu()->ci_cpuid];

	XENPRINTK2(("xpq_queue_machphys_update ma=0x%" PRIx64 " pa=0x%" PRIx64
	    "\n", (int64_t)ma, (int64_t)pa));

	xpq_queue[xpq_idx].ptr = ma | MMU_MACHPHYS_UPDATE;
	xpq_queue[xpq_idx].val = pa >> PAGE_SHIFT;
	xpq_increment_idx();
#ifdef XENDEBUG_SYNC
	xpq_flush_queue();
#endif
}

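/*
 * xpq_queue_pte_update: queue an update of the PTE at machine address
 * 'ptr' to the value 'val'.
 */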
void
xpq_queue_pte_update(paddr_t ptr, pt_entry_t val)
{

	mmu_update_t *xpq_queue = xpq_queue_array[curcpu()->ci_cpuid];
	int xpq_idx = xpq_idx_array[curcpu()->ci_cpuid];

	KASSERT((ptr & 3) == 0);
	xpq_queue[xpq_idx].ptr = (paddr_t)ptr | MMU_NORMAL_PT_UPDATE;
	xpq_queue[xpq_idx].val = val;
	xpq_increment_idx();
#ifdef XENDEBUG_SYNC
	xpq_flush_queue();
#endif
}

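/*
 * xpq_queue_pt_switch: flush pending updates, then ask the hypervisor to
 * switch to the page table whose machine address is 'pa'.
 */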
void
xpq_queue_pt_switch(paddr_t pa)
{
	struct mmuext_op op;

	xpq_flush_queue();

	XENPRINTK2(("xpq_queue_pt_switch: 0x%" PRIx64 " 0x%" PRIx64 "\n",
	    (int64_t)pa, (int64_t)pa));
	op.cmd = MMUEXT_NEW_BASEPTR;
	op.arg1.mfn = pa >> PAGE_SHIFT;
	if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0)
		panic("xpq_queue_pt_switch");
}

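/*
 * xpq_queue_pin_table: flush pending updates, then pin the page table at
 * machine address 'pa'. 'lvl' is passed through as the MMUEXT_PIN_L<n>_TABLE
 * command for the desired level.
 */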
void
xpq_queue_pin_table(paddr_t pa, int lvl)
{
	struct mmuext_op op;

	xpq_flush_queue();

	XENPRINTK2(("xpq_queue_pin_l%d_table: %#" PRIxPADDR "\n",
	    lvl + 1, pa));

	op.arg1.mfn = pa >> PAGE_SHIFT;
	op.cmd = lvl;

	if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0)
		panic("xpq_queue_pin_table");
}

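/*
 * xpq_queue_unpin_table: flush pending updates, then unpin the page table
 * at machine address 'pa'.
 */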
void
xpq_queue_unpin_table(paddr_t pa)
{
	struct mmuext_op op;

	xpq_flush_queue();

	XENPRINTK2(("xpq_queue_unpin_table: %#" PRIxPADDR "\n", pa));
	op.arg1.mfn = pa >> PAGE_SHIFT;
	op.cmd = MMUEXT_UNPIN_TABLE;
	if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0)
		panic("xpq_queue_unpin_table");
}

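/*
 * xpq_queue_set_ldt: flush pending updates, then point the LDT at the
 * page-aligned virtual address 'va' with 'entries' descriptors.
 */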
void
xpq_queue_set_ldt(vaddr_t va, uint32_t entries)
{
	struct mmuext_op op;

	xpq_flush_queue();

	XENPRINTK2(("xpq_queue_set_ldt\n"));
	KASSERT(va == (va & ~PAGE_MASK));
	op.cmd = MMUEXT_SET_LDT;
	op.arg1.linear_addr = va;
	op.arg2.nr_ents = entries;
	if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0)
		panic("xpq_queue_set_ldt");
}

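/*
 * xpq_queue_tlb_flush: flush pending updates, then flush the TLB of the
 * local vCPU.
 */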
void
xpq_queue_tlb_flush(void)
{
	struct mmuext_op op;

	xpq_flush_queue();

	XENPRINTK2(("xpq_queue_tlb_flush\n"));
	op.cmd = MMUEXT_TLB_FLUSH_LOCAL;
	if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0)
		panic("xpq_queue_tlb_flush");
}

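/*
 * xpq_flush_cache: flush pending updates, then write back and invalidate
 * the local CPU's caches with wbinvd.
 */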
void
xpq_flush_cache(void)
{
	int s = splvm();

	xpq_flush_queue();

	XENPRINTK2(("xpq_queue_flush_cache\n"));
	asm("wbinvd":::"memory");
	splx(s); /* XXX: removeme */
}

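/*
 * xpq_queue_invlpg: flush pending updates, then invalidate the TLB entry
 * for 'va' on the local vCPU.
 */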
void
xpq_queue_invlpg(vaddr_t va)
{
	struct mmuext_op op;

	xpq_flush_queue();

	XENPRINTK2(("xpq_queue_invlpg %#" PRIxVADDR "\n", va));
	op.cmd = MMUEXT_INVLPG_LOCAL;
	op.arg1.linear_addr = (va & ~PAGE_MASK);
	if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0)
		panic("xpq_queue_invlpg");
}

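/*
 * xen_mcast_invlpg: invalidate the TLB entry for 'va' on the set of vCPUs
 * described by 'kc'.
 */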
void
xen_mcast_invlpg(vaddr_t va, kcpuset_t *kc)
{
	xcpumask_t xcpumask;
	mmuext_op_t op;

	kcpuset_export_u32(kc, &xcpumask.xcpum_km[0], sizeof(xcpumask));

	/* Flush pending page updates */
	xpq_flush_queue();

	op.cmd = MMUEXT_INVLPG_MULTI;
	op.arg1.linear_addr = va;
	op.arg2.vcpumask = &xcpumask.xcpum_xm;

	if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0) {
		panic("xen_mcast_invlpg");
	}

	return;
}

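/*
 * xen_bcast_invlpg: invalidate the TLB entry for 'va' on all vCPUs of the
 * domain.
 */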
void
xen_bcast_invlpg(vaddr_t va)
{
	mmuext_op_t op;

	/* Flush pending page updates */
	xpq_flush_queue();

	op.cmd = MMUEXT_INVLPG_ALL;
	op.arg1.linear_addr = va;

	if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0) {
		panic("xen_bcast_invlpg");
	}

	return;
}

/* This is a synchronous call. */
void
xen_mcast_tlbflush(kcpuset_t *kc)
{
	xcpumask_t xcpumask;
	mmuext_op_t op;

	kcpuset_export_u32(kc, &xcpumask.xcpum_km[0], sizeof(xcpumask));

	/* Flush pending page updates */
	xpq_flush_queue();

	op.cmd = MMUEXT_TLB_FLUSH_MULTI;
	op.arg2.vcpumask = &xcpumask.xcpum_xm;

	if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0) {
		panic("xen_mcast_tlbflush");
	}

	return;
}

/* This is a synchronous call. */
void
xen_bcast_tlbflush(void)
{
	mmuext_op_t op;

	/* Flush pending page updates */
	xpq_flush_queue();

	op.cmd = MMUEXT_TLB_FLUSH_ALL;

	if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0) {
		panic("xen_bcast_tlbflush");
	}

	return;
}

/* This is a synchronous call. */
void
xen_vcpu_mcast_invlpg(vaddr_t sva, vaddr_t eva, kcpuset_t *kc)
{
	KASSERT(eva > sva);

	/* Flush pending page updates */
	xpq_flush_queue();

	/* Align to nearest page boundary */
	sva &= ~PAGE_MASK;
	eva &= ~PAGE_MASK;

	for ( ; sva <= eva; sva += PAGE_SIZE) {
		xen_mcast_invlpg(sva, kc);
	}

	return;
}

/* This is a synchronous call. */
void
xen_vcpu_bcast_invlpg(vaddr_t sva, vaddr_t eva)
{
	KASSERT(eva > sva);

	/* Flush pending page updates */
	xpq_flush_queue();

	/* Align to nearest page boundary */
	sva &= ~PAGE_MASK;
	eva &= ~PAGE_MASK;

	for ( ; sva <= eva; sva += PAGE_SIZE) {
		xen_bcast_invlpg(sva);
	}

	return;
}

/* Copy a page */
void
xen_copy_page(paddr_t srcpa, paddr_t dstpa)
{
	mmuext_op_t op;

	op.cmd = MMUEXT_COPY_PAGE;
	op.arg1.mfn = xpmap_ptom(dstpa) >> PAGE_SHIFT;
	op.arg2.src_mfn = xpmap_ptom(srcpa) >> PAGE_SHIFT;

	if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0) {
		panic(__func__);
	}
}

/* Zero a physical page */
void
xen_pagezero(paddr_t pa)
{
	mmuext_op_t op;

	op.cmd = MMUEXT_CLEAR_PAGE;
	op.arg1.mfn = xpmap_ptom(pa) >> PAGE_SHIFT;

	if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0) {
		panic(__func__);
	}
}

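/*
 * xpq_update_foreign: synchronously update the PTE at machine address 'ptr'
 * on behalf of domain 'dom'. Returns EFAULT if the hypervisor rejects the
 * update.
 */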
int
xpq_update_foreign(paddr_t ptr, pt_entry_t val, int dom)
{
	mmu_update_t op;
	int ok;

	xpq_flush_queue();

	op.ptr = ptr;
	op.val = val;
	if (HYPERVISOR_mmu_update(&op, 1, &ok, dom) < 0)
		return EFAULT;
	return (0);
}

#ifdef XENDEBUG
void
xpq_debug_dump(void)
{
	int i;

	mmu_update_t *xpq_queue = xpq_queue_array[curcpu()->ci_cpuid];
	int xpq_idx = xpq_idx_array[curcpu()->ci_cpuid];

	XENPRINTK2(("idx: %d\n", xpq_idx));
	for (i = 0; i < xpq_idx; i++) {
		snprintf(XBUF, sizeof(XBUF), "%" PRIx64 " %08" PRIx64,
		    xpq_queue[i].ptr, xpq_queue[i].val);
		if (++i < xpq_idx)
			snprintf(XBUF + strlen(XBUF),
			    sizeof(XBUF) - strlen(XBUF),
			    "%" PRIx64 " %08" PRIx64,
			    xpq_queue[i].ptr, xpq_queue[i].val);
		if (++i < xpq_idx)
			snprintf(XBUF + strlen(XBUF),
			    sizeof(XBUF) - strlen(XBUF),
			    "%" PRIx64 " %08" PRIx64,
			    xpq_queue[i].ptr, xpq_queue[i].val);
		if (++i < xpq_idx)
			snprintf(XBUF + strlen(XBUF),
			    sizeof(XBUF) - strlen(XBUF),
			    "%" PRIx64 " %08" PRIx64,
			    xpq_queue[i].ptr, xpq_queue[i].val);
		XENPRINTK2(("%d: %s\n", xpq_idx, XBUF));
	}
}
#endif

#if L2_SLOT_KERNBASE > 0
#define TABLE_L2_ENTRIES (2 * (NKL2_KIMG_ENTRIES + 1))
#else
#define TABLE_L2_ENTRIES (NKL2_KIMG_ENTRIES + 1)
#endif

#ifdef PAE
/*
 * For PAE, we consider a single contiguous L2 "superpage" of 4 pages, all of
 * them mapped by the L3 page. We also need a shadow page for L3[3].
 */
static const int l2_4_count = 6;
#elif defined(__x86_64__)
static const int l2_4_count = PTP_LEVELS;
#else
static const int l2_4_count = PTP_LEVELS - 1;
#endif

/*
 * Xen locore: get rid of the Xen bootstrap tables. Build and switch to new page
 * tables.
 *
 * Virtual address space of the kernel when leaving this function:
 * +--------------+------------------+-------------+------------+---------------
 * | KERNEL IMAGE | BOOTSTRAP TABLES | PROC0 UAREA | DUMMY PAGE | HYPER. SHARED
 * +--------------+------------------+-------------+------------+---------------
 *
 * ------+-----------------+-------------+
 *  INFO | EARLY ZERO PAGE | ISA I/O MEM |
 * ------+-----------------+-------------+
 *
 * DUMMY PAGE is either a PGD for amd64 or a GDT for i386.
 *
 * (HYPER. SHARED INFO + EARLY ZERO PAGE + ISA I/O MEM) have no physical
 * addresses preallocated.
 */
vaddr_t
xen_locore(void)
{
	size_t count, oldcount, mapsize;
	vaddr_t bootstrap_tables, init_tables;
	u_int descs[4];

	xen_init_features();

	memset(xpq_idx_array, 0, sizeof(xpq_idx_array));

	xpmap_phys_to_machine_mapping =
	    (unsigned long *)xen_start_info.mfn_list;

	/* Set the NX/XD bit, if available. descs[3] = %edx. */
	x86_cpuid(0x80000001, descs);
	xpmap_pg_nx = (descs[3] & CPUID_NOX) ? PG_NX : 0;

	/* Space after the Xen bootstrap tables should be free */
	init_tables = xen_start_info.pt_base;
	bootstrap_tables = init_tables +
	    (xen_start_info.nr_pt_frames * PAGE_SIZE);

	/*
	 * Calculate how much space we need. First, everything mapped before
	 * the Xen bootstrap tables.
	 */
	mapsize = init_tables - KERNTEXTOFF;

	/*
	 * After the tables we'll have:
	 *  - UAREA
	 *  - dummy user PGD (x86_64)
	 *  - HYPERVISOR_shared_info
	 *  - early_zerop
	 *  - ISA I/O mem (if needed)
	 */
	mapsize += UPAGES * PAGE_SIZE;
#ifdef __x86_64__
	mapsize += PAGE_SIZE;
#endif
	mapsize += PAGE_SIZE;
	mapsize += PAGE_SIZE;
#ifdef DOM0OPS
	if (xendomain_is_dom0()) {
		mapsize += IOM_SIZE;
	}
#endif

	/*
	 * At this point, mapsize doesn't include the table size.
	 */
#ifdef __x86_64__
	count = TABLE_L2_ENTRIES;
#else
	count = (mapsize + (NBPD_L2 - 1)) >> L2_SHIFT;
#endif

	/*
	 * Now compute how many L2 pages we need exactly. This is useful only
	 * on i386, since the initial count for amd64 is already enough.
	 */
	while (KERNTEXTOFF + mapsize + (count + l2_4_count) * PAGE_SIZE >
	    KERNBASE + (count << L2_SHIFT)) {
		count++;
	}

#ifdef i386
	/*
	 * One more L2 page: we'll allocate several pages after kva_start
	 * in pmap_bootstrap() before pmap_growkernel(), which have not been
	 * counted here. It's not a big issue to allocate one more L2 as
	 * pmap_growkernel() will be called anyway.
	 */
	count++;
	nkptp[1] = count;
#endif

	/*
	 * Install bootstrap pages. We may need more L2 pages here than the
	 * final table will have, as it's installed after the final table.
	 */
	oldcount = count;

bootstrap_again:

	/*
	 * The Xen space we'll reclaim may not be enough for our new page
	 * tables; move the bootstrap tables if necessary.
	 */
	if (bootstrap_tables < init_tables + ((count + l2_4_count) * PAGE_SIZE))
		bootstrap_tables = init_tables +
		    ((count + l2_4_count) * PAGE_SIZE);

	/*
	 * Make sure the number of L2 pages we have is enough to map everything
	 * from KERNBASE to the bootstrap tables themselves.
	 */
	if (bootstrap_tables + ((oldcount + l2_4_count) * PAGE_SIZE) >
	    KERNBASE + (oldcount << L2_SHIFT)) {
		oldcount++;
		goto bootstrap_again;
	}

	/* Create temporary tables */
	xen_bootstrap_tables(init_tables, bootstrap_tables,
	    xen_start_info.nr_pt_frames, oldcount, false);

	/* Create final tables */
	xen_bootstrap_tables(bootstrap_tables, init_tables,
	    oldcount + l2_4_count, count, true);

	/* Zero out PROC0 UAREA and DUMMY PAGE. */
	memset((void *)(init_tables + ((count + l2_4_count) * PAGE_SIZE)), 0,
	    (UPAGES + 1) * PAGE_SIZE);

	/* Finally, flush TLB. */
	xpq_queue_tlb_flush();

	return (init_tables + ((count + l2_4_count) * PAGE_SIZE));
}

/*
 * Build a new set of page tables and switch to them.
 * old_count is # of old tables (including PGD, PDTPE and PDE).
 * new_count is # of new tables (PTE only).
 * We assume the areas don't overlap.
 */
static void
xen_bootstrap_tables(vaddr_t old_pgd, vaddr_t new_pgd, size_t old_count,
    size_t new_count, bool final)
{
	pd_entry_t *pdtpe, *pde, *pte;
	pd_entry_t *bt_pgd;
	paddr_t addr;
	vaddr_t page, avail, map_end;
	int i;
	extern char __rodata_start;
	extern char __data_start;
	extern char __kernel_end;
	extern char *early_zerop; /* from pmap.c */

	/*
	 * Layout of RW area after the kernel image:
	 *     xencons_interface (if present)
	 *     xenstore_interface (if present)
	 *     table pages (new_count + l2_4_count entries)
	 * Extra mappings (only when final is true):
	 *     UAREA
	 *     dummy user PGD (x86_64 only) / GDT page (i386 only)
	 *     HYPERVISOR_shared_info
	 *     early_zerop
	 *     ISA I/O mem (if needed)
	 */
	map_end = new_pgd + ((new_count + l2_4_count) * PAGE_SIZE);
	if (final) {
		map_end += UPAGES * PAGE_SIZE;
		xen_dummy_page = (vaddr_t)map_end;
		map_end += PAGE_SIZE;
		HYPERVISOR_shared_info = (shared_info_t *)map_end;
		map_end += PAGE_SIZE;
		early_zerop = (char *)map_end;
		map_end += PAGE_SIZE;
	}

	/*
	 * We always set atdevbase, as it's used by init386 to find the first
	 * available VA. map_end is updated only if we are dom0, so
	 * atdevbase -> atdevbase + IOM_SIZE will be mapped only in
	 * this case.
	 */
	if (final) {
		atdevbase = map_end;
#ifdef DOM0OPS
		if (xendomain_is_dom0()) {
			/* ISA I/O mem */
			map_end += IOM_SIZE;
		}
#endif
	}

	__PRINTK(("xen_bootstrap_tables map_end 0x%lx\n", map_end));
	__PRINTK(("console %#lx ", xen_start_info.console_mfn));
	__PRINTK(("xenstore %#" PRIx32 "\n", xen_start_info.store_mfn));

	/*
	 * Create bootstrap page tables. What we need:
	 * - a PGD (level 4)
	 * - a PDTPE (level 3)
	 * - a PDE (level 2)
	 * - some PTEs (level 1)
	 */

	bt_pgd = (pd_entry_t *)new_pgd;
	memset(bt_pgd, 0, PAGE_SIZE);
	avail = new_pgd + PAGE_SIZE;

#if PTP_LEVELS > 3
	/* Per-cpu L4 */
	pd_entry_t *bt_cpu_pgd = bt_pgd;
	/* pmap_kernel() "shadow" L4 */
	bt_pgd = (pd_entry_t *)avail;
	memset(bt_pgd, 0, PAGE_SIZE);
	avail += PAGE_SIZE;

	/* Install L3 */
	pdtpe = (pd_entry_t *)avail;
	memset(pdtpe, 0, PAGE_SIZE);
	avail += PAGE_SIZE;

	addr = ((u_long)pdtpe) - KERNBASE;
	bt_pgd[pl4_pi(KERNTEXTOFF)] = bt_cpu_pgd[pl4_pi(KERNTEXTOFF)] =
	    xpmap_ptom_masked(addr) | PG_k | PG_V | PG_RW;
#else
	pdtpe = bt_pgd;
#endif

#if PTP_LEVELS > 2
	/* Level 2 */
	pde = (pd_entry_t *)avail;
	memset(pde, 0, PAGE_SIZE);
	avail += PAGE_SIZE;

	addr = ((u_long)pde) - KERNBASE;
	pdtpe[pl3_pi(KERNTEXTOFF)] =
	    xpmap_ptom_masked(addr) | PG_k | PG_V | PG_RW;
#elif defined(PAE)
	/*
	 * Our PAE-style level 2, 5 contiguous pages (4 L2 + 1 shadow).
	 *                  +-----------------+----------------+---------+
	 * Physical layout: | 3 * USERLAND L2 | L2 KERN SHADOW | L2 KERN |
	 *                  +-----------------+----------------+---------+
	 * However, we enter pdtpe[3] into L2 KERN, and not L2 KERN SHADOW.
	 * This way, pde[L2_SLOT_KERN] always points to the shadow.
	 */
	pde = (pd_entry_t *)avail;
	memset(pde, 0, PAGE_SIZE * 5);
	avail += PAGE_SIZE * 5;

	/*
	 * Link L2 pages in L3, with a special case for L2 KERN. Xen doesn't
	 * want RW permissions in L3 entries, it'll add them itself.
	 */
	addr = ((u_long)pde) - KERNBASE;
	for (i = 0; i < 3; i++, addr += PAGE_SIZE) {
		pdtpe[i] = xpmap_ptom_masked(addr) | PG_k | PG_V;
	}
	addr += PAGE_SIZE;
	pdtpe[3] = xpmap_ptom_masked(addr) | PG_k | PG_V;
#else
	pde = bt_pgd;
#endif

	/* Level 1 */
	page = KERNTEXTOFF;
	for (i = 0; i < new_count; i++) {
		vaddr_t cur_page = page;

		pte = (pd_entry_t *)avail;
		avail += PAGE_SIZE;

		memset(pte, 0, PAGE_SIZE);
		while (pl2_pi(page) == pl2_pi(cur_page)) {
			if (page >= map_end) {
				/* not mapped at all */
				pte[pl1_pi(page)] = 0;
				page += PAGE_SIZE;
				continue;
			}
			pte[pl1_pi(page)] = xpmap_ptom_masked(page - KERNBASE);
			if (page == (vaddr_t)HYPERVISOR_shared_info) {
				pte[pl1_pi(page)] = xen_start_info.shared_info;
			}
			if ((xpmap_ptom_masked(page - KERNBASE) >> PAGE_SHIFT)
			    == xen_start_info.console.domU.mfn) {
				xencons_interface = (void *)page;
				pte[pl1_pi(page)] = xen_start_info.console_mfn;
				pte[pl1_pi(page)] <<= PAGE_SHIFT;
			}
			if ((xpmap_ptom_masked(page - KERNBASE) >> PAGE_SHIFT)
			    == xen_start_info.store_mfn) {
				xenstore_interface = (void *)page;
				pte[pl1_pi(page)] = xen_start_info.store_mfn;
				pte[pl1_pi(page)] <<= PAGE_SHIFT;
			}
#ifdef DOM0OPS
			if (page >= (vaddr_t)atdevbase &&
			    page < (vaddr_t)atdevbase + IOM_SIZE) {
				pte[pl1_pi(page)] =
				    IOM_BEGIN + (page - (vaddr_t)atdevbase);
				pte[pl1_pi(page)] |= xpmap_pg_nx;
			}
#endif

			pte[pl1_pi(page)] |= PG_k | PG_V;
			if (page < (vaddr_t)&__rodata_start) {
				/* Map the kernel text RX. */
				pte[pl1_pi(page)] |= PG_RO;
			} else if (page >= (vaddr_t)&__rodata_start &&
			    page < (vaddr_t)&__data_start) {
				/* Map the kernel rodata R. */
				pte[pl1_pi(page)] |= PG_RO | xpmap_pg_nx;
			} else if (page >= old_pgd &&
			    page < old_pgd + (old_count * PAGE_SIZE)) {
				/* Map the old page tables R. */
				pte[pl1_pi(page)] |= PG_RO | xpmap_pg_nx;
			} else if (page >= new_pgd &&
			    page < new_pgd + ((new_count + l2_4_count) * PAGE_SIZE)) {
				/* Map the new page tables R. */
				pte[pl1_pi(page)] |= PG_RO | xpmap_pg_nx;
#ifdef i386
			} else if (page == (vaddr_t)tmpgdt) {
				/*
				 * Map bootstrap gdt R/O. Later, we will re-add
				 * this page to uvm after making it writable.
				 */
				pte[pl1_pi(page)] = 0;
				page += PAGE_SIZE;
				continue;
#endif
			} else if (page >= (vaddr_t)&__data_start &&
			    page < (vaddr_t)&__kernel_end) {
				/* Map the kernel data+bss RW. */
				pte[pl1_pi(page)] |= PG_RW | xpmap_pg_nx;
			} else {
				/* Map the page RW. */
				pte[pl1_pi(page)] |= PG_RW | xpmap_pg_nx;
			}

			page += PAGE_SIZE;
		}

		addr = ((u_long)pte) - KERNBASE;
		pde[pl2_pi(cur_page)] =
		    xpmap_ptom_masked(addr) | PG_k | PG_RW | PG_V;

		/* Mark readonly */
		xen_bt_set_readonly((vaddr_t)pte);
	}

	/* Install recursive page tables mapping */
#ifdef PAE
	/* Copy L2 KERN into L2 KERN SHADOW, and reference the latter in cpu0. */
	memcpy(&pde[L2_SLOT_KERN + NPDPG], &pde[L2_SLOT_KERN], PAGE_SIZE);
	cpu_info_primary.ci_kpm_pdir = &pde[L2_SLOT_KERN + NPDPG];
	cpu_info_primary.ci_kpm_pdirpa =
	    (vaddr_t)cpu_info_primary.ci_kpm_pdir - KERNBASE;

	/*
	 * We don't enter a recursive entry from the L3 PD. Instead, we enter
	 * the first 4 L2 pages, which includes the kernel's L2 shadow. But we
	 * have to enter the shadow after switching %cr3, or Xen will refcount
	 * some PTEs with the wrong type.
	 */
	addr = (u_long)pde - KERNBASE;
	for (i = 0; i < 3; i++, addr += PAGE_SIZE) {
		pde[PDIR_SLOT_PTE + i] = xpmap_ptom_masked(addr) | PG_k | PG_V |
		    xpmap_pg_nx;
	}

	/* Mark tables RO, and pin L2 KERN SHADOW. */
	addr = (u_long)pde - KERNBASE;
	for (i = 0; i < 5; i++, addr += PAGE_SIZE) {
		xen_bt_set_readonly(((vaddr_t)pde) + PAGE_SIZE * i);
	}
	if (final) {
		addr = (u_long)pde - KERNBASE + 3 * PAGE_SIZE;
		xpq_queue_pin_l2_table(xpmap_ptom_masked(addr));
	}
#else /* PAE */

	/* Recursive entry in pmap_kernel(). */
	bt_pgd[PDIR_SLOT_PTE] = xpmap_ptom_masked((paddr_t)bt_pgd - KERNBASE)
	    | PG_k | PG_RO | PG_V | xpmap_pg_nx;
#ifdef __x86_64__
	/* Recursive entry in higher-level per-cpu PD. */
	bt_cpu_pgd[PDIR_SLOT_PTE] = xpmap_ptom_masked((paddr_t)bt_cpu_pgd - KERNBASE)
	    | PG_k | PG_RO | PG_V | xpmap_pg_nx;
#endif

	/* Mark tables RO */
	xen_bt_set_readonly((vaddr_t)pde);
#endif /* PAE */

#if PTP_LEVELS > 2 || defined(PAE)
	xen_bt_set_readonly((vaddr_t)pdtpe);
#endif
#if PTP_LEVELS > 3
	xen_bt_set_readonly(new_pgd);
#endif

	/* Pin the PGD */
#ifdef __x86_64__
	xpq_queue_pin_l4_table(xpmap_ptom_masked(new_pgd - KERNBASE));
#elif PAE
	xpq_queue_pin_l3_table(xpmap_ptom_masked(new_pgd - KERNBASE));
#else
	xpq_queue_pin_l2_table(xpmap_ptom_masked(new_pgd - KERNBASE));
#endif

	/* Save phys. addr of PDP, for libkvm. */
#ifdef PAE
	PDPpaddr = (u_long)pde - KERNBASE; /* PDP is the L2 with PAE */
#else
	PDPpaddr = (u_long)bt_pgd - KERNBASE;
#endif

	/* Switch to new tables */
	xpq_queue_pt_switch(xpmap_ptom_masked(new_pgd - KERNBASE));

#ifdef PAE
	if (final) {
		/* Save the address of the L3 page */
		cpu_info_primary.ci_pae_l3_pdir = pdtpe;
		cpu_info_primary.ci_pae_l3_pdirpa = (new_pgd - KERNBASE);

		/* Now enter the kernel's PTE mappings */
		addr = (u_long)pde - KERNBASE + PAGE_SIZE * 3;
		xpq_queue_pte_update(
		    xpmap_ptom(((vaddr_t)&pde[PDIR_SLOT_PTE + 3]) - KERNBASE),
		    xpmap_ptom_masked(addr) | PG_k | PG_V);
		xpq_flush_queue();
	}
#elif defined(__x86_64__)
	if (final) {
		/* Save the address of the real per-cpu L4 page. */
		cpu_info_primary.ci_kpm_pdir = bt_cpu_pgd;
		cpu_info_primary.ci_kpm_pdirpa = ((paddr_t)bt_cpu_pgd - KERNBASE);
	}
#endif
	__USE(pdtpe);

	/*
	 * Now we can safely reclaim the space taken by the old tables.
	 */

	/* Unpin old PGD */
	xpq_queue_unpin_table(xpmap_ptom_masked(old_pgd - KERNBASE));

	/* Mark old tables RW */
	page = old_pgd;
	addr = xpmap_mtop((paddr_t)pde[pl2_pi(page)] & PG_FRAME);
	pte = (pd_entry_t *)((u_long)addr + KERNBASE);
	pte += pl1_pi(page);
	while (page < old_pgd + (old_count * PAGE_SIZE) && page < map_end) {
		addr = xpmap_ptom(((u_long)pte) - KERNBASE);
		xpq_queue_pte_update(addr, *pte | PG_RW);
		page += PAGE_SIZE;
		/*
		 * Our PTEs are contiguous so it's safe to just "++" here.
		 */
		pte++;
	}
	xpq_flush_queue();
}

/*
 * Mark a page read-only, assuming vaddr = paddr + KERNBASE.
 */
static void
xen_bt_set_readonly(vaddr_t page)
{
	pt_entry_t entry;

	entry = xpmap_ptom_masked(page - KERNBASE);
	entry |= PG_k | PG_V | xpmap_pg_nx;

	HYPERVISOR_update_va_mapping(page, entry, UVMF_INVLPG);
}

#ifdef __x86_64__
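/*
 * xen_set_user_pgd: flush pending updates, then hand the page at physical
 * address 'page' (translated to an MFN) to the hypervisor as the new
 * user-mode page table base.
 */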
void
xen_set_user_pgd(paddr_t page)
{
	struct mmuext_op op;
	int s = splvm();

	xpq_flush_queue();
	op.cmd = MMUEXT_NEW_USER_BASEPTR;
	op.arg1.mfn = xpmap_ptom_masked(page) >> PAGE_SHIFT;
	if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0)
		panic("xen_set_user_pgd: failed to install new user page"
		    " directory %#" PRIxPADDR, page);
	splx(s);
}
#endif /* __x86_64__ */