/*	$NetBSD: libnvmm_x86.c,v 1.4.2.2 2018/11/26 01:52:13 pgoyette Exp $	*/

/*
 * Copyright (c) 2018 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Maxime Villard.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include <sys/cdefs.h>

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <fcntl.h>
#include <errno.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <machine/vmparam.h>
#include <machine/pte.h>
#include <machine/psl.h>

#include "nvmm.h"

#include <x86/specialreg.h>

/* -------------------------------------------------------------------------- */

#define PTE32_L1_SHIFT	12
#define PTE32_L2_SHIFT	22

#define PTE32_L2_MASK	0xffc00000
#define PTE32_L1_MASK	0x003ff000

#define PTE32_L2_FRAME	(PTE32_L2_MASK)
#define PTE32_L1_FRAME	(PTE32_L2_FRAME|PTE32_L1_MASK)

#define pte32_l1idx(va)	(((va) & PTE32_L1_MASK) >> PTE32_L1_SHIFT)
#define pte32_l2idx(va)	(((va) & PTE32_L2_MASK) >> PTE32_L2_SHIFT)

typedef uint32_t pte_32bit_t;

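/*
 * Translate a GVA into a GPA under plain 32bit (non-PAE) paging: a two-level
 * walk rooted at CR3, where an L2 entry may map a 4MB superpage if the guest
 * has PSE enabled. The accumulated protection of the mapping is returned in
 * 'prot'; any fault in the walk returns -1.
 */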
static int
x86_gva_to_gpa_32bit(struct nvmm_machine *mach, uint64_t cr3,
    gvaddr_t gva, gpaddr_t *gpa, bool has_pse, nvmm_prot_t *prot)
{
	gpaddr_t L2gpa, L1gpa;
	uintptr_t L2hva, L1hva;
	pte_32bit_t *pdir, pte;

	/* We begin with an RWXU access. */
	*prot = NVMM_PROT_ALL;

	/* Parse L2. */
	L2gpa = (cr3 & PG_FRAME);
	if (nvmm_gpa_to_hva(mach, L2gpa, &L2hva) == -1)
		return -1;
	pdir = (pte_32bit_t *)L2hva;
	pte = pdir[pte32_l2idx(gva)];
	if ((pte & PG_V) == 0)
		return -1;
	if ((pte & PG_u) == 0)
		*prot &= ~NVMM_PROT_USER;
	if ((pte & PG_KW) == 0)
		*prot &= ~NVMM_PROT_WRITE;
	if ((pte & PG_PS) && !has_pse)
		return -1;
	if (pte & PG_PS) {
		*gpa = (pte & PTE32_L2_FRAME);
		return 0;
	}

	/* Parse L1. */
	L1gpa = (pte & PG_FRAME);
	if (nvmm_gpa_to_hva(mach, L1gpa, &L1hva) == -1)
		return -1;
	pdir = (pte_32bit_t *)L1hva;
	pte = pdir[pte32_l1idx(gva)];
	if ((pte & PG_V) == 0)
		return -1;
	if ((pte & PG_u) == 0)
		*prot &= ~NVMM_PROT_USER;
	if ((pte & PG_KW) == 0)
		*prot &= ~NVMM_PROT_WRITE;
	if (pte & PG_PS)
		return -1;

	*gpa = (pte & PG_FRAME);
	return 0;
}

/* -------------------------------------------------------------------------- */

#define PTE32_PAE_L1_SHIFT	12
#define PTE32_PAE_L2_SHIFT	21
#define PTE32_PAE_L3_SHIFT	30

#define PTE32_PAE_L3_MASK	0xc0000000
#define PTE32_PAE_L2_MASK	0x3fe00000
#define PTE32_PAE_L1_MASK	0x001ff000

#define PTE32_PAE_L3_FRAME	(PTE32_PAE_L3_MASK)
#define PTE32_PAE_L2_FRAME	(PTE32_PAE_L3_FRAME|PTE32_PAE_L2_MASK)
#define PTE32_PAE_L1_FRAME	(PTE32_PAE_L2_FRAME|PTE32_PAE_L1_MASK)

#define pte32_pae_l1idx(va)	(((va) & PTE32_PAE_L1_MASK) >> PTE32_PAE_L1_SHIFT)
#define pte32_pae_l2idx(va)	(((va) & PTE32_PAE_L2_MASK) >> PTE32_PAE_L2_SHIFT)
#define pte32_pae_l3idx(va)	(((va) & PTE32_PAE_L3_MASK) >> PTE32_PAE_L3_SHIFT)

typedef uint64_t pte_32bit_pae_t;

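/*
 * Translate a GVA into a GPA under 32bit PAE paging: a three-level walk with
 * 64bit PTEs, honoring the NX bit and 2MB superpages at L2. As above, the
 * resulting protection is accumulated in 'prot'.
 */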
static int
x86_gva_to_gpa_32bit_pae(struct nvmm_machine *mach, uint64_t cr3,
    gvaddr_t gva, gpaddr_t *gpa, bool has_pse, nvmm_prot_t *prot)
{
	gpaddr_t L3gpa, L2gpa, L1gpa;
	uintptr_t L3hva, L2hva, L1hva;
	pte_32bit_pae_t *pdir, pte;

	/* We begin with an RWXU access. */
	*prot = NVMM_PROT_ALL;

	/* Parse L3. */
	L3gpa = (cr3 & PG_FRAME);
	if (nvmm_gpa_to_hva(mach, L3gpa, &L3hva) == -1)
		return -1;
	pdir = (pte_32bit_pae_t *)L3hva;
	pte = pdir[pte32_pae_l3idx(gva)];
	if ((pte & PG_V) == 0)
		return -1;
	if (pte & PG_NX)
		*prot &= ~NVMM_PROT_EXEC;
	if (pte & PG_PS)
		return -1;

	/* Parse L2. */
	L2gpa = (pte & PG_FRAME);
	if (nvmm_gpa_to_hva(mach, L2gpa, &L2hva) == -1)
		return -1;
	pdir = (pte_32bit_pae_t *)L2hva;
	pte = pdir[pte32_pae_l2idx(gva)];
	if ((pte & PG_V) == 0)
		return -1;
	if ((pte & PG_u) == 0)
		*prot &= ~NVMM_PROT_USER;
	if ((pte & PG_KW) == 0)
		*prot &= ~NVMM_PROT_WRITE;
	if (pte & PG_NX)
		*prot &= ~NVMM_PROT_EXEC;
	if ((pte & PG_PS) && !has_pse)
		return -1;
	if (pte & PG_PS) {
		*gpa = (pte & PTE32_PAE_L2_FRAME);
		return 0;
	}

	/* Parse L1. */
	L1gpa = (pte & PG_FRAME);
	if (nvmm_gpa_to_hva(mach, L1gpa, &L1hva) == -1)
		return -1;
	pdir = (pte_32bit_pae_t *)L1hva;
	pte = pdir[pte32_pae_l1idx(gva)];
	if ((pte & PG_V) == 0)
		return -1;
	if ((pte & PG_u) == 0)
		*prot &= ~NVMM_PROT_USER;
	if ((pte & PG_KW) == 0)
		*prot &= ~NVMM_PROT_WRITE;
	if (pte & PG_NX)
		*prot &= ~NVMM_PROT_EXEC;
	if (pte & PG_PS)
		return -1;

	*gpa = (pte & PG_FRAME);
	return 0;
}

/* -------------------------------------------------------------------------- */

#define PTE64_L1_SHIFT	12
#define PTE64_L2_SHIFT	21
#define PTE64_L3_SHIFT	30
#define PTE64_L4_SHIFT	39

#define PTE64_L4_MASK	0x0000ff8000000000
#define PTE64_L3_MASK	0x0000007fc0000000
#define PTE64_L2_MASK	0x000000003fe00000
#define PTE64_L1_MASK	0x00000000001ff000

#define PTE64_L4_FRAME	PTE64_L4_MASK
#define PTE64_L3_FRAME	(PTE64_L4_FRAME|PTE64_L3_MASK)
#define PTE64_L2_FRAME	(PTE64_L3_FRAME|PTE64_L2_MASK)
#define PTE64_L1_FRAME	(PTE64_L2_FRAME|PTE64_L1_MASK)

#define pte64_l1idx(va)	(((va) & PTE64_L1_MASK) >> PTE64_L1_SHIFT)
#define pte64_l2idx(va)	(((va) & PTE64_L2_MASK) >> PTE64_L2_SHIFT)
#define pte64_l3idx(va)	(((va) & PTE64_L3_MASK) >> PTE64_L3_SHIFT)
#define pte64_l4idx(va)	(((va) & PTE64_L4_MASK) >> PTE64_L4_SHIFT)

typedef uint64_t pte_64bit_t;

static inline bool
x86_gva_64bit_canonical(gvaddr_t gva)
{
	/* Bits 63:47 must have the same value. */
#define SIGN_EXTEND	0xffff800000000000ULL
	return (gva & SIGN_EXTEND) == 0 || (gva & SIGN_EXTEND) == SIGN_EXTEND;
}

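/*
 * Translate a GVA into a GPA under 64bit (long mode) paging: the GVA must be
 * canonical, and the four-level walk honors NX as well as 1GB (L3) and 2MB
 * (L2) superpages.
 */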
static int
x86_gva_to_gpa_64bit(struct nvmm_machine *mach, uint64_t cr3,
    gvaddr_t gva, gpaddr_t *gpa, bool has_pse, nvmm_prot_t *prot)
{
	gpaddr_t L4gpa, L3gpa, L2gpa, L1gpa;
	uintptr_t L4hva, L3hva, L2hva, L1hva;
	pte_64bit_t *pdir, pte;

	/* We begin with an RWXU access. */
	*prot = NVMM_PROT_ALL;

	if (!x86_gva_64bit_canonical(gva))
		return -1;

	/* Parse L4. */
	L4gpa = (cr3 & PG_FRAME);
	if (nvmm_gpa_to_hva(mach, L4gpa, &L4hva) == -1)
		return -1;
	pdir = (pte_64bit_t *)L4hva;
	pte = pdir[pte64_l4idx(gva)];
	if ((pte & PG_V) == 0)
		return -1;
	if ((pte & PG_u) == 0)
		*prot &= ~NVMM_PROT_USER;
	if ((pte & PG_KW) == 0)
		*prot &= ~NVMM_PROT_WRITE;
	if (pte & PG_NX)
		*prot &= ~NVMM_PROT_EXEC;
	if (pte & PG_PS)
		return -1;

	/* Parse L3. */
	L3gpa = (pte & PG_FRAME);
	if (nvmm_gpa_to_hva(mach, L3gpa, &L3hva) == -1)
		return -1;
	pdir = (pte_64bit_t *)L3hva;
	pte = pdir[pte64_l3idx(gva)];
	if ((pte & PG_V) == 0)
		return -1;
	if ((pte & PG_u) == 0)
		*prot &= ~NVMM_PROT_USER;
	if ((pte & PG_KW) == 0)
		*prot &= ~NVMM_PROT_WRITE;
	if (pte & PG_NX)
		*prot &= ~NVMM_PROT_EXEC;
	if ((pte & PG_PS) && !has_pse)
		return -1;
	if (pte & PG_PS) {
		*gpa = (pte & PTE64_L3_FRAME);
		return 0;
	}

	/* Parse L2. */
	L2gpa = (pte & PG_FRAME);
	if (nvmm_gpa_to_hva(mach, L2gpa, &L2hva) == -1)
		return -1;
	pdir = (pte_64bit_t *)L2hva;
	pte = pdir[pte64_l2idx(gva)];
	if ((pte & PG_V) == 0)
		return -1;
	if ((pte & PG_u) == 0)
		*prot &= ~NVMM_PROT_USER;
	if ((pte & PG_KW) == 0)
		*prot &= ~NVMM_PROT_WRITE;
	if (pte & PG_NX)
		*prot &= ~NVMM_PROT_EXEC;
	if ((pte & PG_PS) && !has_pse)
		return -1;
	if (pte & PG_PS) {
		*gpa = (pte & PTE64_L2_FRAME);
		return 0;
	}

	/* Parse L1. */
	L1gpa = (pte & PG_FRAME);
	if (nvmm_gpa_to_hva(mach, L1gpa, &L1hva) == -1)
		return -1;
	pdir = (pte_64bit_t *)L1hva;
	pte = pdir[pte64_l1idx(gva)];
	if ((pte & PG_V) == 0)
		return -1;
	if ((pte & PG_u) == 0)
		*prot &= ~NVMM_PROT_USER;
	if ((pte & PG_KW) == 0)
		*prot &= ~NVMM_PROT_WRITE;
	if (pte & PG_NX)
		*prot &= ~NVMM_PROT_EXEC;
	if (pte & PG_PS)
		return -1;

	*gpa = (pte & PG_FRAME);
	return 0;
}

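/*
 * Dispatch to the paging mode currently in use by the guest, determined from
 * CR0.PG, CR4.PAE, CR4.PSE and EFER.LME. With paging disabled the GVA is the
 * GPA. On failure errno is set to EFAULT.
 */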
static inline int
x86_gva_to_gpa(struct nvmm_machine *mach, struct nvmm_x64_state *state,
    gvaddr_t gva, gpaddr_t *gpa, nvmm_prot_t *prot)
{
	bool is_pae, is_lng, has_pse;
	uint64_t cr3;
	int ret;

	if ((state->crs[NVMM_X64_CR_CR0] & CR0_PG) == 0) {
		/* No paging. */
		*prot = NVMM_PROT_ALL;
		*gpa = gva;
		return 0;
	}

	is_pae = (state->crs[NVMM_X64_CR_CR4] & CR4_PAE) != 0;
	is_lng = (state->msrs[NVMM_X64_MSR_EFER] & EFER_LME) != 0;
	has_pse = (state->crs[NVMM_X64_CR_CR4] & CR4_PSE) != 0;
	cr3 = state->crs[NVMM_X64_CR_CR3];

	if (is_pae && is_lng) {
		/* 64bit */
		ret = x86_gva_to_gpa_64bit(mach, cr3, gva, gpa, has_pse, prot);
	} else if (is_pae && !is_lng) {
		/* 32bit PAE */
		ret = x86_gva_to_gpa_32bit_pae(mach, cr3, gva, gpa, has_pse,
		    prot);
	} else if (!is_pae && !is_lng) {
		/* 32bit */
		ret = x86_gva_to_gpa_32bit(mach, cr3, gva, gpa, has_pse, prot);
	} else {
		ret = -1;
	}

	if (ret == -1) {
		errno = EFAULT;
	}

	return ret;
}

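/*
 * Public translation entry point. Fetches the control registers and MSRs of
 * the VCPU and walks the guest page tables for the given page-aligned GVA.
 */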
int
nvmm_gva_to_gpa(struct nvmm_machine *mach, nvmm_cpuid_t cpuid,
    gvaddr_t gva, gpaddr_t *gpa, nvmm_prot_t *prot)
{
	struct nvmm_x64_state state;
	int ret;

	if (gva & PAGE_MASK) {
		errno = EINVAL;
		return -1;
	}

	ret = nvmm_vcpu_getstate(mach, cpuid, &state,
	    NVMM_X64_STATE_CRS | NVMM_X64_STATE_MSRS);
	if (ret == -1)
		return -1;

	return x86_gva_to_gpa(mach, &state, gva, gpa, prot);
}

/* -------------------------------------------------------------------------- */

static inline bool
is_long_mode(struct nvmm_x64_state *state)
{
	return (state->msrs[NVMM_X64_MSR_EFER] & EFER_LME) != 0;
}

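/*
 * An IN string operation stores into guest memory, so the target page must be
 * writable for the access to be legal.
 */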
static inline bool
is_illegal(struct nvmm_io *io, nvmm_prot_t prot)
{
	return (io->in && !(prot & NVMM_PROT_WRITE));
}

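/*
 * Apply the segment base to the GVA and perform a (simplified) limit check,
 * for guests not running in long mode.
 */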
static int
segment_apply(struct nvmm_x64_state_seg *seg, gvaddr_t *gva, size_t size)
{
	uint64_t limit;

	/*
	 * This is incomplete. We should check topdown, etc, really that's
	 * tiring.
	 */
	if (__predict_false(!seg->attrib.p)) {
		goto error;
	}

	limit = (seg->limit + 1);
	if (__predict_true(seg->attrib.gran)) {
		limit *= PAGE_SIZE;
	}

	if (__predict_false(*gva + seg->base + size > limit)) {
		goto error;
	}

	*gva += seg->base;
	return 0;

error:
	errno = EFAULT;
	return -1;
}

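/*
 * Emulate the I/O access that caused the VM exit: fetch the operand (from RAX
 * or, for string instructions, from the guest memory pointed to by RSI),
 * invoke the host callback 'cb' to perform the actual port access, write the
 * result back, and update RIP/RCX/RSI according to the REP and direction
 * flags. Buffers that straddle a page boundary are handled by translating
 * both pages.
 *
 * Illustrative use (names other than nvmm_assist_io are the caller's own):
 * after a VCPU run returns an exit with reason NVMM_EXIT_IO, call
 * nvmm_assist_io(mach, cpuid, &exit, my_io_callback), where my_io_callback
 * reads or fills io->data for io->port.
 */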
int
nvmm_assist_io(struct nvmm_machine *mach, nvmm_cpuid_t cpuid,
    struct nvmm_exit *exit, void (*cb)(struct nvmm_io *))
{
	struct nvmm_x64_state state;
	struct nvmm_io io;
	nvmm_prot_t prot;
	size_t remain, done;
	uintptr_t hva;
	gvaddr_t gva, off;
	gpaddr_t gpa;
	uint64_t rsi;
	uint8_t tmp[8];
	uint8_t *ptr, *ptr2;
	bool cross;
	int ret;

	if (__predict_false(exit->reason != NVMM_EXIT_IO)) {
		errno = EINVAL;
		return -1;
	}

	io.port = exit->u.io.port;
	io.in = (exit->u.io.type == NVMM_EXIT_IO_IN);
	io.size = exit->u.io.operand_size;

	ret = nvmm_vcpu_getstate(mach, cpuid, &state,
	    NVMM_X64_STATE_GPRS | NVMM_X64_STATE_SEGS |
	    NVMM_X64_STATE_CRS | NVMM_X64_STATE_MSRS);
	if (ret == -1)
		return -1;

	cross = false;

	if (!exit->u.io.str) {
		ptr = (uint8_t *)&state.gprs[NVMM_X64_GPR_RAX];
	} else {
		rsi = state.gprs[NVMM_X64_GPR_RSI];

		switch (exit->u.io.address_size) {
		case 8:
			gva = rsi;
			break;
		case 4:
			gva = (rsi & 0x00000000FFFFFFFF);
			break;
		case 2:
		default: /* impossible */
			gva = (rsi & 0x000000000000FFFF);
			break;
		}

		if (!is_long_mode(&state)) {
			ret = segment_apply(&state.segs[exit->u.io.seg], &gva,
			    io.size);
			if (ret == -1)
				return -1;
		}

		off = (gva & PAGE_MASK);
		gva &= ~PAGE_MASK;

		ret = x86_gva_to_gpa(mach, &state, gva, &gpa, &prot);
		if (ret == -1)
			return -1;
		if (__predict_false(is_illegal(&io, prot))) {
			errno = EFAULT;
			return -1;
		}
		ret = nvmm_gpa_to_hva(mach, gpa, &hva);
		if (ret == -1)
			return -1;

		ptr = (uint8_t *)hva + off;

		/*
		 * Special case. If the buffer is in between two pages, we
		 * need to retrieve data from the next page.
		 */
		if (__predict_false(off + io.size > PAGE_SIZE)) {
			cross = true;
			remain = off + io.size - PAGE_SIZE;
			done = PAGE_SIZE - off;

			memcpy(tmp, ptr, done);

			ret = x86_gva_to_gpa(mach, &state, gva + PAGE_SIZE,
			    &gpa, &prot);
			if (ret == -1)
				return -1;
			if (__predict_false(is_illegal(&io, prot))) {
				errno = EFAULT;
				return -1;
			}
			ret = nvmm_gpa_to_hva(mach, gpa, &hva);
			if (ret == -1)
				return -1;

			memcpy(&tmp[done], (uint8_t *)hva, remain);
			/*
			 * Point ptr2 at the second page itself, so that an IN
			 * can be written back to guest memory.
			 */
			ptr2 = (uint8_t *)hva;
		}
	}

	if (io.in) {
		/* nothing to do */
	} else {
		if (__predict_false(cross)) {
			/* Use the data reassembled from both pages. */
			memcpy(io.data, tmp, io.size);
		} else {
			memcpy(io.data, ptr, io.size);
		}
	}

	(*cb)(&io);

	if (io.in) {
		if (!exit->u.io.str)
			state.gprs[NVMM_X64_GPR_RAX] = 0;
		if (__predict_false(cross)) {
			memcpy(ptr, io.data, done);
			memcpy(ptr2, &io.data[done], remain);
		} else {
			memcpy(ptr, io.data, io.size);
		}
	} else {
		/* nothing to do */
	}

	if (exit->u.io.rep) {
		state.gprs[NVMM_X64_GPR_RCX] -= 1;
		if (state.gprs[NVMM_X64_GPR_RCX] == 0) {
			state.gprs[NVMM_X64_GPR_RIP] = exit->u.io.npc;
		}
		if (exit->u.io.str) {
			if (state.gprs[NVMM_X64_GPR_RFLAGS] & PSL_D) {
				state.gprs[NVMM_X64_GPR_RSI] -= io.size;
			} else {
				state.gprs[NVMM_X64_GPR_RSI] += io.size;
			}
		}
	} else {
		state.gprs[NVMM_X64_GPR_RIP] = exit->u.io.npc;
	}

	ret = nvmm_vcpu_setstate(mach, cpuid, &state, NVMM_X64_STATE_GPRS);
	if (ret == -1)
		return -1;

	return 0;
}

/* -------------------------------------------------------------------------- */

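/*
 * MMIO assist. Not implemented in this revision: the exit reason is validated
 * and ENOSYS is returned.
 */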
int
nvmm_assist_mem(struct nvmm_machine *mach, nvmm_cpuid_t cpuid,
    struct nvmm_exit *exit, void (*cb)(struct nvmm_mem *))
{
	if (__predict_false(exit->reason != NVMM_EXIT_MEMORY)) {
		errno = EINVAL;
		return -1;
	}

	/* TODO */
	errno = ENOSYS;
	return -1;
}