/*	$NetBSD: libnvmm_x86.c,v 1.1 2018/11/10 09:28:56 maxv Exp $	*/

/*
 * Copyright (c) 2018 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Maxime Villard.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include <sys/cdefs.h>

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <fcntl.h>
#include <errno.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <machine/vmparam.h>
#include <machine/pte.h>
#include <machine/psl.h>

#include "nvmm.h"

#include <x86/specialreg.h>

/* -------------------------------------------------------------------------- */

#define PTE32_L1_SHIFT	12
#define PTE32_L2_SHIFT	22

#define PTE32_L2_MASK	0xffc00000
#define PTE32_L1_MASK	0x003ff000

#define PTE32_L2_FRAME	(PTE32_L2_MASK)
#define PTE32_L1_FRAME	(PTE32_L2_FRAME|PTE32_L1_MASK)

#define pte32_l1idx(va)	(((va) & PTE32_L1_MASK) >> PTE32_L1_SHIFT)
#define pte32_l2idx(va)	(((va) & PTE32_L2_MASK) >> PTE32_L2_SHIFT)

typedef uint32_t pte_32bit_t;

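/*
 * Walk the two-level, non-PAE 32bit page tables of the guest, rooted at
 * 'cr3', and translate 'gva' into 'gpa'. The protection found along the
 * walk is accumulated in 'prot'. Returns -1 if the walk fails.
 */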
static int
x86_gva_to_gpa_32bit(struct nvmm_machine *mach, uint64_t cr3,
    gvaddr_t gva, gpaddr_t *gpa, bool has_pse, nvmm_prot_t *prot)
{
	gpaddr_t L2gpa, L1gpa;
	uintptr_t L2hva, L1hva;
	pte_32bit_t *pdir, pte;

	/* We begin with an RWXU access. */
	*prot = NVMM_PROT_ALL;

	/* Parse L2. */
	L2gpa = (cr3 & PG_FRAME);
	if (nvmm_gpa_to_hva(mach, L2gpa, &L2hva) == -1)
		return -1;
	pdir = (pte_32bit_t *)L2hva;
	pte = pdir[pte32_l2idx(gva)];
	if ((pte & PG_V) == 0)
		return -1;
	if ((pte & PG_u) == 0)
		*prot &= ~NVMM_PROT_USER;
	if ((pte & PG_KW) == 0)
		*prot &= ~NVMM_PROT_WRITE;
	if ((pte & PG_PS) && !has_pse)
		return -1;
	if (pte & PG_PS) {
		/* 4MB large page: add the 4KB-page bits from the GVA. */
		*gpa = (pte & PTE32_L2_FRAME) + (gva & PTE32_L1_MASK);
		return 0;
	}

	/* Parse L1. */
	L1gpa = (pte & PG_FRAME);
	if (nvmm_gpa_to_hva(mach, L1gpa, &L1hva) == -1)
		return -1;
	pdir = (pte_32bit_t *)L1hva;
	pte = pdir[pte32_l1idx(gva)];
	if ((pte & PG_V) == 0)
		return -1;
	if ((pte & PG_u) == 0)
		*prot &= ~NVMM_PROT_USER;
	if ((pte & PG_KW) == 0)
		*prot &= ~NVMM_PROT_WRITE;
	if (pte & PG_PS)
		return -1;

	*gpa = (pte & PG_FRAME);
	return 0;
}

/* -------------------------------------------------------------------------- */

#define PTE32_PAE_L1_SHIFT	12
#define PTE32_PAE_L2_SHIFT	21
#define PTE32_PAE_L3_SHIFT	30

#define PTE32_PAE_L3_MASK	0xc0000000
#define PTE32_PAE_L2_MASK	0x3fe00000
#define PTE32_PAE_L1_MASK	0x001ff000

#define PTE32_PAE_L3_FRAME	(PTE32_PAE_L3_MASK)
#define PTE32_PAE_L2_FRAME	(PTE32_PAE_L3_FRAME|PTE32_PAE_L2_MASK)
#define PTE32_PAE_L1_FRAME	(PTE32_PAE_L2_FRAME|PTE32_PAE_L1_MASK)

#define pte32_pae_l1idx(va)	(((va) & PTE32_PAE_L1_MASK) >> PTE32_PAE_L1_SHIFT)
#define pte32_pae_l2idx(va)	(((va) & PTE32_PAE_L2_MASK) >> PTE32_PAE_L2_SHIFT)
#define pte32_pae_l3idx(va)	(((va) & PTE32_PAE_L3_MASK) >> PTE32_PAE_L3_SHIFT)

typedef uint64_t pte_32bit_pae_t;

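/*
 * Same walk as above, but for the three-level 32bit PAE format, where the
 * PTEs are 64bit wide and the NX bit is available.
 */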
static int
x86_gva_to_gpa_32bit_pae(struct nvmm_machine *mach, uint64_t cr3,
    gvaddr_t gva, gpaddr_t *gpa, bool has_pse, nvmm_prot_t *prot)
{
	gpaddr_t L3gpa, L2gpa, L1gpa;
	uintptr_t L3hva, L2hva, L1hva;
	pte_32bit_pae_t *pdir, pte;

	/* We begin with an RWXU access. */
	*prot = NVMM_PROT_ALL;

	/* Parse L3. */
	L3gpa = (cr3 & PG_FRAME);
	if (nvmm_gpa_to_hva(mach, L3gpa, &L3hva) == -1)
		return -1;
	pdir = (pte_32bit_pae_t *)L3hva;
	pte = pdir[pte32_pae_l3idx(gva)];
	if ((pte & PG_V) == 0)
		return -1;
	if (pte & PG_NX)
		*prot &= ~NVMM_PROT_EXEC;
	if (pte & PG_PS)
		return -1;

	/* Parse L2. */
	L2gpa = (pte & PG_FRAME);
	if (nvmm_gpa_to_hva(mach, L2gpa, &L2hva) == -1)
		return -1;
	pdir = (pte_32bit_pae_t *)L2hva;
	pte = pdir[pte32_pae_l2idx(gva)];
	if ((pte & PG_V) == 0)
		return -1;
	if ((pte & PG_u) == 0)
		*prot &= ~NVMM_PROT_USER;
	if ((pte & PG_KW) == 0)
		*prot &= ~NVMM_PROT_WRITE;
	if (pte & PG_NX)
		*prot &= ~NVMM_PROT_EXEC;
	if ((pte & PG_PS) && !has_pse)
		return -1;
	if (pte & PG_PS) {
		/* 2MB large page: add the 4KB-page bits from the GVA. */
		*gpa = (pte & PTE32_PAE_L2_FRAME) + (gva & PTE32_PAE_L1_MASK);
		return 0;
	}

	/* Parse L1. */
	L1gpa = (pte & PG_FRAME);
	if (nvmm_gpa_to_hva(mach, L1gpa, &L1hva) == -1)
		return -1;
	pdir = (pte_32bit_pae_t *)L1hva;
	pte = pdir[pte32_pae_l1idx(gva)];
	if ((pte & PG_V) == 0)
		return -1;
	if ((pte & PG_u) == 0)
		*prot &= ~NVMM_PROT_USER;
	if ((pte & PG_KW) == 0)
		*prot &= ~NVMM_PROT_WRITE;
	if (pte & PG_NX)
		*prot &= ~NVMM_PROT_EXEC;
	if (pte & PG_PS)
		return -1;

	*gpa = (pte & PG_FRAME);
	return 0;
}

/* -------------------------------------------------------------------------- */

#define PTE64_L1_SHIFT	12
#define PTE64_L2_SHIFT	21
#define PTE64_L3_SHIFT	30
#define PTE64_L4_SHIFT	39

#define PTE64_L4_MASK	0x0000ff8000000000
#define PTE64_L3_MASK	0x0000007fc0000000
#define PTE64_L2_MASK	0x000000003fe00000
#define PTE64_L1_MASK	0x00000000001ff000

#define PTE64_L4_FRAME	PTE64_L4_MASK
#define PTE64_L3_FRAME	(PTE64_L4_FRAME|PTE64_L3_MASK)
#define PTE64_L2_FRAME	(PTE64_L3_FRAME|PTE64_L2_MASK)
#define PTE64_L1_FRAME	(PTE64_L2_FRAME|PTE64_L1_MASK)

#define pte64_l1idx(va)	(((va) & PTE64_L1_MASK) >> PTE64_L1_SHIFT)
#define pte64_l2idx(va)	(((va) & PTE64_L2_MASK) >> PTE64_L2_SHIFT)
#define pte64_l3idx(va)	(((va) & PTE64_L3_MASK) >> PTE64_L3_SHIFT)
#define pte64_l4idx(va)	(((va) & PTE64_L4_MASK) >> PTE64_L4_SHIFT)

typedef uint64_t pte_64bit_t;

static inline bool
x86_gva_64bit_canonical(gvaddr_t gva)
{
	/* Bits 63:47 must have the same value. */
#define SIGN_EXTEND	0xffff800000000000ULL
	return (gva & SIGN_EXTEND) == 0 || (gva & SIGN_EXTEND) == SIGN_EXTEND;
}

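/*
 * Walk the four-level 64bit (long mode) page tables of the guest. The GVA
 * must be in canonical form, otherwise the walk fails.
 */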
static int
x86_gva_to_gpa_64bit(struct nvmm_machine *mach, uint64_t cr3,
    gvaddr_t gva, gpaddr_t *gpa, bool has_pse, nvmm_prot_t *prot)
{
	gpaddr_t L4gpa, L3gpa, L2gpa, L1gpa;
	uintptr_t L4hva, L3hva, L2hva, L1hva;
	pte_64bit_t *pdir, pte;

	/* We begin with an RWXU access. */
	*prot = NVMM_PROT_ALL;

	if (!x86_gva_64bit_canonical(gva))
		return -1;

	/* Parse L4. */
	L4gpa = (cr3 & PG_FRAME);
	if (nvmm_gpa_to_hva(mach, L4gpa, &L4hva) == -1)
		return -1;
	pdir = (pte_64bit_t *)L4hva;
	pte = pdir[pte64_l4idx(gva)];
	if ((pte & PG_V) == 0)
		return -1;
	if ((pte & PG_u) == 0)
		*prot &= ~NVMM_PROT_USER;
	if ((pte & PG_KW) == 0)
		*prot &= ~NVMM_PROT_WRITE;
	if (pte & PG_NX)
		*prot &= ~NVMM_PROT_EXEC;
	if (pte & PG_PS)
		return -1;

	/* Parse L3. */
	L3gpa = (pte & PG_FRAME);
	if (nvmm_gpa_to_hva(mach, L3gpa, &L3hva) == -1)
		return -1;
	pdir = (pte_64bit_t *)L3hva;
	pte = pdir[pte64_l3idx(gva)];
	if ((pte & PG_V) == 0)
		return -1;
	if ((pte & PG_u) == 0)
		*prot &= ~NVMM_PROT_USER;
	if ((pte & PG_KW) == 0)
		*prot &= ~NVMM_PROT_WRITE;
	if (pte & PG_NX)
		*prot &= ~NVMM_PROT_EXEC;
	if ((pte & PG_PS) && !has_pse)
		return -1;
	if (pte & PG_PS) {
		/* 1GB large page: add the 4KB-page bits from the GVA. */
		*gpa = (pte & PTE64_L3_FRAME) +
		    (gva & (PTE64_L2_MASK|PTE64_L1_MASK));
		return 0;
	}

	/* Parse L2. */
	L2gpa = (pte & PG_FRAME);
	if (nvmm_gpa_to_hva(mach, L2gpa, &L2hva) == -1)
		return -1;
	pdir = (pte_64bit_t *)L2hva;
	pte = pdir[pte64_l2idx(gva)];
	if ((pte & PG_V) == 0)
		return -1;
	if ((pte & PG_u) == 0)
		*prot &= ~NVMM_PROT_USER;
	if ((pte & PG_KW) == 0)
		*prot &= ~NVMM_PROT_WRITE;
	if (pte & PG_NX)
		*prot &= ~NVMM_PROT_EXEC;
	if ((pte & PG_PS) && !has_pse)
		return -1;
	if (pte & PG_PS) {
		/* 2MB large page: add the 4KB-page bits from the GVA. */
		*gpa = (pte & PTE64_L2_FRAME) + (gva & PTE64_L1_MASK);
		return 0;
	}

	/* Parse L1. */
	L1gpa = (pte & PG_FRAME);
	if (nvmm_gpa_to_hva(mach, L1gpa, &L1hva) == -1)
		return -1;
	pdir = (pte_64bit_t *)L1hva;
	pte = pdir[pte64_l1idx(gva)];
	if ((pte & PG_V) == 0)
		return -1;
	if ((pte & PG_u) == 0)
		*prot &= ~NVMM_PROT_USER;
	if ((pte & PG_KW) == 0)
		*prot &= ~NVMM_PROT_WRITE;
	if (pte & PG_NX)
		*prot &= ~NVMM_PROT_EXEC;
	if (pte & PG_PS)
		return -1;

	*gpa = (pte & PG_FRAME);
	return 0;
}

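/*
 * Select the page table format from CR0/CR4/EFER and perform the walk.
 * If paging is disabled, the GVA maps 1:1 onto the GPA.
 */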
static inline int
x86_gva_to_gpa(struct nvmm_machine *mach, struct nvmm_x64_state *state,
    gvaddr_t gva, gpaddr_t *gpa, nvmm_prot_t *prot)
{
	bool is_pae, is_lng, has_pse;
	uint64_t cr3;
	int ret;

	if ((state->crs[NVMM_X64_CR_CR0] & CR0_PG) == 0) {
		/* No paging. */
		*gpa = gva;
		return 0;
	}

	is_pae = (state->crs[NVMM_X64_CR_CR4] & CR4_PAE) != 0;
	is_lng = (state->msrs[NVMM_X64_MSR_EFER] & EFER_LME) != 0;
	has_pse = (state->crs[NVMM_X64_CR_CR4] & CR4_PSE) != 0;
	cr3 = state->crs[NVMM_X64_CR_CR3];

	if (is_pae && is_lng) {
		/* 64bit */
		ret = x86_gva_to_gpa_64bit(mach, cr3, gva, gpa, has_pse, prot);
	} else if (is_pae && !is_lng) {
		/* 32bit PAE */
		ret = x86_gva_to_gpa_32bit_pae(mach, cr3, gva, gpa, has_pse,
		    prot);
	} else if (!is_pae && !is_lng) {
		/* 32bit */
		ret = x86_gva_to_gpa_32bit(mach, cr3, gva, gpa, has_pse, prot);
	} else {
		ret = -1;
	}

	if (ret == -1) {
		errno = EFAULT;
	}

	return ret;
}

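/*
 * Public entry point: translate a page-aligned GVA of the given VCPU into
 * a GPA, fetching the CRs and MSRs through nvmm_vcpu_getstate(). A typical
 * use, to reach the host mapping of a guest virtual page (sketch):
 *
 *	gpaddr_t gpa;
 *	uintptr_t hva;
 *	nvmm_prot_t prot;
 *
 *	if (nvmm_gva_to_gpa(mach, cpuid, gva, &gpa, &prot) == 0 &&
 *	    nvmm_gpa_to_hva(mach, gpa, &hva) == 0)
 *		... access the page at (uint8_t *)hva ...
 */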
int
nvmm_gva_to_gpa(struct nvmm_machine *mach, nvmm_cpuid_t cpuid,
    gvaddr_t gva, gpaddr_t *gpa, nvmm_prot_t *prot)
{
	struct nvmm_x64_state state;
	int ret;

	if (gva & PAGE_MASK) {
		errno = EINVAL;
		return -1;
	}

	ret = nvmm_vcpu_getstate(mach, cpuid, &state,
	    NVMM_X64_STATE_CRS | NVMM_X64_STATE_MSRS);
	if (ret == -1)
		return -1;

	return x86_gva_to_gpa(mach, &state, gva, gpa, prot);
}

/* -------------------------------------------------------------------------- */

static inline bool
is_long_mode(struct nvmm_x64_state *state)
{
	return (state->msrs[NVMM_X64_MSR_EFER] & EFER_LME) != 0;
}

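/*
 * A string 'in' (INS) writes to guest memory, so the target page must be
 * writable.
 */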
static inline bool
is_illegal(struct nvmm_io *io, nvmm_prot_t prot)
{
	return (io->in && !(prot & NVMM_PROT_WRITE));
}

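/*
 * Add the segment base to the GVA, after a few sanity checks on the
 * segment attributes and limit.
 */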
static int
segment_apply(struct nvmm_x64_state_seg *seg, gvaddr_t *gva, size_t size)
{
	uint64_t limit;

	/*
	 * This is incomplete. We should also check topdown segments, etc.;
	 * really, that's tiring.
	 */
	if (__predict_false(!seg->attrib.p)) {
		goto error;
	}

	limit = (seg->limit + 1);
	if (__predict_true(seg->attrib.gran)) {
		limit *= PAGE_SIZE;
	}

	/* The limit bounds the offset, before the base is added. */
	if (__predict_false(*gva + size > limit)) {
		goto error;
	}

	*gva += seg->base;
	return 0;

error:
	errno = EFAULT;
	return -1;
}

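/*
 * I/O Assist. Gather the operand of the I/O operation (RAX, or the guest
 * buffer of a string operation), invoke the callback provided by the
 * emulator, copy the result back, and advance the GPRs (RIP, plus RCX and
 * RSI for REP/string operations).
 */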
int
nvmm_assist_io(struct nvmm_machine *mach, nvmm_cpuid_t cpuid,
    struct nvmm_exit *exit, void (*cb)(struct nvmm_io *))
{
	struct nvmm_x64_state state;
	struct nvmm_io io;
	nvmm_prot_t prot;
	size_t remain, done;
	uintptr_t hva;
	gvaddr_t gva, off;
	gpaddr_t gpa;
	uint64_t rsi;
	uint8_t tmp[8];
	uint8_t *ptr, *ptr2;
	bool cross;
	int ret;

	if (__predict_false(exit->reason != NVMM_EXIT_IO)) {
		errno = EINVAL;
		return -1;
	}

	io.port = exit->u.io.port;
	io.in = (exit->u.io.type == NVMM_EXIT_IO_IN);
	io.size = exit->u.io.operand_size;

	ret = nvmm_vcpu_getstate(mach, cpuid, &state,
	    NVMM_X64_STATE_GPRS | NVMM_X64_STATE_SEGS |
	    NVMM_X64_STATE_CRS | NVMM_X64_STATE_MSRS);
	if (ret == -1)
		return -1;

	cross = false;

	if (!exit->u.io.str) {
		ptr = (uint8_t *)&state.gprs[NVMM_X64_GPR_RAX];
	} else {
		rsi = state.gprs[NVMM_X64_GPR_RSI];

		switch (exit->u.io.address_size) {
		case 8:
			gva = rsi;
			break;
		case 4:
			gva = (rsi & 0x00000000FFFFFFFF);
			break;
		case 2:
		default: /* impossible */
			gva = (rsi & 0x000000000000FFFF);
			break;
		}

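		/*
		 * Outside of long mode, apply the segment base and limit to
		 * obtain the linear address.
		 */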
		if (!is_long_mode(&state)) {
			ret = segment_apply(&state.segs[exit->u.io.seg], &gva,
			    io.size);
			if (ret == -1)
				return -1;
		}

		off = (gva & PAGE_MASK);
		gva &= ~PAGE_MASK;

		ret = x86_gva_to_gpa(mach, &state, gva, &gpa, &prot);
		if (ret == -1)
			return -1;
		if (__predict_false(is_illegal(&io, prot))) {
			errno = EFAULT;
			return -1;
		}
		ret = nvmm_gpa_to_hva(mach, gpa, &hva);
		if (ret == -1)
			return -1;

		ptr = (uint8_t *)hva + off;

		/*
		 * Special case. If the buffer spans two pages, we need to
		 * retrieve data from the next page.
		 */
		if (__predict_false(off + io.size > PAGE_SIZE)) {
			cross = true;
			remain = off + io.size - PAGE_SIZE;
			done = PAGE_SIZE - off;

			memcpy(tmp, ptr, done);

			ret = x86_gva_to_gpa(mach, &state, gva + PAGE_SIZE,
			    &gpa, &prot);
			if (ret == -1)
				return -1;
			if (__predict_false(is_illegal(&io, prot))) {
				errno = EFAULT;
				return -1;
			}
			ret = nvmm_gpa_to_hva(mach, gpa, &hva);
			if (ret == -1)
				return -1;

			memcpy(&tmp[done], (uint8_t *)hva, remain);
			/* ptr2 points into the second guest page. */
			ptr2 = (uint8_t *)hva;
		}
	}

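	/*
	 * For an 'out', gather the bytes to be sent before invoking the
	 * callback. For an 'in', the callback fills io.data.
	 */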
	if (io.in) {
		/* nothing to do */
	} else {
		if (__predict_false(cross)) {
			/* The full buffer was already copied into tmp. */
			memcpy(io.data, tmp, io.size);
		} else {
			memcpy(io.data, ptr, io.size);
		}
	}

	(*cb)(&io);

	if (io.in) {
		if (!exit->u.io.str)
			state.gprs[NVMM_X64_GPR_RAX] = 0;
		if (__predict_false(cross)) {
			memcpy(ptr, io.data, done);
			memcpy(ptr2, &io.data[done], remain);
		} else {
			memcpy(ptr, io.data, io.size);
		}
	} else {
		/* nothing to do */
	}

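	/*
	 * Advance the registers. With a REP prefix, decrement RCX, move RSI
	 * according to the direction flag, and only jump to the next
	 * instruction once RCX reaches zero. Otherwise jump directly.
	 */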
	if (exit->u.io.rep) {
		state.gprs[NVMM_X64_GPR_RCX] -= 1;
		if (state.gprs[NVMM_X64_GPR_RCX] == 0) {
			state.gprs[NVMM_X64_GPR_RIP] = exit->u.io.npc;
		}
		if (exit->u.io.str) {
			if (state.gprs[NVMM_X64_GPR_RFLAGS] & PSL_D) {
				state.gprs[NVMM_X64_GPR_RSI] -= io.size;
			} else {
				state.gprs[NVMM_X64_GPR_RSI] += io.size;
			}
		}
	} else {
		state.gprs[NVMM_X64_GPR_RIP] = exit->u.io.npc;
	}

	ret = nvmm_vcpu_setstate(mach, cpuid, &state, NVMM_X64_STATE_GPRS);
	if (ret == -1)
		return -1;

	return 0;
}

/* -------------------------------------------------------------------------- */

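/*
 * MMIO Assist, meant to decode and emulate the faulting memory access.
 * Not implemented yet.
 */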
int
nvmm_assist_mem(struct nvmm_machine *mach, nvmm_cpuid_t cpuid,
    struct nvmm_exit *exit, void (*cb)(struct nvmm_mem *))
{
	if (__predict_false(exit->reason != NVMM_EXIT_MEMORY)) {
		errno = EINVAL;
		return -1;
	}

	// TODO
	errno = ENOSYS;
	return -1;
}