     1 /* $NetBSD: nvmm_x86_svm.c,v 1.43 2019/04/28 14:22:13 maxv Exp $ */
2
3 /*
4 * Copyright (c) 2018 The NetBSD Foundation, Inc.
5 * All rights reserved.
6 *
7 * This code is derived from software contributed to The NetBSD Foundation
8 * by Maxime Villard.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 * POSSIBILITY OF SUCH DAMAGE.
30 */
31
32 #include <sys/cdefs.h>
33 __KERNEL_RCSID(0, "$NetBSD: nvmm_x86_svm.c,v 1.43 2019/04/28 14:22:13 maxv Exp $");
34
35 #include <sys/param.h>
36 #include <sys/systm.h>
37 #include <sys/kernel.h>
38 #include <sys/kmem.h>
39 #include <sys/cpu.h>
40 #include <sys/xcall.h>
41 #include <sys/mman.h>
42
43 #include <uvm/uvm.h>
44 #include <uvm/uvm_page.h>
45
46 #include <x86/cputypes.h>
47 #include <x86/specialreg.h>
48 #include <x86/pmap.h>
49 #include <x86/dbregs.h>
50 #include <x86/cpu_counter.h>
51 #include <machine/cpuvar.h>
52
53 #include <dev/nvmm/nvmm.h>
54 #include <dev/nvmm/nvmm_internal.h>
55 #include <dev/nvmm/x86/nvmm_x86.h>
56
57 int svm_vmrun(paddr_t, uint64_t *);
58
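/*
 * VM_HSAVE_PA (MSR C001_0117h) must be loaded on each host CPU with the
 * physical address of a 4KB host state-save area before VMRUN can be
 * executed there; the per-CPU hsave[] array further down tracks that page.
 */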
59 #define MSR_VM_HSAVE_PA 0xC0010117
60
61 /* -------------------------------------------------------------------------- */
62
63 #define VMCB_EXITCODE_CR0_READ 0x0000
64 #define VMCB_EXITCODE_CR1_READ 0x0001
65 #define VMCB_EXITCODE_CR2_READ 0x0002
66 #define VMCB_EXITCODE_CR3_READ 0x0003
67 #define VMCB_EXITCODE_CR4_READ 0x0004
68 #define VMCB_EXITCODE_CR5_READ 0x0005
69 #define VMCB_EXITCODE_CR6_READ 0x0006
70 #define VMCB_EXITCODE_CR7_READ 0x0007
71 #define VMCB_EXITCODE_CR8_READ 0x0008
72 #define VMCB_EXITCODE_CR9_READ 0x0009
73 #define VMCB_EXITCODE_CR10_READ 0x000A
74 #define VMCB_EXITCODE_CR11_READ 0x000B
75 #define VMCB_EXITCODE_CR12_READ 0x000C
76 #define VMCB_EXITCODE_CR13_READ 0x000D
77 #define VMCB_EXITCODE_CR14_READ 0x000E
78 #define VMCB_EXITCODE_CR15_READ 0x000F
79 #define VMCB_EXITCODE_CR0_WRITE 0x0010
80 #define VMCB_EXITCODE_CR1_WRITE 0x0011
81 #define VMCB_EXITCODE_CR2_WRITE 0x0012
82 #define VMCB_EXITCODE_CR3_WRITE 0x0013
83 #define VMCB_EXITCODE_CR4_WRITE 0x0014
84 #define VMCB_EXITCODE_CR5_WRITE 0x0015
85 #define VMCB_EXITCODE_CR6_WRITE 0x0016
86 #define VMCB_EXITCODE_CR7_WRITE 0x0017
87 #define VMCB_EXITCODE_CR8_WRITE 0x0018
88 #define VMCB_EXITCODE_CR9_WRITE 0x0019
89 #define VMCB_EXITCODE_CR10_WRITE 0x001A
90 #define VMCB_EXITCODE_CR11_WRITE 0x001B
91 #define VMCB_EXITCODE_CR12_WRITE 0x001C
92 #define VMCB_EXITCODE_CR13_WRITE 0x001D
93 #define VMCB_EXITCODE_CR14_WRITE 0x001E
94 #define VMCB_EXITCODE_CR15_WRITE 0x001F
95 #define VMCB_EXITCODE_DR0_READ 0x0020
96 #define VMCB_EXITCODE_DR1_READ 0x0021
97 #define VMCB_EXITCODE_DR2_READ 0x0022
98 #define VMCB_EXITCODE_DR3_READ 0x0023
99 #define VMCB_EXITCODE_DR4_READ 0x0024
100 #define VMCB_EXITCODE_DR5_READ 0x0025
101 #define VMCB_EXITCODE_DR6_READ 0x0026
102 #define VMCB_EXITCODE_DR7_READ 0x0027
103 #define VMCB_EXITCODE_DR8_READ 0x0028
104 #define VMCB_EXITCODE_DR9_READ 0x0029
105 #define VMCB_EXITCODE_DR10_READ 0x002A
106 #define VMCB_EXITCODE_DR11_READ 0x002B
107 #define VMCB_EXITCODE_DR12_READ 0x002C
108 #define VMCB_EXITCODE_DR13_READ 0x002D
109 #define VMCB_EXITCODE_DR14_READ 0x002E
110 #define VMCB_EXITCODE_DR15_READ 0x002F
111 #define VMCB_EXITCODE_DR0_WRITE 0x0030
112 #define VMCB_EXITCODE_DR1_WRITE 0x0031
113 #define VMCB_EXITCODE_DR2_WRITE 0x0032
114 #define VMCB_EXITCODE_DR3_WRITE 0x0033
115 #define VMCB_EXITCODE_DR4_WRITE 0x0034
116 #define VMCB_EXITCODE_DR5_WRITE 0x0035
117 #define VMCB_EXITCODE_DR6_WRITE 0x0036
118 #define VMCB_EXITCODE_DR7_WRITE 0x0037
119 #define VMCB_EXITCODE_DR8_WRITE 0x0038
120 #define VMCB_EXITCODE_DR9_WRITE 0x0039
121 #define VMCB_EXITCODE_DR10_WRITE 0x003A
122 #define VMCB_EXITCODE_DR11_WRITE 0x003B
123 #define VMCB_EXITCODE_DR12_WRITE 0x003C
124 #define VMCB_EXITCODE_DR13_WRITE 0x003D
125 #define VMCB_EXITCODE_DR14_WRITE 0x003E
126 #define VMCB_EXITCODE_DR15_WRITE 0x003F
127 #define VMCB_EXITCODE_EXCP0 0x0040
128 #define VMCB_EXITCODE_EXCP1 0x0041
129 #define VMCB_EXITCODE_EXCP2 0x0042
130 #define VMCB_EXITCODE_EXCP3 0x0043
131 #define VMCB_EXITCODE_EXCP4 0x0044
132 #define VMCB_EXITCODE_EXCP5 0x0045
133 #define VMCB_EXITCODE_EXCP6 0x0046
134 #define VMCB_EXITCODE_EXCP7 0x0047
135 #define VMCB_EXITCODE_EXCP8 0x0048
136 #define VMCB_EXITCODE_EXCP9 0x0049
137 #define VMCB_EXITCODE_EXCP10 0x004A
138 #define VMCB_EXITCODE_EXCP11 0x004B
139 #define VMCB_EXITCODE_EXCP12 0x004C
140 #define VMCB_EXITCODE_EXCP13 0x004D
141 #define VMCB_EXITCODE_EXCP14 0x004E
142 #define VMCB_EXITCODE_EXCP15 0x004F
143 #define VMCB_EXITCODE_EXCP16 0x0050
144 #define VMCB_EXITCODE_EXCP17 0x0051
145 #define VMCB_EXITCODE_EXCP18 0x0052
146 #define VMCB_EXITCODE_EXCP19 0x0053
147 #define VMCB_EXITCODE_EXCP20 0x0054
148 #define VMCB_EXITCODE_EXCP21 0x0055
149 #define VMCB_EXITCODE_EXCP22 0x0056
150 #define VMCB_EXITCODE_EXCP23 0x0057
151 #define VMCB_EXITCODE_EXCP24 0x0058
152 #define VMCB_EXITCODE_EXCP25 0x0059
153 #define VMCB_EXITCODE_EXCP26 0x005A
154 #define VMCB_EXITCODE_EXCP27 0x005B
155 #define VMCB_EXITCODE_EXCP28 0x005C
156 #define VMCB_EXITCODE_EXCP29 0x005D
157 #define VMCB_EXITCODE_EXCP30 0x005E
158 #define VMCB_EXITCODE_EXCP31 0x005F
159 #define VMCB_EXITCODE_INTR 0x0060
160 #define VMCB_EXITCODE_NMI 0x0061
161 #define VMCB_EXITCODE_SMI 0x0062
162 #define VMCB_EXITCODE_INIT 0x0063
163 #define VMCB_EXITCODE_VINTR 0x0064
164 #define VMCB_EXITCODE_CR0_SEL_WRITE 0x0065
165 #define VMCB_EXITCODE_IDTR_READ 0x0066
166 #define VMCB_EXITCODE_GDTR_READ 0x0067
167 #define VMCB_EXITCODE_LDTR_READ 0x0068
168 #define VMCB_EXITCODE_TR_READ 0x0069
169 #define VMCB_EXITCODE_IDTR_WRITE 0x006A
170 #define VMCB_EXITCODE_GDTR_WRITE 0x006B
171 #define VMCB_EXITCODE_LDTR_WRITE 0x006C
172 #define VMCB_EXITCODE_TR_WRITE 0x006D
173 #define VMCB_EXITCODE_RDTSC 0x006E
174 #define VMCB_EXITCODE_RDPMC 0x006F
175 #define VMCB_EXITCODE_PUSHF 0x0070
176 #define VMCB_EXITCODE_POPF 0x0071
177 #define VMCB_EXITCODE_CPUID 0x0072
178 #define VMCB_EXITCODE_RSM 0x0073
179 #define VMCB_EXITCODE_IRET 0x0074
180 #define VMCB_EXITCODE_SWINT 0x0075
181 #define VMCB_EXITCODE_INVD 0x0076
182 #define VMCB_EXITCODE_PAUSE 0x0077
183 #define VMCB_EXITCODE_HLT 0x0078
184 #define VMCB_EXITCODE_INVLPG 0x0079
185 #define VMCB_EXITCODE_INVLPGA 0x007A
186 #define VMCB_EXITCODE_IOIO 0x007B
187 #define VMCB_EXITCODE_MSR 0x007C
188 #define VMCB_EXITCODE_TASK_SWITCH 0x007D
189 #define VMCB_EXITCODE_FERR_FREEZE 0x007E
190 #define VMCB_EXITCODE_SHUTDOWN 0x007F
191 #define VMCB_EXITCODE_VMRUN 0x0080
192 #define VMCB_EXITCODE_VMMCALL 0x0081
193 #define VMCB_EXITCODE_VMLOAD 0x0082
194 #define VMCB_EXITCODE_VMSAVE 0x0083
195 #define VMCB_EXITCODE_STGI 0x0084
196 #define VMCB_EXITCODE_CLGI 0x0085
197 #define VMCB_EXITCODE_SKINIT 0x0086
198 #define VMCB_EXITCODE_RDTSCP 0x0087
199 #define VMCB_EXITCODE_ICEBP 0x0088
200 #define VMCB_EXITCODE_WBINVD 0x0089
201 #define VMCB_EXITCODE_MONITOR 0x008A
202 #define VMCB_EXITCODE_MWAIT 0x008B
203 #define VMCB_EXITCODE_MWAIT_CONDITIONAL 0x008C
204 #define VMCB_EXITCODE_XSETBV 0x008D
205 #define VMCB_EXITCODE_EFER_WRITE_TRAP 0x008F
206 #define VMCB_EXITCODE_CR0_WRITE_TRAP 0x0090
207 #define VMCB_EXITCODE_CR1_WRITE_TRAP 0x0091
208 #define VMCB_EXITCODE_CR2_WRITE_TRAP 0x0092
209 #define VMCB_EXITCODE_CR3_WRITE_TRAP 0x0093
210 #define VMCB_EXITCODE_CR4_WRITE_TRAP 0x0094
211 #define VMCB_EXITCODE_CR5_WRITE_TRAP 0x0095
212 #define VMCB_EXITCODE_CR6_WRITE_TRAP 0x0096
213 #define VMCB_EXITCODE_CR7_WRITE_TRAP 0x0097
214 #define VMCB_EXITCODE_CR8_WRITE_TRAP 0x0098
215 #define VMCB_EXITCODE_CR9_WRITE_TRAP 0x0099
216 #define VMCB_EXITCODE_CR10_WRITE_TRAP 0x009A
217 #define VMCB_EXITCODE_CR11_WRITE_TRAP 0x009B
218 #define VMCB_EXITCODE_CR12_WRITE_TRAP 0x009C
219 #define VMCB_EXITCODE_CR13_WRITE_TRAP 0x009D
220 #define VMCB_EXITCODE_CR14_WRITE_TRAP 0x009E
221 #define VMCB_EXITCODE_CR15_WRITE_TRAP 0x009F
222 #define VMCB_EXITCODE_NPF 0x0400
223 #define VMCB_EXITCODE_AVIC_INCOMP_IPI 0x0401
224 #define VMCB_EXITCODE_AVIC_NOACCEL 0x0402
225 #define VMCB_EXITCODE_VMGEXIT 0x0403
226 #define VMCB_EXITCODE_INVALID -1
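/*
 * These values mirror the EXITCODE field of the VMCB control area (AMD APM
 * Vol. 2). VMCB_EXITCODE_INVALID (-1) is reported when VMRUN refuses to
 * enter the guest because the VMCB state failed the consistency checks;
 * the run loop below tests for it after each run.
 */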
227
228 /* -------------------------------------------------------------------------- */
229
230 struct vmcb_ctrl {
231 uint32_t intercept_cr;
232 #define VMCB_CTRL_INTERCEPT_RCR(x) __BIT( 0 + x)
233 #define VMCB_CTRL_INTERCEPT_WCR(x) __BIT(16 + x)
234
235 uint32_t intercept_dr;
236 #define VMCB_CTRL_INTERCEPT_RDR(x) __BIT( 0 + x)
237 #define VMCB_CTRL_INTERCEPT_WDR(x) __BIT(16 + x)
238
239 uint32_t intercept_vec;
240 #define VMCB_CTRL_INTERCEPT_VEC(x) __BIT(x)
241
242 uint32_t intercept_misc1;
243 #define VMCB_CTRL_INTERCEPT_INTR __BIT(0)
244 #define VMCB_CTRL_INTERCEPT_NMI __BIT(1)
245 #define VMCB_CTRL_INTERCEPT_SMI __BIT(2)
246 #define VMCB_CTRL_INTERCEPT_INIT __BIT(3)
247 #define VMCB_CTRL_INTERCEPT_VINTR __BIT(4)
248 #define VMCB_CTRL_INTERCEPT_CR0_SPEC __BIT(5)
249 #define VMCB_CTRL_INTERCEPT_RIDTR __BIT(6)
250 #define VMCB_CTRL_INTERCEPT_RGDTR __BIT(7)
251 #define VMCB_CTRL_INTERCEPT_RLDTR __BIT(8)
252 #define VMCB_CTRL_INTERCEPT_RTR __BIT(9)
253 #define VMCB_CTRL_INTERCEPT_WIDTR __BIT(10)
254 #define VMCB_CTRL_INTERCEPT_WGDTR __BIT(11)
255 #define VMCB_CTRL_INTERCEPT_WLDTR __BIT(12)
256 #define VMCB_CTRL_INTERCEPT_WTR __BIT(13)
257 #define VMCB_CTRL_INTERCEPT_RDTSC __BIT(14)
258 #define VMCB_CTRL_INTERCEPT_RDPMC __BIT(15)
259 #define VMCB_CTRL_INTERCEPT_PUSHF __BIT(16)
260 #define VMCB_CTRL_INTERCEPT_POPF __BIT(17)
261 #define VMCB_CTRL_INTERCEPT_CPUID __BIT(18)
262 #define VMCB_CTRL_INTERCEPT_RSM __BIT(19)
263 #define VMCB_CTRL_INTERCEPT_IRET __BIT(20)
264 #define VMCB_CTRL_INTERCEPT_INTN __BIT(21)
265 #define VMCB_CTRL_INTERCEPT_INVD __BIT(22)
266 #define VMCB_CTRL_INTERCEPT_PAUSE __BIT(23)
267 #define VMCB_CTRL_INTERCEPT_HLT __BIT(24)
268 #define VMCB_CTRL_INTERCEPT_INVLPG __BIT(25)
269 #define VMCB_CTRL_INTERCEPT_INVLPGA __BIT(26)
270 #define VMCB_CTRL_INTERCEPT_IOIO_PROT __BIT(27)
271 #define VMCB_CTRL_INTERCEPT_MSR_PROT __BIT(28)
272 #define VMCB_CTRL_INTERCEPT_TASKSW __BIT(29)
273 #define VMCB_CTRL_INTERCEPT_FERR_FREEZE __BIT(30)
274 #define VMCB_CTRL_INTERCEPT_SHUTDOWN __BIT(31)
275
276 uint32_t intercept_misc2;
277 #define VMCB_CTRL_INTERCEPT_VMRUN __BIT(0)
278 #define VMCB_CTRL_INTERCEPT_VMMCALL __BIT(1)
279 #define VMCB_CTRL_INTERCEPT_VMLOAD __BIT(2)
280 #define VMCB_CTRL_INTERCEPT_VMSAVE __BIT(3)
281 #define VMCB_CTRL_INTERCEPT_STGI __BIT(4)
282 #define VMCB_CTRL_INTERCEPT_CLGI __BIT(5)
283 #define VMCB_CTRL_INTERCEPT_SKINIT __BIT(6)
284 #define VMCB_CTRL_INTERCEPT_RDTSCP __BIT(7)
285 #define VMCB_CTRL_INTERCEPT_ICEBP __BIT(8)
286 #define VMCB_CTRL_INTERCEPT_WBINVD __BIT(9)
287 #define VMCB_CTRL_INTERCEPT_MONITOR __BIT(10)
288 #define VMCB_CTRL_INTERCEPT_MWAIT __BIT(12)
289 #define VMCB_CTRL_INTERCEPT_XSETBV __BIT(13)
290 #define VMCB_CTRL_INTERCEPT_EFER_SPEC __BIT(15)
291 #define VMCB_CTRL_INTERCEPT_WCR_SPEC(x) __BIT(16 + x)
292
293 uint8_t rsvd1[40];
294 uint16_t pause_filt_thresh;
295 uint16_t pause_filt_cnt;
296 uint64_t iopm_base_pa;
297 uint64_t msrpm_base_pa;
298 uint64_t tsc_offset;
299 uint32_t guest_asid;
300
301 uint32_t tlb_ctrl;
302 #define VMCB_CTRL_TLB_CTRL_FLUSH_ALL 0x01
303 #define VMCB_CTRL_TLB_CTRL_FLUSH_GUEST 0x03
304 #define VMCB_CTRL_TLB_CTRL_FLUSH_GUEST_NONGLOBAL 0x07
305
306 uint64_t v;
307 #define VMCB_CTRL_V_TPR __BITS(3,0)
308 #define VMCB_CTRL_V_IRQ __BIT(8)
309 #define VMCB_CTRL_V_VGIF __BIT(9)
310 #define VMCB_CTRL_V_INTR_PRIO __BITS(19,16)
311 #define VMCB_CTRL_V_IGN_TPR __BIT(20)
312 #define VMCB_CTRL_V_INTR_MASKING __BIT(24)
313 #define VMCB_CTRL_V_GUEST_VGIF __BIT(25)
314 #define VMCB_CTRL_V_AVIC_EN __BIT(31)
315 #define VMCB_CTRL_V_INTR_VECTOR __BITS(39,32)
316
317 uint64_t intr;
318 #define VMCB_CTRL_INTR_SHADOW __BIT(0)
319
320 uint64_t exitcode;
321 uint64_t exitinfo1;
322 uint64_t exitinfo2;
323
324 uint64_t exitintinfo;
325 #define VMCB_CTRL_EXITINTINFO_VECTOR __BITS(7,0)
326 #define VMCB_CTRL_EXITINTINFO_TYPE __BITS(10,8)
327 #define VMCB_CTRL_EXITINTINFO_EV __BIT(11)
328 #define VMCB_CTRL_EXITINTINFO_V __BIT(31)
329 #define VMCB_CTRL_EXITINTINFO_ERRORCODE __BITS(63,32)
330
331 uint64_t enable1;
332 #define VMCB_CTRL_ENABLE_NP __BIT(0)
333 #define VMCB_CTRL_ENABLE_SEV __BIT(1)
334 #define VMCB_CTRL_ENABLE_ES_SEV __BIT(2)
335
336 uint64_t avic;
337 #define VMCB_CTRL_AVIC_APIC_BAR __BITS(51,0)
338
339 uint64_t ghcb;
340
341 uint64_t eventinj;
342 #define VMCB_CTRL_EVENTINJ_VECTOR __BITS(7,0)
343 #define VMCB_CTRL_EVENTINJ_TYPE __BITS(10,8)
344 #define VMCB_CTRL_EVENTINJ_EV __BIT(11)
345 #define VMCB_CTRL_EVENTINJ_V __BIT(31)
346 #define VMCB_CTRL_EVENTINJ_ERRORCODE __BITS(63,32)
347
348 uint64_t n_cr3;
349
350 uint64_t enable2;
351 #define VMCB_CTRL_ENABLE_LBR __BIT(0)
352 #define VMCB_CTRL_ENABLE_VVMSAVE __BIT(1)
353
354 uint32_t vmcb_clean;
355 #define VMCB_CTRL_VMCB_CLEAN_I __BIT(0)
356 #define VMCB_CTRL_VMCB_CLEAN_IOPM __BIT(1)
357 #define VMCB_CTRL_VMCB_CLEAN_ASID __BIT(2)
358 #define VMCB_CTRL_VMCB_CLEAN_TPR __BIT(3)
359 #define VMCB_CTRL_VMCB_CLEAN_NP __BIT(4)
360 #define VMCB_CTRL_VMCB_CLEAN_CR __BIT(5)
361 #define VMCB_CTRL_VMCB_CLEAN_DR __BIT(6)
362 #define VMCB_CTRL_VMCB_CLEAN_DT __BIT(7)
363 #define VMCB_CTRL_VMCB_CLEAN_SEG __BIT(8)
364 #define VMCB_CTRL_VMCB_CLEAN_CR2 __BIT(9)
365 #define VMCB_CTRL_VMCB_CLEAN_LBR __BIT(10)
366 #define VMCB_CTRL_VMCB_CLEAN_AVIC __BIT(11)
367
368 uint32_t rsvd2;
369 uint64_t nrip;
370 uint8_t inst_len;
371 uint8_t inst_bytes[15];
372 uint64_t avic_abpp;
373 uint64_t rsvd3;
374 uint64_t avic_ltp;
375
376 uint64_t avic_phys;
377 #define VMCB_CTRL_AVIC_PHYS_TABLE_PTR __BITS(51,12)
378 #define VMCB_CTRL_AVIC_PHYS_MAX_INDEX __BITS(7,0)
379
380 uint64_t rsvd4;
381 uint64_t vmcb_ptr;
382
383 uint8_t pad[752];
384 } __packed;
385
386 CTASSERT(sizeof(struct vmcb_ctrl) == 1024);
387
388 struct vmcb_segment {
389 uint16_t selector;
390 uint16_t attrib; /* hidden */
391 uint32_t limit; /* hidden */
392 uint64_t base; /* hidden */
393 } __packed;
394
395 CTASSERT(sizeof(struct vmcb_segment) == 16);
396
397 struct vmcb_state {
398 struct vmcb_segment es;
399 struct vmcb_segment cs;
400 struct vmcb_segment ss;
401 struct vmcb_segment ds;
402 struct vmcb_segment fs;
403 struct vmcb_segment gs;
404 struct vmcb_segment gdt;
405 struct vmcb_segment ldt;
406 struct vmcb_segment idt;
407 struct vmcb_segment tr;
408 uint8_t rsvd1[43];
409 uint8_t cpl;
410 uint8_t rsvd2[4];
411 uint64_t efer;
412 uint8_t rsvd3[112];
413 uint64_t cr4;
414 uint64_t cr3;
415 uint64_t cr0;
416 uint64_t dr7;
417 uint64_t dr6;
418 uint64_t rflags;
419 uint64_t rip;
420 uint8_t rsvd4[88];
421 uint64_t rsp;
422 uint8_t rsvd5[24];
423 uint64_t rax;
424 uint64_t star;
425 uint64_t lstar;
426 uint64_t cstar;
427 uint64_t sfmask;
428 uint64_t kernelgsbase;
429 uint64_t sysenter_cs;
430 uint64_t sysenter_esp;
431 uint64_t sysenter_eip;
432 uint64_t cr2;
433 uint8_t rsvd6[32];
434 uint64_t g_pat;
435 uint64_t dbgctl;
436 uint64_t br_from;
437 uint64_t br_to;
438 uint64_t int_from;
439 uint64_t int_to;
440 uint8_t pad[2408];
441 } __packed;
442
443 CTASSERT(sizeof(struct vmcb_state) == 0xC00);
444
445 struct vmcb {
446 struct vmcb_ctrl ctrl;
447 struct vmcb_state state;
448 } __packed;
449
450 CTASSERT(sizeof(struct vmcb) == PAGE_SIZE);
451 CTASSERT(offsetof(struct vmcb, state) == 0x400);
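/*
 * The VMCB is a single 4KB page: the control area occupies the first 1KB
 * and the state-save area starts at offset 0x400, which is exactly what
 * the two CTASSERTs above verify.
 */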
452
453 /* -------------------------------------------------------------------------- */
454
455 static void svm_vcpu_state_provide(struct nvmm_cpu *, uint64_t);
456 static void svm_vcpu_state_commit(struct nvmm_cpu *);
457
458 struct svm_hsave {
459 paddr_t pa;
460 };
461
462 static struct svm_hsave hsave[MAXCPUS];
463
464 static uint8_t *svm_asidmap __read_mostly;
465 static uint32_t svm_maxasid __read_mostly;
466 static kmutex_t svm_asidlock __cacheline_aligned;
467
468 static bool svm_decode_assist __read_mostly;
469 static uint32_t svm_ctrl_tlb_flush __read_mostly;
470
471 #define SVM_XCR0_MASK_DEFAULT (XCR0_X87|XCR0_SSE)
472 static uint64_t svm_xcr0_mask __read_mostly;
473
474 #define SVM_NCPUIDS 32
475
476 #define VMCB_NPAGES 1
477
478 #define MSRBM_NPAGES 2
479 #define MSRBM_SIZE (MSRBM_NPAGES * PAGE_SIZE)
480
481 #define IOBM_NPAGES 3
482 #define IOBM_SIZE (IOBM_NPAGES * PAGE_SIZE)
483
484 /* Does not include EFER_LMSLE. */
485 #define EFER_VALID \
486 (EFER_SCE|EFER_LME|EFER_LMA|EFER_NXE|EFER_SVME|EFER_FFXSR|EFER_TCE)
487
488 #define EFER_TLB_FLUSH \
489 (EFER_NXE|EFER_LMA|EFER_LME)
490 #define CR0_TLB_FLUSH \
491 (CR0_PG|CR0_WP|CR0_CD|CR0_NW)
492 #define CR4_TLB_FLUSH \
493 (CR4_PGE|CR4_PAE|CR4_PSE)
494
495 /* -------------------------------------------------------------------------- */
496
497 struct svm_machdata {
498 bool cpuidpresent[SVM_NCPUIDS];
499 struct nvmm_x86_conf_cpuid cpuid[SVM_NCPUIDS];
500 volatile uint64_t mach_htlb_gen;
501 };
502
503 static const size_t svm_conf_sizes[NVMM_X86_NCONF] = {
504 [NVMM_X86_CONF_CPUID] = sizeof(struct nvmm_x86_conf_cpuid)
505 };
506
507 struct svm_cpudata {
508 /* General */
509 bool shared_asid;
510 bool gtlb_want_flush;
511 bool gtsc_want_update;
512 uint64_t vcpu_htlb_gen;
513
514 /* VMCB */
515 struct vmcb *vmcb;
516 paddr_t vmcb_pa;
517
518 /* I/O bitmap */
519 uint8_t *iobm;
520 paddr_t iobm_pa;
521
522 /* MSR bitmap */
523 uint8_t *msrbm;
524 paddr_t msrbm_pa;
525
526 /* Host state */
527 uint64_t hxcr0;
528 uint64_t star;
529 uint64_t lstar;
530 uint64_t cstar;
531 uint64_t sfmask;
532 uint64_t fsbase;
533 uint64_t kernelgsbase;
534 bool ts_set;
535 struct xsave_header hfpu __aligned(64);
536
537 /* Intr state */
538 bool int_window_exit;
539 bool nmi_window_exit;
540 bool evt_pending;
541
542 /* Guest state */
543 uint64_t gxcr0;
544 uint64_t gprs[NVMM_X64_NGPR];
545 uint64_t drs[NVMM_X64_NDR];
546 uint64_t gtsc;
547 struct xsave_header gfpu __aligned(64);
548 };
549
550 static void
551 svm_vmcb_cache_default(struct vmcb *vmcb)
552 {
553 vmcb->ctrl.vmcb_clean =
554 VMCB_CTRL_VMCB_CLEAN_I |
555 VMCB_CTRL_VMCB_CLEAN_IOPM |
556 VMCB_CTRL_VMCB_CLEAN_ASID |
557 VMCB_CTRL_VMCB_CLEAN_TPR |
558 VMCB_CTRL_VMCB_CLEAN_NP |
559 VMCB_CTRL_VMCB_CLEAN_CR |
560 VMCB_CTRL_VMCB_CLEAN_DR |
561 VMCB_CTRL_VMCB_CLEAN_DT |
562 VMCB_CTRL_VMCB_CLEAN_SEG |
563 VMCB_CTRL_VMCB_CLEAN_CR2 |
564 VMCB_CTRL_VMCB_CLEAN_LBR |
565 VMCB_CTRL_VMCB_CLEAN_AVIC;
566 }
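/*
 * The "clean" bits are a caching hint: a bit left set tells the CPU that
 * the corresponding VMCB fields have not been modified since the last
 * VMRUN on this CPU, so internally cached copies may be reused.
 * svm_vmcb_cache_default() marks everything clean after a run; the helpers
 * below clear individual bits whenever software touches matching fields.
 */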
567
568 static void
569 svm_vmcb_cache_update(struct vmcb *vmcb, uint64_t flags)
570 {
571 if (flags & NVMM_X64_STATE_SEGS) {
572 vmcb->ctrl.vmcb_clean &=
573 ~(VMCB_CTRL_VMCB_CLEAN_SEG | VMCB_CTRL_VMCB_CLEAN_DT);
574 }
575 if (flags & NVMM_X64_STATE_CRS) {
576 vmcb->ctrl.vmcb_clean &=
577 ~(VMCB_CTRL_VMCB_CLEAN_CR | VMCB_CTRL_VMCB_CLEAN_CR2 |
578 VMCB_CTRL_VMCB_CLEAN_TPR);
579 }
580 if (flags & NVMM_X64_STATE_DRS) {
581 vmcb->ctrl.vmcb_clean &= ~VMCB_CTRL_VMCB_CLEAN_DR;
582 }
583 if (flags & NVMM_X64_STATE_MSRS) {
584 /* CR for EFER, NP for PAT. */
585 vmcb->ctrl.vmcb_clean &=
586 ~(VMCB_CTRL_VMCB_CLEAN_CR | VMCB_CTRL_VMCB_CLEAN_NP);
587 }
588 }
589
590 static inline void
591 svm_vmcb_cache_flush(struct vmcb *vmcb, uint64_t flags)
592 {
593 vmcb->ctrl.vmcb_clean &= ~flags;
594 }
595
596 static inline void
597 svm_vmcb_cache_flush_all(struct vmcb *vmcb)
598 {
599 vmcb->ctrl.vmcb_clean = 0;
600 }
601
602 #define SVM_EVENT_TYPE_HW_INT 0
603 #define SVM_EVENT_TYPE_NMI 2
604 #define SVM_EVENT_TYPE_EXC 3
605 #define SVM_EVENT_TYPE_SW_INT 4
606
607 static void
608 svm_event_waitexit_enable(struct nvmm_cpu *vcpu, bool nmi)
609 {
610 struct svm_cpudata *cpudata = vcpu->cpudata;
611 struct vmcb *vmcb = cpudata->vmcb;
612
613 if (nmi) {
614 vmcb->ctrl.intercept_misc1 |= VMCB_CTRL_INTERCEPT_IRET;
615 cpudata->nmi_window_exit = true;
616 } else {
617 vmcb->ctrl.intercept_misc1 |= VMCB_CTRL_INTERCEPT_VINTR;
618 vmcb->ctrl.v |= (VMCB_CTRL_V_IRQ | VMCB_CTRL_V_IGN_TPR);
619 svm_vmcb_cache_flush(vmcb, VMCB_CTRL_VMCB_CLEAN_TPR);
620 cpudata->int_window_exit = true;
621 }
622
623 svm_vmcb_cache_flush(vmcb, VMCB_CTRL_VMCB_CLEAN_I);
624 }
625
626 static void
627 svm_event_waitexit_disable(struct nvmm_cpu *vcpu, bool nmi)
628 {
629 struct svm_cpudata *cpudata = vcpu->cpudata;
630 struct vmcb *vmcb = cpudata->vmcb;
631
632 if (nmi) {
633 vmcb->ctrl.intercept_misc1 &= ~VMCB_CTRL_INTERCEPT_IRET;
634 cpudata->nmi_window_exit = false;
635 } else {
636 vmcb->ctrl.intercept_misc1 &= ~VMCB_CTRL_INTERCEPT_VINTR;
637 vmcb->ctrl.v &= ~(VMCB_CTRL_V_IRQ | VMCB_CTRL_V_IGN_TPR);
638 svm_vmcb_cache_flush(vmcb, VMCB_CTRL_VMCB_CLEAN_TPR);
639 cpudata->int_window_exit = false;
640 }
641
642 svm_vmcb_cache_flush(vmcb, VMCB_CTRL_VMCB_CLEAN_I);
643 }
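/*
 * SVM has no direct "interrupt window" intercept. To learn when the guest
 * becomes interruptible, a dummy virtual interrupt is posted (V_IRQ with
 * V_IGN_TPR) together with the VINTR intercept, which fires as soon as the
 * guest can accept interrupts. For NMIs, intercepting IRET approximates
 * the end of the NMI-blocked window.
 */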
644
645 static inline int
646 svm_event_has_error(uint64_t vector)
647 {
648 switch (vector) {
649 case 8: /* #DF */
650 case 10: /* #TS */
651 case 11: /* #NP */
652 case 12: /* #SS */
653 case 13: /* #GP */
654 case 14: /* #PF */
655 case 17: /* #AC */
656 case 30: /* #SX */
657 return 1;
658 default:
659 return 0;
660 }
661 }
662
663 static int
664 svm_vcpu_inject(struct nvmm_machine *mach, struct nvmm_cpu *vcpu,
665 struct nvmm_event *event)
666 {
667 struct svm_cpudata *cpudata = vcpu->cpudata;
668 struct vmcb *vmcb = cpudata->vmcb;
669 int type = 0, err = 0;
670
671 if (event->vector >= 256) {
672 return EINVAL;
673 }
674
675 switch (event->type) {
676 case NVMM_EVENT_INTERRUPT_HW:
677 type = SVM_EVENT_TYPE_HW_INT;
678 if (event->vector == 2) {
679 type = SVM_EVENT_TYPE_NMI;
680 }
681 if (type == SVM_EVENT_TYPE_NMI) {
682 if (cpudata->nmi_window_exit) {
683 return EAGAIN;
684 }
685 svm_event_waitexit_enable(vcpu, true);
686 } else {
687 if (((vmcb->state.rflags & PSL_I) == 0) ||
688 ((vmcb->ctrl.intr & VMCB_CTRL_INTR_SHADOW) != 0)) {
689 svm_event_waitexit_enable(vcpu, false);
690 return EAGAIN;
691 }
692 }
693 err = 0;
694 break;
695 case NVMM_EVENT_INTERRUPT_SW:
696 return EINVAL;
697 case NVMM_EVENT_EXCEPTION:
698 type = SVM_EVENT_TYPE_EXC;
699 if (event->vector == 2 || event->vector >= 32)
700 return EINVAL;
701 if (event->vector == 3 || event->vector == 0)
702 return EINVAL;
703 err = svm_event_has_error(event->vector);
704 break;
705 default:
706 return EINVAL;
707 }
708
709 vmcb->ctrl.eventinj =
710 __SHIFTIN(event->vector, VMCB_CTRL_EVENTINJ_VECTOR) |
711 __SHIFTIN(type, VMCB_CTRL_EVENTINJ_TYPE) |
712 __SHIFTIN(err, VMCB_CTRL_EVENTINJ_EV) |
713 __SHIFTIN(1, VMCB_CTRL_EVENTINJ_V) |
714 __SHIFTIN(event->u.error, VMCB_CTRL_EVENTINJ_ERRORCODE);
715
716 cpudata->evt_pending = true;
717
718 return 0;
719 }
720
721 static void
722 svm_inject_ud(struct nvmm_machine *mach, struct nvmm_cpu *vcpu)
723 {
724 struct nvmm_event event;
725 int ret __diagused;
726
727 event.type = NVMM_EVENT_EXCEPTION;
728 event.vector = 6;
729 event.u.error = 0;
730
731 ret = svm_vcpu_inject(mach, vcpu, &event);
732 KASSERT(ret == 0);
733 }
734
735 static void
736 svm_inject_gp(struct nvmm_machine *mach, struct nvmm_cpu *vcpu)
737 {
738 struct nvmm_event event;
739 int ret __diagused;
740
741 event.type = NVMM_EVENT_EXCEPTION;
742 event.vector = 13;
743 event.u.error = 0;
744
745 ret = svm_vcpu_inject(mach, vcpu, &event);
746 KASSERT(ret == 0);
747 }
748
749 static inline void
750 svm_inkernel_advance(struct vmcb *vmcb)
751 {
752 /*
753 * Maybe we should also apply single-stepping and debug exceptions.
754 * Matters for guest-ring3, because it can execute 'cpuid' under a
755 * debugger.
756 */
757 vmcb->state.rip = vmcb->ctrl.nrip;
758 vmcb->ctrl.intr &= ~VMCB_CTRL_INTR_SHADOW;
759 }
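/*
 * nRIP holds the address of the next sequential instruction, filled in by
 * CPUs implementing the NextRIP save feature. Advancing RIP to it completes
 * the emulation of the intercepted instruction; the interrupt shadow is
 * dropped since execution has moved past the shadowed instruction.
 */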
760
761 static void
762 svm_inkernel_handle_cpuid(struct nvmm_cpu *vcpu, uint64_t eax, uint64_t ecx)
763 {
764 struct svm_cpudata *cpudata = vcpu->cpudata;
765 uint64_t cr4;
766
767 switch (eax) {
768 case 0x00000001:
769 cpudata->vmcb->state.rax &= nvmm_cpuid_00000001.eax;
770
771 cpudata->gprs[NVMM_X64_GPR_RBX] &= ~CPUID_LOCAL_APIC_ID;
772 cpudata->gprs[NVMM_X64_GPR_RBX] |= __SHIFTIN(vcpu->cpuid,
773 CPUID_LOCAL_APIC_ID);
774
775 cpudata->gprs[NVMM_X64_GPR_RCX] &= nvmm_cpuid_00000001.ecx;
776 cpudata->gprs[NVMM_X64_GPR_RCX] |= CPUID2_RAZ;
777
778 cpudata->gprs[NVMM_X64_GPR_RDX] &= nvmm_cpuid_00000001.edx;
779
780 /* CPUID2_OSXSAVE depends on CR4. */
781 cr4 = cpudata->vmcb->state.cr4;
782 if (!(cr4 & CR4_OSXSAVE)) {
783 cpudata->gprs[NVMM_X64_GPR_RCX] &= ~CPUID2_OSXSAVE;
784 }
785 break;
786 case 0x00000005:
787 case 0x00000006:
788 cpudata->vmcb->state.rax = 0;
789 cpudata->gprs[NVMM_X64_GPR_RBX] = 0;
790 cpudata->gprs[NVMM_X64_GPR_RCX] = 0;
791 cpudata->gprs[NVMM_X64_GPR_RDX] = 0;
792 break;
793 case 0x00000007:
794 cpudata->vmcb->state.rax &= nvmm_cpuid_00000007.eax;
795 cpudata->gprs[NVMM_X64_GPR_RBX] &= nvmm_cpuid_00000007.ebx;
796 cpudata->gprs[NVMM_X64_GPR_RCX] &= nvmm_cpuid_00000007.ecx;
797 cpudata->gprs[NVMM_X64_GPR_RDX] &= nvmm_cpuid_00000007.edx;
798 break;
799 case 0x0000000D:
800 if (svm_xcr0_mask == 0) {
801 break;
802 }
803 switch (ecx) {
804 case 0:
805 cpudata->vmcb->state.rax = svm_xcr0_mask & 0xFFFFFFFF;
806 if (cpudata->gxcr0 & XCR0_SSE) {
807 cpudata->gprs[NVMM_X64_GPR_RBX] = sizeof(struct fxsave);
808 } else {
809 cpudata->gprs[NVMM_X64_GPR_RBX] = sizeof(struct save87);
810 }
811 cpudata->gprs[NVMM_X64_GPR_RBX] += 64; /* XSAVE header */
812 cpudata->gprs[NVMM_X64_GPR_RCX] = sizeof(struct fxsave) + 64;
813 cpudata->gprs[NVMM_X64_GPR_RDX] = svm_xcr0_mask >> 32;
814 break;
815 case 1:
816 cpudata->vmcb->state.rax &= ~CPUID_PES1_XSAVES;
817 break;
818 }
819 break;
820 case 0x40000000:
821 cpudata->gprs[NVMM_X64_GPR_RBX] = 0;
822 cpudata->gprs[NVMM_X64_GPR_RCX] = 0;
823 cpudata->gprs[NVMM_X64_GPR_RDX] = 0;
824 memcpy(&cpudata->gprs[NVMM_X64_GPR_RBX], "___ ", 4);
825 memcpy(&cpudata->gprs[NVMM_X64_GPR_RCX], "NVMM", 4);
826 memcpy(&cpudata->gprs[NVMM_X64_GPR_RDX], " ___", 4);
827 break;
828 case 0x80000001:
829 cpudata->vmcb->state.rax &= nvmm_cpuid_80000001.eax;
830 cpudata->gprs[NVMM_X64_GPR_RBX] &= nvmm_cpuid_80000001.ebx;
831 cpudata->gprs[NVMM_X64_GPR_RCX] &= nvmm_cpuid_80000001.ecx;
832 cpudata->gprs[NVMM_X64_GPR_RDX] &= nvmm_cpuid_80000001.edx;
833 break;
834 default:
835 break;
836 }
837 }
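/*
 * Leaf 0x40000000 is the conventional hypervisor-vendor leaf: the 12-byte
 * signature is returned in EBX:ECX:EDX, here "___ NVMM ___". The other
 * leaves are masked with the nvmm_cpuid_* tables so that host features the
 * guest cannot use remain hidden.
 */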
838
839 static void
840 svm_exit_cpuid(struct nvmm_machine *mach, struct nvmm_cpu *vcpu,
841 struct nvmm_exit *exit)
842 {
843 struct svm_machdata *machdata = mach->machdata;
844 struct svm_cpudata *cpudata = vcpu->cpudata;
845 struct nvmm_x86_conf_cpuid *cpuid;
846 uint64_t eax, ecx;
847 u_int descs[4];
848 size_t i;
849
850 eax = cpudata->vmcb->state.rax;
851 ecx = cpudata->gprs[NVMM_X64_GPR_RCX];
852 x86_cpuid2(eax, ecx, descs);
853
854 cpudata->vmcb->state.rax = descs[0];
855 cpudata->gprs[NVMM_X64_GPR_RBX] = descs[1];
856 cpudata->gprs[NVMM_X64_GPR_RCX] = descs[2];
857 cpudata->gprs[NVMM_X64_GPR_RDX] = descs[3];
858
859 svm_inkernel_handle_cpuid(vcpu, eax, ecx);
860
861 for (i = 0; i < SVM_NCPUIDS; i++) {
862 cpuid = &machdata->cpuid[i];
863 if (!machdata->cpuidpresent[i]) {
864 continue;
865 }
866 if (cpuid->leaf != eax) {
867 continue;
868 }
869
870 /* del */
871 cpudata->vmcb->state.rax &= ~cpuid->del.eax;
872 cpudata->gprs[NVMM_X64_GPR_RBX] &= ~cpuid->del.ebx;
873 cpudata->gprs[NVMM_X64_GPR_RCX] &= ~cpuid->del.ecx;
874 cpudata->gprs[NVMM_X64_GPR_RDX] &= ~cpuid->del.edx;
875
876 /* set */
877 cpudata->vmcb->state.rax |= cpuid->set.eax;
878 cpudata->gprs[NVMM_X64_GPR_RBX] |= cpuid->set.ebx;
879 cpudata->gprs[NVMM_X64_GPR_RCX] |= cpuid->set.ecx;
880 cpudata->gprs[NVMM_X64_GPR_RDX] |= cpuid->set.edx;
881
882 break;
883 }
884
885 svm_inkernel_advance(cpudata->vmcb);
886 exit->reason = NVMM_EXIT_NONE;
887 }
888
889 static void
890 svm_exit_hlt(struct nvmm_machine *mach, struct nvmm_cpu *vcpu,
891 struct nvmm_exit *exit)
892 {
893 struct svm_cpudata *cpudata = vcpu->cpudata;
894 struct vmcb *vmcb = cpudata->vmcb;
895
896 if (cpudata->int_window_exit && (vmcb->state.rflags & PSL_I)) {
897 svm_event_waitexit_disable(vcpu, false);
898 }
899
900 svm_inkernel_advance(cpudata->vmcb);
901 exit->reason = NVMM_EXIT_HALTED;
902 }
903
904 #define SVM_EXIT_IO_PORT __BITS(31,16)
905 #define SVM_EXIT_IO_SEG __BITS(12,10)
906 #define SVM_EXIT_IO_A64 __BIT(9)
907 #define SVM_EXIT_IO_A32 __BIT(8)
908 #define SVM_EXIT_IO_A16 __BIT(7)
909 #define SVM_EXIT_IO_SZ32 __BIT(6)
910 #define SVM_EXIT_IO_SZ16 __BIT(5)
911 #define SVM_EXIT_IO_SZ8 __BIT(4)
912 #define SVM_EXIT_IO_REP __BIT(3)
913 #define SVM_EXIT_IO_STR __BIT(2)
914 #define SVM_EXIT_IO_IN __BIT(0)
915
916 static void
917 svm_exit_io(struct nvmm_machine *mach, struct nvmm_cpu *vcpu,
918 struct nvmm_exit *exit)
919 {
920 struct svm_cpudata *cpudata = vcpu->cpudata;
921 uint64_t info = cpudata->vmcb->ctrl.exitinfo1;
922 uint64_t nextpc = cpudata->vmcb->ctrl.exitinfo2;
923
924 exit->reason = NVMM_EXIT_IO;
925
926 if (info & SVM_EXIT_IO_IN) {
927 exit->u.io.type = NVMM_EXIT_IO_IN;
928 } else {
929 exit->u.io.type = NVMM_EXIT_IO_OUT;
930 }
931
932 exit->u.io.port = __SHIFTOUT(info, SVM_EXIT_IO_PORT);
933
934 if (svm_decode_assist) {
935 KASSERT(__SHIFTOUT(info, SVM_EXIT_IO_SEG) < 6);
936 exit->u.io.seg = __SHIFTOUT(info, SVM_EXIT_IO_SEG);
937 } else {
938 exit->u.io.seg = -1;
939 }
940
941 if (info & SVM_EXIT_IO_A64) {
942 exit->u.io.address_size = 8;
943 } else if (info & SVM_EXIT_IO_A32) {
944 exit->u.io.address_size = 4;
945 } else if (info & SVM_EXIT_IO_A16) {
946 exit->u.io.address_size = 2;
947 }
948
949 if (info & SVM_EXIT_IO_SZ32) {
950 exit->u.io.operand_size = 4;
951 } else if (info & SVM_EXIT_IO_SZ16) {
952 exit->u.io.operand_size = 2;
953 } else if (info & SVM_EXIT_IO_SZ8) {
954 exit->u.io.operand_size = 1;
955 }
956
957 exit->u.io.rep = (info & SVM_EXIT_IO_REP) != 0;
958 exit->u.io.str = (info & SVM_EXIT_IO_STR) != 0;
959 exit->u.io.npc = nextpc;
960
961 svm_vcpu_state_provide(vcpu,
962 NVMM_X64_STATE_GPRS | NVMM_X64_STATE_SEGS |
963 NVMM_X64_STATE_CRS | NVMM_X64_STATE_MSRS);
964 }
965
966 static const uint64_t msr_ignore_list[] = {
967 0xc0010055, /* MSR_CMPHALT */
968 MSR_DE_CFG,
969 MSR_IC_CFG,
970 MSR_UCODE_AMD_PATCHLEVEL
971 };
972
973 static bool
974 svm_inkernel_handle_msr(struct nvmm_machine *mach, struct nvmm_cpu *vcpu,
975 struct nvmm_exit *exit)
976 {
977 struct svm_cpudata *cpudata = vcpu->cpudata;
978 struct vmcb *vmcb = cpudata->vmcb;
979 uint64_t val;
980 size_t i;
981
982 switch (exit->u.msr.type) {
983 case NVMM_EXIT_MSR_RDMSR:
984 if (exit->u.msr.msr == MSR_NB_CFG) {
985 val = NB_CFG_INITAPICCPUIDLO;
986 vmcb->state.rax = (val & 0xFFFFFFFF);
987 cpudata->gprs[NVMM_X64_GPR_RDX] = (val >> 32);
988 goto handled;
989 }
990 for (i = 0; i < __arraycount(msr_ignore_list); i++) {
991 if (msr_ignore_list[i] != exit->u.msr.msr)
992 continue;
993 val = 0;
994 vmcb->state.rax = (val & 0xFFFFFFFF);
995 cpudata->gprs[NVMM_X64_GPR_RDX] = (val >> 32);
996 goto handled;
997 }
998 break;
999 case NVMM_EXIT_MSR_WRMSR:
1000 if (exit->u.msr.msr == MSR_EFER) {
1001 if (__predict_false(exit->u.msr.val & ~EFER_VALID)) {
1002 goto error;
1003 }
1004 if ((vmcb->state.efer ^ exit->u.msr.val) &
1005 EFER_TLB_FLUSH) {
1006 cpudata->gtlb_want_flush = true;
1007 }
1008 vmcb->state.efer = exit->u.msr.val | EFER_SVME;
1009 svm_vmcb_cache_flush(vmcb, VMCB_CTRL_VMCB_CLEAN_CR);
1010 goto handled;
1011 }
1012 if (exit->u.msr.msr == MSR_TSC) {
1013 cpudata->gtsc = exit->u.msr.val;
1014 cpudata->gtsc_want_update = true;
1015 goto handled;
1016 }
1017 for (i = 0; i < __arraycount(msr_ignore_list); i++) {
1018 if (msr_ignore_list[i] != exit->u.msr.msr)
1019 continue;
1020 goto handled;
1021 }
1022 break;
1023 }
1024
1025 return false;
1026
1027 handled:
1028 svm_inkernel_advance(cpudata->vmcb);
1029 return true;
1030
1031 error:
1032 svm_inject_gp(mach, vcpu);
1033 return true;
1034 }
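/*
 * EFER writes are checked against EFER_VALID and always get EFER_SVME
 * forced back on, since the CPU requires SVME to be set in the guest EFER
 * while running under VMRUN. Toggling any of the paging-related bits
 * (EFER_TLB_FLUSH) additionally schedules a guest TLB flush.
 */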
1035
1036 static void
1037 svm_exit_msr(struct nvmm_machine *mach, struct nvmm_cpu *vcpu,
1038 struct nvmm_exit *exit)
1039 {
1040 struct svm_cpudata *cpudata = vcpu->cpudata;
1041 uint64_t info = cpudata->vmcb->ctrl.exitinfo1;
1042
1043 if (info == 0) {
1044 exit->u.msr.type = NVMM_EXIT_MSR_RDMSR;
1045 } else {
1046 exit->u.msr.type = NVMM_EXIT_MSR_WRMSR;
1047 }
1048
1049 exit->u.msr.msr = (cpudata->gprs[NVMM_X64_GPR_RCX] & 0xFFFFFFFF);
1050
1051 if (info == 1) {
1052 uint64_t rdx, rax;
1053 rdx = cpudata->gprs[NVMM_X64_GPR_RDX];
1054 rax = cpudata->vmcb->state.rax;
1055 exit->u.msr.val = (rdx << 32) | (rax & 0xFFFFFFFF);
1056 } else {
1057 exit->u.msr.val = 0;
1058 }
1059
1060 if (svm_inkernel_handle_msr(mach, vcpu, exit)) {
1061 exit->reason = NVMM_EXIT_NONE;
1062 return;
1063 }
1064
1065 exit->reason = NVMM_EXIT_MSR;
1066 exit->u.msr.npc = cpudata->vmcb->ctrl.nrip;
1067
1068 svm_vcpu_state_provide(vcpu, NVMM_X64_STATE_GPRS);
1069 }
1070
1071 static void
1072 svm_exit_npf(struct nvmm_machine *mach, struct nvmm_cpu *vcpu,
1073 struct nvmm_exit *exit)
1074 {
1075 struct svm_cpudata *cpudata = vcpu->cpudata;
1076 gpaddr_t gpa = cpudata->vmcb->ctrl.exitinfo2;
1077
1078 exit->reason = NVMM_EXIT_MEMORY;
1079 if (cpudata->vmcb->ctrl.exitinfo1 & PGEX_W)
1080 exit->u.mem.prot = PROT_WRITE;
1081 else if (cpudata->vmcb->ctrl.exitinfo1 & PGEX_X)
1082 exit->u.mem.prot = PROT_EXEC;
1083 else
1084 exit->u.mem.prot = PROT_READ;
1085 exit->u.mem.gpa = gpa;
1086 exit->u.mem.inst_len = cpudata->vmcb->ctrl.inst_len;
1087 memcpy(exit->u.mem.inst_bytes, cpudata->vmcb->ctrl.inst_bytes,
1088 sizeof(exit->u.mem.inst_bytes));
1089
1090 svm_vcpu_state_provide(vcpu,
1091 NVMM_X64_STATE_GPRS | NVMM_X64_STATE_SEGS |
1092 NVMM_X64_STATE_CRS | NVMM_X64_STATE_MSRS);
1093 }
1094
1095 static void
1096 svm_exit_insn(struct vmcb *vmcb, struct nvmm_exit *exit, uint64_t reason)
1097 {
1098 exit->u.insn.npc = vmcb->ctrl.nrip;
1099 exit->reason = reason;
1100 }
1101
1102 static void
1103 svm_exit_xsetbv(struct nvmm_machine *mach, struct nvmm_cpu *vcpu,
1104 struct nvmm_exit *exit)
1105 {
1106 struct svm_cpudata *cpudata = vcpu->cpudata;
1107 struct vmcb *vmcb = cpudata->vmcb;
1108 uint64_t val;
1109
1110 exit->reason = NVMM_EXIT_NONE;
1111
1112 val = (cpudata->gprs[NVMM_X64_GPR_RDX] << 32) |
1113 (vmcb->state.rax & 0xFFFFFFFF);
1114
1115 if (__predict_false(cpudata->gprs[NVMM_X64_GPR_RCX] != 0)) {
1116 goto error;
1117 } else if (__predict_false(vmcb->state.cpl != 0)) {
1118 goto error;
1119 } else if (__predict_false((val & ~svm_xcr0_mask) != 0)) {
1120 goto error;
1121 } else if (__predict_false((val & XCR0_X87) == 0)) {
1122 goto error;
1123 }
1124
1125 cpudata->gxcr0 = val;
1126
1127 svm_inkernel_advance(cpudata->vmcb);
1128 return;
1129
1130 error:
1131 svm_inject_gp(mach, vcpu);
1132 }
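/*
 * The checks above mirror the architectural #GP(0) conditions for XSETBV:
 * a non-zero ECX, CPL > 0, bits outside the supported XCR0 mask, or an
 * attempt to clear XCR0.X87.
 */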
1133
1134 static void
1135 svm_exit_invalid(struct nvmm_exit *exit, uint64_t code)
1136 {
1137 exit->u.inv.hwcode = code;
1138 exit->reason = NVMM_EXIT_INVALID;
1139 }
1140
1141 /* -------------------------------------------------------------------------- */
1142
1143 static void
1144 svm_vcpu_guest_fpu_enter(struct nvmm_cpu *vcpu)
1145 {
1146 struct svm_cpudata *cpudata = vcpu->cpudata;
1147
1148 cpudata->ts_set = (rcr0() & CR0_TS) != 0;
1149
1150 fpu_area_save(&cpudata->hfpu, svm_xcr0_mask);
1151 fpu_area_restore(&cpudata->gfpu, svm_xcr0_mask);
1152
1153 if (svm_xcr0_mask != 0) {
1154 cpudata->hxcr0 = rdxcr(0);
1155 wrxcr(0, cpudata->gxcr0);
1156 }
1157 }
1158
1159 static void
1160 svm_vcpu_guest_fpu_leave(struct nvmm_cpu *vcpu)
1161 {
1162 struct svm_cpudata *cpudata = vcpu->cpudata;
1163
1164 if (svm_xcr0_mask != 0) {
1165 cpudata->gxcr0 = rdxcr(0);
1166 wrxcr(0, cpudata->hxcr0);
1167 }
1168
1169 fpu_area_save(&cpudata->gfpu, svm_xcr0_mask);
1170 fpu_area_restore(&cpudata->hfpu, svm_xcr0_mask);
1171
1172 if (cpudata->ts_set) {
1173 stts();
1174 }
1175 }
1176
1177 static void
1178 svm_vcpu_guest_dbregs_enter(struct nvmm_cpu *vcpu)
1179 {
1180 struct svm_cpudata *cpudata = vcpu->cpudata;
1181
1182 x86_dbregs_save(curlwp);
1183
1184 ldr7(0);
1185
1186 ldr0(cpudata->drs[NVMM_X64_DR_DR0]);
1187 ldr1(cpudata->drs[NVMM_X64_DR_DR1]);
1188 ldr2(cpudata->drs[NVMM_X64_DR_DR2]);
1189 ldr3(cpudata->drs[NVMM_X64_DR_DR3]);
1190 }
1191
1192 static void
1193 svm_vcpu_guest_dbregs_leave(struct nvmm_cpu *vcpu)
1194 {
1195 struct svm_cpudata *cpudata = vcpu->cpudata;
1196
1197 cpudata->drs[NVMM_X64_DR_DR0] = rdr0();
1198 cpudata->drs[NVMM_X64_DR_DR1] = rdr1();
1199 cpudata->drs[NVMM_X64_DR_DR2] = rdr2();
1200 cpudata->drs[NVMM_X64_DR_DR3] = rdr3();
1201
1202 x86_dbregs_restore(curlwp);
1203 }
1204
1205 static void
1206 svm_vcpu_guest_misc_enter(struct nvmm_cpu *vcpu)
1207 {
1208 struct svm_cpudata *cpudata = vcpu->cpudata;
1209
1210 cpudata->fsbase = rdmsr(MSR_FSBASE);
1211 cpudata->kernelgsbase = rdmsr(MSR_KERNELGSBASE);
1212 }
1213
1214 static void
1215 svm_vcpu_guest_misc_leave(struct nvmm_cpu *vcpu)
1216 {
1217 struct svm_cpudata *cpudata = vcpu->cpudata;
1218
1219 wrmsr(MSR_STAR, cpudata->star);
1220 wrmsr(MSR_LSTAR, cpudata->lstar);
1221 wrmsr(MSR_CSTAR, cpudata->cstar);
1222 wrmsr(MSR_SFMASK, cpudata->sfmask);
1223 wrmsr(MSR_FSBASE, cpudata->fsbase);
1224 wrmsr(MSR_KERNELGSBASE, cpudata->kernelgsbase);
1225 }
1226
1227 /* -------------------------------------------------------------------------- */
1228
1229 static inline void
1230 svm_gtlb_catchup(struct nvmm_cpu *vcpu, int hcpu)
1231 {
1232 struct svm_cpudata *cpudata = vcpu->cpudata;
1233
1234 if (vcpu->hcpu_last != hcpu || cpudata->shared_asid) {
1235 cpudata->gtlb_want_flush = true;
1236 }
1237 }
1238
1239 static inline void
1240 svm_htlb_catchup(struct nvmm_cpu *vcpu, int hcpu)
1241 {
1242 /*
1243 * Nothing to do. If an hTLB flush was needed, either the VCPU was
1244 * executing on this hCPU and the hTLB already got flushed, or it
1245 * was executing on another hCPU in which case the catchup is done
1246 * in svm_gtlb_catchup().
1247 */
1248 }
1249
1250 static inline uint64_t
1251 svm_htlb_flush(struct svm_machdata *machdata, struct svm_cpudata *cpudata)
1252 {
1253 struct vmcb *vmcb = cpudata->vmcb;
1254 uint64_t machgen;
1255
1256 machgen = machdata->mach_htlb_gen;
1257 if (__predict_true(machgen == cpudata->vcpu_htlb_gen)) {
1258 return machgen;
1259 }
1260
1261 vmcb->ctrl.tlb_ctrl = svm_ctrl_tlb_flush;
1262 return machgen;
1263 }
1264
1265 static inline void
1266 svm_htlb_flush_ack(struct svm_cpudata *cpudata, uint64_t machgen)
1267 {
1268 struct vmcb *vmcb = cpudata->vmcb;
1269
1270 if (__predict_true(vmcb->ctrl.exitcode != VMCB_EXITCODE_INVALID)) {
1271 cpudata->vcpu_htlb_gen = machgen;
1272 }
1273 }
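/*
 * mach_htlb_gen is presumably bumped (outside this excerpt) whenever the
 * host changes the guest physical mappings; each VCPU lags behind with
 * vcpu_htlb_gen. When the two differ, a full flush is requested for the
 * next VMRUN, and the acknowledgement above records the new generation
 * only if the run actually entered the guest (the exit code is valid).
 */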
1274
1275 static inline void
1276 svm_exit_evt(struct svm_cpudata *cpudata, struct vmcb *vmcb)
1277 {
1278 cpudata->evt_pending = false;
1279
1280 if (__predict_false(vmcb->ctrl.exitintinfo & VMCB_CTRL_EXITINTINFO_V)) {
1281 vmcb->ctrl.eventinj = vmcb->ctrl.exitintinfo;
1282 cpudata->evt_pending = true;
1283 }
1284 }
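/*
 * EXITINTINFO describes an event whose delivery was interrupted by the
 * #VMEXIT; re-queueing it through EVENTINJ ensures the guest does not
 * lose it.
 */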
1285
1286 static int
1287 svm_vcpu_run(struct nvmm_machine *mach, struct nvmm_cpu *vcpu,
1288 struct nvmm_exit *exit)
1289 {
1290 struct nvmm_comm_page *comm = vcpu->comm;
1291 struct svm_machdata *machdata = mach->machdata;
1292 struct svm_cpudata *cpudata = vcpu->cpudata;
1293 struct vmcb *vmcb = cpudata->vmcb;
1294 uint64_t machgen;
1295 int hcpu, s;
1296
1297 svm_vcpu_state_commit(vcpu);
1298 comm->state_cached = 0;
1299
1300 kpreempt_disable();
1301 hcpu = cpu_number();
1302
1303 svm_gtlb_catchup(vcpu, hcpu);
1304 svm_htlb_catchup(vcpu, hcpu);
1305
1306 if (vcpu->hcpu_last != hcpu) {
1307 svm_vmcb_cache_flush_all(vmcb);
1308 cpudata->gtsc_want_update = true;
1309 }
1310
1311 svm_vcpu_guest_dbregs_enter(vcpu);
1312 svm_vcpu_guest_misc_enter(vcpu);
1313
1314 while (1) {
1315 if (cpudata->gtlb_want_flush) {
1316 vmcb->ctrl.tlb_ctrl = svm_ctrl_tlb_flush;
1317 } else {
1318 vmcb->ctrl.tlb_ctrl = 0;
1319 }
1320
1321 if (__predict_false(cpudata->gtsc_want_update)) {
1322 vmcb->ctrl.tsc_offset = cpudata->gtsc - rdtsc();
1323 svm_vmcb_cache_flush(vmcb, VMCB_CTRL_VMCB_CLEAN_I);
1324 }
1325
1326 s = splhigh();
1327 machgen = svm_htlb_flush(machdata, cpudata);
1328 svm_vcpu_guest_fpu_enter(vcpu);
1329 svm_vmrun(cpudata->vmcb_pa, cpudata->gprs);
1330 svm_vcpu_guest_fpu_leave(vcpu);
1331 svm_htlb_flush_ack(cpudata, machgen);
1332 splx(s);
1333
1334 svm_vmcb_cache_default(vmcb);
1335
1336 if (vmcb->ctrl.exitcode != VMCB_EXITCODE_INVALID) {
1337 cpudata->gtlb_want_flush = false;
1338 cpudata->gtsc_want_update = false;
1339 vcpu->hcpu_last = hcpu;
1340 }
1341 svm_exit_evt(cpudata, vmcb);
1342
1343 switch (vmcb->ctrl.exitcode) {
1344 case VMCB_EXITCODE_INTR:
1345 case VMCB_EXITCODE_NMI:
1346 exit->reason = NVMM_EXIT_NONE;
1347 break;
1348 case VMCB_EXITCODE_VINTR:
1349 svm_event_waitexit_disable(vcpu, false);
1350 exit->reason = NVMM_EXIT_INT_READY;
1351 break;
1352 case VMCB_EXITCODE_IRET:
1353 svm_event_waitexit_disable(vcpu, true);
1354 exit->reason = NVMM_EXIT_NMI_READY;
1355 break;
1356 case VMCB_EXITCODE_CPUID:
1357 svm_exit_cpuid(mach, vcpu, exit);
1358 break;
1359 case VMCB_EXITCODE_HLT:
1360 svm_exit_hlt(mach, vcpu, exit);
1361 break;
1362 case VMCB_EXITCODE_IOIO:
1363 svm_exit_io(mach, vcpu, exit);
1364 break;
1365 case VMCB_EXITCODE_MSR:
1366 svm_exit_msr(mach, vcpu, exit);
1367 break;
1368 case VMCB_EXITCODE_SHUTDOWN:
1369 exit->reason = NVMM_EXIT_SHUTDOWN;
1370 break;
1371 case VMCB_EXITCODE_RDPMC:
1372 case VMCB_EXITCODE_RSM:
1373 case VMCB_EXITCODE_INVLPGA:
1374 case VMCB_EXITCODE_VMRUN:
1375 case VMCB_EXITCODE_VMMCALL:
1376 case VMCB_EXITCODE_VMLOAD:
1377 case VMCB_EXITCODE_VMSAVE:
1378 case VMCB_EXITCODE_STGI:
1379 case VMCB_EXITCODE_CLGI:
1380 case VMCB_EXITCODE_SKINIT:
1381 case VMCB_EXITCODE_RDTSCP:
1382 svm_inject_ud(mach, vcpu);
1383 exit->reason = NVMM_EXIT_NONE;
1384 break;
1385 case VMCB_EXITCODE_MONITOR:
1386 svm_exit_insn(vmcb, exit, NVMM_EXIT_MONITOR);
1387 break;
1388 case VMCB_EXITCODE_MWAIT:
1389 svm_exit_insn(vmcb, exit, NVMM_EXIT_MWAIT);
1390 break;
1391 case VMCB_EXITCODE_MWAIT_CONDITIONAL:
1392 svm_exit_insn(vmcb, exit, NVMM_EXIT_MWAIT_COND);
1393 break;
1394 case VMCB_EXITCODE_XSETBV:
1395 svm_exit_xsetbv(mach, vcpu, exit);
1396 break;
1397 case VMCB_EXITCODE_NPF:
1398 svm_exit_npf(mach, vcpu, exit);
1399 break;
1400 case VMCB_EXITCODE_FERR_FREEZE: /* ? */
1401 default:
1402 svm_exit_invalid(exit, vmcb->ctrl.exitcode);
1403 break;
1404 }
1405
1406 /* If no reason to return to userland, keep rolling. */
1407 if (curcpu()->ci_schedstate.spc_flags & SPCF_SHOULDYIELD) {
1408 break;
1409 }
1410 if (curcpu()->ci_data.cpu_softints != 0) {
1411 break;
1412 }
1413 if (curlwp->l_flag & LW_USERRET) {
1414 break;
1415 }
1416 if (exit->reason != NVMM_EXIT_NONE) {
1417 break;
1418 }
1419 }
1420
1421 cpudata->gtsc = rdtsc() + vmcb->ctrl.tsc_offset;
1422
1423 svm_vcpu_guest_misc_leave(vcpu);
1424 svm_vcpu_guest_dbregs_leave(vcpu);
1425
1426 kpreempt_enable();
1427
1428 exit->exitstate[NVMM_X64_EXITSTATE_CR8] = __SHIFTOUT(vmcb->ctrl.v,
1429 VMCB_CTRL_V_TPR);
1430 exit->exitstate[NVMM_X64_EXITSTATE_RFLAGS] = vmcb->state.rflags;
1431
1432 exit->exitstate[NVMM_X64_EXITSTATE_INT_SHADOW] =
1433 ((vmcb->ctrl.intr & VMCB_CTRL_INTR_SHADOW) != 0);
1434 exit->exitstate[NVMM_X64_EXITSTATE_INT_WINDOW_EXIT] =
1435 cpudata->int_window_exit;
1436 exit->exitstate[NVMM_X64_EXITSTATE_NMI_WINDOW_EXIT] =
1437 cpudata->nmi_window_exit;
1438 exit->exitstate[NVMM_X64_EXITSTATE_EVT_PENDING] =
1439 cpudata->evt_pending;
1440
1441 return 0;
1442 }
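/*
 * The loop above keeps re-entering the guest as long as the exit was
 * handled in the kernel and nothing on the host side needs attention
 * (preemption request, pending soft interrupts, or an AST on the current
 * lwp), avoiding one round trip to userland per exit.
 */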
1443
1444 /* -------------------------------------------------------------------------- */
1445
1446 static int
1447 svm_memalloc(paddr_t *pa, vaddr_t *va, size_t npages)
1448 {
1449 struct pglist pglist;
1450 paddr_t _pa;
1451 vaddr_t _va;
1452 size_t i;
1453 int ret;
1454
1455 ret = uvm_pglistalloc(npages * PAGE_SIZE, 0, ~0UL, PAGE_SIZE, 0,
1456 &pglist, 1, 0);
1457 if (ret != 0)
1458 return ENOMEM;
1459 _pa = TAILQ_FIRST(&pglist)->phys_addr;
1460 _va = uvm_km_alloc(kernel_map, npages * PAGE_SIZE, 0,
1461 UVM_KMF_VAONLY | UVM_KMF_NOWAIT);
1462 if (_va == 0)
1463 goto error;
1464
1465 for (i = 0; i < npages; i++) {
1466 pmap_kenter_pa(_va + i * PAGE_SIZE, _pa + i * PAGE_SIZE,
1467 VM_PROT_READ | VM_PROT_WRITE, PMAP_WRITE_BACK);
1468 }
1469 pmap_update(pmap_kernel());
1470
1471 memset((void *)_va, 0, npages * PAGE_SIZE);
1472
1473 *pa = _pa;
1474 *va = _va;
1475 return 0;
1476
1477 error:
1478 for (i = 0; i < npages; i++) {
1479 uvm_pagefree(PHYS_TO_VM_PAGE(_pa + i * PAGE_SIZE));
1480 }
1481 return ENOMEM;
1482 }
1483
1484 static void
1485 svm_memfree(paddr_t pa, vaddr_t va, size_t npages)
1486 {
1487 size_t i;
1488
1489 pmap_kremove(va, npages * PAGE_SIZE);
1490 pmap_update(pmap_kernel());
1491 uvm_km_free(kernel_map, va, npages * PAGE_SIZE, UVM_KMF_VAONLY);
1492 for (i = 0; i < npages; i++) {
1493 uvm_pagefree(PHYS_TO_VM_PAGE(pa + i * PAGE_SIZE));
1494 }
1495 }
1496
1497 /* -------------------------------------------------------------------------- */
1498
1499 #define SVM_MSRBM_READ __BIT(0)
1500 #define SVM_MSRBM_WRITE __BIT(1)
1501
1502 static void
1503 svm_vcpu_msr_allow(uint8_t *bitmap, uint64_t msr, bool read, bool write)
1504 {
1505 uint64_t byte;
1506 uint8_t bitoff;
1507
1508 if (msr < 0x00002000) {
1509 /* Range 1 */
1510 byte = ((msr - 0x00000000) >> 2UL) + 0x0000;
1511 } else if (msr >= 0xC0000000 && msr < 0xC0002000) {
1512 /* Range 2 */
1513 byte = ((msr - 0xC0000000) >> 2UL) + 0x0800;
1514 } else if (msr >= 0xC0010000 && msr < 0xC0012000) {
1515 /* Range 3 */
1516 byte = ((msr - 0xC0010000) >> 2UL) + 0x1000;
1517 } else {
1518 panic("%s: wrong range", __func__);
1519 }
1520
1521 bitoff = (msr & 0x3) << 1;
1522
1523 if (read) {
1524 bitmap[byte] &= ~(SVM_MSRBM_READ << bitoff);
1525 }
1526 if (write) {
1527 bitmap[byte] &= ~(SVM_MSRBM_WRITE << bitoff);
1528 }
1529 }
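/*
 * The MSR permission map uses two bits per MSR (read and write intercept)
 * and is split into three 2KB regions covering the architecturally defined
 * ranges 0x00000000-0x00001FFF, 0xC0000000-0xC0001FFF and
 * 0xC0010000-0xC0011FFF. A cleared bit means the access is not
 * intercepted, which is what "allowing" an MSR means here.
 */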
1530
1531 #define SVM_SEG_ATTRIB_TYPE __BITS(3,0)
1532 #define SVM_SEG_ATTRIB_S __BIT(4)
1533 #define SVM_SEG_ATTRIB_DPL __BITS(6,5)
1534 #define SVM_SEG_ATTRIB_P __BIT(7)
1535 #define SVM_SEG_ATTRIB_AVL __BIT(8)
1536 #define SVM_SEG_ATTRIB_L __BIT(9)
1537 #define SVM_SEG_ATTRIB_DEF __BIT(10)
1538 #define SVM_SEG_ATTRIB_G __BIT(11)
1539
1540 static void
1541 svm_vcpu_setstate_seg(const struct nvmm_x64_state_seg *seg,
1542 struct vmcb_segment *vseg)
1543 {
1544 vseg->selector = seg->selector;
1545 vseg->attrib =
1546 __SHIFTIN(seg->attrib.type, SVM_SEG_ATTRIB_TYPE) |
1547 __SHIFTIN(seg->attrib.s, SVM_SEG_ATTRIB_S) |
1548 __SHIFTIN(seg->attrib.dpl, SVM_SEG_ATTRIB_DPL) |
1549 __SHIFTIN(seg->attrib.p, SVM_SEG_ATTRIB_P) |
1550 __SHIFTIN(seg->attrib.avl, SVM_SEG_ATTRIB_AVL) |
1551 __SHIFTIN(seg->attrib.l, SVM_SEG_ATTRIB_L) |
1552 __SHIFTIN(seg->attrib.def, SVM_SEG_ATTRIB_DEF) |
1553 __SHIFTIN(seg->attrib.g, SVM_SEG_ATTRIB_G);
1554 vseg->limit = seg->limit;
1555 vseg->base = seg->base;
1556 }
1557
1558 static void
1559 svm_vcpu_getstate_seg(struct nvmm_x64_state_seg *seg, struct vmcb_segment *vseg)
1560 {
1561 seg->selector = vseg->selector;
1562 seg->attrib.type = __SHIFTOUT(vseg->attrib, SVM_SEG_ATTRIB_TYPE);
1563 seg->attrib.s = __SHIFTOUT(vseg->attrib, SVM_SEG_ATTRIB_S);
1564 seg->attrib.dpl = __SHIFTOUT(vseg->attrib, SVM_SEG_ATTRIB_DPL);
1565 seg->attrib.p = __SHIFTOUT(vseg->attrib, SVM_SEG_ATTRIB_P);
1566 seg->attrib.avl = __SHIFTOUT(vseg->attrib, SVM_SEG_ATTRIB_AVL);
1567 seg->attrib.l = __SHIFTOUT(vseg->attrib, SVM_SEG_ATTRIB_L);
1568 seg->attrib.def = __SHIFTOUT(vseg->attrib, SVM_SEG_ATTRIB_DEF);
1569 seg->attrib.g = __SHIFTOUT(vseg->attrib, SVM_SEG_ATTRIB_G);
1570 seg->limit = vseg->limit;
1571 seg->base = vseg->base;
1572 }
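/*
 * The VMCB stores segment attributes in a compressed 12-bit form
 * (essentially descriptor byte 5 plus the flags nibble of byte 6), which
 * is why the fields are shifted in and out individually rather than
 * copied wholesale from the nvmm attrib layout.
 */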
1573
1574 static inline bool
1575 svm_state_tlb_flush(const struct vmcb *vmcb, const struct nvmm_x64_state *state,
1576 uint64_t flags)
1577 {
1578 if (flags & NVMM_X64_STATE_CRS) {
1579 if ((vmcb->state.cr0 ^
1580 state->crs[NVMM_X64_CR_CR0]) & CR0_TLB_FLUSH) {
1581 return true;
1582 }
1583 if (vmcb->state.cr3 != state->crs[NVMM_X64_CR_CR3]) {
1584 return true;
1585 }
1586 if ((vmcb->state.cr4 ^
1587 state->crs[NVMM_X64_CR_CR4]) & CR4_TLB_FLUSH) {
1588 return true;
1589 }
1590 }
1591
1592 if (flags & NVMM_X64_STATE_MSRS) {
1593 if ((vmcb->state.efer ^
1594 state->msrs[NVMM_X64_MSR_EFER]) & EFER_TLB_FLUSH) {
1595 return true;
1596 }
1597 }
1598
1599 return false;
1600 }
1601
1602 static void
1603 svm_vcpu_setstate(struct nvmm_cpu *vcpu)
1604 {
1605 struct nvmm_comm_page *comm = vcpu->comm;
1606 const struct nvmm_x64_state *state = &comm->state;
1607 struct svm_cpudata *cpudata = vcpu->cpudata;
1608 struct vmcb *vmcb = cpudata->vmcb;
1609 struct fxsave *fpustate;
1610 uint64_t flags;
1611
1612 flags = comm->state_wanted;
1613
1614 if (svm_state_tlb_flush(vmcb, state, flags)) {
1615 cpudata->gtlb_want_flush = true;
1616 }
1617
1618 if (flags & NVMM_X64_STATE_SEGS) {
1619 svm_vcpu_setstate_seg(&state->segs[NVMM_X64_SEG_CS],
1620 &vmcb->state.cs);
1621 svm_vcpu_setstate_seg(&state->segs[NVMM_X64_SEG_DS],
1622 &vmcb->state.ds);
1623 svm_vcpu_setstate_seg(&state->segs[NVMM_X64_SEG_ES],
1624 &vmcb->state.es);
1625 svm_vcpu_setstate_seg(&state->segs[NVMM_X64_SEG_FS],
1626 &vmcb->state.fs);
1627 svm_vcpu_setstate_seg(&state->segs[NVMM_X64_SEG_GS],
1628 &vmcb->state.gs);
1629 svm_vcpu_setstate_seg(&state->segs[NVMM_X64_SEG_SS],
1630 &vmcb->state.ss);
1631 svm_vcpu_setstate_seg(&state->segs[NVMM_X64_SEG_GDT],
1632 &vmcb->state.gdt);
1633 svm_vcpu_setstate_seg(&state->segs[NVMM_X64_SEG_IDT],
1634 &vmcb->state.idt);
1635 svm_vcpu_setstate_seg(&state->segs[NVMM_X64_SEG_LDT],
1636 &vmcb->state.ldt);
1637 svm_vcpu_setstate_seg(&state->segs[NVMM_X64_SEG_TR],
1638 &vmcb->state.tr);
1639
1640 vmcb->state.cpl = state->segs[NVMM_X64_SEG_SS].attrib.dpl;
1641 }
1642
1643 CTASSERT(sizeof(cpudata->gprs) == sizeof(state->gprs));
1644 if (flags & NVMM_X64_STATE_GPRS) {
1645 memcpy(cpudata->gprs, state->gprs, sizeof(state->gprs));
1646
1647 vmcb->state.rip = state->gprs[NVMM_X64_GPR_RIP];
1648 vmcb->state.rsp = state->gprs[NVMM_X64_GPR_RSP];
1649 vmcb->state.rax = state->gprs[NVMM_X64_GPR_RAX];
1650 vmcb->state.rflags = state->gprs[NVMM_X64_GPR_RFLAGS];
1651 }
1652
1653 if (flags & NVMM_X64_STATE_CRS) {
1654 vmcb->state.cr0 = state->crs[NVMM_X64_CR_CR0];
1655 vmcb->state.cr2 = state->crs[NVMM_X64_CR_CR2];
1656 vmcb->state.cr3 = state->crs[NVMM_X64_CR_CR3];
1657 vmcb->state.cr4 = state->crs[NVMM_X64_CR_CR4];
1658
1659 vmcb->ctrl.v &= ~VMCB_CTRL_V_TPR;
1660 vmcb->ctrl.v |= __SHIFTIN(state->crs[NVMM_X64_CR_CR8],
1661 VMCB_CTRL_V_TPR);
1662
1663 if (svm_xcr0_mask != 0) {
1664 /* Clear illegal XCR0 bits, set mandatory X87 bit. */
1665 cpudata->gxcr0 = state->crs[NVMM_X64_CR_XCR0];
1666 cpudata->gxcr0 &= svm_xcr0_mask;
1667 cpudata->gxcr0 |= XCR0_X87;
1668 }
1669 }
1670
1671 CTASSERT(sizeof(cpudata->drs) == sizeof(state->drs));
1672 if (flags & NVMM_X64_STATE_DRS) {
1673 memcpy(cpudata->drs, state->drs, sizeof(state->drs));
1674
1675 vmcb->state.dr6 = state->drs[NVMM_X64_DR_DR6];
1676 vmcb->state.dr7 = state->drs[NVMM_X64_DR_DR7];
1677 }
1678
1679 if (flags & NVMM_X64_STATE_MSRS) {
1680 /*
1681 * EFER_SVME is mandatory.
1682 */
1683 vmcb->state.efer = state->msrs[NVMM_X64_MSR_EFER] | EFER_SVME;
1684 vmcb->state.star = state->msrs[NVMM_X64_MSR_STAR];
1685 vmcb->state.lstar = state->msrs[NVMM_X64_MSR_LSTAR];
1686 vmcb->state.cstar = state->msrs[NVMM_X64_MSR_CSTAR];
1687 vmcb->state.sfmask = state->msrs[NVMM_X64_MSR_SFMASK];
1688 vmcb->state.kernelgsbase =
1689 state->msrs[NVMM_X64_MSR_KERNELGSBASE];
1690 vmcb->state.sysenter_cs =
1691 state->msrs[NVMM_X64_MSR_SYSENTER_CS];
1692 vmcb->state.sysenter_esp =
1693 state->msrs[NVMM_X64_MSR_SYSENTER_ESP];
1694 vmcb->state.sysenter_eip =
1695 state->msrs[NVMM_X64_MSR_SYSENTER_EIP];
1696 vmcb->state.g_pat = state->msrs[NVMM_X64_MSR_PAT];
1697
1698 cpudata->gtsc = state->msrs[NVMM_X64_MSR_TSC];
1699 cpudata->gtsc_want_update = true;
1700 }
1701
1702 if (flags & NVMM_X64_STATE_INTR) {
1703 if (state->intr.int_shadow) {
1704 vmcb->ctrl.intr |= VMCB_CTRL_INTR_SHADOW;
1705 } else {
1706 vmcb->ctrl.intr &= ~VMCB_CTRL_INTR_SHADOW;
1707 }
1708
1709 if (state->intr.int_window_exiting) {
1710 svm_event_waitexit_enable(vcpu, false);
1711 } else {
1712 svm_event_waitexit_disable(vcpu, false);
1713 }
1714
1715 if (state->intr.nmi_window_exiting) {
1716 svm_event_waitexit_enable(vcpu, true);
1717 } else {
1718 svm_event_waitexit_disable(vcpu, true);
1719 }
1720 }
1721
1722 CTASSERT(sizeof(cpudata->gfpu.xsh_fxsave) == sizeof(state->fpu));
1723 if (flags & NVMM_X64_STATE_FPU) {
1724 memcpy(cpudata->gfpu.xsh_fxsave, &state->fpu,
1725 sizeof(state->fpu));
1726
1727 fpustate = (struct fxsave *)cpudata->gfpu.xsh_fxsave;
1728 fpustate->fx_mxcsr_mask &= x86_fpu_mxcsr_mask;
1729 fpustate->fx_mxcsr &= fpustate->fx_mxcsr_mask;
1730
1731 if (svm_xcr0_mask != 0) {
1732 /* Reset XSTATE_BV, to force a reload. */
1733 cpudata->gfpu.xsh_xstate_bv = svm_xcr0_mask;
1734 }
1735 }
1736
1737 svm_vmcb_cache_update(vmcb, flags);
1738
1739 comm->state_wanted = 0;
1740 comm->state_cached |= flags;
1741 }
1742
1743 static void
1744 svm_vcpu_getstate(struct nvmm_cpu *vcpu)
1745 {
1746 struct nvmm_comm_page *comm = vcpu->comm;
1747 struct nvmm_x64_state *state = &comm->state;
1748 struct svm_cpudata *cpudata = vcpu->cpudata;
1749 struct vmcb *vmcb = cpudata->vmcb;
1750 uint64_t flags;
1751
1752 flags = comm->state_wanted;
1753
1754 if (flags & NVMM_X64_STATE_SEGS) {
1755 svm_vcpu_getstate_seg(&state->segs[NVMM_X64_SEG_CS],
1756 &vmcb->state.cs);
1757 svm_vcpu_getstate_seg(&state->segs[NVMM_X64_SEG_DS],
1758 &vmcb->state.ds);
1759 svm_vcpu_getstate_seg(&state->segs[NVMM_X64_SEG_ES],
1760 &vmcb->state.es);
1761 svm_vcpu_getstate_seg(&state->segs[NVMM_X64_SEG_FS],
1762 &vmcb->state.fs);
1763 svm_vcpu_getstate_seg(&state->segs[NVMM_X64_SEG_GS],
1764 &vmcb->state.gs);
1765 svm_vcpu_getstate_seg(&state->segs[NVMM_X64_SEG_SS],
1766 &vmcb->state.ss);
1767 svm_vcpu_getstate_seg(&state->segs[NVMM_X64_SEG_GDT],
1768 &vmcb->state.gdt);
1769 svm_vcpu_getstate_seg(&state->segs[NVMM_X64_SEG_IDT],
1770 &vmcb->state.idt);
1771 svm_vcpu_getstate_seg(&state->segs[NVMM_X64_SEG_LDT],
1772 &vmcb->state.ldt);
1773 svm_vcpu_getstate_seg(&state->segs[NVMM_X64_SEG_TR],
1774 &vmcb->state.tr);
1775
1776 state->segs[NVMM_X64_SEG_SS].attrib.dpl = vmcb->state.cpl;
1777 }
1778
1779 CTASSERT(sizeof(cpudata->gprs) == sizeof(state->gprs));
1780 if (flags & NVMM_X64_STATE_GPRS) {
1781 memcpy(state->gprs, cpudata->gprs, sizeof(state->gprs));
1782
1783 state->gprs[NVMM_X64_GPR_RIP] = vmcb->state.rip;
1784 state->gprs[NVMM_X64_GPR_RSP] = vmcb->state.rsp;
1785 state->gprs[NVMM_X64_GPR_RAX] = vmcb->state.rax;
1786 state->gprs[NVMM_X64_GPR_RFLAGS] = vmcb->state.rflags;
1787 }
1788
1789 if (flags & NVMM_X64_STATE_CRS) {
1790 state->crs[NVMM_X64_CR_CR0] = vmcb->state.cr0;
1791 state->crs[NVMM_X64_CR_CR2] = vmcb->state.cr2;
1792 state->crs[NVMM_X64_CR_CR3] = vmcb->state.cr3;
1793 state->crs[NVMM_X64_CR_CR4] = vmcb->state.cr4;
1794 state->crs[NVMM_X64_CR_CR8] = __SHIFTOUT(vmcb->ctrl.v,
1795 VMCB_CTRL_V_TPR);
1796 state->crs[NVMM_X64_CR_XCR0] = cpudata->gxcr0;
1797 }
1798
1799 CTASSERT(sizeof(cpudata->drs) == sizeof(state->drs));
1800 if (flags & NVMM_X64_STATE_DRS) {
1801 memcpy(state->drs, cpudata->drs, sizeof(state->drs));
1802
1803 state->drs[NVMM_X64_DR_DR6] = vmcb->state.dr6;
1804 state->drs[NVMM_X64_DR_DR7] = vmcb->state.dr7;
1805 }
1806
1807 if (flags & NVMM_X64_STATE_MSRS) {
1808 state->msrs[NVMM_X64_MSR_EFER] = vmcb->state.efer;
1809 state->msrs[NVMM_X64_MSR_STAR] = vmcb->state.star;
1810 state->msrs[NVMM_X64_MSR_LSTAR] = vmcb->state.lstar;
1811 state->msrs[NVMM_X64_MSR_CSTAR] = vmcb->state.cstar;
1812 state->msrs[NVMM_X64_MSR_SFMASK] = vmcb->state.sfmask;
1813 state->msrs[NVMM_X64_MSR_KERNELGSBASE] =
1814 vmcb->state.kernelgsbase;
1815 state->msrs[NVMM_X64_MSR_SYSENTER_CS] =
1816 vmcb->state.sysenter_cs;
1817 state->msrs[NVMM_X64_MSR_SYSENTER_ESP] =
1818 vmcb->state.sysenter_esp;
1819 state->msrs[NVMM_X64_MSR_SYSENTER_EIP] =
1820 vmcb->state.sysenter_eip;
1821 state->msrs[NVMM_X64_MSR_PAT] = vmcb->state.g_pat;
1822 state->msrs[NVMM_X64_MSR_TSC] = cpudata->gtsc;
1823
1824 /* Hide SVME. */
1825 state->msrs[NVMM_X64_MSR_EFER] &= ~EFER_SVME;
1826 }
1827
1828 if (flags & NVMM_X64_STATE_INTR) {
1829 state->intr.int_shadow =
1830 (vmcb->ctrl.intr & VMCB_CTRL_INTR_SHADOW) != 0;
1831 state->intr.int_window_exiting = cpudata->int_window_exit;
1832 state->intr.nmi_window_exiting = cpudata->nmi_window_exit;
1833 state->intr.evt_pending = cpudata->evt_pending;
1834 }
1835
1836 CTASSERT(sizeof(cpudata->gfpu.xsh_fxsave) == sizeof(state->fpu));
1837 if (flags & NVMM_X64_STATE_FPU) {
1838 memcpy(&state->fpu, cpudata->gfpu.xsh_fxsave,
1839 sizeof(state->fpu));
1840 }
1841
1842 comm->state_wanted = 0;
1843 comm->state_cached |= flags;
1844 }
1845
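/*
 * Make the given state components available in the comm page. Meant for
 * the exit paths, which need pieces of guest state (GPRs, segments, ...)
 * for emulation.
 */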
1846 static void
1847 svm_vcpu_state_provide(struct nvmm_cpu *vcpu, uint64_t flags)
1848 {
1849 vcpu->comm->state_wanted = flags;
1850 svm_vcpu_getstate(vcpu);
1851 }
1852
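/*
 * Write back to the VMCB whatever state the host has committed in the
 * comm page since the last run.
 */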
1853 static void
1854 svm_vcpu_state_commit(struct nvmm_cpu *vcpu)
1855 {
1856 vcpu->comm->state_wanted = vcpu->comm->state_commit;
1857 vcpu->comm->state_commit = 0;
1858 svm_vcpu_setstate(vcpu);
1859 }
1860
1861 /* -------------------------------------------------------------------------- */
1862
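/*
 * Allocate an ASID for this VCPU from the global bitmap. If none is
 * free, fall back to the shared last ASID, which needs the special TLB
 * handling mentioned below.
 */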
1863 static void
1864 svm_asid_alloc(struct nvmm_cpu *vcpu)
1865 {
1866 struct svm_cpudata *cpudata = vcpu->cpudata;
1867 struct vmcb *vmcb = cpudata->vmcb;
1868 size_t i, oct, bit;
1869
1870 mutex_enter(&svm_asidlock);
1871
1872 for (i = 0; i < svm_maxasid; i++) {
1873 oct = i / 8;
1874 bit = i % 8;
1875
1876 if (svm_asidmap[oct] & __BIT(bit)) {
1877 continue;
1878 }
1879
1880 svm_asidmap[oct] |= __BIT(bit);
1881 vmcb->ctrl.guest_asid = i;
1882 mutex_exit(&svm_asidlock);
1883 return;
1884 }
1885
1886 /*
1887 * No free ASID. Use the last one, which is shared and requires
1888 * special TLB handling.
1889 */
1890 cpudata->shared_asid = true;
1891 vmcb->ctrl.guest_asid = svm_maxasid - 1;
1892 mutex_exit(&svm_asidlock);
1893 }
1894
1895 static void
1896 svm_asid_free(struct nvmm_cpu *vcpu)
1897 {
1898 struct svm_cpudata *cpudata = vcpu->cpudata;
1899 struct vmcb *vmcb = cpudata->vmcb;
1900 size_t oct, bit;
1901
1902 if (cpudata->shared_asid) {
1903 return;
1904 }
1905
1906 oct = vmcb->ctrl.guest_asid / 8;
1907 bit = vmcb->ctrl.guest_asid % 8;
1908
1909 mutex_enter(&svm_asidlock);
1910 svm_asidmap[oct] &= ~__BIT(bit);
1911 mutex_exit(&svm_asidlock);
1912 }
1913
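/*
 * Set up the initial VMCB of the VCPU: intercepts, I/O and MSR permission
 * bitmaps, ASID, Nested Paging, and finally the architectural RESET state.
 */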
1914 static void
1915 svm_vcpu_init(struct nvmm_machine *mach, struct nvmm_cpu *vcpu)
1916 {
1917 struct svm_cpudata *cpudata = vcpu->cpudata;
1918 struct vmcb *vmcb = cpudata->vmcb;
1919
1920 /* Allow reads/writes of Control Registers. */
1921 vmcb->ctrl.intercept_cr = 0;
1922
1923 /* Allow reads/writes of Debug Registers. */
1924 vmcb->ctrl.intercept_dr = 0;
1925
1926 /* Allow exceptions 0 to 31. */
1927 vmcb->ctrl.intercept_vec = 0;
1928
1929 /*
1930 * Allow:
1931 * - SMI [smm interrupts]
1932 * - VINTR [virtual interrupts]
1933 	 * - CR0_SPEC [CR0 writes that change fields other than CR0.TS or CR0.MP]
1934 * - RIDTR [reads of IDTR]
1935 * - RGDTR [reads of GDTR]
1936 * - RLDTR [reads of LDTR]
1937 * - RTR [reads of TR]
1938 * - WIDTR [writes of IDTR]
1939 * - WGDTR [writes of GDTR]
1940 * - WLDTR [writes of LDTR]
1941 * - WTR [writes of TR]
1942 * - RDTSC [rdtsc instruction]
1943 * - PUSHF [pushf instruction]
1944 * - POPF [popf instruction]
1945 * - IRET [iret instruction]
1946 * - INTN [int $n instructions]
1947 * - INVD [invd instruction]
1948 * - PAUSE [pause instruction]
1949 	 * - INVLPG [invlpg instruction]
1950 * - TASKSW [task switches]
1951 *
1952 * Intercept the rest below.
1953 */
1954 vmcb->ctrl.intercept_misc1 =
1955 VMCB_CTRL_INTERCEPT_INTR |
1956 VMCB_CTRL_INTERCEPT_NMI |
1957 VMCB_CTRL_INTERCEPT_INIT |
1958 VMCB_CTRL_INTERCEPT_RDPMC |
1959 VMCB_CTRL_INTERCEPT_CPUID |
1960 VMCB_CTRL_INTERCEPT_RSM |
1961 VMCB_CTRL_INTERCEPT_HLT |
1962 VMCB_CTRL_INTERCEPT_INVLPGA |
1963 VMCB_CTRL_INTERCEPT_IOIO_PROT |
1964 VMCB_CTRL_INTERCEPT_MSR_PROT |
1965 VMCB_CTRL_INTERCEPT_FERR_FREEZE |
1966 VMCB_CTRL_INTERCEPT_SHUTDOWN;
1967
1968 /*
1969 * Allow:
1970 * - ICEBP [icebp instruction]
1971 * - WBINVD [wbinvd instruction]
1972 * - WCR_SPEC(0..15) [writes of CR0-15, received after instruction]
1973 *
1974 * Intercept the rest below.
1975 */
1976 vmcb->ctrl.intercept_misc2 =
1977 VMCB_CTRL_INTERCEPT_VMRUN |
1978 VMCB_CTRL_INTERCEPT_VMMCALL |
1979 VMCB_CTRL_INTERCEPT_VMLOAD |
1980 VMCB_CTRL_INTERCEPT_VMSAVE |
1981 VMCB_CTRL_INTERCEPT_STGI |
1982 VMCB_CTRL_INTERCEPT_CLGI |
1983 VMCB_CTRL_INTERCEPT_SKINIT |
1984 VMCB_CTRL_INTERCEPT_RDTSCP |
1985 VMCB_CTRL_INTERCEPT_MONITOR |
1986 VMCB_CTRL_INTERCEPT_MWAIT |
1987 VMCB_CTRL_INTERCEPT_XSETBV;
1988
1989 /* Intercept all I/O accesses. */
1990 memset(cpudata->iobm, 0xFF, IOBM_SIZE);
1991 vmcb->ctrl.iopm_base_pa = cpudata->iobm_pa;
1992
1993 /* Allow direct access to certain MSRs. */
1994 memset(cpudata->msrbm, 0xFF, MSRBM_SIZE);
1995 svm_vcpu_msr_allow(cpudata->msrbm, MSR_EFER, true, false);
1996 svm_vcpu_msr_allow(cpudata->msrbm, MSR_STAR, true, true);
1997 svm_vcpu_msr_allow(cpudata->msrbm, MSR_LSTAR, true, true);
1998 svm_vcpu_msr_allow(cpudata->msrbm, MSR_CSTAR, true, true);
1999 svm_vcpu_msr_allow(cpudata->msrbm, MSR_SFMASK, true, true);
2000 svm_vcpu_msr_allow(cpudata->msrbm, MSR_KERNELGSBASE, true, true);
2001 svm_vcpu_msr_allow(cpudata->msrbm, MSR_SYSENTER_CS, true, true);
2002 svm_vcpu_msr_allow(cpudata->msrbm, MSR_SYSENTER_ESP, true, true);
2003 svm_vcpu_msr_allow(cpudata->msrbm, MSR_SYSENTER_EIP, true, true);
2004 svm_vcpu_msr_allow(cpudata->msrbm, MSR_FSBASE, true, true);
2005 svm_vcpu_msr_allow(cpudata->msrbm, MSR_GSBASE, true, true);
2006 svm_vcpu_msr_allow(cpudata->msrbm, MSR_CR_PAT, true, true);
2007 svm_vcpu_msr_allow(cpudata->msrbm, MSR_TSC, true, false);
2008 vmcb->ctrl.msrpm_base_pa = cpudata->msrbm_pa;
2009
2010 	/* Allocate an ASID. */
2011 svm_asid_alloc(vcpu);
2012
2013 	/* Virtual TPR: virtualize interrupt masking (guest EFLAGS.IF and CR8). */
2014 vmcb->ctrl.v = VMCB_CTRL_V_INTR_MASKING;
2015
2016 /* Enable Nested Paging. */
2017 vmcb->ctrl.enable1 = VMCB_CTRL_ENABLE_NP;
2018 vmcb->ctrl.n_cr3 = mach->vm->vm_map.pmap->pm_pdirpa[0];
2019
2020 /* Init XSAVE header. */
2021 cpudata->gfpu.xsh_xstate_bv = svm_xcr0_mask;
2022 cpudata->gfpu.xsh_xcomp_bv = 0;
2023
2024 /* These MSRs are static. */
2025 cpudata->star = rdmsr(MSR_STAR);
2026 cpudata->lstar = rdmsr(MSR_LSTAR);
2027 cpudata->cstar = rdmsr(MSR_CSTAR);
2028 cpudata->sfmask = rdmsr(MSR_SFMASK);
2029
2030 /* Install the RESET state. */
2031 memcpy(&vcpu->comm->state, &nvmm_x86_reset_state,
2032 sizeof(nvmm_x86_reset_state));
2033 vcpu->comm->state_wanted = NVMM_X64_STATE_ALL;
2034 vcpu->comm->state_cached = 0;
2035 svm_vcpu_setstate(vcpu);
2036 }
2037
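/*
 * Allocate the per-VCPU resources: the cpudata structure, the VMCB, and
 * the I/O and MSR permission bitmaps, then initialize the VCPU state.
 */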
2038 static int
2039 svm_vcpu_create(struct nvmm_machine *mach, struct nvmm_cpu *vcpu)
2040 {
2041 struct svm_cpudata *cpudata;
2042 int error;
2043
2044 /* Allocate the SVM cpudata. */
2045 cpudata = (struct svm_cpudata *)uvm_km_alloc(kernel_map,
2046 roundup(sizeof(*cpudata), PAGE_SIZE), 0,
2047 UVM_KMF_WIRED|UVM_KMF_ZERO);
2048 vcpu->cpudata = cpudata;
2049
2050 /* VMCB */
2051 error = svm_memalloc(&cpudata->vmcb_pa, (vaddr_t *)&cpudata->vmcb,
2052 VMCB_NPAGES);
2053 if (error)
2054 goto error;
2055
2056 /* I/O Bitmap */
2057 error = svm_memalloc(&cpudata->iobm_pa, (vaddr_t *)&cpudata->iobm,
2058 IOBM_NPAGES);
2059 if (error)
2060 goto error;
2061
2062 /* MSR Bitmap */
2063 error = svm_memalloc(&cpudata->msrbm_pa, (vaddr_t *)&cpudata->msrbm,
2064 MSRBM_NPAGES);
2065 if (error)
2066 goto error;
2067
2068 /* Init the VCPU info. */
2069 svm_vcpu_init(mach, vcpu);
2070
2071 return 0;
2072
2073 error:
2074 if (cpudata->vmcb_pa) {
2075 svm_memfree(cpudata->vmcb_pa, (vaddr_t)cpudata->vmcb,
2076 VMCB_NPAGES);
2077 }
2078 if (cpudata->iobm_pa) {
2079 svm_memfree(cpudata->iobm_pa, (vaddr_t)cpudata->iobm,
2080 IOBM_NPAGES);
2081 }
2082 if (cpudata->msrbm_pa) {
2083 svm_memfree(cpudata->msrbm_pa, (vaddr_t)cpudata->msrbm,
2084 MSRBM_NPAGES);
2085 }
2086 uvm_km_free(kernel_map, (vaddr_t)cpudata,
2087 roundup(sizeof(*cpudata), PAGE_SIZE), UVM_KMF_WIRED);
2088 return error;
2089 }
2090
2091 static void
2092 svm_vcpu_destroy(struct nvmm_machine *mach, struct nvmm_cpu *vcpu)
2093 {
2094 struct svm_cpudata *cpudata = vcpu->cpudata;
2095
2096 svm_asid_free(vcpu);
2097
2098 svm_memfree(cpudata->vmcb_pa, (vaddr_t)cpudata->vmcb, VMCB_NPAGES);
2099 svm_memfree(cpudata->iobm_pa, (vaddr_t)cpudata->iobm, IOBM_NPAGES);
2100 svm_memfree(cpudata->msrbm_pa, (vaddr_t)cpudata->msrbm, MSRBM_NPAGES);
2101
2102 uvm_km_free(kernel_map, (vaddr_t)cpudata,
2103 roundup(sizeof(*cpudata), PAGE_SIZE), UVM_KMF_WIRED);
2104 }
2105
2106 /* -------------------------------------------------------------------------- */
2107
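/*
 * Called through pm_tlb_flush when the host changes the guest's nested
 * page tables. Bumping mach_htlb_gen makes each VCPU flush the guest TLB
 * before its next VMRUN, and the shootdown IPIs kick the VCPUs currently
 * in guest mode out of it.
 */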
2108 static void
2109 svm_tlb_flush(struct pmap *pm)
2110 {
2111 struct nvmm_machine *mach = pm->pm_data;
2112 struct svm_machdata *machdata = mach->machdata;
2113
2114 atomic_inc_64(&machdata->mach_htlb_gen);
2115
2116 /* Generates IPIs, which cause #VMEXITs. */
2117 pmap_tlb_shootdown(pmap_kernel(), -1, PG_G, TLBSHOOT_UPDATE);
2118 }
2119
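/*
 * Hook the machine's pmap so that nested page table updates trigger
 * svm_tlb_flush, and allocate the per-machine data, starting with an
 * hTLB generation that forces a flush on each VCPU's first run.
 */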
2120 static void
2121 svm_machine_create(struct nvmm_machine *mach)
2122 {
2123 struct svm_machdata *machdata;
2124
2125 /* Fill in pmap info. */
2126 mach->vm->vm_map.pmap->pm_data = (void *)mach;
2127 mach->vm->vm_map.pmap->pm_tlb_flush = svm_tlb_flush;
2128
2129 machdata = kmem_zalloc(sizeof(struct svm_machdata), KM_SLEEP);
2130 mach->machdata = machdata;
2131
2132 /* Start with an hTLB flush everywhere. */
2133 machdata->mach_htlb_gen = 1;
2134 }
2135
2136 static void
2137 svm_machine_destroy(struct nvmm_machine *mach)
2138 {
2139 kmem_free(mach->machdata, sizeof(struct svm_machdata));
2140 }
2141
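/*
 * Handle NVMM_X86_CONF_CPUID: after checking that no bit is both set and
 * deleted, record the CPUID leaf override, replacing any existing entry
 * for the same leaf.
 */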
2142 static int
2143 svm_machine_configure(struct nvmm_machine *mach, uint64_t op, void *data)
2144 {
2145 struct nvmm_x86_conf_cpuid *cpuid = data;
2146 struct svm_machdata *machdata = (struct svm_machdata *)mach->machdata;
2147 size_t i;
2148
2149 if (__predict_false(op != NVMM_X86_CONF_CPUID)) {
2150 return EINVAL;
2151 }
2152
2153 if (__predict_false((cpuid->set.eax & cpuid->del.eax) ||
2154 (cpuid->set.ebx & cpuid->del.ebx) ||
2155 (cpuid->set.ecx & cpuid->del.ecx) ||
2156 (cpuid->set.edx & cpuid->del.edx))) {
2157 return EINVAL;
2158 }
2159
2160 	/* If the leaf is already configured, replace the entry. */
2161 for (i = 0; i < SVM_NCPUIDS; i++) {
2162 if (!machdata->cpuidpresent[i]) {
2163 continue;
2164 }
2165 if (machdata->cpuid[i].leaf == cpuid->leaf) {
2166 memcpy(&machdata->cpuid[i], cpuid,
2167 sizeof(struct nvmm_x86_conf_cpuid));
2168 return 0;
2169 }
2170 }
2171
2172 	/* Not configured yet, insert it into a free slot. */
2173 for (i = 0; i < SVM_NCPUIDS; i++) {
2174 if (!machdata->cpuidpresent[i]) {
2175 machdata->cpuidpresent[i] = true;
2176 memcpy(&machdata->cpuid[i], cpuid,
2177 sizeof(struct nvmm_x86_conf_cpuid));
2178 return 0;
2179 }
2180 }
2181
2182 return ENOBUFS;
2183 }
2184
2185 /* -------------------------------------------------------------------------- */
2186
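/*
 * Decide whether SVM is usable on this machine: AMD CPU with SVM, Nested
 * Paging and NRIP-save support, and SVM not disabled and locked in VM_CR.
 */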
2187 static bool
2188 svm_ident(void)
2189 {
2190 u_int descs[4];
2191 uint64_t msr;
2192
2193 if (cpu_vendor != CPUVENDOR_AMD) {
2194 return false;
2195 }
2196 if (!(cpu_feature[3] & CPUID_SVM)) {
2197 return false;
2198 }
2199
2200 if (curcpu()->ci_max_ext_cpuid < 0x8000000a) {
2201 return false;
2202 }
2203 x86_cpuid(0x8000000a, descs);
2204
2205 /* Want Nested Paging. */
2206 if (!(descs[3] & CPUID_AMD_SVM_NP)) {
2207 return false;
2208 }
2209
2210 /* Want nRIP. */
2211 if (!(descs[3] & CPUID_AMD_SVM_NRIPS)) {
2212 return false;
2213 }
2214
2215 svm_decode_assist = (descs[3] & CPUID_AMD_SVM_DecodeAssist) != 0;
2216
2217 msr = rdmsr(MSR_VMCR);
2218 if ((msr & VMCR_SVMED) && (msr & VMCR_LOCK)) {
2219 return false;
2220 }
2221
2222 return true;
2223 }
2224
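/*
 * Initialize the ASID allocator: a plain bitmap, with ASID 0 reserved for
 * the host and the last ASID kept aside as the shared fallback.
 */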
2225 static void
2226 svm_init_asid(uint32_t maxasid)
2227 {
2228 size_t i, j, allocsz;
2229
2230 mutex_init(&svm_asidlock, MUTEX_DEFAULT, IPL_NONE);
2231
2232 	/* Arbitrarily limit the number of ASIDs we manage. */
2233 maxasid = uimin(maxasid, 8192);
2234
2235 svm_maxasid = maxasid;
2236 allocsz = roundup(maxasid, 8) / 8;
2237 svm_asidmap = kmem_zalloc(allocsz, KM_SLEEP);
2238
2239 /* ASID 0 is reserved for the host. */
2240 svm_asidmap[0] |= __BIT(0);
2241
2242 /* ASID n-1 is special, we share it. */
2243 i = (maxasid - 1) / 8;
2244 j = (maxasid - 1) % 8;
2245 svm_asidmap[i] |= __BIT(j);
2246 }
2247
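/*
 * Cross-call handler: enable or disable SVM on the local CPU by toggling
 * EFER.SVME, and point MSR_VM_HSAVE_PA at the host save area (or clear it
 * when disabling).
 */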
2248 static void
2249 svm_change_cpu(void *arg1, void *arg2)
2250 {
2251 bool enable = (bool)arg1;
2252 uint64_t msr;
2253
2254 msr = rdmsr(MSR_VMCR);
2255 if (msr & VMCR_SVMED) {
2256 wrmsr(MSR_VMCR, msr & ~VMCR_SVMED);
2257 }
2258
2259 if (!enable) {
2260 wrmsr(MSR_VM_HSAVE_PA, 0);
2261 }
2262
2263 msr = rdmsr(MSR_EFER);
2264 if (enable) {
2265 msr |= EFER_SVME;
2266 } else {
2267 msr &= ~EFER_SVME;
2268 }
2269 wrmsr(MSR_EFER, msr);
2270
2271 if (enable) {
2272 wrmsr(MSR_VM_HSAVE_PA, hsave[cpu_index(curcpu())].pa);
2273 }
2274 }
2275
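/*
 * Global initialization: pick the guest TLB flush command, set up the
 * ASID allocator and the XCR0 mask, allocate one host save area per CPU,
 * and enable SVM on all CPUs.
 */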
2276 static void
2277 svm_init(void)
2278 {
2279 CPU_INFO_ITERATOR cii;
2280 struct cpu_info *ci;
2281 struct vm_page *pg;
2282 u_int descs[4];
2283 uint64_t xc;
2284
2285 x86_cpuid(0x8000000a, descs);
2286
2287 /* The guest TLB flush command. */
2288 if (descs[3] & CPUID_AMD_SVM_FlushByASID) {
2289 svm_ctrl_tlb_flush = VMCB_CTRL_TLB_CTRL_FLUSH_GUEST;
2290 } else {
2291 svm_ctrl_tlb_flush = VMCB_CTRL_TLB_CTRL_FLUSH_ALL;
2292 }
2293
2294 	/* Init the ASID allocator. */
2295 svm_init_asid(descs[1]);
2296
2297 /* Init the XCR0 mask. */
2298 svm_xcr0_mask = SVM_XCR0_MASK_DEFAULT & x86_xsave_features;
2299
2300 memset(hsave, 0, sizeof(hsave));
2301 for (CPU_INFO_FOREACH(cii, ci)) {
2302 pg = uvm_pagealloc(NULL, 0, NULL, UVM_PGA_ZERO);
2303 hsave[cpu_index(ci)].pa = VM_PAGE_TO_PHYS(pg);
2304 }
2305
2306 xc = xc_broadcast(0, svm_change_cpu, (void *)true, NULL);
2307 xc_wait(xc);
2308 }
2309
2310 static void
2311 svm_fini_asid(void)
2312 {
2313 size_t allocsz;
2314
2315 allocsz = roundup(svm_maxasid, 8) / 8;
2316 kmem_free(svm_asidmap, allocsz);
2317
2318 mutex_destroy(&svm_asidlock);
2319 }
2320
2321 static void
2322 svm_fini(void)
2323 {
2324 uint64_t xc;
2325 size_t i;
2326
2327 xc = xc_broadcast(0, svm_change_cpu, (void *)false, NULL);
2328 xc_wait(xc);
2329
2330 for (i = 0; i < MAXCPUS; i++) {
2331 if (hsave[i].pa != 0)
2332 uvm_pagefree(PHYS_TO_VM_PAGE(hsave[i].pa));
2333 }
2334
2335 svm_fini_asid();
2336 }
2337
2338 static void
2339 svm_capability(struct nvmm_capability *cap)
2340 {
2341 cap->arch.xcr0_mask = svm_xcr0_mask;
2342 cap->arch.mxcsr_mask = x86_fpu_mxcsr_mask;
2343 cap->arch.conf_cpuid_maxops = SVM_NCPUIDS;
2344 }
2345
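/* The SVM backend, as registered with the machine-independent NVMM code. */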
2346 const struct nvmm_impl nvmm_x86_svm = {
2347 .ident = svm_ident,
2348 .init = svm_init,
2349 .fini = svm_fini,
2350 .capability = svm_capability,
2351 .conf_max = NVMM_X86_NCONF,
2352 .conf_sizes = svm_conf_sizes,
2353 .state_size = sizeof(struct nvmm_x64_state),
2354 .machine_create = svm_machine_create,
2355 .machine_destroy = svm_machine_destroy,
2356 .machine_configure = svm_machine_configure,
2357 .vcpu_create = svm_vcpu_create,
2358 .vcpu_destroy = svm_vcpu_destroy,
2359 .vcpu_setstate = svm_vcpu_setstate,
2360 .vcpu_getstate = svm_vcpu_getstate,
2361 .vcpu_inject = svm_vcpu_inject,
2362 .vcpu_run = svm_vcpu_run
2363 };
2364