cpu.c revision 1.138 1 /* $NetBSD: cpu.c,v 1.138 2020/07/08 11:11:00 jdolecek Exp $ */
2
3 /*-
4 * Copyright (c) 2000 The NetBSD Foundation, Inc.
5 * Copyright (c) 2002, 2006, 2007 YAMAMOTO Takashi,
6 * All rights reserved.
7 *
8 * This code is derived from software contributed to The NetBSD Foundation
9 * by RedBack Networks Inc.
10 *
11 * Author: Bill Sommerfeld
12 *
13 * Redistribution and use in source and binary forms, with or without
14 * modification, are permitted provided that the following conditions
15 * are met:
16 * 1. Redistributions of source code must retain the above copyright
17 * notice, this list of conditions and the following disclaimer.
18 * 2. Redistributions in binary form must reproduce the above copyright
19 * notice, this list of conditions and the following disclaimer in the
20 * documentation and/or other materials provided with the distribution.
21 *
22 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
23 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
24 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
25 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
26 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
27 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
28 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
29 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
30 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
31 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
32 * POSSIBILITY OF SUCH DAMAGE.
33 */
34
35 /*
36 * Copyright (c) 1999 Stefan Grefen
37 *
38 * Redistribution and use in source and binary forms, with or without
39 * modification, are permitted provided that the following conditions
40 * are met:
41 * 1. Redistributions of source code must retain the above copyright
42 * notice, this list of conditions and the following disclaimer.
43 * 2. Redistributions in binary form must reproduce the above copyright
44 * notice, this list of conditions and the following disclaimer in the
45 * documentation and/or other materials provided with the distribution.
46 * 3. All advertising materials mentioning features or use of this software
47 * must display the following acknowledgement:
48 * This product includes software developed by the NetBSD
49 * Foundation, Inc. and its contributors.
50 * 4. Neither the name of The NetBSD Foundation nor the names of its
51 * contributors may be used to endorse or promote products derived
52 * from this software without specific prior written permission.
53 *
54 * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND ANY
55 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
56 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
57 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR AND CONTRIBUTORS BE LIABLE
58 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
59 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
60 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
61 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
62 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
63 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
64 * SUCH DAMAGE.
65 */
66
67 #include <sys/cdefs.h>
68 __KERNEL_RCSID(0, "$NetBSD: cpu.c,v 1.138 2020/07/08 11:11:00 jdolecek Exp $");
69
70 #include "opt_ddb.h"
71 #include "opt_multiprocessor.h"
72 #include "opt_mpbios.h" /* for MPDEBUG */
73 #include "opt_mtrr.h"
74 #include "opt_xen.h"
75
76 #include "lapic.h"
77 #include "ioapic.h"
78
79 #include <sys/param.h>
80 #include <sys/proc.h>
81 #include <sys/systm.h>
82 #include <sys/device.h>
83 #include <sys/kmem.h>
84 #include <sys/cpu.h>
85 #include <sys/cpufreq.h>
86 #include <sys/atomic.h>
87 #include <sys/reboot.h>
88 #include <sys/idle.h>
89
90 #include <uvm/uvm.h>
91
92 #include <machine/cpu.h>
93 #include <machine/cpufunc.h>
94 #include <machine/cpuvar.h>
95 #include <machine/pmap.h>
96 #include <machine/vmparam.h>
97 #include <machine/mpbiosvar.h>
98 #include <machine/pcb.h>
99 #include <machine/specialreg.h>
100 #include <machine/segments.h>
101 #include <machine/gdt.h>
102 #include <machine/mtrr.h>
103 #include <machine/pio.h>
104
105 #include <x86/fpu.h>
106
107 #include <xen/xen.h>
108 #include <xen/include/public/vcpu.h>
109 #include <xen/vcpuvar.h>
110
111 #if NLAPIC > 0
112 #include <machine/apicvar.h>
113 #include <machine/i82489reg.h>
114 #include <machine/i82489var.h>
115 #endif
116
117 #include <dev/ic/mc146818reg.h>
118 #include <dev/isa/isareg.h>
119
120 static int cpu_match(device_t, cfdata_t, void *);
121 static void cpu_attach(device_t, device_t, void *);
122 static void cpu_defer(device_t);
123 static int cpu_rescan(device_t, const char *, const int *);
124 static void cpu_childdetached(device_t, device_t);
125 static int vcpu_match(device_t, cfdata_t, void *);
126 static void vcpu_attach(device_t, device_t, void *);
127 static void cpu_attach_common(device_t, device_t, void *);
128 void cpu_offline_md(void);
129
130 struct cpu_softc {
131 device_t sc_dev; /* device tree glue */
132 struct cpu_info *sc_info; /* pointer to CPU info */
133 bool sc_wasonline;
134 };
135
136 int mp_cpu_start(struct cpu_info *, vaddr_t);
137 void mp_cpu_start_cleanup(struct cpu_info *);
138 const struct cpu_functions mp_cpu_funcs = { mp_cpu_start, NULL,
139 mp_cpu_start_cleanup };
140
141 CFATTACH_DECL2_NEW(cpu, sizeof(struct cpu_softc),
142 cpu_match, cpu_attach, NULL, NULL, cpu_rescan, cpu_childdetached);
143
144 CFATTACH_DECL_NEW(vcpu, sizeof(struct cpu_softc),
145 vcpu_match, vcpu_attach, NULL, NULL);
146
147 /*
148 * Statically-allocated CPU info for the primary CPU (or the only
149 * CPU, on uniprocessors). The CPU info list is initialized to
150 * point at it.
151 */
152 struct cpu_info cpu_info_primary __aligned(CACHE_LINE_SIZE) = {
153 .ci_dev = 0,
154 .ci_self = &cpu_info_primary,
155 .ci_idepth = -1,
156 .ci_curlwp = &lwp0,
157 .ci_curldt = -1,
158 };
159 struct cpu_info phycpu_info_primary __aligned(CACHE_LINE_SIZE) = {
160 .ci_dev = 0,
161 .ci_self = &phycpu_info_primary,
162 };
163
164 struct cpu_info *cpu_info_list = &cpu_info_primary;
165 struct cpu_info *phycpu_info_list = &phycpu_info_primary;
166
167 uint32_t cpu_feature[7] __read_mostly; /* X86 CPUID feature bits
168 * [0] basic features %edx
169 * [1] basic features %ecx
170 * [2] extended features %edx
171 * [3] extended features %ecx
172 * [4] VIA padlock features
173 * [5] structured extended features cpuid.7:%ebx
174 * [6] structured extended features cpuid.7:%ecx
175 */
176
177 bool x86_mp_online;
178 paddr_t mp_trampoline_paddr = MP_TRAMPOLINE;
179
#if defined(MULTIPROCESSOR)
/* Secondary-CPU bring-up: entry point run on the AP and BP-side helpers. */
void		cpu_hatch(void *);
static void	cpu_start_secondary(struct cpu_info *ci);
static void	cpu_boot_secondary(struct cpu_info *ci);
#endif /* MULTIPROCESSOR */
185
186 static int
187 cpu_match(device_t parent, cfdata_t match, void *aux)
188 {
189
190 return 1;
191 }
192
193 static void
194 cpu_attach(device_t parent, device_t self, void *aux)
195 {
196 struct cpu_softc *sc = device_private(self);
197 struct cpu_attach_args *caa = aux;
198 struct cpu_info *ci;
199 uintptr_t ptr;
200 static int nphycpu = 0;
201
202 sc->sc_dev = self;
203
204 /*
205 * If we're an Application Processor, allocate a cpu_info
206 * If we're the first attached CPU use the primary cpu_info,
207 * otherwise allocate a new one
208 */
209 aprint_naive("\n");
210 aprint_normal("\n");
211 if (nphycpu > 0) {
212 struct cpu_info *tmp;
213 ptr = (uintptr_t)kmem_zalloc(sizeof(*ci) + CACHE_LINE_SIZE - 1,
214 KM_SLEEP);
215 ci = (struct cpu_info *)roundup2(ptr, CACHE_LINE_SIZE);
216 ci->ci_curldt = -1;
217
218 tmp = phycpu_info_list;
219 while (tmp->ci_next)
220 tmp = tmp->ci_next;
221
222 tmp->ci_next = ci;
223 } else {
224 ci = &phycpu_info_primary;
225 }
226
227 ci->ci_self = ci;
228 sc->sc_info = ci;
229
230 ci->ci_dev = self;
231 ci->ci_acpiid = caa->cpu_id;
232 ci->ci_cpuid = caa->cpu_number;
233 ci->ci_vcpu = NULL;
234 ci->ci_index = nphycpu++;
235 ci->ci_kfpu_spl = -1;
236
237 if (!pmf_device_register(self, NULL, NULL))
238 aprint_error_dev(self, "couldn't establish power handler\n");
239
240 (void)config_defer(self, cpu_defer);
241 }
242
243 static void
244 cpu_defer(device_t self)
245 {
246 cpu_rescan(self, NULL, NULL);
247 }
248
249 static int
250 cpu_rescan(device_t self, const char *ifattr, const int *locators)
251 {
252 struct cpu_softc *sc = device_private(self);
253 struct cpufeature_attach_args cfaa;
254 struct cpu_info *ci = sc->sc_info;
255
256 memset(&cfaa, 0, sizeof(cfaa));
257 cfaa.ci = ci;
258
259 if (ifattr_match(ifattr, "cpufeaturebus")) {
260
261 if (ci->ci_frequency == NULL) {
262 cfaa.name = "frequency";
263 ci->ci_frequency = config_found_ia(self,
264 "cpufeaturebus", &cfaa, NULL);
265 }
266 }
267
268 return 0;
269 }
270
271 static void
272 cpu_childdetached(device_t self, device_t child)
273 {
274 struct cpu_softc *sc = device_private(self);
275 struct cpu_info *ci = sc->sc_info;
276
277 if (ci->ci_frequency == child)
278 ci->ci_frequency = NULL;
279 }
280
281 static int
282 vcpu_match(device_t parent, cfdata_t match, void *aux)
283 {
284 struct vcpu_attach_args *vcaa = aux;
285 struct vcpu_runstate_info vcr;
286 int error;
287
288 if (strcmp(vcaa->vcaa_name, match->cf_name) == 0) {
289 error = HYPERVISOR_vcpu_op(VCPUOP_get_runstate_info,
290 vcaa->vcaa_caa.cpu_number, &vcr);
291 switch (error) {
292 case 0:
293 return 1;
294 case -ENOENT:
295 return 0;
296 default:
297 panic("Unknown hypervisor error %d returned on vcpu runstate probe\n", error);
298 }
299 }
300
301 return 0;
302 }
303
304 static void
305 vcpu_attach(device_t parent, device_t self, void *aux)
306 {
307 struct vcpu_attach_args *vcaa = aux;
308
309 KASSERT(vcaa->vcaa_caa.cpu_func == NULL);
310 vcaa->vcaa_caa.cpu_func = &mp_cpu_funcs;
311 cpu_attach_common(parent, self, &vcaa->vcaa_caa);
312
313 if (!pmf_device_register(self, NULL, NULL))
314 aprint_error_dev(self, "couldn't establish power handler\n");
315 }
316
317 static int
318 vcpu_is_up(struct cpu_info *ci)
319 {
320 KASSERT(ci != NULL);
321 return HYPERVISOR_vcpu_op(VCPUOP_is_up, ci->ci_vcpuid, NULL);
322 }
323
324 static void
325 cpu_vm_init(struct cpu_info *ci)
326 {
327 int ncolors = 2, i;
328
329 for (i = CAI_ICACHE; i <= CAI_L2CACHE; i++) {
330 struct x86_cache_info *cai;
331 int tcolors;
332
333 cai = &ci->ci_cinfo[i];
334
335 tcolors = atop(cai->cai_totalsize);
336 switch (cai->cai_associativity) {
337 case 0xff:
338 tcolors = 1; /* fully associative */
339 break;
340 case 0:
341 case 1:
342 break;
343 default:
344 tcolors /= cai->cai_associativity;
345 }
346 ncolors = uimax(ncolors, tcolors);
347 }
348
349 /*
350 * Knowing the size of the largest cache on this CPU, potentially
351 * re-color our pages.
352 */
353 aprint_debug_dev(ci->ci_dev, "%d page colors\n", ncolors);
354 uvm_page_recolor(ncolors);
355 pmap_tlb_cpu_init(ci);
356 #ifndef __HAVE_DIRECT_MAP
357 pmap_vpage_cpu_init(ci);
358 #endif
359 }
360
361 static void
362 cpu_attach_common(device_t parent, device_t self, void *aux)
363 {
364 struct cpu_softc *sc = device_private(self);
365 struct cpu_attach_args *caa = aux;
366 struct cpu_info *ci;
367 uintptr_t ptr;
368 int cpunum = caa->cpu_number;
369 static bool again = false;
370
371 sc->sc_dev = self;
372
373 /*
374 * If we're an Application Processor, allocate a cpu_info
375 * structure, otherwise use the primary's.
376 */
377 if (caa->cpu_role == CPU_ROLE_AP) {
378 aprint_naive(": Application Processor\n");
379 ptr = (uintptr_t)kmem_alloc(sizeof(*ci) + CACHE_LINE_SIZE - 1,
380 KM_SLEEP);
381 ci = (struct cpu_info *)roundup2(ptr, CACHE_LINE_SIZE);
382 memset(ci, 0, sizeof(*ci));
383 cpu_init_tss(ci);
384 } else {
385 aprint_naive(": %s Processor\n",
386 caa->cpu_role == CPU_ROLE_SP ? "Single" : "Boot");
387 ci = &cpu_info_primary;
388 }
389
390 ci->ci_self = ci;
391 sc->sc_info = ci;
392 ci->ci_dev = self;
393 ci->ci_cpuid = cpunum;
394 ci->ci_vcpuid = cpunum;
395 ci->ci_kfpu_spl = -1;
396
397 KASSERT(HYPERVISOR_shared_info != NULL);
398 KASSERT(cpunum < XEN_LEGACY_MAX_VCPUS);
399 ci->ci_vcpu = &HYPERVISOR_shared_info->vcpu_info[cpunum];
400
401 KASSERT(ci->ci_func == 0);
402 ci->ci_func = caa->cpu_func;
403 aprint_normal("\n");
404
405 /* Must be called before mi_cpu_attach(). */
406 cpu_vm_init(ci);
407
408 if (caa->cpu_role == CPU_ROLE_AP) {
409 int error;
410
411 error = mi_cpu_attach(ci);
412
413 KASSERT(ci->ci_data.cpu_idlelwp != NULL);
414 if (error != 0) {
415 aprint_error_dev(self,
416 "mi_cpu_attach failed with %d\n", error);
417 return;
418 }
419
420 } else {
421 KASSERT(ci->ci_data.cpu_idlelwp != NULL);
422 }
423
424 KASSERT(ci->ci_cpuid == ci->ci_index);
425 #ifdef __x86_64__
426 /* No user PGD mapped for this CPU yet */
427 ci->ci_xen_current_user_pgd = 0;
428 #endif
429 mutex_init(&ci->ci_kpm_mtx, MUTEX_DEFAULT, IPL_VM);
430 pmap_reference(pmap_kernel());
431 ci->ci_pmap = pmap_kernel();
432 ci->ci_tlbstate = TLBSTATE_STALE;
433
434 /*
435 * Boot processor may not be attached first, but the below
436 * must be done to allow booting other processors.
437 */
438 if (!again) {
439 atomic_or_32(&ci->ci_flags, CPUF_PRESENT | CPUF_PRIMARY);
440 /* Basic init. */
441 cpu_intr_init(ci);
442 cpu_get_tsc_freq(ci);
443 cpu_init(ci);
444 pmap_cpu_init_late(ci);
445
446 /* Every processor needs to init its own ipi h/w (similar to lapic) */
447 xen_ipi_init();
448
449 /* Make sure DELAY() is initialized. */
450 DELAY(1);
451 again = true;
452 }
453
454 /* further PCB init done later. */
455
456 switch (caa->cpu_role) {
457 case CPU_ROLE_SP:
458 atomic_or_32(&ci->ci_flags, CPUF_SP);
459 cpu_identify(ci);
460 x86_cpu_idle_init();
461 break;
462
463 case CPU_ROLE_BP:
464 atomic_or_32(&ci->ci_flags, CPUF_BSP);
465 cpu_identify(ci);
466 x86_cpu_idle_init();
467 break;
468
469 case CPU_ROLE_AP:
470 atomic_or_32(&ci->ci_flags, CPUF_AP);
471
472 /*
473 * report on an AP
474 */
475
476 #if defined(MULTIPROCESSOR)
477 /* interrupt handler stack */
478 cpu_intr_init(ci);
479
480 /* Setup per-cpu memory for gdt */
481 gdt_alloc_cpu(ci);
482
483 pmap_cpu_init_late(ci);
484 cpu_start_secondary(ci);
485
486 if (ci->ci_flags & CPUF_PRESENT) {
487 struct cpu_info *tmp;
488
489 cpu_identify(ci);
490 tmp = cpu_info_list;
491 while (tmp->ci_next)
492 tmp = tmp->ci_next;
493
494 tmp->ci_next = ci;
495 }
496 #else
497 aprint_error_dev(ci->ci_dev, "not started\n");
498 #endif
499 break;
500
501 default:
502 panic("unknown processor type??\n");
503 }
504
505 #ifdef MPVERBOSE
506 if (mp_verbose) {
507 struct lwp *l = ci->ci_data.cpu_idlelwp;
508 struct pcb *pcb = lwp_getpcb(l);
509
510 aprint_verbose_dev(self,
511 "idle lwp at %p, idle sp at %p\n",
512 l,
513 #ifdef i386
514 (void *)pcb->pcb_esp
515 #else
516 (void *)pcb->pcb_rsp
517 #endif
518 );
519
520 }
521 #endif /* MPVERBOSE */
522 }
523
524 /*
525 * Initialize the processor appropriately.
526 */
527
528 void
529 cpu_init(struct cpu_info *ci)
530 {
531 uint32_t cr4 = 0;
532
533 /*
534 * If we have FXSAVE/FXRESTOR, use them.
535 */
536 if (cpu_feature[0] & CPUID_FXSR) {
537 cr4 |= CR4_OSFXSR;
538
539 /*
540 * If we have SSE/SSE2, enable XMM exceptions.
541 */
542 if (cpu_feature[0] & (CPUID_SSE|CPUID_SSE2))
543 cr4 |= CR4_OSXMMEXCPT;
544 }
545
546 /* If xsave is supported, enable it */
547 if (cpu_feature[1] & CPUID2_XSAVE && x86_fpu_save >= FPU_SAVE_XSAVE)
548 cr4 |= CR4_OSXSAVE;
549
550 if (cr4) {
551 cr4 |= rcr4();
552 lcr4(cr4);
553 }
554
555 if (x86_fpu_save >= FPU_SAVE_FXSAVE) {
556 fpuinit_mxcsr_mask();
557 }
558
559 /*
560 * Changing CR4 register may change cpuid values. For example, setting
561 * CR4_OSXSAVE sets CPUID2_OSXSAVE. The CPUID2_OSXSAVE is in
562 * ci_feat_val[1], so update it.
563 * XXX Other than ci_feat_val[1] might be changed.
564 */
565 if (cpuid_level >= 1) {
566 u_int descs[4];
567
568 x86_cpuid(1, descs);
569 ci->ci_feat_val[1] = descs[2];
570 }
571
572 /* If xsave is enabled, enable all fpu features */
573 if (cr4 & CR4_OSXSAVE) {
574 wrxcr(0, x86_xsave_features & XCR0_FPU);
575 }
576
577 atomic_or_32(&ci->ci_flags, CPUF_RUNNING);
578 }
579
580
581 #ifdef MULTIPROCESSOR
582
583 void
584 cpu_boot_secondary_processors(void)
585 {
586 struct cpu_info *ci;
587 kcpuset_t *cpus;
588 u_long i;
589
590 kcpuset_create(&cpus, true);
591 kcpuset_set(cpus, cpu_index(curcpu()));
592 for (i = 0; i < maxcpus; i++) {
593 ci = cpu_lookup(i);
594 if (ci == NULL)
595 continue;
596 if (ci->ci_data.cpu_idlelwp == NULL)
597 continue;
598 if ((ci->ci_flags & CPUF_PRESENT) == 0)
599 continue;
600 if (ci->ci_flags & (CPUF_BSP|CPUF_SP|CPUF_PRIMARY))
601 continue;
602 cpu_boot_secondary(ci);
603 kcpuset_set(cpus, cpu_index(ci));
604 }
605 while (!kcpuset_match(cpus, kcpuset_running))
606 ;
607 kcpuset_destroy(cpus);
608
609 x86_mp_online = true;
610 }
611
612 static void
613 cpu_init_idle_lwp(struct cpu_info *ci)
614 {
615 struct lwp *l = ci->ci_data.cpu_idlelwp;
616 struct pcb *pcb = lwp_getpcb(l);
617
618 pcb->pcb_cr0 = rcr0();
619 }
620
621 void
622 cpu_init_idle_lwps(void)
623 {
624 struct cpu_info *ci;
625 u_long i;
626
627 for (i = 0; i < maxcpus; i++) {
628 ci = cpu_lookup(i);
629 if (ci == NULL)
630 continue;
631 if (ci->ci_data.cpu_idlelwp == NULL)
632 continue;
633 if ((ci->ci_flags & CPUF_PRESENT) == 0)
634 continue;
635 cpu_init_idle_lwp(ci);
636 }
637 }
638
639 static void
640 cpu_start_secondary(struct cpu_info *ci)
641 {
642 int i;
643
644 aprint_debug_dev(ci->ci_dev, "starting\n");
645
646 ci->ci_curlwp = ci->ci_data.cpu_idlelwp;
647
648 if (CPU_STARTUP(ci, (vaddr_t) cpu_hatch) != 0) {
649 return;
650 }
651
652 /*
653 * wait for it to become ready
654 */
655 for (i = 100000; (!(ci->ci_flags & CPUF_PRESENT)) && i > 0; i--) {
656 delay(10);
657 }
658 if ((ci->ci_flags & CPUF_PRESENT) == 0) {
659 aprint_error_dev(ci->ci_dev, "failed to become ready\n");
660 #if defined(MPDEBUG) && defined(DDB)
661 printf("dropping into debugger; continue from here to resume boot\n");
662 Debugger();
663 #endif
664 }
665
666 CPU_START_CLEANUP(ci);
667 }
668
669 void
670 cpu_boot_secondary(struct cpu_info *ci)
671 {
672 int i;
673 atomic_or_32(&ci->ci_flags, CPUF_GO);
674 for (i = 100000; (!(ci->ci_flags & CPUF_RUNNING)) && i > 0; i--) {
675 delay(10);
676 }
677 if ((ci->ci_flags & CPUF_RUNNING) == 0) {
678 aprint_error_dev(ci->ci_dev, "CPU failed to start\n");
679 #if defined(MPDEBUG) && defined(DDB)
680 printf("dropping into debugger; continue from here to resume boot\n");
681 Debugger();
682 #endif
683 }
684 }
685
686 /*
687 * APs end up here immediately after initialisation and VCPUOP_up in
688 * mp_cpu_start().
689 * At this point, we are running in the idle pcb/idle stack of the new
690 * CPU. This function jumps to the idle loop and starts looking for
691 * work.
692 */
693 extern void x86_64_tls_switch(struct lwp *);
694 void
695 cpu_hatch(void *v)
696 {
697 struct cpu_info *ci = (struct cpu_info *)v;
698 struct pcb *pcb;
699 int s, i;
700
701 /* Setup TLS and kernel GS/FS */
702 cpu_init_msrs(ci, true);
703 cpu_init_idt();
704 gdt_init_cpu(ci);
705
706 cpu_probe(ci);
707
708 atomic_or_32(&ci->ci_flags, CPUF_PRESENT);
709
710 while ((ci->ci_flags & CPUF_GO) == 0) {
711 /* Don't use delay, boot CPU may be patching the text. */
712 for (i = 10000; i != 0; i--)
713 x86_pause();
714 }
715
716 /* Because the text may have been patched in x86_patch(). */
717 x86_flush();
718 tlbflushg();
719
720 KASSERT((ci->ci_flags & CPUF_RUNNING) == 0);
721
722 KASSERT(ci->ci_curlwp == ci->ci_data.cpu_idlelwp);
723 KASSERT(curlwp == ci->ci_data.cpu_idlelwp);
724 pcb = lwp_getpcb(curlwp);
725 pcb->pcb_cr3 = pmap_pdirpa(pmap_kernel(), 0);
726
727 xen_ipi_init();
728
729 xen_initclocks();
730
731 #ifdef __x86_64__
732 fpuinit(ci);
733 #endif
734
735 lldt(GSEL(GLDT_SEL, SEL_KPL));
736
737 cpu_init(ci);
738 cpu_get_tsc_freq(ci);
739
740 s = splhigh();
741 x86_enable_intr();
742 splx(s);
743
744 aprint_debug_dev(ci->ci_dev, "running\n");
745
746 KASSERT(ci->ci_curlwp == ci->ci_data.cpu_idlelwp);
747 idle_loop(NULL);
748 KASSERT(false);
749 }
750
751 #if defined(DDB)
752
753 #include <ddb/db_output.h>
754 #include <machine/db_machdep.h>
755
756 /*
757 * Dump CPU information from ddb.
758 */
759 void
760 cpu_debug_dump(void)
761 {
762 struct cpu_info *ci;
763 CPU_INFO_ITERATOR cii;
764
765 db_printf("addr dev id flags ipis curlwp\n");
766 for (CPU_INFO_FOREACH(cii, ci)) {
767 db_printf("%p %s %ld %x %x %10p\n",
768 ci,
769 ci->ci_dev == NULL ? "BOOT" : device_xname(ci->ci_dev),
770 (long)ci->ci_vcpuid,
771 ci->ci_flags, ci->ci_ipis,
772 ci->ci_curlwp);
773 }
774 }
775 #endif /* DDB */
776
777 #endif /* MULTIPROCESSOR */
778
/*
 * Entry points defined elsewhere whose addresses are handed to Xen in
 * the initial vcpu context.
 */
extern void	hypervisor_callback(void);
extern void	failsafe_callback(void);
#ifdef __x86_64__
typedef void (vector)(void);
extern vector	Xsyscall, Xsyscall32;
#endif
785
/*
 * Set up the "trampoline".  On Xen, we set up nearly all of the CPU
 * context outside of a trampoline, so the entry point (passed in as
 * targetrip on amd64, targeteip on i386) is prototyped and called as:
 *	void targetip(struct cpu_info *);
 */
791
792 static void
793 gdt_prepframes(paddr_t *frames, vaddr_t base, uint32_t entries)
794 {
795 int i;
796 for (i = 0; i < entries; i++) {
797 frames[i] = ((paddr_t)xpmap_ptetomach(
798 (pt_entry_t *)(base + (i << PAGE_SHIFT)))) >> PAGE_SHIFT;
799
800 /* Mark Read-only */
801 pmap_pte_clearbits(kvtopte(base + (i << PAGE_SHIFT)),
802 PTE_W);
803 }
804 }
805
806 #ifdef __x86_64__
807 extern char *ldtstore;
808
809 static void
810 xen_init_amd64_vcpuctxt(struct cpu_info *ci, struct vcpu_guest_context *initctx,
811 void targetrip(struct cpu_info *))
812 {
813 /* page frames to point at GDT */
814 extern int gdt_size;
815 paddr_t frames[16];
816 psize_t gdt_ents;
817
818 struct lwp *l;
819 struct pcb *pcb;
820
821 volatile struct vcpu_info *vci;
822
823 KASSERT(ci != NULL);
824 KASSERT(ci != &cpu_info_primary);
825 KASSERT(initctx != NULL);
826 KASSERT(targetrip != NULL);
827
828 memset(initctx, 0, sizeof(*initctx));
829
830 gdt_ents = roundup(gdt_size, PAGE_SIZE) >> PAGE_SHIFT;
831 KASSERT(gdt_ents <= 16);
832
833 gdt_prepframes(frames, (vaddr_t)ci->ci_gdt, gdt_ents);
834
835 /* Initialise the vcpu context: We use idle_loop()'s pcb context. */
836
837 l = ci->ci_data.cpu_idlelwp;
838
839 KASSERT(l != NULL);
840 pcb = lwp_getpcb(l);
841 KASSERT(pcb != NULL);
842
843 /* resume with interrupts off */
844 vci = ci->ci_vcpu;
845 vci->evtchn_upcall_mask = 1;
846 xen_mb();
847
848 /* resume in kernel-mode */
849 initctx->flags = VGCF_in_kernel | VGCF_online;
850
851 /* Stack and entry points:
852 * We arrange for the stack frame for cpu_hatch() to
853 * appear as a callee frame of lwp_trampoline(). Being a
854 * leaf frame prevents trampling on any of the MD stack setup
855 * that x86/vm_machdep.c:cpu_lwp_fork() does for idle_loop()
856 */
857
858 initctx->user_regs.rdi = (uint64_t) ci; /* targetrip(ci); */
859 initctx->user_regs.rip = (vaddr_t) targetrip;
860
861 initctx->user_regs.cs = GSEL(GCODE_SEL, SEL_KPL);
862
863 initctx->user_regs.rflags = pcb->pcb_flags;
864 initctx->user_regs.rsp = pcb->pcb_rsp;
865
866 /* Data segments */
867 initctx->user_regs.ss = GSEL(GDATA_SEL, SEL_KPL);
868 initctx->user_regs.es = GSEL(GDATA_SEL, SEL_KPL);
869 initctx->user_regs.ds = GSEL(GDATA_SEL, SEL_KPL);
870
871 /* GDT */
872 memcpy(initctx->gdt_frames, frames, sizeof(frames));
873 initctx->gdt_ents = gdt_ents;
874
875 /* LDT */
876 initctx->ldt_base = (unsigned long)ldtstore;
877 initctx->ldt_ents = LDT_SIZE >> 3;
878
879 /* Kernel context state */
880 initctx->kernel_ss = GSEL(GDATA_SEL, SEL_KPL);
881 initctx->kernel_sp = pcb->pcb_rsp0;
882 initctx->ctrlreg[0] = pcb->pcb_cr0;
883 initctx->ctrlreg[1] = 0; /* "resuming" from kernel - no User cr3. */
884 initctx->ctrlreg[2] = (vaddr_t)targetrip;
885 /*
886 * Use pmap_kernel() L4 PD directly, until we setup the
887 * per-cpu L4 PD in pmap_cpu_init_late()
888 */
889 initctx->ctrlreg[3] = xen_pfn_to_cr3(x86_btop(xpmap_ptom(ci->ci_kpm_pdirpa)));
890 initctx->ctrlreg[4] = CR4_PAE | CR4_OSFXSR | CR4_OSXMMEXCPT;
891
892 /* Xen callbacks */
893 initctx->event_callback_eip = (unsigned long)hypervisor_callback;
894 initctx->failsafe_callback_eip = (unsigned long)failsafe_callback;
895 initctx->syscall_callback_eip = (unsigned long)Xsyscall;
896
897 return;
898 }
899 #else /* i386 */
900 extern union descriptor *ldtstore;
901 extern void Xsyscall(void);
902
903 static void
904 xen_init_i386_vcpuctxt(struct cpu_info *ci, struct vcpu_guest_context *initctx,
905 void targeteip(struct cpu_info *))
906 {
907 /* page frames to point at GDT */
908 extern int gdt_size;
909 paddr_t frames[16];
910 psize_t gdt_ents;
911
912 struct lwp *l;
913 struct pcb *pcb;
914
915 volatile struct vcpu_info *vci;
916
917 KASSERT(ci != NULL);
918 KASSERT(ci != &cpu_info_primary);
919 KASSERT(initctx != NULL);
920 KASSERT(targeteip != NULL);
921
922 memset(initctx, 0, sizeof(*initctx));
923
924 gdt_ents = roundup(gdt_size, PAGE_SIZE) >> PAGE_SHIFT;
925 KASSERT(gdt_ents <= 16);
926
927 gdt_prepframes(frames, (vaddr_t)ci->ci_gdt, gdt_ents);
928
929 /*
930 * Initialise the vcpu context:
931 * We use this cpu's idle_loop() pcb context.
932 */
933
934 l = ci->ci_data.cpu_idlelwp;
935
936 KASSERT(l != NULL);
937 pcb = lwp_getpcb(l);
938 KASSERT(pcb != NULL);
939
940 /* resume with interrupts off */
941 vci = ci->ci_vcpu;
942 vci->evtchn_upcall_mask = 1;
943 xen_mb();
944
945 /* resume in kernel-mode */
946 initctx->flags = VGCF_in_kernel | VGCF_online;
947
948 /* Stack frame setup for cpu_hatch():
949 * We arrange for the stack frame for cpu_hatch() to
950 * appear as a callee frame of lwp_trampoline(). Being a
951 * leaf frame prevents trampling on any of the MD stack setup
952 * that x86/vm_machdep.c:cpu_lwp_fork() does for idle_loop()
953 */
954
955 initctx->user_regs.esp = pcb->pcb_esp - 4; /* Leave word for
956 arg1 */
957 {
958 /* targeteip(ci); */
959 uint32_t *arg = (uint32_t *)initctx->user_regs.esp;
960 arg[1] = (uint32_t)ci; /* arg1 */
961 }
962
963 initctx->user_regs.eip = (vaddr_t)targeteip;
964 initctx->user_regs.cs = GSEL(GCODE_SEL, SEL_KPL);
965 initctx->user_regs.eflags |= pcb->pcb_iopl;
966
967 /* Data segments */
968 initctx->user_regs.ss = GSEL(GDATA_SEL, SEL_KPL);
969 initctx->user_regs.es = GSEL(GDATA_SEL, SEL_KPL);
970 initctx->user_regs.ds = GSEL(GDATA_SEL, SEL_KPL);
971 initctx->user_regs.fs = GSEL(GDATA_SEL, SEL_KPL);
972
973 /* GDT */
974 memcpy(initctx->gdt_frames, frames, sizeof(frames));
975 initctx->gdt_ents = gdt_ents;
976
977 /* LDT */
978 initctx->ldt_base = (unsigned long)ldtstore;
979 initctx->ldt_ents = NLDT;
980
981 /* Kernel context state */
982 initctx->kernel_ss = GSEL(GDATA_SEL, SEL_KPL);
983 initctx->kernel_sp = pcb->pcb_esp0;
984 initctx->ctrlreg[0] = pcb->pcb_cr0;
985 initctx->ctrlreg[1] = 0; /* "resuming" from kernel - no User cr3. */
986 initctx->ctrlreg[2] = (vaddr_t)targeteip;
987 initctx->ctrlreg[3] = xen_pfn_to_cr3(x86_btop(xpmap_ptom(ci->ci_pae_l3_pdirpa)));
988 initctx->ctrlreg[4] = /* CR4_PAE | */CR4_OSFXSR | CR4_OSXMMEXCPT;
989
990 /* Xen callbacks */
991 initctx->event_callback_eip = (unsigned long)hypervisor_callback;
992 initctx->event_callback_cs = GSEL(GCODE_SEL, SEL_KPL);
993 initctx->failsafe_callback_eip = (unsigned long)failsafe_callback;
994 initctx->failsafe_callback_cs = GSEL(GCODE_SEL, SEL_KPL);
995
996 return;
997 }
998 #endif /* __x86_64__ */
999
1000 int
1001 mp_cpu_start(struct cpu_info *ci, vaddr_t target)
1002 {
1003 int hyperror;
1004 struct vcpu_guest_context *vcpuctx;
1005
1006 KASSERT(ci != NULL);
1007 KASSERT(ci != &cpu_info_primary);
1008 KASSERT(ci->ci_flags & CPUF_AP);
1009
1010 vcpuctx = kmem_alloc(sizeof(*vcpuctx), KM_SLEEP);
1011
1012 #ifdef __x86_64__
1013 xen_init_amd64_vcpuctxt(ci, vcpuctx, (void (*)(struct cpu_info *))target);
1014 #else
1015 xen_init_i386_vcpuctxt(ci, vcpuctx, (void (*)(struct cpu_info *))target);
1016 #endif
1017
1018 /* Initialise the given vcpu to execute cpu_hatch(ci); */
1019 if ((hyperror = HYPERVISOR_vcpu_op(VCPUOP_initialise, ci->ci_vcpuid, vcpuctx))) {
1020 aprint_error(": context initialisation failed. errno = %d\n", hyperror);
1021 goto out;
1022 }
1023
1024 /* Start it up */
1025
1026 /* First bring it down */
1027 if ((hyperror = HYPERVISOR_vcpu_op(VCPUOP_down, ci->ci_vcpuid, NULL))) {
1028 aprint_error(": VCPUOP_down hypervisor command failed. errno = %d\n", hyperror);
1029 goto out;
1030 }
1031
1032 if ((hyperror = HYPERVISOR_vcpu_op(VCPUOP_up, ci->ci_vcpuid, NULL))) {
1033 aprint_error(": VCPUOP_up hypervisor command failed. errno = %d\n", hyperror);
1034 goto out;
1035 }
1036
1037 if (!vcpu_is_up(ci)) {
1038 aprint_error(": did not come up\n");
1039 hyperror = -1;
1040 goto out;
1041 }
1042
1043 out:
1044 kmem_free(vcpuctx, sizeof(*vcpuctx));
1045 return hyperror;
1046 }
1047
1048 void
1049 mp_cpu_start_cleanup(struct cpu_info *ci)
1050 {
1051 if (vcpu_is_up(ci)) {
1052 aprint_debug_dev(ci->ci_dev, "is started.\n");
1053 } else {
1054 aprint_error_dev(ci->ci_dev, "did not start up.\n");
1055 }
1056 }
1057
1058 void
1059 cpu_init_msrs(struct cpu_info *ci, bool full)
1060 {
1061 #ifdef __x86_64__
1062 if (full) {
1063 HYPERVISOR_set_segment_base(SEGBASE_FS, 0);
1064 HYPERVISOR_set_segment_base(SEGBASE_GS_KERNEL, (uint64_t)ci);
1065 HYPERVISOR_set_segment_base(SEGBASE_GS_USER, 0);
1066 }
1067 #endif
1068
1069 if (cpu_feature[2] & CPUID_NOX)
1070 wrmsr(MSR_EFER, rdmsr(MSR_EFER) | EFER_NXE);
1071 }
1072
/*
 * cpu_offline_md: machine-dependent CPU offline hook.  Nothing needs to
 * be done here.
 */
void
cpu_offline_md(void)
{
	/* intentionally empty */
}
1078
1079 void
1080 cpu_get_tsc_freq(struct cpu_info *ci)
1081 {
1082 uint32_t vcpu_tversion;
1083 const volatile vcpu_time_info_t *tinfo = &ci->ci_vcpu->time;
1084
1085 vcpu_tversion = tinfo->version;
1086 while (tinfo->version == vcpu_tversion); /* Wait for a time update. XXX: timeout ? */
1087
1088 uint64_t freq = 1000000000ULL << 32;
1089 freq = freq / (uint64_t)tinfo->tsc_to_system_mul;
1090 if (tinfo->tsc_shift < 0)
1091 freq = freq << -tinfo->tsc_shift;
1092 else
1093 freq = freq >> tinfo->tsc_shift;
1094 ci->ci_data.cpu_cc_freq = freq;
1095 }
1096
1097 /*
1098 * Loads pmap for the current CPU.
1099 */
1100 void
1101 cpu_load_pmap(struct pmap *pmap, struct pmap *oldpmap)
1102 {
1103 struct cpu_info *ci = curcpu();
1104 cpuid_t cid = cpu_index(ci);
1105 int i;
1106
1107 KASSERT(pmap != pmap_kernel());
1108
1109 mutex_enter(&ci->ci_kpm_mtx);
1110 /* make new pmap visible to xen_kpm_sync() */
1111 kcpuset_atomic_set(pmap->pm_xen_ptp_cpus, cid);
1112
1113 #ifdef __x86_64__
1114 pd_entry_t *new_pgd;
1115 paddr_t l4_pd_ma;
1116
1117 l4_pd_ma = xpmap_ptom_masked(ci->ci_kpm_pdirpa);
1118
1119 /*
1120 * Map user space address in kernel space and load
1121 * user cr3
1122 */
1123 new_pgd = pmap->pm_pdir;
1124 KASSERT(pmap == ci->ci_pmap);
1125
1126 /* Copy user pmap L4 PDEs (in user addr. range) to per-cpu L4 */
1127 for (i = 0; i < PDIR_SLOT_USERLIM; i++) {
1128 KASSERT(pmap != pmap_kernel() || new_pgd[i] == 0);
1129 if (ci->ci_kpm_pdir[i] != new_pgd[i]) {
1130 xpq_queue_pte_update(l4_pd_ma + i * sizeof(pd_entry_t),
1131 new_pgd[i]);
1132 }
1133 }
1134
1135 xen_set_user_pgd(pmap_pdirpa(pmap, 0));
1136 ci->ci_xen_current_user_pgd = pmap_pdirpa(pmap, 0);
1137 #else
1138 paddr_t l3_pd = xpmap_ptom_masked(ci->ci_pae_l3_pdirpa);
1139 /* don't update the kernel L3 slot */
1140 for (i = 0; i < PDP_SIZE - 1; i++) {
1141 xpq_queue_pte_update(l3_pd + i * sizeof(pd_entry_t),
1142 xpmap_ptom(pmap->pm_pdirpa[i]) | PTE_P);
1143 }
1144 #endif
1145
1146 tlbflush();
1147
1148 /* old pmap no longer visible to xen_kpm_sync() */
1149 if (oldpmap != pmap_kernel()) {
1150 kcpuset_atomic_clear(oldpmap->pm_xen_ptp_cpus, cid);
1151 }
1152 mutex_exit(&ci->ci_kpm_mtx);
1153 }
1154
1155 /*
1156 * pmap_cpu_init_late: perform late per-CPU initialization.
1157 *
1158 * Short note about percpu PDIR pages. Both the PAE and __x86_64__ architectures
1159 * have per-cpu PDIR tables, for two different reasons:
1160 * - on PAE, this is to get around Xen's pagetable setup constraints (multiple
1161 * L3[3]s cannot point to the same L2 - Xen will refuse to pin a table set up
1162 * this way).
1163 * - on __x86_64__, this is for multiple CPUs to map in different user pmaps
1164 * (see cpu_load_pmap()).
1165 *
1166 * What this means for us is that the PDIR of the pmap_kernel() is considered
1167 * to be a canonical "SHADOW" PDIR with the following properties:
1168 * - its recursive mapping points to itself
1169 * - per-cpu recursive mappings point to themselves on __x86_64__
1170 * - per-cpu L4 pages' kernel entries are expected to be in sync with
1171 * the shadow
1172 */
1173
1174 void
1175 pmap_cpu_init_late(struct cpu_info *ci)
1176 {
1177 int i;
1178
1179 /*
1180 * The BP has already its own PD page allocated during early
1181 * MD startup.
1182 */
1183
1184 #ifdef __x86_64__
1185 /* Setup per-cpu normal_pdes */
1186 extern pd_entry_t * const normal_pdes[];
1187 for (i = 0;i < PTP_LEVELS - 1;i++) {
1188 ci->ci_normal_pdes[i] = normal_pdes[i];
1189 }
1190 #endif
1191
1192 if (ci == &cpu_info_primary)
1193 return;
1194
1195 KASSERT(ci != NULL);
1196
1197 #if defined(i386)
1198 cpu_alloc_l3_page(ci);
1199 KASSERT(ci->ci_pae_l3_pdirpa != 0);
1200
1201 /* Initialise L2 entries 0 - 2: Point them to pmap_kernel() */
1202 for (i = 0; i < PDP_SIZE - 1; i++) {
1203 ci->ci_pae_l3_pdir[i] =
1204 xpmap_ptom_masked(pmap_kernel()->pm_pdirpa[i]) | PTE_P;
1205 }
1206 #endif
1207
1208 ci->ci_kpm_pdir = (pd_entry_t *)uvm_km_alloc(kernel_map, PAGE_SIZE, 0,
1209 UVM_KMF_WIRED | UVM_KMF_ZERO | UVM_KMF_NOWAIT);
1210
1211 if (ci->ci_kpm_pdir == NULL) {
1212 panic("%s: failed to allocate L4 per-cpu PD for CPU %d\n",
1213 __func__, cpu_index(ci));
1214 }
1215 ci->ci_kpm_pdirpa = vtophys((vaddr_t)ci->ci_kpm_pdir);
1216 KASSERT(ci->ci_kpm_pdirpa != 0);
1217
1218 #ifdef __x86_64__
1219 extern pt_entry_t xpmap_pg_nx;
1220
1221 /* Copy over the pmap_kernel() shadow L4 entries */
1222 memcpy(ci->ci_kpm_pdir, pmap_kernel()->pm_pdir, PAGE_SIZE);
1223
1224 /* Recursive kernel mapping */
1225 ci->ci_kpm_pdir[PDIR_SLOT_PTE] = xpmap_ptom_masked(ci->ci_kpm_pdirpa)
1226 | PTE_P | xpmap_pg_nx;
1227 #else
1228 /* Copy over the pmap_kernel() shadow L2 entries */
1229 memcpy(ci->ci_kpm_pdir, pmap_kernel()->pm_pdir + PDIR_SLOT_KERN,
1230 nkptp[PTP_LEVELS - 1] * sizeof(pd_entry_t));
1231 #endif
1232
1233 /* Xen wants a RO pdir. */
1234 pmap_protect(pmap_kernel(), (vaddr_t)ci->ci_kpm_pdir,
1235 (vaddr_t)ci->ci_kpm_pdir + PAGE_SIZE, VM_PROT_READ);
1236 pmap_update(pmap_kernel());
1237
1238 #ifdef __x86_64__
1239 xpq_queue_pin_l4_table(xpmap_ptom_masked(ci->ci_kpm_pdirpa));
1240 #else
1241 /*
1242 * Initialize L3 entry 3. This mapping is shared across all pmaps and is
1243 * static, ie: loading a new pmap will not update this entry.
1244 */
1245 ci->ci_pae_l3_pdir[3] = xpmap_ptom_masked(ci->ci_kpm_pdirpa) | PTE_P;
1246
1247 /* Xen wants a RO L3. */
1248 pmap_protect(pmap_kernel(), (vaddr_t)ci->ci_pae_l3_pdir,
1249 (vaddr_t)ci->ci_pae_l3_pdir + PAGE_SIZE, VM_PROT_READ);
1250 pmap_update(pmap_kernel());
1251
1252 xpq_queue_pin_l3_table(xpmap_ptom_masked(ci->ci_pae_l3_pdirpa));
1253 #endif
1254 }
1255
1256 /*
1257 * Notify all other cpus to halt.
1258 */
1259
1260 void
1261 cpu_broadcast_halt(void)
1262 {
1263 xen_broadcast_ipi(XEN_IPI_HALT);
1264 }
1265
1266 /*
1267 * Send a dummy ipi to a cpu, and raise an AST on the running LWP.
1268 */
1269
1270 void
1271 cpu_kick(struct cpu_info *ci)
1272 {
1273 (void)xen_send_ipi(ci, XEN_IPI_AST);
1274 }
1275