      1 /* $NetBSD: cpu.c,v 1.139 2020/07/14 00:45:53 yamaguchi Exp $ */
2
3 /*-
4 * Copyright (c) 2000 The NetBSD Foundation, Inc.
5 * Copyright (c) 2002, 2006, 2007 YAMAMOTO Takashi,
6 * All rights reserved.
7 *
8 * This code is derived from software contributed to The NetBSD Foundation
9 * by RedBack Networks Inc.
10 *
11 * Author: Bill Sommerfeld
12 *
13 * Redistribution and use in source and binary forms, with or without
14 * modification, are permitted provided that the following conditions
15 * are met:
16 * 1. Redistributions of source code must retain the above copyright
17 * notice, this list of conditions and the following disclaimer.
18 * 2. Redistributions in binary form must reproduce the above copyright
19 * notice, this list of conditions and the following disclaimer in the
20 * documentation and/or other materials provided with the distribution.
21 *
22 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
23 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
24 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
25 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
26 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
27 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
28 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
29 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
30 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
31 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
32 * POSSIBILITY OF SUCH DAMAGE.
33 */
34
35 /*
36 * Copyright (c) 1999 Stefan Grefen
37 *
38 * Redistribution and use in source and binary forms, with or without
39 * modification, are permitted provided that the following conditions
40 * are met:
41 * 1. Redistributions of source code must retain the above copyright
42 * notice, this list of conditions and the following disclaimer.
43 * 2. Redistributions in binary form must reproduce the above copyright
44 * notice, this list of conditions and the following disclaimer in the
45 * documentation and/or other materials provided with the distribution.
46 * 3. All advertising materials mentioning features or use of this software
47 * must display the following acknowledgement:
48 * This product includes software developed by the NetBSD
49 * Foundation, Inc. and its contributors.
50 * 4. Neither the name of The NetBSD Foundation nor the names of its
51 * contributors may be used to endorse or promote products derived
52 * from this software without specific prior written permission.
53 *
54 * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND ANY
55 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
56 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
57 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR AND CONTRIBUTORS BE LIABLE
58 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
59 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
60 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
61 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
62 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
63 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
64 * SUCH DAMAGE.
65 */
66
67 #include <sys/cdefs.h>
68 __KERNEL_RCSID(0, "$NetBSD: cpu.c,v 1.139 2020/07/14 00:45:53 yamaguchi Exp $");
69
70 #include "opt_ddb.h"
71 #include "opt_multiprocessor.h"
72 #include "opt_mpbios.h" /* for MPDEBUG */
73 #include "opt_mtrr.h"
74 #include "opt_xen.h"
75
76 #include "lapic.h"
77 #include "ioapic.h"
78
79 #include <sys/param.h>
80 #include <sys/proc.h>
81 #include <sys/systm.h>
82 #include <sys/device.h>
83 #include <sys/kmem.h>
84 #include <sys/cpu.h>
85 #include <sys/cpufreq.h>
86 #include <sys/atomic.h>
87 #include <sys/reboot.h>
88 #include <sys/idle.h>
89
90 #include <uvm/uvm.h>
91
92 #include <machine/cpu.h>
93 #include <machine/cpufunc.h>
94 #include <machine/cpuvar.h>
95 #include <machine/pmap.h>
96 #include <machine/vmparam.h>
97 #include <machine/mpbiosvar.h>
98 #include <machine/pcb.h>
99 #include <machine/specialreg.h>
100 #include <machine/segments.h>
101 #include <machine/gdt.h>
102 #include <machine/mtrr.h>
103 #include <machine/pio.h>
104
105 #include <x86/fpu.h>
106
107 #include <xen/xen.h>
108 #include <xen/include/public/vcpu.h>
109 #include <xen/vcpuvar.h>
110
111 #if NLAPIC > 0
112 #include <machine/apicvar.h>
113 #include <machine/i82489reg.h>
114 #include <machine/i82489var.h>
115 #endif
116
117 #include <dev/ic/mc146818reg.h>
118 #include <dev/isa/isareg.h>
119
120 static int cpu_match(device_t, cfdata_t, void *);
121 static void cpu_attach(device_t, device_t, void *);
122 static void cpu_defer(device_t);
123 static int cpu_rescan(device_t, const char *, const int *);
124 static void cpu_childdetached(device_t, device_t);
125 static int vcpu_match(device_t, cfdata_t, void *);
126 static void vcpu_attach(device_t, device_t, void *);
127 static void cpu_attach_common(device_t, device_t, void *);
128 void cpu_offline_md(void);
129
130 struct cpu_softc {
131 device_t sc_dev; /* device tree glue */
132 struct cpu_info *sc_info; /* pointer to CPU info */
133 bool sc_wasonline;
134 };
135
136 int mp_cpu_start(struct cpu_info *, vaddr_t);
137 void mp_cpu_start_cleanup(struct cpu_info *);
138 const struct cpu_functions mp_cpu_funcs = { mp_cpu_start, NULL,
139 mp_cpu_start_cleanup };
140
141 CFATTACH_DECL2_NEW(cpu, sizeof(struct cpu_softc),
142 cpu_match, cpu_attach, NULL, NULL, cpu_rescan, cpu_childdetached);
143
144 CFATTACH_DECL_NEW(vcpu, sizeof(struct cpu_softc),
145 vcpu_match, vcpu_attach, NULL, NULL);
146
147 /*
148 * Statically-allocated CPU info for the primary CPU (or the only
149 * CPU, on uniprocessors). The CPU info list is initialized to
150 * point at it.
151 */
152 struct cpu_info cpu_info_primary __aligned(CACHE_LINE_SIZE) = {
153 .ci_dev = 0,
154 .ci_self = &cpu_info_primary,
155 .ci_idepth = -1,
156 .ci_curlwp = &lwp0,
157 .ci_curldt = -1,
158 };
159 struct cpu_info phycpu_info_primary __aligned(CACHE_LINE_SIZE) = {
160 .ci_dev = 0,
161 .ci_self = &phycpu_info_primary,
162 };
163
164 struct cpu_info *cpu_info_list = &cpu_info_primary;
165 struct cpu_info *phycpu_info_list = &phycpu_info_primary;
166
167 uint32_t cpu_feature[7] __read_mostly; /* X86 CPUID feature bits
168 * [0] basic features %edx
169 * [1] basic features %ecx
170 * [2] extended features %edx
171 * [3] extended features %ecx
172 * [4] VIA padlock features
173 * [5] structured extended features cpuid.7:%ebx
174 * [6] structured extended features cpuid.7:%ecx
175 */
176
177 bool x86_mp_online;
178 paddr_t mp_trampoline_paddr = MP_TRAMPOLINE;
179
180 #if defined(MULTIPROCESSOR)
181 void cpu_hatch(void *);
182 static void cpu_boot_secondary(struct cpu_info *ci);
183 static void cpu_start_secondary(struct cpu_info *ci);
184 #endif /* MULTIPROCESSOR */
185
186 static int
187 cpu_match(device_t parent, cfdata_t match, void *aux)
188 {
189
190 return 1;
191 }
192
193 static void
194 cpu_attach(device_t parent, device_t self, void *aux)
195 {
196 struct cpu_softc *sc = device_private(self);
197 struct cpu_attach_args *caa = aux;
198 struct cpu_info *ci;
199 uintptr_t ptr;
200 static int nphycpu = 0;
201
202 sc->sc_dev = self;
203
 204 	/*
 205 	 * If we're the first attached (physical) CPU, use the statically
 206 	 * allocated primary cpu_info; otherwise allocate a new cpu_info
 207 	 * and append it to the list.
 208 	 */
209 aprint_naive("\n");
210 aprint_normal("\n");
211 if (nphycpu > 0) {
212 struct cpu_info *tmp;
213 ptr = (uintptr_t)kmem_zalloc(sizeof(*ci) + CACHE_LINE_SIZE - 1,
214 KM_SLEEP);
215 ci = (struct cpu_info *)roundup2(ptr, CACHE_LINE_SIZE);
216 ci->ci_curldt = -1;
217
218 tmp = phycpu_info_list;
219 while (tmp->ci_next)
220 tmp = tmp->ci_next;
221
222 tmp->ci_next = ci;
223 } else {
224 ci = &phycpu_info_primary;
225 }
226
227 ci->ci_self = ci;
228 sc->sc_info = ci;
229
230 ci->ci_dev = self;
231 ci->ci_acpiid = caa->cpu_id;
232 ci->ci_cpuid = caa->cpu_number;
233 ci->ci_vcpu = NULL;
234 ci->ci_index = nphycpu++;
235 ci->ci_kfpu_spl = -1;
236
237 if (!pmf_device_register(self, NULL, NULL))
238 aprint_error_dev(self, "couldn't establish power handler\n");
239
240 (void)config_defer(self, cpu_defer);
241 }
242
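/*
 * Deferred attach hook: rescan for child devices once the initial
 * autoconfiguration pass has completed.
 */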
243 static void
244 cpu_defer(device_t self)
245 {
246 cpu_rescan(self, NULL, NULL);
247 }
248
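/*
 * Attach children on the "cpufeaturebus" interface attribute;
 * currently only the "frequency" (cpufreq) child is handled.
 */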
249 static int
250 cpu_rescan(device_t self, const char *ifattr, const int *locators)
251 {
252 struct cpu_softc *sc = device_private(self);
253 struct cpufeature_attach_args cfaa;
254 struct cpu_info *ci = sc->sc_info;
255
256 memset(&cfaa, 0, sizeof(cfaa));
257 cfaa.ci = ci;
258
259 if (ifattr_match(ifattr, "cpufeaturebus")) {
260
261 if (ci->ci_frequency == NULL) {
262 cfaa.name = "frequency";
263 ci->ci_frequency = config_found_ia(self,
264 "cpufeaturebus", &cfaa, NULL);
265 }
266 }
267
268 return 0;
269 }
270
271 static void
272 cpu_childdetached(device_t self, device_t child)
273 {
274 struct cpu_softc *sc = device_private(self);
275 struct cpu_info *ci = sc->sc_info;
276
277 if (ci->ci_frequency == child)
278 ci->ci_frequency = NULL;
279 }
280
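/*
 * Probe for a virtual CPU by asking the hypervisor for its runstate;
 * -ENOENT from the hypercall means no such vcpu exists.
 */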
281 static int
282 vcpu_match(device_t parent, cfdata_t match, void *aux)
283 {
284 struct vcpu_attach_args *vcaa = aux;
285 struct vcpu_runstate_info vcr;
286 int error;
287
288 if (strcmp(vcaa->vcaa_name, match->cf_name) == 0) {
289 error = HYPERVISOR_vcpu_op(VCPUOP_get_runstate_info,
290 vcaa->vcaa_caa.cpu_number, &vcr);
291 switch (error) {
292 case 0:
293 return 1;
294 case -ENOENT:
295 return 0;
296 default:
297 panic("Unknown hypervisor error %d returned on vcpu runstate probe\n", error);
298 }
299 }
300
301 return 0;
302 }
303
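/*
 * Attach a virtual CPU: hook up the MP start/cleanup functions and
 * run the common CPU attachment code.
 */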
304 static void
305 vcpu_attach(device_t parent, device_t self, void *aux)
306 {
307 struct vcpu_attach_args *vcaa = aux;
308
309 KASSERT(vcaa->vcaa_caa.cpu_func == NULL);
310 vcaa->vcaa_caa.cpu_func = &mp_cpu_funcs;
311 cpu_attach_common(parent, self, &vcaa->vcaa_caa);
312
313 if (!pmf_device_register(self, NULL, NULL))
314 aprint_error_dev(self, "couldn't establish power handler\n");
315 }
316
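/* Ask the hypervisor whether the given vcpu is currently up. */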
317 static int
318 vcpu_is_up(struct cpu_info *ci)
319 {
320 KASSERT(ci != NULL);
321 return HYPERVISOR_vcpu_op(VCPUOP_is_up, ci->ci_vcpuid, NULL);
322 }
323
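/*
 * Derive the number of page colors from the largest cache reported
 * for this CPU and initialize per-CPU pmap state.
 */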
324 static void
325 cpu_vm_init(struct cpu_info *ci)
326 {
327 int ncolors = 2, i;
328
329 for (i = CAI_ICACHE; i <= CAI_L2CACHE; i++) {
330 struct x86_cache_info *cai;
331 int tcolors;
332
333 cai = &ci->ci_cinfo[i];
334
335 tcolors = atop(cai->cai_totalsize);
336 switch (cai->cai_associativity) {
337 case 0xff:
338 tcolors = 1; /* fully associative */
339 break;
340 case 0:
341 case 1:
342 break;
343 default:
344 tcolors /= cai->cai_associativity;
345 }
346 ncolors = uimax(ncolors, tcolors);
347 }
348
349 /*
350 * Knowing the size of the largest cache on this CPU, potentially
351 * re-color our pages.
352 */
353 aprint_debug_dev(ci->ci_dev, "%d page colors\n", ncolors);
354 uvm_page_recolor(ncolors);
355 pmap_tlb_cpu_init(ci);
356 #ifndef __HAVE_DIRECT_MAP
357 pmap_vpage_cpu_init(ci);
358 #endif
359 }
360
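/*
 * Common attachment for vcpus: set up this CPU's cpu_info, wire it to
 * the hypervisor's shared vcpu_info, and depending on its role either
 * record it as the boot/single processor or start it as an AP.
 */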
361 static void
362 cpu_attach_common(device_t parent, device_t self, void *aux)
363 {
364 struct cpu_softc *sc = device_private(self);
365 struct cpu_attach_args *caa = aux;
366 struct cpu_info *ci;
367 uintptr_t ptr;
368 int cpunum = caa->cpu_number;
369 static bool again = false;
370
371 sc->sc_dev = self;
372
373 /*
374 * If we're an Application Processor, allocate a cpu_info
375 * structure, otherwise use the primary's.
376 */
377 if (caa->cpu_role == CPU_ROLE_AP) {
378 aprint_naive(": Application Processor\n");
379 ptr = (uintptr_t)kmem_alloc(sizeof(*ci) + CACHE_LINE_SIZE - 1,
380 KM_SLEEP);
381 ci = (struct cpu_info *)roundup2(ptr, CACHE_LINE_SIZE);
382 memset(ci, 0, sizeof(*ci));
383 cpu_init_tss(ci);
384 } else {
385 aprint_naive(": %s Processor\n",
386 caa->cpu_role == CPU_ROLE_SP ? "Single" : "Boot");
387 ci = &cpu_info_primary;
388 }
389
390 ci->ci_self = ci;
391 sc->sc_info = ci;
392 ci->ci_dev = self;
393 ci->ci_cpuid = cpunum;
394 ci->ci_vcpuid = cpunum;
395 ci->ci_kfpu_spl = -1;
396
397 KASSERT(HYPERVISOR_shared_info != NULL);
398 KASSERT(cpunum < XEN_LEGACY_MAX_VCPUS);
399 ci->ci_vcpu = &HYPERVISOR_shared_info->vcpu_info[cpunum];
400
401 KASSERT(ci->ci_func == 0);
402 ci->ci_func = caa->cpu_func;
403 aprint_normal("\n");
404
405 /* Must be called before mi_cpu_attach(). */
406 cpu_vm_init(ci);
407
408 if (caa->cpu_role == CPU_ROLE_AP) {
409 int error;
410
411 error = mi_cpu_attach(ci);
412
413 KASSERT(ci->ci_data.cpu_idlelwp != NULL);
414 if (error != 0) {
415 aprint_error_dev(self,
416 "mi_cpu_attach failed with %d\n", error);
417 return;
418 }
419
420 } else {
421 KASSERT(ci->ci_data.cpu_idlelwp != NULL);
422 }
423
424 KASSERT(ci->ci_cpuid == ci->ci_index);
425 #ifdef __x86_64__
426 /* No user PGD mapped for this CPU yet */
427 ci->ci_xen_current_user_pgd = 0;
428 #endif
429 mutex_init(&ci->ci_kpm_mtx, MUTEX_DEFAULT, IPL_VM);
430 pmap_reference(pmap_kernel());
431 ci->ci_pmap = pmap_kernel();
432 ci->ci_tlbstate = TLBSTATE_STALE;
433
434 /*
435 * Boot processor may not be attached first, but the below
436 * must be done to allow booting other processors.
437 */
438 if (!again) {
439 atomic_or_32(&ci->ci_flags, CPUF_PRESENT | CPUF_PRIMARY);
440 /* Basic init. */
441 cpu_intr_init(ci);
442 cpu_get_tsc_freq(ci);
443 cpu_init(ci);
444 pmap_cpu_init_late(ci);
445
446 /* Every processor needs to init its own ipi h/w (similar to lapic) */
447 xen_ipi_init();
448
449 /* Make sure DELAY() is initialized. */
450 DELAY(1);
451 again = true;
452 }
453
454 /* further PCB init done later. */
455
456 switch (caa->cpu_role) {
457 case CPU_ROLE_SP:
458 atomic_or_32(&ci->ci_flags, CPUF_SP);
459 cpu_identify(ci);
460 x86_cpu_idle_init();
461 break;
462
463 case CPU_ROLE_BP:
464 atomic_or_32(&ci->ci_flags, CPUF_BSP);
465 cpu_identify(ci);
466 x86_cpu_idle_init();
467 break;
468
469 case CPU_ROLE_AP:
470 atomic_or_32(&ci->ci_flags, CPUF_AP);
471
 472 		/*
 473 		 * Set up and start an Application Processor, then report on it.
 474 		 */
475
476 #if defined(MULTIPROCESSOR)
477 /* interrupt handler stack */
478 cpu_intr_init(ci);
479
480 /* Setup per-cpu memory for idt */
481 idt_vec_init_cpu_md(&ci->ci_idtvec, cpu_index(ci));
482
483 /* Setup per-cpu memory for gdt */
484 gdt_alloc_cpu(ci);
485
486 pmap_cpu_init_late(ci);
487 cpu_start_secondary(ci);
488
489 if (ci->ci_flags & CPUF_PRESENT) {
490 struct cpu_info *tmp;
491
492 cpu_identify(ci);
493 tmp = cpu_info_list;
494 while (tmp->ci_next)
495 tmp = tmp->ci_next;
496
497 tmp->ci_next = ci;
498 }
499 #else
500 aprint_error_dev(ci->ci_dev, "not started\n");
501 #endif
502 break;
503
504 default:
505 panic("unknown processor type??\n");
506 }
507
508 #ifdef MPVERBOSE
509 if (mp_verbose) {
510 struct lwp *l = ci->ci_data.cpu_idlelwp;
511 struct pcb *pcb = lwp_getpcb(l);
512
513 aprint_verbose_dev(self,
514 "idle lwp at %p, idle sp at %p\n",
515 l,
516 #ifdef i386
517 (void *)pcb->pcb_esp
518 #else
519 (void *)pcb->pcb_rsp
520 #endif
521 );
522
523 }
524 #endif /* MPVERBOSE */
525 }
526
527 /*
528 * Initialize the processor appropriately.
529 */
530
531 void
532 cpu_init(struct cpu_info *ci)
533 {
534 uint32_t cr4 = 0;
535
536 /*
 537 	 * If we have FXSAVE/FXRSTOR, use them.
538 */
539 if (cpu_feature[0] & CPUID_FXSR) {
540 cr4 |= CR4_OSFXSR;
541
542 /*
543 * If we have SSE/SSE2, enable XMM exceptions.
544 */
545 if (cpu_feature[0] & (CPUID_SSE|CPUID_SSE2))
546 cr4 |= CR4_OSXMMEXCPT;
547 }
548
549 /* If xsave is supported, enable it */
550 if (cpu_feature[1] & CPUID2_XSAVE && x86_fpu_save >= FPU_SAVE_XSAVE)
551 cr4 |= CR4_OSXSAVE;
552
553 if (cr4) {
554 cr4 |= rcr4();
555 lcr4(cr4);
556 }
557
558 if (x86_fpu_save >= FPU_SAVE_FXSAVE) {
559 fpuinit_mxcsr_mask();
560 }
561
562 /*
 563 	 * Changing the CR4 register may change the CPUID values. For
 564 	 * example, setting CR4_OSXSAVE sets CPUID2_OSXSAVE, which is in
 565 	 * ci_feat_val[1], so update that word here.
 566 	 * XXX Fields other than ci_feat_val[1] might change as well.
567 */
568 if (cpuid_level >= 1) {
569 u_int descs[4];
570
571 x86_cpuid(1, descs);
572 ci->ci_feat_val[1] = descs[2];
573 }
574
575 /* If xsave is enabled, enable all fpu features */
576 if (cr4 & CR4_OSXSAVE) {
577 wrxcr(0, x86_xsave_features & XCR0_FPU);
578 }
579
580 atomic_or_32(&ci->ci_flags, CPUF_RUNNING);
581 }
582
583
584 #ifdef MULTIPROCESSOR
585
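/*
 * Boot every attached application processor that is present, then
 * wait until they all show up in kcpuset_running.
 */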
586 void
587 cpu_boot_secondary_processors(void)
588 {
589 struct cpu_info *ci;
590 kcpuset_t *cpus;
591 u_long i;
592
593 kcpuset_create(&cpus, true);
594 kcpuset_set(cpus, cpu_index(curcpu()));
595 for (i = 0; i < maxcpus; i++) {
596 ci = cpu_lookup(i);
597 if (ci == NULL)
598 continue;
599 if (ci->ci_data.cpu_idlelwp == NULL)
600 continue;
601 if ((ci->ci_flags & CPUF_PRESENT) == 0)
602 continue;
603 if (ci->ci_flags & (CPUF_BSP|CPUF_SP|CPUF_PRIMARY))
604 continue;
605 cpu_boot_secondary(ci);
606 kcpuset_set(cpus, cpu_index(ci));
607 }
608 while (!kcpuset_match(cpus, kcpuset_running))
609 ;
610 kcpuset_destroy(cpus);
611
612 x86_mp_online = true;
613 }
614
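/* Record the current CR0 value in the idle lwp's PCB. */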
615 static void
616 cpu_init_idle_lwp(struct cpu_info *ci)
617 {
618 struct lwp *l = ci->ci_data.cpu_idlelwp;
619 struct pcb *pcb = lwp_getpcb(l);
620
621 pcb->pcb_cr0 = rcr0();
622 }
623
624 void
625 cpu_init_idle_lwps(void)
626 {
627 struct cpu_info *ci;
628 u_long i;
629
630 for (i = 0; i < maxcpus; i++) {
631 ci = cpu_lookup(i);
632 if (ci == NULL)
633 continue;
634 if (ci->ci_data.cpu_idlelwp == NULL)
635 continue;
636 if ((ci->ci_flags & CPUF_PRESENT) == 0)
637 continue;
638 cpu_init_idle_lwp(ci);
639 }
640 }
641
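/*
 * Start a secondary CPU at cpu_hatch() via CPU_STARTUP() and wait a
 * bounded time for it to set CPUF_PRESENT.
 */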
642 static void
643 cpu_start_secondary(struct cpu_info *ci)
644 {
645 int i;
646
647 aprint_debug_dev(ci->ci_dev, "starting\n");
648
649 ci->ci_curlwp = ci->ci_data.cpu_idlelwp;
650
651 if (CPU_STARTUP(ci, (vaddr_t) cpu_hatch) != 0) {
652 return;
653 }
654
 655 	/*
 656 	 * Wait for the new CPU to announce itself (CPUF_PRESENT).
 657 	 */
658 for (i = 100000; (!(ci->ci_flags & CPUF_PRESENT)) && i > 0; i--) {
659 delay(10);
660 }
661 if ((ci->ci_flags & CPUF_PRESENT) == 0) {
662 aprint_error_dev(ci->ci_dev, "failed to become ready\n");
663 #if defined(MPDEBUG) && defined(DDB)
664 printf("dropping into debugger; continue from here to resume boot\n");
665 Debugger();
666 #endif
667 }
668
669 CPU_START_CLEANUP(ci);
670 }
671
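/*
 * Release a hatched secondary CPU (set CPUF_GO) and wait a bounded
 * time for it to report CPUF_RUNNING.
 */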
672 void
673 cpu_boot_secondary(struct cpu_info *ci)
674 {
675 int i;
676 atomic_or_32(&ci->ci_flags, CPUF_GO);
677 for (i = 100000; (!(ci->ci_flags & CPUF_RUNNING)) && i > 0; i--) {
678 delay(10);
679 }
680 if ((ci->ci_flags & CPUF_RUNNING) == 0) {
681 aprint_error_dev(ci->ci_dev, "CPU failed to start\n");
682 #if defined(MPDEBUG) && defined(DDB)
683 printf("dropping into debugger; continue from here to resume boot\n");
684 Debugger();
685 #endif
686 }
687 }
688
689 /*
690 * APs end up here immediately after initialisation and VCPUOP_up in
691 * mp_cpu_start().
692 * At this point, we are running in the idle pcb/idle stack of the new
693 * CPU. This function jumps to the idle loop and starts looking for
694 * work.
695 */
696 extern void x86_64_tls_switch(struct lwp *);
697 void
698 cpu_hatch(void *v)
699 {
700 struct cpu_info *ci = (struct cpu_info *)v;
701 struct pcb *pcb;
702 int s, i;
703
704 /* Setup TLS and kernel GS/FS */
705 cpu_init_msrs(ci, true);
706 cpu_init_idt(ci);
707 gdt_init_cpu(ci);
708
709 cpu_probe(ci);
710
711 atomic_or_32(&ci->ci_flags, CPUF_PRESENT);
712
713 while ((ci->ci_flags & CPUF_GO) == 0) {
714 /* Don't use delay, boot CPU may be patching the text. */
715 for (i = 10000; i != 0; i--)
716 x86_pause();
717 }
718
719 /* Because the text may have been patched in x86_patch(). */
720 x86_flush();
721 tlbflushg();
722
723 KASSERT((ci->ci_flags & CPUF_RUNNING) == 0);
724
725 KASSERT(ci->ci_curlwp == ci->ci_data.cpu_idlelwp);
726 KASSERT(curlwp == ci->ci_data.cpu_idlelwp);
727 pcb = lwp_getpcb(curlwp);
728 pcb->pcb_cr3 = pmap_pdirpa(pmap_kernel(), 0);
729
730 xen_ipi_init();
731
732 xen_initclocks();
733
734 #ifdef __x86_64__
735 fpuinit(ci);
736 #endif
737
738 lldt(GSEL(GLDT_SEL, SEL_KPL));
739
740 cpu_init(ci);
741 cpu_get_tsc_freq(ci);
742
743 s = splhigh();
744 x86_enable_intr();
745 splx(s);
746
747 aprint_debug_dev(ci->ci_dev, "running\n");
748
749 KASSERT(ci->ci_curlwp == ci->ci_data.cpu_idlelwp);
750 idle_loop(NULL);
751 KASSERT(false);
752 }
753
754 #if defined(DDB)
755
756 #include <ddb/db_output.h>
757 #include <machine/db_machdep.h>
758
759 /*
760 * Dump CPU information from ddb.
761 */
762 void
763 cpu_debug_dump(void)
764 {
765 struct cpu_info *ci;
766 CPU_INFO_ITERATOR cii;
767
768 db_printf("addr dev id flags ipis curlwp\n");
769 for (CPU_INFO_FOREACH(cii, ci)) {
770 db_printf("%p %s %ld %x %x %10p\n",
771 ci,
772 ci->ci_dev == NULL ? "BOOT" : device_xname(ci->ci_dev),
773 (long)ci->ci_vcpuid,
774 ci->ci_flags, ci->ci_ipis,
775 ci->ci_curlwp);
776 }
777 }
778 #endif /* DDB */
779
780 #endif /* MULTIPROCESSOR */
781
782 extern void hypervisor_callback(void);
783 extern void failsafe_callback(void);
784 #ifdef __x86_64__
785 typedef void (vector)(void);
786 extern vector Xsyscall, Xsyscall32;
787 #endif
788
789 /*
 790  * Set up the "trampoline". On Xen, we set up nearly all CPU context
 791  * outside a trampoline, so we prototype and call targetip like so:
792 * void targetip(struct cpu_info *);
793 */
794
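/*
 * Convert the GDT pages into machine frame numbers for the hypervisor
 * and mark them read-only, as Xen requires for descriptor tables.
 */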
795 static void
796 gdt_prepframes(paddr_t *frames, vaddr_t base, uint32_t entries)
797 {
798 int i;
799 for (i = 0; i < entries; i++) {
800 frames[i] = ((paddr_t)xpmap_ptetomach(
801 (pt_entry_t *)(base + (i << PAGE_SHIFT)))) >> PAGE_SHIFT;
802
803 /* Mark Read-only */
804 pmap_pte_clearbits(kvtopte(base + (i << PAGE_SHIFT)),
805 PTE_W);
806 }
807 }
808
809 #ifdef __x86_64__
810 extern char *ldtstore;
811
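/*
 * Build the amd64 vcpu_guest_context used to start a secondary vcpu:
 * initial registers, GDT/LDT, kernel stack, control registers and
 * Xen callbacks, taken largely from the target CPU's idle lwp PCB.
 */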
812 static void
813 xen_init_amd64_vcpuctxt(struct cpu_info *ci, struct vcpu_guest_context *initctx,
814 void targetrip(struct cpu_info *))
815 {
816 /* page frames to point at GDT */
817 extern int gdt_size;
818 paddr_t frames[16];
819 psize_t gdt_ents;
820
821 struct lwp *l;
822 struct pcb *pcb;
823
824 volatile struct vcpu_info *vci;
825
826 KASSERT(ci != NULL);
827 KASSERT(ci != &cpu_info_primary);
828 KASSERT(initctx != NULL);
829 KASSERT(targetrip != NULL);
830
831 memset(initctx, 0, sizeof(*initctx));
832
833 gdt_ents = roundup(gdt_size, PAGE_SIZE) >> PAGE_SHIFT;
834 KASSERT(gdt_ents <= 16);
835
836 gdt_prepframes(frames, (vaddr_t)ci->ci_gdt, gdt_ents);
837
838 /* Initialise the vcpu context: We use idle_loop()'s pcb context. */
839
840 l = ci->ci_data.cpu_idlelwp;
841
842 KASSERT(l != NULL);
843 pcb = lwp_getpcb(l);
844 KASSERT(pcb != NULL);
845
846 /* resume with interrupts off */
847 vci = ci->ci_vcpu;
848 vci->evtchn_upcall_mask = 1;
849 xen_mb();
850
851 /* resume in kernel-mode */
852 initctx->flags = VGCF_in_kernel | VGCF_online;
853
854 /* Stack and entry points:
855 * We arrange for the stack frame for cpu_hatch() to
856 * appear as a callee frame of lwp_trampoline(). Being a
857 * leaf frame prevents trampling on any of the MD stack setup
858 * that x86/vm_machdep.c:cpu_lwp_fork() does for idle_loop()
859 */
860
861 initctx->user_regs.rdi = (uint64_t) ci; /* targetrip(ci); */
862 initctx->user_regs.rip = (vaddr_t) targetrip;
863
864 initctx->user_regs.cs = GSEL(GCODE_SEL, SEL_KPL);
865
866 initctx->user_regs.rflags = pcb->pcb_flags;
867 initctx->user_regs.rsp = pcb->pcb_rsp;
868
869 /* Data segments */
870 initctx->user_regs.ss = GSEL(GDATA_SEL, SEL_KPL);
871 initctx->user_regs.es = GSEL(GDATA_SEL, SEL_KPL);
872 initctx->user_regs.ds = GSEL(GDATA_SEL, SEL_KPL);
873
874 /* GDT */
875 memcpy(initctx->gdt_frames, frames, sizeof(frames));
876 initctx->gdt_ents = gdt_ents;
877
878 /* LDT */
879 initctx->ldt_base = (unsigned long)ldtstore;
880 initctx->ldt_ents = LDT_SIZE >> 3;
881
882 /* Kernel context state */
883 initctx->kernel_ss = GSEL(GDATA_SEL, SEL_KPL);
884 initctx->kernel_sp = pcb->pcb_rsp0;
885 initctx->ctrlreg[0] = pcb->pcb_cr0;
886 initctx->ctrlreg[1] = 0; /* "resuming" from kernel - no User cr3. */
887 initctx->ctrlreg[2] = (vaddr_t)targetrip;
888 /*
889 * Use pmap_kernel() L4 PD directly, until we setup the
890 * per-cpu L4 PD in pmap_cpu_init_late()
891 */
892 initctx->ctrlreg[3] = xen_pfn_to_cr3(x86_btop(xpmap_ptom(ci->ci_kpm_pdirpa)));
893 initctx->ctrlreg[4] = CR4_PAE | CR4_OSFXSR | CR4_OSXMMEXCPT;
894
895 /* Xen callbacks */
896 initctx->event_callback_eip = (unsigned long)hypervisor_callback;
897 initctx->failsafe_callback_eip = (unsigned long)failsafe_callback;
898 initctx->syscall_callback_eip = (unsigned long)Xsyscall;
899
900 return;
901 }
902 #else /* i386 */
903 extern union descriptor *ldtstore;
904 extern void Xsyscall(void);
905
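/* i386/PAE counterpart of the vcpu context setup above. */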
906 static void
907 xen_init_i386_vcpuctxt(struct cpu_info *ci, struct vcpu_guest_context *initctx,
908 void targeteip(struct cpu_info *))
909 {
910 /* page frames to point at GDT */
911 extern int gdt_size;
912 paddr_t frames[16];
913 psize_t gdt_ents;
914
915 struct lwp *l;
916 struct pcb *pcb;
917
918 volatile struct vcpu_info *vci;
919
920 KASSERT(ci != NULL);
921 KASSERT(ci != &cpu_info_primary);
922 KASSERT(initctx != NULL);
923 KASSERT(targeteip != NULL);
924
925 memset(initctx, 0, sizeof(*initctx));
926
927 gdt_ents = roundup(gdt_size, PAGE_SIZE) >> PAGE_SHIFT;
928 KASSERT(gdt_ents <= 16);
929
930 gdt_prepframes(frames, (vaddr_t)ci->ci_gdt, gdt_ents);
931
932 /*
933 * Initialise the vcpu context:
934 * We use this cpu's idle_loop() pcb context.
935 */
936
937 l = ci->ci_data.cpu_idlelwp;
938
939 KASSERT(l != NULL);
940 pcb = lwp_getpcb(l);
941 KASSERT(pcb != NULL);
942
943 /* resume with interrupts off */
944 vci = ci->ci_vcpu;
945 vci->evtchn_upcall_mask = 1;
946 xen_mb();
947
948 /* resume in kernel-mode */
949 initctx->flags = VGCF_in_kernel | VGCF_online;
950
951 /* Stack frame setup for cpu_hatch():
952 * We arrange for the stack frame for cpu_hatch() to
953 * appear as a callee frame of lwp_trampoline(). Being a
954 * leaf frame prevents trampling on any of the MD stack setup
955 * that x86/vm_machdep.c:cpu_lwp_fork() does for idle_loop()
956 */
957
 958 	/* Leave room below the stack pointer for one word: arg1. */
 959 	initctx->user_regs.esp = pcb->pcb_esp - 4;
960 {
961 /* targeteip(ci); */
962 uint32_t *arg = (uint32_t *)initctx->user_regs.esp;
963 arg[1] = (uint32_t)ci; /* arg1 */
964 }
965
966 initctx->user_regs.eip = (vaddr_t)targeteip;
967 initctx->user_regs.cs = GSEL(GCODE_SEL, SEL_KPL);
968 initctx->user_regs.eflags |= pcb->pcb_iopl;
969
970 /* Data segments */
971 initctx->user_regs.ss = GSEL(GDATA_SEL, SEL_KPL);
972 initctx->user_regs.es = GSEL(GDATA_SEL, SEL_KPL);
973 initctx->user_regs.ds = GSEL(GDATA_SEL, SEL_KPL);
974 initctx->user_regs.fs = GSEL(GDATA_SEL, SEL_KPL);
975
976 /* GDT */
977 memcpy(initctx->gdt_frames, frames, sizeof(frames));
978 initctx->gdt_ents = gdt_ents;
979
980 /* LDT */
981 initctx->ldt_base = (unsigned long)ldtstore;
982 initctx->ldt_ents = NLDT;
983
984 /* Kernel context state */
985 initctx->kernel_ss = GSEL(GDATA_SEL, SEL_KPL);
986 initctx->kernel_sp = pcb->pcb_esp0;
987 initctx->ctrlreg[0] = pcb->pcb_cr0;
988 initctx->ctrlreg[1] = 0; /* "resuming" from kernel - no User cr3. */
989 initctx->ctrlreg[2] = (vaddr_t)targeteip;
990 initctx->ctrlreg[3] = xen_pfn_to_cr3(x86_btop(xpmap_ptom(ci->ci_pae_l3_pdirpa)));
991 initctx->ctrlreg[4] = /* CR4_PAE | */CR4_OSFXSR | CR4_OSXMMEXCPT;
992
993 /* Xen callbacks */
994 initctx->event_callback_eip = (unsigned long)hypervisor_callback;
995 initctx->event_callback_cs = GSEL(GCODE_SEL, SEL_KPL);
996 initctx->failsafe_callback_eip = (unsigned long)failsafe_callback;
997 initctx->failsafe_callback_cs = GSEL(GCODE_SEL, SEL_KPL);
998
999 return;
1000 }
1001 #endif /* __x86_64__ */
1002
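/*
 * Initialise the target vcpu's context with the hypervisor and bring
 * it up so that it starts executing at `target' (cpu_hatch).
 */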
1003 int
1004 mp_cpu_start(struct cpu_info *ci, vaddr_t target)
1005 {
1006 int hyperror;
1007 struct vcpu_guest_context *vcpuctx;
1008
1009 KASSERT(ci != NULL);
1010 KASSERT(ci != &cpu_info_primary);
1011 KASSERT(ci->ci_flags & CPUF_AP);
1012
1013 vcpuctx = kmem_alloc(sizeof(*vcpuctx), KM_SLEEP);
1014
1015 #ifdef __x86_64__
1016 xen_init_amd64_vcpuctxt(ci, vcpuctx, (void (*)(struct cpu_info *))target);
1017 #else
1018 xen_init_i386_vcpuctxt(ci, vcpuctx, (void (*)(struct cpu_info *))target);
1019 #endif
1020
1021 /* Initialise the given vcpu to execute cpu_hatch(ci); */
1022 if ((hyperror = HYPERVISOR_vcpu_op(VCPUOP_initialise, ci->ci_vcpuid, vcpuctx))) {
1023 aprint_error(": context initialisation failed. errno = %d\n", hyperror);
1024 goto out;
1025 }
1026
1027 /* Start it up */
1028
1029 /* First bring it down */
1030 if ((hyperror = HYPERVISOR_vcpu_op(VCPUOP_down, ci->ci_vcpuid, NULL))) {
1031 aprint_error(": VCPUOP_down hypervisor command failed. errno = %d\n", hyperror);
1032 goto out;
1033 }
1034
1035 if ((hyperror = HYPERVISOR_vcpu_op(VCPUOP_up, ci->ci_vcpuid, NULL))) {
1036 aprint_error(": VCPUOP_up hypervisor command failed. errno = %d\n", hyperror);
1037 goto out;
1038 }
1039
1040 if (!vcpu_is_up(ci)) {
1041 aprint_error(": did not come up\n");
1042 hyperror = -1;
1043 goto out;
1044 }
1045
1046 out:
1047 kmem_free(vcpuctx, sizeof(*vcpuctx));
1048 return hyperror;
1049 }
1050
1051 void
1052 mp_cpu_start_cleanup(struct cpu_info *ci)
1053 {
1054 if (vcpu_is_up(ci)) {
1055 aprint_debug_dev(ci->ci_dev, "is started.\n");
1056 } else {
1057 aprint_error_dev(ci->ci_dev, "did not start up.\n");
1058 }
1059 }
1060
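/*
 * On amd64, point the kernel GS base at our cpu_info via the
 * hypervisor; enable the NX bit in EFER when the CPU supports it.
 */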
1061 void
1062 cpu_init_msrs(struct cpu_info *ci, bool full)
1063 {
1064 #ifdef __x86_64__
1065 if (full) {
1066 HYPERVISOR_set_segment_base(SEGBASE_FS, 0);
1067 HYPERVISOR_set_segment_base(SEGBASE_GS_KERNEL, (uint64_t)ci);
1068 HYPERVISOR_set_segment_base(SEGBASE_GS_USER, 0);
1069 }
1070 #endif
1071
1072 if (cpu_feature[2] & CPUID_NOX)
1073 wrmsr(MSR_EFER, rdmsr(MSR_EFER) | EFER_NXE);
1074 }
1075
1076 void
1077 cpu_offline_md(void)
1078 {
1079 return;
1080 }
1081
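/*
 * Compute the TSC frequency from the vcpu_time_info that the
 * hypervisor publishes (tsc_to_system_mul and tsc_shift).
 */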
1082 void
1083 cpu_get_tsc_freq(struct cpu_info *ci)
1084 {
1085 uint32_t vcpu_tversion;
1086 const volatile vcpu_time_info_t *tinfo = &ci->ci_vcpu->time;
1087
1088 vcpu_tversion = tinfo->version;
1089 while (tinfo->version == vcpu_tversion); /* Wait for a time update. XXX: timeout ? */
1090
1091 uint64_t freq = 1000000000ULL << 32;
1092 freq = freq / (uint64_t)tinfo->tsc_to_system_mul;
1093 if (tinfo->tsc_shift < 0)
1094 freq = freq << -tinfo->tsc_shift;
1095 else
1096 freq = freq >> tinfo->tsc_shift;
1097 ci->ci_data.cpu_cc_freq = freq;
1098 }
1099
1100 /*
1101  * Load the given (non-kernel) pmap on the current CPU.
1102 */
1103 void
1104 cpu_load_pmap(struct pmap *pmap, struct pmap *oldpmap)
1105 {
1106 struct cpu_info *ci = curcpu();
1107 cpuid_t cid = cpu_index(ci);
1108 int i;
1109
1110 KASSERT(pmap != pmap_kernel());
1111
1112 mutex_enter(&ci->ci_kpm_mtx);
1113 /* make new pmap visible to xen_kpm_sync() */
1114 kcpuset_atomic_set(pmap->pm_xen_ptp_cpus, cid);
1115
1116 #ifdef __x86_64__
1117 pd_entry_t *new_pgd;
1118 paddr_t l4_pd_ma;
1119
1120 l4_pd_ma = xpmap_ptom_masked(ci->ci_kpm_pdirpa);
1121
1122 /*
1123 	 * Copy the user address range into the per-CPU kernel L4 and
1124 	 * load the user cr3.
1125 */
1126 new_pgd = pmap->pm_pdir;
1127 KASSERT(pmap == ci->ci_pmap);
1128
1129 /* Copy user pmap L4 PDEs (in user addr. range) to per-cpu L4 */
1130 for (i = 0; i < PDIR_SLOT_USERLIM; i++) {
1131 KASSERT(pmap != pmap_kernel() || new_pgd[i] == 0);
1132 if (ci->ci_kpm_pdir[i] != new_pgd[i]) {
1133 xpq_queue_pte_update(l4_pd_ma + i * sizeof(pd_entry_t),
1134 new_pgd[i]);
1135 }
1136 }
1137
1138 xen_set_user_pgd(pmap_pdirpa(pmap, 0));
1139 ci->ci_xen_current_user_pgd = pmap_pdirpa(pmap, 0);
1140 #else
1141 paddr_t l3_pd = xpmap_ptom_masked(ci->ci_pae_l3_pdirpa);
1142 /* don't update the kernel L3 slot */
1143 for (i = 0; i < PDP_SIZE - 1; i++) {
1144 xpq_queue_pte_update(l3_pd + i * sizeof(pd_entry_t),
1145 xpmap_ptom(pmap->pm_pdirpa[i]) | PTE_P);
1146 }
1147 #endif
1148
1149 tlbflush();
1150
1151 /* old pmap no longer visible to xen_kpm_sync() */
1152 if (oldpmap != pmap_kernel()) {
1153 kcpuset_atomic_clear(oldpmap->pm_xen_ptp_cpus, cid);
1154 }
1155 mutex_exit(&ci->ci_kpm_mtx);
1156 }
1157
1158 /*
1159 * pmap_cpu_init_late: perform late per-CPU initialization.
1160 *
1161 * Short note about percpu PDIR pages. Both the PAE and __x86_64__ architectures
1162 * have per-cpu PDIR tables, for two different reasons:
1163 * - on PAE, this is to get around Xen's pagetable setup constraints (multiple
1164 * L3[3]s cannot point to the same L2 - Xen will refuse to pin a table set up
1165 * this way).
1166 * - on __x86_64__, this is for multiple CPUs to map in different user pmaps
1167 * (see cpu_load_pmap()).
1168 *
1169 * What this means for us is that the PDIR of the pmap_kernel() is considered
1170 * to be a canonical "SHADOW" PDIR with the following properties:
1171 * - its recursive mapping points to itself
1172 * - per-cpu recursive mappings point to themselves on __x86_64__
1173 * - per-cpu L4 pages' kernel entries are expected to be in sync with
1174 * the shadow
1175 */
1176
1177 void
1178 pmap_cpu_init_late(struct cpu_info *ci)
1179 {
1180 int i;
1181
1182 /*
1183 	 * The BP already has its own PD page, allocated during early
1184 	 * MD startup.
1185 */
1186
1187 #ifdef __x86_64__
1188 	/* Set up the per-cpu normal_pdes pointers. */
1189 	extern pd_entry_t * const normal_pdes[];
1190 	for (i = 0; i < PTP_LEVELS - 1; i++) {
1191 ci->ci_normal_pdes[i] = normal_pdes[i];
1192 }
1193 #endif
1194
1195 if (ci == &cpu_info_primary)
1196 return;
1197
1198 KASSERT(ci != NULL);
1199
1200 #if defined(i386)
1201 cpu_alloc_l3_page(ci);
1202 KASSERT(ci->ci_pae_l3_pdirpa != 0);
1203
1204 	/* Initialise L3 entries 0 - 2: point them at pmap_kernel()'s L2 pages */
1205 for (i = 0; i < PDP_SIZE - 1; i++) {
1206 ci->ci_pae_l3_pdir[i] =
1207 xpmap_ptom_masked(pmap_kernel()->pm_pdirpa[i]) | PTE_P;
1208 }
1209 #endif
1210
1211 ci->ci_kpm_pdir = (pd_entry_t *)uvm_km_alloc(kernel_map, PAGE_SIZE, 0,
1212 UVM_KMF_WIRED | UVM_KMF_ZERO | UVM_KMF_NOWAIT);
1213
1214 if (ci->ci_kpm_pdir == NULL) {
1215 panic("%s: failed to allocate L4 per-cpu PD for CPU %d\n",
1216 __func__, cpu_index(ci));
1217 }
1218 ci->ci_kpm_pdirpa = vtophys((vaddr_t)ci->ci_kpm_pdir);
1219 KASSERT(ci->ci_kpm_pdirpa != 0);
1220
1221 #ifdef __x86_64__
1222 extern pt_entry_t xpmap_pg_nx;
1223
1224 /* Copy over the pmap_kernel() shadow L4 entries */
1225 memcpy(ci->ci_kpm_pdir, pmap_kernel()->pm_pdir, PAGE_SIZE);
1226
1227 /* Recursive kernel mapping */
1228 ci->ci_kpm_pdir[PDIR_SLOT_PTE] = xpmap_ptom_masked(ci->ci_kpm_pdirpa)
1229 | PTE_P | xpmap_pg_nx;
1230 #else
1231 /* Copy over the pmap_kernel() shadow L2 entries */
1232 memcpy(ci->ci_kpm_pdir, pmap_kernel()->pm_pdir + PDIR_SLOT_KERN,
1233 nkptp[PTP_LEVELS - 1] * sizeof(pd_entry_t));
1234 #endif
1235
1236 /* Xen wants a RO pdir. */
1237 pmap_protect(pmap_kernel(), (vaddr_t)ci->ci_kpm_pdir,
1238 (vaddr_t)ci->ci_kpm_pdir + PAGE_SIZE, VM_PROT_READ);
1239 pmap_update(pmap_kernel());
1240
1241 #ifdef __x86_64__
1242 xpq_queue_pin_l4_table(xpmap_ptom_masked(ci->ci_kpm_pdirpa));
1243 #else
1244 /*
1245 * Initialize L3 entry 3. This mapping is shared across all pmaps and is
1246 	 * static, i.e. loading a new pmap will not update this entry.
1247 */
1248 ci->ci_pae_l3_pdir[3] = xpmap_ptom_masked(ci->ci_kpm_pdirpa) | PTE_P;
1249
1250 /* Xen wants a RO L3. */
1251 pmap_protect(pmap_kernel(), (vaddr_t)ci->ci_pae_l3_pdir,
1252 (vaddr_t)ci->ci_pae_l3_pdir + PAGE_SIZE, VM_PROT_READ);
1253 pmap_update(pmap_kernel());
1254
1255 xpq_queue_pin_l3_table(xpmap_ptom_masked(ci->ci_pae_l3_pdirpa));
1256 #endif
1257 }
1258
1259 /*
1260 * Notify all other cpus to halt.
1261 */
1262
1263 void
1264 cpu_broadcast_halt(void)
1265 {
1266 xen_broadcast_ipi(XEN_IPI_HALT);
1267 }
1268
1269 /*
1270 * Send a dummy ipi to a cpu, and raise an AST on the running LWP.
1271 */
1272
1273 void
1274 cpu_kick(struct cpu_info *ci)
1275 {
1276 (void)xen_send_ipi(ci, XEN_IPI_AST);
1277 }
1278