/*	$NetBSD: intr.c,v 1.169 2024/09/11 05:17:45 mrg Exp $	*/

/*
 * Copyright (c) 2007, 2008, 2009, 2019 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Andrew Doran, and by Jason R. Thorpe.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Copyright 2002 (c) Wasabi Systems, Inc.
 * All rights reserved.
 *
 * Written by Frank van der Linden for Wasabi Systems, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *      This product includes software developed for the NetBSD Project by
 *      Wasabi Systems, Inc.
 * 4. The name of Wasabi Systems, Inc. may not be used to endorse
 *    or promote products derived from this software without specific prior
 *    written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL WASABI SYSTEMS, INC
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*-
 * Copyright (c) 1991 The Regents of the University of California.
 * All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * William Jolitz.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)isa.c	7.2 (Berkeley) 5/13/91
 */

/*-
 * Copyright (c) 1993, 1994 Charles Hannum.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)isa.c	7.2 (Berkeley) 5/13/91
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: intr.c,v 1.169 2024/09/11 05:17:45 mrg Exp $");

#include "opt_acpi.h"
#include "opt_intrdebug.h"
#include "opt_multiprocessor.h"
#include "opt_pci.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/syslog.h>
#include <sys/device.h>
#include <sys/kmem.h>
#include <sys/proc.h>
#include <sys/errno.h>
#include <sys/intr.h>
#include <sys/cpu.h>
#include <sys/xcall.h>
#include <sys/interrupt.h>
#include <sys/reboot.h> /* for AB_VERBOSE */
#include <sys/sdt.h>

#include <sys/kauth.h>
#include <sys/conf.h>

#include <uvm/uvm_extern.h>

#include <machine/i8259.h>
#include <machine/pio.h>

#include <x86/intr_private.h>

#include "ioapic.h"
#include "lapic.h"
#include "pci.h"
#include "acpica.h"
#ifndef XENPV
#include "hyperv.h"
#if NHYPERV > 0
#include <dev/hyperv/hypervvar.h>

extern void Xresume_hyperv_hypercall(void);
extern void Xrecurse_hyperv_hypercall(void);
#endif
#endif

#if NIOAPIC > 0 || NACPICA > 0
#include <machine/i82093var.h>
#include <machine/mpbiosvar.h>
#include <machine/mpacpi.h>
#endif

#if NLAPIC > 0
#include <machine/i82489var.h>
#endif

#if NPCI > 0
#include <dev/pci/ppbreg.h>
#endif

#include <x86/pci/msipic.h>
#include <x86/pci/pci_msi_machdep.h>

#if NPCI == 0 || !defined(__HAVE_PCI_MSI_MSIX)
#define msipic_is_msi_pic(PIC)	(false)
#endif

#include <ddb/db_active.h>

#ifdef DDB
#include <ddb/db_output.h>
#endif

#ifdef INTRDEBUG
#define DPRINTF(msg) printf msg
#else
#define DPRINTF(msg)
#endif

static SIMPLEQ_HEAD(, intrsource) io_interrupt_sources =
	SIMPLEQ_HEAD_INITIALIZER(io_interrupt_sources);

static kmutex_t intr_distribute_lock;

static int intr_allocate_slot_cpu(struct cpu_info *, struct pic *, int, int *,
				  struct intrsource *);
static int __noinline intr_allocate_slot(struct pic *, int, int,
					 struct cpu_info **, int *, int *,
					 struct intrsource *);

static void intr_source_free(struct cpu_info *, int, struct pic *, int);

static void intr_establish_xcall(void *, void *);
static void intr_disestablish_xcall(void *, void *);

static const char *legacy_intr_string(int, char *, size_t, struct pic *);

static const char *xen_intr_string(int, char *, size_t, struct pic *);

#if defined(INTRSTACKSIZE)
static inline bool redzone_const_or_false(bool);
static inline int redzone_const_or_zero(int);
#endif

static void intr_redistribute_xc_t(void *, void *);
static void intr_redistribute_xc_s1(void *, void *);
static void intr_redistribute_xc_s2(void *, void *);
static bool intr_redistribute(struct cpu_info *);
static struct intrsource *intr_get_io_intrsource(const char *);
static void intr_free_io_intrsource_direct(struct intrsource *);
static int intr_num_handlers(struct intrsource *);
static int intr_find_unused_slot(struct cpu_info *, int *);
static void intr_activate_xcall(void *, void *);
static void intr_deactivate_xcall(void *, void *);
static void intr_get_affinity(struct intrsource *, kcpuset_t *);
static int intr_set_affinity(struct intrsource *, const kcpuset_t *);

SDT_PROBE_DEFINE3(sdt, kernel, intr, entry,
    "int (*)(void *)"/*func*/,
    "void *"/*arg*/,
    "struct intrhand *"/*ih*/);
SDT_PROBE_DEFINE4(sdt, kernel, intr, return,
    "int (*)(void *)"/*func*/,
    "void *"/*arg*/,
    "struct intrhand *"/*ih*/,
    "int"/*handled*/);
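
/*
 * Illustrative sketch (not part of the build): the probes above can be
 * observed with dtrace(1).  The probe tuple comes from the definitions
 * above; the one-liner itself is only an example, counting entries per
 * handler address:
 *
 *	# dtrace -n 'sdt:kernel:intr:entry { @[arg0] = count(); }'
 */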

/*
 * Fill in the default interrupt table (in case of a spurious interrupt
 * during kernel configuration) and set up the interrupt control unit.
 */
void
intr_default_setup(void)
{
	struct idt_vec *iv = &(cpu_info_primary.ci_idtvec);
	int i;

	/* icu vectors */
	for (i = 0; i < NUM_LEGACY_IRQS; i++) {
		idt_vec_reserve(iv, ICU_OFFSET + i);
		idt_vec_set(iv, ICU_OFFSET + i, legacy_stubs[i].ist_entry);
	}

	/*
	 * Eventually might want to check if it's actually there.
	 */
	i8259_default_setup();

	mutex_init(&intr_distribute_lock, MUTEX_DEFAULT, IPL_NONE);
}

/*
 * Handle an NMI, possibly a machine check: all we do here is log the
 * event.
 */
void
x86_nmi(void)
{

	log(LOG_CRIT, "NMI port 61 %x, port 70 %x\n", inb(0x61), inb(0x70));
}

/*
 * Create an interrupt id such as "ioapic0 pin 9". This interrupt id is used
 * by MI code and intrctl(8).
 */
const char *
intr_create_intrid(int legacy_irq, struct pic *pic, int pin, char *buf,
    size_t len)
{
	int ih = 0;

#if NPCI > 0
#if defined(__HAVE_PCI_MSI_MSIX)
	if ((pic->pic_type == PIC_MSI) || (pic->pic_type == PIC_MSIX)) {
		uint64_t pih;
		int dev, vec;

		dev = msipic_get_devid(pic);
		vec = pin;
		pih = __SHIFTIN((uint64_t)dev, MSI_INT_DEV_MASK)
			| __SHIFTIN((uint64_t)vec, MSI_INT_VEC_MASK)
			| APIC_INT_VIA_MSI;
		if (pic->pic_type == PIC_MSI)
			MSI_INT_MAKE_MSI(pih);
		else if (pic->pic_type == PIC_MSIX)
			MSI_INT_MAKE_MSIX(pih);

		return x86_pci_msi_string(NULL, pih, buf, len);
	}
#endif /* __HAVE_PCI_MSI_MSIX */
#endif

	if (pic->pic_type == PIC_XEN) {
		ih = pin;	/* Port == pin */
		return xen_intr_string(pin, buf, len, pic);
	}

	/*
	 * If the device is PCI, "legacy_irq" is always -1. The least
	 * significant 8 bits of "ih" are used only by intr_string() to
	 * show the IRQ number. A "legacy" device (such as the floppy
	 * controller) should not use intr_string().
	 */
	if (pic->pic_type == PIC_I8259) {
		ih = legacy_irq;
		return legacy_intr_string(ih, buf, len, pic);
	}

#if NIOAPIC > 0 || NACPICA > 0
	ih = ((pic->pic_apicid << APIC_INT_APIC_SHIFT) & APIC_INT_APIC_MASK)
	    | ((pin << APIC_INT_PIN_SHIFT) & APIC_INT_PIN_MASK);
	if (pic->pic_type == PIC_IOAPIC) {
		ih |= APIC_INT_VIA_APIC;
	}
	ih |= pin;
	return intr_string(ih, buf, len);
#endif

	return NULL; /* No pic found! */
}
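
/*
 * Illustrative examples of intrids produced above (unit, vector, and
 * pin numbers are machine-dependent sketches): "msix2 vec 0" for
 * MSI-X, "ioapic0 pin 9" for the I/O APIC, and "xenev0 chan 5" for a
 * Xen event channel.
 */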

/*
 * Find intrsource from io_interrupt_sources list.
 */
static struct intrsource *
intr_get_io_intrsource(const char *intrid)
{
	struct intrsource *isp;

	KASSERT(mutex_owned(&cpu_lock));

	SIMPLEQ_FOREACH(isp, &io_interrupt_sources, is_list) {
		KASSERT(isp->is_intrid != NULL);
		if (strncmp(intrid, isp->is_intrid, INTRIDBUF - 1) == 0)
			return isp;
	}
	return NULL;
}

/*
 * Allocate intrsource and add to io_interrupt_sources list.
 */
struct intrsource *
intr_allocate_io_intrsource(const char *intrid)
{
	CPU_INFO_ITERATOR cii;
	struct cpu_info *ci;
	struct intrsource *isp;
	struct percpu_evcnt *pep;

	KASSERT(mutex_owned(&cpu_lock));

	if (intrid == NULL)
		return NULL;

	isp = kmem_zalloc(sizeof(*isp), KM_SLEEP);
	pep = kmem_zalloc(sizeof(*pep) * ncpu, KM_SLEEP);
	isp->is_saved_evcnt = pep;
	for (CPU_INFO_FOREACH(cii, ci)) {
		pep->cpuid = ci->ci_cpuid;
		pep++;
	}
	strlcpy(isp->is_intrid, intrid, sizeof(isp->is_intrid));

	SIMPLEQ_INSERT_TAIL(&io_interrupt_sources, isp, is_list);

	return isp;
}
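
/*
 * Illustrative sketch of an MI caller (hypothetical intrid, not part
 * of this file): the allocation must happen with cpu_lock held, per
 * the KASSERT above.
 *
 *	mutex_enter(&cpu_lock);
 *	isp = intr_allocate_io_intrsource("ioapic0 pin 9");
 *	mutex_exit(&cpu_lock);
 */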

/*
 * Remove the intrsource from the io_interrupt_sources list and free it,
 * given a pointer to it.
 */
static void
intr_free_io_intrsource_direct(struct intrsource *isp)
{
	KASSERT(mutex_owned(&cpu_lock));

	SIMPLEQ_REMOVE(&io_interrupt_sources, isp, intrsource, is_list);

	/* Is this interrupt established? */
	if (isp->is_evname[0] != '\0') {
		evcnt_detach(&isp->is_evcnt);
		isp->is_evname[0] = '\0';
	}

	kmem_free(isp->is_saved_evcnt,
	    sizeof(*(isp->is_saved_evcnt)) * ncpu);

	kmem_free(isp, sizeof(*isp));
}

/*
 * Remove the intrsource from the io_interrupt_sources list and free it,
 * given its interrupt id.  This function can be used by MI code.
 */
void
intr_free_io_intrsource(const char *intrid)
{
	struct intrsource *isp;

	KASSERT(mutex_owned(&cpu_lock));

	if (intrid == NULL)
		return;

	if ((isp = intr_get_io_intrsource(intrid)) == NULL) {
		return;
	}

	/* If handlers are still attached (shared IRQ), don't free yet. */
	if (isp->is_handlers != NULL) {
		return;
	}

	intr_free_io_intrsource_direct(isp);
}

static int
intr_allocate_slot_cpu(struct cpu_info *ci, struct pic *pic, int pin,
		       int *index, struct intrsource *chained)
{
	int slot, i;
	struct intrsource *isp;

	KASSERT(mutex_owned(&cpu_lock));

	if (pic == &i8259_pic) {
		KASSERT(CPU_IS_PRIMARY(ci));
		slot = pin;
	} else {
		int start = 0;
		int max = MAX_INTR_SOURCES;
		slot = -1;

		/* avoid reserved slots for legacy interrupts. */
		if (CPU_IS_PRIMARY(ci) && msipic_is_msi_pic(pic))
			start = NUM_LEGACY_IRQS;
		/* don't step over Xen's slots */
		if (vm_guest == VM_GUEST_XENPVH)
			max = SIR_XENIPL_VM;
		/*
		 * intr_allocate_slot has checked for an existing mapping.
		 * Now look for a free slot.
		 */
		for (i = start; i < max; i++) {
			if (ci->ci_isources[i] == NULL) {
				slot = i;
				break;
			}
		}
		if (slot == -1) {
			return EBUSY;
		}
	}

	isp = ci->ci_isources[slot];
	if (isp == NULL) {
		const char *via;

		isp = chained;
		KASSERT(isp != NULL);
		if (pic->pic_type == PIC_MSI || pic->pic_type == PIC_MSIX)
			via = "vec";
		else
			via = "pin";
		snprintf(isp->is_evname, sizeof(isp->is_evname),
		    "%s %d", via, pin);
		evcnt_attach_dynamic(&isp->is_evcnt, EVCNT_TYPE_INTR, NULL,
		    pic->pic_name, isp->is_evname);
		isp->is_active_cpu = ci->ci_cpuid;
		ci->ci_isources[slot] = isp;
	}

	*index = slot;
	return 0;
}

/*
 * A simple round-robin allocator to assign interrupts to CPUs.
 */
static int __noinline
intr_allocate_slot(struct pic *pic, int pin, int level,
		   struct cpu_info **cip, int *index, int *idt_slot,
		   struct intrsource *chained)
{
	CPU_INFO_ITERATOR cii;
	struct cpu_info *ci, *lci;
	struct intrsource *isp;
	int slot = 0, idtvec, error;

	KASSERT(mutex_owned(&cpu_lock));

	/* First check if this pin is already used by an interrupt vector. */
	for (CPU_INFO_FOREACH(cii, ci)) {
		for (slot = 0; slot < MAX_INTR_SOURCES; slot++) {
			if ((isp = ci->ci_isources[slot]) == NULL) {
				continue;
			}
			if (isp->is_pic == pic &&
			    pin != -1 && isp->is_pin == pin) {
				*idt_slot = isp->is_idtvec;
				*index = slot;
				*cip = ci;
				return 0;
			}
		}
	}

	/*
	 * The pic/pin combination doesn't have an existing mapping.
	 * Find a slot for a new interrupt source.  For the i8259 case,
	 * we always use reserved slots of the primary CPU.  Otherwise,
	 * we make an attempt to balance the interrupt load.
	 *
	 * PIC and APIC usage are essentially exclusive, so the reservation
	 * of the ISA slots is ignored when assigning IOAPIC slots.
	 */
	if (pic == &i8259_pic) {
		/*
		 * Must be directed to BP.
		 */
		ci = &cpu_info_primary;
		error = intr_allocate_slot_cpu(ci, pic, pin, &slot, chained);
	} else {
		/*
		 * Find least loaded AP/BP and try to allocate there.
		 */
		ci = NULL;
		for (CPU_INFO_FOREACH(cii, lci)) {
			if ((lci->ci_schedstate.spc_flags & SPCF_NOINTR) != 0) {
				continue;
			}
#if 0
			if (ci == NULL ||
			    ci->ci_nintrhand > lci->ci_nintrhand) {
				ci = lci;
			}
#else
			ci = &cpu_info_primary;
#endif
		}
		KASSERT(ci != NULL);
		error = intr_allocate_slot_cpu(ci, pic, pin, &slot, chained);

		/*
		 * If that did not work, allocate anywhere.
		 */
		if (error != 0) {
			for (CPU_INFO_FOREACH(cii, ci)) {
				if ((ci->ci_schedstate.spc_flags &
				    SPCF_NOINTR) != 0) {
					continue;
				}
				error = intr_allocate_slot_cpu(ci, pic,
				    pin, &slot, chained);
				if (error == 0) {
					break;
				}
			}
		}
	}
	if (error != 0) {
		return error;
	}
	KASSERT(ci != NULL);

	/*
	 * Now allocate an IDT vector.
	 * For the 8259 these are reserved up front.
	 */
	if (pic == &i8259_pic) {
		idtvec = ICU_OFFSET + pin;
	} else {
		/*
		 * TODO to support MSI (not MSI-X) multiple vectors
		 *
		 * PCI Local Bus Specification Revision 3.0 says the devices
		 * which use MSI multiple vectors increment the low order bits
		 * of MSI message data.
		 * On the other hand, Intel SDM "10.11.2 Message Data Register
		 * Format" says the 7:0 bits of MSI message data mean Interrupt
		 * Descriptor Table(IDT) vector.
		 * As a result of these two documents, the IDT vectors used
		 * by a device using MSI multiple vectors must be contiguous.
		 */
		struct idt_vec *iv;

		iv = idt_vec_ref(&ci->ci_idtvec);
		idtvec = idt_vec_alloc(iv, APIC_LEVEL(level), IDT_INTR_HIGH);
	}
	if (idtvec < 0) {
		evcnt_detach(&ci->ci_isources[slot]->is_evcnt);
		ci->ci_isources[slot]->is_evname[0] = '\0';
		ci->ci_isources[slot] = NULL;
		return EBUSY;
	}
	ci->ci_isources[slot]->is_idtvec = idtvec;
	*idt_slot = idtvec;
	*index = slot;
	*cip = ci;
	return 0;
}

static void
intr_source_free(struct cpu_info *ci, int slot, struct pic *pic, int idtvec)
{
	struct intrsource *isp;
	struct idt_vec *iv;

	isp = ci->ci_isources[slot];
	iv = idt_vec_ref(&ci->ci_idtvec);

	if (isp->is_handlers != NULL)
		return;
	ci->ci_isources[slot] = NULL;
	if (pic != &i8259_pic)
		idt_vec_free(iv, idtvec);

	isp->is_recurse = NULL;
	isp->is_resume = NULL;
}

#ifdef MULTIPROCESSOR
static int intr_biglock_wrapper(void *);
static int intr_wrapper(void *);

/*
 * intr_wrapper: perform diagnostic checks before and after calling the
 * real handler.
 * intr_biglock_wrapper: grab the biglock and call the real interrupt
 * handler.
 */

static int
intr_wrapper(void *vp)
{
	struct intrhand *ih = vp;
	struct lwp *l = curlwp;
	int locks;
	int nopreempt;
	int ret;

	locks = curcpu()->ci_biglock_count;
	nopreempt = l->l_nopreempt;
	SDT_PROBE3(sdt, kernel, intr, entry,
	    ih->ih_realfun, ih->ih_realarg, ih);
	ret = (*ih->ih_realfun)(ih->ih_realarg);
	SDT_PROBE4(sdt, kernel, intr, return,
	    ih->ih_realfun, ih->ih_realarg, ih, ret);
	KASSERTMSG(locks == curcpu()->ci_biglock_count,
	    "%s @ %p slipped locks %d -> %d",
	    ih->ih_xname, ih->ih_realfun, locks, curcpu()->ci_biglock_count);
	KASSERTMSG(nopreempt == l->l_nopreempt,
	    "%s @ %p slipped nopreempt %d -> %d lwp %p/%p func %p",
	    ih->ih_xname, ih->ih_realfun, nopreempt, l->l_nopreempt, l, curlwp,
	    ih->ih_realfun);

	return ret;
}

static int
intr_biglock_wrapper(void *vp)
{
	int ret;

	KERNEL_LOCK(1, NULL);

	ret = intr_wrapper(vp);

	KERNEL_UNLOCK_ONE(NULL);

	return ret;
}

#endif /* MULTIPROCESSOR */

#ifdef KDTRACE_HOOKS
static int
intr_kdtrace_wrapper(void *vp)
{
	struct intrhand *ih = vp;
	struct lwp *l = curlwp;
	int ret;

	int nopreempt;
	nopreempt = l->l_nopreempt;
	SDT_PROBE3(sdt, kernel, intr, entry,
	    ih->ih_realfun, ih->ih_realarg, ih);
	ret = (*ih->ih_realfun)(ih->ih_realarg);
	SDT_PROBE4(sdt, kernel, intr, return,
	    ih->ih_realfun, ih->ih_realarg, ih, ret);
	KASSERTMSG(nopreempt == l->l_nopreempt,
	    "%s @ %p slipped nopreempt %d -> %d  lwp %p/%p",
	    ih->ih_xname, ih->ih_realfun, nopreempt, l->l_nopreempt, l, curlwp);

	return ret;
}
#endif

/*
 * Append the device name to the intrsource. If device A and device B
 * share an IRQ number, the device name of the interrupt id becomes
 * "device A, device B".
 */
static void
intr_append_intrsource_xname(struct intrsource *isp, const char *xname)
{

	if (isp->is_xname[0] != '\0')
		strlcat(isp->is_xname, ", ", sizeof(isp->is_xname));
	strlcat(isp->is_xname, xname, sizeof(isp->is_xname));
}

/*
 * Called on the bound CPU to handle calling pic_hwunmask from contexts
 * that are not already running on the bound CPU.
 *
 * => caller (on initiating CPU) holds cpu_lock on our behalf
 * => arg1: struct intrhand *ih
 */
static void
intr_hwunmask_xcall(void *arg1, void *arg2)
{
	struct intrhand * const ih = arg1;
	struct cpu_info * const ci = ih->ih_cpu;

	KASSERT(ci == curcpu() || !mp_online);

	const u_long psl = x86_read_psl();
	x86_disable_intr();

	struct intrsource * const source = ci->ci_isources[ih->ih_slot];
	struct pic * const pic = source->is_pic;

	if (source->is_mask_count == 0) {
		(*pic->pic_hwunmask)(pic, ih->ih_pin);
	}

	x86_write_psl(psl);
}

/*
 * Handle the per-CPU component of interrupt establishment.
 *
 * => caller (on initiating CPU) holds cpu_lock on our behalf
 * => arg1: struct intrhand *ih
 * => arg2: int idt_vec
 */
static void
intr_establish_xcall(void *arg1, void *arg2)
{
	struct idt_vec *iv;
	struct intrsource *source;
	struct intrstub *stubp;
	struct intrhand *ih;
	struct cpu_info *ci;
	int idt_vec;
	u_long psl;

	ih = arg1;

	KASSERT(ih->ih_cpu == curcpu() || !mp_online);

	ci = ih->ih_cpu;
	source = ci->ci_isources[ih->ih_slot];
	idt_vec = (int)(intptr_t)arg2;
	iv = idt_vec_ref(&ci->ci_idtvec);

	/* Disable interrupts locally. */
	psl = x86_read_psl();
	x86_disable_intr();

	/* Link in the handler and re-calculate masks. */
	*(ih->ih_prevp) = ih;
	x86_intr_calculatemasks(ci);

	/* Hook in new IDT vector and SPL state. */
	if (source->is_resume == NULL || source->is_idtvec != idt_vec) {
		if (source->is_idtvec != 0 && source->is_idtvec != idt_vec)
			idt_vec_free(iv, source->is_idtvec);
		source->is_idtvec = idt_vec;
		if (source->is_type == IST_LEVEL) {
			stubp = &source->is_pic->pic_level_stubs[ih->ih_slot];
		} else {
			stubp = &source->is_pic->pic_edge_stubs[ih->ih_slot];
		}
		source->is_resume = stubp->ist_resume;
		source->is_recurse = stubp->ist_recurse;
		idt_vec_set(iv, idt_vec, stubp->ist_entry);
	}

	/* Re-enable interrupts locally. */
	x86_write_psl(psl);
}

void *
intr_establish_xname(int legacy_irq, struct pic *pic, int pin, int type,
		     int level, int (*handler)(void *), void *arg,
		     bool known_mpsafe, const char *xname)
{
	struct intrhand **p, *q, *ih;
	struct cpu_info *ci;
	int slot, error, idt_vec;
	struct intrsource *chained, *source;
#ifdef MULTIPROCESSOR
	bool mpsafe = (known_mpsafe || level != IPL_VM);
#endif /* MULTIPROCESSOR */
	uint64_t where;
	const char *intrstr;
	char intrstr_buf[INTRIDBUF];

	KASSERTMSG((legacy_irq == -1 || (0 <= legacy_irq && legacy_irq < 16)),
	    "bad legacy IRQ value: %d", legacy_irq);
	KASSERTMSG((legacy_irq != -1 || pic != &i8259_pic),
	    "non-legacy IRQ on i8259");

	ih = kmem_alloc(sizeof(*ih), KM_SLEEP);
	intrstr = intr_create_intrid(legacy_irq, pic, pin, intrstr_buf,
	    sizeof(intrstr_buf));
	KASSERT(intrstr != NULL);

	mutex_enter(&cpu_lock);

	/* Allocate an intrsource if one does not already exist. */
	chained = intr_get_io_intrsource(intrstr);
	if (chained == NULL) {
		if (msipic_is_msi_pic(pic)) {
			mutex_exit(&cpu_lock);
			kmem_free(ih, sizeof(*ih));
			printf("%s: %s has no intrsource\n", __func__, intrstr);
			return NULL;
		}
		chained = intr_allocate_io_intrsource(intrstr);
		if (chained == NULL) {
			mutex_exit(&cpu_lock);
			kmem_free(ih, sizeof(*ih));
			printf("%s: can't allocate io_intrsource\n", __func__);
			return NULL;
		}
	}

	error = intr_allocate_slot(pic, pin, level, &ci, &slot, &idt_vec,
	    chained);
	if (error != 0) {
		intr_free_io_intrsource_direct(chained);
		mutex_exit(&cpu_lock);
		kmem_free(ih, sizeof(*ih));
		printf("failed to allocate interrupt slot for PIC %s pin %d\n",
		    pic->pic_name, pin);
		return NULL;
	}

	source = ci->ci_isources[slot];

	if (source->is_handlers != NULL &&
	    source->is_pic->pic_type != pic->pic_type) {
		intr_free_io_intrsource_direct(chained);
		mutex_exit(&cpu_lock);
		kmem_free(ih, sizeof(*ih));
		printf("%s: can't share intr source between "
		       "different PIC types (legacy_irq %d pin %d slot %d)\n",
		    __func__, legacy_irq, pin, slot);
		return NULL;
	}

	source->is_pin = pin;
	source->is_pic = pic;
	intr_append_intrsource_xname(source, xname);
	switch (source->is_type) {
	case IST_NONE:
		source->is_type = type;
		break;
	case IST_EDGE:
	case IST_LEVEL:
		if (source->is_type == type)
			break;
		/* FALLTHROUGH */
	case IST_PULSE:
		if (type != IST_NONE) {
			int otype = source->is_type;

			intr_source_free(ci, slot, pic, idt_vec);
			intr_free_io_intrsource_direct(chained);
			mutex_exit(&cpu_lock);
			kmem_free(ih, sizeof(*ih));
			printf("%s: pic %s pin %d: can't share "
			       "type %d with %d\n",
				__func__, pic->pic_name, pin,
				otype, type);
			return NULL;
		}
		break;
	default:
		panic("%s: bad intr type %d for pic %s pin %d\n",
		    __func__, source->is_type, pic->pic_name, pin);
		/* NOTREACHED */
	}

	/*
	 * If the interrupt being established shares its IRQ with existing
	 * handlers, keep using the intrsource already installed in
	 * "ci->ci_isources[slot]" rather than the one allocated by the
	 * establishing device's pci_intr_alloc() or by this function, and
	 * drop any now-redundant allocations below.
	 */
	if (source->is_handlers != NULL) {
		struct intrsource *isp, *nisp;

		SIMPLEQ_FOREACH_SAFE(isp, &io_interrupt_sources,
		    is_list, nisp) {
			if (strncmp(intrstr, isp->is_intrid, INTRIDBUF - 1) == 0
			    && isp->is_handlers == NULL)
				intr_free_io_intrsource_direct(isp);
		}
	}

	/*
	 * We're now committed.  Mask the interrupt in hardware and
	 * count it for load distribution.
	 */
	(*pic->pic_hwmask)(pic, pin);
	(ci->ci_nintrhand)++;

	/*
	 * Figure out where to put the handler.
	 * This is O(N^2), but we want to preserve the order, and N is
	 * generally small.
	 */
	for (p = &ci->ci_isources[slot]->is_handlers;
	     (q = *p) != NULL && q->ih_level > level;
	     p = &q->ih_next) {
		/* nothing */;
	}

	ih->ih_pic = pic;
	ih->ih_fun = ih->ih_realfun = handler;
	ih->ih_arg = ih->ih_realarg = arg;
	ih->ih_prevp = p;
	ih->ih_next = *p;
	ih->ih_level = level;
	ih->ih_pin = pin;
	ih->ih_cpu = ci;
	ih->ih_slot = slot;
	strlcpy(ih->ih_xname, xname, sizeof(ih->ih_xname));
#ifdef KDTRACE_HOOKS
	/*
	 * XXX i8254_clockintr is special -- takes a magic extra
	 * argument.  This should be fixed properly in some way that
	 * doesn't involve sketchy function pointer casts.  See also
	 * the comments in x86/isa/clock.c.
	 */
	if (handler != __FPTRCAST(int (*)(void *), i8254_clockintr)) {
		ih->ih_fun = intr_kdtrace_wrapper;
		ih->ih_arg = ih;
	}
#endif
#ifdef MULTIPROCESSOR
	if (!mpsafe) {
		KASSERT(handler !=			/* XXX */
		    __FPTRCAST(int (*)(void *), i8254_clockintr));
		ih->ih_fun = intr_biglock_wrapper;
		ih->ih_arg = ih;
	} else {
		if (handler !=
		    __FPTRCAST(int (*)(void *), i8254_clockintr)) { /* XXX */
#ifdef DIAGNOSTIC
			/* wrap all interrupts */
			ih->ih_fun = intr_wrapper;
			ih->ih_arg = ih;
#endif
		}
	}
#endif /* MULTIPROCESSOR */

	/*
	 * Call out to the remote CPU to update its interrupt state.
	 * Only make RPCs if the APs are up and running.
	 */
	if (ci == curcpu() || !mp_online) {
		intr_establish_xcall(ih, (void *)(intptr_t)idt_vec);
	} else {
		where = xc_unicast(0, intr_establish_xcall, ih,
		    (void *)(intptr_t)idt_vec, ci);
		xc_wait(where);
	}

	/* All set up, so add a route for the interrupt and unmask it. */
	(*pic->pic_addroute)(pic, ci, pin, idt_vec, type);
	if (ci == curcpu() || !mp_online) {
		intr_hwunmask_xcall(ih, NULL);
	} else {
		where = xc_unicast(0, intr_hwunmask_xcall, ih, NULL, ci);
		xc_wait(where);
	}
	mutex_exit(&cpu_lock);

	if ((boothowto & AB_VERBOSE) != 0 || cpu_index(ci) != 0)
		aprint_verbose("allocated pic %s type %s pin %d level %d to "
		    "%s slot %d idt entry %d\n",
		    pic->pic_name, type == IST_EDGE ? "edge" : "level", pin,
		    level, device_xname(ci->ci_dev), slot, idt_vec);

	return ih;
}

void *
intr_establish(int legacy_irq, struct pic *pic, int pin, int type, int level,
	       int (*handler)(void *), void *arg, bool known_mpsafe)
{

	return intr_establish_xname(legacy_irq, pic, pin, type,
	    level, handler, arg, known_mpsafe, "unknown");
}
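
/*
 * Illustrative sketch of a typical caller (hypothetical driver names,
 * not part of this file): a PCI network driver might establish its
 * MP-safe handler at IPL_NET like so:
 *
 *	sc->sc_ih = intr_establish_xname(-1, pic, pin, IST_LEVEL,
 *	    IPL_NET, mydrv_intr, sc, true, device_xname(sc->sc_dev));
 *	if (sc->sc_ih == NULL)
 *		aprint_error_dev(sc->sc_dev,
 *		    "couldn't establish interrupt\n");
 */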

/*
 * Called on the bound CPU to handle intr_mask() / intr_unmask().
 *
 * => caller (on initiating CPU) holds cpu_lock on our behalf
 * => arg1: struct intrhand *ih
 * => arg2: true -> mask, false -> unmask.
 */
static void
intr_mask_xcall(void *arg1, void *arg2)
{
	struct intrhand * const ih = arg1;
	const uintptr_t mask = (uintptr_t)arg2;
	struct cpu_info * const ci = ih->ih_cpu;
	bool force_pending = false;

	KASSERT(ci == curcpu() || !mp_online);

	/*
	 * We need to disable interrupts to hold off the interrupt
	 * vectors.
	 */
	const u_long psl = x86_read_psl();
	x86_disable_intr();

	struct intrsource * const source = ci->ci_isources[ih->ih_slot];
	struct pic * const pic = source->is_pic;

	if (mask) {
		source->is_mask_count++;
		KASSERT(source->is_mask_count != 0);
		if (source->is_mask_count == 1) {
			(*pic->pic_hwmask)(pic, ih->ih_pin);
		}
	} else {
		KASSERT(source->is_mask_count != 0);
		if (--source->is_mask_count == 0) {
			/*
			 * If this interrupt source is being moved, don't
			 * unmask it at the hw.
			 */
			if (! source->is_distribute_pending) {
				(*pic->pic_hwunmask)(pic, ih->ih_pin);
			}

			/*
			 * For level-sensitive interrupts, the hardware
			 * will let us know.  For everything else, we
			 * need to explicitly handle interrupts that
			 * happened when the source was masked.
			 */
			const uint64_t bit = (1ULL << ih->ih_slot);
			if (ci->ci_imasked & bit) {
				ci->ci_imasked &= ~bit;
				if (source->is_type != IST_LEVEL) {
					ci->ci_ipending |= bit;
					force_pending = true;
				}
			}
		}
	}

	/* Re-enable interrupts. */
	x86_write_psl(psl);

	if (force_pending) {
		/* Force processing of any pending interrupts. */
		splx(splhigh());
	}
}

static void
intr_mask_internal(struct intrhand * const ih, const bool mask)
{

	/*
	 * Call out to the remote CPU to update its interrupt state.
	 * Only make RPCs if the APs are up and running.
	 */
	mutex_enter(&cpu_lock);
	struct cpu_info * const ci = ih->ih_cpu;
	void * const mask_arg = (void *)(uintptr_t)mask;
	if (ci == curcpu() || !mp_online) {
		intr_mask_xcall(ih, mask_arg);
	} else {
		const uint64_t where =
		    xc_unicast(0, intr_mask_xcall, ih, mask_arg, ci);
		xc_wait(where);
	}
	mutex_exit(&cpu_lock);
}

void
intr_mask(struct intrhand *ih)
{

	if (cpu_intr_p()) {
		/*
		 * Special case of calling intr_mask() from an interrupt
		 * handler: we MUST be called from the bound CPU for this
		 * interrupt (presumably from a handler we're about to
		 * mask).
		 *
		 * We can't take the cpu_lock in this case, and we must
		 * therefore be extra careful.
		 */
		KASSERT(ih->ih_cpu == curcpu() || !mp_online);
		intr_mask_xcall(ih, (void *)(uintptr_t)true);
		return;
	}

	intr_mask_internal(ih, true);
}

void
intr_unmask(struct intrhand *ih)
{

	/*
	 * This is not safe to call from an interrupt context because
	 * we don't want to accidentally unmask an interrupt source
	 * that's masked because it's being serviced.
	 */
	KASSERT(!cpu_intr_p());
	intr_mask_internal(ih, false);
}
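
/*
 * Illustrative sketch (hypothetical driver, not part of this file):
 * a handler may mask its own source and defer the real work, e.g. to
 * a softint, which unmasks the source once it has serviced the device.
 * intr_mask() is legal here because the handler runs on the bound CPU;
 * intr_unmask() must not be called from interrupt context.
 *
 *	static int
 *	mydrv_intr(void *arg)			// hard interrupt context
 *	{
 *		struct mydrv_softc *sc = arg;
 *
 *		intr_mask(sc->sc_ih);
 *		softint_schedule(sc->sc_sih);
 *		return 1;
 *	}
 *
 *	static void
 *	mydrv_softintr(void *arg)		// softint (thread) context
 *	{
 *		struct mydrv_softc *sc = arg;
 *
 *		mydrv_process_events(sc);
 *		intr_unmask(sc->sc_ih);
 *	}
 */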

/*
 * Called on the bound CPU to handle intr_disestablish().
 *
 * => caller (on initiating CPU) holds cpu_lock on our behalf
 * => arg1: struct intrhand *ih
 * => arg2: unused
 */
static void
intr_disestablish_xcall(void *arg1, void *arg2)
{
	struct intrhand **p, *q;
	struct cpu_info *ci;
	struct pic *pic;
	struct intrsource *source;
	struct intrhand *ih;
	u_long psl;
	int idtvec;

	ih = arg1;
	ci = ih->ih_cpu;

	KASSERT(ci == curcpu() || !mp_online);

	/* Disable interrupts locally. */
	psl = x86_read_psl();
	x86_disable_intr();

	pic = ci->ci_isources[ih->ih_slot]->is_pic;
	source = ci->ci_isources[ih->ih_slot];
	idtvec = source->is_idtvec;

	(*pic->pic_hwmask)(pic, ih->ih_pin);

	/*
	 * ci_ipending is stable on the current CPU while interrupts are
	 * blocked, and we only need to synchronize with interrupt
	 * vectors on the same CPU, so no need for atomics or membars.
	 */
	ci->ci_ipending &= ~(1ULL << ih->ih_slot);

	/*
	 * Remove the handler from the chain.
	 */
	for (p = &source->is_handlers; (q = *p) != NULL && q != ih;
	     p = &q->ih_next)
		;
	if (q == NULL) {
		x86_write_psl(psl);
		panic("%s: handler not registered", __func__);
		/* NOTREACHED */
	}

	*p = q->ih_next;

	x86_intr_calculatemasks(ci);
	/*
	 * If no handlers remain, 1) remove the route, since the slot no
	 * longer has a source, and 2) don't unmask at the hardware, to
	 * prevent spurious interrupts.
	 *
	 * If handlers remain, 1) keep the route, since the slot still
	 * has a source, and 2) unmask at the hardware so interrupts can
	 * be delivered again.
	 */
	if (source->is_handlers == NULL)
		(*pic->pic_delroute)(pic, ci, ih->ih_pin, idtvec,
		    source->is_type);
	else if (source->is_mask_count == 0)
		(*pic->pic_hwunmask)(pic, ih->ih_pin);

	/* If the source is free we can drop it now. */
	intr_source_free(ci, ih->ih_slot, pic, idtvec);

	/* Re-enable interrupts. */
	x86_write_psl(psl);

	DPRINTF(("%s: remove slot %d (pic %s pin %d vec %d)\n",
	    device_xname(ci->ci_dev), ih->ih_slot, pic->pic_name,
	    ih->ih_pin, idtvec));
}

static int
intr_num_handlers(struct intrsource *isp)
{
	struct intrhand *ih;
	int num;

	num = 0;
	for (ih = isp->is_handlers; ih != NULL; ih = ih->ih_next)
		num++;

	return num;
}

/*
 * Deregister an interrupt handler.
 */
void
intr_disestablish(struct intrhand *ih)
{
	struct cpu_info *ci;
	struct intrsource *isp;
	uint64_t where;

	/*
	 * Count the removal for load balancing.
	 * Call out to the remote CPU to update its interrupt state.
	 * Only make RPCs if the APs are up and running.
	 */
	mutex_enter(&cpu_lock);
	ci = ih->ih_cpu;
	(ci->ci_nintrhand)--;
	KASSERT(ci->ci_nintrhand >= 0);
	isp = ci->ci_isources[ih->ih_slot];
	if (ci == curcpu() || !mp_online) {
		intr_disestablish_xcall(ih, NULL);
	} else {
		where = xc_unicast(0, intr_disestablish_xcall, ih, NULL, ci);
		xc_wait(where);
	}
	if (!msipic_is_msi_pic(isp->is_pic) && intr_num_handlers(isp) < 1) {
		intr_free_io_intrsource_direct(isp);
	}
	mutex_exit(&cpu_lock);
	kmem_free(ih, sizeof(*ih));
}

static const char *
xen_intr_string(int port, char *buf, size_t len, struct pic *pic)
{
	KASSERT(pic->pic_type == PIC_XEN);

	KASSERT(port >= 0);

	snprintf(buf, len, "%s chan %d", pic->pic_name, port);

	return buf;
}

static const char *
legacy_intr_string(int ih, char *buf, size_t len, struct pic *pic)
{
	int legacy_irq;

	KASSERT(pic->pic_type == PIC_I8259);
#if NLAPIC > 0
	KASSERT(APIC_IRQ_ISLEGACY(ih));

	legacy_irq = APIC_IRQ_LEGACY_IRQ(ih);
#else
	legacy_irq = ih;
#endif
	KASSERT(legacy_irq >= 0 && legacy_irq < 16);

	snprintf(buf, len, "%s pin %d", pic->pic_name, legacy_irq);

	return buf;
}

const char *
intr_string(intr_handle_t ih, char *buf, size_t len)
{
#if NIOAPIC > 0
	struct ioapic_softc *pic;
#endif

	if (ih == 0)
		panic("%s: bogus handle 0x%" PRIx64, __func__, ih);

#if NIOAPIC > 0
	if (ih & APIC_INT_VIA_APIC) {
		pic = ioapic_find(APIC_IRQ_APIC(ih));
		if (pic != NULL) {
			snprintf(buf, len, "%s pin %d",
			    device_xname(pic->sc_dev), APIC_IRQ_PIN(ih));
		} else {
			snprintf(buf, len,
			    "apic %d int %d (irq %d)",
			    APIC_IRQ_APIC(ih),
			    APIC_IRQ_PIN(ih),
			    APIC_IRQ_LEGACY_IRQ(ih));
		}
	} else
		snprintf(buf, len, "irq %d", APIC_IRQ_LEGACY_IRQ(ih));

#elif NLAPIC > 0
	snprintf(buf, len, "irq %d", APIC_IRQ_LEGACY_IRQ(ih));
#else
	snprintf(buf, len, "irq %d", (int) ih);
#endif
	return buf;
}
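
/*
 * Illustrative examples of the strings produced above (unit numbers
 * are machine-dependent sketches): "ioapic0 pin 9" when the handle
 * routes via an attached I/O APIC, "apic 1 int 4 (irq 10)" when the
 * ioapic device is not found, and "irq 7" for a plain legacy IRQ.
 */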

/*
 * Fake interrupt handler structures for the benefit of symmetry with
 * other interrupt sources, and the benefit of x86_intr_calculatemasks().
 */
struct intrhand fake_timer_intrhand;
struct intrhand fake_ipi_intrhand;
#if NHYPERV > 0
struct intrhand fake_hyperv_intrhand;
#endif

#if NLAPIC > 0 && defined(MULTIPROCESSOR)
static const char *x86_ipi_names[X86_NIPI] = X86_IPI_NAMES;
#endif

#if defined(INTRSTACKSIZE)
static inline bool
redzone_const_or_false(bool x)
{
#ifdef DIAGNOSTIC
	return x;
#else
	return false;
#endif /* !DIAGNOSTIC */
}

static inline int
redzone_const_or_zero(int x)
{
	return redzone_const_or_false(true) ? x : 0;
}
#endif

/*
 * Initialize all handlers that aren't dynamically allocated, and exist
 * for each CPU.
 */
void
cpu_intr_init(struct cpu_info *ci)
{
#if NLAPIC > 0
	struct intrsource *isp;
	static int first = 1;
#if defined(MULTIPROCESSOR)
	int i;
#endif

	isp = kmem_zalloc(sizeof(*isp), KM_SLEEP);
	isp->is_recurse = Xrecurse_lapic_ltimer;
	isp->is_resume = Xresume_lapic_ltimer;
	fake_timer_intrhand.ih_pic = &local_pic;
	fake_timer_intrhand.ih_level = IPL_CLOCK;
	isp->is_handlers = &fake_timer_intrhand;
	isp->is_pic = &local_pic;
	ci->ci_isources[LIR_TIMER] = isp;
	evcnt_attach_dynamic(&isp->is_evcnt,
	    first ? EVCNT_TYPE_INTR : EVCNT_TYPE_MISC, NULL,
	    device_xname(ci->ci_dev), "timer");
	first = 0;

#ifdef MULTIPROCESSOR
	isp = kmem_zalloc(sizeof(*isp), KM_SLEEP);
	isp->is_recurse = Xrecurse_lapic_ipi;
	isp->is_resume = Xresume_lapic_ipi;
	fake_ipi_intrhand.ih_pic = &local_pic;
	fake_ipi_intrhand.ih_level = IPL_HIGH;
	isp->is_handlers = &fake_ipi_intrhand;
	isp->is_pic = &local_pic;
	ci->ci_isources[LIR_IPI] = isp;

	for (i = 0; i < X86_NIPI; i++)
		evcnt_attach_dynamic(&ci->ci_ipi_events[i], EVCNT_TYPE_MISC,
		    NULL, device_xname(ci->ci_dev), x86_ipi_names[i]);
#endif /* MULTIPROCESSOR */

#if NHYPERV > 0
	if (hyperv_hypercall_enabled()) {
		isp = kmem_zalloc(sizeof(*isp), KM_SLEEP);
		isp->is_recurse = Xrecurse_hyperv_hypercall;
		isp->is_resume = Xresume_hyperv_hypercall;
		fake_hyperv_intrhand.ih_level = IPL_NET;
		isp->is_handlers = &fake_hyperv_intrhand;
		isp->is_pic = &local_pic;
		ci->ci_isources[LIR_HV] = isp;
		evcnt_attach_dynamic(&isp->is_evcnt, EVCNT_TYPE_INTR, NULL,
		    device_xname(ci->ci_dev), "Hyper-V hypercall");
	}
#endif /* NHYPERV > 0 */
#endif /* NLAPIC > 0 */

#if defined(__HAVE_PREEMPTION)
	x86_init_preempt(ci);
#endif

	x86_intr_calculatemasks(ci);

#if defined(INTRSTACKSIZE)
	vaddr_t istack;

	/*
	 * If the red zone is activated, protect both the top and
	 * the bottom of the stack with an unmapped page.
	 */
	istack = uvm_km_alloc(kernel_map,
	    INTRSTACKSIZE + redzone_const_or_zero(2 * PAGE_SIZE), 0,
	    UVM_KMF_WIRED | UVM_KMF_ZERO);
	if (redzone_const_or_false(true)) {
		pmap_kremove(istack, PAGE_SIZE);
		pmap_kremove(istack + INTRSTACKSIZE + PAGE_SIZE, PAGE_SIZE);
		pmap_update(pmap_kernel());
	}

	/*
	 * 33 used to be 1.  Arbitrarily reserve 32 more register_t's
	 * of space for ddb(4) to examine some subroutine arguments
	 * and to hunt for the next stack frame.
	 */
	ci->ci_intrstack = (char *)istack + redzone_const_or_zero(PAGE_SIZE) +
	    INTRSTACKSIZE - 33 * sizeof(register_t);
#endif

	ci->ci_idepth = -1;
}
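
/*
 * Illustrative layout of the interrupt stack allocated above when the
 * red zone is active (offsets from istack, using the sizes in
 * cpu_intr_init(); a sketch, not normative):
 *
 *	+ 0				guard page (unmapped)
 *	+ PAGE_SIZE			usable stack, INTRSTACKSIZE bytes
 *	+ PAGE_SIZE + INTRSTACKSIZE
 *	  - 33 * sizeof(register_t)	ci_intrstack (initial stack pointer)
 *	+ PAGE_SIZE + INTRSTACKSIZE	guard page (unmapped)
 */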

#if defined(INTRDEBUG) || defined(DDB)

void
intr_printconfig(void)
{
	int i;
	struct intrhand *ih;
	struct intrsource *isp;
	struct cpu_info *ci;
	CPU_INFO_ITERATOR cii;
	void (*pr)(const char *, ...);

	pr = printf;
#ifdef DDB
	if (db_active) {
		pr = db_printf;
	}
#endif

	for (CPU_INFO_FOREACH(cii, ci)) {
		(*pr)("%s: interrupt masks:\n", device_xname(ci->ci_dev));
		for (i = 0; i < NIPL; i++)
			(*pr)("IPL %d mask %016"PRIx64" unmask %016"PRIx64"\n",
			    i, ci->ci_imask[i], ci->ci_iunmask[i]);
		for (i = 0; i < MAX_INTR_SOURCES; i++) {
			isp = ci->ci_isources[i];
			if (isp == NULL)
				continue;
			(*pr)("%s source %d is pin %d from pic %s type %d "
			    "maxlevel %d\n", device_xname(ci->ci_dev), i,
			    isp->is_pin, isp->is_pic->pic_name, isp->is_type,
			    isp->is_maxlevel);
			for (ih = isp->is_handlers; ih != NULL;
			     ih = ih->ih_next)
				(*pr)("\thandler %p level %d\n",
				    ih->ih_fun, ih->ih_level);
#if NIOAPIC > 0
			if (isp->is_pic->pic_type == PIC_IOAPIC) {
				struct ioapic_softc *sc;
				sc = isp->is_pic->pic_ioapic;
				(*pr)("\tioapic redir 0x%x\n",
				    sc->sc_pins[isp->is_pin].ip_map->redir);
			}
#endif
		}
	}
}

#endif

/*
 * Save the interrupt count of the CPU the source is currently bound to.
 */
static void
intr_save_evcnt(struct intrsource *source, cpuid_t cpuid)
{
	struct percpu_evcnt *pep;
	uint64_t curcnt;
	int i;

	curcnt = source->is_evcnt.ev_count;
	pep = source->is_saved_evcnt;

	for (i = 0; i < ncpu; i++) {
		if (pep[i].cpuid == cpuid) {
			pep[i].count = curcnt;
			break;
		}
	}
}

/*
 * Restore the interrupt count of the CPU the source is now bound to.
 */
static void
intr_restore_evcnt(struct intrsource *source, cpuid_t cpuid)
{
	struct percpu_evcnt *pep;
	int i;

	pep = source->is_saved_evcnt;

	for (i = 0; i < ncpu; i++) {
		if (pep[i].cpuid == cpuid) {
			source->is_evcnt.ev_count = pep[i].count;
			break;
		}
	}
}
   1580 
   1581 static void
   1582 intr_redistribute_xc_t(void *arg1, void *arg2)
   1583 {
   1584 	struct cpu_info *ci;
   1585 	struct intrsource *isp;
   1586 	int slot;
   1587 	u_long psl;
   1588 
   1589 	ci = curcpu();
   1590 	isp = arg1;
   1591 	slot = (int)(intptr_t)arg2;
   1592 
   1593 	/* Disable interrupts locally. */
   1594 	psl = x86_read_psl();
   1595 	x86_disable_intr();
   1596 
   1597 	/* Hook it in and re-calculate masks. */
   1598 	ci->ci_isources[slot] = isp;
   1599 	x86_intr_calculatemasks(curcpu());
   1600 
   1601 	/* Re-enable interrupts locally. */
   1602 	x86_write_psl(psl);
   1603 }
   1604 
   1605 static void
   1606 intr_redistribute_xc_s1(void *arg1, void *arg2)
   1607 {
   1608 	struct pic *pic;
   1609 	struct intrsource *isp;
   1610 	struct cpu_info *nci;
   1611 	u_long psl;
   1612 
   1613 	isp = arg1;
   1614 	nci = arg2;
   1615 
   1616 	/*
   1617 	 * Disable interrupts on-chip and mask the pin.  Back out
   1618 	 * and let the interrupt be processed if one is pending.
   1619 	 */
   1620 	pic = isp->is_pic;
   1621 	for (;;) {
   1622 		psl = x86_read_psl();
   1623 		x86_disable_intr();
   1624 		if ((*pic->pic_trymask)(pic, isp->is_pin)) {
   1625 			break;
   1626 		}
   1627 		x86_write_psl(psl);
   1628 		DELAY(1000);
   1629 	}
   1630 
   1631 	/* pic_addroute will unmask the interrupt. */
   1632 	(*pic->pic_addroute)(pic, nci, isp->is_pin, isp->is_idtvec,
   1633 	    isp->is_type);
   1634 	x86_write_psl(psl);
   1635 }
   1636 
static void
intr_redistribute_xc_s2(void *arg1, void *arg2)
{
	struct cpu_info *ci;
	u_long psl;
	int slot;

	ci = curcpu();
	slot = (int)(uintptr_t)arg1;

	/* Disable interrupts locally. */
	psl = x86_read_psl();
	x86_disable_intr();

	/* Patch out the source and re-calculate masks. */
	ci->ci_isources[slot] = NULL;
	x86_intr_calculatemasks(ci);

	/* Re-enable interrupts locally. */
	x86_write_psl(psl);
}

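/*
 * Migrate one I/O APIC interrupt source away from the given CPU.  The
 * preferred target is the least loaded CPU that still accepts
 * interrupts; failing a free slot there, any CPU with room is used.
 * Returns true if a source was moved, false if nothing could be
 * migrated.
 */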
static bool
intr_redistribute(struct cpu_info *oci)
{
	struct intrsource *isp;
	struct intrhand *ih;
	CPU_INFO_ITERATOR cii;
	struct cpu_info *nci, *ici;
	int oslot, nslot;
	uint64_t where;

	KASSERT(mutex_owned(&cpu_lock));

	/* Look for an interrupt source that we can migrate. */
	for (oslot = 0; oslot < MAX_INTR_SOURCES; oslot++) {
		if ((isp = oci->ci_isources[oslot]) == NULL) {
			continue;
		}
		if (isp->is_pic->pic_type == PIC_IOAPIC) {
			break;
		}
	}
	if (oslot == MAX_INTR_SOURCES) {
		return false;
	}

	/* Find least loaded CPU and try to move there. */
	nci = NULL;
	for (CPU_INFO_FOREACH(cii, ici)) {
		if ((ici->ci_schedstate.spc_flags & SPCF_NOINTR) != 0) {
			continue;
		}
		KASSERT(ici != oci);
		if (nci == NULL || nci->ci_nintrhand > ici->ci_nintrhand) {
			nci = ici;
		}
	}
	if (nci == NULL) {
		return false;
	}
	for (nslot = 0; nslot < MAX_INTR_SOURCES; nslot++) {
		if (nci->ci_isources[nslot] == NULL) {
			break;
		}
	}

	/* If that did not work, allocate anywhere. */
	if (nslot == MAX_INTR_SOURCES) {
		for (CPU_INFO_FOREACH(cii, nci)) {
			if ((nci->ci_schedstate.spc_flags & SPCF_NOINTR) != 0) {
				continue;
			}
			KASSERT(nci != oci);
			for (nslot = 0; nslot < MAX_INTR_SOURCES; nslot++) {
				if (nci->ci_isources[nslot] == NULL) {
					break;
				}
			}
			if (nslot != MAX_INTR_SOURCES) {
				break;
			}
		}
	}
	if (nslot == MAX_INTR_SOURCES) {
		return false;
	}

	/*
	 * Now we have new CPU and new slot.  Run a cross-call to set up
	 * the new vector on the target CPU.
	 */
	where = xc_unicast(0, intr_redistribute_xc_t, isp,
	    (void *)(intptr_t)nslot, nci);
	xc_wait(where);

	/*
	 * We're ready to go on the target CPU.  Run a cross call to
	 * reroute the interrupt away from the source CPU.
	 */
	where = xc_unicast(0, intr_redistribute_xc_s1, isp, nci, oci);
	xc_wait(where);

	/* Sleep for (at least) 10ms to allow the change to take hold. */
	(void)kpause("intrdist", false, mstohz(10), NULL);

	/* Complete removal from the source CPU. */
	where = xc_unicast(0, intr_redistribute_xc_s2,
	    (void *)(uintptr_t)oslot, NULL, oci);
	xc_wait(where);

	/* Finally, take care of book-keeping. */
	for (ih = isp->is_handlers; ih != NULL; ih = ih->ih_next) {
		oci->ci_nintrhand--;
		nci->ci_nintrhand++;
		ih->ih_cpu = nci;
	}
	intr_save_evcnt(isp, oci->ci_cpuid);
	intr_restore_evcnt(isp, nci->ci_cpuid);
	isp->is_active_cpu = nci->ci_cpuid;

	return true;
}

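/*
 * Move all movable interrupt sources away from CPUs that are shielded
 * from interrupts (SPCF_NOINTR).  Called with cpu_lock held.
 */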
void
cpu_intr_redistribute(void)
{
	CPU_INFO_ITERATOR cii;
	struct cpu_info *ci;

	KASSERT(mutex_owned(&cpu_lock));
	KASSERT(mp_online);

	/* Direct interrupts away from shielded CPUs. */
	for (CPU_INFO_FOREACH(cii, ci)) {
		if ((ci->ci_schedstate.spc_flags & SPCF_NOINTR) == 0) {
			continue;
		}
		while (intr_redistribute(ci)) {
			/* nothing */
		}
	}

	/* XXX should now re-balance */
}

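/*
 * Return the number of interrupt handlers attached to the given CPU.
 */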
u_int
cpu_intr_count(struct cpu_info *ci)
{

	KASSERT(ci->ci_nintrhand >= 0);

	return ci->ci_nintrhand;
}

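/*
 * Find a free ci_isources[] slot on the given CPU.  On success, store
 * the slot number in *index and return 0; return EBUSY if all slots
 * are in use.  Called with cpu_lock held.
 */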
static int
intr_find_unused_slot(struct cpu_info *ci, int *index)
{
	int slot, i;

	KASSERT(mutex_owned(&cpu_lock));

	slot = -1;
	for (i = 0; i < MAX_INTR_SOURCES; i++) {
		if (ci->ci_isources[i] == NULL) {
			slot = i;
			break;
		}
	}
	if (slot == -1) {
		DPRINTF(("cannot allocate ci_isources\n"));
		return EBUSY;
	}

	*index = slot;
	return 0;
}

/*
 * Make the CPU ready to accept the interrupt: recompute the local
 * masks and point the IDT entry at the source's resume/recurse stubs.
 */
static void
intr_activate_xcall(void *arg1, void *arg2)
{
	struct cpu_info *ci;
	struct intrsource *source;
	struct intrstub *stubp;
	struct intrhand *ih;
	struct idt_vec *iv;
	u_long psl;
	int idt_vec;
	int slot;

	ih = arg1;

	kpreempt_disable();

	KASSERT(ih->ih_cpu == curcpu() || !mp_online);

	ci = ih->ih_cpu;
	slot = ih->ih_slot;
	source = ci->ci_isources[slot];
	idt_vec = source->is_idtvec;
	iv = idt_vec_ref(&ci->ci_idtvec);

	psl = x86_read_psl();
	x86_disable_intr();

	x86_intr_calculatemasks(ci);

	if (source->is_type == IST_LEVEL) {
		stubp = &source->is_pic->pic_level_stubs[slot];
	} else {
		stubp = &source->is_pic->pic_edge_stubs[slot];
	}

	source->is_resume = stubp->ist_resume;
	source->is_recurse = stubp->ist_recurse;
	idt_vec_set(iv, idt_vec, stubp->ist_entry);

	x86_write_psl(psl);

	kpreempt_enable();
}

/*
 * Make the CPU stop accepting the interrupt: detach the source from
 * its slot and release the IDT entry where vectors are per-CPU.
 */
static void
intr_deactivate_xcall(void *arg1, void *arg2)
{
	struct cpu_info *ci;
	struct intrhand *ih, *lih;
	struct intrsource *isp;
	u_long psl;
	int idt_vec;
	int slot;

	ih = arg1;

	kpreempt_disable();

	KASSERT(ih->ih_cpu == curcpu() || !mp_online);

	ci = ih->ih_cpu;
	slot = ih->ih_slot;
	isp = ci->ci_isources[slot];
	idt_vec = isp->is_idtvec;

	psl = x86_read_psl();
	x86_disable_intr();

	/*
	 * All handlers sharing the slot move together; drop them all
	 * from this CPU's handler count.
	 */
	ci->ci_isources[slot] = NULL;
	for (lih = ih; lih != NULL; lih = lih->ih_next) {
		ci->ci_nintrhand--;
	}

	x86_intr_calculatemasks(ci);

	if (idt_vec_is_pcpu()) {
		idt_vec_free(&ci->ci_idtvec, idt_vec);
	} else {
		/*
		 * Skip unsetgate(), because the same idt[] entry is
		 * overwritten in intr_activate_xcall().
		 */
	}

	x86_write_psl(psl);

	kpreempt_enable();
}

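/*
 * Report the CPU (at most one on x86) that the given interrupt source
 * is currently bound to.
 */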
static void
intr_get_affinity(struct intrsource *isp, kcpuset_t *cpuset)
{
	struct cpu_info *ci;

	KASSERT(mutex_owned(&cpu_lock));

	if (isp == NULL) {
		kcpuset_zero(cpuset);
		return;
	}

	KASSERTMSG(isp->is_handlers != NULL,
	    "can't get affinity for an interrupt source that is not "
	    "established");

	ci = isp->is_handlers->ih_cpu;
	if (ci == NULL) {
		kcpuset_zero(cpuset);
		return;
	}

	kcpuset_set(cpuset, cpu_index(ci));
}

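/*
 * Rebind an established interrupt source to the first CPU in "cpuset":
 * mask the source, wait for pending handling to drain, deactivate it
 * on the old CPU, reroute it at the PIC, then activate and unmask it
 * on the new CPU.
 */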
static int
intr_set_affinity(struct intrsource *isp, const kcpuset_t *cpuset)
{
	struct cpu_info *oldci, *newci;
	struct intrhand *ih, *lih;
	struct pic *pic;
	u_int cpu_idx;
	int old_idtvec, new_idtvec;
	int oldslot, newslot;
	int err;
	int pin;

	KASSERT(mutex_owned(&intr_distribute_lock));
	KASSERT(mutex_owned(&cpu_lock));

	/*
	 * XXX: logical destination mode is not supported, so bind to
	 * the lowest-indexed CPU in the set.
	 */
	cpu_idx = kcpuset_ffs(cpuset) - 1;
	newci = cpu_lookup(cpu_idx);
	if (newci == NULL) {
		DPRINTF(("invalid cpu index: %u\n", cpu_idx));
		return EINVAL;
	}
	if ((newci->ci_schedstate.spc_flags & SPCF_NOINTR) != 0) {
		DPRINTF(("cpu%u is shielded from interrupts (nointr)\n",
		    cpu_idx));
		return EINVAL;
	}

	if (isp == NULL) {
		DPRINTF(("invalid intrctl handler\n"));
		return EINVAL;
	}

	/* i8259_pic supports only the primary CPU, see i8259.c. */
	pic = isp->is_pic;
	if (pic == &i8259_pic) {
		DPRINTF(("i8259 pic does not support set_affinity\n"));
		return ENOTSUP;
	}

	ih = isp->is_handlers;
	KASSERTMSG(ih != NULL,
	    "can't set affinity for an interrupt source that is not "
	    "established");

	oldci = ih->ih_cpu;
	if (newci == oldci) /* nothing to do */
		return 0;

	oldslot = ih->ih_slot;

	err = intr_find_unused_slot(newci, &newslot);
	if (err) {
		DPRINTF(("failed to allocate interrupt slot for PIC %s intrid "
			"%s\n", isp->is_pic->pic_name, isp->is_intrid));
		return err;
	}

	old_idtvec = isp->is_idtvec;

	if (idt_vec_is_pcpu()) {
		new_idtvec = idt_vec_alloc(&newci->ci_idtvec,
		    APIC_LEVEL(ih->ih_level), IDT_INTR_HIGH);
		if (new_idtvec == 0)
			return EBUSY;
		DPRINTF(("interrupt from cpu%d vec %d to cpu%d vec %d\n",
		    cpu_index(oldci), old_idtvec, cpu_index(newci),
		    new_idtvec));
	} else {
		new_idtvec = isp->is_idtvec;
	}

	/* Prevent intr_unmask() from re-enabling the source at the hardware. */
	isp->is_distribute_pending = true;

	pin = isp->is_pin;
	(*pic->pic_hwmask)(pic, pin); /* for the ci_ipending check below */
	membar_sync();
	while (oldci->ci_ipending & (1ULL << oldslot)) {
		(void)kpause("intrdist", false, 1, &cpu_lock);
		membar_sync();
	}

	kpreempt_disable();

	/* Deactivate the old interrupt setting. */
	if (oldci == curcpu() || !mp_online) {
		intr_deactivate_xcall(ih, NULL);
	} else {
		uint64_t where;
		where = xc_unicast(0, intr_deactivate_xcall, ih,
		    NULL, oldci);
		xc_wait(where);
	}
	intr_save_evcnt(isp, oldci->ci_cpuid);
	(*pic->pic_delroute)(pic, oldci, pin, old_idtvec, isp->is_type);

	/* Activate the new interrupt setting. */
	isp->is_idtvec = new_idtvec;
	newci->ci_isources[newslot] = isp;
	for (lih = ih; lih != NULL; lih = lih->ih_next) {
		newci->ci_nintrhand++;
		lih->ih_cpu = newci;
		lih->ih_slot = newslot;
	}
	if (newci == curcpu() || !mp_online) {
		intr_activate_xcall(ih, NULL);
	} else {
		uint64_t where;
		where = xc_unicast(0, intr_activate_xcall, ih,
		    NULL, newci);
		xc_wait(where);
	}
	intr_restore_evcnt(isp, newci->ci_cpuid);
	isp->is_active_cpu = newci->ci_cpuid;
	(*pic->pic_addroute)(pic, newci, pin, new_idtvec, isp->is_type);

	isp->is_distribute_pending = false;
	if (newci == curcpu() || !mp_online) {
		intr_hwunmask_xcall(ih, NULL);
	} else {
		uint64_t where;
		where = xc_unicast(0, intr_hwunmask_xcall, ih, NULL, newci);
		xc_wait(where);
	}

	kpreempt_enable();

	return err;
}

static bool
intr_is_affinity_intrsource(struct intrsource *isp, const kcpuset_t *cpuset)
{
	struct cpu_info *ci;

	KASSERT(mutex_owned(&cpu_lock));

	/*
	 * The source may already have been allocated (e.g. by
	 * pci_intr_alloc()) but not established yet; in that case it
	 * has no affinity.
	 */
	if (isp->is_handlers == NULL)
		return false;

	ci = isp->is_handlers->ih_cpu;
	KASSERT(ci != NULL);

	return kcpuset_isset(cpuset, cpu_index(ci));
}

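/*
 * Look up the first handler of the interrupt source named by "intrid",
 * or NULL if no such source exists.  Called with cpu_lock held.
 */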
static struct intrhand *
intr_get_handler(const char *intrid)
{
	struct intrsource *isp;

	KASSERT(mutex_owned(&cpu_lock));

	isp = intr_get_io_intrsource(intrid);
	if (isp == NULL)
		return NULL;

	return isp->is_handlers;
}

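/*
 * Return the number of interrupts the named source has delivered to
 * the given CPU: the live event counter if the source is active there,
 * otherwise the count saved when the source last migrated away.
 */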
uint64_t
x86_intr_get_count(const char *intrid, u_int cpu_idx)
{
	struct cpu_info *ci;
	struct intrsource *isp;
	struct intrhand *ih;
	struct percpu_evcnt pep;
	cpuid_t cpuid;
	int i, slot;
	uint64_t count = 0;

	KASSERT(mutex_owned(&cpu_lock));
	ci = cpu_lookup(cpu_idx);
	cpuid = ci->ci_cpuid;

	ih = intr_get_handler(intrid);
	if (ih == NULL)
		return 0;
	slot = ih->ih_slot;
	isp = ih->ih_cpu->ci_isources[slot];

	for (i = 0; i < ncpu; i++) {
		pep = isp->is_saved_evcnt[i];
		if (cpuid == pep.cpuid) {
			/*
			 * The live event counter is authoritative only
			 * while the source is active on this CPU; after
			 * a migration, the saved count is what remains.
			 */
			if (isp->is_active_cpu == pep.cpuid)
				count = isp->is_evcnt.ev_count;
			else
				count = pep.count;
			break;
		}
	}

	return count;
}

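/*
 * Fill "cpuset" with the CPU that the named interrupt source is
 * assigned to; leave it empty if the source is not established.
 */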
void
x86_intr_get_assigned(const char *intrid, kcpuset_t *cpuset)
{
	struct cpu_info *ci;
	struct intrhand *ih;

	KASSERT(mutex_owned(&cpu_lock));
	kcpuset_zero(cpuset);

	ih = intr_get_handler(intrid);
	if (ih == NULL)
		return;

	ci = ih->ih_cpu;
	kcpuset_set(cpuset, cpu_index(ci));
}

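/*
 * Copy the name of the device(s) attached to the named interrupt
 * source into "buf"; return the empty string if the source is not
 * established.
 */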
void
x86_intr_get_devname(const char *intrid, char *buf, size_t len)
{
	struct intrsource *isp;
	struct intrhand *ih;
	int slot;

	KASSERT(mutex_owned(&cpu_lock));

	ih = intr_get_handler(intrid);
	if (ih == NULL) {
		buf[0] = '\0';
		return;
	}
	slot = ih->ih_slot;
	isp = ih->ih_cpu->ci_isources[slot];
	strlcpy(buf, isp->is_xname, len);
}

/*
 * MI interface for subr_interrupt.c
 */
uint64_t
interrupt_get_count(const char *intrid, u_int cpu_idx)
{
	struct intrsource *isp;
	uint64_t count = 0;

	mutex_enter(&cpu_lock);
	isp = intr_get_io_intrsource(intrid);
	if (isp != NULL)
		count = isp->is_pic->pic_intr_get_count(intrid, cpu_idx);
	mutex_exit(&cpu_lock);
	return count;
}

/*
 * MI interface for subr_interrupt.c
 */
void
interrupt_get_assigned(const char *intrid, kcpuset_t *cpuset)
{
	struct intrsource *isp;

	mutex_enter(&cpu_lock);
	isp = intr_get_io_intrsource(intrid);
	if (isp != NULL)
		isp->is_pic->pic_intr_get_assigned(intrid, cpuset);
	mutex_exit(&cpu_lock);
}

/*
 * MI interface for subr_interrupt.c
 */
void
interrupt_get_available(kcpuset_t *cpuset)
{
	CPU_INFO_ITERATOR cii;
	struct cpu_info *ci;

	kcpuset_zero(cpuset);

	mutex_enter(&cpu_lock);
	for (CPU_INFO_FOREACH(cii, ci)) {
		if ((ci->ci_schedstate.spc_flags & SPCF_NOINTR) == 0) {
			kcpuset_set(cpuset, cpu_index(ci));
		}
	}
	mutex_exit(&cpu_lock);
}

/*
 * MI interface for subr_interrupt.c
 */
void
interrupt_get_devname(const char *intrid, char *buf, size_t len)
{
	struct intrsource *isp;

	mutex_enter(&cpu_lock);
	isp = intr_get_io_intrsource(intrid);
	if (isp != NULL) {
		if (isp->is_pic->pic_intr_get_devname == NULL) {
			printf("NULL get_devname intrid %s pic %s\n",
			    intrid, isp->is_pic->pic_name);
		} else {
			isp->is_pic->pic_intr_get_devname(intrid, buf, len);
		}
	}
	mutex_exit(&cpu_lock);
}

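/*
 * Change the affinity of the interrupt behind "ih" to "newset",
 * optionally returning the previous affinity in "oldset".  Both
 * intr_distribute_lock and cpu_lock must be held.
 */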
static int
intr_distribute_locked(struct intrhand *ih, const kcpuset_t *newset,
    kcpuset_t *oldset)
{
	struct intrsource *isp;
	int slot;

	KASSERT(mutex_owned(&intr_distribute_lock));
	KASSERT(mutex_owned(&cpu_lock));

	if (ih == NULL)
		return EINVAL;

	slot = ih->ih_slot;
	isp = ih->ih_cpu->ci_isources[slot];
	KASSERT(isp != NULL);

	if (oldset != NULL)
		intr_get_affinity(isp, oldset);

	return intr_set_affinity(isp, newset);
}

/*
 * MI interface for subr_interrupt.c
 */
int
interrupt_distribute(void *cookie, const kcpuset_t *newset, kcpuset_t *oldset)
{
	int error;
	struct intrhand *ih = cookie;

	mutex_enter(&intr_distribute_lock);
	mutex_enter(&cpu_lock);
	error = intr_distribute_locked(ih, newset, oldset);
	mutex_exit(&cpu_lock);
	mutex_exit(&intr_distribute_lock);

	return error;
}

/*
 * MI interface for subr_interrupt.c
 */
int
interrupt_distribute_handler(const char *intrid, const kcpuset_t *newset,
    kcpuset_t *oldset)
{
	int error;
	struct intrhand *ih;

	mutex_enter(&intr_distribute_lock);
	mutex_enter(&cpu_lock);

	ih = intr_get_handler(intrid);
	if (ih == NULL) {
		error = ENOENT;
		goto out;
	}
	error = intr_distribute_locked(ih, newset, oldset);

 out:
	mutex_exit(&cpu_lock);
	mutex_exit(&intr_distribute_lock);
	return error;
}

/*
 * MI interface for subr_interrupt.c
 */
struct intrids_handler *
interrupt_construct_intrids(const kcpuset_t *cpuset)
{
	struct intrsource *isp;
	struct intrids_handler *ii_handler;
	intrid_t *ids;
	int i, count;

	if (kcpuset_iszero(cpuset))
		return NULL;

	/*
	 * Count the interrupt sources whose affinity includes at least
	 * one CPU in "cpuset".
	 */
	count = 0;
	mutex_enter(&cpu_lock);
	SIMPLEQ_FOREACH(isp, &io_interrupt_sources, is_list) {
		if (intr_is_affinity_intrsource(isp, cpuset))
			count++;
	}
	mutex_exit(&cpu_lock);

	/* A KM_SLEEP allocation cannot fail. */
	ii_handler = kmem_zalloc(sizeof(int) + sizeof(intrid_t) * count,
	    KM_SLEEP);
	ii_handler->iih_nids = count;
	if (count == 0)
		return ii_handler;

	ids = ii_handler->iih_intrids;
	i = 0;
	mutex_enter(&cpu_lock);
	SIMPLEQ_FOREACH(isp, &io_interrupt_sources, is_list) {
		/*
		 * Ignore sources attached between the two locked
		 * passes: "count" entries is all we allocated room
		 * for.
		 */
		if (i >= count) {
			DPRINTF(("New devices are attached after counting.\n"));
			break;
		}

		if (!intr_is_affinity_intrsource(isp, cpuset))
			continue;

		strncpy(ids[i], isp->is_intrid, sizeof(intrid_t));
		i++;
	}
	mutex_exit(&cpu_lock);

	return ii_handler;
}

/*
 * MI interface for subr_interrupt.c
 */
void
interrupt_destruct_intrids(struct intrids_handler *ii_handler)
{
	size_t iih_size;

	if (ii_handler == NULL)
		return;

	iih_size = sizeof(int) + sizeof(intrid_t) * ii_handler->iih_nids;
	kmem_free(ii_handler, iih_size);
}
   2387