Home | History | Annotate | Line # | Download | only in pci
      1 /*	$NetBSD: msipic.c,v 1.27 2022/05/24 14:00:23 bouyer Exp $	*/
      2 
      3 /*
      4  * Copyright (c) 2015 Internet Initiative Japan Inc.
      5  * All rights reserved.
      6  *
      7  * Redistribution and use in source and binary forms, with or without
      8  * modification, are permitted provided that the following conditions
      9  * are met:
     10  * 1. Redistributions of source code must retain the above copyright
     11  *    notice, this list of conditions and the following disclaimer.
     12  * 2. Redistributions in binary form must reproduce the above copyright
     13  *    notice, this list of conditions and the following disclaimer in the
     14  *    documentation and/or other materials provided with the distribution.
     15  *
     16  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
     17  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
     18  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     19  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
     20  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
     21  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
     22  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
     23  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
     24  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
     25  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
     26  * POSSIBILITY OF SUCH DAMAGE.
     27  */
     28 
     29 #include <sys/cdefs.h>
     30 __KERNEL_RCSID(0, "$NetBSD: msipic.c,v 1.27 2022/05/24 14:00:23 bouyer Exp $");
     31 
     32 #include "opt_intrdebug.h"
     33 
     34 #include <sys/types.h>
     35 #include <sys/param.h>
     36 #include <sys/systm.h>
     37 #include <sys/errno.h>
     38 #include <sys/kmem.h>
     39 #include <sys/mutex.h>
     40 #include <sys/bitops.h>
     41 
     42 #include <dev/pci/pcivar.h>
     43 
     44 #include <machine/i82489reg.h>
     45 #include <machine/i82489var.h>
     46 #include <machine/i82093reg.h>
     47 #include <machine/i82093var.h>
     48 #include <machine/pic.h>
     49 #include <machine/lock.h>
     50 
     51 #include <x86/pci/msipic.h>
     52 
/* Enabling INTRDEBUG also enables the debug output of this file. */
#if defined(INTRDEBUG)
#define MSIPICDEBUG
#endif

/*
 * DPRINTF((fmt, ...)): debug-only printf.
 * The argument is a complete, parenthesized printf() argument list.
 * Expands to nothing unless MSIPICDEBUG is defined.
 */
#if defined(MSIPICDEBUG)
#define DPRINTF(args) printf args
#else
#define DPRINTF(args)
#endif

/*
 * Read the 32-bit word at offset 0 of the given bus space mapping and
 * discard the result.  Used right after bus_space writes to the MSI-X
 * table; presumably the read-back forces the posted writes out to the
 * device.  "t" is the bus space tag and "h" the bus space handle.
 */
#define BUS_SPACE_WRITE_FLUSH(t, h) (void)bus_space_read_4(t, h, 0)

/* Size in bytes of the buffer that holds a pseudo pic's name. */
#define MSIPICNAMEBUF 16
     66 
     67 /*
     68  * A Pseudo pic for single MSI/MSI-X device.
     69  * The pic and MSI/MSI-X device are distinbuished by "devid". The "devid"
     70  * is managed by below "dev_seqs".
     71  */
     72 struct msipic {
     73 	struct msipic_pci_info mp_i;
     74 
     75 	int mp_devid; /* The device id for the MSI/MSI-X device. */
     76 
     77 	char mp_pic_name[MSIPICNAMEBUF]; /* The MSI/MSI-X device's name. */
     78 
     79 	struct pci_attach_args mp_pa;
     80 	bus_space_tag_t mp_bstag;
     81 	bus_space_handle_t mp_bshandle;
     82 	bus_size_t mp_bssize;
     83 	struct pic *mp_pic;
     84 
     85 	LIST_ENTRY(msipic) mp_list;
     86 };
     87 
     88 static kmutex_t msipic_list_lock;
     89 
     90 static LIST_HEAD(, msipic) msipic_list =
     91 	LIST_HEAD_INITIALIZER(msipic_list);
     92 
/*
 * This struct manages a "devid" so that the same "devid" can be used again
 * when a device is re-attached.  A device whose bus number, device number
 * and function number all match a recorded entry is assumed to be that
 * device being re-attached.
 */
struct dev_last_used_seq {
	bool ds_using;	/* true while the devid is handed out */
	int ds_bus;	/* PCI bus number of the last user */
	int ds_dev;	/* PCI device number of the last user */
	int ds_fun;	/* PCI function number of the last user */
};

/* The number of MSI/MSI-X devices supported by the system. */
#define NUM_MSI_DEVS 256

/*
 * Record of devids, indexed by devid, used to hand out the same devid when
 * a device is re-attached.
 */
static struct dev_last_used_seq dev_seqs[NUM_MSI_DEVS];
    108 
/* "devid" bookkeeping. */
static int msipic_allocate_common_msi_devid(const struct pci_attach_args *pa);
static void msipic_release_common_msi_devid(int devid);

/* Construction, lookup and destruction shared by MSI and MSI-X. */
static struct pic *msipic_find_msi_pic_locked(int devid);
static struct pic *msipic_construct_common_msi_pic(
    const struct pci_attach_args *pa, const struct pic *pic_tmpl);
static void msipic_destruct_common_msi_pic(struct pic *msi_pic);

/* MSI pic operations. */
static void msi_set_msictl_enablebit(struct pic *pic, int msi_vec, int flag);
static void msi_hwmask(struct pic *pic, int msi_vec);
static void msi_hwunmask(struct pic *pic, int msi_vec);
static void msi_addroute(struct pic *pic, struct cpu_info *ci, int unused,
    int idt_vec, int type);
static void msi_delroute(struct pic *pic, struct cpu_info *ci, int msi_vec,
    int idt_vec, int type);

/* MSI-X pic operations. */
static void msix_set_vecctl_mask(struct pic *pic, int msix_vec, int flag);
static void msix_hwmask(struct pic *pic, int msix_vec);
static void msix_hwunmask(struct pic *pic, int msix_vec);
static void msix_addroute(struct pic *pic, struct cpu_info *ci, int msix_vec,
    int idt_vec, int type);
static void msix_delroute(struct pic *pic, struct cpu_info *ci, int msix_vec,
    int vec, int type);
    128 
    129 /*
    130  * Return new "devid" for the device attached first.
    131  * Return the same "devid" for the device re-attached after dettached once.
    132  * Return -1 if the number of attached MSI/MSI-X devices is over NUM_MSI_DEVS.
    133  */
    134 static int
    135 msipic_allocate_common_msi_devid(const struct pci_attach_args *pa)
    136 {
    137 	pci_chipset_tag_t pc;
    138 	pcitag_t tag;
    139 	int bus, dev, fun, i;
    140 
    141 	KASSERT(mutex_owned(&msipic_list_lock));
    142 
    143 	pc = pa->pa_pc;
    144 	tag = pa->pa_tag;
    145 	pci_decompose_tag(pc, tag, &bus, &dev, &fun);
    146 
    147 	/* if the device was once attached, use same devid */
    148 	for (i = 0; i < NUM_MSI_DEVS; i++) {
    149 		/* skip host bridge */
    150 		if (dev_seqs[i].ds_bus == 0
    151 		    && dev_seqs[i].ds_dev == 0
    152 		    && dev_seqs[i].ds_fun == 0)
    153 			break;
    154 
    155 		if (dev_seqs[i].ds_bus == bus
    156 		    && dev_seqs[i].ds_dev == dev
    157 		    && dev_seqs[i].ds_fun == fun) {
    158 			dev_seqs[i].ds_using = true;
    159 			return i;
    160 		}
    161 	}
    162 
    163 	for (i = 0; i < NUM_MSI_DEVS; i++) {
    164 		if (dev_seqs[i].ds_using == 0) {
    165 			dev_seqs[i].ds_using = true;
    166 			dev_seqs[i].ds_bus = bus;
    167 			dev_seqs[i].ds_dev = dev;
    168 			dev_seqs[i].ds_fun = fun;
    169 			return i;
    170 		}
    171 	}
    172 
    173 	DPRINTF(("too many MSI devices.\n"));
    174 	return -1;
    175 }
    176 
    177 /*
    178  * Set the "devid" unused, but keep reserving the "devid" to reuse when
    179  * the device is re-attached.
    180  */
    181 static void
    182 msipic_release_common_msi_devid(int devid)
    183 {
    184 
    185 	KASSERT(mutex_owned(&msipic_list_lock));
    186 
    187 	if (devid < 0 || NUM_MSI_DEVS <= devid) {
    188 		DPRINTF(("%s: invalid devid.\n", __func__));
    189 		return;
    190 	}
    191 
    192 	dev_seqs[devid].ds_using = false;
    193 	/* Keep ds_* to reuse the same devid for the same device. */
    194 }
    195 
    196 static struct pic *
    197 msipic_find_msi_pic_locked(int devid)
    198 {
    199 	struct msipic *mpp;
    200 
    201 	KASSERT(mutex_owned(&msipic_list_lock));
    202 
    203 	LIST_FOREACH(mpp, &msipic_list, mp_list) {
    204 		if (mpp->mp_devid == devid)
    205 			return mpp->mp_pic;
    206 	}
    207 	return NULL;
    208 }
    209 
    210 /*
    211  * Return the msi_pic whose device is already registered.
    212  * If the device is not registered yet, return NULL.
    213  */
    214 struct pic *
    215 msipic_find_msi_pic(int devid)
    216 {
    217 	struct pic *msipic;
    218 
    219 	mutex_enter(&msipic_list_lock);
    220 	msipic = msipic_find_msi_pic_locked(devid);
    221 	mutex_exit(&msipic_list_lock);
    222 
    223 	return msipic;
    224 }
    225 
    226 /*
    227  * A common construct process of MSI and MSI-X.
    228  */
    229 static struct pic *
    230 msipic_construct_common_msi_pic(const struct pci_attach_args *pa,
    231     const struct pic *pic_tmpl)
    232 {
    233 	struct pic *pic;
    234 	struct msipic *msipic;
    235 	int devid;
    236 
    237 	pic = kmem_alloc(sizeof(*pic), KM_SLEEP);
    238 	msipic = kmem_zalloc(sizeof(*msipic), KM_SLEEP);
    239 
    240 	mutex_enter(&msipic_list_lock);
    241 
    242 	devid = msipic_allocate_common_msi_devid(pa);
    243 	if (devid == -1) {
    244 		mutex_exit(&msipic_list_lock);
    245 		kmem_free(pic, sizeof(*pic));
    246 		kmem_free(msipic, sizeof(*msipic));
    247 		return NULL;
    248 	}
    249 
    250 	memcpy(pic, pic_tmpl, sizeof(*pic));
    251 	pic->pic_edge_stubs
    252 	    = x2apic_mode ? x2apic_edge_stubs : ioapic_edge_stubs;
    253 	pic->pic_msipic = msipic;
    254 	msipic->mp_pic = pic;
    255 	pci_decompose_tag(pa->pa_pc, pa->pa_tag,
    256 	    &msipic->mp_i.mp_bus, &msipic->mp_i.mp_dev, &msipic->mp_i.mp_fun);
    257 	memcpy(&msipic->mp_pa, pa, sizeof(msipic->mp_pa));
    258 	msipic->mp_devid = devid;
    259 	/*
    260 	 * pci_msi{,x}_alloc() must be called only once in the device driver.
    261 	 */
    262 	KASSERT(msipic_find_msi_pic_locked(msipic->mp_devid) == NULL);
    263 
    264 	LIST_INSERT_HEAD(&msipic_list, msipic, mp_list);
    265 
    266 	mutex_exit(&msipic_list_lock);
    267 
    268 	return pic;
    269 }
    270 
    271 static void
    272 msipic_destruct_common_msi_pic(struct pic *msi_pic)
    273 {
    274 	struct msipic *msipic;
    275 
    276 	if (msi_pic == NULL)
    277 		return;
    278 
    279 	msipic = msi_pic->pic_msipic;
    280 	mutex_enter(&msipic_list_lock);
    281 	LIST_REMOVE(msipic, mp_list);
    282 	msipic_release_common_msi_devid(msipic->mp_devid);
    283 	mutex_exit(&msipic_list_lock);
    284 
    285 	if (msipic->mp_i.mp_xen_pirq != NULL) {
    286 		KASSERT(msipic->mp_i.mp_veccnt > 0);
    287 #ifdef DIAGNOSTIC
    288 		for (int i = 0; i < msipic->mp_i.mp_veccnt; i++) {
    289 			KASSERT(msipic->mp_i.mp_xen_pirq[i] == 0);
    290 		}
    291 #endif
    292 		kmem_free(msipic->mp_i.mp_xen_pirq,
    293 	            sizeof(*msipic->mp_i.mp_xen_pirq) * msipic->mp_i.mp_veccnt);
    294 	}
    295 	kmem_free(msipic, sizeof(*msipic));
    296 	kmem_free(msi_pic, sizeof(*msi_pic));
    297 }
    298 
    299 /*
    300  * The pic is MSI/MSI-X pic or not.
    301  */
    302 bool
    303 msipic_is_msi_pic(struct pic *pic)
    304 {
    305 
    306 	return (pic->pic_msipic != NULL);
    307 }
    308 
    309 /*
    310  * Return the MSI/MSI-X devid which is unique for each devices.
    311  */
    312 int
    313 msipic_get_devid(struct pic *pic)
    314 {
    315 
    316 	KASSERT(msipic_is_msi_pic(pic));
    317 
    318 	return pic->pic_msipic->mp_devid;
    319 }
    320 
    321 /*
    322  * Return the PCI bus/dev/func info for the device.
    323  */
    324 const struct msipic_pci_info *
    325 msipic_get_pci_info(struct pic *pic)
    326 {
    327 	KASSERT(msipic_is_msi_pic(pic));
    328 
    329 	return &pic->pic_msipic->mp_i;
    330 }
    331 
    332 #define MSI_MSICTL_ENABLE 1
    333 #define MSI_MSICTL_DISABLE 0
    334 static void
    335 msi_set_msictl_enablebit(struct pic *pic, int msi_vec, int flag)
    336 {
    337 	pci_chipset_tag_t pc;
    338 	struct pci_attach_args *pa;
    339 	pcitag_t tag;
    340 	pcireg_t ctl;
    341 	int off, err __diagused;
    342 
    343 	pc = NULL;
    344 	pa = &pic->pic_msipic->mp_pa;
    345 	tag = pa->pa_tag;
    346 	err = pci_get_capability(pc, tag, PCI_CAP_MSI, &off, NULL);
    347 	KASSERT(err != 0);
    348 
    349 	/*
    350 	 * MSI can establish only one vector at once.
    351 	 * So, use whole device mask bit instead of a vector mask bit.
    352 	 */
    353 	ctl = pci_conf_read(pc, tag, off + PCI_MSI_CTL);
    354 	if (flag == MSI_MSICTL_ENABLE)
    355 		ctl |= PCI_MSI_CTL_MSI_ENABLE;
    356 	else
    357 		ctl &= ~PCI_MSI_CTL_MSI_ENABLE;
    358 
    359 #ifdef XENPV
    360 	pci_conf_write16(pc, tag, off + PCI_MSI_CTL + 2, ctl >> 16);
    361 #else
    362 	pci_conf_write(pc, tag, off, ctl);
    363 #endif
    364 }
    365 
    366 static void
    367 msi_hwmask(struct pic *pic, int msi_vec)
    368 {
    369 
    370 	msi_set_msictl_enablebit(pic, msi_vec, MSI_MSICTL_DISABLE);
    371 }
    372 
    373 /*
    374  * Do not use pic->hwunmask() immediately after pic->delroute().
    375  * It is required to use pic->addroute() before pic->hwunmask().
    376  */
    377 static void
    378 msi_hwunmask(struct pic *pic, int msi_vec)
    379 {
    380 
    381 	msi_set_msictl_enablebit(pic, msi_vec, MSI_MSICTL_ENABLE);
    382 }
    383 
    384 static void
    385 msi_addroute(struct pic *pic, struct cpu_info *ci,
    386 	     int unused, int idt_vec, int type)
    387 {
    388 	pci_chipset_tag_t pc;
    389 	struct pci_attach_args *pa;
    390 	pcitag_t tag;
    391 #ifndef XENPV
    392 	pcireg_t addr, data;
    393 #endif
    394 	pcireg_t ctl;
    395 	int off, err __diagused;
    396 
    397 	pc = NULL;
    398 	pa = &pic->pic_msipic->mp_pa;
    399 	tag = pa->pa_tag;
    400 	err = pci_get_capability(pc, tag, PCI_CAP_MSI, &off, NULL);
    401 	KASSERT(err != 0);
    402 
    403 	ctl = pci_conf_read(pc, tag, off + PCI_MSI_CTL);
    404 #ifndef XENPV
    405 	/*
    406 	 * See Intel 64 and IA-32 Architectures Software Developer's Manual
    407 	 * Volume 3 10.11 Message Signalled Interrupts.
    408 	 */
    409 	/*
    410 	 * "cpuid" for MSI address is local APIC ID. In NetBSD, the ID is
    411 	 * the same as ci->ci_cpuid.
    412 	 */
    413 	addr = LAPIC_MSIADDR_BASE | __SHIFTIN(ci->ci_cpuid,
    414 	    LAPIC_MSIADDR_DSTID_MASK);
    415 	/* If trigger mode is edge, it don't care level for trigger mode. */
    416 	data = __SHIFTIN(idt_vec, LAPIC_VECTOR_MASK)
    417 		| LAPIC_TRIGMODE_EDGE | LAPIC_DLMODE_FIXED;
    418 
    419 	/*
    420 	 * The size of the message data register is 16bit if the extended
    421 	 * message data is not implemented. If it's 16bit and the per-vector
    422 	 * masking is not capable, the location of the upper 16bit is out of
    423 	 * the MSI capability structure's range. The PCI spec says the upper
    424 	 * 16bit is driven to 0 if the message data register is 16bit. It's the
    425 	 * spec, so it's OK just to write it regardless of the value of the
    426 	 * upper 16bit.
    427 	 */
    428 	if (ctl & PCI_MSI_CTL_64BIT_ADDR) {
    429 		pci_conf_write(pc, tag, off + PCI_MSI_MADDR64_LO, addr);
    430 		pci_conf_write(pc, tag, off + PCI_MSI_MADDR64_HI, 0);
    431 		pci_conf_write(pc, tag, off + PCI_MSI_MDATA64, data);
    432 	} else {
    433 		pci_conf_write(pc, tag, off + PCI_MSI_MADDR, addr);
    434 		pci_conf_write(pc, tag, off + PCI_MSI_MDATA, data);
    435 	}
    436 #endif /* !XENPV */
    437 	ctl |= PCI_MSI_CTL_MSI_ENABLE;
    438 #ifdef XENPV
    439 	pci_conf_write16(pc, tag, off + PCI_MSI_CTL + 2, ctl >> 16);
    440 #else
    441 	pci_conf_write(pc, tag, off + PCI_MSI_CTL, ctl);
    442 #endif
    443 }
    444 
/*
 * pic_delroute operation for MSI: simply masks the device via msi_hwmask().
 * The "ci", "idt_vec" and "type" arguments are not used.
 *
 * Do not use pic->hwunmask() immediately after pic->delroute().
 * It is required to use pic->addroute() before pic->hwunmask().
 */
static void
msi_delroute(struct pic *pic, struct cpu_info *ci,
	     int msi_vec, int idt_vec, int type)
{

	/* Removing the route is the same operation as masking. */
	msi_hwmask(pic, msi_vec);
}
    456 
    457 /*
    458  * Template for MSI pic.
    459  * .pic_msipic is set later in construct_msi_pic().
    460  */
    461 static const struct pic msi_pic_tmpl = {
    462 	.pic_type = PIC_MSI,
    463 	.pic_vecbase = 0,
    464 	.pic_apicid = 0,
    465 	.pic_lock = __SIMPLELOCK_UNLOCKED, /* not used for msi_pic */
    466 	.pic_hwmask = msi_hwmask,
    467 	.pic_hwunmask = msi_hwunmask,
    468 	.pic_addroute = msi_addroute,
    469 	.pic_delroute = msi_delroute,
    470 	.pic_intr_get_devname = x86_intr_get_devname,
    471 	.pic_intr_get_assigned = x86_intr_get_assigned,
    472 	.pic_intr_get_count = x86_intr_get_count,
    473 };
    474 
    475 /*
    476  * Create pseudo pic for a MSI device.
    477  */
    478 struct pic *
    479 msipic_construct_msi_pic(const struct pci_attach_args *pa)
    480 {
    481 	struct pic *msi_pic;
    482 	char pic_name_buf[MSIPICNAMEBUF];
    483 
    484 	msi_pic = msipic_construct_common_msi_pic(pa, &msi_pic_tmpl);
    485 	if (msi_pic == NULL) {
    486 		DPRINTF(("cannot allocate MSI pic.\n"));
    487 		return NULL;
    488 	}
    489 
    490 	memset(pic_name_buf, 0, MSIPICNAMEBUF);
    491 	snprintf(pic_name_buf, MSIPICNAMEBUF, "msi%d",
    492 	    msi_pic->pic_msipic->mp_devid);
    493 	strncpy(msi_pic->pic_msipic->mp_pic_name, pic_name_buf,
    494 	    MSIPICNAMEBUF - 1);
    495 	msi_pic->pic_name = msi_pic->pic_msipic->mp_pic_name;
    496 
    497 	return msi_pic;
    498 }
    499 
/*
 * Delete the pseudo pic of an MSI device.
 * MSI needs nothing beyond the common teardown; a NULL "msi_pic" is
 * ignored there.
 */
void
msipic_destruct_msi_pic(struct pic *msi_pic)
{

	/* All the work is shared with MSI-X. */
	msipic_destruct_common_msi_pic(msi_pic);
}
    509 
    510 #define MSIX_VECCTL_HWMASK 1
    511 #define MSIX_VECCTL_HWUNMASK 0
    512 static void
    513 msix_set_vecctl_mask(struct pic *pic, int msix_vec, int flag)
    514 {
    515 	bus_space_tag_t bstag;
    516 	bus_space_handle_t bshandle;
    517 	uint64_t entry_base;
    518 	uint32_t vecctl;
    519 
    520 	if (msix_vec < 0) {
    521 		DPRINTF(("%s: invalid MSI-X table index, devid=%d vecid=%d",
    522 			__func__, msipic_get_devid(pic), msix_vec));
    523 		return;
    524 	}
    525 
    526 	entry_base = PCI_MSIX_TABLE_ENTRY_SIZE * msix_vec;
    527 
    528 	bstag = pic->pic_msipic->mp_bstag;
    529 	bshandle = pic->pic_msipic->mp_bshandle;
    530 	vecctl = bus_space_read_4(bstag, bshandle,
    531 	    entry_base + PCI_MSIX_TABLE_ENTRY_VECTCTL);
    532 	if (flag == MSIX_VECCTL_HWMASK)
    533 		vecctl |= PCI_MSIX_VECTCTL_MASK;
    534 	else
    535 		vecctl &= ~PCI_MSIX_VECTCTL_MASK;
    536 
    537 	bus_space_write_4(bstag, bshandle,
    538 	    entry_base + PCI_MSIX_TABLE_ENTRY_VECTCTL, vecctl);
    539 	BUS_SPACE_WRITE_FLUSH(bstag, bshandle);
    540 }
    541 
    542 static void
    543 msix_hwmask(struct pic *pic, int msix_vec)
    544 {
    545 
    546 	msix_set_vecctl_mask(pic, msix_vec, MSIX_VECCTL_HWMASK);
    547 }
    548 
    549 /*
    550  * Do not use pic->hwunmask() immediately after pic->delroute().
    551  * It is required to use pic->addroute() before pic->hwunmask().
    552  */
    553 static void
    554 msix_hwunmask(struct pic *pic, int msix_vec)
    555 {
    556 
    557 	msix_set_vecctl_mask(pic, msix_vec, MSIX_VECCTL_HWUNMASK);
    558 }
    559 
    560 static void
    561 msix_addroute(struct pic *pic, struct cpu_info *ci,
    562 	     int msix_vec, int idt_vec, int type)
    563 {
    564 	pci_chipset_tag_t pc;
    565 	struct pci_attach_args *pa;
    566 	pcitag_t tag;
    567 #ifndef XENPV
    568 	bus_space_tag_t bstag;
    569 	bus_space_handle_t bshandle;
    570 	uint64_t entry_base;
    571 	pcireg_t addr, data;
    572 #endif
    573 	pcireg_t ctl;
    574 	int off, err __diagused;
    575 
    576 	if (msix_vec < 0) {
    577 		DPRINTF(("%s: invalid MSI-X table index, devid=%d vecid=%d",
    578 			__func__, msipic_get_devid(pic), msix_vec));
    579 		return;
    580 	}
    581 
    582 	pa = &pic->pic_msipic->mp_pa;
    583 	pc = pa->pa_pc;
    584 	tag = pa->pa_tag;
    585 	err = pci_get_capability(pc, tag, PCI_CAP_MSIX, &off, NULL);
    586 	KASSERT(err != 0);
    587 
    588 #ifndef XENPV
    589 	/* Disable MSI-X before writing MSI-X table */
    590 	ctl = pci_conf_read(pc, tag, off + PCI_MSIX_CTL);
    591 	ctl &= ~PCI_MSIX_CTL_ENABLE;
    592 	pci_conf_write(pc, tag, off + PCI_MSIX_CTL, ctl);
    593 
    594 	bstag = pic->pic_msipic->mp_bstag;
    595 	bshandle = pic->pic_msipic->mp_bshandle;
    596 	entry_base = PCI_MSIX_TABLE_ENTRY_SIZE * msix_vec;
    597 
    598 	/*
    599 	 * See Intel 64 and IA-32 Architectures Software Developer's Manual
    600 	 * Volume 3 10.11 Message Signalled Interrupts.
    601 	 */
    602 	/*
    603 	 * "cpuid" for MSI-X address is local APIC ID. In NetBSD, the ID is
    604 	 * the same as ci->ci_cpuid.
    605 	 */
    606 	addr = LAPIC_MSIADDR_BASE | __SHIFTIN(ci->ci_cpuid,
    607 	    LAPIC_MSIADDR_DSTID_MASK);
    608 	/* If trigger mode is edge, it don't care level for trigger mode. */
    609 	data = __SHIFTIN(idt_vec, LAPIC_VECTOR_MASK)
    610 		| LAPIC_TRIGMODE_EDGE | LAPIC_DLMODE_FIXED;
    611 
    612 	bus_space_write_4(bstag, bshandle,
    613 	    entry_base + PCI_MSIX_TABLE_ENTRY_ADDR_LO, addr);
    614 	bus_space_write_4(bstag, bshandle,
    615 	    entry_base + PCI_MSIX_TABLE_ENTRY_ADDR_HI, 0);
    616 	bus_space_write_4(bstag, bshandle,
    617 	    entry_base + PCI_MSIX_TABLE_ENTRY_DATA, data);
    618 	BUS_SPACE_WRITE_FLUSH(bstag, bshandle);
    619 #endif /* !XENPV */
    620 
    621 	ctl = pci_conf_read(pc, tag, off + PCI_MSIX_CTL);
    622 	if (ctl & PCI_MSIX_CTL_FUNCMASK) {
    623 		ctl &= ~PCI_MSIX_CTL_FUNCMASK;
    624 	}
    625 	ctl |= PCI_MSIX_CTL_ENABLE;
    626 #ifdef XENPV
    627 	pci_conf_write16(pc, tag, off + PCI_MSIX_CTL + 2, ctl >> 16);
    628 #else
    629 	pci_conf_write(pc, tag, off + PCI_MSIX_CTL, ctl);
    630 #endif
    631 }
    632 
/*
 * pic_delroute operation for MSI-X: simply masks table entry "msix_vec"
 * via msix_hwmask().  The "ci", "vec" and "type" arguments are not used.
 *
 * Do not use pic->hwunmask() immediately after pic->delroute().
 * It is required to use pic->addroute() before pic->hwunmask().
 */
static void
msix_delroute(struct pic *pic, struct cpu_info *ci,
	     int msix_vec, int vec, int type)
{

	/* Removing the route is the same operation as masking. */
	msix_hwmask(pic, msix_vec);
}
    644 
    645 /*
    646  * Template for MSI-X pic.
    647  * .pic_msipic is set later in construct_msix_pic().
    648  */
    649 static const struct pic msix_pic_tmpl = {
    650 	.pic_type = PIC_MSIX,
    651 	.pic_vecbase = 0,
    652 	.pic_apicid = 0,
    653 	.pic_lock = __SIMPLELOCK_UNLOCKED, /* not used for msix_pic */
    654 	.pic_hwmask = msix_hwmask,
    655 	.pic_hwunmask = msix_hwunmask,
    656 	.pic_addroute = msix_addroute,
    657 	.pic_delroute = msix_delroute,
    658 	.pic_intr_get_devname = x86_intr_get_devname,
    659 	.pic_intr_get_assigned = x86_intr_get_assigned,
    660 	.pic_intr_get_count = x86_intr_get_count,
    661 };
    662 
    663 struct pic *
    664 msipic_construct_msix_pic(const struct pci_attach_args *pa)
    665 {
    666 	struct pic *msix_pic;
    667 	pci_chipset_tag_t pc;
    668 	pcitag_t tag;
    669 	pcireg_t tbl;
    670 	bus_space_tag_t bstag;
    671 	bus_space_handle_t bshandle;
    672 	bus_size_t bssize;
    673 	size_t table_size;
    674 	uint32_t table_offset;
    675 	u_int memtype;
    676 	bus_addr_t memaddr;
    677 	int flags;
    678 	int bir, bar, err, off, table_nentry;
    679 	char pic_name_buf[MSIPICNAMEBUF];
    680 
    681 	table_nentry = pci_msix_count(pa->pa_pc, pa->pa_tag);
    682 	if (table_nentry == 0) {
    683 		DPRINTF(("MSI-X table entry is 0.\n"));
    684 		return NULL;
    685 	}
    686 
    687 	pc = pa->pa_pc;
    688 	tag = pa->pa_tag;
    689 	if (pci_get_capability(pc, tag, PCI_CAP_MSIX, &off, NULL) == 0) {
    690 		DPRINTF(("%s: no msix capability", __func__));
    691 		return NULL;
    692 	}
    693 
    694 	msix_pic = msipic_construct_common_msi_pic(pa, &msix_pic_tmpl);
    695 	if (msix_pic == NULL) {
    696 		DPRINTF(("cannot allocate MSI-X pic.\n"));
    697 		return NULL;
    698 	}
    699 
    700 	memset(pic_name_buf, 0, MSIPICNAMEBUF);
    701 	snprintf(pic_name_buf, MSIPICNAMEBUF, "msix%d",
    702 	    msix_pic->pic_msipic->mp_devid);
    703 	strncpy(msix_pic->pic_msipic->mp_pic_name, pic_name_buf,
    704 	    MSIPICNAMEBUF - 1);
    705 	msix_pic->pic_name = msix_pic->pic_msipic->mp_pic_name;
    706 
    707 	tbl = pci_conf_read(pc, tag, off + PCI_MSIX_TBLOFFSET);
    708 	table_offset = tbl & PCI_MSIX_TBLOFFSET_MASK;
    709 	bir = tbl & PCI_MSIX_TBLBIR_MASK;
    710 	switch (bir) {
    711 	case 0:
    712 		bar = PCI_BAR0;
    713 		break;
    714 	case 1:
    715 		bar = PCI_BAR1;
    716 		break;
    717 	case 2:
    718 		bar = PCI_BAR2;
    719 		break;
    720 	case 3:
    721 		bar = PCI_BAR3;
    722 		break;
    723 	case 4:
    724 		bar = PCI_BAR4;
    725 		break;
    726 	case 5:
    727 		bar = PCI_BAR5;
    728 		break;
    729 	default:
    730 		aprint_error("detect an illegal device! "
    731 		    "The device use reserved BIR values.\n");
    732 		msipic_destruct_common_msi_pic(msix_pic);
    733 		return NULL;
    734 	}
    735 	memtype = pci_mapreg_type(pc, tag, bar);
    736 	/*
    737 	 * PCI_MSIX_TABLE_ENTRY_SIZE consists below
    738 	 *     - Vector Control (32bit)
    739 	 *     - Message Data (32bit)
    740 	 *     - Message Upper Address (32bit)
    741 	 *     - Message Lower Address (32bit)
    742 	 */
    743 	table_size = table_nentry * PCI_MSIX_TABLE_ENTRY_SIZE;
    744 #if 0
    745 	err = pci_mapreg_submap(pa, bar, memtype, BUS_SPACE_MAP_LINEAR,
    746 	    roundup(table_size, PAGE_SIZE), table_offset,
    747 	    &bstag, &bshandle, NULL, &bssize);
    748 #else
    749 	/*
    750 	 * Workaround for PCI prefetchable bit. Some chips (e.g. Intel 82599)
    751 	 * report SERR and MSI-X doesn't work. This problem might not be the
    752 	 * driver's bug but our PCI common part or VMs' bug. Until we find a
    753 	 * real reason, we ignore the prefetchable bit.
    754 	 */
    755 	if (pci_mapreg_info(pa->pa_pc, pa->pa_tag, bar, memtype,
    756 		&memaddr, NULL, &flags) != 0) {
    757 		DPRINTF(("cannot get a map info.\n"));
    758 		msipic_destruct_common_msi_pic(msix_pic);
    759 		return NULL;
    760 	}
    761 	if ((flags & BUS_SPACE_MAP_PREFETCHABLE) != 0) {
    762 		DPRINTF(( "clear prefetchable bit\n"));
    763 		flags &= ~BUS_SPACE_MAP_PREFETCHABLE;
    764 	}
    765 	bssize = roundup(table_size, PAGE_SIZE);
    766 	err = _x86_memio_map(pa->pa_memt, memaddr + table_offset, bssize, flags,
    767 	    &bshandle);
    768 	bstag = pa->pa_memt;
    769 #endif
    770 	if (err) {
    771 		DPRINTF(("cannot map msix table.\n"));
    772 		msipic_destruct_common_msi_pic(msix_pic);
    773 		return NULL;
    774 	}
    775 	msix_pic->pic_msipic->mp_bstag = bstag;
    776 	msix_pic->pic_msipic->mp_bshandle = bshandle;
    777 	msix_pic->pic_msipic->mp_bssize = bssize;
    778 	msix_pic->pic_msipic->mp_i.mp_table_base = memaddr;
    779 
    780 	return msix_pic;
    781 }
    782 
    783 /*
    784  * Delete pseudo pic for a MSI-X device.
    785  */
    786 void
    787 msipic_destruct_msix_pic(struct pic *msix_pic)
    788 {
    789 	struct msipic *msipic;
    790 
    791 	KASSERT(msipic_is_msi_pic(msix_pic));
    792 	KASSERT(msix_pic->pic_type == PIC_MSIX);
    793 
    794 	msipic = msix_pic->pic_msipic;
    795 	_x86_memio_unmap(msipic->mp_bstag, msipic->mp_bshandle,
    796 	    msipic->mp_bssize, NULL);
    797 
    798 	msipic_destruct_common_msi_pic(msix_pic);
    799 }
    800 
    801 /*
    802  * Set the number of MSI vectors for pseudo MSI pic.
    803  */
    804 int
    805 msipic_set_msi_vectors(struct pic *msi_pic, pci_intr_handle_t *pihs,
    806     int count)
    807 {
    808 
    809 	KASSERT(msipic_is_msi_pic(msi_pic));
    810 
    811 	if (msi_pic->pic_type == PIC_MSI) {
    812 		pci_chipset_tag_t pc;
    813 		struct pci_attach_args *pa;
    814 		pcitag_t tag;
    815 		int off, err __diagused;
    816 		pcireg_t ctl;
    817 
    818 		pc = NULL;
    819 		pa = &msi_pic->pic_msipic->mp_pa;
    820 		tag = pa->pa_tag;
    821 		err = pci_get_capability(pc, tag, PCI_CAP_MSI, &off, NULL);
    822 		KASSERT(err != 0);
    823 
    824 		ctl = pci_conf_read(pc, tag, off + PCI_MSI_CTL);
    825 		ctl &= ~PCI_MSI_CTL_MME_MASK;
    826 		ctl |= __SHIFTIN(ilog2(count), PCI_MSI_CTL_MME_MASK);
    827 		pci_conf_write(pc, tag, off + PCI_MSI_CTL, ctl);
    828 	}
    829 
    830 	msi_pic->pic_msipic->mp_i.mp_veccnt = count;
    831 #ifdef XENPV
    832 	msi_pic->pic_msipic->mp_i.mp_xen_pirq =
    833 	    kmem_zalloc(sizeof(*msi_pic->pic_msipic->mp_i.mp_xen_pirq) * count,
    834 	    KM_SLEEP);
    835 #endif
    836 	return 0;
    837 }
    838 
    839 /*
    840  * Initialize the system to use MSI/MSI-X.
    841  */
    842 void
    843 msipic_init(void)
    844 {
    845 
    846 	mutex_init(&msipic_list_lock, MUTEX_DEFAULT, IPL_NONE);
    847 }
    848