      1 /*      $NetBSD: if_xennet_xenbus.c,v 1.132 2025/08/28 05:59:16 mrg Exp $      */
      2 
      3 /*
      4  * Copyright (c) 2006 Manuel Bouyer.
      5  *
      6  * Redistribution and use in source and binary forms, with or without
      7  * modification, are permitted provided that the following conditions
      8  * are met:
      9  * 1. Redistributions of source code must retain the above copyright
     10  *    notice, this list of conditions and the following disclaimer.
     11  * 2. Redistributions in binary form must reproduce the above copyright
     12  *    notice, this list of conditions and the following disclaimer in the
     13  *    documentation and/or other materials provided with the distribution.
     14  *
     15  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
     16  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
     17  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
     18  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
     19  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
     20  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
     21  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
     22  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     23  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
     24  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     25  */
     26 
     27 /*
     28  * Copyright (c) 2004 Christian Limpach.
     29  * All rights reserved.
     30  *
     31  * Redistribution and use in source and binary forms, with or without
     32  * modification, are permitted provided that the following conditions
     33  * are met:
     34  * 1. Redistributions of source code must retain the above copyright
     35  *    notice, this list of conditions and the following disclaimer.
     36  * 2. Redistributions in binary form must reproduce the above copyright
     37  *    notice, this list of conditions and the following disclaimer in the
     38  *    documentation and/or other materials provided with the distribution.
     39  *
     40  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
     41  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
     42  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
     43  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
     44  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
     45  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
     46  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
     47  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     48  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
     49  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     50  */
     51 
     52 /*
     53  * This file contains the xennet frontend code required for the network
     54  * communication between two Xen domains.
      55  * It resembles xbd, but is a little more complex as it must deal with two
     56  * rings:
     57  * - the TX ring, to transmit packets to backend (inside => outside)
     58  * - the RX ring, to receive packets from backend (outside => inside)
     59  *
      60  * The principles are as follows.
     61  *
     62  * For TX:
      63  * The purpose is to transmit packets to the outside. The entry point is
      64  * xennet_start() (the output routine of xennet), scheduled via a softint.
      65  * xennet_start() generates the requests associated with the queued
      66  * TX mbufs (see altq(9)).
      67  * The backend's responses are processed by xennet_tx_complete(), called
      68  * from xennet_start().
     69  *
      70  * For RX:
      71  * The purpose is to process the packets received from the outside. RX
      72  * buffers are pre-allocated by xennet_alloc_rx_buffer() during xennet
      73  * autoconf attach. During pre-allocation, the frontend pushes requests into
      74  * the I/O ring, in preparation for incoming packets from the backend.
      75  * When RX packets need to be processed, the backend takes the requests
      76  * previously offered by the frontend and pushes the associated responses
      77  * into the I/O ring. When done, it notifies the frontend through an event
      78  * notification, which asynchronously calls xennet_handler() in the frontend.
      79  * xennet_handler() processes the responses, generates the associated mbufs,
      80  * and passes them to the MI layer for further processing.
     81  */
     82 
     83 #include <sys/cdefs.h>
     84 __KERNEL_RCSID(0, "$NetBSD: if_xennet_xenbus.c,v 1.132 2025/08/28 05:59:16 mrg Exp $");
     85 
     86 #include "opt_xen.h"
     87 #include "opt_nfs_boot.h"
     88 #include "opt_net_mpsafe.h"
     89 
     90 #include <sys/param.h>
     91 #include <sys/device.h>
     92 #include <sys/conf.h>
     93 #include <sys/kernel.h>
     94 #include <sys/proc.h>
     95 #include <sys/sysctl.h>
     96 #include <sys/systm.h>
     97 #include <sys/intr.h>
     98 #include <sys/rndsource.h>
     99 
    100 #include <net/if.h>
    101 #include <net/if_dl.h>
    102 #include <net/if_ether.h>
    103 #include <net/bpf.h>
    104 
    105 #if defined(NFS_BOOT_BOOTSTATIC)
    106 #include <sys/fstypes.h>
    107 #include <sys/mount.h>
    108 #include <sys/statvfs.h>
    109 #include <netinet/in.h>
    110 #include <nfs/rpcv2.h>
    111 #include <nfs/nfsproto.h>
    112 #include <nfs/nfs.h>
    113 #include <nfs/nfsmount.h>
    114 #include <nfs/nfsdiskless.h>
    115 #include <xen/if_xennetvar.h>
    116 #endif /* defined(NFS_BOOT_BOOTSTATIC) */
    117 
    118 #include <xen/xennet_checksum.h>
    119 
    120 #include <uvm/uvm.h>
    121 
    122 #include <xen/intr.h>
    123 #include <xen/hypervisor.h>
    124 #include <xen/evtchn.h>
    125 #include <xen/granttables.h>
    126 #include <xen/include/public/io/netif.h>
    127 #include <xen/xenpmap.h>
    128 
    129 #include <xen/xenbus.h>
    130 #include "locators.h"
    131 
    132 #undef XENNET_DEBUG_DUMP
    133 #undef XENNET_DEBUG
    134 
    135 #ifdef XENNET_DEBUG
    136 #define XEDB_FOLLOW     0x01
    137 #define XEDB_INIT       0x02
    138 #define XEDB_EVENT      0x04
    139 #define XEDB_MBUF       0x08
    140 #define XEDB_MEM        0x10
    141 int xennet_debug = 0xff;
    142 #define DPRINTF(x) if (xennet_debug) printf x;
    143 #define DPRINTFN(n,x) if (xennet_debug & (n)) printf x;
    144 #else
    145 #define DPRINTF(x)
    146 #define DPRINTFN(n,x)
    147 #endif
    148 
    149 #define GRANT_INVALID_REF -1 /* entry is free */
    150 
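         /*
          * The ring sizes are derived from the shared page size by the Xen
          * ring macros: __CONST_RING_SIZE() computes how many request/response
          * union slots fit in one page, rounded down to a power of two
          * (256 slots per ring with 4 KB pages).
          */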
    151 #define NET_TX_RING_SIZE __CONST_RING_SIZE(netif_tx, PAGE_SIZE)
    152 #define NET_RX_RING_SIZE __CONST_RING_SIZE(netif_rx, PAGE_SIZE)
    153 
    154 struct xennet_txreq {
    155 	SLIST_ENTRY(xennet_txreq) txreq_next;
    156 	uint16_t txreq_id; /* ID passed to backend */
    157 	grant_ref_t txreq_gntref; /* grant ref of this request */
    158 	struct mbuf *txreq_m; /* mbuf being transmitted */
    159 	bus_dmamap_t txreq_dmamap;
    160 };
    161 
    162 struct xennet_rxreq {
    163 	SLIST_ENTRY(xennet_rxreq) rxreq_next;
    164 	uint16_t rxreq_id; /* ID passed to backend */
    165 	grant_ref_t rxreq_gntref; /* grant ref of this request */
    166 	struct mbuf *rxreq_m;
    167 	bus_dmamap_t rxreq_dmamap;
    168 };
    169 
    170 struct xennet_xenbus_softc {
    171 	device_t sc_dev;
    172 	struct ethercom sc_ethercom;
    173 	uint8_t sc_enaddr[ETHER_ADDR_LEN];
    174 	struct xenbus_device *sc_xbusd;
    175 
    176 	netif_tx_front_ring_t sc_tx_ring;
    177 	netif_rx_front_ring_t sc_rx_ring;
    178 
    179 	unsigned int sc_evtchn;
    180 	struct intrhand *sc_ih;
    181 
    182 	grant_ref_t sc_tx_ring_gntref;
    183 	grant_ref_t sc_rx_ring_gntref;
    184 
    185 	kmutex_t sc_tx_lock; /* protects free TX list, TX ring */
    186 	kmutex_t sc_rx_lock; /* protects free RX list, RX ring, rxreql */
    187 	struct xennet_txreq sc_txreqs[NET_TX_RING_SIZE];
    188 	struct xennet_rxreq sc_rxreqs[NET_RX_RING_SIZE];
    189 	SLIST_HEAD(,xennet_txreq) sc_txreq_head; /* list of free TX requests */
    190 	SLIST_HEAD(,xennet_rxreq) sc_rxreq_head; /* list of free RX requests */
    191 	int sc_free_txreql; /* number of free transmit request structs */
    192 	int sc_free_rxreql; /* number of free receive request structs */
    193 
    194 	int sc_backend_status; /* our status with backend */
    195 #define BEST_CLOSED		0
    196 #define BEST_DISCONNECTED	1
    197 #define BEST_CONNECTED		2
    198 #define BEST_SUSPENDED		3
    199 	int sc_features;
    200 #define FEATURE_IPV6CSUM	0x01	/* IPv6 checksum offload */
     201 #define FEATURE_SG		0x02	/* scatter-gather */
    202 #define FEATURE_RX_COPY		0x04	/* RX-copy */
    203 #define FEATURE_BITS		"\20\1IPV6-CSUM\2SG\3RX-COPY"
    204 	krndsource_t sc_rnd_source;
    205 	struct evcnt sc_cnt_tx_defrag;
    206 	struct evcnt sc_cnt_tx_queue_full;
    207 	struct evcnt sc_cnt_tx_drop;
    208 	struct evcnt sc_cnt_tx_frag;
    209 	struct evcnt sc_cnt_rx_frag;
    210 	struct evcnt sc_cnt_rx_cksum_blank;
    211 	struct evcnt sc_cnt_rx_cksum_undefer;
    212 };
    213 
    214 
    215 /*
    216  * infrastructure for the sysctl variable: hw.xennet.xnfrx_lowat
    217  */
    218 
     219 static struct sysctllog	*xennet_log;
     220 static int xennet_xnfrx_nodenum;
    221 
    222 
    223 #ifndef XENNET_XNFRX_LOWAT
     224 #define XENNET_XNFRX_LOWAT 0 /* default minimum number of xnfrx buffers to keep */
    225 #endif
    226 
     227 #define IF_XNFRX_LOWAT_MAX 128 /* Maximum xnfrx low-water mark */
    228 
    229 static int if_xnfrx_lowat = XENNET_XNFRX_LOWAT;
    230 
    231 static pool_cache_t if_xennetrxbuf_cache;
    232 static int if_xennetrxbuf_cache_inited = 0;
    233 
    234 static int  xennet_xenbus_match(device_t, cfdata_t, void *);
    235 static void xennet_xenbus_attach(device_t, device_t, void *);
    236 static int  xennet_xenbus_detach(device_t, int);
    237 static void xennet_backend_changed(void *, XenbusState);
    238 
    239 static void xennet_alloc_rx_buffer(struct xennet_xenbus_softc *);
    240 static void xennet_free_rx_buffer(struct xennet_xenbus_softc *, bool);
    241 static void xennet_tx_complete(struct xennet_xenbus_softc *);
    242 static void xennet_rx_mbuf_free(struct mbuf *, void *, size_t, void *);
    243 static int  xennet_handler(void *);
    244 static bool xennet_talk_to_backend(struct xennet_xenbus_softc *);
    245 #ifdef XENNET_DEBUG_DUMP
    246 static void xennet_hex_dump(const unsigned char *, size_t, const char *, int);
    247 #endif
    248 
    249 static void xennet_sysctl_init(struct xennet_xenbus_softc *);
    250 static int xennet_sysctl_verify(SYSCTLFN_PROTO);
    251 static int  xennet_init(struct ifnet *);
    252 static void xennet_stop(struct ifnet *, int);
    253 static void xennet_start(struct ifnet *);
    254 static int  xennet_ioctl(struct ifnet *, u_long, void *);
    255 
    256 static bool xennet_xenbus_suspend(device_t dev, const pmf_qual_t *);
    257 static bool xennet_xenbus_resume(device_t dev, const pmf_qual_t *);
    258 
    259 CFATTACH_DECL3_NEW(xennet, sizeof(struct xennet_xenbus_softc),
    260    xennet_xenbus_match, xennet_xenbus_attach, xennet_xenbus_detach, NULL,
    261    NULL, NULL, DVF_DETACH_SHUTDOWN);
    262 
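         /*
          * Autoconf match: accept xenbus devices of type "vif", optionally
          * restricted to a specific id by the kernel config locator.
          */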
    263 static int
    264 xennet_xenbus_match(device_t parent, cfdata_t match, void *aux)
    265 {
    266 	struct xenbusdev_attach_args *xa = aux;
    267 
    268 	if (strcmp(xa->xa_type, "vif") != 0)
    269 		return 0;
    270 
    271 	if (match->cf_loc[XENBUSCF_ID] != XENBUSCF_ID_DEFAULT &&
    272 	    match->cf_loc[XENBUSCF_ID] != xa->xa_id)
    273 		return 0;
    274 
    275 	return 1;
    276 }
    277 
    278 static void
    279 xennet_xenbus_attach(device_t parent, device_t self, void *aux)
    280 {
    281 	struct xennet_xenbus_softc *sc = device_private(self);
    282 	struct xenbusdev_attach_args *xa = aux;
    283 	struct ifnet *ifp = &sc->sc_ethercom.ec_if;
    284 	int err;
    285 	netif_tx_sring_t *tx_ring;
    286 	netif_rx_sring_t *rx_ring;
    287 	RING_IDX i;
    288 	char *e, *p;
    289 	unsigned long uval;
    290 	extern int ifqmaxlen; /* XXX */
    291 	char mac[32];
    292 	char buf[64];
    293 	bus_size_t maxsz;
    294 	int nsegs;
    295 
    296 	aprint_normal(": Xen Virtual Network Interface\n");
    297 	sc->sc_dev = self;
    298 
    299 	sc->sc_xbusd = xa->xa_xbusd;
    300 	sc->sc_xbusd->xbusd_otherend_changed = xennet_backend_changed;
    301 
    302 	/* read feature support flags */
    303 	err = xenbus_read_ul(NULL, sc->sc_xbusd->xbusd_otherend,
    304 	    "feature-rx-copy", &uval, 10);
    305 	if (!err && uval == 1)
    306 		sc->sc_features |= FEATURE_RX_COPY;
    307 	err = xenbus_read_ul(NULL, sc->sc_xbusd->xbusd_otherend,
    308 	    "feature-ipv6-csum-offload", &uval, 10);
    309 	if (!err && uval == 1)
    310 		sc->sc_features |= FEATURE_IPV6CSUM;
    311 	err = xenbus_read_ul(NULL, sc->sc_xbusd->xbusd_otherend,
    312 	    "feature-sg", &uval, 10);
    313 	if (!err && uval == 1)
    314 		sc->sc_features |= FEATURE_SG;
    315 	snprintb(buf, sizeof(buf), FEATURE_BITS, sc->sc_features);
    316 	aprint_normal_dev(sc->sc_dev, "backend features %s\n", buf);
    317 
     318 	/* xenbus ensures two devices can't be probed at the same time */
    319 	if (if_xennetrxbuf_cache_inited == 0) {
    320 		if_xennetrxbuf_cache = pool_cache_init(PAGE_SIZE, 0, 0, 0,
    321 		    "xnfrx", NULL, IPL_NET, NULL, NULL, NULL);
    322 		if (if_xnfrx_lowat) {
    323 			pool_cache_setlowat(if_xennetrxbuf_cache, if_xnfrx_lowat);
    324 		}
    325 		xennet_sysctl_init(sc);
    326 		if_xennetrxbuf_cache_inited = 1;
    327 	}
    328 
     329 	/* initialize free TX and RX request lists */
    330 	if (sc->sc_features & FEATURE_SG) {
    331 		maxsz = ETHER_MAX_LEN_JUMBO;
    332 		/*
    333 		 * Linux netback drops the packet if the request has more
    334 		 * segments than XEN_NETIF_NR_SLOTS_MIN (== 18). With 2KB
     335 		 * MCLBYTES this means a maximum packet size of 36KB, in reality
    336 		 * less due to mbuf chain fragmentation.
    337 		 */
    338 		nsegs = XEN_NETIF_NR_SLOTS_MIN;
    339 	} else {
    340 		maxsz = PAGE_SIZE;
    341 		nsegs = 1;
    342 	}
    343 	mutex_init(&sc->sc_tx_lock, MUTEX_DEFAULT, IPL_NET);
    344 	SLIST_INIT(&sc->sc_txreq_head);
    345 	for (i = 0; i < NET_TX_RING_SIZE; i++) {
    346 		struct xennet_txreq *txreq = &sc->sc_txreqs[i];
    347 
    348 		txreq->txreq_id = i;
    349 		if (bus_dmamap_create(sc->sc_xbusd->xbusd_dmat, maxsz, nsegs,
    350 		    PAGE_SIZE, PAGE_SIZE, BUS_DMA_WAITOK | BUS_DMA_ALLOCNOW,
    351 		    &txreq->txreq_dmamap) != 0)
    352 			break;
    353 
    354 		SLIST_INSERT_HEAD(&sc->sc_txreq_head, &sc->sc_txreqs[i],
    355 		    txreq_next);
    356 	}
    357 	sc->sc_free_txreql = i;
    358 
    359 	mutex_init(&sc->sc_rx_lock, MUTEX_DEFAULT, IPL_NET);
    360 	SLIST_INIT(&sc->sc_rxreq_head);
    361 	for (i = 0; i < NET_RX_RING_SIZE; i++) {
    362 		struct xennet_rxreq *rxreq = &sc->sc_rxreqs[i];
    363 		rxreq->rxreq_id = i;
    364 		if (bus_dmamap_create(sc->sc_xbusd->xbusd_dmat, maxsz, nsegs,
    365 		    PAGE_SIZE, PAGE_SIZE, BUS_DMA_WAITOK | BUS_DMA_ALLOCNOW,
    366 		    &rxreq->rxreq_dmamap) != 0)
    367 			break;
    368 		rxreq->rxreq_gntref = GRANT_INVALID_REF;
    369 		SLIST_INSERT_HEAD(&sc->sc_rxreq_head, rxreq, rxreq_next);
    370 	}
    371 	sc->sc_free_rxreql = i;
    372 	if (sc->sc_free_rxreql == 0) {
    373 		aprint_error_dev(self, "failed to allocate rx memory\n");
    374 		return;
    375 	}
    376 
    377 	/* read mac address */
    378 	err = xenbus_read(NULL, sc->sc_xbusd->xbusd_path, "mac",
    379 	    mac, sizeof(mac));
    380 	if (err) {
    381 		aprint_error_dev(self, "can't read mac address, err %d\n", err);
    382 		return;
    383 	}
    384 	for (i = 0, p = mac; i < ETHER_ADDR_LEN; i++) {
    385 		sc->sc_enaddr[i] = strtoul(p, &e, 16);
    386 		if ((e[0] == '\0' && i != 5) && e[0] != ':') {
    387 			aprint_error_dev(self,
    388 			    "%s is not a valid mac address\n", mac);
    389 			return;
    390 		}
    391 		p = &e[1];
    392 	}
    393 	aprint_normal_dev(self, "MAC address %s\n",
    394 	    ether_sprintf(sc->sc_enaddr));
    395 
    396 	/* Initialize ifnet structure and attach interface */
    397 	strlcpy(ifp->if_xname, device_xname(self), IFNAMSIZ);
    398 	sc->sc_ethercom.ec_capabilities |= ETHERCAP_VLAN_MTU;
    399 	if (sc->sc_features & FEATURE_SG)
    400 		sc->sc_ethercom.ec_capabilities |= ETHERCAP_JUMBO_MTU;
    401 	ifp->if_softc = sc;
    402 	ifp->if_start = xennet_start;
    403 	ifp->if_ioctl = xennet_ioctl;
    404 	ifp->if_init = xennet_init;
    405 	ifp->if_stop = xennet_stop;
    406 	ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
    407 	ifp->if_extflags = IFEF_MPSAFE;
    408 	ifp->if_snd.ifq_maxlen = uimax(ifqmaxlen, NET_TX_RING_SIZE * 2);
    409 	ifp->if_capabilities =
    410 		IFCAP_CSUM_UDPv4_Rx | IFCAP_CSUM_UDPv4_Tx
    411 		| IFCAP_CSUM_TCPv4_Rx | IFCAP_CSUM_TCPv4_Tx
    412 		| IFCAP_CSUM_UDPv6_Rx
    413 		| IFCAP_CSUM_TCPv6_Rx;
    414 #define XN_M_CSUM_SUPPORTED						\
    415 	(M_CSUM_TCPv4 | M_CSUM_UDPv4 | M_CSUM_TCPv6 | M_CSUM_UDPv6)
    416 
    417 	if (sc->sc_features & FEATURE_IPV6CSUM) {
    418 		/*
    419 		 * If backend supports IPv6 csum offloading, we can skip
    420 		 * IPv6 csum for Tx packets. Rx packet validation can
    421 		 * be skipped regardless.
    422 		 */
    423 		ifp->if_capabilities |=
    424 		    IFCAP_CSUM_UDPv6_Tx | IFCAP_CSUM_TCPv6_Tx;
    425 	}
    426 
    427 	IFQ_SET_MAXLEN(&ifp->if_snd, uimax(2 * NET_TX_RING_SIZE, IFQ_MAXLEN));
    428 	IFQ_SET_READY(&ifp->if_snd);
    429 	if_attach(ifp);
    430 	if_deferred_start_init(ifp, NULL);
    431 	ether_ifattach(ifp, sc->sc_enaddr);
    432 
    433 	/* alloc shared rings */
    434 	tx_ring = (void *)uvm_km_alloc(kernel_map, PAGE_SIZE, 0,
    435 	    UVM_KMF_WIRED);
    436 	rx_ring = (void *)uvm_km_alloc(kernel_map, PAGE_SIZE, 0,
    437 	    UVM_KMF_WIRED);
    438 	if (tx_ring == NULL || rx_ring == NULL)
    439 		panic("%s: can't alloc rings", device_xname(self));
    440 
    441 	sc->sc_tx_ring.sring = tx_ring;
    442 	sc->sc_rx_ring.sring = rx_ring;
    443 
    444 	rnd_attach_source(&sc->sc_rnd_source, device_xname(sc->sc_dev),
    445 	    RND_TYPE_NET, RND_FLAG_DEFAULT);
    446 
    447 	evcnt_attach_dynamic(&sc->sc_cnt_tx_defrag, EVCNT_TYPE_MISC,
    448 	    NULL, device_xname(sc->sc_dev), "Tx packet defrag");
    449 	evcnt_attach_dynamic(&sc->sc_cnt_tx_frag, EVCNT_TYPE_MISC,
    450 	    NULL, device_xname(sc->sc_dev), "Tx multi-segment packet");
    451 	evcnt_attach_dynamic(&sc->sc_cnt_tx_drop, EVCNT_TYPE_MISC,
    452 	    NULL, device_xname(sc->sc_dev), "Tx packet dropped");
    453 	evcnt_attach_dynamic(&sc->sc_cnt_tx_queue_full, EVCNT_TYPE_MISC,
    454 	    NULL, device_xname(sc->sc_dev), "Tx queue full");
    455 	evcnt_attach_dynamic(&sc->sc_cnt_rx_frag, EVCNT_TYPE_MISC,
    456 	    NULL, device_xname(sc->sc_dev), "Rx multi-segment packet");
    457 	evcnt_attach_dynamic(&sc->sc_cnt_rx_cksum_blank, EVCNT_TYPE_MISC,
    458 	    NULL, device_xname(sc->sc_dev), "Rx csum blank");
    459 	evcnt_attach_dynamic(&sc->sc_cnt_rx_cksum_undefer, EVCNT_TYPE_MISC,
    460 	    NULL, device_xname(sc->sc_dev), "Rx csum undeferred");
    461 
    462 	if (!pmf_device_register(self, xennet_xenbus_suspend,
    463 	    xennet_xenbus_resume))
    464 		aprint_error_dev(self, "couldn't establish power handler\n");
    465 	else
    466 		pmf_class_network_register(self, ifp);
    467 
    468 	/* resume shared structures and tell backend that we are ready */
    469 	if (xennet_xenbus_resume(self, PMF_Q_NONE) == false) {
    470 		uvm_km_free(kernel_map, (vaddr_t)tx_ring, PAGE_SIZE,
    471 		    UVM_KMF_WIRED);
    472 		uvm_km_free(kernel_map, (vaddr_t)rx_ring, PAGE_SIZE,
    473 		    UVM_KMF_WIRED);
    474 		return;
    475 	}
    476 }
    477 
    478 static int
    479 xennet_xenbus_detach(device_t self, int flags)
    480 {
    481 	struct xennet_xenbus_softc *sc = device_private(self);
    482 	struct ifnet *ifp = &sc->sc_ethercom.ec_if;
    483 
    484 	if ((flags & (DETACH_SHUTDOWN | DETACH_FORCE)) == DETACH_SHUTDOWN) {
    485 		/* Trigger state transition with backend */
    486 		xenbus_switch_state(sc->sc_xbusd, NULL, XenbusStateClosing);
    487 		return EBUSY;
    488 	}
    489 
    490 	DPRINTF(("%s: xennet_xenbus_detach\n", device_xname(self)));
    491 
    492 	/* stop interface */
    493 	IFNET_LOCK(ifp);
    494 	xennet_stop(ifp, 1);
    495 	IFNET_UNLOCK(ifp);
    496 	if (sc->sc_ih != NULL) {
    497 		xen_intr_disestablish(sc->sc_ih);
    498 		sc->sc_ih = NULL;
    499 	}
    500 
    501 	/* collect any outstanding TX responses */
    502 	mutex_enter(&sc->sc_tx_lock);
    503 	xennet_tx_complete(sc);
    504 	while (sc->sc_tx_ring.sring->rsp_prod != sc->sc_tx_ring.rsp_cons) {
    505 		kpause("xndetach", true, hz/2, &sc->sc_tx_lock);
    506 		xennet_tx_complete(sc);
    507 	}
    508 	mutex_exit(&sc->sc_tx_lock);
    509 
    510 	mutex_enter(&sc->sc_rx_lock);
    511 	xennet_free_rx_buffer(sc, true);
    512 	mutex_exit(&sc->sc_rx_lock);
    513 
    514 	ether_ifdetach(ifp);
    515 	if_detach(ifp);
    516 
    517 	evcnt_detach(&sc->sc_cnt_tx_defrag);
    518 	evcnt_detach(&sc->sc_cnt_tx_frag);
    519 	evcnt_detach(&sc->sc_cnt_tx_drop);
    520 	evcnt_detach(&sc->sc_cnt_tx_queue_full);
    521 	evcnt_detach(&sc->sc_cnt_rx_frag);
    522 	evcnt_detach(&sc->sc_cnt_rx_cksum_blank);
    523 	evcnt_detach(&sc->sc_cnt_rx_cksum_undefer);
    524 
    525 	/* Unhook the entropy source. */
    526 	rnd_detach_source(&sc->sc_rnd_source);
    527 
    528 	/* Wait until the tx/rx rings stop being used by backend */
    529 	mutex_enter(&sc->sc_tx_lock);
    530 	while (xengnt_status(sc->sc_tx_ring_gntref))
    531 		kpause("xntxref", true, hz/2, &sc->sc_tx_lock);
    532 	xengnt_revoke_access(sc->sc_tx_ring_gntref);
    533 	mutex_exit(&sc->sc_tx_lock);
    534 	uvm_km_free(kernel_map, (vaddr_t)sc->sc_tx_ring.sring, PAGE_SIZE,
    535 	    UVM_KMF_WIRED);
    536 	mutex_enter(&sc->sc_rx_lock);
    537 	while (xengnt_status(sc->sc_rx_ring_gntref))
    538 		kpause("xnrxref", true, hz/2, &sc->sc_rx_lock);
    539 	xengnt_revoke_access(sc->sc_rx_ring_gntref);
    540 	mutex_exit(&sc->sc_rx_lock);
    541 	uvm_km_free(kernel_map, (vaddr_t)sc->sc_rx_ring.sring, PAGE_SIZE,
    542 	    UVM_KMF_WIRED);
    543 
    544 	pmf_device_deregister(self);
    545 
    546 	sc->sc_backend_status = BEST_DISCONNECTED;
    547 
    548 	DPRINTF(("%s: xennet_xenbus_detach done\n", device_xname(self)));
    549 	return 0;
    550 }
    551 
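         /*
          * pmf(9) resume hook, also called once from attach: (re)initialize
          * the shared TX/RX rings, grant them to the backend, bind a fresh
          * event channel and refill the RX ring before advertising
          * XenbusStateInitialised to the backend.
          */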
    552 static bool
    553 xennet_xenbus_resume(device_t dev, const pmf_qual_t *qual)
    554 {
    555 	struct xennet_xenbus_softc *sc = device_private(dev);
    556 	int error;
    557 	netif_tx_sring_t *tx_ring;
    558 	netif_rx_sring_t *rx_ring;
    559 	paddr_t ma;
    560 
    561 	/* All grants were removed during suspend */
    562 	sc->sc_tx_ring_gntref = GRANT_INVALID_REF;
    563 	sc->sc_rx_ring_gntref = GRANT_INVALID_REF;
    564 
    565 	mutex_enter(&sc->sc_rx_lock);
    566 	/* Free but don't revoke, the grant is gone */
    567 	xennet_free_rx_buffer(sc, false);
     568 	KASSERT(sc->sc_free_rxreql == NET_RX_RING_SIZE);
    569 	mutex_exit(&sc->sc_rx_lock);
    570 
    571 	tx_ring = sc->sc_tx_ring.sring;
    572 	rx_ring = sc->sc_rx_ring.sring;
    573 
    574 	/* Initialize rings */
    575 	memset(tx_ring, 0, PAGE_SIZE);
    576 	SHARED_RING_INIT(tx_ring);
    577 	FRONT_RING_INIT(&sc->sc_tx_ring, tx_ring, PAGE_SIZE);
    578 
    579 	memset(rx_ring, 0, PAGE_SIZE);
    580 	SHARED_RING_INIT(rx_ring);
    581 	FRONT_RING_INIT(&sc->sc_rx_ring, rx_ring, PAGE_SIZE);
    582 
    583 	(void)pmap_extract_ma(pmap_kernel(), (vaddr_t)tx_ring, &ma);
    584 	error = xenbus_grant_ring(sc->sc_xbusd, ma, &sc->sc_tx_ring_gntref);
    585 	if (error)
    586 		goto abort_resume;
    587 	(void)pmap_extract_ma(pmap_kernel(), (vaddr_t)rx_ring, &ma);
    588 	error = xenbus_grant_ring(sc->sc_xbusd, ma, &sc->sc_rx_ring_gntref);
    589 	if (error)
    590 		goto abort_resume;
    591 
    592 	if (sc->sc_ih != NULL) {
    593 		xen_intr_disestablish(sc->sc_ih);
    594 		sc->sc_ih = NULL;
    595 	}
    596 	error = xenbus_alloc_evtchn(sc->sc_xbusd, &sc->sc_evtchn);
    597 	if (error)
    598 		goto abort_resume;
    599 	aprint_verbose_dev(dev, "using event channel %d\n",
    600 	    sc->sc_evtchn);
    601 	sc->sc_ih = xen_intr_establish_xname(-1, &xen_pic, sc->sc_evtchn,
    602 	    IST_LEVEL, IPL_NET, &xennet_handler, sc, true, device_xname(dev));
    603 	KASSERT(sc->sc_ih != NULL);
    604 
    605 	/* Re-fill Rx ring */
    606 	mutex_enter(&sc->sc_rx_lock);
    607 	xennet_alloc_rx_buffer(sc);
    608 	KASSERT(sc->sc_free_rxreql == 0);
    609 	mutex_exit(&sc->sc_rx_lock);
    610 
    611 	xenbus_switch_state(sc->sc_xbusd, NULL, XenbusStateInitialised);
    612 
    613 	if (sc->sc_backend_status == BEST_SUSPENDED) {
    614 		if (xennet_talk_to_backend(sc)) {
    615 			xenbus_device_resume(sc->sc_xbusd);
    616 			hypervisor_unmask_event(sc->sc_evtchn);
    617 			xenbus_switch_state(sc->sc_xbusd, NULL,
    618 			    XenbusStateConnected);
    619 		}
    620 	}
    621 
    622 	return true;
    623 
    624 abort_resume:
    625 	xenbus_dev_fatal(sc->sc_xbusd, error, "resuming device");
    626 	return false;
    627 }
    628 
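         /*
          * Publish the ring references, event channel and requested features
          * to the backend in a single xenstore transaction (retried on
          * EAGAIN), then pre-fill the RX ring and mark the device connected.
          */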
    629 static bool
    630 xennet_talk_to_backend(struct xennet_xenbus_softc *sc)
    631 {
    632 	int error;
    633 	struct xenbus_transaction *xbt;
    634 	const char *errmsg;
    635 
    636 again:
    637 	xbt = xenbus_transaction_start();
    638 	if (xbt == NULL)
    639 		return false;
    640 	error = xenbus_printf(xbt, sc->sc_xbusd->xbusd_path,
    641 	    "vifname", "%s", device_xname(sc->sc_dev));
    642 	if (error) {
    643 		errmsg = "vifname";
    644 		goto abort_transaction;
    645 	}
    646 	error = xenbus_printf(xbt, sc->sc_xbusd->xbusd_path,
    647 	    "tx-ring-ref","%u", sc->sc_tx_ring_gntref);
    648 	if (error) {
    649 		errmsg = "writing tx ring-ref";
    650 		goto abort_transaction;
    651 	}
    652 	error = xenbus_printf(xbt, sc->sc_xbusd->xbusd_path,
    653 	    "rx-ring-ref","%u", sc->sc_rx_ring_gntref);
    654 	if (error) {
    655 		errmsg = "writing rx ring-ref";
    656 		goto abort_transaction;
    657 	}
    658 	error = xenbus_printf(xbt, sc->sc_xbusd->xbusd_path,
    659 	    "request-rx-copy", "%u", 1);
    660 	if (error) {
    661 		errmsg = "writing request-rx-copy";
    662 		goto abort_transaction;
    663 	}
    664 	error = xenbus_printf(xbt, sc->sc_xbusd->xbusd_path,
    665 	    "feature-rx-notify", "%u", 1);
    666 	if (error) {
    667 		errmsg = "writing feature-rx-notify";
    668 		goto abort_transaction;
    669 	}
    670 	error = xenbus_printf(xbt, sc->sc_xbusd->xbusd_path,
    671 	    "feature-ipv6-csum-offload", "%u", 1);
    672 	if (error) {
    673 		errmsg = "writing feature-ipv6-csum-offload";
    674 		goto abort_transaction;
    675 	}
    676 	error = xenbus_printf(xbt, sc->sc_xbusd->xbusd_path,
    677 	    "feature-sg", "%u", 1);
    678 	if (error) {
    679 		errmsg = "writing feature-sg";
    680 		goto abort_transaction;
    681 	}
    682 	error = xenbus_printf(xbt, sc->sc_xbusd->xbusd_path,
    683 	    "event-channel", "%u", sc->sc_evtchn);
    684 	if (error) {
    685 		errmsg = "writing event channel";
    686 		goto abort_transaction;
    687 	}
    688 	error = xenbus_transaction_end(xbt, 0);
    689 	if (error == EAGAIN)
    690 		goto again;
    691 	if (error) {
    692 		xenbus_dev_fatal(sc->sc_xbusd, error, "completing transaction");
    693 		return false;
    694 	}
    695 	mutex_enter(&sc->sc_rx_lock);
    696 	xennet_alloc_rx_buffer(sc);
    697 	mutex_exit(&sc->sc_rx_lock);
    698 
    699 	sc->sc_backend_status = BEST_CONNECTED;
    700 
    701 	return true;
    702 
    703 abort_transaction:
    704 	xenbus_transaction_end(xbt, 1);
    705 	xenbus_dev_fatal(sc->sc_xbusd, error, "%s", errmsg);
    706 	return false;
    707 }
    708 
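         /*
          * pmf(9) suspend hook: wait for all outstanding TX responses, then
          * mark the device suspended.  RX grants are intentionally left in
          * place; see the comment below.
          */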
    709 static bool
    710 xennet_xenbus_suspend(device_t dev, const pmf_qual_t *qual)
    711 {
    712 	struct xennet_xenbus_softc *sc = device_private(dev);
    713 
    714 	/*
    715 	 * xennet_stop() is called by pmf(9) before xennet_xenbus_suspend(),
    716 	 * so we do not mask event channel here
    717 	 */
    718 
    719 	mutex_enter(&sc->sc_tx_lock);
    720 
    721 	/* collect any outstanding TX responses */
    722 	xennet_tx_complete(sc);
    723 	while (sc->sc_tx_ring.sring->rsp_prod != sc->sc_tx_ring.rsp_cons) {
    724 		kpause("xnsuspend", true, hz/2, &sc->sc_tx_lock);
    725 		xennet_tx_complete(sc);
    726 	}
     727 	KASSERT(sc->sc_free_txreql == NET_TX_RING_SIZE);
    728 	mutex_exit(&sc->sc_tx_lock);
    729 
    730 	/*
    731 	 * dom0 may still use references to the grants we gave away
     732 	 * earlier during RX buffer allocation. So we do not free RX buffers
    733 	 * here, as dom0 does not expect the guest domain to suddenly revoke
    734 	 * access to these grants.
    735 	 */
    736 	sc->sc_backend_status = BEST_SUSPENDED;
    737 
    738 	xenbus_device_suspend(sc->sc_xbusd);
    739 	aprint_verbose_dev(dev, "removed event channel %d\n", sc->sc_evtchn);
    740 
    741 	return true;
    742 }
    743 
    744 static void xennet_backend_changed(void *arg, XenbusState new_state)
    745 {
    746 	struct xennet_xenbus_softc *sc = device_private((device_t)arg);
    747 	DPRINTF(("%s: new backend state %d\n",
    748 	    device_xname(sc->sc_dev), new_state));
    749 
    750 	switch (new_state) {
    751 	case XenbusStateInitialising:
    752 	case XenbusStateInitialised:
    753 	case XenbusStateConnected:
    754 		break;
    755 	case XenbusStateClosing:
    756 		sc->sc_backend_status = BEST_CLOSED;
    757 		xenbus_switch_state(sc->sc_xbusd, NULL, XenbusStateClosed);
    758 		break;
    759 	case XenbusStateInitWait:
    760 		if (sc->sc_backend_status == BEST_CONNECTED
    761 		   || sc->sc_backend_status == BEST_SUSPENDED)
    762 			break;
    763 
    764 		if (xennet_talk_to_backend(sc))
    765 			xenbus_switch_state(sc->sc_xbusd, NULL,
    766 			    XenbusStateConnected);
    767 		break;
    768 	case XenbusStateUnknown:
    769 	default:
    770 		panic("bad backend state %d", new_state);
    771 	}
    772 }
    773 
    774 /*
    775  * Allocate RX buffers and put the associated request structures
     776  * in the ring. This allows the backend to use them to communicate with
     777  * the frontend when data is destined for the frontend.
    778  */
    779 static void
    780 xennet_alloc_rx_buffer(struct xennet_xenbus_softc *sc)
    781 {
    782 	RING_IDX req_prod = sc->sc_rx_ring.req_prod_pvt;
    783 	RING_IDX i;
    784 	netif_rx_request_t *rxreq;
    785 	struct xennet_rxreq *req;
    786 	int otherend_id, notify;
    787 	struct mbuf *m;
    788 	vaddr_t va;
    789 	paddr_t pa, ma;
    790 	struct ifnet *ifp = &sc->sc_ethercom.ec_if;
    791 
    792 	KASSERT(mutex_owned(&sc->sc_rx_lock));
    793 
    794 	otherend_id = sc->sc_xbusd->xbusd_otherend_id;
    795 
    796 	for (i = 0; sc->sc_free_rxreql != 0; i++) {
    797 		req  = SLIST_FIRST(&sc->sc_rxreq_head);
    798 		KASSERT(req != NULL);
    799 		KASSERT(req == &sc->sc_rxreqs[req->rxreq_id]);
    800 		KASSERT(req->rxreq_m == NULL);
    801 		KASSERT(req->rxreq_gntref == GRANT_INVALID_REF);
    802 
    803 		MGETHDR(m, M_DONTWAIT, MT_DATA);
    804 		if (__predict_false(m == NULL)) {
    805 			printf("%s: rx no mbuf\n", ifp->if_xname);
    806 			break;
    807 		}
    808 
    809 		va = (vaddr_t)pool_cache_get_paddr(
    810 		    if_xennetrxbuf_cache, PR_NOWAIT, &pa);
    811 		if (__predict_false(va == 0)) {
    812 			printf("%s: rx no cluster\n", ifp->if_xname);
    813 			m_freem(m);
    814 			break;
    815 		}
    816 
    817 		MEXTADD(m, va, PAGE_SIZE,
    818 		    M_DEVBUF, xennet_rx_mbuf_free, NULL);
    819 		m->m_len = m->m_pkthdr.len = PAGE_SIZE;
    820 		m->m_ext.ext_paddr = pa;
    821 		m->m_flags |= M_EXT_RW; /* we own the buffer */
    822 
    823 		/* Set M_EXT_CLUSTER so that load_mbuf uses m_ext.ext_paddr */
    824 		m->m_flags |= M_EXT_CLUSTER;
    825 		if (__predict_false(bus_dmamap_load_mbuf(
    826 		    sc->sc_xbusd->xbusd_dmat,
    827 		    req->rxreq_dmamap, m, BUS_DMA_NOWAIT) != 0)) {
     828 			printf("%s: rx mbuf load failed\n", ifp->if_xname);
    829 			m->m_flags &= ~M_EXT_CLUSTER;
    830 			m_freem(m);
    831 			break;
    832 		}
    833 		m->m_flags &= ~M_EXT_CLUSTER;
    834 
    835 		KASSERT(req->rxreq_dmamap->dm_nsegs == 1);
    836 		ma = req->rxreq_dmamap->dm_segs[0].ds_addr;
    837 
    838 		if (xengnt_grant_access(otherend_id, trunc_page(ma),
    839 		    0, &req->rxreq_gntref) != 0) {
    840 			m_freem(m);
    841 			break;
    842 		}
    843 
    844 		req->rxreq_m = m;
    845 
    846 		rxreq = RING_GET_REQUEST(&sc->sc_rx_ring, req_prod + i);
    847 		rxreq->id = req->rxreq_id;
    848 		rxreq->gref = req->rxreq_gntref;
    849 
    850 		SLIST_REMOVE_HEAD(&sc->sc_rxreq_head, rxreq_next);
    851 		sc->sc_free_rxreql--;
    852 	}
    853 
    854 	/* Notify backend if more Rx is possible */
    855 	if (i > 0) {
    856 		sc->sc_rx_ring.req_prod_pvt = req_prod + i;
    857 		RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&sc->sc_rx_ring, notify);
    858 		if (notify)
    859 			hypervisor_notify_via_evtchn(sc->sc_evtchn);
    860 	}
    861 }
    862 
    863 /*
    864  * Reclaim all RX buffers used by the I/O ring between frontend and backend
    865  */
    866 static void
    867 xennet_free_rx_buffer(struct xennet_xenbus_softc *sc, bool revoke)
    868 {
    869 	RING_IDX i;
    870 
    871 	KASSERT(mutex_owned(&sc->sc_rx_lock));
    872 
    873 	DPRINTF(("%s: xennet_free_rx_buffer\n", device_xname(sc->sc_dev)));
    874 	/* get back memory from RX ring */
    875 	for (i = 0; i < NET_RX_RING_SIZE; i++) {
    876 		struct xennet_rxreq *rxreq = &sc->sc_rxreqs[i];
    877 
    878 		if (rxreq->rxreq_gntref != GRANT_INVALID_REF) {
    879 			/*
    880 			 * this req is still granted. Get back the page or
    881 			 * allocate a new one, and remap it.
    882 			 */
    883 			SLIST_INSERT_HEAD(&sc->sc_rxreq_head, rxreq,
    884 			    rxreq_next);
    885 			sc->sc_free_rxreql++;
    886 
    887 			if (revoke)
    888 				xengnt_revoke_access(rxreq->rxreq_gntref);
    889 			rxreq->rxreq_gntref = GRANT_INVALID_REF;
    890 		}
    891 
    892 		if (rxreq->rxreq_m != NULL) {
    893 			m_freem(rxreq->rxreq_m);
    894 			rxreq->rxreq_m = NULL;
    895 		}
    896 	}
    897 	DPRINTF(("%s: xennet_free_rx_buffer done\n", device_xname(sc->sc_dev)));
    898 }
    899 
    900 /*
    901  * Clears a used RX request when its associated mbuf has been processed
    902  */
    903 static void
    904 xennet_rx_mbuf_free(struct mbuf *m, void *buf, size_t size, void *arg)
    905 {
    906 	KASSERT(buf == m->m_ext.ext_buf);
    907 	KASSERT(arg == NULL);
    908 	KASSERT(m != NULL);
    909 	vaddr_t va = (vaddr_t)(buf) & ~((vaddr_t)PAGE_MASK);
    910 	pool_cache_put_paddr(if_xennetrxbuf_cache,
    911 	    (void *)va, m->m_ext.ext_paddr);
    912 	pool_cache_put(mb_cache, m);
     913 }
    914 
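         /*
          * Return an RX request structure to the free list and, once enough
          * of them are free, refill the RX ring with fresh buffers.
          */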
    915 static void
    916 xennet_rx_free_req(struct xennet_xenbus_softc *sc, struct xennet_rxreq *req)
    917 {
    918 	KASSERT(mutex_owned(&sc->sc_rx_lock));
    919 
    920 	/* puts back the RX request in the list of free RX requests */
    921 	SLIST_INSERT_HEAD(&sc->sc_rxreq_head, req, rxreq_next);
    922 	sc->sc_free_rxreql++;
    923 
    924 	/*
     925 	 * The ring needs more requests to be pushed in; allocate some
     926 	 * RX buffers to catch up with the backend's consumption.
    927 	 */
    928 	if (sc->sc_free_rxreql >= (NET_RX_RING_SIZE * 4 / 5) &&
    929 	    __predict_true(sc->sc_backend_status == BEST_CONNECTED)) {
    930 		xennet_alloc_rx_buffer(sc);
    931 	}
    932 }
    933 
    934 /*
     935  * Process the responses associated with the TX mbufs previously sent
     936  * through xennet_start().
    937  * Called at splsoftnet.
    938  */
    939 static void
    940 xennet_tx_complete(struct xennet_xenbus_softc *sc)
    941 {
    942 	struct xennet_txreq *req;
    943 	struct ifnet *ifp = &sc->sc_ethercom.ec_if;
    944 	RING_IDX resp_prod, i;
    945 
    946 	DPRINTFN(XEDB_EVENT, ("xennet_tx_complete prod %d cons %d\n",
    947 	    sc->sc_tx_ring.sring->rsp_prod, sc->sc_tx_ring.rsp_cons));
    948 
    949 	KASSERT(mutex_owned(&sc->sc_tx_lock));
    950 again:
    951 	resp_prod = sc->sc_tx_ring.sring->rsp_prod;
    952 	xen_rmb();
    953 	for (i = sc->sc_tx_ring.rsp_cons; i != resp_prod; i++) {
    954 		req = &sc->sc_txreqs[RING_GET_RESPONSE(&sc->sc_tx_ring, i)->id];
    955 		KASSERT(req->txreq_id ==
    956 		    RING_GET_RESPONSE(&sc->sc_tx_ring, i)->id);
    957 		KASSERT(xengnt_status(req->txreq_gntref) == 0);
    958 		xengnt_revoke_access(req->txreq_gntref);
    959 		req->txreq_gntref = GRANT_INVALID_REF;
    960 
    961 		/* Cleanup/statistics if this is the master req of a chain */
    962 		if (req->txreq_m) {
    963 			if (__predict_false(
    964 			    RING_GET_RESPONSE(&sc->sc_tx_ring, i)->status !=
    965 			    NETIF_RSP_OKAY))
    966 				if_statinc(ifp, if_oerrors);
    967 			else
    968 				if_statinc(ifp, if_opackets);
    969 			bus_dmamap_unload(sc->sc_xbusd->xbusd_dmat,
    970 			    req->txreq_dmamap);
    971 			m_freem(req->txreq_m);
    972 			req->txreq_m = NULL;
    973 		}
    974 
    975 		SLIST_INSERT_HEAD(&sc->sc_txreq_head, req, txreq_next);
    976 		sc->sc_free_txreql++;
    977 	}
    978 	sc->sc_tx_ring.rsp_cons = resp_prod;
    979 	/* set new event and check for race with rsp_cons update */
    980 	xen_wmb();
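         	/*
         	 * Request the next event once roughly half of the currently
         	 * outstanding requests have been consumed, e.g. with
         	 * resp_prod == 10 and req_prod == 18 the next event fires
         	 * around rsp_prod == 15.  This batches TX completion work.
         	 */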
    981 	sc->sc_tx_ring.sring->rsp_event =
    982 	    resp_prod + ((sc->sc_tx_ring.sring->req_prod - resp_prod) >> 1) + 1;
    983 	xen_mb();
    984 	if (resp_prod != sc->sc_tx_ring.sring->rsp_prod)
    985 		goto again;
    986 }
    987 
    988 /*
    989  * Xennet event handler.
     990  * Schedule the TX queue for deferred start (TX responses are collected in
     991  * xennet_start()), then collect all responses of pending RX packets.
    992  * Called at splnet.
    993  */
    994 static int
    995 xennet_handler(void *arg)
    996 {
    997 	struct xennet_xenbus_softc *sc = arg;
    998 	struct ifnet *ifp = &sc->sc_ethercom.ec_if;
    999 	RING_IDX resp_prod, i;
   1000 	struct xennet_rxreq *req;
   1001 	struct mbuf *m, *m0;
   1002 	int rxflags, m0_rxflags;
   1003 	int more_to_do;
   1004 
   1005 	if (sc->sc_backend_status != BEST_CONNECTED)
   1006 		return 1;
   1007 
    1008 	/* Poke the Tx queue in case we ran out of Tx buffers earlier */
   1009 	if_schedule_deferred_start(ifp);
   1010 
   1011 	rnd_add_uint32(&sc->sc_rnd_source, sc->sc_tx_ring.req_prod_pvt);
   1012 
   1013 again:
   1014 	DPRINTFN(XEDB_EVENT, ("xennet_handler prod %d cons %d\n",
   1015 	    sc->sc_rx_ring.sring->rsp_prod, sc->sc_rx_ring.rsp_cons));
   1016 
   1017 	mutex_enter(&sc->sc_rx_lock);
   1018 	resp_prod = sc->sc_rx_ring.sring->rsp_prod;
   1019 	xen_rmb(); /* ensure we see replies up to resp_prod */
   1020 
   1021 	m0 = NULL;
   1022 	for (i = sc->sc_rx_ring.rsp_cons; i != resp_prod; i++) {
   1023 		netif_rx_response_t *rx = RING_GET_RESPONSE(&sc->sc_rx_ring, i);
   1024 		req = &sc->sc_rxreqs[rx->id];
   1025 		KASSERT(req->rxreq_gntref != GRANT_INVALID_REF);
   1026 		KASSERT(req->rxreq_id == rx->id);
   1027 
   1028 		xengnt_revoke_access(req->rxreq_gntref);
   1029 		req->rxreq_gntref = GRANT_INVALID_REF;
   1030 
   1031 		m = req->rxreq_m;
   1032 		req->rxreq_m = NULL;
   1033 
   1034 		m->m_len = m->m_pkthdr.len = rx->status;
   1035 		bus_dmamap_sync(sc->sc_xbusd->xbusd_dmat, req->rxreq_dmamap, 0,
   1036 		     m->m_pkthdr.len, BUS_DMASYNC_PREREAD);
   1037 
   1038 		if (m0 == NULL) {
   1039 			MCLAIM(m, &sc->sc_ethercom.ec_rx_mowner);
   1040 			m_set_rcvif(m, ifp);
   1041 		}
   1042 
   1043 		rxflags = rx->flags;
   1044 
   1045 		if (m0 || rxflags & NETRXF_more_data) {
   1046 			/*
    1047 			 * On Rx, every fragment (even the first one) contains
    1048 			 * just the length of the data in that fragment.
   1049 			 */
   1050 			if (m0 == NULL) {
   1051 				m0 = m;
   1052 				m0_rxflags = rxflags;
   1053 			} else {
   1054 				m_cat(m0, m);
   1055 				m0->m_pkthdr.len += m->m_len;
   1056 			}
   1057 
   1058 			if (rxflags & NETRXF_more_data) {
   1059 				/* Still more fragments to receive */
   1060 				xennet_rx_free_req(sc, req);
   1061 				continue;
   1062 			}
   1063 
   1064 			sc->sc_cnt_rx_frag.ev_count++;
   1065 			m = m0;
   1066 			m0 = NULL;
   1067 			rxflags = m0_rxflags;
   1068 		}
   1069 
   1070 		if (rxflags & NETRXF_csum_blank) {
   1071 			xennet_checksum_fill(ifp, m, &sc->sc_cnt_rx_cksum_blank,
   1072 			    &sc->sc_cnt_rx_cksum_undefer);
   1073 		} else if (rxflags & NETRXF_data_validated)
   1074 			m->m_pkthdr.csum_flags = XN_M_CSUM_SUPPORTED;
   1075 
    1076 		/* We're done with req */
   1077 		xennet_rx_free_req(sc, req);
   1078 
   1079 		/* Pass the packet up. */
   1080 		if_percpuq_enqueue(ifp->if_percpuq, m);
   1081 	}
   1082 	/* If the queued Rx fragments did not finish the packet, drop it */
   1083 	if (m0) {
   1084 		if_statinc(ifp, if_iqdrops);
   1085 		m_freem(m0);
   1086 	}
   1087 	sc->sc_rx_ring.rsp_cons = i;
   1088 	xen_wmb();
   1089 	RING_FINAL_CHECK_FOR_RESPONSES(&sc->sc_rx_ring, more_to_do);
   1090 	mutex_exit(&sc->sc_rx_lock);
   1091 
   1092 	if (more_to_do) {
   1093 		DPRINTF(("%s: %s more_to_do\n", ifp->if_xname, __func__));
   1094 		goto again;
   1095 	}
   1096 
   1097 	return 1;
   1098 }
   1099 
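         /*
          * Queue one packet as a chain of TX requests, one per dmamap segment.
          * On grant failure the partially built chain is unwound and the
          * request structures are returned to the free list.  Called with
          * sc_tx_lock held.
          */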
   1100 static bool
   1101 xennet_submit_tx_request(struct xennet_xenbus_softc *sc, struct mbuf *m,
   1102     struct xennet_txreq *req0, int *req_prod)
   1103 {
   1104 	struct xennet_txreq *req = req0;
   1105 	netif_tx_request_t *txreq;
   1106 	int i, prod = *req_prod;
   1107 	const bool multiseg = (req0->txreq_dmamap->dm_nsegs > 1);
   1108 	const int lastseg = req0->txreq_dmamap->dm_nsegs - 1;
   1109 	bus_dma_segment_t *ds;
   1110 	SLIST_HEAD(, xennet_txreq) txchain;
   1111 
   1112 	KASSERT(mutex_owned(&sc->sc_tx_lock));
   1113 	KASSERT(req0->txreq_dmamap->dm_nsegs > 0);
   1114 
   1115 	bus_dmamap_sync(sc->sc_xbusd->xbusd_dmat, req->txreq_dmamap, 0,
   1116 	     m->m_pkthdr.len, BUS_DMASYNC_POSTWRITE);
   1117 	MCLAIM(m, &sc->sc_ethercom.ec_tx_mowner);
   1118 	SLIST_INIT(&txchain);
   1119 
   1120 	for (i = 0; i < req0->txreq_dmamap->dm_nsegs; i++) {
   1121 		KASSERT(req != NULL);
   1122 
   1123 		ds = &req0->txreq_dmamap->dm_segs[i];
   1124 
   1125 		if (__predict_false(xengnt_grant_access(
   1126 		    sc->sc_xbusd->xbusd_otherend_id,
   1127 		    trunc_page(ds->ds_addr),
   1128 		    GNTMAP_readonly, &req->txreq_gntref) != 0)) {
   1129 			goto grant_fail;
   1130 		}
   1131 
   1132 		KASSERT(SLIST_FIRST(&sc->sc_txreq_head) == req);
   1133 		SLIST_REMOVE_HEAD(&sc->sc_txreq_head, txreq_next);
   1134 		SLIST_INSERT_HEAD(&txchain, req, txreq_next);
   1135 		sc->sc_free_txreql--;
   1136 		req->txreq_m = (req == req0) ? m : NULL;
   1137 
   1138 		txreq = RING_GET_REQUEST(&sc->sc_tx_ring, prod + i);
   1139 		txreq->id = req->txreq_id;
   1140 		txreq->gref = req->txreq_gntref;
   1141 		txreq->offset = ds->ds_addr & PAGE_MASK;
    1142 		/* For Tx, the first fragment's size is always set to the total size */
   1143 		txreq->size = (i == 0) ? m->m_pkthdr.len : ds->ds_len;
   1144 		txreq->flags = 0;
   1145 		if (i == 0) {
   1146 			if (m->m_pkthdr.csum_flags & XN_M_CSUM_SUPPORTED) {
   1147 				txreq->flags |= NETTXF_csum_blank;
   1148 			} else {
   1149 #if 0
   1150 				/*
   1151 				 * XXX Checksum optimization disabled
   1152 				 * to avoid port-xen/57743.
   1153 				 */
   1154 				txreq->flags |= NETTXF_data_validated;
   1155 #endif
   1156 			}
   1157 		}
   1158 		if (multiseg && i < lastseg)
   1159 			txreq->flags |= NETTXF_more_data;
   1160 
   1161 		req = SLIST_FIRST(&sc->sc_txreq_head);
   1162 	}
   1163 
   1164 	if (i > 1)
   1165 		sc->sc_cnt_tx_frag.ev_count++;
   1166 
   1167 	/* All done */
   1168 	*req_prod += i;
   1169 	return true;
   1170 
   1171 grant_fail:
   1172 	printf("%s: grant_access failed\n", device_xname(sc->sc_dev));
   1173 	while (!SLIST_EMPTY(&txchain)) {
   1174 		req = SLIST_FIRST(&txchain);
   1175 		SLIST_REMOVE_HEAD(&txchain, txreq_next);
   1176 		xengnt_revoke_access(req->txreq_gntref);
   1177 		req->txreq_gntref = GRANT_INVALID_REF;
   1178 		SLIST_INSERT_HEAD(&sc->sc_txreq_head, req, txreq_next);
   1179 		sc->sc_free_txreql++;
   1180 	}
   1181 	req0->txreq_m = NULL;
   1182 	return false;
   1183 }
   1184 
   1185 /*
   1186  * The output routine of a xennet interface. Prepares mbufs for TX,
    1187  * and notifies the backend when finished.
   1188  * Called at splsoftnet.
   1189  */
   1190 void
   1191 xennet_start(struct ifnet *ifp)
   1192 {
   1193 	struct xennet_xenbus_softc *sc = ifp->if_softc;
   1194 	struct mbuf *m;
   1195 	RING_IDX req_prod;
   1196 	struct xennet_txreq *req;
   1197 	int notify;
   1198 
   1199 	mutex_enter(&sc->sc_tx_lock);
   1200 
   1201 	rnd_add_uint32(&sc->sc_rnd_source, sc->sc_tx_ring.req_prod_pvt);
   1202 
   1203 	xennet_tx_complete(sc);
   1204 
   1205 	req_prod = sc->sc_tx_ring.req_prod_pvt;
   1206 	while (/*CONSTCOND*/1) {
   1207 		req = SLIST_FIRST(&sc->sc_txreq_head);
   1208 		if (__predict_false(req == NULL)) {
   1209 			if (!IFQ_IS_EMPTY(&ifp->if_snd))
   1210 				sc->sc_cnt_tx_queue_full.ev_count++;
   1211 			break;
   1212 		}
   1213 		IFQ_DEQUEUE(&ifp->if_snd, m);
   1214 		if (m == NULL)
   1215 			break;
   1216 
   1217 		/*
    1218 		 * For short packets it is always much faster to pass a
    1219 		 * single defragmented packet, even with feature-sg.
   1220 		 * Try to defragment first if the result is likely to fit
   1221 		 * into a single mbuf.
   1222 		 */
   1223 		if (m->m_pkthdr.len < MCLBYTES && m->m_next)
   1224 			(void)m_defrag(m, M_DONTWAIT);
   1225 
   1226 		/* Try to load the mbuf as-is, if that fails defrag */
   1227 		if (__predict_false(bus_dmamap_load_mbuf(
   1228 		    sc->sc_xbusd->xbusd_dmat,
   1229 		    req->txreq_dmamap, m, BUS_DMA_NOWAIT) != 0)) {
   1230 			sc->sc_cnt_tx_defrag.ev_count++;
   1231 			if (__predict_false(m_defrag(m, M_DONTWAIT) == NULL)) {
   1232 				DPRINTF(("%s: defrag failed\n",
   1233 				    device_xname(sc->sc_dev)));
   1234 				m_freem(m);
   1235 				break;
   1236 			}
   1237 
   1238 			if (__predict_false(bus_dmamap_load_mbuf(
   1239 			    sc->sc_xbusd->xbusd_dmat,
   1240 			    req->txreq_dmamap, m, BUS_DMA_NOWAIT) != 0)) {
   1241 				printf("%s: cannot load new mbuf len %d\n",
   1242 				    device_xname(sc->sc_dev),
   1243 				    m->m_pkthdr.len);
   1244 				m_freem(m);
   1245 				break;
   1246 			}
   1247 		}
   1248 
   1249 		if (req->txreq_dmamap->dm_nsegs > sc->sc_free_txreql) {
    1250 			/* Not enough free slots right now, drop the packet */
   1251 			sc->sc_cnt_tx_queue_full.ev_count++;
   1252 			sc->sc_cnt_tx_drop.ev_count++;
   1253 			bus_dmamap_unload(sc->sc_xbusd->xbusd_dmat,
   1254 			    req->txreq_dmamap);
   1255 			m_freem(m);
   1256 			break;
   1257 		}
   1258 
   1259 		DPRINTFN(XEDB_MBUF, ("xennet_start id %d, "
   1260 		    "mbuf %p, buf %p, size %d\n",
   1261 		    req->txreq_id, m, mtod(m, void *), m->m_pkthdr.len));
   1262 
   1263 #ifdef XENNET_DEBUG_DUMP
   1264 		xennet_hex_dump(mtod(m, u_char *), m->m_pkthdr.len, "s",
   1265 		    req->txreq_id);
   1266 #endif
   1267 
   1268 		if (!xennet_submit_tx_request(sc, m, req, &req_prod)) {
    1269 			/* Grant failed, drop the packet */
   1270 			sc->sc_cnt_tx_drop.ev_count++;
   1271 			bus_dmamap_unload(sc->sc_xbusd->xbusd_dmat,
   1272 			    req->txreq_dmamap);
   1273 			m_freem(m);
   1274 			break;
   1275 		}
   1276 
   1277 		/*
   1278 		 * Pass packet to bpf if there is a listener.
   1279 		 */
   1280 		bpf_mtap(ifp, m, BPF_D_OUT);
   1281 	}
   1282 
   1283 	sc->sc_tx_ring.req_prod_pvt = req_prod;
   1284 	RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&sc->sc_tx_ring, notify);
   1285 	if (notify)
   1286 		hypervisor_notify_via_evtchn(sc->sc_evtchn);
   1287 
   1288 	mutex_exit(&sc->sc_tx_lock);
   1289 
   1290 	DPRINTFN(XEDB_FOLLOW, ("%s: xennet_start() done\n",
   1291 	    device_xname(sc->sc_dev)));
   1292 }
   1293 
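         /*
          * if_ioctl handler: everything is delegated to ether_ioctl();
          * ENETRESET is mapped to success, as there is no multicast filter
          * state to reprogram on this virtual interface.
          */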
   1294 int
   1295 xennet_ioctl(struct ifnet *ifp, u_long cmd, void *data)
   1296 {
   1297 #ifdef XENNET_DEBUG
   1298 	struct xennet_xenbus_softc *sc = ifp->if_softc;
   1299 #endif
   1300 	int error = 0;
   1301 
   1302 #ifdef NET_MPSAFE
   1303 #ifdef notyet
   1304 	/* XXX IFNET_LOCK() is not taken in some cases e.g. multicast ioctls */
   1305 	KASSERT(IFNET_LOCKED(ifp));
   1306 #endif
   1307 #endif
   1308 	int s = splnet();
   1309 
   1310 	DPRINTFN(XEDB_FOLLOW, ("%s: xennet_ioctl()\n",
   1311 	    device_xname(sc->sc_dev)));
   1312 	error = ether_ioctl(ifp, cmd, data);
   1313 	if (error == ENETRESET)
   1314 		error = 0;
   1315 
   1316 	DPRINTFN(XEDB_FOLLOW, ("%s: xennet_ioctl() returning %d\n",
   1317 	    device_xname(sc->sc_dev), error));
   1318 
   1319 	splx(s);
   1320 
   1321 	return error;
   1322 }
   1323 
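         /*
          * if_init handler: re-arm the RX response event, unmask the event
          * channel and kick the backend so pending responses are processed.
          */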
   1324 int
   1325 xennet_init(struct ifnet *ifp)
   1326 {
   1327 	struct xennet_xenbus_softc *sc = ifp->if_softc;
   1328 
   1329 	KASSERT(IFNET_LOCKED(ifp));
   1330 
   1331 	DPRINTFN(XEDB_FOLLOW, ("%s: xennet_init()\n",
   1332 	    device_xname(sc->sc_dev)));
   1333 
   1334 	if ((ifp->if_flags & IFF_RUNNING) == 0) {
   1335 		mutex_enter(&sc->sc_rx_lock);
   1336 		sc->sc_rx_ring.sring->rsp_event =
   1337 		    sc->sc_rx_ring.rsp_cons + 1;
   1338 		mutex_exit(&sc->sc_rx_lock);
   1339 		hypervisor_unmask_event(sc->sc_evtchn);
   1340 		hypervisor_notify_via_evtchn(sc->sc_evtchn);
   1341 	}
   1342 	ifp->if_flags |= IFF_RUNNING;
   1343 
   1344 	return 0;
   1345 }
   1346 
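         /*
          * if_stop handler: mark the interface as not running and mask the
          * event channel.
          */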
   1347 void
   1348 xennet_stop(struct ifnet *ifp, int disable)
   1349 {
   1350 	struct xennet_xenbus_softc *sc = ifp->if_softc;
   1351 
   1352 	KASSERT(IFNET_LOCKED(ifp));
   1353 
   1354 	ifp->if_flags &= ~IFF_RUNNING;
   1355 	hypervisor_mask_event(sc->sc_evtchn);
   1356 }
   1357 
   1358 /*
   1359  * Set up sysctl(3) MIB, hw.xennet.*.
   1360  */
   1361 static void
   1362 xennet_sysctl_init(struct xennet_xenbus_softc *sc)
   1363 {
   1364 	int rc, xennet_root_num;
   1365 	const struct sysctlnode *node;
   1366 
   1367 	if ((rc = sysctl_createv(&xennet_log, 0, NULL, &node,
   1368 	    0, CTLTYPE_NODE, "xennet",
   1369 	    SYSCTL_DESCR("XENNET interface controls"),
   1370 	    NULL, 0, NULL, 0, CTL_HW, CTL_CREATE, CTL_EOL)) != 0) {
   1371 		goto out;
   1372 	}
   1373 
   1374 	xennet_root_num = node->sysctl_num;
   1375 
   1376 	/* xnfrx_lowat setting */
   1377 	if ((rc = sysctl_createv(&xennet_log, 0, NULL, &node,
   1378 	    CTLFLAG_READWRITE,
   1379 	    CTLTYPE_INT, "xnfrx_lowat",
   1380 	    SYSCTL_DESCR("xnfrx low water threshold"),
   1381 	    xennet_sysctl_verify, 0,
   1382 	    &if_xnfrx_lowat,
   1383 	    0, CTL_HW, xennet_root_num, CTL_CREATE,
   1384 	    CTL_EOL)) != 0) {
   1385 		goto out;
   1386 	}
   1387 
   1388 	xennet_xnfrx_nodenum = node->sysctl_num;
   1389 
   1390 	return;
   1391 
   1392 out:
   1393 	aprint_error("%s: sysctl_createv failed (rc = %d)\n", __func__, rc);
   1394 }
   1395 
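         /*
          * sysctl(3) helper: validate a new hw.xennet.xnfrx_lowat value and
          * propagate it to the pool cache low-water mark.
          */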
   1396 static int
   1397 xennet_sysctl_verify(SYSCTLFN_ARGS)
   1398 {
   1399 	int error, t;
   1400 	struct sysctlnode node;
   1401 
   1402 	node = *rnode;
   1403 	t = *(int*)rnode->sysctl_data;
   1404 	node.sysctl_data = &t;
   1405 	error = sysctl_lookup(SYSCTLFN_CALL(&node));
   1406 	if (error || newp == NULL)
   1407 		return error;
   1408 
   1409 #if 0
   1410 	DPRINTF2(("%s: t = %d, nodenum = %d, rnodenum = %d\n", __func__, t,
   1411 	    node.sysctl_num, rnode->sysctl_num));
   1412 #endif
   1413 
   1414 	if (node.sysctl_num == xennet_xnfrx_nodenum) {
   1415 		if (t < 0 || t >= IF_XNFRX_LOWAT_MAX)
   1416 			return EINVAL;
   1417 		if_xnfrx_lowat = t;
   1418 		pool_cache_setlowat(if_xennetrxbuf_cache, if_xnfrx_lowat);
   1419 	} else
   1420 		return EINVAL;
   1421 
   1422 	*(int*)rnode->sysctl_data = t;
   1423 
   1424 	return 0;
   1425 }
   1426 
   1427 #if defined(NFS_BOOT_BOOTSTATIC)
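         /*
          * NFS diskless boot hook: extract the network configuration (local,
          * gateway and server addresses, netmask, root path) from the Xen
          * kernel command line and report which parameters were found.
          */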
   1428 int
   1429 xennet_bootstatic_callback(struct nfs_diskless *nd)
   1430 {
   1431 #if 0
   1432 	struct ifnet *ifp = nd->nd_ifp;
   1433 	struct xennet_xenbus_softc *sc =
   1434 	    (struct xennet_xenbus_softc *)ifp->if_softc;
   1435 #endif
   1436 	int flags = 0;
   1437 	union xen_cmdline_parseinfo xcp;
   1438 	struct sockaddr_in *sin;
   1439 
   1440 	memset(&xcp, 0, sizeof(xcp.xcp_netinfo));
   1441 	xcp.xcp_netinfo.xi_ifno = /* XXX sc->sc_ifno */ 0;
   1442 	xcp.xcp_netinfo.xi_root = nd->nd_root.ndm_host;
   1443 	xen_parse_cmdline(XEN_PARSE_NETINFO, &xcp);
   1444 
   1445 	if (xcp.xcp_netinfo.xi_root[0] != '\0') {
   1446 		flags |= NFS_BOOT_HAS_SERVER;
   1447 		if (strchr(xcp.xcp_netinfo.xi_root, ':') != NULL)
   1448 			flags |= NFS_BOOT_HAS_ROOTPATH;
   1449 	}
   1450 
   1451 	nd->nd_myip.s_addr = ntohl(xcp.xcp_netinfo.xi_ip[0]);
   1452 	nd->nd_gwip.s_addr = ntohl(xcp.xcp_netinfo.xi_ip[2]);
   1453 	nd->nd_mask.s_addr = ntohl(xcp.xcp_netinfo.xi_ip[3]);
   1454 
   1455 	sin = (struct sockaddr_in *) &nd->nd_root.ndm_saddr;
   1456 	memset((void *)sin, 0, sizeof(*sin));
   1457 	sin->sin_len = sizeof(*sin);
   1458 	sin->sin_family = AF_INET;
   1459 	sin->sin_addr.s_addr = ntohl(xcp.xcp_netinfo.xi_ip[1]);
   1460 
   1461 	if (nd->nd_myip.s_addr)
   1462 		flags |= NFS_BOOT_HAS_MYIP;
   1463 	if (nd->nd_gwip.s_addr)
   1464 		flags |= NFS_BOOT_HAS_GWIP;
   1465 	if (nd->nd_mask.s_addr)
   1466 		flags |= NFS_BOOT_HAS_MASK;
   1467 	if (sin->sin_addr.s_addr)
   1468 		flags |= NFS_BOOT_HAS_SERVADDR;
   1469 
   1470 	return flags;
   1471 }
   1472 #endif /* defined(NFS_BOOT_BOOTSTATIC) */
   1473 
   1474 #ifdef XENNET_DEBUG_DUMP
   1475 #define XCHR(x) hexdigits[(x) & 0xf]
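         /*
          * Debug helper: dump a packet in the usual hexdump layout
          * (offset, hex bytes, ASCII column).
          */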
   1476 static void
   1477 xennet_hex_dump(const unsigned char *pkt, size_t len, const char *type, int id)
   1478 {
   1479 	size_t i, j;
   1480 
    1481 	printf("pkt %p len %zu/%zx type %s id %d\n", pkt, len, len, type, id);
   1482 	printf("00000000  ");
   1483 	for(i=0; i<len; i++) {
   1484 		printf("%c%c ", XCHR(pkt[i]>>4), XCHR(pkt[i]));
   1485 		if ((i+1) % 16 == 8)
   1486 			printf(" ");
   1487 		if ((i+1) % 16 == 0) {
   1488 			printf(" %c", '|');
   1489 			for(j=0; j<16; j++)
   1490 				printf("%c", pkt[i-15+j]>=32 &&
   1491 				    pkt[i-15+j]<127?pkt[i-15+j]:'.');
   1492 			printf("%c\n%c%c%c%c%c%c%c%c  ", '|',
   1493 			    XCHR((i+1)>>28), XCHR((i+1)>>24),
   1494 			    XCHR((i+1)>>20), XCHR((i+1)>>16),
   1495 			    XCHR((i+1)>>12), XCHR((i+1)>>8),
   1496 			    XCHR((i+1)>>4), XCHR(i+1));
   1497 		}
   1498 	}
   1499 	printf("\n");
   1500 }
   1501 #undef XCHR
   1502 #endif
   1503