Home | History | Annotate | Line # | Download | only in libshmif
if_shmem.c revision 1.87
      1 /*	$NetBSD: if_shmem.c,v 1.87 2024/08/20 16:49:10 riastradh Exp $	*/
      2 
      3 /*
      4  * Copyright (c) 2009, 2010 Antti Kantee.  All Rights Reserved.
      5  *
      6  * Development of this software was supported by The Nokia Foundation.
      7  *
      8  * Redistribution and use in source and binary forms, with or without
      9  * modification, are permitted provided that the following conditions
     10  * are met:
     11  * 1. Redistributions of source code must retain the above copyright
     12  *    notice, this list of conditions and the following disclaimer.
     13  * 2. Redistributions in binary form must reproduce the above copyright
     14  *    notice, this list of conditions and the following disclaimer in the
     15  *    documentation and/or other materials provided with the distribution.
     16  *
     17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS
     18  * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
     19  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
     20  * DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
     21  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
     22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
     23  * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
     24  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
     25  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
     26  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
     27  * SUCH DAMAGE.
     28  */
     29 
     30 #include <sys/cdefs.h>
     31 __KERNEL_RCSID(0, "$NetBSD: if_shmem.c,v 1.87 2024/08/20 16:49:10 riastradh Exp $");
     32 
     33 #include <sys/param.h>
     34 #include <sys/atomic.h>
     35 #include <sys/fcntl.h>
     36 #include <sys/kmem.h>
     37 #include <sys/kthread.h>
     38 #include <sys/lock.h>
     39 #include <sys/vmem.h>
     40 #include <sys/cprng.h>
     41 
     42 #include <net/bpf.h>
     43 #include <net/if.h>
     44 #include <net/if_dl.h>
     45 #include <net/if_ether.h>
     46 #include <net/if_media.h>
     47 #include <net/ether_sw_offload.h>
     48 
     49 #include <netinet/in.h>
     50 #include <netinet/in_var.h>
     51 
     52 #include <rump-sys/kern.h>
     53 #include <rump-sys/net.h>
     54 
     55 #include <rump/rump.h>
     56 #include <rump/rumpuser.h>
     57 
     58 #include "shmif_user.h"
     59 
     60 static int shmif_clone(struct if_clone *, int);
     61 static int shmif_unclone(struct ifnet *);
     62 
     63 static int shmif_mediachange(struct ifnet *);
     64 static void shmif_mediastatus(struct ifnet *, struct ifmediareq *);
     65 
     66 struct if_clone shmif_cloner =
     67     IF_CLONE_INITIALIZER("shmif", shmif_clone, shmif_unclone);
     68 
     69 /*
     70  * Do r/w prefault for backend pages when attaching the interface.
     71  * At least logically thinking improves performance (although no
     72  * mlocking is done, so they might go away).
     73  */
     74 #define PREFAULT_RW
     75 
     76 /*
     77  * A virtual ethernet interface which uses shared memory from a
     78  * memory mapped file as the bus.
     79  */
     80 
     81 static int	shmif_init(struct ifnet *);
     82 static int	shmif_ioctl(struct ifnet *, u_long, void *);
     83 static void	shmif_start(struct ifnet *);
     84 static void	shmif_snd(struct ifnet *, struct mbuf *);
     85 static void	shmif_stop(struct ifnet *, int);
     86 
     87 #include "shmifvar.h"
     88 
/*
 * Per-interface software context.  One instance per shmif unit;
 * shared between the ioctl/start paths and the receiver thread.
 */
struct shmif_sc {
	struct ethercom sc_ec;		/* ethernet common part */
	struct ifmedia sc_im;		/* media: AUTO => link up, NONE => down */
	struct shmif_mem *sc_busmem;	/* mmapped shared-memory bus */
	int sc_memfd;			/* bus file descriptor, -1 = no backend */
	int sc_kq;			/* descriptor used to wait for bus activity */
	int sc_unit;			/* interface unit number */

	char *sc_backfile;		/* path of backing bus file (kmem_alloc'd) */
	size_t sc_backfilelen;		/* allocated length, including NUL */

	uint64_t sc_devgen;		/* bus generation we are currently reading */
	uint32_t sc_nextpacket;		/* bus offset of next packet to read */

	kmutex_t sc_mtx;
	kcondvar_t sc_cv;		/* signalled on init and on sc_dying */

	struct lwp *sc_rcvl;		/* receiver kthread */
	bool sc_dying;			/* tells the receiver thread to exit */

	uint64_t sc_uid;		/* random id used to skip our own packets */
};
    111 
    112 static void shmif_rcv(void *);
    113 
    114 #define LOCK_UNLOCKED	0
    115 #define LOCK_LOCKED	1
    116 #define LOCK_COOLDOWN	1001
    117 
    118 vmem_t *shmif_units;
    119 
    120 static void
    121 dowakeup(struct shmif_sc *sc)
    122 {
    123 	struct rumpuser_iovec iov;
    124 	uint32_t ver = SHMIF_VERSION;
    125 	size_t n;
    126 
    127 	iov.iov_base = &ver;
    128 	iov.iov_len = sizeof(ver);
    129 	rumpuser_iovwrite(sc->sc_memfd, &iov, 1, IFMEM_WAKEUP, &n);
    130 }
    131 
    132 /*
    133  * This locking needs work and will misbehave severely if:
    134  * 1) the backing memory has to be paged in
    135  * 2) some lockholder exits while holding the lock
    136  */
static void
shmif_lockbus(struct shmif_mem *busmem)
{
	int i = 0;

	/*
	 * Spin on the lock word in the shared bus memory.  The lock is
	 * shared with other processes mapping the same file, so a kernel
	 * mutex cannot be used here.
	 */
	while (__predict_false(atomic_cas_32(&busmem->shm_lock,
	    LOCK_UNLOCKED, LOCK_LOCKED) == LOCK_LOCKED)) {
		/* back off after LOCK_COOLDOWN failed attempts */
		if (__predict_false(++i > LOCK_COOLDOWN)) {
			/* wait 1ms */
			rumpuser_clock_sleep(RUMPUSER_CLOCK_RELWALL,
			    0, 1000*1000);
			i = 0;
		}
		continue;
	}
	/* pairs with membar_release() in shmif_unlockbus() */
	membar_acquire();
}
    154 
static void
shmif_unlockbus(struct shmif_mem *busmem)
{
	unsigned int old __diagused;

	/* publish our bus modifications before releasing the lock word */
	membar_release();
	old = atomic_swap_32(&busmem->shm_lock, LOCK_UNLOCKED);
	/* we must have been the holder */
	KASSERT(old == LOCK_LOCKED);
}
    164 
    165 static int
    166 allocif(int unit, struct shmif_sc **scp)
    167 {
    168 	uint8_t enaddr[ETHER_ADDR_LEN] = { 0xb2, 0xa0, 0x00, 0x00, 0x00, 0x00 };
    169 	struct shmif_sc *sc;
    170 	struct ifnet *ifp;
    171 	uint64_t randnum;
    172 	int error = 0;
    173 
    174 	randnum = cprng_strong64();
    175 	memcpy(&enaddr[2], &randnum, 4);
    176 
    177 	sc = kmem_zalloc(sizeof(*sc), KM_SLEEP);
    178 	sc->sc_memfd = -1;
    179 	sc->sc_unit = unit;
    180 	sc->sc_uid = randnum;
    181 
    182 	ifp = &sc->sc_ec.ec_if;
    183 
    184 	ifmedia_init(&sc->sc_im, 0, shmif_mediachange, shmif_mediastatus);
    185 	ifmedia_add(&sc->sc_im, IFM_ETHER|IFM_AUTO, 0, NULL);
    186 	ifmedia_add(&sc->sc_im, IFM_ETHER|IFM_NONE, 0, NULL);
    187 	ifmedia_set(&sc->sc_im, IFM_ETHER|IFM_AUTO);
    188 
    189 	snprintf(ifp->if_xname, sizeof(ifp->if_xname), "shmif%d", unit);
    190 	ifp->if_softc = sc;
    191 	ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
    192 	ifp->if_init = shmif_init;
    193 	ifp->if_ioctl = shmif_ioctl;
    194 	ifp->if_start = shmif_start;
    195 	ifp->if_stop = shmif_stop;
    196 	ifp->if_mtu = ETHERMTU;
    197 	ifp->if_dlt = DLT_EN10MB;
    198 	ifp->if_capabilities = IFCAP_TSOv4 | IFCAP_TSOv6 |
    199 	    IFCAP_CSUM_IPv4_Rx	| IFCAP_CSUM_IPv4_Tx |
    200 	    IFCAP_CSUM_TCPv4_Rx	| IFCAP_CSUM_TCPv4_Tx |
    201 	    IFCAP_CSUM_UDPv4_Rx	| IFCAP_CSUM_UDPv4_Tx |
    202 	    IFCAP_CSUM_TCPv6_Rx	| IFCAP_CSUM_TCPv6_Tx |
    203 	    IFCAP_CSUM_UDPv6_Rx	| IFCAP_CSUM_UDPv6_Tx;
    204 	IFQ_SET_READY(&ifp->if_snd);
    205 
    206 	mutex_init(&sc->sc_mtx, MUTEX_DEFAULT, IPL_NONE);
    207 	cv_init(&sc->sc_cv, "shmifcv");
    208 
    209 	if_initialize(ifp);
    210 #if 1
    211 	char buf[256];
    212 
    213 	if (rumpuser_getparam("RUMP_SHMIF_CAPENABLE", buf, sizeof(buf)) == 0) {
    214 		uint64_t capen = strtoul(buf, NULL, 0);
    215 
    216 		ifp->if_capenable = capen & ifp->if_capabilities;
    217 	}
    218 #endif
    219 
    220 	if_deferred_start_init(ifp, NULL);
    221 	ether_ifattach(ifp, enaddr);
    222 	if_register(ifp);
    223 
    224 	aprint_verbose("shmif%d: Ethernet address %s\n",
    225 	    unit, ether_sprintf(enaddr));
    226 
    227 	if (scp)
    228 		*scp = sc;
    229 
    230 	if (rump_threads) {
    231 		error = kthread_create(PRI_NONE,
    232 		    KTHREAD_MPSAFE | KTHREAD_MUSTJOIN, NULL,
    233 		    shmif_rcv, ifp, &sc->sc_rcvl, "shmif");
    234 	} else {
    235 		printf("WARNING: threads not enabled, shmif NOT working\n");
    236 	}
    237 
    238 	if (error) {
    239 		shmif_unclone(ifp);
    240 	}
    241 
    242 	return 0;
    243 }
    244 
/*
 * Map the backing file and attach to the shmif bus inside it,
 * initializing the bus header if we are the first user.  On success
 * sc_busmem, sc_kq and sc_memfd are valid; on error nothing is left
 * mapped.  Returns 0 or an errno.
 */
static int
initbackend(struct shmif_sc *sc, int memfd)
{
	volatile uint8_t v;
	volatile uint8_t *p;
	void *mem;
	int error;

	error = rumpcomp_shmif_mmap(memfd, BUSMEM_SIZE, &mem);
	if (error)
		return error;
	sc->sc_busmem = mem;

	/* a non-empty bus must carry our magic, else it is not a shmif bus */
	if (sc->sc_busmem->shm_magic
	    && sc->sc_busmem->shm_magic != SHMIF_MAGIC) {
		printf("bus is not magical");
		rumpuser_unmap(sc->sc_busmem, BUSMEM_SIZE);
		return ENOEXEC;
	}

	/*
	 * Prefault in pages to minimize runtime penalty with buslock.
	 * Use 512 instead of PAGE_SIZE to make sure we catch cases where
	 * rump kernel PAGE_SIZE > host page size.
	 */
	for (p = (uint8_t *)sc->sc_busmem;
	    p < (uint8_t *)sc->sc_busmem + BUSMEM_SIZE;
	    p += 512)
		v = *p;

	shmif_lockbus(sc->sc_busmem);
	/* we're first?  initialize bus */
	if (sc->sc_busmem->shm_magic == 0) {
		sc->sc_busmem->shm_magic = SHMIF_MAGIC;
		sc->sc_busmem->shm_first = BUSMEM_DATASIZE;
	}

	/* start reading from the current end of the bus */
	sc->sc_nextpacket = sc->sc_busmem->shm_last;
	sc->sc_devgen = sc->sc_busmem->shm_gen;

#ifdef PREFAULT_RW
	/* dirty the pages as well, so writes under the buslock are cheap */
	for (p = (uint8_t *)sc->sc_busmem;
	    p < (uint8_t *)sc->sc_busmem + BUSMEM_SIZE;
	    p += PAGE_SIZE) {
		v = *p;
		*p = v;
	}
#endif
	shmif_unlockbus(sc->sc_busmem);

	/* set up a descriptor we can block on waiting for bus activity */
	sc->sc_kq = -1;
	error = rumpcomp_shmif_watchsetup(&sc->sc_kq, memfd);
	if (error) {
		rumpuser_unmap(sc->sc_busmem, BUSMEM_SIZE);
		return error;
	}

	sc->sc_memfd = memfd;

	return error;
}
    306 
    307 static void
    308 finibackend(struct shmif_sc *sc)
    309 {
    310 
    311 	if (sc->sc_backfile == NULL)
    312 		return;
    313 
    314 	if (sc->sc_backfile) {
    315 		kmem_free(sc->sc_backfile, sc->sc_backfilelen);
    316 		sc->sc_backfile = NULL;
    317 		sc->sc_backfilelen = 0;
    318 	}
    319 
    320 	rumpuser_unmap(sc->sc_busmem, BUSMEM_SIZE);
    321 	rumpuser_close(sc->sc_memfd);
    322 	rumpuser_close(sc->sc_kq);
    323 
    324 	sc->sc_memfd = -1;
    325 }
    326 
/*
 * Programmatic interface for creating a shmif interface.  If path is
 * non-NULL, the interface is attached to the bus file at path (which is
 * created if necessary).  The allocated unit number is returned via
 * ifnum if non-NULL.  Returns 0 or an errno.
 */
int
rump_shmif_create(const char *path, int *ifnum)
{
	struct shmif_sc *sc;
	vmem_addr_t t;
	int unit, error;
	int memfd = -1; /* XXXgcc */

	if (path) {
		error = rumpuser_open(path,
		    RUMPUSER_OPEN_RDWR | RUMPUSER_OPEN_CREATE, &memfd);
		if (error)
			return error;
	}

	/* grab any free unit; values are offset by +1 (see shmif_clone) */
	error = vmem_xalloc(shmif_units, 1, 0, 0, 0,
	    VMEM_ADDR_MIN, VMEM_ADDR_MAX, VM_INSTANTFIT | VM_SLEEP, &t);

	if (error != 0) {
		if (path)
			rumpuser_close(memfd);
		return error;
	}

	unit = t - 1;

	if ((error = allocif(unit, &sc)) != 0) {
		if (path)
			rumpuser_close(memfd);
		return error;
	}

	if (!path)
		goto out;

	error = initbackend(sc, memfd);
	if (error) {
		/* undo allocif(); this also releases the unit number */
		shmif_unclone(&sc->sc_ec.ec_if);
		return error;
	}

	/* remember the path for SIOCGLINKSTR */
	sc->sc_backfilelen = strlen(path)+1;
	sc->sc_backfile = kmem_alloc(sc->sc_backfilelen, KM_SLEEP);
	strcpy(sc->sc_backfile, path);

 out:
	if (ifnum)
		*ifnum = unit;

	return 0;
}
    378 
/*
 * if_clone handler: create interface "shmif<unit>".  The backend is
 * attached later via the SIOCSLINKSTR ioctl.
 */
static int
shmif_clone(struct if_clone *ifc, int unit)
{
	int rc __diagused;
	vmem_addr_t unit2;

	/*
	 * Ok, we know the unit number, but we must still reserve it.
	 * Otherwise the wildcard-side of things might get the same one.
	 * This is slightly offset-happy due to vmem.  First, we offset
	 * the range of unit numbers by +1 since vmem cannot deal with
	 * ranges starting from 0.  Talk about uuuh.
	 */
	rc = vmem_xalloc(shmif_units, 1, 0, 0, 0, unit+1, unit+1,
	    VM_SLEEP | VM_INSTANTFIT, &unit2);
	/* the exact-fit allocation of a specific value cannot fail */
	KASSERT(rc == 0 && unit2-1 == unit);

	return allocif(unit, NULL);
}
    398 
/*
 * Tear down an interface: stop it, terminate the receiver thread,
 * detach the backend, release the unit number and free all resources.
 * Always returns 0.
 */
static int
shmif_unclone(struct ifnet *ifp)
{
	struct shmif_sc *sc = ifp->if_softc;

	shmif_stop(ifp, 1);
	if_down(ifp);

	/* tell the receiver thread to exit and wake it from cv_wait */
	mutex_enter(&sc->sc_mtx);
	sc->sc_dying = true;
	cv_broadcast(&sc->sc_cv);
	mutex_exit(&sc->sc_mtx);

	/* sc_rcvl is NULL when rump_threads was disabled at create time */
	if (sc->sc_rcvl)
		kthread_join(sc->sc_rcvl);
	sc->sc_rcvl = NULL;

	/*
	 * Need to be called after the kthread left, otherwise closing kqueue
	 * (sc_kq) hangs sometimes perhaps because of a race condition between
	 * close and kevent in the kthread on the kqueue.
	 */
	finibackend(sc);

	/* unit numbers are stored offset by +1 in the vmem arena */
	vmem_xfree(shmif_units, sc->sc_unit+1, 1);

	ether_ifdetach(ifp);
	if_detach(ifp);

	cv_destroy(&sc->sc_cv);
	mutex_destroy(&sc->sc_mtx);

	kmem_free(sc, sizeof(*sc));

	return 0;
}
    435 
/*
 * if_init handler: mark the interface running and resynchronize the
 * receive pointers with the current bus state.  Fails with ENXIO if
 * no backend is attached.
 */
static int
shmif_init(struct ifnet *ifp)
{
	struct shmif_sc *sc = ifp->if_softc;
	int error = 0;

	if (sc->sc_memfd == -1)
		return ENXIO;
	KASSERT(sc->sc_busmem);

	ifp->if_flags |= IFF_RUNNING;

	mutex_enter(&sc->sc_mtx);
	/* start receiving from the current end of the bus */
	sc->sc_nextpacket = sc->sc_busmem->shm_last;
	sc->sc_devgen = sc->sc_busmem->shm_gen;

	/* wake the receiver thread, which waits for IFF_RUNNING */
	cv_broadcast(&sc->sc_cv);
	mutex_exit(&sc->sc_mtx);

	return error;
}
    457 
    458 static int
    459 shmif_mediachange(struct ifnet *ifp)
    460 {
    461 	struct shmif_sc *sc = ifp->if_softc;
    462 
    463 	if (IFM_SUBTYPE(sc->sc_im.ifm_cur->ifm_media) == IFM_NONE &&
    464 	    ifp->if_link_state != LINK_STATE_DOWN) {
    465 		if_link_state_change(ifp, LINK_STATE_DOWN);
    466 	} else if (IFM_SUBTYPE(sc->sc_im.ifm_cur->ifm_media) == IFM_AUTO &&
    467 	    ifp->if_link_state != LINK_STATE_UP) {
    468 		if_link_state_change(ifp, LINK_STATE_UP);
    469 	}
    470 	return 0;
    471 }
    472 
    473 static void
    474 shmif_mediastatus(struct ifnet *ifp, struct ifmediareq *imr)
    475 {
    476 	struct shmif_sc *sc = ifp->if_softc;
    477 	imr->ifm_active = sc->sc_im.ifm_cur->ifm_media;
    478 }
    479 
/*
 * Interface ioctls.  SIOCGLINKSTR/SIOCSLINKSTR get and set the path of
 * the backing bus file; media ioctls are delegated to ifmedia and
 * everything else to ether_ioctl().
 */
static int
shmif_ioctl(struct ifnet *ifp, u_long cmd, void *data)
{
	struct shmif_sc *sc = ifp->if_softc;
	struct ifdrv *ifd;
	char *path;
	int s, rv, memfd;

	s = splnet();
	switch (cmd) {
	case SIOCGLINKSTR:
		ifd = data;

		/* no backend path attached */
		if (sc->sc_backfilelen == 0) {
			rv = ENOENT;
			break;
		}

		ifd->ifd_len = sc->sc_backfilelen;
		/* caller may be probing only for the required buffer size */
		if (ifd->ifd_cmd == IFLINKSTR_QUERYLEN) {
			rv = 0;
			break;
		}

		if (ifd->ifd_cmd != 0) {
			rv = EINVAL;
			break;
		}

		rv = copyoutstr(sc->sc_backfile, ifd->ifd_data,
		    MIN(sc->sc_backfilelen, ifd->ifd_len), NULL);
		break;
	case SIOCSLINKSTR:
		/* refuse to switch backends while the interface is up */
		if (ifp->if_flags & IFF_UP) {
			rv = EBUSY;
			break;
		}

		ifd = data;
		if (ifd->ifd_cmd == IFLINKSTR_UNSET) {
			finibackend(sc);
			/* Back to the default just in case */
			ifp->if_link_state = LINK_STATE_UNKNOWN;
			rv = 0;
			break;
		} else if (ifd->ifd_cmd != 0) {
			rv = EINVAL;
			break;
		} else if (sc->sc_backfile) {
			/* a backend is already attached; unset it first */
			rv = EBUSY;
			break;
		}

		if (ifd->ifd_len > MAXPATHLEN) {
			rv = E2BIG;
			break;
		} else if (ifd->ifd_len < 1) {
			rv = EINVAL;
			break;
		}

		/* copy in the path, open/create the bus file, attach */
		path = kmem_alloc(ifd->ifd_len, KM_SLEEP);
		rv = copyinstr(ifd->ifd_data, path, ifd->ifd_len, NULL);
		if (rv) {
			kmem_free(path, ifd->ifd_len);
			break;
		}
		rv = rumpuser_open(path,
		    RUMPUSER_OPEN_RDWR | RUMPUSER_OPEN_CREATE, &memfd);
		if (rv) {
			kmem_free(path, ifd->ifd_len);
			break;
		}
		rv = initbackend(sc, memfd);
		if (rv) {
			kmem_free(path, ifd->ifd_len);
			rumpuser_close(memfd);
			break;
		}
		/* backend attached; remember path for SIOCGLINKSTR */
		sc->sc_backfile = path;
		sc->sc_backfilelen = ifd->ifd_len;

		if_link_state_change(ifp, LINK_STATE_UP);
		break;

#ifdef OSIOCSIFMEDIA
	case OSIOCSIFMEDIA:
#endif
	case SIOCSIFMEDIA:
	case SIOCGIFMEDIA:
		rv = ifmedia_ioctl(ifp, data, &sc->sc_im, cmd);
		break;

	default:
		rv = ether_ioctl(ifp, cmd, data);
		if (rv == ENETRESET)
			rv = 0;
		break;
	}
	splx(s);

	return rv;
}
    583 
    584 static void
    585 shmif_start(struct ifnet *ifp)
    586 {
    587 	struct shmif_sc *sc = ifp->if_softc;
    588 	struct mbuf *m, *n;
    589 	bool wrote = false;
    590 
    591 	ifp->if_flags |= IFF_OACTIVE;
    592 
    593 	for (;;) {
    594 		IFQ_DEQUEUE(&ifp->if_snd, m);
    595 		if (m == NULL)
    596 			break;
    597 
    598 		m = ether_sw_offload_tx(ifp, m);
    599 		if (m == NULL) {
    600 			if_statinc(ifp, if_oerrors);
    601 			break;
    602 		}
    603 
    604 		do {
    605 			n = m->m_nextpkt;
    606 			shmif_snd(ifp, m);
    607 			m = n;
    608 		} while (m != NULL);
    609 
    610 		wrote = true;
    611 	}
    612 
    613 	ifp->if_flags &= ~IFF_OACTIVE;
    614 
    615 	/* wakeup? */
    616 	if (wrote) {
    617 		dowakeup(sc);
    618 	}
    619 }
    620 
/* send everything in-context since it's just a matter of mem-to-mem copy */
static void
shmif_snd(struct ifnet *ifp, struct mbuf *m0)
{
	struct shmif_sc *sc = ifp->if_softc;
	struct shmif_mem *busmem = sc->sc_busmem;
	struct shmif_pkthdr sp;
	struct timeval tv;
	struct mbuf *m;
	uint32_t dataoff;
	uint32_t pktsize, pktwrote;
	bool wrap;

	/* total length of the mbuf chain */
	pktsize = 0;
	for (m = m0; m != NULL; m = m->m_next) {
		pktsize += m->m_len;
	}
	KASSERT(pktsize <= ETHERMTU + ETHER_HDR_LEN);

	/* per-packet bus header: length, timestamp and our sender id */
	getmicrouptime(&tv);
	sp.sp_len = pktsize;
	sp.sp_sec = tv.tv_sec;
	sp.sp_usec = tv.tv_usec;
	sp.sp_sender = sc->sc_uid;

	bpf_mtap(ifp, m0, BPF_D_OUT);

	/*
	 * Compare with DOWN to allow UNKNOWN (the default value),
	 * which is required by some ATF tests using rump servers
	 * written in C.
	 */
	if (ifp->if_link_state == LINK_STATE_DOWN)
		goto dontsend;

	shmif_lockbus(busmem);
	KASSERT(busmem->shm_magic == SHMIF_MAGIC);
	/* advance the bus write pointer to claim space for this packet */
	busmem->shm_last = shmif_nextpktoff(busmem, busmem->shm_last);

	/* write header followed by payload, possibly wrapping the ring */
	wrap = false;
	dataoff =
	    shmif_buswrite(busmem, busmem->shm_last, &sp, sizeof(sp), &wrap);
	pktwrote = 0;
	for (m = m0; m != NULL; m = m->m_next) {
		pktwrote += m->m_len;
		dataoff = shmif_buswrite(busmem, dataoff, mtod(m, void *),
		    m->m_len, &wrap);
	}
	KASSERT(pktwrote == pktsize);
	/* wrapping bumps the generation so readers can detect overruns */
	if (wrap) {
		busmem->shm_gen++;
		DPRINTF(("bus generation now %" PRIu64 "\n", busmem->shm_gen));
	}
	shmif_unlockbus(busmem);

dontsend:
	m_freem(m0);
	if_statinc(ifp, if_opackets);

	DPRINTF(("shmif_start: send %d bytes at off %d\n", pktsize,
	    busmem->shm_last));
}
    683 
    684 static void
    685 shmif_stop(struct ifnet *ifp, int disable)
    686 {
    687 	struct shmif_sc *sc = ifp->if_softc;
    688 
    689 	ifp->if_flags &= ~IFF_RUNNING;
    690 	membar_producer();
    691 
    692 	/*
    693 	 * wakeup thread.  this will of course wake up all bus
    694 	 * listeners, but that's life.
    695 	 */
    696 	if (sc->sc_memfd != -1) {
    697 		dowakeup(sc);
    698 	}
    699 }
    700 
    701 
    702 /*
    703  * Check if we have been sleeping too long.  Basically,
    704  * our in-sc nextpkt must by first <= nextpkt <= last"+1".
    705  * We use the fact that first is guaranteed to never overlap
    706  * with the last frame in the ring.
    707  */
static __inline bool
stillvalid_p(struct shmif_sc *sc)
{
	struct shmif_mem *busmem = sc->sc_busmem;
	unsigned gendiff = busmem->shm_gen - sc->sc_devgen;
	uint32_t lastoff, devoff;

	KASSERT(busmem->shm_first != busmem->shm_last);

	/* normalize onto a 2x busmem chunk */
	devoff = sc->sc_nextpacket;
	lastoff = shmif_nextpktoff(busmem, busmem->shm_last);

	/* trivial case: more than one generation behind is always stale */
	if (gendiff > 1)
		return false;
	KASSERT(gendiff <= 1);

	/* Normalize onto 2x busmem chunk */
	if (busmem->shm_first >= lastoff) {
		/* writer has wrapped past the end of the ring */
		lastoff += BUSMEM_DATASIZE;
		if (gendiff == 0)
			devoff += BUSMEM_DATASIZE;
	} else {
		/* no wrap: any generation difference means we were lapped */
		if (gendiff)
			return false;
	}

	/* valid iff our pointer lies within [first, last"+1"] */
	return devoff >= busmem->shm_first && devoff <= lastoff;
}
    738 
/*
 * Receiver thread: wait for packets to appear on the bus, copy them
 * into mbufs and pass eligible ones up the network stack.  Exits when
 * sc_dying is set by shmif_unclone().
 */
static void
shmif_rcv(void *arg)
{
	struct ifnet *ifp = arg;
	struct shmif_sc *sc = ifp->if_softc;
	struct shmif_mem *busmem;
	struct mbuf *m = NULL;
	struct ether_header *eth;
	uint32_t nextpkt;
	bool wrap, passup;
	int error;
	/* pad so the payload after the ethernet header is aligned */
	const int align
	    = ALIGN(sizeof(struct ether_header)) - sizeof(struct ether_header);

 reup:
	/* block until the interface is brought up (or we are told to die) */
	mutex_enter(&sc->sc_mtx);
	while ((ifp->if_flags & IFF_RUNNING) == 0 && !sc->sc_dying)
		cv_wait(&sc->sc_cv, &sc->sc_mtx);
	mutex_exit(&sc->sc_mtx);

	busmem = sc->sc_busmem;

	while (ifp->if_flags & IFF_RUNNING) {
		struct shmif_pkthdr sp;

		/* (re)arm a cluster mbuf for the next packet */
		if (m == NULL) {
			m = m_gethdr(M_WAIT, MT_DATA);
			MCLGET(m, M_WAIT);
			m->m_data += align;
		}

		DPRINTF(("waiting %d/%" PRIu64 "\n",
		    sc->sc_nextpacket, sc->sc_devgen));
		KASSERT(m->m_flags & M_EXT);

		shmif_lockbus(busmem);
		KASSERT(busmem->shm_magic == SHMIF_MAGIC);
		KASSERT(busmem->shm_gen >= sc->sc_devgen);

		/* need more data? */
		if (sc->sc_devgen == busmem->shm_gen &&
		    shmif_nextpktoff(busmem, busmem->shm_last)
		     == sc->sc_nextpacket) {
			/* caught up with the writer; sleep on the watch fd */
			shmif_unlockbus(busmem);
			error = rumpcomp_shmif_watchwait(sc->sc_kq);
			if (__predict_false(error))
				printf("shmif_rcv: wait failed %d\n", error);
			membar_consumer();
			continue;
		}

		if (stillvalid_p(sc)) {
			nextpkt = sc->sc_nextpacket;
		} else {
			/* we were overrun; resume from the oldest frame */
			KASSERT(busmem->shm_gen > 0);
			nextpkt = busmem->shm_first;
			if (busmem->shm_first > busmem->shm_last)
				sc->sc_devgen = busmem->shm_gen - 1;
			else
				sc->sc_devgen = busmem->shm_gen;
			DPRINTF(("dev %p overrun, new data: %d/%" PRIu64 "\n",
			    sc, nextpkt, sc->sc_devgen));
		}

		/*
		 * If our read pointer is ahead the bus last write, our
		 * generation must be one behind.
		 */
		KASSERT(!(nextpkt > busmem->shm_last
		    && sc->sc_devgen == busmem->shm_gen));

		/* copy out the bus packet header, then the frame itself */
		wrap = false;
		nextpkt = shmif_busread(busmem, &sp,
		    nextpkt, sizeof(sp), &wrap);
		KASSERT(sp.sp_len <= ETHERMTU + ETHER_HDR_LEN);
		nextpkt = shmif_busread(busmem, mtod(m, void *),
		    nextpkt, sp.sp_len, &wrap);

		DPRINTF(("shmif_rcv: read packet of length %d at %d\n",
		    sp.sp_len, nextpkt));

		sc->sc_nextpacket = nextpkt;
		shmif_unlockbus(sc->sc_busmem);

		/* crossing the ring end advances our generation */
		if (wrap) {
			sc->sc_devgen++;
			DPRINTF(("dev %p generation now %" PRIu64 "\n",
			    sc, sc->sc_devgen));
		}

		/*
		 * Ignore packets too short to possibly be valid.
		 * This is hit at least for the first frame on a new bus.
		 */
		if (__predict_false(sp.sp_len < ETHER_HDR_LEN)) {
			DPRINTF(("shmif read packet len %d < ETHER_HDR_LEN\n",
			    sp.sp_len));
			continue;
		}

		m->m_len = m->m_pkthdr.len = sp.sp_len;
		m_set_rcvif(m, ifp);

		/*
		 * Test if we want to pass the packet upwards
		 */
		eth = mtod(m, struct ether_header *);
		/*
		 * Compare with DOWN to allow UNKNOWN (the default value),
		 * which is required by some ATF tests using rump servers
		 * written in C.
		 */
		if (ifp->if_link_state == LINK_STATE_DOWN) {
			passup = false;
		} else if (sp.sp_sender == sc->sc_uid) {
			/* drop our own transmissions */
			passup = false;
		} else if (memcmp(eth->ether_dhost, CLLADDR(ifp->if_sadl),
		    ETHER_ADDR_LEN) == 0) {
			/* unicast to us */
			passup = true;
		} else if (ETHER_IS_MULTICAST(eth->ether_dhost)) {
			passup = true;
		} else if (ifp->if_flags & IFF_PROMISC) {
			m->m_flags |= M_PROMISC;
			passup = true;
		} else {
			passup = false;
		}

		if (passup) {
			int bound;

			m = ether_sw_offload_rx(ifp, m);

			KERNEL_LOCK(1, NULL);
			/* Prevent LWP migrations between CPUs for psref(9) */
			bound = curlwp_bind();
			if_input(ifp, m);
			curlwp_bindx(bound);
			KERNEL_UNLOCK_ONE(NULL);

			m = NULL;
		}
		/* else: reuse mbuf for a future packet */
	}
	m_freem(m);
	m = NULL;

	/* interface went down but we are not dying: wait for it to come up */
	if (!sc->sc_dying)
		goto reup;

	kthread_exit(0);
}
    891