/*	$NetBSD: if_tap.c,v 1.54 2009/03/13 18:40:10 plunky Exp $	*/

/*
 * Copyright (c) 2003, 2004, 2008 The NetBSD Foundation.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * tap(4) is a virtual Ethernet interface.  It appears as a real Ethernet
 * device to the system, but can also be accessed by userland through a
 * character device interface, which allows reading and injecting frames.
 */
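
/*
 * A minimal userland sketch of that character device access path
 * (illustrative only, not part of the driver): it assumes a tap0
 * instance already exists, is configured up, and that a /dev/tap0
 * node is present.  One read(2) returns at most one Ethernet frame,
 * as implemented in tap_dev_read() below.
 *
 *	#include <fcntl.h>
 *	#include <stdio.h>
 *	#include <unistd.h>
 *
 *	int
 *	main(void)
 *	{
 *		char frame[1514];	// room for one non-jumbo frame
 *		ssize_t n;
 *		int fd;
 *
 *		fd = open("/dev/tap0", O_RDWR);	// tap_cdev_open()
 *		if (fd == -1)
 *			return 1;
 *		n = read(fd, frame, sizeof(frame));	// one frame per read
 *		if (n > 0)
 *			printf("received a %zd byte frame\n", n);
 *		close(fd);
 *		return 0;
 *	}
 */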

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: if_tap.c,v 1.54 2009/03/13 18:40:10 plunky Exp $");

#if defined(_KERNEL_OPT)
#include "bpfilter.h"
#include "opt_modular.h"
#include "opt_compat_netbsd.h"
#endif

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/conf.h>
#include <sys/device.h>
#include <sys/file.h>
#include <sys/filedesc.h>
#include <sys/ksyms.h>
#include <sys/poll.h>
#include <sys/proc.h>
#include <sys/select.h>
#include <sys/sockio.h>
#if defined(COMPAT_40) || defined(MODULAR)
#include <sys/sysctl.h>
#endif
#include <sys/kauth.h>
#include <sys/mutex.h>
#include <sys/simplelock.h>
#include <sys/intr.h>

#include <net/if.h>
#include <net/if_dl.h>
#include <net/if_ether.h>
#include <net/if_media.h>
#include <net/if_tap.h>
#if NBPFILTER > 0
#include <net/bpf.h>
#endif

#include <compat/sys/sockio.h>

#if defined(COMPAT_40) || defined(MODULAR)
/*
 * sysctl node management
 *
 * It's not really possible to use a SYSCTL_SETUP block with the
 * current module implementation, so it is easier to just define
 * our own function.
 *
 * The handler function is a "helper" in Andrew Brown's sysctl
 * framework terminology.  It is used as a gateway for sysctl
 * requests over the nodes.
 *
 * tap_log allows the module to log the creation of nodes and
 * to destroy them all at once using sysctl_teardown.
 */
static int	tap_node;
static int	tap_sysctl_handler(SYSCTLFN_PROTO);
SYSCTL_SETUP_PROTO(sysctl_tap_setup);
#endif

/*
 * Since we're an Ethernet device, we need the following three
 * components: a leading struct device, a struct ethercom,
 * and also a struct ifmedia since we don't attach a PHY to
 * ourselves.  We could emulate one, but there's no real
 * point.
 */

struct tap_softc {
	device_t	sc_dev;
	struct ifmedia	sc_im;
	struct ethercom	sc_ec;
	int		sc_flags;
#define	TAP_INUSE	0x00000001	/* tap device can only be opened once */
#define	TAP_ASYNCIO	0x00000002	/* user is using async I/O (SIGIO) on the device */
#define	TAP_NBIO	0x00000004	/* user wants calls to avoid blocking */
#define	TAP_GOING	0x00000008	/* interface is being destroyed */
	struct selinfo	sc_rsel;
	pid_t		sc_pgid;	/* For async. IO */
	kmutex_t	sc_rdlock;
	struct simplelock	sc_kqlock;
	void		*sc_sih;
};

/* autoconf(9) glue */

void	tapattach(int);

static int	tap_match(device_t, cfdata_t, void *);
static void	tap_attach(device_t, device_t, void *);
static int	tap_detach(device_t, int);

CFATTACH_DECL_NEW(tap, sizeof(struct tap_softc),
    tap_match, tap_attach, tap_detach, NULL);
extern struct cfdriver tap_cd;

/* Real device access routines */
static int	tap_dev_close(struct tap_softc *);
static int	tap_dev_read(int, struct uio *, int);
static int	tap_dev_write(int, struct uio *, int);
static int	tap_dev_ioctl(int, u_long, void *, struct lwp *);
static int	tap_dev_poll(int, int, struct lwp *);
static int	tap_dev_kqfilter(int, struct knote *);

/* Fileops access routines */
static int	tap_fops_close(file_t *);
static int	tap_fops_read(file_t *, off_t *, struct uio *,
    kauth_cred_t, int);
static int	tap_fops_write(file_t *, off_t *, struct uio *,
    kauth_cred_t, int);
static int	tap_fops_ioctl(file_t *, u_long, void *);
static int	tap_fops_poll(file_t *, int);
static int	tap_fops_kqfilter(file_t *, struct knote *);

static const struct fileops tap_fileops = {
	tap_fops_read,
	tap_fops_write,
	tap_fops_ioctl,
	fnullop_fcntl,
	tap_fops_poll,
	fbadop_stat,
	tap_fops_close,
	tap_fops_kqfilter,
};

/* Helper for cloning open() */
static int	tap_dev_cloner(struct lwp *);

/* Character device routines */
static int	tap_cdev_open(dev_t, int, int, struct lwp *);
static int	tap_cdev_close(dev_t, int, int, struct lwp *);
static int	tap_cdev_read(dev_t, struct uio *, int);
static int	tap_cdev_write(dev_t, struct uio *, int);
static int	tap_cdev_ioctl(dev_t, u_long, void *, int, struct lwp *);
static int	tap_cdev_poll(dev_t, int, struct lwp *);
static int	tap_cdev_kqfilter(dev_t, struct knote *);

const struct cdevsw tap_cdevsw = {
	tap_cdev_open, tap_cdev_close,
	tap_cdev_read, tap_cdev_write,
	tap_cdev_ioctl, nostop, notty,
	tap_cdev_poll, nommap,
	tap_cdev_kqfilter,
	D_OTHER,
};

#define	TAP_CLONER	0xfffff		/* Maximal minor value */

/* kqueue-related routines */
static void	tap_kqdetach(struct knote *);
static int	tap_kqread(struct knote *, long);

/*
 * Those are needed by the if_media interface.
 */

static int	tap_mediachange(struct ifnet *);
static void	tap_mediastatus(struct ifnet *, struct ifmediareq *);

/*
 * Those are needed by the ifnet interface, and would typically be
 * there for any network interface driver.
 * Some other routines are optional: watchdog and drain.
 */

static void	tap_start(struct ifnet *);
static void	tap_stop(struct ifnet *, int);
static int	tap_init(struct ifnet *);
static int	tap_ioctl(struct ifnet *, u_long, void *);

/* Internal functions */
#if defined(COMPAT_40) || defined(MODULAR)
static int	tap_lifaddr(struct ifnet *, u_long, struct ifaliasreq *);
#endif
static void	tap_softintr(void *);

/*
 * tap is a clonable interface, although it is highly unrealistic for
 * an Ethernet device.
 *
 * Here are the bits needed for a clonable interface.
 */
static int	tap_clone_create(struct if_clone *, int);
static int	tap_clone_destroy(struct ifnet *);

struct if_clone tap_cloners = IF_CLONE_INITIALIZER("tap",
					tap_clone_create,
					tap_clone_destroy);

/* Helper functions shared by the two cloning code paths */
static struct tap_softc *	tap_clone_creator(int);
int	tap_clone_destroyer(device_t);

void
tapattach(int n)
{
	int error;

	error = config_cfattach_attach(tap_cd.cd_name, &tap_ca);
	if (error) {
		aprint_error("%s: unable to register cfattach\n",
		    tap_cd.cd_name);
		(void)config_cfdriver_detach(&tap_cd);
		return;
	}

	if_clone_attach(&tap_cloners);
}

/* Pretty much useless for a pseudo-device */
static int
tap_match(device_t parent, cfdata_t cfdata, void *arg)
{

	return (1);
}

void
tap_attach(device_t parent, device_t self, void *aux)
{
	struct tap_softc *sc = device_private(self);
	struct ifnet *ifp;
#if defined(COMPAT_40) || defined(MODULAR)
	const struct sysctlnode *node;
	int error;
#endif
	uint8_t enaddr[ETHER_ADDR_LEN] =
	    { 0xf2, 0x0b, 0xa4, 0xff, 0xff, 0xff };
	char enaddrstr[3 * ETHER_ADDR_LEN];
	struct timeval tv;
	uint32_t ui;

	sc->sc_dev = self;
	sc->sc_sih = softint_establish(SOFTINT_CLOCK, tap_softintr, sc);

	if (!pmf_device_register(self, NULL, NULL))
		aprint_error_dev(self, "couldn't establish power handler\n");

	/*
	 * In order to obtain a unique initial Ethernet address on a host,
	 * do some randomisation using the current uptime.  It's not meant
	 * for anything but avoiding hard-coding an address.
	 */
	getmicrouptime(&tv);
	ui = (tv.tv_sec ^ tv.tv_usec) & 0xffffff;
	memcpy(enaddr+3, (uint8_t *)&ui, 3);

	aprint_verbose_dev(self, "Ethernet address %s\n",
	    ether_snprintf(enaddrstr, sizeof(enaddrstr), enaddr));

	/*
	 * Why 1000baseT? Why not? You can add more.
	 *
	 * Note that there are 3 steps: init, one or several additions to
	 * the list of supported media, and in the end, the selection of one
	 * of them.
	 */
	ifmedia_init(&sc->sc_im, 0, tap_mediachange, tap_mediastatus);
	ifmedia_add(&sc->sc_im, IFM_ETHER|IFM_1000_T, 0, NULL);
	ifmedia_add(&sc->sc_im, IFM_ETHER|IFM_1000_T|IFM_FDX, 0, NULL);
	ifmedia_add(&sc->sc_im, IFM_ETHER|IFM_100_TX, 0, NULL);
	ifmedia_add(&sc->sc_im, IFM_ETHER|IFM_100_TX|IFM_FDX, 0, NULL);
	ifmedia_add(&sc->sc_im, IFM_ETHER|IFM_10_T, 0, NULL);
	ifmedia_add(&sc->sc_im, IFM_ETHER|IFM_10_T|IFM_FDX, 0, NULL);
	ifmedia_add(&sc->sc_im, IFM_ETHER|IFM_AUTO, 0, NULL);
	ifmedia_set(&sc->sc_im, IFM_ETHER|IFM_AUTO);

	/*
	 * One should note that an interface must do multicast in order
	 * to support IPv6.
	 */
	ifp = &sc->sc_ec.ec_if;
	strcpy(ifp->if_xname, device_xname(self));
	ifp->if_softc = sc;
	ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
	ifp->if_ioctl = tap_ioctl;
	ifp->if_start = tap_start;
	ifp->if_stop = tap_stop;
	ifp->if_init = tap_init;
	IFQ_SET_READY(&ifp->if_snd);

	sc->sc_ec.ec_capabilities = ETHERCAP_VLAN_MTU | ETHERCAP_JUMBO_MTU;

	/* Those steps are mandatory for an Ethernet driver, the first call
	 * being common to all network interface drivers. */
	if_attach(ifp);
	ether_ifattach(ifp, enaddr);

	sc->sc_flags = 0;

#if defined(COMPAT_40) || defined(MODULAR)
	/*
	 * Add a sysctl node for that interface.
	 *
	 * The pointer transmitted is not a string, but instead a pointer to
	 * the softc structure, which we can use to build the string value on
	 * the fly in the helper function of the node.  See the comments for
	 * tap_sysctl_handler for details.
	 *
	 * Usually sysctl_createv is called with CTL_CREATE as the next-to-last
	 * component.  However, we can allocate a number ourselves, as we are
	 * the only consumer of the net.link.<iface> node.  In this case, the
	 * unit number is conveniently used to number the node.  CTL_CREATE
	 * would just work, too.
	 */
	if ((error = sysctl_createv(NULL, 0, NULL,
	    &node, CTLFLAG_READWRITE,
	    CTLTYPE_STRING, device_xname(self), NULL,
	    tap_sysctl_handler, 0, sc, 18,
	    CTL_NET, AF_LINK, tap_node, device_unit(sc->sc_dev),
	    CTL_EOL)) != 0)
		aprint_error_dev(self, "sysctl_createv returned %d, ignoring\n",
		    error);
#endif

	/*
	 * Initialize the two locks for the device.
	 *
	 * We need a lock here because even though the tap device can be
	 * opened only once, the file descriptor might be passed to another
	 * process, say a fork(2)ed child.
	 *
	 * The Giant saves us from most of the hassle, but since the read
	 * operation can sleep, we don't want two processes to wake up at
	 * the same moment and both try and dequeue a single packet.
	 *
	 * The queue for event listeners (used by kqueue(9), see below) has
	 * to be protected, too, but we don't need the same level of
	 * complexity for that lock, so a simple spinning lock is fine.
	 */
	mutex_init(&sc->sc_rdlock, MUTEX_DEFAULT, IPL_NONE);
	simple_lock_init(&sc->sc_kqlock);

	selinit(&sc->sc_rsel);
}

/*
 * When detaching, we do the inverse of what is done in the attach
 * routine, in reverse order.
 */
static int
tap_detach(device_t self, int flags)
{
	struct tap_softc *sc = device_private(self);
	struct ifnet *ifp = &sc->sc_ec.ec_if;
#if defined(COMPAT_40) || defined(MODULAR)
	int error;
#endif
	int s;

	sc->sc_flags |= TAP_GOING;
	s = splnet();
	tap_stop(ifp, 1);
	if_down(ifp);
	splx(s);

	softint_disestablish(sc->sc_sih);

#if defined(COMPAT_40) || defined(MODULAR)
	/*
	 * Destroying a single leaf is a very straightforward operation using
	 * sysctl_destroyv.  One should be sure to always end the path with
	 * CTL_EOL.
	 */
	if ((error = sysctl_destroyv(NULL, CTL_NET, AF_LINK, tap_node,
	    device_unit(sc->sc_dev), CTL_EOL)) != 0)
		aprint_error_dev(self,
		    "sysctl_destroyv returned %d, ignoring\n", error);
#endif
	ether_ifdetach(ifp);
	if_detach(ifp);
	ifmedia_delete_instance(&sc->sc_im, IFM_INST_ANY);
	seldestroy(&sc->sc_rsel);
	mutex_destroy(&sc->sc_rdlock);

	pmf_device_deregister(self);

	return (0);
}

/*
 * This function is called by the ifmedia layer to notify the driver
 * that the user requested a media change.  A real driver would
 * reconfigure the hardware.
 */
static int
tap_mediachange(struct ifnet *ifp)
{
	return (0);
}

/*
 * Here the user asks for the currently used media.
 */
static void
tap_mediastatus(struct ifnet *ifp, struct ifmediareq *imr)
{
	struct tap_softc *sc = (struct tap_softc *)ifp->if_softc;
	imr->ifm_active = sc->sc_im.ifm_cur->ifm_media;
}

/*
 * This is the function where we SEND packets.
 *
 * There is no 'receive' equivalent.  A typical driver will get
 * interrupts from the hardware, and from there will inject new packets
 * into the network stack.
 *
 * Once handled, a packet must be freed.  A real driver might not be able
 * to fit all the pending packets into the hardware, and is allowed to
 * return before having sent all the packets.  It should then use the
 * if_flags flag IFF_OACTIVE to notify the upper layer.
 *
 * There are also other flags one should check, such as IFF_PAUSE.
 *
 * It is our duty to make packets available to BPF listeners.
 *
 * You should be aware that this function is called by the Ethernet layer
 * at splnet().
 *
 * When the device is opened, we have to pass the packet(s) to userland.
 * For that we stay in OACTIVE mode while userland gets the packets,
 * and we send a signal to the processes waiting to read.
 *
 * wakeup(sc) is the counterpart to the tsleep call in
 * tap_dev_read, while selnotify() is used for kevent(2) and
 * poll(2) (which includes select(2)) listeners.
 */
static void
tap_start(struct ifnet *ifp)
{
	struct tap_softc *sc = (struct tap_softc *)ifp->if_softc;
	struct mbuf *m0;

	if ((sc->sc_flags & TAP_INUSE) == 0) {
		/* Simply drop packets */
		for (;;) {
			IFQ_DEQUEUE(&ifp->if_snd, m0);
			if (m0 == NULL)
				return;

			ifp->if_opackets++;
#if NBPFILTER > 0
			if (ifp->if_bpf)
				bpf_mtap(ifp->if_bpf, m0);
#endif

			m_freem(m0);
		}
	} else if (!IFQ_IS_EMPTY(&ifp->if_snd)) {
		ifp->if_flags |= IFF_OACTIVE;
		wakeup(sc);
		selnotify(&sc->sc_rsel, 0, 1);
		if (sc->sc_flags & TAP_ASYNCIO)
			softint_schedule(sc->sc_sih);
	}
}

static void
tap_softintr(void *cookie)
{
	struct tap_softc *sc;
	struct ifnet *ifp;
	int a, b;

	sc = cookie;

	if (sc->sc_flags & TAP_ASYNCIO) {
		ifp = &sc->sc_ec.ec_if;
		if (ifp->if_flags & IFF_RUNNING) {
			a = POLL_IN;
			b = POLLIN|POLLRDNORM;
		} else {
			a = POLL_HUP;
			b = 0;
		}
		fownsignal(sc->sc_pgid, SIGIO, a, b, NULL);
	}
}

/*
 * A typical driver will only contain the following handlers for
 * ioctl calls, except SIOCSIFPHYADDR.
 * The latter is a hack I used to set the Ethernet address of the
 * faked device.
 *
 * Note that both ifmedia_ioctl() and ether_ioctl() have to be
 * called under splnet().
 */
static int
tap_ioctl(struct ifnet *ifp, u_long cmd, void *data)
{
	struct tap_softc *sc = (struct tap_softc *)ifp->if_softc;
	struct ifreq *ifr = (struct ifreq *)data;
	int s, error;

	s = splnet();

	switch (cmd) {
#ifdef OSIOCSIFMEDIA
	case OSIOCSIFMEDIA:
#endif
	case SIOCSIFMEDIA:
	case SIOCGIFMEDIA:
		error = ifmedia_ioctl(ifp, ifr, &sc->sc_im, cmd);
		break;
#if defined(COMPAT_40) || defined(MODULAR)
	case SIOCSIFPHYADDR:
		error = tap_lifaddr(ifp, cmd, (struct ifaliasreq *)data);
		break;
#endif
	default:
		error = ether_ioctl(ifp, cmd, data);
		if (error == ENETRESET)
			error = 0;
		break;
	}

	splx(s);

	return (error);
}

#if defined(COMPAT_40) || defined(MODULAR)
/*
 * Helper function to set Ethernet address.  This has been replaced by
 * the generic SIOCALIFADDR ioctl on a PF_LINK socket.
 */
static int
tap_lifaddr(struct ifnet *ifp, u_long cmd, struct ifaliasreq *ifra)
{
	const struct sockaddr *sa = &ifra->ifra_addr;

	if (sa->sa_family != AF_LINK)
		return (EINVAL);

	if_set_sadl(ifp, sa->sa_data, ETHER_ADDR_LEN, false);

	return (0);
}
#endif

/*
 * _init() would typically be called when an interface goes up,
 * meaning it should configure itself into the state in which it
 * can send packets.
 */
static int
tap_init(struct ifnet *ifp)
{
	ifp->if_flags |= IFF_RUNNING;

	tap_start(ifp);

	return (0);
}

/*
 * _stop() is called when an interface goes down.  It is our
 * responsibility to validate that state by clearing the
 * IFF_RUNNING flag.
 *
 * We have to wake up all the sleeping processes to have the pending
 * read requests cancelled.
 */
static void
tap_stop(struct ifnet *ifp, int disable)
{
	struct tap_softc *sc = (struct tap_softc *)ifp->if_softc;

	ifp->if_flags &= ~IFF_RUNNING;
	wakeup(sc);
	selnotify(&sc->sc_rsel, 0, 1);
	if (sc->sc_flags & TAP_ASYNCIO)
		softint_schedule(sc->sc_sih);
}

/*
 * The 'create' command of ifconfig can be used to create
 * any numbered instance of a given device.  Thus we have to
 * make sure we have enough room in cd_devs to create the
 * user-specified instance.  config_attach_pseudo will do this
 * for us.
 */
static int
tap_clone_create(struct if_clone *ifc, int unit)
{
	if (tap_clone_creator(unit) == NULL) {
		aprint_error("%s%d: unable to attach an instance\n",
		    tap_cd.cd_name, unit);
		return (ENXIO);
	}

	return (0);
}

/*
 * tap(4) can be cloned in two ways:
 *   using 'ifconfig tap0 create', which will use the network
 *     interface cloning API, and call tap_clone_create above.
 *   opening the cloning device node, whose minor number is TAP_CLONER.
 *     See below for an explanation of how this part works.
 */
static struct tap_softc *
tap_clone_creator(int unit)
{
	struct cfdata *cf;

	cf = malloc(sizeof(*cf), M_DEVBUF, M_WAITOK);
	cf->cf_name = tap_cd.cd_name;
	cf->cf_atname = tap_ca.ca_name;
	if (unit == -1) {
		/* let autoconf find the first free one */
		cf->cf_unit = 0;
		cf->cf_fstate = FSTATE_STAR;
	} else {
		cf->cf_unit = unit;
		cf->cf_fstate = FSTATE_FOUND;
	}

	return device_private(config_attach_pseudo(cf));
}

/*
 * The clean design of if_clone and autoconf(9) makes that part
 * really straightforward.  The second argument of config_detach
 * means neither QUIET nor FORCED.
 */
static int
tap_clone_destroy(struct ifnet *ifp)
{
	struct tap_softc *sc = ifp->if_softc;

	return tap_clone_destroyer(sc->sc_dev);
}

int
tap_clone_destroyer(device_t dev)
{
	cfdata_t cf = device_cfdata(dev);
	int error;

	if ((error = config_detach(dev, 0)) != 0)
		aprint_error_dev(dev, "unable to detach instance\n");
	free(cf, M_DEVBUF);

	return (error);
}

/*
 * tap(4) is a bit of a hybrid device.  It can be used in two different
 * ways:
 * 1. ifconfig tapN create, then use /dev/tapN to read/write off it.
 * 2. open /dev/tap, get a new interface created and read/write off it.
 *    That interface is destroyed when the process that created it exits.
 *
 * The first way is managed by the cdevsw structure, and you access
 * interfaces through a (major, minor) mapping: tap4 is obtained by the
 * minor number 4.  The entry points for the cdevsw interface are prefixed
 * by tap_cdev_.
 *
 * The second way is the so-called "cloning" device.  It's a special minor
 * number (chosen as the maximal number, to allow as many tap devices as
 * possible).  The user first opens the cloner (e.g., /dev/tap), and that
 * call ends in tap_cdev_open.  The actual place where it is handled is
 * tap_dev_cloner.
 *
 * A tap device cannot be opened more than once at a time, so the cdevsw
 * part of open() does nothing more than note that the interface is in use,
 * and hence ready to actually handle packets.
 */
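
/*
 * Illustrative sketch of the second (cloning) path, not part of the
 * driver: open the cloner node (assumed here to be /dev/tap, with minor
 * TAP_CLONER) and ask the freshly created interface for its name with
 * the TAPGIFNAME ioctl handled in tap_dev_ioctl() below.
 *
 *	#include <sys/ioctl.h>
 *	#include <sys/socket.h>
 *	#include <net/if.h>
 *	#include <net/if_tap.h>
 *	#include <fcntl.h>
 *	#include <stdio.h>
 *	#include <unistd.h>
 *
 *	int
 *	main(void)
 *	{
 *		struct ifreq ifr;
 *		int fd;
 *
 *		fd = open("/dev/tap", O_RDWR);	// ends up in tap_dev_cloner()
 *		if (fd == -1)
 *			return 1;
 *		if (ioctl(fd, TAPGIFNAME, &ifr) == -1)
 *			return 1;
 *		printf("using %s\n", ifr.ifr_name);	// e.g. "tap3"
 *		close(fd);	// the interface is destroyed on close
 *		return 0;
 *	}
 */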

static int
tap_cdev_open(dev_t dev, int flags, int fmt, struct lwp *l)
{
	struct tap_softc *sc;

	if (minor(dev) == TAP_CLONER)
		return tap_dev_cloner(l);

	sc = device_lookup_private(&tap_cd, minor(dev));
	if (sc == NULL)
		return (ENXIO);

	/* The device can only be opened once */
	if (sc->sc_flags & TAP_INUSE)
		return (EBUSY);
	sc->sc_flags |= TAP_INUSE;
	return (0);
}

/*
 * There are several kinds of cloning devices, and the simplest is the one
 * tap(4) uses.  What it does is replace the file descriptor with a new one,
 * with its own fileops structure (which maps to the various read, write,
 * ioctl functions).  It starts by allocating a new file descriptor with
 * fd_allocfile, then actually creates the new tap device.
 *
 * Once those two steps are successful, we can re-wire the existing file
 * descriptor to its new self.  This is done with fd_clone(): it fills the fp
 * structure as needed (notably f_data gets filled with the fifth parameter
 * passed, the unit number of the tap device, which will allow us to identify
 * the device later), and returns EMOVEFD.
 *
 * That magic value is interpreted by sys_open() which then replaces the
 * current file descriptor by the new one (through a magic member of struct
 * lwp, l_dupfd).
 *
 * The tap device is flagged as being busy since it otherwise could be
 * externally accessed through the corresponding device node with the cdevsw
 * interface.
 */

static int
tap_dev_cloner(struct lwp *l)
{
	struct tap_softc *sc;
	file_t *fp;
	int error, fd;

	if ((error = fd_allocfile(&fp, &fd)) != 0)
		return (error);

	if ((sc = tap_clone_creator(-1)) == NULL) {
		fd_abort(curproc, fp, fd);
		return (ENXIO);
	}

	sc->sc_flags |= TAP_INUSE;

	return fd_clone(fp, fd, FREAD|FWRITE, &tap_fileops,
	    (void *)(intptr_t)device_unit(sc->sc_dev));
}

/*
 * While all other operations (read, write, ioctl, poll and kqfilter) are
 * really the same whether we are in cdevsw or fileops mode, the close()
 * function is slightly different in the two cases.
 *
 * As for the others, the core of it is shared in tap_dev_close.  What
 * it does is sufficient for the cdevsw interface, but the cloning interface
 * needs another thing: the interface is destroyed when the process that
 * created it closes it.
 */
static int
tap_cdev_close(dev_t dev, int flags, int fmt,
    struct lwp *l)
{
	struct tap_softc *sc =
	    device_lookup_private(&tap_cd, minor(dev));

	if (sc == NULL)
		return (ENXIO);

	return tap_dev_close(sc);
}

/*
 * It might happen that the administrator used ifconfig to externally destroy
 * the interface.  In that case, tap_fops_close will be called while
 * tap_detach is already happening.  If we called it again from here, we
 * would deadlock.  TAP_GOING ensures that this situation doesn't happen.
 */
static int
tap_fops_close(file_t *fp)
{
	int unit = (intptr_t)fp->f_data;
	struct tap_softc *sc;
	int error;

	sc = device_lookup_private(&tap_cd, unit);
	if (sc == NULL)
		return (ENXIO);

	/* tap_dev_close currently always succeeds, but it might not
	 * always be the case. */
	KERNEL_LOCK(1, NULL);
	if ((error = tap_dev_close(sc)) != 0) {
		KERNEL_UNLOCK_ONE(NULL);
		return (error);
	}

	/* Destroy the device now that it is no longer useful,
	 * unless it's already being destroyed. */
	if ((sc->sc_flags & TAP_GOING) != 0) {
		KERNEL_UNLOCK_ONE(NULL);
		return (0);
	}

	error = tap_clone_destroyer(sc->sc_dev);
	KERNEL_UNLOCK_ONE(NULL);
	return error;
}

static int
tap_dev_close(struct tap_softc *sc)
{
	struct ifnet *ifp;
	int s;

	s = splnet();
	/* Let tap_start handle packets again */
	ifp = &sc->sc_ec.ec_if;
	ifp->if_flags &= ~IFF_OACTIVE;

	/* Purge output queue */
	if (!(IFQ_IS_EMPTY(&ifp->if_snd))) {
		struct mbuf *m;

		for (;;) {
			IFQ_DEQUEUE(&ifp->if_snd, m);
			if (m == NULL)
				break;

			ifp->if_opackets++;
#if NBPFILTER > 0
			if (ifp->if_bpf)
				bpf_mtap(ifp->if_bpf, m);
#endif

			m_freem(m);
		}
	}
	splx(s);

	sc->sc_flags &= ~(TAP_INUSE | TAP_ASYNCIO);

	return (0);
}

static int
tap_cdev_read(dev_t dev, struct uio *uio, int flags)
{
	return tap_dev_read(minor(dev), uio, flags);
}

static int
tap_fops_read(file_t *fp, off_t *offp, struct uio *uio,
    kauth_cred_t cred, int flags)
{
	int error;

	KERNEL_LOCK(1, NULL);
	error = tap_dev_read((intptr_t)fp->f_data, uio, flags);
	KERNEL_UNLOCK_ONE(NULL);
	return error;
}

static int
tap_dev_read(int unit, struct uio *uio, int flags)
{
	struct tap_softc *sc =
	    device_lookup_private(&tap_cd, unit);
	struct ifnet *ifp;
	struct mbuf *m, *n;
	int error = 0, s;

	if (sc == NULL)
		return (ENXIO);

	ifp = &sc->sc_ec.ec_if;
	if ((ifp->if_flags & IFF_UP) == 0)
		return (EHOSTDOWN);

	/*
	 * In the TAP_NBIO case, we have to make sure we won't be sleeping
	 */
	if ((sc->sc_flags & TAP_NBIO) != 0) {
		if (!mutex_tryenter(&sc->sc_rdlock))
			return (EWOULDBLOCK);
	} else {
		mutex_enter(&sc->sc_rdlock);
	}

	s = splnet();
	if (IFQ_IS_EMPTY(&ifp->if_snd)) {
		ifp->if_flags &= ~IFF_OACTIVE;
		/*
		 * We must release the lock before sleeping, and re-acquire it
		 * after.
		 */
		mutex_exit(&sc->sc_rdlock);
		if (sc->sc_flags & TAP_NBIO)
			error = EWOULDBLOCK;
		else
			error = tsleep(sc, PSOCK|PCATCH, "tap", 0);
		splx(s);

		if (error != 0)
			return (error);
		/* The device might have been downed */
		if ((ifp->if_flags & IFF_UP) == 0)
			return (EHOSTDOWN);
		if ((sc->sc_flags & TAP_NBIO)) {
			if (!mutex_tryenter(&sc->sc_rdlock))
				return (EWOULDBLOCK);
		} else {
			mutex_enter(&sc->sc_rdlock);
		}
		s = splnet();
	}

	IFQ_DEQUEUE(&ifp->if_snd, m);
	ifp->if_flags &= ~IFF_OACTIVE;
	splx(s);
	if (m == NULL) {
		error = 0;
		goto out;
	}

	ifp->if_opackets++;
#if NBPFILTER > 0
	if (ifp->if_bpf)
		bpf_mtap(ifp->if_bpf, m);
#endif

	/*
	 * One read is one packet.
	 */
	do {
		error = uiomove(mtod(m, void *),
		    min(m->m_len, uio->uio_resid), uio);
		MFREE(m, n);
		m = n;
	} while (m != NULL && uio->uio_resid > 0 && error == 0);

	if (m != NULL)
		m_freem(m);

out:
	mutex_exit(&sc->sc_rdlock);
	return (error);
}

static int
tap_cdev_write(dev_t dev, struct uio *uio, int flags)
{
	return tap_dev_write(minor(dev), uio, flags);
}

static int
tap_fops_write(file_t *fp, off_t *offp, struct uio *uio,
    kauth_cred_t cred, int flags)
{
	int error;

	KERNEL_LOCK(1, NULL);
	error = tap_dev_write((intptr_t)fp->f_data, uio, flags);
	KERNEL_UNLOCK_ONE(NULL);
	return error;
}

static int
tap_dev_write(int unit, struct uio *uio, int flags)
{
	struct tap_softc *sc =
	    device_lookup_private(&tap_cd, unit);
	struct ifnet *ifp;
	struct mbuf *m, **mp;
	int error = 0;
	int s;

	if (sc == NULL)
		return (ENXIO);

	ifp = &sc->sc_ec.ec_if;

	/* One write, one packet, that's the rule */
	MGETHDR(m, M_DONTWAIT, MT_DATA);
	if (m == NULL) {
		ifp->if_ierrors++;
		return (ENOBUFS);
	}
	m->m_pkthdr.len = uio->uio_resid;

	mp = &m;
	while (error == 0 && uio->uio_resid > 0) {
		if (*mp != m) {
			MGET(*mp, M_DONTWAIT, MT_DATA);
			if (*mp == NULL) {
				error = ENOBUFS;
				break;
			}
		}
		(*mp)->m_len = min(MHLEN, uio->uio_resid);
		error = uiomove(mtod(*mp, void *), (*mp)->m_len, uio);
		mp = &(*mp)->m_next;
	}
	if (error) {
		ifp->if_ierrors++;
		m_freem(m);
		return (error);
	}

	ifp->if_ipackets++;
	m->m_pkthdr.rcvif = ifp;

#if NBPFILTER > 0
	if (ifp->if_bpf)
		bpf_mtap(ifp->if_bpf, m);
#endif
	s = splnet();
	(*ifp->if_input)(ifp, m);
	splx(s);

	return (0);
}
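
/*
 * Illustrative counterpart to the read example near the top of this
 * file (not part of the driver): injecting one frame into the stack
 * through an already-open tap file descriptor fd.  One write(2) is
 * treated as one complete Ethernet frame by tap_dev_write() above;
 * hdr and payload are placeholders the caller would fill in with a
 * valid Ethernet header and data.
 *
 *	#include <sys/types.h>
 *	#include <net/if_ether.h>
 *	#include <string.h>
 *	#include <unistd.h>
 *
 *	ssize_t
 *	inject_frame(int fd, const struct ether_header *hdr,
 *	    const void *payload, size_t len)
 *	{
 *		char frame[ETHER_MAX_LEN];
 *
 *		if (sizeof(*hdr) + len > sizeof(frame))
 *			return -1;
 *		memcpy(frame, hdr, sizeof(*hdr));
 *		memcpy(frame + sizeof(*hdr), payload, len);
 *		return write(fd, frame, sizeof(*hdr) + len);	// one frame
 *	}
 */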

static int
tap_cdev_ioctl(dev_t dev, u_long cmd, void *data, int flags,
    struct lwp *l)
{
	return tap_dev_ioctl(minor(dev), cmd, data, l);
}

static int
tap_fops_ioctl(file_t *fp, u_long cmd, void *data)
{
	return tap_dev_ioctl((intptr_t)fp->f_data, cmd, data, curlwp);
}

static int
tap_dev_ioctl(int unit, u_long cmd, void *data, struct lwp *l)
{
	struct tap_softc *sc =
	    device_lookup_private(&tap_cd, unit);
	int error = 0;

	if (sc == NULL)
		return (ENXIO);

	switch (cmd) {
	case FIONREAD:
		{
			struct ifnet *ifp = &sc->sc_ec.ec_if;
			struct mbuf *m;
			int s;

			s = splnet();
			IFQ_POLL(&ifp->if_snd, m);

			if (m == NULL)
				*(int *)data = 0;
			else
				*(int *)data = m->m_pkthdr.len;
			splx(s);
		} break;
	case TIOCSPGRP:
	case FIOSETOWN:
		error = fsetown(&sc->sc_pgid, cmd, data);
		break;
	case TIOCGPGRP:
	case FIOGETOWN:
		error = fgetown(sc->sc_pgid, cmd, data);
		break;
	case FIOASYNC:
		if (*(int *)data)
			sc->sc_flags |= TAP_ASYNCIO;
		else
			sc->sc_flags &= ~TAP_ASYNCIO;
		break;
	case FIONBIO:
		if (*(int *)data)
			sc->sc_flags |= TAP_NBIO;
		else
			sc->sc_flags &= ~TAP_NBIO;
		break;
#ifdef OTAPGIFNAME
	case OTAPGIFNAME:
#endif
	case TAPGIFNAME:
		{
			struct ifreq *ifr = (struct ifreq *)data;
			struct ifnet *ifp = &sc->sc_ec.ec_if;

			strlcpy(ifr->ifr_name, ifp->if_xname, IFNAMSIZ);
		} break;
	default:
		error = ENOTTY;
		break;
	}

	return (error);
}
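
/*
 * Illustrative userland use of the handlers above (not part of the
 * driver): switch an open tap descriptor fd to non-blocking reads and
 * request SIGIO delivery, which exercises the TAP_NBIO and TAP_ASYNCIO
 * paths together with FIOSETOWN.
 *
 *	#include <sys/types.h>
 *	#include <sys/ioctl.h>
 *	#include <unistd.h>
 *
 *	int
 *	setup_async(int fd)
 *	{
 *		int on = 1;
 *		pid_t pid = getpid();
 *
 *		if (ioctl(fd, FIONBIO, &on) == -1)	// reads return EWOULDBLOCK
 *			return -1;
 *		if (ioctl(fd, FIOSETOWN, &pid) == -1)	// who receives SIGIO
 *			return -1;
 *		return ioctl(fd, FIOASYNC, &on);	// sets TAP_ASYNCIO
 *	}
 */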

static int
tap_cdev_poll(dev_t dev, int events, struct lwp *l)
{
	return tap_dev_poll(minor(dev), events, l);
}

static int
tap_fops_poll(file_t *fp, int events)
{
	return tap_dev_poll((intptr_t)fp->f_data, events, curlwp);
}

static int
tap_dev_poll(int unit, int events, struct lwp *l)
{
	struct tap_softc *sc =
	    device_lookup_private(&tap_cd, unit);
	int revents = 0;

	if (sc == NULL)
		return POLLERR;

	if (events & (POLLIN|POLLRDNORM)) {
		struct ifnet *ifp = &sc->sc_ec.ec_if;
		struct mbuf *m;
		int s;

		s = splnet();
		IFQ_POLL(&ifp->if_snd, m);
		splx(s);

		if (m != NULL)
			revents |= events & (POLLIN|POLLRDNORM);
		else {
			simple_lock(&sc->sc_kqlock);
			selrecord(l, &sc->sc_rsel);
			simple_unlock(&sc->sc_kqlock);
		}
	}
	revents |= events & (POLLOUT|POLLWRNORM);

	return (revents);
}

static struct filterops tap_read_filterops = { 1, NULL, tap_kqdetach,
	tap_kqread };
static struct filterops tap_seltrue_filterops = { 1, NULL, tap_kqdetach,
	filt_seltrue };

static int
tap_cdev_kqfilter(dev_t dev, struct knote *kn)
{
	return tap_dev_kqfilter(minor(dev), kn);
}

static int
tap_fops_kqfilter(file_t *fp, struct knote *kn)
{
	return tap_dev_kqfilter((intptr_t)fp->f_data, kn);
}

static int
tap_dev_kqfilter(int unit, struct knote *kn)
{
	struct tap_softc *sc =
	    device_lookup_private(&tap_cd, unit);

	if (sc == NULL)
		return (ENXIO);

	KERNEL_LOCK(1, NULL);
	switch (kn->kn_filter) {
	case EVFILT_READ:
		kn->kn_fop = &tap_read_filterops;
		break;
	case EVFILT_WRITE:
		kn->kn_fop = &tap_seltrue_filterops;
		break;
	default:
		KERNEL_UNLOCK_ONE(NULL);
		return (EINVAL);
	}

	kn->kn_hook = sc;
	simple_lock(&sc->sc_kqlock);
	SLIST_INSERT_HEAD(&sc->sc_rsel.sel_klist, kn, kn_selnext);
	simple_unlock(&sc->sc_kqlock);
	KERNEL_UNLOCK_ONE(NULL);
	return (0);
}
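
/*
 * Illustrative sketch of a kevent(2) consumer for the EVFILT_READ
 * filter registered above (not part of the driver): wait until at
 * least one frame is queued on the open tap descriptor fd, then
 * report its length as returned in kn_data by tap_kqread().
 *
 *	#include <sys/types.h>
 *	#include <sys/event.h>
 *	#include <sys/time.h>
 *	#include <inttypes.h>
 *	#include <stdio.h>
 *	#include <unistd.h>
 *
 *	int
 *	wait_for_frame(int fd)
 *	{
 *		struct kevent kev;
 *		int n, kq = kqueue();
 *
 *		if (kq == -1)
 *			return -1;
 *		EV_SET(&kev, fd, EVFILT_READ, EV_ADD, 0, 0, 0);
 *		n = kevent(kq, &kev, 1, &kev, 1, NULL);	// blocks until ready
 *		close(kq);
 *		if (n < 1)
 *			return -1;
 *		printf("next frame is %" PRId64 " bytes\n", kev.data);
 *		return 0;
 *	}
 */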

static void
tap_kqdetach(struct knote *kn)
{
	struct tap_softc *sc = (struct tap_softc *)kn->kn_hook;

	KERNEL_LOCK(1, NULL);
	simple_lock(&sc->sc_kqlock);
	SLIST_REMOVE(&sc->sc_rsel.sel_klist, kn, knote, kn_selnext);
	simple_unlock(&sc->sc_kqlock);
	KERNEL_UNLOCK_ONE(NULL);
}

static int
tap_kqread(struct knote *kn, long hint)
{
	struct tap_softc *sc = (struct tap_softc *)kn->kn_hook;
	struct ifnet *ifp = &sc->sc_ec.ec_if;
	struct mbuf *m;
	int s, rv;

	KERNEL_LOCK(1, NULL);
	s = splnet();
	IFQ_POLL(&ifp->if_snd, m);

	if (m == NULL)
		kn->kn_data = 0;
	else
		kn->kn_data = m->m_pkthdr.len;
	splx(s);
	rv = (kn->kn_data != 0 ? 1 : 0);
	KERNEL_UNLOCK_ONE(NULL);
	return rv;
}

#if defined(COMPAT_40) || defined(MODULAR)
/*
 * sysctl management routines
 * You can set the address of an interface through:
 * net.link.tap.tap<number>
 *
 * Note the consistent use of tap_log in order to use
 * sysctl_teardown at unload time.
 *
 * In the kernel you will find a lot of SYSCTL_SETUP blocks.  Those
 * blocks register a function in a special section of the kernel
 * (called a link set) which is used at init_sysctl() time to cycle
 * through all those functions to create the kernel's sysctl tree.
 *
 * It is not possible to use link sets in a module, so the easiest
 * approach is to simply call our own setup routine at load time.
 *
 * In the SYSCTL_SETUP blocks you find in the kernel, nodes have the
 * CTLFLAG_PERMANENT flag, meaning they cannot be removed.  Once the
 * whole kernel sysctl tree is built, it is not possible to add any
 * permanent node.
 *
 * It should be noted that we're not saving the sysctlnode pointer
 * we are returned when creating the "tap" node.  That structure
 * cannot be trusted once out of the calling function, as it might
 * get reused.  So we just save the MIB number, and always give the
 * full path starting from the root for later calls to sysctl_createv
 * and sysctl_destroyv.
 */
SYSCTL_SETUP(sysctl_tap_setup, "sysctl net.link.tap subtree setup")
{
	const struct sysctlnode *node;
	int error = 0;

	if ((error = sysctl_createv(clog, 0, NULL, NULL,
	    CTLFLAG_PERMANENT,
	    CTLTYPE_NODE, "net", NULL,
	    NULL, 0, NULL, 0,
	    CTL_NET, CTL_EOL)) != 0)
		return;

	if ((error = sysctl_createv(clog, 0, NULL, NULL,
	    CTLFLAG_PERMANENT,
	    CTLTYPE_NODE, "link", NULL,
	    NULL, 0, NULL, 0,
	    CTL_NET, AF_LINK, CTL_EOL)) != 0)
		return;

	/*
	 * The first four parameters of sysctl_createv are for management.
	 *
	 * The four that follow, here starting with a '0' for the flags,
	 * describe the node.
	 *
	 * The next series of four set its value, through various possible
	 * means.
	 *
	 * Last but not least, the path to the node is described.  That path
	 * is relative to the given root (third argument).  Here we're
	 * starting from the root.
	 */
	if ((error = sysctl_createv(clog, 0, NULL, &node,
	    CTLFLAG_PERMANENT,
	    CTLTYPE_NODE, "tap", NULL,
	    NULL, 0, NULL, 0,
	    CTL_NET, AF_LINK, CTL_CREATE, CTL_EOL)) != 0)
		return;
	tap_node = node->sysctl_num;
}
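
/*
 * Illustrative userland view of the node created above (not part of
 * the driver): reading and changing the address of tap0 through
 * sysctlbyname(3), assuming a tap0 instance exists and the caller has
 * sufficient privilege.  The same can be done from the shell with
 * sysctl(8), e.g. "sysctl -w net.link.tap.tap0=f2:0b:a4:00:00:01".
 *
 *	#include <sys/param.h>
 *	#include <sys/sysctl.h>
 *	#include <stdio.h>
 *	#include <string.h>
 *
 *	int
 *	print_and_set_tap0_address(void)
 *	{
 *		char old[18];	// "xx:xx:xx:xx:xx:xx" plus NUL
 *		const char newaddr[] = "f2:0b:a4:00:00:01";
 *		size_t len = sizeof(old);
 *
 *		if (sysctlbyname("net.link.tap.tap0", old, &len,
 *		    newaddr, strlen(newaddr) + 1) == -1)
 *			return -1;
 *		printf("address was %s\n", old);
 *		return 0;
 *	}
 */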

/*
 * The helper functions make Andrew Brown's interface really
 * shine.  They make it possible to create values on the fly, whether
 * the sysctl value is read or written.
 *
 * As shown in the example in the man page, the first step is to
 * create a copy of the node to have sysctl_lookup work on it.
 *
 * Here, we have more work to do than just a copy, since we have
 * to create the string.  The first step is to collect the actual
 * value of the node, which is a convenient pointer to the softc
 * of the interface.  From there we create the string and use it
 * as the value, but only for the *copy* of the node.
 *
 * Then we let sysctl_lookup do the magic, which consists in
 * setting oldp and newp as required by the operation.  When the
 * value is read, that means that the string will be copied to
 * the user, and when it is written, the new value will be copied
 * over in the addr array.
 *
 * If newp is NULL, the user was reading the value, so we don't
 * have anything else to do.  If a new value was written, we
 * have to check it.
 *
 * If it is incorrect, we can return an error and leave 'node' as
 * it is: since it is a copy of the actual node, the change will
 * be forgotten.
 *
 * Upon a correct input, we commit the change to the ifnet
 * structure of our interface.
 */
static int
tap_sysctl_handler(SYSCTLFN_ARGS)
{
	struct sysctlnode node;
	struct tap_softc *sc;
	struct ifnet *ifp;
	int error;
	size_t len;
	char addr[3 * ETHER_ADDR_LEN];
	uint8_t enaddr[ETHER_ADDR_LEN];

	node = *rnode;
	sc = node.sysctl_data;
	ifp = &sc->sc_ec.ec_if;
	(void)ether_snprintf(addr, sizeof(addr), CLLADDR(ifp->if_sadl));
	node.sysctl_data = addr;
	error = sysctl_lookup(SYSCTLFN_CALL(&node));
	if (error || newp == NULL)
		return (error);

	len = strlen(addr);
	if (len < 11 || len > 17)
		return (EINVAL);

	/* Commit change */
	if (ether_nonstatic_aton(enaddr, addr) != 0)
		return (EINVAL);
	if_set_sadl(ifp, enaddr, ETHER_ADDR_LEN, false);
	return (error);
}
#endif