ip_mroute.c revision 1.162 1 /* $NetBSD: ip_mroute.c,v 1.162 2018/07/11 12:48:42 martin Exp $ */
2
3 /*
4 * Copyright (c) 1992, 1993
5 * The Regents of the University of California. All rights reserved.
6 *
7 * This code is derived from software contributed to Berkeley by
8 * Stephen Deering of Stanford University.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 * 3. Neither the name of the University nor the names of its contributors
19 * may be used to endorse or promote products derived from this software
20 * without specific prior written permission.
21 *
22 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32 * SUCH DAMAGE.
33 *
34 * @(#)ip_mroute.c 8.2 (Berkeley) 11/15/93
35 */
36
37 /*
38 * Copyright (c) 1989 Stephen Deering
39 *
40 * This code is derived from software contributed to Berkeley by
41 * Stephen Deering of Stanford University.
42 *
43 * Redistribution and use in source and binary forms, with or without
44 * modification, are permitted provided that the following conditions
45 * are met:
46 * 1. Redistributions of source code must retain the above copyright
47 * notice, this list of conditions and the following disclaimer.
48 * 2. Redistributions in binary form must reproduce the above copyright
49 * notice, this list of conditions and the following disclaimer in the
50 * documentation and/or other materials provided with the distribution.
51 * 3. All advertising materials mentioning features or use of this software
52 * must display the following acknowledgement:
53 * This product includes software developed by the University of
54 * California, Berkeley and its contributors.
55 * 4. Neither the name of the University nor the names of its contributors
56 * may be used to endorse or promote products derived from this software
57 * without specific prior written permission.
58 *
59 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
60 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
61 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
62 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
63 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
64 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
65 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
66 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
67 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
68 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
69 * SUCH DAMAGE.
70 *
71 * @(#)ip_mroute.c 8.2 (Berkeley) 11/15/93
72 */
73
74 /*
75 * IP multicast forwarding procedures
76 *
77 * Written by David Waitzman, BBN Labs, August 1988.
78 * Modified by Steve Deering, Stanford, February 1989.
79 * Modified by Mark J. Steiglitz, Stanford, May, 1991
80 * Modified by Van Jacobson, LBL, January 1993
81 * Modified by Ajit Thyagarajan, PARC, August 1993
82 * Modified by Bill Fenner, PARC, April 1994
83 * Modified by Charles M. Hannum, NetBSD, May 1995.
84 * Modified by Ahmed Helmy, SGI, June 1996
85 * Modified by George Edmond Eddy (Rusty), ISI, February 1998
86 * Modified by Pavlin Radoslavov, USC/ISI, May 1998, August 1999, October 2000
87 * Modified by Hitoshi Asaeda, WIDE, August 2000
88 * Modified by Pavlin Radoslavov, ICSI, October 2002
89 *
90 * MROUTING Revision: 1.2
91 * and PIM-SMv2 and PIM-DM support, advanced API support,
92 * bandwidth metering and signaling
93 */
94
95 #include <sys/cdefs.h>
96 __KERNEL_RCSID(0, "$NetBSD: ip_mroute.c,v 1.162 2018/07/11 12:48:42 martin Exp $");
97
98 #ifdef _KERNEL_OPT
99 #include "opt_inet.h"
100 #include "opt_ipsec.h"
101 #include "opt_pim.h"
102 #endif
103
104 #ifdef PIM
105 #define _PIM_VT 1
106 #endif
107
108 #include <sys/param.h>
109 #include <sys/systm.h>
110 #include <sys/callout.h>
111 #include <sys/mbuf.h>
112 #include <sys/socket.h>
113 #include <sys/socketvar.h>
114 #include <sys/errno.h>
115 #include <sys/time.h>
116 #include <sys/kernel.h>
117 #include <sys/kmem.h>
118 #include <sys/ioctl.h>
119 #include <sys/syslog.h>
120
121 #include <net/if.h>
122 #include <net/raw_cb.h>
123
124 #include <netinet/in.h>
125 #include <netinet/in_var.h>
126 #include <netinet/in_systm.h>
127 #include <netinet/in_offload.h>
128 #include <netinet/ip.h>
129 #include <netinet/ip_var.h>
130 #include <netinet/in_pcb.h>
131 #include <netinet/udp.h>
132 #include <netinet/igmp.h>
133 #include <netinet/igmp_var.h>
134 #include <netinet/ip_mroute.h>
135 #ifdef PIM
136 #include <netinet/pim.h>
137 #include <netinet/pim_var.h>
138 #endif
139 #include <netinet/ip_encap.h>
140
141 #ifdef IPSEC
142 #include <netipsec/ipsec.h>
143 #include <netipsec/key.h>
144 #endif
145
146 #define IP_MULTICASTOPTS 0
147 #define M_PULLUP(m, len) \
148 do { \
149 if ((m) && ((m)->m_flags & M_EXT || (m)->m_len < (len))) \
150 (m) = m_pullup((m), (len)); \
151 } while (/*CONSTCOND*/ 0)
152
153 /*
154 * Globals. All but ip_mrouter and ip_mrtproto could be static,
155 * except for netstat or debugging purposes.
156 */
157 struct socket *ip_mrouter = NULL;
158 int ip_mrtproto = IGMP_DVMRP; /* for netstat only */
159
160 #define MFCHASH(a, g) \
161 ((((a).s_addr >> 20) ^ ((a).s_addr >> 10) ^ (a).s_addr ^ \
162 ((g).s_addr >> 20) ^ ((g).s_addr >> 10) ^ (g).s_addr) & mfchash)
163 LIST_HEAD(mfchashhdr, mfc) *mfchashtbl;
164 u_long mfchash;
165
166 u_char nexpire[MFCTBLSIZ];
167 struct vif viftable[MAXVIFS];
168 struct mrtstat mrtstat;
169 u_int mrtdebug = 0; /* debug level */
170 #define DEBUG_MFC 0x02
171 #define DEBUG_FORWARD 0x04
172 #define DEBUG_EXPIRE 0x08
173 #define DEBUG_XMIT 0x10
174 #define DEBUG_PIM 0x20
175
176 #define VIFI_INVALID ((vifi_t) -1)
177
178 u_int tbfdebug = 0; /* tbf debug level */
179
180 /* vif attachment using sys/netinet/ip_encap.c */
181 static void vif_input(struct mbuf *, int, int, void *);
182 static int vif_encapcheck(struct mbuf *, int, int, void *);
183
184 static const struct encapsw vif_encapsw = {
185 .encapsw4 = {
186 .pr_input = vif_input,
187 .pr_ctlinput = NULL,
188 }
189 };
190
191 #define EXPIRE_TIMEOUT (hz / 4) /* 4x / second */
192 #define UPCALL_EXPIRE 6 /* number of timeouts */
193
194 /*
195 * Define the token bucket filter structures
196 */
197
198 #define TBF_REPROCESS (hz / 100) /* 100x / second */
199
200 static int get_sg_cnt(struct sioc_sg_req *);
201 static int get_vif_cnt(struct sioc_vif_req *);
202 static int ip_mrouter_init(struct socket *, int);
203 static int set_assert(int);
204 static int add_vif(struct vifctl *);
205 static int del_vif(vifi_t *);
206 static void update_mfc_params(struct mfc *, struct mfcctl2 *);
207 static void init_mfc_params(struct mfc *, struct mfcctl2 *);
208 static void expire_mfc(struct mfc *);
209 static int add_mfc(struct sockopt *);
210 #ifdef UPCALL_TIMING
211 static void collate(struct timeval *);
212 #endif
213 static int del_mfc(struct sockopt *);
214 static int set_api_config(struct sockopt *); /* chose API capabilities */
215 static int socket_send(struct socket *, struct mbuf *, struct sockaddr_in *);
216 static void expire_upcalls(void *);
217 static int ip_mdq(struct mbuf *, struct ifnet *, struct mfc *);
218 static void phyint_send(struct ip *, struct vif *, struct mbuf *);
219 static void encap_send(struct ip *, struct vif *, struct mbuf *);
220 static void tbf_control(struct vif *, struct mbuf *, struct ip *, u_int32_t);
221 static void tbf_queue(struct vif *, struct mbuf *);
222 static void tbf_process_q(struct vif *);
223 static void tbf_reprocess_q(void *);
224 static int tbf_dq_sel(struct vif *, struct ip *);
225 static void tbf_send_packet(struct vif *, struct mbuf *);
226 static void tbf_update_tokens(struct vif *);
227 static int priority(struct vif *, struct ip *);
228
229 /*
230 * Bandwidth monitoring
231 */
232 static void free_bw_list(struct bw_meter *);
233 static int add_bw_upcall(struct bw_upcall *);
234 static int del_bw_upcall(struct bw_upcall *);
235 static void bw_meter_receive_packet(struct bw_meter *, int , struct timeval *);
236 static void bw_meter_prepare_upcall(struct bw_meter *, struct timeval *);
237 static void bw_upcalls_send(void);
238 static void schedule_bw_meter(struct bw_meter *, struct timeval *);
239 static void unschedule_bw_meter(struct bw_meter *);
240 static void bw_meter_process(void);
241 static void expire_bw_upcalls_send(void *);
242 static void expire_bw_meter_process(void *);
243
244 #ifdef PIM
245 static int pim_register_send(struct ip *, struct vif *,
246 struct mbuf *, struct mfc *);
247 static int pim_register_send_rp(struct ip *, struct vif *,
248 struct mbuf *, struct mfc *);
249 static int pim_register_send_upcall(struct ip *, struct vif *,
250 struct mbuf *, struct mfc *);
251 static struct mbuf *pim_register_prepare(struct ip *, struct mbuf *);
252 #endif
253
254 #define ENCAP_TTL 64
255 #define ENCAP_PROTO IPPROTO_IPIP
256
257 /* prototype IP hdr for encapsulated packets */
258 static const struct ip multicast_encap_iphdr = {
259 .ip_hl = sizeof(struct ip) >> 2,
260 .ip_v = IPVERSION,
261 .ip_len = sizeof(struct ip),
262 .ip_ttl = ENCAP_TTL,
263 .ip_p = ENCAP_PROTO,
264 };
265
266 /*
267 * Bandwidth meter variables and constants
268 */
269
270 /*
271 * Pending timeouts are stored in a hash table, the key being the
272 * expiration time. Periodically, the entries are analysed and processed.
273 */
274 #define BW_METER_BUCKETS 1024
275 static struct bw_meter *bw_meter_timers[BW_METER_BUCKETS];
276 struct callout bw_meter_ch;
277 #define BW_METER_PERIOD (hz) /* periodical handling of bw meters */
278
279 /*
280 * Pending upcalls are stored in a vector which is flushed when
281 * full, or periodically
282 */
283 static struct bw_upcall bw_upcalls[BW_UPCALLS_MAX];
284 static u_int bw_upcalls_n; /* # of pending upcalls */
285 struct callout bw_upcalls_ch;
286 #define BW_UPCALLS_PERIOD (hz) /* periodical flush of bw upcalls */
287
288 #ifdef PIM
289 struct pimstat pimstat;
290
291 /*
292 * Note: the PIM Register encapsulation adds the following in front of a
293 * data packet:
294 *
295 * struct pim_encap_hdr {
296 * struct ip ip;
297 * struct pim_encap_pimhdr pim;
298 * }
299 */
300
301 struct pim_encap_pimhdr {
302 struct pim pim;
303 uint32_t flags;
304 };
305
306 static struct ip pim_encap_iphdr = {
307 .ip_v = IPVERSION,
308 .ip_hl = sizeof(struct ip) >> 2,
309 .ip_len = sizeof(struct ip),
310 .ip_ttl = ENCAP_TTL,
311 .ip_p = IPPROTO_PIM,
312 };
313
314 static struct pim_encap_pimhdr pim_encap_pimhdr = {
315 {
316 PIM_MAKE_VT(PIM_VERSION, PIM_REGISTER), /* PIM vers and message type */
317 0, /* reserved */
318 0, /* checksum */
319 },
320 0 /* flags */
321 };
322
323 static struct ifnet multicast_register_if;
324 static vifi_t reg_vif_num = VIFI_INVALID;
325 #endif /* PIM */
326
327
328 /*
329 * Private variables.
330 */
331 static vifi_t numvifs = 0;
332
333 static struct callout expire_upcalls_ch;
334
335 /*
336 * whether or not special PIM assert processing is enabled.
337 */
338 static int pim_assert;
339 /*
340 * Rate limit for assert notification messages, in usec
341 */
342 #define ASSERT_MSG_TIME 3000000
343
344 /*
345 * Kernel multicast routing API capabilities and setup.
346 * If more API capabilities are added to the kernel, they should be
347 * recorded in `mrt_api_support'.
348 */
349 static const u_int32_t mrt_api_support = (MRT_MFC_FLAGS_DISABLE_WRONGVIF |
350 MRT_MFC_FLAGS_BORDER_VIF |
351 MRT_MFC_RP |
352 MRT_MFC_BW_UPCALL);
353 static u_int32_t mrt_api_config = 0;
354
355 /*
356 * Find a route for a given origin IP address and Multicast group address
357 * Type of service parameter to be added in the future!!!
358 * Statistics are updated by the caller if needed
359 * (mrtstat.mrts_mfc_lookups and mrtstat.mrts_mfc_misses)
360 */
361 static struct mfc *
362 mfc_find(struct in_addr *o, struct in_addr *g)
363 {
364 struct mfc *rt;
365
366 LIST_FOREACH(rt, &mfchashtbl[MFCHASH(*o, *g)], mfc_hash) {
367 if (in_hosteq(rt->mfc_origin, *o) &&
368 in_hosteq(rt->mfc_mcastgrp, *g) &&
369 (rt->mfc_stall == NULL))
370 break;
371 }
372
373 return rt;
374 }
375
376 /*
377 * Macros to compute elapsed time efficiently
378 * Borrowed from Van Jacobson's scheduling code
379 */
380 #define TV_DELTA(a, b, delta) do { \
381 int xxs; \
382 delta = (a).tv_usec - (b).tv_usec; \
383 xxs = (a).tv_sec - (b).tv_sec; \
384 switch (xxs) { \
385 case 2: \
386 delta += 1000000; \
387 /* fall through */ \
388 case 1: \
389 delta += 1000000; \
390 /* fall through */ \
391 case 0: \
392 break; \
393 default: \
394 delta += (1000000 * xxs); \
395 break; \
396 } \
397 } while (/*CONSTCOND*/ 0)
398
399 #ifdef UPCALL_TIMING
400 u_int32_t upcall_data[51];
401 #endif /* UPCALL_TIMING */
402
403 /*
404 * Handle MRT setsockopt commands to modify the multicast routing tables.
405 */
406 int
407 ip_mrouter_set(struct socket *so, struct sockopt *sopt)
408 {
409 int error;
410 int optval;
411 struct vifctl vifc;
412 vifi_t vifi;
413 struct bw_upcall bwuc;
414
415 if (sopt->sopt_name != MRT_INIT && so != ip_mrouter)
416 error = ENOPROTOOPT;
417 else {
418 switch (sopt->sopt_name) {
419 case MRT_INIT:
420 error = sockopt_getint(sopt, &optval);
421 if (error)
422 break;
423
424 error = ip_mrouter_init(so, optval);
425 break;
426 case MRT_DONE:
427 error = ip_mrouter_done();
428 break;
429 case MRT_ADD_VIF:
430 error = sockopt_get(sopt, &vifc, sizeof(vifc));
431 if (error)
432 break;
433 error = add_vif(&vifc);
434 break;
435 case MRT_DEL_VIF:
436 error = sockopt_get(sopt, &vifi, sizeof(vifi));
437 if (error)
438 break;
439 error = del_vif(&vifi);
440 break;
441 case MRT_ADD_MFC:
442 error = add_mfc(sopt);
443 break;
444 case MRT_DEL_MFC:
445 error = del_mfc(sopt);
446 break;
447 case MRT_ASSERT:
448 error = sockopt_getint(sopt, &optval);
449 if (error)
450 break;
451 error = set_assert(optval);
452 break;
453 case MRT_API_CONFIG:
454 error = set_api_config(sopt);
455 break;
456 case MRT_ADD_BW_UPCALL:
457 error = sockopt_get(sopt, &bwuc, sizeof(bwuc));
458 if (error)
459 break;
460 error = add_bw_upcall(&bwuc);
461 break;
462 case MRT_DEL_BW_UPCALL:
463 error = sockopt_get(sopt, &bwuc, sizeof(bwuc));
464 if (error)
465 break;
466 error = del_bw_upcall(&bwuc);
467 break;
468 default:
469 error = ENOPROTOOPT;
470 break;
471 }
472 }
473 return error;
474 }
475
476 /*
477 * Handle MRT getsockopt commands
478 */
479 int
480 ip_mrouter_get(struct socket *so, struct sockopt *sopt)
481 {
482 int error;
483
484 if (so != ip_mrouter)
485 error = ENOPROTOOPT;
486 else {
487 switch (sopt->sopt_name) {
488 case MRT_VERSION:
489 error = sockopt_setint(sopt, 0x0305); /* XXX !!!! */
490 break;
491 case MRT_ASSERT:
492 error = sockopt_setint(sopt, pim_assert);
493 break;
494 case MRT_API_SUPPORT:
495 error = sockopt_set(sopt, &mrt_api_support,
496 sizeof(mrt_api_support));
497 break;
498 case MRT_API_CONFIG:
499 error = sockopt_set(sopt, &mrt_api_config,
500 sizeof(mrt_api_config));
501 break;
502 default:
503 error = ENOPROTOOPT;
504 break;
505 }
506 }
507 return error;
508 }
509
510 /*
511 * Handle ioctl commands to obtain information from the cache
512 */
513 int
514 mrt_ioctl(struct socket *so, u_long cmd, void *data)
515 {
516 int error;
517
518 if (so != ip_mrouter)
519 error = EINVAL;
520 else
521 switch (cmd) {
522 case SIOCGETVIFCNT:
523 error = get_vif_cnt((struct sioc_vif_req *)data);
524 break;
525 case SIOCGETSGCNT:
526 error = get_sg_cnt((struct sioc_sg_req *)data);
527 break;
528 default:
529 error = EINVAL;
530 break;
531 }
532
533 return error;
534 }
535
536 /*
537 * returns the packet, byte, rpf-failure count for the source group provided
538 */
539 static int
540 get_sg_cnt(struct sioc_sg_req *req)
541 {
542 int s;
543 struct mfc *rt;
544
545 s = splsoftnet();
546 rt = mfc_find(&req->src, &req->grp);
547 if (rt == NULL) {
548 splx(s);
549 req->pktcnt = req->bytecnt = req->wrong_if = 0xffffffff;
550 return EADDRNOTAVAIL;
551 }
552 req->pktcnt = rt->mfc_pkt_cnt;
553 req->bytecnt = rt->mfc_byte_cnt;
554 req->wrong_if = rt->mfc_wrong_if;
555 splx(s);
556
557 return 0;
558 }
559
560 /*
561 * returns the input and output packet and byte counts on the vif provided
562 */
563 static int
564 get_vif_cnt(struct sioc_vif_req *req)
565 {
566 vifi_t vifi = req->vifi;
567
568 if (vifi >= numvifs)
569 return EINVAL;
570
571 req->icount = viftable[vifi].v_pkt_in;
572 req->ocount = viftable[vifi].v_pkt_out;
573 req->ibytes = viftable[vifi].v_bytes_in;
574 req->obytes = viftable[vifi].v_bytes_out;
575
576 return 0;
577 }
578
579 /*
580 * Enable multicast routing
581 */
582 static int
583 ip_mrouter_init(struct socket *so, int v)
584 {
585 if (mrtdebug)
586 log(LOG_DEBUG,
587 "ip_mrouter_init: so_type = %d, pr_protocol = %d\n",
588 so->so_type, so->so_proto->pr_protocol);
589
590 if (so->so_type != SOCK_RAW ||
591 so->so_proto->pr_protocol != IPPROTO_IGMP)
592 return EOPNOTSUPP;
593
594 if (v != 1)
595 return EINVAL;
596
597 if (ip_mrouter != NULL)
598 return EADDRINUSE;
599
600 ip_mrouter = so;
601
602 mfchashtbl = hashinit(MFCTBLSIZ, HASH_LIST, true, &mfchash);
603 memset((void *)nexpire, 0, sizeof(nexpire));
604
605 pim_assert = 0;
606
607 callout_init(&expire_upcalls_ch, 0);
608 callout_reset(&expire_upcalls_ch, EXPIRE_TIMEOUT,
609 expire_upcalls, NULL);
610
611 callout_init(&bw_upcalls_ch, 0);
612 callout_reset(&bw_upcalls_ch, BW_UPCALLS_PERIOD,
613 expire_bw_upcalls_send, NULL);
614
615 callout_init(&bw_meter_ch, 0);
616 callout_reset(&bw_meter_ch, BW_METER_PERIOD,
617 expire_bw_meter_process, NULL);
618
619 if (mrtdebug)
620 log(LOG_DEBUG, "ip_mrouter_init\n");
621
622 return 0;
623 }
624
625 /*
626 * Disable multicast routing
627 */
628 int
629 ip_mrouter_done(void)
630 {
631 vifi_t vifi;
632 struct vif *vifp;
633 int i;
634 int s;
635
636 s = splsoftnet();
637
638 /* Clear out all the vifs currently in use. */
639 for (vifi = 0; vifi < numvifs; vifi++) {
640 vifp = &viftable[vifi];
641 if (!in_nullhost(vifp->v_lcl_addr))
642 reset_vif(vifp);
643 }
644
645 numvifs = 0;
646 pim_assert = 0;
647 mrt_api_config = 0;
648
649 callout_stop(&expire_upcalls_ch);
650 callout_stop(&bw_upcalls_ch);
651 callout_stop(&bw_meter_ch);
652
653 /*
654 * Free all multicast forwarding cache entries.
655 */
656 for (i = 0; i < MFCTBLSIZ; i++) {
657 struct mfc *rt, *nrt;
658
659 for (rt = LIST_FIRST(&mfchashtbl[i]); rt; rt = nrt) {
660 nrt = LIST_NEXT(rt, mfc_hash);
661
662 expire_mfc(rt);
663 }
664 }
665
666 memset((void *)nexpire, 0, sizeof(nexpire));
667 hashdone(mfchashtbl, HASH_LIST, mfchash);
668 mfchashtbl = NULL;
669
670 bw_upcalls_n = 0;
671 memset(bw_meter_timers, 0, sizeof(bw_meter_timers));
672
673 /* Reset de-encapsulation cache. */
674
675 ip_mrouter = NULL;
676
677 splx(s);
678
679 if (mrtdebug)
680 log(LOG_DEBUG, "ip_mrouter_done\n");
681
682 return 0;
683 }
684
685 void
686 ip_mrouter_detach(struct ifnet *ifp)
687 {
688 int vifi, i;
689 struct vif *vifp;
690 struct mfc *rt;
691 struct rtdetq *rte;
692
693 /* XXX not sure about side effect to userland routing daemon */
694 for (vifi = 0; vifi < numvifs; vifi++) {
695 vifp = &viftable[vifi];
696 if (vifp->v_ifp == ifp)
697 reset_vif(vifp);
698 }
699 for (i = 0; i < MFCTBLSIZ; i++) {
700 if (nexpire[i] == 0)
701 continue;
702 LIST_FOREACH(rt, &mfchashtbl[i], mfc_hash) {
703 for (rte = rt->mfc_stall; rte; rte = rte->next) {
704 if (rte->ifp == ifp)
705 rte->ifp = NULL;
706 }
707 }
708 }
709 }
710
711 /*
712 * Set PIM assert processing global
713 */
714 static int
715 set_assert(int i)
716 {
717 pim_assert = !!i;
718 return 0;
719 }
720
721 /*
722 * Configure API capabilities
723 */
724 static int
725 set_api_config(struct sockopt *sopt)
726 {
727 u_int32_t apival;
728 int i, error;
729
730 /*
731 * We can set the API capabilities only if it is the first operation
732 * after MRT_INIT. I.e.:
733 * - there are no vifs installed
734 * - pim_assert is not enabled
735 * - the MFC table is empty
736 */
737 error = sockopt_get(sopt, &apival, sizeof(apival));
738 if (error)
739 return error;
740 if (numvifs > 0)
741 return EPERM;
742 if (pim_assert)
743 return EPERM;
744 for (i = 0; i < MFCTBLSIZ; i++) {
745 if (LIST_FIRST(&mfchashtbl[i]) != NULL)
746 return EPERM;
747 }
748
749 mrt_api_config = apival & mrt_api_support;
750 return 0;
751 }
752
753 /*
754 * Add a vif to the vif table
755 */
756 static int
757 add_vif(struct vifctl *vifcp)
758 {
759 struct vif *vifp;
760 struct ifnet *ifp;
761 int error, s;
762 struct sockaddr_in sin;
763
764 if (vifcp->vifc_vifi >= MAXVIFS)
765 return EINVAL;
766 if (in_nullhost(vifcp->vifc_lcl_addr))
767 return EADDRNOTAVAIL;
768
769 vifp = &viftable[vifcp->vifc_vifi];
770 if (!in_nullhost(vifp->v_lcl_addr))
771 return EADDRINUSE;
772
773 /* Find the interface with an address in AF_INET family. */
774 #ifdef PIM
775 if (vifcp->vifc_flags & VIFF_REGISTER) {
776 /*
777 * XXX: Because VIFF_REGISTER does not really need a valid
778 * local interface (e.g. it could be 127.0.0.2), we don't
779 * check its address.
780 */
781 ifp = NULL;
782 } else
783 #endif
784 {
785 struct ifaddr *ifa;
786
787 sockaddr_in_init(&sin, &vifcp->vifc_lcl_addr, 0);
788 s = pserialize_read_enter();
789 ifa = ifa_ifwithaddr(sintosa(&sin));
790 if (ifa == NULL) {
791 pserialize_read_exit(s);
792 return EADDRNOTAVAIL;
793 }
794 ifp = ifa->ifa_ifp;
795 /* FIXME NOMPSAFE */
796 pserialize_read_exit(s);
797 }
798
799 if (vifcp->vifc_flags & VIFF_TUNNEL) {
800 if (vifcp->vifc_flags & VIFF_SRCRT) {
801 log(LOG_ERR, "source routed tunnels not supported\n");
802 return EOPNOTSUPP;
803 }
804
805 /* attach this vif to decapsulator dispatch table */
806 /*
807 * XXX Use addresses in registration so that matching
808 * can be done with radix tree in decapsulator. But,
809 * we need to check inner header for multicast, so
810 * this requires both radix tree lookup and then a
811 * function to check, and this is not supported yet.
812 */
813 error = encap_lock_enter();
814 if (error)
815 return error;
816 vifp->v_encap_cookie = encap_attach_func(AF_INET, IPPROTO_IPV4,
817 vif_encapcheck, &vif_encapsw, vifp);
818 encap_lock_exit();
819 if (!vifp->v_encap_cookie)
820 return EINVAL;
821
822 /* Create a fake encapsulation interface. */
823 ifp = malloc(sizeof(*ifp), M_MRTABLE, M_WAITOK|M_ZERO);
824 snprintf(ifp->if_xname, sizeof(ifp->if_xname),
825 "mdecap%d", vifcp->vifc_vifi);
826
827 /* Prepare cached route entry. */
828 memset(&vifp->v_route, 0, sizeof(vifp->v_route));
829 #ifdef PIM
830 } else if (vifcp->vifc_flags & VIFF_REGISTER) {
831 ifp = &multicast_register_if;
832 if (mrtdebug)
833 log(LOG_DEBUG, "Adding a register vif, ifp: %p\n",
834 (void *)ifp);
835 if (reg_vif_num == VIFI_INVALID) {
836 memset(ifp, 0, sizeof(*ifp));
837 snprintf(ifp->if_xname, sizeof(ifp->if_xname),
838 "register_vif");
839 ifp->if_flags = IFF_LOOPBACK;
840 memset(&vifp->v_route, 0, sizeof(vifp->v_route));
841 reg_vif_num = vifcp->vifc_vifi;
842 }
843 #endif
844 } else {
845 /* Make sure the interface supports multicast. */
846 if ((ifp->if_flags & IFF_MULTICAST) == 0)
847 return EOPNOTSUPP;
848
849 /* Enable promiscuous reception of all IP multicasts. */
850 sockaddr_in_init(&sin, &zeroin_addr, 0);
851 error = if_mcast_op(ifp, SIOCADDMULTI, sintosa(&sin));
852 if (error)
853 return error;
854 }
855
856 s = splsoftnet();
857
858 /* Define parameters for the tbf structure. */
859 vifp->tbf_q = NULL;
860 vifp->tbf_t = &vifp->tbf_q;
861 microtime(&vifp->tbf_last_pkt_t);
862 vifp->tbf_n_tok = 0;
863 vifp->tbf_q_len = 0;
864 vifp->tbf_max_q_len = MAXQSIZE;
865
866 vifp->v_flags = vifcp->vifc_flags;
867 vifp->v_threshold = vifcp->vifc_threshold;
868 /* scaling up here allows division by 1024 in critical code */
869 vifp->v_rate_limit = vifcp->vifc_rate_limit * 1024 / 1000;
870 vifp->v_lcl_addr = vifcp->vifc_lcl_addr;
871 vifp->v_rmt_addr = vifcp->vifc_rmt_addr;
872 vifp->v_ifp = ifp;
873 /* Initialize per vif pkt counters. */
874 vifp->v_pkt_in = 0;
875 vifp->v_pkt_out = 0;
876 vifp->v_bytes_in = 0;
877 vifp->v_bytes_out = 0;
878
879 callout_init(&vifp->v_repq_ch, 0);
880
881 splx(s);
882
883 /* Adjust numvifs up if the vifi is higher than numvifs. */
884 if (numvifs <= vifcp->vifc_vifi)
885 numvifs = vifcp->vifc_vifi + 1;
886
887 if (mrtdebug)
888 log(LOG_DEBUG, "add_vif #%d, lcladdr %x, %s %x, thresh %x, rate %d\n",
889 vifcp->vifc_vifi,
890 ntohl(vifcp->vifc_lcl_addr.s_addr),
891 (vifcp->vifc_flags & VIFF_TUNNEL) ? "rmtaddr" : "mask",
892 ntohl(vifcp->vifc_rmt_addr.s_addr),
893 vifcp->vifc_threshold,
894 vifcp->vifc_rate_limit);
895
896 return 0;
897 }
898
/*
 * Return a vif to its unconfigured state: stop its reprocess callout,
 * detach it from the decapsulator dispatch table, free any packets
 * still queued in its token bucket, release type-specific resources
 * (tunnel pseudo-interface / register vif slot / multicast membership),
 * and finally zero the whole structure.
 */
void
reset_vif(struct vif *vifp)
{
	struct mbuf *m, *n;
	struct ifnet *ifp;
	struct sockaddr_in sin;

	callout_stop(&vifp->v_repq_ch);

	/* detach this vif from decapsulator dispatch table */
	encap_lock_enter();
	encap_detach(vifp->v_encap_cookie);
	encap_lock_exit();
	vifp->v_encap_cookie = NULL;

	/*
	 * Free packets queued at the interface
	 */
	for (m = vifp->tbf_q; m != NULL; m = n) {
		n = m->m_nextpkt;
		m_freem(m);
	}

	if (vifp->v_flags & VIFF_TUNNEL)
		/* Frees the fake "mdecap%d" ifnet created by add_vif(). */
		free(vifp->v_ifp, M_MRTABLE);
	else if (vifp->v_flags & VIFF_REGISTER) {
#ifdef PIM
		reg_vif_num = VIFI_INVALID;
#endif
	} else {
		/* Undo the SIOCADDMULTI done when the vif was added. */
		sockaddr_in_init(&sin, &zeroin_addr, 0);
		ifp = vifp->v_ifp;
		if_mcast_op(ifp, SIOCDELMULTI, sintosa(&sin));
	}
	memset((void *)vifp, 0, sizeof(*vifp));
}
935
936 /*
937 * Delete a vif from the vif table
938 */
939 static int
940 del_vif(vifi_t *vifip)
941 {
942 struct vif *vifp;
943 vifi_t vifi;
944 int s;
945
946 if (*vifip >= numvifs)
947 return EINVAL;
948
949 vifp = &viftable[*vifip];
950 if (in_nullhost(vifp->v_lcl_addr))
951 return EADDRNOTAVAIL;
952
953 s = splsoftnet();
954
955 reset_vif(vifp);
956
957 /* Adjust numvifs down */
958 for (vifi = numvifs; vifi > 0; vifi--)
959 if (!in_nullhost(viftable[vifi - 1].v_lcl_addr))
960 break;
961 numvifs = vifi;
962
963 splx(s);
964
965 if (mrtdebug)
966 log(LOG_DEBUG, "del_vif %d, numvifs %d\n", *vifip, numvifs);
967
968 return 0;
969 }
970
971 /*
972 * update an mfc entry without resetting counters and S,G addresses.
973 */
974 static void
975 update_mfc_params(struct mfc *rt, struct mfcctl2 *mfccp)
976 {
977 int i;
978
979 rt->mfc_parent = mfccp->mfcc_parent;
980 for (i = 0; i < numvifs; i++) {
981 rt->mfc_ttls[i] = mfccp->mfcc_ttls[i];
982 rt->mfc_flags[i] = mfccp->mfcc_flags[i] & mrt_api_config &
983 MRT_MFC_FLAGS_ALL;
984 }
985 /* set the RP address */
986 if (mrt_api_config & MRT_MFC_RP)
987 rt->mfc_rp = mfccp->mfcc_rp;
988 else
989 rt->mfc_rp = zeroin_addr;
990 }
991
992 /*
993 * fully initialize an mfc entry from the parameter.
994 */
995 static void
996 init_mfc_params(struct mfc *rt, struct mfcctl2 *mfccp)
997 {
998 rt->mfc_origin = mfccp->mfcc_origin;
999 rt->mfc_mcastgrp = mfccp->mfcc_mcastgrp;
1000
1001 update_mfc_params(rt, mfccp);
1002
1003 /* initialize pkt counters per src-grp */
1004 rt->mfc_pkt_cnt = 0;
1005 rt->mfc_byte_cnt = 0;
1006 rt->mfc_wrong_if = 0;
1007 timerclear(&rt->mfc_last_assert);
1008 }
1009
1010 static void
1011 expire_mfc(struct mfc *rt)
1012 {
1013 struct rtdetq *rte, *nrte;
1014
1015 free_bw_list(rt->mfc_bw_meter);
1016
1017 for (rte = rt->mfc_stall; rte != NULL; rte = nrte) {
1018 nrte = rte->next;
1019 m_freem(rte->m);
1020 free(rte, M_MRTABLE);
1021 }
1022
1023 LIST_REMOVE(rt, mfc_hash);
1024 free(rt, M_MRTABLE);
1025 }
1026
1027 /*
1028 * Add an mfc entry
1029 */
1030 static int
1031 add_mfc(struct sockopt *sopt)
1032 {
1033 struct mfcctl2 mfcctl2;
1034 struct mfcctl2 *mfccp;
1035 struct mfc *rt;
1036 u_int32_t hash = 0;
1037 struct rtdetq *rte, *nrte;
1038 u_short nstl;
1039 int s;
1040 int error;
1041
1042 /*
1043 * select data size depending on API version.
1044 */
1045 mfccp = &mfcctl2;
1046 memset(&mfcctl2, 0, sizeof(mfcctl2));
1047
1048 if (mrt_api_config & MRT_API_FLAGS_ALL)
1049 error = sockopt_get(sopt, mfccp, sizeof(struct mfcctl2));
1050 else
1051 error = sockopt_get(sopt, mfccp, sizeof(struct mfcctl));
1052
1053 if (error)
1054 return error;
1055
1056 s = splsoftnet();
1057 rt = mfc_find(&mfccp->mfcc_origin, &mfccp->mfcc_mcastgrp);
1058
1059 /* If an entry already exists, just update the fields */
1060 if (rt) {
1061 if (mrtdebug & DEBUG_MFC)
1062 log(LOG_DEBUG, "add_mfc update o %x g %x p %x\n",
1063 ntohl(mfccp->mfcc_origin.s_addr),
1064 ntohl(mfccp->mfcc_mcastgrp.s_addr),
1065 mfccp->mfcc_parent);
1066
1067 update_mfc_params(rt, mfccp);
1068
1069 splx(s);
1070 return 0;
1071 }
1072
1073 /*
1074 * Find the entry for which the upcall was made and update
1075 */
1076 nstl = 0;
1077 hash = MFCHASH(mfccp->mfcc_origin, mfccp->mfcc_mcastgrp);
1078 LIST_FOREACH(rt, &mfchashtbl[hash], mfc_hash) {
1079 if (in_hosteq(rt->mfc_origin, mfccp->mfcc_origin) &&
1080 in_hosteq(rt->mfc_mcastgrp, mfccp->mfcc_mcastgrp) &&
1081 rt->mfc_stall != NULL) {
1082 if (nstl++)
1083 log(LOG_ERR, "add_mfc %s o %x g %x p %x dbx %p\n",
1084 "multiple kernel entries",
1085 ntohl(mfccp->mfcc_origin.s_addr),
1086 ntohl(mfccp->mfcc_mcastgrp.s_addr),
1087 mfccp->mfcc_parent, rt->mfc_stall);
1088
1089 if (mrtdebug & DEBUG_MFC)
1090 log(LOG_DEBUG, "add_mfc o %x g %x p %x dbg %p\n",
1091 ntohl(mfccp->mfcc_origin.s_addr),
1092 ntohl(mfccp->mfcc_mcastgrp.s_addr),
1093 mfccp->mfcc_parent, rt->mfc_stall);
1094
1095 rte = rt->mfc_stall;
1096 init_mfc_params(rt, mfccp);
1097 rt->mfc_stall = NULL;
1098
1099 rt->mfc_expire = 0; /* Don't clean this guy up */
1100 nexpire[hash]--;
1101
1102 /* free packets Qed at the end of this entry */
1103 for (; rte != NULL; rte = nrte) {
1104 nrte = rte->next;
1105 if (rte->ifp) {
1106 ip_mdq(rte->m, rte->ifp, rt);
1107 }
1108 m_freem(rte->m);
1109 #ifdef UPCALL_TIMING
1110 collate(&rte->t);
1111 #endif /* UPCALL_TIMING */
1112 free(rte, M_MRTABLE);
1113 }
1114 }
1115 }
1116
1117 /*
1118 * It is possible that an entry is being inserted without an upcall
1119 */
1120 if (nstl == 0) {
1121 /*
1122 * No mfc; make a new one
1123 */
1124 if (mrtdebug & DEBUG_MFC)
1125 log(LOG_DEBUG, "add_mfc no upcall o %x g %x p %x\n",
1126 ntohl(mfccp->mfcc_origin.s_addr),
1127 ntohl(mfccp->mfcc_mcastgrp.s_addr),
1128 mfccp->mfcc_parent);
1129
1130 LIST_FOREACH(rt, &mfchashtbl[hash], mfc_hash) {
1131 if (in_hosteq(rt->mfc_origin, mfccp->mfcc_origin) &&
1132 in_hosteq(rt->mfc_mcastgrp, mfccp->mfcc_mcastgrp)) {
1133 init_mfc_params(rt, mfccp);
1134 if (rt->mfc_expire)
1135 nexpire[hash]--;
1136 rt->mfc_expire = 0;
1137 break; /* XXX */
1138 }
1139 }
1140 if (rt == NULL) { /* no upcall, so make a new entry */
1141 rt = malloc(sizeof(*rt), M_MRTABLE, M_NOWAIT);
1142 if (rt == NULL) {
1143 splx(s);
1144 return ENOBUFS;
1145 }
1146
1147 init_mfc_params(rt, mfccp);
1148 rt->mfc_expire = 0;
1149 rt->mfc_stall = NULL;
1150 rt->mfc_bw_meter = NULL;
1151
1152 /* insert new entry at head of hash chain */
1153 LIST_INSERT_HEAD(&mfchashtbl[hash], rt, mfc_hash);
1154 }
1155 }
1156
1157 splx(s);
1158 return 0;
1159 }
1160
#ifdef UPCALL_TIMING
/*
 * Collect delay statistics on the upcalls: bucket the elapsed time
 * between the upcall timestamp *t and now into upcall_data[] using
 * ~1ms (1024 usec) buckets, saturating at bucket 50.
 */
static void
collate(struct timeval *t)
{
	struct timeval now;
	u_int32_t delta;
	u_int32_t bucket;

	microtime(&now);

	/* Only meaningful when the timestamp lies in the past. */
	if (!timercmp(t, &now, <))
		return;

	TV_DELTA(now, *t, delta);

	bucket = delta >> 10;	/* usec / 1024 ~= milliseconds */
	if (bucket > 50)
		bucket = 50;

	++upcall_data[bucket];
}
#endif /* UPCALL_TIMING */
1185
1186 /*
1187 * Delete an mfc entry
1188 */
1189 static int
1190 del_mfc(struct sockopt *sopt)
1191 {
1192 struct mfcctl2 mfcctl2;
1193 struct mfcctl2 *mfccp;
1194 struct mfc *rt;
1195 int s;
1196 int error;
1197
1198 /*
1199 * XXX: for deleting MFC entries the information in entries
1200 * of size "struct mfcctl" is sufficient.
1201 */
1202
1203 mfccp = &mfcctl2;
1204 memset(&mfcctl2, 0, sizeof(mfcctl2));
1205
1206 error = sockopt_get(sopt, mfccp, sizeof(struct mfcctl));
1207 if (error) {
1208 /* Try with the size of mfcctl2. */
1209 error = sockopt_get(sopt, mfccp, sizeof(struct mfcctl2));
1210 if (error)
1211 return error;
1212 }
1213
1214 if (mrtdebug & DEBUG_MFC)
1215 log(LOG_DEBUG, "del_mfc origin %x mcastgrp %x\n",
1216 ntohl(mfccp->mfcc_origin.s_addr),
1217 ntohl(mfccp->mfcc_mcastgrp.s_addr));
1218
1219 s = splsoftnet();
1220
1221 rt = mfc_find(&mfccp->mfcc_origin, &mfccp->mfcc_mcastgrp);
1222 if (rt == NULL) {
1223 splx(s);
1224 return EADDRNOTAVAIL;
1225 }
1226
1227 /*
1228 * free the bw_meter entries
1229 */
1230 free_bw_list(rt->mfc_bw_meter);
1231 rt->mfc_bw_meter = NULL;
1232
1233 LIST_REMOVE(rt, mfc_hash);
1234 free(rt, M_MRTABLE);
1235
1236 splx(s);
1237 return 0;
1238 }
1239
1240 static int
1241 socket_send(struct socket *s, struct mbuf *mm, struct sockaddr_in *src)
1242 {
1243 if (s) {
1244 if (sbappendaddr(&s->so_rcv, sintosa(src), mm, NULL) != 0) {
1245 sorwakeup(s);
1246 return 0;
1247 }
1248 soroverflow(s);
1249 }
1250 m_freem(mm);
1251 return -1;
1252 }
1253
1254 /*
1255 * IP multicast forwarding function. This function assumes that the packet
1256 * pointed to by "ip" has arrived on (or is about to be sent to) the interface
1257 * pointed to by "ifp", and the packet is to be relayed to other networks
1258 * that have members of the packet's destination IP multicast group.
1259 *
1260 * The packet is returned unscathed to the caller, unless it is
1261 * erroneous, in which case a non-zero return value tells the caller to
1262 * discard it.
1263 */
1264
1265 #define IP_HDR_LEN 20 /* # bytes of fixed IP header (excluding options) */
1266 #define TUNNEL_LEN 12 /* # bytes of IP option for tunnel encapsulation */
1267
1268 int
1269 ip_mforward(struct mbuf *m, struct ifnet *ifp)
1270 {
1271 struct ip *ip = mtod(m, struct ip *);
1272 struct mfc *rt;
1273 static int srctun = 0;
1274 struct mbuf *mm;
1275 struct sockaddr_in sin;
1276 int s;
1277 vifi_t vifi;
1278
1279 if (mrtdebug & DEBUG_FORWARD)
1280 log(LOG_DEBUG, "ip_mforward: src %x, dst %x, ifp %p\n",
1281 ntohl(ip->ip_src.s_addr), ntohl(ip->ip_dst.s_addr), ifp);
1282
1283 /*
1284 * XXX XXX: Why do we check [1] against IPOPT_LSRR? Because we
1285 * expect [0] to be IPOPT_NOP, maybe? In all cases that doesn't
1286 * make a lot of sense, a forged packet can just put two IPOPT_NOPs
1287 * followed by one IPOPT_LSRR, and bypass the check.
1288 */
1289 if (ip->ip_hl < (IP_HDR_LEN + TUNNEL_LEN) >> 2 ||
1290 ((u_char *)(ip + 1))[1] != IPOPT_LSRR) {
1291 /*
1292 * Packet arrived via a physical interface or
1293 * an encapsulated tunnel or a register_vif.
1294 */
1295 } else {
1296 /*
1297 * Packet arrived through a source-route tunnel.
1298 * Source-route tunnels are no longer supported.
1299 */
1300 if ((srctun++ % 1000) == 0)
1301 log(LOG_ERR,
1302 "ip_mforward: received source-routed packet from %x\n",
1303 ntohl(ip->ip_src.s_addr));
1304 return EOPNOTSUPP;
1305 }
1306
1307 /*
1308 * Clear any in-bound checksum flags for this packet.
1309 */
1310 m->m_pkthdr.csum_flags = 0;
1311
1312 /*
1313 * Don't forward a packet with time-to-live of zero or one,
1314 * or a packet destined to a local-only group.
1315 */
1316 if (ip->ip_ttl <= 1 || IN_LOCAL_GROUP(ip->ip_dst.s_addr))
1317 return 0;
1318
1319 /*
1320 * Determine forwarding vifs from the forwarding cache table
1321 */
1322 s = splsoftnet();
1323 ++mrtstat.mrts_mfc_lookups;
1324 rt = mfc_find(&ip->ip_src, &ip->ip_dst);
1325
1326 /* Entry exists, so forward if necessary */
1327 if (rt != NULL) {
1328 splx(s);
1329 return ip_mdq(m, ifp, rt);
1330 } else {
1331 /*
1332 * If we don't have a route for packet's origin, make a copy
1333 * of the packet and send message to routing daemon.
1334 */
1335
1336 struct mbuf *mb0;
1337 struct rtdetq *rte;
1338 u_int32_t hash;
1339 const int hlen = ip->ip_hl << 2;
1340 #ifdef UPCALL_TIMING
1341 struct timeval tp;
1342 microtime(&tp);
1343 #endif
1344
1345 ++mrtstat.mrts_mfc_misses;
1346
1347 mrtstat.mrts_no_route++;
1348 if (mrtdebug & (DEBUG_FORWARD | DEBUG_MFC))
1349 log(LOG_DEBUG, "ip_mforward: no rte s %x g %x\n",
1350 ntohl(ip->ip_src.s_addr),
1351 ntohl(ip->ip_dst.s_addr));
1352
1353 /*
1354 * Allocate mbufs early so that we don't do extra work if we are
1355 * just going to fail anyway. Make sure to pullup the header so
1356 * that other people can't step on it.
1357 */
1358 rte = malloc(sizeof(*rte), M_MRTABLE, M_NOWAIT);
1359 if (rte == NULL) {
1360 splx(s);
1361 return ENOBUFS;
1362 }
1363 mb0 = m_copypacket(m, M_DONTWAIT);
1364 M_PULLUP(mb0, hlen);
1365 if (mb0 == NULL) {
1366 free(rte, M_MRTABLE);
1367 splx(s);
1368 return ENOBUFS;
1369 }
1370
1371 /* is there an upcall waiting for this flow? */
1372 hash = MFCHASH(ip->ip_src, ip->ip_dst);
1373 LIST_FOREACH(rt, &mfchashtbl[hash], mfc_hash) {
1374 if (in_hosteq(ip->ip_src, rt->mfc_origin) &&
1375 in_hosteq(ip->ip_dst, rt->mfc_mcastgrp) &&
1376 rt->mfc_stall != NULL)
1377 break;
1378 }
1379
1380 if (rt == NULL) {
1381 int i;
1382 struct igmpmsg *im;
1383
1384 /*
1385 * Locate the vifi for the incoming interface for
1386 * this packet.
1387 * If none found, drop packet.
1388 */
1389 for (vifi = 0; vifi < numvifs &&
1390 viftable[vifi].v_ifp != ifp; vifi++)
1391 ;
1392 if (vifi >= numvifs) /* vif not found, drop packet */
1393 goto non_fatal;
1394
1395 /* no upcall, so make a new entry */
1396 rt = malloc(sizeof(*rt), M_MRTABLE, M_NOWAIT);
1397 if (rt == NULL)
1398 goto fail;
1399
1400 /*
1401 * Make a copy of the header to send to the user level
1402 * process
1403 */
1404 mm = m_copym(m, 0, hlen, M_DONTWAIT);
1405 M_PULLUP(mm, hlen);
1406 if (mm == NULL)
1407 goto fail1;
1408
1409 /*
1410 * Send message to routing daemon to install
1411 * a route into the kernel table
1412 */
1413
1414 im = mtod(mm, struct igmpmsg *);
1415 im->im_msgtype = IGMPMSG_NOCACHE;
1416 im->im_mbz = 0;
1417 im->im_vif = vifi;
1418
1419 mrtstat.mrts_upcalls++;
1420
1421 sockaddr_in_init(&sin, &ip->ip_src, 0);
1422 if (socket_send(ip_mrouter, mm, &sin) < 0) {
1423 log(LOG_WARNING,
1424 "ip_mforward: ip_mrouter socket queue full\n");
1425 ++mrtstat.mrts_upq_sockfull;
1426 fail1:
1427 free(rt, M_MRTABLE);
1428 fail:
1429 free(rte, M_MRTABLE);
1430 m_freem(mb0);
1431 splx(s);
1432 return ENOBUFS;
1433 }
1434
1435 /* insert new entry at head of hash chain */
1436 rt->mfc_origin = ip->ip_src;
1437 rt->mfc_mcastgrp = ip->ip_dst;
1438 rt->mfc_pkt_cnt = 0;
1439 rt->mfc_byte_cnt = 0;
1440 rt->mfc_wrong_if = 0;
1441 rt->mfc_expire = UPCALL_EXPIRE;
1442 nexpire[hash]++;
1443 for (i = 0; i < numvifs; i++) {
1444 rt->mfc_ttls[i] = 0;
1445 rt->mfc_flags[i] = 0;
1446 }
1447 rt->mfc_parent = -1;
1448
1449 /* clear the RP address */
1450 rt->mfc_rp = zeroin_addr;
1451
1452 rt->mfc_bw_meter = NULL;
1453
1454 /* link into table */
1455 LIST_INSERT_HEAD(&mfchashtbl[hash], rt, mfc_hash);
1456 /* Add this entry to the end of the queue */
1457 rt->mfc_stall = rte;
1458 } else {
1459 /* determine if q has overflowed */
1460 struct rtdetq **p;
1461 int npkts = 0;
1462
1463 /*
1464 * XXX ouch! we need to append to the list, but we
1465 * only have a pointer to the front, so we have to
1466 * scan the entire list every time.
1467 */
1468 for (p = &rt->mfc_stall; *p != NULL; p = &(*p)->next)
1469 if (++npkts > MAX_UPQ) {
1470 mrtstat.mrts_upq_ovflw++;
1471 non_fatal:
1472 free(rte, M_MRTABLE);
1473 m_freem(mb0);
1474 splx(s);
1475 return 0;
1476 }
1477
1478 /* Add this entry to the end of the queue */
1479 *p = rte;
1480 }
1481
1482 rte->next = NULL;
1483 rte->m = mb0;
1484 rte->ifp = ifp;
1485 #ifdef UPCALL_TIMING
1486 rte->t = tp;
1487 #endif
1488
1489 splx(s);
1490
1491 return 0;
1492 }
1493 }
1494
/*
 * Timer-driven sweep of the MFC hash table: ages out cache entries that
 * were created for pending upcalls (mfc_expire != 0) but never resolved
 * by the routing daemon.  Re-arms itself every EXPIRE_TIMEOUT ticks.
 */
/*ARGSUSED*/
static void
expire_upcalls(void *v)
{
	int i;

	/* XXX NOMPSAFE still need softnet_lock */
	mutex_enter(softnet_lock);
	KERNEL_LOCK(1, NULL);

	for (i = 0; i < MFCTBLSIZ; i++) {
		struct mfc *rt, *nrt;

		/* nexpire[] counts pending entries per bucket; skip idle ones. */
		if (nexpire[i] == 0)
			continue;

		for (rt = LIST_FIRST(&mfchashtbl[i]); rt; rt = nrt) {
			/* Grab the next pointer before expire_mfc() frees rt. */
			nrt = LIST_NEXT(rt, mfc_hash);

			/* Resolved entries (mfc_expire == 0) never expire. */
			if (rt->mfc_expire == 0 || --rt->mfc_expire > 0)
				continue;
			nexpire[i]--;

			/*
			 * free the bw_meter entries
			 */
			while (rt->mfc_bw_meter != NULL) {
				struct bw_meter *x = rt->mfc_bw_meter;

				rt->mfc_bw_meter = x->bm_mfc_next;
				kmem_intr_free(x, sizeof(*x));
			}

			++mrtstat.mrts_cache_cleanups;
			if (mrtdebug & DEBUG_EXPIRE)
				log(LOG_DEBUG,
				    "expire_upcalls: expiring (%x %x)\n",
				    ntohl(rt->mfc_origin.s_addr),
				    ntohl(rt->mfc_mcastgrp.s_addr));

			expire_mfc(rt);
		}
	}

	callout_reset(&expire_upcalls_ch, EXPIRE_TIMEOUT,
	    expire_upcalls, NULL);

	KERNEL_UNLOCK_ONE(NULL);
	mutex_exit(softnet_lock);
}
1545
1546 /*
1547 * Macro to send packet on vif.
1548 */
1549 #define MC_SEND(ip, vifp, m) do { \
1550 if ((vifp)->v_flags & VIFF_TUNNEL) \
1551 encap_send((ip), (vifp), (m)); \
1552 else \
1553 phyint_send((ip), (vifp), (m)); \
1554 } while (/*CONSTCOND*/ 0)
1555
1556 /*
1557 * Packet forwarding routine once entry in the cache is made
1558 */
1559 static int
1560 ip_mdq(struct mbuf *m, struct ifnet *ifp, struct mfc *rt)
1561 {
1562 struct ip *ip = mtod(m, struct ip *);
1563 vifi_t vifi;
1564 struct vif *vifp;
1565 struct sockaddr_in sin;
1566 const int plen = ntohs(ip->ip_len) - (ip->ip_hl << 2);
1567
1568 /*
1569 * Don't forward if it didn't arrive from the parent vif for its origin.
1570 */
1571 vifi = rt->mfc_parent;
1572 if ((vifi >= numvifs) || (viftable[vifi].v_ifp != ifp)) {
1573 /* came in the wrong interface */
1574 if (mrtdebug & DEBUG_FORWARD)
1575 log(LOG_DEBUG, "wrong if: ifp %p vifi %d vififp %p\n",
1576 ifp, vifi,
1577 vifi >= numvifs ? 0 : viftable[vifi].v_ifp);
1578 ++mrtstat.mrts_wrong_if;
1579 ++rt->mfc_wrong_if;
1580
1581 /*
1582 * If we are doing PIM assert processing, send a message
1583 * to the routing daemon.
1584 *
1585 * XXX: A PIM-SM router needs the WRONGVIF detection so it
1586 * can complete the SPT switch, regardless of the type
1587 * of the iif (broadcast media, GRE tunnel, etc).
1588 */
1589 if (pim_assert && (vifi < numvifs) && viftable[vifi].v_ifp) {
1590 struct timeval now;
1591 u_int32_t delta;
1592
1593 #ifdef PIM
1594 if (ifp == &multicast_register_if)
1595 pimstat.pims_rcv_registers_wrongiif++;
1596 #endif
1597
1598 /* Get vifi for the incoming packet */
1599 for (vifi = 0;
1600 vifi < numvifs && viftable[vifi].v_ifp != ifp;
1601 vifi++)
1602 ;
1603 if (vifi >= numvifs) {
1604 /* The iif is not found: ignore the packet. */
1605 return 0;
1606 }
1607
1608 if (rt->mfc_flags[vifi] &
1609 MRT_MFC_FLAGS_DISABLE_WRONGVIF) {
1610 /* WRONGVIF disabled: ignore the packet */
1611 return 0;
1612 }
1613
1614 microtime(&now);
1615
1616 TV_DELTA(rt->mfc_last_assert, now, delta);
1617
1618 if (delta > ASSERT_MSG_TIME) {
1619 struct igmpmsg *im;
1620 const int hlen = ip->ip_hl << 2;
1621 struct mbuf *mm =
1622 m_copym(m, 0, hlen, M_DONTWAIT);
1623
1624 M_PULLUP(mm, hlen);
1625 if (mm == NULL)
1626 return ENOBUFS;
1627
1628 rt->mfc_last_assert = now;
1629
1630 im = mtod(mm, struct igmpmsg *);
1631 im->im_msgtype = IGMPMSG_WRONGVIF;
1632 im->im_mbz = 0;
1633 im->im_vif = vifi;
1634
1635 mrtstat.mrts_upcalls++;
1636
1637 sockaddr_in_init(&sin, &im->im_src, 0);
1638 if (socket_send(ip_mrouter, mm, &sin) < 0) {
1639 log(LOG_WARNING,
1640 "ip_mforward: ip_mrouter socket queue full\n");
1641 ++mrtstat.mrts_upq_sockfull;
1642 return ENOBUFS;
1643 }
1644 }
1645 }
1646 return 0;
1647 }
1648
1649 /* If I sourced this packet, it counts as output, else it was input. */
1650 if (in_hosteq(ip->ip_src, viftable[vifi].v_lcl_addr)) {
1651 viftable[vifi].v_pkt_out++;
1652 viftable[vifi].v_bytes_out += plen;
1653 } else {
1654 viftable[vifi].v_pkt_in++;
1655 viftable[vifi].v_bytes_in += plen;
1656 }
1657 rt->mfc_pkt_cnt++;
1658 rt->mfc_byte_cnt += plen;
1659
1660 /*
1661 * For each vif, decide if a copy of the packet should be forwarded.
1662 * Forward if:
1663 * - the ttl exceeds the vif's threshold
1664 * - there are group members downstream on interface
1665 */
1666 for (vifp = viftable, vifi = 0; vifi < numvifs; vifp++, vifi++) {
1667 if ((rt->mfc_ttls[vifi] > 0) &&
1668 (ip->ip_ttl > rt->mfc_ttls[vifi])) {
1669 vifp->v_pkt_out++;
1670 vifp->v_bytes_out += plen;
1671 #ifdef PIM
1672 if (vifp->v_flags & VIFF_REGISTER)
1673 pim_register_send(ip, vifp, m, rt);
1674 else
1675 #endif
1676 MC_SEND(ip, vifp, m);
1677 }
1678 }
1679
1680 /*
1681 * Perform upcall-related bw measuring.
1682 */
1683 if (rt->mfc_bw_meter != NULL) {
1684 struct bw_meter *x;
1685 struct timeval now;
1686
1687 microtime(&now);
1688 for (x = rt->mfc_bw_meter; x != NULL; x = x->bm_mfc_next)
1689 bw_meter_receive_packet(x, plen, &now);
1690 }
1691
1692 return 0;
1693 }
1694
1695 static void
1696 phyint_send(struct ip *ip, struct vif *vifp, struct mbuf *m)
1697 {
1698 struct mbuf *mb_copy;
1699 const int hlen = ip->ip_hl << 2;
1700
1701 /*
1702 * Make a new reference to the packet; make sure that
1703 * the IP header is actually copied, not just referenced,
1704 * so that ip_output() only scribbles on the copy.
1705 */
1706 mb_copy = m_copypacket(m, M_DONTWAIT);
1707 M_PULLUP(mb_copy, hlen);
1708 if (mb_copy == NULL)
1709 return;
1710
1711 if (vifp->v_rate_limit <= 0)
1712 tbf_send_packet(vifp, mb_copy);
1713 else
1714 tbf_control(vifp, mb_copy, mtod(mb_copy, struct ip *),
1715 ntohs(ip->ip_len));
1716 }
1717
/*
 * Encapsulate the packet (IP-in-IP) for a tunnel vif and hand it to the
 * token-bucket filter.  The caller's mbuf "m" is left untouched; all
 * modifications happen on a freshly built chain.
 */
static void
encap_send(struct ip *ip, struct vif *vifp, struct mbuf *m)
{
	struct mbuf *mb_copy;
	struct ip *ip_copy;
	/* Total outgoing length: inner packet plus outer IP header. */
	int i, len = ntohs(ip->ip_len) + sizeof(multicast_encap_iphdr);

	/* Take care of delayed checksums before the payload is wrapped. */
	if (m->m_pkthdr.csum_flags & (M_CSUM_TCPv4|M_CSUM_UDPv4)) {
		in_undefer_cksum_tcpudp(m);
		m->m_pkthdr.csum_flags &= ~(M_CSUM_TCPv4|M_CSUM_UDPv4);
	}

	/*
	 * copy the old packet & pullup its IP header into the
	 * new mbuf so we can modify it. Try to fill the new
	 * mbuf since if we don't the ethernet driver will.
	 */
	MGETHDR(mb_copy, M_DONTWAIT, MT_DATA);
	if (mb_copy == NULL)
		return;
	mb_copy->m_data += max_linkhdr;	/* leave room for the link header */
	mb_copy->m_pkthdr.len = len;
	mb_copy->m_len = sizeof(multicast_encap_iphdr);

	if ((mb_copy->m_next = m_copypacket(m, M_DONTWAIT)) == NULL) {
		m_freem(mb_copy);
		return;
	}
	/* Pull up min(MHLEN - max_linkhdr, len) bytes into the head mbuf. */
	i = MHLEN - max_linkhdr;
	if (i > len)
		i = len;
	/* m_pullup consumes the chain and returns NULL on failure. */
	mb_copy = m_pullup(mb_copy, i);
	if (mb_copy == NULL)
		return;

	/*
	 * fill in the encapsulating IP header.
	 */
	ip_copy = mtod(mb_copy, struct ip *);
	*ip_copy = multicast_encap_iphdr;
	if (len < IP_MINFRAGSIZE)
		ip_copy->ip_id = 0;	/* too small to fragment: no id needed */
	else
		ip_copy->ip_id = ip_newid(NULL);
	ip_copy->ip_len = htons(len);
	ip_copy->ip_src = vifp->v_lcl_addr;
	ip_copy->ip_dst = vifp->v_rmt_addr;

	/*
	 * turn the encapsulated IP header back into a valid one.
	 */
	ip = (struct ip *)((char *)ip_copy + sizeof(multicast_encap_iphdr));
	--ip->ip_ttl;
	ip->ip_sum = 0;
	/*
	 * Temporarily advance m_data past the outer header so in_cksum()
	 * sums only the inner IP header, then restore it.
	 */
	mb_copy->m_data += sizeof(multicast_encap_iphdr);
	ip->ip_sum = in_cksum(mb_copy, ip->ip_hl << 2);
	mb_copy->m_data -= sizeof(multicast_encap_iphdr);

	if (vifp->v_rate_limit <= 0)
		tbf_send_packet(vifp, mb_copy);
	else
		tbf_control(vifp, mb_copy, ip, ntohs(ip_copy->ip_len));
}
1782
1783 /*
1784 * De-encapsulate a packet and feed it back through ip input.
1785 */
1786 static void
1787 vif_input(struct mbuf *m, int off, int proto, void *eparg)
1788 {
1789 struct vif *vifp = eparg;
1790
1791 KASSERT(vifp != NULL);
1792
1793 if (proto != ENCAP_PROTO) {
1794 m_freem(m);
1795 mrtstat.mrts_bad_tunnel++;
1796 return;
1797 }
1798
1799 m_adj(m, off);
1800 m_set_rcvif(m, vifp->v_ifp);
1801
1802 if (__predict_false(!pktq_enqueue(ip_pktq, m, 0))) {
1803 m_freem(m);
1804 }
1805 }
1806
1807 /*
1808 * Check if the packet should be received on the vif denoted by arg.
1809 * (The encap selection code will call this once per vif since each is
1810 * registered separately.)
1811 */
1812 static int
1813 vif_encapcheck(struct mbuf *m, int off, int proto, void *arg)
1814 {
1815 struct vif *vifp;
1816 struct ip ip;
1817
1818 #ifdef DIAGNOSTIC
1819 if (!arg || proto != IPPROTO_IPV4)
1820 panic("unexpected arg in vif_encapcheck");
1821 #endif
1822
1823 /*
1824 * Accept the packet only if the inner heaader is multicast
1825 * and the outer header matches a tunnel-mode vif. Order
1826 * checks in the hope that common non-matching packets will be
1827 * rejected quickly. Assume that unicast IPv4 traffic in a
1828 * parallel tunnel (e.g. gif(4)) is unlikely.
1829 */
1830
1831 /* Obtain the outer IP header and the vif pointer. */
1832 m_copydata(m, 0, sizeof(ip), (void *)&ip);
1833 vifp = (struct vif *)arg;
1834
1835 /*
1836 * The outer source must match the vif's remote peer address.
1837 * For a multicast router with several tunnels, this is the
1838 * only check that will fail on packets in other tunnels,
1839 * assuming the local address is the same.
1840 */
1841 if (!in_hosteq(vifp->v_rmt_addr, ip.ip_src))
1842 return 0;
1843
1844 /* The outer destination must match the vif's local address. */
1845 if (!in_hosteq(vifp->v_lcl_addr, ip.ip_dst))
1846 return 0;
1847
1848 /* The vif must be of tunnel type. */
1849 if ((vifp->v_flags & VIFF_TUNNEL) == 0)
1850 return 0;
1851
1852 /* Check that the inner destination is multicast. */
1853 if (off + sizeof(ip) > m->m_pkthdr.len)
1854 return 0;
1855 m_copydata(m, off, sizeof(ip), (void *)&ip);
1856 if (!IN_MULTICAST(ip.ip_dst.s_addr))
1857 return 0;
1858
1859 /*
1860 * We have checked that both the outer src and dst addresses
1861 * match the vif, and that the inner destination is multicast
1862 * (224/5). By claiming more than 64, we intend to
1863 * preferentially take packets that also match a parallel
1864 * gif(4).
1865 */
1866 return 32 + 32 + 5;
1867 }
1868
1869 /*
1870 * Token bucket filter module
1871 */
1872 static void
1873 tbf_control(struct vif *vifp, struct mbuf *m, struct ip *ip, u_int32_t len)
1874 {
1875
1876 if (len > MAX_BKT_SIZE) {
1877 /* drop if packet is too large */
1878 mrtstat.mrts_pkt2large++;
1879 m_freem(m);
1880 return;
1881 }
1882
1883 tbf_update_tokens(vifp);
1884
1885 /*
1886 * If there are enough tokens, and the queue is empty, send this packet
1887 * out immediately. Otherwise, try to insert it on this vif's queue.
1888 */
1889 if (vifp->tbf_q_len == 0) {
1890 if (len <= vifp->tbf_n_tok) {
1891 vifp->tbf_n_tok -= len;
1892 tbf_send_packet(vifp, m);
1893 } else {
1894 /* queue packet and timeout till later */
1895 tbf_queue(vifp, m);
1896 callout_reset(&vifp->v_repq_ch, TBF_REPROCESS,
1897 tbf_reprocess_q, vifp);
1898 }
1899 } else {
1900 if (vifp->tbf_q_len >= vifp->tbf_max_q_len &&
1901 !tbf_dq_sel(vifp, ip)) {
1902 /* queue full, and couldn't make room */
1903 mrtstat.mrts_q_overflow++;
1904 m_freem(m);
1905 } else {
1906 /* queue length low enough, or made room */
1907 tbf_queue(vifp, m);
1908 tbf_process_q(vifp);
1909 }
1910 }
1911 }
1912
1913 /*
1914 * adds a packet to the queue at the interface
1915 */
1916 static void
1917 tbf_queue(struct vif *vifp, struct mbuf *m)
1918 {
1919 int s = splsoftnet();
1920
1921 /* insert at tail */
1922 *vifp->tbf_t = m;
1923 vifp->tbf_t = &m->m_nextpkt;
1924 vifp->tbf_q_len++;
1925
1926 splx(s);
1927 }
1928
1929 /*
1930 * processes the queue at the interface
1931 */
1932 static void
1933 tbf_process_q(struct vif *vifp)
1934 {
1935 struct mbuf *m;
1936 int len;
1937 int s = splsoftnet();
1938
1939 /*
1940 * Loop through the queue at the interface and send as many packets
1941 * as possible.
1942 */
1943 for (m = vifp->tbf_q; m != NULL; m = vifp->tbf_q) {
1944 len = ntohs(mtod(m, struct ip *)->ip_len);
1945
1946 /* determine if the packet can be sent */
1947 if (len <= vifp->tbf_n_tok) {
1948 /* if so,
1949 * reduce no of tokens, dequeue the packet,
1950 * send the packet.
1951 */
1952 if ((vifp->tbf_q = m->m_nextpkt) == NULL)
1953 vifp->tbf_t = &vifp->tbf_q;
1954 --vifp->tbf_q_len;
1955
1956 m->m_nextpkt = NULL;
1957 vifp->tbf_n_tok -= len;
1958 tbf_send_packet(vifp, m);
1959 } else
1960 break;
1961 }
1962 splx(s);
1963 }
1964
1965 static void
1966 tbf_reprocess_q(void *arg)
1967 {
1968 struct vif *vifp = arg;
1969
1970 if (ip_mrouter == NULL)
1971 return;
1972
1973 tbf_update_tokens(vifp);
1974 tbf_process_q(vifp);
1975
1976 if (vifp->tbf_q_len != 0)
1977 callout_reset(&vifp->v_repq_ch, TBF_REPROCESS,
1978 tbf_reprocess_q, vifp);
1979 }
1980
/* function that will selectively discard a member of the queue
 * based on the precedence value and the priority
 *
 * Returns 1 if a lower-priority packet was dropped to make room for a
 * packet of priority "priority(vifp, ip)", 0 if nothing could be evicted.
 */
static int
tbf_dq_sel(struct vif *vifp, struct ip *ip)
{
	u_int p;
	struct mbuf **mp, *m;
	int s = splsoftnet();

	/* Priority of the packet that wants a queue slot. */
	p = priority(vifp, ip);

	/*
	 * Walk the queue; "mp" tracks the previous link so a victim can
	 * be unlinked in place.  Drop the first packet whose priority is
	 * strictly lower than p.
	 */
	for (mp = &vifp->tbf_q, m = *mp;
	    m != NULL;
	    mp = &m->m_nextpkt, m = *mp) {
		if (p > priority(vifp, mtod(m, struct ip *))) {
			/* Unlink; if the victim was the tail, pull tbf_t back. */
			if ((*mp = m->m_nextpkt) == NULL)
				vifp->tbf_t = mp;
			--vifp->tbf_q_len;

			m_freem(m);
			mrtstat.mrts_drop_sel++;
			splx(s);
			return 1;
		}
	}
	splx(s);
	return 0;
}
2010
/*
 * Transmit one rate-limiter-approved packet on the vif.
 *
 * Tunnel vifs (the packet is already encapsulated) go out via the vif's
 * cached route; physical vifs go out with multicast options selecting
 * the vif's interface.
 */
static void
tbf_send_packet(struct vif *vifp, struct mbuf *m)
{
	int error;
	int s = splsoftnet();

	if (vifp->v_flags & VIFF_TUNNEL) {
		/* If tunnel options */
		ip_output(m, NULL, &vifp->v_route, IP_FORWARDING, NULL, NULL);
	} else {
		/* if physical interface option, extract the options and then send */
		struct ip_moptions imo;

		imo.imo_multicast_if_index = if_get_index(vifp->v_ifp);
		/* TTL the packet will carry after the forwarding decrement. */
		imo.imo_multicast_ttl = mtod(m, struct ip *)->ip_ttl - 1;
		imo.imo_multicast_loop = 1;

		error = ip_output(m, NULL, NULL, IP_FORWARDING|IP_MULTICASTOPTS,
		    &imo, NULL);

		if (mrtdebug & DEBUG_XMIT)
			log(LOG_DEBUG, "phyint_send on vif %ld err %d\n",
			    (long)(vifp - viftable), error);
	}
	splx(s);
}
2037
2038 /* determine the current time and then
2039 * the elapsed time (between the last time and time now)
2040 * in milliseconds & update the no. of tokens in the bucket
2041 */
2042 static void
2043 tbf_update_tokens(struct vif *vifp)
2044 {
2045 struct timeval tp;
2046 u_int32_t tm;
2047 int s = splsoftnet();
2048
2049 microtime(&tp);
2050
2051 TV_DELTA(tp, vifp->tbf_last_pkt_t, tm);
2052
2053 /*
2054 * This formula is actually
2055 * "time in seconds" * "bytes/second".
2056 *
2057 * (tm / 1000000) * (v_rate_limit * 1000 * (1000/1024) / 8)
2058 *
2059 * The (1000/1024) was introduced in add_vif to optimize
2060 * this divide into a shift.
2061 */
2062 vifp->tbf_n_tok += tm * vifp->v_rate_limit / 8192;
2063 vifp->tbf_last_pkt_t = tp;
2064
2065 if (vifp->tbf_n_tok > MAX_BKT_SIZE)
2066 vifp->tbf_n_tok = MAX_BKT_SIZE;
2067
2068 splx(s);
2069 }
2070
2071 static int
2072 priority(struct vif *vifp, struct ip *ip)
2073 {
2074 int prio = 50; /* the lowest priority -- default case */
2075
2076 /* temporary hack; may add general packet classifier some day */
2077
2078 /*
2079 * XXX XXX: We're reading the UDP header, but we didn't ensure
2080 * it was present in the packet.
2081 */
2082
2083 /*
2084 * The UDP port space is divided up into four priority ranges:
2085 * [0, 16384) : unclassified - lowest priority
2086 * [16384, 32768) : audio - highest priority
2087 * [32768, 49152) : whiteboard - medium priority
2088 * [49152, 65536) : video - low priority
2089 */
2090 if (ip->ip_p == IPPROTO_UDP) {
2091 struct udphdr *udp = (struct udphdr *)(((char *)ip) + (ip->ip_hl << 2));
2092
2093 switch (ntohs(udp->uh_dport) & 0xc000) {
2094 case 0x4000:
2095 prio = 70;
2096 break;
2097 case 0x8000:
2098 prio = 60;
2099 break;
2100 case 0xc000:
2101 prio = 55;
2102 break;
2103 }
2104
2105 if (tbfdebug > 1)
2106 log(LOG_DEBUG, "port %x prio %d\n",
2107 ntohs(udp->uh_dport), prio);
2108 }
2109
2110 return prio;
2111 }
2112
2113 /*
2114 * Code for bandwidth monitors
2115 */
2116
2117 /*
2118 * Define common interface for timeval-related methods
2119 */
2120 #define BW_TIMEVALCMP(tvp, uvp, cmp) timercmp((tvp), (uvp), cmp)
2121 #define BW_TIMEVALDECR(vvp, uvp) timersub((vvp), (uvp), (vvp))
2122 #define BW_TIMEVALADD(vvp, uvp) timeradd((vvp), (uvp), (vvp))
2123
2124 static uint32_t
2125 compute_bw_meter_flags(struct bw_upcall *req)
2126 {
2127 uint32_t flags = 0;
2128
2129 if (req->bu_flags & BW_UPCALL_UNIT_PACKETS)
2130 flags |= BW_METER_UNIT_PACKETS;
2131 if (req->bu_flags & BW_UPCALL_UNIT_BYTES)
2132 flags |= BW_METER_UNIT_BYTES;
2133 if (req->bu_flags & BW_UPCALL_GEQ)
2134 flags |= BW_METER_GEQ;
2135 if (req->bu_flags & BW_UPCALL_LEQ)
2136 flags |= BW_METER_LEQ;
2137
2138 return flags;
2139 }
2140
2141 /*
2142 * Add a bw_meter entry
2143 */
2144 static int
2145 add_bw_upcall(struct bw_upcall *req)
2146 {
2147 int s;
2148 struct mfc *mfc;
2149 struct timeval delta = { BW_UPCALL_THRESHOLD_INTERVAL_MIN_SEC,
2150 BW_UPCALL_THRESHOLD_INTERVAL_MIN_USEC };
2151 struct timeval now;
2152 struct bw_meter *x;
2153 uint32_t flags;
2154
2155 if (!(mrt_api_config & MRT_MFC_BW_UPCALL))
2156 return EOPNOTSUPP;
2157
2158 /* Test if the flags are valid */
2159 if (!(req->bu_flags & (BW_UPCALL_UNIT_PACKETS | BW_UPCALL_UNIT_BYTES)))
2160 return EINVAL;
2161 if (!(req->bu_flags & (BW_UPCALL_GEQ | BW_UPCALL_LEQ)))
2162 return EINVAL;
2163 if ((req->bu_flags & (BW_UPCALL_GEQ | BW_UPCALL_LEQ))
2164 == (BW_UPCALL_GEQ | BW_UPCALL_LEQ))
2165 return EINVAL;
2166
2167 /* Test if the threshold time interval is valid */
2168 if (BW_TIMEVALCMP(&req->bu_threshold.b_time, &delta, <))
2169 return EINVAL;
2170
2171 flags = compute_bw_meter_flags(req);
2172
2173 /*
2174 * Find if we have already same bw_meter entry
2175 */
2176 s = splsoftnet();
2177 mfc = mfc_find(&req->bu_src, &req->bu_dst);
2178 if (mfc == NULL) {
2179 splx(s);
2180 return EADDRNOTAVAIL;
2181 }
2182 for (x = mfc->mfc_bw_meter; x != NULL; x = x->bm_mfc_next) {
2183 if ((BW_TIMEVALCMP(&x->bm_threshold.b_time,
2184 &req->bu_threshold.b_time, ==)) &&
2185 (x->bm_threshold.b_packets == req->bu_threshold.b_packets) &&
2186 (x->bm_threshold.b_bytes == req->bu_threshold.b_bytes) &&
2187 (x->bm_flags & BW_METER_USER_FLAGS) == flags) {
2188 splx(s);
2189 return 0; /* XXX Already installed */
2190 }
2191 }
2192
2193 /* Allocate the new bw_meter entry */
2194 x = kmem_intr_alloc(sizeof(*x), KM_NOSLEEP);
2195 if (x == NULL) {
2196 splx(s);
2197 return ENOBUFS;
2198 }
2199
2200 /* Set the new bw_meter entry */
2201 x->bm_threshold.b_time = req->bu_threshold.b_time;
2202 microtime(&now);
2203 x->bm_start_time = now;
2204 x->bm_threshold.b_packets = req->bu_threshold.b_packets;
2205 x->bm_threshold.b_bytes = req->bu_threshold.b_bytes;
2206 x->bm_measured.b_packets = 0;
2207 x->bm_measured.b_bytes = 0;
2208 x->bm_flags = flags;
2209 x->bm_time_next = NULL;
2210 x->bm_time_hash = BW_METER_BUCKETS;
2211
2212 /* Add the new bw_meter entry to the front of entries for this MFC */
2213 x->bm_mfc = mfc;
2214 x->bm_mfc_next = mfc->mfc_bw_meter;
2215 mfc->mfc_bw_meter = x;
2216 schedule_bw_meter(x, &now);
2217 splx(s);
2218
2219 return 0;
2220 }
2221
2222 static void
2223 free_bw_list(struct bw_meter *list)
2224 {
2225 while (list != NULL) {
2226 struct bw_meter *x = list;
2227
2228 list = list->bm_mfc_next;
2229 unschedule_bw_meter(x);
2230 kmem_intr_free(x, sizeof(*x));
2231 }
2232 }
2233
2234 /*
2235 * Delete one or multiple bw_meter entries
2236 */
2237 static int
2238 del_bw_upcall(struct bw_upcall *req)
2239 {
2240 int s;
2241 struct mfc *mfc;
2242 struct bw_meter *x;
2243
2244 if (!(mrt_api_config & MRT_MFC_BW_UPCALL))
2245 return EOPNOTSUPP;
2246
2247 s = splsoftnet();
2248 /* Find the corresponding MFC entry */
2249 mfc = mfc_find(&req->bu_src, &req->bu_dst);
2250 if (mfc == NULL) {
2251 splx(s);
2252 return EADDRNOTAVAIL;
2253 } else if (req->bu_flags & BW_UPCALL_DELETE_ALL) {
2254 /*
2255 * Delete all bw_meter entries for this mfc
2256 */
2257 struct bw_meter *list;
2258
2259 list = mfc->mfc_bw_meter;
2260 mfc->mfc_bw_meter = NULL;
2261 free_bw_list(list);
2262 splx(s);
2263 return 0;
2264 } else { /* Delete a single bw_meter entry */
2265 struct bw_meter *prev;
2266 uint32_t flags = 0;
2267
2268 flags = compute_bw_meter_flags(req);
2269
2270 /* Find the bw_meter entry to delete */
2271 for (prev = NULL, x = mfc->mfc_bw_meter; x != NULL;
2272 prev = x, x = x->bm_mfc_next) {
2273 if ((BW_TIMEVALCMP(&x->bm_threshold.b_time,
2274 &req->bu_threshold.b_time, ==)) &&
2275 (x->bm_threshold.b_packets == req->bu_threshold.b_packets) &&
2276 (x->bm_threshold.b_bytes == req->bu_threshold.b_bytes) &&
2277 (x->bm_flags & BW_METER_USER_FLAGS) == flags)
2278 break;
2279 }
2280 if (x != NULL) { /* Delete entry from the list for this MFC */
2281 if (prev != NULL)
2282 prev->bm_mfc_next = x->bm_mfc_next; /* remove from middle*/
2283 else
2284 x->bm_mfc->mfc_bw_meter = x->bm_mfc_next;/* new head of list */
2285
2286 unschedule_bw_meter(x);
2287 splx(s);
2288 /* Free the bw_meter entry */
2289 kmem_intr_free(x, sizeof(*x));
2290 return 0;
2291 } else {
2292 splx(s);
2293 return EINVAL;
2294 }
2295 }
2296 /* NOTREACHED */
2297 }
2298
2299 /*
2300 * Perform bandwidth measurement processing that may result in an upcall
2301 */
2302 static void
2303 bw_meter_receive_packet(struct bw_meter *x, int plen, struct timeval *nowp)
2304 {
2305 struct timeval delta;
2306
2307 delta = *nowp;
2308 BW_TIMEVALDECR(&delta, &x->bm_start_time);
2309
2310 if (x->bm_flags & BW_METER_GEQ) {
2311 /*
2312 * Processing for ">=" type of bw_meter entry
2313 */
2314 if (BW_TIMEVALCMP(&delta, &x->bm_threshold.b_time, >)) {
2315 /* Reset the bw_meter entry */
2316 x->bm_start_time = *nowp;
2317 x->bm_measured.b_packets = 0;
2318 x->bm_measured.b_bytes = 0;
2319 x->bm_flags &= ~BW_METER_UPCALL_DELIVERED;
2320 }
2321
2322 /* Record that a packet is received */
2323 x->bm_measured.b_packets++;
2324 x->bm_measured.b_bytes += plen;
2325
2326 /*
2327 * Test if we should deliver an upcall
2328 */
2329 if (!(x->bm_flags & BW_METER_UPCALL_DELIVERED)) {
2330 if (((x->bm_flags & BW_METER_UNIT_PACKETS) &&
2331 (x->bm_measured.b_packets >= x->bm_threshold.b_packets)) ||
2332 ((x->bm_flags & BW_METER_UNIT_BYTES) &&
2333 (x->bm_measured.b_bytes >= x->bm_threshold.b_bytes))) {
2334 /* Prepare an upcall for delivery */
2335 bw_meter_prepare_upcall(x, nowp);
2336 x->bm_flags |= BW_METER_UPCALL_DELIVERED;
2337 }
2338 }
2339 } else if (x->bm_flags & BW_METER_LEQ) {
2340 /*
2341 * Processing for "<=" type of bw_meter entry
2342 */
2343 if (BW_TIMEVALCMP(&delta, &x->bm_threshold.b_time, >)) {
2344 /*
2345 * We are behind time with the multicast forwarding table
2346 * scanning for "<=" type of bw_meter entries, so test now
2347 * if we should deliver an upcall.
2348 */
2349 if (((x->bm_flags & BW_METER_UNIT_PACKETS) &&
2350 (x->bm_measured.b_packets <= x->bm_threshold.b_packets)) ||
2351 ((x->bm_flags & BW_METER_UNIT_BYTES) &&
2352 (x->bm_measured.b_bytes <= x->bm_threshold.b_bytes))) {
2353 /* Prepare an upcall for delivery */
2354 bw_meter_prepare_upcall(x, nowp);
2355 }
2356 /* Reschedule the bw_meter entry */
2357 unschedule_bw_meter(x);
2358 schedule_bw_meter(x, nowp);
2359 }
2360
2361 /* Record that a packet is received */
2362 x->bm_measured.b_packets++;
2363 x->bm_measured.b_bytes += plen;
2364
2365 /*
2366 * Test if we should restart the measuring interval
2367 */
2368 if ((x->bm_flags & BW_METER_UNIT_PACKETS &&
2369 x->bm_measured.b_packets <= x->bm_threshold.b_packets) ||
2370 (x->bm_flags & BW_METER_UNIT_BYTES &&
2371 x->bm_measured.b_bytes <= x->bm_threshold.b_bytes)) {
2372 /* Don't restart the measuring interval */
2373 } else {
2374 /* Do restart the measuring interval */
2375 /*
2376 * XXX: note that we don't unschedule and schedule, because this
2377 * might be too much overhead per packet. Instead, when we process
2378 * all entries for a given timer hash bin, we check whether it is
2379 * really a timeout. If not, we reschedule at that time.
2380 */
2381 x->bm_start_time = *nowp;
2382 x->bm_measured.b_packets = 0;
2383 x->bm_measured.b_bytes = 0;
2384 x->bm_flags &= ~BW_METER_UPCALL_DELIVERED;
2385 }
2386 }
2387 }
2388
2389 /*
2390 * Prepare a bandwidth-related upcall
2391 */
2392 static void
2393 bw_meter_prepare_upcall(struct bw_meter *x, struct timeval *nowp)
2394 {
2395 struct timeval delta;
2396 struct bw_upcall *u;
2397
2398 /*
2399 * Compute the measured time interval
2400 */
2401 delta = *nowp;
2402 BW_TIMEVALDECR(&delta, &x->bm_start_time);
2403
2404 /*
2405 * If there are too many pending upcalls, deliver them now
2406 */
2407 if (bw_upcalls_n >= BW_UPCALLS_MAX)
2408 bw_upcalls_send();
2409
2410 /*
2411 * Set the bw_upcall entry
2412 */
2413 u = &bw_upcalls[bw_upcalls_n++];
2414 u->bu_src = x->bm_mfc->mfc_origin;
2415 u->bu_dst = x->bm_mfc->mfc_mcastgrp;
2416 u->bu_threshold.b_time = x->bm_threshold.b_time;
2417 u->bu_threshold.b_packets = x->bm_threshold.b_packets;
2418 u->bu_threshold.b_bytes = x->bm_threshold.b_bytes;
2419 u->bu_measured.b_time = delta;
2420 u->bu_measured.b_packets = x->bm_measured.b_packets;
2421 u->bu_measured.b_bytes = x->bm_measured.b_bytes;
2422 u->bu_flags = 0;
2423 if (x->bm_flags & BW_METER_UNIT_PACKETS)
2424 u->bu_flags |= BW_UPCALL_UNIT_PACKETS;
2425 if (x->bm_flags & BW_METER_UNIT_BYTES)
2426 u->bu_flags |= BW_UPCALL_UNIT_BYTES;
2427 if (x->bm_flags & BW_METER_GEQ)
2428 u->bu_flags |= BW_UPCALL_GEQ;
2429 if (x->bm_flags & BW_METER_LEQ)
2430 u->bu_flags |= BW_UPCALL_LEQ;
2431 }
2432
2433 /*
2434 * Send the pending bandwidth-related upcalls
2435 */
2436 static void
2437 bw_upcalls_send(void)
2438 {
2439 struct mbuf *m;
2440 int len = bw_upcalls_n * sizeof(bw_upcalls[0]);
2441 struct sockaddr_in k_igmpsrc = {
2442 .sin_len = sizeof(k_igmpsrc),
2443 .sin_family = AF_INET,
2444 };
2445 static struct igmpmsg igmpmsg = {
2446 0, /* unused1 */
2447 0, /* unused2 */
2448 IGMPMSG_BW_UPCALL,/* im_msgtype */
2449 0, /* im_mbz */
2450 0, /* im_vif */
2451 0, /* unused3 */
2452 { 0 }, /* im_src */
2453 { 0 } /* im_dst */
2454 };
2455
2456 if (bw_upcalls_n == 0)
2457 return; /* No pending upcalls */
2458
2459 bw_upcalls_n = 0;
2460
2461 /*
2462 * Allocate a new mbuf, initialize it with the header and
2463 * the payload for the pending calls.
2464 */
2465 MGETHDR(m, M_DONTWAIT, MT_HEADER);
2466 if (m == NULL) {
2467 log(LOG_WARNING, "bw_upcalls_send: cannot allocate mbuf\n");
2468 return;
2469 }
2470
2471 m->m_len = m->m_pkthdr.len = 0;
2472 m_copyback(m, 0, sizeof(struct igmpmsg), (void *)&igmpmsg);
2473 m_copyback(m, sizeof(struct igmpmsg), len, (void *)&bw_upcalls[0]);
2474
2475 /*
2476 * Send the upcalls
2477 * XXX do we need to set the address in k_igmpsrc ?
2478 */
2479 mrtstat.mrts_upcalls++;
2480 if (socket_send(ip_mrouter, m, &k_igmpsrc) < 0) {
2481 log(LOG_WARNING, "bw_upcalls_send: ip_mrouter socket queue full\n");
2482 ++mrtstat.mrts_upq_sockfull;
2483 }
2484 }
2485
2486 /*
2487 * Compute the timeout hash value for the bw_meter entries
2488 */
2489 #define BW_METER_TIMEHASH(bw_meter, hash) \
2490 do { \
2491 struct timeval next_timeval = (bw_meter)->bm_start_time; \
2492 BW_TIMEVALADD(&next_timeval, &(bw_meter)->bm_threshold.b_time); \
2493 (hash) = next_timeval.tv_sec; \
2494 if (next_timeval.tv_usec) \
2495 (hash)++; /* XXX: make sure we don't timeout early */ \
2496 (hash) %= BW_METER_BUCKETS; \
2497 } while (/*CONSTCOND*/ 0)
2498
2499 /*
2500 * Schedule a timer to process periodically bw_meter entry of type "<="
2501 * by linking the entry in the proper hash bucket.
2502 */
2503 static void
2504 schedule_bw_meter(struct bw_meter *x, struct timeval *nowp)
2505 {
2506 int time_hash;
2507
2508 if (!(x->bm_flags & BW_METER_LEQ))
2509 return; /* XXX: we schedule timers only for "<=" entries */
2510
2511 /*
2512 * Reset the bw_meter entry
2513 */
2514 x->bm_start_time = *nowp;
2515 x->bm_measured.b_packets = 0;
2516 x->bm_measured.b_bytes = 0;
2517 x->bm_flags &= ~BW_METER_UPCALL_DELIVERED;
2518
2519 /*
2520 * Compute the timeout hash value and insert the entry
2521 */
2522 BW_METER_TIMEHASH(x, time_hash);
2523 x->bm_time_next = bw_meter_timers[time_hash];
2524 bw_meter_timers[time_hash] = x;
2525 x->bm_time_hash = time_hash;
2526 }
2527
2528 /*
2529 * Unschedule the periodic timer that processes bw_meter entry of type "<="
2530 * by removing the entry from the proper hash bucket.
2531 */
2532 static void
2533 unschedule_bw_meter(struct bw_meter *x)
2534 {
2535 int time_hash;
2536 struct bw_meter *prev, *tmp;
2537
2538 if (!(x->bm_flags & BW_METER_LEQ))
2539 return; /* XXX: we schedule timers only for "<=" entries */
2540
2541 /*
2542 * Compute the timeout hash value and delete the entry
2543 */
2544 time_hash = x->bm_time_hash;
2545 if (time_hash >= BW_METER_BUCKETS)
2546 return; /* Entry was not scheduled */
2547
2548 for (prev = NULL, tmp = bw_meter_timers[time_hash];
2549 tmp != NULL; prev = tmp, tmp = tmp->bm_time_next)
2550 if (tmp == x)
2551 break;
2552
2553 if (tmp == NULL)
2554 panic("unschedule_bw_meter: bw_meter entry not found");
2555
2556 if (prev != NULL)
2557 prev->bm_time_next = x->bm_time_next;
2558 else
2559 bw_meter_timers[time_hash] = x->bm_time_next;
2560
2561 x->bm_time_next = NULL;
2562 x->bm_time_hash = BW_METER_BUCKETS;
2563 }
2564
2565 /*
2566 * Process all "<=" type of bw_meter that should be processed now,
2567 * and for each entry prepare an upcall if necessary. Each processed
2568 * entry is rescheduled again for the (periodic) processing.
2569 *
2570 * This is run periodically (once per second normally). On each round,
2571 * all the potentially matching entries are in the hash slot that we are
2572 * looking at.
2573 */
2574 static void
2575 bw_meter_process(void)
2576 {
2577 int s;
2578 static uint32_t last_tv_sec; /* last time we processed this */
2579
2580 uint32_t loops;
2581 int i;
2582 struct timeval now, process_endtime;
2583
2584 microtime(&now);
2585 if (last_tv_sec == now.tv_sec)
2586 return; /* nothing to do */
2587
2588 loops = now.tv_sec - last_tv_sec;
2589 last_tv_sec = now.tv_sec;
2590 if (loops > BW_METER_BUCKETS)
2591 loops = BW_METER_BUCKETS;
2592
2593 s = splsoftnet();
2594 /*
2595 * Process all bins of bw_meter entries from the one after the last
2596 * processed to the current one. On entry, i points to the last bucket
2597 * visited, so we need to increment i at the beginning of the loop.
2598 */
2599 for (i = (now.tv_sec - loops) % BW_METER_BUCKETS; loops > 0; loops--) {
2600 struct bw_meter *x, *tmp_list;
2601
2602 if (++i >= BW_METER_BUCKETS)
2603 i = 0;
2604
2605 /* Disconnect the list of bw_meter entries from the bin */
2606 tmp_list = bw_meter_timers[i];
2607 bw_meter_timers[i] = NULL;
2608
2609 /* Process the list of bw_meter entries */
2610 while (tmp_list != NULL) {
2611 x = tmp_list;
2612 tmp_list = tmp_list->bm_time_next;
2613
2614 /* Test if the time interval is over */
2615 process_endtime = x->bm_start_time;
2616 BW_TIMEVALADD(&process_endtime, &x->bm_threshold.b_time);
2617 if (BW_TIMEVALCMP(&process_endtime, &now, >)) {
2618 /* Not yet: reschedule, but don't reset */
2619 int time_hash;
2620
2621 BW_METER_TIMEHASH(x, time_hash);
2622 if (time_hash == i && process_endtime.tv_sec == now.tv_sec) {
2623 /*
2624 * XXX: somehow the bin processing is a bit ahead of time.
2625 * Put the entry in the next bin.
2626 */
2627 if (++time_hash >= BW_METER_BUCKETS)
2628 time_hash = 0;
2629 }
2630 x->bm_time_next = bw_meter_timers[time_hash];
2631 bw_meter_timers[time_hash] = x;
2632 x->bm_time_hash = time_hash;
2633
2634 continue;
2635 }
2636
2637 /*
2638 * Test if we should deliver an upcall
2639 */
2640 if (((x->bm_flags & BW_METER_UNIT_PACKETS) &&
2641 (x->bm_measured.b_packets <= x->bm_threshold.b_packets)) ||
2642 ((x->bm_flags & BW_METER_UNIT_BYTES) &&
2643 (x->bm_measured.b_bytes <= x->bm_threshold.b_bytes))) {
2644 /* Prepare an upcall for delivery */
2645 bw_meter_prepare_upcall(x, &now);
2646 }
2647
2648 /*
2649 * Reschedule for next processing
2650 */
2651 schedule_bw_meter(x, &now);
2652 }
2653 }
2654
2655 /* Send all upcalls that are pending delivery */
2656 bw_upcalls_send();
2657
2658 splx(s);
2659 }
2660
2661 /*
2662 * A periodic function for sending all upcalls that are pending delivery
2663 */
2664 static void
2665 expire_bw_upcalls_send(void *unused)
2666 {
2667 int s;
2668
2669 s = splsoftnet();
2670 bw_upcalls_send();
2671 splx(s);
2672
2673 callout_reset(&bw_upcalls_ch, BW_UPCALLS_PERIOD,
2674 expire_bw_upcalls_send, NULL);
2675 }
2676
2677 /*
2678 * A periodic function for periodic scanning of the multicast forwarding
2679 * table for processing all "<=" bw_meter entries.
2680 */
2681 static void
2682 expire_bw_meter_process(void *unused)
2683 {
2684 if (mrt_api_config & MRT_MFC_BW_UPCALL)
2685 bw_meter_process();
2686
2687 callout_reset(&bw_meter_ch, BW_METER_PERIOD,
2688 expire_bw_meter_process, NULL);
2689 }
2690
2691 /*
2692 * End of bandwidth monitoring code
2693 */
2694
2695 #ifdef PIM
2696 /*
2697 * Send the packet up to the user daemon, or eventually do kernel encapsulation
2698 */
2699 static int
2700 pim_register_send(struct ip *ip, struct vif *vifp, struct mbuf *m,
2701 struct mfc *rt)
2702 {
2703 struct mbuf *mb_copy, *mm;
2704
2705 if (mrtdebug & DEBUG_PIM)
2706 log(LOG_DEBUG, "pim_register_send: \n");
2707
2708 mb_copy = pim_register_prepare(ip, m);
2709 if (mb_copy == NULL)
2710 return ENOBUFS;
2711
2712 /*
2713 * Send all the fragments. Note that the mbuf for each fragment
2714 * is freed by the sending machinery.
2715 */
2716 for (mm = mb_copy; mm; mm = mb_copy) {
2717 mb_copy = mm->m_nextpkt;
2718 mm->m_nextpkt = NULL;
2719 mm = m_pullup(mm, sizeof(struct ip));
2720 if (mm != NULL) {
2721 ip = mtod(mm, struct ip *);
2722 if ((mrt_api_config & MRT_MFC_RP) &&
2723 !in_nullhost(rt->mfc_rp)) {
2724 pim_register_send_rp(ip, vifp, mm, rt);
2725 } else {
2726 pim_register_send_upcall(ip, vifp, mm, rt);
2727 }
2728 }
2729 }
2730
2731 return 0;
2732 }
2733
2734 /*
2735 * Return a copy of the data packet that is ready for PIM Register
2736 * encapsulation.
2737 * XXX: Note that in the returned copy the IP header is a valid one.
2738 */
2739 static struct mbuf *
2740 pim_register_prepare(struct ip *ip, struct mbuf *m)
2741 {
2742 struct mbuf *mb_copy = NULL;
2743 int mtu;
2744
2745 /* Take care of delayed checksums */
2746 if (m->m_pkthdr.csum_flags & (M_CSUM_TCPv4|M_CSUM_UDPv4)) {
2747 in_undefer_cksum_tcpudp(m);
2748 m->m_pkthdr.csum_flags &= ~(M_CSUM_TCPv4|M_CSUM_UDPv4);
2749 }
2750
2751 /*
2752 * Copy the old packet & pullup its IP header into the
2753 * new mbuf so we can modify it.
2754 */
2755 mb_copy = m_copypacket(m, M_DONTWAIT);
2756 if (mb_copy == NULL)
2757 return NULL;
2758 mb_copy = m_pullup(mb_copy, ip->ip_hl << 2);
2759 if (mb_copy == NULL)
2760 return NULL;
2761
2762 /* take care of the TTL */
2763 ip = mtod(mb_copy, struct ip *);
2764 --ip->ip_ttl;
2765
2766 /* Compute the MTU after the PIM Register encapsulation */
2767 mtu = 0xffff - sizeof(pim_encap_iphdr) - sizeof(pim_encap_pimhdr);
2768
2769 if (ntohs(ip->ip_len) <= mtu) {
2770 /* Turn the IP header into a valid one */
2771 ip->ip_sum = 0;
2772 ip->ip_sum = in_cksum(mb_copy, ip->ip_hl << 2);
2773 } else {
2774 /* Fragment the packet */
2775 if (ip_fragment(mb_copy, NULL, mtu) != 0) {
2776 /* XXX: mb_copy was freed by ip_fragment() */
2777 return NULL;
2778 }
2779 }
2780 return mb_copy;
2781 }
2782
2783 /*
2784 * Send an upcall with the data packet to the user-level process.
2785 */
2786 static int
2787 pim_register_send_upcall(struct ip *ip, struct vif *vifp,
2788 struct mbuf *mb_copy, struct mfc *rt)
2789 {
2790 struct mbuf *mb_first;
2791 int len = ntohs(ip->ip_len);
2792 struct igmpmsg *im;
2793 struct sockaddr_in k_igmpsrc = {
2794 .sin_len = sizeof(k_igmpsrc),
2795 .sin_family = AF_INET,
2796 };
2797
2798 /*
2799 * Add a new mbuf with an upcall header
2800 */
2801 MGETHDR(mb_first, M_DONTWAIT, MT_HEADER);
2802 if (mb_first == NULL) {
2803 m_freem(mb_copy);
2804 return ENOBUFS;
2805 }
2806 mb_first->m_data += max_linkhdr;
2807 mb_first->m_pkthdr.len = len + sizeof(struct igmpmsg);
2808 mb_first->m_len = sizeof(struct igmpmsg);
2809 mb_first->m_next = mb_copy;
2810
2811 /* Send message to routing daemon */
2812 im = mtod(mb_first, struct igmpmsg *);
2813 im->im_msgtype = IGMPMSG_WHOLEPKT;
2814 im->im_mbz = 0;
2815 im->im_vif = vifp - viftable;
2816 im->im_src = ip->ip_src;
2817 im->im_dst = ip->ip_dst;
2818
2819 k_igmpsrc.sin_addr = ip->ip_src;
2820
2821 mrtstat.mrts_upcalls++;
2822
2823 if (socket_send(ip_mrouter, mb_first, &k_igmpsrc) < 0) {
2824 if (mrtdebug & DEBUG_PIM)
2825 log(LOG_WARNING,
2826 "mcast: pim_register_send_upcall: ip_mrouter socket queue full\n");
2827 ++mrtstat.mrts_upq_sockfull;
2828 return ENOBUFS;
2829 }
2830
2831 /* Keep statistics */
2832 pimstat.pims_snd_registers_msgs++;
2833 pimstat.pims_snd_registers_bytes += len;
2834
2835 return 0;
2836 }
2837
2838 /*
2839 * Encapsulate the data packet in PIM Register message and send it to the RP.
2840 */
2841 static int
2842 pim_register_send_rp(struct ip *ip, struct vif *vifp,
2843 struct mbuf *mb_copy, struct mfc *rt)
2844 {
2845 struct mbuf *mb_first;
2846 struct ip *ip_outer;
2847 struct pim_encap_pimhdr *pimhdr;
2848 int len = ntohs(ip->ip_len);
2849 vifi_t vifi = rt->mfc_parent;
2850
2851 if ((vifi >= numvifs) || in_nullhost(viftable[vifi].v_lcl_addr)) {
2852 m_freem(mb_copy);
2853 return EADDRNOTAVAIL; /* The iif vif is invalid */
2854 }
2855
2856 /*
2857 * Add a new mbuf with the encapsulating header
2858 */
2859 MGETHDR(mb_first, M_DONTWAIT, MT_HEADER);
2860 if (mb_first == NULL) {
2861 m_freem(mb_copy);
2862 return ENOBUFS;
2863 }
2864 mb_first->m_data += max_linkhdr;
2865 mb_first->m_len = sizeof(pim_encap_iphdr) + sizeof(pim_encap_pimhdr);
2866 mb_first->m_next = mb_copy;
2867
2868 mb_first->m_pkthdr.len = len + mb_first->m_len;
2869
2870 /*
2871 * Fill in the encapsulating IP and PIM header
2872 */
2873 ip_outer = mtod(mb_first, struct ip *);
2874 *ip_outer = pim_encap_iphdr;
2875 if (mb_first->m_pkthdr.len < IP_MINFRAGSIZE)
2876 ip_outer->ip_id = 0;
2877 else
2878 ip_outer->ip_id = ip_newid(NULL);
2879 ip_outer->ip_len = htons(len + sizeof(pim_encap_iphdr) +
2880 sizeof(pim_encap_pimhdr));
2881 ip_outer->ip_src = viftable[vifi].v_lcl_addr;
2882 ip_outer->ip_dst = rt->mfc_rp;
2883 /*
2884 * Copy the inner header TOS to the outer header, and take care of the
2885 * IP_DF bit.
2886 */
2887 ip_outer->ip_tos = ip->ip_tos;
2888 if (ntohs(ip->ip_off) & IP_DF)
2889 ip_outer->ip_off |= htons(IP_DF);
2890 pimhdr = (struct pim_encap_pimhdr *)((char *)ip_outer
2891 + sizeof(pim_encap_iphdr));
2892 *pimhdr = pim_encap_pimhdr;
2893 /* If the iif crosses a border, set the Border-bit */
2894 if (rt->mfc_flags[vifi] & MRT_MFC_FLAGS_BORDER_VIF & mrt_api_config)
2895 pimhdr->flags |= htonl(PIM_BORDER_REGISTER);
2896
2897 mb_first->m_data += sizeof(pim_encap_iphdr);
2898 pimhdr->pim.pim_cksum = in_cksum(mb_first, sizeof(pim_encap_pimhdr));
2899 mb_first->m_data -= sizeof(pim_encap_iphdr);
2900
2901 if (vifp->v_rate_limit == 0)
2902 tbf_send_packet(vifp, mb_first);
2903 else
2904 tbf_control(vifp, mb_first, ip, ntohs(ip_outer->ip_len));
2905
2906 /* Keep statistics */
2907 pimstat.pims_snd_registers_msgs++;
2908 pimstat.pims_snd_registers_bytes += len;
2909
2910 return 0;
2911 }
2912
2913 /*
2914 * PIM-SMv2 and PIM-DM messages processing.
2915 * Receives and verifies the PIM control messages, and passes them
2916 * up to the listening socket, using rip_input().
2917 * The only message with special processing is the PIM_REGISTER message
2918 * (used by PIM-SM): the PIM header is stripped off, and the inner packet
2919 * is passed to if_simloop().
2920 */
2921 void
2922 pim_input(struct mbuf *m, ...)
2923 {
2924 struct ip *ip = mtod(m, struct ip *);
2925 struct pim *pim;
2926 int minlen;
2927 int datalen;
2928 int ip_tos;
2929 int proto;
2930 int iphlen;
2931 va_list ap;
2932
2933 va_start(ap, m);
2934 iphlen = va_arg(ap, int);
2935 proto = va_arg(ap, int);
2936 va_end(ap);
2937
2938 datalen = ntohs(ip->ip_len) - iphlen;
2939
2940 /* Keep statistics */
2941 pimstat.pims_rcv_total_msgs++;
2942 pimstat.pims_rcv_total_bytes += datalen;
2943
2944 /*
2945 * Validate lengths
2946 */
2947 if (datalen < PIM_MINLEN) {
2948 pimstat.pims_rcv_tooshort++;
2949 log(LOG_ERR, "pim_input: packet size too small %d from %lx\n",
2950 datalen, (u_long)ip->ip_src.s_addr);
2951 m_freem(m);
2952 return;
2953 }
2954
2955 /*
2956 * If the packet is at least as big as a REGISTER, go ahead
2957 * and grab the PIM REGISTER header size, to avoid another
2958 * possible m_pullup() later.
2959 *
2960 * PIM_MINLEN == pimhdr + u_int32_t == 4 + 4 = 8
2961 * PIM_REG_MINLEN == pimhdr + reghdr + encap_iphdr == 4 + 4 + 20 = 28
2962 */
2963 minlen = iphlen + (datalen >= PIM_REG_MINLEN ? PIM_REG_MINLEN : PIM_MINLEN);
2964
2965 /*
2966 * Get the IP and PIM headers in contiguous memory, and
2967 * possibly the PIM REGISTER header.
2968 */
2969 if ((m->m_flags & M_EXT || m->m_len < minlen) &&
2970 (m = m_pullup(m, minlen)) == NULL) {
2971 log(LOG_ERR, "pim_input: m_pullup failure\n");
2972 return;
2973 }
2974 ip = mtod(m, struct ip *);
2975 ip_tos = ip->ip_tos;
2976
2977 /* adjust mbuf to point to the PIM header */
2978 m->m_data += iphlen;
2979 m->m_len -= iphlen;
2980 pim = mtod(m, struct pim *);
2981
2982 /*
2983 * Validate checksum. If PIM REGISTER, exclude the data packet.
2984 *
2985 * XXX: some older PIMv2 implementations don't make this distinction,
2986 * so for compatibility reason perform the checksum over part of the
2987 * message, and if error, then over the whole message.
2988 */
2989 if (PIM_VT_T(pim->pim_vt) == PIM_REGISTER && in_cksum(m, PIM_MINLEN) == 0) {
2990 /* do nothing, checksum okay */
2991 } else if (in_cksum(m, datalen)) {
2992 pimstat.pims_rcv_badsum++;
2993 if (mrtdebug & DEBUG_PIM)
2994 log(LOG_DEBUG, "pim_input: invalid checksum\n");
2995 m_freem(m);
2996 return;
2997 }
2998
2999 /* PIM version check */
3000 if (PIM_VT_V(pim->pim_vt) < PIM_VERSION) {
3001 pimstat.pims_rcv_badversion++;
3002 log(LOG_ERR, "pim_input: incorrect version %d, expecting %d\n",
3003 PIM_VT_V(pim->pim_vt), PIM_VERSION);
3004 m_freem(m);
3005 return;
3006 }
3007
3008 /* restore mbuf back to the outer IP */
3009 m->m_data -= iphlen;
3010 m->m_len += iphlen;
3011
3012 if (PIM_VT_T(pim->pim_vt) == PIM_REGISTER) {
3013 /*
3014 * Since this is a REGISTER, we'll make a copy of the register
3015 * headers ip + pim + u_int32 + encap_ip, to be passed up to the
3016 * routing daemon.
3017 */
3018 int s;
3019 struct sockaddr_in dst = {
3020 .sin_len = sizeof(dst),
3021 .sin_family = AF_INET,
3022 };
3023 struct mbuf *mcp;
3024 struct ip *encap_ip;
3025 u_int32_t *reghdr;
3026 struct ifnet *vifp;
3027
3028 s = splsoftnet();
3029 if ((reg_vif_num >= numvifs) || (reg_vif_num == VIFI_INVALID)) {
3030 splx(s);
3031 if (mrtdebug & DEBUG_PIM)
3032 log(LOG_DEBUG,
3033 "pim_input: register vif not set: %d\n", reg_vif_num);
3034 m_freem(m);
3035 return;
3036 }
3037 /* XXX need refcnt? */
3038 vifp = viftable[reg_vif_num].v_ifp;
3039 splx(s);
3040
3041 /*
3042 * Validate length
3043 */
3044 if (datalen < PIM_REG_MINLEN) {
3045 pimstat.pims_rcv_tooshort++;
3046 pimstat.pims_rcv_badregisters++;
3047 log(LOG_ERR,
3048 "pim_input: register packet size too small %d from %lx\n",
3049 datalen, (u_long)ip->ip_src.s_addr);
3050 m_freem(m);
3051 return;
3052 }
3053
3054 reghdr = (u_int32_t *)(pim + 1);
3055 encap_ip = (struct ip *)(reghdr + 1);
3056
3057 if (mrtdebug & DEBUG_PIM) {
3058 log(LOG_DEBUG,
3059 "pim_input[register], encap_ip: %lx -> %lx, encap_ip len %d\n",
3060 (u_long)ntohl(encap_ip->ip_src.s_addr),
3061 (u_long)ntohl(encap_ip->ip_dst.s_addr),
3062 ntohs(encap_ip->ip_len));
3063 }
3064
3065 /* verify the version number of the inner packet */
3066 if (encap_ip->ip_v != IPVERSION) {
3067 pimstat.pims_rcv_badregisters++;
3068 if (mrtdebug & DEBUG_PIM) {
3069 log(LOG_DEBUG, "pim_input: invalid IP version (%d) "
3070 "of the inner packet\n", encap_ip->ip_v);
3071 }
3072 m_freem(m);
3073 return;
3074 }
3075
3076 /* verify the inner packet doesn't have options */
3077 if (encap_ip->ip_hl != (sizeof(struct ip) >> 2)) {
3078 pimstat.pims_rcv_badregisters++;
3079 m_freem(m);
3080 return;
3081 }
3082
3083 /* verify the inner packet is destined to a mcast group */
3084 if (!IN_MULTICAST(encap_ip->ip_dst.s_addr)) {
3085 pimstat.pims_rcv_badregisters++;
3086 if (mrtdebug & DEBUG_PIM)
3087 log(LOG_DEBUG,
3088 "pim_input: inner packet of register is not "
3089 "multicast %lx\n",
3090 (u_long)ntohl(encap_ip->ip_dst.s_addr));
3091 m_freem(m);
3092 return;
3093 }
3094
3095 /* If a NULL_REGISTER, pass it to the daemon */
3096 if ((ntohl(*reghdr) & PIM_NULL_REGISTER))
3097 goto pim_input_to_daemon;
3098
3099 /*
3100 * Copy the TOS from the outer IP header to the inner IP header.
3101 */
3102 if (encap_ip->ip_tos != ip_tos) {
3103 /* Outer TOS -> inner TOS */
3104 encap_ip->ip_tos = ip_tos;
3105 /* Recompute the inner header checksum. Sigh... */
3106
3107 /* adjust mbuf to point to the inner IP header */
3108 m->m_data += (iphlen + PIM_MINLEN);
3109 m->m_len -= (iphlen + PIM_MINLEN);
3110
3111 encap_ip->ip_sum = 0;
3112 encap_ip->ip_sum = in_cksum(m, encap_ip->ip_hl << 2);
3113
3114 /* restore mbuf to point back to the outer IP header */
3115 m->m_data -= (iphlen + PIM_MINLEN);
3116 m->m_len += (iphlen + PIM_MINLEN);
3117 }
3118
3119 /*
3120 * Decapsulate the inner IP packet and loopback to forward it
3121 * as a normal multicast packet. Also, make a copy of the
3122 * outer_iphdr + pimhdr + reghdr + encap_iphdr
3123 * to pass to the daemon later, so it can take the appropriate
3124 * actions (e.g., send back PIM_REGISTER_STOP).
3125 * XXX: here m->m_data points to the outer IP header.
3126 */
3127 mcp = m_copym(m, 0, iphlen + PIM_REG_MINLEN, M_DONTWAIT);
3128 if (mcp == NULL) {
3129 log(LOG_ERR,
3130 "pim_input: pim register: could not copy register head\n");
3131 m_freem(m);
3132 return;
3133 }
3134
3135 /* Keep statistics */
3136 /* XXX: registers_bytes include only the encap. mcast pkt */
3137 pimstat.pims_rcv_registers_msgs++;
3138 pimstat.pims_rcv_registers_bytes += ntohs(encap_ip->ip_len);
3139
3140 /*
3141 * forward the inner ip packet; point m_data at the inner ip.
3142 */
3143 m_adj(m, iphlen + PIM_MINLEN);
3144
3145 if (mrtdebug & DEBUG_PIM) {
3146 log(LOG_DEBUG,
3147 "pim_input: forwarding decapsulated register: "
3148 "src %lx, dst %lx, vif %d\n",
3149 (u_long)ntohl(encap_ip->ip_src.s_addr),
3150 (u_long)ntohl(encap_ip->ip_dst.s_addr),
3151 reg_vif_num);
3152 }
3153 /* NB: vifp was collected above; can it change on us? */
3154 looutput(vifp, m, (struct sockaddr *)&dst, NULL);
3155
3156 /* prepare the register head to send to the mrouting daemon */
3157 m = mcp;
3158 }
3159
3160 pim_input_to_daemon:
3161 /*
3162 * Pass the PIM message up to the daemon; if it is a Register message,
3163 * pass the 'head' only up to the daemon. This includes the
3164 * outer IP header, PIM header, PIM-Register header and the
3165 * inner IP header.
3166 * XXX: the outer IP header pkt size of a Register is not adjust to
3167 * reflect the fact that the inner multicast data is truncated.
3168 */
3169 /*
3170 * Currently, pim_input() is always called holding softnet_lock
3171 * by ipintr()(!NET_MPSAFE) or PR_INPUT_WRAP()(NET_MPSAFE).
3172 */
3173 KASSERT(mutex_owned(softnet_lock));
3174 rip_input(m, iphlen, proto);
3175
3176 return;
3177 }
3178 #endif /* PIM */
3179