ip_mroute.c revision 1.149 1 /* $NetBSD: ip_mroute.c,v 1.149 2018/02/07 11:42:57 maxv Exp $ */
2
3 /*
4 * Copyright (c) 1992, 1993
5 * The Regents of the University of California. All rights reserved.
6 *
7 * This code is derived from software contributed to Berkeley by
8 * Stephen Deering of Stanford University.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 * 3. Neither the name of the University nor the names of its contributors
19 * may be used to endorse or promote products derived from this software
20 * without specific prior written permission.
21 *
22 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32 * SUCH DAMAGE.
33 *
34 * @(#)ip_mroute.c 8.2 (Berkeley) 11/15/93
35 */
36
37 /*
38 * Copyright (c) 1989 Stephen Deering
39 *
40 * This code is derived from software contributed to Berkeley by
41 * Stephen Deering of Stanford University.
42 *
43 * Redistribution and use in source and binary forms, with or without
44 * modification, are permitted provided that the following conditions
45 * are met:
46 * 1. Redistributions of source code must retain the above copyright
47 * notice, this list of conditions and the following disclaimer.
48 * 2. Redistributions in binary form must reproduce the above copyright
49 * notice, this list of conditions and the following disclaimer in the
50 * documentation and/or other materials provided with the distribution.
51 * 3. All advertising materials mentioning features or use of this software
52 * must display the following acknowledgement:
53 * This product includes software developed by the University of
54 * California, Berkeley and its contributors.
55 * 4. Neither the name of the University nor the names of its contributors
56 * may be used to endorse or promote products derived from this software
57 * without specific prior written permission.
58 *
59 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
60 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
61 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
62 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
63 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
64 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
65 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
66 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
67 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
68 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
69 * SUCH DAMAGE.
70 *
71 * @(#)ip_mroute.c 8.2 (Berkeley) 11/15/93
72 */
73
74 /*
75 * IP multicast forwarding procedures
76 *
77 * Written by David Waitzman, BBN Labs, August 1988.
78 * Modified by Steve Deering, Stanford, February 1989.
79 * Modified by Mark J. Steiglitz, Stanford, May, 1991
80 * Modified by Van Jacobson, LBL, January 1993
81 * Modified by Ajit Thyagarajan, PARC, August 1993
82 * Modified by Bill Fenner, PARC, April 1994
83 * Modified by Charles M. Hannum, NetBSD, May 1995.
84 * Modified by Ahmed Helmy, SGI, June 1996
85 * Modified by George Edmond Eddy (Rusty), ISI, February 1998
86 * Modified by Pavlin Radoslavov, USC/ISI, May 1998, August 1999, October 2000
87 * Modified by Hitoshi Asaeda, WIDE, August 2000
88 * Modified by Pavlin Radoslavov, ICSI, October 2002
89 *
90 * MROUTING Revision: 1.2
91 * and PIM-SMv2 and PIM-DM support, advanced API support,
92 * bandwidth metering and signaling
93 */
94
95 #include <sys/cdefs.h>
96 __KERNEL_RCSID(0, "$NetBSD: ip_mroute.c,v 1.149 2018/02/07 11:42:57 maxv Exp $");
97
98 #ifdef _KERNEL_OPT
99 #include "opt_inet.h"
100 #include "opt_ipsec.h"
101 #include "opt_pim.h"
102 #endif
103
104 #ifdef PIM
105 #define _PIM_VT 1
106 #endif
107
108 #include <sys/param.h>
109 #include <sys/systm.h>
110 #include <sys/callout.h>
111 #include <sys/mbuf.h>
112 #include <sys/socket.h>
113 #include <sys/socketvar.h>
114 #include <sys/errno.h>
115 #include <sys/time.h>
116 #include <sys/kernel.h>
117 #include <sys/kmem.h>
118 #include <sys/ioctl.h>
119 #include <sys/syslog.h>
120
121 #include <net/if.h>
122 #include <net/raw_cb.h>
123
124 #include <netinet/in.h>
125 #include <netinet/in_var.h>
126 #include <netinet/in_systm.h>
127 #include <netinet/ip.h>
128 #include <netinet/ip_var.h>
129 #include <netinet/in_pcb.h>
130 #include <netinet/udp.h>
131 #include <netinet/igmp.h>
132 #include <netinet/igmp_var.h>
133 #include <netinet/ip_mroute.h>
134 #ifdef PIM
135 #include <netinet/pim.h>
136 #include <netinet/pim_var.h>
137 #endif
138 #include <netinet/ip_encap.h>
139
140 #ifdef IPSEC
141 #include <netipsec/ipsec.h>
142 #include <netipsec/key.h>
143 #endif
144
145 #define IP_MULTICASTOPTS 0
146 #define M_PULLUP(m, len) \
147 do { \
148 if ((m) && ((m)->m_flags & M_EXT || (m)->m_len < (len))) \
149 (m) = m_pullup((m), (len)); \
150 } while (/*CONSTCOND*/ 0)
151
152 /*
153 * Globals. All but ip_mrouter and ip_mrtproto could be static,
154 * except for netstat or debugging purposes.
155 */
156 struct socket *ip_mrouter = NULL;
157 int ip_mrtproto = IGMP_DVMRP; /* for netstat only */
158
159 #define NO_RTE_FOUND 0x1
160 #define RTE_FOUND 0x2
161
162 #define MFCHASH(a, g) \
163 ((((a).s_addr >> 20) ^ ((a).s_addr >> 10) ^ (a).s_addr ^ \
164 ((g).s_addr >> 20) ^ ((g).s_addr >> 10) ^ (g).s_addr) & mfchash)
165 LIST_HEAD(mfchashhdr, mfc) *mfchashtbl;
166 u_long mfchash;
167
168 u_char nexpire[MFCTBLSIZ];
169 struct vif viftable[MAXVIFS];
170 struct mrtstat mrtstat;
171 u_int mrtdebug = 0; /* debug level */
172 #define DEBUG_MFC 0x02
173 #define DEBUG_FORWARD 0x04
174 #define DEBUG_EXPIRE 0x08
175 #define DEBUG_XMIT 0x10
176 #define DEBUG_PIM 0x20
177
178 #define VIFI_INVALID ((vifi_t) -1)
179
180 u_int tbfdebug = 0; /* tbf debug level */
181
182 /* vif attachment using sys/netinet/ip_encap.c */
183 static void vif_input(struct mbuf *, int, int, void *);
184 static int vif_encapcheck(struct mbuf *, int, int, void *);
185
186 static const struct encapsw vif_encapsw = {
187 .encapsw4 = {
188 .pr_input = vif_input,
189 .pr_ctlinput = NULL,
190 }
191 };
192
193 #define EXPIRE_TIMEOUT (hz / 4) /* 4x / second */
194 #define UPCALL_EXPIRE 6 /* number of timeouts */
195
196 /*
197 * Define the token bucket filter structures
198 */
199
200 #define TBF_REPROCESS (hz / 100) /* 100x / second */
201
202 static int get_sg_cnt(struct sioc_sg_req *);
203 static int get_vif_cnt(struct sioc_vif_req *);
204 static int ip_mrouter_init(struct socket *, int);
205 static int set_assert(int);
206 static int add_vif(struct vifctl *);
207 static int del_vif(vifi_t *);
208 static void update_mfc_params(struct mfc *, struct mfcctl2 *);
209 static void init_mfc_params(struct mfc *, struct mfcctl2 *);
210 static void expire_mfc(struct mfc *);
211 static int add_mfc(struct sockopt *);
212 #ifdef UPCALL_TIMING
213 static void collate(struct timeval *);
214 #endif
215 static int del_mfc(struct sockopt *);
216 static int set_api_config(struct sockopt *); /* chose API capabilities */
217 static int socket_send(struct socket *, struct mbuf *, struct sockaddr_in *);
218 static void expire_upcalls(void *);
219 static int ip_mdq(struct mbuf *, struct ifnet *, struct mfc *);
220 static void phyint_send(struct ip *, struct vif *, struct mbuf *);
221 static void encap_send(struct ip *, struct vif *, struct mbuf *);
222 static void tbf_control(struct vif *, struct mbuf *, struct ip *, u_int32_t);
223 static void tbf_queue(struct vif *, struct mbuf *);
224 static void tbf_process_q(struct vif *);
225 static void tbf_reprocess_q(void *);
226 static int tbf_dq_sel(struct vif *, struct ip *);
227 static void tbf_send_packet(struct vif *, struct mbuf *);
228 static void tbf_update_tokens(struct vif *);
229 static int priority(struct vif *, struct ip *);
230
231 /*
232 * Bandwidth monitoring
233 */
234 static void free_bw_list(struct bw_meter *);
235 static int add_bw_upcall(struct bw_upcall *);
236 static int del_bw_upcall(struct bw_upcall *);
237 static void bw_meter_receive_packet(struct bw_meter *, int , struct timeval *);
238 static void bw_meter_prepare_upcall(struct bw_meter *, struct timeval *);
239 static void bw_upcalls_send(void);
240 static void schedule_bw_meter(struct bw_meter *, struct timeval *);
241 static void unschedule_bw_meter(struct bw_meter *);
242 static void bw_meter_process(void);
243 static void expire_bw_upcalls_send(void *);
244 static void expire_bw_meter_process(void *);
245
246 #ifdef PIM
247 static int pim_register_send(struct ip *, struct vif *,
248 struct mbuf *, struct mfc *);
249 static int pim_register_send_rp(struct ip *, struct vif *,
250 struct mbuf *, struct mfc *);
251 static int pim_register_send_upcall(struct ip *, struct vif *,
252 struct mbuf *, struct mfc *);
253 static struct mbuf *pim_register_prepare(struct ip *, struct mbuf *);
254 #endif
255
256 /*
257 * 'Interfaces' associated with decapsulator (so we can tell
258 * packets that went through it from ones that get reflected
259 * by a broken gateway). These interfaces are never linked into
260 * the system ifnet list & no routes point to them. I.e., packets
261 * can't be sent this way. They only exist as a placeholder for
262 * multicast source verification.
263 */
264 #if 0
265 struct ifnet multicast_decap_if[MAXVIFS];
266 #endif
267
268 #define ENCAP_TTL 64
269 #define ENCAP_PROTO IPPROTO_IPIP /* 4 */
270
271 /* prototype IP hdr for encapsulated packets */
272 struct ip multicast_encap_iphdr = {
273 .ip_hl = sizeof(struct ip) >> 2,
274 .ip_v = IPVERSION,
275 .ip_len = sizeof(struct ip),
276 .ip_ttl = ENCAP_TTL,
277 .ip_p = ENCAP_PROTO,
278 };
279
280 /*
281 * Bandwidth meter variables and constants
282 */
283
284 /*
285 * Pending timeouts are stored in a hash table, the key being the
286 * expiration time. Periodically, the entries are analysed and processed.
287 */
288 #define BW_METER_BUCKETS 1024
289 static struct bw_meter *bw_meter_timers[BW_METER_BUCKETS];
290 struct callout bw_meter_ch;
291 #define BW_METER_PERIOD (hz) /* periodical handling of bw meters */
292
293 /*
294 * Pending upcalls are stored in a vector which is flushed when
295 * full, or periodically
296 */
297 static struct bw_upcall bw_upcalls[BW_UPCALLS_MAX];
298 static u_int bw_upcalls_n; /* # of pending upcalls */
299 struct callout bw_upcalls_ch;
300 #define BW_UPCALLS_PERIOD (hz) /* periodical flush of bw upcalls */
301
302 #ifdef PIM
303 struct pimstat pimstat;
304
305 /*
306 * Note: the PIM Register encapsulation adds the following in front of a
307 * data packet:
308 *
309 * struct pim_encap_hdr {
310 * struct ip ip;
311 * struct pim_encap_pimhdr pim;
312 * }
313 *
314 */
315
316 struct pim_encap_pimhdr {
317 struct pim pim;
318 uint32_t flags;
319 };
320
321 static struct ip pim_encap_iphdr = {
322 .ip_v = IPVERSION,
323 .ip_hl = sizeof(struct ip) >> 2,
324 .ip_len = sizeof(struct ip),
325 .ip_ttl = ENCAP_TTL,
326 .ip_p = IPPROTO_PIM,
327 };
328
329 static struct pim_encap_pimhdr pim_encap_pimhdr = {
330 {
331 PIM_MAKE_VT(PIM_VERSION, PIM_REGISTER), /* PIM vers and message type */
332 0, /* reserved */
333 0, /* checksum */
334 },
335 0 /* flags */
336 };
337
338 static struct ifnet multicast_register_if;
339 static vifi_t reg_vif_num = VIFI_INVALID;
340 #endif /* PIM */
341
342
343 /*
344 * Private variables.
345 */
346 static vifi_t numvifs = 0;
347
348 static struct callout expire_upcalls_ch;
349
350 /*
351 * whether or not special PIM assert processing is enabled.
352 */
353 static int pim_assert;
354 /*
355 * Rate limit for assert notification messages, in usec
356 */
357 #define ASSERT_MSG_TIME 3000000
358
359 /*
360 * Kernel multicast routing API capabilities and setup.
361 * If more API capabilities are added to the kernel, they should be
362 * recorded in `mrt_api_support'.
363 */
364 static const u_int32_t mrt_api_support = (MRT_MFC_FLAGS_DISABLE_WRONGVIF |
365 MRT_MFC_FLAGS_BORDER_VIF |
366 MRT_MFC_RP |
367 MRT_MFC_BW_UPCALL);
368 static u_int32_t mrt_api_config = 0;
369
370 /*
371 * Find a route for a given origin IP address and Multicast group address
372 * Type of service parameter to be added in the future!!!
373 * Statistics are updated by the caller if needed
374 * (mrtstat.mrts_mfc_lookups and mrtstat.mrts_mfc_misses)
375 */
376 static struct mfc *
377 mfc_find(struct in_addr *o, struct in_addr *g)
378 {
379 struct mfc *rt;
380
381 LIST_FOREACH(rt, &mfchashtbl[MFCHASH(*o, *g)], mfc_hash) {
382 if (in_hosteq(rt->mfc_origin, *o) &&
383 in_hosteq(rt->mfc_mcastgrp, *g) &&
384 (rt->mfc_stall == NULL))
385 break;
386 }
387
388 return (rt);
389 }
390
391 /*
392 * Macros to compute elapsed time efficiently
393 * Borrowed from Van Jacobson's scheduling code
394 */
395 #define TV_DELTA(a, b, delta) do { \
396 int xxs; \
397 delta = (a).tv_usec - (b).tv_usec; \
398 xxs = (a).tv_sec - (b).tv_sec; \
399 switch (xxs) { \
400 case 2: \
401 delta += 1000000; \
402 /* fall through */ \
403 case 1: \
404 delta += 1000000; \
405 /* fall through */ \
406 case 0: \
407 break; \
408 default: \
409 delta += (1000000 * xxs); \
410 break; \
411 } \
412 } while (/*CONSTCOND*/ 0)
413
414 #ifdef UPCALL_TIMING
415 u_int32_t upcall_data[51];
416 #endif /* UPCALL_TIMING */
417
/*
 * Handle MRT setsockopt commands to modify the multicast routing tables.
 *
 * All options except MRT_INIT are only honoured on the socket that is
 * currently registered as the multicast router (ip_mrouter); any other
 * socket gets ENOPROTOOPT.  Returns 0 on success or an errno value.
 */
int
ip_mrouter_set(struct socket *so, struct sockopt *sopt)
{
	int error;
	int optval;
	struct vifctl vifc;
	vifi_t vifi;
	struct bw_upcall bwuc;

	if (sopt->sopt_name != MRT_INIT && so != ip_mrouter)
		error = ENOPROTOOPT;
	else {
		switch (sopt->sopt_name) {
		case MRT_INIT:
			/* Register 'so' as the multicast routing socket. */
			error = sockopt_getint(sopt, &optval);
			if (error)
				break;

			error = ip_mrouter_init(so, optval);
			break;
		case MRT_DONE:
			/* Shut multicast routing down entirely. */
			error = ip_mrouter_done();
			break;
		case MRT_ADD_VIF:
			error = sockopt_get(sopt, &vifc, sizeof(vifc));
			if (error)
				break;
			error = add_vif(&vifc);
			break;
		case MRT_DEL_VIF:
			error = sockopt_get(sopt, &vifi, sizeof(vifi));
			if (error)
				break;
			error = del_vif(&vifi);
			break;
		case MRT_ADD_MFC:
			/* add_mfc() parses the sockopt itself (two sizes). */
			error = add_mfc(sopt);
			break;
		case MRT_DEL_MFC:
			error = del_mfc(sopt);
			break;
		case MRT_ASSERT:
			/* Enable/disable PIM assert processing. */
			error = sockopt_getint(sopt, &optval);
			if (error)
				break;
			error = set_assert(optval);
			break;
		case MRT_API_CONFIG:
			error = set_api_config(sopt);
			break;
		case MRT_ADD_BW_UPCALL:
			error = sockopt_get(sopt, &bwuc, sizeof(bwuc));
			if (error)
				break;
			error = add_bw_upcall(&bwuc);
			break;
		case MRT_DEL_BW_UPCALL:
			error = sockopt_get(sopt, &bwuc, sizeof(bwuc));
			if (error)
				break;
			error = del_bw_upcall(&bwuc);
			break;
		default:
			error = ENOPROTOOPT;
			break;
		}
	}
	return (error);
}
490
491 /*
492 * Handle MRT getsockopt commands
493 */
494 int
495 ip_mrouter_get(struct socket *so, struct sockopt *sopt)
496 {
497 int error;
498
499 if (so != ip_mrouter)
500 error = ENOPROTOOPT;
501 else {
502 switch (sopt->sopt_name) {
503 case MRT_VERSION:
504 error = sockopt_setint(sopt, 0x0305); /* XXX !!!! */
505 break;
506 case MRT_ASSERT:
507 error = sockopt_setint(sopt, pim_assert);
508 break;
509 case MRT_API_SUPPORT:
510 error = sockopt_set(sopt, &mrt_api_support,
511 sizeof(mrt_api_support));
512 break;
513 case MRT_API_CONFIG:
514 error = sockopt_set(sopt, &mrt_api_config,
515 sizeof(mrt_api_config));
516 break;
517 default:
518 error = ENOPROTOOPT;
519 break;
520 }
521 }
522 return (error);
523 }
524
525 /*
526 * Handle ioctl commands to obtain information from the cache
527 */
528 int
529 mrt_ioctl(struct socket *so, u_long cmd, void *data)
530 {
531 int error;
532
533 if (so != ip_mrouter)
534 error = EINVAL;
535 else
536 switch (cmd) {
537 case SIOCGETVIFCNT:
538 error = get_vif_cnt((struct sioc_vif_req *)data);
539 break;
540 case SIOCGETSGCNT:
541 error = get_sg_cnt((struct sioc_sg_req *)data);
542 break;
543 default:
544 error = EINVAL;
545 break;
546 }
547
548 return (error);
549 }
550
551 /*
552 * returns the packet, byte, rpf-failure count for the source group provided
553 */
554 static int
555 get_sg_cnt(struct sioc_sg_req *req)
556 {
557 int s;
558 struct mfc *rt;
559
560 s = splsoftnet();
561 rt = mfc_find(&req->src, &req->grp);
562 if (rt == NULL) {
563 splx(s);
564 req->pktcnt = req->bytecnt = req->wrong_if = 0xffffffff;
565 return (EADDRNOTAVAIL);
566 }
567 req->pktcnt = rt->mfc_pkt_cnt;
568 req->bytecnt = rt->mfc_byte_cnt;
569 req->wrong_if = rt->mfc_wrong_if;
570 splx(s);
571
572 return (0);
573 }
574
575 /*
576 * returns the input and output packet and byte counts on the vif provided
577 */
578 static int
579 get_vif_cnt(struct sioc_vif_req *req)
580 {
581 vifi_t vifi = req->vifi;
582
583 if (vifi >= numvifs)
584 return (EINVAL);
585
586 req->icount = viftable[vifi].v_pkt_in;
587 req->ocount = viftable[vifi].v_pkt_out;
588 req->ibytes = viftable[vifi].v_bytes_in;
589 req->obytes = viftable[vifi].v_bytes_out;
590
591 return (0);
592 }
593
/*
 * Enable multicast routing.
 *
 * 'so' must be a raw IGMP socket and 'v' the API version (only 1 is
 * accepted).  Registers 'so' as the single multicast routing socket,
 * allocates the MFC hash table and starts the periodic callouts for
 * upcall expiry and bandwidth metering.  Returns 0 on success,
 * EOPNOTSUPP/EINVAL/EADDRINUSE on failure.
 */
static int
ip_mrouter_init(struct socket *so, int v)
{
	if (mrtdebug)
		log(LOG_DEBUG,
		    "ip_mrouter_init: so_type = %d, pr_protocol = %d\n",
		    so->so_type, so->so_proto->pr_protocol);

	if (so->so_type != SOCK_RAW ||
	    so->so_proto->pr_protocol != IPPROTO_IGMP)
		return (EOPNOTSUPP);

	if (v != 1)
		return (EINVAL);

	/* Only one routing daemon at a time. */
	if (ip_mrouter != NULL)
		return (EADDRINUSE);

	ip_mrouter = so;

	mfchashtbl = hashinit(MFCTBLSIZ, HASH_LIST, true, &mfchash);
	memset((void *)nexpire, 0, sizeof(nexpire));

	pim_assert = 0;

	/* Periodic expiry of pending (unresolved) upcall entries. */
	callout_init(&expire_upcalls_ch, 0);
	callout_reset(&expire_upcalls_ch, EXPIRE_TIMEOUT,
	    expire_upcalls, NULL);

	/* Periodic flush of queued bandwidth upcalls. */
	callout_init(&bw_upcalls_ch, 0);
	callout_reset(&bw_upcalls_ch, BW_UPCALLS_PERIOD,
	    expire_bw_upcalls_send, NULL);

	/* Periodic processing of scheduled bandwidth meters. */
	callout_init(&bw_meter_ch, 0);
	callout_reset(&bw_meter_ch, BW_METER_PERIOD,
	    expire_bw_meter_process, NULL);

	if (mrtdebug)
		log(LOG_DEBUG, "ip_mrouter_init\n");

	return (0);
}
639
/*
 * Disable multicast routing.
 *
 * Tears down all vifs, stops the periodic callouts, frees every MFC
 * entry and the hash table itself, clears the bandwidth-meter state
 * and finally deregisters the routing socket.  Runs at splsoftnet to
 * keep the forwarding path out while state is being destroyed.
 * Always returns 0.
 */
int
ip_mrouter_done(void)
{
	vifi_t vifi;
	struct vif *vifp;
	int i;
	int s;

	s = splsoftnet();

	/* Clear out all the vifs currently in use. */
	for (vifi = 0; vifi < numvifs; vifi++) {
		vifp = &viftable[vifi];
		if (!in_nullhost(vifp->v_lcl_addr))
			reset_vif(vifp);
	}

	numvifs = 0;
	pim_assert = 0;
	mrt_api_config = 0;

	callout_stop(&expire_upcalls_ch);
	callout_stop(&bw_upcalls_ch);
	callout_stop(&bw_meter_ch);

	/*
	 * Free all multicast forwarding cache entries.
	 */
	for (i = 0; i < MFCTBLSIZ; i++) {
		struct mfc *rt, *nrt;

		/* Save the successor first: expire_mfc() frees 'rt'. */
		for (rt = LIST_FIRST(&mfchashtbl[i]); rt; rt = nrt) {
			nrt = LIST_NEXT(rt, mfc_hash);

			expire_mfc(rt);
		}
	}

	memset((void *)nexpire, 0, sizeof(nexpire));
	hashdone(mfchashtbl, HASH_LIST, mfchash);
	mfchashtbl = NULL;

	bw_upcalls_n = 0;
	memset(bw_meter_timers, 0, sizeof(bw_meter_timers));

	/* Reset de-encapsulation cache. */

	ip_mrouter = NULL;

	splx(s);

	if (mrtdebug)
		log(LOG_DEBUG, "ip_mrouter_done\n");

	return (0);
}
699
/*
 * Called when an interface is detached from the system: drop every
 * reference this module holds to 'ifp'.  Vifs bound to the interface
 * are reset, and any packets stalled in pending-upcall queues have
 * their cached ifp pointer cleared so they are not forwarded through
 * a dead interface later.
 */
void
ip_mrouter_detach(struct ifnet *ifp)
{
	int vifi, i;
	struct vif *vifp;
	struct mfc *rt;
	struct rtdetq *rte;

	/* XXX not sure about side effect to userland routing daemon */
	for (vifi = 0; vifi < numvifs; vifi++) {
		vifp = &viftable[vifi];
		if (vifp->v_ifp == ifp)
			reset_vif(vifp);
	}

	/* Only buckets with unresolved entries (nexpire != 0) can
	 * hold stall queues, so skip the rest. */
	for (i = 0; i < MFCTBLSIZ; i++) {
		if (nexpire[i] == 0)
			continue;
		LIST_FOREACH(rt, &mfchashtbl[i], mfc_hash) {
			for (rte = rt->mfc_stall; rte; rte = rte->next) {
				if (rte->ifp == ifp)
					rte->ifp = NULL;
			}
		}
	}
}
725
726 /*
727 * Set PIM assert processing global
728 */
729 static int
730 set_assert(int i)
731 {
732 pim_assert = !!i;
733 return (0);
734 }
735
736 /*
737 * Configure API capabilities
738 */
739 static int
740 set_api_config(struct sockopt *sopt)
741 {
742 u_int32_t apival;
743 int i, error;
744
745 /*
746 * We can set the API capabilities only if it is the first operation
747 * after MRT_INIT. I.e.:
748 * - there are no vifs installed
749 * - pim_assert is not enabled
750 * - the MFC table is empty
751 */
752 error = sockopt_get(sopt, &apival, sizeof(apival));
753 if (error)
754 return (error);
755 if (numvifs > 0)
756 return (EPERM);
757 if (pim_assert)
758 return (EPERM);
759 for (i = 0; i < MFCTBLSIZ; i++) {
760 if (LIST_FIRST(&mfchashtbl[i]) != NULL)
761 return (EPERM);
762 }
763
764 mrt_api_config = apival & mrt_api_support;
765 return (0);
766 }
767
/*
 * Add a vif to the vif table.
 *
 * Depending on vifcp->vifc_flags the vif is backed by:
 *  - VIFF_REGISTER (PIM only): a fake local "register_vif" interface;
 *  - VIFF_TUNNEL: a fake "mdecapN" interface plus an entry in the
 *    encapsulation dispatch table for decapsulating IPIP packets;
 *  - otherwise: the real interface owning vifc_lcl_addr, which is put
 *    into promiscuous multicast reception.
 * Returns 0 on success or an errno value.
 */
static int
add_vif(struct vifctl *vifcp)
{
	struct vif *vifp;
	struct ifnet *ifp;
	int error, s;
	struct sockaddr_in sin;

	if (vifcp->vifc_vifi >= MAXVIFS)
		return (EINVAL);
	if (in_nullhost(vifcp->vifc_lcl_addr))
		return (EADDRNOTAVAIL);

	vifp = &viftable[vifcp->vifc_vifi];
	if (!in_nullhost(vifp->v_lcl_addr))
		return (EADDRINUSE);

	/* Find the interface with an address in AF_INET family. */
#ifdef PIM
	if (vifcp->vifc_flags & VIFF_REGISTER) {
		/*
		 * XXX: Because VIFF_REGISTER does not really need a valid
		 * local interface (e.g. it could be 127.0.0.2), we don't
		 * check its address.
		 */
		ifp = NULL;
	} else
#endif
	{
		struct ifaddr *ifa;

		sockaddr_in_init(&sin, &vifcp->vifc_lcl_addr, 0);
		s = pserialize_read_enter();
		ifa = ifa_ifwithaddr(sintosa(&sin));
		if (ifa == NULL) {
			pserialize_read_exit(s);
			return EADDRNOTAVAIL;
		}
		ifp = ifa->ifa_ifp;
		/* FIXME NOMPSAFE */
		pserialize_read_exit(s);
	}

	if (vifcp->vifc_flags & VIFF_TUNNEL) {
		if (vifcp->vifc_flags & VIFF_SRCRT) {
			log(LOG_ERR, "source routed tunnels not supported\n");
			return (EOPNOTSUPP);
		}

		/* attach this vif to decapsulator dispatch table */
		/*
		 * XXX Use addresses in registration so that matching
		 * can be done with radix tree in decapsulator. But,
		 * we need to check inner header for multicast, so
		 * this requires both radix tree lookup and then a
		 * function to check, and this is not supported yet.
		 */
		error = encap_lock_enter();
		if (error)
			return error;
		vifp->v_encap_cookie = encap_attach_func(AF_INET, IPPROTO_IPV4,
		    vif_encapcheck, &vif_encapsw, vifp);
		encap_lock_exit();
		if (!vifp->v_encap_cookie)
			return (EINVAL);

		/* Create a fake encapsulation interface. */
		ifp = malloc(sizeof(*ifp), M_MRTABLE, M_WAITOK|M_ZERO);
		snprintf(ifp->if_xname, sizeof(ifp->if_xname),
		    "mdecap%d", vifcp->vifc_vifi);

		/* Prepare cached route entry. */
		memset(&vifp->v_route, 0, sizeof(vifp->v_route));
#ifdef PIM
	} else if (vifcp->vifc_flags & VIFF_REGISTER) {
		/* All register vifs share the single fake interface;
		 * only the first one initializes it. */
		ifp = &multicast_register_if;
		if (mrtdebug)
			log(LOG_DEBUG, "Adding a register vif, ifp: %p\n",
			    (void *)ifp);
		if (reg_vif_num == VIFI_INVALID) {
			memset(ifp, 0, sizeof(*ifp));
			snprintf(ifp->if_xname, sizeof(ifp->if_xname),
			    "register_vif");
			ifp->if_flags = IFF_LOOPBACK;
			memset(&vifp->v_route, 0, sizeof(vifp->v_route));
			reg_vif_num = vifcp->vifc_vifi;
		}
#endif
	} else {
		/* Make sure the interface supports multicast. */
		if ((ifp->if_flags & IFF_MULTICAST) == 0)
			return (EOPNOTSUPP);

		/* Enable promiscuous reception of all IP multicasts. */
		sockaddr_in_init(&sin, &zeroin_addr, 0);
		error = if_mcast_op(ifp, SIOCADDMULTI, sintosa(&sin));
		if (error)
			return (error);
	}

	s = splsoftnet();

	/* Define parameters for the tbf structure. */
	vifp->tbf_q = NULL;
	vifp->tbf_t = &vifp->tbf_q;
	microtime(&vifp->tbf_last_pkt_t);
	vifp->tbf_n_tok = 0;
	vifp->tbf_q_len = 0;
	vifp->tbf_max_q_len = MAXQSIZE;

	vifp->v_flags = vifcp->vifc_flags;
	vifp->v_threshold = vifcp->vifc_threshold;
	/* scaling up here allows division by 1024 in critical code */
	vifp->v_rate_limit = vifcp->vifc_rate_limit * 1024 / 1000;
	vifp->v_lcl_addr = vifcp->vifc_lcl_addr;
	vifp->v_rmt_addr = vifcp->vifc_rmt_addr;
	vifp->v_ifp = ifp;
	/* Initialize per vif pkt counters. */
	vifp->v_pkt_in = 0;
	vifp->v_pkt_out = 0;
	vifp->v_bytes_in = 0;
	vifp->v_bytes_out = 0;

	callout_init(&vifp->v_repq_ch, 0);

	splx(s);

	/* Adjust numvifs up if the vifi is higher than numvifs. */
	if (numvifs <= vifcp->vifc_vifi)
		numvifs = vifcp->vifc_vifi + 1;

	if (mrtdebug)
		log(LOG_DEBUG, "add_vif #%d, lcladdr %x, %s %x, thresh %x, rate %d\n",
		    vifcp->vifc_vifi,
		    ntohl(vifcp->vifc_lcl_addr.s_addr),
		    (vifcp->vifc_flags & VIFF_TUNNEL) ? "rmtaddr" : "mask",
		    ntohl(vifcp->vifc_rmt_addr.s_addr),
		    vifcp->vifc_threshold,
		    vifcp->vifc_rate_limit);

	return (0);
}
913
/*
 * Tear down a single vif and return its slot to the unused state.
 * Undoes whatever add_vif() set up for the vif's flavor: detaches the
 * decapsulator hook, drains the token-bucket queue, frees the fake
 * tunnel interface or drops promiscuous multicast reception, and
 * finally zeroes the whole structure (which also clears v_lcl_addr,
 * the "slot in use" marker).
 */
void
reset_vif(struct vif *vifp)
{
	struct mbuf *m, *n;
	struct ifnet *ifp;
	struct sockaddr_in sin;

	callout_stop(&vifp->v_repq_ch);

	/* detach this vif from decapsulator dispatch table */
	encap_lock_enter();
	encap_detach(vifp->v_encap_cookie);
	encap_lock_exit();
	vifp->v_encap_cookie = NULL;

	/*
	 * Free packets queued at the interface
	 */
	for (m = vifp->tbf_q; m != NULL; m = n) {
		n = m->m_nextpkt;
		m_freem(m);
	}

	if (vifp->v_flags & VIFF_TUNNEL)
		/* Fake "mdecapN" interface allocated by add_vif(). */
		free(vifp->v_ifp, M_MRTABLE);
	else if (vifp->v_flags & VIFF_REGISTER) {
#ifdef PIM
		reg_vif_num = VIFI_INVALID;
#endif
	} else {
		/* Leave the promiscuous multicast group on the real ifp. */
		sockaddr_in_init(&sin, &zeroin_addr, 0);
		ifp = vifp->v_ifp;
		if_mcast_op(ifp, SIOCDELMULTI, sintosa(&sin));
	}
	memset((void *)vifp, 0, sizeof(*vifp));
}
950
951 /*
952 * Delete a vif from the vif table
953 */
954 static int
955 del_vif(vifi_t *vifip)
956 {
957 struct vif *vifp;
958 vifi_t vifi;
959 int s;
960
961 if (*vifip >= numvifs)
962 return (EINVAL);
963
964 vifp = &viftable[*vifip];
965 if (in_nullhost(vifp->v_lcl_addr))
966 return (EADDRNOTAVAIL);
967
968 s = splsoftnet();
969
970 reset_vif(vifp);
971
972 /* Adjust numvifs down */
973 for (vifi = numvifs; vifi > 0; vifi--)
974 if (!in_nullhost(viftable[vifi - 1].v_lcl_addr))
975 break;
976 numvifs = vifi;
977
978 splx(s);
979
980 if (mrtdebug)
981 log(LOG_DEBUG, "del_vif %d, numvifs %d\n", *vifip, numvifs);
982
983 return (0);
984 }
985
986 /*
987 * update an mfc entry without resetting counters and S,G addresses.
988 */
989 static void
990 update_mfc_params(struct mfc *rt, struct mfcctl2 *mfccp)
991 {
992 int i;
993
994 rt->mfc_parent = mfccp->mfcc_parent;
995 for (i = 0; i < numvifs; i++) {
996 rt->mfc_ttls[i] = mfccp->mfcc_ttls[i];
997 rt->mfc_flags[i] = mfccp->mfcc_flags[i] & mrt_api_config &
998 MRT_MFC_FLAGS_ALL;
999 }
1000 /* set the RP address */
1001 if (mrt_api_config & MRT_MFC_RP)
1002 rt->mfc_rp = mfccp->mfcc_rp;
1003 else
1004 rt->mfc_rp = zeroin_addr;
1005 }
1006
1007 /*
1008 * fully initialize an mfc entry from the parameter.
1009 */
1010 static void
1011 init_mfc_params(struct mfc *rt, struct mfcctl2 *mfccp)
1012 {
1013 rt->mfc_origin = mfccp->mfcc_origin;
1014 rt->mfc_mcastgrp = mfccp->mfcc_mcastgrp;
1015
1016 update_mfc_params(rt, mfccp);
1017
1018 /* initialize pkt counters per src-grp */
1019 rt->mfc_pkt_cnt = 0;
1020 rt->mfc_byte_cnt = 0;
1021 rt->mfc_wrong_if = 0;
1022 timerclear(&rt->mfc_last_assert);
1023 }
1024
1025 static void
1026 expire_mfc(struct mfc *rt)
1027 {
1028 struct rtdetq *rte, *nrte;
1029
1030 free_bw_list(rt->mfc_bw_meter);
1031
1032 for (rte = rt->mfc_stall; rte != NULL; rte = nrte) {
1033 nrte = rte->next;
1034 m_freem(rte->m);
1035 free(rte, M_MRTABLE);
1036 }
1037
1038 LIST_REMOVE(rt, mfc_hash);
1039 free(rt, M_MRTABLE);
1040 }
1041
/*
 * Add an mfc entry.
 *
 * Three cases, in order:
 *  1. A resolved entry already exists: just refresh its parameters.
 *  2. Unresolved (stalled) entries exist for this S,G: resolve them,
 *     forwarding and then freeing the packets queued on the upcall.
 *  3. Nothing exists: insert a brand-new entry (a daemon may install
 *     routes without a prior upcall).
 * The sockopt payload is either struct mfcctl or mfcctl2, depending
 * on whether the advanced API has been configured.
 */
static int
add_mfc(struct sockopt *sopt)
{
	struct mfcctl2 mfcctl2;
	struct mfcctl2 *mfccp;
	struct mfc *rt;
	u_int32_t hash = 0;
	struct rtdetq *rte, *nrte;
	u_short nstl;
	int s;
	int error;

	/*
	 * select data size depending on API version.
	 */
	mfccp = &mfcctl2;
	memset(&mfcctl2, 0, sizeof(mfcctl2));

	if (mrt_api_config & MRT_API_FLAGS_ALL)
		error = sockopt_get(sopt, mfccp, sizeof(struct mfcctl2));
	else
		error = sockopt_get(sopt, mfccp, sizeof(struct mfcctl));

	if (error)
		return (error);

	s = splsoftnet();
	rt = mfc_find(&mfccp->mfcc_origin, &mfccp->mfcc_mcastgrp);

	/* If an entry already exists, just update the fields */
	if (rt) {
		if (mrtdebug & DEBUG_MFC)
			log(LOG_DEBUG, "add_mfc update o %x g %x p %x\n",
			    ntohl(mfccp->mfcc_origin.s_addr),
			    ntohl(mfccp->mfcc_mcastgrp.s_addr),
			    mfccp->mfcc_parent);

		update_mfc_params(rt, mfccp);

		splx(s);
		return (0);
	}

	/*
	 * Find the entry for which the upcall was made and update
	 */
	nstl = 0;
	hash = MFCHASH(mfccp->mfcc_origin, mfccp->mfcc_mcastgrp);
	LIST_FOREACH(rt, &mfchashtbl[hash], mfc_hash) {
		if (in_hosteq(rt->mfc_origin, mfccp->mfcc_origin) &&
		    in_hosteq(rt->mfc_mcastgrp, mfccp->mfcc_mcastgrp) &&
		    rt->mfc_stall != NULL) {
			/* More than one stalled entry for the same S,G
			 * should not happen; log it. */
			if (nstl++)
				log(LOG_ERR, "add_mfc %s o %x g %x p %x dbx %p\n",
				    "multiple kernel entries",
				    ntohl(mfccp->mfcc_origin.s_addr),
				    ntohl(mfccp->mfcc_mcastgrp.s_addr),
				    mfccp->mfcc_parent, rt->mfc_stall);

			if (mrtdebug & DEBUG_MFC)
				log(LOG_DEBUG, "add_mfc o %x g %x p %x dbg %p\n",
				    ntohl(mfccp->mfcc_origin.s_addr),
				    ntohl(mfccp->mfcc_mcastgrp.s_addr),
				    mfccp->mfcc_parent, rt->mfc_stall);

			rte = rt->mfc_stall;
			init_mfc_params(rt, mfccp);
			rt->mfc_stall = NULL;

			rt->mfc_expire = 0;	/* Don't clean this guy up */
			nexpire[hash]--;

			/* free packets Qed at the end of this entry */
			for (; rte != NULL; rte = nrte) {
				nrte = rte->next;
				if (rte->ifp) {
					/* Forward the queued packet now
					 * that the route is resolved. */
					ip_mdq(rte->m, rte->ifp, rt);
				}
				m_freem(rte->m);
#ifdef UPCALL_TIMING
				collate(&rte->t);
#endif /* UPCALL_TIMING */
				free(rte, M_MRTABLE);
			}
		}
	}

	/*
	 * It is possible that an entry is being inserted without an upcall
	 */
	if (nstl == 0) {
		/*
		 * No mfc; make a new one
		 */
		if (mrtdebug & DEBUG_MFC)
			log(LOG_DEBUG, "add_mfc no upcall o %x g %x p %x\n",
			    ntohl(mfccp->mfcc_origin.s_addr),
			    ntohl(mfccp->mfcc_mcastgrp.s_addr),
			    mfccp->mfcc_parent);

		LIST_FOREACH(rt, &mfchashtbl[hash], mfc_hash) {
			if (in_hosteq(rt->mfc_origin, mfccp->mfcc_origin) &&
			    in_hosteq(rt->mfc_mcastgrp, mfccp->mfcc_mcastgrp)) {
				init_mfc_params(rt, mfccp);
				if (rt->mfc_expire)
					nexpire[hash]--;
				rt->mfc_expire = 0;
				break; /* XXX */
			}
		}
		if (rt == NULL) {	/* no upcall, so make a new entry */
			rt = (struct mfc *)malloc(sizeof(*rt), M_MRTABLE,
			    M_NOWAIT);
			if (rt == NULL) {
				splx(s);
				return (ENOBUFS);
			}

			init_mfc_params(rt, mfccp);
			rt->mfc_expire = 0;
			rt->mfc_stall = NULL;
			rt->mfc_bw_meter = NULL;

			/* insert new entry at head of hash chain */
			LIST_INSERT_HEAD(&mfchashtbl[hash], rt, mfc_hash);
		}
	}

	splx(s);
	return (0);
}
1176
1177 #ifdef UPCALL_TIMING
1178 /*
1179 * collect delay statistics on the upcalls
1180 */
static void
collate(struct timeval *t)
{
	u_int32_t d;
	struct timeval tp;
	u_int32_t delta;

	microtime(&tp);

	/* Only meaningful if the upcall timestamp lies in the past. */
	if (timercmp(t, &tp, <)) {
		TV_DELTA(tp, *t, delta);

		/*
		 * Bucket the delay into upcall_data[]; delta >> 10 is
		 * roughly milliseconds (assumes TV_DELTA yields
		 * microseconds — TODO confirm), clamped to bucket 50.
		 */
		d = delta >> 10;
		if (d > 50)
			d = 50;

		++upcall_data[d];
	}
}
1200 #endif /* UPCALL_TIMING */
1201
1202 /*
1203 * Delete an mfc entry
1204 */
static int
del_mfc(struct sockopt *sopt)
{
	struct mfcctl2 mfcctl2;
	struct mfcctl2 *mfccp;
	struct mfc *rt;
	int s;
	int error;

	/*
	 * XXX: for deleting MFC entries the information in entries
	 * of size "struct mfcctl" is sufficient.
	 */

	mfccp = &mfcctl2;
	memset(&mfcctl2, 0, sizeof(mfcctl2));

	/*
	 * Accept either the old (struct mfcctl) or the newer, larger
	 * (struct mfcctl2) request layout; sockopt_get() is tried with
	 * both sizes since the caller may use either API generation.
	 */
	error = sockopt_get(sopt, mfccp, sizeof(struct mfcctl));
	if (error) {
		/* Try with the size of mfcctl2. */
		error = sockopt_get(sopt, mfccp, sizeof(struct mfcctl2));
		if (error)
			return (error);
	}

	if (mrtdebug & DEBUG_MFC)
		log(LOG_DEBUG, "del_mfc origin %x mcastgrp %x\n",
		    ntohl(mfccp->mfcc_origin.s_addr),
		    ntohl(mfccp->mfcc_mcastgrp.s_addr));

	s = splsoftnet();

	rt = mfc_find(&mfccp->mfcc_origin, &mfccp->mfcc_mcastgrp);
	if (rt == NULL) {
		splx(s);
		return (EADDRNOTAVAIL);
	}

	/*
	 * free the bw_meter entries
	 */
	free_bw_list(rt->mfc_bw_meter);
	rt->mfc_bw_meter = NULL;

	/* Unhook from the hash chain, then release the entry itself. */
	LIST_REMOVE(rt, mfc_hash);
	free(rt, M_MRTABLE);

	splx(s);
	return (0);
}
1255
1256 static int
1257 socket_send(struct socket *s, struct mbuf *mm, struct sockaddr_in *src)
1258 {
1259 if (s) {
1260 if (sbappendaddr(&s->so_rcv, sintosa(src), mm, NULL) != 0) {
1261 sorwakeup(s);
1262 return (0);
1263 }
1264 }
1265 m_freem(mm);
1266 return (-1);
1267 }
1268
1269 /*
1270 * IP multicast forwarding function. This function assumes that the packet
1271 * pointed to by "ip" has arrived on (or is about to be sent to) the interface
1272 * pointed to by "ifp", and the packet is to be relayed to other networks
1273 * that have members of the packet's destination IP multicast group.
1274 *
1275 * The packet is returned unscathed to the caller, unless it is
1276 * erroneous, in which case a non-zero return value tells the caller to
1277 * discard it.
1278 */
1279
1280 #define IP_HDR_LEN 20 /* # bytes of fixed IP header (excluding options) */
1281 #define TUNNEL_LEN 12 /* # bytes of IP option for tunnel encapsulation */
1282
int
ip_mforward(struct mbuf *m, struct ifnet *ifp)
{
	struct ip *ip = mtod(m, struct ip *);
	struct mfc *rt;
	static int srctun = 0;	/* rate-limits the source-route complaint */
	struct mbuf *mm;
	struct sockaddr_in sin;
	int s;
	vifi_t vifi;

	if (mrtdebug & DEBUG_FORWARD)
		log(LOG_DEBUG, "ip_mforward: src %x, dst %x, ifp %p\n",
		    ntohl(ip->ip_src.s_addr), ntohl(ip->ip_dst.s_addr), ifp);

	/*
	 * Detect the legacy source-route (LSRR) tunnel encapsulation:
	 * a header long enough to hold the option, with an LSRR option
	 * byte immediately after the fixed IP header.
	 */
	if (ip->ip_hl < (IP_HDR_LEN + TUNNEL_LEN) >> 2 ||
	    ((u_char *)(ip + 1))[1] != IPOPT_LSRR) {
		/*
		 * Packet arrived via a physical interface or
		 * an encapsulated tunnel or a register_vif.
		 */
	} else {
		/*
		 * Packet arrived through a source-route tunnel.
		 * Source-route tunnels are no longer supported.
		 * Log only one complaint per 1000 such packets.
		 */
		if ((srctun++ % 1000) == 0)
			log(LOG_ERR,
			    "ip_mforward: received source-routed packet from %x\n",
			    ntohl(ip->ip_src.s_addr));

		return (1);
	}

	/*
	 * Clear any in-bound checksum flags for this packet.
	 */
	m->m_pkthdr.csum_flags = 0;

	/*
	 * Don't forward a packet with time-to-live of zero or one,
	 * or a packet destined to a local-only group.
	 */
	if (ip->ip_ttl <= 1 || IN_LOCAL_GROUP(ip->ip_dst.s_addr))
		return (0);

	/*
	 * Determine forwarding vifs from the forwarding cache table
	 */
	s = splsoftnet();
	++mrtstat.mrts_mfc_lookups;
	rt = mfc_find(&ip->ip_src, &ip->ip_dst);

	/* Entry exists, so forward if necessary */
	if (rt != NULL) {
		splx(s);
		return (ip_mdq(m, ifp, rt));
	} else {
		/*
		 * If we don't have a route for packet's origin,
		 * Make a copy of the packet & send message to routing daemon
		 */

		struct mbuf *mb0;
		struct rtdetq *rte;
		u_int32_t hash;
		int hlen = ip->ip_hl << 2;
#ifdef UPCALL_TIMING
		struct timeval tp;

		microtime(&tp);
#endif /* UPCALL_TIMING */

		++mrtstat.mrts_mfc_misses;

		mrtstat.mrts_no_route++;
		if (mrtdebug & (DEBUG_FORWARD | DEBUG_MFC))
			log(LOG_DEBUG, "ip_mforward: no rte s %x g %x\n",
			    ntohl(ip->ip_src.s_addr),
			    ntohl(ip->ip_dst.s_addr));

		/*
		 * Allocate mbufs early so that we don't do extra work if we are
		 * just going to fail anyway. Make sure to pullup the header so
		 * that other people can't step on it.
		 */
		rte = (struct rtdetq *)malloc(sizeof(*rte), M_MRTABLE,
		    M_NOWAIT);
		if (rte == NULL) {
			splx(s);
			return (ENOBUFS);
		}
		mb0 = m_copypacket(m, M_DONTWAIT);
		M_PULLUP(mb0, hlen);
		if (mb0 == NULL) {
			free(rte, M_MRTABLE);
			splx(s);
			return (ENOBUFS);
		}

		/* is there an upcall waiting for this flow? */
		hash = MFCHASH(ip->ip_src, ip->ip_dst);
		LIST_FOREACH(rt, &mfchashtbl[hash], mfc_hash) {
			if (in_hosteq(ip->ip_src, rt->mfc_origin) &&
			    in_hosteq(ip->ip_dst, rt->mfc_mcastgrp) &&
			    rt->mfc_stall != NULL)
				break;
		}

		if (rt == NULL) {
			int i;
			struct igmpmsg *im;

			/*
			 * Locate the vifi for the incoming interface for
			 * this packet.
			 * If none found, drop packet.
			 */
			for (vifi = 0; vifi < numvifs &&
				 viftable[vifi].v_ifp != ifp; vifi++)
				;
			if (vifi >= numvifs)	/* vif not found, drop packet */
				goto non_fatal;

			/* no upcall, so make a new entry */
			rt = (struct mfc *)malloc(sizeof(*rt), M_MRTABLE,
			    M_NOWAIT);
			if (rt == NULL)
				goto fail;

			/*
			 * Make a copy of the header to send to the user level
			 * process
			 */
			mm = m_copym(m, 0, hlen, M_DONTWAIT);
			M_PULLUP(mm, hlen);
			if (mm == NULL)
				goto fail1;

			/*
			 * Send message to routing daemon to install
			 * a route into the kernel table
			 */

			im = mtod(mm, struct igmpmsg *);
			im->im_msgtype = IGMPMSG_NOCACHE;
			im->im_mbz = 0;
			im->im_vif = vifi;

			mrtstat.mrts_upcalls++;

			sockaddr_in_init(&sin, &ip->ip_src, 0);
			if (socket_send(ip_mrouter, mm, &sin) < 0) {
				log(LOG_WARNING,
				    "ip_mforward: ip_mrouter socket queue full\n");
				++mrtstat.mrts_upq_sockfull;
			/*
			 * The two labels below are also the targets of the
			 * gotos in the allocation-failure paths above; they
			 * unwind in reverse order of allocation.
			 */
			fail1:
				free(rt, M_MRTABLE);
			fail:
				free(rte, M_MRTABLE);
				m_freem(mb0);
				splx(s);
				return (ENOBUFS);
			}

			/* insert new entry at head of hash chain */
			rt->mfc_origin = ip->ip_src;
			rt->mfc_mcastgrp = ip->ip_dst;
			rt->mfc_pkt_cnt = 0;
			rt->mfc_byte_cnt = 0;
			rt->mfc_wrong_if = 0;
			/* reaped by expire_upcalls() if the daemon never answers */
			rt->mfc_expire = UPCALL_EXPIRE;
			nexpire[hash]++;
			for (i = 0; i < numvifs; i++) {
				rt->mfc_ttls[i] = 0;
				rt->mfc_flags[i] = 0;
			}
			rt->mfc_parent = -1;

			/* clear the RP address */
			rt->mfc_rp = zeroin_addr;

			rt->mfc_bw_meter = NULL;

			/* link into table */
			LIST_INSERT_HEAD(&mfchashtbl[hash], rt, mfc_hash);
			/* Add this entry to the end of the queue */
			rt->mfc_stall = rte;
		} else {
			/* determine if q has overflowed */
			struct rtdetq **p;
			int npkts = 0;

			/*
			 * XXX ouch! we need to append to the list, but we
			 * only have a pointer to the front, so we have to
			 * scan the entire list every time.
			 */
			for (p = &rt->mfc_stall; *p != NULL; p = &(*p)->next)
				if (++npkts > MAX_UPQ) {
					mrtstat.mrts_upq_ovflw++;
				/*
				 * Also reached by goto when no vif matches
				 * the incoming interface: a silent drop,
				 * not an error (return 0).
				 */
				non_fatal:
					free(rte, M_MRTABLE);
					m_freem(mb0);
					splx(s);
					return (0);
				}

			/* Add this entry to the end of the queue */
			*p = rte;
		}

		rte->next = NULL;
		rte->m = mb0;
		rte->ifp = ifp;
#ifdef UPCALL_TIMING
		rte->t = tp;
#endif /* UPCALL_TIMING */

		splx(s);

		return (0);
	}
}
1507
1508
1509 /*ARGSUSED*/
static void
expire_upcalls(void *v)
{
	int i;

	/* XXX NOMPSAFE still need softnet_lock */
	mutex_enter(softnet_lock);
	KERNEL_LOCK(1, NULL);

	/*
	 * Periodic callout: walk every hash bucket and reap MFC entries
	 * whose upcall the daemon never answered.  mfc_expire counts down
	 * once per pass; zero means "not pending expiry".
	 */
	for (i = 0; i < MFCTBLSIZ; i++) {
		struct mfc *rt, *nrt;

		/* nexpire[] counts pending entries per bucket; skip idle ones. */
		if (nexpire[i] == 0)
			continue;

		for (rt = LIST_FIRST(&mfchashtbl[i]); rt; rt = nrt) {
			/* fetch successor first: rt may be freed below */
			nrt = LIST_NEXT(rt, mfc_hash);

			if (rt->mfc_expire == 0 || --rt->mfc_expire > 0)
				continue;
			nexpire[i]--;

			/*
			 * free the bw_meter entries
			 */
			while (rt->mfc_bw_meter != NULL) {
				struct bw_meter *x = rt->mfc_bw_meter;

				rt->mfc_bw_meter = x->bm_mfc_next;
				kmem_intr_free(x, sizeof(*x));
			}

			++mrtstat.mrts_cache_cleanups;
			if (mrtdebug & DEBUG_EXPIRE)
				log(LOG_DEBUG,
				    "expire_upcalls: expiring (%x %x)\n",
				    ntohl(rt->mfc_origin.s_addr),
				    ntohl(rt->mfc_mcastgrp.s_addr));

			expire_mfc(rt);
		}
	}

	/* Re-arm ourselves for the next pass. */
	callout_reset(&expire_upcalls_ch, EXPIRE_TIMEOUT,
	    expire_upcalls, NULL);

	KERNEL_UNLOCK_ONE(NULL);
	mutex_exit(softnet_lock);
}
1559
1560 /*
1561 * Packet forwarding routine once entry in the cache is made
1562 */
static int
ip_mdq(struct mbuf *m, struct ifnet *ifp, struct mfc *rt)
{
	struct ip *ip = mtod(m, struct ip *);
	vifi_t vifi;
	struct vif *vifp;
	struct sockaddr_in sin;
	/* payload length: total length minus the IP header */
	int plen = ntohs(ip->ip_len) - (ip->ip_hl << 2);

	/*
	 * Macro to send packet on vif. Since RSVP packets don't get counted on
	 * input, they shouldn't get counted on output, so statistics keeping is
	 * separate.
	 */
#define MC_SEND(ip, vifp, m) do {					\
	if ((vifp)->v_flags & VIFF_TUNNEL)				\
		encap_send((ip), (vifp), (m));				\
	else								\
		phyint_send((ip), (vifp), (m));				\
} while (/*CONSTCOND*/ 0)

	/*
	 * Don't forward if it didn't arrive from the parent vif for its origin.
	 */
	vifi = rt->mfc_parent;
	if ((vifi >= numvifs) || (viftable[vifi].v_ifp != ifp)) {
		/* came in the wrong interface */
		if (mrtdebug & DEBUG_FORWARD)
			log(LOG_DEBUG, "wrong if: ifp %p vifi %d vififp %p\n",
			    ifp, vifi,
			    vifi >= numvifs ? 0 : viftable[vifi].v_ifp);
		++mrtstat.mrts_wrong_if;
		++rt->mfc_wrong_if;
		/*
		 * If we are doing PIM assert processing, send a message
		 * to the routing daemon.
		 *
		 * XXX: A PIM-SM router needs the WRONGVIF detection so it
		 * can complete the SPT switch, regardless of the type
		 * of the iif (broadcast media, GRE tunnel, etc).
		 */
		if (pim_assert && (vifi < numvifs) && viftable[vifi].v_ifp) {
			struct timeval now;
			u_int32_t delta;

#ifdef PIM
			if (ifp == &multicast_register_if)
				pimstat.pims_rcv_registers_wrongiif++;
#endif

			/* Get vifi for the incoming packet */
			for (vifi = 0;
			     vifi < numvifs && viftable[vifi].v_ifp != ifp;
			     vifi++)
				;
			if (vifi >= numvifs) {
				/* The iif is not found: ignore the packet. */
				return (0);
			}

			if (rt->mfc_flags[vifi] &
			    MRT_MFC_FLAGS_DISABLE_WRONGVIF) {
				/* WRONGVIF disabled: ignore the packet */
				return (0);
			}

			microtime(&now);

			TV_DELTA(rt->mfc_last_assert, now, delta);

			/*
			 * Rate-limit WRONGVIF upcalls to at most one per
			 * ASSERT_MSG_TIME.
			 */
			if (delta > ASSERT_MSG_TIME) {
				struct igmpmsg *im;
				int hlen = ip->ip_hl << 2;
				struct mbuf *mm =
				    m_copym(m, 0, hlen, M_DONTWAIT);

				M_PULLUP(mm, hlen);
				if (mm == NULL)
					return (ENOBUFS);

				rt->mfc_last_assert = now;

				/* overlay the upcall message on the header copy */
				im = mtod(mm, struct igmpmsg *);
				im->im_msgtype = IGMPMSG_WRONGVIF;
				im->im_mbz = 0;
				im->im_vif = vifi;

				mrtstat.mrts_upcalls++;

				sockaddr_in_init(&sin, &im->im_src, 0);
				if (socket_send(ip_mrouter, mm, &sin) < 0) {
					log(LOG_WARNING,
					    "ip_mforward: ip_mrouter socket queue full\n");
					++mrtstat.mrts_upq_sockfull;
					return (ENOBUFS);
				}
			}
		}
		return (0);
	}

	/* If I sourced this packet, it counts as output, else it was input. */
	if (in_hosteq(ip->ip_src, viftable[vifi].v_lcl_addr)) {
		viftable[vifi].v_pkt_out++;
		viftable[vifi].v_bytes_out += plen;
	} else {
		viftable[vifi].v_pkt_in++;
		viftable[vifi].v_bytes_in += plen;
	}
	rt->mfc_pkt_cnt++;
	rt->mfc_byte_cnt += plen;

	/*
	 * For each vif, decide if a copy of the packet should be forwarded.
	 * Forward if:
	 *		- the ttl exceeds the vif's threshold
	 *		- there are group members downstream on interface
	 */
	for (vifp = viftable, vifi = 0; vifi < numvifs; vifp++, vifi++)
		if ((rt->mfc_ttls[vifi] > 0) &&
		    (ip->ip_ttl > rt->mfc_ttls[vifi])) {
			vifp->v_pkt_out++;
			vifp->v_bytes_out += plen;
#ifdef PIM
			if (vifp->v_flags & VIFF_REGISTER)
				pim_register_send(ip, vifp, m, rt);
			else
#endif
			MC_SEND(ip, vifp, m);
		}

	/*
	 * Perform upcall-related bw measuring.
	 */
	if (rt->mfc_bw_meter != NULL) {
		struct bw_meter *x;
		struct timeval now;

		microtime(&now);
		for (x = rt->mfc_bw_meter; x != NULL; x = x->bm_mfc_next)
			bw_meter_receive_packet(x, plen, &now);
	}

	return (0);
}
1708
1709 static void
1710 phyint_send(struct ip *ip, struct vif *vifp, struct mbuf *m)
1711 {
1712 struct mbuf *mb_copy;
1713 int hlen = ip->ip_hl << 2;
1714
1715 /*
1716 * Make a new reference to the packet; make sure that
1717 * the IP header is actually copied, not just referenced,
1718 * so that ip_output() only scribbles on the copy.
1719 */
1720 mb_copy = m_copypacket(m, M_DONTWAIT);
1721 M_PULLUP(mb_copy, hlen);
1722 if (mb_copy == NULL)
1723 return;
1724
1725 if (vifp->v_rate_limit <= 0)
1726 tbf_send_packet(vifp, mb_copy);
1727 else
1728 tbf_control(vifp, mb_copy, mtod(mb_copy, struct ip *),
1729 ntohs(ip->ip_len));
1730 }
1731
static void
encap_send(struct ip *ip, struct vif *vifp, struct mbuf *m)
{
	struct mbuf *mb_copy;
	struct ip *ip_copy;
	int i, len = ntohs(ip->ip_len) + sizeof(multicast_encap_iphdr);

	/*
	 * Take care of delayed checksums: the inner packet must carry a
	 * final checksum, since the outer header will hide it from any
	 * outbound hardware offload.
	 */
	if (m->m_pkthdr.csum_flags & (M_CSUM_TCPv4|M_CSUM_UDPv4)) {
		in_delayed_cksum(m);
		m->m_pkthdr.csum_flags &= ~(M_CSUM_TCPv4|M_CSUM_UDPv4);
	}

	/*
	 * copy the old packet & pullup its IP header into the
	 * new mbuf so we can modify it.  Try to fill the new
	 * mbuf since if we don't the ethernet driver will.
	 */
	MGETHDR(mb_copy, M_DONTWAIT, MT_DATA);
	if (mb_copy == NULL)
		return;
	mb_copy->m_data += max_linkhdr;
	mb_copy->m_pkthdr.len = len;
	mb_copy->m_len = sizeof(multicast_encap_iphdr);

	if ((mb_copy->m_next = m_copypacket(m, M_DONTWAIT)) == NULL) {
		m_freem(mb_copy);
		return;
	}
	/* pull as much of the packet as fits into the first mbuf */
	i = MHLEN - max_linkhdr;
	if (i > len)
		i = len;
	mb_copy = m_pullup(mb_copy, i);
	if (mb_copy == NULL)
		return;

	/*
	 * fill in the encapsulating IP header.
	 */
	ip_copy = mtod(mb_copy, struct ip *);
	*ip_copy = multicast_encap_iphdr;
	if (len < IP_MINFRAGSIZE)
		ip_copy->ip_id = 0;	/* too small to ever be fragmented */
	else
		ip_copy->ip_id = ip_newid(NULL);
	ip_copy->ip_len = htons(len);
	ip_copy->ip_src = vifp->v_lcl_addr;
	ip_copy->ip_dst = vifp->v_rmt_addr;

	/*
	 * turn the encapsulated IP header back into a valid one.
	 * The checksum is computed over the inner header only, by
	 * temporarily advancing m_data past the outer header.
	 */
	ip = (struct ip *)((char *)ip_copy + sizeof(multicast_encap_iphdr));
	--ip->ip_ttl;
	ip->ip_sum = 0;
	mb_copy->m_data += sizeof(multicast_encap_iphdr);
	ip->ip_sum = in_cksum(mb_copy, ip->ip_hl << 2);
	mb_copy->m_data -= sizeof(multicast_encap_iphdr);

	/* A rate limit of zero (or less) means "send immediately". */
	if (vifp->v_rate_limit <= 0)
		tbf_send_packet(vifp, mb_copy);
	else
		tbf_control(vifp, mb_copy, ip, ntohs(ip_copy->ip_len));
}
1796
1797 /*
1798 * De-encapsulate a packet and feed it back through ip input.
1799 */
1800 static void
1801 vif_input(struct mbuf *m, int off, int proto, void *eparg)
1802 {
1803 struct vif *vifp = eparg;
1804
1805 KASSERT(vifp != NULL);
1806
1807 if (proto != ENCAP_PROTO) {
1808 m_freem(m);
1809 mrtstat.mrts_bad_tunnel++;
1810 return;
1811 }
1812
1813 m_adj(m, off);
1814 m_set_rcvif(m, vifp->v_ifp);
1815
1816 if (__predict_false(!pktq_enqueue(ip_pktq, m, 0))) {
1817 m_freem(m);
1818 }
1819 }
1820
1821 /*
1822 * Check if the packet should be received on the vif denoted by arg.
1823 * (The encap selection code will call this once per vif since each is
1824 * registered separately.)
1825 */
static int
vif_encapcheck(struct mbuf *m, int off, int proto, void *arg)
{
	struct vif *vifp;
	struct ip ip;

#ifdef DIAGNOSTIC
	if (!arg || proto != IPPROTO_IPV4)
		panic("unexpected arg in vif_encapcheck");
#endif

	/*
	 * Accept the packet only if the inner header is multicast
	 * and the outer header matches a tunnel-mode vif. Order
	 * checks in the hope that common non-matching packets will be
	 * rejected quickly. Assume that unicast IPv4 traffic in a
	 * parallel tunnel (e.g. gif(4)) is unlikely.
	 */

	/* Obtain the outer IP header and the vif pointer. */
	m_copydata((struct mbuf *)m, 0, sizeof(ip), (void *)&ip);
	vifp = (struct vif *)arg;

	/*
	 * The outer source must match the vif's remote peer address.
	 * For a multicast router with several tunnels, this is the
	 * only check that will fail on packets in other tunnels,
	 * assuming the local address is the same.
	 */
	if (!in_hosteq(vifp->v_rmt_addr, ip.ip_src))
		return 0;

	/* The outer destination must match the vif's local address. */
	if (!in_hosteq(vifp->v_lcl_addr, ip.ip_dst))
		return 0;

	/* The vif must be of tunnel type. */
	if ((vifp->v_flags & VIFF_TUNNEL) == 0)
		return 0;

	/* Check that the inner destination is multicast. */
	m_copydata((struct mbuf *)m, off, sizeof(ip), (void *)&ip);
	if (!IN_MULTICAST(ip.ip_dst.s_addr))
		return 0;

	/*
	 * We have checked that both the outer src and dst addresses
	 * match the vif, and that the inner destination is multicast
	 * (224/5). By claiming more than 64 (here 32 + 32 + 5 = 69
	 * matched bits), we intend to preferentially take packets
	 * that also match a parallel gif(4).
	 */
	return 32 + 32 + 5;
}
1880
1881 /*
1882 * Token bucket filter module
1883 */
static void
tbf_control(struct vif *vifp, struct mbuf *m, struct ip *ip, u_int32_t len)
{

	if (len > MAX_BKT_SIZE) {
		/* drop if packet is too large: it could never fit the bucket */
		mrtstat.mrts_pkt2large++;
		m_freem(m);
		return;
	}

	/* credit tokens accumulated since the last packet */
	tbf_update_tokens(vifp);

	/*
	 * If there are enough tokens, and the queue is empty, send this packet
	 * out immediately. Otherwise, try to insert it on this vif's queue.
	 */
	if (vifp->tbf_q_len == 0) {
		if (len <= vifp->tbf_n_tok) {
			vifp->tbf_n_tok -= len;
			tbf_send_packet(vifp, m);
		} else {
			/* queue packet and timeout till later */
			tbf_queue(vifp, m);
			callout_reset(&vifp->v_repq_ch, TBF_REPROCESS,
			    tbf_reprocess_q, vifp);
		}
	} else {
		if (vifp->tbf_q_len >= vifp->tbf_max_q_len &&
		    !tbf_dq_sel(vifp, ip)) {
			/* queue full, and couldn't make room */
			mrtstat.mrts_q_overflow++;
			m_freem(m);
		} else {
			/* queue length low enough, or made room */
			tbf_queue(vifp, m);
			tbf_process_q(vifp);
		}
	}
}
1924
1925 /*
1926 * adds a packet to the queue at the interface
1927 */
1928 static void
1929 tbf_queue(struct vif *vifp, struct mbuf *m)
1930 {
1931 int s = splsoftnet();
1932
1933 /* insert at tail */
1934 *vifp->tbf_t = m;
1935 vifp->tbf_t = &m->m_nextpkt;
1936 vifp->tbf_q_len++;
1937
1938 splx(s);
1939 }
1940
1941
1942 /*
1943 * processes the queue at the interface
1944 */
static void
tbf_process_q(struct vif *vifp)
{
	struct mbuf *m;
	int len;
	int s = splsoftnet();

	/*
	 * Loop through the queue at the interface and send as many packets
	 * as possible, in FIFO order.  Stop at the first packet the token
	 * bucket cannot cover; nothing behind it is considered.
	 */
	for (m = vifp->tbf_q; m != NULL; m = vifp->tbf_q) {
		len = ntohs(mtod(m, struct ip *)->ip_len);

		/* determine if the packet can be sent */
		if (len <= vifp->tbf_n_tok) {
			/* if so,
			 * reduce no of tokens, dequeue the packet,
			 * send the packet.
			 */
			if ((vifp->tbf_q = m->m_nextpkt) == NULL)
				/* queue now empty: reset the tail pointer */
				vifp->tbf_t = &vifp->tbf_q;
			--vifp->tbf_q_len;

			m->m_nextpkt = NULL;
			vifp->tbf_n_tok -= len;
			tbf_send_packet(vifp, m);
		} else
			break;
	}
	splx(s);
}
1977
1978 static void
1979 tbf_reprocess_q(void *arg)
1980 {
1981 struct vif *vifp = arg;
1982
1983 if (ip_mrouter == NULL)
1984 return;
1985
1986 tbf_update_tokens(vifp);
1987 tbf_process_q(vifp);
1988
1989 if (vifp->tbf_q_len != 0)
1990 callout_reset(&vifp->v_repq_ch, TBF_REPROCESS,
1991 tbf_reprocess_q, vifp);
1992 }
1993
1994 /* function that will selectively discard a member of the queue
1995 * based on the precedence value and the priority
1996 */
static int
tbf_dq_sel(struct vif *vifp, struct ip *ip)
{
	u_int p;
	struct mbuf **mp, *m;
	int s = splsoftnet();

	p = priority(vifp, ip);

	/*
	 * Scan the queue for the first packet with a strictly lower
	 * priority than the incoming one; unlink and drop it to make
	 * room.  mp tracks the predecessor's m_nextpkt slot so the
	 * unlink is a single pointer store.
	 */
	for (mp = &vifp->tbf_q, m = *mp;
	     m != NULL;
	     mp = &m->m_nextpkt, m = *mp) {
		if (p > priority(vifp, mtod(m, struct ip *))) {
			if ((*mp = m->m_nextpkt) == NULL)
				/* dropped the tail: pull the tail pointer back */
				vifp->tbf_t = mp;
			--vifp->tbf_q_len;

			m_freem(m);
			mrtstat.mrts_drop_sel++;
			splx(s);
			return (1);	/* made room */
		}
	}
	splx(s);
	return (0);		/* nothing lower-priority to evict */
}
2023
static void
tbf_send_packet(struct vif *vifp, struct mbuf *m)
{
	int error;
	int s = splsoftnet();

	if (vifp->v_flags & VIFF_TUNNEL) {
		/* If tunnel options */
		ip_output(m, NULL, &vifp->v_route, IP_FORWARDING, NULL, NULL);
	} else {
		/* if physical interface option, extract the options and then send */
		struct ip_moptions imo;

		imo.imo_multicast_if_index = if_get_index(vifp->v_ifp);
		/* the TTL the copy goes out with; original TTL minus one */
		imo.imo_multicast_ttl = mtod(m, struct ip *)->ip_ttl - 1;
		imo.imo_multicast_loop = 1;

		error = ip_output(m, NULL, NULL, IP_FORWARDING|IP_MULTICASTOPTS,
		    &imo, NULL);

		if (mrtdebug & DEBUG_XMIT)
			log(LOG_DEBUG, "phyint_send on vif %ld err %d\n",
			    (long)(vifp - viftable), error);
	}
	splx(s);
}
2050
2051 /* determine the current time and then
2052 * the elapsed time (between the last time and time now)
2053 * in milliseconds & update the no. of tokens in the bucket
2054 */
static void
tbf_update_tokens(struct vif *vifp)
{
	struct timeval tp;
	u_int32_t tm;	/* elapsed time since the last update */
	int s = splsoftnet();

	microtime(&tp);

	TV_DELTA(tp, vifp->tbf_last_pkt_t, tm);

	/*
	 * This formula is actually
	 * "time in seconds" * "bytes/second".
	 *
	 * (tm / 1000000) * (v_rate_limit * 1000 * (1000/1024) / 8)
	 *
	 * The (1000/1024) was introduced in add_vif to optimize
	 * this divide into a shift.
	 */
	vifp->tbf_n_tok += tm * vifp->v_rate_limit / 8192;
	vifp->tbf_last_pkt_t = tp;

	/* never accumulate more than one full bucket */
	if (vifp->tbf_n_tok > MAX_BKT_SIZE)
		vifp->tbf_n_tok = MAX_BKT_SIZE;

	splx(s);
}
2083
2084 static int
2085 priority(struct vif *vifp, struct ip *ip)
2086 {
2087 int prio = 50; /* the lowest priority -- default case */
2088
2089 /* temporary hack; may add general packet classifier some day */
2090
2091 /*
2092 * The UDP port space is divided up into four priority ranges:
2093 * [0, 16384) : unclassified - lowest priority
2094 * [16384, 32768) : audio - highest priority
2095 * [32768, 49152) : whiteboard - medium priority
2096 * [49152, 65536) : video - low priority
2097 */
2098 if (ip->ip_p == IPPROTO_UDP) {
2099 struct udphdr *udp = (struct udphdr *)(((char *)ip) + (ip->ip_hl << 2));
2100
2101 switch (ntohs(udp->uh_dport) & 0xc000) {
2102 case 0x4000:
2103 prio = 70;
2104 break;
2105 case 0x8000:
2106 prio = 60;
2107 break;
2108 case 0xc000:
2109 prio = 55;
2110 break;
2111 }
2112
2113 if (tbfdebug > 1)
2114 log(LOG_DEBUG, "port %x prio %d\n",
2115 ntohs(udp->uh_dport), prio);
2116 }
2117
2118 return (prio);
2119 }
2120
2121 /*
2122 * Code for bandwidth monitors
2123 */
2124
2125 /*
2126 * Define common interface for timeval-related methods
2127 */
2128 #define BW_TIMEVALCMP(tvp, uvp, cmp) timercmp((tvp), (uvp), cmp)
2129 #define BW_TIMEVALDECR(vvp, uvp) timersub((vvp), (uvp), (vvp))
2130 #define BW_TIMEVALADD(vvp, uvp) timeradd((vvp), (uvp), (vvp))
2131
2132 static uint32_t
2133 compute_bw_meter_flags(struct bw_upcall *req)
2134 {
2135 uint32_t flags = 0;
2136
2137 if (req->bu_flags & BW_UPCALL_UNIT_PACKETS)
2138 flags |= BW_METER_UNIT_PACKETS;
2139 if (req->bu_flags & BW_UPCALL_UNIT_BYTES)
2140 flags |= BW_METER_UNIT_BYTES;
2141 if (req->bu_flags & BW_UPCALL_GEQ)
2142 flags |= BW_METER_GEQ;
2143 if (req->bu_flags & BW_UPCALL_LEQ)
2144 flags |= BW_METER_LEQ;
2145
2146 return flags;
2147 }
2148
2149 /*
2150 * Add a bw_meter entry
2151 */
static int
add_bw_upcall(struct bw_upcall *req)
{
	int s;
	struct mfc *mfc;
	struct timeval delta = { BW_UPCALL_THRESHOLD_INTERVAL_MIN_SEC,
		    BW_UPCALL_THRESHOLD_INTERVAL_MIN_USEC };
	struct timeval now;
	struct bw_meter *x;
	uint32_t flags;

	if (!(mrt_api_config & MRT_MFC_BW_UPCALL))
		return EOPNOTSUPP;

	/*
	 * Test if the flags are valid: exactly one measurement unit
	 * (packets and/or bytes allowed together) and exactly one of
	 * GEQ/LEQ must be selected.
	 */
	if (!(req->bu_flags & (BW_UPCALL_UNIT_PACKETS | BW_UPCALL_UNIT_BYTES)))
		return EINVAL;
	if (!(req->bu_flags & (BW_UPCALL_GEQ | BW_UPCALL_LEQ)))
		return EINVAL;
	if ((req->bu_flags & (BW_UPCALL_GEQ | BW_UPCALL_LEQ))
	    == (BW_UPCALL_GEQ | BW_UPCALL_LEQ))
		return EINVAL;

	/* Test if the threshold time interval is valid */
	if (BW_TIMEVALCMP(&req->bu_threshold.b_time, &delta, <))
		return EINVAL;

	flags = compute_bw_meter_flags(req);

	/*
	 * Find if we have already same bw_meter entry
	 */
	s = splsoftnet();
	mfc = mfc_find(&req->bu_src, &req->bu_dst);
	if (mfc == NULL) {
		splx(s);
		return EADDRNOTAVAIL;
	}
	for (x = mfc->mfc_bw_meter; x != NULL; x = x->bm_mfc_next) {
		if ((BW_TIMEVALCMP(&x->bm_threshold.b_time,
		    &req->bu_threshold.b_time, ==)) &&
		    (x->bm_threshold.b_packets == req->bu_threshold.b_packets) &&
		    (x->bm_threshold.b_bytes == req->bu_threshold.b_bytes) &&
		    (x->bm_flags & BW_METER_USER_FLAGS) == flags) {
			splx(s);
			return 0;		/* XXX Already installed */
		}
	}

	/* Allocate the new bw_meter entry; KM_NOSLEEP since we hold splsoftnet */
	x = kmem_intr_alloc(sizeof(*x), KM_NOSLEEP);
	if (x == NULL) {
		splx(s);
		return ENOBUFS;
	}

	/* Set the new bw_meter entry */
	x->bm_threshold.b_time = req->bu_threshold.b_time;
	microtime(&now);
	x->bm_start_time = now;
	x->bm_threshold.b_packets = req->bu_threshold.b_packets;
	x->bm_threshold.b_bytes = req->bu_threshold.b_bytes;
	x->bm_measured.b_packets = 0;
	x->bm_measured.b_bytes = 0;
	x->bm_flags = flags;
	x->bm_time_next = NULL;
	/* BW_METER_BUCKETS marks the meter as not on the timer wheel yet */
	x->bm_time_hash = BW_METER_BUCKETS;

	/* Add the new bw_meter entry to the front of entries for this MFC */
	x->bm_mfc = mfc;
	x->bm_mfc_next = mfc->mfc_bw_meter;
	mfc->mfc_bw_meter = x;
	schedule_bw_meter(x, &now);
	splx(s);

	return 0;
}
2229
2230 static void
2231 free_bw_list(struct bw_meter *list)
2232 {
2233 while (list != NULL) {
2234 struct bw_meter *x = list;
2235
2236 list = list->bm_mfc_next;
2237 unschedule_bw_meter(x);
2238 kmem_intr_free(x, sizeof(*x));
2239 }
2240 }
2241
2242 /*
2243 * Delete one or multiple bw_meter entries
2244 */
static int
del_bw_upcall(struct bw_upcall *req)
{
	int s;
	struct mfc *mfc;
	struct bw_meter *x;

	if (!(mrt_api_config & MRT_MFC_BW_UPCALL))
		return EOPNOTSUPP;

	s = splsoftnet();
	/* Find the corresponding MFC entry */
	mfc = mfc_find(&req->bu_src, &req->bu_dst);
	if (mfc == NULL) {
		splx(s);
		return EADDRNOTAVAIL;
	} else if (req->bu_flags & BW_UPCALL_DELETE_ALL) {
		/*
		 * Delete all bw_meter entries for this mfc
		 */
		struct bw_meter *list;

		list = mfc->mfc_bw_meter;
		mfc->mfc_bw_meter = NULL;
		free_bw_list(list);
		splx(s);
		return 0;
	} else {		/* Delete a single bw_meter entry */
		struct bw_meter *prev;
		uint32_t flags = 0;

		flags = compute_bw_meter_flags(req);

		/*
		 * Find the bw_meter entry to delete; it must match the
		 * request's threshold and user-visible flags exactly.
		 */
		for (prev = NULL, x = mfc->mfc_bw_meter; x != NULL;
		     prev = x, x = x->bm_mfc_next) {
			if ((BW_TIMEVALCMP(&x->bm_threshold.b_time,
			    &req->bu_threshold.b_time, ==)) &&
			    (x->bm_threshold.b_packets == req->bu_threshold.b_packets) &&
			    (x->bm_threshold.b_bytes == req->bu_threshold.b_bytes) &&
			    (x->bm_flags & BW_METER_USER_FLAGS) == flags)
				break;
		}
		if (x != NULL) { /* Delete entry from the list for this MFC */
			if (prev != NULL)
				prev->bm_mfc_next = x->bm_mfc_next;	/* remove from middle*/
			else
				x->bm_mfc->mfc_bw_meter = x->bm_mfc_next;/* new head of list */

			unschedule_bw_meter(x);
			splx(s);
			/* Free the bw_meter entry */
			kmem_intr_free(x, sizeof(*x));
			return 0;
		} else {
			splx(s);
			return EINVAL;
		}
	}
	/* NOTREACHED */
}
2306
2307 /*
2308 * Perform bandwidth measurement processing that may result in an upcall
2309 */
static void
bw_meter_receive_packet(struct bw_meter *x, int plen, struct timeval *nowp)
{
	struct timeval delta;

	/* time elapsed since the start of the current measuring interval */
	delta = *nowp;
	BW_TIMEVALDECR(&delta, &x->bm_start_time);

	if (x->bm_flags & BW_METER_GEQ) {
		/*
		 * Processing for ">=" type of bw_meter entry
		 */
		if (BW_TIMEVALCMP(&delta, &x->bm_threshold.b_time, >)) {
			/* Reset the bw_meter entry */
			x->bm_start_time = *nowp;
			x->bm_measured.b_packets = 0;
			x->bm_measured.b_bytes = 0;
			x->bm_flags &= ~BW_METER_UPCALL_DELIVERED;
		}

		/* Record that a packet is received */
		x->bm_measured.b_packets++;
		x->bm_measured.b_bytes += plen;

		/*
		 * Test if we should deliver an upcall
		 * (at most one per measuring interval).
		 */
		if (!(x->bm_flags & BW_METER_UPCALL_DELIVERED)) {
			if (((x->bm_flags & BW_METER_UNIT_PACKETS) &&
			    (x->bm_measured.b_packets >= x->bm_threshold.b_packets)) ||
			    ((x->bm_flags & BW_METER_UNIT_BYTES) &&
			    (x->bm_measured.b_bytes >= x->bm_threshold.b_bytes))) {
				/* Prepare an upcall for delivery */
				bw_meter_prepare_upcall(x, nowp);
				x->bm_flags |= BW_METER_UPCALL_DELIVERED;
			}
		}
	} else if (x->bm_flags & BW_METER_LEQ) {
		/*
		 * Processing for "<=" type of bw_meter entry
		 */
		if (BW_TIMEVALCMP(&delta, &x->bm_threshold.b_time, >)) {
			/*
			 * We are behind time with the multicast forwarding table
			 * scanning for "<=" type of bw_meter entries, so test now
			 * if we should deliver an upcall.
			 */
			if (((x->bm_flags & BW_METER_UNIT_PACKETS) &&
			    (x->bm_measured.b_packets <= x->bm_threshold.b_packets)) ||
			    ((x->bm_flags & BW_METER_UNIT_BYTES) &&
			    (x->bm_measured.b_bytes <= x->bm_threshold.b_bytes))) {
				/* Prepare an upcall for delivery */
				bw_meter_prepare_upcall(x, nowp);
			}
			/* Reschedule the bw_meter entry */
			unschedule_bw_meter(x);
			schedule_bw_meter(x, nowp);
		}

		/* Record that a packet is received */
		x->bm_measured.b_packets++;
		x->bm_measured.b_bytes += plen;

		/*
		 * Test if we should restart the measuring interval
		 */
		if ((x->bm_flags & BW_METER_UNIT_PACKETS &&
		    x->bm_measured.b_packets <= x->bm_threshold.b_packets) ||
		    (x->bm_flags & BW_METER_UNIT_BYTES &&
		    x->bm_measured.b_bytes <= x->bm_threshold.b_bytes)) {
			/* Don't restart the measuring interval */
		} else {
			/* Do restart the measuring interval */
			/*
			 * XXX: note that we don't unschedule and schedule, because this
			 * might be too much overhead per packet. Instead, when we process
			 * all entries for a given timer hash bin, we check whether it is
			 * really a timeout. If not, we reschedule at that time.
			 */
			x->bm_start_time = *nowp;
			x->bm_measured.b_packets = 0;
			x->bm_measured.b_bytes = 0;
			x->bm_flags &= ~BW_METER_UPCALL_DELIVERED;
		}
	}
}
2396
2397 /*
2398 * Prepare a bandwidth-related upcall
2399 */
2400 static void
2401 bw_meter_prepare_upcall(struct bw_meter *x, struct timeval *nowp)
2402 {
2403 struct timeval delta;
2404 struct bw_upcall *u;
2405
2406 /*
2407 * Compute the measured time interval
2408 */
2409 delta = *nowp;
2410 BW_TIMEVALDECR(&delta, &x->bm_start_time);
2411
2412 /*
2413 * If there are too many pending upcalls, deliver them now
2414 */
2415 if (bw_upcalls_n >= BW_UPCALLS_MAX)
2416 bw_upcalls_send();
2417
2418 /*
2419 * Set the bw_upcall entry
2420 */
2421 u = &bw_upcalls[bw_upcalls_n++];
2422 u->bu_src = x->bm_mfc->mfc_origin;
2423 u->bu_dst = x->bm_mfc->mfc_mcastgrp;
2424 u->bu_threshold.b_time = x->bm_threshold.b_time;
2425 u->bu_threshold.b_packets = x->bm_threshold.b_packets;
2426 u->bu_threshold.b_bytes = x->bm_threshold.b_bytes;
2427 u->bu_measured.b_time = delta;
2428 u->bu_measured.b_packets = x->bm_measured.b_packets;
2429 u->bu_measured.b_bytes = x->bm_measured.b_bytes;
2430 u->bu_flags = 0;
2431 if (x->bm_flags & BW_METER_UNIT_PACKETS)
2432 u->bu_flags |= BW_UPCALL_UNIT_PACKETS;
2433 if (x->bm_flags & BW_METER_UNIT_BYTES)
2434 u->bu_flags |= BW_UPCALL_UNIT_BYTES;
2435 if (x->bm_flags & BW_METER_GEQ)
2436 u->bu_flags |= BW_UPCALL_GEQ;
2437 if (x->bm_flags & BW_METER_LEQ)
2438 u->bu_flags |= BW_UPCALL_LEQ;
2439 }
2440
2441 /*
2442 * Send the pending bandwidth-related upcalls
2443 */
2444 static void
2445 bw_upcalls_send(void)
2446 {
2447 struct mbuf *m;
2448 int len = bw_upcalls_n * sizeof(bw_upcalls[0]);
2449 struct sockaddr_in k_igmpsrc = {
2450 .sin_len = sizeof(k_igmpsrc),
2451 .sin_family = AF_INET,
2452 };
2453 static struct igmpmsg igmpmsg = { 0, /* unused1 */
2454 0, /* unused2 */
2455 IGMPMSG_BW_UPCALL,/* im_msgtype */
2456 0, /* im_mbz */
2457 0, /* im_vif */
2458 0, /* unused3 */
2459 { 0 }, /* im_src */
2460 { 0 } }; /* im_dst */
2461
2462 if (bw_upcalls_n == 0)
2463 return; /* No pending upcalls */
2464
2465 bw_upcalls_n = 0;
2466
2467 /*
2468 * Allocate a new mbuf, initialize it with the header and
2469 * the payload for the pending calls.
2470 */
2471 MGETHDR(m, M_DONTWAIT, MT_HEADER);
2472 if (m == NULL) {
2473 log(LOG_WARNING, "bw_upcalls_send: cannot allocate mbuf\n");
2474 return;
2475 }
2476
2477 m->m_len = m->m_pkthdr.len = 0;
2478 m_copyback(m, 0, sizeof(struct igmpmsg), (void *)&igmpmsg);
2479 m_copyback(m, sizeof(struct igmpmsg), len, (void *)&bw_upcalls[0]);
2480
2481 /*
2482 * Send the upcalls
2483 * XXX do we need to set the address in k_igmpsrc ?
2484 */
2485 mrtstat.mrts_upcalls++;
2486 if (socket_send(ip_mrouter, m, &k_igmpsrc) < 0) {
2487 log(LOG_WARNING, "bw_upcalls_send: ip_mrouter socket queue full\n");
2488 ++mrtstat.mrts_upq_sockfull;
2489 }
2490 }
2491
2492 /*
2493 * Compute the timeout hash value for the bw_meter entries
2494 */
2495 #define BW_METER_TIMEHASH(bw_meter, hash) \
2496 do { \
2497 struct timeval next_timeval = (bw_meter)->bm_start_time; \
2498 \
2499 BW_TIMEVALADD(&next_timeval, &(bw_meter)->bm_threshold.b_time); \
2500 (hash) = next_timeval.tv_sec; \
2501 if (next_timeval.tv_usec) \
2502 (hash)++; /* XXX: make sure we don't timeout early */ \
2503 (hash) %= BW_METER_BUCKETS; \
2504 } while (/*CONSTCOND*/ 0)
2505
2506 /*
2507 * Schedule a timer to process periodically bw_meter entry of type "<="
2508 * by linking the entry in the proper hash bucket.
2509 */
2510 static void
2511 schedule_bw_meter(struct bw_meter *x, struct timeval *nowp)
2512 {
2513 int time_hash;
2514
2515 if (!(x->bm_flags & BW_METER_LEQ))
2516 return; /* XXX: we schedule timers only for "<=" entries */
2517
2518 /*
2519 * Reset the bw_meter entry
2520 */
2521 x->bm_start_time = *nowp;
2522 x->bm_measured.b_packets = 0;
2523 x->bm_measured.b_bytes = 0;
2524 x->bm_flags &= ~BW_METER_UPCALL_DELIVERED;
2525
2526 /*
2527 * Compute the timeout hash value and insert the entry
2528 */
2529 BW_METER_TIMEHASH(x, time_hash);
2530 x->bm_time_next = bw_meter_timers[time_hash];
2531 bw_meter_timers[time_hash] = x;
2532 x->bm_time_hash = time_hash;
2533 }
2534
2535 /*
2536 * Unschedule the periodic timer that processes bw_meter entry of type "<="
2537 * by removing the entry from the proper hash bucket.
2538 */
2539 static void
2540 unschedule_bw_meter(struct bw_meter *x)
2541 {
2542 int time_hash;
2543 struct bw_meter *prev, *tmp;
2544
2545 if (!(x->bm_flags & BW_METER_LEQ))
2546 return; /* XXX: we schedule timers only for "<=" entries */
2547
2548 /*
2549 * Compute the timeout hash value and delete the entry
2550 */
2551 time_hash = x->bm_time_hash;
2552 if (time_hash >= BW_METER_BUCKETS)
2553 return; /* Entry was not scheduled */
2554
2555 for (prev = NULL, tmp = bw_meter_timers[time_hash];
2556 tmp != NULL; prev = tmp, tmp = tmp->bm_time_next)
2557 if (tmp == x)
2558 break;
2559
2560 if (tmp == NULL)
2561 panic("unschedule_bw_meter: bw_meter entry not found");
2562
2563 if (prev != NULL)
2564 prev->bm_time_next = x->bm_time_next;
2565 else
2566 bw_meter_timers[time_hash] = x->bm_time_next;
2567
2568 x->bm_time_next = NULL;
2569 x->bm_time_hash = BW_METER_BUCKETS;
2570 }
2571
2572 /*
2573 * Process all "<=" type of bw_meter that should be processed now,
2574 * and for each entry prepare an upcall if necessary. Each processed
2575 * entry is rescheduled again for the (periodic) processing.
2576 *
2577 * This is run periodically (once per second normally). On each round,
2578 * all the potentially matching entries are in the hash slot that we are
2579 * looking at.
2580 */
2581 static void
2582 bw_meter_process(void)
2583 {
2584 int s;
2585 static uint32_t last_tv_sec; /* last time we processed this */
2586
2587 uint32_t loops;
2588 int i;
2589 struct timeval now, process_endtime;
2590
2591 microtime(&now);
2592 if (last_tv_sec == now.tv_sec)
2593 return; /* nothing to do */
2594
2595 loops = now.tv_sec - last_tv_sec;
2596 last_tv_sec = now.tv_sec;
2597 if (loops > BW_METER_BUCKETS)
2598 loops = BW_METER_BUCKETS;
2599
2600 s = splsoftnet();
2601 /*
2602 * Process all bins of bw_meter entries from the one after the last
2603 * processed to the current one. On entry, i points to the last bucket
2604 * visited, so we need to increment i at the beginning of the loop.
2605 */
2606 for (i = (now.tv_sec - loops) % BW_METER_BUCKETS; loops > 0; loops--) {
2607 struct bw_meter *x, *tmp_list;
2608
2609 if (++i >= BW_METER_BUCKETS)
2610 i = 0;
2611
2612 /* Disconnect the list of bw_meter entries from the bin */
2613 tmp_list = bw_meter_timers[i];
2614 bw_meter_timers[i] = NULL;
2615
2616 /* Process the list of bw_meter entries */
2617 while (tmp_list != NULL) {
2618 x = tmp_list;
2619 tmp_list = tmp_list->bm_time_next;
2620
2621 /* Test if the time interval is over */
2622 process_endtime = x->bm_start_time;
2623 BW_TIMEVALADD(&process_endtime, &x->bm_threshold.b_time);
2624 if (BW_TIMEVALCMP(&process_endtime, &now, >)) {
2625 /* Not yet: reschedule, but don't reset */
2626 int time_hash;
2627
2628 BW_METER_TIMEHASH(x, time_hash);
2629 if (time_hash == i && process_endtime.tv_sec == now.tv_sec) {
2630 /*
2631 * XXX: somehow the bin processing is a bit ahead of time.
2632 * Put the entry in the next bin.
2633 */
2634 if (++time_hash >= BW_METER_BUCKETS)
2635 time_hash = 0;
2636 }
2637 x->bm_time_next = bw_meter_timers[time_hash];
2638 bw_meter_timers[time_hash] = x;
2639 x->bm_time_hash = time_hash;
2640
2641 continue;
2642 }
2643
2644 /*
2645 * Test if we should deliver an upcall
2646 */
2647 if (((x->bm_flags & BW_METER_UNIT_PACKETS) &&
2648 (x->bm_measured.b_packets <= x->bm_threshold.b_packets)) ||
2649 ((x->bm_flags & BW_METER_UNIT_BYTES) &&
2650 (x->bm_measured.b_bytes <= x->bm_threshold.b_bytes))) {
2651 /* Prepare an upcall for delivery */
2652 bw_meter_prepare_upcall(x, &now);
2653 }
2654
2655 /*
2656 * Reschedule for next processing
2657 */
2658 schedule_bw_meter(x, &now);
2659 }
2660 }
2661
2662 /* Send all upcalls that are pending delivery */
2663 bw_upcalls_send();
2664
2665 splx(s);
2666 }
2667
2668 /*
2669 * A periodic function for sending all upcalls that are pending delivery
2670 */
2671 static void
2672 expire_bw_upcalls_send(void *unused)
2673 {
2674 int s;
2675
2676 s = splsoftnet();
2677 bw_upcalls_send();
2678 splx(s);
2679
2680 callout_reset(&bw_upcalls_ch, BW_UPCALLS_PERIOD,
2681 expire_bw_upcalls_send, NULL);
2682 }
2683
2684 /*
2685 * A periodic function for periodic scanning of the multicast forwarding
2686 * table for processing all "<=" bw_meter entries.
2687 */
2688 static void
2689 expire_bw_meter_process(void *unused)
2690 {
2691 if (mrt_api_config & MRT_MFC_BW_UPCALL)
2692 bw_meter_process();
2693
2694 callout_reset(&bw_meter_ch, BW_METER_PERIOD,
2695 expire_bw_meter_process, NULL);
2696 }
2697
2698 /*
2699 * End of bandwidth monitoring code
2700 */
2701
2702 #ifdef PIM
2703 /*
2704 * Send the packet up to the user daemon, or eventually do kernel encapsulation
2705 */
2706 static int
2707 pim_register_send(struct ip *ip, struct vif *vifp,
2708 struct mbuf *m, struct mfc *rt)
2709 {
2710 struct mbuf *mb_copy, *mm;
2711
2712 if (mrtdebug & DEBUG_PIM)
2713 log(LOG_DEBUG, "pim_register_send: \n");
2714
2715 mb_copy = pim_register_prepare(ip, m);
2716 if (mb_copy == NULL)
2717 return ENOBUFS;
2718
2719 /*
2720 * Send all the fragments. Note that the mbuf for each fragment
2721 * is freed by the sending machinery.
2722 */
2723 for (mm = mb_copy; mm; mm = mb_copy) {
2724 mb_copy = mm->m_nextpkt;
2725 mm->m_nextpkt = NULL;
2726 mm = m_pullup(mm, sizeof(struct ip));
2727 if (mm != NULL) {
2728 ip = mtod(mm, struct ip *);
2729 if ((mrt_api_config & MRT_MFC_RP) &&
2730 !in_nullhost(rt->mfc_rp)) {
2731 pim_register_send_rp(ip, vifp, mm, rt);
2732 } else {
2733 pim_register_send_upcall(ip, vifp, mm, rt);
2734 }
2735 }
2736 }
2737
2738 return 0;
2739 }
2740
2741 /*
2742 * Return a copy of the data packet that is ready for PIM Register
2743 * encapsulation.
2744 * XXX: Note that in the returned copy the IP header is a valid one.
2745 */
2746 static struct mbuf *
2747 pim_register_prepare(struct ip *ip, struct mbuf *m)
2748 {
2749 struct mbuf *mb_copy = NULL;
2750 int mtu;
2751
2752 /* Take care of delayed checksums */
2753 if (m->m_pkthdr.csum_flags & (M_CSUM_TCPv4|M_CSUM_UDPv4)) {
2754 in_delayed_cksum(m);
2755 m->m_pkthdr.csum_flags &= ~(M_CSUM_TCPv4|M_CSUM_UDPv4);
2756 }
2757
2758 /*
2759 * Copy the old packet & pullup its IP header into the
2760 * new mbuf so we can modify it.
2761 */
2762 mb_copy = m_copypacket(m, M_DONTWAIT);
2763 if (mb_copy == NULL)
2764 return NULL;
2765 mb_copy = m_pullup(mb_copy, ip->ip_hl << 2);
2766 if (mb_copy == NULL)
2767 return NULL;
2768
2769 /* take care of the TTL */
2770 ip = mtod(mb_copy, struct ip *);
2771 --ip->ip_ttl;
2772
2773 /* Compute the MTU after the PIM Register encapsulation */
2774 mtu = 0xffff - sizeof(pim_encap_iphdr) - sizeof(pim_encap_pimhdr);
2775
2776 if (ntohs(ip->ip_len) <= mtu) {
2777 /* Turn the IP header into a valid one */
2778 ip->ip_sum = 0;
2779 ip->ip_sum = in_cksum(mb_copy, ip->ip_hl << 2);
2780 } else {
2781 /* Fragment the packet */
2782 if (ip_fragment(mb_copy, NULL, mtu) != 0) {
2783 /* XXX: mb_copy was freed by ip_fragment() */
2784 return NULL;
2785 }
2786 }
2787 return mb_copy;
2788 }
2789
2790 /*
2791 * Send an upcall with the data packet to the user-level process.
2792 */
2793 static int
2794 pim_register_send_upcall(struct ip *ip, struct vif *vifp,
2795 struct mbuf *mb_copy, struct mfc *rt)
2796 {
2797 struct mbuf *mb_first;
2798 int len = ntohs(ip->ip_len);
2799 struct igmpmsg *im;
2800 struct sockaddr_in k_igmpsrc = {
2801 .sin_len = sizeof(k_igmpsrc),
2802 .sin_family = AF_INET,
2803 };
2804
2805 /*
2806 * Add a new mbuf with an upcall header
2807 */
2808 MGETHDR(mb_first, M_DONTWAIT, MT_HEADER);
2809 if (mb_first == NULL) {
2810 m_freem(mb_copy);
2811 return ENOBUFS;
2812 }
2813 mb_first->m_data += max_linkhdr;
2814 mb_first->m_pkthdr.len = len + sizeof(struct igmpmsg);
2815 mb_first->m_len = sizeof(struct igmpmsg);
2816 mb_first->m_next = mb_copy;
2817
2818 /* Send message to routing daemon */
2819 im = mtod(mb_first, struct igmpmsg *);
2820 im->im_msgtype = IGMPMSG_WHOLEPKT;
2821 im->im_mbz = 0;
2822 im->im_vif = vifp - viftable;
2823 im->im_src = ip->ip_src;
2824 im->im_dst = ip->ip_dst;
2825
2826 k_igmpsrc.sin_addr = ip->ip_src;
2827
2828 mrtstat.mrts_upcalls++;
2829
2830 if (socket_send(ip_mrouter, mb_first, &k_igmpsrc) < 0) {
2831 if (mrtdebug & DEBUG_PIM)
2832 log(LOG_WARNING,
2833 "mcast: pim_register_send_upcall: ip_mrouter socket queue full\n");
2834 ++mrtstat.mrts_upq_sockfull;
2835 return ENOBUFS;
2836 }
2837
2838 /* Keep statistics */
2839 pimstat.pims_snd_registers_msgs++;
2840 pimstat.pims_snd_registers_bytes += len;
2841
2842 return 0;
2843 }
2844
2845 /*
2846 * Encapsulate the data packet in PIM Register message and send it to the RP.
2847 */
2848 static int
2849 pim_register_send_rp(struct ip *ip, struct vif *vifp,
2850 struct mbuf *mb_copy, struct mfc *rt)
2851 {
2852 struct mbuf *mb_first;
2853 struct ip *ip_outer;
2854 struct pim_encap_pimhdr *pimhdr;
2855 int len = ntohs(ip->ip_len);
2856 vifi_t vifi = rt->mfc_parent;
2857
2858 if ((vifi >= numvifs) || in_nullhost(viftable[vifi].v_lcl_addr)) {
2859 m_freem(mb_copy);
2860 return EADDRNOTAVAIL; /* The iif vif is invalid */
2861 }
2862
2863 /*
2864 * Add a new mbuf with the encapsulating header
2865 */
2866 MGETHDR(mb_first, M_DONTWAIT, MT_HEADER);
2867 if (mb_first == NULL) {
2868 m_freem(mb_copy);
2869 return ENOBUFS;
2870 }
2871 mb_first->m_data += max_linkhdr;
2872 mb_first->m_len = sizeof(pim_encap_iphdr) + sizeof(pim_encap_pimhdr);
2873 mb_first->m_next = mb_copy;
2874
2875 mb_first->m_pkthdr.len = len + mb_first->m_len;
2876
2877 /*
2878 * Fill in the encapsulating IP and PIM header
2879 */
2880 ip_outer = mtod(mb_first, struct ip *);
2881 *ip_outer = pim_encap_iphdr;
2882 if (mb_first->m_pkthdr.len < IP_MINFRAGSIZE)
2883 ip_outer->ip_id = 0;
2884 else
2885 ip_outer->ip_id = ip_newid(NULL);
2886 ip_outer->ip_len = htons(len + sizeof(pim_encap_iphdr) +
2887 sizeof(pim_encap_pimhdr));
2888 ip_outer->ip_src = viftable[vifi].v_lcl_addr;
2889 ip_outer->ip_dst = rt->mfc_rp;
2890 /*
2891 * Copy the inner header TOS to the outer header, and take care of the
2892 * IP_DF bit.
2893 */
2894 ip_outer->ip_tos = ip->ip_tos;
2895 if (ntohs(ip->ip_off) & IP_DF)
2896 ip_outer->ip_off |= htons(IP_DF);
2897 pimhdr = (struct pim_encap_pimhdr *)((char *)ip_outer
2898 + sizeof(pim_encap_iphdr));
2899 *pimhdr = pim_encap_pimhdr;
2900 /* If the iif crosses a border, set the Border-bit */
2901 if (rt->mfc_flags[vifi] & MRT_MFC_FLAGS_BORDER_VIF & mrt_api_config)
2902 pimhdr->flags |= htonl(PIM_BORDER_REGISTER);
2903
2904 mb_first->m_data += sizeof(pim_encap_iphdr);
2905 pimhdr->pim.pim_cksum = in_cksum(mb_first, sizeof(pim_encap_pimhdr));
2906 mb_first->m_data -= sizeof(pim_encap_iphdr);
2907
2908 if (vifp->v_rate_limit == 0)
2909 tbf_send_packet(vifp, mb_first);
2910 else
2911 tbf_control(vifp, mb_first, ip, ntohs(ip_outer->ip_len));
2912
2913 /* Keep statistics */
2914 pimstat.pims_snd_registers_msgs++;
2915 pimstat.pims_snd_registers_bytes += len;
2916
2917 return 0;
2918 }
2919
2920 /*
2921 * PIM-SMv2 and PIM-DM messages processing.
2922 * Receives and verifies the PIM control messages, and passes them
2923 * up to the listening socket, using rip_input().
2924 * The only message with special processing is the PIM_REGISTER message
2925 * (used by PIM-SM): the PIM header is stripped off, and the inner packet
2926 * is passed to if_simloop().
2927 */
2928 void
2929 pim_input(struct mbuf *m, ...)
2930 {
2931 struct ip *ip = mtod(m, struct ip *);
2932 struct pim *pim;
2933 int minlen;
2934 int datalen;
2935 int ip_tos;
2936 int proto;
2937 int iphlen;
2938 va_list ap;
2939
2940 va_start(ap, m);
2941 iphlen = va_arg(ap, int);
2942 proto = va_arg(ap, int);
2943 va_end(ap);
2944
2945 datalen = ntohs(ip->ip_len) - iphlen;
2946
2947 /* Keep statistics */
2948 pimstat.pims_rcv_total_msgs++;
2949 pimstat.pims_rcv_total_bytes += datalen;
2950
2951 /*
2952 * Validate lengths
2953 */
2954 if (datalen < PIM_MINLEN) {
2955 pimstat.pims_rcv_tooshort++;
2956 log(LOG_ERR, "pim_input: packet size too small %d from %lx\n",
2957 datalen, (u_long)ip->ip_src.s_addr);
2958 m_freem(m);
2959 return;
2960 }
2961
2962 /*
2963 * If the packet is at least as big as a REGISTER, go agead
2964 * and grab the PIM REGISTER header size, to avoid another
2965 * possible m_pullup() later.
2966 *
2967 * PIM_MINLEN == pimhdr + u_int32_t == 4 + 4 = 8
2968 * PIM_REG_MINLEN == pimhdr + reghdr + encap_iphdr == 4 + 4 + 20 = 28
2969 */
2970 minlen = iphlen + (datalen >= PIM_REG_MINLEN ? PIM_REG_MINLEN : PIM_MINLEN);
2971 /*
2972 * Get the IP and PIM headers in contiguous memory, and
2973 * possibly the PIM REGISTER header.
2974 */
2975 if ((m->m_flags & M_EXT || m->m_len < minlen) &&
2976 (m = m_pullup(m, minlen)) == NULL) {
2977 log(LOG_ERR, "pim_input: m_pullup failure\n");
2978 return;
2979 }
2980 /* m_pullup() may have given us a new mbuf so reset ip. */
2981 ip = mtod(m, struct ip *);
2982 ip_tos = ip->ip_tos;
2983
2984 /* adjust mbuf to point to the PIM header */
2985 m->m_data += iphlen;
2986 m->m_len -= iphlen;
2987 pim = mtod(m, struct pim *);
2988
2989 /*
2990 * Validate checksum. If PIM REGISTER, exclude the data packet.
2991 *
2992 * XXX: some older PIMv2 implementations don't make this distinction,
2993 * so for compatibility reason perform the checksum over part of the
2994 * message, and if error, then over the whole message.
2995 */
2996 if (PIM_VT_T(pim->pim_vt) == PIM_REGISTER && in_cksum(m, PIM_MINLEN) == 0) {
2997 /* do nothing, checksum okay */
2998 } else if (in_cksum(m, datalen)) {
2999 pimstat.pims_rcv_badsum++;
3000 if (mrtdebug & DEBUG_PIM)
3001 log(LOG_DEBUG, "pim_input: invalid checksum\n");
3002 m_freem(m);
3003 return;
3004 }
3005
3006 /* PIM version check */
3007 if (PIM_VT_V(pim->pim_vt) < PIM_VERSION) {
3008 pimstat.pims_rcv_badversion++;
3009 log(LOG_ERR, "pim_input: incorrect version %d, expecting %d\n",
3010 PIM_VT_V(pim->pim_vt), PIM_VERSION);
3011 m_freem(m);
3012 return;
3013 }
3014
3015 /* restore mbuf back to the outer IP */
3016 m->m_data -= iphlen;
3017 m->m_len += iphlen;
3018
3019 if (PIM_VT_T(pim->pim_vt) == PIM_REGISTER) {
3020 /*
3021 * Since this is a REGISTER, we'll make a copy of the register
3022 * headers ip + pim + u_int32 + encap_ip, to be passed up to the
3023 * routing daemon.
3024 */
3025 int s;
3026 struct sockaddr_in dst = {
3027 .sin_len = sizeof(dst),
3028 .sin_family = AF_INET,
3029 };
3030 struct mbuf *mcp;
3031 struct ip *encap_ip;
3032 u_int32_t *reghdr;
3033 struct ifnet *vifp;
3034
3035 s = splsoftnet();
3036 if ((reg_vif_num >= numvifs) || (reg_vif_num == VIFI_INVALID)) {
3037 splx(s);
3038 if (mrtdebug & DEBUG_PIM)
3039 log(LOG_DEBUG,
3040 "pim_input: register vif not set: %d\n", reg_vif_num);
3041 m_freem(m);
3042 return;
3043 }
3044 /* XXX need refcnt? */
3045 vifp = viftable[reg_vif_num].v_ifp;
3046 splx(s);
3047
3048 /*
3049 * Validate length
3050 */
3051 if (datalen < PIM_REG_MINLEN) {
3052 pimstat.pims_rcv_tooshort++;
3053 pimstat.pims_rcv_badregisters++;
3054 log(LOG_ERR,
3055 "pim_input: register packet size too small %d from %lx\n",
3056 datalen, (u_long)ip->ip_src.s_addr);
3057 m_freem(m);
3058 return;
3059 }
3060
3061 reghdr = (u_int32_t *)(pim + 1);
3062 encap_ip = (struct ip *)(reghdr + 1);
3063
3064 if (mrtdebug & DEBUG_PIM) {
3065 log(LOG_DEBUG,
3066 "pim_input[register], encap_ip: %lx -> %lx, encap_ip len %d\n",
3067 (u_long)ntohl(encap_ip->ip_src.s_addr),
3068 (u_long)ntohl(encap_ip->ip_dst.s_addr),
3069 ntohs(encap_ip->ip_len));
3070 }
3071
3072 /* verify the version number of the inner packet */
3073 if (encap_ip->ip_v != IPVERSION) {
3074 pimstat.pims_rcv_badregisters++;
3075 if (mrtdebug & DEBUG_PIM) {
3076 log(LOG_DEBUG, "pim_input: invalid IP version (%d) "
3077 "of the inner packet\n", encap_ip->ip_v);
3078 }
3079 m_freem(m);
3080 return;
3081 }
3082
3083 /* verify the inner packet is destined to a mcast group */
3084 if (!IN_MULTICAST(encap_ip->ip_dst.s_addr)) {
3085 pimstat.pims_rcv_badregisters++;
3086 if (mrtdebug & DEBUG_PIM)
3087 log(LOG_DEBUG,
3088 "pim_input: inner packet of register is not "
3089 "multicast %lx\n",
3090 (u_long)ntohl(encap_ip->ip_dst.s_addr));
3091 m_freem(m);
3092 return;
3093 }
3094
3095 /* If a NULL_REGISTER, pass it to the daemon */
3096 if ((ntohl(*reghdr) & PIM_NULL_REGISTER))
3097 goto pim_input_to_daemon;
3098
3099 /*
3100 * Copy the TOS from the outer IP header to the inner IP header.
3101 */
3102 if (encap_ip->ip_tos != ip_tos) {
3103 /* Outer TOS -> inner TOS */
3104 encap_ip->ip_tos = ip_tos;
3105 /* Recompute the inner header checksum. Sigh... */
3106
3107 /* adjust mbuf to point to the inner IP header */
3108 m->m_data += (iphlen + PIM_MINLEN);
3109 m->m_len -= (iphlen + PIM_MINLEN);
3110
3111 encap_ip->ip_sum = 0;
3112 encap_ip->ip_sum = in_cksum(m, encap_ip->ip_hl << 2);
3113
3114 /* restore mbuf to point back to the outer IP header */
3115 m->m_data -= (iphlen + PIM_MINLEN);
3116 m->m_len += (iphlen + PIM_MINLEN);
3117 }
3118
3119 /*
3120 * Decapsulate the inner IP packet and loopback to forward it
3121 * as a normal multicast packet. Also, make a copy of the
3122 * outer_iphdr + pimhdr + reghdr + encap_iphdr
3123 * to pass to the daemon later, so it can take the appropriate
3124 * actions (e.g., send back PIM_REGISTER_STOP).
3125 * XXX: here m->m_data points to the outer IP header.
3126 */
3127 mcp = m_copym(m, 0, iphlen + PIM_REG_MINLEN, M_DONTWAIT);
3128 if (mcp == NULL) {
3129 log(LOG_ERR,
3130 "pim_input: pim register: could not copy register head\n");
3131 m_freem(m);
3132 return;
3133 }
3134
3135 /* Keep statistics */
3136 /* XXX: registers_bytes include only the encap. mcast pkt */
3137 pimstat.pims_rcv_registers_msgs++;
3138 pimstat.pims_rcv_registers_bytes += ntohs(encap_ip->ip_len);
3139
3140 /*
3141 * forward the inner ip packet; point m_data at the inner ip.
3142 */
3143 m_adj(m, iphlen + PIM_MINLEN);
3144
3145 if (mrtdebug & DEBUG_PIM) {
3146 log(LOG_DEBUG,
3147 "pim_input: forwarding decapsulated register: "
3148 "src %lx, dst %lx, vif %d\n",
3149 (u_long)ntohl(encap_ip->ip_src.s_addr),
3150 (u_long)ntohl(encap_ip->ip_dst.s_addr),
3151 reg_vif_num);
3152 }
3153 /* NB: vifp was collected above; can it change on us? */
3154 looutput(vifp, m, (struct sockaddr *)&dst, NULL);
3155
3156 /* prepare the register head to send to the mrouting daemon */
3157 m = mcp;
3158 }
3159
3160 pim_input_to_daemon:
3161 /*
3162 * Pass the PIM message up to the daemon; if it is a Register message,
3163 * pass the 'head' only up to the daemon. This includes the
3164 * outer IP header, PIM header, PIM-Register header and the
3165 * inner IP header.
3166 * XXX: the outer IP header pkt size of a Register is not adjust to
3167 * reflect the fact that the inner multicast data is truncated.
3168 */
3169 rip_input(m, iphlen, proto);
3170
3171 return;
3172 }
3173 #endif /* PIM */
3174