ip_output.c revision 1.244 1 /* $NetBSD: ip_output.c,v 1.244 2015/07/17 02:21:08 ozaki-r Exp $ */
2
3 /*
4 * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
5 * All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in the
14 * documentation and/or other materials provided with the distribution.
15 * 3. Neither the name of the project nor the names of its contributors
16 * may be used to endorse or promote products derived from this software
17 * without specific prior written permission.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
20 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22 * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
23 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
25 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
26 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
29 * SUCH DAMAGE.
30 */
31
32 /*-
33 * Copyright (c) 1998 The NetBSD Foundation, Inc.
34 * All rights reserved.
35 *
36 * This code is derived from software contributed to The NetBSD Foundation
37 * by Public Access Networks Corporation ("Panix"). It was developed under
38 * contract to Panix by Eric Haszlakiewicz and Thor Lancelot Simon.
39 *
40 * Redistribution and use in source and binary forms, with or without
41 * modification, are permitted provided that the following conditions
42 * are met:
43 * 1. Redistributions of source code must retain the above copyright
44 * notice, this list of conditions and the following disclaimer.
45 * 2. Redistributions in binary form must reproduce the above copyright
46 * notice, this list of conditions and the following disclaimer in the
47 * documentation and/or other materials provided with the distribution.
48 *
49 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
50 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
51 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
52 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
53 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
54 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
55 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
56 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
57 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
58 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
59 * POSSIBILITY OF SUCH DAMAGE.
60 */
61
62 /*
63 * Copyright (c) 1982, 1986, 1988, 1990, 1993
64 * The Regents of the University of California. All rights reserved.
65 *
66 * Redistribution and use in source and binary forms, with or without
67 * modification, are permitted provided that the following conditions
68 * are met:
69 * 1. Redistributions of source code must retain the above copyright
70 * notice, this list of conditions and the following disclaimer.
71 * 2. Redistributions in binary form must reproduce the above copyright
72 * notice, this list of conditions and the following disclaimer in the
73 * documentation and/or other materials provided with the distribution.
74 * 3. Neither the name of the University nor the names of its contributors
75 * may be used to endorse or promote products derived from this software
76 * without specific prior written permission.
77 *
78 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
79 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
80 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
81 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
82 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
83 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
84 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
85 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
86 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
87 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
88 * SUCH DAMAGE.
89 *
90 * @(#)ip_output.c 8.3 (Berkeley) 1/21/94
91 */
92
93 #include <sys/cdefs.h>
94 __KERNEL_RCSID(0, "$NetBSD: ip_output.c,v 1.244 2015/07/17 02:21:08 ozaki-r Exp $");
95
96 #include "opt_inet.h"
97 #include "opt_ipsec.h"
98 #include "opt_mrouting.h"
99 #include "opt_net_mpsafe.h"
100 #include "opt_mpls.h"
101
102 #include <sys/param.h>
103 #include <sys/kmem.h>
104 #include <sys/mbuf.h>
105 #include <sys/protosw.h>
106 #include <sys/socket.h>
107 #include <sys/socketvar.h>
108 #include <sys/kauth.h>
109 #ifdef IPSEC
110 #include <sys/domain.h>
111 #endif
112 #include <sys/systm.h>
113
114 #include <net/if.h>
115 #include <net/if_types.h>
116 #include <net/route.h>
117 #include <net/pfil.h>
118
119 #include <netinet/in.h>
120 #include <netinet/in_systm.h>
121 #include <netinet/ip.h>
122 #include <netinet/in_pcb.h>
123 #include <netinet/in_var.h>
124 #include <netinet/ip_var.h>
125 #include <netinet/ip_private.h>
126 #include <netinet/in_offload.h>
127 #include <netinet/portalgo.h>
128 #include <netinet/udp.h>
129
130 #ifdef INET6
131 #include <netinet6/ip6_var.h>
132 #endif
133
134 #ifdef MROUTING
135 #include <netinet/ip_mroute.h>
136 #endif
137
138 #ifdef IPSEC
139 #include <netipsec/ipsec.h>
140 #include <netipsec/key.h>
141 #endif
142
143 #ifdef MPLS
144 #include <netmpls/mpls.h>
145 #include <netmpls/mpls_var.h>
146 #endif
147
148 static int ip_pcbopts(struct inpcb *, const struct sockopt *);
149 static struct mbuf *ip_insertoptions(struct mbuf *, struct mbuf *, int *);
150 static struct ifnet *ip_multicast_if(struct in_addr *, int *);
151 static void ip_mloopback(struct ifnet *, struct mbuf *,
152 const struct sockaddr_in *);
153
154 extern pfil_head_t *inet_pfil_hook; /* XXX */
155
156 int ip_do_loopback_cksum = 0;
157
158 static bool
159 ip_hresolv_needed(const struct ifnet * const ifp)
160 {
161 switch (ifp->if_type) {
162 case IFT_ARCNET:
163 case IFT_ATM:
164 case IFT_ECONET:
165 case IFT_ETHER:
166 case IFT_FDDI:
167 case IFT_HIPPI:
168 case IFT_IEEE1394:
169 case IFT_ISO88025:
170 case IFT_SLIP:
171 return true;
172 default:
173 return false;
174 }
175 }
176
177 static int
178 klock_if_output(struct ifnet * const ifp, struct mbuf * const m,
179 const struct sockaddr * const dst, struct rtentry *rt)
180 {
181 int error;
182
183 #ifndef NET_MPSAFE
184 KERNEL_LOCK(1, NULL);
185 #endif
186
187 error = (*ifp->if_output)(ifp, m, dst, rt);
188
189 #ifndef NET_MPSAFE
190 KERNEL_UNLOCK_ONE(NULL);
191 #endif
192
193 return error;
194 }
195
196 /*
197 * Send an IP packet to a host.
198 *
199 * If necessary, resolve the arbitrary IP route, rt0, to an IP host route before
200 * calling ifp's output routine.
201 */
202 int
203 ip_hresolv_output(struct ifnet * const ifp0, struct mbuf * const m,
204 const struct sockaddr * const dst, struct rtentry *rt00)
205 {
206 int error = 0;
207 struct ifnet *ifp = ifp0;
208 struct rtentry *rt, *rt0, *gwrt;
209
210 #define RTFREE_IF_NEEDED(_rt) \
211 if ((_rt) != NULL && (_rt) != rt00) \
212 rtfree((_rt));
213
214 rt0 = rt00;
215 retry:
216 if (!ip_hresolv_needed(ifp)) {
217 rt = rt0;
218 goto out;
219 }
220
221 if (rt0 == NULL) {
222 rt = NULL;
223 goto out;
224 }
225
226 rt = rt0;
227
228 /*
229 * The following block is highly questionable. How did we get here
230 * with a !RTF_UP route? Does rtalloc1() always return an RTF_UP
231 * route?
232 */
233 if ((rt->rt_flags & RTF_UP) == 0) {
234 rt = rtalloc1(dst, 1);
235 if (rt == NULL) {
236 error = EHOSTUNREACH;
237 goto bad;
238 }
239 rt0 = rt;
240 if (rt->rt_ifp != ifp) {
241 ifp = rt->rt_ifp;
242 goto retry;
243 }
244 }
245
246 if ((rt->rt_flags & RTF_GATEWAY) == 0)
247 goto out;
248
249 gwrt = rt_get_gwroute(rt);
250 RTFREE_IF_NEEDED(rt);
251 rt = gwrt;
252 if (rt == NULL || (rt->rt_flags & RTF_UP) == 0) {
253 if (rt != NULL) {
254 RTFREE_IF_NEEDED(rt);
255 rt = rt0;
256 }
257 if (rt == NULL) {
258 error = EHOSTUNREACH;
259 goto bad;
260 }
261 gwrt = rt->rt_gwroute = rtalloc1(rt->rt_gateway, 1);
262 RTFREE_IF_NEEDED(rt);
263 rt = gwrt;
264 if (rt == NULL) {
265 error = EHOSTUNREACH;
266 goto bad;
267 }
268 /* the "G" test below also prevents rt == rt0 */
269 if ((rt->rt_flags & RTF_GATEWAY) != 0 || rt->rt_ifp != ifp) {
270 rt0->rt_gwroute = NULL;
271 error = EHOSTUNREACH;
272 goto bad;
273 }
274 }
275 if ((rt->rt_flags & RTF_REJECT) != 0) {
276 if (rt->rt_rmx.rmx_expire == 0 ||
277 time_second < rt->rt_rmx.rmx_expire) {
278 error = (rt == rt0) ? EHOSTDOWN : EHOSTUNREACH;
279 goto bad;
280 }
281 }
282
283 out:
284 #ifdef MPLS
285 if (rt0 != NULL && rt_gettag(rt0) != NULL &&
286 rt_gettag(rt0)->sa_family == AF_MPLS &&
287 (m->m_flags & (M_MCAST | M_BCAST)) == 0 &&
288 ifp->if_type == IFT_ETHER) {
289 union mpls_shim msh;
290 msh.s_addr = MPLS_GETSADDR(rt0);
291 if (msh.shim.label != MPLS_LABEL_IMPLNULL) {
292 struct m_tag *mtag;
293 /*
294 * XXX tentative solution to tell ether_output
295 * it's MPLS. Need some more efficient solution.
296 */
297 mtag = m_tag_get(PACKET_TAG_MPLS,
298 sizeof(int) /* dummy */,
299 M_NOWAIT);
300 if (mtag == NULL) {
301 error = ENOMEM;
302 goto bad;
303 }
304 m_tag_prepend(m, mtag);
305 }
306 }
307 #endif
308
309 error = klock_if_output(ifp, m, dst, rt);
310 goto exit;
311
312 bad:
313 if (m != NULL)
314 m_freem(m);
315 exit:
316 RTFREE_IF_NEEDED(rt);
317
318 return error;
319
320 #undef RTFREE_IF_NEEDED
321 }
322
323 /*
324 * IP output. The packet in mbuf chain m contains a skeletal IP
325 * header (with len, off, ttl, proto, tos, src, dst).
326 * The mbuf chain containing the packet will be freed.
327 * The mbuf opt, if present, will not be freed.
328 */
329 int
330 ip_output(struct mbuf *m0, ...)
331 {
332 struct rtentry *rt;
333 struct ip *ip;
334 struct ifnet *ifp;
335 struct mbuf *m = m0;
336 int hlen = sizeof (struct ip);
337 int len, error = 0;
338 struct route iproute;
339 const struct sockaddr_in *dst;
340 struct in_ifaddr *ia;
341 int isbroadcast;
342 struct mbuf *opt;
343 struct route *ro;
344 int flags, sw_csum;
345 u_long mtu;
346 struct ip_moptions *imo;
347 struct socket *so;
348 va_list ap;
349 #ifdef IPSEC
350 struct secpolicy *sp = NULL;
351 #endif
352 bool natt_frag = false;
353 bool rtmtu_nolock;
354 union {
355 struct sockaddr dst;
356 struct sockaddr_in dst4;
357 } u;
358 struct sockaddr *rdst = &u.dst; /* real IP destination, as opposed
359 * to the nexthop
360 */
361
362 len = 0;
363 va_start(ap, m0);
364 opt = va_arg(ap, struct mbuf *);
365 ro = va_arg(ap, struct route *);
366 flags = va_arg(ap, int);
367 imo = va_arg(ap, struct ip_moptions *);
368 so = va_arg(ap, struct socket *);
369 va_end(ap);
370
371 MCLAIM(m, &ip_tx_mowner);
372
373 KASSERT((m->m_flags & M_PKTHDR) != 0);
374 KASSERT((m->m_pkthdr.csum_flags & (M_CSUM_TCPv6|M_CSUM_UDPv6)) == 0);
375 KASSERT((m->m_pkthdr.csum_flags & (M_CSUM_TCPv4|M_CSUM_UDPv4)) !=
376 (M_CSUM_TCPv4|M_CSUM_UDPv4));
377
378 if (opt) {
379 m = ip_insertoptions(m, opt, &len);
380 if (len >= sizeof(struct ip))
381 hlen = len;
382 }
383 ip = mtod(m, struct ip *);
384
385 /*
386 * Fill in IP header.
387 */
388 if ((flags & (IP_FORWARDING|IP_RAWOUTPUT)) == 0) {
389 ip->ip_v = IPVERSION;
390 ip->ip_off = htons(0);
391 /* ip->ip_id filled in after we find out source ia */
392 ip->ip_hl = hlen >> 2;
393 IP_STATINC(IP_STAT_LOCALOUT);
394 } else {
395 hlen = ip->ip_hl << 2;
396 }
397
398 /*
399 * Route packet.
400 */
401 if (ro == NULL) {
402 memset(&iproute, 0, sizeof(iproute));
403 ro = &iproute;
404 }
405 sockaddr_in_init(&u.dst4, &ip->ip_dst, 0);
406 dst = satocsin(rtcache_getdst(ro));
407
408 /*
409 * If there is a cached route, check that it is to the same
410 * destination and is still up. If not, free it and try again.
411 * The address family should also be checked in case of sharing
412 * the cache with IPv6.
413 */
414 if (dst && (dst->sin_family != AF_INET ||
415 !in_hosteq(dst->sin_addr, ip->ip_dst)))
416 rtcache_free(ro);
417
418 if ((rt = rtcache_validate(ro)) == NULL &&
419 (rt = rtcache_update(ro, 1)) == NULL) {
420 dst = &u.dst4;
421 error = rtcache_setdst(ro, &u.dst);
422 if (error != 0)
423 goto bad;
424 }
425
426 /*
427 * If routing to interface only, short circuit routing lookup.
428 */
429 if (flags & IP_ROUTETOIF) {
430 if ((ia = ifatoia(ifa_ifwithladdr(sintocsa(dst)))) == NULL) {
431 IP_STATINC(IP_STAT_NOROUTE);
432 error = ENETUNREACH;
433 goto bad;
434 }
435 ifp = ia->ia_ifp;
436 mtu = ifp->if_mtu;
437 ip->ip_ttl = 1;
438 isbroadcast = in_broadcast(dst->sin_addr, ifp);
439 } else if ((IN_MULTICAST(ip->ip_dst.s_addr) ||
440 ip->ip_dst.s_addr == INADDR_BROADCAST) &&
441 imo != NULL && imo->imo_multicast_ifp != NULL) {
442 ifp = imo->imo_multicast_ifp;
443 mtu = ifp->if_mtu;
444 IFP_TO_IA(ifp, ia);
445 isbroadcast = 0;
446 } else {
447 if (rt == NULL)
448 rt = rtcache_init(ro);
449 if (rt == NULL) {
450 IP_STATINC(IP_STAT_NOROUTE);
451 error = EHOSTUNREACH;
452 goto bad;
453 }
454 ia = ifatoia(rt->rt_ifa);
455 ifp = rt->rt_ifp;
456 if ((mtu = rt->rt_rmx.rmx_mtu) == 0)
457 mtu = ifp->if_mtu;
458 rt->rt_use++;
459 if (rt->rt_flags & RTF_GATEWAY)
460 dst = satosin(rt->rt_gateway);
461 if (rt->rt_flags & RTF_HOST)
462 isbroadcast = rt->rt_flags & RTF_BROADCAST;
463 else
464 isbroadcast = in_broadcast(dst->sin_addr, ifp);
465 }
466 rtmtu_nolock = rt && (rt->rt_rmx.rmx_locks & RTV_MTU) == 0;
467
468 if (IN_MULTICAST(ip->ip_dst.s_addr) ||
469 (ip->ip_dst.s_addr == INADDR_BROADCAST)) {
470 bool inmgroup;
471
472 m->m_flags |= (ip->ip_dst.s_addr == INADDR_BROADCAST) ?
473 M_BCAST : M_MCAST;
474 /*
475 * See if the caller provided any multicast options
476 */
477 if (imo != NULL)
478 ip->ip_ttl = imo->imo_multicast_ttl;
479 else
480 ip->ip_ttl = IP_DEFAULT_MULTICAST_TTL;
481
482 /*
483 * if we don't know the outgoing ifp yet, we can't generate
484 * output
485 */
486 if (!ifp) {
487 IP_STATINC(IP_STAT_NOROUTE);
488 error = ENETUNREACH;
489 goto bad;
490 }
491
492 /*
493 * If the packet is multicast or broadcast, confirm that
494 * the outgoing interface can transmit it.
495 */
496 if (((m->m_flags & M_MCAST) &&
497 (ifp->if_flags & IFF_MULTICAST) == 0) ||
498 ((m->m_flags & M_BCAST) &&
499 (ifp->if_flags & (IFF_BROADCAST|IFF_POINTOPOINT)) == 0)) {
500 IP_STATINC(IP_STAT_NOROUTE);
501 error = ENETUNREACH;
502 goto bad;
503 }
504 /*
505 * If source address not specified yet, use an address
506 * of outgoing interface.
507 */
508 if (in_nullhost(ip->ip_src)) {
509 struct in_ifaddr *xia;
510 struct ifaddr *xifa;
511
512 IFP_TO_IA(ifp, xia);
513 if (!xia) {
514 error = EADDRNOTAVAIL;
515 goto bad;
516 }
517 xifa = &xia->ia_ifa;
518 if (xifa->ifa_getifa != NULL) {
519 xia = ifatoia((*xifa->ifa_getifa)(xifa, rdst));
520 if (xia == NULL) {
521 error = EADDRNOTAVAIL;
522 goto bad;
523 }
524 }
525 ip->ip_src = xia->ia_addr.sin_addr;
526 }
527
528 inmgroup = in_multi_group(ip->ip_dst, ifp, flags);
529 if (inmgroup && (imo == NULL || imo->imo_multicast_loop)) {
530 /*
531 * If we belong to the destination multicast group
532 * on the outgoing interface, and the caller did not
533 * forbid loopback, loop back a copy.
534 */
535 ip_mloopback(ifp, m, &u.dst4);
536 }
537 #ifdef MROUTING
538 else {
539 /*
540 * If we are acting as a multicast router, perform
541 * multicast forwarding as if the packet had just
542 * arrived on the interface to which we are about
543 * to send. The multicast forwarding function
544 * recursively calls this function, using the
545 * IP_FORWARDING flag to prevent infinite recursion.
546 *
547 * Multicasts that are looped back by ip_mloopback(),
548 * above, will be forwarded by the ip_input() routine,
549 * if necessary.
550 */
551 extern struct socket *ip_mrouter;
552
553 if (ip_mrouter && (flags & IP_FORWARDING) == 0) {
554 if (ip_mforward(m, ifp) != 0) {
555 m_freem(m);
556 goto done;
557 }
558 }
559 }
560 #endif
561 /*
562 * Multicasts with a time-to-live of zero may be looped-
563 * back, above, but must not be transmitted on a network.
564 * Also, multicasts addressed to the loopback interface
565 * are not sent -- the above call to ip_mloopback() will
566 * loop back a copy if this host actually belongs to the
567 * destination group on the loopback interface.
568 */
569 if (ip->ip_ttl == 0 || (ifp->if_flags & IFF_LOOPBACK) != 0) {
570 m_freem(m);
571 goto done;
572 }
573 goto sendit;
574 }
575
576 /*
577 * If source address not specified yet, use address
578 * of outgoing interface.
579 */
580 if (in_nullhost(ip->ip_src)) {
581 struct ifaddr *xifa;
582
583 xifa = &ia->ia_ifa;
584 if (xifa->ifa_getifa != NULL) {
585 ia = ifatoia((*xifa->ifa_getifa)(xifa, rdst));
586 if (ia == NULL) {
587 error = EADDRNOTAVAIL;
588 goto bad;
589 }
590 }
591 ip->ip_src = ia->ia_addr.sin_addr;
592 }
593
594 /*
595 * packets with Class-D address as source are not valid per
596 * RFC 1112
597 */
598 if (IN_MULTICAST(ip->ip_src.s_addr)) {
599 IP_STATINC(IP_STAT_ODROPPED);
600 error = EADDRNOTAVAIL;
601 goto bad;
602 }
603
604 /*
605 * Look for broadcast address and and verify user is allowed to
606 * send such a packet.
607 */
608 if (isbroadcast) {
609 if ((ifp->if_flags & IFF_BROADCAST) == 0) {
610 error = EADDRNOTAVAIL;
611 goto bad;
612 }
613 if ((flags & IP_ALLOWBROADCAST) == 0) {
614 error = EACCES;
615 goto bad;
616 }
617 /* don't allow broadcast messages to be fragmented */
618 if (ntohs(ip->ip_len) > ifp->if_mtu) {
619 error = EMSGSIZE;
620 goto bad;
621 }
622 m->m_flags |= M_BCAST;
623 } else
624 m->m_flags &= ~M_BCAST;
625
626 sendit:
627 if ((flags & (IP_FORWARDING|IP_NOIPNEWID)) == 0) {
628 if (m->m_pkthdr.len < IP_MINFRAGSIZE) {
629 ip->ip_id = 0;
630 } else if ((m->m_pkthdr.csum_flags & M_CSUM_TSOv4) == 0) {
631 ip->ip_id = ip_newid(ia);
632 } else {
633
634 /*
635 * TSO capable interfaces (typically?) increment
636 * ip_id for each segment.
637 * "allocate" enough ids here to increase the chance
638 * for them to be unique.
639 *
640 * note that the following calculation is not
641 * needed to be precise. wasting some ip_id is fine.
642 */
643
644 unsigned int segsz = m->m_pkthdr.segsz;
645 unsigned int datasz = ntohs(ip->ip_len) - hlen;
646 unsigned int num = howmany(datasz, segsz);
647
648 ip->ip_id = ip_newid_range(ia, num);
649 }
650 }
651
652 /*
653 * If we're doing Path MTU Discovery, we need to set DF unless
654 * the route's MTU is locked.
655 */
656 if ((flags & IP_MTUDISC) != 0 && rtmtu_nolock) {
657 ip->ip_off |= htons(IP_DF);
658 }
659
660 #ifdef IPSEC
661 if (ipsec_used) {
662 bool ipsec_done = false;
663
664 /* Perform IPsec processing, if any. */
665 error = ipsec4_output(m, so, flags, &sp, &mtu, &natt_frag,
666 &ipsec_done);
667 if (error || ipsec_done)
668 goto done;
669 }
670 #endif
671
672 /*
673 * Run through list of hooks for output packets.
674 */
675 error = pfil_run_hooks(inet_pfil_hook, &m, ifp, PFIL_OUT);
676 if (error)
677 goto done;
678 if (m == NULL)
679 goto done;
680
681 ip = mtod(m, struct ip *);
682 hlen = ip->ip_hl << 2;
683
684 m->m_pkthdr.csum_data |= hlen << 16;
685
686 #if IFA_STATS
687 /*
688 * search for the source address structure to
689 * maintain output statistics.
690 */
691 INADDR_TO_IA(ip->ip_src, ia);
692 #endif
693
694 /* Maybe skip checksums on loopback interfaces. */
695 if (IN_NEED_CHECKSUM(ifp, M_CSUM_IPv4)) {
696 m->m_pkthdr.csum_flags |= M_CSUM_IPv4;
697 }
698 sw_csum = m->m_pkthdr.csum_flags & ~ifp->if_csum_flags_tx;
699 /*
700 * If small enough for mtu of path, or if using TCP segmentation
701 * offload, can just send directly.
702 */
703 if (ntohs(ip->ip_len) <= mtu ||
704 (m->m_pkthdr.csum_flags & M_CSUM_TSOv4) != 0) {
705 const struct sockaddr *sa;
706
707 #if IFA_STATS
708 if (ia)
709 ia->ia_ifa.ifa_data.ifad_outbytes += ntohs(ip->ip_len);
710 #endif
711 /*
712 * Always initialize the sum to 0! Some HW assisted
713 * checksumming requires this.
714 */
715 ip->ip_sum = 0;
716
717 if ((m->m_pkthdr.csum_flags & M_CSUM_TSOv4) == 0) {
718 /*
719 * Perform any checksums that the hardware can't do
720 * for us.
721 *
722 * XXX Does any hardware require the {th,uh}_sum
723 * XXX fields to be 0?
724 */
725 if (sw_csum & M_CSUM_IPv4) {
726 KASSERT(IN_NEED_CHECKSUM(ifp, M_CSUM_IPv4));
727 ip->ip_sum = in_cksum(m, hlen);
728 m->m_pkthdr.csum_flags &= ~M_CSUM_IPv4;
729 }
730 if (sw_csum & (M_CSUM_TCPv4|M_CSUM_UDPv4)) {
731 if (IN_NEED_CHECKSUM(ifp,
732 sw_csum & (M_CSUM_TCPv4|M_CSUM_UDPv4))) {
733 in_delayed_cksum(m);
734 }
735 m->m_pkthdr.csum_flags &=
736 ~(M_CSUM_TCPv4|M_CSUM_UDPv4);
737 }
738 }
739
740 sa = (m->m_flags & M_MCAST) ? sintocsa(rdst) : sintocsa(dst);
741 if (__predict_true(
742 (m->m_pkthdr.csum_flags & M_CSUM_TSOv4) == 0 ||
743 (ifp->if_capenable & IFCAP_TSOv4) != 0)) {
744 error = ip_hresolv_output(ifp, m, sa, rt);
745 } else {
746 error = ip_tso_output(ifp, m, sa, rt);
747 }
748 goto done;
749 }
750
751 /*
752 * We can't use HW checksumming if we're about to
753 * to fragment the packet.
754 *
755 * XXX Some hardware can do this.
756 */
757 if (m->m_pkthdr.csum_flags & (M_CSUM_TCPv4|M_CSUM_UDPv4)) {
758 if (IN_NEED_CHECKSUM(ifp,
759 m->m_pkthdr.csum_flags & (M_CSUM_TCPv4|M_CSUM_UDPv4))) {
760 in_delayed_cksum(m);
761 }
762 m->m_pkthdr.csum_flags &= ~(M_CSUM_TCPv4|M_CSUM_UDPv4);
763 }
764
765 /*
766 * Too large for interface; fragment if possible.
767 * Must be able to put at least 8 bytes per fragment.
768 */
769 if (ntohs(ip->ip_off) & IP_DF) {
770 if (flags & IP_RETURNMTU) {
771 struct inpcb *inp;
772
773 KASSERT(so && solocked(so));
774 inp = sotoinpcb(so);
775 inp->inp_errormtu = mtu;
776 }
777 error = EMSGSIZE;
778 IP_STATINC(IP_STAT_CANTFRAG);
779 goto bad;
780 }
781
782 error = ip_fragment(m, ifp, mtu);
783 if (error) {
784 m = NULL;
785 goto bad;
786 }
787
788 for (; m; m = m0) {
789 m0 = m->m_nextpkt;
790 m->m_nextpkt = 0;
791 if (error) {
792 m_freem(m);
793 continue;
794 }
795 #if IFA_STATS
796 if (ia)
797 ia->ia_ifa.ifa_data.ifad_outbytes += ntohs(ip->ip_len);
798 #endif
799 /*
800 * If we get there, the packet has not been handled by
801 * IPsec whereas it should have. Now that it has been
802 * fragmented, re-inject it in ip_output so that IPsec
803 * processing can occur.
804 */
805 if (natt_frag) {
806 error = ip_output(m, opt, ro,
807 flags | IP_RAWOUTPUT | IP_NOIPNEWID,
808 imo, so);
809 } else {
810 KASSERT((m->m_pkthdr.csum_flags &
811 (M_CSUM_UDPv4 | M_CSUM_TCPv4)) == 0);
812 error = ip_hresolv_output(ifp, m,
813 (m->m_flags & M_MCAST) ?
814 sintocsa(rdst) : sintocsa(dst), rt);
815 }
816 }
817 if (error == 0) {
818 IP_STATINC(IP_STAT_FRAGMENTED);
819 }
820 done:
821 if (ro == &iproute) {
822 rtcache_free(&iproute);
823 }
824 #ifdef IPSEC
825 if (sp) {
826 KEY_FREESP(&sp);
827 }
828 #endif
829 return error;
830 bad:
831 m_freem(m);
832 goto done;
833 }
834
835 int
836 ip_fragment(struct mbuf *m, struct ifnet *ifp, u_long mtu)
837 {
838 struct ip *ip, *mhip;
839 struct mbuf *m0;
840 int len, hlen, off;
841 int mhlen, firstlen;
842 struct mbuf **mnext;
843 int sw_csum = m->m_pkthdr.csum_flags;
844 int fragments = 0;
845 int s;
846 int error = 0;
847
848 ip = mtod(m, struct ip *);
849 hlen = ip->ip_hl << 2;
850 if (ifp != NULL)
851 sw_csum &= ~ifp->if_csum_flags_tx;
852
853 len = (mtu - hlen) &~ 7;
854 if (len < 8) {
855 m_freem(m);
856 return (EMSGSIZE);
857 }
858
859 firstlen = len;
860 mnext = &m->m_nextpkt;
861
862 /*
863 * Loop through length of segment after first fragment,
864 * make new header and copy data of each part and link onto chain.
865 */
866 m0 = m;
867 mhlen = sizeof (struct ip);
868 for (off = hlen + len; off < ntohs(ip->ip_len); off += len) {
869 MGETHDR(m, M_DONTWAIT, MT_HEADER);
870 if (m == 0) {
871 error = ENOBUFS;
872 IP_STATINC(IP_STAT_ODROPPED);
873 goto sendorfree;
874 }
875 MCLAIM(m, m0->m_owner);
876 *mnext = m;
877 mnext = &m->m_nextpkt;
878 m->m_data += max_linkhdr;
879 mhip = mtod(m, struct ip *);
880 *mhip = *ip;
881 /* we must inherit MCAST and BCAST flags */
882 m->m_flags |= m0->m_flags & (M_MCAST|M_BCAST);
883 if (hlen > sizeof (struct ip)) {
884 mhlen = ip_optcopy(ip, mhip) + sizeof (struct ip);
885 mhip->ip_hl = mhlen >> 2;
886 }
887 m->m_len = mhlen;
888 mhip->ip_off = ((off - hlen) >> 3) +
889 (ntohs(ip->ip_off) & ~IP_MF);
890 if (ip->ip_off & htons(IP_MF))
891 mhip->ip_off |= IP_MF;
892 if (off + len >= ntohs(ip->ip_len))
893 len = ntohs(ip->ip_len) - off;
894 else
895 mhip->ip_off |= IP_MF;
896 HTONS(mhip->ip_off);
897 mhip->ip_len = htons((u_int16_t)(len + mhlen));
898 m->m_next = m_copym(m0, off, len, M_DONTWAIT);
899 if (m->m_next == 0) {
900 error = ENOBUFS; /* ??? */
901 IP_STATINC(IP_STAT_ODROPPED);
902 goto sendorfree;
903 }
904 m->m_pkthdr.len = mhlen + len;
905 m->m_pkthdr.rcvif = NULL;
906 mhip->ip_sum = 0;
907 KASSERT((m->m_pkthdr.csum_flags & M_CSUM_IPv4) == 0);
908 if (sw_csum & M_CSUM_IPv4) {
909 mhip->ip_sum = in_cksum(m, mhlen);
910 } else {
911 /*
912 * checksum is hw-offloaded or not necessary.
913 */
914 m->m_pkthdr.csum_flags |=
915 m0->m_pkthdr.csum_flags & M_CSUM_IPv4;
916 m->m_pkthdr.csum_data |= mhlen << 16;
917 KASSERT(!(ifp != NULL &&
918 IN_NEED_CHECKSUM(ifp, M_CSUM_IPv4)) ||
919 (m->m_pkthdr.csum_flags & M_CSUM_IPv4) != 0);
920 }
921 IP_STATINC(IP_STAT_OFRAGMENTS);
922 fragments++;
923 }
924 /*
925 * Update first fragment by trimming what's been copied out
926 * and updating header, then send each fragment (in order).
927 */
928 m = m0;
929 m_adj(m, hlen + firstlen - ntohs(ip->ip_len));
930 m->m_pkthdr.len = hlen + firstlen;
931 ip->ip_len = htons((u_int16_t)m->m_pkthdr.len);
932 ip->ip_off |= htons(IP_MF);
933 ip->ip_sum = 0;
934 if (sw_csum & M_CSUM_IPv4) {
935 ip->ip_sum = in_cksum(m, hlen);
936 m->m_pkthdr.csum_flags &= ~M_CSUM_IPv4;
937 } else {
938 /*
939 * checksum is hw-offloaded or not necessary.
940 */
941 KASSERT(!(ifp != NULL && IN_NEED_CHECKSUM(ifp, M_CSUM_IPv4)) ||
942 (m->m_pkthdr.csum_flags & M_CSUM_IPv4) != 0);
943 KASSERT(M_CSUM_DATA_IPv4_IPHL(m->m_pkthdr.csum_data) >=
944 sizeof(struct ip));
945 }
946 sendorfree:
947 /*
948 * If there is no room for all the fragments, don't queue
949 * any of them.
950 */
951 if (ifp != NULL) {
952 s = splnet();
953 if (ifp->if_snd.ifq_maxlen - ifp->if_snd.ifq_len < fragments &&
954 error == 0) {
955 error = ENOBUFS;
956 IP_STATINC(IP_STAT_ODROPPED);
957 IFQ_INC_DROPS(&ifp->if_snd);
958 }
959 splx(s);
960 }
961 if (error) {
962 for (m = m0; m; m = m0) {
963 m0 = m->m_nextpkt;
964 m->m_nextpkt = NULL;
965 m_freem(m);
966 }
967 }
968 return (error);
969 }
970
971 /*
972 * Process a delayed payload checksum calculation.
973 */
974 void
975 in_delayed_cksum(struct mbuf *m)
976 {
977 struct ip *ip;
978 u_int16_t csum, offset;
979
980 ip = mtod(m, struct ip *);
981 offset = ip->ip_hl << 2;
982 csum = in4_cksum(m, 0, offset, ntohs(ip->ip_len) - offset);
983 if (csum == 0 && (m->m_pkthdr.csum_flags & M_CSUM_UDPv4) != 0)
984 csum = 0xffff;
985
986 offset += M_CSUM_DATA_IPv4_OFFSET(m->m_pkthdr.csum_data);
987
988 if ((offset + sizeof(u_int16_t)) > m->m_len) {
989 /* This happen when ip options were inserted
990 printf("in_delayed_cksum: pullup len %d off %d proto %d\n",
991 m->m_len, offset, ip->ip_p);
992 */
993 m_copyback(m, offset, sizeof(csum), (void *) &csum);
994 } else
995 *(u_int16_t *)(mtod(m, char *) + offset) = csum;
996 }
997
998 /*
999 * Determine the maximum length of the options to be inserted;
1000 * we would far rather allocate too much space rather than too little.
1001 */
1002
1003 u_int
1004 ip_optlen(struct inpcb *inp)
1005 {
1006 struct mbuf *m = inp->inp_options;
1007
1008 if (m && m->m_len > offsetof(struct ipoption, ipopt_dst)) {
1009 return (m->m_len - offsetof(struct ipoption, ipopt_dst));
1010 }
1011 return 0;
1012 }
1013
1014 /*
1015 * Insert IP options into preformed packet.
1016 * Adjust IP destination as required for IP source routing,
1017 * as indicated by a non-zero in_addr at the start of the options.
1018 */
1019 static struct mbuf *
1020 ip_insertoptions(struct mbuf *m, struct mbuf *opt, int *phlen)
1021 {
1022 struct ipoption *p = mtod(opt, struct ipoption *);
1023 struct mbuf *n;
1024 struct ip *ip = mtod(m, struct ip *);
1025 unsigned optlen;
1026
1027 optlen = opt->m_len - sizeof(p->ipopt_dst);
1028 if (optlen + ntohs(ip->ip_len) > IP_MAXPACKET)
1029 return (m); /* XXX should fail */
1030 if (!in_nullhost(p->ipopt_dst))
1031 ip->ip_dst = p->ipopt_dst;
1032 if (M_READONLY(m) || M_LEADINGSPACE(m) < optlen) {
1033 MGETHDR(n, M_DONTWAIT, MT_HEADER);
1034 if (n == 0)
1035 return (m);
1036 MCLAIM(n, m->m_owner);
1037 M_MOVE_PKTHDR(n, m);
1038 m->m_len -= sizeof(struct ip);
1039 m->m_data += sizeof(struct ip);
1040 n->m_next = m;
1041 m = n;
1042 m->m_len = optlen + sizeof(struct ip);
1043 m->m_data += max_linkhdr;
1044 bcopy((void *)ip, mtod(m, void *), sizeof(struct ip));
1045 } else {
1046 m->m_data -= optlen;
1047 m->m_len += optlen;
1048 memmove(mtod(m, void *), ip, sizeof(struct ip));
1049 }
1050 m->m_pkthdr.len += optlen;
1051 ip = mtod(m, struct ip *);
1052 bcopy((void *)p->ipopt_list, (void *)(ip + 1), (unsigned)optlen);
1053 *phlen = sizeof(struct ip) + optlen;
1054 ip->ip_len = htons(ntohs(ip->ip_len) + optlen);
1055 return (m);
1056 }
1057
1058 /*
1059 * Copy options from ip to jp,
1060 * omitting those not copied during fragmentation.
1061 */
1062 int
1063 ip_optcopy(struct ip *ip, struct ip *jp)
1064 {
1065 u_char *cp, *dp;
1066 int opt, optlen, cnt;
1067
1068 cp = (u_char *)(ip + 1);
1069 dp = (u_char *)(jp + 1);
1070 cnt = (ip->ip_hl << 2) - sizeof (struct ip);
1071 for (; cnt > 0; cnt -= optlen, cp += optlen) {
1072 opt = cp[0];
1073 if (opt == IPOPT_EOL)
1074 break;
1075 if (opt == IPOPT_NOP) {
1076 /* Preserve for IP mcast tunnel's LSRR alignment. */
1077 *dp++ = IPOPT_NOP;
1078 optlen = 1;
1079 continue;
1080 }
1081
1082 KASSERT(cnt >= IPOPT_OLEN + sizeof(*cp));
1083 optlen = cp[IPOPT_OLEN];
1084 KASSERT(optlen >= IPOPT_OLEN + sizeof(*cp) && optlen < cnt);
1085
1086 /* Invalid lengths should have been caught by ip_dooptions. */
1087 if (optlen > cnt)
1088 optlen = cnt;
1089 if (IPOPT_COPIED(opt)) {
1090 bcopy((void *)cp, (void *)dp, (unsigned)optlen);
1091 dp += optlen;
1092 }
1093 }
1094 for (optlen = dp - (u_char *)(jp+1); optlen & 0x3; optlen++)
1095 *dp++ = IPOPT_EOL;
1096 return (optlen);
1097 }
1098
1099 /*
1100 * IP socket option processing.
1101 */
1102 int
1103 ip_ctloutput(int op, struct socket *so, struct sockopt *sopt)
1104 {
1105 struct inpcb *inp = sotoinpcb(so);
1106 struct ip *ip = &inp->inp_ip;
1107 int inpflags = inp->inp_flags;
1108 int optval = 0, error = 0;
1109
1110 if (sopt->sopt_level != IPPROTO_IP) {
1111 if (sopt->sopt_level == SOL_SOCKET && sopt->sopt_name == SO_NOHEADER)
1112 return 0;
1113 return ENOPROTOOPT;
1114 }
1115
1116 switch (op) {
1117 case PRCO_SETOPT:
1118 switch (sopt->sopt_name) {
1119 case IP_OPTIONS:
1120 #ifdef notyet
1121 case IP_RETOPTS:
1122 #endif
1123 error = ip_pcbopts(inp, sopt);
1124 break;
1125
1126 case IP_TOS:
1127 case IP_TTL:
1128 case IP_MINTTL:
1129 case IP_PKTINFO:
1130 case IP_RECVOPTS:
1131 case IP_RECVRETOPTS:
1132 case IP_RECVDSTADDR:
1133 case IP_RECVIF:
1134 case IP_RECVPKTINFO:
1135 case IP_RECVTTL:
1136 error = sockopt_getint(sopt, &optval);
1137 if (error)
1138 break;
1139
1140 switch (sopt->sopt_name) {
1141 case IP_TOS:
1142 ip->ip_tos = optval;
1143 break;
1144
1145 case IP_TTL:
1146 ip->ip_ttl = optval;
1147 break;
1148
1149 case IP_MINTTL:
1150 if (optval > 0 && optval <= MAXTTL)
1151 inp->inp_ip_minttl = optval;
1152 else
1153 error = EINVAL;
1154 break;
1155 #define OPTSET(bit) \
1156 if (optval) \
1157 inpflags |= bit; \
1158 else \
1159 inpflags &= ~bit;
1160
1161 case IP_PKTINFO:
1162 OPTSET(INP_PKTINFO);
1163 break;
1164
1165 case IP_RECVOPTS:
1166 OPTSET(INP_RECVOPTS);
1167 break;
1168
1169 case IP_RECVPKTINFO:
1170 OPTSET(INP_RECVPKTINFO);
1171 break;
1172
1173 case IP_RECVRETOPTS:
1174 OPTSET(INP_RECVRETOPTS);
1175 break;
1176
1177 case IP_RECVDSTADDR:
1178 OPTSET(INP_RECVDSTADDR);
1179 break;
1180
1181 case IP_RECVIF:
1182 OPTSET(INP_RECVIF);
1183 break;
1184
1185 case IP_RECVTTL:
1186 OPTSET(INP_RECVTTL);
1187 break;
1188 }
1189 break;
1190 #undef OPTSET
1191
1192 case IP_MULTICAST_IF:
1193 case IP_MULTICAST_TTL:
1194 case IP_MULTICAST_LOOP:
1195 case IP_ADD_MEMBERSHIP:
1196 case IP_DROP_MEMBERSHIP:
1197 error = ip_setmoptions(&inp->inp_moptions, sopt);
1198 break;
1199
1200 case IP_PORTRANGE:
1201 error = sockopt_getint(sopt, &optval);
1202 if (error)
1203 break;
1204
1205 switch (optval) {
1206 case IP_PORTRANGE_DEFAULT:
1207 case IP_PORTRANGE_HIGH:
1208 inpflags &= ~(INP_LOWPORT);
1209 break;
1210
1211 case IP_PORTRANGE_LOW:
1212 inpflags |= INP_LOWPORT;
1213 break;
1214
1215 default:
1216 error = EINVAL;
1217 break;
1218 }
1219 break;
1220
1221 case IP_PORTALGO:
1222 error = sockopt_getint(sopt, &optval);
1223 if (error)
1224 break;
1225
1226 error = portalgo_algo_index_select(
1227 (struct inpcb_hdr *)inp, optval);
1228 break;
1229
1230 #if defined(IPSEC)
1231 case IP_IPSEC_POLICY:
1232 if (ipsec_enabled) {
1233 error = ipsec4_set_policy(inp, sopt->sopt_name,
1234 sopt->sopt_data, sopt->sopt_size,
1235 curlwp->l_cred);
1236 break;
1237 }
1238 /*FALLTHROUGH*/
1239 #endif /* IPSEC */
1240
1241 default:
1242 error = ENOPROTOOPT;
1243 break;
1244 }
1245 break;
1246
1247 case PRCO_GETOPT:
1248 switch (sopt->sopt_name) {
1249 case IP_OPTIONS:
1250 case IP_RETOPTS: {
1251 struct mbuf *mopts = inp->inp_options;
1252
1253 if (mopts) {
1254 struct mbuf *m;
1255
1256 m = m_copym(mopts, 0, M_COPYALL, M_DONTWAIT);
1257 if (m == NULL) {
1258 error = ENOBUFS;
1259 break;
1260 }
1261 error = sockopt_setmbuf(sopt, m);
1262 }
1263 break;
1264 }
1265 case IP_PKTINFO:
1266 case IP_TOS:
1267 case IP_TTL:
1268 case IP_MINTTL:
1269 case IP_RECVOPTS:
1270 case IP_RECVRETOPTS:
1271 case IP_RECVDSTADDR:
1272 case IP_RECVIF:
1273 case IP_RECVPKTINFO:
1274 case IP_RECVTTL:
1275 case IP_ERRORMTU:
1276 switch (sopt->sopt_name) {
1277 case IP_TOS:
1278 optval = ip->ip_tos;
1279 break;
1280
1281 case IP_TTL:
1282 optval = ip->ip_ttl;
1283 break;
1284
1285 case IP_MINTTL:
1286 optval = inp->inp_ip_minttl;
1287 break;
1288
1289 case IP_ERRORMTU:
1290 optval = inp->inp_errormtu;
1291 break;
1292
1293 #define OPTBIT(bit) (inpflags & bit ? 1 : 0)
1294
1295 case IP_PKTINFO:
1296 optval = OPTBIT(INP_PKTINFO);
1297 break;
1298
1299 case IP_RECVOPTS:
1300 optval = OPTBIT(INP_RECVOPTS);
1301 break;
1302
1303 case IP_RECVPKTINFO:
1304 optval = OPTBIT(INP_RECVPKTINFO);
1305 break;
1306
1307 case IP_RECVRETOPTS:
1308 optval = OPTBIT(INP_RECVRETOPTS);
1309 break;
1310
1311 case IP_RECVDSTADDR:
1312 optval = OPTBIT(INP_RECVDSTADDR);
1313 break;
1314
1315 case IP_RECVIF:
1316 optval = OPTBIT(INP_RECVIF);
1317 break;
1318
1319 case IP_RECVTTL:
1320 optval = OPTBIT(INP_RECVTTL);
1321 break;
1322 }
1323 error = sockopt_setint(sopt, optval);
1324 break;
1325
1326 #if 0 /* defined(IPSEC) */
1327 case IP_IPSEC_POLICY:
1328 {
1329 struct mbuf *m = NULL;
1330
1331 /* XXX this will return EINVAL as sopt is empty */
1332 error = ipsec4_get_policy(inp, sopt->sopt_data,
1333 sopt->sopt_size, &m);
1334 if (error == 0)
1335 error = sockopt_setmbuf(sopt, m);
1336 break;
1337 }
1338 #endif /*IPSEC*/
1339
1340 case IP_MULTICAST_IF:
1341 case IP_MULTICAST_TTL:
1342 case IP_MULTICAST_LOOP:
1343 case IP_ADD_MEMBERSHIP:
1344 case IP_DROP_MEMBERSHIP:
1345 error = ip_getmoptions(inp->inp_moptions, sopt);
1346 break;
1347
1348 case IP_PORTRANGE:
1349 if (inpflags & INP_LOWPORT)
1350 optval = IP_PORTRANGE_LOW;
1351 else
1352 optval = IP_PORTRANGE_DEFAULT;
1353 error = sockopt_setint(sopt, optval);
1354 break;
1355
1356 case IP_PORTALGO:
1357 optval = inp->inp_portalgo;
1358 error = sockopt_setint(sopt, optval);
1359 break;
1360
1361 default:
1362 error = ENOPROTOOPT;
1363 break;
1364 }
1365 break;
1366 }
1367
1368 if (!error) {
1369 inp->inp_flags = inpflags;
1370 }
1371 return error;
1372 }
1373
1374 /*
1375 * Set up IP options in pcb for insertion in output packets.
1376 * Store in mbuf with pointer in pcbopt, adding pseudo-option
1377 * with destination address if source routed.
1378 */
1379 static int
1380 ip_pcbopts(struct inpcb *inp, const struct sockopt *sopt)
1381 {
1382 struct mbuf *m;
1383 const u_char *cp;
1384 u_char *dp;
1385 int cnt;
1386
1387 /* Turn off any old options. */
1388 if (inp->inp_options) {
1389 m_free(inp->inp_options);
1390 }
1391 inp->inp_options = NULL;
1392 if ((cnt = sopt->sopt_size) == 0) {
1393 /* Only turning off any previous options. */
1394 return 0;
1395 }
1396 cp = sopt->sopt_data;
1397
1398 #ifndef __vax__
1399 if (cnt % sizeof(int32_t))
1400 return (EINVAL);
1401 #endif
1402
1403 m = m_get(M_DONTWAIT, MT_SOOPTS);
1404 if (m == NULL)
1405 return (ENOBUFS);
1406
1407 dp = mtod(m, u_char *);
1408 memset(dp, 0, sizeof(struct in_addr));
1409 dp += sizeof(struct in_addr);
1410 m->m_len = sizeof(struct in_addr);
1411
1412 /*
1413 * IP option list according to RFC791. Each option is of the form
1414 *
1415 * [optval] [olen] [(olen - 2) data bytes]
1416 *
1417 * We validate the list and copy options to an mbuf for prepending
1418 * to data packets. The IP first-hop destination address will be
1419 * stored before actual options and is zero if unset.
1420 */
1421 while (cnt > 0) {
1422 uint8_t optval, olen, offset;
1423
1424 optval = cp[IPOPT_OPTVAL];
1425
1426 if (optval == IPOPT_EOL || optval == IPOPT_NOP) {
1427 olen = 1;
1428 } else {
1429 if (cnt < IPOPT_OLEN + 1)
1430 goto bad;
1431
1432 olen = cp[IPOPT_OLEN];
1433 if (olen < IPOPT_OLEN + 1 || olen > cnt)
1434 goto bad;
1435 }
1436
1437 if (optval == IPOPT_LSRR || optval == IPOPT_SSRR) {
1438 /*
1439 * user process specifies route as:
1440 * ->A->B->C->D
1441 * D must be our final destination (but we can't
1442 * check that since we may not have connected yet).
1443 * A is first hop destination, which doesn't appear in
1444 * actual IP option, but is stored before the options.
1445 */
1446 if (olen < IPOPT_OFFSET + 1 + sizeof(struct in_addr))
1447 goto bad;
1448
1449 offset = cp[IPOPT_OFFSET];
1450 memcpy(mtod(m, u_char *), cp + IPOPT_OFFSET + 1,
1451 sizeof(struct in_addr));
1452
1453 cp += sizeof(struct in_addr);
1454 cnt -= sizeof(struct in_addr);
1455 olen -= sizeof(struct in_addr);
1456
1457 if (m->m_len + olen > MAX_IPOPTLEN + sizeof(struct in_addr))
1458 goto bad;
1459
1460 memcpy(dp, cp, olen);
1461 dp[IPOPT_OPTVAL] = optval;
1462 dp[IPOPT_OLEN] = olen;
1463 dp[IPOPT_OFFSET] = offset;
1464 break;
1465 } else {
1466 if (m->m_len + olen > MAX_IPOPTLEN + sizeof(struct in_addr))
1467 goto bad;
1468
1469 memcpy(dp, cp, olen);
1470 break;
1471 }
1472
1473 dp += olen;
1474 m->m_len += olen;
1475
1476 if (optval == IPOPT_EOL)
1477 break;
1478
1479 cp += olen;
1480 cnt -= olen;
1481 }
1482
1483 inp->inp_options = m;
1484 return 0;
1485 bad:
1486 (void)m_free(m);
1487 return EINVAL;
1488 }
1489
1490 /*
1491 * following RFC1724 section 3.3, 0.0.0.0/8 is interpreted as interface index.
1492 */
1493 static struct ifnet *
1494 ip_multicast_if(struct in_addr *a, int *ifindexp)
1495 {
1496 int ifindex;
1497 struct ifnet *ifp = NULL;
1498 struct in_ifaddr *ia;
1499
1500 if (ifindexp)
1501 *ifindexp = 0;
1502 if (ntohl(a->s_addr) >> 24 == 0) {
1503 ifindex = ntohl(a->s_addr) & 0xffffff;
1504 ifp = if_byindex(ifindex);
1505 if (!ifp)
1506 return NULL;
1507 if (ifindexp)
1508 *ifindexp = ifindex;
1509 } else {
1510 LIST_FOREACH(ia, &IN_IFADDR_HASH(a->s_addr), ia_hash) {
1511 if (in_hosteq(ia->ia_addr.sin_addr, *a) &&
1512 (ia->ia_ifp->if_flags & IFF_MULTICAST) != 0) {
1513 ifp = ia->ia_ifp;
1514 break;
1515 }
1516 }
1517 }
1518 return ifp;
1519 }
1520
1521 static int
1522 ip_getoptval(const struct sockopt *sopt, u_int8_t *val, u_int maxval)
1523 {
1524 u_int tval;
1525 u_char cval;
1526 int error;
1527
1528 if (sopt == NULL)
1529 return EINVAL;
1530
1531 switch (sopt->sopt_size) {
1532 case sizeof(u_char):
1533 error = sockopt_get(sopt, &cval, sizeof(u_char));
1534 tval = cval;
1535 break;
1536
1537 case sizeof(u_int):
1538 error = sockopt_get(sopt, &tval, sizeof(u_int));
1539 break;
1540
1541 default:
1542 error = EINVAL;
1543 }
1544
1545 if (error)
1546 return error;
1547
1548 if (tval > maxval)
1549 return EINVAL;
1550
1551 *val = tval;
1552 return 0;
1553 }
1554
1555 static int
1556 ip_get_membership(const struct sockopt *sopt, struct ifnet **ifp,
1557 struct in_addr *ia, bool add)
1558 {
1559 int error;
1560 struct ip_mreq mreq;
1561
1562 error = sockopt_get(sopt, &mreq, sizeof(mreq));
1563 if (error)
1564 return error;
1565
1566 if (!IN_MULTICAST(mreq.imr_multiaddr.s_addr))
1567 return EINVAL;
1568
1569 memcpy(ia, &mreq.imr_multiaddr, sizeof(*ia));
1570
1571 if (in_nullhost(mreq.imr_interface)) {
1572 union {
1573 struct sockaddr dst;
1574 struct sockaddr_in dst4;
1575 } u;
1576 struct route ro;
1577
1578 if (!add) {
1579 *ifp = NULL;
1580 return 0;
1581 }
1582 /*
1583 * If no interface address was provided, use the interface of
1584 * the route to the given multicast address.
1585 */
1586 struct rtentry *rt;
1587 memset(&ro, 0, sizeof(ro));
1588
1589 sockaddr_in_init(&u.dst4, ia, 0);
1590 error = rtcache_setdst(&ro, &u.dst);
1591 if (error != 0)
1592 return error;
1593 *ifp = (rt = rtcache_init(&ro)) != NULL ? rt->rt_ifp : NULL;
1594 rtcache_free(&ro);
1595 } else {
1596 *ifp = ip_multicast_if(&mreq.imr_interface, NULL);
1597 if (!add && *ifp == NULL)
1598 return EADDRNOTAVAIL;
1599 }
1600 return 0;
1601 }
1602
1603 /*
1604 * Add a multicast group membership.
1605 * Group must be a valid IP multicast address.
1606 */
1607 static int
1608 ip_add_membership(struct ip_moptions *imo, const struct sockopt *sopt)
1609 {
1610 struct ifnet *ifp;
1611 struct in_addr ia;
1612 int i, error;
1613
1614 if (sopt->sopt_size == sizeof(struct ip_mreq))
1615 error = ip_get_membership(sopt, &ifp, &ia, true);
1616 else
1617 #ifdef INET6
1618 error = ip6_get_membership(sopt, &ifp, &ia, sizeof(ia));
1619 #else
1620 return EINVAL;
1621 #endif
1622
1623 if (error)
1624 return error;
1625
1626 /*
1627 * See if we found an interface, and confirm that it
1628 * supports multicast.
1629 */
1630 if (ifp == NULL || (ifp->if_flags & IFF_MULTICAST) == 0)
1631 return EADDRNOTAVAIL;
1632
1633 /*
1634 * See if the membership already exists or if all the
1635 * membership slots are full.
1636 */
1637 for (i = 0; i < imo->imo_num_memberships; ++i) {
1638 if (imo->imo_membership[i]->inm_ifp == ifp &&
1639 in_hosteq(imo->imo_membership[i]->inm_addr, ia))
1640 break;
1641 }
1642 if (i < imo->imo_num_memberships)
1643 return EADDRINUSE;
1644
1645 if (i == IP_MAX_MEMBERSHIPS)
1646 return ETOOMANYREFS;
1647
1648 /*
1649 * Everything looks good; add a new record to the multicast
1650 * address list for the given interface.
1651 */
1652 if ((imo->imo_membership[i] = in_addmulti(&ia, ifp)) == NULL)
1653 return ENOBUFS;
1654
1655 ++imo->imo_num_memberships;
1656 return 0;
1657 }
1658
1659 /*
1660 * Drop a multicast group membership.
1661 * Group must be a valid IP multicast address.
1662 */
1663 static int
1664 ip_drop_membership(struct ip_moptions *imo, const struct sockopt *sopt)
1665 {
1666 struct in_addr ia;
1667 struct ifnet *ifp;
1668 int i, error;
1669
1670 if (sopt->sopt_size == sizeof(struct ip_mreq))
1671 error = ip_get_membership(sopt, &ifp, &ia, false);
1672 else
1673 #ifdef INET6
1674 error = ip6_get_membership(sopt, &ifp, &ia, sizeof(ia));
1675 #else
1676 return EINVAL;
1677 #endif
1678
1679 if (error)
1680 return error;
1681
1682 /*
1683 * Find the membership in the membership array.
1684 */
1685 for (i = 0; i < imo->imo_num_memberships; ++i) {
1686 if ((ifp == NULL ||
1687 imo->imo_membership[i]->inm_ifp == ifp) &&
1688 in_hosteq(imo->imo_membership[i]->inm_addr, ia))
1689 break;
1690 }
1691 if (i == imo->imo_num_memberships)
1692 return EADDRNOTAVAIL;
1693
1694 /*
1695 * Give up the multicast address record to which the
1696 * membership points.
1697 */
1698 in_delmulti(imo->imo_membership[i]);
1699
1700 /*
1701 * Remove the gap in the membership array.
1702 */
1703 for (++i; i < imo->imo_num_memberships; ++i)
1704 imo->imo_membership[i-1] = imo->imo_membership[i];
1705 --imo->imo_num_memberships;
1706 return 0;
1707 }
1708
1709 /*
1710 * Set the IP multicast options in response to user setsockopt().
1711 */
1712 int
1713 ip_setmoptions(struct ip_moptions **pimo, const struct sockopt *sopt)
1714 {
1715 struct ip_moptions *imo = *pimo;
1716 struct in_addr addr;
1717 struct ifnet *ifp;
1718 int ifindex, error = 0;
1719
1720 if (!imo) {
1721 /*
1722 * No multicast option buffer attached to the pcb;
1723 * allocate one and initialize to default values.
1724 */
1725 imo = kmem_intr_alloc(sizeof(*imo), KM_NOSLEEP);
1726 if (imo == NULL)
1727 return ENOBUFS;
1728
1729 imo->imo_multicast_ifp = NULL;
1730 imo->imo_multicast_addr.s_addr = INADDR_ANY;
1731 imo->imo_multicast_ttl = IP_DEFAULT_MULTICAST_TTL;
1732 imo->imo_multicast_loop = IP_DEFAULT_MULTICAST_LOOP;
1733 imo->imo_num_memberships = 0;
1734 *pimo = imo;
1735 }
1736
1737 switch (sopt->sopt_name) {
1738 case IP_MULTICAST_IF:
1739 /*
1740 * Select the interface for outgoing multicast packets.
1741 */
1742 error = sockopt_get(sopt, &addr, sizeof(addr));
1743 if (error)
1744 break;
1745
1746 /*
1747 * INADDR_ANY is used to remove a previous selection.
1748 * When no interface is selected, a default one is
1749 * chosen every time a multicast packet is sent.
1750 */
1751 if (in_nullhost(addr)) {
1752 imo->imo_multicast_ifp = NULL;
1753 break;
1754 }
1755 /*
1756 * The selected interface is identified by its local
1757 * IP address. Find the interface and confirm that
1758 * it supports multicasting.
1759 */
1760 ifp = ip_multicast_if(&addr, &ifindex);
1761 if (ifp == NULL || (ifp->if_flags & IFF_MULTICAST) == 0) {
1762 error = EADDRNOTAVAIL;
1763 break;
1764 }
1765 imo->imo_multicast_ifp = ifp;
1766 if (ifindex)
1767 imo->imo_multicast_addr = addr;
1768 else
1769 imo->imo_multicast_addr.s_addr = INADDR_ANY;
1770 break;
1771
1772 case IP_MULTICAST_TTL:
1773 /*
1774 * Set the IP time-to-live for outgoing multicast packets.
1775 */
1776 error = ip_getoptval(sopt, &imo->imo_multicast_ttl, MAXTTL);
1777 break;
1778
1779 case IP_MULTICAST_LOOP:
1780 /*
1781 * Set the loopback flag for outgoing multicast packets.
1782 * Must be zero or one.
1783 */
1784 error = ip_getoptval(sopt, &imo->imo_multicast_loop, 1);
1785 break;
1786
1787 case IP_ADD_MEMBERSHIP: /* IPV6_JOIN_GROUP */
1788 error = ip_add_membership(imo, sopt);
1789 break;
1790
1791 case IP_DROP_MEMBERSHIP: /* IPV6_LEAVE_GROUP */
1792 error = ip_drop_membership(imo, sopt);
1793 break;
1794
1795 default:
1796 error = EOPNOTSUPP;
1797 break;
1798 }
1799
1800 /*
1801 * If all options have default values, no need to keep the mbuf.
1802 */
1803 if (imo->imo_multicast_ifp == NULL &&
1804 imo->imo_multicast_ttl == IP_DEFAULT_MULTICAST_TTL &&
1805 imo->imo_multicast_loop == IP_DEFAULT_MULTICAST_LOOP &&
1806 imo->imo_num_memberships == 0) {
1807 kmem_free(imo, sizeof(*imo));
1808 *pimo = NULL;
1809 }
1810
1811 return error;
1812 }
1813
1814 /*
1815 * Return the IP multicast options in response to user getsockopt().
1816 */
1817 int
1818 ip_getmoptions(struct ip_moptions *imo, struct sockopt *sopt)
1819 {
1820 struct in_addr addr;
1821 struct in_ifaddr *ia;
1822 uint8_t optval;
1823 int error = 0;
1824
1825 switch (sopt->sopt_name) {
1826 case IP_MULTICAST_IF:
1827 if (imo == NULL || imo->imo_multicast_ifp == NULL)
1828 addr = zeroin_addr;
1829 else if (imo->imo_multicast_addr.s_addr) {
1830 /* return the value user has set */
1831 addr = imo->imo_multicast_addr;
1832 } else {
1833 IFP_TO_IA(imo->imo_multicast_ifp, ia);
1834 addr = ia ? ia->ia_addr.sin_addr : zeroin_addr;
1835 }
1836 error = sockopt_set(sopt, &addr, sizeof(addr));
1837 break;
1838
1839 case IP_MULTICAST_TTL:
1840 optval = imo ? imo->imo_multicast_ttl
1841 : IP_DEFAULT_MULTICAST_TTL;
1842
1843 error = sockopt_set(sopt, &optval, sizeof(optval));
1844 break;
1845
1846 case IP_MULTICAST_LOOP:
1847 optval = imo ? imo->imo_multicast_loop
1848 : IP_DEFAULT_MULTICAST_LOOP;
1849
1850 error = sockopt_set(sopt, &optval, sizeof(optval));
1851 break;
1852
1853 default:
1854 error = EOPNOTSUPP;
1855 }
1856
1857 return error;
1858 }
1859
1860 /*
1861 * Discard the IP multicast options.
1862 */
1863 void
1864 ip_freemoptions(struct ip_moptions *imo)
1865 {
1866 int i;
1867
1868 if (imo != NULL) {
1869 for (i = 0; i < imo->imo_num_memberships; ++i)
1870 in_delmulti(imo->imo_membership[i]);
1871 kmem_free(imo, sizeof(*imo));
1872 }
1873 }
1874
1875 /*
1876 * Routine called from ip_output() to loop back a copy of an IP multicast
1877 * packet to the input queue of a specified interface. Note that this
1878 * calls the output routine of the loopback "driver", but with an interface
1879 * pointer that might NOT be lo0ifp -- easier than replicating that code here.
1880 */
1881 static void
1882 ip_mloopback(struct ifnet *ifp, struct mbuf *m, const struct sockaddr_in *dst)
1883 {
1884 struct ip *ip;
1885 struct mbuf *copym;
1886
1887 copym = m_copypacket(m, M_DONTWAIT);
1888 if (copym != NULL &&
1889 (copym->m_flags & M_EXT || copym->m_len < sizeof(struct ip)))
1890 copym = m_pullup(copym, sizeof(struct ip));
1891 if (copym == NULL)
1892 return;
1893 /*
1894 * We don't bother to fragment if the IP length is greater
1895 * than the interface's MTU. Can this possibly matter?
1896 */
1897 ip = mtod(copym, struct ip *);
1898
1899 if (copym->m_pkthdr.csum_flags & (M_CSUM_TCPv4|M_CSUM_UDPv4)) {
1900 in_delayed_cksum(copym);
1901 copym->m_pkthdr.csum_flags &=
1902 ~(M_CSUM_TCPv4|M_CSUM_UDPv4);
1903 }
1904
1905 ip->ip_sum = 0;
1906 ip->ip_sum = in_cksum(copym, ip->ip_hl << 2);
1907 #ifndef NET_MPSAFE
1908 KERNEL_LOCK(1, NULL);
1909 #endif
1910 (void)looutput(ifp, copym, sintocsa(dst), NULL);
1911 #ifndef NET_MPSAFE
1912 KERNEL_UNLOCK_ONE(NULL);
1913 #endif
1914 }
1915