ip_output.c revision 1.275 1 /* $NetBSD: ip_output.c,v 1.275 2017/03/03 07:13:06 ozaki-r Exp $ */
2
3 /*
4 * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
5 * All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in the
14 * documentation and/or other materials provided with the distribution.
15 * 3. Neither the name of the project nor the names of its contributors
16 * may be used to endorse or promote products derived from this software
17 * without specific prior written permission.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
20 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22 * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
23 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
25 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
26 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
29 * SUCH DAMAGE.
30 */
31
32 /*-
33 * Copyright (c) 1998 The NetBSD Foundation, Inc.
34 * All rights reserved.
35 *
36 * This code is derived from software contributed to The NetBSD Foundation
37 * by Public Access Networks Corporation ("Panix"). It was developed under
38 * contract to Panix by Eric Haszlakiewicz and Thor Lancelot Simon.
39 *
40 * Redistribution and use in source and binary forms, with or without
41 * modification, are permitted provided that the following conditions
42 * are met:
43 * 1. Redistributions of source code must retain the above copyright
44 * notice, this list of conditions and the following disclaimer.
45 * 2. Redistributions in binary form must reproduce the above copyright
46 * notice, this list of conditions and the following disclaimer in the
47 * documentation and/or other materials provided with the distribution.
48 *
49 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
50 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
51 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
52 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
53 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
54 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
55 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
56 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
57 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
58 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
59 * POSSIBILITY OF SUCH DAMAGE.
60 */
61
62 /*
63 * Copyright (c) 1982, 1986, 1988, 1990, 1993
64 * The Regents of the University of California. All rights reserved.
65 *
66 * Redistribution and use in source and binary forms, with or without
67 * modification, are permitted provided that the following conditions
68 * are met:
69 * 1. Redistributions of source code must retain the above copyright
70 * notice, this list of conditions and the following disclaimer.
71 * 2. Redistributions in binary form must reproduce the above copyright
72 * notice, this list of conditions and the following disclaimer in the
73 * documentation and/or other materials provided with the distribution.
74 * 3. Neither the name of the University nor the names of its contributors
75 * may be used to endorse or promote products derived from this software
76 * without specific prior written permission.
77 *
78 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
79 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
80 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
81 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
82 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
83 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
84 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
85 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
86 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
87 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
88 * SUCH DAMAGE.
89 *
90 * @(#)ip_output.c 8.3 (Berkeley) 1/21/94
91 */
92
93 #include <sys/cdefs.h>
94 __KERNEL_RCSID(0, "$NetBSD: ip_output.c,v 1.275 2017/03/03 07:13:06 ozaki-r Exp $");
95
96 #ifdef _KERNEL_OPT
97 #include "opt_inet.h"
98 #include "opt_ipsec.h"
99 #include "opt_mrouting.h"
100 #include "opt_net_mpsafe.h"
101 #include "opt_mpls.h"
102 #endif
103
104 #include "arp.h"
105
106 #include <sys/param.h>
107 #include <sys/kmem.h>
108 #include <sys/mbuf.h>
109 #include <sys/socket.h>
110 #include <sys/socketvar.h>
111 #include <sys/kauth.h>
112 #include <sys/systm.h>
113 #include <sys/syslog.h>
114
115 #include <net/if.h>
116 #include <net/if_types.h>
117 #include <net/route.h>
118 #include <net/pfil.h>
119
120 #include <netinet/in.h>
121 #include <netinet/in_systm.h>
122 #include <netinet/ip.h>
123 #include <netinet/in_pcb.h>
124 #include <netinet/in_var.h>
125 #include <netinet/ip_var.h>
126 #include <netinet/ip_private.h>
127 #include <netinet/in_offload.h>
128 #include <netinet/portalgo.h>
129 #include <netinet/udp.h>
130
131 #ifdef INET6
132 #include <netinet6/ip6_var.h>
133 #endif
134
135 #ifdef MROUTING
136 #include <netinet/ip_mroute.h>
137 #endif
138
139 #ifdef IPSEC
140 #include <netipsec/ipsec.h>
141 #include <netipsec/key.h>
142 #endif
143
144 #ifdef MPLS
145 #include <netmpls/mpls.h>
146 #include <netmpls/mpls_var.h>
147 #endif
148
/* Prototypes for helpers that are private to this file. */
149 static int ip_pcbopts(struct inpcb *, const struct sockopt *);
150 static struct mbuf *ip_insertoptions(struct mbuf *, struct mbuf *, int *);
151 static struct ifnet *ip_multicast_if(struct in_addr *, int *);
152 static void ip_mloopback(struct ifnet *, struct mbuf *,
153 const struct sockaddr_in *);
154 static int ip_ifaddrvalid(const struct in_ifaddr *);
155
/* Packet filter hook head; ip_output() runs it with PFIL_OUT. */
156 extern pfil_head_t *inet_pfil_hook; /* XXX */
157
/*
 * NOTE(review): not referenced by name in the code visible here;
 * presumably consulted through IN_NEED_CHECKSUM() to decide whether
 * checksums are computed on loopback interfaces -- confirm.
 */
158 int ip_do_loopback_cksum = 0;
159
/*
 * Tag an outgoing packet so that the link layer treats it as MPLS.
 *
 * In a kernel built with MPLS, the packet gets a PACKET_TAG_MPLS tag when
 * all of the following hold: rt is non-NULL and carries an AF_MPLS tag,
 * the packet is neither multicast nor broadcast, ifp is an IFT_ETHER
 * interface, and the label taken from the route is not
 * MPLS_LABEL_IMPLNULL.  In a kernel built without MPLS nothing is done.
 *
 * Returns 0 when the packet was tagged or needed no tag, ENOMEM when the
 * tag could not be allocated.  The mbuf is never freed here.
 */
static int
ip_mark_mpls(struct ifnet * const ifp, struct mbuf * const m,
    const struct rtentry *rt)
{
#ifdef MPLS
	union mpls_shim mshim;
	struct m_tag *tag;

	/* Without a route carrying an MPLS tag there is nothing to mark. */
	if (rt == NULL)
		return 0;
	if (rt_gettag(rt) == NULL)
		return 0;
	if (rt_gettag(rt)->sa_family != AF_MPLS)
		return 0;

	/* Only unicast packets leaving through Ethernet are marked. */
	if ((m->m_flags & (M_MCAST | M_BCAST)) != 0)
		return 0;
	if (ifp->if_type != IFT_ETHER)
		return 0;

	mshim.s_addr = MPLS_GETSADDR(rt);
	if (mshim.shim.label == MPLS_LABEL_IMPLNULL)
		return 0;

	/*
	 * XXX tentative solution to tell ether_output
	 * it's MPLS. Need some more efficient solution.
	 * The tag payload is a dummy; only the tag's presence matters.
	 */
	tag = m_tag_get(PACKET_TAG_MPLS, sizeof(int) /* dummy */, M_NOWAIT);
	if (tag == NULL)
		return ENOMEM;
	m_tag_prepend(m, tag);
#endif
	return 0;
}
191
192 /*
193 * Send an IP packet to a host.
194 */
195 int
196 ip_if_output(struct ifnet * const ifp, struct mbuf * const m,
197 const struct sockaddr * const dst, const struct rtentry *rt)
198 {
199 int error = 0;
200
201 if (rt != NULL) {
202 error = rt_check_reject_route(rt, ifp);
203 if (error != 0) {
204 m_freem(m);
205 return error;
206 }
207 }
208
209 error = ip_mark_mpls(ifp, m, rt);
210 if (error != 0) {
211 m_freem(m);
212 return error;
213 }
214
215 error = if_output_lock(ifp, ifp, m, dst, rt);
216
217 return error;
218 }
219
220 /*
221 * IP output. The packet in mbuf chain m contains a skeletal IP
222 * header (with len, off, ttl, proto, tos, src, dst).
223 * The mbuf chain containing the packet will be freed.
224 * The mbuf opt, if present, will not be freed.
 *
 * Arguments, as used by the code below:
 *   m0    - the packet; must have a packet header (asserted).
 *   opt   - optional IP options, inserted with ip_insertoptions().
 *   ro    - route cache to use; when NULL a local one is used and
 *           freed before returning.
 *   flags - IP_FORWARDING / IP_RAWOUTPUT: the header is taken as is
 *             (version, offset and header length are not filled in);
 *           IP_ROUTETOIF: skip the route lookup, find the interface
 *             from the destination address and force the TTL to 1;
 *           IP_ALLOWBROADCAST: needed to send to a broadcast address,
 *             otherwise EACCES is returned;
 *           IP_MTUDISC: set DF unless the route's MTU is locked;
 *           IP_RETURNMTU: on EMSGSIZE caused by DF, store the MTU in
 *             inp->inp_errormtu (inp must then be non-NULL);
 *           IP_NOIPNEWID: do not assign a new ip_id.
 *   imo   - optional multicast options (outgoing interface index,
 *           TTL, loopback setting).
 *   inp   - PCB handed to IPsec processing and used for IP_RETURNMTU.
 *
 * Returns 0 or an errno value.  Note that a packet dropped because its
 * source address is tentative or detached is reported as 0.
225 */
226 int
227 ip_output(struct mbuf *m0, struct mbuf *opt, struct route *ro, int flags,
228 struct ip_moptions *imo, struct inpcb *inp)
229 {
230 struct rtentry *rt;
231 struct ip *ip;
232 struct ifnet *ifp, *mifp = NULL;
233 struct mbuf *m = m0;
234 int hlen = sizeof (struct ip);
235 int len, error = 0;
236 struct route iproute;
237 const struct sockaddr_in *dst;
238 struct in_ifaddr *ia = NULL;
239 int isbroadcast;
240 int sw_csum;
241 u_long mtu;
242 #ifdef IPSEC
243 struct secpolicy *sp = NULL;
244 #endif
245 bool natt_frag = false;
246 bool rtmtu_nolock;
247 union {
248 struct sockaddr dst;
249 struct sockaddr_in dst4;
250 } u;
251 struct sockaddr *rdst = &u.dst; /* real IP destination, as opposed
252 * to the nexthop
253 */
254 struct psref psref, psref_ia;
255 int bound;
256 bool bind_need_restore = false;
257
258 len = 0;
259
260 MCLAIM(m, &ip_tx_mowner);
261
262 KASSERT((m->m_flags & M_PKTHDR) != 0);
263 KASSERT((m->m_pkthdr.csum_flags & (M_CSUM_TCPv6|M_CSUM_UDPv6)) == 0);
264 KASSERT((m->m_pkthdr.csum_flags & (M_CSUM_TCPv4|M_CSUM_UDPv4)) !=
265 (M_CSUM_TCPv4|M_CSUM_UDPv4));
266
/*
 * Insert caller-supplied options.  len stays 0 when
 * ip_insertoptions() did not insert anything, in which case hlen
 * keeps the size of a plain IP header.
 */
267 if (opt) {
268 m = ip_insertoptions(m, opt, &len);
269 if (len >= sizeof(struct ip))
270 hlen = len;
271 }
272 ip = mtod(m, struct ip *);
273
274 /*
275 * Fill in IP header.
276 */
277 if ((flags & (IP_FORWARDING|IP_RAWOUTPUT)) == 0) {
278 ip->ip_v = IPVERSION;
279 ip->ip_off = htons(0);
280 /* ip->ip_id filled in after we find out source ia */
281 ip->ip_hl = hlen >> 2;
282 IP_STATINC(IP_STAT_LOCALOUT);
283 } else {
284 hlen = ip->ip_hl << 2;
285 }
286
287 /*
288 * Route packet.
289 */
290 if (ro == NULL) {
291 memset(&iproute, 0, sizeof(iproute));
292 ro = &iproute;
293 }
294 sockaddr_in_init(&u.dst4, &ip->ip_dst, 0);
295 dst = satocsin(rtcache_getdst(ro));
296
297 /*
298 * If there is a cached route, check that it is to the same
299 * destination and is still up. If not, free it and try again.
300 * The address family should also be checked in case of sharing
301 * the cache with IPv6.
302 */
303 if (dst && (dst->sin_family != AF_INET ||
304 !in_hosteq(dst->sin_addr, ip->ip_dst)))
305 rtcache_free(ro);
306
307 if ((rt = rtcache_validate(ro)) == NULL &&
308 (rt = rtcache_update(ro, 1)) == NULL) {
309 dst = &u.dst4;
310 error = rtcache_setdst(ro, &u.dst);
311 if (error != 0)
312 goto bad;
313 }
314
/*
 * The binding taken here is undone with curlwp_bindx() at done:;
 * bind_need_restore records that it was taken, because the
 * "goto bad" above reaches done: without it.
 */
315 bound = curlwp_bind();
316 bind_need_restore = true;
317 /*
318 * If routing to interface only, short circuit routing lookup.
319 */
320 if (flags & IP_ROUTETOIF) {
321 struct ifaddr *ifa;
322
323 ifa = ifa_ifwithladdr_psref(sintocsa(dst), &psref_ia);
324 if (ifa == NULL) {
325 IP_STATINC(IP_STAT_NOROUTE);
326 error = ENETUNREACH;
327 goto bad;
328 }
329 /* ia is already referenced by psref_ia */
330 ia = ifatoia(ifa);
331
332 ifp = ia->ia_ifp;
333 mtu = ifp->if_mtu;
334 ip->ip_ttl = 1;
335 isbroadcast = in_broadcast(dst->sin_addr, ifp);
/*
 * Multicast or limited-broadcast destination with an outgoing
 * interface chosen through the multicast options: use that
 * interface instead of the route.
 */
336 } else if ((IN_MULTICAST(ip->ip_dst.s_addr) ||
337 ip->ip_dst.s_addr == INADDR_BROADCAST) &&
338 imo != NULL && imo->imo_multicast_if_index != 0) {
339 ifp = mifp = if_get_byindex(imo->imo_multicast_if_index, &psref);
340 if (ifp == NULL) {
341 IP_STATINC(IP_STAT_NOROUTE);
342 error = ENETUNREACH;
343 goto bad;
344 }
345 mtu = ifp->if_mtu;
346 ia = in_get_ia_from_ifp_psref(ifp, &psref_ia);
347 if (ia == NULL) {
348 error = EADDRNOTAVAIL;
349 goto bad;
350 }
351 isbroadcast = 0;
/* Otherwise the interface, MTU and next hop come from the route. */
352 } else {
353 if (rt == NULL)
354 rt = rtcache_init(ro);
355 if (rt == NULL) {
356 IP_STATINC(IP_STAT_NOROUTE);
357 error = EHOSTUNREACH;
358 goto bad;
359 }
360 if (ifa_is_destroying(rt->rt_ifa)) {
361 rtcache_unref(rt, ro);
362 rt = NULL;
363 IP_STATINC(IP_STAT_NOROUTE);
364 error = EHOSTUNREACH;
365 goto bad;
366 }
367 ifa_acquire(rt->rt_ifa, &psref_ia);
368 ia = ifatoia(rt->rt_ifa);
369 ifp = rt->rt_ifp;
370 if ((mtu = rt->rt_rmx.rmx_mtu) == 0)
371 mtu = ifp->if_mtu;
372 rt->rt_use++;
373 if (rt->rt_flags & RTF_GATEWAY)
374 dst = satosin(rt->rt_gateway);
375 if (rt->rt_flags & RTF_HOST)
376 isbroadcast = rt->rt_flags & RTF_BROADCAST;
377 else
378 isbroadcast = in_broadcast(dst->sin_addr, ifp);
379 }
/* True only when a route is in use and its MTU metric is not locked. */
380 rtmtu_nolock = rt && (rt->rt_rmx.rmx_locks & RTV_MTU) == 0;
381
382 if (IN_MULTICAST(ip->ip_dst.s_addr) ||
383 (ip->ip_dst.s_addr == INADDR_BROADCAST)) {
384 bool inmgroup;
385
386 m->m_flags |= (ip->ip_dst.s_addr == INADDR_BROADCAST) ?
387 M_BCAST : M_MCAST;
388 /*
389 * See if the caller provided any multicast options
390 */
391 if (imo != NULL)
392 ip->ip_ttl = imo->imo_multicast_ttl;
393 else
394 ip->ip_ttl = IP_DEFAULT_MULTICAST_TTL;
395
396 /*
397 * if we don't know the outgoing ifp yet, we can't generate
398 * output
399 */
400 if (!ifp) {
401 IP_STATINC(IP_STAT_NOROUTE);
402 error = ENETUNREACH;
403 goto bad;
404 }
405
406 /*
407 * If the packet is multicast or broadcast, confirm that
408 * the outgoing interface can transmit it.
409 */
410 if (((m->m_flags & M_MCAST) &&
411 (ifp->if_flags & IFF_MULTICAST) == 0) ||
412 ((m->m_flags & M_BCAST) &&
413 (ifp->if_flags & (IFF_BROADCAST|IFF_POINTOPOINT)) == 0)) {
414 IP_STATINC(IP_STAT_NOROUTE);
415 error = ENETUNREACH;
416 goto bad;
417 }
418 /*
419 * If source address not specified yet, use an address
420 * of outgoing interface.
421 */
422 if (in_nullhost(ip->ip_src)) {
423 struct in_ifaddr *xia;
424 struct ifaddr *xifa;
425 struct psref _psref;
426
427 xia = in_get_ia_from_ifp_psref(ifp, &_psref);
428 if (!xia) {
429 error = EADDRNOTAVAIL;
430 goto bad;
431 }
432 xifa = &xia->ia_ifa;
433 if (xifa->ifa_getifa != NULL) {
434 ia4_release(xia, &_psref);
435 /* FIXME ifa_getifa is NOMPSAFE */
436 xia = ifatoia((*xifa->ifa_getifa)(xifa, rdst));
437 if (xia == NULL) {
438 error = EADDRNOTAVAIL;
439 goto bad;
440 }
441 ia4_acquire(xia, &_psref);
442 }
443 ip->ip_src = xia->ia_addr.sin_addr;
444 ia4_release(xia, &_psref);
445 }
446
447 inmgroup = in_multi_group(ip->ip_dst, ifp, flags);
448 if (inmgroup && (imo == NULL || imo->imo_multicast_loop)) {
449 /*
450 * If we belong to the destination multicast group
451 * on the outgoing interface, and the caller did not
452 * forbid loopback, loop back a copy.
453 */
454 ip_mloopback(ifp, m, &u.dst4);
455 }
456 #ifdef MROUTING
457 else {
458 /*
459 * If we are acting as a multicast router, perform
460 * multicast forwarding as if the packet had just
461 * arrived on the interface to which we are about
462 * to send. The multicast forwarding function
463 * recursively calls this function, using the
464 * IP_FORWARDING flag to prevent infinite recursion.
465 *
466 * Multicasts that are looped back by ip_mloopback(),
467 * above, will be forwarded by the ip_input() routine,
468 * if necessary.
469 */
470 extern struct socket *ip_mrouter;
471
472 if (ip_mrouter && (flags & IP_FORWARDING) == 0) {
473 if (ip_mforward(m, ifp) != 0) {
474 m_freem(m);
475 goto done;
476 }
477 }
478 }
479 #endif
480 /*
481 * Multicasts with a time-to-live of zero may be looped-
482 * back, above, but must not be transmitted on a network.
483 * Also, multicasts addressed to the loopback interface
484 * are not sent -- the above call to ip_mloopback() will
485 * loop back a copy if this host actually belongs to the
486 * destination group on the loopback interface.
487 */
488 if (ip->ip_ttl == 0 || (ifp->if_flags & IFF_LOOPBACK) != 0) {
489 m_freem(m);
490 goto done;
491 }
492 goto sendit;
493 }
494
495 /*
496 * If source address not specified yet, use address
497 * of outgoing interface.
498 */
499 if (in_nullhost(ip->ip_src)) {
500 struct ifaddr *xifa;
501
502 xifa = &ia->ia_ifa;
503 if (xifa->ifa_getifa != NULL) {
504 ia4_release(ia, &psref_ia);
505 /* FIXME ifa_getifa is NOMPSAFE */
506 ia = ifatoia((*xifa->ifa_getifa)(xifa, rdst));
507 if (ia == NULL) {
508 error = EADDRNOTAVAIL;
509 goto bad;
510 }
511 ia4_acquire(ia, &psref_ia);
512 }
513 ip->ip_src = ia->ia_addr.sin_addr;
514 }
515
516 /*
517 * packets with Class-D address as source are not valid per
518 * RFC 1112
519 */
520 if (IN_MULTICAST(ip->ip_src.s_addr)) {
521 IP_STATINC(IP_STAT_ODROPPED);
522 error = EADDRNOTAVAIL;
523 goto bad;
524 }
525
526 /*
527 * Look for broadcast address and verify user is allowed to
528 * send such a packet.
529 */
530 if (isbroadcast) {
531 if ((ifp->if_flags & IFF_BROADCAST) == 0) {
532 error = EADDRNOTAVAIL;
533 goto bad;
534 }
535 if ((flags & IP_ALLOWBROADCAST) == 0) {
536 error = EACCES;
537 goto bad;
538 }
539 /* don't allow broadcast messages to be fragmented */
540 if (ntohs(ip->ip_len) > ifp->if_mtu) {
541 error = EMSGSIZE;
542 goto bad;
543 }
544 m->m_flags |= M_BCAST;
545 } else
546 m->m_flags &= ~M_BCAST;
547
548 sendit:
549 if ((flags & (IP_FORWARDING|IP_NOIPNEWID)) == 0) {
550 if (m->m_pkthdr.len < IP_MINFRAGSIZE) {
551 ip->ip_id = 0;
552 } else if ((m->m_pkthdr.csum_flags & M_CSUM_TSOv4) == 0) {
553 ip->ip_id = ip_newid(ia);
554 } else {
555
556 /*
557 * TSO capable interfaces (typically?) increment
558 * ip_id for each segment.
559 * "allocate" enough ids here to increase the chance
560 * for them to be unique.
561 *
562 * note that the following calculation is not
563 * needed to be precise. wasting some ip_id is fine.
564 */
565
566 unsigned int segsz = m->m_pkthdr.segsz;
567 unsigned int datasz = ntohs(ip->ip_len) - hlen;
568 unsigned int num = howmany(datasz, segsz);
569
570 ip->ip_id = ip_newid_range(ia, num);
571 }
572 }
/*
 * The address used so far is no longer needed; drop its reference.
 * ia is looked up again from ip_src after the pfil hooks have run.
 */
573 if (ia != NULL) {
574 ia4_release(ia, &psref_ia);
575 ia = NULL;
576 }
577
578 /*
579 * If we're doing Path MTU Discovery, we need to set DF unless
580 * the route's MTU is locked.
581 */
582 if ((flags & IP_MTUDISC) != 0 && rtmtu_nolock) {
583 ip->ip_off |= htons(IP_DF);
584 }
585
586 #ifdef IPSEC
587 if (ipsec_used) {
588 bool ipsec_done = false;
589
590 /* Perform IPsec processing, if any. */
591 error = ipsec4_output(m, inp, flags, &sp, &mtu, &natt_frag,
592 &ipsec_done);
593 if (error || ipsec_done)
594 goto done;
595 }
596 #endif
597
598 /*
599 * Run through list of hooks for output packets.
600 */
601 error = pfil_run_hooks(inet_pfil_hook, &m, ifp, PFIL_OUT);
602 if (error)
603 goto done;
/* No error but no packet left: presumably a hook consumed it. */
604 if (m == NULL)
605 goto done;
606
/* The hooks may have replaced the mbuf, so reload the header. */
607 ip = mtod(m, struct ip *);
608 hlen = ip->ip_hl << 2;
609
610 m->m_pkthdr.csum_data |= hlen << 16;
611
612 /*
613 * search for the source address structure to
614 * maintain output statistics.
615 */
616 KASSERT(ia == NULL);
617 ia = in_get_ia_psref(ip->ip_src, &psref_ia);
618
619 /* Ensure we only send from a valid address. */
620 if ((ia != NULL || (flags & IP_FORWARDING) == 0) &&
621 (error = ip_ifaddrvalid(ia)) != 0)
622 {
623 ARPLOG(LOG_ERR,
624 "refusing to send from invalid address %s (pid %d)\n",
625 ARPLOGADDR(ip->ip_src), curproc->p_pid);
626 IP_STATINC(IP_STAT_ODROPPED);
627 if (error == 1)
628 /*
629 * Address exists, but is tentative or detached.
630 * We can't send from it because it's invalid,
631 * so we drop the packet.
632 */
633 error = 0;
634 else
635 error = EADDRNOTAVAIL;
636 goto bad;
637 }
638
639 /* Maybe skip checksums on loopback interfaces. */
640 if (IN_NEED_CHECKSUM(ifp, M_CSUM_IPv4)) {
641 m->m_pkthdr.csum_flags |= M_CSUM_IPv4;
642 }
/* Checksums requested on the packet that the interface cannot offload. */
643 sw_csum = m->m_pkthdr.csum_flags & ~ifp->if_csum_flags_tx;
644 /*
645 * If small enough for mtu of path, or if using TCP segmentation
646 * offload, can just send directly.
647 */
648 if (ntohs(ip->ip_len) <= mtu ||
649 (m->m_pkthdr.csum_flags & M_CSUM_TSOv4) != 0) {
650 const struct sockaddr *sa;
651
652 #if IFA_STATS
653 if (ia)
654 ia->ia_ifa.ifa_data.ifad_outbytes += ntohs(ip->ip_len);
655 #endif
656 /*
657 * Always initialize the sum to 0! Some HW assisted
658 * checksumming requires this.
659 */
660 ip->ip_sum = 0;
661
662 if ((m->m_pkthdr.csum_flags & M_CSUM_TSOv4) == 0) {
663 /*
664 * Perform any checksums that the hardware can't do
665 * for us.
666 *
667 * XXX Does any hardware require the {th,uh}_sum
668 * XXX fields to be 0?
669 */
670 if (sw_csum & M_CSUM_IPv4) {
671 KASSERT(IN_NEED_CHECKSUM(ifp, M_CSUM_IPv4));
672 ip->ip_sum = in_cksum(m, hlen);
673 m->m_pkthdr.csum_flags &= ~M_CSUM_IPv4;
674 }
675 if (sw_csum & (M_CSUM_TCPv4|M_CSUM_UDPv4)) {
676 if (IN_NEED_CHECKSUM(ifp,
677 sw_csum & (M_CSUM_TCPv4|M_CSUM_UDPv4))) {
678 in_delayed_cksum(m);
679 }
680 m->m_pkthdr.csum_flags &=
681 ~(M_CSUM_TCPv4|M_CSUM_UDPv4);
682 }
683 }
684
/* Multicast is sent to the real destination, not the route's next hop. */
685 sa = (m->m_flags & M_MCAST) ? sintocsa(rdst) : sintocsa(dst);
686 if (__predict_true(
687 (m->m_pkthdr.csum_flags & M_CSUM_TSOv4) == 0 ||
688 (ifp->if_capenable & IFCAP_TSOv4) != 0)) {
689 error = ip_if_output(ifp, m, sa, rt);
690 } else {
691 error = ip_tso_output(ifp, m, sa, rt);
692 }
693 goto done;
694 }
695
696 /*
697 * We can't use HW checksumming if we're about to
698 * fragment the packet.
699 *
700 * XXX Some hardware can do this.
701 */
702 if (m->m_pkthdr.csum_flags & (M_CSUM_TCPv4|M_CSUM_UDPv4)) {
703 if (IN_NEED_CHECKSUM(ifp,
704 m->m_pkthdr.csum_flags & (M_CSUM_TCPv4|M_CSUM_UDPv4))) {
705 in_delayed_cksum(m);
706 }
707 m->m_pkthdr.csum_flags &= ~(M_CSUM_TCPv4|M_CSUM_UDPv4);
708 }
709
710 /*
711 * Too large for interface; fragment if possible.
712 * Must be able to put at least 8 bytes per fragment.
713 */
714 if (ntohs(ip->ip_off) & IP_DF) {
715 if (flags & IP_RETURNMTU) {
716 KASSERT(inp != NULL);
717 inp->inp_errormtu = mtu;
718 }
719 error = EMSGSIZE;
720 IP_STATINC(IP_STAT_CANTFRAG);
721 goto bad;
722 }
723
/*
 * On failure ip_fragment() has already freed every mbuf, so m is
 * cleared to keep the m_freem() at bad: from freeing it again.
 */
724 error = ip_fragment(m, ifp, mtu);
725 if (error) {
726 m = NULL;
727 goto bad;
728 }
729
/*
 * Send the fragments in order.  Once one send fails, the remaining
 * fragments are freed instead of being sent.
 */
730 for (; m; m = m0) {
731 m0 = m->m_nextpkt;
732 m->m_nextpkt = 0;
733 if (error) {
734 m_freem(m);
735 continue;
736 }
737 #if IFA_STATS
738 if (ia)
739 ia->ia_ifa.ifa_data.ifad_outbytes += ntohs(ip->ip_len);
740 #endif
741 /*
742 * If we get there, the packet has not been handled by
743 * IPsec whereas it should have. Now that it has been
744 * fragmented, re-inject it in ip_output so that IPsec
745 * processing can occur.
746 */
747 if (natt_frag) {
748 error = ip_output(m, opt, ro,
749 flags | IP_RAWOUTPUT | IP_NOIPNEWID,
750 imo, inp);
751 } else {
752 KASSERT((m->m_pkthdr.csum_flags &
753 (M_CSUM_UDPv4 | M_CSUM_TCPv4)) == 0);
754 error = ip_if_output(ifp, m,
755 (m->m_flags & M_MCAST) ?
756 sintocsa(rdst) : sintocsa(dst), rt);
757 }
758 }
759 if (error == 0) {
760 IP_STATINC(IP_STAT_FRAGMENTED);
761 }
/*
 * Common exit: drop the references taken above (address, route,
 * IPsec policy, multicast interface), free the local route cache if
 * it was used, and restore the LWP binding.
 */
762 done:
763 ia4_release(ia, &psref_ia);
764 rtcache_unref(rt, ro);
765 if (ro == &iproute) {
766 rtcache_free(&iproute);
767 }
768 #ifdef IPSEC
769 if (sp) {
770 KEY_FREESP(&sp);
771 }
772 #endif
773 if (mifp != NULL) {
774 if_put(mifp, &psref);
775 }
776 if (bind_need_restore)
777 curlwp_bindx(bound);
778 return error;
/* Error exit: free the packet, then share the cleanup above. */
779 bad:
780 m_freem(m);
781 goto done;
782 }
783
784 int
785 ip_fragment(struct mbuf *m, struct ifnet *ifp, u_long mtu)
786 {
787 struct ip *ip, *mhip;
788 struct mbuf *m0;
789 int len, hlen, off;
790 int mhlen, firstlen;
791 struct mbuf **mnext;
792 int sw_csum = m->m_pkthdr.csum_flags;
793 int fragments = 0;
794 int error = 0;
795
796 ip = mtod(m, struct ip *);
797 hlen = ip->ip_hl << 2;
798 if (ifp != NULL)
799 sw_csum &= ~ifp->if_csum_flags_tx;
800
801 len = (mtu - hlen) &~ 7;
802 if (len < 8) {
803 m_freem(m);
804 return (EMSGSIZE);
805 }
806
807 firstlen = len;
808 mnext = &m->m_nextpkt;
809
810 /*
811 * Loop through length of segment after first fragment,
812 * make new header and copy data of each part and link onto chain.
813 */
814 m0 = m;
815 mhlen = sizeof (struct ip);
816 for (off = hlen + len; off < ntohs(ip->ip_len); off += len) {
817 MGETHDR(m, M_DONTWAIT, MT_HEADER);
818 if (m == 0) {
819 error = ENOBUFS;
820 IP_STATINC(IP_STAT_ODROPPED);
821 goto sendorfree;
822 }
823 MCLAIM(m, m0->m_owner);
824 *mnext = m;
825 mnext = &m->m_nextpkt;
826 m->m_data += max_linkhdr;
827 mhip = mtod(m, struct ip *);
828 *mhip = *ip;
829 /* we must inherit MCAST and BCAST flags */
830 m->m_flags |= m0->m_flags & (M_MCAST|M_BCAST);
831 if (hlen > sizeof (struct ip)) {
832 mhlen = ip_optcopy(ip, mhip) + sizeof (struct ip);
833 mhip->ip_hl = mhlen >> 2;
834 }
835 m->m_len = mhlen;
836 mhip->ip_off = ((off - hlen) >> 3) +
837 (ntohs(ip->ip_off) & ~IP_MF);
838 if (ip->ip_off & htons(IP_MF))
839 mhip->ip_off |= IP_MF;
840 if (off + len >= ntohs(ip->ip_len))
841 len = ntohs(ip->ip_len) - off;
842 else
843 mhip->ip_off |= IP_MF;
844 HTONS(mhip->ip_off);
845 mhip->ip_len = htons((u_int16_t)(len + mhlen));
846 m->m_next = m_copym(m0, off, len, M_DONTWAIT);
847 if (m->m_next == 0) {
848 error = ENOBUFS; /* ??? */
849 IP_STATINC(IP_STAT_ODROPPED);
850 goto sendorfree;
851 }
852 m->m_pkthdr.len = mhlen + len;
853 m_reset_rcvif(m);
854 mhip->ip_sum = 0;
855 KASSERT((m->m_pkthdr.csum_flags & M_CSUM_IPv4) == 0);
856 if (sw_csum & M_CSUM_IPv4) {
857 mhip->ip_sum = in_cksum(m, mhlen);
858 } else {
859 /*
860 * checksum is hw-offloaded or not necessary.
861 */
862 m->m_pkthdr.csum_flags |=
863 m0->m_pkthdr.csum_flags & M_CSUM_IPv4;
864 m->m_pkthdr.csum_data |= mhlen << 16;
865 KASSERT(!(ifp != NULL &&
866 IN_NEED_CHECKSUM(ifp, M_CSUM_IPv4)) ||
867 (m->m_pkthdr.csum_flags & M_CSUM_IPv4) != 0);
868 }
869 IP_STATINC(IP_STAT_OFRAGMENTS);
870 fragments++;
871 }
872 /*
873 * Update first fragment by trimming what's been copied out
874 * and updating header, then send each fragment (in order).
875 */
876 m = m0;
877 m_adj(m, hlen + firstlen - ntohs(ip->ip_len));
878 m->m_pkthdr.len = hlen + firstlen;
879 ip->ip_len = htons((u_int16_t)m->m_pkthdr.len);
880 ip->ip_off |= htons(IP_MF);
881 ip->ip_sum = 0;
882 if (sw_csum & M_CSUM_IPv4) {
883 ip->ip_sum = in_cksum(m, hlen);
884 m->m_pkthdr.csum_flags &= ~M_CSUM_IPv4;
885 } else {
886 /*
887 * checksum is hw-offloaded or not necessary.
888 */
889 KASSERT(!(ifp != NULL && IN_NEED_CHECKSUM(ifp, M_CSUM_IPv4)) ||
890 (m->m_pkthdr.csum_flags & M_CSUM_IPv4) != 0);
891 KASSERT(M_CSUM_DATA_IPv4_IPHL(m->m_pkthdr.csum_data) >=
892 sizeof(struct ip));
893 }
894 sendorfree:
895 /*
896 * If there is no room for all the fragments, don't queue
897 * any of them.
898 */
899 if (ifp != NULL) {
900 IFQ_LOCK(&ifp->if_snd);
901 if (ifp->if_snd.ifq_maxlen - ifp->if_snd.ifq_len < fragments &&
902 error == 0) {
903 error = ENOBUFS;
904 IP_STATINC(IP_STAT_ODROPPED);
905 IFQ_INC_DROPS(&ifp->if_snd);
906 }
907 IFQ_UNLOCK(&ifp->if_snd);
908 }
909 if (error) {
910 for (m = m0; m; m = m0) {
911 m0 = m->m_nextpkt;
912 m->m_nextpkt = NULL;
913 m_freem(m);
914 }
915 }
916 return (error);
917 }
918
919 /*
920 * Process a delayed payload checksum calculation.
921 */
922 void
923 in_delayed_cksum(struct mbuf *m)
924 {
925 struct ip *ip;
926 u_int16_t csum, offset;
927
928 ip = mtod(m, struct ip *);
929 offset = ip->ip_hl << 2;
930 csum = in4_cksum(m, 0, offset, ntohs(ip->ip_len) - offset);
931 if (csum == 0 && (m->m_pkthdr.csum_flags & M_CSUM_UDPv4) != 0)
932 csum = 0xffff;
933
934 offset += M_CSUM_DATA_IPv4_OFFSET(m->m_pkthdr.csum_data);
935
936 if ((offset + sizeof(u_int16_t)) > m->m_len) {
937 /* This happen when ip options were inserted
938 printf("in_delayed_cksum: pullup len %d off %d proto %d\n",
939 m->m_len, offset, ip->ip_p);
940 */
941 m_copyback(m, offset, sizeof(csum), (void *) &csum);
942 } else
943 *(u_int16_t *)(mtod(m, char *) + offset) = csum;
944 }
945
946 /*
947 * Determine the maximum length of the options to be inserted;
948 * we would far rather allocate too much space rather than too little.
949 */
950
951 u_int
952 ip_optlen(struct inpcb *inp)
953 {
954 struct mbuf *m = inp->inp_options;
955
956 if (m && m->m_len > offsetof(struct ipoption, ipopt_dst)) {
957 return (m->m_len - offsetof(struct ipoption, ipopt_dst));
958 }
959 return 0;
960 }
961
962 /*
963 * Insert IP options into preformed packet.
964 * Adjust IP destination as required for IP source routing,
965 * as indicated by a non-zero in_addr at the start of the options.
966 */
967 static struct mbuf *
968 ip_insertoptions(struct mbuf *m, struct mbuf *opt, int *phlen)
969 {
970 struct ipoption *p = mtod(opt, struct ipoption *);
971 struct mbuf *n;
972 struct ip *ip = mtod(m, struct ip *);
973 unsigned optlen;
974
975 optlen = opt->m_len - sizeof(p->ipopt_dst);
976 if (optlen + ntohs(ip->ip_len) > IP_MAXPACKET)
977 return (m); /* XXX should fail */
978 if (!in_nullhost(p->ipopt_dst))
979 ip->ip_dst = p->ipopt_dst;
980 if (M_READONLY(m) || M_LEADINGSPACE(m) < optlen) {
981 MGETHDR(n, M_DONTWAIT, MT_HEADER);
982 if (n == 0)
983 return (m);
984 MCLAIM(n, m->m_owner);
985 M_MOVE_PKTHDR(n, m);
986 m->m_len -= sizeof(struct ip);
987 m->m_data += sizeof(struct ip);
988 n->m_next = m;
989 m = n;
990 m->m_len = optlen + sizeof(struct ip);
991 m->m_data += max_linkhdr;
992 bcopy((void *)ip, mtod(m, void *), sizeof(struct ip));
993 } else {
994 m->m_data -= optlen;
995 m->m_len += optlen;
996 memmove(mtod(m, void *), ip, sizeof(struct ip));
997 }
998 m->m_pkthdr.len += optlen;
999 ip = mtod(m, struct ip *);
1000 bcopy((void *)p->ipopt_list, (void *)(ip + 1), (unsigned)optlen);
1001 *phlen = sizeof(struct ip) + optlen;
1002 ip->ip_len = htons(ntohs(ip->ip_len) + optlen);
1003 return (m);
1004 }
1005
1006 /*
1007 * Copy options from ip to jp,
1008 * omitting those not copied during fragmentation.
1009 */
1010 int
1011 ip_optcopy(struct ip *ip, struct ip *jp)
1012 {
1013 u_char *cp, *dp;
1014 int opt, optlen, cnt;
1015
1016 cp = (u_char *)(ip + 1);
1017 dp = (u_char *)(jp + 1);
1018 cnt = (ip->ip_hl << 2) - sizeof (struct ip);
1019 for (; cnt > 0; cnt -= optlen, cp += optlen) {
1020 opt = cp[0];
1021 if (opt == IPOPT_EOL)
1022 break;
1023 if (opt == IPOPT_NOP) {
1024 /* Preserve for IP mcast tunnel's LSRR alignment. */
1025 *dp++ = IPOPT_NOP;
1026 optlen = 1;
1027 continue;
1028 }
1029
1030 KASSERT(cnt >= IPOPT_OLEN + sizeof(*cp));
1031 optlen = cp[IPOPT_OLEN];
1032 KASSERT(optlen >= IPOPT_OLEN + sizeof(*cp) && optlen < cnt);
1033
1034 /* Invalid lengths should have been caught by ip_dooptions. */
1035 if (optlen > cnt)
1036 optlen = cnt;
1037 if (IPOPT_COPIED(opt)) {
1038 bcopy((void *)cp, (void *)dp, (unsigned)optlen);
1039 dp += optlen;
1040 }
1041 }
1042 for (optlen = dp - (u_char *)(jp+1); optlen & 0x3; optlen++)
1043 *dp++ = IPOPT_EOL;
1044 return (optlen);
1045 }
1046
1047 /*
1048 * IP socket option processing.
1049 */
1050 int
1051 ip_ctloutput(int op, struct socket *so, struct sockopt *sopt)
1052 {
1053 struct inpcb *inp = sotoinpcb(so);
1054 struct ip *ip = &inp->inp_ip;
1055 int inpflags = inp->inp_flags;
1056 int optval = 0, error = 0;
1057
1058 KASSERT(solocked(so));
1059
1060 if (sopt->sopt_level != IPPROTO_IP) {
1061 if (sopt->sopt_level == SOL_SOCKET && sopt->sopt_name == SO_NOHEADER)
1062 return 0;
1063 return ENOPROTOOPT;
1064 }
1065
1066 switch (op) {
1067 case PRCO_SETOPT:
1068 switch (sopt->sopt_name) {
1069 case IP_OPTIONS:
1070 #ifdef notyet
1071 case IP_RETOPTS:
1072 #endif
1073 error = ip_pcbopts(inp, sopt);
1074 break;
1075
1076 case IP_TOS:
1077 case IP_TTL:
1078 case IP_MINTTL:
1079 case IP_PKTINFO:
1080 case IP_RECVOPTS:
1081 case IP_RECVRETOPTS:
1082 case IP_RECVDSTADDR:
1083 case IP_RECVIF:
1084 case IP_RECVPKTINFO:
1085 case IP_RECVTTL:
1086 error = sockopt_getint(sopt, &optval);
1087 if (error)
1088 break;
1089
1090 switch (sopt->sopt_name) {
1091 case IP_TOS:
1092 ip->ip_tos = optval;
1093 break;
1094
1095 case IP_TTL:
1096 ip->ip_ttl = optval;
1097 break;
1098
1099 case IP_MINTTL:
1100 if (optval > 0 && optval <= MAXTTL)
1101 inp->inp_ip_minttl = optval;
1102 else
1103 error = EINVAL;
1104 break;
1105 #define OPTSET(bit) \
1106 if (optval) \
1107 inpflags |= bit; \
1108 else \
1109 inpflags &= ~bit;
1110
1111 case IP_PKTINFO:
1112 OPTSET(INP_PKTINFO);
1113 break;
1114
1115 case IP_RECVOPTS:
1116 OPTSET(INP_RECVOPTS);
1117 break;
1118
1119 case IP_RECVPKTINFO:
1120 OPTSET(INP_RECVPKTINFO);
1121 break;
1122
1123 case IP_RECVRETOPTS:
1124 OPTSET(INP_RECVRETOPTS);
1125 break;
1126
1127 case IP_RECVDSTADDR:
1128 OPTSET(INP_RECVDSTADDR);
1129 break;
1130
1131 case IP_RECVIF:
1132 OPTSET(INP_RECVIF);
1133 break;
1134
1135 case IP_RECVTTL:
1136 OPTSET(INP_RECVTTL);
1137 break;
1138 }
1139 break;
1140 #undef OPTSET
1141
1142 case IP_MULTICAST_IF:
1143 case IP_MULTICAST_TTL:
1144 case IP_MULTICAST_LOOP:
1145 case IP_ADD_MEMBERSHIP:
1146 case IP_DROP_MEMBERSHIP:
1147 error = ip_setmoptions(&inp->inp_moptions, sopt);
1148 break;
1149
1150 case IP_PORTRANGE:
1151 error = sockopt_getint(sopt, &optval);
1152 if (error)
1153 break;
1154
1155 switch (optval) {
1156 case IP_PORTRANGE_DEFAULT:
1157 case IP_PORTRANGE_HIGH:
1158 inpflags &= ~(INP_LOWPORT);
1159 break;
1160
1161 case IP_PORTRANGE_LOW:
1162 inpflags |= INP_LOWPORT;
1163 break;
1164
1165 default:
1166 error = EINVAL;
1167 break;
1168 }
1169 break;
1170
1171 case IP_PORTALGO:
1172 error = sockopt_getint(sopt, &optval);
1173 if (error)
1174 break;
1175
1176 error = portalgo_algo_index_select(
1177 (struct inpcb_hdr *)inp, optval);
1178 break;
1179
1180 #if defined(IPSEC)
1181 case IP_IPSEC_POLICY:
1182 if (ipsec_enabled) {
1183 error = ipsec4_set_policy(inp, sopt->sopt_name,
1184 sopt->sopt_data, sopt->sopt_size,
1185 curlwp->l_cred);
1186 break;
1187 }
1188 /*FALLTHROUGH*/
1189 #endif /* IPSEC */
1190
1191 default:
1192 error = ENOPROTOOPT;
1193 break;
1194 }
1195 break;
1196
1197 case PRCO_GETOPT:
1198 switch (sopt->sopt_name) {
1199 case IP_OPTIONS:
1200 case IP_RETOPTS: {
1201 struct mbuf *mopts = inp->inp_options;
1202
1203 if (mopts) {
1204 struct mbuf *m;
1205
1206 m = m_copym(mopts, 0, M_COPYALL, M_DONTWAIT);
1207 if (m == NULL) {
1208 error = ENOBUFS;
1209 break;
1210 }
1211 error = sockopt_setmbuf(sopt, m);
1212 }
1213 break;
1214 }
1215 case IP_PKTINFO:
1216 case IP_TOS:
1217 case IP_TTL:
1218 case IP_MINTTL:
1219 case IP_RECVOPTS:
1220 case IP_RECVRETOPTS:
1221 case IP_RECVDSTADDR:
1222 case IP_RECVIF:
1223 case IP_RECVPKTINFO:
1224 case IP_RECVTTL:
1225 case IP_ERRORMTU:
1226 switch (sopt->sopt_name) {
1227 case IP_TOS:
1228 optval = ip->ip_tos;
1229 break;
1230
1231 case IP_TTL:
1232 optval = ip->ip_ttl;
1233 break;
1234
1235 case IP_MINTTL:
1236 optval = inp->inp_ip_minttl;
1237 break;
1238
1239 case IP_ERRORMTU:
1240 optval = inp->inp_errormtu;
1241 break;
1242
1243 #define OPTBIT(bit) (inpflags & bit ? 1 : 0)
1244
1245 case IP_PKTINFO:
1246 optval = OPTBIT(INP_PKTINFO);
1247 break;
1248
1249 case IP_RECVOPTS:
1250 optval = OPTBIT(INP_RECVOPTS);
1251 break;
1252
1253 case IP_RECVPKTINFO:
1254 optval = OPTBIT(INP_RECVPKTINFO);
1255 break;
1256
1257 case IP_RECVRETOPTS:
1258 optval = OPTBIT(INP_RECVRETOPTS);
1259 break;
1260
1261 case IP_RECVDSTADDR:
1262 optval = OPTBIT(INP_RECVDSTADDR);
1263 break;
1264
1265 case IP_RECVIF:
1266 optval = OPTBIT(INP_RECVIF);
1267 break;
1268
1269 case IP_RECVTTL:
1270 optval = OPTBIT(INP_RECVTTL);
1271 break;
1272 }
1273 error = sockopt_setint(sopt, optval);
1274 break;
1275
1276 #if 0 /* defined(IPSEC) */
1277 case IP_IPSEC_POLICY:
1278 {
1279 struct mbuf *m = NULL;
1280
1281 /* XXX this will return EINVAL as sopt is empty */
1282 error = ipsec4_get_policy(inp, sopt->sopt_data,
1283 sopt->sopt_size, &m);
1284 if (error == 0)
1285 error = sockopt_setmbuf(sopt, m);
1286 break;
1287 }
1288 #endif /*IPSEC*/
1289
1290 case IP_MULTICAST_IF:
1291 case IP_MULTICAST_TTL:
1292 case IP_MULTICAST_LOOP:
1293 case IP_ADD_MEMBERSHIP:
1294 case IP_DROP_MEMBERSHIP:
1295 error = ip_getmoptions(inp->inp_moptions, sopt);
1296 break;
1297
1298 case IP_PORTRANGE:
1299 if (inpflags & INP_LOWPORT)
1300 optval = IP_PORTRANGE_LOW;
1301 else
1302 optval = IP_PORTRANGE_DEFAULT;
1303 error = sockopt_setint(sopt, optval);
1304 break;
1305
1306 case IP_PORTALGO:
1307 optval = inp->inp_portalgo;
1308 error = sockopt_setint(sopt, optval);
1309 break;
1310
1311 default:
1312 error = ENOPROTOOPT;
1313 break;
1314 }
1315 break;
1316 }
1317
1318 if (!error) {
1319 inp->inp_flags = inpflags;
1320 }
1321 return error;
1322 }
1323
1324 /*
1325 * Set up IP options in pcb for insertion in output packets.
1326 * Store in mbuf with pointer in pcbopt, adding pseudo-option
1327 * with destination address if source routed.
1328 */
1329 static int
1330 ip_pcbopts(struct inpcb *inp, const struct sockopt *sopt)
1331 {
1332 struct mbuf *m;
1333 const u_char *cp;
1334 u_char *dp;
1335 int cnt;
1336
1337 KASSERT(inp_locked(inp));
1338
1339 /* Turn off any old options. */
1340 if (inp->inp_options) {
1341 m_free(inp->inp_options);
1342 }
1343 inp->inp_options = NULL;
1344 if ((cnt = sopt->sopt_size) == 0) {
1345 /* Only turning off any previous options. */
1346 return 0;
1347 }
1348 cp = sopt->sopt_data;
1349
1350 #ifndef __vax__
1351 if (cnt % sizeof(int32_t))
1352 return (EINVAL);
1353 #endif
1354
1355 m = m_get(M_DONTWAIT, MT_SOOPTS);
1356 if (m == NULL)
1357 return (ENOBUFS);
1358
1359 dp = mtod(m, u_char *);
1360 memset(dp, 0, sizeof(struct in_addr));
1361 dp += sizeof(struct in_addr);
1362 m->m_len = sizeof(struct in_addr);
1363
1364 /*
1365 * IP option list according to RFC791. Each option is of the form
1366 *
1367 * [optval] [olen] [(olen - 2) data bytes]
1368 *
1369 * We validate the list and copy options to an mbuf for prepending
1370 * to data packets. The IP first-hop destination address will be
1371 * stored before actual options and is zero if unset.
1372 */
1373 while (cnt > 0) {
1374 uint8_t optval, olen, offset;
1375
1376 optval = cp[IPOPT_OPTVAL];
1377
1378 if (optval == IPOPT_EOL || optval == IPOPT_NOP) {
1379 olen = 1;
1380 } else {
1381 if (cnt < IPOPT_OLEN + 1)
1382 goto bad;
1383
1384 olen = cp[IPOPT_OLEN];
1385 if (olen < IPOPT_OLEN + 1 || olen > cnt)
1386 goto bad;
1387 }
1388
1389 if (optval == IPOPT_LSRR || optval == IPOPT_SSRR) {
1390 /*
1391 * user process specifies route as:
1392 * ->A->B->C->D
1393 * D must be our final destination (but we can't
1394 * check that since we may not have connected yet).
1395 * A is first hop destination, which doesn't appear in
1396 * actual IP option, but is stored before the options.
1397 */
1398 if (olen < IPOPT_OFFSET + 1 + sizeof(struct in_addr))
1399 goto bad;
1400
1401 offset = cp[IPOPT_OFFSET];
1402 memcpy(mtod(m, u_char *), cp + IPOPT_OFFSET + 1,
1403 sizeof(struct in_addr));
1404
1405 cp += sizeof(struct in_addr);
1406 cnt -= sizeof(struct in_addr);
1407 olen -= sizeof(struct in_addr);
1408
1409 if (m->m_len + olen > MAX_IPOPTLEN + sizeof(struct in_addr))
1410 goto bad;
1411
1412 memcpy(dp, cp, olen);
1413 dp[IPOPT_OPTVAL] = optval;
1414 dp[IPOPT_OLEN] = olen;
1415 dp[IPOPT_OFFSET] = offset;
1416 break;
1417 } else {
1418 if (m->m_len + olen > MAX_IPOPTLEN + sizeof(struct in_addr))
1419 goto bad;
1420
1421 memcpy(dp, cp, olen);
1422 break;
1423 }
1424
1425 dp += olen;
1426 m->m_len += olen;
1427
1428 if (optval == IPOPT_EOL)
1429 break;
1430
1431 cp += olen;
1432 cnt -= olen;
1433 }
1434
1435 inp->inp_options = m;
1436 return 0;
1437 bad:
1438 (void)m_free(m);
1439 return EINVAL;
1440 }
1441
1442 /*
1443 * following RFC1724 section 3.3, 0.0.0.0/8 is interpreted as interface index.
1444 * Must be called in a pserialize critical section.
1445 */
1446 static struct ifnet *
1447 ip_multicast_if(struct in_addr *a, int *ifindexp)
1448 {
1449 int ifindex;
1450 struct ifnet *ifp = NULL;
1451 struct in_ifaddr *ia;
1452
1453 if (ifindexp)
1454 *ifindexp = 0;
1455 if (ntohl(a->s_addr) >> 24 == 0) {
1456 ifindex = ntohl(a->s_addr) & 0xffffff;
1457 ifp = if_byindex(ifindex);
1458 if (!ifp)
1459 return NULL;
1460 if (ifindexp)
1461 *ifindexp = ifindex;
1462 } else {
1463 IN_ADDRHASH_READER_FOREACH(ia, a->s_addr) {
1464 if (in_hosteq(ia->ia_addr.sin_addr, *a) &&
1465 (ia->ia_ifp->if_flags & IFF_MULTICAST) != 0) {
1466 ifp = ia->ia_ifp;
1467 if (if_is_deactivated(ifp))
1468 ifp = NULL;
1469 break;
1470 }
1471 }
1472 }
1473 return ifp;
1474 }
1475
1476 static int
1477 ip_getoptval(const struct sockopt *sopt, u_int8_t *val, u_int maxval)
1478 {
1479 u_int tval;
1480 u_char cval;
1481 int error;
1482
1483 if (sopt == NULL)
1484 return EINVAL;
1485
1486 switch (sopt->sopt_size) {
1487 case sizeof(u_char):
1488 error = sockopt_get(sopt, &cval, sizeof(u_char));
1489 tval = cval;
1490 break;
1491
1492 case sizeof(u_int):
1493 error = sockopt_get(sopt, &tval, sizeof(u_int));
1494 break;
1495
1496 default:
1497 error = EINVAL;
1498 }
1499
1500 if (error)
1501 return error;
1502
1503 if (tval > maxval)
1504 return EINVAL;
1505
1506 *val = tval;
1507 return 0;
1508 }
1509
1510 static int
1511 ip_get_membership(const struct sockopt *sopt, struct ifnet **ifp,
1512 struct psref *psref, struct in_addr *ia, bool add)
1513 {
1514 int error;
1515 struct ip_mreq mreq;
1516
1517 error = sockopt_get(sopt, &mreq, sizeof(mreq));
1518 if (error)
1519 return error;
1520
1521 if (!IN_MULTICAST(mreq.imr_multiaddr.s_addr))
1522 return EINVAL;
1523
1524 memcpy(ia, &mreq.imr_multiaddr, sizeof(*ia));
1525
1526 if (in_nullhost(mreq.imr_interface)) {
1527 union {
1528 struct sockaddr dst;
1529 struct sockaddr_in dst4;
1530 } u;
1531 struct route ro;
1532
1533 if (!add) {
1534 *ifp = NULL;
1535 return 0;
1536 }
1537 /*
1538 * If no interface address was provided, use the interface of
1539 * the route to the given multicast address.
1540 */
1541 struct rtentry *rt;
1542 memset(&ro, 0, sizeof(ro));
1543
1544 sockaddr_in_init(&u.dst4, ia, 0);
1545 error = rtcache_setdst(&ro, &u.dst);
1546 if (error != 0)
1547 return error;
1548 *ifp = (rt = rtcache_init(&ro)) != NULL ? rt->rt_ifp : NULL;
1549 if (*ifp != NULL) {
1550 if (if_is_deactivated(*ifp))
1551 *ifp = NULL;
1552 else
1553 if_acquire(*ifp, psref);
1554 }
1555 rtcache_unref(rt, &ro);
1556 rtcache_free(&ro);
1557 } else {
1558 int s = pserialize_read_enter();
1559 *ifp = ip_multicast_if(&mreq.imr_interface, NULL);
1560 if (!add && *ifp == NULL) {
1561 pserialize_read_exit(s);
1562 return EADDRNOTAVAIL;
1563 }
1564 if (*ifp != NULL) {
1565 if (if_is_deactivated(*ifp))
1566 *ifp = NULL;
1567 else
1568 if_acquire(*ifp, psref);
1569 }
1570 pserialize_read_exit(s);
1571 }
1572 return 0;
1573 }
1574
1575 /*
1576 * Add a multicast group membership.
1577 * Group must be a valid IP multicast address.
1578 */
1579 static int
1580 ip_add_membership(struct ip_moptions *imo, const struct sockopt *sopt)
1581 {
1582 struct ifnet *ifp = NULL; // XXX: gcc [ppc]
1583 struct in_addr ia;
1584 int i, error, bound;
1585 struct psref psref;
1586
1587 /* imo is protected by solock or referenced only by the caller */
1588
1589 bound = curlwp_bind();
1590 if (sopt->sopt_size == sizeof(struct ip_mreq))
1591 error = ip_get_membership(sopt, &ifp, &psref, &ia, true);
1592 else
1593 #ifdef INET6
1594 error = ip6_get_membership(sopt, &ifp, &psref, &ia, sizeof(ia));
1595 #else
1596 error = EINVAL;
1597 goto out;
1598 #endif
1599
1600 if (error)
1601 goto out;
1602
1603 /*
1604 * See if we found an interface, and confirm that it
1605 * supports multicast.
1606 */
1607 if (ifp == NULL || (ifp->if_flags & IFF_MULTICAST) == 0) {
1608 error = EADDRNOTAVAIL;
1609 goto out;
1610 }
1611
1612 /*
1613 * See if the membership already exists or if all the
1614 * membership slots are full.
1615 */
1616 for (i = 0; i < imo->imo_num_memberships; ++i) {
1617 if (imo->imo_membership[i]->inm_ifp == ifp &&
1618 in_hosteq(imo->imo_membership[i]->inm_addr, ia))
1619 break;
1620 }
1621 if (i < imo->imo_num_memberships) {
1622 error = EADDRINUSE;
1623 goto out;
1624 }
1625
1626 if (i == IP_MAX_MEMBERSHIPS) {
1627 error = ETOOMANYREFS;
1628 goto out;
1629 }
1630
1631 /*
1632 * Everything looks good; add a new record to the multicast
1633 * address list for the given interface.
1634 */
1635 if ((imo->imo_membership[i] = in_addmulti(&ia, ifp)) == NULL) {
1636 error = ENOBUFS;
1637 goto out;
1638 }
1639
1640 ++imo->imo_num_memberships;
1641 error = 0;
1642 out:
1643 if_put(ifp, &psref);
1644 curlwp_bindx(bound);
1645 return error;
1646 }
1647
1648 /*
1649 * Drop a multicast group membership.
1650 * Group must be a valid IP multicast address.
1651 */
1652 static int
1653 ip_drop_membership(struct ip_moptions *imo, const struct sockopt *sopt)
1654 {
1655 struct in_addr ia = { .s_addr = 0 }; // XXX: gcc [ppc]
1656 struct ifnet *ifp = NULL; // XXX: gcc [ppc]
1657 int i, error, bound;
1658 struct psref psref;
1659
1660 /* imo is protected by solock or referenced only by the caller */
1661
1662 bound = curlwp_bind();
1663 if (sopt->sopt_size == sizeof(struct ip_mreq))
1664 error = ip_get_membership(sopt, &ifp, &psref, &ia, false);
1665 else
1666 #ifdef INET6
1667 error = ip6_get_membership(sopt, &ifp, &psref, &ia, sizeof(ia));
1668 #else
1669 error = EINVAL;
1670 goto out;
1671 #endif
1672
1673 if (error)
1674 goto out;
1675
1676 /*
1677 * Find the membership in the membership array.
1678 */
1679 for (i = 0; i < imo->imo_num_memberships; ++i) {
1680 if ((ifp == NULL ||
1681 imo->imo_membership[i]->inm_ifp == ifp) &&
1682 in_hosteq(imo->imo_membership[i]->inm_addr, ia))
1683 break;
1684 }
1685 if (i == imo->imo_num_memberships) {
1686 error = EADDRNOTAVAIL;
1687 goto out;
1688 }
1689
1690 /*
1691 * Give up the multicast address record to which the
1692 * membership points.
1693 */
1694 in_delmulti(imo->imo_membership[i]);
1695
1696 /*
1697 * Remove the gap in the membership array.
1698 */
1699 for (++i; i < imo->imo_num_memberships; ++i)
1700 imo->imo_membership[i-1] = imo->imo_membership[i];
1701 --imo->imo_num_memberships;
1702 error = 0;
1703 out:
1704 curlwp_bindx(bound);
1705 if_put(ifp, &psref);
1706 return error;
1707 }
1708
1709 /*
1710 * Set the IP multicast options in response to user setsockopt().
1711 */
1712 int
1713 ip_setmoptions(struct ip_moptions **pimo, const struct sockopt *sopt)
1714 {
1715 struct ip_moptions *imo = *pimo;
1716 struct in_addr addr;
1717 struct ifnet *ifp;
1718 int ifindex, error = 0;
1719
1720 /* The passed imo isn't NULL, it should be protected by solock */
1721
1722 if (!imo) {
1723 /*
1724 * No multicast option buffer attached to the pcb;
1725 * allocate one and initialize to default values.
1726 */
1727 imo = kmem_intr_alloc(sizeof(*imo), KM_NOSLEEP);
1728 if (imo == NULL)
1729 return ENOBUFS;
1730
1731 imo->imo_multicast_if_index = 0;
1732 imo->imo_multicast_addr.s_addr = INADDR_ANY;
1733 imo->imo_multicast_ttl = IP_DEFAULT_MULTICAST_TTL;
1734 imo->imo_multicast_loop = IP_DEFAULT_MULTICAST_LOOP;
1735 imo->imo_num_memberships = 0;
1736 *pimo = imo;
1737 }
1738
1739 switch (sopt->sopt_name) {
1740 case IP_MULTICAST_IF: {
1741 int s;
1742 /*
1743 * Select the interface for outgoing multicast packets.
1744 */
1745 error = sockopt_get(sopt, &addr, sizeof(addr));
1746 if (error)
1747 break;
1748
1749 /*
1750 * INADDR_ANY is used to remove a previous selection.
1751 * When no interface is selected, a default one is
1752 * chosen every time a multicast packet is sent.
1753 */
1754 if (in_nullhost(addr)) {
1755 imo->imo_multicast_if_index = 0;
1756 break;
1757 }
1758 /*
1759 * The selected interface is identified by its local
1760 * IP address. Find the interface and confirm that
1761 * it supports multicasting.
1762 */
1763 s = pserialize_read_enter();
1764 ifp = ip_multicast_if(&addr, &ifindex);
1765 if (ifp == NULL || (ifp->if_flags & IFF_MULTICAST) == 0) {
1766 pserialize_read_exit(s);
1767 error = EADDRNOTAVAIL;
1768 break;
1769 }
1770 imo->imo_multicast_if_index = ifp->if_index;
1771 pserialize_read_exit(s);
1772 if (ifindex)
1773 imo->imo_multicast_addr = addr;
1774 else
1775 imo->imo_multicast_addr.s_addr = INADDR_ANY;
1776 break;
1777 }
1778
1779 case IP_MULTICAST_TTL:
1780 /*
1781 * Set the IP time-to-live for outgoing multicast packets.
1782 */
1783 error = ip_getoptval(sopt, &imo->imo_multicast_ttl, MAXTTL);
1784 break;
1785
1786 case IP_MULTICAST_LOOP:
1787 /*
1788 * Set the loopback flag for outgoing multicast packets.
1789 * Must be zero or one.
1790 */
1791 error = ip_getoptval(sopt, &imo->imo_multicast_loop, 1);
1792 break;
1793
1794 case IP_ADD_MEMBERSHIP: /* IPV6_JOIN_GROUP */
1795 error = ip_add_membership(imo, sopt);
1796 break;
1797
1798 case IP_DROP_MEMBERSHIP: /* IPV6_LEAVE_GROUP */
1799 error = ip_drop_membership(imo, sopt);
1800 break;
1801
1802 default:
1803 error = EOPNOTSUPP;
1804 break;
1805 }
1806
1807 /*
1808 * If all options have default values, no need to keep the mbuf.
1809 */
1810 if (imo->imo_multicast_if_index == 0 &&
1811 imo->imo_multicast_ttl == IP_DEFAULT_MULTICAST_TTL &&
1812 imo->imo_multicast_loop == IP_DEFAULT_MULTICAST_LOOP &&
1813 imo->imo_num_memberships == 0) {
1814 kmem_free(imo, sizeof(*imo));
1815 *pimo = NULL;
1816 }
1817
1818 return error;
1819 }
1820
1821 /*
1822 * Return the IP multicast options in response to user getsockopt().
1823 */
1824 int
1825 ip_getmoptions(struct ip_moptions *imo, struct sockopt *sopt)
1826 {
1827 struct in_addr addr;
1828 uint8_t optval;
1829 int error = 0;
1830
1831 /* imo is protected by solock or refereced only by the caller */
1832
1833 switch (sopt->sopt_name) {
1834 case IP_MULTICAST_IF:
1835 if (imo == NULL || imo->imo_multicast_if_index == 0)
1836 addr = zeroin_addr;
1837 else if (imo->imo_multicast_addr.s_addr) {
1838 /* return the value user has set */
1839 addr = imo->imo_multicast_addr;
1840 } else {
1841 struct ifnet *ifp;
1842 struct in_ifaddr *ia = NULL;
1843 int s = pserialize_read_enter();
1844
1845 ifp = if_byindex(imo->imo_multicast_if_index);
1846 if (ifp != NULL) {
1847 ia = in_get_ia_from_ifp(ifp);
1848 }
1849 addr = ia ? ia->ia_addr.sin_addr : zeroin_addr;
1850 pserialize_read_exit(s);
1851 }
1852 error = sockopt_set(sopt, &addr, sizeof(addr));
1853 break;
1854
1855 case IP_MULTICAST_TTL:
1856 optval = imo ? imo->imo_multicast_ttl
1857 : IP_DEFAULT_MULTICAST_TTL;
1858
1859 error = sockopt_set(sopt, &optval, sizeof(optval));
1860 break;
1861
1862 case IP_MULTICAST_LOOP:
1863 optval = imo ? imo->imo_multicast_loop
1864 : IP_DEFAULT_MULTICAST_LOOP;
1865
1866 error = sockopt_set(sopt, &optval, sizeof(optval));
1867 break;
1868
1869 default:
1870 error = EOPNOTSUPP;
1871 }
1872
1873 return error;
1874 }
1875
1876 /*
1877 * Discard the IP multicast options.
1878 */
1879 void
1880 ip_freemoptions(struct ip_moptions *imo)
1881 {
1882 int i;
1883
1884 /* The owner of imo (inp) should be protected by solock */
1885
1886 if (imo != NULL) {
1887 for (i = 0; i < imo->imo_num_memberships; ++i)
1888 in_delmulti(imo->imo_membership[i]);
1889 kmem_free(imo, sizeof(*imo));
1890 }
1891 }
1892
1893 /*
1894 * Routine called from ip_output() to loop back a copy of an IP multicast
1895 * packet to the input queue of a specified interface. Note that this
1896 * calls the output routine of the loopback "driver", but with an interface
1897 * pointer that might NOT be lo0ifp -- easier than replicating that code here.
1898 */
1899 static void
1900 ip_mloopback(struct ifnet *ifp, struct mbuf *m, const struct sockaddr_in *dst)
1901 {
1902 struct ip *ip;
1903 struct mbuf *copym;
1904
1905 copym = m_copypacket(m, M_DONTWAIT);
1906 if (copym != NULL &&
1907 (copym->m_flags & M_EXT || copym->m_len < sizeof(struct ip)))
1908 copym = m_pullup(copym, sizeof(struct ip));
1909 if (copym == NULL)
1910 return;
1911 /*
1912 * We don't bother to fragment if the IP length is greater
1913 * than the interface's MTU. Can this possibly matter?
1914 */
1915 ip = mtod(copym, struct ip *);
1916
1917 if (copym->m_pkthdr.csum_flags & (M_CSUM_TCPv4|M_CSUM_UDPv4)) {
1918 in_delayed_cksum(copym);
1919 copym->m_pkthdr.csum_flags &=
1920 ~(M_CSUM_TCPv4|M_CSUM_UDPv4);
1921 }
1922
1923 ip->ip_sum = 0;
1924 ip->ip_sum = in_cksum(copym, ip->ip_hl << 2);
1925 #ifndef NET_MPSAFE
1926 KERNEL_LOCK(1, NULL);
1927 #endif
1928 (void)looutput(ifp, copym, sintocsa(dst), NULL);
1929 #ifndef NET_MPSAFE
1930 KERNEL_UNLOCK_ONE(NULL);
1931 #endif
1932 }
1933
1934 /*
1935 * Ensure sending address is valid.
1936 * Returns 0 on success, -1 if an error should be sent back or 1
1937 * if the packet could be dropped without error (protocol dependent).
1938 */
1939 static int
1940 ip_ifaddrvalid(const struct in_ifaddr *ia)
1941 {
1942
1943 if (ia == NULL)
1944 return -1;
1945
1946 if (ia->ia_addr.sin_addr.s_addr == INADDR_ANY)
1947 return 0;
1948
1949 if (ia->ia4_flags & IN_IFF_DUPLICATED)
1950 return -1;
1951 else if (ia->ia4_flags & (IN_IFF_TENTATIVE | IN_IFF_DETACHED))
1952 return 1;
1953
1954 return 0;
1955 }
1956