tcp_subr.c revision 1.82 1 /* $NetBSD: tcp_subr.c,v 1.82 1999/12/08 16:22:20 itojun Exp $ */
2
3 /*
4 * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
5 * All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in the
14 * documentation and/or other materials provided with the distribution.
15 * 3. Neither the name of the project nor the names of its contributors
16 * may be used to endorse or promote products derived from this software
17 * without specific prior written permission.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
20 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22 * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
23 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
25 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
26 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
29 * SUCH DAMAGE.
30 */
31
32 /*-
33 * Copyright (c) 1997, 1998 The NetBSD Foundation, Inc.
34 * All rights reserved.
35 *
36 * This code is derived from software contributed to The NetBSD Foundation
37 * by Jason R. Thorpe and Kevin M. Lahey of the Numerical Aerospace Simulation
38 * Facility, NASA Ames Research Center.
39 *
40 * Redistribution and use in source and binary forms, with or without
41 * modification, are permitted provided that the following conditions
42 * are met:
43 * 1. Redistributions of source code must retain the above copyright
44 * notice, this list of conditions and the following disclaimer.
45 * 2. Redistributions in binary form must reproduce the above copyright
46 * notice, this list of conditions and the following disclaimer in the
47 * documentation and/or other materials provided with the distribution.
48 * 3. All advertising materials mentioning features or use of this software
49 * must display the following acknowledgement:
50 * This product includes software developed by the NetBSD
51 * Foundation, Inc. and its contributors.
52 * 4. Neither the name of The NetBSD Foundation nor the names of its
53 * contributors may be used to endorse or promote products derived
54 * from this software without specific prior written permission.
55 *
56 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
57 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
58 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
59 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
60 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
61 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
62 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
63 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
64 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
65 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
66 * POSSIBILITY OF SUCH DAMAGE.
67 */
68
69 /*
70 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995
71 * The Regents of the University of California. All rights reserved.
72 *
73 * Redistribution and use in source and binary forms, with or without
74 * modification, are permitted provided that the following conditions
75 * are met:
76 * 1. Redistributions of source code must retain the above copyright
77 * notice, this list of conditions and the following disclaimer.
78 * 2. Redistributions in binary form must reproduce the above copyright
79 * notice, this list of conditions and the following disclaimer in the
80 * documentation and/or other materials provided with the distribution.
81 * 3. All advertising materials mentioning features or use of this software
82 * must display the following acknowledgement:
83 * This product includes software developed by the University of
84 * California, Berkeley and its contributors.
85 * 4. Neither the name of the University nor the names of its contributors
86 * may be used to endorse or promote products derived from this software
87 * without specific prior written permission.
88 *
89 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
90 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
91 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
92 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
93 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
94 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
95 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
96 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
97 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
98 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
99 * SUCH DAMAGE.
100 *
101 * @(#)tcp_subr.c 8.2 (Berkeley) 5/24/95
102 */
103
104 #include "opt_inet.h"
105 #include "opt_ipsec.h"
106 #include "opt_tcp_compat_42.h"
107 #include "rnd.h"
108
109 #include <sys/param.h>
110 #include <sys/proc.h>
111 #include <sys/systm.h>
112 #include <sys/malloc.h>
113 #include <sys/mbuf.h>
114 #include <sys/socket.h>
115 #include <sys/socketvar.h>
116 #include <sys/protosw.h>
117 #include <sys/errno.h>
118 #include <sys/kernel.h>
119 #include <sys/pool.h>
120 #if NRND > 0
121 #include <sys/rnd.h>
122 #endif
123
124 #include <net/route.h>
125 #include <net/if.h>
126
127 #include <netinet/in.h>
128 #include <netinet/in_systm.h>
129 #include <netinet/ip.h>
130 #include <netinet/in_pcb.h>
131 #include <netinet/ip_var.h>
132 #include <netinet/ip_icmp.h>
133
134 #ifdef INET6
135 #ifndef INET
136 #include <netinet/in.h>
137 #endif
138 #include <netinet/ip6.h>
139 #include <netinet6/in6_pcb.h>
140 #include <netinet6/ip6_var.h>
141 #include <netinet6/in6_var.h>
142 #include <netinet6/ip6protosw.h>
143 #endif
144
145 #include <netinet/tcp.h>
146 #include <netinet/tcp_fsm.h>
147 #include <netinet/tcp_seq.h>
148 #include <netinet/tcp_timer.h>
149 #include <netinet/tcp_var.h>
150 #include <netinet/tcpip.h>
151
152 #ifdef IPSEC
153 #include <netinet6/ipsec.h>
154 #endif /*IPSEC*/
155
156 #ifdef INET6
157 struct in6pcb tcb6;
158 #endif
159
160 /* patchable/settable parameters for tcp */
161 int tcp_mssdflt = TCP_MSS;
162 int tcp_rttdflt = TCPTV_SRTTDFLT / PR_SLOWHZ;
163 int tcp_do_rfc1323 = 1; /* window scaling / timestamps (obsolete) */
164 int tcp_do_sack = 1; /* selective acknowledgement */
165 int tcp_do_win_scale = 1; /* RFC1323 window scaling */
166 int tcp_do_timestamps = 1; /* RFC1323 timestamps */
167 int tcp_do_newreno = 0; /* Use the New Reno algorithms */
168 int tcp_ack_on_push = 0; /* set to enable immediate ACK-on-PUSH */
169 int tcp_init_win = 1;
170 int tcp_mss_ifmtu = 0;
171 #ifdef TCP_COMPAT_42
172 int tcp_compat_42 = 1;
173 #else
174 int tcp_compat_42 = 0;
175 #endif
176
177 #ifndef TCBHASHSIZE
178 #define TCBHASHSIZE 128
179 #endif
180 int tcbhashsize = TCBHASHSIZE;
181
182 int tcp_freeq __P((struct tcpcb *));
183
184 struct pool tcpcb_pool;
185
186 /*
187 * Tcp initialization
188 */
189 void
190 tcp_init()
191 {
192 int hlen;
193
194 pool_init(&tcpcb_pool, sizeof(struct tcpcb), 0, 0, 0, "tcpcbpl",
195 0, NULL, NULL, M_PCB);
196 in_pcbinit(&tcbtable, tcbhashsize, tcbhashsize);
197 #ifdef INET6
198 tcb6.in6p_next = tcb6.in6p_prev = &tcb6;
199 #endif
200 LIST_INIT(&tcp_delacks);
201
202 hlen = sizeof(struct ip) + sizeof(struct tcphdr);
203 #ifdef INET6
204 if (sizeof(struct ip) < sizeof(struct ip6_hdr))
205 hlen = sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
206 #endif
207 if (max_protohdr < hlen)
208 max_protohdr = hlen;
209 if (max_linkhdr + hlen > MHLEN)
210 panic("tcp_init");
211
212 /* Initialize the compressed state engine. */
213 syn_cache_init();
214 }
215
216 /*
217 * Create template to be used to send tcp packets on a connection.
218 * Call after host entry created, allocates an mbuf and fills
219 * in a skeletal tcp/ip header, minimizing the amount of work
220 * necessary when the connection is used.
221 */
222 struct mbuf *
223 tcp_template(tp)
224 struct tcpcb *tp;
225 {
226 register struct inpcb *inp = tp->t_inpcb;
227 #ifdef INET6
228 register struct in6pcb *in6p = tp->t_in6pcb;
229 #endif
230 register struct tcphdr *n;
231 register struct mbuf *m;
232 int hlen;
233
234 switch (tp->t_family) {
235 case AF_INET:
236 hlen = sizeof(struct ip);
237 if (inp)
238 break;
239 #ifdef INET6
240 if (in6p) {
241 /* mapped addr case */
242 if (IN6_IS_ADDR_V4MAPPED(&in6p->in6p_laddr)
243 && IN6_IS_ADDR_V4MAPPED(&in6p->in6p_faddr))
244 break;
245 }
246 #endif
247 return NULL; /*EINVAL*/
248 #ifdef INET6
249 case AF_INET6:
250 hlen = sizeof(struct ip6_hdr);
251 if (in6p) {
252 /* more sainty check? */
253 break;
254 }
255 return NULL; /*EINVAL*/
256 #endif
257 default:
258 hlen = 0; /*pacify gcc*/
259 return NULL; /*EAFNOSUPPORT*/
260 }
261 if ((m = tp->t_template) == 0) {
262 MGETHDR(m, M_DONTWAIT, MT_HEADER);
263 if (m) {
264 MCLGET(m, M_DONTWAIT);
265 if ((m->m_flags & M_EXT) == 0) {
266 m_free(m);
267 m = NULL;
268 }
269 }
270 if (m == NULL)
271 return NULL;
272 m->m_pkthdr.len = m->m_len = hlen + sizeof(struct tcphdr);
273 }
274 bzero(mtod(m, caddr_t), m->m_len);
275 switch (tp->t_family) {
276 case AF_INET:
277 {
278 struct ipovly *ipov;
279 mtod(m, struct ip *)->ip_v = 4;
280 ipov = mtod(m, struct ipovly *);
281 ipov->ih_pr = IPPROTO_TCP;
282 ipov->ih_len = htons(sizeof(struct tcphdr));
283 if (inp) {
284 ipov->ih_src = inp->inp_laddr;
285 ipov->ih_dst = inp->inp_faddr;
286 }
287 #ifdef INET6
288 else if (in6p) {
289 /* mapped addr case */
290 bcopy(&in6p->in6p_laddr.s6_addr32[3], &ipov->ih_src,
291 sizeof(ipov->ih_src));
292 bcopy(&in6p->in6p_faddr.s6_addr32[3], &ipov->ih_dst,
293 sizeof(ipov->ih_dst));
294 }
295 #endif
296 break;
297 }
298 #ifdef INET6
299 case AF_INET6:
300 {
301 struct ip6_hdr *ip6;
302 mtod(m, struct ip *)->ip_v = 6;
303 ip6 = mtod(m, struct ip6_hdr *);
304 ip6->ip6_nxt = IPPROTO_TCP;
305 ip6->ip6_plen = htons(sizeof(struct tcphdr));
306 ip6->ip6_src = in6p->in6p_laddr;
307 ip6->ip6_dst = in6p->in6p_faddr;
308 ip6->ip6_flow = in6p->in6p_flowinfo & IPV6_FLOWINFO_MASK;
309 if (ip6_auto_flowlabel) {
310 ip6->ip6_flow &= ~IPV6_FLOWLABEL_MASK;
311 ip6->ip6_flow |=
312 (htonl(ip6_flow_seq++) & IPV6_FLOWLABEL_MASK);
313 }
314 ip6->ip6_vfc = IPV6_VERSION;
315 break;
316 }
317 #endif
318 }
319 n = (struct tcphdr *)(mtod(m, caddr_t) + hlen);
320 if (inp) {
321 n->th_sport = inp->inp_lport;
322 n->th_dport = inp->inp_fport;
323 }
324 #ifdef INET6
325 else if (in6p) {
326 n->th_sport = in6p->in6p_lport;
327 n->th_dport = in6p->in6p_fport;
328 }
329 #endif
330 n->th_seq = 0;
331 n->th_ack = 0;
332 n->th_x2 = 0;
333 n->th_off = 5;
334 n->th_flags = 0;
335 n->th_win = 0;
336 n->th_sum = 0;
337 n->th_urp = 0;
338 return (m);
339 }
340
341 /*
342 * Send a single message to the TCP at address specified by
343 * the given TCP/IP header. If m == 0, then we make a copy
344 * of the tcpiphdr at ti and send directly to the addressed host.
345 * This is used to force keep alive messages out using the TCP
346 * template for a connection tp->t_template. If flags are given
347 * then we send a message back to the TCP which originated the
348 * segment ti, and discard the mbuf containing it and any other
349 * attached mbufs.
350 *
351 * In any case the ack and sequence number of the transmitted
352 * segment are as specified by the parameters.
353 */
354 int
355 tcp_respond(tp, template, m, th0, ack, seq, flags)
356 struct tcpcb *tp;
357 struct mbuf *template;
358 register struct mbuf *m;
359 struct tcphdr *th0;
360 tcp_seq ack, seq;
361 int flags;
362 {
363 #ifndef INET6
364 struct route iproute;
365 #else
366 struct route_in6 iproute; /* sizeof(route_in6) > sizeof(route) */
367 #endif
368 struct route *ro;
369 struct rtentry *rt;
370 int error, tlen, win = 0;
371 int hlen;
372 struct ip *ip;
373 #ifdef INET6
374 struct ip6_hdr *ip6;
375 #endif
376 int family; /* family on packet, not inpcb/in6pcb! */
377 struct tcphdr *th;
378
379 if (tp != NULL && (flags & TH_RST) == 0) {
380 if (tp->t_inpcb)
381 win = sbspace(&tp->t_inpcb->inp_socket->so_rcv);
382 #ifdef INET6
383 else if (tp->t_in6pcb)
384 win = sbspace(&tp->t_in6pcb->in6p_socket->so_rcv);
385 #endif
386 }
387
388 ip = NULL;
389 #ifdef INET6
390 ip6 = NULL;
391 #endif
392 if (m == 0) {
393 if (!template)
394 return EINVAL;
395
396 /* get family information from template */
397 switch (mtod(template, struct ip *)->ip_v) {
398 case 4:
399 family = AF_INET;
400 hlen = sizeof(struct ip);
401 break;
402 #ifdef INET6
403 case 6:
404 family = AF_INET6;
405 hlen = sizeof(struct ip6_hdr);
406 break;
407 #endif
408 default:
409 return EAFNOSUPPORT;
410 }
411
412 MGETHDR(m, M_DONTWAIT, MT_HEADER);
413 if (m) {
414 MCLGET(m, M_DONTWAIT);
415 if ((m->m_flags & M_EXT) == 0) {
416 m_free(m);
417 m = NULL;
418 }
419 }
420 if (m == NULL)
421 return (ENOBUFS);
422
423 if (tcp_compat_42)
424 tlen = 1;
425 else
426 tlen = 0;
427
428 m->m_data += max_linkhdr;
429 bcopy(mtod(template, caddr_t), mtod(m, caddr_t),
430 template->m_len);
431 switch (family) {
432 case AF_INET:
433 ip = mtod(m, struct ip *);
434 th = (struct tcphdr *)(ip + 1);
435 break;
436 #ifdef INET6
437 case AF_INET6:
438 ip6 = mtod(m, struct ip6_hdr *);
439 th = (struct tcphdr *)(ip6 + 1);
440 break;
441 #endif
442 default: /*pacify gcc*/
443 ip = NULL;
444 #ifdef INET6
445 ip6 = NULL;
446 #endif
447 th = NULL;
448 break;
449 }
450 flags = TH_ACK;
451 } else {
452 /* get family information from m */
453 switch (mtod(m, struct ip *)->ip_v) {
454 case 4:
455 family = AF_INET;
456 hlen = sizeof(struct ip);
457 break;
458 #ifdef INET6
459 case 6:
460 family = AF_INET6;
461 hlen = sizeof(struct ip6_hdr);
462 break;
463 #endif
464 default:
465 if (m)
466 m_freem(m);
467 return EAFNOSUPPORT;
468 }
469
470 /* template pointer almost has no meaning */
471 m_freem(m->m_next);
472 m->m_next = 0;
473 m->m_len = hlen + sizeof(struct tcphdr);
474 if ((m->m_flags & M_PKTHDR) == 0) {
475 printf("non PKTHDR to tcp_respond\n");
476 m_freem(m);
477 return EINVAL;
478 }
479
480 tlen = 0;
481 #define xchg(a,b,type) { type t; t=a; a=b; b=t; }
482 switch (family) {
483 case AF_INET:
484 ip = mtod(m, struct ip *);
485 th = (struct tcphdr *)(ip + 1);
486 xchg(ip->ip_dst, ip->ip_src, struct in_addr);
487 ip->ip_p = IPPROTO_TCP;
488 break;
489 #ifdef INET6
490 case AF_INET6:
491 ip6 = mtod(m, struct ip6_hdr *);
492 th = (struct tcphdr *)(ip6 + 1);
493 xchg(ip6->ip6_dst, ip6->ip6_src, struct in6_addr);
494 ip6->ip6_nxt = IPPROTO_TCP;
495 break;
496 #endif
497 }
498 *th = *th0;
499 xchg(th->th_dport, th->th_sport, u_int16_t);
500 #undef xchg
501 }
502 th->th_seq = htonl(seq);
503 th->th_ack = htonl(ack);
504 th->th_x2 = 0;
505 if ((flags & TH_SYN) == 0) {
506 if (tp)
507 th->th_win = htons((u_int16_t) (win >> tp->rcv_scale));
508 else
509 th->th_win = htons((u_int16_t)win);
510 th->th_off = sizeof (struct tcphdr) >> 2;
511 tlen += sizeof (struct tcphdr);
512 } else
513 tlen += th->th_off << 2;
514 m->m_len = hlen + tlen;
515 m->m_pkthdr.len = hlen + tlen;
516 m->m_pkthdr.rcvif = (struct ifnet *) 0;
517 th->th_flags = flags;
518 th->th_urp = 0;
519
520 switch (family) {
521 case AF_INET:
522 {
523 struct ipovly *ipov = (struct ipovly *)ip;
524 bzero(ipov->ih_x1, sizeof ipov->ih_x1);
525 ipov->ih_len = htons((u_int16_t)tlen);
526
527 th->th_sum = 0;
528 th->th_sum = in_cksum(m, hlen + tlen);
529 ip->ip_len = hlen + tlen; /*will be flipped on output*/
530 ip->ip_ttl = ip_defttl;
531 break;
532 }
533 #ifdef INET6
534 case AF_INET6:
535 {
536 th->th_sum = 0;
537 th->th_sum = in6_cksum(m, IPPROTO_TCP, sizeof(struct ip6_hdr),
538 tlen);
539 ip6->ip6_plen = ntohs(tlen);
540 ip6->ip6_hlim = ip6_defhlim;
541 ip6->ip6_flow &= ~IPV6_FLOWINFO_MASK;
542 if (ip6_auto_flowlabel) {
543 ip6->ip6_flow |=
544 (htonl(ip6_flow_seq++) & IPV6_FLOWLABEL_MASK);
545 }
546 break;
547 }
548 #endif
549 }
550
551 #ifdef IPSEC
552 m->m_pkthdr.rcvif = NULL;
553 #endif /*IPSEC*/
554
555 /*
556 * If we're doing Path MTU discovery, we need to set DF unless
557 * the route's MTU is locked. If we lack a route, we need to
558 * look it up now.
559 *
560 * ip_output() could do this for us, but it's convenient to just
561 * do it here unconditionally.
562 */
563 if (tp != NULL && tp->t_inpcb != NULL) {
564 ro = &tp->t_inpcb->inp_route;
565 #ifdef IPSEC
566 m->m_pkthdr.rcvif = (struct ifnet *)tp->t_inpcb->inp_socket;
567 #endif
568 #ifdef DIAGNOSTIC
569 if (family != AF_INET)
570 panic("tcp_respond: address family mismatch");
571 if (!in_hosteq(ip->ip_dst, tp->t_inpcb->inp_faddr)) {
572 panic("tcp_respond: ip_dst %x != inp_faddr %x",
573 ntohl(ip->ip_dst.s_addr),
574 ntohl(tp->t_inpcb->inp_faddr.s_addr));
575 }
576 #endif
577 }
578 #ifdef INET6
579 else if (tp != NULL && tp->t_in6pcb != NULL) {
580 ro = (struct route *)&tp->t_in6pcb->in6p_route;
581 #ifdef IPSEC
582 m->m_pkthdr.rcvif = (struct ifnet *)tp->t_in6pcb->in6p_socket;
583 #endif
584 #ifdef DIAGNOSTIC
585 if (family == AF_INET) {
586 if (!IN6_IS_ADDR_V4MAPPED(&tp->t_in6pcb->in6p_faddr))
587 panic("tcp_respond: not mapped addr");
588 if (bcmp(&ip->ip_dst,
589 &tp->t_in6pcb->in6p_faddr.s6_addr32[3],
590 sizeof(ip->ip_dst)) != 0) {
591 panic("tcp_respond: ip_dst != in6p_faddr");
592 }
593 } else if (family == AF_INET6) {
594 if (!IN6_ARE_ADDR_EQUAL(&ip6->ip6_dst, &tp->t_in6pcb->in6p_faddr))
595 panic("tcp_respond: ip6_dst != in6p_faddr");
596 } else
597 panic("tcp_respond: address family mismatch");
598 #endif
599 }
600 #endif
601 else {
602 ro = (struct route *)&iproute;
603 bzero(ro, sizeof(iproute));
604 }
605 if ((rt = ro->ro_rt) == NULL || (rt->rt_flags & RTF_UP) == 0) {
606 if (ro->ro_rt != NULL) {
607 RTFREE(ro->ro_rt);
608 ro->ro_rt = NULL;
609 }
610 switch (family) {
611 case AF_INET:
612 {
613 struct sockaddr_in *dst;
614 dst = satosin(&ro->ro_dst);
615 dst->sin_family = AF_INET;
616 dst->sin_len = sizeof(*dst);
617 dst->sin_addr = ip->ip_dst;
618 break;
619 }
620 #ifdef INET6
621 case AF_INET6:
622 {
623 struct sockaddr_in6 *dst;
624 dst = satosin6(&ro->ro_dst);
625 bzero(dst, sizeof(*dst));
626 dst->sin6_family = AF_INET6;
627 dst->sin6_len = sizeof(*dst);
628 dst->sin6_addr = ip6->ip6_dst;
629 break;
630 }
631 #endif
632 }
633 rtalloc(ro);
634 if ((rt = ro->ro_rt) == NULL) {
635 m_freem(m);
636 switch (family) {
637 case AF_INET:
638 ipstat.ips_noroute++;
639 break;
640 #ifdef INET6
641 case AF_INET6:
642 ip6stat.ip6s_noroute++;
643 break;
644 #endif
645 }
646 return (EHOSTUNREACH);
647 }
648 }
649 switch (family) {
650 case AF_INET:
651 if (ip_mtudisc != 0 && (rt->rt_rmx.rmx_locks & RTV_MTU) == 0)
652 ip->ip_off |= IP_DF;
653
654 error = ip_output(m, NULL, ro, 0, NULL);
655 break;
656 #ifdef INET6
657 case AF_INET6:
658 error = ip6_output(m, NULL, (struct route_in6 *)ro, 0, NULL);
659 break;
660 #endif
661 default:
662 error = EAFNOSUPPORT;
663 break;
664 }
665
666 if (ro == (struct route *)&iproute) {
667 RTFREE(ro->ro_rt);
668 ro->ro_rt = NULL;
669 }
670
671 return (error);
672 }
673
674 /*
675 * Create a new TCP control block, making an
676 * empty reassembly queue and hooking it to the argument
677 * protocol control block.
678 */
679 struct tcpcb *
680 tcp_newtcpcb(family, aux)
681 int family; /* selects inpcb, or in6pcb */
682 void *aux;
683 {
684 register struct tcpcb *tp;
685
686 switch (family) {
687 case PF_INET:
688 break;
689 #ifdef INET6
690 case PF_INET6:
691 break;
692 #endif
693 default:
694 return NULL;
695 }
696
697 tp = pool_get(&tcpcb_pool, PR_NOWAIT);
698 if (tp == NULL)
699 return (NULL);
700 bzero((caddr_t)tp, sizeof(struct tcpcb));
701 LIST_INIT(&tp->segq);
702 LIST_INIT(&tp->timeq);
703 tp->t_family = family; /* may be overridden later on */
704 tp->t_peermss = tcp_mssdflt;
705 tp->t_ourmss = tcp_mssdflt;
706 tp->t_segsz = tcp_mssdflt;
707 LIST_INIT(&tp->t_sc);
708
709 tp->t_flags = 0;
710 if (tcp_do_rfc1323 && tcp_do_win_scale)
711 tp->t_flags |= TF_REQ_SCALE;
712 if (tcp_do_rfc1323 && tcp_do_timestamps)
713 tp->t_flags |= TF_REQ_TSTMP;
714 if (tcp_do_sack == 2)
715 tp->t_flags |= TF_WILL_SACK;
716 else if (tcp_do_sack == 1)
717 tp->t_flags |= TF_WILL_SACK|TF_IGNR_RXSACK;
718 tp->t_flags |= TF_CANT_TXSACK;
719 switch (family) {
720 case PF_INET:
721 tp->t_inpcb = (struct inpcb *)aux;
722 break;
723 #ifdef INET6
724 case PF_INET6:
725 tp->t_in6pcb = (struct in6pcb *)aux;
726 break;
727 #endif
728 }
729 /*
730 * Init srtt to TCPTV_SRTTBASE (0), so we can tell that we have no
731 * rtt estimate. Set rttvar so that srtt + 2 * rttvar gives
732 * reasonable initial retransmit time.
733 */
734 tp->t_srtt = TCPTV_SRTTBASE;
735 tp->t_rttvar = tcp_rttdflt * PR_SLOWHZ << (TCP_RTTVAR_SHIFT + 2 - 1);
736 tp->t_rttmin = TCPTV_MIN;
737 TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp),
738 TCPTV_MIN, TCPTV_REXMTMAX);
739 tp->snd_cwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT;
740 tp->snd_ssthresh = TCP_MAXWIN << TCP_MAX_WINSHIFT;
741 if (family == AF_INET) {
742 struct inpcb *inp = (struct inpcb *)aux;
743 inp->inp_ip.ip_ttl = ip_defttl;
744 inp->inp_ppcb = (caddr_t)tp;
745 }
746 #ifdef INET6
747 else if (family == AF_INET6) {
748 struct in6pcb *in6p = (struct in6pcb *)aux;
749 in6p->in6p_ip6.ip6_hlim = ip6_defhlim;
750 in6p->in6p_ppcb = (caddr_t)tp;
751 }
752 #endif
753 return (tp);
754 }
755
756 /*
757 * Drop a TCP connection, reporting
758 * the specified error. If connection is synchronized,
759 * then send a RST to peer.
760 */
761 struct tcpcb *
762 tcp_drop(tp, errno)
763 register struct tcpcb *tp;
764 int errno;
765 {
766 struct socket *so;
767
768 if (tp->t_inpcb)
769 so = tp->t_inpcb->inp_socket;
770 #ifdef INET6
771 else if (tp->t_in6pcb)
772 so = tp->t_in6pcb->in6p_socket;
773 #endif
774 else
775 return NULL;
776
777 if (TCPS_HAVERCVDSYN(tp->t_state)) {
778 tp->t_state = TCPS_CLOSED;
779 (void) tcp_output(tp);
780 tcpstat.tcps_drops++;
781 } else
782 tcpstat.tcps_conndrops++;
783 if (errno == ETIMEDOUT && tp->t_softerror)
784 errno = tp->t_softerror;
785 so->so_error = errno;
786 return (tcp_close(tp));
787 }
788
789 /*
790 * Close a TCP control block:
791 * discard all space held by the tcp
792 * discard internet protocol block
793 * wake up any sleepers
794 */
795 struct tcpcb *
796 tcp_close(tp)
797 register struct tcpcb *tp;
798 {
799 struct inpcb *inp;
800 #ifdef INET6
801 struct in6pcb *in6p;
802 #endif
803 struct socket *so;
804 #ifdef RTV_RTT
805 register struct rtentry *rt;
806 #endif
807 struct route *ro;
808
809 inp = tp->t_inpcb;
810 #ifdef INET6
811 in6p = tp->t_in6pcb;
812 #endif
813 so = NULL;
814 ro = NULL;
815 if (inp) {
816 so = inp->inp_socket;
817 ro = &inp->inp_route;
818 }
819 #ifdef INET6
820 else if (in6p) {
821 so = in6p->in6p_socket;
822 ro = (struct route *)&in6p->in6p_route;
823 }
824 #endif
825
826 #ifdef RTV_RTT
827 /*
828 * If we sent enough data to get some meaningful characteristics,
829 * save them in the routing entry. 'Enough' is arbitrarily
830 * defined as the sendpipesize (default 4K) * 16. This would
831 * give us 16 rtt samples assuming we only get one sample per
832 * window (the usual case on a long haul net). 16 samples is
833 * enough for the srtt filter to converge to within 5% of the correct
834 * value; fewer samples and we could save a very bogus rtt.
835 *
836 * Don't update the default route's characteristics and don't
837 * update anything that the user "locked".
838 */
839 if (SEQ_LT(tp->iss + so->so_snd.sb_hiwat * 16, tp->snd_max) &&
840 ro && (rt = ro->ro_rt) &&
841 !in_nullhost(satosin(rt_key(rt))->sin_addr)) {
842 register u_long i = 0;
843
844 if ((rt->rt_rmx.rmx_locks & RTV_RTT) == 0) {
845 i = tp->t_srtt *
846 ((RTM_RTTUNIT / PR_SLOWHZ) >> (TCP_RTT_SHIFT + 2));
847 if (rt->rt_rmx.rmx_rtt && i)
848 /*
849 * filter this update to half the old & half
850 * the new values, converting scale.
851 * See route.h and tcp_var.h for a
852 * description of the scaling constants.
853 */
854 rt->rt_rmx.rmx_rtt =
855 (rt->rt_rmx.rmx_rtt + i) / 2;
856 else
857 rt->rt_rmx.rmx_rtt = i;
858 }
859 if ((rt->rt_rmx.rmx_locks & RTV_RTTVAR) == 0) {
860 i = tp->t_rttvar *
861 ((RTM_RTTUNIT / PR_SLOWHZ) >> (TCP_RTTVAR_SHIFT + 2));
862 if (rt->rt_rmx.rmx_rttvar && i)
863 rt->rt_rmx.rmx_rttvar =
864 (rt->rt_rmx.rmx_rttvar + i) / 2;
865 else
866 rt->rt_rmx.rmx_rttvar = i;
867 }
868 /*
869 * update the pipelimit (ssthresh) if it has been updated
870 * already or if a pipesize was specified & the threshhold
871 * got below half the pipesize. I.e., wait for bad news
872 * before we start updating, then update on both good
873 * and bad news.
874 */
875 if (((rt->rt_rmx.rmx_locks & RTV_SSTHRESH) == 0 &&
876 (i = tp->snd_ssthresh) && rt->rt_rmx.rmx_ssthresh) ||
877 i < (rt->rt_rmx.rmx_sendpipe / 2)) {
878 /*
879 * convert the limit from user data bytes to
880 * packets then to packet data bytes.
881 */
882 i = (i + tp->t_segsz / 2) / tp->t_segsz;
883 if (i < 2)
884 i = 2;
885 i *= (u_long)(tp->t_segsz + sizeof (struct tcpiphdr));
886 if (rt->rt_rmx.rmx_ssthresh)
887 rt->rt_rmx.rmx_ssthresh =
888 (rt->rt_rmx.rmx_ssthresh + i) / 2;
889 else
890 rt->rt_rmx.rmx_ssthresh = i;
891 }
892 }
893 #endif /* RTV_RTT */
894 /* free the reassembly queue, if any */
895 TCP_REASS_LOCK(tp);
896 (void) tcp_freeq(tp);
897 TCP_REASS_UNLOCK(tp);
898
899 TCP_CLEAR_DELACK(tp);
900 syn_cache_cleanup(tp);
901
902 if (tp->t_template) {
903 m_free(tp->t_template);
904 tp->t_template = NULL;
905 }
906 pool_put(&tcpcb_pool, tp);
907 if (inp) {
908 inp->inp_ppcb = 0;
909 soisdisconnected(so);
910 in_pcbdetach(inp);
911 }
912 #ifdef INET6
913 else if (in6p) {
914 in6p->in6p_ppcb = 0;
915 soisdisconnected(so);
916 in6_pcbdetach(in6p);
917 }
918 #endif
919 tcpstat.tcps_closed++;
920 return ((struct tcpcb *)0);
921 }
922
923 int
924 tcp_freeq(tp)
925 struct tcpcb *tp;
926 {
927 register struct ipqent *qe;
928 int rv = 0;
929 #ifdef TCPREASS_DEBUG
930 int i = 0;
931 #endif
932
933 TCP_REASS_LOCK_CHECK(tp);
934
935 while ((qe = tp->segq.lh_first) != NULL) {
936 #ifdef TCPREASS_DEBUG
937 printf("tcp_freeq[%p,%d]: %u:%u(%u) 0x%02x\n",
938 tp, i++, qe->ipqe_seq, qe->ipqe_seq + qe->ipqe_len,
939 qe->ipqe_len, qe->ipqe_flags & (TH_SYN|TH_FIN|TH_RST));
940 #endif
941 LIST_REMOVE(qe, ipqe_q);
942 LIST_REMOVE(qe, ipqe_timeq);
943 m_freem(qe->ipqe_m);
944 pool_put(&ipqent_pool, qe);
945 rv = 1;
946 }
947 return (rv);
948 }
949
950 /*
951 * Protocol drain routine. Called when memory is in short supply.
952 */
953 void
954 tcp_drain()
955 {
956 register struct inpcb *inp;
957 register struct tcpcb *tp;
958
959 /*
960 * Free the sequence queue of all TCP connections.
961 */
962 inp = tcbtable.inpt_queue.cqh_first;
963 if (inp) /* XXX */
964 for (; inp != (struct inpcb *)&tcbtable.inpt_queue;
965 inp = inp->inp_queue.cqe_next) {
966 if ((tp = intotcpcb(inp)) != NULL) {
967 /*
968 * We may be called from a device's interrupt
969 * context. If the tcpcb is already busy,
970 * just bail out now.
971 */
972 if (tcp_reass_lock_try(tp) == 0)
973 continue;
974 if (tcp_freeq(tp))
975 tcpstat.tcps_connsdrained++;
976 TCP_REASS_UNLOCK(tp);
977 }
978 }
979 }
980
981 /*
982 * Notify a tcp user of an asynchronous error;
983 * store error as soft error, but wake up user
984 * (for now, won't do anything until can select for soft error).
985 */
986 void
987 tcp_notify(inp, error)
988 struct inpcb *inp;
989 int error;
990 {
991 register struct tcpcb *tp = (struct tcpcb *)inp->inp_ppcb;
992 register struct socket *so = inp->inp_socket;
993
994 /*
995 * Ignore some errors if we are hooked up.
996 * If connection hasn't completed, has retransmitted several times,
997 * and receives a second error, give up now. This is better
998 * than waiting a long time to establish a connection that
999 * can never complete.
1000 */
1001 if (tp->t_state == TCPS_ESTABLISHED &&
1002 (error == EHOSTUNREACH || error == ENETUNREACH ||
1003 error == EHOSTDOWN)) {
1004 return;
1005 } else if (TCPS_HAVEESTABLISHED(tp->t_state) == 0 &&
1006 tp->t_rxtshift > 3 && tp->t_softerror)
1007 so->so_error = error;
1008 else
1009 tp->t_softerror = error;
1010 wakeup((caddr_t) &so->so_timeo);
1011 sorwakeup(so);
1012 sowwakeup(so);
1013 }
1014
#if defined(INET6) && !defined(TCP6)
/*
 * IPv6 counterpart of tcp_notify(): record an asynchronous error on
 * the connection behind in6p and wake up the socket's sleepers.
 */
void
tcp6_notify(in6p, error)
	struct in6pcb *in6p;
	int error;
{
	register struct tcpcb *cb = (struct tcpcb *)in6p->in6p_ppcb;
	register struct socket *sock = in6p->in6p_socket;

	/* Reachability errors are ignored on an established connection. */
	if (cb->t_state == TCPS_ESTABLISHED) {
		switch (error) {
		case EHOSTUNREACH:
		case ENETUNREACH:
		case EHOSTDOWN:
			return;
		default:
			break;
		}
	}

	/*
	 * If connection hasn't completed, has retransmitted several times,
	 * and receives a second error, give up now.  This is better
	 * than waiting a long time to establish a connection that
	 * can never complete.  Otherwise just remember the error.
	 */
	if (!TCPS_HAVEESTABLISHED(cb->t_state) &&
	    cb->t_rxtshift > 3 && cb->t_softerror)
		sock->so_error = error;
	else
		cb->t_softerror = error;

	/* Wake up anyone sleeping on the socket. */
	wakeup((caddr_t) &sock->so_timeo);
	sorwakeup(sock);
	sowwakeup(sock);
}
#endif
1045
#if defined(INET6) && !defined(TCP6)
/*
 * Control input for TCP over IPv6: dispatch a PRC_* command concerning
 * the destination `sa' to the matching IPv6 TCP PCBs.
 *
 * When ip6 is non-NULL, m and off describe the offending packet (off
 * is the offset of its TCP header within m) and only the connection
 * named by that header is notified; failing a match, unreachability
 * errors are passed on to the SYN cache.  When ip6 is NULL, m and off
 * are not looked at and every PCB talking to `sa' is notified.
 */
void
tcp6_ctlinput(cmd, sa, ip6, m, off)
	int cmd;
	struct sockaddr *sa;
	register struct ip6_hdr *ip6;
	struct mbuf *m;
	int off;
{
	register struct tcphdr *thp;
	struct tcphdr th;
	void (*notify) __P((struct in6pcb *, int)) = tcp6_notify;
	int nmatch;
	extern struct in6_addr zeroin6_addr;	/* netinet6/in6_pcb.c */
	struct sockaddr_in6 sa6;

	if (sa->sa_family != AF_INET6 ||
	    sa->sa_len != sizeof(struct sockaddr_in6))
		return;
	if (cmd == PRC_QUENCH)
		notify = tcp6_quench;
	else if (cmd == PRC_MSGSIZE)
		notify = tcp6_mtudisc;
	else if (!PRC_IS_REDIRECT(cmd) &&
	    /* same bound as tcp_ctlinput(): cmd is used as a table index */
	    ((unsigned)cmd >= PRC_NCMDS || inet6ctlerrmap[cmd] == 0))
		return;

	/*
	 * translate addresses into internal form
	 *
	 * The receiving interface is taken from m, which is only
	 * meaningful when ip6 is non-NULL, so test ip6 before touching m.
	 */
	sa6 = *(struct sockaddr_in6 *)sa;
	if (ip6 != NULL && m != NULL && m->m_pkthdr.rcvif != NULL &&
	    IN6_IS_ADDR_LINKLOCAL(&sa6.sin6_addr))
		sa6.sin6_addr.s6_addr16[1] = htons(m->m_pkthdr.rcvif->if_index);

	if (ip6) {
		/*
		 * XXX: We assume that when ip6 is non NULL,
		 * M and OFF are valid.
		 */
		struct in6_addr s;

		/* translate addresses into internal form */
		memcpy(&s, &ip6->ip6_src, sizeof(s));
		if (IN6_IS_ADDR_LINKLOCAL(&s) && m->m_pkthdr.rcvif != NULL)
			s.s6_addr16[1] = htons(m->m_pkthdr.rcvif->if_index);

		/* check if we can safely examine src and dst ports */
		if (m->m_pkthdr.len < off + sizeof(th))
			return;

		if (m->m_len < off + sizeof(th)) {
			/*
			 * this should be rare case,
			 * so we compromise on this copy...
			 */
			m_copydata(m, off, sizeof(th), (caddr_t)&th);
			thp = &th;
		} else
			thp = (struct tcphdr *)(mtod(m, caddr_t) + off);
		nmatch = in6_pcbnotify(&tcb6, (struct sockaddr *)&sa6,
		    thp->th_dport, &s, thp->th_sport, cmd, notify);
		/* No PCB matched: the packet may belong to a SYN cache entry. */
		if (nmatch == 0 && syn_cache_count &&
		    (inet6ctlerrmap[cmd] == EHOSTUNREACH ||
		     inet6ctlerrmap[cmd] == ENETUNREACH ||
		     inet6ctlerrmap[cmd] == EHOSTDOWN)) {
			struct sockaddr_in6 sin6;

			bzero(&sin6, sizeof(sin6));
			sin6.sin6_len = sizeof(sin6);
			sin6.sin6_family = AF_INET6;
			sin6.sin6_port = thp->th_sport;
			sin6.sin6_addr = s;
			syn_cache_unreach((struct sockaddr *)&sin6, sa, thp);
		}
	} else {
		(void) in6_pcbnotify(&tcb6, (struct sockaddr *)&sa6, 0,
		    &zeroin6_addr, 0, cmd, notify);
	}
}
#endif
1119
1120 /* assumes that ip header and tcp header are contiguous on mbuf */
1121 void *
1122 tcp_ctlinput(cmd, sa, v)
1123 int cmd;
1124 struct sockaddr *sa;
1125 register void *v;
1126 {
1127 register struct ip *ip = v;
1128 register struct tcphdr *th;
1129 extern int inetctlerrmap[];
1130 void (*notify) __P((struct inpcb *, int)) = tcp_notify;
1131 int errno;
1132 int nmatch;
1133
1134 if (sa->sa_family != AF_INET ||
1135 sa->sa_len != sizeof(struct sockaddr_in))
1136 return NULL;
1137 if ((unsigned)cmd >= PRC_NCMDS)
1138 return NULL;
1139 errno = inetctlerrmap[cmd];
1140 if (cmd == PRC_QUENCH)
1141 notify = tcp_quench;
1142 else if (PRC_IS_REDIRECT(cmd))
1143 notify = in_rtchange, ip = 0;
1144 else if (cmd == PRC_MSGSIZE && ip_mtudisc)
1145 notify = tcp_mtudisc, ip = 0;
1146 else if (cmd == PRC_HOSTDEAD)
1147 ip = 0;
1148 else if (errno == 0)
1149 return NULL;
1150 if (ip && ip->ip_v == 4 && sa->sa_family == AF_INET) {
1151 th = (struct tcphdr *)((caddr_t)ip + (ip->ip_hl << 2));
1152 nmatch = in_pcbnotify(&tcbtable, satosin(sa)->sin_addr,
1153 th->th_dport, ip->ip_src, th->th_sport, errno, notify);
1154 if (nmatch == 0 && syn_cache_count &&
1155 (inetctlerrmap[cmd] == EHOSTUNREACH ||
1156 inetctlerrmap[cmd] == ENETUNREACH ||
1157 inetctlerrmap[cmd] == EHOSTDOWN)) {
1158 struct sockaddr_in sin;
1159 bzero(&sin, sizeof(sin));
1160 sin.sin_len = sizeof(sin);
1161 sin.sin_family = AF_INET;
1162 sin.sin_port = th->th_sport;
1163 sin.sin_addr = ip->ip_src;
1164 syn_cache_unreach((struct sockaddr *)&sin, sa, th);
1165 }
1166
1167 /* XXX mapped address case */
1168 }
1169 else {
1170 (void)in_pcbnotifyall(&tcbtable, satosin(sa)->sin_addr, errno,
1171 notify);
1172 }
1173 return NULL;
1174 }
1175
1176 /*
1177 * When a source quence is received, we are being notifed of congestion.
1178 * Close the congestion window down to the Loss Window (one segment).
1179 * We will gradually open it again as we proceed.
1180 */
1181 void
1182 tcp_quench(inp, errno)
1183 struct inpcb *inp;
1184 int errno;
1185 {
1186 struct tcpcb *tp = intotcpcb(inp);
1187
1188 if (tp)
1189 tp->snd_cwnd = tp->t_segsz;
1190 }
1191
1192 #if defined(INET6) && !defined(TCP6)
1193 void
1194 tcp6_quench(in6p, errno)
1195 struct in6pcb *in6p;
1196 int errno;
1197 {
1198 struct tcpcb *tp = in6totcpcb(in6p);
1199
1200 if (tp)
1201 tp->snd_cwnd = tp->t_segsz;
1202 }
1203 #endif
1204
1205 /*
1206 * On receipt of path MTU corrections, flush old route and replace it
1207 * with the new one. Retransmit all unacknowledged packets, to ensure
1208 * that all packets will be received.
1209 */
1210 void
1211 tcp_mtudisc(inp, errno)
1212 struct inpcb *inp;
1213 int errno;
1214 {
1215 struct tcpcb *tp = intotcpcb(inp);
1216 struct rtentry *rt = in_pcbrtentry(inp);
1217
1218 if (tp != 0) {
1219 if (rt != 0) {
1220 /*
1221 * If this was not a host route, remove and realloc.
1222 */
1223 if ((rt->rt_flags & RTF_HOST) == 0) {
1224 in_rtchange(inp, errno);
1225 if ((rt = in_pcbrtentry(inp)) == 0)
1226 return;
1227 }
1228
1229 /*
1230 * Slow start out of the error condition. We
1231 * use the MTU because we know it's smaller
1232 * than the previously transmitted segment.
1233 *
1234 * Note: This is more conservative than the
1235 * suggestion in draft-floyd-incr-init-win-03.
1236 */
1237 if (rt->rt_rmx.rmx_mtu != 0)
1238 tp->snd_cwnd =
1239 TCP_INITIAL_WINDOW(tcp_init_win,
1240 rt->rt_rmx.rmx_mtu);
1241 }
1242
1243 /*
1244 * Resend unacknowledged packets.
1245 */
1246 tp->snd_nxt = tp->snd_una;
1247 tcp_output(tp);
1248 }
1249 }
1250
1251 #if defined(INET6) && !defined(TCP6)
1252 void
1253 tcp6_mtudisc(in6p, errno)
1254 struct in6pcb *in6p;
1255 int errno;
1256 {
1257 struct tcpcb *tp = in6totcpcb(in6p);
1258 struct rtentry *rt = in6_pcbrtentry(in6p);
1259
1260 if (tp != 0) {
1261 if (rt != 0) {
1262 /*
1263 * If this was not a host route, remove and realloc.
1264 */
1265 if ((rt->rt_flags & RTF_HOST) == 0) {
1266 in6_rtchange(in6p, errno);
1267 if ((rt = in6_pcbrtentry(in6p)) == 0)
1268 return;
1269 }
1270
1271 /*
1272 * Slow start out of the error condition. We
1273 * use the MTU because we know it's smaller
1274 * than the previously transmitted segment.
1275 *
1276 * Note: This is more conservative than the
1277 * suggestion in draft-floyd-incr-init-win-03.
1278 */
1279 if (rt->rt_rmx.rmx_mtu != 0)
1280 tp->snd_cwnd =
1281 TCP_INITIAL_WINDOW(tcp_init_win,
1282 rt->rt_rmx.rmx_mtu);
1283 }
1284
1285 /*
1286 * Resend unacknowledged packets.
1287 */
1288 tp->snd_nxt = tp->snd_una;
1289 tcp_output(tp);
1290 }
1291 }
1292 #endif
1293
1294 /*
1295 * Compute the MSS to advertise to the peer. Called only during
1296 * the 3-way handshake. If we are the server (peer initiated
1297 * connection), we are called with a pointer to the interface
1298 * on which the SYN packet arrived. If we are the client (we
1299 * initiated connection), we are called with a pointer to the
1300 * interface out which this connection should go.
1301 *
1302 * NOTE: Do not subtract IP option/extension header size nor IPsec
1303 * header size from MSS advertisement. MSS option must hold the maximum
1304 * segment size we can accept, so it must always be:
1305 * max(if mtu) - ip header - tcp header
1306 */
1307 u_long
1308 tcp_mss_to_advertise(ifp, af)
1309 const struct ifnet *ifp;
1310 int af;
1311 {
1312 extern u_long in_maxmtu;
1313 u_long mss = 0;
1314 u_long hdrsiz;
1315
1316 /*
1317 * In order to avoid defeating path MTU discovery on the peer,
1318 * we advertise the max MTU of all attached networks as our MSS,
1319 * per RFC 1191, section 3.1.
1320 *
1321 * We provide the option to advertise just the MTU of
1322 * the interface on which we hope this connection will
1323 * be receiving. If we are responding to a SYN, we
1324 * will have a pretty good idea about this, but when
1325 * initiating a connection there is a bit more doubt.
1326 *
1327 * We also need to ensure that loopback has a large enough
1328 * MSS, as the loopback MTU is never included in in_maxmtu.
1329 */
1330
1331 if (ifp != NULL)
1332 mss = ifp->if_mtu;
1333
1334 if (tcp_mss_ifmtu == 0)
1335 mss = max(in_maxmtu, mss);
1336
1337 switch (af) {
1338 case AF_INET:
1339 hdrsiz = sizeof(struct ip);
1340 break;
1341 #ifdef INET6
1342 case AF_INET6:
1343 hdrsiz = sizeof(struct ip6_hdr);
1344 break;
1345 #endif
1346 default:
1347 hdrsiz = 0;
1348 break;
1349 }
1350 hdrsiz += sizeof(struct tcphdr);
1351 if (mss > hdrsiz)
1352 mss -= hdrsiz;
1353
1354 mss = max(tcp_mssdflt, mss);
1355 return (mss);
1356 }
1357
1358 /*
1359 * Set connection variables based on the peer's advertised MSS.
1360 * We are passed the TCPCB for the actual connection. If we
1361 * are the server, we are called by the compressed state engine
1362 * when the 3-way handshake is complete. If we are the client,
1363 * we are called when we recieve the SYN,ACK from the server.
1364 *
1365 * NOTE: Our advertised MSS value must be initialized in the TCPCB
1366 * before this routine is called!
1367 */
1368 void
1369 tcp_mss_from_peer(tp, offer)
1370 struct tcpcb *tp;
1371 int offer;
1372 {
1373 struct socket *so;
1374 #if defined(RTV_SPIPE) || defined(RTV_SSTHRESH)
1375 struct rtentry *rt;
1376 #endif
1377 u_long bufsize;
1378 int mss;
1379
1380 so = NULL;
1381 rt = NULL;
1382 if (tp->t_inpcb) {
1383 so = tp->t_inpcb->inp_socket;
1384 #if defined(RTV_SPIPE) || defined(RTV_SSTHRESH)
1385 rt = in_pcbrtentry(tp->t_inpcb);
1386 #endif
1387 }
1388 #ifdef INET6
1389 else if (tp->t_in6pcb) {
1390 so = tp->t_in6pcb->in6p_socket;
1391 #if defined(RTV_SPIPE) || defined(RTV_SSTHRESH)
1392 #ifdef TCP6
1393 rt = NULL;
1394 #else
1395 rt = in6_pcbrtentry(tp->t_in6pcb);
1396 #endif
1397 #endif
1398 }
1399 #endif
1400
1401 /*
1402 * As per RFC1122, use the default MSS value, unless they
1403 * sent us an offer. Do not accept offers less than 32 bytes.
1404 */
1405 mss = tcp_mssdflt;
1406 if (offer)
1407 mss = offer;
1408 mss = max(mss, 32); /* sanity */
1409 tp->t_peermss = mss;
1410 mss -= tcp_optlen(tp);
1411 if (tp->t_inpcb)
1412 mss -= ip_optlen(tp->t_inpcb);
1413 #ifdef INET6
1414 else if (tp->t_in6pcb)
1415 mss -= ip6_optlen(tp->t_in6pcb);
1416 #endif
1417
1418 /*
1419 * If there's a pipesize, change the socket buffer to that size.
1420 * Make the socket buffer an integral number of MSS units. If
1421 * the MSS is larger than the socket buffer, artificially decrease
1422 * the MSS.
1423 */
1424 #ifdef RTV_SPIPE
1425 if (rt != NULL && rt->rt_rmx.rmx_sendpipe != 0)
1426 bufsize = rt->rt_rmx.rmx_sendpipe;
1427 else
1428 #endif
1429 bufsize = so->so_snd.sb_hiwat;
1430 if (bufsize < mss)
1431 mss = bufsize;
1432 else {
1433 bufsize = roundup(bufsize, mss);
1434 if (bufsize > sb_max)
1435 bufsize = sb_max;
1436 (void) sbreserve(&so->so_snd, bufsize);
1437 }
1438 tp->t_segsz = mss;
1439
1440 #ifdef RTV_SSTHRESH
1441 if (rt != NULL && rt->rt_rmx.rmx_ssthresh) {
1442 /*
1443 * There's some sort of gateway or interface buffer
1444 * limit on the path. Use this to set the slow
1445 * start threshold, but set the threshold to no less
1446 * than 2 * MSS.
1447 */
1448 tp->snd_ssthresh = max(2 * mss, rt->rt_rmx.rmx_ssthresh);
1449 }
1450 #endif
1451 }
1452
1453 /*
1454 * Processing necessary when a TCP connection is established.
1455 */
1456 void
1457 tcp_established(tp)
1458 struct tcpcb *tp;
1459 {
1460 struct socket *so;
1461 #ifdef RTV_RPIPE
1462 struct rtentry *rt;
1463 #endif
1464 u_long bufsize;
1465
1466 so = NULL;
1467 rt = NULL;
1468 if (tp->t_inpcb) {
1469 so = tp->t_inpcb->inp_socket;
1470 #if defined(RTV_RPIPE)
1471 rt = in_pcbrtentry(tp->t_inpcb);
1472 #endif
1473 }
1474 #ifdef INET6
1475 else if (tp->t_in6pcb) {
1476 so = tp->t_in6pcb->in6p_socket;
1477 #if defined(RTV_RPIPE)
1478 #ifdef TCP6
1479 rt = NULL;
1480 #else
1481 rt = in6_pcbrtentry(tp->t_in6pcb);
1482 #endif
1483 #endif
1484 }
1485 #endif
1486
1487 tp->t_state = TCPS_ESTABLISHED;
1488 TCP_TIMER_ARM(tp, TCPT_KEEP, tcp_keepidle);
1489
1490 #ifdef RTV_RPIPE
1491 if (rt != NULL && rt->rt_rmx.rmx_recvpipe != 0)
1492 bufsize = rt->rt_rmx.rmx_recvpipe;
1493 else
1494 #endif
1495 bufsize = so->so_rcv.sb_hiwat;
1496 if (bufsize > tp->t_ourmss) {
1497 bufsize = roundup(bufsize, tp->t_ourmss);
1498 if (bufsize > sb_max)
1499 bufsize = sb_max;
1500 (void) sbreserve(&so->so_rcv, bufsize);
1501 }
1502 }
1503
/*
 * Check if there's an initial rtt or rttvar.  Convert from the
 * route-table units to scaled multiples of the slow timeout timer.
 * Called only during the 3-way handshake.
 *
 * Does nothing unless RTV_RTT is defined, the connection has a route,
 * t_srtt is still zero and the route carries a non-zero rmx_rtt.
 */
void
tcp_rmx_rtt(tp)
	struct tcpcb *tp;
{
#ifdef RTV_RTT
	struct rtentry *rt = NULL;
	int rtt;

	/* Find the route through whichever PCB is attached. */
	if (tp->t_inpcb)
		rt = in_pcbrtentry(tp->t_inpcb);
#ifdef INET6
	else if (tp->t_in6pcb) {
#ifdef TCP6
		rt = NULL;
#else
		rt = in6_pcbrtentry(tp->t_in6pcb);
#endif
	}
#endif
	if (rt == NULL)
		return;

	/* Keep an existing estimate; only seed when none exists yet. */
	if (tp->t_srtt != 0)
		return;
	rtt = rt->rt_rmx.rmx_rtt;
	if (rtt == 0)
		return;

	/*
	 * XXX The lock bit for RTT indicates that the value
	 * is also a minimum value; this is subject to time.
	 */
	if (rt->rt_rmx.rmx_locks & RTV_RTT) {
		TCPT_RANGESET(tp->t_rttmin,
		    rtt / (RTM_RTTUNIT / PR_SLOWHZ),
		    TCPTV_MIN, TCPTV_REXMTMAX);
	}

	tp->t_srtt = rtt /
	    ((RTM_RTTUNIT / PR_SLOWHZ) >> (TCP_RTT_SHIFT + 2));

	if (rt->rt_rmx.rmx_rttvar != 0) {
		tp->t_rttvar = rt->rt_rmx.rmx_rttvar /
		    ((RTM_RTTUNIT / PR_SLOWHZ) >>
		    (TCP_RTTVAR_SHIFT + 2));
	} else {
		/* Default variation is +- 1 rtt */
		tp->t_rttvar =
		    tp->t_srtt >> (TCP_RTT_SHIFT - TCP_RTTVAR_SHIFT);
	}

	/* Derive the current retransmit timeout from the seeded values. */
	TCPT_RANGESET(tp->t_rxtcur,
	    ((tp->t_srtt >> 2) + tp->t_rttvar) >> (1 + 2),
	    tp->t_rttmin, TCPTV_REXMTMAX);
#endif
}
1557
1558 tcp_seq tcp_iss_seq = 0; /* tcp initial seq # */
1559
1560 /*
1561 * Get a new sequence value given a tcp control block
1562 */
1563 tcp_seq
1564 tcp_new_iss(tp, len, addin)
1565 void *tp;
1566 u_long len;
1567 tcp_seq addin;
1568 {
1569 tcp_seq tcp_iss;
1570
1571 /*
1572 * Randomize.
1573 */
1574 #if NRND > 0
1575 rnd_extract_data(&tcp_iss, sizeof(tcp_iss), RND_EXTRACT_ANY);
1576 #else
1577 tcp_iss = random();
1578 #endif
1579
1580 /*
1581 * If we were asked to add some amount to a known value,
1582 * we will take a random value obtained above, mask off the upper
1583 * bits, and add in the known value. We also add in a constant to
1584 * ensure that we are at least a certain distance from the original
1585 * value.
1586 *
1587 * This is used when an old connection is in timed wait
1588 * and we have a new one coming in, for instance.
1589 */
1590 if (addin != 0) {
1591 #ifdef TCPISS_DEBUG
1592 printf("Random %08x, ", tcp_iss);
1593 #endif
1594 tcp_iss &= TCP_ISS_RANDOM_MASK;
1595 tcp_iss += addin + TCP_ISSINCR;
1596 #ifdef TCPISS_DEBUG
1597 printf("Old ISS %08x, ISS %08x\n", addin, tcp_iss);
1598 #endif
1599 } else {
1600 tcp_iss &= TCP_ISS_RANDOM_MASK;
1601 tcp_iss += tcp_iss_seq;
1602 tcp_iss_seq += TCP_ISSINCR;
1603 #ifdef TCPISS_DEBUG
1604 printf("ISS %08x\n", tcp_iss);
1605 #endif
1606 }
1607
1608 if (tcp_compat_42) {
1609 /*
1610 * Limit it to the positive range for really old TCP
1611 * implementations.
1612 */
1613 if (tcp_iss >= 0x80000000)
1614 tcp_iss &= 0x7fffffff; /* XXX */
1615 }
1616
1617 return tcp_iss;
1618 }
1619
1620 #ifdef IPSEC
1621 /* compute ESP/AH header size for TCP, including outer IP header. */
1622 size_t
1623 ipsec4_hdrsiz_tcp(tp)
1624 struct tcpcb *tp;
1625 {
1626 struct inpcb *inp;
1627 size_t hdrsiz;
1628
1629 /* XXX mapped addr case (tp->t_in6pcb) */
1630 if (!tp || !tp->t_template || !(inp = tp->t_inpcb))
1631 return 0;
1632 switch (tp->t_family) {
1633 case AF_INET:
1634 hdrsiz = ipsec4_hdrsiz(tp->t_template, inp);
1635 break;
1636 default:
1637 hdrsiz = 0;
1638 break;
1639 }
1640
1641 return hdrsiz;
1642 }
1643
1644 #if defined(INET6) && !defined(TCP6)
1645 size_t
1646 ipsec6_hdrsiz_tcp(tp)
1647 struct tcpcb *tp;
1648 {
1649 struct in6pcb *in6p;
1650 size_t hdrsiz;
1651
1652 if (!tp || !tp->t_template || !(in6p = tp->t_in6pcb))
1653 return 0;
1654 switch (tp->t_family) {
1655 case AF_INET6:
1656 hdrsiz = ipsec6_hdrsiz(tp->t_template, in6p);
1657 break;
1658 case AF_INET:
1659 /* mapped address case - tricky */
1660 default:
1661 hdrsiz = 0;
1662 break;
1663 }
1664
1665 return hdrsiz;
1666 }
1667 #endif
1668 #endif /*IPSEC*/
1669
1670 /*
1671 * Determine the length of the TCP options for this connection.
1672 *
1673 * XXX: What do we do for SACK, when we add that? Just reserve
1674 * all of the space? Otherwise we can't exactly be incrementing
1675 * cwnd by an amount that varies depending on the amount we last
1676 * had to SACK!
1677 */
1678
1679 u_int
1680 tcp_optlen(tp)
1681 struct tcpcb *tp;
1682 {
1683 if ((tp->t_flags & (TF_REQ_TSTMP|TF_RCVD_TSTMP|TF_NOOPT)) ==
1684 (TF_REQ_TSTMP | TF_RCVD_TSTMP))
1685 return TCPOLEN_TSTAMP_APPA;
1686 else
1687 return 0;
1688 }
1689