tcp_input.c revision 1.116 1 /* $NetBSD: tcp_input.c,v 1.116 2000/07/27 11:34:06 itojun Exp $ */
2
3 /*
4 %%% portions-copyright-nrl-95
5 Portions of this software are Copyright 1995-1998 by Randall Atkinson,
6 Ronald Lee, Daniel McDonald, Bao Phan, and Chris Winters. All Rights
7 Reserved. All rights under this copyright have been assigned to the US
8 Naval Research Laboratory (NRL). The NRL Copyright Notice and License
9 Agreement Version 1.1 (January 17, 1995) applies to these portions of the
10 software.
11 You should have received a copy of the license with this software. If you
12 didn't get a copy, you may request one from <license (at) ipv6.nrl.navy.mil>.
13
14 */
15
16 /*
17 * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
18 * All rights reserved.
19 *
20 * Redistribution and use in source and binary forms, with or without
21 * modification, are permitted provided that the following conditions
22 * are met:
23 * 1. Redistributions of source code must retain the above copyright
24 * notice, this list of conditions and the following disclaimer.
25 * 2. Redistributions in binary form must reproduce the above copyright
26 * notice, this list of conditions and the following disclaimer in the
27 * documentation and/or other materials provided with the distribution.
28 * 3. Neither the name of the project nor the names of its contributors
29 * may be used to endorse or promote products derived from this software
30 * without specific prior written permission.
31 *
32 * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
33 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
34 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
35 * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
36 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
37 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
38 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
39 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
40 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
41 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
42 * SUCH DAMAGE.
43 */
44
45 /*-
46 * Copyright (c) 1997, 1998, 1999 The NetBSD Foundation, Inc.
47 * All rights reserved.
48 *
49 * This code is derived from software contributed to The NetBSD Foundation
50 * by Jason R. Thorpe and Kevin M. Lahey of the Numerical Aerospace Simulation
51 * Facility, NASA Ames Research Center.
52 *
53 * Redistribution and use in source and binary forms, with or without
54 * modification, are permitted provided that the following conditions
55 * are met:
56 * 1. Redistributions of source code must retain the above copyright
57 * notice, this list of conditions and the following disclaimer.
58 * 2. Redistributions in binary form must reproduce the above copyright
59 * notice, this list of conditions and the following disclaimer in the
60 * documentation and/or other materials provided with the distribution.
61 * 3. All advertising materials mentioning features or use of this software
62 * must display the following acknowledgement:
63 * This product includes software developed by the NetBSD
64 * Foundation, Inc. and its contributors.
65 * 4. Neither the name of The NetBSD Foundation nor the names of its
66 * contributors may be used to endorse or promote products derived
67 * from this software without specific prior written permission.
68 *
69 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
70 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
71 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
72 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
73 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
74 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
75 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
76 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
77 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
78 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
79 * POSSIBILITY OF SUCH DAMAGE.
80 */
81
82 /*
83 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995
84 * The Regents of the University of California. All rights reserved.
85 *
86 * Redistribution and use in source and binary forms, with or without
87 * modification, are permitted provided that the following conditions
88 * are met:
89 * 1. Redistributions of source code must retain the above copyright
90 * notice, this list of conditions and the following disclaimer.
91 * 2. Redistributions in binary form must reproduce the above copyright
92 * notice, this list of conditions and the following disclaimer in the
93 * documentation and/or other materials provided with the distribution.
94 * 3. All advertising materials mentioning features or use of this software
95 * must display the following acknowledgement:
96 * This product includes software developed by the University of
97 * California, Berkeley and its contributors.
98 * 4. Neither the name of the University nor the names of its contributors
99 * may be used to endorse or promote products derived from this software
100 * without specific prior written permission.
101 *
102 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
103 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
104 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
105 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
106 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
107 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
108 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
109 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
110 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
111 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
112 * SUCH DAMAGE.
113 *
114 * @(#)tcp_input.c 8.12 (Berkeley) 5/24/95
115 */
116
117 /*
118 * TODO list for SYN cache stuff:
119 *
120 * Find room for a "state" field, which is needed to keep a
121 * compressed state for TIME_WAIT TCBs. It's been noted already
122 * that this is fairly important for very high-volume web and
123 * mail servers, which use a large number of short-lived
124 * connections.
125 */
126
127 #include "opt_inet.h"
128 #include "opt_ipsec.h"
129
130 #include <sys/param.h>
131 #include <sys/systm.h>
132 #include <sys/malloc.h>
133 #include <sys/mbuf.h>
134 #include <sys/protosw.h>
135 #include <sys/socket.h>
136 #include <sys/socketvar.h>
137 #include <sys/errno.h>
138 #include <sys/syslog.h>
139 #include <sys/pool.h>
140 #include <sys/domain.h>
141
142 #include <net/if.h>
143 #include <net/route.h>
144 #include <net/if_types.h>
145
146 #include <netinet/in.h>
147 #include <netinet/in_systm.h>
148 #include <netinet/ip.h>
149 #include <netinet/in_pcb.h>
150 #include <netinet/ip_var.h>
151
152 #ifdef INET6
153 #ifndef INET
154 #include <netinet/in.h>
155 #endif
156 #include <netinet/ip6.h>
157 #include <netinet6/ip6_var.h>
158 #include <netinet6/in6_pcb.h>
159 #include <netinet6/ip6_var.h>
160 #include <netinet6/in6_var.h>
161 #include <netinet/icmp6.h>
162 #include <netinet6/nd6.h>
163 #endif
164
165 #ifdef PULLDOWN_TEST
166 #ifndef INET6
167 /* always need ip6.h for IP6_EXTHDR_GET */
168 #include <netinet/ip6.h>
169 #endif
170 #endif
171
172 #include <netinet/tcp.h>
173 #include <netinet/tcp_fsm.h>
174 #include <netinet/tcp_seq.h>
175 #include <netinet/tcp_timer.h>
176 #include <netinet/tcp_var.h>
177 #include <netinet/tcpip.h>
178 #include <netinet/tcp_debug.h>
179
180 #include <machine/stdarg.h>
181
182 #ifdef IPSEC
183 #include <netinet6/ipsec.h>
184 #include <netkey/key.h>
185 #endif /*IPSEC*/
186 #ifdef INET6
187 #include "faith.h"
188 #endif
189
/*
 * File-scope tunables and state for TCP input processing.
 */
/* NOTE(review): presumably the duplicate-ACK count that triggers fast
 * retransmit; it is not referenced in the visible part of this file. */
190 int tcprexmtthresh = 3;
/* When nonzero, tcp_input() logs SYN segments that match no PCB. */
191 int tcp_log_refused;
192
/* NOTE(review): the three variables below look like bookkeeping for
 * rate-limiting outgoing RSTs (see the dropwithreset_ratelim label used by
 * tcp_input()); their users are outside the visible part of the file. */
193 struct timeval tcp_rst_ratelim_last;
194 static int tcp_rst_ppslim_count = 0;
195 static struct timeval tcp_rst_ppslim_last;
196
/* 24 days (24 * 24 * 60 * 60 seconds) scaled by PR_SLOWHZ; assumes
 * PR_SLOWHZ is the slow-timeout tick rate per second -- TODO confirm. */
197 #define TCP_PAWS_IDLE (24 * 24 * 60 * 60 * PR_SLOWHZ)
198
199 /*
 * For modulo comparisons of timestamps: the difference is taken first and
 * then interpreted as a signed int, so the comparison still gives the
 * expected answer after the timestamp counter wraps around.
 */
200 #define TSTMP_LT(a,b) ((int)((a)-(b)) < 0)
201 #define TSTMP_GEQ(a,b) ((int)((a)-(b)) >= 0)
202
/*
 * Neighbor Discovery, Neighbor Unreachability Detection upper-layer hint.
 *
 * With INET6: when tp is a non-NULL TCP control block whose family is
 * AF_INET6 and whose IPv6 PCB holds a cached route, hand that route to
 * nd6_nud_hint().  Without INET6 the macro expands to nothing.
 *
 * The argument is parenthesized at every use so that any pointer-valued
 * expression (not just a plain identifier) can be passed safely.  Note
 * that the argument is evaluated more than once.
 */
#ifdef INET6
#define ND6_HINT(tp) \
do { \
	if ((tp) && (tp)->t_in6pcb && (tp)->t_family == AF_INET6 \
	    && (tp)->t_in6pcb->in6p_route.ro_rt) { \
		nd6_nud_hint((tp)->t_in6pcb->in6p_route.ro_rt, NULL, 0); \
	} \
} while (0)
#else
#define ND6_HINT(tp)
#endif
217
/*
 * Macro to compute ACK transmission behavior.  Delay the ACK unless
 * we have already delayed an ACK (must send an ACK every two segments).
 * We also ACK immediately if we received a PUSH and the ACK-on-PUSH
 * option (tcp_ack_on_push) is enabled.
 *
 *	tp	pointer to the connection's TCP control block
 *	th	pointer to the TCP header of the segment just received
 *
 * Sets TF_ACKNOW in tp->t_flags for an immediate ACK; otherwise calls
 * TCP_SET_DELACK(tp).  Both arguments are parenthesized at every use so
 * that expressions such as "&cb" work; tp is evaluated more than once.
 */
#define TCP_SETUP_ACK(tp, th) \
do { \
	if ((tp)->t_flags & TF_DELACK || \
	    (tcp_ack_on_push && (th)->th_flags & TH_PUSH)) \
		(tp)->t_flags |= TF_ACKNOW; \
	else \
		TCP_SET_DELACK(tp); \
} while (0)
232
/*
 * Rewrite, in place, the multi-byte TCP header fields that the input
 * path works with: th_win and th_urp go through NTOHS(), th_seq and
 * th_ack go through NTOHL().  The argument is evaluated once per field.
 */
#define TCP_FIELDS_TO_HOST(th) \
do { \
	/* 16-bit fields */ \
	NTOHS((th)->th_urp); \
	NTOHS((th)->th_win); \
	/* 32-bit fields */ \
	NTOHL((th)->th_ack); \
	NTOHL((th)->th_seq); \
} while (0)
243
/*
 * tcp_reass: queue an out-of-order TCP segment on the connection's
 * reassembly list and, when possible, hand in-sequence data to the socket.
 *
 *	tp	TCP control block owning the reassembly queues (segq, timeq)
 *	th	TCP header of the received segment, or 0 to skip queueing
 *		and only attempt delivery of already-queued data
 *	m	mbuf chain holding the segment data; when th != 0 it is
 *		always consumed (queued, concatenated onto a queued chain,
 *		or freed)
 *	tlen	pointer to the segment's data length; only read here
 *
 * The new segment is merged with any queued entries it touches or
 * overlaps, so that the entry finally inserted covers one contiguous
 * range.  Overlapping bytes are trimmed and counted in the tcpstat
 * duplicate counters.
 *
 * Returns the TH_FIN bit of the entry delivered to the socket, or 0 when
 * nothing was delivered (duplicate segment, allocation failure, connection
 * not yet established, or head of queue not at rcv_nxt).
 */
244 int
245 tcp_reass(tp, th, m, tlen)
246 struct tcpcb *tp;
247 struct tcphdr *th;
248 struct mbuf *m;
249 int *tlen;
250 {
/*
 * p:    queue entry after which the new entry will be inserted (NULL = head)
 * q/nq: current and next entry while walking tp->segq
 * tiqe: an entry unlinked during merging, kept for reuse as the new entry
 */
251 struct ipqent *p, *q, *nq, *tiqe = NULL;
252 struct socket *so = NULL;
253 int pkt_flags;
254 tcp_seq pkt_seq;
255 unsigned pkt_len;
256 u_long rcvpartdupbyte = 0;
257 u_long rcvoobyte;
258
/*
 * Find the socket through whichever PCB is attached.
 * NOTE(review): so stays NULL if neither PCB is set, and the "present"
 * path below dereferences it -- callers presumably guarantee a PCB.
 */
259 if (tp->t_inpcb)
260 so = tp->t_inpcb->inp_socket;
261 #ifdef INET6
262 else if (tp->t_in6pcb)
263 so = tp->t_in6pcb->in6p_socket;
264 #endif
265
/* NOTE(review): presumably asserts the reassembly lock is held -- confirm. */
266 TCP_REASS_LOCK_CHECK(tp);
267
268 /*
269 * Call with th==0 after becoming established to
270 * force pre-ESTABLISHED data up to user socket.
271 */
272 if (th == 0)
273 goto present;
274
/* Bytes credited as out-of-order; reduced below as overlaps are trimmed. */
275 rcvoobyte = *tlen;
276 /*
277 * Copy these to local variables because the tcpiphdr
278 * gets munged while we are collapsing mbufs.
279 */
280 pkt_seq = th->th_seq;
281 pkt_len = *tlen;
282 pkt_flags = th->th_flags;
283 /*
284 * Find a segment which begins after this one does.
 * nq is fetched first because q may be unlinked inside the loop body.
285 */
286 for (p = NULL, q = tp->segq.lh_first; q != NULL; q = nq) {
287 nq = q->ipqe_q.le_next;
288 /*
289 * If the received segment is just right after this
290 * fragment, merge the two together and then check
291 * for further overlaps.
292 */
293 if (q->ipqe_seq + q->ipqe_len == pkt_seq) {
294 #ifdef TCPREASS_DEBUG
295 printf("tcp_reass[%p]: concat %u:%u(%u) to %u:%u(%u)\n",
296 tp, pkt_seq, pkt_seq + pkt_len, pkt_len,
297 q->ipqe_seq, q->ipqe_seq + q->ipqe_len, q->ipqe_len);
298 #endif
299 pkt_len += q->ipqe_len;
300 pkt_flags |= q->ipqe_flags;
301 pkt_seq = q->ipqe_seq;
302 m_cat(q->ipqe_m, m);
303 m = q->ipqe_m;
304 goto free_ipqe;
305 }
306 /*
307 * If the received segment is completely past this
308 * fragment, we need to go to the next fragment.
309 */
310 if (SEQ_LT(q->ipqe_seq + q->ipqe_len, pkt_seq)) {
311 p = q;
312 continue;
313 }
314 /*
315 * If the fragment is past the received segment,
316 * it (or any following) can't be concatenated.
317 */
318 if (SEQ_GT(q->ipqe_seq, pkt_seq + pkt_len))
319 break;
320 /*
321 * We've received all the data in this segment before.
322 * Mark it as a duplicate and return.
323 */
324 if (SEQ_LEQ(q->ipqe_seq, pkt_seq) &&
325 SEQ_GEQ(q->ipqe_seq + q->ipqe_len, pkt_seq + pkt_len)) {
326 tcpstat.tcps_rcvduppack++;
327 tcpstat.tcps_rcvdupbyte += pkt_len;
328 m_freem(m);
/* Give back an entry that an earlier merge unlinked for reuse. */
329 if (tiqe != NULL)
330 pool_put(&ipqent_pool, tiqe);
331 return (0);
332 }
333 /*
334 * Received segment completely overlaps this fragment
335 * so we drop the fragment (this keeps the temporal
336 * ordering of segments correct).
337 */
338 if (SEQ_GEQ(q->ipqe_seq, pkt_seq) &&
339 SEQ_LEQ(q->ipqe_seq + q->ipqe_len, pkt_seq + pkt_len)) {
340 rcvpartdupbyte += q->ipqe_len;
341 m_freem(q->ipqe_m);
342 goto free_ipqe;
343 }
344 /*
345 * RX'ed segment starts inside the fragment and extends
346 * past its end. Drop the overlapping bytes from the
347 * front of the segment, merge the fragment and segment,
348 * then treat the result as a longer received packet.
349 */
350 if (SEQ_LT(q->ipqe_seq, pkt_seq)
351 && SEQ_GT(q->ipqe_seq + q->ipqe_len, pkt_seq)) {
352 int overlap = q->ipqe_seq + q->ipqe_len - pkt_seq;
353 #ifdef TCPREASS_DEBUG
354 printf("tcp_reass[%p]: trim starting %d bytes of %u:%u(%u)\n",
355 tp, overlap,
356 pkt_seq, pkt_seq + pkt_len, pkt_len);
357 #endif
358 m_adj(m, overlap);
359 rcvpartdupbyte += overlap;
360 m_cat(q->ipqe_m, m);
361 m = q->ipqe_m;
362 pkt_seq = q->ipqe_seq;
363 pkt_len += q->ipqe_len - overlap;
364 rcvoobyte -= overlap;
365 goto free_ipqe;
366 }
367 /*
368 * RX'ed segment begins before the fragment and its tail
369 * overlaps the fragment's front. Drop the overlapping
370 * bytes from the end of the received packet. The packet
371 * will then be concatenated with this fragment just below.
372 */
373 if (SEQ_GT(q->ipqe_seq, pkt_seq)
374 && SEQ_LT(q->ipqe_seq, pkt_seq + pkt_len)) {
375 int overlap = pkt_seq + pkt_len - q->ipqe_seq;
376 #ifdef TCPREASS_DEBUG
377 printf("tcp_reass[%p]: trim trailing %d bytes of %u:%u(%u)\n",
378 tp, overlap,
379 pkt_seq, pkt_seq + pkt_len, pkt_len);
380 #endif
381 m_adj(m, -overlap);
382 pkt_len -= overlap;
383 rcvpartdupbyte += overlap;
384 rcvoobyte -= overlap;
385 }
386 /*
387 * If the received segment immediately precedes this
388 * fragment then tack the fragment onto this segment
389 * and reinsert the data.
390 */
391 if (q->ipqe_seq == pkt_seq + pkt_len) {
392 #ifdef TCPREASS_DEBUG
393 printf("tcp_reass[%p]: append %u:%u(%u) to %u:%u(%u)\n",
394 tp, q->ipqe_seq, q->ipqe_seq + q->ipqe_len, q->ipqe_len,
395 pkt_seq, pkt_seq + pkt_len, pkt_len);
396 #endif
397 pkt_len += q->ipqe_len;
398 pkt_flags |= q->ipqe_flags;
399 m_cat(m, q->ipqe_m);
400 LIST_REMOVE(q, ipqe_q);
401 LIST_REMOVE(q, ipqe_timeq);
402 if (tiqe == NULL) {
403 tiqe = q;
404 } else {
405 pool_put(&ipqent_pool, q);
406 }
407 break;
408 }
409 /*
410 * If the fragment is before the segment, remember it.
411 * When this loop is terminated, p will contain the
412 * pointer to the fragment that is right before the
413 * received segment.
414 */
415 if (SEQ_LEQ(q->ipqe_seq, pkt_seq))
416 p = q;
417
418 continue;
419
420 /*
421 * This is a common operation: unlink q from both lists and
422 * keep the first such entry in tiqe for reuse, which avoids
 * an allocate/free of a queue entry in most instances.
423 */
424 free_ipqe:
425 LIST_REMOVE(q, ipqe_q);
426 LIST_REMOVE(q, ipqe_timeq);
427 if (tiqe == NULL) {
428 tiqe = q;
429 } else {
430 pool_put(&ipqent_pool, q);
431 }
432 }
433
434 /*
435 * Allocate a new queue entry since the received segment did not
436 * collapse onto any other out-of-order block; thus we are allocating
437 * a new block. If it had collapsed, tiqe would not be NULL and
438 * we would be reusing it.
439 * XXX If we can't, just drop the packet. XXX
440 */
441 if (tiqe == NULL) {
442 tiqe = pool_get(&ipqent_pool, PR_NOWAIT);
443 if (tiqe == NULL) {
444 tcpstat.tcps_rcvmemdrop++;
445 m_freem(m);
446 return (0);
447 }
448 }
449
450 /*
451 * Update the counters.
452 */
453 tcpstat.tcps_rcvoopack++;
454 tcpstat.tcps_rcvoobyte += rcvoobyte;
455 if (rcvpartdupbyte) {
456 tcpstat.tcps_rcvpartduppack++;
457 tcpstat.tcps_rcvpartdupbyte += rcvpartdupbyte;
458 }
459
460 /*
461 * Insert the new fragment queue entry into both queues:
 * after p (or at the head) of segq, and at the head of timeq.
462 */
463 tiqe->ipqe_m = m;
464 tiqe->ipqe_seq = pkt_seq;
465 tiqe->ipqe_len = pkt_len;
466 tiqe->ipqe_flags = pkt_flags;
467 if (p == NULL) {
468 LIST_INSERT_HEAD(&tp->segq, tiqe, ipqe_q);
469 #ifdef TCPREASS_DEBUG
470 if (tiqe->ipqe_seq != tp->rcv_nxt)
471 printf("tcp_reass[%p]: insert %u:%u(%u) at front\n",
472 tp, pkt_seq, pkt_seq + pkt_len, pkt_len);
473 #endif
474 } else {
475 LIST_INSERT_AFTER(p, tiqe, ipqe_q);
476 #ifdef TCPREASS_DEBUG
477 printf("tcp_reass[%p]: insert %u:%u(%u) after %u:%u(%u)\n",
478 tp, pkt_seq, pkt_seq + pkt_len, pkt_len,
479 p->ipqe_seq, p->ipqe_seq + p->ipqe_len, p->ipqe_len);
480 #endif
481 }
482
483 LIST_INSERT_HEAD(&tp->timeq, tiqe, ipqe_timeq);
484
485 present:
486 /*
487 * Present data to user, advancing rcv_nxt through
488 * completed sequence space. Nothing is delivered unless the
 * connection is established and the head of segq starts exactly at
 * rcv_nxt; in SYN_RECEIVED an entry that carries data is held back.
489 */
490 if (TCPS_HAVEESTABLISHED(tp->t_state) == 0)
491 return (0);
492 q = tp->segq.lh_first;
493 if (q == NULL || q->ipqe_seq != tp->rcv_nxt)
494 return (0);
495 if (tp->t_state == TCPS_SYN_RECEIVED && q->ipqe_len)
496 return (0);
497
498 tp->rcv_nxt += q->ipqe_len;
/* Only the FIN bit of the delivered entry is reported to the caller. */
499 pkt_flags = q->ipqe_flags & TH_FIN;
500 ND6_HINT(tp);
501
502 LIST_REMOVE(q, ipqe_q);
503 LIST_REMOVE(q, ipqe_timeq);
/* Discard the data if the socket can no longer receive; else queue it. */
504 if (so->so_state & SS_CANTRCVMORE)
505 m_freem(q->ipqe_m);
506 else
507 sbappend(&so->so_rcv, q->ipqe_m);
508 pool_put(&ipqent_pool, q);
509 sorwakeup(so);
510 return (pkt_flags);
511 }
512
#if defined(INET6) && !defined(TCP6)
/*
 * tcp6_input: IPv6-side entry point that forwards a packet to tcp_input().
 *
 *	mp	in: pointer to the mbuf chain holding the packet
 *	offp	in: pointer to the offset passed on to tcp_input()
 *	proto	protocol number passed on to tcp_input()
 *
 * A packet flagged M_ANYCAST6 is not passed to TCP; instead it is answered
 * with an ICMPv6 "destination unreachable / address" error whose last
 * argument is the offset of ip6_dst within the IPv6 header
 * (draft-itojun-ipv6-tcp-to-anycast).
 *
 * Always returns IPPROTO_DONE.
 */
int
tcp6_input(mp, offp, proto)
	struct mbuf **mp;
	int *offp, proto;
{
	struct mbuf *m = *mp;
	struct ip6_hdr *ip6;

	/* Common case: not anycast, so let the shared input routine run. */
	if ((m->m_flags & M_ANYCAST6) == 0) {
		tcp_input(m, *offp, proto);
		return IPPROTO_DONE;
	}

	/*
	 * draft-itojun-ipv6-tcp-to-anycast
	 * better place to put this in?
	 *
	 * Make sure the whole IPv6 header is in the first mbuf before
	 * pointing at it.
	 */
	if (m->m_len < sizeof(struct ip6_hdr)) {
		m = m_pullup(m, sizeof(struct ip6_hdr));
		if (m == NULL) {
			tcpstat.tcps_rcvshort++;
			return IPPROTO_DONE;
		}
	}

	ip6 = mtod(m, struct ip6_hdr *);
	icmp6_error(m, ICMP6_DST_UNREACH, ICMP6_DST_UNREACH_ADDR,
	    (caddr_t)&ip6->ip6_dst - (caddr_t)ip6);
	return IPPROTO_DONE;
}
#endif
544
545 /*
546 * TCP input routine, follows pages 65-76 of the
547 * protocol specification dated September, 1981 very closely.
548 */
549 void
550 #if __STDC__
551 tcp_input(struct mbuf *m, ...)
552 #else
553 tcp_input(m, va_alist)
554 struct mbuf *m;
555 #endif
556 {
557 int proto;
558 struct tcphdr *th;
559 struct ip *ip;
560 struct inpcb *inp;
561 #ifdef INET6
562 struct ip6_hdr *ip6;
563 struct in6pcb *in6p;
564 #endif
565 caddr_t optp = NULL;
566 int optlen = 0;
567 int len, tlen, toff, hdroptlen = 0;
568 struct tcpcb *tp = 0;
569 int tiflags;
570 struct socket *so = NULL;
571 int todrop, acked, ourfinisacked, needoutput = 0;
572 short ostate = 0;
573 int iss = 0;
574 u_long tiwin;
575 struct tcp_opt_info opti;
576 int off, iphlen;
577 va_list ap;
578 int af; /* af on the wire */
579 struct mbuf *tcp_saveti = NULL;
580
581 va_start(ap, m);
582 toff = va_arg(ap, int);
583 proto = va_arg(ap, int);
584 va_end(ap);
585
586 tcpstat.tcps_rcvtotal++;
587
588 bzero(&opti, sizeof(opti));
589 opti.ts_present = 0;
590 opti.maxseg = 0;
591
592 /*
593 * RFC1122 4.2.3.10, p. 104: discard bcast/mcast SYN.
594 *
595 * TCP is, by definition, unicast, so we reject all
596 * multicast outright.
597 *
598 * Note, there are additional src/dst address checks in
599 * the AF-specific code below.
600 */
601 if (m->m_flags & (M_BCAST|M_MCAST)) {
602 /* XXX stat */
603 goto drop;
604 }
605 #ifdef INET6
606 if (m->m_flags & M_ANYCAST6) {
607 /* XXX stat */
608 goto drop;
609 }
610 #endif
611
612 /*
613 * Get IP and TCP header together in first mbuf.
614 * Note: IP leaves IP header in first mbuf.
615 */
616 ip = mtod(m, struct ip *);
617 #ifdef INET6
618 ip6 = NULL;
619 #endif
620 switch (ip->ip_v) {
621 case 4:
622 af = AF_INET;
623 iphlen = sizeof(struct ip);
624 #ifndef PULLDOWN_TEST
625 /* would like to get rid of this... */
626 if (toff > sizeof (struct ip)) {
627 ip_stripoptions(m, (struct mbuf *)0);
628 toff = sizeof(struct ip);
629 }
630 if (m->m_len < toff + sizeof (struct tcphdr)) {
631 if ((m = m_pullup(m, toff + sizeof (struct tcphdr))) == 0) {
632 tcpstat.tcps_rcvshort++;
633 return;
634 }
635 }
636 ip = mtod(m, struct ip *);
637 th = (struct tcphdr *)(mtod(m, caddr_t) + toff);
638 #else
639 ip = mtod(m, struct ip *);
640 IP6_EXTHDR_GET(th, struct tcphdr *, m, toff,
641 sizeof(struct tcphdr));
642 if (th == NULL) {
643 tcpstat.tcps_rcvshort++;
644 return;
645 }
646 #endif
647
648 /*
649 * Make sure destination address is not multicast.
650 * Source address checked in ip_input().
651 */
652 if (IN_MULTICAST(ip->ip_dst.s_addr)) {
653 /* XXX stat */
654 goto drop;
655 }
656
657 /* We do the checksum after PCB lookup... */
658 len = ip->ip_len;
659 tlen = len - toff;
660 break;
661 #ifdef INET6
662 case 6:
663 ip = NULL;
664 iphlen = sizeof(struct ip6_hdr);
665 af = AF_INET6;
666 #ifndef PULLDOWN_TEST
667 if (m->m_len < toff + sizeof(struct tcphdr)) {
668 m = m_pullup(m, toff + sizeof(struct tcphdr)); /*XXX*/
669 if (m == NULL) {
670 tcpstat.tcps_rcvshort++;
671 return;
672 }
673 }
674 ip6 = mtod(m, struct ip6_hdr *);
675 th = (struct tcphdr *)(mtod(m, caddr_t) + toff);
676 #else
677 ip6 = mtod(m, struct ip6_hdr *);
678 IP6_EXTHDR_GET(th, struct tcphdr *, m, toff,
679 sizeof(struct tcphdr));
680 if (th == NULL) {
681 tcpstat.tcps_rcvshort++;
682 return;
683 }
684 #endif
685
686 /* Be proactive about malicious use of IPv4 mapped address */
687 if (IN6_IS_ADDR_V4MAPPED(&ip6->ip6_src) ||
688 IN6_IS_ADDR_V4MAPPED(&ip6->ip6_dst)) {
689 /* XXX stat */
690 goto drop;
691 }
692
693 /*
694 * Be proactive about unspecified IPv6 address in source.
695 * As we use all-zero to indicate unbound/unconnected pcb,
696 * unspecified IPv6 address can be used to confuse us.
697 *
698 * Note that packets with unspecified IPv6 destination are
699 * already dropped in ip6_input.
700 */
701 if (IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src)) {
702 /* XXX stat */
703 goto drop;
704 }
705
706 /*
707 * Make sure destination address is not multicast.
708 * Source address checked in ip6_input().
709 */
710 if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) {
711 /* XXX stat */
712 goto drop;
713 }
714
715 /* We do the checksum after PCB lookup... */
716 len = m->m_pkthdr.len;
717 tlen = len - toff;
718 break;
719 #endif
720 default:
721 m_freem(m);
722 return;
723 }
724
725 /*
726 * Check that TCP offset makes sense,
727 * pull out TCP options and adjust length. XXX
728 */
729 off = th->th_off << 2;
730 if (off < sizeof (struct tcphdr) || off > tlen) {
731 tcpstat.tcps_rcvbadoff++;
732 goto drop;
733 }
734 tlen -= off;
735
736 /*
737 * tcp_input() has been modified to use tlen to mean the TCP data
738 * length throughout the function. Other functions can use
739 * m->m_pkthdr.len as the basis for calculating the TCP data length.
740 * rja
741 */
742
743 if (off > sizeof (struct tcphdr)) {
744 #ifndef PULLDOWN_TEST
745 if (m->m_len < toff + off) {
746 if ((m = m_pullup(m, toff + off)) == 0) {
747 tcpstat.tcps_rcvshort++;
748 return;
749 }
750 switch (af) {
751 case AF_INET:
752 ip = mtod(m, struct ip *);
753 break;
754 #ifdef INET6
755 case AF_INET6:
756 ip6 = mtod(m, struct ip6_hdr *);
757 break;
758 #endif
759 }
760 th = (struct tcphdr *)(mtod(m, caddr_t) + toff);
761 }
762 #else
763 IP6_EXTHDR_GET(th, struct tcphdr *, m, toff, off);
764 if (th == NULL) {
765 tcpstat.tcps_rcvshort++;
766 return;
767 }
768 /*
769 * NOTE: ip/ip6 will not be affected by m_pulldown()
770 * (as they're before toff) and we don't need to update those.
771 */
772 #endif
773 optlen = off - sizeof (struct tcphdr);
774 optp = ((caddr_t)th) + sizeof(struct tcphdr);
775 /*
776 * Do quick retrieval of timestamp options ("options
777 * prediction?"). If timestamp is the only option and it's
778 * formatted as recommended in RFC 1323 appendix A, we
779 * quickly get the values now and not bother calling
780 * tcp_dooptions(), etc.
781 */
782 if ((optlen == TCPOLEN_TSTAMP_APPA ||
783 (optlen > TCPOLEN_TSTAMP_APPA &&
784 optp[TCPOLEN_TSTAMP_APPA] == TCPOPT_EOL)) &&
785 *(u_int32_t *)optp == htonl(TCPOPT_TSTAMP_HDR) &&
786 (th->th_flags & TH_SYN) == 0) {
787 opti.ts_present = 1;
788 opti.ts_val = ntohl(*(u_int32_t *)(optp + 4));
789 opti.ts_ecr = ntohl(*(u_int32_t *)(optp + 8));
790 optp = NULL; /* we've parsed the options */
791 }
792 }
793 tiflags = th->th_flags;
794
795 /*
796 * Locate pcb for segment.
797 */
798 findpcb:
799 inp = NULL;
800 #ifdef INET6
801 in6p = NULL;
802 #endif
803 switch (af) {
804 case AF_INET:
805 inp = in_pcblookup_connect(&tcbtable, ip->ip_src, th->th_sport,
806 ip->ip_dst, th->th_dport);
807 if (inp == 0) {
808 ++tcpstat.tcps_pcbhashmiss;
809 inp = in_pcblookup_bind(&tcbtable, ip->ip_dst, th->th_dport);
810 }
811 #if defined(INET6) && !defined(TCP6)
812 if (inp == 0) {
813 struct in6_addr s, d;
814
815 /* mapped addr case */
816 bzero(&s, sizeof(s));
817 s.s6_addr16[5] = htons(0xffff);
818 bcopy(&ip->ip_src, &s.s6_addr32[3], sizeof(ip->ip_src));
819 bzero(&d, sizeof(d));
820 d.s6_addr16[5] = htons(0xffff);
821 bcopy(&ip->ip_dst, &d.s6_addr32[3], sizeof(ip->ip_dst));
822 in6p = in6_pcblookup_connect(&tcb6, &s, th->th_sport,
823 &d, th->th_dport, 0);
824 if (in6p == 0) {
825 ++tcpstat.tcps_pcbhashmiss;
826 in6p = in6_pcblookup_bind(&tcb6, &d,
827 th->th_dport, 0);
828 }
829 }
830 #endif
831 #ifndef INET6
832 if (inp == 0)
833 #else
834 if (inp == 0 && in6p == 0)
835 #endif
836 {
837 ++tcpstat.tcps_noport;
838 if (tcp_log_refused && (tiflags & TH_SYN)) {
839 #ifndef INET6
840 char src[4*sizeof "123"];
841 char dst[4*sizeof "123"];
842 #else
843 char src[INET6_ADDRSTRLEN];
844 char dst[INET6_ADDRSTRLEN];
845 #endif
846 if (ip) {
847 strcpy(src, inet_ntoa(ip->ip_src));
848 strcpy(dst, inet_ntoa(ip->ip_dst));
849 }
850 #ifdef INET6
851 else if (ip6) {
852 strcpy(src, ip6_sprintf(&ip6->ip6_src));
853 strcpy(dst, ip6_sprintf(&ip6->ip6_dst));
854 }
855 #endif
856 else {
857 strcpy(src, "(unknown)");
858 strcpy(dst, "(unknown)");
859 }
860 log(LOG_INFO,
861 "Connection attempt to TCP %s:%d from %s:%d\n",
862 dst, ntohs(th->th_dport),
863 src, ntohs(th->th_sport));
864 }
865 TCP_FIELDS_TO_HOST(th);
866 goto dropwithreset_ratelim;
867 }
868 #ifdef IPSEC
869 if (inp && ipsec4_in_reject(m, inp)) {
870 ipsecstat.in_polvio++;
871 goto drop;
872 }
873 #ifdef INET6
874 else if (in6p && ipsec4_in_reject_so(m, in6p->in6p_socket)) {
875 ipsecstat.in_polvio++;
876 goto drop;
877 }
878 #endif
879 #endif /*IPSEC*/
880 break;
881 #if defined(INET6) && !defined(TCP6)
882 case AF_INET6:
883 {
884 int faith;
885
886 #if defined(NFAITH) && NFAITH > 0
887 if (m->m_pkthdr.rcvif
888 && m->m_pkthdr.rcvif->if_type == IFT_FAITH) {
889 faith = 1;
890 } else
891 faith = 0;
892 #else
893 faith = 0;
894 #endif
895 in6p = in6_pcblookup_connect(&tcb6, &ip6->ip6_src, th->th_sport,
896 &ip6->ip6_dst, th->th_dport, faith);
897 if (in6p == NULL) {
898 ++tcpstat.tcps_pcbhashmiss;
899 in6p = in6_pcblookup_bind(&tcb6, &ip6->ip6_dst,
900 th->th_dport, faith);
901 }
902 if (in6p == NULL) {
903 ++tcpstat.tcps_noport;
904 TCP_FIELDS_TO_HOST(th);
905 goto dropwithreset_ratelim;
906 }
907 #ifdef IPSEC
908 if (ipsec6_in_reject(m, in6p)) {
909 ipsec6stat.in_polvio++;
910 goto drop;
911 }
912 #endif /*IPSEC*/
913 break;
914 }
915 #endif
916 }
917
918 /*
919 * If the state is CLOSED (i.e., TCB does not exist) then
920 * all data in the incoming segment is discarded.
921 * If the TCB exists but is in CLOSED state, it is embryonic,
922 * but should either do a listen or a connect soon.
923 */
924 tp = NULL;
925 so = NULL;
926 if (inp) {
927 tp = intotcpcb(inp);
928 so = inp->inp_socket;
929 }
930 #ifdef INET6
931 else if (in6p) {
932 tp = in6totcpcb(in6p);
933 so = in6p->in6p_socket;
934 }
935 #endif
936 if (tp == 0) {
937 TCP_FIELDS_TO_HOST(th);
938 goto dropwithreset_ratelim;
939 }
940 if (tp->t_state == TCPS_CLOSED)
941 goto drop;
942
943 /*
944 * Checksum extended TCP header and data.
945 */
946 switch (af) {
947 case AF_INET:
948 #ifndef PULLDOWN_TEST
949 {
950 struct ipovly *ipov;
951 ipov = (struct ipovly *)ip;
952 bzero(ipov->ih_x1, sizeof ipov->ih_x1);
953 ipov->ih_len = htons(tlen + off);
954
955 if (in_cksum(m, len) != 0) {
956 tcpstat.tcps_rcvbadsum++;
957 goto drop;
958 }
959 }
960 #else
961 if (in4_cksum(m, IPPROTO_TCP, toff, tlen + off) != 0) {
962 tcpstat.tcps_rcvbadsum++;
963 goto drop;
964 }
965 #endif
966 break;
967
968 #ifdef INET6
969 case AF_INET6:
970 if (in6_cksum(m, IPPROTO_TCP, toff, tlen + off) != 0) {
971 tcpstat.tcps_rcvbadsum++;
972 goto drop;
973 }
974 break;
975 #endif
976 }
977
978 TCP_FIELDS_TO_HOST(th);
979
980 /* Unscale the window into a 32-bit value. */
981 if ((tiflags & TH_SYN) == 0)
982 tiwin = th->th_win << tp->snd_scale;
983 else
984 tiwin = th->th_win;
985
986 #ifdef INET6
987 /* save packet options if user wanted */
988 if (in6p && (in6p->in6p_flags & IN6P_CONTROLOPTS)) {
989 if (in6p->in6p_options) {
990 m_freem(in6p->in6p_options);
991 in6p->in6p_options = 0;
992 }
993 ip6_savecontrol(in6p, &in6p->in6p_options, ip6, m);
994 }
995 #endif
996
997 if (so->so_options & (SO_DEBUG|SO_ACCEPTCONN)) {
998 union syn_cache_sa src;
999 union syn_cache_sa dst;
1000
1001 bzero(&src, sizeof(src));
1002 bzero(&dst, sizeof(dst));
1003 switch (af) {
1004 case AF_INET:
1005 src.sin.sin_len = sizeof(struct sockaddr_in);
1006 src.sin.sin_family = AF_INET;
1007 src.sin.sin_addr = ip->ip_src;
1008 src.sin.sin_port = th->th_sport;
1009
1010 dst.sin.sin_len = sizeof(struct sockaddr_in);
1011 dst.sin.sin_family = AF_INET;
1012 dst.sin.sin_addr = ip->ip_dst;
1013 dst.sin.sin_port = th->th_dport;
1014 break;
1015 #ifdef INET6
1016 case AF_INET6:
1017 src.sin6.sin6_len = sizeof(struct sockaddr_in6);
1018 src.sin6.sin6_family = AF_INET6;
1019 src.sin6.sin6_addr = ip6->ip6_src;
1020 src.sin6.sin6_port = th->th_sport;
1021
1022 dst.sin6.sin6_len = sizeof(struct sockaddr_in6);
1023 dst.sin6.sin6_family = AF_INET6;
1024 dst.sin6.sin6_addr = ip6->ip6_dst;
1025 dst.sin6.sin6_port = th->th_dport;
1026 break;
1027 #endif /* INET6 */
1028 default:
1029 goto badsyn; /*sanity*/
1030 }
1031
1032 if (so->so_options & SO_DEBUG) {
1033 ostate = tp->t_state;
1034
1035 tcp_saveti = NULL;
1036 if (iphlen + sizeof(struct tcphdr) > MHLEN)
1037 goto nosave;
1038
1039 if (m->m_len > iphlen && (m->m_flags & M_EXT) == 0) {
1040 tcp_saveti = m_copym(m, 0, iphlen, M_DONTWAIT);
1041 if (!tcp_saveti)
1042 goto nosave;
1043 } else {
1044 MGETHDR(tcp_saveti, M_DONTWAIT, MT_HEADER);
1045 if (!tcp_saveti)
1046 goto nosave;
1047 tcp_saveti->m_len = iphlen;
1048 m_copydata(m, 0, iphlen,
1049 mtod(tcp_saveti, caddr_t));
1050 }
1051
1052 if (M_TRAILINGSPACE(tcp_saveti) < sizeof(struct tcphdr)) {
1053 m_freem(tcp_saveti);
1054 tcp_saveti = NULL;
1055 } else {
1056 tcp_saveti->m_len += sizeof(struct tcphdr);
1057 bcopy(th, mtod(tcp_saveti, caddr_t) + iphlen,
1058 sizeof(struct tcphdr));
1059 }
1060 if (tcp_saveti) {
1061 /*
1062 * need to recover version # field, which was
1063 * overwritten on ip_cksum computation.
1064 */
1065 struct ip *sip;
1066 sip = mtod(tcp_saveti, struct ip *);
1067 switch (af) {
1068 case AF_INET:
1069 sip->ip_v = 4;
1070 break;
1071 #ifdef INET6
1072 case AF_INET6:
1073 sip->ip_v = 6;
1074 break;
1075 #endif
1076 }
1077 }
1078 nosave:;
1079 }
1080 if (so->so_options & SO_ACCEPTCONN) {
1081 if ((tiflags & (TH_RST|TH_ACK|TH_SYN)) != TH_SYN) {
1082 if (tiflags & TH_RST) {
1083 syn_cache_reset(&src.sa, &dst.sa, th);
1084 } else if ((tiflags & (TH_ACK|TH_SYN)) ==
1085 (TH_ACK|TH_SYN)) {
1086 /*
1087 * Received a SYN,ACK. This should
1088 * never happen while we are in
1089 * LISTEN. Send an RST.
1090 */
1091 goto badsyn;
1092 } else if (tiflags & TH_ACK) {
1093 so = syn_cache_get(&src.sa, &dst.sa,
1094 th, toff, tlen, so, m);
1095 if (so == NULL) {
1096 /*
1097 * We don't have a SYN for
1098 * this ACK; send an RST.
1099 */
1100 goto badsyn;
1101 } else if (so ==
1102 (struct socket *)(-1)) {
1103 /*
1104 * We were unable to create
1105 * the connection. If the
1106 * 3-way handshake was
1107 * completed, an RST has
1108 * been sent to the peer.
1109 * Since the mbuf might be
1110 * in use for the reply,
1111 * do not free it.
1112 */
1113 m = NULL;
1114 } else {
1115 /*
1116 * We have created a
1117 * full-blown connection.
1118 */
1119 tp = NULL;
1120 inp = NULL;
1121 #ifdef INET6
1122 in6p = NULL;
1123 #endif
1124 switch (so->so_proto->pr_domain->dom_family) {
1125 case AF_INET:
1126 inp = sotoinpcb(so);
1127 tp = intotcpcb(inp);
1128 break;
1129 #ifdef INET6
1130 case AF_INET6:
1131 in6p = sotoin6pcb(so);
1132 tp = in6totcpcb(in6p);
1133 break;
1134 #endif
1135 }
1136 if (tp == NULL)
1137 goto badsyn; /*XXX*/
1138 tiwin <<= tp->snd_scale;
1139 goto after_listen;
1140 }
1141 } else {
1142 /*
1143 * None of RST, SYN or ACK was set.
1144 * This is an invalid packet for a
1145 * TCB in LISTEN state. Send a RST.
1146 */
1147 goto badsyn;
1148 }
1149 } else {
1150 /*
1151 * Received a SYN.
1152 */
1153
1154 /*
1155 * LISTEN socket received a SYN
1156 * from itself? This can't possibly
1157 * be valid; drop the packet.
1158 */
1159 if (th->th_sport == th->th_dport) {
1160 int i;
1161
1162 switch (af) {
1163 case AF_INET:
1164 i = in_hosteq(ip->ip_src, ip->ip_dst);
1165 break;
1166 #ifdef INET6
1167 case AF_INET6:
1168 i = IN6_ARE_ADDR_EQUAL(&ip6->ip6_src, &ip6->ip6_dst);
1169 break;
1170 #endif
1171 default:
1172 i = 1;
1173 }
1174 if (i) {
1175 tcpstat.tcps_badsyn++;
1176 goto drop;
1177 }
1178 }
1179
1180 /*
1181 * SYN looks ok; create compressed TCP
1182 * state for it.
1183 */
1184 if (so->so_qlen <= so->so_qlimit &&
1185 syn_cache_add(&src.sa, &dst.sa, th, tlen,
1186 so, m, optp, optlen, &opti))
1187 m = NULL;
1188 }
1189 goto drop;
1190 }
1191 }
1192
1193 after_listen:
1194 #ifdef DIAGNOSTIC
1195 /*
1196 * Should not happen now that all embryonic connections
1197 * are handled with compressed state.
1198 */
1199 if (tp->t_state == TCPS_LISTEN)
1200 panic("tcp_input: TCPS_LISTEN");
1201 #endif
1202
1203 /*
1204 * Segment received on connection.
1205 * Reset idle time and keep-alive timer.
1206 */
1207 tp->t_idle = 0;
1208 if (TCPS_HAVEESTABLISHED(tp->t_state))
1209 TCP_TIMER_ARM(tp, TCPT_KEEP, tcp_keepidle);
1210
1211 /*
1212 * Process options.
1213 */
1214 if (optp)
1215 tcp_dooptions(tp, optp, optlen, th, &opti);
1216
1217 /*
1218 * Header prediction: check for the two common cases
1219 * of a uni-directional data xfer. If the packet has
1220 * no control flags, is in-sequence, the window didn't
1221 * change and we're not retransmitting, it's a
1222 * candidate. If the length is zero and the ack moved
1223 * forward, we're the sender side of the xfer. Just
1224 * free the data acked & wake any higher level process
1225 * that was blocked waiting for space. If the length
1226 * is non-zero and the ack didn't move, we're the
1227 * receiver side. If we're getting packets in-order
1228 * (the reassembly queue is empty), add the data to
1229 * the socket buffer and note that we need a delayed ack.
1230 */
1231 if (tp->t_state == TCPS_ESTABLISHED &&
1232 (tiflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK)) == TH_ACK &&
1233 (!opti.ts_present || TSTMP_GEQ(opti.ts_val, tp->ts_recent)) &&
1234 th->th_seq == tp->rcv_nxt &&
1235 tiwin && tiwin == tp->snd_wnd &&
1236 tp->snd_nxt == tp->snd_max) {
1237
1238 /*
1239 * If last ACK falls within this segment's sequence numbers,
1240 * record the timestamp.
1241 */
1242 if (opti.ts_present &&
1243 SEQ_LEQ(th->th_seq, tp->last_ack_sent) &&
1244 SEQ_LT(tp->last_ack_sent, th->th_seq + tlen)) {
1245 tp->ts_recent_age = tcp_now;
1246 tp->ts_recent = opti.ts_val;
1247 }
1248
1249 if (tlen == 0) {
1250 if (SEQ_GT(th->th_ack, tp->snd_una) &&
1251 SEQ_LEQ(th->th_ack, tp->snd_max) &&
1252 tp->snd_cwnd >= tp->snd_wnd &&
1253 tp->t_dupacks < tcprexmtthresh) {
1254 /*
1255 * this is a pure ack for outstanding data.
1256 */
1257 ++tcpstat.tcps_predack;
1258 if (opti.ts_present && opti.ts_ecr)
1259 tcp_xmit_timer(tp,
1260 tcp_now - opti.ts_ecr + 1);
1261 else if (tp->t_rtt &&
1262 SEQ_GT(th->th_ack, tp->t_rtseq))
1263 tcp_xmit_timer(tp, tp->t_rtt);
1264 acked = th->th_ack - tp->snd_una;
1265 tcpstat.tcps_rcvackpack++;
1266 tcpstat.tcps_rcvackbyte += acked;
1267 ND6_HINT(tp);
1268 sbdrop(&so->so_snd, acked);
1269 /*
1270 * We want snd_recover to track snd_una to
1271 * avoid sequence wraparound problems for
1272 * very large transfers.
1273 */
1274 tp->snd_una = tp->snd_recover = th->th_ack;
1275 m_freem(m);
1276
1277 /*
1278 * If all outstanding data are acked, stop
1279 * retransmit timer, otherwise restart timer
1280 * using current (possibly backed-off) value.
1281 * If process is waiting for space,
1282 * wakeup/selwakeup/signal. If data
1283 * are ready to send, let tcp_output
1284 * decide between more output or persist.
1285 */
1286 if (tp->snd_una == tp->snd_max)
1287 TCP_TIMER_DISARM(tp, TCPT_REXMT);
1288 else if (TCP_TIMER_ISARMED(tp,
1289 TCPT_PERSIST) == 0)
1290 TCP_TIMER_ARM(tp, TCPT_REXMT,
1291 tp->t_rxtcur);
1292
1293 sowwakeup(so);
1294 if (so->so_snd.sb_cc)
1295 (void) tcp_output(tp);
1296 if (tcp_saveti)
1297 m_freem(tcp_saveti);
1298 return;
1299 }
1300 } else if (th->th_ack == tp->snd_una &&
1301 tp->segq.lh_first == NULL &&
1302 tlen <= sbspace(&so->so_rcv)) {
1303 /*
1304 * this is a pure, in-sequence data packet
1305 * with nothing on the reassembly queue and
1306 * we have enough buffer space to take it.
1307 */
1308 ++tcpstat.tcps_preddat;
1309 tp->rcv_nxt += tlen;
1310 tcpstat.tcps_rcvpack++;
1311 tcpstat.tcps_rcvbyte += tlen;
1312 ND6_HINT(tp);
1313 /*
1314 * Drop TCP, IP headers and TCP options then add data
1315 * to socket buffer.
1316 */
1317 m_adj(m, toff + off);
1318 sbappend(&so->so_rcv, m);
1319 sorwakeup(so);
1320 TCP_SETUP_ACK(tp, th);
1321 if (tp->t_flags & TF_ACKNOW)
1322 (void) tcp_output(tp);
1323 if (tcp_saveti)
1324 m_freem(tcp_saveti);
1325 return;
1326 }
1327 }
1328
1329 /*
1330 * Compute mbuf offset to TCP data segment.
1331 */
1332 hdroptlen = toff + off;
1333
1334 /*
1335 * Calculate amount of space in receive window,
1336 * and then do TCP input processing.
1337 * Receive window is amount of space in rcv queue,
1338 * but not less than advertised window.
1339 */
1340 { int win;
1341
1342 win = sbspace(&so->so_rcv);
1343 if (win < 0)
1344 win = 0;
1345 tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt));
1346 }
1347
1348 switch (tp->t_state) {
1349
1350 /*
1351 * If the state is SYN_SENT:
1352 * if seg contains an ACK, but not for our SYN, drop the input.
1353 * if seg contains a RST, then drop the connection.
1354 * if seg does not contain SYN, then drop it.
1355 * Otherwise this is an acceptable SYN segment
1356 * initialize tp->rcv_nxt and tp->irs
1357 * if seg contains ack then advance tp->snd_una
1358 * if SYN has been acked change to ESTABLISHED else SYN_RCVD state
1359 * arrange for segment to be acked (eventually)
1360 * continue processing rest of data/controls, beginning with URG
1361 */
1362 case TCPS_SYN_SENT:
1363 if ((tiflags & TH_ACK) &&
1364 (SEQ_LEQ(th->th_ack, tp->iss) ||
1365 SEQ_GT(th->th_ack, tp->snd_max)))
1366 goto dropwithreset;
1367 if (tiflags & TH_RST) {
1368 if (tiflags & TH_ACK)
1369 tp = tcp_drop(tp, ECONNREFUSED);
1370 goto drop;
1371 }
1372 if ((tiflags & TH_SYN) == 0)
1373 goto drop;
1374 if (tiflags & TH_ACK) {
1375 tp->snd_una = tp->snd_recover = th->th_ack;
1376 if (SEQ_LT(tp->snd_nxt, tp->snd_una))
1377 tp->snd_nxt = tp->snd_una;
1378 TCP_TIMER_DISARM(tp, TCPT_REXMT);
1379 }
1380 tp->irs = th->th_seq;
1381 tcp_rcvseqinit(tp);
1382 tp->t_flags |= TF_ACKNOW;
1383 tcp_mss_from_peer(tp, opti.maxseg);
1384
1385 /*
1386 * Initialize the initial congestion window. If we
1387 * had to retransmit the SYN, we must initialize cwnd
1388 * to 1 segment (i.e. the Loss Window).
1389 */
1390 if (tp->t_flags & TF_SYN_REXMT)
1391 tp->snd_cwnd = tp->t_peermss;
1392 else
1393 tp->snd_cwnd = TCP_INITIAL_WINDOW(tcp_init_win,
1394 tp->t_peermss);
1395
1396 tcp_rmx_rtt(tp);
1397 if (tiflags & TH_ACK) {
1398 tcpstat.tcps_connects++;
1399 soisconnected(so);
1400 tcp_established(tp);
1401 /* Do window scaling on this connection? */
1402 if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
1403 (TF_RCVD_SCALE|TF_REQ_SCALE)) {
1404 tp->snd_scale = tp->requested_s_scale;
1405 tp->rcv_scale = tp->request_r_scale;
1406 }
1407 TCP_REASS_LOCK(tp);
1408 (void) tcp_reass(tp, NULL, (struct mbuf *)0, &tlen);
1409 TCP_REASS_UNLOCK(tp);
1410 /*
1411 * if we didn't have to retransmit the SYN,
1412 * use its rtt as our initial srtt & rtt var.
1413 */
1414 if (tp->t_rtt)
1415 tcp_xmit_timer(tp, tp->t_rtt);
1416 } else
1417 tp->t_state = TCPS_SYN_RECEIVED;
1418
1419 /*
1420 * Advance th->th_seq to correspond to first data byte.
1421 * If data, trim to stay within window,
1422 * dropping FIN if necessary.
1423 */
1424 th->th_seq++;
1425 if (tlen > tp->rcv_wnd) {
1426 todrop = tlen - tp->rcv_wnd;
1427 m_adj(m, -todrop);
1428 tlen = tp->rcv_wnd;
1429 tiflags &= ~TH_FIN;
1430 tcpstat.tcps_rcvpackafterwin++;
1431 tcpstat.tcps_rcvbyteafterwin += todrop;
1432 }
1433 tp->snd_wl1 = th->th_seq - 1;
1434 tp->rcv_up = th->th_seq;
1435 goto step6;
1436
1437 /*
1438 * If the state is SYN_RECEIVED:
1439 * If seg contains an ACK, but not for our SYN, drop the input
1440 * and generate an RST. See page 36, rfc793
1441 */
1442 case TCPS_SYN_RECEIVED:
1443 if ((tiflags & TH_ACK) &&
1444 (SEQ_LEQ(th->th_ack, tp->iss) ||
1445 SEQ_GT(th->th_ack, tp->snd_max)))
1446 goto dropwithreset;
1447 break;
1448 }
1449
1450 /*
1451 * States other than LISTEN or SYN_SENT.
1452 * First check timestamp, if present.
1453 * Then check that at least some bytes of segment are within
1454 * receive window. If segment begins before rcv_nxt,
1455 * drop leading data (and SYN); if nothing left, just ack.
1456 *
1457 * RFC 1323 PAWS: If we have a timestamp reply on this segment
1458 * and it's less than ts_recent, drop it.
1459 */
1460 if (opti.ts_present && (tiflags & TH_RST) == 0 && tp->ts_recent &&
1461 TSTMP_LT(opti.ts_val, tp->ts_recent)) {
1462
1463 /* Check to see if ts_recent is over 24 days old. */
1464 if ((int)(tcp_now - tp->ts_recent_age) > TCP_PAWS_IDLE) {
1465 /*
1466 * Invalidate ts_recent. If this segment updates
1467 * ts_recent, the age will be reset later and ts_recent
1468 * will get a valid value. If it does not, setting
1469 * ts_recent to zero will at least satisfy the
1470 * requirement that zero be placed in the timestamp
1471 * echo reply when ts_recent isn't valid. The
1472 * age isn't reset until we get a valid ts_recent
1473 * because we don't want out-of-order segments to be
1474 * dropped when ts_recent is old.
1475 */
1476 tp->ts_recent = 0;
1477 } else {
1478 tcpstat.tcps_rcvduppack++;
1479 tcpstat.tcps_rcvdupbyte += tlen;
1480 tcpstat.tcps_pawsdrop++;
1481 goto dropafterack;
1482 }
1483 }
1484
1485 todrop = tp->rcv_nxt - th->th_seq;
1486 if (todrop > 0) {
1487 if (tiflags & TH_SYN) {
1488 tiflags &= ~TH_SYN;
1489 th->th_seq++;
1490 if (th->th_urp > 1)
1491 th->th_urp--;
1492 else {
1493 tiflags &= ~TH_URG;
1494 th->th_urp = 0;
1495 }
1496 todrop--;
1497 }
1498 if (todrop > tlen ||
1499 (todrop == tlen && (tiflags & TH_FIN) == 0)) {
1500 /*
1501 * Any valid FIN must be to the left of the window.
1502 * At this point the FIN must be a duplicate or
1503 * out of sequence; drop it.
1504 */
1505 tiflags &= ~TH_FIN;
1506 /*
1507 * Send an ACK to resynchronize and drop any data.
1508 * But keep on processing for RST or ACK.
1509 */
1510 tp->t_flags |= TF_ACKNOW;
1511 todrop = tlen;
1512 tcpstat.tcps_rcvdupbyte += todrop;
1513 tcpstat.tcps_rcvduppack++;
1514 } else {
1515 tcpstat.tcps_rcvpartduppack++;
1516 tcpstat.tcps_rcvpartdupbyte += todrop;
1517 }
1518 hdroptlen += todrop; /*drop from head afterwards*/
1519 th->th_seq += todrop;
1520 tlen -= todrop;
1521 if (th->th_urp > todrop)
1522 th->th_urp -= todrop;
1523 else {
1524 tiflags &= ~TH_URG;
1525 th->th_urp = 0;
1526 }
1527 }
1528
1529 /*
1530 * If new data are received on a connection after the
1531 * user processes are gone, then RST the other end.
1532 */
1533 if ((so->so_state & SS_NOFDREF) &&
1534 tp->t_state > TCPS_CLOSE_WAIT && tlen) {
1535 tp = tcp_close(tp);
1536 tcpstat.tcps_rcvafterclose++;
1537 goto dropwithreset;
1538 }
1539
1540 /*
1541 * If segment ends after window, drop trailing data
1542 * (and PUSH and FIN); if nothing left, just ACK.
1543 */
1544 todrop = (th->th_seq + tlen) - (tp->rcv_nxt+tp->rcv_wnd);
1545 if (todrop > 0) {
1546 tcpstat.tcps_rcvpackafterwin++;
1547 if (todrop >= tlen) {
1548 tcpstat.tcps_rcvbyteafterwin += tlen;
1549 /*
1550 * If a new connection request is received
1551 * while in TIME_WAIT, drop the old connection
1552 * and start over if the sequence numbers
1553 * are above the previous ones.
1554 */
1555 if (tiflags & TH_SYN &&
1556 tp->t_state == TCPS_TIME_WAIT &&
1557 SEQ_GT(th->th_seq, tp->rcv_nxt)) {
1558 iss = tcp_new_iss(tp, sizeof(struct tcpcb),
1559 tp->snd_nxt);
1560 tp = tcp_close(tp);
1561 goto findpcb;
1562 }
1563 /*
1564 * If window is closed can only take segments at
1565 * window edge, and have to drop data and PUSH from
1566 * incoming segments. Continue processing, but
1567 * remember to ack. Otherwise, drop segment
1568 * and ack.
1569 */
1570 if (tp->rcv_wnd == 0 && th->th_seq == tp->rcv_nxt) {
1571 tp->t_flags |= TF_ACKNOW;
1572 tcpstat.tcps_rcvwinprobe++;
1573 } else
1574 goto dropafterack;
1575 } else
1576 tcpstat.tcps_rcvbyteafterwin += todrop;
1577 m_adj(m, -todrop);
1578 tlen -= todrop;
1579 tiflags &= ~(TH_PUSH|TH_FIN);
1580 }
1581
1582 /*
1583 * If last ACK falls within this segment's sequence numbers,
1584 * and the timestamp is newer, record it.
1585 */
1586 if (opti.ts_present && TSTMP_GEQ(opti.ts_val, tp->ts_recent) &&
1587 SEQ_LEQ(th->th_seq, tp->last_ack_sent) &&
1588 SEQ_LT(tp->last_ack_sent, th->th_seq + tlen +
1589 ((tiflags & (TH_SYN|TH_FIN)) != 0))) {
1590 tp->ts_recent_age = tcp_now;
1591 tp->ts_recent = opti.ts_val;
1592 }
1593
1594 /*
1595 * If the RST bit is set examine the state:
1596 * SYN_RECEIVED STATE:
1597 * If passive open, return to LISTEN state.
1598 * If active open, inform user that connection was refused.
1599 * ESTABLISHED, FIN_WAIT_1, FIN_WAIT2, CLOSE_WAIT STATES:
1600 * Inform user that connection was reset, and close tcb.
1601 * CLOSING, LAST_ACK, TIME_WAIT STATES
1602 * Close the tcb.
1603 */
1604 if (tiflags&TH_RST) switch (tp->t_state) {
1605
1606 case TCPS_SYN_RECEIVED:
1607 so->so_error = ECONNREFUSED;
1608 goto close;
1609
1610 case TCPS_ESTABLISHED:
1611 case TCPS_FIN_WAIT_1:
1612 case TCPS_FIN_WAIT_2:
1613 case TCPS_CLOSE_WAIT:
1614 so->so_error = ECONNRESET;
1615 close:
1616 tp->t_state = TCPS_CLOSED;
1617 tcpstat.tcps_drops++;
1618 tp = tcp_close(tp);
1619 goto drop;
1620
1621 case TCPS_CLOSING:
1622 case TCPS_LAST_ACK:
1623 case TCPS_TIME_WAIT:
1624 tp = tcp_close(tp);
1625 goto drop;
1626 }
1627
1628 /*
1629 * If a SYN is in the window, then this is an
1630 * error and we send an RST and drop the connection.
1631 */
1632 if (tiflags & TH_SYN) {
1633 tp = tcp_drop(tp, ECONNRESET);
1634 goto dropwithreset;
1635 }
1636
1637 /*
1638 * If the ACK bit is off we drop the segment and return.
1639 */
1640 if ((tiflags & TH_ACK) == 0) {
1641 if (tp->t_flags & TF_ACKNOW)
1642 goto dropafterack;
1643 else
1644 goto drop;
1645 }
1646
1647 /*
1648 * Ack processing.
1649 */
1650 switch (tp->t_state) {
1651
1652 /*
1653 * In SYN_RECEIVED state if the ack ACKs our SYN then enter
1654 * ESTABLISHED state and continue processing, otherwise
1655 * send an RST.
1656 */
1657 case TCPS_SYN_RECEIVED:
1658 if (SEQ_GT(tp->snd_una, th->th_ack) ||
1659 SEQ_GT(th->th_ack, tp->snd_max))
1660 goto dropwithreset;
1661 tcpstat.tcps_connects++;
1662 soisconnected(so);
1663 tcp_established(tp);
1664 /* Do window scaling? */
1665 if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
1666 (TF_RCVD_SCALE|TF_REQ_SCALE)) {
1667 tp->snd_scale = tp->requested_s_scale;
1668 tp->rcv_scale = tp->request_r_scale;
1669 }
1670 TCP_REASS_LOCK(tp);
1671 (void) tcp_reass(tp, NULL, (struct mbuf *)0, &tlen);
1672 TCP_REASS_UNLOCK(tp);
1673 tp->snd_wl1 = th->th_seq - 1;
1674 /* fall into ... */
1675
1676 /*
1677 * In ESTABLISHED state: drop duplicate ACKs; ACK out of range
1678 * ACKs. If the ack is in the range
1679 * tp->snd_una < th->th_ack <= tp->snd_max
1680 * then advance tp->snd_una to th->th_ack and drop
1681 * data from the retransmission queue. If this ACK reflects
1682 * more up to date window information we update our window information.
1683 */
1684 case TCPS_ESTABLISHED:
1685 case TCPS_FIN_WAIT_1:
1686 case TCPS_FIN_WAIT_2:
1687 case TCPS_CLOSE_WAIT:
1688 case TCPS_CLOSING:
1689 case TCPS_LAST_ACK:
1690 case TCPS_TIME_WAIT:
1691
1692 if (SEQ_LEQ(th->th_ack, tp->snd_una)) {
1693 if (tlen == 0 && tiwin == tp->snd_wnd) {
1694 tcpstat.tcps_rcvdupack++;
1695 /*
1696 * If we have outstanding data (other than
1697 * a window probe), this is a completely
1698 * duplicate ack (ie, window info didn't
1699 * change), the ack is the biggest we've
1700 * seen and we've seen exactly our rexmt
1701 * threshold of them, assume a packet
1702 * has been dropped and retransmit it.
1703 * Kludge snd_nxt & the congestion
1704 * window so we send only this one
1705 * packet.
1706 *
1707 * We know we're losing at the current
1708 * window size so do congestion avoidance
1709 * (set ssthresh to half the current window
1710 * and pull our congestion window back to
1711 * the new ssthresh).
1712 *
1713 * Dup acks mean that packets have left the
1714 * network (they're now cached at the receiver)
1715 * so bump cwnd by the amount in the receiver
1716 * to keep a constant cwnd packets in the
1717 * network.
1718 */
1719 if (TCP_TIMER_ISARMED(tp, TCPT_REXMT) == 0 ||
1720 th->th_ack != tp->snd_una)
1721 tp->t_dupacks = 0;
1722 else if (++tp->t_dupacks == tcprexmtthresh) {
1723 tcp_seq onxt = tp->snd_nxt;
1724 u_int win =
1725 min(tp->snd_wnd, tp->snd_cwnd) /
1726 2 / tp->t_segsz;
1727 if (tcp_do_newreno && SEQ_LT(th->th_ack,
1728 tp->snd_recover)) {
1729 /*
1730 * False fast retransmit after
1731 * timeout. Do not cut window.
1732 */
1733 tp->snd_cwnd += tp->t_segsz;
1734 tp->t_dupacks = 0;
1735 (void) tcp_output(tp);
1736 goto drop;
1737 }
1738
1739 if (win < 2)
1740 win = 2;
1741 tp->snd_ssthresh = win * tp->t_segsz;
1742 tp->snd_recover = tp->snd_max;
1743 TCP_TIMER_DISARM(tp, TCPT_REXMT);
1744 tp->t_rtt = 0;
1745 tp->snd_nxt = th->th_ack;
1746 tp->snd_cwnd = tp->t_segsz;
1747 (void) tcp_output(tp);
1748 tp->snd_cwnd = tp->snd_ssthresh +
1749 tp->t_segsz * tp->t_dupacks;
1750 if (SEQ_GT(onxt, tp->snd_nxt))
1751 tp->snd_nxt = onxt;
1752 goto drop;
1753 } else if (tp->t_dupacks > tcprexmtthresh) {
1754 tp->snd_cwnd += tp->t_segsz;
1755 (void) tcp_output(tp);
1756 goto drop;
1757 }
1758 } else
1759 tp->t_dupacks = 0;
1760 break;
1761 }
1762 /*
1763 * If the congestion window was inflated to account
1764 * for the other side's cached packets, retract it.
1765 */
1766 if (tcp_do_newreno == 0) {
1767 if (tp->t_dupacks >= tcprexmtthresh &&
1768 tp->snd_cwnd > tp->snd_ssthresh)
1769 tp->snd_cwnd = tp->snd_ssthresh;
1770 tp->t_dupacks = 0;
1771 } else if (tp->t_dupacks >= tcprexmtthresh &&
1772 tcp_newreno(tp, th) == 0) {
1773 tp->snd_cwnd = tp->snd_ssthresh;
1774 /*
1775 * Window inflation should have left us with approx.
1776 * snd_ssthresh outstanding data. But in case we
1777 * would be inclined to send a burst, better to do
1778 * it via the slow start mechanism.
1779 */
1780 if (SEQ_SUB(tp->snd_max, th->th_ack) < tp->snd_ssthresh)
1781 tp->snd_cwnd = SEQ_SUB(tp->snd_max, th->th_ack)
1782 + tp->t_segsz;
1783 tp->t_dupacks = 0;
1784 }
1785 if (SEQ_GT(th->th_ack, tp->snd_max)) {
1786 tcpstat.tcps_rcvacktoomuch++;
1787 goto dropafterack;
1788 }
1789 acked = th->th_ack - tp->snd_una;
1790 tcpstat.tcps_rcvackpack++;
1791 tcpstat.tcps_rcvackbyte += acked;
1792
1793 /*
1794 * If we have a timestamp reply, update smoothed
1795 * round trip time. If no timestamp is present but
1796 * transmit timer is running and timed sequence
1797 * number was acked, update smoothed round trip time.
1798 * Since we now have an rtt measurement, cancel the
1799 * timer backoff (cf., Phil Karn's retransmit alg.).
1800 * Recompute the initial retransmit timer.
1801 */
1802 if (opti.ts_present && opti.ts_ecr)
1803 tcp_xmit_timer(tp, tcp_now - opti.ts_ecr + 1);
1804 else if (tp->t_rtt && SEQ_GT(th->th_ack, tp->t_rtseq))
1805 tcp_xmit_timer(tp,tp->t_rtt);
1806
1807 /*
1808 * If all outstanding data is acked, stop retransmit
1809 * timer and remember to restart (more output or persist).
1810 * If there is more data to be acked, restart retransmit
1811 * timer, using current (possibly backed-off) value.
1812 */
1813 if (th->th_ack == tp->snd_max) {
1814 TCP_TIMER_DISARM(tp, TCPT_REXMT);
1815 needoutput = 1;
1816 } else if (TCP_TIMER_ISARMED(tp, TCPT_PERSIST) == 0)
1817 TCP_TIMER_ARM(tp, TCPT_REXMT, tp->t_rxtcur);
1818 /*
1819 * When new data is acked, open the congestion window.
1820 * If the window gives us less than ssthresh packets
1821 * in flight, open exponentially (segsz per packet).
1822 * Otherwise open linearly: segsz per window
1823 * (segsz^2 / cwnd per packet), plus a constant
1824 * fraction of a packet (segsz/8) to help larger windows
1825 * open quickly enough.
1826 */
1827 {
1828 u_int cw = tp->snd_cwnd;
1829 u_int incr = tp->t_segsz;
1830
1831 if (cw > tp->snd_ssthresh)
1832 incr = incr * incr / cw;
1833 if (tcp_do_newreno == 0 || SEQ_GEQ(th->th_ack, tp->snd_recover))
1834 tp->snd_cwnd = min(cw + incr,
1835 TCP_MAXWIN << tp->snd_scale);
1836 }
1837 ND6_HINT(tp);
1838 if (acked > so->so_snd.sb_cc) {
1839 tp->snd_wnd -= so->so_snd.sb_cc;
1840 sbdrop(&so->so_snd, (int)so->so_snd.sb_cc);
1841 ourfinisacked = 1;
1842 } else {
1843 sbdrop(&so->so_snd, acked);
1844 tp->snd_wnd -= acked;
1845 ourfinisacked = 0;
1846 }
1847 sowwakeup(so);
1848 /*
1849 * We want snd_recover to track snd_una to
1850 * avoid sequence wraparound problems for
1851 * very large transfers.
1852 */
1853 tp->snd_una = tp->snd_recover = th->th_ack;
1854 if (SEQ_LT(tp->snd_nxt, tp->snd_una))
1855 tp->snd_nxt = tp->snd_una;
1856
1857 switch (tp->t_state) {
1858
1859 /*
1860 * In FIN_WAIT_1 STATE in addition to the processing
1861 * for the ESTABLISHED state if our FIN is now acknowledged
1862 * then enter FIN_WAIT_2.
1863 */
1864 case TCPS_FIN_WAIT_1:
1865 if (ourfinisacked) {
1866 /*
1867 * If we can't receive any more
1868 * data, then closing user can proceed.
1869 * Starting the timer is contrary to the
1870 * specification, but if we don't get a FIN
1871 * we'll hang forever.
1872 */
1873 if (so->so_state & SS_CANTRCVMORE) {
1874 soisdisconnected(so);
1875 if (tcp_maxidle > 0)
1876 TCP_TIMER_ARM(tp, TCPT_2MSL,
1877 tcp_maxidle);
1878 }
1879 tp->t_state = TCPS_FIN_WAIT_2;
1880 }
1881 break;
1882
1883 /*
1884 * In CLOSING STATE in addition to the processing for
1885 * the ESTABLISHED state if the ACK acknowledges our FIN
1886 * then enter the TIME-WAIT state, otherwise ignore
1887 * the segment.
1888 */
1889 case TCPS_CLOSING:
1890 if (ourfinisacked) {
1891 tp->t_state = TCPS_TIME_WAIT;
1892 tcp_canceltimers(tp);
1893 TCP_TIMER_ARM(tp, TCPT_2MSL, 2 * TCPTV_MSL);
1894 soisdisconnected(so);
1895 }
1896 break;
1897
1898 /*
1899 * In LAST_ACK, we may still be waiting for data to drain
1900 * and/or to be acked, as well as for the ack of our FIN.
1901 * If our FIN is now acknowledged, delete the TCB,
1902 * enter the closed state and return.
1903 */
1904 case TCPS_LAST_ACK:
1905 if (ourfinisacked) {
1906 tp = tcp_close(tp);
1907 goto drop;
1908 }
1909 break;
1910
1911 /*
1912 * In TIME_WAIT state the only thing that should arrive
1913 * is a retransmission of the remote FIN. Acknowledge
1914 * it and restart the finack timer.
1915 */
1916 case TCPS_TIME_WAIT:
1917 TCP_TIMER_ARM(tp, TCPT_2MSL, 2 * TCPTV_MSL);
1918 goto dropafterack;
1919 }
1920 }
1921
1922 step6:
1923 /*
1924 * Update window information.
1925 * Don't look at window if no ACK: TAC's send garbage on first SYN.
1926 */
1927 if ((tiflags & TH_ACK) && (SEQ_LT(tp->snd_wl1, th->th_seq) ||
1928 (tp->snd_wl1 == th->th_seq && SEQ_LT(tp->snd_wl2, th->th_ack)) ||
1929 (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))) {
1930 /* keep track of pure window updates */
1931 if (tlen == 0 &&
1932 tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd)
1933 tcpstat.tcps_rcvwinupd++;
1934 tp->snd_wnd = tiwin;
1935 tp->snd_wl1 = th->th_seq;
1936 tp->snd_wl2 = th->th_ack;
1937 if (tp->snd_wnd > tp->max_sndwnd)
1938 tp->max_sndwnd = tp->snd_wnd;
1939 needoutput = 1;
1940 }
1941
1942 /*
1943 * Process segments with URG.
1944 */
1945 if ((tiflags & TH_URG) && th->th_urp &&
1946 TCPS_HAVERCVDFIN(tp->t_state) == 0) {
1947 /*
1948 * This is a kludge, but if we receive and accept
1949 * random urgent pointers, we'll crash in
1950 * soreceive. It's hard to imagine someone
1951 * actually wanting to send this much urgent data.
1952 */
1953 if (th->th_urp + so->so_rcv.sb_cc > sb_max) {
1954 th->th_urp = 0; /* XXX */
1955 tiflags &= ~TH_URG; /* XXX */
1956 goto dodata; /* XXX */
1957 }
1958 /*
1959 * If this segment advances the known urgent pointer,
1960 * then mark the data stream. This should not happen
1961 * in CLOSE_WAIT, CLOSING, LAST_ACK or TIME_WAIT STATES since
1962 * a FIN has been received from the remote side.
1963 * In these states we ignore the URG.
1964 *
1965 * According to RFC961 (Assigned Protocols),
1966 * the urgent pointer points to the last octet
1967 * of urgent data. We continue, however,
1968 * to consider it to indicate the first octet
1969 * of data past the urgent section as the original
1970 * spec states (in one of two places).
1971 */
1972 if (SEQ_GT(th->th_seq+th->th_urp, tp->rcv_up)) {
1973 tp->rcv_up = th->th_seq + th->th_urp;
1974 so->so_oobmark = so->so_rcv.sb_cc +
1975 (tp->rcv_up - tp->rcv_nxt) - 1;
1976 if (so->so_oobmark == 0)
1977 so->so_state |= SS_RCVATMARK;
1978 sohasoutofband(so);
1979 tp->t_oobflags &= ~(TCPOOB_HAVEDATA | TCPOOB_HADDATA);
1980 }
1981 /*
1982 * Remove out of band data so it doesn't get presented to user.
1983 * This can happen independent of advancing the URG pointer,
1984 * but if two URG's are pending at once, some out-of-band
1985 * data may creep in... ick.
1986 */
1987 if (th->th_urp <= (u_int16_t) tlen
1988 #ifdef SO_OOBINLINE
1989 && (so->so_options & SO_OOBINLINE) == 0
1990 #endif
1991 )
1992 tcp_pulloutofband(so, th, m, hdroptlen);
1993 } else
1994 /*
1995 * If no out of band data is expected,
1996 * pull receive urgent pointer along
1997 * with the receive window.
1998 */
1999 if (SEQ_GT(tp->rcv_nxt, tp->rcv_up))
2000 tp->rcv_up = tp->rcv_nxt;
2001 dodata: /* XXX */
2002
2003 /*
2004 * Process the segment text, merging it into the TCP sequencing queue,
2005 * and arranging for acknowledgement of receipt if necessary.
2006 * This process logically involves adjusting tp->rcv_wnd as data
2007 * is presented to the user (this happens in tcp_usrreq.c,
2008 * case PRU_RCVD). If a FIN has already been received on this
2009 * connection then we just ignore the text.
2010 */
2011 if ((tlen || (tiflags & TH_FIN)) &&
2012 TCPS_HAVERCVDFIN(tp->t_state) == 0) {
2013 /*
2014 * Insert segment ti into reassembly queue of tcp with
2015 * control block tp. Return TH_FIN if reassembly now includes
2016 * a segment with FIN. The macro form does the common case
2017 * inline (segment is the next to be received on an
2018 * established connection, and the queue is empty),
2019 * avoiding linkage into and removal from the queue and
2020 * repetition of various conversions.
2021 * Set DELACK for segments received in order, but ack
2022 * immediately when segments are out of order
2023 * (so fast retransmit can work).
2024 */
2025 /* NOTE: this was TCP_REASS() macro, but used only once */
2026 TCP_REASS_LOCK(tp);
2027 if (th->th_seq == tp->rcv_nxt &&
2028 tp->segq.lh_first == NULL &&
2029 tp->t_state == TCPS_ESTABLISHED) {
2030 TCP_SETUP_ACK(tp, th);
2031 tp->rcv_nxt += tlen;
2032 tiflags = th->th_flags & TH_FIN;
2033 tcpstat.tcps_rcvpack++;
2034 tcpstat.tcps_rcvbyte += tlen;
2035 ND6_HINT(tp);
2036 m_adj(m, hdroptlen);
2037 sbappend(&(so)->so_rcv, m);
2038 sorwakeup(so);
2039 } else {
2040 m_adj(m, hdroptlen);
2041 tiflags = tcp_reass(tp, th, m, &tlen);
2042 tp->t_flags |= TF_ACKNOW;
2043 }
2044 TCP_REASS_UNLOCK(tp);
2045
2046 /*
2047 * Note the amount of data that peer has sent into
2048 * our window, in order to estimate the sender's
2049 * buffer size.
2050 */
2051 len = so->so_rcv.sb_hiwat - (tp->rcv_adv - tp->rcv_nxt);
2052 } else {
2053 m_freem(m);
2054 m = NULL;
2055 tiflags &= ~TH_FIN;
2056 }
2057
2058 /*
2059 * If FIN is received ACK the FIN and let the user know
2060 * that the connection is closing. Ignore a FIN received before
2061 * the connection is fully established.
2062 */
2063 if ((tiflags & TH_FIN) && TCPS_HAVEESTABLISHED(tp->t_state)) {
2064 if (TCPS_HAVERCVDFIN(tp->t_state) == 0) {
2065 socantrcvmore(so);
2066 tp->t_flags |= TF_ACKNOW;
2067 tp->rcv_nxt++;
2068 }
2069 switch (tp->t_state) {
2070
2071 /*
2072 * In ESTABLISHED STATE enter the CLOSE_WAIT state.
2073 */
2074 case TCPS_ESTABLISHED:
2075 tp->t_state = TCPS_CLOSE_WAIT;
2076 break;
2077
2078 /*
2079 * If still in FIN_WAIT_1 STATE FIN has not been acked so
2080 * enter the CLOSING state.
2081 */
2082 case TCPS_FIN_WAIT_1:
2083 tp->t_state = TCPS_CLOSING;
2084 break;
2085
2086 /*
2087 * In FIN_WAIT_2 state enter the TIME_WAIT state,
2088 * starting the time-wait timer, turning off the other
2089 * standard timers.
2090 */
2091 case TCPS_FIN_WAIT_2:
2092 tp->t_state = TCPS_TIME_WAIT;
2093 tcp_canceltimers(tp);
2094 TCP_TIMER_ARM(tp, TCPT_2MSL, 2 * TCPTV_MSL);
2095 soisdisconnected(so);
2096 break;
2097
2098 /*
2099 * In TIME_WAIT state restart the 2 MSL time_wait timer.
2100 */
2101 case TCPS_TIME_WAIT:
2102 TCP_TIMER_ARM(tp, TCPT_2MSL, 2 * TCPTV_MSL);
2103 break;
2104 }
2105 }
2106 if (so->so_options & SO_DEBUG) {
2107 tcp_trace(TA_INPUT, ostate, tp, tcp_saveti, 0);
2108 }
2109
2110 /*
2111 * Return any desired output.
2112 */
2113 if (needoutput || (tp->t_flags & TF_ACKNOW))
2114 (void) tcp_output(tp);
2115 if (tcp_saveti)
2116 m_freem(tcp_saveti);
2117 return;
2118
2119 badsyn:
2120 /*
2121 * Received a bad SYN. Increment counters and dropwithreset.
2122 */
2123 tcpstat.tcps_badsyn++;
2124 tp = NULL;
2125 goto dropwithreset;
2126
2127 dropafterack:
2128 /*
2129 * Generate an ACK dropping incoming segment if it occupies
2130 * sequence space, where the ACK reflects our state.
2131 */
2132 if (tiflags & TH_RST)
2133 goto drop;
2134 m_freem(m);
2135 tp->t_flags |= TF_ACKNOW;
2136 (void) tcp_output(tp);
2137 if (tcp_saveti)
2138 m_freem(tcp_saveti);
2139 return;
2140
2141 dropwithreset_ratelim:
2142 /*
2143 * We may want to rate-limit RSTs in certain situations,
2144 * particularly if we are sending an RST in response to
2145 * an attempt to connect to or otherwise communicate with
2146 * a port for which we have no socket.
2147 */
2148 if (ppsratecheck(&tcp_rst_ppslim_last, &tcp_rst_ppslim_count,
2149 tcp_rst_ppslim) == 0) {
2150 /* XXX stat */
2151 goto drop;
2152 }
2153 if (ratecheck(&tcp_rst_ratelim_last, &tcp_rst_ratelim) == 0) {
2154 /* XXX stat */
2155 goto drop;
2156 }
2157 /* ...fall into dropwithreset... */
2158
2159 dropwithreset:
2160 /*
2161 * Generate a RST, dropping incoming segment.
2162 * Make ACK acceptable to originator of segment.
2163 */
2164 if (tiflags & TH_RST)
2165 goto drop;
2166 {
2167 /*
2168 * need to recover version # field, which was overwritten on
2169 * ip_cksum computation.
2170 */
2171 struct ip *sip;
2172 sip = mtod(m, struct ip *);
2173 switch (af) {
2174 case AF_INET:
2175 sip->ip_v = 4;
2176 break;
2177 #ifdef INET6
2178 case AF_INET6:
2179 sip->ip_v = 6;
2180 break;
2181 #endif
2182 }
2183 }
2184 if (tiflags & TH_ACK)
2185 (void)tcp_respond(tp, m, m, th, (tcp_seq)0, th->th_ack, TH_RST);
2186 else {
2187 if (tiflags & TH_SYN)
2188 tlen++;
2189 (void)tcp_respond(tp, m, m, th, th->th_seq + tlen, (tcp_seq)0,
2190 TH_RST|TH_ACK);
2191 }
2192 if (tcp_saveti)
2193 m_freem(tcp_saveti);
2194 return;
2195
2196 drop:
2197 /*
2198 * Drop space held by incoming segment and return.
2199 */
2200 if (tp) {
2201 if (tp->t_inpcb)
2202 so = tp->t_inpcb->inp_socket;
2203 #ifdef INET6
2204 else if (tp->t_in6pcb)
2205 so = tp->t_in6pcb->in6p_socket;
2206 #endif
2207 else
2208 so = NULL;
2209 if (so && (so->so_options & SO_DEBUG) != 0)
2210 tcp_trace(TA_DROP, ostate, tp, tcp_saveti, 0);
2211 }
2212 if (tcp_saveti)
2213 m_freem(tcp_saveti);
2214 m_freem(m);
2215 return;
2216 }
2217
2218 void
2219 tcp_dooptions(tp, cp, cnt, th, oi)
2220 struct tcpcb *tp;
2221 u_char *cp;
2222 int cnt;
2223 struct tcphdr *th;
2224 struct tcp_opt_info *oi;
2225 {
2226 u_int16_t mss;
2227 int opt, optlen;
2228
2229 for (; cnt > 0; cnt -= optlen, cp += optlen) {
2230 opt = cp[0];
2231 if (opt == TCPOPT_EOL)
2232 break;
2233 if (opt == TCPOPT_NOP)
2234 optlen = 1;
2235 else {
2236 if (cnt < 2)
2237 break;
2238 optlen = cp[1];
2239 if (optlen < 2 || optlen > cnt)
2240 break;
2241 }
2242 switch (opt) {
2243
2244 default:
2245 continue;
2246
2247 case TCPOPT_MAXSEG:
2248 if (optlen != TCPOLEN_MAXSEG)
2249 continue;
2250 if (!(th->th_flags & TH_SYN))
2251 continue;
2252 bcopy(cp + 2, &mss, sizeof(mss));
2253 oi->maxseg = ntohs(mss);
2254 break;
2255
2256 case TCPOPT_WINDOW:
2257 if (optlen != TCPOLEN_WINDOW)
2258 continue;
2259 if (!(th->th_flags & TH_SYN))
2260 continue;
2261 tp->t_flags |= TF_RCVD_SCALE;
2262 tp->requested_s_scale = cp[2];
2263 if (tp->requested_s_scale > TCP_MAX_WINSHIFT) {
2264 #if 0 /*XXX*/
2265 char *p;
2266
2267 if (ip)
2268 p = ntohl(ip->ip_src);
2269 #ifdef INET6
2270 else if (ip6)
2271 p = ip6_sprintf(&ip6->ip6_src);
2272 #endif
2273 else
2274 p = "(unknown)";
2275 log(LOG_ERR, "TCP: invalid wscale %d from %s, "
2276 "assuming %d\n",
2277 tp->requested_s_scale, p,
2278 TCP_MAX_WINSHIFT);
2279 #else
2280 log(LOG_ERR, "TCP: invalid wscale %d, "
2281 "assuming %d\n",
2282 tp->requested_s_scale,
2283 TCP_MAX_WINSHIFT);
2284 #endif
2285 tp->requested_s_scale = TCP_MAX_WINSHIFT;
2286 }
2287 break;
2288
2289 case TCPOPT_TIMESTAMP:
2290 if (optlen != TCPOLEN_TIMESTAMP)
2291 continue;
2292 oi->ts_present = 1;
2293 bcopy(cp + 2, &oi->ts_val, sizeof(oi->ts_val));
2294 NTOHL(oi->ts_val);
2295 bcopy(cp + 6, &oi->ts_ecr, sizeof(oi->ts_ecr));
2296 NTOHL(oi->ts_ecr);
2297
2298 /*
2299 * A timestamp received in a SYN makes
2300 * it ok to send timestamp requests and replies.
2301 */
2302 if (th->th_flags & TH_SYN) {
2303 tp->t_flags |= TF_RCVD_TSTMP;
2304 tp->ts_recent = oi->ts_val;
2305 tp->ts_recent_age = tcp_now;
2306 }
2307 break;
2308 case TCPOPT_SACK_PERMITTED:
2309 if (optlen != TCPOLEN_SACK_PERMITTED)
2310 continue;
2311 if (!(th->th_flags & TH_SYN))
2312 continue;
2313 tp->t_flags &= ~TF_CANT_TXSACK;
2314 break;
2315
2316 case TCPOPT_SACK:
2317 if (tp->t_flags & TF_IGNR_RXSACK)
2318 continue;
2319 if (optlen % 8 != 2 || optlen < 10)
2320 continue;
2321 cp += 2;
2322 optlen -= 2;
2323 for (; optlen > 0; cp -= 8, optlen -= 8) {
2324 tcp_seq lwe, rwe;
2325 bcopy((char *)cp, (char *) &lwe, sizeof(lwe));
2326 NTOHL(lwe);
2327 bcopy((char *)cp, (char *) &rwe, sizeof(rwe));
2328 NTOHL(rwe);
2329 /* tcp_mark_sacked(tp, lwe, rwe); */
2330 }
2331 break;
2332 }
2333 }
2334 }
2335
2336 /*
2337 * Pull out of band byte out of a segment so
2338 * it doesn't appear in the user's data queue.
2339 * It is still reflected in the segment length for
2340 * sequencing purposes.
2341 */
2342 void
2343 tcp_pulloutofband(so, th, m, off)
2344 struct socket *so;
2345 struct tcphdr *th;
2346 struct mbuf *m;
2347 int off;
2348 {
2349 int cnt = off + th->th_urp - 1;
2350
2351 while (cnt >= 0) {
2352 if (m->m_len > cnt) {
2353 char *cp = mtod(m, caddr_t) + cnt;
2354 struct tcpcb *tp = sototcpcb(so);
2355
2356 tp->t_iobc = *cp;
2357 tp->t_oobflags |= TCPOOB_HAVEDATA;
2358 bcopy(cp+1, cp, (unsigned)(m->m_len - cnt - 1));
2359 m->m_len--;
2360 return;
2361 }
2362 cnt -= m->m_len;
2363 m = m->m_next;
2364 if (m == 0)
2365 break;
2366 }
2367 panic("tcp_pulloutofband");
2368 }
2369
2370 /*
2371 * Collect new round-trip time estimate
2372 * and update averages and current timeout.
2373 */
2374 void
2375 tcp_xmit_timer(tp, rtt)
2376 struct tcpcb *tp;
2377 short rtt;
2378 {
2379 short delta;
2380 short rttmin;
2381
2382 tcpstat.tcps_rttupdated++;
2383 --rtt;
2384 if (tp->t_srtt != 0) {
2385 /*
2386 * srtt is stored as fixed point with 3 bits after the
2387 * binary point (i.e., scaled by 8). The following magic
2388 * is equivalent to the smoothing algorithm in rfc793 with
2389 * an alpha of .875 (srtt = rtt/8 + srtt*7/8 in fixed
2390 * point). Adjust rtt to origin 0.
2391 */
2392 delta = (rtt << 2) - (tp->t_srtt >> TCP_RTT_SHIFT);
2393 if ((tp->t_srtt += delta) <= 0)
2394 tp->t_srtt = 1 << 2;
2395 /*
2396 * We accumulate a smoothed rtt variance (actually, a
2397 * smoothed mean difference), then set the retransmit
2398 * timer to smoothed rtt + 4 times the smoothed variance.
2399 * rttvar is stored as fixed point with 2 bits after the
2400 * binary point (scaled by 4). The following is
2401 * equivalent to rfc793 smoothing with an alpha of .75
2402 * (rttvar = rttvar*3/4 + |delta| / 4). This replaces
2403 * rfc793's wired-in beta.
2404 */
2405 if (delta < 0)
2406 delta = -delta;
2407 delta -= (tp->t_rttvar >> TCP_RTTVAR_SHIFT);
2408 if ((tp->t_rttvar += delta) <= 0)
2409 tp->t_rttvar = 1 << 2;
2410 } else {
2411 /*
2412 * No rtt measurement yet - use the unsmoothed rtt.
2413 * Set the variance to half the rtt (so our first
2414 * retransmit happens at 3*rtt).
2415 */
2416 tp->t_srtt = rtt << (TCP_RTT_SHIFT + 2);
2417 tp->t_rttvar = rtt << (TCP_RTTVAR_SHIFT + 2 - 1);
2418 }
2419 tp->t_rtt = 0;
2420 tp->t_rxtshift = 0;
2421
2422 /*
2423 * the retransmit should happen at rtt + 4 * rttvar.
2424 * Because of the way we do the smoothing, srtt and rttvar
2425 * will each average +1/2 tick of bias. When we compute
2426 * the retransmit timer, we want 1/2 tick of rounding and
2427 * 1 extra tick because of +-1/2 tick uncertainty in the
2428 * firing of the timer. The bias will give us exactly the
2429 * 1.5 tick we need. But, because the bias is
2430 * statistical, we have to test that we don't drop below
2431 * the minimum feasible timer (which is 2 ticks).
2432 */
2433 if (tp->t_rttmin > rtt + 2)
2434 rttmin = tp->t_rttmin;
2435 else
2436 rttmin = rtt + 2;
2437 TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp), rttmin, TCPTV_REXMTMAX);
2438
2439 /*
2440 * We received an ack for a packet that wasn't retransmitted;
2441 * it is probably safe to discard any error indications we've
2442 * received recently. This isn't quite right, but close enough
2443 * for now (a route might have failed after we sent a segment,
2444 * and the return path might not be symmetrical).
2445 */
2446 tp->t_softerror = 0;
2447 }
2448
2449 /*
2450 * Checks for partial ack. If partial ack arrives, force the retransmission
2451 * of the next unacknowledged segment, do not clear tp->t_dupacks, and return
2452 * 1. By setting snd_nxt to th_ack, this forces retransmission timer to
2453 * be started again. If the ack advances at least to tp->snd_recover, return 0.
2454 */
2455 int
2456 tcp_newreno(tp, th)
2457 struct tcpcb *tp;
2458 struct tcphdr *th;
2459 {
2460 tcp_seq onxt = tp->snd_nxt;
2461 u_long ocwnd = tp->snd_cwnd;
2462
2463 if (SEQ_LT(th->th_ack, tp->snd_recover)) {
2464 /*
2465 * snd_una has not yet been updated and the socket's send
2466 * buffer has not yet drained off the ACK'd data, so we
2467 * have to leave snd_una as it was to get the correct data
2468 * offset in tcp_output().
2469 */
2470 TCP_TIMER_DISARM(tp, TCPT_REXMT);
2471 tp->t_rtt = 0;
2472 tp->snd_nxt = th->th_ack;
2473 /*
2474 * Set snd_cwnd to one segment beyond ACK'd offset. snd_una
2475 * is not yet updated when we're called.
2476 */
2477 tp->snd_cwnd = tp->t_segsz + (th->th_ack - tp->snd_una);
2478 (void) tcp_output(tp);
2479 tp->snd_cwnd = ocwnd;
2480 if (SEQ_GT(onxt, tp->snd_nxt))
2481 tp->snd_nxt = onxt;
2482 /*
2483 * Partial window deflation. Relies on fact that tp->snd_una
2484 * not updated yet.
2485 */
2486 tp->snd_cwnd -= (th->th_ack - tp->snd_una - tp->t_segsz);
2487 return 1;
2488 }
2489 return 0;
2490 }
2491
2492
2493 /*
2494 * TCP compressed state engine. Currently used to hold compressed
2495 * state for SYN_RECEIVED.
2496 */
2497
2498 u_long syn_cache_count;
2499 u_int32_t syn_hash1, syn_hash2;
2500
2501 #define SYN_HASH(sa, sp, dp) \
2502 ((((sa)->s_addr^syn_hash1)*(((((u_int32_t)(dp))<<16) + \
2503 ((u_int32_t)(sp)))^syn_hash2)))
2504 #ifndef INET6
2505 #define SYN_HASHALL(hash, src, dst) \
2506 do { \
2507 hash = SYN_HASH(&((struct sockaddr_in *)(src))->sin_addr, \
2508 ((struct sockaddr_in *)(src))->sin_port, \
2509 ((struct sockaddr_in *)(dst))->sin_port); \
2510 } while (0)
2511 #else
2512 #define SYN_HASH6(sa, sp, dp) \
2513 ((((sa)->s6_addr32[0] ^ (sa)->s6_addr32[3] ^ syn_hash1) * \
2514 (((((u_int32_t)(dp))<<16) + ((u_int32_t)(sp)))^syn_hash2)) \
2515 & 0x7fffffff)
2516
2517 #define SYN_HASHALL(hash, src, dst) \
2518 do { \
2519 switch ((src)->sa_family) { \
2520 case AF_INET: \
2521 hash = SYN_HASH(&((struct sockaddr_in *)(src))->sin_addr, \
2522 ((struct sockaddr_in *)(src))->sin_port, \
2523 ((struct sockaddr_in *)(dst))->sin_port); \
2524 break; \
2525 case AF_INET6: \
2526 hash = SYN_HASH6(&((struct sockaddr_in6 *)(src))->sin6_addr, \
2527 ((struct sockaddr_in6 *)(src))->sin6_port, \
2528 ((struct sockaddr_in6 *)(dst))->sin6_port); \
2529 break; \
2530 default: \
2531 hash = 0; \
2532 } \
2533 } while (0)
2534 #endif /* INET6 */
2535
2536 #define SYN_CACHE_RM(sc) \
2537 do { \
2538 LIST_REMOVE((sc), sc_bucketq); \
2539 (sc)->sc_tp = NULL; \
2540 LIST_REMOVE((sc), sc_tpq); \
2541 tcp_syn_cache[(sc)->sc_bucketidx].sch_length--; \
2542 TAILQ_REMOVE(&tcp_syn_cache_timeq[(sc)->sc_rxtshift], (sc), sc_timeq); \
2543 syn_cache_count--; \
2544 } while (0)
2545
2546 #define SYN_CACHE_PUT(sc) \
2547 do { \
2548 if ((sc)->sc_ipopts) \
2549 (void) m_free((sc)->sc_ipopts); \
2550 if ((sc)->sc_route4.ro_rt != NULL) \
2551 RTFREE((sc)->sc_route4.ro_rt); \
2552 pool_put(&syn_cache_pool, (sc)); \
2553 } while (0)
2554
2555 struct pool syn_cache_pool;
2556
2557 /*
2558 * We don't estimate RTT with SYNs, so each packet starts with the default
2559 * RTT and each timer queue has a fixed timeout value. This allows us to
2560 * optimize the timer queues somewhat.
2561 */
2562 #define SYN_CACHE_TIMER_ARM(sc) \
2563 do { \
2564 TCPT_RANGESET((sc)->sc_rxtcur, \
2565 TCPTV_SRTTDFLT * tcp_backoff[(sc)->sc_rxtshift], TCPTV_MIN, \
2566 TCPTV_REXMTMAX); \
2567 PRT_SLOW_ARM((sc)->sc_rexmt, (sc)->sc_rxtcur); \
2568 } while (0)
2569
2570 TAILQ_HEAD(, syn_cache) tcp_syn_cache_timeq[TCP_MAXRXTSHIFT + 1];
2571
2572 void
2573 syn_cache_init()
2574 {
2575 int i;
2576
2577 /* Initialize the hash buckets. */
2578 for (i = 0; i < tcp_syn_cache_size; i++)
2579 LIST_INIT(&tcp_syn_cache[i].sch_bucket);
2580
2581 /* Initialize the timer queues. */
2582 for (i = 0; i <= TCP_MAXRXTSHIFT; i++)
2583 TAILQ_INIT(&tcp_syn_cache_timeq[i]);
2584
2585 /* Initialize the syn cache pool. */
2586 pool_init(&syn_cache_pool, sizeof(struct syn_cache), 0, 0, 0,
2587 "synpl", 0, NULL, NULL, M_PCB);
2588 }
2589
2590 void
2591 syn_cache_insert(sc, tp)
2592 struct syn_cache *sc;
2593 struct tcpcb *tp;
2594 {
2595 struct syn_cache_head *scp;
2596 struct syn_cache *sc2;
2597 int s, i;
2598
2599 /*
2600 * If there are no entries in the hash table, reinitialize
2601 * the hash secrets.
2602 */
2603 if (syn_cache_count == 0) {
2604 struct timeval tv;
2605 microtime(&tv);
2606 syn_hash1 = random() ^ (u_long)≻
2607 syn_hash2 = random() ^ tv.tv_usec;
2608 }
2609
2610 SYN_HASHALL(sc->sc_hash, &sc->sc_src.sa, &sc->sc_dst.sa);
2611 sc->sc_bucketidx = sc->sc_hash % tcp_syn_cache_size;
2612 scp = &tcp_syn_cache[sc->sc_bucketidx];
2613
2614 /*
2615 * Make sure that we don't overflow the per-bucket
2616 * limit or the total cache size limit.
2617 */
2618 s = splsoftnet();
2619 if (scp->sch_length >= tcp_syn_bucket_limit) {
2620 tcpstat.tcps_sc_bucketoverflow++;
2621 /*
2622 * The bucket is full. Toss the oldest element in the
2623 * bucket. This will be the entry with our bucket
2624 * index closest to the front of the timer queue with
2625 * the largest timeout value.
2626 *
2627 * Note: This timer queue traversal may be expensive, so
2628 * we hope that this doesn't happen very often. It is
2629 * much more likely that we'll overflow the entire
2630 * cache, which is much easier to handle; see below.
2631 */
2632 for (i = TCP_MAXRXTSHIFT; i >= 0; i--) {
2633 for (sc2 = TAILQ_FIRST(&tcp_syn_cache_timeq[i]);
2634 sc2 != NULL;
2635 sc2 = TAILQ_NEXT(sc2, sc_timeq)) {
2636 if (sc2->sc_bucketidx == sc->sc_bucketidx) {
2637 SYN_CACHE_RM(sc2);
2638 SYN_CACHE_PUT(sc2);
2639 goto insert; /* 2 level break */
2640 }
2641 }
2642 }
2643 #ifdef DIAGNOSTIC
2644 /*
2645 * This should never happen; we should always find an
2646 * entry in our bucket.
2647 */
2648 panic("syn_cache_insert: bucketoverflow: impossible");
2649 #endif
2650 } else if (syn_cache_count >= tcp_syn_cache_limit) {
2651 tcpstat.tcps_sc_overflowed++;
2652 /*
2653 * The cache is full. Toss the oldest entry in the
2654 * entire cache. This is the front entry in the
2655 * first non-empty timer queue with the largest
2656 * timeout value.
2657 */
2658 for (i = TCP_MAXRXTSHIFT; i >= 0; i--) {
2659 sc2 = TAILQ_FIRST(&tcp_syn_cache_timeq[i]);
2660 if (sc2 == NULL)
2661 continue;
2662 SYN_CACHE_RM(sc2);
2663 SYN_CACHE_PUT(sc2);
2664 goto insert; /* symmetry with above */
2665 }
2666 #ifdef DIAGNOSTIC
2667 /*
2668 * This should never happen; we should always find an
2669 * entry in the cache.
2670 */
2671 panic("syn_cache_insert: cache overflow: impossible");
2672 #endif
2673 }
2674
2675 insert:
2676 /*
2677 * Initialize the entry's timer.
2678 */
2679 sc->sc_rxttot = 0;
2680 sc->sc_rxtshift = 0;
2681 SYN_CACHE_TIMER_ARM(sc);
2682 TAILQ_INSERT_TAIL(&tcp_syn_cache_timeq[sc->sc_rxtshift], sc, sc_timeq);
2683
2684 /* Link it from tcpcb entry */
2685 LIST_INSERT_HEAD(&tp->t_sc, sc, sc_tpq);
2686
2687 /* Put it into the bucket. */
2688 LIST_INSERT_HEAD(&scp->sch_bucket, sc, sc_bucketq);
2689 scp->sch_length++;
2690 syn_cache_count++;
2691
2692 tcpstat.tcps_sc_added++;
2693 splx(s);
2694 }
2695
2696 /*
2697 * Walk the timer queues, looking for SYN,ACKs that need to be retransmitted.
2698 * If we have retransmitted an entry the maximum number of times, expire
2699 * that entry.
2700 */
2701 void
2702 syn_cache_timer()
2703 {
2704 struct syn_cache *sc, *nsc;
2705 int i, s;
2706
2707 s = splsoftnet();
2708
2709 /*
2710 * First, get all the entries that need to be retransmitted, or
2711 * must be expired due to exceeding the initial keepalive time.
2712 */
2713 for (i = 0; i < TCP_MAXRXTSHIFT; i++) {
2714 for (sc = TAILQ_FIRST(&tcp_syn_cache_timeq[i]);
2715 sc != NULL && PRT_SLOW_ISEXPIRED(sc->sc_rexmt);
2716 sc = nsc) {
2717 nsc = TAILQ_NEXT(sc, sc_timeq);
2718
2719 /*
2720 * Compute the total amount of time this entry has
2721 * been on a queue. If this entry has been on longer
2722 * than the keep alive timer would allow, expire it.
2723 */
2724 sc->sc_rxttot += sc->sc_rxtcur;
2725 if (sc->sc_rxttot >= TCPTV_KEEP_INIT) {
2726 tcpstat.tcps_sc_timed_out++;
2727 SYN_CACHE_RM(sc);
2728 SYN_CACHE_PUT(sc);
2729 continue;
2730 }
2731
2732 tcpstat.tcps_sc_retransmitted++;
2733 (void) syn_cache_respond(sc, NULL);
2734
2735 /* Advance this entry onto the next timer queue. */
2736 TAILQ_REMOVE(&tcp_syn_cache_timeq[i], sc, sc_timeq);
2737 sc->sc_rxtshift = i + 1;
2738 SYN_CACHE_TIMER_ARM(sc);
2739 TAILQ_INSERT_TAIL(&tcp_syn_cache_timeq[sc->sc_rxtshift],
2740 sc, sc_timeq);
2741 }
2742 }
2743
2744 /*
2745 * Now get all the entries that are expired due to too many
2746 * retransmissions.
2747 */
2748 for (sc = TAILQ_FIRST(&tcp_syn_cache_timeq[TCP_MAXRXTSHIFT]);
2749 sc != NULL && PRT_SLOW_ISEXPIRED(sc->sc_rexmt);
2750 sc = nsc) {
2751 nsc = TAILQ_NEXT(sc, sc_timeq);
2752 tcpstat.tcps_sc_timed_out++;
2753 SYN_CACHE_RM(sc);
2754 SYN_CACHE_PUT(sc);
2755 }
2756 splx(s);
2757 }
2758
2759 /*
2760 * Remove syn cache created by the specified tcb entry,
2761 * because this does not make sense to keep them
2762 * (if there's no tcb entry, syn cache entry will never be used)
2763 */
2764 void
2765 syn_cache_cleanup(tp)
2766 struct tcpcb *tp;
2767 {
2768 struct syn_cache *sc, *nsc;
2769 int s;
2770
2771 s = splsoftnet();
2772
2773 for (sc = LIST_FIRST(&tp->t_sc); sc != NULL; sc = nsc) {
2774 nsc = LIST_NEXT(sc, sc_tpq);
2775
2776 #ifdef DIAGNOSTIC
2777 if (sc->sc_tp != tp)
2778 panic("invalid sc_tp in syn_cache_cleanup");
2779 #endif
2780 SYN_CACHE_RM(sc);
2781 SYN_CACHE_PUT(sc);
2782 }
2783 /* just for safety */
2784 LIST_INIT(&tp->t_sc);
2785
2786 splx(s);
2787 }
2788
2789 /*
2790 * Find an entry in the syn cache.
2791 */
2792 struct syn_cache *
2793 syn_cache_lookup(src, dst, headp)
2794 struct sockaddr *src;
2795 struct sockaddr *dst;
2796 struct syn_cache_head **headp;
2797 {
2798 struct syn_cache *sc;
2799 struct syn_cache_head *scp;
2800 u_int32_t hash;
2801 int s;
2802
2803 SYN_HASHALL(hash, src, dst);
2804
2805 scp = &tcp_syn_cache[hash % tcp_syn_cache_size];
2806 *headp = scp;
2807 s = splsoftnet();
2808 for (sc = LIST_FIRST(&scp->sch_bucket); sc != NULL;
2809 sc = LIST_NEXT(sc, sc_bucketq)) {
2810 if (sc->sc_hash != hash)
2811 continue;
2812 if (!bcmp(&sc->sc_src, src, src->sa_len) &&
2813 !bcmp(&sc->sc_dst, dst, dst->sa_len)) {
2814 splx(s);
2815 return (sc);
2816 }
2817 }
2818 splx(s);
2819 return (NULL);
2820 }
2821
/*
 * This function gets called when we receive an ACK for a
 * socket in the LISTEN state.  We look up the connection
 * in the syn cache, and if it's there, we pull it out of
 * the cache and turn it into a full-blown connection in
 * the SYN-RECEIVED state.
 *
 * Arguments:
 *	src, dst	peer and local address of the segment
 *	th		TCP header of the segment
 *	hlen		not referenced in this function
 *	tlen		added to th_seq when an RST is generated
 *	so		the listening socket
 *	m		the mbuf chain holding the segment
 *
 * The return values may not be immediately obvious, and their effects
 * can be subtle, so here they are:
 *
 *	NULL	SYN was not found in cache; caller should drop the
 *		packet and send an RST.
 *
 *	-1	We were unable to create the new connection, and are
 *		aborting it.  An ACK,RST is being sent to the peer
 *		(unless we got screwy sequence numbers; see below),
 *		because the 3-way handshake has been completed.  Caller
 *		should not free the mbuf, since we may be using it.  If
 *		we are not, we will free it.
 *
 *	Otherwise, the return value is a pointer to the new socket
 *	associated with the connection.
 */
struct socket *
syn_cache_get(src, dst, th, hlen, tlen, so, m)
	struct sockaddr *src;
	struct sockaddr *dst;
	struct tcphdr *th;
	unsigned int hlen, tlen;
	struct socket *so;
	struct mbuf *m;
{
	struct syn_cache *sc;
	struct syn_cache_head *scp;
	struct inpcb *inp = NULL;
#ifdef INET6
	struct in6pcb *in6p = NULL;
#endif
	struct tcpcb *tp = 0;
	struct mbuf *am;
	int s;
	struct socket *oso;

	s = splsoftnet();
	if ((sc = syn_cache_lookup(src, dst, &scp)) == NULL) {
		splx(s);
		return (NULL);
	}

	/*
	 * Verify the sequence and ack numbers.  Try getting the correct
	 * response again.
	 */
	if ((th->th_ack != sc->sc_iss + 1) ||
	    SEQ_LEQ(th->th_seq, sc->sc_irs) ||
	    SEQ_GT(th->th_seq, sc->sc_irs + 1 + sc->sc_win)) {
		/* The entry stays cached; m is handed to the responder. */
		(void) syn_cache_respond(sc, m);
		splx(s);
		return ((struct socket *)(-1));
	}

	/*
	 * Remove this cache entry.  It is only unlinked here; every
	 * exit path below releases it with SYN_CACHE_PUT().
	 */
	SYN_CACHE_RM(sc);
	splx(s);

	/*
	 * Ok, create the full blown connection, and set things up
	 * as they would have been set up if we had created the
	 * connection when the SYN arrived.  If we can't create
	 * the connection, abort it.
	 */
	/*
	 * inp still has the OLD in_pcb stuff, set the
	 * v6-related flags on the new guy, too.   This is
	 * done particularly for the case where an AF_INET6
	 * socket is bound only to a port, and a v4 connection
	 * comes in on that port.
	 * we also copy the flowinfo from the original pcb
	 * to the new one.
	 */
    {
	/* NOTE(review): only referenced by the disabled code below. */
	struct inpcb *parentinpcb;

	parentinpcb = (struct inpcb *)so->so_pcb;

	/* Remember the listening socket; so becomes the new socket. */
	oso = so;
	so = sonewconn(so, SS_ISCONNECTED);
	if (so == NULL)
		goto resetandabort;

	/* Pick the pcb flavour from the new socket's protocol domain. */
	switch (so->so_proto->pr_domain->dom_family) {
	case AF_INET:
		inp = sotoinpcb(so);
		break;
#ifdef INET6
	case AF_INET6:
		in6p = sotoin6pcb(so);
#if 0 /*def INET6*/
		inp->inp_flags |= (parentinpcb->inp_flags &
			(INP_IPV6 | INP_IPV6_UNDEC | INP_IPV6_MAPPED));
		if ((inp->inp_flags & INP_IPV6) &&
		    !(inp->inp_flags & INP_IPV6_MAPPED)) {
			inp->inp_ipv6.ip6_hlim = parentinpcb->inp_ipv6.ip6_hlim;
			inp->inp_ipv6.ip6_vfc = parentinpcb->inp_ipv6.ip6_vfc;
		}
#endif
		break;
#endif
	}
    }
	/* Fill in the local address and port from the segment's dst. */
	switch (src->sa_family) {
	case AF_INET:
		if (inp) {
			inp->inp_laddr = ((struct sockaddr_in *)dst)->sin_addr;
			inp->inp_lport = ((struct sockaddr_in *)dst)->sin_port;
			inp->inp_options = ip_srcroute();
			in_pcbstate(inp, INP_BOUND);
			/*
			 * No source route on this packet: fall back to the
			 * options saved in the cache entry, taking ownership.
			 */
			if (inp->inp_options == NULL) {
				inp->inp_options = sc->sc_ipopts;
				sc->sc_ipopts = NULL;
			}
		}
#ifdef INET6
		else if (in6p) {
			/* IPv4 packet to AF_INET6 socket */
			bzero(&in6p->in6p_laddr, sizeof(in6p->in6p_laddr));
			in6p->in6p_laddr.s6_addr16[5] = htons(0xffff);
			bcopy(&((struct sockaddr_in *)dst)->sin_addr,
				&in6p->in6p_laddr.s6_addr32[3],
				sizeof(((struct sockaddr_in *)dst)->sin_addr));
			in6p->in6p_lport = ((struct sockaddr_in *)dst)->sin_port;
			in6totcpcb(in6p)->t_family = AF_INET;
		}
#endif
		break;
#ifdef INET6
	case AF_INET6:
		if (in6p) {
			in6p->in6p_laddr = ((struct sockaddr_in6 *)dst)->sin6_addr;
			in6p->in6p_lport = ((struct sockaddr_in6 *)dst)->sin6_port;
#if 0
			in6p->in6p_flowinfo = ip6->ip6_flow & IPV6_FLOWINFO_MASK;
			/*inp->inp_options = ip6_srcroute();*/ /* soon. */
#endif
		}
		break;
#endif
	}
#ifdef INET6
	if (in6p && in6totcpcb(in6p)->t_family == AF_INET6 && sotoinpcb(oso)) {
		struct in6pcb *oin6p = sotoin6pcb(oso);
		/* inherit socket options from the listening socket */
		in6p->in6p_flags |= (oin6p->in6p_flags & IN6P_CONTROLOPTS);
		if (in6p->in6p_flags & IN6P_CONTROLOPTS) {
			m_freem(in6p->in6p_options);
			in6p->in6p_options = 0;
		}
		ip6_savecontrol(in6p, &in6p->in6p_options,
			mtod(m, struct ip6_hdr *), m);
	}
#endif

#ifdef IPSEC
	/*
	 * we make a copy of policy, instead of sharing the policy,
	 * for better behavior in terms of SA lookup and dead SA removal.
	 */
	if (inp) {
		/* copy old policy into new socket's */
		if (ipsec_copy_policy(sotoinpcb(oso)->inp_sp, inp->inp_sp))
			printf("tcp_input: could not copy policy\n");
	}
#ifdef INET6
	else if (in6p) {
		/* copy old policy into new socket's */
		if (ipsec_copy_policy(sotoin6pcb(oso)->in6p_sp, in6p->in6p_sp))
			printf("tcp_input: could not copy policy\n");
	}
#endif
#endif

	/*
	 * Give the new socket our cached route reference.
	 */
	if (inp)
		inp->inp_route = sc->sc_route4;		/* struct assignment */
#ifdef INET6
	else
		in6p->in6p_route = sc->sc_route6;
#endif
	/* Cleared so that SYN_CACHE_PUT() does not RTFREE the route. */
	sc->sc_route4.ro_rt = NULL;

	/* Build a sockaddr mbuf for the peer and connect the new pcb. */
	am = m_get(M_DONTWAIT, MT_SONAME);	/* XXX */
	if (am == NULL)
		goto resetandabort;
	am->m_len = src->sa_len;
	bcopy(src, mtod(am, caddr_t), src->sa_len);
	if (inp) {
		if (in_pcbconnect(inp, am)) {
			(void) m_free(am);
			goto resetandabort;
		}
	}
#ifdef INET6
	else if (in6p) {
		if (src->sa_family == AF_INET) {
			/* IPv4 packet to AF_INET6 socket */
			struct sockaddr_in6 *sin6;
			sin6 = mtod(am, struct sockaddr_in6 *);
			am->m_len = sizeof(*sin6);
			bzero(sin6, sizeof(*sin6));
			sin6->sin6_family = AF_INET6;
			sin6->sin6_len = sizeof(*sin6);
			sin6->sin6_port = ((struct sockaddr_in *)src)->sin_port;
			sin6->sin6_addr.s6_addr16[5] = htons(0xffff);
			bcopy(&((struct sockaddr_in *)src)->sin_addr,
				&sin6->sin6_addr.s6_addr32[3],
				sizeof(sin6->sin6_addr.s6_addr32[3]));
		}
		if (in6_pcbconnect(in6p, am)) {
			(void) m_free(am);
			goto resetandabort;
		}
	}
#endif
	else {
		/* Neither pcb flavour was set up above. */
		(void) m_free(am);
		goto resetandabort;
	}
	(void) m_free(am);

	if (inp)
		tp = intotcpcb(inp);
#ifdef INET6
	else if (in6p)
		tp = in6totcpcb(in6p);
#endif
	else
		tp = NULL;
	/* A value of 15 is how syn_cache_add() marks "no window scaling". */
	if (sc->sc_request_r_scale != 15) {
		tp->requested_s_scale = sc->sc_requested_s_scale;
		tp->request_r_scale = sc->sc_request_r_scale;
		tp->snd_scale = sc->sc_requested_s_scale;
		tp->rcv_scale = sc->sc_request_r_scale;
		tp->t_flags |= TF_RCVD_SCALE;
	}
	if (sc->sc_flags & SCF_TIMESTAMP)
		tp->t_flags |= TF_RCVD_TSTMP;

	tp->t_template = tcp_template(tp);
	if (tp->t_template == 0) {
		tp = tcp_drop(tp, ENOBUFS);	/* destroys socket */
		/* so is cleared so that the abort path skips soabort(). */
		so = NULL;
		m_freem(m);
		goto abort;
	}

	tp->iss = sc->sc_iss;
	tp->irs = sc->sc_irs;
	tcp_sendseqinit(tp);
	tcp_rcvseqinit(tp);
	tp->t_state = TCPS_SYN_RECEIVED;
	TCP_TIMER_ARM(tp, TCPT_KEEP, TCPTV_KEEP_INIT);
	tcpstat.tcps_accepts++;

	/* Initialize tp->t_ourmss before we deal with the peer's! */
	tp->t_ourmss = sc->sc_ourmaxseg;
	tcp_mss_from_peer(tp, sc->sc_peermaxseg);

	/*
	 * Initialize the initial congestion window.  If we
	 * had to retransmit the SYN,ACK, we must initialize cwnd
	 * to 1 segment (i.e. the Loss Window).
	 */
	if (sc->sc_rxtshift)
		tp->snd_cwnd = tp->t_peermss;
	else
		tp->snd_cwnd = TCP_INITIAL_WINDOW(tcp_init_win, tp->t_peermss);

	tcp_rmx_rtt(tp);
	tp->snd_wl1 = sc->sc_irs;
	tp->rcv_up = sc->sc_irs + 1;

	/*
	 * This is what would have happened in tcp_output() when
	 * the SYN,ACK was sent.
	 */
	tp->snd_up = tp->snd_una;
	tp->snd_max = tp->snd_nxt = tp->iss+1;
	TCP_TIMER_ARM(tp, TCPT_REXMT, tp->t_rxtcur);
	if (sc->sc_win > 0 && SEQ_GT(tp->rcv_nxt + sc->sc_win, tp->rcv_adv))
		tp->rcv_adv = tp->rcv_nxt + sc->sc_win;
	tp->last_ack_sent = tp->rcv_nxt;

	tcpstat.tcps_sc_completed++;
	SYN_CACHE_PUT(sc);
	return (so);

	/* Failure exits: send an RST,ACK (first label only), then clean up. */
resetandabort:
	(void) tcp_respond(NULL, m, m, th,
			   th->th_seq + tlen, (tcp_seq)0, TH_RST|TH_ACK);
abort:
	if (so != NULL)
		(void) soabort(so);
	SYN_CACHE_PUT(sc);
	tcpstat.tcps_sc_aborted++;
	return ((struct socket *)(-1));
}
3130
3131 /*
3132 * This function is called when we get a RST for a
3133 * non-existant connection, so that we can see if the
3134 * connection is in the syn cache. If it is, zap it.
3135 */
3136
3137 void
3138 syn_cache_reset(src, dst, th)
3139 struct sockaddr *src;
3140 struct sockaddr *dst;
3141 struct tcphdr *th;
3142 {
3143 struct syn_cache *sc;
3144 struct syn_cache_head *scp;
3145 int s = splsoftnet();
3146
3147 if ((sc = syn_cache_lookup(src, dst, &scp)) == NULL) {
3148 splx(s);
3149 return;
3150 }
3151 if (SEQ_LT(th->th_seq, sc->sc_irs) ||
3152 SEQ_GT(th->th_seq, sc->sc_irs+1)) {
3153 splx(s);
3154 return;
3155 }
3156 SYN_CACHE_RM(sc);
3157 splx(s);
3158 tcpstat.tcps_sc_reset++;
3159 SYN_CACHE_PUT(sc);
3160 }
3161
3162 void
3163 syn_cache_unreach(src, dst, th)
3164 struct sockaddr *src;
3165 struct sockaddr *dst;
3166 struct tcphdr *th;
3167 {
3168 struct syn_cache *sc;
3169 struct syn_cache_head *scp;
3170 int s;
3171
3172 s = splsoftnet();
3173 if ((sc = syn_cache_lookup(src, dst, &scp)) == NULL) {
3174 splx(s);
3175 return;
3176 }
3177 /* If the sequence number != sc_iss, then it's a bogus ICMP msg */
3178 if (ntohl (th->th_seq) != sc->sc_iss) {
3179 splx(s);
3180 return;
3181 }
3182
3183 /*
3184 * If we've rertransmitted 3 times and this is our second error,
3185 * we remove the entry. Otherwise, we allow it to continue on.
3186 * This prevents us from incorrectly nuking an entry during a
3187 * spurious network outage.
3188 *
3189 * See tcp_notify().
3190 */
3191 if ((sc->sc_flags & SCF_UNREACH) == 0 || sc->sc_rxtshift < 3) {
3192 sc->sc_flags |= SCF_UNREACH;
3193 splx(s);
3194 return;
3195 }
3196
3197 SYN_CACHE_RM(sc);
3198 splx(s);
3199 tcpstat.tcps_sc_unreach++;
3200 SYN_CACHE_PUT(sc);
3201 }
3202
3203 /*
3204 * Given a LISTEN socket and an inbound SYN request, add
3205 * this to the syn cache, and send back a segment:
3206 * <SEQ=ISS><ACK=RCV_NXT><CTL=SYN,ACK>
3207 * to the source.
3208 *
3209 * IMPORTANT NOTE: We do _NOT_ ACK data that might accompany the SYN.
3210 * Doing so would require that we hold onto the data and deliver it
3211 * to the application. However, if we are the target of a SYN-flood
3212 * DoS attack, an attacker could send data which would eventually
3213 * consume all available buffer space if it were ACKed. By not ACKing
3214 * the data, we avoid this DoS scenario.
3215 */
3216
3217 int
3218 syn_cache_add(src, dst, th, hlen, so, m, optp, optlen, oi)
3219 struct sockaddr *src;
3220 struct sockaddr *dst;
3221 struct tcphdr *th;
3222 unsigned int hlen;
3223 struct socket *so;
3224 struct mbuf *m;
3225 u_char *optp;
3226 int optlen;
3227 struct tcp_opt_info *oi;
3228 {
3229 struct tcpcb tb, *tp;
3230 long win;
3231 struct syn_cache *sc;
3232 struct syn_cache_head *scp;
3233 struct mbuf *ipopts;
3234
3235 tp = sototcpcb(so);
3236
3237 /*
3238 * RFC1122 4.2.3.10, p. 104: discard bcast/mcast SYN
3239 *
3240 * Note this check is performed in tcp_input() very early on.
3241 */
3242
3243 /*
3244 * Initialize some local state.
3245 */
3246 win = sbspace(&so->so_rcv);
3247 if (win > TCP_MAXWIN)
3248 win = TCP_MAXWIN;
3249
3250 if (src->sa_family == AF_INET) {
3251 /*
3252 * Remember the IP options, if any.
3253 */
3254 ipopts = ip_srcroute();
3255 } else
3256 ipopts = NULL;
3257
3258 if (optp) {
3259 tb.t_flags = tcp_do_rfc1323 ? (TF_REQ_SCALE|TF_REQ_TSTMP) : 0;
3260 tcp_dooptions(&tb, optp, optlen, th, oi);
3261 } else
3262 tb.t_flags = 0;
3263
3264 /*
3265 * See if we already have an entry for this connection.
3266 * If we do, resend the SYN,ACK. We do not count this
3267 * as a retransmission (XXX though maybe we should).
3268 */
3269 if ((sc = syn_cache_lookup(src, dst, &scp)) != NULL) {
3270 tcpstat.tcps_sc_dupesyn++;
3271 if (ipopts) {
3272 /*
3273 * If we were remembering a previous source route,
3274 * forget it and use the new one we've been given.
3275 */
3276 if (sc->sc_ipopts)
3277 (void) m_free(sc->sc_ipopts);
3278 sc->sc_ipopts = ipopts;
3279 }
3280 sc->sc_timestamp = tb.ts_recent;
3281 if (syn_cache_respond(sc, m) == 0) {
3282 tcpstat.tcps_sndacks++;
3283 tcpstat.tcps_sndtotal++;
3284 }
3285 return (1);
3286 }
3287
3288 sc = pool_get(&syn_cache_pool, PR_NOWAIT);
3289 if (sc == NULL) {
3290 if (ipopts)
3291 (void) m_free(ipopts);
3292 return (0);
3293 }
3294
3295 /*
3296 * Fill in the cache, and put the necessary IP and TCP
3297 * options into the reply.
3298 */
3299 bzero(sc, sizeof(struct syn_cache));
3300 bcopy(src, &sc->sc_src, src->sa_len);
3301 bcopy(dst, &sc->sc_dst, dst->sa_len);
3302 sc->sc_flags = 0;
3303 sc->sc_ipopts = ipopts;
3304 sc->sc_irs = th->th_seq;
3305 sc->sc_iss = tcp_new_iss(sc, sizeof(struct syn_cache), 0);
3306 sc->sc_peermaxseg = oi->maxseg;
3307 sc->sc_ourmaxseg = tcp_mss_to_advertise(m->m_flags & M_PKTHDR ?
3308 m->m_pkthdr.rcvif : NULL,
3309 sc->sc_src.sa.sa_family);
3310 sc->sc_win = win;
3311 sc->sc_timestamp = tb.ts_recent;
3312 if (tcp_do_rfc1323 && (tb.t_flags & TF_RCVD_TSTMP))
3313 sc->sc_flags |= SCF_TIMESTAMP;
3314 if ((tb.t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
3315 (TF_RCVD_SCALE|TF_REQ_SCALE)) {
3316 sc->sc_requested_s_scale = tb.requested_s_scale;
3317 sc->sc_request_r_scale = 0;
3318 while (sc->sc_request_r_scale < TCP_MAX_WINSHIFT &&
3319 TCP_MAXWIN << sc->sc_request_r_scale <
3320 so->so_rcv.sb_hiwat)
3321 sc->sc_request_r_scale++;
3322 } else {
3323 sc->sc_requested_s_scale = 15;
3324 sc->sc_request_r_scale = 15;
3325 }
3326 sc->sc_tp = tp;
3327 if (syn_cache_respond(sc, m) == 0) {
3328 syn_cache_insert(sc, tp);
3329 tcpstat.tcps_sndacks++;
3330 tcpstat.tcps_sndtotal++;
3331 } else {
3332 SYN_CACHE_PUT(sc);
3333 tcpstat.tcps_sc_dropped++;
3334 }
3335 return (1);
3336 }
3337
3338 int
3339 syn_cache_respond(sc, m)
3340 struct syn_cache *sc;
3341 struct mbuf *m;
3342 {
3343 struct route *ro;
3344 struct rtentry *rt;
3345 u_int8_t *optp;
3346 int optlen, error;
3347 u_int16_t tlen;
3348 struct ip *ip = NULL;
3349 #ifdef INET6
3350 struct ip6_hdr *ip6 = NULL;
3351 #endif
3352 struct tcphdr *th;
3353 u_int hlen;
3354
3355 switch (sc->sc_src.sa.sa_family) {
3356 case AF_INET:
3357 hlen = sizeof(struct ip);
3358 ro = &sc->sc_route4;
3359 break;
3360 #ifdef INET6
3361 case AF_INET6:
3362 hlen = sizeof(struct ip6_hdr);
3363 ro = (struct route *)&sc->sc_route6;
3364 break;
3365 #endif
3366 default:
3367 if (m)
3368 m_freem(m);
3369 return EAFNOSUPPORT;
3370 }
3371
3372 /* Compute the size of the TCP options. */
3373 optlen = 4 + (sc->sc_request_r_scale != 15 ? 4 : 0) +
3374 ((sc->sc_flags & SCF_TIMESTAMP) ? TCPOLEN_TSTAMP_APPA : 0);
3375
3376 tlen = hlen + sizeof(struct tcphdr) + optlen;
3377
3378 /*
3379 * Create the IP+TCP header from scratch.
3380 */
3381 if (m)
3382 m_freem(m);
3383 #ifdef DIAGNOSTIC
3384 if (max_linkhdr + tlen > MCLBYTES)
3385 return (ENOBUFS);
3386 #endif
3387 MGETHDR(m, M_DONTWAIT, MT_DATA);
3388 if (m && tlen > MHLEN) {
3389 MCLGET(m, M_DONTWAIT);
3390 if ((m->m_flags & M_EXT) == 0) {
3391 m_freem(m);
3392 m = NULL;
3393 }
3394 }
3395 if (m == NULL)
3396 return (ENOBUFS);
3397
3398 /* Fixup the mbuf. */
3399 m->m_data += max_linkhdr;
3400 m->m_len = m->m_pkthdr.len = tlen;
3401 #ifdef IPSEC
3402 if (sc->sc_tp) {
3403 struct tcpcb *tp;
3404 struct socket *so;
3405
3406 tp = sc->sc_tp;
3407 if (tp->t_inpcb)
3408 so = tp->t_inpcb->inp_socket;
3409 #ifdef INET6
3410 else if (tp->t_in6pcb)
3411 so = tp->t_in6pcb->in6p_socket;
3412 #endif
3413 else
3414 so = NULL;
3415 /* use IPsec policy on listening socket, on SYN ACK */
3416 ipsec_setsocket(m, so);
3417 }
3418 #endif
3419 m->m_pkthdr.rcvif = NULL;
3420 memset(mtod(m, u_char *), 0, tlen);
3421
3422 switch (sc->sc_src.sa.sa_family) {
3423 case AF_INET:
3424 ip = mtod(m, struct ip *);
3425 ip->ip_dst = sc->sc_src.sin.sin_addr;
3426 ip->ip_src = sc->sc_dst.sin.sin_addr;
3427 ip->ip_p = IPPROTO_TCP;
3428 th = (struct tcphdr *)(ip + 1);
3429 th->th_dport = sc->sc_src.sin.sin_port;
3430 th->th_sport = sc->sc_dst.sin.sin_port;
3431 break;
3432 #ifdef INET6
3433 case AF_INET6:
3434 ip6 = mtod(m, struct ip6_hdr *);
3435 ip6->ip6_dst = sc->sc_src.sin6.sin6_addr;
3436 ip6->ip6_src = sc->sc_dst.sin6.sin6_addr;
3437 ip6->ip6_nxt = IPPROTO_TCP;
3438 /* ip6_plen will be updated in ip6_output() */
3439 th = (struct tcphdr *)(ip6 + 1);
3440 th->th_dport = sc->sc_src.sin6.sin6_port;
3441 th->th_sport = sc->sc_dst.sin6.sin6_port;
3442 break;
3443 #endif
3444 default:
3445 th = NULL;
3446 }
3447
3448 th->th_seq = htonl(sc->sc_iss);
3449 th->th_ack = htonl(sc->sc_irs + 1);
3450 th->th_off = (sizeof(struct tcphdr) + optlen) >> 2;
3451 th->th_flags = TH_SYN|TH_ACK;
3452 th->th_win = htons(sc->sc_win);
3453 /* th_sum already 0 */
3454 /* th_urp already 0 */
3455
3456 /* Tack on the TCP options. */
3457 optp = (u_int8_t *)(th + 1);
3458 *optp++ = TCPOPT_MAXSEG;
3459 *optp++ = 4;
3460 *optp++ = (sc->sc_ourmaxseg >> 8) & 0xff;
3461 *optp++ = sc->sc_ourmaxseg & 0xff;
3462
3463 if (sc->sc_request_r_scale != 15) {
3464 *((u_int32_t *)optp) = htonl(TCPOPT_NOP << 24 |
3465 TCPOPT_WINDOW << 16 | TCPOLEN_WINDOW << 8 |
3466 sc->sc_request_r_scale);
3467 optp += 4;
3468 }
3469
3470 if (sc->sc_flags & SCF_TIMESTAMP) {
3471 u_int32_t *lp = (u_int32_t *)(optp);
3472 /* Form timestamp option as shown in appendix A of RFC 1323. */
3473 *lp++ = htonl(TCPOPT_TSTAMP_HDR);
3474 *lp++ = htonl(tcp_now);
3475 *lp = htonl(sc->sc_timestamp);
3476 optp += TCPOLEN_TSTAMP_APPA;
3477 }
3478
3479 /* Compute the packet's checksum. */
3480 switch (sc->sc_src.sa.sa_family) {
3481 case AF_INET:
3482 ip->ip_len = htons(tlen - hlen);
3483 th->th_sum = 0;
3484 th->th_sum = in_cksum(m, tlen);
3485 break;
3486 #ifdef INET6
3487 case AF_INET6:
3488 ip6->ip6_plen = htons(tlen - hlen);
3489 th->th_sum = 0;
3490 th->th_sum = in6_cksum(m, IPPROTO_TCP, hlen, tlen - hlen);
3491 break;
3492 #endif
3493 }
3494
3495 /*
3496 * Fill in some straggling IP bits. Note the stack expects
3497 * ip_len to be in host order, for convenience.
3498 */
3499 switch (sc->sc_src.sa.sa_family) {
3500 case AF_INET:
3501 ip->ip_len = tlen;
3502 ip->ip_ttl = ip_defttl;
3503 /* XXX tos? */
3504 break;
3505 #ifdef INET6
3506 case AF_INET6:
3507 ip6->ip6_vfc &= ~IPV6_VERSION_MASK;
3508 ip6->ip6_vfc |= IPV6_VERSION;
3509 ip6->ip6_plen = htons(tlen - hlen);
3510 /* ip6_hlim will be initialized afterwards */
3511 /* XXX flowlabel? */
3512 break;
3513 #endif
3514 }
3515
3516 /*
3517 * If we're doing Path MTU discovery, we need to set DF unless
3518 * the route's MTU is locked. If we don't yet know the route,
3519 * look it up now. We will copy this reference to the inpcb
3520 * when we finish creating the connection.
3521 */
3522 if ((rt = ro->ro_rt) == NULL || (rt->rt_flags & RTF_UP) == 0) {
3523 if (ro->ro_rt != NULL) {
3524 RTFREE(ro->ro_rt);
3525 ro->ro_rt = NULL;
3526 }
3527 bcopy(&sc->sc_src, &ro->ro_dst, sc->sc_src.sa.sa_len);
3528 rtalloc(ro);
3529 if ((rt = ro->ro_rt) == NULL) {
3530 m_freem(m);
3531 switch (sc->sc_src.sa.sa_family) {
3532 case AF_INET:
3533 ipstat.ips_noroute++;
3534 break;
3535 #ifdef INET6
3536 case AF_INET6:
3537 ip6stat.ip6s_noroute++;
3538 break;
3539 #endif
3540 }
3541 return (EHOSTUNREACH);
3542 }
3543 }
3544
3545 switch (sc->sc_src.sa.sa_family) {
3546 case AF_INET:
3547 if (ip_mtudisc != 0 && (rt->rt_rmx.rmx_locks & RTV_MTU) == 0)
3548 ip->ip_off |= IP_DF;
3549
3550 /* ...and send it off! */
3551 error = ip_output(m, sc->sc_ipopts, ro, 0, NULL);
3552 break;
3553 #ifdef INET6
3554 case AF_INET6:
3555 ip6->ip6_hlim = in6_selecthlim(NULL,
3556 ro->ro_rt ? ro->ro_rt->rt_ifp : NULL);
3557
3558 error = ip6_output(m, NULL /*XXX*/, (struct route_in6 *)ro,
3559 0, NULL, NULL);
3560 break;
3561 #endif
3562 default:
3563 error = EAFNOSUPPORT;
3564 break;
3565 }
3566 return (error);
3567 }
3568