tcp_input.c revision 1.224 1 /* $NetBSD: tcp_input.c,v 1.224 2005/03/16 00:39:56 yamt Exp $ */
2
3 /*
4 * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
5 * All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in the
14 * documentation and/or other materials provided with the distribution.
15 * 3. Neither the name of the project nor the names of its contributors
16 * may be used to endorse or promote products derived from this software
17 * without specific prior written permission.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
20 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22 * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
23 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
25 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
26 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
29 * SUCH DAMAGE.
30 */
31
32 /*
33 * @(#)COPYRIGHT 1.1 (NRL) 17 January 1995
34 *
35 * NRL grants permission for redistribution and use in source and binary
36 * forms, with or without modification, of the software and documentation
37 * created at NRL provided that the following conditions are met:
38 *
39 * 1. Redistributions of source code must retain the above copyright
40 * notice, this list of conditions and the following disclaimer.
41 * 2. Redistributions in binary form must reproduce the above copyright
42 * notice, this list of conditions and the following disclaimer in the
43 * documentation and/or other materials provided with the distribution.
44 * 3. All advertising materials mentioning features or use of this software
45 * must display the following acknowledgements:
46 * This product includes software developed by the University of
47 * California, Berkeley and its contributors.
48 * This product includes software developed at the Information
49 * Technology Division, US Naval Research Laboratory.
50 * 4. Neither the name of the NRL nor the names of its contributors
51 * may be used to endorse or promote products derived from this software
52 * without specific prior written permission.
53 *
54 * THE SOFTWARE PROVIDED BY NRL IS PROVIDED BY NRL AND CONTRIBUTORS ``AS
55 * IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
56 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
57 * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NRL OR
58 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
59 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
60 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
61 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
62 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
63 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
64 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
65 *
66 * The views and conclusions contained in the software and documentation
67 * are those of the authors and should not be interpreted as representing
68 * official policies, either expressed or implied, of the US Naval
69 * Research Laboratory (NRL).
70 */
71
72 /*-
73 * Copyright (c) 1997, 1998, 1999, 2001, 2005 The NetBSD Foundation, Inc.
74 * All rights reserved.
75 *
76 * This code is derived from software contributed to The NetBSD Foundation
77 * by Jason R. Thorpe and Kevin M. Lahey of the Numerical Aerospace Simulation
78 * Facility, NASA Ames Research Center.
79 * This code is derived from software contributed to The NetBSD Foundation
80 * by Charles M. Hannum.
81 *
82 * Redistribution and use in source and binary forms, with or without
83 * modification, are permitted provided that the following conditions
84 * are met:
85 * 1. Redistributions of source code must retain the above copyright
86 * notice, this list of conditions and the following disclaimer.
87 * 2. Redistributions in binary form must reproduce the above copyright
88 * notice, this list of conditions and the following disclaimer in the
89 * documentation and/or other materials provided with the distribution.
90 * 3. All advertising materials mentioning features or use of this software
91 * must display the following acknowledgement:
92 * This product includes software developed by the NetBSD
93 * Foundation, Inc. and its contributors.
94 * 4. Neither the name of The NetBSD Foundation nor the names of its
95 * contributors may be used to endorse or promote products derived
96 * from this software without specific prior written permission.
97 *
98 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
99 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
100 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
101 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
102 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
103 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
104 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
105 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
106 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
107 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
108 * POSSIBILITY OF SUCH DAMAGE.
109 */
110
111 /*
112 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995
113 * The Regents of the University of California. All rights reserved.
114 *
115 * Redistribution and use in source and binary forms, with or without
116 * modification, are permitted provided that the following conditions
117 * are met:
118 * 1. Redistributions of source code must retain the above copyright
119 * notice, this list of conditions and the following disclaimer.
120 * 2. Redistributions in binary form must reproduce the above copyright
121 * notice, this list of conditions and the following disclaimer in the
122 * documentation and/or other materials provided with the distribution.
123 * 3. Neither the name of the University nor the names of its contributors
124 * may be used to endorse or promote products derived from this software
125 * without specific prior written permission.
126 *
127 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
128 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
129 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
130 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
131 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
132 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
133 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
134 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
135 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
136 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
137 * SUCH DAMAGE.
138 *
139 * @(#)tcp_input.c 8.12 (Berkeley) 5/24/95
140 */
141
142 /*
143 * TODO list for SYN cache stuff:
144 *
145 * Find room for a "state" field, which is needed to keep a
146 * compressed state for TIME_WAIT TCBs. It's been noted already
147 * that this is fairly important for very high-volume web and
148 * mail servers, which use a large number of short-lived
149 * connections.
150 */
151
152 #include <sys/cdefs.h>
153 __KERNEL_RCSID(0, "$NetBSD: tcp_input.c,v 1.224 2005/03/16 00:39:56 yamt Exp $");
154
155 #include "opt_inet.h"
156 #include "opt_ipsec.h"
157 #include "opt_inet_csum.h"
158 #include "opt_tcp_debug.h"
159
160 #include <sys/param.h>
161 #include <sys/systm.h>
162 #include <sys/malloc.h>
163 #include <sys/mbuf.h>
164 #include <sys/protosw.h>
165 #include <sys/socket.h>
166 #include <sys/socketvar.h>
167 #include <sys/errno.h>
168 #include <sys/syslog.h>
169 #include <sys/pool.h>
170 #include <sys/domain.h>
171 #include <sys/kernel.h>
172 #ifdef TCP_SIGNATURE
173 #include <sys/md5.h>
174 #endif
175
176 #include <net/if.h>
177 #include <net/route.h>
178 #include <net/if_types.h>
179
180 #include <netinet/in.h>
181 #include <netinet/in_systm.h>
182 #include <netinet/ip.h>
183 #include <netinet/in_pcb.h>
184 #include <netinet/in_var.h>
185 #include <netinet/ip_var.h>
186
187 #ifdef INET6
188 #ifndef INET
189 #include <netinet/in.h>
190 #endif
191 #include <netinet/ip6.h>
192 #include <netinet6/ip6_var.h>
193 #include <netinet6/in6_pcb.h>
194 #include <netinet6/ip6_var.h>
195 #include <netinet6/in6_var.h>
196 #include <netinet/icmp6.h>
197 #include <netinet6/nd6.h>
198 #endif
199
200 #ifndef INET6
201 /* always need ip6.h for IP6_EXTHDR_GET */
202 #include <netinet/ip6.h>
203 #endif
204
205 #include <netinet/tcp.h>
206 #include <netinet/tcp_fsm.h>
207 #include <netinet/tcp_seq.h>
208 #include <netinet/tcp_timer.h>
209 #include <netinet/tcp_var.h>
210 #include <netinet/tcpip.h>
211 #include <netinet/tcp_debug.h>
212
213 #include <machine/stdarg.h>
214
215 #ifdef IPSEC
216 #include <netinet6/ipsec.h>
217 #include <netkey/key.h>
218 #endif /*IPSEC*/
219 #ifdef INET6
220 #include "faith.h"
221 #if defined(NFAITH) && NFAITH > 0
222 #include <net/if_faith.h>
223 #endif
#endif /* INET6 */
225
226 #ifdef FAST_IPSEC
227 #include <netipsec/ipsec.h>
228 #include <netipsec/ipsec_var.h> /* XXX ipsecstat namespace */
229 #include <netipsec/key.h>
230 #ifdef INET6
231 #include <netipsec/ipsec6.h>
232 #endif
233 #endif /* FAST_IPSEC*/
234
/*
 * Retransmit threshold, initialised to 3.
 * NOTE(review): presumably the duplicate-ACK count that triggers fast
 * retransmit; its use is not visible in this part of the file -- confirm.
 */
int	tcprexmtthresh = 3;
/*
 * Consulted by tcp_input() when a segment matches no PCB and carries SYN
 * without RST or ACK; presumably enables the tcp4/tcp6_log_refused() calls.
 */
int	tcp_log_refused;

/*
 * Counter/timestamp pairs.
 * NOTE(review): by their names these are packets-per-second rate-limit
 * state for generated RSTs and for dropped ACKs; the code that uses them
 * is not visible here -- confirm.
 */
static int tcp_rst_ppslim_count = 0;
static struct timeval tcp_rst_ppslim_last;
static int tcp_ackdrop_ppslim_count = 0;
static struct timeval tcp_ackdrop_ppslim_last;
242
/*
 * 24 * 24 * 60 * 60 is the number of seconds in 24 days; the product is
 * scaled by PR_SLOWHZ and evaluated as unsigned arithmetic.
 */
#define	TCP_PAWS_IDLE	(24U * 24 * 60 * 60 * PR_SLOWHZ)

/*
 * Wrap-around (modulo) timestamp comparisons: the sign of the difference,
 * taken as an int, decides the ordering.  TSTMP_GEQ is simply the
 * complement of TSTMP_LT.
 */
#define	TSTMP_LT(a,b)	((int)((a) - (b)) < 0)
#define	TSTMP_GEQ(a,b)	(!TSTMP_LT((a), (b)))
248
/*
 * Neighbor Discovery, Neighbor Unreachability Detection Upper layer hint.
 *
 * If tp is non-NULL, has an in6pcb, has t_family == AF_INET6 and that pcb
 * holds a cached route (in6p_route.ro_rt), pass the route to
 * nd6_nud_hint().  Expands to nothing when INET6 is not configured.
 *
 * Every use of the argument is parenthesized so that an arbitrary pointer
 * expression may be passed.  The argument is evaluated more than once, so
 * it must be free of side effects.
 */
#ifdef INET6
#define ND6_HINT(tp) \
do { \
	if ((tp) != NULL && (tp)->t_in6pcb != NULL && \
	    (tp)->t_family == AF_INET6 && \
	    (tp)->t_in6pcb->in6p_route.ro_rt != NULL) { \
		nd6_nud_hint((tp)->t_in6pcb->in6p_route.ro_rt, NULL, 0); \
	} \
} while (/*CONSTCOND*/ 0)
#else
#define ND6_HINT(tp)
#endif
263
/*
 * Macro to compute ACK transmission behavior.  Delay the ACK unless
 * we have already delayed an ACK (must send an ACK every two segments).
 * We also ACK immediately if we received a PUSH and the ACK-on-PUSH
 * option is enabled.
 *
 * tp: TCP control block whose t_flags is updated.
 * th: TCP header of the segment being acknowledged (th_flags is read).
 *
 * Both arguments are parenthesized at every use so that pointer
 * expressions (e.g. "array + i") expand correctly; tp is evaluated more
 * than once and must be free of side effects.
 */
#define TCP_SETUP_ACK(tp, th) \
do { \
	if ((tp)->t_flags & TF_DELACK || \
	    (tcp_ack_on_push && (th)->th_flags & TH_PUSH)) \
		(tp)->t_flags |= TF_ACKNOW; \
	else \
		TCP_SET_DELACK(tp); \
} while (/*CONSTCOND*/ 0)
278
/*
 * Convert TCP protocol fields to host order for easier processing.
 *
 * Applies NTOHL() to th_seq and th_ack and NTOHS() to th_win and th_urp
 * of the header pointed to by th.  th is evaluated four times.
 */
#define TCP_FIELDS_TO_HOST(th) \
do { \
	NTOHL((th)->th_seq); \
	NTOHL((th)->th_ack); \
	NTOHS((th)->th_win); \
	NTOHS((th)->th_urp); \
} while (/*CONSTCOND*/ 0)

/*
 * ... and reverse the above: HTONL()/HTONS() on the same four fields.
 */
#define TCP_FIELDS_TO_NET(th) \
do { \
	HTONL((th)->th_seq); \
	HTONL((th)->th_ack); \
	HTONS((th)->th_win); \
	HTONS((th)->th_urp); \
} while (/*CONSTCOND*/ 0)
300
/*
 * Optional event counters for the checksum paths taken in
 * tcp_input_checksum().  The counters themselves are defined elsewhere.
 * Without TCP_CSUM_COUNTERS the increment macro expands to nothing.
 */
#ifdef TCP_CSUM_COUNTERS
#include <sys/device.h>

extern struct evcnt tcp_hwcsum_ok;	/* hardware reported a good checksum */
extern struct evcnt tcp_hwcsum_bad;	/* hardware reported a bad checksum */
extern struct evcnt tcp_hwcsum_data;	/* hardware supplied checksum data */
extern struct evcnt tcp_swcsum;		/* checksum computed in software */

/* Bump the ev_count member of the event counter pointed to by ev. */
#define TCP_CSUM_COUNTER_INCR(ev)	(ev)->ev_count++

#else

#define TCP_CSUM_COUNTER_INCR(ev)	/* nothing */

#endif /* TCP_CSUM_COUNTERS */
316
/*
 * Optional event counters for the code paths of tcp_reass().  Each counter
 * is bumped on the path its suffix names; tcp_reass_ is bumped on every
 * call that carries a segment, and tcp_reass_iteration[] is a histogram of
 * how many queue entries the search loop visited.  The counters are
 * defined elsewhere.  Without TCP_REASS_COUNTERS the increment macro
 * expands to nothing.
 */
#ifdef TCP_REASS_COUNTERS
#include <sys/device.h>

extern struct evcnt tcp_reass_;
extern struct evcnt tcp_reass_empty;
extern struct evcnt tcp_reass_iteration[8];
extern struct evcnt tcp_reass_prependfirst;
extern struct evcnt tcp_reass_prepend;
extern struct evcnt tcp_reass_insert;
extern struct evcnt tcp_reass_inserttail;
extern struct evcnt tcp_reass_append;
extern struct evcnt tcp_reass_appendtail;
extern struct evcnt tcp_reass_overlaptail;
extern struct evcnt tcp_reass_overlapfront;
extern struct evcnt tcp_reass_segdup;
extern struct evcnt tcp_reass_fragdup;

/* Bump the ev_count member of the event counter pointed to by ev. */
#define TCP_REASS_COUNTER_INCR(ev)	(ev)->ev_count++

#else

#define TCP_REASS_COUNTER_INCR(ev)	/* nothing */

#endif /* TCP_REASS_COUNTERS */
341
/*
 * Per-address-family helpers that log a "Connection attempt to TCP ..."
 * message; each takes the IP header (may be NULL) and the TCP header.
 */
#ifdef INET
static void tcp4_log_refused(const struct ip *, const struct tcphdr *);
#endif
#ifdef INET6
static void tcp6_log_refused(const struct ip6_hdr *, const struct tcphdr *);
#endif
348
/*
 * Advance x, in place, along its m_next links until it refers to the last
 * element of the chain.  x must be a modifiable lvalue and is evaluated
 * repeatedly.
 */
#define	TRAVERSE(x) \
do { \
	while ((x)->m_next) \
		(x) = (x)->m_next; \
} while (/*CONSTCOND*/ 0)
350
/*
 * Pool named "tcpipqepl" with items of sizeof(struct ipqent); tcp_reass()
 * takes its reassembly queue entries from it and returns them to it.
 */
POOL_INIT(tcpipqent_pool, sizeof(struct ipqent), 0, 0, 0, "tcpipqepl", NULL);
352
/*
 * tcp_reass: insert a received segment into the connection's reassembly
 * queue and, when possible, hand queued data to the socket.
 *
 * tp	TCP control block; owns the sequence-ordered queue (segq), the
 *	second queue linked through ipqe_timeq (timeq) and the entry
 *	count t_segqlen.
 * th	TCP header of the segment, or 0 to skip queueing and only try to
 *	present already-queued data.  th_seq and th_flags are read.
 * m	mbuf chain holding the segment data; it is either linked into a
 *	queue entry or freed here.
 * tlen	pointer to the segment's data length; only read.
 *
 * The segment is merged with any queue entry it abuts or overlaps, so
 * that the result is stored in a single entry (tiqe).  An entry freed by
 * merging is recycled as tiqe when possible, which avoids a pool
 * allocation.
 *
 * Returns the TH_FIN bit of the entry delivered to the socket, or 0 when
 * nothing was delivered (segment was a duplicate, no queue entry could be
 * allocated, connection not yet established, or the head of the queue
 * does not start at rcv_nxt).
 *
 * NOTE(review): so remains NULL if neither t_inpcb nor t_in6pcb is set and
 * is dereferenced in the "present" part -- presumably callers guarantee
 * that one of them is set; confirm.
 */
int
tcp_reass(struct tcpcb *tp, struct tcphdr *th, struct mbuf *m, int *tlen)
{
	struct ipqent *p, *q, *nq, *tiqe = NULL;
	struct socket *so = NULL;
	int pkt_flags;
	tcp_seq pkt_seq;
	unsigned pkt_len;
	u_long rcvpartdupbyte = 0;
	u_long rcvoobyte;
#ifdef TCP_REASS_COUNTERS
	u_int count = 0;
#endif

	/* Find the socket through whichever pcb this connection uses. */
	if (tp->t_inpcb)
		so = tp->t_inpcb->inp_socket;
#ifdef INET6
	else if (tp->t_in6pcb)
		so = tp->t_in6pcb->in6p_socket;
#endif

	TCP_REASS_LOCK_CHECK(tp);

	/*
	 * Call with th==0 after becoming established to
	 * force pre-ESTABLISHED data up to user socket.
	 */
	if (th == 0)
		goto present;

	rcvoobyte = *tlen;
	/*
	 * Copy these to local variables because the tcpiphdr
	 * gets munged while we are collapsing mbufs.
	 */
	pkt_seq = th->th_seq;
	pkt_len = *tlen;
	pkt_flags = th->th_flags;

	TCP_REASS_COUNTER_INCR(&tcp_reass_);

	if ((p = TAILQ_LAST(&tp->segq, ipqehead)) != NULL) {
		/*
		 * When we miss a packet, the vast majority of time we get
		 * packets that follow it in order.  So optimize for that.
		 */
		if (pkt_seq == p->ipqe_seq + p->ipqe_len) {
			p->ipqe_len += pkt_len;
			p->ipqe_flags |= pkt_flags;
			m_cat(p->ipre_mlast, m);
			TRAVERSE(p->ipre_mlast);
			/* The data now belongs to p's chain. */
			m = NULL;
			tiqe = p;
			/* Re-inserted at the head of timeq below. */
			TAILQ_REMOVE(&tp->timeq, p, ipqe_timeq);
			TCP_REASS_COUNTER_INCR(&tcp_reass_appendtail);
			goto skip_replacement;
		}
		/*
		 * While we're here, if the pkt is completely beyond
		 * anything we have, just insert it at the tail.
		 * (p is still the last entry, so the insert goes after it.)
		 */
		if (SEQ_GT(pkt_seq, p->ipqe_seq + p->ipqe_len)) {
			TCP_REASS_COUNTER_INCR(&tcp_reass_inserttail);
			goto insert_it;
		}
	}

	q = TAILQ_FIRST(&tp->segq);

	if (q != NULL) {
		/*
		 * If this segment immediately precedes the first out-of-order
		 * block, simply slap the segment in front of it and (mostly)
		 * skip the complicated logic.
		 */
		if (pkt_seq + pkt_len == q->ipqe_seq) {
			q->ipqe_seq = pkt_seq;
			q->ipqe_len += pkt_len;
			q->ipqe_flags |= pkt_flags;
			m_cat(m, q->ipqe_m);
			q->ipqe_m = m;
			q->ipre_mlast = m; /* last mbuf may have changed */
			TRAVERSE(q->ipre_mlast);
			tiqe = q;
			/* Re-inserted at the head of timeq below. */
			TAILQ_REMOVE(&tp->timeq, q, ipqe_timeq);
			TCP_REASS_COUNTER_INCR(&tcp_reass_prependfirst);
			goto skip_replacement;
		}
	} else {
		TCP_REASS_COUNTER_INCR(&tcp_reass_empty);
	}

	/*
	 * Find a segment which begins after this one does.
	 * nq is fetched first because q may be unlinked inside the loop.
	 */
	for (p = NULL; q != NULL; q = nq) {
		nq = TAILQ_NEXT(q, ipqe_q);
#ifdef TCP_REASS_COUNTERS
		count++;
#endif
		/*
		 * If the received segment is just right after this
		 * fragment, merge the two together and then check
		 * for further overlaps.
		 */
		if (q->ipqe_seq + q->ipqe_len == pkt_seq) {
#ifdef TCPREASS_DEBUG
			printf("tcp_reass[%p]: concat %u:%u(%u) to %u:%u(%u)\n",
			    tp, pkt_seq, pkt_seq + pkt_len, pkt_len,
			    q->ipqe_seq, q->ipqe_seq + q->ipqe_len, q->ipqe_len);
#endif
			pkt_len += q->ipqe_len;
			pkt_flags |= q->ipqe_flags;
			pkt_seq = q->ipqe_seq;
			m_cat(q->ipre_mlast, m);
			TRAVERSE(q->ipre_mlast);
			/* Carry the merged chain forward as the packet. */
			m = q->ipqe_m;
			TCP_REASS_COUNTER_INCR(&tcp_reass_append);
			goto free_ipqe;
		}
		/*
		 * If the received segment is completely past this
		 * fragment, we need to go the next fragment.
		 */
		if (SEQ_LT(q->ipqe_seq + q->ipqe_len, pkt_seq)) {
			p = q;
			continue;
		}
		/*
		 * If the fragment is past the received segment,
		 * it (or any following) can't be concatenated.
		 */
		if (SEQ_GT(q->ipqe_seq, pkt_seq + pkt_len)) {
			TCP_REASS_COUNTER_INCR(&tcp_reass_insert);
			break;
		}

		/*
		 * We've received all the data in this segment before.
		 * Mark it as a duplicate and return.
		 */
		if (SEQ_LEQ(q->ipqe_seq, pkt_seq) &&
		    SEQ_GEQ(q->ipqe_seq + q->ipqe_len, pkt_seq + pkt_len)) {
			tcpstat.tcps_rcvduppack++;
			tcpstat.tcps_rcvdupbyte += pkt_len;
			tcp_new_dsack(tp, pkt_seq, pkt_len);
			m_freem(m);
			/* Give back an entry recycled by an earlier merge. */
			if (tiqe != NULL)
				pool_put(&tcpipqent_pool, tiqe);
			TCP_REASS_COUNTER_INCR(&tcp_reass_segdup);
			return (0);
		}
		/*
		 * Received segment completely overlaps this fragment
		 * so we drop the fragment (this keeps the temporal
		 * ordering of segments correct).
		 */
		if (SEQ_GEQ(q->ipqe_seq, pkt_seq) &&
		    SEQ_LEQ(q->ipqe_seq + q->ipqe_len, pkt_seq + pkt_len)) {
			rcvpartdupbyte += q->ipqe_len;
			m_freem(q->ipqe_m);
			TCP_REASS_COUNTER_INCR(&tcp_reass_fragdup);
			goto free_ipqe;
		}
		/*
		 * RX'ed segment extends past the end of the
		 * fragment.  Drop the overlapping bytes.  Then
		 * merge the fragment and segment then treat as
		 * a longer received packet.
		 */
		if (SEQ_LT(q->ipqe_seq, pkt_seq) &&
		    SEQ_GT(q->ipqe_seq + q->ipqe_len, pkt_seq)) {
			int overlap = q->ipqe_seq + q->ipqe_len - pkt_seq;
#ifdef TCPREASS_DEBUG
			printf("tcp_reass[%p]: trim starting %d bytes of %u:%u(%u)\n",
			    tp, overlap,
			    pkt_seq, pkt_seq + pkt_len, pkt_len);
#endif
			/* Positive count: trim from the front of m. */
			m_adj(m, overlap);
			rcvpartdupbyte += overlap;
			m_cat(q->ipre_mlast, m);
			TRAVERSE(q->ipre_mlast);
			m = q->ipqe_m;
			pkt_seq = q->ipqe_seq;
			pkt_len += q->ipqe_len - overlap;
			rcvoobyte -= overlap;
			TCP_REASS_COUNTER_INCR(&tcp_reass_overlaptail);
			goto free_ipqe;
		}
		/*
		 * RX'ed segment extends past the front of the
		 * fragment.  Drop the overlapping bytes on the
		 * received packet.  The packet will then be
		 * concatenated with this fragment a bit later.
		 */
		if (SEQ_GT(q->ipqe_seq, pkt_seq) &&
		    SEQ_LT(q->ipqe_seq, pkt_seq + pkt_len)) {
			int overlap = pkt_seq + pkt_len - q->ipqe_seq;
#ifdef TCPREASS_DEBUG
			printf("tcp_reass[%p]: trim trailing %d bytes of %u:%u(%u)\n",
			    tp, overlap,
			    pkt_seq, pkt_seq + pkt_len, pkt_len);
#endif
			/* Negative count: trim from the tail of m. */
			m_adj(m, -overlap);
			pkt_len -= overlap;
			rcvpartdupbyte += overlap;
			TCP_REASS_COUNTER_INCR(&tcp_reass_overlapfront);
			rcvoobyte -= overlap;
		}
		/*
		 * If the received segment immediately precedes this
		 * fragment then tack the fragment onto this segment
		 * and reinsert the data.
		 */
		if (q->ipqe_seq == pkt_seq + pkt_len) {
#ifdef TCPREASS_DEBUG
			printf("tcp_reass[%p]: append %u:%u(%u) to %u:%u(%u)\n",
			    tp, q->ipqe_seq, q->ipqe_seq + q->ipqe_len, q->ipqe_len,
			    pkt_seq, pkt_seq + pkt_len, pkt_len);
#endif
			pkt_len += q->ipqe_len;
			pkt_flags |= q->ipqe_flags;
			m_cat(m, q->ipqe_m);
			TAILQ_REMOVE(&tp->segq, q, ipqe_q);
			TAILQ_REMOVE(&tp->timeq, q, ipqe_timeq);
			tp->t_segqlen--;
			KASSERT(tp->t_segqlen >= 0);
			KASSERT(tp->t_segqlen != 0 ||
			    (TAILQ_EMPTY(&tp->segq) &&
			    TAILQ_EMPTY(&tp->timeq)));
			/* Recycle q unless an entry is already held. */
			if (tiqe == NULL)
				tiqe = q;
			else
				pool_put(&tcpipqent_pool, q);
			TCP_REASS_COUNTER_INCR(&tcp_reass_prepend);
			break;
		}
		/*
		 * If the fragment is before the segment, remember it.
		 * When this loop is terminated, p will contain the
		 * pointer to fragment that is right before the received
		 * segment.
		 */
		if (SEQ_LEQ(q->ipqe_seq, pkt_seq))
			p = q;

		continue;

		/*
		 * This is a common operation.  It also will allow
		 * to save doing a malloc/free in most instances.
		 * Reached only by goto: unlink q from both queues, keep
		 * it as tiqe if none is held yet, then continue with nq.
		 */
	  free_ipqe:
		TAILQ_REMOVE(&tp->segq, q, ipqe_q);
		TAILQ_REMOVE(&tp->timeq, q, ipqe_timeq);
		tp->t_segqlen--;
		KASSERT(tp->t_segqlen >= 0);
		KASSERT(tp->t_segqlen != 0 ||
		    (TAILQ_EMPTY(&tp->segq) && TAILQ_EMPTY(&tp->timeq)));
		if (tiqe == NULL)
			tiqe = q;
		else
			pool_put(&tcpipqent_pool, q);
	}

#ifdef TCP_REASS_COUNTERS
	/*
	 * Histogram of loop iterations: slots 1..7 hold exact counts,
	 * slot 0 collects everything above 7.
	 */
	if (count > 7)
		TCP_REASS_COUNTER_INCR(&tcp_reass_iteration[0]);
	else if (count > 0)
		TCP_REASS_COUNTER_INCR(&tcp_reass_iteration[count]);
#endif

    insert_it:

	/*
	 * Allocate a new queue entry since the received segment did not
	 * collapse onto any other out-of-order block; thus we are allocating
	 * a new block.  If it had collapsed, tiqe would not be NULL and
	 * we would be reusing it.
	 * XXX If we can't, just drop the packet.  XXX
	 */
	if (tiqe == NULL) {
		tiqe = pool_get(&tcpipqent_pool, PR_NOWAIT);
		if (tiqe == NULL) {
			tcpstat.tcps_rcvmemdrop++;
			m_freem(m);
			return (0);
		}
	}

	/*
	 * Update the counters.
	 */
	tcpstat.tcps_rcvoopack++;
	tcpstat.tcps_rcvoobyte += rcvoobyte;
	if (rcvpartdupbyte) {
		tcpstat.tcps_rcvpartduppack++;
		tcpstat.tcps_rcvpartdupbyte += rcvpartdupbyte;
	}

	/*
	 * Insert the new fragment queue entry into both queues.
	 * p == NULL means no queued entry precedes the segment.
	 */
	tiqe->ipqe_m = m;
	tiqe->ipre_mlast = m;
	tiqe->ipqe_seq = pkt_seq;
	tiqe->ipqe_len = pkt_len;
	tiqe->ipqe_flags = pkt_flags;
	if (p == NULL) {
		TAILQ_INSERT_HEAD(&tp->segq, tiqe, ipqe_q);
#ifdef TCPREASS_DEBUG
		if (tiqe->ipqe_seq != tp->rcv_nxt)
			printf("tcp_reass[%p]: insert %u:%u(%u) at front\n",
			    tp, pkt_seq, pkt_seq + pkt_len, pkt_len);
#endif
	} else {
		TAILQ_INSERT_AFTER(&tp->segq, p, tiqe, ipqe_q);
#ifdef TCPREASS_DEBUG
		printf("tcp_reass[%p]: insert %u:%u(%u) after %u:%u(%u)\n",
		    tp, pkt_seq, pkt_seq + pkt_len, pkt_len,
		    p->ipqe_seq, p->ipqe_seq + p->ipqe_len, p->ipqe_len);
#endif
	}
	tp->t_segqlen++;

    skip_replacement:

	/* The entry that just received data goes to the head of timeq. */
	TAILQ_INSERT_HEAD(&tp->timeq, tiqe, ipqe_timeq);

    present:
	/*
	 * Present data to user, advancing rcv_nxt through
	 * completed sequence space.
	 */
	if (TCPS_HAVEESTABLISHED(tp->t_state) == 0)
		return (0);
	q = TAILQ_FIRST(&tp->segq);
	/* Nothing queued, or a hole remains before the first entry. */
	if (q == NULL || q->ipqe_seq != tp->rcv_nxt)
		return (0);
	/* Hold back non-empty data while still in SYN_RECEIVED. */
	if (tp->t_state == TCPS_SYN_RECEIVED && q->ipqe_len)
		return (0);

	tp->rcv_nxt += q->ipqe_len;
	/* Only the FIN bit is reported to the caller. */
	pkt_flags = q->ipqe_flags & TH_FIN;
	ND6_HINT(tp);

	TAILQ_REMOVE(&tp->segq, q, ipqe_q);
	TAILQ_REMOVE(&tp->timeq, q, ipqe_timeq);
	tp->t_segqlen--;
	KASSERT(tp->t_segqlen >= 0);
	KASSERT(tp->t_segqlen != 0 ||
	    (TAILQ_EMPTY(&tp->segq) && TAILQ_EMPTY(&tp->timeq)));
	/* Discard the data if the socket can no longer receive. */
	if (so->so_state & SS_CANTRCVMORE)
		m_freem(q->ipqe_m);
	else
		sbappendstream(&so->so_rcv, q->ipqe_m);
	pool_put(&tcpipqent_pool, q);
	sorwakeup(so);
	return (pkt_flags);
}
713
#ifdef INET6
/*
 * IPv6 entry point for TCP.
 *
 * Packets not flagged M_ANYCAST6 are passed to tcp_input() together with
 * *offp and proto.  Packets flagged M_ANYCAST6 are answered with an ICMPv6
 * "destination unreachable / address" error whose pointer argument is the
 * offset of ip6_dst within the IPv6 header
 * (draft-itojun-ipv6-tcp-to-anycast; better place to put this in?).
 *
 * Always returns IPPROTO_DONE.
 */
int
tcp6_input(struct mbuf **mp, int *offp, int proto)
{
	struct mbuf *pkt = *mp;
	struct ip6_hdr *hdr6;

	/* Common case first: not anycast, regular TCP input. */
	if ((pkt->m_flags & M_ANYCAST6) == 0) {
		tcp_input(pkt, *offp, proto);
		return IPPROTO_DONE;
	}

	/* The IPv6 header must be contiguous in the first mbuf. */
	if (pkt->m_len < sizeof(struct ip6_hdr)) {
		pkt = m_pullup(pkt, sizeof(struct ip6_hdr));
		if (pkt == NULL) {
			tcpstat.tcps_rcvshort++;
			return IPPROTO_DONE;
		}
	}

	hdr6 = mtod(pkt, struct ip6_hdr *);
	icmp6_error(pkt, ICMP6_DST_UNREACH, ICMP6_DST_UNREACH_ADDR,
	    (caddr_t)&hdr6->ip6_dst - (caddr_t)hdr6);
	return IPPROTO_DONE;
}
#endif
742
#ifdef INET
/*
 * Log, at LOG_INFO, a "Connection attempt to TCP dst:dport from src:sport"
 * line for an IPv4 segment.
 *
 * ip	IPv4 header, or NULL, in which case both addresses are printed as
 *	"(unknown)".
 * th	TCP header; th_dport and th_sport are passed through ntohs().
 */
static void
tcp4_log_refused(const struct ip *ip, const struct tcphdr *th)
{
	/* 4 * sizeof "123" is 16 bytes: "255.255.255.255" plus the NUL. */
	char srcbuf[4*sizeof "123"];
	char dstbuf[4*sizeof "123"];

	if (ip == NULL) {
		strlcpy(srcbuf, "(unknown)", sizeof(srcbuf));
		strlcpy(dstbuf, "(unknown)", sizeof(dstbuf));
	} else {
		/* Each inet_ntoa() result is copied out before the next call. */
		strlcpy(srcbuf, inet_ntoa(ip->ip_src), sizeof(srcbuf));
		strlcpy(dstbuf, inet_ntoa(ip->ip_dst), sizeof(dstbuf));
	}

	log(LOG_INFO, "Connection attempt to TCP %s:%d from %s:%d\n",
	    dstbuf, ntohs(th->th_dport), srcbuf, ntohs(th->th_sport));
}
#endif
764
#ifdef INET6
/*
 * Log, at LOG_INFO, a "Connection attempt to TCP [dst]:dport from
 * [src]:sport" line for an IPv6 segment.
 *
 * ip6	IPv6 header, or NULL, in which case both addresses are printed as
 *	"(unknown v6)".
 * th	TCP header; th_dport and th_sport are passed through ntohs().
 */
static void
tcp6_log_refused(const struct ip6_hdr *ip6, const struct tcphdr *th)
{
	char srcbuf[INET6_ADDRSTRLEN];
	char dstbuf[INET6_ADDRSTRLEN];

	if (ip6 == NULL) {
		strlcpy(srcbuf, "(unknown v6)", sizeof(srcbuf));
		strlcpy(dstbuf, "(unknown v6)", sizeof(dstbuf));
	} else {
		/* Each ip6_sprintf() result is copied out before the next call. */
		strlcpy(srcbuf, ip6_sprintf(&ip6->ip6_src), sizeof(srcbuf));
		strlcpy(dstbuf, ip6_sprintf(&ip6->ip6_dst), sizeof(dstbuf));
	}

	log(LOG_INFO, "Connection attempt to TCP [%s]:%d from [%s]:%d\n",
	    dstbuf, ntohs(th->th_dport), srcbuf, ntohs(th->th_sport));
}
#endif
786
787 /*
788 * Checksum extended TCP header and data.
789 */
790 int
791 tcp_input_checksum(int af, struct mbuf *m, const struct tcphdr *th, int toff,
792 int off, int tlen)
793 {
794
795 /*
796 * XXX it's better to record and check if this mbuf is
797 * already checked.
798 */
799
800 switch (af) {
801 #ifdef INET
802 case AF_INET:
803 switch (m->m_pkthdr.csum_flags &
804 ((m->m_pkthdr.rcvif->if_csum_flags_rx & M_CSUM_TCPv4) |
805 M_CSUM_TCP_UDP_BAD | M_CSUM_DATA)) {
806 case M_CSUM_TCPv4|M_CSUM_TCP_UDP_BAD:
807 TCP_CSUM_COUNTER_INCR(&tcp_hwcsum_bad);
808 goto badcsum;
809
810 case M_CSUM_TCPv4|M_CSUM_DATA: {
811 u_int32_t hw_csum = m->m_pkthdr.csum_data;
812
813 TCP_CSUM_COUNTER_INCR(&tcp_hwcsum_data);
814 if (m->m_pkthdr.csum_flags & M_CSUM_NO_PSEUDOHDR) {
815 const struct ip *ip =
816 mtod(m, const struct ip *);
817
818 hw_csum = in_cksum_phdr(ip->ip_src.s_addr,
819 ip->ip_dst.s_addr,
820 htons(hw_csum + tlen + off + IPPROTO_TCP));
821 }
822 if ((hw_csum ^ 0xffff) != 0)
823 goto badcsum;
824 break;
825 }
826
827 case M_CSUM_TCPv4:
828 /* Checksum was okay. */
829 TCP_CSUM_COUNTER_INCR(&tcp_hwcsum_ok);
830 break;
831
832 default:
833 /*
834 * Must compute it ourselves. Maybe skip checksum
835 * on loopback interfaces.
836 */
837 if (__predict_true(!(m->m_pkthdr.rcvif->if_flags &
838 IFF_LOOPBACK) ||
839 tcp_do_loopback_cksum)) {
840 TCP_CSUM_COUNTER_INCR(&tcp_swcsum);
841 if (in4_cksum(m, IPPROTO_TCP, toff,
842 tlen + off) != 0)
843 goto badcsum;
844 }
845 break;
846 }
847 break;
848 #endif /* INET4 */
849
850 #ifdef INET6
851 case AF_INET6:
852 if (__predict_true((m->m_flags & M_LOOP) == 0 ||
853 tcp_do_loopback_cksum)) {
854 if (in6_cksum(m, IPPROTO_TCP, toff, tlen + off) != 0)
855 goto badcsum;
856 }
857 break;
858 #endif /* INET6 */
859 }
860
861 return 0;
862
863 badcsum:
864 tcpstat.tcps_rcvbadsum++;
865 return -1;
866 }
867
868 /*
869 * TCP input routine, follows pages 65-76 of the
870 * protocol specification dated September, 1981 very closely.
871 */
872 void
873 tcp_input(struct mbuf *m, ...)
874 {
875 struct tcphdr *th;
876 struct ip *ip;
877 struct inpcb *inp;
878 #ifdef INET6
879 struct ip6_hdr *ip6;
880 struct in6pcb *in6p;
881 #endif
882 u_int8_t *optp = NULL;
883 int optlen = 0;
884 int len, tlen, toff, hdroptlen = 0;
885 struct tcpcb *tp = 0;
886 int tiflags;
887 struct socket *so = NULL;
888 int todrop, dupseg, acked, ourfinisacked, needoutput = 0;
889 #ifdef TCP_DEBUG
890 short ostate = 0;
891 #endif
892 int iss = 0;
893 u_long tiwin;
894 struct tcp_opt_info opti;
895 int off, iphlen;
896 va_list ap;
897 int af; /* af on the wire */
898 struct mbuf *tcp_saveti = NULL;
899
900 MCLAIM(m, &tcp_rx_mowner);
901 va_start(ap, m);
902 toff = va_arg(ap, int);
903 (void)va_arg(ap, int); /* ignore value, advance ap */
904 va_end(ap);
905
906 tcpstat.tcps_rcvtotal++;
907
908 bzero(&opti, sizeof(opti));
909 opti.ts_present = 0;
910 opti.maxseg = 0;
911
912 /*
913 * RFC1122 4.2.3.10, p. 104: discard bcast/mcast SYN.
914 *
915 * TCP is, by definition, unicast, so we reject all
916 * multicast outright.
917 *
918 * Note, there are additional src/dst address checks in
919 * the AF-specific code below.
920 */
921 if (m->m_flags & (M_BCAST|M_MCAST)) {
922 /* XXX stat */
923 goto drop;
924 }
925 #ifdef INET6
926 if (m->m_flags & M_ANYCAST6) {
927 /* XXX stat */
928 goto drop;
929 }
930 #endif
931
932 /*
933 * Get IP and TCP header.
934 * Note: IP leaves IP header in first mbuf.
935 */
936 ip = mtod(m, struct ip *);
937 #ifdef INET6
938 ip6 = NULL;
939 #endif
940 switch (ip->ip_v) {
941 #ifdef INET
942 case 4:
943 af = AF_INET;
944 iphlen = sizeof(struct ip);
945 ip = mtod(m, struct ip *);
946 IP6_EXTHDR_GET(th, struct tcphdr *, m, toff,
947 sizeof(struct tcphdr));
948 if (th == NULL) {
949 tcpstat.tcps_rcvshort++;
950 return;
951 }
952 /* We do the checksum after PCB lookup... */
953 len = ntohs(ip->ip_len);
954 tlen = len - toff;
955 break;
956 #endif
957 #ifdef INET6
958 case 6:
959 ip = NULL;
960 iphlen = sizeof(struct ip6_hdr);
961 af = AF_INET6;
962 ip6 = mtod(m, struct ip6_hdr *);
963 IP6_EXTHDR_GET(th, struct tcphdr *, m, toff,
964 sizeof(struct tcphdr));
965 if (th == NULL) {
966 tcpstat.tcps_rcvshort++;
967 return;
968 }
969
970 /* Be proactive about malicious use of IPv4 mapped address */
971 if (IN6_IS_ADDR_V4MAPPED(&ip6->ip6_src) ||
972 IN6_IS_ADDR_V4MAPPED(&ip6->ip6_dst)) {
973 /* XXX stat */
974 goto drop;
975 }
976
977 /*
978 * Be proactive about unspecified IPv6 address in source.
979 * As we use all-zero to indicate unbounded/unconnected pcb,
980 * unspecified IPv6 address can be used to confuse us.
981 *
982 * Note that packets with unspecified IPv6 destination is
983 * already dropped in ip6_input.
984 */
985 if (IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src)) {
986 /* XXX stat */
987 goto drop;
988 }
989
990 /*
991 * Make sure destination address is not multicast.
992 * Source address checked in ip6_input().
993 */
994 if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) {
995 /* XXX stat */
996 goto drop;
997 }
998
999 /* We do the checksum after PCB lookup... */
1000 len = m->m_pkthdr.len;
1001 tlen = len - toff;
1002 break;
1003 #endif
1004 default:
1005 m_freem(m);
1006 return;
1007 }
1008
1009 KASSERT(TCP_HDR_ALIGNED_P(th));
1010
1011 /*
1012 * Check that TCP offset makes sense,
1013 * pull out TCP options and adjust length. XXX
1014 */
1015 off = th->th_off << 2;
1016 if (off < sizeof (struct tcphdr) || off > tlen) {
1017 tcpstat.tcps_rcvbadoff++;
1018 goto drop;
1019 }
1020 tlen -= off;
1021
1022 /*
1023 * tcp_input() has been modified to use tlen to mean the TCP data
1024 * length throughout the function. Other functions can use
1025 * m->m_pkthdr.len as the basis for calculating the TCP data length.
1026 * rja
1027 */
1028
1029 if (off > sizeof (struct tcphdr)) {
1030 IP6_EXTHDR_GET(th, struct tcphdr *, m, toff, off);
1031 if (th == NULL) {
1032 tcpstat.tcps_rcvshort++;
1033 return;
1034 }
1035 /*
1036 * NOTE: ip/ip6 will not be affected by m_pulldown()
1037 * (as they're before toff) and we don't need to update those.
1038 */
1039 KASSERT(TCP_HDR_ALIGNED_P(th));
1040 optlen = off - sizeof (struct tcphdr);
1041 optp = ((u_int8_t *)th) + sizeof(struct tcphdr);
1042 /*
1043 * Do quick retrieval of timestamp options ("options
1044 * prediction?"). If timestamp is the only option and it's
1045 * formatted as recommended in RFC 1323 appendix A, we
1046 * quickly get the values now and not bother calling
1047 * tcp_dooptions(), etc.
1048 */
1049 if ((optlen == TCPOLEN_TSTAMP_APPA ||
1050 (optlen > TCPOLEN_TSTAMP_APPA &&
1051 optp[TCPOLEN_TSTAMP_APPA] == TCPOPT_EOL)) &&
1052 *(u_int32_t *)optp == htonl(TCPOPT_TSTAMP_HDR) &&
1053 (th->th_flags & TH_SYN) == 0) {
1054 opti.ts_present = 1;
1055 opti.ts_val = ntohl(*(u_int32_t *)(optp + 4));
1056 opti.ts_ecr = ntohl(*(u_int32_t *)(optp + 8));
1057 optp = NULL; /* we've parsed the options */
1058 }
1059 }
1060 tiflags = th->th_flags;
1061
1062 /*
1063 * Locate pcb for segment.
1064 */
1065 findpcb:
1066 inp = NULL;
1067 #ifdef INET6
1068 in6p = NULL;
1069 #endif
1070 switch (af) {
1071 #ifdef INET
1072 case AF_INET:
1073 inp = in_pcblookup_connect(&tcbtable, ip->ip_src, th->th_sport,
1074 ip->ip_dst, th->th_dport);
1075 if (inp == 0) {
1076 ++tcpstat.tcps_pcbhashmiss;
1077 inp = in_pcblookup_bind(&tcbtable, ip->ip_dst, th->th_dport);
1078 }
1079 #ifdef INET6
1080 if (inp == 0) {
1081 struct in6_addr s, d;
1082
1083 /* mapped addr case */
1084 bzero(&s, sizeof(s));
1085 s.s6_addr16[5] = htons(0xffff);
1086 bcopy(&ip->ip_src, &s.s6_addr32[3], sizeof(ip->ip_src));
1087 bzero(&d, sizeof(d));
1088 d.s6_addr16[5] = htons(0xffff);
1089 bcopy(&ip->ip_dst, &d.s6_addr32[3], sizeof(ip->ip_dst));
1090 in6p = in6_pcblookup_connect(&tcbtable, &s,
1091 th->th_sport, &d, th->th_dport, 0);
1092 if (in6p == 0) {
1093 ++tcpstat.tcps_pcbhashmiss;
1094 in6p = in6_pcblookup_bind(&tcbtable, &d,
1095 th->th_dport, 0);
1096 }
1097 }
1098 #endif
1099 #ifndef INET6
1100 if (inp == 0)
1101 #else
1102 if (inp == 0 && in6p == 0)
1103 #endif
1104 {
1105 ++tcpstat.tcps_noport;
1106 if (tcp_log_refused &&
1107 (tiflags & (TH_RST|TH_ACK|TH_SYN)) == TH_SYN) {
1108 tcp4_log_refused(ip, th);
1109 }
1110 TCP_FIELDS_TO_HOST(th);
1111 goto dropwithreset_ratelim;
1112 }
1113 #if defined(IPSEC) || defined(FAST_IPSEC)
1114 if (inp && (inp->inp_socket->so_options & SO_ACCEPTCONN) == 0 &&
1115 ipsec4_in_reject(m, inp)) {
1116 ipsecstat.in_polvio++;
1117 goto drop;
1118 }
1119 #ifdef INET6
1120 else if (in6p &&
1121 (in6p->in6p_socket->so_options & SO_ACCEPTCONN) == 0 &&
1122 ipsec4_in_reject_so(m, in6p->in6p_socket)) {
1123 ipsecstat.in_polvio++;
1124 goto drop;
1125 }
1126 #endif
1127 #endif /*IPSEC*/
1128 break;
1129 #endif /*INET*/
1130 #ifdef INET6
1131 case AF_INET6:
1132 {
1133 int faith;
1134
1135 #if defined(NFAITH) && NFAITH > 0
1136 faith = faithprefix(&ip6->ip6_dst);
1137 #else
1138 faith = 0;
1139 #endif
1140 in6p = in6_pcblookup_connect(&tcbtable, &ip6->ip6_src,
1141 th->th_sport, &ip6->ip6_dst, th->th_dport, faith);
1142 if (in6p == NULL) {
1143 ++tcpstat.tcps_pcbhashmiss;
1144 in6p = in6_pcblookup_bind(&tcbtable, &ip6->ip6_dst,
1145 th->th_dport, faith);
1146 }
1147 if (in6p == NULL) {
1148 ++tcpstat.tcps_noport;
1149 if (tcp_log_refused &&
1150 (tiflags & (TH_RST|TH_ACK|TH_SYN)) == TH_SYN) {
1151 tcp6_log_refused(ip6, th);
1152 }
1153 TCP_FIELDS_TO_HOST(th);
1154 goto dropwithreset_ratelim;
1155 }
1156 #if defined(IPSEC) || defined(FAST_IPSEC)
1157 if ((in6p->in6p_socket->so_options & SO_ACCEPTCONN) == 0 &&
1158 ipsec6_in_reject(m, in6p)) {
1159 ipsec6stat.in_polvio++;
1160 goto drop;
1161 }
1162 #endif /*IPSEC*/
1163 break;
1164 }
1165 #endif
1166 }
1167
1168 /*
1169 * If the state is CLOSED (i.e., TCB does not exist) then
1170 * all data in the incoming segment is discarded.
1171 * If the TCB exists but is in CLOSED state, it is embryonic,
1172 * but should either do a listen or a connect soon.
1173 */
1174 tp = NULL;
1175 so = NULL;
1176 if (inp) {
1177 tp = intotcpcb(inp);
1178 so = inp->inp_socket;
1179 }
1180 #ifdef INET6
1181 else if (in6p) {
1182 tp = in6totcpcb(in6p);
1183 so = in6p->in6p_socket;
1184 }
1185 #endif
1186 if (tp == 0) {
1187 TCP_FIELDS_TO_HOST(th);
1188 goto dropwithreset_ratelim;
1189 }
1190 if (tp->t_state == TCPS_CLOSED)
1191 goto drop;
1192
1193 /*
1194 * Checksum extended TCP header and data.
1195 */
1196 if (tcp_input_checksum(af, m, th, toff, off, tlen))
1197 goto badcsum;
1198
1199 TCP_FIELDS_TO_HOST(th);
1200
1201 /* Unscale the window into a 32-bit value. */
1202 if ((tiflags & TH_SYN) == 0)
1203 tiwin = th->th_win << tp->snd_scale;
1204 else
1205 tiwin = th->th_win;
1206
1207 #ifdef INET6
1208 /* save packet options if user wanted */
1209 if (in6p && (in6p->in6p_flags & IN6P_CONTROLOPTS)) {
1210 if (in6p->in6p_options) {
1211 m_freem(in6p->in6p_options);
1212 in6p->in6p_options = 0;
1213 }
1214 ip6_savecontrol(in6p, &in6p->in6p_options, ip6, m);
1215 }
1216 #endif
1217
1218 if (so->so_options & (SO_DEBUG|SO_ACCEPTCONN)) {
1219 union syn_cache_sa src;
1220 union syn_cache_sa dst;
1221
1222 bzero(&src, sizeof(src));
1223 bzero(&dst, sizeof(dst));
1224 switch (af) {
1225 #ifdef INET
1226 case AF_INET:
1227 src.sin.sin_len = sizeof(struct sockaddr_in);
1228 src.sin.sin_family = AF_INET;
1229 src.sin.sin_addr = ip->ip_src;
1230 src.sin.sin_port = th->th_sport;
1231
1232 dst.sin.sin_len = sizeof(struct sockaddr_in);
1233 dst.sin.sin_family = AF_INET;
1234 dst.sin.sin_addr = ip->ip_dst;
1235 dst.sin.sin_port = th->th_dport;
1236 break;
1237 #endif
1238 #ifdef INET6
1239 case AF_INET6:
1240 src.sin6.sin6_len = sizeof(struct sockaddr_in6);
1241 src.sin6.sin6_family = AF_INET6;
1242 src.sin6.sin6_addr = ip6->ip6_src;
1243 src.sin6.sin6_port = th->th_sport;
1244
1245 dst.sin6.sin6_len = sizeof(struct sockaddr_in6);
1246 dst.sin6.sin6_family = AF_INET6;
1247 dst.sin6.sin6_addr = ip6->ip6_dst;
1248 dst.sin6.sin6_port = th->th_dport;
1249 break;
1250 #endif /* INET6 */
1251 default:
1252 goto badsyn; /*sanity*/
1253 }
1254
1255 if (so->so_options & SO_DEBUG) {
1256 #ifdef TCP_DEBUG
1257 ostate = tp->t_state;
1258 #endif
1259
1260 tcp_saveti = NULL;
1261 if (iphlen + sizeof(struct tcphdr) > MHLEN)
1262 goto nosave;
1263
1264 if (m->m_len > iphlen && (m->m_flags & M_EXT) == 0) {
1265 tcp_saveti = m_copym(m, 0, iphlen, M_DONTWAIT);
1266 if (!tcp_saveti)
1267 goto nosave;
1268 } else {
1269 MGETHDR(tcp_saveti, M_DONTWAIT, MT_HEADER);
1270 if (!tcp_saveti)
1271 goto nosave;
1272 MCLAIM(m, &tcp_mowner);
1273 tcp_saveti->m_len = iphlen;
1274 m_copydata(m, 0, iphlen,
1275 mtod(tcp_saveti, caddr_t));
1276 }
1277
1278 if (M_TRAILINGSPACE(tcp_saveti) < sizeof(struct tcphdr)) {
1279 m_freem(tcp_saveti);
1280 tcp_saveti = NULL;
1281 } else {
1282 tcp_saveti->m_len += sizeof(struct tcphdr);
1283 bcopy(th, mtod(tcp_saveti, caddr_t) + iphlen,
1284 sizeof(struct tcphdr));
1285 }
1286 nosave:;
1287 }
1288 if (so->so_options & SO_ACCEPTCONN) {
1289 if ((tiflags & (TH_RST|TH_ACK|TH_SYN)) != TH_SYN) {
1290 if (tiflags & TH_RST) {
1291 syn_cache_reset(&src.sa, &dst.sa, th);
1292 } else if ((tiflags & (TH_ACK|TH_SYN)) ==
1293 (TH_ACK|TH_SYN)) {
1294 /*
1295 * Received a SYN,ACK. This should
1296 * never happen while we are in
1297 * LISTEN. Send an RST.
1298 */
1299 goto badsyn;
1300 } else if (tiflags & TH_ACK) {
1301 so = syn_cache_get(&src.sa, &dst.sa,
1302 th, toff, tlen, so, m);
1303 if (so == NULL) {
1304 /*
1305 * We don't have a SYN for
1306 * this ACK; send an RST.
1307 */
1308 goto badsyn;
1309 } else if (so ==
1310 (struct socket *)(-1)) {
1311 /*
1312 * We were unable to create
1313 * the connection. If the
1314 * 3-way handshake was
1315 * completed, an RST has
1316 * been sent to the peer.
1317 * Since the mbuf might be
1318 * in use for the reply,
1319 * do not free it.
1320 */
1321 m = NULL;
1322 } else {
1323 /*
1324 * We have created a
1325 * full-blown connection.
1326 */
1327 tp = NULL;
1328 inp = NULL;
1329 #ifdef INET6
1330 in6p = NULL;
1331 #endif
1332 switch (so->so_proto->pr_domain->dom_family) {
1333 #ifdef INET
1334 case AF_INET:
1335 inp = sotoinpcb(so);
1336 tp = intotcpcb(inp);
1337 break;
1338 #endif
1339 #ifdef INET6
1340 case AF_INET6:
1341 in6p = sotoin6pcb(so);
1342 tp = in6totcpcb(in6p);
1343 break;
1344 #endif
1345 }
1346 if (tp == NULL)
1347 goto badsyn; /*XXX*/
1348 tiwin <<= tp->snd_scale;
1349 goto after_listen;
1350 }
1351 } else {
1352 /*
1353 * None of RST, SYN or ACK was set.
1354 * This is an invalid packet for a
1355 * TCB in LISTEN state. Send a RST.
1356 */
1357 goto badsyn;
1358 }
1359 } else {
1360 /*
1361 * Received a SYN.
1362 */
1363
1364 #ifdef INET6
1365 /*
1366 * If deprecated address is forbidden, we do
1367 * not accept SYN to deprecated interface
1368 * address to prevent any new inbound
1369 * connection from getting established.
1370 * When we do not accept SYN, we send a TCP
1371 * RST, with deprecated source address (instead
1372 * of dropping it). We compromise it as it is
1373 * much better for peer to send a RST, and
1374 * RST will be the final packet for the
1375 * exchange.
1376 *
1377 * If we do not forbid deprecated addresses, we
1378 * accept the SYN packet. RFC2462 does not
1379 * suggest dropping SYN in this case.
1380 * If we decipher RFC2462 5.5.4, it says like
1381 * this:
1382 * 1. use of deprecated addr with existing
1383 * communication is okay - "SHOULD continue
1384 * to be used"
1385 * 2. use of it with new communication:
1386 * (2a) "SHOULD NOT be used if alternate
1387 * address with sufficient scope is
1388 * available"
1389 * (2b) nothing mentioned otherwise.
1390 * Here we fall into (2b) case as we have no
1391 * choice in our source address selection - we
1392 * must obey the peer.
1393 *
1394 * The wording in RFC2462 is confusing, and
1395 * there are multiple description text for
1396 * deprecated address handling - worse, they
1397 * are not exactly the same. I believe 5.5.4
1398 * is the best one, so we follow 5.5.4.
1399 */
1400 if (af == AF_INET6 && !ip6_use_deprecated) {
1401 struct in6_ifaddr *ia6;
1402 if ((ia6 = in6ifa_ifpwithaddr(m->m_pkthdr.rcvif,
1403 &ip6->ip6_dst)) &&
1404 (ia6->ia6_flags & IN6_IFF_DEPRECATED)) {
1405 tp = NULL;
1406 goto dropwithreset;
1407 }
1408 }
1409 #endif
1410
1411 #ifdef IPSEC
1412 switch (af) {
1413 #ifdef INET
1414 case AF_INET:
1415 if (ipsec4_in_reject_so(m, so)) {
1416 ipsecstat.in_polvio++;
1417 tp = NULL;
1418 goto dropwithreset;
1419 }
1420 break;
1421 #endif
1422 #ifdef INET6
1423 case AF_INET6:
1424 if (ipsec6_in_reject_so(m, so)) {
1425 ipsec6stat.in_polvio++;
1426 tp = NULL;
1427 goto dropwithreset;
1428 }
1429 break;
1430 #endif
1431 }
1432 #endif
1433
1434 /*
1435 * LISTEN socket received a SYN
1436 * from itself? This can't possibly
1437 * be valid; drop the packet.
1438 */
1439 if (th->th_sport == th->th_dport) {
1440 int i;
1441
1442 switch (af) {
1443 #ifdef INET
1444 case AF_INET:
1445 i = in_hosteq(ip->ip_src, ip->ip_dst);
1446 break;
1447 #endif
1448 #ifdef INET6
1449 case AF_INET6:
1450 i = IN6_ARE_ADDR_EQUAL(&ip6->ip6_src, &ip6->ip6_dst);
1451 break;
1452 #endif
1453 default:
1454 i = 1;
1455 }
1456 if (i) {
1457 tcpstat.tcps_badsyn++;
1458 goto drop;
1459 }
1460 }
1461
1462 /*
1463 * SYN looks ok; create compressed TCP
1464 * state for it.
1465 */
1466 if (so->so_qlen <= so->so_qlimit &&
1467 syn_cache_add(&src.sa, &dst.sa, th, tlen,
1468 so, m, optp, optlen, &opti))
1469 m = NULL;
1470 }
1471 goto drop;
1472 }
1473 }
1474
1475 after_listen:
1476 #ifdef DIAGNOSTIC
1477 /*
1478 * Should not happen now that all embryonic connections
1479 * are handled with compressed state.
1480 */
1481 if (tp->t_state == TCPS_LISTEN)
1482 panic("tcp_input: TCPS_LISTEN");
1483 #endif
1484
1485 /*
1486 * Segment received on connection.
1487 * Reset idle time and keep-alive timer.
1488 */
1489 tp->t_rcvtime = tcp_now;
1490 if (TCPS_HAVEESTABLISHED(tp->t_state))
1491 TCP_TIMER_ARM(tp, TCPT_KEEP, tcp_keepidle);
1492
1493 /*
1494 * Process options.
1495 */
1496 #ifdef TCP_SIGNATURE
1497 if (optp || (tp->t_flags & TF_SIGNATURE))
1498 #else
1499 if (optp)
1500 #endif
1501 if (tcp_dooptions(tp, optp, optlen, th, m, toff, &opti) < 0)
1502 goto drop;
1503
1504 if (TCP_SACK_ENABLED(tp)) {
1505 tcp_del_sackholes(tp, th);
1506 }
1507
1508 if (opti.ts_present && opti.ts_ecr) {
1509 /*
1510 * Calculate the RTT from the returned time stamp and the
1511 * connection's time base. If the time stamp is later than
1512 * the current time, or is extremely old, fall back to non-1323
1513 * RTT calculation. Since ts_ecr is unsigned, we can test both
1514 * at the same time.
1515 */
1516 opti.ts_ecr = TCP_TIMESTAMP(tp) - opti.ts_ecr + 1;
1517 if (opti.ts_ecr > TCP_PAWS_IDLE)
1518 opti.ts_ecr = 0;
1519 }
1520
1521 /*
1522 * Header prediction: check for the two common cases
1523 * of a uni-directional data xfer. If the packet has
1524 * no control flags, is in-sequence, the window didn't
1525 * change and we're not retransmitting, it's a
1526 * candidate. If the length is zero and the ack moved
1527 * forward, we're the sender side of the xfer. Just
1528 * free the data acked & wake any higher level process
1529 * that was blocked waiting for space. If the length
1530 * is non-zero and the ack didn't move, we're the
1531 * receiver side. If we're getting packets in-order
1532 * (the reassembly queue is empty), add the data to
1533 * the socket buffer and note that we need a delayed ack.
1534 */
1535 if (tp->t_state == TCPS_ESTABLISHED &&
1536 (tiflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK)) == TH_ACK &&
1537 (!opti.ts_present || TSTMP_GEQ(opti.ts_val, tp->ts_recent)) &&
1538 th->th_seq == tp->rcv_nxt &&
1539 tiwin && tiwin == tp->snd_wnd &&
1540 tp->snd_nxt == tp->snd_max) {
1541
1542 /*
1543 * If last ACK falls within this segment's sequence numbers,
1544 * record the timestamp.
1545 */
1546 if (opti.ts_present &&
1547 SEQ_LEQ(th->th_seq, tp->last_ack_sent) &&
1548 SEQ_LT(tp->last_ack_sent, th->th_seq + tlen)) {
1549 tp->ts_recent_age = tcp_now;
1550 tp->ts_recent = opti.ts_val;
1551 }
1552
1553 if (tlen == 0) {
1554 /* Ack prediction. */
1555 if (SEQ_GT(th->th_ack, tp->snd_una) &&
1556 SEQ_LEQ(th->th_ack, tp->snd_max) &&
1557 tp->snd_cwnd >= tp->snd_wnd &&
1558 tp->t_partialacks < 0) {
1559 /*
1560 * this is a pure ack for outstanding data.
1561 */
1562 ++tcpstat.tcps_predack;
1563 if (opti.ts_present && opti.ts_ecr)
1564 tcp_xmit_timer(tp, opti.ts_ecr);
1565 else if (tp->t_rtttime &&
1566 SEQ_GT(th->th_ack, tp->t_rtseq))
1567 tcp_xmit_timer(tp,
1568 tcp_now - tp->t_rtttime);
1569 acked = th->th_ack - tp->snd_una;
1570 tcpstat.tcps_rcvackpack++;
1571 tcpstat.tcps_rcvackbyte += acked;
1572 ND6_HINT(tp);
1573
1574 if (acked > (tp->t_lastoff - tp->t_inoff))
1575 tp->t_lastm = NULL;
1576 sbdrop(&so->so_snd, acked);
1577 tp->t_lastoff -= acked;
1578
1579 tp->snd_una = th->th_ack;
1580 tp->snd_fack = tp->snd_una;
1581 if (SEQ_LT(tp->snd_high, tp->snd_una))
1582 tp->snd_high = tp->snd_una;
1583 m_freem(m);
1584
1585 /*
1586 * If all outstanding data are acked, stop
1587 * retransmit timer, otherwise restart timer
1588 * using current (possibly backed-off) value.
1589 * If process is waiting for space,
1590 * wakeup/selwakeup/signal. If data
1591 * are ready to send, let tcp_output
1592 * decide between more output or persist.
1593 */
1594 if (tp->snd_una == tp->snd_max)
1595 TCP_TIMER_DISARM(tp, TCPT_REXMT);
1596 else if (TCP_TIMER_ISARMED(tp,
1597 TCPT_PERSIST) == 0)
1598 TCP_TIMER_ARM(tp, TCPT_REXMT,
1599 tp->t_rxtcur);
1600
1601 sowwakeup(so);
1602 if (so->so_snd.sb_cc)
1603 (void) tcp_output(tp);
1604 if (tcp_saveti)
1605 m_freem(tcp_saveti);
1606 return;
1607 }
1608 } else if (th->th_ack == tp->snd_una &&
1609 TAILQ_FIRST(&tp->segq) == NULL &&
1610 tlen <= sbspace(&so->so_rcv)) {
1611 /*
1612 * this is a pure, in-sequence data packet
1613 * with nothing on the reassembly queue and
1614 * we have enough buffer space to take it.
1615 */
1616 ++tcpstat.tcps_preddat;
1617 tp->rcv_nxt += tlen;
1618 tcpstat.tcps_rcvpack++;
1619 tcpstat.tcps_rcvbyte += tlen;
1620 ND6_HINT(tp);
1621 /*
1622 * Drop TCP, IP headers and TCP options then add data
1623 * to socket buffer.
1624 */
1625 if (so->so_state & SS_CANTRCVMORE)
1626 m_freem(m);
1627 else {
1628 m_adj(m, toff + off);
1629 sbappendstream(&so->so_rcv, m);
1630 }
1631 sorwakeup(so);
1632 TCP_SETUP_ACK(tp, th);
1633 if (tp->t_flags & TF_ACKNOW)
1634 (void) tcp_output(tp);
1635 if (tcp_saveti)
1636 m_freem(tcp_saveti);
1637 return;
1638 }
1639 }
1640
1641 /*
1642 * Compute mbuf offset to TCP data segment.
1643 */
1644 hdroptlen = toff + off;
1645
1646 /*
1647 * Calculate amount of space in receive window,
1648 * and then do TCP input processing.
1649 * Receive window is amount of space in rcv queue,
1650 * but not less than advertised window.
1651 */
1652 { int win;
1653
1654 win = sbspace(&so->so_rcv);
1655 if (win < 0)
1656 win = 0;
1657 tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt));
1658 }
1659
1660 switch (tp->t_state) {
1661 case TCPS_LISTEN:
1662 /*
1663 * RFC1122 4.2.3.10, p. 104: discard bcast/mcast SYN
1664 */
1665 if (m->m_flags & (M_BCAST|M_MCAST))
1666 goto drop;
1667 switch (af) {
1668 #ifdef INET6
1669 case AF_INET6:
1670 if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst))
1671 goto drop;
1672 break;
1673 #endif /* INET6 */
1674 case AF_INET:
1675 if (IN_MULTICAST(ip->ip_dst.s_addr) ||
1676 in_broadcast(ip->ip_dst, m->m_pkthdr.rcvif))
1677 goto drop;
1678 break;
1679 }
1680 break;
1681
1682 /*
1683 * If the state is SYN_SENT:
1684 * if seg contains an ACK, but not for our SYN, drop the input.
1685 * if seg contains a RST, then drop the connection.
1686 * if seg does not contain SYN, then drop it.
1687 * Otherwise this is an acceptable SYN segment
1688 * initialize tp->rcv_nxt and tp->irs
1689 * if seg contains ack then advance tp->snd_una
1690 * if SYN has been acked change to ESTABLISHED else SYN_RCVD state
1691 * arrange for segment to be acked (eventually)
1692 * continue processing rest of data/controls, beginning with URG
1693 */
1694 case TCPS_SYN_SENT:
1695 if ((tiflags & TH_ACK) &&
1696 (SEQ_LEQ(th->th_ack, tp->iss) ||
1697 SEQ_GT(th->th_ack, tp->snd_max)))
1698 goto dropwithreset;
1699 if (tiflags & TH_RST) {
1700 if (tiflags & TH_ACK)
1701 tp = tcp_drop(tp, ECONNREFUSED);
1702 goto drop;
1703 }
1704 if ((tiflags & TH_SYN) == 0)
1705 goto drop;
1706 if (tiflags & TH_ACK) {
1707 tp->snd_una = th->th_ack;
1708 if (SEQ_LT(tp->snd_nxt, tp->snd_una))
1709 tp->snd_nxt = tp->snd_una;
1710 if (SEQ_LT(tp->snd_high, tp->snd_una))
1711 tp->snd_high = tp->snd_una;
1712 TCP_TIMER_DISARM(tp, TCPT_REXMT);
1713 }
1714 tp->irs = th->th_seq;
1715 tcp_rcvseqinit(tp);
1716 tp->t_flags |= TF_ACKNOW;
1717 tcp_mss_from_peer(tp, opti.maxseg);
1718
1719 /*
1720 * Initialize the initial congestion window. If we
1721 * had to retransmit the SYN, we must initialize cwnd
1722 * to 1 segment (i.e. the Loss Window).
1723 */
1724 if (tp->t_flags & TF_SYN_REXMT)
1725 tp->snd_cwnd = tp->t_peermss;
1726 else {
1727 int ss = tcp_init_win;
1728 #ifdef INET
1729 if (inp != NULL && in_localaddr(inp->inp_faddr))
1730 ss = tcp_init_win_local;
1731 #endif
1732 #ifdef INET6
1733 if (in6p != NULL && in6_localaddr(&in6p->in6p_faddr))
1734 ss = tcp_init_win_local;
1735 #endif
1736 tp->snd_cwnd = TCP_INITIAL_WINDOW(ss, tp->t_peermss);
1737 }
1738
1739 tcp_rmx_rtt(tp);
1740 if (tiflags & TH_ACK) {
1741 tcpstat.tcps_connects++;
1742 soisconnected(so);
1743 tcp_established(tp);
1744 /* Do window scaling on this connection? */
1745 if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
1746 (TF_RCVD_SCALE|TF_REQ_SCALE)) {
1747 tp->snd_scale = tp->requested_s_scale;
1748 tp->rcv_scale = tp->request_r_scale;
1749 }
1750 TCP_REASS_LOCK(tp);
1751 (void) tcp_reass(tp, NULL, (struct mbuf *)0, &tlen);
1752 TCP_REASS_UNLOCK(tp);
1753 /*
1754 * if we didn't have to retransmit the SYN,
1755 * use its rtt as our initial srtt & rtt var.
1756 */
1757 if (tp->t_rtttime)
1758 tcp_xmit_timer(tp, tcp_now - tp->t_rtttime);
1759 } else
1760 tp->t_state = TCPS_SYN_RECEIVED;
1761
1762 /*
1763 * Advance th->th_seq to correspond to first data byte.
1764 * If data, trim to stay within window,
1765 * dropping FIN if necessary.
1766 */
1767 th->th_seq++;
1768 if (tlen > tp->rcv_wnd) {
1769 todrop = tlen - tp->rcv_wnd;
1770 m_adj(m, -todrop);
1771 tlen = tp->rcv_wnd;
1772 tiflags &= ~TH_FIN;
1773 tcpstat.tcps_rcvpackafterwin++;
1774 tcpstat.tcps_rcvbyteafterwin += todrop;
1775 }
1776 tp->snd_wl1 = th->th_seq - 1;
1777 tp->rcv_up = th->th_seq;
1778 goto step6;
1779
1780 /*
1781 * If the state is SYN_RECEIVED:
1782 * If seg contains an ACK, but not for our SYN, drop the input
1783 * and generate an RST. See page 36, rfc793
1784 */
1785 case TCPS_SYN_RECEIVED:
1786 if ((tiflags & TH_ACK) &&
1787 (SEQ_LEQ(th->th_ack, tp->iss) ||
1788 SEQ_GT(th->th_ack, tp->snd_max)))
1789 goto dropwithreset;
1790 break;
1791 }
1792
1793 /*
1794 * States other than LISTEN or SYN_SENT.
1795 * First check timestamp, if present.
1796 * Then check that at least some bytes of segment are within
1797 * receive window. If segment begins before rcv_nxt,
1798 * drop leading data (and SYN); if nothing left, just ack.
1799 *
1800 * RFC 1323 PAWS: If we have a timestamp reply on this segment
1801 * and it's less than ts_recent, drop it.
1802 */
1803 if (opti.ts_present && (tiflags & TH_RST) == 0 && tp->ts_recent &&
1804 TSTMP_LT(opti.ts_val, tp->ts_recent)) {
1805
1806 /* Check to see if ts_recent is over 24 days old. */
1807 if (tcp_now - tp->ts_recent_age > TCP_PAWS_IDLE) {
1808 /*
1809 * Invalidate ts_recent. If this segment updates
1810 * ts_recent, the age will be reset later and ts_recent
1811 * will get a valid value. If it does not, setting
1812 * ts_recent to zero will at least satisfy the
1813 * requirement that zero be placed in the timestamp
1814 * echo reply when ts_recent isn't valid. The
1815 * age isn't reset until we get a valid ts_recent
1816 * because we don't want out-of-order segments to be
1817 * dropped when ts_recent is old.
1818 */
1819 tp->ts_recent = 0;
1820 } else {
1821 tcpstat.tcps_rcvduppack++;
1822 tcpstat.tcps_rcvdupbyte += tlen;
1823 tcpstat.tcps_pawsdrop++;
1824 tcp_new_dsack(tp, th->th_seq, tlen);
1825 goto dropafterack;
1826 }
1827 }
1828
1829 todrop = tp->rcv_nxt - th->th_seq;
1830 dupseg = FALSE;
1831 if (todrop > 0) {
1832 if (tiflags & TH_SYN) {
1833 tiflags &= ~TH_SYN;
1834 th->th_seq++;
1835 if (th->th_urp > 1)
1836 th->th_urp--;
1837 else {
1838 tiflags &= ~TH_URG;
1839 th->th_urp = 0;
1840 }
1841 todrop--;
1842 }
1843 if (todrop > tlen ||
1844 (todrop == tlen && (tiflags & TH_FIN) == 0)) {
1845 /*
1846 * Any valid FIN or RST must be to the left of the
1847 * window. At this point the FIN or RST must be a
1848 * duplicate or out of sequence; drop it.
1849 */
1850 if (tiflags & TH_RST)
1851 goto drop;
1852 tiflags &= ~(TH_FIN|TH_RST);
1853 /*
1854 * Send an ACK to resynchronize and drop any data.
1855 * But keep on processing for RST or ACK.
1856 */
1857 tp->t_flags |= TF_ACKNOW;
1858 todrop = tlen;
1859 dupseg = TRUE;
1860 tcpstat.tcps_rcvdupbyte += todrop;
1861 tcpstat.tcps_rcvduppack++;
1862 } else if ((tiflags & TH_RST) &&
1863 th->th_seq != tp->last_ack_sent) {
1864 /*
1865 * Test for reset before adjusting the sequence
1866 * number for overlapping data.
1867 */
1868 goto dropafterack_ratelim;
1869 } else {
1870 tcpstat.tcps_rcvpartduppack++;
1871 tcpstat.tcps_rcvpartdupbyte += todrop;
1872 }
1873 tcp_new_dsack(tp, th->th_seq, todrop);
1874 hdroptlen += todrop; /*drop from head afterwards*/
1875 th->th_seq += todrop;
1876 tlen -= todrop;
1877 if (th->th_urp > todrop)
1878 th->th_urp -= todrop;
1879 else {
1880 tiflags &= ~TH_URG;
1881 th->th_urp = 0;
1882 }
1883 }
1884
1885 /*
1886 * If new data are received on a connection after the
1887 * user processes are gone, then RST the other end.
1888 */
1889 if ((so->so_state & SS_NOFDREF) &&
1890 tp->t_state > TCPS_CLOSE_WAIT && tlen) {
1891 tp = tcp_close(tp);
1892 tcpstat.tcps_rcvafterclose++;
1893 goto dropwithreset;
1894 }
1895
1896 /*
1897 * If segment ends after window, drop trailing data
1898 * (and PUSH and FIN); if nothing left, just ACK.
1899 */
1900 todrop = (th->th_seq + tlen) - (tp->rcv_nxt+tp->rcv_wnd);
1901 if (todrop > 0) {
1902 tcpstat.tcps_rcvpackafterwin++;
1903 if (todrop >= tlen) {
1904 /*
1905 * The segment actually starts after the window.
1906 * th->th_seq + tlen - tp->rcv_nxt - tp->rcv_wnd >= tlen
1907 * th->th_seq - tp->rcv_nxt - tp->rcv_wnd >= 0
1908 * th->th_seq >= tp->rcv_nxt + tp->rcv_wnd
1909 */
1910 tcpstat.tcps_rcvbyteafterwin += tlen;
1911 /*
1912 * If a new connection request is received
1913 * while in TIME_WAIT, drop the old connection
1914 * and start over if the sequence numbers
1915 * are above the previous ones.
1916 *
1917 * NOTE: We will checksum the packet again, and
1918 * so we need to put the header fields back into
1919 * network order!
1920 * XXX This kind of sucks, but we don't expect
1921 * XXX this to happen very often, so maybe it
1922 * XXX doesn't matter so much.
1923 */
1924 if (tiflags & TH_SYN &&
1925 tp->t_state == TCPS_TIME_WAIT &&
1926 SEQ_GT(th->th_seq, tp->rcv_nxt)) {
1927 iss = tcp_new_iss(tp, tp->snd_nxt);
1928 tp = tcp_close(tp);
1929 TCP_FIELDS_TO_NET(th);
1930 goto findpcb;
1931 }
1932 /*
1933 * If window is closed can only take segments at
1934 * window edge, and have to drop data and PUSH from
1935 * incoming segments. Continue processing, but
1936 * remember to ack. Otherwise, drop segment
1937 * and (if not RST) ack.
1938 */
1939 if (tp->rcv_wnd == 0 && th->th_seq == tp->rcv_nxt) {
1940 tp->t_flags |= TF_ACKNOW;
1941 tcpstat.tcps_rcvwinprobe++;
1942 } else
1943 goto dropafterack;
1944 } else
1945 tcpstat.tcps_rcvbyteafterwin += todrop;
1946 m_adj(m, -todrop);
1947 tlen -= todrop;
1948 tiflags &= ~(TH_PUSH|TH_FIN);
1949 }
1950
1951 /*
1952 * If last ACK falls within this segment's sequence numbers,
1953 * and the timestamp is newer, record it.
1954 */
1955 if (opti.ts_present && TSTMP_GEQ(opti.ts_val, tp->ts_recent) &&
1956 SEQ_LEQ(th->th_seq, tp->last_ack_sent) &&
1957 SEQ_LT(tp->last_ack_sent, th->th_seq + tlen +
1958 ((tiflags & (TH_SYN|TH_FIN)) != 0))) {
1959 tp->ts_recent_age = tcp_now;
1960 tp->ts_recent = opti.ts_val;
1961 }
1962
1963 /*
1964 * If the RST bit is set examine the state:
1965 * SYN_RECEIVED STATE:
1966 * If passive open, return to LISTEN state.
1967 * If active open, inform user that connection was refused.
1968 * ESTABLISHED, FIN_WAIT_1, FIN_WAIT2, CLOSE_WAIT STATES:
1969 * Inform user that connection was reset, and close tcb.
1970 * CLOSING, LAST_ACK, TIME_WAIT STATES
1971 * Close the tcb.
1972 */
1973 if (tiflags & TH_RST) {
1974 if (th->th_seq != tp->last_ack_sent)
1975 goto dropafterack_ratelim;
1976
1977 switch (tp->t_state) {
1978 case TCPS_SYN_RECEIVED:
1979 so->so_error = ECONNREFUSED;
1980 goto close;
1981
1982 case TCPS_ESTABLISHED:
1983 case TCPS_FIN_WAIT_1:
1984 case TCPS_FIN_WAIT_2:
1985 case TCPS_CLOSE_WAIT:
1986 so->so_error = ECONNRESET;
1987 close:
1988 tp->t_state = TCPS_CLOSED;
1989 tcpstat.tcps_drops++;
1990 tp = tcp_close(tp);
1991 goto drop;
1992
1993 case TCPS_CLOSING:
1994 case TCPS_LAST_ACK:
1995 case TCPS_TIME_WAIT:
1996 tp = tcp_close(tp);
1997 goto drop;
1998 }
1999 }
2000
2001 /*
2002 * Since we've covered the SYN-SENT and SYN-RECEIVED states above
2003 * we must be in a synchronized state. RFC793 states (under RST
2004 * generation) that any unacceptable segment (an out-of-order SYN
2005 * qualifies) received in a synchronized state must elicit only an
2006 * empty acknowledgment segment ... and the connection remains in
2007 * the same state.
2008 */
2009 if (tiflags & TH_SYN) {
2010 if (tp->rcv_nxt == th->th_seq) {
2011 tcp_respond(tp, m, m, th, (tcp_seq)0, th->th_ack - 1,
2012 TH_ACK);
2013 if (tcp_saveti)
2014 m_freem(tcp_saveti);
2015 return;
2016 }
2017
2018 goto dropafterack_ratelim;
2019 }
2020
2021 /*
2022 * If the ACK bit is off we drop the segment and return.
2023 */
2024 if ((tiflags & TH_ACK) == 0) {
2025 if (tp->t_flags & TF_ACKNOW)
2026 goto dropafterack;
2027 else
2028 goto drop;
2029 }
2030
2031 /*
2032 * Ack processing.
2033 */
2034 switch (tp->t_state) {
2035
2036 /*
2037 * In SYN_RECEIVED state if the ack ACKs our SYN then enter
2038 * ESTABLISHED state and continue processing, otherwise
2039 * send an RST.
2040 */
2041 case TCPS_SYN_RECEIVED:
2042 if (SEQ_GT(tp->snd_una, th->th_ack) ||
2043 SEQ_GT(th->th_ack, tp->snd_max))
2044 goto dropwithreset;
2045 tcpstat.tcps_connects++;
2046 soisconnected(so);
2047 tcp_established(tp);
2048 /* Do window scaling? */
2049 if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
2050 (TF_RCVD_SCALE|TF_REQ_SCALE)) {
2051 tp->snd_scale = tp->requested_s_scale;
2052 tp->rcv_scale = tp->request_r_scale;
2053 }
2054 TCP_REASS_LOCK(tp);
2055 (void) tcp_reass(tp, NULL, (struct mbuf *)0, &tlen);
2056 TCP_REASS_UNLOCK(tp);
2057 tp->snd_wl1 = th->th_seq - 1;
2058 /* fall into ... */
2059
2060 /*
2061 * In ESTABLISHED state: drop duplicate ACKs; ACK out of range
2062 * ACKs. If the ack is in the range
2063 * tp->snd_una < th->th_ack <= tp->snd_max
2064 * then advance tp->snd_una to th->th_ack and drop
2065 * data from the retransmission queue. If this ACK reflects
2066 * more up to date window information we update our window information.
2067 */
2068 case TCPS_ESTABLISHED:
2069 case TCPS_FIN_WAIT_1:
2070 case TCPS_FIN_WAIT_2:
2071 case TCPS_CLOSE_WAIT:
2072 case TCPS_CLOSING:
2073 case TCPS_LAST_ACK:
2074 case TCPS_TIME_WAIT:
2075
2076 if (SEQ_LEQ(th->th_ack, tp->snd_una)) {
2077 if (tlen == 0 && !dupseg && tiwin == tp->snd_wnd) {
2078 tcpstat.tcps_rcvdupack++;
2079 /*
2080 * If we have outstanding data (other than
2081 * a window probe), this is a completely
2082 * duplicate ack (ie, window info didn't
2083 * change), the ack is the biggest we've
2084 * seen and we've seen exactly our rexmt
2085 * threshold of them, assume a packet
2086 * has been dropped and retransmit it.
2087 * Kludge snd_nxt & the congestion
2088 * window so we send only this one
2089 * packet.
2090 *
2091 * We know we're losing at the current
2092 * window size so do congestion avoidance
2093 * (set ssthresh to half the current window
2094 * and pull our congestion window back to
2095 * the new ssthresh).
2096 *
2097 * Dup acks mean that packets have left the
2098 * network (they're now cached at the receiver)
2099 * so bump cwnd by the amount in the receiver
2100 * to keep a constant cwnd packets in the
2101 * network.
2102 *
2103 * If we are using TCP/SACK, then enter
2104 * Fast Recovery if the receiver SACKs
2105 * data that is tcprexmtthresh * MSS
2106 * bytes past the last ACKed segment,
2107 * irrespective of the number of DupAcks.
2108 */
2109 if (TCP_TIMER_ISARMED(tp, TCPT_REXMT) == 0 ||
2110 th->th_ack != tp->snd_una)
2111 tp->t_dupacks = 0;
2112 else if (tp->t_partialacks < 0 &&
2113 (++tp->t_dupacks == tcprexmtthresh ||
2114 TCP_FACK_FASTRECOV(tp))) {
2115 tcp_seq onxt;
2116 u_int win;
2117
2118 if (tcp_do_newreno &&
2119 SEQ_LT(th->th_ack, tp->snd_high)) {
2120 /*
2121 * False fast retransmit after
2122 * timeout. Do not enter fast
2123 * recovery.
2124 */
2125 tp->t_dupacks = 0;
2126 break;
2127 }
2128
2129 onxt = tp->snd_nxt;
2130 win = min(tp->snd_wnd, tp->snd_cwnd) /
2131 2 / tp->t_segsz;
2132 if (win < 2)
2133 win = 2;
2134 tp->snd_ssthresh = win * tp->t_segsz;
2135 tp->snd_recover = tp->snd_max;
2136 tp->t_partialacks = 0;
2137 TCP_TIMER_DISARM(tp, TCPT_REXMT);
2138 tp->t_rtttime = 0;
2139 if (TCP_SACK_ENABLED(tp)) {
2140 tp->t_dupacks = tcprexmtthresh;
2141 tp->sack_newdata = tp->snd_nxt;
2142 tp->snd_cwnd = tp->t_segsz;
2143 (void) tcp_output(tp);
2144 goto drop;
2145 }
2146 tp->snd_nxt = th->th_ack;
2147 tp->snd_cwnd = tp->t_segsz;
2148 (void) tcp_output(tp);
2149 tp->snd_cwnd = tp->snd_ssthresh +
2150 tp->t_segsz * tp->t_dupacks;
2151 if (SEQ_GT(onxt, tp->snd_nxt))
2152 tp->snd_nxt = onxt;
2153 goto drop;
2154 } else if (tp->t_dupacks > tcprexmtthresh) {
2155 tp->snd_cwnd += tp->t_segsz;
2156 (void) tcp_output(tp);
2157 goto drop;
2158 }
2159 } else {
2160 /*
2161 * If the ack appears to be very old, only
2162 * allow data that is in-sequence. This
2163 * makes it somewhat more difficult to insert
2164 * forged data by guessing sequence numbers.
2165 * Sent an ack to try to update the send
2166 * sequence number on the other side.
2167 */
2168 if (tlen && th->th_seq != tp->rcv_nxt &&
2169 SEQ_LT(th->th_ack,
2170 tp->snd_una - tp->max_sndwnd))
2171 goto dropafterack;
2172 }
2173 break;
2174 }
2175 /*
2176 * If the congestion window was inflated to account
2177 * for the other side's cached packets, retract it.
2178 */
2179 if (TCP_SACK_ENABLED(tp))
2180 tcp_sack_newack(tp, th);
2181 else if (tcp_do_newreno)
2182 tcp_newreno_newack(tp, th);
2183 else
2184 tcp_reno_newack(tp, th);
2185 if (SEQ_GT(th->th_ack, tp->snd_max)) {
2186 tcpstat.tcps_rcvacktoomuch++;
2187 goto dropafterack;
2188 }
2189 acked = th->th_ack - tp->snd_una;
2190 tcpstat.tcps_rcvackpack++;
2191 tcpstat.tcps_rcvackbyte += acked;
2192
2193 /*
2194 * If we have a timestamp reply, update smoothed
2195 * round trip time. If no timestamp is present but
2196 * transmit timer is running and timed sequence
2197 * number was acked, update smoothed round trip time.
2198 * Since we now have an rtt measurement, cancel the
2199 * timer backoff (cf., Phil Karn's retransmit alg.).
2200 * Recompute the initial retransmit timer.
2201 */
2202 if (opti.ts_present && opti.ts_ecr)
2203 tcp_xmit_timer(tp, opti.ts_ecr);
2204 else if (tp->t_rtttime && SEQ_GT(th->th_ack, tp->t_rtseq))
2205 tcp_xmit_timer(tp, tcp_now - tp->t_rtttime);
2206
2207 /*
2208 * If all outstanding data is acked, stop retransmit
2209 * timer and remember to restart (more output or persist).
2210 * If there is more data to be acked, restart retransmit
2211 * timer, using current (possibly backed-off) value.
2212 */
2213 if (th->th_ack == tp->snd_max) {
2214 TCP_TIMER_DISARM(tp, TCPT_REXMT);
2215 needoutput = 1;
2216 } else if (TCP_TIMER_ISARMED(tp, TCPT_PERSIST) == 0)
2217 TCP_TIMER_ARM(tp, TCPT_REXMT, tp->t_rxtcur);
2218 /*
2219 * When new data is acked, open the congestion window.
2220 * If the window gives us less than ssthresh packets
2221 * in flight, open exponentially (segsz per packet).
2222 * Otherwise open linearly: segsz per window
2223 * (segsz^2 / cwnd per packet), plus a constant
2224 * fraction of a packet (segsz/8) to help larger windows
2225 * open quickly enough.
2226 *
2227 * If we are still in fast recovery (meaning we are using
2228 * NewReno and we have only received partial acks), do not
2229 * inflate the window yet.
2230 */
2231 if (tp->t_partialacks < 0) {
2232 u_int cw = tp->snd_cwnd;
2233 u_int incr = tp->t_segsz;
2234
2235 if (cw >= tp->snd_ssthresh)
2236 incr = incr * incr / cw;
2237 tp->snd_cwnd = min(cw + incr,
2238 TCP_MAXWIN << tp->snd_scale);
2239 }
2240 ND6_HINT(tp);
2241 if (acked > so->so_snd.sb_cc) {
2242 tp->snd_wnd -= so->so_snd.sb_cc;
2243 sbdrop(&so->so_snd, (int)so->so_snd.sb_cc);
2244 ourfinisacked = 1;
2245 } else {
2246 if (acked > (tp->t_lastoff - tp->t_inoff))
2247 tp->t_lastm = NULL;
2248 sbdrop(&so->so_snd, acked);
2249 tp->t_lastoff -= acked;
2250 tp->snd_wnd -= acked;
2251 ourfinisacked = 0;
2252 }
2253 sowwakeup(so);
2254 tp->snd_una = th->th_ack;
2255 if (SEQ_GT(tp->snd_una, tp->snd_fack))
2256 tp->snd_fack = tp->snd_una;
2257 if (SEQ_LT(tp->snd_nxt, tp->snd_una))
2258 tp->snd_nxt = tp->snd_una;
2259 if (SEQ_LT(tp->snd_high, tp->snd_una))
2260 tp->snd_high = tp->snd_una;
2261
2262 switch (tp->t_state) {
2263
2264 /*
2265 * In FIN_WAIT_1 STATE in addition to the processing
2266 * for the ESTABLISHED state if our FIN is now acknowledged
2267 * then enter FIN_WAIT_2.
2268 */
2269 case TCPS_FIN_WAIT_1:
2270 if (ourfinisacked) {
2271 /*
2272 * If we can't receive any more
2273 * data, then closing user can proceed.
2274 * Starting the timer is contrary to the
2275 * specification, but if we don't get a FIN
2276 * we'll hang forever.
2277 */
2278 if (so->so_state & SS_CANTRCVMORE) {
2279 soisdisconnected(so);
2280 if (tcp_maxidle > 0)
2281 TCP_TIMER_ARM(tp, TCPT_2MSL,
2282 tcp_maxidle);
2283 }
2284 tp->t_state = TCPS_FIN_WAIT_2;
2285 }
2286 break;
2287
2288 /*
2289 * In CLOSING STATE in addition to the processing for
2290 * the ESTABLISHED state if the ACK acknowledges our FIN
2291 * then enter the TIME-WAIT state, otherwise ignore
2292 * the segment.
2293 */
2294 case TCPS_CLOSING:
2295 if (ourfinisacked) {
2296 tp->t_state = TCPS_TIME_WAIT;
2297 tcp_canceltimers(tp);
2298 TCP_TIMER_ARM(tp, TCPT_2MSL, 2 * TCPTV_MSL);
2299 soisdisconnected(so);
2300 }
2301 break;
2302
2303 /*
2304 * In LAST_ACK, we may still be waiting for data to drain
2305 * and/or to be acked, as well as for the ack of our FIN.
2306 * If our FIN is now acknowledged, delete the TCB,
2307 * enter the closed state and return.
2308 */
2309 case TCPS_LAST_ACK:
2310 if (ourfinisacked) {
2311 tp = tcp_close(tp);
2312 goto drop;
2313 }
2314 break;
2315
2316 /*
2317 * In TIME_WAIT state the only thing that should arrive
2318 * is a retransmission of the remote FIN. Acknowledge
2319 * it and restart the finack timer.
2320 */
2321 case TCPS_TIME_WAIT:
2322 TCP_TIMER_ARM(tp, TCPT_2MSL, 2 * TCPTV_MSL);
2323 goto dropafterack;
2324 }
2325 }
2326
2327 step6:
2328 /*
2329 * Update window information.
2330 * Don't look at window if no ACK: TAC's send garbage on first SYN.
2331 */
2332 if ((tiflags & TH_ACK) && (SEQ_LT(tp->snd_wl1, th->th_seq) ||
2333 (tp->snd_wl1 == th->th_seq && SEQ_LT(tp->snd_wl2, th->th_ack)) ||
2334 (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))) {
2335 /* keep track of pure window updates */
2336 if (tlen == 0 &&
2337 tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd)
2338 tcpstat.tcps_rcvwinupd++;
2339 tp->snd_wnd = tiwin;
2340 tp->snd_wl1 = th->th_seq;
2341 tp->snd_wl2 = th->th_ack;
2342 if (tp->snd_wnd > tp->max_sndwnd)
2343 tp->max_sndwnd = tp->snd_wnd;
2344 needoutput = 1;
2345 }
2346
2347 /*
2348 * Process segments with URG.
2349 */
2350 if ((tiflags & TH_URG) && th->th_urp &&
2351 TCPS_HAVERCVDFIN(tp->t_state) == 0) {
2352 /*
2353 * This is a kludge, but if we receive and accept
2354 * random urgent pointers, we'll crash in
2355 * soreceive. It's hard to imagine someone
2356 * actually wanting to send this much urgent data.
2357 */
2358 if (th->th_urp + so->so_rcv.sb_cc > sb_max) {
2359 th->th_urp = 0; /* XXX */
2360 tiflags &= ~TH_URG; /* XXX */
2361 goto dodata; /* XXX */
2362 }
2363 /*
2364 * If this segment advances the known urgent pointer,
2365 * then mark the data stream. This should not happen
2366 * in CLOSE_WAIT, CLOSING, LAST_ACK or TIME_WAIT STATES since
2367 * a FIN has been received from the remote side.
2368 * In these states we ignore the URG.
2369 *
2370 * According to RFC961 (Assigned Protocols),
2371 * the urgent pointer points to the last octet
2372 * of urgent data. We continue, however,
2373 * to consider it to indicate the first octet
2374 * of data past the urgent section as the original
2375 * spec states (in one of two places).
2376 */
2377 if (SEQ_GT(th->th_seq+th->th_urp, tp->rcv_up)) {
2378 tp->rcv_up = th->th_seq + th->th_urp;
2379 so->so_oobmark = so->so_rcv.sb_cc +
2380 (tp->rcv_up - tp->rcv_nxt) - 1;
2381 if (so->so_oobmark == 0)
2382 so->so_state |= SS_RCVATMARK;
2383 sohasoutofband(so);
2384 tp->t_oobflags &= ~(TCPOOB_HAVEDATA | TCPOOB_HADDATA);
2385 }
2386 /*
2387 * Remove out of band data so doesn't get presented to user.
2388 * This can happen independent of advancing the URG pointer,
2389 * but if two URG's are pending at once, some out-of-band
2390 * data may creep in... ick.
2391 */
2392 if (th->th_urp <= (u_int16_t) tlen
2393 #ifdef SO_OOBINLINE
2394 && (so->so_options & SO_OOBINLINE) == 0
2395 #endif
2396 )
2397 tcp_pulloutofband(so, th, m, hdroptlen);
2398 } else
2399 /*
2400 * If no out of band data is expected,
2401 * pull receive urgent pointer along
2402 * with the receive window.
2403 */
2404 if (SEQ_GT(tp->rcv_nxt, tp->rcv_up))
2405 tp->rcv_up = tp->rcv_nxt;
2406 dodata: /* XXX */
2407
2408 /*
2409 * Process the segment text, merging it into the TCP sequencing queue,
2410 * and arranging for acknowledgement of receipt if necessary.
2411 * This process logically involves adjusting tp->rcv_wnd as data
2412 * is presented to the user (this happens in tcp_usrreq.c,
2413 * case PRU_RCVD). If a FIN has already been received on this
2414 * connection then we just ignore the text.
2415 */
2416 if ((tlen || (tiflags & TH_FIN)) &&
2417 TCPS_HAVERCVDFIN(tp->t_state) == 0) {
2418 /*
2419 * Insert segment ti into reassembly queue of tcp with
2420 * control block tp. Return TH_FIN if reassembly now includes
2421 * a segment with FIN. The macro form does the common case
2422 * inline (segment is the next to be received on an
2423 * established connection, and the queue is empty),
2424 * avoiding linkage into and removal from the queue and
2425 * repetition of various conversions.
2426 * Set DELACK for segments received in order, but ack
2427 * immediately when segments are out of order
2428 * (so fast retransmit can work).
2429 */
2430 /* NOTE: this was TCP_REASS() macro, but used only once */
2431 TCP_REASS_LOCK(tp);
2432 if (th->th_seq == tp->rcv_nxt &&
2433 TAILQ_FIRST(&tp->segq) == NULL &&
2434 tp->t_state == TCPS_ESTABLISHED) {
2435 TCP_SETUP_ACK(tp, th);
2436 tp->rcv_nxt += tlen;
2437 tiflags = th->th_flags & TH_FIN;
2438 tcpstat.tcps_rcvpack++;
2439 tcpstat.tcps_rcvbyte += tlen;
2440 ND6_HINT(tp);
2441 if (so->so_state & SS_CANTRCVMORE)
2442 m_freem(m);
2443 else {
2444 m_adj(m, hdroptlen);
2445 sbappendstream(&(so)->so_rcv, m);
2446 }
2447 sorwakeup(so);
2448 } else {
2449 m_adj(m, hdroptlen);
2450 tiflags = tcp_reass(tp, th, m, &tlen);
2451 tp->t_flags |= TF_ACKNOW;
2452 }
2453 TCP_REASS_UNLOCK(tp);
2454
2455 /*
2456 * Note the amount of data that peer has sent into
2457 * our window, in order to estimate the sender's
2458 * buffer size.
2459 */
2460 len = so->so_rcv.sb_hiwat - (tp->rcv_adv - tp->rcv_nxt);
2461 } else {
2462 m_freem(m);
2463 m = NULL;
2464 tiflags &= ~TH_FIN;
2465 }
2466
2467 /*
2468 * If FIN is received ACK the FIN and let the user know
2469 * that the connection is closing. Ignore a FIN received before
2470 * the connection is fully established.
2471 */
2472 if ((tiflags & TH_FIN) && TCPS_HAVEESTABLISHED(tp->t_state)) {
2473 if (TCPS_HAVERCVDFIN(tp->t_state) == 0) {
2474 socantrcvmore(so);
2475 tp->t_flags |= TF_ACKNOW;
2476 tp->rcv_nxt++;
2477 }
2478 switch (tp->t_state) {
2479
2480 /*
2481 * In ESTABLISHED STATE enter the CLOSE_WAIT state.
2482 */
2483 case TCPS_ESTABLISHED:
2484 tp->t_state = TCPS_CLOSE_WAIT;
2485 break;
2486
2487 /*
2488 * If still in FIN_WAIT_1 STATE FIN has not been acked so
2489 * enter the CLOSING state.
2490 */
2491 case TCPS_FIN_WAIT_1:
2492 tp->t_state = TCPS_CLOSING;
2493 break;
2494
2495 /*
2496 * In FIN_WAIT_2 state enter the TIME_WAIT state,
2497 * starting the time-wait timer, turning off the other
2498 * standard timers.
2499 */
2500 case TCPS_FIN_WAIT_2:
2501 tp->t_state = TCPS_TIME_WAIT;
2502 tcp_canceltimers(tp);
2503 TCP_TIMER_ARM(tp, TCPT_2MSL, 2 * TCPTV_MSL);
2504 soisdisconnected(so);
2505 break;
2506
2507 /*
2508 * In TIME_WAIT state restart the 2 MSL time_wait timer.
2509 */
2510 case TCPS_TIME_WAIT:
2511 TCP_TIMER_ARM(tp, TCPT_2MSL, 2 * TCPTV_MSL);
2512 break;
2513 }
2514 }
2515 #ifdef TCP_DEBUG
2516 if (so->so_options & SO_DEBUG)
2517 tcp_trace(TA_INPUT, ostate, tp, tcp_saveti, 0);
2518 #endif
2519
2520 /*
2521 * Return any desired output.
2522 */
2523 if (needoutput || (tp->t_flags & TF_ACKNOW)) {
2524 (void) tcp_output(tp);
2525 }
2526 if (tcp_saveti)
2527 m_freem(tcp_saveti);
2528 return;
2529
2530 badsyn:
2531 /*
2532 * Received a bad SYN. Increment counters and dropwithreset.
2533 */
2534 tcpstat.tcps_badsyn++;
2535 tp = NULL;
2536 goto dropwithreset;
2537
2538 dropafterack:
2539 /*
2540 * Generate an ACK dropping incoming segment if it occupies
2541 * sequence space, where the ACK reflects our state.
2542 */
2543 if (tiflags & TH_RST)
2544 goto drop;
2545 goto dropafterack2;
2546
2547 dropafterack_ratelim:
2548 /*
2549 * We may want to rate-limit ACKs against SYN/RST attack.
2550 */
2551 if (ppsratecheck(&tcp_ackdrop_ppslim_last, &tcp_ackdrop_ppslim_count,
2552 tcp_ackdrop_ppslim) == 0) {
2553 /* XXX stat */
2554 goto drop;
2555 }
2556 /* ...fall into dropafterack2... */
2557
2558 dropafterack2:
2559 m_freem(m);
2560 tp->t_flags |= TF_ACKNOW;
2561 (void) tcp_output(tp);
2562 if (tcp_saveti)
2563 m_freem(tcp_saveti);
2564 return;
2565
2566 dropwithreset_ratelim:
2567 /*
2568 * We may want to rate-limit RSTs in certain situations,
2569 * particularly if we are sending an RST in response to
2570 * an attempt to connect to or otherwise communicate with
2571 * a port for which we have no socket.
2572 */
2573 if (ppsratecheck(&tcp_rst_ppslim_last, &tcp_rst_ppslim_count,
2574 tcp_rst_ppslim) == 0) {
2575 /* XXX stat */
2576 goto drop;
2577 }
2578 /* ...fall into dropwithreset... */
2579
2580 dropwithreset:
2581 /*
2582 * Generate a RST, dropping incoming segment.
2583 * Make ACK acceptable to originator of segment.
2584 */
2585 if (tiflags & TH_RST)
2586 goto drop;
2587
2588 switch (af) {
2589 #ifdef INET6
2590 case AF_INET6:
2591 /* For following calls to tcp_respond */
2592 if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst))
2593 goto drop;
2594 break;
2595 #endif /* INET6 */
2596 case AF_INET:
2597 if (IN_MULTICAST(ip->ip_dst.s_addr) ||
2598 in_broadcast(ip->ip_dst, m->m_pkthdr.rcvif))
2599 goto drop;
2600 }
2601
2602 if (tiflags & TH_ACK)
2603 (void)tcp_respond(tp, m, m, th, (tcp_seq)0, th->th_ack, TH_RST);
2604 else {
2605 if (tiflags & TH_SYN)
2606 tlen++;
2607 (void)tcp_respond(tp, m, m, th, th->th_seq + tlen, (tcp_seq)0,
2608 TH_RST|TH_ACK);
2609 }
2610 if (tcp_saveti)
2611 m_freem(tcp_saveti);
2612 return;
2613
2614 badcsum:
2615 drop:
2616 /*
2617 * Drop space held by incoming segment and return.
2618 */
2619 if (tp) {
2620 if (tp->t_inpcb)
2621 so = tp->t_inpcb->inp_socket;
2622 #ifdef INET6
2623 else if (tp->t_in6pcb)
2624 so = tp->t_in6pcb->in6p_socket;
2625 #endif
2626 else
2627 so = NULL;
2628 #ifdef TCP_DEBUG
2629 if (so && (so->so_options & SO_DEBUG) != 0)
2630 tcp_trace(TA_DROP, ostate, tp, tcp_saveti, 0);
2631 #endif
2632 }
2633 if (tcp_saveti)
2634 m_freem(tcp_saveti);
2635 m_freem(m);
2636 return;
2637 }
2638
2639 #ifdef TCP_SIGNATURE
2640 int
2641 tcp_signature_apply(void *fstate, caddr_t data, u_int len)
2642 {
2643
2644 MD5Update(fstate, (u_char *)data, len);
2645 return (0);
2646 }
2647
2648 struct secasvar *
2649 tcp_signature_getsav(struct mbuf *m, struct tcphdr *th)
2650 {
2651 struct secasvar *sav;
2652 #ifdef FAST_IPSEC
2653 union sockaddr_union dst;
2654 #endif
2655 struct ip *ip;
2656 struct ip6_hdr *ip6;
2657
2658 ip = mtod(m, struct ip *);
2659 switch (ip->ip_v) {
2660 case 4:
2661 ip = mtod(m, struct ip *);
2662 ip6 = NULL;
2663 break;
2664 case 6:
2665 ip = NULL;
2666 ip6 = mtod(m, struct ip6_hdr *);
2667 break;
2668 default:
2669 return (NULL);
2670 }
2671
2672 #ifdef FAST_IPSEC
2673 /* Extract the destination from the IP header in the mbuf. */
2674 bzero(&dst, sizeof(union sockaddr_union));
2675 dst.sa.sa_len = sizeof(struct sockaddr_in);
2676 dst.sa.sa_family = AF_INET;
2677 dst.sin.sin_addr = ip->ip_dst;
2678
2679 /*
2680 * Look up an SADB entry which matches the address of the peer.
2681 */
2682 sav = KEY_ALLOCSA(&dst, IPPROTO_TCP, htonl(TCP_SIG_SPI));
2683 #else
2684 if (ip)
2685 sav = key_allocsa(AF_INET, (caddr_t)&ip->ip_src,
2686 (caddr_t)&ip->ip_dst, IPPROTO_TCP,
2687 htonl(TCP_SIG_SPI));
2688 else
2689 sav = key_allocsa(AF_INET6, (caddr_t)&ip6->ip6_src,
2690 (caddr_t)&ip6->ip6_dst, IPPROTO_TCP,
2691 htonl(TCP_SIG_SPI));
2692 #endif
2693
2694 return (sav); /* freesav must be performed by caller */
2695 }
2696
2697 int
2698 tcp_signature(struct mbuf *m, struct tcphdr *th, int thoff,
2699 struct secasvar *sav, char *sig)
2700 {
2701 MD5_CTX ctx;
2702 struct ip *ip;
2703 struct ipovly *ipovly;
2704 struct ip6_hdr *ip6;
2705 struct ippseudo ippseudo;
2706 struct ip6_hdr_pseudo ip6pseudo;
2707 struct tcphdr th0;
2708 int l, tcphdrlen;
2709
2710 if (sav == NULL)
2711 return (-1);
2712
2713 tcphdrlen = th->th_off * 4;
2714
2715 switch (mtod(m, struct ip *)->ip_v) {
2716 case 4:
2717 ip = mtod(m, struct ip *);
2718 ip6 = NULL;
2719 break;
2720 case 6:
2721 ip = NULL;
2722 ip6 = mtod(m, struct ip6_hdr *);
2723 break;
2724 default:
2725 return (-1);
2726 }
2727
2728 MD5Init(&ctx);
2729
2730 if (ip) {
2731 memset(&ippseudo, 0, sizeof(ippseudo));
2732 ipovly = (struct ipovly *)ip;
2733 ippseudo.ippseudo_src = ipovly->ih_src;
2734 ippseudo.ippseudo_dst = ipovly->ih_dst;
2735 ippseudo.ippseudo_pad = 0;
2736 ippseudo.ippseudo_p = IPPROTO_TCP;
2737 ippseudo.ippseudo_len = htons(m->m_pkthdr.len - thoff);
2738 MD5Update(&ctx, (char *)&ippseudo, sizeof(ippseudo));
2739 } else {
2740 memset(&ip6pseudo, 0, sizeof(ip6pseudo));
2741 ip6pseudo.ip6ph_src = ip6->ip6_src;
2742 in6_clearscope(&ip6pseudo.ip6ph_src);
2743 ip6pseudo.ip6ph_dst = ip6->ip6_dst;
2744 in6_clearscope(&ip6pseudo.ip6ph_dst);
2745 ip6pseudo.ip6ph_len = htons(m->m_pkthdr.len - thoff);
2746 ip6pseudo.ip6ph_nxt = IPPROTO_TCP;
2747 MD5Update(&ctx, (char *)&ip6pseudo, sizeof(ip6pseudo));
2748 }
2749
2750 th0 = *th;
2751 th0.th_sum = 0;
2752 MD5Update(&ctx, (char *)&th0, sizeof(th0));
2753
2754 l = m->m_pkthdr.len - thoff - tcphdrlen;
2755 if (l > 0)
2756 m_apply(m, thoff + tcphdrlen,
2757 m->m_pkthdr.len - thoff - tcphdrlen,
2758 tcp_signature_apply, &ctx);
2759
2760 MD5Update(&ctx, _KEYBUF(sav->key_auth), _KEYLEN(sav->key_auth));
2761 MD5Final(sig, &ctx);
2762
2763 return (0);
2764 }
2765 #endif
2766
2767 int
2768 tcp_dooptions(struct tcpcb *tp, u_char *cp, int cnt, struct tcphdr *th,
2769 struct mbuf *m, int toff, struct tcp_opt_info *oi)
2770 {
2771 u_int16_t mss;
2772 int opt, optlen = 0;
2773 #ifdef TCP_SIGNATURE
2774 caddr_t sigp = NULL;
2775 char sigbuf[TCP_SIGLEN];
2776 struct secasvar *sav = NULL;
2777 #endif
2778
2779 for (; cp && cnt > 0; cnt -= optlen, cp += optlen) {
2780 opt = cp[0];
2781 if (opt == TCPOPT_EOL)
2782 break;
2783 if (opt == TCPOPT_NOP)
2784 optlen = 1;
2785 else {
2786 if (cnt < 2)
2787 break;
2788 optlen = cp[1];
2789 if (optlen < 2 || optlen > cnt)
2790 break;
2791 }
2792 switch (opt) {
2793
2794 default:
2795 continue;
2796
2797 case TCPOPT_MAXSEG:
2798 if (optlen != TCPOLEN_MAXSEG)
2799 continue;
2800 if (!(th->th_flags & TH_SYN))
2801 continue;
2802 bcopy(cp + 2, &mss, sizeof(mss));
2803 oi->maxseg = ntohs(mss);
2804 break;
2805
2806 case TCPOPT_WINDOW:
2807 if (optlen != TCPOLEN_WINDOW)
2808 continue;
2809 if (!(th->th_flags & TH_SYN))
2810 continue;
2811 tp->t_flags |= TF_RCVD_SCALE;
2812 tp->requested_s_scale = cp[2];
2813 if (tp->requested_s_scale > TCP_MAX_WINSHIFT) {
2814 #if 0 /*XXX*/
2815 char *p;
2816
2817 if (ip)
2818 p = ntohl(ip->ip_src);
2819 #ifdef INET6
2820 else if (ip6)
2821 p = ip6_sprintf(&ip6->ip6_src);
2822 #endif
2823 else
2824 p = "(unknown)";
2825 log(LOG_ERR, "TCP: invalid wscale %d from %s, "
2826 "assuming %d\n",
2827 tp->requested_s_scale, p,
2828 TCP_MAX_WINSHIFT);
2829 #else
2830 log(LOG_ERR, "TCP: invalid wscale %d, "
2831 "assuming %d\n",
2832 tp->requested_s_scale,
2833 TCP_MAX_WINSHIFT);
2834 #endif
2835 tp->requested_s_scale = TCP_MAX_WINSHIFT;
2836 }
2837 break;
2838
2839 case TCPOPT_TIMESTAMP:
2840 if (optlen != TCPOLEN_TIMESTAMP)
2841 continue;
2842 oi->ts_present = 1;
2843 bcopy(cp + 2, &oi->ts_val, sizeof(oi->ts_val));
2844 NTOHL(oi->ts_val);
2845 bcopy(cp + 6, &oi->ts_ecr, sizeof(oi->ts_ecr));
2846 NTOHL(oi->ts_ecr);
2847
2848 /*
2849 * A timestamp received in a SYN makes
2850 * it ok to send timestamp requests and replies.
2851 */
2852 if (th->th_flags & TH_SYN) {
2853 tp->t_flags |= TF_RCVD_TSTMP;
2854 tp->ts_recent = oi->ts_val;
2855 tp->ts_recent_age = tcp_now;
2856 }
2857 break;
2858 case TCPOPT_SACK_PERMITTED:
2859 if (optlen != TCPOLEN_SACK_PERMITTED)
2860 continue;
2861 if (!(th->th_flags & TH_SYN))
2862 continue;
2863 if (tcp_do_sack) {
2864 tp->t_flags |= TF_SACK_PERMIT;
2865 tp->t_flags |= TF_WILL_SACK;
2866 }
2867 break;
2868
2869 case TCPOPT_SACK:
2870 tcp_sack_option(tp, th, cp, optlen);
2871 break;
2872 #ifdef TCP_SIGNATURE
2873 case TCPOPT_SIGNATURE:
2874 if (optlen != TCPOLEN_SIGNATURE)
2875 continue;
2876 if (sigp && bcmp(sigp, cp + 2, TCP_SIGLEN))
2877 return (-1);
2878
2879 sigp = sigbuf;
2880 memcpy(sigbuf, cp + 2, TCP_SIGLEN);
2881 memset(cp + 2, 0, TCP_SIGLEN);
2882 tp->t_flags |= TF_SIGNATURE;
2883 break;
2884 #endif
2885 }
2886 }
2887
2888 #ifdef TCP_SIGNATURE
2889 if (tp->t_flags & TF_SIGNATURE) {
2890
2891 sav = tcp_signature_getsav(m, th);
2892
2893 if (sav == NULL && tp->t_state == TCPS_LISTEN)
2894 return (-1);
2895 }
2896
2897 if ((sigp ? TF_SIGNATURE : 0) ^ (tp->t_flags & TF_SIGNATURE)) {
2898 if (sav == NULL)
2899 return (-1);
2900 #ifdef FAST_IPSEC
2901 KEY_FREESAV(&sav);
2902 #else
2903 key_freesav(sav);
2904 #endif
2905 return (-1);
2906 }
2907
2908 if (sigp) {
2909 char sig[TCP_SIGLEN];
2910
2911 TCP_FIELDS_TO_NET(th);
2912 if (tcp_signature(m, th, toff, sav, sig) < 0) {
2913 TCP_FIELDS_TO_HOST(th);
2914 if (sav == NULL)
2915 return (-1);
2916 #ifdef FAST_IPSEC
2917 KEY_FREESAV(&sav);
2918 #else
2919 key_freesav(sav);
2920 #endif
2921 return (-1);
2922 }
2923 TCP_FIELDS_TO_HOST(th);
2924
2925 if (bcmp(sig, sigp, TCP_SIGLEN)) {
2926 tcpstat.tcps_badsig++;
2927 if (sav == NULL)
2928 return (-1);
2929 #ifdef FAST_IPSEC
2930 KEY_FREESAV(&sav);
2931 #else
2932 key_freesav(sav);
2933 #endif
2934 return (-1);
2935 } else
2936 tcpstat.tcps_goodsig++;
2937
2938 key_sa_recordxfer(sav, m);
2939 #ifdef FAST_IPSEC
2940 KEY_FREESAV(&sav);
2941 #else
2942 key_freesav(sav);
2943 #endif
2944 }
2945 #endif
2946
2947 return (0);
2948 }
2949
2950 /*
2951 * Pull out of band byte out of a segment so
2952 * it doesn't appear in the user's data queue.
2953 * It is still reflected in the segment length for
2954 * sequencing purposes.
2955 */
2956 void
2957 tcp_pulloutofband(struct socket *so, struct tcphdr *th,
2958 struct mbuf *m, int off)
2959 {
2960 int cnt = off + th->th_urp - 1;
2961
2962 while (cnt >= 0) {
2963 if (m->m_len > cnt) {
2964 char *cp = mtod(m, caddr_t) + cnt;
2965 struct tcpcb *tp = sototcpcb(so);
2966
2967 tp->t_iobc = *cp;
2968 tp->t_oobflags |= TCPOOB_HAVEDATA;
2969 bcopy(cp+1, cp, (unsigned)(m->m_len - cnt - 1));
2970 m->m_len--;
2971 return;
2972 }
2973 cnt -= m->m_len;
2974 m = m->m_next;
2975 if (m == 0)
2976 break;
2977 }
2978 panic("tcp_pulloutofband");
2979 }
2980
2981 /*
2982 * Collect new round-trip time estimate
2983 * and update averages and current timeout.
2984 */
2985 void
2986 tcp_xmit_timer(struct tcpcb *tp, uint32_t rtt)
2987 {
2988 int32_t delta;
2989
2990 tcpstat.tcps_rttupdated++;
2991 if (tp->t_srtt != 0) {
2992 /*
2993 * srtt is stored as fixed point with 3 bits after the
2994 * binary point (i.e., scaled by 8). The following magic
2995 * is equivalent to the smoothing algorithm in rfc793 with
2996 * an alpha of .875 (srtt = rtt/8 + srtt*7/8 in fixed
2997 * point). Adjust rtt to origin 0.
2998 */
2999 delta = (rtt << 2) - (tp->t_srtt >> TCP_RTT_SHIFT);
3000 if ((tp->t_srtt += delta) <= 0)
3001 tp->t_srtt = 1 << 2;
3002 /*
3003 * We accumulate a smoothed rtt variance (actually, a
3004 * smoothed mean difference), then set the retransmit
3005 * timer to smoothed rtt + 4 times the smoothed variance.
3006 * rttvar is stored as fixed point with 2 bits after the
3007 * binary point (scaled by 4). The following is
3008 * equivalent to rfc793 smoothing with an alpha of .75
3009 * (rttvar = rttvar*3/4 + |delta| / 4). This replaces
3010 * rfc793's wired-in beta.
3011 */
3012 if (delta < 0)
3013 delta = -delta;
3014 delta -= (tp->t_rttvar >> TCP_RTTVAR_SHIFT);
3015 if ((tp->t_rttvar += delta) <= 0)
3016 tp->t_rttvar = 1 << 2;
3017 } else {
3018 /*
3019 * No rtt measurement yet - use the unsmoothed rtt.
3020 * Set the variance to half the rtt (so our first
3021 * retransmit happens at 3*rtt).
3022 */
3023 tp->t_srtt = rtt << (TCP_RTT_SHIFT + 2);
3024 tp->t_rttvar = rtt << (TCP_RTTVAR_SHIFT + 2 - 1);
3025 }
3026 tp->t_rtttime = 0;
3027 tp->t_rxtshift = 0;
3028
3029 /*
3030 * the retransmit should happen at rtt + 4 * rttvar.
3031 * Because of the way we do the smoothing, srtt and rttvar
3032 * will each average +1/2 tick of bias. When we compute
3033 * the retransmit timer, we want 1/2 tick of rounding and
3034 * 1 extra tick because of +-1/2 tick uncertainty in the
3035 * firing of the timer. The bias will give us exactly the
3036 * 1.5 tick we need. But, because the bias is
3037 * statistical, we have to test that we don't drop below
3038 * the minimum feasible timer (which is 2 ticks).
3039 */
3040 TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp),
3041 max(tp->t_rttmin, rtt + 2), TCPTV_REXMTMAX);
3042
3043 /*
3044 * We received an ack for a packet that wasn't retransmitted;
3045 * it is probably safe to discard any error indications we've
3046 * received recently. This isn't quite right, but close enough
3047 * for now (a route might have failed after we sent a segment,
3048 * and the return path might not be symmetrical).
3049 */
3050 tp->t_softerror = 0;
3051 }
3052
3053 void
3054 tcp_reno_newack(struct tcpcb *tp, struct tcphdr *th)
3055 {
3056 if (tp->t_partialacks < 0) {
3057 /*
3058 * We were not in fast recovery. Reset the duplicate ack
3059 * counter.
3060 */
3061 tp->t_dupacks = 0;
3062 } else {
3063 /*
3064 * Clamp the congestion window to the crossover point and
3065 * exit fast recovery.
3066 */
3067 if (tp->snd_cwnd > tp->snd_ssthresh)
3068 tp->snd_cwnd = tp->snd_ssthresh;
3069 tp->t_partialacks = -1;
3070 tp->t_dupacks = 0;
3071 }
3072 }
3073
3074 /*
3075 * Implement the NewReno response to a new ack, checking for partial acks in
3076 * fast recovery.
3077 */
3078 void
3079 tcp_newreno_newack(struct tcpcb *tp, struct tcphdr *th)
3080 {
3081 if (tp->t_partialacks < 0) {
3082 /*
3083 * We were not in fast recovery. Reset the duplicate ack
3084 * counter.
3085 */
3086 tp->t_dupacks = 0;
3087 } else if (SEQ_LT(th->th_ack, tp->snd_recover)) {
3088 /*
3089 * This is a partial ack. Retransmit the first unacknowledged
3090 * segment and deflate the congestion window by the amount of
3091 * acknowledged data. Do not exit fast recovery.
3092 */
3093 tcp_seq onxt = tp->snd_nxt;
3094 u_long ocwnd = tp->snd_cwnd;
3095
3096 /*
3097 * snd_una has not yet been updated and the socket's send
3098 * buffer has not yet drained off the ACK'd data, so we
3099 * have to leave snd_una as it was to get the correct data
3100 * offset in tcp_output().
3101 */
3102 if (++tp->t_partialacks == 1)
3103 TCP_TIMER_DISARM(tp, TCPT_REXMT);
3104 tp->t_rtttime = 0;
3105 tp->snd_nxt = th->th_ack;
3106 /*
3107 * Set snd_cwnd to one segment beyond ACK'd offset. snd_una
3108 * is not yet updated when we're called.
3109 */
3110 tp->snd_cwnd = tp->t_segsz + (th->th_ack - tp->snd_una);
3111 (void) tcp_output(tp);
3112 tp->snd_cwnd = ocwnd;
3113 if (SEQ_GT(onxt, tp->snd_nxt))
3114 tp->snd_nxt = onxt;
3115 /*
3116 * Partial window deflation. Relies on fact that tp->snd_una
3117 * not updated yet.
3118 */
3119 tp->snd_cwnd -= (th->th_ack - tp->snd_una - tp->t_segsz);
3120 } else {
3121 /*
3122 * Complete ack. Inflate the congestion window to ssthresh
3123 * and exit fast recovery.
3124 *
3125 * Window inflation should have left us with approx.
3126 * snd_ssthresh outstanding data. But in case we
3127 * would be inclined to send a burst, better to do
3128 * it via the slow start mechanism.
3129 */
3130 if (SEQ_SUB(tp->snd_max, th->th_ack) < tp->snd_ssthresh)
3131 tp->snd_cwnd = SEQ_SUB(tp->snd_max, th->th_ack)
3132 + tp->t_segsz;
3133 else
3134 tp->snd_cwnd = tp->snd_ssthresh;
3135 tp->t_partialacks = -1;
3136 tp->t_dupacks = 0;
3137 }
3138 }
3139
3140
/*
 * TCP compressed state engine.  Currently used to hold compressed
 * state for SYN_RECEIVED.
 */

/* Number of entries currently held in the syn cache. */
u_long syn_cache_count;
/*
 * Secrets mixed into the bucket hash.  syn_cache_insert() draws new
 * values from arc4random() whenever it finds the cache empty.
 */
u_int32_t syn_hash1, syn_hash2;

/*
 * Hash an IPv4 address (struct in_addr pointer sa) with the two ports
 * sp and dp and the two secrets.
 */
#define	SYN_HASH(sa, sp, dp) \
	((((sa)->s_addr^syn_hash1)*(((((u_int32_t)(dp))<<16) + \
				     ((u_int32_t)(sp)))^syn_hash2)))
#ifndef INET6
/*
 * Compute the hash for a source/destination sockaddr pair and store it
 * in hash.  Without INET6 both are treated as struct sockaddr_in.
 */
#define	SYN_HASHALL(hash, src, dst) \
do {									\
	hash = SYN_HASH(&((struct sockaddr_in *)(src))->sin_addr,	\
		((struct sockaddr_in *)(src))->sin_port,		\
		((struct sockaddr_in *)(dst))->sin_port);		\
} while (/*CONSTCOND*/ 0)
#else
/*
 * IPv6 variant of SYN_HASH: only the first and last 32-bit words of
 * the address take part, and the result is masked to 31 bits.
 */
#define	SYN_HASH6(sa, sp, dp) \
	((((sa)->s6_addr32[0] ^ (sa)->s6_addr32[3] ^ syn_hash1) * \
	  (((((u_int32_t)(dp))<<16) + ((u_int32_t)(sp)))^syn_hash2)) \
	 & 0x7fffffff)

/*
 * Compute the hash for a source/destination sockaddr pair and store it
 * in hash, dispatching on the source's address family.  An unknown
 * family hashes to 0.
 */
#define SYN_HASHALL(hash, src, dst) \
do {									\
	switch ((src)->sa_family) {					\
	case AF_INET:							\
		hash = SYN_HASH(&((struct sockaddr_in *)(src))->sin_addr, \
			((struct sockaddr_in *)(src))->sin_port,	\
			((struct sockaddr_in *)(dst))->sin_port);	\
		break;							\
	case AF_INET6:							\
		hash = SYN_HASH6(&((struct sockaddr_in6 *)(src))->sin6_addr, \
			((struct sockaddr_in6 *)(src))->sin6_port,	\
			((struct sockaddr_in6 *)(dst))->sin6_port);	\
		break;							\
	default:							\
		hash = 0;						\
	}								\
} while (/*CONSTCOND*/0)
#endif /* INET6 */
3183
/*
 * Unlink entry sc from the cache: remove it from its hash bucket and
 * from the owning tcpcb's list, clear its tcpcb back pointer, stop its
 * timer and adjust the per-bucket and global counts.  The entry itself
 * is not freed; see SYN_CACHE_PUT.
 */
#define	SYN_CACHE_RM(sc)						\
do {									\
	TAILQ_REMOVE(&tcp_syn_cache[(sc)->sc_bucketidx].sch_bucket,	\
	    (sc), sc_bucketq);						\
	(sc)->sc_tp = NULL;						\
	LIST_REMOVE((sc), sc_tpq);					\
	tcp_syn_cache[(sc)->sc_bucketidx].sch_length--;			\
	callout_stop(&(sc)->sc_timer);					\
	syn_cache_count--;						\
} while (/*CONSTCOND*/0)

/*
 * Release the resources held by entry sc (saved IP options, cached
 * IPv4 route) and free the entry.  If the entry's timer callout is
 * being invoked, the entry is only marked SCF_DEAD; syn_cache_timer()
 * returns such entries to the pool itself.
 */
#define	SYN_CACHE_PUT(sc)						\
do {									\
	if ((sc)->sc_ipopts)						\
		(void) m_free((sc)->sc_ipopts);				\
	if ((sc)->sc_route4.ro_rt != NULL)				\
		RTFREE((sc)->sc_route4.ro_rt);				\
	if (callout_invoking(&(sc)->sc_timer))				\
		(sc)->sc_flags |= SCF_DEAD;				\
	else								\
		pool_put(&syn_cache_pool, (sc));			\
} while (/*CONSTCOND*/0)

/* Pool from which struct syn_cache entries are allocated. */
POOL_INIT(syn_cache_pool, sizeof(struct syn_cache), 0, 0, 0, "synpl", NULL);
3208
/*
 * We don't estimate RTT with SYNs, so each packet starts with the default
 * RTT and each timer step has a fixed timeout value.
 *
 * SYN_CACHE_TIMER_ARM sets sc_rxtcur to TCPTV_SRTTDFLT scaled by the
 * backoff factor for the entry's current sc_rxtshift, bounded by
 * TCPTV_MIN and TCPTV_REXMTMAX, and (re)schedules syn_cache_timer()
 * for the entry after that interval.
 */
#define	SYN_CACHE_TIMER_ARM(sc)						\
do {									\
	TCPT_RANGESET((sc)->sc_rxtcur,					\
	    TCPTV_SRTTDFLT * tcp_backoff[(sc)->sc_rxtshift], TCPTV_MIN,	\
	    TCPTV_REXMTMAX);						\
	callout_reset(&(sc)->sc_timer,					\
	    (sc)->sc_rxtcur * (hz / PR_SLOWHZ), syn_cache_timer, (sc));	\
} while (/*CONSTCOND*/0)

/* Current time relative to the entry's sc_timebase. */
#define	SYN_CACHE_TIMESTAMP(sc)	(tcp_now - (sc)->sc_timebase)
3223
3224 void
3225 syn_cache_init(void)
3226 {
3227 int i;
3228
3229 /* Initialize the hash buckets. */
3230 for (i = 0; i < tcp_syn_cache_size; i++)
3231 TAILQ_INIT(&tcp_syn_cache[i].sch_bucket);
3232 }
3233
3234 void
3235 syn_cache_insert(struct syn_cache *sc, struct tcpcb *tp)
3236 {
3237 struct syn_cache_head *scp;
3238 struct syn_cache *sc2;
3239 int s;
3240
3241 /*
3242 * If there are no entries in the hash table, reinitialize
3243 * the hash secrets.
3244 */
3245 if (syn_cache_count == 0) {
3246 syn_hash1 = arc4random();
3247 syn_hash2 = arc4random();
3248 }
3249
3250 SYN_HASHALL(sc->sc_hash, &sc->sc_src.sa, &sc->sc_dst.sa);
3251 sc->sc_bucketidx = sc->sc_hash % tcp_syn_cache_size;
3252 scp = &tcp_syn_cache[sc->sc_bucketidx];
3253
3254 /*
3255 * Make sure that we don't overflow the per-bucket
3256 * limit or the total cache size limit.
3257 */
3258 s = splsoftnet();
3259 if (scp->sch_length >= tcp_syn_bucket_limit) {
3260 tcpstat.tcps_sc_bucketoverflow++;
3261 /*
3262 * The bucket is full. Toss the oldest element in the
3263 * bucket. This will be the first entry in the bucket.
3264 */
3265 sc2 = TAILQ_FIRST(&scp->sch_bucket);
3266 #ifdef DIAGNOSTIC
3267 /*
3268 * This should never happen; we should always find an
3269 * entry in our bucket.
3270 */
3271 if (sc2 == NULL)
3272 panic("syn_cache_insert: bucketoverflow: impossible");
3273 #endif
3274 SYN_CACHE_RM(sc2);
3275 SYN_CACHE_PUT(sc2);
3276 } else if (syn_cache_count >= tcp_syn_cache_limit) {
3277 struct syn_cache_head *scp2, *sce;
3278
3279 tcpstat.tcps_sc_overflowed++;
3280 /*
3281 * The cache is full. Toss the oldest entry in the
3282 * first non-empty bucket we can find.
3283 *
3284 * XXX We would really like to toss the oldest
3285 * entry in the cache, but we hope that this
3286 * condition doesn't happen very often.
3287 */
3288 scp2 = scp;
3289 if (TAILQ_EMPTY(&scp2->sch_bucket)) {
3290 sce = &tcp_syn_cache[tcp_syn_cache_size];
3291 for (++scp2; scp2 != scp; scp2++) {
3292 if (scp2 >= sce)
3293 scp2 = &tcp_syn_cache[0];
3294 if (! TAILQ_EMPTY(&scp2->sch_bucket))
3295 break;
3296 }
3297 #ifdef DIAGNOSTIC
3298 /*
3299 * This should never happen; we should always find a
3300 * non-empty bucket.
3301 */
3302 if (scp2 == scp)
3303 panic("syn_cache_insert: cacheoverflow: "
3304 "impossible");
3305 #endif
3306 }
3307 sc2 = TAILQ_FIRST(&scp2->sch_bucket);
3308 SYN_CACHE_RM(sc2);
3309 SYN_CACHE_PUT(sc2);
3310 }
3311
3312 /*
3313 * Initialize the entry's timer.
3314 */
3315 sc->sc_rxttot = 0;
3316 sc->sc_rxtshift = 0;
3317 SYN_CACHE_TIMER_ARM(sc);
3318
3319 /* Link it from tcpcb entry */
3320 LIST_INSERT_HEAD(&tp->t_sc, sc, sc_tpq);
3321
3322 /* Put it into the bucket. */
3323 TAILQ_INSERT_TAIL(&scp->sch_bucket, sc, sc_bucketq);
3324 scp->sch_length++;
3325 syn_cache_count++;
3326
3327 tcpstat.tcps_sc_added++;
3328 splx(s);
3329 }
3330
3331 /*
3332 * Walk the timer queues, looking for SYN,ACKs that need to be retransmitted.
3333 * If we have retransmitted an entry the maximum number of times, expire
3334 * that entry.
3335 */
3336 void
3337 syn_cache_timer(void *arg)
3338 {
3339 struct syn_cache *sc = arg;
3340 int s;
3341
3342 s = splsoftnet();
3343 callout_ack(&sc->sc_timer);
3344
3345 if (__predict_false(sc->sc_flags & SCF_DEAD)) {
3346 tcpstat.tcps_sc_delayed_free++;
3347 pool_put(&syn_cache_pool, sc);
3348 splx(s);
3349 return;
3350 }
3351
3352 if (__predict_false(sc->sc_rxtshift == TCP_MAXRXTSHIFT)) {
3353 /* Drop it -- too many retransmissions. */
3354 goto dropit;
3355 }
3356
3357 /*
3358 * Compute the total amount of time this entry has
3359 * been on a queue. If this entry has been on longer
3360 * than the keep alive timer would allow, expire it.
3361 */
3362 sc->sc_rxttot += sc->sc_rxtcur;
3363 if (sc->sc_rxttot >= TCPTV_KEEP_INIT)
3364 goto dropit;
3365
3366 tcpstat.tcps_sc_retransmitted++;
3367 (void) syn_cache_respond(sc, NULL);
3368
3369 /* Advance the timer back-off. */
3370 sc->sc_rxtshift++;
3371 SYN_CACHE_TIMER_ARM(sc);
3372
3373 splx(s);
3374 return;
3375
3376 dropit:
3377 tcpstat.tcps_sc_timed_out++;
3378 SYN_CACHE_RM(sc);
3379 SYN_CACHE_PUT(sc);
3380 splx(s);
3381 }
3382
3383 /*
3384 * Remove syn cache created by the specified tcb entry,
3385 * because this does not make sense to keep them
3386 * (if there's no tcb entry, syn cache entry will never be used)
3387 */
3388 void
3389 syn_cache_cleanup(struct tcpcb *tp)
3390 {
3391 struct syn_cache *sc, *nsc;
3392 int s;
3393
3394 s = splsoftnet();
3395
3396 for (sc = LIST_FIRST(&tp->t_sc); sc != NULL; sc = nsc) {
3397 nsc = LIST_NEXT(sc, sc_tpq);
3398
3399 #ifdef DIAGNOSTIC
3400 if (sc->sc_tp != tp)
3401 panic("invalid sc_tp in syn_cache_cleanup");
3402 #endif
3403 SYN_CACHE_RM(sc);
3404 SYN_CACHE_PUT(sc);
3405 }
3406 /* just for safety */
3407 LIST_INIT(&tp->t_sc);
3408
3409 splx(s);
3410 }
3411
3412 /*
3413 * Find an entry in the syn cache.
3414 */
3415 struct syn_cache *
3416 syn_cache_lookup(struct sockaddr *src, struct sockaddr *dst,
3417 struct syn_cache_head **headp)
3418 {
3419 struct syn_cache *sc;
3420 struct syn_cache_head *scp;
3421 u_int32_t hash;
3422 int s;
3423
3424 SYN_HASHALL(hash, src, dst);
3425
3426 scp = &tcp_syn_cache[hash % tcp_syn_cache_size];
3427 *headp = scp;
3428 s = splsoftnet();
3429 for (sc = TAILQ_FIRST(&scp->sch_bucket); sc != NULL;
3430 sc = TAILQ_NEXT(sc, sc_bucketq)) {
3431 if (sc->sc_hash != hash)
3432 continue;
3433 if (!bcmp(&sc->sc_src, src, src->sa_len) &&
3434 !bcmp(&sc->sc_dst, dst, dst->sa_len)) {
3435 splx(s);
3436 return (sc);
3437 }
3438 }
3439 splx(s);
3440 return (NULL);
3441 }
3442
3443 /*
3444 * This function gets called when we receive an ACK for a
3445 * socket in the LISTEN state. We look up the connection
3446 * in the syn cache, and if its there, we pull it out of
3447 * the cache and turn it into a full-blown connection in
3448 * the SYN-RECEIVED state.
3449 *
3450 * The return values may not be immediately obvious, and their effects
3451 * can be subtle, so here they are:
3452 *
3453 * NULL SYN was not found in cache; caller should drop the
3454 * packet and send an RST.
3455 *
3456 * -1 We were unable to create the new connection, and are
3457 * aborting it. An ACK,RST is being sent to the peer
3458 * (unless we got screwey sequence numbners; see below),
3459 * because the 3-way handshake has been completed. Caller
3460 * should not free the mbuf, since we may be using it. If
3461 * we are not, we will free it.
3462 *
3463 * Otherwise, the return value is a pointer to the new socket
3464 * associated with the connection.
3465 */
3466 struct socket *
3467 syn_cache_get(struct sockaddr *src, struct sockaddr *dst,
3468 struct tcphdr *th, unsigned int hlen, unsigned int tlen,
3469 struct socket *so, struct mbuf *m)
3470 {
3471 struct syn_cache *sc;
3472 struct syn_cache_head *scp;
3473 struct inpcb *inp = NULL;
3474 #ifdef INET6
3475 struct in6pcb *in6p = NULL;
3476 #endif
3477 struct tcpcb *tp = 0;
3478 struct mbuf *am;
3479 int s;
3480 struct socket *oso;
3481
3482 s = splsoftnet();
3483 if ((sc = syn_cache_lookup(src, dst, &scp)) == NULL) {
3484 splx(s);
3485 return (NULL);
3486 }
3487
3488 /*
3489 * Verify the sequence and ack numbers. Try getting the correct
3490 * response again.
3491 */
3492 if ((th->th_ack != sc->sc_iss + 1) ||
3493 SEQ_LEQ(th->th_seq, sc->sc_irs) ||
3494 SEQ_GT(th->th_seq, sc->sc_irs + 1 + sc->sc_win)) {
3495 (void) syn_cache_respond(sc, m);
3496 splx(s);
3497 return ((struct socket *)(-1));
3498 }
3499
3500 /* Remove this cache entry */
3501 SYN_CACHE_RM(sc);
3502 splx(s);
3503
3504 /*
3505 * Ok, create the full blown connection, and set things up
3506 * as they would have been set up if we had created the
3507 * connection when the SYN arrived. If we can't create
3508 * the connection, abort it.
3509 */
3510 /*
3511 * inp still has the OLD in_pcb stuff, set the
3512 * v6-related flags on the new guy, too. This is
3513 * done particularly for the case where an AF_INET6
3514 * socket is bound only to a port, and a v4 connection
3515 * comes in on that port.
3516 * we also copy the flowinfo from the original pcb
3517 * to the new one.
3518 */
3519 oso = so;
3520 so = sonewconn(so, SS_ISCONNECTED);
3521 if (so == NULL)
3522 goto resetandabort;
3523
3524 switch (so->so_proto->pr_domain->dom_family) {
3525 #ifdef INET
3526 case AF_INET:
3527 inp = sotoinpcb(so);
3528 break;
3529 #endif
3530 #ifdef INET6
3531 case AF_INET6:
3532 in6p = sotoin6pcb(so);
3533 break;
3534 #endif
3535 }
3536 switch (src->sa_family) {
3537 #ifdef INET
3538 case AF_INET:
3539 if (inp) {
3540 inp->inp_laddr = ((struct sockaddr_in *)dst)->sin_addr;
3541 inp->inp_lport = ((struct sockaddr_in *)dst)->sin_port;
3542 inp->inp_options = ip_srcroute();
3543 in_pcbstate(inp, INP_BOUND);
3544 if (inp->inp_options == NULL) {
3545 inp->inp_options = sc->sc_ipopts;
3546 sc->sc_ipopts = NULL;
3547 }
3548 }
3549 #ifdef INET6
3550 else if (in6p) {
3551 /* IPv4 packet to AF_INET6 socket */
3552 bzero(&in6p->in6p_laddr, sizeof(in6p->in6p_laddr));
3553 in6p->in6p_laddr.s6_addr16[5] = htons(0xffff);
3554 bcopy(&((struct sockaddr_in *)dst)->sin_addr,
3555 &in6p->in6p_laddr.s6_addr32[3],
3556 sizeof(((struct sockaddr_in *)dst)->sin_addr));
3557 in6p->in6p_lport = ((struct sockaddr_in *)dst)->sin_port;
3558 in6totcpcb(in6p)->t_family = AF_INET;
3559 if (sotoin6pcb(oso)->in6p_flags & IN6P_IPV6_V6ONLY)
3560 in6p->in6p_flags |= IN6P_IPV6_V6ONLY;
3561 else
3562 in6p->in6p_flags &= ~IN6P_IPV6_V6ONLY;
3563 in6_pcbstate(in6p, IN6P_BOUND);
3564 }
3565 #endif
3566 break;
3567 #endif
3568 #ifdef INET6
3569 case AF_INET6:
3570 if (in6p) {
3571 in6p->in6p_laddr = ((struct sockaddr_in6 *)dst)->sin6_addr;
3572 in6p->in6p_lport = ((struct sockaddr_in6 *)dst)->sin6_port;
3573 in6_pcbstate(in6p, IN6P_BOUND);
3574 }
3575 break;
3576 #endif
3577 }
3578 #ifdef INET6
3579 if (in6p && in6totcpcb(in6p)->t_family == AF_INET6 && sotoinpcb(oso)) {
3580 struct in6pcb *oin6p = sotoin6pcb(oso);
3581 /* inherit socket options from the listening socket */
3582 in6p->in6p_flags |= (oin6p->in6p_flags & IN6P_CONTROLOPTS);
3583 if (in6p->in6p_flags & IN6P_CONTROLOPTS) {
3584 m_freem(in6p->in6p_options);
3585 in6p->in6p_options = 0;
3586 }
3587 ip6_savecontrol(in6p, &in6p->in6p_options,
3588 mtod(m, struct ip6_hdr *), m);
3589 }
3590 #endif
3591
3592 #if defined(IPSEC) || defined(FAST_IPSEC)
3593 /*
3594 * we make a copy of policy, instead of sharing the policy,
3595 * for better behavior in terms of SA lookup and dead SA removal.
3596 */
3597 if (inp) {
3598 /* copy old policy into new socket's */
3599 if (ipsec_copy_pcbpolicy(sotoinpcb(oso)->inp_sp, inp->inp_sp))
3600 printf("tcp_input: could not copy policy\n");
3601 }
3602 #ifdef INET6
3603 else if (in6p) {
3604 /* copy old policy into new socket's */
3605 if (ipsec_copy_pcbpolicy(sotoin6pcb(oso)->in6p_sp,
3606 in6p->in6p_sp))
3607 printf("tcp_input: could not copy policy\n");
3608 }
3609 #endif
3610 #endif
3611
3612 /*
3613 * Give the new socket our cached route reference.
3614 */
3615 if (inp)
3616 inp->inp_route = sc->sc_route4; /* struct assignment */
3617 #ifdef INET6
3618 else
3619 in6p->in6p_route = sc->sc_route6;
3620 #endif
3621 sc->sc_route4.ro_rt = NULL;
3622
3623 am = m_get(M_DONTWAIT, MT_SONAME); /* XXX */
3624 if (am == NULL)
3625 goto resetandabort;
3626 MCLAIM(am, &tcp_mowner);
3627 am->m_len = src->sa_len;
3628 bcopy(src, mtod(am, caddr_t), src->sa_len);
3629 if (inp) {
3630 if (in_pcbconnect(inp, am)) {
3631 (void) m_free(am);
3632 goto resetandabort;
3633 }
3634 }
3635 #ifdef INET6
3636 else if (in6p) {
3637 if (src->sa_family == AF_INET) {
3638 /* IPv4 packet to AF_INET6 socket */
3639 struct sockaddr_in6 *sin6;
3640 sin6 = mtod(am, struct sockaddr_in6 *);
3641 am->m_len = sizeof(*sin6);
3642 bzero(sin6, sizeof(*sin6));
3643 sin6->sin6_family = AF_INET6;
3644 sin6->sin6_len = sizeof(*sin6);
3645 sin6->sin6_port = ((struct sockaddr_in *)src)->sin_port;
3646 sin6->sin6_addr.s6_addr16[5] = htons(0xffff);
3647 bcopy(&((struct sockaddr_in *)src)->sin_addr,
3648 &sin6->sin6_addr.s6_addr32[3],
3649 sizeof(sin6->sin6_addr.s6_addr32[3]));
3650 }
3651 if (in6_pcbconnect(in6p, am)) {
3652 (void) m_free(am);
3653 goto resetandabort;
3654 }
3655 }
3656 #endif
3657 else {
3658 (void) m_free(am);
3659 goto resetandabort;
3660 }
3661 (void) m_free(am);
3662
3663 if (inp)
3664 tp = intotcpcb(inp);
3665 #ifdef INET6
3666 else if (in6p)
3667 tp = in6totcpcb(in6p);
3668 #endif
3669 else
3670 tp = NULL;
3671 tp->t_flags = sototcpcb(oso)->t_flags & TF_NODELAY;
3672 if (sc->sc_request_r_scale != 15) {
3673 tp->requested_s_scale = sc->sc_requested_s_scale;
3674 tp->request_r_scale = sc->sc_request_r_scale;
3675 tp->snd_scale = sc->sc_requested_s_scale;
3676 tp->rcv_scale = sc->sc_request_r_scale;
3677 tp->t_flags |= TF_REQ_SCALE|TF_RCVD_SCALE;
3678 }
3679 if (sc->sc_flags & SCF_TIMESTAMP)
3680 tp->t_flags |= TF_REQ_TSTMP|TF_RCVD_TSTMP;
3681 tp->ts_timebase = sc->sc_timebase;
3682
3683 tp->t_template = tcp_template(tp);
3684 if (tp->t_template == 0) {
3685 tp = tcp_drop(tp, ENOBUFS); /* destroys socket */
3686 so = NULL;
3687 m_freem(m);
3688 goto abort;
3689 }
3690
3691 tp->iss = sc->sc_iss;
3692 tp->irs = sc->sc_irs;
3693 tcp_sendseqinit(tp);
3694 tcp_rcvseqinit(tp);
3695 tp->t_state = TCPS_SYN_RECEIVED;
3696 TCP_TIMER_ARM(tp, TCPT_KEEP, TCPTV_KEEP_INIT);
3697 tcpstat.tcps_accepts++;
3698
3699 if ((sc->sc_flags & SCF_SACK_PERMIT) && tcp_do_sack)
3700 tp->t_flags |= TF_WILL_SACK;
3701
3702 #ifdef TCP_SIGNATURE
3703 if (sc->sc_flags & SCF_SIGNATURE)
3704 tp->t_flags |= TF_SIGNATURE;
3705 #endif
3706
3707 /* Initialize tp->t_ourmss before we deal with the peer's! */
3708 tp->t_ourmss = sc->sc_ourmaxseg;
3709 tcp_mss_from_peer(tp, sc->sc_peermaxseg);
3710
3711 /*
3712 * Initialize the initial congestion window. If we
3713 * had to retransmit the SYN,ACK, we must initialize cwnd
3714 * to 1 segment (i.e. the Loss Window).
3715 */
3716 if (sc->sc_rxtshift)
3717 tp->snd_cwnd = tp->t_peermss;
3718 else {
3719 int ss = tcp_init_win;
3720 #ifdef INET
3721 if (inp != NULL && in_localaddr(inp->inp_faddr))
3722 ss = tcp_init_win_local;
3723 #endif
3724 #ifdef INET6
3725 if (in6p != NULL && in6_localaddr(&in6p->in6p_faddr))
3726 ss = tcp_init_win_local;
3727 #endif
3728 tp->snd_cwnd = TCP_INITIAL_WINDOW(ss, tp->t_peermss);
3729 }
3730
3731 tcp_rmx_rtt(tp);
3732 tp->snd_wl1 = sc->sc_irs;
3733 tp->rcv_up = sc->sc_irs + 1;
3734
3735 /*
3736 * This is what whould have happened in tcp_output() when
3737 * the SYN,ACK was sent.
3738 */
3739 tp->snd_up = tp->snd_una;
3740 tp->snd_max = tp->snd_nxt = tp->iss+1;
3741 TCP_TIMER_ARM(tp, TCPT_REXMT, tp->t_rxtcur);
3742 if (sc->sc_win > 0 && SEQ_GT(tp->rcv_nxt + sc->sc_win, tp->rcv_adv))
3743 tp->rcv_adv = tp->rcv_nxt + sc->sc_win;
3744 tp->last_ack_sent = tp->rcv_nxt;
3745 tp->t_partialacks = -1;
3746 tp->t_dupacks = 0;
3747
3748 tcpstat.tcps_sc_completed++;
3749 SYN_CACHE_PUT(sc);
3750 return (so);
3751
3752 resetandabort:
3753 (void)tcp_respond(NULL, m, m, th, (tcp_seq)0, th->th_ack, TH_RST);
3754 abort:
3755 if (so != NULL)
3756 (void) soabort(so);
3757 SYN_CACHE_PUT(sc);
3758 tcpstat.tcps_sc_aborted++;
3759 return ((struct socket *)(-1));
3760 }
3761
3762 /*
3763 * This function is called when we get a RST for a
3764 * non-existent connection, so that we can see if the
3765 * connection is in the syn cache. If it is, zap it.
3766 */
3767
3768 void
3769 syn_cache_reset(struct sockaddr *src, struct sockaddr *dst, struct tcphdr *th)
3770 {
3771 struct syn_cache *sc;
3772 struct syn_cache_head *scp;
3773 int s = splsoftnet();
3774
3775 if ((sc = syn_cache_lookup(src, dst, &scp)) == NULL) {
3776 splx(s);
3777 return;
3778 }
3779 if (SEQ_LT(th->th_seq, sc->sc_irs) ||
3780 SEQ_GT(th->th_seq, sc->sc_irs+1)) {
3781 splx(s);
3782 return;
3783 }
3784 SYN_CACHE_RM(sc);
3785 splx(s);
3786 tcpstat.tcps_sc_reset++;
3787 SYN_CACHE_PUT(sc);
3788 }
3789
3790 void
3791 syn_cache_unreach(struct sockaddr *src, struct sockaddr *dst,
3792 struct tcphdr *th)
3793 {
3794 struct syn_cache *sc;
3795 struct syn_cache_head *scp;
3796 int s;
3797
3798 s = splsoftnet();
3799 if ((sc = syn_cache_lookup(src, dst, &scp)) == NULL) {
3800 splx(s);
3801 return;
3802 }
3803 /* If the sequence number != sc_iss, then it's a bogus ICMP msg */
3804 if (ntohl (th->th_seq) != sc->sc_iss) {
3805 splx(s);
3806 return;
3807 }
3808
3809 /*
3810 * If we've retransmitted 3 times and this is our second error,
3811 * we remove the entry. Otherwise, we allow it to continue on.
3812 * This prevents us from incorrectly nuking an entry during a
3813 * spurious network outage.
3814 *
3815 * See tcp_notify().
3816 */
3817 if ((sc->sc_flags & SCF_UNREACH) == 0 || sc->sc_rxtshift < 3) {
3818 sc->sc_flags |= SCF_UNREACH;
3819 splx(s);
3820 return;
3821 }
3822
3823 SYN_CACHE_RM(sc);
3824 splx(s);
3825 tcpstat.tcps_sc_unreach++;
3826 SYN_CACHE_PUT(sc);
3827 }
3828
3829 /*
3830 * Given a LISTEN socket and an inbound SYN request, add
3831 * this to the syn cache, and send back a segment:
3832 * <SEQ=ISS><ACK=RCV_NXT><CTL=SYN,ACK>
3833 * to the source.
3834 *
3835 * IMPORTANT NOTE: We do _NOT_ ACK data that might accompany the SYN.
3836 * Doing so would require that we hold onto the data and deliver it
3837 * to the application. However, if we are the target of a SYN-flood
3838 * DoS attack, an attacker could send data which would eventually
3839 * consume all available buffer space if it were ACKed. By not ACKing
3840 * the data, we avoid this DoS scenario.
3841 */
3842
3843 int
3844 syn_cache_add(struct sockaddr *src, struct sockaddr *dst, struct tcphdr *th,
3845 unsigned int hlen, struct socket *so, struct mbuf *m, u_char *optp,
3846 int optlen, struct tcp_opt_info *oi)
3847 {
3848 struct tcpcb tb, *tp;
3849 long win;
3850 struct syn_cache *sc;
3851 struct syn_cache_head *scp;
3852 struct mbuf *ipopts;
3853 struct tcp_opt_info opti;
3854
3855 tp = sototcpcb(so);
3856
3857 bzero(&opti, sizeof(opti));
3858
3859 /*
3860 * RFC1122 4.2.3.10, p. 104: discard bcast/mcast SYN
3861 *
3862 * Note this check is performed in tcp_input() very early on.
3863 */
3864
3865 /*
3866 * Initialize some local state.
3867 */
3868 win = sbspace(&so->so_rcv);
3869 if (win > TCP_MAXWIN)
3870 win = TCP_MAXWIN;
3871
3872 switch (src->sa_family) {
3873 #ifdef INET
3874 case AF_INET:
3875 /*
3876 * Remember the IP options, if any.
3877 */
3878 ipopts = ip_srcroute();
3879 break;
3880 #endif
3881 default:
3882 ipopts = NULL;
3883 }
3884
3885 #ifdef TCP_SIGNATURE
3886 if (optp || (tp->t_flags & TF_SIGNATURE))
3887 #else
3888 if (optp)
3889 #endif
3890 {
3891 tb.t_flags = tcp_do_rfc1323 ? (TF_REQ_SCALE|TF_REQ_TSTMP) : 0;
3892 #ifdef TCP_SIGNATURE
3893 tb.t_flags |= (tp->t_flags & TF_SIGNATURE);
3894 #endif
3895 if (tcp_dooptions(&tb, optp, optlen, th, m, m->m_pkthdr.len -
3896 sizeof(struct tcphdr) - optlen - hlen, oi) < 0)
3897 return (0);
3898 } else
3899 tb.t_flags = 0;
3900
3901 /*
3902 * See if we already have an entry for this connection.
3903 * If we do, resend the SYN,ACK. We do not count this
3904 * as a retransmission (XXX though maybe we should).
3905 */
3906 if ((sc = syn_cache_lookup(src, dst, &scp)) != NULL) {
3907 tcpstat.tcps_sc_dupesyn++;
3908 if (ipopts) {
3909 /*
3910 * If we were remembering a previous source route,
3911 * forget it and use the new one we've been given.
3912 */
3913 if (sc->sc_ipopts)
3914 (void) m_free(sc->sc_ipopts);
3915 sc->sc_ipopts = ipopts;
3916 }
3917 sc->sc_timestamp = tb.ts_recent;
3918 if (syn_cache_respond(sc, m) == 0) {
3919 tcpstat.tcps_sndacks++;
3920 tcpstat.tcps_sndtotal++;
3921 }
3922 return (1);
3923 }
3924
3925 sc = pool_get(&syn_cache_pool, PR_NOWAIT);
3926 if (sc == NULL) {
3927 if (ipopts)
3928 (void) m_free(ipopts);
3929 return (0);
3930 }
3931
3932 /*
3933 * Fill in the cache, and put the necessary IP and TCP
3934 * options into the reply.
3935 */
3936 bzero(sc, sizeof(struct syn_cache));
3937 callout_init(&sc->sc_timer);
3938 bcopy(src, &sc->sc_src, src->sa_len);
3939 bcopy(dst, &sc->sc_dst, dst->sa_len);
3940 sc->sc_flags = 0;
3941 sc->sc_ipopts = ipopts;
3942 sc->sc_irs = th->th_seq;
3943 switch (src->sa_family) {
3944 #ifdef INET
3945 case AF_INET:
3946 {
3947 struct sockaddr_in *srcin = (void *) src;
3948 struct sockaddr_in *dstin = (void *) dst;
3949
3950 sc->sc_iss = tcp_new_iss1(&dstin->sin_addr,
3951 &srcin->sin_addr, dstin->sin_port,
3952 srcin->sin_port, sizeof(dstin->sin_addr), 0);
3953 break;
3954 }
3955 #endif /* INET */
3956 #ifdef INET6
3957 case AF_INET6:
3958 {
3959 struct sockaddr_in6 *srcin6 = (void *) src;
3960 struct sockaddr_in6 *dstin6 = (void *) dst;
3961
3962 sc->sc_iss = tcp_new_iss1(&dstin6->sin6_addr,
3963 &srcin6->sin6_addr, dstin6->sin6_port,
3964 srcin6->sin6_port, sizeof(dstin6->sin6_addr), 0);
3965 break;
3966 }
3967 #endif /* INET6 */
3968 }
3969 sc->sc_peermaxseg = oi->maxseg;
3970 sc->sc_ourmaxseg = tcp_mss_to_advertise(m->m_flags & M_PKTHDR ?
3971 m->m_pkthdr.rcvif : NULL,
3972 sc->sc_src.sa.sa_family);
3973 sc->sc_win = win;
3974 sc->sc_timebase = tcp_now; /* see tcp_newtcpcb() */
3975 sc->sc_timestamp = tb.ts_recent;
3976 if ((tb.t_flags & (TF_REQ_TSTMP|TF_RCVD_TSTMP)) ==
3977 (TF_REQ_TSTMP|TF_RCVD_TSTMP))
3978 sc->sc_flags |= SCF_TIMESTAMP;
3979 if ((tb.t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
3980 (TF_RCVD_SCALE|TF_REQ_SCALE)) {
3981 sc->sc_requested_s_scale = tb.requested_s_scale;
3982 sc->sc_request_r_scale = 0;
3983 while (sc->sc_request_r_scale < TCP_MAX_WINSHIFT &&
3984 TCP_MAXWIN << sc->sc_request_r_scale <
3985 so->so_rcv.sb_hiwat)
3986 sc->sc_request_r_scale++;
3987 } else {
3988 sc->sc_requested_s_scale = 15;
3989 sc->sc_request_r_scale = 15;
3990 }
3991 if ((tb.t_flags & TF_SACK_PERMIT) && tcp_do_sack)
3992 sc->sc_flags |= SCF_SACK_PERMIT;
3993 #ifdef TCP_SIGNATURE
3994 if (tb.t_flags & TF_SIGNATURE)
3995 sc->sc_flags |= SCF_SIGNATURE;
3996 #endif
3997 sc->sc_tp = tp;
3998 if (syn_cache_respond(sc, m) == 0) {
3999 syn_cache_insert(sc, tp);
4000 tcpstat.tcps_sndacks++;
4001 tcpstat.tcps_sndtotal++;
4002 } else {
4003 SYN_CACHE_PUT(sc);
4004 tcpstat.tcps_sc_dropped++;
4005 }
4006 return (1);
4007 }
4008
4009 int
4010 syn_cache_respond(struct syn_cache *sc, struct mbuf *m)
4011 {
4012 struct route *ro;
4013 u_int8_t *optp;
4014 int optlen, error;
4015 u_int16_t tlen;
4016 struct ip *ip = NULL;
4017 #ifdef INET6
4018 struct ip6_hdr *ip6 = NULL;
4019 #endif
4020 struct tcpcb *tp;
4021 struct tcphdr *th;
4022 u_int hlen;
4023 struct socket *so;
4024
4025 switch (sc->sc_src.sa.sa_family) {
4026 case AF_INET:
4027 hlen = sizeof(struct ip);
4028 ro = &sc->sc_route4;
4029 break;
4030 #ifdef INET6
4031 case AF_INET6:
4032 hlen = sizeof(struct ip6_hdr);
4033 ro = (struct route *)&sc->sc_route6;
4034 break;
4035 #endif
4036 default:
4037 if (m)
4038 m_freem(m);
4039 return (EAFNOSUPPORT);
4040 }
4041
4042 /* Compute the size of the TCP options. */
4043 optlen = 4 + (sc->sc_request_r_scale != 15 ? 4 : 0) +
4044 ((sc->sc_flags & SCF_SACK_PERMIT) ? (TCPOLEN_SACK_PERMITTED + 2) : 0) +
4045 #ifdef TCP_SIGNATURE
4046 ((sc->sc_flags & SCF_SIGNATURE) ? (TCPOLEN_SIGNATURE + 2) : 0) +
4047 #endif
4048 ((sc->sc_flags & SCF_TIMESTAMP) ? TCPOLEN_TSTAMP_APPA : 0);
4049
4050 tlen = hlen + sizeof(struct tcphdr) + optlen;
4051
4052 /*
4053 * Create the IP+TCP header from scratch.
4054 */
4055 if (m)
4056 m_freem(m);
4057 #ifdef DIAGNOSTIC
4058 if (max_linkhdr + tlen > MCLBYTES)
4059 return (ENOBUFS);
4060 #endif
4061 MGETHDR(m, M_DONTWAIT, MT_DATA);
4062 if (m && tlen > MHLEN) {
4063 MCLGET(m, M_DONTWAIT);
4064 if ((m->m_flags & M_EXT) == 0) {
4065 m_freem(m);
4066 m = NULL;
4067 }
4068 }
4069 if (m == NULL)
4070 return (ENOBUFS);
4071 MCLAIM(m, &tcp_tx_mowner);
4072
4073 /* Fixup the mbuf. */
4074 m->m_data += max_linkhdr;
4075 m->m_len = m->m_pkthdr.len = tlen;
4076 if (sc->sc_tp) {
4077 tp = sc->sc_tp;
4078 if (tp->t_inpcb)
4079 so = tp->t_inpcb->inp_socket;
4080 #ifdef INET6
4081 else if (tp->t_in6pcb)
4082 so = tp->t_in6pcb->in6p_socket;
4083 #endif
4084 else
4085 so = NULL;
4086 } else
4087 so = NULL;
4088 m->m_pkthdr.rcvif = NULL;
4089 memset(mtod(m, u_char *), 0, tlen);
4090
4091 switch (sc->sc_src.sa.sa_family) {
4092 case AF_INET:
4093 ip = mtod(m, struct ip *);
4094 ip->ip_v = 4;
4095 ip->ip_dst = sc->sc_src.sin.sin_addr;
4096 ip->ip_src = sc->sc_dst.sin.sin_addr;
4097 ip->ip_p = IPPROTO_TCP;
4098 th = (struct tcphdr *)(ip + 1);
4099 th->th_dport = sc->sc_src.sin.sin_port;
4100 th->th_sport = sc->sc_dst.sin.sin_port;
4101 break;
4102 #ifdef INET6
4103 case AF_INET6:
4104 ip6 = mtod(m, struct ip6_hdr *);
4105 ip6->ip6_vfc = IPV6_VERSION;
4106 ip6->ip6_dst = sc->sc_src.sin6.sin6_addr;
4107 ip6->ip6_src = sc->sc_dst.sin6.sin6_addr;
4108 ip6->ip6_nxt = IPPROTO_TCP;
4109 /* ip6_plen will be updated in ip6_output() */
4110 th = (struct tcphdr *)(ip6 + 1);
4111 th->th_dport = sc->sc_src.sin6.sin6_port;
4112 th->th_sport = sc->sc_dst.sin6.sin6_port;
4113 break;
4114 #endif
4115 default:
4116 th = NULL;
4117 }
4118
4119 th->th_seq = htonl(sc->sc_iss);
4120 th->th_ack = htonl(sc->sc_irs + 1);
4121 th->th_off = (sizeof(struct tcphdr) + optlen) >> 2;
4122 th->th_flags = TH_SYN|TH_ACK;
4123 th->th_win = htons(sc->sc_win);
4124 /* th_sum already 0 */
4125 /* th_urp already 0 */
4126
4127 /* Tack on the TCP options. */
4128 optp = (u_int8_t *)(th + 1);
4129 *optp++ = TCPOPT_MAXSEG;
4130 *optp++ = 4;
4131 *optp++ = (sc->sc_ourmaxseg >> 8) & 0xff;
4132 *optp++ = sc->sc_ourmaxseg & 0xff;
4133
4134 if (sc->sc_request_r_scale != 15) {
4135 *((u_int32_t *)optp) = htonl(TCPOPT_NOP << 24 |
4136 TCPOPT_WINDOW << 16 | TCPOLEN_WINDOW << 8 |
4137 sc->sc_request_r_scale);
4138 optp += 4;
4139 }
4140
4141 if (sc->sc_flags & SCF_TIMESTAMP) {
4142 u_int32_t *lp = (u_int32_t *)(optp);
4143 /* Form timestamp option as shown in appendix A of RFC 1323. */
4144 *lp++ = htonl(TCPOPT_TSTAMP_HDR);
4145 *lp++ = htonl(SYN_CACHE_TIMESTAMP(sc));
4146 *lp = htonl(sc->sc_timestamp);
4147 optp += TCPOLEN_TSTAMP_APPA;
4148 }
4149
4150 if (sc->sc_flags & SCF_SACK_PERMIT) {
4151 u_int8_t *p = optp;
4152
4153 /* Let the peer know that we will SACK. */
4154 p[0] = TCPOPT_SACK_PERMITTED;
4155 p[1] = 2;
4156 p[2] = TCPOPT_NOP;
4157 p[3] = TCPOPT_NOP;
4158 optp += 4;
4159 }
4160
4161 #ifdef TCP_SIGNATURE
4162 if (sc->sc_flags & SCF_SIGNATURE) {
4163 struct secasvar *sav;
4164 u_int8_t *sigp;
4165
4166 sav = tcp_signature_getsav(m, th);
4167
4168 if (sav == NULL) {
4169 if (m)
4170 m_freem(m);
4171 return (EPERM);
4172 }
4173
4174 *optp++ = TCPOPT_SIGNATURE;
4175 *optp++ = TCPOLEN_SIGNATURE;
4176 sigp = optp;
4177 bzero(optp, TCP_SIGLEN);
4178 optp += TCP_SIGLEN;
4179 *optp++ = TCPOPT_NOP;
4180 *optp++ = TCPOPT_EOL;
4181
4182 (void)tcp_signature(m, th, hlen, sav, sigp);
4183
4184 key_sa_recordxfer(sav, m);
4185 #ifdef FAST_IPSEC
4186 KEY_FREESAV(&sav);
4187 #else
4188 key_freesav(sav);
4189 #endif
4190 }
4191 #endif
4192
4193 /* Compute the packet's checksum. */
4194 switch (sc->sc_src.sa.sa_family) {
4195 case AF_INET:
4196 ip->ip_len = htons(tlen - hlen);
4197 th->th_sum = 0;
4198 th->th_sum = in4_cksum(m, IPPROTO_TCP, hlen, tlen - hlen);
4199 break;
4200 #ifdef INET6
4201 case AF_INET6:
4202 ip6->ip6_plen = htons(tlen - hlen);
4203 th->th_sum = 0;
4204 th->th_sum = in6_cksum(m, IPPROTO_TCP, hlen, tlen - hlen);
4205 break;
4206 #endif
4207 }
4208
4209 /*
4210 * Fill in some straggling IP bits. Note the stack expects
4211 * ip_len to be in host order, for convenience.
4212 */
4213 switch (sc->sc_src.sa.sa_family) {
4214 #ifdef INET
4215 case AF_INET:
4216 ip->ip_len = htons(tlen);
4217 ip->ip_ttl = ip_defttl;
4218 /* XXX tos? */
4219 break;
4220 #endif
4221 #ifdef INET6
4222 case AF_INET6:
4223 ip6->ip6_vfc &= ~IPV6_VERSION_MASK;
4224 ip6->ip6_vfc |= IPV6_VERSION;
4225 ip6->ip6_plen = htons(tlen - hlen);
4226 /* ip6_hlim will be initialized afterwards */
4227 /* XXX flowlabel? */
4228 break;
4229 #endif
4230 }
4231
4232 /* XXX use IPsec policy on listening socket, on SYN ACK */
4233 tp = sc->sc_tp;
4234
4235 switch (sc->sc_src.sa.sa_family) {
4236 #ifdef INET
4237 case AF_INET:
4238 error = ip_output(m, sc->sc_ipopts, ro,
4239 (ip_mtudisc ? IP_MTUDISC : 0),
4240 (struct ip_moptions *)NULL, so);
4241 break;
4242 #endif
4243 #ifdef INET6
4244 case AF_INET6:
4245 ip6->ip6_hlim = in6_selecthlim(NULL,
4246 ro->ro_rt ? ro->ro_rt->rt_ifp : NULL);
4247
4248 error = ip6_output(m, NULL /*XXX*/, (struct route_in6 *)ro, 0,
4249 (struct ip6_moptions *)0, so, NULL);
4250 break;
4251 #endif
4252 default:
4253 error = EAFNOSUPPORT;
4254 break;
4255 }
4256 return (error);
4257 }
4258