/*	$NetBSD: tcp_input.c,v 1.80 1999/04/29 03:54:22 thorpej Exp $	*/

/*-
 * Copyright (c) 1997, 1998, 1999 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Jason R. Thorpe and Kevin M. Lahey of the Numerical Aerospace Simulation
 * Facility, NASA Ames Research Center.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the NetBSD
 *	Foundation, Inc. and its contributors.
 * 4. Neither the name of The NetBSD Foundation nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)tcp_input.c	8.12 (Berkeley) 5/24/95
 */

/*
 * TODO list for SYN cache stuff:
 *
 *	Find room for a "state" field, which is needed to keep a
 *	compressed state for TIME_WAIT TCBs.  It's been noted already
 *	that this is fairly important for very high-volume web and
 *	mail servers, which use a large number of short-lived
 *	connections.
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/errno.h>
#include <sys/syslog.h>
#include <sys/pool.h>

#include <net/if.h>
#include <net/route.h>

#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <netinet/ip.h>
#include <netinet/in_pcb.h>
#include <netinet/ip_var.h>
#include <netinet/tcp.h>
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet/tcpip.h>
#include <netinet/tcp_debug.h>

#include <machine/stdarg.h>

int	tcprexmtthresh = 3;
struct	tcpiphdr tcp_saveti;

#define TCP_PAWS_IDLE	(24 * 24 * 60 * 60 * PR_SLOWHZ)
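/*
 * TCP_PAWS_IDLE is 24 days expressed in PR_SLOWHZ ticks:
 * 24 (days) * 24 * 60 * 60 (seconds per day), scaled by the slow
 * timeout rate.  A ts_recent older than this is considered invalid;
 * see the PAWS check in tcp_input() below.
 */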

/* for modulo comparisons of timestamps */
#define TSTMP_LT(a,b)	((int)((a)-(b)) < 0)
#define TSTMP_GEQ(a,b)	((int)((a)-(b)) >= 0)
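/*
 * These compare correctly even across 32-bit wraparound, as long as
 * the two values are within 2^31 ticks of each other.  For example,
 * with a = 1 and b = 0xfffffffe, (int)((a)-(b)) == 3 > 0, so
 * TSTMP_GEQ(a, b) holds although a < b as unsigned values.
 */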

/*
 * Macro to compute ACK transmission behavior.  Delay the ACK unless
 * we have already delayed an ACK (must send an ACK every two segments).
 * We also ACK immediately if we received a PUSH and the ACK-on-PUSH
 * option is enabled.
 */
#define	TCP_SETUP_ACK(tp, ti) \
do { \
	if ((tp)->t_flags & TF_DELACK || \
	    (tcp_ack_on_push && (ti)->ti_flags & TH_PUSH)) \
		tp->t_flags |= TF_ACKNOW; \
	else \
		TCP_SET_DELACK(tp); \
} while (0)
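/*
 * With tcp_ack_on_push disabled, this yields the usual pattern: the
 * first in-order segment merely sets the delayed-ACK flag, and the
 * next one finds TF_DELACK already set and forces TF_ACKNOW, so an
 * ACK is sent for every second data segment.
 */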

/*
 * Insert segment ti into reassembly queue of tcp with
 * control block tp.  Return TH_FIN if reassembly now includes
 * a segment with FIN.  The macro form does the common case inline
 * (segment is the next to be received on an established connection,
 * and the queue is empty), avoiding linkage into and removal
 * from the queue and repetition of various conversions.
 * Set DELACK for segments received in order, but ack immediately
 * when segments are out of order (so fast retransmit can work).
 */
#define	TCP_REASS(tp, ti, m, so, flags) { \
	TCP_REASS_LOCK((tp)); \
	if ((ti)->ti_seq == (tp)->rcv_nxt && \
	    (tp)->segq.lh_first == NULL && \
	    (tp)->t_state == TCPS_ESTABLISHED) { \
		TCP_SETUP_ACK(tp, ti); \
		(tp)->rcv_nxt += (ti)->ti_len; \
		flags = (ti)->ti_flags & TH_FIN; \
		tcpstat.tcps_rcvpack++; \
		tcpstat.tcps_rcvbyte += (ti)->ti_len; \
		sbappend(&(so)->so_rcv, (m)); \
		sorwakeup(so); \
	} else { \
		(flags) = tcp_reass((tp), (ti), (m)); \
		tp->t_flags |= TF_ACKNOW; \
	} \
	TCP_REASS_UNLOCK((tp)); \
}

int
tcp_reass(tp, ti, m)
	register struct tcpcb *tp;
	register struct tcpiphdr *ti;
	struct mbuf *m;
{
	register struct ipqent *p, *q, *nq, *tiqe = NULL;
	struct socket *so = tp->t_inpcb->inp_socket;
	int pkt_flags;
	tcp_seq pkt_seq;
	unsigned pkt_len;
	u_long rcvpartdupbyte = 0;
	u_long rcvoobyte;

	TCP_REASS_LOCK_CHECK(tp);

	/*
	 * Call with ti==0 after becoming established to
	 * force pre-ESTABLISHED data up to the user socket.
	 */
	if (ti == 0)
		goto present;

	rcvoobyte = ti->ti_len;
	/*
	 * Copy these to local variables because the tcpiphdr
	 * gets munged while we are collapsing mbufs.
	 */
	pkt_seq = ti->ti_seq;
	pkt_len = ti->ti_len;
	pkt_flags = ti->ti_flags;
	/*
	 * Find a segment which begins after this one does.
	 */
	for (p = NULL, q = tp->segq.lh_first; q != NULL; q = nq) {
		nq = q->ipqe_q.le_next;
		/*
		 * If the received segment is just right after this
		 * fragment, merge the two together and then check
		 * for further overlaps.
		 */
		if (q->ipqe_seq + q->ipqe_len == pkt_seq) {
#ifdef TCPREASS_DEBUG
			printf("tcp_reass[%p]: concat %u:%u(%u) to %u:%u(%u)\n",
			    tp, pkt_seq, pkt_seq + pkt_len, pkt_len,
			    q->ipqe_seq, q->ipqe_seq + q->ipqe_len, q->ipqe_len);
#endif
			pkt_len += q->ipqe_len;
			pkt_flags |= q->ipqe_flags;
			pkt_seq = q->ipqe_seq;
			m_cat(q->ipqe_m, m);
			m = q->ipqe_m;
			goto free_ipqe;
		}
		/*
		 * If the received segment is completely past this
		 * fragment, we need to go to the next fragment.
		 */
		if (SEQ_LT(q->ipqe_seq + q->ipqe_len, pkt_seq)) {
			p = q;
			continue;
		}
		/*
		 * If the fragment is past the received segment,
		 * it (or any following) can't be concatenated.
		 */
		if (SEQ_GT(q->ipqe_seq, pkt_seq + pkt_len))
			break;
		/*
		 * We've received all the data in this segment before;
		 * mark it as a duplicate and return.
		 */
		if (SEQ_LEQ(q->ipqe_seq, pkt_seq) &&
		    SEQ_GEQ(q->ipqe_seq + q->ipqe_len, pkt_seq + pkt_len)) {
			tcpstat.tcps_rcvduppack++;
			tcpstat.tcps_rcvdupbyte += pkt_len;
			m_freem(m);
			if (tiqe != NULL)
				pool_put(&ipqent_pool, tiqe);
			return (0);
		}
		/*
		 * Received segment completely overlaps this fragment,
		 * so we drop the fragment (this keeps the temporal
		 * ordering of segments correct).
		 */
		if (SEQ_GEQ(q->ipqe_seq, pkt_seq) &&
		    SEQ_LEQ(q->ipqe_seq + q->ipqe_len, pkt_seq + pkt_len)) {
			rcvpartdupbyte += q->ipqe_len;
			m_freem(q->ipqe_m);
			goto free_ipqe;
		}
		/*
		 * RX'ed segment extends past the end of the
		 * fragment.  Drop the overlapping bytes, then merge
		 * the fragment and segment and treat the result as a
		 * longer received packet.
		 */
		if (SEQ_LT(q->ipqe_seq, pkt_seq)
		    && SEQ_GT(q->ipqe_seq + q->ipqe_len, pkt_seq)) {
			int overlap = q->ipqe_seq + q->ipqe_len - pkt_seq;
#ifdef TCPREASS_DEBUG
			printf("tcp_reass[%p]: trim starting %d bytes of %u:%u(%u)\n",
			    tp, overlap,
			    pkt_seq, pkt_seq + pkt_len, pkt_len);
#endif
			m_adj(m, overlap);
			rcvpartdupbyte += overlap;
			m_cat(q->ipqe_m, m);
			m = q->ipqe_m;
			pkt_seq = q->ipqe_seq;
			pkt_len += q->ipqe_len - overlap;
			rcvoobyte -= overlap;
			goto free_ipqe;
		}
		/*
		 * RX'ed segment extends past the front of the
		 * fragment.  Drop the overlapping bytes on the
		 * received packet.  The packet will then be
		 * concatenated with this fragment a bit later.
		 */
		if (SEQ_GT(q->ipqe_seq, pkt_seq)
		    && SEQ_LT(q->ipqe_seq, pkt_seq + pkt_len)) {
			int overlap = pkt_seq + pkt_len - q->ipqe_seq;
#ifdef TCPREASS_DEBUG
			printf("tcp_reass[%p]: trim trailing %d bytes of %u:%u(%u)\n",
			    tp, overlap,
			    pkt_seq, pkt_seq + pkt_len, pkt_len);
#endif
			m_adj(m, -overlap);
			pkt_len -= overlap;
			rcvpartdupbyte += overlap;
			rcvoobyte -= overlap;
		}
		/*
		 * If the received segment immediately precedes this
		 * fragment, then tack the fragment onto this segment
		 * and reinsert the data.
		 */
		if (q->ipqe_seq == pkt_seq + pkt_len) {
#ifdef TCPREASS_DEBUG
			printf("tcp_reass[%p]: append %u:%u(%u) to %u:%u(%u)\n",
			    tp, q->ipqe_seq, q->ipqe_seq + q->ipqe_len, q->ipqe_len,
			    pkt_seq, pkt_seq + pkt_len, pkt_len);
#endif
			pkt_len += q->ipqe_len;
			pkt_flags |= q->ipqe_flags;
			m_cat(m, q->ipqe_m);
			LIST_REMOVE(q, ipqe_q);
			LIST_REMOVE(q, ipqe_timeq);
			if (tiqe == NULL) {
				tiqe = q;
			} else {
				pool_put(&ipqent_pool, q);
			}
			break;
		}
		/*
		 * If the fragment is before the segment, remember it.
		 * When this loop is terminated, p will contain the
		 * pointer to the fragment that is right before the
		 * received segment.
		 */
		if (SEQ_LEQ(q->ipqe_seq, pkt_seq))
			p = q;

		continue;

		/*
		 * This is a common operation; it also saves a
		 * malloc/free in most instances.
		 */
	free_ipqe:
		LIST_REMOVE(q, ipqe_q);
		LIST_REMOVE(q, ipqe_timeq);
		if (tiqe == NULL) {
			tiqe = q;
		} else {
			pool_put(&ipqent_pool, q);
		}
	}

	/*
	 * Allocate a new queue entry since the received segment did not
	 * collapse onto any other out-of-order block; thus we are allocating
	 * a new block.  If it had collapsed, tiqe would not be NULL and
	 * we would be reusing it.
	 * XXX If we can't, just drop the packet.  XXX
	 */
	if (tiqe == NULL) {
		tiqe = pool_get(&ipqent_pool, PR_NOWAIT);
		if (tiqe == NULL) {
			tcpstat.tcps_rcvmemdrop++;
			m_freem(m);
			return (0);
		}
	}

	/*
	 * Update the counters.
	 */
	tcpstat.tcps_rcvoopack++;
	tcpstat.tcps_rcvoobyte += rcvoobyte;
	if (rcvpartdupbyte) {
		tcpstat.tcps_rcvpartduppack++;
		tcpstat.tcps_rcvpartdupbyte += rcvpartdupbyte;
	}

	/*
	 * Insert the new fragment queue entry into both queues.
	 */
	tiqe->ipqe_m = m;
	tiqe->ipqe_seq = pkt_seq;
	tiqe->ipqe_len = pkt_len;
	tiqe->ipqe_flags = pkt_flags;
	if (p == NULL) {
		LIST_INSERT_HEAD(&tp->segq, tiqe, ipqe_q);
#ifdef TCPREASS_DEBUG
		if (tiqe->ipqe_seq != tp->rcv_nxt)
			printf("tcp_reass[%p]: insert %u:%u(%u) at front\n",
			    tp, pkt_seq, pkt_seq + pkt_len, pkt_len);
#endif
	} else {
		LIST_INSERT_AFTER(p, tiqe, ipqe_q);
#ifdef TCPREASS_DEBUG
		printf("tcp_reass[%p]: insert %u:%u(%u) after %u:%u(%u)\n",
		    tp, pkt_seq, pkt_seq + pkt_len, pkt_len,
		    p->ipqe_seq, p->ipqe_seq + p->ipqe_len, p->ipqe_len);
#endif
	}

	LIST_INSERT_HEAD(&tp->timeq, tiqe, ipqe_timeq);

present:
	/*
	 * Present data to user, advancing rcv_nxt through
	 * completed sequence space.
	 */
	if (TCPS_HAVEESTABLISHED(tp->t_state) == 0)
		return (0);
	q = tp->segq.lh_first;
	if (q == NULL || q->ipqe_seq != tp->rcv_nxt)
		return (0);
	if (tp->t_state == TCPS_SYN_RECEIVED && q->ipqe_len)
		return (0);

	tp->rcv_nxt += q->ipqe_len;
	pkt_flags = q->ipqe_flags & TH_FIN;

	LIST_REMOVE(q, ipqe_q);
	LIST_REMOVE(q, ipqe_timeq);
	if (so->so_state & SS_CANTRCVMORE)
		m_freem(q->ipqe_m);
	else
		sbappend(&so->so_rcv, q->ipqe_m);
	pool_put(&ipqent_pool, q);
	sorwakeup(so);
	return (pkt_flags);
}

/*
 * TCP input routine, follows pages 65-76 of the
 * protocol specification dated September, 1981 very closely.
 */
void
#if __STDC__
tcp_input(struct mbuf *m, ...)
#else
tcp_input(m, va_alist)
	register struct mbuf *m;
#endif
{
	register struct tcpiphdr *ti;
	register struct inpcb *inp;
	caddr_t optp = NULL;
	int optlen = 0;
	int len, tlen, off, hdroptlen;
	register struct tcpcb *tp = 0;
	register int tiflags;
	struct socket *so = NULL;
	int todrop, acked, ourfinisacked, needoutput = 0;
	short ostate = 0;
	int iss = 0;
	u_long tiwin;
	struct tcp_opt_info opti;
	int iphlen;
	va_list ap;

	va_start(ap, m);
	iphlen = va_arg(ap, int);
	va_end(ap);

	tcpstat.tcps_rcvtotal++;

	opti.ts_present = 0;
	opti.maxseg = 0;

	/*
	 * Get IP and TCP header together in first mbuf.
	 * Note: IP leaves IP header in first mbuf.
	 */
	ti = mtod(m, struct tcpiphdr *);
	if (iphlen > sizeof (struct ip))
		ip_stripoptions(m, (struct mbuf *)0);
	if (m->m_len < sizeof (struct tcpiphdr)) {
		if ((m = m_pullup(m, sizeof (struct tcpiphdr))) == 0) {
			tcpstat.tcps_rcvshort++;
			return;
		}
		ti = mtod(m, struct tcpiphdr *);
	}

	/*
	 * Checksum extended TCP header and data.
	 */
	len = ((struct ip *)ti)->ip_len;
	tlen = len - sizeof (struct ip);
	bzero(ti->ti_x1, sizeof ti->ti_x1);
	ti->ti_len = (u_int16_t)tlen;
	HTONS(ti->ti_len);
	if (in_cksum(m, len) != 0) {
		tcpstat.tcps_rcvbadsum++;
		goto drop;
	}

	/*
	 * Check that TCP offset makes sense,
	 * pull out TCP options and adjust length.		XXX
	 */
	off = ti->ti_off << 2;
	if (off < sizeof (struct tcphdr) || off > tlen) {
		tcpstat.tcps_rcvbadoff++;
		goto drop;
	}
	tlen -= off;
	ti->ti_len = tlen;
	if (off > sizeof (struct tcphdr)) {
		if (m->m_len < sizeof(struct ip) + off) {
			if ((m = m_pullup(m, sizeof (struct ip) + off)) == 0) {
				tcpstat.tcps_rcvshort++;
				return;
			}
			ti = mtod(m, struct tcpiphdr *);
		}
		optlen = off - sizeof (struct tcphdr);
		optp = mtod(m, caddr_t) + sizeof (struct tcpiphdr);
		/*
		 * Do quick retrieval of timestamp options ("options
		 * prediction?").  If timestamp is the only option and it's
		 * formatted as recommended in RFC 1323 appendix A, we
		 * quickly get the values now and not bother calling
		 * tcp_dooptions(), etc.
		 */
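		/*
		 * The appendix A layout is 12 bytes (TCPOLEN_TSTAMP_APPA):
		 * <NOP><NOP><TIMESTAMP><len 10><4-byte TSval><4-byte TSecr>.
		 * That is why the first four bytes can be matched against
		 * TCPOPT_TSTAMP_HDR as one 32-bit word, and the value and
		 * echo fields read at fixed offsets 4 and 8.
		 */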
		if ((optlen == TCPOLEN_TSTAMP_APPA ||
		     (optlen > TCPOLEN_TSTAMP_APPA &&
		      optp[TCPOLEN_TSTAMP_APPA] == TCPOPT_EOL)) &&
		    *(u_int32_t *)optp == htonl(TCPOPT_TSTAMP_HDR) &&
		    (ti->ti_flags & TH_SYN) == 0) {
			opti.ts_present = 1;
			opti.ts_val = ntohl(*(u_int32_t *)(optp + 4));
			opti.ts_ecr = ntohl(*(u_int32_t *)(optp + 8));
			optp = NULL;	/* we've parsed the options */
		}
	}
	tiflags = ti->ti_flags;

	/*
	 * Convert TCP protocol specific fields to host format.
	 */
	NTOHL(ti->ti_seq);
	NTOHL(ti->ti_ack);
	NTOHS(ti->ti_win);
	NTOHS(ti->ti_urp);

	/*
	 * Locate pcb for segment.
	 */
findpcb:
	inp = in_pcblookup_connect(&tcbtable, ti->ti_src, ti->ti_sport,
	    ti->ti_dst, ti->ti_dport);
	if (inp == 0) {
		++tcpstat.tcps_pcbhashmiss;
		inp = in_pcblookup_bind(&tcbtable, ti->ti_dst, ti->ti_dport);
		if (inp == 0) {
			++tcpstat.tcps_noport;
			goto dropwithreset;
		}
	}

	/*
	 * If the state is CLOSED (i.e., TCB does not exist) then
	 * all data in the incoming segment is discarded.
	 * If the TCB exists but is in CLOSED state, it is embryonic,
	 * but should either do a listen or a connect soon.
	 */
	tp = intotcpcb(inp);
	if (tp == 0)
		goto dropwithreset;
	if (tp->t_state == TCPS_CLOSED)
		goto drop;

	/* Unscale the window into a 32-bit value. */
	if ((tiflags & TH_SYN) == 0)
		tiwin = ti->ti_win << tp->snd_scale;
	else
		tiwin = ti->ti_win;

	so = inp->inp_socket;
	if (so->so_options & (SO_DEBUG|SO_ACCEPTCONN)) {
		if (so->so_options & SO_DEBUG) {
			ostate = tp->t_state;
			tcp_saveti = *ti;
		}
		if (so->so_options & SO_ACCEPTCONN) {
			if ((tiflags & (TH_RST|TH_ACK|TH_SYN)) != TH_SYN) {
				if (tiflags & TH_RST) {
					syn_cache_reset(ti);
				} else if ((tiflags & (TH_ACK|TH_SYN)) ==
				    (TH_ACK|TH_SYN)) {
					/*
					 * Received a SYN,ACK.  This should
					 * never happen while we are in
					 * LISTEN.  Send an RST.
					 */
					goto badsyn;
				} else if (tiflags & TH_ACK) {
					so = syn_cache_get(so, m);
					if (so == NULL) {
						/*
						 * We don't have a SYN for
						 * this ACK; send an RST.
						 */
						goto badsyn;
					} else if (so ==
					    (struct socket *)(-1)) {
						/*
						 * We were unable to create
						 * the connection.  If the
						 * 3-way handshake was
						 * completed, an RST has
						 * been sent to the peer.
						 * Since the mbuf might be
						 * in use for the reply,
						 * do not free it.
						 */
						m = NULL;
					} else {
						/*
						 * We have created a
						 * full-blown connection.
						 */
						inp = sotoinpcb(so);
						tp = intotcpcb(inp);
						tiwin <<= tp->snd_scale;
						goto after_listen;
					}
				} else {
					/*
					 * None of RST, SYN or ACK was set.
					 * This is an invalid packet for a
					 * TCB in LISTEN state.  Send an RST.
					 */
					goto badsyn;
				}
			} else {
				/*
				 * Received a SYN.
				 */
				if (in_hosteq(ti->ti_src, ti->ti_dst) &&
				    ti->ti_sport == ti->ti_dport) {
					/*
					 * LISTEN socket received a SYN
					 * from itself?  This can't possibly
					 * be valid; drop the packet.
					 */
					tcpstat.tcps_badsyn++;
					goto drop;
				}
				/*
				 * SYN looks ok; create compressed TCP
				 * state for it.
				 */
				if (so->so_qlen <= so->so_qlimit &&
				    syn_cache_add(so, m, optp, optlen, &opti))
					m = NULL;
			}
			goto drop;
		}
	}

after_listen:
#ifdef DIAGNOSTIC
	/*
	 * Should not happen now that all embryonic connections
	 * are handled with compressed state.
	 */
	if (tp->t_state == TCPS_LISTEN)
		panic("tcp_input: TCPS_LISTEN");
#endif

	/*
	 * Segment received on connection.
	 * Reset idle time and keep-alive timer.
	 */
	tp->t_idle = 0;
	if (TCPS_HAVEESTABLISHED(tp->t_state))
		TCP_TIMER_ARM(tp, TCPT_KEEP, tcp_keepidle);

	/*
	 * Process options.
	 */
	if (optp)
		tcp_dooptions(tp, optp, optlen, ti, &opti);

	/*
	 * Header prediction: check for the two common cases
	 * of a uni-directional data xfer.  If the packet has
	 * no control flags, is in-sequence, the window didn't
	 * change and we're not retransmitting, it's a
	 * candidate.  If the length is zero and the ack moved
	 * forward, we're the sender side of the xfer.  Just
	 * free the data acked & wake any higher level process
	 * that was blocked waiting for space.  If the length
	 * is non-zero and the ack didn't move, we're the
	 * receiver side.  If we're getting packets in-order
	 * (the reassembly queue is empty), add the data to
	 * the socket buffer and note that we need a delayed ack.
	 */
	if (tp->t_state == TCPS_ESTABLISHED &&
	    (tiflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK)) == TH_ACK &&
	    (!opti.ts_present || TSTMP_GEQ(opti.ts_val, tp->ts_recent)) &&
	    ti->ti_seq == tp->rcv_nxt &&
	    tiwin && tiwin == tp->snd_wnd &&
	    tp->snd_nxt == tp->snd_max) {

		/*
		 * If last ACK falls within this segment's sequence numbers,
		 * record the timestamp.
		 */
		if (opti.ts_present &&
		    SEQ_LEQ(ti->ti_seq, tp->last_ack_sent) &&
		    SEQ_LT(tp->last_ack_sent, ti->ti_seq + ti->ti_len)) {
			tp->ts_recent_age = tcp_now;
			tp->ts_recent = opti.ts_val;
		}
		if (ti->ti_len == 0) {
			if (SEQ_GT(ti->ti_ack, tp->snd_una) &&
			    SEQ_LEQ(ti->ti_ack, tp->snd_max) &&
			    tp->snd_cwnd >= tp->snd_wnd &&
			    tp->t_dupacks < tcprexmtthresh) {
				/*
				 * this is a pure ack for outstanding data.
				 */
				++tcpstat.tcps_predack;
				if (opti.ts_present)
					tcp_xmit_timer(tp,
					    tcp_now - opti.ts_ecr + 1);
				else if (tp->t_rtt &&
				    SEQ_GT(ti->ti_ack, tp->t_rtseq))
					tcp_xmit_timer(tp, tp->t_rtt);
				acked = ti->ti_ack - tp->snd_una;
				tcpstat.tcps_rcvackpack++;
				tcpstat.tcps_rcvackbyte += acked;
				sbdrop(&so->so_snd, acked);
				tp->snd_una = ti->ti_ack;
				m_freem(m);

				/*
				 * If all outstanding data are acked, stop
				 * retransmit timer, otherwise restart timer
				 * using current (possibly backed-off) value.
				 * If process is waiting for space,
				 * wakeup/selwakeup/signal.  If data
				 * are ready to send, let tcp_output
				 * decide between more output or persist.
				 */
				if (tp->snd_una == tp->snd_max)
					TCP_TIMER_DISARM(tp, TCPT_REXMT);
				else if (TCP_TIMER_ISARMED(tp,
				    TCPT_PERSIST) == 0)
					TCP_TIMER_ARM(tp, TCPT_REXMT,
					    tp->t_rxtcur);

				sowwakeup(so);
				if (so->so_snd.sb_cc)
					(void) tcp_output(tp);
				return;
			}
		} else if (ti->ti_ack == tp->snd_una &&
		    tp->segq.lh_first == NULL &&
		    ti->ti_len <= sbspace(&so->so_rcv)) {
			/*
			 * this is a pure, in-sequence data packet
			 * with nothing on the reassembly queue and
			 * we have enough buffer space to take it.
			 */
			++tcpstat.tcps_preddat;
			tp->rcv_nxt += ti->ti_len;
			tcpstat.tcps_rcvpack++;
			tcpstat.tcps_rcvbyte += ti->ti_len;
			/*
			 * Drop TCP, IP headers and TCP options then add data
			 * to socket buffer.
			 */
			m->m_data += sizeof(struct tcpiphdr)+off-sizeof(struct tcphdr);
			m->m_len -= sizeof(struct tcpiphdr)+off-sizeof(struct tcphdr);
			sbappend(&so->so_rcv, m);
			sorwakeup(so);
			TCP_SETUP_ACK(tp, ti);
			if (tp->t_flags & TF_ACKNOW)
				(void) tcp_output(tp);
			return;
		}
	}

	/*
	 * Drop TCP, IP headers and TCP options.
	 */
	hdroptlen = sizeof(struct tcpiphdr) + off - sizeof(struct tcphdr);
	m->m_data += hdroptlen;
	m->m_len -= hdroptlen;

	/*
	 * Calculate amount of space in receive window,
	 * and then do TCP input processing.
	 * Receive window is amount of space in rcv queue,
	 * but not less than advertised window.
	 */
	{ int win;

	win = sbspace(&so->so_rcv);
	if (win < 0)
		win = 0;
	tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt));
	}

	switch (tp->t_state) {

	/*
	 * If the state is SYN_SENT:
	 *	if seg contains an ACK, but not for our SYN, drop the input.
	 *	if seg contains a RST, then drop the connection.
	 *	if seg does not contain SYN, then drop it.
	 * Otherwise this is an acceptable SYN segment
	 *	initialize tp->rcv_nxt and tp->irs
	 *	if seg contains ack then advance tp->snd_una
	 *	if SYN has been acked change to ESTABLISHED else SYN_RCVD state
	 *	arrange for segment to be acked (eventually)
	 *	continue processing rest of data/controls, beginning with URG
	 */
	case TCPS_SYN_SENT:
		if ((tiflags & TH_ACK) &&
		    (SEQ_LEQ(ti->ti_ack, tp->iss) ||
		     SEQ_GT(ti->ti_ack, tp->snd_max)))
			goto dropwithreset;
		if (tiflags & TH_RST) {
			if (tiflags & TH_ACK)
				tp = tcp_drop(tp, ECONNREFUSED);
			goto drop;
		}
		if ((tiflags & TH_SYN) == 0)
			goto drop;
		if (tiflags & TH_ACK) {
			tp->snd_una = ti->ti_ack;
			if (SEQ_LT(tp->snd_nxt, tp->snd_una))
				tp->snd_nxt = tp->snd_una;
		}
		TCP_TIMER_DISARM(tp, TCPT_REXMT);
		tp->irs = ti->ti_seq;
		tcp_rcvseqinit(tp);
		tp->t_flags |= TF_ACKNOW;
		tcp_mss_from_peer(tp, opti.maxseg);

		/*
		 * Initialize the initial congestion window.  If we
		 * had to retransmit the SYN, we must initialize cwnd
		 * to 1 segment (i.e. the Loss Window).
		 */
		if (tp->t_flags & TF_SYN_REXMT)
			tp->snd_cwnd = tp->t_peermss;
		else
			tp->snd_cwnd = TCP_INITIAL_WINDOW(tcp_init_win,
			    tp->t_peermss);

		tcp_rmx_rtt(tp);
		if (tiflags & TH_ACK && SEQ_GT(tp->snd_una, tp->iss)) {
			tcpstat.tcps_connects++;
			soisconnected(so);
			tcp_established(tp);
			/* Do window scaling on this connection? */
			if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
			    (TF_RCVD_SCALE|TF_REQ_SCALE)) {
				tp->snd_scale = tp->requested_s_scale;
				tp->rcv_scale = tp->request_r_scale;
			}
			TCP_REASS_LOCK(tp);
			(void) tcp_reass(tp, (struct tcpiphdr *)0,
			    (struct mbuf *)0);
			TCP_REASS_UNLOCK(tp);
			/*
			 * if we didn't have to retransmit the SYN,
			 * use its rtt as our initial srtt & rtt var.
			 */
			if (tp->t_rtt)
				tcp_xmit_timer(tp, tp->t_rtt);
		} else
			tp->t_state = TCPS_SYN_RECEIVED;

		/*
		 * Advance ti->ti_seq to correspond to first data byte.
		 * If data, trim to stay within window,
		 * dropping FIN if necessary.
		 */
		ti->ti_seq++;
		if (ti->ti_len > tp->rcv_wnd) {
			todrop = ti->ti_len - tp->rcv_wnd;
			m_adj(m, -todrop);
			ti->ti_len = tp->rcv_wnd;
			tiflags &= ~TH_FIN;
			tcpstat.tcps_rcvpackafterwin++;
			tcpstat.tcps_rcvbyteafterwin += todrop;
		}
		tp->snd_wl1 = ti->ti_seq - 1;
		tp->rcv_up = ti->ti_seq;
		goto step6;

	/*
	 * If the state is SYN_RECEIVED:
	 *	If seg contains an ACK, but not for our SYN, drop the input
	 *	and generate an RST.  See page 36, rfc793
	 */
	case TCPS_SYN_RECEIVED:
		if ((tiflags & TH_ACK) &&
		    (SEQ_LEQ(ti->ti_ack, tp->iss) ||
		     SEQ_GT(ti->ti_ack, tp->snd_max)))
			goto dropwithreset;
		break;
	}

	/*
	 * States other than LISTEN or SYN_SENT.
	 * First check timestamp, if present.
	 * Then check that at least some bytes of segment are within
	 * receive window.  If segment begins before rcv_nxt,
	 * drop leading data (and SYN); if nothing left, just ack.
	 *
	 * RFC 1323 PAWS: If we have a timestamp reply on this segment
	 * and it's less than ts_recent, drop it.
	 */
	if (opti.ts_present && (tiflags & TH_RST) == 0 && tp->ts_recent &&
	    TSTMP_LT(opti.ts_val, tp->ts_recent)) {

		/* Check to see if ts_recent is over 24 days old.  */
		if ((int)(tcp_now - tp->ts_recent_age) > TCP_PAWS_IDLE) {
			/*
			 * Invalidate ts_recent.  If this segment updates
			 * ts_recent, the age will be reset later and ts_recent
			 * will get a valid value.  If it does not, setting
			 * ts_recent to zero will at least satisfy the
			 * requirement that zero be placed in the timestamp
			 * echo reply when ts_recent isn't valid.  The
			 * age isn't reset until we get a valid ts_recent
			 * because we don't want out-of-order segments to be
			 * dropped when ts_recent is old.
			 */
			tp->ts_recent = 0;
		} else {
			tcpstat.tcps_rcvduppack++;
			tcpstat.tcps_rcvdupbyte += ti->ti_len;
			tcpstat.tcps_pawsdrop++;
			goto dropafterack;
		}
	}

	todrop = tp->rcv_nxt - ti->ti_seq;
	if (todrop > 0) {
		if (tiflags & TH_SYN) {
			tiflags &= ~TH_SYN;
			ti->ti_seq++;
			if (ti->ti_urp > 1)
				ti->ti_urp--;
			else {
				tiflags &= ~TH_URG;
				ti->ti_urp = 0;
			}
			todrop--;
		}
		if (todrop > ti->ti_len ||
		    (todrop == ti->ti_len && (tiflags & TH_FIN) == 0)) {
			/*
			 * Any valid FIN must be to the left of the window.
			 * At this point the FIN must be a duplicate or
			 * out of sequence; drop it.
			 */
			tiflags &= ~TH_FIN;
			/*
			 * Send an ACK to resynchronize and drop any data.
			 * But keep on processing for RST or ACK.
			 */
			tp->t_flags |= TF_ACKNOW;
			todrop = ti->ti_len;
			tcpstat.tcps_rcvdupbyte += todrop;
			tcpstat.tcps_rcvduppack++;
		} else {
			tcpstat.tcps_rcvpartduppack++;
			tcpstat.tcps_rcvpartdupbyte += todrop;
		}
		m_adj(m, todrop);
		ti->ti_seq += todrop;
		ti->ti_len -= todrop;
		if (ti->ti_urp > todrop)
			ti->ti_urp -= todrop;
		else {
			tiflags &= ~TH_URG;
			ti->ti_urp = 0;
		}
	}

	/*
	 * If new data are received on a connection after the
	 * user processes are gone, then RST the other end.
	 */
	if ((so->so_state & SS_NOFDREF) &&
	    tp->t_state > TCPS_CLOSE_WAIT && ti->ti_len) {
		tp = tcp_close(tp);
		tcpstat.tcps_rcvafterclose++;
		goto dropwithreset;
	}

	/*
	 * If segment ends after window, drop trailing data
	 * (and PUSH and FIN); if nothing left, just ACK.
	 */
	todrop = (ti->ti_seq + ti->ti_len) - (tp->rcv_nxt + tp->rcv_wnd);
	if (todrop > 0) {
		tcpstat.tcps_rcvpackafterwin++;
		if (todrop >= ti->ti_len) {
			tcpstat.tcps_rcvbyteafterwin += ti->ti_len;
			/*
			 * If a new connection request is received
			 * while in TIME_WAIT, drop the old connection
			 * and start over if the sequence numbers
			 * are above the previous ones.
			 */
			if (tiflags & TH_SYN &&
			    tp->t_state == TCPS_TIME_WAIT &&
			    SEQ_GT(ti->ti_seq, tp->rcv_nxt)) {
				iss = tcp_new_iss(tp, sizeof(struct tcpcb),
				    tp->snd_nxt);
				tp = tcp_close(tp);
				/*
				 * We have already advanced the mbuf
				 * pointers past the IP+TCP headers and
				 * options.  Restore those pointers before
				 * attempting to use the TCP header again.
				 */
				m->m_data -= hdroptlen;
				m->m_len += hdroptlen;
				goto findpcb;
			}
			/*
			 * If window is closed can only take segments at
			 * window edge, and have to drop data and PUSH from
			 * incoming segments.  Continue processing, but
			 * remember to ack.  Otherwise, drop segment
			 * and ack.
			 */
			if (tp->rcv_wnd == 0 && ti->ti_seq == tp->rcv_nxt) {
				tp->t_flags |= TF_ACKNOW;
				tcpstat.tcps_rcvwinprobe++;
			} else
				goto dropafterack;
		} else
			tcpstat.tcps_rcvbyteafterwin += todrop;
		m_adj(m, -todrop);
		ti->ti_len -= todrop;
		tiflags &= ~(TH_PUSH|TH_FIN);
	}

	/*
	 * If last ACK falls within this segment's sequence numbers,
	 * and the timestamp is newer, record it.
	 */
	if (opti.ts_present && TSTMP_GEQ(opti.ts_val, tp->ts_recent) &&
	    SEQ_LEQ(ti->ti_seq, tp->last_ack_sent) &&
	    SEQ_LT(tp->last_ack_sent, ti->ti_seq + ti->ti_len +
	    ((tiflags & (TH_SYN|TH_FIN)) != 0))) {
		tp->ts_recent_age = tcp_now;
		tp->ts_recent = opti.ts_val;
	}

	/*
	 * If the RST bit is set examine the state:
	 *    SYN_RECEIVED STATE:
	 *	If passive open, return to LISTEN state.
	 *	If active open, inform user that connection was refused.
	 *    ESTABLISHED, FIN_WAIT_1, FIN_WAIT2, CLOSE_WAIT STATES:
	 *	Inform user that connection was reset, and close tcb.
	 *    CLOSING, LAST_ACK, TIME_WAIT STATES
	 *	Close the tcb.
	 */
	if (tiflags & TH_RST) switch (tp->t_state) {

	case TCPS_SYN_RECEIVED:
		so->so_error = ECONNREFUSED;
		goto close;

	case TCPS_ESTABLISHED:
	case TCPS_FIN_WAIT_1:
	case TCPS_FIN_WAIT_2:
	case TCPS_CLOSE_WAIT:
		so->so_error = ECONNRESET;
	close:
		tp->t_state = TCPS_CLOSED;
		tcpstat.tcps_drops++;
		tp = tcp_close(tp);
		goto drop;

	case TCPS_CLOSING:
	case TCPS_LAST_ACK:
	case TCPS_TIME_WAIT:
		tp = tcp_close(tp);
		goto drop;
	}

	/*
	 * If a SYN is in the window, then this is an
	 * error and we send an RST and drop the connection.
	 */
	if (tiflags & TH_SYN) {
		tp = tcp_drop(tp, ECONNRESET);
		goto dropwithreset;
	}

	/*
	 * If the ACK bit is off we drop the segment and return.
	 */
	if ((tiflags & TH_ACK) == 0) {
		if (tp->t_flags & TF_ACKNOW)
			goto dropafterack;
		else
			goto drop;
	}

	/*
	 * Ack processing.
	 */
	switch (tp->t_state) {

	/*
	 * In SYN_RECEIVED state if the ack ACKs our SYN then enter
	 * ESTABLISHED state and continue processing, otherwise
	 * send an RST.
	 */
	case TCPS_SYN_RECEIVED:
		if (SEQ_GT(tp->snd_una, ti->ti_ack) ||
		    SEQ_GT(ti->ti_ack, tp->snd_max))
			goto dropwithreset;
		tcpstat.tcps_connects++;
		soisconnected(so);
		tcp_established(tp);
		/* Do window scaling? */
		if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
		    (TF_RCVD_SCALE|TF_REQ_SCALE)) {
			tp->snd_scale = tp->requested_s_scale;
			tp->rcv_scale = tp->request_r_scale;
		}
		TCP_REASS_LOCK(tp);
		(void) tcp_reass(tp, (struct tcpiphdr *)0, (struct mbuf *)0);
		TCP_REASS_UNLOCK(tp);
		tp->snd_wl1 = ti->ti_seq - 1;
		/* fall into ... */

	/*
	 * In ESTABLISHED state: drop duplicate ACKs; ACK out of range
	 * ACKs.  If the ack is in the range
	 *	tp->snd_una < ti->ti_ack <= tp->snd_max
	 * then advance tp->snd_una to ti->ti_ack and drop
	 * data from the retransmission queue.  If this ACK reflects
	 * more up to date window information we update our window information.
	 */
	case TCPS_ESTABLISHED:
	case TCPS_FIN_WAIT_1:
	case TCPS_FIN_WAIT_2:
	case TCPS_CLOSE_WAIT:
	case TCPS_CLOSING:
	case TCPS_LAST_ACK:
	case TCPS_TIME_WAIT:

		if (SEQ_LEQ(ti->ti_ack, tp->snd_una)) {
			if (ti->ti_len == 0 && tiwin == tp->snd_wnd) {
				tcpstat.tcps_rcvdupack++;
				/*
				 * If we have outstanding data (other than
				 * a window probe), this is a completely
				 * duplicate ack (i.e., window info didn't
				 * change), the ack is the biggest we've
				 * seen and we've seen exactly our rexmt
				 * threshold of them, assume a packet
				 * has been dropped and retransmit it.
				 * Kludge snd_nxt & the congestion
				 * window so we send only this one
				 * packet.
				 *
				 * We know we're losing at the current
				 * window size so do congestion avoidance
				 * (set ssthresh to half the current window
				 * and pull our congestion window back to
				 * the new ssthresh).
				 *
				 * Dup acks mean that packets have left the
				 * network (they're now cached at the receiver)
				 * so bump cwnd by the amount in the receiver
				 * to keep a constant cwnd packets in the
				 * network.
				 */
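				/*
				 * Worked example: with tcprexmtthresh = 3,
				 * t_segsz = 1460 and snd_wnd = snd_cwnd =
				 * 16 * 1460, the third duplicate ACK sets
				 * ssthresh to 8 * 1460, retransmits one
				 * segment with cwnd = 1460, then inflates
				 * cwnd to ssthresh + 3 * 1460; each further
				 * dup ack grows cwnd by one segment so new
				 * data can be clocked out.
				 */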
				if (TCP_TIMER_ISARMED(tp, TCPT_REXMT) == 0 ||
				    ti->ti_ack != tp->snd_una)
					tp->t_dupacks = 0;
				else if (++tp->t_dupacks == tcprexmtthresh) {
					tcp_seq onxt = tp->snd_nxt;
					u_int win =
					    min(tp->snd_wnd, tp->snd_cwnd) /
					    2 / tp->t_segsz;
					if (SEQ_LT(ti->ti_ack, tp->snd_recover)) {
						/*
						 * False fast retransmit after
						 * timeout.  Do not cut window.
						 */
						tp->snd_cwnd += tp->t_segsz;
						tp->t_dupacks = 0;
						(void) tcp_output(tp);
						goto drop;
					}

					if (win < 2)
						win = 2;
					tp->snd_ssthresh = win * tp->t_segsz;
					tp->snd_recover = tp->snd_max;
					TCP_TIMER_DISARM(tp, TCPT_REXMT);
					tp->t_rtt = 0;
					tp->snd_nxt = ti->ti_ack;
					tp->snd_cwnd = tp->t_segsz;
					(void) tcp_output(tp);
					tp->snd_cwnd = tp->snd_ssthresh +
					    tp->t_segsz * tp->t_dupacks;
					if (SEQ_GT(onxt, tp->snd_nxt))
						tp->snd_nxt = onxt;
					goto drop;
				} else if (tp->t_dupacks > tcprexmtthresh) {
					tp->snd_cwnd += tp->t_segsz;
					(void) tcp_output(tp);
					goto drop;
				}
			} else
				tp->t_dupacks = 0;
			break;
		}
		/*
		 * If the congestion window was inflated to account
		 * for the other side's cached packets, retract it.
		 */
		if (!tcp_do_newreno) {
			if (tp->t_dupacks >= tcprexmtthresh &&
			    tp->snd_cwnd > tp->snd_ssthresh)
				tp->snd_cwnd = tp->snd_ssthresh;
			tp->t_dupacks = 0;
		} else if (tp->t_dupacks >= tcprexmtthresh &&
		    !tcp_newreno(tp, ti)) {
			tp->snd_cwnd = tp->snd_ssthresh;
			/*
			 * Window inflation should have left us with approx.
			 * snd_ssthresh outstanding data.  But in case we
			 * would be inclined to send a burst, better to do
			 * it via the slow start mechanism.
			 */
			if (SEQ_SUB(tp->snd_max, ti->ti_ack) < tp->snd_ssthresh)
				tp->snd_cwnd = SEQ_SUB(tp->snd_max, ti->ti_ack)
				    + tp->t_segsz;
			tp->t_dupacks = 0;
		}
		if (SEQ_GT(ti->ti_ack, tp->snd_max)) {
			tcpstat.tcps_rcvacktoomuch++;
			goto dropafterack;
		}
		acked = ti->ti_ack - tp->snd_una;
		tcpstat.tcps_rcvackpack++;
		tcpstat.tcps_rcvackbyte += acked;

		/*
		 * If we have a timestamp reply, update smoothed
		 * round trip time.  If no timestamp is present but
		 * transmit timer is running and timed sequence
		 * number was acked, update smoothed round trip time.
		 * Since we now have an rtt measurement, cancel the
		 * timer backoff (cf., Phil Karn's retransmit alg.).
		 * Recompute the initial retransmit timer.
		 */
		if (opti.ts_present)
			tcp_xmit_timer(tp, tcp_now - opti.ts_ecr + 1);
		else if (tp->t_rtt && SEQ_GT(ti->ti_ack, tp->t_rtseq))
			tcp_xmit_timer(tp, tp->t_rtt);

		/*
		 * If all outstanding data is acked, stop retransmit
		 * timer and remember to restart (more output or persist).
		 * If there is more data to be acked, restart retransmit
		 * timer, using current (possibly backed-off) value.
		 */
		if (ti->ti_ack == tp->snd_max) {
			TCP_TIMER_DISARM(tp, TCPT_REXMT);
			needoutput = 1;
		} else if (TCP_TIMER_ISARMED(tp, TCPT_PERSIST) == 0)
			TCP_TIMER_ARM(tp, TCPT_REXMT, tp->t_rxtcur);
		/*
		 * When new data is acked, open the congestion window.
		 * If the window gives us less than ssthresh packets
		 * in flight, open exponentially (segsz per packet).
		 * Otherwise open linearly: segsz per window
		 * (segsz^2 / cwnd per packet).
		 */
		{
		register u_int cw = tp->snd_cwnd;
		register u_int incr = tp->t_segsz;

		if (cw > tp->snd_ssthresh)
			incr = incr * incr / cw;
		if (!tcp_do_newreno || SEQ_GEQ(ti->ti_ack, tp->snd_recover))
			tp->snd_cwnd = min(cw + incr, TCP_MAXWIN << tp->snd_scale);
		}
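		/*
		 * Example: in congestion avoidance with t_segsz = 1460
		 * and snd_cwnd = 8 * 1460, each ACK adds
		 * 1460 * 1460 / (8 * 1460) = 182 bytes, so a full
		 * window of ACKs grows cwnd by roughly one segment per
		 * round trip; below ssthresh each ACK adds a whole
		 * segment, doubling cwnd per round trip.
		 */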
		if (acked > so->so_snd.sb_cc) {
			tp->snd_wnd -= so->so_snd.sb_cc;
			sbdrop(&so->so_snd, (int)so->so_snd.sb_cc);
			ourfinisacked = 1;
		} else {
			sbdrop(&so->so_snd, acked);
			tp->snd_wnd -= acked;
			ourfinisacked = 0;
		}
		sowwakeup(so);
		tp->snd_una = ti->ti_ack;
		if (SEQ_LT(tp->snd_nxt, tp->snd_una))
			tp->snd_nxt = tp->snd_una;

		switch (tp->t_state) {

		/*
		 * In FIN_WAIT_1 STATE in addition to the processing
		 * for the ESTABLISHED state if our FIN is now acknowledged
		 * then enter FIN_WAIT_2.
		 */
		case TCPS_FIN_WAIT_1:
			if (ourfinisacked) {
				/*
				 * If we can't receive any more
				 * data, then closing user can proceed.
				 * Starting the timer is contrary to the
				 * specification, but if we don't get a FIN
				 * we'll hang forever.
				 */
				if (so->so_state & SS_CANTRCVMORE) {
					soisdisconnected(so);
					if (tcp_maxidle > 0)
						TCP_TIMER_ARM(tp, TCPT_2MSL,
						    tcp_maxidle);
				}
				tp->t_state = TCPS_FIN_WAIT_2;
			}
			break;

		/*
		 * In CLOSING STATE in addition to the processing for
		 * the ESTABLISHED state if the ACK acknowledges our FIN
		 * then enter the TIME-WAIT state, otherwise ignore
		 * the segment.
		 */
		case TCPS_CLOSING:
			if (ourfinisacked) {
				tp->t_state = TCPS_TIME_WAIT;
				tcp_canceltimers(tp);
				TCP_TIMER_ARM(tp, TCPT_2MSL, 2 * TCPTV_MSL);
				soisdisconnected(so);
			}
			break;

		/*
		 * In LAST_ACK, we may still be waiting for data to drain
		 * and/or to be acked, as well as for the ack of our FIN.
		 * If our FIN is now acknowledged, delete the TCB,
		 * enter the closed state and return.
		 */
		case TCPS_LAST_ACK:
			if (ourfinisacked) {
				tp = tcp_close(tp);
				goto drop;
			}
			break;

		/*
		 * In TIME_WAIT state the only thing that should arrive
		 * is a retransmission of the remote FIN.  Acknowledge
		 * it and restart the finack timer.
		 */
		case TCPS_TIME_WAIT:
			TCP_TIMER_ARM(tp, TCPT_2MSL, 2 * TCPTV_MSL);
			goto dropafterack;
		}
	}

step6:
	/*
	 * Update window information.
	 * Don't look at window if no ACK: TAC's send garbage on first SYN.
	 */
	if ((tiflags & TH_ACK) && (SEQ_LT(tp->snd_wl1, ti->ti_seq) ||
	    (tp->snd_wl1 == ti->ti_seq && SEQ_LT(tp->snd_wl2, ti->ti_ack)) ||
	    (tp->snd_wl2 == ti->ti_ack && tiwin > tp->snd_wnd))) {
		/* keep track of pure window updates */
		if (ti->ti_len == 0 &&
		    tp->snd_wl2 == ti->ti_ack && tiwin > tp->snd_wnd)
			tcpstat.tcps_rcvwinupd++;
		tp->snd_wnd = tiwin;
		tp->snd_wl1 = ti->ti_seq;
		tp->snd_wl2 = ti->ti_ack;
		if (tp->snd_wnd > tp->max_sndwnd)
			tp->max_sndwnd = tp->snd_wnd;
		needoutput = 1;
	}

	/*
	 * Process segments with URG.
	 */
	if ((tiflags & TH_URG) && ti->ti_urp &&
	    TCPS_HAVERCVDFIN(tp->t_state) == 0) {
		/*
		 * This is a kludge, but if we receive and accept
		 * random urgent pointers, we'll crash in
		 * soreceive.  It's hard to imagine someone
		 * actually wanting to send this much urgent data.
		 */
		if (ti->ti_urp + so->so_rcv.sb_cc > sb_max) {
			ti->ti_urp = 0;			/* XXX */
			tiflags &= ~TH_URG;		/* XXX */
			goto dodata;			/* XXX */
		}
		/*
		 * If this segment advances the known urgent pointer,
		 * then mark the data stream.  This should not happen
		 * in CLOSE_WAIT, CLOSING, LAST_ACK or TIME_WAIT STATES since
		 * a FIN has been received from the remote side.
		 * In these states we ignore the URG.
		 *
		 * According to RFC961 (Assigned Protocols),
		 * the urgent pointer points to the last octet
		 * of urgent data.  We continue, however,
		 * to consider it to indicate the first octet
		 * of data past the urgent section as the original
		 * spec states (in one of two places).
		 */
		if (SEQ_GT(ti->ti_seq + ti->ti_urp, tp->rcv_up)) {
			tp->rcv_up = ti->ti_seq + ti->ti_urp;
			so->so_oobmark = so->so_rcv.sb_cc +
			    (tp->rcv_up - tp->rcv_nxt) - 1;
			if (so->so_oobmark == 0)
				so->so_state |= SS_RCVATMARK;
			sohasoutofband(so);
			tp->t_oobflags &= ~(TCPOOB_HAVEDATA | TCPOOB_HADDATA);
		}
		/*
		 * Remove out of band data so it doesn't get presented
		 * to the user.  This can happen independent of advancing
		 * the URG pointer, but if two URG's are pending at once,
		 * some out-of-band data may creep in... ick.
		 */
		if (ti->ti_urp <= (u_int16_t) ti->ti_len
#ifdef SO_OOBINLINE
		    && (so->so_options & SO_OOBINLINE) == 0
#endif
		    )
			tcp_pulloutofband(so, ti, m);
	} else
		/*
		 * If no out of band data is expected,
		 * pull receive urgent pointer along
		 * with the receive window.
		 */
		if (SEQ_GT(tp->rcv_nxt, tp->rcv_up))
			tp->rcv_up = tp->rcv_nxt;
dodata:							/* XXX */

	/*
	 * Process the segment text, merging it into the TCP sequencing queue,
	 * and arranging for acknowledgment of receipt if necessary.
	 * This process logically involves adjusting tp->rcv_wnd as data
	 * is presented to the user (this happens in tcp_usrreq.c,
	 * case PRU_RCVD).  If a FIN has already been received on this
	 * connection then we just ignore the text.
	 */
	if ((ti->ti_len || (tiflags & TH_FIN)) &&
	    TCPS_HAVERCVDFIN(tp->t_state) == 0) {
		TCP_REASS(tp, ti, m, so, tiflags);
		/*
		 * Note the amount of data that peer has sent into
		 * our window, in order to estimate the sender's
		 * buffer size.
		 */
		len = so->so_rcv.sb_hiwat - (tp->rcv_adv - tp->rcv_nxt);
	} else {
		m_freem(m);
		tiflags &= ~TH_FIN;
	}

	/*
	 * If FIN is received ACK the FIN and let the user know
	 * that the connection is closing.  Ignore a FIN received before
	 * the connection is fully established.
	 */
	if ((tiflags & TH_FIN) && TCPS_HAVEESTABLISHED(tp->t_state)) {
		if (TCPS_HAVERCVDFIN(tp->t_state) == 0) {
			socantrcvmore(so);
			tp->t_flags |= TF_ACKNOW;
			tp->rcv_nxt++;
		}
		switch (tp->t_state) {

		/*
		 * In ESTABLISHED STATE enter the CLOSE_WAIT state.
		 */
		case TCPS_ESTABLISHED:
			tp->t_state = TCPS_CLOSE_WAIT;
			break;

		/*
		 * If still in FIN_WAIT_1 STATE FIN has not been acked so
		 * enter the CLOSING state.
		 */
		case TCPS_FIN_WAIT_1:
			tp->t_state = TCPS_CLOSING;
			break;

		/*
		 * In FIN_WAIT_2 state enter the TIME_WAIT state,
		 * starting the time-wait timer, turning off the other
		 * standard timers.
		 */
		case TCPS_FIN_WAIT_2:
			tp->t_state = TCPS_TIME_WAIT;
			tcp_canceltimers(tp);
			TCP_TIMER_ARM(tp, TCPT_2MSL, 2 * TCPTV_MSL);
			soisdisconnected(so);
			break;

		/*
		 * In TIME_WAIT state restart the 2 MSL time_wait timer.
		 */
		case TCPS_TIME_WAIT:
			TCP_TIMER_ARM(tp, TCPT_2MSL, 2 * TCPTV_MSL);
			break;
		}
	}
	if (so->so_options & SO_DEBUG)
		tcp_trace(TA_INPUT, ostate, tp, &tcp_saveti, 0);

	/*
	 * Return any desired output.
	 */
	if (needoutput || (tp->t_flags & TF_ACKNOW))
		(void) tcp_output(tp);
	return;

badsyn:
	/*
	 * Received a bad SYN.  Increment counters and dropwithreset.
	 */
	tcpstat.tcps_badsyn++;
	tp = NULL;
	goto dropwithreset;

dropafterack:
	/*
	 * Generate an ACK dropping incoming segment if it occupies
	 * sequence space, where the ACK reflects our state.
	 */
	if (tiflags & TH_RST)
		goto drop;
	m_freem(m);
	tp->t_flags |= TF_ACKNOW;
	(void) tcp_output(tp);
	return;

dropwithreset:
	/*
	 * Generate a RST, dropping incoming segment.
	 * Make ACK acceptable to originator of segment.
	 * Don't bother to respond if destination was broadcast/multicast.
	 */
	if ((tiflags & TH_RST) || m->m_flags & (M_BCAST|M_MCAST) ||
	    IN_MULTICAST(ti->ti_dst.s_addr))
		goto drop;
	if (tiflags & TH_ACK)
		(void)tcp_respond(tp, ti, m, (tcp_seq)0, ti->ti_ack, TH_RST);
	else {
		if (tiflags & TH_SYN)
			ti->ti_len++;
		(void)tcp_respond(tp, ti, m, ti->ti_seq + ti->ti_len,
		    (tcp_seq)0, TH_RST|TH_ACK);
	}
	return;

drop:
	/*
	 * Drop space held by incoming segment and return.
	 */
	if (tp && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG))
		tcp_trace(TA_DROP, ostate, tp, &tcp_saveti, 0);
	m_freem(m);
	return;
}

void
tcp_dooptions(tp, cp, cnt, ti, oi)
	struct tcpcb *tp;
	u_char *cp;
	int cnt;
	struct tcpiphdr *ti;
	struct tcp_opt_info *oi;
{
	u_int16_t mss;
	int opt, optlen;

	for (; cnt > 0; cnt -= optlen, cp += optlen) {
		opt = cp[0];
		if (opt == TCPOPT_EOL)
			break;
		if (opt == TCPOPT_NOP)
			optlen = 1;
		else {
			optlen = cp[1];
			if (optlen <= 0)
				break;
		}
		switch (opt) {

		default:
			continue;

		case TCPOPT_MAXSEG:
			if (optlen != TCPOLEN_MAXSEG)
				continue;
			if (!(ti->ti_flags & TH_SYN))
				continue;
			bcopy(cp + 2, &mss, sizeof(mss));
			oi->maxseg = ntohs(mss);
			break;

		case TCPOPT_WINDOW:
			if (optlen != TCPOLEN_WINDOW)
				continue;
			if (!(ti->ti_flags & TH_SYN))
				continue;
			tp->t_flags |= TF_RCVD_SCALE;
			tp->requested_s_scale = cp[2];
			if (tp->requested_s_scale > TCP_MAX_WINSHIFT) {
				log(LOG_ERR, "TCP: invalid wscale %d from "
				    "0x%08x, assuming %d\n",
				    tp->requested_s_scale,
				    ntohl(ti->ti_src.s_addr),
				    TCP_MAX_WINSHIFT);
				tp->requested_s_scale = TCP_MAX_WINSHIFT;
			}
			break;

		case TCPOPT_TIMESTAMP:
			if (optlen != TCPOLEN_TIMESTAMP)
				continue;
			oi->ts_present = 1;
			bcopy(cp + 2, &oi->ts_val, sizeof(oi->ts_val));
			NTOHL(oi->ts_val);
			bcopy(cp + 6, &oi->ts_ecr, sizeof(oi->ts_ecr));
			NTOHL(oi->ts_ecr);

			/*
			 * A timestamp received in a SYN makes
			 * it ok to send timestamp requests and replies.
			 */
			if (ti->ti_flags & TH_SYN) {
				tp->t_flags |= TF_RCVD_TSTMP;
				tp->ts_recent = oi->ts_val;
				tp->ts_recent_age = tcp_now;
			}
			break;
		case TCPOPT_SACK_PERMITTED:
			if (optlen != TCPOLEN_SACK_PERMITTED)
				continue;
			if (!(ti->ti_flags & TH_SYN))
				continue;
			tp->t_flags &= ~TF_CANT_TXSACK;
			break;

		case TCPOPT_SACK:
			if (tp->t_flags & TF_IGNR_RXSACK)
				continue;
			if (optlen % 8 != 2 || optlen < 10)
				continue;
			cp += 2;
			optlen -= 2;
			/* Each 8-byte SACK block is a left and a right edge. */
			for (; optlen > 0; cp += 8, optlen -= 8) {
				tcp_seq lwe, rwe;
				bcopy((char *)cp, (char *)&lwe, sizeof(lwe));
				NTOHL(lwe);
				bcopy((char *)(cp + 4), (char *)&rwe, sizeof(rwe));
				NTOHL(rwe);
				/* tcp_mark_sacked(tp, lwe, rwe); */
			}
			break;
		}
	}
}

/*
 * Pull out of band byte out of a segment so
 * it doesn't appear in the user's data queue.
 * It is still reflected in the segment length for
 * sequencing purposes.
 */
void
tcp_pulloutofband(so, ti, m)
	struct socket *so;
	struct tcpiphdr *ti;
	register struct mbuf *m;
{
	int cnt = ti->ti_urp - 1;

	while (cnt >= 0) {
		if (m->m_len > cnt) {
			char *cp = mtod(m, caddr_t) + cnt;
			struct tcpcb *tp = sototcpcb(so);

			tp->t_iobc = *cp;
			tp->t_oobflags |= TCPOOB_HAVEDATA;
			bcopy(cp+1, cp, (unsigned)(m->m_len - cnt - 1));
			m->m_len--;
			return;
		}
		cnt -= m->m_len;
		m = m->m_next;
		if (m == 0)
			break;
	}
	panic("tcp_pulloutofband");
}

/*
 * Collect new round-trip time estimate
 * and update averages and current timeout.
 */
void
tcp_xmit_timer(tp, rtt)
	register struct tcpcb *tp;
	short rtt;
{
	register short delta;
	short rttmin;

	tcpstat.tcps_rttupdated++;
	--rtt;
	if (tp->t_srtt != 0) {
		/*
		 * srtt is stored as fixed point with 3 bits after the
		 * binary point (i.e., scaled by 8).  The following magic
		 * is equivalent to the smoothing algorithm in rfc793 with
		 * an alpha of .875 (srtt = rtt/8 + srtt*7/8 in fixed
		 * point).  Adjust rtt to origin 0.
		 */
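		/*
		 * Ignoring the fixed-point scaling, the update is
		 * srtt += (rtt - srtt) / 8.  E.g., if srtt is 16 ticks
		 * and a sample of 8 ticks arrives, the new srtt is
		 * 16 + (8 - 16) / 8 = 15 ticks: a single outlier moves
		 * the estimate only slightly.
		 */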
		delta = (rtt << 2) - (tp->t_srtt >> TCP_RTT_SHIFT);
		if ((tp->t_srtt += delta) <= 0)
			tp->t_srtt = 1 << 2;
		/*
		 * We accumulate a smoothed rtt variance (actually, a
		 * smoothed mean difference), then set the retransmit
		 * timer to smoothed rtt + 4 times the smoothed variance.
		 * rttvar is stored as fixed point with 2 bits after the
		 * binary point (scaled by 4).  The following is
		 * equivalent to rfc793 smoothing with an alpha of .75
		 * (rttvar = rttvar*3/4 + |delta| / 4).  This replaces
		 * rfc793's wired-in beta.
		 */
		if (delta < 0)
			delta = -delta;
		delta -= (tp->t_rttvar >> TCP_RTTVAR_SHIFT);
		if ((tp->t_rttvar += delta) <= 0)
			tp->t_rttvar = 1 << 2;
	} else {
		/*
		 * No rtt measurement yet - use the unsmoothed rtt.
		 * Set the variance to half the rtt (so our first
		 * retransmit happens at 3*rtt).
		 */
		tp->t_srtt = rtt << (TCP_RTT_SHIFT + 2);
		tp->t_rttvar = rtt << (TCP_RTTVAR_SHIFT + 2 - 1);
	}
	tp->t_rtt = 0;
	tp->t_rxtshift = 0;

	/*
	 * the retransmit should happen at rtt + 4 * rttvar.
	 * Because of the way we do the smoothing, srtt and rttvar
	 * will each average +1/2 tick of bias.  When we compute
	 * the retransmit timer, we want 1/2 tick of rounding and
	 * 1 extra tick because of +-1/2 tick uncertainty in the
	 * firing of the timer.  The bias will give us exactly the
	 * 1.5 tick we need.  But, because the bias is
	 * statistical, we have to test that we don't drop below
	 * the minimum feasible timer (which is 2 ticks).
	 */
	if (tp->t_rttmin > rtt + 2)
		rttmin = tp->t_rttmin;
	else
		rttmin = rtt + 2;
	TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp), rttmin, TCPTV_REXMTMAX);

	/*
	 * We received an ack for a packet that wasn't retransmitted;
	 * it is probably safe to discard any error indications we've
	 * received recently.  This isn't quite right, but close enough
	 * for now (a route might have failed after we sent a segment,
	 * and the return path might not be symmetrical).
	 */
	tp->t_softerror = 0;
}

/*
 * Checks for partial ack.  If partial ack arrives, force the retransmission
 * of the next unacknowledged segment, do not clear tp->t_dupacks, and return
 * 1.  By setting snd_nxt to ti_ack, this forces retransmission timer to
 * be started again.  If the ack advances at least to tp->snd_recover, return 0.
 */
int
tcp_newreno(tp, ti)
	struct tcpcb *tp;
	struct tcpiphdr *ti;
{
	if (SEQ_LT(ti->ti_ack, tp->snd_recover)) {
		tcp_seq onxt = tp->snd_nxt;
		tcp_seq ouna = tp->snd_una;	/* Haven't updated snd_una yet */
		u_long ocwnd = tp->snd_cwnd;
		TCP_TIMER_DISARM(tp, TCPT_REXMT);
		tp->t_rtt = 0;
		tp->snd_nxt = ti->ti_ack;
		tp->snd_cwnd = tp->t_segsz;
		tp->snd_una = ti->ti_ack;
		(void) tcp_output(tp);
		tp->snd_cwnd = ocwnd;
		tp->snd_una = ouna;
		if (SEQ_GT(onxt, tp->snd_nxt))
			tp->snd_nxt = onxt;
		/*
		 * Partial window deflation.  Relies on fact that tp->snd_una
		 * not updated yet.
		 */
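		/*
		 * E.g., if this partial ack newly acknowledges three
		 * segments (3 * t_segsz bytes), cwnd is deflated by
		 * 2 * t_segsz: the amount acked less one segment,
		 * which leaves room to clock out the retransmission
		 * just sent.
		 */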
		tp->snd_cwnd -= (ti->ti_ack - tp->snd_una - tp->t_segsz);
		return 1;
	}
	return 0;
}


/*
 * TCP compressed state engine.  Currently used to hold compressed
 * state for SYN_RECEIVED.
 */

u_long syn_cache_count;
u_int32_t syn_hash1, syn_hash2;

#define SYN_HASH(sa, sp, dp) \
	((((sa)->s_addr ^ syn_hash1) * (((((u_int32_t)(dp)) << 16) + \
	    ((u_int32_t)(sp))) ^ syn_hash2)))
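/*
 * The hash mixes the peer's address and the port pair with two random
 * secrets, so the bucket a given SYN lands in is hard to predict from
 * the outside; the secrets are reseeded in syn_cache_insert() whenever
 * the cache drains to empty.
 */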

#define SYN_CACHE_RM(sc) \
do { \
	LIST_REMOVE((sc), sc_bucketq); \
	tcp_syn_cache[(sc)->sc_bucketidx].sch_length--; \
	TAILQ_REMOVE(&tcp_syn_cache_timeq[(sc)->sc_rxtshift], (sc), sc_timeq); \
	syn_cache_count--; \
} while (0)

#define SYN_CACHE_PUT(sc) \
do { \
	if ((sc)->sc_ipopts) \
		(void) m_free((sc)->sc_ipopts); \
	if ((sc)->sc_route.ro_rt != NULL) \
		RTFREE((sc)->sc_route.ro_rt); \
	pool_put(&syn_cache_pool, (sc)); \
} while (0)

struct pool syn_cache_pool;

/*
 * We don't estimate RTT with SYNs, so each packet starts with the default
 * RTT and each timer queue has a fixed timeout value.  This allows us to
 * optimize the timer queues somewhat.
 */
#define SYN_CACHE_TIMER_ARM(sc) \
do { \
	TCPT_RANGESET((sc)->sc_rxtcur, \
	    TCPTV_SRTTDFLT * tcp_backoff[(sc)->sc_rxtshift], TCPTV_MIN, \
	    TCPTV_REXMTMAX); \
	PRT_SLOW_ARM((sc)->sc_rexmt, (sc)->sc_rxtcur); \
} while (0)
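/*
 * Since every entry at a given sc_rxtshift shares the same timeout,
 * each tcp_syn_cache_timeq[] list stays ordered by expiry simply by
 * appending new entries at the tail (see syn_cache_insert() below),
 * and a timer walk can stop at the first unexpired entry in each of
 * the TCP_MAXRXTSHIFT + 1 lists.
 */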
1880
1881 TAILQ_HEAD(, syn_cache) tcp_syn_cache_timeq[TCP_MAXRXTSHIFT + 1];
1882
1883 void
1884 syn_cache_init()
1885 {
1886 int i;
1887
1888 /* Initialize the hash buckets. */
1889 for (i = 0; i < tcp_syn_cache_size; i++)
1890 LIST_INIT(&tcp_syn_cache[i].sch_bucket);
1891
1892 /* Initialize the timer queues. */
1893 for (i = 0; i <= TCP_MAXRXTSHIFT; i++)
1894 TAILQ_INIT(&tcp_syn_cache_timeq[i]);
1895
1896 /* Initialize the syn cache pool. */
1897 pool_init(&syn_cache_pool, sizeof(struct syn_cache), 0, 0, 0,
1898 "synpl", 0, NULL, NULL, M_PCB);
1899 }
1900
1901 void
1902 syn_cache_insert(sc)
1903 struct syn_cache *sc;
1904 {
1905 struct syn_cache_head *scp;
1906 struct syn_cache *sc2;
1907 int s, i;
1908
1909 /*
1910 * If there are no entries in the hash table, reinitialize
1911 * the hash secrets.
1912 */
1913 if (syn_cache_count == 0) {
1914 struct timeval tv;
1915 microtime(&tv);
		syn_hash1 = random() ^ (u_long)&sc;
1917 syn_hash2 = random() ^ tv.tv_usec;
1918 }
1919
1920 sc->sc_hash = SYN_HASH(&sc->sc_src, sc->sc_sport, sc->sc_dport);
1921 sc->sc_bucketidx = sc->sc_hash % tcp_syn_cache_size;
1922 scp = &tcp_syn_cache[sc->sc_bucketidx];
1923
1924 /*
1925 * Make sure that we don't overflow the per-bucket
1926 * limit or the total cache size limit.
1927 */
1928 s = splsoftnet();
1929 if (scp->sch_length >= tcp_syn_bucket_limit) {
1930 tcpstat.tcps_sc_bucketoverflow++;
1931 /*
1932 * The bucket is full. Toss the oldest element in the
1933 * bucket. This will be the entry with our bucket
1934 * index closest to the front of the timer queue with
1935 * the largest timeout value.
1936 *
1937 * Note: This timer queue traversal may be expensive, so
1938 * we hope that this doesn't happen very often. It is
1939 * much more likely that we'll overflow the entire
1940 * cache, which is much easier to handle; see below.
1941 */
1942 for (i = TCP_MAXRXTSHIFT; i >= 0; i--) {
1943 for (sc2 = TAILQ_FIRST(&tcp_syn_cache_timeq[i]);
1944 sc2 != NULL;
1945 sc2 = TAILQ_NEXT(sc2, sc_timeq)) {
1946 if (sc2->sc_bucketidx == sc->sc_bucketidx) {
1947 SYN_CACHE_RM(sc2);
1948 SYN_CACHE_PUT(sc2);
1949 goto insert; /* 2 level break */
1950 }
1951 }
1952 }
1953 #ifdef DIAGNOSTIC
1954 /*
1955 * This should never happen; we should always find an
1956 * entry in our bucket.
1957 */
1958 panic("syn_cache_insert: bucketoverflow: impossible");
1959 #endif
1960 } else if (syn_cache_count >= tcp_syn_cache_limit) {
1961 tcpstat.tcps_sc_overflowed++;
1962 /*
1963 * The cache is full. Toss the oldest entry in the
1964 * entire cache. This is the front entry in the
1965 * first non-empty timer queue with the largest
1966 * timeout value.
1967 */
1968 for (i = TCP_MAXRXTSHIFT; i >= 0; i--) {
1969 sc2 = TAILQ_FIRST(&tcp_syn_cache_timeq[i]);
1970 if (sc2 == NULL)
1971 continue;
1972 SYN_CACHE_RM(sc2);
1973 SYN_CACHE_PUT(sc2);
1974 goto insert; /* symmetry with above */
1975 }
1976 #ifdef DIAGNOSTIC
1977 /*
1978 * This should never happen; we should always find an
1979 * entry in the cache.
1980 */
1981 panic("syn_cache_insert: cache overflow: impossible");
1982 #endif
1983 }
1984
1985 insert:
1986 /*
1987 * Initialize the entry's timer.
1988 */
1989 sc->sc_rxttot = 0;
1990 sc->sc_rxtshift = 0;
1991 SYN_CACHE_TIMER_ARM(sc);
1992 TAILQ_INSERT_TAIL(&tcp_syn_cache_timeq[sc->sc_rxtshift], sc, sc_timeq);
1993
1994 /* Put it into the bucket. */
1995 LIST_INSERT_HEAD(&scp->sch_bucket, sc, sc_bucketq);
1996 scp->sch_length++;
1997 syn_cache_count++;
1998
1999 tcpstat.tcps_sc_added++;
2000 splx(s);
2001 }
2002
2003 /*
2004 * Walk the timer queues, looking for SYN,ACKs that need to be retransmitted.
2005 * If we have retransmitted an entry the maximum number of times, expire
2006 * that entry.
2007 */
2008 void
2009 syn_cache_timer()
2010 {
2011 struct syn_cache *sc, *nsc;
2012 int i, s;
2013
2014 s = splsoftnet();
2015
2016 /*
2017 * First, get all the entries that need to be retransmitted, or
2018 * must be expired due to exceeding the initial keepalive time.
2019 */
2020 for (i = 0; i < TCP_MAXRXTSHIFT; i++) {
2021 for (sc = TAILQ_FIRST(&tcp_syn_cache_timeq[i]);
2022 sc != NULL && PRT_SLOW_ISEXPIRED(sc->sc_rexmt);
2023 sc = nsc) {
2024 nsc = TAILQ_NEXT(sc, sc_timeq);
2025
2026 /*
2027 * Compute the total amount of time this entry has
2028 * been on a queue. If this entry has been on longer
			 * than the keepalive timer would allow, expire it.
2030 */
2031 sc->sc_rxttot += sc->sc_rxtcur;
2032 if (sc->sc_rxttot >= TCPTV_KEEP_INIT) {
2033 tcpstat.tcps_sc_timed_out++;
2034 SYN_CACHE_RM(sc);
2035 SYN_CACHE_PUT(sc);
2036 continue;
2037 }
2038
2039 tcpstat.tcps_sc_retransmitted++;
2040 (void) syn_cache_respond(sc, NULL);
2041
2042 /* Advance this entry onto the next timer queue. */
2043 TAILQ_REMOVE(&tcp_syn_cache_timeq[i], sc, sc_timeq);
2044 sc->sc_rxtshift = i + 1;
2045 SYN_CACHE_TIMER_ARM(sc);
2046 TAILQ_INSERT_TAIL(&tcp_syn_cache_timeq[sc->sc_rxtshift],
2047 sc, sc_timeq);
2048 }
2049 }
2050
2051 /*
2052 * Now get all the entries that are expired due to too many
2053 * retransmissions.
2054 */
	for (sc = TAILQ_FIRST(&tcp_syn_cache_timeq[TCP_MAXRXTSHIFT]);
	     sc != NULL && PRT_SLOW_ISEXPIRED(sc->sc_rexmt);
	     sc = nsc) {
		nsc = TAILQ_NEXT(sc, sc_timeq);	/* sample before removing sc */

		tcpstat.tcps_sc_timed_out++;
		SYN_CACHE_RM(sc);
		SYN_CACHE_PUT(sc);
	}
2062 splx(s);
2063 }
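
/*
 * Illustrative idiom (not compiled; "queue" stands in for one of the
 * timer queues): removing entries while walking a TAILQ requires
 * sampling the next pointer before the current entry is unlinked,
 * which is why both loops above advance via nsc.
 */
#if 0
	struct syn_cache *sc, *nsc;

	for (sc = TAILQ_FIRST(&queue); sc != NULL; sc = nsc) {
		nsc = TAILQ_NEXT(sc, sc_timeq);	/* sample before unlink */
		TAILQ_REMOVE(&queue, sc, sc_timeq);
	}
#endif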
2064
2065 /*
2066 * Find an entry in the syn cache.
2067 */
2068 struct syn_cache *
2069 syn_cache_lookup(ti, headp)
2070 struct tcpiphdr *ti;
2071 struct syn_cache_head **headp;
2072 {
2073 struct syn_cache *sc;
2074 struct syn_cache_head *scp;
2075 u_int32_t hash;
2076 int s;
2077
2078 hash = SYN_HASH(&ti->ti_src, ti->ti_sport, ti->ti_dport);
2079
2080 scp = &tcp_syn_cache[hash % tcp_syn_cache_size];
2081 *headp = scp;
2082 s = splsoftnet();
2083 for (sc = LIST_FIRST(&scp->sch_bucket); sc != NULL;
2084 sc = LIST_NEXT(sc, sc_bucketq)) {
2085 if (sc->sc_hash != hash)
2086 continue;
2087 if (sc->sc_src.s_addr == ti->ti_src.s_addr &&
2088 sc->sc_sport == ti->ti_sport &&
2089 sc->sc_dport == ti->ti_dport &&
2090 sc->sc_dst.s_addr == ti->ti_dst.s_addr) {
2091 splx(s);
2092 return (sc);
2093 }
2094 }
2095 splx(s);
2096 return (NULL);
2097 }
2098
2099 /*
2100 * This function gets called when we receive an ACK for a
2101 * socket in the LISTEN state. We look up the connection
 * in the syn cache, and if it's there, we pull it out of
2103 * the cache and turn it into a full-blown connection in
2104 * the SYN-RECEIVED state.
2105 *
2106 * The return values may not be immediately obvious, and their effects
2107 * can be subtle, so here they are:
2108 *
2109 * NULL SYN was not found in cache; caller should drop the
2110 * packet and send an RST.
2111 *
2112 * -1 We were unable to create the new connection, and are
2113 * aborting it. An ACK,RST is being sent to the peer
 *	(unless we got screwy sequence numbers; see below),
2115 * because the 3-way handshake has been completed. Caller
2116 * should not free the mbuf, since we may be using it. If
2117 * we are not, we will free it.
2118 *
2119 * Otherwise, the return value is a pointer to the new socket
2120 * associated with the connection.
2121 */
2122 struct socket *
2123 syn_cache_get(so, m)
2124 struct socket *so;
2125 struct mbuf *m;
2126 {
2127 struct syn_cache *sc;
2128 struct syn_cache_head *scp;
2129 register struct inpcb *inp;
2130 register struct tcpcb *tp = 0;
2131 register struct tcpiphdr *ti;
2132 struct sockaddr_in *sin;
2133 struct mbuf *am;
2134 int s;
2135
2136 ti = mtod(m, struct tcpiphdr *);
2137 s = splsoftnet();
2138 if ((sc = syn_cache_lookup(ti, &scp)) == NULL) {
2139 splx(s);
2140 return (NULL);
2141 }
2142
2143 /*
	 * Verify the sequence and ack numbers.  If they don't match,
	 * retransmit our SYN,ACK to elicit the correct response.
2146 */
2147 if ((ti->ti_ack != sc->sc_iss + 1) ||
2148 SEQ_LEQ(ti->ti_seq, sc->sc_irs) ||
2149 SEQ_GT(ti->ti_seq, sc->sc_irs + 1 + sc->sc_win)) {
2150 (void) syn_cache_respond(sc, m);
2151 splx(s);
2152 return ((struct socket *)(-1));
2153 }
2154
2155 /* Remove this cache entry */
2156 SYN_CACHE_RM(sc);
2157 splx(s);
2158
2159 /*
2160 * Ok, create the full blown connection, and set things up
2161 * as they would have been set up if we had created the
2162 * connection when the SYN arrived. If we can't create
2163 * the connection, abort it.
2164 */
2165 so = sonewconn(so, SS_ISCONNECTED);
2166 if (so == NULL)
2167 goto resetandabort;
2168
2169 inp = sotoinpcb(so);
2170 inp->inp_laddr = sc->sc_dst;
2171 inp->inp_lport = sc->sc_dport;
2172 in_pcbstate(inp, INP_BOUND);
2173 inp->inp_options = ip_srcroute();
2174 if (inp->inp_options == NULL) {
2175 inp->inp_options = sc->sc_ipopts;
2176 sc->sc_ipopts = NULL;
2177 }
2178
2179 /*
2180 * Give the new socket our cached route reference.
2181 */
2182 inp->inp_route = sc->sc_route; /* struct assignment */
2183 sc->sc_route.ro_rt = NULL;
2184
2185 am = m_get(M_DONTWAIT, MT_SONAME); /* XXX */
2186 if (am == NULL)
2187 goto resetandabort;
2188 am->m_len = sizeof(struct sockaddr_in);
2189 sin = mtod(am, struct sockaddr_in *);
2190 sin->sin_family = AF_INET;
2191 sin->sin_len = sizeof(*sin);
2192 sin->sin_addr = sc->sc_src;
2193 sin->sin_port = sc->sc_sport;
2194 bzero((caddr_t)sin->sin_zero, sizeof(sin->sin_zero));
2195 if (in_pcbconnect(inp, am)) {
2196 (void) m_free(am);
2197 goto resetandabort;
2198 }
2199 (void) m_free(am);
2200
2201 tp = intotcpcb(inp);
2202 if (sc->sc_request_r_scale != 15) {
2203 tp->requested_s_scale = sc->sc_requested_s_scale;
2204 tp->request_r_scale = sc->sc_request_r_scale;
2205 tp->snd_scale = sc->sc_requested_s_scale;
2206 tp->rcv_scale = sc->sc_request_r_scale;
2207 tp->t_flags |= TF_RCVD_SCALE;
2208 }
2209 if (sc->sc_flags & SCF_TIMESTAMP)
2210 tp->t_flags |= TF_RCVD_TSTMP;
2211
2212 tp->t_template = tcp_template(tp);
2213 if (tp->t_template == 0) {
2214 tp = tcp_drop(tp, ENOBUFS); /* destroys socket */
2215 so = NULL;
2216 m_freem(m);
2217 goto abort;
2218 }
2219
2220 tp->iss = sc->sc_iss;
2221 tp->irs = sc->sc_irs;
2222 tcp_sendseqinit(tp);
2223 tcp_rcvseqinit(tp);
2224 tp->t_state = TCPS_SYN_RECEIVED;
2225 TCP_TIMER_ARM(tp, TCPT_KEEP, TCPTV_KEEP_INIT);
2226 tcpstat.tcps_accepts++;
2227
2228 /* Initialize tp->t_ourmss before we deal with the peer's! */
2229 tp->t_ourmss = sc->sc_ourmaxseg;
2230 tcp_mss_from_peer(tp, sc->sc_peermaxseg);
2231
2232 /*
2233 * Initialize the initial congestion window. If we
2234 * had to retransmit the SYN,ACK, we must initialize cwnd
2235 * to 1 segment (i.e. the Loss Window).
2236 */
2237 if (sc->sc_rxtshift)
2238 tp->snd_cwnd = tp->t_peermss;
2239 else
2240 tp->snd_cwnd = TCP_INITIAL_WINDOW(tcp_init_win, tp->t_peermss);
2241
2242 tcp_rmx_rtt(tp);
2243 tp->snd_wl1 = sc->sc_irs;
2244 tp->rcv_up = sc->sc_irs + 1;
2245
2246 /*
	 * This is what would have happened in tcp_output() when
2248 * the SYN,ACK was sent.
2249 */
2250 tp->snd_up = tp->snd_una;
2251 tp->snd_max = tp->snd_nxt = tp->iss+1;
2252 TCP_TIMER_ARM(tp, TCPT_REXMT, tp->t_rxtcur);
2253 if (sc->sc_win > 0 && SEQ_GT(tp->rcv_nxt + sc->sc_win, tp->rcv_adv))
2254 tp->rcv_adv = tp->rcv_nxt + sc->sc_win;
2255 tp->last_ack_sent = tp->rcv_nxt;
2256
2257 tcpstat.tcps_sc_completed++;
2258 SYN_CACHE_PUT(sc);
2259 return (so);
2260
2261 resetandabort:
2262 (void) tcp_respond(NULL, ti, m, ti->ti_seq+ti->ti_len,
2263 (tcp_seq)0, TH_RST|TH_ACK);
2264 abort:
2265 if (so != NULL)
2266 (void) soabort(so);
2267 SYN_CACHE_PUT(sc);
2268 tcpstat.tcps_sc_aborted++;
2269 return ((struct socket *)(-1));
2270 }
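
/*
 * Hypothetical caller sketch (so2 is illustrative, not part of this
 * file) showing the three-way return protocol documented above:
 */
#if 0
	struct socket *so2;

	so2 = syn_cache_get(so, m);
	if (so2 == NULL) {
		/* Not in the cache: drop the segment and send an RST. */
	} else if (so2 == (struct socket *)(-1)) {
		/* Connection aborted; the mbuf is no longer ours. */
	} else {
		/* so2 is the socket for the new connection. */
	}
#endif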
2271
2272 /*
2273 * This function is called when we get a RST for a
 * non-existent connection, so that we can see if the
2275 * connection is in the syn cache. If it is, zap it.
2276 */
2277
2278 void
2279 syn_cache_reset(ti)
2280 register struct tcpiphdr *ti;
2281 {
2282 struct syn_cache *sc;
2283 struct syn_cache_head *scp;
2284 int s = splsoftnet();
2285
2286 if ((sc = syn_cache_lookup(ti, &scp)) == NULL) {
2287 splx(s);
2288 return;
2289 }
	if (SEQ_LT(ti->ti_seq, sc->sc_irs) ||
2291 SEQ_GT(ti->ti_seq, sc->sc_irs+1)) {
2292 splx(s);
2293 return;
2294 }
2295 SYN_CACHE_RM(sc);
2296 splx(s);
2297 tcpstat.tcps_sc_reset++;
2298 SYN_CACHE_PUT(sc);
2299 }
2300
2301 void
2302 syn_cache_unreach(ip, th)
2303 struct ip *ip;
2304 struct tcphdr *th;
2305 {
2306 struct syn_cache *sc;
2307 struct syn_cache_head *scp;
2308 struct tcpiphdr ti2;
2309 int s;
2310
2311 ti2.ti_src.s_addr = ip->ip_dst.s_addr;
2312 ti2.ti_dst.s_addr = ip->ip_src.s_addr;
2313 ti2.ti_sport = th->th_dport;
2314 ti2.ti_dport = th->th_sport;
2315
2316 s = splsoftnet();
2317 if ((sc = syn_cache_lookup(&ti2, &scp)) == NULL) {
2318 splx(s);
2319 return;
2320 }
2321 /* If the sequence number != sc_iss, then it's a bogus ICMP msg */
	if (ntohl(th->th_seq) != sc->sc_iss) {
2323 splx(s);
2324 return;
2325 }
2326
2327 /*
	 * If we've retransmitted 3 times and this is our second error,
2329 * we remove the entry. Otherwise, we allow it to continue on.
2330 * This prevents us from incorrectly nuking an entry during a
2331 * spurious network outage.
2332 *
2333 * See tcp_notify().
2334 */
2335 if ((sc->sc_flags & SCF_UNREACH) == 0 || sc->sc_rxtshift < 3) {
2336 sc->sc_flags |= SCF_UNREACH;
2337 splx(s);
2338 return;
2339 }
2340
2341 SYN_CACHE_RM(sc);
2342 splx(s);
2343 tcpstat.tcps_sc_unreach++;
2344 SYN_CACHE_PUT(sc);
2345 }
2346
2347 /*
2348 * Given a LISTEN socket and an inbound SYN request, add
2349 * this to the syn cache, and send back a segment:
2350 * <SEQ=ISS><ACK=RCV_NXT><CTL=SYN,ACK>
2351 * to the source.
2352 *
2353 * IMPORTANT NOTE: We do _NOT_ ACK data that might accompany the SYN.
2354 * Doing so would require that we hold onto the data and deliver it
2355 * to the application. However, if we are the target of a SYN-flood
2356 * DoS attack, an attacker could send data which would eventually
2357 * consume all available buffer space if it were ACKed. By not ACKing
2358 * the data, we avoid this DoS scenario.
2359 */
2360
2361 int
2362 syn_cache_add(so, m, optp, optlen, oi)
2363 struct socket *so;
2364 struct mbuf *m;
2365 u_char *optp;
2366 int optlen;
2367 struct tcp_opt_info *oi;
2368 {
2369 register struct tcpiphdr *ti;
2370 struct tcpcb tb, *tp;
2371 long win;
2372 struct syn_cache *sc;
2373 struct syn_cache_head *scp;
2374 struct mbuf *ipopts;
2375
2376 tp = sototcpcb(so);
2377 ti = mtod(m, struct tcpiphdr *);
2378
2379 /*
	 * RFC1122 4.2.3.10, p. 104: discard bcast/mcast SYN.
	 * Note: in_broadcast() should never return true on a received
	 * packet with M_BCAST not set.
2383 */
2384 if (m->m_flags & (M_BCAST|M_MCAST) ||
2385 IN_MULTICAST(ti->ti_src.s_addr) ||
2386 IN_MULTICAST(ti->ti_dst.s_addr))
2387 return (0);
2388
2389 /*
2390 * Initialize some local state.
2391 */
2392 win = sbspace(&so->so_rcv);
2393 if (win > TCP_MAXWIN)
2394 win = TCP_MAXWIN;
2395
2396 /*
2397 * Remember the IP options, if any.
2398 */
2399 ipopts = ip_srcroute();
2400
2401 if (optp) {
2402 tb.t_flags = tcp_do_rfc1323 ? (TF_REQ_SCALE|TF_REQ_TSTMP) : 0;
2403 tcp_dooptions(&tb, optp, optlen, ti, oi);
2404 } else
2405 tb.t_flags = 0;
2406
2407 /*
2408 * See if we already have an entry for this connection.
2409 * If we do, resend the SYN,ACK. We do not count this
2410 * as a retransmission (XXX though maybe we should).
2411 */
2412 if ((sc = syn_cache_lookup(ti, &scp)) != NULL) {
2413 tcpstat.tcps_sc_dupesyn++;
2414 if (ipopts) {
2415 /*
2416 * If we were remembering a previous source route,
2417 * forget it and use the new one we've been given.
2418 */
2419 if (sc->sc_ipopts)
2420 (void) m_free(sc->sc_ipopts);
2421 sc->sc_ipopts = ipopts;
2422 }
2423 sc->sc_timestamp = tb.ts_recent;
2424 if (syn_cache_respond(sc, m) == 0) {
2425 tcpstat.tcps_sndacks++;
2426 tcpstat.tcps_sndtotal++;
2427 }
2428 return (1);
2429 }
2430
2431 sc = pool_get(&syn_cache_pool, PR_NOWAIT);
2432 if (sc == NULL) {
2433 if (ipopts)
2434 (void) m_free(ipopts);
2435 return (0);
2436 }
2437
2438 /*
2439 * Fill in the cache, and put the necessary IP and TCP
2440 * options into the reply.
2441 */
2442 memset(&sc->sc_route, 0, sizeof(sc->sc_route));
2443 sc->sc_src.s_addr = ti->ti_src.s_addr;
2444 sc->sc_dst.s_addr = ti->ti_dst.s_addr;
2445 sc->sc_sport = ti->ti_sport;
2446 sc->sc_dport = ti->ti_dport;
2447 sc->sc_flags = 0;
2448 sc->sc_ipopts = ipopts;
2449 sc->sc_irs = ti->ti_seq;
2450 sc->sc_iss = tcp_new_iss(sc, sizeof(struct syn_cache), 0);
2451 sc->sc_peermaxseg = oi->maxseg;
2452 sc->sc_ourmaxseg = tcp_mss_to_advertise(m->m_flags & M_PKTHDR ?
2453 m->m_pkthdr.rcvif : NULL);
2454 sc->sc_win = win;
2455 sc->sc_timestamp = tb.ts_recent;
2456 if (tcp_do_rfc1323 && (tb.t_flags & TF_RCVD_TSTMP))
2457 sc->sc_flags |= SCF_TIMESTAMP;
2458 if ((tb.t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
2459 (TF_RCVD_SCALE|TF_REQ_SCALE)) {
2460 sc->sc_requested_s_scale = tb.requested_s_scale;
2461 sc->sc_request_r_scale = 0;
2462 while (sc->sc_request_r_scale < TCP_MAX_WINSHIFT &&
2463 TCP_MAXWIN << sc->sc_request_r_scale <
2464 so->so_rcv.sb_hiwat)
2465 sc->sc_request_r_scale++;
2466 } else {
2467 sc->sc_requested_s_scale = 15;
2468 sc->sc_request_r_scale = 15;
2469 }
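	/*
	 * Worked example for the loop above (illustrative numbers): with
	 * so_rcv.sb_hiwat of 200 kB (204800 bytes), the scale stops at 2,
	 * since 65535 << 1 = 131070 is still smaller than the buffer but
	 * 65535 << 2 = 262140 is not.  A value of 15 in both scale fields
	 * means no window scaling was negotiated.
	 */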
2470 if (syn_cache_respond(sc, m) == 0) {
2471 syn_cache_insert(sc);
2472 tcpstat.tcps_sndacks++;
2473 tcpstat.tcps_sndtotal++;
2474 } else {
2475 SYN_CACHE_PUT(sc);
2476 tcpstat.tcps_sc_dropped++;
2477 }
2478 return (1);
2479 }
2480
2481 int
2482 syn_cache_respond(sc, m)
2483 struct syn_cache *sc;
2484 struct mbuf *m;
2485 {
2486 struct route *ro = &sc->sc_route;
2487 struct rtentry *rt;
2488 struct sockaddr_in *dst;
2489 struct tcpiphdr *ti;
2490 u_int8_t *optp;
2491 int optlen, error;
2492 u_int16_t tlen;
2493
2494 /* Compute the size of the TCP options. */
2495 optlen = 4 + (sc->sc_request_r_scale != 15 ? 4 : 0) +
2496 ((sc->sc_flags & SCF_TIMESTAMP) ? TCPOLEN_TSTAMP_APPA : 0);
2497
2498 tlen = sizeof(struct tcpiphdr) + optlen;
2499
2500 /*
2501 * Create the IP+TCP header from scratch. Reuse the received mbuf
2502 * if possible.
2503 */
2504 if (m != NULL) {
2505 m_freem(m->m_next);
2506 m->m_next = NULL;
2507 MRESETDATA(m);
2508 } else {
2509 MGETHDR(m, M_DONTWAIT, MT_DATA);
2510 if (m == NULL)
2511 return (ENOBUFS);
2512 }
2513
2514 /* Fixup the mbuf. */
2515 m->m_data += max_linkhdr;
2516 m->m_len = m->m_pkthdr.len = tlen;
2517 m->m_pkthdr.rcvif = NULL;
2518
2519 ti = mtod(m, struct tcpiphdr *);
2520 memset(ti, 0, tlen);
2521
2522 ti->ti_dst = sc->sc_src;
2523 ti->ti_src = sc->sc_dst;
2524 ti->ti_sport = sc->sc_dport;
2525 ti->ti_dport = sc->sc_sport;
2526 ti->ti_pr = IPPROTO_TCP;
2527 ti->ti_len = htons(tlen - sizeof(struct ip));
2528 /* ti_x1 already 0'd */
2529 ti->ti_seq = htonl(sc->sc_iss);
2530 ti->ti_ack = htonl(sc->sc_irs + 1);
2531 /* ti_x2 already 0 */
2532 ti->ti_off = (sizeof(struct tcphdr) + optlen) >> 2;
2533 ti->ti_flags = TH_SYN|TH_ACK;
2534 ti->ti_win = htons(sc->sc_win);
2535 /* ti_sum already 0 */
2536 /* ti_urp already 0 */
2537
2538 /* Tack on the TCP options. */
2539 optp = (u_int8_t *)(ti + 1);
2540 *optp++ = TCPOPT_MAXSEG;
2541 *optp++ = 4;
2542 *optp++ = (sc->sc_ourmaxseg >> 8) & 0xff;
2543 *optp++ = sc->sc_ourmaxseg & 0xff;
2544
2545 if (sc->sc_request_r_scale != 15) {
2546 *((u_int32_t *)optp) = htonl(TCPOPT_NOP << 24 |
2547 TCPOPT_WINDOW << 16 | TCPOLEN_WINDOW << 8 |
2548 sc->sc_request_r_scale);
2549 optp += 4;
2550 }
2551
2552 if (sc->sc_flags & SCF_TIMESTAMP) {
2553 u_int32_t *lp = (u_int32_t *)(optp);
2554 /* Form timestamp option as shown in appendix A of RFC 1323. */
2555 *lp++ = htonl(TCPOPT_TSTAMP_HDR);
2556 *lp++ = htonl(tcp_now);
2557 *lp = htonl(sc->sc_timestamp);
2558 optp += TCPOLEN_TSTAMP_APPA;
2559 }
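
	/*
	 * Resulting option block (illustrative layout) when both window
	 * scaling and timestamps are in use -- 4 + 4 + 12 bytes, matching
	 * the optlen computation at the top of this function:
	 *
	 *	MSS:	<TCPOPT_MAXSEG, 4, mss_hi, mss_lo>
	 *	Wscale:	<TCPOPT_NOP, TCPOPT_WINDOW, TCPOLEN_WINDOW, shift>
	 *	Tstamp:	<NOP, NOP, TIMESTAMP, 10, TSval[4], TSecr[4]>
	 */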
2560
2561 /* Compute the packet's checksum. */
2562 ti->ti_sum = in_cksum(m, tlen);
2563
2564 /*
2565 * Fill in some straggling IP bits. Note the stack expects
2566 * ip_len to be in host order, for convenience.
2567 */
2568 ((struct ip *)ti)->ip_len = tlen;
2569 ((struct ip *)ti)->ip_ttl = ip_defttl;
2570 /* XXX tos? */
2571
2572 /*
2573 * If we're doing Path MTU discovery, we need to set DF unless
2574 * the route's MTU is locked. If we don't yet know the route,
2575 * look it up now. We will copy this reference to the inpcb
2576 * when we finish creating the connection.
2577 */
2578 if ((rt = ro->ro_rt) == NULL || (rt->rt_flags & RTF_UP) == 0) {
2579 if (ro->ro_rt != NULL) {
2580 RTFREE(ro->ro_rt);
2581 ro->ro_rt = NULL;
2582 }
2583 dst = satosin(&ro->ro_dst);
2584 dst->sin_family = AF_INET;
2585 dst->sin_len = sizeof(*dst);
2586 dst->sin_addr = ti->ti_dst;
2587 rtalloc(ro);
2588 if ((rt = ro->ro_rt) == NULL) {
2589 m_freem(m);
2590 ipstat.ips_noroute++;
2591 return (EHOSTUNREACH);
2592 }
2593 }
2594 if (ip_mtudisc != 0 && (rt->rt_rmx.rmx_locks & RTV_MTU) == 0)
2595 ((struct ip *)ti)->ip_off |= IP_DF;
2596
2597 /* ...and send it off! */
2598 error = ip_output(m, sc->sc_ipopts, ro, 0, NULL);
2599
2600 return (error);
2601 }
2602