tcp_input.c revision 1.39 1 /* $NetBSD: tcp_input.c,v 1.39 1998/01/05 10:32:03 thorpej Exp $ */
2
3 /*
4 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995
5 * The Regents of the University of California. All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in the
14 * documentation and/or other materials provided with the distribution.
15 * 3. All advertising materials mentioning features or use of this software
16 * must display the following acknowledgement:
17 * This product includes software developed by the University of
18 * California, Berkeley and its contributors.
19 * 4. Neither the name of the University nor the names of its contributors
20 * may be used to endorse or promote products derived from this software
21 * without specific prior written permission.
22 *
23 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
24 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
25 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
26 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
27 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
28 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
29 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
30 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
31 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
32 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
33 * SUCH DAMAGE.
34 *
35 * @(#)tcp_input.c 8.12 (Berkeley) 5/24/95
36 */
37
38 /*
39 * TODO list for SYN cache stuff:
40 *
41 * (a) The definition of "struct syn_cache" says:
42 *
43 * This structure should not exceed 32 bytes.
44 *
45 * but it's 40 bytes on the Alpha. Can reduce memory use one
46 * of two ways:
47 *
48 * (1) Use a dynamically-sized hash table, and handle
49 * collisions by rehashing. Then sc_next is unnecessary.
50 *
51 * (2) Allocate syn_cache structures in pages (or some other
52 * large chunk). This would probably be desirable for
53 * maintaining locality of reference anyway.
54 *
55 * If you do this, you can change sc_next to a page/index
56 * value, and make it a 32-bit (or maybe even 16-bit)
57 * integer, thus partly obviating the need for the previous
58 * hack.
59 *
60 * It's also worth noting that this is necessary for IPv6, as well,
61 * where we use 32 bytes just for the IP addresses, so eliminating
62 * wastage is going to become more important. (BTW, has anyone
63 * integrated these changes with one of the IPv6 stacks that are
64 * available?)
65 *
66 * (b) Find room for a "state" field, which is needed to keep a
67 * compressed state for TIME_WAIT TCBs. It's been noted already
68 * that this is fairly important for very high-volume web and
69 * mail servers, which use a large number of short-lived
70 * connections.
71 */
72
73 #ifndef TUBA_INCLUDE
74 #include <sys/param.h>
75 #include <sys/systm.h>
76 #include <sys/malloc.h>
77 #include <sys/mbuf.h>
78 #include <sys/protosw.h>
79 #include <sys/socket.h>
80 #include <sys/socketvar.h>
81 #include <sys/errno.h>
82
83 #include <net/if.h>
84 #include <net/route.h>
85
86 #include <netinet/in.h>
87 #include <netinet/in_systm.h>
88 #include <netinet/ip.h>
89 #include <netinet/in_pcb.h>
90 #include <netinet/ip_var.h>
91 #include <netinet/tcp.h>
92 #include <netinet/tcp_fsm.h>
93 #include <netinet/tcp_seq.h>
94 #include <netinet/tcp_timer.h>
95 #include <netinet/tcp_var.h>
96 #include <netinet/tcpip.h>
97 #include <netinet/tcp_debug.h>
98
99 #include <machine/stdarg.h>
100
101 int tcprexmtthresh = 3; /* duplicate-ACK count at which tcp_input() retransmits the presumed-lost segment */
102 struct tcpiphdr tcp_saveti; /* copy of the incoming TCP/IP header, saved by tcp_input() when SO_DEBUG is set */
103
104 extern u_long sb_max; /* defined elsewhere; NOTE(review): not referenced in the portion of the file reviewed here */
105
106 #endif /* TUBA_INCLUDE */
/*
 * How long ts_recent may go without being refreshed before the PAWS check
 * in tcp_input() stops trusting it: 24 days, expressed in PR_SLOWHZ units
 * (days * hours * minutes * seconds * PR_SLOWHZ).
 */
#define	TCP_PAWS_IDLE	(24 * 24 * 60 * 60 * PR_SLOWHZ)

/*
 * Modulo-2^32 comparisons of timestamps.  The difference is reduced to
 * exactly 32 bits (int32_t rather than plain int) so that wraparound is
 * handled the same way regardless of how wide "int" or the operand types
 * are on a given port.  Each argument is evaluated exactly once.
 */
#define	TSTMP_LT(a,b)	((int32_t)((a)-(b)) < 0)
#define	TSTMP_GEQ(a,b)	((int32_t)((a)-(b)) >= 0)
112
/*
 * TCP_SETUP_ACK(tp, ti):
 *	Choose the ACK behavior for the segment whose header is ti on the
 *	connection tp.  If the other side set TH_PUSH, or an ACK has already
 *	been delayed (TF_DELACK is set), request an immediate ACK by setting
 *	TF_ACKNOW (an ACK must be sent every two segments); otherwise
 *	delay the ACK via TCP_SET_DELACK().
 *
 *	Both arguments are evaluated more than once, so they must be free of
 *	side effects.  Every use of an argument is parenthesized so that an
 *	arbitrary pointer expression, not just a plain identifier, can be
 *	passed.
 */
#define	TCP_SETUP_ACK(tp, ti) \
do { \
	if (((ti)->ti_flags & TH_PUSH) || \
	    ((tp)->t_flags & TF_DELACK)) \
		(tp)->t_flags |= TF_ACKNOW; \
	else \
		TCP_SET_DELACK(tp); \
} while (0)
126
/*
 * TCP_REASS(tp, ti, m, so, flags):
 *	Insert segment ti (data in mbuf chain m) into the reassembly queue of
 *	the connection with control block tp, whose socket is so.  On
 *	completion "flags" holds TH_FIN if reassembly now includes a segment
 *	with FIN, and 0 otherwise.
 *
 *	The macro does the common case inline: when the segment is the next
 *	one expected (ti_seq == rcv_nxt), the queue is empty and the
 *	connection is ESTABLISHED, the data is appended straight to the
 *	socket's receive buffer, the receive statistics are bumped and the
 *	reader is woken, avoiding linkage into and removal from the queue.
 *	Everything else goes through tcp_reass().
 *
 *	In-order segments get the normal TCP_SETUP_ACK() treatment; segments
 *	taking the tcp_reass() path are ACKed immediately (TF_ACKNOW) so that
 *	the sender's fast retransmit can work.
 *
 *	Arguments are evaluated more than once and must have no side effects.
 *	All uses are parenthesized so pointer/lvalue expressions may be
 *	passed.  The expansion is deliberately left as a bare { } block, as
 *	before, so that existing call sites keep compiling unchanged.
 */
#define	TCP_REASS(tp, ti, m, so, flags) { \
	if ((ti)->ti_seq == (tp)->rcv_nxt && \
	    (tp)->segq.lh_first == NULL && \
	    (tp)->t_state == TCPS_ESTABLISHED) { \
		TCP_SETUP_ACK(tp, ti); \
		(tp)->rcv_nxt += (ti)->ti_len; \
		(flags) = (ti)->ti_flags & TH_FIN; \
		tcpstat.tcps_rcvpack++; \
		tcpstat.tcps_rcvbyte += (ti)->ti_len; \
		sbappend(&(so)->so_rcv, (m)); \
		sorwakeup(so); \
	} else { \
		(flags) = tcp_reass((tp), (ti), (m)); \
		(tp)->t_flags |= TF_ACKNOW; \
	} \
}
153 #ifndef TUBA_INCLUDE
154
155 int
156 tcp_reass(tp, ti, m)
157 register struct tcpcb *tp;
158 register struct tcpiphdr *ti;
159 struct mbuf *m;
160 {
161 register struct ipqent *p, *q, *nq, *tiqe;
162 struct socket *so = tp->t_inpcb->inp_socket;
163 int flags;
164
165 /*
166 * Call with ti==0 after become established to
167 * force pre-ESTABLISHED data up to user socket.
168 */
169 if (ti == 0)
170 goto present;
171
172 /*
173 * Allocate a new queue entry, before we throw away any data.
174 * If we can't, just drop the packet. XXX
175 */
176 MALLOC(tiqe, struct ipqent *, sizeof (struct ipqent), M_IPQ, M_NOWAIT);
177 if (tiqe == NULL) {
178 tcpstat.tcps_rcvmemdrop++;
179 m_freem(m);
180 return (0);
181 }
182
183 /*
184 * Find a segment which begins after this one does.
185 */
186 for (p = NULL, q = tp->segq.lh_first; q != NULL;
187 p = q, q = q->ipqe_q.le_next)
188 if (SEQ_GT(q->ipqe_tcp->ti_seq, ti->ti_seq))
189 break;
190
191 /*
192 * If there is a preceding segment, it may provide some of
193 * our data already. If so, drop the data from the incoming
194 * segment. If it provides all of our data, drop us.
195 */
196 if (p != NULL) {
197 register struct tcpiphdr *phdr = p->ipqe_tcp;
198 register int i;
199
200 /* conversion to int (in i) handles seq wraparound */
201 i = phdr->ti_seq + phdr->ti_len - ti->ti_seq;
202 if (i > 0) {
203 if (i >= ti->ti_len) {
204 tcpstat.tcps_rcvduppack++;
205 tcpstat.tcps_rcvdupbyte += ti->ti_len;
206 m_freem(m);
207 FREE(tiqe, M_IPQ);
208 return (0);
209 }
210 m_adj(m, i);
211 ti->ti_len -= i;
212 ti->ti_seq += i;
213 }
214 }
215 tcpstat.tcps_rcvoopack++;
216 tcpstat.tcps_rcvoobyte += ti->ti_len;
217
218 /*
219 * While we overlap succeeding segments trim them or,
220 * if they are completely covered, dequeue them.
221 */
222 for (; q != NULL; q = nq) {
223 register struct tcpiphdr *qhdr = q->ipqe_tcp;
224 register int i = (ti->ti_seq + ti->ti_len) - qhdr->ti_seq;
225
226 if (i <= 0)
227 break;
228 if (i < qhdr->ti_len) {
229 qhdr->ti_seq += i;
230 qhdr->ti_len -= i;
231 m_adj(q->ipqe_m, i);
232 break;
233 }
234 nq = q->ipqe_q.le_next;
235 m_freem(q->ipqe_m);
236 LIST_REMOVE(q, ipqe_q);
237 FREE(q, M_IPQ);
238 }
239
240 /* Insert the new fragment queue entry into place. */
241 tiqe->ipqe_m = m;
242 tiqe->ipqe_tcp = ti;
243 if (p == NULL) {
244 LIST_INSERT_HEAD(&tp->segq, tiqe, ipqe_q);
245 } else {
246 LIST_INSERT_AFTER(p, tiqe, ipqe_q);
247 }
248
249 present:
250 /*
251 * Present data to user, advancing rcv_nxt through
252 * completed sequence space.
253 */
254 if (TCPS_HAVEESTABLISHED(tp->t_state) == 0)
255 return (0);
256 q = tp->segq.lh_first;
257 if (q == NULL || q->ipqe_tcp->ti_seq != tp->rcv_nxt)
258 return (0);
259 if (tp->t_state == TCPS_SYN_RECEIVED && q->ipqe_tcp->ti_len)
260 return (0);
261 do {
262 tp->rcv_nxt += q->ipqe_tcp->ti_len;
263 flags = q->ipqe_tcp->ti_flags & TH_FIN;
264
265 nq = q->ipqe_q.le_next;
266 LIST_REMOVE(q, ipqe_q);
267 if (so->so_state & SS_CANTRCVMORE)
268 m_freem(q->ipqe_m);
269 else
270 sbappend(&so->so_rcv, q->ipqe_m);
271 FREE(q, M_IPQ);
272 q = nq;
273 } while (q != NULL && q->ipqe_tcp->ti_seq == tp->rcv_nxt);
274 sorwakeup(so);
275 return (flags);
276 }
277
278 /*
279 * TCP input routine, follows pages 65-76 of the
280 * protocol specification dated September, 1981 very closely.
281 */
282 void
283 #if __STDC__
284 tcp_input(struct mbuf *m, ...)
285 #else
286 tcp_input(m, va_alist)
287 register struct mbuf *m;
288 #endif
289 {
290 register struct tcpiphdr *ti;
291 register struct inpcb *inp;
292 caddr_t optp = NULL;
293 int optlen = 0;
294 int len, tlen, off, hdroptlen;
295 register struct tcpcb *tp = 0;
296 register int tiflags;
297 struct socket *so = NULL;
298 int todrop, acked, ourfinisacked, needoutput = 0;
299 short ostate = 0;
300 int iss = 0;
301 u_long tiwin;
302 struct tcp_opt_info opti;
303 int iphlen;
304 va_list ap;
305
306 va_start(ap, m);
307 iphlen = va_arg(ap, int);
308 va_end(ap);
309
310 tcpstat.tcps_rcvtotal++;
311
312 opti.ts_present = 0;
313 opti.maxseg = 0;
314
315 /*
316 * Get IP and TCP header together in first mbuf.
317 * Note: IP leaves IP header in first mbuf.
318 */
319 ti = mtod(m, struct tcpiphdr *);
320 if (iphlen > sizeof (struct ip))
321 ip_stripoptions(m, (struct mbuf *)0);
322 if (m->m_len < sizeof (struct tcpiphdr)) {
323 if ((m = m_pullup(m, sizeof (struct tcpiphdr))) == 0) {
324 tcpstat.tcps_rcvshort++;
325 return;
326 }
327 ti = mtod(m, struct tcpiphdr *);
328 }
329
330 /*
331 * Checksum extended TCP header and data.
332 */
333 tlen = ((struct ip *)ti)->ip_len;
334 len = sizeof (struct ip) + tlen;
335 bzero(ti->ti_x1, sizeof ti->ti_x1);
336 ti->ti_len = (u_int16_t)tlen;
337 HTONS(ti->ti_len);
338 if ((ti->ti_sum = in_cksum(m, len)) != 0) {
339 tcpstat.tcps_rcvbadsum++;
340 goto drop;
341 }
342 #endif /* TUBA_INCLUDE */
343
344 /*
345 * Check that TCP offset makes sense,
346 * pull out TCP options and adjust length. XXX
347 */
348 off = ti->ti_off << 2;
349 if (off < sizeof (struct tcphdr) || off > tlen) {
350 tcpstat.tcps_rcvbadoff++;
351 goto drop;
352 }
353 tlen -= off;
354 ti->ti_len = tlen;
355 if (off > sizeof (struct tcphdr)) {
356 if (m->m_len < sizeof(struct ip) + off) {
357 if ((m = m_pullup(m, sizeof (struct ip) + off)) == 0) {
358 tcpstat.tcps_rcvshort++;
359 return;
360 }
361 ti = mtod(m, struct tcpiphdr *);
362 }
363 optlen = off - sizeof (struct tcphdr);
364 optp = mtod(m, caddr_t) + sizeof (struct tcpiphdr);
365 /*
366 * Do quick retrieval of timestamp options ("options
367 * prediction?"). If timestamp is the only option and it's
368 * formatted as recommended in RFC 1323 appendix A, we
369 * quickly get the values now and not bother calling
370 * tcp_dooptions(), etc.
371 */
372 if ((optlen == TCPOLEN_TSTAMP_APPA ||
373 (optlen > TCPOLEN_TSTAMP_APPA &&
374 optp[TCPOLEN_TSTAMP_APPA] == TCPOPT_EOL)) &&
375 *(u_int32_t *)optp == htonl(TCPOPT_TSTAMP_HDR) &&
376 (ti->ti_flags & TH_SYN) == 0) {
377 opti.ts_present = 1;
378 opti.ts_val = ntohl(*(u_int32_t *)(optp + 4));
379 opti.ts_ecr = ntohl(*(u_int32_t *)(optp + 8));
380 optp = NULL; /* we've parsed the options */
381 }
382 }
383 tiflags = ti->ti_flags;
384
385 /*
386 * Convert TCP protocol specific fields to host format.
387 */
388 NTOHL(ti->ti_seq);
389 NTOHL(ti->ti_ack);
390 NTOHS(ti->ti_win);
391 NTOHS(ti->ti_urp);
392
393 /*
394 * Locate pcb for segment.
395 */
396 findpcb:
397 inp = in_pcblookup_connect(&tcbtable, ti->ti_src, ti->ti_sport,
398 ti->ti_dst, ti->ti_dport);
399 if (inp == 0) {
400 ++tcpstat.tcps_pcbhashmiss;
401 inp = in_pcblookup_bind(&tcbtable, ti->ti_dst, ti->ti_dport);
402 if (inp == 0) {
403 ++tcpstat.tcps_noport;
404 goto dropwithreset;
405 }
406 }
407
408 /*
409 * If the state is CLOSED (i.e., TCB does not exist) then
410 * all data in the incoming segment is discarded.
411 * If the TCB exists but is in CLOSED state, it is embryonic,
412 * but should either do a listen or a connect soon.
413 */
414 tp = intotcpcb(inp);
415 if (tp == 0)
416 goto dropwithreset;
417 if (tp->t_state == TCPS_CLOSED)
418 goto drop;
419
420 /* Unscale the window into a 32-bit value. */
421 if ((tiflags & TH_SYN) == 0)
422 tiwin = ti->ti_win << tp->snd_scale;
423 else
424 tiwin = ti->ti_win;
425
426 so = inp->inp_socket;
427 if (so->so_options & (SO_DEBUG|SO_ACCEPTCONN)) {
428 if (so->so_options & SO_DEBUG) {
429 ostate = tp->t_state;
430 tcp_saveti = *ti;
431 }
432 if (so->so_options & SO_ACCEPTCONN) {
433 if ((tiflags & (TH_RST|TH_ACK|TH_SYN)) != TH_SYN) {
434 if (tiflags & TH_RST) {
435 syn_cache_reset(ti);
436 } else if ((tiflags & (TH_ACK|TH_SYN)) ==
437 (TH_ACK|TH_SYN)) {
438 /*
439 * Received a SYN,ACK. This should
440 * never happen while we are in
441 * LISTEN. Send an RST.
442 */
443 goto badsyn;
444 } else if (tiflags & TH_ACK) {
445 so = syn_cache_get(so, m);
446 if (so == NULL) {
447 /*
448 * We don't have a SYN for
449 * this ACK; send an RST.
450 */
451 goto badsyn;
452 } else if (so ==
453 (struct socket *)(-1)) {
454 /*
455 * We were unable to create
456 * the connection. If the
457 * 3-way handshake was
458 * completed, an RST has
459 * been sent to the peer.
460 * Since the mbuf might be
461 * in use for the reply,
462 * do not free it.
463 */
464 m = NULL;
465 } else {
466 /*
467 * We have created a
468 * full-blown connection.
469 */
470 inp = sotoinpcb(so);
471 tp = intotcpcb(inp);
472 tiwin <<= tp->snd_scale;
473 goto after_listen;
474 }
475 }
476 } else {
477 /*
478 * Received a SYN.
479 */
480 if (in_hosteq(ti->ti_src, ti->ti_dst) &&
481 ti->ti_sport == ti->ti_dport) {
482 /*
483 * LISTEN socket received a SYN
484 * from itself? This can't possibly
485 * be valid; drop the packet.
486 */
487 tcpstat.tcps_badsyn++;
488 goto drop;
489 }
490 /*
491 * SYN looks ok; create compressed TCP
492 * state for it.
493 */
494 if (so->so_qlen <= so->so_qlimit &&
495 syn_cache_add(so, m, optp, optlen, &opti))
496 m = NULL;
497 }
498 goto drop;
499 }
500 }
501
502 after_listen:
503 #ifdef DIAGNOSTIC
504 /*
505 * Should not happen now that all embryonic connections
506 * are handled with compressed state.
507 */
508 if (tp->t_state == TCPS_LISTEN)
509 panic("tcp_input: TCPS_LISTEN");
510 #endif
511
512 /*
513 * Segment received on connection.
514 * Reset idle time and keep-alive timer.
515 */
516 tp->t_idle = 0;
517 if (TCPS_HAVEESTABLISHED(tp->t_state))
518 tp->t_timer[TCPT_KEEP] = tcp_keepidle;
519
520 /*
521 * Process options.
522 */
523 if (optp)
524 tcp_dooptions(tp, optp, optlen, ti, &opti);
525
526 /*
527 * Header prediction: check for the two common cases
528 * of a uni-directional data xfer. If the packet has
529 * no control flags, is in-sequence, the window didn't
530 * change and we're not retransmitting, it's a
531 * candidate. If the length is zero and the ack moved
532 * forward, we're the sender side of the xfer. Just
533 * free the data acked & wake any higher level process
534 * that was blocked waiting for space. If the length
535 * is non-zero and the ack didn't move, we're the
536 * receiver side. If we're getting packets in-order
537 * (the reassembly queue is empty), add the data to
538 * the socket buffer and note that we need a delayed ack.
539 */
540 if (tp->t_state == TCPS_ESTABLISHED &&
541 (tiflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK)) == TH_ACK &&
542 (!opti.ts_present || TSTMP_GEQ(opti.ts_val, tp->ts_recent)) &&
543 ti->ti_seq == tp->rcv_nxt &&
544 tiwin && tiwin == tp->snd_wnd &&
545 tp->snd_nxt == tp->snd_max) {
546
547 /*
548 * If last ACK falls within this segment's sequence numbers,
549 * record the timestamp.
550 */
551 if (opti.ts_present &&
552 SEQ_LEQ(ti->ti_seq, tp->last_ack_sent) &&
553 SEQ_LT(tp->last_ack_sent, ti->ti_seq + ti->ti_len)) {
554 tp->ts_recent_age = tcp_now;
555 tp->ts_recent = opti.ts_val;
556 }
557
558 if (ti->ti_len == 0) {
559 if (SEQ_GT(ti->ti_ack, tp->snd_una) &&
560 SEQ_LEQ(ti->ti_ack, tp->snd_max) &&
561 tp->snd_cwnd >= tp->snd_wnd &&
562 tp->t_dupacks < tcprexmtthresh) {
563 /*
564 * this is a pure ack for outstanding data.
565 */
566 ++tcpstat.tcps_predack;
567 if (opti.ts_present)
568 tcp_xmit_timer(tp,
569 tcp_now-opti.ts_ecr+1);
570 else if (tp->t_rtt &&
571 SEQ_GT(ti->ti_ack, tp->t_rtseq))
572 tcp_xmit_timer(tp, tp->t_rtt);
573 acked = ti->ti_ack - tp->snd_una;
574 tcpstat.tcps_rcvackpack++;
575 tcpstat.tcps_rcvackbyte += acked;
576 sbdrop(&so->so_snd, acked);
577 tp->snd_una = ti->ti_ack;
578 m_freem(m);
579
580 /*
581 * If all outstanding data are acked, stop
582 * retransmit timer, otherwise restart timer
583 * using current (possibly backed-off) value.
584 * If process is waiting for space,
585 * wakeup/selwakeup/signal. If data
586 * are ready to send, let tcp_output
587 * decide between more output or persist.
588 */
589 if (tp->snd_una == tp->snd_max)
590 tp->t_timer[TCPT_REXMT] = 0;
591 else if (tp->t_timer[TCPT_PERSIST] == 0)
592 tp->t_timer[TCPT_REXMT] = tp->t_rxtcur;
593
594 if (sb_notify(&so->so_snd))
595 sowwakeup(so);
596 if (so->so_snd.sb_cc)
597 (void) tcp_output(tp);
598 return;
599 }
600 } else if (ti->ti_ack == tp->snd_una &&
601 tp->segq.lh_first == NULL &&
602 ti->ti_len <= sbspace(&so->so_rcv)) {
603 /*
604 * this is a pure, in-sequence data packet
605 * with nothing on the reassembly queue and
606 * we have enough buffer space to take it.
607 */
608 ++tcpstat.tcps_preddat;
609 tp->rcv_nxt += ti->ti_len;
610 tcpstat.tcps_rcvpack++;
611 tcpstat.tcps_rcvbyte += ti->ti_len;
612 /*
613 * Drop TCP, IP headers and TCP options then add data
614 * to socket buffer.
615 */
616 m->m_data += sizeof(struct tcpiphdr)+off-sizeof(struct tcphdr);
617 m->m_len -= sizeof(struct tcpiphdr)+off-sizeof(struct tcphdr);
618 sbappend(&so->so_rcv, m);
619 sorwakeup(so);
620 TCP_SETUP_ACK(tp, ti);
621 if (tp->t_flags & TF_ACKNOW)
622 (void) tcp_output(tp);
623 return;
624 }
625 }
626
627 /*
628 * Drop TCP, IP headers and TCP options.
629 */
630 hdroptlen = sizeof(struct tcpiphdr) + off - sizeof(struct tcphdr);
631 m->m_data += hdroptlen;
632 m->m_len -= hdroptlen;
633
634 /*
635 * Calculate amount of space in receive window,
636 * and then do TCP input processing.
637 * Receive window is amount of space in rcv queue,
638 * but not less than advertised window.
639 */
640 { int win;
641
642 win = sbspace(&so->so_rcv);
643 if (win < 0)
644 win = 0;
645 tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt));
646 }
647
648 switch (tp->t_state) {
649
650 /*
651 * If the state is SYN_SENT:
652 * if seg contains an ACK, but not for our SYN, drop the input.
653 * if seg contains a RST, then drop the connection.
654 * if seg does not contain SYN, then drop it.
655 * Otherwise this is an acceptable SYN segment
656 * initialize tp->rcv_nxt and tp->irs
657 * if seg contains ack then advance tp->snd_una
658 * if SYN has been acked change to ESTABLISHED else SYN_RCVD state
659 * arrange for segment to be acked (eventually)
660 * continue processing rest of data/controls, beginning with URG
661 */
662 case TCPS_SYN_SENT:
663 if ((tiflags & TH_ACK) &&
664 (SEQ_LEQ(ti->ti_ack, tp->iss) ||
665 SEQ_GT(ti->ti_ack, tp->snd_max)))
666 goto dropwithreset;
667 if (tiflags & TH_RST) {
668 if (tiflags & TH_ACK)
669 tp = tcp_drop(tp, ECONNREFUSED);
670 goto drop;
671 }
672 if ((tiflags & TH_SYN) == 0)
673 goto drop;
674 if (tiflags & TH_ACK) {
675 tp->snd_una = ti->ti_ack;
676 if (SEQ_LT(tp->snd_nxt, tp->snd_una))
677 tp->snd_nxt = tp->snd_una;
678 }
679 tp->t_timer[TCPT_REXMT] = 0;
680 tp->irs = ti->ti_seq;
681 tcp_rcvseqinit(tp);
682 tp->t_flags |= TF_ACKNOW;
683 tcp_mss_from_peer(tp, opti.maxseg);
684 tcp_rmx_rtt(tp);
685 if (tiflags & TH_ACK && SEQ_GT(tp->snd_una, tp->iss)) {
686 tcpstat.tcps_connects++;
687 soisconnected(so);
688 tcp_established(tp);
689 /* Do window scaling on this connection? */
690 if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
691 (TF_RCVD_SCALE|TF_REQ_SCALE)) {
692 tp->snd_scale = tp->requested_s_scale;
693 tp->rcv_scale = tp->request_r_scale;
694 }
695 (void) tcp_reass(tp, (struct tcpiphdr *)0,
696 (struct mbuf *)0);
697 /*
698 * if we didn't have to retransmit the SYN,
699 * use its rtt as our initial srtt & rtt var.
700 */
701 if (tp->t_rtt)
702 tcp_xmit_timer(tp, tp->t_rtt);
703 } else
704 tp->t_state = TCPS_SYN_RECEIVED;
705
706 /*
707 * Advance ti->ti_seq to correspond to first data byte.
708 * If data, trim to stay within window,
709 * dropping FIN if necessary.
710 */
711 ti->ti_seq++;
712 if (ti->ti_len > tp->rcv_wnd) {
713 todrop = ti->ti_len - tp->rcv_wnd;
714 m_adj(m, -todrop);
715 ti->ti_len = tp->rcv_wnd;
716 tiflags &= ~TH_FIN;
717 tcpstat.tcps_rcvpackafterwin++;
718 tcpstat.tcps_rcvbyteafterwin += todrop;
719 }
720 tp->snd_wl1 = ti->ti_seq - 1;
721 tp->rcv_up = ti->ti_seq;
722 goto step6;
723
724 /*
725 * If the state is SYN_RECEIVED:
726 * If seg contains an ACK, but not for our SYN, drop the input
727 * and generate an RST. See page 36, rfc793
728 */
729 case TCPS_SYN_RECEIVED:
730 if ((tiflags & TH_ACK) &&
731 (SEQ_LEQ(ti->ti_ack, tp->iss) ||
732 SEQ_GT(ti->ti_ack, tp->snd_max)))
733 goto dropwithreset;
734 break;
735 }
736
737 /*
738 * States other than LISTEN or SYN_SENT.
739 * First check timestamp, if present.
740 * Then check that at least some bytes of segment are within
741 * receive window. If segment begins before rcv_nxt,
742 * drop leading data (and SYN); if nothing left, just ack.
743 *
744 * RFC 1323 PAWS: If we have a timestamp reply on this segment
745 * and it's less than ts_recent, drop it.
746 */
747 if (opti.ts_present && (tiflags & TH_RST) == 0 && tp->ts_recent &&
748 TSTMP_LT(opti.ts_val, tp->ts_recent)) {
749
750 /* Check to see if ts_recent is over 24 days old. */
751 if ((int)(tcp_now - tp->ts_recent_age) > TCP_PAWS_IDLE) {
752 /*
753 * Invalidate ts_recent. If this segment updates
754 * ts_recent, the age will be reset later and ts_recent
755 * will get a valid value. If it does not, setting
756 * ts_recent to zero will at least satisfy the
757 * requirement that zero be placed in the timestamp
758 * echo reply when ts_recent isn't valid. The
759 * age isn't reset until we get a valid ts_recent
760 * because we don't want out-of-order segments to be
761 * dropped when ts_recent is old.
762 */
763 tp->ts_recent = 0;
764 } else {
765 tcpstat.tcps_rcvduppack++;
766 tcpstat.tcps_rcvdupbyte += ti->ti_len;
767 tcpstat.tcps_pawsdrop++;
768 goto dropafterack;
769 }
770 }
771
772 todrop = tp->rcv_nxt - ti->ti_seq;
773 if (todrop > 0) {
774 if (tiflags & TH_SYN) {
775 tiflags &= ~TH_SYN;
776 ti->ti_seq++;
777 if (ti->ti_urp > 1)
778 ti->ti_urp--;
779 else {
780 tiflags &= ~TH_URG;
781 ti->ti_urp = 0;
782 }
783 todrop--;
784 }
785 if (todrop >= ti->ti_len) {
786 /*
787 * Any valid FIN must be to the left of the
788 * window. At this point, FIN must be a
789 * duplicate or out-of-sequence, so drop it.
790 */
791 tiflags &= ~TH_FIN;
792 /*
793 * Send ACK to resynchronize, and drop any data,
794 * but keep on processing for RST or ACK.
795 */
796 tp->t_flags |= TF_ACKNOW;
797 tcpstat.tcps_rcvdupbyte += todrop = ti->ti_len;
798 tcpstat.tcps_rcvduppack++;
799 } else {
800 tcpstat.tcps_rcvpartduppack++;
801 tcpstat.tcps_rcvpartdupbyte += todrop;
802 }
803 m_adj(m, todrop);
804 ti->ti_seq += todrop;
805 ti->ti_len -= todrop;
806 if (ti->ti_urp > todrop)
807 ti->ti_urp -= todrop;
808 else {
809 tiflags &= ~TH_URG;
810 ti->ti_urp = 0;
811 }
812 }
813
814 /*
815 * If new data are received on a connection after the
816 * user processes are gone, then RST the other end.
817 */
818 if ((so->so_state & SS_NOFDREF) &&
819 tp->t_state > TCPS_CLOSE_WAIT && ti->ti_len) {
820 tp = tcp_close(tp);
821 tcpstat.tcps_rcvafterclose++;
822 goto dropwithreset;
823 }
824
825 /*
826 * If segment ends after window, drop trailing data
827 * (and PUSH and FIN); if nothing left, just ACK.
828 */
829 todrop = (ti->ti_seq+ti->ti_len) - (tp->rcv_nxt+tp->rcv_wnd);
830 if (todrop > 0) {
831 tcpstat.tcps_rcvpackafterwin++;
832 if (todrop >= ti->ti_len) {
833 tcpstat.tcps_rcvbyteafterwin += ti->ti_len;
834 /*
835 * If a new connection request is received
836 * while in TIME_WAIT, drop the old connection
837 * and start over if the sequence numbers
838 * are above the previous ones.
839 */
840 if (tiflags & TH_SYN &&
841 tp->t_state == TCPS_TIME_WAIT &&
842 SEQ_GT(ti->ti_seq, tp->rcv_nxt)) {
843 iss = tcp_new_iss(tp, sizeof(struct tcpcb),
844 tp->rcv_nxt);
845 tp = tcp_close(tp);
846 /*
847 * We have already advanced the mbuf
848 * pointers past the IP+TCP headers and
849 * options. Restore those pointers before
850 * attempting to use the TCP header again.
851 */
852 m->m_data -= hdroptlen;
853 m->m_len += hdroptlen;
854 goto findpcb;
855 }
856 /*
857 * If window is closed can only take segments at
858 * window edge, and have to drop data and PUSH from
859 * incoming segments. Continue processing, but
860 * remember to ack. Otherwise, drop segment
861 * and ack.
862 */
863 if (tp->rcv_wnd == 0 && ti->ti_seq == tp->rcv_nxt) {
864 tp->t_flags |= TF_ACKNOW;
865 tcpstat.tcps_rcvwinprobe++;
866 } else
867 goto dropafterack;
868 } else
869 tcpstat.tcps_rcvbyteafterwin += todrop;
870 m_adj(m, -todrop);
871 ti->ti_len -= todrop;
872 tiflags &= ~(TH_PUSH|TH_FIN);
873 }
874
875 /*
876 * If last ACK falls within this segment's sequence numbers,
877 * record its timestamp.
878 */
879 if (opti.ts_present && SEQ_LEQ(ti->ti_seq, tp->last_ack_sent) &&
880 SEQ_LT(tp->last_ack_sent, ti->ti_seq + ti->ti_len +
881 ((tiflags & (TH_SYN|TH_FIN)) != 0))) {
882 tp->ts_recent_age = tcp_now;
883 tp->ts_recent = opti.ts_val;
884 }
885
886 /*
887 * If the RST bit is set examine the state:
888 * SYN_RECEIVED STATE:
889 * If passive open, return to LISTEN state.
890 * If active open, inform user that connection was refused.
891 * ESTABLISHED, FIN_WAIT_1, FIN_WAIT2, CLOSE_WAIT STATES:
892 * Inform user that connection was reset, and close tcb.
893 * CLOSING, LAST_ACK, TIME_WAIT STATES
894 * Close the tcb.
895 */
896 if (tiflags&TH_RST) switch (tp->t_state) {
897
898 case TCPS_SYN_RECEIVED:
899 so->so_error = ECONNREFUSED;
900 goto close;
901
902 case TCPS_ESTABLISHED:
903 case TCPS_FIN_WAIT_1:
904 case TCPS_FIN_WAIT_2:
905 case TCPS_CLOSE_WAIT:
906 so->so_error = ECONNRESET;
907 close:
908 tp->t_state = TCPS_CLOSED;
909 tcpstat.tcps_drops++;
910 tp = tcp_close(tp);
911 goto drop;
912
913 case TCPS_CLOSING:
914 case TCPS_LAST_ACK:
915 case TCPS_TIME_WAIT:
916 tp = tcp_close(tp);
917 goto drop;
918 }
919
920 /*
921 * If a SYN is in the window, then this is an
922 * error and we send an RST and drop the connection.
923 */
924 if (tiflags & TH_SYN) {
925 tp = tcp_drop(tp, ECONNRESET);
926 goto dropwithreset;
927 }
928
929 /*
930 * If the ACK bit is off we drop the segment and return.
931 */
932 if ((tiflags & TH_ACK) == 0)
933 goto drop;
934
935 /*
936 * Ack processing.
937 */
938 switch (tp->t_state) {
939
940 /*
941 * In SYN_RECEIVED state if the ack ACKs our SYN then enter
942 * ESTABLISHED state and continue processing, otherwise
943 * send an RST.
944 */
945 case TCPS_SYN_RECEIVED:
946 if (SEQ_GT(tp->snd_una, ti->ti_ack) ||
947 SEQ_GT(ti->ti_ack, tp->snd_max))
948 goto dropwithreset;
949 tcpstat.tcps_connects++;
950 soisconnected(so);
951 tcp_established(tp);
952 /* Do window scaling? */
953 if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
954 (TF_RCVD_SCALE|TF_REQ_SCALE)) {
955 tp->snd_scale = tp->requested_s_scale;
956 tp->rcv_scale = tp->request_r_scale;
957 }
958 (void) tcp_reass(tp, (struct tcpiphdr *)0, (struct mbuf *)0);
959 tp->snd_wl1 = ti->ti_seq - 1;
960 /* fall into ... */
961
962 /*
963 * In ESTABLISHED state: drop duplicate ACKs; ACK out of range
964 * ACKs. If the ack is in the range
965 * tp->snd_una < ti->ti_ack <= tp->snd_max
966 * then advance tp->snd_una to ti->ti_ack and drop
967 * data from the retransmission queue. If this ACK reflects
968 * more up to date window information we update our window information.
969 */
970 case TCPS_ESTABLISHED:
971 case TCPS_FIN_WAIT_1:
972 case TCPS_FIN_WAIT_2:
973 case TCPS_CLOSE_WAIT:
974 case TCPS_CLOSING:
975 case TCPS_LAST_ACK:
976 case TCPS_TIME_WAIT:
977
978 if (SEQ_LEQ(ti->ti_ack, tp->snd_una)) {
979 if (ti->ti_len == 0 && tiwin == tp->snd_wnd) {
980 tcpstat.tcps_rcvdupack++;
981 /*
982 * If we have outstanding data (other than
983 * a window probe), this is a completely
984 * duplicate ack (ie, window info didn't
985 * change), the ack is the biggest we've
986 * seen and we've seen exactly our rexmt
987 * threshold of them, assume a packet
988 * has been dropped and retransmit it.
989 * Kludge snd_nxt & the congestion
990 * window so we send only this one
991 * packet.
992 *
993 * We know we're losing at the current
994 * window size so do congestion avoidance
995 * (set ssthresh to half the current window
996 * and pull our congestion window back to
997 * the new ssthresh).
998 *
999 * Dup acks mean that packets have left the
1000 * network (they're now cached at the receiver)
1001 * so bump cwnd by the amount in the receiver
1002 * to keep a constant cwnd packets in the
1003 * network.
1004 */
1005 if (tp->t_timer[TCPT_REXMT] == 0 ||
1006 ti->ti_ack != tp->snd_una)
1007 tp->t_dupacks = 0;
1008 else if (++tp->t_dupacks == tcprexmtthresh) {
1009 tcp_seq onxt = tp->snd_nxt;
1010 u_int win =
1011 min(tp->snd_wnd, tp->snd_cwnd) /
1012 2 / tp->t_segsz;
1013
1014 if (win < 2)
1015 win = 2;
1016 tp->snd_ssthresh = win * tp->t_segsz;
1017 tp->t_timer[TCPT_REXMT] = 0;
1018 tp->t_rtt = 0;
1019 tp->snd_nxt = ti->ti_ack;
1020 tp->snd_cwnd = tp->t_segsz;
1021 (void) tcp_output(tp);
1022 tp->snd_cwnd = tp->snd_ssthresh +
1023 tp->t_segsz * tp->t_dupacks;
1024 if (SEQ_GT(onxt, tp->snd_nxt))
1025 tp->snd_nxt = onxt;
1026 goto drop;
1027 } else if (tp->t_dupacks > tcprexmtthresh) {
1028 tp->snd_cwnd += tp->t_segsz;
1029 (void) tcp_output(tp);
1030 goto drop;
1031 }
1032 } else
1033 tp->t_dupacks = 0;
1034 break;
1035 }
1036 /*
1037 * If the congestion window was inflated to account
1038 * for the other side's cached packets, retract it.
1039 */
1040 if (tp->t_dupacks >= tcprexmtthresh &&
1041 tp->snd_cwnd > tp->snd_ssthresh)
1042 tp->snd_cwnd = tp->snd_ssthresh;
1043 tp->t_dupacks = 0;
1044 if (SEQ_GT(ti->ti_ack, tp->snd_max)) {
1045 tcpstat.tcps_rcvacktoomuch++;
1046 goto dropafterack;
1047 }
1048 acked = ti->ti_ack - tp->snd_una;
1049 tcpstat.tcps_rcvackpack++;
1050 tcpstat.tcps_rcvackbyte += acked;
1051
1052 /*
1053 * If we have a timestamp reply, update smoothed
1054 * round trip time. If no timestamp is present but
1055 * transmit timer is running and timed sequence
1056 * number was acked, update smoothed round trip time.
1057 * Since we now have an rtt measurement, cancel the
1058 * timer backoff (cf., Phil Karn's retransmit alg.).
1059 * Recompute the initial retransmit timer.
1060 */
1061 if (opti.ts_present)
1062 tcp_xmit_timer(tp, tcp_now - opti.ts_ecr + 1);
1063 else if (tp->t_rtt && SEQ_GT(ti->ti_ack, tp->t_rtseq))
1064 tcp_xmit_timer(tp,tp->t_rtt);
1065
1066 /*
1067 * If all outstanding data is acked, stop retransmit
1068 * timer and remember to restart (more output or persist).
1069 * If there is more data to be acked, restart retransmit
1070 * timer, using current (possibly backed-off) value.
1071 */
1072 if (ti->ti_ack == tp->snd_max) {
1073 tp->t_timer[TCPT_REXMT] = 0;
1074 needoutput = 1;
1075 } else if (tp->t_timer[TCPT_PERSIST] == 0)
1076 tp->t_timer[TCPT_REXMT] = tp->t_rxtcur;
1077 /*
1078 * When new data is acked, open the congestion window.
1079 * If the window gives us less than ssthresh packets
1080 * in flight, open exponentially (segsz per packet).
1081 * Otherwise open linearly: segsz per window
1082 * (segsz^2 / cwnd per packet), plus a constant
1083 * fraction of a packet (segsz/8) to help larger windows
1084 * open quickly enough.
1085 */
1086 {
1087 register u_int cw = tp->snd_cwnd;
1088 register u_int incr = tp->t_segsz;
1089
1090 if (cw > tp->snd_ssthresh)
1091 incr = incr * incr / cw;
1092 tp->snd_cwnd = min(cw + incr, TCP_MAXWIN<<tp->snd_scale);
1093 }
1094 if (acked > so->so_snd.sb_cc) {
1095 tp->snd_wnd -= so->so_snd.sb_cc;
1096 sbdrop(&so->so_snd, (int)so->so_snd.sb_cc);
1097 ourfinisacked = 1;
1098 } else {
1099 sbdrop(&so->so_snd, acked);
1100 tp->snd_wnd -= acked;
1101 ourfinisacked = 0;
1102 }
1103 if (sb_notify(&so->so_snd))
1104 sowwakeup(so);
1105 tp->snd_una = ti->ti_ack;
1106 if (SEQ_LT(tp->snd_nxt, tp->snd_una))
1107 tp->snd_nxt = tp->snd_una;
1108
1109 switch (tp->t_state) {
1110
1111 /*
1112 * In FIN_WAIT_1 STATE in addition to the processing
1113 * for the ESTABLISHED state if our FIN is now acknowledged
1114 * then enter FIN_WAIT_2.
1115 */
1116 case TCPS_FIN_WAIT_1:
1117 if (ourfinisacked) {
1118 /*
1119 * If we can't receive any more
1120 * data, then closing user can proceed.
1121 * Starting the timer is contrary to the
1122 * specification, but if we don't get a FIN
1123 * we'll hang forever.
1124 */
1125 if (so->so_state & SS_CANTRCVMORE) {
1126 soisdisconnected(so);
1127 tp->t_timer[TCPT_2MSL] = tcp_maxidle;
1128 }
1129 tp->t_state = TCPS_FIN_WAIT_2;
1130 }
1131 break;
1132
1133 /*
1134 * In CLOSING STATE in addition to the processing for
1135 * the ESTABLISHED state if the ACK acknowledges our FIN
1136 * then enter the TIME-WAIT state, otherwise ignore
1137 * the segment.
1138 */
1139 case TCPS_CLOSING:
1140 if (ourfinisacked) {
1141 tp->t_state = TCPS_TIME_WAIT;
1142 tcp_canceltimers(tp);
1143 tp->t_timer[TCPT_2MSL] = 2 * TCPTV_MSL;
1144 soisdisconnected(so);
1145 }
1146 break;
1147
1148 /*
1149 * In LAST_ACK, we may still be waiting for data to drain
1150 * and/or to be acked, as well as for the ack of our FIN.
1151 * If our FIN is now acknowledged, delete the TCB,
1152 * enter the closed state and return.
1153 */
1154 case TCPS_LAST_ACK:
1155 if (ourfinisacked) {
1156 tp = tcp_close(tp);
1157 goto drop;
1158 }
1159 break;
1160
1161 /*
1162 * In TIME_WAIT state the only thing that should arrive
1163 * is a retransmission of the remote FIN. Acknowledge
1164 * it and restart the finack timer.
1165 */
1166 case TCPS_TIME_WAIT:
1167 tp->t_timer[TCPT_2MSL] = 2 * TCPTV_MSL;
1168 goto dropafterack;
1169 }
1170 }
1171
1172 step6:
1173 /*
1174 * Update window information.
1175 * Don't look at window if no ACK: TAC's send garbage on first SYN.
1176 */
1177 if (((tiflags & TH_ACK) && SEQ_LT(tp->snd_wl1, ti->ti_seq)) ||
1178 (tp->snd_wl1 == ti->ti_seq && SEQ_LT(tp->snd_wl2, ti->ti_ack)) ||
1179 (tp->snd_wl2 == ti->ti_ack && tiwin > tp->snd_wnd)) {
1180 /* keep track of pure window updates */
1181 if (ti->ti_len == 0 &&
1182 tp->snd_wl2 == ti->ti_ack && tiwin > tp->snd_wnd)
1183 tcpstat.tcps_rcvwinupd++;
1184 tp->snd_wnd = tiwin;
1185 tp->snd_wl1 = ti->ti_seq;
1186 tp->snd_wl2 = ti->ti_ack;
1187 if (tp->snd_wnd > tp->max_sndwnd)
1188 tp->max_sndwnd = tp->snd_wnd;
1189 needoutput = 1;
1190 }
1191
1192 /*
1193 * Process segments with URG.
1194 */
1195 if ((tiflags & TH_URG) && ti->ti_urp &&
1196 TCPS_HAVERCVDFIN(tp->t_state) == 0) {
1197 /*
1198 * This is a kludge, but if we receive and accept
1199 * random urgent pointers, we'll crash in
1200 * soreceive. It's hard to imagine someone
1201 * actually wanting to send this much urgent data.
1202 */
1203 if (ti->ti_urp + so->so_rcv.sb_cc > sb_max) {
1204 ti->ti_urp = 0; /* XXX */
1205 tiflags &= ~TH_URG; /* XXX */
1206 goto dodata; /* XXX */
1207 }
1208 /*
1209 * If this segment advances the known urgent pointer,
1210 * then mark the data stream. This should not happen
1211 * in CLOSE_WAIT, CLOSING, LAST_ACK or TIME_WAIT STATES since
1212 * a FIN has been received from the remote side.
1213 * In these states we ignore the URG.
1214 *
1215 * According to RFC961 (Assigned Protocols),
1216 * the urgent pointer points to the last octet
1217 * of urgent data. We continue, however,
1218 * to consider it to indicate the first octet
1219 * of data past the urgent section as the original
1220 * spec states (in one of two places).
1221 */
1222 if (SEQ_GT(ti->ti_seq+ti->ti_urp, tp->rcv_up)) {
1223 tp->rcv_up = ti->ti_seq + ti->ti_urp;
1224 so->so_oobmark = so->so_rcv.sb_cc +
1225 (tp->rcv_up - tp->rcv_nxt) - 1;
1226 if (so->so_oobmark == 0)
1227 so->so_state |= SS_RCVATMARK;
1228 sohasoutofband(so);
1229 tp->t_oobflags &= ~(TCPOOB_HAVEDATA | TCPOOB_HADDATA);
1230 }
1231 /*
1232 * Remove out of band data so doesn't get presented to user.
1233 * This can happen independent of advancing the URG pointer,
1234 * but if two URG's are pending at once, some out-of-band
1235 * data may creep in... ick.
1236 */
1237 if (ti->ti_urp <= (u_int16_t) ti->ti_len
1238 #ifdef SO_OOBINLINE
1239 && (so->so_options & SO_OOBINLINE) == 0
1240 #endif
1241 )
1242 tcp_pulloutofband(so, ti, m);
1243 } else
1244 /*
1245 * If no out of band data is expected,
1246 * pull receive urgent pointer along
1247 * with the receive window.
1248 */
1249 if (SEQ_GT(tp->rcv_nxt, tp->rcv_up))
1250 tp->rcv_up = tp->rcv_nxt;
1251 dodata: /* XXX */
1252
1253 /*
1254 * Process the segment text, merging it into the TCP sequencing queue,
1255 * and arranging for acknowledgment of receipt if necessary.
1256 * This process logically involves adjusting tp->rcv_wnd as data
1257 * is presented to the user (this happens in tcp_usrreq.c,
1258 * case PRU_RCVD). If a FIN has already been received on this
1259 * connection then we just ignore the text.
1260 */
1261 if ((ti->ti_len || (tiflags & TH_FIN)) &&
1262 TCPS_HAVERCVDFIN(tp->t_state) == 0) {
1263 TCP_REASS(tp, ti, m, so, tiflags);
1264 /*
1265 * Note the amount of data that peer has sent into
1266 * our window, in order to estimate the sender's
1267 * buffer size.
1268 */
1269 len = so->so_rcv.sb_hiwat - (tp->rcv_adv - tp->rcv_nxt);
1270 } else {
1271 m_freem(m);
1272 tiflags &= ~TH_FIN;
1273 }
1274
1275 /*
1276 * If FIN is received ACK the FIN and let the user know
1277 * that the connection is closing. Ignore a FIN received before
1278 * the connection is fully established.
1279 */
1280 if ((tiflags & TH_FIN) && TCPS_HAVEESTABLISHED(tp->t_state)) {
1281 if (TCPS_HAVERCVDFIN(tp->t_state) == 0) {
1282 socantrcvmore(so);
1283 tp->t_flags |= TF_ACKNOW;
1284 tp->rcv_nxt++;
1285 }
1286 switch (tp->t_state) {
1287
1288 /*
1289 * In ESTABLISHED STATE enter the CLOSE_WAIT state.
1290 */
1291 case TCPS_ESTABLISHED:
1292 tp->t_state = TCPS_CLOSE_WAIT;
1293 break;
1294
1295 /*
1296 * If still in FIN_WAIT_1 STATE FIN has not been acked so
1297 * enter the CLOSING state.
1298 */
1299 case TCPS_FIN_WAIT_1:
1300 tp->t_state = TCPS_CLOSING;
1301 break;
1302
1303 /*
1304 * In FIN_WAIT_2 state enter the TIME_WAIT state,
1305 * starting the time-wait timer, turning off the other
1306 * standard timers.
1307 */
1308 case TCPS_FIN_WAIT_2:
1309 tp->t_state = TCPS_TIME_WAIT;
1310 tcp_canceltimers(tp);
1311 tp->t_timer[TCPT_2MSL] = 2 * TCPTV_MSL;
1312 soisdisconnected(so);
1313 break;
1314
1315 /*
1316 * In TIME_WAIT state restart the 2 MSL time_wait timer.
1317 */
1318 case TCPS_TIME_WAIT:
1319 tp->t_timer[TCPT_2MSL] = 2 * TCPTV_MSL;
1320 break;
1321 }
1322 }
1323 if (so->so_options & SO_DEBUG)
1324 tcp_trace(TA_INPUT, ostate, tp, &tcp_saveti, 0);
1325
1326 /*
1327 * Return any desired output.
1328 */
1329 if (needoutput || (tp->t_flags & TF_ACKNOW))
1330 (void) tcp_output(tp);
1331 return;
1332
1333 badsyn:
1334 /*
1335 * Received a bad SYN. Increment counters and dropwithreset.
1336 */
1337 tcpstat.tcps_badsyn++;
1338 tp = NULL;
1339 goto dropwithreset;
1340
1341 dropafterack:
1342 /*
1343 * Generate an ACK dropping incoming segment if it occupies
1344 * sequence space, where the ACK reflects our state.
1345 */
1346 if (tiflags & TH_RST)
1347 goto drop;
1348 m_freem(m);
1349 tp->t_flags |= TF_ACKNOW;
1350 (void) tcp_output(tp);
1351 return;
1352
1353 dropwithreset:
1354 /*
1355 * Generate a RST, dropping incoming segment.
1356 * Make ACK acceptable to originator of segment.
1357 * Don't bother to respond if destination was broadcast/multicast.
1358 */
1359 if ((tiflags & TH_RST) || m->m_flags & (M_BCAST|M_MCAST) ||
1360 IN_MULTICAST(ti->ti_dst.s_addr))
1361 goto drop;
1362 if (tiflags & TH_ACK)
1363 (void)tcp_respond(tp, ti, m, (tcp_seq)0, ti->ti_ack, TH_RST);
1364 else {
1365 if (tiflags & TH_SYN)
1366 ti->ti_len++;
1367 (void)tcp_respond(tp, ti, m, ti->ti_seq+ti->ti_len, (tcp_seq)0,
1368 TH_RST|TH_ACK);
1369 }
1370 return;
1371
1372 drop:
1373 /*
1374 * Drop space held by incoming segment and return.
1375 */
1376 if (tp && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG))
1377 tcp_trace(TA_DROP, ostate, tp, &tcp_saveti, 0);
1378 m_freem(m);
1379 return;
1380 #ifndef TUBA_INCLUDE
1381 }
1382
1383 void
1384 tcp_dooptions(tp, cp, cnt, ti, oi)
1385 struct tcpcb *tp;
1386 u_char *cp;
1387 int cnt;
1388 struct tcpiphdr *ti;
1389 struct tcp_opt_info *oi;
1390 {
1391 u_int16_t mss;
1392 int opt, optlen;
1393
1394 for (; cnt > 0; cnt -= optlen, cp += optlen) {
1395 opt = cp[0];
1396 if (opt == TCPOPT_EOL)
1397 break;
1398 if (opt == TCPOPT_NOP)
1399 optlen = 1;
1400 else {
1401 optlen = cp[1];
1402 if (optlen <= 0)
1403 break;
1404 }
1405 switch (opt) {
1406
1407 default:
1408 continue;
1409
1410 case TCPOPT_MAXSEG:
1411 if (optlen != TCPOLEN_MAXSEG)
1412 continue;
1413 if (!(ti->ti_flags & TH_SYN))
1414 continue;
1415 bcopy(cp + 2, &mss, sizeof(mss));
1416 oi->maxseg = ntohs(mss);
1417 break;
1418
1419 case TCPOPT_WINDOW:
1420 if (optlen != TCPOLEN_WINDOW)
1421 continue;
1422 if (!(ti->ti_flags & TH_SYN))
1423 continue;
1424 tp->t_flags |= TF_RCVD_SCALE;
1425 tp->requested_s_scale = min(cp[2], TCP_MAX_WINSHIFT);
1426 break;
1427
1428 case TCPOPT_TIMESTAMP:
1429 if (optlen != TCPOLEN_TIMESTAMP)
1430 continue;
1431 oi->ts_present = 1;
1432 bcopy(cp + 2, &oi->ts_val, sizeof(oi->ts_val));
1433 NTOHL(oi->ts_val);
1434 bcopy(cp + 6, &oi->ts_ecr, sizeof(oi->ts_ecr));
1435 NTOHL(oi->ts_ecr);
1436
1437 /*
1438 * A timestamp received in a SYN makes
1439 * it ok to send timestamp requests and replies.
1440 */
1441 if (ti->ti_flags & TH_SYN) {
1442 tp->t_flags |= TF_RCVD_TSTMP;
1443 tp->ts_recent = oi->ts_val;
1444 tp->ts_recent_age = tcp_now;
1445 }
1446 break;
1447 }
1448 }
1449 }
1450
1451 /*
1452 * Pull out of band byte out of a segment so
1453 * it doesn't appear in the user's data queue.
1454 * It is still reflected in the segment length for
1455 * sequencing purposes.
1456 */
1457 void
1458 tcp_pulloutofband(so, ti, m)
1459 struct socket *so;
1460 struct tcpiphdr *ti;
1461 register struct mbuf *m;
1462 {
1463 int cnt = ti->ti_urp - 1;
1464
1465 while (cnt >= 0) {
1466 if (m->m_len > cnt) {
1467 char *cp = mtod(m, caddr_t) + cnt;
1468 struct tcpcb *tp = sototcpcb(so);
1469
1470 tp->t_iobc = *cp;
1471 tp->t_oobflags |= TCPOOB_HAVEDATA;
1472 bcopy(cp+1, cp, (unsigned)(m->m_len - cnt - 1));
1473 m->m_len--;
1474 return;
1475 }
1476 cnt -= m->m_len;
1477 m = m->m_next;
1478 if (m == 0)
1479 break;
1480 }
1481 panic("tcp_pulloutofband");
1482 }
1483
1484 /*
1485 * Collect new round-trip time estimate
1486 * and update averages and current timeout.
1487 */
1488 void
1489 tcp_xmit_timer(tp, rtt)
1490 register struct tcpcb *tp;
1491 short rtt;
1492 {
1493 register short delta;
1494
1495 tcpstat.tcps_rttupdated++;
1496 --rtt;
1497 if (tp->t_srtt != 0) {
1498 /*
1499 * srtt is stored as fixed point with 3 bits after the
1500 * binary point (i.e., scaled by 8). The following magic
1501 * is equivalent to the smoothing algorithm in rfc793 with
1502 * an alpha of .875 (srtt = rtt/8 + srtt*7/8 in fixed
1503 * point). Adjust rtt to origin 0.
1504 */
1505 delta = (rtt << 2) - (tp->t_srtt >> TCP_RTT_SHIFT);
1506 if ((tp->t_srtt += delta) <= 0)
1507 tp->t_srtt = 1 << 2;
1508 /*
1509 * We accumulate a smoothed rtt variance (actually, a
1510 * smoothed mean difference), then set the retransmit
1511 * timer to smoothed rtt + 4 times the smoothed variance.
1512 * rttvar is stored as fixed point with 2 bits after the
1513 * binary point (scaled by 4). The following is
1514 * equivalent to rfc793 smoothing with an alpha of .75
1515 * (rttvar = rttvar*3/4 + |delta| / 4). This replaces
1516 * rfc793's wired-in beta.
1517 */
1518 if (delta < 0)
1519 delta = -delta;
1520 delta -= (tp->t_rttvar >> TCP_RTTVAR_SHIFT);
1521 if ((tp->t_rttvar += delta) <= 0)
1522 tp->t_rttvar = 1 << 2;
1523 } else {
1524 /*
1525 * No rtt measurement yet - use the unsmoothed rtt.
1526 * Set the variance to half the rtt (so our first
1527 * retransmit happens at 3*rtt).
1528 */
1529 tp->t_srtt = rtt << (TCP_RTT_SHIFT + 2);
1530 tp->t_rttvar = rtt << (TCP_RTTVAR_SHIFT + 2 - 1);
1531 }
1532 tp->t_rtt = 0;
1533 tp->t_rxtshift = 0;
1534
1535 /*
1536 * the retransmit should happen at rtt + 4 * rttvar.
1537 * Because of the way we do the smoothing, srtt and rttvar
1538 * will each average +1/2 tick of bias. When we compute
1539 * the retransmit timer, we want 1/2 tick of rounding and
1540 * 1 extra tick because of +-1/2 tick uncertainty in the
1541 * firing of the timer. The bias will give us exactly the
1542 * 1.5 tick we need. But, because the bias is
1543 * statistical, we have to test that we don't drop below
1544 * the minimum feasible timer (which is 2 ticks).
1545 */
1546 TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp),
1547 rtt + 2, TCPTV_REXMTMAX);
1548
1549 /*
1550 * We received an ack for a packet that wasn't retransmitted;
1551 * it is probably safe to discard any error indications we've
1552 * received recently. This isn't quite right, but close enough
1553 * for now (a route might have failed after we sent a segment,
1554 * and the return path might not be symmetrical).
1555 */
1556 tp->t_softerror = 0;
1557 }
1558
1559 /*
1560 * TCP compressed state engine. Currently used to hold compressed
1561 * state for SYN_RECEIVED.
1562 */
1563
1564 u_long syn_cache_count;
1565 u_int32_t syn_hash1, syn_hash2;
1566
1567 #define SYN_HASH(sa, sp, dp) \
1568 ((((sa)->s_addr^syn_hash1)*(((((u_int32_t)(dp))<<16) + \
1569 ((u_int32_t)(sp)))^syn_hash2)) \
1570 & 0x7fffffff)
1571
1572 #define eptosp(ep, e, s) ((struct s *)((char *)(ep) - \
1573 ((char *)(&((struct s *)0)->e) - (char *)0)))
1574
1575 #define SYN_CACHE_RM(sc, p, scp) { \
1576 *(p) = (sc)->sc_next; \
1577 if ((sc)->sc_next) \
1578 (sc)->sc_next->sc_timer += (sc)->sc_timer; \
1579 else { \
1580 (scp)->sch_timer_sum -= (sc)->sc_timer; \
1581 if ((scp)->sch_timer_sum <= 0) \
1582 (scp)->sch_timer_sum = -1; \
1583 /* If need be, fix up the last pointer */ \
1584 if ((scp)->sch_first) \
1585 (scp)->sch_last = eptosp(p, sc_next, syn_cache); \
1586 } \
1587 (scp)->sch_length--; \
1588 syn_cache_count--; \
1589 }
1590
1591 void
1592 syn_cache_insert(sc, prevp, headp)
1593 struct syn_cache *sc;
1594 struct syn_cache ***prevp;
1595 struct syn_cache_head **headp;
1596 {
1597 struct syn_cache_head *scp, *scp2, *sce;
1598 struct syn_cache *sc2;
1599 static u_int timeo_val;
1600 int s;
1601
1602 /* Initialize the hash secrets when adding the first entry */
1603 if (syn_cache_count == 0) {
1604 struct timeval tv;
1605 microtime(&tv);
1606 syn_hash1 = random() ^ (u_long)≻
1607 syn_hash2 = random() ^ tv.tv_usec;
1608 }
1609
1610 sc->sc_hash = SYN_HASH(&sc->sc_src, sc->sc_sport, sc->sc_dport);
1611 sc->sc_next = NULL;
1612 scp = &tcp_syn_cache[sc->sc_hash % tcp_syn_cache_size];
1613 *headp = scp;
1614
1615 /*
1616 * Make sure that we don't overflow the per-bucket
1617 * limit or the total cache size limit.
1618 */
1619 s = splsoftnet();
1620 if (scp->sch_length >= tcp_syn_bucket_limit) {
1621 tcpstat.tcps_sc_bucketoverflow++;
1622 sc2 = scp->sch_first;
1623 scp->sch_first = sc2->sc_next;
1624 FREE(sc2, M_PCB);
1625 } else if (syn_cache_count >= tcp_syn_cache_limit) {
1626 tcpstat.tcps_sc_overflowed++;
1627 /*
1628 * The cache is full. Toss the first (i.e, oldest)
1629 * element in this bucket.
1630 */
1631 scp2 = scp;
1632 if (scp2->sch_first == NULL) {
1633 sce = &tcp_syn_cache[tcp_syn_cache_size];
1634 for (++scp2; scp2 != scp; scp2++) {
1635 if (scp2 >= sce)
1636 scp2 = &tcp_syn_cache[0];
1637 if (scp2->sch_first)
1638 break;
1639 }
1640 }
1641 sc2 = scp2->sch_first;
1642 if (sc2 == NULL) {
1643 FREE(sc, M_PCB);
1644 return;
1645 }
1646 if ((scp2->sch_first = sc2->sc_next) == NULL)
1647 scp2->sch_last = NULL;
1648 else
1649 sc2->sc_next->sc_timer += sc2->sc_timer;
1650 FREE(sc2, M_PCB);
1651 } else {
1652 scp->sch_length++;
1653 syn_cache_count++;
1654 }
1655 tcpstat.tcps_sc_added++;
1656
1657 /*
1658 * Put it into the bucket.
1659 */
1660 if (scp->sch_first == NULL)
1661 *prevp = &scp->sch_first;
1662 else {
1663 *prevp = &scp->sch_last->sc_next;
1664 tcpstat.tcps_sc_collisions++;
1665 }
1666 **prevp = sc;
1667 scp->sch_last = sc;
1668
1669 /*
1670 * If the timeout value has changed
1671 * 1) force it to fit in a u_char
1672 * 2) Run the timer routine to truncate all
1673 * existing entries to the new timeout value.
1674 */
1675 if (timeo_val != tcp_syn_cache_timeo) {
1676 tcp_syn_cache_timeo = min(tcp_syn_cache_timeo, UCHAR_MAX);
1677 if (timeo_val > tcp_syn_cache_timeo)
1678 syn_cache_timer(timeo_val - tcp_syn_cache_timeo);
1679 timeo_val = tcp_syn_cache_timeo;
1680 }
1681 if (scp->sch_timer_sum > 0)
1682 sc->sc_timer = tcp_syn_cache_timeo - scp->sch_timer_sum;
1683 else if (scp->sch_timer_sum == 0) {
1684 /* When the bucket timer is 0, it is not in the cache queue. */
1685 scp->sch_headq = tcp_syn_cache_first;
1686 tcp_syn_cache_first = scp;
1687 sc->sc_timer = tcp_syn_cache_timeo;
1688 }
1689 scp->sch_timer_sum = tcp_syn_cache_timeo;
1690 splx(s);
1691 }
1692
1693 /*
1694 * Walk down the cache list, decrementing the timer of
1695 * the first element on each entry. If the timer goes
1696 * to zero, remove it and all successive entries with
1697 * a zero timer.
1698 */
1699 void
1700 syn_cache_timer(interval)
1701 int interval;
1702 {
1703 struct syn_cache_head *scp, **pscp;
1704 struct syn_cache *sc, *scn;
1705 int n, s;
1706
1707 pscp = &tcp_syn_cache_first;
1708 scp = tcp_syn_cache_first;
1709 s = splsoftnet();
1710 while (scp) {
1711 /*
1712 * Remove any empty hash buckets
1713 * from the cache queue.
1714 */
1715 if ((sc = scp->sch_first) == NULL) {
1716 *pscp = scp->sch_headq;
1717 scp->sch_headq = NULL;
1718 scp->sch_timer_sum = 0;
1719 scp->sch_first = scp->sch_last = NULL;
1720 scp->sch_length = 0;
1721 scp = *pscp;
1722 continue;
1723 }
1724
1725 scp->sch_timer_sum -= interval;
1726 if (scp->sch_timer_sum <= 0)
1727 scp->sch_timer_sum = -1;
1728 n = interval;
1729 while (sc->sc_timer <= n) {
1730 n -= sc->sc_timer;
1731 scn = sc->sc_next;
1732 tcpstat.tcps_sc_timed_out++;
1733 syn_cache_count--;
1734 FREE(sc, M_PCB);
1735 scp->sch_length--;
1736 if ((sc = scn) == NULL)
1737 break;
1738 }
1739 if ((scp->sch_first = sc) != NULL) {
1740 sc->sc_timer -= n;
1741 pscp = &scp->sch_headq;
1742 scp = scp->sch_headq;
1743 }
1744 }
1745 splx(s);
1746 }
1747
1748 /*
1749 * Find an entry in the syn cache.
1750 */
1751 struct syn_cache *
1752 syn_cache_lookup(ti, prevp, headp)
1753 struct tcpiphdr *ti;
1754 struct syn_cache ***prevp;
1755 struct syn_cache_head **headp;
1756 {
1757 struct syn_cache *sc, **prev;
1758 struct syn_cache_head *head;
1759 u_int32_t hash;
1760 int s;
1761
1762 hash = SYN_HASH(&ti->ti_src, ti->ti_sport, ti->ti_dport);
1763
1764 head = &tcp_syn_cache[hash % tcp_syn_cache_size];
1765 *headp = head;
1766 prev = &head->sch_first;
1767 s = splsoftnet();
1768 for (sc = head->sch_first; sc; prev = &sc->sc_next, sc = sc->sc_next) {
1769 if (sc->sc_hash != hash)
1770 continue;
1771 if (sc->sc_src.s_addr == ti->ti_src.s_addr &&
1772 sc->sc_sport == ti->ti_sport &&
1773 sc->sc_dport == ti->ti_dport &&
1774 sc->sc_dst.s_addr == ti->ti_dst.s_addr) {
1775 *prevp = prev;
1776 splx(s);
1777 return (sc);
1778 }
1779 }
1780 splx(s);
1781 return (NULL);
1782 }
1783
1784 /*
1785 * This function gets called when we receive an ACK for a
1786 * socket in the LISTEN state. We look up the connection
1787 * in the syn cache, and if its there, we pull it out of
1788 * the cache and turn it into a full-blown connection in
1789 * the SYN-RECEIVED state.
1790 *
1791 * The return values may not be immediately obvious, and their effects
1792 * can be subtle, so here they are:
1793 *
1794 * NULL SYN was not found in cache; caller should drop the
1795 * packet and send an RST.
1796 *
1797 * -1 We were unable to create the new connection, and are
1798 * aborting it. An ACK,RST is being sent to the peer
1799 * (unless we got screwey sequence numbners; see below),
1800 * because the 3-way handshake has been completed. Caller
1801 * should not free the mbuf, since we may be using it. If
1802 * we are not, we will free it.
1803 *
1804 * Otherwise, the return value is a pointer to the new socket
1805 * associated with the connection.
1806 */
1807 struct socket *
1808 syn_cache_get(so, m)
1809 struct socket *so;
1810 struct mbuf *m;
1811 {
1812 struct syn_cache *sc, **sc_prev;
1813 struct syn_cache_head *head;
1814 register struct inpcb *inp;
1815 register struct tcpcb *tp = 0;
1816 register struct tcpiphdr *ti;
1817 struct sockaddr_in *sin;
1818 struct mbuf *am;
1819 long win;
1820 int s;
1821
1822 ti = mtod(m, struct tcpiphdr *);
1823 s = splsoftnet();
1824 if ((sc = syn_cache_lookup(ti, &sc_prev, &head)) == NULL) {
1825 splx(s);
1826 return (NULL);
1827 }
1828
1829 win = sbspace(&so->so_rcv);
1830 if (win > TCP_MAXWIN)
1831 win = TCP_MAXWIN;
1832
1833 /*
1834 * Verify the sequence and ack numbers.
1835 */
1836 if ((ti->ti_ack != sc->sc_iss + 1) ||
1837 SEQ_LEQ(ti->ti_seq, sc->sc_irs) ||
1838 SEQ_GT(ti->ti_seq, sc->sc_irs + 1 + win)) {
1839 (void) syn_cache_respond(sc, m, ti, win, 0);
1840 splx(s);
1841 return ((struct socket *)(-1));
1842 }
1843
1844 /* Remove this cache entry */
1845 SYN_CACHE_RM(sc, sc_prev, head);
1846 splx(s);
1847
1848 /*
1849 * Ok, create the full blown connection, and set things up
1850 * as they would have been set up if we had created the
1851 * connection when the SYN arrived. If we can't create
1852 * the connection, abort it.
1853 */
1854 so = sonewconn(so, SS_ISCONNECTED);
1855 if (so == NULL)
1856 goto resetandabort;
1857
1858 inp = sotoinpcb(so);
1859 inp->inp_laddr = sc->sc_dst;
1860 inp->inp_lport = sc->sc_dport;
1861 in_pcbstate(inp, INP_BOUND);
1862 #if BSD>=43
1863 inp->inp_options = ip_srcroute();
1864 #endif
1865
1866 am = m_get(M_DONTWAIT, MT_SONAME); /* XXX */
1867 if (am == NULL) {
1868 m_freem(m);
1869 goto resetandabort;
1870 }
1871 am->m_len = sizeof(struct sockaddr_in);
1872 sin = mtod(am, struct sockaddr_in *);
1873 sin->sin_family = AF_INET;
1874 sin->sin_len = sizeof(*sin);
1875 sin->sin_addr = sc->sc_src;
1876 sin->sin_port = sc->sc_sport;
1877 bzero((caddr_t)sin->sin_zero, sizeof(sin->sin_zero));
1878 if (in_pcbconnect(inp, am)) {
1879 (void) m_free(am);
1880 m_freem(m);
1881 goto resetandabort;
1882 }
1883 (void) m_free(am);
1884
1885 tp = intotcpcb(inp);
1886 if (sc->sc_request_r_scale != 15) {
1887 tp->requested_s_scale = sc->sc_requested_s_scale;
1888 tp->request_r_scale = sc->sc_request_r_scale;
1889 tp->snd_scale = sc->sc_requested_s_scale;
1890 tp->rcv_scale = sc->sc_request_r_scale;
1891 tp->t_flags |= TF_RCVD_SCALE;
1892 }
1893 if (sc->sc_tstmp)
1894 tp->t_flags |= TF_RCVD_TSTMP;
1895
1896 tp->t_template = tcp_template(tp);
1897 if (tp->t_template == 0) {
1898 tp = tcp_drop(tp, ENOBUFS); /* destroys socket */
1899 so = NULL;
1900 m_freem(m);
1901 goto abort;
1902 }
1903
1904 tp->iss = sc->sc_iss;
1905 tp->irs = sc->sc_irs;
1906 tcp_sendseqinit(tp);
1907 tcp_rcvseqinit(tp);
1908 tp->t_state = TCPS_SYN_RECEIVED;
1909 tp->t_timer[TCPT_KEEP] = TCPTV_KEEP_INIT;
1910 tcpstat.tcps_accepts++;
1911
1912 /* Initialize tp->t_ourmss before we deal with the peer's! */
1913 tp->t_ourmss = sc->sc_ourmaxseg;
1914 tcp_mss_from_peer(tp, sc->sc_peermaxseg);
1915 tcp_rmx_rtt(tp);
1916 tp->snd_wl1 = sc->sc_irs;
1917 tp->rcv_up = sc->sc_irs + 1;
1918
1919 /*
1920 * This is what whould have happened in tcp_ouput() when
1921 * the SYN,ACK was sent.
1922 */
1923 tp->snd_up = tp->snd_una;
1924 tp->snd_max = tp->snd_nxt = tp->iss+1;
1925 tp->t_timer[TCPT_REXMT] = tp->t_rxtcur;
1926 if (win > 0 && SEQ_GT(tp->rcv_nxt+win, tp->rcv_adv))
1927 tp->rcv_adv = tp->rcv_nxt + win;
1928 tp->last_ack_sent = tp->rcv_nxt;
1929
1930 tcpstat.tcps_sc_completed++;
1931 FREE(sc, M_PCB);
1932 return (so);
1933
1934 resetandabort:
1935 (void) tcp_respond(NULL, ti, m, ti->ti_seq+ti->ti_len,
1936 (tcp_seq)0, TH_RST|TH_ACK);
1937 abort:
1938 if (so != NULL)
1939 (void) soabort(so);
1940 FREE(sc, M_PCB);
1941 tcpstat.tcps_sc_aborted++;
1942 return ((struct socket *)(-1));
1943 }
1944
1945 /*
1946 * This function is called when we get a RST for a
1947 * non-existant connection, so that we can see if the
1948 * connection is in the syn cache. If it is, zap it.
1949 */
1950
1951 void
1952 syn_cache_reset(ti)
1953 register struct tcpiphdr *ti;
1954 {
1955 struct syn_cache *sc, **sc_prev;
1956 struct syn_cache_head *head;
1957 int s = splsoftnet();
1958
1959 if ((sc = syn_cache_lookup(ti, &sc_prev, &head)) == NULL) {
1960 splx(s);
1961 return;
1962 }
1963 if (SEQ_LT(ti->ti_seq,sc->sc_irs) ||
1964 SEQ_GT(ti->ti_seq, sc->sc_irs+1)) {
1965 splx(s);
1966 return;
1967 }
1968 SYN_CACHE_RM(sc, sc_prev, head);
1969 splx(s);
1970 tcpstat.tcps_sc_reset++;
1971 FREE(sc, M_PCB);
1972 }
1973
1974 void
1975 syn_cache_unreach(ip, th)
1976 struct ip *ip;
1977 struct tcphdr *th;
1978 {
1979 struct syn_cache *sc, **sc_prev;
1980 struct syn_cache_head *head;
1981 struct tcpiphdr ti2;
1982 int s;
1983
1984 ti2.ti_src.s_addr = ip->ip_dst.s_addr;
1985 ti2.ti_dst.s_addr = ip->ip_src.s_addr;
1986 ti2.ti_sport = th->th_dport;
1987 ti2.ti_dport = th->th_sport;
1988
1989 s = splsoftnet();
1990 if ((sc = syn_cache_lookup(&ti2, &sc_prev, &head)) == NULL) {
1991 splx(s);
1992 return;
1993 }
1994 /* If the sequence number != sc_iss, then it's a bogus ICMP msg */
1995 if (ntohl (th->th_seq) != sc->sc_iss) {
1996 splx(s);
1997 return;
1998 }
1999 SYN_CACHE_RM(sc, sc_prev, head);
2000 splx(s);
2001 tcpstat.tcps_sc_unreach++;
2002 FREE(sc, M_PCB);
2003 }
2004
2005 /*
2006 * Given a LISTEN socket and an inbound SYN request, add
2007 * this to the syn cache, and send back a segment:
2008 * <SEQ=ISS><ACK=RCV_NXT><CTL=SYN,ACK>
2009 * to the source.
2010 *
2011 * XXX We don't properly handle SYN-with-data!
2012 */
2013
2014 int
2015 syn_cache_add(so, m, optp, optlen, oi)
2016 struct socket *so;
2017 struct mbuf *m;
2018 u_char *optp;
2019 int optlen;
2020 struct tcp_opt_info *oi;
2021 {
2022 register struct tcpiphdr *ti;
2023 struct tcpcb tb, *tp;
2024 long win;
2025 struct syn_cache *sc, **sc_prev;
2026 struct syn_cache_head *scp;
2027 extern int tcp_do_rfc1323;
2028
2029 tp = sototcpcb(so);
2030 ti = mtod(m, struct tcpiphdr *);
2031
2032 /*
2033 * RFC1122 4.2.3.10, p. 104: discard bcast/mcast SYN
2034 * in_broadcast() should never return true on a received
2035 * packet with M_BCAST not set.
2036 */
2037 if (m->m_flags & (M_BCAST|M_MCAST) ||
2038 IN_MULTICAST(ti->ti_src.s_addr) ||
2039 IN_MULTICAST(ti->ti_dst.s_addr))
2040 return (0);
2041
2042 /*
2043 * Initialize some local state.
2044 */
2045 win = sbspace(&so->so_rcv);
2046 if (win > TCP_MAXWIN)
2047 win = TCP_MAXWIN;
2048
2049 if (optp) {
2050 tb.t_flags = tcp_do_rfc1323 ? (TF_REQ_SCALE|TF_REQ_TSTMP) : 0;
2051 tcp_dooptions(&tb, optp, optlen, ti, oi);
2052 } else
2053 tb.t_flags = 0;
2054
2055 /*
2056 * See if we already have an entry for this connection.
2057 */
2058 if ((sc = syn_cache_lookup(ti, &sc_prev, &scp)) != NULL) {
2059 tcpstat.tcps_sc_dupesyn++;
2060 if (syn_cache_respond(sc, m, ti, win, tb.ts_recent) == 0) {
2061 tcpstat.tcps_sndacks++;
2062 tcpstat.tcps_sndtotal++;
2063 }
2064 return (1);
2065 }
2066
2067 MALLOC(sc, struct syn_cache *, sizeof(*sc), M_PCB, M_NOWAIT);
2068 if (sc == NULL)
2069 return (0);
2070 /*
2071 * Fill in the cache, and put the necessary TCP
2072 * options into the reply.
2073 */
2074 sc->sc_src.s_addr = ti->ti_src.s_addr;
2075 sc->sc_dst.s_addr = ti->ti_dst.s_addr;
2076 sc->sc_sport = ti->ti_sport;
2077 sc->sc_dport = ti->ti_dport;
2078 sc->sc_irs = ti->ti_seq;
2079 sc->sc_iss = tcp_new_iss(sc, sizeof(struct syn_cache), 0);
2080 sc->sc_peermaxseg = oi->maxseg;
2081 sc->sc_ourmaxseg = tcp_mss_to_advertise(tp);
2082 sc->sc_tstmp = (tcp_do_rfc1323 && (tb.t_flags & TF_RCVD_TSTMP)) ? 1 : 0;
2083 if ((tb.t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
2084 (TF_RCVD_SCALE|TF_REQ_SCALE)) {
2085 sc->sc_requested_s_scale = tb.requested_s_scale;
2086 sc->sc_request_r_scale = 0;
2087 while (sc->sc_request_r_scale < TCP_MAX_WINSHIFT &&
2088 TCP_MAXWIN << sc->sc_request_r_scale <
2089 so->so_rcv.sb_hiwat)
2090 sc->sc_request_r_scale++;
2091 } else {
2092 sc->sc_requested_s_scale = 15;
2093 sc->sc_request_r_scale = 15;
2094 }
2095 if (syn_cache_respond(sc, m, ti, win, tb.ts_recent) == 0) {
2096 syn_cache_insert(sc, &sc_prev, &scp);
2097 tcpstat.tcps_sndacks++;
2098 tcpstat.tcps_sndtotal++;
2099 } else {
2100 FREE(sc, M_PCB);
2101 tcpstat.tcps_sc_dropped++;
2102 }
2103 return (1);
2104 }
2105
2106 int
2107 syn_cache_respond(sc, m, ti, win, ts)
2108 struct syn_cache *sc;
2109 struct mbuf *m;
2110 register struct tcpiphdr *ti;
2111 long win;
2112 u_long ts;
2113 {
2114 u_int8_t *optp;
2115 int optlen;
2116
2117 /*
2118 * Tack on the TCP options. If there isn't enough trailing
2119 * space for them, move up the fixed header to make space.
2120 */
2121 optlen = 4 + (sc->sc_request_r_scale != 15 ? 4 : 0) +
2122 (sc->sc_tstmp ? TCPOLEN_TSTAMP_APPA : 0);
2123 if (optlen > M_TRAILINGSPACE(m)) {
2124 if (M_LEADINGSPACE(m) >= optlen) {
2125 m->m_data -= optlen;
2126 m->m_len += optlen;
2127 } else {
2128 struct mbuf *m0 = m;
2129 if ((m = m_gethdr(M_DONTWAIT, MT_HEADER)) == NULL) {
2130 m_freem(m0);
2131 return (ENOBUFS);
2132 }
2133 MH_ALIGN(m, sizeof(*ti) + optlen);
2134 m->m_next = m0; /* this gets freed below */
2135 }
2136 ovbcopy((caddr_t)ti, mtod(m, caddr_t), sizeof(*ti));
2137 ti = mtod(m, struct tcpiphdr *);
2138 }
2139
2140 optp = (u_int8_t *)(ti + 1);
2141 optp[0] = TCPOPT_MAXSEG;
2142 optp[1] = 4;
2143 optp[2] = (sc->sc_ourmaxseg >> 8) & 0xff;
2144 optp[3] = sc->sc_ourmaxseg & 0xff;
2145 optlen = 4;
2146
2147 if (sc->sc_request_r_scale != 15) {
2148 *((u_int32_t *)(optp + optlen)) = htonl(TCPOPT_NOP << 24 |
2149 TCPOPT_WINDOW << 16 | TCPOLEN_WINDOW << 8 |
2150 sc->sc_request_r_scale);
2151 optlen += 4;
2152 }
2153
2154 if (sc->sc_tstmp) {
2155 u_int32_t *lp = (u_int32_t *)(optp + optlen);
2156 /* Form timestamp option as shown in appendix A of RFC 1323. */
2157 *lp++ = htonl(TCPOPT_TSTAMP_HDR);
2158 *lp++ = htonl(tcp_now);
2159 *lp = htonl(ts);
2160 optlen += TCPOLEN_TSTAMP_APPA;
2161 }
2162
2163 /*
2164 * Toss any trailing mbufs. No need to worry about
2165 * m_len and m_pkthdr.len, since tcp_respond() will
2166 * unconditionally set them.
2167 */
2168 if (m->m_next) {
2169 m_freem(m->m_next);
2170 m->m_next = NULL;
2171 }
2172
2173 /*
2174 * Fill in the fields that tcp_respond() will not touch, and
2175 * then send the response.
2176 */
2177 ti->ti_off = (sizeof(struct tcphdr) + optlen) >> 2;
2178 ti->ti_win = htons(win);
2179 return (tcp_respond(NULL, ti, m, sc->sc_irs + 1, sc->sc_iss,
2180 TH_SYN|TH_ACK));
2181 }
2182 #endif /* TUBA_INCLUDE */
2183