tcp_subr.c revision 1.51 1 /* $NetBSD: tcp_subr.c,v 1.51 1998/05/06 01:21:21 thorpej Exp $ */
2
3 /*-
4 * Copyright (c) 1997, 1998 The NetBSD Foundation, Inc.
5 * All rights reserved.
6 *
7 * This code is derived from software contributed to The NetBSD Foundation
8 * by Jason R. Thorpe and Kevin M. Lahey of the Numerical Aerospace Simulation
9 * Facility, NASA Ames Research Center.
10 *
11 * Redistribution and use in source and binary forms, with or without
12 * modification, are permitted provided that the following conditions
13 * are met:
14 * 1. Redistributions of source code must retain the above copyright
15 * notice, this list of conditions and the following disclaimer.
16 * 2. Redistributions in binary form must reproduce the above copyright
17 * notice, this list of conditions and the following disclaimer in the
18 * documentation and/or other materials provided with the distribution.
19 * 3. All advertising materials mentioning features or use of this software
20 * must display the following acknowledgement:
21 * This product includes software developed by the NetBSD
22 * Foundation, Inc. and its contributors.
23 * 4. Neither the name of The NetBSD Foundation nor the names of its
24 * contributors may be used to endorse or promote products derived
25 * from this software without specific prior written permission.
26 *
27 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
28 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
29 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
30 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
31 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37 * POSSIBILITY OF SUCH DAMAGE.
38 */
39
40 /*
41 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995
42 * The Regents of the University of California. All rights reserved.
43 *
44 * Redistribution and use in source and binary forms, with or without
45 * modification, are permitted provided that the following conditions
46 * are met:
47 * 1. Redistributions of source code must retain the above copyright
48 * notice, this list of conditions and the following disclaimer.
49 * 2. Redistributions in binary form must reproduce the above copyright
50 * notice, this list of conditions and the following disclaimer in the
51 * documentation and/or other materials provided with the distribution.
52 * 3. All advertising materials mentioning features or use of this software
53 * must display the following acknowledgement:
54 * This product includes software developed by the University of
55 * California, Berkeley and its contributors.
56 * 4. Neither the name of the University nor the names of its contributors
57 * may be used to endorse or promote products derived from this software
58 * without specific prior written permission.
59 *
60 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
61 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
62 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
63 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
64 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
65 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
66 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
67 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
68 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
69 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
70 * SUCH DAMAGE.
71 *
72 * @(#)tcp_subr.c 8.2 (Berkeley) 5/24/95
73 */
74
75 #include "opt_tcp_compat_42.h"
76 #include "rnd.h"
77
78 #include <sys/param.h>
79 #include <sys/proc.h>
80 #include <sys/systm.h>
81 #include <sys/malloc.h>
82 #include <sys/mbuf.h>
83 #include <sys/socket.h>
84 #include <sys/socketvar.h>
85 #include <sys/protosw.h>
86 #include <sys/errno.h>
87 #include <sys/kernel.h>
88 #if NRND > 0
89 #include <sys/rnd.h>
90 #endif
91
92 #include <net/route.h>
93 #include <net/if.h>
94
95 #include <netinet/in.h>
96 #include <netinet/in_systm.h>
97 #include <netinet/ip.h>
98 #include <netinet/in_pcb.h>
99 #include <netinet/ip_var.h>
100 #include <netinet/ip_icmp.h>
101 #include <netinet/tcp.h>
102 #include <netinet/tcp_fsm.h>
103 #include <netinet/tcp_seq.h>
104 #include <netinet/tcp_timer.h>
105 #include <netinet/tcp_var.h>
106 #include <netinet/tcpip.h>
107
/* Patchable/settable parameters for TCP. */
int 	tcp_mssdflt = TCP_MSS;		/* default MSS when none is negotiated */
int 	tcp_rttdflt = TCPTV_SRTTDFLT / PR_SLOWHZ; /* default RTT, in slow-timer ticks */
int	tcp_do_rfc1323 = 1;	/* window scaling / timestamps (obsolete) */
int	tcp_do_sack = 1;	/* selective acknowledgement; 1 = send but ignore
				 * received SACKs, 2 = full SACK (see
				 * tcp_newtcpcb() flag setup) */
int	tcp_do_win_scale = 1;	/* RFC1323 window scaling */
int	tcp_do_timestamps = 1;	/* RFC1323 timestamps */
int	tcp_ack_on_push = 0;	/* set to enable immediate ACK-on-PUSH */
int	tcp_init_win = 1;	/* initial window parameter passed to
				 * TCP_INITIAL_WINDOW() */
int	tcp_mss_ifmtu = 0;	/* nonzero: advertise MSS from the interface MTU
				 * only, instead of the max of all attached
				 * networks (see tcp_mss_to_advertise()) */
#ifdef TCP_COMPAT_42
int	tcp_compat_42 = 1;	/* enable 4.2BSD interoperability quirks */
#else
int	tcp_compat_42 = 0;
#endif

/* Size of the TCP PCB hash table; overridable at build time. */
#ifndef	TCBHASHSIZE
#define	TCBHASHSIZE	128
#endif
int	tcbhashsize = TCBHASHSIZE;

int	tcp_freeq __P((struct tcpcb *));
130
131 /*
132 * Tcp initialization
133 */
134 void
135 tcp_init()
136 {
137
138 in_pcbinit(&tcbtable, tcbhashsize, tcbhashsize);
139 LIST_INIT(&tcp_delacks);
140 if (max_protohdr < sizeof(struct tcpiphdr))
141 max_protohdr = sizeof(struct tcpiphdr);
142 if (max_linkhdr + sizeof(struct tcpiphdr) > MHLEN)
143 panic("tcp_init");
144 }
145
146 /*
147 * Create template to be used to send tcp packets on a connection.
148 * Call after host entry created, allocates an mbuf and fills
149 * in a skeletal tcp/ip header, minimizing the amount of work
150 * necessary when the connection is used.
151 */
152 struct tcpiphdr *
153 tcp_template(tp)
154 struct tcpcb *tp;
155 {
156 register struct inpcb *inp = tp->t_inpcb;
157 register struct tcpiphdr *n;
158
159 if ((n = tp->t_template) == 0) {
160 MALLOC(n, struct tcpiphdr *, sizeof (struct tcpiphdr),
161 M_MBUF, M_NOWAIT);
162 if (n == NULL)
163 return (0);
164 }
165 bzero(n->ti_x1, sizeof n->ti_x1);
166 n->ti_pr = IPPROTO_TCP;
167 n->ti_len = htons(sizeof (struct tcpiphdr) - sizeof (struct ip));
168 n->ti_src = inp->inp_laddr;
169 n->ti_dst = inp->inp_faddr;
170 n->ti_sport = inp->inp_lport;
171 n->ti_dport = inp->inp_fport;
172 n->ti_seq = 0;
173 n->ti_ack = 0;
174 n->ti_x2 = 0;
175 n->ti_off = 5;
176 n->ti_flags = 0;
177 n->ti_win = 0;
178 n->ti_sum = 0;
179 n->ti_urp = 0;
180 return (n);
181 }
182
/*
 * Send a single message to the TCP at address specified by
 * the given TCP/IP header.  If m == 0, then we make a copy
 * of the tcpiphdr at ti and send directly to the addressed host.
 * This is used to force keep alive messages out using the TCP
 * template for a connection tp->t_template.  If flags are given
 * then we send a message back to the TCP which originated the
 * segment ti, and discard the mbuf containing it and any other
 * attached mbufs.
 *
 * In any case the ack and sequence number of the transmitted
 * segment are as specified by the parameters.
 *
 * Returns 0 on success, ENOBUFS if no mbuf could be allocated,
 * or whatever ip_output() returns.
 */
int
tcp_respond(tp, ti, m, ack, seq, flags)
	struct tcpcb *tp;
	register struct tcpiphdr *ti;
	register struct mbuf *m;
	tcp_seq ack, seq;
	int flags;
{
	register int tlen;
	int win = 0;
	struct route *ro = 0;

	if (tp) {
		/* Advertise our current receive space; use the route
		 * cached in the connection's PCB. */
		win = sbspace(&tp->t_inpcb->inp_socket->so_rcv);
		ro = &tp->t_inpcb->inp_route;
	}
	if (m == 0) {
		/* Keepalive path: build a fresh segment from the template. */
		m = m_gethdr(M_DONTWAIT, MT_HEADER);
		if (m == NULL)
			return (ENOBUFS);

		/*
		 * When 4.2BSD compatibility is on, the keepalive carries
		 * one extra octet of data (tlen = 1) — presumably so
		 * ancient stacks will respond; TODO confirm against the
		 * RFC 1122 keep-alive discussion.
		 */
		if (tcp_compat_42)
			tlen = 1;
		else
			tlen = 0;

		m->m_data += max_linkhdr;
		*mtod(m, struct tcpiphdr *) = *ti;
		ti = mtod(m, struct tcpiphdr *);
		flags = TH_ACK;
	} else {
		/* Reply path: reuse the first mbuf of the offending
		 * segment in place; drop any chained data. */
		m_freem(m->m_next);
		m->m_next = 0;
		m->m_data = (caddr_t)ti;
		m->m_len = sizeof (struct tcpiphdr);
		tlen = 0;
		/* Send back to the originator: swap addresses and ports. */
#define xchg(a,b,type) { type t; t=a; a=b; b=t; }
		xchg(ti->ti_dst.s_addr, ti->ti_src.s_addr, u_int32_t);
		xchg(ti->ti_dport, ti->ti_sport, u_int16_t);
#undef xchg
	}
	/* The overlay area must be zero for the checksum computed below. */
	bzero(ti->ti_x1, sizeof ti->ti_x1);
	ti->ti_seq = htonl(seq);
	ti->ti_ack = htonl(ack);
	ti->ti_x2 = 0;
	if ((flags & TH_SYN) == 0) {
		if (tp)
			/* Apply the connection's receive window scale. */
			ti->ti_win = htons((u_int16_t) (win >> tp->rcv_scale));
		else
			ti->ti_win = htons((u_int16_t)win);
		ti->ti_off = sizeof (struct tcphdr) >> 2;
		tlen += sizeof (struct tcphdr);
	} else
		/* SYN replies keep the original header length (options). */
		tlen += ti->ti_off << 2;
	/* ti_len covers the TCP portion only; then account for the IP
	 * header in the mbuf/packet lengths. */
	ti->ti_len = htons((u_int16_t)tlen);
	tlen += sizeof (struct ip);
	m->m_len = tlen;
	m->m_pkthdr.len = tlen;
	m->m_pkthdr.rcvif = (struct ifnet *) 0;
	ti->ti_flags = flags;
	ti->ti_urp = 0;
	/* Checksum over the whole pseudo-header + TCP segment. */
	ti->ti_sum = 0;
	ti->ti_sum = in_cksum(m, tlen);
	((struct ip *)ti)->ip_len = tlen;
	((struct ip *)ti)->ip_ttl = ip_defttl;
	return ip_output(m, NULL, ro, 0, NULL);
}
263
264 /*
265 * Create a new TCP control block, making an
266 * empty reassembly queue and hooking it to the argument
267 * protocol control block.
268 */
269 struct tcpcb *
270 tcp_newtcpcb(inp)
271 struct inpcb *inp;
272 {
273 register struct tcpcb *tp;
274
275 tp = malloc(sizeof(*tp), M_PCB, M_NOWAIT);
276 if (tp == NULL)
277 return ((struct tcpcb *)0);
278 bzero((caddr_t)tp, sizeof(struct tcpcb));
279 LIST_INIT(&tp->segq);
280 LIST_INIT(&tp->timeq);
281 tp->t_peermss = tcp_mssdflt;
282 tp->t_ourmss = tcp_mssdflt;
283 tp->t_segsz = tcp_mssdflt;
284
285 tp->t_flags = 0;
286 if (tcp_do_rfc1323 && tcp_do_win_scale)
287 tp->t_flags |= TF_REQ_SCALE;
288 if (tcp_do_rfc1323 && tcp_do_timestamps)
289 tp->t_flags |= TF_REQ_TSTMP;
290 if (tcp_do_sack == 2)
291 tp->t_flags |= TF_WILL_SACK;
292 else if (tcp_do_sack == 1)
293 tp->t_flags |= TF_WILL_SACK|TF_IGNR_RXSACK;
294 tp->t_flags |= TF_CANT_TXSACK;
295 tp->t_inpcb = inp;
296 /*
297 * Init srtt to TCPTV_SRTTBASE (0), so we can tell that we have no
298 * rtt estimate. Set rttvar so that srtt + 2 * rttvar gives
299 * reasonable initial retransmit time.
300 */
301 tp->t_srtt = TCPTV_SRTTBASE;
302 tp->t_rttvar = tcp_rttdflt * PR_SLOWHZ << (TCP_RTTVAR_SHIFT + 2 - 1);
303 tp->t_rttmin = TCPTV_MIN;
304 TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp),
305 TCPTV_MIN, TCPTV_REXMTMAX);
306 tp->snd_cwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT;
307 tp->snd_ssthresh = TCP_MAXWIN << TCP_MAX_WINSHIFT;
308 inp->inp_ip.ip_ttl = ip_defttl;
309 inp->inp_ppcb = (caddr_t)tp;
310 return (tp);
311 }
312
313 /*
314 * Drop a TCP connection, reporting
315 * the specified error. If connection is synchronized,
316 * then send a RST to peer.
317 */
318 struct tcpcb *
319 tcp_drop(tp, errno)
320 register struct tcpcb *tp;
321 int errno;
322 {
323 struct socket *so = tp->t_inpcb->inp_socket;
324
325 if (TCPS_HAVERCVDSYN(tp->t_state)) {
326 tp->t_state = TCPS_CLOSED;
327 (void) tcp_output(tp);
328 tcpstat.tcps_drops++;
329 } else
330 tcpstat.tcps_conndrops++;
331 if (errno == ETIMEDOUT && tp->t_softerror)
332 errno = tp->t_softerror;
333 so->so_error = errno;
334 return (tcp_close(tp));
335 }
336
/*
 * Close a TCP control block:
 *	discard all space held by the tcp
 *	discard internet protocol block
 *	wake up any sleepers
 * Always returns NULL, so callers can write "tp = tcp_close(tp);".
 */
struct tcpcb *
tcp_close(tp)
	register struct tcpcb *tp;
{
	struct inpcb *inp = tp->t_inpcb;
	struct socket *so = inp->inp_socket;
#ifdef RTV_RTT
	register struct rtentry *rt;

	/*
	 * If we sent enough data to get some meaningful characteristics,
	 * save them in the routing entry.  'Enough' is arbitrarily
	 * defined as the sendpipesize (default 4K) * 16.  This would
	 * give us 16 rtt samples assuming we only get one sample per
	 * window (the usual case on a long haul net).  16 samples is
	 * enough for the srtt filter to converge to within 5% of the correct
	 * value; fewer samples and we could save a very bogus rtt.
	 *
	 * Don't update the default route's characteristics and don't
	 * update anything that the user "locked".
	 */
	if (SEQ_LT(tp->iss + so->so_snd.sb_hiwat * 16, tp->snd_max) &&
	    (rt = inp->inp_route.ro_rt) &&
	    !in_nullhost(satosin(rt_key(rt))->sin_addr)) {
		register u_long i = 0;

		if ((rt->rt_rmx.rmx_locks & RTV_RTT) == 0) {
			/* Convert srtt from its internal scaled ticks to
			 * route-table units (RTM_RTTUNIT per second). */
			i = tp->t_srtt *
			    ((RTM_RTTUNIT / PR_SLOWHZ) >> (TCP_RTT_SHIFT + 2));
			if (rt->rt_rmx.rmx_rtt && i)
				/*
				 * filter this update to half the old & half
				 * the new values, converting scale.
				 * See route.h and tcp_var.h for a
				 * description of the scaling constants.
				 */
				rt->rt_rmx.rmx_rtt =
				    (rt->rt_rmx.rmx_rtt + i) / 2;
			else
				rt->rt_rmx.rmx_rtt = i;
		}
		if ((rt->rt_rmx.rmx_locks & RTV_RTTVAR) == 0) {
			/* Same conversion/filtering for the rtt variance. */
			i = tp->t_rttvar *
			    ((RTM_RTTUNIT / PR_SLOWHZ) >> (TCP_RTTVAR_SHIFT + 2));
			if (rt->rt_rmx.rmx_rttvar && i)
				rt->rt_rmx.rmx_rttvar =
				    (rt->rt_rmx.rmx_rttvar + i) / 2;
			else
				rt->rt_rmx.rmx_rttvar = i;
		}
		/*
		 * update the pipelimit (ssthresh) if it has been updated
		 * already or if a pipesize was specified & the threshhold
		 * got below half the pipesize.  I.e., wait for bad news
		 * before we start updating, then update on both good
		 * and bad news.
		 *
		 * NOTE(review): if the first clause short-circuits before
		 * assigning i, the second compares against i left over
		 * from the rttvar block above (or its initial 0).
		 */
		if (((rt->rt_rmx.rmx_locks & RTV_SSTHRESH) == 0 &&
		    (i = tp->snd_ssthresh) && rt->rt_rmx.rmx_ssthresh) ||
		    i < (rt->rt_rmx.rmx_sendpipe / 2)) {
			/*
			 * convert the limit from user data bytes to
			 * packets then to packet data bytes.
			 */
			i = (i + tp->t_segsz / 2) / tp->t_segsz;
			if (i < 2)
				i = 2;
			i *= (u_long)(tp->t_segsz + sizeof (struct tcpiphdr));
			if (rt->rt_rmx.rmx_ssthresh)
				rt->rt_rmx.rmx_ssthresh =
				    (rt->rt_rmx.rmx_ssthresh + i) / 2;
			else
				rt->rt_rmx.rmx_ssthresh = i;
		}
	}
#endif /* RTV_RTT */
	/* free the reassembly queue, if any */
	(void) tcp_freeq(tp);
	/* Remove from the delayed-ACK list, if queued. */
	TCP_CLEAR_DELACK(tp);

	/* Release the header template and the control block itself. */
	if (tp->t_template)
		FREE(tp->t_template, M_MBUF);
	free(tp, M_PCB);
	inp->inp_ppcb = 0;
	soisdisconnected(so);
	in_pcbdetach(inp);
	tcpstat.tcps_closed++;
	return ((struct tcpcb *)0);
}
432
433 int
434 tcp_freeq(tp)
435 struct tcpcb *tp;
436 {
437 register struct ipqent *qe;
438 int rv = 0;
439 #ifdef TCPREASS_DEBUG
440 int i = 0;
441 #endif
442
443 while ((qe = tp->segq.lh_first) != NULL) {
444 #ifdef TCPREASS_DEBUG
445 printf("tcp_freeq[%p,%d]: %u:%u(%u) 0x%02x\n",
446 tp, i++, qe->ipqe_seq, qe->ipqe_seq + qe->ipqe_len,
447 qe->ipqe_len, qe->ipqe_flags & (TH_SYN|TH_FIN|TH_RST));
448 #endif
449 LIST_REMOVE(qe, ipqe_q);
450 LIST_REMOVE(qe, ipqe_timeq);
451 m_freem(qe->ipqe_m);
452 FREE(qe, M_IPQ);
453 rv = 1;
454 }
455 return (rv);
456 }
457
458 /*
459 * Protocol drain routine. Called when memory is in short supply.
460 */
461 void
462 tcp_drain()
463 {
464 register struct inpcb *inp;
465 register struct tcpcb *tp;
466
467 /*
468 * Free the sequence queue of all TCP connections.
469 */
470 inp = tcbtable.inpt_queue.cqh_first;
471 if (inp) /* XXX */
472 for (; inp != (struct inpcb *)&tcbtable.inpt_queue;
473 inp = inp->inp_queue.cqe_next) {
474 if ((tp = intotcpcb(inp)) != NULL) {
475 if (tcp_freeq(tp))
476 tcpstat.tcps_connsdrained++;
477 }
478 }
479 }
480
/*
 * Notify a tcp user of an asynchronous error;
 * store error as soft error, but wake up user
 * (for now, won't do anything until can select for soft error).
 */
void
tcp_notify(inp, error)
	struct inpcb *inp;
	int error;
{
	register struct tcpcb *tp = (struct tcpcb *)inp->inp_ppcb;
	register struct socket *so = inp->inp_socket;

	/*
	 * Ignore some errors if we are hooked up.
	 * If connection hasn't completed, has retransmitted several times,
	 * and receives a second error, give up now.  This is better
	 * than waiting a long time to establish a connection that
	 * can never complete.
	 */
	if (tp->t_state == TCPS_ESTABLISHED &&
	    (error == EHOSTUNREACH || error == ENETUNREACH ||
	    error == EHOSTDOWN)) {
		/* Established connection, transient reachability error:
		 * drop the report entirely. */
		return;
	} else if (TCPS_HAVEESTABLISHED(tp->t_state) == 0 &&
	    tp->t_rxtshift > 3 && tp->t_softerror)
		/* Embryonic connection with >3 retransmits and a prior
		 * soft error: make this error fatal to the socket. */
		so->so_error = error;
	else
		/* Otherwise just record it as a soft error. */
		tp->t_softerror = error;
	/* Wake anyone sleeping on the socket so they can notice. */
	wakeup((caddr_t) &so->so_timeo);
	sorwakeup(so);
	sowwakeup(so);
}
514
/*
 * Control input for TCP: dispatch a PRC_* command (typically derived
 * from an incoming ICMP message) to the affected PCB(s).  `sa' is the
 * far-end address; `v', when used, points at the offending IP header
 * with the TCP header following.  Always returns NULL.
 */
void *
tcp_ctlinput(cmd, sa, v)
	int cmd;
	struct sockaddr *sa;
	register void *v;
{
	register struct ip *ip = v;
	register struct tcphdr *th;
	extern int inetctlerrmap[];
	void (*notify) __P((struct inpcb *, int)) = tcp_notify;
	int errno;
	int nmatch;

	if ((unsigned)cmd >= PRC_NCMDS)
		return NULL;
	errno = inetctlerrmap[cmd];
	/*
	 * Pick the per-PCB action for this command.  Clearing `ip'
	 * selects the notify-all path below instead of the single-
	 * connection lookup.
	 */
	if (cmd == PRC_QUENCH)
		notify = tcp_quench;
	else if (PRC_IS_REDIRECT(cmd))
		notify = in_rtchange, ip = 0;
	else if (cmd == PRC_MSGSIZE && ip_mtudisc)
		notify = tcp_mtudisc, ip = 0;
	else if (cmd == PRC_HOSTDEAD)
		ip = 0;
	else if (errno == 0)
		/* Command maps to no error and has no special action. */
		return NULL;
	if (ip) {
		/* Locate the embedded TCP header to identify the one
		 * connection this message refers to. */
		th = (struct tcphdr *)((caddr_t)ip + (ip->ip_hl << 2));
		nmatch = in_pcbnotify(&tcbtable, satosin(sa)->sin_addr,
		    th->th_dport, ip->ip_src, th->th_sport, errno, notify);
		/* No full PCB matched: an embryonic connection in the
		 * SYN cache may still care about unreachable errors. */
		if (nmatch == 0 && syn_cache_count &&
		    (inetctlerrmap[cmd] == EHOSTUNREACH ||
		    inetctlerrmap[cmd] == ENETUNREACH ||
		    inetctlerrmap[cmd] == EHOSTDOWN))
			syn_cache_unreach(ip, th);
	} else
		(void)in_pcbnotifyall(&tcbtable, satosin(sa)->sin_addr, errno,
		    notify);
	return NULL;
}
555
556 /*
557 * When a source quench is received, close congestion window
558 * to one segment. We will gradually open it again as we proceed.
559 */
560 void
561 tcp_quench(inp, errno)
562 struct inpcb *inp;
563 int errno;
564 {
565 struct tcpcb *tp = intotcpcb(inp);
566
567 if (tp)
568 tp->snd_cwnd = TCP_INITIAL_WINDOW(1, tp->t_segsz);
569 }
570
571 /*
572 * On receipt of path MTU corrections, flush old route and replace it
573 * with the new one. Retransmit all unacknowledged packets, to ensure
574 * that all packets will be received.
575 */
576 void
577 tcp_mtudisc(inp, errno)
578 struct inpcb *inp;
579 int errno;
580 {
581 struct tcpcb *tp = intotcpcb(inp);
582 struct rtentry *rt = in_pcbrtentry(inp);
583
584 if (tp != 0) {
585 if (rt != 0) {
586 /*
587 * If this was not a host route, remove and realloc.
588 */
589 if ((rt->rt_flags & RTF_HOST) == 0) {
590 in_rtchange(inp, errno);
591 if ((rt = in_pcbrtentry(inp)) == 0)
592 return;
593 }
594
595 /*
596 * Slow start out of the error condition. We
597 * use the MTU because we know it's smaller
598 * than the previously transmitted segment.
599 */
600 if (rt->rt_rmx.rmx_mtu != 0)
601 tp->snd_cwnd =
602 TCP_INITIAL_WINDOW(tcp_init_win,
603 rt->rt_rmx.rmx_mtu);
604 }
605
606 /*
607 * Resend unacknowledged packets.
608 */
609 tp->snd_nxt = tp->snd_una;
610 tcp_output(tp);
611 }
612 }
613
614
615 /*
616 * Compute the MSS to advertise to the peer. Called only during
617 * the 3-way handshake. If we are the server (peer initiated
618 * connection), we are called with the TCPCB for the listen
619 * socket. If we are the client (we initiated connection), we
620 * are called witht he TCPCB for the actual connection.
621 */
622 u_long
623 tcp_mss_to_advertise(ifp)
624 const struct ifnet *ifp;
625 {
626 extern u_long in_maxmtu;
627 u_long mss = 0;
628
629 /*
630 * In order to avoid defeating path MTU discovery on the peer,
631 * we advertise the max MTU of all attached networks as our MSS,
632 * per RFC 1191, section 3.1.
633 *
634 * We provide the option to advertise just the MTU of
635 * the interface on which we hope this connection will
636 * be receiving. If we are responding to a SYN, we
637 * will have a pretty good idea about this, but when
638 * initiating a connection there is a bit more doubt.
639 *
640 * We also need to ensure that loopback has a large enough
641 * MSS, as the loopback MTU is never included in in_maxmtu.
642 */
643
644 if (ifp != NULL)
645 mss = ifp->if_mtu;
646
647 if (tcp_mss_ifmtu == 0)
648 mss = max(in_maxmtu, mss);
649
650 if (mss > sizeof(struct tcpiphdr))
651 mss -= sizeof(struct tcpiphdr);
652
653 mss = max(tcp_mssdflt, mss);
654 return (mss);
655 }
656
/*
 * Set connection variables based on the peer's advertised MSS.
 * We are passed the TCPCB for the actual connection.  If we
 * are the server, we are called by the compressed state engine
 * when the 3-way handshake is complete.  If we are the client,
 * we are called when we receive the SYN,ACK from the server.
 *
 * NOTE: Our advertised MSS value must be initialized in the TCPCB
 * before this routine is called!
 */
void
tcp_mss_from_peer(tp, offer)
	struct tcpcb *tp;
	int offer;
{
	struct inpcb *inp = tp->t_inpcb;
	struct socket *so = inp->inp_socket;
#if defined(RTV_SPIPE) || defined(RTV_SSTHRESH)
	struct rtentry *rt = in_pcbrtentry(inp);
#endif
	u_long bufsize;
	int mss;

	/*
	 * As per RFC1122, use the default MSS value, unless they
	 * sent us an offer.  Do not accept offers less than 32 bytes.
	 */
	mss = tcp_mssdflt;
	if (offer)
		mss = offer;
	mss = max(mss, 32);		/* sanity */
	/* Leave room for the TCP and IP options in use on this PCB. */
	mss -= (tcp_optlen(tp) + ip_optlen(tp->t_inpcb));

	/*
	 * If there's a pipesize, change the socket buffer to that size.
	 * Make the socket buffer an integral number of MSS units.  If
	 * the MSS is larger than the socket buffer, artificially decrease
	 * the MSS.
	 */
#ifdef RTV_SPIPE
	if (rt != NULL && rt->rt_rmx.rmx_sendpipe != 0)
		bufsize = rt->rt_rmx.rmx_sendpipe;
	else
#endif
		bufsize = so->so_snd.sb_hiwat;
	if (bufsize < mss)
		mss = bufsize;
	else {
		bufsize = roundup(bufsize, mss);
		if (bufsize > sb_max)
			bufsize = sb_max;
		/* NOTE(review): an sbreserve() failure is ignored here,
		 * as in the original; the old buffer size then remains. */
		(void) sbreserve(&so->so_snd, bufsize);
	}
	tp->t_peermss = mss;
	tp->t_segsz = mss;

#ifdef RTV_SSTHRESH
	if (rt != NULL && rt->rt_rmx.rmx_ssthresh) {
		/*
		 * There's some sort of gateway or interface buffer
		 * limit on the path.  Use this to set the slow
		 * start threshold, but set the threshold to no less
		 * than 2 * MSS.
		 */
		tp->snd_ssthresh = max(2 * mss, rt->rt_rmx.rmx_ssthresh);
	}
#endif
}
725
/*
 * Processing necessary when a TCP connection is established:
 * enter the ESTABLISHED state, arm the keepalive timer, and
 * size the receive buffer from the route's recvpipe metric
 * (rounded to an integral number of our MSS units).
 */
void
tcp_established(tp)
	struct tcpcb *tp;
{
	struct inpcb *inp = tp->t_inpcb;
	struct socket *so = inp->inp_socket;
#ifdef RTV_RPIPE
	struct rtentry *rt = in_pcbrtentry(inp);
#endif
	u_long bufsize;

	tp->t_state = TCPS_ESTABLISHED;
	/* Start the keepalive clock from the idle threshold. */
	TCP_TIMER_ARM(tp, TCPT_KEEP, tcp_keepidle);

#ifdef RTV_RPIPE
	if (rt != NULL && rt->rt_rmx.rmx_recvpipe != 0)
		bufsize = rt->rt_rmx.rmx_recvpipe;
	else
#endif
		bufsize = so->so_rcv.sb_hiwat;
	if (bufsize > tp->t_ourmss) {
		bufsize = roundup(bufsize, tp->t_ourmss);
		if (bufsize > sb_max)
			bufsize = sb_max;
		/* NOTE(review): sbreserve() failure is ignored, as in the
		 * original; the old receive buffer size then remains. */
		(void) sbreserve(&so->so_rcv, bufsize);
	}
}
756
/*
 * Check if there's an initial rtt or rttvar.  Convert from the
 * route-table units to scaled multiples of the slow timeout timer.
 * Called only during the 3-way handshake.
 */
void
tcp_rmx_rtt(tp)
	struct tcpcb *tp;
{
#ifdef RTV_RTT
	struct rtentry *rt;
	int rtt;

	if ((rt = in_pcbrtentry(tp->t_inpcb)) == NULL)
		return;

	/* Only seed the estimate when we have none of our own yet. */
	if (tp->t_srtt == 0 && (rtt = rt->rt_rmx.rmx_rtt)) {
		/*
		 * XXX The lock bit for MTU indicates that the value
		 * is also a minimum value; this is subject to time.
		 */
		if (rt->rt_rmx.rmx_locks & RTV_RTT)
			TCPT_RANGESET(tp->t_rttmin,
			    rtt / (RTM_RTTUNIT / PR_SLOWHZ),
			    TCPTV_MIN, TCPTV_REXMTMAX);
		/* Convert from route-table units (RTM_RTTUNIT per second)
		 * to the scaled fixed-point srtt representation. */
		tp->t_srtt = rtt /
		    ((RTM_RTTUNIT / PR_SLOWHZ) >> (TCP_RTT_SHIFT + 2));
		if (rt->rt_rmx.rmx_rttvar) {
			tp->t_rttvar = rt->rt_rmx.rmx_rttvar /
			    ((RTM_RTTUNIT / PR_SLOWHZ) >>
			    (TCP_RTTVAR_SHIFT + 2));
		} else {
			/* Default variation is +- 1 rtt */
			tp->t_rttvar =
			    tp->t_srtt >> (TCP_RTT_SHIFT - TCP_RTTVAR_SHIFT);
		}
		/* Derive the initial retransmit timeout from srtt and
		 * rttvar, clamped to [t_rttmin, TCPTV_REXMTMAX]. */
		TCPT_RANGESET(tp->t_rxtcur,
		    ((tp->t_srtt >> 2) + tp->t_rttvar) >> (1 + 2),
		    tp->t_rttmin, TCPTV_REXMTMAX);
	}
#endif
}
799
/* Monotonic component of the TCP initial sequence number; advanced by
 * TCP_ISSINCR on each ISS generation in tcp_new_iss(). */
tcp_seq  tcp_iss_seq = 0;
801
/*
 * Get a new initial sequence number given a tcp control block.
 *
 * tp/len:  optional buffer (and its length) mixed into the entropy
 *          pool when the kernel RNG is configured.
 * addin:   when nonzero, a known prior sequence value the result must
 *          stay a bounded random distance ahead of (e.g. reusing a
 *          connection still in TIME_WAIT).
 */
tcp_seq
tcp_new_iss(tp, len, addin)
	void *tp;
	u_long len;
	tcp_seq addin;
{
	tcp_seq tcp_iss;

	/*
	 * add randomness about this connection, but do not estimate
	 * entropy from the timing, since the physical device driver would
	 * have done that for us.
	 */
#if NRND > 0
	if (tp != NULL)
		rnd_add_data(NULL, tp, len, 0);
#endif

	/*
	 * randomize.
	 */
#if NRND > 0
	rnd_extract_data(&tcp_iss, sizeof(tcp_iss), RND_EXTRACT_ANY);
#else
	tcp_iss = random();
#endif

	/*
	 * If we were asked to add some amount to a known value,
	 * we will take a random value obtained above, mask off the upper
	 * bits, and add in the known value.  We also add in a constant to
	 * ensure that we are at least a certain distance from the original
	 * value.
	 *
	 * This is used when an old connection is in timed wait
	 * and we have a new one coming in, for instance.
	 */
	if (addin != 0) {
#ifdef TCPISS_DEBUG
		printf("Random %08x, ", tcp_iss);
#endif
		tcp_iss &= TCP_ISS_RANDOM_MASK;
		tcp_iss = tcp_iss + addin + TCP_ISSINCR;
		tcp_iss_seq += TCP_ISSINCR;
		tcp_iss += tcp_iss_seq;
#ifdef TCPISS_DEBUG
		printf("Old ISS %08x, ISS %08x\n", addin, tcp_iss);
#endif
	} else {
		/* No base value: bounded random plus the global
		 * monotonically increasing component. */
		tcp_iss &= TCP_ISS_RANDOM_MASK;
		tcp_iss_seq += TCP_ISSINCR;
		tcp_iss += tcp_iss_seq;
#ifdef TCPISS_DEBUG
		printf("ISS %08x\n", tcp_iss);
#endif
	}

	if (tcp_compat_42) {
		/*
		 * Limit it to the positive range for really old TCP
		 * implementations.
		 */
		if ((int)tcp_iss < 0)
			tcp_iss &= 0x7fffffff;		/* XXX */
	}

	return tcp_iss;
}
873
874
875 /*
876 * Determine the length of the TCP options for this connection.
877 *
878 * XXX: What do we do for SACK, when we add that? Just reserve
879 * all of the space? Otherwise we can't exactly be incrementing
880 * cwnd by an amount that varies depending on the amount we last
881 * had to SACK!
882 */
883
884 u_int
885 tcp_optlen(tp)
886 struct tcpcb *tp;
887 {
888 if ((tp->t_flags & (TF_REQ_TSTMP|TF_RCVD_TSTMP|TF_NOOPT)) ==
889 (TF_REQ_TSTMP | TF_RCVD_TSTMP))
890 return TCPOLEN_TSTAMP_APPA;
891 else
892 return 0;
893 }
894