tcp_output.c revision 1.61 1 /* $NetBSD: tcp_output.c,v 1.61 2000/10/19 20:23:00 itojun Exp $ */
2
3 /*
4 %%% portions-copyright-nrl-95
5 Portions of this software are Copyright 1995-1998 by Randall Atkinson,
6 Ronald Lee, Daniel McDonald, Bao Phan, and Chris Winters. All Rights
7 Reserved. All rights under this copyright have been assigned to the US
8 Naval Research Laboratory (NRL). The NRL Copyright Notice and License
9 Agreement Version 1.1 (January 17, 1995) applies to these portions of the
10 software.
11 You should have received a copy of the license with this software. If you
12 didn't get a copy, you may request one from <license (at) ipv6.nrl.navy.mil>.
13
14 */
15
16 /*
17 * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
18 * All rights reserved.
19 *
20 * Redistribution and use in source and binary forms, with or without
21 * modification, are permitted provided that the following conditions
22 * are met:
23 * 1. Redistributions of source code must retain the above copyright
24 * notice, this list of conditions and the following disclaimer.
25 * 2. Redistributions in binary form must reproduce the above copyright
26 * notice, this list of conditions and the following disclaimer in the
27 * documentation and/or other materials provided with the distribution.
28 * 3. Neither the name of the project nor the names of its contributors
29 * may be used to endorse or promote products derived from this software
30 * without specific prior written permission.
31 *
32 * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
33 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
34 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
35 * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
36 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
37 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
38 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
39 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
40 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
41 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
42 * SUCH DAMAGE.
43 */
44
45 /*-
46 * Copyright (c) 1997, 1998 The NetBSD Foundation, Inc.
47 * All rights reserved.
48 *
49 * This code is derived from software contributed to The NetBSD Foundation
50 * by Jason R. Thorpe and Kevin M. Lahey of the Numerical Aerospace Simulation
51 * Facility, NASA Ames Research Center.
52 *
53 * Redistribution and use in source and binary forms, with or without
54 * modification, are permitted provided that the following conditions
55 * are met:
56 * 1. Redistributions of source code must retain the above copyright
57 * notice, this list of conditions and the following disclaimer.
58 * 2. Redistributions in binary form must reproduce the above copyright
59 * notice, this list of conditions and the following disclaimer in the
60 * documentation and/or other materials provided with the distribution.
61 * 3. All advertising materials mentioning features or use of this software
62 * must display the following acknowledgement:
63 * This product includes software developed by the NetBSD
64 * Foundation, Inc. and its contributors.
65 * 4. Neither the name of The NetBSD Foundation nor the names of its
66 * contributors may be used to endorse or promote products derived
67 * from this software without specific prior written permission.
68 *
69 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
70 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
71 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
72 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
73 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
74 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
75 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
76 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
77 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
78 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
79 * POSSIBILITY OF SUCH DAMAGE.
80 */
81
82 /*
83 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995
84 * The Regents of the University of California. All rights reserved.
85 *
86 * Redistribution and use in source and binary forms, with or without
87 * modification, are permitted provided that the following conditions
88 * are met:
89 * 1. Redistributions of source code must retain the above copyright
90 * notice, this list of conditions and the following disclaimer.
91 * 2. Redistributions in binary form must reproduce the above copyright
92 * notice, this list of conditions and the following disclaimer in the
93 * documentation and/or other materials provided with the distribution.
94 * 3. All advertising materials mentioning features or use of this software
95 * must display the following acknowledgement:
96 * This product includes software developed by the University of
97 * California, Berkeley and its contributors.
98 * 4. Neither the name of the University nor the names of its contributors
99 * may be used to endorse or promote products derived from this software
100 * without specific prior written permission.
101 *
102 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
103 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
104 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
105 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
106 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
107 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
108 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
109 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
110 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
111 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
112 * SUCH DAMAGE.
113 *
114 * @(#)tcp_output.c 8.4 (Berkeley) 5/24/95
115 */
116
117 #include "opt_inet.h"
118 #include "opt_ipsec.h"
119
120 #include <sys/param.h>
121 #include <sys/systm.h>
122 #include <sys/malloc.h>
123 #include <sys/mbuf.h>
124 #include <sys/protosw.h>
125 #include <sys/socket.h>
126 #include <sys/socketvar.h>
127 #include <sys/errno.h>
128 #include <sys/domain.h>
129
130 #include <net/if.h>
131 #include <net/route.h>
132
133 #include <netinet/in.h>
134 #include <netinet/in_systm.h>
135 #include <netinet/ip.h>
136 #include <netinet/in_pcb.h>
137 #include <netinet/ip_var.h>
138
139 #ifdef INET6
140 #ifndef INET
141 #include <netinet/in.h>
142 #endif
143 #include <netinet/ip6.h>
144 #include <netinet6/in6_pcb.h>
145 #include <netinet6/ip6_var.h>
146 #endif
147
148 #include <netinet/tcp.h>
149 #define TCPOUTFLAGS
150 #include <netinet/tcp_fsm.h>
151 #include <netinet/tcp_seq.h>
152 #include <netinet/tcp_timer.h>
153 #include <netinet/tcp_var.h>
154 #include <netinet/tcpip.h>
155 #include <netinet/tcp_debug.h>
156
157 #ifdef notyet
158 extern struct mbuf *m_copypack();
159 #endif
160
161 #define MAX_TCPOPTLEN 32 /* max # bytes that go in options */
162
/*
 * Congestion Window Monitoring (CWM) tunables.
 *
 *	tcp_cwm			non-zero makes tcp_output() apply CWM;
 *				it is off by default.
 *	tcp_cwm_burstsize	burst, counted in segments, that
 *				tcp_output() allows on top of the data
 *				already in flight while CWM is on.  The
 *				default of 4 packets follows the
 *				Internet draft.
 */
int	tcp_cwm = 0;
int	tcp_cwm_burstsize = 4;
170
171 static __inline void tcp_segsize __P((struct tcpcb *, int *, int *));
172 static __inline void
173 tcp_segsize(tp, txsegsizep, rxsegsizep)
174 struct tcpcb *tp;
175 int *txsegsizep, *rxsegsizep;
176 {
177 #ifdef INET
178 struct inpcb *inp = tp->t_inpcb;
179 #endif
180 #ifdef INET6
181 struct in6pcb *in6p = tp->t_in6pcb;
182 #endif
183 struct rtentry *rt;
184 struct ifnet *ifp;
185 int size;
186 int iphlen;
187
188 #ifdef DIAGNOSTIC
189 if (tp->t_inpcb && tp->t_in6pcb)
190 panic("tcp_segsize: both t_inpcb and t_in6pcb are set");
191 #endif
192 switch (tp->t_family) {
193 #ifdef INET
194 case AF_INET:
195 iphlen = sizeof(struct ip);
196 break;
197 #endif
198 #ifdef INET6
199 case AF_INET6:
200 iphlen = sizeof(struct ip6_hdr);
201 break;
202 #endif
203 default:
204 size = tcp_mssdflt;
205 goto out;
206 }
207
208 rt = NULL;
209 #ifdef INET
210 if (inp)
211 rt = in_pcbrtentry(inp);
212 #endif
213 #ifdef INET6
214 if (in6p)
215 rt = in6_pcbrtentry(in6p);
216 #endif
217 if (rt == NULL) {
218 size = tcp_mssdflt;
219 goto out;
220 }
221
222 ifp = rt->rt_ifp;
223
224 size = tcp_mssdflt;
225 if (rt->rt_rmx.rmx_mtu != 0)
226 size = rt->rt_rmx.rmx_mtu - iphlen - sizeof(struct tcphdr);
227 else if (ifp->if_flags & IFF_LOOPBACK)
228 size = ifp->if_mtu - iphlen - sizeof(struct tcphdr);
229 #ifdef INET
230 else if (ip_mtudisc)
231 size = ifp->if_mtu - iphlen - sizeof(struct tcphdr);
232 else if (inp && in_localaddr(inp->inp_faddr))
233 size = ifp->if_mtu - iphlen - sizeof(struct tcphdr);
234 #endif
235 #ifdef INET6
236 else if (in6p) {
237 #ifdef INET
238 if (IN6_IS_ADDR_V4MAPPED(&in6p->in6p_faddr)) {
239 /* mapped addr case */
240 struct in_addr d;
241 bcopy(&in6p->in6p_faddr.s6_addr32[3], &d, sizeof(d));
242 if (in_localaddr(d))
243 size = ifp->if_mtu - iphlen - sizeof(struct tcphdr);
244 } else
245 #endif
246 {
247 if (in6_localaddr(&in6p->in6p_faddr))
248 size = ifp->if_mtu - iphlen - sizeof(struct tcphdr);
249 }
250 }
251 #endif
252 size -= tcp_optlen(tp);
253 /*
254 * XXX tp->t_ourmss should have the right size, but without this code
255 * fragmentation will occur... need more investigation
256 */
257 #ifdef INET
258 if (inp) {
259 #ifdef IPSEC
260 size -= ipsec4_hdrsiz_tcp(tp);
261 #endif
262 size -= ip_optlen(inp);
263 }
264 #endif
265 #ifdef INET6
266 #ifdef INET
267 if (in6p && tp->t_family == AF_INET) {
268 #ifdef IPSEC
269 size -= ipsec4_hdrsiz_tcp(tp);
270 #endif
271 /* XXX size -= ip_optlen(in6p); */
272 } else
273 #endif
274 if (in6p && tp->t_family == AF_INET6) {
275 #ifdef IPSEC
276 size -= ipsec6_hdrsiz_tcp(tp);
277 #endif
278 size -= ip6_optlen(in6p);
279 }
280 #endif
281
282 out:
283 /*
284 * *rxsegsizep holds *estimated* inbound segment size (estimation
285 * assumes that path MTU is the same for both ways). this is only
286 * for silly window avoidance, do not use the value for other purposes.
287 *
288 * ipseclen is subtracted from both sides, this may not be right.
289 * I'm not quite sure about this (could someone comment).
290 */
291 *txsegsizep = min(tp->t_peermss, size);
292 *rxsegsizep = min(tp->t_ourmss, size);
293
294 if (*txsegsizep != tp->t_segsz) {
295 /*
296 * If the new segment size is larger, we don't want to
297 * mess up the congestion window, but if it is smaller
298 * we'll have to reduce the congestion window to ensure
299 * that we don't get into trouble with initial windows
300 * and the rest. In any case, if the segment size
301 * has changed, chances are the path has, too, and
302 * our congestion window will be different.
303 */
304 if (*txsegsizep < tp->t_segsz) {
305 tp->snd_cwnd = max((tp->snd_cwnd / tp->t_segsz)
306 * *txsegsizep, *txsegsizep);
307 tp->snd_ssthresh = max((tp->snd_ssthresh / tp->t_segsz)
308 * *txsegsizep, *txsegsizep);
309 }
310 tp->t_segsz = *txsegsizep;
311 }
312 }
313
314 /*
315 * Tcp output routine: figure out what should be sent and send it.
316 */
317 int
318 tcp_output(tp)
319 struct tcpcb *tp;
320 {
321 struct socket *so;
322 struct route *ro;
323 long len, win;
324 int off, flags, error;
325 struct mbuf *m;
326 struct ip *ip;
327 #ifdef INET6
328 struct ip6_hdr *ip6;
329 #endif
330 struct tcphdr *th;
331 u_char opt[MAX_TCPOPTLEN];
332 unsigned optlen, hdrlen;
333 int idle, sendalot, txsegsize, rxsegsize;
334 int maxburst = TCP_MAXBURST;
335 int af; /* address family on the wire */
336 int iphdrlen;
337
338 #ifdef DIAGNOSTIC
339 if (tp->t_inpcb && tp->t_in6pcb)
340 panic("tcp_output: both t_inpcb and t_in6pcb are set");
341 #endif
342 so = NULL;
343 ro = NULL;
344 if (tp->t_inpcb) {
345 so = tp->t_inpcb->inp_socket;
346 ro = &tp->t_inpcb->inp_route;
347 }
348 #ifdef INET6
349 else if (tp->t_in6pcb) {
350 so = tp->t_in6pcb->in6p_socket;
351 ro = (struct route *)&tp->t_in6pcb->in6p_route;
352 }
353 #endif
354
355 switch (af = tp->t_family) {
356 #ifdef INET
357 case AF_INET:
358 if (tp->t_inpcb)
359 break;
360 #ifdef INET6
361 /* mapped addr case */
362 if (tp->t_in6pcb)
363 break;
364 #endif
365 return EINVAL;
366 #endif
367 #ifdef INET6
368 case AF_INET6:
369 if (tp->t_in6pcb)
370 break;
371 return EINVAL;
372 #endif
373 default:
374 return EAFNOSUPPORT;
375 }
376
377 tcp_segsize(tp, &txsegsize, &rxsegsize);
378
379 idle = (tp->snd_max == tp->snd_una);
380
381 /*
382 * Restart Window computation. From draft-floyd-incr-init-win-03:
383 *
384 * Optionally, a TCP MAY set the restart window to the
385 * minimum of the value used for the initial window and
386 * the current value of cwnd (in other words, using a
387 * larger value for the restart window should never increase
388 * the size of cwnd).
389 */
390 if (tcp_cwm) {
391 /*
392 * Hughes/Touch/Heidemann Congestion Window Monitoring.
393 * Count the number of packets currently pending
394 * acknowledgement, and limit our congestion window
395 * to a pre-determined allowed burst size plus that count.
396 * This prevents bursting once all pending packets have
397 * been acknowledged (i.e. transmission is idle).
398 *
399 * XXX Link this to Initial Window?
400 */
401 tp->snd_cwnd = min(tp->snd_cwnd,
402 (tcp_cwm_burstsize * txsegsize) +
403 (tp->snd_nxt - tp->snd_una));
404 } else {
405 if (idle && tp->t_idle >= tp->t_rxtcur) {
406 /*
407 * We have been idle for "a while" and no acks are
408 * expected to clock out any data we send --
409 * slow start to get ack "clock" running again.
410 */
411 tp->snd_cwnd = min(tp->snd_cwnd,
412 TCP_INITIAL_WINDOW(tcp_init_win, txsegsize));
413 }
414 }
415
416 again:
417 /*
418 * Determine length of data that should be transmitted, and
419 * flags that should be used. If there is some data or critical
420 * controls (SYN, RST) to send, then transmit; otherwise,
421 * investigate further.
422 */
423 sendalot = 0;
424 off = tp->snd_nxt - tp->snd_una;
425 win = min(tp->snd_wnd, tp->snd_cwnd);
426
427 flags = tcp_outflags[tp->t_state];
428 /*
429 * If in persist timeout with window of 0, send 1 byte.
430 * Otherwise, if window is small but nonzero
431 * and timer expired, we will send what we can
432 * and go to transmit state.
433 */
434 if (tp->t_force) {
435 if (win == 0) {
436 /*
437 * If we still have some data to send, then
438 * clear the FIN bit. Usually this would
439 * happen below when it realizes that we
440 * aren't sending all the data. However,
441 * if we have exactly 1 byte of unset data,
442 * then it won't clear the FIN bit below,
443 * and if we are in persist state, we wind
444 * up sending the packet without recording
445 * that we sent the FIN bit.
446 *
447 * We can't just blindly clear the FIN bit,
448 * because if we don't have any more data
449 * to send then the probe will be the FIN
450 * itself.
451 */
452 if (off < so->so_snd.sb_cc)
453 flags &= ~TH_FIN;
454 win = 1;
455 } else {
456 TCP_TIMER_DISARM(tp, TCPT_PERSIST);
457 tp->t_rxtshift = 0;
458 }
459 }
460
461 if (win < so->so_snd.sb_cc) {
462 len = win - off;
463 flags &= ~TH_FIN;
464 } else
465 len = so->so_snd.sb_cc - off;
466
467 if (len < 0) {
468 /*
469 * If FIN has been sent but not acked,
470 * but we haven't been called to retransmit,
471 * len will be -1. Otherwise, window shrank
472 * after we sent into it. If window shrank to 0,
473 * cancel pending retransmit, pull snd_nxt back
474 * to (closed) window, and set the persist timer
475 * if it isn't already going. If the window didn't
476 * close completely, just wait for an ACK.
477 *
478 * If we have a pending FIN, either it has already been
479 * transmitted or it is outside the window, so drop it.
480 * If the FIN has been transmitted, but this is not a
481 * retransmission, then len must be -1. Therefore we also
482 * prevent here the sending of `gratuitous FINs'. This
483 * eliminates the need to check for that case below (e.g.
484 * to back up snd_nxt before the FIN so that the sequence
485 * number is correct).
486 */
487 len = 0;
488 flags &= ~TH_FIN;
489 if (win == 0) {
490 TCP_TIMER_DISARM(tp, TCPT_REXMT);
491 tp->t_rxtshift = 0;
492 tp->snd_nxt = tp->snd_una;
493 if (TCP_TIMER_ISARMED(tp, TCPT_PERSIST) == 0)
494 tcp_setpersist(tp);
495 }
496 }
497 if (len > txsegsize) {
498 len = txsegsize;
499 flags &= ~TH_FIN;
500 sendalot = 1;
501 }
502
503 win = sbspace(&so->so_rcv);
504
505 /*
506 * Sender silly window avoidance. If connection is idle
507 * and can send all data, a maximum segment,
508 * at least a maximum default-size segment do it,
509 * or are forced, do it; otherwise don't bother.
510 * If peer's buffer is tiny, then send
511 * when window is at least half open.
512 * If retransmitting (possibly after persist timer forced us
513 * to send into a small window), then must resend.
514 */
515 if (len) {
516 if (len == txsegsize)
517 goto send;
518 if ((so->so_state & SS_MORETOCOME) == 0 &&
519 ((idle || tp->t_flags & TF_NODELAY) &&
520 len + off >= so->so_snd.sb_cc))
521 goto send;
522 if (tp->t_force)
523 goto send;
524 if (len >= tp->max_sndwnd / 2)
525 goto send;
526 if (SEQ_LT(tp->snd_nxt, tp->snd_max))
527 goto send;
528 }
529
530 /*
531 * Compare available window to amount of window known to peer
532 * (as advertised window less next expected input). If the
533 * difference is at least twice the size of the largest segment
534 * we expect to receive (i.e. two segments) or at least 50% of
535 * the maximum possible window, then want to send a window update
536 * to peer.
537 */
538 if (win > 0) {
539 /*
540 * "adv" is the amount we can increase the window,
541 * taking into account that we are limited by
542 * TCP_MAXWIN << tp->rcv_scale.
543 */
544 long adv = min(win, (long)TCP_MAXWIN << tp->rcv_scale) -
545 (tp->rcv_adv - tp->rcv_nxt);
546
547 if (adv >= (long) (2 * rxsegsize))
548 goto send;
549 if (2 * adv >= (long) so->so_rcv.sb_hiwat)
550 goto send;
551 }
552
553 /*
554 * Send if we owe peer an ACK.
555 */
556 if (tp->t_flags & TF_ACKNOW)
557 goto send;
558 if (flags & (TH_SYN|TH_FIN|TH_RST))
559 goto send;
560 if (SEQ_GT(tp->snd_up, tp->snd_una))
561 goto send;
562
563 /*
564 * TCP window updates are not reliable, rather a polling protocol
565 * using ``persist'' packets is used to insure receipt of window
566 * updates. The three ``states'' for the output side are:
567 * idle not doing retransmits or persists
568 * persisting to move a small or zero window
569 * (re)transmitting and thereby not persisting
570 *
571 * tp->t_timer[TCPT_PERSIST]
572 * is set when we are in persist state.
573 * tp->t_force
574 * is set when we are called to send a persist packet.
575 * tp->t_timer[TCPT_REXMT]
576 * is set when we are retransmitting
577 * The output side is idle when both timers are zero.
578 *
579 * If send window is too small, there is data to transmit, and no
580 * retransmit or persist is pending, then go to persist state.
581 * If nothing happens soon, send when timer expires:
582 * if window is nonzero, transmit what we can,
583 * otherwise force out a byte.
584 */
585 if (so->so_snd.sb_cc && TCP_TIMER_ISARMED(tp, TCPT_REXMT) == 0 &&
586 TCP_TIMER_ISARMED(tp, TCPT_PERSIST) == 0) {
587 tp->t_rxtshift = 0;
588 tcp_setpersist(tp);
589 }
590
591 /*
592 * No reason to send a segment, just return.
593 */
594 return (0);
595
596 send:
597 /*
598 * Before ESTABLISHED, force sending of initial options
599 * unless TCP set not to do any options.
600 * NOTE: we assume that the IP/TCP header plus TCP options
601 * always fit in a single mbuf, leaving room for a maximum
602 * link header, i.e.
603 * max_linkhdr + sizeof (struct tcpiphdr) + optlen <= MCLBYTES
604 */
605 optlen = 0;
606 switch (af) {
607 #ifdef INET
608 case AF_INET:
609 iphdrlen = sizeof(struct ip) + sizeof(struct tcphdr);
610 break;
611 #endif
612 #ifdef INET6
613 case AF_INET6:
614 iphdrlen = sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
615 break;
616 #endif
617 default: /*pacify gcc*/
618 iphdrlen = 0;
619 break;
620 }
621 hdrlen = iphdrlen;
622 if (flags & TH_SYN) {
623 struct rtentry *rt;
624
625 rt = NULL;
626 #ifdef INET
627 if (tp->t_inpcb)
628 rt = in_pcbrtentry(tp->t_inpcb);
629 #endif
630 #ifdef INET6
631 if (tp->t_in6pcb)
632 rt = in6_pcbrtentry(tp->t_in6pcb);
633 #endif
634
635 tp->snd_nxt = tp->iss;
636 tp->t_ourmss = tcp_mss_to_advertise(rt != NULL ?
637 rt->rt_ifp : NULL, af);
638 if ((tp->t_flags & TF_NOOPT) == 0) {
639 opt[0] = TCPOPT_MAXSEG;
640 opt[1] = 4;
641 opt[2] = (tp->t_ourmss >> 8) & 0xff;
642 opt[3] = tp->t_ourmss & 0xff;
643 optlen = 4;
644
645 if ((tp->t_flags & TF_REQ_SCALE) &&
646 ((flags & TH_ACK) == 0 ||
647 (tp->t_flags & TF_RCVD_SCALE))) {
648 *((u_int32_t *) (opt + optlen)) = htonl(
649 TCPOPT_NOP << 24 |
650 TCPOPT_WINDOW << 16 |
651 TCPOLEN_WINDOW << 8 |
652 tp->request_r_scale);
653 optlen += 4;
654 }
655 }
656 }
657
658 /*
659 * Send a timestamp and echo-reply if this is a SYN and our side
660 * wants to use timestamps (TF_REQ_TSTMP is set) or both our side
661 * and our peer have sent timestamps in our SYN's.
662 */
663 if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP &&
664 (flags & TH_RST) == 0 &&
665 ((flags & (TH_SYN|TH_ACK)) == TH_SYN ||
666 (tp->t_flags & TF_RCVD_TSTMP))) {
667 u_int32_t *lp = (u_int32_t *)(opt + optlen);
668
669 /* Form timestamp option as shown in appendix A of RFC 1323. */
670 *lp++ = htonl(TCPOPT_TSTAMP_HDR);
671 *lp++ = htonl(tcp_now);
672 *lp = htonl(tp->ts_recent);
673 optlen += TCPOLEN_TSTAMP_APPA;
674 }
675
676 hdrlen += optlen;
677
678 #ifdef DIAGNOSTIC
679 if (len > txsegsize)
680 panic("tcp data to be sent is larger than segment");
681 if (max_linkhdr + hdrlen > MCLBYTES)
682 panic("tcphdr too big");
683 #endif
684
685 /*
686 * Grab a header mbuf, attaching a copy of data to
687 * be transmitted, and initialize the header from
688 * the template for sends on this connection.
689 */
690 if (len) {
691 if (tp->t_force && len == 1)
692 tcpstat.tcps_sndprobe++;
693 else if (SEQ_LT(tp->snd_nxt, tp->snd_max)) {
694 tcpstat.tcps_sndrexmitpack++;
695 tcpstat.tcps_sndrexmitbyte += len;
696 } else {
697 tcpstat.tcps_sndpack++;
698 tcpstat.tcps_sndbyte += len;
699 }
700 #ifdef notyet
701 if ((m = m_copypack(so->so_snd.sb_mb, off,
702 (int)len, max_linkhdr + hdrlen)) == 0) {
703 error = ENOBUFS;
704 goto out;
705 }
706 /*
707 * m_copypack left space for our hdr; use it.
708 */
709 m->m_len += hdrlen;
710 m->m_data -= hdrlen;
711 #else
712 MGETHDR(m, M_DONTWAIT, MT_HEADER);
713 if (m != NULL &&
714 (max_linkhdr + hdrlen > MHLEN ||
715 max_linkhdr + hdrlen + len <= MCLBYTES)) {
716 MCLGET(m, M_DONTWAIT);
717 if ((m->m_flags & M_EXT) == 0) {
718 m_freem(m);
719 m = NULL;
720 }
721 }
722 if (m == NULL) {
723 error = ENOBUFS;
724 goto out;
725 }
726 m->m_data += max_linkhdr;
727 m->m_len = hdrlen;
728 if (len <= M_TRAILINGSPACE(m)) {
729 m_copydata(so->so_snd.sb_mb, off, (int) len,
730 mtod(m, caddr_t) + hdrlen);
731 m->m_len += len;
732 } else {
733 m->m_next = m_copy(so->so_snd.sb_mb, off, (int) len);
734 if (m->m_next == NULL) {
735 m_freem(m);
736 error = ENOBUFS;
737 goto out;
738 }
739 }
740 #endif
741 /*
742 * If we're sending everything we've got, set PUSH.
743 * (This will keep happy those implementations which only
744 * give data to the user when a buffer fills or
745 * a PUSH comes in.)
746 */
747 if (off + len == so->so_snd.sb_cc)
748 flags |= TH_PUSH;
749 } else {
750 if (tp->t_flags & TF_ACKNOW)
751 tcpstat.tcps_sndacks++;
752 else if (flags & (TH_SYN|TH_FIN|TH_RST))
753 tcpstat.tcps_sndctrl++;
754 else if (SEQ_GT(tp->snd_up, tp->snd_una))
755 tcpstat.tcps_sndurg++;
756 else
757 tcpstat.tcps_sndwinup++;
758
759 MGETHDR(m, M_DONTWAIT, MT_HEADER);
760 if (m != NULL && max_linkhdr + hdrlen > MHLEN) {
761 MCLGET(m, M_DONTWAIT);
762 if ((m->m_flags & M_EXT) == 0) {
763 m_freem(m);
764 m = NULL;
765 }
766 }
767 if (m == NULL) {
768 error = ENOBUFS;
769 goto out;
770 }
771 m->m_data += max_linkhdr;
772 m->m_len = hdrlen;
773 }
774 m->m_pkthdr.rcvif = (struct ifnet *)0;
775 switch (af) {
776 #ifdef INET
777 case AF_INET:
778 ip = mtod(m, struct ip *);
779 #ifdef INET6
780 ip6 = NULL;
781 #endif
782 th = (struct tcphdr *)(ip + 1);
783 break;
784 #endif
785 #ifdef INET6
786 case AF_INET6:
787 ip = NULL;
788 ip6 = mtod(m, struct ip6_hdr *);
789 th = (struct tcphdr *)(ip6 + 1);
790 break;
791 #endif
792 default: /*pacify gcc*/
793 ip = NULL;
794 #ifdef INET6
795 ip6 = NULL;
796 #endif
797 th = NULL;
798 break;
799 }
800 if (tp->t_template == 0)
801 panic("tcp_output");
802 if (tp->t_template->m_len < iphdrlen)
803 panic("tcp_output");
804 bcopy(mtod(tp->t_template, caddr_t), mtod(m, caddr_t), iphdrlen);
805
806 /*
807 * If we are doing retransmissions, then snd_nxt will
808 * not reflect the first unsent octet. For ACK only
809 * packets, we do not want the sequence number of the
810 * retransmitted packet, we want the sequence number
811 * of the next unsent octet. So, if there is no data
812 * (and no SYN or FIN), use snd_max instead of snd_nxt
813 * when filling in ti_seq. But if we are in persist
814 * state, snd_max might reflect one byte beyond the
815 * right edge of the window, so use snd_nxt in that
816 * case, since we know we aren't doing a retransmission.
817 * (retransmit and persist are mutually exclusive...)
818 */
819 if (len || (flags & (TH_SYN|TH_FIN)) ||
820 TCP_TIMER_ISARMED(tp, TCPT_PERSIST))
821 th->th_seq = htonl(tp->snd_nxt);
822 else
823 th->th_seq = htonl(tp->snd_max);
824 th->th_ack = htonl(tp->rcv_nxt);
825 if (optlen) {
826 bcopy((caddr_t)opt, (caddr_t)(th + 1), optlen);
827 th->th_off = (sizeof (struct tcphdr) + optlen) >> 2;
828 }
829 th->th_flags = flags;
830 /*
831 * Calculate receive window. Don't shrink window,
832 * but avoid silly window syndrome.
833 */
834 if (win < (long)(so->so_rcv.sb_hiwat / 4) && win < (long)rxsegsize)
835 win = 0;
836 if (win > (long)TCP_MAXWIN << tp->rcv_scale)
837 win = (long)TCP_MAXWIN << tp->rcv_scale;
838 if (win < (long)(tp->rcv_adv - tp->rcv_nxt))
839 win = (long)(tp->rcv_adv - tp->rcv_nxt);
840 th->th_win = htons((u_int16_t) (win>>tp->rcv_scale));
841 if (SEQ_GT(tp->snd_up, tp->snd_nxt)) {
842 u_int32_t urp = tp->snd_up - tp->snd_nxt;
843 if (urp > IP_MAXPACKET)
844 urp = IP_MAXPACKET;
845 th->th_urp = htons((u_int16_t)urp);
846 th->th_flags |= TH_URG;
847 } else
848 /*
849 * If no urgent pointer to send, then we pull
850 * the urgent pointer to the left edge of the send window
851 * so that it doesn't drift into the send window on sequence
852 * number wraparound.
853 */
854 tp->snd_up = tp->snd_una; /* drag it along */
855
856 /*
857 * Put TCP length in extended header, and then
858 * checksum extended header and data.
859 */
860 switch (af) {
861 #ifdef INET
862 case AF_INET:
863 {
864 struct ipovly *ipov = (struct ipovly *)ip;
865 if (len + optlen)
866 ipov->ih_len = htons((u_int16_t)(sizeof(struct tcphdr) +
867 optlen + len));
868 bzero(ipov->ih_x1, sizeof(ipov->ih_x1));
869 th->th_sum = 0;
870 th->th_sum = in_cksum(m, (int)(hdrlen + len));
871 break;
872 }
873 #endif
874 #ifdef INET6
875 case AF_INET6:
876 /* equals to hdrlen + len */
877 m->m_pkthdr.len = sizeof(struct ip6_hdr)
878 + sizeof(struct tcphdr) + optlen + len;
879 th->th_sum = 0;
880 th->th_sum = in6_cksum(m, IPPROTO_TCP,
881 sizeof(struct ip6_hdr),
882 sizeof(struct tcphdr) + optlen + len);
883 break;
884 #endif
885 }
886
887 /*
888 * In transmit state, time the transmission and arrange for
889 * the retransmit. In persist state, just set snd_max.
890 */
891 if (tp->t_force == 0 || TCP_TIMER_ISARMED(tp, TCPT_PERSIST) == 0) {
892 tcp_seq startseq = tp->snd_nxt;
893
894 /*
895 * Advance snd_nxt over sequence space of this segment.
896 * There are no states in which we send both a SYN and a FIN,
897 * so we collapse the tests for these flags.
898 */
899 if (flags & (TH_SYN|TH_FIN))
900 tp->snd_nxt++;
901 tp->snd_nxt += len;
902 if (SEQ_GT(tp->snd_nxt, tp->snd_max)) {
903 tp->snd_max = tp->snd_nxt;
904 /*
905 * Time this transmission if not a retransmission and
906 * not currently timing anything.
907 */
908 if (tp->t_rtt == 0) {
909 tp->t_rtt = 1;
910 tp->t_rtseq = startseq;
911 tcpstat.tcps_segstimed++;
912 }
913 }
914
915 /*
916 * Set retransmit timer if not currently set,
917 * and not doing an ack or a keep-alive probe.
918 * Initial value for retransmit timer is smoothed
919 * round-trip time + 2 * round-trip time variance.
920 * Initialize shift counter which is used for backoff
921 * of retransmit time.
922 */
923 if (TCP_TIMER_ISARMED(tp, TCPT_REXMT) == 0 &&
924 tp->snd_nxt != tp->snd_una) {
925 TCP_TIMER_ARM(tp, TCPT_REXMT, tp->t_rxtcur);
926 if (TCP_TIMER_ISARMED(tp, TCPT_PERSIST)) {
927 TCP_TIMER_DISARM(tp, TCPT_PERSIST);
928 tp->t_rxtshift = 0;
929 }
930 }
931 } else
932 if (SEQ_GT(tp->snd_nxt + len, tp->snd_max))
933 tp->snd_max = tp->snd_nxt + len;
934
935 /*
936 * Trace.
937 */
938 if (so->so_options & SO_DEBUG) {
939 /*
940 * need to recover version # field, which was overwritten
941 * on ip_cksum computation.
942 */
943 struct ip *sip;
944 sip = mtod(m, struct ip *);
945 switch (af) {
946 #ifdef INET
947 case AF_INET:
948 sip->ip_v = 4;
949 break;
950 #endif
951 #ifdef INET6
952 case AF_INET6:
953 sip->ip_v = 6;
954 break;
955 #endif
956 }
957 tcp_trace(TA_OUTPUT, tp->t_state, tp, m, 0);
958 }
959
960 /*
961 * Fill in IP length and desired time to live and
962 * send to IP level. There should be a better way
963 * to handle ttl and tos; we could keep them in
964 * the template, but need a way to checksum without them.
965 */
966 m->m_pkthdr.len = hdrlen + len;
967
968 switch (af) {
969 #ifdef INET
970 case AF_INET:
971 ip->ip_len = m->m_pkthdr.len;
972 if (tp->t_inpcb) {
973 ip->ip_ttl = tp->t_inpcb->inp_ip.ip_ttl;
974 ip->ip_tos = tp->t_inpcb->inp_ip.ip_tos;
975 }
976 #ifdef INET6
977 else if (tp->t_in6pcb) {
978 ip->ip_ttl = tp->t_in6pcb->in6p_ip6.ip6_hlim;
979 ip->ip_tos = 0; /*XXX*/
980 }
981 #endif
982 break;
983 #endif
984 #ifdef INET6
985 case AF_INET6:
986 ip6->ip6_nxt = IPPROTO_TCP;
987 if (tp->t_in6pcb) {
988 /*
989 * we separately set hoplimit for every segment, since
990 * the user might want to change the value via
991 * setsockopt. Also, desired default hop limit might
992 * be changed via Neighbor Discovery.
993 */
994 ip6->ip6_hlim = in6_selecthlim(tp->t_in6pcb,
995 ro->ro_rt ? ro->ro_rt->rt_ifp : NULL);
996 }
997 /* ip6->ip6_flow = ??? */
998 /* ip6_plen will be filled in ip6_output(). */
999 break;
1000 #endif
1001 }
1002
1003 #ifdef IPSEC
1004 ipsec_setsocket(m, so);
1005 #endif /*IPSEC*/
1006
1007 switch (af) {
1008 #ifdef INET
1009 case AF_INET:
1010 {
1011 struct mbuf *opts;
1012
1013 if (tp->t_inpcb)
1014 opts = tp->t_inpcb->inp_options;
1015 else
1016 opts = NULL;
1017 error = ip_output(m, opts, ro,
1018 (ip_mtudisc ? IP_MTUDISC : 0) |
1019 (so->so_options & SO_DONTROUTE),
1020 0);
1021 break;
1022 }
1023 #endif
1024 #ifdef INET6
1025 case AF_INET6:
1026 {
1027 struct ip6_pktopts *opts;
1028
1029 if (tp->t_in6pcb)
1030 opts = tp->t_in6pcb->in6p_outputopts;
1031 else
1032 opts = NULL;
1033 error = ip6_output(m, opts, (struct route_in6 *)ro,
1034 so->so_options & SO_DONTROUTE, 0, NULL);
1035 break;
1036 }
1037 #endif
1038 default:
1039 error = EAFNOSUPPORT;
1040 break;
1041 }
1042 if (error) {
1043 out:
1044 if (error == ENOBUFS) {
1045 #ifdef INET
1046 if (tp->t_inpcb)
1047 tcp_quench(tp->t_inpcb, 0);
1048 #endif
1049 #ifdef INET6
1050 if (tp->t_in6pcb)
1051 tcp6_quench(tp->t_in6pcb, 0);
1052 #endif
1053 return (0);
1054 }
1055 if ((error == EHOSTUNREACH || error == ENETDOWN)
1056 && TCPS_HAVERCVDSYN(tp->t_state)) {
1057 tp->t_softerror = error;
1058 return (0);
1059 }
1060 return (error);
1061 }
1062 tcpstat.tcps_sndtotal++;
1063 if (tp->t_flags & TF_DELACK)
1064 tcpstat.tcps_delack++;
1065
1066 /*
1067 * Data sent (as far as we can tell).
1068 * If this advertises a larger window than any other segment,
1069 * then remember the size of the advertised window.
1070 * Any pending ACK has now been sent.
1071 */
1072 if (win > 0 && SEQ_GT(tp->rcv_nxt+win, tp->rcv_adv))
1073 tp->rcv_adv = tp->rcv_nxt + win;
1074 tp->last_ack_sent = tp->rcv_nxt;
1075 tp->t_flags &= ~TF_ACKNOW;
1076 TCP_CLEAR_DELACK(tp);
1077 #ifdef DIAGNOSTIC
1078 if (maxburst < 0)
1079 printf("tcp_output: maxburst exceeded by %d\n", -maxburst);
1080 #endif
1081 if (sendalot && (!tcp_do_newreno || --maxburst))
1082 goto again;
1083 return (0);
1084 }
1085
1086 void
1087 tcp_setpersist(tp)
1088 struct tcpcb *tp;
1089 {
1090 int t = ((tp->t_srtt >> 2) + tp->t_rttvar) >> (1 + 2);
1091 int nticks;
1092
1093 if (TCP_TIMER_ISARMED(tp, TCPT_REXMT))
1094 panic("tcp_output REXMT");
1095 /*
1096 * Start/restart persistance timer.
1097 */
1098 if (t < tp->t_rttmin)
1099 t = tp->t_rttmin;
1100 TCPT_RANGESET(nticks, t * tcp_backoff[tp->t_rxtshift],
1101 TCPTV_PERSMIN, TCPTV_PERSMAX);
1102 TCP_TIMER_ARM(tp, TCPT_PERSIST, nticks);
1103 if (tp->t_rxtshift < TCP_MAXRXTSHIFT)
1104 tp->t_rxtshift++;
1105 }
1106