/*	$NetBSD: tcp_output.c,v 1.141.10.1 2006/04/19 04:46:11 elad Exp $	*/
2
3 /*
4 * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
5 * All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in the
14 * documentation and/or other materials provided with the distribution.
15 * 3. Neither the name of the project nor the names of its contributors
16 * may be used to endorse or promote products derived from this software
17 * without specific prior written permission.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
20 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22 * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
23 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
25 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
26 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
29 * SUCH DAMAGE.
30 */
31
32 /*
33 * @(#)COPYRIGHT 1.1 (NRL) 17 January 1995
34 *
35 * NRL grants permission for redistribution and use in source and binary
36 * forms, with or without modification, of the software and documentation
37 * created at NRL provided that the following conditions are met:
38 *
39 * 1. Redistributions of source code must retain the above copyright
40 * notice, this list of conditions and the following disclaimer.
41 * 2. Redistributions in binary form must reproduce the above copyright
42 * notice, this list of conditions and the following disclaimer in the
43 * documentation and/or other materials provided with the distribution.
44 * 3. All advertising materials mentioning features or use of this software
45 * must display the following acknowledgements:
46 * This product includes software developed by the University of
47 * California, Berkeley and its contributors.
48 * This product includes software developed at the Information
49 * Technology Division, US Naval Research Laboratory.
50 * 4. Neither the name of the NRL nor the names of its contributors
51 * may be used to endorse or promote products derived from this software
52 * without specific prior written permission.
53 *
54 * THE SOFTWARE PROVIDED BY NRL IS PROVIDED BY NRL AND CONTRIBUTORS ``AS
55 * IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
56 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
57 * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NRL OR
58 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
59 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
60 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
61 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
62 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
63 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
64 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
65 *
66 * The views and conclusions contained in the software and documentation
67 * are those of the authors and should not be interpreted as representing
68 * official policies, either expressed or implied, of the US Naval
69 * Research Laboratory (NRL).
70 */
71
72 /*-
73 * Copyright (c) 1997, 1998, 2001, 2005 The NetBSD Foundation, Inc.
74 * All rights reserved.
75 *
76 * This code is derived from software contributed to The NetBSD Foundation
77 * by Jason R. Thorpe and Kevin M. Lahey of the Numerical Aerospace Simulation
78 * Facility, NASA Ames Research Center.
79 * This code is derived from software contributed to The NetBSD Foundation
80 * by Charles M. Hannum.
81 *
82 * Redistribution and use in source and binary forms, with or without
83 * modification, are permitted provided that the following conditions
84 * are met:
85 * 1. Redistributions of source code must retain the above copyright
86 * notice, this list of conditions and the following disclaimer.
87 * 2. Redistributions in binary form must reproduce the above copyright
88 * notice, this list of conditions and the following disclaimer in the
89 * documentation and/or other materials provided with the distribution.
90 * 3. All advertising materials mentioning features or use of this software
91 * must display the following acknowledgement:
92 * This product includes software developed by the NetBSD
93 * Foundation, Inc. and its contributors.
94 * 4. Neither the name of The NetBSD Foundation nor the names of its
95 * contributors may be used to endorse or promote products derived
96 * from this software without specific prior written permission.
97 *
98 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
99 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
100 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
101 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
102 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
103 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
104 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
105 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
106 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
107 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
108 * POSSIBILITY OF SUCH DAMAGE.
109 */
110
111 /*
112 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995
113 * The Regents of the University of California. All rights reserved.
114 *
115 * Redistribution and use in source and binary forms, with or without
116 * modification, are permitted provided that the following conditions
117 * are met:
118 * 1. Redistributions of source code must retain the above copyright
119 * notice, this list of conditions and the following disclaimer.
120 * 2. Redistributions in binary form must reproduce the above copyright
121 * notice, this list of conditions and the following disclaimer in the
122 * documentation and/or other materials provided with the distribution.
123 * 3. Neither the name of the University nor the names of its contributors
124 * may be used to endorse or promote products derived from this software
125 * without specific prior written permission.
126 *
127 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
128 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
129 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
130 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
131 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
132 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
133 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
134 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
135 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
136 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
137 * SUCH DAMAGE.
138 *
139 * @(#)tcp_output.c 8.4 (Berkeley) 5/24/95
140 */
141
142 #include <sys/cdefs.h>
143 __KERNEL_RCSID(0, "$NetBSD: tcp_output.c,v 1.141.10.1 2006/04/19 04:46:11 elad Exp $");
144
145 #include "opt_inet.h"
146 #include "opt_ipsec.h"
147 #include "opt_tcp_debug.h"
148
149 #include <sys/param.h>
150 #include <sys/systm.h>
151 #include <sys/malloc.h>
152 #include <sys/mbuf.h>
153 #include <sys/protosw.h>
154 #include <sys/socket.h>
155 #include <sys/socketvar.h>
156 #include <sys/errno.h>
157 #include <sys/domain.h>
158 #include <sys/kernel.h>
159 #ifdef TCP_SIGNATURE
160 #include <sys/md5.h>
161 #endif
162
163 #include <net/if.h>
164 #include <net/route.h>
165
166 #include <netinet/in.h>
167 #include <netinet/in_systm.h>
168 #include <netinet/ip.h>
169 #include <netinet/in_pcb.h>
170 #include <netinet/ip_var.h>
171
172 #ifdef INET6
173 #ifndef INET
174 #include <netinet/in.h>
175 #endif
176 #include <netinet/ip6.h>
177 #include <netinet6/in6_var.h>
178 #include <netinet6/ip6_var.h>
179 #include <netinet6/in6_pcb.h>
180 #include <netinet6/nd6.h>
181 #endif
182
183 #ifdef FAST_IPSEC
184 #include <netipsec/ipsec.h>
185 #include <netipsec/key.h>
186 #endif /* FAST_IPSEC*/
187 #ifdef IPSEC
188 #include <netinet6/ipsec.h>
189 #endif
190
191 #include <netinet/tcp.h>
192 #define TCPOUTFLAGS
193 #include <netinet/tcp_fsm.h>
194 #include <netinet/tcp_seq.h>
195 #include <netinet/tcp_timer.h>
196 #include <netinet/tcp_var.h>
197 #include <netinet/tcpip.h>
198 #include <netinet/tcp_debug.h>
199 #include <netinet/in_offload.h>
200
201 #ifdef IPSEC
202 #include <netkey/key.h>
203 #endif
204
205 #ifdef notyet
206 extern struct mbuf *m_copypack();
207 #endif
208
209 /*
210 * Knob to enable Congestion Window Monitoring, and control
211 * the burst size it allows. Default burst is 4 packets, per
212 * the Internet draft.
213 */
214 int tcp_cwm = 0;
215 int tcp_cwm_burstsize = 4;
216
217 #ifdef TCP_OUTPUT_COUNTERS
218 #include <sys/device.h>
219
220 extern struct evcnt tcp_output_bigheader;
221 extern struct evcnt tcp_output_predict_hit;
222 extern struct evcnt tcp_output_predict_miss;
223 extern struct evcnt tcp_output_copysmall;
224 extern struct evcnt tcp_output_copybig;
225 extern struct evcnt tcp_output_refbig;
226
227 #define TCP_OUTPUT_COUNTER_INCR(ev) (ev)->ev_count++
228 #else
229
230 #define TCP_OUTPUT_COUNTER_INCR(ev) /* nothing */
231
232 #endif /* TCP_OUTPUT_COUNTERS */
233
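/*
 * tcp_segsize: compute the transmit and receive segment sizes for this
 * connection, starting from the route or interface MTU and subtracting
 * the IP/TCP header length plus any TCP/IP option and IPsec overhead.
 * The transmit size is additionally limited to half the send buffer so
 * that two packets can always be kept in flight; if the transmit size
 * shrinks, snd_cwnd and snd_ssthresh are scaled down accordingly.
 * Returns EMSGSIZE if the MTU leaves no room for data.
 */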
234 static
235 #ifndef GPROF
236 inline
237 #endif
238 int
239 tcp_segsize(struct tcpcb *tp, int *txsegsizep, int *rxsegsizep)
240 {
241 #ifdef INET
242 struct inpcb *inp = tp->t_inpcb;
243 #endif
244 #ifdef INET6
245 struct in6pcb *in6p = tp->t_in6pcb;
246 #endif
247 struct socket *so = NULL;
248 struct rtentry *rt;
249 struct ifnet *ifp;
250 int size;
251 int hdrlen;
252 int optlen;
253
254 #ifdef DIAGNOSTIC
255 if (tp->t_inpcb && tp->t_in6pcb)
256 panic("tcp_segsize: both t_inpcb and t_in6pcb are set");
257 #endif
258 switch (tp->t_family) {
259 #ifdef INET
260 case AF_INET:
261 hdrlen = sizeof(struct ip) + sizeof(struct tcphdr);
262 break;
263 #endif
264 #ifdef INET6
265 case AF_INET6:
266 hdrlen = sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
267 break;
268 #endif
269 default:
270 size = tcp_mssdflt;
271 goto out;
272 }
273
274 rt = NULL;
275 #ifdef INET
276 if (inp) {
277 rt = in_pcbrtentry(inp);
278 so = inp->inp_socket;
279 }
280 #endif
281 #ifdef INET6
282 if (in6p) {
283 rt = in6_pcbrtentry(in6p);
284 so = in6p->in6p_socket;
285 }
286 #endif
287 if (rt == NULL) {
288 size = tcp_mssdflt;
289 goto out;
290 }
291
292 ifp = rt->rt_ifp;
293
294 size = tcp_mssdflt;
295 if (tp->t_mtudisc && rt->rt_rmx.rmx_mtu != 0) {
296 #ifdef INET6
297 if (in6p && rt->rt_rmx.rmx_mtu < IPV6_MMTU) {
298 /*
299 * RFC2460 section 5, last paragraph: if path MTU is
300 * smaller than 1280, use 1280 as packet size and
301 * attach fragment header.
302 */
303 size = IPV6_MMTU - hdrlen - sizeof(struct ip6_frag);
304 } else
305 size = rt->rt_rmx.rmx_mtu - hdrlen;
306 #else
307 size = rt->rt_rmx.rmx_mtu - hdrlen;
308 #endif
309 } else if (ifp->if_flags & IFF_LOOPBACK)
310 size = ifp->if_mtu - hdrlen;
311 #ifdef INET
312 else if (inp && tp->t_mtudisc)
313 size = ifp->if_mtu - hdrlen;
314 else if (inp && in_localaddr(inp->inp_faddr))
315 size = ifp->if_mtu - hdrlen;
316 #endif
317 #ifdef INET6
318 else if (in6p) {
319 #ifdef INET
320 if (IN6_IS_ADDR_V4MAPPED(&in6p->in6p_faddr)) {
321 /* mapped addr case */
322 struct in_addr d;
323 bcopy(&in6p->in6p_faddr.s6_addr32[3], &d, sizeof(d));
324 if (tp->t_mtudisc || in_localaddr(d))
325 size = ifp->if_mtu - hdrlen;
326 } else
327 #endif
328 {
329 /*
330 * for IPv6, path MTU discovery is always turned on,
331 * or the node must use packet size <= 1280.
332 */
333 size = tp->t_mtudisc ? IN6_LINKMTU(ifp) : IPV6_MMTU;
334 size -= hdrlen;
335 }
336 }
337 #endif
338 out:
339 /*
340 * Now we must make room for whatever extra TCP/IP options are in
341 * the packet.
342 */
343 optlen = tcp_optlen(tp);
344
345 /*
346 * XXX tp->t_ourmss should have the right size, but without this code
347 * fragmentation will occur... need more investigation
348 */
349 #ifdef INET
350 if (inp) {
351 #if defined(IPSEC) || defined(FAST_IPSEC)
352 if (! IPSEC_PCB_SKIP_IPSEC(inp->inp_sp, IPSEC_DIR_OUTBOUND))
353 optlen += ipsec4_hdrsiz_tcp(tp);
354 #endif
355 optlen += ip_optlen(inp);
356 }
357 #endif
358 #ifdef INET6
359 #ifdef INET
360 if (in6p && tp->t_family == AF_INET) {
361 #if defined(IPSEC) || defined(FAST_IPSEC)
362 if (! IPSEC_PCB_SKIP_IPSEC(in6p->in6p_sp, IPSEC_DIR_OUTBOUND))
363 optlen += ipsec4_hdrsiz_tcp(tp);
364 #endif
365 /* XXX size -= ip_optlen(in6p); */
366 } else
367 #endif
368 if (in6p && tp->t_family == AF_INET6) {
369 #ifdef IPSEC
370 if (! IPSEC_PCB_SKIP_IPSEC(in6p->in6p_sp, IPSEC_DIR_OUTBOUND))
371 optlen += ipsec6_hdrsiz_tcp(tp);
372 #endif
373 optlen += ip6_optlen(in6p);
374 }
375 #endif
376 size -= optlen;
377
378 /* there may not be any room for data if mtu is too small */
379 if (size < 0)
380 return (EMSGSIZE);
381
	/*
	 * *rxsegsizep holds the *estimated* inbound segment size (the
	 * estimate assumes that the path MTU is the same in both
	 * directions).  This is only for silly window avoidance; do not
	 * use the value for other purposes.
	 *
	 * ipseclen is subtracted from both sides; this may not be right.
	 * I'm not quite sure about this (could someone comment).
	 */
	*txsegsizep = min(tp->t_peermss - optlen, size);
	/*
	 * Never send more than half a buffer full.  This ensures that we can
	 * always keep 2 packets on the wire, no matter what SO_SNDBUF is, and
	 * therefore acks will never be delayed unless we run out of data to
	 * transmit.
	 */
397 if (so)
398 *txsegsizep = min(so->so_snd.sb_hiwat >> 1, *txsegsizep);
399 *rxsegsizep = min(tp->t_ourmss - optlen, size);
400
401 if (*txsegsizep != tp->t_segsz) {
402 /*
403 * If the new segment size is larger, we don't want to
404 * mess up the congestion window, but if it is smaller
405 * we'll have to reduce the congestion window to ensure
406 * that we don't get into trouble with initial windows
407 * and the rest. In any case, if the segment size
408 * has changed, chances are the path has, too, and
409 * our congestion window will be different.
410 */
411 if (*txsegsizep < tp->t_segsz) {
412 tp->snd_cwnd = max((tp->snd_cwnd / tp->t_segsz)
413 * *txsegsizep, *txsegsizep);
414 tp->snd_ssthresh = max((tp->snd_ssthresh / tp->t_segsz)
415 * *txsegsizep, *txsegsizep);
416 }
417 tp->t_segsz = *txsegsizep;
418 }
419
420 return (0);
421 }
422
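/*
 * tcp_build_datapkt: allocate a header mbuf with room for max_linkhdr +
 * hdrlen bytes of headers, then append len bytes of data taken from the
 * socket send buffer at offset off, either copied into the header mbuf
 * (when it fits in the trailing space) or referenced via m_copy().
 * The mbuf and offset of the previous send are cached in the tcpcb so
 * that large windows do not require rescanning sb_mb from the start.
 */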
423 static
424 #ifndef GPROF
425 inline
426 #endif
427 int
428 tcp_build_datapkt(struct tcpcb *tp, struct socket *so, int off,
429 long len, int hdrlen, struct mbuf **mp)
430 {
431 struct mbuf *m, *m0;
432
433 if (tp->t_force && len == 1)
434 tcpstat.tcps_sndprobe++;
435 else if (SEQ_LT(tp->snd_nxt, tp->snd_max)) {
436 tcpstat.tcps_sndrexmitpack++;
437 tcpstat.tcps_sndrexmitbyte += len;
438 } else {
439 tcpstat.tcps_sndpack++;
440 tcpstat.tcps_sndbyte += len;
441 }
442 #ifdef notyet
443 if ((m = m_copypack(so->so_snd.sb_mb, off,
444 (int)len, max_linkhdr + hdrlen)) == 0)
445 return (ENOBUFS);
446 /*
447 * m_copypack left space for our hdr; use it.
448 */
449 m->m_len += hdrlen;
450 m->m_data -= hdrlen;
451 #else
452 MGETHDR(m, M_DONTWAIT, MT_HEADER);
453 if (__predict_false(m == NULL))
454 return (ENOBUFS);
455 MCLAIM(m, &tcp_tx_mowner);
456
457 /*
458 * XXX Because other code assumes headers will fit in
459 * XXX one header mbuf.
460 *
461 * (This code should almost *never* be run.)
462 */
463 if (__predict_false((max_linkhdr + hdrlen) > MHLEN)) {
464 TCP_OUTPUT_COUNTER_INCR(&tcp_output_bigheader);
465 MCLGET(m, M_DONTWAIT);
466 if ((m->m_flags & M_EXT) == 0) {
467 m_freem(m);
468 return (ENOBUFS);
469 }
470 }
471
472 m->m_data += max_linkhdr;
473 m->m_len = hdrlen;
474
475 /*
476 * To avoid traversing the whole sb_mb chain for correct
477 * data to send, remember last sent mbuf, its offset and
478 * the sent size. When called the next time, see if the
479 * data to send is directly following the previous transfer.
480 * This is important for large TCP windows.
481 */
482 if (off == 0 || tp->t_lastm == NULL ||
483 (tp->t_lastoff + tp->t_lastlen) != off) {
484 TCP_OUTPUT_COUNTER_INCR(&tcp_output_predict_miss);
485 /*
486 * Either a new packet or a retransmit.
487 * Start from the beginning.
488 */
489 tp->t_lastm = so->so_snd.sb_mb;
490 tp->t_inoff = off;
491 } else {
492 TCP_OUTPUT_COUNTER_INCR(&tcp_output_predict_hit);
493 tp->t_inoff += tp->t_lastlen;
494 }
495
496 /* Traverse forward to next packet */
497 while (tp->t_inoff > 0) {
498 if (tp->t_lastm == NULL)
499 panic("tp->t_lastm == NULL");
500 if (tp->t_inoff < tp->t_lastm->m_len)
501 break;
502 tp->t_inoff -= tp->t_lastm->m_len;
503 tp->t_lastm = tp->t_lastm->m_next;
504 }
505
506 tp->t_lastoff = off;
507 tp->t_lastlen = len;
508 m0 = tp->t_lastm;
509 off = tp->t_inoff;
510
511 if (len <= M_TRAILINGSPACE(m)) {
512 m_copydata(m0, off, (int) len, mtod(m, caddr_t) + hdrlen);
513 m->m_len += len;
514 TCP_OUTPUT_COUNTER_INCR(&tcp_output_copysmall);
515 } else {
516 m->m_next = m_copy(m0, off, (int) len);
517 if (m->m_next == NULL) {
518 m_freem(m);
519 return (ENOBUFS);
520 }
521 #ifdef TCP_OUTPUT_COUNTERS
522 if (m->m_next->m_flags & M_EXT)
523 TCP_OUTPUT_COUNTER_INCR(&tcp_output_refbig);
524 else
525 TCP_OUTPUT_COUNTER_INCR(&tcp_output_copybig);
526 #endif /* TCP_OUTPUT_COUNTERS */
527 }
528 #endif
529
530 *mp = m;
531 return (0);
532 }
533
/*
 * TCP output routine: figure out what should be sent and send it.
 */
537 int
538 tcp_output(struct tcpcb *tp)
539 {
540 struct socket *so;
541 struct route *ro;
542 long len, win;
543 int off, flags, error;
544 struct mbuf *m;
545 struct ip *ip;
546 #ifdef INET6
547 struct ip6_hdr *ip6;
548 #endif
549 struct tcphdr *th;
550 u_char opt[MAX_TCPOPTLEN];
551 unsigned optlen, hdrlen, packetlen;
552 unsigned int sack_numblks;
553 int idle, sendalot, txsegsize, rxsegsize;
554 int txsegsize_nosack;
555 int maxburst = TCP_MAXBURST;
556 int af; /* address family on the wire */
557 int iphdrlen;
558 int has_tso, use_tso;
559 int sack_rxmit;
560 int sack_bytes_rxmt;
561 struct sackhole *p;
562 #ifdef TCP_SIGNATURE
563 int sigoff = 0;
564 #endif
565
566 #ifdef DIAGNOSTIC
567 if (tp->t_inpcb && tp->t_in6pcb)
568 panic("tcp_output: both t_inpcb and t_in6pcb are set");
569 #endif
570 so = NULL;
571 ro = NULL;
572 if (tp->t_inpcb) {
573 so = tp->t_inpcb->inp_socket;
574 ro = &tp->t_inpcb->inp_route;
575 }
576 #ifdef INET6
577 else if (tp->t_in6pcb) {
578 so = tp->t_in6pcb->in6p_socket;
579 ro = (struct route *)&tp->t_in6pcb->in6p_route;
580 }
581 #endif
582
583 switch (af = tp->t_family) {
584 #ifdef INET
585 case AF_INET:
586 if (tp->t_inpcb)
587 break;
588 #ifdef INET6
589 /* mapped addr case */
590 if (tp->t_in6pcb)
591 break;
592 #endif
593 return (EINVAL);
594 #endif
595 #ifdef INET6
596 case AF_INET6:
597 if (tp->t_in6pcb)
598 break;
599 return (EINVAL);
600 #endif
601 default:
602 return (EAFNOSUPPORT);
603 }
604
605 if (tcp_segsize(tp, &txsegsize, &rxsegsize))
606 return (EMSGSIZE);
607
608 idle = (tp->snd_max == tp->snd_una);
609
610 /*
611 * Determine if we can use TCP segmentation offload:
612 * - If we're using IPv4
613 * - If there is not an IPsec policy that prevents it
614 * - If the interface can do it
615 */
616 has_tso = tp->t_inpcb != NULL &&
617 #if defined(IPSEC) || defined(FAST_IPSEC)
618 IPSEC_PCB_SKIP_IPSEC(tp->t_inpcb->inp_sp,
619 IPSEC_DIR_OUTBOUND) &&
620 #endif
621 tp->t_inpcb->inp_route.ro_rt != NULL &&
622 (tp->t_inpcb->inp_route.ro_rt->rt_ifp->if_capenable &
623 IFCAP_TSOv4) != 0;
624
625 /*
626 * Restart Window computation. From draft-floyd-incr-init-win-03:
627 *
628 * Optionally, a TCP MAY set the restart window to the
629 * minimum of the value used for the initial window and
630 * the current value of cwnd (in other words, using a
631 * larger value for the restart window should never increase
632 * the size of cwnd).
633 */
634 if (tcp_cwm) {
635 /*
636 * Hughes/Touch/Heidemann Congestion Window Monitoring.
637 * Count the number of packets currently pending
638 * acknowledgement, and limit our congestion window
639 * to a pre-determined allowed burst size plus that count.
640 * This prevents bursting once all pending packets have
641 * been acknowledged (i.e. transmission is idle).
642 *
643 * XXX Link this to Initial Window?
644 */
645 tp->snd_cwnd = min(tp->snd_cwnd,
646 (tcp_cwm_burstsize * txsegsize) +
647 (tp->snd_nxt - tp->snd_una));
648 } else {
649 if (idle && (tcp_now - tp->t_rcvtime) >= tp->t_rxtcur) {
650 /*
651 * We have been idle for "a while" and no acks are
652 * expected to clock out any data we send --
653 * slow start to get ack "clock" running again.
654 */
655 int ss = tcp_init_win;
656 #ifdef INET
657 if (tp->t_inpcb &&
658 in_localaddr(tp->t_inpcb->inp_faddr))
659 ss = tcp_init_win_local;
660 #endif
661 #ifdef INET6
662 if (tp->t_in6pcb &&
663 in6_localaddr(&tp->t_in6pcb->in6p_faddr))
664 ss = tcp_init_win_local;
665 #endif
666 tp->snd_cwnd = min(tp->snd_cwnd,
667 TCP_INITIAL_WINDOW(ss, txsegsize));
668 }
669 }
670
671 txsegsize_nosack = txsegsize;
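	/*
	 * Everything from "again:" down may run multiple times per call:
	 * sendalot forces another pass when more data remains to be sent,
	 * bounded by maxburst when NewReno is enabled.
	 */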
672 again:
673 use_tso = has_tso;
674 TCP_REASS_LOCK(tp);
675 sack_numblks = tcp_sack_numblks(tp);
676 if (sack_numblks) {
677 if ((tp->rcv_sack_flags & TCPSACK_HAVED) != 0) {
678 /* don't duplicate D-SACK. */
679 use_tso = 0;
680 }
681 txsegsize = txsegsize_nosack - TCP_SACK_OPTLEN(sack_numblks);
682 } else {
683 txsegsize = txsegsize_nosack;
684 }
685
686 /*
687 * Determine length of data that should be transmitted, and
688 * flags that should be used. If there is some data or critical
689 * controls (SYN, RST) to send, then transmit; otherwise,
690 * investigate further.
691 *
692 * Readjust SACK information to avoid resending duplicate data.
693 */
694 if (TCP_SACK_ENABLED(tp) && SEQ_LT(tp->snd_nxt, tp->snd_max))
695 tcp_sack_adjust(tp);
696 sendalot = 0;
697 off = tp->snd_nxt - tp->snd_una;
698 win = min(tp->snd_wnd, tp->snd_cwnd);
699
700 flags = tcp_outflags[tp->t_state];
701
702 /*
703 * Send any SACK-generated retransmissions. If we're explicitly trying
704 * to send out new data (when sendalot is 1), bypass this function.
705 * If we retransmit in fast recovery mode, decrement snd_cwnd, since
706 * we're replacing a (future) new transmission with a retransmission
707 * now, and we previously incremented snd_cwnd in tcp_input().
708 */
	/*
	 * Still in SACK recovery; reset the rxmit flag to zero.
	 */
712 sack_rxmit = 0;
713 sack_bytes_rxmt = 0;
714 len = 0;
715 p = NULL;
716 do {
717 long cwin;
718 if (!TCP_SACK_ENABLED(tp))
719 break;
720 if (tp->t_partialacks < 0)
721 break;
722 p = tcp_sack_output(tp, &sack_bytes_rxmt);
723 if (p == NULL)
724 break;
725
726 cwin = min(tp->snd_wnd, tp->snd_cwnd) - sack_bytes_rxmt;
727 if (cwin < 0)
728 cwin = 0;
729 /* Do not retransmit SACK segments beyond snd_recover */
730 if (SEQ_GT(p->end, tp->snd_recover)) {
731 /*
732 * (At least) part of sack hole extends beyond
733 * snd_recover. Check to see if we can rexmit data
734 * for this hole.
735 */
736 if (SEQ_GEQ(p->rxmit, tp->snd_recover)) {
737 /*
738 * Can't rexmit any more data for this hole.
739 * That data will be rexmitted in the next
740 * sack recovery episode, when snd_recover
741 * moves past p->rxmit.
742 */
743 p = NULL;
744 break;
745 }
746 /* Can rexmit part of the current hole */
747 len = ((long)ulmin(cwin, tp->snd_recover - p->rxmit));
748 } else
749 len = ((long)ulmin(cwin, p->end - p->rxmit));
750 off = p->rxmit - tp->snd_una;
751 if (off + len > so->so_snd.sb_cc) {
752 /* 1 for TH_FIN */
753 KASSERT(off + len == so->so_snd.sb_cc + 1);
754 KASSERT(p->rxmit + len == tp->snd_max);
755 len = so->so_snd.sb_cc - off;
756 }
757 if (len > 0) {
758 sack_rxmit = 1;
759 sendalot = 1;
760 }
761 } while (/*CONSTCOND*/0);
762
763 /*
764 * If in persist timeout with window of 0, send 1 byte.
765 * Otherwise, if window is small but nonzero
766 * and timer expired, we will send what we can
767 * and go to transmit state.
768 */
769 if (tp->t_force) {
770 if (win == 0) {
			/*
			 * If we still have some data to send, then
			 * clear the FIN bit.  Usually this would
			 * happen below when it realizes that we
			 * aren't sending all the data.  However,
			 * if we have exactly 1 byte of unsent data,
			 * then it won't clear the FIN bit below,
			 * and if we are in persist state, we wind
			 * up sending the packet without recording
			 * that we sent the FIN bit.
			 *
			 * We can't just blindly clear the FIN bit,
			 * because if we don't have any more data
			 * to send then the probe will be the FIN
			 * itself.
			 */
787 if (off < so->so_snd.sb_cc)
788 flags &= ~TH_FIN;
789 win = 1;
790 } else {
791 TCP_TIMER_DISARM(tp, TCPT_PERSIST);
792 tp->t_rxtshift = 0;
793 }
794 }
795
796 if (!TCP_SACK_ENABLED(tp)) {
797 if (win < so->so_snd.sb_cc) {
798 len = win - off;
799 flags &= ~TH_FIN;
800 } else
801 len = so->so_snd.sb_cc - off;
802 } else if (sack_rxmit == 0) {
803 if (sack_bytes_rxmt != 0) {
804 long cwin;
805
806 /*
807 * We are inside of a SACK recovery episode and are
808 * sending new data, having retransmitted all the
809 * data possible in the scoreboard.
810 */
811 if (tp->snd_wnd < so->so_snd.sb_cc) {
812 len = tp->snd_wnd - off;
813 flags &= ~TH_FIN;
814 } else {
815 len = so->so_snd.sb_cc - off;
816 }
817
818 /*
819 * From FreeBSD:
820 * Don't remove this (len > 0) check !
821 * We explicitly check for len > 0 here (although it
822 * isn't really necessary), to work around a gcc
823 * optimization issue - to force gcc to compute
824 * len above. Without this check, the computation
825 * of len is bungled by the optimizer.
826 */
827 if (len > 0) {
828 cwin = tp->snd_cwnd -
829 (tp->snd_nxt - tp->sack_newdata) -
830 sack_bytes_rxmt;
831 if (cwin < 0)
832 cwin = 0;
833 if (cwin < len) {
834 len = cwin;
835 flags &= ~TH_FIN;
836 }
837 }
838 } else if (win < so->so_snd.sb_cc) {
839 len = win - off;
840 flags &= ~TH_FIN;
841 } else
842 len = so->so_snd.sb_cc - off;
843 }
844
845 if (len < 0) {
846 /*
847 * If FIN has been sent but not acked,
848 * but we haven't been called to retransmit,
849 * len will be -1. Otherwise, window shrank
850 * after we sent into it. If window shrank to 0,
851 * cancel pending retransmit, pull snd_nxt back
852 * to (closed) window, and set the persist timer
853 * if it isn't already going. If the window didn't
854 * close completely, just wait for an ACK.
855 *
856 * If we have a pending FIN, either it has already been
857 * transmitted or it is outside the window, so drop it.
858 * If the FIN has been transmitted, but this is not a
859 * retransmission, then len must be -1. Therefore we also
860 * prevent here the sending of `gratuitous FINs'. This
861 * eliminates the need to check for that case below (e.g.
862 * to back up snd_nxt before the FIN so that the sequence
863 * number is correct).
864 */
865 len = 0;
866 flags &= ~TH_FIN;
867 if (win == 0) {
868 TCP_TIMER_DISARM(tp, TCPT_REXMT);
869 tp->t_rxtshift = 0;
870 tp->snd_nxt = tp->snd_una;
871 if (TCP_TIMER_ISARMED(tp, TCPT_PERSIST) == 0)
872 tcp_setpersist(tp);
873 }
874 }
875 if (len > txsegsize) {
876 if (use_tso) {
877 /*
878 * Truncate TSO transfers to IP_MAXPACKET, and make
879 * sure that we send equal size transfers down the
880 * stack (rather than big-small-big-small-...).
881 */
882 len = (min(len, IP_MAXPACKET) / txsegsize) * txsegsize;
883 if (len <= txsegsize) {
884 use_tso = 0;
885 }
886 } else
887 len = txsegsize;
888 flags &= ~TH_FIN;
889 sendalot = 1;
890 } else
891 use_tso = 0;
892 if (sack_rxmit) {
893 if (SEQ_LT(p->rxmit + len, tp->snd_una + so->so_snd.sb_cc))
894 flags &= ~TH_FIN;
895 }
896
897 win = sbspace(&so->so_rcv);
898
	/*
	 * Sender silly window avoidance.  Send if the connection is idle
	 * and we can send all queued data, if we can send at least a
	 * full-sized segment, or if we are being forced (persist probe);
	 * otherwise don't bother.
	 * If the peer's buffer is tiny, then send
	 * when the window is at least half open.
	 * If retransmitting (possibly after the persist timer forced us
	 * to send into a small window), then we must resend.
	 */
909 if (len) {
910 if (len >= txsegsize)
911 goto send;
912 if ((so->so_state & SS_MORETOCOME) == 0 &&
913 ((idle || tp->t_flags & TF_NODELAY) &&
914 len + off >= so->so_snd.sb_cc))
915 goto send;
916 if (tp->t_force)
917 goto send;
918 if (len >= tp->max_sndwnd / 2)
919 goto send;
920 if (SEQ_LT(tp->snd_nxt, tp->snd_max))
921 goto send;
922 if (sack_rxmit)
923 goto send;
924 }
925
	/*
	 * Compare the available window to the amount of window known to
	 * the peer (as the advertised window less next expected input).
	 * If the difference is at least twice the size of the largest
	 * segment we expect to receive (i.e. two segments) or at least
	 * 50% of the maximum possible window, then we want to send a
	 * window update to the peer.
	 */
934 if (win > 0) {
935 /*
936 * "adv" is the amount we can increase the window,
937 * taking into account that we are limited by
938 * TCP_MAXWIN << tp->rcv_scale.
939 */
940 long adv = min(win, (long)TCP_MAXWIN << tp->rcv_scale) -
941 (tp->rcv_adv - tp->rcv_nxt);
942
943 if (adv >= (long) (2 * rxsegsize))
944 goto send;
945 if (2 * adv >= (long) so->so_rcv.sb_hiwat)
946 goto send;
947 }
948
949 /*
950 * Send if we owe peer an ACK.
951 */
952 if (tp->t_flags & TF_ACKNOW)
953 goto send;
954 if (flags & (TH_SYN|TH_FIN|TH_RST))
955 goto send;
956 if (SEQ_GT(tp->snd_up, tp->snd_una))
957 goto send;
958 /*
959 * In SACK, it is possible for tcp_output to fail to send a segment
960 * after the retransmission timer has been turned off. Make sure
961 * that the retransmission timer is set.
962 */
963 if (TCP_SACK_ENABLED(tp) && SEQ_GT(tp->snd_max, tp->snd_una) &&
964 !TCP_TIMER_ISARMED(tp, TCPT_REXMT) &&
965 !TCP_TIMER_ISARMED(tp, TCPT_PERSIST)) {
966 TCP_TIMER_ARM(tp, TCPT_REXMT, tp->t_rxtcur);
967 goto just_return;
968 }
969
	/*
	 * TCP window updates are not reliable; rather, a polling protocol
	 * using ``persist'' packets is used to ensure receipt of window
	 * updates.  The three ``states'' for the output side are:
	 *	idle			not doing retransmits or persists
	 *	persisting		to move a small or zero window
	 *	(re)transmitting	and thereby not persisting
	 *
	 * tp->t_timer[TCPT_PERSIST]
	 *	is set when we are in persist state.
	 * tp->t_force
	 *	is set when we are called to send a persist packet.
	 * tp->t_timer[TCPT_REXMT]
	 *	is set when we are retransmitting.
	 * The output side is idle when both timers are zero.
	 *
	 * If the send window is too small, there is data to transmit, and no
	 * retransmit or persist is pending, then go to persist state.
	 * If nothing happens soon, send when the timer expires:
	 * if the window is nonzero, transmit what we can,
	 * otherwise force out a byte.
	 */
992 if (so->so_snd.sb_cc && TCP_TIMER_ISARMED(tp, TCPT_REXMT) == 0 &&
993 TCP_TIMER_ISARMED(tp, TCPT_PERSIST) == 0) {
994 tp->t_rxtshift = 0;
995 tcp_setpersist(tp);
996 }
997
998 /*
999 * No reason to send a segment, just return.
1000 */
1001 just_return:
1002 TCP_REASS_UNLOCK(tp);
1003 return (0);
1004
1005 send:
	/*
	 * Before ESTABLISHED, force sending of initial options
	 * unless TCP is set not to do any options.
	 * NOTE: we assume that the IP/TCP header plus TCP options
	 * always fit in a single mbuf, leaving room for a maximum
	 * link header, i.e.
	 *	max_linkhdr + sizeof (struct tcpiphdr) + optlen <= MCLBYTES
	 */
1014 optlen = 0;
1015 switch (af) {
1016 #ifdef INET
1017 case AF_INET:
1018 iphdrlen = sizeof(struct ip) + sizeof(struct tcphdr);
1019 break;
1020 #endif
1021 #ifdef INET6
1022 case AF_INET6:
1023 iphdrlen = sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
1024 break;
1025 #endif
1026 default: /*pacify gcc*/
1027 iphdrlen = 0;
1028 break;
1029 }
1030 hdrlen = iphdrlen;
1031 if (flags & TH_SYN) {
1032 struct rtentry *rt;
1033
1034 rt = NULL;
1035 #ifdef INET
1036 if (tp->t_inpcb)
1037 rt = in_pcbrtentry(tp->t_inpcb);
1038 #endif
1039 #ifdef INET6
1040 if (tp->t_in6pcb)
1041 rt = in6_pcbrtentry(tp->t_in6pcb);
1042 #endif
1043
1044 tp->snd_nxt = tp->iss;
1045 tp->t_ourmss = tcp_mss_to_advertise(rt != NULL ?
1046 rt->rt_ifp : NULL, af);
1047 if ((tp->t_flags & TF_NOOPT) == 0) {
1048 opt[0] = TCPOPT_MAXSEG;
1049 opt[1] = 4;
1050 opt[2] = (tp->t_ourmss >> 8) & 0xff;
1051 opt[3] = tp->t_ourmss & 0xff;
1052 optlen = 4;
1053
1054 if ((tp->t_flags & TF_REQ_SCALE) &&
1055 ((flags & TH_ACK) == 0 ||
1056 (tp->t_flags & TF_RCVD_SCALE))) {
1057 *((u_int32_t *) (opt + optlen)) = htonl(
1058 TCPOPT_NOP << 24 |
1059 TCPOPT_WINDOW << 16 |
1060 TCPOLEN_WINDOW << 8 |
1061 tp->request_r_scale);
1062 optlen += 4;
1063 }
1064 if (tcp_do_sack) {
1065 u_int8_t *cp = (u_int8_t *)(opt + optlen);
1066
1067 cp[0] = TCPOPT_SACK_PERMITTED;
1068 cp[1] = 2;
1069 cp[2] = TCPOPT_NOP;
1070 cp[3] = TCPOPT_NOP;
1071 optlen += 4;
1072 }
1073 }
1074 }
1075
1076 /*
1077 * Send a timestamp and echo-reply if this is a SYN and our side
1078 * wants to use timestamps (TF_REQ_TSTMP is set) or both our side
1079 * and our peer have sent timestamps in our SYN's.
1080 */
1081 if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP &&
1082 (flags & TH_RST) == 0 &&
1083 ((flags & (TH_SYN|TH_ACK)) == TH_SYN ||
1084 (tp->t_flags & TF_RCVD_TSTMP))) {
1085 u_int32_t *lp = (u_int32_t *)(opt + optlen);
1086
1087 /* Form timestamp option as shown in appendix A of RFC 1323. */
1088 *lp++ = htonl(TCPOPT_TSTAMP_HDR);
1089 *lp++ = htonl(TCP_TIMESTAMP(tp));
1090 *lp = htonl(tp->ts_recent);
1091 optlen += TCPOLEN_TSTAMP_APPA;
1092 }
1093
1094 /*
1095 * Tack on the SACK block if it is necessary.
1096 */
1097 if (sack_numblks) {
1098 int sack_len;
1099 u_char *bp = (u_char *)(opt + optlen);
1100 u_int32_t *lp = (u_int32_t *)(bp + 4);
1101 struct ipqent *tiqe;
1102
1103 sack_len = sack_numblks * 8 + 2;
1104 bp[0] = TCPOPT_NOP;
1105 bp[1] = TCPOPT_NOP;
1106 bp[2] = TCPOPT_SACK;
1107 bp[3] = sack_len;
1108 if ((tp->rcv_sack_flags & TCPSACK_HAVED) != 0) {
1109 sack_numblks--;
1110 *lp++ = htonl(tp->rcv_dsack_block.left);
1111 *lp++ = htonl(tp->rcv_dsack_block.right);
1112 tp->rcv_sack_flags &= ~TCPSACK_HAVED;
1113 }
1114 for (tiqe = TAILQ_FIRST(&tp->timeq);
1115 sack_numblks > 0; tiqe = TAILQ_NEXT(tiqe, ipqe_timeq)) {
1116 KASSERT(tiqe != NULL);
1117 sack_numblks--;
1118 *lp++ = htonl(tiqe->ipqe_seq);
1119 *lp++ = htonl(tiqe->ipqe_seq + tiqe->ipqe_len +
1120 ((tiqe->ipqe_flags & TH_FIN) != 0 ? 1 : 0));
1121 }
1122 optlen += sack_len + 2;
1123 }
1124 TCP_REASS_UNLOCK(tp);
1125
1126 #ifdef TCP_SIGNATURE
1127 #if defined(INET6) && defined(FAST_IPSEC)
1128 if (tp->t_family == AF_INET)
1129 #endif
1130 if (tp->t_flags & TF_SIGNATURE) {
1131 u_char *bp;
1132 /*
1133 * Initialize TCP-MD5 option (RFC2385)
1134 */
1135 bp = (u_char *)opt + optlen;
1136 *bp++ = TCPOPT_SIGNATURE;
1137 *bp++ = TCPOLEN_SIGNATURE;
1138 sigoff = optlen + 2;
1139 bzero(bp, TCP_SIGLEN);
1140 bp += TCP_SIGLEN;
1141 optlen += TCPOLEN_SIGNATURE;
1142 /*
1143 * Terminate options list and maintain 32-bit alignment.
1144 */
1145 *bp++ = TCPOPT_NOP;
1146 *bp++ = TCPOPT_EOL;
1147 optlen += 2;
1148 }
1149 #endif /* TCP_SIGNATURE */
1150
1151 hdrlen += optlen;
1152
1153 #ifdef DIAGNOSTIC
1154 if (!use_tso && len > txsegsize)
1155 panic("tcp data to be sent is larger than segment");
1156 else if (use_tso && len > IP_MAXPACKET)
1157 panic("tcp data to be sent is larger than max TSO size");
1158 if (max_linkhdr + hdrlen > MCLBYTES)
1159 panic("tcphdr too big");
1160 #endif
1161
1162 /*
1163 * Grab a header mbuf, attaching a copy of data to
1164 * be transmitted, and initialize the header from
1165 * the template for sends on this connection.
1166 */
1167 if (len) {
1168 error = tcp_build_datapkt(tp, so, off, len, hdrlen, &m);
1169 if (error)
1170 goto out;
1171 /*
1172 * If we're sending everything we've got, set PUSH.
1173 * (This will keep happy those implementations which only
1174 * give data to the user when a buffer fills or
1175 * a PUSH comes in.)
1176 */
1177 if (off + len == so->so_snd.sb_cc)
1178 flags |= TH_PUSH;
1179 } else {
1180 if (tp->t_flags & TF_ACKNOW)
1181 tcpstat.tcps_sndacks++;
1182 else if (flags & (TH_SYN|TH_FIN|TH_RST))
1183 tcpstat.tcps_sndctrl++;
1184 else if (SEQ_GT(tp->snd_up, tp->snd_una))
1185 tcpstat.tcps_sndurg++;
1186 else
1187 tcpstat.tcps_sndwinup++;
1188
1189 MGETHDR(m, M_DONTWAIT, MT_HEADER);
1190 if (m != NULL && max_linkhdr + hdrlen > MHLEN) {
1191 MCLGET(m, M_DONTWAIT);
1192 if ((m->m_flags & M_EXT) == 0) {
1193 m_freem(m);
1194 m = NULL;
1195 }
1196 }
1197 if (m == NULL) {
1198 error = ENOBUFS;
1199 goto out;
1200 }
1201 MCLAIM(m, &tcp_tx_mowner);
1202 m->m_data += max_linkhdr;
1203 m->m_len = hdrlen;
1204 }
1205 m->m_pkthdr.rcvif = (struct ifnet *)0;
1206 switch (af) {
1207 #ifdef INET
1208 case AF_INET:
1209 ip = mtod(m, struct ip *);
1210 #ifdef INET6
1211 ip6 = NULL;
1212 #endif
1213 th = (struct tcphdr *)(ip + 1);
1214 break;
1215 #endif
1216 #ifdef INET6
1217 case AF_INET6:
1218 ip = NULL;
1219 ip6 = mtod(m, struct ip6_hdr *);
1220 th = (struct tcphdr *)(ip6 + 1);
1221 break;
1222 #endif
1223 default: /*pacify gcc*/
1224 ip = NULL;
1225 #ifdef INET6
1226 ip6 = NULL;
1227 #endif
1228 th = NULL;
1229 break;
1230 }
1231 if (tp->t_template == 0)
1232 panic("tcp_output");
1233 if (tp->t_template->m_len < iphdrlen)
1234 panic("tcp_output");
1235 bcopy(mtod(tp->t_template, caddr_t), mtod(m, caddr_t), iphdrlen);
1236
1237 /*
1238 * If we are doing retransmissions, then snd_nxt will
1239 * not reflect the first unsent octet. For ACK only
1240 * packets, we do not want the sequence number of the
1241 * retransmitted packet, we want the sequence number
1242 * of the next unsent octet. So, if there is no data
1243 * (and no SYN or FIN), use snd_max instead of snd_nxt
1244 * when filling in ti_seq. But if we are in persist
1245 * state, snd_max might reflect one byte beyond the
1246 * right edge of the window, so use snd_nxt in that
1247 * case, since we know we aren't doing a retransmission.
1248 * (retransmit and persist are mutually exclusive...)
1249 */
1250 if (TCP_SACK_ENABLED(tp) && sack_rxmit) {
1251 th->th_seq = htonl(p->rxmit);
1252 p->rxmit += len;
1253 } else {
1254 if (len || (flags & (TH_SYN|TH_FIN)) ||
1255 TCP_TIMER_ISARMED(tp, TCPT_PERSIST))
1256 th->th_seq = htonl(tp->snd_nxt);
1257 else
1258 th->th_seq = htonl(tp->snd_max);
1259 }
1260 th->th_ack = htonl(tp->rcv_nxt);
1261 if (optlen) {
1262 bcopy((caddr_t)opt, (caddr_t)(th + 1), optlen);
1263 th->th_off = (sizeof (struct tcphdr) + optlen) >> 2;
1264 }
1265 th->th_flags = flags;
1266 /*
1267 * Calculate receive window. Don't shrink window,
1268 * but avoid silly window syndrome.
1269 */
1270 if (win < (long)(so->so_rcv.sb_hiwat / 4) && win < (long)rxsegsize)
1271 win = 0;
1272 if (win > (long)TCP_MAXWIN << tp->rcv_scale)
1273 win = (long)TCP_MAXWIN << tp->rcv_scale;
1274 if (win < (long)(int32_t)(tp->rcv_adv - tp->rcv_nxt))
1275 win = (long)(int32_t)(tp->rcv_adv - tp->rcv_nxt);
1276 th->th_win = htons((u_int16_t) (win>>tp->rcv_scale));
1277 if (SEQ_GT(tp->snd_up, tp->snd_nxt)) {
1278 u_int32_t urp = tp->snd_up - tp->snd_nxt;
1279 if (urp > IP_MAXPACKET)
1280 urp = IP_MAXPACKET;
1281 th->th_urp = htons((u_int16_t)urp);
1282 th->th_flags |= TH_URG;
1283 } else
1284 /*
1285 * If no urgent pointer to send, then we pull
1286 * the urgent pointer to the left edge of the send window
1287 * so that it doesn't drift into the send window on sequence
1288 * number wraparound.
1289 */
1290 tp->snd_up = tp->snd_una; /* drag it along */
1291
1292 #ifdef TCP_SIGNATURE
1293 #if defined(INET6) && defined(FAST_IPSEC)
1294 if (tp->t_family == AF_INET) /* XXX */
1295 #endif
1296 if (sigoff && (tp->t_flags & TF_SIGNATURE)) {
1297 struct secasvar *sav;
1298 u_int8_t *sigp;
1299
1300 sav = tcp_signature_getsav(m, th);
1301
1302 if (sav == NULL) {
1303 if (m)
1304 m_freem(m);
1305 return (EPERM);
1306 }
1307
1308 m->m_pkthdr.len = hdrlen + len;
1309 sigp = (caddr_t)th + sizeof(*th) + sigoff;
1310 tcp_signature(m, th, (caddr_t)th - mtod(m, caddr_t), sav, sigp);
1311
1312 key_sa_recordxfer(sav, m);
1313 #ifdef FAST_IPSEC
1314 KEY_FREESAV(&sav);
1315 #else
1316 key_freesav(sav);
1317 #endif
1318 }
1319 #endif
1320
1321 /*
1322 * Set ourselves up to be checksummed just before the packet
1323 * hits the wire.
1324 */
1325 switch (af) {
1326 #ifdef INET
1327 case AF_INET:
1328 m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
1329 if (use_tso) {
1330 m->m_pkthdr.segsz = txsegsize;
1331 m->m_pkthdr.csum_flags = M_CSUM_TSOv4;
1332 } else {
1333 m->m_pkthdr.csum_flags = M_CSUM_TCPv4;
1334 if (len + optlen) {
1335 /* Fixup the pseudo-header checksum. */
1336 /* XXXJRT Not IP Jumbogram safe. */
1337 th->th_sum = in_cksum_addword(th->th_sum,
1338 htons((u_int16_t) (len + optlen)));
1339 }
1340 }
1341 break;
1342 #endif
1343 #ifdef INET6
1344 case AF_INET6:
1345 m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
1346 m->m_pkthdr.csum_flags = M_CSUM_TCPv6;
1347 if (len + optlen) {
1348 /* Fixup the pseudo-header checksum. */
1349 /* XXXJRT: Not IPv6 Jumbogram safe. */
1350 th->th_sum = in_cksum_addword(th->th_sum,
1351 htons((u_int16_t) (len + optlen)));
1352 }
1353 break;
1354 #endif
1355 }
1356
1357 /*
1358 * In transmit state, time the transmission and arrange for
1359 * the retransmit. In persist state, just set snd_max.
1360 */
1361 if (tp->t_force == 0 || TCP_TIMER_ISARMED(tp, TCPT_PERSIST) == 0) {
1362 tcp_seq startseq = tp->snd_nxt;
1363
1364 /*
1365 * Advance snd_nxt over sequence space of this segment.
1366 * There are no states in which we send both a SYN and a FIN,
1367 * so we collapse the tests for these flags.
1368 */
1369 if (flags & (TH_SYN|TH_FIN))
1370 tp->snd_nxt++;
1371 if (sack_rxmit)
1372 goto timer;
1373 tp->snd_nxt += len;
1374 if (SEQ_GT(tp->snd_nxt, tp->snd_max)) {
1375 tp->snd_max = tp->snd_nxt;
1376 /*
1377 * Time this transmission if not a retransmission and
1378 * not currently timing anything.
1379 */
1380 if (tp->t_rtttime == 0) {
1381 tp->t_rtttime = tcp_now;
1382 tp->t_rtseq = startseq;
1383 tcpstat.tcps_segstimed++;
1384 }
1385 }
1386
1387 /*
1388 * Set retransmit timer if not currently set,
1389 * and not doing an ack or a keep-alive probe.
1390 * Initial value for retransmit timer is smoothed
1391 * round-trip time + 2 * round-trip time variance.
1392 * Initialize shift counter which is used for backoff
1393 * of retransmit time.
1394 */
1395 timer:
1396 if (TCP_TIMER_ISARMED(tp, TCPT_REXMT) == 0 &&
1397 ((sack_rxmit && tp->snd_nxt != tp->snd_max) ||
1398 tp->snd_nxt != tp->snd_una)) {
1399 if (TCP_TIMER_ISARMED(tp, TCPT_PERSIST)) {
1400 TCP_TIMER_DISARM(tp, TCPT_PERSIST);
1401 tp->t_rxtshift = 0;
1402 }
1403 TCP_TIMER_ARM(tp, TCPT_REXMT, tp->t_rxtcur);
1404 }
1405 } else
1406 if (SEQ_GT(tp->snd_nxt + len, tp->snd_max))
1407 tp->snd_max = tp->snd_nxt + len;
1408
1409 #ifdef TCP_DEBUG
1410 /*
1411 * Trace.
1412 */
1413 if (so->so_options & SO_DEBUG)
1414 tcp_trace(TA_OUTPUT, tp->t_state, tp, m, 0);
1415 #endif
1416
1417 /*
1418 * Fill in IP length and desired time to live and
1419 * send to IP level. There should be a better way
1420 * to handle ttl and tos; we could keep them in
1421 * the template, but need a way to checksum without them.
1422 */
1423 m->m_pkthdr.len = hdrlen + len;
1424
1425 switch (af) {
1426 #ifdef INET
1427 case AF_INET:
1428 ip->ip_len = htons(m->m_pkthdr.len);
1429 packetlen = m->m_pkthdr.len;
1430 if (tp->t_inpcb) {
1431 ip->ip_ttl = tp->t_inpcb->inp_ip.ip_ttl;
1432 ip->ip_tos = tp->t_inpcb->inp_ip.ip_tos;
1433 }
1434 #ifdef INET6
1435 else if (tp->t_in6pcb) {
1436 ip->ip_ttl = in6_selecthlim(tp->t_in6pcb, NULL); /*XXX*/
1437 ip->ip_tos = 0; /*XXX*/
1438 }
1439 #endif
1440 break;
1441 #endif
1442 #ifdef INET6
1443 case AF_INET6:
1444 packetlen = m->m_pkthdr.len;
1445 ip6->ip6_nxt = IPPROTO_TCP;
1446 if (tp->t_in6pcb) {
1447 /*
1448 * we separately set hoplimit for every segment, since
1449 * the user might want to change the value via
1450 * setsockopt. Also, desired default hop limit might
1451 * be changed via Neighbor Discovery.
1452 */
1453 ip6->ip6_hlim = in6_selecthlim(tp->t_in6pcb,
1454 ro->ro_rt ? ro->ro_rt->rt_ifp : NULL);
1455 }
1456 /* ip6->ip6_flow = ??? */
1457 /* ip6_plen will be filled in ip6_output(). */
1458 break;
1459 #endif
1460 default: /*pacify gcc*/
1461 packetlen = 0;
1462 break;
1463 }
1464
1465 switch (af) {
1466 #ifdef INET
1467 case AF_INET:
1468 {
1469 struct mbuf *opts;
1470
1471 if (tp->t_inpcb)
1472 opts = tp->t_inpcb->inp_options;
1473 else
1474 opts = NULL;
1475 error = ip_output(m, opts, ro,
1476 (tp->t_mtudisc ? IP_MTUDISC : 0) |
1477 (so->so_options & SO_DONTROUTE),
1478 (struct ip_moptions *)0, so);
1479 break;
1480 }
1481 #endif
1482 #ifdef INET6
1483 case AF_INET6:
1484 {
1485 struct ip6_pktopts *opts;
1486
1487 if (tp->t_in6pcb)
1488 opts = tp->t_in6pcb->in6p_outputopts;
1489 else
1490 opts = NULL;
1491 error = ip6_output(m, opts, (struct route_in6 *)ro,
1492 so->so_options & SO_DONTROUTE,
1493 (struct ip6_moptions *)0, so, NULL);
1494 break;
1495 }
1496 #endif
1497 default:
1498 error = EAFNOSUPPORT;
1499 break;
1500 }
1501 if (error) {
1502 out:
1503 if (error == ENOBUFS) {
1504 tcpstat.tcps_selfquench++;
1505 #ifdef INET
1506 if (tp->t_inpcb)
1507 tcp_quench(tp->t_inpcb, 0);
1508 #endif
1509 #ifdef INET6
1510 if (tp->t_in6pcb)
1511 tcp6_quench(tp->t_in6pcb, 0);
1512 #endif
1513 error = 0;
1514 } else if ((error == EHOSTUNREACH || error == ENETDOWN) &&
1515 TCPS_HAVERCVDSYN(tp->t_state)) {
1516 tp->t_softerror = error;
1517 error = 0;
1518 }
1519
		/* Back out the sequence number advance. */
1521 if (sack_rxmit)
1522 p->rxmit -= len;
1523
1524 /* Restart the delayed ACK timer, if necessary. */
1525 if (tp->t_flags & TF_DELACK)
1526 TCP_RESTART_DELACK(tp);
1527
1528 return (error);
1529 }
1530
1531 if (packetlen > tp->t_pmtud_mtu_sent)
1532 tp->t_pmtud_mtu_sent = packetlen;
1533
1534 tcpstat.tcps_sndtotal++;
1535 if (tp->t_flags & TF_DELACK)
1536 tcpstat.tcps_delack++;
1537
1538 /*
1539 * Data sent (as far as we can tell).
1540 * If this advertises a larger window than any other segment,
1541 * then remember the size of the advertised window.
1542 * Any pending ACK has now been sent.
1543 */
1544 if (win > 0 && SEQ_GT(tp->rcv_nxt+win, tp->rcv_adv))
1545 tp->rcv_adv = tp->rcv_nxt + win;
1546 tp->last_ack_sent = tp->rcv_nxt;
1547 tp->t_flags &= ~TF_ACKNOW;
1548 TCP_CLEAR_DELACK(tp);
1549 #ifdef DIAGNOSTIC
1550 if (maxburst < 0)
1551 printf("tcp_output: maxburst exceeded by %d\n", -maxburst);
1552 #endif
1553 if (sendalot && (!tcp_do_newreno || --maxburst))
1554 goto again;
1555 return (0);
1556 }
1557
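/*
 * tcp_setpersist: (re)start the persist timer.  The timeout is derived
 * from the smoothed RTT estimate, backed off by tcp_backoff[] and
 * bounded by TCPTV_PERSMIN/TCPTV_PERSMAX.
 */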
1558 void
1559 tcp_setpersist(struct tcpcb *tp)
1560 {
1561 int t = ((tp->t_srtt >> 2) + tp->t_rttvar) >> (1 + 2);
1562 int nticks;
1563
1564 if (TCP_TIMER_ISARMED(tp, TCPT_REXMT))
1565 panic("tcp_output REXMT");
	/*
	 * Start/restart persistence timer.
	 */
1569 if (t < tp->t_rttmin)
1570 t = tp->t_rttmin;
1571 TCPT_RANGESET(nticks, t * tcp_backoff[tp->t_rxtshift],
1572 TCPTV_PERSMIN, TCPTV_PERSMAX);
1573 TCP_TIMER_ARM(tp, TCPT_PERSIST, nticks);
1574 if (tp->t_rxtshift < TCP_MAXRXTSHIFT)
1575 tp->t_rxtshift++;
1576 }
1577
1578 #if defined(INET)
/*
 * tcp4_segment: handle M_CSUM_TSOv4 in software.
 *
 * => always consume m.
 * => call output_func with output_arg for each segment.
 */
1585
1586 int
1587 tcp4_segment(struct mbuf *m, int (*output_func)(void *, struct mbuf *),
1588 void *output_arg)
1589 {
1590 int mss;
1591 int iphlen;
1592 int thlen;
1593 int hlen;
1594 int len;
1595 struct ip *iph;
1596 struct tcphdr *th;
1597 uint16_t ipid;
1598 uint32_t tcpseq;
1599 struct mbuf *hdr = NULL;
1600 struct mbuf *t;
1601 int error = 0;
1602
1603 KASSERT((m->m_flags & M_PKTHDR) != 0);
1604 KASSERT((m->m_pkthdr.csum_flags & M_CSUM_TSOv4) != 0);
1605
1606 m->m_pkthdr.csum_flags = 0;
1607
1608 len = m->m_pkthdr.len;
1609 KASSERT(len >= sizeof(*iph) + sizeof(*th));
1610
1611 if (m->m_len < sizeof(*iph)) {
1612 m = m_pullup(m, sizeof(*iph));
1613 if (m == NULL) {
1614 error = ENOMEM;
1615 goto quit;
1616 }
1617 }
1618 iph = mtod(m, struct ip *);
1619 iphlen = iph->ip_hl * 4;
1620 KASSERT(iph->ip_v == IPVERSION);
1621 KASSERT(iphlen >= sizeof(*iph));
1622 KASSERT(iph->ip_p == IPPROTO_TCP);
1623 ipid = ntohs(iph->ip_id);
1624
1625 hlen = iphlen + sizeof(*th);
1626 if (m->m_len < hlen) {
1627 m = m_pullup(m, hlen);
1628 if (m == NULL) {
1629 error = ENOMEM;
1630 goto quit;
1631 }
1632 }
1633 th = (void *)(mtod(m, char *) + iphlen);
1634 tcpseq = ntohl(th->th_seq);
1635 thlen = th->th_off * 4;
1636 hlen = iphlen + thlen;
1637
1638 mss = m->m_pkthdr.segsz;
1639 KASSERT(mss != 0);
1640 KASSERT(len > hlen);
1641
1642 t = m_split(m, hlen, M_NOWAIT);
1643 if (t == NULL) {
1644 error = ENOMEM;
1645 goto quit;
1646 }
1647 hdr = m;
1648 m = t;
1649 len -= hlen;
1650 KASSERT(len % mss == 0);
1651 while (len > 0) {
1652 struct mbuf *n;
1653
1654 n = m_dup(hdr, 0, hlen, M_NOWAIT);
1655 if (n == NULL) {
1656 error = ENOMEM;
1657 goto quit;
1658 }
1659 KASSERT(n->m_len == hlen); /* XXX */
1660
1661 t = m_split(m, mss, M_NOWAIT);
1662 if (t == NULL) {
1663 m_freem(n);
1664 error = ENOMEM;
1665 goto quit;
1666 }
1667 m_cat(n, m);
1668 m = t;
1669
1670 KASSERT(n->m_len >= hlen); /* XXX */
1671
1672 n->m_pkthdr.len = hlen + mss;
1673 iph = mtod(n, struct ip *);
1674 KASSERT(iph->ip_v == IPVERSION);
1675 iph->ip_len = htons(n->m_pkthdr.len);
1676 iph->ip_id = htons(ipid);
1677 th = (void *)(mtod(n, char *) + iphlen);
1678 th->th_seq = htonl(tcpseq);
1679 iph->ip_sum = 0;
1680 iph->ip_sum = in_cksum(n, iphlen);
1681 th->th_sum = 0;
1682 th->th_sum = in4_cksum(n, IPPROTO_TCP, iphlen, thlen + mss);
1683
1684 error = (*output_func)(output_arg, n);
1685 if (error) {
1686 goto quit;
1687 }
1688
1689 tcpseq += mss;
1690 ipid++;
1691 len -= mss;
1692 }
1693
1694 quit:
1695 if (hdr != NULL) {
1696 m_freem(hdr);
1697 }
1698 if (m != NULL) {
1699 m_freem(m);
1700 }
1701
1702 return error;
1703 }
1704 #endif /* defined(INET) */
1705