tcp_subr.c revision 1.291 1 /* $NetBSD: tcp_subr.c,v 1.291 2022/09/20 07:19:14 ozaki-r Exp $ */
2
3 /*
4 * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
5 * All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in the
14 * documentation and/or other materials provided with the distribution.
15 * 3. Neither the name of the project nor the names of its contributors
16 * may be used to endorse or promote products derived from this software
17 * without specific prior written permission.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
20 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22 * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
23 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
25 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
26 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
29 * SUCH DAMAGE.
30 */
31
32 /*
33 * Copyright (c) 1997, 1998, 2000, 2001, 2008 The NetBSD Foundation, Inc.
34 * All rights reserved.
35 *
36 * This code is derived from software contributed to The NetBSD Foundation
37 * by Jason R. Thorpe and Kevin M. Lahey of the Numerical Aerospace Simulation
38 * Facility, NASA Ames Research Center.
39 *
40 * Redistribution and use in source and binary forms, with or without
41 * modification, are permitted provided that the following conditions
42 * are met:
43 * 1. Redistributions of source code must retain the above copyright
44 * notice, this list of conditions and the following disclaimer.
45 * 2. Redistributions in binary form must reproduce the above copyright
46 * notice, this list of conditions and the following disclaimer in the
47 * documentation and/or other materials provided with the distribution.
48 *
49 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
50 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
51 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
52 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
53 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
54 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
55 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
56 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
57 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
58 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
59 * POSSIBILITY OF SUCH DAMAGE.
60 */
61
62 /*
63 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995
64 * The Regents of the University of California. All rights reserved.
65 *
66 * Redistribution and use in source and binary forms, with or without
67 * modification, are permitted provided that the following conditions
68 * are met:
69 * 1. Redistributions of source code must retain the above copyright
70 * notice, this list of conditions and the following disclaimer.
71 * 2. Redistributions in binary form must reproduce the above copyright
72 * notice, this list of conditions and the following disclaimer in the
73 * documentation and/or other materials provided with the distribution.
74 * 3. Neither the name of the University nor the names of its contributors
75 * may be used to endorse or promote products derived from this software
76 * without specific prior written permission.
77 *
78 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
79 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
80 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
81 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
82 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
83 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
84 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
85 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
86 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
87 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
88 * SUCH DAMAGE.
89 *
90 * @(#)tcp_subr.c 8.2 (Berkeley) 5/24/95
91 */
92
93 #include <sys/cdefs.h>
94 __KERNEL_RCSID(0, "$NetBSD: tcp_subr.c,v 1.291 2022/09/20 07:19:14 ozaki-r Exp $");
95
96 #ifdef _KERNEL_OPT
97 #include "opt_inet.h"
98 #include "opt_ipsec.h"
99 #include "opt_inet_csum.h"
100 #include "opt_mbuftrace.h"
101 #endif
102
103 #include <sys/param.h>
104 #include <sys/atomic.h>
105 #include <sys/proc.h>
106 #include <sys/systm.h>
107 #include <sys/mbuf.h>
108 #include <sys/once.h>
109 #include <sys/socket.h>
110 #include <sys/socketvar.h>
111 #include <sys/protosw.h>
112 #include <sys/errno.h>
113 #include <sys/kernel.h>
114 #include <sys/pool.h>
115 #include <sys/md5.h>
116 #include <sys/cprng.h>
117
118 #include <net/route.h>
119 #include <net/if.h>
120
121 #include <netinet/in.h>
122 #include <netinet/in_systm.h>
123 #include <netinet/ip.h>
124 #include <netinet/in_pcb.h>
125 #include <netinet/ip_var.h>
126 #include <netinet/ip_icmp.h>
127
128 #ifdef INET6
129 #include <netinet/ip6.h>
130 #include <netinet6/in6_pcb.h>
131 #include <netinet6/ip6_var.h>
132 #include <netinet6/in6_var.h>
133 #include <netinet6/ip6protosw.h>
134 #include <netinet/icmp6.h>
135 #include <netinet6/nd6.h>
136 #endif
137
138 #include <netinet/tcp.h>
139 #include <netinet/tcp_fsm.h>
140 #include <netinet/tcp_seq.h>
141 #include <netinet/tcp_timer.h>
142 #include <netinet/tcp_var.h>
143 #include <netinet/tcp_vtw.h>
144 #include <netinet/tcp_private.h>
145 #include <netinet/tcp_congctl.h>
146 #include <netinet/tcp_syncache.h>
147
148 #ifdef IPSEC
149 #include <netipsec/ipsec.h>
150 #ifdef INET6
151 #include <netipsec/ipsec6.h>
152 #endif
153 #include <netipsec/key.h>
154 #endif
155
156
157 struct inpcbtable tcbtable; /* head of queue of active tcpcb's */
158 u_int32_t tcp_now; /* slow ticks, for RFC 1323 timestamps */
159
160 percpu_t *tcpstat_percpu;
161
162 /* patchable/settable parameters for tcp */
163 int tcp_mssdflt = TCP_MSS;
164 int tcp_minmss = TCP_MINMSS;
165 int tcp_rttdflt = TCPTV_SRTTDFLT / PR_SLOWHZ;
166 int tcp_do_rfc1323 = 1; /* window scaling / timestamps (obsolete) */
167 int tcp_do_rfc1948 = 0; /* ISS by cryptographic hash */
168 int tcp_do_sack = 1; /* selective acknowledgement */
169 int tcp_do_win_scale = 1; /* RFC1323 window scaling */
170 int tcp_do_timestamps = 1; /* RFC1323 timestamps */
171 int tcp_ack_on_push = 0; /* set to enable immediate ACK-on-PUSH */
172 int tcp_do_ecn = 0; /* Explicit Congestion Notification */
173 #ifndef TCP_INIT_WIN
174 #define TCP_INIT_WIN 4 /* initial slow start window */
175 #endif
176 #ifndef TCP_INIT_WIN_LOCAL
177 #define TCP_INIT_WIN_LOCAL 4 /* initial slow start window for local nets */
178 #endif
179 /*
180 * Up to 5 we scale linearly, to reach 3 * 1460; then (iw) * 1460.
181 * This is to simulate current behavior for iw == 4
182 */
183 int tcp_init_win_max[] = {
184 1 * 1460,
185 1 * 1460,
186 2 * 1460,
187 2 * 1460,
188 3 * 1460,
189 5 * 1460,
190 6 * 1460,
191 7 * 1460,
192 8 * 1460,
193 9 * 1460,
194 10 * 1460
195 };
196 int tcp_init_win = TCP_INIT_WIN;
197 int tcp_init_win_local = TCP_INIT_WIN_LOCAL;
198 int tcp_mss_ifmtu = 0;
199 int tcp_rst_ppslim = 100; /* 100pps */
200 int tcp_ackdrop_ppslim = 100; /* 100pps */
201 int tcp_do_loopback_cksum = 0;
202 int tcp_do_abc = 1; /* RFC3465 Appropriate byte counting. */
203 int tcp_abc_aggressive = 1; /* 1: L=2*SMSS 0: L=1*SMSS */
204 int tcp_sack_tp_maxholes = 32;
205 int tcp_sack_globalmaxholes = 1024;
206 int tcp_sack_globalholes = 0;
207 int tcp_ecn_maxretries = 1;
208 int tcp_msl_enable = 1; /* enable TIME_WAIT truncation */
209 int tcp_msl_loop = PR_SLOWHZ; /* MSL for loopback */
210 int tcp_msl_local = 5 * PR_SLOWHZ; /* MSL for 'local' */
211 int tcp_msl_remote = TCPTV_MSL; /* MSL otherwise */
212 int tcp_msl_remote_threshold = TCPTV_SRTTDFLT; /* RTT threshold */
213 int tcp_rttlocal = 0; /* Use RTT to decide who's 'local' */
214
215 int tcp4_vtw_enable = 0; /* 1 to enable */
216 int tcp6_vtw_enable = 0; /* 1 to enable */
217 int tcp_vtw_was_enabled = 0;
218 int tcp_vtw_entries = 1 << 4; /* 16 vestigial TIME_WAIT entries */
219
220 /* tcb hash */
221 #ifndef TCBHASHSIZE
222 #define TCBHASHSIZE 128
223 #endif
224 int tcbhashsize = TCBHASHSIZE;
225
226 int tcp_freeq(struct tcpcb *);
227 static int tcp_iss_secret_init(void);
228
229 static void tcp_mtudisc_callback(struct in_addr);
230
231 #ifdef INET6
232 static void tcp6_mtudisc(struct in6pcb *, int);
233 #endif
234
235 static struct pool tcpcb_pool;
236
237 static int tcp_drainwanted;
238
#ifdef TCP_CSUM_COUNTERS
#include <sys/device.h>

/*
 * Checksum event counters.  All of them are parentless EVCNT_TYPE_MISC
 * counters, so the common part of the initializer is factored out.
 */
#define	TCP_CSUM_EVCNT(grp, what)					\
	EVCNT_INITIALIZER(EVCNT_TYPE_MISC, NULL, grp, what)

struct evcnt tcp_hwcsum_bad = TCP_CSUM_EVCNT("tcp", "hwcsum bad");
struct evcnt tcp_hwcsum_ok = TCP_CSUM_EVCNT("tcp", "hwcsum ok");
struct evcnt tcp_hwcsum_data = TCP_CSUM_EVCNT("tcp", "hwcsum data");
struct evcnt tcp_swcsum = TCP_CSUM_EVCNT("tcp", "swcsum");

EVCNT_ATTACH_STATIC(tcp_hwcsum_bad);
EVCNT_ATTACH_STATIC(tcp_hwcsum_ok);
EVCNT_ATTACH_STATIC(tcp_hwcsum_data);
EVCNT_ATTACH_STATIC(tcp_swcsum);

#if defined(INET6)
struct evcnt tcp6_hwcsum_bad = TCP_CSUM_EVCNT("tcp6", "hwcsum bad");
struct evcnt tcp6_hwcsum_ok = TCP_CSUM_EVCNT("tcp6", "hwcsum ok");
struct evcnt tcp6_hwcsum_data = TCP_CSUM_EVCNT("tcp6", "hwcsum data");
struct evcnt tcp6_swcsum = TCP_CSUM_EVCNT("tcp6", "swcsum");

EVCNT_ATTACH_STATIC(tcp6_hwcsum_bad);
EVCNT_ATTACH_STATIC(tcp6_hwcsum_ok);
EVCNT_ATTACH_STATIC(tcp6_hwcsum_data);
EVCNT_ATTACH_STATIC(tcp6_swcsum);
#endif /* defined(INET6) */

#undef TCP_CSUM_EVCNT
#endif /* TCP_CSUM_COUNTERS */
272
273
#ifdef TCP_OUTPUT_COUNTERS
#include <sys/device.h>

/*
 * Output-path event counters: parentless EVCNT_TYPE_MISC counters in
 * the "tcp" group.
 */
#define	TCP_OUTPUT_EVCNT(what)						\
	EVCNT_INITIALIZER(EVCNT_TYPE_MISC, NULL, "tcp", what)

struct evcnt tcp_output_bigheader = TCP_OUTPUT_EVCNT("output big header");
struct evcnt tcp_output_predict_hit = TCP_OUTPUT_EVCNT("output predict hit");
struct evcnt tcp_output_predict_miss = TCP_OUTPUT_EVCNT("output predict miss");
struct evcnt tcp_output_copysmall = TCP_OUTPUT_EVCNT("output copy small");
struct evcnt tcp_output_copybig = TCP_OUTPUT_EVCNT("output copy big");
struct evcnt tcp_output_refbig = TCP_OUTPUT_EVCNT("output reference big");

#undef TCP_OUTPUT_EVCNT

EVCNT_ATTACH_STATIC(tcp_output_bigheader);
EVCNT_ATTACH_STATIC(tcp_output_predict_hit);
EVCNT_ATTACH_STATIC(tcp_output_predict_miss);
EVCNT_ATTACH_STATIC(tcp_output_copysmall);
EVCNT_ATTACH_STATIC(tcp_output_copybig);
EVCNT_ATTACH_STATIC(tcp_output_refbig);

#endif /* TCP_OUTPUT_COUNTERS */
298
#ifdef TCP_REASS_COUNTERS
#include <sys/device.h>

/*
 * Reassembly event counters.  tcp_reass_ counts calls and is the parent
 * of every other counter in the "tcp_reass" group.
 */
struct evcnt tcp_reass_ = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
    NULL, "tcp_reass", "calls");

#define	TCP_REASS_EVCNT(what)						\
	EVCNT_INITIALIZER(EVCNT_TYPE_MISC, &tcp_reass_, "tcp_reass", what)

struct evcnt tcp_reass_empty = TCP_REASS_EVCNT("insert into empty queue");

/* Slot 0 is the ">7 iterations" bucket; slots 1..7 are exact counts. */
struct evcnt tcp_reass_iteration[8] = {
	TCP_REASS_EVCNT(">7 iterations"),
	TCP_REASS_EVCNT("1 iteration"),
	TCP_REASS_EVCNT("2 iterations"),
	TCP_REASS_EVCNT("3 iterations"),
	TCP_REASS_EVCNT("4 iterations"),
	TCP_REASS_EVCNT("5 iterations"),
	TCP_REASS_EVCNT("6 iterations"),
	TCP_REASS_EVCNT("7 iterations"),
};

struct evcnt tcp_reass_prependfirst = TCP_REASS_EVCNT("prepend to first");
struct evcnt tcp_reass_prepend = TCP_REASS_EVCNT("prepend");
struct evcnt tcp_reass_insert = TCP_REASS_EVCNT("insert");
struct evcnt tcp_reass_inserttail = TCP_REASS_EVCNT("insert at tail");
struct evcnt tcp_reass_append = TCP_REASS_EVCNT("append");
struct evcnt tcp_reass_appendtail = TCP_REASS_EVCNT("append to tail fragment");
struct evcnt tcp_reass_overlaptail = TCP_REASS_EVCNT("overlap at end");
struct evcnt tcp_reass_overlapfront = TCP_REASS_EVCNT("overlap at start");
struct evcnt tcp_reass_segdup = TCP_REASS_EVCNT("duplicate segment");
struct evcnt tcp_reass_fragdup = TCP_REASS_EVCNT("duplicate fragment");

#undef TCP_REASS_EVCNT

EVCNT_ATTACH_STATIC(tcp_reass_);
EVCNT_ATTACH_STATIC(tcp_reass_empty);
EVCNT_ATTACH_STATIC2(tcp_reass_iteration, 0);
EVCNT_ATTACH_STATIC2(tcp_reass_iteration, 1);
EVCNT_ATTACH_STATIC2(tcp_reass_iteration, 2);
EVCNT_ATTACH_STATIC2(tcp_reass_iteration, 3);
EVCNT_ATTACH_STATIC2(tcp_reass_iteration, 4);
EVCNT_ATTACH_STATIC2(tcp_reass_iteration, 5);
EVCNT_ATTACH_STATIC2(tcp_reass_iteration, 6);
EVCNT_ATTACH_STATIC2(tcp_reass_iteration, 7);
EVCNT_ATTACH_STATIC(tcp_reass_prependfirst);
EVCNT_ATTACH_STATIC(tcp_reass_prepend);
EVCNT_ATTACH_STATIC(tcp_reass_insert);
EVCNT_ATTACH_STATIC(tcp_reass_inserttail);
EVCNT_ATTACH_STATIC(tcp_reass_append);
EVCNT_ATTACH_STATIC(tcp_reass_appendtail);
EVCNT_ATTACH_STATIC(tcp_reass_overlaptail);
EVCNT_ATTACH_STATIC(tcp_reass_overlapfront);
EVCNT_ATTACH_STATIC(tcp_reass_segdup);
EVCNT_ATTACH_STATIC(tcp_reass_fragdup);

#endif /* TCP_REASS_COUNTERS */
359
#ifdef MBUFTRACE
/* mbuf owners used for TCP accounting; attached in do_tcpinit(). */
struct mowner tcp_mowner		= MOWNER_INIT("tcp", "");
struct mowner tcp_rx_mowner		= MOWNER_INIT("tcp", "rx");
struct mowner tcp_tx_mowner		= MOWNER_INIT("tcp", "tx");
struct mowner tcp_sock_mowner		= MOWNER_INIT("tcp", "sock");
struct mowner tcp_sock_rx_mowner	= MOWNER_INIT("tcp", "sock rx");
struct mowner tcp_sock_tx_mowner	= MOWNER_INIT("tcp", "sock tx");
#endif
368
369 static int
370 do_tcpinit(void)
371 {
372
373 in_pcbinit(&tcbtable, tcbhashsize, tcbhashsize);
374 pool_init(&tcpcb_pool, sizeof(struct tcpcb), 0, 0, 0, "tcpcbpl",
375 NULL, IPL_SOFTNET);
376
377 tcp_usrreq_init();
378
379 /* Initialize timer state. */
380 tcp_timer_init();
381
382 /* Initialize the compressed state engine. */
383 syn_cache_init();
384
385 /* Initialize the congestion control algorithms. */
386 tcp_congctl_init();
387
388 /* Initialize the TCPCB template. */
389 tcp_tcpcb_template();
390
391 /* Initialize reassembly queue */
392 tcpipqent_init();
393
394 /* SACK */
395 tcp_sack_init();
396
397 MOWNER_ATTACH(&tcp_tx_mowner);
398 MOWNER_ATTACH(&tcp_rx_mowner);
399 MOWNER_ATTACH(&tcp_reass_mowner);
400 MOWNER_ATTACH(&tcp_sock_mowner);
401 MOWNER_ATTACH(&tcp_sock_tx_mowner);
402 MOWNER_ATTACH(&tcp_sock_rx_mowner);
403 MOWNER_ATTACH(&tcp_mowner);
404
405 tcpstat_percpu = percpu_alloc(sizeof(uint64_t) * TCP_NSTATS);
406
407 vtw_earlyinit();
408
409 tcp_slowtimo_init();
410
411 return 0;
412 }
413
414 void
415 tcp_init_common(unsigned basehlen)
416 {
417 static ONCE_DECL(dotcpinit);
418 unsigned hlen = basehlen + sizeof(struct tcphdr);
419 unsigned oldhlen;
420
421 if (max_linkhdr + hlen > MHLEN)
422 panic("tcp_init");
423 while ((oldhlen = max_protohdr) < hlen)
424 atomic_cas_uint(&max_protohdr, oldhlen, hlen);
425
426 RUN_ONCE(&dotcpinit, do_tcpinit);
427 }
428
429 /*
430 * Tcp initialization
431 */
432 void
433 tcp_init(void)
434 {
435
436 icmp_mtudisc_callback_register(tcp_mtudisc_callback);
437
438 tcp_init_common(sizeof(struct ip));
439 }
440
441 /*
442 * Create template to be used to send tcp packets on a connection.
443 * Call after host entry created, allocates an mbuf and fills
444 * in a skeletal tcp/ip header, minimizing the amount of work
445 * necessary when the connection is used.
446 */
447 struct mbuf *
448 tcp_template(struct tcpcb *tp)
449 {
450 struct inpcb *inp = tp->t_inpcb;
451 #ifdef INET6
452 struct in6pcb *in6p = tp->t_in6pcb;
453 #endif
454 struct tcphdr *n;
455 struct mbuf *m;
456 int hlen;
457
458 switch (tp->t_family) {
459 case AF_INET:
460 hlen = sizeof(struct ip);
461 if (inp)
462 break;
463 #ifdef INET6
464 if (in6p) {
465 /* mapped addr case */
466 if (IN6_IS_ADDR_V4MAPPED(&in6p->in6p_laddr)
467 && IN6_IS_ADDR_V4MAPPED(&in6p->in6p_faddr))
468 break;
469 }
470 #endif
471 return NULL; /*EINVAL*/
472 #ifdef INET6
473 case AF_INET6:
474 hlen = sizeof(struct ip6_hdr);
475 if (in6p) {
476 /* more sainty check? */
477 break;
478 }
479 return NULL; /*EINVAL*/
480 #endif
481 default:
482 return NULL; /*EAFNOSUPPORT*/
483 }
484
485 KASSERT(hlen + sizeof(struct tcphdr) <= MCLBYTES);
486
487 m = tp->t_template;
488 if (m && m->m_len == hlen + sizeof(struct tcphdr)) {
489 ;
490 } else {
491 if (m)
492 m_freem(m);
493 m = tp->t_template = NULL;
494 MGETHDR(m, M_DONTWAIT, MT_HEADER);
495 if (m && hlen + sizeof(struct tcphdr) > MHLEN) {
496 MCLGET(m, M_DONTWAIT);
497 if ((m->m_flags & M_EXT) == 0) {
498 m_free(m);
499 m = NULL;
500 }
501 }
502 if (m == NULL)
503 return NULL;
504 MCLAIM(m, &tcp_mowner);
505 m->m_pkthdr.len = m->m_len = hlen + sizeof(struct tcphdr);
506 }
507
508 memset(mtod(m, void *), 0, m->m_len);
509
510 n = (struct tcphdr *)(mtod(m, char *) + hlen);
511
512 switch (tp->t_family) {
513 case AF_INET:
514 {
515 struct ipovly *ipov;
516 mtod(m, struct ip *)->ip_v = 4;
517 mtod(m, struct ip *)->ip_hl = hlen >> 2;
518 ipov = mtod(m, struct ipovly *);
519 ipov->ih_pr = IPPROTO_TCP;
520 ipov->ih_len = htons(sizeof(struct tcphdr));
521 if (inp) {
522 ipov->ih_src = inp->inp_laddr;
523 ipov->ih_dst = inp->inp_faddr;
524 }
525 #ifdef INET6
526 else if (in6p) {
527 /* mapped addr case */
528 bcopy(&in6p->in6p_laddr.s6_addr32[3], &ipov->ih_src,
529 sizeof(ipov->ih_src));
530 bcopy(&in6p->in6p_faddr.s6_addr32[3], &ipov->ih_dst,
531 sizeof(ipov->ih_dst));
532 }
533 #endif
534
535 /*
536 * Compute the pseudo-header portion of the checksum
537 * now. We incrementally add in the TCP option and
538 * payload lengths later, and then compute the TCP
539 * checksum right before the packet is sent off onto
540 * the wire.
541 */
542 n->th_sum = in_cksum_phdr(ipov->ih_src.s_addr,
543 ipov->ih_dst.s_addr,
544 htons(sizeof(struct tcphdr) + IPPROTO_TCP));
545 break;
546 }
547 #ifdef INET6
548 case AF_INET6:
549 {
550 struct ip6_hdr *ip6;
551 mtod(m, struct ip *)->ip_v = 6;
552 ip6 = mtod(m, struct ip6_hdr *);
553 ip6->ip6_nxt = IPPROTO_TCP;
554 ip6->ip6_plen = htons(sizeof(struct tcphdr));
555 ip6->ip6_src = in6p->in6p_laddr;
556 ip6->ip6_dst = in6p->in6p_faddr;
557 ip6->ip6_flow = in6p->in6p_flowinfo & IPV6_FLOWINFO_MASK;
558 if (ip6_auto_flowlabel) {
559 ip6->ip6_flow &= ~IPV6_FLOWLABEL_MASK;
560 ip6->ip6_flow |=
561 (htonl(ip6_randomflowlabel()) & IPV6_FLOWLABEL_MASK);
562 }
563 ip6->ip6_vfc &= ~IPV6_VERSION_MASK;
564 ip6->ip6_vfc |= IPV6_VERSION;
565
566 /*
567 * Compute the pseudo-header portion of the checksum
568 * now. We incrementally add in the TCP option and
569 * payload lengths later, and then compute the TCP
570 * checksum right before the packet is sent off onto
571 * the wire.
572 */
573 n->th_sum = in6_cksum_phdr(&in6p->in6p_laddr,
574 &in6p->in6p_faddr, htonl(sizeof(struct tcphdr)),
575 htonl(IPPROTO_TCP));
576 break;
577 }
578 #endif
579 }
580
581 if (inp) {
582 n->th_sport = inp->inp_lport;
583 n->th_dport = inp->inp_fport;
584 }
585 #ifdef INET6
586 else if (in6p) {
587 n->th_sport = in6p->in6p_lport;
588 n->th_dport = in6p->in6p_fport;
589 }
590 #endif
591
592 n->th_seq = 0;
593 n->th_ack = 0;
594 n->th_x2 = 0;
595 n->th_off = 5;
596 n->th_flags = 0;
597 n->th_win = 0;
598 n->th_urp = 0;
599 return m;
600 }
601
602 /*
603 * Send a single message to the TCP at address specified by
604 * the given TCP/IP header. If m == 0, then we make a copy
605 * of the tcpiphdr at ti and send directly to the addressed host.
606 * This is used to force keep alive messages out using the TCP
607 * template for a connection tp->t_template. If flags are given
608 * then we send a message back to the TCP which originated the
609 * segment ti, and discard the mbuf containing it and any other
610 * attached mbufs.
611 *
612 * In any case the ack and sequence number of the transmitted
613 * segment are as specified by the parameters.
614 */
615 int
616 tcp_respond(struct tcpcb *tp, struct mbuf *mtemplate, struct mbuf *m,
617 struct tcphdr *th0, tcp_seq ack, tcp_seq seq, int flags)
618 {
619 struct route *ro;
620 int error, tlen, win = 0;
621 int hlen;
622 struct ip *ip;
623 #ifdef INET6
624 struct ip6_hdr *ip6;
625 #endif
626 int family; /* family on packet, not inpcb/in6pcb! */
627 struct tcphdr *th;
628
629 if (tp != NULL && (flags & TH_RST) == 0) {
630 KASSERT(!(tp->t_inpcb && tp->t_in6pcb));
631
632 if (tp->t_inpcb)
633 win = sbspace(&tp->t_inpcb->inp_socket->so_rcv);
634 #ifdef INET6
635 if (tp->t_in6pcb)
636 win = sbspace(&tp->t_in6pcb->in6p_socket->so_rcv);
637 #endif
638 }
639
640 th = NULL; /* Quell uninitialized warning */
641 ip = NULL;
642 #ifdef INET6
643 ip6 = NULL;
644 #endif
645 if (m == NULL) {
646 if (!mtemplate)
647 return EINVAL;
648
649 /* get family information from template */
650 switch (mtod(mtemplate, struct ip *)->ip_v) {
651 case 4:
652 family = AF_INET;
653 hlen = sizeof(struct ip);
654 break;
655 #ifdef INET6
656 case 6:
657 family = AF_INET6;
658 hlen = sizeof(struct ip6_hdr);
659 break;
660 #endif
661 default:
662 return EAFNOSUPPORT;
663 }
664
665 MGETHDR(m, M_DONTWAIT, MT_HEADER);
666 if (m) {
667 MCLAIM(m, &tcp_tx_mowner);
668 MCLGET(m, M_DONTWAIT);
669 if ((m->m_flags & M_EXT) == 0) {
670 m_free(m);
671 m = NULL;
672 }
673 }
674 if (m == NULL)
675 return ENOBUFS;
676
677 tlen = 0;
678
679 m->m_data += max_linkhdr;
680 bcopy(mtod(mtemplate, void *), mtod(m, void *),
681 mtemplate->m_len);
682 switch (family) {
683 case AF_INET:
684 ip = mtod(m, struct ip *);
685 th = (struct tcphdr *)(ip + 1);
686 break;
687 #ifdef INET6
688 case AF_INET6:
689 ip6 = mtod(m, struct ip6_hdr *);
690 th = (struct tcphdr *)(ip6 + 1);
691 break;
692 #endif
693 }
694 flags = TH_ACK;
695 } else {
696 if ((m->m_flags & M_PKTHDR) == 0) {
697 m_freem(m);
698 return EINVAL;
699 }
700 KASSERT(th0 != NULL);
701
702 /* get family information from m */
703 switch (mtod(m, struct ip *)->ip_v) {
704 case 4:
705 family = AF_INET;
706 hlen = sizeof(struct ip);
707 ip = mtod(m, struct ip *);
708 break;
709 #ifdef INET6
710 case 6:
711 family = AF_INET6;
712 hlen = sizeof(struct ip6_hdr);
713 ip6 = mtod(m, struct ip6_hdr *);
714 break;
715 #endif
716 default:
717 m_freem(m);
718 return EAFNOSUPPORT;
719 }
720 /* clear h/w csum flags inherited from rx packet */
721 m->m_pkthdr.csum_flags = 0;
722
723 if ((flags & TH_SYN) == 0 || sizeof(*th0) > (th0->th_off << 2))
724 tlen = sizeof(*th0);
725 else
726 tlen = th0->th_off << 2;
727
728 if (m->m_len > hlen + tlen && (m->m_flags & M_EXT) == 0 &&
729 mtod(m, char *) + hlen == (char *)th0) {
730 m->m_len = hlen + tlen;
731 m_freem(m->m_next);
732 m->m_next = NULL;
733 } else {
734 struct mbuf *n;
735
736 KASSERT(max_linkhdr + hlen + tlen <= MCLBYTES);
737
738 MGETHDR(n, M_DONTWAIT, MT_HEADER);
739 if (n && max_linkhdr + hlen + tlen > MHLEN) {
740 MCLGET(n, M_DONTWAIT);
741 if ((n->m_flags & M_EXT) == 0) {
742 m_freem(n);
743 n = NULL;
744 }
745 }
746 if (!n) {
747 m_freem(m);
748 return ENOBUFS;
749 }
750
751 MCLAIM(n, &tcp_tx_mowner);
752 n->m_data += max_linkhdr;
753 n->m_len = hlen + tlen;
754 m_copyback(n, 0, hlen, mtod(m, void *));
755 m_copyback(n, hlen, tlen, (void *)th0);
756
757 m_freem(m);
758 m = n;
759 n = NULL;
760 }
761
762 #define xchg(a,b,type) { type t; t=a; a=b; b=t; }
763 switch (family) {
764 case AF_INET:
765 ip = mtod(m, struct ip *);
766 th = (struct tcphdr *)(ip + 1);
767 ip->ip_p = IPPROTO_TCP;
768 xchg(ip->ip_dst, ip->ip_src, struct in_addr);
769 ip->ip_p = IPPROTO_TCP;
770 break;
771 #ifdef INET6
772 case AF_INET6:
773 ip6 = mtod(m, struct ip6_hdr *);
774 th = (struct tcphdr *)(ip6 + 1);
775 ip6->ip6_nxt = IPPROTO_TCP;
776 xchg(ip6->ip6_dst, ip6->ip6_src, struct in6_addr);
777 ip6->ip6_nxt = IPPROTO_TCP;
778 break;
779 #endif
780 }
781 xchg(th->th_dport, th->th_sport, u_int16_t);
782 #undef xchg
783 tlen = 0; /*be friendly with the following code*/
784 }
785 th->th_seq = htonl(seq);
786 th->th_ack = htonl(ack);
787 th->th_x2 = 0;
788 if ((flags & TH_SYN) == 0) {
789 if (tp)
790 win >>= tp->rcv_scale;
791 if (win > TCP_MAXWIN)
792 win = TCP_MAXWIN;
793 th->th_win = htons((u_int16_t)win);
794 th->th_off = sizeof (struct tcphdr) >> 2;
795 tlen += sizeof(*th);
796 } else {
797 tlen += th->th_off << 2;
798 }
799 m->m_len = hlen + tlen;
800 m->m_pkthdr.len = hlen + tlen;
801 m_reset_rcvif(m);
802 th->th_flags = flags;
803 th->th_urp = 0;
804
805 switch (family) {
806 case AF_INET:
807 {
808 struct ipovly *ipov = (struct ipovly *)ip;
809 memset(ipov->ih_x1, 0, sizeof ipov->ih_x1);
810 ipov->ih_len = htons((u_int16_t)tlen);
811
812 th->th_sum = 0;
813 th->th_sum = in_cksum(m, hlen + tlen);
814 ip->ip_len = htons(hlen + tlen);
815 ip->ip_ttl = ip_defttl;
816 break;
817 }
818 #ifdef INET6
819 case AF_INET6:
820 {
821 th->th_sum = 0;
822 th->th_sum = in6_cksum(m, IPPROTO_TCP, sizeof(struct ip6_hdr),
823 tlen);
824 ip6->ip6_plen = htons(tlen);
825 if (tp && tp->t_in6pcb)
826 ip6->ip6_hlim = in6_selecthlim_rt(tp->t_in6pcb);
827 else
828 ip6->ip6_hlim = ip6_defhlim;
829 ip6->ip6_flow &= ~IPV6_FLOWINFO_MASK;
830 if (ip6_auto_flowlabel) {
831 ip6->ip6_flow |=
832 (htonl(ip6_randomflowlabel()) & IPV6_FLOWLABEL_MASK);
833 }
834 break;
835 }
836 #endif
837 }
838
839 if (tp != NULL && tp->t_inpcb != NULL) {
840 ro = &tp->t_inpcb->inp_route;
841 KASSERT(family == AF_INET);
842 KASSERT(in_hosteq(ip->ip_dst, tp->t_inpcb->inp_faddr));
843 }
844 #ifdef INET6
845 else if (tp != NULL && tp->t_in6pcb != NULL) {
846 ro = (struct route *)&tp->t_in6pcb->in6p_route;
847
848 #ifdef DIAGNOSTIC
849 if (family == AF_INET) {
850 if (!IN6_IS_ADDR_V4MAPPED(&tp->t_in6pcb->in6p_faddr))
851 panic("tcp_respond: not mapped addr");
852 if (memcmp(&ip->ip_dst,
853 &tp->t_in6pcb->in6p_faddr.s6_addr32[3],
854 sizeof(ip->ip_dst)) != 0) {
855 panic("tcp_respond: ip_dst != in6p_faddr");
856 }
857 } else if (family == AF_INET6) {
858 if (!IN6_ARE_ADDR_EQUAL(&ip6->ip6_dst,
859 &tp->t_in6pcb->in6p_faddr))
860 panic("tcp_respond: ip6_dst != in6p_faddr");
861 } else
862 panic("tcp_respond: address family mismatch");
863 #endif
864 }
865 #endif
866 else
867 ro = NULL;
868
869 switch (family) {
870 case AF_INET:
871 error = ip_output(m, NULL, ro,
872 (tp && tp->t_mtudisc ? IP_MTUDISC : 0), NULL,
873 tp ? tp->t_inpcb : NULL);
874 break;
875 #ifdef INET6
876 case AF_INET6:
877 error = ip6_output(m, NULL, ro, 0, NULL,
878 tp ? tp->t_in6pcb : NULL, NULL);
879 break;
880 #endif
881 default:
882 error = EAFNOSUPPORT;
883 break;
884 }
885
886 return error;
887 }
888
889 /*
890 * Template TCPCB. Rather than zeroing a new TCPCB and initializing
891 * a bunch of members individually, we maintain this template for the
892 * static and mostly-static components of the TCPCB, and copy it into
893 * the new TCPCB instead.
894 */
895 static struct tcpcb tcpcb_template = {
896 .t_srtt = TCPTV_SRTTBASE,
897 .t_rttmin = TCPTV_MIN,
898
899 .snd_cwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT,
900 .snd_ssthresh = TCP_MAXWIN << TCP_MAX_WINSHIFT,
901 .snd_numholes = 0,
902 .snd_cubic_wmax = 0,
903 .snd_cubic_wmax_last = 0,
904 .snd_cubic_ctime = 0,
905
906 .t_partialacks = -1,
907 .t_bytes_acked = 0,
908 .t_sndrexmitpack = 0,
909 .t_rcvoopack = 0,
910 .t_sndzerowin = 0,
911 };
912
913 /*
914 * Updates the TCPCB template whenever a parameter that would affect
915 * the template is changed.
916 */
917 void
918 tcp_tcpcb_template(void)
919 {
920 struct tcpcb *tp = &tcpcb_template;
921 int flags;
922
923 tp->t_peermss = tcp_mssdflt;
924 tp->t_ourmss = tcp_mssdflt;
925 tp->t_segsz = tcp_mssdflt;
926
927 flags = 0;
928 if (tcp_do_rfc1323 && tcp_do_win_scale)
929 flags |= TF_REQ_SCALE;
930 if (tcp_do_rfc1323 && tcp_do_timestamps)
931 flags |= TF_REQ_TSTMP;
932 tp->t_flags = flags;
933
934 /*
935 * Init srtt to TCPTV_SRTTBASE (0), so we can tell that we have no
936 * rtt estimate. Set rttvar so that srtt + 2 * rttvar gives
937 * reasonable initial retransmit time.
938 */
939 tp->t_rttvar = tcp_rttdflt * PR_SLOWHZ << (TCP_RTTVAR_SHIFT + 2 - 1);
940 TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp),
941 TCPTV_MIN, TCPTV_REXMTMAX);
942
943 /* Keep Alive */
944 tp->t_keepinit = MIN(tcp_keepinit, TCP_TIMER_MAXTICKS);
945 tp->t_keepidle = MIN(tcp_keepidle, TCP_TIMER_MAXTICKS);
946 tp->t_keepintvl = MIN(tcp_keepintvl, TCP_TIMER_MAXTICKS);
947 tp->t_keepcnt = MAX(1, MIN(tcp_keepcnt, TCP_TIMER_MAXTICKS));
948 tp->t_maxidle = tp->t_keepcnt * MIN(tp->t_keepintvl,
949 TCP_TIMER_MAXTICKS/tp->t_keepcnt);
950
951 /* MSL */
952 tp->t_msl = TCPTV_MSL;
953 }
954
955 /*
956 * Create a new TCP control block, making an
957 * empty reassembly queue and hooking it to the argument
958 * protocol control block.
959 */
960 /* family selects inpcb, or in6pcb */
961 struct tcpcb *
962 tcp_newtcpcb(int family, void *aux)
963 {
964 struct tcpcb *tp;
965 int i;
966
967 /* XXX Consider using a pool_cache for speed. */
968 tp = pool_get(&tcpcb_pool, PR_NOWAIT); /* splsoftnet via tcp_usrreq */
969 if (tp == NULL)
970 return NULL;
971 memcpy(tp, &tcpcb_template, sizeof(*tp));
972 TAILQ_INIT(&tp->segq);
973 TAILQ_INIT(&tp->timeq);
974 tp->t_family = family; /* may be overridden later on */
975 TAILQ_INIT(&tp->snd_holes);
976 LIST_INIT(&tp->t_sc); /* XXX can template this */
977
978 /* Don't sweat this loop; hopefully the compiler will unroll it. */
979 for (i = 0; i < TCPT_NTIMERS; i++) {
980 callout_init(&tp->t_timer[i], CALLOUT_MPSAFE);
981 TCP_TIMER_INIT(tp, i);
982 }
983 callout_init(&tp->t_delack_ch, CALLOUT_MPSAFE);
984
985 switch (family) {
986 case AF_INET:
987 {
988 struct inpcb *inp = (struct inpcb *)aux;
989
990 inp->inp_ip.ip_ttl = ip_defttl;
991 inp->inp_ppcb = (void *)tp;
992
993 tp->t_inpcb = inp;
994 tp->t_mtudisc = ip_mtudisc;
995 break;
996 }
997 #ifdef INET6
998 case AF_INET6:
999 {
1000 struct in6pcb *in6p = (struct in6pcb *)aux;
1001
1002 in6p->in6p_ip6.ip6_hlim = in6_selecthlim_rt(in6p);
1003 in6p->in6p_ppcb = (void *)tp;
1004
1005 tp->t_in6pcb = in6p;
1006 /* for IPv6, always try to run path MTU discovery */
1007 tp->t_mtudisc = 1;
1008 break;
1009 }
1010 #endif /* INET6 */
1011 default:
1012 for (i = 0; i < TCPT_NTIMERS; i++)
1013 callout_destroy(&tp->t_timer[i]);
1014 callout_destroy(&tp->t_delack_ch);
1015 pool_put(&tcpcb_pool, tp); /* splsoftnet via tcp_usrreq */
1016 return NULL;
1017 }
1018
1019 /*
1020 * Initialize our timebase. When we send timestamps, we take
1021 * the delta from tcp_now -- this means each connection always
1022 * gets a timebase of 1, which makes it, among other things,
1023 * more difficult to determine how long a system has been up,
1024 * and thus how many TCP sequence increments have occurred.
1025 *
1026 * We start with 1, because 0 doesn't work with linux, which
1027 * considers timestamp 0 in a SYN packet as a bug and disables
1028 * timestamps.
1029 */
1030 tp->ts_timebase = tcp_now - 1;
1031
1032 tcp_congctl_select(tp, tcp_congctl_global_name);
1033
1034 return tp;
1035 }
1036
1037 /*
1038 * Drop a TCP connection, reporting
1039 * the specified error. If connection is synchronized,
1040 * then send a RST to peer.
1041 */
1042 struct tcpcb *
1043 tcp_drop(struct tcpcb *tp, int errno)
1044 {
1045 struct socket *so = NULL;
1046
1047 KASSERT(!(tp->t_inpcb && tp->t_in6pcb));
1048
1049 if (tp->t_inpcb)
1050 so = tp->t_inpcb->inp_socket;
1051 #ifdef INET6
1052 if (tp->t_in6pcb)
1053 so = tp->t_in6pcb->in6p_socket;
1054 #endif
1055 if (!so)
1056 return NULL;
1057
1058 if (TCPS_HAVERCVDSYN(tp->t_state)) {
1059 tp->t_state = TCPS_CLOSED;
1060 (void) tcp_output(tp);
1061 TCP_STATINC(TCP_STAT_DROPS);
1062 } else
1063 TCP_STATINC(TCP_STAT_CONNDROPS);
1064 if (errno == ETIMEDOUT && tp->t_softerror)
1065 errno = tp->t_softerror;
1066 so->so_error = errno;
1067 return (tcp_close(tp));
1068 }
1069
1070 /*
1071 * Close a TCP control block:
1072 * discard all space held by the tcp
1073 * discard internet protocol block
1074 * wake up any sleepers
1075 */
1076 struct tcpcb *
1077 tcp_close(struct tcpcb *tp)
1078 {
1079 struct inpcb *inp;
1080 #ifdef INET6
1081 struct in6pcb *in6p;
1082 #endif
1083 struct socket *so;
1084 #ifdef RTV_RTT
1085 struct rtentry *rt = NULL;
1086 #endif
1087 struct route *ro;
1088 int j;
1089
1090 inp = tp->t_inpcb;
1091 #ifdef INET6
1092 in6p = tp->t_in6pcb;
1093 #endif
1094 so = NULL;
1095 ro = NULL;
1096 if (inp) {
1097 so = inp->inp_socket;
1098 ro = &inp->inp_route;
1099 }
1100 #ifdef INET6
1101 else if (in6p) {
1102 so = in6p->in6p_socket;
1103 ro = (struct route *)&in6p->in6p_route;
1104 }
1105 #endif
1106
1107 #ifdef RTV_RTT
1108 /*
1109 * If we sent enough data to get some meaningful characteristics,
1110 * save them in the routing entry. 'Enough' is arbitrarily
1111 * defined as the sendpipesize (default 4K) * 16. This would
1112 * give us 16 rtt samples assuming we only get one sample per
1113 * window (the usual case on a long haul net). 16 samples is
1114 * enough for the srtt filter to converge to within 5% of the correct
1115 * value; fewer samples and we could save a very bogus rtt.
1116 *
1117 * Don't update the default route's characteristics and don't
1118 * update anything that the user "locked".
1119 */
1120 if (SEQ_LT(tp->iss + so->so_snd.sb_hiwat * 16, tp->snd_max) &&
1121 ro && (rt = rtcache_validate(ro)) != NULL &&
1122 !in_nullhost(satocsin(rt_getkey(rt))->sin_addr)) {
1123 u_long i = 0;
1124
1125 if ((rt->rt_rmx.rmx_locks & RTV_RTT) == 0) {
1126 i = tp->t_srtt *
1127 ((RTM_RTTUNIT / PR_SLOWHZ) >> (TCP_RTT_SHIFT + 2));
1128 if (rt->rt_rmx.rmx_rtt && i)
1129 /*
1130 * filter this update to half the old & half
1131 * the new values, converting scale.
1132 * See route.h and tcp_var.h for a
1133 * description of the scaling constants.
1134 */
1135 rt->rt_rmx.rmx_rtt =
1136 (rt->rt_rmx.rmx_rtt + i) / 2;
1137 else
1138 rt->rt_rmx.rmx_rtt = i;
1139 }
1140 if ((rt->rt_rmx.rmx_locks & RTV_RTTVAR) == 0) {
1141 i = tp->t_rttvar *
1142 ((RTM_RTTUNIT / PR_SLOWHZ) >> (TCP_RTTVAR_SHIFT + 2));
1143 if (rt->rt_rmx.rmx_rttvar && i)
1144 rt->rt_rmx.rmx_rttvar =
1145 (rt->rt_rmx.rmx_rttvar + i) / 2;
1146 else
1147 rt->rt_rmx.rmx_rttvar = i;
1148 }
1149 /*
1150 * update the pipelimit (ssthresh) if it has been updated
1151 * already or if a pipesize was specified & the threshold
1152 * got below half the pipesize. I.e., wait for bad news
1153 * before we start updating, then update on both good
1154 * and bad news.
1155 */
1156 if (((rt->rt_rmx.rmx_locks & RTV_SSTHRESH) == 0 &&
1157 (i = tp->snd_ssthresh) && rt->rt_rmx.rmx_ssthresh) ||
1158 i < (rt->rt_rmx.rmx_sendpipe / 2)) {
1159 /*
1160 * convert the limit from user data bytes to
1161 * packets then to packet data bytes.
1162 */
1163 i = (i + tp->t_segsz / 2) / tp->t_segsz;
1164 if (i < 2)
1165 i = 2;
1166 i *= (u_long)(tp->t_segsz + sizeof (struct tcpiphdr));
1167 if (rt->rt_rmx.rmx_ssthresh)
1168 rt->rt_rmx.rmx_ssthresh =
1169 (rt->rt_rmx.rmx_ssthresh + i) / 2;
1170 else
1171 rt->rt_rmx.rmx_ssthresh = i;
1172 }
1173 }
1174 rtcache_unref(rt, ro);
1175 #endif /* RTV_RTT */
1176 /* free the reassembly queue, if any */
1177 TCP_REASS_LOCK(tp);
1178 (void) tcp_freeq(tp);
1179 TCP_REASS_UNLOCK(tp);
1180
1181 /* free the SACK holes list. */
1182 tcp_free_sackholes(tp);
1183 tcp_congctl_release(tp);
1184 syn_cache_cleanup(tp);
1185
1186 if (tp->t_template) {
1187 m_free(tp->t_template);
1188 tp->t_template = NULL;
1189 }
1190
1191 /*
1192 * Detaching the pcb will unlock the socket/tcpcb, and stopping
1193 * the timers can also drop the lock. We need to prevent access
1194 * to the tcpcb as it's half torn down. Flag the pcb as dead
1195 * (prevents access by timers) and only then detach it.
1196 */
1197 tp->t_flags |= TF_DEAD;
1198 if (inp) {
1199 inp->inp_ppcb = 0;
1200 soisdisconnected(so);
1201 in_pcbdetach(inp);
1202 }
1203 #ifdef INET6
1204 else if (in6p) {
1205 in6p->in6p_ppcb = 0;
1206 soisdisconnected(so);
1207 in6_pcbdetach(in6p);
1208 }
1209 #endif
1210 /*
1211 * pcb is no longer visble elsewhere, so we can safely release
1212 * the lock in callout_halt() if needed.
1213 */
1214 TCP_STATINC(TCP_STAT_CLOSED);
1215 for (j = 0; j < TCPT_NTIMERS; j++) {
1216 callout_halt(&tp->t_timer[j], softnet_lock);
1217 callout_destroy(&tp->t_timer[j]);
1218 }
1219 callout_halt(&tp->t_delack_ch, softnet_lock);
1220 callout_destroy(&tp->t_delack_ch);
1221 pool_put(&tcpcb_pool, tp);
1222
1223 return NULL;
1224 }
1225
1226 int
1227 tcp_freeq(struct tcpcb *tp)
1228 {
1229 struct ipqent *qe;
1230 int rv = 0;
1231
1232 TCP_REASS_LOCK_CHECK(tp);
1233
1234 while ((qe = TAILQ_FIRST(&tp->segq)) != NULL) {
1235 TAILQ_REMOVE(&tp->segq, qe, ipqe_q);
1236 TAILQ_REMOVE(&tp->timeq, qe, ipqe_timeq);
1237 m_freem(qe->ipqe_m);
1238 tcpipqent_free(qe);
1239 rv = 1;
1240 }
1241 tp->t_segqlen = 0;
1242 KASSERT(TAILQ_EMPTY(&tp->timeq));
1243 return (rv);
1244 }
1245
1246 void
1247 tcp_fasttimo(void)
1248 {
1249 if (tcp_drainwanted) {
1250 tcp_drain();
1251 tcp_drainwanted = 0;
1252 }
1253 }
1254
1255 void
1256 tcp_drainstub(void)
1257 {
1258 tcp_drainwanted = 1;
1259 }
1260
1261 /*
1262 * Protocol drain routine. Called when memory is in short supply.
1263 * Called from pr_fasttimo thus a callout context.
1264 */
1265 void
1266 tcp_drain(void)
1267 {
1268 struct inpcb_hdr *inph;
1269 struct tcpcb *tp;
1270
1271 mutex_enter(softnet_lock);
1272 KERNEL_LOCK(1, NULL);
1273
1274 /*
1275 * Free the sequence queue of all TCP connections.
1276 */
1277 TAILQ_FOREACH(inph, &tcbtable.inpt_queue, inph_queue) {
1278 switch (inph->inph_af) {
1279 case AF_INET:
1280 tp = intotcpcb((struct inpcb *)inph);
1281 break;
1282 #ifdef INET6
1283 case AF_INET6:
1284 tp = in6totcpcb((struct in6pcb *)inph);
1285 break;
1286 #endif
1287 default:
1288 tp = NULL;
1289 break;
1290 }
1291 if (tp != NULL) {
1292 /*
1293 * If the tcpcb is already busy,
1294 * just bail out now.
1295 */
1296 if (tcp_reass_lock_try(tp) == 0)
1297 continue;
1298 if (tcp_freeq(tp))
1299 TCP_STATINC(TCP_STAT_CONNSDRAINED);
1300 TCP_REASS_UNLOCK(tp);
1301 }
1302 }
1303
1304 KERNEL_UNLOCK_ONE(NULL);
1305 mutex_exit(softnet_lock);
1306 }
1307
1308 /*
1309 * Notify a tcp user of an asynchronous error;
1310 * store error as soft error, but wake up user
1311 * (for now, won't do anything until can select for soft error).
1312 */
1313 void
1314 tcp_notify(struct inpcb *inp, int error)
1315 {
1316 struct tcpcb *tp = (struct tcpcb *)inp->inp_ppcb;
1317 struct socket *so = inp->inp_socket;
1318
1319 /*
1320 * Ignore some errors if we are hooked up.
1321 * If connection hasn't completed, has retransmitted several times,
1322 * and receives a second error, give up now. This is better
1323 * than waiting a long time to establish a connection that
1324 * can never complete.
1325 */
1326 if (tp->t_state == TCPS_ESTABLISHED &&
1327 (error == EHOSTUNREACH || error == ENETUNREACH ||
1328 error == EHOSTDOWN)) {
1329 return;
1330 } else if (TCPS_HAVEESTABLISHED(tp->t_state) == 0 &&
1331 tp->t_rxtshift > 3 && tp->t_softerror)
1332 so->so_error = error;
1333 else
1334 tp->t_softerror = error;
1335 cv_broadcast(&so->so_cv);
1336 sorwakeup(so);
1337 sowwakeup(so);
1338 }
1339
1340 #ifdef INET6
1341 void
1342 tcp6_notify(struct in6pcb *in6p, int error)
1343 {
1344 struct tcpcb *tp = (struct tcpcb *)in6p->in6p_ppcb;
1345 struct socket *so = in6p->in6p_socket;
1346
1347 /*
1348 * Ignore some errors if we are hooked up.
1349 * If connection hasn't completed, has retransmitted several times,
1350 * and receives a second error, give up now. This is better
1351 * than waiting a long time to establish a connection that
1352 * can never complete.
1353 */
1354 if (tp->t_state == TCPS_ESTABLISHED &&
1355 (error == EHOSTUNREACH || error == ENETUNREACH ||
1356 error == EHOSTDOWN)) {
1357 return;
1358 } else if (TCPS_HAVEESTABLISHED(tp->t_state) == 0 &&
1359 tp->t_rxtshift > 3 && tp->t_softerror)
1360 so->so_error = error;
1361 else
1362 tp->t_softerror = error;
1363 cv_broadcast(&so->so_cv);
1364 sorwakeup(so);
1365 sowwakeup(so);
1366 }
1367 #endif
1368
1369 #ifdef INET6
1370 void *
1371 tcp6_ctlinput(int cmd, const struct sockaddr *sa, void *d)
1372 {
1373 struct tcphdr th;
1374 void (*notify)(struct in6pcb *, int) = tcp6_notify;
1375 int nmatch;
1376 struct ip6_hdr *ip6;
1377 const struct sockaddr_in6 *sa6_src = NULL;
1378 const struct sockaddr_in6 *sa6 = (const struct sockaddr_in6 *)sa;
1379 struct mbuf *m;
1380 int off;
1381
1382 if (sa->sa_family != AF_INET6 ||
1383 sa->sa_len != sizeof(struct sockaddr_in6))
1384 return NULL;
1385 if ((unsigned)cmd >= PRC_NCMDS)
1386 return NULL;
1387 else if (cmd == PRC_QUENCH) {
1388 /*
1389 * Don't honor ICMP Source Quench messages meant for
1390 * TCP connections.
1391 */
1392 return NULL;
1393 } else if (PRC_IS_REDIRECT(cmd))
1394 notify = in6_rtchange, d = NULL;
1395 else if (cmd == PRC_MSGSIZE)
1396 ; /* special code is present, see below */
1397 else if (cmd == PRC_HOSTDEAD)
1398 d = NULL;
1399 else if (inet6ctlerrmap[cmd] == 0)
1400 return NULL;
1401
1402 /* if the parameter is from icmp6, decode it. */
1403 if (d != NULL) {
1404 struct ip6ctlparam *ip6cp = (struct ip6ctlparam *)d;
1405 m = ip6cp->ip6c_m;
1406 ip6 = ip6cp->ip6c_ip6;
1407 off = ip6cp->ip6c_off;
1408 sa6_src = ip6cp->ip6c_src;
1409 } else {
1410 m = NULL;
1411 ip6 = NULL;
1412 sa6_src = &sa6_any;
1413 off = 0;
1414 }
1415
1416 if (ip6) {
1417 /* check if we can safely examine src and dst ports */
1418 if (m->m_pkthdr.len < off + sizeof(th)) {
1419 if (cmd == PRC_MSGSIZE)
1420 icmp6_mtudisc_update((struct ip6ctlparam *)d, 0);
1421 return NULL;
1422 }
1423
1424 memset(&th, 0, sizeof(th));
1425 m_copydata(m, off, sizeof(th), (void *)&th);
1426
1427 if (cmd == PRC_MSGSIZE) {
1428 int valid = 0;
1429
1430 /*
1431 * Check to see if we have a valid TCP connection
1432 * corresponding to the address in the ICMPv6 message
1433 * payload.
1434 */
1435 if (in6_pcblookup_connect(&tcbtable, &sa6->sin6_addr,
1436 th.th_dport,
1437 (const struct in6_addr *)&sa6_src->sin6_addr,
1438 th.th_sport, 0, 0))
1439 valid++;
1440
1441 /*
1442 * Depending on the value of "valid" and routing table
1443 * size (mtudisc_{hi,lo}wat), we will:
1444 * - recalcurate the new MTU and create the
1445 * corresponding routing entry, or
1446 * - ignore the MTU change notification.
1447 */
1448 icmp6_mtudisc_update((struct ip6ctlparam *)d, valid);
1449
1450 /*
1451 * no need to call in6_pcbnotify, it should have been
1452 * called via callback if necessary
1453 */
1454 return NULL;
1455 }
1456
1457 nmatch = in6_pcbnotify(&tcbtable, sa, th.th_dport,
1458 (const struct sockaddr *)sa6_src, th.th_sport, cmd, NULL, notify);
1459 if (nmatch == 0 && syn_cache_count &&
1460 (inet6ctlerrmap[cmd] == EHOSTUNREACH ||
1461 inet6ctlerrmap[cmd] == ENETUNREACH ||
1462 inet6ctlerrmap[cmd] == EHOSTDOWN))
1463 syn_cache_unreach((const struct sockaddr *)sa6_src,
1464 sa, &th);
1465 } else {
1466 (void) in6_pcbnotify(&tcbtable, sa, 0,
1467 (const struct sockaddr *)sa6_src, 0, cmd, NULL, notify);
1468 }
1469
1470 return NULL;
1471 }
1472 #endif
1473
1474 /* assumes that ip header and tcp header are contiguous on mbuf */
1475 void *
1476 tcp_ctlinput(int cmd, const struct sockaddr *sa, void *v)
1477 {
1478 struct ip *ip = v;
1479 struct tcphdr *th;
1480 struct icmp *icp;
1481 extern const int inetctlerrmap[];
1482 void (*notify)(struct inpcb *, int) = tcp_notify;
1483 int errno;
1484 int nmatch;
1485 struct tcpcb *tp;
1486 u_int mtu;
1487 tcp_seq seq;
1488 struct inpcb *inp;
1489 #ifdef INET6
1490 struct in6pcb *in6p;
1491 struct in6_addr src6, dst6;
1492 #endif
1493
1494 if (sa->sa_family != AF_INET ||
1495 sa->sa_len != sizeof(struct sockaddr_in))
1496 return NULL;
1497 if ((unsigned)cmd >= PRC_NCMDS)
1498 return NULL;
1499 errno = inetctlerrmap[cmd];
1500 if (cmd == PRC_QUENCH)
1501 /*
1502 * Don't honor ICMP Source Quench messages meant for
1503 * TCP connections.
1504 */
1505 return NULL;
1506 else if (PRC_IS_REDIRECT(cmd))
1507 notify = in_rtchange, ip = 0;
1508 else if (cmd == PRC_MSGSIZE && ip && ip->ip_v == 4) {
1509 /*
1510 * Check to see if we have a valid TCP connection
1511 * corresponding to the address in the ICMP message
1512 * payload.
1513 *
1514 * Boundary check is made in icmp_input(), with ICMP_ADVLENMIN.
1515 */
1516 th = (struct tcphdr *)((char *)ip + (ip->ip_hl << 2));
1517 #ifdef INET6
1518 in6_in_2_v4mapin6(&ip->ip_src, &src6);
1519 in6_in_2_v4mapin6(&ip->ip_dst, &dst6);
1520 #endif
1521 if ((inp = in_pcblookup_connect(&tcbtable, ip->ip_dst,
1522 th->th_dport, ip->ip_src, th->th_sport, 0)) != NULL)
1523 #ifdef INET6
1524 in6p = NULL;
1525 #else
1526 ;
1527 #endif
1528 #ifdef INET6
1529 else if ((in6p = in6_pcblookup_connect(&tcbtable, &dst6,
1530 th->th_dport, &src6, th->th_sport, 0, 0)) != NULL)
1531 ;
1532 #endif
1533 else
1534 return NULL;
1535
1536 /*
1537 * Now that we've validated that we are actually communicating
1538 * with the host indicated in the ICMP message, locate the
1539 * ICMP header, recalculate the new MTU, and create the
1540 * corresponding routing entry.
1541 */
1542 icp = (struct icmp *)((char *)ip -
1543 offsetof(struct icmp, icmp_ip));
1544 if (inp) {
1545 if ((tp = intotcpcb(inp)) == NULL)
1546 return NULL;
1547 }
1548 #ifdef INET6
1549 else if (in6p) {
1550 if ((tp = in6totcpcb(in6p)) == NULL)
1551 return NULL;
1552 }
1553 #endif
1554 else
1555 return NULL;
1556 seq = ntohl(th->th_seq);
1557 if (SEQ_LT(seq, tp->snd_una) || SEQ_GT(seq, tp->snd_max))
1558 return NULL;
1559 /*
1560 * If the ICMP message advertises a Next-Hop MTU
1561 * equal or larger than the maximum packet size we have
1562 * ever sent, drop the message.
1563 */
1564 mtu = (u_int)ntohs(icp->icmp_nextmtu);
1565 if (mtu >= tp->t_pmtud_mtu_sent)
1566 return NULL;
1567 if (mtu >= tcp_hdrsz(tp) + tp->t_pmtud_mss_acked) {
1568 /*
1569 * Calculate new MTU, and create corresponding
1570 * route (traditional PMTUD).
1571 */
1572 tp->t_flags &= ~TF_PMTUD_PEND;
1573 icmp_mtudisc(icp, ip->ip_dst);
1574 } else {
1575 /*
1576 * Record the information got in the ICMP
1577 * message; act on it later.
1578 * If we had already recorded an ICMP message,
1579 * replace the old one only if the new message
1580 * refers to an older TCP segment
1581 */
1582 if (tp->t_flags & TF_PMTUD_PEND) {
1583 if (SEQ_LT(tp->t_pmtud_th_seq, seq))
1584 return NULL;
1585 } else
1586 tp->t_flags |= TF_PMTUD_PEND;
1587 tp->t_pmtud_th_seq = seq;
1588 tp->t_pmtud_nextmtu = icp->icmp_nextmtu;
1589 tp->t_pmtud_ip_len = icp->icmp_ip.ip_len;
1590 tp->t_pmtud_ip_hl = icp->icmp_ip.ip_hl;
1591 }
1592 return NULL;
1593 } else if (cmd == PRC_HOSTDEAD)
1594 ip = 0;
1595 else if (errno == 0)
1596 return NULL;
1597 if (ip && ip->ip_v == 4 && sa->sa_family == AF_INET) {
1598 th = (struct tcphdr *)((char *)ip + (ip->ip_hl << 2));
1599 nmatch = in_pcbnotify(&tcbtable, satocsin(sa)->sin_addr,
1600 th->th_dport, ip->ip_src, th->th_sport, errno, notify);
1601 if (nmatch == 0 && syn_cache_count &&
1602 (inetctlerrmap[cmd] == EHOSTUNREACH ||
1603 inetctlerrmap[cmd] == ENETUNREACH ||
1604 inetctlerrmap[cmd] == EHOSTDOWN)) {
1605 struct sockaddr_in sin;
1606 memset(&sin, 0, sizeof(sin));
1607 sin.sin_len = sizeof(sin);
1608 sin.sin_family = AF_INET;
1609 sin.sin_port = th->th_sport;
1610 sin.sin_addr = ip->ip_src;
1611 syn_cache_unreach((struct sockaddr *)&sin, sa, th);
1612 }
1613
1614 /* XXX mapped address case */
1615 } else
1616 in_pcbnotifyall(&tcbtable, satocsin(sa)->sin_addr, errno,
1617 notify);
1618 return NULL;
1619 }
1620
1621 /*
1622 * When a source quench is received, we are being notified of congestion.
1623 * Close the congestion window down to the Loss Window (one segment).
1624 * We will gradually open it again as we proceed.
1625 */
1626 void
1627 tcp_quench(struct inpcb *inp)
1628 {
1629 struct tcpcb *tp = intotcpcb(inp);
1630
1631 if (tp) {
1632 tp->snd_cwnd = tp->t_segsz;
1633 tp->t_bytes_acked = 0;
1634 }
1635 }
1636
1637 #ifdef INET6
1638 void
1639 tcp6_quench(struct in6pcb *in6p)
1640 {
1641 struct tcpcb *tp = in6totcpcb(in6p);
1642
1643 if (tp) {
1644 tp->snd_cwnd = tp->t_segsz;
1645 tp->t_bytes_acked = 0;
1646 }
1647 }
1648 #endif
1649
1650 /*
1651 * Path MTU Discovery handlers.
1652 */
1653 void
1654 tcp_mtudisc_callback(struct in_addr faddr)
1655 {
1656 #ifdef INET6
1657 struct in6_addr in6;
1658 #endif
1659
1660 in_pcbnotifyall(&tcbtable, faddr, EMSGSIZE, tcp_mtudisc);
1661 #ifdef INET6
1662 in6_in_2_v4mapin6(&faddr, &in6);
1663 tcp6_mtudisc_callback(&in6);
1664 #endif
1665 }
1666
1667 /*
1668 * On receipt of path MTU corrections, flush old route and replace it
1669 * with the new one. Retransmit all unacknowledged packets, to ensure
1670 * that all packets will be received.
1671 */
1672 void
1673 tcp_mtudisc(struct inpcb *inp, int errno)
1674 {
1675 struct tcpcb *tp = intotcpcb(inp);
1676 struct rtentry *rt;
1677
1678 if (tp == NULL)
1679 return;
1680
1681 rt = in_pcbrtentry(inp);
1682 if (rt != NULL) {
1683 /*
1684 * If this was not a host route, remove and realloc.
1685 */
1686 if ((rt->rt_flags & RTF_HOST) == 0) {
1687 in_pcbrtentry_unref(rt, inp);
1688 in_rtchange(inp, errno);
1689 if ((rt = in_pcbrtentry(inp)) == NULL)
1690 return;
1691 }
1692
1693 /*
1694 * Slow start out of the error condition. We
1695 * use the MTU because we know it's smaller
1696 * than the previously transmitted segment.
1697 *
1698 * Note: This is more conservative than the
1699 * suggestion in draft-floyd-incr-init-win-03.
1700 */
1701 if (rt->rt_rmx.rmx_mtu != 0)
1702 tp->snd_cwnd =
1703 TCP_INITIAL_WINDOW(tcp_init_win,
1704 rt->rt_rmx.rmx_mtu);
1705 in_pcbrtentry_unref(rt, inp);
1706 }
1707
1708 /*
1709 * Resend unacknowledged packets.
1710 */
1711 tp->snd_nxt = tp->sack_newdata = tp->snd_una;
1712 tcp_output(tp);
1713 }
1714
1715 #ifdef INET6
1716 /*
1717 * Path MTU Discovery handlers.
1718 */
1719 void
1720 tcp6_mtudisc_callback(struct in6_addr *faddr)
1721 {
1722 struct sockaddr_in6 sin6;
1723
1724 memset(&sin6, 0, sizeof(sin6));
1725 sin6.sin6_family = AF_INET6;
1726 sin6.sin6_len = sizeof(struct sockaddr_in6);
1727 sin6.sin6_addr = *faddr;
1728 (void) in6_pcbnotify(&tcbtable, (struct sockaddr *)&sin6, 0,
1729 (const struct sockaddr *)&sa6_any, 0, PRC_MSGSIZE, NULL, tcp6_mtudisc);
1730 }
1731
1732 void
1733 tcp6_mtudisc(struct in6pcb *in6p, int errno)
1734 {
1735 struct tcpcb *tp = in6totcpcb(in6p);
1736 struct rtentry *rt;
1737
1738 if (tp == NULL)
1739 return;
1740
1741 rt = in6_pcbrtentry(in6p);
1742 if (rt != NULL) {
1743 /*
1744 * If this was not a host route, remove and realloc.
1745 */
1746 if ((rt->rt_flags & RTF_HOST) == 0) {
1747 in6_pcbrtentry_unref(rt, in6p);
1748 in6_rtchange(in6p, errno);
1749 rt = in6_pcbrtentry(in6p);
1750 if (rt == NULL)
1751 return;
1752 }
1753
1754 /*
1755 * Slow start out of the error condition. We
1756 * use the MTU because we know it's smaller
1757 * than the previously transmitted segment.
1758 *
1759 * Note: This is more conservative than the
1760 * suggestion in draft-floyd-incr-init-win-03.
1761 */
1762 if (rt->rt_rmx.rmx_mtu != 0) {
1763 tp->snd_cwnd = TCP_INITIAL_WINDOW(tcp_init_win,
1764 rt->rt_rmx.rmx_mtu);
1765 }
1766 in6_pcbrtentry_unref(rt, in6p);
1767 }
1768
1769 /*
1770 * Resend unacknowledged packets.
1771 */
1772 tp->snd_nxt = tp->sack_newdata = tp->snd_una;
1773 tcp_output(tp);
1774 }
1775 #endif /* INET6 */
1776
1777 /*
1778 * Compute the MSS to advertise to the peer. Called only during
1779 * the 3-way handshake. If we are the server (peer initiated
1780 * connection), we are called with a pointer to the interface
1781 * on which the SYN packet arrived. If we are the client (we
1782 * initiated connection), we are called with a pointer to the
1783 * interface out which this connection should go.
1784 *
1785 * NOTE: Do not subtract IP option/extension header size nor IPsec
1786 * header size from MSS advertisement. MSS option must hold the maximum
1787 * segment size we can accept, so it must always be:
1788 * max(if mtu) - ip header - tcp header
1789 */
1790 u_long
1791 tcp_mss_to_advertise(const struct ifnet *ifp, int af)
1792 {
1793 extern u_long in_maxmtu;
1794 u_long mss = 0;
1795 u_long hdrsiz;
1796
1797 /*
1798 * In order to avoid defeating path MTU discovery on the peer,
1799 * we advertise the max MTU of all attached networks as our MSS,
1800 * per RFC 1191, section 3.1.
1801 *
1802 * We provide the option to advertise just the MTU of
1803 * the interface on which we hope this connection will
1804 * be receiving. If we are responding to a SYN, we
1805 * will have a pretty good idea about this, but when
1806 * initiating a connection there is a bit more doubt.
1807 *
1808 * We also need to ensure that loopback has a large enough
1809 * MSS, as the loopback MTU is never included in in_maxmtu.
1810 */
1811
1812 if (ifp != NULL)
1813 switch (af) {
1814 #ifdef INET6
1815 case AF_INET6: /* FALLTHROUGH */
1816 #endif
1817 case AF_INET:
1818 mss = ifp->if_mtu;
1819 break;
1820 }
1821
1822 if (tcp_mss_ifmtu == 0)
1823 switch (af) {
1824 #ifdef INET6
1825 case AF_INET6: /* FALLTHROUGH */
1826 #endif
1827 case AF_INET:
1828 mss = uimax(in_maxmtu, mss);
1829 break;
1830 }
1831
1832 switch (af) {
1833 case AF_INET:
1834 hdrsiz = sizeof(struct ip);
1835 break;
1836 #ifdef INET6
1837 case AF_INET6:
1838 hdrsiz = sizeof(struct ip6_hdr);
1839 break;
1840 #endif
1841 default:
1842 hdrsiz = 0;
1843 break;
1844 }
1845 hdrsiz += sizeof(struct tcphdr);
1846 if (mss > hdrsiz)
1847 mss -= hdrsiz;
1848
1849 mss = uimax(tcp_mssdflt, mss);
1850 return (mss);
1851 }
1852
1853 /*
1854 * Set connection variables based on the peer's advertised MSS.
1855 * We are passed the TCPCB for the actual connection. If we
1856 * are the server, we are called by the compressed state engine
1857 * when the 3-way handshake is complete. If we are the client,
1858 * we are called when we receive the SYN,ACK from the server.
1859 *
1860 * NOTE: Our advertised MSS value must be initialized in the TCPCB
1861 * before this routine is called!
1862 */
1863 void
1864 tcp_mss_from_peer(struct tcpcb *tp, int offer)
1865 {
1866 struct socket *so;
1867 #if defined(RTV_SPIPE) || defined(RTV_SSTHRESH)
1868 struct rtentry *rt;
1869 #endif
1870 u_long bufsize;
1871 int mss;
1872
1873 KASSERT(!(tp->t_inpcb && tp->t_in6pcb));
1874
1875 so = NULL;
1876 rt = NULL;
1877
1878 if (tp->t_inpcb) {
1879 so = tp->t_inpcb->inp_socket;
1880 #if defined(RTV_SPIPE) || defined(RTV_SSTHRESH)
1881 rt = in_pcbrtentry(tp->t_inpcb);
1882 #endif
1883 }
1884
1885 #ifdef INET6
1886 if (tp->t_in6pcb) {
1887 so = tp->t_in6pcb->in6p_socket;
1888 #if defined(RTV_SPIPE) || defined(RTV_SSTHRESH)
1889 rt = in6_pcbrtentry(tp->t_in6pcb);
1890 #endif
1891 }
1892 #endif
1893
1894 /*
1895 * As per RFC1122, use the default MSS value, unless they
1896 * sent us an offer. Do not accept offers less than 256 bytes.
1897 */
1898 mss = tcp_mssdflt;
1899 if (offer)
1900 mss = offer;
1901 mss = uimax(mss, 256); /* sanity */
1902 tp->t_peermss = mss;
1903 mss -= tcp_optlen(tp);
1904 if (tp->t_inpcb)
1905 mss -= ip_optlen(tp->t_inpcb);
1906 #ifdef INET6
1907 if (tp->t_in6pcb)
1908 mss -= ip6_optlen(tp->t_in6pcb);
1909 #endif
1910 /*
1911 * XXX XXX What if mss goes negative or zero? This can happen if a
1912 * socket has large IPv6 options. We crash below.
1913 */
1914
1915 /*
1916 * If there's a pipesize, change the socket buffer to that size.
1917 * Make the socket buffer an integral number of MSS units. If
1918 * the MSS is larger than the socket buffer, artificially decrease
1919 * the MSS.
1920 */
1921 #ifdef RTV_SPIPE
1922 if (rt != NULL && rt->rt_rmx.rmx_sendpipe != 0)
1923 bufsize = rt->rt_rmx.rmx_sendpipe;
1924 else
1925 #endif
1926 {
1927 KASSERT(so != NULL);
1928 bufsize = so->so_snd.sb_hiwat;
1929 }
1930 if (bufsize < mss)
1931 mss = bufsize;
1932 else {
1933 bufsize = roundup(bufsize, mss);
1934 if (bufsize > sb_max)
1935 bufsize = sb_max;
1936 (void) sbreserve(&so->so_snd, bufsize, so);
1937 }
1938 tp->t_segsz = mss;
1939
1940 #ifdef RTV_SSTHRESH
1941 if (rt != NULL && rt->rt_rmx.rmx_ssthresh) {
1942 /*
1943 * There's some sort of gateway or interface buffer
1944 * limit on the path. Use this to set the slow
1945 * start threshold, but set the threshold to no less
1946 * than 2 * MSS.
1947 */
1948 tp->snd_ssthresh = uimax(2 * mss, rt->rt_rmx.rmx_ssthresh);
1949 }
1950 #endif
1951 #if defined(RTV_SPIPE) || defined(RTV_SSTHRESH)
1952 if (tp->t_inpcb)
1953 in_pcbrtentry_unref(rt, tp->t_inpcb);
1954 #ifdef INET6
1955 if (tp->t_in6pcb)
1956 in6_pcbrtentry_unref(rt, tp->t_in6pcb);
1957 #endif
1958 #endif
1959 }
1960
1961 /*
1962 * Processing necessary when a TCP connection is established.
1963 */
1964 void
1965 tcp_established(struct tcpcb *tp)
1966 {
1967 struct socket *so;
1968 #ifdef RTV_RPIPE
1969 struct rtentry *rt;
1970 #endif
1971 u_long bufsize;
1972
1973 KASSERT(!(tp->t_inpcb && tp->t_in6pcb));
1974
1975 so = NULL;
1976 rt = NULL;
1977
1978 /* This is a while() to reduce the dreadful stairstepping below */
1979 while (tp->t_inpcb) {
1980 so = tp->t_inpcb->inp_socket;
1981 #if defined(RTV_RPIPE)
1982 rt = in_pcbrtentry(tp->t_inpcb);
1983 #endif
1984 if (__predict_true(tcp_msl_enable)) {
1985 if (tp->t_inpcb->inp_laddr.s_addr == INADDR_LOOPBACK) {
1986 tp->t_msl = tcp_msl_loop ? tcp_msl_loop : (TCPTV_MSL >> 2);
1987 break;
1988 }
1989
1990 if (__predict_false(tcp_rttlocal)) {
1991 /* This may be adjusted by tcp_input */
1992 tp->t_msl = tcp_msl_local ? tcp_msl_local : (TCPTV_MSL >> 1);
1993 break;
1994 }
1995 if (in_localaddr(tp->t_inpcb->inp_faddr)) {
1996 tp->t_msl = tcp_msl_local ? tcp_msl_local : (TCPTV_MSL >> 1);
1997 break;
1998 }
1999 }
2000 tp->t_msl = tcp_msl_remote ? tcp_msl_remote : TCPTV_MSL;
2001 break;
2002 }
2003
2004 /* Clamp to a reasonable range. */
2005 tp->t_msl = MIN(tp->t_msl, TCP_MAXMSL);
2006
2007 #ifdef INET6
2008 /* The !tp->t_inpcb lets the compiler know it can't be v4 *and* v6 */
2009 while (!tp->t_inpcb && tp->t_in6pcb) {
2010 so = tp->t_in6pcb->in6p_socket;
2011 #if defined(RTV_RPIPE)
2012 rt = in6_pcbrtentry(tp->t_in6pcb);
2013 #endif
2014 if (__predict_true(tcp_msl_enable)) {
2015 extern const struct in6_addr in6addr_loopback;
2016
2017 if (IN6_ARE_ADDR_EQUAL(&tp->t_in6pcb->in6p_laddr,
2018 &in6addr_loopback)) {
2019 tp->t_msl = tcp_msl_loop ? tcp_msl_loop : (TCPTV_MSL >> 2);
2020 break;
2021 }
2022
2023 if (__predict_false(tcp_rttlocal)) {
2024 /* This may be adjusted by tcp_input */
2025 tp->t_msl = tcp_msl_local ? tcp_msl_local : (TCPTV_MSL >> 1);
2026 break;
2027 }
2028 if (in6_localaddr(&tp->t_in6pcb->in6p_faddr)) {
2029 tp->t_msl = tcp_msl_local ? tcp_msl_local : (TCPTV_MSL >> 1);
2030 break;
2031 }
2032 }
2033 tp->t_msl = tcp_msl_remote ? tcp_msl_remote : TCPTV_MSL;
2034 break;
2035 }
2036
2037 /* Clamp to a reasonable range. */
2038 tp->t_msl = MIN(tp->t_msl, TCP_MAXMSL);
2039 #endif
2040
2041 tp->t_state = TCPS_ESTABLISHED;
2042 TCP_TIMER_ARM(tp, TCPT_KEEP, tp->t_keepidle);
2043
2044 #ifdef RTV_RPIPE
2045 if (rt != NULL && rt->rt_rmx.rmx_recvpipe != 0)
2046 bufsize = rt->rt_rmx.rmx_recvpipe;
2047 else
2048 #endif
2049 {
2050 KASSERT(so != NULL);
2051 bufsize = so->so_rcv.sb_hiwat;
2052 }
2053 if (bufsize > tp->t_ourmss) {
2054 bufsize = roundup(bufsize, tp->t_ourmss);
2055 if (bufsize > sb_max)
2056 bufsize = sb_max;
2057 (void) sbreserve(&so->so_rcv, bufsize, so);
2058 }
2059 #ifdef RTV_RPIPE
2060 if (tp->t_inpcb)
2061 in_pcbrtentry_unref(rt, tp->t_inpcb);
2062 #ifdef INET6
2063 if (tp->t_in6pcb)
2064 in6_pcbrtentry_unref(rt, tp->t_in6pcb);
2065 #endif
2066 #endif
2067 }
2068
/*
 * Check if there's an initial rtt or rttvar.  Convert from the
 * route-table units to scaled multiples of the slow timeout timer.
 * Called only during the 3-way handshake.
 */
void
tcp_rmx_rtt(struct tcpcb *tp)
{
#ifdef RTV_RTT
	struct rtentry *rt = NULL;
	int rtt;

	KASSERT(!(tp->t_inpcb && tp->t_in6pcb));

	if (tp->t_inpcb != NULL)
		rt = in_pcbrtentry(tp->t_inpcb);
#ifdef INET6
	if (tp->t_in6pcb != NULL)
		rt = in6_pcbrtentry(tp->t_in6pcb);
#endif
	if (rt == NULL)
		return;

	/* Only seed from the route if no RTT has been measured yet. */
	rtt = rt->rt_rmx.rmx_rtt;
	if (tp->t_srtt == 0 && rtt != 0) {
		/*
		 * XXX A locked RTT metric is also used as the
		 * minimum value; this is subject to time.
		 */
		if ((rt->rt_rmx.rmx_locks & RTV_RTT) != 0) {
			TCPT_RANGESET(tp->t_rttmin,
			    rtt / (RTM_RTTUNIT / PR_SLOWHZ),
			    TCPTV_MIN, TCPTV_REXMTMAX);
		}
		tp->t_srtt = rtt /
		    ((RTM_RTTUNIT / PR_SLOWHZ) >> (TCP_RTT_SHIFT + 2));

		if (rt->rt_rmx.rmx_rttvar == 0) {
			/* Default variation is +- 1 rtt */
			tp->t_rttvar =
			    tp->t_srtt >> (TCP_RTT_SHIFT - TCP_RTTVAR_SHIFT);
		} else {
			tp->t_rttvar = rt->rt_rmx.rmx_rttvar /
			    ((RTM_RTTUNIT / PR_SLOWHZ) >>
			    (TCP_RTTVAR_SHIFT + 2));
		}

		TCPT_RANGESET(tp->t_rxtcur,
		    ((tp->t_srtt >> 2) + tp->t_rttvar) >> (1 + 2),
		    tp->t_rttmin, TCPTV_REXMTMAX);
	}

	/* Drop the route reference taken above. */
	if (tp->t_inpcb != NULL)
		in_pcbrtentry_unref(rt, tp->t_inpcb);
#ifdef INET6
	if (tp->t_in6pcb != NULL)
		in6_pcbrtentry_unref(rt, tp->t_in6pcb);
#endif
#endif
}
2124
2125 tcp_seq tcp_iss_seq = 0; /* tcp initial seq # */
2126
2127 /*
2128 * Get a new sequence value given a tcp control block
2129 */
2130 tcp_seq
2131 tcp_new_iss(struct tcpcb *tp)
2132 {
2133
2134 if (tp->t_inpcb != NULL) {
2135 return tcp_new_iss1(&tp->t_inpcb->inp_laddr,
2136 &tp->t_inpcb->inp_faddr, tp->t_inpcb->inp_lport,
2137 tp->t_inpcb->inp_fport, sizeof(tp->t_inpcb->inp_laddr));
2138 }
2139 #ifdef INET6
2140 if (tp->t_in6pcb != NULL) {
2141 return tcp_new_iss1(&tp->t_in6pcb->in6p_laddr,
2142 &tp->t_in6pcb->in6p_faddr, tp->t_in6pcb->in6p_lport,
2143 tp->t_in6pcb->in6p_fport, sizeof(tp->t_in6pcb->in6p_laddr));
2144 }
2145 #endif
2146
2147 panic("tcp_new_iss: unreachable");
2148 }
2149
2150 static u_int8_t tcp_iss_secret[16]; /* 128 bits; should be plenty */
2151
2152 /*
2153 * Initialize RFC 1948 ISS Secret
2154 */
2155 static int
2156 tcp_iss_secret_init(void)
2157 {
2158 cprng_strong(kern_cprng,
2159 tcp_iss_secret, sizeof(tcp_iss_secret), 0);
2160
2161 return 0;
2162 }
2163
2164 /*
2165 * This routine actually generates a new TCP initial sequence number.
2166 */
2167 tcp_seq
2168 tcp_new_iss1(void *laddr, void *faddr, u_int16_t lport, u_int16_t fport,
2169 size_t addrsz)
2170 {
2171 tcp_seq tcp_iss;
2172
2173 if (tcp_do_rfc1948) {
2174 MD5_CTX ctx;
2175 u_int8_t hash[16]; /* XXX MD5 knowledge */
2176 static ONCE_DECL(tcp_iss_secret_control);
2177
2178 /*
2179 * If we haven't been here before, initialize our cryptographic
2180 * hash secret.
2181 */
2182 RUN_ONCE(&tcp_iss_secret_control, tcp_iss_secret_init);
2183
2184 /*
2185 * Compute the base value of the ISS. It is a hash
2186 * of (saddr, sport, daddr, dport, secret).
2187 */
2188 MD5Init(&ctx);
2189
2190 MD5Update(&ctx, (u_char *) laddr, addrsz);
2191 MD5Update(&ctx, (u_char *) &lport, sizeof(lport));
2192
2193 MD5Update(&ctx, (u_char *) faddr, addrsz);
2194 MD5Update(&ctx, (u_char *) &fport, sizeof(fport));
2195
2196 MD5Update(&ctx, tcp_iss_secret, sizeof(tcp_iss_secret));
2197
2198 MD5Final(hash, &ctx);
2199
2200 memcpy(&tcp_iss, hash, sizeof(tcp_iss));
2201
2202 #ifdef TCPISS_DEBUG
2203 printf("ISS hash 0x%08x, ", tcp_iss);
2204 #endif
2205 } else {
2206 /*
2207 * Randomize.
2208 */
2209 tcp_iss = cprng_fast32() & TCP_ISS_RANDOM_MASK;
2210 #ifdef TCPISS_DEBUG
2211 printf("ISS random 0x%08x, ", tcp_iss);
2212 #endif
2213 }
2214
2215 /*
2216 * Add the offset in to the computed value.
2217 */
2218 tcp_iss += tcp_iss_seq;
2219 #ifdef TCPISS_DEBUG
2220 printf("ISS %08x\n", tcp_iss);
2221 #endif
2222 return tcp_iss;
2223 }
2224
2225 #if defined(IPSEC)
2226 /* compute ESP/AH header size for TCP, including outer IP header. */
2227 size_t
2228 ipsec4_hdrsiz_tcp(struct tcpcb *tp)
2229 {
2230 struct inpcb *inp;
2231 size_t hdrsiz;
2232
2233 /* XXX mapped addr case (tp->t_in6pcb) */
2234 if (!tp || !tp->t_template || !(inp = tp->t_inpcb))
2235 return 0;
2236 switch (tp->t_family) {
2237 case AF_INET:
2238 /* XXX: should use correct direction. */
2239 hdrsiz = ipsec_hdrsiz(tp->t_template, IPSEC_DIR_OUTBOUND, inp);
2240 break;
2241 default:
2242 hdrsiz = 0;
2243 break;
2244 }
2245
2246 return hdrsiz;
2247 }
2248
2249 #ifdef INET6
2250 size_t
2251 ipsec6_hdrsiz_tcp(struct tcpcb *tp)
2252 {
2253 struct in6pcb *in6p;
2254 size_t hdrsiz;
2255
2256 if (!tp || !tp->t_template || !(in6p = tp->t_in6pcb))
2257 return 0;
2258 switch (tp->t_family) {
2259 case AF_INET6:
2260 /* XXX: should use correct direction. */
2261 hdrsiz = ipsec_hdrsiz(tp->t_template, IPSEC_DIR_OUTBOUND, in6p);
2262 break;
2263 case AF_INET:
2264 /* mapped address case - tricky */
2265 default:
2266 hdrsiz = 0;
2267 break;
2268 }
2269
2270 return hdrsiz;
2271 }
2272 #endif
2273 #endif /*IPSEC*/
2274
2275 /*
2276 * Determine the length of the TCP options for this connection.
2277 *
2278 * XXX: What do we do for SACK, when we add that? Just reserve
2279 * all of the space? Otherwise we can't exactly be incrementing
2280 * cwnd by an amount that varies depending on the amount we last
2281 * had to SACK!
2282 */
2283
2284 u_int
2285 tcp_optlen(struct tcpcb *tp)
2286 {
2287 u_int optlen;
2288
2289 optlen = 0;
2290 if ((tp->t_flags & (TF_REQ_TSTMP|TF_RCVD_TSTMP|TF_NOOPT)) ==
2291 (TF_REQ_TSTMP | TF_RCVD_TSTMP))
2292 optlen += TCPOLEN_TSTAMP_APPA;
2293
2294 #ifdef TCP_SIGNATURE
2295 if (tp->t_flags & TF_SIGNATURE)
2296 optlen += TCPOLEN_SIGLEN;
2297 #endif
2298
2299 return optlen;
2300 }
2301
2302 u_int
2303 tcp_hdrsz(struct tcpcb *tp)
2304 {
2305 u_int hlen;
2306
2307 switch (tp->t_family) {
2308 #ifdef INET6
2309 case AF_INET6:
2310 hlen = sizeof(struct ip6_hdr);
2311 break;
2312 #endif
2313 case AF_INET:
2314 hlen = sizeof(struct ip);
2315 break;
2316 default:
2317 hlen = 0;
2318 break;
2319 }
2320 hlen += sizeof(struct tcphdr);
2321
2322 if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP &&
2323 (tp->t_flags & TF_RCVD_TSTMP) == TF_RCVD_TSTMP)
2324 hlen += TCPOLEN_TSTAMP_APPA;
2325 #ifdef TCP_SIGNATURE
2326 if (tp->t_flags & TF_SIGNATURE)
2327 hlen += TCPOLEN_SIGLEN;
2328 #endif
2329 return hlen;
2330 }
2331
2332 void
2333 tcp_statinc(u_int stat)
2334 {
2335
2336 KASSERT(stat < TCP_NSTATS);
2337 TCP_STATINC(stat);
2338 }
2339
2340 void
2341 tcp_statadd(u_int stat, uint64_t val)
2342 {
2343
2344 KASSERT(stat < TCP_NSTATS);
2345 TCP_STATADD(stat, val);
2346 }
2347