1 1.441 rin /* $NetBSD: tcp_input.c,v 1.441 2024/10/08 06:17:14 rin Exp $ */ 2 1.83 itojun 3 1.83 itojun /* 4 1.83 itojun * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. 5 1.83 itojun * All rights reserved. 6 1.143 itojun * 7 1.83 itojun * Redistribution and use in source and binary forms, with or without 8 1.83 itojun * modification, are permitted provided that the following conditions 9 1.83 itojun * are met: 10 1.83 itojun * 1. Redistributions of source code must retain the above copyright 11 1.83 itojun * notice, this list of conditions and the following disclaimer. 12 1.83 itojun * 2. Redistributions in binary form must reproduce the above copyright 13 1.83 itojun * notice, this list of conditions and the following disclaimer in the 14 1.83 itojun * documentation and/or other materials provided with the distribution. 15 1.83 itojun * 3. Neither the name of the project nor the names of its contributors 16 1.83 itojun * may be used to endorse or promote products derived from this software 17 1.83 itojun * without specific prior written permission. 18 1.143 itojun * 19 1.83 itojun * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND 20 1.83 itojun * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 21 1.83 itojun * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 22 1.83 itojun * ARE DISCLAIMED. 
IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE 23 1.83 itojun * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 24 1.83 itojun * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 25 1.83 itojun * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 26 1.83 itojun * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 27 1.83 itojun * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 28 1.83 itojun * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 29 1.83 itojun * SUCH DAMAGE. 30 1.83 itojun */ 31 1.44 thorpej 32 1.134 itojun /* 33 1.134 itojun * @(#)COPYRIGHT 1.1 (NRL) 17 January 1995 34 1.143 itojun * 35 1.134 itojun * NRL grants permission for redistribution and use in source and binary 36 1.134 itojun * forms, with or without modification, of the software and documentation 37 1.134 itojun * created at NRL provided that the following conditions are met: 38 1.143 itojun * 39 1.134 itojun * 1. Redistributions of source code must retain the above copyright 40 1.134 itojun * notice, this list of conditions and the following disclaimer. 41 1.134 itojun * 2. Redistributions in binary form must reproduce the above copyright 42 1.134 itojun * notice, this list of conditions and the following disclaimer in the 43 1.134 itojun * documentation and/or other materials provided with the distribution. 44 1.134 itojun * 3. All advertising materials mentioning features or use of this software 45 1.134 itojun * must display the following acknowledgements: 46 1.134 itojun * This product includes software developed by the University of 47 1.134 itojun * California, Berkeley and its contributors. 48 1.134 itojun * This product includes software developed at the Information 49 1.134 itojun * Technology Division, US Naval Research Laboratory. 50 1.134 itojun * 4. 
Neither the name of the NRL nor the names of its contributors 51 1.134 itojun * may be used to endorse or promote products derived from this software 52 1.134 itojun * without specific prior written permission. 53 1.143 itojun * 54 1.134 itojun * THE SOFTWARE PROVIDED BY NRL IS PROVIDED BY NRL AND CONTRIBUTORS ``AS 55 1.134 itojun * IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 56 1.134 itojun * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A 57 1.134 itojun * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NRL OR 58 1.134 itojun * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 59 1.134 itojun * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 60 1.134 itojun * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 61 1.134 itojun * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 62 1.134 itojun * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 63 1.134 itojun * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 64 1.134 itojun * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 65 1.143 itojun * 66 1.134 itojun * The views and conclusions contained in the software and documentation 67 1.134 itojun * are those of the authors and should not be interpreted as representing 68 1.134 itojun * official policies, either expressed or implied, of the US Naval 69 1.134 itojun * Research Laboratory (NRL). 70 1.134 itojun */ 71 1.134 itojun 72 1.44 thorpej /*- 73 1.312 dyoung * Copyright (c) 1997, 1998, 1999, 2001, 2005, 2006, 74 1.312 dyoung * 2011 The NetBSD Foundation, Inc. 75 1.44 thorpej * All rights reserved. 76 1.44 thorpej * 77 1.44 thorpej * This code is derived from software contributed to The NetBSD Foundation 78 1.312 dyoung * by Coyote Point Systems, Inc. 79 1.312 dyoung * This code is derived from software contributed to The NetBSD Foundation 80 1.44 thorpej * by Jason R. Thorpe and Kevin M. 
Lahey of the Numerical Aerospace Simulation 81 1.44 thorpej * Facility, NASA Ames Research Center. 82 1.223 mycroft * This code is derived from software contributed to The NetBSD Foundation 83 1.223 mycroft * by Charles M. Hannum. 84 1.244 rpaulo * This code is derived from software contributed to The NetBSD Foundation 85 1.244 rpaulo * by Rui Paulo. 86 1.44 thorpej * 87 1.44 thorpej * Redistribution and use in source and binary forms, with or without 88 1.44 thorpej * modification, are permitted provided that the following conditions 89 1.44 thorpej * are met: 90 1.44 thorpej * 1. Redistributions of source code must retain the above copyright 91 1.44 thorpej * notice, this list of conditions and the following disclaimer. 92 1.44 thorpej * 2. Redistributions in binary form must reproduce the above copyright 93 1.44 thorpej * notice, this list of conditions and the following disclaimer in the 94 1.44 thorpej * documentation and/or other materials provided with the distribution. 95 1.44 thorpej * 96 1.44 thorpej * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 97 1.44 thorpej * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 98 1.44 thorpej * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 99 1.44 thorpej * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 100 1.44 thorpej * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 101 1.44 thorpej * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 102 1.44 thorpej * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 103 1.44 thorpej * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 104 1.44 thorpej * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 105 1.44 thorpej * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 106 1.44 thorpej * POSSIBILITY OF SUCH DAMAGE. 
107 1.44 thorpej */ 108 1.10 cgd 109 1.1 cgd /* 110 1.39 thorpej * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995 111 1.9 mycroft * The Regents of the University of California. All rights reserved. 112 1.1 cgd * 113 1.1 cgd * Redistribution and use in source and binary forms, with or without 114 1.1 cgd * modification, are permitted provided that the following conditions 115 1.1 cgd * are met: 116 1.1 cgd * 1. Redistributions of source code must retain the above copyright 117 1.1 cgd * notice, this list of conditions and the following disclaimer. 118 1.1 cgd * 2. Redistributions in binary form must reproduce the above copyright 119 1.1 cgd * notice, this list of conditions and the following disclaimer in the 120 1.1 cgd * documentation and/or other materials provided with the distribution. 121 1.174 agc * 3. Neither the name of the University nor the names of its contributors 122 1.1 cgd * may be used to endorse or promote products derived from this software 123 1.1 cgd * without specific prior written permission. 124 1.1 cgd * 125 1.1 cgd * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 126 1.1 cgd * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 127 1.1 cgd * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 128 1.1 cgd * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 129 1.1 cgd * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 130 1.1 cgd * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 131 1.1 cgd * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 132 1.1 cgd * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 133 1.1 cgd * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 134 1.1 cgd * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 135 1.1 cgd * SUCH DAMAGE. 
136 1.1 cgd * 137 1.39 thorpej * @(#)tcp_input.c 8.12 (Berkeley) 5/24/95 138 1.1 cgd */ 139 1.1 cgd 140 1.133 lukem #include <sys/cdefs.h> 141 1.441 rin __KERNEL_RCSID(0, "$NetBSD: tcp_input.c,v 1.441 2024/10/08 06:17:14 rin Exp $"); 142 1.29 thorpej 143 1.344 pooka #ifdef _KERNEL_OPT 144 1.83 itojun #include "opt_inet.h" 145 1.85 thorpej #include "opt_ipsec.h" 146 1.125 thorpej #include "opt_inet_csum.h" 147 1.127 abs #include "opt_tcp_debug.h" 148 1.344 pooka #endif 149 1.83 itojun 150 1.3 mycroft #include <sys/param.h> 151 1.3 mycroft #include <sys/systm.h> 152 1.3 mycroft #include <sys/malloc.h> 153 1.3 mycroft #include <sys/mbuf.h> 154 1.3 mycroft #include <sys/protosw.h> 155 1.3 mycroft #include <sys/socket.h> 156 1.3 mycroft #include <sys/socketvar.h> 157 1.3 mycroft #include <sys/errno.h> 158 1.52 thorpej #include <sys/syslog.h> 159 1.63 thorpej #include <sys/pool.h> 160 1.83 itojun #include <sys/domain.h> 161 1.129 thorpej #include <sys/kernel.h> 162 1.206 itojun #ifdef TCP_SIGNATURE 163 1.206 itojun #include <sys/md5.h> 164 1.206 itojun #endif 165 1.273 elad #include <sys/lwp.h> /* for lwp0 */ 166 1.318 tls #include <sys/cprng.h> 167 1.1 cgd 168 1.3 mycroft #include <net/if.h> 169 1.87 itojun #include <net/if_types.h> 170 1.1 cgd 171 1.3 mycroft #include <netinet/in.h> 172 1.3 mycroft #include <netinet/in_systm.h> 173 1.3 mycroft #include <netinet/ip.h> 174 1.3 mycroft #include <netinet/in_pcb.h> 175 1.169 matt #include <netinet/in_var.h> 176 1.3 mycroft #include <netinet/ip_var.h> 177 1.233 yamt #include <netinet/in_offload.h> 178 1.83 itojun 179 1.423 roy #if NARP > 0 180 1.422 roy #include <netinet/if_inarp.h> 181 1.422 roy #endif 182 1.83 itojun #ifdef INET6 183 1.83 itojun #include <netinet/ip6.h> 184 1.112 itojun #include <netinet6/ip6_var.h> 185 1.83 itojun #include <netinet6/in6_pcb.h> 186 1.83 itojun #include <netinet6/ip6_var.h> 187 1.83 itojun #include <netinet6/in6_var.h> 188 1.83 itojun #include <netinet/icmp6.h> 189 1.98 itojun #include 
<netinet6/nd6.h> 190 1.238 riz #ifdef TCP_SIGNATURE 191 1.238 riz #include <netinet6/scope6_var.h> 192 1.238 riz #endif 193 1.83 itojun #endif 194 1.83 itojun 195 1.99 itojun #ifndef INET6 196 1.99 itojun #include <netinet/ip6.h> 197 1.99 itojun #endif 198 1.99 itojun 199 1.3 mycroft #include <netinet/tcp.h> 200 1.3 mycroft #include <netinet/tcp_fsm.h> 201 1.3 mycroft #include <netinet/tcp_seq.h> 202 1.3 mycroft #include <netinet/tcp_timer.h> 203 1.3 mycroft #include <netinet/tcp_var.h> 204 1.284 thorpej #include <netinet/tcp_private.h> 205 1.246 rpaulo #include <netinet/tcp_congctl.h> 206 1.3 mycroft #include <netinet/tcp_debug.h> 207 1.434 ozaki #include <netinet/tcp_syncache.h> 208 1.1 cgd 209 1.88 itojun #ifdef INET6 210 1.87 itojun #include "faith.h" 211 1.124 itojun #if defined(NFAITH) && NFAITH > 0 212 1.124 itojun #include <net/if_faith.h> 213 1.124 itojun #endif 214 1.388 maxv #endif 215 1.175 jonathan 216 1.326 christos #ifdef IPSEC 217 1.175 jonathan #include <netipsec/ipsec.h> 218 1.175 jonathan #include <netipsec/key.h> 219 1.187 jonathan #ifdef INET6 220 1.187 jonathan #include <netipsec/ipsec6.h> 221 1.187 jonathan #endif 222 1.326 christos #endif /* IPSEC*/ 223 1.175 jonathan 224 1.312 dyoung #include <netinet/tcp_vtw.h> 225 1.312 dyoung 226 1.1 cgd int tcprexmtthresh = 3; 227 1.82 ad int tcp_log_refused; 228 1.9 mycroft 229 1.300 pooka int tcp_do_autorcvbuf = 1; 230 1.269 rmind int tcp_autorcvbuf_inc = 16 * 1024; 231 1.269 rmind int tcp_autorcvbuf_max = 256 * 1024; 232 1.299 darran int tcp_msl = (TCPTV_MSL / PR_SLOWHZ); 233 1.269 rmind 234 1.441 rin int tcp_reass_maxqueuelen = 100; 235 1.441 rin 236 1.116 itojun static int tcp_rst_ppslim_count = 0; 237 1.116 itojun static struct timeval tcp_rst_ppslim_last; 238 1.194 itojun static int tcp_ackdrop_ppslim_count = 0; 239 1.194 itojun static struct timeval tcp_ackdrop_ppslim_last; 240 1.104 thorpej 241 1.216 mycroft #define TCP_PAWS_IDLE (24U * 24 * 60 * 60 * PR_SLOWHZ) 242 1.9 mycroft 243 1.9 mycroft 
/* for modulo comparisons of timestamps */ 244 1.9 mycroft #define TSTMP_LT(a,b) ((int)((a)-(b)) < 0) 245 1.9 mycroft #define TSTMP_GEQ(a,b) ((int)((a)-(b)) >= 0) 246 1.9 mycroft 247 1.1 cgd /* 248 1.98 itojun * Neighbor Discovery, Neighbor Unreachability Detection Upper layer hint. 249 1.98 itojun */ 250 1.421 roy static void 251 1.421 roy nd_hint(struct tcpcb *tp) 252 1.421 roy { 253 1.421 roy struct route *ro = NULL; 254 1.421 roy struct rtentry *rt; 255 1.421 roy 256 1.421 roy if (tp == NULL) 257 1.421 roy return; 258 1.421 roy 259 1.435 ozaki ro = &tp->t_inpcb->inp_route; 260 1.421 roy if (ro == NULL) 261 1.421 roy return; 262 1.421 roy 263 1.421 roy rt = rtcache_validate(ro); 264 1.421 roy if (rt == NULL) 265 1.421 roy return; 266 1.274 dyoung 267 1.421 roy switch (tp->t_family) { 268 1.423 roy #if NARP > 0 269 1.421 roy case AF_INET: 270 1.421 roy arp_nud_hint(rt); 271 1.421 roy break; 272 1.421 roy #endif 273 1.421 roy #ifdef INET6 274 1.421 roy case AF_INET6: 275 1.342 ozaki nd6_nud_hint(rt); 276 1.421 roy break; 277 1.421 roy #endif 278 1.417 maxv } 279 1.421 roy 280 1.421 roy rtcache_unref(rt, ro); 281 1.274 dyoung } 282 1.98 itojun 283 1.98 itojun /* 284 1.280 yamt * Compute ACK transmission behavior. Delay the ACK unless 285 1.47 thorpej * we have already delayed an ACK (must send an ACK every two segments). 286 1.55 thorpej * We also ACK immediately if we received a PUSH and the ACK-on-PUSH 287 1.55 thorpej * option is enabled. 
288 1.37 thorpej */ 289 1.280 yamt static void 290 1.280 yamt tcp_setup_ack(struct tcpcb *tp, const struct tcphdr *th) 291 1.280 yamt { 292 1.280 yamt 293 1.280 yamt if (tp->t_flags & TF_DELACK || 294 1.280 yamt (tcp_ack_on_push && th->th_flags & TH_PUSH)) 295 1.280 yamt tp->t_flags |= TF_ACKNOW; 296 1.280 yamt else 297 1.280 yamt TCP_SET_DELACK(tp); 298 1.280 yamt } 299 1.280 yamt 300 1.280 yamt static void 301 1.280 yamt icmp_check(struct tcpcb *tp, const struct tcphdr *th, int acked) 302 1.280 yamt { 303 1.280 yamt 304 1.280 yamt /* 305 1.280 yamt * If we had a pending ICMP message that refers to data that have 306 1.280 yamt * just been acknowledged, disregard the recorded ICMP message. 307 1.280 yamt */ 308 1.280 yamt if ((tp->t_flags & TF_PMTUD_PEND) && 309 1.280 yamt SEQ_GT(th->th_ack, tp->t_pmtud_th_seq)) 310 1.280 yamt tp->t_flags &= ~TF_PMTUD_PEND; 311 1.37 thorpej 312 1.280 yamt /* 313 1.280 yamt * Keep track of the largest chunk of data 314 1.280 yamt * acknowledged since last PMTU update 315 1.280 yamt */ 316 1.280 yamt if (tp->t_pmtud_mss_acked < acked) 317 1.280 yamt tp->t_pmtud_mss_acked = acked; 318 1.280 yamt } 319 1.231 christos 320 1.103 thorpej /* 321 1.103 thorpej * Convert TCP protocol fields to host order for easier processing. 322 1.103 thorpej */ 323 1.280 yamt static void 324 1.280 yamt tcp_fields_to_host(struct tcphdr *th) 325 1.280 yamt { 326 1.280 yamt 327 1.280 yamt NTOHL(th->th_seq); 328 1.280 yamt NTOHL(th->th_ack); 329 1.280 yamt NTOHS(th->th_win); 330 1.280 yamt NTOHS(th->th_urp); 331 1.280 yamt } 332 1.103 thorpej 333 1.153 thorpej /* 334 1.153 thorpej * ... and reverse the above. 
335 1.153 thorpej */ 336 1.280 yamt static void 337 1.280 yamt tcp_fields_to_net(struct tcphdr *th) 338 1.280 yamt { 339 1.280 yamt 340 1.280 yamt HTONL(th->th_seq); 341 1.280 yamt HTONL(th->th_ack); 342 1.280 yamt HTONS(th->th_win); 343 1.280 yamt HTONS(th->th_urp); 344 1.280 yamt } 345 1.153 thorpej 346 1.390 maxv static void 347 1.390 maxv tcp_urp_drop(struct tcphdr *th, int todrop, int *tiflags) 348 1.390 maxv { 349 1.401 rmind if (th->th_urp > todrop) { 350 1.390 maxv th->th_urp -= todrop; 351 1.390 maxv } else { 352 1.390 maxv *tiflags &= ~TH_URG; 353 1.390 maxv th->th_urp = 0; 354 1.390 maxv } 355 1.390 maxv } 356 1.390 maxv 357 1.125 thorpej #ifdef TCP_CSUM_COUNTERS 358 1.125 thorpej #include <sys/device.h> 359 1.125 thorpej 360 1.125 thorpej extern struct evcnt tcp_hwcsum_ok; 361 1.125 thorpej extern struct evcnt tcp_hwcsum_bad; 362 1.125 thorpej extern struct evcnt tcp_hwcsum_data; 363 1.125 thorpej extern struct evcnt tcp_swcsum; 364 1.232 yamt #if defined(INET6) 365 1.232 yamt extern struct evcnt tcp6_hwcsum_ok; 366 1.232 yamt extern struct evcnt tcp6_hwcsum_bad; 367 1.232 yamt extern struct evcnt tcp6_hwcsum_data; 368 1.232 yamt extern struct evcnt tcp6_swcsum; 369 1.232 yamt #endif /* defined(INET6) */ 370 1.125 thorpej 371 1.125 thorpej #define TCP_CSUM_COUNTER_INCR(ev) (ev)->ev_count++ 372 1.125 thorpej 373 1.125 thorpej #else 374 1.125 thorpej 375 1.125 thorpej #define TCP_CSUM_COUNTER_INCR(ev) /* nothing */ 376 1.125 thorpej 377 1.125 thorpej #endif /* TCP_CSUM_COUNTERS */ 378 1.125 thorpej 379 1.141 matt #ifdef TCP_REASS_COUNTERS 380 1.141 matt #include <sys/device.h> 381 1.141 matt 382 1.141 matt extern struct evcnt tcp_reass_; 383 1.141 matt extern struct evcnt tcp_reass_empty; 384 1.141 matt extern struct evcnt tcp_reass_iteration[8]; 385 1.141 matt extern struct evcnt tcp_reass_prependfirst; 386 1.141 matt extern struct evcnt tcp_reass_prepend; 387 1.141 matt extern struct evcnt tcp_reass_insert; 388 1.141 matt extern struct evcnt 
tcp_reass_inserttail; 389 1.141 matt extern struct evcnt tcp_reass_append; 390 1.141 matt extern struct evcnt tcp_reass_appendtail; 391 1.141 matt extern struct evcnt tcp_reass_overlaptail; 392 1.141 matt extern struct evcnt tcp_reass_overlapfront; 393 1.141 matt extern struct evcnt tcp_reass_segdup; 394 1.141 matt extern struct evcnt tcp_reass_fragdup; 395 1.141 matt 396 1.141 matt #define TCP_REASS_COUNTER_INCR(ev) (ev)->ev_count++ 397 1.141 matt 398 1.141 matt #else 399 1.141 matt 400 1.141 matt #define TCP_REASS_COUNTER_INCR(ev) /* nothing */ 401 1.141 matt 402 1.141 matt #endif /* TCP_REASS_COUNTERS */ 403 1.141 matt 404 1.256 yamt static int tcp_reass(struct tcpcb *, const struct tcphdr *, struct mbuf *, 405 1.386 maxv int); 406 1.254 yamt 407 1.219 perry static void tcp4_log_refused(const struct ip *, const struct tcphdr *); 408 1.145 yamt #ifdef INET6 409 1.219 perry static void tcp6_log_refused(const struct ip6_hdr *, const struct tcphdr *); 410 1.145 yamt #endif 411 1.145 yamt 412 1.257 yamt #if defined(MBUFTRACE) 413 1.257 yamt struct mowner tcp_reass_mowner = MOWNER_INIT("tcp", "reass"); 414 1.257 yamt #endif /* defined(MBUFTRACE) */ 415 1.257 yamt 416 1.292 pooka static struct pool tcpipqent_pool; 417 1.292 pooka 418 1.292 pooka void 419 1.293 cegger tcpipqent_init(void) 420 1.292 pooka { 421 1.292 pooka 422 1.292 pooka pool_init(&tcpipqent_pool, sizeof(struct ipqent), 0, 0, 0, "tcpipqepl", 423 1.292 pooka NULL, IPL_VM); 424 1.292 pooka } 425 1.209 yamt 426 1.225 yamt struct ipqent * 427 1.281 matt tcpipqent_alloc(void) 428 1.225 yamt { 429 1.225 yamt struct ipqent *ipqe; 430 1.225 yamt int s; 431 1.225 yamt 432 1.225 yamt s = splvm(); 433 1.225 yamt ipqe = pool_get(&tcpipqent_pool, PR_NOWAIT); 434 1.225 yamt splx(s); 435 1.225 yamt 436 1.225 yamt return ipqe; 437 1.225 yamt } 438 1.225 yamt 439 1.225 yamt void 440 1.225 yamt tcpipqent_free(struct ipqent *ipqe) 441 1.225 yamt { 442 1.225 yamt int s; 443 1.225 yamt 444 1.225 yamt s = splvm(); 445 1.225 
yamt pool_put(&tcpipqent_pool, ipqe); 446 1.225 yamt splx(s); 447 1.225 yamt } 448 1.225 yamt 449 1.399 maxv /* 450 1.399 maxv * Insert segment ti into reassembly queue of tcp with 451 1.399 maxv * control block tp. Return TH_FIN if reassembly now includes 452 1.399 maxv * a segment with FIN. 453 1.399 maxv */ 454 1.256 yamt static int 455 1.386 maxv tcp_reass(struct tcpcb *tp, const struct tcphdr *th, struct mbuf *m, int tlen) 456 1.1 cgd { 457 1.106 augustss struct ipqent *p, *q, *nq, *tiqe = NULL; 458 1.83 itojun struct socket *so = NULL; 459 1.54 matt int pkt_flags; 460 1.54 matt tcp_seq pkt_seq; 461 1.54 matt unsigned pkt_len; 462 1.54 matt u_long rcvpartdupbyte = 0; 463 1.54 matt u_long rcvoobyte; 464 1.141 matt #ifdef TCP_REASS_COUNTERS 465 1.141 matt u_int count = 0; 466 1.141 matt #endif 467 1.439 riastrad net_stat_ref_t tcps; 468 1.1 cgd 469 1.435 ozaki so = tp->t_inpcb->inp_socket; 470 1.83 itojun 471 1.72 thorpej TCP_REASS_LOCK_CHECK(tp); 472 1.72 thorpej 473 1.1 cgd /* 474 1.389 maxv * Call with th==NULL after become established to 475 1.1 cgd * force pre-ESTABLISHED data up to user socket. 476 1.1 cgd */ 477 1.389 maxv if (th == NULL) 478 1.1 cgd goto present; 479 1.1 cgd 480 1.257 yamt m_claimm(m, &tcp_reass_mowner); 481 1.257 yamt 482 1.386 maxv rcvoobyte = tlen; 483 1.1 cgd /* 484 1.399 maxv * Copy these to local variables because the TCP header gets munged 485 1.399 maxv * while we are collapsing mbufs. 486 1.20 cgd */ 487 1.83 itojun pkt_seq = th->th_seq; 488 1.386 maxv pkt_len = tlen; 489 1.83 itojun pkt_flags = th->th_flags; 490 1.141 matt 491 1.141 matt TCP_REASS_COUNTER_INCR(&tcp_reass_); 492 1.141 matt 493 1.141 matt if ((p = TAILQ_LAST(&tp->segq, ipqehead)) != NULL) { 494 1.141 matt /* 495 1.141 matt * When we miss a packet, the vast majority of time we get 496 1.141 matt * packets that follow it in order. So optimize for that. 
497 1.141 matt */ 498 1.141 matt if (pkt_seq == p->ipqe_seq + p->ipqe_len) { 499 1.141 matt p->ipqe_len += pkt_len; 500 1.141 matt p->ipqe_flags |= pkt_flags; 501 1.405 maxv m_cat(p->ipqe_m, m); 502 1.189 itojun m = NULL; 503 1.141 matt tiqe = p; 504 1.141 matt TAILQ_REMOVE(&tp->timeq, p, ipqe_timeq); 505 1.141 matt TCP_REASS_COUNTER_INCR(&tcp_reass_appendtail); 506 1.141 matt goto skip_replacement; 507 1.141 matt } 508 1.141 matt /* 509 1.141 matt * While we're here, if the pkt is completely beyond 510 1.141 matt * anything we have, just insert it at the tail. 511 1.141 matt */ 512 1.141 matt if (SEQ_GT(pkt_seq, p->ipqe_seq + p->ipqe_len)) { 513 1.141 matt TCP_REASS_COUNTER_INCR(&tcp_reass_inserttail); 514 1.141 matt goto insert_it; 515 1.141 matt } 516 1.141 matt } 517 1.141 matt 518 1.141 matt q = TAILQ_FIRST(&tp->segq); 519 1.141 matt 520 1.141 matt if (q != NULL) { 521 1.141 matt /* 522 1.141 matt * If this segment immediately precedes the first out-of-order 523 1.141 matt * block, simply slap the segment in front of it and (mostly) 524 1.141 matt * skip the complicated logic. 525 1.141 matt */ 526 1.141 matt if (pkt_seq + pkt_len == q->ipqe_seq) { 527 1.141 matt q->ipqe_seq = pkt_seq; 528 1.141 matt q->ipqe_len += pkt_len; 529 1.141 matt q->ipqe_flags |= pkt_flags; 530 1.141 matt m_cat(m, q->ipqe_m); 531 1.141 matt q->ipqe_m = m; 532 1.141 matt tiqe = q; 533 1.141 matt TAILQ_REMOVE(&tp->timeq, q, ipqe_timeq); 534 1.141 matt TCP_REASS_COUNTER_INCR(&tcp_reass_prependfirst); 535 1.141 matt goto skip_replacement; 536 1.141 matt } 537 1.141 matt } else { 538 1.141 matt TCP_REASS_COUNTER_INCR(&tcp_reass_empty); 539 1.141 matt } 540 1.141 matt 541 1.20 cgd /* 542 1.1 cgd * Find a segment which begins after this one does. 
543 1.1 cgd */ 544 1.141 matt for (p = NULL; q != NULL; q = nq) { 545 1.141 matt nq = TAILQ_NEXT(q, ipqe_q); 546 1.141 matt #ifdef TCP_REASS_COUNTERS 547 1.141 matt count++; 548 1.141 matt #endif 549 1.399 maxv 550 1.54 matt /* 551 1.54 matt * If the received segment is just right after this 552 1.54 matt * fragment, merge the two together and then check 553 1.54 matt * for further overlaps. 554 1.54 matt */ 555 1.54 matt if (q->ipqe_seq + q->ipqe_len == pkt_seq) { 556 1.54 matt pkt_len += q->ipqe_len; 557 1.54 matt pkt_flags |= q->ipqe_flags; 558 1.54 matt pkt_seq = q->ipqe_seq; 559 1.405 maxv m_cat(q->ipqe_m, m); 560 1.54 matt m = q->ipqe_m; 561 1.141 matt TCP_REASS_COUNTER_INCR(&tcp_reass_append); 562 1.54 matt goto free_ipqe; 563 1.54 matt } 564 1.399 maxv 565 1.54 matt /* 566 1.54 matt * If the received segment is completely past this 567 1.399 maxv * fragment, we need to go to the next fragment. 568 1.54 matt */ 569 1.54 matt if (SEQ_LT(q->ipqe_seq + q->ipqe_len, pkt_seq)) { 570 1.54 matt p = q; 571 1.54 matt continue; 572 1.54 matt } 573 1.399 maxv 574 1.54 matt /* 575 1.143 itojun * If the fragment is past the received segment, 576 1.54 matt * it (or any following) can't be concatenated. 577 1.54 matt */ 578 1.141 matt if (SEQ_GT(q->ipqe_seq, pkt_seq + pkt_len)) { 579 1.141 matt TCP_REASS_COUNTER_INCR(&tcp_reass_insert); 580 1.1 cgd break; 581 1.141 matt } 582 1.141 matt 583 1.54 matt /* 584 1.54 matt * We've received all the data in this segment before. 585 1.399 maxv * Mark it as a duplicate and return. 
586 1.54 matt */ 587 1.54 matt if (SEQ_LEQ(q->ipqe_seq, pkt_seq) && 588 1.54 matt SEQ_GEQ(q->ipqe_seq + q->ipqe_len, pkt_seq + pkt_len)) { 589 1.284 thorpej tcps = TCP_STAT_GETREF(); 590 1.439 riastrad _NET_STATINC_REF(tcps, TCP_STAT_RCVDUPPACK); 591 1.439 riastrad _NET_STATADD_REF(tcps, TCP_STAT_RCVDUPBYTE, pkt_len); 592 1.284 thorpej TCP_STAT_PUTREF(); 593 1.222 jonathan tcp_new_dsack(tp, pkt_seq, pkt_len); 594 1.54 matt m_freem(m); 595 1.225 yamt if (tiqe != NULL) { 596 1.225 yamt tcpipqent_free(tiqe); 597 1.225 yamt } 598 1.141 matt TCP_REASS_COUNTER_INCR(&tcp_reass_segdup); 599 1.312 dyoung goto out; 600 1.54 matt } 601 1.399 maxv 602 1.54 matt /* 603 1.54 matt * Received segment completely overlaps this fragment 604 1.54 matt * so we drop the fragment (this keeps the temporal 605 1.54 matt * ordering of segments correct). 606 1.54 matt */ 607 1.54 matt if (SEQ_GEQ(q->ipqe_seq, pkt_seq) && 608 1.54 matt SEQ_LEQ(q->ipqe_seq + q->ipqe_len, pkt_seq + pkt_len)) { 609 1.54 matt rcvpartdupbyte += q->ipqe_len; 610 1.54 matt m_freem(q->ipqe_m); 611 1.141 matt TCP_REASS_COUNTER_INCR(&tcp_reass_fragdup); 612 1.54 matt goto free_ipqe; 613 1.54 matt } 614 1.399 maxv 615 1.54 matt /* 616 1.399 maxv * Received segment extends past the end of the fragment. 617 1.399 maxv * Drop the overlapping bytes, merge the fragment and 618 1.399 maxv * segment, and treat as a longer received packet. 
619 1.54 matt */ 620 1.189 itojun if (SEQ_LT(q->ipqe_seq, pkt_seq) && 621 1.189 itojun SEQ_GT(q->ipqe_seq + q->ipqe_len, pkt_seq)) { 622 1.54 matt int overlap = q->ipqe_seq + q->ipqe_len - pkt_seq; 623 1.54 matt m_adj(m, overlap); 624 1.54 matt rcvpartdupbyte += overlap; 625 1.405 maxv m_cat(q->ipqe_m, m); 626 1.54 matt m = q->ipqe_m; 627 1.54 matt pkt_seq = q->ipqe_seq; 628 1.54 matt pkt_len += q->ipqe_len - overlap; 629 1.54 matt rcvoobyte -= overlap; 630 1.141 matt TCP_REASS_COUNTER_INCR(&tcp_reass_overlaptail); 631 1.54 matt goto free_ipqe; 632 1.54 matt } 633 1.399 maxv 634 1.54 matt /* 635 1.399 maxv * Received segment extends past the front of the fragment. 636 1.399 maxv * Drop the overlapping bytes on the received packet. The 637 1.399 maxv * packet will then be concatenated with this fragment a 638 1.399 maxv * bit later. 639 1.54 matt */ 640 1.189 itojun if (SEQ_GT(q->ipqe_seq, pkt_seq) && 641 1.189 itojun SEQ_LT(q->ipqe_seq, pkt_seq + pkt_len)) { 642 1.54 matt int overlap = pkt_seq + pkt_len - q->ipqe_seq; 643 1.54 matt m_adj(m, -overlap); 644 1.54 matt pkt_len -= overlap; 645 1.54 matt rcvpartdupbyte += overlap; 646 1.141 matt TCP_REASS_COUNTER_INCR(&tcp_reass_overlapfront); 647 1.54 matt rcvoobyte -= overlap; 648 1.54 matt } 649 1.399 maxv 650 1.54 matt /* 651 1.399 maxv * If the received segment immediately precedes this 652 1.54 matt * fragment then tack the fragment onto this segment 653 1.54 matt * and reinsert the data. 
654 1.54 matt */ 655 1.54 matt if (q->ipqe_seq == pkt_seq + pkt_len) { 656 1.54 matt pkt_len += q->ipqe_len; 657 1.54 matt pkt_flags |= q->ipqe_flags; 658 1.54 matt m_cat(m, q->ipqe_m); 659 1.141 matt TAILQ_REMOVE(&tp->segq, q, ipqe_q); 660 1.141 matt TAILQ_REMOVE(&tp->timeq, q, ipqe_timeq); 661 1.224 yamt tp->t_segqlen--; 662 1.224 yamt KASSERT(tp->t_segqlen >= 0); 663 1.224 yamt KASSERT(tp->t_segqlen != 0 || 664 1.224 yamt (TAILQ_EMPTY(&tp->segq) && 665 1.224 yamt TAILQ_EMPTY(&tp->timeq))); 666 1.225 yamt if (tiqe == NULL) { 667 1.189 itojun tiqe = q; 668 1.225 yamt } else { 669 1.225 yamt tcpipqent_free(q); 670 1.225 yamt } 671 1.141 matt TCP_REASS_COUNTER_INCR(&tcp_reass_prepend); 672 1.54 matt break; 673 1.54 matt } 674 1.399 maxv 675 1.54 matt /* 676 1.54 matt * If the fragment is before the segment, remember it. 677 1.54 matt * When this loop is terminated, p will contain the 678 1.399 maxv * pointer to the fragment that is right before the 679 1.399 maxv * received segment. 680 1.54 matt */ 681 1.54 matt if (SEQ_LEQ(q->ipqe_seq, pkt_seq)) 682 1.54 matt p = q; 683 1.54 matt 684 1.54 matt continue; 685 1.54 matt 686 1.54 matt /* 687 1.54 matt * This is a common operation. It also will allow 688 1.54 matt * to save doing a malloc/free in most instances. 
689 1.54 matt */ 690 1.54 matt free_ipqe: 691 1.141 matt TAILQ_REMOVE(&tp->segq, q, ipqe_q); 692 1.141 matt TAILQ_REMOVE(&tp->timeq, q, ipqe_timeq); 693 1.224 yamt tp->t_segqlen--; 694 1.224 yamt KASSERT(tp->t_segqlen >= 0); 695 1.224 yamt KASSERT(tp->t_segqlen != 0 || 696 1.224 yamt (TAILQ_EMPTY(&tp->segq) && TAILQ_EMPTY(&tp->timeq))); 697 1.225 yamt if (tiqe == NULL) { 698 1.189 itojun tiqe = q; 699 1.225 yamt } else { 700 1.225 yamt tcpipqent_free(q); 701 1.225 yamt } 702 1.1 cgd } 703 1.1 cgd 704 1.141 matt #ifdef TCP_REASS_COUNTERS 705 1.141 matt if (count > 7) 706 1.141 matt TCP_REASS_COUNTER_INCR(&tcp_reass_iteration[0]); 707 1.141 matt else if (count > 0) 708 1.141 matt TCP_REASS_COUNTER_INCR(&tcp_reass_iteration[count]); 709 1.141 matt #endif 710 1.141 matt 711 1.399 maxv insert_it: 712 1.441 rin /* limit tcp segments per reassembly queue */ 713 1.441 rin if (tp->t_segqlen > tcp_reass_maxqueuelen) { 714 1.441 rin TCP_STATINC(TCP_STAT_RCVMEMDROP); 715 1.441 rin m_freem(m); 716 1.441 rin goto out; 717 1.441 rin } 718 1.441 rin 719 1.1 cgd /* 720 1.399 maxv * Allocate a new queue entry (block) since the received segment 721 1.399 maxv * did not collapse onto any other out-of-order block. If it had 722 1.399 maxv * collapsed, tiqe would not be NULL and we would be reusing it. 723 1.399 maxv * 724 1.399 maxv * If the allocation fails, drop the packet. 725 1.1 cgd */ 726 1.54 matt if (tiqe == NULL) { 727 1.225 yamt tiqe = tcpipqent_alloc(); 728 1.54 matt if (tiqe == NULL) { 729 1.284 thorpej TCP_STATINC(TCP_STAT_RCVMEMDROP); 730 1.54 matt m_freem(m); 731 1.312 dyoung goto out; 732 1.54 matt } 733 1.54 matt } 734 1.20 cgd 735 1.54 matt /* 736 1.54 matt * Update the counters. 
737 1.54 matt */ 738 1.336 he tp->t_rcvoopack++; 739 1.284 thorpej tcps = TCP_STAT_GETREF(); 740 1.439 riastrad _NET_STATINC_REF(tcps, TCP_STAT_RCVOOPACK); 741 1.439 riastrad _NET_STATADD_REF(tcps, TCP_STAT_RCVOOBYTE, rcvoobyte); 742 1.54 matt if (rcvpartdupbyte) { 743 1.439 riastrad _NET_STATINC_REF(tcps, TCP_STAT_RCVPARTDUPPACK); 744 1.439 riastrad _NET_STATADD_REF(tcps, TCP_STAT_RCVPARTDUPBYTE, 745 1.439 riastrad rcvpartdupbyte); 746 1.1 cgd } 747 1.284 thorpej TCP_STAT_PUTREF(); 748 1.1 cgd 749 1.54 matt /* 750 1.54 matt * Insert the new fragment queue entry into both queues. 751 1.54 matt */ 752 1.20 cgd tiqe->ipqe_m = m; 753 1.54 matt tiqe->ipqe_seq = pkt_seq; 754 1.54 matt tiqe->ipqe_len = pkt_len; 755 1.54 matt tiqe->ipqe_flags = pkt_flags; 756 1.20 cgd if (p == NULL) { 757 1.141 matt TAILQ_INSERT_HEAD(&tp->segq, tiqe, ipqe_q); 758 1.20 cgd } else { 759 1.141 matt TAILQ_INSERT_AFTER(&tp->segq, p, tiqe, ipqe_q); 760 1.20 cgd } 761 1.224 yamt tp->t_segqlen++; 762 1.1 cgd 763 1.141 matt skip_replacement: 764 1.141 matt TAILQ_INSERT_HEAD(&tp->timeq, tiqe, ipqe_timeq); 765 1.54 matt 766 1.1 cgd present: 767 1.1 cgd /* 768 1.1 cgd * Present data to user, advancing rcv_nxt through 769 1.1 cgd * completed sequence space. 
770 1.1 cgd */ 771 1.11 mycroft if (TCPS_HAVEESTABLISHED(tp->t_state) == 0) 772 1.312 dyoung goto out; 773 1.141 matt q = TAILQ_FIRST(&tp->segq); 774 1.54 matt if (q == NULL || q->ipqe_seq != tp->rcv_nxt) 775 1.312 dyoung goto out; 776 1.54 matt if (tp->t_state == TCPS_SYN_RECEIVED && q->ipqe_len) 777 1.312 dyoung goto out; 778 1.20 cgd 779 1.54 matt tp->rcv_nxt += q->ipqe_len; 780 1.54 matt pkt_flags = q->ipqe_flags & TH_FIN; 781 1.421 roy nd_hint(tp); 782 1.54 matt 783 1.141 matt TAILQ_REMOVE(&tp->segq, q, ipqe_q); 784 1.141 matt TAILQ_REMOVE(&tp->timeq, q, ipqe_timeq); 785 1.224 yamt tp->t_segqlen--; 786 1.224 yamt KASSERT(tp->t_segqlen >= 0); 787 1.224 yamt KASSERT(tp->t_segqlen != 0 || 788 1.224 yamt (TAILQ_EMPTY(&tp->segq) && TAILQ_EMPTY(&tp->timeq))); 789 1.54 matt if (so->so_state & SS_CANTRCVMORE) 790 1.54 matt m_freem(q->ipqe_m); 791 1.54 matt else 792 1.148 thorpej sbappendstream(&so->so_rcv, q->ipqe_m); 793 1.225 yamt tcpipqent_free(q); 794 1.312 dyoung TCP_REASS_UNLOCK(tp); 795 1.1 cgd sorwakeup(so); 796 1.399 maxv return pkt_flags; 797 1.399 maxv 798 1.312 dyoung out: 799 1.312 dyoung TCP_REASS_UNLOCK(tp); 800 1.399 maxv return 0; 801 1.1 cgd } 802 1.1 cgd

#ifdef INET6
/*
 * IPv6 entry point for TCP.
 *
 * A packet flagged M_ANYCAST6 is not handed to TCP at all; it is
 * answered with an ICMPv6 "destination unreachable / address" error
 * whose pointer is the offset of the destination address inside the
 * IPv6 header.  Everything else goes straight to tcp_input().
 *
 * Always returns IPPROTO_DONE.
 */
int
tcp6_input(struct mbuf **mp, int *offp, int proto)
{
	struct mbuf *m = *mp;
	struct ip6_hdr *hdr6;

	/* Common case: not anycast, run the regular input path. */
	if ((m->m_flags & M_ANYCAST6) == 0) {
		tcp_input(m, *offp, proto);
		return IPPROTO_DONE;
	}

	/*
	 * draft-itojun-ipv6-tcp-to-anycast
	 * better place to put this in?
	 */
	if (m->m_len < sizeof(struct ip6_hdr)) {
		/* Make the IPv6 header contiguous before looking at it. */
		m = m_pullup(m, sizeof(struct ip6_hdr));
		if (m == NULL) {
			TCP_STATINC(TCP_STAT_RCVSHORT);
			return IPPROTO_DONE;
		}
	}

	hdr6 = mtod(m, struct ip6_hdr *);
	icmp6_error(m, ICMP6_DST_UNREACH, ICMP6_DST_UNREACH_ADDR,
	    (char *)&hdr6->ip6_dst - (char *)hdr6);
	return IPPROTO_DONE;
}
#endif

/*
 * Log (at LOG_INFO) a connection attempt described by the IPv4 header
 * `ip' and TCP header `th'.  The ports are read from `th' and converted
 * with ntohs(); if `ip' is NULL both addresses print as "(unknown)".
 */
static void
tcp4_log_refused(const struct ip *ip, const struct tcphdr *th)
{
	char from[INET_ADDRSTRLEN];
	char to[INET_ADDRSTRLEN];

	if (ip == NULL) {
		strlcpy(from, "(unknown)", sizeof(from));
		strlcpy(to, "(unknown)", sizeof(to));
	} else {
		in_print(from, sizeof(from), &ip->ip_src);
		in_print(to, sizeof(to), &ip->ip_dst);
	}

	log(LOG_INFO, "Connection attempt to TCP %s:%d from %s:%d\n",
	    to, ntohs(th->th_dport), from, ntohs(th->th_sport));
}

#ifdef INET6
/*
 * IPv6 counterpart of tcp4_log_refused(): addresses are printed inside
 * brackets, and a NULL `ip6' yields "(unknown v6)" for both of them.
 */
static void
tcp6_log_refused(const struct ip6_hdr *ip6, const struct tcphdr *th)
{
	char from[INET6_ADDRSTRLEN];
	char to[INET6_ADDRSTRLEN];

	if (ip6 == NULL) {
		strlcpy(from, "(unknown v6)", sizeof(from));
		strlcpy(to, "(unknown v6)", sizeof(to));
	} else {
		in6_print(from, sizeof(from), &ip6->ip6_src);
		in6_print(to, sizeof(to), &ip6->ip6_dst);
	}

	log(LOG_INFO, "Connection attempt to TCP [%s]:%d from [%s]:%d\n",
	    to, ntohs(th->th_dport), from, ntohs(th->th_sport));
}
#endif

/*
 * Checksum extended TCP header and data.
 *
 *	af	address family on the wire (AF_INET or AF_INET6)
 *	m	the received packet
 *	th	TCP header (not referenced by this routine)
 *	toff	offset of the TCP header within `m'
 *	off	length of the TCP header including options
 *	tlen	length of the TCP payload
 *
 * The receive interface's checksum-offload capabilities are combined
 * with the flags recorded in the packet header to decide whether the
 * hardware verdict can be used or the sum must be computed in software.
 *
 * Returns 0 when the segment is acceptable, or -1 after bumping
 * TCP_STAT_RCVBADSUM when it is not.
 */
int
tcp_input_checksum(int af, struct mbuf *m, const struct tcphdr *th,
    int toff, int off, int tlen)
{
	struct ifnet *ifp;
	int ps;

	/*
	 * XXX it's better to record and check if this mbuf is
	 * already checked.
	 */

	ifp = m_get_rcvif(m, &ps);
	if (__predict_false(ifp == NULL))
		goto bad;	/* XXX */

	switch (af) {
	case AF_INET:
		switch (m->m_pkthdr.csum_flags &
		    ((ifp->if_csum_flags_rx & M_CSUM_TCPv4) |
		    M_CSUM_TCP_UDP_BAD | M_CSUM_DATA)) {
		case M_CSUM_TCPv4 | M_CSUM_TCP_UDP_BAD:
			/* Hardware says the sum is wrong. */
			TCP_CSUM_COUNTER_INCR(&tcp_hwcsum_bad);
			goto bad;

		case M_CSUM_TCPv4 | M_CSUM_DATA: {
			/* Hardware handed us a raw sum to finish. */
			u_int32_t sum = m->m_pkthdr.csum_data;

			TCP_CSUM_COUNTER_INCR(&tcp_hwcsum_data);
			if (m->m_pkthdr.csum_flags & M_CSUM_NO_PSEUDOHDR) {
				const struct ip *iph =
				    mtod(m, const struct ip *);

				/* Fold the pseudo-header in ourselves. */
				sum = in_cksum_phdr(iph->ip_src.s_addr,
				    iph->ip_dst.s_addr,
				    htons(sum + tlen + off + IPPROTO_TCP));
			}
			if (sum != 0xffff)
				goto bad;
			break;
		}

		case M_CSUM_TCPv4:
			/* Hardware already verified the sum. */
			TCP_CSUM_COUNTER_INCR(&tcp_hwcsum_ok);
			break;

		default:
			/*
			 * No usable hardware result: compute the sum in
			 * software, except on loopback interfaces when
			 * tcp_do_loopback_cksum is off.
			 */
			if (__predict_false((ifp->if_flags & IFF_LOOPBACK) &&
			    !tcp_do_loopback_cksum))
				break;
			TCP_CSUM_COUNTER_INCR(&tcp_swcsum);
			if (in4_cksum(m, IPPROTO_TCP, toff, tlen + off) != 0)
				goto bad;
			break;
		}
		break;

#ifdef INET6
	case AF_INET6:
		switch (m->m_pkthdr.csum_flags &
		    ((ifp->if_csum_flags_rx & M_CSUM_TCPv6) |
		    M_CSUM_TCP_UDP_BAD | M_CSUM_DATA)) {
		case M_CSUM_TCPv6 | M_CSUM_TCP_UDP_BAD:
			/* Hardware says the sum is wrong. */
			TCP_CSUM_COUNTER_INCR(&tcp6_hwcsum_bad);
			goto bad;

#if 0 /* notyet */
		case M_CSUM_TCPv6|M_CSUM_DATA:
#endif

		case M_CSUM_TCPv6:
			/* Hardware already verified the sum. */
			TCP_CSUM_COUNTER_INCR(&tcp6_hwcsum_ok);
			break;

		default:
			/*
			 * Software checksum, skipped for looped-back
			 * packets (M_LOOP) when tcp_do_loopback_cksum
			 * is off.
			 */
			if (__predict_false((m->m_flags & M_LOOP) != 0 &&
			    !tcp_do_loopback_cksum))
				break;
			TCP_CSUM_COUNTER_INCR(&tcp6_swcsum);
			if (in6_cksum(m, IPPROTO_TCP, toff, tlen + off) != 0)
				goto bad;
			break;
		}
		break;
#endif /* INET6 */
	}

	m_put_rcvif(ifp, &ps);
	return 0;

bad:
	m_put_rcvif(ifp, &ps);
	TCP_STATINC(TCP_STAT_RCVBADSUM);
	return -1;
}

/*
 * Segment processing for a vestigial PCB.
 *
 * A packet that matches a vestigial entry must still be answered
 * according to the spec.  This code is duplicated from the one in
 * tcp_input().
 *
 *	th	TCP header of the segment; th_seq is modified in place
 *	vp	the vestigial PCB the segment matched
 *	m	the packet; on every path it is either handed to
 *		tcp_respond() or released with m_freem()
 *	tlen	length of the TCP payload
 */
static void
tcp_vtw_input(struct tcphdr *th, vestigial_inpcb_t *vp,
    struct mbuf *m, int tlen)
{
	int thflags = th->th_flags;
	int ntrim = vp->rcv_nxt - th->th_seq;
	uint32_t ackstate = 0;
	net_stat_ref_t tcps;

	/*
	 * Part (or all) of the segment lies to the left of rcv_nxt.
	 */
	if (ntrim > 0) {
		if (thflags & TH_SYN) {
			/* The SYN occupies one unit of sequence space. */
			thflags &= ~TH_SYN;
			th->th_seq++;
			tcp_urp_drop(th, 1, &thflags);
			ntrim--;
		}

		if (ntrim > tlen ||
		    (ntrim == tlen && (thflags & TH_FIN) == 0)) {
			/*
			 * Complete duplicate.  Any valid FIN or RST must
			 * be to the left of the window; at this point the
			 * FIN or RST must be a duplicate or out of
			 * sequence, so drop it.
			 */
			if (thflags & TH_RST)
				goto drop;
			thflags &= ~(TH_FIN|TH_RST);

			/*
			 * Send an ACK to resynchronize and drop any data.
			 * But keep on processing for RST or ACK.
			 */
			ackstate |= TF_ACKNOW;
			ntrim = tlen;
			tcps = TCP_STAT_GETREF();
			_NET_STATINC_REF(tcps, TCP_STAT_RCVDUPPACK);
			_NET_STATADD_REF(tcps, TCP_STAT_RCVDUPBYTE, ntrim);
			TCP_STAT_PUTREF();
		} else if ((thflags & TH_RST) &&
		    th->th_seq != vp->rcv_nxt) {
			/*
			 * Test for reset before adjusting the sequence
			 * number for overlapping data.
			 */
			goto dropafterack_ratelim;
		} else {
			/* Partial duplicate: only account for it. */
			tcps = TCP_STAT_GETREF();
			_NET_STATINC_REF(tcps, TCP_STAT_RCVPARTDUPPACK);
			_NET_STATADD_REF(tcps, TCP_STAT_RCVPARTDUPBYTE,
			    ntrim);
			TCP_STAT_PUTREF();
		}

		/*
		 * NOTE: the tcp_new_dsack() call and the hdroptlen
		 * adjustment were commented out in this copy.
		 */

		th->th_seq += ntrim;
		tlen -= ntrim;
		tcp_urp_drop(th, ntrim, &thflags);
	}

	/*
	 * If new data are received on a connection after the
	 * user processes are gone, then RST the other end.
	 */
	if (tlen) {
		TCP_STATINC(TCP_STAT_RCVAFTERCLOSE);
		goto dropwithreset;
	}

	/*
	 * If segment ends after window, drop trailing data
	 * (and PUSH and FIN); if nothing left, just ACK.
	 */
	ntrim = (th->th_seq + tlen) - (vp->rcv_nxt + vp->rcv_wnd);

	if (ntrim > 0) {
		TCP_STATINC(TCP_STAT_RCVPACKAFTERWIN);
		if (ntrim >= tlen) {
			/*
			 * The segment actually starts after the window.
			 * th->th_seq + tlen - vp->rcv_nxt - vp->rcv_wnd >= tlen
			 * th->th_seq - vp->rcv_nxt - vp->rcv_wnd >= 0
			 * th->th_seq >= vp->rcv_nxt + vp->rcv_wnd
			 */
			TCP_STATADD(TCP_STAT_RCVBYTEAFTERWIN, tlen);

			/*
			 * If a new connection request is received
			 * while in TIME_WAIT, drop the old connection
			 * and start over if the sequence numbers
			 * are above the previous ones.
			 *
			 * We only support this in the !NOFDREF case,
			 * which is to say: not here.
			 */
			if ((thflags & TH_SYN) &&
			    SEQ_GT(th->th_seq, vp->rcv_nxt))
				goto dropwithreset;

			/*
			 * If window is closed can only take segments at
			 * window edge, and have to drop data and PUSH from
			 * incoming segments.  Continue processing, but
			 * remember to ack.  Otherwise, drop segment
			 * and (if not RST) ack.
			 */
			if (vp->rcv_wnd != 0 || th->th_seq != vp->rcv_nxt)
				goto dropafterack;
			ackstate |= TF_ACKNOW;
			TCP_STATINC(TCP_STAT_RCVWINPROBE);
		} else {
			TCP_STATADD(TCP_STAT_RCVBYTEAFTERWIN, ntrim);
		}
		m_adj(m, -ntrim);
		tlen -= ntrim;
		thflags &= ~(TH_PUSH|TH_FIN);
	}

	if (thflags & TH_RST) {
		if (th->th_seq != vp->rcv_nxt)
			goto dropafterack_ratelim;

		/* In-sequence RST: remove the vestigial entry. */
		vtw_del(vp->ctl, vp->vtw);
		goto drop;
	}

	/*
	 * If the ACK bit is off we drop the segment and return.
	 */
	if ((thflags & TH_ACK) == 0) {
		if (ackstate & TF_ACKNOW)
			goto dropafterack;
		goto drop;
	}

	/*
	 * In TIME_WAIT state the only thing that should arrive
	 * is a retransmission of the remote FIN.  Acknowledge
	 * it and restart the finack timer.
	 */
	vtw_restart(vp);
	/* ...fall into dropafterack... */

dropafterack:
	/*
	 * Generate an ACK dropping incoming segment if it occupies
	 * sequence space, where the ACK reflects our state.
	 */
	if (thflags & TH_RST)
		goto drop;
	goto dropafterack2;

dropafterack_ratelim:
	/*
	 * We may want to rate-limit ACKs against SYN/RST attack.
	 */
	if (ppsratecheck(&tcp_ackdrop_ppslim_last, &tcp_ackdrop_ppslim_count,
	    tcp_ackdrop_ppslim) == 0) {
		/* XXX stat */
		goto drop;
	}
	/* ...fall into dropafterack2... */

dropafterack2:
	(void)tcp_respond(0, m, m, th, th->th_seq + tlen, th->th_ack, TH_ACK);
	return;

dropwithreset:
	/*
	 * Generate a RST, dropping incoming segment.
	 * Make ACK acceptable to originator of segment.
	 */
	if (thflags & TH_RST)
		goto drop;

	if (thflags & TH_ACK) {
		tcp_respond(0, m, m, th, (tcp_seq)0, th->th_ack, TH_RST);
		return;
	}
	if (thflags & TH_SYN)
		++tlen;
	(void)tcp_respond(0, m, m, th, th->th_seq + tlen, (tcp_seq)0,
	    TH_RST|TH_ACK);
	return;

drop:
	m_freem(m);
}

1182 1.212 yamt /* 1183 1.235 hubertf * TCP input routine, follows pages 65-76 of RFC 793 very closely.
1184 1.1 cgd */ 1185 1.5 mycroft void 1186 1.412 maxv tcp_input(struct mbuf *m, int off, int proto) 1187 1.1 cgd { 1188 1.106 augustss struct tcphdr *th; 1189 1.83 itojun struct ip *ip; 1190 1.106 augustss struct inpcb *inp; 1191 1.83 itojun #ifdef INET6 1192 1.83 itojun struct ip6_hdr *ip6; 1193 1.83 itojun #endif 1194 1.155 itojun u_int8_t *optp = NULL; 1195 1.23 christos int optlen = 0; 1196 1.412 maxv int len, tlen, hdroptlen = 0; 1197 1.368 maxv struct tcpcb *tp = NULL; 1198 1.106 augustss int tiflags; 1199 1.23 christos struct socket *so = NULL; 1200 1.311 yamt int todrop, acked, ourfinisacked, needoutput = 0; 1201 1.311 yamt bool dupseg; 1202 1.157 simonb #ifdef TCP_DEBUG 1203 1.23 christos short ostate = 0; 1204 1.157 simonb #endif 1205 1.12 cgd u_long tiwin; 1206 1.29 thorpej struct tcp_opt_info opti; 1207 1.410 maxv int thlen, iphlen; 1208 1.83 itojun int af; /* af on the wire */ 1209 1.83 itojun struct mbuf *tcp_saveti = NULL; 1210 1.229 yamt uint32_t ts_rtt; 1211 1.244 rpaulo uint8_t iptos; 1212 1.439 riastrad net_stat_ref_t tcps; 1213 1.312 dyoung vestigial_inpcb_t vestige; 1214 1.312 dyoung 1215 1.312 dyoung vestige.valid = 0; 1216 1.23 christos 1217 1.162 matt MCLAIM(m, &tcp_rx_mowner); 1218 1.1 cgd 1219 1.284 thorpej TCP_STATINC(TCP_STAT_RCVTOTAL); 1220 1.29 thorpej 1221 1.295 cegger memset(&opti, 0, sizeof(opti)); 1222 1.29 thorpej opti.ts_present = 0; 1223 1.29 thorpej opti.maxseg = 0; 1224 1.29 thorpej 1225 1.1 cgd /* 1226 1.103 thorpej * RFC1122 4.2.3.10, p. 104: discard bcast/mcast SYN. 1227 1.103 thorpej * 1228 1.103 thorpej * TCP is, by definition, unicast, so we reject all 1229 1.103 thorpej * multicast outright. 1230 1.103 thorpej * 1231 1.103 thorpej * Note, there are additional src/dst address checks in 1232 1.103 thorpej * the AF-specific code below. 
1233 1.103 thorpej */ 1234 1.103 thorpej if (m->m_flags & (M_BCAST|M_MCAST)) { 1235 1.103 thorpej /* XXX stat */ 1236 1.103 thorpej goto drop; 1237 1.103 thorpej } 1238 1.103 thorpej #ifdef INET6 1239 1.103 thorpej if (m->m_flags & M_ANYCAST6) { 1240 1.103 thorpej /* XXX stat */ 1241 1.103 thorpej goto drop; 1242 1.103 thorpej } 1243 1.103 thorpej #endif 1244 1.103 thorpej 1245 1.411 maxv M_REGION_GET(th, struct tcphdr *, m, off, sizeof(struct tcphdr)); 1246 1.370 maxv if (th == NULL) { 1247 1.370 maxv TCP_STATINC(TCP_STAT_RCVSHORT); 1248 1.370 maxv return; 1249 1.370 maxv } 1250 1.370 maxv 1251 1.103 thorpej /* 1252 1.418 christos * Enforce alignment requirements that are violated in 1253 1.418 christos * some cases, see kern/50766 for details. 1254 1.418 christos */ 1255 1.427 jakllsch if (ACCESSIBLE_POINTER(th, struct tcphdr) == 0) { 1256 1.418 christos m = m_copyup(m, off + sizeof(struct tcphdr), 0); 1257 1.418 christos if (m == NULL) { 1258 1.418 christos TCP_STATINC(TCP_STAT_RCVSHORT); 1259 1.418 christos return; 1260 1.418 christos } 1261 1.418 christos th = (struct tcphdr *)(mtod(m, char *) + off); 1262 1.418 christos } 1263 1.427 jakllsch KASSERT(ACCESSIBLE_POINTER(th, struct tcphdr)); 1264 1.418 christos 1265 1.418 christos /* 1266 1.197 itojun * Get IP and TCP header. 1267 1.1 cgd * Note: IP leaves IP header in first mbuf. 1268 1.1 cgd */ 1269 1.83 itojun ip = mtod(m, struct ip *); 1270 1.418 christos #ifdef INET6 1271 1.418 christos ip6 = mtod(m, struct ip6_hdr *); 1272 1.418 christos #endif 1273 1.83 itojun switch (ip->ip_v) { 1274 1.83 itojun case 4: 1275 1.83 itojun af = AF_INET; 1276 1.83 itojun iphlen = sizeof(struct ip); 1277 1.370 maxv 1278 1.373 maxv if (IN_MULTICAST(ip->ip_dst.s_addr) || 1279 1.373 maxv in_broadcast(ip->ip_dst, m_get_rcvif_NOMPSAFE(m))) 1280 1.373 maxv goto drop; 1281 1.373 maxv 1282 1.103 thorpej /* We do the checksum after PCB lookup... 
*/ 1283 1.150 itojun len = ntohs(ip->ip_len); 1284 1.411 maxv tlen = len - off; 1285 1.244 rpaulo iptos = ip->ip_tos; 1286 1.83 itojun break; 1287 1.83 itojun #ifdef INET6 1288 1.83 itojun case 6: 1289 1.83 itojun iphlen = sizeof(struct ip6_hdr); 1290 1.83 itojun af = AF_INET6; 1291 1.101 itojun 1292 1.115 itojun /* 1293 1.115 itojun * Be proactive about unspecified IPv6 address in source. 1294 1.115 itojun * As we use all-zero to indicate unbounded/unconnected pcb, 1295 1.115 itojun * unspecified IPv6 address can be used to confuse us. 1296 1.115 itojun * 1297 1.115 itojun * Note that packets with unspecified IPv6 destination is 1298 1.115 itojun * already dropped in ip6_input. 1299 1.115 itojun */ 1300 1.115 itojun if (IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src)) { 1301 1.101 itojun /* XXX stat */ 1302 1.101 itojun goto drop; 1303 1.101 itojun } 1304 1.1 cgd 1305 1.83 itojun /* 1306 1.103 thorpej * Make sure destination address is not multicast. 1307 1.103 thorpej * Source address checked in ip6_input(). 1308 1.83 itojun */ 1309 1.103 thorpej if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) { 1310 1.103 thorpej /* XXX stat */ 1311 1.103 thorpej goto drop; 1312 1.103 thorpej } 1313 1.103 thorpej 1314 1.103 thorpej /* We do the checksum after PCB lookup... */ 1315 1.83 itojun len = m->m_pkthdr.len; 1316 1.411 maxv tlen = len - off; 1317 1.244 rpaulo iptos = (ntohl(ip6->ip6_flow) >> 20) & 0xff; 1318 1.83 itojun break; 1319 1.83 itojun #endif 1320 1.83 itojun default: 1321 1.83 itojun m_freem(m); 1322 1.83 itojun return; 1323 1.1 cgd } 1324 1.368 maxv 1325 1.146 thorpej 1326 1.1 cgd /* 1327 1.368 maxv * Check that TCP offset makes sense, pull out TCP options and 1328 1.368 maxv * adjust length. 
1329 1.1 cgd */ 1330 1.410 maxv thlen = th->th_off << 2; 1331 1.410 maxv if (thlen < sizeof(struct tcphdr) || thlen > tlen) { 1332 1.284 thorpej TCP_STATINC(TCP_STAT_RCVBADOFF); 1333 1.1 cgd goto drop; 1334 1.1 cgd } 1335 1.410 maxv tlen -= thlen; 1336 1.83 itojun 1337 1.410 maxv if (thlen > sizeof(struct tcphdr)) { 1338 1.411 maxv M_REGION_GET(th, struct tcphdr *, m, off, thlen); 1339 1.99 itojun if (th == NULL) { 1340 1.284 thorpej TCP_STATINC(TCP_STAT_RCVSHORT); 1341 1.99 itojun return; 1342 1.99 itojun } 1343 1.427 jakllsch KASSERT(ACCESSIBLE_POINTER(th, struct tcphdr)); 1344 1.410 maxv optlen = thlen - sizeof(struct tcphdr); 1345 1.155 itojun optp = ((u_int8_t *)th) + sizeof(struct tcphdr); 1346 1.387 maxv 1347 1.143 itojun /* 1348 1.387 maxv * Do quick retrieval of timestamp options. 1349 1.387 maxv * 1350 1.387 maxv * If timestamp is the only option and it's formatted as 1351 1.387 maxv * recommended in RFC 1323 appendix A, we quickly get the 1352 1.387 maxv * values now and don't bother calling tcp_dooptions(), 1353 1.387 maxv * etc. 1354 1.9 mycroft */ 1355 1.9 mycroft if ((optlen == TCPOLEN_TSTAMP_APPA || 1356 1.9 mycroft (optlen > TCPOLEN_TSTAMP_APPA && 1357 1.387 maxv optp[TCPOLEN_TSTAMP_APPA] == TCPOPT_EOL)) && 1358 1.414 kamil be32dec(optp) == TCPOPT_TSTAMP_HDR && 1359 1.387 maxv (th->th_flags & TH_SYN) == 0) { 1360 1.29 thorpej opti.ts_present = 1; 1361 1.414 kamil opti.ts_val = be32dec(optp + 4); 1362 1.414 kamil opti.ts_ecr = be32dec(optp + 8); 1363 1.9 mycroft optp = NULL; /* we've parsed the options */ 1364 1.1 cgd } 1365 1.1 cgd } 1366 1.83 itojun tiflags = th->th_flags; 1367 1.1 cgd 1368 1.1 cgd /* 1369 1.340 kefren * Checksum extended TCP header and data 1370 1.340 kefren */ 1371 1.411 maxv if (tcp_input_checksum(af, m, th, off, thlen, tlen)) 1372 1.340 kefren goto badcsum; 1373 1.340 kefren 1374 1.340 kefren /* 1375 1.1 cgd * Locate pcb for segment. 
1376 1.1 cgd */ 1377 1.1 cgd findpcb: 1378 1.83 itojun inp = NULL; 1379 1.83 itojun switch (af) { 1380 1.83 itojun case AF_INET: 1381 1.437 ozaki inp = inpcb_lookup(&tcbtable, ip->ip_src, th->th_sport, 1382 1.368 maxv ip->ip_dst, th->th_dport, &vestige); 1383 1.374 maxv if (inp == NULL && !vestige.valid) { 1384 1.284 thorpej TCP_STATINC(TCP_STAT_PCBHASHMISS); 1385 1.437 ozaki inp = inpcb_lookup_bound(&tcbtable, ip->ip_dst, 1386 1.368 maxv th->th_dport); 1387 1.83 itojun } 1388 1.120 itojun #ifdef INET6 1389 1.374 maxv if (inp == NULL && !vestige.valid) { 1390 1.83 itojun struct in6_addr s, d; 1391 1.83 itojun 1392 1.83 itojun /* mapped addr case */ 1393 1.345 rtr in6_in_2_v4mapin6(&ip->ip_src, &s); 1394 1.345 rtr in6_in_2_v4mapin6(&ip->ip_dst, &d); 1395 1.438 ozaki inp = in6pcb_lookup(&tcbtable, &s, 1396 1.368 maxv th->th_sport, &d, th->th_dport, 0, &vestige); 1397 1.435 ozaki if (inp == NULL && !vestige.valid) { 1398 1.284 thorpej TCP_STATINC(TCP_STAT_PCBHASHMISS); 1399 1.438 ozaki inp = in6pcb_lookup_bound(&tcbtable, &d, 1400 1.181 itojun th->th_dport, 0); 1401 1.83 itojun } 1402 1.83 itojun } 1403 1.83 itojun #endif 1404 1.435 ozaki if (inp == NULL && !vestige.valid) { 1405 1.284 thorpej TCP_STATINC(TCP_STAT_NOPORT); 1406 1.156 itojun if (tcp_log_refused && 1407 1.156 itojun (tiflags & (TH_RST|TH_ACK|TH_SYN)) == TH_SYN) { 1408 1.145 yamt tcp4_log_refused(ip, th); 1409 1.82 ad } 1410 1.280 yamt tcp_fields_to_host(th); 1411 1.104 thorpej goto dropwithreset_ratelim; 1412 1.21 mycroft } 1413 1.326 christos #if defined(IPSEC) 1414 1.332 christos if (ipsec_used) { 1415 1.435 ozaki if (inp && ipsec_in_reject(m, inp)) 1416 1.332 christos goto drop; 1417 1.83 itojun } 1418 1.83 itojun #endif /*IPSEC*/ 1419 1.83 itojun break; 1420 1.120 itojun #ifdef INET6 1421 1.83 itojun case AF_INET6: 1422 1.87 itojun { 1423 1.87 itojun int faith; 1424 1.87 itojun 1425 1.87 itojun #if defined(NFAITH) && NFAITH > 0 1426 1.124 itojun faith = faithprefix(&ip6->ip6_dst); 1427 1.87 itojun 
#else 1428 1.87 itojun faith = 0; 1429 1.87 itojun #endif 1430 1.438 ozaki inp = in6pcb_lookup(&tcbtable, &ip6->ip6_src, 1431 1.368 maxv th->th_sport, &ip6->ip6_dst, th->th_dport, faith, &vestige); 1432 1.435 ozaki if (inp == NULL && !vestige.valid) { 1433 1.284 thorpej TCP_STATINC(TCP_STAT_PCBHASHMISS); 1434 1.438 ozaki inp = in6pcb_lookup_bound(&tcbtable, &ip6->ip6_dst, 1435 1.368 maxv th->th_dport, faith); 1436 1.83 itojun } 1437 1.435 ozaki if (inp == NULL && !vestige.valid) { 1438 1.284 thorpej TCP_STATINC(TCP_STAT_NOPORT); 1439 1.156 itojun if (tcp_log_refused && 1440 1.156 itojun (tiflags & (TH_RST|TH_ACK|TH_SYN)) == TH_SYN) { 1441 1.145 yamt tcp6_log_refused(ip6, th); 1442 1.136 itojun } 1443 1.280 yamt tcp_fields_to_host(th); 1444 1.104 thorpej goto dropwithreset_ratelim; 1445 1.83 itojun } 1446 1.326 christos #if defined(IPSEC) 1447 1.435 ozaki if (ipsec_used && inp && ipsec_in_reject(m, inp)) 1448 1.83 itojun goto drop; 1449 1.385 maxv #endif 1450 1.83 itojun break; 1451 1.87 itojun } 1452 1.83 itojun #endif 1453 1.1 cgd } 1454 1.1 cgd 1455 1.384 maxv tcp_fields_to_host(th); 1456 1.384 maxv 1457 1.24 mycroft /* 1458 1.24 mycroft * If the state is CLOSED (i.e., TCB does not exist) then 1459 1.24 mycroft * all data in the incoming segment is discarded. 1460 1.24 mycroft * If the TCB exists but is in CLOSED state, it is embryonic, 1461 1.24 mycroft * but should either do a listen or a connect soon. 1462 1.24 mycroft */ 1463 1.83 itojun tp = NULL; 1464 1.83 itojun so = NULL; 1465 1.83 itojun if (inp) { 1466 1.298 minskim /* Check the minimum TTL for socket. */ 1467 1.436 ozaki if (inp->inp_af == AF_INET && ip->ip_ttl < in4p_ip_minttl(inp)) 1468 1.298 minskim goto drop; 1469 1.298 minskim 1470 1.83 itojun tp = intotcpcb(inp); 1471 1.83 itojun so = inp->inp_socket; 1472 1.435 ozaki } else if (vestige.valid) { 1473 1.368 maxv /* We do not support the resurrection of vtw tcpcps. 
*/ 1474 1.372 maxv tcp_vtw_input(th, &vestige, m, tlen); 1475 1.368 maxv m = NULL; 1476 1.312 dyoung goto drop; 1477 1.312 dyoung } 1478 1.312 dyoung 1479 1.384 maxv if (tp == NULL) 1480 1.104 thorpej goto dropwithreset_ratelim; 1481 1.1 cgd if (tp->t_state == TCPS_CLOSED) 1482 1.1 cgd goto drop; 1483 1.103 thorpej 1484 1.289 ad KASSERT(so->so_lock == softnet_lock); 1485 1.289 ad KASSERT(solocked(so)); 1486 1.289 ad 1487 1.9 mycroft /* Unscale the window into a 32-bit value. */ 1488 1.9 mycroft if ((tiflags & TH_SYN) == 0) 1489 1.83 itojun tiwin = th->th_win << tp->snd_scale; 1490 1.9 mycroft else 1491 1.83 itojun tiwin = th->th_win; 1492 1.83 itojun 1493 1.83 itojun #ifdef INET6 1494 1.83 itojun /* save packet options if user wanted */ 1495 1.436 ozaki if (inp->inp_af == AF_INET6 && (inp->inp_flags & IN6P_CONTROLOPTS)) { 1496 1.440 rin m_freem(inp->inp_options); 1497 1.440 rin inp->inp_options = NULL; 1498 1.435 ozaki ip6_savecontrol(inp, &inp->inp_options, ip6, m); 1499 1.83 itojun } 1500 1.83 itojun #endif 1501 1.9 mycroft 1502 1.365 maxv if (so->so_options & SO_DEBUG) { 1503 1.365 maxv #ifdef TCP_DEBUG 1504 1.365 maxv ostate = tp->t_state; 1505 1.365 maxv #endif 1506 1.365 maxv 1507 1.365 maxv tcp_saveti = NULL; 1508 1.365 maxv if (iphlen + sizeof(struct tcphdr) > MHLEN) 1509 1.365 maxv goto nosave; 1510 1.365 maxv 1511 1.365 maxv if (m->m_len > iphlen && (m->m_flags & M_EXT) == 0) { 1512 1.365 maxv tcp_saveti = m_copym(m, 0, iphlen, M_DONTWAIT); 1513 1.374 maxv if (tcp_saveti == NULL) 1514 1.365 maxv goto nosave; 1515 1.365 maxv } else { 1516 1.365 maxv MGETHDR(tcp_saveti, M_DONTWAIT, MT_HEADER); 1517 1.374 maxv if (tcp_saveti == NULL) 1518 1.365 maxv goto nosave; 1519 1.365 maxv MCLAIM(m, &tcp_mowner); 1520 1.365 maxv tcp_saveti->m_len = iphlen; 1521 1.365 maxv m_copydata(m, 0, iphlen, 1522 1.365 maxv mtod(tcp_saveti, void *)); 1523 1.365 maxv } 1524 1.365 maxv 1525 1.365 maxv if (M_TRAILINGSPACE(tcp_saveti) < sizeof(struct tcphdr)) { 1526 1.365 maxv 
m_freem(tcp_saveti); 1527 1.365 maxv tcp_saveti = NULL; 1528 1.365 maxv } else { 1529 1.365 maxv tcp_saveti->m_len += sizeof(struct tcphdr); 1530 1.365 maxv memcpy(mtod(tcp_saveti, char *) + iphlen, th, 1531 1.365 maxv sizeof(struct tcphdr)); 1532 1.365 maxv } 1533 1.365 maxv nosave:; 1534 1.365 maxv } 1535 1.365 maxv 1536 1.365 maxv if (so->so_options & SO_ACCEPTCONN) { 1537 1.83 itojun union syn_cache_sa src; 1538 1.83 itojun union syn_cache_sa dst; 1539 1.83 itojun 1540 1.378 maxv KASSERT(tp->t_state == TCPS_LISTEN); 1541 1.378 maxv 1542 1.295 cegger memset(&src, 0, sizeof(src)); 1543 1.295 cegger memset(&dst, 0, sizeof(dst)); 1544 1.83 itojun switch (af) { 1545 1.83 itojun case AF_INET: 1546 1.83 itojun src.sin.sin_len = sizeof(struct sockaddr_in); 1547 1.83 itojun src.sin.sin_family = AF_INET; 1548 1.83 itojun src.sin.sin_addr = ip->ip_src; 1549 1.83 itojun src.sin.sin_port = th->th_sport; 1550 1.83 itojun 1551 1.83 itojun dst.sin.sin_len = sizeof(struct sockaddr_in); 1552 1.83 itojun dst.sin.sin_family = AF_INET; 1553 1.83 itojun dst.sin.sin_addr = ip->ip_dst; 1554 1.83 itojun dst.sin.sin_port = th->th_dport; 1555 1.83 itojun break; 1556 1.83 itojun #ifdef INET6 1557 1.83 itojun case AF_INET6: 1558 1.83 itojun src.sin6.sin6_len = sizeof(struct sockaddr_in6); 1559 1.83 itojun src.sin6.sin6_family = AF_INET6; 1560 1.83 itojun src.sin6.sin6_addr = ip6->ip6_src; 1561 1.83 itojun src.sin6.sin6_port = th->th_sport; 1562 1.83 itojun 1563 1.83 itojun dst.sin6.sin6_len = sizeof(struct sockaddr_in6); 1564 1.83 itojun dst.sin6.sin6_family = AF_INET6; 1565 1.83 itojun dst.sin6.sin6_addr = ip6->ip6_dst; 1566 1.83 itojun dst.sin6.sin6_port = th->th_dport; 1567 1.83 itojun break; 1568 1.385 maxv #endif 1569 1.83 itojun } 1570 1.83 itojun 1571 1.366 maxv if ((tiflags & (TH_RST|TH_ACK|TH_SYN)) != TH_SYN) { 1572 1.366 maxv if (tiflags & TH_RST) { 1573 1.366 maxv syn_cache_reset(&src.sa, &dst.sa, th); 1574 1.366 maxv } else if ((tiflags & (TH_ACK|TH_SYN)) == 1575 1.366 maxv 
(TH_ACK|TH_SYN)) { 1576 1.366 maxv /* 1577 1.374 maxv * Received a SYN,ACK. This should never 1578 1.374 maxv * happen while we are in LISTEN. Send an RST. 1579 1.366 maxv */ 1580 1.366 maxv goto badsyn; 1581 1.366 maxv } else if (tiflags & TH_ACK) { 1582 1.391 maxv so = syn_cache_get(&src.sa, &dst.sa, th, so, m); 1583 1.366 maxv if (so == NULL) { 1584 1.35 thorpej /* 1585 1.368 maxv * We don't have a SYN for this ACK; 1586 1.368 maxv * send an RST. 1587 1.35 thorpej */ 1588 1.35 thorpej goto badsyn; 1589 1.368 maxv } else if (so == (struct socket *)(-1)) { 1590 1.366 maxv /* 1591 1.368 maxv * We were unable to create the 1592 1.368 maxv * connection. If the 3-way handshake 1593 1.368 maxv * was completed, and RST has been 1594 1.368 maxv * sent to the peer. Since the mbuf 1595 1.368 maxv * might be in use for the reply, do 1596 1.368 maxv * not free it. 1597 1.366 maxv */ 1598 1.366 maxv m = NULL; 1599 1.366 maxv } else { 1600 1.366 maxv /* 1601 1.368 maxv * We have created a full-blown 1602 1.368 maxv * connection. 1603 1.366 maxv */ 1604 1.435 ozaki inp = sotoinpcb(so); 1605 1.435 ozaki tp = intotcpcb(inp); 1606 1.366 maxv if (tp == NULL) 1607 1.366 maxv goto badsyn; /*XXX*/ 1608 1.366 maxv tiwin <<= tp->snd_scale; 1609 1.366 maxv goto after_listen; 1610 1.66 mycroft } 1611 1.143 itojun } else { 1612 1.29 thorpej /* 1613 1.366 maxv * None of RST, SYN or ACK was set. 1614 1.366 maxv * This is an invalid packet for a 1615 1.366 maxv * TCB in LISTEN state. Send a RST. 1616 1.35 thorpej */ 1617 1.366 maxv goto badsyn; 1618 1.366 maxv } 1619 1.366 maxv } else { 1620 1.366 maxv /* 1621 1.366 maxv * Received a SYN. 1622 1.366 maxv */ 1623 1.250 rpaulo 1624 1.83 itojun #ifdef INET6 1625 1.366 maxv /* 1626 1.366 maxv * If deprecated address is forbidden, we do 1627 1.366 maxv * not accept SYN to deprecated interface 1628 1.366 maxv * address to prevent any new inbound 1629 1.366 maxv * connection from getting established. 
1630 1.366 maxv * When we do not accept SYN, we send a TCP 1631 1.366 maxv * RST, with deprecated source address (instead 1632 1.366 maxv * of dropping it). We compromise it as it is 1633 1.366 maxv * much better for peer to send a RST, and 1634 1.366 maxv * RST will be the final packet for the 1635 1.366 maxv * exchange. 1636 1.366 maxv * 1637 1.366 maxv * If we do not forbid deprecated addresses, we 1638 1.366 maxv * accept the SYN packet. RFC2462 does not 1639 1.366 maxv * suggest dropping SYN in this case. 1640 1.366 maxv * If we decipher RFC2462 5.5.4, it says like 1641 1.366 maxv * this: 1642 1.366 maxv * 1. use of deprecated addr with existing 1643 1.366 maxv * communication is okay - "SHOULD continue 1644 1.366 maxv * to be used" 1645 1.366 maxv * 2. use of it with new communication: 1646 1.366 maxv * (2a) "SHOULD NOT be used if alternate 1647 1.366 maxv * address with sufficient scope is 1648 1.366 maxv * available" 1649 1.366 maxv * (2b) nothing mentioned otherwise. 1650 1.366 maxv * Here we fall into (2b) case as we have no 1651 1.366 maxv * choice in our source address selection - we 1652 1.366 maxv * must obey the peer. 1653 1.366 maxv * 1654 1.366 maxv * The wording in RFC2462 is confusing, and 1655 1.366 maxv * there are multiple description text for 1656 1.366 maxv * deprecated address handling - worse, they 1657 1.366 maxv * are not exactly the same. I believe 5.5.4 1658 1.366 maxv * is the best one, so we follow 5.5.4. 
1659 1.366 maxv */ 1660 1.366 maxv if (af == AF_INET6 && !ip6_use_deprecated) { 1661 1.366 maxv struct in6_ifaddr *ia6; 1662 1.366 maxv int s; 1663 1.366 maxv struct ifnet *rcvif = m_get_rcvif(m, &s); 1664 1.366 maxv if (rcvif == NULL) 1665 1.366 maxv goto dropwithreset; /* XXX */ 1666 1.366 maxv if ((ia6 = in6ifa_ifpwithaddr(rcvif, 1667 1.366 maxv &ip6->ip6_dst)) && 1668 1.366 maxv (ia6->ia6_flags & IN6_IFF_DEPRECATED)) { 1669 1.366 maxv tp = NULL; 1670 1.347 ozaki m_put_rcvif(rcvif, &s); 1671 1.366 maxv goto dropwithreset; 1672 1.152 itojun } 1673 1.366 maxv m_put_rcvif(rcvif, &s); 1674 1.366 maxv } 1675 1.152 itojun #endif 1676 1.183 itojun 1677 1.366 maxv /* 1678 1.368 maxv * LISTEN socket received a SYN from itself? This 1679 1.368 maxv * can't possibly be valid; drop the packet. 1680 1.366 maxv */ 1681 1.366 maxv if (th->th_sport == th->th_dport) { 1682 1.374 maxv int eq = 0; 1683 1.152 itojun 1684 1.366 maxv switch (af) { 1685 1.366 maxv case AF_INET: 1686 1.374 maxv eq = in_hosteq(ip->ip_src, ip->ip_dst); 1687 1.366 maxv break; 1688 1.152 itojun #ifdef INET6 1689 1.366 maxv case AF_INET6: 1690 1.374 maxv eq = IN6_ARE_ADDR_EQUAL(&ip6->ip6_src, 1691 1.368 maxv &ip6->ip6_dst); 1692 1.366 maxv break; 1693 1.152 itojun #endif 1694 1.366 maxv } 1695 1.374 maxv if (eq) { 1696 1.366 maxv TCP_STATINC(TCP_STAT_BADSYN); 1697 1.366 maxv goto drop; 1698 1.35 thorpej } 1699 1.366 maxv } 1700 1.83 itojun 1701 1.366 maxv /* 1702 1.366 maxv * SYN looks ok; create compressed TCP 1703 1.366 maxv * state for it. 1704 1.366 maxv */ 1705 1.366 maxv if (so->so_qlen <= so->so_qlimit && 1706 1.411 maxv syn_cache_add(&src.sa, &dst.sa, th, off, 1707 1.368 maxv so, m, optp, optlen, &opti)) 1708 1.366 maxv m = NULL; 1709 1.1 cgd } 1710 1.366 maxv 1711 1.366 maxv goto drop; 1712 1.1 cgd } 1713 1.1 cgd 1714 1.29 thorpej after_listen: 1715 1.29 thorpej /* 1716 1.385 maxv * From here on, we're dealing with !LISTEN. 
1717 1.29 thorpej */ 1718 1.368 maxv KASSERT(tp->t_state != TCPS_LISTEN); 1719 1.29 thorpej 1720 1.1 cgd /* 1721 1.1 cgd * Segment received on connection. 1722 1.1 cgd * Reset idle time and keep-alive timer. 1723 1.1 cgd */ 1724 1.128 thorpej tp->t_rcvtime = tcp_now; 1725 1.25 mycroft if (TCPS_HAVEESTABLISHED(tp->t_state)) 1726 1.267 christos TCP_TIMER_ARM(tp, TCPT_KEEP, tp->t_keepidle); 1727 1.1 cgd 1728 1.1 cgd /* 1729 1.29 thorpej * Process options. 1730 1.1 cgd */ 1731 1.206 itojun #ifdef TCP_SIGNATURE 1732 1.206 itojun if (optp || (tp->t_flags & TF_SIGNATURE)) 1733 1.206 itojun #else 1734 1.29 thorpej if (optp) 1735 1.206 itojun #endif 1736 1.411 maxv if (tcp_dooptions(tp, optp, optlen, th, m, off, &opti) < 0) 1737 1.206 itojun goto drop; 1738 1.9 mycroft 1739 1.222 jonathan if (TCP_SACK_ENABLED(tp)) { 1740 1.222 jonathan tcp_del_sackholes(tp, th); 1741 1.222 jonathan } 1742 1.222 jonathan 1743 1.244 rpaulo if (TCP_ECN_ALLOWED(tp)) { 1744 1.303 rmind if (tiflags & TH_CWR) { 1745 1.303 rmind tp->t_flags &= ~TF_ECN_SND_ECE; 1746 1.303 rmind } 1747 1.244 rpaulo switch (iptos & IPTOS_ECN_MASK) { 1748 1.244 rpaulo case IPTOS_ECN_CE: 1749 1.244 rpaulo tp->t_flags |= TF_ECN_SND_ECE; 1750 1.284 thorpej TCP_STATINC(TCP_STAT_ECN_CE); 1751 1.244 rpaulo break; 1752 1.244 rpaulo case IPTOS_ECN_ECT0: 1753 1.284 thorpej TCP_STATINC(TCP_STAT_ECN_ECT); 1754 1.244 rpaulo break; 1755 1.244 rpaulo case IPTOS_ECN_ECT1: 1756 1.244 rpaulo /* XXX: ignore for now -- rpaulo */ 1757 1.244 rpaulo break; 1758 1.244 rpaulo } 1759 1.244 rpaulo /* 1760 1.244 rpaulo * Congestion experienced. 1761 1.244 rpaulo * Ignore if we are already trying to recover. 
1762 1.244 rpaulo */ 1763 1.244 rpaulo if ((tiflags & TH_ECE) && SEQ_GEQ(tp->snd_una, tp->snd_recover)) 1764 1.251 rpaulo tp->t_congctl->cong_exp(tp); 1765 1.244 rpaulo } 1766 1.244 rpaulo 1767 1.213 mycroft if (opti.ts_present && opti.ts_ecr) { 1768 1.213 mycroft /* 1769 1.213 mycroft * Calculate the RTT from the returned time stamp and the 1770 1.213 mycroft * connection's time base. If the time stamp is later than 1771 1.215 mycroft * the current time, or is extremely old, fall back to non-1323 1772 1.308 yamt * RTT calculation. Since ts_rtt is unsigned, we can test both 1773 1.215 mycroft * at the same time. 1774 1.309 gdt * 1775 1.309 gdt * Note that ts_rtt is in units of slow ticks (500 1776 1.309 gdt * ms). Since most earthbound RTTs are < 500 ms, 1777 1.309 gdt * observed values will have large quantization noise. 1778 1.309 gdt * Our smoothed RTT is then the fraction of observed 1779 1.309 gdt * samples that are 1 tick instead of 0 (times 500 1780 1.309 gdt * ms). 1781 1.309 gdt * 1782 1.309 gdt * ts_rtt is increased by 1 to denote a valid sample, 1783 1.309 gdt * with 0 indicating an invalid measurement. This 1784 1.309 gdt * extra 1 must be removed when ts_rtt is used, or 1785 1.413 msaitoh * else an erroneous extra 500 ms will result. 1786 1.213 mycroft */ 1787 1.229 yamt ts_rtt = TCP_TIMESTAMP(tp) - opti.ts_ecr + 1; 1788 1.229 yamt if (ts_rtt > TCP_PAWS_IDLE) 1789 1.229 yamt ts_rtt = 0; 1790 1.229 yamt } else { 1791 1.229 yamt ts_rtt = 0; 1792 1.213 mycroft } 1793 1.213 mycroft 1794 1.143 itojun /* 1795 1.389 maxv * Fast path: check for the two common cases of a uni-directional 1796 1.389 maxv * data transfer. If: 1797 1.389 maxv * o We are in the ESTABLISHED state, and 1798 1.389 maxv * o The packet has no control flags, and 1799 1.389 maxv * o The packet is in-sequence, and 1800 1.389 maxv * o The window didn't change, and 1801 1.389 maxv * o We are not retransmitting 1802 1.389 maxv * It's a candidate. 
1803 1.389 maxv * 1804 1.389 maxv * If the length (tlen) is zero and the ack moved forward, we're 1805 1.389 maxv * the sender side of the transfer. Just free the data acked and 1806 1.389 maxv * wake any higher level process that was blocked waiting for 1807 1.389 maxv * space. 1808 1.389 maxv * 1809 1.389 maxv * If the length is non-zero and the ack didn't move, we're the 1810 1.389 maxv * receiver side. If we're getting packets in-order (the reassembly 1811 1.389 maxv * queue is empty), add the data to the socket buffer and note 1812 1.389 maxv * that we need a delayed ack. 1813 1.1 cgd */ 1814 1.1 cgd if (tp->t_state == TCPS_ESTABLISHED && 1815 1.244 rpaulo (tiflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ECE|TH_CWR|TH_ACK)) 1816 1.244 rpaulo == TH_ACK && 1817 1.29 thorpej (!opti.ts_present || TSTMP_GEQ(opti.ts_val, tp->ts_recent)) && 1818 1.83 itojun th->th_seq == tp->rcv_nxt && 1819 1.9 mycroft tiwin && tiwin == tp->snd_wnd && 1820 1.1 cgd tp->snd_nxt == tp->snd_max) { 1821 1.9 mycroft 1822 1.143 itojun /* 1823 1.9 mycroft * If last ACK falls within this segment's sequence numbers, 1824 1.278 yamt * record the timestamp. 1825 1.278 yamt * NOTE that the test is modified according to the latest 1826 1.278 yamt * proposal of the tcplw (at) cray.com list (Braden 1993/04/26). 1827 1.278 yamt * 1828 1.278 yamt * note that we already know 1829 1.278 yamt * TSTMP_GEQ(opti.ts_val, tp->ts_recent) 1830 1.9 mycroft */ 1831 1.389 maxv if (opti.ts_present && SEQ_LEQ(th->th_seq, tp->last_ack_sent)) { 1832 1.217 mycroft tp->ts_recent_age = tcp_now; 1833 1.29 thorpej tp->ts_recent = opti.ts_val; 1834 1.9 mycroft } 1835 1.9 mycroft 1836 1.83 itojun if (tlen == 0) { 1837 1.213 mycroft /* Ack prediction. */ 1838 1.83 itojun if (SEQ_GT(th->th_ack, tp->snd_una) && 1839 1.83 itojun SEQ_LEQ(th->th_ack, tp->snd_max) && 1840 1.15 mycroft tp->snd_cwnd >= tp->snd_wnd && 1841 1.214 mycroft tp->t_partialacks < 0) { 1842 1.1 cgd /* 1843 1.1 cgd * this is a pure ack for outstanding data. 
1844 1.1 cgd */ 1845 1.229 yamt if (ts_rtt) 1846 1.314 gdt tcp_xmit_timer(tp, ts_rtt - 1); 1847 1.128 thorpej else if (tp->t_rtttime && 1848 1.83 itojun SEQ_GT(th->th_ack, tp->t_rtseq)) 1849 1.128 thorpej tcp_xmit_timer(tp, 1850 1.198 itojun tcp_now - tp->t_rtttime); 1851 1.83 itojun acked = th->th_ack - tp->snd_una; 1852 1.284 thorpej tcps = TCP_STAT_GETREF(); 1853 1.439 riastrad _NET_STATINC_REF(tcps, TCP_STAT_PREDACK); 1854 1.439 riastrad _NET_STATINC_REF(tcps, TCP_STAT_RCVACKPACK); 1855 1.439 riastrad _NET_STATADD_REF(tcps, TCP_STAT_RCVACKBYTE, 1856 1.439 riastrad acked); 1857 1.284 thorpej TCP_STAT_PUTREF(); 1858 1.421 roy nd_hint(tp); 1859 1.172 ragge 1860 1.186 ragge if (acked > (tp->t_lastoff - tp->t_inoff)) 1861 1.186 ragge tp->t_lastm = NULL; 1862 1.1 cgd sbdrop(&so->so_snd, acked); 1863 1.186 ragge tp->t_lastoff -= acked; 1864 1.170 ragge 1865 1.280 yamt icmp_check(tp, th, acked); 1866 1.231 christos 1867 1.213 mycroft tp->snd_una = th->th_ack; 1868 1.222 jonathan tp->snd_fack = tp->snd_una; 1869 1.213 mycroft if (SEQ_LT(tp->snd_high, tp->snd_una)) 1870 1.213 mycroft tp->snd_high = tp->snd_una; 1871 1.419 kardel /* 1872 1.419 kardel * drag snd_wl2 along so only newer 1873 1.419 kardel * ACKs can update the window size. 1874 1.419 kardel * also avoids the state where snd_wl2 1875 1.419 kardel * is eventually larger than th_ack and thus 1876 1.419 kardel * blocking the window update mechanism and 1877 1.419 kardel * the connection gets stuck for a loooong 1878 1.419 kardel * time in the zero sized send window state. 1879 1.419 kardel * 1880 1.419 kardel * see PR/kern 55567 1881 1.419 kardel */ 1882 1.419 kardel tp->snd_wl2 = tp->snd_una; 1883 1.419 kardel 1884 1.1 cgd m_freem(m); 1885 1.1 cgd 1886 1.1 cgd /* 1887 1.1 cgd * If all outstanding data are acked, stop 1888 1.1 cgd * retransmit timer, otherwise restart timer 1889 1.1 cgd * using current (possibly backed-off) value. 
1890 1.1 cgd * If process is waiting for space, 1891 1.282 rmind * wakeup/selnotify/signal. If data 1892 1.1 cgd * are ready to send, let tcp_output 1893 1.1 cgd * decide between more output or persist. 1894 1.1 cgd */ 1895 1.1 cgd if (tp->snd_una == tp->snd_max) 1896 1.58 thorpej TCP_TIMER_DISARM(tp, TCPT_REXMT); 1897 1.58 thorpej else if (TCP_TIMER_ISARMED(tp, 1898 1.58 thorpej TCPT_PERSIST) == 0) 1899 1.58 thorpej TCP_TIMER_ARM(tp, TCPT_REXMT, 1900 1.58 thorpej tp->t_rxtcur); 1901 1.1 cgd 1902 1.54 matt sowwakeup(so); 1903 1.302 tls if (so->so_snd.sb_cc) { 1904 1.301 tls KERNEL_LOCK(1, NULL); 1905 1.389 maxv (void)tcp_output(tp); 1906 1.301 tls KERNEL_UNLOCK_ONE(NULL); 1907 1.302 tls } 1908 1.440 rin m_freem(tcp_saveti); 1909 1.1 cgd return; 1910 1.1 cgd } 1911 1.83 itojun } else if (th->th_ack == tp->snd_una && 1912 1.141 matt TAILQ_FIRST(&tp->segq) == NULL && 1913 1.83 itojun tlen <= sbspace(&so->so_rcv)) { 1914 1.389 maxv int newsize = 0; 1915 1.269 rmind 1916 1.1 cgd /* 1917 1.1 cgd * this is a pure, in-sequence data packet 1918 1.1 cgd * with nothing on the reassembly queue and 1919 1.1 cgd * we have enough buffer space to take it. 1920 1.1 cgd */ 1921 1.83 itojun tp->rcv_nxt += tlen; 1922 1.420 kardel 1923 1.420 kardel /* 1924 1.420 kardel * Pull rcv_up up to prevent seq wrap relative to 1925 1.420 kardel * rcv_nxt. 1926 1.420 kardel */ 1927 1.420 kardel tp->rcv_up = tp->rcv_nxt; 1928 1.420 kardel 1929 1.420 kardel /* 1930 1.420 kardel * Pull snd_wl1 up to prevent seq wrap relative to 1931 1.420 kardel * th_seq. 
1932 1.420 kardel */ 1933 1.420 kardel tp->snd_wl1 = th->th_seq; 1934 1.420 kardel 1935 1.284 thorpej tcps = TCP_STAT_GETREF(); 1936 1.439 riastrad _NET_STATINC_REF(tcps, TCP_STAT_PREDDAT); 1937 1.439 riastrad _NET_STATINC_REF(tcps, TCP_STAT_RCVPACK); 1938 1.439 riastrad _NET_STATADD_REF(tcps, TCP_STAT_RCVBYTE, tlen); 1939 1.284 thorpej TCP_STAT_PUTREF(); 1940 1.421 roy nd_hint(tp); 1941 1.269 rmind /* 1942 1.269 rmind * Automatic sizing enables the performance of large buffers 1943 1.269 rmind * and most of the efficiency of small ones by only allocating 1944 1.269 rmind * space when it is needed. 1945 1.269 rmind * 1946 1.269 rmind * On the receive side the socket buffer memory is only rarely 1947 1.269 rmind * used to any significant extent. This allows us to be much 1948 1.269 rmind * more aggressive in scaling the receive socket buffer. For 1949 1.269 rmind * the case that the buffer space is actually used to a large 1950 1.269 rmind * extent and we run out of kernel memory we can simply drop 1951 1.269 rmind * the new segments; TCP on the sender will just retransmit it 1952 1.269 rmind * later. Setting the buffer size too big may only consume too 1953 1.269 rmind * much kernel memory if the application doesn't read() from 1954 1.269 rmind * the socket or packet loss or reordering makes use of the 1955 1.269 rmind * reassembly queue. 1956 1.269 rmind * 1957 1.269 rmind * The criteria to step up the receive buffer one notch are: 1958 1.269 rmind * 1. the number of bytes received during the time it takes 1959 1.269 rmind * one timestamp to be reflected back to us (the RTT); 1960 1.269 rmind * 2. received bytes per RTT is within seven eighth of the 1961 1.269 rmind * current socket buffer size; 1962 1.269 rmind * 3. receive buffer size has not hit maximal automatic size; 1963 1.269 rmind * 1964 1.269 rmind * This algorithm does one step per RTT at most and only if 1965 1.269 rmind * we receive a bulk stream w/o packet losses or reorderings. 
1966 1.269 rmind * Shrinking the buffer during idle times is not necessary as 1967 1.269 rmind * it doesn't consume any memory when idle. 1968 1.269 rmind * 1969 1.269 rmind * TODO: Only step up if the application is actually serving 1970 1.269 rmind * the buffer to better manage the socket buffer resources. 1971 1.269 rmind */ 1972 1.269 rmind if (tcp_do_autorcvbuf && 1973 1.269 rmind opti.ts_ecr && 1974 1.269 rmind (so->so_rcv.sb_flags & SB_AUTOSIZE)) { 1975 1.269 rmind if (opti.ts_ecr > tp->rfbuf_ts && 1976 1.270 yamt opti.ts_ecr - tp->rfbuf_ts < PR_SLOWHZ) { 1977 1.269 rmind if (tp->rfbuf_cnt > 1978 1.269 rmind (so->so_rcv.sb_hiwat / 8 * 7) && 1979 1.269 rmind so->so_rcv.sb_hiwat < 1980 1.269 rmind tcp_autorcvbuf_max) { 1981 1.269 rmind newsize = 1982 1.409 riastrad uimin(so->so_rcv.sb_hiwat + 1983 1.269 rmind tcp_autorcvbuf_inc, 1984 1.269 rmind tcp_autorcvbuf_max); 1985 1.269 rmind } 1986 1.269 rmind /* Start over with next RTT. */ 1987 1.269 rmind tp->rfbuf_ts = 0; 1988 1.269 rmind tp->rfbuf_cnt = 0; 1989 1.269 rmind } else 1990 1.269 rmind tp->rfbuf_cnt += tlen; /* add up */ 1991 1.269 rmind } 1992 1.269 rmind 1993 1.1 cgd /* 1994 1.9 mycroft * Drop TCP, IP headers and TCP options then add data 1995 1.9 mycroft * to socket buffer. 1996 1.1 cgd */ 1997 1.389 maxv if (so->so_state & SS_CANTRCVMORE) { 1998 1.154 itojun m_freem(m); 1999 1.389 maxv } else { 2000 1.269 rmind /* 2001 1.269 rmind * Set new socket buffer size. 2002 1.269 rmind * Give up when limit is reached. 
2003 1.269 rmind */ 2004 1.269 rmind if (newsize) 2005 1.269 rmind if (!sbreserve(&so->so_rcv, 2006 1.269 rmind newsize, so)) 2007 1.269 rmind so->so_rcv.sb_flags &= ~SB_AUTOSIZE; 2008 1.411 maxv m_adj(m, off + thlen); 2009 1.154 itojun sbappendstream(&so->so_rcv, m); 2010 1.154 itojun } 2011 1.1 cgd sorwakeup(so); 2012 1.280 yamt tcp_setup_ack(tp, th); 2013 1.302 tls if (tp->t_flags & TF_ACKNOW) { 2014 1.301 tls KERNEL_LOCK(1, NULL); 2015 1.389 maxv (void)tcp_output(tp); 2016 1.301 tls KERNEL_UNLOCK_ONE(NULL); 2017 1.302 tls } 2018 1.440 rin m_freem(tcp_saveti); 2019 1.1 cgd return; 2020 1.1 cgd } 2021 1.1 cgd } 2022 1.1 cgd 2023 1.1 cgd /* 2024 1.97 itojun * Compute mbuf offset to TCP data segment. 2025 1.1 cgd */ 2026 1.411 maxv hdroptlen = off + thlen; 2027 1.1 cgd 2028 1.1 cgd /* 2029 1.385 maxv * Calculate amount of space in receive window. Receive window is 2030 1.385 maxv * amount of space in rcv queue, but not less than advertised 2031 1.385 maxv * window. 2032 1.1 cgd */ 2033 1.385 maxv { 2034 1.385 maxv int win; 2035 1.385 maxv win = sbspace(&so->so_rcv); 2036 1.385 maxv if (win < 0) 2037 1.385 maxv win = 0; 2038 1.385 maxv tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt)); 2039 1.1 cgd } 2040 1.1 cgd 2041 1.269 rmind /* Reset receive buffer auto scaling when not in bulk receive mode. */ 2042 1.269 rmind tp->rfbuf_ts = 0; 2043 1.269 rmind tp->rfbuf_cnt = 0; 2044 1.269 rmind 2045 1.1 cgd switch (tp->t_state) { 2046 1.1 cgd /* 2047 1.1 cgd * If the state is SYN_SENT: 2048 1.1 cgd * if seg contains an ACK, but not for our SYN, drop the input. 2049 1.1 cgd * if seg contains a RST, then drop the connection. 2050 1.1 cgd * if seg does not contain SYN, then drop it. 2051 1.1 cgd * Otherwise this is an acceptable SYN segment 2052 1.1 cgd * initialize tp->rcv_nxt and tp->irs 2053 1.1 cgd * if seg contains ack then advance tp->snd_una 2054 1.244 rpaulo * if seg contains a ECE and ECN support is enabled, the stream 2055 1.244 rpaulo * is ECN capable. 
2056 1.1 cgd * if SYN has been acked change to ESTABLISHED else SYN_RCVD state 2057 1.1 cgd * arrange for segment to be acked (eventually) 2058 1.1 cgd * continue processing rest of data/controls, beginning with URG 2059 1.1 cgd */ 2060 1.1 cgd case TCPS_SYN_SENT: 2061 1.1 cgd if ((tiflags & TH_ACK) && 2062 1.83 itojun (SEQ_LEQ(th->th_ack, tp->iss) || 2063 1.83 itojun SEQ_GT(th->th_ack, tp->snd_max))) 2064 1.1 cgd goto dropwithreset; 2065 1.1 cgd if (tiflags & TH_RST) { 2066 1.1 cgd if (tiflags & TH_ACK) 2067 1.1 cgd tp = tcp_drop(tp, ECONNREFUSED); 2068 1.1 cgd goto drop; 2069 1.1 cgd } 2070 1.1 cgd if ((tiflags & TH_SYN) == 0) 2071 1.1 cgd goto drop; 2072 1.1 cgd if (tiflags & TH_ACK) { 2073 1.213 mycroft tp->snd_una = th->th_ack; 2074 1.1 cgd if (SEQ_LT(tp->snd_nxt, tp->snd_una)) 2075 1.1 cgd tp->snd_nxt = tp->snd_una; 2076 1.213 mycroft if (SEQ_LT(tp->snd_high, tp->snd_una)) 2077 1.213 mycroft tp->snd_high = tp->snd_una; 2078 1.107 matt TCP_TIMER_DISARM(tp, TCPT_REXMT); 2079 1.244 rpaulo 2080 1.244 rpaulo if ((tiflags & TH_ECE) && tcp_do_ecn) { 2081 1.244 rpaulo tp->t_flags |= TF_ECN_PERMIT; 2082 1.284 thorpej TCP_STATINC(TCP_STAT_ECN_SHS); 2083 1.244 rpaulo } 2084 1.1 cgd } 2085 1.83 itojun tp->irs = th->th_seq; 2086 1.1 cgd tcp_rcvseqinit(tp); 2087 1.1 cgd tp->t_flags |= TF_ACKNOW; 2088 1.32 thorpej tcp_mss_from_peer(tp, opti.maxseg); 2089 1.46 thorpej 2090 1.46 thorpej /* 2091 1.46 thorpej * Initialize the initial congestion window. If we 2092 1.46 thorpej * had to retransmit the SYN, we must initialize cwnd 2093 1.62 thorpej * to 1 segment (i.e. the Loss Window). 
2094 1.46 thorpej */ 2095 1.62 thorpej if (tp->t_flags & TF_SYN_REXMT) 2096 1.62 thorpej tp->snd_cwnd = tp->t_peermss; 2097 1.163 thorpej else { 2098 1.163 thorpej int ss = tcp_init_win; 2099 1.436 ozaki if (inp->inp_af == AF_INET && in_localaddr(in4p_faddr(inp))) 2100 1.163 thorpej ss = tcp_init_win_local; 2101 1.163 thorpej #ifdef INET6 2102 1.436 ozaki else if (inp->inp_af == AF_INET6 && in6_localaddr(&in6p_faddr(inp))) 2103 1.163 thorpej ss = tcp_init_win_local; 2104 1.163 thorpej #endif 2105 1.163 thorpej tp->snd_cwnd = TCP_INITIAL_WINDOW(ss, tp->t_peermss); 2106 1.163 thorpej } 2107 1.46 thorpej 2108 1.32 thorpej tcp_rmx_rtt(tp); 2109 1.108 matt if (tiflags & TH_ACK) { 2110 1.284 thorpej TCP_STATINC(TCP_STAT_CONNECTS); 2111 1.312 dyoung /* 2112 1.312 dyoung * move tcp_established before soisconnected 2113 1.313 dholland * because upcall handler can drive tcp_output 2114 1.312 dyoung * functionality. 2115 1.312 dyoung * XXX we might call soisconnected at the end of 2116 1.312 dyoung * all processing 2117 1.312 dyoung */ 2118 1.312 dyoung tcp_established(tp); 2119 1.1 cgd soisconnected(so); 2120 1.9 mycroft /* Do window scaling on this connection? */ 2121 1.9 mycroft if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == 2122 1.158 thorpej (TF_RCVD_SCALE|TF_REQ_SCALE)) { 2123 1.9 mycroft tp->snd_scale = tp->requested_s_scale; 2124 1.9 mycroft tp->rcv_scale = tp->request_r_scale; 2125 1.9 mycroft } 2126 1.72 thorpej TCP_REASS_LOCK(tp); 2127 1.386 maxv (void)tcp_reass(tp, NULL, NULL, tlen); 2128 1.1 cgd /* 2129 1.1 cgd * if we didn't have to retransmit the SYN, 2130 1.1 cgd * use its rtt as our initial srtt & rtt var. 2131 1.1 cgd */ 2132 1.128 thorpej if (tp->t_rtttime) 2133 1.128 thorpej tcp_xmit_timer(tp, tcp_now - tp->t_rtttime); 2134 1.389 maxv } else { 2135 1.1 cgd tp->t_state = TCPS_SYN_RECEIVED; 2136 1.389 maxv } 2137 1.1 cgd 2138 1.1 cgd /* 2139 1.83 itojun * Advance th->th_seq to correspond to first data byte. 
2140 1.1 cgd * If data, trim to stay within window, 2141 1.1 cgd * dropping FIN if necessary. 2142 1.1 cgd */ 2143 1.83 itojun th->th_seq++; 2144 1.83 itojun if (tlen > tp->rcv_wnd) { 2145 1.83 itojun todrop = tlen - tp->rcv_wnd; 2146 1.1 cgd m_adj(m, -todrop); 2147 1.83 itojun tlen = tp->rcv_wnd; 2148 1.1 cgd tiflags &= ~TH_FIN; 2149 1.284 thorpej tcps = TCP_STAT_GETREF(); 2150 1.439 riastrad _NET_STATINC_REF(tcps, TCP_STAT_RCVPACKAFTERWIN); 2151 1.439 riastrad _NET_STATADD_REF(tcps, TCP_STAT_RCVBYTEAFTERWIN, 2152 1.439 riastrad todrop); 2153 1.284 thorpej TCP_STAT_PUTREF(); 2154 1.1 cgd } 2155 1.83 itojun tp->snd_wl1 = th->th_seq - 1; 2156 1.83 itojun tp->rcv_up = th->th_seq; 2157 1.1 cgd goto step6; 2158 1.29 thorpej 2159 1.29 thorpej /* 2160 1.29 thorpej * If the state is SYN_RECEIVED: 2161 1.29 thorpej * If seg contains an ACK, but not for our SYN, drop the input 2162 1.29 thorpej * and generate an RST. See page 36, rfc793 2163 1.29 thorpej */ 2164 1.29 thorpej case TCPS_SYN_RECEIVED: 2165 1.29 thorpej if ((tiflags & TH_ACK) && 2166 1.83 itojun (SEQ_LEQ(th->th_ack, tp->iss) || 2167 1.83 itojun SEQ_GT(th->th_ack, tp->snd_max))) 2168 1.29 thorpej goto dropwithreset; 2169 1.29 thorpej break; 2170 1.1 cgd } 2171 1.1 cgd 2172 1.1 cgd /* 2173 1.385 maxv * From here on, we're dealing with !LISTEN and !SYN_SENT. 2174 1.385 maxv */ 2175 1.385 maxv KASSERT(tp->t_state != TCPS_LISTEN && 2176 1.385 maxv tp->t_state != TCPS_SYN_SENT); 2177 1.385 maxv 2178 1.385 maxv /* 2179 1.389 maxv * RFC1323 PAWS: if we have a timestamp reply on this segment and 2180 1.389 maxv * it's less than ts_recent, drop it. 2181 1.1 cgd */ 2182 1.29 thorpej if (opti.ts_present && (tiflags & TH_RST) == 0 && tp->ts_recent && 2183 1.29 thorpej TSTMP_LT(opti.ts_val, tp->ts_recent)) { 2184 1.9 mycroft /* Check to see if ts_recent is over 24 days old. */ 2185 1.217 mycroft if (tcp_now - tp->ts_recent_age > TCP_PAWS_IDLE) { 2186 1.9 mycroft /* 2187 1.9 mycroft * Invalidate ts_recent. 
If this segment updates 2188 1.9 mycroft * ts_recent, the age will be reset later and ts_recent 2189 1.9 mycroft * will get a valid value. If it does not, setting 2190 1.9 mycroft * ts_recent to zero will at least satisfy the 2191 1.9 mycroft * requirement that zero be placed in the timestamp 2192 1.9 mycroft * echo reply when ts_recent isn't valid. The 2193 1.9 mycroft * age isn't reset until we get a valid ts_recent 2194 1.9 mycroft * because we don't want out-of-order segments to be 2195 1.9 mycroft * dropped when ts_recent is old. 2196 1.9 mycroft */ 2197 1.9 mycroft tp->ts_recent = 0; 2198 1.9 mycroft } else { 2199 1.284 thorpej tcps = TCP_STAT_GETREF(); 2200 1.439 riastrad _NET_STATINC_REF(tcps, TCP_STAT_RCVDUPPACK); 2201 1.439 riastrad _NET_STATADD_REF(tcps, TCP_STAT_RCVDUPBYTE, tlen); 2202 1.439 riastrad _NET_STATINC_REF(tcps, TCP_STAT_PAWSDROP); 2203 1.284 thorpej TCP_STAT_PUTREF(); 2204 1.222 jonathan tcp_new_dsack(tp, th->th_seq, tlen); 2205 1.9 mycroft goto dropafterack; 2206 1.9 mycroft } 2207 1.9 mycroft } 2208 1.9 mycroft 2209 1.389 maxv /* 2210 1.389 maxv * Check that at least some bytes of the segment are within the 2211 1.389 maxv * receive window. If segment begins before rcv_nxt, drop leading 2212 1.389 maxv * data (and SYN); if nothing left, just ack. 2213 1.389 maxv */ 2214 1.83 itojun todrop = tp->rcv_nxt - th->th_seq; 2215 1.261 thorpej dupseg = false; 2216 1.1 cgd if (todrop > 0) { 2217 1.1 cgd if (tiflags & TH_SYN) { 2218 1.1 cgd tiflags &= ~TH_SYN; 2219 1.83 itojun th->th_seq++; 2220 1.390 maxv tcp_urp_drop(th, 1, &tiflags); 2221 1.1 cgd todrop--; 2222 1.1 cgd } 2223 1.83 itojun if (todrop > tlen || 2224 1.83 itojun (todrop == tlen && (tiflags & TH_FIN) == 0)) { 2225 1.1 cgd /* 2226 1.193 matt * Any valid FIN or RST must be to the left of the 2227 1.193 matt * window. At this point the FIN or RST must be a 2228 1.193 matt * duplicate or out of sequence; drop it. 
2229 1.7 mycroft */ 2230 1.193 matt if (tiflags & TH_RST) 2231 1.193 matt goto drop; 2232 1.193 matt tiflags &= ~(TH_FIN|TH_RST); 2233 1.389 maxv 2234 1.7 mycroft /* 2235 1.42 mycroft * Send an ACK to resynchronize and drop any data. 2236 1.42 mycroft * But keep on processing for RST or ACK. 2237 1.1 cgd */ 2238 1.7 mycroft tp->t_flags |= TF_ACKNOW; 2239 1.83 itojun todrop = tlen; 2240 1.261 thorpej dupseg = true; 2241 1.284 thorpej tcps = TCP_STAT_GETREF(); 2242 1.439 riastrad _NET_STATINC_REF(tcps, TCP_STAT_RCVDUPPACK); 2243 1.439 riastrad _NET_STATADD_REF(tcps, TCP_STAT_RCVDUPBYTE, todrop); 2244 1.284 thorpej TCP_STAT_PUTREF(); 2245 1.389 maxv } else if ((tiflags & TH_RST) && th->th_seq != tp->rcv_nxt) { 2246 1.204 matt /* 2247 1.204 matt * Test for reset before adjusting the sequence 2248 1.204 matt * number for overlapping data. 2249 1.204 matt */ 2250 1.204 matt goto dropafterack_ratelim; 2251 1.1 cgd } else { 2252 1.284 thorpej tcps = TCP_STAT_GETREF(); 2253 1.439 riastrad _NET_STATINC_REF(tcps, TCP_STAT_RCVPARTDUPPACK); 2254 1.439 riastrad _NET_STATADD_REF(tcps, TCP_STAT_RCVPARTDUPBYTE, 2255 1.439 riastrad todrop); 2256 1.284 thorpej TCP_STAT_PUTREF(); 2257 1.1 cgd } 2258 1.222 jonathan tcp_new_dsack(tp, th->th_seq, todrop); 2259 1.399 maxv hdroptlen += todrop; /* drop from head afterwards (m_adj) */ 2260 1.83 itojun th->th_seq += todrop; 2261 1.83 itojun tlen -= todrop; 2262 1.390 maxv tcp_urp_drop(th, todrop, &tiflags); 2263 1.1 cgd } 2264 1.1 cgd 2265 1.1 cgd /* 2266 1.389 maxv * If new data is received on a connection after the user processes 2267 1.389 maxv * are gone, then RST the other end. 
2268 1.1 cgd */ 2269 1.1 cgd if ((so->so_state & SS_NOFDREF) && 2270 1.83 itojun tp->t_state > TCPS_CLOSE_WAIT && tlen) { 2271 1.1 cgd tp = tcp_close(tp); 2272 1.284 thorpej TCP_STATINC(TCP_STAT_RCVAFTERCLOSE); 2273 1.1 cgd goto dropwithreset; 2274 1.1 cgd } 2275 1.1 cgd 2276 1.1 cgd /* 2277 1.389 maxv * If the segment ends after the window, drop trailing data (and 2278 1.389 maxv * PUSH and FIN); if nothing left, just ACK. 2279 1.1 cgd */ 2280 1.387 maxv todrop = (th->th_seq + tlen) - (tp->rcv_nxt + tp->rcv_wnd); 2281 1.1 cgd if (todrop > 0) { 2282 1.284 thorpej TCP_STATINC(TCP_STAT_RCVPACKAFTERWIN); 2283 1.83 itojun if (todrop >= tlen) { 2284 1.193 matt /* 2285 1.193 matt * The segment actually starts after the window. 2286 1.193 matt * th->th_seq + tlen - tp->rcv_nxt - tp->rcv_wnd >= tlen 2287 1.193 matt * th->th_seq - tp->rcv_nxt - tp->rcv_wnd >= 0 2288 1.193 matt * th->th_seq >= tp->rcv_nxt + tp->rcv_wnd 2289 1.193 matt */ 2290 1.284 thorpej TCP_STATADD(TCP_STAT_RCVBYTEAFTERWIN, tlen); 2291 1.387 maxv 2292 1.1 cgd /* 2293 1.387 maxv * If a new connection request is received while in 2294 1.387 maxv * TIME_WAIT, drop the old connection and start over 2295 1.387 maxv * if the sequence numbers are above the previous 2296 1.387 maxv * ones. 2297 1.387 maxv * 2298 1.387 maxv * NOTE: We need to put the header fields back into 2299 1.387 maxv * network order. 2300 1.1 cgd */ 2301 1.387 maxv if ((tiflags & TH_SYN) && 2302 1.1 cgd tp->t_state == TCPS_TIME_WAIT && 2303 1.83 itojun SEQ_GT(th->th_seq, tp->rcv_nxt)) { 2304 1.1 cgd tp = tcp_close(tp); 2305 1.280 yamt tcp_fields_to_net(th); 2306 1.397 maxv m_freem(tcp_saveti); 2307 1.397 maxv tcp_saveti = NULL; 2308 1.1 cgd goto findpcb; 2309 1.1 cgd } 2310 1.387 maxv 2311 1.1 cgd /* 2312 1.1 cgd * If window is closed can only take segments at 2313 1.1 cgd * window edge, and have to drop data and PUSH from 2314 1.1 cgd * incoming segments. Continue processing, but 2315 1.1 cgd * remember to ack. 
Otherwise, drop segment 2316 1.193 matt * and (if not RST) ack. 2317 1.1 cgd */ 2318 1.83 itojun if (tp->rcv_wnd == 0 && th->th_seq == tp->rcv_nxt) { 2319 1.395 maxv KASSERT(todrop == tlen); 2320 1.1 cgd tp->t_flags |= TF_ACKNOW; 2321 1.284 thorpej TCP_STATINC(TCP_STAT_RCVWINPROBE); 2322 1.387 maxv } else { 2323 1.1 cgd goto dropafterack; 2324 1.387 maxv } 2325 1.387 maxv } else { 2326 1.284 thorpej TCP_STATADD(TCP_STAT_RCVBYTEAFTERWIN, todrop); 2327 1.387 maxv } 2328 1.1 cgd m_adj(m, -todrop); 2329 1.83 itojun tlen -= todrop; 2330 1.1 cgd tiflags &= ~(TH_PUSH|TH_FIN); 2331 1.1 cgd } 2332 1.1 cgd 2333 1.1 cgd /* 2334 1.9 mycroft * If last ACK falls within this segment's sequence numbers, 2335 1.278 yamt * record the timestamp. 2336 1.278 yamt * NOTE: 2337 1.278 yamt * 1) That the test incorporates suggestions from the latest 2338 1.278 yamt * proposal of the tcplw (at) cray.com list (Braden 1993/04/26). 2339 1.278 yamt * 2) That updating only on newer timestamps interferes with 2340 1.278 yamt * our earlier PAWS tests, so this check should be solely 2341 1.278 yamt * predicated on the sequence space of this segment. 2342 1.278 yamt * 3) That we modify the segment boundary check to be 2343 1.278 yamt * Last.ACK.Sent <= SEG.SEQ + SEG.Len 2344 1.278 yamt * instead of RFC1323's 2345 1.278 yamt * Last.ACK.Sent < SEG.SEQ + SEG.Len, 2346 1.278 yamt * This modified check allows us to overcome RFC1323's 2347 1.278 yamt * limitations as described in Stevens TCP/IP Illustrated 2348 1.278 yamt * Vol. 2 p.869. In such cases, we can still calculate the 2349 1.278 yamt * RTT correctly when RCV.NXT == Last.ACK.Sent. 
2350 1.9 mycroft */ 2351 1.278 yamt if (opti.ts_present && 2352 1.83 itojun SEQ_LEQ(th->th_seq, tp->last_ack_sent) && 2353 1.278 yamt SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + 2354 1.389 maxv ((tiflags & (TH_SYN|TH_FIN)) != 0))) { 2355 1.217 mycroft tp->ts_recent_age = tcp_now; 2356 1.29 thorpej tp->ts_recent = opti.ts_val; 2357 1.9 mycroft } 2358 1.9 mycroft 2359 1.9 mycroft /* 2360 1.1 cgd * If the RST bit is set examine the state: 2361 1.385 maxv * RECEIVED state: 2362 1.385 maxv * If passive open, return to LISTEN state. 2363 1.385 maxv * If active open, inform user that connection was refused. 2364 1.385 maxv * ESTABLISHED, FIN_WAIT_1, FIN_WAIT2, CLOSE_WAIT states: 2365 1.385 maxv * Inform user that connection was reset, and close tcb. 2366 1.385 maxv * CLOSING, LAST_ACK, TIME_WAIT states: 2367 1.385 maxv * Close the tcb. 2368 1.1 cgd */ 2369 1.194 itojun if (tiflags & TH_RST) { 2370 1.296 christos if (th->th_seq != tp->rcv_nxt) 2371 1.194 itojun goto dropafterack_ratelim; 2372 1.1 cgd 2373 1.194 itojun switch (tp->t_state) { 2374 1.194 itojun case TCPS_SYN_RECEIVED: 2375 1.194 itojun so->so_error = ECONNREFUSED; 2376 1.194 itojun goto close; 2377 1.1 cgd 2378 1.194 itojun case TCPS_ESTABLISHED: 2379 1.194 itojun case TCPS_FIN_WAIT_1: 2380 1.194 itojun case TCPS_FIN_WAIT_2: 2381 1.194 itojun case TCPS_CLOSE_WAIT: 2382 1.194 itojun so->so_error = ECONNRESET; 2383 1.194 itojun close: 2384 1.194 itojun tp->t_state = TCPS_CLOSED; 2385 1.284 thorpej TCP_STATINC(TCP_STAT_DROPS); 2386 1.194 itojun tp = tcp_close(tp); 2387 1.194 itojun goto drop; 2388 1.1 cgd 2389 1.194 itojun case TCPS_CLOSING: 2390 1.194 itojun case TCPS_LAST_ACK: 2391 1.194 itojun case TCPS_TIME_WAIT: 2392 1.194 itojun tp = tcp_close(tp); 2393 1.194 itojun goto drop; 2394 1.194 itojun } 2395 1.1 cgd } 2396 1.1 cgd 2397 1.1 cgd /* 2398 1.193 matt * Since we've covered the SYN-SENT and SYN-RECEIVED states above 2399 1.416 jnemeth * we must be in a synchronized state. 
RFC793 states (under Reset 2400 1.416 jnemeth * Generation) that any unacceptable segment (an out-of-order SYN 2401 1.193 matt * qualifies) received in a synchronized state must elicit only an 2402 1.193 matt * empty acknowledgment segment ... and the connection remains in 2403 1.193 matt * the same state. 2404 1.1 cgd */ 2405 1.195 itojun if (tiflags & TH_SYN) { 2406 1.195 itojun if (tp->rcv_nxt == th->th_seq) { 2407 1.195 itojun tcp_respond(tp, m, m, th, (tcp_seq)0, th->th_ack - 1, 2408 1.195 itojun TH_ACK); 2409 1.440 rin m_freem(tcp_saveti); 2410 1.195 itojun return; 2411 1.195 itojun } 2412 1.221 perry 2413 1.194 itojun goto dropafterack_ratelim; 2414 1.195 itojun } 2415 1.1 cgd 2416 1.1 cgd /* 2417 1.1 cgd * If the ACK bit is off we drop the segment and return. 2418 1.1 cgd */ 2419 1.78 kml if ((tiflags & TH_ACK) == 0) { 2420 1.78 kml if (tp->t_flags & TF_ACKNOW) 2421 1.78 kml goto dropafterack; 2422 1.396 maxv goto drop; 2423 1.78 kml } 2424 1.143 itojun 2425 1.1 cgd /* 2426 1.389 maxv * From here on, we're doing ACK processing. 2427 1.1 cgd */ 2428 1.389 maxv 2429 1.1 cgd switch (tp->t_state) { 2430 1.1 cgd /* 2431 1.1 cgd * In SYN_RECEIVED state if the ack ACKs our SYN then enter 2432 1.1 cgd * ESTABLISHED state and continue processing, otherwise 2433 1.1 cgd * send an RST. 2434 1.1 cgd */ 2435 1.1 cgd case TCPS_SYN_RECEIVED: 2436 1.83 itojun if (SEQ_GT(tp->snd_una, th->th_ack) || 2437 1.83 itojun SEQ_GT(th->th_ack, tp->snd_max)) 2438 1.1 cgd goto dropwithreset; 2439 1.284 thorpej TCP_STATINC(TCP_STAT_CONNECTS); 2440 1.1 cgd soisconnected(so); 2441 1.32 thorpej tcp_established(tp); 2442 1.9 mycroft /* Do window scaling? 
*/ 2443 1.9 mycroft if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == 2444 1.158 thorpej (TF_RCVD_SCALE|TF_REQ_SCALE)) { 2445 1.9 mycroft tp->snd_scale = tp->requested_s_scale; 2446 1.9 mycroft tp->rcv_scale = tp->request_r_scale; 2447 1.9 mycroft } 2448 1.72 thorpej TCP_REASS_LOCK(tp); 2449 1.386 maxv (void)tcp_reass(tp, NULL, NULL, tlen); 2450 1.83 itojun tp->snd_wl1 = th->th_seq - 1; 2451 1.389 maxv /* FALLTHROUGH */ 2452 1.1 cgd 2453 1.1 cgd /* 2454 1.1 cgd * In ESTABLISHED state: drop duplicate ACKs; ACK out of range 2455 1.1 cgd * ACKs. If the ack is in the range 2456 1.83 itojun * tp->snd_una < th->th_ack <= tp->snd_max 2457 1.83 itojun * then advance tp->snd_una to th->th_ack and drop 2458 1.1 cgd * data from the retransmission queue. If this ACK reflects 2459 1.1 cgd * more up to date window information we update our window information. 2460 1.1 cgd */ 2461 1.1 cgd case TCPS_ESTABLISHED: 2462 1.1 cgd case TCPS_FIN_WAIT_1: 2463 1.1 cgd case TCPS_FIN_WAIT_2: 2464 1.1 cgd case TCPS_CLOSE_WAIT: 2465 1.1 cgd case TCPS_CLOSING: 2466 1.1 cgd case TCPS_LAST_ACK: 2467 1.1 cgd case TCPS_TIME_WAIT: 2468 1.83 itojun if (SEQ_LEQ(th->th_ack, tp->snd_una)) { 2469 1.218 mycroft if (tlen == 0 && !dupseg && tiwin == tp->snd_wnd) { 2470 1.307 yamt TCP_STATINC(TCP_STAT_RCVDUPACK); 2471 1.1 cgd /* 2472 1.1 cgd * If we have outstanding data (other than 2473 1.1 cgd * a window probe), this is a completely 2474 1.1 cgd * duplicate ack (ie, window info didn't 2475 1.1 cgd * change), the ack is the biggest we've 2476 1.1 cgd * seen and we've seen exactly our rexmt 2477 1.429 andvar * threshold of them, assume a packet 2478 1.1 cgd * has been dropped and retransmit it. 2479 1.1 cgd * Kludge snd_nxt & the congestion 2480 1.1 cgd * window so we send only this one 2481 1.1 cgd * packet. 
2482 1.1 cgd */ 2483 1.58 thorpej if (TCP_TIMER_ISARMED(tp, TCPT_REXMT) == 0 || 2484 1.83 itojun th->th_ack != tp->snd_una) 2485 1.1 cgd tp->t_dupacks = 0; 2486 1.222 jonathan else if (tp->t_partialacks < 0 && 2487 1.389 maxv (++tp->t_dupacks == tcprexmtthresh || 2488 1.389 maxv TCP_FACK_FASTRECOV(tp))) { 2489 1.246 rpaulo /* 2490 1.246 rpaulo * Do the fast retransmit, and adjust 2491 1.432 andvar * congestion control parameters. 2492 1.246 rpaulo */ 2493 1.246 rpaulo if (tp->t_congctl->fast_retransmit(tp, th)) { 2494 1.246 rpaulo /* False fast retransmit */ 2495 1.213 mycroft break; 2496 1.398 maxv } 2497 1.398 maxv goto drop; 2498 1.1 cgd } else if (tp->t_dupacks > tcprexmtthresh) { 2499 1.34 kml tp->snd_cwnd += tp->t_segsz; 2500 1.301 tls KERNEL_LOCK(1, NULL); 2501 1.389 maxv (void)tcp_output(tp); 2502 1.301 tls KERNEL_UNLOCK_ONE(NULL); 2503 1.1 cgd goto drop; 2504 1.1 cgd } 2505 1.218 mycroft } else { 2506 1.218 mycroft /* 2507 1.218 mycroft * If the ack appears to be very old, only 2508 1.218 mycroft * allow data that is in-sequence. This 2509 1.218 mycroft * makes it somewhat more difficult to insert 2510 1.218 mycroft * forged data by guessing sequence numbers. 2511 1.218 mycroft * Sent an ack to try to update the send 2512 1.218 mycroft * sequence number on the other side. 2513 1.218 mycroft */ 2514 1.218 mycroft if (tlen && th->th_seq != tp->rcv_nxt && 2515 1.194 itojun SEQ_LT(th->th_ack, 2516 1.218 mycroft tp->snd_una - tp->max_sndwnd)) 2517 1.218 mycroft goto dropafterack; 2518 1.218 mycroft } 2519 1.1 cgd break; 2520 1.1 cgd } 2521 1.1 cgd /* 2522 1.1 cgd * If the congestion window was inflated to account 2523 1.1 cgd * for the other side's cached packets, retract it. 
2524 1.1 cgd */ 2525 1.330 kefren tp->t_congctl->fast_retransmit_newack(tp, th); 2526 1.330 kefren 2527 1.83 itojun if (SEQ_GT(th->th_ack, tp->snd_max)) { 2528 1.284 thorpej TCP_STATINC(TCP_STAT_RCVACKTOOMUCH); 2529 1.1 cgd goto dropafterack; 2530 1.1 cgd } 2531 1.83 itojun acked = th->th_ack - tp->snd_una; 2532 1.284 thorpej tcps = TCP_STAT_GETREF(); 2533 1.439 riastrad _NET_STATINC_REF(tcps, TCP_STAT_RCVACKPACK); 2534 1.439 riastrad _NET_STATADD_REF(tcps, TCP_STAT_RCVACKBYTE, acked); 2535 1.284 thorpej TCP_STAT_PUTREF(); 2536 1.1 cgd 2537 1.1 cgd /* 2538 1.9 mycroft * If we have a timestamp reply, update smoothed 2539 1.9 mycroft * round trip time. If no timestamp is present but 2540 1.9 mycroft * transmit timer is running and timed sequence 2541 1.1 cgd * number was acked, update smoothed round trip time. 2542 1.1 cgd * Since we now have an rtt measurement, cancel the 2543 1.1 cgd * timer backoff (cf., Phil Karn's retransmit alg.). 2544 1.1 cgd * Recompute the initial retransmit timer. 2545 1.1 cgd */ 2546 1.229 yamt if (ts_rtt) 2547 1.314 gdt tcp_xmit_timer(tp, ts_rtt - 1); 2548 1.128 thorpej else if (tp->t_rtttime && SEQ_GT(th->th_ack, tp->t_rtseq)) 2549 1.128 thorpej tcp_xmit_timer(tp, tcp_now - tp->t_rtttime); 2550 1.1 cgd 2551 1.1 cgd /* 2552 1.1 cgd * If all outstanding data is acked, stop retransmit 2553 1.1 cgd * timer and remember to restart (more output or persist). 2554 1.1 cgd * If there is more data to be acked, restart retransmit 2555 1.1 cgd * timer, using current (possibly backed-off) value. 2556 1.1 cgd */ 2557 1.83 itojun if (th->th_ack == tp->snd_max) { 2558 1.58 thorpej TCP_TIMER_DISARM(tp, TCPT_REXMT); 2559 1.1 cgd needoutput = 1; 2560 1.58 thorpej } else if (TCP_TIMER_ISARMED(tp, TCPT_PERSIST) == 0) 2561 1.58 thorpej TCP_TIMER_ARM(tp, TCPT_REXMT, tp->t_rxtcur); 2562 1.246 rpaulo 2563 1.1 cgd /* 2564 1.246 rpaulo * New data has been acked, adjust the congestion window. 
2565 1.246 rpaulo */ 2566 1.248 rpaulo tp->t_congctl->newack(tp, th); 2567 1.246 rpaulo 2568 1.421 roy nd_hint(tp); 2569 1.1 cgd if (acked > so->so_snd.sb_cc) { 2570 1.1 cgd tp->snd_wnd -= so->so_snd.sb_cc; 2571 1.1 cgd sbdrop(&so->so_snd, (int)so->so_snd.sb_cc); 2572 1.1 cgd ourfinisacked = 1; 2573 1.1 cgd } else { 2574 1.186 ragge if (acked > (tp->t_lastoff - tp->t_inoff)) 2575 1.186 ragge tp->t_lastm = NULL; 2576 1.1 cgd sbdrop(&so->so_snd, acked); 2577 1.186 ragge tp->t_lastoff -= acked; 2578 1.343 matt if (tp->snd_wnd > acked) 2579 1.343 matt tp->snd_wnd -= acked; 2580 1.343 matt else 2581 1.343 matt tp->snd_wnd = 0; 2582 1.1 cgd ourfinisacked = 0; 2583 1.1 cgd } 2584 1.54 matt sowwakeup(so); 2585 1.231 christos 2586 1.280 yamt icmp_check(tp, th, acked); 2587 1.231 christos 2588 1.213 mycroft tp->snd_una = th->th_ack; 2589 1.222 jonathan if (SEQ_GT(tp->snd_una, tp->snd_fack)) 2590 1.222 jonathan tp->snd_fack = tp->snd_una; 2591 1.1 cgd if (SEQ_LT(tp->snd_nxt, tp->snd_una)) 2592 1.1 cgd tp->snd_nxt = tp->snd_una; 2593 1.213 mycroft if (SEQ_LT(tp->snd_high, tp->snd_una)) 2594 1.213 mycroft tp->snd_high = tp->snd_una; 2595 1.1 cgd 2596 1.1 cgd switch (tp->t_state) { 2597 1.1 cgd 2598 1.1 cgd /* 2599 1.1 cgd * In FIN_WAIT_1 STATE in addition to the processing 2600 1.1 cgd * for the ESTABLISHED state if our FIN is now acknowledged 2601 1.1 cgd * then enter FIN_WAIT_2. 2602 1.1 cgd */ 2603 1.1 cgd case TCPS_FIN_WAIT_1: 2604 1.1 cgd if (ourfinisacked) { 2605 1.1 cgd /* 2606 1.1 cgd * If we can't receive any more 2607 1.1 cgd * data, then closing user can proceed. 2608 1.1 cgd * Starting the timer is contrary to the 2609 1.1 cgd * specification, but if we don't get a FIN 2610 1.1 cgd * we'll hang forever. 
2611 1.1 cgd */ 2612 1.1 cgd if (so->so_state & SS_CANTRCVMORE) { 2613 1.1 cgd soisdisconnected(so); 2614 1.267 christos if (tp->t_maxidle > 0) 2615 1.65 mouse TCP_TIMER_ARM(tp, TCPT_2MSL, 2616 1.267 christos tp->t_maxidle); 2617 1.1 cgd } 2618 1.1 cgd tp->t_state = TCPS_FIN_WAIT_2; 2619 1.1 cgd } 2620 1.1 cgd break; 2621 1.1 cgd 2622 1.1 cgd /* 2623 1.1 cgd * In CLOSING STATE in addition to the processing for 2624 1.1 cgd * the ESTABLISHED state if the ACK acknowledges our FIN 2625 1.1 cgd * then enter the TIME-WAIT state, otherwise ignore 2626 1.1 cgd * the segment. 2627 1.1 cgd */ 2628 1.1 cgd case TCPS_CLOSING: 2629 1.1 cgd if (ourfinisacked) { 2630 1.1 cgd tp->t_state = TCPS_TIME_WAIT; 2631 1.1 cgd tcp_canceltimers(tp); 2632 1.312 dyoung TCP_TIMER_ARM(tp, TCPT_2MSL, 2 * tp->t_msl); 2633 1.1 cgd soisdisconnected(so); 2634 1.1 cgd } 2635 1.1 cgd break; 2636 1.1 cgd 2637 1.1 cgd /* 2638 1.1 cgd * In LAST_ACK, we may still be waiting for data to drain 2639 1.1 cgd * and/or to be acked, as well as for the ack of our FIN. 2640 1.1 cgd * If our FIN is now acknowledged, delete the TCB, 2641 1.1 cgd * enter the closed state and return. 2642 1.1 cgd */ 2643 1.1 cgd case TCPS_LAST_ACK: 2644 1.1 cgd if (ourfinisacked) { 2645 1.1 cgd tp = tcp_close(tp); 2646 1.1 cgd goto drop; 2647 1.1 cgd } 2648 1.1 cgd break; 2649 1.1 cgd 2650 1.1 cgd /* 2651 1.1 cgd * In TIME_WAIT state the only thing that should arrive 2652 1.1 cgd * is a retransmission of the remote FIN. Acknowledge 2653 1.1 cgd * it and restart the finack timer. 2654 1.1 cgd */ 2655 1.1 cgd case TCPS_TIME_WAIT: 2656 1.312 dyoung TCP_TIMER_ARM(tp, TCPT_2MSL, 2 * tp->t_msl); 2657 1.1 cgd goto dropafterack; 2658 1.1 cgd } 2659 1.1 cgd } 2660 1.1 cgd 2661 1.1 cgd step6: 2662 1.1 cgd /* 2663 1.1 cgd * Update window information. 2664 1.1 cgd * Don't look at window if no ACK: TAC's send garbage on first SYN. 
2665 1.1 cgd */ 2666 1.83 itojun if ((tiflags & TH_ACK) && (SEQ_LT(tp->snd_wl1, th->th_seq) || 2667 1.239 rpaulo (tp->snd_wl1 == th->th_seq && (SEQ_LT(tp->snd_wl2, th->th_ack) || 2668 1.239 rpaulo (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))))) { 2669 1.1 cgd /* keep track of pure window updates */ 2670 1.83 itojun if (tlen == 0 && 2671 1.83 itojun tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd) 2672 1.284 thorpej TCP_STATINC(TCP_STAT_RCVWINUPD); 2673 1.9 mycroft tp->snd_wnd = tiwin; 2674 1.83 itojun tp->snd_wl1 = th->th_seq; 2675 1.83 itojun tp->snd_wl2 = th->th_ack; 2676 1.1 cgd if (tp->snd_wnd > tp->max_sndwnd) 2677 1.1 cgd tp->max_sndwnd = tp->snd_wnd; 2678 1.1 cgd needoutput = 1; 2679 1.1 cgd } 2680 1.1 cgd 2681 1.1 cgd /* 2682 1.1 cgd * Process segments with URG. 2683 1.1 cgd */ 2684 1.83 itojun if ((tiflags & TH_URG) && th->th_urp && 2685 1.1 cgd TCPS_HAVERCVDFIN(tp->t_state) == 0) { 2686 1.1 cgd /* 2687 1.1 cgd * This is a kludge, but if we receive and accept 2688 1.1 cgd * random urgent pointers, we'll crash in 2689 1.1 cgd * soreceive. It's hard to imagine someone 2690 1.1 cgd * actually wanting to send this much urgent data. 2691 1.1 cgd */ 2692 1.83 itojun if (th->th_urp + so->so_rcv.sb_cc > sb_max) { 2693 1.83 itojun th->th_urp = 0; /* XXX */ 2694 1.1 cgd tiflags &= ~TH_URG; /* XXX */ 2695 1.1 cgd goto dodata; /* XXX */ 2696 1.1 cgd } 2697 1.389 maxv 2698 1.1 cgd /* 2699 1.1 cgd * If this segment advances the known urgent pointer, 2700 1.1 cgd * then mark the data stream. This should not happen 2701 1.1 cgd * in CLOSE_WAIT, CLOSING, LAST_ACK or TIME_WAIT STATES since 2702 1.143 itojun * a FIN has been received from the remote side. 2703 1.1 cgd * In these states we ignore the URG. 2704 1.1 cgd * 2705 1.1 cgd * According to RFC961 (Assigned Protocols), 2706 1.1 cgd * the urgent pointer points to the last octet 2707 1.1 cgd * of urgent data. 
We continue, however, 2708 1.1 cgd * to consider it to indicate the first octet 2709 1.143 itojun * of data past the urgent section as the original 2710 1.1 cgd * spec states (in one of two places). 2711 1.1 cgd */ 2712 1.83 itojun if (SEQ_GT(th->th_seq+th->th_urp, tp->rcv_up)) { 2713 1.83 itojun tp->rcv_up = th->th_seq + th->th_urp; 2714 1.1 cgd so->so_oobmark = so->so_rcv.sb_cc + 2715 1.1 cgd (tp->rcv_up - tp->rcv_nxt) - 1; 2716 1.1 cgd if (so->so_oobmark == 0) 2717 1.1 cgd so->so_state |= SS_RCVATMARK; 2718 1.1 cgd sohasoutofband(so); 2719 1.1 cgd tp->t_oobflags &= ~(TCPOOB_HAVEDATA | TCPOOB_HADDATA); 2720 1.1 cgd } 2721 1.389 maxv 2722 1.1 cgd /* 2723 1.1 cgd * Remove out of band data so doesn't get presented to user. 2724 1.1 cgd * This can happen independent of advancing the URG pointer, 2725 1.1 cgd * but if two URG's are pending at once, some out-of-band 2726 1.1 cgd * data may creep in... ick. 2727 1.1 cgd */ 2728 1.385 maxv if (th->th_urp <= (u_int16_t)tlen && 2729 1.385 maxv (so->so_options & SO_OOBINLINE) == 0) 2730 1.97 itojun tcp_pulloutofband(so, th, m, hdroptlen); 2731 1.389 maxv } else { 2732 1.1 cgd /* 2733 1.1 cgd * If no out of band data is expected, 2734 1.1 cgd * pull receive urgent pointer along 2735 1.1 cgd * with the receive window. 2736 1.1 cgd */ 2737 1.1 cgd if (SEQ_GT(tp->rcv_nxt, tp->rcv_up)) 2738 1.1 cgd tp->rcv_up = tp->rcv_nxt; 2739 1.389 maxv } 2740 1.389 maxv dodata: 2741 1.1 cgd 2742 1.1 cgd /* 2743 1.1 cgd * Process the segment text, merging it into the TCP sequencing queue, 2744 1.95 simonb * and arranging for acknowledgement of receipt if necessary. 2745 1.1 cgd * This process logically involves adjusting tp->rcv_wnd as data 2746 1.1 cgd * is presented to the user (this happens in tcp_usrreq.c, 2747 1.334 rtr * tcp_rcvd()). If a FIN has already been received on this 2748 1.1 cgd * connection then we just ignore the text. 
2749 1.1 cgd */ 2750 1.83 itojun if ((tlen || (tiflags & TH_FIN)) && 2751 1.1 cgd TCPS_HAVERCVDFIN(tp->t_state) == 0) { 2752 1.83 itojun /* 2753 1.399 maxv * Handle the common case: 2754 1.399 maxv * o Segment is the next to be received, and 2755 1.399 maxv * o The queue is empty, and 2756 1.399 maxv * o The connection is established 2757 1.399 maxv * In this case, we avoid calling tcp_reass. 2758 1.399 maxv * 2759 1.399 maxv * tcp_setup_ack: set DELACK for segments received in order, 2760 1.399 maxv * but ack immediately when segments are out of order (so that 2761 1.399 maxv * fast retransmit can work). 2762 1.83 itojun */ 2763 1.83 itojun TCP_REASS_LOCK(tp); 2764 1.83 itojun if (th->th_seq == tp->rcv_nxt && 2765 1.141 matt TAILQ_FIRST(&tp->segq) == NULL && 2766 1.83 itojun tp->t_state == TCPS_ESTABLISHED) { 2767 1.280 yamt tcp_setup_ack(tp, th); 2768 1.83 itojun tp->rcv_nxt += tlen; 2769 1.83 itojun tiflags = th->th_flags & TH_FIN; 2770 1.284 thorpej tcps = TCP_STAT_GETREF(); 2771 1.439 riastrad _NET_STATINC_REF(tcps, TCP_STAT_RCVPACK); 2772 1.439 riastrad _NET_STATADD_REF(tcps, TCP_STAT_RCVBYTE, tlen); 2773 1.284 thorpej TCP_STAT_PUTREF(); 2774 1.421 roy nd_hint(tp); 2775 1.389 maxv if (so->so_state & SS_CANTRCVMORE) { 2776 1.154 itojun m_freem(m); 2777 1.389 maxv } else { 2778 1.154 itojun m_adj(m, hdroptlen); 2779 1.154 itojun sbappendstream(&(so)->so_rcv, m); 2780 1.154 itojun } 2781 1.291 tls TCP_REASS_UNLOCK(tp); 2782 1.83 itojun sorwakeup(so); 2783 1.83 itojun } else { 2784 1.97 itojun m_adj(m, hdroptlen); 2785 1.386 maxv tiflags = tcp_reass(tp, th, m, tlen); 2786 1.83 itojun tp->t_flags |= TF_ACKNOW; 2787 1.83 itojun } 2788 1.83 itojun 2789 1.1 cgd /* 2790 1.1 cgd * Note the amount of data that peer has sent into 2791 1.1 cgd * our window, in order to estimate the sender's 2792 1.1 cgd * buffer size. 
2793 1.1 cgd */ 2794 1.1 cgd len = so->so_rcv.sb_hiwat - (tp->rcv_adv - tp->rcv_nxt); 2795 1.1 cgd } else { 2796 1.1 cgd m_freem(m); 2797 1.83 itojun m = NULL; 2798 1.1 cgd tiflags &= ~TH_FIN; 2799 1.1 cgd } 2800 1.1 cgd 2801 1.1 cgd /* 2802 1.1 cgd * If FIN is received ACK the FIN and let the user know 2803 1.22 mycroft * that the connection is closing. Ignore a FIN received before 2804 1.22 mycroft * the connection is fully established. 2805 1.1 cgd */ 2806 1.22 mycroft if ((tiflags & TH_FIN) && TCPS_HAVEESTABLISHED(tp->t_state)) { 2807 1.1 cgd if (TCPS_HAVERCVDFIN(tp->t_state) == 0) { 2808 1.1 cgd socantrcvmore(so); 2809 1.1 cgd tp->t_flags |= TF_ACKNOW; 2810 1.1 cgd tp->rcv_nxt++; 2811 1.1 cgd } 2812 1.1 cgd switch (tp->t_state) { 2813 1.1 cgd 2814 1.1 cgd /* 2815 1.22 mycroft * In ESTABLISHED STATE enter the CLOSE_WAIT state. 2816 1.1 cgd */ 2817 1.1 cgd case TCPS_ESTABLISHED: 2818 1.1 cgd tp->t_state = TCPS_CLOSE_WAIT; 2819 1.1 cgd break; 2820 1.1 cgd 2821 1.1 cgd /* 2822 1.1 cgd * If still in FIN_WAIT_1 STATE FIN has not been acked so 2823 1.1 cgd * enter the CLOSING state. 2824 1.1 cgd */ 2825 1.1 cgd case TCPS_FIN_WAIT_1: 2826 1.1 cgd tp->t_state = TCPS_CLOSING; 2827 1.1 cgd break; 2828 1.1 cgd 2829 1.1 cgd /* 2830 1.1 cgd * In FIN_WAIT_2 state enter the TIME_WAIT state, 2831 1.143 itojun * starting the time-wait timer, turning off the other 2832 1.1 cgd * standard timers. 2833 1.1 cgd */ 2834 1.1 cgd case TCPS_FIN_WAIT_2: 2835 1.1 cgd tp->t_state = TCPS_TIME_WAIT; 2836 1.1 cgd tcp_canceltimers(tp); 2837 1.312 dyoung TCP_TIMER_ARM(tp, TCPT_2MSL, 2 * tp->t_msl); 2838 1.1 cgd soisdisconnected(so); 2839 1.1 cgd break; 2840 1.1 cgd 2841 1.1 cgd /* 2842 1.1 cgd * In TIME_WAIT state restart the 2 MSL time_wait timer. 
2843 1.1 cgd */ 2844 1.1 cgd case TCPS_TIME_WAIT: 2845 1.312 dyoung TCP_TIMER_ARM(tp, TCPT_2MSL, 2 * tp->t_msl); 2846 1.1 cgd break; 2847 1.1 cgd } 2848 1.1 cgd } 2849 1.127 abs #ifdef TCP_DEBUG 2850 1.127 abs if (so->so_options & SO_DEBUG) 2851 1.83 itojun tcp_trace(TA_INPUT, ostate, tp, tcp_saveti, 0); 2852 1.127 abs #endif 2853 1.1 cgd 2854 1.1 cgd /* 2855 1.1 cgd * Return any desired output. 2856 1.1 cgd */ 2857 1.222 jonathan if (needoutput || (tp->t_flags & TF_ACKNOW)) { 2858 1.301 tls KERNEL_LOCK(1, NULL); 2859 1.389 maxv (void)tcp_output(tp); 2860 1.301 tls KERNEL_UNLOCK_ONE(NULL); 2861 1.222 jonathan } 2862 1.440 rin m_freem(tcp_saveti); 2863 1.312 dyoung 2864 1.312 dyoung if (tp->t_state == TCPS_TIME_WAIT 2865 1.312 dyoung && (so->so_state & SS_NOFDREF) 2866 1.435 ozaki && (tp->t_inpcb || af != AF_INET || af != AF_INET6) 2867 1.312 dyoung && ((af == AF_INET ? tcp4_vtw_enable : tcp6_vtw_enable) & 1) != 0 2868 1.312 dyoung && TAILQ_EMPTY(&tp->segq) 2869 1.312 dyoung && vtw_add(af, tp)) { 2870 1.312 dyoung ; 2871 1.312 dyoung } 2872 1.1 cgd return; 2873 1.35 thorpej 2874 1.35 thorpej badsyn: 2875 1.35 thorpej /* 2876 1.35 thorpej * Received a bad SYN. Increment counters and dropwithreset. 2877 1.35 thorpej */ 2878 1.284 thorpej TCP_STATINC(TCP_STAT_BADSYN); 2879 1.35 thorpej tp = NULL; 2880 1.35 thorpej goto dropwithreset; 2881 1.1 cgd 2882 1.1 cgd dropafterack: 2883 1.1 cgd /* 2884 1.1 cgd * Generate an ACK dropping incoming segment if it occupies 2885 1.1 cgd * sequence space, where the ACK reflects our state. 2886 1.1 cgd */ 2887 1.1 cgd if (tiflags & TH_RST) 2888 1.1 cgd goto drop; 2889 1.194 itojun goto dropafterack2; 2890 1.194 itojun 2891 1.194 itojun dropafterack_ratelim: 2892 1.194 itojun /* 2893 1.194 itojun * We may want to rate-limit ACKs against SYN/RST attack. 
2894 1.194 itojun */ 2895 1.194 itojun if (ppsratecheck(&tcp_ackdrop_ppslim_last, &tcp_ackdrop_ppslim_count, 2896 1.194 itojun tcp_ackdrop_ppslim) == 0) { 2897 1.194 itojun /* XXX stat */ 2898 1.194 itojun goto drop; 2899 1.194 itojun } 2900 1.194 itojun 2901 1.194 itojun dropafterack2: 2902 1.1 cgd m_freem(m); 2903 1.1 cgd tp->t_flags |= TF_ACKNOW; 2904 1.301 tls KERNEL_LOCK(1, NULL); 2905 1.389 maxv (void)tcp_output(tp); 2906 1.301 tls KERNEL_UNLOCK_ONE(NULL); 2907 1.440 rin m_freem(tcp_saveti); 2908 1.1 cgd return; 2909 1.1 cgd 2910 1.104 thorpej dropwithreset_ratelim: 2911 1.104 thorpej /* 2912 1.104 thorpej * We may want to rate-limit RSTs in certain situations, 2913 1.104 thorpej * particularly if we are sending an RST in response to 2914 1.104 thorpej * an attempt to connect to or otherwise communicate with 2915 1.104 thorpej * a port for which we have no socket. 2916 1.104 thorpej */ 2917 1.116 itojun if (ppsratecheck(&tcp_rst_ppslim_last, &tcp_rst_ppslim_count, 2918 1.116 itojun tcp_rst_ppslim) == 0) { 2919 1.104 thorpej /* XXX stat */ 2920 1.104 thorpej goto drop; 2921 1.104 thorpej } 2922 1.104 thorpej 2923 1.1 cgd dropwithreset: 2924 1.1 cgd /* 2925 1.1 cgd * Generate a RST, dropping incoming segment. 2926 1.1 cgd * Make ACK acceptable to originator of segment. 2927 1.1 cgd */ 2928 1.103 thorpej if (tiflags & TH_RST) 2929 1.137 christos goto drop; 2930 1.376 maxv if (tiflags & TH_ACK) { 2931 1.86 itojun (void)tcp_respond(tp, m, m, th, (tcp_seq)0, th->th_ack, TH_RST); 2932 1.376 maxv } else { 2933 1.1 cgd if (tiflags & TH_SYN) 2934 1.83 itojun tlen++; 2935 1.86 itojun (void)tcp_respond(tp, m, m, th, th->th_seq + tlen, (tcp_seq)0, 2936 1.1 cgd TH_RST|TH_ACK); 2937 1.1 cgd } 2938 1.440 rin m_freem(tcp_saveti); 2939 1.1 cgd return; 2940 1.1 cgd 2941 1.125 thorpej badcsum: 2942 1.1 cgd drop: 2943 1.1 cgd /* 2944 1.1 cgd * Drop space held by incoming segment and return. 
2945 1.1 cgd */ 2946 1.83 itojun if (tp) { 2947 1.435 ozaki so = tp->t_inpcb->inp_socket; 2948 1.127 abs #ifdef TCP_DEBUG 2949 1.89 itojun if (so && (so->so_options & SO_DEBUG) != 0) 2950 1.83 itojun tcp_trace(TA_DROP, ostate, tp, tcp_saveti, 0); 2951 1.127 abs #endif 2952 1.83 itojun } 2953 1.440 rin m_freem(tcp_saveti); 2954 1.1 cgd m_freem(m); 2955 1.1 cgd return; 2956 1.1 cgd } 2957 1.1 cgd 2958 1.206 itojun #ifdef TCP_SIGNATURE 2959 1.206 itojun int 2960 1.262 christos tcp_signature_apply(void *fstate, void *data, u_int len) 2961 1.206 itojun { 2962 1.206 itojun 2963 1.206 itojun MD5Update(fstate, (u_char *)data, len); 2964 1.206 itojun return (0); 2965 1.206 itojun } 2966 1.206 itojun 2967 1.206 itojun struct secasvar * 2968 1.379 maxv tcp_signature_getsav(struct mbuf *m) 2969 1.206 itojun { 2970 1.206 itojun struct ip *ip; 2971 1.206 itojun struct ip6_hdr *ip6; 2972 1.206 itojun 2973 1.206 itojun ip = mtod(m, struct ip *); 2974 1.206 itojun switch (ip->ip_v) { 2975 1.206 itojun case 4: 2976 1.206 itojun ip = mtod(m, struct ip *); 2977 1.206 itojun ip6 = NULL; 2978 1.206 itojun break; 2979 1.206 itojun case 6: 2980 1.206 itojun ip = NULL; 2981 1.206 itojun ip6 = mtod(m, struct ip6_hdr *); 2982 1.206 itojun break; 2983 1.206 itojun default: 2984 1.206 itojun return (NULL); 2985 1.206 itojun } 2986 1.206 itojun 2987 1.326 christos #ifdef IPSEC 2988 1.351 christos union sockaddr_union dst; 2989 1.332 christos 2990 1.351 christos /* Extract the destination from the IP header in the mbuf. 
*/ 2991 1.351 christos memset(&dst, 0, sizeof(union sockaddr_union)); 2992 1.351 christos if (ip != NULL) { 2993 1.351 christos dst.sa.sa_len = sizeof(struct sockaddr_in); 2994 1.351 christos dst.sa.sa_family = AF_INET; 2995 1.351 christos dst.sin.sin_addr = ip->ip_dst; 2996 1.351 christos } else { 2997 1.351 christos dst.sa.sa_len = sizeof(struct sockaddr_in6); 2998 1.351 christos dst.sa.sa_family = AF_INET6; 2999 1.351 christos dst.sin6.sin6_addr = ip6->ip6_dst; 3000 1.260 degroote } 3001 1.351 christos 3002 1.351 christos /* 3003 1.351 christos * Look up an SADB entry which matches the address of the peer. 3004 1.351 christos */ 3005 1.359 ozaki return KEY_LOOKUP_SA(&dst, IPPROTO_TCP, htonl(TCP_SIG_SPI), 0, 0); 3006 1.351 christos #else 3007 1.333 rmind return NULL; 3008 1.206 itojun #endif 3009 1.206 itojun } 3010 1.206 itojun 3011 1.206 itojun int 3012 1.206 itojun tcp_signature(struct mbuf *m, struct tcphdr *th, int thoff, 3013 1.206 itojun struct secasvar *sav, char *sig) 3014 1.206 itojun { 3015 1.206 itojun MD5_CTX ctx; 3016 1.206 itojun struct ip *ip; 3017 1.206 itojun struct ipovly *ipovly; 3018 1.337 rtr #ifdef INET6 3019 1.206 itojun struct ip6_hdr *ip6; 3020 1.337 rtr struct ip6_hdr_pseudo ip6pseudo; 3021 1.388 maxv #endif 3022 1.206 itojun struct ippseudo ippseudo; 3023 1.206 itojun struct tcphdr th0; 3024 1.208 itojun int l, tcphdrlen; 3025 1.206 itojun 3026 1.206 itojun if (sav == NULL) 3027 1.206 itojun return (-1); 3028 1.206 itojun 3029 1.208 itojun tcphdrlen = th->th_off * 4; 3030 1.208 itojun 3031 1.206 itojun switch (mtod(m, struct ip *)->ip_v) { 3032 1.206 itojun case 4: 3033 1.337 rtr MD5Init(&ctx); 3034 1.206 itojun ip = mtod(m, struct ip *); 3035 1.206 itojun memset(&ippseudo, 0, sizeof(ippseudo)); 3036 1.206 itojun ipovly = (struct ipovly *)ip; 3037 1.206 itojun ippseudo.ippseudo_src = ipovly->ih_src; 3038 1.206 itojun ippseudo.ippseudo_dst = ipovly->ih_dst; 3039 1.206 itojun ippseudo.ippseudo_pad = 0; 3040 1.206 itojun 
ippseudo.ippseudo_p = IPPROTO_TCP; 3041 1.206 itojun ippseudo.ippseudo_len = htons(m->m_pkthdr.len - thoff); 3042 1.206 itojun MD5Update(&ctx, (char *)&ippseudo, sizeof(ippseudo)); 3043 1.337 rtr break; 3044 1.337 rtr #if INET6 3045 1.337 rtr case 6: 3046 1.337 rtr MD5Init(&ctx); 3047 1.337 rtr ip6 = mtod(m, struct ip6_hdr *); 3048 1.206 itojun memset(&ip6pseudo, 0, sizeof(ip6pseudo)); 3049 1.206 itojun ip6pseudo.ip6ph_src = ip6->ip6_src; 3050 1.206 itojun in6_clearscope(&ip6pseudo.ip6ph_src); 3051 1.206 itojun ip6pseudo.ip6ph_dst = ip6->ip6_dst; 3052 1.206 itojun in6_clearscope(&ip6pseudo.ip6ph_dst); 3053 1.206 itojun ip6pseudo.ip6ph_len = htons(m->m_pkthdr.len - thoff); 3054 1.206 itojun ip6pseudo.ip6ph_nxt = IPPROTO_TCP; 3055 1.206 itojun MD5Update(&ctx, (char *)&ip6pseudo, sizeof(ip6pseudo)); 3056 1.337 rtr break; 3057 1.388 maxv #endif 3058 1.337 rtr default: 3059 1.337 rtr return (-1); 3060 1.206 itojun } 3061 1.206 itojun 3062 1.206 itojun th0 = *th; 3063 1.206 itojun th0.th_sum = 0; 3064 1.206 itojun MD5Update(&ctx, (char *)&th0, sizeof(th0)); 3065 1.206 itojun 3066 1.208 itojun l = m->m_pkthdr.len - thoff - tcphdrlen; 3067 1.206 itojun if (l > 0) 3068 1.208 itojun m_apply(m, thoff + tcphdrlen, 3069 1.208 itojun m->m_pkthdr.len - thoff - tcphdrlen, 3070 1.206 itojun tcp_signature_apply, &ctx); 3071 1.206 itojun 3072 1.206 itojun MD5Update(&ctx, _KEYBUF(sav->key_auth), _KEYLEN(sav->key_auth)); 3073 1.206 itojun MD5Final(sig, &ctx); 3074 1.206 itojun 3075 1.206 itojun return (0); 3076 1.206 itojun } 3077 1.206 itojun #endif 3078 1.206 itojun 3079 1.308 yamt /* 3080 1.389 maxv * Parse and process tcp options. 3081 1.308 yamt * 3082 1.389 maxv * Returns -1 if this segment should be dropped. (eg. wrong signature) 3083 1.389 maxv * Otherwise returns 0. 
3084 1.308 yamt */ 3085 1.434 ozaki int 3086 1.375 maxv tcp_dooptions(struct tcpcb *tp, const u_char *cp, int cnt, struct tcphdr *th, 3087 1.255 christos struct mbuf *m, int toff, struct tcp_opt_info *oi) 3088 1.1 cgd { 3089 1.12 cgd u_int16_t mss; 3090 1.206 itojun int opt, optlen = 0; 3091 1.206 itojun #ifdef TCP_SIGNATURE 3092 1.262 christos void *sigp = NULL; 3093 1.206 itojun char sigbuf[TCP_SIGLEN]; 3094 1.206 itojun struct secasvar *sav = NULL; 3095 1.206 itojun #endif 3096 1.1 cgd 3097 1.206 itojun for (; cp && cnt > 0; cnt -= optlen, cp += optlen) { 3098 1.1 cgd opt = cp[0]; 3099 1.1 cgd if (opt == TCPOPT_EOL) 3100 1.1 cgd break; 3101 1.1 cgd if (opt == TCPOPT_NOP) 3102 1.1 cgd optlen = 1; 3103 1.1 cgd else { 3104 1.113 itojun if (cnt < 2) 3105 1.113 itojun break; 3106 1.1 cgd optlen = cp[1]; 3107 1.113 itojun if (optlen < 2 || optlen > cnt) 3108 1.1 cgd break; 3109 1.1 cgd } 3110 1.1 cgd switch (opt) { 3111 1.1 cgd 3112 1.1 cgd default: 3113 1.1 cgd continue; 3114 1.1 cgd 3115 1.1 cgd case TCPOPT_MAXSEG: 3116 1.9 mycroft if (optlen != TCPOLEN_MAXSEG) 3117 1.1 cgd continue; 3118 1.83 itojun if (!(th->th_flags & TH_SYN)) 3119 1.1 cgd continue; 3120 1.234 christos if (TCPS_HAVERCVDSYN(tp->t_state)) 3121 1.234 christos continue; 3122 1.387 maxv memcpy(&mss, cp + 2, sizeof(mss)); 3123 1.29 thorpej oi->maxseg = ntohs(mss); 3124 1.1 cgd break; 3125 1.9 mycroft 3126 1.9 mycroft case TCPOPT_WINDOW: 3127 1.9 mycroft if (optlen != TCPOLEN_WINDOW) 3128 1.9 mycroft continue; 3129 1.83 itojun if (!(th->th_flags & TH_SYN)) 3130 1.9 mycroft continue; 3131 1.234 christos if (TCPS_HAVERCVDSYN(tp->t_state)) 3132 1.234 christos continue; 3133 1.9 mycroft tp->t_flags |= TF_RCVD_SCALE; 3134 1.52 thorpej tp->requested_s_scale = cp[2]; 3135 1.52 thorpej if (tp->requested_s_scale > TCP_MAX_WINSHIFT) { 3136 1.335 christos char buf[INET6_ADDRSTRLEN]; 3137 1.335 christos struct ip *ip = mtod(m, struct ip *); 3138 1.335 christos #ifdef INET6 3139 1.335 christos struct ip6_hdr *ip6 = 
mtod(m, struct ip6_hdr *); 3140 1.335 christos #endif 3141 1.403 maxv 3142 1.403 maxv switch (ip->ip_v) { 3143 1.403 maxv case 4: 3144 1.335 christos in_print(buf, sizeof(buf), 3145 1.335 christos &ip->ip_src); 3146 1.403 maxv break; 3147 1.83 itojun #ifdef INET6 3148 1.403 maxv case 6: 3149 1.335 christos in6_print(buf, sizeof(buf), 3150 1.335 christos &ip6->ip6_src); 3151 1.403 maxv break; 3152 1.83 itojun #endif 3153 1.403 maxv default: 3154 1.335 christos strlcpy(buf, "(unknown)", sizeof(buf)); 3155 1.403 maxv break; 3156 1.403 maxv } 3157 1.403 maxv 3158 1.83 itojun log(LOG_ERR, "TCP: invalid wscale %d from %s, " 3159 1.83 itojun "assuming %d\n", 3160 1.335 christos tp->requested_s_scale, buf, 3161 1.83 itojun TCP_MAX_WINSHIFT); 3162 1.52 thorpej tp->requested_s_scale = TCP_MAX_WINSHIFT; 3163 1.52 thorpej } 3164 1.9 mycroft break; 3165 1.9 mycroft 3166 1.9 mycroft case TCPOPT_TIMESTAMP: 3167 1.9 mycroft if (optlen != TCPOLEN_TIMESTAMP) 3168 1.9 mycroft continue; 3169 1.32 thorpej oi->ts_present = 1; 3170 1.387 maxv memcpy(&oi->ts_val, cp + 2, sizeof(oi->ts_val)); 3171 1.29 thorpej NTOHL(oi->ts_val); 3172 1.387 maxv memcpy(&oi->ts_ecr, cp + 6, sizeof(oi->ts_ecr)); 3173 1.29 thorpej NTOHL(oi->ts_ecr); 3174 1.9 mycroft 3175 1.234 christos if (!(th->th_flags & TH_SYN)) 3176 1.234 christos continue; 3177 1.234 christos if (TCPS_HAVERCVDSYN(tp->t_state)) 3178 1.234 christos continue; 3179 1.143 itojun /* 3180 1.9 mycroft * A timestamp received in a SYN makes 3181 1.9 mycroft * it ok to send timestamp requests and replies. 
3182 1.9 mycroft */ 3183 1.234 christos tp->t_flags |= TF_RCVD_TSTMP; 3184 1.234 christos tp->ts_recent = oi->ts_val; 3185 1.234 christos tp->ts_recent_age = tcp_now; 3186 1.230 christos break; 3187 1.230 christos 3188 1.54 matt case TCPOPT_SACK_PERMITTED: 3189 1.54 matt if (optlen != TCPOLEN_SACK_PERMITTED) 3190 1.54 matt continue; 3191 1.83 itojun if (!(th->th_flags & TH_SYN)) 3192 1.54 matt continue; 3193 1.234 christos if (TCPS_HAVERCVDSYN(tp->t_state)) 3194 1.234 christos continue; 3195 1.222 jonathan if (tcp_do_sack) { 3196 1.222 jonathan tp->t_flags |= TF_SACK_PERMIT; 3197 1.222 jonathan tp->t_flags |= TF_WILL_SACK; 3198 1.222 jonathan } 3199 1.54 matt break; 3200 1.54 matt 3201 1.54 matt case TCPOPT_SACK: 3202 1.222 jonathan tcp_sack_option(tp, th, cp, optlen); 3203 1.9 mycroft break; 3204 1.201 jonathan #ifdef TCP_SIGNATURE 3205 1.201 jonathan case TCPOPT_SIGNATURE: 3206 1.201 jonathan if (optlen != TCPOLEN_SIGNATURE) 3207 1.201 jonathan continue; 3208 1.402 maxv if (sigp && 3209 1.402 maxv !consttime_memequal(sigp, cp + 2, TCP_SIGLEN)) 3210 1.206 itojun return (-1); 3211 1.206 itojun 3212 1.206 itojun sigp = sigbuf; 3213 1.206 itojun memcpy(sigbuf, cp + 2, TCP_SIGLEN); 3214 1.206 itojun tp->t_flags |= TF_SIGNATURE; 3215 1.201 jonathan break; 3216 1.201 jonathan #endif 3217 1.1 cgd } 3218 1.1 cgd } 3219 1.206 itojun 3220 1.327 christos #ifndef TCP_SIGNATURE 3221 1.327 christos return 0; 3222 1.327 christos #else 3223 1.206 itojun if (tp->t_flags & TF_SIGNATURE) { 3224 1.379 maxv sav = tcp_signature_getsav(m); 3225 1.206 itojun if (sav == NULL && tp->t_state == TCPS_LISTEN) 3226 1.206 itojun return (-1); 3227 1.206 itojun } 3228 1.206 itojun 3229 1.327 christos if ((sigp ? 
TF_SIGNATURE : 0) ^ (tp->t_flags & TF_SIGNATURE)) 3230 1.327 christos goto out; 3231 1.206 itojun 3232 1.206 itojun if (sigp) { 3233 1.206 itojun char sig[TCP_SIGLEN]; 3234 1.206 itojun 3235 1.280 yamt tcp_fields_to_net(th); 3236 1.206 itojun if (tcp_signature(m, th, toff, sav, sig) < 0) { 3237 1.280 yamt tcp_fields_to_host(th); 3238 1.327 christos goto out; 3239 1.206 itojun } 3240 1.280 yamt tcp_fields_to_host(th); 3241 1.206 itojun 3242 1.402 maxv if (!consttime_memequal(sig, sigp, TCP_SIGLEN)) { 3243 1.284 thorpej TCP_STATINC(TCP_STAT_BADSIG); 3244 1.327 christos goto out; 3245 1.206 itojun } else 3246 1.284 thorpej TCP_STATINC(TCP_STAT_GOODSIG); 3247 1.206 itojun 3248 1.206 itojun key_sa_recordxfer(sav, m); 3249 1.360 ozaki KEY_SA_UNREF(&sav); 3250 1.206 itojun } 3251 1.327 christos return 0; 3252 1.327 christos out: 3253 1.327 christos if (sav != NULL) 3254 1.360 ozaki KEY_SA_UNREF(&sav); 3255 1.327 christos return -1; 3256 1.206 itojun #endif 3257 1.1 cgd } 3258 1.1 cgd 3259 1.1 cgd /* 3260 1.1 cgd * Pull out of band byte out of a segment so 3261 1.1 cgd * it doesn't appear in the user's data queue. 3262 1.1 cgd * It is still reflected in the segment length for 3263 1.1 cgd * sequencing purposes. 
3264 1.1 cgd */ 3265 1.5 mycroft void 3266 1.220 perry tcp_pulloutofband(struct socket *so, struct tcphdr *th, 3267 1.220 perry struct mbuf *m, int off) 3268 1.1 cgd { 3269 1.97 itojun int cnt = off + th->th_urp - 1; 3270 1.143 itojun 3271 1.1 cgd while (cnt >= 0) { 3272 1.1 cgd if (m->m_len > cnt) { 3273 1.262 christos char *cp = mtod(m, char *) + cnt; 3274 1.1 cgd struct tcpcb *tp = sototcpcb(so); 3275 1.1 cgd 3276 1.1 cgd tp->t_iobc = *cp; 3277 1.1 cgd tp->t_oobflags |= TCPOOB_HAVEDATA; 3278 1.387 maxv memmove(cp, cp + 1, (unsigned)(m->m_len - cnt - 1)); 3279 1.1 cgd m->m_len--; 3280 1.1 cgd return; 3281 1.1 cgd } 3282 1.1 cgd cnt -= m->m_len; 3283 1.1 cgd m = m->m_next; 3284 1.387 maxv if (m == NULL) 3285 1.1 cgd break; 3286 1.1 cgd } 3287 1.1 cgd panic("tcp_pulloutofband"); 3288 1.1 cgd } 3289 1.1 cgd 3290 1.1 cgd /* 3291 1.1 cgd * Collect new round-trip time estimate 3292 1.1 cgd * and update averages and current timeout. 3293 1.309 gdt * 3294 1.309 gdt * rtt is in units of slow ticks (typically 500 ms) -- essentially the 3295 1.309 gdt * difference of two timestamps. 3296 1.1 cgd */ 3297 1.5 mycroft void 3298 1.220 perry tcp_xmit_timer(struct tcpcb *tp, uint32_t rtt) 3299 1.1 cgd { 3300 1.128 thorpej int32_t delta; 3301 1.1 cgd 3302 1.284 thorpej TCP_STATINC(TCP_STAT_RTTUPDATED); 3303 1.1 cgd if (tp->t_srtt != 0) { 3304 1.1 cgd /* 3305 1.309 gdt * Compute the amount to add to srtt for smoothing, 3306 1.309 gdt * *alpha, or 2^(-TCP_RTT_SHIFT). Because 3307 1.309 gdt * srtt is stored in 1/32 slow ticks, we conceptually 3308 1.309 gdt * shift left 5 bits, subtract srtt to get the 3309 1.433 andvar * difference, and then shift right by TCP_RTT_SHIFT 3310 1.309 gdt * (3) to obtain 1/8 of the difference. 3311 1.1 cgd */ 3312 1.16 mycroft delta = (rtt << 2) - (tp->t_srtt >> TCP_RTT_SHIFT); 3313 1.309 gdt /* 3314 1.309 gdt * This can never happen, because delta's lowest 3315 1.309 gdt * possible value is 1/8 of t_srtt. 
But if it does, 3316 1.309 gdt * set srtt to some reasonable value, here chosen 3317 1.309 gdt * as 1/8 tick. 3318 1.309 gdt */ 3319 1.1 cgd if ((tp->t_srtt += delta) <= 0) 3320 1.27 mycroft tp->t_srtt = 1 << 2; 3321 1.1 cgd /* 3322 1.309 gdt * RFC2988 requires that rttvar be updated first. 3323 1.309 gdt * This code is compliant because "delta" is the old 3324 1.309 gdt * srtt minus the new observation (scaled). 3325 1.309 gdt * 3326 1.309 gdt * RFC2988 says: 3327 1.309 gdt * rttvar = (1-beta) * rttvar + beta * |srtt-observed| 3328 1.309 gdt * 3329 1.309 gdt * delta is in units of 1/32 ticks, and has then been 3330 1.309 gdt * divided by 8. This is equivalent to being in 1/16s 3331 1.309 gdt * units and divided by 4. Subtract from it 1/4 of 3332 1.309 gdt * the existing rttvar to form the (signed) amount to 3333 1.309 gdt * adjust. 3334 1.1 cgd */ 3335 1.1 cgd if (delta < 0) 3336 1.1 cgd delta = -delta; 3337 1.1 cgd delta -= (tp->t_rttvar >> TCP_RTTVAR_SHIFT); 3338 1.309 gdt /* 3339 1.309 gdt * As with srtt, this should never happen. There is 3340 1.309 gdt * no support in RFC2988 for this operation. But 1/4s 3341 1.310 wiz * as rttvar when faced with something arguably wrong 3342 1.309 gdt * is ok. 3343 1.309 gdt */ 3344 1.1 cgd if ((tp->t_rttvar += delta) <= 0) 3345 1.27 mycroft tp->t_rttvar = 1 << 2; 3346 1.312 dyoung 3347 1.312 dyoung /* 3348 1.312 dyoung * If srtt exceeds .01 second, ensure we use the 'remote' MSL 3349 1.312 dyoung * Problem is: it doesn't work. Disabled by defaulting 3350 1.312 dyoung * tcp_rttlocal to 0; see corresponding code in 3351 1.312 dyoung * tcp_subr that selects local vs remote in a different way. 3352 1.312 dyoung * 3353 1.312 dyoung * The static branch prediction hint here should be removed 3354 1.312 dyoung * when the rtt estimator is fixed and the rtt_enable code 3355 1.312 dyoung * is turned back on. 
3356 1.312 dyoung */ 3357 1.312 dyoung if (__predict_false(tcp_rttlocal) && tcp_msl_enable 3358 1.312 dyoung && tp->t_srtt > tcp_msl_remote_threshold 3359 1.312 dyoung && tp->t_msl < tcp_msl_remote) { 3360 1.415 riastrad tp->t_msl = MIN(tcp_msl_remote, TCP_MAXMSL); 3361 1.312 dyoung } 3362 1.1 cgd } else { 3363 1.143 itojun /* 3364 1.309 gdt * This is the first measurement. Per RFC2988, 2.2, 3365 1.309 gdt * set rtt=R and srtt=R/2. 3366 1.309 gdt * For srtt, storage representation is 1/32 ticks, 3367 1.309 gdt * so shift left by 5. 3368 1.310 wiz * For rttvar, storage representation is 1/16 ticks, 3369 1.309 gdt * So shift left by 4, but then right by 1 to halve. 3370 1.1 cgd */ 3371 1.16 mycroft tp->t_srtt = rtt << (TCP_RTT_SHIFT + 2); 3372 1.16 mycroft tp->t_rttvar = rtt << (TCP_RTTVAR_SHIFT + 2 - 1); 3373 1.1 cgd } 3374 1.128 thorpej tp->t_rtttime = 0; 3375 1.1 cgd tp->t_rxtshift = 0; 3376 1.1 cgd 3377 1.1 cgd /* 3378 1.1 cgd * the retransmit should happen at rtt + 4 * rttvar. 3379 1.1 cgd * Because of the way we do the smoothing, srtt and rttvar 3380 1.1 cgd * will each average +1/2 tick of bias. When we compute 3381 1.1 cgd * the retransmit timer, we want 1/2 tick of rounding and 3382 1.1 cgd * 1 extra tick because of +-1/2 tick uncertainty in the 3383 1.1 cgd * firing of the timer. The bias will give us exactly the 3384 1.1 cgd * 1.5 tick we need. But, because the bias is 3385 1.1 cgd * statistical, we have to test that we don't drop below 3386 1.1 cgd * the minimum feasible timer (which is 2 ticks). 3387 1.1 cgd */ 3388 1.128 thorpej TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp), 3389 1.409 riastrad uimax(tp->t_rttmin, rtt + 2), TCPTV_REXMTMAX); 3390 1.143 itojun 3391 1.1 cgd /* 3392 1.1 cgd * We received an ack for a packet that wasn't retransmitted; 3393 1.1 cgd * it is probably safe to discard any error indications we've 3394 1.1 cgd * received recently. 
This isn't quite right, but close enough 3395 1.1 cgd * for now (a route might have failed after we sent a segment, 3396 1.1 cgd * and the return path might not be symmetrical). 3397 1.1 cgd */ 3398 1.1 cgd tp->t_softerror = 0; 3399 1.1 cgd } 3400