Home | History | Annotate | Line # | Download | only in netinet
tcp_syncache.c revision 1.5
      1 /*	$NetBSD: tcp_syncache.c,v 1.5 2022/11/04 09:00:58 ozaki-r Exp $	*/
      2 
      3 /*
      4  * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
      5  * All rights reserved.
      6  *
      7  * Redistribution and use in source and binary forms, with or without
      8  * modification, are permitted provided that the following conditions
      9  * are met:
     10  * 1. Redistributions of source code must retain the above copyright
     11  *    notice, this list of conditions and the following disclaimer.
     12  * 2. Redistributions in binary form must reproduce the above copyright
     13  *    notice, this list of conditions and the following disclaimer in the
     14  *    documentation and/or other materials provided with the distribution.
     15  * 3. Neither the name of the project nor the names of its contributors
     16  *    may be used to endorse or promote products derived from this software
     17  *    without specific prior written permission.
     18  *
     19  * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
     20  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     21  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     22  * ARE DISCLAIMED.  IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
     23  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
     24  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
     25  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
     26  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
     27  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
     28  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
     29  * SUCH DAMAGE.
     30  */
     31 
     32 /*
     33  *      @(#)COPYRIGHT   1.1 (NRL) 17 January 1995
     34  *
     35  * NRL grants permission for redistribution and use in source and binary
     36  * forms, with or without modification, of the software and documentation
     37  * created at NRL provided that the following conditions are met:
     38  *
     39  * 1. Redistributions of source code must retain the above copyright
     40  *    notice, this list of conditions and the following disclaimer.
     41  * 2. Redistributions in binary form must reproduce the above copyright
     42  *    notice, this list of conditions and the following disclaimer in the
     43  *    documentation and/or other materials provided with the distribution.
     44  * 3. All advertising materials mentioning features or use of this software
     45  *    must display the following acknowledgements:
     46  *      This product includes software developed by the University of
     47  *      California, Berkeley and its contributors.
     48  *      This product includes software developed at the Information
     49  *      Technology Division, US Naval Research Laboratory.
     50  * 4. Neither the name of the NRL nor the names of its contributors
     51  *    may be used to endorse or promote products derived from this software
     52  *    without specific prior written permission.
     53  *
     54  * THE SOFTWARE PROVIDED BY NRL IS PROVIDED BY NRL AND CONTRIBUTORS ``AS
     55  * IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
     56  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
     57  * PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL NRL OR
     58  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
     59  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
     60  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
     61  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
     62  * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
     63  * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
     64  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     65  *
     66  * The views and conclusions contained in the software and documentation
     67  * are those of the authors and should not be interpreted as representing
     68  * official policies, either expressed or implied, of the US Naval
     69  * Research Laboratory (NRL).
     70  */
     71 
     72 /*-
     73  * Copyright (c) 1997, 1998, 1999, 2001, 2005, 2006,
     74  * 2011 The NetBSD Foundation, Inc.
     75  * All rights reserved.
     76  *
     77  * This code is derived from software contributed to The NetBSD Foundation
     78  * by Coyote Point Systems, Inc.
     79  * This code is derived from software contributed to The NetBSD Foundation
     80  * by Jason R. Thorpe and Kevin M. Lahey of the Numerical Aerospace Simulation
     81  * Facility, NASA Ames Research Center.
     82  * This code is derived from software contributed to The NetBSD Foundation
     83  * by Charles M. Hannum.
     84  * This code is derived from software contributed to The NetBSD Foundation
     85  * by Rui Paulo.
     86  *
     87  * Redistribution and use in source and binary forms, with or without
     88  * modification, are permitted provided that the following conditions
     89  * are met:
     90  * 1. Redistributions of source code must retain the above copyright
     91  *    notice, this list of conditions and the following disclaimer.
     92  * 2. Redistributions in binary form must reproduce the above copyright
     93  *    notice, this list of conditions and the following disclaimer in the
     94  *    documentation and/or other materials provided with the distribution.
     95  *
     96  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
     97  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
     98  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     99  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
    100  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
    101  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
    102  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
    103  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
    104  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
    105  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
    106  * POSSIBILITY OF SUCH DAMAGE.
    107  */
    108 
    109 /*
    110  * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995
    111  *	The Regents of the University of California.  All rights reserved.
    112  *
    113  * Redistribution and use in source and binary forms, with or without
    114  * modification, are permitted provided that the following conditions
    115  * are met:
    116  * 1. Redistributions of source code must retain the above copyright
    117  *    notice, this list of conditions and the following disclaimer.
    118  * 2. Redistributions in binary form must reproduce the above copyright
    119  *    notice, this list of conditions and the following disclaimer in the
    120  *    documentation and/or other materials provided with the distribution.
    121  * 3. Neither the name of the University nor the names of its contributors
    122  *    may be used to endorse or promote products derived from this software
    123  *    without specific prior written permission.
    124  *
    125  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
    126  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
    127  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
    128  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
    129  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
    130  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
    131  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
    132  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
    133  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
    134  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
    135  * SUCH DAMAGE.
    136  *
    137  *	@(#)tcp_input.c	8.12 (Berkeley) 5/24/95
    138  */
    139 
    140 /*
    141  *	TODO list for SYN cache stuff:
    142  *
    143  *	Find room for a "state" field, which is needed to keep a
    144  *	compressed state for TIME_WAIT TCBs.  It's been noted already
    145  *	that this is fairly important for very high-volume web and
    146  *	mail servers, which use a large number of short-lived
    147  *	connections.
    148  */
    149 
    150 #include <sys/cdefs.h>
    151 __KERNEL_RCSID(0, "$NetBSD: tcp_syncache.c,v 1.5 2022/11/04 09:00:58 ozaki-r Exp $");
    152 
    153 #ifdef _KERNEL_OPT
    154 #include "opt_inet.h"
    155 #include "opt_ipsec.h"
    156 #endif
    157 
    158 #include <sys/param.h>
    159 #include <sys/systm.h>
    160 #include <sys/mbuf.h>
    161 #include <sys/protosw.h>
    162 #include <sys/socket.h>
    163 #include <sys/socketvar.h>
    164 #include <sys/errno.h>
    165 #include <sys/syslog.h>
    166 #include <sys/pool.h>
    167 #include <sys/domain.h>
    168 #include <sys/kernel.h>
    169 #include <sys/lwp.h> /* for lwp0 */
    170 #include <sys/cprng.h>
    171 
    172 #include <netinet/in.h>
    173 #include <netinet/ip.h>
    174 #include <netinet/in_pcb.h>
    175 #include <netinet/in_var.h>
    176 #include <netinet/ip_var.h>
    177 
    178 #include <netinet/ip6.h>
    179 #ifdef INET6
    180 #include <netinet6/ip6_var.h>
    181 #include <netinet6/in6_pcb.h>
    182 #include <netinet6/ip6_var.h>
    183 #include <netinet6/in6_var.h>
    184 #endif
    185 
    186 #include <netinet/tcp.h>
    187 #include <netinet/tcp_fsm.h>
    188 #include <netinet/tcp_seq.h>
    189 #include <netinet/tcp_timer.h>
    190 #include <netinet/tcp_var.h>
    191 #include <netinet/tcp_private.h>
    192 #include <netinet/tcp_syncache.h>
    193 
    194 #ifdef TCP_SIGNATURE
    195 #ifdef IPSEC
    196 #include <netipsec/ipsec.h>
    197 #include <netipsec/key.h>
    198 #ifdef INET6
    199 #include <netipsec/ipsec6.h>
    200 #endif
    201 #endif	/* IPSEC*/
    202 #endif
    203 
    204 static void	syn_cache_timer(void *);	/* per-entry callout handler */
    205 static struct syn_cache *
    206 		syn_cache_lookup(const struct sockaddr *, const struct sockaddr *,
    207 		struct syn_cache_head **);
    208 static int	syn_cache_respond(struct syn_cache *);
    209 
    210 /* syn hash parameters */
        /*
         * TCP_SYN_HASH_SIZE is the number of hash buckets in tcp_syn_cache[];
         * TCP_SYN_BUCKET_SIZE is only used to derive the default limits below.
         */
    211 #define	TCP_SYN_HASH_SIZE	293
    212 #define	TCP_SYN_BUCKET_SIZE	35
        /* Modulus applied to an entry's hash to select its bucket. */
    213 static int	tcp_syn_cache_size = TCP_SYN_HASH_SIZE;
        /*
         * Upper bounds checked by syn_cache_insert(): the total number of
         * entries and the number of entries in one bucket.  Both are non-static,
         * so code outside this file can change them.
         */
    214 int		tcp_syn_cache_limit = TCP_SYN_HASH_SIZE*TCP_SYN_BUCKET_SIZE;
    215 int		tcp_syn_bucket_limit = 3*TCP_SYN_BUCKET_SIZE;
        /* The hash table itself: one queue head per bucket. */
    216 static struct	syn_cache_head tcp_syn_cache[TCP_SYN_HASH_SIZE];
    217 
    218 /*
    219  * TCP compressed state engine.  Currently used to hold compressed
    220  * state for SYN_RECEIVED.
    221  */
    222 
    223 u_long	syn_cache_count;	/* entries currently linked into the cache */
        /*
         * Hash secrets.  syn_cache_insert() draws fresh values from
         * cprng_fast32() whenever the cache is empty.
         */
    224 static u_int32_t syn_hash1, syn_hash2;
    225 
        /*
         * IPv4 hash: the address XORed with syn_hash1, multiplied by the two
         * ports (dp in the upper 16 bits, sp in the lower) XORed with syn_hash2.
         * sa is a pointer to a struct in_addr.
         */
    226 #define SYN_HASH(sa, sp, dp) \
    227 	((((sa)->s_addr^syn_hash1)*(((((u_int32_t)(dp))<<16) + \
    228 				     ((u_int32_t)(sp)))^syn_hash2)))
        /*
         * SYN_HASHALL(hash, src, dst) stores into `hash' the hash of the
         * source address, source port and destination port.  Without INET6 the
         * sockaddrs are always treated as sockaddr_in.
         */
    229 #ifndef INET6
    230 #define	SYN_HASHALL(hash, src, dst) \
    231 do {									\
    232 	hash = SYN_HASH(&((const struct sockaddr_in *)(src))->sin_addr,	\
    233 		((const struct sockaddr_in *)(src))->sin_port,		\
    234 		((const struct sockaddr_in *)(dst))->sin_port);		\
    235 } while (/*CONSTCOND*/ 0)
    236 #else
        /*
         * IPv6 hash: same construction as SYN_HASH, but only the first and the
         * last 32-bit words of the address are mixed in and the result is
         * masked to 31 bits.
         */
    237 #define SYN_HASH6(sa, sp, dp) \
    238 	((((sa)->s6_addr32[0] ^ (sa)->s6_addr32[3] ^ syn_hash1) * \
    239 	  (((((u_int32_t)(dp))<<16) + ((u_int32_t)(sp)))^syn_hash2)) \
    240 	 & 0x7fffffff)
    241 
        /*
         * With INET6 the hash function is chosen by (src)->sa_family; any
         * family other than AF_INET / AF_INET6 yields a hash of 0.
         */
    242 #define SYN_HASHALL(hash, src, dst) \
    243 do {									\
    244 	switch ((src)->sa_family) {					\
    245 	case AF_INET:							\
    246 		hash = SYN_HASH(&((const struct sockaddr_in *)(src))->sin_addr, \
    247 			((const struct sockaddr_in *)(src))->sin_port,	\
    248 			((const struct sockaddr_in *)(dst))->sin_port);	\
    249 		break;							\
    250 	case AF_INET6:							\
    251 		hash = SYN_HASH6(&((const struct sockaddr_in6 *)(src))->sin6_addr, \
    252 			((const struct sockaddr_in6 *)(src))->sin6_port,	\
    253 			((const struct sockaddr_in6 *)(dst))->sin6_port);	\
    254 		break;							\
    255 	default:							\
    256 		hash = 0;						\
    257 	}								\
    258 } while (/*CONSTCOND*/0)
    259 #endif /* INET6 */
    260 
        /*
         * Pool for struct syn_cache objects; set up by syn_cache_init() and
         * given entries back by syn_cache_timer().
         */
    261 static struct pool syn_cache_pool;
    262 
    263 /*
    264  * We don't estimate RTT with SYNs, so each packet starts with the default
    265  * RTT and each timer step has a fixed timeout value.
    266  */
    267 static inline void
    268 syn_cache_timer_arm(struct syn_cache *sc)
    269 {
    270 
    271 	TCPT_RANGESET(sc->sc_rxtcur,
    272 	    TCPTV_SRTTDFLT * tcp_backoff[sc->sc_rxtshift], TCPTV_MIN,
    273 	    TCPTV_REXMTMAX);
    274 	callout_reset(&sc->sc_timer,
    275 	    sc->sc_rxtcur * (hz / PR_SLOWHZ), syn_cache_timer, sc);
    276 }
    277 
    278 #define	SYN_CACHE_TIMESTAMP(sc)	(tcp_now - (sc)->sc_timebase)
    279 
    280 static inline void
    281 syn_cache_rm(struct syn_cache *sc)
    282 {
    283 	TAILQ_REMOVE(&tcp_syn_cache[sc->sc_bucketidx].sch_bucket,
    284 	    sc, sc_bucketq);
    285 	sc->sc_tp = NULL;
    286 	LIST_REMOVE(sc, sc_tpq);
    287 	tcp_syn_cache[sc->sc_bucketidx].sch_length--;
    288 	callout_stop(&sc->sc_timer);
    289 	syn_cache_count--;
    290 }
    291 
    292 static inline void
    293 syn_cache_put(struct syn_cache *sc)
    294 {
    295 	if (sc->sc_ipopts)
    296 		(void) m_free(sc->sc_ipopts);
    297 	rtcache_free(&sc->sc_route);
    298 	sc->sc_flags |= SCF_DEAD;
    299 	if (!callout_invoking(&sc->sc_timer))
    300 		callout_schedule(&(sc)->sc_timer, 1);
    301 }
    302 
    303 void
    304 syn_cache_init(void)
    305 {
    306 	int i;
    307 
    308 	pool_init(&syn_cache_pool, sizeof(struct syn_cache), 0, 0, 0,
    309 	    "synpl", NULL, IPL_SOFTNET);
    310 
    311 	/* Initialize the hash buckets. */
    312 	for (i = 0; i < tcp_syn_cache_size; i++)
    313 		TAILQ_INIT(&tcp_syn_cache[i].sch_bucket);
    314 }
    315 
    316 void
    317 syn_cache_insert(struct syn_cache *sc, struct tcpcb *tp)
    318 {
    319 	struct syn_cache_head *scp;
    320 	struct syn_cache *sc2;
    321 	int s;
    322 
    323 	/*
    324 	 * If there are no entries in the hash table, reinitialize
    325 	 * the hash secrets.
    326 	 */
    327 	if (syn_cache_count == 0) {
    328 		syn_hash1 = cprng_fast32();
    329 		syn_hash2 = cprng_fast32();
    330 	}
    331 
    332 	SYN_HASHALL(sc->sc_hash, &sc->sc_src.sa, &sc->sc_dst.sa);
    333 	sc->sc_bucketidx = sc->sc_hash % tcp_syn_cache_size;
    334 	scp = &tcp_syn_cache[sc->sc_bucketidx];
    335 
    336 	/*
    337 	 * Make sure that we don't overflow the per-bucket
    338 	 * limit or the total cache size limit.
    339 	 */
    340 	s = splsoftnet();
    341 	if (scp->sch_length >= tcp_syn_bucket_limit) {
    342 		TCP_STATINC(TCP_STAT_SC_BUCKETOVERFLOW);
    343 		/*
    344 		 * The bucket is full.  Toss the oldest element in the
    345 		 * bucket.  This will be the first entry in the bucket.
    346 		 */
    347 		sc2 = TAILQ_FIRST(&scp->sch_bucket);
    348 #ifdef DIAGNOSTIC
    349 		/*
    350 		 * This should never happen; we should always find an
    351 		 * entry in our bucket.
    352 		 */
    353 		if (sc2 == NULL)
    354 			panic("syn_cache_insert: bucketoverflow: impossible");
    355 #endif
    356 		syn_cache_rm(sc2);
    357 		syn_cache_put(sc2);	/* calls pool_put but see spl above */
    358 	} else if (syn_cache_count >= tcp_syn_cache_limit) {
    359 		struct syn_cache_head *scp2, *sce;
    360 
    361 		TCP_STATINC(TCP_STAT_SC_OVERFLOWED);
    362 		/*
    363 		 * The cache is full.  Toss the oldest entry in the
    364 		 * first non-empty bucket we can find.
    365 		 *
    366 		 * XXX We would really like to toss the oldest
    367 		 * entry in the cache, but we hope that this
    368 		 * condition doesn't happen very often.
    369 		 */
    370 		scp2 = scp;
    371 		if (TAILQ_EMPTY(&scp2->sch_bucket)) {
    372 			sce = &tcp_syn_cache[tcp_syn_cache_size];
    373 			for (++scp2; scp2 != scp; scp2++) {
    374 				if (scp2 >= sce)
    375 					scp2 = &tcp_syn_cache[0];
    376 				if (! TAILQ_EMPTY(&scp2->sch_bucket))
    377 					break;
    378 			}
    379 #ifdef DIAGNOSTIC
    380 			/*
    381 			 * This should never happen; we should always find a
    382 			 * non-empty bucket.
    383 			 */
    384 			if (scp2 == scp)
    385 				panic("syn_cache_insert: cacheoverflow: "
    386 				    "impossible");
    387 #endif
    388 		}
    389 		sc2 = TAILQ_FIRST(&scp2->sch_bucket);
    390 		syn_cache_rm(sc2);
    391 		syn_cache_put(sc2);	/* calls pool_put but see spl above */
    392 	}
    393 
    394 	/*
    395 	 * Initialize the entry's timer.
    396 	 */
    397 	sc->sc_rxttot = 0;
    398 	sc->sc_rxtshift = 0;
    399 	syn_cache_timer_arm(sc);
    400 
    401 	/* Link it from tcpcb entry */
    402 	LIST_INSERT_HEAD(&tp->t_sc, sc, sc_tpq);
    403 
    404 	/* Put it into the bucket. */
    405 	TAILQ_INSERT_TAIL(&scp->sch_bucket, sc, sc_bucketq);
    406 	scp->sch_length++;
    407 	syn_cache_count++;
    408 
    409 	TCP_STATINC(TCP_STAT_SC_ADDED);
    410 	splx(s);
    411 }
    412 
    413 /*
    414  * Walk the timer queues, looking for SYN,ACKs that need to be retransmitted.
    415  * If we have retransmitted an entry the maximum number of times, expire
    416  * that entry.
    417  */
    418 static void
    419 syn_cache_timer(void *arg)
    420 {
    421 	struct syn_cache *sc = arg;
    422 
    423 	mutex_enter(softnet_lock);
    424 	KERNEL_LOCK(1, NULL);
    425 
    426 	callout_ack(&sc->sc_timer);
    427 
    428 	if (__predict_false(sc->sc_flags & SCF_DEAD)) {
    429 		TCP_STATINC(TCP_STAT_SC_DELAYED_FREE);
    430 		goto free;
    431 	}
    432 
    433 	if (__predict_false(sc->sc_rxtshift == TCP_MAXRXTSHIFT)) {
    434 		/* Drop it -- too many retransmissions. */
    435 		goto dropit;
    436 	}
    437 
    438 	/*
    439 	 * Compute the total amount of time this entry has
    440 	 * been on a queue.  If this entry has been on longer
    441 	 * than the keep alive timer would allow, expire it.
    442 	 */
    443 	sc->sc_rxttot += sc->sc_rxtcur;
    444 	if (sc->sc_rxttot >= MIN(tcp_keepinit, TCP_TIMER_MAXTICKS))
    445 		goto dropit;
    446 
    447 	TCP_STATINC(TCP_STAT_SC_RETRANSMITTED);
    448 	(void)syn_cache_respond(sc);
    449 
    450 	/* Advance the timer back-off. */
    451 	sc->sc_rxtshift++;
    452 	syn_cache_timer_arm(sc);
    453 
    454 	goto out;
    455 
    456  dropit:
    457 	TCP_STATINC(TCP_STAT_SC_TIMED_OUT);
    458 	syn_cache_rm(sc);
    459 	if (sc->sc_ipopts)
    460 		(void) m_free(sc->sc_ipopts);
    461 	rtcache_free(&sc->sc_route);
    462 
    463  free:
    464 	callout_destroy(&sc->sc_timer);
    465 	pool_put(&syn_cache_pool, sc);
    466 
    467  out:
    468 	KERNEL_UNLOCK_ONE(NULL);
    469 	mutex_exit(softnet_lock);
    470 }
    471 
    472 /*
    473  * Remove syn cache created by the specified tcb entry,
    474  * because this does not make sense to keep them
    475  * (if there's no tcb entry, syn cache entry will never be used)
    476  */
    477 void
    478 syn_cache_cleanup(struct tcpcb *tp)
    479 {
    480 	struct syn_cache *sc, *nsc;
    481 	int s;
    482 
    483 	s = splsoftnet();
    484 
    485 	for (sc = LIST_FIRST(&tp->t_sc); sc != NULL; sc = nsc) {
    486 		nsc = LIST_NEXT(sc, sc_tpq);
    487 
    488 #ifdef DIAGNOSTIC
    489 		if (sc->sc_tp != tp)
    490 			panic("invalid sc_tp in syn_cache_cleanup");
    491 #endif
    492 		syn_cache_rm(sc);
    493 		syn_cache_put(sc);	/* calls pool_put but see spl above */
    494 	}
    495 	/* just for safety */
    496 	LIST_INIT(&tp->t_sc);
    497 
    498 	splx(s);
    499 }
    500 
    501 /*
    502  * Find an entry in the syn cache.
    503  */
    504 static struct syn_cache *
    505 syn_cache_lookup(const struct sockaddr *src, const struct sockaddr *dst,
    506     struct syn_cache_head **headp)
    507 {
    508 	struct syn_cache *sc;
    509 	struct syn_cache_head *scp;
    510 	u_int32_t hash;
    511 	int s;
    512 
    513 	SYN_HASHALL(hash, src, dst);
    514 
    515 	scp = &tcp_syn_cache[hash % tcp_syn_cache_size];
    516 	*headp = scp;
    517 	s = splsoftnet();
    518 	for (sc = TAILQ_FIRST(&scp->sch_bucket); sc != NULL;
    519 	     sc = TAILQ_NEXT(sc, sc_bucketq)) {
    520 		if (sc->sc_hash != hash)
    521 			continue;
    522 		if (!memcmp(&sc->sc_src, src, src->sa_len) &&
    523 		    !memcmp(&sc->sc_dst, dst, dst->sa_len)) {
    524 			splx(s);
    525 			return (sc);
    526 		}
    527 	}
    528 	splx(s);
    529 	return (NULL);
    530 }
    531 
    532 /*
    533  * This function gets called when we receive an ACK for a socket in the
    534  * LISTEN state. We look up the connection in the syn cache, and if it's
    535  * there, we pull it out of the cache and turn it into a full-blown
    536  * connection in the SYN-RECEIVED state.
    537  *
    538  * The return values may not be immediately obvious, and their effects
    539  * can be subtle, so here they are:
    540  *
    541  *	NULL	SYN was not found in cache; caller should drop the
    542  *		packet and send an RST.
    543  *
    544  *	-1	We were unable to create the new connection, and are
    545  *		aborting it.  An ACK,RST is being sent to the peer
    546  *		(unless we got screwy sequence numbers; see below),
    547  *		because the 3-way handshake has been completed.  Caller
    548  *		should not free the mbuf, since we may be using it.  If
    549  *		we are not, we will free it.
    550  *
    551  *	Otherwise, the return value is a pointer to the new socket
    552  *	associated with the connection.
    553  */
    554 struct socket *
    555 syn_cache_get(struct sockaddr *src, struct sockaddr *dst,
    556     struct tcphdr *th, struct socket *so, struct mbuf *m)
    557 {
        	/*
        	 * src, dst	addresses of the incoming segment; they are the
        	 *		lookup key, and later dst becomes the new pcb's
        	 *		local address/port while src is what it is
        	 *		connected to.
        	 * th		TCP header of the segment; th_ack and th_seq are
        	 *		checked against the cached sc_iss, sc_irs and
        	 *		sc_win.
        	 * so		the listening socket; sonewconn() derives the new
        	 *		socket from it.
        	 * m		the segment itself; freed here when the sequence
        	 *		check or tcp_template() fails, handed to
        	 *		tcp_respond() on the resetandabort path.
        	 */
    558 	struct syn_cache *sc;
    559 	struct syn_cache_head *scp;
    560 	struct inpcb *inp = NULL;
    561 	struct tcpcb *tp;
    562 	int s;
    563 	struct socket *oso;
    564 
    565 	s = splsoftnet();
    566 	if ((sc = syn_cache_lookup(src, dst, &scp)) == NULL) {
    567 		splx(s);
    568 		return NULL;
    569 	}
    570 
    571 	/*
    572 	 * Verify the sequence and ack numbers.  Try getting the correct
    573 	 * response again.
    574 	 */
    575 	if ((th->th_ack != sc->sc_iss + 1) ||
    576 	    SEQ_LEQ(th->th_seq, sc->sc_irs) ||
    577 	    SEQ_GT(th->th_seq, sc->sc_irs + 1 + sc->sc_win)) {
    578 		m_freem(m);
    579 		(void)syn_cache_respond(sc);
    580 		splx(s);
    581 		return ((struct socket *)(-1));
    582 	}
    583 
        	/*
        	 * From here on the entry is unlinked from the cache but still
        	 * allocated; every exit path below releases it with
        	 * syn_cache_put().
        	 */
    584 	/* Remove this cache entry */
    585 	syn_cache_rm(sc);
    586 	splx(s);
    587 
    588 	/*
    589 	 * Ok, create the full blown connection, and set things up
    590 	 * as they would have been set up if we had created the
    591 	 * connection when the SYN arrived.  If we can't create
    592 	 * the connection, abort it.
    593 	 */
    594 	/*
    595 	 * inp still has the OLD in_pcb stuff, set the
    596 	 * v6-related flags on the new guy, too.   This is
    597 	 * done particularly for the case where an AF_INET6
    598 	 * socket is bound only to a port, and a v4 connection
    599 	 * comes in on that port.
    600 	 * we also copy the flowinfo from the original pcb
    601 	 * to the new one.
    602 	 */
        	/* Remember the listening socket; `so' now becomes the new one. */
    603 	oso = so;
    604 	so = sonewconn(so, true);
    605 	if (so == NULL)
    606 		goto resetandabort;
    607 
    608 	inp = sotoinpcb(so);
    609 
        	/* Bind the new pcb to the address and port the SYN was sent to. */
    610 	switch (src->sa_family) {
    611 	case AF_INET:
    612 		if (inp->inp_af == AF_INET) {
    613 			in4p_laddr(inp) = ((struct sockaddr_in *)dst)->sin_addr;
    614 			inp->inp_lport = ((struct sockaddr_in *)dst)->sin_port;
    615 			inp->inp_options = ip_srcroute(m);
    616 			inpcb_set_state(inp, INP_BOUND);
        			/*
        			 * No source route on this segment: fall back to
        			 * the IP options saved in the cache entry, and
        			 * take ownership of them.
        			 */
    617 			if (inp->inp_options == NULL) {
    618 				inp->inp_options = sc->sc_ipopts;
    619 				sc->sc_ipopts = NULL;
    620 			}
    621 		}
    622 #ifdef INET6
    623 		else if (inp->inp_af == AF_INET6) {
    624 			/* IPv4 packet to AF_INET6 socket */
    625 			memset(&in6p_laddr(inp), 0, sizeof(in6p_laddr(inp)));
    626 			in6p_laddr(inp).s6_addr16[5] = htons(0xffff);
    627 			bcopy(&((struct sockaddr_in *)dst)->sin_addr,
    628 				&in6p_laddr(inp).s6_addr32[3],
    629 				sizeof(((struct sockaddr_in *)dst)->sin_addr));
    630 			inp->inp_lport = ((struct sockaddr_in *)dst)->sin_port;
    631 			intotcpcb(inp)->t_family = AF_INET;
    632 			if (sotoinpcb(oso)->inp_flags & IN6P_IPV6_V6ONLY)
    633 				inp->inp_flags |= IN6P_IPV6_V6ONLY;
    634 			else
    635 				inp->inp_flags &= ~IN6P_IPV6_V6ONLY;
    636 			inpcb_set_state(inp, INP_BOUND);
    637 		}
    638 #endif
    639 		break;
    640 #ifdef INET6
    641 	case AF_INET6:
    642 		if (inp->inp_af == AF_INET6) {
    643 			in6p_laddr(inp) = ((struct sockaddr_in6 *)dst)->sin6_addr;
    644 			inp->inp_lport = ((struct sockaddr_in6 *)dst)->sin6_port;
    645 			inpcb_set_state(inp, INP_BOUND);
    646 		}
    647 		break;
    648 #endif
    649 	}
    650 
    651 #ifdef INET6
    652 	if (inp && intotcpcb(inp)->t_family == AF_INET6 && sotoinpcb(oso)) {
    653 		struct inpcb *oinp = sotoinpcb(oso);
    654 		/* inherit socket options from the listening socket */
    655 		inp->inp_flags |= (oinp->inp_flags & IN6P_CONTROLOPTS);
    656 		if (inp->inp_flags & IN6P_CONTROLOPTS) {
    657 			m_freem(inp->inp_options);
    658 			inp->inp_options = NULL;
    659 		}
    660 		ip6_savecontrol(inp, &inp->inp_options,
    661 		    mtod(m, struct ip6_hdr *), m);
    662 	}
    663 #endif
    664 
    665 	/*
    666 	 * Give the new socket our cached route reference.
    667 	 */
    668 	rtcache_copy(&inp->inp_route, &sc->sc_route);
    669 	rtcache_free(&sc->sc_route);
    670 
        	/* Connect the new pcb to the peer (src). */
    671 	if (inp->inp_af == AF_INET) {
    672 		struct sockaddr_in sin;
    673 		memcpy(&sin, src, src->sa_len);
    674 		if (inpcb_connect(inp, &sin, &lwp0)) {
    675 			goto resetandabort;
    676 		}
    677 	}
    678 #ifdef INET6
    679 	else if (inp->inp_af == AF_INET6) {
    680 		struct sockaddr_in6 sin6;
    681 		memcpy(&sin6, src, src->sa_len);
    682 		if (src->sa_family == AF_INET) {
    683 			/* IPv4 packet to AF_INET6 socket */
    684 			in6_sin_2_v4mapsin6((struct sockaddr_in *)src, &sin6);
    685 		}
    686 		if (in6_pcbconnect(inp, &sin6, NULL)) {
    687 			goto resetandabort;
    688 		}
    689 	}
    690 #endif
    691 	else {
    692 		goto resetandabort;
    693 	}
    694 
    695 	tp = intotcpcb(inp);
    696 
        	/* Only TF_NODELAY is inherited from the listening tcpcb. */
    697 	tp->t_flags = sototcpcb(oso)->t_flags & TF_NODELAY;
        	/*
        	 * NOTE(review): 15 looks like a sentinel meaning that window
        	 * scaling was not negotiated for this entry -- confirm against
        	 * the code that fills in sc_request_r_scale.
        	 */
    698 	if (sc->sc_request_r_scale != 15) {
    699 		tp->requested_s_scale = sc->sc_requested_s_scale;
    700 		tp->request_r_scale = sc->sc_request_r_scale;
    701 		tp->snd_scale = sc->sc_requested_s_scale;
    702 		tp->rcv_scale = sc->sc_request_r_scale;
    703 		tp->t_flags |= TF_REQ_SCALE|TF_RCVD_SCALE;
    704 	}
    705 	if (sc->sc_flags & SCF_TIMESTAMP)
    706 		tp->t_flags |= TF_REQ_TSTMP|TF_RCVD_TSTMP;
    707 	tp->ts_timebase = sc->sc_timebase;
    708 
    709 	tp->t_template = tcp_template(tp);
    710 	if (tp->t_template == 0) {
    711 		tp = tcp_drop(tp, ENOBUFS);	/* destroys socket */
        		/* Clear so: the abort path must not touch the socket again. */
    712 		so = NULL;
    713 		m_freem(m);
    714 		goto abort;
    715 	}
    716 
        	/* Restore the sequence state recorded when the SYN arrived. */
    717 	tp->iss = sc->sc_iss;
    718 	tp->irs = sc->sc_irs;
    719 	tcp_sendseqinit(tp);
    720 	tcp_rcvseqinit(tp);
    721 	tp->t_state = TCPS_SYN_RECEIVED;
    722 	TCP_TIMER_ARM(tp, TCPT_KEEP, tp->t_keepinit);
    723 	TCP_STATINC(TCP_STAT_ACCEPTS);
    724 
        	/*
        	 * Options recorded in the entry only take effect if the
        	 * corresponding feature is still enabled.
        	 */
    725 	if ((sc->sc_flags & SCF_SACK_PERMIT) && tcp_do_sack)
    726 		tp->t_flags |= TF_WILL_SACK;
    727 
    728 	if ((sc->sc_flags & SCF_ECN_PERMIT) && tcp_do_ecn)
    729 		tp->t_flags |= TF_ECN_PERMIT;
    730 
    731 #ifdef TCP_SIGNATURE
    732 	if (sc->sc_flags & SCF_SIGNATURE)
    733 		tp->t_flags |= TF_SIGNATURE;
    734 #endif
    735 
    736 	/* Initialize tp->t_ourmss before we deal with the peer's! */
    737 	tp->t_ourmss = sc->sc_ourmaxseg;
    738 	tcp_mss_from_peer(tp, sc->sc_peermaxseg);
    739 
    740 	/*
    741 	 * Initialize the initial congestion window.  If we
    742 	 * had to retransmit the SYN,ACK, we must initialize cwnd
    743 	 * to 1 segment (i.e. the Loss Window).
    744 	 */
    745 	if (sc->sc_rxtshift)
    746 		tp->snd_cwnd = tp->t_peermss;
    747 	else {
    748 		int ss = tcp_init_win;
    749 		if (inp->inp_af == AF_INET && in_localaddr(in4p_faddr(inp)))
    750 			ss = tcp_init_win_local;
    751 #ifdef INET6
    752 		else if (inp->inp_af == AF_INET6 && in6_localaddr(&in6p_faddr(inp)))
    753 			ss = tcp_init_win_local;
    754 #endif
    755 		tp->snd_cwnd = TCP_INITIAL_WINDOW(ss, tp->t_peermss);
    756 	}
    757 
    758 	tcp_rmx_rtt(tp);
    759 	tp->snd_wl1 = sc->sc_irs;
    760 	tp->rcv_up = sc->sc_irs + 1;
    761 
    762 	/*
    763 	 * This is what would have happened in tcp_output() when
    764 	 * the SYN,ACK was sent.
    765 	 */
    766 	tp->snd_up = tp->snd_una;
    767 	tp->snd_max = tp->snd_nxt = tp->iss+1;
    768 	TCP_TIMER_ARM(tp, TCPT_REXMT, tp->t_rxtcur);
    769 	if (sc->sc_win > 0 && SEQ_GT(tp->rcv_nxt + sc->sc_win, tp->rcv_adv))
    770 		tp->rcv_adv = tp->rcv_nxt + sc->sc_win;
    771 	tp->last_ack_sent = tp->rcv_nxt;
    772 	tp->t_partialacks = -1;
    773 	tp->t_dupacks = 0;
    774 
        	/* Success: release the cache entry and return the new socket. */
    775 	TCP_STATINC(TCP_STAT_SC_COMPLETED);
    776 	s = splsoftnet();
    777 	syn_cache_put(sc);
    778 	splx(s);
    779 	return so;
    780 
        	/*
        	 * Failure exits.  resetandabort first answers the segment with an
        	 * RST; abort tears down the new socket (if one still exists) and
        	 * releases the cache entry.
        	 */
    781 resetandabort:
    782 	(void)tcp_respond(NULL, m, m, th, (tcp_seq)0, th->th_ack, TH_RST);
    783 abort:
    784 	if (so != NULL) {
    785 		(void) soqremque(so, 1);
    786 		(void) soabort(so);
        		/*
        		 * NOTE(review): softnet_lock is taken again here, so
        		 * soabort() presumably returns with it released --
        		 * confirm.
        		 */
    787 		mutex_enter(softnet_lock);
    788 	}
    789 	s = splsoftnet();
    790 	syn_cache_put(sc);
    791 	splx(s);
    792 	TCP_STATINC(TCP_STAT_SC_ABORTED);
    793 	return ((struct socket *)(-1));
    794 }
    795 
    796 /*
    797  * This function is called when we get a RST for a
    798  * non-existent connection, so that we can see if the
    799  * connection is in the syn cache.  If it is, zap it.
    800  */
    801 
    802 void
    803 syn_cache_reset(struct sockaddr *src, struct sockaddr *dst, struct tcphdr *th)
    804 {
    805 	struct syn_cache *sc;
    806 	struct syn_cache_head *scp;
    807 	int s = splsoftnet();
    808 
    809 	if ((sc = syn_cache_lookup(src, dst, &scp)) == NULL) {
    810 		splx(s);
    811 		return;
    812 	}
    813 	if (SEQ_LT(th->th_seq, sc->sc_irs) ||
    814 	    SEQ_GT(th->th_seq, sc->sc_irs+1)) {
    815 		splx(s);
    816 		return;
    817 	}
    818 	syn_cache_rm(sc);
    819 	TCP_STATINC(TCP_STAT_SC_RESET);
    820 	syn_cache_put(sc);	/* calls pool_put but see spl above */
    821 	splx(s);
    822 }
    823 
    824 void
    825 syn_cache_unreach(const struct sockaddr *src, const struct sockaddr *dst,
    826     struct tcphdr *th)
    827 {
    828 	struct syn_cache *sc;
    829 	struct syn_cache_head *scp;
    830 	int s;
    831 
    832 	s = splsoftnet();
    833 	if ((sc = syn_cache_lookup(src, dst, &scp)) == NULL) {
    834 		splx(s);
    835 		return;
    836 	}
    837 	/* If the sequence number != sc_iss, then it's a bogus ICMP msg */
    838 	if (ntohl(th->th_seq) != sc->sc_iss) {
    839 		splx(s);
    840 		return;
    841 	}
    842 
    843 	/*
    844 	 * If we've retransmitted 3 times and this is our second error,
    845 	 * we remove the entry.  Otherwise, we allow it to continue on.
    846 	 * This prevents us from incorrectly nuking an entry during a
    847 	 * spurious network outage.
    848 	 *
    849 	 * See tcp_notify().
    850 	 */
    851 	if ((sc->sc_flags & SCF_UNREACH) == 0 || sc->sc_rxtshift < 3) {
    852 		sc->sc_flags |= SCF_UNREACH;
    853 		splx(s);
    854 		return;
    855 	}
    856 
    857 	syn_cache_rm(sc);
    858 	TCP_STATINC(TCP_STAT_SC_UNREACH);
    859 	syn_cache_put(sc);	/* calls pool_put but see spl above */
    860 	splx(s);
    861 }
    862 
    863 /*
    864  * Given a LISTEN socket and an inbound SYN request, add this to the syn
    865  * cache, and send back a segment:
    866  *	<SEQ=ISS><ACK=RCV_NXT><CTL=SYN,ACK>
    867  * to the source.
    868  *
    869  * IMPORTANT NOTE: We do _NOT_ ACK data that might accompany the SYN.
    870  * Doing so would require that we hold onto the data and deliver it
    871  * to the application.  However, if we are the target of a SYN-flood
    872  * DoS attack, an attacker could send data which would eventually
    873  * consume all available buffer space if it were ACKed.  By not ACKing
    874  * the data, we avoid this DoS scenario.
    875  */
    876 int
    877 syn_cache_add(struct sockaddr *src, struct sockaddr *dst, struct tcphdr *th,
    878     unsigned int toff, struct socket *so, struct mbuf *m, u_char *optp,
    879     int optlen, struct tcp_opt_info *oi)
    880 {
    881 	struct tcpcb tb, *tp;
    882 	long win;
    883 	struct syn_cache *sc;
    884 	struct syn_cache_head *scp;
    885 	struct mbuf *ipopts;
    886 	int s;
    887 
    888 	tp = sototcpcb(so);
    889 
    890 	/*
    891 	 * Initialize some local state.
    892 	 */
    893 	win = sbspace(&so->so_rcv);
    894 	if (win > TCP_MAXWIN)
    895 		win = TCP_MAXWIN;
    896 
    897 #ifdef TCP_SIGNATURE
    898 	if (optp || (tp->t_flags & TF_SIGNATURE))
    899 #else
    900 	if (optp)
    901 #endif
    902 	{
    903 		tb.t_flags = tcp_do_rfc1323 ? (TF_REQ_SCALE|TF_REQ_TSTMP) : 0;
    904 #ifdef TCP_SIGNATURE
    905 		tb.t_flags |= (tp->t_flags & TF_SIGNATURE);
    906 #endif
    907 		tb.t_state = TCPS_LISTEN;
    908 		if (tcp_dooptions(&tb, optp, optlen, th, m, toff, oi) < 0)
    909 			return 0;
    910 	} else
    911 		tb.t_flags = 0;
    912 
    913 	switch (src->sa_family) {
    914 	case AF_INET:
    915 		/* Remember the IP options, if any. */
    916 		ipopts = ip_srcroute(m);
    917 		break;
    918 	default:
    919 		ipopts = NULL;
    920 	}
    921 
    922 	/*
    923 	 * See if we already have an entry for this connection.
    924 	 * If we do, resend the SYN,ACK.  We do not count this
    925 	 * as a retransmission (XXX though maybe we should).
    926 	 */
    927 	if ((sc = syn_cache_lookup(src, dst, &scp)) != NULL) {
    928 		TCP_STATINC(TCP_STAT_SC_DUPESYN);
    929 		if (ipopts) {
    930 			/*
    931 			 * If we were remembering a previous source route,
    932 			 * forget it and use the new one we've been given.
    933 			 */
    934 			if (sc->sc_ipopts)
    935 				(void)m_free(sc->sc_ipopts);
    936 			sc->sc_ipopts = ipopts;
    937 		}
    938 		sc->sc_timestamp = tb.ts_recent;
    939 		m_freem(m);
    940 		if (syn_cache_respond(sc) == 0) {
    941 			uint64_t *tcps = TCP_STAT_GETREF();
    942 			tcps[TCP_STAT_SNDACKS]++;
    943 			tcps[TCP_STAT_SNDTOTAL]++;
    944 			TCP_STAT_PUTREF();
    945 		}
    946 		return 1;
    947 	}
    948 
    949 	s = splsoftnet();
    950 	sc = pool_get(&syn_cache_pool, PR_NOWAIT);
    951 	splx(s);
    952 	if (sc == NULL) {
    953 		if (ipopts)
    954 			(void)m_free(ipopts);
    955 		return 0;
    956 	}
    957 
    958 	/*
    959 	 * Fill in the cache, and put the necessary IP and TCP
    960 	 * options into the reply.
    961 	 */
    962 	memset(sc, 0, sizeof(struct syn_cache));
    963 	callout_init(&sc->sc_timer, CALLOUT_MPSAFE);
    964 	memcpy(&sc->sc_src, src, src->sa_len);
    965 	memcpy(&sc->sc_dst, dst, dst->sa_len);
    966 	sc->sc_flags = 0;
    967 	sc->sc_ipopts = ipopts;
    968 	sc->sc_irs = th->th_seq;
    969 	switch (src->sa_family) {
    970 	case AF_INET:
    971 	    {
    972 		struct sockaddr_in *srcin = (void *)src;
    973 		struct sockaddr_in *dstin = (void *)dst;
    974 
    975 		sc->sc_iss = tcp_new_iss1(&dstin->sin_addr,
    976 		    &srcin->sin_addr, dstin->sin_port,
    977 		    srcin->sin_port, sizeof(dstin->sin_addr));
    978 		break;
    979 	    }
    980 #ifdef INET6
    981 	case AF_INET6:
    982 	    {
    983 		struct sockaddr_in6 *srcin6 = (void *)src;
    984 		struct sockaddr_in6 *dstin6 = (void *)dst;
    985 
    986 		sc->sc_iss = tcp_new_iss1(&dstin6->sin6_addr,
    987 		    &srcin6->sin6_addr, dstin6->sin6_port,
    988 		    srcin6->sin6_port, sizeof(dstin6->sin6_addr));
    989 		break;
    990 	    }
    991 #endif
    992 	}
    993 	sc->sc_peermaxseg = oi->maxseg;
    994 	sc->sc_ourmaxseg = tcp_mss_to_advertise(m->m_flags & M_PKTHDR ?
    995 	    m_get_rcvif_NOMPSAFE(m) : NULL, sc->sc_src.sa.sa_family);
    996 	sc->sc_win = win;
    997 	sc->sc_timebase = tcp_now - 1;	/* see tcp_newtcpcb() */
    998 	sc->sc_timestamp = tb.ts_recent;
    999 	if ((tb.t_flags & (TF_REQ_TSTMP|TF_RCVD_TSTMP)) ==
   1000 	    (TF_REQ_TSTMP|TF_RCVD_TSTMP))
   1001 		sc->sc_flags |= SCF_TIMESTAMP;
   1002 	if ((tb.t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
   1003 	    (TF_RCVD_SCALE|TF_REQ_SCALE)) {
   1004 		sc->sc_requested_s_scale = tb.requested_s_scale;
   1005 		sc->sc_request_r_scale = 0;
   1006 		/*
   1007 		 * Pick the smallest possible scaling factor that
   1008 		 * will still allow us to scale up to sb_max.
   1009 		 *
   1010 		 * We do this because there are broken firewalls that
   1011 		 * will corrupt the window scale option, leading to
   1012 		 * the other endpoint believing that our advertised
   1013 		 * window is unscaled.  At scale factors larger than
   1014 		 * 5 the unscaled window will drop below 1500 bytes,
   1015 		 * leading to serious problems when traversing these
   1016 		 * broken firewalls.
   1017 		 *
   1018 		 * With the default sbmax of 256K, a scale factor
   1019 		 * of 3 will be chosen by this algorithm.  Those who
   1020 		 * choose a larger sbmax should watch out
   1021 		 * for the compatibility problems mentioned above.
   1022 		 *
   1023 		 * RFC1323: The Window field in a SYN (i.e., a <SYN>
   1024 		 * or <SYN,ACK>) segment itself is never scaled.
   1025 		 */
   1026 		while (sc->sc_request_r_scale < TCP_MAX_WINSHIFT &&
   1027 		    (TCP_MAXWIN << sc->sc_request_r_scale) < sb_max)
   1028 			sc->sc_request_r_scale++;
   1029 	} else {
   1030 		sc->sc_requested_s_scale = 15;
   1031 		sc->sc_request_r_scale = 15;
   1032 	}
   1033 	if ((tb.t_flags & TF_SACK_PERMIT) && tcp_do_sack)
   1034 		sc->sc_flags |= SCF_SACK_PERMIT;
   1035 
   1036 	/*
   1037 	 * ECN setup packet received.
   1038 	 */
   1039 	if ((th->th_flags & (TH_ECE|TH_CWR)) && tcp_do_ecn)
   1040 		sc->sc_flags |= SCF_ECN_PERMIT;
   1041 
   1042 #ifdef TCP_SIGNATURE
   1043 	if (tb.t_flags & TF_SIGNATURE)
   1044 		sc->sc_flags |= SCF_SIGNATURE;
   1045 #endif
   1046 	sc->sc_tp = tp;
   1047 	m_freem(m);
   1048 	if (syn_cache_respond(sc) == 0) {
   1049 		uint64_t *tcps = TCP_STAT_GETREF();
   1050 		tcps[TCP_STAT_SNDACKS]++;
   1051 		tcps[TCP_STAT_SNDTOTAL]++;
   1052 		TCP_STAT_PUTREF();
   1053 		syn_cache_insert(sc, tp);
   1054 	} else {
   1055 		s = splsoftnet();
   1056 		/*
   1057 		 * syn_cache_put() will try to schedule the timer, so
   1058 		 * we need to initialize it
   1059 		 */
   1060 		syn_cache_timer_arm(sc);
   1061 		syn_cache_put(sc);
   1062 		splx(s);
   1063 		TCP_STATINC(TCP_STAT_SC_DROPPED);
   1064 	}
   1065 	return 1;
   1066 }
   1067 
   1068 /*
   1069  * syn_cache_respond: (re)send SYN+ACK.
   1070  *
   1071  * Returns 0 on success.
   1072  */
   1073 
   1074 static int
   1075 syn_cache_respond(struct syn_cache *sc)
   1076 {
   1077 #ifdef INET6
   1078 	struct rtentry *rt = NULL;
   1079 #endif
   1080 	struct route *ro;
   1081 	u_int8_t *optp;
   1082 	int optlen, error;
   1083 	u_int16_t tlen;
   1084 	struct ip *ip = NULL;
   1085 #ifdef INET6
   1086 	struct ip6_hdr *ip6 = NULL;
   1087 #endif
   1088 	struct tcpcb *tp;
   1089 	struct tcphdr *th;
   1090 	struct mbuf *m;
   1091 	u_int hlen;
   1092 #ifdef TCP_SIGNATURE
   1093 	struct secasvar *sav = NULL;
   1094 	u_int8_t *sigp = NULL;
   1095 #endif
   1096 
   1097 	ro = &sc->sc_route;
   1098 	switch (sc->sc_src.sa.sa_family) {
   1099 	case AF_INET:
   1100 		hlen = sizeof(struct ip);
   1101 		break;
   1102 #ifdef INET6
   1103 	case AF_INET6:
   1104 		hlen = sizeof(struct ip6_hdr);
   1105 		break;
   1106 #endif
   1107 	default:
   1108 		return EAFNOSUPPORT;
   1109 	}
   1110 
   1111 	/* Worst case scenario, since we don't know the option size yet. */
   1112 	tlen = hlen + sizeof(struct tcphdr) + MAX_TCPOPTLEN;
   1113 	KASSERT(max_linkhdr + tlen <= MCLBYTES);
   1114 
   1115 	/*
   1116 	 * Create the IP+TCP header from scratch.
   1117 	 */
   1118 	MGETHDR(m, M_DONTWAIT, MT_DATA);
   1119 	if (m && (max_linkhdr + tlen) > MHLEN) {
   1120 		MCLGET(m, M_DONTWAIT);
   1121 		if ((m->m_flags & M_EXT) == 0) {
   1122 			m_freem(m);
   1123 			m = NULL;
   1124 		}
   1125 	}
   1126 	if (m == NULL)
   1127 		return ENOBUFS;
   1128 	MCLAIM(m, &tcp_tx_mowner);
   1129 
   1130 	tp = sc->sc_tp;
   1131 
   1132 	/* Fixup the mbuf. */
   1133 	m->m_data += max_linkhdr;
   1134 	m_reset_rcvif(m);
   1135 	memset(mtod(m, void *), 0, tlen);
   1136 
   1137 	switch (sc->sc_src.sa.sa_family) {
   1138 	case AF_INET:
   1139 		ip = mtod(m, struct ip *);
   1140 		ip->ip_v = 4;
   1141 		ip->ip_dst = sc->sc_src.sin.sin_addr;
   1142 		ip->ip_src = sc->sc_dst.sin.sin_addr;
   1143 		ip->ip_p = IPPROTO_TCP;
   1144 		th = (struct tcphdr *)(ip + 1);
   1145 		th->th_dport = sc->sc_src.sin.sin_port;
   1146 		th->th_sport = sc->sc_dst.sin.sin_port;
   1147 		break;
   1148 #ifdef INET6
   1149 	case AF_INET6:
   1150 		ip6 = mtod(m, struct ip6_hdr *);
   1151 		ip6->ip6_vfc = IPV6_VERSION;
   1152 		ip6->ip6_dst = sc->sc_src.sin6.sin6_addr;
   1153 		ip6->ip6_src = sc->sc_dst.sin6.sin6_addr;
   1154 		ip6->ip6_nxt = IPPROTO_TCP;
   1155 		/* ip6_plen will be updated in ip6_output() */
   1156 		th = (struct tcphdr *)(ip6 + 1);
   1157 		th->th_dport = sc->sc_src.sin6.sin6_port;
   1158 		th->th_sport = sc->sc_dst.sin6.sin6_port;
   1159 		break;
   1160 #endif
   1161 	default:
   1162 		panic("%s: impossible (1)", __func__);
   1163 	}
   1164 
   1165 	th->th_seq = htonl(sc->sc_iss);
   1166 	th->th_ack = htonl(sc->sc_irs + 1);
   1167 	th->th_flags = TH_SYN|TH_ACK;
   1168 	th->th_win = htons(sc->sc_win);
   1169 	/* th_x2, th_sum, th_urp already 0 from memset */
   1170 
   1171 	/* Tack on the TCP options. */
   1172 	optp = (u_int8_t *)(th + 1);
   1173 	optlen = 0;
   1174 	*optp++ = TCPOPT_MAXSEG;
   1175 	*optp++ = TCPOLEN_MAXSEG;
   1176 	*optp++ = (sc->sc_ourmaxseg >> 8) & 0xff;
   1177 	*optp++ = sc->sc_ourmaxseg & 0xff;
   1178 	optlen += TCPOLEN_MAXSEG;
   1179 
   1180 	if (sc->sc_request_r_scale != 15) {
   1181 		*((u_int32_t *)optp) = htonl(TCPOPT_NOP << 24 |
   1182 		    TCPOPT_WINDOW << 16 | TCPOLEN_WINDOW << 8 |
   1183 		    sc->sc_request_r_scale);
   1184 		optp += TCPOLEN_WINDOW + TCPOLEN_NOP;
   1185 		optlen += TCPOLEN_WINDOW + TCPOLEN_NOP;
   1186 	}
   1187 
   1188 	if (sc->sc_flags & SCF_SACK_PERMIT) {
   1189 		/* Let the peer know that we will SACK. */
   1190 		*optp++ = TCPOPT_SACK_PERMITTED;
   1191 		*optp++ = TCPOLEN_SACK_PERMITTED;
   1192 		optlen += TCPOLEN_SACK_PERMITTED;
   1193 	}
   1194 
   1195 	if (sc->sc_flags & SCF_TIMESTAMP) {
   1196 		while (optlen % 4 != 2) {
   1197 			optlen += TCPOLEN_NOP;
   1198 			*optp++ = TCPOPT_NOP;
   1199 		}
   1200 		*optp++ = TCPOPT_TIMESTAMP;
   1201 		*optp++ = TCPOLEN_TIMESTAMP;
   1202 		u_int32_t *lp = (u_int32_t *)(optp);
   1203 		/* Form timestamp option as shown in appendix A of RFC 1323. */
   1204 		*lp++ = htonl(SYN_CACHE_TIMESTAMP(sc));
   1205 		*lp   = htonl(sc->sc_timestamp);
   1206 		optp += TCPOLEN_TIMESTAMP - 2;
   1207 		optlen += TCPOLEN_TIMESTAMP;
   1208 	}
   1209 
   1210 #ifdef TCP_SIGNATURE
   1211 	if (sc->sc_flags & SCF_SIGNATURE) {
   1212 		sav = tcp_signature_getsav(m);
   1213 		if (sav == NULL) {
   1214 			m_freem(m);
   1215 			return EPERM;
   1216 		}
   1217 
   1218 		*optp++ = TCPOPT_SIGNATURE;
   1219 		*optp++ = TCPOLEN_SIGNATURE;
   1220 		sigp = optp;
   1221 		memset(optp, 0, TCP_SIGLEN);
   1222 		optp += TCP_SIGLEN;
   1223 		optlen += TCPOLEN_SIGNATURE;
   1224 	}
   1225 #endif
   1226 
   1227 	/*
   1228 	 * Terminate and pad TCP options to a 4 byte boundary.
   1229 	 *
   1230 	 * According to RFC793: "The content of the header beyond the
   1231 	 * End-of-Option option must be header padding (i.e., zero)."
   1232 	 * And later: "The padding is composed of zeros."
   1233 	 */
   1234 	if (optlen % 4) {
   1235 		optlen += TCPOLEN_EOL;
   1236 		*optp++ = TCPOPT_EOL;
   1237 	}
   1238 	while (optlen % 4) {
   1239 		optlen += TCPOLEN_PAD;
   1240 		*optp++ = TCPOPT_PAD;
   1241 	}
   1242 
   1243 	/* Compute the actual values now that we've added the options. */
   1244 	tlen = hlen + sizeof(struct tcphdr) + optlen;
   1245 	m->m_len = m->m_pkthdr.len = tlen;
   1246 	th->th_off = (sizeof(struct tcphdr) + optlen) >> 2;
   1247 
   1248 #ifdef TCP_SIGNATURE
   1249 	if (sav) {
   1250 		(void)tcp_signature(m, th, hlen, sav, sigp);
   1251 		key_sa_recordxfer(sav, m);
   1252 		KEY_SA_UNREF(&sav);
   1253 	}
   1254 #endif
   1255 
   1256 	/*
   1257 	 * Send ECN SYN-ACK setup packet.
   1258 	 * Routes can be asymmetric, so, even if we receive a packet
   1259 	 * with ECE and CWR set, we must not assume no one will block
   1260 	 * the ECE packet we are about to send.
   1261 	 */
   1262 	if ((sc->sc_flags & SCF_ECN_PERMIT) && tp &&
   1263 	    SEQ_GEQ(tp->snd_nxt, tp->snd_max)) {
   1264 		th->th_flags |= TH_ECE;
   1265 		TCP_STATINC(TCP_STAT_ECN_SHS);
   1266 
   1267 		/*
   1268 		 * draft-ietf-tcpm-ecnsyn-00.txt
   1269 		 *
   1270 		 * "[...] a TCP node MAY respond to an ECN-setup
   1271 		 * SYN packet by setting ECT in the responding
   1272 		 * ECN-setup SYN/ACK packet, indicating to routers
   1273 		 * that the SYN/ACK packet is ECN-Capable.
   1274 		 * This allows a congested router along the path
   1275 		 * to mark the packet instead of dropping the
   1276 		 * packet as an indication of congestion."
   1277 		 *
   1278 		 * "[...] There can be a great benefit in setting
   1279 		 * an ECN-capable codepoint in SYN/ACK packets [...]
   1280 		 * Congestion is  most likely to occur in
   1281 		 * the server-to-client direction.  As a result,
   1282 		 * setting an ECN-capable codepoint in SYN/ACK
   1283 		 * packets can reduce the occurrence of three-second
   1284 		 * retransmit timeouts resulting from the drop
   1285 		 * of SYN/ACK packets."
   1286 		 *
   1287 		 * Page 4 and 6, January 2006.
   1288 		 */
   1289 
   1290 		switch (sc->sc_src.sa.sa_family) {
   1291 		case AF_INET:
   1292 			ip->ip_tos |= IPTOS_ECN_ECT0;
   1293 			break;
   1294 #ifdef INET6
   1295 		case AF_INET6:
   1296 			ip6->ip6_flow |= htonl(IPTOS_ECN_ECT0 << 20);
   1297 			break;
   1298 #endif
   1299 		}
   1300 		TCP_STATINC(TCP_STAT_ECN_ECT);
   1301 	}
   1302 
   1303 
   1304 	/*
   1305 	 * Compute the packet's checksum.
   1306 	 *
   1307 	 * Fill in some straggling IP bits.  Note the stack expects
   1308 	 * ip_len to be in host order, for convenience.
   1309 	 */
   1310 	switch (sc->sc_src.sa.sa_family) {
   1311 	case AF_INET:
   1312 		ip->ip_len = htons(tlen - hlen);
   1313 		th->th_sum = 0;
   1314 		th->th_sum = in4_cksum(m, IPPROTO_TCP, hlen, tlen - hlen);
   1315 		ip->ip_len = htons(tlen);
   1316 		ip->ip_ttl = ip_defttl;
   1317 		/* XXX tos? */
   1318 		break;
   1319 #ifdef INET6
   1320 	case AF_INET6:
   1321 		ip6->ip6_plen = htons(tlen - hlen);
   1322 		th->th_sum = 0;
   1323 		th->th_sum = in6_cksum(m, IPPROTO_TCP, hlen, tlen - hlen);
   1324 		ip6->ip6_vfc &= ~IPV6_VERSION_MASK;
   1325 		ip6->ip6_vfc |= IPV6_VERSION;
   1326 		ip6->ip6_plen = htons(tlen - hlen);
   1327 		/* ip6_hlim will be initialized afterwards */
   1328 		/* XXX flowlabel? */
   1329 		break;
   1330 #endif
   1331 	}
   1332 
   1333 	/* XXX use IPsec policy on listening socket, on SYN ACK */
   1334 	tp = sc->sc_tp;
   1335 
   1336 	switch (sc->sc_src.sa.sa_family) {
   1337 	case AF_INET:
   1338 		error = ip_output(m, sc->sc_ipopts, ro,
   1339 		    (ip_mtudisc ? IP_MTUDISC : 0),
   1340 		    NULL, tp ? tp->t_inpcb : NULL);
   1341 		break;
   1342 #ifdef INET6
   1343 	case AF_INET6:
   1344 		ip6->ip6_hlim = in6_selecthlim(NULL,
   1345 		    (rt = rtcache_validate(ro)) != NULL ? rt->rt_ifp : NULL);
   1346 		rtcache_unref(rt, ro);
   1347 
   1348 		error = ip6_output(m, NULL /*XXX*/, ro, 0, NULL,
   1349 		    tp ? tp->t_inpcb : NULL, NULL);
   1350 		break;
   1351 #endif
   1352 	default:
   1353 		panic("%s: impossible (2)", __func__);
   1354 	}
   1355 
   1356 	return error;
   1357 }
   1358