Home | History | Annotate | Line # | Download | only in netinet
tcp_syncache.c revision 1.2
      1 /*	$NetBSD: tcp_syncache.c,v 1.2 2022/09/20 10:12:18 ozaki-r Exp $	*/
      2 
      3 /*
      4  * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
      5  * All rights reserved.
      6  *
      7  * Redistribution and use in source and binary forms, with or without
      8  * modification, are permitted provided that the following conditions
      9  * are met:
     10  * 1. Redistributions of source code must retain the above copyright
     11  *    notice, this list of conditions and the following disclaimer.
     12  * 2. Redistributions in binary form must reproduce the above copyright
     13  *    notice, this list of conditions and the following disclaimer in the
     14  *    documentation and/or other materials provided with the distribution.
     15  * 3. Neither the name of the project nor the names of its contributors
     16  *    may be used to endorse or promote products derived from this software
     17  *    without specific prior written permission.
     18  *
     19  * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
     20  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     21  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     22  * ARE DISCLAIMED.  IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
     23  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
     24  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
     25  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
     26  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
     27  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
     28  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
     29  * SUCH DAMAGE.
     30  */
     31 
     32 /*
     33  *      @(#)COPYRIGHT   1.1 (NRL) 17 January 1995
     34  *
     35  * NRL grants permission for redistribution and use in source and binary
     36  * forms, with or without modification, of the software and documentation
     37  * created at NRL provided that the following conditions are met:
     38  *
     39  * 1. Redistributions of source code must retain the above copyright
     40  *    notice, this list of conditions and the following disclaimer.
     41  * 2. Redistributions in binary form must reproduce the above copyright
     42  *    notice, this list of conditions and the following disclaimer in the
     43  *    documentation and/or other materials provided with the distribution.
     44  * 3. All advertising materials mentioning features or use of this software
     45  *    must display the following acknowledgements:
     46  *      This product includes software developed by the University of
     47  *      California, Berkeley and its contributors.
     48  *      This product includes software developed at the Information
     49  *      Technology Division, US Naval Research Laboratory.
     50  * 4. Neither the name of the NRL nor the names of its contributors
     51  *    may be used to endorse or promote products derived from this software
     52  *    without specific prior written permission.
     53  *
     54  * THE SOFTWARE PROVIDED BY NRL IS PROVIDED BY NRL AND CONTRIBUTORS ``AS
     55  * IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
     56  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
     57  * PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL NRL OR
     58  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
     59  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
     60  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
     61  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
     62  * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
     63  * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
     64  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     65  *
     66  * The views and conclusions contained in the software and documentation
     67  * are those of the authors and should not be interpreted as representing
     68  * official policies, either expressed or implied, of the US Naval
     69  * Research Laboratory (NRL).
     70  */
     71 
     72 /*-
     73  * Copyright (c) 1997, 1998, 1999, 2001, 2005, 2006,
     74  * 2011 The NetBSD Foundation, Inc.
     75  * All rights reserved.
     76  *
     77  * This code is derived from software contributed to The NetBSD Foundation
     78  * by Coyote Point Systems, Inc.
     79  * This code is derived from software contributed to The NetBSD Foundation
     80  * by Jason R. Thorpe and Kevin M. Lahey of the Numerical Aerospace Simulation
     81  * Facility, NASA Ames Research Center.
     82  * This code is derived from software contributed to The NetBSD Foundation
     83  * by Charles M. Hannum.
     84  * This code is derived from software contributed to The NetBSD Foundation
     85  * by Rui Paulo.
     86  *
     87  * Redistribution and use in source and binary forms, with or without
     88  * modification, are permitted provided that the following conditions
     89  * are met:
     90  * 1. Redistributions of source code must retain the above copyright
     91  *    notice, this list of conditions and the following disclaimer.
     92  * 2. Redistributions in binary form must reproduce the above copyright
     93  *    notice, this list of conditions and the following disclaimer in the
     94  *    documentation and/or other materials provided with the distribution.
     95  *
     96  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
     97  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
     98  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     99  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
    100  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
    101  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
    102  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
    103  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
    104  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
    105  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
    106  * POSSIBILITY OF SUCH DAMAGE.
    107  */
    108 
    109 /*
    110  * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995
    111  *	The Regents of the University of California.  All rights reserved.
    112  *
    113  * Redistribution and use in source and binary forms, with or without
    114  * modification, are permitted provided that the following conditions
    115  * are met:
    116  * 1. Redistributions of source code must retain the above copyright
    117  *    notice, this list of conditions and the following disclaimer.
    118  * 2. Redistributions in binary form must reproduce the above copyright
    119  *    notice, this list of conditions and the following disclaimer in the
    120  *    documentation and/or other materials provided with the distribution.
    121  * 3. Neither the name of the University nor the names of its contributors
    122  *    may be used to endorse or promote products derived from this software
    123  *    without specific prior written permission.
    124  *
    125  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
    126  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
    127  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
    128  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
    129  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
    130  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
    131  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
    132  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
    133  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
    134  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
    135  * SUCH DAMAGE.
    136  *
    137  *	@(#)tcp_input.c	8.12 (Berkeley) 5/24/95
    138  */
    139 
    140 /*
    141  *	TODO list for SYN cache stuff:
    142  *
    143  *	Find room for a "state" field, which is needed to keep a
    144  *	compressed state for TIME_WAIT TCBs.  It's been noted already
    145  *	that this is fairly important for very high-volume web and
    146  *	mail servers, which use a large number of short-lived
    147  *	connections.
    148  */
    149 
    150 #include <sys/cdefs.h>
    151 __KERNEL_RCSID(0, "$NetBSD: tcp_syncache.c,v 1.2 2022/09/20 10:12:18 ozaki-r Exp $");
    152 
    153 #ifdef _KERNEL_OPT
    154 #include "opt_inet.h"
    155 #include "opt_ipsec.h"
    156 #endif
    157 
    158 #include <sys/param.h>
    159 #include <sys/systm.h>
    160 #include <sys/mbuf.h>
    161 #include <sys/protosw.h>
    162 #include <sys/socket.h>
    163 #include <sys/socketvar.h>
    164 #include <sys/errno.h>
    165 #include <sys/syslog.h>
    166 #include <sys/pool.h>
    167 #include <sys/domain.h>
    168 #include <sys/kernel.h>
    169 #include <sys/lwp.h> /* for lwp0 */
    170 #include <sys/cprng.h>
    171 
    172 #include <netinet/in.h>
    173 #include <netinet/ip.h>
    174 #include <netinet/in_pcb.h>
    175 #include <netinet/in_var.h>
    176 #include <netinet/ip_var.h>
    177 
    178 #include <netinet/ip6.h>
    179 #ifdef INET6
    180 #include <netinet6/ip6_var.h>
    181 #include <netinet6/in6_pcb.h>
    182 #include <netinet6/ip6_var.h>
    183 #include <netinet6/in6_var.h>
    184 #endif
    185 
    186 #include <netinet/tcp.h>
    187 #include <netinet/tcp_fsm.h>
    188 #include <netinet/tcp_seq.h>
    189 #include <netinet/tcp_timer.h>
    190 #include <netinet/tcp_var.h>
    191 #include <netinet/tcp_private.h>
    192 #include <netinet/tcp_syncache.h>
    193 
    194 #ifdef TCP_SIGNATURE
    195 #ifdef IPSEC
    196 #include <netipsec/ipsec.h>
    197 #include <netipsec/key.h>
    198 #ifdef INET6
    199 #include <netipsec/ipsec6.h>
    200 #endif
    201 #endif	/* IPSEC*/
    202 #endif
    203 
    204 static void	syn_cache_timer(void *);
    205 static struct syn_cache *
    206 		syn_cache_lookup(const struct sockaddr *, const struct sockaddr *,
    207 		struct syn_cache_head **);
    208 static int	syn_cache_respond(struct syn_cache *);
    209 
    210 /* syn hash parameters */
    211 #define	TCP_SYN_HASH_SIZE	293
    212 #define	TCP_SYN_BUCKET_SIZE	35
    213 static int	tcp_syn_cache_size = TCP_SYN_HASH_SIZE;
    214 int		tcp_syn_cache_limit = TCP_SYN_HASH_SIZE*TCP_SYN_BUCKET_SIZE;
    215 int		tcp_syn_bucket_limit = 3*TCP_SYN_BUCKET_SIZE;
    216 static struct	syn_cache_head tcp_syn_cache[TCP_SYN_HASH_SIZE];
    217 
    218 /*
    219  * TCP compressed state engine.  Currently used to hold compressed
    220  * state for SYN_RECEIVED.
    221  */
    222 
    223 u_long	syn_cache_count;
    224 static u_int32_t syn_hash1, syn_hash2;
    225 
    226 #define SYN_HASH(sa, sp, dp) \
    227 	((((sa)->s_addr^syn_hash1)*(((((u_int32_t)(dp))<<16) + \
    228 				     ((u_int32_t)(sp)))^syn_hash2)))
    229 #ifndef INET6
    230 #define	SYN_HASHALL(hash, src, dst) \
    231 do {									\
    232 	hash = SYN_HASH(&((const struct sockaddr_in *)(src))->sin_addr,	\
    233 		((const struct sockaddr_in *)(src))->sin_port,		\
    234 		((const struct sockaddr_in *)(dst))->sin_port);		\
    235 } while (/*CONSTCOND*/ 0)
    236 #else
    237 #define SYN_HASH6(sa, sp, dp) \
    238 	((((sa)->s6_addr32[0] ^ (sa)->s6_addr32[3] ^ syn_hash1) * \
    239 	  (((((u_int32_t)(dp))<<16) + ((u_int32_t)(sp)))^syn_hash2)) \
    240 	 & 0x7fffffff)
    241 
    242 #define SYN_HASHALL(hash, src, dst) \
    243 do {									\
    244 	switch ((src)->sa_family) {					\
    245 	case AF_INET:							\
    246 		hash = SYN_HASH(&((const struct sockaddr_in *)(src))->sin_addr, \
    247 			((const struct sockaddr_in *)(src))->sin_port,	\
    248 			((const struct sockaddr_in *)(dst))->sin_port);	\
    249 		break;							\
    250 	case AF_INET6:							\
    251 		hash = SYN_HASH6(&((const struct sockaddr_in6 *)(src))->sin6_addr, \
    252 			((const struct sockaddr_in6 *)(src))->sin6_port,	\
    253 			((const struct sockaddr_in6 *)(dst))->sin6_port);	\
    254 		break;							\
    255 	default:							\
    256 		hash = 0;						\
    257 	}								\
    258 } while (/*CONSTCOND*/0)
    259 #endif /* INET6 */
    260 
    261 static struct pool syn_cache_pool;
    262 
    263 /*
    264  * We don't estimate RTT with SYNs, so each packet starts with the default
    265  * RTT and each timer step has a fixed timeout value.
    266  */
    267 static inline void
    268 syn_cache_timer_arm(struct syn_cache *sc)
    269 {
    270 
    271 	TCPT_RANGESET(sc->sc_rxtcur,
    272 	    TCPTV_SRTTDFLT * tcp_backoff[sc->sc_rxtshift], TCPTV_MIN,
    273 	    TCPTV_REXMTMAX);
    274 	callout_reset(&sc->sc_timer,
    275 	    sc->sc_rxtcur * (hz / PR_SLOWHZ), syn_cache_timer, sc);
    276 }
    277 
    278 #define	SYN_CACHE_TIMESTAMP(sc)	(tcp_now - (sc)->sc_timebase)
    279 
    280 static inline void
    281 syn_cache_rm(struct syn_cache *sc)
    282 {
    283 	TAILQ_REMOVE(&tcp_syn_cache[sc->sc_bucketidx].sch_bucket,
    284 	    sc, sc_bucketq);
    285 	sc->sc_tp = NULL;
    286 	LIST_REMOVE(sc, sc_tpq);
    287 	tcp_syn_cache[sc->sc_bucketidx].sch_length--;
    288 	callout_stop(&sc->sc_timer);
    289 	syn_cache_count--;
    290 }
    291 
    292 static inline void
    293 syn_cache_put(struct syn_cache *sc)
    294 {
    295 	if (sc->sc_ipopts)
    296 		(void) m_free(sc->sc_ipopts);
    297 	rtcache_free(&sc->sc_route);
    298 	sc->sc_flags |= SCF_DEAD;
    299 	if (!callout_invoking(&sc->sc_timer))
    300 		callout_schedule(&(sc)->sc_timer, 1);
    301 }
    302 
    303 void
    304 syn_cache_init(void)
    305 {
    306 	int i;
    307 
    308 	pool_init(&syn_cache_pool, sizeof(struct syn_cache), 0, 0, 0,
    309 	    "synpl", NULL, IPL_SOFTNET);
    310 
    311 	/* Initialize the hash buckets. */
    312 	for (i = 0; i < tcp_syn_cache_size; i++)
    313 		TAILQ_INIT(&tcp_syn_cache[i].sch_bucket);
    314 }
    315 
/*
 * syn_cache_insert: add the compressed SYN_RECEIVED state `sc' to the
 * cache on behalf of the listening tcpcb `tp'.  If the target bucket or
 * the whole cache is at its limit, the oldest entry of some bucket is
 * evicted first.  Finally the entry's SYN,ACK retransmit timer is armed.
 * List manipulation is done at splsoftnet.
 */
void
syn_cache_insert(struct syn_cache *sc, struct tcpcb *tp)
{
	struct syn_cache_head *scp;
	struct syn_cache *sc2;
	int s;

	/*
	 * If there are no entries in the hash table, reinitialize
	 * the hash secrets.  (Fresh secrets while the cache is empty
	 * make the bucket distribution unpredictable to an attacker.)
	 */
	if (syn_cache_count == 0) {
		syn_hash1 = cprng_fast32();
		syn_hash2 = cprng_fast32();
	}

	/* Compute the keyed hash and the bucket it selects. */
	SYN_HASHALL(sc->sc_hash, &sc->sc_src.sa, &sc->sc_dst.sa);
	sc->sc_bucketidx = sc->sc_hash % tcp_syn_cache_size;
	scp = &tcp_syn_cache[sc->sc_bucketidx];

	/*
	 * Make sure that we don't overflow the per-bucket
	 * limit or the total cache size limit.
	 */
	s = splsoftnet();
	if (scp->sch_length >= tcp_syn_bucket_limit) {
		TCP_STATINC(TCP_STAT_SC_BUCKETOVERFLOW);
		/*
		 * The bucket is full.  Toss the oldest element in the
		 * bucket.  This will be the first entry in the bucket
		 * (entries are appended at the tail below).
		 */
		sc2 = TAILQ_FIRST(&scp->sch_bucket);
#ifdef DIAGNOSTIC
		/*
		 * This should never happen; we should always find an
		 * entry in our bucket.
		 */
		if (sc2 == NULL)
			panic("syn_cache_insert: bucketoverflow: impossible");
#endif
		syn_cache_rm(sc2);
		syn_cache_put(sc2);	/* calls pool_put but see spl above */
	} else if (syn_cache_count >= tcp_syn_cache_limit) {
		struct syn_cache_head *scp2, *sce;

		TCP_STATINC(TCP_STAT_SC_OVERFLOWED);
		/*
		 * The cache is full.  Toss the oldest entry in the
		 * first non-empty bucket we can find.
		 *
		 * XXX We would really like to toss the oldest
		 * entry in the cache, but we hope that this
		 * condition doesn't happen very often.
		 */
		scp2 = scp;
		if (TAILQ_EMPTY(&scp2->sch_bucket)) {
			/*
			 * Scan forward from our bucket, wrapping at the
			 * end of the table, until a non-empty bucket or
			 * a full circle back to scp.
			 */
			sce = &tcp_syn_cache[tcp_syn_cache_size];
			for (++scp2; scp2 != scp; scp2++) {
				if (scp2 >= sce)
					scp2 = &tcp_syn_cache[0];
				if (! TAILQ_EMPTY(&scp2->sch_bucket))
					break;
			}
#ifdef DIAGNOSTIC
			/*
			 * This should never happen; we should always find a
			 * non-empty bucket.
			 */
			if (scp2 == scp)
				panic("syn_cache_insert: cacheoverflow: "
				    "impossible");
#endif
		}
		sc2 = TAILQ_FIRST(&scp2->sch_bucket);
		syn_cache_rm(sc2);
		syn_cache_put(sc2);	/* calls pool_put but see spl above */
	}

	/*
	 * Initialize the entry's timer.
	 */
	sc->sc_rxttot = 0;
	sc->sc_rxtshift = 0;
	syn_cache_timer_arm(sc);

	/* Link it from tcpcb entry */
	LIST_INSERT_HEAD(&tp->t_sc, sc, sc_tpq);

	/* Put it into the bucket (at the tail: newest last). */
	TAILQ_INSERT_TAIL(&scp->sch_bucket, sc, sc_bucketq);
	scp->sch_length++;
	syn_cache_count++;

	TCP_STATINC(TCP_STAT_SC_ADDED);
	splx(s);
}
    412 
    413 /*
    414  * Walk the timer queues, looking for SYN,ACKs that need to be retransmitted.
    415  * If we have retransmitted an entry the maximum number of times, expire
    416  * that entry.
    417  */
    418 static void
    419 syn_cache_timer(void *arg)
    420 {
    421 	struct syn_cache *sc = arg;
    422 
    423 	mutex_enter(softnet_lock);
    424 	KERNEL_LOCK(1, NULL);
    425 
    426 	callout_ack(&sc->sc_timer);
    427 
    428 	if (__predict_false(sc->sc_flags & SCF_DEAD)) {
    429 		TCP_STATINC(TCP_STAT_SC_DELAYED_FREE);
    430 		goto free;
    431 	}
    432 
    433 	if (__predict_false(sc->sc_rxtshift == TCP_MAXRXTSHIFT)) {
    434 		/* Drop it -- too many retransmissions. */
    435 		goto dropit;
    436 	}
    437 
    438 	/*
    439 	 * Compute the total amount of time this entry has
    440 	 * been on a queue.  If this entry has been on longer
    441 	 * than the keep alive timer would allow, expire it.
    442 	 */
    443 	sc->sc_rxttot += sc->sc_rxtcur;
    444 	if (sc->sc_rxttot >= MIN(tcp_keepinit, TCP_TIMER_MAXTICKS))
    445 		goto dropit;
    446 
    447 	TCP_STATINC(TCP_STAT_SC_RETRANSMITTED);
    448 	(void)syn_cache_respond(sc);
    449 
    450 	/* Advance the timer back-off. */
    451 	sc->sc_rxtshift++;
    452 	syn_cache_timer_arm(sc);
    453 
    454 	goto out;
    455 
    456  dropit:
    457 	TCP_STATINC(TCP_STAT_SC_TIMED_OUT);
    458 	syn_cache_rm(sc);
    459 	if (sc->sc_ipopts)
    460 		(void) m_free(sc->sc_ipopts);
    461 	rtcache_free(&sc->sc_route);
    462 
    463  free:
    464 	callout_destroy(&sc->sc_timer);
    465 	pool_put(&syn_cache_pool, sc);
    466 
    467  out:
    468 	KERNEL_UNLOCK_ONE(NULL);
    469 	mutex_exit(softnet_lock);
    470 }
    471 
    472 /*
    473  * Remove syn cache created by the specified tcb entry,
    474  * because this does not make sense to keep them
    475  * (if there's no tcb entry, syn cache entry will never be used)
    476  */
    477 void
    478 syn_cache_cleanup(struct tcpcb *tp)
    479 {
    480 	struct syn_cache *sc, *nsc;
    481 	int s;
    482 
    483 	s = splsoftnet();
    484 
    485 	for (sc = LIST_FIRST(&tp->t_sc); sc != NULL; sc = nsc) {
    486 		nsc = LIST_NEXT(sc, sc_tpq);
    487 
    488 #ifdef DIAGNOSTIC
    489 		if (sc->sc_tp != tp)
    490 			panic("invalid sc_tp in syn_cache_cleanup");
    491 #endif
    492 		syn_cache_rm(sc);
    493 		syn_cache_put(sc);	/* calls pool_put but see spl above */
    494 	}
    495 	/* just for safety */
    496 	LIST_INIT(&tp->t_sc);
    497 
    498 	splx(s);
    499 }
    500 
    501 /*
    502  * Find an entry in the syn cache.
    503  */
    504 static struct syn_cache *
    505 syn_cache_lookup(const struct sockaddr *src, const struct sockaddr *dst,
    506     struct syn_cache_head **headp)
    507 {
    508 	struct syn_cache *sc;
    509 	struct syn_cache_head *scp;
    510 	u_int32_t hash;
    511 	int s;
    512 
    513 	SYN_HASHALL(hash, src, dst);
    514 
    515 	scp = &tcp_syn_cache[hash % tcp_syn_cache_size];
    516 	*headp = scp;
    517 	s = splsoftnet();
    518 	for (sc = TAILQ_FIRST(&scp->sch_bucket); sc != NULL;
    519 	     sc = TAILQ_NEXT(sc, sc_bucketq)) {
    520 		if (sc->sc_hash != hash)
    521 			continue;
    522 		if (!memcmp(&sc->sc_src, src, src->sa_len) &&
    523 		    !memcmp(&sc->sc_dst, dst, dst->sa_len)) {
    524 			splx(s);
    525 			return (sc);
    526 		}
    527 	}
    528 	splx(s);
    529 	return (NULL);
    530 }
    531 
    532 /*
    533  * This function gets called when we receive an ACK for a socket in the
    534  * LISTEN state. We look up the connection in the syn cache, and if it's
    535  * there, we pull it out of the cache and turn it into a full-blown
    536  * connection in the SYN-RECEIVED state.
    537  *
    538  * The return values may not be immediately obvious, and their effects
    539  * can be subtle, so here they are:
    540  *
    541  *	NULL	SYN was not found in cache; caller should drop the
    542  *		packet and send an RST.
    543  *
    544  *	-1	We were unable to create the new connection, and are
    545  *		aborting it.  An ACK,RST is being sent to the peer
    546  *		(unless we got screwey sequence numbers; see below),
    547  *		because the 3-way handshake has been completed.  Caller
    548  *		should not free the mbuf, since we may be using it.  If
    549  *		we are not, we will free it.
    550  *
    551  *	Otherwise, the return value is a pointer to the new socket
    552  *	associated with the connection.
    553  */
    554 struct socket *
    555 syn_cache_get(struct sockaddr *src, struct sockaddr *dst,
    556     struct tcphdr *th, struct socket *so, struct mbuf *m)
    557 {
    558 	struct syn_cache *sc;
    559 	struct syn_cache_head *scp;
    560 	struct inpcb *inp = NULL;
    561 #ifdef INET6
    562 	struct in6pcb *in6p = NULL;
    563 #endif
    564 	struct tcpcb *tp;
    565 	int s;
    566 	struct socket *oso;
    567 
    568 	s = splsoftnet();
    569 	if ((sc = syn_cache_lookup(src, dst, &scp)) == NULL) {
    570 		splx(s);
    571 		return NULL;
    572 	}
    573 
    574 	/*
    575 	 * Verify the sequence and ack numbers.  Try getting the correct
    576 	 * response again.
    577 	 */
    578 	if ((th->th_ack != sc->sc_iss + 1) ||
    579 	    SEQ_LEQ(th->th_seq, sc->sc_irs) ||
    580 	    SEQ_GT(th->th_seq, sc->sc_irs + 1 + sc->sc_win)) {
    581 		m_freem(m);
    582 		(void)syn_cache_respond(sc);
    583 		splx(s);
    584 		return ((struct socket *)(-1));
    585 	}
    586 
    587 	/* Remove this cache entry */
    588 	syn_cache_rm(sc);
    589 	splx(s);
    590 
    591 	/*
    592 	 * Ok, create the full blown connection, and set things up
    593 	 * as they would have been set up if we had created the
    594 	 * connection when the SYN arrived.  If we can't create
    595 	 * the connection, abort it.
    596 	 */
    597 	/*
    598 	 * inp still has the OLD in_pcb stuff, set the
    599 	 * v6-related flags on the new guy, too.   This is
    600 	 * done particularly for the case where an AF_INET6
    601 	 * socket is bound only to a port, and a v4 connection
    602 	 * comes in on that port.
    603 	 * we also copy the flowinfo from the original pcb
    604 	 * to the new one.
    605 	 */
    606 	oso = so;
    607 	so = sonewconn(so, true);
    608 	if (so == NULL)
    609 		goto resetandabort;
    610 
    611 	switch (so->so_proto->pr_domain->dom_family) {
    612 	case AF_INET:
    613 		inp = sotoinpcb(so);
    614 		break;
    615 #ifdef INET6
    616 	case AF_INET6:
    617 		in6p = sotoin6pcb(so);
    618 		break;
    619 #endif
    620 	}
    621 
    622 	switch (src->sa_family) {
    623 	case AF_INET:
    624 		if (inp) {
    625 			inp->inp_laddr = ((struct sockaddr_in *)dst)->sin_addr;
    626 			inp->inp_lport = ((struct sockaddr_in *)dst)->sin_port;
    627 			inp->inp_options = ip_srcroute(m);
    628 			in_pcbstate(inp, INP_BOUND);
    629 			if (inp->inp_options == NULL) {
    630 				inp->inp_options = sc->sc_ipopts;
    631 				sc->sc_ipopts = NULL;
    632 			}
    633 		}
    634 #ifdef INET6
    635 		else if (in6p) {
    636 			/* IPv4 packet to AF_INET6 socket */
    637 			memset(&in6p->in6p_laddr, 0, sizeof(in6p->in6p_laddr));
    638 			in6p->in6p_laddr.s6_addr16[5] = htons(0xffff);
    639 			bcopy(&((struct sockaddr_in *)dst)->sin_addr,
    640 				&in6p->in6p_laddr.s6_addr32[3],
    641 				sizeof(((struct sockaddr_in *)dst)->sin_addr));
    642 			in6p->in6p_lport = ((struct sockaddr_in *)dst)->sin_port;
    643 			in6totcpcb(in6p)->t_family = AF_INET;
    644 			if (sotoin6pcb(oso)->in6p_flags & IN6P_IPV6_V6ONLY)
    645 				in6p->in6p_flags |= IN6P_IPV6_V6ONLY;
    646 			else
    647 				in6p->in6p_flags &= ~IN6P_IPV6_V6ONLY;
    648 			in6_pcbstate(in6p, IN6P_BOUND);
    649 		}
    650 #endif
    651 		break;
    652 #ifdef INET6
    653 	case AF_INET6:
    654 		if (in6p) {
    655 			in6p->in6p_laddr = ((struct sockaddr_in6 *)dst)->sin6_addr;
    656 			in6p->in6p_lport = ((struct sockaddr_in6 *)dst)->sin6_port;
    657 			in6_pcbstate(in6p, IN6P_BOUND);
    658 		}
    659 		break;
    660 #endif
    661 	}
    662 
    663 #ifdef INET6
    664 	if (in6p && in6totcpcb(in6p)->t_family == AF_INET6 && sotoinpcb(oso)) {
    665 		struct in6pcb *oin6p = sotoin6pcb(oso);
    666 		/* inherit socket options from the listening socket */
    667 		in6p->in6p_flags |= (oin6p->in6p_flags & IN6P_CONTROLOPTS);
    668 		if (in6p->in6p_flags & IN6P_CONTROLOPTS) {
    669 			m_freem(in6p->in6p_options);
    670 			in6p->in6p_options = NULL;
    671 		}
    672 		ip6_savecontrol(in6p, &in6p->in6p_options,
    673 		    mtod(m, struct ip6_hdr *), m);
    674 	}
    675 #endif
    676 
    677 	/*
    678 	 * Give the new socket our cached route reference.
    679 	 */
    680 	if (inp) {
    681 		rtcache_copy(&inp->inp_route, &sc->sc_route);
    682 		rtcache_free(&sc->sc_route);
    683 	}
    684 #ifdef INET6
    685 	else {
    686 		rtcache_copy(&in6p->in6p_route, &sc->sc_route);
    687 		rtcache_free(&sc->sc_route);
    688 	}
    689 #endif
    690 
    691 	if (inp) {
    692 		struct sockaddr_in sin;
    693 		memcpy(&sin, src, src->sa_len);
    694 		if (in_pcbconnect(inp, &sin, &lwp0)) {
    695 			goto resetandabort;
    696 		}
    697 	}
    698 #ifdef INET6
    699 	else if (in6p) {
    700 		struct sockaddr_in6 sin6;
    701 		memcpy(&sin6, src, src->sa_len);
    702 		if (src->sa_family == AF_INET) {
    703 			/* IPv4 packet to AF_INET6 socket */
    704 			in6_sin_2_v4mapsin6((struct sockaddr_in *)src, &sin6);
    705 		}
    706 		if (in6_pcbconnect(in6p, &sin6, NULL)) {
    707 			goto resetandabort;
    708 		}
    709 	}
    710 #endif
    711 	else {
    712 		goto resetandabort;
    713 	}
    714 
    715 	if (inp)
    716 		tp = intotcpcb(inp);
    717 #ifdef INET6
    718 	else if (in6p)
    719 		tp = in6totcpcb(in6p);
    720 #endif
    721 	else
    722 		tp = NULL;
    723 
    724 	tp->t_flags = sototcpcb(oso)->t_flags & TF_NODELAY;
    725 	if (sc->sc_request_r_scale != 15) {
    726 		tp->requested_s_scale = sc->sc_requested_s_scale;
    727 		tp->request_r_scale = sc->sc_request_r_scale;
    728 		tp->snd_scale = sc->sc_requested_s_scale;
    729 		tp->rcv_scale = sc->sc_request_r_scale;
    730 		tp->t_flags |= TF_REQ_SCALE|TF_RCVD_SCALE;
    731 	}
    732 	if (sc->sc_flags & SCF_TIMESTAMP)
    733 		tp->t_flags |= TF_REQ_TSTMP|TF_RCVD_TSTMP;
    734 	tp->ts_timebase = sc->sc_timebase;
    735 
    736 	tp->t_template = tcp_template(tp);
    737 	if (tp->t_template == 0) {
    738 		tp = tcp_drop(tp, ENOBUFS);	/* destroys socket */
    739 		so = NULL;
    740 		m_freem(m);
    741 		goto abort;
    742 	}
    743 
    744 	tp->iss = sc->sc_iss;
    745 	tp->irs = sc->sc_irs;
    746 	tcp_sendseqinit(tp);
    747 	tcp_rcvseqinit(tp);
    748 	tp->t_state = TCPS_SYN_RECEIVED;
    749 	TCP_TIMER_ARM(tp, TCPT_KEEP, tp->t_keepinit);
    750 	TCP_STATINC(TCP_STAT_ACCEPTS);
    751 
    752 	if ((sc->sc_flags & SCF_SACK_PERMIT) && tcp_do_sack)
    753 		tp->t_flags |= TF_WILL_SACK;
    754 
    755 	if ((sc->sc_flags & SCF_ECN_PERMIT) && tcp_do_ecn)
    756 		tp->t_flags |= TF_ECN_PERMIT;
    757 
    758 #ifdef TCP_SIGNATURE
    759 	if (sc->sc_flags & SCF_SIGNATURE)
    760 		tp->t_flags |= TF_SIGNATURE;
    761 #endif
    762 
    763 	/* Initialize tp->t_ourmss before we deal with the peer's! */
    764 	tp->t_ourmss = sc->sc_ourmaxseg;
    765 	tcp_mss_from_peer(tp, sc->sc_peermaxseg);
    766 
    767 	/*
    768 	 * Initialize the initial congestion window.  If we
    769 	 * had to retransmit the SYN,ACK, we must initialize cwnd
    770 	 * to 1 segment (i.e. the Loss Window).
    771 	 */
    772 	if (sc->sc_rxtshift)
    773 		tp->snd_cwnd = tp->t_peermss;
    774 	else {
    775 		int ss = tcp_init_win;
    776 		if (inp != NULL && in_localaddr(inp->inp_faddr))
    777 			ss = tcp_init_win_local;
    778 #ifdef INET6
    779 		if (in6p != NULL && in6_localaddr(&in6p->in6p_faddr))
    780 			ss = tcp_init_win_local;
    781 #endif
    782 		tp->snd_cwnd = TCP_INITIAL_WINDOW(ss, tp->t_peermss);
    783 	}
    784 
    785 	tcp_rmx_rtt(tp);
    786 	tp->snd_wl1 = sc->sc_irs;
    787 	tp->rcv_up = sc->sc_irs + 1;
    788 
    789 	/*
    790 	 * This is what would have happened in tcp_output() when
    791 	 * the SYN,ACK was sent.
    792 	 */
    793 	tp->snd_up = tp->snd_una;
    794 	tp->snd_max = tp->snd_nxt = tp->iss+1;
    795 	TCP_TIMER_ARM(tp, TCPT_REXMT, tp->t_rxtcur);
    796 	if (sc->sc_win > 0 && SEQ_GT(tp->rcv_nxt + sc->sc_win, tp->rcv_adv))
    797 		tp->rcv_adv = tp->rcv_nxt + sc->sc_win;
    798 	tp->last_ack_sent = tp->rcv_nxt;
    799 	tp->t_partialacks = -1;
    800 	tp->t_dupacks = 0;
    801 
    802 	TCP_STATINC(TCP_STAT_SC_COMPLETED);
    803 	s = splsoftnet();
    804 	syn_cache_put(sc);
    805 	splx(s);
    806 	return so;
    807 
    808 resetandabort:
    809 	(void)tcp_respond(NULL, m, m, th, (tcp_seq)0, th->th_ack, TH_RST);
    810 abort:
    811 	if (so != NULL) {
    812 		(void) soqremque(so, 1);
    813 		(void) soabort(so);
    814 		mutex_enter(softnet_lock);
    815 	}
    816 	s = splsoftnet();
    817 	syn_cache_put(sc);
    818 	splx(s);
    819 	TCP_STATINC(TCP_STAT_SC_ABORTED);
    820 	return ((struct socket *)(-1));
    821 }
    822 
    823 /*
    824  * This function is called when we get a RST for a
    825  * non-existent connection, so that we can see if the
    826  * connection is in the syn cache.  If it is, zap it.
    827  */
    828 
    829 void
    830 syn_cache_reset(struct sockaddr *src, struct sockaddr *dst, struct tcphdr *th)
    831 {
    832 	struct syn_cache *sc;
    833 	struct syn_cache_head *scp;
    834 	int s = splsoftnet();
    835 
    836 	if ((sc = syn_cache_lookup(src, dst, &scp)) == NULL) {
    837 		splx(s);
    838 		return;
    839 	}
    840 	if (SEQ_LT(th->th_seq, sc->sc_irs) ||
    841 	    SEQ_GT(th->th_seq, sc->sc_irs+1)) {
    842 		splx(s);
    843 		return;
    844 	}
    845 	syn_cache_rm(sc);
    846 	TCP_STATINC(TCP_STAT_SC_RESET);
    847 	syn_cache_put(sc);	/* calls pool_put but see spl above */
    848 	splx(s);
    849 }
    850 
    851 void
    852 syn_cache_unreach(const struct sockaddr *src, const struct sockaddr *dst,
    853     struct tcphdr *th)
    854 {
    855 	struct syn_cache *sc;
    856 	struct syn_cache_head *scp;
    857 	int s;
    858 
    859 	s = splsoftnet();
    860 	if ((sc = syn_cache_lookup(src, dst, &scp)) == NULL) {
    861 		splx(s);
    862 		return;
    863 	}
    864 	/* If the sequence number != sc_iss, then it's a bogus ICMP msg */
    865 	if (ntohl(th->th_seq) != sc->sc_iss) {
    866 		splx(s);
    867 		return;
    868 	}
    869 
    870 	/*
    871 	 * If we've retransmitted 3 times and this is our second error,
    872 	 * we remove the entry.  Otherwise, we allow it to continue on.
    873 	 * This prevents us from incorrectly nuking an entry during a
    874 	 * spurious network outage.
    875 	 *
    876 	 * See tcp_notify().
    877 	 */
    878 	if ((sc->sc_flags & SCF_UNREACH) == 0 || sc->sc_rxtshift < 3) {
    879 		sc->sc_flags |= SCF_UNREACH;
    880 		splx(s);
    881 		return;
    882 	}
    883 
    884 	syn_cache_rm(sc);
    885 	TCP_STATINC(TCP_STAT_SC_UNREACH);
    886 	syn_cache_put(sc);	/* calls pool_put but see spl above */
    887 	splx(s);
    888 }
    889 
    890 /*
    891  * Given a LISTEN socket and an inbound SYN request, add this to the syn
    892  * cache, and send back a segment:
    893  *	<SEQ=ISS><ACK=RCV_NXT><CTL=SYN,ACK>
    894  * to the source.
    895  *
    896  * IMPORTANT NOTE: We do _NOT_ ACK data that might accompany the SYN.
    897  * Doing so would require that we hold onto the data and deliver it
    898  * to the application.  However, if we are the target of a SYN-flood
    899  * DoS attack, an attacker could send data which would eventually
    900  * consume all available buffer space if it were ACKed.  By not ACKing
    901  * the data, we avoid this DoS scenario.
    902  */
int
syn_cache_add(struct sockaddr *src, struct sockaddr *dst, struct tcphdr *th,
    unsigned int toff, struct socket *so, struct mbuf *m, u_char *optp,
    int optlen, struct tcp_opt_info *oi)
{
	struct tcpcb tb, *tp;	/* tb: scratch tcpcb used only to parse options */
	long win;
	struct syn_cache *sc;
	struct syn_cache_head *scp;
	struct mbuf *ipopts;
	int s;

	/*
	 * Returns 1 when the SYN was consumed (m freed here), 0 when the
	 * caller retains ownership of m (bad options, or allocation
	 * failure below).
	 */
	tp = sototcpcb(so);

	/*
	 * Initialize some local state.
	 *
	 * Clamp the advertised receive window to the unscaled maximum;
	 * the window field of a SYN,ACK is never scaled (RFC 1323).
	 */
	win = sbspace(&so->so_rcv);
	if (win > TCP_MAXWIN)
		win = TCP_MAXWIN;

	/*
	 * Parse the peer's TCP options into the scratch tcpcb "tb";
	 * tcp_dooptions() records its results (timestamp, window scale,
	 * SACK-permitted, MSS) in tb's flags and the tcp_opt_info.
	 */
#ifdef TCP_SIGNATURE
	if (optp || (tp->t_flags & TF_SIGNATURE))
#else
	if (optp)
#endif
	{
		tb.t_flags = tcp_do_rfc1323 ? (TF_REQ_SCALE|TF_REQ_TSTMP) : 0;
#ifdef TCP_SIGNATURE
		tb.t_flags |= (tp->t_flags & TF_SIGNATURE);
#endif
		tb.t_state = TCPS_LISTEN;
		if (tcp_dooptions(&tb, optp, optlen, th, m, toff, oi) < 0)
			return 0;	/* malformed options: reject the SYN */
	} else
		tb.t_flags = 0;

	switch (src->sa_family) {
	case AF_INET:
		/* Remember the IP options, if any. */
		ipopts = ip_srcroute(m);
		break;
	default:
		ipopts = NULL;
	}

	/*
	 * See if we already have an entry for this connection.
	 * If we do, resend the SYN,ACK.  We do not count this
	 * as a retransmission (XXX though maybe we should).
	 */
	if ((sc = syn_cache_lookup(src, dst, &scp)) != NULL) {
		TCP_STATINC(TCP_STAT_SC_DUPESYN);
		if (ipopts) {
			/*
			 * If we were remembering a previous source route,
			 * forget it and use the new one we've been given.
			 */
			if (sc->sc_ipopts)
				(void)m_free(sc->sc_ipopts);
			sc->sc_ipopts = ipopts;
		}
		/* Refresh the cached timestamp from this (duplicate) SYN. */
		sc->sc_timestamp = tb.ts_recent;
		m_freem(m);
		if (syn_cache_respond(sc) == 0) {
			uint64_t *tcps = TCP_STAT_GETREF();
			tcps[TCP_STAT_SNDACKS]++;
			tcps[TCP_STAT_SNDTOTAL]++;
			TCP_STAT_PUTREF();
		}
		return 1;
	}

	/* Allocate a new cache entry; PR_NOWAIT since we are in softint. */
	s = splsoftnet();
	sc = pool_get(&syn_cache_pool, PR_NOWAIT);
	splx(s);
	if (sc == NULL) {
		/* Out of entries: drop the remembered source route too. */
		if (ipopts)
			(void)m_free(ipopts);
		return 0;
	}

	/*
	 * Fill in the cache, and put the necessary IP and TCP
	 * options into the reply.
	 */
	memset(sc, 0, sizeof(struct syn_cache));
	callout_init(&sc->sc_timer, CALLOUT_MPSAFE);
	memcpy(&sc->sc_src, src, src->sa_len);
	memcpy(&sc->sc_dst, dst, dst->sa_len);
	sc->sc_flags = 0;
	sc->sc_ipopts = ipopts;
	sc->sc_irs = th->th_seq;	/* peer's initial sequence number */
	/* Generate our ISS from the address/port 4-tuple. */
	switch (src->sa_family) {
	case AF_INET:
	    {
		struct sockaddr_in *srcin = (void *)src;
		struct sockaddr_in *dstin = (void *)dst;

		sc->sc_iss = tcp_new_iss1(&dstin->sin_addr,
		    &srcin->sin_addr, dstin->sin_port,
		    srcin->sin_port, sizeof(dstin->sin_addr));
		break;
	    }
#ifdef INET6
	case AF_INET6:
	    {
		struct sockaddr_in6 *srcin6 = (void *)src;
		struct sockaddr_in6 *dstin6 = (void *)dst;

		sc->sc_iss = tcp_new_iss1(&dstin6->sin6_addr,
		    &srcin6->sin6_addr, dstin6->sin6_port,
		    srcin6->sin6_port, sizeof(dstin6->sin6_addr));
		break;
	    }
#endif
	}
	sc->sc_peermaxseg = oi->maxseg;
	sc->sc_ourmaxseg = tcp_mss_to_advertise(m->m_flags & M_PKTHDR ?
	    m_get_rcvif_NOMPSAFE(m) : NULL, sc->sc_src.sa.sa_family);
	sc->sc_win = win;
	sc->sc_timebase = tcp_now - 1;	/* see tcp_newtcpcb() */
	sc->sc_timestamp = tb.ts_recent;
	/* Timestamps are used only if both sides requested them. */
	if ((tb.t_flags & (TF_REQ_TSTMP|TF_RCVD_TSTMP)) ==
	    (TF_REQ_TSTMP|TF_RCVD_TSTMP))
		sc->sc_flags |= SCF_TIMESTAMP;
	if ((tb.t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
	    (TF_RCVD_SCALE|TF_REQ_SCALE)) {
		sc->sc_requested_s_scale = tb.requested_s_scale;
		sc->sc_request_r_scale = 0;
		/*
		 * Pick the smallest possible scaling factor that
		 * will still allow us to scale up to sb_max.
		 *
		 * We do this because there are broken firewalls that
		 * will corrupt the window scale option, leading to
		 * the other endpoint believing that our advertised
		 * window is unscaled.  At scale factors larger than
		 * 5 the unscaled window will drop below 1500 bytes,
		 * leading to serious problems when traversing these
		 * broken firewalls.
		 *
		 * With the default sbmax of 256K, a scale factor
		 * of 3 will be chosen by this algorithm.  Those who
		 * choose a larger sbmax should watch out
		 * for the compatibility problems mentioned above.
		 *
		 * RFC1323: The Window field in a SYN (i.e., a <SYN>
		 * or <SYN,ACK>) segment itself is never scaled.
		 */
		while (sc->sc_request_r_scale < TCP_MAX_WINSHIFT &&
		    (TCP_MAXWIN << sc->sc_request_r_scale) < sb_max)
			sc->sc_request_r_scale++;
	} else {
		/* 15 marks window scaling as "not in use" (see respond). */
		sc->sc_requested_s_scale = 15;
		sc->sc_request_r_scale = 15;
	}
	if ((tb.t_flags & TF_SACK_PERMIT) && tcp_do_sack)
		sc->sc_flags |= SCF_SACK_PERMIT;

	/*
	 * ECN setup packet received.
	 */
	if ((th->th_flags & (TH_ECE|TH_CWR)) && tcp_do_ecn)
		sc->sc_flags |= SCF_ECN_PERMIT;

#ifdef TCP_SIGNATURE
	if (tb.t_flags & TF_SIGNATURE)
		sc->sc_flags |= SCF_SIGNATURE;
#endif
	sc->sc_tp = tp;
	m_freem(m);
	/* Send the SYN,ACK; only insert the entry if the send succeeded. */
	if (syn_cache_respond(sc) == 0) {
		uint64_t *tcps = TCP_STAT_GETREF();
		tcps[TCP_STAT_SNDACKS]++;
		tcps[TCP_STAT_SNDTOTAL]++;
		TCP_STAT_PUTREF();
		syn_cache_insert(sc, tp);
	} else {
		s = splsoftnet();
		/*
		 * syn_cache_put() will try to schedule the timer, so
		 * we need to initialize it
		 */
		syn_cache_timer_arm(sc);
		syn_cache_put(sc);
		splx(s);
		TCP_STATINC(TCP_STAT_SC_DROPPED);
	}
	return 1;
}
   1094 
   1095 /*
   1096  * syn_cache_respond: (re)send SYN+ACK.
   1097  *
   1098  * Returns 0 on success.
   1099  */
   1100 
static int
syn_cache_respond(struct syn_cache *sc)
{
#ifdef INET6
	struct rtentry *rt = NULL;
#endif
	struct route *ro;
	u_int8_t *optp;		/* write cursor into the TCP option area */
	int optlen, error;
	u_int16_t tlen;		/* total IP+TCP+options length */
	struct ip *ip = NULL;
#ifdef INET6
	struct ip6_hdr *ip6 = NULL;
#endif
	struct tcpcb *tp;
	struct tcphdr *th;
	struct mbuf *m;
	u_int hlen;		/* network-layer header length */
#ifdef TCP_SIGNATURE
	struct secasvar *sav = NULL;
	u_int8_t *sigp = NULL;
#endif

	/*
	 * Builds a SYN,ACK from scratch for the cached entry and hands
	 * it to ip_output()/ip6_output().  Returns 0 on success, else
	 * an errno (EAFNOSUPPORT, ENOBUFS, EPERM, or the output error).
	 */
	ro = &sc->sc_route;
	switch (sc->sc_src.sa.sa_family) {
	case AF_INET:
		hlen = sizeof(struct ip);
		break;
#ifdef INET6
	case AF_INET6:
		hlen = sizeof(struct ip6_hdr);
		break;
#endif
	default:
		return EAFNOSUPPORT;
	}

	/* Worst case scenario, since we don't know the option size yet. */
	tlen = hlen + sizeof(struct tcphdr) + MAX_TCPOPTLEN;
	KASSERT(max_linkhdr + tlen <= MCLBYTES);

	/*
	 * Create the IP+TCP header from scratch.
	 * Use a cluster if the worst-case packet won't fit in an mbuf
	 * header alone.
	 */
	MGETHDR(m, M_DONTWAIT, MT_DATA);
	if (m && (max_linkhdr + tlen) > MHLEN) {
		MCLGET(m, M_DONTWAIT);
		if ((m->m_flags & M_EXT) == 0) {
			m_freem(m);
			m = NULL;
		}
	}
	if (m == NULL)
		return ENOBUFS;
	MCLAIM(m, &tcp_tx_mowner);

	tp = sc->sc_tp;

	/* Fixup the mbuf. */
	m->m_data += max_linkhdr;	/* leave room for the link header */
	m_reset_rcvif(m);
	memset(mtod(m, void *), 0, tlen);

	/*
	 * Fill in the network header and the TCP ports.  Source and
	 * destination are swapped relative to the cached SYN since we
	 * are replying.
	 */
	switch (sc->sc_src.sa.sa_family) {
	case AF_INET:
		ip = mtod(m, struct ip *);
		ip->ip_v = 4;
		ip->ip_dst = sc->sc_src.sin.sin_addr;
		ip->ip_src = sc->sc_dst.sin.sin_addr;
		ip->ip_p = IPPROTO_TCP;
		th = (struct tcphdr *)(ip + 1);
		th->th_dport = sc->sc_src.sin.sin_port;
		th->th_sport = sc->sc_dst.sin.sin_port;
		break;
#ifdef INET6
	case AF_INET6:
		ip6 = mtod(m, struct ip6_hdr *);
		ip6->ip6_vfc = IPV6_VERSION;
		ip6->ip6_dst = sc->sc_src.sin6.sin6_addr;
		ip6->ip6_src = sc->sc_dst.sin6.sin6_addr;
		ip6->ip6_nxt = IPPROTO_TCP;
		/* ip6_plen will be updated in ip6_output() */
		th = (struct tcphdr *)(ip6 + 1);
		th->th_dport = sc->sc_src.sin6.sin6_port;
		th->th_sport = sc->sc_dst.sin6.sin6_port;
		break;
#endif
	default:
		panic("%s: impossible (1)", __func__);
	}

	th->th_seq = htonl(sc->sc_iss);
	th->th_ack = htonl(sc->sc_irs + 1);	/* ACK the SYN only, not data */
	th->th_flags = TH_SYN|TH_ACK;
	th->th_win = htons(sc->sc_win);
	/* th_x2, th_sum, th_urp already 0 from memset */

	/* Tack on the TCP options.  MSS is always sent first. */
	optp = (u_int8_t *)(th + 1);
	optlen = 0;
	*optp++ = TCPOPT_MAXSEG;
	*optp++ = TCPOLEN_MAXSEG;
	*optp++ = (sc->sc_ourmaxseg >> 8) & 0xff;
	*optp++ = sc->sc_ourmaxseg & 0xff;
	optlen += TCPOLEN_MAXSEG;

	/* 15 means window scaling is not in use (see syn_cache_add()). */
	if (sc->sc_request_r_scale != 15) {
		/* NOP + WINDOW + len + shift, written as one 32-bit word. */
		*((u_int32_t *)optp) = htonl(TCPOPT_NOP << 24 |
		    TCPOPT_WINDOW << 16 | TCPOLEN_WINDOW << 8 |
		    sc->sc_request_r_scale);
		optp += TCPOLEN_WINDOW + TCPOLEN_NOP;
		optlen += TCPOLEN_WINDOW + TCPOLEN_NOP;
	}

	if (sc->sc_flags & SCF_SACK_PERMIT) {
		/* Let the peer know that we will SACK. */
		*optp++ = TCPOPT_SACK_PERMITTED;
		*optp++ = TCPOLEN_SACK_PERMITTED;
		optlen += TCPOLEN_SACK_PERMITTED;
	}

	if (sc->sc_flags & SCF_TIMESTAMP) {
		/* Pad so the 8-byte timestamp payload lands 4-byte aligned. */
		while (optlen % 4 != 2) {
			optlen += TCPOLEN_NOP;
			*optp++ = TCPOPT_NOP;
		}
		*optp++ = TCPOPT_TIMESTAMP;
		*optp++ = TCPOLEN_TIMESTAMP;
		u_int32_t *lp = (u_int32_t *)(optp);
		/* Form timestamp option as shown in appendix A of RFC 1323. */
		*lp++ = htonl(SYN_CACHE_TIMESTAMP(sc));
		*lp   = htonl(sc->sc_timestamp);	/* echo peer's TSval */
		optp += TCPOLEN_TIMESTAMP - 2;
		optlen += TCPOLEN_TIMESTAMP;
	}

#ifdef TCP_SIGNATURE
	if (sc->sc_flags & SCF_SIGNATURE) {
		sav = tcp_signature_getsav(m);
		if (sav == NULL) {
			m_freem(m);
			return EPERM;
		}

		/* Reserve space now; the MD5 digest is filled in below. */
		*optp++ = TCPOPT_SIGNATURE;
		*optp++ = TCPOLEN_SIGNATURE;
		sigp = optp;
		memset(optp, 0, TCP_SIGLEN);
		optp += TCP_SIGLEN;
		optlen += TCPOLEN_SIGNATURE;
	}
#endif

	/*
	 * Terminate and pad TCP options to a 4 byte boundary.
	 *
	 * According to RFC793: "The content of the header beyond the
	 * End-of-Option option must be header padding (i.e., zero)."
	 * And later: "The padding is composed of zeros."
	 */
	if (optlen % 4) {
		optlen += TCPOLEN_EOL;
		*optp++ = TCPOPT_EOL;
	}
	while (optlen % 4) {
		optlen += TCPOLEN_PAD;
		*optp++ = TCPOPT_PAD;
	}

	/* Compute the actual values now that we've added the options. */
	tlen = hlen + sizeof(struct tcphdr) + optlen;
	m->m_len = m->m_pkthdr.len = tlen;
	th->th_off = (sizeof(struct tcphdr) + optlen) >> 2;

#ifdef TCP_SIGNATURE
	if (sav) {
		(void)tcp_signature(m, th, hlen, sav, sigp);
		key_sa_recordxfer(sav, m);
		KEY_SA_UNREF(&sav);
	}
#endif

	/*
	 * Send ECN SYN-ACK setup packet.
	 * Routes can be asymmetric, so, even if we receive a packet
	 * with ECE and CWR set, we must not assume no one will block
	 * the ECE packet we are about to send.
	 */
	if ((sc->sc_flags & SCF_ECN_PERMIT) && tp &&
	    SEQ_GEQ(tp->snd_nxt, tp->snd_max)) {
		th->th_flags |= TH_ECE;
		TCP_STATINC(TCP_STAT_ECN_SHS);

		/*
		 * draft-ietf-tcpm-ecnsyn-00.txt
		 *
		 * "[...] a TCP node MAY respond to an ECN-setup
		 * SYN packet by setting ECT in the responding
		 * ECN-setup SYN/ACK packet, indicating to routers
		 * that the SYN/ACK packet is ECN-Capable.
		 * This allows a congested router along the path
		 * to mark the packet instead of dropping the
		 * packet as an indication of congestion."
		 *
		 * "[...] There can be a great benefit in setting
		 * an ECN-capable codepoint in SYN/ACK packets [...]
		 * Congestion is  most likely to occur in
		 * the server-to-client direction.  As a result,
		 * setting an ECN-capable codepoint in SYN/ACK
		 * packets can reduce the occurrence of three-second
		 * retransmit timeouts resulting from the drop
		 * of SYN/ACK packets."
		 *
		 * Page 4 and 6, January 2006.
		 */

		switch (sc->sc_src.sa.sa_family) {
		case AF_INET:
			ip->ip_tos |= IPTOS_ECN_ECT0;
			break;
#ifdef INET6
		case AF_INET6:
			ip6->ip6_flow |= htonl(IPTOS_ECN_ECT0 << 20);
			break;
#endif
		}
		TCP_STATINC(TCP_STAT_ECN_ECT);
	}


	/*
	 * Compute the packet's checksum.
	 *
	 * Fill in some straggling IP bits.  Note the stack expects
	 * ip_len to be in host order, for convenience.
	 */
	switch (sc->sc_src.sa.sa_family) {
	case AF_INET:
		/* Pseudo-header checksum wants the TCP length first. */
		ip->ip_len = htons(tlen - hlen);
		th->th_sum = 0;
		th->th_sum = in4_cksum(m, IPPROTO_TCP, hlen, tlen - hlen);
		ip->ip_len = htons(tlen);	/* now the full datagram length */
		ip->ip_ttl = ip_defttl;
		/* XXX tos? */
		break;
#ifdef INET6
	case AF_INET6:
		ip6->ip6_plen = htons(tlen - hlen);
		th->th_sum = 0;
		th->th_sum = in6_cksum(m, IPPROTO_TCP, hlen, tlen - hlen);
		ip6->ip6_vfc &= ~IPV6_VERSION_MASK;
		ip6->ip6_vfc |= IPV6_VERSION;
		/* Same value as above; ip6_output() rewrites it anyway. */
		ip6->ip6_plen = htons(tlen - hlen);
		/* ip6_hlim will be initialized afterwards */
		/* XXX flowlabel? */
		break;
#endif
	}

	/* XXX use IPsec policy on listening socket, on SYN ACK */
	tp = sc->sc_tp;

	switch (sc->sc_src.sa.sa_family) {
	case AF_INET:
		error = ip_output(m, sc->sc_ipopts, ro,
		    (ip_mtudisc ? IP_MTUDISC : 0),
		    NULL, tp ? tp->t_inpcb : NULL);
		break;
#ifdef INET6
	case AF_INET6:
		/* Pick a hop limit from the (cached) outgoing interface. */
		ip6->ip6_hlim = in6_selecthlim(NULL,
		    (rt = rtcache_validate(ro)) != NULL ? rt->rt_ifp : NULL);
		rtcache_unref(rt, ro);

		error = ip6_output(m, NULL /*XXX*/, ro, 0, NULL,
		    tp ? tp->t_in6pcb : NULL, NULL);
		break;
#endif
	default:
		panic("%s: impossible (2)", __func__);
	}

	return error;
}
   1385