tcp_syncache.c revision 1.2 1 /* $NetBSD: tcp_syncache.c,v 1.2 2022/09/20 10:12:18 ozaki-r Exp $ */
2
3 /*
4 * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
5 * All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in the
14 * documentation and/or other materials provided with the distribution.
15 * 3. Neither the name of the project nor the names of its contributors
16 * may be used to endorse or promote products derived from this software
17 * without specific prior written permission.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
20 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22 * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
23 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
25 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
26 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
29 * SUCH DAMAGE.
30 */
31
32 /*
33 * @(#)COPYRIGHT 1.1 (NRL) 17 January 1995
34 *
35 * NRL grants permission for redistribution and use in source and binary
36 * forms, with or without modification, of the software and documentation
37 * created at NRL provided that the following conditions are met:
38 *
39 * 1. Redistributions of source code must retain the above copyright
40 * notice, this list of conditions and the following disclaimer.
41 * 2. Redistributions in binary form must reproduce the above copyright
42 * notice, this list of conditions and the following disclaimer in the
43 * documentation and/or other materials provided with the distribution.
44 * 3. All advertising materials mentioning features or use of this software
45 * must display the following acknowledgements:
46 * This product includes software developed by the University of
47 * California, Berkeley and its contributors.
48 * This product includes software developed at the Information
49 * Technology Division, US Naval Research Laboratory.
50 * 4. Neither the name of the NRL nor the names of its contributors
51 * may be used to endorse or promote products derived from this software
52 * without specific prior written permission.
53 *
54 * THE SOFTWARE PROVIDED BY NRL IS PROVIDED BY NRL AND CONTRIBUTORS ``AS
55 * IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
56 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
57 * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NRL OR
58 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
59 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
60 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
61 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
62 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
63 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
64 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
65 *
66 * The views and conclusions contained in the software and documentation
67 * are those of the authors and should not be interpreted as representing
68 * official policies, either expressed or implied, of the US Naval
69 * Research Laboratory (NRL).
70 */
71
72 /*-
73 * Copyright (c) 1997, 1998, 1999, 2001, 2005, 2006,
74 * 2011 The NetBSD Foundation, Inc.
75 * All rights reserved.
76 *
77 * This code is derived from software contributed to The NetBSD Foundation
78 * by Coyote Point Systems, Inc.
79 * This code is derived from software contributed to The NetBSD Foundation
80 * by Jason R. Thorpe and Kevin M. Lahey of the Numerical Aerospace Simulation
81 * Facility, NASA Ames Research Center.
82 * This code is derived from software contributed to The NetBSD Foundation
83 * by Charles M. Hannum.
84 * This code is derived from software contributed to The NetBSD Foundation
85 * by Rui Paulo.
86 *
87 * Redistribution and use in source and binary forms, with or without
88 * modification, are permitted provided that the following conditions
89 * are met:
90 * 1. Redistributions of source code must retain the above copyright
91 * notice, this list of conditions and the following disclaimer.
92 * 2. Redistributions in binary form must reproduce the above copyright
93 * notice, this list of conditions and the following disclaimer in the
94 * documentation and/or other materials provided with the distribution.
95 *
96 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
97 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
98 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
99 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
100 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
101 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
102 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
103 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
104 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
105 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
106 * POSSIBILITY OF SUCH DAMAGE.
107 */
108
109 /*
110 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995
111 * The Regents of the University of California. All rights reserved.
112 *
113 * Redistribution and use in source and binary forms, with or without
114 * modification, are permitted provided that the following conditions
115 * are met:
116 * 1. Redistributions of source code must retain the above copyright
117 * notice, this list of conditions and the following disclaimer.
118 * 2. Redistributions in binary form must reproduce the above copyright
119 * notice, this list of conditions and the following disclaimer in the
120 * documentation and/or other materials provided with the distribution.
121 * 3. Neither the name of the University nor the names of its contributors
122 * may be used to endorse or promote products derived from this software
123 * without specific prior written permission.
124 *
125 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
126 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
127 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
128 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
129 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
130 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
131 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
132 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
133 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
134 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
135 * SUCH DAMAGE.
136 *
137 * @(#)tcp_input.c 8.12 (Berkeley) 5/24/95
138 */
139
140 /*
141 * TODO list for SYN cache stuff:
142 *
143 * Find room for a "state" field, which is needed to keep a
144 * compressed state for TIME_WAIT TCBs. It's been noted already
145 * that this is fairly important for very high-volume web and
146 * mail servers, which use a large number of short-lived
147 * connections.
148 */
149
150 #include <sys/cdefs.h>
151 __KERNEL_RCSID(0, "$NetBSD: tcp_syncache.c,v 1.2 2022/09/20 10:12:18 ozaki-r Exp $");
152
153 #ifdef _KERNEL_OPT
154 #include "opt_inet.h"
155 #include "opt_ipsec.h"
156 #endif
157
158 #include <sys/param.h>
159 #include <sys/systm.h>
160 #include <sys/mbuf.h>
161 #include <sys/protosw.h>
162 #include <sys/socket.h>
163 #include <sys/socketvar.h>
164 #include <sys/errno.h>
165 #include <sys/syslog.h>
166 #include <sys/pool.h>
167 #include <sys/domain.h>
168 #include <sys/kernel.h>
169 #include <sys/lwp.h> /* for lwp0 */
170 #include <sys/cprng.h>
171
172 #include <netinet/in.h>
173 #include <netinet/ip.h>
174 #include <netinet/in_pcb.h>
175 #include <netinet/in_var.h>
176 #include <netinet/ip_var.h>
177
178 #include <netinet/ip6.h>
179 #ifdef INET6
180 #include <netinet6/ip6_var.h>
181 #include <netinet6/in6_pcb.h>
182 #include <netinet6/ip6_var.h>
183 #include <netinet6/in6_var.h>
184 #endif
185
186 #include <netinet/tcp.h>
187 #include <netinet/tcp_fsm.h>
188 #include <netinet/tcp_seq.h>
189 #include <netinet/tcp_timer.h>
190 #include <netinet/tcp_var.h>
191 #include <netinet/tcp_private.h>
192 #include <netinet/tcp_syncache.h>
193
194 #ifdef TCP_SIGNATURE
195 #ifdef IPSEC
196 #include <netipsec/ipsec.h>
197 #include <netipsec/key.h>
198 #ifdef INET6
199 #include <netipsec/ipsec6.h>
200 #endif
201 #endif /* IPSEC*/
202 #endif
203
204 static void syn_cache_timer(void *);
205 static struct syn_cache *
206 syn_cache_lookup(const struct sockaddr *, const struct sockaddr *,
207 struct syn_cache_head **);
208 static int syn_cache_respond(struct syn_cache *);
209
/* syn hash parameters */
#define TCP_SYN_HASH_SIZE 293
#define TCP_SYN_BUCKET_SIZE 35
static int tcp_syn_cache_size = TCP_SYN_HASH_SIZE;
int tcp_syn_cache_limit = TCP_SYN_HASH_SIZE*TCP_SYN_BUCKET_SIZE;
int tcp_syn_bucket_limit = 3*TCP_SYN_BUCKET_SIZE;
static struct syn_cache_head tcp_syn_cache[TCP_SYN_HASH_SIZE];

/*
 * TCP compressed state engine.  Currently used to hold compressed
 * state for SYN_RECEIVED.
 */

u_long syn_cache_count;
/*
 * Random hash secrets.  Re-keyed from cprng_fast32() whenever the
 * cache drains to empty (see syn_cache_insert()), which limits how
 * long an attacker can probe a given keying.
 */
static u_int32_t syn_hash1, syn_hash2;

/*
 * Keyed bucket hash over the remote (source) address and the
 * source/destination port pair.
 */
#define SYN_HASH(sa, sp, dp) \
	((((sa)->s_addr^syn_hash1)*(((((u_int32_t)(dp))<<16) + \
	((u_int32_t)(sp)))^syn_hash2)))
#ifndef INET6
#define SYN_HASHALL(hash, src, dst) \
do { \
	hash = SYN_HASH(&((const struct sockaddr_in *)(src))->sin_addr, \
	((const struct sockaddr_in *)(src))->sin_port, \
	((const struct sockaddr_in *)(dst))->sin_port); \
} while (/*CONSTCOND*/ 0)
#else
/*
 * IPv6 variant: mixes the first and last 32-bit words of the address
 * and masks the result to 31 bits.
 */
#define SYN_HASH6(sa, sp, dp) \
	((((sa)->s6_addr32[0] ^ (sa)->s6_addr32[3] ^ syn_hash1) * \
	(((((u_int32_t)(dp))<<16) + ((u_int32_t)(sp)))^syn_hash2)) \
	& 0x7fffffff)

/* Dispatch on the source address family; unknown families hash to 0. */
#define SYN_HASHALL(hash, src, dst) \
do { \
	switch ((src)->sa_family) { \
	case AF_INET: \
		hash = SYN_HASH(&((const struct sockaddr_in *)(src))->sin_addr, \
		((const struct sockaddr_in *)(src))->sin_port, \
		((const struct sockaddr_in *)(dst))->sin_port); \
		break; \
	case AF_INET6: \
		hash = SYN_HASH6(&((const struct sockaddr_in6 *)(src))->sin6_addr, \
		((const struct sockaddr_in6 *)(src))->sin6_port, \
		((const struct sockaddr_in6 *)(dst))->sin6_port); \
		break; \
	default: \
		hash = 0; \
	} \
} while (/*CONSTCOND*/0)
#endif /* INET6 */

/* Backing allocator for struct syn_cache entries (see syn_cache_init()). */
static struct pool syn_cache_pool;
262
263 /*
264 * We don't estimate RTT with SYNs, so each packet starts with the default
265 * RTT and each timer step has a fixed timeout value.
266 */
267 static inline void
268 syn_cache_timer_arm(struct syn_cache *sc)
269 {
270
271 TCPT_RANGESET(sc->sc_rxtcur,
272 TCPTV_SRTTDFLT * tcp_backoff[sc->sc_rxtshift], TCPTV_MIN,
273 TCPTV_REXMTMAX);
274 callout_reset(&sc->sc_timer,
275 sc->sc_rxtcur * (hz / PR_SLOWHZ), syn_cache_timer, sc);
276 }
277
/* Age of a cache entry in tcp_now ticks (PR_SLOWHZ units). */
#define SYN_CACHE_TIMESTAMP(sc) (tcp_now - (sc)->sc_timebase)

/*
 * Unlink a syn cache entry from its hash bucket and from the owning
 * tcpcb's list, stop its retransmit timer, and drop the global count.
 * The entry itself is NOT freed here; callers follow up with
 * syn_cache_put().  Callers run at splsoftnet.
 */
static inline void
syn_cache_rm(struct syn_cache *sc)
{
	/* Unhook from the hash bucket it was inserted into. */
	TAILQ_REMOVE(&tcp_syn_cache[sc->sc_bucketidx].sch_bucket,
	    sc, sc_bucketq);
	/* Detach from the listening tcpcb's pending-entry list. */
	sc->sc_tp = NULL;
	LIST_REMOVE(sc, sc_tpq);
	tcp_syn_cache[sc->sc_bucketidx].sch_length--;
	callout_stop(&sc->sc_timer);
	syn_cache_count--;
}
291
/*
 * Release a (previously unlinked) cache entry's resources and arrange
 * for the entry itself to be freed.  The pool_put() is deferred to the
 * timer callout: we mark the entry SCF_DEAD and, unless the callout is
 * currently firing (in which case it will observe SCF_DEAD itself),
 * schedule it to run in 1 tick so syn_cache_timer() does the free.
 * This avoids freeing memory a concurrently-running callout may touch.
 */
static inline void
syn_cache_put(struct syn_cache *sc)
{
	if (sc->sc_ipopts)
		(void) m_free(sc->sc_ipopts);
	rtcache_free(&sc->sc_route);
	sc->sc_flags |= SCF_DEAD;
	if (!callout_invoking(&sc->sc_timer))
		callout_schedule(&(sc)->sc_timer, 1);
}
302
303 void
304 syn_cache_init(void)
305 {
306 int i;
307
308 pool_init(&syn_cache_pool, sizeof(struct syn_cache), 0, 0, 0,
309 "synpl", NULL, IPL_SOFTNET);
310
311 /* Initialize the hash buckets. */
312 for (i = 0; i < tcp_syn_cache_size; i++)
313 TAILQ_INIT(&tcp_syn_cache[i].sch_bucket);
314 }
315
/*
 * Insert a freshly-built cache entry "sc" for the listening tcpcb "tp":
 * hash it into its bucket, arm its SYN,ACK retransmit timer, and link
 * it from tp->t_sc.  If the target bucket or the whole cache is full,
 * the oldest entry (head of a bucket's TAILQ) is evicted first.
 * Takes splsoftnet internally around the bucket manipulation.
 */
void
syn_cache_insert(struct syn_cache *sc, struct tcpcb *tp)
{
	struct syn_cache_head *scp;
	struct syn_cache *sc2;
	int s;

	/*
	 * If there are no entries in the hash table, reinitialize
	 * the hash secrets.
	 */
	if (syn_cache_count == 0) {
		syn_hash1 = cprng_fast32();
		syn_hash2 = cprng_fast32();
	}

	SYN_HASHALL(sc->sc_hash, &sc->sc_src.sa, &sc->sc_dst.sa);
	sc->sc_bucketidx = sc->sc_hash % tcp_syn_cache_size;
	scp = &tcp_syn_cache[sc->sc_bucketidx];

	/*
	 * Make sure that we don't overflow the per-bucket
	 * limit or the total cache size limit.
	 */
	s = splsoftnet();
	if (scp->sch_length >= tcp_syn_bucket_limit) {
		TCP_STATINC(TCP_STAT_SC_BUCKETOVERFLOW);
		/*
		 * The bucket is full.  Toss the oldest element in the
		 * bucket.  This will be the first entry in the bucket.
		 */
		sc2 = TAILQ_FIRST(&scp->sch_bucket);
#ifdef DIAGNOSTIC
		/*
		 * This should never happen; we should always find an
		 * entry in our bucket.
		 */
		if (sc2 == NULL)
			panic("syn_cache_insert: bucketoverflow: impossible");
#endif
		syn_cache_rm(sc2);
		syn_cache_put(sc2);	/* calls pool_put but see spl above */
	} else if (syn_cache_count >= tcp_syn_cache_limit) {
		struct syn_cache_head *scp2, *sce;

		TCP_STATINC(TCP_STAT_SC_OVERFLOWED);
		/*
		 * The cache is full.  Toss the oldest entry in the
		 * first non-empty bucket we can find.
		 *
		 * XXX We would really like to toss the oldest
		 * entry in the cache, but we hope that this
		 * condition doesn't happen very often.
		 */
		scp2 = scp;
		if (TAILQ_EMPTY(&scp2->sch_bucket)) {
			/* Scan forward, wrapping at the end of the table. */
			sce = &tcp_syn_cache[tcp_syn_cache_size];
			for (++scp2; scp2 != scp; scp2++) {
				if (scp2 >= sce)
					scp2 = &tcp_syn_cache[0];
				if (! TAILQ_EMPTY(&scp2->sch_bucket))
					break;
			}
#ifdef DIAGNOSTIC
			/*
			 * This should never happen; we should always find a
			 * non-empty bucket.
			 */
			if (scp2 == scp)
				panic("syn_cache_insert: cacheoverflow: "
				    "impossible");
#endif
		}
		sc2 = TAILQ_FIRST(&scp2->sch_bucket);
		syn_cache_rm(sc2);
		syn_cache_put(sc2);	/* calls pool_put but see spl above */
	}

	/*
	 * Initialize the entry's timer.
	 */
	sc->sc_rxttot = 0;
	sc->sc_rxtshift = 0;
	syn_cache_timer_arm(sc);

	/* Link it from tcpcb entry */
	LIST_INSERT_HEAD(&tp->t_sc, sc, sc_tpq);

	/* Put it into the bucket. */
	TAILQ_INSERT_TAIL(&scp->sch_bucket, sc, sc_bucketq);
	scp->sch_length++;
	syn_cache_count++;

	TCP_STATINC(TCP_STAT_SC_ADDED);
	splx(s);
}
412
413 /*
414 * Walk the timer queues, looking for SYN,ACKs that need to be retransmitted.
415 * If we have retransmitted an entry the maximum number of times, expire
416 * that entry.
417 */
418 static void
419 syn_cache_timer(void *arg)
420 {
421 struct syn_cache *sc = arg;
422
423 mutex_enter(softnet_lock);
424 KERNEL_LOCK(1, NULL);
425
426 callout_ack(&sc->sc_timer);
427
428 if (__predict_false(sc->sc_flags & SCF_DEAD)) {
429 TCP_STATINC(TCP_STAT_SC_DELAYED_FREE);
430 goto free;
431 }
432
433 if (__predict_false(sc->sc_rxtshift == TCP_MAXRXTSHIFT)) {
434 /* Drop it -- too many retransmissions. */
435 goto dropit;
436 }
437
438 /*
439 * Compute the total amount of time this entry has
440 * been on a queue. If this entry has been on longer
441 * than the keep alive timer would allow, expire it.
442 */
443 sc->sc_rxttot += sc->sc_rxtcur;
444 if (sc->sc_rxttot >= MIN(tcp_keepinit, TCP_TIMER_MAXTICKS))
445 goto dropit;
446
447 TCP_STATINC(TCP_STAT_SC_RETRANSMITTED);
448 (void)syn_cache_respond(sc);
449
450 /* Advance the timer back-off. */
451 sc->sc_rxtshift++;
452 syn_cache_timer_arm(sc);
453
454 goto out;
455
456 dropit:
457 TCP_STATINC(TCP_STAT_SC_TIMED_OUT);
458 syn_cache_rm(sc);
459 if (sc->sc_ipopts)
460 (void) m_free(sc->sc_ipopts);
461 rtcache_free(&sc->sc_route);
462
463 free:
464 callout_destroy(&sc->sc_timer);
465 pool_put(&syn_cache_pool, sc);
466
467 out:
468 KERNEL_UNLOCK_ONE(NULL);
469 mutex_exit(softnet_lock);
470 }
471
472 /*
473 * Remove syn cache created by the specified tcb entry,
474 * because this does not make sense to keep them
475 * (if there's no tcb entry, syn cache entry will never be used)
476 */
477 void
478 syn_cache_cleanup(struct tcpcb *tp)
479 {
480 struct syn_cache *sc, *nsc;
481 int s;
482
483 s = splsoftnet();
484
485 for (sc = LIST_FIRST(&tp->t_sc); sc != NULL; sc = nsc) {
486 nsc = LIST_NEXT(sc, sc_tpq);
487
488 #ifdef DIAGNOSTIC
489 if (sc->sc_tp != tp)
490 panic("invalid sc_tp in syn_cache_cleanup");
491 #endif
492 syn_cache_rm(sc);
493 syn_cache_put(sc); /* calls pool_put but see spl above */
494 }
495 /* just for safety */
496 LIST_INIT(&tp->t_sc);
497
498 splx(s);
499 }
500
501 /*
502 * Find an entry in the syn cache.
503 */
504 static struct syn_cache *
505 syn_cache_lookup(const struct sockaddr *src, const struct sockaddr *dst,
506 struct syn_cache_head **headp)
507 {
508 struct syn_cache *sc;
509 struct syn_cache_head *scp;
510 u_int32_t hash;
511 int s;
512
513 SYN_HASHALL(hash, src, dst);
514
515 scp = &tcp_syn_cache[hash % tcp_syn_cache_size];
516 *headp = scp;
517 s = splsoftnet();
518 for (sc = TAILQ_FIRST(&scp->sch_bucket); sc != NULL;
519 sc = TAILQ_NEXT(sc, sc_bucketq)) {
520 if (sc->sc_hash != hash)
521 continue;
522 if (!memcmp(&sc->sc_src, src, src->sa_len) &&
523 !memcmp(&sc->sc_dst, dst, dst->sa_len)) {
524 splx(s);
525 return (sc);
526 }
527 }
528 splx(s);
529 return (NULL);
530 }
531
532 /*
533 * This function gets called when we receive an ACK for a socket in the
534 * LISTEN state. We look up the connection in the syn cache, and if it's
535 * there, we pull it out of the cache and turn it into a full-blown
536 * connection in the SYN-RECEIVED state.
537 *
538 * The return values may not be immediately obvious, and their effects
539 * can be subtle, so here they are:
540 *
541 * NULL SYN was not found in cache; caller should drop the
542 * packet and send an RST.
543 *
544 * -1 We were unable to create the new connection, and are
545 * aborting it. An ACK,RST is being sent to the peer
546 * (unless we got screwey sequence numbers; see below),
547 * because the 3-way handshake has been completed. Caller
548 * should not free the mbuf, since we may be using it. If
549 * we are not, we will free it.
550 *
551 * Otherwise, the return value is a pointer to the new socket
552 * associated with the connection.
553 */
554 struct socket *
555 syn_cache_get(struct sockaddr *src, struct sockaddr *dst,
556 struct tcphdr *th, struct socket *so, struct mbuf *m)
557 {
558 struct syn_cache *sc;
559 struct syn_cache_head *scp;
560 struct inpcb *inp = NULL;
561 #ifdef INET6
562 struct in6pcb *in6p = NULL;
563 #endif
564 struct tcpcb *tp;
565 int s;
566 struct socket *oso;
567
568 s = splsoftnet();
569 if ((sc = syn_cache_lookup(src, dst, &scp)) == NULL) {
570 splx(s);
571 return NULL;
572 }
573
574 /*
575 * Verify the sequence and ack numbers. Try getting the correct
576 * response again.
577 */
578 if ((th->th_ack != sc->sc_iss + 1) ||
579 SEQ_LEQ(th->th_seq, sc->sc_irs) ||
580 SEQ_GT(th->th_seq, sc->sc_irs + 1 + sc->sc_win)) {
581 m_freem(m);
582 (void)syn_cache_respond(sc);
583 splx(s);
584 return ((struct socket *)(-1));
585 }
586
587 /* Remove this cache entry */
588 syn_cache_rm(sc);
589 splx(s);
590
591 /*
592 * Ok, create the full blown connection, and set things up
593 * as they would have been set up if we had created the
594 * connection when the SYN arrived. If we can't create
595 * the connection, abort it.
596 */
597 /*
598 * inp still has the OLD in_pcb stuff, set the
599 * v6-related flags on the new guy, too. This is
600 * done particularly for the case where an AF_INET6
601 * socket is bound only to a port, and a v4 connection
602 * comes in on that port.
603 * we also copy the flowinfo from the original pcb
604 * to the new one.
605 */
606 oso = so;
607 so = sonewconn(so, true);
608 if (so == NULL)
609 goto resetandabort;
610
611 switch (so->so_proto->pr_domain->dom_family) {
612 case AF_INET:
613 inp = sotoinpcb(so);
614 break;
615 #ifdef INET6
616 case AF_INET6:
617 in6p = sotoin6pcb(so);
618 break;
619 #endif
620 }
621
622 switch (src->sa_family) {
623 case AF_INET:
624 if (inp) {
625 inp->inp_laddr = ((struct sockaddr_in *)dst)->sin_addr;
626 inp->inp_lport = ((struct sockaddr_in *)dst)->sin_port;
627 inp->inp_options = ip_srcroute(m);
628 in_pcbstate(inp, INP_BOUND);
629 if (inp->inp_options == NULL) {
630 inp->inp_options = sc->sc_ipopts;
631 sc->sc_ipopts = NULL;
632 }
633 }
634 #ifdef INET6
635 else if (in6p) {
636 /* IPv4 packet to AF_INET6 socket */
637 memset(&in6p->in6p_laddr, 0, sizeof(in6p->in6p_laddr));
638 in6p->in6p_laddr.s6_addr16[5] = htons(0xffff);
639 bcopy(&((struct sockaddr_in *)dst)->sin_addr,
640 &in6p->in6p_laddr.s6_addr32[3],
641 sizeof(((struct sockaddr_in *)dst)->sin_addr));
642 in6p->in6p_lport = ((struct sockaddr_in *)dst)->sin_port;
643 in6totcpcb(in6p)->t_family = AF_INET;
644 if (sotoin6pcb(oso)->in6p_flags & IN6P_IPV6_V6ONLY)
645 in6p->in6p_flags |= IN6P_IPV6_V6ONLY;
646 else
647 in6p->in6p_flags &= ~IN6P_IPV6_V6ONLY;
648 in6_pcbstate(in6p, IN6P_BOUND);
649 }
650 #endif
651 break;
652 #ifdef INET6
653 case AF_INET6:
654 if (in6p) {
655 in6p->in6p_laddr = ((struct sockaddr_in6 *)dst)->sin6_addr;
656 in6p->in6p_lport = ((struct sockaddr_in6 *)dst)->sin6_port;
657 in6_pcbstate(in6p, IN6P_BOUND);
658 }
659 break;
660 #endif
661 }
662
663 #ifdef INET6
664 if (in6p && in6totcpcb(in6p)->t_family == AF_INET6 && sotoinpcb(oso)) {
665 struct in6pcb *oin6p = sotoin6pcb(oso);
666 /* inherit socket options from the listening socket */
667 in6p->in6p_flags |= (oin6p->in6p_flags & IN6P_CONTROLOPTS);
668 if (in6p->in6p_flags & IN6P_CONTROLOPTS) {
669 m_freem(in6p->in6p_options);
670 in6p->in6p_options = NULL;
671 }
672 ip6_savecontrol(in6p, &in6p->in6p_options,
673 mtod(m, struct ip6_hdr *), m);
674 }
675 #endif
676
677 /*
678 * Give the new socket our cached route reference.
679 */
680 if (inp) {
681 rtcache_copy(&inp->inp_route, &sc->sc_route);
682 rtcache_free(&sc->sc_route);
683 }
684 #ifdef INET6
685 else {
686 rtcache_copy(&in6p->in6p_route, &sc->sc_route);
687 rtcache_free(&sc->sc_route);
688 }
689 #endif
690
691 if (inp) {
692 struct sockaddr_in sin;
693 memcpy(&sin, src, src->sa_len);
694 if (in_pcbconnect(inp, &sin, &lwp0)) {
695 goto resetandabort;
696 }
697 }
698 #ifdef INET6
699 else if (in6p) {
700 struct sockaddr_in6 sin6;
701 memcpy(&sin6, src, src->sa_len);
702 if (src->sa_family == AF_INET) {
703 /* IPv4 packet to AF_INET6 socket */
704 in6_sin_2_v4mapsin6((struct sockaddr_in *)src, &sin6);
705 }
706 if (in6_pcbconnect(in6p, &sin6, NULL)) {
707 goto resetandabort;
708 }
709 }
710 #endif
711 else {
712 goto resetandabort;
713 }
714
715 if (inp)
716 tp = intotcpcb(inp);
717 #ifdef INET6
718 else if (in6p)
719 tp = in6totcpcb(in6p);
720 #endif
721 else
722 tp = NULL;
723
724 tp->t_flags = sototcpcb(oso)->t_flags & TF_NODELAY;
725 if (sc->sc_request_r_scale != 15) {
726 tp->requested_s_scale = sc->sc_requested_s_scale;
727 tp->request_r_scale = sc->sc_request_r_scale;
728 tp->snd_scale = sc->sc_requested_s_scale;
729 tp->rcv_scale = sc->sc_request_r_scale;
730 tp->t_flags |= TF_REQ_SCALE|TF_RCVD_SCALE;
731 }
732 if (sc->sc_flags & SCF_TIMESTAMP)
733 tp->t_flags |= TF_REQ_TSTMP|TF_RCVD_TSTMP;
734 tp->ts_timebase = sc->sc_timebase;
735
736 tp->t_template = tcp_template(tp);
737 if (tp->t_template == 0) {
738 tp = tcp_drop(tp, ENOBUFS); /* destroys socket */
739 so = NULL;
740 m_freem(m);
741 goto abort;
742 }
743
744 tp->iss = sc->sc_iss;
745 tp->irs = sc->sc_irs;
746 tcp_sendseqinit(tp);
747 tcp_rcvseqinit(tp);
748 tp->t_state = TCPS_SYN_RECEIVED;
749 TCP_TIMER_ARM(tp, TCPT_KEEP, tp->t_keepinit);
750 TCP_STATINC(TCP_STAT_ACCEPTS);
751
752 if ((sc->sc_flags & SCF_SACK_PERMIT) && tcp_do_sack)
753 tp->t_flags |= TF_WILL_SACK;
754
755 if ((sc->sc_flags & SCF_ECN_PERMIT) && tcp_do_ecn)
756 tp->t_flags |= TF_ECN_PERMIT;
757
758 #ifdef TCP_SIGNATURE
759 if (sc->sc_flags & SCF_SIGNATURE)
760 tp->t_flags |= TF_SIGNATURE;
761 #endif
762
763 /* Initialize tp->t_ourmss before we deal with the peer's! */
764 tp->t_ourmss = sc->sc_ourmaxseg;
765 tcp_mss_from_peer(tp, sc->sc_peermaxseg);
766
767 /*
768 * Initialize the initial congestion window. If we
769 * had to retransmit the SYN,ACK, we must initialize cwnd
770 * to 1 segment (i.e. the Loss Window).
771 */
772 if (sc->sc_rxtshift)
773 tp->snd_cwnd = tp->t_peermss;
774 else {
775 int ss = tcp_init_win;
776 if (inp != NULL && in_localaddr(inp->inp_faddr))
777 ss = tcp_init_win_local;
778 #ifdef INET6
779 if (in6p != NULL && in6_localaddr(&in6p->in6p_faddr))
780 ss = tcp_init_win_local;
781 #endif
782 tp->snd_cwnd = TCP_INITIAL_WINDOW(ss, tp->t_peermss);
783 }
784
785 tcp_rmx_rtt(tp);
786 tp->snd_wl1 = sc->sc_irs;
787 tp->rcv_up = sc->sc_irs + 1;
788
789 /*
790 * This is what would have happened in tcp_output() when
791 * the SYN,ACK was sent.
792 */
793 tp->snd_up = tp->snd_una;
794 tp->snd_max = tp->snd_nxt = tp->iss+1;
795 TCP_TIMER_ARM(tp, TCPT_REXMT, tp->t_rxtcur);
796 if (sc->sc_win > 0 && SEQ_GT(tp->rcv_nxt + sc->sc_win, tp->rcv_adv))
797 tp->rcv_adv = tp->rcv_nxt + sc->sc_win;
798 tp->last_ack_sent = tp->rcv_nxt;
799 tp->t_partialacks = -1;
800 tp->t_dupacks = 0;
801
802 TCP_STATINC(TCP_STAT_SC_COMPLETED);
803 s = splsoftnet();
804 syn_cache_put(sc);
805 splx(s);
806 return so;
807
808 resetandabort:
809 (void)tcp_respond(NULL, m, m, th, (tcp_seq)0, th->th_ack, TH_RST);
810 abort:
811 if (so != NULL) {
812 (void) soqremque(so, 1);
813 (void) soabort(so);
814 mutex_enter(softnet_lock);
815 }
816 s = splsoftnet();
817 syn_cache_put(sc);
818 splx(s);
819 TCP_STATINC(TCP_STAT_SC_ABORTED);
820 return ((struct socket *)(-1));
821 }
822
823 /*
824 * This function is called when we get a RST for a
825 * non-existent connection, so that we can see if the
826 * connection is in the syn cache. If it is, zap it.
827 */
828
829 void
830 syn_cache_reset(struct sockaddr *src, struct sockaddr *dst, struct tcphdr *th)
831 {
832 struct syn_cache *sc;
833 struct syn_cache_head *scp;
834 int s = splsoftnet();
835
836 if ((sc = syn_cache_lookup(src, dst, &scp)) == NULL) {
837 splx(s);
838 return;
839 }
840 if (SEQ_LT(th->th_seq, sc->sc_irs) ||
841 SEQ_GT(th->th_seq, sc->sc_irs+1)) {
842 splx(s);
843 return;
844 }
845 syn_cache_rm(sc);
846 TCP_STATINC(TCP_STAT_SC_RESET);
847 syn_cache_put(sc); /* calls pool_put but see spl above */
848 splx(s);
849 }
850
851 void
852 syn_cache_unreach(const struct sockaddr *src, const struct sockaddr *dst,
853 struct tcphdr *th)
854 {
855 struct syn_cache *sc;
856 struct syn_cache_head *scp;
857 int s;
858
859 s = splsoftnet();
860 if ((sc = syn_cache_lookup(src, dst, &scp)) == NULL) {
861 splx(s);
862 return;
863 }
864 /* If the sequence number != sc_iss, then it's a bogus ICMP msg */
865 if (ntohl(th->th_seq) != sc->sc_iss) {
866 splx(s);
867 return;
868 }
869
870 /*
871 * If we've retransmitted 3 times and this is our second error,
872 * we remove the entry. Otherwise, we allow it to continue on.
873 * This prevents us from incorrectly nuking an entry during a
874 * spurious network outage.
875 *
876 * See tcp_notify().
877 */
878 if ((sc->sc_flags & SCF_UNREACH) == 0 || sc->sc_rxtshift < 3) {
879 sc->sc_flags |= SCF_UNREACH;
880 splx(s);
881 return;
882 }
883
884 syn_cache_rm(sc);
885 TCP_STATINC(TCP_STAT_SC_UNREACH);
886 syn_cache_put(sc); /* calls pool_put but see spl above */
887 splx(s);
888 }
889
890 /*
891 * Given a LISTEN socket and an inbound SYN request, add this to the syn
892 * cache, and send back a segment:
893 * <SEQ=ISS><ACK=RCV_NXT><CTL=SYN,ACK>
894 * to the source.
895 *
896 * IMPORTANT NOTE: We do _NOT_ ACK data that might accompany the SYN.
897 * Doing so would require that we hold onto the data and deliver it
898 * to the application. However, if we are the target of a SYN-flood
899 * DoS attack, an attacker could send data which would eventually
900 * consume all available buffer space if it were ACKed. By not ACKing
901 * the data, we avoid this DoS scenario.
902 */
903 int
904 syn_cache_add(struct sockaddr *src, struct sockaddr *dst, struct tcphdr *th,
905 unsigned int toff, struct socket *so, struct mbuf *m, u_char *optp,
906 int optlen, struct tcp_opt_info *oi)
907 {
908 struct tcpcb tb, *tp;
909 long win;
910 struct syn_cache *sc;
911 struct syn_cache_head *scp;
912 struct mbuf *ipopts;
913 int s;
914
915 tp = sototcpcb(so);
916
917 /*
918 * Initialize some local state.
919 */
920 win = sbspace(&so->so_rcv);
921 if (win > TCP_MAXWIN)
922 win = TCP_MAXWIN;
923
924 #ifdef TCP_SIGNATURE
925 if (optp || (tp->t_flags & TF_SIGNATURE))
926 #else
927 if (optp)
928 #endif
929 {
930 tb.t_flags = tcp_do_rfc1323 ? (TF_REQ_SCALE|TF_REQ_TSTMP) : 0;
931 #ifdef TCP_SIGNATURE
932 tb.t_flags |= (tp->t_flags & TF_SIGNATURE);
933 #endif
934 tb.t_state = TCPS_LISTEN;
935 if (tcp_dooptions(&tb, optp, optlen, th, m, toff, oi) < 0)
936 return 0;
937 } else
938 tb.t_flags = 0;
939
940 switch (src->sa_family) {
941 case AF_INET:
942 /* Remember the IP options, if any. */
943 ipopts = ip_srcroute(m);
944 break;
945 default:
946 ipopts = NULL;
947 }
948
949 /*
950 * See if we already have an entry for this connection.
951 * If we do, resend the SYN,ACK. We do not count this
952 * as a retransmission (XXX though maybe we should).
953 */
954 if ((sc = syn_cache_lookup(src, dst, &scp)) != NULL) {
955 TCP_STATINC(TCP_STAT_SC_DUPESYN);
956 if (ipopts) {
957 /*
958 * If we were remembering a previous source route,
959 * forget it and use the new one we've been given.
960 */
961 if (sc->sc_ipopts)
962 (void)m_free(sc->sc_ipopts);
963 sc->sc_ipopts = ipopts;
964 }
965 sc->sc_timestamp = tb.ts_recent;
966 m_freem(m);
967 if (syn_cache_respond(sc) == 0) {
968 uint64_t *tcps = TCP_STAT_GETREF();
969 tcps[TCP_STAT_SNDACKS]++;
970 tcps[TCP_STAT_SNDTOTAL]++;
971 TCP_STAT_PUTREF();
972 }
973 return 1;
974 }
975
976 s = splsoftnet();
977 sc = pool_get(&syn_cache_pool, PR_NOWAIT);
978 splx(s);
979 if (sc == NULL) {
980 if (ipopts)
981 (void)m_free(ipopts);
982 return 0;
983 }
984
985 /*
986 * Fill in the cache, and put the necessary IP and TCP
987 * options into the reply.
988 */
989 memset(sc, 0, sizeof(struct syn_cache));
990 callout_init(&sc->sc_timer, CALLOUT_MPSAFE);
991 memcpy(&sc->sc_src, src, src->sa_len);
992 memcpy(&sc->sc_dst, dst, dst->sa_len);
993 sc->sc_flags = 0;
994 sc->sc_ipopts = ipopts;
995 sc->sc_irs = th->th_seq;
996 switch (src->sa_family) {
997 case AF_INET:
998 {
999 struct sockaddr_in *srcin = (void *)src;
1000 struct sockaddr_in *dstin = (void *)dst;
1001
1002 sc->sc_iss = tcp_new_iss1(&dstin->sin_addr,
1003 &srcin->sin_addr, dstin->sin_port,
1004 srcin->sin_port, sizeof(dstin->sin_addr));
1005 break;
1006 }
1007 #ifdef INET6
1008 case AF_INET6:
1009 {
1010 struct sockaddr_in6 *srcin6 = (void *)src;
1011 struct sockaddr_in6 *dstin6 = (void *)dst;
1012
1013 sc->sc_iss = tcp_new_iss1(&dstin6->sin6_addr,
1014 &srcin6->sin6_addr, dstin6->sin6_port,
1015 srcin6->sin6_port, sizeof(dstin6->sin6_addr));
1016 break;
1017 }
1018 #endif
1019 }
1020 sc->sc_peermaxseg = oi->maxseg;
1021 sc->sc_ourmaxseg = tcp_mss_to_advertise(m->m_flags & M_PKTHDR ?
1022 m_get_rcvif_NOMPSAFE(m) : NULL, sc->sc_src.sa.sa_family);
1023 sc->sc_win = win;
1024 sc->sc_timebase = tcp_now - 1; /* see tcp_newtcpcb() */
1025 sc->sc_timestamp = tb.ts_recent;
1026 if ((tb.t_flags & (TF_REQ_TSTMP|TF_RCVD_TSTMP)) ==
1027 (TF_REQ_TSTMP|TF_RCVD_TSTMP))
1028 sc->sc_flags |= SCF_TIMESTAMP;
1029 if ((tb.t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
1030 (TF_RCVD_SCALE|TF_REQ_SCALE)) {
1031 sc->sc_requested_s_scale = tb.requested_s_scale;
1032 sc->sc_request_r_scale = 0;
1033 /*
1034 * Pick the smallest possible scaling factor that
1035 * will still allow us to scale up to sb_max.
1036 *
1037 * We do this because there are broken firewalls that
1038 * will corrupt the window scale option, leading to
1039 * the other endpoint believing that our advertised
1040 * window is unscaled. At scale factors larger than
1041 * 5 the unscaled window will drop below 1500 bytes,
1042 * leading to serious problems when traversing these
1043 * broken firewalls.
1044 *
1045 * With the default sbmax of 256K, a scale factor
1046 * of 3 will be chosen by this algorithm. Those who
1047 * choose a larger sbmax should watch out
1048 * for the compatibility problems mentioned above.
1049 *
1050 * RFC1323: The Window field in a SYN (i.e., a <SYN>
1051 * or <SYN,ACK>) segment itself is never scaled.
1052 */
1053 while (sc->sc_request_r_scale < TCP_MAX_WINSHIFT &&
1054 (TCP_MAXWIN << sc->sc_request_r_scale) < sb_max)
1055 sc->sc_request_r_scale++;
1056 } else {
1057 sc->sc_requested_s_scale = 15;
1058 sc->sc_request_r_scale = 15;
1059 }
1060 if ((tb.t_flags & TF_SACK_PERMIT) && tcp_do_sack)
1061 sc->sc_flags |= SCF_SACK_PERMIT;
1062
1063 /*
1064 * ECN setup packet received.
1065 */
1066 if ((th->th_flags & (TH_ECE|TH_CWR)) && tcp_do_ecn)
1067 sc->sc_flags |= SCF_ECN_PERMIT;
1068
1069 #ifdef TCP_SIGNATURE
1070 if (tb.t_flags & TF_SIGNATURE)
1071 sc->sc_flags |= SCF_SIGNATURE;
1072 #endif
1073 sc->sc_tp = tp;
1074 m_freem(m);
1075 if (syn_cache_respond(sc) == 0) {
1076 uint64_t *tcps = TCP_STAT_GETREF();
1077 tcps[TCP_STAT_SNDACKS]++;
1078 tcps[TCP_STAT_SNDTOTAL]++;
1079 TCP_STAT_PUTREF();
1080 syn_cache_insert(sc, tp);
1081 } else {
1082 s = splsoftnet();
1083 /*
1084 * syn_cache_put() will try to schedule the timer, so
1085 * we need to initialize it
1086 */
1087 syn_cache_timer_arm(sc);
1088 syn_cache_put(sc);
1089 splx(s);
1090 TCP_STATINC(TCP_STAT_SC_DROPPED);
1091 }
1092 return 1;
1093 }
1094
1095 /*
1096 * syn_cache_respond: (re)send SYN+ACK.
1097 *
1098 * Returns 0 on success.
1099 */
1100
1101 static int
1102 syn_cache_respond(struct syn_cache *sc)
1103 {
1104 #ifdef INET6
1105 struct rtentry *rt = NULL;
1106 #endif
1107 struct route *ro;
1108 u_int8_t *optp;
1109 int optlen, error;
1110 u_int16_t tlen;
1111 struct ip *ip = NULL;
1112 #ifdef INET6
1113 struct ip6_hdr *ip6 = NULL;
1114 #endif
1115 struct tcpcb *tp;
1116 struct tcphdr *th;
1117 struct mbuf *m;
1118 u_int hlen;
1119 #ifdef TCP_SIGNATURE
1120 struct secasvar *sav = NULL;
1121 u_int8_t *sigp = NULL;
1122 #endif
1123
1124 ro = &sc->sc_route;
1125 switch (sc->sc_src.sa.sa_family) {
1126 case AF_INET:
1127 hlen = sizeof(struct ip);
1128 break;
1129 #ifdef INET6
1130 case AF_INET6:
1131 hlen = sizeof(struct ip6_hdr);
1132 break;
1133 #endif
1134 default:
1135 return EAFNOSUPPORT;
1136 }
1137
1138 /* Worst case scenario, since we don't know the option size yet. */
1139 tlen = hlen + sizeof(struct tcphdr) + MAX_TCPOPTLEN;
1140 KASSERT(max_linkhdr + tlen <= MCLBYTES);
1141
1142 /*
1143 * Create the IP+TCP header from scratch.
1144 */
1145 MGETHDR(m, M_DONTWAIT, MT_DATA);
1146 if (m && (max_linkhdr + tlen) > MHLEN) {
1147 MCLGET(m, M_DONTWAIT);
1148 if ((m->m_flags & M_EXT) == 0) {
1149 m_freem(m);
1150 m = NULL;
1151 }
1152 }
1153 if (m == NULL)
1154 return ENOBUFS;
1155 MCLAIM(m, &tcp_tx_mowner);
1156
1157 tp = sc->sc_tp;
1158
1159 /* Fixup the mbuf. */
1160 m->m_data += max_linkhdr;
1161 m_reset_rcvif(m);
1162 memset(mtod(m, void *), 0, tlen);
1163
1164 switch (sc->sc_src.sa.sa_family) {
1165 case AF_INET:
1166 ip = mtod(m, struct ip *);
1167 ip->ip_v = 4;
1168 ip->ip_dst = sc->sc_src.sin.sin_addr;
1169 ip->ip_src = sc->sc_dst.sin.sin_addr;
1170 ip->ip_p = IPPROTO_TCP;
1171 th = (struct tcphdr *)(ip + 1);
1172 th->th_dport = sc->sc_src.sin.sin_port;
1173 th->th_sport = sc->sc_dst.sin.sin_port;
1174 break;
1175 #ifdef INET6
1176 case AF_INET6:
1177 ip6 = mtod(m, struct ip6_hdr *);
1178 ip6->ip6_vfc = IPV6_VERSION;
1179 ip6->ip6_dst = sc->sc_src.sin6.sin6_addr;
1180 ip6->ip6_src = sc->sc_dst.sin6.sin6_addr;
1181 ip6->ip6_nxt = IPPROTO_TCP;
1182 /* ip6_plen will be updated in ip6_output() */
1183 th = (struct tcphdr *)(ip6 + 1);
1184 th->th_dport = sc->sc_src.sin6.sin6_port;
1185 th->th_sport = sc->sc_dst.sin6.sin6_port;
1186 break;
1187 #endif
1188 default:
1189 panic("%s: impossible (1)", __func__);
1190 }
1191
1192 th->th_seq = htonl(sc->sc_iss);
1193 th->th_ack = htonl(sc->sc_irs + 1);
1194 th->th_flags = TH_SYN|TH_ACK;
1195 th->th_win = htons(sc->sc_win);
1196 /* th_x2, th_sum, th_urp already 0 from memset */
1197
1198 /* Tack on the TCP options. */
1199 optp = (u_int8_t *)(th + 1);
1200 optlen = 0;
1201 *optp++ = TCPOPT_MAXSEG;
1202 *optp++ = TCPOLEN_MAXSEG;
1203 *optp++ = (sc->sc_ourmaxseg >> 8) & 0xff;
1204 *optp++ = sc->sc_ourmaxseg & 0xff;
1205 optlen += TCPOLEN_MAXSEG;
1206
1207 if (sc->sc_request_r_scale != 15) {
1208 *((u_int32_t *)optp) = htonl(TCPOPT_NOP << 24 |
1209 TCPOPT_WINDOW << 16 | TCPOLEN_WINDOW << 8 |
1210 sc->sc_request_r_scale);
1211 optp += TCPOLEN_WINDOW + TCPOLEN_NOP;
1212 optlen += TCPOLEN_WINDOW + TCPOLEN_NOP;
1213 }
1214
1215 if (sc->sc_flags & SCF_SACK_PERMIT) {
1216 /* Let the peer know that we will SACK. */
1217 *optp++ = TCPOPT_SACK_PERMITTED;
1218 *optp++ = TCPOLEN_SACK_PERMITTED;
1219 optlen += TCPOLEN_SACK_PERMITTED;
1220 }
1221
1222 if (sc->sc_flags & SCF_TIMESTAMP) {
1223 while (optlen % 4 != 2) {
1224 optlen += TCPOLEN_NOP;
1225 *optp++ = TCPOPT_NOP;
1226 }
1227 *optp++ = TCPOPT_TIMESTAMP;
1228 *optp++ = TCPOLEN_TIMESTAMP;
1229 u_int32_t *lp = (u_int32_t *)(optp);
1230 /* Form timestamp option as shown in appendix A of RFC 1323. */
1231 *lp++ = htonl(SYN_CACHE_TIMESTAMP(sc));
1232 *lp = htonl(sc->sc_timestamp);
1233 optp += TCPOLEN_TIMESTAMP - 2;
1234 optlen += TCPOLEN_TIMESTAMP;
1235 }
1236
1237 #ifdef TCP_SIGNATURE
1238 if (sc->sc_flags & SCF_SIGNATURE) {
1239 sav = tcp_signature_getsav(m);
1240 if (sav == NULL) {
1241 m_freem(m);
1242 return EPERM;
1243 }
1244
1245 *optp++ = TCPOPT_SIGNATURE;
1246 *optp++ = TCPOLEN_SIGNATURE;
1247 sigp = optp;
1248 memset(optp, 0, TCP_SIGLEN);
1249 optp += TCP_SIGLEN;
1250 optlen += TCPOLEN_SIGNATURE;
1251 }
1252 #endif
1253
1254 /*
1255 * Terminate and pad TCP options to a 4 byte boundary.
1256 *
1257 * According to RFC793: "The content of the header beyond the
1258 * End-of-Option option must be header padding (i.e., zero)."
1259 * And later: "The padding is composed of zeros."
1260 */
1261 if (optlen % 4) {
1262 optlen += TCPOLEN_EOL;
1263 *optp++ = TCPOPT_EOL;
1264 }
1265 while (optlen % 4) {
1266 optlen += TCPOLEN_PAD;
1267 *optp++ = TCPOPT_PAD;
1268 }
1269
1270 /* Compute the actual values now that we've added the options. */
1271 tlen = hlen + sizeof(struct tcphdr) + optlen;
1272 m->m_len = m->m_pkthdr.len = tlen;
1273 th->th_off = (sizeof(struct tcphdr) + optlen) >> 2;
1274
1275 #ifdef TCP_SIGNATURE
1276 if (sav) {
1277 (void)tcp_signature(m, th, hlen, sav, sigp);
1278 key_sa_recordxfer(sav, m);
1279 KEY_SA_UNREF(&sav);
1280 }
1281 #endif
1282
1283 /*
1284 * Send ECN SYN-ACK setup packet.
1285 * Routes can be asymmetric, so, even if we receive a packet
1286 * with ECE and CWR set, we must not assume no one will block
1287 * the ECE packet we are about to send.
1288 */
1289 if ((sc->sc_flags & SCF_ECN_PERMIT) && tp &&
1290 SEQ_GEQ(tp->snd_nxt, tp->snd_max)) {
1291 th->th_flags |= TH_ECE;
1292 TCP_STATINC(TCP_STAT_ECN_SHS);
1293
1294 /*
1295 * draft-ietf-tcpm-ecnsyn-00.txt
1296 *
1297 * "[...] a TCP node MAY respond to an ECN-setup
1298 * SYN packet by setting ECT in the responding
1299 * ECN-setup SYN/ACK packet, indicating to routers
1300 * that the SYN/ACK packet is ECN-Capable.
1301 * This allows a congested router along the path
1302 * to mark the packet instead of dropping the
1303 * packet as an indication of congestion."
1304 *
1305 * "[...] There can be a great benefit in setting
1306 * an ECN-capable codepoint in SYN/ACK packets [...]
1307 * Congestion is most likely to occur in
1308 * the server-to-client direction. As a result,
1309 * setting an ECN-capable codepoint in SYN/ACK
1310 * packets can reduce the occurrence of three-second
1311 * retransmit timeouts resulting from the drop
1312 * of SYN/ACK packets."
1313 *
1314 * Page 4 and 6, January 2006.
1315 */
1316
1317 switch (sc->sc_src.sa.sa_family) {
1318 case AF_INET:
1319 ip->ip_tos |= IPTOS_ECN_ECT0;
1320 break;
1321 #ifdef INET6
1322 case AF_INET6:
1323 ip6->ip6_flow |= htonl(IPTOS_ECN_ECT0 << 20);
1324 break;
1325 #endif
1326 }
1327 TCP_STATINC(TCP_STAT_ECN_ECT);
1328 }
1329
1330
1331 /*
1332 * Compute the packet's checksum.
1333 *
1334 * Fill in some straggling IP bits. Note the stack expects
1335 * ip_len to be in host order, for convenience.
1336 */
1337 switch (sc->sc_src.sa.sa_family) {
1338 case AF_INET:
1339 ip->ip_len = htons(tlen - hlen);
1340 th->th_sum = 0;
1341 th->th_sum = in4_cksum(m, IPPROTO_TCP, hlen, tlen - hlen);
1342 ip->ip_len = htons(tlen);
1343 ip->ip_ttl = ip_defttl;
1344 /* XXX tos? */
1345 break;
1346 #ifdef INET6
1347 case AF_INET6:
1348 ip6->ip6_plen = htons(tlen - hlen);
1349 th->th_sum = 0;
1350 th->th_sum = in6_cksum(m, IPPROTO_TCP, hlen, tlen - hlen);
1351 ip6->ip6_vfc &= ~IPV6_VERSION_MASK;
1352 ip6->ip6_vfc |= IPV6_VERSION;
1353 ip6->ip6_plen = htons(tlen - hlen);
1354 /* ip6_hlim will be initialized afterwards */
1355 /* XXX flowlabel? */
1356 break;
1357 #endif
1358 }
1359
1360 /* XXX use IPsec policy on listening socket, on SYN ACK */
1361 tp = sc->sc_tp;
1362
1363 switch (sc->sc_src.sa.sa_family) {
1364 case AF_INET:
1365 error = ip_output(m, sc->sc_ipopts, ro,
1366 (ip_mtudisc ? IP_MTUDISC : 0),
1367 NULL, tp ? tp->t_inpcb : NULL);
1368 break;
1369 #ifdef INET6
1370 case AF_INET6:
1371 ip6->ip6_hlim = in6_selecthlim(NULL,
1372 (rt = rtcache_validate(ro)) != NULL ? rt->rt_ifp : NULL);
1373 rtcache_unref(rt, ro);
1374
1375 error = ip6_output(m, NULL /*XXX*/, ro, 0, NULL,
1376 tp ? tp->t_in6pcb : NULL, NULL);
1377 break;
1378 #endif
1379 default:
1380 panic("%s: impossible (2)", __func__);
1381 }
1382
1383 return error;
1384 }
1385