/*
 * Copyright (c) 2011 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Coyote Point Systems, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Reduces the resources demanded by TCP sessions in TIME_WAIT-state using
 * methods called Vestigial Time-Wait (VTW) and Maximum Segment Lifetime
 * Truncation (MSLT).
 *
 * MSLT and VTW were contributed by Coyote Point Systems, Inc.
 *
 * Even after a TCP session enters the TIME_WAIT state, its corresponding
 * socket and protocol control blocks (PCBs) stick around until the TCP
 * Maximum Segment Lifetime (MSL) expires. On a host whose workload
 * necessarily creates and closes down many TCP sockets, the sockets & PCBs
 * for TCP sessions in TIME_WAIT state amount to many megabytes of dead
 * weight in RAM.
 *
 * Maximum Segment Lifetime Truncation (MSLT) assigns each TCP session to
 * a class based on the nearness of the peer. Corresponding to each class
 * is an MSL, and a session uses the MSL of its class. The classes are
 * loopback (local host equals remote host), local (local host and remote
 * host are on the same link/subnet), and remote (local host and remote
 * host communicate via one or more gateways). Classes corresponding to
 * nearer peers have lower MSLs by default: 2 seconds for loopback, 10
 * seconds for local, 60 seconds for remote. Loopback and local sessions
 * expire more quickly when MSLT is used.
 *
 * Vestigial Time-Wait (VTW) replaces a TIME_WAIT session's PCB/socket
 * dead weight with a compact representation of the session, called a
 * "vestigial PCB". VTW data structures are designed to be very fast and
 * memory-efficient: for fast insertion and lookup of vestigial PCBs,
 * the PCBs are stored in a hash table that is designed to minimize the
 * number of cacheline visits per lookup/insertion. The memory both
 * for vestigial PCBs and for elements of the PCB hashtable comes from
 * fixed-size pools, and linked data structures exploit this to conserve
 * memory by representing references with a narrow index/offset from the
 * start of a pool instead of a pointer. When space for new vestigial PCBs
 * runs out, VTW makes room by discarding old vestigial PCBs, oldest first.
 * VTW cooperates with MSLT.
 *
 * It may help to think of VTW as a "FIN cache" by analogy to the SYN
 * cache.
 *
 * A 2.8-GHz Pentium 4 running a test workload that creates TIME_WAIT
 * sessions as fast as it can is approximately 17% idle when VTW is active
 * versus 0% idle when VTW is inactive. It has 103 megabytes more free RAM
 * when VTW is active (approximately 64k vestigial PCBs are created) than
 * when it is inactive.
 */
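
/* Editor's illustration (not part of the original source): the
 * "narrow index/offset instead of a pointer" scheme described above
 * boils down to pool-relative, 1-based indices, with 0 reserved as the
 * null reference -- compare fatp_index()/fatp_from_key() below.
 * A minimal sketch, with hypothetical names:
 */
#if 0
typedef struct entry entry_t;

static inline uint32_t
entry_to_index(const entry_t *base, const entry_t *e)
{
	return e ? (uint32_t)(e - base) + 1 : 0;	/* 0 == "null" */
}

static inline entry_t *
entry_from_index(entry_t *base, uint32_t idx)
{
	return idx ? base + idx - 1 : NULL;
}
#endif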

#include <sys/cdefs.h>

#include "opt_ddb.h"
#include "opt_inet.h"
#include "opt_ipsec.h"
#include "opt_inet_csum.h"
#include "opt_tcp_debug.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/malloc.h>
#include <sys/kmem.h>
#include <sys/mbuf.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/errno.h>
#include <sys/syslog.h>
#include <sys/pool.h>
#include <sys/domain.h>
#include <sys/kernel.h>
#include <net/if.h>
#include <net/route.h>
#include <net/if_types.h>

#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <netinet/ip.h>
#include <netinet/in_pcb.h>
#include <netinet/in_var.h>
#include <netinet/ip_var.h>
#include <netinet/in_offload.h>
#include <netinet/ip6.h>
#include <netinet6/ip6_var.h>
#include <netinet6/in6_pcb.h>
#include <netinet6/in6_var.h>
#include <netinet/icmp6.h>
#include <netinet6/nd6.h>

#include <netinet/tcp.h>
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet/tcp_private.h>
#include <netinet/tcpip.h>

#include <netinet/tcp_vtw.h>

__KERNEL_RCSID(0, "$NetBSD: tcp_vtw.c,v 1.11 2014/09/05 06:03:51 matt Exp $");

#define db_trace(__a, __b) do { } while (/*CONSTCOND*/0)

static void vtw_debug_init(void);

fatp_ctl_t fat_tcpv4;
fatp_ctl_t fat_tcpv6;
vtw_ctl_t vtw_tcpv4[VTW_NCLASS];
vtw_ctl_t vtw_tcpv6[VTW_NCLASS];
vtw_stats_t vtw_stats;

/* We provide state for the lookup_ports iterator.
 * As we are currently netlock-protected, there is only one.
 * If we were finer-grained, we would have one per CPU.
 * I do not want to be in the business of alloc/free.
 * The best alternative would be to allocate on the caller's
 * stack, but that would require them to know the struct,
 * or at least the size.
 * See how she goes.
 */
struct tcp_ports_iterator {
	union {
		struct in_addr	v4;
		struct in6_addr	v6;
	} addr;
	u_int		port;

	uint32_t	wild	: 1;

	vtw_ctl_t	*ctl;
	fatp_t		*fp;

	uint16_t	slot_idx;
	uint16_t	ctl_idx;
};
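
/* Editor's sketch of the intended call pattern (cf. tcp_init_ports_v4()
 * and tcp_next_port_v4() below, and the VTW_DEBUG self-checks in
 * vtw_add()): initialise the iterator once, then step it until it runs
 * dry.  laddr/lport here are hypothetical locals.
 */
#if 0
	struct vestigial_inpcb res;
	void *it = tcp_init_ports_v4(laddr, lport, 0 /* not wild */);

	while (tcp_next_port_v4(it, &res)) {
		/* res describes one vestigial PCB bound to lport */
	}
#endif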

static struct tcp_ports_iterator tcp_ports_iterator_v4;
static struct tcp_ports_iterator tcp_ports_iterator_v6;

static int vtw_age(vtw_ctl_t *, struct timeval *);

/*!\brief allocate a fat pointer from a collection.
 */
static fatp_t *
fatp_alloc(fatp_ctl_t *fat)
{
	fatp_t *fp = 0;

	if (fat->nfree) {
		fp = fat->free;
		if (fp) {
			fat->free = fatp_next(fat, fp);
			--fat->nfree;
			++fat->nalloc;
			fp->nxt = 0;

			KASSERT(!fp->inuse);
		}
	}

	return fp;
}

/*!\brief free a fat pointer.
 */
static void
fatp_free(fatp_ctl_t *fat, fatp_t *fp)
{
	if (fp) {
		KASSERT(!fp->inuse);
		KASSERT(!fp->nxt);

		fp->nxt = fatp_index(fat, fat->free);
		fat->free = fp;

		++fat->nfree;
		--fat->nalloc;
	}
}

/*!\brief initialise a collection of fat pointers.
 *
 *\param n	total # fat pointers to allocate (per hash)
 *\param m	# hash buckets
 *
 * We allocate 2x as much, as we have two hashes: full and lport only.
 */
static void
fatp_init(fatp_ctl_t *fat, uint32_t n, uint32_t m,
    fatp_t *fat_base, fatp_t **fat_hash)
{
	fatp_t *fp;

	KASSERT(n <= FATP_MAX / 2);

	fat->hash = fat_hash;
	fat->base = fat_base;

	fat->port = &fat->hash[m];

	fat->mask = m - 1;	// ASSERT is power of 2 (m)
	fat->lim = fat->base + 2*n - 1;
	fat->nfree = 0;
	fat->nalloc = 2*n;

	/* Initialise the free list.
	 */
	for (fp = fat->lim; fp >= fat->base; --fp) {
		fatp_free(fat, fp);
	}
}
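
/* Editor's note: see vtw_control_init() below for the actual sizing.
 * The caller provides 2*m anchor slots (m for the full hash, m for the
 * local-port hash; fat->port = &fat->hash[m] above) and 2*n fatp_t's,
 * all of which start life on the free list.
 */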

/*
 * The `xtra' is XORed into the tag stored.
 */
static uint32_t fatp_xtra[] = {
	0x11111111,0x22222222,0x33333333,0x44444444,
	0x55555555,0x66666666,0x77777777,0x88888888,
	0x12121212,0x21212121,0x34343434,0x43434343,
	0x56565656,0x65656565,0x78787878,0x87878787,
	0x11221122,0x22112211,0x33443344,0x44334433,
	0x55665566,0x66556655,0x77887788,0x88778877,
	0x11112222,0x22221111,0x33334444,0x44443333,
	0x55556666,0x66665555,0x77778888,0x88887777,
};

/*!\brief turn a {fatp_t*,slot} into an integral key.
 *
 * The key can be used to obtain the fatp_t, and the slot,
 * as it directly encodes them.
 */
static inline uint32_t
fatp_key(fatp_ctl_t *fat, fatp_t *fp, uint32_t slot)
{
	CTASSERT(CACHE_LINE_SIZE == 32 ||
	         CACHE_LINE_SIZE == 64 ||
	         CACHE_LINE_SIZE == 128);

	switch (fatp_ntags()) {
	case 7:
		return (fatp_index(fat, fp) << 3) | slot;
	case 15:
		return (fatp_index(fat, fp) << 4) | slot;
	case 31:
		return (fatp_index(fat, fp) << 5) | slot;
	default:
		KASSERT(0 && "no support, for no good reason");
		return ~0;
	}
}

static inline uint32_t
fatp_slot_from_key(fatp_ctl_t *fat, uint32_t key)
{
	CTASSERT(CACHE_LINE_SIZE == 32 ||
	         CACHE_LINE_SIZE == 64 ||
	         CACHE_LINE_SIZE == 128);

	switch (fatp_ntags()) {
	case 7:
		return key & 7;
	case 15:
		return key & 15;
	case 31:
		return key & 31;
	default:
		KASSERT(0 && "no support, for no good reason");
		return ~0;
	}
}

static inline fatp_t *
fatp_from_key(fatp_ctl_t *fat, uint32_t key)
{
	CTASSERT(CACHE_LINE_SIZE == 32 ||
	         CACHE_LINE_SIZE == 64 ||
	         CACHE_LINE_SIZE == 128);

	switch (fatp_ntags()) {
	case 7:
		key >>= 3;
		break;
	case 15:
		key >>= 4;
		break;
	case 31:
		key >>= 5;
		break;
	default:
		KASSERT(0 && "no support, for no good reason");
		return 0;
	}

	return key ? fat->base + key - 1 : 0;
}
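
/* Editor's sketch: fatp_key() and the two functions above form an exact
 * round trip.  Assuming fatp_ntags() == 15 (a 64-byte cache line), a key
 * packs the 1-based fat pointer index in the high bits and the slot in
 * the low four bits:
 */
#if 0
	uint32_t key = fatp_key(fat, fp, slot);

	KASSERT(fatp_slot_from_key(fat, key) == slot);
	KASSERT(fatp_from_key(fat, key) == fp);
#endif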

static inline uint32_t
idx_encode(vtw_ctl_t *ctl, uint32_t idx)
{
	return (idx << ctl->idx_bits) | idx;
}

static inline uint32_t
idx_decode(vtw_ctl_t *ctl, uint32_t bits)
{
	uint32_t idx = bits & ctl->idx_mask;

	if (idx_encode(ctl, idx) == bits)
		return idx;
	else
		return ~0;
}
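
/* Editor's note: idx_encode() replicates the index above its own low
 * idx_bits, so idx_decode() can reject values that merely look like
 * indices.  E.g. with idx_bits == 13 (8192 entries), idx 0x0005 encodes
 * as (0x0005 << 13) | 0x0005 == 0xa005; any value whose two copies
 * disagree decodes to ~0.
 */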

/*!\brief insert index into fatp hash
 *
 *\param idx - index of element being placed in hash chain
 *\param tag - 32-bit tag identifier
 *
 *\returns
 *	value which can be used to locate entry.
 *
 *\note
 *	we rely on the fact that there are unused high bits in the index
 *	for verification purposes on lookup.
 */

static inline uint32_t
fatp_vtw_inshash(fatp_ctl_t *fat, uint32_t idx, uint32_t tag, int which,
    void *dbg)
{
	fatp_t *fp;
	fatp_t **hash = (which ? fat->port : fat->hash);
	int i;

	fp = hash[tag & fat->mask];

	while (!fp || fatp_full(fp)) {
		fatp_t *fq;

		/* All entries are inuse at the top level.
		 * We allocate a spare, and push the top level
		 * down one. All entries in the fp we push down
		 * (think of a tape worm here) will be expelled sooner than
		 * any entries added subsequently to this hash bucket.
		 * This is a property of the time waits we are exploiting.
		 */

		fq = fatp_alloc(fat);
		if (!fq) {
			vtw_age(fat->vtw, 0);
			fp = hash[tag & fat->mask];
			continue;
		}

		fq->inuse = 0;
		fq->nxt = fatp_index(fat, fp);

		hash[tag & fat->mask] = fq;

		fp = fq;
	}

	KASSERT(!fatp_full(fp));

	/* Fill highest index first. Lookup is lowest first.
	 */
	for (i = fatp_ntags(); --i >= 0; ) {
		if (!((1 << i) & fp->inuse)) {
			break;
		}
	}

	fp->inuse |= 1 << i;
	fp->tag[i] = tag ^ idx_encode(fat->vtw, idx) ^ fatp_xtra[i];

	db_trace(KTR_VTW
		 , (fp, "fat: inuse %5.5x tag[%x] %8.8x"
		    , fp->inuse
		    , i, fp->tag[i]));

	return fatp_key(fat, fp, i);
}
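
/* Editor's note: the XOR in fp->tag[] makes lookup self-verifying.
 * A prober recomputes idx = fp->tag[i] ^ tag ^ fatp_xtra[i] (see
 * vtw_lookup_hash_v4() below); only a matching tag yields a bit pattern
 * that idx_decode() accepts, so most false hits are rejected without
 * ever touching the vtw_t's cache line.
 */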

static inline int
vtw_alive(const vtw_t *vtw)
{
	return vtw->hashed && vtw->expire.tv_sec;
}

static inline uint32_t
vtw_index_v4(vtw_ctl_t *ctl, vtw_v4_t *v4)
{
	if (ctl->base.v4 <= v4 && v4 <= ctl->lim.v4)
		return v4 - ctl->base.v4;

	KASSERT(0 && "vtw out of bounds");

	return ~0;
}

static inline uint32_t
vtw_index_v6(vtw_ctl_t *ctl, vtw_v6_t *v6)
{
	if (ctl->base.v6 <= v6 && v6 <= ctl->lim.v6)
		return v6 - ctl->base.v6;

	KASSERT(0 && "vtw out of bounds");

	return ~0;
}

static inline uint32_t
vtw_index(vtw_ctl_t *ctl, vtw_t *vtw)
{
	if (ctl->clidx)
		ctl = ctl->ctl;

	if (ctl->is_v4)
		return vtw_index_v4(ctl, (vtw_v4_t *)vtw);

	if (ctl->is_v6)
		return vtw_index_v6(ctl, (vtw_v6_t *)vtw);

	KASSERT(0 && "neither 4 nor 6. most curious.");

	return ~0;
}

static inline vtw_t *
vtw_from_index(vtw_ctl_t *ctl, uint32_t idx)
{
	if (ctl->clidx)
		ctl = ctl->ctl;

	/* See if the index looks like it might be a valid index.
	 * Bits set outside the valid index bits are a giveaway.
	 */
	idx = idx_decode(ctl, idx);

	if (idx == ~0) {
		return 0;
	} else if (ctl->is_v4) {
		vtw_v4_t *vtw = ctl->base.v4 + idx;

		return (ctl->base.v4 <= vtw && vtw <= ctl->lim.v4)
		    ? &vtw->common : 0;
	} else if (ctl->is_v6) {
		vtw_v6_t *vtw = ctl->base.v6 + idx;

		return (ctl->base.v6 <= vtw && vtw <= ctl->lim.v6)
		    ? &vtw->common : 0;
	} else {
		KASSERT(0 && "badness");
		return 0;
	}
}

/*!\brief return the next vtw after this one.
 *
 * Due to the differing sizes of the entries in differing
 * arenas, we have to ensure we ++ the correct pointer type.
 *
 * Also handles wrap.
 */
static inline vtw_t *
vtw_next(vtw_ctl_t *ctl, vtw_t *vtw)
{
	if (ctl->is_v4) {
		vtw_v4_t *v4 = (void*)vtw;

		vtw = &(++v4)->common;
	} else {
		vtw_v6_t *v6 = (void*)vtw;

		vtw = &(++v6)->common;
	}

	if (vtw > ctl->lim.v)
		vtw = ctl->base.v;

	return vtw;
}

/*!\brief remove entry from FATP hash chains
 */
static inline void
vtw_unhash(vtw_ctl_t *ctl, vtw_t *vtw)
{
	fatp_ctl_t *fat = ctl->fat;
	fatp_t *fp;
	uint32_t key = vtw->key;
	uint32_t tag, slot, idx;
	vtw_v4_t *v4 = (void*)vtw;
	vtw_v6_t *v6 = (void*)vtw;

	if (!vtw->hashed) {
		KASSERT(0 && "unhashed");
		return;
	}

	if (fat->vtw->is_v4) {
		tag = v4_tag(v4->faddr, v4->fport, v4->laddr, v4->lport);
	} else if (fat->vtw->is_v6) {
		tag = v6_tag(&v6->faddr, v6->fport, &v6->laddr, v6->lport);
	} else {
		tag = 0;
		KASSERT(0 && "not reached");
	}

	/* Remove from fat->hash[]
	 */
	slot = fatp_slot_from_key(fat, key);
	fp = fatp_from_key(fat, key);
	idx = vtw_index(ctl, vtw);

	db_trace(KTR_VTW
		 , (fp, "fat: del inuse %5.5x slot %x idx %x key %x tag %x"
		    , fp->inuse, slot, idx, key, tag));

	KASSERT(fp->inuse & (1 << slot));
	KASSERT(fp->tag[slot] == (tag ^ idx_encode(ctl, idx)
				  ^ fatp_xtra[slot]));

	if ((fp->inuse & (1 << slot))
	    && fp->tag[slot] == (tag ^ idx_encode(ctl, idx)
				 ^ fatp_xtra[slot])) {
		fp->inuse ^= 1 << slot;
		fp->tag[slot] = 0;

		/* When we delete entries, we do not compact. This is
		 * due to temporality. We add entries, and they
		 * (eventually) expire. Older entries will be further
		 * down the chain.
		 */
		if (!fp->inuse) {
			uint32_t hi = tag & fat->mask;
			fatp_t *fq = 0;
			fatp_t *fr = fat->hash[hi];

			while (fr && fr != fp) {
				fr = fatp_next(fat, fq = fr);
			}

			if (fr == fp) {
				if (fq) {
					fq->nxt = fp->nxt;
					fp->nxt = 0;
					fatp_free(fat, fp);
				} else {
					KASSERT(fat->hash[hi] == fp);

					if (fp->nxt) {
						fat->hash[hi]
							= fatp_next(fat, fp);
						fp->nxt = 0;
						fatp_free(fat, fp);
					} else {
						/* retain for next use.
						 */
						;
					}
				}
			} else {
				fr = fat->hash[hi];

				do {
					db_trace(KTR_VTW
						 , (fr
						    , "fat:*del inuse %5.5x"
						    " nxt %x"
						    , fr->inuse, fr->nxt));

					fr = fatp_next(fat, fq = fr);
				} while (fr && fr != fp);

				KASSERT(0 && "oops");
			}
		}
		vtw->key ^= ~0;
	}

	if (fat->vtw->is_v4) {
		tag = v4_port_tag(v4->lport);
	} else if (fat->vtw->is_v6) {
		tag = v6_port_tag(v6->lport);
	}

	/* Remove from fat->port[]
	 */
	key = vtw->port_key;
	slot = fatp_slot_from_key(fat, key);
	fp = fatp_from_key(fat, key);
	idx = vtw_index(ctl, vtw);

	db_trace(KTR_VTW
		 , (fp, "fatport: del inuse %5.5x"
		    " slot %x idx %x key %x tag %x"
		    , fp->inuse, slot, idx, key, tag));

	KASSERT(fp->inuse & (1 << slot));
	KASSERT(fp->tag[slot] == (tag ^ idx_encode(ctl, idx)
				  ^ fatp_xtra[slot]));

	if ((fp->inuse & (1 << slot))
	    && fp->tag[slot] == (tag ^ idx_encode(ctl, idx)
				 ^ fatp_xtra[slot])) {
		fp->inuse ^= 1 << slot;
		fp->tag[slot] = 0;

		if (!fp->inuse) {
			uint32_t hi = tag & fat->mask;
			fatp_t *fq = 0;
			fatp_t *fr = fat->port[hi];

			while (fr && fr != fp) {
				fr = fatp_next(fat, fq = fr);
			}

			if (fr == fp) {
				if (fq) {
					fq->nxt = fp->nxt;
					fp->nxt = 0;
					fatp_free(fat, fp);
				} else {
					KASSERT(fat->port[hi] == fp);

					if (fp->nxt) {
						fat->port[hi]
							= fatp_next(fat, fp);
						fp->nxt = 0;
						fatp_free(fat, fp);
					} else {
						/* retain for next use.
						 */
						;
					}
				}
			}
		}
		vtw->port_key ^= ~0;
	}

	vtw->hashed = 0;
}

/*!\brief remove entry from hash, possibly free.
 */
void
vtw_del(vtw_ctl_t *ctl, vtw_t *vtw)
{
	KASSERT(mutex_owned(softnet_lock));

	if (vtw->hashed) {
		++vtw_stats.del;
		vtw_unhash(ctl, vtw);
	}

	/* We only delete the oldest entry.
	 */
	if (vtw != ctl->oldest.v)
		return;

	--ctl->nalloc;
	++ctl->nfree;

	vtw->expire.tv_sec = 0;
	vtw->expire.tv_usec = ~0;

	if (!ctl->nalloc)
		ctl->oldest.v = 0;

	ctl->oldest.v = vtw_next(ctl, vtw);
}

/*!\brief insert vestigial timewait in hash chain
 */
static void
vtw_inshash_v4(vtw_ctl_t *ctl, vtw_t *vtw)
{
	uint32_t idx = vtw_index(ctl, vtw);
	uint32_t tag;
	vtw_v4_t *v4 = (void*)vtw;

	KASSERT(mutex_owned(softnet_lock));
	KASSERT(!vtw->hashed);
	KASSERT(ctl->clidx == vtw->msl_class);

	++vtw_stats.ins;

	tag = v4_tag(v4->faddr, v4->fport,
		     v4->laddr, v4->lport);

	vtw->key = fatp_vtw_inshash(ctl->fat, idx, tag, 0, vtw);

	db_trace(KTR_VTW, (ctl
			   , "vtw: ins %8.8x:%4.4x %8.8x:%4.4x"
			   " tag %8.8x key %8.8x"
			   , v4->faddr, v4->fport
			   , v4->laddr, v4->lport
			   , tag
			   , vtw->key));

	tag = v4_port_tag(v4->lport);
	vtw->port_key = fatp_vtw_inshash(ctl->fat, idx, tag, 1, vtw);

	db_trace(KTR_VTW, (ctl, "vtw: ins %P - %4.4x tag %8.8x key %8.8x"
			   , v4->lport, v4->lport
			   , tag
			   , vtw->key));

	vtw->hashed = 1;
}

/*!\brief insert vestigial timewait in hash chain
 */
static void
vtw_inshash_v6(vtw_ctl_t *ctl, vtw_t *vtw)
{
	uint32_t idx = vtw_index(ctl, vtw);
	uint32_t tag;
	vtw_v6_t *v6 = (void*)vtw;

	KASSERT(mutex_owned(softnet_lock));
	KASSERT(!vtw->hashed);
	KASSERT(ctl->clidx == vtw->msl_class);

	++vtw_stats.ins;

	tag = v6_tag(&v6->faddr, v6->fport,
		     &v6->laddr, v6->lport);

	vtw->key = fatp_vtw_inshash(ctl->fat, idx, tag, 0, vtw);

	tag = v6_port_tag(v6->lport);
	vtw->port_key = fatp_vtw_inshash(ctl->fat, idx, tag, 1, vtw);

	db_trace(KTR_VTW, (ctl, "vtw: ins %P - %4.4x tag %8.8x key %8.8x"
			   , v6->lport, v6->lport
			   , tag
			   , vtw->key));

	vtw->hashed = 1;
}

static vtw_t *
vtw_lookup_hash_v4(vtw_ctl_t *ctl, uint32_t faddr, uint16_t fport
		   , uint32_t laddr, uint16_t lport
		   , int which)
{
	vtw_v4_t *v4;
	vtw_t *vtw;
	uint32_t tag;
	fatp_t *fp;
	int i;
	uint32_t fatps = 0, probes = 0, losings = 0;

	if (!ctl || !ctl->fat)
		return 0;

	++vtw_stats.look[which];

	if (which) {
		tag = v4_port_tag(lport);
		fp = ctl->fat->port[tag & ctl->fat->mask];
	} else {
		tag = v4_tag(faddr, fport, laddr, lport);
		fp = ctl->fat->hash[tag & ctl->fat->mask];
	}

	while (fp && fp->inuse) {
		uint32_t inuse = fp->inuse;

		++fatps;

		for (i = 0; inuse && i < fatp_ntags(); ++i) {
			uint32_t idx;

			if (!(inuse & (1 << i)))
				continue;

			inuse ^= 1 << i;

			++probes;
			++vtw_stats.probe[which];

			idx = fp->tag[i] ^ tag ^ fatp_xtra[i];
			vtw = vtw_from_index(ctl, idx);

			if (!vtw) {
				/* Hopefully fast path.
				 */
				db_trace(KTR_VTW
					 , (fp, "vtw: fast %A:%P %A:%P"
					    " idx %x tag %x"
					    , faddr, fport
					    , laddr, lport
					    , idx, tag));
				continue;
			}

			v4 = (void*)vtw;

			/* The de-referencing of vtw is what we want to avoid.
			 * Losing.
			 */
			if (vtw_alive(vtw)
			    && ((which ? vtw->port_key : vtw->key)
				== fatp_key(ctl->fat, fp, i))
			    && (which
				|| (v4->faddr == faddr && v4->laddr == laddr
				    && v4->fport == fport))
			    && v4->lport == lport) {
				++vtw_stats.hit[which];

				db_trace(KTR_VTW
					 , (fp, "vtw: hit %8.8x:%4.4x"
					    " %8.8x:%4.4x idx %x key %x"
					    , faddr, fport
					    , laddr, lport
					    , idx_decode(ctl, idx), vtw->key));

				KASSERT(vtw->hashed);

				goto out;
			}
			++vtw_stats.losing[which];
			++losings;

			if (vtw_alive(vtw)) {
				db_trace(KTR_VTW
					 , (fp, "vtw:!mis %8.8x:%4.4x"
					    " %8.8x:%4.4x key %x tag %x"
					    , faddr, fport
					    , laddr, lport
					    , fatp_key(ctl->fat, fp, i)
					    , v4_tag(faddr, fport
						     , laddr, lport)));
				db_trace(KTR_VTW
					 , (vtw, "vtw:!mis %8.8x:%4.4x"
					    " %8.8x:%4.4x key %x tag %x"
					    , v4->faddr, v4->fport
					    , v4->laddr, v4->lport
					    , vtw->key
					    , v4_tag(v4->faddr, v4->fport
						     , v4->laddr, v4->lport)));

				if (vtw->key == fatp_key(ctl->fat, fp, i)) {
					db_trace(KTR_VTW
						 , (vtw, "vtw:!mis %8.8x:%4.4x"
						    " %8.8x:%4.4x key %x"
						    " which %x"
						    , v4->faddr, v4->fport
						    , v4->laddr, v4->lport
						    , vtw->key
						    , which));

				} else {
					db_trace(KTR_VTW
						 , (vtw
						    , "vtw:!mis"
						    " key %8.8x != %8.8x"
						    " idx %x i %x which %x"
						    , vtw->key
						    , fatp_key(ctl->fat, fp, i)
						    , idx_decode(ctl, idx)
						    , i
						    , which));
				}
			} else {
				db_trace(KTR_VTW
					 , (fp
					    , "vtw:!mis free entry"
					    " idx %x vtw %p which %x"
					    , idx_decode(ctl, idx)
					    , vtw, which));
			}
		}

		if (fp->nxt) {
			fp = fatp_next(ctl->fat, fp);
		} else {
			break;
		}
	}
	++vtw_stats.miss[which];
	vtw = 0;
 out:
	if (fatps > vtw_stats.max_chain[which])
		vtw_stats.max_chain[which] = fatps;
	if (probes > vtw_stats.max_probe[which])
		vtw_stats.max_probe[which] = probes;
	if (losings > vtw_stats.max_loss[which])
		vtw_stats.max_loss[which] = losings;

	return vtw;
}

static vtw_t *
vtw_lookup_hash_v6(vtw_ctl_t *ctl, const struct in6_addr *faddr, uint16_t fport
		   , const struct in6_addr *laddr, uint16_t lport
		   , int which)
{
	vtw_v6_t *v6;
	vtw_t *vtw;
	uint32_t tag;
	fatp_t *fp;
	int i;
	uint32_t fatps = 0, probes = 0, losings = 0;

	++vtw_stats.look[which];

	if (!ctl || !ctl->fat)
		return 0;

	if (which) {
		tag = v6_port_tag(lport);
		fp = ctl->fat->port[tag & ctl->fat->mask];
	} else {
		tag = v6_tag(faddr, fport, laddr, lport);
		fp = ctl->fat->hash[tag & ctl->fat->mask];
	}

	while (fp && fp->inuse) {
		uint32_t inuse = fp->inuse;

		++fatps;

		for (i = 0; inuse && i < fatp_ntags(); ++i) {
			uint32_t idx;

			if (!(inuse & (1 << i)))
				continue;

			inuse ^= 1 << i;

			++probes;
			++vtw_stats.probe[which];

			idx = fp->tag[i] ^ tag ^ fatp_xtra[i];
			vtw = vtw_from_index(ctl, idx);

			db_trace(KTR_VTW
				 , (fp, "probe: %2d %6A:%4.4x %6A:%4.4x idx %x"
				    , i
				    , db_store(faddr, sizeof (*faddr)), fport
				    , db_store(laddr, sizeof (*laddr)), lport
				    , idx_decode(ctl, idx)));

			if (!vtw) {
				/* Hopefully fast path.
				 */
				continue;
			}

			v6 = (void*)vtw;

			if (vtw_alive(vtw)
			    && ((which ? vtw->port_key : vtw->key)
				== fatp_key(ctl->fat, fp, i))
			    && v6->lport == lport
			    && (which
				|| (v6->fport == fport
				    && !bcmp(&v6->faddr, faddr, sizeof (*faddr))
				    && !bcmp(&v6->laddr, laddr
					     , sizeof (*laddr))))) {
				++vtw_stats.hit[which];

				KASSERT(vtw->hashed);
				goto out;
			} else {
				++vtw_stats.losing[which];
				++losings;
			}
		}

		if (fp->nxt) {
			fp = fatp_next(ctl->fat, fp);
		} else {
			break;
		}
	}
	++vtw_stats.miss[which];
	vtw = 0;
 out:
	if (fatps > vtw_stats.max_chain[which])
		vtw_stats.max_chain[which] = fatps;
	if (probes > vtw_stats.max_probe[which])
		vtw_stats.max_probe[which] = probes;
	if (losings > vtw_stats.max_loss[which])
		vtw_stats.max_loss[which] = losings;

	return vtw;
}
1021
1022 /*!\brief port iterator
1023 */
1024 static vtw_t *
1025 vtw_next_port_v4(struct tcp_ports_iterator *it)
1026 {
1027 vtw_ctl_t *ctl = it->ctl;
1028 vtw_v4_t *v4;
1029 vtw_t *vtw;
1030 uint32_t tag;
1031 uint16_t lport = it->port;
1032 fatp_t *fp;
1033 int i;
1034 uint32_t fatps = 0, probes = 0, losings = 0;
1035
1036 tag = v4_port_tag(lport);
1037 if (!it->fp) {
1038 it->fp = ctl->fat->port[tag & ctl->fat->mask];
1039 it->slot_idx = 0;
1040 }
1041 fp = it->fp;
1042
1043 while (fp) {
1044 uint32_t inuse = fp->inuse;
1045
1046 ++fatps;
1047
1048 for (i = it->slot_idx; inuse && i < fatp_ntags(); ++i) {
1049 uint32_t idx;
1050
1051 if (!(inuse & (1 << i)))
1052 continue;
1053
1054 inuse &= ~0 << i;
1055
1056 if (i < it->slot_idx)
1057 continue;
1058
1059 ++vtw_stats.probe[1];
1060 ++probes;
1061
1062 idx = fp->tag[i] ^ tag ^ fatp_xtra[i];
1063 vtw = vtw_from_index(ctl, idx);
1064
1065 if (!vtw) {
1066 /* Hopefully fast path.
1067 */
1068 continue;
1069 }
1070
1071 v4 = (void*)vtw;
1072
1073 if (vtw_alive(vtw)
1074 && vtw->port_key == fatp_key(ctl->fat, fp, i)
1075 && v4->lport == lport) {
1076 ++vtw_stats.hit[1];
1077
1078 it->slot_idx = i + 1;
1079
1080 goto out;
1081 } else if (vtw_alive(vtw)) {
1082 ++vtw_stats.losing[1];
1083 ++losings;
1084
1085 db_trace(KTR_VTW
1086 , (vtw, "vtw:!mis"
1087 " port %8.8x:%4.4x %8.8x:%4.4x"
1088 " key %x port %x"
1089 , v4->faddr, v4->fport
1090 , v4->laddr, v4->lport
1091 , vtw->key
1092 , lport));
1093 } else {
				/* Really losing here. We are coming
				 * up with references to free entries.
				 * Might find it better to use the
				 * traditional approach, or need another
				 * ad-hockery. The other ad-hockery
				 * would be to pull more into the
				 * cache line to reject the false
				 * hits.
				 */
				++vtw_stats.losing[1];
				++losings;
				db_trace(KTR_VTW
					 , (fp, "vtw:!mis port %x"
					    " - free entry idx %x vtw %p"
					    , lport
					    , idx_decode(ctl, idx)
					    , vtw));
			}
		}

		if (fp->nxt) {
			it->fp = fp = fatp_next(ctl->fat, fp);
			it->slot_idx = 0;
		} else {
			it->fp = 0;
			break;
		}
	}
	++vtw_stats.miss[1];

	vtw = 0;
 out:
	if (fatps > vtw_stats.max_chain[1])
		vtw_stats.max_chain[1] = fatps;
	if (probes > vtw_stats.max_probe[1])
		vtw_stats.max_probe[1] = probes;
	if (losings > vtw_stats.max_loss[1])
		vtw_stats.max_loss[1] = losings;

	return vtw;
}

/*!\brief port iterator
 */
static vtw_t *
vtw_next_port_v6(struct tcp_ports_iterator *it)
{
	vtw_ctl_t *ctl = it->ctl;
	vtw_v6_t *v6;
	vtw_t *vtw;
	uint32_t tag;
	uint16_t lport = it->port;
	fatp_t *fp;
	int i;
	uint32_t fatps = 0, probes = 0, losings = 0;

	tag = v6_port_tag(lport);
	if (!it->fp) {
		it->fp = ctl->fat->port[tag & ctl->fat->mask];
		it->slot_idx = 0;
	}
	fp = it->fp;

	while (fp) {
		uint32_t inuse = fp->inuse;

		++fatps;

		for (i = it->slot_idx; inuse && i < fatp_ntags(); ++i) {
			uint32_t idx;

			if (!(inuse & (1 << i)))
				continue;

			inuse &= ~0 << i;

			if (i < it->slot_idx)
				continue;

			++vtw_stats.probe[1];
			++probes;

			idx = fp->tag[i] ^ tag ^ fatp_xtra[i];
			vtw = vtw_from_index(ctl, idx);

			if (!vtw) {
				/* Hopefully fast path.
				 */
				continue;
			}

			v6 = (void*)vtw;

			db_trace(KTR_VTW
				 , (vtw, "vtw: i %x idx %x fp->tag %x"
				    " tag %x xtra %x"
				    , i, idx_decode(ctl, idx)
				    , fp->tag[i], tag, fatp_xtra[i]));

			if (vtw_alive(vtw)
			    && vtw->port_key == fatp_key(ctl->fat, fp, i)
			    && v6->lport == lport) {
				++vtw_stats.hit[1];

				db_trace(KTR_VTW
					 , (fp, "vtw: nxt port %P - %4.4x"
					    " idx %x key %x"
					    , lport, lport
					    , idx_decode(ctl, idx), vtw->key));

				it->slot_idx = i + 1;
				goto out;
			} else if (vtw_alive(vtw)) {
				++vtw_stats.losing[1];

				db_trace(KTR_VTW
					 , (vtw, "vtw:!mis port %6A:%4.4x"
					    " %6A:%4.4x key %x port %x"
					    , db_store(&v6->faddr
						       , sizeof (v6->faddr))
					    , v6->fport
					    , db_store(&v6->laddr
						       , sizeof (v6->faddr))
					    , v6->lport
					    , vtw->key
					    , lport));
			} else {
				/* Really losing here. We are coming
				 * up with references to free entries.
				 * Might find it better to use the
				 * traditional approach, or need another
				 * ad-hockery. The other ad-hockery
				 * would be to pull more into the
				 * cache line to reject the false
				 * hits.
				 */
				++vtw_stats.losing[1];
				++losings;

				db_trace(KTR_VTW
					 , (fp
					    , "vtw:!mis port %x"
					    " - free entry idx %x vtw %p"
					    , lport, idx_decode(ctl, idx)
					    , vtw));
			}
		}

		if (fp->nxt) {
			it->fp = fp = fatp_next(ctl->fat, fp);
			it->slot_idx = 0;
		} else {
			it->fp = 0;
			break;
		}
	}
	++vtw_stats.miss[1];

	vtw = 0;
 out:
	if (fatps > vtw_stats.max_chain[1])
		vtw_stats.max_chain[1] = fatps;
	if (probes > vtw_stats.max_probe[1])
		vtw_stats.max_probe[1] = probes;
	if (losings > vtw_stats.max_loss[1])
		vtw_stats.max_loss[1] = losings;

	return vtw;
}

/*!\brief initialise the VTW allocation arena
 *
 * There are 1+3 allocation classes:
 *	0	classless
 *	{1,2,3}	MSL-class based allocation
 *
 * The allocation arenas are all initialised. Classless gets all the
 * space. MSL-class based divides the arena, so that allocation
 * within a class can proceed without having to consider entries
 * (aka: cache lines) from different classes.
 *
 * Usually, we are completely classless or class-based, but there can be
 * transition periods, corresponding to dynamic adjustments in the config
 * by the operator.
 */
static void
vtw_init(fatp_ctl_t *fat, vtw_ctl_t *ctl, const uint32_t n, vtw_t *ctl_base_v)
{
	int class_n, i;
	vtw_t *base;

	ctl->base.v = ctl_base_v;

	if (ctl->is_v4) {
		ctl->lim.v4 = ctl->base.v4 + n - 1;
		ctl->alloc.v4 = ctl->base.v4;
	} else {
		ctl->lim.v6 = ctl->base.v6 + n - 1;
		ctl->alloc.v6 = ctl->base.v6;
	}

	ctl->nfree = n;
	ctl->ctl = ctl;

	ctl->idx_bits = 32;
	for (ctl->idx_mask = ~0; (ctl->idx_mask & (n-1)) == n-1; ) {
		ctl->idx_mask >>= 1;
		ctl->idx_bits -= 1;
	}

	ctl->idx_mask <<= 1;
	ctl->idx_mask |= 1;
	ctl->idx_bits += 1;
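
	/* Editor's worked example: for n == 8192 the loop above stops at
	 * idx_mask == 0x0fff (idx_bits == 12), and the adjustment yields
	 * idx_mask == 0x1fff, idx_bits == 13: just enough bits to cover
	 * indices 0..n-1, leaving the high bits free for idx_encode()'s
	 * duplicate copy.
	 */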

	ctl->fat = fat;
	fat->vtw = ctl;

	/* Divide the resources equally amongst the classes.
	 * This is not optimal, as the different classes
	 * arrive and leave at different rates, but it is
	 * the best I can do for now.
	 */
	class_n = n / (VTW_NCLASS-1);
	base = ctl->base.v;

	for (i = 1; i < VTW_NCLASS; ++i) {
		int j;

		ctl[i] = ctl[0];
		ctl[i].clidx = i;

		ctl[i].base.v = base;
		ctl[i].alloc = ctl[i].base;

		for (j = 0; j < class_n - 1; ++j) {
			if (tcp_msl_enable)
				base->msl_class = i;
			base = vtw_next(ctl, base);
		}

		ctl[i].lim.v = base;
		base = vtw_next(ctl, base);
		ctl[i].nfree = class_n;
	}

	vtw_debug_init();
}

/*!\brief map class to TCP MSL
 */
static inline uint32_t
class_to_msl(int msl_class)
{
	switch (msl_class) {
	case 0:
	case 1:
		return tcp_msl_remote ? tcp_msl_remote : (TCPTV_MSL >> 0);
	case 2:
		return tcp_msl_local ? tcp_msl_local : (TCPTV_MSL >> 1);
	default:
		return tcp_msl_loop ? tcp_msl_loop : (TCPTV_MSL >> 2);
	}
}

/*!\brief map TCP MSL to class
 */
static inline uint32_t
msl_to_class(int msl)
{
	if (tcp_msl_enable) {
		if (msl <= (tcp_msl_loop ? tcp_msl_loop : (TCPTV_MSL >> 2)))
			return 1+2;
		if (msl <= (tcp_msl_local ? tcp_msl_local : (TCPTV_MSL >> 1)))
			return 1+1;
		return 1;
	}
	return 0;
}
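
/* Editor's note: assuming the natural ordering loopback-MSL <=
 * local-MSL <= remote-MSL, these two maps invert each other while
 * tcp_msl_enable is set: msl_to_class(class_to_msl(c)) == c for c in
 * {1,2,3}, so a restarted entry lands back in the class it came from
 * (cf. vtw_restart_v4/v6() below).
 */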

/*!\brief allocate a vtw entry
 */
static inline vtw_t *
vtw_alloc(vtw_ctl_t *ctl)
{
	vtw_t *vtw = 0;
	int stuck = 0;
	int avail = ctl ? (ctl->nalloc + ctl->nfree) : 0;
	int msl;

	KASSERT(mutex_owned(softnet_lock));

	/* If no resources, we will not get far.
	 */
	if (!ctl || !ctl->base.v4 || avail <= 0)
		return 0;

	/* Obtain a free one.
	 */
	while (!ctl->nfree) {
		vtw_age(ctl, 0);

		if (++stuck > avail) {
			/* When in transition between
			 * schemes (classless, classed) we
			 * can be stuck having to await the
			 * expiration of cross-allocated entries.
			 *
			 * Returning zero means we will fall back to the
			 * traditional TIME_WAIT handling, except in the
			 * case of a re-schedule, in which case we cannot
			 * perform it, but will retain the extant
			 * entry.
			 */
			db_trace(KTR_VTW
				 , (ctl, "vtw:!none free in class %x %x/%x"
				    , ctl->clidx
				    , ctl->nalloc, ctl->nfree));

			return 0;
		}
	}

	vtw = ctl->alloc.v;

	if (vtw->msl_class != ctl->clidx) {
		/* Usurping rules:
		 *	0 -> {1,2,3} or {1,2,3} -> 0
		 */
		KASSERT(!vtw->msl_class || !ctl->clidx);

		if (vtw->hashed || vtw->expire.tv_sec) {
			/* As this is owned by some other class,
			 * we must wait for it to expire.
			 * This will only happen on class/classless
			 * transitions, which are guaranteed to progress
			 * to completion in small finite time, barring bugs.
			 */
			db_trace(KTR_VTW
				 , (ctl, "vtw:!%p class %x!=%x %x:%x%s"
				    , vtw, vtw->msl_class, ctl->clidx
				    , vtw->expire.tv_sec
				    , vtw->expire.tv_usec
				    , vtw->hashed ? " hashed" : ""));

			return 0;
		}

		db_trace(KTR_VTW
			 , (ctl, "vtw:!%p usurped from %x to %x"
			    , vtw, vtw->msl_class, ctl->clidx));

		vtw->msl_class = ctl->clidx;
	}

	if (vtw_alive(vtw)) {
		KASSERT(0 && "next free not free");
		return 0;
	}

	/* Advance the allocation pointer.
	 */
	ctl->alloc.v = vtw_next(ctl, vtw);

	--ctl->nfree;
	++ctl->nalloc;

	msl = (2 * class_to_msl(ctl->clidx) * 1000) / PR_SLOWHZ;	// msec

	/* mark expiration
	 */
	getmicrouptime(&vtw->expire);

	/* Move expiration into the future.
	 */
	vtw->expire.tv_sec += msl / 1000;
	vtw->expire.tv_usec += 1000 * (msl % 1000);

	while (vtw->expire.tv_usec >= 1000*1000) {
		vtw->expire.tv_usec -= 1000*1000;
		vtw->expire.tv_sec += 1;
	}

	if (!ctl->oldest.v)
		ctl->oldest.v = vtw;

	return vtw;
}
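
/* Editor's worked example of the expiry arithmetic above:
 * class_to_msl() returns the MSL in PR_SLOWHZ ticks, so msl is 2*MSL
 * in milliseconds; e.g. an MSL of 30*PR_SLOWHZ ticks (30 seconds, the
 * TCPTV_MSL default) gives msl == 60000 and an expiry 60 s past
 * getmicrouptime().
 */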

/*!\brief expiration
 */
static int
vtw_age(vtw_ctl_t *ctl, struct timeval *_when)
{
	vtw_t *vtw;
	struct timeval then, *when = _when;
	int maxtries = 0;

	if (!ctl->oldest.v) {
		KASSERT(!ctl->nalloc);
		return 0;
	}

	for (vtw = ctl->oldest.v; vtw && ctl->nalloc; ) {
		if (++maxtries > ctl->nalloc)
			break;

		if (vtw->msl_class != ctl->clidx) {
			db_trace(KTR_VTW
				 , (vtw, "vtw:!age class mismatch %x != %x"
				    , vtw->msl_class, ctl->clidx));
			/* XXXX
			 * See if the appropriate action is to skip to the next.
			 * XXXX
			 */
			ctl->oldest.v = vtw = vtw_next(ctl, vtw);
			continue;
		}
		if (!when) {
			/* Latch oldest timeval if none specified.
			 */
			then = vtw->expire;
			when = &then;
		}

		if (!timercmp(&vtw->expire, when, <=))
			break;

		db_trace(KTR_VTW
			 , (vtw, "vtw: expire %x %8.8x:%8.8x %x/%x"
			    , ctl->clidx
			    , vtw->expire.tv_sec
			    , vtw->expire.tv_usec
			    , ctl->nalloc
			    , ctl->nfree));

		if (!_when)
			++vtw_stats.kill;

		vtw_del(ctl, vtw);
		vtw = ctl->oldest.v;
	}

	return ctl->nalloc;	// # remaining allocated
}

static callout_t vtw_cs;

/*!\brief notice the passage of time.
 * It seems to be getting faster. What happened to the year?
 */
static void
vtw_tick(void *arg)
{
	struct timeval now;
	int i, cnt = 0;

	getmicrouptime(&now);

	db_trace(KTR_VTW, (arg, "vtk: tick - now %8.8x:%8.8x"
			   , now.tv_sec, now.tv_usec));

	mutex_enter(softnet_lock);

	for (i = 0; i < VTW_NCLASS; ++i) {
		cnt += vtw_age(&vtw_tcpv4[i], &now);
		cnt += vtw_age(&vtw_tcpv6[i], &now);
	}

	/* Keep ticks coming while we need them.
	 */
	if (cnt)
		callout_schedule(&vtw_cs, hz / 5);
	else {
		tcp_vtw_was_enabled = 0;
		tcbtable.vestige = 0;
	}
	mutex_exit(softnet_lock);
}

/* in_pcblookup_ports assist for handling vestigial entries.
 */
static void *
tcp_init_ports_v4(struct in_addr addr, u_int port, int wild)
{
	struct tcp_ports_iterator *it = &tcp_ports_iterator_v4;

	bzero(it, sizeof (*it));

	/* Note: the reference to vtw_tcpv4[0] is fine.
	 * We do not need per-class iteration. We just
	 * need to get to the fat, and there is one
	 * shared fat.
	 */
	if (vtw_tcpv4[0].fat) {
		it->addr.v4 = addr;
		it->port = port;
		it->wild = !!wild;
		it->ctl = &vtw_tcpv4[0];

		++vtw_stats.look[1];
	}

	return it;
}

/*!\brief export an IPv4 vtw.
 */
static int
vtw_export_v4(vtw_ctl_t *ctl, vtw_t *vtw, vestigial_inpcb_t *res)
{
	vtw_v4_t *v4 = (void*)vtw;

	bzero(res, sizeof (*res));

	if (ctl && vtw) {
		if (!ctl->clidx && vtw->msl_class)
			ctl += vtw->msl_class;
		else
			KASSERT(ctl->clidx == vtw->msl_class);

		res->valid = 1;
		res->v4 = 1;

		res->faddr.v4.s_addr = v4->faddr;
		res->laddr.v4.s_addr = v4->laddr;
		res->fport = v4->fport;
		res->lport = v4->lport;
		res->vtw = vtw;		// netlock held over call(s)
		res->ctl = ctl;
		res->reuse_addr = vtw->reuse_addr;
		res->reuse_port = vtw->reuse_port;
		res->snd_nxt = vtw->snd_nxt;
		res->rcv_nxt = vtw->rcv_nxt;
		res->rcv_wnd = vtw->rcv_wnd;
		res->uid = vtw->uid;
	}

	return res->valid;
}

/*!\brief return next port in the port iterator. yowza.
 */
static int
tcp_next_port_v4(void *arg, struct vestigial_inpcb *res)
{
	struct tcp_ports_iterator *it = arg;
	vtw_t *vtw = 0;

	if (it->ctl)
		vtw = vtw_next_port_v4(it);

	if (!vtw)
		it->ctl = 0;

	return vtw_export_v4(it->ctl, vtw, res);
}

static int
tcp_lookup_v4(struct in_addr faddr, uint16_t fport,
    struct in_addr laddr, uint16_t lport,
    struct vestigial_inpcb *res)
{
	vtw_t *vtw;
	vtw_ctl_t *ctl;

	db_trace(KTR_VTW
		 , (res, "vtw: lookup %A:%P %A:%P"
		    , faddr, fport
		    , laddr, lport));

	vtw = vtw_lookup_hash_v4((ctl = &vtw_tcpv4[0])
				 , faddr.s_addr, fport
				 , laddr.s_addr, lport, 0);

	return vtw_export_v4(ctl, vtw, res);
}

/* in_pcblookup_ports assist for handling vestigial entries.
 */
static void *
tcp_init_ports_v6(const struct in6_addr *addr, u_int port, int wild)
{
	struct tcp_ports_iterator *it = &tcp_ports_iterator_v6;

	bzero(it, sizeof (*it));

	/* Note: the reference to vtw_tcpv6[0] is fine.
	 * We do not need per-class iteration. We just
	 * need to get to the fat, and there is one
	 * shared fat.
	 */
	if (vtw_tcpv6[0].fat) {
		it->addr.v6 = *addr;
		it->port = port;
		it->wild = !!wild;
		it->ctl = &vtw_tcpv6[0];

		++vtw_stats.look[1];
	}

	return it;
}

/*!\brief export an IPv6 vtw.
 */
static int
vtw_export_v6(vtw_ctl_t *ctl, vtw_t *vtw, vestigial_inpcb_t *res)
{
	vtw_v6_t *v6 = (void*)vtw;

	bzero(res, sizeof (*res));

	if (ctl && vtw) {
		if (!ctl->clidx && vtw->msl_class)
			ctl += vtw->msl_class;
		else
			KASSERT(ctl->clidx == vtw->msl_class);

		res->valid = 1;
		res->v4 = 0;

		res->faddr.v6 = v6->faddr;
		res->laddr.v6 = v6->laddr;
		res->fport = v6->fport;
		res->lport = v6->lport;
		res->vtw = vtw;		// netlock held over call(s)
		res->ctl = ctl;

		res->v6only = vtw->v6only;
		res->reuse_addr = vtw->reuse_addr;
		res->reuse_port = vtw->reuse_port;

		res->snd_nxt = vtw->snd_nxt;
		res->rcv_nxt = vtw->rcv_nxt;
		res->rcv_wnd = vtw->rcv_wnd;
		res->uid = vtw->uid;
	}

	return res->valid;
}

static int
tcp_next_port_v6(void *arg, struct vestigial_inpcb *res)
{
	struct tcp_ports_iterator *it = arg;
	vtw_t *vtw = 0;

	if (it->ctl)
		vtw = vtw_next_port_v6(it);

	if (!vtw)
		it->ctl = 0;

	return vtw_export_v6(it->ctl, vtw, res);
}

static int
tcp_lookup_v6(const struct in6_addr *faddr, uint16_t fport,
    const struct in6_addr *laddr, uint16_t lport,
    struct vestigial_inpcb *res)
{
	vtw_ctl_t *ctl;
	vtw_t *vtw;

	db_trace(KTR_VTW
		 , (res, "vtw: lookup %6A:%P %6A:%P"
		    , db_store(faddr, sizeof (*faddr)), fport
		    , db_store(laddr, sizeof (*laddr)), lport));

	vtw = vtw_lookup_hash_v6((ctl = &vtw_tcpv6[0])
				 , faddr, fport
				 , laddr, lport, 0);

	return vtw_export_v6(ctl, vtw, res);
}

static vestigial_hooks_t tcp_hooks = {
	.init_ports4	= tcp_init_ports_v4,
	.next_port4	= tcp_next_port_v4,
	.lookup4	= tcp_lookup_v4,
	.init_ports6	= tcp_init_ports_v6,
	.next_port6	= tcp_next_port_v6,
	.lookup6	= tcp_lookup_v6,
};

static bool
vtw_select(int af, fatp_ctl_t **fatp, vtw_ctl_t **ctlp)
{
	fatp_ctl_t *fat;
	vtw_ctl_t *ctl;

	switch (af) {
	case AF_INET:
		fat = &fat_tcpv4;
		ctl = &vtw_tcpv4[0];
		break;
	case AF_INET6:
		fat = &fat_tcpv6;
		ctl = &vtw_tcpv6[0];
		break;
	default:
		return false;
	}
	if (fatp != NULL)
		*fatp = fat;
	if (ctlp != NULL)
		*ctlp = ctl;
	return true;
}

/*!\brief initialize controlling instance
 */
static int
vtw_control_init(int af)
{
	fatp_ctl_t *fat;
	vtw_ctl_t *ctl;
	fatp_t *fat_base;
	fatp_t **fat_hash;
	vtw_t *ctl_base_v;
	uint32_t n, m;
	size_t sz;

	KASSERT(powerof2(tcp_vtw_entries));

	if (!vtw_select(af, &fat, &ctl))
		return EAFNOSUPPORT;

	if (fat->hash != NULL) {
		KASSERT(fat->base != NULL && ctl->base.v != NULL);
		return 0;
	}

	/* Allocate 10% more capacity in the fat pointers.
	 * We should only need ~#hash additional based on
	 * how they age, but TIME_WAIT assassination could cause
	 * sparse fat pointer utilisation.
	 */
	m = 512;
	n = 2*m + (11 * (tcp_vtw_entries / fatp_ntags())) / 10;
	sz = (ctl->is_v4 ? sizeof(vtw_v4_t) : sizeof(vtw_v6_t));

	fat_hash = kmem_zalloc(2*m * sizeof(fatp_t *), KM_NOSLEEP);

	if (fat_hash == NULL) {
		printf("%s: could not allocate %zu bytes for "
		    "hash anchors", __func__, 2*m * sizeof(fatp_t *));
		return ENOMEM;
	}

	fat_base = kmem_zalloc(2*n * sizeof(fatp_t), KM_NOSLEEP);

	if (fat_base == NULL) {
		kmem_free(fat_hash, 2*m * sizeof (fatp_t *));
		printf("%s: could not allocate %zu bytes for "
		    "fatp_t array", __func__, 2*n * sizeof(fatp_t));
		return ENOMEM;
	}

	ctl_base_v = kmem_zalloc(tcp_vtw_entries * sz, KM_NOSLEEP);

	if (ctl_base_v == NULL) {
		kmem_free(fat_hash, 2*m * sizeof (fatp_t *));
		kmem_free(fat_base, 2*n * sizeof(fatp_t));
		printf("%s: could not allocate %zu bytes for "
		    "vtw_t array", __func__, tcp_vtw_entries * sz);
		return ENOMEM;
	}

	fatp_init(fat, n, m, fat_base, fat_hash);

	vtw_init(fat, ctl, tcp_vtw_entries, ctl_base_v);

	return 0;
}
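
/* Editor's worked example of the sizing above (the entry count is
 * hypothetical): with tcp_vtw_entries == 8192 and fatp_ntags() == 15,
 * m == 512 buckets per hash and n == 2*512 + (11 * (8192/15)) / 10
 * == 1024 + 600 == 1624 fat pointers per hash (3248 fatp_t's in all).
 */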

/*!\brief select controlling instance
 */
static vtw_ctl_t *
vtw_control(int af, uint32_t msl)
{
	fatp_ctl_t *fat;
	vtw_ctl_t *ctl;
	int msl_class = msl_to_class(msl);

	if (!vtw_select(af, &fat, &ctl))
		return NULL;

	if (!fat->base || !ctl->base.v)
		return NULL;

	if (!tcp_vtw_was_enabled) {
		/* This guarantees timer ticks until we no longer need them.
		 */
		tcp_vtw_was_enabled = 1;

		callout_schedule(&vtw_cs, hz / 5);

		tcbtable.vestige = &tcp_hooks;
	}

	return ctl + msl_class;
}

/*!\brief add TCP pcb to vestigial timewait
 */
int
vtw_add(int af, struct tcpcb *tp)
{
#ifdef VTW_DEBUG
	int enable;
#endif
	vtw_ctl_t *ctl;
	vtw_t *vtw;

	KASSERT(mutex_owned(softnet_lock));

	ctl = vtw_control(af, tp->t_msl);
	if (!ctl)
		return 0;

#ifdef VTW_DEBUG
	enable = (af == AF_INET) ? tcp4_vtw_enable : tcp6_vtw_enable;
#endif

	vtw = vtw_alloc(ctl);

	if (vtw) {
		vtw->snd_nxt = tp->snd_nxt;
		vtw->rcv_nxt = tp->rcv_nxt;

		switch (af) {
		case AF_INET: {
			struct inpcb *inp = tp->t_inpcb;
			vtw_v4_t *v4 = (void*)vtw;

			v4->faddr = inp->inp_faddr.s_addr;
			v4->laddr = inp->inp_laddr.s_addr;
			v4->fport = inp->inp_fport;
			v4->lport = inp->inp_lport;

			vtw->reuse_port = !!(inp->inp_socket->so_options
					     & SO_REUSEPORT);
			vtw->reuse_addr = !!(inp->inp_socket->so_options
					     & SO_REUSEADDR);
			vtw->v6only = 0;
			vtw->uid = inp->inp_socket->so_uidinfo->ui_uid;

			vtw_inshash_v4(ctl, vtw);

#ifdef VTW_DEBUG
			/* Immediate lookup (connected and port) to
			 * ensure at least that works!
			 */
			if (enable & 4) {
				KASSERT(vtw_lookup_hash_v4
					(ctl
					 , inp->inp_faddr.s_addr, inp->inp_fport
					 , inp->inp_laddr.s_addr, inp->inp_lport
					 , 0)
					== vtw);
				KASSERT(vtw_lookup_hash_v4
					(ctl
					 , inp->inp_faddr.s_addr, inp->inp_fport
					 , inp->inp_laddr.s_addr, inp->inp_lport
					 , 1));
			}
			/* Immediate port iterator functionality check: not wild
			 */
			if (enable & 8) {
				struct tcp_ports_iterator *it;
				struct vestigial_inpcb res;
				int cnt = 0;

				it = tcp_init_ports_v4(inp->inp_laddr
						       , inp->inp_lport, 0);

				while (tcp_next_port_v4(it, &res)) {
					++cnt;
				}
				KASSERT(cnt);
			}
			/* Immediate port iterator functionality check: wild
			 */
			if (enable & 16) {
				struct tcp_ports_iterator *it;
				struct vestigial_inpcb res;
				struct in_addr any;
				int cnt = 0;

				any.s_addr = htonl(INADDR_ANY);

				it = tcp_init_ports_v4(any, inp->inp_lport, 1);

				while (tcp_next_port_v4(it, &res)) {
					++cnt;
				}
				KASSERT(cnt);
			}
#endif /* VTW_DEBUG */
			break;
		}

		case AF_INET6: {
			struct in6pcb *inp = tp->t_in6pcb;
			vtw_v6_t *v6 = (void*)vtw;

			v6->faddr = inp->in6p_faddr;
			v6->laddr = inp->in6p_laddr;
			v6->fport = inp->in6p_fport;
			v6->lport = inp->in6p_lport;

			vtw->reuse_port = !!(inp->in6p_socket->so_options
					     & SO_REUSEPORT);
			vtw->reuse_addr = !!(inp->in6p_socket->so_options
					     & SO_REUSEADDR);
			vtw->v6only = !!(inp->in6p_flags
					 & IN6P_IPV6_V6ONLY);
			vtw->uid = inp->in6p_socket->so_uidinfo->ui_uid;

			vtw_inshash_v6(ctl, vtw);
#ifdef VTW_DEBUG
			/* Immediate lookup (connected and port) to
			 * ensure at least that works!
			 */
			if (enable & 4) {
				KASSERT(vtw_lookup_hash_v6(ctl
					, &inp->in6p_faddr, inp->in6p_fport
					, &inp->in6p_laddr, inp->in6p_lport
					, 0)
					== vtw);
				KASSERT(vtw_lookup_hash_v6
					(ctl
					 , &inp->in6p_faddr, inp->in6p_fport
					 , &inp->in6p_laddr, inp->in6p_lport
					 , 1));
			}
			/* Immediate port iterator functionality check: not wild
			 */
			if (enable & 8) {
				struct tcp_ports_iterator *it;
				struct vestigial_inpcb res;
				int cnt = 0;

				it = tcp_init_ports_v6(&inp->in6p_laddr
						       , inp->in6p_lport, 0);

				while (tcp_next_port_v6(it, &res)) {
					++cnt;
				}
				KASSERT(cnt);
			}
			/* Immediate port iterator functionality check: wild
			 */
			if (enable & 16) {
				struct tcp_ports_iterator *it;
				struct vestigial_inpcb res;
				static struct in6_addr any = IN6ADDR_ANY_INIT;
				int cnt = 0;

				it = tcp_init_ports_v6(&any
						       , inp->in6p_lport, 1);

				while (tcp_next_port_v6(it, &res)) {
					++cnt;
				}
				KASSERT(cnt);
			}
#endif /* VTW_DEBUG */
			break;
		}
		}

		tcp_canceltimers(tp);
		tp = tcp_close(tp);
		KASSERT(!tp);

		return 1;
	}

	return 0;
}

/*!\brief restart timer for vestigial time-wait entry
 */
static void
vtw_restart_v4(vestigial_inpcb_t *vp)
{
	vtw_v4_t copy = *(vtw_v4_t*)vp->vtw;
	vtw_t *vtw;
	vtw_t *cp = &copy.common;
	vtw_ctl_t *ctl;

	KASSERT(mutex_owned(softnet_lock));

	db_trace(KTR_VTW
		 , (vp->vtw, "vtw: restart %A:%P %A:%P"
		    , vp->faddr.v4.s_addr, vp->fport
		    , vp->laddr.v4.s_addr, vp->lport));

	/* Class might have changed, so have a squiz.
	 */
	ctl = vtw_control(AF_INET, class_to_msl(cp->msl_class));
	vtw = vtw_alloc(ctl);

	if (vtw) {
		vtw_v4_t *v4 = (void*)vtw;

		/* Safe now to unhash the old entry
		 */
		vtw_del(vp->ctl, vp->vtw);

		vtw->snd_nxt = cp->snd_nxt;
		vtw->rcv_nxt = cp->rcv_nxt;

		v4->faddr = copy.faddr;
		v4->laddr = copy.laddr;
		v4->fport = copy.fport;
		v4->lport = copy.lport;

		vtw->reuse_port = cp->reuse_port;
		vtw->reuse_addr = cp->reuse_addr;
		vtw->v6only = 0;
		vtw->uid = cp->uid;

		vtw_inshash_v4(ctl, vtw);
	}

	vp->valid = 0;
}

/*!\brief restart timer for vestigial time-wait entry
 */
static void
vtw_restart_v6(vestigial_inpcb_t *vp)
{
	vtw_v6_t copy = *(vtw_v6_t*)vp->vtw;
	vtw_t *vtw;
	vtw_t *cp = &copy.common;
	vtw_ctl_t *ctl;

	KASSERT(mutex_owned(softnet_lock));

	db_trace(KTR_VTW
		 , (vp->vtw, "vtw: restart %6A:%P %6A:%P"
		    , db_store(&vp->faddr.v6, sizeof (vp->faddr.v6))
		    , vp->fport
		    , db_store(&vp->laddr.v6, sizeof (vp->laddr.v6))
		    , vp->lport));

	/* Class might have changed, so have a squiz.
	 */
	ctl = vtw_control(AF_INET6, class_to_msl(cp->msl_class));
	vtw = vtw_alloc(ctl);

	if (vtw) {
		vtw_v6_t *v6 = (void*)vtw;

		/* Safe now to unhash the old entry
		 */
		vtw_del(vp->ctl, vp->vtw);

		vtw->snd_nxt = cp->snd_nxt;
		vtw->rcv_nxt = cp->rcv_nxt;

		v6->faddr = copy.faddr;
		v6->laddr = copy.laddr;
		v6->fport = copy.fport;
		v6->lport = copy.lport;

		vtw->reuse_port = cp->reuse_port;
		vtw->reuse_addr = cp->reuse_addr;
		vtw->v6only = cp->v6only;
		vtw->uid = cp->uid;

		vtw_inshash_v6(ctl, vtw);
	}

	vp->valid = 0;
}

/*!\brief restart timer for vestigial time-wait entry
 */
void
vtw_restart(vestigial_inpcb_t *vp)
{
	if (!vp || !vp->valid)
		return;

	if (vp->v4)
		vtw_restart_v4(vp);
	else
		vtw_restart_v6(vp);
}

int
sysctl_tcp_vtw_enable(SYSCTLFN_ARGS)
{
	int en, rc;
	struct sysctlnode node;

	node = *rnode;
	en = *(int *)rnode->sysctl_data;
	node.sysctl_data = &en;

	rc = sysctl_lookup(SYSCTLFN_CALL(&node));
	if (rc != 0 || newp == NULL)
		return rc;

	if (rnode->sysctl_data != &tcp4_vtw_enable &&
	    rnode->sysctl_data != &tcp6_vtw_enable)
		rc = ENOENT;
	else if ((en & 1) == 0)
		rc = 0;
	else if (rnode->sysctl_data == &tcp4_vtw_enable)
		rc = vtw_control_init(AF_INET);
	else /* rnode->sysctl_data == &tcp6_vtw_enable */
		rc = vtw_control_init(AF_INET6);

	if (rc == 0)
		*(int *)rnode->sysctl_data = en;

	return rc;
}

int
vtw_earlyinit(void)
{
	int i, rc;

	callout_init(&vtw_cs, 0);
	callout_setfunc(&vtw_cs, vtw_tick, 0);

	for (i = 0; i < VTW_NCLASS; ++i) {
		vtw_tcpv4[i].is_v4 = 1;
		vtw_tcpv6[i].is_v6 = 1;
	}

	if ((tcp4_vtw_enable & 1) != 0 &&
	    (rc = vtw_control_init(AF_INET)) != 0)
		return rc;

	if ((tcp6_vtw_enable & 1) != 0 &&
	    (rc = vtw_control_init(AF_INET6)) != 0)
		return rc;

	return 0;
}

#ifdef VTW_DEBUG
#include <sys/syscallargs.h>
#include <sys/sysctl.h>

/*!\brief add lalp, fafp entries for debug
 */
int
vtw_debug_add(int af, sin_either_t *la, sin_either_t *fa, int msl, int msl_class)
{
	vtw_ctl_t *ctl;
	vtw_t *vtw;

	ctl = vtw_control(af, msl ? msl : class_to_msl(msl_class));
	if (!ctl)
		return 0;

	vtw = vtw_alloc(ctl);

	if (vtw) {
		vtw->snd_nxt = 0;
		vtw->rcv_nxt = 0;

		switch (af) {
		case AF_INET: {
			vtw_v4_t *v4 = (void*)vtw;

			v4->faddr = fa->sin_addr.v4.s_addr;
			v4->laddr = la->sin_addr.v4.s_addr;
			v4->fport = fa->sin_port;
			v4->lport = la->sin_port;

			vtw->reuse_port = 1;
			vtw->reuse_addr = 1;
			vtw->v6only = 0;
			vtw->uid = 0;

			vtw_inshash_v4(ctl, vtw);
			break;
		}

		case AF_INET6: {
			vtw_v6_t *v6 = (void*)vtw;

			v6->faddr = fa->sin_addr.v6;
			v6->laddr = la->sin_addr.v6;

			v6->fport = fa->sin_port;
			v6->lport = la->sin_port;

			vtw->reuse_port = 1;
			vtw->reuse_addr = 1;
			vtw->v6only = 0;
			vtw->uid = 0;

			vtw_inshash_v6(ctl, vtw);
			break;
		}

		default:
			break;
		}

		return 1;
	}

	return 0;
}

static int vtw_syscall = 0;

static int
vtw_debug_process(vtw_sysargs_t *ap)
{
	struct vestigial_inpcb vestige;
	int rc = 0;

	mutex_enter(softnet_lock);

	switch (ap->op) {
	case 0:		// insert
		vtw_debug_add(ap->la.sin_family
			      , &ap->la
			      , &ap->fa
			      , TCPTV_MSL
			      , 0);
		break;

	case 1:		// lookup
	case 2:		// restart
		switch (ap->la.sin_family) {
		case AF_INET:
			if (tcp_lookup_v4(ap->fa.sin_addr.v4, ap->fa.sin_port,
					  ap->la.sin_addr.v4, ap->la.sin_port,
					  &vestige)) {
				if (ap->op == 2) {
					vtw_restart(&vestige);
				}
				rc = 0;
			} else
				rc = ESRCH;
			break;

		case AF_INET6:
			if (tcp_lookup_v6(&ap->fa.sin_addr.v6, ap->fa.sin_port,
					  &ap->la.sin_addr.v6, ap->la.sin_port,
					  &vestige)) {
				if (ap->op == 2) {
					vtw_restart(&vestige);
				}
				rc = 0;
			} else
				rc = ESRCH;
			break;
		default:
			rc = EINVAL;
		}
		break;

	default:
		rc = EINVAL;
	}

	mutex_exit(softnet_lock);
	return rc;
}

struct sys_vtw_args {
	syscallarg(const vtw_sysargs_t *) req;
	syscallarg(size_t) len;
};

static int
vtw_sys(struct lwp *l, const void *_, register_t *retval)
{
	const struct sys_vtw_args *uap = _;
	void *buf;
	int rc;
	size_t len = SCARG(uap, len);

	if (len != sizeof (vtw_sysargs_t))
		return EINVAL;

	buf = kmem_alloc(len, KM_SLEEP);
	if (!buf)
		return ENOMEM;

	rc = copyin(SCARG(uap, req), buf, len);
	if (!rc) {
		rc = vtw_debug_process(buf);
	}
	kmem_free(buf, len);

	return rc;
}

static void
vtw_sanity_check(void)
{
	vtw_ctl_t *ctl;
	vtw_t *vtw;
	int i;
	int n;

	for (i = 0; i < VTW_NCLASS; ++i) {
		ctl = &vtw_tcpv4[i];

		if (!ctl->base.v || ctl->nalloc)
			continue;

		for (n = 0, vtw = ctl->base.v; ; ) {
			++n;
			vtw = vtw_next(ctl, vtw);
			if (vtw == ctl->base.v)
				break;
		}
		db_trace(KTR_VTW
			 , (ctl, "sanity: class %x n %x nfree %x"
			    , i, n, ctl->nfree));

		KASSERT(n == ctl->nfree);
	}

	for (i = 0; i < VTW_NCLASS; ++i) {
		ctl = &vtw_tcpv6[i];

		if (!ctl->base.v || ctl->nalloc)
			continue;

		for (n = 0, vtw = ctl->base.v; ; ) {
			++n;
			vtw = vtw_next(ctl, vtw);
			if (vtw == ctl->base.v)
				break;
		}
		db_trace(KTR_VTW
			 , (ctl, "sanity: class %x n %x nfree %x"
			    , i, n, ctl->nfree));
		KASSERT(n == ctl->nfree);
	}
}

/*!\brief Initialise debug support.
 */
static void
vtw_debug_init(void)
{
	int i;

	vtw_sanity_check();

	if (vtw_syscall)
		return;

	for (i = 511; i; --i) {
		if (sysent[i].sy_call == sys_nosys) {
			sysent[i].sy_call = vtw_sys;
			sysent[i].sy_narg = 2;
			sysent[i].sy_argsize = sizeof (struct sys_vtw_args);
			sysent[i].sy_flags = 0;

			vtw_syscall = i;
			break;
		}
	}
	if (i) {
		const struct sysctlnode *node;
		uint32_t flags;

		flags = sysctl_root.sysctl_flags;

		sysctl_root.sysctl_flags |= CTLFLAG_READWRITE;
		sysctl_root.sysctl_flags &= ~CTLFLAG_PERMANENT;

		sysctl_createv(0, 0, 0, &node,
			       CTLFLAG_PERMANENT, CTLTYPE_NODE,
			       "koff",
			       SYSCTL_DESCR("Kernel Obscure Feature Finder"),
			       0, 0, 0, 0, CTL_CREATE, CTL_EOL);

		if (!node) {
			sysctl_createv(0, 0, 0, &node,
				       CTLFLAG_PERMANENT, CTLTYPE_NODE,
				       "koffka",
				       SYSCTL_DESCR("The Real(tm) Kernel"
						    " Obscure Feature Finder"),
				       0, 0, 0, 0, CTL_CREATE, CTL_EOL);
		}
		if (node) {
			sysctl_createv(0, 0, 0, 0,
				       CTLFLAG_PERMANENT|CTLFLAG_READONLY,
				       CTLTYPE_INT, "vtw_debug_syscall",
				       SYSCTL_DESCR("vtw debug"
						    " system call number"),
				       0, 0, &vtw_syscall, 0, node->sysctl_num,
				       CTL_CREATE, CTL_EOL);
		}
		sysctl_root.sysctl_flags = flags;
	}
}
#else /* !VTW_DEBUG */
static void
vtw_debug_init(void)
{
	return;
}
#endif /* !VTW_DEBUG */