/*
 * Copyright (c) 2011 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Coyote Point Systems, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Reduces the resources demanded by TCP sessions in the TIME_WAIT state
 * using methods called Vestigial Time-Wait (VTW) and Maximum Segment
 * Lifetime Truncation (MSLT).
 *
 * MSLT and VTW were contributed by Coyote Point Systems, Inc.
 *
 * Even after a TCP session enters the TIME_WAIT state, its corresponding
 * socket and protocol control blocks (PCBs) stick around until the TCP
 * Maximum Segment Lifetime (MSL) expires. On a host whose workload
 * necessarily creates and closes down many TCP sockets, the sockets & PCBs
 * for TCP sessions in TIME_WAIT state amount to many megabytes of dead
 * weight in RAM.
 *
 * Maximum Segment Lifetime Truncation (MSLT) assigns each TCP session to
 * a class based on the nearness of the peer. Corresponding to each class
 * is an MSL, and a session uses the MSL of its class. The classes are
 * loopback (local host equals remote host), local (local host and remote
 * host are on the same link/subnet), and remote (local host and remote
 * host communicate via one or more gateways). Classes corresponding to
 * nearer peers have lower MSLs by default: 2 seconds for loopback, 10
 * seconds for local, 60 seconds for remote. Loopback and local sessions
 * expire more quickly when MSLT is used.
 *
 * Vestigial Time-Wait (VTW) replaces a TIME_WAIT session's PCB/socket
 * dead weight with a compact representation of the session, called a
 * "vestigial PCB". VTW data structures are designed to be very fast and
 * memory-efficient: for fast insertion and lookup of vestigial PCBs,
 * the PCBs are stored in a hash table that is designed to minimize the
 * number of cacheline visits per lookup/insertion. The memory both
 * for vestigial PCBs and for elements of the PCB hashtable comes from
 * fixed-size pools, and linked data structures exploit this to conserve
 * memory by representing references with a narrow index/offset from the
 * start of a pool instead of a pointer. When space for new vestigial PCBs
 * runs out, VTW makes room by discarding old vestigial PCBs, oldest first.
 * VTW cooperates with MSLT.
 *
 * It may help to think of VTW as a "FIN cache" by analogy to the SYN
 * cache.
 *
 * A 2.8-GHz Pentium 4 running a test workload that creates TIME_WAIT
 * sessions as fast as it can is approximately 17% idle when VTW is active
 * versus 0% idle when VTW is inactive. It has 103 megabytes more free RAM
 * when VTW is active (approximately 64k vestigial PCBs are created) than
 * when it is inactive.
 */
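
/*
 * Operator's view, as a hedged illustration rather than a committed
 * interface: VTW is armed per address family through the
 * tcp{4,6}_vtw_enable handlers below, with the arena sized by
 * tcp_vtw_entries (asserted to be a power of 2 in vtw_control_init()).
 * On kernels exporting the corresponding sysctl knobs, enabling would
 * look something like (knob names assumed here, not fixed by this file):
 *
 *	sysctl -w net.inet.tcp.vtw_enable=1
 *	sysctl -w net.inet6.tcp6.vtw_enable=1
 */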

#include <sys/cdefs.h>

#ifdef _KERNEL_OPT
#include "opt_ddb.h"
#include "opt_inet.h"
#include "opt_inet_csum.h"
#include "opt_tcp_debug.h"
#endif

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kmem.h>
#include <sys/mbuf.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/errno.h>
#include <sys/syslog.h>
#include <sys/pool.h>
#include <sys/domain.h>
#include <sys/kernel.h>
#include <net/if.h>
#include <net/if_types.h>

#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <netinet/ip.h>
#include <netinet/in_pcb.h>
#include <netinet/in_var.h>
#include <netinet/ip_var.h>
#include <netinet/in_offload.h>
#include <netinet/ip6.h>
#include <netinet6/ip6_var.h>
#include <netinet6/in6_pcb.h>
#include <netinet6/in6_var.h>
#include <netinet/icmp6.h>

#include <netinet/tcp.h>
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet/tcp_private.h>

#include <netinet/tcp_vtw.h>

__KERNEL_RCSID(0, "$NetBSD: tcp_vtw.c,v 1.24.8.1 2025/08/02 05:57:50 perseant Exp $");

#define db_trace(__a, __b) do { } while (/*CONSTCOND*/0)

static void vtw_debug_init(void);

fatp_ctl_t fat_tcpv4;
fatp_ctl_t fat_tcpv6;
vtw_ctl_t vtw_tcpv4[VTW_NCLASS];
vtw_ctl_t vtw_tcpv6[VTW_NCLASS];
vtw_stats_t vtw_stats;

/* We provide state for the lookup_ports iterator.
 * Since we are currently netlock-protected, one instance suffices.
 * If we were finer-grained, we would have one per CPU.
 * I do not want to be in the business of alloc/free.
 * The best alternative would be to allocate on the caller's
 * stack, but that would require them to know the struct,
 * or at least its size.
 * See how she goes.
 */
struct tcp_ports_iterator {
	union {
		struct in_addr v4;
		struct in6_addr v6;
	} addr;
	u_int port;

	uint32_t wild : 1;

	vtw_ctl_t *ctl;
	fatp_t *fp;

	uint16_t slot_idx;
	uint16_t ctl_idx;
};

static struct tcp_ports_iterator tcp_ports_iterator_v4;
static struct tcp_ports_iterator tcp_ports_iterator_v6;

static int vtw_age(vtw_ctl_t *, struct timeval *);

/*!\brief allocate a fat pointer from a collection.
 */
static fatp_t *
fatp_alloc(fatp_ctl_t *fat)
{
	fatp_t *fp = 0;

	if (fat->nfree) {
		fp = fat->free;
		if (fp) {
			fat->free = fatp_next(fat, fp);
			--fat->nfree;
			++fat->nalloc;
			fp->nxt = 0;

			KASSERT(!fp->inuse);
		}
	}

	return fp;
}

/*!\brief free a fat pointer.
 */
static void
fatp_free(fatp_ctl_t *fat, fatp_t *fp)
{
	if (fp) {
		KASSERT(!fp->inuse);
		KASSERT(!fp->nxt);

		fp->nxt = fatp_index(fat, fat->free);
		fat->free = fp;

		++fat->nfree;
		--fat->nalloc;
	}
}

/*!\brief initialise a collection of fat pointers.
 *
 *\param n	# fat pointers (per hash; 2n are allocated in total)
 *\param m	# hash buckets (per hash; must be a power of 2)
 *
 * We allocate 2x as much, as we have two hashes: full and lport only.
 */
static void
fatp_init(fatp_ctl_t *fat, uint32_t n, uint32_t m,
    fatp_t *fat_base, fatp_t **fat_hash)
{
	fatp_t *fp;

	KASSERT(n <= FATP_MAX / 2);

	fat->hash = fat_hash;
	fat->base = fat_base;

	fat->port = &fat->hash[m];

	fat->mask = m - 1;	// m must be a power of 2
	fat->lim = fat->base + 2*n - 1;
	fat->nfree = 0;
	fat->nalloc = 2*n;

	/* Initialise the free list.
	 */
	for (fp = fat->lim; fp >= fat->base; --fp) {
		fatp_free(fat, fp);
	}
}

/*
 * The `xtra' is XORed into the tag stored.
 */
static uint32_t fatp_xtra[] = {
	0x11111111,0x22222222,0x33333333,0x44444444,
	0x55555555,0x66666666,0x77777777,0x88888888,
	0x12121212,0x21212121,0x34343434,0x43434343,
	0x56565656,0x65656565,0x78787878,0x87878787,
	0x11221122,0x22112211,0x33443344,0x44334433,
	0x55665566,0x66556655,0x77887788,0x88778877,
	0x11112222,0x22221111,0x33334444,0x44443333,
	0x55556666,0x66665555,0x77778888,0x88887777,
};

/*!\brief turn a {fatp_t*,slot} into an integral key.
 *
 * The key can be used to obtain the fatp_t, and the slot,
 * as it directly encodes them.
 */
static inline uint32_t
fatp_key(fatp_ctl_t *fat, fatp_t *fp, uint32_t slot)
{
	CTASSERT(CACHE_LINE_SIZE == 32 ||
	    CACHE_LINE_SIZE == 64 ||
	    CACHE_LINE_SIZE == 128 ||
	    CACHE_LINE_SIZE == 256);

	switch (fatp_ntags()) {
	case 7:
		return (fatp_index(fat, fp) << 3) | slot;
	case 15:
		return (fatp_index(fat, fp) << 4) | slot;
	case 31:
		return (fatp_index(fat, fp) << 5) | slot;
	default:
		KASSERT(0 && "no support, for no good reason");
		return ~0;
	}
}

static inline uint32_t
fatp_slot_from_key(fatp_ctl_t *fat, uint32_t key)
{
	CTASSERT(CACHE_LINE_SIZE == 32 ||
	    CACHE_LINE_SIZE == 64 ||
	    CACHE_LINE_SIZE == 128 ||
	    CACHE_LINE_SIZE == 256);

	switch (fatp_ntags()) {
	case 7:
		return key & 7;
	case 15:
		return key & 15;
	case 31:
		return key & 31;
	default:
		KASSERT(0 && "no support, for no good reason");
		return ~0;
	}
}

static inline fatp_t *
fatp_from_key(fatp_ctl_t *fat, uint32_t key)
{
	CTASSERT(CACHE_LINE_SIZE == 32 ||
	    CACHE_LINE_SIZE == 64 ||
	    CACHE_LINE_SIZE == 128 ||
	    CACHE_LINE_SIZE == 256);

	switch (fatp_ntags()) {
	case 7:
		key >>= 3;
		break;
	case 15:
		key >>= 4;
		break;
	case 31:
		key >>= 5;
		break;
	default:
		KASSERT(0 && "no support, for no good reason");
		return 0;
	}

	return key ? fat->base + key - 1 : 0;
}
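
/* The arena index is stored twice in one word: once shifted up by
 * idx_bits and once in the low idx_mask bits (idx_encode below).
 * idx_decode() accepts a value only when the two copies agree, so a
 * hash tag XORed with the wrong key decodes to ~0 and is rejected
 * without ever dereferencing a vtw_t.
 */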
static inline uint32_t
idx_encode(vtw_ctl_t *ctl, uint32_t idx)
{
	return (idx << ctl->idx_bits) | idx;
}

static inline uint32_t
idx_decode(vtw_ctl_t *ctl, uint32_t bits)
{
	uint32_t idx = bits & ctl->idx_mask;

	if (idx_encode(ctl, idx) == bits)
		return idx;
	else
		return ~0;
}

/*!\brief insert index into fatp hash
 *
 *\param idx - index of element being placed in hash chain
 *\param tag - 32-bit tag identifier
 *
 *\returns
 *	value which can be used to locate entry.
 *
 *\note
 *	we rely on the fact that there are unused high bits in the index
 *	for verification purposes on lookup.
 */
static inline uint32_t
fatp_vtw_inshash(fatp_ctl_t *fat, uint32_t idx, uint32_t tag, int which,
    void *dbg)
{
	fatp_t *fp;
	fatp_t **hash = (which ? fat->port : fat->hash);
	int i;

	fp = hash[tag & fat->mask];

	while (!fp || fatp_full(fp)) {
		fatp_t *fq;

		/* All entries are inuse at the top level.
		 * We allocate a spare, and push the top level
		 * down one. All entries in the fp we push down
		 * (think of a tape worm here) will be expelled sooner than
		 * any entries added subsequently to this hash bucket.
		 * This is a property of the time waits we are exploiting.
		 */

		fq = fatp_alloc(fat);
		if (!fq) {
			vtw_age(fat->vtw, 0);
			fp = hash[tag & fat->mask];
			continue;
		}

		fq->inuse = 0;
		fq->nxt = fatp_index(fat, fp);

		hash[tag & fat->mask] = fq;

		fp = fq;
	}

	KASSERT(!fatp_full(fp));

	/* Fill highest index first. Lookup is lowest first.
	 */
	for (i = fatp_ntags(); --i >= 0; ) {
		if (!((1 << i) & fp->inuse)) {
			break;
		}
	}

	fp->inuse |= 1 << i;
	fp->tag[i] = tag ^ idx_encode(fat->vtw, idx) ^ fatp_xtra[i];

	db_trace(KTR_VTW
	    , (fp, "fat: inuse %5.5x tag[%x] %8.8x"
	    , fp->inuse
	    , i, fp->tag[i]));

	return fatp_key(fat, fp, i);
}

static inline int
vtw_alive(const vtw_t *vtw)
{
	return vtw->hashed && vtw->expire.tv_sec;
}

static inline uint32_t
vtw_index_v4(vtw_ctl_t *ctl, vtw_v4_t *v4)
{
	if (ctl->base.v4 <= v4 && v4 <= ctl->lim.v4)
		return v4 - ctl->base.v4;

	KASSERT(0 && "vtw out of bounds");

	return ~0;
}

static inline uint32_t
vtw_index_v6(vtw_ctl_t *ctl, vtw_v6_t *v6)
{
	if (ctl->base.v6 <= v6 && v6 <= ctl->lim.v6)
		return v6 - ctl->base.v6;

	KASSERT(0 && "vtw out of bounds");

	return ~0;
}

static inline uint32_t
vtw_index(vtw_ctl_t *ctl, vtw_t *vtw)
{
	if (ctl->clidx)
		ctl = ctl->ctl;

	if (ctl->is_v4)
		return vtw_index_v4(ctl, (vtw_v4_t *)vtw);

	if (ctl->is_v6)
		return vtw_index_v6(ctl, (vtw_v6_t *)vtw);

	KASSERT(0 && "neither 4 nor 6. most curious.");

	return ~0;
}

static inline vtw_t *
vtw_from_index(vtw_ctl_t *ctl, uint32_t idx)
{
	if (ctl->clidx)
		ctl = ctl->ctl;

	/* See if the index looks like it might be an index.
	 * Bits set outside the valid index bits are a giveaway.
	 */
	idx = idx_decode(ctl, idx);

	if (idx == ~0) {
		return 0;
	} else if (ctl->is_v4) {
		vtw_v4_t *vtw = ctl->base.v4 + idx;

		return (ctl->base.v4 <= vtw && vtw <= ctl->lim.v4)
		    ? &vtw->common : 0;
	} else if (ctl->is_v6) {
		vtw_v6_t *vtw = ctl->base.v6 + idx;

		return (ctl->base.v6 <= vtw && vtw <= ctl->lim.v6)
		    ? &vtw->common : 0;
	} else {
		KASSERT(0 && "badness");
		return 0;
	}
}

/*!\brief return the next vtw after this one.
 *
 * Due to the differing sizes of the entries in differing
 * arenas, we have to ensure we ++ the correct pointer type.
 *
 * Also handles wrap.
 */
static inline vtw_t *
vtw_next(vtw_ctl_t *ctl, vtw_t *vtw)
{
	if (ctl->is_v4) {
		vtw_v4_t *v4 = (void*)vtw;

		vtw = &(++v4)->common;
	} else {
		vtw_v6_t *v6 = (void*)vtw;

		vtw = &(++v6)->common;
	}

	if (vtw > ctl->lim.v)
		vtw = ctl->base.v;

	return vtw;
}

/*!\brief remove entry from FATP hash chains
 */
static inline void
vtw_unhash(vtw_ctl_t *ctl, vtw_t *vtw)
{
	fatp_ctl_t *fat = ctl->fat;
	fatp_t *fp;
	uint32_t key = vtw->key;
	uint32_t tag, slot, idx;
	vtw_v4_t *v4 = (void*)vtw;
	vtw_v6_t *v6 = (void*)vtw;

	if (!vtw->hashed) {
		KASSERT(0 && "unhashed");
		return;
	}

	if (fat->vtw->is_v4) {
		tag = v4_tag(v4->faddr, v4->fport, v4->laddr, v4->lport);
	} else if (fat->vtw->is_v6) {
		tag = v6_tag(&v6->faddr, v6->fport, &v6->laddr, v6->lport);
	} else {
		tag = 0;
		KASSERT(0 && "not reached");
	}

	/* Remove from fat->hash[]
	 */
	slot = fatp_slot_from_key(fat, key);
	fp = fatp_from_key(fat, key);
	idx = vtw_index(ctl, vtw);

	db_trace(KTR_VTW
	    , (fp, "fat: del inuse %5.5x slot %x idx %x key %x tag %x"
	    , fp->inuse, slot, idx, key, tag));

	KASSERT(fp->inuse & (1 << slot));
	KASSERT(fp->tag[slot] == (tag ^ idx_encode(ctl, idx)
	    ^ fatp_xtra[slot]));

	if ((fp->inuse & (1 << slot))
	    && fp->tag[slot] == (tag ^ idx_encode(ctl, idx)
	    ^ fatp_xtra[slot])) {
		fp->inuse ^= 1 << slot;
		fp->tag[slot] = 0;

		/* When we delete entries, we do not compact. This is
		 * due to temporality. We add entries, and they
		 * (eventually) expire. Older entries will be further
		 * down the chain.
		 */
		if (!fp->inuse) {
			uint32_t hi = tag & fat->mask;
			fatp_t *fq = 0;
			fatp_t *fr = fat->hash[hi];

			while (fr && fr != fp) {
				fr = fatp_next(fat, fq = fr);
			}

			if (fr == fp) {
				if (fq) {
					fq->nxt = fp->nxt;
					fp->nxt = 0;
					fatp_free(fat, fp);
				} else {
					KASSERT(fat->hash[hi] == fp);

					if (fp->nxt) {
						fat->hash[hi]
						    = fatp_next(fat, fp);
						fp->nxt = 0;
						fatp_free(fat, fp);
					} else {
						/* retain for next use.
						 */
						;
					}
				}
			} else {
				fr = fat->hash[hi];

				do {
					db_trace(KTR_VTW
					    , (fr
					    , "fat:*del inuse %5.5x"
					    " nxt %x"
					    , fr->inuse, fr->nxt));

					fr = fatp_next(fat, fq = fr);
				} while (fr && fr != fp);

				KASSERT(0 && "oops");
			}
		}
		vtw->key ^= ~0;
	}

	if (fat->vtw->is_v4) {
		tag = v4_port_tag(v4->lport);
	} else if (fat->vtw->is_v6) {
		tag = v6_port_tag(v6->lport);
	}

	/* Remove from fat->port[]
	 */
	key = vtw->port_key;
	slot = fatp_slot_from_key(fat, key);
	fp = fatp_from_key(fat, key);
	idx = vtw_index(ctl, vtw);

	db_trace(KTR_VTW
	    , (fp, "fatport: del inuse %5.5x"
	    " slot %x idx %x key %x tag %x"
	    , fp->inuse, slot, idx, key, tag));

	KASSERT(fp->inuse & (1 << slot));
	KASSERT(fp->tag[slot] == (tag ^ idx_encode(ctl, idx)
	    ^ fatp_xtra[slot]));

	if ((fp->inuse & (1 << slot))
	    && fp->tag[slot] == (tag ^ idx_encode(ctl, idx)
	    ^ fatp_xtra[slot])) {
		fp->inuse ^= 1 << slot;
		fp->tag[slot] = 0;

		if (!fp->inuse) {
			uint32_t hi = tag & fat->mask;
			fatp_t *fq = 0;
			fatp_t *fr = fat->port[hi];

			while (fr && fr != fp) {
				fr = fatp_next(fat, fq = fr);
			}

			if (fr == fp) {
				if (fq) {
					fq->nxt = fp->nxt;
					fp->nxt = 0;
					fatp_free(fat, fp);
				} else {
					KASSERT(fat->port[hi] == fp);

					if (fp->nxt) {
						fat->port[hi]
						    = fatp_next(fat, fp);
						fp->nxt = 0;
						fatp_free(fat, fp);
					} else {
						/* retain for next use.
						 */
						;
					}
				}
			}
		}
		vtw->port_key ^= ~0;
	}

	vtw->hashed = 0;
}

/*!\brief remove entry from hash, possibly free.
 */
void
vtw_del(vtw_ctl_t *ctl, vtw_t *vtw)
{
	KASSERT(mutex_owned(softnet_lock));

	if (vtw->hashed) {
		++vtw_stats.del;
		vtw_unhash(ctl, vtw);
	}

	/* We only delete the oldest entry.
	 */
	if (vtw != ctl->oldest.v)
		return;

	--ctl->nalloc;
	++ctl->nfree;

	vtw->expire.tv_sec = 0;
	vtw->expire.tv_usec = ~0;

	if (!ctl->nalloc)
		ctl->oldest.v = 0;
	else
		ctl->oldest.v = vtw_next(ctl, vtw);
}

/*!\brief insert vestigial timewait in hash chain
 */
static void
vtw_inshash_v4(vtw_ctl_t *ctl, vtw_t *vtw)
{
	uint32_t idx = vtw_index(ctl, vtw);
	uint32_t tag;
	vtw_v4_t *v4 = (void*)vtw;

	KASSERT(mutex_owned(softnet_lock));
	KASSERT(!vtw->hashed);
	KASSERT(ctl->clidx == vtw->msl_class);

	++vtw_stats.ins;

	tag = v4_tag(v4->faddr, v4->fport,
	    v4->laddr, v4->lport);

	vtw->key = fatp_vtw_inshash(ctl->fat, idx, tag, 0, vtw);

	db_trace(KTR_VTW, (ctl
	    , "vtw: ins %8.8x:%4.4x %8.8x:%4.4x"
	    " tag %8.8x key %8.8x"
	    , v4->faddr, v4->fport
	    , v4->laddr, v4->lport
	    , tag
	    , vtw->key));

	tag = v4_port_tag(v4->lport);
	vtw->port_key = fatp_vtw_inshash(ctl->fat, idx, tag, 1, vtw);

	db_trace(KTR_VTW, (ctl, "vtw: ins %P - %4.4x tag %8.8x key %8.8x"
	    , v4->lport, v4->lport
	    , tag
	    , vtw->key));

	vtw->hashed = 1;
}

/*!\brief insert vestigial timewait in hash chain
 */
static void
vtw_inshash_v6(vtw_ctl_t *ctl, vtw_t *vtw)
{
	uint32_t idx = vtw_index(ctl, vtw);
	uint32_t tag;
	vtw_v6_t *v6 = (void*)vtw;

	KASSERT(mutex_owned(softnet_lock));
	KASSERT(!vtw->hashed);
	KASSERT(ctl->clidx == vtw->msl_class);

	++vtw_stats.ins;

	tag = v6_tag(&v6->faddr, v6->fport,
	    &v6->laddr, v6->lport);

	vtw->key = fatp_vtw_inshash(ctl->fat, idx, tag, 0, vtw);

	tag = v6_port_tag(v6->lport);
	vtw->port_key = fatp_vtw_inshash(ctl->fat, idx, tag, 1, vtw);

	db_trace(KTR_VTW, (ctl, "vtw: ins %P - %4.4x tag %8.8x key %8.8x"
	    , v6->lport, v6->lport
	    , tag
	    , vtw->key));

	vtw->hashed = 1;
}
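
/*!\brief lookup a vestigial timewait by 4-tuple or by local port.
 *
 * which == 0 probes the full {faddr,fport,laddr,lport} hash;
 * which == 1 probes the lport-only hash. Returns the live entry,
 * or 0 on a miss.
 */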
static vtw_t *
vtw_lookup_hash_v4(vtw_ctl_t *ctl, uint32_t faddr, uint16_t fport
    , uint32_t laddr, uint16_t lport
    , int which)
{
	vtw_v4_t *v4;
	vtw_t *vtw;
	uint32_t tag;
	fatp_t *fp;
	int i;
	uint32_t fatps = 0, probes = 0, losings = 0;

	if (!ctl || !ctl->fat)
		return 0;

	++vtw_stats.look[which];

	if (which) {
		tag = v4_port_tag(lport);
		fp = ctl->fat->port[tag & ctl->fat->mask];
	} else {
		tag = v4_tag(faddr, fport, laddr, lport);
		fp = ctl->fat->hash[tag & ctl->fat->mask];
	}

	while (fp && fp->inuse) {
		uint32_t inuse = fp->inuse;

		++fatps;

		for (i = 0; inuse && i < fatp_ntags(); ++i) {
			uint32_t idx;

			if (!(inuse & (1 << i)))
				continue;

			inuse ^= 1 << i;

			++probes;
			++vtw_stats.probe[which];

			idx = fp->tag[i] ^ tag ^ fatp_xtra[i];
			vtw = vtw_from_index(ctl, idx);

			if (!vtw) {
				/* Hopefully fast path.
				 */
				db_trace(KTR_VTW
				    , (fp, "vtw: fast %A:%P %A:%P"
				    " idx %x tag %x"
				    , faddr, fport
				    , laddr, lport
				    , idx, tag));
				continue;
			}

			v4 = (void*)vtw;

			/* The de-referencing of vtw is what we want to avoid.
			 * Losing.
			 */
			if (vtw_alive(vtw)
			    && ((which ? vtw->port_key : vtw->key)
			    == fatp_key(ctl->fat, fp, i))
			    && (which
			    || (v4->faddr == faddr && v4->laddr == laddr
			    && v4->fport == fport))
			    && v4->lport == lport) {
				++vtw_stats.hit[which];

				db_trace(KTR_VTW
				    , (fp, "vtw: hit %8.8x:%4.4x"
				    " %8.8x:%4.4x idx %x key %x"
				    , faddr, fport
				    , laddr, lport
				    , idx_decode(ctl, idx), vtw->key));

				KASSERT(vtw->hashed);

				goto out;
			}
			++vtw_stats.losing[which];
			++losings;

			if (vtw_alive(vtw)) {
				db_trace(KTR_VTW
				    , (fp, "vtw:!mis %8.8x:%4.4x"
				    " %8.8x:%4.4x key %x tag %x"
				    , faddr, fport
				    , laddr, lport
				    , fatp_key(ctl->fat, fp, i)
				    , v4_tag(faddr, fport
				    , laddr, lport)));
				db_trace(KTR_VTW
				    , (vtw, "vtw:!mis %8.8x:%4.4x"
				    " %8.8x:%4.4x key %x tag %x"
				    , v4->faddr, v4->fport
				    , v4->laddr, v4->lport
				    , vtw->key
				    , v4_tag(v4->faddr, v4->fport
				    , v4->laddr, v4->lport)));

				if (vtw->key == fatp_key(ctl->fat, fp, i)) {
					db_trace(KTR_VTW
					    , (vtw, "vtw:!mis %8.8x:%4.4x"
					    " %8.8x:%4.4x key %x"
					    " which %x"
					    , v4->faddr, v4->fport
					    , v4->laddr, v4->lport
					    , vtw->key
					    , which));

				} else {
					db_trace(KTR_VTW
					    , (vtw
					    , "vtw:!mis"
					    " key %8.8x != %8.8x"
					    " idx %x i %x which %x"
					    , vtw->key
					    , fatp_key(ctl->fat, fp, i)
					    , idx_decode(ctl, idx)
					    , i
					    , which));
				}
			} else {
				db_trace(KTR_VTW
				    , (fp
				    , "vtw:!mis free entry"
				    " idx %x vtw %p which %x"
				    , idx_decode(ctl, idx)
				    , vtw, which));
			}
		}

		if (fp->nxt) {
			fp = fatp_next(ctl->fat, fp);
		} else {
			break;
		}
	}
	++vtw_stats.miss[which];
	vtw = 0;
out:
	if (fatps > vtw_stats.max_chain[which])
		vtw_stats.max_chain[which] = fatps;
	if (probes > vtw_stats.max_probe[which])
		vtw_stats.max_probe[which] = probes;
	if (losings > vtw_stats.max_loss[which])
		vtw_stats.max_loss[which] = losings;

	return vtw;
}
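
/*!\brief as vtw_lookup_hash_v4(), but over the IPv6 arena.
 */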
static vtw_t *
vtw_lookup_hash_v6(vtw_ctl_t *ctl, const struct in6_addr *faddr, uint16_t fport
    , const struct in6_addr *laddr, uint16_t lport
    , int which)
{
	vtw_v6_t *v6;
	vtw_t *vtw;
	uint32_t tag;
	fatp_t *fp;
	int i;
	uint32_t fatps = 0, probes = 0, losings = 0;

	++vtw_stats.look[which];

	if (!ctl || !ctl->fat)
		return 0;

	if (which) {
		tag = v6_port_tag(lport);
		fp = ctl->fat->port[tag & ctl->fat->mask];
	} else {
		tag = v6_tag(faddr, fport, laddr, lport);
		fp = ctl->fat->hash[tag & ctl->fat->mask];
	}

	while (fp && fp->inuse) {
		uint32_t inuse = fp->inuse;

		++fatps;

		for (i = 0; inuse && i < fatp_ntags(); ++i) {
			uint32_t idx;

			if (!(inuse & (1 << i)))
				continue;

			inuse ^= 1 << i;

			++probes;
			++vtw_stats.probe[which];

			idx = fp->tag[i] ^ tag ^ fatp_xtra[i];
			vtw = vtw_from_index(ctl, idx);

			db_trace(KTR_VTW
			    , (fp, "probe: %2d %6A:%4.4x %6A:%4.4x idx %x"
			    , i
			    , db_store(faddr, sizeof (*faddr)), fport
			    , db_store(laddr, sizeof (*laddr)), lport
			    , idx_decode(ctl, idx)));

			if (!vtw) {
				/* Hopefully fast path.
				 */
				continue;
			}

			v6 = (void*)vtw;

			if (vtw_alive(vtw)
			    && ((which ? vtw->port_key : vtw->key)
			    == fatp_key(ctl->fat, fp, i))
			    && v6->lport == lport
			    && (which
			    || (v6->fport == fport
			    && !bcmp(&v6->faddr, faddr, sizeof (*faddr))
			    && !bcmp(&v6->laddr, laddr
			    , sizeof (*laddr))))) {
				++vtw_stats.hit[which];

				KASSERT(vtw->hashed);
				goto out;
			} else {
				++vtw_stats.losing[which];
				++losings;
			}
		}

		if (fp->nxt) {
			fp = fatp_next(ctl->fat, fp);
		} else {
			break;
		}
	}
	++vtw_stats.miss[which];
	vtw = 0;
out:
	if (fatps > vtw_stats.max_chain[which])
		vtw_stats.max_chain[which] = fatps;
	if (probes > vtw_stats.max_probe[which])
		vtw_stats.max_probe[which] = probes;
	if (losings > vtw_stats.max_loss[which])
		vtw_stats.max_loss[which] = losings;

	return vtw;
}

/*!\brief port iterator
 */
static vtw_t *
vtw_next_port_v4(struct tcp_ports_iterator *it)
{
	vtw_ctl_t *ctl = it->ctl;
	vtw_v4_t *v4;
	vtw_t *vtw;
	uint32_t tag;
	uint16_t lport = it->port;
	fatp_t *fp;
	int i;
	uint32_t fatps = 0, probes = 0, losings = 0;

	tag = v4_port_tag(lport);
	if (!it->fp) {
		it->fp = ctl->fat->port[tag & ctl->fat->mask];
		it->slot_idx = 0;
	}
	fp = it->fp;

	while (fp) {
		uint32_t inuse = fp->inuse;

		++fatps;

		for (i = it->slot_idx; inuse && i < fatp_ntags(); ++i) {
			uint32_t idx;

			if (!(inuse & (1 << i)))
				continue;

			inuse &= ~0U << i;

			if (i < it->slot_idx)
				continue;

			++vtw_stats.probe[1];
			++probes;

			idx = fp->tag[i] ^ tag ^ fatp_xtra[i];
			vtw = vtw_from_index(ctl, idx);

			if (!vtw) {
				/* Hopefully fast path.
				 */
				continue;
			}

			v4 = (void*)vtw;

			if (vtw_alive(vtw)
			    && vtw->port_key == fatp_key(ctl->fat, fp, i)
			    && v4->lport == lport) {
				++vtw_stats.hit[1];

				it->slot_idx = i + 1;

				goto out;
			} else if (vtw_alive(vtw)) {
				++vtw_stats.losing[1];
				++losings;

				db_trace(KTR_VTW
				    , (vtw, "vtw:!mis"
				    " port %8.8x:%4.4x %8.8x:%4.4x"
				    " key %x port %x"
				    , v4->faddr, v4->fport
				    , v4->laddr, v4->lport
				    , vtw->key
				    , lport));
			} else {
				/* Really losing here. We are coming
				 * up with references to free entries.
				 * Might find it better to use the
				 * traditional hash, or need more
				 * ad-hockery. One other ad-hockery
				 * would be to pull more into the
				 * cache line to reject the false
				 * hits.
				 */
				++vtw_stats.losing[1];
				++losings;
				db_trace(KTR_VTW
				    , (fp, "vtw:!mis port %x"
				    " - free entry idx %x vtw %p"
				    , lport
				    , idx_decode(ctl, idx)
				    , vtw));
			}
		}

		if (fp->nxt) {
			it->fp = fp = fatp_next(ctl->fat, fp);
			it->slot_idx = 0;
		} else {
			it->fp = 0;
			break;
		}
	}
	++vtw_stats.miss[1];

	vtw = 0;
out:
	if (fatps > vtw_stats.max_chain[1])
		vtw_stats.max_chain[1] = fatps;
	if (probes > vtw_stats.max_probe[1])
		vtw_stats.max_probe[1] = probes;
	if (losings > vtw_stats.max_loss[1])
		vtw_stats.max_loss[1] = losings;

	return vtw;
}

/*!\brief port iterator
 */
static vtw_t *
vtw_next_port_v6(struct tcp_ports_iterator *it)
{
	vtw_ctl_t *ctl = it->ctl;
	vtw_v6_t *v6;
	vtw_t *vtw;
	uint32_t tag;
	uint16_t lport = it->port;
	fatp_t *fp;
	int i;
	uint32_t fatps = 0, probes = 0, losings = 0;

	tag = v6_port_tag(lport);
	if (!it->fp) {
		it->fp = ctl->fat->port[tag & ctl->fat->mask];
		it->slot_idx = 0;
	}
	fp = it->fp;

	while (fp) {
		uint32_t inuse = fp->inuse;

		++fatps;

		for (i = it->slot_idx; inuse && i < fatp_ntags(); ++i) {
			uint32_t idx;

			if (!(inuse & (1 << i)))
				continue;

			inuse &= ~0U << i;

			if (i < it->slot_idx)
				continue;

			++vtw_stats.probe[1];
			++probes;

			idx = fp->tag[i] ^ tag ^ fatp_xtra[i];
			vtw = vtw_from_index(ctl, idx);

			if (!vtw) {
				/* Hopefully fast path.
				 */
				continue;
			}

			v6 = (void*)vtw;

			db_trace(KTR_VTW
			    , (vtw, "vtw: i %x idx %x fp->tag %x"
			    " tag %x xtra %x"
			    , i, idx_decode(ctl, idx)
			    , fp->tag[i], tag, fatp_xtra[i]));

			if (vtw_alive(vtw)
			    && vtw->port_key == fatp_key(ctl->fat, fp, i)
			    && v6->lport == lport) {
				++vtw_stats.hit[1];

				db_trace(KTR_VTW
				    , (fp, "vtw: nxt port %P - %4.4x"
				    " idx %x key %x"
				    , lport, lport
				    , idx_decode(ctl, idx), vtw->key));

				it->slot_idx = i + 1;
				goto out;
			} else if (vtw_alive(vtw)) {
				++vtw_stats.losing[1];
				++losings;

				db_trace(KTR_VTW
				    , (vtw, "vtw:!mis port %6A:%4.4x"
				    " %6A:%4.4x key %x port %x"
				    , db_store(&v6->faddr
				    , sizeof (v6->faddr))
				    , v6->fport
				    , db_store(&v6->laddr
				    , sizeof (v6->laddr))
				    , v6->lport
				    , vtw->key
				    , lport));
			} else {
				/* Really losing here. We are coming
				 * up with references to free entries.
				 * Might find it better to use the
				 * traditional hash, or need more
				 * ad-hockery. One other ad-hockery
				 * would be to pull more into the
				 * cache line to reject the false
				 * hits.
				 */
				++vtw_stats.losing[1];
				++losings;

				db_trace(KTR_VTW
				    , (fp
				    , "vtw:!mis port %x"
				    " - free entry idx %x vtw %p"
				    , lport, idx_decode(ctl, idx)
				    , vtw));
			}
		}

		if (fp->nxt) {
			it->fp = fp = fatp_next(ctl->fat, fp);
			it->slot_idx = 0;
		} else {
			it->fp = 0;
			break;
		}
	}
	++vtw_stats.miss[1];

	vtw = 0;
out:
	if (fatps > vtw_stats.max_chain[1])
		vtw_stats.max_chain[1] = fatps;
	if (probes > vtw_stats.max_probe[1])
		vtw_stats.max_probe[1] = probes;
	if (losings > vtw_stats.max_loss[1])
		vtw_stats.max_loss[1] = losings;

	return vtw;
}

/*!\brief initialise the VTW allocation arena
 *
 * There are 1+3 allocation classes:
 *	0	classless
 *	{1,2,3}	MSL-class based allocation
 *
 * The allocation arenas are all initialised. Classless gets all the
 * space. MSL-class based divides the arena, so that allocation
 * within a class can proceed without having to consider entries
 * (aka: cache lines) from different classes.
 *
 * Usually, we are completely classless or class-based, but there can be
 * transition periods, corresponding to dynamic adjustments in the config
 * by the operator.
 */
static void
vtw_init(fatp_ctl_t *fat, vtw_ctl_t *ctl, const uint32_t n, vtw_t *ctl_base_v)
{
	int class_n, i;
	vtw_t *base;

	ctl->base.v = ctl_base_v;

	if (ctl->is_v4) {
		ctl->lim.v4 = ctl->base.v4 + n - 1;
		ctl->alloc.v4 = ctl->base.v4;
	} else {
		ctl->lim.v6 = ctl->base.v6 + n - 1;
		ctl->alloc.v6 = ctl->base.v6;
	}

	ctl->nfree = n;
	ctl->ctl = ctl;

	ctl->idx_bits = 32;
	for (ctl->idx_mask = ~0; (ctl->idx_mask & (n-1)) == n-1; ) {
		ctl->idx_mask >>= 1;
		ctl->idx_bits -= 1;
	}

	ctl->idx_mask <<= 1;
	ctl->idx_mask |= 1;
	ctl->idx_bits += 1;

	ctl->fat = fat;
	fat->vtw = ctl;

	/* Divide the resources equally amongst the classes.
	 * This is not optimal, as the different classes
	 * arrive and leave at different rates, but it is
	 * the best I can do for now.
	 */
	class_n = n / (VTW_NCLASS-1);
	base = ctl->base.v;

	for (i = 1; i < VTW_NCLASS; ++i) {
		int j;

		ctl[i] = ctl[0];
		ctl[i].clidx = i;

		ctl[i].base.v = base;
		ctl[i].alloc = ctl[i].base;

		for (j = 0; j < class_n - 1; ++j) {
			if (tcp_msl_enable)
				base->msl_class = i;
			base = vtw_next(ctl, base);
		}

		ctl[i].lim.v = base;
		base = vtw_next(ctl, base);
		ctl[i].nfree = class_n;
	}

	vtw_debug_init();
}

/*!\brief map class to TCP MSL
 */
static inline uint32_t
class_to_msl(int msl_class)
{
	switch (msl_class) {
	case 0:
	case 1:
		return tcp_msl_remote ? tcp_msl_remote : (TCPTV_MSL >> 0);
	case 2:
		return tcp_msl_local ? tcp_msl_local : (TCPTV_MSL >> 1);
	default:
		return tcp_msl_loop ? tcp_msl_loop : (TCPTV_MSL >> 2);
	}
}
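
/* Worked example: class_to_msl() returns PR_SLOWHZ ticks, and
 * vtw_alloc() below holds an entry for 2*MSL, converting to
 * milliseconds as (2 * class_to_msl(class) * 1000) / PR_SLOWHZ.
 * So with tcp_msl_local tuned to 10*PR_SLOWHZ (an illustrative value,
 * not necessarily the shipped default), a class-2 entry lingers for
 * 2*MSL = 20 seconds.
 */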

/*!\brief map TCP MSL to class
 */
static inline uint32_t
msl_to_class(int msl)
{
	if (tcp_msl_enable) {
		if (msl <= (tcp_msl_loop ? tcp_msl_loop : (TCPTV_MSL >> 2)))
			return 1+2;
		if (msl <= (tcp_msl_local ? tcp_msl_local : (TCPTV_MSL >> 1)))
			return 1+1;
		return 1;
	}
	return 0;
}

/*!\brief allocate a vtw entry
 */
static inline vtw_t *
vtw_alloc(vtw_ctl_t *ctl)
{
	vtw_t *vtw = 0;
	int stuck = 0;
	int avail = ctl ? (ctl->nalloc + ctl->nfree) : 0;
	int msl;

	KASSERT(mutex_owned(softnet_lock));

	/* If no resources, we will not get far.
	 */
	if (!ctl || !ctl->base.v4 || avail <= 0)
		return 0;

	/* Obtain a free one.
	 */
	while (!ctl->nfree) {
		vtw_age(ctl, 0);

		if (++stuck > avail) {
			/* When in transition between
			 * schemes (classless, classed) we
			 * can be stuck having to await the
			 * expiration of cross-allocated entries.
			 *
			 * Returning zero means we will fall back to the
			 * traditional TIME_WAIT handling, except in the
			 * case of a re-sched, in which case we cannot
			 * perform the re-sched, but will retain the extant
			 * entry.
			 */
			db_trace(KTR_VTW
			    , (ctl, "vtw:!none free in class %x %x/%x"
			    , ctl->clidx
			    , ctl->nalloc, ctl->nfree));

			return 0;
		}
	}

	vtw = ctl->alloc.v;

	if (vtw->msl_class != ctl->clidx) {
		/* Usurping rules:
		 *	0 -> {1,2,3} or {1,2,3} -> 0
		 */
		KASSERT(!vtw->msl_class || !ctl->clidx);

		if (vtw->hashed || vtw->expire.tv_sec) {
			/* As this is owned by some other class,
			 * we must wait for it to expire.
			 * This will only happen on class/classless
			 * transitions, which are guaranteed to progress
			 * to completion in small finite time, barring bugs.
			 */
			db_trace(KTR_VTW
			    , (ctl, "vtw:!%p class %x!=%x %x:%x%s"
			    , vtw, vtw->msl_class, ctl->clidx
			    , vtw->expire.tv_sec
			    , vtw->expire.tv_usec
			    , vtw->hashed ? " hashed" : ""));

			return 0;
		}

		db_trace(KTR_VTW
		    , (ctl, "vtw:!%p usurped from %x to %x"
		    , vtw, vtw->msl_class, ctl->clidx));

		vtw->msl_class = ctl->clidx;
	}

	if (vtw_alive(vtw)) {
		KASSERT(0 && "next free not free");
		return 0;
	}

	/* Advance allocation pointer.
	 */
	ctl->alloc.v = vtw_next(ctl, vtw);

	--ctl->nfree;
	++ctl->nalloc;

	msl = (2 * class_to_msl(ctl->clidx) * 1000) / PR_SLOWHZ;	// msec

	/* mark expiration
	 */
	getmicrouptime(&vtw->expire);

	/* Move expiration into the future.
	 */
	vtw->expire.tv_sec += msl / 1000;
	vtw->expire.tv_usec += 1000 * (msl % 1000);

	while (vtw->expire.tv_usec >= 1000*1000) {
		vtw->expire.tv_usec -= 1000*1000;
		vtw->expire.tv_sec += 1;
	}

	if (!ctl->oldest.v)
		ctl->oldest.v = vtw;

	return vtw;
}

/*!\brief expiration
 */
static int
vtw_age(vtw_ctl_t *ctl, struct timeval *_when)
{
	vtw_t *vtw;
	struct timeval then, *when = _when;
	int maxtries = 0;

	if (!ctl->oldest.v) {
		KASSERT(!ctl->nalloc);
		return 0;
	}

	for (vtw = ctl->oldest.v; vtw && ctl->nalloc; ) {
		if (++maxtries > ctl->nalloc)
			break;

		if (vtw->msl_class != ctl->clidx) {
			db_trace(KTR_VTW
			    , (vtw, "vtw:!age class mismatch %x != %x"
			    , vtw->msl_class, ctl->clidx));
			/* XXXX
			 * See if the appropriate action is to skip to the next.
			 * XXXX
			 */
			ctl->oldest.v = vtw = vtw_next(ctl, vtw);
			continue;
		}
		if (!when) {
			/* Latch oldest timeval if none specified.
			 */
			then = vtw->expire;
			when = &then;
		}

		if (!timercmp(&vtw->expire, when, <=))
			break;

		db_trace(KTR_VTW
		    , (vtw, "vtw: expire %x %8.8x:%8.8x %x/%x"
		    , ctl->clidx
		    , vtw->expire.tv_sec
		    , vtw->expire.tv_usec
		    , ctl->nalloc
		    , ctl->nfree));

		if (!_when)
			++vtw_stats.kill;

		vtw_del(ctl, vtw);
		vtw = ctl->oldest.v;
	}

	return ctl->nalloc;	// # remaining allocated
}

static callout_t vtw_cs;

/*!\brief notice the passage of time.
 * It seems to be getting faster. What happened to the year?
 */
static void
vtw_tick(void *arg)
{
	struct timeval now;
	int i, cnt = 0;

	getmicrouptime(&now);

	db_trace(KTR_VTW, (arg, "vtk: tick - now %8.8x:%8.8x"
	    , now.tv_sec, now.tv_usec));

	mutex_enter(softnet_lock);

	for (i = 0; i < VTW_NCLASS; ++i) {
		cnt += vtw_age(&vtw_tcpv4[i], &now);
		cnt += vtw_age(&vtw_tcpv6[i], &now);
	}

	/* Keep ticks coming while we need them.
	 */
	if (cnt)
		callout_schedule(&vtw_cs, hz / 5);
	else {
		tcp_vtw_was_enabled = 0;
		tcbtable.vestige = 0;
	}
	mutex_exit(softnet_lock);
}

/* inpcb_lookup_locals assist for handling vestigial entries.
 */
static void *
tcp_init_ports_v4(struct in_addr addr, u_int port, int wild)
{
	struct tcp_ports_iterator *it = &tcp_ports_iterator_v4;

	bzero(it, sizeof (*it));

	/* Note: the reference to vtw_tcpv4[0] is fine.
	 * We do not need per-class iteration. We just
	 * need to get to the fat, and there is one
	 * shared fat.
	 */
	if (vtw_tcpv4[0].fat) {
		it->addr.v4 = addr;
		it->port = port;
		it->wild = !!wild;
		it->ctl = &vtw_tcpv4[0];

		++vtw_stats.look[1];
	}

	return it;
}

/*!\brief export an IPv4 vtw.
 */
static int
vtw_export_v4(vtw_ctl_t *ctl, vtw_t *vtw, vestigial_inpcb_t *res)
{
	vtw_v4_t *v4 = (void*)vtw;

	bzero(res, sizeof (*res));

	if (ctl && vtw) {
		if (!ctl->clidx && vtw->msl_class)
			ctl += vtw->msl_class;
		else
			KASSERT(ctl->clidx == vtw->msl_class);

		res->valid = 1;
		res->v4 = 1;

		res->faddr.v4.s_addr = v4->faddr;
		res->laddr.v4.s_addr = v4->laddr;
		res->fport = v4->fport;
		res->lport = v4->lport;
		res->vtw = vtw;		// netlock held over call(s)
		res->ctl = ctl;
		res->reuse_addr = vtw->reuse_addr;
		res->reuse_port = vtw->reuse_port;
		res->snd_nxt = vtw->snd_nxt;
		res->rcv_nxt = vtw->rcv_nxt;
		res->rcv_wnd = vtw->rcv_wnd;
		res->uid = vtw->uid;
	}

	return res->valid;
}

/*!\brief return next port in the port iterator. yowza.
 */
static int
tcp_next_port_v4(void *arg, struct vestigial_inpcb *res)
{
	struct tcp_ports_iterator *it = arg;
	vtw_t *vtw = 0;

	if (it->ctl)
		vtw = vtw_next_port_v4(it);

	if (!vtw)
		it->ctl = 0;

	return vtw_export_v4(it->ctl, vtw, res);
}

static int
tcp_lookup_v4(struct in_addr faddr, uint16_t fport,
    struct in_addr laddr, uint16_t lport,
    struct vestigial_inpcb *res)
{
	vtw_t *vtw;
	vtw_ctl_t *ctl;

	db_trace(KTR_VTW
	    , (res, "vtw: lookup %A:%P %A:%P"
	    , faddr, fport
	    , laddr, lport));

	vtw = vtw_lookup_hash_v4((ctl = &vtw_tcpv4[0])
	    , faddr.s_addr, fport
	    , laddr.s_addr, lport, 0);

	return vtw_export_v4(ctl, vtw, res);
}

/* inpcb_lookup_locals assist for handling vestigial entries.
 */
static void *
tcp_init_ports_v6(const struct in6_addr *addr, u_int port, int wild)
{
	struct tcp_ports_iterator *it = &tcp_ports_iterator_v6;

	bzero(it, sizeof (*it));

	/* Note: the reference to vtw_tcpv6[0] is fine.
	 * We do not need per-class iteration. We just
	 * need to get to the fat, and there is one
	 * shared fat.
	 */
	if (vtw_tcpv6[0].fat) {
		it->addr.v6 = *addr;
		it->port = port;
		it->wild = !!wild;
		it->ctl = &vtw_tcpv6[0];

		++vtw_stats.look[1];
	}

	return it;
}

/*!\brief export an IPv6 vtw.
 */
static int
vtw_export_v6(vtw_ctl_t *ctl, vtw_t *vtw, vestigial_inpcb_t *res)
{
	vtw_v6_t *v6 = (void*)vtw;

	bzero(res, sizeof (*res));

	if (ctl && vtw) {
		if (!ctl->clidx && vtw->msl_class)
			ctl += vtw->msl_class;
		else
			KASSERT(ctl->clidx == vtw->msl_class);

		res->valid = 1;
		res->v4 = 0;

		res->faddr.v6 = v6->faddr;
		res->laddr.v6 = v6->laddr;
		res->fport = v6->fport;
		res->lport = v6->lport;
		res->vtw = vtw;		// netlock held over call(s)
		res->ctl = ctl;

		res->v6only = vtw->v6only;
		res->reuse_addr = vtw->reuse_addr;
		res->reuse_port = vtw->reuse_port;

		res->snd_nxt = vtw->snd_nxt;
		res->rcv_nxt = vtw->rcv_nxt;
		res->rcv_wnd = vtw->rcv_wnd;
		res->uid = vtw->uid;
	}

	return res->valid;
}
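
/*!\brief return next port in the port iterator (v6 flavour).
 */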
static int
tcp_next_port_v6(void *arg, struct vestigial_inpcb *res)
{
	struct tcp_ports_iterator *it = arg;
	vtw_t *vtw = 0;

	if (it->ctl)
		vtw = vtw_next_port_v6(it);

	if (!vtw)
		it->ctl = 0;

	return vtw_export_v6(it->ctl, vtw, res);
}

static int
tcp_lookup_v6(const struct in6_addr *faddr, uint16_t fport,
    const struct in6_addr *laddr, uint16_t lport,
    struct vestigial_inpcb *res)
{
	vtw_ctl_t *ctl;
	vtw_t *vtw;

	db_trace(KTR_VTW
	    , (res, "vtw: lookup %6A:%P %6A:%P"
	    , db_store(faddr, sizeof (*faddr)), fport
	    , db_store(laddr, sizeof (*laddr)), lport));

	vtw = vtw_lookup_hash_v6((ctl = &vtw_tcpv6[0])
	    , faddr, fport
	    , laddr, lport, 0);

	return vtw_export_v6(ctl, vtw, res);
}

static vestigial_hooks_t tcp_hooks = {
	.init_ports4 = tcp_init_ports_v4,
	.next_port4 = tcp_next_port_v4,
	.lookup4 = tcp_lookup_v4,
	.init_ports6 = tcp_init_ports_v6,
	.next_port6 = tcp_next_port_v6,
	.lookup6 = tcp_lookup_v6,
};
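
/*!\brief map an address family to its fat/vtw control structures.
 */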
static bool
vtw_select(int af, fatp_ctl_t **fatp, vtw_ctl_t **ctlp)
{
	fatp_ctl_t *fat;
	vtw_ctl_t *ctl;

	switch (af) {
	case AF_INET:
		fat = &fat_tcpv4;
		ctl = &vtw_tcpv4[0];
		break;
	case AF_INET6:
		fat = &fat_tcpv6;
		ctl = &vtw_tcpv6[0];
		break;
	default:
		return false;
	}
	if (fatp != NULL)
		*fatp = fat;
	if (ctlp != NULL)
		*ctlp = ctl;
	return true;
}

/*!\brief initialize controlling instance
 */
static int
vtw_control_init(int af)
{
	fatp_ctl_t *fat;
	vtw_ctl_t *ctl;
	fatp_t *fat_base;
	fatp_t **fat_hash;
	vtw_t *ctl_base_v;
	uint32_t n, m;
	size_t sz;

	KASSERT(powerof2(tcp_vtw_entries));

	if (!vtw_select(af, &fat, &ctl))
		return EAFNOSUPPORT;

	if (fat->hash != NULL) {
		KASSERT(fat->base != NULL && ctl->base.v != NULL);
		return 0;
	}

	/* Allocate 10% more capacity in the fat pointers.
	 * We should only need ~#hash additional based on
	 * how they age, but TIME_WAIT assassination could cause
	 * sparse fat pointer utilisation.
	 */
	m = 512;
	n = 2*m + (11 * (tcp_vtw_entries / fatp_ntags())) / 10;
	sz = (ctl->is_v4 ? sizeof(vtw_v4_t) : sizeof(vtw_v6_t));

	fat_hash = kmem_zalloc(2*m * sizeof(fatp_t *), KM_SLEEP);
	fat_base = kmem_zalloc(2*n * sizeof(fatp_t), KM_SLEEP);
	ctl_base_v = kmem_zalloc(tcp_vtw_entries * sz, KM_SLEEP);
	fatp_init(fat, n, m, fat_base, fat_hash);
	vtw_init(fat, ctl, tcp_vtw_entries, ctl_base_v);

	return 0;
}

/*!\brief select controlling instance
 */
static vtw_ctl_t *
vtw_control(int af, uint32_t msl)
{
	fatp_ctl_t *fat;
	vtw_ctl_t *ctl;
	int msl_class = msl_to_class(msl);

	if (!vtw_select(af, &fat, &ctl))
		return NULL;

	if (!fat->base || !ctl->base.v)
		return NULL;

	if (!tcp_vtw_was_enabled) {
		/* This guarantees us timer ticks until we no longer
		 * need them.
		 */
		tcp_vtw_was_enabled = 1;

		callout_schedule(&vtw_cs, hz / 5);

		tcbtable.vestige = &tcp_hooks;
	}

	return ctl + msl_class;
}

/*!\brief add TCP pcb to vestigial timewait
 */
int
vtw_add(int af, struct tcpcb *tp)
{
#ifdef VTW_DEBUG
	int enable;
#endif
	vtw_ctl_t *ctl;
	vtw_t *vtw;

	KASSERT(mutex_owned(softnet_lock));

	ctl = vtw_control(af, tp->t_msl);
	if (!ctl)
		return 0;

#ifdef VTW_DEBUG
	enable = (af == AF_INET) ? tcp4_vtw_enable : tcp6_vtw_enable;
#endif

	vtw = vtw_alloc(ctl);

	if (vtw) {
		vtw->snd_nxt = tp->snd_nxt;
		vtw->rcv_nxt = tp->rcv_nxt;

		switch (af) {
		case AF_INET: {
			struct inpcb *inp = tp->t_inpcb;
			vtw_v4_t *v4 = (void*)vtw;

			v4->faddr = in4p_faddr(inp).s_addr;
			v4->laddr = in4p_laddr(inp).s_addr;
			v4->fport = inp->inp_fport;
			v4->lport = inp->inp_lport;

			vtw->reuse_port = !!(inp->inp_socket->so_options
			    & SO_REUSEPORT);
			vtw->reuse_addr = !!(inp->inp_socket->so_options
			    & SO_REUSEADDR);
			vtw->v6only = 0;
			vtw->uid = inp->inp_socket->so_uidinfo->ui_uid;

			vtw_inshash_v4(ctl, vtw);

#ifdef VTW_DEBUG
			/* Immediate lookup (connected and port) to
			 * ensure at least that works!
			 */
			if (enable & 4) {
				KASSERT(vtw_lookup_hash_v4
				    (ctl
				    , in4p_faddr(inp).s_addr, inp->inp_fport
				    , in4p_laddr(inp).s_addr, inp->inp_lport
				    , 0)
				    == vtw);
				KASSERT(vtw_lookup_hash_v4
				    (ctl
				    , in4p_faddr(inp).s_addr, inp->inp_fport
				    , in4p_laddr(inp).s_addr, inp->inp_lport
				    , 1));
			}
			/* Immediate port iterator functionality check: not wild
			 */
			if (enable & 8) {
				struct tcp_ports_iterator *it;
				struct vestigial_inpcb res;
				int cnt = 0;

				it = tcp_init_ports_v4(in4p_laddr(inp)
				    , inp->inp_lport, 0);

				while (tcp_next_port_v4(it, &res)) {
					++cnt;
				}
				KASSERT(cnt);
			}
			/* Immediate port iterator functionality check: wild
			 */
			if (enable & 16) {
				struct tcp_ports_iterator *it;
				struct vestigial_inpcb res;
				struct in_addr any;
				int cnt = 0;

				any.s_addr = htonl(INADDR_ANY);

				it = tcp_init_ports_v4(any, inp->inp_lport, 1);

				while (tcp_next_port_v4(it, &res)) {
					++cnt;
				}
				KASSERT(cnt);
			}
#endif /* VTW_DEBUG */
			break;
		}

		case AF_INET6: {
			struct inpcb *inp = tp->t_inpcb;
			vtw_v6_t *v6 = (void*)vtw;

			v6->faddr = in6p_faddr(inp);
			v6->laddr = in6p_laddr(inp);
			v6->fport = inp->inp_fport;
			v6->lport = inp->inp_lport;

			vtw->reuse_port = !!(inp->inp_socket->so_options
			    & SO_REUSEPORT);
			vtw->reuse_addr = !!(inp->inp_socket->so_options
			    & SO_REUSEADDR);
			vtw->v6only = !!(inp->inp_flags
			    & IN6P_IPV6_V6ONLY);
			vtw->uid = inp->inp_socket->so_uidinfo->ui_uid;

			vtw_inshash_v6(ctl, vtw);
#ifdef VTW_DEBUG
			/* Immediate lookup (connected and port) to
			 * ensure at least that works!
			 */
			if (enable & 4) {
				KASSERT(vtw_lookup_hash_v6(ctl
				    , &in6p_faddr(inp), inp->inp_fport
				    , &in6p_laddr(inp), inp->inp_lport
				    , 0)
				    == vtw);
				KASSERT(vtw_lookup_hash_v6
				    (ctl
				    , &in6p_faddr(inp), inp->inp_fport
				    , &in6p_laddr(inp), inp->inp_lport
				    , 1));
			}
			/* Immediate port iterator functionality check: not wild
			 */
			if (enable & 8) {
				struct tcp_ports_iterator *it;
				struct vestigial_inpcb res;
				int cnt = 0;

				it = tcp_init_ports_v6(&in6p_laddr(inp)
				    , inp->inp_lport, 0);

				while (tcp_next_port_v6(it, &res)) {
					++cnt;
				}
				KASSERT(cnt);
			}
			/* Immediate port iterator functionality check: wild
			 */
			if (enable & 16) {
				struct tcp_ports_iterator *it;
				struct vestigial_inpcb res;
				static struct in6_addr any = IN6ADDR_ANY_INIT;
				int cnt = 0;

				it = tcp_init_ports_v6(&any
				    , inp->inp_lport, 1);

				while (tcp_next_port_v6(it, &res)) {
					++cnt;
				}
				KASSERT(cnt);
			}
#endif /* VTW_DEBUG */
			break;
		}
		}

		tcp_canceltimers(tp);
		tp = tcp_close(tp);
		KASSERT(!tp);

		return 1;
	}

	return 0;
}

/*!\brief restart timer for vestigial time-wait entry
 */
static void
vtw_restart_v4(vestigial_inpcb_t *vp)
{
	vtw_v4_t copy = *(vtw_v4_t*)vp->vtw;
	vtw_t *vtw;
	vtw_t *cp = &copy.common;
	vtw_ctl_t *ctl;

	KASSERT(mutex_owned(softnet_lock));

	db_trace(KTR_VTW
	    , (vp->vtw, "vtw: restart %A:%P %A:%P"
	    , vp->faddr.v4.s_addr, vp->fport
	    , vp->laddr.v4.s_addr, vp->lport));

	/* Class might have changed, so have a squiz.
	 */
	ctl = vtw_control(AF_INET, class_to_msl(cp->msl_class));
	vtw = vtw_alloc(ctl);

	if (vtw) {
		vtw_v4_t *v4 = (void*)vtw;

		/* Safe now to unhash the old entry
		 */
		vtw_del(vp->ctl, vp->vtw);

		vtw->snd_nxt = cp->snd_nxt;
		vtw->rcv_nxt = cp->rcv_nxt;

		v4->faddr = copy.faddr;
		v4->laddr = copy.laddr;
		v4->fport = copy.fport;
		v4->lport = copy.lport;

		vtw->reuse_port = cp->reuse_port;
		vtw->reuse_addr = cp->reuse_addr;
		vtw->v6only = 0;
		vtw->uid = cp->uid;

		vtw_inshash_v4(ctl, vtw);
	}

	vp->valid = 0;
}

/*!\brief restart timer for vestigial time-wait entry
 */
static void
vtw_restart_v6(vestigial_inpcb_t *vp)
{
	vtw_v6_t copy = *(vtw_v6_t*)vp->vtw;
	vtw_t *vtw;
	vtw_t *cp = &copy.common;
	vtw_ctl_t *ctl;

	KASSERT(mutex_owned(softnet_lock));

	db_trace(KTR_VTW
	    , (vp->vtw, "vtw: restart %6A:%P %6A:%P"
	    , db_store(&vp->faddr.v6, sizeof (vp->faddr.v6))
	    , vp->fport
	    , db_store(&vp->laddr.v6, sizeof (vp->laddr.v6))
	    , vp->lport));

	/* Class might have changed, so have a squiz.
	 */
	ctl = vtw_control(AF_INET6, class_to_msl(cp->msl_class));
	vtw = vtw_alloc(ctl);

	if (vtw) {
		vtw_v6_t *v6 = (void*)vtw;

		/* Safe now to unhash the old entry
		 */
		vtw_del(vp->ctl, vp->vtw);

		vtw->snd_nxt = cp->snd_nxt;
		vtw->rcv_nxt = cp->rcv_nxt;

		v6->faddr = copy.faddr;
		v6->laddr = copy.laddr;
		v6->fport = copy.fport;
		v6->lport = copy.lport;

		vtw->reuse_port = cp->reuse_port;
		vtw->reuse_addr = cp->reuse_addr;
		vtw->v6only = cp->v6only;
		vtw->uid = cp->uid;

		vtw_inshash_v6(ctl, vtw);
	}

	vp->valid = 0;
}

/*!\brief restart timer for vestigial time-wait entry
 */
void
vtw_restart(vestigial_inpcb_t *vp)
{
	if (!vp || !vp->valid)
		return;

	if (vp->v4)
		vtw_restart_v4(vp);
	else
		vtw_restart_v6(vp);
}
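
/*!\brief sysctl helper for tcp{4,6}_vtw_enable: on first enable,
 * initialise the control structures for the address family.
 */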
int
sysctl_tcp_vtw_enable(SYSCTLFN_ARGS)
{
	int en, rc;
	struct sysctlnode node;

	node = *rnode;
	en = *(int *)rnode->sysctl_data;
	node.sysctl_data = &en;

	rc = sysctl_lookup(SYSCTLFN_CALL(&node));
	if (rc != 0 || newp == NULL)
		return rc;

	if (rnode->sysctl_data != &tcp4_vtw_enable &&
	    rnode->sysctl_data != &tcp6_vtw_enable)
		rc = ENOENT;
	else if ((en & 1) == 0)
		rc = 0;
	else if (rnode->sysctl_data == &tcp4_vtw_enable)
		rc = vtw_control_init(AF_INET);
	else /* rnode->sysctl_data == &tcp6_vtw_enable */
		rc = vtw_control_init(AF_INET6);

	if (rc == 0)
		*(int *)rnode->sysctl_data = en;

	return rc;
}
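
/*!\brief early initialisation: arm the tick callout, mark each
 * arena's address family, and initialise any AF already enabled.
 */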
int
vtw_earlyinit(void)
{
	int i, rc;

	callout_init(&vtw_cs, 0);
	callout_setfunc(&vtw_cs, vtw_tick, 0);

	for (i = 0; i < VTW_NCLASS; ++i) {
		vtw_tcpv4[i].is_v4 = 1;
		vtw_tcpv6[i].is_v6 = 1;
	}

	if ((tcp4_vtw_enable & 1) != 0 &&
	    (rc = vtw_control_init(AF_INET)) != 0)
		return rc;

	if ((tcp6_vtw_enable & 1) != 0 &&
	    (rc = vtw_control_init(AF_INET6)) != 0)
		return rc;

	return 0;
}

#ifdef VTW_DEBUG
#include <sys/syscallargs.h>
#include <sys/sysctl.h>

/*!\brief add lalp, fafp entries for debug
 */
int
vtw_debug_add(int af, sin_either_t *la, sin_either_t *fa, int msl, int msl_class)
{
	vtw_ctl_t *ctl;
	vtw_t *vtw;

	ctl = vtw_control(af, msl ? msl : class_to_msl(msl_class));
	if (!ctl)
		return 0;

	vtw = vtw_alloc(ctl);

	if (vtw) {
		vtw->snd_nxt = 0;
		vtw->rcv_nxt = 0;

		switch (af) {
		case AF_INET: {
			vtw_v4_t *v4 = (void*)vtw;

			v4->faddr = fa->sin_addr.v4.s_addr;
			v4->laddr = la->sin_addr.v4.s_addr;
			v4->fport = fa->sin_port;
			v4->lport = la->sin_port;

			vtw->reuse_port = 1;
			vtw->reuse_addr = 1;
			vtw->v6only = 0;
			vtw->uid = 0;

			vtw_inshash_v4(ctl, vtw);
			break;
		}

		case AF_INET6: {
			vtw_v6_t *v6 = (void*)vtw;

			v6->faddr = fa->sin_addr.v6;
			v6->laddr = la->sin_addr.v6;

			v6->fport = fa->sin_port;
			v6->lport = la->sin_port;

			vtw->reuse_port = 1;
			vtw->reuse_addr = 1;
			vtw->v6only = 0;
			vtw->uid = 0;

			vtw_inshash_v6(ctl, vtw);
			break;
		}

		default:
			break;
		}

		return 1;
	}

	return 0;
}

static int vtw_syscall = 0;

static int
vtw_debug_process(vtw_sysargs_t *ap)
{
	struct vestigial_inpcb vestige;
	int rc = 0;

	mutex_enter(softnet_lock);

	switch (ap->op) {
	case 0:		// insert
		vtw_debug_add(ap->la.sin_family
		    , &ap->la
		    , &ap->fa
		    , TCPTV_MSL
		    , 0);
		break;

	case 1:		// lookup
	case 2:		// restart
		switch (ap->la.sin_family) {
		case AF_INET:
			if (tcp_lookup_v4(ap->fa.sin_addr.v4, ap->fa.sin_port,
			    ap->la.sin_addr.v4, ap->la.sin_port,
			    &vestige)) {
				if (ap->op == 2) {
					vtw_restart(&vestige);
				}
				rc = 0;
			} else
				rc = ESRCH;
			break;

		case AF_INET6:
			if (tcp_lookup_v6(&ap->fa.sin_addr.v6, ap->fa.sin_port,
			    &ap->la.sin_addr.v6, ap->la.sin_port,
			    &vestige)) {
				if (ap->op == 2) {
					vtw_restart(&vestige);
				}
				rc = 0;
			} else
				rc = ESRCH;
			break;
		default:
			rc = EINVAL;
		}
		break;

	default:
		rc = EINVAL;
	}

	mutex_exit(softnet_lock);
	return rc;
}

struct sys_vtw_args {
	syscallarg(const vtw_sysargs_t *) req;
	syscallarg(size_t) len;
};
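
/*!\brief debug syscall: copy in a vtw_sysargs_t and hand it to
 * vtw_debug_process().
 */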
static int
vtw_sys(struct lwp *l, const void *_, register_t *retval)
{
	const struct sys_vtw_args *uap = _;
	void *buf;
	int rc;
	size_t len = SCARG(uap, len);

	if (len != sizeof (vtw_sysargs_t))
		return EINVAL;

	buf = kmem_alloc(len, KM_SLEEP);
	rc = copyin(SCARG(uap, req), buf, len);
	if (!rc) {
		rc = vtw_debug_process(buf);
	}
	kmem_free(buf, len);

	return rc;
}
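
/*!\brief walk each arena's ring and check the entry count against
 * nfree (meaningful only while nothing is allocated).
 */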
static void
vtw_sanity_check(void)
{
	vtw_ctl_t *ctl;
	vtw_t *vtw;
	int i;
	int n;

	for (i = 0; i < VTW_NCLASS; ++i) {
		ctl = &vtw_tcpv4[i];

		if (!ctl->base.v || ctl->nalloc)
			continue;

		for (n = 0, vtw = ctl->base.v; ; ) {
			++n;
			vtw = vtw_next(ctl, vtw);
			if (vtw == ctl->base.v)
				break;
		}
		db_trace(KTR_VTW
		    , (ctl, "sanity: class %x n %x nfree %x"
		    , i, n, ctl->nfree));

		KASSERT(n == ctl->nfree);
	}

	for (i = 0; i < VTW_NCLASS; ++i) {
		ctl = &vtw_tcpv6[i];

		if (!ctl->base.v || ctl->nalloc)
			continue;

		for (n = 0, vtw = ctl->base.v; ; ) {
			++n;
			vtw = vtw_next(ctl, vtw);
			if (vtw == ctl->base.v)
				break;
		}
		db_trace(KTR_VTW
		    , (ctl, "sanity: class %x n %x nfree %x"
		    , i, n, ctl->nfree));
		KASSERT(n == ctl->nfree);
	}
}

/*!\brief Initialise debug support.
 */
static void
vtw_debug_init(void)
{
	int i;

	vtw_sanity_check();

	if (vtw_syscall)
		return;

	for (i = 511; i; --i) {
		if (sysent[i].sy_call == sys_nosys) {
			sysent[i].sy_call = vtw_sys;
			sysent[i].sy_narg = 2;
			sysent[i].sy_argsize = sizeof (struct sys_vtw_args);
			sysent[i].sy_flags = 0;

			vtw_syscall = i;
			break;
		}
	}
	if (i) {
		const struct sysctlnode *node;
		uint32_t flags;

		flags = sysctl_root.sysctl_flags;

		sysctl_root.sysctl_flags |= CTLFLAG_READWRITE;
		sysctl_root.sysctl_flags &= ~CTLFLAG_PERMANENT;

		sysctl_createv(0, 0, 0, &node,
		    CTLFLAG_PERMANENT, CTLTYPE_NODE,
		    "koff",
		    SYSCTL_DESCR("Kernel Obscure Feature Finder"),
		    0, 0, 0, 0, CTL_CREATE, CTL_EOL);

		if (!node) {
			sysctl_createv(0, 0, 0, &node,
			    CTLFLAG_PERMANENT, CTLTYPE_NODE,
			    "koffka",
			    SYSCTL_DESCR("The Real(tm) Kernel"
			    " Obscure Feature Finder"),
			    0, 0, 0, 0, CTL_CREATE, CTL_EOL);
		}
		if (node) {
			sysctl_createv(0, 0, 0, 0,
			    CTLFLAG_PERMANENT|CTLFLAG_READONLY,
			    CTLTYPE_INT, "vtw_debug_syscall",
			    SYSCTL_DESCR("vtw debug"
			    " system call number"),
			    0, 0, &vtw_syscall, 0, node->sysctl_num,
			    CTL_CREATE, CTL_EOL);
		}
		sysctl_root.sysctl_flags = flags;
	}
}
#else /* !VTW_DEBUG */
static void
vtw_debug_init(void)
{
	return;
}
#endif /* !VTW_DEBUG */