/*
 * Copyright (c) 2011 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Coyote Point Systems, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Reduces the resources demanded by TCP sessions in TIME_WAIT-state using
 * methods called Vestigial Time-Wait (VTW) and Maximum Segment Lifetime
 * Truncation (MSLT).
 *
 * MSLT and VTW were contributed by Coyote Point Systems, Inc.
 *
 * Even after a TCP session enters the TIME_WAIT state, its corresponding
 * socket and protocol control blocks (PCBs) stick around until the TCP
 * Maximum Segment Lifetime (MSL) expires.  On a host whose workload
 * necessarily creates and closes down many TCP sockets, the sockets & PCBs
 * for TCP sessions in TIME_WAIT state amount to many megabytes of dead
 * weight in RAM.
 *
 * Maximum Segment Lifetime Truncation (MSLT) assigns each TCP session to
 * a class based on the nearness of the peer.  Corresponding to each class
 * is an MSL, and a session uses the MSL of its class.  The classes are
 * loopback (local host equals remote host), local (local host and remote
 * host are on the same link/subnet), and remote (local host and remote
 * host communicate via one or more gateways).  Classes corresponding to
 * nearer peers have lower MSLs by default: 2 seconds for loopback, 10
 * seconds for local, 60 seconds for remote.  Loopback and local sessions
 * expire more quickly when MSLT is used.
 *
 * Vestigial Time-Wait (VTW) replaces a TIME_WAIT session's PCB/socket
 * dead weight with a compact representation of the session, called a
 * "vestigial PCB".  VTW data structures are designed to be very fast and
 * memory-efficient: for fast insertion and lookup of vestigial PCBs,
 * the PCBs are stored in a hash table that is designed to minimize the
 * number of cacheline visits per lookup/insertion.  The memory both
 * for vestigial PCBs and for elements of the PCB hashtable comes from
 * fixed-size pools, and linked data structures exploit this to conserve
 * memory by representing references with a narrow index/offset from the
 * start of a pool instead of a pointer.  When space for new vestigial PCBs
 * runs out, VTW makes room by discarding old vestigial PCBs, oldest first.
 * VTW cooperates with MSLT.
 *
 * It may help to think of VTW as a "FIN cache" by analogy to the SYN
 * cache.
 *
 * A 2.8-GHz Pentium 4 running a test workload that creates TIME_WAIT
 * sessions as fast as it can is approximately 17% idle when VTW is active
 * versus 0% idle when VTW is inactive.  It has 103 megabytes more free RAM
 * when VTW is active (approximately 64k vestigial PCBs are created) than
 * when it is inactive.
 */
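
/*
 * A worked example (a sketch, assuming the default class MSLs above and
 * the stock TCPTV_MSL of 30 seconds): a session lingers in TIME_WAIT for
 * 2*MSL, so with MSLT a loopback session expires after 2*2 = 4 seconds
 * and a same-subnet session after 2*10 = 20 seconds, instead of the
 * classic 2*30 = 60 seconds.
 */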

#include <sys/cdefs.h>

#ifdef _KERNEL_OPT
#include "opt_ddb.h"
#include "opt_inet.h"
#include "opt_inet_csum.h"
#include "opt_tcp_debug.h"
#endif

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kmem.h>
#include <sys/mbuf.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/errno.h>
#include <sys/syslog.h>
#include <sys/pool.h>
#include <sys/domain.h>
#include <sys/kernel.h>
#include <net/if.h>
#include <net/if_types.h>

#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <netinet/ip.h>
#include <netinet/in_pcb.h>
#include <netinet/in_var.h>
#include <netinet/ip_var.h>
#include <netinet/in_offload.h>
#include <netinet/ip6.h>
#include <netinet6/ip6_var.h>
#include <netinet6/in6_pcb.h>
#include <netinet6/in6_var.h>
#include <netinet/icmp6.h>

#include <netinet/tcp.h>
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet/tcp_private.h>
#include <netinet/tcpip.h>

#include <netinet/tcp_vtw.h>

__KERNEL_RCSID(0, "$NetBSD: tcp_vtw.c,v 1.18 2017/06/01 02:45:14 chs Exp $");

#define db_trace(__a, __b)	do { } while (/*CONSTCOND*/0)

static void vtw_debug_init(void);

fatp_ctl_t fat_tcpv4;
fatp_ctl_t fat_tcpv6;
vtw_ctl_t  vtw_tcpv4[VTW_NCLASS];
vtw_ctl_t  vtw_tcpv6[VTW_NCLASS];
vtw_stats_t vtw_stats;

/* We provide state for the lookup_ports iterator.
 * As we are currently netlock-protected, there is only one.
 * If we were finer-grained, we would have one per CPU.
 * I do not want to be in the business of alloc/free.
 * The best alternative would be to allocate on the caller's
 * stack, but that would require them to know the struct,
 * or at least the size.
 * See how she goes.
 */
struct tcp_ports_iterator {
	union {
		struct in_addr	v4;
		struct in6_addr	v6;
	} addr;
	u_int		port;

	uint32_t	wild	: 1;

	vtw_ctl_t	*ctl;
	fatp_t		*fp;

	uint16_t	slot_idx;
	uint16_t	ctl_idx;
};
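
/*
 * A usage sketch (the vestigial hooks below follow this shape): a caller
 * obtains an iterator with tcp_init_ports_v4()/tcp_init_ports_v6() and
 * then pulls vestigial PCBs matching the local port one at a time with
 * tcp_next_port_v4()/tcp_next_port_v6() until it returns 0.
 */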

static struct tcp_ports_iterator tcp_ports_iterator_v4;
static struct tcp_ports_iterator tcp_ports_iterator_v6;

static int vtw_age(vtw_ctl_t *, struct timeval *);

/*!\brief allocate a fat pointer from a collection.
 */
static fatp_t *
fatp_alloc(fatp_ctl_t *fat)
{
	fatp_t *fp = 0;

	if (fat->nfree) {
		fp = fat->free;
		if (fp) {
			fat->free = fatp_next(fat, fp);
			--fat->nfree;
			++fat->nalloc;
			fp->nxt = 0;

			KASSERT(!fp->inuse);
		}
	}

	return fp;
}

/*!\brief free a fat pointer.
 */
static void
fatp_free(fatp_ctl_t *fat, fatp_t *fp)
{
	if (fp) {
		KASSERT(!fp->inuse);
		KASSERT(!fp->nxt);

		fp->nxt = fatp_index(fat, fat->free);
		fat->free = fp;

		++fat->nfree;
		--fat->nalloc;
	}
}

/*!\brief initialise a collection of fat pointers.
 *
 *\param n	total # fat pointers to allocate
 *\param m	# hash buckets
 *
 * We allocate 2x as much, as we have two hashes: full and lport only.
 */
static void
fatp_init(fatp_ctl_t *fat, uint32_t n, uint32_t m,
    fatp_t *fat_base, fatp_t **fat_hash)
{
	fatp_t *fp;

	KASSERT(n <= FATP_MAX / 2);

	fat->hash = fat_hash;
	fat->base = fat_base;

	fat->port = &fat->hash[m];

	fat->mask = m - 1;	// m must be a power of 2
	fat->lim = fat->base + 2*n - 1;
	fat->nfree = 0;
	fat->nalloc = 2*n;

	/* Initialise the free list.
	 */
	for (fp = fat->lim; fp >= fat->base; --fp) {
		fatp_free(fat, fp);
	}
}

/*
 * The `xtra' is XORed into the tag stored, so that the same tag stored
 * in different slots of a fat pointer yields different values; a bogus
 * match in the wrong slot then decodes to an invalid index and is
 * rejected.
 */
static uint32_t fatp_xtra[] = {
	0x11111111,0x22222222,0x33333333,0x44444444,
	0x55555555,0x66666666,0x77777777,0x88888888,
	0x12121212,0x21212121,0x34343434,0x43434343,
	0x56565656,0x65656565,0x78787878,0x87878787,
	0x11221122,0x22112211,0x33443344,0x44334433,
	0x55665566,0x66556655,0x77887788,0x88778877,
	0x11112222,0x22221111,0x33334444,0x44443333,
	0x55556666,0x66665555,0x77778888,0x88887777,
};

/*!\brief turn a {fatp_t*,slot} into an integral key.
 *
 * The key can be used to obtain the fatp_t, and the slot,
 * as it directly encodes them.
 */
static inline uint32_t
fatp_key(fatp_ctl_t *fat, fatp_t *fp, uint32_t slot)
{
	CTASSERT(CACHE_LINE_SIZE == 32 ||
	    CACHE_LINE_SIZE == 64 ||
	    CACHE_LINE_SIZE == 128);

	switch (fatp_ntags()) {
	case 7:
		return (fatp_index(fat, fp) << 3) | slot;
	case 15:
		return (fatp_index(fat, fp) << 4) | slot;
	case 31:
		return (fatp_index(fat, fp) << 5) | slot;
	default:
		KASSERT(0 && "no support, for no good reason");
		return ~0;
	}
}

static inline uint32_t
fatp_slot_from_key(fatp_ctl_t *fat, uint32_t key)
{
	CTASSERT(CACHE_LINE_SIZE == 32 ||
	    CACHE_LINE_SIZE == 64 ||
	    CACHE_LINE_SIZE == 128);

	switch (fatp_ntags()) {
	case 7:
		return key & 7;
	case 15:
		return key & 15;
	case 31:
		return key & 31;
	default:
		KASSERT(0 && "no support, for no good reason");
		return ~0;
	}
}

static inline fatp_t *
fatp_from_key(fatp_ctl_t *fat, uint32_t key)
{
	CTASSERT(CACHE_LINE_SIZE == 32 ||
	    CACHE_LINE_SIZE == 64 ||
	    CACHE_LINE_SIZE == 128);

	switch (fatp_ntags()) {
	case 7:
		key >>= 3;
		break;
	case 15:
		key >>= 4;
		break;
	case 31:
		key >>= 5;
		break;
	default:
		KASSERT(0 && "no support, for no good reason");
		return 0;
	}

	return key ? fat->base + key - 1 : 0;
}

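/*
 * Index words are self-verifying: idx_encode() stores the index twice,
 * once in the low idx_bits and once in the bits above them.  On decode,
 * a word whose two copies disagree (e.g. a tag XOR that produced stray
 * high bits) fails the idx_encode() round-trip and is rejected as ~0.
 */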
static inline uint32_t
idx_encode(vtw_ctl_t *ctl, uint32_t idx)
{
	return (idx << ctl->idx_bits) | idx;
}

static inline uint32_t
idx_decode(vtw_ctl_t *ctl, uint32_t bits)
{
	uint32_t idx = bits & ctl->idx_mask;

	if (idx_encode(ctl, idx) == bits)
		return idx;
	else
		return ~0;
}

/*!\brief insert index into fatp hash
 *
 *\param idx	- index of element being placed in hash chain
 *\param tag	- 32-bit tag identifier
 *
 *\returns
 *	value which can be used to locate entry.
 *
 *\note
 * we rely on the fact that there are unused high bits in the index
 * for verification purposes on lookup.
 */

static inline uint32_t
fatp_vtw_inshash(fatp_ctl_t *fat, uint32_t idx, uint32_t tag, int which,
    void *dbg)
{
	fatp_t *fp;
	fatp_t **hash = (which ? fat->port : fat->hash);
	int i;

	fp = hash[tag & fat->mask];

	while (!fp || fatp_full(fp)) {
		fatp_t *fq;

		/* All entries are inuse at the top level.
		 * We allocate a spare, and push the top level
		 * down one.  All entries in the fp we push down
		 * (think of a tapeworm here) will be expelled sooner than
		 * any entries added subsequently to this hash bucket.
		 * This is a property of the time waits we are exploiting.
		 */

		fq = fatp_alloc(fat);
		if (!fq) {
			vtw_age(fat->vtw, 0);
			fp = hash[tag & fat->mask];
			continue;
		}

		fq->inuse = 0;
		fq->nxt = fatp_index(fat, fp);

		hash[tag & fat->mask] = fq;

		fp = fq;
	}

	KASSERT(!fatp_full(fp));

	/* Fill highest index first.  Lookup is lowest first.
	 */
	for (i = fatp_ntags(); --i >= 0; ) {
		if (!((1 << i) & fp->inuse)) {
			break;
		}
	}

	fp->inuse |= 1 << i;
	fp->tag[i] = tag ^ idx_encode(fat->vtw, idx) ^ fatp_xtra[i];

	db_trace(KTR_VTW
	    , (fp, "fat: inuse %5.5x tag[%x] %8.8x"
	    , fp->inuse
	    , i, fp->tag[i]));

	return fatp_key(fat, fp, i);
}

static inline int
vtw_alive(const vtw_t *vtw)
{
	return vtw->hashed && vtw->expire.tv_sec;
}

static inline uint32_t
vtw_index_v4(vtw_ctl_t *ctl, vtw_v4_t *v4)
{
	if (ctl->base.v4 <= v4 && v4 <= ctl->lim.v4)
		return v4 - ctl->base.v4;

	KASSERT(0 && "vtw out of bounds");

	return ~0;
}

static inline uint32_t
vtw_index_v6(vtw_ctl_t *ctl, vtw_v6_t *v6)
{
	if (ctl->base.v6 <= v6 && v6 <= ctl->lim.v6)
		return v6 - ctl->base.v6;

	KASSERT(0 && "vtw out of bounds");

	return ~0;
}

static inline uint32_t
vtw_index(vtw_ctl_t *ctl, vtw_t *vtw)
{
	if (ctl->clidx)
		ctl = ctl->ctl;

	if (ctl->is_v4)
		return vtw_index_v4(ctl, (vtw_v4_t *)vtw);

	if (ctl->is_v6)
		return vtw_index_v6(ctl, (vtw_v6_t *)vtw);

	KASSERT(0 && "neither 4 nor 6.  most curious.");

	return ~0;
}

static inline vtw_t *
vtw_from_index(vtw_ctl_t *ctl, uint32_t idx)
{
	if (ctl->clidx)
		ctl = ctl->ctl;

	/* See if the index looks like it might be an index.
	 * Bits outside of the valid index bits are a giveaway.
	 */
	idx = idx_decode(ctl, idx);

	if (idx == ~0) {
		return 0;
	} else if (ctl->is_v4) {
		vtw_v4_t *vtw = ctl->base.v4 + idx;

		return (ctl->base.v4 <= vtw && vtw <= ctl->lim.v4)
		    ? &vtw->common : 0;
	} else if (ctl->is_v6) {
		vtw_v6_t *vtw = ctl->base.v6 + idx;

		return (ctl->base.v6 <= vtw && vtw <= ctl->lim.v6)
		    ? &vtw->common : 0;
	} else {
		KASSERT(0 && "badness");
		return 0;
	}
}

/*!\brief return the next vtw after this one.
 *
 * Due to the differing sizes of the entries in differing
 * arenas, we have to ensure we ++ the correct pointer type.
 *
 * Also handles wrap.
 */
static inline vtw_t *
vtw_next(vtw_ctl_t *ctl, vtw_t *vtw)
{
	if (ctl->is_v4) {
		vtw_v4_t *v4 = (void*)vtw;

		vtw = &(++v4)->common;
	} else {
		vtw_v6_t *v6 = (void*)vtw;

		vtw = &(++v6)->common;
	}

	if (vtw > ctl->lim.v)
		vtw = ctl->base.v;

	return vtw;
}

/*!\brief remove entry from FATP hash chains
 */
static inline void
vtw_unhash(vtw_ctl_t *ctl, vtw_t *vtw)
{
	fatp_ctl_t *fat = ctl->fat;
	fatp_t *fp;
	uint32_t key = vtw->key;
	uint32_t tag, slot, idx;
	vtw_v4_t *v4 = (void*)vtw;
	vtw_v6_t *v6 = (void*)vtw;

	if (!vtw->hashed) {
		KASSERT(0 && "unhashed");
		return;
	}

	if (fat->vtw->is_v4) {
		tag = v4_tag(v4->faddr, v4->fport, v4->laddr, v4->lport);
	} else if (fat->vtw->is_v6) {
		tag = v6_tag(&v6->faddr, v6->fport, &v6->laddr, v6->lport);
	} else {
		tag = 0;
		KASSERT(0 && "not reached");
	}

	/* Remove from fat->hash[]
	 */
	slot = fatp_slot_from_key(fat, key);
	fp = fatp_from_key(fat, key);
	idx = vtw_index(ctl, vtw);

	db_trace(KTR_VTW
	    , (fp, "fat: del inuse %5.5x slot %x idx %x key %x tag %x"
	    , fp->inuse, slot, idx, key, tag));

	KASSERT(fp->inuse & (1 << slot));
	KASSERT(fp->tag[slot] == (tag ^ idx_encode(ctl, idx)
	    ^ fatp_xtra[slot]));

	if ((fp->inuse & (1 << slot))
	    && fp->tag[slot] == (tag ^ idx_encode(ctl, idx)
	    ^ fatp_xtra[slot])) {
		fp->inuse ^= 1 << slot;
		fp->tag[slot] = 0;

		/* When we delete entries, we do not compact.  This is
		 * due to temporality.  We add entries, and they
		 * (eventually) expire.  Older entries will be further
		 * down the chain.
		 */
		if (!fp->inuse) {
			uint32_t hi = tag & fat->mask;
			fatp_t *fq = 0;
			fatp_t *fr = fat->hash[hi];

			while (fr && fr != fp) {
				fr = fatp_next(fat, fq = fr);
			}

			if (fr == fp) {
				if (fq) {
					fq->nxt = fp->nxt;
					fp->nxt = 0;
					fatp_free(fat, fp);
				} else {
					KASSERT(fat->hash[hi] == fp);

					if (fp->nxt) {
						fat->hash[hi]
						    = fatp_next(fat, fp);
						fp->nxt = 0;
						fatp_free(fat, fp);
					} else {
						/* retain for next use.
						 */
						;
					}
				}
			} else {
				fr = fat->hash[hi];

				do {
					db_trace(KTR_VTW
					    , (fr
					    , "fat:*del inuse %5.5x"
					    " nxt %x"
					    , fr->inuse, fr->nxt));

					fr = fatp_next(fat, fq = fr);
				} while (fr && fr != fp);

				KASSERT(0 && "oops");
			}
		}
		vtw->key ^= ~0;
	}

	if (fat->vtw->is_v4) {
		tag = v4_port_tag(v4->lport);
	} else if (fat->vtw->is_v6) {
		tag = v6_port_tag(v6->lport);
	}

	/* Remove from fat->port[]
	 */
	key = vtw->port_key;
	slot = fatp_slot_from_key(fat, key);
	fp = fatp_from_key(fat, key);
	idx = vtw_index(ctl, vtw);

	db_trace(KTR_VTW
	    , (fp, "fatport: del inuse %5.5x"
	    " slot %x idx %x key %x tag %x"
	    , fp->inuse, slot, idx, key, tag));

	KASSERT(fp->inuse & (1 << slot));
	KASSERT(fp->tag[slot] == (tag ^ idx_encode(ctl, idx)
	    ^ fatp_xtra[slot]));

	if ((fp->inuse & (1 << slot))
	    && fp->tag[slot] == (tag ^ idx_encode(ctl, idx)
	    ^ fatp_xtra[slot])) {
		fp->inuse ^= 1 << slot;
		fp->tag[slot] = 0;

		if (!fp->inuse) {
			uint32_t hi = tag & fat->mask;
			fatp_t *fq = 0;
			fatp_t *fr = fat->port[hi];

			while (fr && fr != fp) {
				fr = fatp_next(fat, fq = fr);
			}

			if (fr == fp) {
				if (fq) {
					fq->nxt = fp->nxt;
					fp->nxt = 0;
					fatp_free(fat, fp);
				} else {
					KASSERT(fat->port[hi] == fp);

					if (fp->nxt) {
						fat->port[hi]
						    = fatp_next(fat, fp);
						fp->nxt = 0;
						fatp_free(fat, fp);
					} else {
						/* retain for next use.
						 */
						;
					}
				}
			}
		}
		vtw->port_key ^= ~0;
	}

	vtw->hashed = 0;
}

/*!\brief remove entry from hash, possibly free.
 *
 * Only the oldest entry in the arena is actually freed; deleting any
 * other entry merely unhashes it and leaves it to expire in FIFO order.
 */
void
vtw_del(vtw_ctl_t *ctl, vtw_t *vtw)
{
	KASSERT(mutex_owned(softnet_lock));

	if (vtw->hashed) {
		++vtw_stats.del;
		vtw_unhash(ctl, vtw);
	}

	/* We only delete the oldest entry.
	 */
	if (vtw != ctl->oldest.v)
		return;

	--ctl->nalloc;
	++ctl->nfree;

	vtw->expire.tv_sec = 0;
	vtw->expire.tv_usec = ~0;

	if (!ctl->nalloc)
		ctl->oldest.v = 0;
	else
		ctl->oldest.v = vtw_next(ctl, vtw);
}

/*!\brief insert vestigial timewait in hash chain
 */
static void
vtw_inshash_v4(vtw_ctl_t *ctl, vtw_t *vtw)
{
	uint32_t idx = vtw_index(ctl, vtw);
	uint32_t tag;
	vtw_v4_t *v4 = (void*)vtw;

	KASSERT(mutex_owned(softnet_lock));
	KASSERT(!vtw->hashed);
	KASSERT(ctl->clidx == vtw->msl_class);

	++vtw_stats.ins;

	tag = v4_tag(v4->faddr, v4->fport,
	    v4->laddr, v4->lport);

	vtw->key = fatp_vtw_inshash(ctl->fat, idx, tag, 0, vtw);

	db_trace(KTR_VTW, (ctl
	    , "vtw: ins %8.8x:%4.4x %8.8x:%4.4x"
	    " tag %8.8x key %8.8x"
	    , v4->faddr, v4->fport
	    , v4->laddr, v4->lport
	    , tag
	    , vtw->key));

	tag = v4_port_tag(v4->lport);
	vtw->port_key = fatp_vtw_inshash(ctl->fat, idx, tag, 1, vtw);

	db_trace(KTR_VTW, (ctl, "vtw: ins %P - %4.4x tag %8.8x key %8.8x"
	    , v4->lport, v4->lport
	    , tag
	    , vtw->key));

	vtw->hashed = 1;
}

/*!\brief insert vestigial timewait in hash chain
 */
static void
vtw_inshash_v6(vtw_ctl_t *ctl, vtw_t *vtw)
{
	uint32_t idx = vtw_index(ctl, vtw);
	uint32_t tag;
	vtw_v6_t *v6 = (void*)vtw;

	KASSERT(mutex_owned(softnet_lock));
	KASSERT(!vtw->hashed);
	KASSERT(ctl->clidx == vtw->msl_class);

	++vtw_stats.ins;

	tag = v6_tag(&v6->faddr, v6->fport,
	    &v6->laddr, v6->lport);

	vtw->key = fatp_vtw_inshash(ctl->fat, idx, tag, 0, vtw);

	tag = v6_port_tag(v6->lport);
	vtw->port_key = fatp_vtw_inshash(ctl->fat, idx, tag, 1, vtw);

	db_trace(KTR_VTW, (ctl, "vtw: ins %P - %4.4x tag %8.8x key %8.8x"
	    , v6->lport, v6->lport
	    , tag
	    , vtw->key));

	vtw->hashed = 1;
}

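/*
 * Hash lookup: `which' selects the table searched; 0 means the full
 * {faddr,fport,laddr,lport} hash, non-zero means the local-port-only
 * hash.  Returns the matching vestigial entry, or 0 on miss.
 */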
static vtw_t *
vtw_lookup_hash_v4(vtw_ctl_t *ctl, uint32_t faddr, uint16_t fport
    , uint32_t laddr, uint16_t lport
    , int which)
{
	vtw_v4_t *v4;
	vtw_t *vtw;
	uint32_t tag;
	fatp_t *fp;
	int i;
	uint32_t fatps = 0, probes = 0, losings = 0;

	if (!ctl || !ctl->fat)
		return 0;

	++vtw_stats.look[which];

	if (which) {
		tag = v4_port_tag(lport);
		fp = ctl->fat->port[tag & ctl->fat->mask];
	} else {
		tag = v4_tag(faddr, fport, laddr, lport);
		fp = ctl->fat->hash[tag & ctl->fat->mask];
	}

	while (fp && fp->inuse) {
		uint32_t inuse = fp->inuse;

		++fatps;

		for (i = 0; inuse && i < fatp_ntags(); ++i) {
			uint32_t idx;

			if (!(inuse & (1 << i)))
				continue;

			inuse ^= 1 << i;

			++probes;
			++vtw_stats.probe[which];

			idx = fp->tag[i] ^ tag ^ fatp_xtra[i];
			vtw = vtw_from_index(ctl, idx);

			if (!vtw) {
				/* Hopefully fast path.
				 */
				db_trace(KTR_VTW
				    , (fp, "vtw: fast %A:%P %A:%P"
				    " idx %x tag %x"
				    , faddr, fport
				    , laddr, lport
				    , idx, tag));
				continue;
			}

			v4 = (void*)vtw;

			/* The de-referencing of vtw is what we want to avoid.
			 * Losing.
			 */
			if (vtw_alive(vtw)
			    && ((which ? vtw->port_key : vtw->key)
			    == fatp_key(ctl->fat, fp, i))
			    && (which
			    || (v4->faddr == faddr && v4->laddr == laddr
			    && v4->fport == fport))
			    && v4->lport == lport) {
				++vtw_stats.hit[which];

				db_trace(KTR_VTW
				    , (fp, "vtw: hit %8.8x:%4.4x"
				    " %8.8x:%4.4x idx %x key %x"
				    , faddr, fport
				    , laddr, lport
				    , idx_decode(ctl, idx), vtw->key));

				KASSERT(vtw->hashed);

				goto out;
			}
			++vtw_stats.losing[which];
			++losings;

			if (vtw_alive(vtw)) {
				db_trace(KTR_VTW
				    , (fp, "vtw:!mis %8.8x:%4.4x"
				    " %8.8x:%4.4x key %x tag %x"
				    , faddr, fport
				    , laddr, lport
				    , fatp_key(ctl->fat, fp, i)
				    , v4_tag(faddr, fport
				    , laddr, lport)));
				db_trace(KTR_VTW
				    , (vtw, "vtw:!mis %8.8x:%4.4x"
				    " %8.8x:%4.4x key %x tag %x"
				    , v4->faddr, v4->fport
				    , v4->laddr, v4->lport
				    , vtw->key
				    , v4_tag(v4->faddr, v4->fport
				    , v4->laddr, v4->lport)));

				if (vtw->key == fatp_key(ctl->fat, fp, i)) {
					db_trace(KTR_VTW
					    , (vtw, "vtw:!mis %8.8x:%4.4x"
					    " %8.8x:%4.4x key %x"
					    " which %x"
					    , v4->faddr, v4->fport
					    , v4->laddr, v4->lport
					    , vtw->key
					    , which));

				} else {
					db_trace(KTR_VTW
					    , (vtw
					    , "vtw:!mis"
					    " key %8.8x != %8.8x"
					    " idx %x i %x which %x"
					    , vtw->key
					    , fatp_key(ctl->fat, fp, i)
					    , idx_decode(ctl, idx)
					    , i
					    , which));
				}
			} else {
				db_trace(KTR_VTW
				    , (fp
				    , "vtw:!mis free entry"
				    " idx %x vtw %p which %x"
				    , idx_decode(ctl, idx)
				    , vtw, which));
			}
		}

		if (fp->nxt) {
			fp = fatp_next(ctl->fat, fp);
		} else {
			break;
		}
	}
	++vtw_stats.miss[which];
	vtw = 0;
 out:
	if (fatps > vtw_stats.max_chain[which])
		vtw_stats.max_chain[which] = fatps;
	if (probes > vtw_stats.max_probe[which])
		vtw_stats.max_probe[which] = probes;
	if (losings > vtw_stats.max_loss[which])
		vtw_stats.max_loss[which] = losings;

	return vtw;
}

static vtw_t *
vtw_lookup_hash_v6(vtw_ctl_t *ctl, const struct in6_addr *faddr, uint16_t fport
    , const struct in6_addr *laddr, uint16_t lport
    , int which)
{
	vtw_v6_t *v6;
	vtw_t *vtw;
	uint32_t tag;
	fatp_t *fp;
	int i;
	uint32_t fatps = 0, probes = 0, losings = 0;

	++vtw_stats.look[which];

	if (!ctl || !ctl->fat)
		return 0;

	if (which) {
		tag = v6_port_tag(lport);
		fp = ctl->fat->port[tag & ctl->fat->mask];
	} else {
		tag = v6_tag(faddr, fport, laddr, lport);
		fp = ctl->fat->hash[tag & ctl->fat->mask];
	}

	while (fp && fp->inuse) {
		uint32_t inuse = fp->inuse;

		++fatps;

		for (i = 0; inuse && i < fatp_ntags(); ++i) {
			uint32_t idx;

			if (!(inuse & (1 << i)))
				continue;

			inuse ^= 1 << i;

			++probes;
			++vtw_stats.probe[which];

			idx = fp->tag[i] ^ tag ^ fatp_xtra[i];
			vtw = vtw_from_index(ctl, idx);

			db_trace(KTR_VTW
			    , (fp, "probe: %2d %6A:%4.4x %6A:%4.4x idx %x"
			    , i
			    , db_store(faddr, sizeof (*faddr)), fport
			    , db_store(laddr, sizeof (*laddr)), lport
			    , idx_decode(ctl, idx)));

			if (!vtw) {
				/* Hopefully fast path.
				 */
				continue;
			}

			v6 = (void*)vtw;

			if (vtw_alive(vtw)
			    && ((which ? vtw->port_key : vtw->key)
			    == fatp_key(ctl->fat, fp, i))
			    && v6->lport == lport
			    && (which
			    || (v6->fport == fport
			    && !bcmp(&v6->faddr, faddr, sizeof (*faddr))
			    && !bcmp(&v6->laddr, laddr
			    , sizeof (*laddr))))) {
				++vtw_stats.hit[which];

				KASSERT(vtw->hashed);
				goto out;
			} else {
				++vtw_stats.losing[which];
				++losings;
			}
		}

		if (fp->nxt) {
			fp = fatp_next(ctl->fat, fp);
		} else {
			break;
		}
	}
	++vtw_stats.miss[which];
	vtw = 0;
 out:
	if (fatps > vtw_stats.max_chain[which])
		vtw_stats.max_chain[which] = fatps;
	if (probes > vtw_stats.max_probe[which])
		vtw_stats.max_probe[which] = probes;
	if (losings > vtw_stats.max_loss[which])
		vtw_stats.max_loss[which] = losings;

	return vtw;
}

/*!\brief port iterator
 */
static vtw_t *
vtw_next_port_v4(struct tcp_ports_iterator *it)
{
	vtw_ctl_t *ctl = it->ctl;
	vtw_v4_t *v4;
	vtw_t *vtw;
	uint32_t tag;
	uint16_t lport = it->port;
	fatp_t *fp;
	int i;
	uint32_t fatps = 0, probes = 0, losings = 0;

	tag = v4_port_tag(lport);
	if (!it->fp) {
		it->fp = ctl->fat->port[tag & ctl->fat->mask];
		it->slot_idx = 0;
	}
	fp = it->fp;

	while (fp) {
		uint32_t inuse = fp->inuse;

		++fatps;

		for (i = it->slot_idx; inuse && i < fatp_ntags(); ++i) {
			uint32_t idx;

			if (!(inuse & (1 << i)))
				continue;

			inuse &= ~0U << i;

			if (i < it->slot_idx)
				continue;

			++vtw_stats.probe[1];
			++probes;

			idx = fp->tag[i] ^ tag ^ fatp_xtra[i];
			vtw = vtw_from_index(ctl, idx);

			if (!vtw) {
				/* Hopefully fast path.
				 */
				continue;
			}

			v4 = (void*)vtw;

			if (vtw_alive(vtw)
			    && vtw->port_key == fatp_key(ctl->fat, fp, i)
			    && v4->lport == lport) {
				++vtw_stats.hit[1];

				it->slot_idx = i + 1;

				goto out;
			} else if (vtw_alive(vtw)) {
				++vtw_stats.losing[1];
				++losings;

				db_trace(KTR_VTW
				    , (vtw, "vtw:!mis"
				    " port %8.8x:%4.4x %8.8x:%4.4x"
				    " key %x port %x"
				    , v4->faddr, v4->fport
				    , v4->laddr, v4->lport
				    , vtw->key
				    , lport));
			} else {
				/* Really losing here.  We are coming
				 * up with references to free entries.
				 * Might find it better to use
				 * traditional, or need another
				 * ad-hockery.  The other ad-hockery
				 * would be to pull more into the
				 * cache line to reject the false
				 * hits.
				 */
				++vtw_stats.losing[1];
				++losings;
				db_trace(KTR_VTW
				    , (fp, "vtw:!mis port %x"
				    " - free entry idx %x vtw %p"
				    , lport
				    , idx_decode(ctl, idx)
				    , vtw));
			}
		}

		if (fp->nxt) {
			it->fp = fp = fatp_next(ctl->fat, fp);
			it->slot_idx = 0;
		} else {
			it->fp = 0;
			break;
		}
	}
	++vtw_stats.miss[1];

	vtw = 0;
 out:
	if (fatps > vtw_stats.max_chain[1])
		vtw_stats.max_chain[1] = fatps;
	if (probes > vtw_stats.max_probe[1])
		vtw_stats.max_probe[1] = probes;
	if (losings > vtw_stats.max_loss[1])
		vtw_stats.max_loss[1] = losings;

	return vtw;
}

/*!\brief port iterator
 */
static vtw_t *
vtw_next_port_v6(struct tcp_ports_iterator *it)
{
	vtw_ctl_t *ctl = it->ctl;
	vtw_v6_t *v6;
	vtw_t *vtw;
	uint32_t tag;
	uint16_t lport = it->port;
	fatp_t *fp;
	int i;
	uint32_t fatps = 0, probes = 0, losings = 0;

	tag = v6_port_tag(lport);
	if (!it->fp) {
		it->fp = ctl->fat->port[tag & ctl->fat->mask];
		it->slot_idx = 0;
	}
	fp = it->fp;

	while (fp) {
		uint32_t inuse = fp->inuse;

		++fatps;

		for (i = it->slot_idx; inuse && i < fatp_ntags(); ++i) {
			uint32_t idx;

			if (!(inuse & (1 << i)))
				continue;

			inuse &= ~0U << i;

			if (i < it->slot_idx)
				continue;

			++vtw_stats.probe[1];
			++probes;

			idx = fp->tag[i] ^ tag ^ fatp_xtra[i];
			vtw = vtw_from_index(ctl, idx);

			if (!vtw) {
				/* Hopefully fast path.
				 */
				continue;
			}

			v6 = (void*)vtw;

			db_trace(KTR_VTW
			    , (vtw, "vtw: i %x idx %x fp->tag %x"
			    " tag %x xtra %x"
			    , i, idx_decode(ctl, idx)
			    , fp->tag[i], tag, fatp_xtra[i]));

			if (vtw_alive(vtw)
			    && vtw->port_key == fatp_key(ctl->fat, fp, i)
			    && v6->lport == lport) {
				++vtw_stats.hit[1];

				db_trace(KTR_VTW
				    , (fp, "vtw: nxt port %P - %4.4x"
				    " idx %x key %x"
				    , lport, lport
				    , idx_decode(ctl, idx), vtw->key));

				it->slot_idx = i + 1;
				goto out;
			} else if (vtw_alive(vtw)) {
				++vtw_stats.losing[1];
				++losings;

				db_trace(KTR_VTW
				    , (vtw, "vtw:!mis port %6A:%4.4x"
				    " %6A:%4.4x key %x port %x"
				    , db_store(&v6->faddr
				    , sizeof (v6->faddr))
				    , v6->fport
				    , db_store(&v6->laddr
				    , sizeof (v6->laddr))
				    , v6->lport
				    , vtw->key
				    , lport));
			} else {
				/* Really losing here.  We are coming
				 * up with references to free entries.
				 * Might find it better to use
				 * traditional, or need another
				 * ad-hockery.  The other ad-hockery
				 * would be to pull more into the
				 * cache line to reject the false
				 * hits.
				 */
				++vtw_stats.losing[1];
				++losings;

				db_trace(KTR_VTW
				    , (fp
				    , "vtw:!mis port %x"
				    " - free entry idx %x vtw %p"
				    , lport, idx_decode(ctl, idx)
				    , vtw));
			}
		}

		if (fp->nxt) {
			it->fp = fp = fatp_next(ctl->fat, fp);
			it->slot_idx = 0;
		} else {
			it->fp = 0;
			break;
		}
	}
	++vtw_stats.miss[1];

	vtw = 0;
 out:
	if (fatps > vtw_stats.max_chain[1])
		vtw_stats.max_chain[1] = fatps;
	if (probes > vtw_stats.max_probe[1])
		vtw_stats.max_probe[1] = probes;
	if (losings > vtw_stats.max_loss[1])
		vtw_stats.max_loss[1] = losings;

	return vtw;
}

/*!\brief initialise the VTW allocation arena
 *
 * There are 1+3 allocation classes:
 *	0	classless
 *	{1,2,3}	MSL-class based allocation
 *
 * The allocation arenas are all initialised.  Classless gets all the
 * space.  MSL-class based divides the arena, so that allocation
 * within a class can proceed without having to consider entries
 * (aka: cache lines) from different classes.
 *
 * Usually, we are completely classless or class-based, but there can be
 * transition periods, corresponding to dynamic adjustments in the config
 * by the operator.
 */
static void
vtw_init(fatp_ctl_t *fat, vtw_ctl_t *ctl, const uint32_t n, vtw_t *ctl_base_v)
{
	int class_n, i;
	vtw_t *base;

	ctl->base.v = ctl_base_v;

	if (ctl->is_v4) {
		ctl->lim.v4 = ctl->base.v4 + n - 1;
		ctl->alloc.v4 = ctl->base.v4;
	} else {
		ctl->lim.v6 = ctl->base.v6 + n - 1;
		ctl->alloc.v6 = ctl->base.v6;
	}

	ctl->nfree = n;
	ctl->ctl = ctl;

	ctl->idx_bits = 32;
	for (ctl->idx_mask = ~0; (ctl->idx_mask & (n-1)) == n-1; ) {
		ctl->idx_mask >>= 1;
		ctl->idx_bits -= 1;
	}

	ctl->idx_mask <<= 1;
	ctl->idx_mask |= 1;
	ctl->idx_bits += 1;

	ctl->fat = fat;
	fat->vtw = ctl;

	/* Divide the resources equally amongst the classes.
	 * This is not optimal, as the different classes
	 * arrive and leave at different rates, but it is
	 * the best I can do for now.
	 */
	class_n = n / (VTW_NCLASS-1);
	base = ctl->base.v;

	for (i = 1; i < VTW_NCLASS; ++i) {
		int j;

		ctl[i] = ctl[0];
		ctl[i].clidx = i;

		ctl[i].base.v = base;
		ctl[i].alloc = ctl[i].base;

		for (j = 0; j < class_n - 1; ++j) {
			if (tcp_msl_enable)
				base->msl_class = i;
			base = vtw_next(ctl, base);
		}

		ctl[i].lim.v = base;
		base = vtw_next(ctl, base);
		ctl[i].nfree = class_n;
	}

	vtw_debug_init();
}

/*!\brief map class to TCP MSL
 */
static inline uint32_t
class_to_msl(int msl_class)
{
	switch (msl_class) {
	case 0:
	case 1:
		return tcp_msl_remote ? tcp_msl_remote : (TCPTV_MSL >> 0);
	case 2:
		return tcp_msl_local ? tcp_msl_local : (TCPTV_MSL >> 1);
	default:
		return tcp_msl_loop ? tcp_msl_loop : (TCPTV_MSL >> 2);
	}
}

/*!\brief map TCP MSL to class
 */
static inline uint32_t
msl_to_class(int msl)
{
	if (tcp_msl_enable) {
		if (msl <= (tcp_msl_loop ? tcp_msl_loop : (TCPTV_MSL >> 2)))
			return 1+2;
		if (msl <= (tcp_msl_local ? tcp_msl_local : (TCPTV_MSL >> 1)))
			return 1+1;
		return 1;
	}
	return 0;
}
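
/*
 * Note that MSL values here are in PR_SLOWHZ ticks, as in struct tcpcb's
 * t_msl; e.g. the stock TCPTV_MSL is 30*PR_SLOWHZ.  Class 1 is remote,
 * 2 is local, 3 is loopback, and class 0 is the classless arena used
 * when MSLT is disabled.
 */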

/*!\brief allocate a vtw entry
 */
static inline vtw_t *
vtw_alloc(vtw_ctl_t *ctl)
{
	vtw_t *vtw = 0;
	int stuck = 0;
	int avail = ctl ? (ctl->nalloc + ctl->nfree) : 0;
	int msl;

	KASSERT(mutex_owned(softnet_lock));

	/* If no resources, we will not get far.
	 */
	if (!ctl || !ctl->base.v4 || avail <= 0)
		return 0;

	/* Obtain a free one.
	 */
	while (!ctl->nfree) {
		vtw_age(ctl, 0);

		if (++stuck > avail) {
			/* When in transition between
			 * schemes (classless, classed) we
			 * can be stuck having to await the
			 * expiration of cross-allocated entries.
			 *
			 * Returning zero means we will fall back to the
			 * traditional TIME_WAIT handling, except in the
			 * case of a re-sched, in which case we cannot
			 * perform the re-sched, but will retain the extant
			 * entry.
			 */
			db_trace(KTR_VTW
			    , (ctl, "vtw:!none free in class %x %x/%x"
			    , ctl->clidx
			    , ctl->nalloc, ctl->nfree));

			return 0;
		}
	}

	vtw = ctl->alloc.v;

	if (vtw->msl_class != ctl->clidx) {
		/* Usurping rules:
		 *	0 -> {1,2,3} or {1,2,3} -> 0
		 */
		KASSERT(!vtw->msl_class || !ctl->clidx);

		if (vtw->hashed || vtw->expire.tv_sec) {
			/* As this is owned by some other class,
			 * we must wait for it to expire.
			 * This will only happen on class/classless
			 * transitions, which are guaranteed to progress
			 * to completion in small finite time, barring bugs.
			 */
			db_trace(KTR_VTW
			    , (ctl, "vtw:!%p class %x!=%x %x:%x%s"
			    , vtw, vtw->msl_class, ctl->clidx
			    , vtw->expire.tv_sec
			    , vtw->expire.tv_usec
			    , vtw->hashed ? " hashed" : ""));

			return 0;
		}

		db_trace(KTR_VTW
		    , (ctl, "vtw:!%p usurped from %x to %x"
		    , vtw, vtw->msl_class, ctl->clidx));

		vtw->msl_class = ctl->clidx;
	}

	if (vtw_alive(vtw)) {
		KASSERT(0 && "next free not free");
		return 0;
	}

	/* Advance allocation pointer.
	 */
	ctl->alloc.v = vtw_next(ctl, vtw);

	--ctl->nfree;
	++ctl->nalloc;

	msl = (2 * class_to_msl(ctl->clidx) * 1000) / PR_SLOWHZ;	// msec

	/* mark expiration
	 */
	getmicrouptime(&vtw->expire);

	/* Move expiration into the future.
	 */
	vtw->expire.tv_sec += msl / 1000;
	vtw->expire.tv_usec += 1000 * (msl % 1000);

	while (vtw->expire.tv_usec >= 1000*1000) {
		vtw->expire.tv_usec -= 1000*1000;
		vtw->expire.tv_sec += 1;
	}

	if (!ctl->oldest.v)
		ctl->oldest.v = vtw;

	return vtw;
}


/*!\brief expiration
 */
static int
vtw_age(vtw_ctl_t *ctl, struct timeval *_when)
{
	vtw_t *vtw;
	struct timeval then, *when = _when;
	int maxtries = 0;

	if (!ctl->oldest.v) {
		KASSERT(!ctl->nalloc);
		return 0;
	}

	for (vtw = ctl->oldest.v; vtw && ctl->nalloc; ) {
		if (++maxtries > ctl->nalloc)
			break;

		if (vtw->msl_class != ctl->clidx) {
			db_trace(KTR_VTW
			    , (vtw, "vtw:!age class mismatch %x != %x"
			    , vtw->msl_class, ctl->clidx));
			/* XXXX
			 * See if the appropriate action is to skip to the next.
			 * XXXX
			 */
			ctl->oldest.v = vtw = vtw_next(ctl, vtw);
			continue;
		}
		if (!when) {
			/* Latch oldest timeval if none specified.
			 */
			then = vtw->expire;
			when = &then;
		}

		if (!timercmp(&vtw->expire, when, <=))
			break;

		db_trace(KTR_VTW
		    , (vtw, "vtw: expire %x %8.8x:%8.8x %x/%x"
		    , ctl->clidx
		    , vtw->expire.tv_sec
		    , vtw->expire.tv_usec
		    , ctl->nalloc
		    , ctl->nfree));

		if (!_when)
			++vtw_stats.kill;

		vtw_del(ctl, vtw);
		vtw = ctl->oldest.v;
	}

	return ctl->nalloc;	// # remaining allocated
}

static callout_t vtw_cs;

/*!\brief notice the passage of time.
 * It seems to be getting faster.  What happened to the year?
 */
static void
vtw_tick(void *arg)
{
	struct timeval now;
	int i, cnt = 0;

	getmicrouptime(&now);

	db_trace(KTR_VTW, (arg, "vtk: tick - now %8.8x:%8.8x"
	    , now.tv_sec, now.tv_usec));

	mutex_enter(softnet_lock);

	for (i = 0; i < VTW_NCLASS; ++i) {
		cnt += vtw_age(&vtw_tcpv4[i], &now);
		cnt += vtw_age(&vtw_tcpv6[i], &now);
	}

	/* Keep ticks coming while we need them.
	 */
	if (cnt)
		callout_schedule(&vtw_cs, hz / 5);
	else {
		tcp_vtw_was_enabled = 0;
		tcbtable.vestige = 0;
	}
	mutex_exit(softnet_lock);
}

/* in_pcblookup_ports assist for handling vestigial entries.
 */
static void *
tcp_init_ports_v4(struct in_addr addr, u_int port, int wild)
{
	struct tcp_ports_iterator *it = &tcp_ports_iterator_v4;

	bzero(it, sizeof (*it));

	/* Note: the reference to vtw_tcpv4[0] is fine.
	 * We do not need per-class iteration.  We just
	 * need to get to the fat, and there is one
	 * shared fat.
	 */
	if (vtw_tcpv4[0].fat) {
		it->addr.v4 = addr;
		it->port = port;
		it->wild = !!wild;
		it->ctl = &vtw_tcpv4[0];

		++vtw_stats.look[1];
	}

	return it;
}

/*!\brief export an IPv4 vtw.
 */
static int
vtw_export_v4(vtw_ctl_t *ctl, vtw_t *vtw, vestigial_inpcb_t *res)
{
	vtw_v4_t *v4 = (void*)vtw;

	bzero(res, sizeof (*res));

	if (ctl && vtw) {
		if (!ctl->clidx && vtw->msl_class)
			ctl += vtw->msl_class;
		else
			KASSERT(ctl->clidx == vtw->msl_class);

		res->valid = 1;
		res->v4 = 1;

		res->faddr.v4.s_addr = v4->faddr;
		res->laddr.v4.s_addr = v4->laddr;
		res->fport = v4->fport;
		res->lport = v4->lport;
		res->vtw = vtw;		// netlock held over call(s)
		res->ctl = ctl;
		res->reuse_addr = vtw->reuse_addr;
		res->reuse_port = vtw->reuse_port;
		res->snd_nxt = vtw->snd_nxt;
		res->rcv_nxt = vtw->rcv_nxt;
		res->rcv_wnd = vtw->rcv_wnd;
		res->uid = vtw->uid;
	}

	return res->valid;
}

/*!\brief return next port in the port iterator.  yowza.
 */
static int
tcp_next_port_v4(void *arg, struct vestigial_inpcb *res)
{
	struct tcp_ports_iterator *it = arg;
	vtw_t *vtw = 0;

	if (it->ctl)
		vtw = vtw_next_port_v4(it);

	if (!vtw)
		it->ctl = 0;

	return vtw_export_v4(it->ctl, vtw, res);
}

static int
tcp_lookup_v4(struct in_addr faddr, uint16_t fport,
    struct in_addr laddr, uint16_t lport,
    struct vestigial_inpcb *res)
{
	vtw_t *vtw;
	vtw_ctl_t *ctl;

	db_trace(KTR_VTW
	    , (res, "vtw: lookup %A:%P %A:%P"
	    , faddr, fport
	    , laddr, lport));

	vtw = vtw_lookup_hash_v4((ctl = &vtw_tcpv4[0])
	    , faddr.s_addr, fport
	    , laddr.s_addr, lport, 0);

	return vtw_export_v4(ctl, vtw, res);
}

/* in_pcblookup_ports assist for handling vestigial entries.
 */
static void *
tcp_init_ports_v6(const struct in6_addr *addr, u_int port, int wild)
{
	struct tcp_ports_iterator *it = &tcp_ports_iterator_v6;

	bzero(it, sizeof (*it));

	/* Note: the reference to vtw_tcpv6[0] is fine.
	 * We do not need per-class iteration.  We just
	 * need to get to the fat, and there is one
	 * shared fat.
	 */
	if (vtw_tcpv6[0].fat) {
		it->addr.v6 = *addr;
		it->port = port;
		it->wild = !!wild;
		it->ctl = &vtw_tcpv6[0];

		++vtw_stats.look[1];
	}

	return it;
}

/*!\brief export an IPv6 vtw.
 */
static int
vtw_export_v6(vtw_ctl_t *ctl, vtw_t *vtw, vestigial_inpcb_t *res)
{
	vtw_v6_t *v6 = (void*)vtw;

	bzero(res, sizeof (*res));

	if (ctl && vtw) {
		if (!ctl->clidx && vtw->msl_class)
			ctl += vtw->msl_class;
		else
			KASSERT(ctl->clidx == vtw->msl_class);

		res->valid = 1;
		res->v4 = 0;

		res->faddr.v6 = v6->faddr;
		res->laddr.v6 = v6->laddr;
		res->fport = v6->fport;
		res->lport = v6->lport;
		res->vtw = vtw;		// netlock held over call(s)
		res->ctl = ctl;

		res->v6only = vtw->v6only;
		res->reuse_addr = vtw->reuse_addr;
		res->reuse_port = vtw->reuse_port;

		res->snd_nxt = vtw->snd_nxt;
		res->rcv_nxt = vtw->rcv_nxt;
		res->rcv_wnd = vtw->rcv_wnd;
		res->uid = vtw->uid;
	}

	return res->valid;
}

static int
tcp_next_port_v6(void *arg, struct vestigial_inpcb *res)
{
	struct tcp_ports_iterator *it = arg;
	vtw_t *vtw = 0;

	if (it->ctl)
		vtw = vtw_next_port_v6(it);

	if (!vtw)
		it->ctl = 0;

	return vtw_export_v6(it->ctl, vtw, res);
}

static int
tcp_lookup_v6(const struct in6_addr *faddr, uint16_t fport,
    const struct in6_addr *laddr, uint16_t lport,
    struct vestigial_inpcb *res)
{
	vtw_ctl_t *ctl;
	vtw_t *vtw;

	db_trace(KTR_VTW
	    , (res, "vtw: lookup %6A:%P %6A:%P"
	    , db_store(faddr, sizeof (*faddr)), fport
	    , db_store(laddr, sizeof (*laddr)), lport));

	vtw = vtw_lookup_hash_v6((ctl = &vtw_tcpv6[0])
	    , faddr, fport
	    , laddr, lport, 0);

	return vtw_export_v6(ctl, vtw, res);
}

static vestigial_hooks_t tcp_hooks = {
	.init_ports4	= tcp_init_ports_v4,
	.next_port4	= tcp_next_port_v4,
	.lookup4	= tcp_lookup_v4,
	.init_ports6	= tcp_init_ports_v6,
	.next_port6	= tcp_next_port_v6,
	.lookup6	= tcp_lookup_v6,
};

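/*!\brief map an address family to its fat/vtw control instances.
 */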
static bool
vtw_select(int af, fatp_ctl_t **fatp, vtw_ctl_t **ctlp)
{
	fatp_ctl_t *fat;
	vtw_ctl_t *ctl;

	switch (af) {
	case AF_INET:
		fat = &fat_tcpv4;
		ctl = &vtw_tcpv4[0];
		break;
	case AF_INET6:
		fat = &fat_tcpv6;
		ctl = &vtw_tcpv6[0];
		break;
	default:
		return false;
	}
	if (fatp != NULL)
		*fatp = fat;
	if (ctlp != NULL)
		*ctlp = ctl;
	return true;
}

/*!\brief initialize controlling instance
 */
static int
vtw_control_init(int af)
{
	fatp_ctl_t *fat;
	vtw_ctl_t *ctl;
	fatp_t *fat_base;
	fatp_t **fat_hash;
	vtw_t *ctl_base_v;
	uint32_t n, m;
	size_t sz;

	KASSERT(powerof2(tcp_vtw_entries));

	if (!vtw_select(af, &fat, &ctl))
		return EAFNOSUPPORT;

	if (fat->hash != NULL) {
		KASSERT(fat->base != NULL && ctl->base.v != NULL);
		return 0;
	}

	/* Allocate 10% more capacity in the fat pointers.
	 * We should only need ~#hash additional based on
	 * how they age, but TIME_WAIT assassination could cause
	 * sparse fat pointer utilisation.
	 */
	m = 512;
	n = 2*m + (11 * (tcp_vtw_entries / fatp_ntags())) / 10;
	sz = (ctl->is_v4 ? sizeof(vtw_v4_t) : sizeof(vtw_v6_t));

	fat_hash = kmem_zalloc(2*m * sizeof(fatp_t *), KM_NOSLEEP);

	if (fat_hash == NULL) {
		printf("%s: could not allocate %zu bytes for "
		    "hash anchors", __func__, 2*m * sizeof(fatp_t *));
		return ENOMEM;
	}

	fat_base = kmem_zalloc(2*n * sizeof(fatp_t), KM_NOSLEEP);

	if (fat_base == NULL) {
		kmem_free(fat_hash, 2*m * sizeof (fatp_t *));
		printf("%s: could not allocate %zu bytes for "
		    "fatp_t array", __func__, 2*n * sizeof(fatp_t));
		return ENOMEM;
	}

	ctl_base_v = kmem_zalloc(tcp_vtw_entries * sz, KM_NOSLEEP);

	if (ctl_base_v == NULL) {
		kmem_free(fat_hash, 2*m * sizeof (fatp_t *));
		kmem_free(fat_base, 2*n * sizeof(fatp_t));
		printf("%s: could not allocate %zu bytes for "
		    "vtw_t array", __func__, tcp_vtw_entries * sz);
		return ENOMEM;
	}

	fatp_init(fat, n, m, fat_base, fat_hash);

	vtw_init(fat, ctl, tcp_vtw_entries, ctl_base_v);

	return 0;
}

/*!\brief select controlling instance
 */
static vtw_ctl_t *
vtw_control(int af, uint32_t msl)
{
	fatp_ctl_t *fat;
	vtw_ctl_t *ctl;
	int msl_class = msl_to_class(msl);

	if (!vtw_select(af, &fat, &ctl))
		return NULL;

	if (!fat->base || !ctl->base.v)
		return NULL;

	if (!tcp_vtw_was_enabled) {
		/* This guarantees timer ticks until we no longer need them.
		 */
		tcp_vtw_was_enabled = 1;

		callout_schedule(&vtw_cs, hz / 5);

		tcbtable.vestige = &tcp_hooks;
	}

	return ctl + msl_class;
}

/*!\brief add TCP pcb to vestigial timewait
 */
int
vtw_add(int af, struct tcpcb *tp)
{
#ifdef VTW_DEBUG
	int enable;
#endif
	vtw_ctl_t *ctl;
	vtw_t *vtw;

	KASSERT(mutex_owned(softnet_lock));

	ctl = vtw_control(af, tp->t_msl);
	if (!ctl)
		return 0;

#ifdef VTW_DEBUG
	enable = (af == AF_INET) ? tcp4_vtw_enable : tcp6_vtw_enable;
#endif

	vtw = vtw_alloc(ctl);

	if (vtw) {
		vtw->snd_nxt = tp->snd_nxt;
		vtw->rcv_nxt = tp->rcv_nxt;

		switch (af) {
		case AF_INET: {
			struct inpcb *inp = tp->t_inpcb;
			vtw_v4_t *v4 = (void*)vtw;

			v4->faddr = inp->inp_faddr.s_addr;
			v4->laddr = inp->inp_laddr.s_addr;
			v4->fport = inp->inp_fport;
			v4->lport = inp->inp_lport;

			vtw->reuse_port = !!(inp->inp_socket->so_options
			    & SO_REUSEPORT);
			vtw->reuse_addr = !!(inp->inp_socket->so_options
			    & SO_REUSEADDR);
			vtw->v6only = 0;
			vtw->uid = inp->inp_socket->so_uidinfo->ui_uid;

			vtw_inshash_v4(ctl, vtw);

#ifdef VTW_DEBUG
			/* Immediate lookup (connected and port) to
			 * ensure at least that works!
			 */
			if (enable & 4) {
				KASSERT(vtw_lookup_hash_v4
				    (ctl
				    , inp->inp_faddr.s_addr, inp->inp_fport
				    , inp->inp_laddr.s_addr, inp->inp_lport
				    , 0)
				    == vtw);
				KASSERT(vtw_lookup_hash_v4
				    (ctl
				    , inp->inp_faddr.s_addr, inp->inp_fport
				    , inp->inp_laddr.s_addr, inp->inp_lport
				    , 1));
			}
			/* Immediate port iterator functionality check: not wild
			 */
			if (enable & 8) {
				struct tcp_ports_iterator *it;
				struct vestigial_inpcb res;
				int cnt = 0;

				it = tcp_init_ports_v4(inp->inp_laddr
				    , inp->inp_lport, 0);

				while (tcp_next_port_v4(it, &res)) {
					++cnt;
				}
				KASSERT(cnt);
			}
			/* Immediate port iterator functionality check: wild
			 */
			if (enable & 16) {
				struct tcp_ports_iterator *it;
				struct vestigial_inpcb res;
				struct in_addr any;
				int cnt = 0;

				any.s_addr = htonl(INADDR_ANY);

				it = tcp_init_ports_v4(any, inp->inp_lport, 1);

				while (tcp_next_port_v4(it, &res)) {
					++cnt;
				}
				KASSERT(cnt);
			}
#endif /* VTW_DEBUG */
			break;
		}

		case AF_INET6: {
			struct in6pcb *inp = tp->t_in6pcb;
			vtw_v6_t *v6 = (void*)vtw;

			v6->faddr = inp->in6p_faddr;
			v6->laddr = inp->in6p_laddr;
			v6->fport = inp->in6p_fport;
			v6->lport = inp->in6p_lport;

			vtw->reuse_port = !!(inp->in6p_socket->so_options
			    & SO_REUSEPORT);
			vtw->reuse_addr = !!(inp->in6p_socket->so_options
			    & SO_REUSEADDR);
			vtw->v6only = !!(inp->in6p_flags
			    & IN6P_IPV6_V6ONLY);
			vtw->uid = inp->in6p_socket->so_uidinfo->ui_uid;

			vtw_inshash_v6(ctl, vtw);
#ifdef VTW_DEBUG
			/* Immediate lookup (connected and port) to
			 * ensure at least that works!
			 */
			if (enable & 4) {
				KASSERT(vtw_lookup_hash_v6(ctl
				    , &inp->in6p_faddr, inp->in6p_fport
				    , &inp->in6p_laddr, inp->in6p_lport
				    , 0)
				    == vtw);
				KASSERT(vtw_lookup_hash_v6
				    (ctl
				    , &inp->in6p_faddr, inp->in6p_fport
				    , &inp->in6p_laddr, inp->in6p_lport
				    , 1));
			}
			/* Immediate port iterator functionality check: not wild
			 */
			if (enable & 8) {
				struct tcp_ports_iterator *it;
				struct vestigial_inpcb res;
				int cnt = 0;

				it = tcp_init_ports_v6(&inp->in6p_laddr
				    , inp->in6p_lport, 0);

				while (tcp_next_port_v6(it, &res)) {
					++cnt;
				}
				KASSERT(cnt);
			}
			/* Immediate port iterator functionality check: wild
			 */
			if (enable & 16) {
				struct tcp_ports_iterator *it;
				struct vestigial_inpcb res;
				static struct in6_addr any = IN6ADDR_ANY_INIT;
				int cnt = 0;

				it = tcp_init_ports_v6(&any
				    , inp->in6p_lport, 1);

				while (tcp_next_port_v6(it, &res)) {
					++cnt;
				}
				KASSERT(cnt);
			}
#endif /* VTW_DEBUG */
			break;
		}
		}

		tcp_canceltimers(tp);
		tp = tcp_close(tp);
		KASSERT(!tp);

		return 1;
	}

	return 0;
}

/*!\brief restart timer for vestigial time-wait entry
 */
static void
vtw_restart_v4(vestigial_inpcb_t *vp)
{
	vtw_v4_t copy = *(vtw_v4_t*)vp->vtw;
	vtw_t *vtw;
	vtw_t *cp = &copy.common;
	vtw_ctl_t *ctl;

	KASSERT(mutex_owned(softnet_lock));

	db_trace(KTR_VTW
	    , (vp->vtw, "vtw: restart %A:%P %A:%P"
	    , vp->faddr.v4.s_addr, vp->fport
	    , vp->laddr.v4.s_addr, vp->lport));

	/* Class might have changed, so have a squiz.
	 */
	ctl = vtw_control(AF_INET, class_to_msl(cp->msl_class));
	vtw = vtw_alloc(ctl);

	if (vtw) {
		vtw_v4_t *v4 = (void*)vtw;

		/* Safe now to unhash the old entry
		 */
		vtw_del(vp->ctl, vp->vtw);

		vtw->snd_nxt = cp->snd_nxt;
		vtw->rcv_nxt = cp->rcv_nxt;

		v4->faddr = copy.faddr;
		v4->laddr = copy.laddr;
		v4->fport = copy.fport;
		v4->lport = copy.lport;

		vtw->reuse_port = cp->reuse_port;
		vtw->reuse_addr = cp->reuse_addr;
		vtw->v6only = 0;
		vtw->uid = cp->uid;

		vtw_inshash_v4(ctl, vtw);
	}

	vp->valid = 0;
}

/*!\brief restart timer for vestigial time-wait entry
 */
static void
vtw_restart_v6(vestigial_inpcb_t *vp)
{
	vtw_v6_t copy = *(vtw_v6_t*)vp->vtw;
	vtw_t *vtw;
	vtw_t *cp = &copy.common;
	vtw_ctl_t *ctl;

	KASSERT(mutex_owned(softnet_lock));

	db_trace(KTR_VTW
	    , (vp->vtw, "vtw: restart %6A:%P %6A:%P"
	    , db_store(&vp->faddr.v6, sizeof (vp->faddr.v6))
	    , vp->fport
	    , db_store(&vp->laddr.v6, sizeof (vp->laddr.v6))
	    , vp->lport));

	/* Class might have changed, so have a squiz.
	 */
	ctl = vtw_control(AF_INET6, class_to_msl(cp->msl_class));
	vtw = vtw_alloc(ctl);

	if (vtw) {
		vtw_v6_t *v6 = (void*)vtw;

		/* Safe now to unhash the old entry
		 */
		vtw_del(vp->ctl, vp->vtw);

		vtw->snd_nxt = cp->snd_nxt;
		vtw->rcv_nxt = cp->rcv_nxt;

		v6->faddr = copy.faddr;
		v6->laddr = copy.laddr;
		v6->fport = copy.fport;
		v6->lport = copy.lport;

		vtw->reuse_port = cp->reuse_port;
		vtw->reuse_addr = cp->reuse_addr;
		vtw->v6only = cp->v6only;
		vtw->uid = cp->uid;

		vtw_inshash_v6(ctl, vtw);
	}

	vp->valid = 0;
}

/*!\brief restart timer for vestigial time-wait entry
 */
void
vtw_restart(vestigial_inpcb_t *vp)
{
	if (!vp || !vp->valid)
		return;

	if (vp->v4)
		vtw_restart_v4(vp);
	else
		vtw_restart_v6(vp);
}

int
sysctl_tcp_vtw_enable(SYSCTLFN_ARGS)
{
	int en, rc;
	struct sysctlnode node;

	node = *rnode;
	en = *(int *)rnode->sysctl_data;
	node.sysctl_data = &en;

	rc = sysctl_lookup(SYSCTLFN_CALL(&node));
	if (rc != 0 || newp == NULL)
		return rc;

	if (rnode->sysctl_data != &tcp4_vtw_enable &&
	    rnode->sysctl_data != &tcp6_vtw_enable)
		rc = ENOENT;
	else if ((en & 1) == 0)
		rc = 0;
	else if (rnode->sysctl_data == &tcp4_vtw_enable)
		rc = vtw_control_init(AF_INET);
	else /* rnode->sysctl_data == &tcp6_vtw_enable */
		rc = vtw_control_init(AF_INET6);

	if (rc == 0)
		*(int *)rnode->sysctl_data = en;

	return rc;
}

int
vtw_earlyinit(void)
{
	int i, rc;

	callout_init(&vtw_cs, 0);
	callout_setfunc(&vtw_cs, vtw_tick, 0);

	for (i = 0; i < VTW_NCLASS; ++i) {
		vtw_tcpv4[i].is_v4 = 1;
		vtw_tcpv6[i].is_v6 = 1;
	}

	if ((tcp4_vtw_enable & 1) != 0 &&
	    (rc = vtw_control_init(AF_INET)) != 0)
		return rc;

	if ((tcp6_vtw_enable & 1) != 0 &&
	    (rc = vtw_control_init(AF_INET6)) != 0)
		return rc;

	return 0;
}

#ifdef VTW_DEBUG
#include <sys/syscallargs.h>
#include <sys/sysctl.h>

/*!\brief add lalp, fafp entries for debug
 */
int
vtw_debug_add(int af, sin_either_t *la, sin_either_t *fa, int msl, int msl_class)
{
	vtw_ctl_t *ctl;
	vtw_t *vtw;

	ctl = vtw_control(af, msl ? msl : class_to_msl(msl_class));
	if (!ctl)
		return 0;

	vtw = vtw_alloc(ctl);

	if (vtw) {
		vtw->snd_nxt = 0;
		vtw->rcv_nxt = 0;

		switch (af) {
		case AF_INET: {
			vtw_v4_t *v4 = (void*)vtw;

			v4->faddr = fa->sin_addr.v4.s_addr;
			v4->laddr = la->sin_addr.v4.s_addr;
			v4->fport = fa->sin_port;
			v4->lport = la->sin_port;

			vtw->reuse_port = 1;
			vtw->reuse_addr = 1;
			vtw->v6only = 0;
			vtw->uid = 0;

			vtw_inshash_v4(ctl, vtw);
			break;
		}

		case AF_INET6: {
			vtw_v6_t *v6 = (void*)vtw;

			v6->faddr = fa->sin_addr.v6;
			v6->laddr = la->sin_addr.v6;

			v6->fport = fa->sin_port;
			v6->lport = la->sin_port;

			vtw->reuse_port = 1;
			vtw->reuse_addr = 1;
			vtw->v6only = 0;
			vtw->uid = 0;

			vtw_inshash_v6(ctl, vtw);
			break;
		}

		default:
			break;
		}

		return 1;
	}

	return 0;
}

static int vtw_syscall = 0;

static int
vtw_debug_process(vtw_sysargs_t *ap)
{
	struct vestigial_inpcb vestige;
	int rc = 0;

	mutex_enter(softnet_lock);

	switch (ap->op) {
	case 0:		// insert
		vtw_debug_add(ap->la.sin_family
		    , &ap->la
		    , &ap->fa
		    , TCPTV_MSL
		    , 0);
		break;

	case 1:		// lookup
	case 2:		// restart
		switch (ap->la.sin_family) {
		case AF_INET:
			if (tcp_lookup_v4(ap->fa.sin_addr.v4, ap->fa.sin_port,
			    ap->la.sin_addr.v4, ap->la.sin_port,
			    &vestige)) {
				if (ap->op == 2) {
					vtw_restart(&vestige);
				}
				rc = 0;
			} else
				rc = ESRCH;
			break;

		case AF_INET6:
			if (tcp_lookup_v6(&ap->fa.sin_addr.v6, ap->fa.sin_port,
			    &ap->la.sin_addr.v6, ap->la.sin_port,
			    &vestige)) {
				if (ap->op == 2) {
					vtw_restart(&vestige);
				}
				rc = 0;
			} else
				rc = ESRCH;
			break;
		default:
			rc = EINVAL;
		}
		break;

	default:
		rc = EINVAL;
	}

	mutex_exit(softnet_lock);
	return rc;
}

struct sys_vtw_args {
	syscallarg(const vtw_sysargs_t *) req;
	syscallarg(size_t) len;
};

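/*!\brief debug syscall entry: copy in a vtw_sysargs_t and process it.
 */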
static int
vtw_sys(struct lwp *l, const void *_, register_t *retval)
{
	const struct sys_vtw_args *uap = _;
	void *buf;
	int rc;
	size_t len = SCARG(uap, len);

	if (len != sizeof (vtw_sysargs_t))
		return EINVAL;

	buf = kmem_alloc(len, KM_SLEEP);
	rc = copyin(SCARG(uap, req), buf, len);
	if (!rc) {
		rc = vtw_debug_process(buf);
	}
	kmem_free(buf, len);

	return rc;
}

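/*!\brief sanity check: in a quiescent arena (nalloc == 0), walking the
 * ring with vtw_next() must visit exactly nfree entries.
 */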
static void
vtw_sanity_check(void)
{
	vtw_ctl_t *ctl;
	vtw_t *vtw;
	int i;
	int n;

	for (i = 0; i < VTW_NCLASS; ++i) {
		ctl = &vtw_tcpv4[i];

		if (!ctl->base.v || ctl->nalloc)
			continue;

		for (n = 0, vtw = ctl->base.v; ; ) {
			++n;
			vtw = vtw_next(ctl, vtw);
			if (vtw == ctl->base.v)
				break;
		}
		db_trace(KTR_VTW
		    , (ctl, "sanity: class %x n %x nfree %x"
		    , i, n, ctl->nfree));

		KASSERT(n == ctl->nfree);
	}

	for (i = 0; i < VTW_NCLASS; ++i) {
		ctl = &vtw_tcpv6[i];

		if (!ctl->base.v || ctl->nalloc)
			continue;

		for (n = 0, vtw = ctl->base.v; ; ) {
			++n;
			vtw = vtw_next(ctl, vtw);
			if (vtw == ctl->base.v)
				break;
		}
		db_trace(KTR_VTW
		    , (ctl, "sanity: class %x n %x nfree %x"
		    , i, n, ctl->nfree));
		KASSERT(n == ctl->nfree);
	}
}

/*!\brief Initialise debug support.
 */
static void
vtw_debug_init(void)
{
	int i;

	vtw_sanity_check();

	if (vtw_syscall)
		return;

	for (i = 511; i; --i) {
		if (sysent[i].sy_call == sys_nosys) {
			sysent[i].sy_call = vtw_sys;
			sysent[i].sy_narg = 2;
			sysent[i].sy_argsize = sizeof (struct sys_vtw_args);
			sysent[i].sy_flags = 0;

			vtw_syscall = i;
			break;
		}
	}
	if (i) {
		const struct sysctlnode *node;
		uint32_t flags;

		flags = sysctl_root.sysctl_flags;

		sysctl_root.sysctl_flags |= CTLFLAG_READWRITE;
		sysctl_root.sysctl_flags &= ~CTLFLAG_PERMANENT;

		sysctl_createv(0, 0, 0, &node,
		    CTLFLAG_PERMANENT, CTLTYPE_NODE,
		    "koff",
		    SYSCTL_DESCR("Kernel Obscure Feature Finder"),
		    0, 0, 0, 0, CTL_CREATE, CTL_EOL);

		if (!node) {
			sysctl_createv(0, 0, 0, &node,
			    CTLFLAG_PERMANENT, CTLTYPE_NODE,
			    "koffka",
			    SYSCTL_DESCR("The Real(tm) Kernel"
			    " Obscure Feature Finder"),
			    0, 0, 0, 0, CTL_CREATE, CTL_EOL);
		}
		if (node) {
			sysctl_createv(0, 0, 0, 0,
			    CTLFLAG_PERMANENT|CTLFLAG_READONLY,
			    CTLTYPE_INT, "vtw_debug_syscall",
			    SYSCTL_DESCR("vtw debug"
			    " system call number"),
			    0, 0, &vtw_syscall, 0, node->sysctl_num,
			    CTL_CREATE, CTL_EOL);
		}
		sysctl_root.sysctl_flags = flags;
	}
}
#else /* !VTW_DEBUG */
static void
vtw_debug_init(void)
{
	return;
}
#endif /* !VTW_DEBUG */