1 /*
2 * Copyright (c) 2011 The NetBSD Foundation, Inc.
3 * All rights reserved.
4 *
5 * This code is derived from software contributed to The NetBSD Foundation
6 * by Coyote Point Systems, Inc.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in the
15 * documentation and/or other materials provided with the distribution.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
18 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
19 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
20 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
21 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
22 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
23 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
24 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
25 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
26 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
27 * POSSIBILITY OF SUCH DAMAGE.
28 */
29 #include <sys/cdefs.h>
30
31 #include "opt_ddb.h"
32 #include "opt_inet.h"
33 #include "opt_ipsec.h"
34 #include "opt_inet_csum.h"
35 #include "opt_tcp_debug.h"
36
37 #include <sys/param.h>
38 #include <sys/systm.h>
39 #include <sys/malloc.h>
40 #include <sys/kmem.h>
41 #include <sys/mbuf.h>
42 #include <sys/protosw.h>
43 #include <sys/socket.h>
44 #include <sys/socketvar.h>
45 #include <sys/errno.h>
46 #include <sys/syslog.h>
47 #include <sys/pool.h>
48 #include <sys/domain.h>
49 #include <sys/kernel.h>
50 #include <net/if.h>
51 #include <net/route.h>
52 #include <net/if_types.h>
53
54 #include <netinet/in.h>
55 #include <netinet/in_systm.h>
56 #include <netinet/ip.h>
57 #include <netinet/in_pcb.h>
58 #include <netinet/in_var.h>
59 #include <netinet/ip_var.h>
60 #include <netinet/in_offload.h>
61 #include <netinet/ip6.h>
62 #include <netinet6/ip6_var.h>
63 #include <netinet6/in6_pcb.h>
65 #include <netinet6/in6_var.h>
66 #include <netinet/icmp6.h>
67 #include <netinet6/nd6.h>
68
69 #include <netinet/tcp.h>
70 #include <netinet/tcp_fsm.h>
71 #include <netinet/tcp_seq.h>
72 #include <netinet/tcp_timer.h>
73 #include <netinet/tcp_var.h>
74 #include <netinet/tcp_private.h>
75 #include <netinet/tcpip.h>
76
77 #include <machine/stdarg.h>
78 #include <netinet/tcp_vtw.h>
79
80 __KERNEL_RCSID(0, "$NetBSD: tcp_vtw.c,v 1.6.2.2 2011/06/06 09:09:57 jruoho Exp $");
81
82 #define db_trace(__a, __b) do { } while (/*CONSTCOND*/0)
83
84 static void vtw_debug_init(void);
85
86 fatp_ctl_t fat_tcpv4;
87 fatp_ctl_t fat_tcpv6;
88 vtw_ctl_t vtw_tcpv4[VTW_NCLASS];
89 vtw_ctl_t vtw_tcpv6[VTW_NCLASS];
90 vtw_stats_t vtw_stats;
91
92 /* We provide state for the lookup_ports iterator.
93  * As we are currently netlock-protected, one instance suffices.
94  * If locking were finer-grained, we would want one per CPU.
95  * I do not want to be in the business of alloc/free.
96  * The best alternative would be to allocate on the caller's
97  * stack, but that would require them to know the struct,
98  * or at least its size.
99  * See how she goes.
100  */
101 struct tcp_ports_iterator {
102 union {
103 struct in_addr v4;
104 struct in6_addr v6;
105 } addr;
106 u_int port;
107
108 uint32_t wild : 1;
109
110 vtw_ctl_t *ctl;
111 fatp_t *fp;
112
113 uint16_t slot_idx;
114 uint16_t ctl_idx;
115 };
116
117 static struct tcp_ports_iterator tcp_ports_iterator_v4;
118 static struct tcp_ports_iterator tcp_ports_iterator_v6;
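
/* A minimal usage sketch (illustrative only, not called from here): the
 * vestige hooks defined below drive this iterator roughly as follows,
 * with the whole walk assumed to run under softnet_lock:
 *
 *	struct vestigial_inpcb res;
 *	void *it = tcp_init_ports_v4(laddr, lport, wild);
 *
 *	while (tcp_next_port_v4(it, &res)) {
 *		// res describes one vestigial (TIME_WAIT) pcb
 *	}
 *
 * tcp_next_port_v4() clears it->ctl on exhaustion, so the walk both
 * terminates and leaves the static iterator reusable.
 */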
119
120 static int vtw_age(vtw_ctl_t *, struct timeval *);
121
122 /*!\brief allocate a fat pointer from a collection.
123 */
124 static fatp_t *
125 fatp_alloc(fatp_ctl_t *fat)
126 {
127 fatp_t *fp = 0;
128
129 if (fat->nfree) {
130 fp = fat->free;
131 if (fp) {
132 fat->free = fatp_next(fat, fp);
133 --fat->nfree;
134 ++fat->nalloc;
135 fp->nxt = 0;
136
137 KASSERT(!fp->inuse);
138 }
139 }
140
141 return fp;
142 }
143
144 /*!\brief free a fat pointer.
145 */
146 static void
147 fatp_free(fatp_ctl_t *fat, fatp_t *fp)
148 {
149 if (fp) {
150 KASSERT(!fp->inuse);
151 KASSERT(!fp->nxt);
152
153 fp->nxt = fatp_index(fat, fat->free);
154 fat->free = fp;
155
156 ++fat->nfree;
157 --fat->nalloc;
158 }
159 }
160
161 /*!\brief initialise a collection of fat pointers.
162 *
163  *\param n total # fat pointers to allocate
164  *\param m # hash buckets
165 *
166 * We allocate 2x as much, as we have two hashes: full and lport only.
167 */
168 static void
169 fatp_init(fatp_ctl_t *fat, uint32_t n, uint32_t m,
170 fatp_t *fat_base, fatp_t **fat_hash)
171 {
172 fatp_t *fp;
173
174 KASSERT(n <= FATP_MAX / 2);
175
176 fat->hash = fat_hash;
177 fat->base = fat_base;
178
179 fat->port = &fat->hash[m];
180
181 fat->mask = m - 1; // ASSERT: m must be a power of 2
182 fat->lim = fat->base + 2*n - 1;
183 fat->nfree = 0;
184 fat->nalloc = 2*n;
185
186 /* Initialise the free list.
187 */
188 for (fp = fat->lim; fp >= fat->base; --fp) {
189 fatp_free(fat, fp);
190 }
191 }
192
193 /*
194 * The `xtra' is XORed into the tag stored.
195 */
196 static uint32_t fatp_xtra[] = {
197 0x11111111,0x22222222,0x33333333,0x44444444,
198 0x55555555,0x66666666,0x77777777,0x88888888,
199 0x12121212,0x21212121,0x34343434,0x43434343,
200 0x56565656,0x65656565,0x78787878,0x87878787,
201 0x11221122,0x22112211,0x33443344,0x44334433,
202 0x55665566,0x66556655,0x77887788,0x88778877,
203 0x11112222,0x22221111,0x33334444,0x44443333,
204 0x55556666,0x66665555,0x77778888,0x88887777,
205 };
206
207 /*!\brief turn a {fatp_t*,slot} into an integral key.
208 *
209 * The key can be used to obtain the fatp_t, and the slot,
210 * as it directly encodes them.
211 */
212 static inline uint32_t
213 fatp_key(fatp_ctl_t *fat, fatp_t *fp, uint32_t slot)
214 {
215 CTASSERT(CACHE_LINE_SIZE == 32 ||
216 CACHE_LINE_SIZE == 64 ||
217 CACHE_LINE_SIZE == 128);
218
219 switch (fatp_ntags()) {
220 case 7:
221 return (fatp_index(fat, fp) << 3) | slot;
222 case 15:
223 return (fatp_index(fat, fp) << 4) | slot;
224 case 31:
225 return (fatp_index(fat, fp) << 5) | slot;
226 default:
227 KASSERT(0 && "no support, for no good reason");
228 return ~0;
229 }
230 }
231
232 static inline uint32_t
233 fatp_slot_from_key(fatp_ctl_t *fat, uint32_t key)
234 {
235 CTASSERT(CACHE_LINE_SIZE == 32 ||
236 CACHE_LINE_SIZE == 64 ||
237 CACHE_LINE_SIZE == 128);
238
239 switch (fatp_ntags()) {
240 case 7:
241 return key & 7;
242 case 15:
243 return key & 15;
244 case 31:
245 return key & 31;
246 default:
247 KASSERT(0 && "no support, for no good reason");
248 return ~0;
249 }
250 }
251
252 static inline fatp_t *
253 fatp_from_key(fatp_ctl_t *fat, uint32_t key)
254 {
255 CTASSERT(CACHE_LINE_SIZE == 32 ||
256 CACHE_LINE_SIZE == 64 ||
257 CACHE_LINE_SIZE == 128);
258
259 switch (fatp_ntags()) {
260 case 7:
261 key >>= 3;
262 break;
263 case 15:
264 key >>= 4;
265 break;
266 case 31:
267 key >>= 5;
268 break;
269 default:
270 KASSERT(0 && "no support, for no good reason");
271 return 0;
272 }
273
274 return key ? fat->base + key - 1 : 0;
275 }
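
/* A worked example of the key round-trip, assuming 64-byte cache lines
 * (fatp_ntags() == 15, hence 4 slot bits): for the third fat pointer
 * (fatp_index() == 3) and slot 5,
 *
 *	key  = (3 << 4) | 5 = 0x35
 *	slot = 0x35 & 15    = 5
 *	fp   = fat->base + (0x35 >> 4) - 1 = fat->base + 2
 *
 * The index is biased by one so that a key of 0 can mean "no entry".
 */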
276
277 static inline uint32_t
278 idx_encode(vtw_ctl_t *ctl, uint32_t idx)
279 {
280 return (idx << ctl->idx_bits) | idx;
281 }
282
283 static inline uint32_t
284 idx_decode(vtw_ctl_t *ctl, uint32_t bits)
285 {
286 uint32_t idx = bits & ctl->idx_mask;
287
288 if (idx_encode(ctl, idx) == bits)
289 return idx;
290 else
291 return ~0;
292 }
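
/* An example of the self-verifying encoding: with tcp_vtw_entries ==
 * 8192, vtw_init() below computes idx_bits == 13 and idx_mask == 0x1fff,
 * so
 *
 *	idx_encode(ctl, 0x0a5) == (0x0a5 << 13) | 0x0a5 == 0x14a0a5
 *
 * idx_decode() re-encodes the low 13 bits and compares; any value whose
 * two copies of the index disagree (e.g. the garbage produced by XORing
 * mismatched tags during lookup) is rejected as ~0.
 */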
293
294 /*!\brief insert index into fatp hash
295 *
296 *\param idx - index of element being placed in hash chain
297 *\param tag - 32-bit tag identifier
298 *
299 *\returns
300 * value which can be used to locate entry.
301 *
302 *\note
303 * we rely on the fact that there are unused high bits in the index
304 * for verification purposes on lookup.
305 */
306
307 static inline uint32_t
308 fatp_vtw_inshash(fatp_ctl_t *fat, uint32_t idx, uint32_t tag, int which,
309 void *dbg)
310 {
311 fatp_t *fp;
312 fatp_t **hash = (which ? fat->port : fat->hash);
313 int i;
314
315 fp = hash[tag & fat->mask];
316
317 while (!fp || fatp_full(fp)) {
318 fatp_t *fq;
319
320 /* All entries are inuse at the top level.
321 * We allocate a spare, and push the top level
322 * down one. All entries in the fp we push down
323 * (think of a tapeworm here) will be expelled sooner than
324 * any entries added subsequently to this hash bucket.
325 * This is a property of the time waits we are exploiting.
326 */
327
328 fq = fatp_alloc(fat);
329 if (!fq) {
330 vtw_age(fat->vtw, 0);
331 fp = hash[tag & fat->mask];
332 continue;
333 }
334
335 fq->inuse = 0;
336 fq->nxt = fatp_index(fat, fp);
337
338 hash[tag & fat->mask] = fq;
339
340 fp = fq;
341 }
342
343 KASSERT(!fatp_full(fp));
344
345 /* Fill highest index first. Lookup is lowest first.
346 */
347 for (i = fatp_ntags(); --i >= 0; ) {
348 if (!((1 << i) & fp->inuse)) {
349 break;
350 }
351 }
352
353 fp->inuse |= 1 << i;
354 fp->tag[i] = tag ^ idx_encode(fat->vtw, idx) ^ fatp_xtra[i];
355
356 db_trace(KTR_VTW
357 , (fp, "fat: inuse %5.5x tag[%x] %8.8x"
358 , fp->inuse
359 , i, fp->tag[i]));
360
361 return fatp_key(fat, fp, i);
362 }
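
/* The stored tag is thus tag ^ idx_encode(idx) ^ fatp_xtra[slot].  On
 * lookup the same three-way XOR is applied in reverse:
 *
 *	idx = fp->tag[i] ^ tag ^ fatp_xtra[i];
 *
 * For the right connection this recovers idx_encode(idx); for a
 * different connection hashing into the same bucket it yields a value
 * that almost always fails idx_decode(), so most false candidates are
 * rejected without ever dereferencing a vtw_t (the "hopefully fast
 * path" in the lookup routines below).
 */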
363
364 static inline int
365 vtw_alive(const vtw_t *vtw)
366 {
367 return vtw->hashed && vtw->expire.tv_sec;
368 }
369
370 static inline uint32_t
371 vtw_index_v4(vtw_ctl_t *ctl, vtw_v4_t *v4)
372 {
373 if (ctl->base.v4 <= v4 && v4 <= ctl->lim.v4)
374 return v4 - ctl->base.v4;
375
376 KASSERT(0 && "vtw out of bounds");
377
378 return ~0;
379 }
380
381 static inline uint32_t
382 vtw_index_v6(vtw_ctl_t *ctl, vtw_v6_t *v6)
383 {
384 if (ctl->base.v6 <= v6 && v6 <= ctl->lim.v6)
385 return v6 - ctl->base.v6;
386
387 KASSERT(0 && "vtw out of bounds");
388
389 return ~0;
390 }
391
392 static inline uint32_t
393 vtw_index(vtw_ctl_t *ctl, vtw_t *vtw)
394 {
395 if (ctl->clidx)
396 ctl = ctl->ctl;
397
398 if (ctl->is_v4)
399 return vtw_index_v4(ctl, (vtw_v4_t *)vtw);
400
401 if (ctl->is_v6)
402 return vtw_index_v6(ctl, (vtw_v6_t *)vtw);
403
404 KASSERT(0 && "neither 4 nor 6. most curious.");
405
406 return ~0;
407 }
408
409 static inline vtw_t *
410 vtw_from_index(vtw_ctl_t *ctl, uint32_t idx)
411 {
412 if (ctl->clidx)
413 ctl = ctl->ctl;
414
415 /* Check whether the index plausibly is an index.
416  * Bits set outside of the valid index bits are a giveaway.
417  */
418 idx = idx_decode(ctl, idx);
419
420 if (idx == ~0) {
421 return 0;
422 } else if (ctl->is_v4) {
423 vtw_v4_t *vtw = ctl->base.v4 + idx;
424
425 return (ctl->base.v4 <= vtw && vtw <= ctl->lim.v4)
426 ? &vtw->common : 0;
427 } else if (ctl->is_v6) {
428 vtw_v6_t *vtw = ctl->base.v6 + idx;
429
430 return (ctl->base.v6 <= vtw && vtw <= ctl->lim.v6)
431 ? &vtw->common : 0;
432 } else {
433 KASSERT(0 && "badness");
434 return 0;
435 }
436 }
437
438 /*!\brief return the next vtw after this one.
439 *
440 * Due to the differing sizes of the entries in differing
441 * arenas, we have to ensure we ++ the correct pointer type.
442 *
443 * Also handles wrap.
444 */
445 static inline vtw_t *
446 vtw_next(vtw_ctl_t *ctl, vtw_t *vtw)
447 {
448 if (ctl->is_v4) {
449 vtw_v4_t *v4 = (void*)vtw;
450
451 vtw = &(++v4)->common;
452 } else {
453 vtw_v6_t *v6 = (void*)vtw;
454
455 vtw = &(++v6)->common;
456 }
457
458 if (vtw > ctl->lim.v)
459 vtw = ctl->base.v;
460
461 return vtw;
462 }
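
/* The arena therefore behaves as a circular buffer: ctl->alloc points
 * at the next entry to hand out and ctl->oldest at the next entry to
 * expire.  Because every entry in a class lives for the same 2*MSL,
 * allocation order is expiration order, and the two pointers simply
 * chase each other around the ring:
 *
 *	base .. [oldest] live entries .. [alloc] free entries .. lim
 *
 * with wrap-around at lim, handled above.
 */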
463
464 /*!\brief remove entry from FATP hash chains
465 */
466 static inline void
467 vtw_unhash(vtw_ctl_t *ctl, vtw_t *vtw)
468 {
469 fatp_ctl_t *fat = ctl->fat;
470 fatp_t *fp;
471 uint32_t key = vtw->key;
472 uint32_t tag, slot, idx;
473 vtw_v4_t *v4 = (void*)vtw;
474 vtw_v6_t *v6 = (void*)vtw;
475
476 if (!vtw->hashed) {
477 KASSERT(0 && "unhashed");
478 return;
479 }
480
481 if (fat->vtw->is_v4) {
482 tag = v4_tag(v4->faddr, v4->fport, v4->laddr, v4->lport);
483 } else if (fat->vtw->is_v6) {
484 tag = v6_tag(&v6->faddr, v6->fport, &v6->laddr, v6->lport);
485 } else {
486 tag = 0;
487 KASSERT(0 && "not reached");
488 }
489
490 /* Remove from fat->hash[]
491 */
492 slot = fatp_slot_from_key(fat, key);
493 fp = fatp_from_key(fat, key);
494 idx = vtw_index(ctl, vtw);
495
496 db_trace(KTR_VTW
497 , (fp, "fat: del inuse %5.5x slot %x idx %x key %x tag %x"
498 , fp->inuse, slot, idx, key, tag));
499
500 KASSERT(fp->inuse & (1 << slot));
501 KASSERT(fp->tag[slot] == (tag ^ idx_encode(ctl, idx)
502 ^ fatp_xtra[slot]));
503
504 if ((fp->inuse & (1 << slot))
505 && fp->tag[slot] == (tag ^ idx_encode(ctl, idx)
506 ^ fatp_xtra[slot])) {
507 fp->inuse ^= 1 << slot;
508 fp->tag[slot] = 0;
509
510 /* When we delete entries, we do not compact. This is
511 * due to temporality. We add entries, and they
512 * (eventually) expire. Older entries will be further
513 * down the chain.
514 */
515 if (!fp->inuse) {
516 uint32_t hi = tag & fat->mask;
517 fatp_t *fq = 0;
518 fatp_t *fr = fat->hash[hi];
519
520 while (fr && fr != fp) {
521 fr = fatp_next(fat, fq = fr);
522 }
523
524 if (fr == fp) {
525 if (fq) {
526 fq->nxt = fp->nxt;
527 fp->nxt = 0;
528 fatp_free(fat, fp);
529 } else {
530 KASSERT(fat->hash[hi] == fp);
531
532 if (fp->nxt) {
533 fat->hash[hi]
534 = fatp_next(fat, fp);
535 fp->nxt = 0;
536 fatp_free(fat, fp);
537 } else {
538 /* retain for next use.
539 */
540 ;
541 }
542 }
543 } else {
544 fr = fat->hash[hi];
545
546 do {
547 db_trace(KTR_VTW
548 , (fr
549 , "fat:*del inuse %5.5x"
550 " nxt %x"
551 , fr->inuse, fr->nxt));
552
553 fr = fatp_next(fat, fq = fr);
554 } while (fr && fr != fp);
555
556 KASSERT(0 && "oops");
557 }
558 }
559 vtw->key ^= ~0;
560 }
561
562 if (fat->vtw->is_v4) {
563 tag = v4_port_tag(v4->lport);
564 } else if (fat->vtw->is_v6) {
565 tag = v6_port_tag(v6->lport);
566 }
567
568 /* Remove from fat->port[]
569 */
570 key = vtw->port_key;
571 slot = fatp_slot_from_key(fat, key);
572 fp = fatp_from_key(fat, key);
573 idx = vtw_index(ctl, vtw);
574
575 db_trace(KTR_VTW
576 , (fp, "fatport: del inuse %5.5x"
577 " slot %x idx %x key %x tag %x"
578 , fp->inuse, slot, idx, key, tag));
579
580 KASSERT(fp->inuse & (1 << slot));
581 KASSERT(fp->tag[slot] == (tag ^ idx_encode(ctl, idx)
582 ^ fatp_xtra[slot]));
583
584 if ((fp->inuse & (1 << slot))
585 && fp->tag[slot] == (tag ^ idx_encode(ctl, idx)
586 ^ fatp_xtra[slot])) {
587 fp->inuse ^= 1 << slot;
588 fp->tag[slot] = 0;
589
590 if (!fp->inuse) {
591 uint32_t hi = tag & fat->mask;
592 fatp_t *fq = 0;
593 fatp_t *fr = fat->port[hi];
594
595 while (fr && fr != fp) {
596 fr = fatp_next(fat, fq = fr);
597 }
598
599 if (fr == fp) {
600 if (fq) {
601 fq->nxt = fp->nxt;
602 fp->nxt = 0;
603 fatp_free(fat, fp);
604 } else {
605 KASSERT(fat->port[hi] == fp);
606
607 if (fp->nxt) {
608 fat->port[hi]
609 = fatp_next(fat, fp);
610 fp->nxt = 0;
611 fatp_free(fat, fp);
612 } else {
613 /* retain for next use.
614 */
615 ;
616 }
617 }
618 }
619 }
620 vtw->port_key ^= ~0;
621 }
622
623 vtw->hashed = 0;
624 }
625
626 /*!\brief remove entry from hash, possibly free.
627 */
628 void
629 vtw_del(vtw_ctl_t *ctl, vtw_t *vtw)
630 {
631 KASSERT(mutex_owned(softnet_lock));
632
633 if (vtw->hashed) {
634 ++vtw_stats.del;
635 vtw_unhash(ctl, vtw);
636 }
637
638 /* We only delete the oldest entry.
639 */
640 if (vtw != ctl->oldest.v)
641 return;
642
643 --ctl->nalloc;
644 ++ctl->nfree;
645
646 vtw->expire.tv_sec = 0;
647 vtw->expire.tv_usec = ~0;
648
649 if (!ctl->nalloc)
650 ctl->oldest.v = 0;
651 else
652 ctl->oldest.v = vtw_next(ctl, vtw);
653 }
654
655 /*!\brief insert vestigial timewait in hash chain
656 */
657 static void
658 vtw_inshash_v4(vtw_ctl_t *ctl, vtw_t *vtw)
659 {
660 uint32_t idx = vtw_index(ctl, vtw);
661 uint32_t tag;
662 vtw_v4_t *v4 = (void*)vtw;
663
664 KASSERT(mutex_owned(softnet_lock));
665 KASSERT(!vtw->hashed);
666 KASSERT(ctl->clidx == vtw->msl_class);
667
668 ++vtw_stats.ins;
669
670 tag = v4_tag(v4->faddr, v4->fport,
671 v4->laddr, v4->lport);
672
673 vtw->key = fatp_vtw_inshash(ctl->fat, idx, tag, 0, vtw);
674
675 db_trace(KTR_VTW, (ctl
676 , "vtw: ins %8.8x:%4.4x %8.8x:%4.4x"
677 " tag %8.8x key %8.8x"
678 , v4->faddr, v4->fport
679 , v4->laddr, v4->lport
680 , tag
681 , vtw->key));
682
683 tag = v4_port_tag(v4->lport);
684 vtw->port_key = fatp_vtw_inshash(ctl->fat, idx, tag, 1, vtw);
685
686 db_trace(KTR_VTW, (ctl, "vtw: ins %P - %4.4x tag %8.8x key %8.8x"
687 , v4->lport, v4->lport
688 , tag
689 , vtw->key));
690
691 vtw->hashed = 1;
692 }
693
694 /*!\brief insert vestigial timewait in hash chain
695 */
696 static void
697 vtw_inshash_v6(vtw_ctl_t *ctl, vtw_t *vtw)
698 {
699 uint32_t idx = vtw_index(ctl, vtw);
700 uint32_t tag;
701 vtw_v6_t *v6 = (void*)vtw;
702
703 KASSERT(mutex_owned(softnet_lock));
704 KASSERT(!vtw->hashed);
705 KASSERT(ctl->clidx == vtw->msl_class);
706
707 ++vtw_stats.ins;
708
709 tag = v6_tag(&v6->faddr, v6->fport,
710 &v6->laddr, v6->lport);
711
712 vtw->key = fatp_vtw_inshash(ctl->fat, idx, tag, 0, vtw);
713
714 tag = v6_port_tag(v6->lport);
715 vtw->port_key = fatp_vtw_inshash(ctl->fat, idx, tag, 1, vtw);
716
717 db_trace(KTR_VTW, (ctl, "vtw: ins %P - %4.4x tag %8.8x key %8.8x"
718 , v6->lport, v6->lport
719 , tag
720 , vtw->key));
721
722 vtw->hashed = 1;
723 }
724
725 static vtw_t *
726 vtw_lookup_hash_v4(vtw_ctl_t *ctl, uint32_t faddr, uint16_t fport
727 , uint32_t laddr, uint16_t lport
728 , int which)
729 {
730 vtw_v4_t *v4;
731 vtw_t *vtw;
732 uint32_t tag;
733 fatp_t *fp;
734 int i;
735 uint32_t fatps = 0, probes = 0, losings = 0;
736
737 if (!ctl || !ctl->fat)
738 return 0;
739
740 ++vtw_stats.look[which];
741
742 if (which) {
743 tag = v4_port_tag(lport);
744 fp = ctl->fat->port[tag & ctl->fat->mask];
745 } else {
746 tag = v4_tag(faddr, fport, laddr, lport);
747 fp = ctl->fat->hash[tag & ctl->fat->mask];
748 }
749
750 while (fp && fp->inuse) {
751 uint32_t inuse = fp->inuse;
752
753 ++fatps;
754
755 for (i = 0; inuse && i < fatp_ntags(); ++i) {
756 uint32_t idx;
757
758 if (!(inuse & (1 << i)))
759 continue;
760
761 inuse ^= 1 << i;
762
763 ++probes;
764 ++vtw_stats.probe[which];
765
766 idx = fp->tag[i] ^ tag ^ fatp_xtra[i];
767 vtw = vtw_from_index(ctl, idx);
768
769 if (!vtw) {
770 /* Hopefully fast path.
771 */
772 db_trace(KTR_VTW
773 , (fp, "vtw: fast %A:%P %A:%P"
774 " idx %x tag %x"
775 , faddr, fport
776 , laddr, lport
777 , idx, tag));
778 continue;
779 }
780
781 v4 = (void*)vtw;
782
783 /* The de-referencing of vtw is what we want to avoid.
784 * Losing.
785 */
786 if (vtw_alive(vtw)
787 && ((which ? vtw->port_key : vtw->key)
788 == fatp_key(ctl->fat, fp, i))
789 && (which
790 || (v4->faddr == faddr && v4->laddr == laddr
791 && v4->fport == fport))
792 && v4->lport == lport) {
793 ++vtw_stats.hit[which];
794
795 db_trace(KTR_VTW
796 , (fp, "vtw: hit %8.8x:%4.4x"
797 " %8.8x:%4.4x idx %x key %x"
798 , faddr, fport
799 , laddr, lport
800 , idx_decode(ctl, idx), vtw->key));
801
802 KASSERT(vtw->hashed);
803
804 goto out;
805 }
806 ++vtw_stats.losing[which];
807 ++losings;
808
809 if (vtw_alive(vtw)) {
810 db_trace(KTR_VTW
811 , (fp, "vtw:!mis %8.8x:%4.4x"
812 " %8.8x:%4.4x key %x tag %x"
813 , faddr, fport
814 , laddr, lport
815 , fatp_key(ctl->fat, fp, i)
816 , v4_tag(faddr, fport
817 , laddr, lport)));
818 db_trace(KTR_VTW
819 , (vtw, "vtw:!mis %8.8x:%4.4x"
820 " %8.8x:%4.4x key %x tag %x"
821 , v4->faddr, v4->fport
822 , v4->laddr, v4->lport
823 , vtw->key
824 , v4_tag(v4->faddr, v4->fport
825 , v4->laddr, v4->lport)));
826
827 if (vtw->key == fatp_key(ctl->fat, fp, i)) {
828 db_trace(KTR_VTW
829 , (vtw, "vtw:!mis %8.8x:%4.4x"
830 " %8.8x:%4.4x key %x"
831 " which %x"
832 , v4->faddr, v4->fport
833 , v4->laddr, v4->lport
834 , vtw->key
835 , which));
836
837 } else {
838 db_trace(KTR_VTW
839 , (vtw
840 , "vtw:!mis"
841 " key %8.8x != %8.8x"
842 " idx %x i %x which %x"
843 , vtw->key
844 , fatp_key(ctl->fat, fp, i)
845 , idx_decode(ctl, idx)
846 , i
847 , which));
848 }
849 } else {
850 db_trace(KTR_VTW
851 , (fp
852 , "vtw:!mis free entry"
853 " idx %x vtw %p which %x"
854 , idx_decode(ctl, idx)
855 , vtw, which));
856 }
857 }
858
859 if (fp->nxt) {
860 fp = fatp_next(ctl->fat, fp);
861 } else {
862 break;
863 }
864 }
865 ++vtw_stats.miss[which];
866 vtw = 0;
867 out:
868 if (fatps > vtw_stats.max_chain[which])
869 vtw_stats.max_chain[which] = fatps;
870 if (probes > vtw_stats.max_probe[which])
871 vtw_stats.max_probe[which] = probes;
872 if (losings > vtw_stats.max_loss[which])
873 vtw_stats.max_loss[which] = losings;
874
875 return vtw;
876 }
877
878 static vtw_t *
879 vtw_lookup_hash_v6(vtw_ctl_t *ctl, const struct in6_addr *faddr, uint16_t fport
880 , const struct in6_addr *laddr, uint16_t lport
881 , int which)
882 {
883 vtw_v6_t *v6;
884 vtw_t *vtw;
885 uint32_t tag;
886 fatp_t *fp;
887 int i;
888 uint32_t fatps = 0, probes = 0, losings = 0;
889
890 ++vtw_stats.look[which];
891
892 if (!ctl || !ctl->fat)
893 return 0;
894
895 if (which) {
896 tag = v6_port_tag(lport);
897 fp = ctl->fat->port[tag & ctl->fat->mask];
898 } else {
899 tag = v6_tag(faddr, fport, laddr, lport);
900 fp = ctl->fat->hash[tag & ctl->fat->mask];
901 }
902
903 while (fp && fp->inuse) {
904 uint32_t inuse = fp->inuse;
905
906 ++fatps;
907
908 for (i = 0; inuse && i < fatp_ntags(); ++i) {
909 uint32_t idx;
910
911 if (!(inuse & (1 << i)))
912 continue;
913
914 inuse ^= 1 << i;
915
916 ++probes;
917 ++vtw_stats.probe[which];
918
919 idx = fp->tag[i] ^ tag ^ fatp_xtra[i];
920 vtw = vtw_from_index(ctl, idx);
921
922 db_trace(KTR_VTW
923 , (fp, "probe: %2d %6A:%4.4x %6A:%4.4x idx %x"
924 , i
925 , db_store(faddr, sizeof (*faddr)), fport
926 , db_store(laddr, sizeof (*laddr)), lport
927 , idx_decode(ctl, idx)));
928
929 if (!vtw) {
930 /* Hopefully fast path.
931 */
932 continue;
933 }
934
935 v6 = (void*)vtw;
936
937 if (vtw_alive(vtw)
938 && ((which ? vtw->port_key : vtw->key)
939 == fatp_key(ctl->fat, fp, i))
940 && v6->lport == lport
941 && (which
942 || (v6->fport == fport
943 && !bcmp(&v6->faddr, faddr, sizeof (*faddr))
944 && !bcmp(&v6->laddr, laddr
945 , sizeof (*laddr))))) {
946 ++vtw_stats.hit[which];
947
948 KASSERT(vtw->hashed);
949 goto out;
950 } else {
951 ++vtw_stats.losing[which];
952 ++losings;
953 }
954 }
955
956 if (fp->nxt) {
957 fp = fatp_next(ctl->fat, fp);
958 } else {
959 break;
960 }
961 }
962 ++vtw_stats.miss[which];
963 vtw = 0;
964 out:
965 if (fatps > vtw_stats.max_chain[which])
966 vtw_stats.max_chain[which] = fatps;
967 if (probes > vtw_stats.max_probe[which])
968 vtw_stats.max_probe[which] = probes;
969 if (losings > vtw_stats.max_loss[which])
970 vtw_stats.max_loss[which] = losings;
971
972 return vtw;
973 }
974
975 /*!\brief port iterator
976 */
977 static vtw_t *
978 vtw_next_port_v4(struct tcp_ports_iterator *it)
979 {
980 vtw_ctl_t *ctl = it->ctl;
981 vtw_v4_t *v4;
982 vtw_t *vtw;
983 uint32_t tag;
984 uint16_t lport = it->port;
985 fatp_t *fp;
986 int i;
987 uint32_t fatps = 0, probes = 0, losings = 0;
988
989 tag = v4_port_tag(lport);
990 if (!it->fp) {
991 it->fp = ctl->fat->port[tag & ctl->fat->mask];
992 it->slot_idx = 0;
993 }
994 fp = it->fp;
995
996 while (fp) {
997 uint32_t inuse = fp->inuse;
998
999 ++fatps;
1000
1001 for (i = it->slot_idx; inuse && i < fatp_ntags(); ++i) {
1002 uint32_t idx;
1003
1004 if (!(inuse & (1 << i)))
1005 continue;
1006
1007 inuse &= ~0 << i;
1008
1009 if (i < it->slot_idx)
1010 continue;
1011
1012 ++vtw_stats.probe[1];
1013 ++probes;
1014
1015 idx = fp->tag[i] ^ tag ^ fatp_xtra[i];
1016 vtw = vtw_from_index(ctl, idx);
1017
1018 if (!vtw) {
1019 /* Hopefully fast path.
1020 */
1021 continue;
1022 }
1023
1024 v4 = (void*)vtw;
1025
1026 if (vtw_alive(vtw)
1027 && vtw->port_key == fatp_key(ctl->fat, fp, i)
1028 && v4->lport == lport) {
1029 ++vtw_stats.hit[1];
1030
1031 it->slot_idx = i + 1;
1032
1033 goto out;
1034 } else if (vtw_alive(vtw)) {
1035 ++vtw_stats.losing[1];
1036 ++losings;
1037
1038 db_trace(KTR_VTW
1039 , (vtw, "vtw:!mis"
1040 " port %8.8x:%4.4x %8.8x:%4.4x"
1041 " key %x port %x"
1042 , v4->faddr, v4->fport
1043 , v4->laddr, v4->lport
1044 , vtw->key
1045 , lport));
1046 } else {
1047 /* Really losing here. We are coming
1048  * up with references to free entries.
1049  * We might do better with the
1050  * traditional lookup, or we may need
1051  * more ad-hockery. One option
1052  * would be to pull more data into the
1053  * cache line to reject the false
1054  * hits.
1055  */
1056 ++vtw_stats.losing[1];
1057 ++losings;
1058 db_trace(KTR_VTW
1059 , (fp, "vtw:!mis port %x"
1060 " - free entry idx %x vtw %p"
1061 , lport
1062 , idx_decode(ctl, idx)
1063 , vtw));
1064 }
1065 }
1066
1067 if (fp->nxt) {
1068 it->fp = fp = fatp_next(ctl->fat, fp);
1069 it->slot_idx = 0;
1070 } else {
1071 it->fp = 0;
1072 break;
1073 }
1074 }
1075 ++vtw_stats.miss[1];
1076
1077 vtw = 0;
1078 out:
1079 if (fatps > vtw_stats.max_chain[1])
1080 vtw_stats.max_chain[1] = fatps;
1081 if (probes > vtw_stats.max_probe[1])
1082 vtw_stats.max_probe[1] = probes;
1083 if (losings > vtw_stats.max_loss[1])
1084 vtw_stats.max_loss[1] = losings;
1085
1086 return vtw;
1087 }
1088
1089 /*!\brief port iterator
1090 */
1091 static vtw_t *
1092 vtw_next_port_v6(struct tcp_ports_iterator *it)
1093 {
1094 vtw_ctl_t *ctl = it->ctl;
1095 vtw_v6_t *v6;
1096 vtw_t *vtw;
1097 uint32_t tag;
1098 uint16_t lport = it->port;
1099 fatp_t *fp;
1100 int i;
1101 uint32_t fatps = 0, probes = 0, losings = 0;
1102
1103 tag = v6_port_tag(lport);
1104 if (!it->fp) {
1105 it->fp = ctl->fat->port[tag & ctl->fat->mask];
1106 it->slot_idx = 0;
1107 }
1108 fp = it->fp;
1109
1110 while (fp) {
1111 uint32_t inuse = fp->inuse;
1112
1113 ++fatps;
1114
1115 for (i = it->slot_idx; inuse && i < fatp_ntags(); ++i) {
1116 uint32_t idx;
1117
1118 if (!(inuse & (1 << i)))
1119 continue;
1120
1121 inuse &= ~0 << i;
1122
1123 if (i < it->slot_idx)
1124 continue;
1125
1126 ++vtw_stats.probe[1];
1127 ++probes;
1128
1129 idx = fp->tag[i] ^ tag ^ fatp_xtra[i];
1130 vtw = vtw_from_index(ctl, idx);
1131
1132 if (!vtw) {
1133 /* Hopefully fast path.
1134 */
1135 continue;
1136 }
1137
1138 v6 = (void*)vtw;
1139
1140 db_trace(KTR_VTW
1141 , (vtw, "vtw: i %x idx %x fp->tag %x"
1142 " tag %x xtra %x"
1143 , i, idx_decode(ctl, idx)
1144 , fp->tag[i], tag, fatp_xtra[i]));
1145
1146 if (vtw_alive(vtw)
1147 && vtw->port_key == fatp_key(ctl->fat, fp, i)
1148 && v6->lport == lport) {
1149 ++vtw_stats.hit[1];
1150
1151 db_trace(KTR_VTW
1152 , (fp, "vtw: nxt port %P - %4.4x"
1153 " idx %x key %x"
1154 , lport, lport
1155 , idx_decode(ctl, idx), vtw->key));
1156
1157 it->slot_idx = i + 1;
1158 goto out;
1159 } else if (vtw_alive(vtw)) {
1160 ++vtw_stats.losing[1];
1161
1162 db_trace(KTR_VTW
1163 , (vtw, "vtw:!mis port %6A:%4.4x"
1164 " %6A:%4.4x key %x port %x"
1165 , db_store(&v6->faddr
1166 , sizeof (v6->faddr))
1167 , v6->fport
1168 , db_store(&v6->laddr
1169 , sizeof (v6->faddr))
1170 , v6->lport
1171 , vtw->key
1172 , lport));
1173 } else {
1174 /* Really losing here. We are coming
1175  * up with references to free entries.
1176  * We might do better with the
1177  * traditional lookup, or we may need
1178  * more ad-hockery. One option
1179  * would be to pull more data into the
1180  * cache line to reject the false
1181  * hits.
1182  */
1183 ++vtw_stats.losing[1];
1184 ++losings;
1185
1186 db_trace(KTR_VTW
1187 , (fp
1188 , "vtw:!mis port %x"
1189 " - free entry idx %x vtw %p"
1190 , lport, idx_decode(ctl, idx)
1191 , vtw));
1192 }
1193 }
1194
1195 if (fp->nxt) {
1196 it->fp = fp = fatp_next(ctl->fat, fp);
1197 it->slot_idx = 0;
1198 } else {
1199 it->fp = 0;
1200 break;
1201 }
1202 }
1203 ++vtw_stats.miss[1];
1204
1205 vtw = 0;
1206 out:
1207 if (fatps > vtw_stats.max_chain[1])
1208 vtw_stats.max_chain[1] = fatps;
1209 if (probes > vtw_stats.max_probe[1])
1210 vtw_stats.max_probe[1] = probes;
1211 if (losings > vtw_stats.max_loss[1])
1212 vtw_stats.max_loss[1] = losings;
1213
1214 return vtw;
1215 }
1216
1217 /*!\brief initialise the VTW allocation arena
1218 *
1219 * There are 1+3 allocation classes:
1220 * 0 classless
1221 * {1,2,3} MSL-class based allocation
1222 *
1223 * The allocation arenas are all initialised. Classless gets all the
1224 * space. MSL-class based divides the arena, so that allocation
1225 * within a class can proceed without having to consider entries
1226 * (aka: cache lines) from different classes.
1227 *
1228 * Usually, we are completely classless or class-based, but there can be
1229 * transition periods, corresponding to dynamic adjustments in the config
1230 * by the operator.
1231 */
1232 static void
1233 vtw_init(fatp_ctl_t *fat, vtw_ctl_t *ctl, const uint32_t n, vtw_t *ctl_base_v)
1234 {
1235 int class_n, i;
1236 vtw_t *base;
1237
1238 ctl->base.v = ctl_base_v;
1239
1240 if (ctl->is_v4) {
1241 ctl->lim.v4 = ctl->base.v4 + n - 1;
1242 ctl->alloc.v4 = ctl->base.v4;
1243 } else {
1244 ctl->lim.v6 = ctl->base.v6 + n - 1;
1245 ctl->alloc.v6 = ctl->base.v6;
1246 }
1247
1248 ctl->nfree = n;
1249 ctl->ctl = ctl;
1250
1251 ctl->idx_bits = 32;
1252 for (ctl->idx_mask = ~0; (ctl->idx_mask & (n-1)) == n-1; ) {
1253 ctl->idx_mask >>= 1;
1254 ctl->idx_bits -= 1;
1255 }
1256
1257 ctl->idx_mask <<= 1;
1258 ctl->idx_mask |= 1;
1259 ctl->idx_bits += 1;
1260
1261 ctl->fat = fat;
1262 fat->vtw = ctl;
1263
1264 /* Divide the resources equally amongst the classes.
1265 * This is not optimal, as the different classes
1266 * arrive and leave at different rates, but it is
1267 * the best I can do for now.
1268 */
1269 class_n = n / (VTW_NCLASS-1);
1270 base = ctl->base.v;
1271
1272 for (i = 1; i < VTW_NCLASS; ++i) {
1273 int j;
1274
1275 ctl[i] = ctl[0];
1276 ctl[i].clidx = i;
1277
1278 ctl[i].base.v = base;
1279 ctl[i].alloc = ctl[i].base;
1280
1281 for (j = 0; j < class_n - 1; ++j) {
1282 if (tcp_msl_enable)
1283 base->msl_class = i;
1284 base = vtw_next(ctl, base);
1285 }
1286
1287 ctl[i].lim.v = base;
1288 base = vtw_next(ctl, base);
1289 ctl[i].nfree = class_n;
1290 }
1291
1292 vtw_debug_init();
1293 }
1294
1295 /*!\brief map class to TCP MSL
1296 */
1297 static inline uint32_t
1298 class_to_msl(int class)
1299 {
1300 switch (class) {
1301 case 0:
1302 case 1:
1303 return tcp_msl_remote ? tcp_msl_remote : (TCPTV_MSL >> 0);
1304 case 2:
1305 return tcp_msl_local ? tcp_msl_local : (TCPTV_MSL >> 1);
1306 default:
1307 return tcp_msl_loop ? tcp_msl_loop : (TCPTV_MSL >> 2);
1308 }
1309 }
1310
1311 /*!\brief map TCP MSL to class
1312 */
1313 static inline uint32_t
1314 msl_to_class(int msl)
1315 {
1316 if (tcp_msl_enable) {
1317 if (msl <= (tcp_msl_loop ? tcp_msl_loop : (TCPTV_MSL >> 2)))
1318 return 1+2;
1319 if (msl <= (tcp_msl_local ? tcp_msl_local : (TCPTV_MSL >> 1)))
1320 return 1+1;
1321 return 1;
1322 }
1323 return 0;
1324 }
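
/* With the stock TCPTV_MSL (30 seconds' worth of PR_SLOWHZ ticks) and
 * the tcp_msl_* overrides left at 0, the two maps above are inverses
 * of each other:
 *
 *	class 1 (remote):    MSL      = 30s    msl_to_class -> 1
 *	class 2 (local):     MSL >> 1 = 15s    msl_to_class -> 2
 *	class 3 (loopback):  MSL >> 2 = 7.5s   msl_to_class -> 3
 *
 * Class 0 is the classless arena, used whenever tcp_msl_enable is off.
 */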
1325
1326 /*!\brief allocate a vtw entry
1327 */
1328 static inline vtw_t *
1329 vtw_alloc(vtw_ctl_t *ctl)
1330 {
1331 vtw_t *vtw = 0;
1332 int stuck = 0;
1333 int avail = ctl ? (ctl->nalloc + ctl->nfree) : 0;
1334 int msl;
1335
1336 KASSERT(mutex_owned(softnet_lock));
1337
1338 /* If no resources, we will not get far.
1339 */
1340 if (!ctl || !ctl->base.v4 || avail <= 0)
1341 return 0;
1342
1343 /* Obtain a free one.
1344 */
1345 while (!ctl->nfree) {
1346 vtw_age(ctl, 0);
1347
1348 if (++stuck > avail) {
1349 /* When in transition between
1350 * schemes (classless, classed) we
1351 * can be stuck having to await the
1352 * expiration of cross-allocated entries.
1353 *
1354 * Returning zero means we will fall back to the
1355 * traditional TIME_WAIT handling, except in the
1356 * case of a re-shed, in which case we cannot
1357 * perform the re-shed, but will retain the extant
1358 * entry.
1359 */
1360 db_trace(KTR_VTW
1361 , (ctl, "vtw:!none free in class %x %x/%x"
1362 , ctl->clidx
1363 , ctl->nalloc, ctl->nfree));
1364
1365 return 0;
1366 }
1367 }
1368
1369 vtw = ctl->alloc.v;
1370
1371 if (vtw->msl_class != ctl->clidx) {
1372 /* Usurping rules:
1373 * 0 -> {1,2,3} or {1,2,3} -> 0
1374 */
1375 KASSERT(!vtw->msl_class || !ctl->clidx);
1376
1377 if (vtw->hashed || vtw->expire.tv_sec) {
1378 /* As this is owned by some other class,
1379 * we must wait for it to expire it.
1380 * This will only happen on class/classless
1381 * transitions, which are guaranteed to progress
1382 * to completion in small finite time, barring bugs.
1383 */
1384 db_trace(KTR_VTW
1385 , (ctl, "vtw:!%p class %x!=%x %x:%x%s"
1386 , vtw, vtw->msl_class, ctl->clidx
1387 , vtw->expire.tv_sec
1388 , vtw->expire.tv_usec
1389 , vtw->hashed ? " hashed" : ""));
1390
1391 return 0;
1392 }
1393
1394 db_trace(KTR_VTW
1395 , (ctl, "vtw:!%p usurped from %x to %x"
1396 , vtw, vtw->msl_class, ctl->clidx));
1397
1398 vtw->msl_class = ctl->clidx;
1399 }
1400
1401 if (vtw_alive(vtw)) {
1402 KASSERT(0 && "next free not free");
1403 return 0;
1404 }
1405
1406 /* Advance the allocation pointer.
1407 */
1408 ctl->alloc.v = vtw_next(ctl, vtw);
1409
1410 --ctl->nfree;
1411 ++ctl->nalloc;
1412
1413 msl = (2 * class_to_msl(ctl->clidx) * 1000) / PR_SLOWHZ; // msec
1414
1415 /* mark expiration
1416 */
1417 getmicrouptime(&vtw->expire);
1418
1419 /* Move expiration into the future.
1420 */
1421 vtw->expire.tv_sec += msl / 1000;
1422 vtw->expire.tv_usec += 1000 * (msl % 1000);
1423
1424 while (vtw->expire.tv_usec >= 1000*1000) {
1425 vtw->expire.tv_usec -= 1000*1000;
1426 vtw->expire.tv_sec += 1;
1427 }
1428
1429 if (!ctl->oldest.v)
1430 ctl->oldest.v = vtw;
1431
1432 return vtw;
1433 }
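
/* A worked example of the expiry arithmetic above, assuming the stock
 * TCPTV_MSL of 30*PR_SLOWHZ (i.e. 30 seconds) and class 1:
 *
 *	msl = (2 * 30*PR_SLOWHZ * 1000) / PR_SLOWHZ = 60000	(msec)
 *
 * so expire lands 60 seconds -- the usual 2*MSL -- past the current
 * uptime, with any usec overflow folded into tv_sec by the loop.
 */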
1434
1435 /*!\brief expiration
1436 */
1437 static int
1438 vtw_age(vtw_ctl_t *ctl, struct timeval *_when)
1439 {
1440 vtw_t *vtw;
1441 struct timeval then, *when = _when;
1442 int maxtries = 0;
1443
1444 if (!ctl->oldest.v) {
1445 KASSERT(!ctl->nalloc);
1446 return 0;
1447 }
1448
1449 for (vtw = ctl->oldest.v; vtw && ctl->nalloc; ) {
1450 if (++maxtries > ctl->nalloc)
1451 break;
1452
1453 if (vtw->msl_class != ctl->clidx) {
1454 db_trace(KTR_VTW
1455 , (vtw, "vtw:!age class mismatch %x != %x"
1456 , vtw->msl_class, ctl->clidx));
1457 /* XXXX
1458 * See if the appropriate action is to skip to the next.
1459 * XXXX
1460 */
1461 ctl->oldest.v = vtw = vtw_next(ctl, vtw);
1462 continue;
1463 }
1464 if (!when) {
1465 /* Latch oldest timeval if none specified.
1466 */
1467 then = vtw->expire;
1468 when = &then;
1469 }
1470
1471 if (!timercmp(&vtw->expire, when, <=))
1472 break;
1473
1474 db_trace(KTR_VTW
1475 , (vtw, "vtw: expire %x %8.8x:%8.8x %x/%x"
1476 , ctl->clidx
1477 , vtw->expire.tv_sec
1478 , vtw->expire.tv_usec
1479 , ctl->nalloc
1480 , ctl->nfree));
1481
1482 if (!_when)
1483 ++vtw_stats.kill;
1484
1485 vtw_del(ctl, vtw);
1486 vtw = ctl->oldest.v;
1487 }
1488
1489 return ctl->nalloc; // # remaining allocated
1490 }
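
/* vtw_age() is used in two modes.  Given a timeval (the periodic tick)
 * it expires every entry whose expiry is <= *when.  With when == NULL
 * (the "make room now" calls in vtw_alloc() and fatp_vtw_inshash()) it
 * latches the oldest entry's own expiry time, and so reclaims at least
 * that entry; only those forced kills are counted in vtw_stats.kill.
 */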
1491
1492 static callout_t vtw_cs;
1493
1494 /*!\brief notice the passage of time.
1495 * It seems to be getting faster. What happened to the year?
1496 */
1497 static void
1498 vtw_tick(void *arg)
1499 {
1500 struct timeval now;
1501 int i, cnt = 0;
1502
1503 getmicrouptime(&now);
1504
1505 db_trace(KTR_VTW, (arg, "vtk: tick - now %8.8x:%8.8x"
1506 , now.tv_sec, now.tv_usec));
1507
1508 mutex_enter(softnet_lock);
1509
1510 for (i = 0; i < VTW_NCLASS; ++i) {
1511 cnt += vtw_age(&vtw_tcpv4[i], &now);
1512 cnt += vtw_age(&vtw_tcpv6[i], &now);
1513 }
1514
1515 /* Keep ticks coming while we need them.
1516 */
1517 if (cnt)
1518 callout_schedule(&vtw_cs, hz / 5);
1519 else {
1520 tcp_vtw_was_enabled = 0;
1521 tcbtable.vestige = 0;
1522 }
1523 mutex_exit(softnet_lock);
1524 }
1525
1526 /* in_pcblookup_ports assist for handling vestigial entries.
1527 */
1528 static void *
1529 tcp_init_ports_v4(struct in_addr addr, u_int port, int wild)
1530 {
1531 struct tcp_ports_iterator *it = &tcp_ports_iterator_v4;
1532
1533 bzero(it, sizeof (*it));
1534
1535 /* Note: the reference to vtw_tcpv4[0] is fine.
1536 * We do not need per-class iteration. We just
1537 * need to get to the fat, and there is one
1538 * shared fat.
1539 */
1540 if (vtw_tcpv4[0].fat) {
1541 it->addr.v4 = addr;
1542 it->port = port;
1543 it->wild = !!wild;
1544 it->ctl = &vtw_tcpv4[0];
1545
1546 ++vtw_stats.look[1];
1547 }
1548
1549 return it;
1550 }
1551
1552 /*!\brief export an IPv4 vtw.
1553 */
1554 static int
1555 vtw_export_v4(vtw_ctl_t *ctl, vtw_t *vtw, vestigial_inpcb_t *res)
1556 {
1557 vtw_v4_t *v4 = (void*)vtw;
1558
1559 bzero(res, sizeof (*res));
1560
1561 if (ctl && vtw) {
1562 if (!ctl->clidx && vtw->msl_class)
1563 ctl += vtw->msl_class;
1564 else
1565 KASSERT(ctl->clidx == vtw->msl_class);
1566
1567 res->valid = 1;
1568 res->v4 = 1;
1569
1570 res->faddr.v4.s_addr = v4->faddr;
1571 res->laddr.v4.s_addr = v4->laddr;
1572 res->fport = v4->fport;
1573 res->lport = v4->lport;
1574 res->vtw = vtw; // netlock held over call(s)
1575 res->ctl = ctl;
1576 res->reuse_addr = vtw->reuse_addr;
1577 res->reuse_port = vtw->reuse_port;
1578 res->snd_nxt = vtw->snd_nxt;
1579 res->rcv_nxt = vtw->rcv_nxt;
1580 res->rcv_wnd = vtw->rcv_wnd;
1581 res->uid = vtw->uid;
1582 }
1583
1584 return res->valid;
1585 }
1586
1587 /*!\brief return next port in the port iterator. yowza.
1588 */
1589 static int
1590 tcp_next_port_v4(void *arg, struct vestigial_inpcb *res)
1591 {
1592 struct tcp_ports_iterator *it = arg;
1593 vtw_t *vtw = 0;
1594
1595 if (it->ctl)
1596 vtw = vtw_next_port_v4(it);
1597
1598 if (!vtw)
1599 it->ctl = 0;
1600
1601 return vtw_export_v4(it->ctl, vtw, res);
1602 }
1603
1604 static int
1605 tcp_lookup_v4(struct in_addr faddr, uint16_t fport,
1606 struct in_addr laddr, uint16_t lport,
1607 struct vestigial_inpcb *res)
1608 {
1609 vtw_t *vtw;
1610 vtw_ctl_t *ctl;
1611
1612
1613 db_trace(KTR_VTW
1614 , (res, "vtw: lookup %A:%P %A:%P"
1615 , faddr, fport
1616 , laddr, lport));
1617
1618 vtw = vtw_lookup_hash_v4((ctl = &vtw_tcpv4[0])
1619 , faddr.s_addr, fport
1620 , laddr.s_addr, lport, 0);
1621
1622 return vtw_export_v4(ctl, vtw, res);
1623 }
1624
1625 /* in_pcblookup_ports assist for handling vestigial entries.
1626 */
1627 static void *
1628 tcp_init_ports_v6(const struct in6_addr *addr, u_int port, int wild)
1629 {
1630 struct tcp_ports_iterator *it = &tcp_ports_iterator_v6;
1631
1632 bzero(it, sizeof (*it));
1633
1634 /* Note: the reference to vtw_tcpv6[0] is fine.
1635 * We do not need per-class iteration. We just
1636 * need to get to the fat, and there is one
1637 * shared fat.
1638 */
1639 if (vtw_tcpv6[0].fat) {
1640 it->addr.v6 = *addr;
1641 it->port = port;
1642 it->wild = !!wild;
1643 it->ctl = &vtw_tcpv6[0];
1644
1645 ++vtw_stats.look[1];
1646 }
1647
1648 return it;
1649 }
1650
1651 /*!\brief export an IPv6 vtw.
1652 */
1653 static int
1654 vtw_export_v6(vtw_ctl_t *ctl, vtw_t *vtw, vestigial_inpcb_t *res)
1655 {
1656 vtw_v6_t *v6 = (void*)vtw;
1657
1658 bzero(res, sizeof (*res));
1659
1660 if (ctl && vtw) {
1661 if (!ctl->clidx && vtw->msl_class)
1662 ctl += vtw->msl_class;
1663 else
1664 KASSERT(ctl->clidx == vtw->msl_class);
1665
1666 res->valid = 1;
1667 res->v4 = 0;
1668
1669 res->faddr.v6 = v6->faddr;
1670 res->laddr.v6 = v6->laddr;
1671 res->fport = v6->fport;
1672 res->lport = v6->lport;
1673 res->vtw = vtw; // netlock held over call(s)
1674 res->ctl = ctl;
1675
1676 res->v6only = vtw->v6only;
1677 res->reuse_addr = vtw->reuse_addr;
1678 res->reuse_port = vtw->reuse_port;
1679
1680 res->snd_nxt = vtw->snd_nxt;
1681 res->rcv_nxt = vtw->rcv_nxt;
1682 res->rcv_wnd = vtw->rcv_wnd;
1683 res->uid = vtw->uid;
1684 }
1685
1686 return res->valid;
1687 }
1688
1689 static int
1690 tcp_next_port_v6(void *arg, struct vestigial_inpcb *res)
1691 {
1692 struct tcp_ports_iterator *it = arg;
1693 vtw_t *vtw = 0;
1694
1695 if (it->ctl)
1696 vtw = vtw_next_port_v6(it);
1697
1698 if (!vtw)
1699 it->ctl = 0;
1700
1701 return vtw_export_v6(it->ctl, vtw, res);
1702 }
1703
1704 static int
1705 tcp_lookup_v6(const struct in6_addr *faddr, uint16_t fport,
1706 const struct in6_addr *laddr, uint16_t lport,
1707 struct vestigial_inpcb *res)
1708 {
1709 vtw_ctl_t *ctl;
1710 vtw_t *vtw;
1711
1712 db_trace(KTR_VTW
1713 , (res, "vtw: lookup %6A:%P %6A:%P"
1714 , db_store(faddr, sizeof (*faddr)), fport
1715 , db_store(laddr, sizeof (*laddr)), lport));
1716
1717 vtw = vtw_lookup_hash_v6((ctl = &vtw_tcpv6[0])
1718 , faddr, fport
1719 , laddr, lport, 0);
1720
1721 return vtw_export_v6(ctl, vtw, res);
1722 }
1723
1724 static vestigial_hooks_t tcp_hooks = {
1725 .init_ports4 = tcp_init_ports_v4,
1726 .next_port4 = tcp_next_port_v4,
1727 .lookup4 = tcp_lookup_v4,
1728 .init_ports6 = tcp_init_ports_v6,
1729 .next_port6 = tcp_next_port_v6,
1730 .lookup6 = tcp_lookup_v6,
1731 };
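
#if 0	/* Illustrative only -- an assumed sketch of the consumer side.
	 * The pcb lookup path tests tcbtable.vestige and, when set,
	 * consults these hooks alongside the ordinary pcb hashes:
	 */
	vestigial_inpcb_t vp;

	if (tcbtable.vestige != NULL
	    && (*tcbtable.vestige->lookup4)(faddr, fport,
					    laddr, lport, &vp)
	    && vp.valid) {
		/* a vestigial TIME_WAIT matched; the caller may
		 * vtw_restart(&vp) or let it expire
		 */
	}
#endif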
1732
1733 static bool
1734 vtw_select(int af, fatp_ctl_t **fatp, vtw_ctl_t **ctlp)
1735 {
1736 fatp_ctl_t *fat;
1737 vtw_ctl_t *ctl;
1738
1739 switch (af) {
1740 case AF_INET:
1741 fat = &fat_tcpv4;
1742 ctl = &vtw_tcpv4[0];
1743 break;
1744 case AF_INET6:
1745 fat = &fat_tcpv6;
1746 ctl = &vtw_tcpv6[0];
1747 break;
1748 default:
1749 return false;
1750 }
1751 if (fatp != NULL)
1752 *fatp = fat;
1753 if (ctlp != NULL)
1754 *ctlp = ctl;
1755 return true;
1756 }
1757
1758 /*!\brief initialize controlling instance
1759 */
1760 static int
1761 vtw_control_init(int af)
1762 {
1763 fatp_ctl_t *fat;
1764 vtw_ctl_t *ctl;
1765 fatp_t *fat_base;
1766 fatp_t **fat_hash;
1767 vtw_t *ctl_base_v;
1768 uint32_t n, m;
1769 size_t sz;
1770
1771 KASSERT(powerof2(tcp_vtw_entries));
1772
1773 if (!vtw_select(af, &fat, &ctl))
1774 return EAFNOSUPPORT;
1775
1776 if (fat->hash != NULL) {
1777 KASSERT(fat->base != NULL && ctl->base.v != NULL);
1778 return 0;
1779 }
1780
1781 /* Allocate 10% more capacity in the fat pointers.
1782 * We should only need ~#hash additional based on
1783 * how they age, but TIME_WAIT assassination could cause
1784 * sparse fat pointer utilisation.
1785 */
1786 m = 512;
1787 n = 2*m + (11 * (tcp_vtw_entries / fatp_ntags())) / 10;
1788 sz = (ctl->is_v4 ? sizeof(vtw_v4_t) : sizeof(vtw_v6_t));
1789
1790 fat_hash = kmem_zalloc(2*m * sizeof(fatp_t *), KM_NOSLEEP);
1791
1792 if (fat_hash == NULL) {
1793 printf("%s: could not allocate %zu bytes for "
1794 "hash anchors", __func__, 2*m * sizeof(fatp_t *));
1795 return ENOMEM;
1796 }
1797
1798 fat_base = kmem_zalloc(2*n * sizeof(fatp_t), KM_NOSLEEP);
1799
1800 if (fat_base == NULL) {
1801 kmem_free(fat_hash, 2*m * sizeof (fatp_t *));
1802 printf("%s: could not allocate %zu bytes for "
1803 "fatp_t array", __func__, 2*n * sizeof(fatp_t));
1804 return ENOMEM;
1805 }
1806
1807 ctl_base_v = kmem_zalloc(tcp_vtw_entries * sz, KM_NOSLEEP);
1808
1809 if (ctl_base_v == NULL) {
1810 kmem_free(fat_hash, 2*m * sizeof (fatp_t *));
1811 kmem_free(fat_base, 2*n * sizeof(fatp_t));
1812 printf("%s: could not allocate %zu bytes for "
1813 "vtw_t array", __func__, tcp_vtw_entries * sz);
1814 return ENOMEM;
1815 }
1816
1817 fatp_init(fat, n, m, fat_base, fat_hash);
1818
1819 vtw_init(fat, ctl, tcp_vtw_entries, ctl_base_v);
1820
1821 return 0;
1822 }
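
/* A worked sizing example, assuming tcp_vtw_entries == 8192 and 64-byte
 * cache lines (fatp_ntags() == 15):
 *
 *	m = 512					hash buckets (2*m anchors)
 *	n = 2*512 + (11 * (8192 / 15)) / 10
 *	  = 1024 + 600 = 1624
 *
 * and 2*n fatp_t are allocated, since each vtw is tagged in both the
 * full hash and the port hash; the 11/10 factor is the 10% headroom
 * described above.
 */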
1823
1824 /*!\brief select controlling instance
1825 */
1826 static vtw_ctl_t *
1827 vtw_control(int af, uint32_t msl)
1828 {
1829 fatp_ctl_t *fat;
1830 vtw_ctl_t *ctl;
1831 int class = msl_to_class(msl);
1832
1833 if (!vtw_select(af, &fat, &ctl))
1834 return NULL;
1835
1836 if (!fat->base || !ctl->base.v)
1837 return NULL;
1838
1839 if (!tcp_vtw_was_enabled) {
1840 /* This guarantees timer ticks until we no longer need them.
1841 */
1842 tcp_vtw_was_enabled = 1;
1843
1844 callout_schedule(&vtw_cs, hz / 5);
1845
1846 tcbtable.vestige = &tcp_hooks;
1847 }
1848
1849 return ctl + class;
1850 }
1851
1852 /*!\brief add TCP pcb to vestigial timewait
1853 */
1854 int
1855 vtw_add(int af, struct tcpcb *tp)
1856 {
1857 int enable;
1858 vtw_ctl_t *ctl;
1859 vtw_t *vtw;
1860
1861 KASSERT(mutex_owned(softnet_lock));
1862
1863 ctl = vtw_control(af, tp->t_msl);
1864 if (!ctl)
1865 return 0;
1866
1867 enable = (af == AF_INET) ? tcp4_vtw_enable : tcp6_vtw_enable;
1868
1869 vtw = vtw_alloc(ctl);
1870
1871 if (vtw) {
1872 vtw->snd_nxt = tp->snd_nxt;
1873 vtw->rcv_nxt = tp->rcv_nxt;
1874
1875 switch (af) {
1876 case AF_INET: {
1877 struct inpcb *inp = tp->t_inpcb;
1878 vtw_v4_t *v4 = (void*)vtw;
1879
1880 v4->faddr = inp->inp_faddr.s_addr;
1881 v4->laddr = inp->inp_laddr.s_addr;
1882 v4->fport = inp->inp_fport;
1883 v4->lport = inp->inp_lport;
1884
1885 vtw->reuse_port = !!(inp->inp_socket->so_options
1886 & SO_REUSEPORT);
1887 vtw->reuse_addr = !!(inp->inp_socket->so_options
1888 & SO_REUSEADDR);
1889 vtw->v6only = 0;
1890 vtw->uid = inp->inp_socket->so_uidinfo->ui_uid;
1891
1892 vtw_inshash_v4(ctl, vtw);
1893
1894
1895 #ifdef VTW_DEBUG
1896 /* Immediate lookup (connected and port) to
1897 * ensure at least that works!
1898 */
1899 if (enable & 4) {
1900 KASSERT(vtw_lookup_hash_v4
1901 (ctl
1902 , inp->inp_faddr.s_addr, inp->inp_fport
1903 , inp->inp_laddr.s_addr, inp->inp_lport
1904 , 0)
1905 == vtw);
1906 KASSERT(vtw_lookup_hash_v4
1907 (ctl
1908 , inp->inp_faddr.s_addr, inp->inp_fport
1909 , inp->inp_laddr.s_addr, inp->inp_lport
1910 , 1));
1911 }
1912 /* Immediate port iterator functionality check: not wild
1913 */
1914 if (enable & 8) {
1915 struct tcp_ports_iterator *it;
1916 struct vestigial_inpcb res;
1917 int cnt = 0;
1918
1919 it = tcp_init_ports_v4(inp->inp_laddr
1920 , inp->inp_lport, 0);
1921
1922 while (tcp_next_port_v4(it, &res)) {
1923 ++cnt;
1924 }
1925 KASSERT(cnt);
1926 }
1927 /* Immediate port iterator functionality check: wild
1928 */
1929 if (enable & 16) {
1930 struct tcp_ports_iterator *it;
1931 struct vestigial_inpcb res;
1932 struct in_addr any;
1933 int cnt = 0;
1934
1935 any.s_addr = htonl(INADDR_ANY);
1936
1937 it = tcp_init_ports_v4(any, inp->inp_lport, 1);
1938
1939 while (tcp_next_port_v4(it, &res)) {
1940 ++cnt;
1941 }
1942 KASSERT(cnt);
1943 }
1944 #endif /* VTW_DEBUG */
1945 break;
1946 }
1947
1948 case AF_INET6: {
1949 struct in6pcb *inp = tp->t_in6pcb;
1950 vtw_v6_t *v6 = (void*)vtw;
1951
1952 v6->faddr = inp->in6p_faddr;
1953 v6->laddr = inp->in6p_laddr;
1954 v6->fport = inp->in6p_fport;
1955 v6->lport = inp->in6p_lport;
1956
1957 vtw->reuse_port = !!(inp->in6p_socket->so_options
1958 & SO_REUSEPORT);
1959 vtw->reuse_addr = !!(inp->in6p_socket->so_options
1960 & SO_REUSEADDR);
1961 vtw->v6only = !!(inp->in6p_flags
1962 & IN6P_IPV6_V6ONLY);
1963 vtw->uid = inp->in6p_socket->so_uidinfo->ui_uid;
1964
1965 vtw_inshash_v6(ctl, vtw);
1966 #ifdef VTW_DEBUG
1967 /* Immediate lookup (connected and port) to
1968 * ensure at least that works!
1969 */
1970 if (enable & 4) {
1971 KASSERT(vtw_lookup_hash_v6(ctl
1972 , &inp->in6p_faddr, inp->in6p_fport
1973 , &inp->in6p_laddr, inp->in6p_lport
1974 , 0)
1975 == vtw);
1976 KASSERT(vtw_lookup_hash_v6
1977 (ctl
1978 , &inp->in6p_faddr, inp->in6p_fport
1979 , &inp->in6p_laddr, inp->in6p_lport
1980 , 1));
1981 }
1982 /* Immediate port iterator functionality check: not wild
1983 */
1984 if (enable & 8) {
1985 struct tcp_ports_iterator *it;
1986 struct vestigial_inpcb res;
1987 int cnt = 0;
1988
1989 it = tcp_init_ports_v6(&inp->in6p_laddr
1990 , inp->in6p_lport, 0);
1991
1992 while (tcp_next_port_v6(it, &res)) {
1993 ++cnt;
1994 }
1995 KASSERT(cnt);
1996 }
1997 /* Immediate port iterator functionality check: wild
1998 */
1999 if (enable & 16) {
2000 struct tcp_ports_iterator *it;
2001 struct vestigial_inpcb res;
2002 static struct in6_addr any = IN6ADDR_ANY_INIT;
2003 int cnt = 0;
2004
2005 it = tcp_init_ports_v6(&any
2006 , inp->in6p_lport, 1);
2007
2008 while (tcp_next_port_v6(it, &res)) {
2009 ++cnt;
2010 }
2011 KASSERT(cnt);
2012 }
2013 #endif /* VTW_DEBUG */
2014 break;
2015 }
2016 }
2017
2018 tcp_canceltimers(tp);
2019 tp = tcp_close(tp);
2020 KASSERT(!tp);
2021
2022 return 1;
2023 }
2024
2025 return 0;
2026 }
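
/* An assumed sketch of the intended call site: where the stack would
 * otherwise park a connection in TIME_WAIT (e.g. in tcp_input.c), it
 * can first offer the pcb to the vestigial machinery:
 *
 *	if ((af == AF_INET ? tcp4_vtw_enable : tcp6_vtw_enable)
 *	    && vtw_add(af, tp)) {
 *		// tp has been closed; a vtw_t now stands in for it
 *	} else {
 *		// fall back to a traditional TIME_WAIT tcpcb
 *	}
 *
 * On success vtw_add() has already called tcp_canceltimers() and
 * tcp_close(), so the caller must not touch tp again.
 */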
2027
2028 /*!\brief restart timer for vestigial time-wait entry
2029 */
2030 static void
2031 vtw_restart_v4(vestigial_inpcb_t *vp)
2032 {
2033 vtw_v4_t copy = *(vtw_v4_t*)vp->vtw;
2034 vtw_t *vtw;
2035 vtw_t *cp = &copy.common;
2036 vtw_ctl_t *ctl;
2037
2038 KASSERT(mutex_owned(softnet_lock));
2039
2040 db_trace(KTR_VTW
2041 , (vp->vtw, "vtw: restart %A:%P %A:%P"
2042 , vp->faddr.v4.s_addr, vp->fport
2043 , vp->laddr.v4.s_addr, vp->lport));
2044
2045 /* Class might have changed, so have a squiz.
2046 */
2047 ctl = vtw_control(AF_INET, class_to_msl(cp->msl_class));
2048 vtw = vtw_alloc(ctl);
2049
2050 if (vtw) {
2051 vtw_v4_t *v4 = (void*)vtw;
2052
2053 /* Safe now to unhash the old entry
2054 */
2055 vtw_del(vp->ctl, vp->vtw);
2056
2057 vtw->snd_nxt = cp->snd_nxt;
2058 vtw->rcv_nxt = cp->rcv_nxt;
2059
2060 v4->faddr = copy.faddr;
2061 v4->laddr = copy.laddr;
2062 v4->fport = copy.fport;
2063 v4->lport = copy.lport;
2064
2065 vtw->reuse_port = cp->reuse_port;
2066 vtw->reuse_addr = cp->reuse_addr;
2067 vtw->v6only = 0;
2068 vtw->uid = cp->uid;
2069
2070 vtw_inshash_v4(ctl, vtw);
2071 }
2072
2073 vp->valid = 0;
2074 }
2075
2076 /*!\brief restart timer for vestigial time-wait entry
2077 */
2078 static void
2079 vtw_restart_v6(vestigial_inpcb_t *vp)
2080 {
2081 vtw_v6_t copy = *(vtw_v6_t*)vp->vtw;
2082 vtw_t *vtw;
2083 vtw_t *cp = &copy.common;
2084 vtw_ctl_t *ctl;
2085
2086 KASSERT(mutex_owned(softnet_lock));
2087
2088 db_trace(KTR_VTW
2089 , (vp->vtw, "vtw: restart %6A:%P %6A:%P"
2090 , db_store(&vp->faddr.v6, sizeof (vp->faddr.v6))
2091 , vp->fport
2092 , db_store(&vp->laddr.v6, sizeof (vp->laddr.v6))
2093 , vp->lport));
2094
2095 /* Class might have changed, so have a squiz.
2096 */
2097 ctl = vtw_control(AF_INET6, class_to_msl(cp->msl_class));
2098 vtw = vtw_alloc(ctl);
2099
2100 if (vtw) {
2101 vtw_v6_t *v6 = (void*)vtw;
2102
2103 /* Safe now to unhash the old entry
2104 */
2105 vtw_del(vp->ctl, vp->vtw);
2106
2107 vtw->snd_nxt = cp->snd_nxt;
2108 vtw->rcv_nxt = cp->rcv_nxt;
2109
2110 v6->faddr = copy.faddr;
2111 v6->laddr = copy.laddr;
2112 v6->fport = copy.fport;
2113 v6->lport = copy.lport;
2114
2115 vtw->reuse_port = cp->reuse_port;
2116 vtw->reuse_addr = cp->reuse_addr;
2117 vtw->v6only = cp->v6only;
2118 vtw->uid = cp->uid;
2119
2120 vtw_inshash_v6(ctl, vtw);
2121 }
2122
2123 vp->valid = 0;
2124 }
2125
2126 /*!\brief restart timer for vestigial time-wait entry
2127 */
2128 void
2129 vtw_restart(vestigial_inpcb_t *vp)
2130 {
2131 if (!vp || !vp->valid)
2132 return;
2133
2134 if (vp->v4)
2135 vtw_restart_v4(vp);
2136 else
2137 vtw_restart_v6(vp);
2138 }
2139
2140 int
2141 vtw_earlyinit(void)
2142 {
2143 int i, rc;
2144
2145 callout_init(&vtw_cs, 0);
2146 callout_setfunc(&vtw_cs, vtw_tick, 0);
2147
2148 for (i = 0; i < VTW_NCLASS; ++i) {
2149 vtw_tcpv4[i].is_v4 = 1;
2150 vtw_tcpv6[i].is_v6 = 1;
2151 }
2152
2153 if ((rc = vtw_control_init(AF_INET)) != 0 ||
2154 (rc = vtw_control_init(AF_INET6)) != 0)
2155 return rc;
2156
2157 return 0;
2158 }
2159
2160 #ifdef VTW_DEBUG
2161 #include <sys/syscallargs.h>
2162 #include <sys/sysctl.h>
2163
2164 /*!\brief add lalp, fafp entries for debug
2165 */
2166 int
2167 vtw_debug_add(int af, sin_either_t *la, sin_either_t *fa, int msl, int class)
2168 {
2169 vtw_ctl_t *ctl;
2170 vtw_t *vtw;
2171
2172 ctl = vtw_control(af, msl ? msl : class_to_msl(class));
2173 if (!ctl)
2174 return 0;
2175
2176 vtw = vtw_alloc(ctl);
2177
2178 if (vtw) {
2179 vtw->snd_nxt = 0;
2180 vtw->rcv_nxt = 0;
2181
2182 switch (af) {
2183 case AF_INET: {
2184 vtw_v4_t *v4 = (void*)vtw;
2185
2186 v4->faddr = fa->sin_addr.v4.s_addr;
2187 v4->laddr = la->sin_addr.v4.s_addr;
2188 v4->fport = fa->sin_port;
2189 v4->lport = la->sin_port;
2190
2191 vtw->reuse_port = 1;
2192 vtw->reuse_addr = 1;
2193 vtw->v6only = 0;
2194 vtw->uid = 0;
2195
2196 vtw_inshash_v4(ctl, vtw);
2197 break;
2198 }
2199
2200 case AF_INET6: {
2201 vtw_v6_t *v6 = (void*)vtw;
2202
2203 v6->faddr = fa->sin_addr.v6;
2204 v6->laddr = la->sin_addr.v6;
2205
2206 v6->fport = fa->sin_port;
2207 v6->lport = la->sin_port;
2208
2209 vtw->reuse_port = 1;
2210 vtw->reuse_addr = 1;
2211 vtw->v6only = 0;
2212 vtw->uid = 0;
2213
2214 vtw_inshash_v6(ctl, vtw);
2215 break;
2216 }
2217
2218 default:
2219 break;
2220 }
2221
2222 return 1;
2223 }
2224
2225 return 0;
2226 }
2227
2228 static int vtw_syscall = 0;
2229
2230 static int
2231 vtw_debug_process(vtw_sysargs_t *ap)
2232 {
2233 struct vestigial_inpcb vestige;
2234 int rc = 0;
2235
2236 mutex_enter(softnet_lock);
2237
2238 switch (ap->op) {
2239 case 0: // insert
2240 vtw_debug_add(ap->la.sin_family
2241 , &ap->la
2242 , &ap->fa
2243 , TCPTV_MSL
2244 , 0);
2245 break;
2246
2247 case 1: // lookup
2248 case 2: // restart
2249 switch (ap->la.sin_family) {
2250 case AF_INET:
2251 if (tcp_lookup_v4(ap->fa.sin_addr.v4, ap->fa.sin_port,
2252 ap->la.sin_addr.v4, ap->la.sin_port,
2253 &vestige)) {
2254 if (ap->op == 2) {
2255 vtw_restart(&vestige);
2256 }
2257 rc = 0;
2258 } else
2259 rc = ESRCH;
2260 break;
2261
2262 case AF_INET6:
2263 if (tcp_lookup_v6(&ap->fa.sin_addr.v6, ap->fa.sin_port,
2264 &ap->la.sin_addr.v6, ap->la.sin_port,
2265 &vestige)) {
2266 if (ap->op == 2) {
2267 vtw_restart(&vestige);
2268 }
2269 rc = 0;
2270 } else
2271 rc = ESRCH;
2272 break;
2273 default:
2274 rc = EINVAL;
2275 }
2276 break;
2277
2278 default:
2279 rc = EINVAL;
2280 }
2281
2282 mutex_exit(softnet_lock);
2283 return rc;
2284 }
2285
2286 struct sys_vtw_args {
2287 syscallarg(const vtw_sysargs_t *) req;
2288 syscallarg(size_t) len;
2289 };
2290
2291 static int
2292 vtw_sys(struct lwp *l, const void *_, register_t *retval)
2293 {
2294 const struct sys_vtw_args *uap = _;
2295 void *buf;
2296 int rc;
2297 size_t len = SCARG(uap, len);
2298
2299 if (len != sizeof (vtw_sysargs_t))
2300 return EINVAL;
2301
2302 buf = kmem_alloc(len, KM_SLEEP);
2303 if (!buf)
2304 return ENOMEM;
2305
2306 rc = copyin(SCARG(uap, req), buf, len);
2307 if (!rc) {
2308 rc = vtw_debug_process(buf);
2309 }
2310 kmem_free(buf, len);
2311
2312 return rc;
2313 }
2314
2315 static void
2316 vtw_sanity_check(void)
2317 {
2318 vtw_ctl_t *ctl;
2319 vtw_t *vtw;
2320 int i;
2321 int n;
2322
2323 for (i = 0; i < VTW_NCLASS; ++i) {
2324 ctl = &vtw_tcpv4[i];
2325
2326 if (!ctl->base.v || ctl->nalloc)
2327 continue;
2328
2329 for (n = 0, vtw = ctl->base.v; ; ) {
2330 ++n;
2331 vtw = vtw_next(ctl, vtw);
2332 if (vtw == ctl->base.v)
2333 break;
2334 }
2335 db_trace(KTR_VTW
2336 , (ctl, "sanity: class %x n %x nfree %x"
2337 , i, n, ctl->nfree));
2338
2339 KASSERT(n == ctl->nfree);
2340 }
2341
2342 for (i = 0; i < VTW_NCLASS; ++i) {
2343 ctl = &vtw_tcpv6[i];
2344
2345 if (!ctl->base.v || ctl->nalloc)
2346 continue;
2347
2348 for (n = 0, vtw = ctl->base.v; ; ) {
2349 ++n;
2350 vtw = vtw_next(ctl, vtw);
2351 if (vtw == ctl->base.v)
2352 break;
2353 }
2354 db_trace(KTR_VTW
2355 , (ctl, "sanity: class %x n %x nfree %x"
2356 , i, n, ctl->nfree));
2357 KASSERT(n == ctl->nfree);
2358 }
2359 }
2360
2361 /*!\brief Initialise debug support.
2362 */
2363 static void
2364 vtw_debug_init(void)
2365 {
2366 int i;
2367
2368 vtw_sanity_check();
2369
2370 if (vtw_syscall)
2371 return;
2372
2373 for (i = 511; i; --i) {
2374 if (sysent[i].sy_call == sys_nosys) {
2375 sysent[i].sy_call = vtw_sys;
2376 sysent[i].sy_narg = 2;
2377 sysent[i].sy_argsize = sizeof (struct sys_vtw_args);
2378 sysent[i].sy_flags = 0;
2379
2380 vtw_syscall = i;
2381 break;
2382 }
2383 }
2384 if (i) {
2385 const struct sysctlnode *node;
2386 uint32_t flags;
2387
2388 flags = sysctl_root.sysctl_flags;
2389
2390 sysctl_root.sysctl_flags |= CTLFLAG_READWRITE;
2391 sysctl_root.sysctl_flags &= ~CTLFLAG_PERMANENT;
2392
2393 sysctl_createv(0, 0, 0, &node,
2394 CTLFLAG_PERMANENT, CTLTYPE_NODE,
2395 "koff",
2396 SYSCTL_DESCR("Kernel Obscure Feature Finder"),
2397 0, 0, 0, 0, CTL_CREATE, CTL_EOL);
2398
2399 if (!node) {
2400 sysctl_createv(0, 0, 0, &node,
2401 CTLFLAG_PERMANENT, CTLTYPE_NODE,
2402 "koffka",
2403 SYSCTL_DESCR("The Real(tm) Kernel"
2404 " Obscure Feature Finder"),
2405 0, 0, 0, 0, CTL_CREATE, CTL_EOL);
2406 }
2407 if (node) {
2408 sysctl_createv(0, 0, 0, 0,
2409 CTLFLAG_PERMANENT|CTLFLAG_READONLY,
2410 CTLTYPE_INT, "vtw_debug_syscall",
2411 SYSCTL_DESCR("vtw debug"
2412 " system call number"),
2413 0, 0, &vtw_syscall, 0, node->sysctl_num,
2414 CTL_CREATE, CTL_EOL);
2415 }
2416 sysctl_root.sysctl_flags = flags;
2417 }
2418 }
2419 #else /* !VTW_DEBUG */
2420 static void
2421 vtw_debug_init(void)
2422 {
2423 return;
2424 }
2425 #endif /* !VTW_DEBUG */
2426