/*
 * Copyright (c) 2011 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Coyote Point Systems, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */
#include <sys/cdefs.h>

#include "opt_ddb.h"
#include "opt_inet.h"
#include "opt_ipsec.h"
#include "opt_inet_csum.h"
#include "opt_tcp_debug.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/malloc.h>
#include <sys/kmem.h>
#include <sys/mbuf.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/errno.h>
#include <sys/syslog.h>
#include <sys/pool.h>
#include <sys/domain.h>
#include <sys/kernel.h>
#include <net/if.h>
#include <net/route.h>
#include <net/if_types.h>

#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <netinet/ip.h>
#include <netinet/in_pcb.h>
#include <netinet/in_var.h>
#include <netinet/ip_var.h>
#include <netinet/in_offload.h>
#include <netinet/ip6.h>
#include <netinet6/ip6_var.h>
#include <netinet6/in6_pcb.h>
#include <netinet6/in6_var.h>
#include <netinet/icmp6.h>
#include <netinet6/nd6.h>

#include <netinet/tcp.h>
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet/tcp_private.h>
#include <netinet/tcpip.h>

#include <machine/stdarg.h>
#include <netinet/tcp_vtw.h>

__KERNEL_RCSID(0, "$NetBSD: tcp_vtw.c,v 1.3 2011/05/11 15:08:59 drochner Exp $");

#define db_trace(__a, __b) do { } while (/*CONSTCOND*/0)

static void vtw_debug_init(void);

fatp_ctl_t	fat_tcpv4;
fatp_ctl_t	fat_tcpv6;
vtw_ctl_t	vtw_tcpv4[VTW_NCLASS];
vtw_ctl_t	vtw_tcpv6[VTW_NCLASS];
vtw_stats_t	vtw_stats;

/* We provide state for the lookup_ports iterator.
 * As we are currently protected by softnet_lock, one instance suffices.
 * If locking were finer-grained, we would want one per CPU.
 * I do not want to be in the business of alloc/free.
 * The best alternative would be to allocate on the caller's
 * stack, but that would require the caller to know the struct,
 * or at least its size.
 * See how she goes.
 */
struct tcp_ports_iterator {
	union {
		struct in_addr	v4;
		struct in6_addr	v6;
	}		addr;
	u_int		port;

	uint32_t	wild	: 1;

	vtw_ctl_t	*ctl;
	fatp_t		*fp;

	uint16_t	slot_idx;
	uint16_t	ctl_idx;
};

static struct tcp_ports_iterator tcp_ports_iterator_v4;
static struct tcp_ports_iterator tcp_ports_iterator_v6;

static int vtw_age(vtw_ctl_t *, struct timeval *);

/*!\brief allocate a fat pointer from a collection.
 */
static fatp_t *
fatp_alloc(fatp_ctl_t *fat)
{
	fatp_t	*fp = 0;

	if (fat->nfree) {
		fp = fat->free;
		if (fp) {
			fat->free = fatp_next(fat, fp);
			--fat->nfree;
			++fat->nalloc;
			fp->nxt = 0;

			KASSERT(!fp->inuse);
		}
	}

	return fp;
}

/*!\brief free a fat pointer.
 */
static void
fatp_free(fatp_ctl_t *fat, fatp_t *fp)
{
	if (fp) {
		KASSERT(!fp->inuse);
		KASSERT(!fp->nxt);

		fp->nxt = fatp_index(fat, fat->free);
		fat->free = fp;

		++fat->nfree;
		--fat->nalloc;
	}
}

/*!\brief initialise a collection of fat pointers.
 *
 *\param n	total # fat pointers to allocate
 *\param m	# hash buckets (must be a power of 2)
 *
 * We allocate 2x as much, as we have two hashes: full and lport only.
 */
static void
fatp_init(fatp_ctl_t *fat, uint32_t n, uint32_t m)
{
	fatp_t	*fp;

	KASSERT(n <= FATP_MAX / 2);

	fat->hash = kmem_alloc(2*m * sizeof (fatp_t *), KM_SLEEP);
	fat->base = kmem_alloc(2*n * sizeof (fatp_t), KM_SLEEP);

	if (!fat->base) {
		if (fat->hash)
			kmem_free(fat->hash, 2*m * sizeof (fatp_t *));

		bzero(fat, sizeof (*fat));
		return;
	}

	fat->port = &fat->hash[m];

	fat->mask = m - 1;	// ASSERT is power of 2 (m)
	fat->lim = fat->base + 2*n - 1;
	fat->nfree = 0;
	fat->nalloc = 2*n;

	bzero(fat->hash, 2*m * sizeof (fatp_t *));
	bzero(fat->base, 2*n * sizeof (fatp_t));

	/* Initialise the free list.
	 */
	for (fp = fat->lim; fp >= fat->base; --fp) {
		fatp_free(fat, fp);
	}
}
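
/* Sizing note: vtw_control_init() below calls fatp_init(fat, n, m)
 * with m = 512 hash buckets and n = 2*m plus tcp_vtw_entries /
 * fatp_ntags() with ~10% slack, so each of the two hashes (full and
 * lport-only) gets m buckets, and the free list starts with 2*n fat
 * pointers shared between them.
 */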

/*
 * The `xtra' is XORed into the tag stored.
 */
static uint32_t fatp_xtra[] = {
	0x11111111,0x22222222,0x33333333,0x44444444,
	0x55555555,0x66666666,0x77777777,0x88888888,
	0x12121212,0x21212121,0x34343434,0x43434343,
	0x56565656,0x65656565,0x78787878,0x87878787,
	0x11221122,0x22112211,0x33443344,0x44334433,
	0x55665566,0x66556655,0x77887788,0x88778877,
	0x11112222,0x22221111,0x33334444,0x44443333,
	0x55556666,0x66665555,0x77778888,0x88887777,
};
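
/* The per-slot constants decorrelate the slots: the same {tag,index}
 * pair stores a different value in each slot, so a probe XORing with
 * the wrong slot's constant is unlikely to produce a well-formed
 * idx_encode() value (see idx_decode() below for the rejection test).
 */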

/*!\brief turn a {fatp_t*,slot} into an integral key.
 *
 * The key can be used to obtain the fatp_t, and the slot,
 * as it directly encodes them.
 */
static inline uint32_t
fatp_key(fatp_ctl_t *fat, fatp_t *fp, uint32_t slot)
{
	CTASSERT(CACHE_LINE_SIZE == 32 ||
		 CACHE_LINE_SIZE == 64 ||
		 CACHE_LINE_SIZE == 128);

	switch (fatp_ntags()) {
	case 7:
		return (fatp_index(fat, fp) << 3) | slot;
	case 15:
		return (fatp_index(fat, fp) << 4) | slot;
	case 31:
		return (fatp_index(fat, fp) << 5) | slot;
	default:
		KASSERT(0 && "no support, for no good reason");
		return ~0;
	}
}

static inline uint32_t
fatp_slot_from_key(fatp_ctl_t *fat, uint32_t key)
{
	CTASSERT(CACHE_LINE_SIZE == 32 ||
		 CACHE_LINE_SIZE == 64 ||
		 CACHE_LINE_SIZE == 128);

	switch (fatp_ntags()) {
	case 7:
		return key & 7;
	case 15:
		return key & 15;
	case 31:
		return key & 31;
	default:
		KASSERT(0 && "no support, for no good reason");
		return ~0;
	}
}

static inline fatp_t *
fatp_from_key(fatp_ctl_t *fat, uint32_t key)
{
	CTASSERT(CACHE_LINE_SIZE == 32 ||
		 CACHE_LINE_SIZE == 64 ||
		 CACHE_LINE_SIZE == 128);

	switch (fatp_ntags()) {
	case 7:
		key >>= 3;
		break;
	case 15:
		key >>= 4;
		break;
	case 31:
		key >>= 5;
		break;
	default:
		KASSERT(0 && "no support, for no good reason");
		return 0;
	}

	return key ? fat->base + key - 1 : 0;
}
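
/* Worked example, assuming fatp_ntags() == 7 (32-byte cache lines):
 * for fp == fat->base + 4 and slot 2, fatp_index() evaluates to 5
 * (offset + 1; note fatp_from_key() undoes the +1), so the key is
 * (5 << 3) | 2 == 0x2a.  fatp_from_key(fat, 0x2a) recovers
 * fat->base + 4 and fatp_slot_from_key(fat, 0x2a) recovers slot 2.
 * No real entry ever yields key 0, so 0 can mean "no entry".
 */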

static inline uint32_t
idx_encode(vtw_ctl_t *ctl, uint32_t idx)
{
	return (idx << ctl->idx_bits) | idx;
}

static inline uint32_t
idx_decode(vtw_ctl_t *ctl, uint32_t bits)
{
	uint32_t	idx = bits & ctl->idx_mask;

	if (idx_encode(ctl, idx) == bits)
		return idx;
	else
		return ~0;
}
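
/* Worked example: with idx_bits == 13 and idx_mask == 0x1fff (see
 * vtw_init() for how these derive from the arena size), index 0x123
 * encodes as (0x123 << 13) | 0x123 == 0x246123.  idx_decode() accepts
 * only values whose high bits replicate the low bits, so a stale or
 * foreign value almost always decodes to ~0 and is rejected cheaply.
 */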

/*!\brief insert index into fatp hash
 *
 *\param idx	- index of element being placed in hash chain
 *\param tag	- 32-bit tag identifier
 *
 *\returns
 *	value which can be used to locate entry.
 *
 *\note
 *	we rely on the fact that there are unused high bits in the index
 *	for verification purposes on lookup.
 */
static inline uint32_t
fatp_vtw_inshash(fatp_ctl_t *fat, uint32_t idx, uint32_t tag, int which,
		 void *dbg)
{
	fatp_t	*fp;
	fatp_t	**hash = (which ? fat->port : fat->hash);
	int	i;

	fp = hash[tag & fat->mask];

	while (!fp || fatp_full(fp)) {
		fatp_t	*fq;

		/* All entries are inuse at the top level.
		 * We allocate a spare, and push the top level
		 * down one.  All entries in the fp we push down
		 * (think of a tapeworm here) will be expelled sooner than
		 * any entries added subsequently to this hash bucket.
		 * This is a property of the time waits we are exploiting.
		 */

		fq = fatp_alloc(fat);
		if (!fq) {
			vtw_age(fat->vtw, 0);
			fp = hash[tag & fat->mask];
			continue;
		}

		fq->inuse = 0;
		fq->nxt = fatp_index(fat, fp);

		hash[tag & fat->mask] = fq;

		fp = fq;
	}

	KASSERT(!fatp_full(fp));

	/* Fill highest index first.  Lookup is lowest first.
	 */
	for (i = fatp_ntags(); --i >= 0; ) {
		if (!((1 << i) & fp->inuse)) {
			break;
		}
	}

	fp->inuse |= 1 << i;
	fp->tag[i] = tag ^ idx_encode(fat->vtw, idx) ^ fatp_xtra[i];

	db_trace(KTR_VTW
		 , (fp, "fat: inuse %5.5x tag[%x] %8.8x"
		    , fp->inuse
		    , i, fp->tag[i]));

	return fatp_key(fat, fp, i);
}
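
/* Lookup counterpart: a probe XORs fp->tag[i] with the probe tag and
 * fatp_xtra[i].  For a true match this yields idx_encode(idx), which
 * idx_decode() accepts; a mismatch almost always leaves inconsistent
 * high bits and is rejected without dereferencing the vtw entry --
 * that is the "hopefully fast path" in the lookup routines below.
 */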

static inline int
vtw_alive(const vtw_t *vtw)
{
	return vtw->hashed && vtw->expire.tv_sec;
}

static inline uint32_t
vtw_index_v4(vtw_ctl_t *ctl, vtw_v4_t *v4)
{
	if (ctl->base.v4 <= v4 && v4 <= ctl->lim.v4)
		return v4 - ctl->base.v4;

	KASSERT(0 && "vtw out of bounds");

	return ~0;
}

static inline uint32_t
vtw_index_v6(vtw_ctl_t *ctl, vtw_v6_t *v6)
{
	if (ctl->base.v6 <= v6 && v6 <= ctl->lim.v6)
		return v6 - ctl->base.v6;

	KASSERT(0 && "vtw out of bounds");

	return ~0;
}

static inline uint32_t
vtw_index(vtw_ctl_t *ctl, vtw_t *vtw)
{
	if (ctl->clidx)
		ctl = ctl->ctl;

	if (ctl->is_v4)
		return vtw_index_v4(ctl, (vtw_v4_t *)vtw);

	if (ctl->is_v6)
		return vtw_index_v6(ctl, (vtw_v6_t *)vtw);

	KASSERT(0 && "neither 4 nor 6. most curious.");

	return ~0;
}

static inline vtw_t *
vtw_from_index(vtw_ctl_t *ctl, uint32_t idx)
{
	if (ctl->clidx)
		ctl = ctl->ctl;

	/* See if the index looks like it might be an index.
	 * Bits outside of the valid index bits are a giveaway.
	 */
	idx = idx_decode(ctl, idx);

	if (idx == ~0) {
		return 0;
	} else if (ctl->is_v4) {
		vtw_v4_t	*vtw = ctl->base.v4 + idx;

		return (ctl->base.v4 <= vtw && vtw <= ctl->lim.v4)
			? &vtw->common : 0;
	} else if (ctl->is_v6) {
		vtw_v6_t	*vtw = ctl->base.v6 + idx;

		return (ctl->base.v6 <= vtw && vtw <= ctl->lim.v6)
			? &vtw->common : 0;
	} else {
		KASSERT(0 && "badness");
		return 0;
	}
}

/*!\brief return the next vtw after this one.
 *
 * Due to the differing sizes of the entries in differing
 * arenas, we have to ensure we ++ the correct pointer type.
 *
 * Also handles wrap.
 */
static inline vtw_t *
vtw_next(vtw_ctl_t *ctl, vtw_t *vtw)
{
	if (ctl->is_v4) {
		vtw_v4_t	*v4 = (void*)vtw;

		vtw = &(++v4)->common;
	} else {
		vtw_v6_t	*v6 = (void*)vtw;

		vtw = &(++v6)->common;
	}

	if (vtw > ctl->lim.v)
		vtw = ctl->base.v;

	return vtw;
}

/*!\brief remove entry from FATP hash chains
 */
static inline void
vtw_unhash(vtw_ctl_t *ctl, vtw_t *vtw)
{
	fatp_ctl_t	*fat = ctl->fat;
	fatp_t		*fp;
	uint32_t	key = vtw->key;
	uint32_t	tag, slot, idx;
	vtw_v4_t	*v4 = (void*)vtw;
	vtw_v6_t	*v6 = (void*)vtw;

	if (!vtw->hashed) {
		KASSERT(0 && "unhashed");
		return;
	}

	if (fat->vtw->is_v4) {
		tag = v4_tag(v4->faddr, v4->fport, v4->laddr, v4->lport);
	} else if (fat->vtw->is_v6) {
		tag = v6_tag(&v6->faddr, v6->fport, &v6->laddr, v6->lport);
	} else {
		tag = 0;
		KASSERT(0 && "not reached");
	}

	/* Remove from fat->hash[]
	 */
	slot = fatp_slot_from_key(fat, key);
	fp = fatp_from_key(fat, key);
	idx = vtw_index(ctl, vtw);

	db_trace(KTR_VTW
		 , (fp, "fat: del inuse %5.5x slot %x idx %x key %x tag %x"
		    , fp->inuse, slot, idx, key, tag));

	KASSERT(fp->inuse & (1 << slot));
	KASSERT(fp->tag[slot] == (tag ^ idx_encode(ctl, idx)
				  ^ fatp_xtra[slot]));

	if ((fp->inuse & (1 << slot))
	    && fp->tag[slot] == (tag ^ idx_encode(ctl, idx)
				 ^ fatp_xtra[slot])) {
		fp->inuse ^= 1 << slot;
		fp->tag[slot] = 0;

		/* When we delete entries, we do not compact.  This is
		 * due to temporality.  We add entries, and they
		 * (eventually) expire.  Older entries will be further
		 * down the chain.
		 */
		if (!fp->inuse) {
			uint32_t hi = tag & fat->mask;
			fatp_t	*fq = 0;
			fatp_t	*fr = fat->hash[hi];

			while (fr && fr != fp) {
				fr = fatp_next(fat, fq = fr);
			}

			if (fr == fp) {
				if (fq) {
					fq->nxt = fp->nxt;
					fp->nxt = 0;
					fatp_free(fat, fp);
				} else {
					KASSERT(fat->hash[hi] == fp);

					if (fp->nxt) {
						fat->hash[hi]
							= fatp_next(fat, fp);
						fp->nxt = 0;
						fatp_free(fat, fp);
					} else {
						/* retain for next use.
						 */
						;
					}
				}
			} else {
				fr = fat->hash[hi];

				do {
					db_trace(KTR_VTW
						 , (fr
						    , "fat:*del inuse %5.5x"
						    " nxt %x"
						    , fr->inuse, fr->nxt));

					fr = fatp_next(fat, fq = fr);
				} while (fr && fr != fp);

				KASSERT(0 && "oops");
			}
		}
		vtw->key ^= ~0;
	}

	if (fat->vtw->is_v4) {
		tag = v4_port_tag(v4->lport);
	} else if (fat->vtw->is_v6) {
		tag = v6_port_tag(v6->lport);
	}

	/* Remove from fat->port[]
	 */
	key = vtw->port_key;
	slot = fatp_slot_from_key(fat, key);
	fp = fatp_from_key(fat, key);
	idx = vtw_index(ctl, vtw);

	db_trace(KTR_VTW
		 , (fp, "fatport: del inuse %5.5x"
		    " slot %x idx %x key %x tag %x"
		    , fp->inuse, slot, idx, key, tag));

	KASSERT(fp->inuse & (1 << slot));
	KASSERT(fp->tag[slot] == (tag ^ idx_encode(ctl, idx)
				  ^ fatp_xtra[slot]));

	if ((fp->inuse & (1 << slot))
	    && fp->tag[slot] == (tag ^ idx_encode(ctl, idx)
				 ^ fatp_xtra[slot])) {
		fp->inuse ^= 1 << slot;
		fp->tag[slot] = 0;

		if (!fp->inuse) {
			uint32_t hi = tag & fat->mask;
			fatp_t	*fq = 0;
			fatp_t	*fr = fat->port[hi];

			while (fr && fr != fp) {
				fr = fatp_next(fat, fq = fr);
			}

			if (fr == fp) {
				if (fq) {
					fq->nxt = fp->nxt;
					fp->nxt = 0;
					fatp_free(fat, fp);
				} else {
					KASSERT(fat->port[hi] == fp);

					if (fp->nxt) {
						fat->port[hi]
							= fatp_next(fat, fp);
						fp->nxt = 0;
						fatp_free(fat, fp);
					} else {
						/* retain for next use.
						 */
						;
					}
				}
			}
		}
		vtw->port_key ^= ~0;
	}

	vtw->hashed = 0;
}

/*!\brief remove entry from hash, possibly free.
 */
void
vtw_del(vtw_ctl_t *ctl, vtw_t *vtw)
{
	KASSERT(mutex_owned(softnet_lock));

	if (vtw->hashed) {
		++vtw_stats.del;
		vtw_unhash(ctl, vtw);
	}

	/* We only delete the oldest entry.
	 */
	if (vtw != ctl->oldest.v)
		return;

	--ctl->nalloc;
	++ctl->nfree;

	vtw->expire.tv_sec = 0;
	vtw->expire.tv_usec = ~0;

	if (!ctl->nalloc)
		ctl->oldest.v = 0;
	else
		ctl->oldest.v = vtw_next(ctl, vtw);
}

/*!\brief insert vestigial timewait in hash chain
 */
static void
vtw_inshash_v4(vtw_ctl_t *ctl, vtw_t *vtw)
{
	uint32_t idx = vtw_index(ctl, vtw);
	uint32_t tag;
	vtw_v4_t *v4 = (void*)vtw;

	KASSERT(mutex_owned(softnet_lock));
	KASSERT(!vtw->hashed);
	KASSERT(ctl->clidx == vtw->msl_class);

	++vtw_stats.ins;

	tag = v4_tag(v4->faddr, v4->fport,
		     v4->laddr, v4->lport);

	vtw->key = fatp_vtw_inshash(ctl->fat, idx, tag, 0, vtw);

	db_trace(KTR_VTW, (ctl
			   , "vtw: ins %8.8x:%4.4x %8.8x:%4.4x"
			   " tag %8.8x key %8.8x"
			   , v4->faddr, v4->fport
			   , v4->laddr, v4->lport
			   , tag
			   , vtw->key));

	tag = v4_port_tag(v4->lport);
	vtw->port_key = fatp_vtw_inshash(ctl->fat, idx, tag, 1, vtw);

	db_trace(KTR_VTW, (ctl, "vtw: ins %P - %4.4x tag %8.8x key %8.8x"
			   , v4->lport, v4->lport
			   , tag
			   , vtw->key));

	vtw->hashed = 1;
}

/*!\brief insert vestigial timewait in hash chain
 */
static void
vtw_inshash_v6(vtw_ctl_t *ctl, vtw_t *vtw)
{
	uint32_t idx = vtw_index(ctl, vtw);
	uint32_t tag;
	vtw_v6_t *v6 = (void*)vtw;

	KASSERT(mutex_owned(softnet_lock));
	KASSERT(!vtw->hashed);
	KASSERT(ctl->clidx == vtw->msl_class);

	++vtw_stats.ins;

	tag = v6_tag(&v6->faddr, v6->fport,
		     &v6->laddr, v6->lport);

	vtw->key = fatp_vtw_inshash(ctl->fat, idx, tag, 0, vtw);

	tag = v6_port_tag(v6->lport);
	vtw->port_key = fatp_vtw_inshash(ctl->fat, idx, tag, 1, vtw);

	db_trace(KTR_VTW, (ctl, "vtw: ins %P - %4.4x tag %8.8x key %8.8x"
			   , v6->lport, v6->lport
			   , tag
			   , vtw->key));

	vtw->hashed = 1;
}

static vtw_t *
vtw_lookup_hash_v4(vtw_ctl_t *ctl, uint32_t faddr, uint16_t fport
		   , uint32_t laddr, uint16_t lport
		   , int which)
{
	vtw_v4_t	*v4;
	vtw_t		*vtw;
	uint32_t	tag;
	fatp_t		*fp;
	int		i;
	uint32_t	fatps = 0, probes = 0, losings = 0;

	if (!ctl || !ctl->fat)
		return 0;

	++vtw_stats.look[which];

	if (which) {
		tag = v4_port_tag(lport);
		fp = ctl->fat->port[tag & ctl->fat->mask];
	} else {
		tag = v4_tag(faddr, fport, laddr, lport);
		fp = ctl->fat->hash[tag & ctl->fat->mask];
	}

	while (fp && fp->inuse) {
		uint32_t	inuse = fp->inuse;

		++fatps;

		for (i = 0; inuse && i < fatp_ntags(); ++i) {
			uint32_t	idx;

			if (!(inuse & (1 << i)))
				continue;

			inuse ^= 1 << i;

			++probes;
			++vtw_stats.probe[which];

			idx = fp->tag[i] ^ tag ^ fatp_xtra[i];
			vtw = vtw_from_index(ctl, idx);

			if (!vtw) {
				/* Hopefully fast path.
				 */
				db_trace(KTR_VTW
					 , (fp, "vtw: fast %A:%P %A:%P"
					    " idx %x tag %x"
					    , faddr, fport
					    , laddr, lport
					    , idx, tag));
				continue;
			}

			v4 = (void*)vtw;

			/* The de-referencing of vtw is what we want to avoid.
			 * Losing.
			 */
			if (vtw_alive(vtw)
			    && ((which ? vtw->port_key : vtw->key)
				== fatp_key(ctl->fat, fp, i))
			    && (which
				|| (v4->faddr == faddr && v4->laddr == laddr
				    && v4->fport == fport))
			    && v4->lport == lport) {
				++vtw_stats.hit[which];

				db_trace(KTR_VTW
					 , (fp, "vtw: hit %8.8x:%4.4x"
					    " %8.8x:%4.4x idx %x key %x"
					    , faddr, fport
					    , laddr, lport
					    , idx_decode(ctl, idx), vtw->key));

				KASSERT(vtw->hashed);

				goto out;
			}
			++vtw_stats.losing[which];
			++losings;

			if (vtw_alive(vtw)) {
				db_trace(KTR_VTW
					 , (fp, "vtw:!mis %8.8x:%4.4x"
					    " %8.8x:%4.4x key %x tag %x"
					    , faddr, fport
					    , laddr, lport
					    , fatp_key(ctl->fat, fp, i)
					    , v4_tag(faddr, fport
						     , laddr, lport)));
				db_trace(KTR_VTW
					 , (vtw, "vtw:!mis %8.8x:%4.4x"
					    " %8.8x:%4.4x key %x tag %x"
					    , v4->faddr, v4->fport
					    , v4->laddr, v4->lport
					    , vtw->key
					    , v4_tag(v4->faddr, v4->fport
						     , v4->laddr, v4->lport)));

				if (vtw->key == fatp_key(ctl->fat, fp, i)) {
					db_trace(KTR_VTW
						 , (vtw, "vtw:!mis %8.8x:%4.4x"
						    " %8.8x:%4.4x key %x"
						    " which %x"
						    , v4->faddr, v4->fport
						    , v4->laddr, v4->lport
						    , vtw->key
						    , which));

				} else {
					db_trace(KTR_VTW
						 , (vtw
						    , "vtw:!mis"
						    " key %8.8x != %8.8x"
						    " idx %x i %x which %x"
						    , vtw->key
						    , fatp_key(ctl->fat, fp, i)
						    , idx_decode(ctl, idx)
						    , i
						    , which));
				}
			} else {
				db_trace(KTR_VTW
					 , (fp
					    , "vtw:!mis free entry"
					    " idx %x vtw %p which %x"
					    , idx_decode(ctl, idx)
					    , vtw, which));
			}
		}

		if (fp->nxt) {
			fp = fatp_next(ctl->fat, fp);
		} else {
			break;
		}
	}
	++vtw_stats.miss[which];
	vtw = 0;
out:
	if (fatps > vtw_stats.max_chain[which])
		vtw_stats.max_chain[which] = fatps;
	if (probes > vtw_stats.max_probe[which])
		vtw_stats.max_probe[which] = probes;
	if (losings > vtw_stats.max_loss[which])
		vtw_stats.max_loss[which] = losings;

	return vtw;
}

static vtw_t *
vtw_lookup_hash_v6(vtw_ctl_t *ctl, const struct in6_addr *faddr, uint16_t fport
		   , const struct in6_addr *laddr, uint16_t lport
		   , int which)
{
	vtw_v6_t	*v6;
	vtw_t		*vtw;
	uint32_t	tag;
	fatp_t		*fp;
	int		i;
	uint32_t	fatps = 0, probes = 0, losings = 0;

	++vtw_stats.look[which];

	if (!ctl || !ctl->fat)
		return 0;

	if (which) {
		tag = v6_port_tag(lport);
		fp = ctl->fat->port[tag & ctl->fat->mask];
	} else {
		tag = v6_tag(faddr, fport, laddr, lport);
		fp = ctl->fat->hash[tag & ctl->fat->mask];
	}

	while (fp && fp->inuse) {
		uint32_t	inuse = fp->inuse;

		++fatps;

		for (i = 0; inuse && i < fatp_ntags(); ++i) {
			uint32_t	idx;

			if (!(inuse & (1 << i)))
				continue;

			inuse ^= 1 << i;

			++probes;
			++vtw_stats.probe[which];

			idx = fp->tag[i] ^ tag ^ fatp_xtra[i];
			vtw = vtw_from_index(ctl, idx);

			db_trace(KTR_VTW
				 , (fp, "probe: %2d %6A:%4.4x %6A:%4.4x idx %x"
				    , i
				    , db_store(faddr, sizeof (*faddr)), fport
				    , db_store(laddr, sizeof (*laddr)), lport
				    , idx_decode(ctl, idx)));

			if (!vtw) {
				/* Hopefully fast path.
				 */
				continue;
			}

			v6 = (void*)vtw;

			if (vtw_alive(vtw)
			    && ((which ? vtw->port_key : vtw->key)
				== fatp_key(ctl->fat, fp, i))
			    && v6->lport == lport
			    && (which
				|| (v6->fport == fport
				    && !bcmp(&v6->faddr, faddr, sizeof (*faddr))
				    && !bcmp(&v6->laddr, laddr
					     , sizeof (*laddr))))) {
				++vtw_stats.hit[which];

				KASSERT(vtw->hashed);
				goto out;
			} else {
				++vtw_stats.losing[which];
				++losings;
			}
		}

		if (fp->nxt) {
			fp = fatp_next(ctl->fat, fp);
		} else {
			break;
		}
	}
	++vtw_stats.miss[which];
	vtw = 0;
out:
	if (fatps > vtw_stats.max_chain[which])
		vtw_stats.max_chain[which] = fatps;
	if (probes > vtw_stats.max_probe[which])
		vtw_stats.max_probe[which] = probes;
	if (losings > vtw_stats.max_loss[which])
		vtw_stats.max_loss[which] = losings;

	return vtw;
}

/*!\brief port iterator
 */
static vtw_t *
vtw_next_port_v4(struct tcp_ports_iterator *it)
{
	vtw_ctl_t	*ctl = it->ctl;
	vtw_v4_t	*v4;
	vtw_t		*vtw;
	uint32_t	tag;
	uint16_t	lport = it->port;
	fatp_t		*fp;
	int		i;
	uint32_t	fatps = 0, probes = 0, losings = 0;

	tag = v4_port_tag(lport);
	if (!it->fp) {
		it->fp = ctl->fat->port[tag & ctl->fat->mask];
		it->slot_idx = 0;
	}
	fp = it->fp;

	while (fp) {
		uint32_t	inuse = fp->inuse;

		++fatps;

		for (i = it->slot_idx; inuse && i < fatp_ntags(); ++i) {
			uint32_t	idx;

			if (!(inuse & (1 << i)))
				continue;

			inuse &= ~0 << i;

			if (i < it->slot_idx)
				continue;

			++vtw_stats.probe[1];
			++probes;

			idx = fp->tag[i] ^ tag ^ fatp_xtra[i];
			vtw = vtw_from_index(ctl, idx);

			if (!vtw) {
				/* Hopefully fast path.
				 */
				continue;
			}

			v4 = (void*)vtw;

			if (vtw_alive(vtw)
			    && vtw->port_key == fatp_key(ctl->fat, fp, i)
			    && v4->lport == lport) {
				++vtw_stats.hit[1];

				it->slot_idx = i + 1;

				goto out;
			} else if (vtw_alive(vtw)) {
				++vtw_stats.losing[1];
				++losings;

				db_trace(KTR_VTW
					 , (vtw, "vtw:!mis"
					    " port %8.8x:%4.4x %8.8x:%4.4x"
					    " key %x port %x"
					    , v4->faddr, v4->fport
					    , v4->laddr, v4->lport
					    , vtw->key
					    , lport));
			} else {
				/* Really losing here.  We are coming
				 * up with references to free entries.
				 * Might find it better to fall back to
				 * the traditional hash, or need another
				 * piece of ad-hockery.  The other
				 * ad-hockery would be to pull more into
				 * the cache line to reject the false
				 * hits.
				 */
				++vtw_stats.losing[1];
				++losings;
				db_trace(KTR_VTW
					 , (fp, "vtw:!mis port %x"
					    " - free entry idx %x vtw %p"
					    , lport
					    , idx_decode(ctl, idx)
					    , vtw));
			}
		}

		if (fp->nxt) {
			it->fp = fp = fatp_next(ctl->fat, fp);
			it->slot_idx = 0;
		} else {
			it->fp = 0;
			break;
		}
	}
	++vtw_stats.miss[1];

	vtw = 0;
out:
	if (fatps > vtw_stats.max_chain[1])
		vtw_stats.max_chain[1] = fatps;
	if (probes > vtw_stats.max_probe[1])
		vtw_stats.max_probe[1] = probes;
	if (losings > vtw_stats.max_loss[1])
		vtw_stats.max_loss[1] = losings;

	return vtw;
}

/*!\brief port iterator
 */
static vtw_t *
vtw_next_port_v6(struct tcp_ports_iterator *it)
{
	vtw_ctl_t	*ctl = it->ctl;
	vtw_v6_t	*v6;
	vtw_t		*vtw;
	uint32_t	tag;
	uint16_t	lport = it->port;
	fatp_t		*fp;
	int		i;
	uint32_t	fatps = 0, probes = 0, losings = 0;

	tag = v6_port_tag(lport);
	if (!it->fp) {
		it->fp = ctl->fat->port[tag & ctl->fat->mask];
		it->slot_idx = 0;
	}
	fp = it->fp;

	while (fp) {
		uint32_t	inuse = fp->inuse;

		++fatps;

		for (i = it->slot_idx; inuse && i < fatp_ntags(); ++i) {
			uint32_t	idx;

			if (!(inuse & (1 << i)))
				continue;

			inuse &= ~0 << i;

			if (i < it->slot_idx)
				continue;

			++vtw_stats.probe[1];
			++probes;

			idx = fp->tag[i] ^ tag ^ fatp_xtra[i];
			vtw = vtw_from_index(ctl, idx);

			if (!vtw) {
				/* Hopefully fast path.
				 */
				continue;
			}

			v6 = (void*)vtw;

			db_trace(KTR_VTW
				 , (vtw, "vtw: i %x idx %x fp->tag %x"
				    " tag %x xtra %x"
				    , i, idx_decode(ctl, idx)
				    , fp->tag[i], tag, fatp_xtra[i]));

			if (vtw_alive(vtw)
			    && vtw->port_key == fatp_key(ctl->fat, fp, i)
			    && v6->lport == lport) {
				++vtw_stats.hit[1];

				db_trace(KTR_VTW
					 , (fp, "vtw: nxt port %P - %4.4x"
					    " idx %x key %x"
					    , lport, lport
					    , idx_decode(ctl, idx), vtw->key));

				it->slot_idx = i + 1;
				goto out;
			} else if (vtw_alive(vtw)) {
				++vtw_stats.losing[1];
				++losings;

				db_trace(KTR_VTW
					 , (vtw, "vtw:!mis port %6A:%4.4x"
					    " %6A:%4.4x key %x port %x"
					    , db_store(&v6->faddr
						       , sizeof (v6->faddr))
					    , v6->fport
					    , db_store(&v6->laddr
						       , sizeof (v6->laddr))
					    , v6->lport
					    , vtw->key
					    , lport));
			} else {
				/* Really losing here.  We are coming
				 * up with references to free entries.
				 * Might find it better to fall back to
				 * the traditional hash, or need another
				 * piece of ad-hockery.  The other
				 * ad-hockery would be to pull more into
				 * the cache line to reject the false
				 * hits.
				 */
				++vtw_stats.losing[1];
				++losings;

				db_trace(KTR_VTW
					 , (fp
					    , "vtw:!mis port %x"
					    " - free entry idx %x vtw %p"
					    , lport, idx_decode(ctl, idx)
					    , vtw));
			}
		}

		if (fp->nxt) {
			it->fp = fp = fatp_next(ctl->fat, fp);
			it->slot_idx = 0;
		} else {
			it->fp = 0;
			break;
		}
	}
	++vtw_stats.miss[1];

	vtw = 0;
out:
	if (fatps > vtw_stats.max_chain[1])
		vtw_stats.max_chain[1] = fatps;
	if (probes > vtw_stats.max_probe[1])
		vtw_stats.max_probe[1] = probes;
	if (losings > vtw_stats.max_loss[1])
		vtw_stats.max_loss[1] = losings;

	return vtw;
}

/*!\brief initialise the VTW allocation arena
 *
 * There are 1+3 allocation classes:
 *	0	classless
 *	{1,2,3}	MSL-class based allocation
 *
 * The allocation arenas are all initialised.  Classless gets all the
 * space.  MSL-class based divides the arena, so that allocation
 * within a class can proceed without having to consider entries
 * (aka: cache lines) from different classes.
 *
 * Usually, we are completely classless or class-based, but there can be
 * transition periods, corresponding to dynamic adjustments in the config
 * by the operator.
 */
static void
vtw_init(fatp_ctl_t *fat, vtw_ctl_t *ctl, uint32_t n)
{
	int i;
	int sz = (ctl->is_v4 ? sizeof (vtw_v4_t) : sizeof (vtw_v6_t));

	ctl->base.v4 = kmem_alloc(n * sz, KM_SLEEP);
	if (ctl->base.v4) {
		vtw_t	*base;
		int	class_n;

		bzero(ctl->base.v4, n * sz);

		if (ctl->is_v4) {
			ctl->lim.v4 = ctl->base.v4 + n - 1;
			ctl->alloc.v4 = ctl->base.v4;
		} else {
			ctl->lim.v6 = ctl->base.v6 + n - 1;
			ctl->alloc.v6 = ctl->base.v6;
		}

		ctl->nfree = n;
		ctl->ctl = ctl;

		ctl->idx_bits = 32;
		for (ctl->idx_mask = ~0; (ctl->idx_mask & (n-1)) == n-1; ) {
			ctl->idx_mask >>= 1;
			ctl->idx_bits -= 1;
		}

		ctl->idx_mask <<= 1;
		ctl->idx_mask |= 1;
		ctl->idx_bits += 1;
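
		/* idx_mask is now the smallest all-ones mask covering
		 * n-1, and idx_bits its width: e.g. n == 8192 yields
		 * idx_mask == 0x1fff and idx_bits == 13, the values
		 * idx_encode() and idx_decode() above depend on.
		 */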

		ctl->fat = fat;
		fat->vtw = ctl;

		/* Divide the resources equally amongst the classes.
		 * This is not optimal, as the different classes
		 * arrive and leave at different rates, but it is
		 * the best I can do for now.
		 */
		class_n = n / (VTW_NCLASS-1);
		base = ctl->base.v;

		for (i = 1; i < VTW_NCLASS; ++i) {
			int j;

			ctl[i] = ctl[0];
			ctl[i].clidx = i;

			ctl[i].base.v = base;
			ctl[i].alloc = ctl[i].base;

			for (j = 0; j < class_n - 1; ++j) {
				if (tcp_msl_enable)
					base->msl_class = i;
				base = vtw_next(ctl, base);
			}

			ctl[i].lim.v = base;
			base = vtw_next(ctl, base);
			ctl[i].nfree = class_n;
		}
	}

	vtw_debug_init();
}

/*!\brief map class to TCP MSL
 */
static inline uint32_t
class_to_msl(int class)
{
	switch (class) {
	case 0:
	case 1:
		return tcp_msl_remote ? tcp_msl_remote : (TCPTV_MSL >> 0);
	case 2:
		return tcp_msl_local ? tcp_msl_local : (TCPTV_MSL >> 1);
	default:
		return tcp_msl_loop ? tcp_msl_loop : (TCPTV_MSL >> 2);
	}
}

/*!\brief map TCP MSL to class
 */
static inline uint32_t
msl_to_class(int msl)
{
	if (tcp_msl_enable) {
		if (msl <= (tcp_msl_loop ? tcp_msl_loop : (TCPTV_MSL >> 2)))
			return 1+2;
		if (msl <= (tcp_msl_local ? tcp_msl_local : (TCPTV_MSL >> 1)))
			return 1+1;
		return 1;
	}
	return 0;
}
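
/* Example of the mapping, assuming no sysctl overrides: classes {0,1},
 * 2, and 3 use an MSL of TCPTV_MSL, TCPTV_MSL/2, and TCPTV_MSL/4
 * respectively -- remote peers wait longest, loopback shortest --
 * and msl_to_class() inverts this, reserving class 0 for the
 * classless (tcp_msl_enable == 0) scheme.
 */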

/*!\brief allocate a vtw entry
 */
static inline vtw_t *
vtw_alloc(vtw_ctl_t *ctl)
{
	vtw_t	*vtw = 0;
	int	stuck = 0;
	int	avail = ctl ? (ctl->nalloc + ctl->nfree) : 0;
	int	msl;

	KASSERT(mutex_owned(softnet_lock));

	/* If no resources, we will not get far.
	 */
	if (!ctl || !ctl->base.v4 || avail <= 0)
		return 0;

	/* Obtain a free one.
	 */
	while (!ctl->nfree) {
		vtw_age(ctl, 0);

		if (++stuck > avail) {
			/* When in transition between
			 * schemes (classless, classed) we
			 * can be stuck having to await the
			 * expiration of cross-allocated entries.
			 *
			 * Returning zero means we will fall back to the
			 * traditional TIME_WAIT handling, except in the
			 * case of a re-sched, in which case we cannot
			 * perform the re-sched, but will retain the extant
			 * entry.
			 */
			db_trace(KTR_VTW
				 , (ctl, "vtw:!none free in class %x %x/%x"
				    , ctl->clidx
				    , ctl->nalloc, ctl->nfree));

			return 0;
		}
	}

	vtw = ctl->alloc.v;

	if (vtw->msl_class != ctl->clidx) {
		/* Usurping rules:
		 *	0 -> {1,2,3} or {1,2,3} -> 0
		 */
		KASSERT(!vtw->msl_class || !ctl->clidx);

		if (vtw->hashed || vtw->expire.tv_sec) {
			/* As this is owned by some other class,
			 * we must wait for it to expire.
			 * This will only happen on class/classless
			 * transitions, which are guaranteed to progress
			 * to completion in small finite time, barring bugs.
			 */
			db_trace(KTR_VTW
				 , (ctl, "vtw:!%p class %x!=%x %x:%x%s"
				    , vtw, vtw->msl_class, ctl->clidx
				    , vtw->expire.tv_sec
				    , vtw->expire.tv_usec
				    , vtw->hashed ? " hashed" : ""));

			return 0;
		}

		db_trace(KTR_VTW
			 , (ctl, "vtw:!%p usurped from %x to %x"
			    , vtw, vtw->msl_class, ctl->clidx));

		vtw->msl_class = ctl->clidx;
	}

	if (vtw_alive(vtw)) {
		KASSERT(0 && "next free not free");
		return 0;
	}

	/* Advance allocation pointer.
	 */
	ctl->alloc.v = vtw_next(ctl, vtw);

	--ctl->nfree;
	++ctl->nalloc;

	msl = (2 * class_to_msl(ctl->clidx) * 1000) / PR_SLOWHZ;	// msec

	/* mark expiration
	 */
	getmicrouptime(&vtw->expire);

	/* Move expiration into the future.
	 */
	vtw->expire.tv_sec += msl / 1000;
	vtw->expire.tv_usec += 1000 * (msl % 1000);

	while (vtw->expire.tv_usec >= 1000*1000) {
		vtw->expire.tv_usec -= 1000*1000;
		vtw->expire.tv_sec += 1;
	}

	if (!ctl->oldest.v)
		ctl->oldest.v = vtw;

	return vtw;
}
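
/* Design note: entries are handed out round-robin from the ring
 * (ctl->alloc.v advances via vtw_next()), and all entries in a class
 * share the same 2*MSL lifetime, so expiry order matches allocation
 * order.  That FIFO property is why vtw_del() only ever retires
 * ctl->oldest.v, and why no per-entry timer is required.
 */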

/*!\brief expiration
 */
static int
vtw_age(vtw_ctl_t *ctl, struct timeval *_when)
{
	vtw_t	*vtw;
	struct timeval then, *when = _when;
	int	maxtries = 0;

	if (!ctl->oldest.v) {
		KASSERT(!ctl->nalloc);
		return 0;
	}

	for (vtw = ctl->oldest.v; vtw && ctl->nalloc; ) {
		if (++maxtries > ctl->nalloc)
			break;

		if (vtw->msl_class != ctl->clidx) {
			db_trace(KTR_VTW
				 , (vtw, "vtw:!age class mismatch %x != %x"
				    , vtw->msl_class, ctl->clidx));
			/* XXXX
			 * See if the appropriate action is to skip to the next.
			 * XXXX
			 */
			ctl->oldest.v = vtw = vtw_next(ctl, vtw);
			continue;
		}
		if (!when) {
			/* Latch oldest timeval if none specified.
			 */
			then = vtw->expire;
			when = &then;
		}

		if (!timercmp(&vtw->expire, when, <=))
			break;

		db_trace(KTR_VTW
			 , (vtw, "vtw: expire %x %8.8x:%8.8x %x/%x"
			    , ctl->clidx
			    , vtw->expire.tv_sec
			    , vtw->expire.tv_usec
			    , ctl->nalloc
			    , ctl->nfree));

		if (!_when)
			++vtw_stats.kill;

		vtw_del(ctl, vtw);
		vtw = ctl->oldest.v;
	}

	return ctl->nalloc;	// # remaining allocated
}
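
/* Usage note: vtw_age(ctl, &now), as used by vtw_tick(), expires every
 * entry due by `now'; vtw_age(ctl, 0), as used when allocation is
 * starved, latches the oldest entry's expiry time and so reclaims at
 * least that entry regardless of the clock.
 */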

static callout_t vtw_cs;

/*!\brief notice the passage of time.
 * It seems to be getting faster.  What happened to the year?
 */
static void
vtw_tick(void *arg)
{
	struct timeval now;
	int i, cnt = 0;

	getmicrouptime(&now);

	db_trace(KTR_VTW, (arg, "vtk: tick - now %8.8x:%8.8x"
			   , now.tv_sec, now.tv_usec));

	mutex_enter(softnet_lock);

	for (i = 0; i < VTW_NCLASS; ++i) {
		cnt += vtw_age(&vtw_tcpv4[i], &now);
		cnt += vtw_age(&vtw_tcpv6[i], &now);
	}

	/* Keep ticks coming while we need them.
	 */
	if (cnt)
		callout_schedule(&vtw_cs, hz / 5);
	else {
		tcp_vtw_was_enabled = 0;
		tcbtable.vestige = 0;
	}
	mutex_exit(softnet_lock);
}

/* in_pcblookup_ports assist for handling vestigial entries.
 */
static void *
tcp_init_ports_v4(struct in_addr addr, u_int port, int wild)
{
	struct tcp_ports_iterator *it = &tcp_ports_iterator_v4;

	bzero(it, sizeof (*it));

	/* Note: the reference to vtw_tcpv4[0] is fine.
	 * We do not need per-class iteration.  We just
	 * need to get to the fat, and there is one
	 * shared fat.
	 */
	if (vtw_tcpv4[0].fat) {
		it->addr.v4 = addr;
		it->port = port;
		it->wild = !!wild;
		it->ctl = &vtw_tcpv4[0];

		++vtw_stats.look[1];
	}

	return it;
}

/*!\brief export an IPv4 vtw.
 */
static int
vtw_export_v4(vtw_ctl_t *ctl, vtw_t *vtw, vestigial_inpcb_t *res)
{
	vtw_v4_t	*v4 = (void*)vtw;

	bzero(res, sizeof (*res));

	if (ctl && vtw) {
		if (!ctl->clidx && vtw->msl_class)
			ctl += vtw->msl_class;
		else
			KASSERT(ctl->clidx == vtw->msl_class);

		res->valid = 1;
		res->v4 = 1;

		res->faddr.v4.s_addr = v4->faddr;
		res->laddr.v4.s_addr = v4->laddr;
		res->fport = v4->fport;
		res->lport = v4->lport;
		res->vtw = vtw;		// netlock held over call(s)
		res->ctl = ctl;
		res->reuse_addr = vtw->reuse_addr;
		res->reuse_port = vtw->reuse_port;
		res->snd_nxt = vtw->snd_nxt;
		res->rcv_nxt = vtw->rcv_nxt;
		res->rcv_wnd = vtw->rcv_wnd;
		res->uid = vtw->uid;
	}

	return res->valid;
}

/*!\brief return next port in the port iterator.  yowza.
 */
static int
tcp_next_port_v4(void *arg, struct vestigial_inpcb *res)
{
	struct tcp_ports_iterator *it = arg;
	vtw_t		*vtw = 0;

	if (it->ctl)
		vtw = vtw_next_port_v4(it);

	if (!vtw)
		it->ctl = 0;

	return vtw_export_v4(it->ctl, vtw, res);
}

static int
tcp_lookup_v4(struct in_addr faddr, uint16_t fport,
	      struct in_addr laddr, uint16_t lport,
	      struct vestigial_inpcb *res)
{
	vtw_t		*vtw;
	vtw_ctl_t	*ctl;

	db_trace(KTR_VTW
		 , (res, "vtw: lookup %A:%P %A:%P"
		    , faddr, fport
		    , laddr, lport));

	vtw = vtw_lookup_hash_v4((ctl = &vtw_tcpv4[0])
				 , faddr.s_addr, fport
				 , laddr.s_addr, lport, 0);

	return vtw_export_v4(ctl, vtw, res);
}

/* in_pcblookup_ports assist for handling vestigial entries.
 */
static void *
tcp_init_ports_v6(const struct in6_addr *addr, u_int port, int wild)
{
	struct tcp_ports_iterator *it = &tcp_ports_iterator_v6;

	bzero(it, sizeof (*it));

	/* Note: the reference to vtw_tcpv6[0] is fine.
	 * We do not need per-class iteration.  We just
	 * need to get to the fat, and there is one
	 * shared fat.
	 */
	if (vtw_tcpv6[0].fat) {
		it->addr.v6 = *addr;
		it->port = port;
		it->wild = !!wild;
		it->ctl = &vtw_tcpv6[0];

		++vtw_stats.look[1];
	}

	return it;
}

/*!\brief export an IPv6 vtw.
 */
static int
vtw_export_v6(vtw_ctl_t *ctl, vtw_t *vtw, vestigial_inpcb_t *res)
{
	vtw_v6_t	*v6 = (void*)vtw;

	bzero(res, sizeof (*res));

	if (ctl && vtw) {
		if (!ctl->clidx && vtw->msl_class)
			ctl += vtw->msl_class;
		else
			KASSERT(ctl->clidx == vtw->msl_class);

		res->valid = 1;
		res->v4 = 0;

		res->faddr.v6 = v6->faddr;
		res->laddr.v6 = v6->laddr;
		res->fport = v6->fport;
		res->lport = v6->lport;
		res->vtw = vtw;		// netlock held over call(s)
		res->ctl = ctl;

		res->v6only = vtw->v6only;
		res->reuse_addr = vtw->reuse_addr;
		res->reuse_port = vtw->reuse_port;

		res->snd_nxt = vtw->snd_nxt;
		res->rcv_nxt = vtw->rcv_nxt;
		res->rcv_wnd = vtw->rcv_wnd;
		res->uid = vtw->uid;
	}

	return res->valid;
}

static int
tcp_next_port_v6(void *arg, struct vestigial_inpcb *res)
{
	struct tcp_ports_iterator *it = arg;
	vtw_t		*vtw = 0;

	if (it->ctl)
		vtw = vtw_next_port_v6(it);

	if (!vtw)
		it->ctl = 0;

	return vtw_export_v6(it->ctl, vtw, res);
}

static int
tcp_lookup_v6(const struct in6_addr *faddr, uint16_t fport,
	      const struct in6_addr *laddr, uint16_t lport,
	      struct vestigial_inpcb *res)
{
	vtw_ctl_t	*ctl;
	vtw_t		*vtw;

	db_trace(KTR_VTW
		 , (res, "vtw: lookup %6A:%P %6A:%P"
		    , db_store(faddr, sizeof (*faddr)), fport
		    , db_store(laddr, sizeof (*laddr)), lport));

	vtw = vtw_lookup_hash_v6((ctl = &vtw_tcpv6[0])
				 , faddr, fport
				 , laddr, lport, 0);

	return vtw_export_v6(ctl, vtw, res);
}

static vestigial_hooks_t tcp_hooks = {
	.init_ports4	= tcp_init_ports_v4,
	.next_port4	= tcp_next_port_v4,
	.lookup4	= tcp_lookup_v4,
	.init_ports6	= tcp_init_ports_v6,
	.next_port6	= tcp_next_port_v6,
	.lookup6	= tcp_lookup_v6,
};
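
/* These hooks are installed as tcbtable.vestige in vtw_earlyinit()
 * below; they are how the in_pcblookup*() paths consult vestigial
 * time-wait state that no longer has a full inpcb behind it.
 */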

static bool
vtw_select(int af, fatp_ctl_t **fatp, vtw_ctl_t **ctlp)
{
	fatp_ctl_t	*fat;
	vtw_ctl_t	*ctl;

	switch (af) {
	case AF_INET:
		fat = &fat_tcpv4;
		ctl = &vtw_tcpv4[0];
		break;
	case AF_INET6:
		fat = &fat_tcpv6;
		ctl = &vtw_tcpv6[0];
		break;
	default:
		return false;
	}
	if (fatp != NULL)
		*fatp = fat;
	if (ctlp != NULL)
		*ctlp = ctl;
	return true;
}

/*!\brief initialize controlling instance
 */
static int
vtw_control_init(int af)
{
	fatp_ctl_t	*fat;
	vtw_ctl_t	*ctl;

	if (!vtw_select(af, &fat, &ctl))
		return EAFNOSUPPORT;

	if (!fat->base) {
		uint32_t n, m;

		KASSERT(powerof2(tcp_vtw_entries));

		/* Allocate 10% more capacity in the fat pointers.
		 * We should only need ~#hash additional based on
		 * how they age, but TIME_WAIT assassination could cause
		 * sparse fat pointer utilisation.
		 */
		m = 512;
		n = 2*m + (11 * (tcp_vtw_entries / fatp_ntags())) / 10;

		fatp_init(fat, n, m);

		if (!fat->base)
			return ENOMEM;
	}

	if (!ctl->base.v) {
		vtw_init(fat, ctl, tcp_vtw_entries);
		if (!ctl->base.v)
			return ENOMEM;
	}

	return 0;
}

/*!\brief select controlling instance
 */
static vtw_ctl_t *
vtw_control(int af, uint32_t msl)
{
	fatp_ctl_t	*fat;
	vtw_ctl_t	*ctl;
	int		class = msl_to_class(msl);

	if (!vtw_select(af, &fat, &ctl))
		return NULL;

	if (!fat->base || !ctl->base.v)
		return NULL;

	return ctl + class;
}

/*!\brief add TCP pcb to vestigial timewait
 */
int
vtw_add(int af, struct tcpcb *tp)
{
	int		enable;
	vtw_ctl_t	*ctl;
	vtw_t		*vtw;

	KASSERT(mutex_owned(softnet_lock));

	ctl = vtw_control(af, tp->t_msl);
	if (!ctl)
		return 0;

	enable = (af == AF_INET) ? tcp4_vtw_enable : tcp6_vtw_enable;

	vtw = vtw_alloc(ctl);

	if (vtw) {
		vtw->snd_nxt = tp->snd_nxt;
		vtw->rcv_nxt = tp->rcv_nxt;

		switch (af) {
		case AF_INET: {
			struct inpcb	*inp = tp->t_inpcb;
			vtw_v4_t	*v4 = (void*)vtw;

			v4->faddr = inp->inp_faddr.s_addr;
			v4->laddr = inp->inp_laddr.s_addr;
			v4->fport = inp->inp_fport;
			v4->lport = inp->inp_lport;

			vtw->reuse_port = !!(inp->inp_socket->so_options
					     & SO_REUSEPORT);
			vtw->reuse_addr = !!(inp->inp_socket->so_options
					     & SO_REUSEADDR);
			vtw->v6only = 0;
			vtw->uid = inp->inp_socket->so_uidinfo->ui_uid;

			vtw_inshash_v4(ctl, vtw);

#ifdef VTW_DEBUG
			/* Immediate lookup (connected and port) to
			 * ensure at least that works!
			 */
			if (enable & 4) {
				KASSERT(vtw_lookup_hash_v4
					(ctl
					 , inp->inp_faddr.s_addr, inp->inp_fport
					 , inp->inp_laddr.s_addr, inp->inp_lport
					 , 0)
					== vtw);
				KASSERT(vtw_lookup_hash_v4
					(ctl
					 , inp->inp_faddr.s_addr, inp->inp_fport
					 , inp->inp_laddr.s_addr, inp->inp_lport
					 , 1));
			}
			/* Immediate port iterator functionality check: not wild
			 */
			if (enable & 8) {
				struct tcp_ports_iterator *it;
				struct vestigial_inpcb res;
				int cnt = 0;

				it = tcp_init_ports_v4(inp->inp_laddr
						       , inp->inp_lport, 0);

				while (tcp_next_port_v4(it, &res)) {
					++cnt;
				}
				KASSERT(cnt);
			}
			/* Immediate port iterator functionality check: wild
			 */
			if (enable & 16) {
				struct tcp_ports_iterator *it;
				struct vestigial_inpcb res;
				struct in_addr any;
				int cnt = 0;

				any.s_addr = htonl(INADDR_ANY);

				it = tcp_init_ports_v4(any, inp->inp_lport, 1);

				while (tcp_next_port_v4(it, &res)) {
					++cnt;
				}
				KASSERT(cnt);
			}
#endif /* VTW_DEBUG */
			break;
		}

		case AF_INET6: {
			struct in6pcb	*inp = tp->t_in6pcb;
			vtw_v6_t	*v6 = (void*)vtw;

			v6->faddr = inp->in6p_faddr;
			v6->laddr = inp->in6p_laddr;
			v6->fport = inp->in6p_fport;
			v6->lport = inp->in6p_lport;

			vtw->reuse_port = !!(inp->in6p_socket->so_options
					     & SO_REUSEPORT);
			vtw->reuse_addr = !!(inp->in6p_socket->so_options
					     & SO_REUSEADDR);
			vtw->v6only = !!(inp->in6p_flags
					 & IN6P_IPV6_V6ONLY);
			vtw->uid = inp->in6p_socket->so_uidinfo->ui_uid;

			vtw_inshash_v6(ctl, vtw);
#ifdef VTW_DEBUG
			/* Immediate lookup (connected and port) to
			 * ensure at least that works!
			 */
			if (enable & 4) {
				KASSERT(vtw_lookup_hash_v6(ctl
					, &inp->in6p_faddr, inp->in6p_fport
					, &inp->in6p_laddr, inp->in6p_lport
					, 0)
					== vtw);
				KASSERT(vtw_lookup_hash_v6
					(ctl
					 , &inp->in6p_faddr, inp->in6p_fport
					 , &inp->in6p_laddr, inp->in6p_lport
					 , 1));
			}
			/* Immediate port iterator functionality check: not wild
			 */
			if (enable & 8) {
				struct tcp_ports_iterator *it;
				struct vestigial_inpcb res;
				int cnt = 0;

				it = tcp_init_ports_v6(&inp->in6p_laddr
						       , inp->in6p_lport, 0);

				while (tcp_next_port_v6(it, &res)) {
					++cnt;
				}
				KASSERT(cnt);
			}
			/* Immediate port iterator functionality check: wild
			 */
			if (enable & 16) {
				struct tcp_ports_iterator *it;
				struct vestigial_inpcb res;
				static struct in6_addr any = IN6ADDR_ANY_INIT;
				int cnt = 0;

				it = tcp_init_ports_v6(&any
						       , inp->in6p_lport, 1);

				while (tcp_next_port_v6(it, &res)) {
					++cnt;
				}
				KASSERT(cnt);
			}
#endif /* VTW_DEBUG */
			break;
		}
		}

		tcp_canceltimers(tp);
		tp = tcp_close(tp);
		KASSERT(!tp);

		return 1;
	}

	return 0;
}
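
/* Caller contract: a return of 1 means the connection has been
 * captured as a vestigial entry and the tcpcb already torn down via
 * tcp_close(); a return of 0 means the caller must fall back to the
 * traditional TIME_WAIT handling and keep its pcb.
 */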

/*!\brief restart timer for vestigial time-wait entry
 */
static void
vtw_restart_v4(vestigial_inpcb_t *vp)
{
	vtw_v4_t	copy = *(vtw_v4_t*)vp->vtw;
	vtw_t		*vtw;
	vtw_t		*cp = &copy.common;
	vtw_ctl_t	*ctl;

	KASSERT(mutex_owned(softnet_lock));

	db_trace(KTR_VTW
		 , (vp->vtw, "vtw: restart %A:%P %A:%P"
		    , vp->faddr.v4.s_addr, vp->fport
		    , vp->laddr.v4.s_addr, vp->lport));

	/* Class might have changed, so have a squiz.
	 */
	ctl = vtw_control(AF_INET, class_to_msl(cp->msl_class));
	vtw = vtw_alloc(ctl);

	if (vtw) {
		vtw_v4_t	*v4 = (void*)vtw;

		/* Safe now to unhash the old entry
		 */
		vtw_del(vp->ctl, vp->vtw);

		vtw->snd_nxt = cp->snd_nxt;
		vtw->rcv_nxt = cp->rcv_nxt;

		v4->faddr = copy.faddr;
		v4->laddr = copy.laddr;
		v4->fport = copy.fport;
		v4->lport = copy.lport;

		vtw->reuse_port = cp->reuse_port;
		vtw->reuse_addr = cp->reuse_addr;
		vtw->v6only = 0;
		vtw->uid = cp->uid;

		vtw_inshash_v4(ctl, vtw);
	}

	vp->valid = 0;
}

/*!\brief restart timer for vestigial time-wait entry
 */
static void
vtw_restart_v6(vestigial_inpcb_t *vp)
{
	vtw_v6_t	copy = *(vtw_v6_t*)vp->vtw;
	vtw_t		*vtw;
	vtw_t		*cp = &copy.common;
	vtw_ctl_t	*ctl;

	KASSERT(mutex_owned(softnet_lock));

	db_trace(KTR_VTW
		 , (vp->vtw, "vtw: restart %6A:%P %6A:%P"
		    , db_store(&vp->faddr.v6, sizeof (vp->faddr.v6))
		    , vp->fport
		    , db_store(&vp->laddr.v6, sizeof (vp->laddr.v6))
		    , vp->lport));

	/* Class might have changed, so have a squiz.
	 */
	ctl = vtw_control(AF_INET6, class_to_msl(cp->msl_class));
	vtw = vtw_alloc(ctl);

	if (vtw) {
		vtw_v6_t	*v6 = (void*)vtw;

		/* Safe now to unhash the old entry
		 */
		vtw_del(vp->ctl, vp->vtw);

		vtw->snd_nxt = cp->snd_nxt;
		vtw->rcv_nxt = cp->rcv_nxt;

		v6->faddr = copy.faddr;
		v6->laddr = copy.laddr;
		v6->fport = copy.fport;
		v6->lport = copy.lport;

		vtw->reuse_port = cp->reuse_port;
		vtw->reuse_addr = cp->reuse_addr;
		vtw->v6only = cp->v6only;
		vtw->uid = cp->uid;

		vtw_inshash_v6(ctl, vtw);
	}

	vp->valid = 0;
}

/*!\brief restart timer for vestigial time-wait entry
 */
void
vtw_restart(vestigial_inpcb_t *vp)
{
	if (!vp || !vp->valid)
		return;

	if (vp->v4)
		vtw_restart_v4(vp);
	else
		vtw_restart_v6(vp);
}

int
vtw_earlyinit(void)
{
	int rc;

	if (!tcp_vtw_was_enabled) {
		int i;

		/* This guarantees timer ticks until we no longer need them.
		 */
		tcp_vtw_was_enabled = 1;

		callout_init(&vtw_cs, 0);
		callout_setfunc(&vtw_cs, vtw_tick, 0);
		callout_schedule(&vtw_cs, hz / 5);

		for (i = 0; i < VTW_NCLASS; ++i) {
			vtw_tcpv4[i].is_v4 = 1;
			vtw_tcpv6[i].is_v6 = 1;
		}

		tcbtable.vestige = &tcp_hooks;
	}

	if ((rc = vtw_control_init(AF_INET)) != 0 ||
	    (rc = vtw_control_init(AF_INET6)) != 0)
		return rc;

	return 0;
}

#ifdef VTW_DEBUG
#include <sys/syscallargs.h>
#include <sys/sysctl.h>

/*!\brief add lalp, fafp entries for debug
 */
int
vtw_debug_add(int af, sin_either_t *la, sin_either_t *fa, int msl, int class)
{
	vtw_ctl_t	*ctl;
	vtw_t		*vtw;

	ctl = vtw_control(af, msl ? msl : class_to_msl(class));
	if (!ctl)
		return 0;

	vtw = vtw_alloc(ctl);

	if (vtw) {
		vtw->snd_nxt = 0;
		vtw->rcv_nxt = 0;

		switch (af) {
		case AF_INET: {
			vtw_v4_t	*v4 = (void*)vtw;

			v4->faddr = fa->sin_addr.v4.s_addr;
			v4->laddr = la->sin_addr.v4.s_addr;
			v4->fport = fa->sin_port;
			v4->lport = la->sin_port;

			vtw->reuse_port = 1;
			vtw->reuse_addr = 1;
			vtw->v6only = 0;
			vtw->uid = 0;

			vtw_inshash_v4(ctl, vtw);
			break;
		}

		case AF_INET6: {
			vtw_v6_t	*v6 = (void*)vtw;

			v6->faddr = fa->sin_addr.v6;
			v6->laddr = la->sin_addr.v6;

			v6->fport = fa->sin_port;
			v6->lport = la->sin_port;

			vtw->reuse_port = 1;
			vtw->reuse_addr = 1;
			vtw->v6only = 0;
			vtw->uid = 0;

			vtw_inshash_v6(ctl, vtw);
			break;
		}

		default:
			break;
		}

		return 1;
	}

	return 0;
}

static int vtw_syscall = 0;

static int
vtw_debug_process(vtw_sysargs_t *ap)
{
	struct vestigial_inpcb vestige;
	int	rc = 0;

	mutex_enter(softnet_lock);

	switch (ap->op) {
	case 0:		// insert
		vtw_debug_add(ap->la.sin_family
			      , &ap->la
			      , &ap->fa
			      , TCPTV_MSL
			      , 0);
		break;

	case 1:		// lookup
	case 2:		// restart
		switch (ap->la.sin_family) {
		case AF_INET:
			if (tcp_lookup_v4(ap->fa.sin_addr.v4, ap->fa.sin_port,
					  ap->la.sin_addr.v4, ap->la.sin_port,
					  &vestige)) {
				if (ap->op == 2) {
					vtw_restart(&vestige);
				}
				rc = 0;
			} else
				rc = ESRCH;
			break;

		case AF_INET6:
			if (tcp_lookup_v6(&ap->fa.sin_addr.v6, ap->fa.sin_port,
					  &ap->la.sin_addr.v6, ap->la.sin_port,
					  &vestige)) {
				if (ap->op == 2) {
					vtw_restart(&vestige);
				}
				rc = 0;
			} else
				rc = ESRCH;
			break;
		default:
			rc = EINVAL;
		}
		break;

	default:
		rc = EINVAL;
	}

	mutex_exit(softnet_lock);
	return rc;
}

struct sys_vtw_args {
	syscallarg(const vtw_sysargs_t *) req;
	syscallarg(size_t) len;
};

static int
vtw_sys(struct lwp *l, const void *_, register_t *retval)
{
	const struct sys_vtw_args *uap = _;
	void	*buf;
	int	rc;
	size_t	len = SCARG(uap, len);

	if (len != sizeof (vtw_sysargs_t))
		return EINVAL;

	buf = kmem_alloc(len, KM_SLEEP);
	if (!buf)
		return ENOMEM;

	rc = copyin(SCARG(uap, req), buf, len);
	if (!rc) {
		rc = vtw_debug_process(buf);
	}
	kmem_free(buf, len);

	return rc;
}

static void
vtw_sanity_check(void)
{
	vtw_ctl_t	*ctl;
	vtw_t		*vtw;
	int		i;
	int		n;

	for (i = 0; i < VTW_NCLASS; ++i) {
		ctl = &vtw_tcpv4[i];

		if (!ctl->base.v || ctl->nalloc)
			continue;

		for (n = 0, vtw = ctl->base.v; ; ) {
			++n;
			vtw = vtw_next(ctl, vtw);
			if (vtw == ctl->base.v)
				break;
		}
		db_trace(KTR_VTW
			 , (ctl, "sanity: class %x n %x nfree %x"
			    , i, n, ctl->nfree));

		KASSERT(n == ctl->nfree);
	}

	for (i = 0; i < VTW_NCLASS; ++i) {
		ctl = &vtw_tcpv6[i];

		if (!ctl->base.v || ctl->nalloc)
			continue;

		for (n = 0, vtw = ctl->base.v; ; ) {
			++n;
			vtw = vtw_next(ctl, vtw);
			if (vtw == ctl->base.v)
				break;
		}
		db_trace(KTR_VTW
			 , (ctl, "sanity: class %x n %x nfree %x"
			    , i, n, ctl->nfree));
		KASSERT(n == ctl->nfree);
	}
}

/*!\brief Initialise debug support.
 */
static void
vtw_debug_init(void)
{
	int i;

	vtw_sanity_check();

	if (vtw_syscall)
		return;

	for (i = 511; i; --i) {
		if (sysent[i].sy_call == sys_nosys) {
			sysent[i].sy_call = vtw_sys;
			sysent[i].sy_narg = 2;
			sysent[i].sy_argsize = sizeof (struct sys_vtw_args);
			sysent[i].sy_flags = 0;

			vtw_syscall = i;
			break;
		}
	}
	if (i) {
		const struct sysctlnode *node;
		uint32_t	flags;

		flags = sysctl_root.sysctl_flags;

		sysctl_root.sysctl_flags |= CTLFLAG_READWRITE;
		sysctl_root.sysctl_flags &= ~CTLFLAG_PERMANENT;

		sysctl_createv(0, 0, 0, &node,
			       CTLFLAG_PERMANENT, CTLTYPE_NODE,
			       "koff",
			       SYSCTL_DESCR("Kernel Obscure Feature Finder"),
			       0, 0, 0, 0, CTL_CREATE, CTL_EOL);

		if (!node) {
			sysctl_createv(0, 0, 0, &node,
				       CTLFLAG_PERMANENT, CTLTYPE_NODE,
				       "koffka",
				       SYSCTL_DESCR("The Real(tm) Kernel"
						    " Obscure Feature Finder"),
				       0, 0, 0, 0, CTL_CREATE, CTL_EOL);
		}
		if (node) {
			sysctl_createv(0, 0, 0, 0,
				       CTLFLAG_PERMANENT|CTLFLAG_READONLY,
				       CTLTYPE_INT, "vtw_debug_syscall",
				       SYSCTL_DESCR("vtw debug"
						    " system call number"),
				       0, 0, &vtw_syscall, 0, node->sysctl_num,
				       CTL_CREATE, CTL_EOL);
		}
		sysctl_root.sysctl_flags = flags;
	}
}
#else /* !VTW_DEBUG */
static void
vtw_debug_init(void)
{
	return;
}
#endif /* !VTW_DEBUG */