tcp_vtw.c revision 1.9.2.2 1 1.1 dyoung /*
2 1.1 dyoung * Copyright (c) 2011 The NetBSD Foundation, Inc.
3 1.1 dyoung * All rights reserved.
4 1.1 dyoung *
5 1.1 dyoung * This code is derived from software contributed to The NetBSD Foundation
6 1.1 dyoung * by Coyote Point Systems, Inc.
7 1.1 dyoung *
8 1.1 dyoung * Redistribution and use in source and binary forms, with or without
9 1.1 dyoung * modification, are permitted provided that the following conditions
10 1.1 dyoung * are met:
11 1.1 dyoung * 1. Redistributions of source code must retain the above copyright
12 1.1 dyoung * notice, this list of conditions and the following disclaimer.
13 1.1 dyoung * 2. Redistributions in binary form must reproduce the above copyright
14 1.1 dyoung * notice, this list of conditions and the following disclaimer in the
15 1.1 dyoung * documentation and/or other materials provided with the distribution.
16 1.1 dyoung *
17 1.1 dyoung * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
18 1.1 dyoung * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
19 1.1 dyoung * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
20 1.1 dyoung * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
21 1.1 dyoung * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
22 1.1 dyoung * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
23 1.1 dyoung * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
24 1.1 dyoung * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
25 1.1 dyoung * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
26 1.1 dyoung * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
27 1.1 dyoung * POSSIBILITY OF SUCH DAMAGE.
28 1.1 dyoung */
29 1.9 yamt
30 1.9 yamt /*
31 1.9 yamt * Reduces the resources demanded by TCP sessions in TIME_WAIT-state using
32 1.9 yamt * methods called Vestigial Time-Wait (VTW) and Maximum Segment Lifetime
33 1.9 yamt * Truncation (MSLT).
34 1.9 yamt *
35 1.9 yamt * MSLT and VTW were contributed by Coyote Point Systems, Inc.
36 1.9 yamt *
37 1.9 yamt * Even after a TCP session enters the TIME_WAIT state, its corresponding
38 1.9 yamt * socket and protocol control blocks (PCBs) stick around until the TCP
39 1.9 yamt * Maximum Segment Lifetime (MSL) expires. On a host whose workload
40 1.9 yamt * necessarily creates and closes down many TCP sockets, the sockets & PCBs
41 1.9 yamt * for TCP sessions in TIME_WAIT state amount to many megabytes of dead
42 1.9 yamt * weight in RAM.
43 1.9 yamt *
44 1.9 yamt * Maximum Segment Lifetime Truncation (MSLT) assigns each TCP session to
45 1.9 yamt * a class based on the nearness of the peer. Corresponding to each class
46 1.9 yamt * is an MSL, and a session uses the MSL of its class. The classes are
47 1.9 yamt * loopback (local host equals remote host), local (local host and remote
48 1.9 yamt * host are on the same link/subnet), and remote (local host and remote
49 1.9 yamt * host communicate via one or more gateways). Classes corresponding to
50 1.9 yamt * nearer peers have lower MSLs by default: 2 seconds for loopback, 10
51 1.9 yamt * seconds for local, 60 seconds for remote. Loopback and local sessions
52 1.9 yamt * expire more quickly when MSLT is used.
53 1.9 yamt *
54 1.9 yamt * Vestigial Time-Wait (VTW) replaces a TIME_WAIT session's PCB/socket
55 1.9 yamt * dead weight with a compact representation of the session, called a
56 1.9 yamt * "vestigial PCB". VTW data structures are designed to be very fast and
57 1.9 yamt * memory-efficient: for fast insertion and lookup of vestigial PCBs,
58 1.9 yamt * the PCBs are stored in a hash table that is designed to minimize the
59 1.9 yamt * number of cacheline visits per lookup/insertion. The memory both
60 1.9 yamt * for vestigial PCBs and for elements of the PCB hashtable come from
61 1.9 yamt * fixed-size pools, and linked data structures exploit this to conserve
62 1.9 yamt * memory by representing references with a narrow index/offset from the
63 1.9 yamt * start of a pool instead of a pointer. When space for new vestigial PCBs
64 1.9 yamt * runs out, VTW makes room by discarding old vestigial PCBs, oldest first.
65 1.9 yamt * VTW cooperates with MSLT.
66 1.9 yamt *
67 1.9 yamt * It may help to think of VTW as a "FIN cache" by analogy to the SYN
68 1.9 yamt * cache.
69 1.9 yamt *
70 1.9 yamt * A 2.8-GHz Pentium 4 running a test workload that creates TIME_WAIT
71 1.9 yamt * sessions as fast as it can is approximately 17% idle when VTW is active
72 1.9 yamt * versus 0% idle when VTW is inactive. It has 103 megabytes more free RAM
73 1.9 yamt * when VTW is active (approximately 64k vestigial PCBs are created) than
74 1.9 yamt * when it is inactive.
75 1.9 yamt */
76 1.9 yamt
77 1.1 dyoung #include <sys/cdefs.h>
78 1.1 dyoung
79 1.9.2.2 jdolecek #ifdef _KERNEL_OPT
80 1.1 dyoung #include "opt_ddb.h"
81 1.1 dyoung #include "opt_inet.h"
82 1.1 dyoung #include "opt_inet_csum.h"
83 1.1 dyoung #include "opt_tcp_debug.h"
84 1.9.2.2 jdolecek #endif
85 1.1 dyoung
86 1.1 dyoung #include <sys/param.h>
87 1.1 dyoung #include <sys/systm.h>
88 1.1 dyoung #include <sys/kmem.h>
89 1.1 dyoung #include <sys/mbuf.h>
90 1.1 dyoung #include <sys/protosw.h>
91 1.1 dyoung #include <sys/socket.h>
92 1.1 dyoung #include <sys/socketvar.h>
93 1.1 dyoung #include <sys/errno.h>
94 1.1 dyoung #include <sys/syslog.h>
95 1.1 dyoung #include <sys/pool.h>
96 1.1 dyoung #include <sys/domain.h>
97 1.1 dyoung #include <sys/kernel.h>
98 1.1 dyoung #include <net/if.h>
99 1.1 dyoung #include <net/if_types.h>
100 1.1 dyoung
101 1.1 dyoung #include <netinet/in.h>
102 1.1 dyoung #include <netinet/in_systm.h>
103 1.1 dyoung #include <netinet/ip.h>
104 1.1 dyoung #include <netinet/in_pcb.h>
105 1.1 dyoung #include <netinet/in_var.h>
106 1.1 dyoung #include <netinet/ip_var.h>
107 1.1 dyoung #include <netinet/in_offload.h>
108 1.1 dyoung #include <netinet/ip6.h>
109 1.1 dyoung #include <netinet6/ip6_var.h>
110 1.1 dyoung #include <netinet6/in6_pcb.h>
111 1.1 dyoung #include <netinet6/ip6_var.h>
112 1.1 dyoung #include <netinet6/in6_var.h>
113 1.1 dyoung #include <netinet/icmp6.h>
114 1.1 dyoung
115 1.1 dyoung #include <netinet/tcp.h>
116 1.1 dyoung #include <netinet/tcp_fsm.h>
117 1.1 dyoung #include <netinet/tcp_seq.h>
118 1.1 dyoung #include <netinet/tcp_timer.h>
119 1.1 dyoung #include <netinet/tcp_var.h>
120 1.1 dyoung #include <netinet/tcp_private.h>
121 1.1 dyoung #include <netinet/tcpip.h>
122 1.1 dyoung
123 1.1 dyoung #include <netinet/tcp_vtw.h>
124 1.1 dyoung
125 1.9.2.1 tls __KERNEL_RCSID(0, "$NetBSD: tcp_vtw.c,v 1.9.2.2 2017/12/03 11:39:04 jdolecek Exp $");
126 1.1 dyoung
127 1.1 dyoung #define db_trace(__a, __b) do { } while (/*CONSTCOND*/0)
128 1.1 dyoung
129 1.1 dyoung static void vtw_debug_init(void);
130 1.1 dyoung
131 1.1 dyoung fatp_ctl_t fat_tcpv4;
132 1.1 dyoung fatp_ctl_t fat_tcpv6;
133 1.1 dyoung vtw_ctl_t vtw_tcpv4[VTW_NCLASS];
134 1.1 dyoung vtw_ctl_t vtw_tcpv6[VTW_NCLASS];
135 1.1 dyoung vtw_stats_t vtw_stats;
136 1.1 dyoung
137 1.1 dyoung /* We provide state for the lookup_ports iterator.
138 1.1 dyoung * As currently we are netlock-protected, there is one.
139 1.1 dyoung * If we were finer-grain, we would have one per CPU.
140 1.1 dyoung * I do not want to be in the business of alloc/free.
141 1.1 dyoung * The best alternate would be allocate on the caller's
142 1.1 dyoung * stack, but that would require them to know the struct,
143 1.1 dyoung * or at least the size.
144 1.1 dyoung * See how she goes.
145 1.1 dyoung */
146 1.1 dyoung struct tcp_ports_iterator {
147 1.1 dyoung union {
148 1.1 dyoung struct in_addr v4;
149 1.1 dyoung struct in6_addr v6;
150 1.1 dyoung } addr;
151 1.1 dyoung u_int port;
152 1.1 dyoung
153 1.1 dyoung uint32_t wild : 1;
154 1.1 dyoung
155 1.1 dyoung vtw_ctl_t *ctl;
156 1.1 dyoung fatp_t *fp;
157 1.1 dyoung
158 1.1 dyoung uint16_t slot_idx;
159 1.1 dyoung uint16_t ctl_idx;
160 1.1 dyoung };
161 1.1 dyoung
162 1.1 dyoung static struct tcp_ports_iterator tcp_ports_iterator_v4;
163 1.1 dyoung static struct tcp_ports_iterator tcp_ports_iterator_v6;
164 1.1 dyoung
165 1.1 dyoung static int vtw_age(vtw_ctl_t *, struct timeval *);
166 1.1 dyoung
167 1.1 dyoung /*!\brief allocate a fat pointer from a collection.
168 1.1 dyoung */
169 1.1 dyoung static fatp_t *
170 1.1 dyoung fatp_alloc(fatp_ctl_t *fat)
171 1.1 dyoung {
172 1.1 dyoung fatp_t *fp = 0;
173 1.1 dyoung
174 1.1 dyoung if (fat->nfree) {
175 1.1 dyoung fp = fat->free;
176 1.1 dyoung if (fp) {
177 1.1 dyoung fat->free = fatp_next(fat, fp);
178 1.1 dyoung --fat->nfree;
179 1.1 dyoung ++fat->nalloc;
180 1.1 dyoung fp->nxt = 0;
181 1.1 dyoung
182 1.1 dyoung KASSERT(!fp->inuse);
183 1.1 dyoung }
184 1.1 dyoung }
185 1.1 dyoung
186 1.1 dyoung return fp;
187 1.1 dyoung }
188 1.1 dyoung
189 1.1 dyoung /*!\brief free a fat pointer.
190 1.1 dyoung */
191 1.1 dyoung static void
192 1.1 dyoung fatp_free(fatp_ctl_t *fat, fatp_t *fp)
193 1.1 dyoung {
194 1.1 dyoung if (fp) {
195 1.1 dyoung KASSERT(!fp->inuse);
196 1.1 dyoung KASSERT(!fp->nxt);
197 1.1 dyoung
198 1.1 dyoung fp->nxt = fatp_index(fat, fat->free);
199 1.1 dyoung fat->free = fp;
200 1.1 dyoung
201 1.1 dyoung ++fat->nfree;
202 1.1 dyoung --fat->nalloc;
203 1.1 dyoung }
204 1.1 dyoung }
205 1.1 dyoung
206 1.1 dyoung /*!\brief initialise a collection of fat pointers.
207 1.1 dyoung *
208 1.1 dyoung *\param n # hash buckets
209 1.1 dyoung *\param m total # fat pointers to allocate
210 1.1 dyoung *
211 1.1 dyoung * We allocate 2x as much, as we have two hashes: full and lport only.
212 1.1 dyoung */
213 1.1 dyoung static void
214 1.6 dyoung fatp_init(fatp_ctl_t *fat, uint32_t n, uint32_t m,
215 1.6 dyoung fatp_t *fat_base, fatp_t **fat_hash)
216 1.1 dyoung {
217 1.1 dyoung fatp_t *fp;
218 1.1 dyoung
219 1.1 dyoung KASSERT(n <= FATP_MAX / 2);
220 1.1 dyoung
221 1.6 dyoung fat->hash = fat_hash;
222 1.6 dyoung fat->base = fat_base;
223 1.1 dyoung
224 1.1 dyoung fat->port = &fat->hash[m];
225 1.1 dyoung
226 1.1 dyoung fat->mask = m - 1; // ASSERT is power of 2 (m)
227 1.1 dyoung fat->lim = fat->base + 2*n - 1;
228 1.1 dyoung fat->nfree = 0;
229 1.1 dyoung fat->nalloc = 2*n;
230 1.1 dyoung
231 1.1 dyoung /* Initialise the free list.
232 1.1 dyoung */
233 1.1 dyoung for (fp = fat->lim; fp >= fat->base; --fp) {
234 1.1 dyoung fatp_free(fat, fp);
235 1.1 dyoung }
236 1.1 dyoung }
237 1.1 dyoung
/*
 * The `xtra' is XORed into the tag stored.
 *
 * One word per tag slot (indexed by slot number).  The table is only
 * ever read, so it is const.
 */
static const uint32_t fatp_xtra[] = {
	0x11111111,0x22222222,0x33333333,0x44444444,
	0x55555555,0x66666666,0x77777777,0x88888888,
	0x12121212,0x21212121,0x34343434,0x43434343,
	0x56565656,0x65656565,0x78787878,0x87878787,
	0x11221122,0x22112211,0x33443344,0x44334433,
	0x55665566,0x66556655,0x77887788,0x88778877,
	0x11112222,0x22221111,0x33334444,0x44443333,
	0x55556666,0x66665555,0x77778888,0x88887777,
};
251 1.1 dyoung
252 1.1 dyoung /*!\brief turn a {fatp_t*,slot} into an integral key.
253 1.1 dyoung *
254 1.1 dyoung * The key can be used to obtain the fatp_t, and the slot,
255 1.1 dyoung * as it directly encodes them.
256 1.1 dyoung */
257 1.1 dyoung static inline uint32_t
258 1.1 dyoung fatp_key(fatp_ctl_t *fat, fatp_t *fp, uint32_t slot)
259 1.1 dyoung {
260 1.1 dyoung CTASSERT(CACHE_LINE_SIZE == 32 ||
261 1.1 dyoung CACHE_LINE_SIZE == 64 ||
262 1.1 dyoung CACHE_LINE_SIZE == 128);
263 1.1 dyoung
264 1.1 dyoung switch (fatp_ntags()) {
265 1.1 dyoung case 7:
266 1.1 dyoung return (fatp_index(fat, fp) << 3) | slot;
267 1.1 dyoung case 15:
268 1.1 dyoung return (fatp_index(fat, fp) << 4) | slot;
269 1.1 dyoung case 31:
270 1.1 dyoung return (fatp_index(fat, fp) << 5) | slot;
271 1.1 dyoung default:
272 1.1 dyoung KASSERT(0 && "no support, for no good reason");
273 1.1 dyoung return ~0;
274 1.1 dyoung }
275 1.1 dyoung }
276 1.1 dyoung
277 1.1 dyoung static inline uint32_t
278 1.1 dyoung fatp_slot_from_key(fatp_ctl_t *fat, uint32_t key)
279 1.1 dyoung {
280 1.1 dyoung CTASSERT(CACHE_LINE_SIZE == 32 ||
281 1.1 dyoung CACHE_LINE_SIZE == 64 ||
282 1.1 dyoung CACHE_LINE_SIZE == 128);
283 1.1 dyoung
284 1.1 dyoung switch (fatp_ntags()) {
285 1.1 dyoung case 7:
286 1.1 dyoung return key & 7;
287 1.1 dyoung case 15:
288 1.1 dyoung return key & 15;
289 1.1 dyoung case 31:
290 1.1 dyoung return key & 31;
291 1.1 dyoung default:
292 1.1 dyoung KASSERT(0 && "no support, for no good reason");
293 1.1 dyoung return ~0;
294 1.1 dyoung }
295 1.1 dyoung }
296 1.1 dyoung
297 1.1 dyoung static inline fatp_t *
298 1.1 dyoung fatp_from_key(fatp_ctl_t *fat, uint32_t key)
299 1.1 dyoung {
300 1.1 dyoung CTASSERT(CACHE_LINE_SIZE == 32 ||
301 1.1 dyoung CACHE_LINE_SIZE == 64 ||
302 1.1 dyoung CACHE_LINE_SIZE == 128);
303 1.1 dyoung
304 1.1 dyoung switch (fatp_ntags()) {
305 1.1 dyoung case 7:
306 1.1 dyoung key >>= 3;
307 1.1 dyoung break;
308 1.1 dyoung case 15:
309 1.1 dyoung key >>= 4;
310 1.1 dyoung break;
311 1.1 dyoung case 31:
312 1.1 dyoung key >>= 5;
313 1.1 dyoung break;
314 1.1 dyoung default:
315 1.1 dyoung KASSERT(0 && "no support, for no good reason");
316 1.1 dyoung return 0;
317 1.1 dyoung }
318 1.1 dyoung
319 1.1 dyoung return key ? fat->base + key - 1 : 0;
320 1.1 dyoung }
321 1.1 dyoung
322 1.1 dyoung static inline uint32_t
323 1.1 dyoung idx_encode(vtw_ctl_t *ctl, uint32_t idx)
324 1.1 dyoung {
325 1.1 dyoung return (idx << ctl->idx_bits) | idx;
326 1.1 dyoung }
327 1.1 dyoung
328 1.1 dyoung static inline uint32_t
329 1.1 dyoung idx_decode(vtw_ctl_t *ctl, uint32_t bits)
330 1.1 dyoung {
331 1.1 dyoung uint32_t idx = bits & ctl->idx_mask;
332 1.1 dyoung
333 1.1 dyoung if (idx_encode(ctl, idx) == bits)
334 1.1 dyoung return idx;
335 1.1 dyoung else
336 1.1 dyoung return ~0;
337 1.1 dyoung }
338 1.1 dyoung
339 1.1 dyoung /*!\brief insert index into fatp hash
340 1.1 dyoung *
341 1.1 dyoung *\param idx - index of element being placed in hash chain
342 1.1 dyoung *\param tag - 32-bit tag identifier
343 1.1 dyoung *
344 1.1 dyoung *\returns
345 1.1 dyoung * value which can be used to locate entry.
346 1.1 dyoung *
347 1.1 dyoung *\note
348 1.1 dyoung * we rely on the fact that there are unused high bits in the index
349 1.1 dyoung * for verification purposes on lookup.
350 1.1 dyoung */
351 1.1 dyoung
352 1.1 dyoung static inline uint32_t
353 1.1 dyoung fatp_vtw_inshash(fatp_ctl_t *fat, uint32_t idx, uint32_t tag, int which,
354 1.1 dyoung void *dbg)
355 1.1 dyoung {
356 1.1 dyoung fatp_t *fp;
357 1.1 dyoung fatp_t **hash = (which ? fat->port : fat->hash);
358 1.1 dyoung int i;
359 1.1 dyoung
360 1.1 dyoung fp = hash[tag & fat->mask];
361 1.1 dyoung
362 1.1 dyoung while (!fp || fatp_full(fp)) {
363 1.1 dyoung fatp_t *fq;
364 1.1 dyoung
365 1.1 dyoung /* All entries are inuse at the top level.
366 1.1 dyoung * We allocate a spare, and push the top level
367 1.1 dyoung * down one. All entries in the fp we push down
368 1.1 dyoung * (think of a tape worm here) will be expelled sooner than
369 1.1 dyoung * any entries added subsequently to this hash bucket.
370 1.1 dyoung * This is a property of the time waits we are exploiting.
371 1.1 dyoung */
372 1.1 dyoung
373 1.1 dyoung fq = fatp_alloc(fat);
374 1.1 dyoung if (!fq) {
375 1.1 dyoung vtw_age(fat->vtw, 0);
376 1.1 dyoung fp = hash[tag & fat->mask];
377 1.1 dyoung continue;
378 1.1 dyoung }
379 1.1 dyoung
380 1.1 dyoung fq->inuse = 0;
381 1.1 dyoung fq->nxt = fatp_index(fat, fp);
382 1.1 dyoung
383 1.1 dyoung hash[tag & fat->mask] = fq;
384 1.1 dyoung
385 1.1 dyoung fp = fq;
386 1.1 dyoung }
387 1.1 dyoung
388 1.1 dyoung KASSERT(!fatp_full(fp));
389 1.1 dyoung
390 1.1 dyoung /* Fill highest index first. Lookup is lowest first.
391 1.1 dyoung */
392 1.1 dyoung for (i = fatp_ntags(); --i >= 0; ) {
393 1.1 dyoung if (!((1 << i) & fp->inuse)) {
394 1.1 dyoung break;
395 1.1 dyoung }
396 1.1 dyoung }
397 1.1 dyoung
398 1.1 dyoung fp->inuse |= 1 << i;
399 1.1 dyoung fp->tag[i] = tag ^ idx_encode(fat->vtw, idx) ^ fatp_xtra[i];
400 1.1 dyoung
401 1.1 dyoung db_trace(KTR_VTW
402 1.1 dyoung , (fp, "fat: inuse %5.5x tag[%x] %8.8x"
403 1.1 dyoung , fp->inuse
404 1.1 dyoung , i, fp->tag[i]));
405 1.1 dyoung
406 1.1 dyoung return fatp_key(fat, fp, i);
407 1.1 dyoung }
408 1.1 dyoung
409 1.1 dyoung static inline int
410 1.1 dyoung vtw_alive(const vtw_t *vtw)
411 1.1 dyoung {
412 1.1 dyoung return vtw->hashed && vtw->expire.tv_sec;
413 1.1 dyoung }
414 1.1 dyoung
415 1.1 dyoung static inline uint32_t
416 1.1 dyoung vtw_index_v4(vtw_ctl_t *ctl, vtw_v4_t *v4)
417 1.1 dyoung {
418 1.1 dyoung if (ctl->base.v4 <= v4 && v4 <= ctl->lim.v4)
419 1.1 dyoung return v4 - ctl->base.v4;
420 1.1 dyoung
421 1.1 dyoung KASSERT(0 && "vtw out of bounds");
422 1.1 dyoung
423 1.1 dyoung return ~0;
424 1.1 dyoung }
425 1.1 dyoung
426 1.1 dyoung static inline uint32_t
427 1.1 dyoung vtw_index_v6(vtw_ctl_t *ctl, vtw_v6_t *v6)
428 1.1 dyoung {
429 1.1 dyoung if (ctl->base.v6 <= v6 && v6 <= ctl->lim.v6)
430 1.1 dyoung return v6 - ctl->base.v6;
431 1.1 dyoung
432 1.1 dyoung KASSERT(0 && "vtw out of bounds");
433 1.1 dyoung
434 1.1 dyoung return ~0;
435 1.1 dyoung }
436 1.1 dyoung
437 1.1 dyoung static inline uint32_t
438 1.1 dyoung vtw_index(vtw_ctl_t *ctl, vtw_t *vtw)
439 1.1 dyoung {
440 1.1 dyoung if (ctl->clidx)
441 1.1 dyoung ctl = ctl->ctl;
442 1.1 dyoung
443 1.1 dyoung if (ctl->is_v4)
444 1.1 dyoung return vtw_index_v4(ctl, (vtw_v4_t *)vtw);
445 1.1 dyoung
446 1.1 dyoung if (ctl->is_v6)
447 1.1 dyoung return vtw_index_v6(ctl, (vtw_v6_t *)vtw);
448 1.1 dyoung
449 1.1 dyoung KASSERT(0 && "neither 4 nor 6. most curious.");
450 1.1 dyoung
451 1.1 dyoung return ~0;
452 1.1 dyoung }
453 1.1 dyoung
454 1.1 dyoung static inline vtw_t *
455 1.1 dyoung vtw_from_index(vtw_ctl_t *ctl, uint32_t idx)
456 1.1 dyoung {
457 1.1 dyoung if (ctl->clidx)
458 1.1 dyoung ctl = ctl->ctl;
459 1.1 dyoung
460 1.1 dyoung /* See if the index looks like it might be an index.
461 1.1 dyoung * Bits on outside of the valid index bits is a give away.
462 1.1 dyoung */
463 1.1 dyoung idx = idx_decode(ctl, idx);
464 1.1 dyoung
465 1.1 dyoung if (idx == ~0) {
466 1.1 dyoung return 0;
467 1.1 dyoung } else if (ctl->is_v4) {
468 1.1 dyoung vtw_v4_t *vtw = ctl->base.v4 + idx;
469 1.1 dyoung
470 1.1 dyoung return (ctl->base.v4 <= vtw && vtw <= ctl->lim.v4)
471 1.1 dyoung ? &vtw->common : 0;
472 1.1 dyoung } else if (ctl->is_v6) {
473 1.1 dyoung vtw_v6_t *vtw = ctl->base.v6 + idx;
474 1.1 dyoung
475 1.1 dyoung return (ctl->base.v6 <= vtw && vtw <= ctl->lim.v6)
476 1.1 dyoung ? &vtw->common : 0;
477 1.1 dyoung } else {
478 1.1 dyoung KASSERT(0 && "badness");
479 1.1 dyoung return 0;
480 1.1 dyoung }
481 1.1 dyoung }
482 1.1 dyoung
483 1.1 dyoung /*!\brief return the next vtw after this one.
484 1.1 dyoung *
485 1.1 dyoung * Due to the differing sizes of the entries in differing
486 1.1 dyoung * arenas, we have to ensure we ++ the correct pointer type.
487 1.1 dyoung *
488 1.1 dyoung * Also handles wrap.
489 1.1 dyoung */
490 1.1 dyoung static inline vtw_t *
491 1.1 dyoung vtw_next(vtw_ctl_t *ctl, vtw_t *vtw)
492 1.1 dyoung {
493 1.1 dyoung if (ctl->is_v4) {
494 1.1 dyoung vtw_v4_t *v4 = (void*)vtw;
495 1.1 dyoung
496 1.1 dyoung vtw = &(++v4)->common;
497 1.1 dyoung } else {
498 1.1 dyoung vtw_v6_t *v6 = (void*)vtw;
499 1.1 dyoung
500 1.1 dyoung vtw = &(++v6)->common;
501 1.1 dyoung }
502 1.1 dyoung
503 1.1 dyoung if (vtw > ctl->lim.v)
504 1.1 dyoung vtw = ctl->base.v;
505 1.1 dyoung
506 1.1 dyoung return vtw;
507 1.1 dyoung }
508 1.1 dyoung
509 1.1 dyoung /*!\brief remove entry from FATP hash chains
510 1.1 dyoung */
511 1.1 dyoung static inline void
512 1.1 dyoung vtw_unhash(vtw_ctl_t *ctl, vtw_t *vtw)
513 1.1 dyoung {
514 1.1 dyoung fatp_ctl_t *fat = ctl->fat;
515 1.1 dyoung fatp_t *fp;
516 1.1 dyoung uint32_t key = vtw->key;
517 1.1 dyoung uint32_t tag, slot, idx;
518 1.1 dyoung vtw_v4_t *v4 = (void*)vtw;
519 1.1 dyoung vtw_v6_t *v6 = (void*)vtw;
520 1.1 dyoung
521 1.1 dyoung if (!vtw->hashed) {
522 1.1 dyoung KASSERT(0 && "unhashed");
523 1.1 dyoung return;
524 1.1 dyoung }
525 1.1 dyoung
526 1.1 dyoung if (fat->vtw->is_v4) {
527 1.1 dyoung tag = v4_tag(v4->faddr, v4->fport, v4->laddr, v4->lport);
528 1.1 dyoung } else if (fat->vtw->is_v6) {
529 1.1 dyoung tag = v6_tag(&v6->faddr, v6->fport, &v6->laddr, v6->lport);
530 1.1 dyoung } else {
531 1.1 dyoung tag = 0;
532 1.1 dyoung KASSERT(0 && "not reached");
533 1.1 dyoung }
534 1.1 dyoung
535 1.1 dyoung /* Remove from fat->hash[]
536 1.1 dyoung */
537 1.1 dyoung slot = fatp_slot_from_key(fat, key);
538 1.1 dyoung fp = fatp_from_key(fat, key);
539 1.1 dyoung idx = vtw_index(ctl, vtw);
540 1.1 dyoung
541 1.1 dyoung db_trace(KTR_VTW
542 1.1 dyoung , (fp, "fat: del inuse %5.5x slot %x idx %x key %x tag %x"
543 1.1 dyoung , fp->inuse, slot, idx, key, tag));
544 1.1 dyoung
545 1.1 dyoung KASSERT(fp->inuse & (1 << slot));
546 1.1 dyoung KASSERT(fp->tag[slot] == (tag ^ idx_encode(ctl, idx)
547 1.1 dyoung ^ fatp_xtra[slot]));
548 1.1 dyoung
549 1.1 dyoung if ((fp->inuse & (1 << slot))
550 1.1 dyoung && fp->tag[slot] == (tag ^ idx_encode(ctl, idx)
551 1.1 dyoung ^ fatp_xtra[slot])) {
552 1.1 dyoung fp->inuse ^= 1 << slot;
553 1.1 dyoung fp->tag[slot] = 0;
554 1.1 dyoung
555 1.1 dyoung /* When we delete entries, we do not compact. This is
556 1.1 dyoung * due to temporality. We add entries, and they
557 1.1 dyoung * (eventually) expire. Older entries will be further
558 1.1 dyoung * down the chain.
559 1.1 dyoung */
560 1.1 dyoung if (!fp->inuse) {
561 1.1 dyoung uint32_t hi = tag & fat->mask;
562 1.1 dyoung fatp_t *fq = 0;
563 1.1 dyoung fatp_t *fr = fat->hash[hi];
564 1.1 dyoung
565 1.1 dyoung while (fr && fr != fp) {
566 1.1 dyoung fr = fatp_next(fat, fq = fr);
567 1.1 dyoung }
568 1.1 dyoung
569 1.1 dyoung if (fr == fp) {
570 1.1 dyoung if (fq) {
571 1.1 dyoung fq->nxt = fp->nxt;
572 1.1 dyoung fp->nxt = 0;
573 1.1 dyoung fatp_free(fat, fp);
574 1.1 dyoung } else {
575 1.1 dyoung KASSERT(fat->hash[hi] == fp);
576 1.1 dyoung
577 1.1 dyoung if (fp->nxt) {
578 1.1 dyoung fat->hash[hi]
579 1.1 dyoung = fatp_next(fat, fp);
580 1.1 dyoung fp->nxt = 0;
581 1.1 dyoung fatp_free(fat, fp);
582 1.1 dyoung } else {
583 1.1 dyoung /* retain for next use.
584 1.1 dyoung */
585 1.1 dyoung ;
586 1.1 dyoung }
587 1.1 dyoung }
588 1.1 dyoung } else {
589 1.1 dyoung fr = fat->hash[hi];
590 1.1 dyoung
591 1.1 dyoung do {
592 1.1 dyoung db_trace(KTR_VTW
593 1.1 dyoung , (fr
594 1.1 dyoung , "fat:*del inuse %5.5x"
595 1.1 dyoung " nxt %x"
596 1.1 dyoung , fr->inuse, fr->nxt));
597 1.1 dyoung
598 1.1 dyoung fr = fatp_next(fat, fq = fr);
599 1.1 dyoung } while (fr && fr != fp);
600 1.1 dyoung
601 1.1 dyoung KASSERT(0 && "oops");
602 1.1 dyoung }
603 1.1 dyoung }
604 1.1 dyoung vtw->key ^= ~0;
605 1.1 dyoung }
606 1.1 dyoung
607 1.1 dyoung if (fat->vtw->is_v4) {
608 1.1 dyoung tag = v4_port_tag(v4->lport);
609 1.1 dyoung } else if (fat->vtw->is_v6) {
610 1.1 dyoung tag = v6_port_tag(v6->lport);
611 1.1 dyoung }
612 1.1 dyoung
613 1.1 dyoung /* Remove from fat->port[]
614 1.1 dyoung */
615 1.1 dyoung key = vtw->port_key;
616 1.1 dyoung slot = fatp_slot_from_key(fat, key);
617 1.1 dyoung fp = fatp_from_key(fat, key);
618 1.1 dyoung idx = vtw_index(ctl, vtw);
619 1.1 dyoung
620 1.1 dyoung db_trace(KTR_VTW
621 1.1 dyoung , (fp, "fatport: del inuse %5.5x"
622 1.1 dyoung " slot %x idx %x key %x tag %x"
623 1.1 dyoung , fp->inuse, slot, idx, key, tag));
624 1.1 dyoung
625 1.1 dyoung KASSERT(fp->inuse & (1 << slot));
626 1.1 dyoung KASSERT(fp->tag[slot] == (tag ^ idx_encode(ctl, idx)
627 1.1 dyoung ^ fatp_xtra[slot]));
628 1.1 dyoung
629 1.1 dyoung if ((fp->inuse & (1 << slot))
630 1.1 dyoung && fp->tag[slot] == (tag ^ idx_encode(ctl, idx)
631 1.1 dyoung ^ fatp_xtra[slot])) {
632 1.1 dyoung fp->inuse ^= 1 << slot;
633 1.1 dyoung fp->tag[slot] = 0;
634 1.1 dyoung
635 1.1 dyoung if (!fp->inuse) {
636 1.1 dyoung uint32_t hi = tag & fat->mask;
637 1.1 dyoung fatp_t *fq = 0;
638 1.1 dyoung fatp_t *fr = fat->port[hi];
639 1.1 dyoung
640 1.1 dyoung while (fr && fr != fp) {
641 1.1 dyoung fr = fatp_next(fat, fq = fr);
642 1.1 dyoung }
643 1.1 dyoung
644 1.1 dyoung if (fr == fp) {
645 1.1 dyoung if (fq) {
646 1.1 dyoung fq->nxt = fp->nxt;
647 1.1 dyoung fp->nxt = 0;
648 1.1 dyoung fatp_free(fat, fp);
649 1.1 dyoung } else {
650 1.1 dyoung KASSERT(fat->port[hi] == fp);
651 1.1 dyoung
652 1.1 dyoung if (fp->nxt) {
653 1.1 dyoung fat->port[hi]
654 1.1 dyoung = fatp_next(fat, fp);
655 1.1 dyoung fp->nxt = 0;
656 1.1 dyoung fatp_free(fat, fp);
657 1.1 dyoung } else {
658 1.1 dyoung /* retain for next use.
659 1.1 dyoung */
660 1.1 dyoung ;
661 1.1 dyoung }
662 1.1 dyoung }
663 1.1 dyoung }
664 1.1 dyoung }
665 1.1 dyoung vtw->port_key ^= ~0;
666 1.1 dyoung }
667 1.1 dyoung
668 1.1 dyoung vtw->hashed = 0;
669 1.1 dyoung }
670 1.1 dyoung
671 1.1 dyoung /*!\brief remove entry from hash, possibly free.
672 1.1 dyoung */
673 1.1 dyoung void
674 1.1 dyoung vtw_del(vtw_ctl_t *ctl, vtw_t *vtw)
675 1.1 dyoung {
676 1.1 dyoung KASSERT(mutex_owned(softnet_lock));
677 1.1 dyoung
678 1.1 dyoung if (vtw->hashed) {
679 1.1 dyoung ++vtw_stats.del;
680 1.1 dyoung vtw_unhash(ctl, vtw);
681 1.1 dyoung }
682 1.1 dyoung
683 1.1 dyoung /* We only delete the oldest entry.
684 1.1 dyoung */
685 1.1 dyoung if (vtw != ctl->oldest.v)
686 1.1 dyoung return;
687 1.1 dyoung
688 1.1 dyoung --ctl->nalloc;
689 1.1 dyoung ++ctl->nfree;
690 1.1 dyoung
691 1.1 dyoung vtw->expire.tv_sec = 0;
692 1.1 dyoung vtw->expire.tv_usec = ~0;
693 1.1 dyoung
694 1.1 dyoung if (!ctl->nalloc)
695 1.1 dyoung ctl->oldest.v = 0;
696 1.1 dyoung
697 1.1 dyoung ctl->oldest.v = vtw_next(ctl, vtw);
698 1.1 dyoung }
699 1.1 dyoung
700 1.4 dholland /*!\brief insert vestigial timewait in hash chain
701 1.1 dyoung */
702 1.1 dyoung static void
703 1.1 dyoung vtw_inshash_v4(vtw_ctl_t *ctl, vtw_t *vtw)
704 1.1 dyoung {
705 1.1 dyoung uint32_t idx = vtw_index(ctl, vtw);
706 1.1 dyoung uint32_t tag;
707 1.1 dyoung vtw_v4_t *v4 = (void*)vtw;
708 1.1 dyoung
709 1.1 dyoung KASSERT(mutex_owned(softnet_lock));
710 1.1 dyoung KASSERT(!vtw->hashed);
711 1.1 dyoung KASSERT(ctl->clidx == vtw->msl_class);
712 1.1 dyoung
713 1.1 dyoung ++vtw_stats.ins;
714 1.1 dyoung
715 1.1 dyoung tag = v4_tag(v4->faddr, v4->fport,
716 1.1 dyoung v4->laddr, v4->lport);
717 1.1 dyoung
718 1.1 dyoung vtw->key = fatp_vtw_inshash(ctl->fat, idx, tag, 0, vtw);
719 1.1 dyoung
720 1.1 dyoung db_trace(KTR_VTW, (ctl
721 1.1 dyoung , "vtw: ins %8.8x:%4.4x %8.8x:%4.4x"
722 1.1 dyoung " tag %8.8x key %8.8x"
723 1.1 dyoung , v4->faddr, v4->fport
724 1.1 dyoung , v4->laddr, v4->lport
725 1.1 dyoung , tag
726 1.1 dyoung , vtw->key));
727 1.1 dyoung
728 1.1 dyoung tag = v4_port_tag(v4->lport);
729 1.1 dyoung vtw->port_key = fatp_vtw_inshash(ctl->fat, idx, tag, 1, vtw);
730 1.1 dyoung
731 1.1 dyoung db_trace(KTR_VTW, (ctl, "vtw: ins %P - %4.4x tag %8.8x key %8.8x"
732 1.1 dyoung , v4->lport, v4->lport
733 1.1 dyoung , tag
734 1.1 dyoung , vtw->key));
735 1.1 dyoung
736 1.1 dyoung vtw->hashed = 1;
737 1.1 dyoung }
738 1.1 dyoung
739 1.4 dholland /*!\brief insert vestigial timewait in hash chain
740 1.1 dyoung */
741 1.1 dyoung static void
742 1.1 dyoung vtw_inshash_v6(vtw_ctl_t *ctl, vtw_t *vtw)
743 1.1 dyoung {
744 1.1 dyoung uint32_t idx = vtw_index(ctl, vtw);
745 1.1 dyoung uint32_t tag;
746 1.1 dyoung vtw_v6_t *v6 = (void*)vtw;
747 1.1 dyoung
748 1.1 dyoung KASSERT(mutex_owned(softnet_lock));
749 1.1 dyoung KASSERT(!vtw->hashed);
750 1.1 dyoung KASSERT(ctl->clidx == vtw->msl_class);
751 1.1 dyoung
752 1.1 dyoung ++vtw_stats.ins;
753 1.1 dyoung
754 1.1 dyoung tag = v6_tag(&v6->faddr, v6->fport,
755 1.1 dyoung &v6->laddr, v6->lport);
756 1.1 dyoung
757 1.1 dyoung vtw->key = fatp_vtw_inshash(ctl->fat, idx, tag, 0, vtw);
758 1.1 dyoung
759 1.1 dyoung tag = v6_port_tag(v6->lport);
760 1.1 dyoung vtw->port_key = fatp_vtw_inshash(ctl->fat, idx, tag, 1, vtw);
761 1.1 dyoung
762 1.1 dyoung db_trace(KTR_VTW, (ctl, "vtw: ins %P - %4.4x tag %8.8x key %8.8x"
763 1.1 dyoung , v6->lport, v6->lport
764 1.1 dyoung , tag
765 1.1 dyoung , vtw->key));
766 1.1 dyoung
767 1.1 dyoung vtw->hashed = 1;
768 1.1 dyoung }
769 1.1 dyoung
770 1.1 dyoung static vtw_t *
771 1.1 dyoung vtw_lookup_hash_v4(vtw_ctl_t *ctl, uint32_t faddr, uint16_t fport
772 1.1 dyoung , uint32_t laddr, uint16_t lport
773 1.1 dyoung , int which)
774 1.1 dyoung {
775 1.1 dyoung vtw_v4_t *v4;
776 1.1 dyoung vtw_t *vtw;
777 1.1 dyoung uint32_t tag;
778 1.1 dyoung fatp_t *fp;
779 1.1 dyoung int i;
780 1.1 dyoung uint32_t fatps = 0, probes = 0, losings = 0;
781 1.1 dyoung
782 1.1 dyoung if (!ctl || !ctl->fat)
783 1.1 dyoung return 0;
784 1.1 dyoung
785 1.1 dyoung ++vtw_stats.look[which];
786 1.1 dyoung
787 1.1 dyoung if (which) {
788 1.1 dyoung tag = v4_port_tag(lport);
789 1.1 dyoung fp = ctl->fat->port[tag & ctl->fat->mask];
790 1.1 dyoung } else {
791 1.1 dyoung tag = v4_tag(faddr, fport, laddr, lport);
792 1.1 dyoung fp = ctl->fat->hash[tag & ctl->fat->mask];
793 1.1 dyoung }
794 1.1 dyoung
795 1.1 dyoung while (fp && fp->inuse) {
796 1.1 dyoung uint32_t inuse = fp->inuse;
797 1.1 dyoung
798 1.1 dyoung ++fatps;
799 1.1 dyoung
800 1.1 dyoung for (i = 0; inuse && i < fatp_ntags(); ++i) {
801 1.1 dyoung uint32_t idx;
802 1.1 dyoung
803 1.1 dyoung if (!(inuse & (1 << i)))
804 1.1 dyoung continue;
805 1.1 dyoung
806 1.1 dyoung inuse ^= 1 << i;
807 1.1 dyoung
808 1.1 dyoung ++probes;
809 1.1 dyoung ++vtw_stats.probe[which];
810 1.1 dyoung
811 1.1 dyoung idx = fp->tag[i] ^ tag ^ fatp_xtra[i];
812 1.1 dyoung vtw = vtw_from_index(ctl, idx);
813 1.1 dyoung
814 1.1 dyoung if (!vtw) {
815 1.1 dyoung /* Hopefully fast path.
816 1.1 dyoung */
817 1.1 dyoung db_trace(KTR_VTW
818 1.1 dyoung , (fp, "vtw: fast %A:%P %A:%P"
819 1.1 dyoung " idx %x tag %x"
820 1.1 dyoung , faddr, fport
821 1.1 dyoung , laddr, lport
822 1.1 dyoung , idx, tag));
823 1.1 dyoung continue;
824 1.1 dyoung }
825 1.1 dyoung
826 1.1 dyoung v4 = (void*)vtw;
827 1.1 dyoung
828 1.1 dyoung /* The de-referencing of vtw is what we want to avoid.
829 1.1 dyoung * Losing.
830 1.1 dyoung */
831 1.1 dyoung if (vtw_alive(vtw)
832 1.1 dyoung && ((which ? vtw->port_key : vtw->key)
833 1.1 dyoung == fatp_key(ctl->fat, fp, i))
834 1.1 dyoung && (which
835 1.1 dyoung || (v4->faddr == faddr && v4->laddr == laddr
836 1.1 dyoung && v4->fport == fport))
837 1.1 dyoung && v4->lport == lport) {
838 1.1 dyoung ++vtw_stats.hit[which];
839 1.1 dyoung
840 1.1 dyoung db_trace(KTR_VTW
841 1.1 dyoung , (fp, "vtw: hit %8.8x:%4.4x"
842 1.1 dyoung " %8.8x:%4.4x idx %x key %x"
843 1.1 dyoung , faddr, fport
844 1.1 dyoung , laddr, lport
845 1.1 dyoung , idx_decode(ctl, idx), vtw->key));
846 1.1 dyoung
847 1.1 dyoung KASSERT(vtw->hashed);
848 1.1 dyoung
849 1.1 dyoung goto out;
850 1.1 dyoung }
851 1.1 dyoung ++vtw_stats.losing[which];
852 1.1 dyoung ++losings;
853 1.1 dyoung
854 1.1 dyoung if (vtw_alive(vtw)) {
855 1.1 dyoung db_trace(KTR_VTW
856 1.1 dyoung , (fp, "vtw:!mis %8.8x:%4.4x"
857 1.1 dyoung " %8.8x:%4.4x key %x tag %x"
858 1.1 dyoung , faddr, fport
859 1.1 dyoung , laddr, lport
860 1.1 dyoung , fatp_key(ctl->fat, fp, i)
861 1.1 dyoung , v4_tag(faddr, fport
862 1.1 dyoung , laddr, lport)));
863 1.1 dyoung db_trace(KTR_VTW
864 1.1 dyoung , (vtw, "vtw:!mis %8.8x:%4.4x"
865 1.1 dyoung " %8.8x:%4.4x key %x tag %x"
866 1.1 dyoung , v4->faddr, v4->fport
867 1.1 dyoung , v4->laddr, v4->lport
868 1.1 dyoung , vtw->key
869 1.1 dyoung , v4_tag(v4->faddr, v4->fport
870 1.1 dyoung , v4->laddr, v4->lport)));
871 1.1 dyoung
872 1.1 dyoung if (vtw->key == fatp_key(ctl->fat, fp, i)) {
873 1.1 dyoung db_trace(KTR_VTW
874 1.1 dyoung , (vtw, "vtw:!mis %8.8x:%4.4x"
875 1.1 dyoung " %8.8x:%4.4x key %x"
876 1.1 dyoung " which %x"
877 1.1 dyoung , v4->faddr, v4->fport
878 1.1 dyoung , v4->laddr, v4->lport
879 1.1 dyoung , vtw->key
880 1.1 dyoung , which));
881 1.1 dyoung
882 1.1 dyoung } else {
883 1.1 dyoung db_trace(KTR_VTW
884 1.1 dyoung , (vtw
885 1.1 dyoung , "vtw:!mis"
886 1.1 dyoung " key %8.8x != %8.8x"
887 1.1 dyoung " idx %x i %x which %x"
888 1.1 dyoung , vtw->key
889 1.1 dyoung , fatp_key(ctl->fat, fp, i)
890 1.1 dyoung , idx_decode(ctl, idx)
891 1.1 dyoung , i
892 1.1 dyoung , which));
893 1.1 dyoung }
894 1.1 dyoung } else {
895 1.1 dyoung db_trace(KTR_VTW
896 1.1 dyoung , (fp
897 1.1 dyoung , "vtw:!mis free entry"
898 1.1 dyoung " idx %x vtw %p which %x"
899 1.1 dyoung , idx_decode(ctl, idx)
900 1.1 dyoung , vtw, which));
901 1.1 dyoung }
902 1.1 dyoung }
903 1.1 dyoung
904 1.1 dyoung if (fp->nxt) {
905 1.1 dyoung fp = fatp_next(ctl->fat, fp);
906 1.1 dyoung } else {
907 1.1 dyoung break;
908 1.1 dyoung }
909 1.1 dyoung }
910 1.1 dyoung ++vtw_stats.miss[which];
911 1.1 dyoung vtw = 0;
912 1.1 dyoung out:
913 1.1 dyoung if (fatps > vtw_stats.max_chain[which])
914 1.1 dyoung vtw_stats.max_chain[which] = fatps;
915 1.1 dyoung if (probes > vtw_stats.max_probe[which])
916 1.1 dyoung vtw_stats.max_probe[which] = probes;
917 1.1 dyoung if (losings > vtw_stats.max_loss[which])
918 1.1 dyoung vtw_stats.max_loss[which] = losings;
919 1.1 dyoung
920 1.1 dyoung return vtw;
921 1.1 dyoung }
922 1.1 dyoung
923 1.1 dyoung static vtw_t *
924 1.1 dyoung vtw_lookup_hash_v6(vtw_ctl_t *ctl, const struct in6_addr *faddr, uint16_t fport
925 1.1 dyoung , const struct in6_addr *laddr, uint16_t lport
926 1.1 dyoung , int which)
927 1.1 dyoung {
928 1.1 dyoung vtw_v6_t *v6;
929 1.1 dyoung vtw_t *vtw;
930 1.1 dyoung uint32_t tag;
931 1.1 dyoung fatp_t *fp;
932 1.1 dyoung int i;
933 1.1 dyoung uint32_t fatps = 0, probes = 0, losings = 0;
934 1.1 dyoung
935 1.1 dyoung ++vtw_stats.look[which];
936 1.1 dyoung
937 1.1 dyoung if (!ctl || !ctl->fat)
938 1.1 dyoung return 0;
939 1.1 dyoung
940 1.1 dyoung if (which) {
941 1.1 dyoung tag = v6_port_tag(lport);
942 1.1 dyoung fp = ctl->fat->port[tag & ctl->fat->mask];
943 1.1 dyoung } else {
944 1.1 dyoung tag = v6_tag(faddr, fport, laddr, lport);
945 1.1 dyoung fp = ctl->fat->hash[tag & ctl->fat->mask];
946 1.1 dyoung }
947 1.1 dyoung
948 1.1 dyoung while (fp && fp->inuse) {
949 1.1 dyoung uint32_t inuse = fp->inuse;
950 1.1 dyoung
951 1.1 dyoung ++fatps;
952 1.1 dyoung
953 1.1 dyoung for (i = 0; inuse && i < fatp_ntags(); ++i) {
954 1.1 dyoung uint32_t idx;
955 1.1 dyoung
956 1.1 dyoung if (!(inuse & (1 << i)))
957 1.1 dyoung continue;
958 1.1 dyoung
959 1.1 dyoung inuse ^= 1 << i;
960 1.1 dyoung
961 1.1 dyoung ++probes;
962 1.1 dyoung ++vtw_stats.probe[which];
963 1.1 dyoung
964 1.1 dyoung idx = fp->tag[i] ^ tag ^ fatp_xtra[i];
965 1.1 dyoung vtw = vtw_from_index(ctl, idx);
966 1.1 dyoung
967 1.1 dyoung db_trace(KTR_VTW
968 1.1 dyoung , (fp, "probe: %2d %6A:%4.4x %6A:%4.4x idx %x"
969 1.1 dyoung , i
970 1.1 dyoung , db_store(faddr, sizeof (*faddr)), fport
971 1.1 dyoung , db_store(laddr, sizeof (*laddr)), lport
972 1.1 dyoung , idx_decode(ctl, idx)));
973 1.1 dyoung
974 1.1 dyoung if (!vtw) {
975 1.1 dyoung /* Hopefully fast path.
976 1.1 dyoung */
977 1.1 dyoung continue;
978 1.1 dyoung }
979 1.1 dyoung
980 1.1 dyoung v6 = (void*)vtw;
981 1.1 dyoung
982 1.1 dyoung if (vtw_alive(vtw)
983 1.1 dyoung && ((which ? vtw->port_key : vtw->key)
984 1.1 dyoung == fatp_key(ctl->fat, fp, i))
985 1.1 dyoung && v6->lport == lport
986 1.1 dyoung && (which
987 1.1 dyoung || (v6->fport == fport
988 1.1 dyoung && !bcmp(&v6->faddr, faddr, sizeof (*faddr))
989 1.1 dyoung && !bcmp(&v6->laddr, laddr
990 1.1 dyoung , sizeof (*laddr))))) {
991 1.1 dyoung ++vtw_stats.hit[which];
992 1.1 dyoung
993 1.1 dyoung KASSERT(vtw->hashed);
994 1.1 dyoung goto out;
995 1.1 dyoung } else {
996 1.1 dyoung ++vtw_stats.losing[which];
997 1.1 dyoung ++losings;
998 1.1 dyoung }
999 1.1 dyoung }
1000 1.1 dyoung
1001 1.1 dyoung if (fp->nxt) {
1002 1.1 dyoung fp = fatp_next(ctl->fat, fp);
1003 1.1 dyoung } else {
1004 1.1 dyoung break;
1005 1.1 dyoung }
1006 1.1 dyoung }
1007 1.1 dyoung ++vtw_stats.miss[which];
1008 1.1 dyoung vtw = 0;
1009 1.1 dyoung out:
1010 1.1 dyoung if (fatps > vtw_stats.max_chain[which])
1011 1.1 dyoung vtw_stats.max_chain[which] = fatps;
1012 1.1 dyoung if (probes > vtw_stats.max_probe[which])
1013 1.1 dyoung vtw_stats.max_probe[which] = probes;
1014 1.1 dyoung if (losings > vtw_stats.max_loss[which])
1015 1.1 dyoung vtw_stats.max_loss[which] = losings;
1016 1.1 dyoung
1017 1.1 dyoung return vtw;
1018 1.1 dyoung }
1019 1.1 dyoung
1020 1.1 dyoung /*!\brief port iterator
1021 1.1 dyoung */
1022 1.1 dyoung static vtw_t *
1023 1.1 dyoung vtw_next_port_v4(struct tcp_ports_iterator *it)
1024 1.1 dyoung {
1025 1.1 dyoung vtw_ctl_t *ctl = it->ctl;
1026 1.1 dyoung vtw_v4_t *v4;
1027 1.1 dyoung vtw_t *vtw;
1028 1.1 dyoung uint32_t tag;
1029 1.1 dyoung uint16_t lport = it->port;
1030 1.1 dyoung fatp_t *fp;
1031 1.1 dyoung int i;
1032 1.1 dyoung uint32_t fatps = 0, probes = 0, losings = 0;
1033 1.1 dyoung
1034 1.1 dyoung tag = v4_port_tag(lport);
1035 1.1 dyoung if (!it->fp) {
1036 1.1 dyoung it->fp = ctl->fat->port[tag & ctl->fat->mask];
1037 1.1 dyoung it->slot_idx = 0;
1038 1.1 dyoung }
1039 1.1 dyoung fp = it->fp;
1040 1.1 dyoung
1041 1.1 dyoung while (fp) {
1042 1.1 dyoung uint32_t inuse = fp->inuse;
1043 1.1 dyoung
1044 1.1 dyoung ++fatps;
1045 1.1 dyoung
1046 1.1 dyoung for (i = it->slot_idx; inuse && i < fatp_ntags(); ++i) {
1047 1.1 dyoung uint32_t idx;
1048 1.1 dyoung
1049 1.1 dyoung if (!(inuse & (1 << i)))
1050 1.1 dyoung continue;
1051 1.1 dyoung
1052 1.9.2.2 jdolecek inuse &= ~0U << i;
1053 1.1 dyoung
1054 1.1 dyoung if (i < it->slot_idx)
1055 1.1 dyoung continue;
1056 1.1 dyoung
1057 1.1 dyoung ++vtw_stats.probe[1];
1058 1.1 dyoung ++probes;
1059 1.1 dyoung
1060 1.1 dyoung idx = fp->tag[i] ^ tag ^ fatp_xtra[i];
1061 1.1 dyoung vtw = vtw_from_index(ctl, idx);
1062 1.1 dyoung
1063 1.1 dyoung if (!vtw) {
1064 1.1 dyoung /* Hopefully fast path.
1065 1.1 dyoung */
1066 1.1 dyoung continue;
1067 1.1 dyoung }
1068 1.1 dyoung
1069 1.1 dyoung v4 = (void*)vtw;
1070 1.1 dyoung
1071 1.1 dyoung if (vtw_alive(vtw)
1072 1.1 dyoung && vtw->port_key == fatp_key(ctl->fat, fp, i)
1073 1.1 dyoung && v4->lport == lport) {
1074 1.1 dyoung ++vtw_stats.hit[1];
1075 1.1 dyoung
1076 1.1 dyoung it->slot_idx = i + 1;
1077 1.1 dyoung
1078 1.1 dyoung goto out;
1079 1.1 dyoung } else if (vtw_alive(vtw)) {
1080 1.1 dyoung ++vtw_stats.losing[1];
1081 1.1 dyoung ++losings;
1082 1.1 dyoung
1083 1.1 dyoung db_trace(KTR_VTW
1084 1.1 dyoung , (vtw, "vtw:!mis"
1085 1.1 dyoung " port %8.8x:%4.4x %8.8x:%4.4x"
1086 1.1 dyoung " key %x port %x"
1087 1.1 dyoung , v4->faddr, v4->fport
1088 1.1 dyoung , v4->laddr, v4->lport
1089 1.1 dyoung , vtw->key
1090 1.1 dyoung , lport));
1091 1.1 dyoung } else {
1092 1.1 dyoung /* Really losing here. We are coming
1093 1.1 dyoung * up with references to free entries.
1094 1.1 dyoung * Might find it better to use
1095 1.1 dyoung * traditional, or need another
1096 1.1 dyoung * add-hockery. The other add-hockery
1097 1.1 dyoung * would be to pul more into into the
1098 1.1 dyoung * cache line to reject the false
1099 1.1 dyoung * hits.
1100 1.1 dyoung */
1101 1.1 dyoung ++vtw_stats.losing[1];
1102 1.1 dyoung ++losings;
1103 1.1 dyoung db_trace(KTR_VTW
1104 1.1 dyoung , (fp, "vtw:!mis port %x"
1105 1.1 dyoung " - free entry idx %x vtw %p"
1106 1.1 dyoung , lport
1107 1.1 dyoung , idx_decode(ctl, idx)
1108 1.1 dyoung , vtw));
1109 1.1 dyoung }
1110 1.1 dyoung }
1111 1.1 dyoung
1112 1.1 dyoung if (fp->nxt) {
1113 1.1 dyoung it->fp = fp = fatp_next(ctl->fat, fp);
1114 1.1 dyoung it->slot_idx = 0;
1115 1.1 dyoung } else {
1116 1.1 dyoung it->fp = 0;
1117 1.1 dyoung break;
1118 1.1 dyoung }
1119 1.1 dyoung }
1120 1.1 dyoung ++vtw_stats.miss[1];
1121 1.1 dyoung
1122 1.1 dyoung vtw = 0;
1123 1.1 dyoung out:
1124 1.1 dyoung if (fatps > vtw_stats.max_chain[1])
1125 1.1 dyoung vtw_stats.max_chain[1] = fatps;
1126 1.1 dyoung if (probes > vtw_stats.max_probe[1])
1127 1.1 dyoung vtw_stats.max_probe[1] = probes;
1128 1.1 dyoung if (losings > vtw_stats.max_loss[1])
1129 1.1 dyoung vtw_stats.max_loss[1] = losings;
1130 1.1 dyoung
1131 1.1 dyoung return vtw;
1132 1.1 dyoung }
1133 1.1 dyoung
1134 1.1 dyoung /*!\brief port iterator
1135 1.1 dyoung */
1136 1.1 dyoung static vtw_t *
1137 1.1 dyoung vtw_next_port_v6(struct tcp_ports_iterator *it)
1138 1.1 dyoung {
1139 1.1 dyoung vtw_ctl_t *ctl = it->ctl;
1140 1.1 dyoung vtw_v6_t *v6;
1141 1.1 dyoung vtw_t *vtw;
1142 1.1 dyoung uint32_t tag;
1143 1.1 dyoung uint16_t lport = it->port;
1144 1.1 dyoung fatp_t *fp;
1145 1.1 dyoung int i;
1146 1.1 dyoung uint32_t fatps = 0, probes = 0, losings = 0;
1147 1.1 dyoung
1148 1.1 dyoung tag = v6_port_tag(lport);
1149 1.1 dyoung if (!it->fp) {
1150 1.1 dyoung it->fp = ctl->fat->port[tag & ctl->fat->mask];
1151 1.1 dyoung it->slot_idx = 0;
1152 1.1 dyoung }
1153 1.1 dyoung fp = it->fp;
1154 1.1 dyoung
1155 1.1 dyoung while (fp) {
1156 1.1 dyoung uint32_t inuse = fp->inuse;
1157 1.1 dyoung
1158 1.1 dyoung ++fatps;
1159 1.1 dyoung
1160 1.1 dyoung for (i = it->slot_idx; inuse && i < fatp_ntags(); ++i) {
1161 1.1 dyoung uint32_t idx;
1162 1.1 dyoung
1163 1.1 dyoung if (!(inuse & (1 << i)))
1164 1.1 dyoung continue;
1165 1.1 dyoung
1166 1.9.2.2 jdolecek inuse &= ~0U << i;
1167 1.1 dyoung
1168 1.1 dyoung if (i < it->slot_idx)
1169 1.1 dyoung continue;
1170 1.1 dyoung
1171 1.1 dyoung ++vtw_stats.probe[1];
1172 1.1 dyoung ++probes;
1173 1.1 dyoung
1174 1.1 dyoung idx = fp->tag[i] ^ tag ^ fatp_xtra[i];
1175 1.1 dyoung vtw = vtw_from_index(ctl, idx);
1176 1.1 dyoung
1177 1.1 dyoung if (!vtw) {
1178 1.1 dyoung /* Hopefully fast path.
1179 1.1 dyoung */
1180 1.1 dyoung continue;
1181 1.1 dyoung }
1182 1.1 dyoung
1183 1.1 dyoung v6 = (void*)vtw;
1184 1.1 dyoung
1185 1.1 dyoung db_trace(KTR_VTW
1186 1.1 dyoung , (vtw, "vtw: i %x idx %x fp->tag %x"
1187 1.1 dyoung " tag %x xtra %x"
1188 1.1 dyoung , i, idx_decode(ctl, idx)
1189 1.1 dyoung , fp->tag[i], tag, fatp_xtra[i]));
1190 1.1 dyoung
1191 1.1 dyoung if (vtw_alive(vtw)
1192 1.1 dyoung && vtw->port_key == fatp_key(ctl->fat, fp, i)
1193 1.1 dyoung && v6->lport == lport) {
1194 1.1 dyoung ++vtw_stats.hit[1];
1195 1.1 dyoung
1196 1.1 dyoung db_trace(KTR_VTW
1197 1.1 dyoung , (fp, "vtw: nxt port %P - %4.4x"
1198 1.1 dyoung " idx %x key %x"
1199 1.1 dyoung , lport, lport
1200 1.1 dyoung , idx_decode(ctl, idx), vtw->key));
1201 1.1 dyoung
1202 1.1 dyoung it->slot_idx = i + 1;
1203 1.1 dyoung goto out;
1204 1.1 dyoung } else if (vtw_alive(vtw)) {
1205 1.1 dyoung ++vtw_stats.losing[1];
1206 1.1 dyoung
1207 1.1 dyoung db_trace(KTR_VTW
1208 1.1 dyoung , (vtw, "vtw:!mis port %6A:%4.4x"
1209 1.1 dyoung " %6A:%4.4x key %x port %x"
1210 1.1 dyoung , db_store(&v6->faddr
1211 1.1 dyoung , sizeof (v6->faddr))
1212 1.1 dyoung , v6->fport
1213 1.1 dyoung , db_store(&v6->laddr
1214 1.1 dyoung , sizeof (v6->faddr))
1215 1.1 dyoung , v6->lport
1216 1.1 dyoung , vtw->key
1217 1.1 dyoung , lport));
1218 1.1 dyoung } else {
1219 1.1 dyoung /* Really losing here. We are coming
1220 1.1 dyoung * up with references to free entries.
1221 1.1 dyoung * Might find it better to use
1222 1.1 dyoung * traditional, or need another
1223 1.1 dyoung * add-hockery. The other add-hockery
1224 1.1 dyoung * would be to pul more into into the
1225 1.1 dyoung * cache line to reject the false
1226 1.1 dyoung * hits.
1227 1.1 dyoung */
1228 1.1 dyoung ++vtw_stats.losing[1];
1229 1.1 dyoung ++losings;
1230 1.1 dyoung
1231 1.1 dyoung db_trace(KTR_VTW
1232 1.1 dyoung , (fp
1233 1.1 dyoung , "vtw:!mis port %x"
1234 1.1 dyoung " - free entry idx %x vtw %p"
1235 1.1 dyoung , lport, idx_decode(ctl, idx)
1236 1.1 dyoung , vtw));
1237 1.1 dyoung }
1238 1.1 dyoung }
1239 1.1 dyoung
1240 1.1 dyoung if (fp->nxt) {
1241 1.1 dyoung it->fp = fp = fatp_next(ctl->fat, fp);
1242 1.1 dyoung it->slot_idx = 0;
1243 1.1 dyoung } else {
1244 1.1 dyoung it->fp = 0;
1245 1.1 dyoung break;
1246 1.1 dyoung }
1247 1.1 dyoung }
1248 1.1 dyoung ++vtw_stats.miss[1];
1249 1.1 dyoung
1250 1.1 dyoung vtw = 0;
1251 1.1 dyoung out:
1252 1.1 dyoung if (fatps > vtw_stats.max_chain[1])
1253 1.1 dyoung vtw_stats.max_chain[1] = fatps;
1254 1.1 dyoung if (probes > vtw_stats.max_probe[1])
1255 1.1 dyoung vtw_stats.max_probe[1] = probes;
1256 1.1 dyoung if (losings > vtw_stats.max_loss[1])
1257 1.1 dyoung vtw_stats.max_loss[1] = losings;
1258 1.1 dyoung
1259 1.1 dyoung return vtw;
1260 1.1 dyoung }
1261 1.1 dyoung
1262 1.1 dyoung /*!\brief initialise the VTW allocation arena
1263 1.1 dyoung *
1264 1.1 dyoung * There are 1+3 allocation classes:
1265 1.1 dyoung * 0 classless
1266 1.1 dyoung * {1,2,3} MSL-class based allocation
1267 1.1 dyoung *
1268 1.1 dyoung * The allocation arenas are all initialised. Classless gets all the
1269 1.1 dyoung * space. MSL-class based divides the arena, so that allocation
1270 1.1 dyoung * within a class can proceed without having to consider entries
1271 1.1 dyoung * (aka: cache lines) from different classes.
1272 1.1 dyoung *
1273 1.1 dyoung * Usually, we are completely classless or class-based, but there can be
1274 1.1 dyoung * transition periods, corresponding to dynamic adjustments in the config
1275 1.1 dyoung * by the operator.
1276 1.1 dyoung */
1277 1.1 dyoung static void
1278 1.6 dyoung vtw_init(fatp_ctl_t *fat, vtw_ctl_t *ctl, const uint32_t n, vtw_t *ctl_base_v)
1279 1.1 dyoung {
1280 1.6 dyoung int class_n, i;
1281 1.6 dyoung vtw_t *base;
1282 1.1 dyoung
1283 1.6 dyoung ctl->base.v = ctl_base_v;
1284 1.1 dyoung
1285 1.6 dyoung if (ctl->is_v4) {
1286 1.6 dyoung ctl->lim.v4 = ctl->base.v4 + n - 1;
1287 1.6 dyoung ctl->alloc.v4 = ctl->base.v4;
1288 1.6 dyoung } else {
1289 1.6 dyoung ctl->lim.v6 = ctl->base.v6 + n - 1;
1290 1.6 dyoung ctl->alloc.v6 = ctl->base.v6;
1291 1.6 dyoung }
1292 1.1 dyoung
1293 1.6 dyoung ctl->nfree = n;
1294 1.6 dyoung ctl->ctl = ctl;
1295 1.1 dyoung
1296 1.6 dyoung ctl->idx_bits = 32;
1297 1.6 dyoung for (ctl->idx_mask = ~0; (ctl->idx_mask & (n-1)) == n-1; ) {
1298 1.6 dyoung ctl->idx_mask >>= 1;
1299 1.6 dyoung ctl->idx_bits -= 1;
1300 1.6 dyoung }
1301 1.1 dyoung
1302 1.6 dyoung ctl->idx_mask <<= 1;
1303 1.6 dyoung ctl->idx_mask |= 1;
1304 1.6 dyoung ctl->idx_bits += 1;
1305 1.1 dyoung
1306 1.6 dyoung ctl->fat = fat;
1307 1.6 dyoung fat->vtw = ctl;
1308 1.1 dyoung
1309 1.6 dyoung /* Divide the resources equally amongst the classes.
1310 1.6 dyoung * This is not optimal, as the different classes
1311 1.6 dyoung * arrive and leave at different rates, but it is
1312 1.6 dyoung * the best I can do for now.
1313 1.6 dyoung */
1314 1.6 dyoung class_n = n / (VTW_NCLASS-1);
1315 1.6 dyoung base = ctl->base.v;
1316 1.1 dyoung
1317 1.6 dyoung for (i = 1; i < VTW_NCLASS; ++i) {
1318 1.6 dyoung int j;
1319 1.1 dyoung
1320 1.6 dyoung ctl[i] = ctl[0];
1321 1.6 dyoung ctl[i].clidx = i;
1322 1.1 dyoung
1323 1.6 dyoung ctl[i].base.v = base;
1324 1.6 dyoung ctl[i].alloc = ctl[i].base;
1325 1.1 dyoung
1326 1.6 dyoung for (j = 0; j < class_n - 1; ++j) {
1327 1.6 dyoung if (tcp_msl_enable)
1328 1.6 dyoung base->msl_class = i;
1329 1.1 dyoung base = vtw_next(ctl, base);
1330 1.1 dyoung }
1331 1.6 dyoung
1332 1.6 dyoung ctl[i].lim.v = base;
1333 1.6 dyoung base = vtw_next(ctl, base);
1334 1.6 dyoung ctl[i].nfree = class_n;
1335 1.1 dyoung }
1336 1.1 dyoung
1337 1.1 dyoung vtw_debug_init();
1338 1.1 dyoung }
1339 1.1 dyoung
1340 1.1 dyoung /*!\brief map class to TCP MSL
1341 1.1 dyoung */
1342 1.1 dyoung static inline uint32_t
1343 1.9.2.2 jdolecek class_to_msl(int msl_class)
1344 1.1 dyoung {
1345 1.9.2.2 jdolecek switch (msl_class) {
1346 1.1 dyoung case 0:
1347 1.1 dyoung case 1:
1348 1.1 dyoung return tcp_msl_remote ? tcp_msl_remote : (TCPTV_MSL >> 0);
1349 1.1 dyoung case 2:
1350 1.1 dyoung return tcp_msl_local ? tcp_msl_local : (TCPTV_MSL >> 1);
1351 1.1 dyoung default:
1352 1.1 dyoung return tcp_msl_loop ? tcp_msl_loop : (TCPTV_MSL >> 2);
1353 1.1 dyoung }
1354 1.1 dyoung }
1355 1.1 dyoung
1356 1.1 dyoung /*!\brief map TCP MSL to class
1357 1.1 dyoung */
1358 1.1 dyoung static inline uint32_t
1359 1.1 dyoung msl_to_class(int msl)
1360 1.1 dyoung {
1361 1.1 dyoung if (tcp_msl_enable) {
1362 1.1 dyoung if (msl <= (tcp_msl_loop ? tcp_msl_loop : (TCPTV_MSL >> 2)))
1363 1.1 dyoung return 1+2;
1364 1.1 dyoung if (msl <= (tcp_msl_local ? tcp_msl_local : (TCPTV_MSL >> 1)))
1365 1.1 dyoung return 1+1;
1366 1.1 dyoung return 1;
1367 1.1 dyoung }
1368 1.1 dyoung return 0;
1369 1.1 dyoung }
1370 1.1 dyoung
1371 1.1 dyoung /*!\brief allocate a vtw entry
1372 1.1 dyoung */
1373 1.1 dyoung static inline vtw_t *
1374 1.1 dyoung vtw_alloc(vtw_ctl_t *ctl)
1375 1.1 dyoung {
1376 1.1 dyoung vtw_t *vtw = 0;
1377 1.1 dyoung int stuck = 0;
1378 1.1 dyoung int avail = ctl ? (ctl->nalloc + ctl->nfree) : 0;
1379 1.1 dyoung int msl;
1380 1.1 dyoung
1381 1.1 dyoung KASSERT(mutex_owned(softnet_lock));
1382 1.1 dyoung
1383 1.1 dyoung /* If no resources, we will not get far.
1384 1.1 dyoung */
1385 1.1 dyoung if (!ctl || !ctl->base.v4 || avail <= 0)
1386 1.1 dyoung return 0;
1387 1.1 dyoung
1388 1.1 dyoung /* Obtain a free one.
1389 1.1 dyoung */
1390 1.1 dyoung while (!ctl->nfree) {
1391 1.1 dyoung vtw_age(ctl, 0);
1392 1.1 dyoung
1393 1.1 dyoung if (++stuck > avail) {
1394 1.1 dyoung /* When in transition between
1395 1.1 dyoung * schemes (classless, classed) we
1396 1.1 dyoung * can be stuck having to await the
1397 1.1 dyoung * expiration of cross-allocated entries.
1398 1.1 dyoung *
1399 1.1 dyoung * Returning zero means we will fall back to the
1400 1.1 dyoung * traditional TIME_WAIT handling, except in the
1401 1.1 dyoung * case of a re-shed, in which case we cannot
1402 1.1 dyoung * perform the reshecd, but will retain the extant
1403 1.1 dyoung * entry.
1404 1.1 dyoung */
1405 1.1 dyoung db_trace(KTR_VTW
1406 1.1 dyoung , (ctl, "vtw:!none free in class %x %x/%x"
1407 1.1 dyoung , ctl->clidx
1408 1.1 dyoung , ctl->nalloc, ctl->nfree));
1409 1.1 dyoung
1410 1.1 dyoung return 0;
1411 1.1 dyoung }
1412 1.1 dyoung }
1413 1.1 dyoung
1414 1.1 dyoung vtw = ctl->alloc.v;
1415 1.1 dyoung
1416 1.1 dyoung if (vtw->msl_class != ctl->clidx) {
1417 1.1 dyoung /* Usurping rules:
1418 1.1 dyoung * 0 -> {1,2,3} or {1,2,3} -> 0
1419 1.1 dyoung */
1420 1.1 dyoung KASSERT(!vtw->msl_class || !ctl->clidx);
1421 1.1 dyoung
1422 1.1 dyoung if (vtw->hashed || vtw->expire.tv_sec) {
1423 1.1 dyoung /* As this is owned by some other class,
1424 1.1 dyoung * we must wait for it to expire it.
1425 1.1 dyoung * This will only happen on class/classless
1426 1.1 dyoung * transitions, which are guaranteed to progress
1427 1.1 dyoung * to completion in small finite time, barring bugs.
1428 1.1 dyoung */
1429 1.1 dyoung db_trace(KTR_VTW
1430 1.1 dyoung , (ctl, "vtw:!%p class %x!=%x %x:%x%s"
1431 1.1 dyoung , vtw, vtw->msl_class, ctl->clidx
1432 1.1 dyoung , vtw->expire.tv_sec
1433 1.1 dyoung , vtw->expire.tv_usec
1434 1.1 dyoung , vtw->hashed ? " hashed" : ""));
1435 1.1 dyoung
1436 1.1 dyoung return 0;
1437 1.1 dyoung }
1438 1.1 dyoung
1439 1.1 dyoung db_trace(KTR_VTW
1440 1.1 dyoung , (ctl, "vtw:!%p usurped from %x to %x"
1441 1.1 dyoung , vtw, vtw->msl_class, ctl->clidx));
1442 1.1 dyoung
1443 1.1 dyoung vtw->msl_class = ctl->clidx;
1444 1.1 dyoung }
1445 1.1 dyoung
1446 1.1 dyoung if (vtw_alive(vtw)) {
1447 1.1 dyoung KASSERT(0 && "next free not free");
1448 1.1 dyoung return 0;
1449 1.1 dyoung }
1450 1.1 dyoung
1451 1.1 dyoung /* Advance allocation poiter.
1452 1.1 dyoung */
1453 1.1 dyoung ctl->alloc.v = vtw_next(ctl, vtw);
1454 1.1 dyoung
1455 1.1 dyoung --ctl->nfree;
1456 1.1 dyoung ++ctl->nalloc;
1457 1.1 dyoung
1458 1.1 dyoung msl = (2 * class_to_msl(ctl->clidx) * 1000) / PR_SLOWHZ; // msec
1459 1.1 dyoung
1460 1.1 dyoung /* mark expiration
1461 1.1 dyoung */
1462 1.3 drochner getmicrouptime(&vtw->expire);
1463 1.1 dyoung
1464 1.1 dyoung /* Move expiration into the future.
1465 1.1 dyoung */
1466 1.1 dyoung vtw->expire.tv_sec += msl / 1000;
1467 1.1 dyoung vtw->expire.tv_usec += 1000 * (msl % 1000);
1468 1.1 dyoung
1469 1.1 dyoung while (vtw->expire.tv_usec >= 1000*1000) {
1470 1.1 dyoung vtw->expire.tv_usec -= 1000*1000;
1471 1.1 dyoung vtw->expire.tv_sec += 1;
1472 1.1 dyoung }
1473 1.1 dyoung
1474 1.1 dyoung if (!ctl->oldest.v)
1475 1.1 dyoung ctl->oldest.v = vtw;
1476 1.1 dyoung
1477 1.1 dyoung return vtw;
1478 1.1 dyoung }
1479 1.1 dyoung
1480 1.1 dyoung /*!\brief expiration
1481 1.1 dyoung */
1482 1.1 dyoung static int
1483 1.1 dyoung vtw_age(vtw_ctl_t *ctl, struct timeval *_when)
1484 1.1 dyoung {
1485 1.1 dyoung vtw_t *vtw;
1486 1.1 dyoung struct timeval then, *when = _when;
1487 1.1 dyoung int maxtries = 0;
1488 1.1 dyoung
1489 1.1 dyoung if (!ctl->oldest.v) {
1490 1.1 dyoung KASSERT(!ctl->nalloc);
1491 1.1 dyoung return 0;
1492 1.1 dyoung }
1493 1.1 dyoung
1494 1.1 dyoung for (vtw = ctl->oldest.v; vtw && ctl->nalloc; ) {
1495 1.1 dyoung if (++maxtries > ctl->nalloc)
1496 1.1 dyoung break;
1497 1.1 dyoung
1498 1.1 dyoung if (vtw->msl_class != ctl->clidx) {
1499 1.1 dyoung db_trace(KTR_VTW
1500 1.1 dyoung , (vtw, "vtw:!age class mismatch %x != %x"
1501 1.1 dyoung , vtw->msl_class, ctl->clidx));
1502 1.1 dyoung /* XXXX
1503 1.1 dyoung * See if the appropriate action is to skip to the next.
1504 1.1 dyoung * XXXX
1505 1.1 dyoung */
1506 1.1 dyoung ctl->oldest.v = vtw = vtw_next(ctl, vtw);
1507 1.1 dyoung continue;
1508 1.1 dyoung }
1509 1.1 dyoung if (!when) {
1510 1.1 dyoung /* Latch oldest timeval if none specified.
1511 1.1 dyoung */
1512 1.1 dyoung then = vtw->expire;
1513 1.1 dyoung when = &then;
1514 1.1 dyoung }
1515 1.1 dyoung
1516 1.1 dyoung if (!timercmp(&vtw->expire, when, <=))
1517 1.1 dyoung break;
1518 1.1 dyoung
1519 1.1 dyoung db_trace(KTR_VTW
1520 1.1 dyoung , (vtw, "vtw: expire %x %8.8x:%8.8x %x/%x"
1521 1.1 dyoung , ctl->clidx
1522 1.1 dyoung , vtw->expire.tv_sec
1523 1.1 dyoung , vtw->expire.tv_usec
1524 1.1 dyoung , ctl->nalloc
1525 1.1 dyoung , ctl->nfree));
1526 1.1 dyoung
1527 1.1 dyoung if (!_when)
1528 1.1 dyoung ++vtw_stats.kill;
1529 1.1 dyoung
1530 1.1 dyoung vtw_del(ctl, vtw);
1531 1.1 dyoung vtw = ctl->oldest.v;
1532 1.1 dyoung }
1533 1.1 dyoung
1534 1.1 dyoung return ctl->nalloc; // # remaining allocated
1535 1.1 dyoung }
1536 1.1 dyoung
1537 1.1 dyoung static callout_t vtw_cs;
1538 1.1 dyoung
1539 1.1 dyoung /*!\brief notice the passage of time.
1540 1.1 dyoung * It seems to be getting faster. What happened to the year?
1541 1.1 dyoung */
1542 1.1 dyoung static void
1543 1.1 dyoung vtw_tick(void *arg)
1544 1.1 dyoung {
1545 1.1 dyoung struct timeval now;
1546 1.1 dyoung int i, cnt = 0;
1547 1.1 dyoung
1548 1.3 drochner getmicrouptime(&now);
1549 1.1 dyoung
1550 1.1 dyoung db_trace(KTR_VTW, (arg, "vtk: tick - now %8.8x:%8.8x"
1551 1.1 dyoung , now.tv_sec, now.tv_usec));
1552 1.1 dyoung
1553 1.1 dyoung mutex_enter(softnet_lock);
1554 1.1 dyoung
1555 1.1 dyoung for (i = 0; i < VTW_NCLASS; ++i) {
1556 1.1 dyoung cnt += vtw_age(&vtw_tcpv4[i], &now);
1557 1.1 dyoung cnt += vtw_age(&vtw_tcpv6[i], &now);
1558 1.1 dyoung }
1559 1.1 dyoung
1560 1.1 dyoung /* Keep ticks coming while we need them.
1561 1.1 dyoung */
1562 1.1 dyoung if (cnt)
1563 1.1 dyoung callout_schedule(&vtw_cs, hz / 5);
1564 1.1 dyoung else {
1565 1.1 dyoung tcp_vtw_was_enabled = 0;
1566 1.1 dyoung tcbtable.vestige = 0;
1567 1.1 dyoung }
1568 1.1 dyoung mutex_exit(softnet_lock);
1569 1.1 dyoung }
1570 1.1 dyoung
1571 1.1 dyoung /* in_pcblookup_ports assist for handling vestigial entries.
1572 1.1 dyoung */
1573 1.1 dyoung static void *
1574 1.1 dyoung tcp_init_ports_v4(struct in_addr addr, u_int port, int wild)
1575 1.1 dyoung {
1576 1.1 dyoung struct tcp_ports_iterator *it = &tcp_ports_iterator_v4;
1577 1.1 dyoung
1578 1.1 dyoung bzero(it, sizeof (*it));
1579 1.1 dyoung
1580 1.1 dyoung /* Note: the reference to vtw_tcpv4[0] is fine.
1581 1.1 dyoung * We do not need per-class iteration. We just
1582 1.1 dyoung * need to get to the fat, and there is one
1583 1.1 dyoung * shared fat.
1584 1.1 dyoung */
1585 1.1 dyoung if (vtw_tcpv4[0].fat) {
1586 1.1 dyoung it->addr.v4 = addr;
1587 1.1 dyoung it->port = port;
1588 1.1 dyoung it->wild = !!wild;
1589 1.1 dyoung it->ctl = &vtw_tcpv4[0];
1590 1.1 dyoung
1591 1.1 dyoung ++vtw_stats.look[1];
1592 1.1 dyoung }
1593 1.1 dyoung
1594 1.1 dyoung return it;
1595 1.1 dyoung }
1596 1.1 dyoung
1597 1.1 dyoung /*!\brief export an IPv4 vtw.
1598 1.1 dyoung */
1599 1.1 dyoung static int
1600 1.1 dyoung vtw_export_v4(vtw_ctl_t *ctl, vtw_t *vtw, vestigial_inpcb_t *res)
1601 1.1 dyoung {
1602 1.1 dyoung vtw_v4_t *v4 = (void*)vtw;
1603 1.1 dyoung
1604 1.1 dyoung bzero(res, sizeof (*res));
1605 1.1 dyoung
1606 1.1 dyoung if (ctl && vtw) {
1607 1.1 dyoung if (!ctl->clidx && vtw->msl_class)
1608 1.1 dyoung ctl += vtw->msl_class;
1609 1.1 dyoung else
1610 1.1 dyoung KASSERT(ctl->clidx == vtw->msl_class);
1611 1.1 dyoung
1612 1.1 dyoung res->valid = 1;
1613 1.1 dyoung res->v4 = 1;
1614 1.1 dyoung
1615 1.1 dyoung res->faddr.v4.s_addr = v4->faddr;
1616 1.1 dyoung res->laddr.v4.s_addr = v4->laddr;
1617 1.1 dyoung res->fport = v4->fport;
1618 1.1 dyoung res->lport = v4->lport;
1619 1.1 dyoung res->vtw = vtw; // netlock held over call(s)
1620 1.1 dyoung res->ctl = ctl;
1621 1.1 dyoung res->reuse_addr = vtw->reuse_addr;
1622 1.1 dyoung res->reuse_port = vtw->reuse_port;
1623 1.1 dyoung res->snd_nxt = vtw->snd_nxt;
1624 1.1 dyoung res->rcv_nxt = vtw->rcv_nxt;
1625 1.1 dyoung res->rcv_wnd = vtw->rcv_wnd;
1626 1.1 dyoung res->uid = vtw->uid;
1627 1.1 dyoung }
1628 1.1 dyoung
1629 1.1 dyoung return res->valid;
1630 1.1 dyoung }
1631 1.1 dyoung
1632 1.1 dyoung /*!\brief return next port in the port iterator. yowza.
1633 1.1 dyoung */
1634 1.1 dyoung static int
1635 1.1 dyoung tcp_next_port_v4(void *arg, struct vestigial_inpcb *res)
1636 1.1 dyoung {
1637 1.1 dyoung struct tcp_ports_iterator *it = arg;
1638 1.1 dyoung vtw_t *vtw = 0;
1639 1.1 dyoung
1640 1.1 dyoung if (it->ctl)
1641 1.1 dyoung vtw = vtw_next_port_v4(it);
1642 1.1 dyoung
1643 1.1 dyoung if (!vtw)
1644 1.1 dyoung it->ctl = 0;
1645 1.1 dyoung
1646 1.1 dyoung return vtw_export_v4(it->ctl, vtw, res);
1647 1.1 dyoung }
1648 1.1 dyoung
1649 1.1 dyoung static int
1650 1.1 dyoung tcp_lookup_v4(struct in_addr faddr, uint16_t fport,
1651 1.1 dyoung struct in_addr laddr, uint16_t lport,
1652 1.1 dyoung struct vestigial_inpcb *res)
1653 1.1 dyoung {
1654 1.1 dyoung vtw_t *vtw;
1655 1.1 dyoung vtw_ctl_t *ctl;
1656 1.1 dyoung
1657 1.1 dyoung
1658 1.1 dyoung db_trace(KTR_VTW
1659 1.1 dyoung , (res, "vtw: lookup %A:%P %A:%P"
1660 1.1 dyoung , faddr, fport
1661 1.1 dyoung , laddr, lport));
1662 1.1 dyoung
1663 1.1 dyoung vtw = vtw_lookup_hash_v4((ctl = &vtw_tcpv4[0])
1664 1.1 dyoung , faddr.s_addr, fport
1665 1.1 dyoung , laddr.s_addr, lport, 0);
1666 1.1 dyoung
1667 1.1 dyoung return vtw_export_v4(ctl, vtw, res);
1668 1.1 dyoung }
1669 1.1 dyoung
1670 1.1 dyoung /* in_pcblookup_ports assist for handling vestigial entries.
1671 1.1 dyoung */
1672 1.1 dyoung static void *
1673 1.1 dyoung tcp_init_ports_v6(const struct in6_addr *addr, u_int port, int wild)
1674 1.1 dyoung {
1675 1.1 dyoung struct tcp_ports_iterator *it = &tcp_ports_iterator_v6;
1676 1.1 dyoung
1677 1.1 dyoung bzero(it, sizeof (*it));
1678 1.1 dyoung
1679 1.1 dyoung /* Note: the reference to vtw_tcpv6[0] is fine.
1680 1.1 dyoung * We do not need per-class iteration. We just
1681 1.1 dyoung * need to get to the fat, and there is one
1682 1.1 dyoung * shared fat.
1683 1.1 dyoung */
1684 1.1 dyoung if (vtw_tcpv6[0].fat) {
1685 1.1 dyoung it->addr.v6 = *addr;
1686 1.1 dyoung it->port = port;
1687 1.1 dyoung it->wild = !!wild;
1688 1.1 dyoung it->ctl = &vtw_tcpv6[0];
1689 1.1 dyoung
1690 1.1 dyoung ++vtw_stats.look[1];
1691 1.1 dyoung }
1692 1.1 dyoung
1693 1.1 dyoung return it;
1694 1.1 dyoung }
1695 1.1 dyoung
1696 1.1 dyoung /*!\brief export an IPv6 vtw.
1697 1.1 dyoung */
1698 1.1 dyoung static int
1699 1.1 dyoung vtw_export_v6(vtw_ctl_t *ctl, vtw_t *vtw, vestigial_inpcb_t *res)
1700 1.1 dyoung {
1701 1.1 dyoung vtw_v6_t *v6 = (void*)vtw;
1702 1.1 dyoung
1703 1.1 dyoung bzero(res, sizeof (*res));
1704 1.1 dyoung
1705 1.1 dyoung if (ctl && vtw) {
1706 1.1 dyoung if (!ctl->clidx && vtw->msl_class)
1707 1.1 dyoung ctl += vtw->msl_class;
1708 1.1 dyoung else
1709 1.1 dyoung KASSERT(ctl->clidx == vtw->msl_class);
1710 1.1 dyoung
1711 1.1 dyoung res->valid = 1;
1712 1.1 dyoung res->v4 = 0;
1713 1.1 dyoung
1714 1.1 dyoung res->faddr.v6 = v6->faddr;
1715 1.1 dyoung res->laddr.v6 = v6->laddr;
1716 1.1 dyoung res->fport = v6->fport;
1717 1.1 dyoung res->lport = v6->lport;
1718 1.1 dyoung res->vtw = vtw; // netlock held over call(s)
1719 1.1 dyoung res->ctl = ctl;
1720 1.1 dyoung
1721 1.1 dyoung res->v6only = vtw->v6only;
1722 1.1 dyoung res->reuse_addr = vtw->reuse_addr;
1723 1.1 dyoung res->reuse_port = vtw->reuse_port;
1724 1.1 dyoung
1725 1.1 dyoung res->snd_nxt = vtw->snd_nxt;
1726 1.1 dyoung res->rcv_nxt = vtw->rcv_nxt;
1727 1.1 dyoung res->rcv_wnd = vtw->rcv_wnd;
1728 1.1 dyoung res->uid = vtw->uid;
1729 1.1 dyoung }
1730 1.1 dyoung
1731 1.1 dyoung return res->valid;
1732 1.1 dyoung }
1733 1.1 dyoung
1734 1.1 dyoung static int
1735 1.1 dyoung tcp_next_port_v6(void *arg, struct vestigial_inpcb *res)
1736 1.1 dyoung {
1737 1.1 dyoung struct tcp_ports_iterator *it = arg;
1738 1.1 dyoung vtw_t *vtw = 0;
1739 1.1 dyoung
1740 1.1 dyoung if (it->ctl)
1741 1.1 dyoung vtw = vtw_next_port_v6(it);
1742 1.1 dyoung
1743 1.1 dyoung if (!vtw)
1744 1.1 dyoung it->ctl = 0;
1745 1.1 dyoung
1746 1.1 dyoung return vtw_export_v6(it->ctl, vtw, res);
1747 1.1 dyoung }
1748 1.1 dyoung
1749 1.1 dyoung static int
1750 1.1 dyoung tcp_lookup_v6(const struct in6_addr *faddr, uint16_t fport,
1751 1.1 dyoung const struct in6_addr *laddr, uint16_t lport,
1752 1.1 dyoung struct vestigial_inpcb *res)
1753 1.1 dyoung {
1754 1.1 dyoung vtw_ctl_t *ctl;
1755 1.1 dyoung vtw_t *vtw;
1756 1.1 dyoung
1757 1.1 dyoung db_trace(KTR_VTW
1758 1.1 dyoung , (res, "vtw: lookup %6A:%P %6A:%P"
1759 1.1 dyoung , db_store(faddr, sizeof (*faddr)), fport
1760 1.1 dyoung , db_store(laddr, sizeof (*laddr)), lport));
1761 1.1 dyoung
1762 1.1 dyoung vtw = vtw_lookup_hash_v6((ctl = &vtw_tcpv6[0])
1763 1.1 dyoung , faddr, fport
1764 1.1 dyoung , laddr, lport, 0);
1765 1.1 dyoung
1766 1.1 dyoung return vtw_export_v6(ctl, vtw, res);
1767 1.1 dyoung }
1768 1.1 dyoung
1769 1.1 dyoung static vestigial_hooks_t tcp_hooks = {
1770 1.1 dyoung .init_ports4 = tcp_init_ports_v4,
1771 1.1 dyoung .next_port4 = tcp_next_port_v4,
1772 1.1 dyoung .lookup4 = tcp_lookup_v4,
1773 1.1 dyoung .init_ports6 = tcp_init_ports_v6,
1774 1.1 dyoung .next_port6 = tcp_next_port_v6,
1775 1.1 dyoung .lookup6 = tcp_lookup_v6,
1776 1.1 dyoung };
1777 1.1 dyoung
1778 1.1 dyoung static bool
1779 1.1 dyoung vtw_select(int af, fatp_ctl_t **fatp, vtw_ctl_t **ctlp)
1780 1.1 dyoung {
1781 1.1 dyoung fatp_ctl_t *fat;
1782 1.1 dyoung vtw_ctl_t *ctl;
1783 1.1 dyoung
1784 1.1 dyoung switch (af) {
1785 1.1 dyoung case AF_INET:
1786 1.1 dyoung fat = &fat_tcpv4;
1787 1.1 dyoung ctl = &vtw_tcpv4[0];
1788 1.1 dyoung break;
1789 1.1 dyoung case AF_INET6:
1790 1.1 dyoung fat = &fat_tcpv6;
1791 1.1 dyoung ctl = &vtw_tcpv6[0];
1792 1.1 dyoung break;
1793 1.1 dyoung default:
1794 1.1 dyoung return false;
1795 1.1 dyoung }
1796 1.1 dyoung if (fatp != NULL)
1797 1.1 dyoung *fatp = fat;
1798 1.1 dyoung if (ctlp != NULL)
1799 1.1 dyoung *ctlp = ctl;
1800 1.1 dyoung return true;
1801 1.1 dyoung }
1802 1.1 dyoung
1803 1.1 dyoung /*!\brief initialize controlling instance
1804 1.1 dyoung */
1805 1.1 dyoung static int
1806 1.1 dyoung vtw_control_init(int af)
1807 1.1 dyoung {
1808 1.1 dyoung fatp_ctl_t *fat;
1809 1.1 dyoung vtw_ctl_t *ctl;
1810 1.6 dyoung fatp_t *fat_base;
1811 1.6 dyoung fatp_t **fat_hash;
1812 1.6 dyoung vtw_t *ctl_base_v;
1813 1.6 dyoung uint32_t n, m;
1814 1.6 dyoung size_t sz;
1815 1.6 dyoung
1816 1.6 dyoung KASSERT(powerof2(tcp_vtw_entries));
1817 1.1 dyoung
1818 1.1 dyoung if (!vtw_select(af, &fat, &ctl))
1819 1.1 dyoung return EAFNOSUPPORT;
1820 1.1 dyoung
1821 1.6 dyoung if (fat->hash != NULL) {
1822 1.6 dyoung KASSERT(fat->base != NULL && ctl->base.v != NULL);
1823 1.6 dyoung return 0;
1824 1.6 dyoung }
1825 1.6 dyoung
1826 1.6 dyoung /* Allocate 10% more capacity in the fat pointers.
1827 1.6 dyoung * We should only need ~#hash additional based on
1828 1.6 dyoung * how they age, but TIME_WAIT assassination could cause
1829 1.6 dyoung * sparse fat pointer utilisation.
1830 1.6 dyoung */
1831 1.6 dyoung m = 512;
1832 1.6 dyoung n = 2*m + (11 * (tcp_vtw_entries / fatp_ntags())) / 10;
1833 1.6 dyoung sz = (ctl->is_v4 ? sizeof(vtw_v4_t) : sizeof(vtw_v6_t));
1834 1.6 dyoung
1835 1.6 dyoung fat_hash = kmem_zalloc(2*m * sizeof(fatp_t *), KM_NOSLEEP);
1836 1.6 dyoung
1837 1.6 dyoung if (fat_hash == NULL) {
1838 1.6 dyoung printf("%s: could not allocate %zu bytes for "
1839 1.6 dyoung "hash anchors", __func__, 2*m * sizeof(fatp_t *));
1840 1.6 dyoung return ENOMEM;
1841 1.6 dyoung }
1842 1.1 dyoung
1843 1.6 dyoung fat_base = kmem_zalloc(2*n * sizeof(fatp_t), KM_NOSLEEP);
1844 1.1 dyoung
1845 1.6 dyoung if (fat_base == NULL) {
1846 1.6 dyoung kmem_free(fat_hash, 2*m * sizeof (fatp_t *));
1847 1.6 dyoung printf("%s: could not allocate %zu bytes for "
1848 1.6 dyoung "fatp_t array", __func__, 2*n * sizeof(fatp_t));
1849 1.6 dyoung return ENOMEM;
1850 1.6 dyoung }
1851 1.1 dyoung
1852 1.6 dyoung ctl_base_v = kmem_zalloc(tcp_vtw_entries * sz, KM_NOSLEEP);
1853 1.1 dyoung
1854 1.6 dyoung if (ctl_base_v == NULL) {
1855 1.6 dyoung kmem_free(fat_hash, 2*m * sizeof (fatp_t *));
1856 1.6 dyoung kmem_free(fat_base, 2*n * sizeof(fatp_t));
1857 1.6 dyoung printf("%s: could not allocate %zu bytes for "
1858 1.6 dyoung "vtw_t array", __func__, tcp_vtw_entries * sz);
1859 1.6 dyoung return ENOMEM;
1860 1.1 dyoung }
1861 1.1 dyoung
1862 1.6 dyoung fatp_init(fat, n, m, fat_base, fat_hash);
1863 1.1 dyoung
1864 1.6 dyoung vtw_init(fat, ctl, tcp_vtw_entries, ctl_base_v);
1865 1.1 dyoung
1866 1.1 dyoung return 0;
1867 1.1 dyoung }
1868 1.1 dyoung
1869 1.1 dyoung /*!\brief select controlling instance
1870 1.1 dyoung */
1871 1.1 dyoung static vtw_ctl_t *
1872 1.1 dyoung vtw_control(int af, uint32_t msl)
1873 1.1 dyoung {
1874 1.1 dyoung fatp_ctl_t *fat;
1875 1.1 dyoung vtw_ctl_t *ctl;
1876 1.9.2.2 jdolecek int msl_class = msl_to_class(msl);
1877 1.1 dyoung
1878 1.1 dyoung if (!vtw_select(af, &fat, &ctl))
1879 1.1 dyoung return NULL;
1880 1.1 dyoung
1881 1.1 dyoung if (!fat->base || !ctl->base.v)
1882 1.1 dyoung return NULL;
1883 1.1 dyoung
1884 1.5 dyoung if (!tcp_vtw_was_enabled) {
1885 1.5 dyoung /* This guarantees is timer ticks until we no longer need them.
1886 1.5 dyoung */
1887 1.5 dyoung tcp_vtw_was_enabled = 1;
1888 1.5 dyoung
1889 1.5 dyoung callout_schedule(&vtw_cs, hz / 5);
1890 1.5 dyoung
1891 1.5 dyoung tcbtable.vestige = &tcp_hooks;
1892 1.5 dyoung }
1893 1.5 dyoung
1894 1.9.2.2 jdolecek return ctl + msl_class;
1895 1.1 dyoung }
1896 1.1 dyoung
1897 1.1 dyoung /*!\brief add TCP pcb to vestigial timewait
1898 1.1 dyoung */
1899 1.1 dyoung int
1900 1.1 dyoung vtw_add(int af, struct tcpcb *tp)
1901 1.1 dyoung {
1902 1.9.2.1 tls #ifdef VTW_DEBUG
1903 1.1 dyoung int enable;
1904 1.9.2.1 tls #endif
1905 1.1 dyoung vtw_ctl_t *ctl;
1906 1.1 dyoung vtw_t *vtw;
1907 1.1 dyoung
1908 1.1 dyoung KASSERT(mutex_owned(softnet_lock));
1909 1.1 dyoung
1910 1.1 dyoung ctl = vtw_control(af, tp->t_msl);
1911 1.1 dyoung if (!ctl)
1912 1.1 dyoung return 0;
1913 1.1 dyoung
1914 1.9.2.1 tls #ifdef VTW_DEBUG
1915 1.1 dyoung enable = (af == AF_INET) ? tcp4_vtw_enable : tcp6_vtw_enable;
1916 1.9.2.1 tls #endif
1917 1.1 dyoung
1918 1.1 dyoung vtw = vtw_alloc(ctl);
1919 1.1 dyoung
1920 1.1 dyoung if (vtw) {
1921 1.1 dyoung vtw->snd_nxt = tp->snd_nxt;
1922 1.1 dyoung vtw->rcv_nxt = tp->rcv_nxt;
1923 1.1 dyoung
1924 1.1 dyoung switch (af) {
1925 1.1 dyoung case AF_INET: {
1926 1.1 dyoung struct inpcb *inp = tp->t_inpcb;
1927 1.1 dyoung vtw_v4_t *v4 = (void*)vtw;
1928 1.1 dyoung
1929 1.1 dyoung v4->faddr = inp->inp_faddr.s_addr;
1930 1.1 dyoung v4->laddr = inp->inp_laddr.s_addr;
1931 1.1 dyoung v4->fport = inp->inp_fport;
1932 1.1 dyoung v4->lport = inp->inp_lport;
1933 1.1 dyoung
1934 1.1 dyoung vtw->reuse_port = !!(inp->inp_socket->so_options
1935 1.1 dyoung & SO_REUSEPORT);
1936 1.1 dyoung vtw->reuse_addr = !!(inp->inp_socket->so_options
1937 1.1 dyoung & SO_REUSEADDR);
1938 1.1 dyoung vtw->v6only = 0;
1939 1.1 dyoung vtw->uid = inp->inp_socket->so_uidinfo->ui_uid;
1940 1.1 dyoung
1941 1.1 dyoung vtw_inshash_v4(ctl, vtw);
1942 1.1 dyoung
1943 1.1 dyoung
1944 1.1 dyoung #ifdef VTW_DEBUG
1945 1.1 dyoung /* Immediate lookup (connected and port) to
1946 1.1 dyoung * ensure at least that works!
1947 1.1 dyoung */
1948 1.1 dyoung if (enable & 4) {
1949 1.1 dyoung KASSERT(vtw_lookup_hash_v4
1950 1.1 dyoung (ctl
1951 1.1 dyoung , inp->inp_faddr.s_addr, inp->inp_fport
1952 1.1 dyoung , inp->inp_laddr.s_addr, inp->inp_lport
1953 1.1 dyoung , 0)
1954 1.1 dyoung == vtw);
1955 1.1 dyoung KASSERT(vtw_lookup_hash_v4
1956 1.1 dyoung (ctl
1957 1.1 dyoung , inp->inp_faddr.s_addr, inp->inp_fport
1958 1.1 dyoung , inp->inp_laddr.s_addr, inp->inp_lport
1959 1.1 dyoung , 1));
1960 1.1 dyoung }
1961 1.1 dyoung /* Immediate port iterator functionality check: not wild
1962 1.1 dyoung */
1963 1.1 dyoung if (enable & 8) {
1964 1.1 dyoung struct tcp_ports_iterator *it;
1965 1.1 dyoung struct vestigial_inpcb res;
1966 1.1 dyoung int cnt = 0;
1967 1.1 dyoung
1968 1.1 dyoung it = tcp_init_ports_v4(inp->inp_laddr
1969 1.1 dyoung , inp->inp_lport, 0);
1970 1.1 dyoung
1971 1.1 dyoung while (tcp_next_port_v4(it, &res)) {
1972 1.1 dyoung ++cnt;
1973 1.1 dyoung }
1974 1.1 dyoung KASSERT(cnt);
1975 1.1 dyoung }
1976 1.1 dyoung /* Immediate port iterator functionality check: wild
1977 1.1 dyoung */
1978 1.1 dyoung if (enable & 16) {
1979 1.1 dyoung struct tcp_ports_iterator *it;
1980 1.1 dyoung struct vestigial_inpcb res;
1981 1.1 dyoung struct in_addr any;
1982 1.1 dyoung int cnt = 0;
1983 1.1 dyoung
1984 1.1 dyoung any.s_addr = htonl(INADDR_ANY);
1985 1.1 dyoung
1986 1.1 dyoung it = tcp_init_ports_v4(any, inp->inp_lport, 1);
1987 1.1 dyoung
1988 1.1 dyoung while (tcp_next_port_v4(it, &res)) {
1989 1.1 dyoung ++cnt;
1990 1.1 dyoung }
1991 1.1 dyoung KASSERT(cnt);
1992 1.1 dyoung }
1993 1.1 dyoung #endif /* VTW_DEBUG */
1994 1.1 dyoung break;
1995 1.1 dyoung }
1996 1.1 dyoung
1997 1.1 dyoung case AF_INET6: {
1998 1.1 dyoung struct in6pcb *inp = tp->t_in6pcb;
1999 1.1 dyoung vtw_v6_t *v6 = (void*)vtw;
2000 1.1 dyoung
2001 1.1 dyoung v6->faddr = inp->in6p_faddr;
2002 1.1 dyoung v6->laddr = inp->in6p_laddr;
2003 1.1 dyoung v6->fport = inp->in6p_fport;
2004 1.1 dyoung v6->lport = inp->in6p_lport;
2005 1.1 dyoung
2006 1.1 dyoung vtw->reuse_port = !!(inp->in6p_socket->so_options
2007 1.1 dyoung & SO_REUSEPORT);
2008 1.1 dyoung vtw->reuse_addr = !!(inp->in6p_socket->so_options
2009 1.1 dyoung & SO_REUSEADDR);
2010 1.1 dyoung vtw->v6only = !!(inp->in6p_flags
2011 1.1 dyoung & IN6P_IPV6_V6ONLY);
2012 1.1 dyoung vtw->uid = inp->in6p_socket->so_uidinfo->ui_uid;
2013 1.1 dyoung
2014 1.1 dyoung vtw_inshash_v6(ctl, vtw);
2015 1.1 dyoung #ifdef VTW_DEBUG
2016 1.1 dyoung /* Immediate lookup (connected and port) to
2017 1.1 dyoung * ensure at least that works!
2018 1.1 dyoung */
2019 1.1 dyoung if (enable & 4) {
2020 1.1 dyoung KASSERT(vtw_lookup_hash_v6(ctl
2021 1.1 dyoung , &inp->in6p_faddr, inp->in6p_fport
2022 1.1 dyoung , &inp->in6p_laddr, inp->in6p_lport
2023 1.1 dyoung , 0)
2024 1.1 dyoung == vtw);
2025 1.1 dyoung KASSERT(vtw_lookup_hash_v6
2026 1.1 dyoung (ctl
2027 1.1 dyoung , &inp->in6p_faddr, inp->in6p_fport
2028 1.1 dyoung , &inp->in6p_laddr, inp->in6p_lport
2029 1.1 dyoung , 1));
2030 1.1 dyoung }
2031 1.1 dyoung /* Immediate port iterator functionality check: not wild
2032 1.1 dyoung */
2033 1.1 dyoung if (enable & 8) {
2034 1.1 dyoung struct tcp_ports_iterator *it;
2035 1.1 dyoung struct vestigial_inpcb res;
2036 1.1 dyoung int cnt = 0;
2037 1.1 dyoung
2038 1.1 dyoung it = tcp_init_ports_v6(&inp->in6p_laddr
2039 1.1 dyoung , inp->in6p_lport, 0);
2040 1.1 dyoung
2041 1.1 dyoung while (tcp_next_port_v6(it, &res)) {
2042 1.1 dyoung ++cnt;
2043 1.1 dyoung }
2044 1.1 dyoung KASSERT(cnt);
2045 1.1 dyoung }
2046 1.1 dyoung /* Immediate port iterator functionality check: wild
2047 1.1 dyoung */
2048 1.1 dyoung if (enable & 16) {
2049 1.1 dyoung struct tcp_ports_iterator *it;
2050 1.1 dyoung struct vestigial_inpcb res;
2051 1.1 dyoung static struct in6_addr any = IN6ADDR_ANY_INIT;
2052 1.1 dyoung int cnt = 0;
2053 1.1 dyoung
2054 1.1 dyoung it = tcp_init_ports_v6(&any
2055 1.1 dyoung , inp->in6p_lport, 1);
2056 1.1 dyoung
2057 1.1 dyoung while (tcp_next_port_v6(it, &res)) {
2058 1.1 dyoung ++cnt;
2059 1.1 dyoung }
2060 1.1 dyoung KASSERT(cnt);
2061 1.1 dyoung }
2062 1.1 dyoung #endif /* VTW_DEBUG */
2063 1.1 dyoung break;
2064 1.1 dyoung }
2065 1.1 dyoung }
2066 1.1 dyoung
2067 1.1 dyoung tcp_canceltimers(tp);
2068 1.1 dyoung tp = tcp_close(tp);
2069 1.1 dyoung KASSERT(!tp);
2070 1.1 dyoung
2071 1.1 dyoung return 1;
2072 1.1 dyoung }
2073 1.1 dyoung
2074 1.1 dyoung return 0;
2075 1.1 dyoung }
2076 1.1 dyoung
2077 1.1 dyoung /*!\brief restart timer for vestigial time-wait entry
2078 1.1 dyoung */
2079 1.1 dyoung static void
2080 1.1 dyoung vtw_restart_v4(vestigial_inpcb_t *vp)
2081 1.1 dyoung {
2082 1.1 dyoung vtw_v4_t copy = *(vtw_v4_t*)vp->vtw;
2083 1.1 dyoung vtw_t *vtw;
2084 1.1 dyoung vtw_t *cp = ©.common;
2085 1.1 dyoung vtw_ctl_t *ctl;
2086 1.1 dyoung
2087 1.1 dyoung KASSERT(mutex_owned(softnet_lock));
2088 1.1 dyoung
2089 1.1 dyoung db_trace(KTR_VTW
2090 1.1 dyoung , (vp->vtw, "vtw: restart %A:%P %A:%P"
2091 1.1 dyoung , vp->faddr.v4.s_addr, vp->fport
2092 1.1 dyoung , vp->laddr.v4.s_addr, vp->lport));
2093 1.1 dyoung
2094 1.1 dyoung /* Class might have changed, so have a squiz.
2095 1.1 dyoung */
2096 1.1 dyoung ctl = vtw_control(AF_INET, class_to_msl(cp->msl_class));
2097 1.1 dyoung vtw = vtw_alloc(ctl);
2098 1.1 dyoung
2099 1.1 dyoung if (vtw) {
2100 1.1 dyoung vtw_v4_t *v4 = (void*)vtw;
2101 1.1 dyoung
2102 1.1 dyoung /* Safe now to unhash the old entry
2103 1.1 dyoung */
2104 1.1 dyoung vtw_del(vp->ctl, vp->vtw);
2105 1.1 dyoung
2106 1.1 dyoung vtw->snd_nxt = cp->snd_nxt;
2107 1.1 dyoung vtw->rcv_nxt = cp->rcv_nxt;
2108 1.1 dyoung
2109 1.1 dyoung v4->faddr = copy.faddr;
2110 1.1 dyoung v4->laddr = copy.laddr;
2111 1.1 dyoung v4->fport = copy.fport;
2112 1.1 dyoung v4->lport = copy.lport;
2113 1.1 dyoung
2114 1.1 dyoung vtw->reuse_port = cp->reuse_port;
2115 1.1 dyoung vtw->reuse_addr = cp->reuse_addr;
2116 1.1 dyoung vtw->v6only = 0;
2117 1.1 dyoung vtw->uid = cp->uid;
2118 1.1 dyoung
2119 1.1 dyoung vtw_inshash_v4(ctl, vtw);
2120 1.1 dyoung }
2121 1.1 dyoung
2122 1.1 dyoung vp->valid = 0;
2123 1.1 dyoung }
2124 1.1 dyoung
2125 1.1 dyoung /*!\brief restart timer for vestigial time-wait entry
2126 1.1 dyoung */
2127 1.1 dyoung static void
2128 1.1 dyoung vtw_restart_v6(vestigial_inpcb_t *vp)
2129 1.1 dyoung {
2130 1.1 dyoung vtw_v6_t copy = *(vtw_v6_t*)vp->vtw;
2131 1.1 dyoung vtw_t *vtw;
2132 1.1 dyoung vtw_t *cp = ©.common;
2133 1.1 dyoung vtw_ctl_t *ctl;
2134 1.1 dyoung
2135 1.1 dyoung KASSERT(mutex_owned(softnet_lock));
2136 1.1 dyoung
2137 1.1 dyoung db_trace(KTR_VTW
2138 1.1 dyoung , (vp->vtw, "vtw: restart %6A:%P %6A:%P"
2139 1.1 dyoung , db_store(&vp->faddr.v6, sizeof (vp->faddr.v6))
2140 1.1 dyoung , vp->fport
2141 1.1 dyoung , db_store(&vp->laddr.v6, sizeof (vp->laddr.v6))
2142 1.1 dyoung , vp->lport));
2143 1.1 dyoung
2144 1.1 dyoung /* Class might have changed, so have a squiz.
2145 1.1 dyoung */
2146 1.1 dyoung ctl = vtw_control(AF_INET6, class_to_msl(cp->msl_class));
2147 1.1 dyoung vtw = vtw_alloc(ctl);
2148 1.1 dyoung
2149 1.1 dyoung if (vtw) {
2150 1.1 dyoung vtw_v6_t *v6 = (void*)vtw;
2151 1.1 dyoung
2152 1.1 dyoung /* Safe now to unhash the old entry
2153 1.1 dyoung */
2154 1.1 dyoung vtw_del(vp->ctl, vp->vtw);
2155 1.1 dyoung
2156 1.1 dyoung vtw->snd_nxt = cp->snd_nxt;
2157 1.1 dyoung vtw->rcv_nxt = cp->rcv_nxt;
2158 1.1 dyoung
2159 1.1 dyoung v6->faddr = copy.faddr;
2160 1.1 dyoung v6->laddr = copy.laddr;
2161 1.1 dyoung v6->fport = copy.fport;
2162 1.1 dyoung v6->lport = copy.lport;
2163 1.1 dyoung
2164 1.1 dyoung vtw->reuse_port = cp->reuse_port;
2165 1.1 dyoung vtw->reuse_addr = cp->reuse_addr;
2166 1.1 dyoung vtw->v6only = cp->v6only;
2167 1.1 dyoung vtw->uid = cp->uid;
2168 1.1 dyoung
2169 1.1 dyoung vtw_inshash_v6(ctl, vtw);
2170 1.1 dyoung }
2171 1.1 dyoung
2172 1.1 dyoung vp->valid = 0;
2173 1.1 dyoung }
2174 1.1 dyoung
2175 1.1 dyoung /*!\brief restart timer for vestigial time-wait entry
2176 1.1 dyoung */
2177 1.1 dyoung void
2178 1.1 dyoung vtw_restart(vestigial_inpcb_t *vp)
2179 1.1 dyoung {
2180 1.1 dyoung if (!vp || !vp->valid)
2181 1.1 dyoung return;
2182 1.1 dyoung
2183 1.1 dyoung if (vp->v4)
2184 1.1 dyoung vtw_restart_v4(vp);
2185 1.1 dyoung else
2186 1.1 dyoung vtw_restart_v6(vp);
2187 1.1 dyoung }
2188 1.1 dyoung
2189 1.1 dyoung int
2190 1.7 dyoung sysctl_tcp_vtw_enable(SYSCTLFN_ARGS)
2191 1.7 dyoung {
2192 1.7 dyoung int en, rc;
2193 1.7 dyoung struct sysctlnode node;
2194 1.7 dyoung
2195 1.7 dyoung node = *rnode;
2196 1.7 dyoung en = *(int *)rnode->sysctl_data;
2197 1.7 dyoung node.sysctl_data = &en;
2198 1.7 dyoung
2199 1.7 dyoung rc = sysctl_lookup(SYSCTLFN_CALL(&node));
2200 1.7 dyoung if (rc != 0 || newp == NULL)
2201 1.7 dyoung return rc;
2202 1.7 dyoung
2203 1.7 dyoung if (rnode->sysctl_data != &tcp4_vtw_enable &&
2204 1.7 dyoung rnode->sysctl_data != &tcp6_vtw_enable)
2205 1.7 dyoung rc = ENOENT;
2206 1.7 dyoung else if ((en & 1) == 0)
2207 1.7 dyoung rc = 0;
2208 1.7 dyoung else if (rnode->sysctl_data == &tcp4_vtw_enable)
2209 1.7 dyoung rc = vtw_control_init(AF_INET);
2210 1.7 dyoung else /* rnode->sysctl_data == &tcp6_vtw_enable */
2211 1.7 dyoung rc = vtw_control_init(AF_INET6);
2212 1.7 dyoung
2213 1.7 dyoung if (rc == 0)
2214 1.7 dyoung *(int *)rnode->sysctl_data = en;
2215 1.7 dyoung
2216 1.7 dyoung return rc;
2217 1.7 dyoung }
2218 1.7 dyoung
2219 1.7 dyoung int
2220 1.1 dyoung vtw_earlyinit(void)
2221 1.1 dyoung {
2222 1.5 dyoung int i, rc;
2223 1.1 dyoung
2224 1.5 dyoung callout_init(&vtw_cs, 0);
2225 1.5 dyoung callout_setfunc(&vtw_cs, vtw_tick, 0);
2226 1.1 dyoung
2227 1.5 dyoung for (i = 0; i < VTW_NCLASS; ++i) {
2228 1.5 dyoung vtw_tcpv4[i].is_v4 = 1;
2229 1.5 dyoung vtw_tcpv6[i].is_v6 = 1;
2230 1.1 dyoung }
2231 1.1 dyoung
2232 1.7 dyoung if ((tcp4_vtw_enable & 1) != 0 &&
2233 1.7 dyoung (rc = vtw_control_init(AF_INET)) != 0)
2234 1.7 dyoung return rc;
2235 1.7 dyoung
2236 1.7 dyoung if ((tcp6_vtw_enable & 1) != 0 &&
2237 1.1 dyoung (rc = vtw_control_init(AF_INET6)) != 0)
2238 1.1 dyoung return rc;
2239 1.1 dyoung
2240 1.1 dyoung return 0;
2241 1.1 dyoung }
2242 1.1 dyoung
2243 1.1 dyoung #ifdef VTW_DEBUG
2244 1.1 dyoung #include <sys/syscallargs.h>
2245 1.1 dyoung #include <sys/sysctl.h>
2246 1.1 dyoung
2247 1.1 dyoung /*!\brief add lalp, fafp entries for debug
2248 1.1 dyoung */
2249 1.1 dyoung int
2250 1.9.2.2 jdolecek vtw_debug_add(int af, sin_either_t *la, sin_either_t *fa, int msl, int msl_class)
2251 1.1 dyoung {
2252 1.1 dyoung vtw_ctl_t *ctl;
2253 1.1 dyoung vtw_t *vtw;
2254 1.1 dyoung
2255 1.9.2.2 jdolecek ctl = vtw_control(af, msl ? msl : class_to_msl(msl_class));
2256 1.1 dyoung if (!ctl)
2257 1.1 dyoung return 0;
2258 1.1 dyoung
2259 1.1 dyoung vtw = vtw_alloc(ctl);
2260 1.1 dyoung
2261 1.1 dyoung if (vtw) {
2262 1.1 dyoung vtw->snd_nxt = 0;
2263 1.1 dyoung vtw->rcv_nxt = 0;
2264 1.1 dyoung
2265 1.1 dyoung switch (af) {
2266 1.1 dyoung case AF_INET: {
2267 1.1 dyoung vtw_v4_t *v4 = (void*)vtw;
2268 1.1 dyoung
2269 1.1 dyoung v4->faddr = fa->sin_addr.v4.s_addr;
2270 1.1 dyoung v4->laddr = la->sin_addr.v4.s_addr;
2271 1.1 dyoung v4->fport = fa->sin_port;
2272 1.1 dyoung v4->lport = la->sin_port;
2273 1.1 dyoung
2274 1.1 dyoung vtw->reuse_port = 1;
2275 1.1 dyoung vtw->reuse_addr = 1;
2276 1.1 dyoung vtw->v6only = 0;
2277 1.1 dyoung vtw->uid = 0;
2278 1.1 dyoung
2279 1.1 dyoung vtw_inshash_v4(ctl, vtw);
2280 1.1 dyoung break;
2281 1.1 dyoung }
2282 1.1 dyoung
2283 1.1 dyoung case AF_INET6: {
2284 1.1 dyoung vtw_v6_t *v6 = (void*)vtw;
2285 1.1 dyoung
2286 1.1 dyoung v6->faddr = fa->sin_addr.v6;
2287 1.1 dyoung v6->laddr = la->sin_addr.v6;
2288 1.1 dyoung
2289 1.1 dyoung v6->fport = fa->sin_port;
2290 1.1 dyoung v6->lport = la->sin_port;
2291 1.1 dyoung
2292 1.1 dyoung vtw->reuse_port = 1;
2293 1.1 dyoung vtw->reuse_addr = 1;
2294 1.1 dyoung vtw->v6only = 0;
2295 1.1 dyoung vtw->uid = 0;
2296 1.1 dyoung
2297 1.1 dyoung vtw_inshash_v6(ctl, vtw);
2298 1.1 dyoung break;
2299 1.1 dyoung }
2300 1.1 dyoung
2301 1.1 dyoung default:
2302 1.1 dyoung break;
2303 1.1 dyoung }
2304 1.1 dyoung
2305 1.1 dyoung return 1;
2306 1.1 dyoung }
2307 1.1 dyoung
2308 1.1 dyoung return 0;
2309 1.1 dyoung }
2310 1.1 dyoung
/* Syscall slot claimed by vtw_debug_init(); zero while none is claimed. */
static int vtw_syscall;
2312 1.1 dyoung
2313 1.1 dyoung static int
2314 1.1 dyoung vtw_debug_process(vtw_sysargs_t *ap)
2315 1.1 dyoung {
2316 1.1 dyoung struct vestigial_inpcb vestige;
2317 1.1 dyoung int rc = 0;
2318 1.1 dyoung
2319 1.1 dyoung mutex_enter(softnet_lock);
2320 1.1 dyoung
2321 1.1 dyoung switch (ap->op) {
2322 1.1 dyoung case 0: // insert
2323 1.1 dyoung vtw_debug_add(ap->la.sin_family
2324 1.1 dyoung , &ap->la
2325 1.1 dyoung , &ap->fa
2326 1.1 dyoung , TCPTV_MSL
2327 1.1 dyoung , 0);
2328 1.1 dyoung break;
2329 1.1 dyoung
2330 1.1 dyoung case 1: // lookup
2331 1.1 dyoung case 2: // restart
2332 1.1 dyoung switch (ap->la.sin_family) {
2333 1.1 dyoung case AF_INET:
2334 1.1 dyoung if (tcp_lookup_v4(ap->fa.sin_addr.v4, ap->fa.sin_port,
2335 1.1 dyoung ap->la.sin_addr.v4, ap->la.sin_port,
2336 1.1 dyoung &vestige)) {
2337 1.1 dyoung if (ap->op == 2) {
2338 1.1 dyoung vtw_restart(&vestige);
2339 1.1 dyoung }
2340 1.1 dyoung rc = 0;
2341 1.1 dyoung } else
2342 1.1 dyoung rc = ESRCH;
2343 1.1 dyoung break;
2344 1.1 dyoung
2345 1.1 dyoung case AF_INET6:
2346 1.1 dyoung if (tcp_lookup_v6(&ap->fa.sin_addr.v6, ap->fa.sin_port,
2347 1.1 dyoung &ap->la.sin_addr.v6, ap->la.sin_port,
2348 1.1 dyoung &vestige)) {
2349 1.1 dyoung if (ap->op == 2) {
2350 1.1 dyoung vtw_restart(&vestige);
2351 1.1 dyoung }
2352 1.1 dyoung rc = 0;
2353 1.1 dyoung } else
2354 1.1 dyoung rc = ESRCH;
2355 1.1 dyoung break;
2356 1.1 dyoung default:
2357 1.1 dyoung rc = EINVAL;
2358 1.1 dyoung }
2359 1.1 dyoung break;
2360 1.1 dyoung
2361 1.1 dyoung default:
2362 1.1 dyoung rc = EINVAL;
2363 1.1 dyoung }
2364 1.1 dyoung
2365 1.1 dyoung mutex_exit(softnet_lock);
2366 1.1 dyoung return rc;
2367 1.1 dyoung }
2368 1.1 dyoung
2369 1.1 dyoung struct sys_vtw_args {
2370 1.1 dyoung syscallarg(const vtw_sysargs_t *) req;
2371 1.1 dyoung syscallarg(size_t) len;
2372 1.1 dyoung };
2373 1.1 dyoung
2374 1.1 dyoung static int
2375 1.1 dyoung vtw_sys(struct lwp *l, const void *_, register_t *retval)
2376 1.1 dyoung {
2377 1.1 dyoung const struct sys_vtw_args *uap = _;
2378 1.1 dyoung void *buf;
2379 1.1 dyoung int rc;
2380 1.1 dyoung size_t len = SCARG(uap, len);
2381 1.1 dyoung
2382 1.1 dyoung if (len != sizeof (vtw_sysargs_t))
2383 1.1 dyoung return EINVAL;
2384 1.1 dyoung
2385 1.1 dyoung buf = kmem_alloc(len, KM_SLEEP);
2386 1.1 dyoung rc = copyin(SCARG(uap, req), buf, len);
2387 1.1 dyoung if (!rc) {
2388 1.1 dyoung rc = vtw_debug_process(buf);
2389 1.1 dyoung }
2390 1.1 dyoung kmem_free(buf, len);
2391 1.1 dyoung
2392 1.1 dyoung return rc;
2393 1.1 dyoung }
2394 1.1 dyoung
2395 1.1 dyoung static void
2396 1.1 dyoung vtw_sanity_check(void)
2397 1.1 dyoung {
2398 1.1 dyoung vtw_ctl_t *ctl;
2399 1.1 dyoung vtw_t *vtw;
2400 1.1 dyoung int i;
2401 1.1 dyoung int n;
2402 1.1 dyoung
2403 1.1 dyoung for (i = 0; i < VTW_NCLASS; ++i) {
2404 1.1 dyoung ctl = &vtw_tcpv4[i];
2405 1.1 dyoung
2406 1.1 dyoung if (!ctl->base.v || ctl->nalloc)
2407 1.1 dyoung continue;
2408 1.1 dyoung
2409 1.1 dyoung for (n = 0, vtw = ctl->base.v; ; ) {
2410 1.1 dyoung ++n;
2411 1.1 dyoung vtw = vtw_next(ctl, vtw);
2412 1.1 dyoung if (vtw == ctl->base.v)
2413 1.1 dyoung break;
2414 1.1 dyoung }
2415 1.1 dyoung db_trace(KTR_VTW
2416 1.1 dyoung , (ctl, "sanity: class %x n %x nfree %x"
2417 1.1 dyoung , i, n, ctl->nfree));
2418 1.1 dyoung
2419 1.1 dyoung KASSERT(n == ctl->nfree);
2420 1.1 dyoung }
2421 1.1 dyoung
2422 1.1 dyoung for (i = 0; i < VTW_NCLASS; ++i) {
2423 1.1 dyoung ctl = &vtw_tcpv6[i];
2424 1.1 dyoung
2425 1.1 dyoung if (!ctl->base.v || ctl->nalloc)
2426 1.1 dyoung continue;
2427 1.1 dyoung
2428 1.1 dyoung for (n = 0, vtw = ctl->base.v; ; ) {
2429 1.1 dyoung ++n;
2430 1.1 dyoung vtw = vtw_next(ctl, vtw);
2431 1.1 dyoung if (vtw == ctl->base.v)
2432 1.1 dyoung break;
2433 1.1 dyoung }
2434 1.1 dyoung db_trace(KTR_VTW
2435 1.1 dyoung , (ctl, "sanity: class %x n %x nfree %x"
2436 1.1 dyoung , i, n, ctl->nfree));
2437 1.1 dyoung KASSERT(n == ctl->nfree);
2438 1.1 dyoung }
2439 1.1 dyoung }
2440 1.1 dyoung
2441 1.1 dyoung /*!\brief Initialise debug support.
2442 1.1 dyoung */
2443 1.1 dyoung static void
2444 1.1 dyoung vtw_debug_init(void)
2445 1.1 dyoung {
2446 1.1 dyoung int i;
2447 1.1 dyoung
2448 1.1 dyoung vtw_sanity_check();
2449 1.1 dyoung
2450 1.1 dyoung if (vtw_syscall)
2451 1.1 dyoung return;
2452 1.1 dyoung
2453 1.1 dyoung for (i = 511; i; --i) {
2454 1.1 dyoung if (sysent[i].sy_call == sys_nosys) {
2455 1.1 dyoung sysent[i].sy_call = vtw_sys;
2456 1.1 dyoung sysent[i].sy_narg = 2;
2457 1.1 dyoung sysent[i].sy_argsize = sizeof (struct sys_vtw_args);
2458 1.1 dyoung sysent[i].sy_flags = 0;
2459 1.1 dyoung
2460 1.1 dyoung vtw_syscall = i;
2461 1.1 dyoung break;
2462 1.1 dyoung }
2463 1.1 dyoung }
2464 1.1 dyoung if (i) {
2465 1.1 dyoung const struct sysctlnode *node;
2466 1.1 dyoung uint32_t flags;
2467 1.1 dyoung
2468 1.1 dyoung flags = sysctl_root.sysctl_flags;
2469 1.1 dyoung
2470 1.1 dyoung sysctl_root.sysctl_flags |= CTLFLAG_READWRITE;
2471 1.1 dyoung sysctl_root.sysctl_flags &= ~CTLFLAG_PERMANENT;
2472 1.1 dyoung
2473 1.1 dyoung sysctl_createv(0, 0, 0, &node,
2474 1.1 dyoung CTLFLAG_PERMANENT, CTLTYPE_NODE,
2475 1.1 dyoung "koff",
2476 1.1 dyoung SYSCTL_DESCR("Kernel Obscure Feature Finder"),
2477 1.1 dyoung 0, 0, 0, 0, CTL_CREATE, CTL_EOL);
2478 1.1 dyoung
2479 1.1 dyoung if (!node) {
2480 1.1 dyoung sysctl_createv(0, 0, 0, &node,
2481 1.1 dyoung CTLFLAG_PERMANENT, CTLTYPE_NODE,
2482 1.1 dyoung "koffka",
2483 1.1 dyoung SYSCTL_DESCR("The Real(tm) Kernel"
2484 1.1 dyoung " Obscure Feature Finder"),
2485 1.1 dyoung 0, 0, 0, 0, CTL_CREATE, CTL_EOL);
2486 1.1 dyoung }
2487 1.1 dyoung if (node) {
2488 1.1 dyoung sysctl_createv(0, 0, 0, 0,
2489 1.1 dyoung CTLFLAG_PERMANENT|CTLFLAG_READONLY,
2490 1.1 dyoung CTLTYPE_INT, "vtw_debug_syscall",
2491 1.1 dyoung SYSCTL_DESCR("vtw debug"
2492 1.1 dyoung " system call number"),
2493 1.1 dyoung 0, 0, &vtw_syscall, 0, node->sysctl_num,
2494 1.1 dyoung CTL_CREATE, CTL_EOL);
2495 1.1 dyoung }
2496 1.1 dyoung sysctl_root.sysctl_flags = flags;
2497 1.1 dyoung }
2498 1.1 dyoung }
2499 1.1 dyoung #else /* !VTW_DEBUG */
/* Debug support is compiled out: nothing to initialise. */
static void
vtw_debug_init(void)
{
}
2505 1.1 dyoung #endif /* !VTW_DEBUG */
2506