tcp_vtw.h revision 1.4 1 1.1 dyoung /*
2 1.1 dyoung * Copyright (c) 2011 The NetBSD Foundation, Inc.
3 1.1 dyoung * All rights reserved.
4 1.1 dyoung *
5 1.1 dyoung * This code is derived from software contributed to The NetBSD Foundation
6 1.1 dyoung * by Coyote Point Systems, Inc.
7 1.1 dyoung *
8 1.1 dyoung * Redistribution and use in source and binary forms, with or without
9 1.1 dyoung * modification, are permitted provided that the following conditions
10 1.1 dyoung * are met:
11 1.1 dyoung * 1. Redistributions of source code must retain the above copyright
12 1.1 dyoung * notice, this list of conditions and the following disclaimer.
13 1.1 dyoung * 2. Redistributions in binary form must reproduce the above copyright
14 1.1 dyoung * notice, this list of conditions and the following disclaimer in the
15 1.1 dyoung * documentation and/or other materials provided with the distribution.
16 1.1 dyoung *
17 1.1 dyoung * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
18 1.1 dyoung * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
19 1.1 dyoung * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
20 1.1 dyoung * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
21 1.1 dyoung * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
22 1.1 dyoung * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
23 1.1 dyoung * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
24 1.1 dyoung * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
25 1.1 dyoung * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
26 1.1 dyoung * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
27 1.1 dyoung * POSSIBILITY OF SUCH DAMAGE.
28 1.1 dyoung */
29 1.1 dyoung
30 1.1 dyoung /*
31 1.1 dyoung * Vestigial time-wait.
32 1.1 dyoung *
33 1.1 dyoung * This implementation uses cache-efficient techniques, which will
34 1.1 dyoung * appear somewhat peculiar. The main philosophy is to optimise the
35 1.1 dyoung * amount of information available within a cache line. Cache miss is
36 1.1 dyoung * expensive. So we employ ad-hoc techniques to pull a series of
37 1.1 dyoung * linked-list follows into a cache line. One cache line, multiple
38 1.1 dyoung * linked-list equivalents.
39 1.1 dyoung *
40 1.1 dyoung * One such ad-hoc technique is fat pointers. Additional degrees of
41 1.1 dyoung * ad-hoqueness result from having to hand tune it for pointer size
42 1.1 dyoung * and for cache line size.
43 1.1 dyoung *
44 1.1 dyoung * The 'fat pointer' approach aggregates, for x86_32, 15 linked-list
45 1.1 dyoung * data structures into one cache line. The additional 32 bits in the
46 1.1 dyoung * cache line are used for linking fat pointers, and for
47 1.1 dyoung * allocation/bookkeeping.
48 1.1 dyoung *
49 1.1 dyoung * The 15 32-bit tags encode the pointers to the linked list elements,
50 1.1 dyoung * and also encode the results of a search comparison.
51 1.1 dyoung *
52 1.1 dyoung * First, some more assumptions/restrictions.
53 1.1 dyoung *
54 1.1 dyoung * All the fat pointers are from a contiguous allocation arena. Thus,
55 1.1 dyoung * we can refer to them by offset from a base, not as full pointers.
56 1.1 dyoung *
57 1.1 dyoung * All the linked list data elements are also from a contiguous
58 1.1 dyoung * allocation arena, again so that we can refer to them as offset from
59 1.1 dyoung * a base.
60 1.1 dyoung *
61 1.1 dyoung * In order to add a data element to a fat pointer, a key value is
62 1.1 dyoung * computed, based on unique data within the data element. It is the
63 1.1 dyoung * linear searching of the linked lists of these elements based on
64 1.1 dyoung * these unique data that are being optimised here.
65 1.1 dyoung *
66 1.1 dyoung * Lets call the function that computes the key k(e), where e is the
67 1.1 dyoung * data element. In this example, k(e) returns 32-bits.
68 1.1 dyoung *
69 1.1 dyoung * Consider a set E (say of order 15) of data elements. Let K be
70 1.1 dyoung * the set of the k(e) for e in E.
71 1.1 dyoung *
72 1.1 dyoung * Let O be the set of the offsets from the base of the data elements in E.
73 1.1 dyoung *
74 1.1 dyoung * For each x in K, for each matching o in O, let t be x ^ o. These
75 1.1 dyoung * are the tags. (More or less).
76 1.1 dyoung *
77 1.1 dyoung * In order to search all the data elements in E, we compute the
78 1.1 dyoung * search key, and one at a time, XOR the key into the tags. If any
79 1.1 dyoung * result is a valid data element index, we have a possible match. If
80 1.1 dyoung * not, there is no match.
81 1.1 dyoung *
82 1.1 dyoung * The no-match cases mean we do not have to de-reference the pointer
83 1.1 dyoung * to the data element in question. We save cache miss penalty and
84 1.1 dyoung * cache load decreases. Only in the case of a valid looking data
85 1.1 dyoung * element index, do we have to look closer.
86 1.1 dyoung *
87 1.1 dyoung * Thus, in the absence of false positives, 15 data elements can be
88 1.1 dyoung * searched with one cache line fill, as opposed to 15 cache line
89 1.1 dyoung * fills for the usual implementation.
90 1.1 dyoung *
91 1.1 dyoung * The vestigial time waits (vtw_t), the data elements in the above, are
92 1.1 dyoung * searched by faddr, fport, laddr, lport. The key is a function of
93 1.1 dyoung * these values.
94 1.1 dyoung *
95 1.1 dyoung * We hash these keys into the traditional hash chains to reduce the
96 1.1 dyoung * search time, and use fat pointers to reduce the cache impacts of
97 1.1 dyoung * searching.
98 1.1 dyoung *
99 1.1 dyoung * The vtw_t are, per requirement, in a contiguous chunk. Allocation
100 1.1 dyoung * is done with a clock hand, and all vtw_t within one allocation
101 1.1 dyoung * domain have the same lifetime, so they will always be sorted by
102 1.1 dyoung * age.
103 1.1 dyoung *
104 1.1 dyoung * A vtw_t will be allocated, timestamped, and have a fixed future
105 1.1 dyoung * expiration. It will be added to a hash bucket implemented with fat
106 1.1 dyoung * pointers, which means that a cache line will be allocated in the
107 1.1 dyoung * hash bucket, placed at the head (more recent in time) and the vtw_t
108 1.1 dyoung * will be added to this. As more entries are added, the fat pointer
109 1.1 dyoung * cache line will fill, requiring additional cache lines for fat
110 1.1 dyoung * pointers to be allocated. These will be added at the head, and the
111 1.1 dyoung * aged entries will hang down, tapeworm like. As the vtw_t entries
112 1.1 dyoung * expire, the corresponding slot in the fat pointer will be
113 1.1 dyoung * reclaimed, and eventually the cache line will completely empty and
114 1.1 dyoung * be re-cycled, if not at the head of the chain.
115 1.1 dyoung *
116 1.1 dyoung * At times, a time-wait timer is restarted. This corresponds to
117 1.1 dyoung * deleting the current entry and re-adding it.
118 1.1 dyoung *
119 1.1 dyoung * Most of the time, they are just placed here to die.
120 1.1 dyoung */
121 1.1 dyoung #ifndef _NETINET_TCP_VTW_H
122 1.1 dyoung #define _NETINET_TCP_VTW_H
123 1.1 dyoung
124 1.1 dyoung #include <sys/types.h>
125 1.1 dyoung #include <sys/socket.h>
126 1.1 dyoung #include <sys/sysctl.h>
127 1.1 dyoung #include <net/if.h>
128 1.1 dyoung #include <net/route.h>
129 1.1 dyoung #include <netinet/in.h>
130 1.1 dyoung #include <netinet/in_systm.h>
131 1.1 dyoung #include <netinet/ip.h>
132 1.1 dyoung #include <netinet/in_pcb.h>
133 1.1 dyoung #include <netinet/in_var.h>
134 1.1 dyoung #include <netinet/ip_var.h>
135 1.1 dyoung #include <netinet/in.h>
136 1.1 dyoung #include <netinet/tcp.h>
137 1.1 dyoung #include <netinet/tcp_timer.h>
138 1.1 dyoung #include <netinet/tcp_var.h>
139 1.1 dyoung #include <netinet6/in6.h>
140 1.1 dyoung #include <netinet/ip6.h>
141 1.1 dyoung #include <netinet6/ip6_var.h>
142 1.1 dyoung #include <netinet6/in6_pcb.h>
143 1.1 dyoung #include <netinet6/ip6_var.h>
144 1.1 dyoung #include <netinet6/in6_var.h>
145 1.1 dyoung #include <netinet/icmp6.h>
146 1.1 dyoung #include <netinet6/nd6.h>
147 1.1 dyoung
148 1.1 dyoung #define VTW_NCLASS (1+3) /* # different classes */
149 1.1 dyoung
150 1.1 dyoung /*
151 1.1 dyoung * fat pointers, MI.
152 1.1 dyoung */
153 1.1 dyoung struct fatp_mi;
154 1.1 dyoung
155 1.1 dyoung typedef uint32_t fatp_word_t;
156 1.1 dyoung
157 1.1 dyoung typedef struct fatp_mi fatp_t;
158 1.1 dyoung
159 1.1 dyoung /* Supported cacheline sizes: 32 64 128 bytes. See fatp_key(),
160 1.1 dyoung * fatp_slot_from_key(), fatp_xtra[].
161 1.1 dyoung */
162 1.1 dyoung #define FATP_NTAGS (CACHE_LINE_SIZE / sizeof(fatp_word_t) - 1)
163 1.1 dyoung #define FATP_NXT_WIDTH (sizeof(fatp_word_t) * NBBY - FATP_NTAGS)
164 1.1 dyoung
165 1.1 dyoung #define FATP_MAX (1 << FATP_NXT_WIDTH)
166 1.1 dyoung
167 1.1 dyoung /* Worked example: ULP32 with 64-byte cacheline (32-bit x86):
168 1.1 dyoung * 15 tags per cacheline. At most 2^17 fat pointers per fatp_ctl_t.
169 1.1 dyoung * The comments on the fatp_mi members, below, correspond to the worked
170 1.1 dyoung * example.
171 1.1 dyoung */
172 1.1 dyoung struct fatp_mi {
173 1.1 dyoung fatp_word_t inuse : FATP_NTAGS; /* (1+15)*4 == CL_SIZE */
174 1.1 dyoung fatp_word_t nxt : FATP_NXT_WIDTH;/* at most 2^17 fat pointers */
175 1.1 dyoung fatp_word_t tag[FATP_NTAGS]; /* 15 tags per CL */
176 1.1 dyoung };
177 1.1 dyoung
178 1.1 dyoung static inline int
179 1.1 dyoung fatp_ntags(void)
180 1.1 dyoung {
181 1.1 dyoung return FATP_NTAGS;
182 1.1 dyoung }
183 1.1 dyoung
184 1.1 dyoung static inline int
185 1.1 dyoung fatp_full(fatp_t *fp)
186 1.1 dyoung {
187 1.1 dyoung fatp_t full;
188 1.1 dyoung
189 1.1 dyoung full.inuse = ~0;
190 1.1 dyoung
191 1.1 dyoung return (fp->inuse == full.inuse);
192 1.1 dyoung }
193 1.1 dyoung
194 1.1 dyoung struct vtw_common;
195 1.1 dyoung struct vtw_v4;
196 1.1 dyoung struct vtw_v6;
197 1.1 dyoung struct vtw_ctl;
198 1.1 dyoung
199 1.1 dyoung /*!\brief common to all vtw
200 1.1 dyoung */
201 1.1 dyoung typedef struct vtw_common {
202 1.1 dyoung struct timeval expire; /* date of birth+msl */
203 1.1 dyoung uint32_t key; /* hash key: full hash */
204 1.1 dyoung uint32_t port_key; /* hash key: local port hash */
205 1.1 dyoung uint32_t rcv_nxt;
206 1.1 dyoung uint32_t rcv_wnd;
207 1.1 dyoung uint32_t snd_nxt;
208 1.1 dyoung uint32_t snd_scale : 8; /* window scaling for send win */
209 1.1 dyoung uint32_t msl_class : 2; /* TCP MSL class {0,1,2,3} */
210 1.1 dyoung uint32_t reuse_port : 1;
211 1.1 dyoung uint32_t reuse_addr : 1;
212 1.1 dyoung uint32_t v6only : 1;
213 1.1 dyoung uint32_t hashed : 1; /* reachable via FATP */
214 1.1 dyoung uint32_t uid;
215 1.1 dyoung } vtw_t;
216 1.1 dyoung
217 1.1 dyoung /*!\brief vestigial timewait for IPv4
218 1.1 dyoung */
219 1.1 dyoung typedef struct vtw_v4 {
220 1.1 dyoung vtw_t common; /* must be first */
221 1.1 dyoung uint16_t lport;
222 1.1 dyoung uint16_t fport;
223 1.1 dyoung uint32_t laddr;
224 1.1 dyoung uint32_t faddr;
225 1.1 dyoung } vtw_v4_t;
226 1.1 dyoung
227 1.1 dyoung /*!\brief vestigial timewait for IPv6
228 1.1 dyoung */
229 1.1 dyoung typedef struct vtw_v6 {
230 1.1 dyoung vtw_t common; /* must be first */
231 1.1 dyoung uint16_t lport;
232 1.1 dyoung uint16_t fport;
233 1.1 dyoung struct in6_addr laddr;
234 1.1 dyoung struct in6_addr faddr;
235 1.1 dyoung } vtw_v6_t;
236 1.1 dyoung
237 1.1 dyoung struct fatp_ctl;
238 1.1 dyoung typedef struct vtw_ctl vtw_ctl_t;
239 1.1 dyoung typedef struct fatp_ctl fatp_ctl_t;
240 1.1 dyoung
241 1.1 dyoung /*
242 1.3 dholland * The vestigial time waits are kept in a contiguous chunk.
243 1.1 dyoung * Allocation and free pointers run as clock hands thru this array.
244 1.1 dyoung */
245 1.1 dyoung struct vtw_ctl {
246 1.1 dyoung fatp_ctl_t *fat; /* collection of fatp to use */
247 1.1 dyoung vtw_ctl_t *ctl; /* <! controller's controller */
248 1.1 dyoung union {
249 1.1 dyoung vtw_t *v; /* common */
250 1.1 dyoung struct vtw_v4 *v4; /* IPv4 resources */
251 1.1 dyoung struct vtw_v6 *v6; /* IPv6 resources */
252 1.1 dyoung } base, /* base of vtw_t array */
253 1.1 dyoung /**/ lim, /* extent of vtw_t array */
254 1.1 dyoung /**/ alloc, /* allocation pointer */
255 1.1 dyoung /**/ oldest; /* ^ to oldest */
256 1.1 dyoung uint32_t nfree; /* # free */
257 1.1 dyoung uint32_t nalloc; /* # allocated */
258 1.1 dyoung uint32_t idx_mask; /* mask capturing all index bits*/
259 1.1 dyoung uint32_t is_v4 : 1;
260 1.1 dyoung uint32_t is_v6 : 1;
261 1.1 dyoung uint32_t idx_bits: 6;
262 1.1 dyoung uint32_t clidx : 3; /* <! class index */
263 1.1 dyoung };
264 1.1 dyoung
265 1.1 dyoung /*!\brief Collections of fat pointers.
266 1.1 dyoung */
267 1.1 dyoung struct fatp_ctl {
268 1.1 dyoung vtw_ctl_t *vtw; /* associated VTWs */
269 1.1 dyoung fatp_t *base; /* base of fatp_t array */
270 1.1 dyoung fatp_t *lim; /* extent of fatp_t array */
271 1.1 dyoung fatp_t *free; /* free list */
272 1.1 dyoung uint32_t mask; /* hash mask */
273 1.1 dyoung uint32_t nfree; /* # free */
274 1.1 dyoung uint32_t nalloc; /* # allocated */
275 1.1 dyoung fatp_t **hash; /* hash anchors */
276 1.1 dyoung fatp_t **port; /* port hash anchors */
277 1.1 dyoung };
278 1.1 dyoung
279 1.1 dyoung /*!\brief stats
280 1.1 dyoung */
281 1.1 dyoung struct vtw_stats {
282 1.1 dyoung uint64_t ins; /* <! inserts */
283 1.1 dyoung uint64_t del; /* <! deleted */
284 1.1 dyoung uint64_t kill; /* <! assassination */
285 1.1 dyoung uint64_t look[2]; /* <! lookup: full hash, port hash */
286 1.1 dyoung uint64_t hit[2]; /* <! lookups that hit */
287 1.1 dyoung uint64_t miss[2]; /* <! lookups that miss */
288 1.1 dyoung uint64_t probe[2]; /* <! hits+miss */
289 1.1 dyoung uint64_t losing[2]; /* <! misses requiring dereference */
290 1.1 dyoung uint64_t max_chain[2]; /* <! max fatp chain traversed */
291 1.1 dyoung uint64_t max_probe[2]; /* <! max probes in any one chain */
292 1.1 dyoung uint64_t max_loss[2]; /* <! max losing probes in any one
293 1.1 dyoung * chain
294 1.1 dyoung */
295 1.1 dyoung };
296 1.1 dyoung
297 1.1 dyoung typedef struct vtw_stats vtw_stats_t;
298 1.1 dyoung
299 1.1 dyoung /*!\brief follow fatp next 'pointer'
300 1.1 dyoung */
301 1.1 dyoung static inline fatp_t *
302 1.1 dyoung fatp_next(fatp_ctl_t *fat, fatp_t *fp)
303 1.1 dyoung {
304 1.1 dyoung return fp->nxt ? fat->base + fp->nxt-1 : 0;
305 1.1 dyoung }
306 1.1 dyoung
307 1.1 dyoung /*!\brief determine a collection-relative fat pointer index.
308 1.1 dyoung */
309 1.1 dyoung static inline uint32_t
310 1.1 dyoung fatp_index(fatp_ctl_t *fat, fatp_t *fp)
311 1.1 dyoung {
312 1.1 dyoung return fp ? 1 + (fp - fat->base) : 0;
313 1.1 dyoung }
314 1.1 dyoung
315 1.1 dyoung
316 1.1 dyoung static inline uint32_t
317 1.1 dyoung v4_tag(uint32_t faddr, uint32_t fport, uint32_t laddr, uint32_t lport)
318 1.1 dyoung {
319 1.1 dyoung return (ntohl(faddr) + ntohs(fport)
320 1.1 dyoung + ntohl(laddr) + ntohs(lport));
321 1.1 dyoung }
322 1.1 dyoung
323 1.1 dyoung static inline uint32_t
324 1.1 dyoung v6_tag(const struct in6_addr *faddr, uint16_t fport,
325 1.1 dyoung const struct in6_addr *laddr, uint16_t lport)
326 1.1 dyoung {
327 1.1 dyoung #ifdef IN6_HASH
328 1.1 dyoung return IN6_HASH(faddr, fport, laddr, lport);
329 1.1 dyoung #else
330 1.1 dyoung return 0;
331 1.1 dyoung #endif
332 1.1 dyoung }
333 1.1 dyoung
334 1.1 dyoung static inline uint32_t
335 1.1 dyoung v4_port_tag(uint16_t lport)
336 1.1 dyoung {
337 1.1 dyoung uint32_t tag = lport ^ (lport << 11);
338 1.1 dyoung
339 1.1 dyoung tag ^= tag << 3;
340 1.1 dyoung tag += tag >> 5;
341 1.1 dyoung tag ^= tag << 4;
342 1.1 dyoung tag += tag >> 17;
343 1.1 dyoung tag ^= tag << 25;
344 1.1 dyoung tag += tag >> 6;
345 1.1 dyoung
346 1.1 dyoung return tag;
347 1.1 dyoung }
348 1.1 dyoung
349 1.1 dyoung static inline uint32_t
350 1.1 dyoung v6_port_tag(uint16_t lport)
351 1.1 dyoung {
352 1.1 dyoung return v4_port_tag(lport);
353 1.1 dyoung }
354 1.1 dyoung
355 1.1 dyoung struct tcpcb;
356 1.1 dyoung struct tcphdr;
357 1.1 dyoung
358 1.1 dyoung int vtw_add(int, struct tcpcb *);
359 1.1 dyoung void vtw_del(vtw_ctl_t *, vtw_t *);
360 1.1 dyoung int vtw_lookup_v4(const struct ip *ip, const struct tcphdr *th,
361 1.1 dyoung uint32_t faddr, uint16_t fport,
362 1.1 dyoung uint32_t laddr, uint16_t lport);
363 1.1 dyoung struct ip6_hdr;
364 1.1 dyoung struct in6_addr;
365 1.1 dyoung
366 1.1 dyoung int vtw_lookup_v6(const struct ip6_hdr *ip, const struct tcphdr *th,
367 1.1 dyoung const struct in6_addr *faddr, uint16_t fport,
368 1.1 dyoung const struct in6_addr *laddr, uint16_t lport);
369 1.1 dyoung
370 1.1 dyoung typedef struct vestigial_inpcb {
371 1.1 dyoung union {
372 1.1 dyoung struct in_addr v4;
373 1.1 dyoung struct in6_addr v6;
374 1.1 dyoung } faddr, laddr;
375 1.1 dyoung uint16_t fport, lport;
376 1.1 dyoung uint32_t valid : 1;
377 1.1 dyoung uint32_t v4 : 1;
378 1.1 dyoung uint32_t reuse_addr : 1;
379 1.1 dyoung uint32_t reuse_port : 1;
380 1.1 dyoung uint32_t v6only : 1;
381 1.1 dyoung uint32_t more_tbd : 1;
382 1.1 dyoung uint32_t uid;
383 1.1 dyoung uint32_t rcv_nxt;
384 1.1 dyoung uint32_t rcv_wnd;
385 1.1 dyoung uint32_t snd_nxt;
386 1.1 dyoung struct vtw_common *vtw;
387 1.1 dyoung struct vtw_ctl *ctl;
388 1.1 dyoung } vestigial_inpcb_t;
389 1.1 dyoung
390 1.1 dyoung void vtw_restart(vestigial_inpcb_t*);
391 1.1 dyoung int vtw_earlyinit(void);
392 1.4 dyoung int sysctl_tcp_vtw_enable(SYSCTLFN_PROTO);
393 1.1 dyoung
394 1.1 dyoung #ifdef VTW_DEBUG
395 1.1 dyoung typedef struct sin_either {
396 1.1 dyoung uint8_t sin_len;
397 1.1 dyoung uint8_t sin_family;
398 1.1 dyoung uint16_t sin_port;
399 1.1 dyoung union {
400 1.1 dyoung struct in_addr v4;
401 1.1 dyoung struct in6_addr v6;
402 1.1 dyoung } sin_addr;
403 1.1 dyoung } sin_either_t;
404 1.1 dyoung
405 1.1 dyoung int vtw_debug_add(int af, sin_either_t *, sin_either_t *, int, int);
406 1.1 dyoung
407 1.1 dyoung typedef struct vtw_sysargs {
408 1.1 dyoung uint32_t op;
409 1.1 dyoung sin_either_t fa;
410 1.1 dyoung sin_either_t la;
411 1.1 dyoung } vtw_sysargs_t;
412 1.1 dyoung
413 1.1 dyoung #endif /* VTW_DEBUG */
414 1.1 dyoung
415 1.1 dyoung #endif /* _NETINET_TCP_VTW_H */
416