tcp_vtw.h revision 1.6.2.1 1 /* $NetBSD: tcp_vtw.h,v 1.6.2.1 2013/07/17 03:16:31 rmind Exp $ */
2 /*
3 * Copyright (c) 2011 The NetBSD Foundation, Inc.
4 * All rights reserved.
5 *
6 * This code is derived from software contributed to The NetBSD Foundation
7 * by Coyote Point Systems, Inc.
8 *
9 * Redistribution and use in source and binary forms, with or without
10 * modification, are permitted provided that the following conditions
11 * are met:
12 * 1. Redistributions of source code must retain the above copyright
13 * notice, this list of conditions and the following disclaimer.
14 * 2. Redistributions in binary form must reproduce the above copyright
15 * notice, this list of conditions and the following disclaimer in the
16 * documentation and/or other materials provided with the distribution.
17 *
18 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
19 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
20 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
21 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
22 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
23 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
24 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
25 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
26 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
27 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
28 * POSSIBILITY OF SUCH DAMAGE.
29 */
30
31 /*
32 * Vestigial time-wait.
33 *
34 * This implementation uses cache-efficient techniques, which will
35 * appear somewhat peculiar. The main philosophy is to optimise the
36 * amount of information available within a cache line. Cache miss is
37 * expensive. So we employ ad-hoc techniques to pull a series of
38 * linked-list follows into a cache line. One cache line, multiple
39 * linked-list equivalents.
40 *
41 * One such ad-hoc technique is fat pointers. Additional degrees of
42 * ad-hoqueness result from having to hand tune it for pointer size
43 * and for cache line size.
44 *
45 * The 'fat pointer' approach aggregates, for x86_32, 15 linked-list
46 * data structures into one cache line. The additional 32 bits in the
47 * cache line are used for linking fat pointers, and for
48 * allocation/bookkeeping.
49 *
50 * The 15 32-bit tags encode the pointers to the linked list elements,
51 * and also encode the results of a search comparison.
52 *
53 * First, some more assumptions/restrictions.
54 *
55 * All the fat pointers are from a contiguous allocation arena. Thus,
56 * we can refer to them by offset from a base, not as full pointers.
57 *
58 * All the linked list data elements are also from a contiguous
59 * allocation arena, again so that we can refer to them as offset from
60 * a base.
61 *
62 * In order to add a data element to a fat pointer, a key value is
63 * computed, based on unique data within the data element. It is the
64 * linear searching of the linked lists of these elements based on
65 * these unique data that are being optimised here.
66 *
67 * Lets call the function that computes the key k(e), where e is the
68 * data element. In this example, k(e) returns 32-bits.
69 *
70 * Consider a set E (say of order 15) of data elements. Let K be
71 * the set of the k(e) for e in E.
72 *
73 * Let O be the set of the offsets from the base of the data elements in E.
74 *
75 * For each x in K, for each matching o in O, let t be x ^ o. These
76 * are the tags. (More or less).
77 *
78 * In order to search all the data elements in E, we compute the
79 * search key, and one at a time, XOR the key into the tags. If any
80 * result is a valid data element index, we have a possible match. If
81 * not, there is no match.
82 *
83 * The no-match cases mean we do not have to de-reference the pointer
84 * to the data element in question. We save cache miss penalty and
85 * cache load decreases. Only in the case of a valid looking data
86 * element index, do we have to look closer.
87 *
88 * Thus, in the absence of false positives, 15 data elements can be
89 * searched with one cache line fill, as opposed to 15 cache line
90 * fills for the usual implementation.
91 *
92 * The vestigial time waits (vtw_t), the data elements in the above, are
93 * searched by faddr, fport, laddr, lport. The key is a function of
94 * these values.
95 *
96 * We hash these keys into the traditional hash chains to reduce the
97 * search time, and use fat pointers to reduce the cache impacts of
98 * searching.
99 *
100 * The vtw_t are, per requirement, in a contiguous chunk. Allocation
101 * is done with a clock hand, and all vtw_t within one allocation
102 * domain have the same lifetime, so they will always be sorted by
103 * age.
104 *
105 * A vtw_t will be allocated, timestamped, and have a fixed future
106 * expiration. It will be added to a hash bucket implemented with fat
107 * pointers, which means that a cache line will be allocated in the
108 * hash bucket, placed at the head (more recent in time) and the vtw_t
109 * will be added to this. As more entries are added, the fat pointer
110 * cache line will fill, requiring additional cache lines for fat
111 * pointers to be allocated. These will be added at the head, and the
112 * aged entries will hang down, tapeworm like. As the vtw_t entries
113 * expire, the corresponding slot in the fat pointer will be
114 * reclaimed, and eventually the cache line will completely empty and
115 * be re-cycled, if not at the head of the chain.
116 *
117 * At times, a time-wait timer is restarted. This corresponds to
118 * deleting the current entry and re-adding it.
119 *
120 * Most of the time, they are just placed here to die.
121 */
122 #ifndef _NETINET_TCP_VTW_H
123 #define _NETINET_TCP_VTW_H
124
125 #if !defined(_KERNEL)
126 #error "not supposed to be exposed to userland"
127 #endif
128
129 #include <sys/types.h>
130 #include <sys/socket.h>
131 #include <sys/sysctl.h>
132 #include <net/if.h>
133 #include <net/route.h>
134 #include <netinet/in.h>
135 #include <netinet/in_systm.h>
136 #include <netinet/ip.h>
137 #include <netinet/in_pcb.h>
138 #include <netinet/in_var.h>
139 #include <netinet/ip_var.h>
140 #include <netinet/in.h>
141 #include <netinet/tcp.h>
142 #include <netinet/tcp_timer.h>
143 #include <netinet/tcp_var.h>
144 #include <netinet6/in6.h>
145 #include <netinet/ip6.h>
146 #include <netinet6/ip6_var.h>
147 #include <netinet6/in6_pcb.h>
148 #include <netinet6/ip6_var.h>
149 #include <netinet6/in6_var.h>
150 #include <netinet/icmp6.h>
151 #include <netinet6/nd6.h>
152
153 #define VTW_NCLASS (1+3) /* # different classes */
154
155 /*
156 * fat pointers, MI.
157 */
158 struct fatp_mi;
159
160 typedef uint32_t fatp_word_t;
161
162 typedef struct fatp_mi fatp_t;
163
164 /* Supported cacheline sizes: 32 64 128 bytes. See fatp_key(),
165 * fatp_slot_from_key(), fatp_xtra[].
166 */
167 #define FATP_NTAGS (CACHE_LINE_SIZE / sizeof(fatp_word_t) - 1)
168 #define FATP_NXT_WIDTH (sizeof(fatp_word_t) * NBBY - FATP_NTAGS)
169
170 #define FATP_MAX (1 << FATP_NXT_WIDTH)
171
172 /* Worked example: ULP32 with 64-byte cacheline (32-bit x86):
173 * 15 tags per cacheline. At most 2^17 fat pointers per fatp_ctl_t.
174 * The comments on the fatp_mi members, below, correspond to the worked
175 * example.
176 */
177 struct fatp_mi {
178 fatp_word_t inuse : FATP_NTAGS; /* (1+15)*4 == CL_SIZE */
179 fatp_word_t nxt : FATP_NXT_WIDTH;/* at most 2^17 fat pointers */
180 fatp_word_t tag[FATP_NTAGS]; /* 15 tags per CL */
181 };
182
183 static inline int
184 fatp_ntags(void)
185 {
186 return FATP_NTAGS;
187 }
188
189 static inline int
190 fatp_full(fatp_t *fp)
191 {
192 fatp_t full;
193
194 full.inuse = (1U << FATP_NTAGS) - 1U;
195
196 return (fp->inuse == full.inuse);
197 }
198
199 struct vtw_common;
200 struct vtw_v4;
201 struct vtw_v6;
202 struct vtw_ctl;
203
204 /*!\brief common to all vtw
205 */
206 typedef struct vtw_common {
207 struct timeval expire; /* date of birth+msl */
208 uint32_t key; /* hash key: full hash */
209 uint32_t port_key; /* hash key: local port hash */
210 uint32_t rcv_nxt;
211 uint32_t rcv_wnd;
212 uint32_t snd_nxt;
213 uint32_t snd_scale : 8; /* window scaling for send win */
214 uint32_t msl_class : 2; /* TCP MSL class {0,1,2,3} */
215 uint32_t reuse_port : 1;
216 uint32_t reuse_addr : 1;
217 uint32_t v6only : 1;
218 uint32_t hashed : 1; /* reachable via FATP */
219 uint32_t uid;
220 } vtw_t;
221
222 /*!\brief vestigial timewait for IPv4
223 */
224 typedef struct vtw_v4 {
225 vtw_t common; /* must be first */
226 uint16_t lport;
227 uint16_t fport;
228 uint32_t laddr;
229 uint32_t faddr;
230 } vtw_v4_t;
231
232 /*!\brief vestigial timewait for IPv6
233 */
234 typedef struct vtw_v6 {
235 vtw_t common; /* must be first */
236 uint16_t lport;
237 uint16_t fport;
238 struct in6_addr laddr;
239 struct in6_addr faddr;
240 } vtw_v6_t;
241
242 struct fatp_ctl;
243 typedef struct vtw_ctl vtw_ctl_t;
244 typedef struct fatp_ctl fatp_ctl_t;
245
246 /*
247 * The vestigial time waits are kept in a contiguous chunk.
248 * Allocation and free pointers run as clock hands thru this array.
249 */
250 struct vtw_ctl {
251 fatp_ctl_t *fat; /* collection of fatp to use */
252 vtw_ctl_t *ctl; /* <! controller's controller */
253 union {
254 vtw_t *v; /* common */
255 struct vtw_v4 *v4; /* IPv4 resources */
256 struct vtw_v6 *v6; /* IPv6 resources */
257 } base, /* base of vtw_t array */
258 /**/ lim, /* extent of vtw_t array */
259 /**/ alloc, /* allocation pointer */
260 /**/ oldest; /* ^ to oldest */
261 uint32_t nfree; /* # free */
262 uint32_t nalloc; /* # allocated */
263 uint32_t idx_mask; /* mask capturing all index bits*/
264 uint32_t is_v4 : 1;
265 uint32_t is_v6 : 1;
266 uint32_t idx_bits: 6;
267 uint32_t clidx : 3; /* <! class index */
268 };
269
270 /*!\brief Collections of fat pointers.
271 */
272 struct fatp_ctl {
273 vtw_ctl_t *vtw; /* associated VTWs */
274 fatp_t *base; /* base of fatp_t array */
275 fatp_t *lim; /* extent of fatp_t array */
276 fatp_t *free; /* free list */
277 uint32_t mask; /* hash mask */
278 uint32_t nfree; /* # free */
279 uint32_t nalloc; /* # allocated */
280 fatp_t **hash; /* hash anchors */
281 fatp_t **port; /* port hash anchors */
282 };
283
284 /*!\brief stats
285 */
286 struct vtw_stats {
287 uint64_t ins; /* <! inserts */
288 uint64_t del; /* <! deleted */
289 uint64_t kill; /* <! assassination */
290 uint64_t look[2]; /* <! lookup: full hash, port hash */
291 uint64_t hit[2]; /* <! lookups that hit */
292 uint64_t miss[2]; /* <! lookups that miss */
293 uint64_t probe[2]; /* <! hits+miss */
294 uint64_t losing[2]; /* <! misses requiring dereference */
295 uint64_t max_chain[2]; /* <! max fatp chain traversed */
296 uint64_t max_probe[2]; /* <! max probes in any one chain */
297 uint64_t max_loss[2]; /* <! max losing probes in any one
298 * chain
299 */
300 };
301
302 typedef struct vtw_stats vtw_stats_t;
303
304 /*!\brief follow fatp next 'pointer'
305 */
306 static inline fatp_t *
307 fatp_next(fatp_ctl_t *fat, fatp_t *fp)
308 {
309 return fp->nxt ? fat->base + fp->nxt-1 : 0;
310 }
311
312 /*!\brief determine a collection-relative fat pointer index.
313 */
314 static inline uint32_t
315 fatp_index(fatp_ctl_t *fat, fatp_t *fp)
316 {
317 return fp ? 1 + (fp - fat->base) : 0;
318 }
319
320
321 static inline uint32_t
322 v4_tag(uint32_t faddr, uint32_t fport, uint32_t laddr, uint32_t lport)
323 {
324 return (ntohl(faddr) + ntohs(fport)
325 + ntohl(laddr) + ntohs(lport));
326 }
327
328 static inline uint32_t
329 v6_tag(const struct in6_addr *faddr, uint16_t fport,
330 const struct in6_addr *laddr, uint16_t lport)
331 {
332 #ifdef IN6_HASH
333 return IN6_HASH(faddr, fport, laddr, lport);
334 #else
335 return 0;
336 #endif
337 }
338
339 static inline uint32_t
340 v4_port_tag(uint16_t lport)
341 {
342 uint32_t tag = lport ^ (lport << 11);
343
344 tag ^= tag << 3;
345 tag += tag >> 5;
346 tag ^= tag << 4;
347 tag += tag >> 17;
348 tag ^= tag << 25;
349 tag += tag >> 6;
350
351 return tag;
352 }
353
354 static inline uint32_t
355 v6_port_tag(uint16_t lport)
356 {
357 return v4_port_tag(lport);
358 }
359
360 struct tcpcb;
361 struct tcphdr;
362
363 int vtw_add(int, struct tcpcb *);
364 void vtw_del(vtw_ctl_t *, vtw_t *);
365 int vtw_lookup_v4(const struct ip *ip, const struct tcphdr *th,
366 uint32_t faddr, uint16_t fport,
367 uint32_t laddr, uint16_t lport);
368 struct ip6_hdr;
369 struct in6_addr;
370
371 int vtw_lookup_v6(const struct ip6_hdr *ip, const struct tcphdr *th,
372 const struct in6_addr *faddr, uint16_t fport,
373 const struct in6_addr *laddr, uint16_t lport);
374
375 typedef struct vestigial_inpcb {
376 union {
377 struct in_addr v4;
378 struct in6_addr v6;
379 } faddr, laddr;
380 uint16_t fport, lport;
381 uint32_t valid : 1;
382 uint32_t v4 : 1;
383 uint32_t reuse_addr : 1;
384 uint32_t reuse_port : 1;
385 uint32_t v6only : 1;
386 uint32_t more_tbd : 1;
387 uint32_t uid;
388 uint32_t rcv_nxt;
389 uint32_t rcv_wnd;
390 uint32_t snd_nxt;
391 struct vtw_common *vtw;
392 struct vtw_ctl *ctl;
393 } vestigial_inpcb_t;
394
395 /*
396 * Hooks for vestigial pcb entries. If vestigial entries exist for a
397 * table (TCP only) the vestigial pointer is set.
398 */
399 typedef struct vestigial_hooks {
400 /* IPv4 hooks */
401 void *(*init_ports4)(struct in_addr, u_int, int);
402 int (*next_port4)(void *, struct vestigial_inpcb *);
403 int (*lookup4)(struct in_addr, uint16_t,
404 struct in_addr, uint16_t,
405 struct vestigial_inpcb *);
406 /* IPv6 hooks */
407 void *(*init_ports6)(const struct in6_addr*, u_int, int);
408 int (*next_port6)(void *, struct vestigial_inpcb *);
409 int (*lookup6)(const struct in6_addr *, uint16_t,
410 const struct in6_addr *, uint16_t,
411 struct vestigial_inpcb *);
412 } vestigial_hooks_t;
413
414 void vtw_restart(vestigial_inpcb_t*);
415 int vtw_earlyinit(void);
416 int sysctl_tcp_vtw_enable(SYSCTLFN_PROTO);
417
418 #ifdef VTW_DEBUG
419 typedef struct sin_either {
420 uint8_t sin_len;
421 uint8_t sin_family;
422 uint16_t sin_port;
423 union {
424 struct in_addr v4;
425 struct in6_addr v6;
426 } sin_addr;
427 } sin_either_t;
428
429 int vtw_debug_add(int af, sin_either_t *, sin_either_t *, int, int);
430
431 typedef struct vtw_sysargs {
432 uint32_t op;
433 sin_either_t fa;
434 sin_either_t la;
435 } vtw_sysargs_t;
436
437 #endif /* VTW_DEBUG */
438
439 #endif /* _NETINET_TCP_VTW_H */
440