Home | History | Annotate | Line # | Download | only in netinet
tcp_vtw.h revision 1.6.2.1
      1 /*	$NetBSD: tcp_vtw.h,v 1.6.2.1 2013/07/17 03:16:31 rmind Exp $	*/
      2 /*
      3  * Copyright (c) 2011 The NetBSD Foundation, Inc.
      4  * All rights reserved.
      5  *
      6  * This code is derived from software contributed to The NetBSD Foundation
      7  * by Coyote Point Systems, Inc.
      8  *
      9  * Redistribution and use in source and binary forms, with or without
     10  * modification, are permitted provided that the following conditions
     11  * are met:
     12  * 1. Redistributions of source code must retain the above copyright
     13  *    notice, this list of conditions and the following disclaimer.
     14  * 2. Redistributions in binary form must reproduce the above copyright
     15  *    notice, this list of conditions and the following disclaimer in the
     16  *    documentation and/or other materials provided with the distribution.
     17  *
     18  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
     19  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
     20  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     21  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
     22  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
     23  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
     24  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
     25  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
     26  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
     27  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
     28  * POSSIBILITY OF SUCH DAMAGE.
     29  */
     30 
     31 /*
     32  * Vestigial time-wait.
     33  *
     34  * This implementation uses cache-efficient techniques, which will
     35  * appear somewhat peculiar.  The main philosophy is to optimise the
     36  * amount of information available within a cache line.  Cache miss is
     37  * expensive.  So we employ ad-hoc techniques to pull a series of
     38  * linked-list follows into a cache line.  One cache line, multiple
     39  * linked-list equivalents.
     40  *
     41  * One such ad-hoc technique is fat pointers.  Additional degrees of
     42  * ad-hoqueness result from having to hand tune it for pointer size
     43  * and for cache line size.
     44  *
     45  * The 'fat pointer' approach aggregates, for x86_32, 15 linked-list
     46  * data structures into one cache line.  The additional 32 bits in the
     47  * cache line are used for linking fat pointers, and for
     48  * allocation/bookkeeping.
     49  *
     50  * The 15 32-bit tags encode the pointers to the linked list elements,
     51  * and also encode the results of a search comparison.
     52  *
     53  * First, some more assumptions/restrictions.
     54  *
     55  * All the fat pointers are from a contiguous allocation arena.  Thus,
     56  * we can refer to them by offset from a base, not as full pointers.
     57  *
     58  * All the linked list data elements are also from a contiguous
     59  * allocation arena, again so that we can refer to them as offset from
     60  * a base.
     61  *
     62  * In order to add a data element to a fat pointer, a key value is
     63  * computed, based on unique data within the data element.  It is the
     64  * linear searching of the linked lists of these elements based on
     65  * these unique data that are being optimised here.
     66  *
     67  * Lets call the function that computes the key k(e), where e is the
     68  * data element.  In this example, k(e) returns 32-bits.
     69  *
     70  * Consider a set E (say of order 15) of data elements.  Let K be
     71  * the set of the k(e) for e in E.
     72  *
     73  * Let O be the set of the offsets from the base of the data elements in E.
     74  *
     75  * For each x in K, for each matching o in O, let t be x ^ o.  These
     76  * are the tags. (More or less).
     77  *
     78  * In order to search all the data elements in E, we compute the
     79  * search key, and one at a time, XOR the key into the tags.  If any
     80  * result is a valid data element index, we have a possible match.  If
     81  * not, there is no match.
     82  *
     83  * The no-match cases mean we do not have to de-reference the pointer
     84  * to the data element in question.  We save cache miss penalty and
     85  * cache load decreases.  Only in the case of a valid looking data
     86  * element index, do we have to look closer.
     87  *
     88  * Thus, in the absence of false positives, 15 data elements can be
     89  * searched with one cache line fill, as opposed to 15 cache line
     90  * fills for the usual implementation.
     91  *
     92  * The vestigial time waits (vtw_t), the data elements in the above, are
     93  * searched by faddr, fport, laddr, lport.  The key is a function of
     94  * these values.
     95  *
     96  * We hash these keys into the traditional hash chains to reduce the
     97  * search time, and use fat pointers to reduce the cache impacts of
     98  * searching.
     99  *
    100  * The vtw_t are, per requirement, in a contiguous chunk.  Allocation
    101  * is done with a clock hand, and all vtw_t within one allocation
    102  * domain have the same lifetime, so they will always be sorted by
    103  * age.
    104  *
    105  * A vtw_t will be allocated, timestamped, and have a fixed future
    106  * expiration.  It will be added to a hash bucket implemented with fat
    107  * pointers, which means that a cache line will be allocated in the
    108  * hash bucket, placed at the head (more recent in time) and the vtw_t
    109  * will be added to this.  As more entries are added, the fat pointer
    110  * cache line will fill, requiring additional cache lines for fat
    111  * pointers to be allocated. These will be added at the head, and the
    112  * aged entries will hang down, tapeworm like.  As the vtw_t entries
    113  * expire, the corresponding slot in the fat pointer will be
    114  * reclaimed, and eventually the cache line will completely empty and
    115  * be re-cycled, if not at the head of the chain.
    116  *
    117  * At times, a time-wait timer is restarted.  This corresponds to
    118  * deleting the current entry and re-adding it.
    119  *
    120  * Most of the time, they are just placed here to die.
    121  */
    122 #ifndef _NETINET_TCP_VTW_H
    123 #define _NETINET_TCP_VTW_H
    124 
    125 #if !defined(_KERNEL)
    126 #error "not supposed to be exposed to userland"
    127 #endif
    128 
    129 #include <sys/types.h>
    130 #include <sys/socket.h>
    131 #include <sys/sysctl.h>
    132 #include <net/if.h>
    133 #include <net/route.h>
    134 #include <netinet/in.h>
    135 #include <netinet/in_systm.h>
    136 #include <netinet/ip.h>
    137 #include <netinet/in_pcb.h>
    138 #include <netinet/in_var.h>
    139 #include <netinet/ip_var.h>
    140 #include <netinet/in.h>
    141 #include <netinet/tcp.h>
    142 #include <netinet/tcp_timer.h>
    143 #include <netinet/tcp_var.h>
    144 #include <netinet6/in6.h>
    145 #include <netinet/ip6.h>
    146 #include <netinet6/ip6_var.h>
    147 #include <netinet6/in6_pcb.h>
    148 #include <netinet6/ip6_var.h>
    149 #include <netinet6/in6_var.h>
    150 #include <netinet/icmp6.h>
    151 #include <netinet6/nd6.h>
    152 
    153 #define	VTW_NCLASS	(1+3)		/* # different classes */
    154 
    155 /*
    156  * fat pointers, MI.
    157  */
    158 struct fatp_mi;
    159 
    160 typedef uint32_t fatp_word_t;
    161 
    162 typedef struct fatp_mi	fatp_t;
    163 
    164 /* Supported cacheline sizes: 32 64 128 bytes.  See fatp_key(),
    165  * fatp_slot_from_key(), fatp_xtra[].
    166  */
    167 #define	FATP_NTAGS	(CACHE_LINE_SIZE / sizeof(fatp_word_t) - 1)
    168 #define	FATP_NXT_WIDTH	(sizeof(fatp_word_t) * NBBY - FATP_NTAGS)
    169 
    170 #define	FATP_MAX	(1 << FATP_NXT_WIDTH)
    171 
    172 /* Worked example: ULP32 with 64-byte cacheline (32-bit x86):
    173  * 15 tags per cacheline.  At most 2^17 fat pointers per fatp_ctl_t.
    174  * The comments on the fatp_mi members, below, correspond to the worked
    175  * example.
    176  */
    177 struct fatp_mi {
    178 	fatp_word_t	inuse	: FATP_NTAGS;	/* (1+15)*4 == CL_SIZE */
    179 	fatp_word_t	nxt	: FATP_NXT_WIDTH;/* at most 2^17 fat pointers */
    180 	fatp_word_t	tag[FATP_NTAGS];	/* 15 tags per CL */
    181 };
    182 
    183 static inline int
    184 fatp_ntags(void)
    185 {
    186 	return FATP_NTAGS;
    187 }
    188 
    189 static inline int
    190 fatp_full(fatp_t *fp)
    191 {
    192 	fatp_t full;
    193 
    194 	full.inuse = (1U << FATP_NTAGS) - 1U;
    195 
    196 	return (fp->inuse == full.inuse);
    197 }
    198 
    199 struct vtw_common;
    200 struct vtw_v4;
    201 struct vtw_v6;
    202 struct vtw_ctl;
    203 
    204 /*!\brief common to all vtw
    205  */
    206 typedef struct vtw_common {
    207 	struct timeval	expire;		/* date of birth+msl */
    208 	uint32_t	key;		/* hash key: full hash */
    209 	uint32_t	port_key;	/* hash key: local port hash */
    210 	uint32_t	rcv_nxt;
    211 	uint32_t	rcv_wnd;
    212 	uint32_t	snd_nxt;
    213 	uint32_t	snd_scale	: 8;	/* window scaling for send win */
    214 	uint32_t	msl_class	: 2;	/* TCP MSL class {0,1,2,3} */
    215 	uint32_t	reuse_port	: 1;
    216 	uint32_t	reuse_addr	: 1;
    217 	uint32_t	v6only		: 1;
    218 	uint32_t	hashed		: 1;	/* reachable via FATP */
    219 	uint32_t	uid;
    220 } vtw_t;
    221 
    222 /*!\brief vestigial timewait for IPv4
    223  */
    224 typedef struct vtw_v4 {
    225 	vtw_t		common;		/*  must be first */
    226 	uint16_t	lport;
    227 	uint16_t	fport;
    228 	uint32_t	laddr;
    229 	uint32_t	faddr;
    230 } vtw_v4_t;
    231 
    232 /*!\brief vestigial timewait for IPv6
    233  */
    234 typedef struct vtw_v6 {
    235 	vtw_t		common;		/* must be first */
    236 	uint16_t	lport;
    237 	uint16_t	fport;
    238 	struct in6_addr	laddr;
    239 	struct in6_addr	faddr;
    240 } vtw_v6_t;
    241 
    242 struct fatp_ctl;
    243 typedef struct vtw_ctl		vtw_ctl_t;
    244 typedef struct fatp_ctl		fatp_ctl_t;
    245 
    246 /*
    247  * The vestigial time waits are kept in a contiguous chunk.
    248  * Allocation and free pointers run as clock hands thru this array.
    249  */
    250 struct vtw_ctl {
    251 	fatp_ctl_t	*fat;		/* collection of fatp to use	*/
    252 	vtw_ctl_t	*ctl;		/* <! controller's controller	*/
    253 	union {
    254 		vtw_t		*v;	/* common			*/
    255 		struct vtw_v4	*v4;	/* IPv4 resources		*/
    256 		struct vtw_v6	*v6;	/* IPv6 resources		*/
    257 	}		base,		/* base of vtw_t array		*/
    258 		/**/	lim,		/* extent of vtw_t array	*/
    259 		/**/	alloc,		/* allocation pointer		*/
    260 		/**/	oldest;		/* ^ to oldest			*/
    261 	uint32_t	nfree;		/* # free			*/
    262 	uint32_t	nalloc;		/* # allocated			*/
    263 	uint32_t	idx_mask;	/* mask capturing all index bits*/
    264 	uint32_t	is_v4	: 1;
    265 	uint32_t	is_v6	: 1;
    266 	uint32_t	idx_bits: 6;
    267 	uint32_t	clidx	: 3;	/* <! class index */
    268 };
    269 
    270 /*!\brief Collections of fat pointers.
    271  */
    272 struct fatp_ctl {
    273 	vtw_ctl_t	*vtw;		/* associated VTWs		*/
    274 	fatp_t		*base;		/* base of fatp_t array		*/
    275 	fatp_t		*lim;		/* extent of fatp_t array	*/
    276 	fatp_t		*free;		/* free list			*/
    277 	uint32_t	mask;		/* hash mask			*/
    278 	uint32_t	nfree;		/* # free			*/
    279 	uint32_t	nalloc;		/* # allocated			*/
    280 	fatp_t		**hash;		/* hash anchors			*/
    281 	fatp_t		**port;		/* port hash anchors		*/
    282 };
    283 
    284 /*!\brief stats
    285  */
    286 struct vtw_stats {
    287 	uint64_t	ins;		/* <! inserts */
    288 	uint64_t	del;		/* <! deleted */
    289 	uint64_t	kill;		/* <! assassination */
    290 	uint64_t	look[2];	/* <! lookup: full hash, port hash */
    291 	uint64_t	hit[2];		/* <! lookups that hit */
    292 	uint64_t	miss[2];	/* <! lookups that miss */
    293 	uint64_t	probe[2];	/* <! hits+miss */
    294 	uint64_t	losing[2];	/* <! misses requiring dereference */
    295 	uint64_t	max_chain[2];	/* <! max fatp chain traversed */
    296 	uint64_t	max_probe[2];	/* <! max probes in any one chain */
    297 	uint64_t	max_loss[2];	/* <! max losing probes in any one
    298 					 * chain
    299 					 */
    300 };
    301 
    302 typedef struct vtw_stats	vtw_stats_t;
    303 
    304 /*!\brief	follow fatp next 'pointer'
    305  */
    306 static inline fatp_t *
    307 fatp_next(fatp_ctl_t *fat, fatp_t *fp)
    308 {
    309 	return fp->nxt ? fat->base + fp->nxt-1 : 0;
    310 }
    311 
    312 /*!\brief determine a collection-relative fat pointer index.
    313  */
    314 static inline uint32_t
    315 fatp_index(fatp_ctl_t *fat, fatp_t *fp)
    316 {
    317 	return fp ? 1 + (fp - fat->base) : 0;
    318 }
    319 
    320 
    321 static inline uint32_t
    322 v4_tag(uint32_t faddr, uint32_t fport, uint32_t laddr, uint32_t lport)
    323 {
    324 	return (ntohl(faddr)   + ntohs(fport)
    325 		+ ntohl(laddr) + ntohs(lport));
    326 }
    327 
    328 static inline uint32_t
    329 v6_tag(const struct in6_addr *faddr, uint16_t fport,
    330        const struct in6_addr *laddr, uint16_t lport)
    331 {
    332 #ifdef IN6_HASH
    333 	return IN6_HASH(faddr, fport, laddr, lport);
    334 #else
    335 	return 0;
    336 #endif
    337 }
    338 
    339 static inline uint32_t
    340 v4_port_tag(uint16_t lport)
    341 {
    342 	uint32_t tag = lport ^ (lport << 11);
    343 
    344 	tag ^= tag << 3;
    345 	tag += tag >> 5;
    346 	tag ^= tag << 4;
    347 	tag += tag >> 17;
    348 	tag ^= tag << 25;
    349 	tag += tag >> 6;
    350 
    351 	return tag;
    352 }
    353 
    354 static inline uint32_t
    355 v6_port_tag(uint16_t lport)
    356 {
    357 	return v4_port_tag(lport);
    358 }
    359 
    360 struct tcpcb;
    361 struct tcphdr;
    362 
    363 int  vtw_add(int, struct tcpcb *);
    364 void vtw_del(vtw_ctl_t *, vtw_t *);
    365 int vtw_lookup_v4(const struct ip *ip, const struct tcphdr *th,
    366 		  uint32_t faddr, uint16_t fport,
    367 		  uint32_t laddr, uint16_t lport);
    368 struct ip6_hdr;
    369 struct in6_addr;
    370 
    371 int vtw_lookup_v6(const struct ip6_hdr *ip, const struct tcphdr *th,
    372 		  const struct in6_addr *faddr, uint16_t fport,
    373 		  const struct in6_addr *laddr, uint16_t lport);
    374 
    375 typedef struct vestigial_inpcb {
    376 	union {
    377 		struct in_addr	v4;
    378 		struct in6_addr	v6;
    379 	} faddr, laddr;
    380 	uint16_t		fport, lport;
    381 	uint32_t		valid		: 1;
    382 	uint32_t		v4		: 1;
    383 	uint32_t		reuse_addr	: 1;
    384 	uint32_t		reuse_port	: 1;
    385 	uint32_t		v6only		: 1;
    386 	uint32_t		more_tbd	: 1;
    387 	uint32_t		uid;
    388 	uint32_t		rcv_nxt;
    389 	uint32_t		rcv_wnd;
    390 	uint32_t		snd_nxt;
    391 	struct vtw_common	*vtw;
    392 	struct vtw_ctl		*ctl;
    393 } vestigial_inpcb_t;
    394 
    395 /*
    396  * Hooks for vestigial pcb entries.  If vestigial entries exist for a
    397  * table (TCP only) the vestigial pointer is set.
    398  */
    399 typedef struct vestigial_hooks {
    400 	/* IPv4 hooks */
    401 	void	*(*init_ports4)(struct in_addr, u_int, int);
    402 	int	(*next_port4)(void *, struct vestigial_inpcb *);
    403 	int	(*lookup4)(struct in_addr, uint16_t,
    404 			   struct in_addr, uint16_t,
    405 			   struct vestigial_inpcb *);
    406 	/* IPv6 hooks */
    407 	void	*(*init_ports6)(const struct in6_addr*, u_int, int);
    408 	int	(*next_port6)(void *, struct vestigial_inpcb *);
    409 	int	(*lookup6)(const struct in6_addr *, uint16_t,
    410 			   const struct in6_addr *, uint16_t,
    411 			   struct vestigial_inpcb *);
    412 } vestigial_hooks_t;
    413 
    414 void vtw_restart(vestigial_inpcb_t*);
    415 int vtw_earlyinit(void);
    416 int sysctl_tcp_vtw_enable(SYSCTLFN_PROTO);
    417 
    418 #ifdef VTW_DEBUG
    419 typedef struct sin_either {
    420 	uint8_t		sin_len;
    421 	uint8_t		sin_family;
    422 	uint16_t	sin_port;
    423 	union {
    424 		struct in_addr	v4;
    425 		struct in6_addr	v6;
    426 	}		sin_addr;
    427 } sin_either_t;
    428 
    429 int vtw_debug_add(int af, sin_either_t *, sin_either_t *, int, int);
    430 
    431 typedef struct vtw_sysargs {
    432 	uint32_t	op;
    433 	sin_either_t	fa;
    434 	sin_either_t	la;
    435 } vtw_sysargs_t;
    436 
    437 #endif /* VTW_DEBUG */
    438 
    439 #endif /* _NETINET_TCP_VTW_H */
    440