Home | History | Annotate | Line # | Download | only in npf
npf_conn.c revision 1.18
      1 /*	$NetBSD: npf_conn.c,v 1.18 2016/12/10 05:41:10 christos Exp $	*/
      2 
      3 /*-
      4  * Copyright (c) 2014-2015 Mindaugas Rasiukevicius <rmind at netbsd org>
      5  * Copyright (c) 2010-2014 The NetBSD Foundation, Inc.
      6  * All rights reserved.
      7  *
      8  * This material is based upon work partially supported by The
      9  * NetBSD Foundation under a contract with Mindaugas Rasiukevicius.
     10  *
     11  * Redistribution and use in source and binary forms, with or without
     12  * modification, are permitted provided that the following conditions
     13  * are met:
     14  * 1. Redistributions of source code must retain the above copyright
     15  *    notice, this list of conditions and the following disclaimer.
     16  * 2. Redistributions in binary form must reproduce the above copyright
     17  *    notice, this list of conditions and the following disclaimer in the
     18  *    documentation and/or other materials provided with the distribution.
     19  *
     20  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
     21  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
     22  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     23  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
     24  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
     25  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
     26  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
     27  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
     28  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
     29  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
     30  * POSSIBILITY OF SUCH DAMAGE.
     31  */
     32 
     33 /*
     34  * NPF connection tracking for stateful filtering and translation.
     35  *
     36  * Overview
     37  *
     38  *	Connection direction is identified by the direction of its first
     39  *	packet.  Packets can be incoming or outgoing with respect to an
     40  *	interface.  To describe the packet in the context of connection
     41  *	direction we will use the terms "forwards stream" and "backwards
     42  *	stream".  All connections have two keys and thus two entries:
     43  *
     44  *		npf_conn_t::c_forw_entry for the forwards stream and
     45  *		npf_conn_t::c_back_entry for the backwards stream.
     46  *
     47  *	The keys are formed from the 5-tuple (source/destination address,
     48  *	source/destination port and the protocol).  Additional matching
     49  *	is performed for the interface (a common behaviour is equivalent
     50  *	to the 6-tuple lookup including the interface ID).  Note that the
     51  *	key may be formed using translated values in a case of NAT.
     52  *
     53  *	Connections can serve two purposes: for the implicit passing or
     54  *	to accommodate the dynamic NAT.  Connections for the former purpose
     55  *	are created by the rules with "stateful" attribute and are used for
     56  *	stateful filtering.  Such connections indicate that the packet of
     57  *	the backwards stream should be passed without inspection of the
     58  *	ruleset.  The other purpose is to associate a dynamic NAT mechanism
     59  *	with a connection.  Such connections are created by the NAT policies
     60  *	and they have a relationship with NAT translation structure via
     61  *	npf_conn_t::c_nat.  A single connection can serve both purposes,
     62  *	which is a common case.
     63  *
     64  * Connection life-cycle
     65  *
     66  *	Connections are established when a packet matches said rule or
     67  *	NAT policy.  Both keys of the established connection are inserted
     68  *	into the connection database.  A garbage collection thread
     69  *	periodically scans all connections and depending on connection
     70  *	properties (e.g. last activity time, protocol) removes connection
     71  *	entries and expires the actual connections.
     72  *
     73  *	Each connection has a reference count.  The reference is acquired
     74  *	on lookup and should be released by the caller.  It guarantees that
     75  *	the connection will not be destroyed, although it may be expired.
     76  *
     77  * Synchronisation
     78  *
     79  *	Connection database is accessed in a lock-less manner by the main
     80  *	routines: npf_conn_inspect() and npf_conn_establish().  Since they
     81  *	are always called from a software interrupt, the database is
     82  *	protected using passive serialisation.  The main place which can
     83  *	destroy a connection is npf_conn_worker().  The database itself
     84  *	can be replaced and destroyed in npf_conn_load().
     85  *
     86  * ALG support
     87  *
     88  *	Application-level gateways (ALGs) can override generic connection
     89  *	inspection (npf_alg_conn() call in npf_conn_inspect() function) by
     90  *	performing their own lookup using different key.  Recursive call
     91  *	to npf_conn_inspect() is not allowed.  The ALGs ought to use the
     92  *	npf_conn_lookup() function for this purpose.
     93  *
     94  * Lock order
     95  *
     96  *	npf_config_lock ->
     97  *		conn_lock ->
     98  *			npf_conn_t::c_lock
     99  */
    100 
    101 #include <sys/cdefs.h>
    102 __KERNEL_RCSID(0, "$NetBSD: npf_conn.c,v 1.18 2016/12/10 05:41:10 christos Exp $");
    103 
    104 #include <sys/param.h>
    105 #include <sys/types.h>
    106 
    107 #include <netinet/in.h>
    108 #include <netinet/tcp.h>
    109 
    110 #include <sys/atomic.h>
    111 #include <sys/condvar.h>
    112 #include <sys/kmem.h>
    113 #include <sys/kthread.h>
    114 #include <sys/mutex.h>
    115 #include <net/pfil.h>
    116 #include <sys/pool.h>
    117 #include <sys/queue.h>
    118 #include <sys/systm.h>
    119 
    120 #define __NPF_CONN_PRIVATE
    121 #include "npf_conn.h"
    122 #include "npf_impl.h"
    123 
    124 /*
    125  * Connection flags: PFIL_IN and PFIL_OUT values are reserved for direction.
    126  */
    127 CTASSERT(PFIL_ALL == (0x001 | 0x002));
    128 #define	CONN_ACTIVE	0x004	/* visible on inspection */
    129 #define	CONN_PASS	0x008	/* perform implicit passing */
    130 #define	CONN_EXPIRE	0x010	/* explicitly expire */
    131 #define	CONN_REMOVED	0x020	/* "forw/back" entries removed */
    132 
    133 /*
    134  * Connection tracking state: disabled (off) or enabled (on).
    135  */
    136 enum { CONN_TRACKING_OFF, CONN_TRACKING_ON };
    137 static volatile int	conn_tracking	__cacheline_aligned;
    138 
    139 /* Connection tracking database, connection cache and the lock. */
    140 static npf_conndb_t *	conn_db		__read_mostly;
    141 static pool_cache_t	conn_cache	__read_mostly;
    142 static kmutex_t		conn_lock	__cacheline_aligned;
    143 
    144 static void	npf_conn_worker(void);
    145 static void	npf_conn_destroy(npf_conn_t *);
    146 
    147 /*
    148  * npf_conn_sys{init,fini}: initialise/destroy connection tracking.
    149  */
    150 
    151 void
    152 npf_conn_sysinit(void)
    153 {
    154 	conn_cache = pool_cache_init(sizeof(npf_conn_t), coherency_unit,
    155 	    0, 0, "npfconpl", NULL, IPL_NET, NULL, NULL, NULL);
    156 	mutex_init(&conn_lock, MUTEX_DEFAULT, IPL_NONE);
    157 	conn_tracking = CONN_TRACKING_OFF;
    158 	conn_db = npf_conndb_create();
    159 
    160 	npf_worker_register(npf_conn_worker);
    161 }
    162 
    163 void
    164 npf_conn_sysfini(void)
    165 {
    166 	/* Note: the caller should have flushed the connections. */
    167 	KASSERT(conn_tracking == CONN_TRACKING_OFF);
    168 	npf_worker_unregister(npf_conn_worker);
    169 
    170 	npf_conndb_destroy(conn_db);
    171 	pool_cache_destroy(conn_cache);
    172 	mutex_destroy(&conn_lock);
    173 }
    174 
    175 /*
    176  * npf_conn_load: perform the load by flushing the current connection
    177  * database and replacing it with the new one or just destroying.
    178  *
    179  * => The caller must disable the connection tracking and ensure that
    180  *    there are no connection database lookups or references in-flight.
    181  */
    182 void
    183 npf_conn_load(npf_conndb_t *ndb, bool track)
    184 {
    185 	npf_conndb_t *odb = NULL;
    186 
    187 	KASSERT(npf_config_locked_p());
    188 
    189 	/*
    190 	 * The connection database is in the quiescent state.
    191 	 * Prevent G/C thread from running and install a new database.
    192 	 */
    193 	mutex_enter(&conn_lock);
    194 	if (ndb) {
    195 		KASSERT(conn_tracking == CONN_TRACKING_OFF);
    196 		odb = conn_db;
    197 		conn_db = ndb;
    198 		membar_sync();
    199 	}
    200 	if (track) {
    201 		/* After this point lookups start flying in. */
    202 		conn_tracking = CONN_TRACKING_ON;
    203 	}
    204 	mutex_exit(&conn_lock);
    205 
    206 	if (odb) {
    207 		/*
    208 		 * Flush all, no sync since the caller did it for us.
    209 		 * Also, release the pool cache memory.
    210 		 */
    211 		npf_conn_gc(odb, true, false);
    212 		npf_conndb_destroy(odb);
    213 		pool_cache_invalidate(conn_cache);
    214 	}
    215 }
    216 
    217 /*
    218  * npf_conn_tracking: enable/disable connection tracking.
    219  */
    220 void
    221 npf_conn_tracking(bool track)
    222 {
    223 	KASSERT(npf_config_locked_p());
    224 	conn_tracking = track ? CONN_TRACKING_ON : CONN_TRACKING_OFF;
    225 }
    226 
    227 static inline bool
    228 npf_conn_trackable_p(const npf_cache_t *npc)
    229 {
    230 	/*
    231 	 * Check if connection tracking is on.  Also, if layer 3 and 4 are
    232 	 * not cached - protocol is not supported or packet is invalid.
    233 	 */
    234 	if (conn_tracking != CONN_TRACKING_ON) {
    235 		return false;
    236 	}
    237 	if (!npf_iscached(npc, NPC_IP46) || !npf_iscached(npc, NPC_LAYER4)) {
    238 		return false;
    239 	}
    240 	return true;
    241 }
    242 
    243 static uint32_t
    244 connkey_setkey(npf_connkey_t *key, uint32_t proto, const void *ipv,
    245     uint16_t *id, size_t alen, bool forw)
    246 {
    247 	uint32_t isrc, idst;
    248 	const npf_addr_t * const *ips = ipv;
    249 	if (__predict_true(forw)) {
    250 		isrc = NPF_SRC, idst = NPF_DST;
    251 	} else {
    252 		isrc = NPF_DST, idst = NPF_SRC;
    253 	}
    254 
    255 	/*
    256 	 * Construct a key formed out of 32-bit integers.  The key layout:
    257 	 *
    258 	 * Field: | proto  |  alen  | src-id | dst-id | src-addr | dst-addr |
    259 	 *        +--------+--------+--------+--------+----------+----------+
    260 	 * Bits:  |   16   |   16   |   16   |   16   |  32-128  |  32-128  |
    261 	 *
    262 	 * The source and destination are inverted if they key is for the
    263 	 * backwards stream (forw == false).  The address length depends
    264 	 * on the 'alen' field; it is a length in bytes, either 4 or 16.
    265 	 */
    266 
    267 	key->ck_key[0] = ((uint32_t)proto << 16) | (alen & 0xffff);
    268 	key->ck_key[1] = ((uint32_t)id[isrc] << 16) | id[idst];
    269 
    270 	if (__predict_true(alen == sizeof(in_addr_t))) {
    271 		key->ck_key[2] = ips[isrc]->s6_addr32[0];
    272 		key->ck_key[3] = ips[idst]->s6_addr32[0];
    273 		return 4 * sizeof(uint32_t);
    274 	} else {
    275 		const u_int nwords = alen >> 2;
    276 		memcpy(&key->ck_key[2], ips[isrc], alen);
    277 		memcpy(&key->ck_key[2 + nwords], ips[idst], alen);
    278 		return (2 + (nwords * 2)) * sizeof(uint32_t);
    279 	}
    280 }
    281 
    282 /*
    283  * npf_conn_conkey: construct a key for the connection lookup.
    284  *
    285  * => Returns the key length in bytes or zero on failure.
    286  */
    287 unsigned
    288 npf_conn_conkey(const npf_cache_t *npc, npf_connkey_t *key, const bool forw)
    289 {
    290 	const u_int alen = npc->npc_alen;
    291 	const struct tcphdr *th;
    292 	const struct udphdr *uh;
    293 	uint16_t id[2];
    294 
    295 	switch (npc->npc_proto) {
    296 	case IPPROTO_TCP:
    297 		KASSERT(npf_iscached(npc, NPC_TCP));
    298 		th = npc->npc_l4.tcp;
    299 		id[NPF_SRC] = th->th_sport;
    300 		id[NPF_DST] = th->th_dport;
    301 		break;
    302 	case IPPROTO_UDP:
    303 		KASSERT(npf_iscached(npc, NPC_UDP));
    304 		uh = npc->npc_l4.udp;
    305 		id[NPF_SRC] = uh->uh_sport;
    306 		id[NPF_DST] = uh->uh_dport;
    307 		break;
    308 	case IPPROTO_ICMP:
    309 		if (npf_iscached(npc, NPC_ICMP_ID)) {
    310 			const struct icmp *ic = npc->npc_l4.icmp;
    311 			id[NPF_SRC] = ic->icmp_id;
    312 			id[NPF_DST] = ic->icmp_id;
    313 			break;
    314 		}
    315 		return 0;
    316 	case IPPROTO_ICMPV6:
    317 		if (npf_iscached(npc, NPC_ICMP_ID)) {
    318 			const struct icmp6_hdr *ic6 = npc->npc_l4.icmp6;
    319 			id[NPF_SRC] = ic6->icmp6_id;
    320 			id[NPF_DST] = ic6->icmp6_id;
    321 			break;
    322 		}
    323 		return 0;
    324 	default:
    325 		/* Unsupported protocol. */
    326 		return 0;
    327 	}
    328 
    329 	return connkey_setkey(key, npc->npc_proto, npc->npc_ips, id, alen,
    330 	    forw);
    331 }
    332 
    333 static __inline void
    334 connkey_set_addr(npf_connkey_t *key, const npf_addr_t *naddr, const int di)
    335 {
    336 	const u_int alen = key->ck_key[0] & 0xffff;
    337 	uint32_t *addr = &key->ck_key[2 + ((alen >> 2) * di)];
    338 
    339 	KASSERT(alen > 0);
    340 	memcpy(addr, naddr, alen);
    341 }
    342 
    343 static __inline void
    344 connkey_set_id(npf_connkey_t *key, const uint16_t id, const int di)
    345 {
    346 	const uint32_t oid = key->ck_key[1];
    347 	const u_int shift = 16 * !di;
    348 	const uint32_t mask = 0xffff0000 >> shift;
    349 
    350 	key->ck_key[1] = ((uint32_t)id << shift) | (oid & mask);
    351 }
    352 
    353 /*
    354  * npf_conn_ok: check if the connection is active, and has the right direction.
    355  */
    356 static bool
    357 npf_conn_ok(npf_conn_t *con, const int di, bool forw)
    358 {
    359 	uint32_t flags = con->c_flags;
    360 
    361 	/* Check if connection is active and not expired. */
    362 	bool ok = (flags & (CONN_ACTIVE | CONN_EXPIRE)) == CONN_ACTIVE;
    363 	if (__predict_false(!ok)) {
    364 		return false;
    365 	}
    366 
    367 	/* Check if the direction is consistent */
    368 	bool pforw = (flags & PFIL_ALL) == di;
    369 	if (__predict_false(forw != pforw)) {
    370 		return false;
    371 	}
    372 	return true;
    373 }
    374 
    375 /*
    376  * npf_conn_lookup: lookup if there is an established connection.
    377  *
    378  * => If found, we will hold a reference for the caller.
    379  */
    380 npf_conn_t *
    381 npf_conn_lookup(const npf_cache_t *npc, const int di, bool *forw)
    382 {
    383 	const nbuf_t *nbuf = npc->npc_nbuf;
    384 	npf_conn_t *con;
    385 	npf_connkey_t key;
    386 	u_int cifid;
    387 
    388 	/* Construct a key and lookup for a connection in the store. */
    389 	if (!npf_conn_conkey(npc, &key, true)) {
    390 		return NULL;
    391 	}
    392 	con = npf_conndb_lookup(conn_db, &key, forw);
    393 	if (con == NULL) {
    394 		return NULL;
    395 	}
    396 	KASSERT(npc->npc_proto == con->c_proto);
    397 
    398 	/* Check if connection is active and not expired. */
    399 	if (!npf_conn_ok(con, di, *forw)) {
    400 		atomic_dec_uint(&con->c_refcnt);
    401 		return NULL;
    402 	}
    403 
    404 	/*
    405 	 * Match the interface and the direction of the connection entry
    406 	 * and the packet.
    407 	 */
    408 	cifid = con->c_ifid;
    409 	if (__predict_false(cifid && cifid != nbuf->nb_ifid)) {
    410 		atomic_dec_uint(&con->c_refcnt);
    411 		return NULL;
    412 	}
    413 
    414 	/* Update the last activity time. */
    415 	getnanouptime(&con->c_atime);
    416 	return con;
    417 }
    418 
    419 /*
    420  * npf_conn_inspect: lookup a connection and inspecting the protocol data.
    421  *
    422  * => If found, we will hold a reference for the caller.
    423  */
    424 npf_conn_t *
    425 npf_conn_inspect(npf_cache_t *npc, const int di, int *error)
    426 {
    427 	nbuf_t *nbuf = npc->npc_nbuf;
    428 	npf_conn_t *con;
    429 	bool forw, ok;
    430 
    431 	KASSERT(!nbuf_flag_p(nbuf, NBUF_DATAREF_RESET));
    432 	if (!npf_conn_trackable_p(npc)) {
    433 		return NULL;
    434 	}
    435 
    436 	/* Query ALG which may lookup connection for us. */
    437 	if ((con = npf_alg_conn(npc, di)) != NULL) {
    438 		/* Note: reference is held. */
    439 		return con;
    440 	}
    441 	if (nbuf_head_mbuf(nbuf) == NULL) {
    442 		*error = ENOMEM;
    443 		return NULL;
    444 	}
    445 	KASSERT(!nbuf_flag_p(nbuf, NBUF_DATAREF_RESET));
    446 
    447 	/* Main lookup of the connection. */
    448 	if ((con = npf_conn_lookup(npc, di, &forw)) == NULL) {
    449 		return NULL;
    450 	}
    451 
    452 	/* Inspect the protocol data and handle state changes. */
    453 	mutex_enter(&con->c_lock);
    454 	ok = npf_state_inspect(npc, &con->c_state, forw);
    455 	mutex_exit(&con->c_lock);
    456 
    457 	/* If invalid state: let the rules deal with it. */
    458 	if (__predict_false(!ok)) {
    459 		npf_conn_release(con);
    460 		npf_stats_inc(NPF_STAT_INVALID_STATE);
    461 		return NULL;
    462 	}
    463 
    464 	/*
    465 	 * If this is multi-end state, then specially tag the packet
    466 	 * so it will be just passed-through on other interfaces.
    467 	 */
    468 	if (con->c_ifid == 0 && nbuf_add_tag(nbuf, NPF_NTAG_PASS) != 0) {
    469 		npf_conn_release(con);
    470 		*error = ENOMEM;
    471 		return NULL;
    472 	}
    473 	return con;
    474 }
    475 
    476 /*
    477  * npf_conn_establish: create a new connection, insert into the global list.
    478  *
    479  * => Connection is created with the reference held for the caller.
    480  * => Connection will be activated on the first reference release.
    481  */
    482 npf_conn_t *
    483 npf_conn_establish(npf_cache_t *npc, int di, bool per_if)
    484 {
    485 	const nbuf_t *nbuf = npc->npc_nbuf;
    486 	npf_conn_t *con;
    487 	int error = 0;
    488 
    489 	KASSERT(!nbuf_flag_p(nbuf, NBUF_DATAREF_RESET));
    490 
    491 	if (!npf_conn_trackable_p(npc)) {
    492 		return NULL;
    493 	}
    494 
    495 	/* Allocate and initialise the new connection. */
    496 	con = pool_cache_get(conn_cache, PR_NOWAIT);
    497 	if (__predict_false(!con)) {
    498 		return NULL;
    499 	}
    500 	NPF_PRINTF(("NPF: create conn %p\n", con));
    501 	npf_stats_inc(NPF_STAT_CONN_CREATE);
    502 
    503 	mutex_init(&con->c_lock, MUTEX_DEFAULT, IPL_SOFTNET);
    504 	con->c_flags = (di & PFIL_ALL);
    505 	con->c_refcnt = 0;
    506 	con->c_rproc = NULL;
    507 	con->c_nat = NULL;
    508 
    509 	/* Initialize the protocol state. */
    510 	if (!npf_state_init(npc, &con->c_state)) {
    511 		npf_conn_destroy(con);
    512 		return NULL;
    513 	}
    514 
    515 	KASSERT(npf_iscached(npc, NPC_IP46));
    516 	npf_connkey_t *fw = &con->c_forw_entry;
    517 	npf_connkey_t *bk = &con->c_back_entry;
    518 
    519 	/*
    520 	 * Construct "forwards" and "backwards" keys.  Also, set the
    521 	 * interface ID for this connection (unless it is global).
    522 	 */
    523 	if (!npf_conn_conkey(npc, fw, true) ||
    524 	    !npf_conn_conkey(npc, bk, false)) {
    525 		npf_conn_destroy(con);
    526 		return NULL;
    527 	}
    528 	fw->ck_backptr = bk->ck_backptr = con;
    529 	con->c_ifid = per_if ? nbuf->nb_ifid : 0;
    530 	con->c_proto = npc->npc_proto;
    531 
    532 	/*
    533 	 * Set last activity time for a new connection and acquire
    534 	 * a reference for the caller before we make it visible.
    535 	 */
    536 	getnanouptime(&con->c_atime);
    537 	con->c_refcnt = 1;
    538 
    539 	/*
    540 	 * Insert both keys (entries representing directions) of the
    541 	 * connection.  At this point it becomes visible, but we activate
    542 	 * the connection later.
    543 	 */
    544 	mutex_enter(&con->c_lock);
    545 	if (!npf_conndb_insert(conn_db, fw, con)) {
    546 		error = EISCONN;
    547 		goto err;
    548 	}
    549 	if (!npf_conndb_insert(conn_db, bk, con)) {
    550 		npf_conn_t *ret __diagused;
    551 		ret = npf_conndb_remove(conn_db, fw);
    552 		KASSERT(ret == con);
    553 		error = EISCONN;
    554 		goto err;
    555 	}
    556 err:
    557 	/*
    558 	 * If we have hit the duplicate: mark the connection as expired
    559 	 * and let the G/C thread to take care of it.  We cannot do it
    560 	 * here since there might be references acquired already.
    561 	 */
    562 	if (error) {
    563 		atomic_or_uint(&con->c_flags, CONN_REMOVED | CONN_EXPIRE);
    564 		atomic_dec_uint(&con->c_refcnt);
    565 		npf_stats_inc(NPF_STAT_RACE_CONN);
    566 	} else {
    567 		NPF_PRINTF(("NPF: establish conn %p\n", con));
    568 	}
    569 
    570 	/* Finally, insert into the connection list. */
    571 	npf_conndb_enqueue(conn_db, con);
    572 	mutex_exit(&con->c_lock);
    573 
    574 	return error ? NULL : con;
    575 }
    576 
    577 static void
    578 npf_conn_destroy(npf_conn_t *con)
    579 {
    580 	KASSERT(con->c_refcnt == 0);
    581 
    582 	if (con->c_nat) {
    583 		/* Release any NAT structures. */
    584 		npf_nat_destroy(con->c_nat);
    585 	}
    586 	if (con->c_rproc) {
    587 		/* Release the rule procedure. */
    588 		npf_rproc_release(con->c_rproc);
    589 	}
    590 
    591 	/* Destroy the state. */
    592 	npf_state_destroy(&con->c_state);
    593 	mutex_destroy(&con->c_lock);
    594 
    595 	/* Free the structure, increase the counter. */
    596 	pool_cache_put(conn_cache, con);
    597 	npf_stats_inc(NPF_STAT_CONN_DESTROY);
    598 	NPF_PRINTF(("NPF: conn %p destroyed\n", con));
    599 }
    600 
    601 /*
    602  * npf_conn_setnat: associate NAT entry with the connection, update and
    603  * re-insert connection entry using the translation values.
    604  *
    605  * => The caller must be holding a reference.
    606  */
    607 int
    608 npf_conn_setnat(const npf_cache_t *npc, npf_conn_t *con,
    609     npf_nat_t *nt, u_int ntype)
    610 {
    611 	static const u_int nat_type_dimap[] = {
    612 		[NPF_NATOUT] = NPF_DST,
    613 		[NPF_NATIN] = NPF_SRC,
    614 	};
    615 	npf_connkey_t key, *bk;
    616 	npf_conn_t *ret __diagused;
    617 	npf_addr_t *taddr;
    618 	in_port_t tport;
    619 	u_int tidx;
    620 
    621 	KASSERT(con->c_refcnt > 0);
    622 
    623 	npf_nat_gettrans(nt, &taddr, &tport);
    624 	KASSERT(ntype == NPF_NATOUT || ntype == NPF_NATIN);
    625 	tidx = nat_type_dimap[ntype];
    626 
    627 	/* Construct a "backwards" key. */
    628 	if (!npf_conn_conkey(npc, &key, false)) {
    629 		return EINVAL;
    630 	}
    631 
    632 	/* Acquire the lock and check for the races. */
    633 	mutex_enter(&con->c_lock);
    634 	if (__predict_false(con->c_flags & CONN_EXPIRE)) {
    635 		/* The connection got expired. */
    636 		mutex_exit(&con->c_lock);
    637 		return EINVAL;
    638 	}
    639 	KASSERT((con->c_flags & CONN_REMOVED) == 0);
    640 
    641 	if (__predict_false(con->c_nat != NULL)) {
    642 		/* Race with a duplicate packet. */
    643 		mutex_exit(&con->c_lock);
    644 		npf_stats_inc(NPF_STAT_RACE_NAT);
    645 		return EISCONN;
    646 	}
    647 
    648 	/* Remove the "backwards" entry. */
    649 	ret = npf_conndb_remove(conn_db, &con->c_back_entry);
    650 	KASSERT(ret == con);
    651 
    652 	/* Set the source/destination IDs to the translation values. */
    653 	bk = &con->c_back_entry;
    654 	connkey_set_addr(bk, taddr, tidx);
    655 	if (tport) {
    656 		connkey_set_id(bk, tport, tidx);
    657 	}
    658 
    659 	/* Finally, re-insert the "backwards" entry. */
    660 	if (!npf_conndb_insert(conn_db, bk, con)) {
    661 		/*
    662 		 * Race: we have hit the duplicate, remove the "forwards"
    663 		 * entry and expire our connection; it is no longer valid.
    664 		 */
    665 		ret = npf_conndb_remove(conn_db, &con->c_forw_entry);
    666 		KASSERT(ret == con);
    667 
    668 		atomic_or_uint(&con->c_flags, CONN_REMOVED | CONN_EXPIRE);
    669 		mutex_exit(&con->c_lock);
    670 
    671 		npf_stats_inc(NPF_STAT_RACE_NAT);
    672 		return EISCONN;
    673 	}
    674 
    675 	/* Associate the NAT entry and release the lock. */
    676 	con->c_nat = nt;
    677 	mutex_exit(&con->c_lock);
    678 	return 0;
    679 }
    680 
    681 /*
    682  * npf_conn_expire: explicitly mark connection as expired.
    683  */
    684 void
    685 npf_conn_expire(npf_conn_t *con)
    686 {
    687 	/* KASSERT(con->c_refcnt > 0); XXX: npf_nat_freepolicy() */
    688 	atomic_or_uint(&con->c_flags, CONN_EXPIRE);
    689 }
    690 
    691 /*
    692  * npf_conn_pass: return true if connection is "pass" one, otherwise false.
    693  */
    694 bool
    695 npf_conn_pass(const npf_conn_t *con, npf_rproc_t **rp)
    696 {
    697 	KASSERT(con->c_refcnt > 0);
    698 	if (__predict_true(con->c_flags & CONN_PASS)) {
    699 		*rp = con->c_rproc;
    700 		return true;
    701 	}
    702 	return false;
    703 }
    704 
    705 /*
    706  * npf_conn_setpass: mark connection as a "pass" one and associate the
    707  * rule procedure with it.
    708  */
    709 void
    710 npf_conn_setpass(npf_conn_t *con, npf_rproc_t *rp)
    711 {
    712 	KASSERT((con->c_flags & CONN_ACTIVE) == 0);
    713 	KASSERT(con->c_refcnt > 0);
    714 	KASSERT(con->c_rproc == NULL);
    715 
    716 	/*
    717 	 * No need for atomic since the connection is not yet active.
    718 	 * If rproc is set, the caller transfers its reference to us,
    719 	 * which will be released on npf_conn_destroy().
    720 	 */
    721 	atomic_or_uint(&con->c_flags, CONN_PASS);
    722 	con->c_rproc = rp;
    723 }
    724 
    725 /*
    726  * npf_conn_release: release a reference, which might allow G/C thread
    727  * to destroy this connection.
    728  */
    729 void
    730 npf_conn_release(npf_conn_t *con)
    731 {
    732 	if ((con->c_flags & (CONN_ACTIVE | CONN_EXPIRE)) == 0) {
    733 		/* Activate: after this, connection is globally visible. */
    734 		atomic_or_uint(&con->c_flags, CONN_ACTIVE);
    735 	}
    736 	KASSERT(con->c_refcnt > 0);
    737 	atomic_dec_uint(&con->c_refcnt);
    738 }
    739 
    740 /*
    741  * npf_conn_getnat: return associated NAT data entry and indicate
    742  * whether it is a "forwards" or "backwards" stream.
    743  */
    744 npf_nat_t *
    745 npf_conn_getnat(npf_conn_t *con, const int di, bool *forw)
    746 {
    747 	KASSERT(con->c_refcnt > 0);
    748 	*forw = (con->c_flags & PFIL_ALL) == di;
    749 	return con->c_nat;
    750 }
    751 
    752 /*
    753  * npf_conn_expired: criterion to check if connection is expired.
    754  */
    755 static inline bool
    756 npf_conn_expired(const npf_conn_t *con, const struct timespec *tsnow)
    757 {
    758 	const int etime = npf_state_etime(&con->c_state, con->c_proto);
    759 	struct timespec tsdiff;
    760 
    761 	if (__predict_false(con->c_flags & CONN_EXPIRE)) {
    762 		/* Explicitly marked to be expired. */
    763 		return true;
    764 	}
    765 	timespecsub(tsnow, &con->c_atime, &tsdiff);
    766 	return tsdiff.tv_sec > etime;
    767 }
    768 
    769 /*
    770  * npf_conn_gc: garbage collect the expired connections.
    771  *
    772  * => Must run in a single-threaded manner.
    773  * => If it is a flush request, then destroy all connections.
    774  * => If 'sync' is true, then perform passive serialisation.
    775  */
    776 void
    777 npf_conn_gc(npf_conndb_t *cd, bool flush, bool sync)
    778 {
    779 	npf_conn_t *con, *prev, *gclist = NULL;
    780 	struct timespec tsnow;
    781 
    782 	getnanouptime(&tsnow);
    783 
    784 	/*
    785 	 * Scan all connections and check them for expiration.
    786 	 */
    787 	prev = NULL;
    788 	con = npf_conndb_getlist(cd);
    789 	while (con) {
    790 		npf_conn_t *next = con->c_next;
    791 
    792 		/* Expired?  Flushing all? */
    793 		if (!npf_conn_expired(con, &tsnow) && !flush) {
    794 			prev = con;
    795 			con = next;
    796 			continue;
    797 		}
    798 
    799 		/* Remove both entries of the connection. */
    800 		mutex_enter(&con->c_lock);
    801 		if ((con->c_flags & CONN_REMOVED) == 0) {
    802 			npf_conn_t *ret __diagused;
    803 
    804 			ret = npf_conndb_remove(cd, &con->c_forw_entry);
    805 			KASSERT(ret == con);
    806 			ret = npf_conndb_remove(cd, &con->c_back_entry);
    807 			KASSERT(ret == con);
    808 		}
    809 
    810 		/* Flag the removal and expiration. */
    811 		atomic_or_uint(&con->c_flags, CONN_REMOVED | CONN_EXPIRE);
    812 		mutex_exit(&con->c_lock);
    813 
    814 		/* Move to the G/C list. */
    815 		npf_conndb_dequeue(cd, con, prev);
    816 		con->c_next = gclist;
    817 		gclist = con;
    818 
    819 		/* Next.. */
    820 		con = next;
    821 	}
    822 	npf_conndb_settail(cd, prev);
    823 
    824 	/*
    825 	 * Ensure it is safe to destroy the connections.
    826 	 * Note: drop the conn_lock (see the lock order).
    827 	 */
    828 	if (sync) {
    829 		mutex_exit(&conn_lock);
    830 		if (gclist) {
    831 			npf_config_enter();
    832 			npf_config_sync();
    833 			npf_config_exit();
    834 		}
    835 	}
    836 
    837 	/*
    838 	 * Garbage collect all expired connections.
    839 	 * May need to wait for the references to drain.
    840 	 */
    841 	con = gclist;
    842 	while (con) {
    843 		npf_conn_t *next = con->c_next;
    844 
    845 		/*
    846 		 * Destroy only if removed and no references.
    847 		 * Otherwise, wait for a tiny moment.
    848 		 */
    849 		if (__predict_false(con->c_refcnt)) {
    850 			kpause("npfcongc", false, 1, NULL);
    851 			continue;
    852 		}
    853 		npf_conn_destroy(con);
    854 		con = next;
    855 	}
    856 }
    857 
    858 /*
    859  * npf_conn_worker: G/C to run from a worker thread.
    860  */
    861 static void
    862 npf_conn_worker(void)
    863 {
    864 	mutex_enter(&conn_lock);
    865 	/* Note: the conn_lock will be released (sync == true). */
    866 	npf_conn_gc(conn_db, false, true);
    867 }
    868 
    869 /*
    870  * npf_conndb_export: construct a list of connections prepared for saving.
    871  * Note: this is expected to be an expensive operation.
    872  */
    873 int
    874 npf_conndb_export(prop_array_t conlist)
    875 {
    876 	npf_conn_t *con, *prev;
    877 
    878 	/*
    879 	 * Note: acquire conn_lock to prevent from the database
    880 	 * destruction and G/C thread.
    881 	 */
    882 	mutex_enter(&conn_lock);
    883 	if (conn_tracking != CONN_TRACKING_ON) {
    884 		mutex_exit(&conn_lock);
    885 		return 0;
    886 	}
    887 	prev = NULL;
    888 	con = npf_conndb_getlist(conn_db);
    889 	while (con) {
    890 		npf_conn_t *next = con->c_next;
    891 		prop_dictionary_t cdict;
    892 
    893 		if ((cdict = npf_conn_export(con)) != NULL) {
    894 			prop_array_add(conlist, cdict);
    895 			prop_object_release(cdict);
    896 		}
    897 		prev = con;
    898 		con = next;
    899 	}
    900 	npf_conndb_settail(conn_db, prev);
    901 	mutex_exit(&conn_lock);
    902 	return 0;
    903 }
    904 
    905 /*
    906  * npf_conn_export: serialise a single connection.
    907  */
    908 prop_dictionary_t
    909 npf_conn_export(const npf_conn_t *con)
    910 {
    911 	prop_dictionary_t cdict;
    912 	prop_data_t d;
    913 
    914 	if ((con->c_flags & (CONN_ACTIVE|CONN_EXPIRE)) != CONN_ACTIVE) {
    915 		return NULL;
    916 	}
    917 	cdict = prop_dictionary_create();
    918 	prop_dictionary_set_uint32(cdict, "flags", con->c_flags);
    919 	prop_dictionary_set_uint32(cdict, "proto", con->c_proto);
    920 	if (con->c_ifid) {
    921 		const char *ifname = npf_ifmap_getname(con->c_ifid);
    922 		prop_dictionary_set_cstring(cdict, "ifname", ifname);
    923 	}
    924 
    925 	d = prop_data_create_data(&con->c_state, sizeof(npf_state_t));
    926 	prop_dictionary_set_and_rel(cdict, "state", d);
    927 
    928 	const uint32_t *fkey = con->c_forw_entry.ck_key;
    929 	d = prop_data_create_data(fkey, NPF_CONN_MAXKEYLEN);
    930 	prop_dictionary_set_and_rel(cdict, "forw-key", d);
    931 
    932 	const uint32_t *bkey = con->c_back_entry.ck_key;
    933 	d = prop_data_create_data(bkey, NPF_CONN_MAXKEYLEN);
    934 	prop_dictionary_set_and_rel(cdict, "back-key", d);
    935 
    936 	if (con->c_nat) {
    937 		npf_nat_export(cdict, con->c_nat);
    938 	}
    939 	return cdict;
    940 }
    941 
    942 static uint32_t
    943 npf_connkey_import(prop_dictionary_t idict, npf_connkey_t *key, uint16_t *dir)
    944 {
    945 	uint16_t proto;
    946 	prop_object_t sobj, dobj;
    947 	uint16_t id[2];
    948 	npf_addr_t const * ips[2];
    949 
    950 	prop_dictionary_get_uint16(idict, "proto", &proto);
    951 	prop_dictionary_get_uint16(idict, "direction", dir);
    952 
    953 	prop_dictionary_get_uint16(idict, "sport", &id[NPF_SRC]);
    954 	prop_dictionary_get_uint16(idict, "dport", &id[NPF_DST]);
    955 
    956 	sobj = prop_dictionary_get(idict, "saddr");
    957 	if ((ips[NPF_SRC] = prop_data_data_nocopy(sobj)) == NULL)
    958 		return 0;
    959 
    960 	dobj = prop_dictionary_get(idict, "daddr");
    961 	if ((ips[NPF_DST] = prop_data_data_nocopy(dobj)) == NULL)
    962 		return 0;
    963 
    964 	size_t alen = prop_data_size(sobj);
    965 	if (alen != prop_data_size(dobj))
    966 		return 0;
    967 	*(const int *)ips[NPF_SRC], id[NPF_SRC],
    968 	*(const int *)ips[NPF_DST], id[NPF_DST], alen, proto, *dir);
    969 
    970 	return connkey_setkey(key, proto, ips, id, alen, true);
    971 }
    972 
    973 int
    974 npf_conn_find(prop_dictionary_t idict, prop_dictionary_t *odict)
    975 {
    976 	npf_connkey_t key;
    977 	npf_conn_t *con;
    978 	uint16_t dir;
    979 	bool forw;
    980 
    981 	if (!npf_connkey_import(idict, &key, &dir)) {
    982 		return EINVAL;
    983 	}
    984 
    985 	con = npf_conndb_lookup(conn_db, &key, &forw);
    986 	if (con == NULL) {
    987 		return ESRCH;
    988 	}
    989 
    990 	dir = dir == PFIL_IN ? PFIL_OUT : PFIL_IN;
    991 	if (!npf_conn_ok(con, dir, true)) {
    992 		atomic_dec_uint(&con->c_refcnt);
    993 		return ESRCH;
    994 	}
    995 
    996 	*odict = npf_conn_export(con);
    997 	if (*odict == NULL) {
    998 		atomic_dec_uint(&con->c_refcnt);
    999 		return ENOSPC;
   1000 	}
   1001 	atomic_dec_uint(&con->c_refcnt);
   1002 
   1003 	return 0;
   1004 }
   1005 
   1006 /*
   1007  * npf_conn_import: fully reconstruct a single connection from a
   1008  * directory and insert into the given database.
   1009  */
   1010 int
   1011 npf_conn_import(npf_conndb_t *cd, prop_dictionary_t cdict,
   1012     npf_ruleset_t *natlist)
   1013 {
   1014 	npf_conn_t *con;
   1015 	npf_connkey_t *fw, *bk;
   1016 	prop_object_t obj;
   1017 	const char *ifname;
   1018 	const void *d;
   1019 
   1020 	/* Allocate a connection and initialise it (clear first). */
   1021 	con = pool_cache_get(conn_cache, PR_WAITOK);
   1022 	memset(con, 0, sizeof(npf_conn_t));
   1023 	mutex_init(&con->c_lock, MUTEX_DEFAULT, IPL_SOFTNET);
   1024 	npf_stats_inc(NPF_STAT_CONN_CREATE);
   1025 
   1026 	prop_dictionary_get_uint32(cdict, "proto", &con->c_proto);
   1027 	prop_dictionary_get_uint32(cdict, "flags", &con->c_flags);
   1028 	con->c_flags &= PFIL_ALL | CONN_ACTIVE | CONN_PASS;
   1029 	getnanouptime(&con->c_atime);
   1030 
   1031 	if (prop_dictionary_get_cstring_nocopy(cdict, "ifname", &ifname) &&
   1032 	    (con->c_ifid = npf_ifmap_register(ifname)) == 0) {
   1033 		goto err;
   1034 	}
   1035 
   1036 	obj = prop_dictionary_get(cdict, "state");
   1037 	if ((d = prop_data_data_nocopy(obj)) == NULL ||
   1038 	    prop_data_size(obj) != sizeof(npf_state_t)) {
   1039 		goto err;
   1040 	}
   1041 	memcpy(&con->c_state, d, sizeof(npf_state_t));
   1042 
   1043 	/* Reconstruct NAT association, if any. */
   1044 	if ((obj = prop_dictionary_get(cdict, "nat")) != NULL &&
   1045 	    (con->c_nat = npf_nat_import(obj, natlist, con)) == NULL) {
   1046 		goto err;
   1047 	}
   1048 
   1049 	/*
   1050 	 * Fetch and copy the keys for each direction.
   1051 	 */
   1052 	obj = prop_dictionary_get(cdict, "forw-key");
   1053 	if ((d = prop_data_data_nocopy(obj)) == NULL ||
   1054 	    prop_data_size(obj) != NPF_CONN_MAXKEYLEN) {
   1055 		goto err;
   1056 	}
   1057 	fw = &con->c_forw_entry;
   1058 	memcpy(&fw->ck_key, d, NPF_CONN_MAXKEYLEN);
   1059 
   1060 	obj = prop_dictionary_get(cdict, "back-key");
   1061 	if ((d = prop_data_data_nocopy(obj)) == NULL ||
   1062 	    prop_data_size(obj) != NPF_CONN_MAXKEYLEN) {
   1063 		goto err;
   1064 	}
   1065 	bk = &con->c_back_entry;
   1066 	memcpy(&bk->ck_key, d, NPF_CONN_MAXKEYLEN);
   1067 
   1068 	fw->ck_backptr = bk->ck_backptr = con;
   1069 
   1070 	/* Insert the entries and the connection itself. */
   1071 	if (!npf_conndb_insert(cd, fw, con)) {
   1072 		goto err;
   1073 	}
   1074 	if (!npf_conndb_insert(cd, bk, con)) {
   1075 		npf_conndb_remove(cd, fw);
   1076 		goto err;
   1077 	}
   1078 
   1079 	NPF_PRINTF(("NPF: imported conn %p\n", con));
   1080 	npf_conndb_enqueue(cd, con);
   1081 	return 0;
   1082 err:
   1083 	npf_conn_destroy(con);
   1084 	return EINVAL;
   1085 }
   1086 
#if defined(DDB) || defined(_NPF_TESTING)

/*
 * npf_conn_print: dump a connection using printf().
 *
 * => Prints the protocol, flags and timing values, then both keys,
 *    the protocol state and the NAT entry (if there is one).
 */
void
npf_conn_print(const npf_conn_t *con)
{
	const u_int alen = NPF_CONN_GETALEN(&con->c_forw_entry);
	const u_int nwords = alen >> 2;
	const uint32_t *fk = con->c_forw_entry.ck_key;
	const uint32_t *bk = con->c_back_entry.ck_key;
	const u_int proto = con->c_proto;
	struct timespec now, elapsed;
	const void *src, *dst;
	int etime;

	getnanouptime(&now);
	timespecsub(&now, &con->c_atime, &elapsed);
	etime = npf_state_etime(&con->c_state, proto);

	printf("%p:\n\tproto %d flags 0x%x tsdiff %d etime %d\n",
	    con, proto, con->c_flags, (int)elapsed.tv_sec, etime);

	/*
	 * As read here: word 1 of a key carries the two 16-bit IDs and
	 * the addresses start at word 2, the source one coming first.
	 */
	src = &fk[2];
	dst = &fk[2 + nwords];
	printf("\tforw %s:%d", npf_addr_dump(src, alen), ntohs(fk[1] >> 16));
	printf("-> %s:%d\n", npf_addr_dump(dst, alen), ntohs(fk[1] & 0xffff));

	src = &bk[2];
	dst = &bk[2 + nwords];
	printf("\tback %s:%d", npf_addr_dump(src, alen), ntohs(bk[1] >> 16));
	printf("-> %s:%d\n", npf_addr_dump(dst, alen), ntohs(bk[1] & 0xffff));

	npf_state_dump(&con->c_state);
	if (con->c_nat) {
		npf_nat_dump(con->c_nat);
	}
}

#endif
   1122