Home | History | Annotate | Line # | Download | only in npf
npf_conn.c revision 1.28
      1 /*-
      2  * Copyright (c) 2014-2018 Mindaugas Rasiukevicius <rmind at netbsd org>
      3  * Copyright (c) 2010-2014 The NetBSD Foundation, Inc.
      4  * All rights reserved.
      5  *
      6  * This material is based upon work partially supported by The
      7  * NetBSD Foundation under a contract with Mindaugas Rasiukevicius.
      8  *
      9  * Redistribution and use in source and binary forms, with or without
     10  * modification, are permitted provided that the following conditions
     11  * are met:
     12  * 1. Redistributions of source code must retain the above copyright
     13  *    notice, this list of conditions and the following disclaimer.
     14  * 2. Redistributions in binary form must reproduce the above copyright
     15  *    notice, this list of conditions and the following disclaimer in the
     16  *    documentation and/or other materials provided with the distribution.
     17  *
     18  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
     19  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
     20  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     21  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
     22  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
     23  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
     24  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
     25  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
     26  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
     27  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
     28  * POSSIBILITY OF SUCH DAMAGE.
     29  */
     30 
     31 /*
     32  * NPF connection tracking for stateful filtering and translation.
     33  *
     34  * Overview
     35  *
     36  *	Packets can be incoming or outgoing with respect to an interface.
     37  *	Connection direction is identified by the direction of its first
     38  *	packet.  The meaning of incoming/outgoing packet in the context of
     39  *	connection direction can be confusing.  Therefore, we will use the
     40  *	terms "forwards stream" and "backwards stream", where packets in
     41  *	the forwards stream are the packets travelling in the same
     42  *	direction as the connection direction.
     43  *
     44  *	All connections have two keys and thus two entries:
     45  *
     46  *	- npf_conn_getforwkey(con)        -- for the forwards stream;
     47  *	- npf_conn_getbackkey(con, alen)  -- for the backwards stream.
     48  *
     49  *	Note: the keys are stored in npf_conn_t::c_keys[], which is used
     50  *	to allocate variable-length npf_conn_t structures based on whether
     51  *	the IPv4 or IPv6 addresses are used.  See the npf_connkey.c source
     52  *	file for the description of the key layouts.
     53  *
     54  *	The keys are formed from the 5-tuple (source/destination address,
     55  *	source/destination port and the protocol).  Additional matching
     56  *	is performed for the interface (a common behaviour is equivalent
     57  *	to the 6-tuple lookup including the interface ID).  Note that the
     58  *	key may be formed using translated values in a case of NAT.
     59  *
     60  *	Connections can serve two purposes: for the implicit passing or
     61  *	to accommodate the dynamic NAT.  Connections for the former purpose
     62  *	are created by the rules with "stateful" attribute and are used for
     63  *	stateful filtering.  Such connections indicate that the packet of
     64  *	the backwards stream should be passed without inspection of the
     65  *	ruleset.  The other purpose is to associate a dynamic NAT mechanism
     66  *	with a connection.  Such connections are created by the NAT policies
     67  *	and they have a relationship with NAT translation structure via
     68  *	npf_conn_t::c_nat.  A single connection can serve both purposes,
     69  *	which is a common case.
     70  *
     71  * Connection life-cycle
     72  *
     73  *	Connections are established when a packet matches said rule or
     74  *	NAT policy.  Both keys of the established connection are inserted
     75  *	into the connection database.  A garbage collection thread
     76  *	periodically scans all connections and depending on connection
     77  *	properties (e.g. last activity time, protocol) removes connection
     78  *	entries and expires the actual connections.
     79  *
     80  *	Each connection has a reference count.  The reference is acquired
     81  *	on lookup and should be released by the caller.  It guarantees that
     82  *	the connection will not be destroyed, although it may be expired.
     83  *
     84  * Synchronisation
     85  *
     86  *	Connection database is accessed in a lock-less manner by the main
     87  *	routines: npf_conn_inspect() and npf_conn_establish().  Since they
     88  *	are always called from a software interrupt, the database is
     89  *	protected using passive serialisation.  The main place which can
     90  *	destroy a connection is npf_conn_worker().  The database itself
     91  *	can be replaced and destroyed in npf_conn_reload().
     92  *
     93  * ALG support
     94  *
     95  *	Application-level gateways (ALGs) can override generic connection
     96  *	inspection (npf_alg_conn() call in npf_conn_inspect() function) by
     97  *	performing their own lookup using different key.  Recursive call
     98  *	to npf_conn_inspect() is not allowed.  The ALGs ought to use the
     99  *	npf_conn_lookup() function for this purpose.
    100  *
    101  * Lock order
    102  *
    103  *	npf_config_lock ->
    104  *		conn_lock ->
    105  *			npf_conn_t::c_lock
    106  */
    107 
    108 #ifdef _KERNEL
    109 #include <sys/cdefs.h>
    110 __KERNEL_RCSID(0, "$NetBSD: npf_conn.c,v 1.28 2019/08/06 10:25:13 christos Exp $");
    111 
    112 #include <sys/param.h>
    113 #include <sys/types.h>
    114 
    115 #include <netinet/in.h>
    116 #include <netinet/tcp.h>
    117 
    118 #include <sys/atomic.h>
    119 #include <sys/kmem.h>
    120 #include <sys/mutex.h>
    121 #include <net/pfil.h>
    122 #include <sys/pool.h>
    123 #include <sys/queue.h>
    124 #include <sys/systm.h>
    125 #endif
    126 
    127 #define __NPF_CONN_PRIVATE
    128 #include "npf_conn.h"
    129 #include "npf_impl.h"
    130 
/*
 * A helper to select the IPv4 or IPv6 connection cache.
 *
 * Evaluates to bit 4 of the address length, i.e. 0 for a 4-byte address
 * and 1 for a 16-byte address.  The result is used as the index into
 * npf_t::conn_cache[], where entry 0 is sized for IPv4 keys and entry 1
 * for IPv6 keys.
 */
#define	NPF_CONNCACHE(alen)	(((alen) >> 4) & 0x1)

/*
 * Connection flags: PFIL_IN and PFIL_OUT values are reserved for direction.
 *
 * The compile-time assertion below checks that PFIL_ALL covers exactly
 * the two lowest bits, so that the CONN_* flags starting at 0x004 cannot
 * overlap with the direction bits kept in npf_conn_t::c_flags.
 */
CTASSERT(PFIL_ALL == (0x001 | 0x002));
#define	CONN_ACTIVE	0x004	/* visible on inspection */
#define	CONN_PASS	0x008	/* perform implicit passing */
#define	CONN_EXPIRE	0x010	/* explicitly expire */
#define	CONN_REMOVED	0x020	/* "forw/back" entries removed */

/* Values of npf_t::conn_tracking. */
enum { CONN_TRACKING_OFF, CONN_TRACKING_ON };

/* Forward declarations of the file-local helpers defined further below. */
static nvlist_t *npf_conn_export(npf_t *, npf_conn_t *);
static void npf_conn_destroy_idx(npf_t *, npf_conn_t *, unsigned);
    147 
    148 /*
    149  * npf_conn_sys{init,fini}: initialise/destroy connection tracking.
    150  */
    151 
    152 void
    153 npf_conn_init(npf_t *npf, int flags)
    154 {
    155 	npf->conn_cache[0] = pool_cache_init(
    156 	    offsetof(npf_conn_t, c_keys[NPF_CONNKEY_V4WORDS * 2]),
    157 	    0, 0, 0, "npfcn4pl", NULL, IPL_NET, NULL, NULL, NULL);
    158 	npf->conn_cache[1] = pool_cache_init(
    159 	    offsetof(npf_conn_t, c_keys[NPF_CONNKEY_V6WORDS * 2]),
    160 	    0, 0, 0, "npfcn6pl", NULL, IPL_NET, NULL, NULL, NULL);
    161 
    162 	mutex_init(&npf->conn_lock, MUTEX_DEFAULT, IPL_NONE);
    163 	npf->conn_tracking = CONN_TRACKING_OFF;
    164 	npf->conn_db = npf_conndb_create();
    165 
    166 	if ((flags & NPF_NO_GC) == 0) {
    167 		npf_worker_register(npf, npf_conn_worker);
    168 	}
    169 	npf_conndb_sysinit(npf);
    170 }
    171 
    172 void
    173 npf_conn_fini(npf_t *npf)
    174 {
    175 	npf_conndb_sysfini(npf);
    176 
    177 	/* Note: the caller should have flushed the connections. */
    178 	KASSERT(npf->conn_tracking == CONN_TRACKING_OFF);
    179 	npf_worker_unregister(npf, npf_conn_worker);
    180 
    181 	npf_conndb_destroy(npf->conn_db);
    182 	pool_cache_destroy(npf->conn_cache[0]);
    183 	pool_cache_destroy(npf->conn_cache[1]);
    184 	mutex_destroy(&npf->conn_lock);
    185 }
    186 
    187 /*
    188  * npf_conn_load: perform the load by flushing the current connection
    189  * database and replacing it with the new one or just destroying.
    190  *
    191  * => The caller must disable the connection tracking and ensure that
    192  *    there are no connection database lookups or references in-flight.
    193  */
    194 void
    195 npf_conn_load(npf_t *npf, npf_conndb_t *ndb, bool track)
    196 {
    197 	npf_conndb_t *odb = NULL;
    198 
    199 	KASSERT(npf_config_locked_p(npf));
    200 
    201 	/*
    202 	 * The connection database is in the quiescent state.
    203 	 * Prevent G/C thread from running and install a new database.
    204 	 */
    205 	mutex_enter(&npf->conn_lock);
    206 	if (ndb) {
    207 		KASSERT(npf->conn_tracking == CONN_TRACKING_OFF);
    208 		odb = npf->conn_db;
    209 		npf->conn_db = ndb;
    210 		membar_sync();
    211 	}
    212 	if (track) {
    213 		/* After this point lookups start flying in. */
    214 		npf->conn_tracking = CONN_TRACKING_ON;
    215 	}
    216 	mutex_exit(&npf->conn_lock);
    217 
    218 	if (odb) {
    219 		/*
    220 		 * Flush all, no sync since the caller did it for us.
    221 		 * Also, release the pool cache memory.
    222 		 */
    223 		npf_conndb_gc(npf, odb, true, false);
    224 		npf_conndb_destroy(odb);
    225 		pool_cache_invalidate(npf->conn_cache[0]);
    226 		pool_cache_invalidate(npf->conn_cache[1]);
    227 	}
    228 }
    229 
    230 /*
    231  * npf_conn_tracking: enable/disable connection tracking.
    232  */
    233 void
    234 npf_conn_tracking(npf_t *npf, bool track)
    235 {
    236 	KASSERT(npf_config_locked_p(npf));
    237 	npf->conn_tracking = track ? CONN_TRACKING_ON : CONN_TRACKING_OFF;
    238 }
    239 
    240 static inline bool
    241 npf_conn_trackable_p(const npf_cache_t *npc)
    242 {
    243 	const npf_t *npf = npc->npc_ctx;
    244 
    245 	/*
    246 	 * Check if connection tracking is on.  Also, if layer 3 and 4 are
    247 	 * not cached - protocol is not supported or packet is invalid.
    248 	 */
    249 	if (npf->conn_tracking != CONN_TRACKING_ON) {
    250 		return false;
    251 	}
    252 	if (!npf_iscached(npc, NPC_IP46) || !npf_iscached(npc, NPC_LAYER4)) {
    253 		return false;
    254 	}
    255 	return true;
    256 }
    257 
    258 static inline void
    259 conn_update_atime(npf_conn_t *con)
    260 {
    261 	struct timespec tsnow;
    262 
    263 	getnanouptime(&tsnow);
    264 	con->c_atime = tsnow.tv_sec;
    265 }
    266 
    267 /*
    268  * npf_conn_check: check that:
    269  *
    270  *	- the connection is active;
    271  *
    272  *	- the packet is travelling in the right direction with the respect
    273  *	  to the connection direction (if interface-id is not zero);
    274  *
    275  *	- the packet is travelling on the same interface as the
    276  *	  connection interface (if interface-id is not zero).
    277  */
    278 static bool
    279 npf_conn_check(const npf_conn_t *con, const nbuf_t *nbuf,
    280     const unsigned di, const bool forw)
    281 {
    282 	const uint32_t flags = con->c_flags;
    283 	const unsigned ifid = con->c_ifid;
    284 	bool active, pforw;
    285 
    286 	active = (flags & (CONN_ACTIVE | CONN_EXPIRE)) == CONN_ACTIVE;
    287 	if (__predict_false(!active)) {
    288 		return false;
    289 	}
    290 	if (ifid && nbuf) {
    291 		pforw = (flags & PFIL_ALL) == (unsigned)di;
    292 		if (__predict_false(forw != pforw)) {
    293 			return false;
    294 		}
    295 		if (__predict_false(ifid != nbuf->nb_ifid)) {
    296 			return false;
    297 		}
    298 	}
    299 	return true;
    300 }
    301 
    302 /*
    303  * npf_conn_lookup: lookup if there is an established connection.
    304  *
    305  * => If found, we will hold a reference for the caller.
    306  */
    307 npf_conn_t *
    308 npf_conn_lookup(const npf_cache_t *npc, const int di, bool *forw)
    309 {
    310 	npf_t *npf = npc->npc_ctx;
    311 	const nbuf_t *nbuf = npc->npc_nbuf;
    312 	npf_conn_t *con;
    313 	npf_connkey_t key;
    314 
    315 	/* Construct a key and lookup for a connection in the store. */
    316 	if (!npf_conn_conkey(npc, &key, true)) {
    317 		return NULL;
    318 	}
    319 	con = npf_conndb_lookup(npf->conn_db, &key, forw);
    320 	if (con == NULL) {
    321 		return NULL;
    322 	}
    323 	KASSERT(npc->npc_proto == con->c_proto);
    324 
    325 	/* Extra checks for the connection and packet. */
    326 	if (!npf_conn_check(con, nbuf, di, *forw)) {
    327 		atomic_dec_uint(&con->c_refcnt);
    328 		return NULL;
    329 	}
    330 
    331 	/* Update the last activity time. */
    332 	conn_update_atime(con);
    333 	return con;
    334 }
    335 
    336 /*
    337  * npf_conn_inspect: lookup a connection and inspecting the protocol data.
    338  *
    339  * => If found, we will hold a reference for the caller.
    340  */
    341 npf_conn_t *
    342 npf_conn_inspect(npf_cache_t *npc, const int di, int *error)
    343 {
    344 	nbuf_t *nbuf = npc->npc_nbuf;
    345 	npf_conn_t *con;
    346 	bool forw, ok;
    347 
    348 	KASSERT(!nbuf_flag_p(nbuf, NBUF_DATAREF_RESET));
    349 	if (!npf_conn_trackable_p(npc)) {
    350 		return NULL;
    351 	}
    352 
    353 	/* Query ALG which may lookup connection for us. */
    354 	if ((con = npf_alg_conn(npc, di)) != NULL) {
    355 		/* Note: reference is held. */
    356 		return con;
    357 	}
    358 	if (nbuf_head_mbuf(nbuf) == NULL) {
    359 		*error = ENOMEM;
    360 		return NULL;
    361 	}
    362 	KASSERT(!nbuf_flag_p(nbuf, NBUF_DATAREF_RESET));
    363 
    364 	/* Main lookup of the connection. */
    365 	if ((con = npf_conn_lookup(npc, di, &forw)) == NULL) {
    366 		return NULL;
    367 	}
    368 
    369 	/* Inspect the protocol data and handle state changes. */
    370 	mutex_enter(&con->c_lock);
    371 	ok = npf_state_inspect(npc, &con->c_state, forw);
    372 	mutex_exit(&con->c_lock);
    373 
    374 	/* If invalid state: let the rules deal with it. */
    375 	if (__predict_false(!ok)) {
    376 		npf_conn_release(con);
    377 		npf_stats_inc(npc->npc_ctx, NPF_STAT_INVALID_STATE);
    378 		return NULL;
    379 	}
    380 
    381 	/*
    382 	 * If this is multi-end state, then specially tag the packet
    383 	 * so it will be just passed-through on other interfaces.
    384 	 */
    385 	if (con->c_ifid == 0 && nbuf_add_tag(nbuf, NPF_NTAG_PASS) != 0) {
    386 		npf_conn_release(con);
    387 		*error = ENOMEM;
    388 		return NULL;
    389 	}
    390 	return con;
    391 }
    392 
    393 /*
    394  * npf_conn_establish: create a new connection, insert into the global list.
    395  *
    396  * => Connection is created with the reference held for the caller.
    397  * => Connection will be activated on the first reference release.
    398  */
    399 npf_conn_t *
    400 npf_conn_establish(npf_cache_t *npc, int di, bool global)
    401 {
    402 	npf_t *npf = npc->npc_ctx;
    403 	const unsigned alen = npc->npc_alen;
    404 	const unsigned idx = NPF_CONNCACHE(alen);
    405 	const nbuf_t *nbuf = npc->npc_nbuf;
    406 	npf_connkey_t *fw, *bk;
    407 	npf_conn_t *con;
    408 	int error = 0;
    409 
    410 	KASSERT(!nbuf_flag_p(nbuf, NBUF_DATAREF_RESET));
    411 
    412 	if (!npf_conn_trackable_p(npc)) {
    413 		return NULL;
    414 	}
    415 
    416 	/* Allocate and initialise the new connection. */
    417 	con = pool_cache_get(npf->conn_cache[idx], PR_NOWAIT);
    418 	if (__predict_false(!con)) {
    419 		npf_worker_signal(npf);
    420 		return NULL;
    421 	}
    422 	NPF_PRINTF(("NPF: create conn %p\n", con));
    423 	npf_stats_inc(npf, NPF_STAT_CONN_CREATE);
    424 
    425 	mutex_init(&con->c_lock, MUTEX_DEFAULT, IPL_SOFTNET);
    426 	con->c_flags = (di & PFIL_ALL);
    427 	con->c_refcnt = 0;
    428 	con->c_rproc = NULL;
    429 	con->c_nat = NULL;
    430 
    431 	con->c_proto = npc->npc_proto;
    432 	CTASSERT(sizeof(con->c_proto) >= sizeof(npc->npc_proto));
    433 
    434 	/* Initialize the protocol state. */
    435 	if (!npf_state_init(npc, &con->c_state)) {
    436 		npf_conn_destroy_idx(npf, con, idx);
    437 		return NULL;
    438 	}
    439 	KASSERT(npf_iscached(npc, NPC_IP46));
    440 
    441 	fw = npf_conn_getforwkey(con);
    442 	bk = npf_conn_getbackkey(con, alen);
    443 
    444 	/*
    445 	 * Construct "forwards" and "backwards" keys.  Also, set the
    446 	 * interface ID for this connection (unless it is global).
    447 	 */
    448 	if (!npf_conn_conkey(npc, fw, true) ||
    449 	    !npf_conn_conkey(npc, bk, false)) {
    450 		npf_conn_destroy_idx(npf, con, idx);
    451 		return NULL;
    452 	}
    453 	con->c_ifid = global ? nbuf->nb_ifid : 0;
    454 
    455 	/*
    456 	 * Set last activity time for a new connection and acquire
    457 	 * a reference for the caller before we make it visible.
    458 	 */
    459 	conn_update_atime(con);
    460 	con->c_refcnt = 1;
    461 
    462 	/*
    463 	 * Insert both keys (entries representing directions) of the
    464 	 * connection.  At this point it becomes visible, but we activate
    465 	 * the connection later.
    466 	 */
    467 	mutex_enter(&con->c_lock);
    468 	if (!npf_conndb_insert(npf->conn_db, fw, con, true)) {
    469 		error = EISCONN;
    470 		goto err;
    471 	}
    472 	if (!npf_conndb_insert(npf->conn_db, bk, con, false)) {
    473 		npf_conn_t *ret __diagused;
    474 		ret = npf_conndb_remove(npf->conn_db, fw);
    475 		KASSERT(ret == con);
    476 		error = EISCONN;
    477 		goto err;
    478 	}
    479 err:
    480 	/*
    481 	 * If we have hit the duplicate: mark the connection as expired
    482 	 * and let the G/C thread to take care of it.  We cannot do it
    483 	 * here since there might be references acquired already.
    484 	 */
    485 	if (error) {
    486 		atomic_or_uint(&con->c_flags, CONN_REMOVED | CONN_EXPIRE);
    487 		atomic_dec_uint(&con->c_refcnt);
    488 		npf_stats_inc(npf, NPF_STAT_RACE_CONN);
    489 	} else {
    490 		NPF_PRINTF(("NPF: establish conn %p\n", con));
    491 	}
    492 
    493 	/* Finally, insert into the connection list. */
    494 	npf_conndb_enqueue(npf->conn_db, con);
    495 	mutex_exit(&con->c_lock);
    496 
    497 	return error ? NULL : con;
    498 }
    499 
    500 void
    501 npf_conn_destroy(npf_t *npf, npf_conn_t *con)
    502 {
    503 	const npf_connkey_t *key = npf_conn_getforwkey(con);
    504 	const unsigned alen = NPF_CONNKEY_ALEN(key);
    505 	npf_conn_destroy_idx(npf, con, NPF_CONNCACHE(alen));
    506 }
    507 
    508 static void
    509 npf_conn_destroy_idx(npf_t *npf, npf_conn_t *con, unsigned idx)
    510 {
    511 	KASSERT(con->c_refcnt == 0);
    512 
    513 	if (con->c_nat) {
    514 		/* Release any NAT structures. */
    515 		npf_nat_destroy(con->c_nat);
    516 	}
    517 	if (con->c_rproc) {
    518 		/* Release the rule procedure. */
    519 		npf_rproc_release(con->c_rproc);
    520 	}
    521 
    522 	/* Destroy the state. */
    523 	npf_state_destroy(&con->c_state);
    524 	mutex_destroy(&con->c_lock);
    525 
    526 	/* Free the structure, increase the counter. */
    527 	pool_cache_put(npf->conn_cache[idx], con);
    528 	npf_stats_inc(npf, NPF_STAT_CONN_DESTROY);
    529 	NPF_PRINTF(("NPF: conn %p destroyed\n", con));
    530 }
    531 
    532 /*
    533  * npf_conn_setnat: associate NAT entry with the connection, update and
    534  * re-insert connection entry using the translation values.
    535  *
    536  * => The caller must be holding a reference.
    537  */
    538 int
    539 npf_conn_setnat(const npf_cache_t *npc, npf_conn_t *con,
    540     npf_nat_t *nt, unsigned ntype)
    541 {
    542 	static const u_int nat_type_dimap[] = {
    543 		[NPF_NATOUT] = NPF_DST,
    544 		[NPF_NATIN] = NPF_SRC,
    545 	};
    546 	npf_t *npf = npc->npc_ctx;
    547 	npf_connkey_t key, *fw, *bk;
    548 	npf_conn_t *ret __diagused;
    549 	npf_addr_t *taddr;
    550 	in_port_t tport;
    551 
    552 	KASSERT(con->c_refcnt > 0);
    553 
    554 	npf_nat_gettrans(nt, &taddr, &tport);
    555 	KASSERT(ntype == NPF_NATOUT || ntype == NPF_NATIN);
    556 
    557 	/* Construct a "backwards" key. */
    558 	if (!npf_conn_conkey(npc, &key, false)) {
    559 		return EINVAL;
    560 	}
    561 
    562 	/* Acquire the lock and check for the races. */
    563 	mutex_enter(&con->c_lock);
    564 	if (__predict_false(con->c_flags & CONN_EXPIRE)) {
    565 		/* The connection got expired. */
    566 		mutex_exit(&con->c_lock);
    567 		return EINVAL;
    568 	}
    569 	KASSERT((con->c_flags & CONN_REMOVED) == 0);
    570 
    571 	if (__predict_false(con->c_nat != NULL)) {
    572 		/* Race with a duplicate packet. */
    573 		mutex_exit(&con->c_lock);
    574 		npf_stats_inc(npc->npc_ctx, NPF_STAT_RACE_NAT);
    575 		return EISCONN;
    576 	}
    577 
    578 	/* Remove the "backwards" key. */
    579 	fw = npf_conn_getforwkey(con);
    580 	bk = npf_conn_getbackkey(con, NPF_CONNKEY_ALEN(fw));
    581 	ret = npf_conndb_remove(npf->conn_db, bk);
    582 	KASSERT(ret == con);
    583 
    584 	/* Set the source/destination IDs to the translation values. */
    585 	npf_conn_adjkey(bk, taddr, tport, nat_type_dimap[ntype]);
    586 
    587 	/* Finally, re-insert the "backwards" key. */
    588 	if (!npf_conndb_insert(npf->conn_db, bk, con, false)) {
    589 		/*
    590 		 * Race: we have hit the duplicate, remove the "forwards"
    591 		 * key and expire our connection; it is no longer valid.
    592 		 */
    593 		ret = npf_conndb_remove(npf->conn_db, fw);
    594 		KASSERT(ret == con);
    595 
    596 		atomic_or_uint(&con->c_flags, CONN_REMOVED | CONN_EXPIRE);
    597 		mutex_exit(&con->c_lock);
    598 
    599 		npf_stats_inc(npc->npc_ctx, NPF_STAT_RACE_NAT);
    600 		return EISCONN;
    601 	}
    602 
    603 	/* Associate the NAT entry and release the lock. */
    604 	con->c_nat = nt;
    605 	mutex_exit(&con->c_lock);
    606 	return 0;
    607 }
    608 
    609 /*
    610  * npf_conn_expire: explicitly mark connection as expired.
    611  */
    612 void
    613 npf_conn_expire(npf_conn_t *con)
    614 {
    615 	/* KASSERT(con->c_refcnt > 0); XXX: npf_nat_freepolicy() */
    616 	atomic_or_uint(&con->c_flags, CONN_EXPIRE);
    617 }
    618 
    619 /*
    620  * npf_conn_pass: return true if connection is "pass" one, otherwise false.
    621  */
    622 bool
    623 npf_conn_pass(const npf_conn_t *con, npf_match_info_t *mi, npf_rproc_t **rp)
    624 {
    625 	KASSERT(con->c_refcnt > 0);
    626 	if (__predict_true(con->c_flags & CONN_PASS)) {
    627 		mi->mi_rid = con->c_rid;
    628 		mi->mi_retfl = con->c_retfl;
    629 		*rp = con->c_rproc;
    630 		return true;
    631 	}
    632 	return false;
    633 }
    634 
    635 /*
    636  * npf_conn_setpass: mark connection as a "pass" one and associate the
    637  * rule procedure with it.
    638  */
    639 void
    640 npf_conn_setpass(npf_conn_t *con, const npf_match_info_t *mi, npf_rproc_t *rp)
    641 {
    642 	KASSERT((con->c_flags & CONN_ACTIVE) == 0);
    643 	KASSERT(con->c_refcnt > 0);
    644 	KASSERT(con->c_rproc == NULL);
    645 
    646 	/*
    647 	 * No need for atomic since the connection is not yet active.
    648 	 * If rproc is set, the caller transfers its reference to us,
    649 	 * which will be released on npf_conn_destroy().
    650 	 */
    651 	atomic_or_uint(&con->c_flags, CONN_PASS);
    652 	con->c_rproc = rp;
    653 	if (rp) {
    654 		con->c_rid = mi->mi_rid;
    655 		con->c_retfl = mi->mi_retfl;
    656 	}
    657 }
    658 
    659 /*
    660  * npf_conn_release: release a reference, which might allow G/C thread
    661  * to destroy this connection.
    662  */
    663 void
    664 npf_conn_release(npf_conn_t *con)
    665 {
    666 	if ((con->c_flags & (CONN_ACTIVE | CONN_EXPIRE)) == 0) {
    667 		/* Activate: after this, connection is globally visible. */
    668 		atomic_or_uint(&con->c_flags, CONN_ACTIVE);
    669 	}
    670 	KASSERT(con->c_refcnt > 0);
    671 	atomic_dec_uint(&con->c_refcnt);
    672 }
    673 
    674 /*
    675  * npf_conn_getnat: return associated NAT data entry and indicate
    676  * whether it is a "forwards" or "backwards" stream.
    677  */
    678 npf_nat_t *
    679 npf_conn_getnat(npf_conn_t *con, const int di, bool *forw)
    680 {
    681 	KASSERT(con->c_refcnt > 0);
    682 	*forw = (con->c_flags & PFIL_ALL) == (u_int)di;
    683 	return con->c_nat;
    684 }
    685 
    686 /*
    687  * npf_conn_expired: criterion to check if connection is expired.
    688  */
    689 bool
    690 npf_conn_expired(npf_t *npf, const npf_conn_t *con, uint64_t tsnow)
    691 {
    692 	const int etime = npf_state_etime(npf, &con->c_state, con->c_proto);
    693 	int elapsed;
    694 
    695 	if (__predict_false(con->c_flags & CONN_EXPIRE)) {
    696 		/* Explicitly marked to be expired. */
    697 		return true;
    698 	}
    699 
    700 	/*
    701 	 * Note: another thread may update 'atime' and it might
    702 	 * become greater than 'now'.
    703 	 */
    704 	elapsed = (int64_t)tsnow - con->c_atime;
    705 	return elapsed > etime;
    706 }
    707 
    708 /*
    709  * npf_conn_remove: unlink the connection and mark as expired.
    710  */
    711 void
    712 npf_conn_remove(npf_conndb_t *cd, npf_conn_t *con)
    713 {
    714 	/* Remove both entries of the connection. */
    715 	mutex_enter(&con->c_lock);
    716 	if ((con->c_flags & CONN_REMOVED) == 0) {
    717 		npf_connkey_t *fw, *bk;
    718 		npf_conn_t *ret __diagused;
    719 
    720 		fw = npf_conn_getforwkey(con);
    721 		ret = npf_conndb_remove(cd, fw);
    722 		KASSERT(ret == con);
    723 
    724 		bk = npf_conn_getbackkey(con, NPF_CONNKEY_ALEN(fw));
    725 		ret = npf_conndb_remove(cd, bk);
    726 		KASSERT(ret == con);
    727 	}
    728 
    729 	/* Flag the removal and expiration. */
    730 	atomic_or_uint(&con->c_flags, CONN_REMOVED | CONN_EXPIRE);
    731 	mutex_exit(&con->c_lock);
    732 }
    733 
    734 /*
    735  * npf_conn_worker: G/C to run from a worker thread.
    736  */
    737 void
    738 npf_conn_worker(npf_t *npf)
    739 {
    740 	npf_conndb_gc(npf, npf->conn_db, false, true);
    741 }
    742 
    743 /*
    744  * npf_conndb_export: construct a list of connections prepared for saving.
    745  * Note: this is expected to be an expensive operation.
    746  */
    747 int
    748 npf_conndb_export(npf_t *npf, nvlist_t *npf_dict)
    749 {
    750 	npf_conn_t *head, *con;
    751 
    752 	/*
    753 	 * Note: acquire conn_lock to prevent from the database
    754 	 * destruction and G/C thread.
    755 	 */
    756 	mutex_enter(&npf->conn_lock);
    757 	if (npf->conn_tracking != CONN_TRACKING_ON) {
    758 		mutex_exit(&npf->conn_lock);
    759 		return 0;
    760 	}
    761 	head = npf_conndb_getlist(npf->conn_db);
    762 	con = head;
    763 	while (con) {
    764 		nvlist_t *cdict;
    765 
    766 		if ((cdict = npf_conn_export(npf, con)) != NULL) {
    767 			nvlist_append_nvlist_array(npf_dict, "conn-list", cdict);
    768 			nvlist_destroy(cdict);
    769 		}
    770 		if ((con = npf_conndb_getnext(npf->conn_db, con)) == head) {
    771 			break;
    772 		}
    773 	}
    774 	mutex_exit(&npf->conn_lock);
    775 	return 0;
    776 }
    777 
    778 /*
    779  * npf_conn_export: serialise a single connection.
    780  */
    781 static nvlist_t *
    782 npf_conn_export(npf_t *npf, npf_conn_t *con)
    783 {
    784 	nvlist_t *cdict, *kdict;
    785 	npf_connkey_t *fw, *bk;
    786 	unsigned alen;
    787 
    788 	if ((con->c_flags & (CONN_ACTIVE|CONN_EXPIRE)) != CONN_ACTIVE) {
    789 		return NULL;
    790 	}
    791 	cdict = nvlist_create(0);
    792 	nvlist_add_number(cdict, "flags", con->c_flags);
    793 	nvlist_add_number(cdict, "proto", con->c_proto);
    794 	if (con->c_ifid) {
    795 		const char *ifname = npf_ifmap_getname(npf, con->c_ifid);
    796 		nvlist_add_string(cdict, "ifname", ifname);
    797 	}
    798 	nvlist_add_binary(cdict, "state", &con->c_state, sizeof(npf_state_t));
    799 
    800 	fw = npf_conn_getforwkey(con);
    801 	alen = NPF_CONNKEY_ALEN(fw);
    802 	bk = npf_conn_getbackkey(con, alen);
    803 
    804 	kdict = npf_connkey_export(fw);
    805 	nvlist_move_nvlist(cdict, "forw-key", kdict);
    806 
    807 	kdict = npf_connkey_export(bk);
    808 	nvlist_move_nvlist(cdict, "back-key", kdict);
    809 
    810 	/* Let the address length be based on on first key. */
    811 	nvlist_add_number(cdict, "alen", alen);
    812 
    813 	if (con->c_nat) {
    814 		npf_nat_export(cdict, con->c_nat);
    815 	}
    816 	return cdict;
    817 }
    818 
    819 /*
    820  * npf_conn_import: fully reconstruct a single connection from a
    821  * nvlist and insert into the given database.
    822  */
    823 int
    824 npf_conn_import(npf_t *npf, npf_conndb_t *cd, const nvlist_t *cdict,
    825     npf_ruleset_t *natlist)
    826 {
    827 	npf_conn_t *con;
    828 	npf_connkey_t *fw, *bk;
    829 	const nvlist_t *nat, *conkey;
    830 	const char *ifname;
    831 	const void *state;
    832 	unsigned alen, idx;
    833 	size_t len;
    834 
    835 	/*
    836 	 * To determine the length of the connection, which depends
    837 	 * on the address length in the connection keys.
    838 	 */
    839 	alen = dnvlist_get_number(cdict, "alen", 0);
    840 	idx = NPF_CONNCACHE(alen);
    841 
    842 	/* Allocate a connection and initialise it (clear first). */
    843 	con = pool_cache_get(npf->conn_cache[idx], PR_WAITOK);
    844 	memset(con, 0, sizeof(npf_conn_t));
    845 	mutex_init(&con->c_lock, MUTEX_DEFAULT, IPL_SOFTNET);
    846 	npf_stats_inc(npf, NPF_STAT_CONN_CREATE);
    847 
    848 	con->c_proto = dnvlist_get_number(cdict, "proto", 0);
    849 	con->c_flags = dnvlist_get_number(cdict, "flags", 0);
    850 	con->c_flags &= PFIL_ALL | CONN_ACTIVE | CONN_PASS;
    851 	conn_update_atime(con);
    852 
    853 	ifname = dnvlist_get_string(cdict, "ifname", NULL);
    854 	if (ifname && (con->c_ifid = npf_ifmap_register(npf, ifname)) == 0) {
    855 		goto err;
    856 	}
    857 
    858 	state = dnvlist_get_binary(cdict, "state", &len, NULL, 0);
    859 	if (!state || len != sizeof(npf_state_t)) {
    860 		goto err;
    861 	}
    862 	memcpy(&con->c_state, state, sizeof(npf_state_t));
    863 
    864 	/* Reconstruct NAT association, if any. */
    865 	if ((nat = dnvlist_get_nvlist(cdict, "nat", NULL)) != NULL &&
    866 	    (con->c_nat = npf_nat_import(npf, nat, natlist, con)) == NULL) {
    867 		goto err;
    868 	}
    869 
    870 	/*
    871 	 * Fetch and copy the keys for each direction.
    872 	 */
    873 	fw = npf_conn_getforwkey(con);
    874 	conkey = dnvlist_get_nvlist(cdict, "forw-key", NULL);
    875 	if (conkey == NULL || !npf_connkey_import(conkey, fw)) {
    876 		goto err;
    877 	}
    878 	bk = npf_conn_getbackkey(con, NPF_CONNKEY_ALEN(fw));
    879 	conkey = dnvlist_get_nvlist(cdict, "back-key", NULL);
    880 	if (conkey == NULL || !npf_connkey_import(conkey, bk)) {
    881 		goto err;
    882 	}
    883 
    884 	/* Guard against the contradicting address lengths. */
    885 	if (NPF_CONNKEY_ALEN(fw) != alen || NPF_CONNKEY_ALEN(bk) != alen) {
    886 		goto err;
    887 	}
    888 
    889 	/* Insert the entries and the connection itself. */
    890 	if (!npf_conndb_insert(cd, fw, con, true)) {
    891 		goto err;
    892 	}
    893 	if (!npf_conndb_insert(cd, bk, con, false)) {
    894 		npf_conndb_remove(cd, fw);
    895 		goto err;
    896 	}
    897 
    898 	NPF_PRINTF(("NPF: imported conn %p\n", con));
    899 	npf_conndb_enqueue(cd, con);
    900 	return 0;
    901 err:
    902 	npf_conn_destroy_idx(npf, con, idx);
    903 	return EINVAL;
    904 }
    905 
    906 int
    907 npf_conn_find(npf_t *npf, const nvlist_t *idict, nvlist_t **odict)
    908 {
    909 	const nvlist_t *kdict;
    910 	npf_connkey_t key;
    911 	npf_conn_t *con;
    912 	uint16_t dir;
    913 	bool forw;
    914 
    915 	kdict = dnvlist_get_nvlist(idict, "key", NULL);
    916 	if (!kdict || !npf_connkey_import(kdict, &key)) {
    917 		return EINVAL;
    918 	}
    919 	con = npf_conndb_lookup(npf->conn_db, &key, &forw);
    920 	if (con == NULL) {
    921 		return ESRCH;
    922 	}
    923 	dir = dnvlist_get_number(idict, "direction", 0);
    924 	if (!npf_conn_check(con, NULL, dir, true)) {
    925 		atomic_dec_uint(&con->c_refcnt);
    926 		return ESRCH;
    927 	}
    928 	*odict = npf_conn_export(npf, con);
    929 	atomic_dec_uint(&con->c_refcnt);
    930 	return *odict ? 0 : ENOSPC;
    931 }
    932 
    933 #if defined(DDB) || defined(_NPF_TESTING)
    934 
    935 void
    936 npf_conn_print(npf_conn_t *con)
    937 {
    938 	const npf_connkey_t *fw = npf_conn_getforwkey(con);
    939 	const npf_connkey_t *bk = npf_conn_getbackkey(con, NPF_CONNKEY_ALEN(fw));
    940 	const unsigned proto = con->c_proto;
    941 	struct timespec tspnow;
    942 
    943 	getnanouptime(&tspnow);
    944 	printf("%p:\n\tproto %d flags 0x%x tsdiff %ld etime %d\n", con,
    945 	    proto, con->c_flags, (long)(tspnow.tv_sec - con->c_atime),
    946 	    npf_state_etime(npf_getkernctx(), &con->c_state, proto));
    947 	npf_connkey_print(fw);
    948 	npf_connkey_print(bk);
    949 	npf_state_dump(&con->c_state);
    950 	if (con->c_nat) {
    951 		npf_nat_dump(con->c_nat);
    952 	}
    953 }
    954 
    955 #endif
    956