Home | History | Annotate | Line # | Download | only in npf
npf_conn.c revision 1.27.2.2
      1 /*-
      2  * Copyright (c) 2014-2018 Mindaugas Rasiukevicius <rmind at netbsd org>
      3  * Copyright (c) 2010-2014 The NetBSD Foundation, Inc.
      4  * All rights reserved.
      5  *
      6  * This material is based upon work partially supported by The
      7  * NetBSD Foundation under a contract with Mindaugas Rasiukevicius.
      8  *
      9  * Redistribution and use in source and binary forms, with or without
     10  * modification, are permitted provided that the following conditions
     11  * are met:
     12  * 1. Redistributions of source code must retain the above copyright
     13  *    notice, this list of conditions and the following disclaimer.
     14  * 2. Redistributions in binary form must reproduce the above copyright
     15  *    notice, this list of conditions and the following disclaimer in the
     16  *    documentation and/or other materials provided with the distribution.
     17  *
     18  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
     19  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
     20  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     21  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
     22  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
     23  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
     24  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
     25  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
     26  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
     27  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
     28  * POSSIBILITY OF SUCH DAMAGE.
     29  */
     30 
     31 /*
     32  * NPF connection tracking for stateful filtering and translation.
     33  *
     34  * Overview
     35  *
     36  *	Packets can be incoming or outgoing with respect to an interface.
     37  *	Connection direction is identified by the direction of its first
     38  *	packet.  The meaning of incoming/outgoing packet in the context of
     39  *	connection direction can be confusing.  Therefore, we will use the
     40  *	terms "forwards stream" and "backwards stream", where packets in
     41  *	the forwards stream mean the packets travelling in the same
     42  *	direction as the connection direction.
     43  *
     44  *	All connections have two keys and thus two entries:
     45  *
     46  *	- npf_conn_getforwkey(con)        -- for the forwards stream;
     47  *	- npf_conn_getbackkey(con, alen)  -- for the backwards stream.
     48  *
     49  *	Note: the keys are stored in npf_conn_t::c_keys[], which is used
     50  *	to allocate variable-length npf_conn_t structures based on whether
     51  *	the IPv4 or IPv6 addresses are used.  See the npf_connkey.c source
     52  *	file for the description of the key layouts.
     53  *
     54  *	The keys are formed from the 5-tuple (source/destination address,
     55  *	source/destination port and the protocol).  Additional matching
     56  *	is performed for the interface (a common behaviour is equivalent
     57  *	to the 6-tuple lookup including the interface ID).  Note that the
     58  *	key may be formed using translated values in a case of NAT.
     59  *
     60  *	Connections can serve two purposes: for the implicit passing or
     61  *	to accommodate the dynamic NAT.  Connections for the former purpose
     62  *	are created by the rules with "stateful" attribute and are used for
     63  *	stateful filtering.  Such connections indicate that the packet of
     64  *	the backwards stream should be passed without inspection of the
     65  *	ruleset.  The other purpose is to associate a dynamic NAT mechanism
     66  *	with a connection.  Such connections are created by the NAT policies
     67  *	and they have a relationship with NAT translation structure via
     68  *	npf_conn_t::c_nat.  A single connection can serve both purposes,
     69  *	which is a common case.
     70  *
     71  * Connection life-cycle
     72  *
     73  *	Connections are established when a packet matches said rule or
     74  *	NAT policy.  Both keys of the established connection are inserted
     75  *	into the connection database.  A garbage collection thread
     76  *	periodically scans all connections and depending on connection
     77  *	properties (e.g. last activity time, protocol) removes connection
     78  *	entries and expires the actual connections.
     79  *
     80  *	Each connection has a reference count.  The reference is acquired
     81  *	on lookup and should be released by the caller.  It guarantees that
     82  *	the connection will not be destroyed, although it may be expired.
     83  *
     84  * Synchronisation
     85  *
     86  *	Connection database is accessed in a lock-less manner by the main
     87  *	routines: npf_conn_inspect() and npf_conn_establish().  Since they
     88  *	are always called from a software interrupt, the database is
     89  *	protected using passive serialisation.  The main place which can
     90  *	destroy a connection is npf_conn_worker().  The database itself
     91  *	can be replaced and destroyed in npf_conn_load().
     92  *
     93  * ALG support
     94  *
     95  *	Application-level gateways (ALGs) can override generic connection
     96  *	inspection (npf_alg_conn() call in npf_conn_inspect() function) by
     97  *	performing their own lookup using different key.  Recursive call
     98  *	to npf_conn_inspect() is not allowed.  The ALGs ought to use the
     99  *	npf_conn_lookup() function for this purpose.
    100  *
    101  * Lock order
    102  *
    103  *	npf_config_lock ->
    104  *		conn_lock ->
    105  *			npf_conn_t::c_lock
    106  */
    107 
    108 #ifdef _KERNEL
    109 #include <sys/cdefs.h>
    110 __KERNEL_RCSID(0, "$NetBSD: npf_conn.c,v 1.27.2.2 2019/10/04 08:06:35 martin Exp $");
    111 
    112 #include <sys/param.h>
    113 #include <sys/types.h>
    114 
    115 #include <netinet/in.h>
    116 #include <netinet/tcp.h>
    117 
    118 #include <sys/atomic.h>
    119 #include <sys/kmem.h>
    120 #include <sys/mutex.h>
    121 #include <net/pfil.h>
    122 #include <sys/pool.h>
    123 #include <sys/queue.h>
    124 #include <sys/systm.h>
    125 #endif
    126 
    127 #define __NPF_CONN_PRIVATE
    128 #include "npf_conn.h"
    129 #include "npf_impl.h"
    130 
/*
 * A helper to select the IPv4 or IPv6 connection cache.
 *
 * Evaluates to bit 4 of the address length: an address length of 4
 * yields index 0 and an address length of 16 yields index 1, matching
 * the npf_t::conn_cache[] slots created in npf_conn_init().
 */
#define	NPF_CONNCACHE(alen)	(((alen) >> 4) & 0x1)

/*
 * Connection flags: PFIL_IN and PFIL_OUT values are reserved for direction.
 *
 * The compile-time assertion below pins the direction bits to the two
 * lowest bits, so the CONN_* flags can start at 0x004 without clashing.
 */
CTASSERT(PFIL_ALL == (0x001 | 0x002));
#define	CONN_ACTIVE	0x004	/* visible on inspection */
#define	CONN_PASS	0x008	/* perform implicit passing */
#define	CONN_EXPIRE	0x010	/* explicitly expire */
#define	CONN_REMOVED	0x020	/* "forw/back" entries removed */

/* Values stored in npf_t::conn_tracking. */
enum { CONN_TRACKING_OFF, CONN_TRACKING_ON };

/* Forward declaration: used by npf_conndb_export() before its definition. */
static nvlist_t *npf_conn_export(npf_t *, npf_conn_t *);
    146 
    147 /*
    148  * npf_conn_sys{init,fini}: initialise/destroy connection tracking.
    149  */
    150 
    151 void
    152 npf_conn_init(npf_t *npf)
    153 {
    154 	npf->conn_cache[0] = pool_cache_init(
    155 	    offsetof(npf_conn_t, c_keys[NPF_CONNKEY_V4WORDS * 2]),
    156 	    0, 0, 0, "npfcn4pl", NULL, IPL_NET, NULL, NULL, NULL);
    157 	npf->conn_cache[1] = pool_cache_init(
    158 	    offsetof(npf_conn_t, c_keys[NPF_CONNKEY_V6WORDS * 2]),
    159 	    0, 0, 0, "npfcn6pl", NULL, IPL_NET, NULL, NULL, NULL);
    160 
    161 	mutex_init(&npf->conn_lock, MUTEX_DEFAULT, IPL_NONE);
    162 	npf->conn_tracking = CONN_TRACKING_OFF;
    163 	npf->conn_db = npf_conndb_create();
    164 	npf_conndb_sysinit(npf);
    165 }
    166 
    167 void
    168 npf_conn_fini(npf_t *npf)
    169 {
    170 	npf_conndb_sysfini(npf);
    171 
    172 	/* Note: the caller should have flushed the connections. */
    173 	KASSERT(npf->conn_tracking == CONN_TRACKING_OFF);
    174 	npf_worker_unregister(npf, npf_conn_worker);
    175 
    176 	npf_conndb_destroy(npf->conn_db);
    177 	pool_cache_destroy(npf->conn_cache[0]);
    178 	pool_cache_destroy(npf->conn_cache[1]);
    179 	mutex_destroy(&npf->conn_lock);
    180 }
    181 
    182 /*
    183  * npf_conn_load: perform the load by flushing the current connection
    184  * database and replacing it with the new one or just destroying.
    185  *
    186  * => The caller must disable the connection tracking and ensure that
    187  *    there are no connection database lookups or references in-flight.
    188  */
    189 void
    190 npf_conn_load(npf_t *npf, npf_conndb_t *ndb, bool track)
    191 {
    192 	npf_conndb_t *odb = NULL;
    193 
    194 	KASSERT(npf_config_locked_p(npf));
    195 
    196 	/*
    197 	 * The connection database is in the quiescent state.
    198 	 * Prevent G/C thread from running and install a new database.
    199 	 */
    200 	mutex_enter(&npf->conn_lock);
    201 	if (ndb) {
    202 		KASSERT(npf->conn_tracking == CONN_TRACKING_OFF);
    203 		odb = npf->conn_db;
    204 		npf->conn_db = ndb;
    205 		membar_sync();
    206 	}
    207 	if (track) {
    208 		/* After this point lookups start flying in. */
    209 		npf->conn_tracking = CONN_TRACKING_ON;
    210 	}
    211 	mutex_exit(&npf->conn_lock);
    212 
    213 	if (odb) {
    214 		/*
    215 		 * Flush all, no sync since the caller did it for us.
    216 		 * Also, release the pool cache memory.
    217 		 */
    218 		npf_conndb_gc(npf, odb, true, false);
    219 		npf_conndb_destroy(odb);
    220 		pool_cache_invalidate(npf->conn_cache[0]);
    221 		pool_cache_invalidate(npf->conn_cache[1]);
    222 	}
    223 }
    224 
    225 /*
    226  * npf_conn_tracking: enable/disable connection tracking.
    227  */
    228 void
    229 npf_conn_tracking(npf_t *npf, bool track)
    230 {
    231 	KASSERT(npf_config_locked_p(npf));
    232 	npf->conn_tracking = track ? CONN_TRACKING_ON : CONN_TRACKING_OFF;
    233 }
    234 
    235 static inline bool
    236 npf_conn_trackable_p(const npf_cache_t *npc)
    237 {
    238 	const npf_t *npf = npc->npc_ctx;
    239 
    240 	/*
    241 	 * Check if connection tracking is on.  Also, if layer 3 and 4 are
    242 	 * not cached - protocol is not supported or packet is invalid.
    243 	 */
    244 	if (npf->conn_tracking != CONN_TRACKING_ON) {
    245 		return false;
    246 	}
    247 	if (!npf_iscached(npc, NPC_IP46) || !npf_iscached(npc, NPC_LAYER4)) {
    248 		return false;
    249 	}
    250 	return true;
    251 }
    252 
    253 static inline void
    254 conn_update_atime(npf_conn_t *con)
    255 {
    256 	struct timespec tsnow;
    257 
    258 	getnanouptime(&tsnow);
    259 	con->c_atime = tsnow.tv_sec;
    260 }
    261 
    262 /*
    263  * npf_conn_check: check that:
    264  *
    265  *	- the connection is active;
    266  *
    267  *	- the packet is travelling in the right direction with the respect
    268  *	  to the connection direction (if interface-id is not zero);
    269  *
    270  *	- the packet is travelling on the same interface as the
    271  *	  connection interface (if interface-id is not zero).
    272  */
    273 static bool
    274 npf_conn_check(const npf_conn_t *con, const nbuf_t *nbuf,
    275     const unsigned di, const bool forw)
    276 {
    277 	const uint32_t flags = con->c_flags;
    278 	const unsigned ifid = con->c_ifid;
    279 	bool active, pforw;
    280 
    281 	active = (flags & (CONN_ACTIVE | CONN_EXPIRE)) == CONN_ACTIVE;
    282 	if (__predict_false(!active)) {
    283 		return false;
    284 	}
    285 	if (ifid && nbuf) {
    286 		pforw = (flags & PFIL_ALL) == (unsigned)di;
    287 		if (__predict_false(forw != pforw)) {
    288 			return false;
    289 		}
    290 		if (__predict_false(ifid != nbuf->nb_ifid)) {
    291 			return false;
    292 		}
    293 	}
    294 	return true;
    295 }
    296 
    297 /*
    298  * npf_conn_lookup: lookup if there is an established connection.
    299  *
    300  * => If found, we will hold a reference for the caller.
    301  */
    302 npf_conn_t *
    303 npf_conn_lookup(const npf_cache_t *npc, const int di, bool *forw)
    304 {
    305 	npf_t *npf = npc->npc_ctx;
    306 	const nbuf_t *nbuf = npc->npc_nbuf;
    307 	npf_conn_t *con;
    308 	npf_connkey_t key;
    309 
    310 	/* Construct a key and lookup for a connection in the store. */
    311 	if (!npf_conn_conkey(npc, &key, true)) {
    312 		return NULL;
    313 	}
    314 	con = npf_conndb_lookup(npf->conn_db, &key, forw);
    315 	if (con == NULL) {
    316 		return NULL;
    317 	}
    318 	KASSERT(npc->npc_proto == con->c_proto);
    319 
    320 	/* Extra checks for the connection and packet. */
    321 	if (!npf_conn_check(con, nbuf, di, *forw)) {
    322 		atomic_dec_uint(&con->c_refcnt);
    323 		return NULL;
    324 	}
    325 
    326 	/* Update the last activity time. */
    327 	conn_update_atime(con);
    328 	return con;
    329 }
    330 
    331 /*
    332  * npf_conn_inspect: lookup a connection and inspecting the protocol data.
    333  *
    334  * => If found, we will hold a reference for the caller.
    335  */
    336 npf_conn_t *
    337 npf_conn_inspect(npf_cache_t *npc, const int di, int *error)
    338 {
    339 	nbuf_t *nbuf = npc->npc_nbuf;
    340 	npf_conn_t *con;
    341 	bool forw, ok;
    342 
    343 	KASSERT(!nbuf_flag_p(nbuf, NBUF_DATAREF_RESET));
    344 	if (!npf_conn_trackable_p(npc)) {
    345 		return NULL;
    346 	}
    347 
    348 	/* Query ALG which may lookup connection for us. */
    349 	if ((con = npf_alg_conn(npc, di)) != NULL) {
    350 		/* Note: reference is held. */
    351 		return con;
    352 	}
    353 	if (nbuf_head_mbuf(nbuf) == NULL) {
    354 		*error = ENOMEM;
    355 		return NULL;
    356 	}
    357 	KASSERT(!nbuf_flag_p(nbuf, NBUF_DATAREF_RESET));
    358 
    359 	/* Main lookup of the connection. */
    360 	if ((con = npf_conn_lookup(npc, di, &forw)) == NULL) {
    361 		return NULL;
    362 	}
    363 
    364 	/* Inspect the protocol data and handle state changes. */
    365 	mutex_enter(&con->c_lock);
    366 	ok = npf_state_inspect(npc, &con->c_state, forw);
    367 	mutex_exit(&con->c_lock);
    368 
    369 	/* If invalid state: let the rules deal with it. */
    370 	if (__predict_false(!ok)) {
    371 		npf_conn_release(con);
    372 		npf_stats_inc(npc->npc_ctx, NPF_STAT_INVALID_STATE);
    373 		return NULL;
    374 	}
    375 
    376 	/*
    377 	 * If this is multi-end state, then specially tag the packet
    378 	 * so it will be just passed-through on other interfaces.
    379 	 */
    380 	if (con->c_ifid == 0 && nbuf_add_tag(nbuf, NPF_NTAG_PASS) != 0) {
    381 		npf_conn_release(con);
    382 		*error = ENOMEM;
    383 		return NULL;
    384 	}
    385 	return con;
    386 }
    387 
    388 /*
    389  * npf_conn_establish: create a new connection, insert into the global list.
    390  *
    391  * => Connection is created with the reference held for the caller.
    392  * => Connection will be activated on the first reference release.
    393  */
    394 npf_conn_t *
    395 npf_conn_establish(npf_cache_t *npc, int di, bool global)
    396 {
    397 	npf_t *npf = npc->npc_ctx;
    398 	const unsigned alen = npc->npc_alen;
    399 	const unsigned idx = NPF_CONNCACHE(alen);
    400 	const nbuf_t *nbuf = npc->npc_nbuf;
    401 	npf_connkey_t *fw, *bk;
    402 	npf_conn_t *con;
    403 	int error = 0;
    404 
    405 	KASSERT(!nbuf_flag_p(nbuf, NBUF_DATAREF_RESET));
    406 
    407 	if (!npf_conn_trackable_p(npc)) {
    408 		return NULL;
    409 	}
    410 
    411 	/* Allocate and initialise the new connection. */
    412 	con = pool_cache_get(npf->conn_cache[idx], PR_NOWAIT);
    413 	if (__predict_false(!con)) {
    414 		npf_worker_signal(npf);
    415 		return NULL;
    416 	}
    417 	NPF_PRINTF(("NPF: create conn %p\n", con));
    418 	npf_stats_inc(npf, NPF_STAT_CONN_CREATE);
    419 
    420 	mutex_init(&con->c_lock, MUTEX_DEFAULT, IPL_SOFTNET);
    421 	con->c_flags = (di & PFIL_ALL);
    422 	con->c_refcnt = 0;
    423 	con->c_rproc = NULL;
    424 	con->c_nat = NULL;
    425 
    426 	con->c_proto = npc->npc_proto;
    427 	CTASSERT(sizeof(con->c_proto) >= sizeof(npc->npc_proto));
    428 	con->c_alen = alen;
    429 
    430 	/* Initialize the protocol state. */
    431 	if (!npf_state_init(npc, &con->c_state)) {
    432 		npf_conn_destroy(npf, con);
    433 		return NULL;
    434 	}
    435 	KASSERT(npf_iscached(npc, NPC_IP46));
    436 
    437 	fw = npf_conn_getforwkey(con);
    438 	bk = npf_conn_getbackkey(con, alen);
    439 
    440 	/*
    441 	 * Construct "forwards" and "backwards" keys.  Also, set the
    442 	 * interface ID for this connection (unless it is global).
    443 	 */
    444 	if (!npf_conn_conkey(npc, fw, true) ||
    445 	    !npf_conn_conkey(npc, bk, false)) {
    446 		npf_conn_destroy(npf, con);
    447 		return NULL;
    448 	}
    449 	con->c_ifid = global ? nbuf->nb_ifid : 0;
    450 
    451 	/*
    452 	 * Set last activity time for a new connection and acquire
    453 	 * a reference for the caller before we make it visible.
    454 	 */
    455 	conn_update_atime(con);
    456 	con->c_refcnt = 1;
    457 
    458 	/*
    459 	 * Insert both keys (entries representing directions) of the
    460 	 * connection.  At this point it becomes visible, but we activate
    461 	 * the connection later.
    462 	 */
    463 	mutex_enter(&con->c_lock);
    464 	if (!npf_conndb_insert(npf->conn_db, fw, con, true)) {
    465 		error = EISCONN;
    466 		goto err;
    467 	}
    468 	if (!npf_conndb_insert(npf->conn_db, bk, con, false)) {
    469 		npf_conn_t *ret __diagused;
    470 		ret = npf_conndb_remove(npf->conn_db, fw);
    471 		KASSERT(ret == con);
    472 		error = EISCONN;
    473 		goto err;
    474 	}
    475 err:
    476 	/*
    477 	 * If we have hit the duplicate: mark the connection as expired
    478 	 * and let the G/C thread to take care of it.  We cannot do it
    479 	 * here since there might be references acquired already.
    480 	 */
    481 	if (error) {
    482 		atomic_or_uint(&con->c_flags, CONN_REMOVED | CONN_EXPIRE);
    483 		atomic_dec_uint(&con->c_refcnt);
    484 		npf_stats_inc(npf, NPF_STAT_RACE_CONN);
    485 	} else {
    486 		NPF_PRINTF(("NPF: establish conn %p\n", con));
    487 	}
    488 
    489 	/* Finally, insert into the connection list. */
    490 	npf_conndb_enqueue(npf->conn_db, con);
    491 	mutex_exit(&con->c_lock);
    492 
    493 	return error ? NULL : con;
    494 }
    495 
    496 void
    497 npf_conn_destroy(npf_t *npf, npf_conn_t *con)
    498 {
    499 	const unsigned idx __unused = NPF_CONNCACHE(con->c_alen);
    500 
    501 	KASSERT(con->c_refcnt == 0);
    502 
    503 	if (con->c_nat) {
    504 		/* Release any NAT structures. */
    505 		npf_nat_destroy(con->c_nat);
    506 	}
    507 	if (con->c_rproc) {
    508 		/* Release the rule procedure. */
    509 		npf_rproc_release(con->c_rproc);
    510 	}
    511 
    512 	/* Destroy the state. */
    513 	npf_state_destroy(&con->c_state);
    514 	mutex_destroy(&con->c_lock);
    515 
    516 	/* Free the structure, increase the counter. */
    517 	pool_cache_put(npf->conn_cache[idx], con);
    518 	npf_stats_inc(npf, NPF_STAT_CONN_DESTROY);
    519 	NPF_PRINTF(("NPF: conn %p destroyed\n", con));
    520 }
    521 
    522 /*
    523  * npf_conn_setnat: associate NAT entry with the connection, update and
    524  * re-insert connection entry using the translation values.
    525  *
    526  * => The caller must be holding a reference.
    527  */
    528 int
    529 npf_conn_setnat(const npf_cache_t *npc, npf_conn_t *con,
    530     npf_nat_t *nt, unsigned ntype)
    531 {
    532 	static const u_int nat_type_dimap[] = {
    533 		[NPF_NATOUT] = NPF_DST,
    534 		[NPF_NATIN] = NPF_SRC,
    535 	};
    536 	npf_t *npf = npc->npc_ctx;
    537 	npf_connkey_t key, *fw, *bk;
    538 	npf_conn_t *ret __diagused;
    539 	npf_addr_t *taddr;
    540 	in_port_t tport;
    541 
    542 	KASSERT(con->c_refcnt > 0);
    543 
    544 	npf_nat_gettrans(nt, &taddr, &tport);
    545 	KASSERT(ntype == NPF_NATOUT || ntype == NPF_NATIN);
    546 
    547 	/* Construct a "backwards" key. */
    548 	if (!npf_conn_conkey(npc, &key, false)) {
    549 		return EINVAL;
    550 	}
    551 
    552 	/* Acquire the lock and check for the races. */
    553 	mutex_enter(&con->c_lock);
    554 	if (__predict_false(con->c_flags & CONN_EXPIRE)) {
    555 		/* The connection got expired. */
    556 		mutex_exit(&con->c_lock);
    557 		return EINVAL;
    558 	}
    559 	KASSERT((con->c_flags & CONN_REMOVED) == 0);
    560 
    561 	if (__predict_false(con->c_nat != NULL)) {
    562 		/* Race with a duplicate packet. */
    563 		mutex_exit(&con->c_lock);
    564 		npf_stats_inc(npc->npc_ctx, NPF_STAT_RACE_NAT);
    565 		return EISCONN;
    566 	}
    567 
    568 	/* Remove the "backwards" key. */
    569 	fw = npf_conn_getforwkey(con);
    570 	bk = npf_conn_getbackkey(con, NPF_CONNKEY_ALEN(fw));
    571 	ret = npf_conndb_remove(npf->conn_db, bk);
    572 	KASSERT(ret == con);
    573 
    574 	/* Set the source/destination IDs to the translation values. */
    575 	npf_conn_adjkey(bk, taddr, tport, nat_type_dimap[ntype]);
    576 
    577 	/* Finally, re-insert the "backwards" key. */
    578 	if (!npf_conndb_insert(npf->conn_db, bk, con, false)) {
    579 		/*
    580 		 * Race: we have hit the duplicate, remove the "forwards"
    581 		 * key and expire our connection; it is no longer valid.
    582 		 */
    583 		ret = npf_conndb_remove(npf->conn_db, fw);
    584 		KASSERT(ret == con);
    585 
    586 		atomic_or_uint(&con->c_flags, CONN_REMOVED | CONN_EXPIRE);
    587 		mutex_exit(&con->c_lock);
    588 
    589 		npf_stats_inc(npc->npc_ctx, NPF_STAT_RACE_NAT);
    590 		return EISCONN;
    591 	}
    592 
    593 	/* Associate the NAT entry and release the lock. */
    594 	con->c_nat = nt;
    595 	mutex_exit(&con->c_lock);
    596 	return 0;
    597 }
    598 
    599 /*
    600  * npf_conn_expire: explicitly mark connection as expired.
    601  */
    602 void
    603 npf_conn_expire(npf_conn_t *con)
    604 {
    605 	/* KASSERT(con->c_refcnt > 0); XXX: npf_nat_freepolicy() */
    606 	atomic_or_uint(&con->c_flags, CONN_EXPIRE);
    607 }
    608 
    609 /*
    610  * npf_conn_pass: return true if connection is "pass" one, otherwise false.
    611  */
    612 bool
    613 npf_conn_pass(const npf_conn_t *con, npf_match_info_t *mi, npf_rproc_t **rp)
    614 {
    615 	KASSERT(con->c_refcnt > 0);
    616 	if (__predict_true(con->c_flags & CONN_PASS)) {
    617 		mi->mi_rid = con->c_rid;
    618 		mi->mi_retfl = con->c_retfl;
    619 		*rp = con->c_rproc;
    620 		return true;
    621 	}
    622 	return false;
    623 }
    624 
    625 /*
    626  * npf_conn_setpass: mark connection as a "pass" one and associate the
    627  * rule procedure with it.
    628  */
    629 void
    630 npf_conn_setpass(npf_conn_t *con, const npf_match_info_t *mi, npf_rproc_t *rp)
    631 {
    632 	KASSERT((con->c_flags & CONN_ACTIVE) == 0);
    633 	KASSERT(con->c_refcnt > 0);
    634 	KASSERT(con->c_rproc == NULL);
    635 
    636 	/*
    637 	 * No need for atomic since the connection is not yet active.
    638 	 * If rproc is set, the caller transfers its reference to us,
    639 	 * which will be released on npf_conn_destroy().
    640 	 */
    641 	atomic_or_uint(&con->c_flags, CONN_PASS);
    642 	con->c_rproc = rp;
    643 	if (rp) {
    644 		con->c_rid = mi->mi_rid;
    645 		con->c_retfl = mi->mi_retfl;
    646 	}
    647 }
    648 
    649 /*
    650  * npf_conn_release: release a reference, which might allow G/C thread
    651  * to destroy this connection.
    652  */
    653 void
    654 npf_conn_release(npf_conn_t *con)
    655 {
    656 	if ((con->c_flags & (CONN_ACTIVE | CONN_EXPIRE)) == 0) {
    657 		/* Activate: after this, connection is globally visible. */
    658 		atomic_or_uint(&con->c_flags, CONN_ACTIVE);
    659 	}
    660 	KASSERT(con->c_refcnt > 0);
    661 	atomic_dec_uint(&con->c_refcnt);
    662 }
    663 
    664 /*
    665  * npf_conn_getnat: return associated NAT data entry and indicate
    666  * whether it is a "forwards" or "backwards" stream.
    667  */
    668 npf_nat_t *
    669 npf_conn_getnat(npf_conn_t *con, const int di, bool *forw)
    670 {
    671 	KASSERT(con->c_refcnt > 0);
    672 	*forw = (con->c_flags & PFIL_ALL) == (u_int)di;
    673 	return con->c_nat;
    674 }
    675 
    676 /*
    677  * npf_conn_expired: criterion to check if connection is expired.
    678  */
    679 bool
    680 npf_conn_expired(npf_t *npf, const npf_conn_t *con, uint64_t tsnow)
    681 {
    682 	const int etime = npf_state_etime(npf, &con->c_state, con->c_proto);
    683 	int elapsed;
    684 
    685 	if (__predict_false(con->c_flags & CONN_EXPIRE)) {
    686 		/* Explicitly marked to be expired. */
    687 		return true;
    688 	}
    689 
    690 	/*
    691 	 * Note: another thread may update 'atime' and it might
    692 	 * become greater than 'now'.
    693 	 */
    694 	elapsed = (int64_t)tsnow - con->c_atime;
    695 	return elapsed > etime;
    696 }
    697 
    698 /*
    699  * npf_conn_remove: unlink the connection and mark as expired.
    700  */
    701 void
    702 npf_conn_remove(npf_conndb_t *cd, npf_conn_t *con)
    703 {
    704 	/* Remove both entries of the connection. */
    705 	mutex_enter(&con->c_lock);
    706 	if ((con->c_flags & CONN_REMOVED) == 0) {
    707 		npf_connkey_t *fw, *bk;
    708 		npf_conn_t *ret __diagused;
    709 
    710 		fw = npf_conn_getforwkey(con);
    711 		ret = npf_conndb_remove(cd, fw);
    712 		KASSERT(ret == con);
    713 
    714 		bk = npf_conn_getbackkey(con, NPF_CONNKEY_ALEN(fw));
    715 		ret = npf_conndb_remove(cd, bk);
    716 		KASSERT(ret == con);
    717 	}
    718 
    719 	/* Flag the removal and expiration. */
    720 	atomic_or_uint(&con->c_flags, CONN_REMOVED | CONN_EXPIRE);
    721 	mutex_exit(&con->c_lock);
    722 }
    723 
    724 /*
    725  * npf_conn_worker: G/C to run from a worker thread.
    726  */
    727 void
    728 npf_conn_worker(npf_t *npf)
    729 {
    730 	npf_conndb_gc(npf, npf->conn_db, false, true);
    731 }
    732 
    733 /*
    734  * npf_conndb_export: construct a list of connections prepared for saving.
    735  * Note: this is expected to be an expensive operation.
    736  */
    737 int
    738 npf_conndb_export(npf_t *npf, nvlist_t *npf_dict)
    739 {
    740 	npf_conn_t *head, *con;
    741 
    742 	/*
    743 	 * Note: acquire conn_lock to prevent from the database
    744 	 * destruction and G/C thread.
    745 	 */
    746 	mutex_enter(&npf->conn_lock);
    747 	if (npf->conn_tracking != CONN_TRACKING_ON) {
    748 		mutex_exit(&npf->conn_lock);
    749 		return 0;
    750 	}
    751 	head = npf_conndb_getlist(npf->conn_db);
    752 	con = head;
    753 	while (con) {
    754 		nvlist_t *cdict;
    755 
    756 		if ((cdict = npf_conn_export(npf, con)) != NULL) {
    757 			nvlist_append_nvlist_array(npf_dict, "conn-list", cdict);
    758 			nvlist_destroy(cdict);
    759 		}
    760 		if ((con = npf_conndb_getnext(npf->conn_db, con)) == head) {
    761 			break;
    762 		}
    763 	}
    764 	mutex_exit(&npf->conn_lock);
    765 	return 0;
    766 }
    767 
    768 /*
    769  * npf_conn_export: serialise a single connection.
    770  */
    771 static nvlist_t *
    772 npf_conn_export(npf_t *npf, npf_conn_t *con)
    773 {
    774 	nvlist_t *cdict, *kdict;
    775 	npf_connkey_t *fw, *bk;
    776 	unsigned alen;
    777 
    778 	if ((con->c_flags & (CONN_ACTIVE|CONN_EXPIRE)) != CONN_ACTIVE) {
    779 		return NULL;
    780 	}
    781 	cdict = nvlist_create(0);
    782 	nvlist_add_number(cdict, "flags", con->c_flags);
    783 	nvlist_add_number(cdict, "proto", con->c_proto);
    784 	if (con->c_ifid) {
    785 		char ifname[IFNAMSIZ];
    786 		npf_ifmap_copyname(npf, con->c_ifid, ifname, sizeof(ifname));
    787 		nvlist_add_string(cdict, "ifname", ifname);
    788 	}
    789 	nvlist_add_binary(cdict, "state", &con->c_state, sizeof(npf_state_t));
    790 
    791 	fw = npf_conn_getforwkey(con);
    792 	alen = NPF_CONNKEY_ALEN(fw);
    793 	KASSERT(alen == con->c_alen);
    794 	bk = npf_conn_getbackkey(con, alen);
    795 
    796 	kdict = npf_connkey_export(fw);
    797 	nvlist_move_nvlist(cdict, "forw-key", kdict);
    798 
    799 	kdict = npf_connkey_export(bk);
    800 	nvlist_move_nvlist(cdict, "back-key", kdict);
    801 
    802 	/* Let the address length be based on on first key. */
    803 	nvlist_add_number(cdict, "alen", alen);
    804 
    805 	if (con->c_nat) {
    806 		npf_nat_export(cdict, con->c_nat);
    807 	}
    808 	return cdict;
    809 }
    810 
    811 /*
    812  * npf_conn_import: fully reconstruct a single connection from a
    813  * nvlist and insert into the given database.
    814  */
    815 int
    816 npf_conn_import(npf_t *npf, npf_conndb_t *cd, const nvlist_t *cdict,
    817     npf_ruleset_t *natlist)
    818 {
    819 	npf_conn_t *con;
    820 	npf_connkey_t *fw, *bk;
    821 	const nvlist_t *nat, *conkey;
    822 	const char *ifname;
    823 	const void *state;
    824 	unsigned alen, idx;
    825 	size_t len;
    826 
    827 	/*
    828 	 * To determine the length of the connection, which depends
    829 	 * on the address length in the connection keys.
    830 	 */
    831 	alen = dnvlist_get_number(cdict, "alen", 0);
    832 	idx = NPF_CONNCACHE(alen);
    833 
    834 	/* Allocate a connection and initialise it (clear first). */
    835 	con = pool_cache_get(npf->conn_cache[idx], PR_WAITOK);
    836 	memset(con, 0, sizeof(npf_conn_t));
    837 	mutex_init(&con->c_lock, MUTEX_DEFAULT, IPL_SOFTNET);
    838 	npf_stats_inc(npf, NPF_STAT_CONN_CREATE);
    839 
    840 	con->c_proto = dnvlist_get_number(cdict, "proto", 0);
    841 	con->c_flags = dnvlist_get_number(cdict, "flags", 0);
    842 	con->c_flags &= PFIL_ALL | CONN_ACTIVE | CONN_PASS;
    843 	conn_update_atime(con);
    844 
    845 	ifname = dnvlist_get_string(cdict, "ifname", NULL);
    846 	if (ifname && (con->c_ifid = npf_ifmap_register(npf, ifname)) == 0) {
    847 		goto err;
    848 	}
    849 
    850 	state = dnvlist_get_binary(cdict, "state", &len, NULL, 0);
    851 	if (!state || len != sizeof(npf_state_t)) {
    852 		goto err;
    853 	}
    854 	memcpy(&con->c_state, state, sizeof(npf_state_t));
    855 
    856 	/* Reconstruct NAT association, if any. */
    857 	if ((nat = dnvlist_get_nvlist(cdict, "nat", NULL)) != NULL &&
    858 	    (con->c_nat = npf_nat_import(npf, nat, natlist, con)) == NULL) {
    859 		goto err;
    860 	}
    861 
    862 	/*
    863 	 * Fetch and copy the keys for each direction.
    864 	 */
    865 	fw = npf_conn_getforwkey(con);
    866 	conkey = dnvlist_get_nvlist(cdict, "forw-key", NULL);
    867 	if (conkey == NULL || !npf_connkey_import(conkey, fw)) {
    868 		goto err;
    869 	}
    870 	bk = npf_conn_getbackkey(con, NPF_CONNKEY_ALEN(fw));
    871 	conkey = dnvlist_get_nvlist(cdict, "back-key", NULL);
    872 	if (conkey == NULL || !npf_connkey_import(conkey, bk)) {
    873 		goto err;
    874 	}
    875 
    876 	/* Guard against the contradicting address lengths. */
    877 	if (NPF_CONNKEY_ALEN(fw) != alen || NPF_CONNKEY_ALEN(bk) != alen) {
    878 		goto err;
    879 	}
    880 
    881 	/* Insert the entries and the connection itself. */
    882 	if (!npf_conndb_insert(cd, fw, con, true)) {
    883 		goto err;
    884 	}
    885 	if (!npf_conndb_insert(cd, bk, con, false)) {
    886 		npf_conndb_remove(cd, fw);
    887 		goto err;
    888 	}
    889 
    890 	NPF_PRINTF(("NPF: imported conn %p\n", con));
    891 	npf_conndb_enqueue(cd, con);
    892 	return 0;
    893 err:
    894 	npf_conn_destroy(npf, con);
    895 	return EINVAL;
    896 }
    897 
    898 int
    899 npf_conn_find(npf_t *npf, const nvlist_t *idict, nvlist_t **odict)
    900 {
    901 	const nvlist_t *kdict;
    902 	npf_connkey_t key;
    903 	npf_conn_t *con;
    904 	uint16_t dir;
    905 	bool forw;
    906 
    907 	kdict = dnvlist_get_nvlist(idict, "key", NULL);
    908 	if (!kdict || !npf_connkey_import(kdict, &key)) {
    909 		return EINVAL;
    910 	}
    911 	con = npf_conndb_lookup(npf->conn_db, &key, &forw);
    912 	if (con == NULL) {
    913 		return ESRCH;
    914 	}
    915 	dir = dnvlist_get_number(idict, "direction", 0);
    916 	if (!npf_conn_check(con, NULL, dir, true)) {
    917 		atomic_dec_uint(&con->c_refcnt);
    918 		return ESRCH;
    919 	}
    920 	*odict = npf_conn_export(npf, con);
    921 	atomic_dec_uint(&con->c_refcnt);
    922 	return *odict ? 0 : ENOSPC;
    923 }
    924 
    925 #if defined(DDB) || defined(_NPF_TESTING)
    926 
    927 void
    928 npf_conn_print(npf_conn_t *con)
    929 {
    930 	const npf_connkey_t *fw = npf_conn_getforwkey(con);
    931 	const npf_connkey_t *bk = npf_conn_getbackkey(con, NPF_CONNKEY_ALEN(fw));
    932 	const unsigned proto = con->c_proto;
    933 	struct timespec tspnow;
    934 
    935 	getnanouptime(&tspnow);
    936 	printf("%p:\n\tproto %d flags 0x%x tsdiff %ld etime %d\n", con,
    937 	    proto, con->c_flags, (long)(tspnow.tv_sec - con->c_atime),
    938 	    npf_state_etime(npf_getkernctx(), &con->c_state, proto));
    939 	npf_connkey_print(fw);
    940 	npf_connkey_print(bk);
    941 	npf_state_dump(&con->c_state);
    942 	if (con->c_nat) {
    943 		npf_nat_dump(con->c_nat);
    944 	}
    945 }
    946 
    947 #endif
    948