Home | History | Annotate | Line # | Download | only in npf
npf_conn.c revision 1.27
      1 /*-
      2  * Copyright (c) 2014-2018 Mindaugas Rasiukevicius <rmind at netbsd org>
      3  * Copyright (c) 2010-2014 The NetBSD Foundation, Inc.
      4  * All rights reserved.
      5  *
      6  * This material is based upon work partially supported by The
      7  * NetBSD Foundation under a contract with Mindaugas Rasiukevicius.
      8  *
      9  * Redistribution and use in source and binary forms, with or without
     10  * modification, are permitted provided that the following conditions
     11  * are met:
     12  * 1. Redistributions of source code must retain the above copyright
     13  *    notice, this list of conditions and the following disclaimer.
     14  * 2. Redistributions in binary form must reproduce the above copyright
     15  *    notice, this list of conditions and the following disclaimer in the
     16  *    documentation and/or other materials provided with the distribution.
     17  *
     18  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
     19  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
     20  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     21  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
     22  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
     23  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
     24  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
     25  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
     26  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
     27  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
     28  * POSSIBILITY OF SUCH DAMAGE.
     29  */
     30 
     31 /*
     32  * NPF connection tracking for stateful filtering and translation.
     33  *
     34  * Overview
     35  *
     36  *	Packets can be incoming or outgoing with respect to an interface.
     37  *	Connection direction is identified by the direction of its first
     38  *	packet.  The meaning of incoming/outgoing packet in the context of
     39  *	connection direction can be confusing.  Therefore, we will use the
     40  *	terms "forwards stream" and "backwards stream", where packets in
     41  *	the forwards stream mean the packets travelling in the same
     42  *	direction as the connection direction.
     43  *
     44  *	All connections have two keys and thus two entries:
     45  *
     46  *	- npf_conn_getforwkey(con)        -- for the forwards stream;
     47  *	- npf_conn_getbackkey(con, alen)  -- for the backwards stream.
     48  *
     49  *	Note: the keys are stored in npf_conn_t::c_keys[], which is used
     50  *	to allocate variable-length npf_conn_t structures based on whether
     51  *	the IPv4 or IPv6 addresses are used.  See the npf_connkey.c source
     52  *	file for the description of the key layouts.
     53  *
     54  *	The keys are formed from the 5-tuple (source/destination address,
     55  *	source/destination port and the protocol).  Additional matching
     56  *	is performed for the interface (a common behaviour is equivalent
     57  *	to the 6-tuple lookup including the interface ID).  Note that the
     58  *	key may be formed using translated values in a case of NAT.
     59  *
     60  *	Connections can serve two purposes: for the implicit passing or
     61  *	to accommodate the dynamic NAT.  Connections for the former purpose
     62  *	are created by the rules with "stateful" attribute and are used for
     63  *	stateful filtering.  Such connections indicate that the packet of
     64  *	the backwards stream should be passed without inspection of the
     65  *	ruleset.  The other purpose is to associate a dynamic NAT mechanism
     66  *	with a connection.  Such connections are created by the NAT policies
     67  *	and they have a relationship with NAT translation structure via
     68  *	npf_conn_t::c_nat.  A single connection can serve both purposes,
     69  *	which is a common case.
     70  *
     71  * Connection life-cycle
     72  *
     73  *	Connections are established when a packet matches said rule or
     74  *	NAT policy.  Both keys of the established connection are inserted
     75  *	into the connection database.  A garbage collection thread
     76  *	periodically scans all connections and depending on connection
     77  *	properties (e.g. last activity time, protocol) removes connection
     78  *	entries and expires the actual connections.
     79  *
     80  *	Each connection has a reference count.  The reference is acquired
     81  *	on lookup and should be released by the caller.  It guarantees that
     82  *	the connection will not be destroyed, although it may be expired.
     83  *
     84  * Synchronisation
     85  *
     86  *	Connection database is accessed in a lock-less manner by the main
     87  *	routines: npf_conn_inspect() and npf_conn_establish().  Since they
     88  *	are always called from a software interrupt, the database is
     89  *	protected using passive serialisation.  The main place which can
     90  *	destroy a connection is npf_conn_worker().  The database itself
     91  *	can be replaced and destroyed in npf_conn_load().
     92  *
     93  * ALG support
     94  *
     95  *	Application-level gateways (ALGs) can override generic connection
     96  *	inspection (npf_alg_conn() call in npf_conn_inspect() function) by
     97  *	performing their own lookup using different key.  Recursive call
     98  *	to npf_conn_inspect() is not allowed.  The ALGs ought to use the
     99  *	npf_conn_lookup() function for this purpose.
    100  *
    101  * Lock order
    102  *
    103  *	npf_config_lock ->
    104  *		conn_lock ->
    105  *			npf_conn_t::c_lock
    106  */
    107 
    108 #ifdef _KERNEL
    109 #include <sys/cdefs.h>
    110 __KERNEL_RCSID(0, "$NetBSD: npf_conn.c,v 1.27 2019/07/23 00:52:01 rmind Exp $");
    111 
    112 #include <sys/param.h>
    113 #include <sys/types.h>
    114 
    115 #include <netinet/in.h>
    116 #include <netinet/tcp.h>
    117 
    118 #include <sys/atomic.h>
    119 #include <sys/kmem.h>
    120 #include <sys/mutex.h>
    121 #include <net/pfil.h>
    122 #include <sys/pool.h>
    123 #include <sys/queue.h>
    124 #include <sys/systm.h>
    125 #endif
    126 
    127 #define __NPF_CONN_PRIVATE
    128 #include "npf_conn.h"
    129 #include "npf_impl.h"
    130 
/*
 * A helper to select the IPv4 or IPv6 connection cache: maps the address
 * length to an index into npf_t::conn_cache[] by testing bit 4 of the
 * length (4 -> 0, 16 -> 1).
 */
#define	NPF_CONNCACHE(alen)	(((alen) >> 4) & 0x1)

/*
 * Connection flags: PFIL_IN and PFIL_OUT values are reserved for direction.
 * The assertion below pins PFIL_ALL to the two lowest bits, which is why
 * the CONN_* flags start at 0x004.
 */
CTASSERT(PFIL_ALL == (0x001 | 0x002));
#define	CONN_ACTIVE	0x004	/* visible on inspection */
#define	CONN_PASS	0x008	/* perform implicit passing */
#define	CONN_EXPIRE	0x010	/* explicitly expire */
#define	CONN_REMOVED	0x020	/* "forw/back" entries removed */

/* Values stored in npf_t::conn_tracking. */
enum { CONN_TRACKING_OFF, CONN_TRACKING_ON };

/* Forward declaration: defined below, used earlier by npf_conndb_export(). */
static nvlist_t *npf_conn_export(npf_t *, npf_conn_t *);
    146 
    147 /*
    148  * npf_conn_sys{init,fini}: initialise/destroy connection tracking.
    149  */
    150 
    151 void
    152 npf_conn_init(npf_t *npf, int flags)
    153 {
    154 	npf->conn_cache[0] = pool_cache_init(
    155 	    offsetof(npf_conn_t, c_keys[NPF_CONNKEY_V4WORDS * 2]),
    156 	    0, 0, 0, "npfcn4pl", NULL, IPL_NET, NULL, NULL, NULL);
    157 	npf->conn_cache[1] = pool_cache_init(
    158 	    offsetof(npf_conn_t, c_keys[NPF_CONNKEY_V6WORDS * 2]),
    159 	    0, 0, 0, "npfcn6pl", NULL, IPL_NET, NULL, NULL, NULL);
    160 
    161 	mutex_init(&npf->conn_lock, MUTEX_DEFAULT, IPL_NONE);
    162 	npf->conn_tracking = CONN_TRACKING_OFF;
    163 	npf->conn_db = npf_conndb_create();
    164 
    165 	if ((flags & NPF_NO_GC) == 0) {
    166 		npf_worker_register(npf, npf_conn_worker);
    167 	}
    168 	npf_conndb_sysinit(npf);
    169 }
    170 
    171 void
    172 npf_conn_fini(npf_t *npf)
    173 {
    174 	npf_conndb_sysfini(npf);
    175 
    176 	/* Note: the caller should have flushed the connections. */
    177 	KASSERT(npf->conn_tracking == CONN_TRACKING_OFF);
    178 	npf_worker_unregister(npf, npf_conn_worker);
    179 
    180 	npf_conndb_destroy(npf->conn_db);
    181 	pool_cache_destroy(npf->conn_cache[0]);
    182 	pool_cache_destroy(npf->conn_cache[1]);
    183 	mutex_destroy(&npf->conn_lock);
    184 }
    185 
    186 /*
    187  * npf_conn_load: perform the load by flushing the current connection
    188  * database and replacing it with the new one or just destroying.
    189  *
    190  * => The caller must disable the connection tracking and ensure that
    191  *    there are no connection database lookups or references in-flight.
    192  */
    193 void
    194 npf_conn_load(npf_t *npf, npf_conndb_t *ndb, bool track)
    195 {
    196 	npf_conndb_t *odb = NULL;
    197 
    198 	KASSERT(npf_config_locked_p(npf));
    199 
    200 	/*
    201 	 * The connection database is in the quiescent state.
    202 	 * Prevent G/C thread from running and install a new database.
    203 	 */
    204 	mutex_enter(&npf->conn_lock);
    205 	if (ndb) {
    206 		KASSERT(npf->conn_tracking == CONN_TRACKING_OFF);
    207 		odb = npf->conn_db;
    208 		npf->conn_db = ndb;
    209 		membar_sync();
    210 	}
    211 	if (track) {
    212 		/* After this point lookups start flying in. */
    213 		npf->conn_tracking = CONN_TRACKING_ON;
    214 	}
    215 	mutex_exit(&npf->conn_lock);
    216 
    217 	if (odb) {
    218 		/*
    219 		 * Flush all, no sync since the caller did it for us.
    220 		 * Also, release the pool cache memory.
    221 		 */
    222 		npf_conndb_gc(npf, odb, true, false);
    223 		npf_conndb_destroy(odb);
    224 		pool_cache_invalidate(npf->conn_cache[0]);
    225 		pool_cache_invalidate(npf->conn_cache[1]);
    226 	}
    227 }
    228 
    229 /*
    230  * npf_conn_tracking: enable/disable connection tracking.
    231  */
    232 void
    233 npf_conn_tracking(npf_t *npf, bool track)
    234 {
    235 	KASSERT(npf_config_locked_p(npf));
    236 	npf->conn_tracking = track ? CONN_TRACKING_ON : CONN_TRACKING_OFF;
    237 }
    238 
    239 static inline bool
    240 npf_conn_trackable_p(const npf_cache_t *npc)
    241 {
    242 	const npf_t *npf = npc->npc_ctx;
    243 
    244 	/*
    245 	 * Check if connection tracking is on.  Also, if layer 3 and 4 are
    246 	 * not cached - protocol is not supported or packet is invalid.
    247 	 */
    248 	if (npf->conn_tracking != CONN_TRACKING_ON) {
    249 		return false;
    250 	}
    251 	if (!npf_iscached(npc, NPC_IP46) || !npf_iscached(npc, NPC_LAYER4)) {
    252 		return false;
    253 	}
    254 	return true;
    255 }
    256 
    257 static inline void
    258 conn_update_atime(npf_conn_t *con)
    259 {
    260 	struct timespec tsnow;
    261 
    262 	getnanouptime(&tsnow);
    263 	con->c_atime = tsnow.tv_sec;
    264 }
    265 
    266 /*
    267  * npf_conn_check: check that:
    268  *
    269  *	- the connection is active;
    270  *
    271  *	- the packet is travelling in the right direction with the respect
    272  *	  to the connection direction (if interface-id is not zero);
    273  *
    274  *	- the packet is travelling on the same interface as the
    275  *	  connection interface (if interface-id is not zero).
    276  */
    277 static bool
    278 npf_conn_check(const npf_conn_t *con, const nbuf_t *nbuf,
    279     const unsigned di, const bool forw)
    280 {
    281 	const uint32_t flags = con->c_flags;
    282 	const unsigned ifid = con->c_ifid;
    283 	bool active, pforw;
    284 
    285 	active = (flags & (CONN_ACTIVE | CONN_EXPIRE)) == CONN_ACTIVE;
    286 	if (__predict_false(!active)) {
    287 		return false;
    288 	}
    289 	if (ifid && nbuf) {
    290 		pforw = (flags & PFIL_ALL) == (unsigned)di;
    291 		if (__predict_false(forw != pforw)) {
    292 			return false;
    293 		}
    294 		if (__predict_false(ifid != nbuf->nb_ifid)) {
    295 			return false;
    296 		}
    297 	}
    298 	return true;
    299 }
    300 
    301 /*
    302  * npf_conn_lookup: lookup if there is an established connection.
    303  *
    304  * => If found, we will hold a reference for the caller.
    305  */
    306 npf_conn_t *
    307 npf_conn_lookup(const npf_cache_t *npc, const int di, bool *forw)
    308 {
    309 	npf_t *npf = npc->npc_ctx;
    310 	const nbuf_t *nbuf = npc->npc_nbuf;
    311 	npf_conn_t *con;
    312 	npf_connkey_t key;
    313 
    314 	/* Construct a key and lookup for a connection in the store. */
    315 	if (!npf_conn_conkey(npc, &key, true)) {
    316 		return NULL;
    317 	}
    318 	con = npf_conndb_lookup(npf->conn_db, &key, forw);
    319 	if (con == NULL) {
    320 		return NULL;
    321 	}
    322 	KASSERT(npc->npc_proto == con->c_proto);
    323 
    324 	/* Extra checks for the connection and packet. */
    325 	if (!npf_conn_check(con, nbuf, di, *forw)) {
    326 		atomic_dec_uint(&con->c_refcnt);
    327 		return NULL;
    328 	}
    329 
    330 	/* Update the last activity time. */
    331 	conn_update_atime(con);
    332 	return con;
    333 }
    334 
    335 /*
    336  * npf_conn_inspect: lookup a connection and inspecting the protocol data.
    337  *
    338  * => If found, we will hold a reference for the caller.
    339  */
    340 npf_conn_t *
    341 npf_conn_inspect(npf_cache_t *npc, const int di, int *error)
    342 {
    343 	nbuf_t *nbuf = npc->npc_nbuf;
    344 	npf_conn_t *con;
    345 	bool forw, ok;
    346 
    347 	KASSERT(!nbuf_flag_p(nbuf, NBUF_DATAREF_RESET));
    348 	if (!npf_conn_trackable_p(npc)) {
    349 		return NULL;
    350 	}
    351 
    352 	/* Query ALG which may lookup connection for us. */
    353 	if ((con = npf_alg_conn(npc, di)) != NULL) {
    354 		/* Note: reference is held. */
    355 		return con;
    356 	}
    357 	if (nbuf_head_mbuf(nbuf) == NULL) {
    358 		*error = ENOMEM;
    359 		return NULL;
    360 	}
    361 	KASSERT(!nbuf_flag_p(nbuf, NBUF_DATAREF_RESET));
    362 
    363 	/* Main lookup of the connection. */
    364 	if ((con = npf_conn_lookup(npc, di, &forw)) == NULL) {
    365 		return NULL;
    366 	}
    367 
    368 	/* Inspect the protocol data and handle state changes. */
    369 	mutex_enter(&con->c_lock);
    370 	ok = npf_state_inspect(npc, &con->c_state, forw);
    371 	mutex_exit(&con->c_lock);
    372 
    373 	/* If invalid state: let the rules deal with it. */
    374 	if (__predict_false(!ok)) {
    375 		npf_conn_release(con);
    376 		npf_stats_inc(npc->npc_ctx, NPF_STAT_INVALID_STATE);
    377 		return NULL;
    378 	}
    379 
    380 	/*
    381 	 * If this is multi-end state, then specially tag the packet
    382 	 * so it will be just passed-through on other interfaces.
    383 	 */
    384 	if (con->c_ifid == 0 && nbuf_add_tag(nbuf, NPF_NTAG_PASS) != 0) {
    385 		npf_conn_release(con);
    386 		*error = ENOMEM;
    387 		return NULL;
    388 	}
    389 	return con;
    390 }
    391 
    392 /*
    393  * npf_conn_establish: create a new connection, insert into the global list.
    394  *
    395  * => Connection is created with the reference held for the caller.
    396  * => Connection will be activated on the first reference release.
    397  */
    398 npf_conn_t *
    399 npf_conn_establish(npf_cache_t *npc, int di, bool global)
    400 {
    401 	npf_t *npf = npc->npc_ctx;
    402 	const unsigned alen = npc->npc_alen;
    403 	const unsigned idx = NPF_CONNCACHE(alen);
    404 	const nbuf_t *nbuf = npc->npc_nbuf;
    405 	npf_connkey_t *fw, *bk;
    406 	npf_conn_t *con;
    407 	int error = 0;
    408 
    409 	KASSERT(!nbuf_flag_p(nbuf, NBUF_DATAREF_RESET));
    410 
    411 	if (!npf_conn_trackable_p(npc)) {
    412 		return NULL;
    413 	}
    414 
    415 	/* Allocate and initialise the new connection. */
    416 	con = pool_cache_get(npf->conn_cache[idx], PR_NOWAIT);
    417 	if (__predict_false(!con)) {
    418 		npf_worker_signal(npf);
    419 		return NULL;
    420 	}
    421 	NPF_PRINTF(("NPF: create conn %p\n", con));
    422 	npf_stats_inc(npf, NPF_STAT_CONN_CREATE);
    423 
    424 	mutex_init(&con->c_lock, MUTEX_DEFAULT, IPL_SOFTNET);
    425 	con->c_flags = (di & PFIL_ALL);
    426 	con->c_refcnt = 0;
    427 	con->c_rproc = NULL;
    428 	con->c_nat = NULL;
    429 
    430 	con->c_proto = npc->npc_proto;
    431 	CTASSERT(sizeof(con->c_proto) >= sizeof(npc->npc_proto));
    432 
    433 	/* Initialize the protocol state. */
    434 	if (!npf_state_init(npc, &con->c_state)) {
    435 		npf_conn_destroy(npf, con);
    436 		return NULL;
    437 	}
    438 	KASSERT(npf_iscached(npc, NPC_IP46));
    439 
    440 	fw = npf_conn_getforwkey(con);
    441 	bk = npf_conn_getbackkey(con, alen);
    442 
    443 	/*
    444 	 * Construct "forwards" and "backwards" keys.  Also, set the
    445 	 * interface ID for this connection (unless it is global).
    446 	 */
    447 	if (!npf_conn_conkey(npc, fw, true) ||
    448 	    !npf_conn_conkey(npc, bk, false)) {
    449 		npf_conn_destroy(npf, con);
    450 		return NULL;
    451 	}
    452 	con->c_ifid = global ? nbuf->nb_ifid : 0;
    453 
    454 	/*
    455 	 * Set last activity time for a new connection and acquire
    456 	 * a reference for the caller before we make it visible.
    457 	 */
    458 	conn_update_atime(con);
    459 	con->c_refcnt = 1;
    460 
    461 	/*
    462 	 * Insert both keys (entries representing directions) of the
    463 	 * connection.  At this point it becomes visible, but we activate
    464 	 * the connection later.
    465 	 */
    466 	mutex_enter(&con->c_lock);
    467 	if (!npf_conndb_insert(npf->conn_db, fw, con, true)) {
    468 		error = EISCONN;
    469 		goto err;
    470 	}
    471 	if (!npf_conndb_insert(npf->conn_db, bk, con, false)) {
    472 		npf_conn_t *ret __diagused;
    473 		ret = npf_conndb_remove(npf->conn_db, fw);
    474 		KASSERT(ret == con);
    475 		error = EISCONN;
    476 		goto err;
    477 	}
    478 err:
    479 	/*
    480 	 * If we have hit the duplicate: mark the connection as expired
    481 	 * and let the G/C thread to take care of it.  We cannot do it
    482 	 * here since there might be references acquired already.
    483 	 */
    484 	if (error) {
    485 		atomic_or_uint(&con->c_flags, CONN_REMOVED | CONN_EXPIRE);
    486 		atomic_dec_uint(&con->c_refcnt);
    487 		npf_stats_inc(npf, NPF_STAT_RACE_CONN);
    488 	} else {
    489 		NPF_PRINTF(("NPF: establish conn %p\n", con));
    490 	}
    491 
    492 	/* Finally, insert into the connection list. */
    493 	npf_conndb_enqueue(npf->conn_db, con);
    494 	mutex_exit(&con->c_lock);
    495 
    496 	return error ? NULL : con;
    497 }
    498 
    499 void
    500 npf_conn_destroy(npf_t *npf, npf_conn_t *con)
    501 {
    502 	const npf_connkey_t *key = npf_conn_getforwkey(con);
    503 	const unsigned alen = NPF_CONNKEY_ALEN(key);
    504 	const unsigned idx __unused = NPF_CONNCACHE(alen);
    505 
    506 	KASSERT(con->c_refcnt == 0);
    507 
    508 	if (con->c_nat) {
    509 		/* Release any NAT structures. */
    510 		npf_nat_destroy(con->c_nat);
    511 	}
    512 	if (con->c_rproc) {
    513 		/* Release the rule procedure. */
    514 		npf_rproc_release(con->c_rproc);
    515 	}
    516 
    517 	/* Destroy the state. */
    518 	npf_state_destroy(&con->c_state);
    519 	mutex_destroy(&con->c_lock);
    520 
    521 	/* Free the structure, increase the counter. */
    522 	pool_cache_put(npf->conn_cache[idx], con);
    523 	npf_stats_inc(npf, NPF_STAT_CONN_DESTROY);
    524 	NPF_PRINTF(("NPF: conn %p destroyed\n", con));
    525 }
    526 
    527 /*
    528  * npf_conn_setnat: associate NAT entry with the connection, update and
    529  * re-insert connection entry using the translation values.
    530  *
    531  * => The caller must be holding a reference.
    532  */
    533 int
    534 npf_conn_setnat(const npf_cache_t *npc, npf_conn_t *con,
    535     npf_nat_t *nt, unsigned ntype)
    536 {
    537 	static const u_int nat_type_dimap[] = {
    538 		[NPF_NATOUT] = NPF_DST,
    539 		[NPF_NATIN] = NPF_SRC,
    540 	};
    541 	npf_t *npf = npc->npc_ctx;
    542 	npf_connkey_t key, *fw, *bk;
    543 	npf_conn_t *ret __diagused;
    544 	npf_addr_t *taddr;
    545 	in_port_t tport;
    546 
    547 	KASSERT(con->c_refcnt > 0);
    548 
    549 	npf_nat_gettrans(nt, &taddr, &tport);
    550 	KASSERT(ntype == NPF_NATOUT || ntype == NPF_NATIN);
    551 
    552 	/* Construct a "backwards" key. */
    553 	if (!npf_conn_conkey(npc, &key, false)) {
    554 		return EINVAL;
    555 	}
    556 
    557 	/* Acquire the lock and check for the races. */
    558 	mutex_enter(&con->c_lock);
    559 	if (__predict_false(con->c_flags & CONN_EXPIRE)) {
    560 		/* The connection got expired. */
    561 		mutex_exit(&con->c_lock);
    562 		return EINVAL;
    563 	}
    564 	KASSERT((con->c_flags & CONN_REMOVED) == 0);
    565 
    566 	if (__predict_false(con->c_nat != NULL)) {
    567 		/* Race with a duplicate packet. */
    568 		mutex_exit(&con->c_lock);
    569 		npf_stats_inc(npc->npc_ctx, NPF_STAT_RACE_NAT);
    570 		return EISCONN;
    571 	}
    572 
    573 	/* Remove the "backwards" key. */
    574 	fw = npf_conn_getforwkey(con);
    575 	bk = npf_conn_getbackkey(con, NPF_CONNKEY_ALEN(fw));
    576 	ret = npf_conndb_remove(npf->conn_db, bk);
    577 	KASSERT(ret == con);
    578 
    579 	/* Set the source/destination IDs to the translation values. */
    580 	npf_conn_adjkey(bk, taddr, tport, nat_type_dimap[ntype]);
    581 
    582 	/* Finally, re-insert the "backwards" key. */
    583 	if (!npf_conndb_insert(npf->conn_db, bk, con, false)) {
    584 		/*
    585 		 * Race: we have hit the duplicate, remove the "forwards"
    586 		 * key and expire our connection; it is no longer valid.
    587 		 */
    588 		ret = npf_conndb_remove(npf->conn_db, fw);
    589 		KASSERT(ret == con);
    590 
    591 		atomic_or_uint(&con->c_flags, CONN_REMOVED | CONN_EXPIRE);
    592 		mutex_exit(&con->c_lock);
    593 
    594 		npf_stats_inc(npc->npc_ctx, NPF_STAT_RACE_NAT);
    595 		return EISCONN;
    596 	}
    597 
    598 	/* Associate the NAT entry and release the lock. */
    599 	con->c_nat = nt;
    600 	mutex_exit(&con->c_lock);
    601 	return 0;
    602 }
    603 
    604 /*
    605  * npf_conn_expire: explicitly mark connection as expired.
    606  */
    607 void
    608 npf_conn_expire(npf_conn_t *con)
    609 {
    610 	/* KASSERT(con->c_refcnt > 0); XXX: npf_nat_freepolicy() */
    611 	atomic_or_uint(&con->c_flags, CONN_EXPIRE);
    612 }
    613 
    614 /*
    615  * npf_conn_pass: return true if connection is "pass" one, otherwise false.
    616  */
    617 bool
    618 npf_conn_pass(const npf_conn_t *con, npf_match_info_t *mi, npf_rproc_t **rp)
    619 {
    620 	KASSERT(con->c_refcnt > 0);
    621 	if (__predict_true(con->c_flags & CONN_PASS)) {
    622 		mi->mi_rid = con->c_rid;
    623 		mi->mi_retfl = con->c_retfl;
    624 		*rp = con->c_rproc;
    625 		return true;
    626 	}
    627 	return false;
    628 }
    629 
    630 /*
    631  * npf_conn_setpass: mark connection as a "pass" one and associate the
    632  * rule procedure with it.
    633  */
    634 void
    635 npf_conn_setpass(npf_conn_t *con, const npf_match_info_t *mi, npf_rproc_t *rp)
    636 {
    637 	KASSERT((con->c_flags & CONN_ACTIVE) == 0);
    638 	KASSERT(con->c_refcnt > 0);
    639 	KASSERT(con->c_rproc == NULL);
    640 
    641 	/*
    642 	 * No need for atomic since the connection is not yet active.
    643 	 * If rproc is set, the caller transfers its reference to us,
    644 	 * which will be released on npf_conn_destroy().
    645 	 */
    646 	atomic_or_uint(&con->c_flags, CONN_PASS);
    647 	con->c_rproc = rp;
    648 	if (rp) {
    649 		con->c_rid = mi->mi_rid;
    650 		con->c_retfl = mi->mi_retfl;
    651 	}
    652 }
    653 
    654 /*
    655  * npf_conn_release: release a reference, which might allow G/C thread
    656  * to destroy this connection.
    657  */
    658 void
    659 npf_conn_release(npf_conn_t *con)
    660 {
    661 	if ((con->c_flags & (CONN_ACTIVE | CONN_EXPIRE)) == 0) {
    662 		/* Activate: after this, connection is globally visible. */
    663 		atomic_or_uint(&con->c_flags, CONN_ACTIVE);
    664 	}
    665 	KASSERT(con->c_refcnt > 0);
    666 	atomic_dec_uint(&con->c_refcnt);
    667 }
    668 
    669 /*
    670  * npf_conn_getnat: return associated NAT data entry and indicate
    671  * whether it is a "forwards" or "backwards" stream.
    672  */
    673 npf_nat_t *
    674 npf_conn_getnat(npf_conn_t *con, const int di, bool *forw)
    675 {
    676 	KASSERT(con->c_refcnt > 0);
    677 	*forw = (con->c_flags & PFIL_ALL) == (u_int)di;
    678 	return con->c_nat;
    679 }
    680 
    681 /*
    682  * npf_conn_expired: criterion to check if connection is expired.
    683  */
    684 bool
    685 npf_conn_expired(npf_t *npf, const npf_conn_t *con, uint64_t tsnow)
    686 {
    687 	const int etime = npf_state_etime(npf, &con->c_state, con->c_proto);
    688 	int elapsed;
    689 
    690 	if (__predict_false(con->c_flags & CONN_EXPIRE)) {
    691 		/* Explicitly marked to be expired. */
    692 		return true;
    693 	}
    694 
    695 	/*
    696 	 * Note: another thread may update 'atime' and it might
    697 	 * become greater than 'now'.
    698 	 */
    699 	elapsed = (int64_t)tsnow - con->c_atime;
    700 	return elapsed > etime;
    701 }
    702 
    703 /*
    704  * npf_conn_remove: unlink the connection and mark as expired.
    705  */
    706 void
    707 npf_conn_remove(npf_conndb_t *cd, npf_conn_t *con)
    708 {
    709 	/* Remove both entries of the connection. */
    710 	mutex_enter(&con->c_lock);
    711 	if ((con->c_flags & CONN_REMOVED) == 0) {
    712 		npf_connkey_t *fw, *bk;
    713 		npf_conn_t *ret __diagused;
    714 
    715 		fw = npf_conn_getforwkey(con);
    716 		ret = npf_conndb_remove(cd, fw);
    717 		KASSERT(ret == con);
    718 
    719 		bk = npf_conn_getbackkey(con, NPF_CONNKEY_ALEN(fw));
    720 		ret = npf_conndb_remove(cd, bk);
    721 		KASSERT(ret == con);
    722 	}
    723 
    724 	/* Flag the removal and expiration. */
    725 	atomic_or_uint(&con->c_flags, CONN_REMOVED | CONN_EXPIRE);
    726 	mutex_exit(&con->c_lock);
    727 }
    728 
    729 /*
    730  * npf_conn_worker: G/C to run from a worker thread.
    731  */
    732 void
    733 npf_conn_worker(npf_t *npf)
    734 {
    735 	npf_conndb_gc(npf, npf->conn_db, false, true);
    736 }
    737 
    738 /*
    739  * npf_conndb_export: construct a list of connections prepared for saving.
    740  * Note: this is expected to be an expensive operation.
    741  */
    742 int
    743 npf_conndb_export(npf_t *npf, nvlist_t *npf_dict)
    744 {
    745 	npf_conn_t *head, *con;
    746 
    747 	/*
    748 	 * Note: acquire conn_lock to prevent from the database
    749 	 * destruction and G/C thread.
    750 	 */
    751 	mutex_enter(&npf->conn_lock);
    752 	if (npf->conn_tracking != CONN_TRACKING_ON) {
    753 		mutex_exit(&npf->conn_lock);
    754 		return 0;
    755 	}
    756 	head = npf_conndb_getlist(npf->conn_db);
    757 	con = head;
    758 	while (con) {
    759 		nvlist_t *cdict;
    760 
    761 		if ((cdict = npf_conn_export(npf, con)) != NULL) {
    762 			nvlist_append_nvlist_array(npf_dict, "conn-list", cdict);
    763 			nvlist_destroy(cdict);
    764 		}
    765 		if ((con = npf_conndb_getnext(npf->conn_db, con)) == head) {
    766 			break;
    767 		}
    768 	}
    769 	mutex_exit(&npf->conn_lock);
    770 	return 0;
    771 }
    772 
    773 /*
    774  * npf_conn_export: serialise a single connection.
    775  */
    776 static nvlist_t *
    777 npf_conn_export(npf_t *npf, npf_conn_t *con)
    778 {
    779 	nvlist_t *cdict, *kdict;
    780 	npf_connkey_t *fw, *bk;
    781 	unsigned alen;
    782 
    783 	if ((con->c_flags & (CONN_ACTIVE|CONN_EXPIRE)) != CONN_ACTIVE) {
    784 		return NULL;
    785 	}
    786 	cdict = nvlist_create(0);
    787 	nvlist_add_number(cdict, "flags", con->c_flags);
    788 	nvlist_add_number(cdict, "proto", con->c_proto);
    789 	if (con->c_ifid) {
    790 		const char *ifname = npf_ifmap_getname(npf, con->c_ifid);
    791 		nvlist_add_string(cdict, "ifname", ifname);
    792 	}
    793 	nvlist_add_binary(cdict, "state", &con->c_state, sizeof(npf_state_t));
    794 
    795 	fw = npf_conn_getforwkey(con);
    796 	alen = NPF_CONNKEY_ALEN(fw);
    797 	bk = npf_conn_getbackkey(con, alen);
    798 
    799 	kdict = npf_connkey_export(fw);
    800 	nvlist_move_nvlist(cdict, "forw-key", kdict);
    801 
    802 	kdict = npf_connkey_export(bk);
    803 	nvlist_move_nvlist(cdict, "back-key", kdict);
    804 
    805 	/* Let the address length be based on on first key. */
    806 	nvlist_add_number(cdict, "alen", alen);
    807 
    808 	if (con->c_nat) {
    809 		npf_nat_export(cdict, con->c_nat);
    810 	}
    811 	return cdict;
    812 }
    813 
    814 /*
    815  * npf_conn_import: fully reconstruct a single connection from a
    816  * nvlist and insert into the given database.
    817  */
    818 int
    819 npf_conn_import(npf_t *npf, npf_conndb_t *cd, const nvlist_t *cdict,
    820     npf_ruleset_t *natlist)
    821 {
    822 	npf_conn_t *con;
    823 	npf_connkey_t *fw, *bk;
    824 	const nvlist_t *nat, *conkey;
    825 	const char *ifname;
    826 	const void *state;
    827 	unsigned alen, idx;
    828 	size_t len;
    829 
    830 	/*
    831 	 * To determine the length of the connection, which depends
    832 	 * on the address length in the connection keys.
    833 	 */
    834 	alen = dnvlist_get_number(cdict, "alen", 0);
    835 	idx = NPF_CONNCACHE(alen);
    836 
    837 	/* Allocate a connection and initialise it (clear first). */
    838 	con = pool_cache_get(npf->conn_cache[idx], PR_WAITOK);
    839 	memset(con, 0, sizeof(npf_conn_t));
    840 	mutex_init(&con->c_lock, MUTEX_DEFAULT, IPL_SOFTNET);
    841 	npf_stats_inc(npf, NPF_STAT_CONN_CREATE);
    842 
    843 	con->c_proto = dnvlist_get_number(cdict, "proto", 0);
    844 	con->c_flags = dnvlist_get_number(cdict, "flags", 0);
    845 	con->c_flags &= PFIL_ALL | CONN_ACTIVE | CONN_PASS;
    846 	conn_update_atime(con);
    847 
    848 	ifname = dnvlist_get_string(cdict, "ifname", NULL);
    849 	if (ifname && (con->c_ifid = npf_ifmap_register(npf, ifname)) == 0) {
    850 		goto err;
    851 	}
    852 
    853 	state = dnvlist_get_binary(cdict, "state", &len, NULL, 0);
    854 	if (!state || len != sizeof(npf_state_t)) {
    855 		goto err;
    856 	}
    857 	memcpy(&con->c_state, state, sizeof(npf_state_t));
    858 
    859 	/* Reconstruct NAT association, if any. */
    860 	if ((nat = dnvlist_get_nvlist(cdict, "nat", NULL)) != NULL &&
    861 	    (con->c_nat = npf_nat_import(npf, nat, natlist, con)) == NULL) {
    862 		goto err;
    863 	}
    864 
    865 	/*
    866 	 * Fetch and copy the keys for each direction.
    867 	 */
    868 	fw = npf_conn_getforwkey(con);
    869 	conkey = dnvlist_get_nvlist(cdict, "forw-key", NULL);
    870 	if (conkey == NULL || !npf_connkey_import(conkey, fw)) {
    871 		goto err;
    872 	}
    873 	bk = npf_conn_getbackkey(con, NPF_CONNKEY_ALEN(fw));
    874 	conkey = dnvlist_get_nvlist(cdict, "back-key", NULL);
    875 	if (conkey == NULL || !npf_connkey_import(conkey, bk)) {
    876 		goto err;
    877 	}
    878 
    879 	/* Guard against the contradicting address lengths. */
    880 	if (NPF_CONNKEY_ALEN(fw) != alen || NPF_CONNKEY_ALEN(bk) != alen) {
    881 		goto err;
    882 	}
    883 
    884 	/* Insert the entries and the connection itself. */
    885 	if (!npf_conndb_insert(cd, fw, con, true)) {
    886 		goto err;
    887 	}
    888 	if (!npf_conndb_insert(cd, bk, con, false)) {
    889 		npf_conndb_remove(cd, fw);
    890 		goto err;
    891 	}
    892 
    893 	NPF_PRINTF(("NPF: imported conn %p\n", con));
    894 	npf_conndb_enqueue(cd, con);
    895 	return 0;
    896 err:
    897 	npf_conn_destroy(npf, con);
    898 	return EINVAL;
    899 }
    900 
    901 int
    902 npf_conn_find(npf_t *npf, const nvlist_t *idict, nvlist_t **odict)
    903 {
    904 	const nvlist_t *kdict;
    905 	npf_connkey_t key;
    906 	npf_conn_t *con;
    907 	uint16_t dir;
    908 	bool forw;
    909 
    910 	kdict = dnvlist_get_nvlist(idict, "key", NULL);
    911 	if (!kdict || !npf_connkey_import(kdict, &key)) {
    912 		return EINVAL;
    913 	}
    914 	con = npf_conndb_lookup(npf->conn_db, &key, &forw);
    915 	if (con == NULL) {
    916 		return ESRCH;
    917 	}
    918 	dir = dnvlist_get_number(idict, "direction", 0);
    919 	if (!npf_conn_check(con, NULL, dir, true)) {
    920 		atomic_dec_uint(&con->c_refcnt);
    921 		return ESRCH;
    922 	}
    923 	*odict = npf_conn_export(npf, con);
    924 	atomic_dec_uint(&con->c_refcnt);
    925 	return *odict ? 0 : ENOSPC;
    926 }
    927 
#if defined(DDB) || defined(_NPF_TESTING)

/*
 * npf_conn_print: dump the connection for debugging: the protocol, flags,
 * time since the last activity, expiration time, both keys, the protocol
 * state and the NAT entry (if any).
 */
void
npf_conn_print(npf_conn_t *con)
{
	const npf_connkey_t *forw_key = npf_conn_getforwkey(con);
	const npf_connkey_t *back_key =
	    npf_conn_getbackkey(con, NPF_CONNKEY_ALEN(forw_key));
	const unsigned proto = con->c_proto;
	struct timespec uptime;

	getnanouptime(&uptime);
	printf("%p:\n\tproto %d flags 0x%x tsdiff %ld etime %d\n", con,
	    proto, con->c_flags, (long)(uptime.tv_sec - con->c_atime),
	    npf_state_etime(npf_getkernctx(), &con->c_state, proto));

	npf_connkey_print(forw_key);
	npf_connkey_print(back_key);
	npf_state_dump(&con->c_state);
	if (con->c_nat != NULL) {
		npf_nat_dump(con->c_nat);
	}
}

#endif
    951