Home | History | Annotate | Line # | Download | only in npf
npf_conn.c revision 1.32
      1 /*-
      2  * Copyright (c) 2014-2020 Mindaugas Rasiukevicius <rmind at noxt eu>
      3  * Copyright (c) 2010-2014 The NetBSD Foundation, Inc.
      4  * All rights reserved.
      5  *
      6  * This material is based upon work partially supported by The
      7  * NetBSD Foundation under a contract with Mindaugas Rasiukevicius.
      8  *
      9  * Redistribution and use in source and binary forms, with or without
     10  * modification, are permitted provided that the following conditions
     11  * are met:
     12  * 1. Redistributions of source code must retain the above copyright
     13  *    notice, this list of conditions and the following disclaimer.
     14  * 2. Redistributions in binary form must reproduce the above copyright
     15  *    notice, this list of conditions and the following disclaimer in the
     16  *    documentation and/or other materials provided with the distribution.
     17  *
     18  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
     19  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
     20  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     21  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
     22  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
     23  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
     24  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
     25  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
     26  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
     27  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
     28  * POSSIBILITY OF SUCH DAMAGE.
     29  */
     30 
     31 /*
     32  * NPF connection tracking for stateful filtering and translation.
     33  *
     34  * Overview
     35  *
     36  *	Packets can be incoming or outgoing with respect to an interface.
     37  *	Connection direction is identified by the direction of its first
     38  *	packet.  The meaning of incoming/outgoing packet in the context of
     39  *	connection direction can be confusing.  Therefore, we will use the
     40  *	terms "forwards stream" and "backwards stream", where packets in
     41  *	the forwards stream mean the packets travelling in the direction
     42  *	as the connection direction.
     43  *
     44  *	All connections have two keys and thus two entries:
     45  *
     46  *	- npf_conn_getforwkey(con)        -- for the forwards stream;
     47  *	- npf_conn_getbackkey(con, alen)  -- for the backwards stream.
     48  *
     49  *	Note: the keys are stored in npf_conn_t::c_keys[], which is used
     50  *	to allocate variable-length npf_conn_t structures based on whether
     51  *	the IPv4 or IPv6 addresses are used.
     52  *
     53  *	The key is an n-tuple used to identify the connection flow: see the
     54  *	npf_connkey.c source file for the description of the key layouts.
     55  *	The key may be formed using translated values in a case of NAT.
     56  *
     57  *	Connections can serve two purposes: for the implicit passing and/or
     58  *	to accommodate the dynamic NAT.  Connections for the former purpose
     59  *	are created by the rules with "stateful" attribute and are used for
     60  *	stateful filtering.  Such connections indicate that the packet of
     61  *	the backwards stream should be passed without inspection of the
     62  *	ruleset.  The other purpose is to associate a dynamic NAT mechanism
     63  *	with a connection.  Such connections are created by the NAT policies
     64  *	and they have a relationship with NAT translation structure via
     65  *	npf_conn_t::c_nat.  A single connection can serve both purposes,
     66  *	which is a common case.
     67  *
     68  * Connection life-cycle
     69  *
     70  *	Connections are established when a packet matches said rule or
     71  *	NAT policy.  Both keys of the established connection are inserted
     72  *	into the connection database.  A garbage collection thread
     73  *	periodically scans all connections and depending on connection
     74  *	properties (e.g. last activity time, protocol) removes connection
     75  *	entries and expires the actual connections.
     76  *
     77  *	Each connection has a reference count.  The reference is acquired
     78  *	on lookup and should be released by the caller.  It guarantees that
     79  *	the connection will not be destroyed, although it may be expired.
     80  *
     81  * Synchronization
     82  *
     83  *	Connection database is accessed in a lock-free manner by the main
     84  *	routines: npf_conn_inspect() and npf_conn_establish().  Since they
     85  *	are always called from a software interrupt, the database is
     86  *	protected using EBR.  The main place which can destroy a connection
     87  *	is npf_conn_worker().  The database itself can be replaced and
     88  *	destroyed in npf_conn_reload().
     89  *
     90  * ALG support
     91  *
     92  *	Application-level gateways (ALGs) can override generic connection
     93  *	inspection (npf_alg_conn() call in npf_conn_inspect() function) by
     94  *	performing their own lookup using different key.  Recursive call
     95  *	to npf_conn_inspect() is not allowed.  The ALGs ought to use the
     96  *	npf_conn_lookup() function for this purpose.
     97  *
     98  * Lock order
     99  *
    100  *	npf_config_lock ->
    101  *		conn_lock ->
    102  *			npf_conn_t::c_lock
    103  */
    104 
    105 #ifdef _KERNEL
    106 #include <sys/cdefs.h>
    107 __KERNEL_RCSID(0, "$NetBSD: npf_conn.c,v 1.32 2020/05/30 14:16:56 rmind Exp $");
    108 
    109 #include <sys/param.h>
    110 #include <sys/types.h>
    111 
    112 #include <netinet/in.h>
    113 #include <netinet/tcp.h>
    114 
    115 #include <sys/atomic.h>
    116 #include <sys/kmem.h>
    117 #include <sys/mutex.h>
    118 #include <net/pfil.h>
    119 #include <sys/pool.h>
    120 #include <sys/queue.h>
    121 #include <sys/systm.h>
    122 #endif
    123 
    124 #define __NPF_CONN_PRIVATE
    125 #include "npf_conn.h"
    126 #include "npf_impl.h"
    127 
    128 /* A helper to select the IPv4 or IPv6 connection cache. */
    129 #define	NPF_CONNCACHE(alen)	(((alen) >> 4) & 0x1)
    130 
    131 /*
    132  * Connection flags: PFIL_IN and PFIL_OUT values are reserved for direction.
    133  */
    134 CTASSERT(PFIL_ALL == (0x001 | 0x002));
    135 #define	CONN_ACTIVE	0x004	/* visible on inspection */
    136 #define	CONN_PASS	0x008	/* perform implicit passing */
    137 #define	CONN_EXPIRE	0x010	/* explicitly expire */
    138 #define	CONN_REMOVED	0x020	/* "forw/back" entries removed */
    139 
    140 enum { CONN_TRACKING_OFF, CONN_TRACKING_ON };
    141 
    142 static int	npf_conn_export(npf_t *, npf_conn_t *, nvlist_t *);
    143 
    144 /*
    145  * npf_conn_sys{init,fini}: initialize/destroy connection tracking.
    146  */
    147 
void
npf_conn_init(npf_t *npf)
{
	/*
	 * Allocate the parameter group for connection tracking and
	 * register the tunable parameters.  Both are booleans (0..1)
	 * controlling which fields are included in the connection key.
	 */
	npf_conn_params_t *params = npf_param_allocgroup(npf,
	    NPF_PARAMS_CONN, sizeof(npf_conn_params_t));
	npf_param_t param_map[] = {
		{
			"state.key.interface",
			&params->connkey_interface,
			.default_val = 1, // true
			.min = 0, .max = 1
		},
		{
			"state.key.direction",
			&params->connkey_direction,
			.default_val = 1, // true
			.min = 0, .max = 1
		},
	};
	npf_param_register(npf, param_map, __arraycount(param_map));

	/*
	 * Two pool caches: index 0 for IPv4 and index 1 for IPv6
	 * connections (see NPF_CONNCACHE).  The object size is the
	 * offset past the two variable-length keys in c_keys[].
	 */
	npf->conn_cache[0] = pool_cache_init(
	    offsetof(npf_conn_t, c_keys[NPF_CONNKEY_V4WORDS * 2]),
	    0, 0, 0, "npfcn4pl", NULL, IPL_NET, NULL, NULL, NULL);
	npf->conn_cache[1] = pool_cache_init(
	    offsetof(npf_conn_t, c_keys[NPF_CONNKEY_V6WORDS * 2]),
	    0, 0, 0, "npfcn6pl", NULL, IPL_NET, NULL, NULL, NULL);

	/* Tracking starts disabled; npf_conn_load() enables it. */
	mutex_init(&npf->conn_lock, MUTEX_DEFAULT, IPL_NONE);
	atomic_store_relaxed(&npf->conn_tracking, CONN_TRACKING_OFF);
	npf->conn_db = npf_conndb_create();
	npf_conndb_sysinit(npf);

	/* Register the G/C handler with the worker thread (last). */
	npf_worker_addfunc(npf, npf_conn_worker);
}
    183 
    184 void
    185 npf_conn_fini(npf_t *npf)
    186 {
    187 	const size_t len = sizeof(npf_conn_params_t);
    188 
    189 	/* Note: the caller should have flushed the connections. */
    190 	KASSERT(atomic_load_relaxed(&npf->conn_tracking) == CONN_TRACKING_OFF);
    191 
    192 	npf_conndb_destroy(npf->conn_db);
    193 	pool_cache_destroy(npf->conn_cache[0]);
    194 	pool_cache_destroy(npf->conn_cache[1]);
    195 	mutex_destroy(&npf->conn_lock);
    196 
    197 	npf_param_freegroup(npf, NPF_PARAMS_CONN, len);
    198 	npf_conndb_sysfini(npf);
    199 }
    200 
    201 /*
    202  * npf_conn_load: perform the load by flushing the current connection
    203  * database and replacing it with the new one or just destroying.
    204  *
    205  * => The caller must disable the connection tracking and ensure that
    206  *    there are no connection database lookups or references in-flight.
    207  */
void
npf_conn_load(npf_t *npf, npf_conndb_t *ndb, bool track)
{
	npf_conndb_t *odb = NULL;

	KASSERT(npf_config_locked_p(npf));

	/*
	 * The connection database is in the quiescent state.
	 * Prevent G/C thread from running and install a new database.
	 */
	mutex_enter(&npf->conn_lock);
	if (ndb) {
		KASSERT(atomic_load_relaxed(&npf->conn_tracking)
		    == CONN_TRACKING_OFF);
		odb = atomic_load_relaxed(&npf->conn_db);
		/* Full barrier: publish the new database contents first. */
		membar_sync();
		atomic_store_relaxed(&npf->conn_db, ndb);
	}
	if (track) {
		/* After this point lookups start flying in. */
		membar_producer();
		atomic_store_relaxed(&npf->conn_tracking, CONN_TRACKING_ON);
	}
	mutex_exit(&npf->conn_lock);

	if (odb) {
		/*
		 * Flush all, no sync since the caller did it for us.
		 * Also, release the pool cache memory.
		 */
		npf_conndb_gc(npf, odb, true, false);
		npf_conndb_destroy(odb);
		pool_cache_invalidate(npf->conn_cache[0]);
		pool_cache_invalidate(npf->conn_cache[1]);
	}
}
    245 
    246 /*
    247  * npf_conn_tracking: enable/disable connection tracking.
    248  */
    249 void
    250 npf_conn_tracking(npf_t *npf, bool track)
    251 {
    252 	KASSERT(npf_config_locked_p(npf));
    253 	atomic_store_relaxed(&npf->conn_tracking,
    254 	    track ? CONN_TRACKING_ON : CONN_TRACKING_OFF);
    255 }
    256 
    257 static inline bool
    258 npf_conn_trackable_p(const npf_cache_t *npc)
    259 {
    260 	const npf_t *npf = npc->npc_ctx;
    261 
    262 	/*
    263 	 * Check if connection tracking is on.  Also, if layer 3 and 4 are
    264 	 * not cached - protocol is not supported or packet is invalid.
    265 	 */
    266 	if (atomic_load_relaxed(&npf->conn_tracking) != CONN_TRACKING_ON) {
    267 		return false;
    268 	}
    269 	if (!npf_iscached(npc, NPC_IP46) || !npf_iscached(npc, NPC_LAYER4)) {
    270 		return false;
    271 	}
    272 	return true;
    273 }
    274 
    275 static inline void
    276 conn_update_atime(npf_conn_t *con)
    277 {
    278 	struct timespec tsnow;
    279 
    280 	getnanouptime(&tsnow);
    281 	atomic_store_relaxed(&con->c_atime, tsnow.tv_sec);
    282 }
    283 
    284 /*
    285  * npf_conn_check: check that:
    286  *
    287  *	- the connection is active;
    288  *
    289  *	- the packet is travelling in the right direction with the respect
    290  *	  to the connection direction (if interface-id is not zero);
    291  *
    292  *	- the packet is travelling on the same interface as the
    293  *	  connection interface (if interface-id is not zero).
    294  */
    295 static bool
    296 npf_conn_check(const npf_conn_t *con, const nbuf_t *nbuf,
    297     const unsigned di, const npf_flow_t flow)
    298 {
    299 	const uint32_t flags = atomic_load_relaxed(&con->c_flags);
    300 	const unsigned ifid = atomic_load_relaxed(&con->c_ifid);
    301 	bool active;
    302 
    303 	active = (flags & (CONN_ACTIVE | CONN_EXPIRE)) == CONN_ACTIVE;
    304 	if (__predict_false(!active)) {
    305 		return false;
    306 	}
    307 	if (ifid && nbuf) {
    308 		const bool match = (flags & PFIL_ALL) == di;
    309 		npf_flow_t pflow = match ? NPF_FLOW_FORW : NPF_FLOW_BACK;
    310 
    311 		if (__predict_false(flow != pflow)) {
    312 			return false;
    313 		}
    314 		if (__predict_false(ifid != nbuf->nb_ifid)) {
    315 			return false;
    316 		}
    317 	}
    318 	return true;
    319 }
    320 
    321 /*
    322  * npf_conn_lookup: lookup if there is an established connection.
    323  *
    324  * => If found, we will hold a reference for the caller.
    325  */
npf_conn_t *
npf_conn_lookup(const npf_cache_t *npc, const unsigned di, npf_flow_t *flow)
{
	npf_t *npf = npc->npc_ctx;
	const nbuf_t *nbuf = npc->npc_nbuf;
	npf_conn_t *con;
	npf_connkey_t key;

	/* Construct a key and lookup for a connection in the store. */
	if (!npf_conn_conkey(npc, &key, di, NPF_FLOW_FORW)) {
		return NULL;
	}
	/* Note: on success, npf_conndb_lookup() holds a reference for us. */
	con = npf_conndb_lookup(npf, &key, flow);
	if (con == NULL) {
		return NULL;
	}
	KASSERT(npc->npc_proto == atomic_load_relaxed(&con->c_proto));

	/*
	 * Extra checks for the connection and packet (active state,
	 * direction and interface).  On failure, drop the reference
	 * acquired by the lookup above.
	 */
	if (!npf_conn_check(con, nbuf, di, *flow)) {
		atomic_dec_uint(&con->c_refcnt);
		return NULL;
	}

	/* Update the last activity time. */
	conn_update_atime(con);
	return con;
}
    354 
    355 /*
    356  * npf_conn_inspect: lookup a connection and inspecting the protocol data.
    357  *
    358  * => If found, we will hold a reference for the caller.
    359  */
npf_conn_t *
npf_conn_inspect(npf_cache_t *npc, const unsigned di, int *error)
{
	nbuf_t *nbuf = npc->npc_nbuf;
	npf_flow_t flow;
	npf_conn_t *con;
	bool ok;

	KASSERT(!nbuf_flag_p(nbuf, NBUF_DATAREF_RESET));
	if (!npf_conn_trackable_p(npc)) {
		return NULL;
	}

	/* Query ALG which may lookup connection for us. */
	if ((con = npf_alg_conn(npc, di)) != NULL) {
		/* Note: reference is held. */
		return con;
	}
	/* Ensure the packet data is still available (may need memory). */
	if (nbuf_head_mbuf(nbuf) == NULL) {
		*error = ENOMEM;
		return NULL;
	}
	KASSERT(!nbuf_flag_p(nbuf, NBUF_DATAREF_RESET));

	/* The main lookup of the connection (acquires a reference). */
	if ((con = npf_conn_lookup(npc, di, &flow)) == NULL) {
		return NULL;
	}

	/* Inspect the protocol data and handle state changes. */
	mutex_enter(&con->c_lock);
	ok = npf_state_inspect(npc, &con->c_state, flow);
	mutex_exit(&con->c_lock);

	/*
	 * If invalid state: release our reference and let the rules
	 * deal with the packet instead.
	 */
	if (__predict_false(!ok)) {
		npf_conn_release(con);
		npf_stats_inc(npc->npc_ctx, NPF_STAT_INVALID_STATE);
		return NULL;
	}
#if 0
	/*
	 * TODO -- determine when this might be wanted/used.
	 *
	 * Note: skipping the connection lookup and ruleset inspection
	 * on other interfaces will also bypass dynamic NAT.
	 */
	if (atomic_load_relaxed(&con->c_flags) & CONN_GPASS) {
		/*
		 * Note: if tagging fails, then give this packet a chance
		 * to go through a regular ruleset.
		 */
		(void)nbuf_add_tag(nbuf, NPF_NTAG_PASS);
	}
#endif
	return con;
}
    417 
    418 /*
    419  * npf_conn_establish: create a new connection, insert into the global list.
    420  *
    421  * => Connection is created with the reference held for the caller.
    422  * => Connection will be activated on the first reference release.
    423  */
npf_conn_t *
npf_conn_establish(npf_cache_t *npc, const unsigned di, bool global)
{
	npf_t *npf = npc->npc_ctx;
	const unsigned alen = npc->npc_alen;
	const unsigned idx = NPF_CONNCACHE(alen);
	const nbuf_t *nbuf = npc->npc_nbuf;
	npf_connkey_t *fw, *bk;
	npf_conndb_t *conn_db;
	npf_conn_t *con;
	int error = 0;

	KASSERT(!nbuf_flag_p(nbuf, NBUF_DATAREF_RESET));

	if (!npf_conn_trackable_p(npc)) {
		return NULL;
	}

	/*
	 * Allocate and initialize the new connection.  On allocation
	 * failure, kick the G/C worker to reclaim memory and bail out.
	 */
	con = pool_cache_get(npf->conn_cache[idx], PR_NOWAIT);
	if (__predict_false(!con)) {
		npf_worker_signal(npf);
		return NULL;
	}
	NPF_PRINTF(("NPF: create conn %p\n", con));
	npf_stats_inc(npf, NPF_STAT_CONN_CREATE);

	/* The low flag bits record the connection direction (PFIL_*). */
	mutex_init(&con->c_lock, MUTEX_DEFAULT, IPL_SOFTNET);
	atomic_store_relaxed(&con->c_flags, di & PFIL_ALL);
	atomic_store_relaxed(&con->c_refcnt, 0);
	con->c_rproc = NULL;
	con->c_nat = NULL;

	con->c_proto = npc->npc_proto;
	CTASSERT(sizeof(con->c_proto) >= sizeof(npc->npc_proto));
	con->c_alen = alen;

	/* Initialize the protocol state. */
	if (!npf_state_init(npc, &con->c_state)) {
		npf_conn_destroy(npf, con);
		return NULL;
	}
	KASSERT(npf_iscached(npc, NPC_IP46));

	fw = npf_conn_getforwkey(con);
	bk = npf_conn_getbackkey(con, alen);

	/*
	 * Construct "forwards" and "backwards" keys.  Also, set the
	 * interface ID for this connection (unless it is global).
	 */
	if (!npf_conn_conkey(npc, fw, di, NPF_FLOW_FORW) ||
	    !npf_conn_conkey(npc, bk, di ^ PFIL_ALL, NPF_FLOW_BACK)) {
		npf_conn_destroy(npf, con);
		return NULL;
	}
	con->c_ifid = global ? nbuf->nb_ifid : 0;

	/*
	 * Set last activity time for a new connection and acquire
	 * a reference for the caller before we make it visible.
	 */
	conn_update_atime(con);
	atomic_store_relaxed(&con->c_refcnt, 1);

	/*
	 * Insert both keys (entries representing directions) of the
	 * connection.  At this point it becomes visible, but we activate
	 * the connection later.
	 *
	 * Note: on success, control deliberately falls through to the
	 * 'err' label with error == 0.
	 */
	mutex_enter(&con->c_lock);
	conn_db = atomic_load_relaxed(&npf->conn_db);
	if (!npf_conndb_insert(conn_db, fw, con, NPF_FLOW_FORW)) {
		error = EISCONN;
		goto err;
	}
	if (!npf_conndb_insert(conn_db, bk, con, NPF_FLOW_BACK)) {
		/* Undo the forwards insertion before bailing out. */
		npf_conn_t *ret __diagused;
		ret = npf_conndb_remove(conn_db, fw);
		KASSERT(ret == con);
		error = EISCONN;
		goto err;
	}
err:
	/*
	 * If we have hit the duplicate: mark the connection as expired
	 * and let the G/C thread to take care of it.  We cannot do it
	 * here since there might be references acquired already.
	 */
	if (error) {
		atomic_or_uint(&con->c_flags, CONN_REMOVED | CONN_EXPIRE);
		atomic_dec_uint(&con->c_refcnt);
		npf_stats_inc(npf, NPF_STAT_RACE_CONN);
	} else {
		NPF_PRINTF(("NPF: establish conn %p\n", con));
	}

	/* Finally, insert into the connection list. */
	npf_conndb_enqueue(conn_db, con);
	mutex_exit(&con->c_lock);

	return error ? NULL : con;
}
    527 
    528 void
    529 npf_conn_destroy(npf_t *npf, npf_conn_t *con)
    530 {
    531 	const unsigned idx __unused = NPF_CONNCACHE(con->c_alen);
    532 
    533 	KASSERT(atomic_load_relaxed(&con->c_refcnt) == 0);
    534 
    535 	if (con->c_nat) {
    536 		/* Release any NAT structures. */
    537 		npf_nat_destroy(con, con->c_nat);
    538 	}
    539 	if (con->c_rproc) {
    540 		/* Release the rule procedure. */
    541 		npf_rproc_release(con->c_rproc);
    542 	}
    543 
    544 	/* Destroy the state. */
    545 	npf_state_destroy(&con->c_state);
    546 	mutex_destroy(&con->c_lock);
    547 
    548 	/* Free the structure, increase the counter. */
    549 	pool_cache_put(npf->conn_cache[idx], con);
    550 	npf_stats_inc(npf, NPF_STAT_CONN_DESTROY);
    551 	NPF_PRINTF(("NPF: conn %p destroyed\n", con));
    552 }
    553 
    554 /*
    555  * npf_conn_setnat: associate NAT entry with the connection, update and
    556  * re-insert connection entry using the translation values.
    557  *
    558  * => The caller must be holding a reference.
    559  */
int
npf_conn_setnat(const npf_cache_t *npc, npf_conn_t *con,
    npf_nat_t *nt, unsigned ntype)
{
	static const unsigned nat_type_which[] = {
		/* See the description in npf_nat_which(). */
		[NPF_NATOUT] = NPF_DST,
		[NPF_NATIN] = NPF_SRC,
	};
	npf_t *npf = npc->npc_ctx;
	npf_conn_t *ret __diagused;
	npf_conndb_t *conn_db;
	npf_connkey_t *bk;
	npf_addr_t *taddr;
	in_port_t tport;
	uint32_t flags;

	KASSERT(atomic_load_relaxed(&con->c_refcnt) > 0);

	npf_nat_gettrans(nt, &taddr, &tport);
	KASSERT(ntype == NPF_NATOUT || ntype == NPF_NATIN);

	/* Acquire the lock and check for the races. */
	mutex_enter(&con->c_lock);
	flags = atomic_load_relaxed(&con->c_flags);
	if (__predict_false(flags & CONN_EXPIRE)) {
		/* The connection got expired. */
		mutex_exit(&con->c_lock);
		return EINVAL;
	}
	KASSERT((flags & CONN_REMOVED) == 0);

	if (__predict_false(con->c_nat != NULL)) {
		/* Race with a duplicate packet. */
		mutex_exit(&con->c_lock);
		npf_stats_inc(npc->npc_ctx, NPF_STAT_RACE_NAT);
		return EISCONN;
	}

	/*
	 * Remove the "backwards" key, since it must be re-indexed
	 * under the translated address/port.
	 */
	conn_db = atomic_load_relaxed(&npf->conn_db);
	bk = npf_conn_getbackkey(con, con->c_alen);
	ret = npf_conndb_remove(conn_db, bk);
	KASSERT(ret == con);

	/* Set the source/destination IDs to the translation values. */
	npf_conn_adjkey(bk, taddr, tport, nat_type_which[ntype]);

	/* Finally, re-insert the "backwards" key. */
	if (!npf_conndb_insert(conn_db, bk, con, NPF_FLOW_BACK)) {
		/*
		 * Race: we have hit the duplicate, remove the "forwards"
		 * key and expire our connection; it is no longer valid.
		 */
		npf_connkey_t *fw = npf_conn_getforwkey(con);
		ret = npf_conndb_remove(conn_db, fw);
		KASSERT(ret == con);

		/* Let the G/C thread destroy it once references drain. */
		atomic_or_uint(&con->c_flags, CONN_REMOVED | CONN_EXPIRE);
		mutex_exit(&con->c_lock);

		npf_stats_inc(npc->npc_ctx, NPF_STAT_RACE_NAT);
		return EISCONN;
	}

	/* Associate the NAT entry and release the lock. */
	con->c_nat = nt;
	mutex_exit(&con->c_lock);
	return 0;
}
    630 
    631 /*
    632  * npf_conn_expire: explicitly mark connection as expired.
    633  *
    634  * => Must be called with: a) reference held  b) the relevant lock held.
    635  *    The relevant lock should prevent from connection destruction, e.g.
    636  *    npf_t::conn_lock or npf_natpolicy_t::n_lock.
    637  */
void
npf_conn_expire(npf_conn_t *con)
{
	/* Mark only; the G/C thread performs the actual removal. */
	atomic_or_uint(&con->c_flags, CONN_EXPIRE);
}
    643 
    644 /*
    645  * npf_conn_pass: return true if connection is "pass" one, otherwise false.
    646  */
    647 bool
    648 npf_conn_pass(const npf_conn_t *con, npf_match_info_t *mi, npf_rproc_t **rp)
    649 {
    650 	KASSERT(atomic_load_relaxed(&con->c_refcnt) > 0);
    651 	if (__predict_true(atomic_load_relaxed(&con->c_flags) & CONN_PASS)) {
    652 		mi->mi_retfl = atomic_load_relaxed(&con->c_retfl);
    653 		mi->mi_rid = con->c_rid;
    654 		*rp = con->c_rproc;
    655 		return true;
    656 	}
    657 	return false;
    658 }
    659 
    660 /*
    661  * npf_conn_setpass: mark connection as a "pass" one and associate the
    662  * rule procedure with it.
    663  */
    664 void
    665 npf_conn_setpass(npf_conn_t *con, const npf_match_info_t *mi, npf_rproc_t *rp)
    666 {
    667 	KASSERT((atomic_load_relaxed(&con->c_flags) & CONN_ACTIVE) == 0);
    668 	KASSERT(atomic_load_relaxed(&con->c_refcnt) > 0);
    669 	KASSERT(con->c_rproc == NULL);
    670 
    671 	/*
    672 	 * No need for atomic since the connection is not yet active.
    673 	 * If rproc is set, the caller transfers its reference to us,
    674 	 * which will be released on npf_conn_destroy().
    675 	 */
    676 	atomic_or_uint(&con->c_flags, CONN_PASS);
    677 	con->c_rproc = rp;
    678 	if (rp) {
    679 		con->c_rid = mi->mi_rid;
    680 		con->c_retfl = mi->mi_retfl;
    681 	}
    682 }
    683 
    684 /*
    685  * npf_conn_release: release a reference, which might allow G/C thread
    686  * to destroy this connection.
    687  */
    688 void
    689 npf_conn_release(npf_conn_t *con)
    690 {
    691 	const unsigned flags = atomic_load_relaxed(&con->c_flags);
    692 
    693 	if ((flags & (CONN_ACTIVE | CONN_EXPIRE)) == 0) {
    694 		/* Activate: after this, connection is globally visible. */
    695 		atomic_or_uint(&con->c_flags, CONN_ACTIVE);
    696 	}
    697 	KASSERT(atomic_load_relaxed(&con->c_refcnt) > 0);
    698 	atomic_dec_uint(&con->c_refcnt);
    699 }
    700 
    701 /*
    702  * npf_conn_getnat: return the associated NAT entry, if any.
    703  */
npf_nat_t *
npf_conn_getnat(const npf_conn_t *con)
{
	/* May be NULL if no NAT entry was associated. */
	return con->c_nat;
}
    709 
    710 /*
    711  * npf_conn_expired: criterion to check if connection is expired.
    712  */
    713 bool
    714 npf_conn_expired(npf_t *npf, const npf_conn_t *con, uint64_t tsnow)
    715 {
    716 	const unsigned flags = atomic_load_relaxed(&con->c_flags);
    717 	const int etime = npf_state_etime(npf, &con->c_state, con->c_proto);
    718 	int elapsed;
    719 
    720 	if (__predict_false(flags & CONN_EXPIRE)) {
    721 		/* Explicitly marked to be expired. */
    722 		return true;
    723 	}
    724 
    725 	/*
    726 	 * Note: another thread may update 'atime' and it might
    727 	 * become greater than 'now'.
    728 	 */
    729 	elapsed = (int64_t)tsnow - atomic_load_relaxed(&con->c_atime);
    730 	return elapsed > etime;
    731 }
    732 
    733 /*
    734  * npf_conn_remove: unlink the connection and mark as expired.
    735  */
void
npf_conn_remove(npf_conndb_t *cd, npf_conn_t *con)
{
	/*
	 * Remove both entries of the connection, unless they were
	 * already removed (CONN_REMOVED) by a racing path.
	 */
	mutex_enter(&con->c_lock);
	if ((atomic_load_relaxed(&con->c_flags) & CONN_REMOVED) == 0) {
		npf_connkey_t *fw, *bk;
		npf_conn_t *ret __diagused;

		fw = npf_conn_getforwkey(con);
		ret = npf_conndb_remove(cd, fw);
		KASSERT(ret == con);

		/* The backwards key length matches the forwards key. */
		bk = npf_conn_getbackkey(con, NPF_CONNKEY_ALEN(fw));
		ret = npf_conndb_remove(cd, bk);
		KASSERT(ret == con);
	}

	/* Flag the removal and expiration. */
	atomic_or_uint(&con->c_flags, CONN_REMOVED | CONN_EXPIRE);
	mutex_exit(&con->c_lock);
}
    758 
    759 /*
    760  * npf_conn_worker: G/C to run from a worker thread or via npfk_gc().
    761  */
    762 void
    763 npf_conn_worker(npf_t *npf)
    764 {
    765 	npf_conndb_t *conn_db = atomic_load_relaxed(&npf->conn_db);
    766 	npf_conndb_gc(npf, conn_db, false, true);
    767 }
    768 
    769 /*
    770  * npf_conndb_export: construct a list of connections prepared for saving.
    771  * Note: this is expected to be an expensive operation.
    772  */
int
npf_conndb_export(npf_t *npf, nvlist_t *nvl)
{
	npf_conn_t *head, *con;
	npf_conndb_t *conn_db;

	/*
	 * Note: acquire conn_lock to prevent from the database
	 * destruction and G/C thread.
	 */
	mutex_enter(&npf->conn_lock);
	if (atomic_load_relaxed(&npf->conn_tracking) != CONN_TRACKING_ON) {
		/* Tracking disabled: nothing to export. */
		mutex_exit(&npf->conn_lock);
		return 0;
	}
	conn_db = atomic_load_relaxed(&npf->conn_db);
	head = npf_conndb_getlist(conn_db);
	con = head;
	/* Walk the circular connection list, serializing each entry. */
	while (con) {
		nvlist_t *con_nvl;

		con_nvl = nvlist_create(0);
		/* Skip entries that are not active (npf_conn_export). */
		if (npf_conn_export(npf, con, con_nvl) == 0) {
			nvlist_append_nvlist_array(nvl, "conn-list", con_nvl);
		}
		nvlist_destroy(con_nvl);

		/* Stop once we wrap around to the list head. */
		if ((con = npf_conndb_getnext(conn_db, con)) == head) {
			break;
		}
	}
	mutex_exit(&npf->conn_lock);
	return 0;
}
    807 
    808 /*
    809  * npf_conn_export: serialize a single connection.
    810  */
static int
npf_conn_export(npf_t *npf, npf_conn_t *con, nvlist_t *nvl)
{
	nvlist_t *knvl;
	npf_connkey_t *fw, *bk;
	unsigned flags, alen;

	/* Only active, non-expired connections are exported. */
	flags = atomic_load_relaxed(&con->c_flags);
	if ((flags & (CONN_ACTIVE|CONN_EXPIRE)) != CONN_ACTIVE) {
		return ESRCH;
	}
	nvlist_add_number(nvl, "flags", flags);
	nvlist_add_number(nvl, "proto", con->c_proto);
	if (con->c_ifid) {
		/* Export the interface by name, not by its internal ID. */
		char ifname[IFNAMSIZ];
		npf_ifmap_copyname(npf, con->c_ifid, ifname, sizeof(ifname));
		nvlist_add_string(nvl, "ifname", ifname);
	}
	nvlist_add_binary(nvl, "state", &con->c_state, sizeof(npf_state_t));

	fw = npf_conn_getforwkey(con);
	alen = NPF_CONNKEY_ALEN(fw);
	KASSERT(alen == con->c_alen);
	bk = npf_conn_getbackkey(con, alen);

	knvl = npf_connkey_export(npf, fw);
	nvlist_move_nvlist(nvl, "forw-key", knvl);

	knvl = npf_connkey_export(npf, bk);
	nvlist_move_nvlist(nvl, "back-key", knvl);

	/* Let the address length be based on the first key. */
	nvlist_add_number(nvl, "alen", alen);

	if (con->c_nat) {
		npf_nat_export(npf, con->c_nat, nvl);
	}
	return 0;
}
    850 
    851 /*
    852  * npf_conn_import: fully reconstruct a single connection from a
    853  * nvlist and insert into the given database.
    854  */
int
npf_conn_import(npf_t *npf, npf_conndb_t *cd, const nvlist_t *cdict,
    npf_ruleset_t *natlist)
{
	npf_conn_t *con;
	npf_connkey_t *fw, *bk;
	const nvlist_t *nat, *conkey;
	unsigned flags, alen, idx;
	const char *ifname;
	const void *state;
	size_t len;

	/*
	 * To determine the length of the connection, which depends
	 * on the address length in the connection keys.  The "alen"
	 * value selects which connection pool-cache to allocate from.
	 */
	alen = dnvlist_get_number(cdict, "alen", 0);
	idx = NPF_CONNCACHE(alen);

	/* Allocate a connection and initialize it (clear first). */
	con = pool_cache_get(npf->conn_cache[idx], PR_WAITOK);
	memset(con, 0, sizeof(npf_conn_t));
	mutex_init(&con->c_lock, MUTEX_DEFAULT, IPL_SOFTNET);
	npf_stats_inc(npf, NPF_STAT_CONN_CREATE);

	con->c_proto = dnvlist_get_number(cdict, "proto", 0);
	flags = dnvlist_get_number(cdict, "flags", 0);
	/* Accept only the known flag bits from the (external) nvlist. */
	flags &= PFIL_ALL | CONN_ACTIVE | CONN_PASS;
	atomic_store_relaxed(&con->c_flags, flags);
	/* Stamp the access time so the connection starts "fresh". */
	conn_update_atime(con);

	/* Bind to an interface, if the exported entry named one. */
	ifname = dnvlist_get_string(cdict, "ifname", NULL);
	if (ifname && (con->c_ifid = npf_ifmap_register(npf, ifname)) == 0) {
		goto err;
	}

	/* The protocol state blob must be exactly npf_state_t in size. */
	state = dnvlist_get_binary(cdict, "state", &len, NULL, 0);
	if (!state || len != sizeof(npf_state_t)) {
		goto err;
	}
	memcpy(&con->c_state, state, sizeof(npf_state_t));

	/* Reconstruct NAT association, if any. */
	if ((nat = dnvlist_get_nvlist(cdict, "nat", NULL)) != NULL &&
	    (con->c_nat = npf_nat_import(npf, nat, natlist, con)) == NULL) {
		goto err;
	}

	/*
	 * Fetch and copy the keys for each direction.
	 */
	fw = npf_conn_getforwkey(con);
	conkey = dnvlist_get_nvlist(cdict, "forw-key", NULL);
	if (conkey == NULL || !npf_connkey_import(npf, conkey, fw)) {
		goto err;
	}
	/* The back key is located after the forward key in memory. */
	bk = npf_conn_getbackkey(con, NPF_CONNKEY_ALEN(fw));
	conkey = dnvlist_get_nvlist(cdict, "back-key", NULL);
	if (conkey == NULL || !npf_connkey_import(npf, conkey, bk)) {
		goto err;
	}

	/* Guard against the contradicting address lengths. */
	if (NPF_CONNKEY_ALEN(fw) != alen || NPF_CONNKEY_ALEN(bk) != alen) {
		goto err;
	}

	/* Insert the entries and the connection itself. */
	if (!npf_conndb_insert(cd, fw, con, NPF_FLOW_FORW)) {
		goto err;
	}
	if (!npf_conndb_insert(cd, bk, con, NPF_FLOW_BACK)) {
		/* Roll back the first insertion to keep the DB consistent. */
		npf_conndb_remove(cd, fw);
		goto err;
	}

	NPF_PRINTF(("NPF: imported conn %p\n", con));
	npf_conndb_enqueue(cd, con);
	return 0;
err:
	/*
	 * Single error path: npf_conn_destroy() is expected to cope with
	 * a partially-constructed connection (it releases the NAT entry
	 * and the interface binding only if they were set above).
	 */
	npf_conn_destroy(npf, con);
	return EINVAL;
}
    938 
    939 /*
    940  * npf_conn_find: lookup a connection in the list of connections
    941  */
    942 int
    943 npf_conn_find(npf_t *npf, const nvlist_t *req, nvlist_t *resp)
    944 {
    945 	const nvlist_t *key_nv;
    946 	npf_conn_t *con;
    947 	npf_connkey_t key;
    948 	npf_flow_t flow;
    949 	int error;
    950 
    951 	key_nv = dnvlist_get_nvlist(req, "key", NULL);
    952 	if (!key_nv || !npf_connkey_import(npf, key_nv, &key)) {
    953 		return EINVAL;
    954 	}
    955 	con = npf_conndb_lookup(npf, &key, &flow);
    956 	if (con == NULL) {
    957 		return ESRCH;
    958 	}
    959 	if (!npf_conn_check(con, NULL, 0, NPF_FLOW_FORW)) {
    960 		atomic_dec_uint(&con->c_refcnt);
    961 		return ESRCH;
    962 	}
    963 	error = npf_conn_export(npf, con, resp);
    964 	nvlist_add_number(resp, "flow", flow);
    965 	atomic_dec_uint(&con->c_refcnt);
    966 	return error;
    967 }
    968 
    969 #if defined(DDB) || defined(_NPF_TESTING)
    970 
    971 void
    972 npf_conn_print(npf_conn_t *con)
    973 {
    974 	const npf_connkey_t *fw = npf_conn_getforwkey(con);
    975 	const npf_connkey_t *bk = npf_conn_getbackkey(con, NPF_CONNKEY_ALEN(fw));
    976 	const unsigned flags = atomic_load_relaxed(&con->c_flags);
    977 	const unsigned proto = con->c_proto;
    978 	struct timespec tspnow;
    979 
    980 	getnanouptime(&tspnow);
    981 	printf("%p:\n\tproto %d flags 0x%x tsdiff %ld etime %d\n", con,
    982 	    proto, flags, (long)(tspnow.tv_sec - con->c_atime),
    983 	    npf_state_etime(npf_getkernctx(), &con->c_state, proto));
    984 	npf_connkey_print(fw);
    985 	npf_connkey_print(bk);
    986 	npf_state_dump(&con->c_state);
    987 	if (con->c_nat) {
    988 		npf_nat_dump(con->c_nat);
    989 	}
    990 }
    991 
    992 #endif
    993