Home | History | Annotate | Line # | Download | only in npf
      1 /*-
      2  * Copyright (c) 2014-2020 Mindaugas Rasiukevicius <rmind at noxt eu>
      3  * Copyright (c) 2010-2013 The NetBSD Foundation, Inc.
      4  * All rights reserved.
      5  *
      6  * This material is based upon work partially supported by The
      7  * NetBSD Foundation under a contract with Mindaugas Rasiukevicius.
      8  *
      9  * Redistribution and use in source and binary forms, with or without
     10  * modification, are permitted provided that the following conditions
     11  * are met:
     12  * 1. Redistributions of source code must retain the above copyright
     13  *    notice, this list of conditions and the following disclaimer.
     14  * 2. Redistributions in binary form must reproduce the above copyright
     15  *    notice, this list of conditions and the following disclaimer in the
     16  *    documentation and/or other materials provided with the distribution.
     17  *
     18  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
     19  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
     20  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     21  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
     22  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
     23  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
     24  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
     25  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
     26  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
     27  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
     28  * POSSIBILITY OF SUCH DAMAGE.
     29  */
     30 
     31 /*
     32  * NPF network address port translation (NAPT) and other forms of NAT.
     33  * Described in RFC 2663, RFC 3022, etc.
     34  *
     35  * Overview
     36  *
     37  *	There are a few mechanisms: NAT policy, port map and translation.
     38  *	The NAT module has a separate ruleset where rules always have an
     39  *	associated NAT policy.
     40  *
     41  * Translation types
     42  *
     43  *	There are two types of translation: outbound (NPF_NATOUT) and
     44  *	inbound (NPF_NATIN).  It should not be confused with connection
     45  *	direction.  See npf_nat_which() for the description of how the
     46  *	addresses are rewritten.  The bi-directional NAT is a combined
     47  *	outbound and inbound translation, therefore is constructed as
     48  *	two policies.
     49  *
     50  * NAT policies and port maps
     51  *
     52  *	The NAT (translation) policy is applied when packet matches the
     53  *	rule.  Apart from the filter criteria, the NAT policy always has
     54  *	a translation IP address or a table.  If port translation is set,
     55  *	then NAT mechanism relies on port map mechanism.
     56  *
     57  * Connections, translation entries and their life-cycle
     58  *
     59  *	NAT relies on the connection tracking module.  Each translated
     60  *	connection has an associated translation entry (npf_nat_t) which
     61  *	contains information used for backwards stream translation, i.e.
     62  *	the original IP address with port and translation port, allocated
     63  *	from the port map.  Each NAT entry is associated with the policy,
     64  *	which contains translation IP address.  Allocated port is returned
     65  *	to the port map and NAT entry is destroyed when connection expires.
     66  */
     67 
     68 #ifdef _KERNEL
     69 #include <sys/cdefs.h>
     70 __KERNEL_RCSID(0, "$NetBSD: npf_nat.c,v 1.54 2025/07/01 18:42:37 joe Exp $");
     71 
     72 #include <sys/param.h>
     73 #include <sys/types.h>
     74 
     75 #include <sys/atomic.h>
     76 #include <sys/condvar.h>
     77 #include <sys/kmem.h>
     78 #include <sys/mutex.h>
     79 #include <sys/pool.h>
     80 #include <sys/proc.h>
     81 #endif
     82 
     83 #include "npf_impl.h"
     84 #include "npf_conn.h"
     85 
     86 /*
     87  * NAT policy structure.
     88  */
     89 struct npf_natpolicy {
     90 	npf_t *			n_npfctx;
     91 	kmutex_t		n_lock;
     92 	LIST_HEAD(, npf_nat)	n_nat_list;
     93 	unsigned		n_refcnt;
     94 	uint64_t		n_id;
     95 
     96 	/*
     97 	 * Translation type, flags, address or table and the port.
     98 	 * Additionally, there may be translation algorithm and any
     99 	 * auxiliary data, e.g. NPTv6 adjustment value.
    100 	 *
    101 	 * NPF_NP_CMP_START mark starts here.
    102 	 */
    103 	unsigned		n_type;
    104 	unsigned		n_flags;
    105 	unsigned		n_alen;
    106 
    107 	npf_addr_t		n_taddr;
    108 	npf_netmask_t		n_tmask;
    109 	in_port_t		n_tport;
    110 	unsigned		n_tid;
    111 
    112 	unsigned		n_algo;
    113 	union {
    114 		unsigned	n_rr_idx;
    115 		uint16_t	n_npt66_adj;
    116 	};
    117 };
    118 
    119 /*
    120  * Private flags - must be in the NPF_NAT_PRIVMASK range.
    121  */
    122 #define	NPF_NAT_USETABLE	(0x01000000 & NPF_NAT_PRIVMASK)
    123 
    124 #define	NPF_NP_CMP_START	offsetof(npf_natpolicy_t, n_type)
    125 #define	NPF_NP_CMP_SIZE		(sizeof(npf_natpolicy_t) - NPF_NP_CMP_START)
    126 
    127 /*
    128  * NAT entry for a connection.
    129  */
    130 struct npf_nat {
    131 	/* Associated NAT policy. */
    132 	npf_natpolicy_t *	nt_natpolicy;
    133 
    134 	uint16_t		nt_ifid;
    135 	uint16_t		nt_alen;
    136 
    137 	/*
    138 	 * Translation address as well as the original address which is
    139 	 * used for backwards translation.  The same for ports.
    140 	 */
    141 	npf_addr_t		nt_taddr;
    142 	npf_addr_t		nt_oaddr;
    143 
    144 	in_port_t		nt_oport;
    145 	in_port_t		nt_tport;
    146 
    147 	/* ALG (if any) associated with this NAT entry. */
    148 	npf_alg_t *		nt_alg;
    149 	uintptr_t		nt_alg_arg;
    150 
    151 	LIST_ENTRY(npf_nat)	nt_entry;
    152 	npf_conn_t *		nt_conn;
    153 };
    154 
    155 static pool_cache_t		nat_cache	__read_mostly;
    156 
    157 /*
    158  * npf_nat_sys{init,fini}: initialize/destroy NAT subsystem structures.
    159  */
    160 
    161 void
    162 npf_nat_sysinit(void)
    163 {
    164 	nat_cache = pool_cache_init(sizeof(npf_nat_t), 0,
    165 	    0, 0, "npfnatpl", NULL, IPL_NET, NULL, NULL, NULL);
    166 	KASSERT(nat_cache != NULL);
    167 }
    168 
    169 void
    170 npf_nat_sysfini(void)
    171 {
    172 	/* All NAT policies should already be destroyed. */
    173 	pool_cache_destroy(nat_cache);
    174 }
    175 
    176 /*
    177  * npf_natpolicy_create: create a new NAT policy.
    178  */
    179 npf_natpolicy_t *
    180 npf_natpolicy_create(npf_t *npf, const nvlist_t *nat, npf_ruleset_t *rset)
    181 {
    182 	npf_natpolicy_t *np;
    183 	const void *addr;
    184 	size_t len;
    185 
    186 	np = kmem_zalloc(sizeof(npf_natpolicy_t), KM_SLEEP);
    187 	atomic_store_relaxed(&np->n_refcnt, 1);
    188 	np->n_npfctx = npf;
    189 
    190 	/* The translation type, flags and policy ID. */
    191 	np->n_type = dnvlist_get_number(nat, "type", 0);
    192 	np->n_flags = dnvlist_get_number(nat, "flags", 0) & ~NPF_NAT_PRIVMASK;
    193 	np->n_id = dnvlist_get_number(nat, "nat-policy", 0);
    194 
    195 	/* Should be exclusively either inbound or outbound NAT. */
    196 	if (((np->n_type == NPF_NATIN) ^ (np->n_type == NPF_NATOUT)) == 0) {
    197 		goto err;
    198 	}
    199 	mutex_init(&np->n_lock, MUTEX_DEFAULT, IPL_SOFTNET);
    200 	LIST_INIT(&np->n_nat_list);
    201 
    202 	/*
    203 	 * Translation IP, mask and port (if applicable).  If using the
    204 	 * the table, specified by the ID, then the nat-addr/nat-mask will
    205 	 * be used as a filter for the addresses selected from table.
    206 	 */
    207 	if (nvlist_exists_number(nat, "nat-table-id")) {
    208 		if (np->n_flags & NPF_NAT_STATIC) {
    209 			goto err;
    210 		}
    211 		np->n_tid = nvlist_get_number(nat, "nat-table-id");
    212 		np->n_tmask = NPF_NO_NETMASK;
    213 		np->n_flags |= NPF_NAT_USETABLE;
    214 	} else {
    215 		addr = dnvlist_get_binary(nat, "nat-addr", &len, NULL, 0);
    216 		if (!addr || len == 0 || len > sizeof(npf_addr_t)) {
    217 			goto err;
    218 		}
    219 		memcpy(&np->n_taddr, addr, len);
    220 		np->n_alen = len;
    221 		np->n_tmask = dnvlist_get_number(nat, "nat-mask", NPF_NO_NETMASK);
    222 		if (npf_netmask_check(np->n_alen, np->n_tmask)) {
    223 			goto err;
    224 		}
    225 	}
    226 	np->n_tport = dnvlist_get_number(nat, "nat-port", 0);
    227 
    228 	/*
    229 	 * NAT algorithm.
    230 	 */
    231 	np->n_algo = dnvlist_get_number(nat, "nat-algo", 0);
    232 	switch (np->n_algo) {
    233 	case NPF_ALGO_NPT66:
    234 		np->n_npt66_adj = dnvlist_get_number(nat, "npt66-adj", 0);
    235 		break;
    236 	case NPF_ALGO_NETMAP:
    237 		break;
    238 	case NPF_ALGO_IPHASH:
    239 	case NPF_ALGO_RR:
    240 	default:
    241 		if (np->n_tmask != NPF_NO_NETMASK) {
    242 			goto err;
    243 		}
    244 		break;
    245 	}
    246 	return np;
    247 err:
    248 	mutex_destroy(&np->n_lock);
    249 	kmem_free(np, sizeof(npf_natpolicy_t));
    250 	return NULL;
    251 }
    252 
    253 int
    254 npf_natpolicy_export(const npf_natpolicy_t *np, nvlist_t *nat)
    255 {
    256 	nvlist_add_number(nat, "nat-policy", np->n_id);
    257 	nvlist_add_number(nat, "type", np->n_type);
    258 	nvlist_add_number(nat, "flags", np->n_flags);
    259 
    260 	if (np->n_flags & NPF_NAT_USETABLE) {
    261 		nvlist_add_number(nat, "nat-table-id", np->n_tid);
    262 	} else {
    263 		nvlist_add_binary(nat, "nat-addr", &np->n_taddr, np->n_alen);
    264 		nvlist_add_number(nat, "nat-mask", np->n_tmask);
    265 	}
    266 	nvlist_add_number(nat, "nat-port", np->n_tport);
    267 	nvlist_add_number(nat, "nat-algo", np->n_algo);
    268 
    269 	switch (np->n_algo) {
    270 	case NPF_ALGO_NPT66:
    271 		nvlist_add_number(nat, "npt66-adj", np->n_npt66_adj);
    272 		break;
    273 	}
    274 	return 0;
    275 }
    276 
    277 static void
    278 npf_natpolicy_release(npf_natpolicy_t *np)
    279 {
    280 	KASSERT(atomic_load_relaxed(&np->n_refcnt) > 0);
    281 
    282 	membar_release();
    283 	if (atomic_dec_uint_nv(&np->n_refcnt) != 0) {
    284 		return;
    285 	}
    286 	membar_acquire();
    287 	KASSERT(LIST_EMPTY(&np->n_nat_list));
    288 	mutex_destroy(&np->n_lock);
    289 	kmem_free(np, sizeof(npf_natpolicy_t));
    290 }
    291 
    292 /*
    293  * npf_natpolicy_destroy: free the NAT policy.
    294  *
    295  * => Called from npf_rule_free() during the reload via npf_ruleset_destroy().
    296  * => At this point, NAT policy cannot acquire new references.
    297  */
    298 void
    299 npf_natpolicy_destroy(npf_natpolicy_t *np)
    300 {
    301 	/*
    302 	 * Drain the references.  If there are active NAT connections,
    303 	 * then expire them and kick the worker.
    304 	 */
    305 	if (atomic_load_relaxed(&np->n_refcnt) > 1) {
    306 		npf_nat_t *nt;
    307 
    308 		mutex_enter(&np->n_lock);
    309 		LIST_FOREACH(nt, &np->n_nat_list, nt_entry) {
    310 			npf_conn_t *con = nt->nt_conn;
    311 			KASSERT(con != NULL);
    312 			npf_conn_expire(con);
    313 		}
    314 		mutex_exit(&np->n_lock);
    315 		npf_worker_signal(np->n_npfctx);
    316 	}
    317 	KASSERT(atomic_load_relaxed(&np->n_refcnt) >= 1);
    318 
    319 	/*
    320 	 * Drop the initial reference, but it might not be the last one.
    321 	 * If so, the last reference will be triggered via:
    322 	 *
    323 	 * npf_conn_destroy() -> npf_nat_destroy() -> npf_natpolicy_release()
    324 	 */
    325 	npf_natpolicy_release(np);
    326 }
    327 
    328 void
    329 npf_nat_freealg(npf_natpolicy_t *np, npf_alg_t *alg)
    330 {
    331 	npf_nat_t *nt;
    332 
    333 	mutex_enter(&np->n_lock);
    334 	LIST_FOREACH(nt, &np->n_nat_list, nt_entry) {
    335 		if (nt->nt_alg == alg) {
    336 			npf_alg_destroy(np->n_npfctx, alg, nt, nt->nt_conn);
    337 			nt->nt_alg = NULL;
    338 		}
    339 	}
    340 	mutex_exit(&np->n_lock);
    341 }
    342 
    343 /*
    344  * npf_natpolicy_cmp: compare two NAT policies.
    345  *
    346  * => Return 0 on match, and non-zero otherwise.
    347  */
    348 bool
    349 npf_natpolicy_cmp(npf_natpolicy_t *np, npf_natpolicy_t *mnp)
    350 {
    351 	const void *np_raw, *mnp_raw;
    352 
    353 	/*
    354 	 * Compare the relevant NAT policy information (in its raw form)
    355 	 * that is enough as a matching criteria.
    356 	 */
    357 	KASSERT(np && mnp && np != mnp);
    358 	np_raw = (const uint8_t *)np + NPF_NP_CMP_START;
    359 	mnp_raw = (const uint8_t *)mnp + NPF_NP_CMP_START;
    360 	return memcmp(np_raw, mnp_raw, NPF_NP_CMP_SIZE) == 0;
    361 }
    362 
    363 void
    364 npf_nat_setid(npf_natpolicy_t *np, uint64_t id)
    365 {
    366 	np->n_id = id;
    367 }
    368 
    369 uint64_t
    370 npf_nat_getid(const npf_natpolicy_t *np)
    371 {
    372 	return np->n_id;
    373 }
    374 
    375 /*
    376  * npf_nat_which: tell which address (source or destination) should be
    377  * rewritten given the combination of the NAT type and flow direction.
    378  *
    379  * => Returns NPF_SRC or NPF_DST constant.
    380  */
    381 static inline unsigned
    382 npf_nat_which(const unsigned type, const npf_flow_t flow)
    383 {
    384 	unsigned which;
    385 
    386 	/* The logic below relies on these values being 0 or 1. */
    387 	CTASSERT(NPF_SRC == 0 && NPF_DST == 1);
    388 	CTASSERT(NPF_FLOW_FORW == NPF_SRC && NPF_FLOW_BACK == NPF_DST);
    389 
    390 	KASSERT(type == NPF_NATIN || type == NPF_NATOUT);
    391 	KASSERT(flow == NPF_FLOW_FORW || flow == NPF_FLOW_BACK);
    392 
    393 	/*
    394 	 * Outbound NAT rewrites:
    395 	 *
    396 	 * - Source (NPF_SRC) on "forwards" stream.
    397 	 * - Destination (NPF_DST) on "backwards" stream.
    398 	 *
    399 	 * Inbound NAT is other way round.
    400 	 */
    401 	which = (type == NPF_NATOUT) ? flow : !flow;
    402 	KASSERT(which == NPF_SRC || which == NPF_DST);
    403 	return which;
    404 }
    405 
    406 /*
    407  * npf_nat_inspect: inspect packet against NAT ruleset and return a policy.
    408  *
    409  * => Acquire a reference on the policy, if found.
    410  * => NAT lookup is protected by EBR.
    411  */
    412 static npf_natpolicy_t *
    413 npf_nat_inspect(npf_cache_t *npc, const unsigned di)
    414 {
    415 	npf_t *npf = npc->npc_ctx;
    416 	int slock = npf_config_read_enter(npf);
    417 	npf_ruleset_t *rlset = npf_config_natset(npf);
    418 	npf_natpolicy_t *np;
    419 	npf_rule_t *rl;
    420 
    421 	rl = npf_ruleset_inspect(npc, rlset, di, NPF_RULE_LAYER_3);
    422 	if (rl == NULL) {
    423 		npf_config_read_exit(npf, slock);
    424 		return NULL;
    425 	}
    426 	np = npf_rule_getnat(rl);
    427 	atomic_inc_uint(&np->n_refcnt);
    428 	npf_config_read_exit(npf, slock);
    429 	return np;
    430 }
    431 
    432 static void
    433 npf_nat_algo_netmap(const npf_cache_t *npc, const npf_natpolicy_t *np,
    434     const unsigned which, npf_addr_t *addr)
    435 {
    436 	const npf_addr_t *orig_addr = npc->npc_ips[which];
    437 
    438 	/*
    439 	 * NETMAP:
    440 	 *
    441 	 *	addr = net-addr | (orig-addr & ~mask)
    442 	 */
    443 	npf_addr_mask(&np->n_taddr, np->n_tmask, npc->npc_alen, addr);
    444 	npf_addr_bitor(orig_addr, np->n_tmask, npc->npc_alen, addr);
    445 }
    446 
    447 static inline npf_addr_t *
    448 npf_nat_getaddr(npf_cache_t *npc, npf_natpolicy_t *np, const unsigned alen)
    449 {
    450 	npf_tableset_t *ts = npf_config_tableset(np->n_npfctx);
    451 	npf_table_t *t = npf_tableset_getbyid(ts, np->n_tid);
    452 	unsigned idx;
    453 
    454 	/*
    455 	 * Dynamically select the translation IP address.
    456 	 */
    457 	switch (np->n_algo) {
    458 	case NPF_ALGO_RR:
    459 		idx = atomic_inc_uint_nv(&np->n_rr_idx);
    460 		break;
    461 	case NPF_ALGO_IPHASH:
    462 	default:
    463 		idx = npf_addr_mix(alen,
    464 		    npc->npc_ips[NPF_SRC],
    465 		    npc->npc_ips[NPF_DST]);
    466 		break;
    467 	}
    468 	return npf_table_getsome(t, alen, idx);
    469 }
    470 
    471 /*
    472  * npf_nat_create: create a new NAT translation entry.
    473  *
    474  * => The caller must pass the NAT policy with a reference acquired for us.
    475  */
    476 static npf_nat_t *
    477 npf_nat_create(npf_cache_t *npc, npf_natpolicy_t *np, npf_conn_t *con)
    478 {
    479 	const unsigned proto = npc->npc_proto;
    480 	const unsigned alen = npc->npc_alen;
    481 	const nbuf_t *nbuf = npc->npc_nbuf;
    482 	npf_t *npf = npc->npc_ctx;
    483 	npf_addr_t *taddr;
    484 	npf_nat_t *nt;
    485 
    486 	KASSERT(npf_iscached(npc, NPC_IP46));
    487 	KASSERT(npf_iscached(npc, NPC_LAYER4));
    488 
    489 	/* Construct a new NAT entry and associate it with the connection. */
    490 	nt = pool_cache_get(nat_cache, PR_NOWAIT);
    491 	if (__predict_false(!nt)) {
    492 		return NULL;
    493 	}
    494 	npf_stats_inc(npf, NPF_STAT_NAT_CREATE);
    495 	nt->nt_natpolicy = np;
    496 	nt->nt_conn = con;
    497 	nt->nt_alg = NULL;
    498 
    499 	/*
    500 	 * Save the interface ID.
    501 	 *
    502 	 * Note: this can be different from the given connection if it
    503 	 * was established on a different interface, using the global state
    504 	 * mode (state.key.interface = 0).
    505 	 */
    506 	KASSERT(nbuf->nb_ifid != 0);
    507 	nt->nt_ifid = nbuf->nb_ifid;
    508 
    509 	/*
    510 	 * Select the translation address.
    511 	 */
    512 	if (np->n_flags & NPF_NAT_USETABLE) {
    513 		int slock = npf_config_read_enter(npf);
    514 		taddr = npf_nat_getaddr(npc, np, alen);
    515 		if (__predict_false(!taddr)) {
    516 			npf_config_read_exit(npf, slock);
    517 			pool_cache_put(nat_cache, nt);
    518 			return NULL;
    519 		}
    520 		memcpy(&nt->nt_taddr, taddr, alen);
    521 		npf_config_read_exit(npf, slock);
    522 
    523 	} else if (np->n_algo == NPF_ALGO_NETMAP) {
    524 		const unsigned which = npf_nat_which(np->n_type, NPF_FLOW_FORW);
    525 		npf_nat_algo_netmap(npc, np, which, &nt->nt_taddr);
    526 		taddr = &nt->nt_taddr;
    527 	} else {
    528 		/* Static IP address. */
    529 		taddr = &np->n_taddr;
    530 		memcpy(&nt->nt_taddr, taddr, alen);
    531 	}
    532 	nt->nt_alen = alen;
    533 
    534 	/* Save the original address which may be rewritten. */
    535 	if (np->n_type == NPF_NATOUT) {
    536 		/* Outbound NAT: source (think internal) address. */
    537 		memcpy(&nt->nt_oaddr, npc->npc_ips[NPF_SRC], alen);
    538 	} else {
    539 		/* Inbound NAT: destination (think external) address. */
    540 		KASSERT(np->n_type == NPF_NATIN);
    541 		memcpy(&nt->nt_oaddr, npc->npc_ips[NPF_DST], alen);
    542 	}
    543 
    544 	/*
    545 	 * Port translation, if required, and if it is TCP/UDP.
    546 	 */
    547 	if ((np->n_flags & NPF_NAT_PORTS) == 0 ||
    548 	    (proto != IPPROTO_TCP && proto != IPPROTO_UDP)) {
    549 		nt->nt_oport = 0;
    550 		nt->nt_tport = 0;
    551 		goto out;
    552 	}
    553 
    554 	/* Save the relevant TCP/UDP port. */
    555 	if (proto == IPPROTO_TCP) {
    556 		const struct tcphdr *th = npc->npc_l4.tcp;
    557 		nt->nt_oport = (np->n_type == NPF_NATOUT) ?
    558 		    th->th_sport : th->th_dport;
    559 	} else {
    560 		const struct udphdr *uh = npc->npc_l4.udp;
    561 		nt->nt_oport = (np->n_type == NPF_NATOUT) ?
    562 		    uh->uh_sport : uh->uh_dport;
    563 	}
    564 
    565 	/* Get a new port for translation. */
    566 	if ((np->n_flags & NPF_NAT_PORTMAP) != 0) {
    567 		npf_portmap_t *pm = np->n_npfctx->portmap;
    568 		nt->nt_tport = npf_portmap_get(pm, alen, taddr);
    569 	} else {
    570 		nt->nt_tport = np->n_tport;
    571 	}
    572 out:
    573 	mutex_enter(&np->n_lock);
    574 	LIST_INSERT_HEAD(&np->n_nat_list, nt, nt_entry);
    575 	/* Note: we also consume the reference on policy. */
    576 	mutex_exit(&np->n_lock);
    577 	return nt;
    578 }
    579 
    580 /*
    581  * npf_dnat_translate: perform translation given the state data.
    582  */
    583 static inline int
    584 npf_dnat_translate(npf_cache_t *npc, npf_nat_t *nt, npf_flow_t flow)
    585 {
    586 	const npf_natpolicy_t *np = nt->nt_natpolicy;
    587 	const unsigned which = npf_nat_which(np->n_type, flow);
    588 	const npf_addr_t *addr;
    589 	in_port_t port;
    590 
    591 	KASSERT(npf_iscached(npc, NPC_IP46));
    592 	KASSERT(npf_iscached(npc, NPC_LAYER4));
    593 
    594 	if (flow == NPF_FLOW_FORW) {
    595 		/* "Forwards" stream: use translation address/port. */
    596 		addr = &nt->nt_taddr;
    597 		port = nt->nt_tport;
    598 	} else {
    599 		/* "Backwards" stream: use original address/port. */
    600 		addr = &nt->nt_oaddr;
    601 		port = nt->nt_oport;
    602 	}
    603 	KASSERT((np->n_flags & NPF_NAT_PORTS) != 0 || port == 0);
    604 
    605 	/* Execute ALG translation first. */
    606 	if ((npc->npc_info & NPC_ALG_EXEC) == 0) {
    607 		npc->npc_info |= NPC_ALG_EXEC;
    608 		npf_alg_exec(npc, nt, flow);
    609 		npf_recache(npc);
    610 	}
    611 	KASSERT(!nbuf_flag_p(npc->npc_nbuf, NBUF_DATAREF_RESET));
    612 
    613 	/* Finally, perform the translation. */
    614 	return npf_napt_rwr(npc, which, addr, port);
    615 }
    616 
    617 /*
    618  * npf_snat_translate: perform translation given the algorithm.
    619  */
    620 static inline int
    621 npf_snat_translate(npf_cache_t *npc, const npf_natpolicy_t *np, npf_flow_t flow)
    622 {
    623 	const unsigned which = npf_nat_which(np->n_type, flow);
    624 	const npf_addr_t *taddr;
    625 	npf_addr_t addr;
    626 
    627 	KASSERT(np->n_flags & NPF_NAT_STATIC);
    628 
    629 	switch (np->n_algo) {
    630 	case NPF_ALGO_NETMAP:
    631 		npf_nat_algo_netmap(npc, np, which, &addr);
    632 		taddr = &addr;
    633 		break;
    634 	case NPF_ALGO_NPT66:
    635 		return npf_npt66_rwr(npc, which, &np->n_taddr,
    636 		    np->n_tmask, np->n_npt66_adj);
    637 	default:
    638 		taddr = &np->n_taddr;
    639 		break;
    640 	}
    641 	return npf_napt_rwr(npc, which, taddr, np->n_tport);
    642 }
    643 
    644 /*
    645  * Associate NAT policy with an existing connection state.
    646  */
    647 npf_nat_t *
    648 npf_nat_share_policy(npf_cache_t *npc, npf_conn_t *con, npf_nat_t *src_nt)
    649 {
    650 	npf_natpolicy_t *np = src_nt->nt_natpolicy;
    651 	npf_nat_t *nt;
    652 	int ret;
    653 
    654 	/* Create a new NAT entry. */
    655 	nt = npf_nat_create(npc, np, con);
    656 	if (__predict_false(nt == NULL)) {
    657 		return NULL;
    658 	}
    659 	atomic_inc_uint(&np->n_refcnt);
    660 
    661 	/* Associate the NAT translation entry with the connection. */
    662 	ret = npf_conn_setnat(npc, con, nt, np->n_type);
    663 	if (__predict_false(ret)) {
    664 		/* Will release the reference. */
    665 		npf_nat_destroy(con, nt);
    666 		return NULL;
    667 	}
    668 	return nt;
    669 }
    670 
    671 /*
    672  * npf_nat_lookup: lookup the (dynamic) NAT state and return its entry,
    673  *
    674  * => Checks that the packet is on the interface where NAT policy is applied.
    675  * => Determines the flow direction in the context of the NAT policy.
    676  */
    677 static npf_nat_t *
    678 npf_nat_lookup(const npf_cache_t *npc, npf_conn_t *con,
    679     const unsigned di, npf_flow_t *flow)
    680 {
    681 	const nbuf_t *nbuf = npc->npc_nbuf;
    682 	const npf_natpolicy_t *np;
    683 	npf_nat_t *nt;
    684 
    685 	if ((nt = npf_conn_getnat(con)) == NULL) {
    686 		return NULL;
    687 	}
    688 	if (nt->nt_ifid != nbuf->nb_ifid) {
    689 		return NULL;
    690 	}
    691 
    692 	np = nt->nt_natpolicy;
    693 	KASSERT(atomic_load_relaxed(&np->n_refcnt) > 0);
    694 
    695 	/*
    696 	 * We rely on NPF_NAT{IN,OUT} being equal to PFIL_{IN,OUT}.
    697 	 */
    698 	CTASSERT(NPF_NATIN == PFIL_IN && NPF_NATOUT == PFIL_OUT);
    699 	*flow = (np->n_type == di) ? NPF_FLOW_FORW : NPF_FLOW_BACK;
    700 	return nt;
    701 }
    702 
    703 /*
    704  * npf_do_nat:
    705  *
    706  *	- Inspect packet for a NAT policy, unless a connection with a NAT
    707  *	  association already exists.  In such case, determine whether it
    708  *	  is a "forwards" or "backwards" stream.
    709  *
    710  *	- Perform translation: rewrite source or destination fields,
    711  *	  depending on translation type and direction.
    712  *
    713  *	- Associate a NAT policy with a connection (may establish a new).
    714  */
    715 int
    716 npf_do_nat(npf_cache_t *npc, npf_conn_t *con, const unsigned di)
    717 {
    718 	nbuf_t *nbuf = npc->npc_nbuf;
    719 	npf_conn_t *ncon = NULL;
    720 	npf_natpolicy_t *np;
    721 	npf_flow_t flow;
    722 	npf_nat_t *nt;
    723 	int error;
    724 
    725 	/* All relevant data should be already cached. */
    726 	if (!npf_iscached(npc, NPC_IP46) || !npf_iscached(npc, NPC_LAYER4)) {
    727 		return 0;
    728 	}
    729 	KASSERT(!nbuf_flag_p(nbuf, NBUF_DATAREF_RESET));
    730 
    731 	/*
    732 	 * Return the NAT entry associated with the connection, if any.
    733 	 * Determines whether the stream is "forwards" or "backwards".
    734 	 * Note: no need to lock, since reference on connection is held.
    735 	 */
    736 	if (con && (nt = npf_nat_lookup(npc, con, di, &flow)) != NULL) {
    737 		np = nt->nt_natpolicy;
    738 		goto translate;
    739 	}
    740 
    741 	/*
    742 	 * Inspect the packet for a NAT policy, if there is no connection.
    743 	 * Note: acquires a reference if found.
    744 	 */
    745 	np = npf_nat_inspect(npc, di);
    746 	if (np == NULL) {
    747 		/* If packet does not match - done. */
    748 		return 0;
    749 	}
    750 	flow = NPF_FLOW_FORW;
    751 
    752 	/* Static NAT - just perform the translation. */
    753 	if (np->n_flags & NPF_NAT_STATIC) {
    754 		if (nbuf_cksum_barrier(nbuf, di)) {
    755 			npf_recache(npc);
    756 		}
    757 		error = npf_snat_translate(npc, np, flow);
    758 		npf_natpolicy_release(np);
    759 		return error;
    760 	}
    761 
    762 	/*
    763 	 * If there is no local connection (no "stateful" rule - unusual,
    764 	 * but possible configuration), establish one before translation.
    765 	 * Note that it is not a "pass" connection, therefore passing of
    766 	 * "backwards" stream depends on other, stateless filtering rules.
    767 	 */
    768 	if (con == NULL) {
    769 		ncon = npf_conn_establish(npc, di, true);
    770 		if (ncon == NULL) {
    771 			npf_natpolicy_release(np);
    772 			return ENOMEM;
    773 		}
    774 		con = ncon;
    775 	}
    776 
    777 	/*
    778 	 * Create a new NAT entry and associate with the connection.
    779 	 * We will consume the reference on success (release on error).
    780 	 */
    781 	nt = npf_nat_create(npc, np, con);
    782 	if (nt == NULL) {
    783 		npf_natpolicy_release(np);
    784 		error = ENOMEM;
    785 		goto out;
    786 	}
    787 
    788 	/* Determine whether any ALG matches. */
    789 	if (npf_alg_match(npc, nt, di)) {
    790 		KASSERT(nt->nt_alg != NULL);
    791 	}
    792 
    793 	/* Associate the NAT translation entry with the connection. */
    794 	error = npf_conn_setnat(npc, con, nt, np->n_type);
    795 	if (error) {
    796 		/* Will release the reference. */
    797 		npf_nat_destroy(con, nt);
    798 		goto out;
    799 	}
    800 
    801 translate:
    802 	/* May need to process the delayed checksums first (XXX: NetBSD). */
    803 	if (nbuf_cksum_barrier(nbuf, di)) {
    804 		npf_recache(npc);
    805 	}
    806 
    807 	/* Perform the translation. */
    808 	error = npf_dnat_translate(npc, nt, flow);
    809 out:
    810 	if (__predict_false(ncon)) {
    811 		if (error) {
    812 			/* It was created for NAT - just expire. */
    813 			npf_conn_expire(ncon);
    814 		}
    815 		npf_conn_release(ncon);
    816 	}
    817 	return error;
    818 }
    819 
    820 /*
    821  * npf_nat_gettrans: return translation IP address and port.
    822  */
    823 void
    824 npf_nat_gettrans(npf_nat_t *nt, npf_addr_t **addr, in_port_t *port)
    825 {
    826 	*addr = &nt->nt_taddr;
    827 	*port = nt->nt_tport;
    828 }
    829 
    830 /*
    831  * npf_nat_getorig: return original IP address and port from translation entry.
    832  */
    833 void
    834 npf_nat_getorig(npf_nat_t *nt, npf_addr_t **addr, in_port_t *port)
    835 {
    836 	*addr = &nt->nt_oaddr;
    837 	*port = nt->nt_oport;
    838 }
    839 
    840 /*
    841  * npf_nat_setalg: associate an ALG with the NAT entry.
    842  */
    843 void
    844 npf_nat_setalg(npf_nat_t *nt, npf_alg_t *alg, uintptr_t arg)
    845 {
    846 	nt->nt_alg = alg;
    847 	nt->nt_alg_arg = arg;
    848 }
    849 
    850 npf_alg_t *
    851 npf_nat_getalg(const npf_nat_t *nt)
    852 {
    853 	return nt->nt_alg;
    854 }
    855 
    856 uintptr_t
    857 npf_nat_getalgarg(const npf_nat_t *nt)
    858 {
    859 	return nt->nt_alg_arg;
    860 }
    861 
    862 /*
    863  * npf_nat_destroy: destroy NAT structure (performed on connection expiration).
    864  */
    865 void
    866 npf_nat_destroy(npf_conn_t *con, npf_nat_t *nt)
    867 {
    868 	npf_natpolicy_t *np = nt->nt_natpolicy;
    869 	npf_t *npf = np->n_npfctx;
    870 	npf_alg_t *alg;
    871 
    872 	/* Execute the ALG destroy callback, if any. */
    873 	if ((alg = npf_nat_getalg(nt)) != NULL) {
    874 		npf_alg_destroy(npf, alg, nt, con);
    875 		nt->nt_alg = NULL;
    876 	}
    877 
    878 	/* Return taken port to the portmap. */
    879 	if ((np->n_flags & NPF_NAT_PORTMAP) != 0 && nt->nt_tport) {
    880 		npf_portmap_t *pm = npf->portmap;
    881 		npf_portmap_put(pm, nt->nt_alen, &nt->nt_taddr, nt->nt_tport);
    882 	}
    883 	npf_stats_inc(np->n_npfctx, NPF_STAT_NAT_DESTROY);
    884 
    885 	/*
    886 	 * Remove the connection from the list and drop the reference on
    887 	 * the NAT policy.  Note: this might trigger its destruction.
    888 	 */
    889 	mutex_enter(&np->n_lock);
    890 	LIST_REMOVE(nt, nt_entry);
    891 	mutex_exit(&np->n_lock);
    892 	npf_natpolicy_release(np);
    893 
    894 	pool_cache_put(nat_cache, nt);
    895 }
    896 
    897 /*
    898  * npf_nat_export: serialize the NAT entry with a NAT policy ID.
    899  */
    900 void
    901 npf_nat_export(npf_t *npf, const npf_nat_t *nt, nvlist_t *con_nv)
    902 {
    903 	npf_natpolicy_t *np = nt->nt_natpolicy;
    904 	unsigned alen = nt->nt_alen;
    905 	nvlist_t *nat_nv;
    906 
    907 	nat_nv = nvlist_create(0);
    908 	if (nt->nt_ifid) {
    909 		char ifname[IFNAMSIZ];
    910 		npf_ifmap_copyname(npf, nt->nt_ifid, ifname, sizeof(ifname));
    911 		nvlist_add_string(nat_nv, "ifname", ifname);
    912 	}
    913 	nvlist_add_number(nat_nv, "alen", alen);
    914 
    915 	nvlist_add_binary(nat_nv, "oaddr", &nt->nt_oaddr, alen);
    916 	nvlist_add_number(nat_nv, "oport", nt->nt_oport);
    917 
    918 	nvlist_add_binary(nat_nv, "taddr", &nt->nt_taddr, alen);
    919 	nvlist_add_number(nat_nv, "tport", nt->nt_tport);
    920 
    921 	nvlist_add_number(nat_nv, "nat-policy", np->n_id);
    922 	nvlist_move_nvlist(con_nv, "nat", nat_nv);
    923 }
    924 
    925 /*
    926  * npf_nat_import: find the NAT policy and unserialize the NAT entry.
    927  */
    928 npf_nat_t *
    929 npf_nat_import(npf_t *npf, const nvlist_t *nat,
    930     npf_ruleset_t *natlist, npf_conn_t *con)
    931 {
    932 	npf_natpolicy_t *np;
    933 	npf_nat_t *nt;
    934 	const char *ifname;
    935 	const void *taddr, *oaddr;
    936 	size_t alen, len;
    937 	uint64_t np_id;
    938 
    939 	np_id = dnvlist_get_number(nat, "nat-policy", UINT64_MAX);
    940 	if ((np = npf_ruleset_findnat(natlist, np_id)) == NULL) {
    941 		return NULL;
    942 	}
    943 	nt = pool_cache_get(nat_cache, PR_WAITOK);
    944 	memset(nt, 0, sizeof(npf_nat_t));
    945 
    946 	ifname = dnvlist_get_string(nat, "ifname", NULL);
    947 	if (ifname && (nt->nt_ifid = npf_ifmap_register(npf, ifname)) == 0) {
    948 		goto err;
    949 	}
    950 
    951 	alen = dnvlist_get_number(nat, "alen", 0);
    952 	if (alen == 0 || alen > sizeof(npf_addr_t)) {
    953 		goto err;
    954 	}
    955 
    956 	taddr = dnvlist_get_binary(nat, "taddr", &len, NULL, 0);
    957 	if (!taddr || len != alen) {
    958 		goto err;
    959 	}
    960 	memcpy(&nt->nt_taddr, taddr, sizeof(npf_addr_t));
    961 
    962 	oaddr = dnvlist_get_binary(nat, "oaddr", &len, NULL, 0);
    963 	if (!oaddr || len != alen) {
    964 		goto err;
    965 	}
    966 	memcpy(&nt->nt_oaddr, oaddr, sizeof(npf_addr_t));
    967 
    968 	nt->nt_oport = dnvlist_get_number(nat, "oport", 0);
    969 	nt->nt_tport = dnvlist_get_number(nat, "tport", 0);
    970 
    971 	/* Take a specific port from port-map. */
    972 	if ((np->n_flags & NPF_NAT_PORTMAP) != 0 && nt->nt_tport) {
    973 		npf_portmap_t *pm = npf->portmap;
    974 
    975 		if (!npf_portmap_take(pm, nt->nt_alen,
    976 		    &nt->nt_taddr, nt->nt_tport)) {
    977 			goto err;
    978 		}
    979 	}
    980 	npf_stats_inc(npf, NPF_STAT_NAT_CREATE);
    981 
    982 	/*
    983 	 * Associate, take a reference and insert.  Unlocked/non-atomic
    984 	 * since the policy is not yet globally visible.
    985 	 */
    986 	nt->nt_natpolicy = np;
    987 	nt->nt_conn = con;
    988 	atomic_store_relaxed(&np->n_refcnt,
    989 	    atomic_load_relaxed(&np->n_refcnt) + 1);
    990 	LIST_INSERT_HEAD(&np->n_nat_list, nt, nt_entry);
    991 	return nt;
    992 err:
    993 	pool_cache_put(nat_cache, nt);
    994 	return NULL;
    995 }
    996 
    997 #if defined(DDB) || defined(_NPF_TESTING)
    998 
    999 void
   1000 npf_nat_dump(const npf_nat_t *nt)
   1001 {
   1002 	const npf_natpolicy_t *np;
   1003 	struct in_addr ip;
   1004 
   1005 	np = nt->nt_natpolicy;
   1006 	memcpy(&ip, &nt->nt_taddr, sizeof(ip));
   1007 	printf("\tNATP(%p): type %u flags 0x%x taddr %s tport %d\n", np,
   1008 	    np->n_type, np->n_flags, inet_ntoa(ip), ntohs(np->n_tport));
   1009 	memcpy(&ip, &nt->nt_oaddr, sizeof(ip));
   1010 	printf("\tNAT: original address %s oport %d tport %d\n",
   1011 	    inet_ntoa(ip), ntohs(nt->nt_oport), ntohs(nt->nt_tport));
   1012 	if (nt->nt_alg) {
   1013 		printf("\tNAT ALG = %p, ARG = %p\n",
   1014 		    nt->nt_alg, (void *)nt->nt_alg_arg);
   1015 	}
   1016 }
   1017 
   1018 #endif
   1019