/*-
 * Copyright (c) 2014-2018 Mindaugas Rasiukevicius <rmind at netbsd org>
 * Copyright (c) 2010-2014 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This material is based upon work partially supported by The
 * NetBSD Foundation under a contract with Mindaugas Rasiukevicius.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * NPF connection tracking for stateful filtering and translation.
 *
 * Overview
 *
 *	Packets can be incoming or outgoing with respect to an interface.
 *	Connection direction is identified by the direction of its first
 *	packet.  The meaning of incoming/outgoing packet in the context of
 *	connection direction can be confusing.  Therefore, we will use the
 *	terms "forwards stream" and "backwards stream", where packets in
 *	the forwards stream are the packets travelling in the same
 *	direction as the connection.
 *
 *	All connections have two keys and thus two entries:
 *
 *		npf_conn_t::c_forw_entry for the forwards stream and
 *		npf_conn_t::c_back_entry for the backwards stream.
 *
 *	The keys are formed from the 5-tuple (source/destination address,
 *	source/destination port and the protocol).  Additional matching
 *	is performed for the interface (the common behaviour is equivalent
 *	to a 6-tuple lookup including the interface ID).  Note that the
 *	key may be formed using translated values in the case of NAT.
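 *
 *	For example (illustrative addresses and ports), a TCP connection
 *	initiated from 10.0.0.1:12345 to 10.0.0.2:80 would have the
 *	forwards key <TCP, 10.0.0.1:12345 -> 10.0.0.2:80> and the
 *	backwards key <TCP, 10.0.0.2:80 -> 10.0.0.1:12345>.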
 *
 *	Connections can serve two purposes: implicit passing or dynamic
 *	NAT.  Connections for the former purpose are created by rules with
 *	the "stateful" attribute and are used for stateful filtering.
 *	Such connections indicate that the packets of the backwards stream
 *	should be passed without inspection of the ruleset.  The other
 *	purpose is to associate a dynamic NAT mechanism with a connection.
 *	Such connections are created by the NAT policies and have a
 *	relationship with the NAT translation structure via
 *	npf_conn_t::c_nat.  A single connection can serve both purposes,
 *	which is a common case.
 *
 * Connection life-cycle
 *
 *	Connections are established when a packet matches such a rule or
 *	a NAT policy.  Both keys of the established connection are inserted
 *	into the connection database.  A garbage collection thread
 *	periodically scans all connections and, depending on the connection
 *	properties (e.g. last activity time, protocol), removes connection
 *	entries and expires the actual connections.
 *
 *	Each connection has a reference count.  The reference is acquired
 *	on lookup and should be released by the caller.  It guarantees that
 *	the connection will not be destroyed, although it may be expired.
 *
 * Synchronisation
 *
 *	The connection database is accessed in a lock-less manner by the
 *	main routines: npf_conn_inspect() and npf_conn_establish().  Since
 *	they are always called from a software interrupt, the database is
 *	protected using passive serialisation.  The main place which can
 *	destroy a connection is npf_conn_worker().  The database itself
 *	can be replaced and destroyed in npf_conn_load().
 *
 * ALG support
 *
 *	Application-level gateways (ALGs) can override generic connection
 *	inspection (the npf_alg_conn() call in the npf_conn_inspect()
 *	function) by performing their own lookup using a different key.
 *	A recursive call to npf_conn_inspect() is not allowed.  The ALGs
 *	ought to use the npf_conn_lookup() function for this purpose.
 *
 * Lock order
 *
 *	npf_config_lock ->
 *		conn_lock ->
 *			npf_conn_t::c_lock
 */

#ifdef _KERNEL
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: npf_conn.c,v 1.26 2019/01/19 21:19:31 rmind Exp $");

#include <sys/param.h>
#include <sys/types.h>

#include <netinet/in.h>
#include <netinet/tcp.h>

#include <sys/atomic.h>
#include <sys/condvar.h>
#include <sys/kmem.h>
#include <sys/kthread.h>
#include <sys/mutex.h>
#include <net/pfil.h>
#include <sys/pool.h>
#include <sys/queue.h>
#include <sys/systm.h>
#endif

#define __NPF_CONN_PRIVATE
#include "npf_conn.h"
#include "npf_impl.h"

/*
 * Connection flags: PFIL_IN and PFIL_OUT values are reserved for direction.
 */
CTASSERT(PFIL_ALL == (0x001 | 0x002));
#define	CONN_ACTIVE	0x004	/* visible on inspection */
#define	CONN_PASS	0x008	/* perform implicit passing */
#define	CONN_EXPIRE	0x010	/* explicitly expire */
#define	CONN_REMOVED	0x020	/* "forw/back" entries removed */

enum { CONN_TRACKING_OFF, CONN_TRACKING_ON };

static nvlist_t *npf_conn_export(npf_t *, const npf_conn_t *);

/*
 * npf_conn_{init,fini}: initialise/destroy connection tracking.
 */

void
npf_conn_init(npf_t *npf, int flags)
{
	npf->conn_cache = pool_cache_init(sizeof(npf_conn_t), coherency_unit,
	    0, 0, "npfconpl", NULL, IPL_NET, NULL, NULL, NULL);
	mutex_init(&npf->conn_lock, MUTEX_DEFAULT, IPL_NONE);
	npf->conn_tracking = CONN_TRACKING_OFF;
	npf->conn_db = npf_conndb_create();

	if ((flags & NPF_NO_GC) == 0) {
		npf_worker_register(npf, npf_conn_worker);
	}
}

void
npf_conn_fini(npf_t *npf)
{
	/* Note: the caller should have flushed the connections. */
	KASSERT(npf->conn_tracking == CONN_TRACKING_OFF);
	npf_worker_unregister(npf, npf_conn_worker);

	npf_conndb_destroy(npf->conn_db);
	pool_cache_destroy(npf->conn_cache);
	mutex_destroy(&npf->conn_lock);
}

/*
 * npf_conn_load: perform the load by flushing the current connection
 * database and replacing it with the new one or just destroying it.
 *
 * => The caller must disable the connection tracking and ensure that
 *    there are no connection database lookups or references in-flight.
 */
void
npf_conn_load(npf_t *npf, npf_conndb_t *ndb, bool track)
{
	npf_conndb_t *odb = NULL;

	KASSERT(npf_config_locked_p(npf));

	/*
	 * The connection database is in the quiescent state.
	 * Prevent G/C thread from running and install a new database.
	 */
	mutex_enter(&npf->conn_lock);
	if (ndb) {
		KASSERT(npf->conn_tracking == CONN_TRACKING_OFF);
		odb = npf->conn_db;
		npf->conn_db = ndb;
		membar_sync();
	}
	if (track) {
		/* After this point lookups start flying in. */
		npf->conn_tracking = CONN_TRACKING_ON;
	}
	mutex_exit(&npf->conn_lock);

	if (odb) {
		/*
		 * Flush all, no sync since the caller did it for us.
		 * Also, release the pool cache memory.
		 */
		npf_conndb_gc(npf, odb, true, false);
		npf_conndb_destroy(odb);
		pool_cache_invalidate(npf->conn_cache);
	}
}

/*
 * npf_conn_tracking: enable/disable connection tracking.
 */
void
npf_conn_tracking(npf_t *npf, bool track)
{
	KASSERT(npf_config_locked_p(npf));
	npf->conn_tracking = track ? CONN_TRACKING_ON : CONN_TRACKING_OFF;
}

static inline bool
npf_conn_trackable_p(const npf_cache_t *npc)
{
	const npf_t *npf = npc->npc_ctx;

	/*
	 * Check if connection tracking is on.  Also, if layers 3 and 4
	 * are not cached, the protocol is not supported or the packet
	 * is invalid.
	 */
	if (npf->conn_tracking != CONN_TRACKING_ON) {
		return false;
	}
	if (!npf_iscached(npc, NPC_IP46) || !npf_iscached(npc, NPC_LAYER4)) {
		return false;
	}
	return true;
}

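/*
 * connkey_setkey: construct a connection key from the given protocol,
 * address pair and identifier (port) pair; 'forw' selects whether the
 * key is for the forwards or the backwards stream.
 *
 * => Returns the key length in bytes.
 */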
static uint32_t
connkey_setkey(npf_connkey_t *key, uint16_t proto, const void *ipv,
    const uint16_t *id, unsigned alen, bool forw)
{
	uint32_t isrc, idst, *k = key->ck_key;
	const npf_addr_t * const *ips = ipv;

	if (__predict_true(forw)) {
		isrc = NPF_SRC, idst = NPF_DST;
	} else {
		isrc = NPF_DST, idst = NPF_SRC;
	}

	/*
	 * Construct a key formed out of 32-bit integers.  The key layout:
	 *
	 *	Field: | proto  |  alen  | src-id | dst-id | src-addr | dst-addr |
	 *	       +--------+--------+--------+--------+----------+----------+
	 *	Bits:  |   16   |   16   |   16   |   16   |  32-128  |  32-128  |
	 *
	 * The source and destination are inverted if the key is for the
	 * backwards stream (forw == false).  The address length depends
	 * on the 'alen' field; it is a length in bytes, either 4 or 16.
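	 *
	 * For example (purely illustrative values), an IPv4 TCP forwards
	 * key for 10.0.0.1:12345 -> 10.0.0.2:80 would be laid out as:
	 *
	 *	k[0] = (IPPROTO_TCP << 16) | 4
	 *	k[1] = (src-id << 16) | dst-id	(IDs as found in the header)
	 *	k[2] = 10.0.0.1			(32-bit source address)
	 *	k[3] = 10.0.0.2			(32-bit destination address)
	 *
	 * and the returned key length would be 4 * sizeof(uint32_t).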
	 */

	k[0] = ((uint32_t)proto << 16) | (alen & 0xffff);
	k[1] = ((uint32_t)id[isrc] << 16) | id[idst];

	if (__predict_true(alen == sizeof(in_addr_t))) {
		k[2] = ips[isrc]->word32[0];
		k[3] = ips[idst]->word32[0];
		return 4 * sizeof(uint32_t);
	} else {
		const u_int nwords = alen >> 2;
		memcpy(&k[2], ips[isrc], alen);
		memcpy(&k[2 + nwords], ips[idst], alen);
		return (2 + (nwords * 2)) * sizeof(uint32_t);
	}
}

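/*
 * connkey_getkey: decompose the connection key back into the protocol,
 * addresses, identifiers and the address length.
 */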
static void
connkey_getkey(const npf_connkey_t *key, uint16_t *proto, npf_addr_t *ips,
    uint16_t *id, uint16_t *alen)
{
	const uint32_t *k = key->ck_key;

	*proto = k[0] >> 16;
	*alen = k[0] & 0xffff;
	id[NPF_SRC] = k[1] >> 16;
	id[NPF_DST] = k[1] & 0xffff;

	switch (*alen) {
	case sizeof(struct in6_addr):
	case sizeof(struct in_addr):
		memcpy(&ips[NPF_SRC], &k[2], *alen);
		memcpy(&ips[NPF_DST], &k[2 + ((unsigned)*alen >> 2)], *alen);
		return;
	default:
		KASSERT(0);
	}
}

/*
 * npf_conn_conkey: construct a key for the connection lookup.
 *
 * => Returns the key length in bytes or zero on failure.
 */
unsigned
npf_conn_conkey(const npf_cache_t *npc, npf_connkey_t *key, const bool forw)
{
	const u_int proto = npc->npc_proto;
	const u_int alen = npc->npc_alen;
	const struct tcphdr *th;
	const struct udphdr *uh;
	uint16_t id[2];

	switch (proto) {
	case IPPROTO_TCP:
		KASSERT(npf_iscached(npc, NPC_TCP));
		th = npc->npc_l4.tcp;
		id[NPF_SRC] = th->th_sport;
		id[NPF_DST] = th->th_dport;
		break;
	case IPPROTO_UDP:
		KASSERT(npf_iscached(npc, NPC_UDP));
		uh = npc->npc_l4.udp;
		id[NPF_SRC] = uh->uh_sport;
		id[NPF_DST] = uh->uh_dport;
		break;
	case IPPROTO_ICMP:
		if (npf_iscached(npc, NPC_ICMP_ID)) {
			const struct icmp *ic = npc->npc_l4.icmp;
			id[NPF_SRC] = ic->icmp_id;
			id[NPF_DST] = ic->icmp_id;
			break;
		}
		return 0;
	case IPPROTO_ICMPV6:
		if (npf_iscached(npc, NPC_ICMP_ID)) {
			const struct icmp6_hdr *ic6 = npc->npc_l4.icmp6;
			id[NPF_SRC] = ic6->icmp6_id;
			id[NPF_DST] = ic6->icmp6_id;
			break;
		}
		return 0;
	default:
		/* Unsupported protocol. */
		return 0;
	}
	return connkey_setkey(key, proto, npc->npc_ips, id, alen, forw);
}

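/*
 * connkey_set_addr: rewrite the source (di == NPF_SRC) or destination
 * (di == NPF_DST) address in the given connection key.
 */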
static __inline void
connkey_set_addr(npf_connkey_t *key, const npf_addr_t *naddr, const int di)
{
	const u_int alen = key->ck_key[0] & 0xffff;
	uint32_t *addr = &key->ck_key[2 + ((alen >> 2) * di)];

	KASSERT(alen > 0);
	memcpy(addr, naddr, alen);
}

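/*
 * connkey_set_id: rewrite the source (di == NPF_SRC) or destination
 * (di == NPF_DST) identifier in the given connection key, preserving
 * the other identifier.
 */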
static __inline void
connkey_set_id(npf_connkey_t *key, const uint16_t id, const int di)
{
	const uint32_t oid = key->ck_key[1];
	const u_int shift = 16 * !di;
	const uint32_t mask = 0xffff0000 >> shift;

	key->ck_key[1] = ((uint32_t)id << shift) | (oid & mask);
}

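/*
 * conn_update_atime: update the last activity time of the connection
 * using the monotonic (uptime) clock.
 */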
static inline void
conn_update_atime(npf_conn_t *con)
{
	struct timespec tsnow;

	getnanouptime(&tsnow);
	con->c_atime = tsnow.tv_sec;
}

/*
 * npf_conn_ok: check if the connection is active and has the right direction.
 */
static bool
npf_conn_ok(const npf_conn_t *con, const int di, bool forw)
{
	const uint32_t flags = con->c_flags;

	/* Check if connection is active and not expired. */
	bool ok = (flags & (CONN_ACTIVE | CONN_EXPIRE)) == CONN_ACTIVE;
	if (__predict_false(!ok)) {
		return false;
	}

	/* Check if the direction is consistent */
	bool pforw = (flags & PFIL_ALL) == (unsigned)di;
	if (__predict_false(forw != pforw)) {
		return false;
	}
	return true;
}

/*
 * npf_conn_lookup: check if there is an established connection.
 *
 * => If found, we will hold a reference for the caller.
 */
npf_conn_t *
npf_conn_lookup(const npf_cache_t *npc, const int di, bool *forw)
{
	npf_t *npf = npc->npc_ctx;
	const nbuf_t *nbuf = npc->npc_nbuf;
	npf_conn_t *con;
	npf_connkey_t key;
	u_int cifid;

	/* Construct a key and look up a connection in the store. */
	if (!npf_conn_conkey(npc, &key, true)) {
		return NULL;
	}
	con = npf_conndb_lookup(npf->conn_db, &key, forw);
	if (con == NULL) {
		return NULL;
	}
	KASSERT(npc->npc_proto == con->c_proto);

	/* Check if connection is active and not expired. */
	if (!npf_conn_ok(con, di, *forw)) {
		atomic_dec_uint(&con->c_refcnt);
		return NULL;
	}

	/*
	 * Match the interface and the direction of the connection entry
	 * and the packet.
	 */
	cifid = con->c_ifid;
	if (__predict_false(cifid && cifid != nbuf->nb_ifid)) {
		atomic_dec_uint(&con->c_refcnt);
		return NULL;
	}

	/* Update the last activity time. */
	conn_update_atime(con);
	return con;
}

/*
 * npf_conn_inspect: look up a connection and inspect the protocol data.
 *
 * => If found, we will hold a reference for the caller.
 */
npf_conn_t *
npf_conn_inspect(npf_cache_t *npc, const int di, int *error)
{
	nbuf_t *nbuf = npc->npc_nbuf;
	npf_conn_t *con;
	bool forw, ok;

	KASSERT(!nbuf_flag_p(nbuf, NBUF_DATAREF_RESET));
	if (!npf_conn_trackable_p(npc)) {
		return NULL;
	}

	/* Query the ALG, which may look up the connection for us. */
	if ((con = npf_alg_conn(npc, di)) != NULL) {
		/* Note: reference is held. */
		return con;
	}
	if (nbuf_head_mbuf(nbuf) == NULL) {
		*error = ENOMEM;
		return NULL;
	}
	KASSERT(!nbuf_flag_p(nbuf, NBUF_DATAREF_RESET));

	/* Main lookup of the connection. */
	if ((con = npf_conn_lookup(npc, di, &forw)) == NULL) {
		return NULL;
	}

	/* Inspect the protocol data and handle state changes. */
	mutex_enter(&con->c_lock);
	ok = npf_state_inspect(npc, &con->c_state, forw);
	mutex_exit(&con->c_lock);

	/* If invalid state: let the rules deal with it. */
	if (__predict_false(!ok)) {
		npf_conn_release(con);
		npf_stats_inc(npc->npc_ctx, NPF_STAT_INVALID_STATE);
		return NULL;
	}

	/*
	 * If this is a multi-end state, then specially tag the packet
	 * so it will just be passed through on other interfaces.
	 */
	if (con->c_ifid == 0 && nbuf_add_tag(nbuf, NPF_NTAG_PASS) != 0) {
		npf_conn_release(con);
		*error = ENOMEM;
		return NULL;
	}
	return con;
}

/*
 * npf_conn_establish: create a new connection, insert into the global list.
 *
 * => Connection is created with the reference held for the caller.
 * => Connection will be activated on the first reference release.
 */
npf_conn_t *
npf_conn_establish(npf_cache_t *npc, int di, bool per_if)
{
	npf_t *npf = npc->npc_ctx;
	const nbuf_t *nbuf = npc->npc_nbuf;
	npf_conn_t *con;
	int error = 0;

	KASSERT(!nbuf_flag_p(nbuf, NBUF_DATAREF_RESET));

	if (!npf_conn_trackable_p(npc)) {
		return NULL;
	}

	/* Allocate and initialise the new connection. */
	con = pool_cache_get(npf->conn_cache, PR_NOWAIT);
	if (__predict_false(!con)) {
		npf_worker_signal(npf);
		return NULL;
	}
	NPF_PRINTF(("NPF: create conn %p\n", con));
	npf_stats_inc(npf, NPF_STAT_CONN_CREATE);

	mutex_init(&con->c_lock, MUTEX_DEFAULT, IPL_SOFTNET);
	con->c_flags = (di & PFIL_ALL);
	con->c_refcnt = 0;
	con->c_rproc = NULL;
	con->c_nat = NULL;

	/* Initialize the protocol state. */
	if (!npf_state_init(npc, &con->c_state)) {
		npf_conn_destroy(npf, con);
		return NULL;
	}

	KASSERT(npf_iscached(npc, NPC_IP46));
	npf_connkey_t *fw = &con->c_forw_entry;
	npf_connkey_t *bk = &con->c_back_entry;

	/*
	 * Construct "forwards" and "backwards" keys.  Also, set the
	 * interface ID for this connection (unless it is global).
	 */
	if (!npf_conn_conkey(npc, fw, true) ||
	    !npf_conn_conkey(npc, bk, false)) {
		npf_conn_destroy(npf, con);
		return NULL;
	}
	fw->ck_backptr = bk->ck_backptr = con;
	con->c_ifid = per_if ? nbuf->nb_ifid : 0;
	con->c_proto = npc->npc_proto;

	/*
	 * Set last activity time for a new connection and acquire
	 * a reference for the caller before we make it visible.
	 */
	conn_update_atime(con);
	con->c_refcnt = 1;

	/*
	 * Insert both keys (entries representing directions) of the
	 * connection.  At this point it becomes visible, but we activate
	 * the connection later.
	 */
	mutex_enter(&con->c_lock);
	if (!npf_conndb_insert(npf->conn_db, fw)) {
		error = EISCONN;
		goto err;
	}
	if (!npf_conndb_insert(npf->conn_db, bk)) {
		npf_conn_t *ret __diagused;
		ret = npf_conndb_remove(npf->conn_db, fw);
		KASSERT(ret == con);
		error = EISCONN;
		goto err;
	}
err:
	/*
	 * If we have hit a duplicate: mark the connection as expired
	 * and let the G/C thread take care of it.  We cannot do it
	 * here since there might be references acquired already.
	 */
	if (error) {
		atomic_or_uint(&con->c_flags, CONN_REMOVED | CONN_EXPIRE);
		atomic_dec_uint(&con->c_refcnt);
		npf_stats_inc(npf, NPF_STAT_RACE_CONN);
	} else {
		NPF_PRINTF(("NPF: establish conn %p\n", con));
	}

	/* Finally, insert into the connection list. */
	npf_conndb_enqueue(npf->conn_db, con);
	mutex_exit(&con->c_lock);

	return error ? NULL : con;
}

void
npf_conn_destroy(npf_t *npf, npf_conn_t *con)
{
	KASSERT(con->c_refcnt == 0);

	if (con->c_nat) {
		/* Release any NAT structures. */
		npf_nat_destroy(con->c_nat);
	}
	if (con->c_rproc) {
		/* Release the rule procedure. */
		npf_rproc_release(con->c_rproc);
	}

	/* Destroy the state. */
	npf_state_destroy(&con->c_state);
	mutex_destroy(&con->c_lock);

	/* Free the structure, increase the counter. */
	pool_cache_put(npf->conn_cache, con);
	npf_stats_inc(npf, NPF_STAT_CONN_DESTROY);
	NPF_PRINTF(("NPF: conn %p destroyed\n", con));
}

/*
 * npf_conn_setnat: associate the NAT entry with the connection, update
 * and re-insert the connection entry using the translation values.
 *
 * => The caller must be holding a reference.
 */
int
npf_conn_setnat(const npf_cache_t *npc, npf_conn_t *con,
    npf_nat_t *nt, u_int ntype)
{
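	/*
	 * Map the NAT type to the key slot that must be rewritten in the
	 * "backwards" entry: for outbound NAT the translated (source)
	 * address of the forwards stream is the destination of the
	 * backwards stream; for inbound NAT it is the source.
	 */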
	static const u_int nat_type_dimap[] = {
		[NPF_NATOUT] = NPF_DST,
		[NPF_NATIN] = NPF_SRC,
	};
	npf_t *npf = npc->npc_ctx;
	npf_connkey_t key, *bk;
	npf_conn_t *ret __diagused;
	npf_addr_t *taddr;
	in_port_t tport;
	u_int tidx;

	KASSERT(con->c_refcnt > 0);

	npf_nat_gettrans(nt, &taddr, &tport);
	KASSERT(ntype == NPF_NATOUT || ntype == NPF_NATIN);
	tidx = nat_type_dimap[ntype];

	/* Construct a "backwards" key. */
	if (!npf_conn_conkey(npc, &key, false)) {
		return EINVAL;
	}

	/* Acquire the lock and check for races. */
	mutex_enter(&con->c_lock);
	if (__predict_false(con->c_flags & CONN_EXPIRE)) {
		/* The connection got expired. */
		mutex_exit(&con->c_lock);
		return EINVAL;
	}
	KASSERT((con->c_flags & CONN_REMOVED) == 0);

	if (__predict_false(con->c_nat != NULL)) {
		/* Race with a duplicate packet. */
		mutex_exit(&con->c_lock);
		npf_stats_inc(npc->npc_ctx, NPF_STAT_RACE_NAT);
		return EISCONN;
	}

	/* Remove the "backwards" entry. */
	ret = npf_conndb_remove(npf->conn_db, &con->c_back_entry);
	KASSERT(ret == con);

	/* Set the source/destination IDs to the translation values. */
	bk = &con->c_back_entry;
	connkey_set_addr(bk, taddr, tidx);
	if (tport) {
		connkey_set_id(bk, tport, tidx);
	}

	/* Finally, re-insert the "backwards" entry. */
	if (!npf_conndb_insert(npf->conn_db, bk)) {
		/*
		 * Race: we have hit a duplicate, remove the "forwards"
		 * entry and expire our connection; it is no longer valid.
		 */
		ret = npf_conndb_remove(npf->conn_db, &con->c_forw_entry);
		KASSERT(ret == con);

		atomic_or_uint(&con->c_flags, CONN_REMOVED | CONN_EXPIRE);
		mutex_exit(&con->c_lock);

		npf_stats_inc(npc->npc_ctx, NPF_STAT_RACE_NAT);
		return EISCONN;
	}

	/* Associate the NAT entry and release the lock. */
	con->c_nat = nt;
	mutex_exit(&con->c_lock);
	return 0;
}

/*
 * npf_conn_expire: explicitly mark connection as expired.
 */
void
npf_conn_expire(npf_conn_t *con)
{
	/* KASSERT(con->c_refcnt > 0); XXX: npf_nat_freepolicy() */
	atomic_or_uint(&con->c_flags, CONN_EXPIRE);
}

/*
 * npf_conn_pass: return true if the connection is a "pass" one,
 * otherwise false.
 */
bool
npf_conn_pass(const npf_conn_t *con, npf_match_info_t *mi, npf_rproc_t **rp)
{
	KASSERT(con->c_refcnt > 0);
	if (__predict_true(con->c_flags & CONN_PASS)) {
		mi->mi_rid = con->c_rid;
		mi->mi_retfl = con->c_retfl;
		*rp = con->c_rproc;
		return true;
	}
	return false;
}

/*
 * npf_conn_setpass: mark connection as a "pass" one and associate the
 * rule procedure with it.
 */
void
npf_conn_setpass(npf_conn_t *con, const npf_match_info_t *mi, npf_rproc_t *rp)
{
	KASSERT((con->c_flags & CONN_ACTIVE) == 0);
	KASSERT(con->c_refcnt > 0);
	KASSERT(con->c_rproc == NULL);

	/*
	 * No need for atomic since the connection is not yet active.
	 * If rproc is set, the caller transfers its reference to us,
	 * which will be released on npf_conn_destroy().
	 */
	atomic_or_uint(&con->c_flags, CONN_PASS);
	con->c_rproc = rp;
	if (rp) {
		con->c_rid = mi->mi_rid;
		con->c_retfl = mi->mi_retfl;
	}
}

/*
 * npf_conn_release: release a reference, which might allow the G/C
 * thread to destroy this connection.
 */
void
npf_conn_release(npf_conn_t *con)
{
	if ((con->c_flags & (CONN_ACTIVE | CONN_EXPIRE)) == 0) {
		/* Activate: after this, connection is globally visible. */
		atomic_or_uint(&con->c_flags, CONN_ACTIVE);
	}
	KASSERT(con->c_refcnt > 0);
	atomic_dec_uint(&con->c_refcnt);
}

/*
 * npf_conn_getnat: return associated NAT data entry and indicate
 * whether it is a "forwards" or "backwards" stream.
 */
npf_nat_t *
npf_conn_getnat(npf_conn_t *con, const int di, bool *forw)
{
	KASSERT(con->c_refcnt > 0);
	*forw = (con->c_flags & PFIL_ALL) == (u_int)di;
	return con->c_nat;
}

/*
 * npf_conn_expired: criterion to check if connection is expired.
 */
bool
npf_conn_expired(const npf_conn_t *con, uint64_t tsnow)
{
	const int etime = npf_state_etime(&con->c_state, con->c_proto);
	int elapsed;

	if (__predict_false(con->c_flags & CONN_EXPIRE)) {
		/* Explicitly marked to be expired. */
		return true;
	}

	/*
	 * Note: another thread may update 'atime' and it might
	 * become greater than 'now'.
	 */
	elapsed = (int64_t)tsnow - con->c_atime;
	return elapsed > etime;
}

/*
 * npf_conn_remove: unlink the connection and mark as expired.
 */
void
npf_conn_remove(npf_conndb_t *cd, npf_conn_t *con)
{
	/* Remove both entries of the connection. */
	mutex_enter(&con->c_lock);
	if ((con->c_flags & CONN_REMOVED) == 0) {
		npf_conn_t *ret __diagused;

		ret = npf_conndb_remove(cd, &con->c_forw_entry);
		KASSERT(ret == con);
		ret = npf_conndb_remove(cd, &con->c_back_entry);
		KASSERT(ret == con);
	}

	/* Flag the removal and expiration. */
	atomic_or_uint(&con->c_flags, CONN_REMOVED | CONN_EXPIRE);
	mutex_exit(&con->c_lock);
}

/*
 * npf_conn_worker: G/C to run from a worker thread.
 */
void
npf_conn_worker(npf_t *npf)
{
	npf_conndb_gc(npf, npf->conn_db, false, true);
}

/*
 * npf_conndb_export: construct a list of connections prepared for saving.
 * Note: this is expected to be an expensive operation.
 */
int
npf_conndb_export(npf_t *npf, nvlist_t *npf_dict)
{
	npf_conn_t *head, *con;

	/*
	 * Note: acquire conn_lock to prevent the database from being
	 * destroyed and to keep the G/C thread from running.
	 */
	mutex_enter(&npf->conn_lock);
	if (npf->conn_tracking != CONN_TRACKING_ON) {
		mutex_exit(&npf->conn_lock);
		return 0;
	}
	head = npf_conndb_getlist(npf->conn_db);
	con = head;
	while (con) {
		nvlist_t *cdict;

		if ((cdict = npf_conn_export(npf, con)) != NULL) {
			nvlist_append_nvlist_array(npf_dict, "conn-list", cdict);
			nvlist_destroy(cdict);
		}
		if ((con = npf_conndb_getnext(npf->conn_db, con)) == head) {
			break;
		}
	}
	mutex_exit(&npf->conn_lock);
	return 0;
}

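/*
 * npf_connkey_export: encode the given connection key as an nvlist
 * (protocol, source/destination identifiers and addresses).
 */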
static nvlist_t *
npf_connkey_export(const npf_connkey_t *key)
{
	uint16_t id[2], alen, proto;
	npf_addr_t ips[2];
	nvlist_t *kdict;

	kdict = nvlist_create(0);
	connkey_getkey(key, &proto, ips, id, &alen);
	nvlist_add_number(kdict, "proto", proto);
	nvlist_add_number(kdict, "sport", id[NPF_SRC]);
	nvlist_add_number(kdict, "dport", id[NPF_DST]);
	nvlist_add_binary(kdict, "saddr", &ips[NPF_SRC], alen);
	nvlist_add_binary(kdict, "daddr", &ips[NPF_DST], alen);
	return kdict;
}

/*
 * npf_conn_export: serialise a single connection.
 */
static nvlist_t *
npf_conn_export(npf_t *npf, const npf_conn_t *con)
{
	nvlist_t *cdict, *kdict;

	if ((con->c_flags & (CONN_ACTIVE|CONN_EXPIRE)) != CONN_ACTIVE) {
		return NULL;
	}
	cdict = nvlist_create(0);
	nvlist_add_number(cdict, "flags", con->c_flags);
	nvlist_add_number(cdict, "proto", con->c_proto);
	if (con->c_ifid) {
		const char *ifname = npf_ifmap_getname(npf, con->c_ifid);
		nvlist_add_string(cdict, "ifname", ifname);
	}
	nvlist_add_binary(cdict, "state", &con->c_state, sizeof(npf_state_t));

	kdict = npf_connkey_export(&con->c_forw_entry);
	nvlist_move_nvlist(cdict, "forw-key", kdict);

	kdict = npf_connkey_export(&con->c_back_entry);
	nvlist_move_nvlist(cdict, "back-key", kdict);

	if (con->c_nat) {
		npf_nat_export(cdict, con->c_nat);
	}
	return cdict;
}

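/*
 * npf_connkey_import: reconstruct a connection key from an nvlist.
 *
 * => Returns the key length in bytes or zero on failure.
 */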
static uint32_t
npf_connkey_import(const nvlist_t *kdict, npf_connkey_t *key)
{
	npf_addr_t const * ips[2];
	uint16_t proto, id[2];
	size_t alen1, alen2;

	proto = dnvlist_get_number(kdict, "proto", 0);
	id[NPF_SRC] = dnvlist_get_number(kdict, "sport", 0);
	id[NPF_DST] = dnvlist_get_number(kdict, "dport", 0);
	ips[NPF_SRC] = dnvlist_get_binary(kdict, "saddr", &alen1, NULL, 0);
	ips[NPF_DST] = dnvlist_get_binary(kdict, "daddr", &alen2, NULL, 0);
	if (__predict_false(alen1 == 0 || alen1 != alen2)) {
		return 0;
	}
	return connkey_setkey(key, proto, ips, id, alen1, true);
}

/*
 * npf_conn_import: fully reconstruct a single connection from an
 * nvlist and insert into the given database.
 */
int
npf_conn_import(npf_t *npf, npf_conndb_t *cd, const nvlist_t *cdict,
    npf_ruleset_t *natlist)
{
	npf_conn_t *con;
	npf_connkey_t *fw, *bk;
	const nvlist_t *nat, *conkey;
	const char *ifname;
	const void *state;
	size_t len;

	/* Allocate a connection and initialise it (clear first). */
	con = pool_cache_get(npf->conn_cache, PR_WAITOK);
	memset(con, 0, sizeof(npf_conn_t));
	mutex_init(&con->c_lock, MUTEX_DEFAULT, IPL_SOFTNET);
	npf_stats_inc(npf, NPF_STAT_CONN_CREATE);

	con->c_proto = dnvlist_get_number(cdict, "proto", 0);
	con->c_flags = dnvlist_get_number(cdict, "flags", 0);
	con->c_flags &= PFIL_ALL | CONN_ACTIVE | CONN_PASS;
	conn_update_atime(con);

	ifname = dnvlist_get_string(cdict, "ifname", NULL);
	if (ifname && (con->c_ifid = npf_ifmap_register(npf, ifname)) == 0) {
		goto err;
	}

	state = dnvlist_get_binary(cdict, "state", &len, NULL, 0);
	if (!state || len != sizeof(npf_state_t)) {
		goto err;
	}
	memcpy(&con->c_state, state, sizeof(npf_state_t));

	/* Reconstruct NAT association, if any. */
	if ((nat = dnvlist_get_nvlist(cdict, "nat", NULL)) != NULL &&
	    (con->c_nat = npf_nat_import(npf, nat, natlist, con)) == NULL) {
		goto err;
	}

	/*
	 * Fetch and copy the keys for each direction.
	 */
	conkey = dnvlist_get_nvlist(cdict, "forw-key", NULL);
	fw = &con->c_forw_entry;
	if (conkey == NULL || !npf_connkey_import(conkey, fw)) {
		goto err;
	}
	conkey = dnvlist_get_nvlist(cdict, "back-key", NULL);
	bk = &con->c_back_entry;
	if (conkey == NULL || !npf_connkey_import(conkey, bk)) {
		goto err;
	}
	fw->ck_backptr = bk->ck_backptr = con;

	/* Insert the entries and the connection itself. */
	if (!npf_conndb_insert(cd, fw)) {
		goto err;
	}
	if (!npf_conndb_insert(cd, bk)) {
		npf_conndb_remove(cd, fw);
		goto err;
	}

	NPF_PRINTF(("NPF: imported conn %p\n", con));
	npf_conndb_enqueue(cd, con);
	return 0;
err:
	npf_conn_destroy(npf, con);
	return EINVAL;
}

int
npf_conn_find(npf_t *npf, const nvlist_t *idict, nvlist_t **odict)
{
	const nvlist_t *kdict;
	npf_connkey_t key;
	npf_conn_t *con;
	uint16_t dir;
	bool forw;

	kdict = dnvlist_get_nvlist(idict, "key", NULL);
	if (!kdict || !npf_connkey_import(kdict, &key)) {
		return EINVAL;
	}
	dir = dnvlist_get_number(idict, "direction", 0);
	con = npf_conndb_lookup(npf->conn_db, &key, &forw);
	if (con == NULL) {
		return ESRCH;
	}
	if (!npf_conn_ok(con, dir, true)) {
		atomic_dec_uint(&con->c_refcnt);
		return ESRCH;
	}
	*odict = npf_conn_export(npf, con);
	atomic_dec_uint(&con->c_refcnt);
	return *odict ? 0 : ENOSPC;
}

#if defined(DDB) || defined(_NPF_TESTING)

void
npf_conn_print(const npf_conn_t *con)
{
	const u_int alen = NPF_CONN_GETALEN(&con->c_forw_entry);
	const uint32_t *fkey = con->c_forw_entry.ck_key;
	const uint32_t *bkey = con->c_back_entry.ck_key;
	const u_int proto = con->c_proto;
	struct timespec tspnow;
	const void *src, *dst;
	int etime;

	getnanouptime(&tspnow);
	etime = npf_state_etime(&con->c_state, proto);

	printf("%p:\n\tproto %d flags 0x%x tsdiff %ld etime %d\n", con,
	    proto, con->c_flags, (long)(tspnow.tv_sec - con->c_atime), etime);

	src = &fkey[2], dst = &fkey[2 + (alen >> 2)];
	printf("\tforw %s:%d", npf_addr_dump(src, alen), ntohs(fkey[1] >> 16));
	printf("-> %s:%d\n", npf_addr_dump(dst, alen), ntohs(fkey[1] & 0xffff));

	src = &bkey[2], dst = &bkey[2 + (alen >> 2)];
	printf("\tback %s:%d", npf_addr_dump(src, alen), ntohs(bkey[1] >> 16));
	printf("-> %s:%d\n", npf_addr_dump(dst, alen), ntohs(bkey[1] & 0xffff));

	npf_state_dump(&con->c_state);
	if (con->c_nat) {
		npf_nat_dump(con->c_nat);
	}
}

#endif