      1 /*-
      2  * Copyright (c) 2010-2020 The NetBSD Foundation, Inc.
      3  * All rights reserved.
      4  *
      5  * This material is based upon work partially supported by The
      6  * NetBSD Foundation under a contract with Mindaugas Rasiukevicius.
      7  *
      8  * Redistribution and use in source and binary forms, with or without
      9  * modification, are permitted provided that the following conditions
     10  * are met:
     11  * 1. Redistributions of source code must retain the above copyright
     12  *    notice, this list of conditions and the following disclaimer.
     13  * 2. Redistributions in binary form must reproduce the above copyright
     14  *    notice, this list of conditions and the following disclaimer in the
     15  *    documentation and/or other materials provided with the distribution.
     16  *
     17  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
     18  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
     19  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     20  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
     21  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
     22  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
     23  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
     24  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
     25  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
     26  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
     27  * POSSIBILITY OF SUCH DAMAGE.
     28  */
     29 
     30 /*
     31  * NPF connection storage.
     32  *
     33  * Lock-free connection lookups are protected by EBR with an atomic
     34  * reference acquisition before exiting the critical path.  The caller
     35  * is responsible for re-checking the connection state.
     36  *
     37  * Warning (not applicable for the userspace npfkern):
     38  *
 *	thmap is a partially lock-free data structure that uses its own
     40  *	spin-locks on the writer side (insert/delete operations).
     41  *
     42  *	The relevant interrupt priority level (IPL) must be set and the
     43  *	kernel preemption disabled across the critical paths to prevent
     44  *	deadlocks and priority inversion problems.  These are essentially
     45  *	the same guarantees as a spinning mutex(9) would provide.
     46  *
     47  *	This is achieved with SPL routines splsoftnet() and splx() around
     48  *	the thmap_del() and thmap_put() calls.  Note: we assume that the
     49  *	network stack invokes NPF at IPL_SOFTNET or lower, but not higher.
     50  */
     51 
     52 #ifdef _KERNEL
     53 #include <sys/cdefs.h>
     54 __KERNEL_RCSID(0, "$NetBSD: npf_conndb.c,v 1.9 2020/05/30 14:16:56 rmind Exp $");
     55 
     56 #include <sys/param.h>
     57 #include <sys/types.h>
     58 
     59 #include <sys/atomic.h>
     60 #include <sys/kmem.h>
     61 #include <sys/thmap.h>
     62 #endif
     63 
     64 #define __NPF_CONN_PRIVATE
     65 #include "npf_conn.h"
     66 #include "npf_impl.h"
     67 
struct npf_conndb {
	/* Key-value map: connection key -> tagged npf_conn_t pointer. */
	thmap_t *		cd_map;

	/*
	 * There are three lists for connections: new, all and G/C.
	 *
	 * New connections are atomically inserted into the "new-list".
	 * The G/C worker will move them to the doubly-linked list of all
	 * active connections.
	 */
	npf_conn_t *		cd_new;		/* lock-free SLL of new connections */
	LIST_HEAD(, npf_conn)	cd_list;	/* all active connections */
	LIST_HEAD(, npf_conn)	cd_gclist;	/* expired, awaiting destruction */

	/* The last inspected connection (for circular iteration). */
	npf_conn_t *		cd_marker;
};
     85 
/* Tunable G/C parameters, registered in npf_conndb_sysinit(). */
typedef struct {
	int		step;		/* connections scanned per G/C run */
	int		interval_min;	/* minimum G/C wake-up interval, ms */
	int		interval_max;	/* maximum G/C wake-up interval, ms */
} npf_conndb_params_t;
     91 
     92 /*
     93  * Pointer tag for connection keys which represent the "forwards" entry.
     94  */
     95 #define	CONNDB_FORW_BIT		((uintptr_t)0x1)
     96 #define	CONNDB_ISFORW_P(p)	(((uintptr_t)(p) & CONNDB_FORW_BIT) != 0)
     97 #define	CONNDB_GET_PTR(p)	((void *)((uintptr_t)(p) & ~CONNDB_FORW_BIT))
     98 
     99 void
    100 npf_conndb_sysinit(npf_t *npf)
    101 {
    102 	npf_conndb_params_t *params = npf_param_allocgroup(npf,
    103 	    NPF_PARAMS_CONNDB, sizeof(npf_conndb_params_t));
    104 	npf_param_t param_map[] = {
    105 		{
    106 			"gc.step",
    107 			&params->step,
    108 			.default_val = 256,
    109 			.min = 1, .max = INT_MAX
    110 		},
    111 		{
    112 			"gc.interval_min",
    113 			&params->interval_min,
    114 			.default_val = 50, // ms
    115 			.min = 10, .max = 10000
    116 		},
    117 		{
    118 			"gc.interval_max",
    119 			&params->interval_max,
    120 			.default_val = 5000, // ms
    121 			.min = 10, .max = 10000
    122 		},
    123 	};
    124 	npf_param_register(npf, param_map, __arraycount(param_map));
    125 }
    126 
    127 void
    128 npf_conndb_sysfini(npf_t *npf)
    129 {
    130 	const size_t len = sizeof(npf_conndb_params_t);
    131 	npf_param_freegroup(npf, NPF_PARAMS_CONNDB, len);
    132 }
    133 
    134 npf_conndb_t *
    135 npf_conndb_create(void)
    136 {
    137 	npf_conndb_t *cd;
    138 
    139 	cd = kmem_zalloc(sizeof(npf_conndb_t), KM_SLEEP);
    140 	cd->cd_map = thmap_create(0, NULL, THMAP_NOCOPY);
    141 	KASSERT(cd->cd_map != NULL);
    142 
    143 	LIST_INIT(&cd->cd_list);
    144 	LIST_INIT(&cd->cd_gclist);
    145 	return cd;
    146 }
    147 
    148 void
    149 npf_conndb_destroy(npf_conndb_t *cd)
    150 {
    151 	KASSERT(cd->cd_new == NULL);
    152 	KASSERT(cd->cd_marker == NULL);
    153 	KASSERT(LIST_EMPTY(&cd->cd_list));
    154 	KASSERT(LIST_EMPTY(&cd->cd_gclist));
    155 
    156 	thmap_destroy(cd->cd_map);
    157 	kmem_free(cd, sizeof(npf_conndb_t));
    158 }
    159 
    160 /*
    161  * npf_conndb_lookup: find a connection given the key.
    162  */
npf_conn_t *
npf_conndb_lookup(npf_t *npf, const npf_connkey_t *ck, npf_flow_t *flow)
{
	npf_conndb_t *cd = atomic_load_relaxed(&npf->conn_db);
	const unsigned keylen = NPF_CONNKEY_LEN(ck);
	npf_conn_t *con;
	void *val;

	/*
	 * Lookup the connection key in the key-value map.  The EBR
	 * critical path (see the file-level comment) spans from here
	 * until npf_config_read_exit().
	 */
	int s = npf_config_read_enter(npf);
	val = thmap_get(cd->cd_map, ck->ck_key, keylen);
	if (!val) {
		npf_config_read_exit(npf, s);
		return NULL;
	}

	/*
	 * Determine whether this is the "forwards" or "backwards" key
	 * and clear the pointer tag.
	 */
	*flow = CONNDB_ISFORW_P(val) ? NPF_FLOW_FORW : NPF_FLOW_BACK;
	con = CONNDB_GET_PTR(val);
	KASSERT(con != NULL);

	/*
	 * Acquire a reference and return the connection.  The reference
	 * must be taken before leaving the critical path, so the G/C
	 * cannot destroy the connection under us; the caller is
	 * responsible for re-checking the connection state.
	 */
	atomic_inc_uint(&con->c_refcnt);
	npf_config_read_exit(npf, s);
	return con;
}
    196 
    197 /*
    198  * npf_conndb_insert: insert the key representing the connection.
    199  *
    200  * => Returns true on success and false on failure.
    201  */
    202 bool
    203 npf_conndb_insert(npf_conndb_t *cd, const npf_connkey_t *ck,
    204     npf_conn_t *con, npf_flow_t flow)
    205 {
    206 	const unsigned keylen = NPF_CONNKEY_LEN(ck);
    207 	const uintptr_t tag = (CONNDB_FORW_BIT * !flow);
    208 	void *val;
    209 	bool ok;
    210 
    211 	/*
    212 	 * Tag the connection pointer if this is the "forwards" key.
    213 	 */
    214 	KASSERT(!CONNDB_ISFORW_P(con));
    215 	val = (void *)((uintptr_t)(void *)con | tag);
    216 
    217 	int s = splsoftnet();
    218 	ok = thmap_put(cd->cd_map, ck->ck_key, keylen, val) == val;
    219 	splx(s);
    220 
    221 	return ok;
    222 }
    223 
    224 /*
    225  * npf_conndb_remove: find and delete connection key, returning the
    226  * connection it represents.
    227  */
    228 npf_conn_t *
    229 npf_conndb_remove(npf_conndb_t *cd, npf_connkey_t *ck)
    230 {
    231 	const unsigned keylen = NPF_CONNKEY_LEN(ck);
    232 	void *val;
    233 
    234 	int s = splsoftnet();
    235 	val = thmap_del(cd->cd_map, ck->ck_key, keylen);
    236 	splx(s);
    237 
    238 	return CONNDB_GET_PTR(val);
    239 }
    240 
    241 /*
    242  * npf_conndb_enqueue: atomically insert the connection into the
    243  * singly-linked list of the "new" connections.
    244  */
    245 void
    246 npf_conndb_enqueue(npf_conndb_t *cd, npf_conn_t *con)
    247 {
    248 	npf_conn_t *head;
    249 
    250 	do {
    251 		head = atomic_load_relaxed(&cd->cd_new);
    252 		atomic_store_relaxed(&con->c_next, head);
    253 	} while (atomic_cas_ptr(&cd->cd_new, head, con) != head);
    254 }
    255 
    256 /*
    257  * npf_conndb_update: migrate all new connections to the list of all
    258  * connections; this must also be performed on npf_conndb_getlist()
    259  * to provide a complete list of connections.
    260  */
    261 static void
    262 npf_conndb_update(npf_conndb_t *cd)
    263 {
    264 	npf_conn_t *con;
    265 
    266 	con = atomic_swap_ptr(&cd->cd_new, NULL);
    267 	while (con) {
    268 		npf_conn_t *next = atomic_load_relaxed(&con->c_next); // union
    269 		LIST_INSERT_HEAD(&cd->cd_list, con, c_entry);
    270 		con = next;
    271 	}
    272 }
    273 
    274 /*
    275  * npf_conndb_getlist: return the list of all connections.
    276  */
    277 npf_conn_t *
    278 npf_conndb_getlist(npf_conndb_t *cd)
    279 {
    280 	npf_conndb_update(cd);
    281 	return LIST_FIRST(&cd->cd_list);
    282 }
    283 
    284 /*
    285  * npf_conndb_getnext: return the next connection, implementing
    286  * the circular iteration.
    287  */
    288 npf_conn_t *
    289 npf_conndb_getnext(npf_conndb_t *cd, npf_conn_t *con)
    290 {
    291 	/* Next.. */
    292 	if (con == NULL || (con = LIST_NEXT(con, c_entry)) == NULL) {
    293 		con = LIST_FIRST(&cd->cd_list);
    294 	}
    295 	return con;
    296 }
    297 
    298 /*
    299  * npf_conndb_gc_incr: incremental G/C of the expired connections.
    300  */
static unsigned
npf_conndb_gc_incr(npf_t *npf, npf_conndb_t *cd, const time_t now)
{
	const npf_conndb_params_t *params = npf->params[NPF_PARAMS_CONNDB];
	unsigned target = params->step;
	unsigned gc_conns = 0;
	npf_conn_t *con;

	KASSERT(mutex_owned(&npf->conn_lock));

	/*
	 * Start from the "last" (marker) connection.
	 * We must initialise the marker if it is not set yet.
	 */
	if ((con = cd->cd_marker) == NULL) {
		con = npf_conndb_getnext(cd, NULL);
		cd->cd_marker = con;
	}

	/*
	 * Scan the connections:
	 * - Limit the scan to the G/C step size.
	 * - Stop if we scanned all of them.
	 * - Update the marker connection.
	 */
	while (con && target--) {
		npf_conn_t *next = npf_conndb_getnext(cd, con);

		/*
		 * Can we G/C this connection?
		 */
		if (npf_conn_expired(npf, con, now)) {
			/* Yes: move to the G/C list. */
			LIST_REMOVE(con, c_entry);
			LIST_INSERT_HEAD(&cd->cd_gclist, con, c_entry);
			npf_conn_remove(cd, con);
			gc_conns++;

			/*
			 * This connection cannot be a new marker anymore.
			 * Note: con == next happens only when it was the
			 * last remaining connection in the list.
			 */
			if (con == next) {
				next = NULL;
			}
			if (con == cd->cd_marker) {
				/* Advance the marker past the G/C'ed entry. */
				cd->cd_marker = next;
				con = next;
				continue;
			}
		}
		con = next;

		/*
		 * Circular iteration: if we returned back to the
		 * marker connection, then stop.
		 */
		if (con == cd->cd_marker) {
			break;
		}
	}
	cd->cd_marker = con;
	return gc_conns;
}
    362 
    363 /*
    364  * gc_freq_tune: G/C frequency self-tuning.
    365  *
    366  * If there is something to G/C, then exponentially increase the wake
    367  * up frequency.  Otherwise, reduce the frequency.  Enforce the lower
    368  * and upper bounds.
    369  *
    370  * => Returns the number milliseconds until next G/C.
    371  */
    372 static unsigned
    373 gc_freq_tune(const npf_t *npf, const npf_conndb_t *cd, const unsigned n)
    374 {
    375 	const npf_conndb_params_t *params = npf->params[NPF_PARAMS_CONNDB];
    376 	int wtime = npf->worker_wait_time;
    377 	wtime = n ? (wtime >> 1) : (wtime << 1);
    378 	return MAX(MIN(wtime, params->interval_max), params->interval_min);
    379 }
    380 
    381 /*
    382  * npf_conndb_gc: garbage collect the expired connections.
    383  *
    384  * => Must run in a single-threaded manner.
    385  * => If 'flush' is true, then destroy all connections.
    386  * => If 'sync' is true, then perform passive serialisation.
    387  */
void
npf_conndb_gc(npf_t *npf, npf_conndb_t *cd, bool flush, bool sync)
{
	struct timespec tsnow;
	unsigned gc_conns = 0;
	npf_conn_t *con;
	void *gcref;

	getnanouptime(&tsnow);

	/* First, migrate all new connections. */
	mutex_enter(&npf->conn_lock);
	npf_conndb_update(cd);
	if (flush) {
		/* Just unlink and move all connections to the G/C list. */
		while ((con = LIST_FIRST(&cd->cd_list)) != NULL) {
			LIST_REMOVE(con, c_entry);
			LIST_INSERT_HEAD(&cd->cd_gclist, con, c_entry);
			npf_conn_remove(cd, con);
		}
		cd->cd_marker = NULL;
	} else {
		/* Incremental G/C of the expired connections. */
		gc_conns = npf_conndb_gc_incr(npf, cd, tsnow.tv_sec);
	}
	mutex_exit(&npf->conn_lock);

	/*
	 * Ensure it is safe to destroy the connections.
	 * Note: drop the conn_lock (see the lock order).
	 */
	gcref = thmap_stage_gc(cd->cd_map);
	if (sync && (gcref || !LIST_EMPTY(&cd->cd_gclist))) {
		/* Passive serialisation: wait for the readers to drain. */
		npf_config_enter(npf);
		npf_config_sync(npf);
		npf_config_exit(npf);
	}
	thmap_gc(cd->cd_map, gcref);

	/* Self-tune the G/C frequency. */
	npf->worker_wait_time = gc_freq_tune(npf, cd, gc_conns);

	if (LIST_EMPTY(&cd->cd_gclist)) {
		return;
	}

	/*
	 * Garbage collect all expired connections.
	 * May need to wait for the references to drain.
	 */
	while ((con = LIST_FIRST(&cd->cd_gclist)) != NULL) {
		/*
		 * Destroy only if removed and no references.  Otherwise,
		 * just do it next time, unless we are destroying all.
		 */
		const unsigned refcnt = atomic_load_relaxed(&con->c_refcnt);

		if (__predict_false(refcnt)) {
			if (flush) {
				/* Flushing: spin-wait for the holders. */
				kpause("npfcongc", false, 1, NULL);
				continue;
			}
			break; // exit the loop
		}
		LIST_REMOVE(con, c_entry);
		npf_conn_destroy(npf, con);
	}
}
    456