Home | History | Annotate | Line # | Download | only in netinet
      1 /*	$NetBSD: ip_encap.c,v 1.78 2025/02/26 04:49:45 andvar Exp $	*/
      2 /*	$KAME: ip_encap.c,v 1.73 2001/10/02 08:30:58 itojun Exp $	*/
      3 
      4 /*
      5  * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
      6  * All rights reserved.
      7  *
      8  * Redistribution and use in source and binary forms, with or without
      9  * modification, are permitted provided that the following conditions
     10  * are met:
     11  * 1. Redistributions of source code must retain the above copyright
     12  *    notice, this list of conditions and the following disclaimer.
     13  * 2. Redistributions in binary form must reproduce the above copyright
     14  *    notice, this list of conditions and the following disclaimer in the
     15  *    documentation and/or other materials provided with the distribution.
     16  * 3. Neither the name of the project nor the names of its contributors
     17  *    may be used to endorse or promote products derived from this software
     18  *    without specific prior written permission.
     19  *
     20  * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
     21  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     22  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     23  * ARE DISCLAIMED.  IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
     24  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
     25  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
     26  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
     27  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
     28  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
     29  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
     30  * SUCH DAMAGE.
     31  */
     32 /*
     33  * My grandfather said that there's a devil inside tunnelling technology...
     34  *
     35  * We have surprisingly many protocols that want packets with IP protocol
     36  * #4 or #41.  Here's a list of protocols that want protocol #41:
     37  *	RFC1933 configured tunnel
     38  *	RFC1933 automatic tunnel
     39  *	RFC2401 IPsec tunnel
     40  *	RFC2473 IPv6 generic packet tunnelling
     41  *	RFC2529 6over4 tunnel
     42  *	RFC3056 6to4 tunnel
     43  *	isatap tunnel
     44  *	mobile-ip6 (uses RFC2473)
 * Here's a list of protocols that want protocol #4:
     46  *	RFC1853 IPv4-in-IPv4 tunnelling
     47  *	RFC2003 IPv4 encapsulation within IPv4
     48  *	RFC2344 reverse tunnelling for mobile-ip4
     49  *	RFC2401 IPsec tunnel
     50  * Well, what can I say.  They impose different en/decapsulation mechanism
     51  * from each other, so they need separate protocol handler.  The only one
     52  * we can easily determine by protocol # is IPsec, which always has
     53  * AH/ESP/IPComp header right after outer IP header.
     54  *
     55  * So, clearly good old protosw does not work for protocol #4 and #41.
     56  * The code will let you match protocol via src/dst address pair.
     57  */
     58 /* XXX is M_NETADDR correct? */
     59 
     60 #include <sys/cdefs.h>
     61 __KERNEL_RCSID(0, "$NetBSD: ip_encap.c,v 1.78 2025/02/26 04:49:45 andvar Exp $");
     62 
     63 #ifdef _KERNEL_OPT
     64 #include "opt_mrouting.h"
     65 #include "opt_inet.h"
     66 #include "opt_net_mpsafe.h"
     67 #endif
     68 
     69 #include <sys/param.h>
     70 #include <sys/systm.h>
     71 #include <sys/socket.h>
     72 #include <sys/socketvar.h> /* for softnet_lock */
     73 #include <sys/sockio.h>
     74 #include <sys/mbuf.h>
     75 #include <sys/errno.h>
     76 #include <sys/queue.h>
     77 #include <sys/kmem.h>
     78 #include <sys/mutex.h>
     79 #include <sys/condvar.h>
     80 #include <sys/psref.h>
     81 #include <sys/pslist.h>
     82 #include <sys/thmap.h>
     83 
     84 #include <net/if.h>
     85 
     86 #include <netinet/in.h>
     87 #include <netinet/in_systm.h>
     88 #include <netinet/ip.h>
     89 #include <netinet/ip_var.h>
     90 #include <netinet/ip_encap.h>
     91 #ifdef MROUTING
     92 #include <netinet/ip_mroute.h>
     93 #endif /* MROUTING */
     94 
     95 #ifdef INET6
     96 #include <netinet/ip6.h>
     97 #include <netinet6/ip6_var.h>
     98 #include <netinet6/ip6protosw.h> /* for struct ip6ctlparam */
     99 #include <netinet6/in6_var.h>
    100 #include <netinet6/in6_pcb.h>
    101 #include <netinet/icmp6.h>
    102 #endif
    103 
    104 #ifdef NET_MPSAFE
    105 #define ENCAP_MPSAFE	1
    106 #endif
    107 
/*
 * Which way the packet being looked up is travelling.  For INBOUND the
 * outer destination address is treated as the local ("mine") side and the
 * outer source as the remote ("yours") side; OUTBOUND swaps the two.
 */
enum direction {
	INBOUND,
	OUTBOUND
};

/* Table maintenance and argument checking. */
static int encap_add(struct encaptab *);
static int encap_remove(struct encaptab *);
static void encap_afcheck(int, const struct sockaddr *,
    const struct sockaddr *);

/* Hash key helpers for the address-pair maps. */
static void encap_key_init(struct encap_key *, const struct sockaddr *,
    const struct sockaddr *);
static void encap_key_inc(struct encap_key *);

/* Per address family best-match lookup. */
#ifdef INET
static struct encaptab *encap4_lookup(struct mbuf *, int, int,
    enum direction, struct psref *);
#endif
#ifdef INET6
static struct encaptab *encap6_lookup(struct mbuf *, int, int,
    enum direction, struct psref *);
#endif
    124 
    125 /*
    126  * In encap[46]_lookup(), ep->func can sleep(e.g. rtalloc1) while walking
    127  * encap_table. So, it cannot use pserialize_read_enter()
    128  */
    129 static struct {
    130 	struct pslist_head	list;
    131 	pserialize_t		psz;
    132 	struct psref_class	*elem_class; /* for the element of et_list */
    133 } encaptab  __cacheline_aligned = {
    134 	.list = PSLIST_INITIALIZER,
    135 };
    136 #define encap_table encaptab.list
    137 
    138 static struct {
    139 	kmutex_t	lock;
    140 	kcondvar_t	cv;
    141 	struct lwp	*busy;
    142 } encap_whole __cacheline_aligned;
    143 
    144 static thmap_t *encap_map[2];	/* 0 for AF_INET, 1 for AF_INET6 */
    145 
    146 static bool encap_initialized = false;
    147 /*
    148  * must be done before other encap interfaces initialization.
    149  */
    150 void
    151 encapinit(void)
    152 {
    153 
    154 	if (encap_initialized)
    155 		return;
    156 
    157 	encaptab.psz = pserialize_create();
    158 	encaptab.elem_class = psref_class_create("encapelem", IPL_SOFTNET);
    159 
    160 	mutex_init(&encap_whole.lock, MUTEX_DEFAULT, IPL_NONE);
    161 	cv_init(&encap_whole.cv, "ip_encap cv");
    162 	encap_whole.busy = NULL;
    163 
    164 	encap_initialized = true;
    165 }
    166 
    167 void
    168 encap_init(void)
    169 {
    170 	static int initialized = 0;
    171 
    172 	if (initialized)
    173 		return;
    174 	initialized++;
    175 #if 0
    176 	/*
    177 	 * we cannot use LIST_INIT() here, since drivers may want to call
    178 	 * encap_attach(), on driver attach.  encap_init() will be called
    179 	 * on AF_INET{,6} initialization, which happens after driver
    180 	 * initialization - using LIST_INIT() here can nuke encap_attach()
    181 	 * from drivers.
    182 	 */
    183 	PSLIST_INIT(&encap_table);
    184 #endif
    185 
    186 	encap_map[0] = thmap_create(0, NULL, THMAP_NOCOPY);
    187 #ifdef INET6
    188 	encap_map[1] = thmap_create(0, NULL, THMAP_NOCOPY);
    189 #endif
    190 }
    191 
    192 #ifdef INET
    193 static struct encaptab *
    194 encap4_lookup(struct mbuf *m, int off, int proto, enum direction dir,
    195     struct psref *match_psref)
    196 {
    197 	struct ip *ip;
    198 	struct ip_pack4 pack;
    199 	struct encaptab *ep, *match;
    200 	int prio, matchprio;
    201 	int s;
    202 	thmap_t *emap = encap_map[0];
    203 	struct encap_key key;
    204 
    205 	KASSERT(m->m_len >= sizeof(*ip));
    206 
    207 	ip = mtod(m, struct ip *);
    208 
    209 	memset(&pack, 0, sizeof(pack));
    210 	pack.p.sp_len = sizeof(pack);
    211 	pack.mine.sin_family = pack.yours.sin_family = AF_INET;
    212 	pack.mine.sin_len = pack.yours.sin_len = sizeof(struct sockaddr_in);
    213 	if (dir == INBOUND) {
    214 		pack.mine.sin_addr = ip->ip_dst;
    215 		pack.yours.sin_addr = ip->ip_src;
    216 	} else {
    217 		pack.mine.sin_addr = ip->ip_src;
    218 		pack.yours.sin_addr = ip->ip_dst;
    219 	}
    220 
    221 	match = NULL;
    222 	matchprio = 0;
    223 
    224 	s = pserialize_read_enter();
    225 
    226 	encap_key_init(&key, sintosa(&pack.mine), sintosa(&pack.yours));
    227 	while ((ep = thmap_get(emap, &key, sizeof(key))) != NULL) {
    228 		struct psref elem_psref;
    229 
    230 		KASSERT(ep->af == AF_INET);
    231 
    232 		if (ep->proto >= 0 && ep->proto != proto) {
    233 			encap_key_inc(&key);
    234 			continue;
    235 		}
    236 
    237 		psref_acquire(&elem_psref, &ep->psref,
    238 		    encaptab.elem_class);
    239 		if (ep->func) {
    240 			pserialize_read_exit(s);
    241 			prio = (*ep->func)(m, off, proto, ep->arg);
    242 			s = pserialize_read_enter();
    243 		} else {
    244 			prio = pack.mine.sin_len + pack.yours.sin_len;
    245 		}
    246 
    247 		if (prio <= 0) {
    248 			psref_release(&elem_psref, &ep->psref,
    249 			    encaptab.elem_class);
    250 			encap_key_inc(&key);
    251 			continue;
    252 		}
    253 		if (prio > matchprio) {
    254 			/* release last matched ep */
    255 			if (match != NULL)
    256 				psref_release(match_psref, &match->psref,
    257 				    encaptab.elem_class);
    258 
    259 			psref_copy(match_psref, &elem_psref,
    260 			    encaptab.elem_class);
    261 			matchprio = prio;
    262 			match = ep;
    263 		}
    264 
    265 		psref_release(&elem_psref, &ep->psref,
    266 		    encaptab.elem_class);
    267 		encap_key_inc(&key);
    268 	}
    269 
    270 	PSLIST_READER_FOREACH(ep, &encap_table, struct encaptab, chain) {
    271 		struct psref elem_psref;
    272 
    273 		if (ep->af != AF_INET)
    274 			continue;
    275 		if (ep->proto >= 0 && ep->proto != proto)
    276 			continue;
    277 
    278 		psref_acquire(&elem_psref, &ep->psref,
    279 		    encaptab.elem_class);
    280 		pserialize_read_exit(s);
    281 		/* ep->func is sleepable. e.g. rtalloc1 */
    282 		prio = (*ep->func)(m, off, proto, ep->arg);
    283 		s = pserialize_read_enter();
    284 
    285 		/*
    286 		 * We prioritize the matches by using bit length of the
    287 		 * matches.  user-supplied matching function
    288 		 * should return the bit length of the matches (for example,
    289 		 * if both src/dst are matched for IPv4, 64 should be returned).
    290 		 * 0 or negative return value means "it did not match".
    291 		 *
    292 		 * We need to loop through all the possible candidates
    293 		 * to get the best match - the search takes O(n) for
    294 		 * n attachments (i.e. interfaces).
    295 		 */
    296 		if (prio <= 0) {
    297 			psref_release(&elem_psref, &ep->psref,
    298 			    encaptab.elem_class);
    299 			continue;
    300 		}
    301 		if (prio > matchprio) {
    302 			/* release last matched ep */
    303 			if (match != NULL)
    304 				psref_release(match_psref, &match->psref,
    305 				    encaptab.elem_class);
    306 
    307 			psref_copy(match_psref, &elem_psref,
    308 			    encaptab.elem_class);
    309 			matchprio = prio;
    310 			match = ep;
    311 		}
    312 		KASSERTMSG((match == NULL) || psref_held(&match->psref,
    313 			encaptab.elem_class),
    314 		    "current match = %p, but not hold its psref", match);
    315 
    316 		psref_release(&elem_psref, &ep->psref,
    317 		    encaptab.elem_class);
    318 	}
    319 	pserialize_read_exit(s);
    320 
    321 	return match;
    322 }
    323 
    324 void
    325 encap4_input(struct mbuf *m, int off, int proto)
    326 {
    327 	const struct encapsw *esw;
    328 	struct encaptab *match;
    329 	struct psref match_psref;
    330 
    331 	match = encap4_lookup(m, off, proto, INBOUND, &match_psref);
    332 	if (match) {
    333 		/* found a match, "match" has the best one */
    334 		esw = match->esw;
    335 		if (esw && esw->encapsw4.pr_input) {
    336 			(*esw->encapsw4.pr_input)(m, off, proto, match->arg);
    337 			psref_release(&match_psref, &match->psref,
    338 			    encaptab.elem_class);
    339 		} else {
    340 			psref_release(&match_psref, &match->psref,
    341 			    encaptab.elem_class);
    342 			m_freem(m);
    343 		}
    344 		return;
    345 	}
    346 
    347 	/* last resort: inject to raw socket */
    348 	SOFTNET_LOCK_IF_NET_MPSAFE();
    349 	rip_input(m, off, proto);
    350 	SOFTNET_UNLOCK_IF_NET_MPSAFE();
    351 }
    352 #endif
    353 
    354 #ifdef INET6
    355 static struct encaptab *
    356 encap6_lookup(struct mbuf *m, int off, int proto, enum direction dir,
    357     struct psref *match_psref)
    358 {
    359 	struct ip6_hdr *ip6;
    360 	struct ip_pack6 pack;
    361 	int prio, matchprio;
    362 	int s;
    363 	struct encaptab *ep, *match;
    364 	thmap_t *emap = encap_map[1];
    365 	struct encap_key key;
    366 
    367 	KASSERT(m->m_len >= sizeof(*ip6));
    368 
    369 	ip6 = mtod(m, struct ip6_hdr *);
    370 
    371 	memset(&pack, 0, sizeof(pack));
    372 	pack.p.sp_len = sizeof(pack);
    373 	pack.mine.sin6_family = pack.yours.sin6_family = AF_INET6;
    374 	pack.mine.sin6_len = pack.yours.sin6_len = sizeof(struct sockaddr_in6);
    375 	if (dir == INBOUND) {
    376 		pack.mine.sin6_addr = ip6->ip6_dst;
    377 		pack.yours.sin6_addr = ip6->ip6_src;
    378 	} else {
    379 		pack.mine.sin6_addr = ip6->ip6_src;
    380 		pack.yours.sin6_addr = ip6->ip6_dst;
    381 	}
    382 
    383 	match = NULL;
    384 	matchprio = 0;
    385 
    386 	s = pserialize_read_enter();
    387 
    388 	encap_key_init(&key, sin6tosa(&pack.mine), sin6tosa(&pack.yours));
    389 	while ((ep = thmap_get(emap, &key, sizeof(key))) != NULL) {
    390 		struct psref elem_psref;
    391 
    392 		KASSERT(ep->af == AF_INET6);
    393 
    394 		if (ep->proto >= 0 && ep->proto != proto) {
    395 			encap_key_inc(&key);
    396 			continue;
    397 		}
    398 
    399 		psref_acquire(&elem_psref, &ep->psref,
    400 		    encaptab.elem_class);
    401 		if (ep->func) {
    402 			pserialize_read_exit(s);
    403 			prio = (*ep->func)(m, off, proto, ep->arg);
    404 			s = pserialize_read_enter();
    405 		} else {
    406 			prio = pack.mine.sin6_len + pack.yours.sin6_len;
    407 		}
    408 
    409 		if (prio <= 0) {
    410 			psref_release(&elem_psref, &ep->psref,
    411 			    encaptab.elem_class);
    412 			encap_key_inc(&key);
    413 			continue;
    414 		}
    415 		if (prio > matchprio) {
    416 			/* release last matched ep */
    417 			if (match != NULL)
    418 				psref_release(match_psref, &match->psref,
    419 				    encaptab.elem_class);
    420 
    421 			psref_copy(match_psref, &elem_psref,
    422 			    encaptab.elem_class);
    423 			matchprio = prio;
    424 			match = ep;
    425 		}
    426 		psref_release(&elem_psref, &ep->psref,
    427 		    encaptab.elem_class);
    428 		encap_key_inc(&key);
    429 	}
    430 
    431 	PSLIST_READER_FOREACH(ep, &encap_table, struct encaptab, chain) {
    432 		struct psref elem_psref;
    433 
    434 		if (ep->af != AF_INET6)
    435 			continue;
    436 		if (ep->proto >= 0 && ep->proto != proto)
    437 			continue;
    438 
    439 		psref_acquire(&elem_psref, &ep->psref,
    440 		    encaptab.elem_class);
    441 
    442 		pserialize_read_exit(s);
    443 		/* ep->func is sleepable. e.g. rtalloc1 */
    444 		prio = (*ep->func)(m, off, proto, ep->arg);
    445 		s = pserialize_read_enter();
    446 
    447 		/* see encap4_lookup() for issues here */
    448 		if (prio <= 0) {
    449 			psref_release(&elem_psref, &ep->psref,
    450 			    encaptab.elem_class);
    451 			continue;
    452 		}
    453 		if (prio > matchprio) {
    454 			/* release last matched ep */
    455 			if (match != NULL)
    456 				psref_release(match_psref, &match->psref,
    457 				    encaptab.elem_class);
    458 
    459 			psref_copy(match_psref, &elem_psref,
    460 			    encaptab.elem_class);
    461 			matchprio = prio;
    462 			match = ep;
    463 		}
    464 		KASSERTMSG((match == NULL) || psref_held(&match->psref,
    465 			encaptab.elem_class),
    466 		    "current match = %p, but not hold its psref", match);
    467 
    468 		psref_release(&elem_psref, &ep->psref,
    469 		    encaptab.elem_class);
    470 	}
    471 	pserialize_read_exit(s);
    472 
    473 	return match;
    474 }
    475 
    476 int
    477 encap6_input(struct mbuf **mp, int *offp, int proto)
    478 {
    479 	struct mbuf *m = *mp;
    480 	const struct encapsw *esw;
    481 	struct encaptab *match;
    482 	struct psref match_psref;
    483 	int rv;
    484 
    485 	match = encap6_lookup(m, *offp, proto, INBOUND, &match_psref);
    486 
    487 	if (match) {
    488 		/* found a match */
    489 		esw = match->esw;
    490 		if (esw && esw->encapsw6.pr_input) {
    491 			int ret;
    492 			ret = (*esw->encapsw6.pr_input)(mp, offp, proto,
    493 			    match->arg);
    494 			psref_release(&match_psref, &match->psref,
    495 			    encaptab.elem_class);
    496 			return ret;
    497 		} else {
    498 			psref_release(&match_psref, &match->psref,
    499 			    encaptab.elem_class);
    500 			m_freem(m);
    501 			return IPPROTO_DONE;
    502 		}
    503 	}
    504 
    505 	/* last resort: inject to raw socket */
    506 	SOFTNET_LOCK_IF_NET_MPSAFE();
    507 	rv = rip6_input(mp, offp, proto);
    508 	SOFTNET_UNLOCK_IF_NET_MPSAFE();
    509 	return rv;
    510 }
    511 #endif
    512 
    513 static int
    514 encap_add(struct encaptab *ep)
    515 {
    516 
    517 	KASSERT(encap_lock_held());
    518 
    519 	PSLIST_WRITER_INSERT_HEAD(&encap_table, ep, chain);
    520 
    521 	return 0;
    522 }
    523 
    524 static int
    525 encap_remove(struct encaptab *ep)
    526 {
    527 	int error = 0;
    528 
    529 	KASSERT(encap_lock_held());
    530 
    531 	PSLIST_WRITER_REMOVE(ep, chain);
    532 
    533 	return error;
    534 }
    535 
    536 static void
    537 encap_afcheck(int af, const struct sockaddr *sp, const struct sockaddr *dp)
    538 {
    539 
    540 	KASSERT(sp != NULL && dp != NULL);
    541 	KASSERT(sp->sa_len == dp->sa_len);
    542 	KASSERT(af == sp->sa_family && af == dp->sa_family);
    543 
    544 	socklen_t len __diagused = sockaddr_getsize_by_family(af);
    545 	KASSERT(len != 0 && len == sp->sa_len && len == dp->sa_len);
    546 }
    547 
    548 const struct encaptab *
    549 encap_attach_func(int af, int proto,
    550     encap_priofunc_t *func,
    551     const struct encapsw *esw, void *arg)
    552 {
    553 	struct encaptab *ep;
    554 	int error;
    555 #ifndef ENCAP_MPSAFE
    556 	int s;
    557 
    558 	s = splsoftnet();
    559 #endif
    560 
    561 	ASSERT_SLEEPABLE();
    562 
    563 	/* sanity check on args */
    564 	KASSERT(func != NULL);
    565 	KASSERT(af == AF_INET
    566 #ifdef INET6
    567 	    || af == AF_INET6
    568 #endif
    569 	);
    570 
    571 	ep = kmem_alloc(sizeof(*ep), KM_SLEEP);
    572 	memset(ep, 0, sizeof(*ep));
    573 
    574 	ep->af = af;
    575 	ep->proto = proto;
    576 	ep->func = func;
    577 	ep->esw = esw;
    578 	ep->arg = arg;
    579 	psref_target_init(&ep->psref, encaptab.elem_class);
    580 
    581 	error = encap_add(ep);
    582 	if (error)
    583 		goto gc;
    584 
    585 	error = 0;
    586 #ifndef ENCAP_MPSAFE
    587 	splx(s);
    588 #endif
    589 	return ep;
    590 
    591 gc:
    592 	kmem_free(ep, sizeof(*ep));
    593 #ifndef ENCAP_MPSAFE
    594 	splx(s);
    595 #endif
    596 	return NULL;
    597 }
    598 
    599 static void
    600 encap_key_init(struct encap_key *key,
    601     const struct sockaddr *local, const struct sockaddr *remote)
    602 {
    603 
    604 	memset(key, 0, sizeof(*key));
    605 
    606 	sockaddr_copy(&key->local_sa, sizeof(key->local_u), local);
    607 	sockaddr_copy(&key->remote_sa, sizeof(key->remote_u), remote);
    608 }
    609 
    610 static void
    611 encap_key_inc(struct encap_key *key)
    612 {
    613 
    614 	(key->seq)++;
    615 }
    616 
    617 static void
    618 encap_key_dec(struct encap_key *key)
    619 {
    620 
    621 	(key->seq)--;
    622 }
    623 
    624 static void
    625 encap_key_copy(struct encap_key *dst, const struct encap_key *src)
    626 {
    627 
    628 	memset(dst, 0, sizeof(*dst));
    629 	*dst = *src;
    630 }
    631 
    632 /*
    633  * src is always my side, and dst is always remote side.
    634  * Return value will be necessary as input (cookie) for encap_detach().
    635  */
    636 const struct encaptab *
    637 encap_attach_addr(int af, int proto,
    638     const struct sockaddr *src, const struct sockaddr *dst,
    639     encap_priofunc_t *func,
    640     const struct encapsw *esw, void *arg)
    641 {
    642 	struct encaptab *ep;
    643 	size_t l;
    644 	thmap_t *emap;
    645 	void *retep;
    646 	struct ip_pack4 *pack4;
    647 #ifdef INET6
    648 	struct ip_pack6 *pack6;
    649 #endif
    650 
    651 	ASSERT_SLEEPABLE();
    652 
    653 	encap_afcheck(af, src, dst);
    654 
    655 	switch (af) {
    656 	case AF_INET:
    657 		l = sizeof(*pack4);
    658 		emap = encap_map[0];
    659 		break;
    660 #ifdef INET6
    661 	case AF_INET6:
    662 		l = sizeof(*pack6);
    663 		emap = encap_map[1];
    664 		break;
    665 #endif
    666 	default:
    667 		return NULL;
    668 	}
    669 
    670 	ep = kmem_zalloc(sizeof(*ep), KM_SLEEP);
    671 	ep->addrpack = kmem_zalloc(l, KM_SLEEP);
    672 	ep->addrpack->sa_len = l & 0xff;
    673 	ep->af = af;
    674 	ep->proto = proto;
    675 	ep->flag = IP_ENCAP_ADDR_ENABLE;
    676 	switch (af) {
    677 	case AF_INET:
    678 		pack4 = (struct ip_pack4 *)ep->addrpack;
    679 		ep->src = (struct sockaddr *)&pack4->mine;
    680 		ep->dst = (struct sockaddr *)&pack4->yours;
    681 		break;
    682 #ifdef INET6
    683 	case AF_INET6:
    684 		pack6 = (struct ip_pack6 *)ep->addrpack;
    685 		ep->src = (struct sockaddr *)&pack6->mine;
    686 		ep->dst = (struct sockaddr *)&pack6->yours;
    687 		break;
    688 #endif
    689 	}
    690 	memcpy(ep->src, src, src->sa_len);
    691 	memcpy(ep->dst, dst, dst->sa_len);
    692 	ep->esw = esw;
    693 	ep->arg = arg;
    694 	ep->func = func;
    695 	psref_target_init(&ep->psref, encaptab.elem_class);
    696 
    697 	encap_key_init(&ep->key, src, dst);
    698 	while ((retep = thmap_put(emap, &ep->key, sizeof(ep->key), ep)) != ep)
    699 		encap_key_inc(&ep->key);
    700 	return ep;
    701 }
    702 
    703 
    704 /* XXX encap4_ctlinput() is necessary if we set DF=1 on outer IPv4 header */
    705 
    706 #ifdef INET6
    707 void *
    708 encap6_ctlinput(int cmd, const struct sockaddr *sa, void *d0)
    709 {
    710 	void *d = d0;
    711 	struct ip6_hdr *ip6;
    712 	struct mbuf *m;
    713 	int off;
    714 	struct ip6ctlparam *ip6cp = NULL;
    715 	int nxt;
    716 	int s;
    717 	struct encaptab *ep;
    718 	const struct encapsw *esw;
    719 
    720 	if (sa->sa_family != AF_INET6 ||
    721 	    sa->sa_len != sizeof(struct sockaddr_in6))
    722 		return NULL;
    723 
    724 	if ((unsigned)cmd >= PRC_NCMDS)
    725 		return NULL;
    726 	if (cmd == PRC_HOSTDEAD)
    727 		d = NULL;
    728 	else if (cmd == PRC_MSGSIZE)
    729 		; /* special code is present, see below */
    730 	else if (inet6ctlerrmap[cmd] == 0)
    731 		return NULL;
    732 
    733 	/* if the parameter is from icmp6, decode it. */
    734 	if (d != NULL) {
    735 		ip6cp = (struct ip6ctlparam *)d;
    736 		m = ip6cp->ip6c_m;
    737 		ip6 = ip6cp->ip6c_ip6;
    738 		off = ip6cp->ip6c_off;
    739 		nxt = ip6cp->ip6c_nxt;
    740 
    741 		if (ip6 && cmd == PRC_MSGSIZE) {
    742 			int valid = 0;
    743 			struct encaptab *match;
    744 			struct psref elem_psref;
    745 
    746 			/*
    747 		 	* Check to see if we have a valid encap configuration.
    748 		 	*/
    749 			match = encap6_lookup(m, off, nxt, OUTBOUND,
    750 			    &elem_psref);
    751 			if (match) {
    752 				valid++;
    753 				psref_release(&elem_psref, &match->psref,
    754 				    encaptab.elem_class);
    755 			}
    756 
    757 			/*
    758 		 	* Depending on the value of "valid" and routing table
    759 		 	* size (mtudisc_{hi,lo}wat), we will:
    760 		 	* - recalculate the new MTU and create the
    761 		 	*   corresponding routing entry, or
    762 		 	* - ignore the MTU change notification.
    763 		 	*/
    764 			icmp6_mtudisc_update((struct ip6ctlparam *)d, valid);
    765 		}
    766 	} else {
    767 		m = NULL;
    768 		ip6 = NULL;
    769 		nxt = -1;
    770 	}
    771 
    772 	/* inform all listeners */
    773 
    774 	s = pserialize_read_enter();
    775 	PSLIST_READER_FOREACH(ep, &encap_table, struct encaptab, chain) {
    776 		struct psref elem_psref;
    777 
    778 		if (ep->af != AF_INET6)
    779 			continue;
    780 		if (ep->proto >= 0 && ep->proto != nxt)
    781 			continue;
    782 
    783 		/* should optimize by looking at address pairs */
    784 
    785 		/* XXX need to pass ep->arg or ep itself to listeners */
    786 		psref_acquire(&elem_psref, &ep->psref,
    787 		    encaptab.elem_class);
    788 		esw = ep->esw;
    789 		if (esw && esw->encapsw6.pr_ctlinput) {
    790 			pserialize_read_exit(s);
    791 			/* pr_ctlinput is sleepable. e.g. rtcache_free */
    792 			(*esw->encapsw6.pr_ctlinput)(cmd, sa, d, ep->arg);
    793 			s = pserialize_read_enter();
    794 		}
    795 		psref_release(&elem_psref, &ep->psref,
    796 		    encaptab.elem_class);
    797 	}
    798 	pserialize_read_exit(s);
    799 
    800 	rip6_ctlinput(cmd, sa, d0);
    801 	return NULL;
    802 }
    803 #endif
    804 
    805 static int
    806 encap_detach_addr(const struct encaptab *ep)
    807 {
    808 	thmap_t *emap;
    809 	struct encaptab *retep;
    810 	struct encaptab *target;
    811 	void *thgc;
    812 	struct encap_key key;
    813 
    814 	KASSERT(encap_lock_held());
    815 	KASSERT(ep->flag & IP_ENCAP_ADDR_ENABLE);
    816 
    817 	switch (ep->af) {
    818 	case AF_INET:
    819 		emap = encap_map[0];
    820 		break;
    821 #ifdef INET6
    822 	case AF_INET6:
    823 		emap = encap_map[1];
    824 		break;
    825 #endif
    826 	default:
    827 		return EINVAL;
    828 	}
    829 
    830 	retep = thmap_del(emap, &ep->key, sizeof(ep->key));
    831 	if (retep != ep) {
    832 		return ENOENT;
    833 	}
    834 	target = retep;
    835 
    836 	/*
    837 	 * To keep continuity, decrement seq after detached encaptab.
    838 	 */
    839 	encap_key_copy(&key, &ep->key);
    840 	encap_key_inc(&key);
    841 	while ((retep = thmap_del(emap, &key, sizeof(key))) != NULL) {
    842 		void *pp;
    843 
    844 		encap_key_dec(&retep->key);
    845 		pp = thmap_put(emap, &retep->key, sizeof(retep->key), retep);
    846 		KASSERT(retep == pp);
    847 
    848 		encap_key_inc(&key);
    849 	}
    850 
    851 	thgc = thmap_stage_gc(emap);
    852 	pserialize_perform(encaptab.psz);
    853 	thmap_gc(emap, thgc);
    854 	psref_target_destroy(&target->psref, encaptab.elem_class);
    855 	kmem_free(target->addrpack, target->addrpack->sa_len);
    856 	kmem_free(target, sizeof(*target));
    857 
    858 	return 0;
    859 }
    860 
    861 int
    862 encap_detach(const struct encaptab *cookie)
    863 {
    864 	const struct encaptab *ep = cookie;
    865 	struct encaptab *p;
    866 	int error;
    867 
    868 	KASSERT(encap_lock_held());
    869 
    870 	if (ep->flag & IP_ENCAP_ADDR_ENABLE)
    871 		return encap_detach_addr(ep);
    872 
    873 	PSLIST_WRITER_FOREACH(p, &encap_table, struct encaptab, chain) {
    874 		if (p == ep) {
    875 			error = encap_remove(p);
    876 			if (error)
    877 				return error;
    878 			else
    879 				break;
    880 		}
    881 	}
    882 	if (p == NULL)
    883 		return ENOENT;
    884 
    885 	pserialize_perform(encaptab.psz);
    886 	psref_target_destroy(&p->psref,
    887 	    encaptab.elem_class);
    888 	kmem_free(p, sizeof(*p));
    889 
    890 	return 0;
    891 }
    892 
    893 int
    894 encap_lock_enter(void)
    895 {
    896 	int error;
    897 
    898 	mutex_enter(&encap_whole.lock);
    899 	while (encap_whole.busy != NULL) {
    900 		error = cv_wait_sig(&encap_whole.cv, &encap_whole.lock);
    901 		if (error) {
    902 			mutex_exit(&encap_whole.lock);
    903 			return error;
    904 		}
    905 	}
    906 	KASSERT(encap_whole.busy == NULL);
    907 	encap_whole.busy = curlwp;
    908 	mutex_exit(&encap_whole.lock);
    909 
    910 	return 0;
    911 }
    912 
    913 void
    914 encap_lock_exit(void)
    915 {
    916 
    917 	mutex_enter(&encap_whole.lock);
    918 	KASSERT(encap_whole.busy == curlwp);
    919 	encap_whole.busy = NULL;
    920 	cv_broadcast(&encap_whole.cv);
    921 	mutex_exit(&encap_whole.lock);
    922 }
    923 
    924 bool
    925 encap_lock_held(void)
    926 {
    927 
    928 	return (encap_whole.busy == curlwp);
    929 }
    930